/*
 * software RGB to RGB converter
 * plus software PAL8 to RGB converter
 * software YUV to YUV converter
 * software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stdint.h>

#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/bswap.h"
#include "libavutil/mem_internal.h"

#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"

#if HAVE_INLINE_ASM
#include "libavutil/x86/asm.h"

DECLARE_ASM_CONST(8, uint64_t, mmx_ff)       = 0x00000000000000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_null)     = 0x0000000000000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32a)      = 0xFF000000FF000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216br)   = 0x00F800F800F800F8ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216g)    = 0x0000FC000000FC00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3215g)    = 0x0000F8000000F800ULL;
DECLARE_ASM_CONST(8, uint64_t, mul3216)      = 0x2000000420000004ULL;
DECLARE_ASM_CONST(8, uint64_t, mul3215)      = 0x2000000820000008ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24b)      = 0x00FF0000FF0000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mask24g)      = 0xFF0000FF0000FF00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24r)      = 0x0000FF0000FF0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24l)      = 0x0000000000FFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask24h)      = 0x0000FFFFFF000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15b)      = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
DECLARE_ASM_CONST(8, uint64_t, mask15rg)     = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
DECLARE_ASM_CONST(8, uint64_t, mask15s)      = 0xFFE0FFE0FFE0FFE0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15g)      = 0x03E003E003E003E0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15r)      = 0x7C007C007C007C00ULL;
#define mask16b mask15b
DECLARE_ASM_CONST(8, uint64_t, mask16g)      = 0x07E007E007E007E0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask16r)      = 0xF800F800F800F800ULL;
DECLARE_ASM_CONST(8, uint64_t, red_16mask)   = 0x0000f8000000f800ULL;
DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_16mask)  = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, red_15mask)   = 0x00007c0000007c00ULL;
DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, mul15_mid)    = 0x4200420042004200ULL;
DECLARE_ASM_CONST(8, uint64_t, mul15_hi)     = 0x0210021002100210ULL;
DECLARE_ASM_CONST(8, uint64_t, mul16_mid)    = 0x2080208020802080ULL;

#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))

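/*
 * Illustrative sketch added for exposition (the helper below is not part
 * of the original file): how the fixed-point coefficients above would map
 * one RGB triple to limited-range Y'CbCr in plain C. The MMX code further
 * down computes the same weighted sums with pmaddwd on four pixels at once.
 */
static av_unused void rgb2yuv_pixel_sketch(uint8_t r, uint8_t g, uint8_t b,
                                           uint8_t *y, uint8_t *u, uint8_t *v)
{
    const int round = 1 << (RGB2YUV_SHIFT - 1);           /* rounding term */
    *y = ((RY*r + GY*g + BY*b + round) >> RGB2YUV_SHIFT) + 16;   /* [16..235] */
    *u = ((RU*r + GU*g + BU*b + round) >> RGB2YUV_SHIFT) + 128;  /* [16..240] */
    *v = ((RV*r + GV*g + BV*b + round) >> RGB2YUV_SHIFT) + 128;
}
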
// MMXEXT versions
#define PREFETCH "prefetchnta"
#define PAVGB   "pavgb"
#define MOVNTQ  "movntq"
#define SFENCE  "sfence"

#define EMMS    "emms"

static inline void rgb24tobgr32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "punpckldq 3(%1), %%mm0 \n\t"
            "movd 6(%1), %%mm1 \n\t"
            "punpckldq 9(%1), %%mm1 \n\t"
            "movd 12(%1), %%mm2 \n\t"
            "punpckldq 15(%1), %%mm2 \n\t"
            "movd 18(%1), %%mm3 \n\t"
            "punpckldq 21(%1), %%mm3 \n\t"
            "por %%mm7, %%mm0 \n\t"
            "por %%mm7, %%mm1 \n\t"
            "por %%mm7, %%mm2 \n\t"
            "por %%mm7, %%mm3 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm1, 8(%0) \n\t"
            MOVNTQ" %%mm2, 16(%0) \n\t"
            MOVNTQ" %%mm3, 24(%0)"
            :: "r"(dest), "r"(s)
            : "memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
    }
}

#define STORE_BGR24_MMX \
    "psrlq $8, %%mm2 \n\t" \
    "psrlq $8, %%mm3 \n\t" \
    "psrlq $8, %%mm6 \n\t" \
    "psrlq $8, %%mm7 \n\t" \
    "pand "MANGLE(mask24l)", %%mm0\n\t" \
    "pand "MANGLE(mask24l)", %%mm1\n\t" \
    "pand "MANGLE(mask24l)", %%mm4\n\t" \
    "pand "MANGLE(mask24l)", %%mm5\n\t" \
    "pand "MANGLE(mask24h)", %%mm2\n\t" \
    "pand "MANGLE(mask24h)", %%mm3\n\t" \
    "pand "MANGLE(mask24h)", %%mm6\n\t" \
    "pand "MANGLE(mask24h)", %%mm7\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm6, %%mm4 \n\t" \
    "por %%mm7, %%mm5 \n\t" \
    \
    "movq %%mm1, %%mm2 \n\t" \
    "movq %%mm4, %%mm3 \n\t" \
    "psllq $48, %%mm2 \n\t" \
    "psllq $32, %%mm3 \n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "psrlq $16, %%mm1 \n\t" \
    "psrlq $32, %%mm4 \n\t" \
    "psllq $16, %%mm5 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm5, %%mm4 \n\t" \
    \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm1, 8(%0) \n\t" \
    MOVNTQ" %%mm4, 16(%0)"

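/*
 * Scalar sketch added for exposition (not upstream code): what one
 * STORE_BGR24_MMX invocation does. Eight 4-byte pixels sitting in
 * mm0..mm7 are stripped of their 4th byte and written out as 24 packed
 * bytes -- in C, "copy three bytes, skip the fourth".
 */
static av_unused void store_bgr24_scalar_sketch(const uint8_t *src32, uint8_t *dst24)
{
    for (int i = 0; i < 8; i++) {        /* 8 pixels = 32 bytes in, 24 out */
        dst24[3*i + 0] = src32[4*i + 0];
        dst24[3*i + 1] = src32[4*i + 1];
        dst24[3*i + 2] = src32[4*i + 2]; /* alpha/padding byte dropped */
    }
}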

static inline void rgb32tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 16(%1), %%mm4 \n\t"
            "movq 24(%1), %%mm5 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            STORE_BGR24_MMX
            :: "r"(dest), "r"(s)
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
            : "memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}

/*
 * original by Strepto/Astral
 * ported to gcc & bugfixed: A'rpi
 * MMXEXT, 3DNOW optimization by Nick Kurshev
 * 32-bit C version and the and&add trick by Michael Niedermayer
 */
static inline void rgb15to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t *s = src;
    register uint8_t *d = dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "pand %%mm4, %%mm0 \n\t"
            "pand %%mm4, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d += 16;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x = *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d += 4;
        s += 4;
    }
    if (s < end) {
        register unsigned short x = *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}
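
/*
 * Worked example of the and&add trick above (added commentary, not
 * upstream code). An RGB555 word is 0RRRRRGGGGGBBBBB; adding the
 * red+green field to the whole value doubles that field, i.e. shifts red
 * and green up by one bit while blue stays in place:
 *
 *     (x & 0x7FFF) + (x & 0x7FE0) == (R << 11) | (G << 6) | B
 *
 * e.g. white, x = 0x7FFF: 0x7FFF + 0x7FE0 = 0xFFDF, RGB565 white with the
 * new low green bit left at zero. No carry can reach blue because blue
 * occupies bits 0..4 while the doubled field starts at bit 6.
 */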

static inline void rgb16to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t *s = src;
    register uint8_t *d = dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $1, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm3 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d += 16;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x = *((const uint32_t *)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s += 4;
        d += 4;
    }
    if (s < end) {
        register uint16_t x = *((const uint16_t *)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}

static inline void rgb32to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}
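
/*
 * Added note for exposition: the pmaddwd above folds two shifts and an
 * add into one instruction. After masking with mask3216br each dword
 * holds (B & 0xF8) in its low word and (R & 0xF8) in its high word, and
 * multiplying by mul3216 (0x2000 : 0x0004 per dword) gives
 *
 *     (B & 0xF8)*4 + (R & 0xF8)*0x2000 == (B & 0xF8) << 2 | (R & 0xF8) << 13
 *
 * OR-ing in the untouched green field (G & 0xFC) << 8 and shifting the
 * dword right by 5 leaves RGB565 in the low 16 bits:
 * (R & 0xF8) << 8 | (G & 0xFC) << 3 | (B & 0xF8) >> 3.
 */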

static inline void rgb32tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}

static inline void rgb32to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}

static inline void rgb32tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}

static inline void rgb24tobgr16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void rgb24to16_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void rgb24tobgr15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

static inline void rgb24to15_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

static inline void rgb15tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8(%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            :"=m"(*d)
            :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
             NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :: "r"(d), "m"(*s)
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
    }
}

static inline void rgb16tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8(%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
             NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :: "r"(d), "m"(*s)
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
    }
}

/*
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq %%mm0, %%mm3 \n\t" \
    "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm3, 8(%0) \n\t" \

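/*
 * Scalar sketch added for exposition (not upstream code): the effect of
 * PACK_RGB32. Four blue, green and red samples arriving as words in
 * separate registers are interleaved into four B,G,R,0xFF dwords.
 */
static av_unused void pack_rgb32_scalar_sketch(const uint8_t b[4], const uint8_t g[4],
                                               const uint8_t r[4], uint8_t *dst)
{
    for (int i = 0; i < 4; i++) {
        dst[4*i + 0] = b[i];
        dst[4*i + 1] = g[i];
        dst[4*i + 2] = r[i];
        dst[4*i + 3] = 0xFF;    /* mm6 is all ones */
    }
}
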
static inline void rgb15to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "pmulhw %5, %%mm0 \n\t"
            "pmulhw %5, %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r),"m"(mul15_mid)
             NAMED_CONSTRAINTS_ADD(mul15_hi)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
        *d++ = 255;
    }
}

static inline void rgb16to32_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pmulhw %5, %%mm0 \n\t"
            "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
             NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
        *d++ = 255;
    }
}

static inline void rgb24tobgr24_mmxext(const uint8_t *src, uint8_t *dst, int src_size)
{
    x86_reg mmx_size = 23 - src_size;
    __asm__ volatile (
        "test %%"FF_REG_a", %%"FF_REG_a" \n\t"
        "jns 2f \n\t"
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %%"FF_REG_a") \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%"FF_REG_a"), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%"FF_REG_a"), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%"FF_REG_a") \n\t" // RGB RGB RG
        "movq 8(%1, %%"FF_REG_a"), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%"FF_REG_a"), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%"FF_REG_a"), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%"FF_REG_a") \n\t" // B RGB RGB R
        "movq 16(%1, %%"FF_REG_a"), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%"FF_REG_a"), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"FF_REG_a") \n\t"
        "add $24, %%"FF_REG_a" \n\t"
        " js 1b \n\t"
        "2: \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
          NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b)
    );

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size == 23)
        return; // finished, was a multiple of 8

    src += src_size;
    dst += src_size;
    src_size = 23 - mmx_size;
    src -= src_size;
    dst -= src_size;
    for (unsigned i = 0; i < src_size; i += 3) {
        register uint8_t x;
        x = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}
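
/*
 * Added note for exposition: rgb24tobgr24_mmxext uses the counting-
 * toward-zero idiom. The base pointers are biased by -mmx_size so that a
 * single register works as both offset and loop counter; it starts
 * negative, "add $24" advances it, and "js 1b" loops until it reaches
 * zero -- no separate compare is needed. The scalar tail then swaps R
 * and B for whatever is left, e.g. for one pixel:
 *
 *     dst[0] = src[2];  dst[1] = src[1];  dst[2] = src[0];
 */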

static inline void yuvPlanartoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                          int width, int height,
                                          int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    const x86_reg chromWidth = width >> 1;
    for (int y = 0; y < height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"FF_REG_a") \n\t"
            PREFETCH" 32(%3, %%"FF_REG_a") \n\t"
            "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t"

            "add $8, %%"FF_REG_a" \n\t"
            "cmp %4, %%"FF_REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"FF_REG_a
        );
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void yv12toyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                     int width, int height,
                                     int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

static inline void yuvPlanartouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                          int width, int height,
                                          int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    const x86_reg chromWidth = width >> 1;
    for (int y = 0; y < height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"FF_REG_a") \n\t"
            PREFETCH" 32(%3, %%"FF_REG_a") \n\t"
            "movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // Y(0)
            "movq %%mm2, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t"

            "add $8, %%"FF_REG_a" \n\t"
            "cmp %4, %%"FF_REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"FF_REG_a
        );
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void yv12touyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                     int width, int height,
                                     int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/**
 * Width should be a multiple of 16.
 */
static inline void yuv422ptouyvy_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                        int width, int height,
                                        int lumStride, int chromStride, int dstStride)
{
    yuvPlanartouyvy_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Width should be a multiple of 16.
 */
static inline void yuv422ptoyuy2_mmxext(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                        int width, int height,
                                        int lumStride, int chromStride, int dstStride)
{
    yuvPlanartoyuy2_mmxext(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void yuy2toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                     int width, int height,
                                     int lumStride, int chromStride, int srcStride)
{
    const x86_reg chromWidth = width >> 1;
    for (int y = 0; y < height; y += 2) {
        __asm__ volatile(
            "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t"
            "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"FF_REG_a", 2) \n\t"

            "movq 16(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"FF_REG_a", 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"FF_REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"FF_REG_a") \n\t"

            "add $8, %%"FF_REG_a" \n\t"
            "cmp %4, %%"FF_REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"FF_REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        __asm__ volatile(
            "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t"
            "movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"FF_REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"FF_REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"FF_REG_a", 2) \n\t"

            "add $8, %%"FF_REG_a" \n\t"
            "cmp %4, %%"FF_REG_a" \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"FF_REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}

static inline void planar2x_mmxext(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
    dst[0] = src[0];

    // first line
    for (int x = 0; x < srcWidth - 1; x++) {
        dst[2*x+1] = (3*src[x] +   src[x+1]) >> 2;
        dst[2*x+2] = (  src[x] + 3*src[x+1]) >> 2;
    }
    dst[2*srcWidth-1] = src[srcWidth-1];

    dst += dstStride;

    for (int y = 1; y < srcHeight; y++) {
        x86_reg mmxSize = srcWidth & ~15;

        if (mmxSize) {
            __asm__ volatile(
                "mov %4, %%"FF_REG_a" \n\t"
                "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
                "movq (%0, %%"FF_REG_a"), %%mm4 \n\t"
                "movq %%mm4, %%mm2 \n\t"
                "psllq $8, %%mm4 \n\t"
                "pand %%mm0, %%mm2 \n\t"
                "por %%mm2, %%mm4 \n\t"
                "movq (%1, %%"FF_REG_a"), %%mm5 \n\t"
                "movq %%mm5, %%mm3 \n\t"
                "psllq $8, %%mm5 \n\t"
                "pand %%mm0, %%mm3 \n\t"
                "por %%mm3, %%mm5 \n\t"
                "1: \n\t"
                "movq (%0, %%"FF_REG_a"), %%mm0 \n\t"
                "movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
                "movq 1(%0, %%"FF_REG_a"), %%mm2 \n\t"
                "movq 1(%1, %%"FF_REG_a"), %%mm3 \n\t"
                PAVGB" %%mm0, %%mm5 \n\t"
                PAVGB" %%mm0, %%mm3 \n\t"
                PAVGB" %%mm0, %%mm5 \n\t"
                PAVGB" %%mm0, %%mm3 \n\t"
                PAVGB" %%mm1, %%mm4 \n\t"
                PAVGB" %%mm1, %%mm2 \n\t"
                PAVGB" %%mm1, %%mm4 \n\t"
                PAVGB" %%mm1, %%mm2 \n\t"
                "movq %%mm5, %%mm7 \n\t"
                "movq %%mm4, %%mm6 \n\t"
                "punpcklbw %%mm3, %%mm5 \n\t"
                "punpckhbw %%mm3, %%mm7 \n\t"
                "punpcklbw %%mm2, %%mm4 \n\t"
                "punpckhbw %%mm2, %%mm6 \n\t"
                MOVNTQ" %%mm5, (%2, %%"FF_REG_a", 2) \n\t"
                MOVNTQ" %%mm7, 8(%2, %%"FF_REG_a", 2) \n\t"
                MOVNTQ" %%mm4, (%3, %%"FF_REG_a", 2) \n\t"
                MOVNTQ" %%mm6, 8(%3, %%"FF_REG_a", 2) \n\t"
                "add $8, %%"FF_REG_a" \n\t"
                "movq -1(%0, %%"FF_REG_a"), %%mm4 \n\t"
                "movq -1(%1, %%"FF_REG_a"), %%mm5 \n\t"
                " js 1b \n\t"
                :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
                   "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
                   "g" (-mmxSize)
                   NAMED_CONSTRAINTS_ADD(mmx_ff)
                : "%"FF_REG_a
            );
        } else {
            mmxSize = 1;
            dst[0]         = (src[0] * 3 + src[srcStride]) >> 2;
            dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2;
        }

        for (int x = mmxSize - 1; x < srcWidth - 1; x++) {
            dst[2*x          +1] = (3*src[x+0] +   src[x+srcStride+1]) >> 2;
            dst[2*x+dstStride+2] = (  src[x+0] + 3*src[x+srcStride+1]) >> 2;
            dst[2*x+dstStride+1] = (  src[x+1] + 3*src[x+srcStride  ]) >> 2;
            dst[2*x          +2] = (3*src[x+1] +   src[x+srcStride  ]) >> 2;
        }
        dst[srcWidth*2 - 1]             = (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride]) >> 2;
        dst[srcWidth*2 - 1 + dstStride] = (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride]) >> 2;

        dst += dstStride*2;
        src += srcStride;
    }

    // last line
    dst[0] = src[0];

    for (int x = 0; x < srcWidth - 1; x++) {
        dst[2*x+1] = (3*src[x] +   src[x+1]) >> 2;
        dst[2*x+2] = (  src[x] + 3*src[x+1]) >> 2;
    }
    dst[2*srcWidth-1] = src[srcWidth-1];

    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
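
/*
 * Worked example (added commentary): planar2x doubles the plane with a
 * [3 1]/4 two-tap filter in each direction, weighting the nearer source
 * sample three times, as in the scalar edge code above:
 *
 *     dst[2*x+1] = (3*src[x] +   src[x+1]) >> 2;
 *     dst[2*x+2] = (  src[x] + 3*src[x+1]) >> 2;
 *
 * The MMX loop gets the same ratio from two chained pavgb instructions,
 * since avg(b, avg(b, a)) rounds to (3*b + a) / 4.
 */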

/**
 * Height should be a multiple of 2 and width should be a multiple of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line,
 * others are ignored in the C version.
 * FIXME: Write HQ version.
 */
#if ARCH_X86_32 && HAVE_7REGS
DECLARE_ASM_CONST(8, uint64_t, bgr2YOffset)  = 0x1010101010101010ULL;
DECLARE_ASM_CONST(8, uint64_t, bgr2UVOffset) = 0x8080808080808080ULL;
DECLARE_ASM_CONST(8, uint64_t, w1111)        = 0x0001000100010001ULL;

static inline void rgb24toyv12_mmxext(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      int width, int height,
                                      int lumStride, int chromStride, int srcStride,
                                      const int32_t *rgb2yuv)
{
#define BGR2Y_IDX "16*4+16*32"
#define BGR2U_IDX "16*4+16*33"
#define BGR2V_IDX "16*4+16*34"
    int y;
    const x86_reg chromWidth = width >> 1;

    if (height > 2) {
        ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv);
        src  += 2*srcStride;
        ydst += 2*lumStride;
        udst += chromStride;
        vdst += chromStride;
        height -= 2;
    }

    for (y = 0; y < height - 2; y += 2) {
        for (int i = 0; i < 2; i++) {
            __asm__ volatile(
                "mov %2, %%"FF_REG_a" \n\t"
                "movq "BGR2Y_IDX"(%3), %%mm6 \n\t"
                "movq "MANGLE(w1111)", %%mm5 \n\t"
                "pxor %%mm7, %%mm7 \n\t"
                "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
                ".p2align 4 \n\t"
                "1: \n\t"
                PREFETCH" 64(%0, %%"FF_REG_d") \n\t"
                "movd (%0, %%"FF_REG_d"), %%mm0 \n\t"
                "movd 3(%0, %%"FF_REG_d"), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 6(%0, %%"FF_REG_d"), %%mm2 \n\t"
                "movd 9(%0, %%"FF_REG_d"), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm0 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
                "psrad $8, %%mm0 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
                "packssdw %%mm1, %%mm0 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm0 \n\t"
                "pmaddwd %%mm5, %%mm2 \n\t"
                "packssdw %%mm2, %%mm0 \n\t"
                "psraw $7, %%mm0 \n\t"

                "movd 12(%0, %%"FF_REG_d"), %%mm4 \n\t"
                "movd 15(%0, %%"FF_REG_d"), %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "movd 18(%0, %%"FF_REG_d"), %%mm2 \n\t"
                "movd 21(%0, %%"FF_REG_d"), %%mm3 \n\t"
                "punpcklbw %%mm7, %%mm2 \n\t"
                "punpcklbw %%mm7, %%mm3 \n\t"
                "pmaddwd %%mm6, %%mm4 \n\t"
                "pmaddwd %%mm6, %%mm1 \n\t"
                "pmaddwd %%mm6, %%mm2 \n\t"
                "pmaddwd %%mm6, %%mm3 \n\t"
                "psrad $8, %%mm4 \n\t"
                "psrad $8, %%mm1 \n\t"
                "psrad $8, %%mm2 \n\t"
                "psrad $8, %%mm3 \n\t"
                "packssdw %%mm1, %%mm4 \n\t"
                "packssdw %%mm3, %%mm2 \n\t"
                "pmaddwd %%mm5, %%mm4 \n\t"
                "pmaddwd %%mm5, %%mm2 \n\t"
                "add $24, %%"FF_REG_d" \n\t"
                "packssdw %%mm2, %%mm4 \n\t"
                "psraw $7, %%mm4 \n\t"

                "packuswb %%mm4, %%mm0 \n\t"
                "paddusb "MANGLE(bgr2YOffset)", %%mm0 \n\t"

                MOVNTQ" %%mm0, (%1, %%"FF_REG_a") \n\t"
                "add $8, %%"FF_REG_a" \n\t"
                " js 1b \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
                  NAMED_CONSTRAINTS_ADD(w1111,bgr2YOffset)
                : "%"FF_REG_a, "%"FF_REG_d
            );
            ydst += lumStride;
            src  += srcStride;
        }
        src -= srcStride*2;
        __asm__ volatile(
            "mov %4, %%"FF_REG_a" \n\t"
            "movq "MANGLE(w1111)", %%mm5 \n\t"
            "movq "BGR2U_IDX"(%5), %%mm6 \n\t"
            "pxor %%mm7, %%mm7 \n\t"
            "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t"
            "add %%"FF_REG_d", %%"FF_REG_d" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"FF_REG_d") \n\t"
            PREFETCH" 64(%1, %%"FF_REG_d") \n\t"
            "movq (%0, %%"FF_REG_d"), %%mm0 \n\t"
            "movq (%1, %%"FF_REG_d"), %%mm1 \n\t"
            "movq 6(%0, %%"FF_REG_d"), %%mm2 \n\t"
            "movq 6(%1, %%"FF_REG_d"), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm0 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "movq "BGR2V_IDX"(%5), %%mm1 \n\t"
            "movq "BGR2V_IDX"(%5), %%mm3 \n\t"

            "pmaddwd %%mm0, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm0 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
            "psrad $8, %%mm0 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
            "packssdw %%mm2, %%mm0 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm0 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
            "psraw $7, %%mm0 \n\t"

            "movq 12(%0, %%"FF_REG_d"), %%mm4 \n\t"
            "movq 12(%1, %%"FF_REG_d"), %%mm1 \n\t"
            "movq 18(%0, %%"FF_REG_d"), %%mm2 \n\t"
            "movq 18(%1, %%"FF_REG_d"), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm4, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm4 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "movq "BGR2V_IDX"(%5), %%mm1 \n\t"
            "movq "BGR2V_IDX"(%5), %%mm3 \n\t"

            "pmaddwd %%mm4, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm4 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
            "psrad $8, %%mm4 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
            "packssdw %%mm2, %%mm4 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm4 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "add $24, %%"FF_REG_d" \n\t"
            "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
            "psraw $7, %%mm4 \n\t"

            "movq %%mm0, %%mm1 \n\t"
            "punpckldq %%mm4, %%mm0 \n\t"
            "punpckhdq %%mm4, %%mm1 \n\t"
            "packsswb %%mm1, %%mm0 \n\t"
            "paddb "MANGLE(bgr2UVOffset)", %%mm0 \n\t"
            "movd %%mm0, (%2, %%"FF_REG_a") \n\t"
            "punpckhdq %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%3, %%"FF_REG_a") \n\t"
            "add $4, %%"FF_REG_a" \n\t"
            " js 1b \n\t"
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv)
              NAMED_CONSTRAINTS_ADD(w1111,bgr2UVOffset)
            : "%"FF_REG_a, "%"FF_REG_d
        );

        udst += chromStride;
        vdst += chromStride;
        src  += srcStride*2;
    }

    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");

    ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv);
}
#endif /* ARCH_X86_32 && HAVE_7REGS */
1679 
1680 static inline void vu9_to_vu12_mmxext(const uint8_t *src1, const uint8_t *src2,
1681  uint8_t *dst1, uint8_t *dst2,
1682  int width, int height,
1683  int srcStride1, int srcStride2,
1684  int dstStride1, int dstStride2)
1685 {
1686  int w,h;
1687  w=width/2; h=height/2;
1688  __asm__ volatile(
1689  PREFETCH" %0 \n\t"
1690  PREFETCH" %1 \n\t"
1691  ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
1692  for (x86_reg y = 0; y < h; y++) {
1693  const uint8_t* s1=src1+srcStride1*(y>>1);
1694  uint8_t* d=dst1+dstStride1*y;
1695  x86_reg x = 0;
1696  for (;x<w-31;x+=32) {
1697  __asm__ volatile(
1698  PREFETCH" 32(%1,%2) \n\t"
1699  "movq (%1,%2), %%mm0 \n\t"
1700  "movq 8(%1,%2), %%mm2 \n\t"
1701  "movq 16(%1,%2), %%mm4 \n\t"
1702  "movq 24(%1,%2), %%mm6 \n\t"
1703  "movq %%mm0, %%mm1 \n\t"
1704  "movq %%mm2, %%mm3 \n\t"
1705  "movq %%mm4, %%mm5 \n\t"
1706  "movq %%mm6, %%mm7 \n\t"
1707  "punpcklbw %%mm0, %%mm0 \n\t"
1708  "punpckhbw %%mm1, %%mm1 \n\t"
1709  "punpcklbw %%mm2, %%mm2 \n\t"
1710  "punpckhbw %%mm3, %%mm3 \n\t"
1711  "punpcklbw %%mm4, %%mm4 \n\t"
1712  "punpckhbw %%mm5, %%mm5 \n\t"
1713  "punpcklbw %%mm6, %%mm6 \n\t"
1714  "punpckhbw %%mm7, %%mm7 \n\t"
1715  MOVNTQ" %%mm0, (%0,%2,2) \n\t"
1716  MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
1717  MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
1718  MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
1719  MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
1720  MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
1721  MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
1722  MOVNTQ" %%mm7, 56(%0,%2,2)"
1723  :: "r"(d), "r"(s1), "r"(x)
1724  :"memory");
1725  }
1726  for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
1727  }
1728  for (x86_reg y = 0; y < h; y++) {
1729  const uint8_t* s2=src2+srcStride2*(y>>1);
1730  uint8_t* d=dst2+dstStride2*y;
1731  x86_reg x = 0;
1732  for (;x<w-31;x+=32) {
1733  __asm__ volatile(
1734  PREFETCH" 32(%1,%2) \n\t"
1735  "movq (%1,%2), %%mm0 \n\t"
1736  "movq 8(%1,%2), %%mm2 \n\t"
1737  "movq 16(%1,%2), %%mm4 \n\t"
1738  "movq 24(%1,%2), %%mm6 \n\t"
1739  "movq %%mm0, %%mm1 \n\t"
1740  "movq %%mm2, %%mm3 \n\t"
1741  "movq %%mm4, %%mm5 \n\t"
1742  "movq %%mm6, %%mm7 \n\t"
1743  "punpcklbw %%mm0, %%mm0 \n\t"
1744  "punpckhbw %%mm1, %%mm1 \n\t"
1745  "punpcklbw %%mm2, %%mm2 \n\t"
1746  "punpckhbw %%mm3, %%mm3 \n\t"
1747  "punpcklbw %%mm4, %%mm4 \n\t"
1748  "punpckhbw %%mm5, %%mm5 \n\t"
1749  "punpcklbw %%mm6, %%mm6 \n\t"
1750  "punpckhbw %%mm7, %%mm7 \n\t"
1751  MOVNTQ" %%mm0, (%0,%2,2) \n\t"
1752  MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
1753  MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
1754  MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
1755  MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
1756  MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
1757  MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
1758  MOVNTQ" %%mm7, 56(%0,%2,2)"
1759  :: "r"(d), "r"(s2), "r"(x)
1760  :"memory");
1761  }
1762  for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
1763  }
1764  __asm__(
1765  EMMS" \n\t"
1766  SFENCE" \n\t"
1767  ::: "memory"
1768  );
1769 }
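
/* Editor's sketch: each MMX loop above is a 2x nearest-neighbour upscale of
 * one chroma plane: punpcklbw/punpckhbw of a register with itself duplicates
 * every byte, and the (y>>1) source indexing repeats every source row twice.
 * Plain-C equivalent for one plane (hypothetical name): */
static av_unused void double_plane_sketch(const uint8_t *src, uint8_t *dst,
                                          int w, int h, int srcStride, int dstStride)
{
    for (int y = 0; y < h; y++) {
        const uint8_t *s = src + srcStride * (y >> 1);  /* each row used twice */
        uint8_t *d = dst + dstStride * y;
        for (int x = 0; x < w; x++)
            d[2*x] = d[2*x+1] = s[x];                   /* each byte written twice */
    }
}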
1770 
1771 static inline void yvu9_to_yuy2_mmxext(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
1772  uint8_t *dst,
1773  int width, int height,
1774  int srcStride1, int srcStride2,
1775  int srcStride3, int dstStride)
1776 {
1777  int w,h;
1778  w=width/2; h=height;
1779  for (int y = 0; y < h; y++) {
1780  const uint8_t* yp=src1+srcStride1*y;
1781  const uint8_t* up=src2+srcStride2*(y>>2);
1782  const uint8_t* vp=src3+srcStride3*(y>>2);
1783  uint8_t* d=dst+dstStride*y;
1784  x86_reg x = 0;
1785  for (;x<w-7;x+=8) {
1786  __asm__ volatile(
1787  PREFETCH" 32(%1, %0) \n\t"
1788  PREFETCH" 32(%2, %0) \n\t"
1789  PREFETCH" 32(%3, %0) \n\t"
1790  "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
1791  "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
1792  "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
1793  "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
1794  "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
1795  "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
1796  "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
1797  "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
1798  "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
1799  "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
1800 
1801  "movq %%mm1, %%mm6 \n\t"
1802  "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
1803  "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
1804  "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
1805  MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
1806  MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
1807 
1808  "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
1809  "movq 8(%1, %0, 4), %%mm0 \n\t"
1810  "movq %%mm0, %%mm3 \n\t"
1811  "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
1812  "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
1813  MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
1814  MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
1815 
1816  "movq %%mm4, %%mm6 \n\t"
1817  "movq 16(%1, %0, 4), %%mm0 \n\t"
1818  "movq %%mm0, %%mm3 \n\t"
1819  "punpcklbw %%mm5, %%mm4 \n\t"
1820  "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
1821  "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
1822  MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
1823  MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
1824 
1825  "punpckhbw %%mm5, %%mm6 \n\t"
1826  "movq 24(%1, %0, 4), %%mm0 \n\t"
1827  "movq %%mm0, %%mm3 \n\t"
1828  "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
1829  "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
1830  MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
1831  MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
1832 
1833  : "+r" (x)
1834  : "r"(yp), "r" (up), "r"(vp), "r"(d)
1835  :"memory");
1836  }
1837  for (; x<w; x++) {
1838  const int x2 = x<<2;
1839  d[8*x+0] = yp[x2];
1840  d[8*x+1] = up[x];
1841  d[8*x+2] = yp[x2+1];
1842  d[8*x+3] = vp[x];
1843  d[8*x+4] = yp[x2+2];
1844  d[8*x+5] = up[x];
1845  d[8*x+6] = yp[x2+3];
1846  d[8*x+7] = vp[x];
1847  }
1848  }
1849  __asm__(
1850  EMMS" \n\t"
1851  SFENCE" \n\t"
1852  ::: "memory"
1853  );
1854 }
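
/* Editor's note: the input here is 4:1:0 (YVU9), one U and one V sample per
 * 4x4 block of luma, hence the (y>>2) chroma row indexing and the reuse of
 * up[x]/vp[x] across two Y U Y V groups in the scalar tail. The output is
 * packed YUY2: Y0 U Y1 V per pair of luma samples. */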
1855 
1856 static void extract_even_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count)
1857 {
1858  dst += count;
1859  src += 2*count;
1860  count= - count;
1861 
1862  if(count <= -16) {
1863  count += 15;
1864  __asm__ volatile(
1865  "pcmpeqw %%mm7, %%mm7 \n\t"
1866  "psrlw $8, %%mm7 \n\t"
1867  "1: \n\t"
1868  "movq -30(%1, %0, 2), %%mm0 \n\t"
1869  "movq -22(%1, %0, 2), %%mm1 \n\t"
1870  "movq -14(%1, %0, 2), %%mm2 \n\t"
1871  "movq -6(%1, %0, 2), %%mm3 \n\t"
1872  "pand %%mm7, %%mm0 \n\t"
1873  "pand %%mm7, %%mm1 \n\t"
1874  "pand %%mm7, %%mm2 \n\t"
1875  "pand %%mm7, %%mm3 \n\t"
1876  "packuswb %%mm1, %%mm0 \n\t"
1877  "packuswb %%mm3, %%mm2 \n\t"
1878  MOVNTQ" %%mm0,-15(%2, %0) \n\t"
1879  MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
1880  "add $16, %0 \n\t"
1881  " js 1b \n\t"
1882  : "+r"(count)
1883  : "r"(src), "r"(dst)
1884  );
1885  count -= 15;
1886  }
1887  while(count<0) {
1888  dst[count]= src[2*count];
1889  count++;
1890  }
1891 }
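
/* Editor's note: extract_even and the helpers below share one loop idiom:
 * both pointers are advanced past the end of the data and count is negated,
 * so a single "add $16, %0 / js 1b" both steps and terminates the loop with
 * no separate compare. A plain-C model of extract_even (hypothetical name): */
static av_unused void extract_even_sketch(const uint8_t *src, uint8_t *dst, long n)
{
    for (long i = 0; i < n; i++)
        dst[i] = src[2*i];   /* keep the bytes at even offsets */
}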
1892 
1893 static void extract_odd_mmxext(const uint8_t *src, uint8_t *dst, x86_reg count)
1894 {
1895  src ++;
1896  dst += count;
1897  src += 2*count;
1898  count= - count;
1899 
1900  if(count < -16) {
1901  count += 16;
1902  __asm__ volatile(
1903  "pcmpeqw %%mm7, %%mm7 \n\t"
1904  "psrlw $8, %%mm7 \n\t"
1905  "1: \n\t"
1906  "movq -32(%1, %0, 2), %%mm0 \n\t"
1907  "movq -24(%1, %0, 2), %%mm1 \n\t"
1908  "movq -16(%1, %0, 2), %%mm2 \n\t"
1909  "movq -8(%1, %0, 2), %%mm3 \n\t"
1910  "pand %%mm7, %%mm0 \n\t"
1911  "pand %%mm7, %%mm1 \n\t"
1912  "pand %%mm7, %%mm2 \n\t"
1913  "pand %%mm7, %%mm3 \n\t"
1914  "packuswb %%mm1, %%mm0 \n\t"
1915  "packuswb %%mm3, %%mm2 \n\t"
1916  MOVNTQ" %%mm0,-16(%2, %0) \n\t"
1917  MOVNTQ" %%mm2,- 8(%2, %0) \n\t"
1918  "add $16, %0 \n\t"
1919  " js 1b \n\t"
1920  : "+r"(count)
1921  : "r"(src), "r"(dst)
1922  );
1923  count -= 16;
1924  }
1925  while(count<0) {
1926  dst[count]= src[2*count];
1927  count++;
1928  }
1929 }
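
/* Editor's note: extract_odd is the same selection shifted by the leading
 * src++, so it keeps the bytes at odd offsets; only the load/store offsets
 * and the count bias (16 vs 15) differ from extract_even. */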
1930 
1931 #if ARCH_X86_32
1932 static void extract_even2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
1933 {
1934  dst0+= count;
1935  dst1+= count;
1936  src += 4*count;
1937  count= - count;
1938  if(count <= -8) {
1939  count += 7;
1940  __asm__ volatile(
1941  "pcmpeqw %%mm7, %%mm7 \n\t"
1942  "psrlw $8, %%mm7 \n\t"
1943  "1: \n\t"
1944  "movq -28(%1, %0, 4), %%mm0 \n\t"
1945  "movq -20(%1, %0, 4), %%mm1 \n\t"
1946  "movq -12(%1, %0, 4), %%mm2 \n\t"
1947  "movq -4(%1, %0, 4), %%mm3 \n\t"
1948  "pand %%mm7, %%mm0 \n\t"
1949  "pand %%mm7, %%mm1 \n\t"
1950  "pand %%mm7, %%mm2 \n\t"
1951  "pand %%mm7, %%mm3 \n\t"
1952  "packuswb %%mm1, %%mm0 \n\t"
1953  "packuswb %%mm3, %%mm2 \n\t"
1954  "movq %%mm0, %%mm1 \n\t"
1955  "movq %%mm2, %%mm3 \n\t"
1956  "psrlw $8, %%mm0 \n\t"
1957  "psrlw $8, %%mm2 \n\t"
1958  "pand %%mm7, %%mm1 \n\t"
1959  "pand %%mm7, %%mm3 \n\t"
1960  "packuswb %%mm2, %%mm0 \n\t"
1961  "packuswb %%mm3, %%mm1 \n\t"
1962  MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
1963  MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
1964  "add $8, %0 \n\t"
1965  " js 1b \n\t"
1966  : "+r"(count)
1967  : "r"(src), "r"(dst0), "r"(dst1)
1968  );
1969  count -= 7;
1970  }
1971  while(count<0) {
1972  dst0[count]= src[4*count+0];
1973  dst1[count]= src[4*count+2];
1974  count++;
1975  }
1976 }
1977 #endif /* ARCH_X86_32 */
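
/* Editor's sketch: extract_even2 splits every 4-byte group, keeping bytes 0
 * and 2 (for UYVY input these are the U and V samples). Plain-C model
 * (hypothetical name): */
static av_unused void extract_even2_sketch(const uint8_t *src, uint8_t *dst0,
                                           uint8_t *dst1, long n)
{
    for (long i = 0; i < n; i++) {
        dst0[i] = src[4*i + 0];
        dst1[i] = src[4*i + 2];
    }
}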
1978 
1979 static void extract_even2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
1980 {
1981  dst0 += count;
1982  dst1 += count;
1983  src0 += 4*count;
1984  src1 += 4*count;
1985  count= - count;
1986 #ifdef PAVGB
1987  if(count <= -8) {
1988  count += 7;
1989  __asm__ volatile(
1990  "pcmpeqw %%mm7, %%mm7 \n\t"
1991  "psrlw $8, %%mm7 \n\t"
1992  "1: \n\t"
1993  "movq -28(%1, %0, 4), %%mm0 \n\t"
1994  "movq -20(%1, %0, 4), %%mm1 \n\t"
1995  "movq -12(%1, %0, 4), %%mm2 \n\t"
1996  "movq -4(%1, %0, 4), %%mm3 \n\t"
1997  PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
1998  PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
1999  PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2000  PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2001  "pand %%mm7, %%mm0 \n\t"
2002  "pand %%mm7, %%mm1 \n\t"
2003  "pand %%mm7, %%mm2 \n\t"
2004  "pand %%mm7, %%mm3 \n\t"
2005  "packuswb %%mm1, %%mm0 \n\t"
2006  "packuswb %%mm3, %%mm2 \n\t"
2007  "movq %%mm0, %%mm1 \n\t"
2008  "movq %%mm2, %%mm3 \n\t"
2009  "psrlw $8, %%mm0 \n\t"
2010  "psrlw $8, %%mm2 \n\t"
2011  "pand %%mm7, %%mm1 \n\t"
2012  "pand %%mm7, %%mm3 \n\t"
2013  "packuswb %%mm2, %%mm0 \n\t"
2014  "packuswb %%mm3, %%mm1 \n\t"
2015  MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2016  MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2017  "add $8, %0 \n\t"
2018  " js 1b \n\t"
2019  : "+r"(count)
2020  : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2021  );
2022  count -= 7;
2023  }
2024 #endif
2025  while(count<0) {
2026  dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2027  dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2028  count++;
2029  }
2030 }
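
/* Editor's note: the *avg variants also average in the matching bytes from a
 * second source row (two packed 4:2:2 lines feeding one 4:2:0 chroma line).
 * PAVGB computes (a+b+1)>>1 while the scalar fallback uses (a+b)>>1, so the
 * SIMD and C paths may differ by one LSB. */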
2031 
2032 static void extract_odd2_mmxext(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2033 {
2034  dst0+= count;
2035  dst1+= count;
2036  src += 4*count;
2037  count= - count;
2038  if(count <= -8) {
2039  count += 7;
2040  __asm__ volatile(
2041  "pcmpeqw %%mm7, %%mm7 \n\t"
2042  "psrlw $8, %%mm7 \n\t"
2043  "1: \n\t"
2044  "movq -28(%1, %0, 4), %%mm0 \n\t"
2045  "movq -20(%1, %0, 4), %%mm1 \n\t"
2046  "movq -12(%1, %0, 4), %%mm2 \n\t"
2047  "movq -4(%1, %0, 4), %%mm3 \n\t"
2048  "psrlw $8, %%mm0 \n\t"
2049  "psrlw $8, %%mm1 \n\t"
2050  "psrlw $8, %%mm2 \n\t"
2051  "psrlw $8, %%mm3 \n\t"
2052  "packuswb %%mm1, %%mm0 \n\t"
2053  "packuswb %%mm3, %%mm2 \n\t"
2054  "movq %%mm0, %%mm1 \n\t"
2055  "movq %%mm2, %%mm3 \n\t"
2056  "psrlw $8, %%mm0 \n\t"
2057  "psrlw $8, %%mm2 \n\t"
2058  "pand %%mm7, %%mm1 \n\t"
2059  "pand %%mm7, %%mm3 \n\t"
2060  "packuswb %%mm2, %%mm0 \n\t"
2061  "packuswb %%mm3, %%mm1 \n\t"
2062  MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
2063  MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
2064  "add $8, %0 \n\t"
2065  " js 1b \n\t"
2066  : "+r"(count)
2067  : "r"(src), "r"(dst0), "r"(dst1)
2068  );
2069  count -= 7;
2070  }
2071  src++;
2072  while(count<0) {
2073  dst0[count]= src[4*count+0];
2074  dst1[count]= src[4*count+2];
2075  count++;
2076  }
2077 }
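
/* Editor's note: here the odd bytes are selected inside the asm with
 * "psrlw $8" rather than by pre-incrementing src, which is why src++ is only
 * applied after the SIMD block, just before the scalar tail. */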
2078 
2079 static void extract_odd2avg_mmxext(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
2080 {
2081  dst0 += count;
2082  dst1 += count;
2083  src0 += 4*count;
2084  src1 += 4*count;
2085  count= - count;
2086 #ifdef PAVGB
2087  if(count <= -8) {
2088  count += 7;
2089  __asm__ volatile(
2090  "pcmpeqw %%mm7, %%mm7 \n\t"
2091  "psrlw $8, %%mm7 \n\t"
2092  "1: \n\t"
2093  "movq -28(%1, %0, 4), %%mm0 \n\t"
2094  "movq -20(%1, %0, 4), %%mm1 \n\t"
2095  "movq -12(%1, %0, 4), %%mm2 \n\t"
2096  "movq -4(%1, %0, 4), %%mm3 \n\t"
2097  PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2098  PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2099  PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2100  PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2101  "psrlw $8, %%mm0 \n\t"
2102  "psrlw $8, %%mm1 \n\t"
2103  "psrlw $8, %%mm2 \n\t"
2104  "psrlw $8, %%mm3 \n\t"
2105  "packuswb %%mm1, %%mm0 \n\t"
2106  "packuswb %%mm3, %%mm2 \n\t"
2107  "movq %%mm0, %%mm1 \n\t"
2108  "movq %%mm2, %%mm3 \n\t"
2109  "psrlw $8, %%mm0 \n\t"
2110  "psrlw $8, %%mm2 \n\t"
2111  "pand %%mm7, %%mm1 \n\t"
2112  "pand %%mm7, %%mm3 \n\t"
2113  "packuswb %%mm2, %%mm0 \n\t"
2114  "packuswb %%mm3, %%mm1 \n\t"
2115  MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2116  MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2117  "add $8, %0 \n\t"
2118  " js 1b \n\t"
2119  : "+r"(count)
2120  : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
2121  );
2122  count -= 7;
2123  }
2124 #endif
2125  src0++;
2126  src1++;
2127  while(count<0) {
2128  dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
2129  dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
2130  count++;
2131  }
2132 }
2133 
2134 static void yuyvtoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2135  int width, int height,
2136  int lumStride, int chromStride, int srcStride)
2137 {
2138  const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2139 
2140  for (int y = 0; y < height; y++) {
2141  extract_even_mmxext(src, ydst, width);
2142  if(y&1) {
2143  extract_odd2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth);
2144  udst+= chromStride;
2145  vdst+= chromStride;
2146  }
2147 
2148  src += srcStride;
2149  ydst+= lumStride;
2150  }
2151  __asm__(
2152  EMMS" \n\t"
2153  SFENCE" \n\t"
2154  ::: "memory"
2155  );
2156 }
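
/* Editor's note: 4:2:2 -> 4:2:0 keeps every luma line but only half the
 * chroma lines; running the chroma extraction on odd rows over
 * (src - srcStride, src) averages each vertical pair of source lines rather
 * than discarding one. */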
2157 
2158 static void yuyvtoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2159  int width, int height,
2160  int lumStride, int chromStride, int srcStride)
2161 {
2162  const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2163 
2164  for (int y = 0; y < height; y++) {
2165  extract_even_mmxext(src, ydst, width);
2166  extract_odd2_mmxext(src, udst, vdst, chromWidth);
2167 
2168  src += srcStride;
2169  ydst+= lumStride;
2170  udst+= chromStride;
2171  vdst+= chromStride;
2172  }
2173  __asm__(
2174  EMMS" \n\t"
2175  SFENCE" \n\t"
2176  ::: "memory"
2177  );
2178 }
2179 
2180 static void uyvytoyuv420_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2181  int width, int height,
2182  int lumStride, int chromStride, int srcStride)
2183 {
2184  const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2185 
2186  for (int y = 0; y < height; y++) {
2187  extract_odd_mmxext(src, ydst, width);
2188  if(y&1) {
2189  extract_even2avg_mmxext(src-srcStride, src, udst, vdst, chromWidth);
2190  udst+= chromStride;
2191  vdst+= chromStride;
2192  }
2193 
2194  src += srcStride;
2195  ydst+= lumStride;
2196  }
2197  __asm__(
2198  EMMS" \n\t"
2199  SFENCE" \n\t"
2200  ::: "memory"
2201  );
2202 }
2203 
2204 #if ARCH_X86_32
2205 static void uyvytoyuv422_mmxext(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
2206  int width, int height,
2207  int lumStride, int chromStride, int srcStride)
2208 {
2209  const int chromWidth = AV_CEIL_RSHIFT(width, 1);
2210 
2211  for (int y = 0; y < height; y++) {
2212  extract_odd_mmxext(src, ydst, width);
2213  extract_even2_mmxext(src, udst, vdst, chromWidth);
2214 
2215  src += srcStride;
2216  ydst+= lumStride;
2217  udst+= chromStride;
2218  vdst+= chromStride;
2219  }
2220  __asm__(
2221  EMMS" \n\t"
2222  SFENCE" \n\t"
2223  ::: "memory"
2224  );
2225 }
2226 #endif /* ARCH_X86_32 */
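
/* Editor's note: on x86-64 uyvytoyuv422 is provided by the external SSE2/AVX
 * asm prototyped further below, so this inline-asm wrapper is only built for
 * 32-bit x86. */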
2227 
2228 static av_cold void rgb2rgb_init_mmxext(void)
2229 {
2230  rgb15to16 = rgb15to16_mmxext;
2231  rgb15tobgr24 = rgb15tobgr24_mmxext;
2232  rgb15to32 = rgb15to32_mmxext;
2233  rgb16tobgr24 = rgb16tobgr24_mmxext;
2234  rgb16to32 = rgb16to32_mmxext;
2235  rgb16to15 = rgb16to15_mmxext;
2236  rgb24tobgr16 = rgb24tobgr16_mmxext;
2237  rgb24tobgr15 = rgb24tobgr15_mmxext;
2238  rgb24tobgr32 = rgb24tobgr32_mmxext;
2239  rgb32to16 = rgb32to16_mmxext;
2240  rgb32to15 = rgb32to15_mmxext;
2241  rgb32tobgr24 = rgb32tobgr24_mmxext;
2242  rgb24to15 = rgb24to15_mmxext;
2243  rgb24to16 = rgb24to16_mmxext;
2244  rgb24tobgr24 = rgb24tobgr24_mmxext;
2245  rgb32tobgr16 = rgb32tobgr16_mmxext;
2246  rgb32tobgr15 = rgb32tobgr15_mmxext;
2247  yv12toyuy2 = yv12toyuy2_mmxext;
2248  yv12touyvy = yv12touyvy_mmxext;
2249  yuv422ptoyuy2 = yuv422ptoyuy2_mmxext;
2250  yuv422ptouyvy = yuv422ptouyvy_mmxext;
2251  yuy2toyv12 = yuy2toyv12_mmxext;
2252  vu9_to_vu12 = vu9_to_vu12_mmxext;
2253  yvu9_to_yuy2 = yvu9_to_yuy2_mmxext;
2254 #if ARCH_X86_32
2255  uyvytoyuv422 = uyvytoyuv422_mmxext;
2256 #endif
2257  yuyvtoyuv422 = yuyvtoyuv422_mmxext;
2258 
2259  planar2x = planar2x_mmxext;
2260 #if ARCH_X86_32 && HAVE_7REGS
2261  ff_rgb24toyv12 = rgb24toyv12_mmxext;
2262 #endif /* ARCH_X86_32 && HAVE_7REGS */
2263 
2264  yuyvtoyuv420 = yuyvtoyuv420_mmxext;
2265  uyvytoyuv420 = uyvytoyuv420_mmxext;
2266 }
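
/* Editor's note: these assignments replace the generic C implementations
 * behind the rgb2rgb function pointers; the SSE2/AVX blocks in
 * rgb2rgb_init_x86() below may overwrite some of them again, so the widest
 * supported ISA wins. */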
2267 
2268 //SSE2 versions
2269 static void interleave_bytes_sse2(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
2270  int width, int height, int src1Stride,
2271  int src2Stride, int dstStride)
2272 {
2273  for (int h = 0; h < height; h++) {
2274  if (width >= 16) {
2275  if (!((((intptr_t)src1) | ((intptr_t)src2) | ((intptr_t)dest))&15)) {
2276  __asm__(
2277  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
2278  "1: \n\t"
2279  PREFETCH" 64(%1, %%"FF_REG_a") \n\t"
2280  PREFETCH" 64(%2, %%"FF_REG_a") \n\t"
2281  "movdqa (%1, %%"FF_REG_a"), %%xmm0 \n\t"
2282  "movdqa (%1, %%"FF_REG_a"), %%xmm1 \n\t"
2283  "movdqa (%2, %%"FF_REG_a"), %%xmm2 \n\t"
2284  "punpcklbw %%xmm2, %%xmm0 \n\t"
2285  "punpckhbw %%xmm2, %%xmm1 \n\t"
2286  "movntdq %%xmm0, (%0, %%"FF_REG_a", 2) \n\t"
2287  "movntdq %%xmm1, 16(%0, %%"FF_REG_a", 2) \n\t"
2288  "add $16, %%"FF_REG_a" \n\t"
2289  "cmp %3, %%"FF_REG_a" \n\t"
2290  " jb 1b \n\t"
2291  ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2292  : "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"FF_REG_a
2293  );
2294  } else
2295  __asm__(
2296  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"
2297  "1: \n\t"
2298  PREFETCH" 64(%1, %%"FF_REG_a") \n\t"
2299  PREFETCH" 64(%2, %%"FF_REG_a") \n\t"
2300  "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
2301  "movq 8(%1, %%"FF_REG_a"), %%mm2 \n\t"
2302  "movq %%mm0, %%mm1 \n\t"
2303  "movq %%mm2, %%mm3 \n\t"
2304  "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
2305  "movq 8(%2, %%"FF_REG_a"), %%mm5 \n\t"
2306  "punpcklbw %%mm4, %%mm0 \n\t"
2307  "punpckhbw %%mm4, %%mm1 \n\t"
2308  "punpcklbw %%mm5, %%mm2 \n\t"
2309  "punpckhbw %%mm5, %%mm3 \n\t"
2310  MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 2) \n\t"
2311  MOVNTQ" %%mm1, 8(%0, %%"FF_REG_a", 2) \n\t"
2312  MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 2) \n\t"
2313  MOVNTQ" %%mm3, 24(%0, %%"FF_REG_a", 2) \n\t"
2314  "add $16, %%"FF_REG_a" \n\t"
2315  "cmp %3, %%"FF_REG_a" \n\t"
2316  " jb 1b \n\t"
2317  ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
2318  : "memory", "%"FF_REG_a
2319  );
2320 
2321  }
2322  for (int w = (width & (~15)); w < width; w++) {
2323  dest[2*w+0] = src1[w];
2324  dest[2*w+1] = src2[w];
2325  }
2326  dest += dstStride;
2327  src1 += src1Stride;
2328  src2 += src2Stride;
2329  }
2330  __asm__(
2331  EMMS" \n\t"
2332  SFENCE" \n\t"
2333  ::: "memory"
2334  );
2335 }
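
/* Editor's sketch: the alignment test on src1/src2/dest selects the aligned
 * movdqa/movntdq SSE2 path, with an MMX fallback otherwise; both use
 * non-temporal stores, hence the closing SFENCE. Scalar model of the
 * interleave (hypothetical name): */
static av_unused void interleave_bytes_sketch(const uint8_t *src1, const uint8_t *src2,
                                              uint8_t *dst, int w)
{
    for (int i = 0; i < w; i++) {
        dst[2*i + 0] = src1[i];   /* even bytes from src1 */
        dst[2*i + 1] = src2[i];   /* odd bytes from src2 */
    }
}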
2336 
2337 /*
2338  RGB15->RGB16 original by Strepto/Astral
2339  ported to gcc & bugfixed : A'rpi
2340  MMXEXT, 3DNOW optimization by Nick Kurshev
2341  32-bit C version, and and&add trick by Michael Niedermayer
2342 */
2343 
2344 #endif /* HAVE_INLINE_ASM */
2345 
2346 void ff_shuffle_bytes_2103_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2347 void ff_shuffle_bytes_0321_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2348 void ff_shuffle_bytes_1230_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2349 void ff_shuffle_bytes_3012_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2350 void ff_shuffle_bytes_3210_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2351 void ff_shuffle_bytes_3102_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2352 void ff_shuffle_bytes_2013_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2353 void ff_shuffle_bytes_2130_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2354 void ff_shuffle_bytes_1203_ssse3(const uint8_t *src, uint8_t *dst, int src_size);
2355 
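/* Editor's sketch: the digits in ff_shuffle_bytes_ABCD give, for each output
 * byte of a 4-byte group, the source byte it is taken from; e.g. a plain-C
 * model of the 2103 variant, which swaps bytes 0 and 2 (hypothetical name): */
static av_unused void shuffle_bytes_2103_sketch(const uint8_t *src, uint8_t *dst,
                                                int src_size)
{
    for (int i = 0; i + 3 < src_size; i += 4) {
        dst[i + 0] = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 3] = src[i + 3];
    }
}
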
2356 #if ARCH_X86_64
2357 void ff_shuffle_bytes_2103_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2358 void ff_shuffle_bytes_0321_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2359 void ff_shuffle_bytes_1230_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2360 void ff_shuffle_bytes_3012_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2361 void ff_shuffle_bytes_3210_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2362 void ff_shuffle_bytes_3102_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2363 void ff_shuffle_bytes_2013_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2364 void ff_shuffle_bytes_2130_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2365 void ff_shuffle_bytes_1203_avx2(const uint8_t *src, uint8_t *dst, int src_size);
2366 
2367 void ff_shuffle_bytes_2103_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2368 void ff_shuffle_bytes_0321_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2369 void ff_shuffle_bytes_1230_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2370 void ff_shuffle_bytes_3012_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2371 void ff_shuffle_bytes_3210_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2372 void ff_shuffle_bytes_3102_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2373 void ff_shuffle_bytes_2013_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2374 void ff_shuffle_bytes_2130_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2375 void ff_shuffle_bytes_1203_avx512icl(const uint8_t *src, uint8_t *dst, int src_size);
2376 
2377 void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2378  const uint8_t *src, int width, int height,
2379  int lumStride, int chromStride, int srcStride);
2380 void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2381  const uint8_t *src, int width, int height,
2382  int lumStride, int chromStride, int srcStride);
2383 void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
2384  const uint8_t *src, int width, int height,
2385  int lumStride, int chromStride, int srcStride);
2386 #endif
2387 
2388 #define DEINTERLEAVE_BYTES(cpuext) \
2389 void ff_nv12ToUV_ ## cpuext(uint8_t *dstU, uint8_t *dstV, \
2390  const uint8_t *unused, \
2391  const uint8_t *src1, \
2392  const uint8_t *src2, \
2393  int w, \
2394  uint32_t *unused2, \
2395  void *opq); \
2396 static void deinterleave_bytes_ ## cpuext(const uint8_t *src, uint8_t *dst1, uint8_t *dst2, \
2397  int width, int height, int srcStride, \
2398  int dst1Stride, int dst2Stride) \
2399 { \
2400  for (int h = 0; h < height; h++) { \
2401  if (width >= 16) \
2402  ff_nv12ToUV_ ## cpuext(dst1, dst2, NULL, src, NULL, width - 15, NULL, NULL); \
2403  for (int w = (width & (~15)); w < width; w++) { \
2404  dst1[w] = src[2*w+0]; \
2405  dst2[w] = src[2*w+1]; \
2406  } \
2407  src += srcStride; \
2408  dst1 += dst1Stride; \
2409  dst2 += dst2Stride; \
2410  } \
2411 }
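
/* Editor's note: the deinterleave wrappers reuse swscale's NV12 chroma
 * deinterleaver; passing width - 15 keeps the asm inside whole 16-byte
 * blocks, and the scalar loop finishes the remaining width & 15 pixels. */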
2412 
2413 #if HAVE_SSE2_EXTERNAL
2414 DEINTERLEAVE_BYTES(sse2)
2415 #endif
2416 #if HAVE_AVX_EXTERNAL
2417 DEINTERLEAVE_BYTES(avx)
2418 #endif
2419 
2420 av_cold void rgb2rgb_init_x86(void)
2421 {
2422  int cpu_flags = av_get_cpu_flags();
2423 
2424 #if HAVE_INLINE_ASM
2425  if (INLINE_MMXEXT(cpu_flags))
2426  rgb2rgb_init_mmxext();
2427  if (INLINE_SSE2(cpu_flags))
2428  interleaveBytes = interleave_bytes_sse2;
2429 #endif /* HAVE_INLINE_ASM */
2430 
2431 #if HAVE_SSE2_EXTERNAL
2432  if (EXTERNAL_SSE2(cpu_flags)) {
2433 #if ARCH_X86_64
2434  uyvytoyuv422 = ff_uyvytoyuv422_sse2;
2435 #endif
2436  deinterleaveBytes = deinterleave_bytes_sse2;
2437  }
2438 #endif
2439  if (EXTERNAL_SSSE3(cpu_flags)) {
2440  shuffle_bytes_0321 = ff_shuffle_bytes_0321_ssse3;
2441  shuffle_bytes_2103 = ff_shuffle_bytes_2103_ssse3;
2442  shuffle_bytes_1230 = ff_shuffle_bytes_1230_ssse3;
2443  shuffle_bytes_3012 = ff_shuffle_bytes_3012_ssse3;
2444  shuffle_bytes_3210 = ff_shuffle_bytes_3210_ssse3;
2445  shuffle_bytes_3102 = ff_shuffle_bytes_3102_ssse3;
2446  shuffle_bytes_2013 = ff_shuffle_bytes_2013_ssse3;
2447  shuffle_bytes_2130 = ff_shuffle_bytes_2130_ssse3;
2448  shuffle_bytes_1203 = ff_shuffle_bytes_1203_ssse3;
2449  }
2450 #if HAVE_AVX_EXTERNAL
2451  if (EXTERNAL_AVX(cpu_flags)) {
2452  deinterleaveBytes = deinterleave_bytes_avx;
2453 #if ARCH_X86_64
2454  uyvytoyuv422 = ff_uyvytoyuv422_avx;
2455  }
2456  if (EXTERNAL_AVX2_FAST(cpu_flags)) {
2457  shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx2;
2458  shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx2;
2459  shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx2;
2460  shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx2;
2461  shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx2;
2462  shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx2;
2463  shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx2;
2464  shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx2;
2465  shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx2;
2466  }
2467  if (EXTERNAL_AVX512ICL(cpu_flags)) {
2468  shuffle_bytes_0321 = ff_shuffle_bytes_0321_avx512icl;
2469  shuffle_bytes_2103 = ff_shuffle_bytes_2103_avx512icl;
2470  shuffle_bytes_1230 = ff_shuffle_bytes_1230_avx512icl;
2471  shuffle_bytes_3012 = ff_shuffle_bytes_3012_avx512icl;
2472  shuffle_bytes_3210 = ff_shuffle_bytes_3210_avx512icl;
2473  shuffle_bytes_3102 = ff_shuffle_bytes_3102_avx512icl;
2474  shuffle_bytes_2013 = ff_shuffle_bytes_2013_avx512icl;
2475  shuffle_bytes_2130 = ff_shuffle_bytes_2130_avx512icl;
2476  shuffle_bytes_1203 = ff_shuffle_bytes_1203_avx512icl;
2477  }
2478  if (EXTERNAL_AVX2_FAST(cpu_flags)) {
2479  uyvytoyuv422 = ff_uyvytoyuv422_avx2;
2480 #endif
2481  }
2482 #endif
2483 }
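
/* Editor's note: rgb2rgb_init_x86() runs after the generic C pointers are
 * installed; each INLINE_xxx/EXTERNAL_xxx check upgrades an entry only when
 * av_get_cpu_flags() reports that ISA, and later (wider) blocks override
 * earlier ones. */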