/*
 * Helpers that emit a "movntq" (MMX non-temporal quadword store) line
 * inside an inline-asm string.
 *
 * REAL_MOVNTQ(a,b) stringizes its two arguments into
 *     movntq a, b
 * (AT&T order: a is the MMX source register, b the memory destination).
 * MOVNTQ2 is just the bare mnemonic string.
 * MOVNTQ(a,b) forwards to REAL_MOVNTQ; the extra level lets arguments
 * that are themselves macros be expanded before the # operator
 * stringizes them.
 */
32 #define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
33 #define MOVNTQ2 "movntq "
34 #define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
/*
 * YSCALEYUV2PACKEDX_UV: inline-asm fragment (AT&T operand order) that
 * accumulates a multi-tap vertical filter over two planes read from the
 * same source rows.
 *
 * What the visible instructions do:
 *   - REG_a is cleared and then used as the byte index into each row.
 *   - REG_d is pointed at CHR_MMX_FILTER_OFFSET(%0) and walks a table of
 *     16-byte entries: a source-row pointer at +0 and a coefficient
 *     quadword at +8.
 *   - mm3 and mm4 are both initialised from VROUNDER_OFFSET(%0).
 *   - Per entry, mm2 is loaded from (row + REG_a) and mm5 from
 *     (row + %6 + REG_a); each is multiplied by the coefficient with
 *     pmulhw (keeps the high 16 bits of the signed product) and added
 *     into mm3 and mm4 respectively.
 *   - The pointer of the following entry is fetched into REG_S and
 *     tested for zero.
 *
 * Operand numbers refer to the enclosing asm statement;
 * YSCALEYUV2PACKEDX_END lists &c->redDither as operand 0 and uv_off as
 * operand 6.
 *
 * Registers written: REG_a, REG_d, REG_S, mm0, mm2, mm3, mm4, mm5.
 *
 * NOTE(review): the loop label and the branch that consumes the final
 * "test" are not visible in this excerpt; presumably the loop repeats
 * until a NULL row pointer is found, and mm3 / mm4 presumably carry U
 * and V -- confirm against the full file.
 */
36 #define YSCALEYUV2PACKEDX_UV \
38 "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
42 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\
43 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
44 "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
45 "movq %%mm3, %%mm4 \n\t"\
48 "movq 8(%%"FF_REG_d"), %%mm0 \n\t" \
49 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" \
50 "add %6, %%"FF_REG_S" \n\t" \
51 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm5 \n\t" \
52 "add $16, %%"FF_REG_d" \n\t"\
53 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
54 "pmulhw %%mm0, %%mm2 \n\t"\
55 "pmulhw %%mm0, %%mm5 \n\t"\
56 "paddw %%mm2, %%mm3 \n\t"\
57 "paddw %%mm5, %%mm4 \n\t"\
58 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
/*
 * YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2):
 * parameterised multi-tap vertical filter over eight consecutive 16-bit
 * samples of one plane.
 *
 *   offset      - displacement from operand 0 of the filter table to walk
 *   coeff       - MMX register receiving the coefficient quadword (+8)
 *   src1, src2  - MMX scratch registers for samples at byte offsets
 *                 0 and 8 of (row + REG_a*2)
 *   dst1, dst2  - MMX accumulators; both start from
 *                 VROUNDER_OFFSET(%0) and receive the pmulhw products
 *
 * The table layout matches the visible accesses: 16-byte entries with a
 * row pointer at +0 and a coefficient at +8. REG_a is read but not
 * initialised here, so it must already hold the sample index. REG_d and
 * REG_S are overwritten.
 *
 * NOTE(review): the loop label and the branch following the final
 * "test" are not visible in this excerpt -- confirm the loop shape
 * against the full file.
 */
61 #define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
62 "lea "offset"(%0), %%"FF_REG_d" \n\t"\
63 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
64 "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
65 "movq "#dst1", "#dst2" \n\t"\
68 "movq 8(%%"FF_REG_d"), "#coeff" \n\t" \
69 "movq (%%"FF_REG_S", %%"FF_REG_a", 2), "#src1" \n\t" \
70 "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), "#src2" \n\t" \
71 "add $16, %%"FF_REG_d" \n\t"\
72 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
73 "pmulhw "#coeff", "#src1" \n\t"\
74 "pmulhw "#coeff", "#src2" \n\t"\
75 "paddw "#src1", "#dst1" \n\t"\
76 "paddw "#src2", "#dst2" \n\t"\
77 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
/*
 * YSCALEYUV2PACKEDX: runs the two-plane filter (YSCALEYUV2PACKEDX_UV,
 * results in mm3 / mm4) followed by the single-plane filter over the
 * table at LUM_MMX_FILTER_OFFSET, using mm0 as coefficient register,
 * mm2 / mm5 as scratch and mm1 / mm7 as accumulators.
 *
 * NOTE(review): the definition ends in a line continuation and any
 * further lines are not visible in this excerpt.
 */
80 #define YSCALEYUV2PACKEDX \
81 YSCALEYUV2PACKEDX_UV \
82 YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
/*
 * YSCALEYUV2PACKEDX_END: operand and clobber lists that close an asm
 * statement built from the YSCALEYUV2PACKEDX* fragments.
 *
 * No outputs. Inputs, in operand order:
 *   %0  "r"  &c->redDither  (base register for the *_OFFSET / *_TEMP /
 *                            *_COEFF displacements used by the fragments)
 *   %1-%3 "m" dummy         (placeholders that keep later operand
 *                            numbers fixed)
 *   %4  "r"  dest
 *   %5  "m"  dstW_reg
 *   %6  "m"  uv_off
 * followed by NAMED_CONSTRAINTS_ADD(bF8,bFC).
 * Clobbers: REG_a, REG_d, REG_S.
 *
 * NOTE(review): the definition ends in a line continuation; the closing
 * of the asm statement is not visible in this excerpt.
 */
84 #define YSCALEYUV2PACKEDX_END \
85 :: "r" (&c->redDither), \
86 "m" (dummy), "m" (dummy), "m" (dummy),\
87 "r" (dest), "m" (dstW_reg), "m"(uv_off) \
88 NAMED_CONSTRAINTS_ADD(bF8,bFC) \
89 : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S \
/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV: higher-precision variant of the
 * two-plane vertical filter. It accumulates 32-bit sums with pmaddwd
 * instead of 16-bit pmulhw products.
 *
 * What the visible instructions do:
 *   - REG_a is cleared (byte index); REG_d walks the table at
 *     CHR_MMX_FILTER_OFFSET(%0) in steps of APCK_SIZE.
 *   - mm4 / mm5 (first plane, low / high word halves) and mm6 / mm7
 *     (plane at byte offset %6) are cleared and used as 32-bit
 *     accumulators.
 *   - Each table entry supplies two row pointers (at +0 and at
 *     APCK_PTR2) and one coefficient quadword (at APCK_COEF).
 *     punpcklwd / punpckhwd interleave the words of the two rows so that
 *     pmaddwd multiplies each (rowA, rowB) word pair by the matching
 *     coefficient word pair and sums the two products.
 *   - The pointer at APCK_SIZE(REG_d) is fetched and tested before REG_d
 *     is advanced.
 *   - Afterwards the sums are shifted right arithmetically by 16, packed
 *     to signed 16-bit with saturation, offset by VROUNDER_OFFSET(%0),
 *     and stored: mm4 to U_TEMP(%0), mm6 to V_TEMP(%0).
 *
 * Registers written: REG_a, REG_d, REG_S, mm0-mm7.
 *
 * NOTE(review): the loop label and the branch that consumes the "test"
 * are not visible in this excerpt -- confirm against the full file.
 */
92 #define YSCALEYUV2PACKEDX_ACCURATE_UV \
94 "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
98 "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\
99 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
100 "pxor %%mm4, %%mm4 \n\t"\
101 "pxor %%mm5, %%mm5 \n\t"\
102 "pxor %%mm6, %%mm6 \n\t"\
103 "pxor %%mm7, %%mm7 \n\t"\
106 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm0 \n\t" \
107 "add %6, %%"FF_REG_S" \n\t" \
108 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" \
109 "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
110 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm1 \n\t" \
111 "movq %%mm0, %%mm3 \n\t"\
112 "punpcklwd %%mm1, %%mm0 \n\t"\
113 "punpckhwd %%mm1, %%mm3 \n\t"\
114 "movq "STR(APCK_COEF)"(%%"FF_REG_d"),%%mm1 \n\t" \
115 "pmaddwd %%mm1, %%mm0 \n\t"\
116 "pmaddwd %%mm1, %%mm3 \n\t"\
117 "paddd %%mm0, %%mm4 \n\t"\
118 "paddd %%mm3, %%mm5 \n\t"\
119 "add %6, %%"FF_REG_S" \n\t" \
120 "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm3 \n\t" \
121 "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
122 "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\
123 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
124 "movq %%mm2, %%mm0 \n\t"\
125 "punpcklwd %%mm3, %%mm2 \n\t"\
126 "punpckhwd %%mm3, %%mm0 \n\t"\
127 "pmaddwd %%mm1, %%mm2 \n\t"\
128 "pmaddwd %%mm1, %%mm0 \n\t"\
129 "paddd %%mm2, %%mm6 \n\t"\
130 "paddd %%mm0, %%mm7 \n\t"\
132 "psrad $16, %%mm4 \n\t"\
133 "psrad $16, %%mm5 \n\t"\
134 "psrad $16, %%mm6 \n\t"\
135 "psrad $16, %%mm7 \n\t"\
136 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
137 "packssdw %%mm5, %%mm4 \n\t"\
138 "packssdw %%mm7, %%mm6 \n\t"\
139 "paddw %%mm0, %%mm4 \n\t"\
140 "paddw %%mm0, %%mm6 \n\t"\
141 "movq %%mm4, "U_TEMP"(%0) \n\t"\
142 "movq %%mm6, "V_TEMP"(%0) \n\t"\
/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset): higher-precision single-plane
 * vertical filter over eight consecutive 16-bit samples, walking the
 * filter table at offset(%0).
 *
 * What the visible instructions do:
 *   - mm1 / mm5 accumulate 32-bit sums for samples 0-3 and mm7 / mm6 for
 *     samples 4-7 (loaded from byte offsets 0 and 8 of row + REG_a*2).
 *   - Per table entry two rows are combined: the row at +0 and the row
 *     at APCK_PTR2 are word-interleaved and fed to pmaddwd together with
 *     the coefficient quadword at APCK_COEF.
 *   - The pointer at APCK_SIZE(REG_d) is fetched and tested, then REG_d
 *     is advanced by APCK_SIZE.
 *   - Afterwards the sums are shifted right arithmetically by 16, packed
 *     with signed saturation (mm1 = samples 0-3, mm7 = samples 4-7) and
 *     offset by VROUNDER_OFFSET(%0).
 *   - Finally mm3 and mm4 are reloaded from U_TEMP(%0) and V_TEMP(%0).
 *
 * REG_a is read but not initialised here. Registers written: REG_d,
 * REG_S, mm0-mm7.
 *
 * NOTE(review): the loop label and the branch that consumes the "test"
 * are not visible in this excerpt -- confirm against the full file.
 */
144 #define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
145 "lea "offset"(%0), %%"FF_REG_d" \n\t"\
146 "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
147 "pxor %%mm1, %%mm1 \n\t"\
148 "pxor %%mm5, %%mm5 \n\t"\
149 "pxor %%mm7, %%mm7 \n\t"\
150 "pxor %%mm6, %%mm6 \n\t"\
153 "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0 \n\t" \
154 "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2 \n\t" \
155 "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
156 "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4 \n\t" \
157 "movq %%mm0, %%mm3 \n\t"\
158 "punpcklwd %%mm4, %%mm0 \n\t"\
159 "punpckhwd %%mm4, %%mm3 \n\t"\
160 "movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm4 \n\t" \
161 "pmaddwd %%mm4, %%mm0 \n\t"\
162 "pmaddwd %%mm4, %%mm3 \n\t"\
163 "paddd %%mm0, %%mm1 \n\t"\
164 "paddd %%mm3, %%mm5 \n\t"\
165 "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm3 \n\t" \
166 "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
167 "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\
168 "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
169 "movq %%mm2, %%mm0 \n\t"\
170 "punpcklwd %%mm3, %%mm2 \n\t"\
171 "punpckhwd %%mm3, %%mm0 \n\t"\
172 "pmaddwd %%mm4, %%mm2 \n\t"\
173 "pmaddwd %%mm4, %%mm0 \n\t"\
174 "paddd %%mm2, %%mm7 \n\t"\
175 "paddd %%mm0, %%mm6 \n\t"\
177 "psrad $16, %%mm1 \n\t"\
178 "psrad $16, %%mm5 \n\t"\
179 "psrad $16, %%mm7 \n\t"\
180 "psrad $16, %%mm6 \n\t"\
181 "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
182 "packssdw %%mm5, %%mm1 \n\t"\
183 "packssdw %%mm6, %%mm7 \n\t"\
184 "paddw %%mm0, %%mm1 \n\t"\
185 "paddw %%mm0, %%mm7 \n\t"\
186 "movq "U_TEMP"(%0), %%mm3 \n\t"\
187 "movq "V_TEMP"(%0), %%mm4 \n\t"\
/*
 * YSCALEYUV2PACKEDX_ACCURATE: the higher-precision two-plane filter
 * followed by the higher-precision single-plane filter over the table
 * at LUM_MMX_FILTER_OFFSET. The first fragment parks its results in
 * U_TEMP / V_TEMP; the second one reloads them into mm3 / mm4 and leaves
 * its own results in mm1 / mm7.
 */
189 #define YSCALEYUV2PACKEDX_ACCURATE \
190 YSCALEYUV2PACKEDX_ACCURATE_UV \
191 YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
/*
 * YSCALEYUV2RGBX: converts filtered 16-bit samples to three packed
 * 8-bit colour channels, using per-context constants addressed off
 * operand 0.
 *
 * Inputs:  mm3, mm4  four chroma words each (U_OFFSET is subtracted
 *                    from mm3, V_OFFSET from mm4)
 *          mm1, mm7  eight luma words (samples 0-3 and 4-7)
 * Steps:   chroma copies are scaled with pmulhw by UG_COEFF / VG_COEFF
 *          (summed into mm4) and by UB_COEFF (mm2) / VR_COEFF (mm5);
 *          luma has Y_OFFSET subtracted and is scaled by Y_COEFF.
 *          Each chroma term is then duplicated word-wise
 *          (punpck{l,h}wd reg,reg) so one chroma sample is added to two
 *          adjacent luma samples, and the sums are packed to bytes with
 *          unsigned saturation.
 * Outputs: mm2 = UB-term channel, mm4 = UG+VG-term channel,
 *          mm5 = VR-term channel, eight bytes each.
 *          Going by the coefficient names these are presumably
 *          B, G and R -- confirm.
 * Also written: mm0, mm1, mm3, mm6, mm7.
 *
 * NOTE(review): the definition ends in a line continuation; any
 * following lines are not visible in this excerpt.
 */
193 #define YSCALEYUV2RGBX \
194 "psubw "U_OFFSET"(%0), %%mm3 \n\t" \
195 "psubw "V_OFFSET"(%0), %%mm4 \n\t" \
196 "movq %%mm3, %%mm2 \n\t" \
197 "movq %%mm4, %%mm5 \n\t" \
198 "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
199 "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
201 "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
202 "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
203 "psubw "Y_OFFSET"(%0), %%mm1 \n\t" \
204 "psubw "Y_OFFSET"(%0), %%mm7 \n\t" \
205 "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
206 "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
208 "paddw %%mm3, %%mm4 \n\t"\
209 "movq %%mm2, %%mm0 \n\t"\
210 "movq %%mm5, %%mm6 \n\t"\
211 "movq %%mm4, %%mm3 \n\t"\
212 "punpcklwd %%mm2, %%mm2 \n\t"\
213 "punpcklwd %%mm5, %%mm5 \n\t"\
214 "punpcklwd %%mm4, %%mm4 \n\t"\
215 "paddw %%mm1, %%mm2 \n\t"\
216 "paddw %%mm1, %%mm5 \n\t"\
217 "paddw %%mm1, %%mm4 \n\t"\
218 "punpckhwd %%mm0, %%mm0 \n\t"\
219 "punpckhwd %%mm6, %%mm6 \n\t"\
220 "punpckhwd %%mm3, %%mm3 \n\t"\
221 "paddw %%mm7, %%mm0 \n\t"\
222 "paddw %%mm7, %%mm6 \n\t"\
223 "paddw %%mm7, %%mm3 \n\t"\
225 "packuswb %%mm0, %%mm2 \n\t"\
226 "packuswb %%mm6, %%mm5 \n\t"\
227 "packuswb %%mm3, %%mm4 \n\t"\
/*
 * REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t):
 * interleaves four 8-byte channel registers into eight 4-byte pixels
 * and stores them with non-temporal writes.
 *
 *   dst    - base register of the output line
 *   dstw   - asm operand string compared against index after the store
 *   index  - pixel index register; pixels go to dst + index*4
 *   b,g,r,a - MMX registers holding eight bytes per channel; the bytes
 *            are written to memory in the order b, g, r, a. b and r are
 *            overwritten, g and a are only read.
 *   q0,q2,q3,t - MMX scratch registers (overwritten)
 *
 * The 32 output bytes are written from q0 (pixels 0-1), b (2-3),
 * q2 (4-5) and q3 (6-7); index is then advanced by 8 and compared with
 * dstw.
 *
 * WRITEBGR32 forwards to REAL_WRITEBGR32 so that macro arguments are
 * expanded before they are stringized.
 *
 * NOTE(review): the branch that consumes the final "cmp" is not visible
 * in this excerpt.
 */
229 #define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
230 "movq "#b", "#q2" \n\t" \
231 "movq "#r", "#t" \n\t" \
232 "punpcklbw "#g", "#b" \n\t" \
233 "punpcklbw "#a", "#r" \n\t" \
234 "punpckhbw "#g", "#q2" \n\t" \
235 "punpckhbw "#a", "#t" \n\t" \
236 "movq "#b", "#q0" \n\t" \
237 "movq "#q2", "#q3" \n\t" \
238 "punpcklwd "#r", "#q0" \n\t" \
239 "punpckhwd "#r", "#b" \n\t" \
240 "punpcklwd "#t", "#q2" \n\t" \
241 "punpckhwd "#t", "#q3" \n\t" \
243 MOVNTQ( q0, (dst, index, 4))\
244 MOVNTQ( b, 8(dst, index, 4))\
245 MOVNTQ( q2, 16(dst, index, 4))\
246 MOVNTQ( q3, 24(dst, index, 4))\
248 "add $8, "#index" \n\t"\
249 "cmp "dstw", "#index" \n\t"\
251 #define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
254 const int16_t **lumSrc,
int lumFilterSize,
255 const int16_t *chrFilter,
const int16_t **chrUSrc,
256 const int16_t **chrVSrc,
257 int chrFilterSize,
const int16_t **alpSrc,
258 uint8_t *dest,
int dstW,
int dstY)
264 if (CONFIG_SWSCALE_ALPHA &&
c->needAlpha) {
267 "movq %%mm2, "U_TEMP"(%0) \n\t"
268 "movq %%mm4, "V_TEMP"(%0) \n\t"
269 "movq %%mm5, "Y_TEMP"(%0) \n\t"
271 "movq "Y_TEMP"(%0), %%mm5 \n\t"
272 "psraw $3, %%mm1 \n\t"
273 "psraw $3, %%mm7 \n\t"
274 "packuswb %%mm7, %%mm1 \n\t"
275 WRITEBGR32(%4,
"%5", %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
280 "pcmpeqd %%mm7, %%mm7 \n\t"
281 WRITEBGR32(%4,
"%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
287 const int16_t **lumSrc,
int lumFilterSize,
288 const int16_t *chrFilter,
const int16_t **chrUSrc,
289 const int16_t **chrVSrc,
290 int chrFilterSize,
const int16_t **alpSrc,
291 uint8_t *dest,
int dstW,
int dstY)
297 if (CONFIG_SWSCALE_ALPHA &&
c->needAlpha) {
301 "psraw $3, %%mm1 \n\t"
302 "psraw $3, %%mm7 \n\t"
303 "packuswb %%mm7, %%mm1 \n\t"
304 WRITEBGR32(%4,
"%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
309 "pcmpeqd %%mm7, %%mm7 \n\t"
310 WRITEBGR32(%4,
"%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
316 const int16_t **lumSrc,
int lumFilterSize,
317 const int16_t *chrFilter,
const int16_t **chrUSrc,
318 const int16_t **chrVSrc,
319 int chrFilterSize,
const int16_t **alpSrc,
320 uint8_t *dest,
int dstW,
int dstY)
326 if (CONFIG_SWSCALE_ALPHA &&
c->needAlpha) {
330 "psraw $3, %%mm1 \n\t"
331 "psraw $3, %%mm7 \n\t"
332 "packuswb %%mm7, %%mm1 \n\t"
333 WRITEBGR32(%4,
"%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
338 "pcmpeqd %%mm7, %%mm7 \n\t"
339 WRITEBGR32(%4,
"%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
/*
 * REAL_WRITERGB16(dst, dstw, index): packs eight pixels from three
 * 8-byte channel registers into 16-bit values with a 5/6/5 bit split
 * and stores them at dst + index*2.
 *
 * Inputs:  mm2 (masked with bF8, then shifted right by 3: lowest 5 bits)
 *          mm4 (masked with bFC: 6 bits, shifted left by 3 after being
 *               widened to words)
 *          mm5 (masked with bF8: top 5 bits of the high byte)
 *          mm7 is interleaved as the upper byte of each widened mm4
 *          word, so it is expected to be zero on entry; the visible
 *          call-site fragments issue "pxor %%mm7, %%mm7" beforehand.
 * Writes:  mm1, mm2, mm3, mm4, mm5; 16 bytes of output via MOVNTQ;
 *          index is advanced by 8 and compared with dstw.
 *
 * WRITERGB16 forwards to REAL_WRITERGB16 so that macro arguments are
 * expanded before they are stringized.
 *
 * NOTE(review): the branch that consumes the final "cmp" is not visible
 * in this excerpt.
 */
344 #define REAL_WRITERGB16(dst, dstw, index) \
345 "pand "MANGLE(bF8)", %%mm2 \n\t" \
346 "pand "MANGLE(bFC)", %%mm4 \n\t" \
347 "pand "MANGLE(bF8)", %%mm5 \n\t" \
348 "psrlq $3, %%mm2 \n\t"\
350 "movq %%mm2, %%mm1 \n\t"\
351 "movq %%mm4, %%mm3 \n\t"\
353 "punpcklbw %%mm7, %%mm3 \n\t"\
354 "punpcklbw %%mm5, %%mm2 \n\t"\
355 "punpckhbw %%mm7, %%mm4 \n\t"\
356 "punpckhbw %%mm5, %%mm1 \n\t"\
358 "psllq $3, %%mm3 \n\t"\
359 "psllq $3, %%mm4 \n\t"\
361 "por %%mm3, %%mm2 \n\t"\
362 "por %%mm4, %%mm1 \n\t"\
364 MOVNTQ(%%mm2, (dst, index, 2))\
365 MOVNTQ(%%mm1, 8(dst, index, 2))\
367 "add $8, "#index" \n\t"\
368 "cmp "dstw", "#index" \n\t"\
370 #define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
373 const int16_t **lumSrc,
int lumFilterSize,
374 const int16_t *chrFilter,
const int16_t **chrUSrc,
375 const int16_t **chrVSrc,
376 int chrFilterSize,
const int16_t **alpSrc,
377 uint8_t *dest,
int dstW,
int dstY)
385 "pxor %%mm7, %%mm7 \n\t"
397 const int16_t **lumSrc,
int lumFilterSize,
398 const int16_t *chrFilter,
const int16_t **chrUSrc,
399 const int16_t **chrVSrc,
400 int chrFilterSize,
const int16_t **alpSrc,
401 uint8_t *dest,
int dstW,
int dstY)
409 "pxor %%mm7, %%mm7 \n\t"
/*
 * REAL_WRITERGB15(dst, dstw, index): packs eight pixels from three
 * 8-byte channel registers into 16-bit values with a 5/5/5 bit split
 * and stores them at dst + index*2.
 *
 * Differences from the 5/6/5 writer that are visible here: all three
 * channels are masked with bF8, mm5 is additionally shifted right by 1
 * (leaving the top bit of each 16-bit value clear), and the widened mm4
 * words are shifted left by 2 instead of 3.
 *
 * mm7 is interleaved as the upper byte of each widened mm4 word, so it
 * is expected to be zero on entry; the visible call-site fragments issue
 * "pxor %%mm7, %%mm7" beforehand.
 * Writes: mm1, mm2, mm3, mm4, mm5; 16 bytes of output via MOVNTQ; index
 * is advanced by 8 and compared with dstw.
 *
 * WRITERGB15 forwards to REAL_WRITERGB15 so that macro arguments are
 * expanded before they are stringized.
 *
 * NOTE(review): the branch that consumes the final "cmp" is not visible
 * in this excerpt.
 */
420 #define REAL_WRITERGB15(dst, dstw, index) \
421 "pand "MANGLE(bF8)", %%mm2 \n\t" \
422 "pand "MANGLE(bF8)", %%mm4 \n\t" \
423 "pand "MANGLE(bF8)", %%mm5 \n\t" \
424 "psrlq $3, %%mm2 \n\t"\
425 "psrlq $1, %%mm5 \n\t"\
427 "movq %%mm2, %%mm1 \n\t"\
428 "movq %%mm4, %%mm3 \n\t"\
430 "punpcklbw %%mm7, %%mm3 \n\t"\
431 "punpcklbw %%mm5, %%mm2 \n\t"\
432 "punpckhbw %%mm7, %%mm4 \n\t"\
433 "punpckhbw %%mm5, %%mm1 \n\t"\
435 "psllq $2, %%mm3 \n\t"\
436 "psllq $2, %%mm4 \n\t"\
438 "por %%mm3, %%mm2 \n\t"\
439 "por %%mm4, %%mm1 \n\t"\
441 MOVNTQ(%%mm2, (dst, index, 2))\
442 MOVNTQ(%%mm1, 8(dst, index, 2))\
444 "add $8, "#index" \n\t"\
445 "cmp "dstw", "#index" \n\t"\
447 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
450 const int16_t **lumSrc,
int lumFilterSize,
451 const int16_t *chrFilter,
const int16_t **chrUSrc,
452 const int16_t **chrVSrc,
453 int chrFilterSize,
const int16_t **alpSrc,
454 uint8_t *dest,
int dstW,
int dstY)
462 "pxor %%mm7, %%mm7 \n\t"
474 const int16_t **lumSrc,
int lumFilterSize,
475 const int16_t *chrFilter,
const int16_t **chrUSrc,
476 const int16_t **chrVSrc,
477 int chrFilterSize,
const int16_t **alpSrc,
478 uint8_t *dest,
int dstW,
int dstY)
486 "pxor %%mm7, %%mm7 \n\t"
/*
 * WRITEBGR24MMX(dst, dstw, index): plain-MMX writer that packs eight
 * pixels into 24 output bytes (three bytes per pixel).
 *
 * Inputs:  mm2, mm4, mm5 hold eight bytes per channel and are written
 *          to memory in that order for each pixel. mm7 is interleaved
 *          as a fourth byte; for the first pixel pair that byte is
 *          shifted out again, but whether the later shifts drop it for
 *          every pixel was not traced here.
 *          NOTE(review): the visible call-site fragments clear mm7 with
 *          pxor beforehand -- confirm that zero is required.
 * Steps:   byte and word interleaves first build four registers of two
 *          4-byte pixels each; the shift / punpckhdq / por sequence then
 *          packs the leading three bytes of each pixel and merges
 *          neighbouring registers into three contiguous quadwords.
 * Output:  MOVNTQ stores to dst, dst+8 and dst+16; dst itself is
 *          advanced by 24, index by 8, and index is compared with dstw.
 * All of mm0-mm7 are overwritten.
 *
 * NOTE(review): the branch that consumes the final "cmp" is not visible
 * in this excerpt.
 */
497 #define WRITEBGR24MMX(dst, dstw, index) \
499 "movq %%mm2, %%mm1 \n\t" \
500 "movq %%mm5, %%mm6 \n\t" \
501 "punpcklbw %%mm4, %%mm2 \n\t" \
502 "punpcklbw %%mm7, %%mm5 \n\t" \
503 "punpckhbw %%mm4, %%mm1 \n\t" \
504 "punpckhbw %%mm7, %%mm6 \n\t" \
505 "movq %%mm2, %%mm0 \n\t" \
506 "movq %%mm1, %%mm3 \n\t" \
507 "punpcklwd %%mm5, %%mm0 \n\t" \
508 "punpckhwd %%mm5, %%mm2 \n\t" \
509 "punpcklwd %%mm6, %%mm1 \n\t" \
510 "punpckhwd %%mm6, %%mm3 \n\t" \
512 "movq %%mm0, %%mm4 \n\t" \
513 "movq %%mm2, %%mm6 \n\t" \
514 "movq %%mm1, %%mm5 \n\t" \
515 "movq %%mm3, %%mm7 \n\t" \
517 "psllq $40, %%mm0 \n\t" \
518 "psllq $40, %%mm2 \n\t" \
519 "psllq $40, %%mm1 \n\t" \
520 "psllq $40, %%mm3 \n\t" \
522 "punpckhdq %%mm4, %%mm0 \n\t" \
523 "punpckhdq %%mm6, %%mm2 \n\t" \
524 "punpckhdq %%mm5, %%mm1 \n\t" \
525 "punpckhdq %%mm7, %%mm3 \n\t" \
527 "psrlq $8, %%mm0 \n\t" \
528 "movq %%mm2, %%mm6 \n\t" \
529 "psllq $40, %%mm2 \n\t" \
530 "por %%mm2, %%mm0 \n\t" \
531 MOVNTQ(%%mm0, (dst))\
533 "psrlq $24, %%mm6 \n\t" \
534 "movq %%mm1, %%mm5 \n\t" \
535 "psllq $24, %%mm1 \n\t" \
536 "por %%mm1, %%mm6 \n\t" \
537 MOVNTQ(%%mm6, 8(dst))\
539 "psrlq $40, %%mm5 \n\t" \
540 "psllq $8, %%mm3 \n\t" \
541 "por %%mm3, %%mm5 \n\t" \
542 MOVNTQ(%%mm5, 16(dst))\
544 "add $24, "#dst" \n\t"\
546 "add $8, "#index" \n\t"\
547 "cmp "dstw", "#index" \n\t"\
/*
 * WRITEBGR24MMXEXT(dst, dstw, index): writer for eight 3-byte pixels
 * that uses pshufw (an MMX-extension instruction) instead of the long
 * shift/unpack sequence of the plain-MMX variant.
 *
 * Inputs:  mm2, mm4, mm5 hold eight bytes per channel.
 * Steps:   for each of the three output quadwords the channel registers
 *          are word-shuffled with pshufw, masked with the ff_M24A /
 *          ff_M24B / ff_M24C constants (mm0 caches ff_M24A, mm7 caches
 *          ff_M24C) and OR-ed together. mm4 is shifted by one byte
 *          (psllq $8 on its shuffled copy for the first quadword,
 *          psrlq $8 on the register itself afterwards).
 *          NOTE(review): presumably this places the mm4 bytes between
 *          the mm2 and mm5 bytes of each pixel -- confirm against the
 *          mask constants.
 * Output:  MOVNTQ stores to dst, dst+8 and dst+16; dst is advanced by
 *          24, index by 8, and index is compared with dstw.
 * Registers written: mm0, mm1, mm3, mm4, mm6, mm7.
 *
 * WRITEBGR24 is defined as this MMXEXT variant.
 *
 * NOTE(review): the branch that consumes the final "cmp" is not visible
 * in this excerpt.
 */
550 #define WRITEBGR24MMXEXT(dst, dstw, index) \
552 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
553 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
554 "pshufw $0x50, %%mm2, %%mm1 \n\t" \
555 "pshufw $0x50, %%mm4, %%mm3 \n\t" \
556 "pshufw $0x00, %%mm5, %%mm6 \n\t" \
558 "pand %%mm0, %%mm1 \n\t" \
559 "pand %%mm0, %%mm3 \n\t" \
560 "pand %%mm7, %%mm6 \n\t" \
562 "psllq $8, %%mm3 \n\t" \
563 "por %%mm1, %%mm6 \n\t"\
564 "por %%mm3, %%mm6 \n\t"\
565 MOVNTQ(%%mm6, (dst))\
567 "psrlq $8, %%mm4 \n\t" \
568 "pshufw $0xA5, %%mm2, %%mm1 \n\t" \
569 "pshufw $0x55, %%mm4, %%mm3 \n\t" \
570 "pshufw $0xA5, %%mm5, %%mm6 \n\t" \
572 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" \
573 "pand %%mm7, %%mm3 \n\t" \
574 "pand %%mm0, %%mm6 \n\t" \
576 "por %%mm1, %%mm3 \n\t" \
577 "por %%mm3, %%mm6 \n\t"\
578 MOVNTQ(%%mm6, 8(dst))\
580 "pshufw $0xFF, %%mm2, %%mm1 \n\t" \
581 "pshufw $0xFA, %%mm4, %%mm3 \n\t" \
582 "pshufw $0xFA, %%mm5, %%mm6 \n\t" \
584 "pand %%mm7, %%mm1 \n\t" \
585 "pand %%mm0, %%mm3 \n\t" \
586 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" \
588 "por %%mm1, %%mm3 \n\t"\
589 "por %%mm3, %%mm6 \n\t"\
590 MOVNTQ(%%mm6, 16(dst))\
592 "add $24, "#dst" \n\t"\
594 "add $8, "#index" \n\t"\
595 "cmp "dstw", "#index" \n\t"\
599 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
603 const int16_t **lumSrc,
int lumFilterSize,
604 const int16_t *chrFilter,
const int16_t **chrUSrc,
605 const int16_t **chrVSrc,
606 int chrFilterSize,
const int16_t **alpSrc,
607 uint8_t *dest,
int dstW,
int dstY)
615 "pxor %%mm7, %%mm7 \n\t"
616 "lea (%%"FF_REG_a
", %%"FF_REG_a
", 2), %%"FF_REG_c
"\n\t"
617 "add %4, %%"FF_REG_c
" \n\t"
619 ::
"r" (&
c->redDither),
621 "r" (dest),
"m" (dstW_reg),
"m"(uv_off)
623 :
"%"FF_REG_a,
"%"FF_REG_c,
"%"FF_REG_d,
"%"FF_REG_S
628 const int16_t **lumSrc,
int lumFilterSize,
629 const int16_t *chrFilter,
const int16_t **chrUSrc,
630 const int16_t **chrVSrc,
631 int chrFilterSize,
const int16_t **alpSrc,
632 uint8_t *dest,
int dstW,
int dstY)
640 "pxor %%mm7, %%mm7 \n\t"
641 "lea (%%"FF_REG_a
", %%"FF_REG_a
", 2), %%"FF_REG_c
" \n\t"
642 "add %4, %%"FF_REG_c
" \n\t"
644 ::
"r" (&
c->redDither),
646 "r" (dest),
"m" (dstW_reg),
"m"(uv_off)
648 :
"%"FF_REG_a,
"%"FF_REG_c,
"%"FF_REG_d,
"%"FF_REG_S
/*
 * REAL_WRITEYUY2(dst, dstw, index): packs eight luma samples and four
 * samples from each of two chroma registers into 16 bytes of
 * interleaved output at dst + index*2.
 *
 * Inputs:  mm1, mm7  eight 16-bit luma words (samples 0-3 and 4-7)
 *          mm3, mm4  four 16-bit chroma words each
 * Steps:   everything is narrowed to bytes with unsigned saturation;
 *          mm3 and mm4 bytes are interleaved (mm3 byte first), and the
 *          result is interleaved with the luma bytes, giving the byte
 *          pattern luma, mm3, luma, mm4 per pixel pair.
 *          NOTE(review): mm3 / mm4 are presumably U / V as in the other
 *          fragments of this file -- confirm.
 * Output:  two MOVNTQ stores; index is advanced by 8 and compared with
 *          dstw. mm1, mm3, mm4 and mm7 are overwritten.
 *
 * WRITEYUY2 forwards to REAL_WRITEYUY2 so that macro arguments are
 * expanded before they are stringized.
 *
 * NOTE(review): the branch that consumes the final "cmp" is not visible
 * in this excerpt.
 */
653 #define REAL_WRITEYUY2(dst, dstw, index) \
654 "packuswb %%mm3, %%mm3 \n\t"\
655 "packuswb %%mm4, %%mm4 \n\t"\
656 "packuswb %%mm7, %%mm1 \n\t"\
657 "punpcklbw %%mm4, %%mm3 \n\t"\
658 "movq %%mm1, %%mm7 \n\t"\
659 "punpcklbw %%mm3, %%mm1 \n\t"\
660 "punpckhbw %%mm3, %%mm7 \n\t"\
662 MOVNTQ(%%mm1, (dst, index, 2))\
663 MOVNTQ(%%mm7, 8(dst, index, 2))\
665 "add $8, "#index" \n\t"\
666 "cmp "dstw", "#index" \n\t"\
668 #define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
671 const int16_t **lumSrc,
int lumFilterSize,
672 const int16_t *chrFilter,
const int16_t **chrUSrc,
673 const int16_t **chrVSrc,
674 int chrFilterSize,
const int16_t **alpSrc,
675 uint8_t *dest,
int dstW,
int dstY)
683 "psraw $3, %%mm3 \n\t"
684 "psraw $3, %%mm4 \n\t"
685 "psraw $3, %%mm1 \n\t"
686 "psraw $3, %%mm7 \n\t"
692 const int16_t **lumSrc,
int lumFilterSize,
693 const int16_t *chrFilter,
const int16_t **chrUSrc,
694 const int16_t **chrVSrc,
695 int chrFilterSize,
const int16_t **alpSrc,
696 uint8_t *dest,
int dstW,
int dstY)
704 "psraw $3, %%mm3 \n\t"
705 "psraw $3, %%mm4 \n\t"
706 "psraw $3, %%mm1 \n\t"
707 "psraw $3, %%mm7 \n\t"
/*
 * REAL_YSCALEYUV2RGB_UV(index, c): blends two chroma source lines and
 * starts the colour conversion for four chroma samples.
 *
 *   index - register used as byte index; it is cleared first
 *   c     - base register for the context-relative displacements
 *
 * Operands %2 and %3 are the two chroma line pointers. For the plane at
 * offset 0 and the plane at UV_OFF_BYTE(c) (index is temporarily
 * advanced by that amount and restored), the code computes
 *     (line3 >> 4) + pmulhw(line2 - line3, coeff)
 * with coeff read from CHR_MMX_FILTER_OFFSET+8(c). The results land in
 * mm3 and mm4, then U_OFFSET / V_OFFSET are subtracted, copies are kept
 * in mm2 / mm5, and mm3 / mm4 are scaled by UG_COEFF / VG_COEFF.
 *
 * Registers written: index, mm0, mm2, mm3, mm4, mm5.
 *
 * NOTE(review): lines between the "xor" and the first load are missing
 * from this excerpt (presumably a loop label) and the definition ends
 * in a line continuation -- confirm against the full file.
 */
712 #define REAL_YSCALEYUV2RGB_UV(index, c) \
713 "xor "#index", "#index" \n\t"\
716 "movq (%2, "#index"), %%mm2 \n\t" \
717 "movq (%3, "#index"), %%mm3 \n\t" \
718 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
719 "movq (%2, "#index"), %%mm5 \n\t" \
720 "movq (%3, "#index"), %%mm4 \n\t" \
721 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
722 "psubw %%mm3, %%mm2 \n\t" \
723 "psubw %%mm4, %%mm5 \n\t" \
724 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
725 "pmulhw %%mm0, %%mm2 \n\t" \
726 "pmulhw %%mm0, %%mm5 \n\t" \
727 "psraw $4, %%mm3 \n\t" \
728 "psraw $4, %%mm4 \n\t" \
729 "paddw %%mm2, %%mm3 \n\t" \
730 "paddw %%mm5, %%mm4 \n\t" \
731 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \
732 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \
733 "movq %%mm3, %%mm2 \n\t" \
734 "movq %%mm4, %%mm5 \n\t" \
735 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
736 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
/*
 * REAL_YSCALEYUV2RGB_YA(index, c, b1, b2): blends eight 16-bit samples
 * from two source lines.
 *
 *   index  - sample index register (scaled by 2 for addressing)
 *   c      - base register for the context-relative displacements
 *   b1, b2 - registers / operands holding the two line pointers
 *
 * For samples 0-3 (result in mm1) and 4-7 (result in mm7) it computes
 *     (b2 >> 4) + pmulhw(b1 - b2, coeff)
 * with coeff read from LUM_MMX_FILTER_OFFSET+8(c).
 * Registers written: mm0, mm1, mm6, mm7.
 *
 * NOTE(review): the definition ends in a line continuation; any
 * following lines are not visible in this excerpt.
 */
739 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
740 "movq ("#b1", "#index", 2), %%mm0 \n\t" \
741 "movq ("#b2", "#index", 2), %%mm1 \n\t" \
742 "movq 8("#b1", "#index", 2), %%mm6 \n\t" \
743 "movq 8("#b2", "#index", 2), %%mm7 \n\t" \
744 "psubw %%mm1, %%mm0 \n\t" \
745 "psubw %%mm7, %%mm6 \n\t" \
746 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \
747 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \
748 "psraw $4, %%mm1 \n\t" \
749 "psraw $4, %%mm7 \n\t" \
750 "paddw %%mm0, %%mm1 \n\t" \
751 "paddw %%mm6, %%mm7 \n\t" \
/*
 * REAL_YSCALEYUV2RGB_COEFF(c): final stage of the colour conversion,
 * with constants addressed off register c.
 *
 * Inputs:  mm2, mm5  offset-corrected chroma copies (scaled here by
 *                    UB_COEFF and VR_COEFF)
 *          mm3, mm4  chroma terms already scaled by the *G_COEFF
 *                    constants (summed into mm4)
 *          mm1, mm7  eight blended luma words (Y_OFFSET subtracted and
 *                    scaled by Y_COEFF here)
 * Each chroma term is duplicated word-wise so that one chroma sample is
 * added to two adjacent luma samples; the sums are packed to bytes with
 * unsigned saturation.
 * Outputs: mm2 (UB term), mm4 (UG+VG terms), mm5 (VR term), eight bytes
 *          each -- presumably B, G, R going by the coefficient names;
 *          confirm.
 * Also written: mm0, mm1, mm3, mm6, mm7.
 *
 * NOTE(review): the definition ends in a line continuation; any
 * following lines are not visible in this excerpt.
 */
753 #define REAL_YSCALEYUV2RGB_COEFF(c) \
754 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
755 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
756 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \
757 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \
758 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
759 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
761 "paddw %%mm3, %%mm4 \n\t"\
762 "movq %%mm2, %%mm0 \n\t"\
763 "movq %%mm5, %%mm6 \n\t"\
764 "movq %%mm4, %%mm3 \n\t"\
765 "punpcklwd %%mm2, %%mm2 \n\t"\
766 "punpcklwd %%mm5, %%mm5 \n\t"\
767 "punpcklwd %%mm4, %%mm4 \n\t"\
768 "paddw %%mm1, %%mm2 \n\t"\
769 "paddw %%mm1, %%mm5 \n\t"\
770 "paddw %%mm1, %%mm4 \n\t"\
771 "punpckhwd %%mm0, %%mm0 \n\t"\
772 "punpckhwd %%mm6, %%mm6 \n\t"\
773 "punpckhwd %%mm3, %%mm3 \n\t"\
774 "paddw %%mm7, %%mm0 \n\t"\
775 "paddw %%mm7, %%mm6 \n\t"\
776 "paddw %%mm7, %%mm3 \n\t"\
778 "packuswb %%mm0, %%mm2 \n\t"\
779 "packuswb %%mm6, %%mm5 \n\t"\
780 "packuswb %%mm3, %%mm4 \n\t"\
/*
 * YSCALEYUV2RGB_YA forwards to REAL_YSCALEYUV2RGB_YA so that macro
 * arguments are expanded before they are stringized.
 *
 * YSCALEYUV2RGB(index, c) chains the three stages of the two-line
 * conversion: chroma blend, luma blend (line pointers taken from asm
 * operands %0 and %1) and the coefficient / packing stage.
 */
782 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
784 #define YSCALEYUV2RGB(index, c) \
785 REAL_YSCALEYUV2RGB_UV(index, c) \
786 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
787 REAL_YSCALEYUV2RGB_COEFF(c)
793 const int16_t *ubuf[2],
const int16_t *vbuf[2],
794 const int16_t *abuf[2], uint8_t *dest,
795 int dstW,
int yalpha,
int uvalpha,
int y)
797 const int16_t *buf0 = buf[0], *buf1 = buf[1],
798 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
800 if (CONFIG_SWSCALE_ALPHA &&
c->needAlpha) {
801 const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
806 "psraw $3, %%mm1 \n\t"
807 "psraw $3, %%mm7 \n\t"
808 "packuswb %%mm7, %%mm1 \n\t"
809 WRITEBGR32(%4,
DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
810 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"r" (dest),
812 "r" (abuf0),
"r" (abuf1)
816 c->u_temp=(intptr_t)abuf0;
817 c->v_temp=(intptr_t)abuf1;
820 "mov %4, %%"FF_REG_b
" \n\t"
821 "push %%"FF_REG_BP
" \n\t"
825 "mov "U_TEMP"(%5), %0 \n\t"
826 "mov "V_TEMP"(%5), %1 \n\t"
828 "psraw $3, %%mm1 \n\t"
829 "psraw $3, %%mm7 \n\t"
830 "packuswb %%mm7, %%mm1 \n\t"
833 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
834 "pop %%"FF_REG_BP
" \n\t"
836 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
843 "mov %4, %%"FF_REG_b
" \n\t"
844 "push %%"FF_REG_BP
" \n\t"
846 "pcmpeqd %%mm7, %%mm7 \n\t"
847 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
848 "pop %%"FF_REG_BP
" \n\t"
850 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
857 const int16_t *ubuf[2],
const int16_t *vbuf[2],
858 const int16_t *abuf[2], uint8_t *dest,
859 int dstW,
int yalpha,
int uvalpha,
int y)
861 const int16_t *buf0 = buf[0], *buf1 = buf[1],
862 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
866 "mov %4, %%"FF_REG_b
" \n\t"
867 "push %%"FF_REG_BP
" \n\t"
869 "pxor %%mm7, %%mm7 \n\t"
871 "pop %%"FF_REG_BP
" \n\t"
873 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
880 const int16_t *ubuf[2],
const int16_t *vbuf[2],
881 const int16_t *abuf[2], uint8_t *dest,
882 int dstW,
int yalpha,
int uvalpha,
int y)
884 const int16_t *buf0 = buf[0], *buf1 = buf[1],
885 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
889 "mov %4, %%"FF_REG_b
" \n\t"
890 "push %%"FF_REG_BP
" \n\t"
892 "pxor %%mm7, %%mm7 \n\t"
900 "pop %%"FF_REG_BP
" \n\t"
902 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
909 const int16_t *ubuf[2],
const int16_t *vbuf[2],
910 const int16_t *abuf[2], uint8_t *dest,
911 int dstW,
int yalpha,
int uvalpha,
int y)
913 const int16_t *buf0 = buf[0], *buf1 = buf[1],
914 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
918 "mov %4, %%"FF_REG_b
" \n\t"
919 "push %%"FF_REG_BP
" \n\t"
921 "pxor %%mm7, %%mm7 \n\t"
929 "pop %%"FF_REG_BP
" \n\t"
931 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
/*
 * REAL_YSCALEYUV2PACKED(index, c): blends two source lines for luma
 * (operands %0 / %1) and for two chroma planes (operands %2 / %3)
 * without any colour conversion.
 *
 * Side effect on memory: the coefficient quadwords stored at
 * CHR_MMX_FILTER_OFFSET+8(c) and LUM_MMX_FILTER_OFFSET+8(c) are loaded,
 * shifted right arithmetically by 3 and written back before the blend,
 * so the stored values are modified every time this fragment executes.
 *
 * The blend itself is
 *     (lineB >> 7) + pmulhw(lineA - lineB, coeff)
 * Results: mm3 (plane at offset 0) and mm4 (plane at UV_OFF_BYTE(c))
 * hold four chroma words each; mm1 / mm7 hold luma samples 0-3 / 4-7.
 * index is cleared first and is temporarily advanced by UV_OFF_BYTE(c)
 * while the second chroma plane is read.
 * Registers written: index, mm0-mm7.
 *
 * YSCALEYUV2PACKED forwards to REAL_YSCALEYUV2PACKED so that macro
 * arguments are expanded before they are stringized.
 *
 * NOTE(review): lines after the "xor" (presumably a loop label) are
 * missing from this excerpt -- confirm against the full file.
 */
937 #define REAL_YSCALEYUV2PACKED(index, c) \
938 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
939 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
940 "psraw $3, %%mm0 \n\t"\
941 "psraw $3, %%mm1 \n\t"\
942 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
943 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
944 "xor "#index", "#index" \n\t"\
947 "movq (%2, "#index"), %%mm2 \n\t" \
948 "movq (%3, "#index"), %%mm3 \n\t" \
949 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
950 "movq (%2, "#index"), %%mm5 \n\t" \
951 "movq (%3, "#index"), %%mm4 \n\t" \
952 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
953 "psubw %%mm3, %%mm2 \n\t" \
954 "psubw %%mm4, %%mm5 \n\t" \
955 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
956 "pmulhw %%mm0, %%mm2 \n\t" \
957 "pmulhw %%mm0, %%mm5 \n\t" \
958 "psraw $7, %%mm3 \n\t" \
959 "psraw $7, %%mm4 \n\t" \
960 "paddw %%mm2, %%mm3 \n\t" \
961 "paddw %%mm5, %%mm4 \n\t" \
962 "movq (%0, "#index", 2), %%mm0 \n\t" \
963 "movq (%1, "#index", 2), %%mm1 \n\t" \
964 "movq 8(%0, "#index", 2), %%mm6 \n\t" \
965 "movq 8(%1, "#index", 2), %%mm7 \n\t" \
966 "psubw %%mm1, %%mm0 \n\t" \
967 "psubw %%mm7, %%mm6 \n\t" \
968 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \
969 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \
970 "psraw $7, %%mm1 \n\t" \
971 "psraw $7, %%mm7 \n\t" \
972 "paddw %%mm0, %%mm1 \n\t" \
973 "paddw %%mm6, %%mm7 \n\t" \
975 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
978 const int16_t *ubuf[2],
const int16_t *vbuf[2],
979 const int16_t *abuf[2], uint8_t *dest,
980 int dstW,
int yalpha,
int uvalpha,
int y)
982 const int16_t *buf0 = buf[0], *buf1 = buf[1],
983 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
987 "mov %4, %%"FF_REG_b
" \n\t"
988 "push %%"FF_REG_BP
" \n\t"
991 "pop %%"FF_REG_BP
" \n\t"
993 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
/*
 * REAL_YSCALEYUV2RGB1(index, c): single-source-line colour conversion.
 * No blending is done: chroma is read only through operand %2 and luma
 * only through operand %0.
 *
 *   index - register used as byte index; it is cleared first
 *   c     - base register for the context-relative displacements
 *
 * Steps: chroma words (plane at offset 0 into mm3, plane at
 * UV_OFF_BYTE(c) into mm4) and luma words (mm1 / mm7) are shifted right
 * arithmetically by 4, the U/V/Y offsets are subtracted and the
 * coefficients applied exactly as in the two-line conversion; each
 * chroma term is duplicated word-wise across two luma samples and the
 * sums are packed to bytes with unsigned saturation.
 * Outputs: mm2 (UB term), mm4 (UG+VG terms), mm5 (VR term), eight bytes
 *          each -- presumably B, G, R going by the coefficient names;
 *          confirm.
 * Registers written: index, mm0-mm7.
 *
 * YSCALEYUV2RGB1 forwards to REAL_YSCALEYUV2RGB1 so that macro arguments
 * are expanded before they are stringized.
 *
 * NOTE(review): lines after the "xor" (presumably a loop label) are
 * missing from this excerpt -- confirm against the full file.
 */
998 #define REAL_YSCALEYUV2RGB1(index, c) \
999 "xor "#index", "#index" \n\t"\
1002 "movq (%2, "#index"), %%mm3 \n\t" \
1003 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1004 "movq (%2, "#index"), %%mm4 \n\t" \
1005 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1006 "psraw $4, %%mm3 \n\t" \
1007 "psraw $4, %%mm4 \n\t" \
1008 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \
1009 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \
1010 "movq %%mm3, %%mm2 \n\t" \
1011 "movq %%mm4, %%mm5 \n\t" \
1012 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1013 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1015 "movq (%0, "#index", 2), %%mm1 \n\t" \
1016 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
1017 "psraw $4, %%mm1 \n\t" \
1018 "psraw $4, %%mm7 \n\t" \
1019 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1020 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1021 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \
1022 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \
1023 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1024 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1026 "paddw %%mm3, %%mm4 \n\t"\
1027 "movq %%mm2, %%mm0 \n\t"\
1028 "movq %%mm5, %%mm6 \n\t"\
1029 "movq %%mm4, %%mm3 \n\t"\
1030 "punpcklwd %%mm2, %%mm2 \n\t"\
1031 "punpcklwd %%mm5, %%mm5 \n\t"\
1032 "punpcklwd %%mm4, %%mm4 \n\t"\
1033 "paddw %%mm1, %%mm2 \n\t"\
1034 "paddw %%mm1, %%mm5 \n\t"\
1035 "paddw %%mm1, %%mm4 \n\t"\
1036 "punpckhwd %%mm0, %%mm0 \n\t"\
1037 "punpckhwd %%mm6, %%mm6 \n\t"\
1038 "punpckhwd %%mm3, %%mm3 \n\t"\
1039 "paddw %%mm7, %%mm0 \n\t"\
1040 "paddw %%mm7, %%mm6 \n\t"\
1041 "paddw %%mm7, %%mm3 \n\t"\
1043 "packuswb %%mm0, %%mm2 \n\t"\
1044 "packuswb %%mm6, %%mm5 \n\t"\
1045 "packuswb %%mm3, %%mm4 \n\t"\
1047 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
/*
 * REAL_YSCALEYUV2RGB1b(index, c): colour conversion with one luma line
 * (operand %0) and the plain sum of two chroma lines (operands %2 and
 * %3).
 *
 *   index - register used as byte index; it is cleared first
 *   c     - base register for the context-relative displacements
 *
 * Chroma: for the plane at offset 0 (mm3) and the plane at
 * UV_OFF_BYTE(c) (mm4) the two lines are added and the sum is shifted
 * right by 5 with psrlw, a logical (unsigned) shift. Luma is shifted
 * right arithmetically by 4. The rest (offset subtraction, coefficient
 * multiplies, word duplication, unsigned-saturating pack) is identical
 * to the single-line conversion.
 * Outputs: mm2 (UB term), mm4 (UG+VG terms), mm5 (VR term), eight bytes
 *          each -- presumably B, G, R going by the coefficient names;
 *          confirm.
 * Registers written: index, mm0-mm7.
 *
 * YSCALEYUV2RGB1b forwards to REAL_YSCALEYUV2RGB1b so that macro
 * arguments are expanded before they are stringized.
 *
 * NOTE(review): lines after the "xor" (presumably a loop label) are
 * missing from this excerpt -- confirm against the full file.
 */
1050 #define REAL_YSCALEYUV2RGB1b(index, c) \
1051 "xor "#index", "#index" \n\t"\
1054 "movq (%2, "#index"), %%mm2 \n\t" \
1055 "movq (%3, "#index"), %%mm3 \n\t" \
1056 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1057 "movq (%2, "#index"), %%mm5 \n\t" \
1058 "movq (%3, "#index"), %%mm4 \n\t" \
1059 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1060 "paddw %%mm2, %%mm3 \n\t" \
1061 "paddw %%mm5, %%mm4 \n\t" \
1062 "psrlw $5, %%mm3 \n\t" \
1063 "psrlw $5, %%mm4 \n\t" \
1064 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \
1065 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \
1066 "movq %%mm3, %%mm2 \n\t" \
1067 "movq %%mm4, %%mm5 \n\t" \
1068 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
1069 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
1071 "movq (%0, "#index", 2), %%mm1 \n\t" \
1072 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
1073 "psraw $4, %%mm1 \n\t" \
1074 "psraw $4, %%mm7 \n\t" \
1075 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
1076 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
1077 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \
1078 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \
1079 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
1080 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
1082 "paddw %%mm3, %%mm4 \n\t"\
1083 "movq %%mm2, %%mm0 \n\t"\
1084 "movq %%mm5, %%mm6 \n\t"\
1085 "movq %%mm4, %%mm3 \n\t"\
1086 "punpcklwd %%mm2, %%mm2 \n\t"\
1087 "punpcklwd %%mm5, %%mm5 \n\t"\
1088 "punpcklwd %%mm4, %%mm4 \n\t"\
1089 "paddw %%mm1, %%mm2 \n\t"\
1090 "paddw %%mm1, %%mm5 \n\t"\
1091 "paddw %%mm1, %%mm4 \n\t"\
1092 "punpckhwd %%mm0, %%mm0 \n\t"\
1093 "punpckhwd %%mm6, %%mm6 \n\t"\
1094 "punpckhwd %%mm3, %%mm3 \n\t"\
1095 "paddw %%mm7, %%mm0 \n\t"\
1096 "paddw %%mm7, %%mm6 \n\t"\
1097 "paddw %%mm7, %%mm3 \n\t"\
1099 "packuswb %%mm0, %%mm2 \n\t"\
1100 "packuswb %%mm6, %%mm5 \n\t"\
1101 "packuswb %%mm3, %%mm4 \n\t"\
1103 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
/*
 * REAL_YSCALEYUV2RGB1_ALPHA(index): loads eight 16-bit samples through
 * asm operand %1 at sample offset index, shifts them right
 * arithmetically by 7 and packs them to eight bytes in mm7 with
 * unsigned saturation. mm1 is used as scratch and overwritten.
 *
 * YSCALEYUV2RGB1_ALPHA forwards to the REAL_ variant so that the macro
 * argument is expanded before it is stringized.
 */
1105 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \
1106 "movq (%1, "#index", 2), %%mm7 \n\t" \
1107 "movq 8(%1, "#index", 2), %%mm1 \n\t" \
1108 "psraw $7, %%mm7 \n\t" \
1109 "psraw $7, %%mm1 \n\t" \
1110 "packuswb %%mm1, %%mm7 \n\t"
1111 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
1117 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1118 const int16_t *abuf0, uint8_t *dest,
1119 int dstW,
int uvalpha,
int y)
1121 const int16_t *ubuf0 = ubuf[0];
1122 const int16_t *buf1= buf0;
1124 if (uvalpha < 2048) {
1125 const int16_t *ubuf1 = ubuf[0];
1126 if (CONFIG_SWSCALE_ALPHA &&
c->needAlpha) {
1129 "mov %4, %%"FF_REG_b
" \n\t"
1130 "push %%"FF_REG_BP
" \n\t"
1133 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1134 "pop %%"FF_REG_BP
" \n\t"
1136 ::
"c" (buf0),
"d" (abuf0),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1142 "mov %4, %%"FF_REG_b
" \n\t"
1143 "push %%"FF_REG_BP
" \n\t"
1145 "pcmpeqd %%mm7, %%mm7 \n\t"
1146 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1147 "pop %%"FF_REG_BP
" \n\t"
1149 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1154 const int16_t *ubuf1 = ubuf[1];
1155 if (CONFIG_SWSCALE_ALPHA &&
c->needAlpha) {
1158 "mov %4, %%"FF_REG_b
" \n\t"
1159 "push %%"FF_REG_BP
" \n\t"
1162 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1163 "pop %%"FF_REG_BP
" \n\t"
1165 ::
"c" (buf0),
"d" (abuf0),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1171 "mov %4, %%"FF_REG_b
" \n\t"
1172 "push %%"FF_REG_BP
" \n\t"
1174 "pcmpeqd %%mm7, %%mm7 \n\t"
1175 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1176 "pop %%"FF_REG_BP
" \n\t"
1178 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1186 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1187 const int16_t *abuf0, uint8_t *dest,
1188 int dstW,
int uvalpha,
int y)
1190 const int16_t *ubuf0 = ubuf[0];
1191 const int16_t *buf1= buf0;
1193 if (uvalpha < 2048) {
1194 const int16_t *ubuf1 = ubuf[0];
1197 "mov %4, %%"FF_REG_b
" \n\t"
1198 "push %%"FF_REG_BP
" \n\t"
1200 "pxor %%mm7, %%mm7 \n\t"
1202 "pop %%"FF_REG_BP
" \n\t"
1204 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1209 const int16_t *ubuf1 = ubuf[1];
1212 "mov %4, %%"FF_REG_b
" \n\t"
1213 "push %%"FF_REG_BP
" \n\t"
1215 "pxor %%mm7, %%mm7 \n\t"
1217 "pop %%"FF_REG_BP
" \n\t"
1219 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1227 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1228 const int16_t *abuf0, uint8_t *dest,
1229 int dstW,
int uvalpha,
int y)
1231 const int16_t *ubuf0 = ubuf[0];
1232 const int16_t *buf1= buf0;
1234 if (uvalpha < 2048) {
1235 const int16_t *ubuf1 = ubuf[0];
1238 "mov %4, %%"FF_REG_b
" \n\t"
1239 "push %%"FF_REG_BP
" \n\t"
1241 "pxor %%mm7, %%mm7 \n\t"
1249 "pop %%"FF_REG_BP
" \n\t"
1251 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1256 const int16_t *ubuf1 = ubuf[1];
1259 "mov %4, %%"FF_REG_b
" \n\t"
1260 "push %%"FF_REG_BP
" \n\t"
1262 "pxor %%mm7, %%mm7 \n\t"
1270 "pop %%"FF_REG_BP
" \n\t"
1272 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1280 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1281 const int16_t *abuf0, uint8_t *dest,
1282 int dstW,
int uvalpha,
int y)
1284 const int16_t *ubuf0 = ubuf[0];
1285 const int16_t *buf1= buf0;
1287 if (uvalpha < 2048) {
1288 const int16_t *ubuf1 = ubuf[0];
1291 "mov %4, %%"FF_REG_b
" \n\t"
1292 "push %%"FF_REG_BP
" \n\t"
1294 "pxor %%mm7, %%mm7 \n\t"
1302 "pop %%"FF_REG_BP
" \n\t"
1304 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1309 const int16_t *ubuf1 = ubuf[1];
1312 "mov %4, %%"FF_REG_b
" \n\t"
1313 "push %%"FF_REG_BP
" \n\t"
1315 "pxor %%mm7, %%mm7 \n\t"
1323 "pop %%"FF_REG_BP
" \n\t"
1325 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
/*
 * REAL_YSCALEYUV2PACKED1(index, c): single-source-line load for packed
 * output, without blending or colour conversion.
 *
 *   index - register used as byte index; it is cleared first
 *   c     - base register for the UV_OFF_BYTE displacement
 *
 * Chroma is read through operand %2 (plane at offset 0 into mm3, plane
 * at UV_OFF_BYTE(c) into mm4) and luma through operand %0 (samples 0-3
 * into mm1, 4-7 into mm7); every word is shifted right arithmetically
 * by 7.
 * Registers written: index, mm1, mm3, mm4, mm7.
 *
 * YSCALEYUV2PACKED1 forwards to REAL_YSCALEYUV2PACKED1 so that macro
 * arguments are expanded before they are stringized.
 *
 * NOTE(review): lines after the "xor" (presumably a loop label) are
 * missing from this excerpt -- confirm against the full file.
 */
1332 #define REAL_YSCALEYUV2PACKED1(index, c) \
1333 "xor "#index", "#index" \n\t"\
1336 "movq (%2, "#index"), %%mm3 \n\t" \
1337 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1338 "movq (%2, "#index"), %%mm4 \n\t" \
1339 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1340 "psraw $7, %%mm3 \n\t" \
1341 "psraw $7, %%mm4 \n\t" \
1342 "movq (%0, "#index", 2), %%mm1 \n\t" \
1343 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
1344 "psraw $7, %%mm1 \n\t" \
1345 "psraw $7, %%mm7 \n\t" \
1347 #define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
/*
 * REAL_YSCALEYUV2PACKED1b(index, c): like the single-line packed load,
 * but chroma is the plain sum of two source lines.
 *
 *   index - register used as byte index; it is cleared first
 *   c     - base register for the UV_OFF_BYTE displacement
 *
 * Chroma: for the plane at offset 0 (mm3) and the plane at
 * UV_OFF_BYTE(c) (mm4) the lines from operands %2 and %3 are added and
 * the sum is shifted right by 8 with psrlw, a logical (unsigned) shift.
 * Luma is read through operand %0 (mm1 / mm7) and shifted right
 * arithmetically by 7.
 * Registers written: index, mm1, mm2, mm3, mm4, mm5, mm7.
 *
 * YSCALEYUV2PACKED1b forwards to REAL_YSCALEYUV2PACKED1b so that macro
 * arguments are expanded before they are stringized.
 *
 * NOTE(review): lines after the "xor" (presumably a loop label) are
 * missing from this excerpt -- confirm against the full file.
 */
1349 #define REAL_YSCALEYUV2PACKED1b(index, c) \
1350 "xor "#index", "#index" \n\t"\
1353 "movq (%2, "#index"), %%mm2 \n\t" \
1354 "movq (%3, "#index"), %%mm3 \n\t" \
1355 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1356 "movq (%2, "#index"), %%mm5 \n\t" \
1357 "movq (%3, "#index"), %%mm4 \n\t" \
1358 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
1359 "paddw %%mm2, %%mm3 \n\t" \
1360 "paddw %%mm5, %%mm4 \n\t" \
1361 "psrlw $8, %%mm3 \n\t" \
1362 "psrlw $8, %%mm4 \n\t" \
1363 "movq (%0, "#index", 2), %%mm1 \n\t" \
1364 "movq 8(%0, "#index", 2), %%mm7 \n\t" \
1365 "psraw $7, %%mm1 \n\t" \
1366 "psraw $7, %%mm7 \n\t"
1367 #define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
1370 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1371 const int16_t *abuf0, uint8_t *dest,
1372 int dstW,
int uvalpha,
int y)
1374 const int16_t *ubuf0 = ubuf[0];
1375 const int16_t *buf1= buf0;
1377 if (uvalpha < 2048) {
1378 const int16_t *ubuf1 = ubuf[0];
1381 "mov %4, %%"FF_REG_b
" \n\t"
1382 "push %%"FF_REG_BP
" \n\t"
1385 "pop %%"FF_REG_BP
" \n\t"
1387 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1391 const int16_t *ubuf1 = ubuf[1];
1394 "mov %4, %%"FF_REG_b
" \n\t"
1395 "push %%"FF_REG_BP
" \n\t"
1398 "pop %%"FF_REG_BP
" \n\t"
1400 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1409 c->use_mmx_vfilter= 0;
1415 switch (
c->dstFormat) {
1427 c->use_mmx_vfilter= 1;
1429 switch (
c->dstFormat) {
1443 switch (
c->dstFormat) {
1445 c->yuv2packed1 =
RENAME(yuv2rgb32_1);
1446 c->yuv2packed2 =
RENAME(yuv2rgb32_2);
1449 c->yuv2packed1 =
RENAME(yuv2bgr24_1);
1450 c->yuv2packed2 =
RENAME(yuv2bgr24_2);
1453 c->yuv2packed1 =
RENAME(yuv2rgb555_1);
1454 c->yuv2packed2 =
RENAME(yuv2rgb555_2);
1457 c->yuv2packed1 =
RENAME(yuv2rgb565_1);
1458 c->yuv2packed2 =
RENAME(yuv2rgb565_2);
1461 c->yuv2packed1 =
RENAME(yuv2yuyv422_1);
1462 c->yuv2packed2 =
RENAME(yuv2yuyv422_2);
1470 if (
c->srcBpc == 8 &&
c->dstBpc <= 14) {
1476 c->hyscale_fast =
NULL;
1477 c->hcscale_fast =
NULL;