47 "movq (%3), %%mm0 \n\t"
48 "movq 8(%3), %%mm1 \n\t"
49 "movq 16(%3), %%mm2 \n\t"
50 "movq 24(%3), %%mm3 \n\t"
51 "movq 32(%3), %%mm4 \n\t"
52 "movq 40(%3), %%mm5 \n\t"
53 "movq 48(%3), %%mm6 \n\t"
54 "movq 56(%3), %%mm7 \n\t"
55 "packuswb %%mm1, %%mm0 \n\t"
56 "packuswb %%mm3, %%mm2 \n\t"
57 "packuswb %%mm5, %%mm4 \n\t"
58 "packuswb %%mm7, %%mm6 \n\t"
59 "movq %%mm0, (%0) \n\t"
60 "movq %%mm2, (%0, %1) \n\t"
61 "movq %%mm4, (%0, %1, 2) \n\t"
62 "movq %%mm6, (%0, %2) \n\t"
73 "movq (%3), %%mm0 \n\t"
74 "movq 8(%3), %%mm1 \n\t"
75 "movq 16(%3), %%mm2 \n\t"
76 "movq 24(%3), %%mm3 \n\t"
77 "movq 32(%3), %%mm4 \n\t"
78 "movq 40(%3), %%mm5 \n\t"
79 "movq 48(%3), %%mm6 \n\t"
80 "movq 56(%3), %%mm7 \n\t"
81 "packuswb %%mm1, %%mm0 \n\t"
82 "packuswb %%mm3, %%mm2 \n\t"
83 "packuswb %%mm5, %%mm4 \n\t"
84 "packuswb %%mm7, %%mm6 \n\t"
85 "movq %%mm0, (%0) \n\t"
86 "movq %%mm2, (%0, %1) \n\t"
87 "movq %%mm4, (%0, %1, 2) \n\t"
88 "movq %%mm6, (%0, %2) \n\t"
89 ::
"r"(pix),
"r"((
x86_reg)line_size),
"r"((
x86_reg)line_size * 3),
"r"(p)
/*
 * Emits the asm text for one group of four output rows of a signed,
 * clamped block store.  `off` is a byte offset that is stringized into
 * every source address relative to operand %2.
 *
 * For each of the four rows: load the quadword at off + 16 * k (%2), pack
 * it together with the quadword at off + 16 * k + 8 (%2) into eight bytes
 * using signed saturation (packsswb), add mm0 byte-wise (paddb, wrapping),
 * and store the eight bytes at (%0), (%0, %3), (%0, %3, 2) and (%0, %1)
 * respectively.
 *
 * mm0 is only read here.  NOTE(review): it is presumably preloaded at the
 * use site with a per-byte bias that maps signed to unsigned -- confirm.
 * Registers written: mm1, mm2, mm3, mm4.
 */
93 #define put_signed_pixels_clamped_mmx_half(off) \
94 "movq "#off"(%2), %%mm1 \n\t" \
95 "movq 16 + "#off"(%2), %%mm2 \n\t" \
96 "movq 32 + "#off"(%2), %%mm3 \n\t" \
97 "movq 48 + "#off"(%2), %%mm4 \n\t" \
98 "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
99 "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
100 "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
101 "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
102 "paddb %%mm0, %%mm1 \n\t" \
103 "paddb %%mm0, %%mm2 \n\t" \
104 "paddb %%mm0, %%mm3 \n\t" \
105 "paddb %%mm0, %%mm4 \n\t" \
106 "movq %%mm1, (%0) \n\t" \
107 "movq %%mm2, (%0, %3) \n\t" \
108 "movq %%mm3, (%0, %3, 2) \n\t" \
109 "movq %%mm4, (%0, %1) \n\t"
/* %1 (line_skip3) = 3 * %3 (line_skip); the macro uses %1 as its fourth-row offset. */
119 "lea (%3, %3, 2), %1 \n\t"
/* First group of four rows: source offset 0 from %2 (block). */
120 put_signed_pixels_clamped_mmx_half(0)
/* Advance %0 (pixels) by 4 * %3 bytes, i.e. four rows of line_skip. */
121 "lea (%0, %3, 4), %0 \n\t"
/* Second group of four rows: source offset 64 bytes from %2. */
122 put_signed_pixels_clamped_mmx_half(64)
/*
 * Outputs: pixels is read-write and modified by the lea above; line_skip3 is
 * write-only scratch.  Both are early-clobber ("&") because they are written
 * before the inputs have been read for the last time.
 */
123 :
"+&r"(pixels),
"=&r"(line_skip3)
/* Inputs: %2 = block, %3 = line_skip. */
124 :
"r"(block),
"r"(line_skip)
/*
 * Processes two rows per pass.  %0 and %1 are the read-write memory operands
 * *pix and *(pix + line_size); %2 is a pointer operand declared outside this
 * excerpt (NOTE(review): presumably the 16-bit coefficient block -- confirm).
 * mm7 supplies the high bytes for the byte-to-word unpacks, so it is assumed
 * to be zero on entry -- TODO confirm it is cleared before this asm.
 */
/* Load four quadwords (32 bytes) from %2. */
142 "movq (%2), %%mm0 \n\t"
143 "movq 8(%2), %%mm1 \n\t"
144 "movq 16(%2), %%mm2 \n\t"
145 "movq 24(%2), %%mm3 \n\t"
/* Load eight bytes from each of the two rows. */
146 "movq %0, %%mm4 \n\t"
147 "movq %1, %%mm6 \n\t"
/* Row 0: widen low/high four bytes to words and add with signed saturation. */
148 "movq %%mm4, %%mm5 \n\t"
149 "punpcklbw %%mm7, %%mm4 \n\t"
150 "punpckhbw %%mm7, %%mm5 \n\t"
151 "paddsw %%mm4, %%mm0 \n\t"
152 "paddsw %%mm5, %%mm1 \n\t"
/* Row 1: same widening and saturating add. */
153 "movq %%mm6, %%mm5 \n\t"
154 "punpcklbw %%mm7, %%mm6 \n\t"
155 "punpckhbw %%mm7, %%mm5 \n\t"
156 "paddsw %%mm6, %%mm2 \n\t"
157 "paddsw %%mm5, %%mm3 \n\t"
/* Pack the word sums back to bytes with unsigned saturation and store both rows. */
158 "packuswb %%mm1, %%mm0 \n\t"
159 "packuswb %%mm3, %%mm2 \n\t"
160 "movq %%mm0, %0 \n\t"
161 "movq %%mm2, %1 \n\t"
162 :
"+m"(*pix),
"+m"(*(pix + line_size))
/* Step the destination pointer past the two rows just written. */
165 pix += line_size * 2;
/*
 * CLEAR_BLOCKS(name, n): defines `void name(int16_t *blocks)`.
 *
 * The visible asm zeroes mm7, loads operand %1 into REG_a, and then writes
 * four zero quadwords (32 bytes) at %0 + REG_a before adding 32 to REG_a.
 * Operand %0 is (uint8_t *)blocks + 128 * n, i.e. the address 128 * n bytes
 * past the start of `blocks`.
 *
 * NOTE(review): operand %1, the loop label and the branch are not part of
 * this excerpt; presumably %1 is a negative byte count so that REG_a counts
 * up to zero -- confirm against the full definition.
 */
170 #define CLEAR_BLOCKS(name, n) \
171 void name(int16_t *blocks) \
174 "pxor %%mm7, %%mm7 \n\t" \
175 "mov %1, %%"REG_a" \n\t" \
177 "movq %%mm7, (%0, %%"REG_a") \n\t" \
178 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
179 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
180 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
181 "add $32, %%"REG_a" \n\t" \
183 :: "r"(((uint8_t *)blocks) + 128 * n), \
/* Zero xmm0, then clear 128 contiguous bytes at %0 with eight 16-byte stores. movaps faults unless %0 is 16-byte aligned. */
194 "xorps %%xmm0, %%xmm0 \n"
195 "movaps %%xmm0, (%0) \n"
196 "movaps %%xmm0, 16(%0) \n"
197 "movaps %%xmm0, 32(%0) \n"
198 "movaps %%xmm0, 48(%0) \n"
199 "movaps %%xmm0, 64(%0) \n"
200 "movaps %%xmm0, 80(%0) \n"
201 "movaps %%xmm0, 96(%0) \n"
202 "movaps %%xmm0, 112(%0) \n"
/*
 * Indexed variant of the 128-byte clear: xmm0 is zeroed once, REG_a is
 * loaded from operand %1 and used as a byte index added to base %0.
 * NOTE(review): operand %1, the loop label and the branch are not visible in
 * this excerpt; presumably %1 is a negative byte count -- confirm.
 */
211 "xorps %%xmm0, %%xmm0 \n"
212 "mov %1, %%"REG_a
" \n"
/* Clear 128 bytes at %0 + REG_a (aligned stores: the address must be 16-byte aligned). */
214 "movaps %%xmm0, (%0, %%"REG_a
") \n"
215 "movaps %%xmm0, 16(%0, %%"REG_a
") \n"
216 "movaps %%xmm0, 32(%0, %%"REG_a
") \n"
217 "movaps %%xmm0, 48(%0, %%"REG_a
") \n"
218 "movaps %%xmm0, 64(%0, %%"REG_a
") \n"
219 "movaps %%xmm0, 80(%0, %%"REG_a
") \n"
220 "movaps %%xmm0, 96(%0, %%"REG_a
") \n"
221 "movaps %%xmm0, 112(%0, %%"REG_a
") \n"
/* Advance the index by the 128 bytes just cleared. */
222 "add $128, %%"REG_a
" \n"
/* %0 = address 128 * 6 bytes past the start of blocks. */
224 ::
"r"(((
uint8_t *)blocks) + 128 * 6),
/*
 * Adds 16 bytes per pass with wrapping byte arithmetic (paddb):
 * bytes at %2 + %0 are replaced by (bytes at %2 + %0) + (bytes at %1 + %0).
 * NOTE(review): the output operand list is not visible; assuming a single
 * output %0 (the running byte index), the inputs below are %1 = src,
 * %2 = dst and %3 = w - 15 -- confirm.
 */
/* First eight bytes. */
236 "movq (%1, %0), %%mm0 \n\t"
237 "movq (%2, %0), %%mm1 \n\t"
238 "paddb %%mm0, %%mm1 \n\t"
239 "movq %%mm1, (%2, %0) \n\t"
/* Second eight bytes. */
240 "movq 8(%1, %0), %%mm0 \n\t"
241 "movq 8(%2, %0), %%mm1 \n\t"
242 "paddb %%mm0, %%mm1 \n\t"
243 "movq %%mm1, 8(%2, %0) \n\t"
249 :
"r"(src),
"r"(dst),
"r"((
x86_reg)w - 15)
/* Scalar form of the same per-byte add; presumably the tail loop for the bytes the asm did not cover -- confirm. */
252 dst[i + 0] += src[i + 0];
258 int w,
int h,
int sides)
/* Address of the first byte of the last row: buf + (height - 1) * wrap. */
263 last_line = buf + (height - 1) * wrap;
/*
 * Left/right replication, 8-byte variant.
 * The unpack-low chain broadcasts the byte at (%0) to all eight bytes of mm0,
 * which is stored in the 8 bytes immediately before %0.  The unpack-high chain
 * broadcasts the last byte of the quadword ending at %0 + %2 and stores it in
 * the 8 bytes starting at %0 + %2.
 * NOTE(review): the condition selecting this variant and the operand list are
 * not visible; presumably %0 is the row pointer and %2 the row width -- confirm.
 */
269 "movd (%0), %%mm0 \n\t"
270 "punpcklbw %%mm0, %%mm0 \n\t"
271 "punpcklwd %%mm0, %%mm0 \n\t"
272 "punpckldq %%mm0, %%mm0 \n\t"
273 "movq %%mm0, -8(%0) \n\t"
274 "movq -8(%0, %2), %%mm1 \n\t"
275 "punpckhbw %%mm1, %%mm1 \n\t"
276 "punpckhwd %%mm1, %%mm1 \n\t"
277 "punpckhdq %%mm1, %%mm1 \n\t"
278 "movq %%mm1, (%0, %2) \n\t"
/*
 * 16-byte variant: same broadcasts, but each replicated quadword is stored
 * twice, covering 16 bytes before %0 and 16 bytes from %0 + %2.
 */
288 "movd (%0), %%mm0 \n\t"
289 "punpcklbw %%mm0, %%mm0 \n\t"
290 "punpcklwd %%mm0, %%mm0 \n\t"
291 "punpckldq %%mm0, %%mm0 \n\t"
292 "movq %%mm0, -8(%0) \n\t"
293 "movq %%mm0, -16(%0) \n\t"
294 "movq -8(%0, %2), %%mm1 \n\t"
295 "punpckhbw %%mm1, %%mm1 \n\t"
296 "punpckhwd %%mm1, %%mm1 \n\t"
297 "punpckhdq %%mm1, %%mm1 \n\t"
298 "movq %%mm1, (%0, %2) \n\t"
299 "movq %%mm1, 8(%0, %2) \n\t"
/*
 * 4-byte variant: the first byte is broadcast to four bytes and stored at
 * -4(%0); the last byte of the dword ending at %0 + %2 is broadcast and its
 * low four bytes are stored at %0 + %2.
 */
310 "movd (%0), %%mm0 \n\t"
311 "punpcklbw %%mm0, %%mm0 \n\t"
312 "punpcklwd %%mm0, %%mm0 \n\t"
313 "movd %%mm0, -4(%0) \n\t"
314 "movd -4(%0, %2), %%mm1 \n\t"
315 "punpcklbw %%mm1, %%mm1 \n\t"
316 "punpckhwd %%mm1, %%mm1 \n\t"
317 "punpckhdq %%mm1, %%mm1 \n\t"
318 "movd %%mm1, (%0, %2) \n\t"
/*
 * Rows above the buffer, four rows per iteration.  ptr starts w bytes to the
 * left of row -(i + 1).  The asm loads one quadword from %1 + %0 and stores
 * it to four addresses: %0, %0 + %2, %0 + 2 * %2 and %0 + %3.
 * NOTE(review): only two of the operands are visible (-wrap * 3 and
 * ptr + width + 2 * w); the loop that advances %0 is not shown.
 */
329 for (i = 0; i < h; i += 4) {
330 ptr = buf - (i + 1) * wrap - w;
333 "movq (%1, %0), %%mm0 \n\t"
334 "movq %%mm0, (%0) \n\t"
335 "movq %%mm0, (%0, %2) \n\t"
336 "movq %%mm0, (%0, %2, 2) \n\t"
337 "movq %%mm0, (%0, %3) \n\t"
343 "r"((
x86_reg) -wrap * 3),
"r"(ptr + width + 2 * w)
/*
 * Rows below the buffer: same four-way quadword copy, with ptr starting w
 * bytes to the left of the row (i + 1) lines after last_line.
 */
349 for (i = 0; i < h; i += 4) {
350 ptr = last_line + (i + 1) * wrap - w;
353 "movq (%1, %0), %%mm0 \n\t"
354 "movq %%mm0, (%0) \n\t"
355 "movq %%mm0, (%0, %2) \n\t"
356 "movq %%mm0, (%0, %2, 2) \n\t"
357 "movq %%mm0, (%0, %3) \n\t"
364 "r"(ptr + width + 2 * w)
/**
 * Function type of the edge-emulation routine that gmc() receives through
 * its emu_edge_fn parameter.  gmc() calls it as
 * emu_edge_fn(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height).
 *
 * NOTE(review): the parameter descriptions below are inferred from the
 * parameter names and that single call site only -- confirm against the
 * implementation.
 *
 * @param dst           destination buffer
 * @param src           source pixels
 * @param dst_stride    stride of dst
 * @param src_linesize  stride of src
 * @param block_w       width of the block to produce
 * @param block_h       height of the block to produce
 * @param src_x         x position of the block in the source
 * @param src_y         y position of the block in the source
 * @param w             width of the source area
 * @param h             height of the source area
 */
370 typedef void emulated_edge_mc_func(
uint8_t *dst,
const uint8_t *src,
371 ptrdiff_t dst_stride,
372 ptrdiff_t src_linesize,
373 int block_w,
int block_h,
374 int src_x,
int src_y,
int w,
int h);
377 int stride,
int h,
int ox,
int oy,
378 int dxx,
int dxy,
int dyx,
int dyy,
379 int shift,
int r,
int width,
int height,
380 emulated_edge_mc_func *emu_edge_fn)
/*
 * NOTE(review): `w` (block width) and MAX_H are used below but declared on
 * lines that are not part of this excerpt.
 */
/* Integer part of the start position: ox and oy shifted right by (16 + shift). */
383 const int ix = ox >> (16 +
shift);
384 const int iy = oy >> (16 +
shift);
/* Start offsets and the four deltas with their low 4 bits dropped. */
385 const int oxs = ox >> 4;
386 const int oys = oy >> 4;
387 const int dxxs = dxx >> 4;
388 const int dxys = dxy >> 4;
389 const int dyxs = dyx >> 4;
390 const int dyys = dyy >> 4;
/*
 * Four-lane 16-bit copies of r, dxys and dyys (values are truncated to
 * uint16_t).  dxy4 and dyy4 are the per-row increments added in the inner
 * asm; the operand list that references r4 is not visible here.
 */
391 const uint16_t r4[4] = {
r,
r,
r, r };
392 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
393 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
/* Largest stride for which the local edge_buf below can be used. */
395 #define MAX_STRIDE 4096U
/* Local scratch buffer of (MAX_H + 1) rows of MAX_STRIDE bytes, filled by emu_edge_fn. */
397 uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
/*
 * Total change of the source coordinate across the block: dxw/dyh subtract
 * the unit step 1 << (16 + shift) per pixel, dxh/dyw are the cross terms.
 */
400 const int dxw = (dxx - (1 << (16 +
shift))) * (w - 1);
401 const int dyh = (dyy - (1 << (16 +
shift))) * (h - 1);
402 const int dxh = dxy * (h - 1);
403 const int dyw = dyx * (w - 1);
/*
 * Edge emulation is needed when the start position is outside
 * [0, width - w) x [0, height - h); the unsigned casts make negative ix/iy
 * compare as large values, so they also trigger it.
 */
404 int need_emu = (unsigned)ix >= width - w ||
405 (
unsigned)iy >= height - h;
/*
 * Fall back to ff_gmc_c() when any of these holds:
 *  - the bits above (16 + shift) of the coordinate differ between the start
 *    and any of the other three block corners,
 *  - any delta has non-zero low 4 bits (they would be lost by the >> 4 above),
 *  - edge emulation is needed but h or stride exceeds what edge_buf can hold.
 */
408 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
409 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 +
shift)
411 || (dxx | dxy | dyx | dyy) & 15
412 || (need_emu && (h > MAX_H ||
stride > MAX_STRIDE))) {
414 ff_gmc_c(dst, src,
stride, h, ox, oy, dxx, dxy, dyx, dyy,
/* Build a (w + 1) x (h + 1) source patch at (ix, iy) in edge_buf. */
421 emu_edge_fn(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height);
/*
 * Broadcast the low 16 bits of operand %0 to all four words of mm6 and clear
 * mm7.  NOTE(review): the operand itself is not visible; mm6 is used below as
 * the value the fractional weights are subtracted from.
 */
426 "movd %0, %%mm6 \n\t"
427 "pxor %%mm7, %%mm7 \n\t"
428 "punpcklwd %%mm6, %%mm6 \n\t"
429 "punpcklwd %%mm6, %%mm6 \n\t"
/* Four output columns per outer iteration. */
433 for (x = 0; x < w; x += 4) {
/*
 * Per-column 16-bit position accumulators, started one row step (dxys/dyys)
 * before row 0 because the inner asm adds the step before using them.
 */
434 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
435 oxs - dxys + dxxs * (x + 1),
436 oxs - dxys + dxxs * (x + 2),
437 oxs - dxys + dxxs * (x + 3) };
438 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
439 oys - dyys + dyxs * (x + 1),
440 oys - dyys + dyxs * (x + 2),
441 oys - dyys + dyxs * (x + 3) };
443 for (
y = 0;
y < h;
y++) {
/*
 * Advance dx4/dy4 by dxy4/dyy4, write them back, and keep the top 4 bits of
 * each 16-bit lane in mm4 (x) and mm5 (y).
 */
445 "movq %0, %%mm4 \n\t"
446 "movq %1, %%mm5 \n\t"
447 "paddw %2, %%mm4 \n\t"
448 "paddw %3, %%mm5 \n\t"
449 "movq %%mm4, %0 \n\t"
450 "movq %%mm5, %1 \n\t"
451 "psrlw $12, %%mm4 \n\t"
452 "psrlw $12, %%mm5 \n\t"
453 :
"+m"(*dx4),
"+m"(*dy4)
454 :
"m"(*dxy4),
"m"(*dyy4)
/*
 * Four products of the two weights and their complements, with
 * fx = mm4, fy = mm5 and s = mm6:
 *   mm0 = (s - fx) * (s - fy)   mm1 = fx * (s - fy)
 *   mm2 = (s - fx) * fy         mm3 = fx * fy
 */
458 "movq %%mm6, %%mm2 \n\t"
459 "movq %%mm6, %%mm1 \n\t"
460 "psubw %%mm4, %%mm2 \n\t"
461 "psubw %%mm5, %%mm1 \n\t"
462 "movq %%mm2, %%mm0 \n\t"
463 "movq %%mm4, %%mm3 \n\t"
464 "pmullw %%mm1, %%mm0 \n\t"
465 "pmullw %%mm5, %%mm3 \n\t"
466 "pmullw %%mm5, %%mm2 \n\t"
467 "pmullw %%mm4, %%mm1 \n\t"
/* Second source row: %4 = src[stride + 1], %3 = src[stride]; widen 4 bytes to words and weight. */
469 "movd %4, %%mm5 \n\t"
470 "movd %3, %%mm4 \n\t"
471 "punpcklbw %%mm7, %%mm5 \n\t"
472 "punpcklbw %%mm7, %%mm4 \n\t"
473 "pmullw %%mm5, %%mm3 \n\t"
474 "pmullw %%mm4, %%mm2 \n\t"
/* First source row: %2 = src[1], %1 = src[0]; widen, weight, then sum all four terms plus operand %5. */
476 "movd %2, %%mm5 \n\t"
477 "movd %1, %%mm4 \n\t"
478 "punpcklbw %%mm7, %%mm5 \n\t"
479 "punpcklbw %%mm7, %%mm4 \n\t"
480 "pmullw %%mm5, %%mm1 \n\t"
481 "pmullw %%mm4, %%mm0 \n\t"
482 "paddw %5, %%mm1 \n\t"
483 "paddw %%mm3, %%mm2 \n\t"
484 "paddw %%mm1, %%mm0 \n\t"
485 "paddw %%mm2, %%mm0 \n\t"
/*
 * Shift right by operand %6, pack to bytes with unsigned saturation and store
 * four pixels.  NOTE(review): operands %5 and %6 are outside this excerpt;
 * presumably the rounding term and the normalising shift -- confirm.
 */
487 "psrlw %6, %%mm0 \n\t"
488 "packuswb %%mm0, %%mm0 \n\t"
489 "movd %%mm0, %0 \n\t"
491 :
"=m"(dst[x +
y * stride])
492 :
"m"(src[0]),
"m"(src[1]),
493 "m"(src[stride]),
"m"(src[stride + 1]),
/* Move src 4 bytes right and h rows up (4 - h * stride). */
498 src += 4 - h * stride;
/*
 * Three thin wrappers follow (their names and leading parameters are on
 * lines not included in this excerpt).  Each one forwards all of its
 * arguments unchanged to gmc() and supplies ff_emulated_edge_mc_8 as the
 * edge-emulation callback.
 */
506 int stride,
int h,
int ox,
int oy,
507 int dxx,
int dxy,
int dyx,
int dyy,
508 int shift,
int r,
int width,
int height)
510 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
511 width, height, &ff_emulated_edge_mc_8);
/* Second wrapper: identical forwarding call. */
515 int stride,
int h,
int ox,
int oy,
516 int dxx,
int dxy,
int dyx,
int dyy,
517 int shift,
int r,
int width,
int height)
519 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
520 width, height, &ff_emulated_edge_mc_8);
/* Third wrapper: identical forwarding call. */
524 int stride,
int h,
int ox,
int oy,
525 int dxx,
int dxy,
int dyx,
int dyy,
526 int shift,
int r,
int width,
int height)
528 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
529 width, height, &ff_emulated_edge_mc_8);
534 #if CONFIG_DIRAC_DECODER
/*
 * DIRAC_PIXOP(OPNAME2, OPNAME, EXT) defines three functions,
 * ff_<OPNAME2>_dirac_pixels{8,16,32}_<EXT>(dst, src[5], stride, h).
 *
 * Each body contains a call to the matching ff_<OPNAME2>_dirac_pixelsN_c
 * fallback and a call to <OPNAME>_pixelsN_<EXT> on src[0].  The 32-pixel
 * version is built from two 16-pixel calls, at dst / src[0] and at
 * dst + 16 / src[0] + 16.
 *
 * NOTE(review): the condition that chooses between the C fallback and the
 * optimised call is on lines not included in this excerpt.
 */
535 #define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
536 void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
539 ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
541 OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
543 void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
546 ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
548 OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
550 void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
553 ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
555 OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
556 OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
561 PIXELS16(
static, ff_avg, , , _mmxext)
/* Instantiate the Dirac wrappers: put and avg for mmx, avg for mmxext. */
562 DIRAC_PIXOP(put, ff_put, mmx)
563 DIRAC_PIXOP(avg, ff_avg, mmx)
567 DIRAC_PIXOP(avg, ff_avg, mmxext)
/* C fallback calls from four further wrappers whose surrounding lines are not part of this excerpt. */
572 ff_put_dirac_pixels16_c(dst, src, stride, h);
579 ff_avg_dirac_pixels16_c(dst, src, stride, h);
586 ff_put_dirac_pixels32_c(dst, src, stride, h);
595 ff_avg_dirac_pixels32_c(dst, src, stride, h);
605 float min,
float max,
int len)
609 "movss %3, %%xmm4 \n\t"
610 "movss %4, %%xmm5 \n\t"
611 "shufps $0, %%xmm4, %%xmm4 \n\t"
612 "shufps $0, %%xmm5, %%xmm5 \n\t"
614 "movaps (%2, %0), %%xmm0 \n\t"
615 "movaps 16(%2, %0), %%xmm1 \n\t"
616 "movaps 32(%2, %0), %%xmm2 \n\t"
617 "movaps 48(%2, %0), %%xmm3 \n\t"
618 "maxps %%xmm4, %%xmm0 \n\t"
619 "maxps %%xmm4, %%xmm1 \n\t"
620 "maxps %%xmm4, %%xmm2 \n\t"
621 "maxps %%xmm4, %%xmm3 \n\t"
622 "minps %%xmm5, %%xmm0 \n\t"
623 "minps %%xmm5, %%xmm1 \n\t"
624 "minps %%xmm5, %%xmm2 \n\t"
625 "minps %%xmm5, %%xmm3 \n\t"
626 "movaps %%xmm0, (%1, %0) \n\t"
627 "movaps %%xmm1, 16(%1, %0) \n\t"
628 "movaps %%xmm2, 32(%1, %0) \n\t"
629 "movaps %%xmm3, 48(%1, %0) \n\t"
633 :
"r"(dst),
"r"(
src),
"m"(min),
"m"(max)