/* Declare the external (assembly-implemented) Hadamard-difference prototypes
 * for one CPU-extension suffix: an 8x8 variant and a 16-wide variant.
 * Invoked once per instruction set (e.g. mmx, mmxext, sse2, ssse3). */
#define hadamard_func(cpu)                                                  \
    int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1,         \
                                  uint8_t *src2, ptrdiff_t stride, int h);  \
    int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1,       \
                                    uint8_t *src2, ptrdiff_t stride, int h);
    /* NOTE(review): fragment — the enclosing nsse16_*/nsse8_* function
     * headers and the branch structure around these returns are not visible
     * in this excerpt; the two return pairs presumably sit in an
     * if (c->avctx->nsse_weight) / else split — confirm against full file. */
    score1 = c->mecc.sse[0](c, pix1, pix2, stride, h); /* plain SSE of the two blocks */
    /* weighted noise term: user-configured nsse_weight scales the noise score */
    return score1 + FFABS(score2) * c->avctx->nsse_weight;
    /* fallback weight of 8 — presumably the default when no weight is set; verify */
    return score1 + FFABS(score2) * 8;
    /* second function's equivalent return pair (8-pixel-wide variant, presumably) */
    return score1 + FFABS(score2) * c->avctx->nsse_weight;
    return score1 + FFABS(score2) * 8;
/* One two-row step of the intra vertical-SAD kernel: loads 16 bytes of the
 * current row into %%mm2/%%mm3, keeps a copy in out0/out1 (they become the
 * "previous row" for the next step), computes the per-byte absolute
 * difference |cur - prev| via two saturating subtractions OR-ed together,
 * widens the bytes to words with punpck against zeroed %%mm7, and
 * accumulates the word sums into %%mm6.
 * NOTE(review): at least one interior line of this macro (between the second
 * load and the first store) is missing from this excerpt — likely the
 * pointer advance; confirm against the full file. */
#define SUM(in0, in1, out0, out1)            \
    "movq (%0), %%mm2\n"                     \
    "movq 8(%0), %%mm3\n"                    \
    "movq %%mm2, " #out0 "\n"                \
    "movq %%mm3, " #out1 "\n"                \
    "psubusb " #in0 ", %%mm2\n"              \
    "psubusb " #in1 ", %%mm3\n"              \
    "psubusb " #out0 ", " #in0 "\n"          \
    "psubusb " #out1 ", " #in1 "\n"          \
    "por %%mm2, " #in0 "\n"                  \
    "por %%mm3, " #in1 "\n"                  \
    "movq " #in0 ", %%mm2\n"                 \
    "movq " #in1 ", %%mm3\n"                 \
    "punpcklbw %%mm7, " #in0 "\n"            \
    "punpcklbw %%mm7, " #in1 "\n"            \
    "punpckhbw %%mm7, %%mm2\n"               \
    "punpckhbw %%mm7, %%mm3\n"               \
    "paddw " #in1 ", " #in0 "\n"             \
    "paddw %%mm3, %%mm2\n"                   \
    "paddw %%mm2, " #in0 "\n"                \
    "paddw " #in0 ", %%mm6\n"
        /* NOTE(review): interior of a vsad_intra16 asm statement; the
         * __asm__ opening, the loop label and several lines are missing
         * from this excerpt. */
        "pxor %%mm6, %%mm6\n"   /* clear word accumulator */
        "pxor %%mm7, %%mm7\n"   /* zero register for byte->word unpacking */
        /* prime the "previous row" registers (first 8 bytes load missing here) */
        "movq 8(%0), %%mm1\n"
        /* unrolled-by-two row loop: registers ping-pong between steps */
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        /* horizontal reduction of the four word sums in %%mm6
         * (the interleaved psrlq shifts are missing from this excerpt) */
        "movq %%mm6, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        : "+r" (pix), "=r" (tmp)
/* Inter vertical SAD over a 16-pixel-wide area: sum of |(pix1-pix2)[x,y] -
 * (pix1-pix2)[x,y+1]| — fragment; the rest of the signature and the
 * function prologue are not visible in this excerpt. */
static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,

/* Redefined SUM for the inter case: additionally loads the reference row
 * from %1, forms the signed difference (psubb) and flips it to an unsigned
 * bias via pxor with %%mm7 (a per-byte constant prepared by the caller),
 * then proceeds exactly like the intra SUM: abs-diff against the previous
 * biased row, widen to words, accumulate into %%mm6.
 * NOTE(review): a couple of interior lines (likely the pointer advances)
 * are missing from this excerpt. */
#define SUM(in0, in1, out0, out1)            \
    "movq (%0), %%mm2\n"                     \
    "movq (%1), " #out0 "\n"                 \
    "movq 8(%0), %%mm3\n"                    \
    "movq 8(%1), " #out1 "\n"                \
    "psubb " #out0 ", %%mm2\n"               \
    "psubb " #out1 ", %%mm3\n"               \
    "pxor %%mm7, %%mm2\n"                    \
    "pxor %%mm7, %%mm3\n"                    \
    "movq %%mm2, " #out0 "\n"                \
    "movq %%mm3, " #out1 "\n"                \
    "psubusb " #in0 ", %%mm2\n"              \
    "psubusb " #in1 ", %%mm3\n"              \
    "psubusb " #out0 ", " #in0 "\n"          \
    "psubusb " #out1 ", " #in1 "\n"          \
    "por %%mm2, " #in0 "\n"                  \
    "por %%mm3, " #in1 "\n"                  \
    "movq " #in0 ", %%mm2\n"                 \
    "movq " #in1 ", %%mm3\n"                 \
    "punpcklbw %%mm7, " #in0 "\n"            \
    "punpcklbw %%mm7, " #in1 "\n"            \
    "punpckhbw %%mm7, %%mm2\n"               \
    "punpckhbw %%mm7, %%mm3\n"               \
    "paddw " #in1 ", " #in0 "\n"             \
    "paddw %%mm3, %%mm2\n"                   \
    "paddw %%mm2, " #in0 "\n"                \
    "paddw " #in0 ", %%mm6\n"
        /* NOTE(review): interior of the vsad16 asm statement; the __asm__
         * opening, loop label and several lines are missing from this
         * excerpt. */
        "pxor %%mm6, %%mm6\n"       /* clear word accumulator */
        "pcmpeqw %%mm7, %%mm7\n"    /* all-ones; together with the lines
                                       missing here this presumably becomes
                                       a per-byte 0x80 sign-flip constant —
                                       confirm against the full file */
        "packsswb %%mm7, %%mm7\n"
        /* prime "previous row": load both sources, signed-diff, bias */
        "movq 8(%0), %%mm1\n"
        "movq 8(%1), %%mm3\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        /* unrolled-by-two row loop; registers ping-pong between steps */
        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        /* horizontal reduction of %%mm6 (interleaved shifts missing here) */
        "movq %%mm6, %%mm0\n"
        "paddw %%mm6, %%mm0\n"
        "movq %%mm0, %%mm6\n"
        "paddw %%mm6, %%mm0\n"
        : "+r" (pix1), "+r" (pix2), "=r" (tmp)
    /* NOTE(review): initializer values of a rounding-constant table whose
     * declaration line is not visible here; constants 0, 1 and 2 replicated
     * into each of the four 16-bit lanes of a 64-bit MMX word (used for the
     * +0 / +1 / +2 rounding in the half-pel averaging kernels below). */
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
/* Full-pel 8-wide SAD inner loop: accumulates per-row sums of
 * |blk1 - blk2| bytes (widened to words) into %%mm6, two rows per
 * iteration.  Fragment — remainder of the signature, locals and the
 * __asm__ opening/loop label are not visible in this excerpt. */
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
        /* row 0: abs-diff via two saturating subs OR-ed together;
         * blk2 is loaded twice (%%mm2 and %%mm4) so both directions of
         * psubusb have an unclobbered copy */
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm2 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
        "add %3, %%"FF_REG_a"           \n\t"   /* advance to next row */
        "psubusb %%mm0, %%mm2           \n\t"
        "psubusb %%mm4, %%mm0           \n\t"
        /* row 1: same pattern into %%mm1/%%mm3/%%mm5 */
        "movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm5 \n\t"
        "psubusb %%mm1, %%mm3           \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "por %%mm2, %%mm0               \n\t"   /* |a-b| row 0 */
        "por %%mm1, %%mm3               \n\t"   /* |a-b| row 1 */
        /* widen bytes to words (zero in %%mm7) and accumulate */
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm3, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"   /* running SAD in %%mm6 */
        "add %3, %%"FF_REG_a"           \n\t"   /* advance past row 1 */
/* Half-pel 8-wide SAD inner loop: averages the two reference rows
 * blk1a/blk1b (with the rounding constant pre-loaded in %%mm5 by the
 * caller), then accumulates |avg - blk2| into %%mm6.  Fragment — the rest
 * of the signature, locals and __asm__ opening are not visible here. */
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
        /* load both reference rows and widen to words (%%mm7 is zero) */
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm1 \n\t"
        "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
        "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        /* load current block twice for the two-direction abs-diff */
        "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm2 \n\t"
        /* rounded average: (a + b + round) >> 1, repacked to bytes */
        "paddw %%mm5, %%mm1             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1          \n\t"
        /* |avg - cur| via saturating subs OR-ed together */
        "psubusb %%mm1, %%mm4           \n\t"
        "psubusb %%mm2, %%mm1           \n\t"
        "por %%mm4, %%mm1               \n\t"
        /* widen and accumulate into %%mm6 */
        "movq %%mm1, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"FF_REG_a"           \n\t"   /* next row */
        /* input operands: pointers pre-biased by -len so the loop index
         * (FF_REG_a) can count up to zero — remaining operands missing
         * from this excerpt */
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
/* Quarter-pel (x+y half-pel) 8-wide SAD inner loop: averages four
 * neighboring reference pixels ((a+b+c+d+2)>>2, rounding constant from
 * round_tab[2] in %%mm5) and accumulates |avg - blk2| into %%mm6, reusing
 * the previous row's horizontal sums in %%mm0/%%mm1 across iterations.
 * Fragment — remainder of the signature, locals, __asm__ opening and part
 * of the operand list are not visible in this excerpt. */
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
        /* horizontal pair sums of the first reference row: x and x+1 */
        "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
        "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        /* horizontal pair sums of the next reference row */
        "movq (%2, %%"FF_REG_a"), %%mm2 \n\t"
        "movq 1(%2, %%"FF_REG_a"), %%mm4\n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddw %%mm4, %%mm2             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "movq %5, %%mm5                 \n\t"   /* reload rounding constant
                                                   (%%mm5 was clobbered) */
        /* 4-tap sum + rounding, then >>2 and repack to bytes */
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        "paddw %%mm5, %%mm0             \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
        "movq (%3, %%"FF_REG_a"), %%mm5 \n\t"
        "psrlw $2, %%mm0                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        /* |avg - cur| and accumulate into %%mm6 */
        "psubusb %%mm0, %%mm4           \n\t"
        "psubusb %%mm5, %%mm0           \n\t"
        "por %%mm4, %%mm0               \n\t"
        "movq %%mm0, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm4, %%mm6             \n\t"
        /* carry this row's pair sums into the next iteration */
        "movq %%mm2, %%mm0              \n\t"
        "movq %%mm3, %%mm1              \n\t"
        "add %4, %%"FF_REG_a"           \n\t"
          "r" (stride), "m" (round_tab[2]));
/* Horizontally reduces the four 16-bit partial sums accumulated in %%mm6
 * by the sad8_* kernels and returns the scalar total.  Fragment — the
 * local declaration, __asm__ opening and epilogue are not visible here. */
static inline int sum_mmx(void)
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $32, %%mm6  \n\t"   /* fold high half onto low half */
        "paddw %%mm0, %%mm6 \n\t"
        "movq %%mm6, %%mm0 \n\t"
        "psrlq $16, %%mm6  \n\t"   /* fold remaining word */
        "paddw %%mm0, %%mm6 \n\t"
        "movd %%mm6, %0    \n\t"   /* low 32 bits hold the sum (in low word) */
/* Horizontal half-pel SAD: averages each pixel with its right neighbor
 * (blk1 and blk1 + 1) via the two-source kernel.  Fragment — remainder of
 * the signature is not visible in this excerpt. */
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
/* Vertical half-pel SAD helper — presumably averages blk1 with the row
 * below (blk1 + stride) via sad8_2_mmx, mirroring sad8_x2a_mmx above;
 * the body is not visible in this excerpt, confirm against the full file. */
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
/* Generates the eight public SAD entry points for one CPU suffix:
 * sad8 / sad8_x2 / sad8_y2 / sad8_xy2 and the sad16 equivalents.  Each
 * clears %%mm7 (zero for unpacking) and %%mm6 (accumulator), loads the
 * half-pel rounding constant into %%mm5 where needed, runs the matching
 * sad8_*_<suf> inner loop (twice, 8 pixels apart, for the 16-wide
 * versions), and returns sum_<suf>().  The 8-wide versions are
 * hard-wired to h == 8 (asserted).
 * NOTE(review): several interior lines of this macro (braces, __asm__
 * openings/closings) are missing from this excerpt. */
#define PIX_SAD(suf)                                                    \
    static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, ptrdiff_t stride, int h)     \
        av_assert2(h == 8);                                             \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        sad8_1_ ## suf(blk1, blk2, stride, 8);                          \
        return sum_ ## suf();                                           \
                                                                        \
    static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,        \
                               uint8_t *blk1, ptrdiff_t stride, int h)  \
        av_assert2(h == 8);                                             \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        "movq %0, %%mm5    \n\t"                                        \
        :: "m" (round_tab[1]));                                         \
        sad8_x2a_ ## suf(blk1, blk2, stride, 8);                        \
        return sum_ ## suf();                                           \
                                                                        \
    static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,        \
                               uint8_t *blk1, ptrdiff_t stride, int h)  \
        av_assert2(h == 8);                                             \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        "movq %0, %%mm5    \n\t"                                        \
        :: "m" (round_tab[1]));                                         \
        sad8_y2a_ ## suf(blk1, blk2, stride, 8);                        \
        return sum_ ## suf();                                           \
                                                                        \
    static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,       \
                                uint8_t *blk1, ptrdiff_t stride, int h) \
        av_assert2(h == 8);                                             \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        sad8_4_ ## suf(blk1, blk2, stride, 8);                          \
        return sum_ ## suf();                                           \
                                                                        \
    static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
                             uint8_t *blk1, ptrdiff_t stride, int h)    \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        sad8_1_ ## suf(blk1, blk2, stride, h);                          \
        sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                  \
        return sum_ ## suf();                                           \
                                                                        \
    static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,       \
                                uint8_t *blk1, ptrdiff_t stride, int h) \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        "movq %0, %%mm5    \n\t"                                        \
        :: "m" (round_tab[1]));                                         \
        sad8_x2a_ ## suf(blk1, blk2, stride, h);                        \
        sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                \
        return sum_ ## suf();                                           \
                                                                        \
    static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,       \
                                uint8_t *blk1, ptrdiff_t stride, int h) \
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        "movq %0, %%mm5    \n\t"                                        \
        :: "m" (round_tab[1]));                                         \
        sad8_y2a_ ## suf(blk1, blk2, stride, h);                        \
        sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                \
        return sum_ ## suf();                                           \
                                                                        \
    static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,      \
                                 uint8_t *blk1, ptrdiff_t stride, int h)\
        "pxor %%mm7, %%mm7 \n\t"                                        \
        "pxor %%mm6, %%mm6 \n\t"                                        \
        sad8_4_ ## suf(blk1, blk2, stride, h);                          \
        sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                  \
        return sum_ ## suf();                                           \
    /* NOTE(review): fragment of the init function that wires the
     * function-pointer tables to the CPU-specific implementations.  The
     * enclosing function header, the cpu-flag checks (e.g. MMX / MMXEXT /
     * SSE2 / SSSE3 guards) and the closing #endif's between these groups
     * are not visible in this excerpt — confirm against the full file. */

    /* MMX: full-pel and half-pel absolute-difference tables
     * ([0] = 16-wide, [1] = 8-wide; columns: full, x2, y2, xy2) */
    c->pix_abs[0][0] = sad16_mmx;
    c->pix_abs[0][1] = sad16_x2_mmx;
    c->pix_abs[0][2] = sad16_y2_mmx;
    c->pix_abs[0][3] = sad16_xy2_mmx;
    c->pix_abs[1][0] = sad8_mmx;
    c->pix_abs[1][1] = sad8_x2_mmx;
    c->pix_abs[1][2] = sad8_y2_mmx;
    c->pix_abs[1][3] = sad8_xy2_mmx;

    c->sad[0] = sad16_mmx;
    c->sad[1] = sad8_mmx;

    c->vsad[4] = vsad_intra16_mmx;   /* intra vertical SAD */
    c->vsad[0] = vsad16_mmx;         /* inter vertical SAD */

    c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
    c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;

    c->nsse[0] = nsse16_mmx;
    c->nsse[1] = nsse8_mmx;

    /* MMXEXT overrides (presumably behind an MMXEXT cpu-flag check) */
    c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
    c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;

    /* SSE2 / SSSE3 variants need a 16-byte-aligned stack */
#if HAVE_ALIGNED_STACK
    c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
    c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#if HAVE_ALIGNED_STACK
    c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
    c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;