FFmpeg
me_cmp_init.c
1 /*
2  * SIMD-optimized motion estimation
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24 
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/mem_internal.h"
28 #include "libavutil/x86/asm.h"
29 #include "libavutil/x86/cpu.h"
30 #include "libavcodec/me_cmp.h"
31 #include "libavcodec/mpegvideo.h"
32 
33 int ff_sum_abs_dctelem_mmx(int16_t *block);
34 int ff_sum_abs_dctelem_mmxext(int16_t *block);
35 int ff_sum_abs_dctelem_sse2(int16_t *block);
36 int ff_sum_abs_dctelem_ssse3(int16_t *block);
37 int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
38  ptrdiff_t stride, int h);
39 int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
40  ptrdiff_t stride, int h);
41 int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
42  ptrdiff_t stride, int h);
43 int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
44 int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
45 int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
46  ptrdiff_t stride, int h);
47 int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
48  ptrdiff_t stride, int h);
49 int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
50  ptrdiff_t stride, int h);
51 int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
52  ptrdiff_t stride, int h);
53 int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
54  ptrdiff_t stride, int h);
55 int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
56  ptrdiff_t stride, int h);
57 int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
58  ptrdiff_t stride, int h);
59 int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
60  ptrdiff_t stride, int h);
61 int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
62  ptrdiff_t stride, int h);
63 int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
64  ptrdiff_t stride, int h);
65 int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
66  ptrdiff_t stride, int h);
67 int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
68  ptrdiff_t stride, int h);
69 int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
70  ptrdiff_t stride, int h);
71 int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
72  ptrdiff_t stride, int h);
73 int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
74  ptrdiff_t stride, int h);
75 int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
76  ptrdiff_t stride, int h);
77 int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
78  ptrdiff_t stride, int h);
79 int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
80  ptrdiff_t stride, int h);
81 
82 #define hadamard_func(cpu) \
83  int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
84  uint8_t *src2, ptrdiff_t stride, int h); \
85  int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
86  uint8_t *src2, ptrdiff_t stride, int h);
87 
88 hadamard_func(mmx)
89 hadamard_func(mmxext)
90 hadamard_func(sse2)
91 hadamard_func(ssse3)
92 
93 #if HAVE_X86ASM
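/*
 * Noise-preserving SSE: on top of the plain SSE of the two blocks, the
 * functions below add the absolute difference of their high-frequency
 * "noise" scores (ff_hf_noise8/16), weighted by avctx->nsse_weight
 * (a fixed weight of 8 is used when no context is available), i.e. roughly
 *     score = sse(pix1, pix2) + |hf_noise(pix1) - hf_noise(pix2)| * nsse_weight
 */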
94 static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
95  ptrdiff_t stride, int h)
96 {
97  int score1, score2;
98 
99  if (c)
100  score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
101  else
102  score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
103  score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
104  - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);
105 
106  if (c)
107  return score1 + FFABS(score2) * c->avctx->nsse_weight;
108  else
109  return score1 + FFABS(score2) * 8;
110 }
111 
112 static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
113  ptrdiff_t stride, int h)
114 {
115  int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
116  int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
117  ff_hf_noise8_mmx(pix2, stride, h);
118 
119  if (c)
120  return score1 + FFABS(score2) * c->avctx->nsse_weight;
121  else
122  return score1 + FFABS(score2) * 8;
123 }
124 
125 #endif /* HAVE_X86ASM */
126 
127 #if HAVE_INLINE_ASM
128 
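/*
 * Vertical SAD (intra): sums |pix[x] - pix[x - stride]| over a 16-pixel-wide
 * block, i.e. the absolute differences between vertically adjacent rows.
 * The SUM macro below computes per-byte |a - b| with the usual MMX trick
 * psubusb(a, b) | psubusb(b, a), widens the bytes to words against the zero
 * register %mm7 and accumulates into %mm6; the epilogue folds %mm6 into a
 * single word and masks the result to 16 bits.
 */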
129 static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
130  ptrdiff_t stride, int h)
131 {
132  int tmp;
133 
134  av_assert2(((uintptr_t) pix & 7) == 0);
135  av_assert2((stride & 7) == 0);
136 
137 #define SUM(in0, in1, out0, out1) \
138  "movq (%0), %%mm2\n" \
139  "movq 8(%0), %%mm3\n" \
140  "add %2,%0\n" \
141  "movq %%mm2, " #out0 "\n" \
142  "movq %%mm3, " #out1 "\n" \
143  "psubusb " #in0 ", %%mm2\n" \
144  "psubusb " #in1 ", %%mm3\n" \
145  "psubusb " #out0 ", " #in0 "\n" \
146  "psubusb " #out1 ", " #in1 "\n" \
147  "por %%mm2, " #in0 "\n" \
148  "por %%mm3, " #in1 "\n" \
149  "movq " #in0 ", %%mm2\n" \
150  "movq " #in1 ", %%mm3\n" \
151  "punpcklbw %%mm7, " #in0 "\n" \
152  "punpcklbw %%mm7, " #in1 "\n" \
153  "punpckhbw %%mm7, %%mm2\n" \
154  "punpckhbw %%mm7, %%mm3\n" \
155  "paddw " #in1 ", " #in0 "\n" \
156  "paddw %%mm3, %%mm2\n" \
157  "paddw %%mm2, " #in0 "\n" \
158  "paddw " #in0 ", %%mm6\n"
159 
160 
161  __asm__ volatile (
162  "movl %3, %%ecx\n"
163  "pxor %%mm6, %%mm6\n"
164  "pxor %%mm7, %%mm7\n"
165  "movq (%0), %%mm0\n"
166  "movq 8(%0), %%mm1\n"
167  "add %2, %0\n"
168  "jmp 2f\n"
169  "1:\n"
170 
171  SUM(%%mm4, %%mm5, %%mm0, %%mm1)
172  "2:\n"
173  SUM(%%mm0, %%mm1, %%mm4, %%mm5)
174 
175  "subl $2, %%ecx\n"
176  "jnz 1b\n"
177 
178  "movq %%mm6, %%mm0\n"
179  "psrlq $32, %%mm6\n"
180  "paddw %%mm6, %%mm0\n"
181  "movq %%mm0, %%mm6\n"
182  "psrlq $16, %%mm0\n"
183  "paddw %%mm6, %%mm0\n"
184  "movd %%mm0, %1\n"
185  : "+r" (pix), "=r" (tmp)
186  : "r" (stride), "m" (h)
187  : "%ecx");
188 
189  return tmp & 0xFFFF;
190 }
191 #undef SUM
192 
193 static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
194  ptrdiff_t stride, int h)
195 {
196  int tmp;
197 
198  av_assert2(((uintptr_t)pix1 & 7) == 0);
199  av_assert2(((uintptr_t)pix2 & 7) == 0);
200  av_assert2((stride & 7) == 0);
201 
202 #define SUM(in0, in1, out0, out1) \
203  "movq (%0), %%mm2\n" \
204  "movq (%1), " #out0 "\n" \
205  "movq 8(%0), %%mm3\n" \
206  "movq 8(%1), " #out1 "\n" \
207  "add %3, %0\n" \
208  "add %3, %1\n" \
209  "psubb " #out0 ", %%mm2\n" \
210  "psubb " #out1 ", %%mm3\n" \
211  "pxor %%mm7, %%mm2\n" \
212  "pxor %%mm7, %%mm3\n" \
213  "movq %%mm2, " #out0 "\n" \
214  "movq %%mm3, " #out1 "\n" \
215  "psubusb " #in0 ", %%mm2\n" \
216  "psubusb " #in1 ", %%mm3\n" \
217  "psubusb " #out0 ", " #in0 "\n" \
218  "psubusb " #out1 ", " #in1 "\n" \
219  "por %%mm2, " #in0 "\n" \
220  "por %%mm3, " #in1 "\n" \
221  "movq " #in0 ", %%mm2\n" \
222  "movq " #in1 ", %%mm3\n" \
223  "punpcklbw %%mm7, " #in0 "\n" \
224  "punpcklbw %%mm7, " #in1 "\n" \
225  "punpckhbw %%mm7, %%mm2\n" \
226  "punpckhbw %%mm7, %%mm3\n" \
227  "paddw " #in1 ", " #in0 "\n" \
228  "paddw %%mm3, %%mm2\n" \
229  "paddw %%mm2, " #in0 "\n" \
230  "paddw " #in0 ", %%mm6\n"
231 
232 
233  __asm__ volatile (
234  "movl %4, %%ecx\n"
235  "pxor %%mm6, %%mm6\n"
236  "pcmpeqw %%mm7, %%mm7\n"
237  "psllw $15, %%mm7\n"
238  "packsswb %%mm7, %%mm7\n"
239  "movq (%0), %%mm0\n"
240  "movq (%1), %%mm2\n"
241  "movq 8(%0), %%mm1\n"
242  "movq 8(%1), %%mm3\n"
243  "add %3, %0\n"
244  "add %3, %1\n"
245  "psubb %%mm2, %%mm0\n"
246  "psubb %%mm3, %%mm1\n"
247  "pxor %%mm7, %%mm0\n"
248  "pxor %%mm7, %%mm1\n"
249  "jmp 2f\n"
250  "1:\n"
251 
252  SUM(%%mm4, %%mm5, %%mm0, %%mm1)
253  "2:\n"
254  SUM(%%mm0, %%mm1, %%mm4, %%mm5)
255 
256  "subl $2, %%ecx\n"
257  "jnz 1b\n"
258 
259  "movq %%mm6, %%mm0\n"
260  "psrlq $32, %%mm6\n"
261  "paddw %%mm6, %%mm0\n"
262  "movq %%mm0, %%mm6\n"
263  "psrlq $16, %%mm0\n"
264  "paddw %%mm6, %%mm0\n"
265  "movd %%mm0, %2\n"
266  : "+r" (pix1), "+r" (pix2), "=r" (tmp)
267  : "r" (stride), "m" (h)
268  : "%ecx");
269 
270  return tmp & 0x7FFF;
271 }
272 #undef SUM
273 
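/*
 * Rounding biases for the half-pel averaging done by the SAD kernels below:
 * round_tab[1] supplies the "+1" in (a + b + 1) >> 1 used by sad8_2_mmx for
 * the x2/y2 cases, and round_tab[2] the "+2" in (a + b + c + d + 2) >> 2
 * used by sad8_4_mmx for the xy2 case.
 */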
274 DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
275  0x0000000000000000ULL,
276  0x0001000100010001ULL,
277  0x0002000200020002ULL,
278 };
279 
280 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
281  ptrdiff_t stride, int h)
282 {
283  x86_reg len = -stride * h;
284  __asm__ volatile (
285  ".p2align 4 \n\t"
286  "1: \n\t"
287  "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
288  "movq (%2, %%"FF_REG_a"), %%mm2 \n\t"
289  "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
290  "add %3, %%"FF_REG_a" \n\t"
291  "psubusb %%mm0, %%mm2 \n\t"
292  "psubusb %%mm4, %%mm0 \n\t"
293  "movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
294  "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
295  "movq (%2, %%"FF_REG_a"), %%mm5 \n\t"
296  "psubusb %%mm1, %%mm3 \n\t"
297  "psubusb %%mm5, %%mm1 \n\t"
298  "por %%mm2, %%mm0 \n\t"
299  "por %%mm1, %%mm3 \n\t"
300  "movq %%mm0, %%mm1 \n\t"
301  "movq %%mm3, %%mm2 \n\t"
302  "punpcklbw %%mm7, %%mm0 \n\t"
303  "punpckhbw %%mm7, %%mm1 \n\t"
304  "punpcklbw %%mm7, %%mm3 \n\t"
305  "punpckhbw %%mm7, %%mm2 \n\t"
306  "paddw %%mm1, %%mm0 \n\t"
307  "paddw %%mm3, %%mm2 \n\t"
308  "paddw %%mm2, %%mm0 \n\t"
309  "paddw %%mm0, %%mm6 \n\t"
310  "add %3, %%"FF_REG_a" \n\t"
311  " js 1b \n\t"
312  : "+a" (len)
313  : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
314 }
315 
316 static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
317  ptrdiff_t stride, int h)
318 {
319  x86_reg len = -stride * h;
320  __asm__ volatile (
321  ".p2align 4 \n\t"
322  "1: \n\t"
323  "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
324  "movq (%2, %%"FF_REG_a"), %%mm1 \n\t"
325  "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
326  "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
327  "punpcklbw %%mm7, %%mm0 \n\t"
328  "punpcklbw %%mm7, %%mm1 \n\t"
329  "punpckhbw %%mm7, %%mm2 \n\t"
330  "punpckhbw %%mm7, %%mm3 \n\t"
331  "paddw %%mm0, %%mm1 \n\t"
332  "paddw %%mm2, %%mm3 \n\t"
333  "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
334  "movq (%3, %%"FF_REG_a"), %%mm2 \n\t"
335  "paddw %%mm5, %%mm1 \n\t"
336  "paddw %%mm5, %%mm3 \n\t"
337  "psrlw $1, %%mm1 \n\t"
338  "psrlw $1, %%mm3 \n\t"
339  "packuswb %%mm3, %%mm1 \n\t"
340  "psubusb %%mm1, %%mm4 \n\t"
341  "psubusb %%mm2, %%mm1 \n\t"
342  "por %%mm4, %%mm1 \n\t"
343  "movq %%mm1, %%mm0 \n\t"
344  "punpcklbw %%mm7, %%mm0 \n\t"
345  "punpckhbw %%mm7, %%mm1 \n\t"
346  "paddw %%mm1, %%mm0 \n\t"
347  "paddw %%mm0, %%mm6 \n\t"
348  "add %4, %%"FF_REG_a" \n\t"
349  " js 1b \n\t"
350  : "+a" (len)
351  : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
352  "r" (stride));
353 }
354 
355 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
356  ptrdiff_t stride, int h)
357 {
358  x86_reg len = -stride * h;
359  __asm__ volatile (
360  "movq (%1, %%"FF_REG_a"), %%mm0\n\t"
361  "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
362  "movq %%mm0, %%mm1 \n\t"
363  "movq %%mm2, %%mm3 \n\t"
364  "punpcklbw %%mm7, %%mm0 \n\t"
365  "punpckhbw %%mm7, %%mm1 \n\t"
366  "punpcklbw %%mm7, %%mm2 \n\t"
367  "punpckhbw %%mm7, %%mm3 \n\t"
368  "paddw %%mm2, %%mm0 \n\t"
369  "paddw %%mm3, %%mm1 \n\t"
370  ".p2align 4 \n\t"
371  "1: \n\t"
372  "movq (%2, %%"FF_REG_a"), %%mm2\n\t"
373  "movq 1(%2, %%"FF_REG_a"), %%mm4\n\t"
374  "movq %%mm2, %%mm3 \n\t"
375  "movq %%mm4, %%mm5 \n\t"
376  "punpcklbw %%mm7, %%mm2 \n\t"
377  "punpckhbw %%mm7, %%mm3 \n\t"
378  "punpcklbw %%mm7, %%mm4 \n\t"
379  "punpckhbw %%mm7, %%mm5 \n\t"
380  "paddw %%mm4, %%mm2 \n\t"
381  "paddw %%mm5, %%mm3 \n\t"
382  "movq %5, %%mm5 \n\t"
383  "paddw %%mm2, %%mm0 \n\t"
384  "paddw %%mm3, %%mm1 \n\t"
385  "paddw %%mm5, %%mm0 \n\t"
386  "paddw %%mm5, %%mm1 \n\t"
387  "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
388  "movq (%3, %%"FF_REG_a"), %%mm5 \n\t"
389  "psrlw $2, %%mm0 \n\t"
390  "psrlw $2, %%mm1 \n\t"
391  "packuswb %%mm1, %%mm0 \n\t"
392  "psubusb %%mm0, %%mm4 \n\t"
393  "psubusb %%mm5, %%mm0 \n\t"
394  "por %%mm4, %%mm0 \n\t"
395  "movq %%mm0, %%mm4 \n\t"
396  "punpcklbw %%mm7, %%mm0 \n\t"
397  "punpckhbw %%mm7, %%mm4 \n\t"
398  "paddw %%mm0, %%mm6 \n\t"
399  "paddw %%mm4, %%mm6 \n\t"
400  "movq %%mm2, %%mm0 \n\t"
401  "movq %%mm3, %%mm1 \n\t"
402  "add %4, %%"FF_REG_a" \n\t"
403  " js 1b \n\t"
404  : "+a" (len)
405  : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
406  "r" (stride), "m" (round_tab[2]));
407 }
408 
409 static inline int sum_mmx(void)
410 {
411  int ret;
412  __asm__ volatile (
413  "movq %%mm6, %%mm0 \n\t"
414  "psrlq $32, %%mm6 \n\t"
415  "paddw %%mm0, %%mm6 \n\t"
416  "movq %%mm6, %%mm0 \n\t"
417  "psrlq $16, %%mm6 \n\t"
418  "paddw %%mm0, %%mm6 \n\t"
419  "movd %%mm6, %0 \n\t"
420  : "=r" (ret));
421  return ret & 0xFFFF;
422 }
423 
424 static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
425  ptrdiff_t stride, int h)
426 {
427  sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
428 }
429 
430 static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
431  ptrdiff_t stride, int h)
432 {
433  sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
434 }
435 
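/*
 * PIX_SAD instantiates the 8x8 and 16x16 SAD functions that get installed
 * into the MECmpContext tables below (full-pel plus x2, y2 and xy2 half-pel
 * variants) from the kernels above. Each wrapper clears the %mm6 accumulator
 * and the %mm7 zero register, loads the rounding constant into %mm5 where
 * needed, runs the matching sad8_* helper once (8x8) or twice (16x16, low
 * and high 8 columns), and reads the accumulated sum back with sum_mmx().
 */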
436 #define PIX_SAD(suf) \
437 static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
438  uint8_t *blk1, ptrdiff_t stride, int h) \
439 { \
440  av_assert2(h == 8); \
441  __asm__ volatile ( \
442  "pxor %%mm7, %%mm7 \n\t" \
443  "pxor %%mm6, %%mm6 \n\t" \
444  :); \
445  \
446  sad8_1_ ## suf(blk1, blk2, stride, 8); \
447  \
448  return sum_ ## suf(); \
449 } \
450  \
451 static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
452  uint8_t *blk1, ptrdiff_t stride, int h) \
453 { \
454  av_assert2(h == 8); \
455  __asm__ volatile ( \
456  "pxor %%mm7, %%mm7 \n\t" \
457  "pxor %%mm6, %%mm6 \n\t" \
458  "movq %0, %%mm5 \n\t" \
459  :: "m" (round_tab[1])); \
460  \
461  sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
462  \
463  return sum_ ## suf(); \
464 } \
465  \
466 static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
467  uint8_t *blk1, ptrdiff_t stride, int h) \
468 { \
469  av_assert2(h == 8); \
470  __asm__ volatile ( \
471  "pxor %%mm7, %%mm7 \n\t" \
472  "pxor %%mm6, %%mm6 \n\t" \
473  "movq %0, %%mm5 \n\t" \
474  :: "m" (round_tab[1])); \
475  \
476  sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
477  \
478  return sum_ ## suf(); \
479 } \
480  \
481 static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
482  uint8_t *blk1, ptrdiff_t stride, int h) \
483 { \
484  av_assert2(h == 8); \
485  __asm__ volatile ( \
486  "pxor %%mm7, %%mm7 \n\t" \
487  "pxor %%mm6, %%mm6 \n\t" \
488  ::); \
489  \
490  sad8_4_ ## suf(blk1, blk2, stride, 8); \
491  \
492  return sum_ ## suf(); \
493 } \
494  \
495 static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \
496  uint8_t *blk1, ptrdiff_t stride, int h) \
497 { \
498  __asm__ volatile ( \
499  "pxor %%mm7, %%mm7 \n\t" \
500  "pxor %%mm6, %%mm6 \n\t" \
501  :); \
502  \
503  sad8_1_ ## suf(blk1, blk2, stride, h); \
504  sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
505  \
506  return sum_ ## suf(); \
507 } \
508  \
509 static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
510  uint8_t *blk1, ptrdiff_t stride, int h) \
511 { \
512  __asm__ volatile ( \
513  "pxor %%mm7, %%mm7 \n\t" \
514  "pxor %%mm6, %%mm6 \n\t" \
515  "movq %0, %%mm5 \n\t" \
516  :: "m" (round_tab[1])); \
517  \
518  sad8_x2a_ ## suf(blk1, blk2, stride, h); \
519  sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
520  \
521  return sum_ ## suf(); \
522 } \
523  \
524 static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
525  uint8_t *blk1, ptrdiff_t stride, int h) \
526 { \
527  __asm__ volatile ( \
528  "pxor %%mm7, %%mm7 \n\t" \
529  "pxor %%mm6, %%mm6 \n\t" \
530  "movq %0, %%mm5 \n\t" \
531  :: "m" (round_tab[1])); \
532  \
533  sad8_y2a_ ## suf(blk1, blk2, stride, h); \
534  sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
535  \
536  return sum_ ## suf(); \
537 } \
538  \
539 static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
540  uint8_t *blk1, ptrdiff_t stride, int h) \
541 { \
542  __asm__ volatile ( \
543  "pxor %%mm7, %%mm7 \n\t" \
544  "pxor %%mm6, %%mm6 \n\t" \
545  ::); \
546  \
547  sad8_4_ ## suf(blk1, blk2, stride, h); \
548  sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
549  \
550  return sum_ ## suf(); \
551 } \
552 
553 PIX_SAD(mmx)
554 
555 #endif /* HAVE_INLINE_ASM */
556 
557 av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
558 {
559  int cpu_flags = av_get_cpu_flags();
560 
561 #if HAVE_INLINE_ASM
562  if (INLINE_MMX(cpu_flags)) {
563  c->pix_abs[0][0] = sad16_mmx;
564  c->pix_abs[0][1] = sad16_x2_mmx;
565  c->pix_abs[0][2] = sad16_y2_mmx;
566  c->pix_abs[0][3] = sad16_xy2_mmx;
567  c->pix_abs[1][0] = sad8_mmx;
568  c->pix_abs[1][1] = sad8_x2_mmx;
569  c->pix_abs[1][2] = sad8_y2_mmx;
570  c->pix_abs[1][3] = sad8_xy2_mmx;
571 
572  c->sad[0] = sad16_mmx;
573  c->sad[1] = sad8_mmx;
574 
575  c->vsad[4] = vsad_intra16_mmx;
576 
577  if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
578  c->vsad[0] = vsad16_mmx;
579  }
580  }
581 
582 #endif /* HAVE_INLINE_ASM */
583 
584  if (EXTERNAL_MMX(cpu_flags)) {
585  c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
586  c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
587  c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx;
588  c->sse[0] = ff_sse16_mmx;
589  c->sse[1] = ff_sse8_mmx;
590 #if HAVE_X86ASM
591  c->nsse[0] = nsse16_mmx;
592  c->nsse[1] = nsse8_mmx;
593 #endif
594  }
595 
596  if (EXTERNAL_MMXEXT(cpu_flags)) {
597  c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
598  c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
599  c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext;
600 
601  c->sad[0] = ff_sad16_mmxext;
602  c->sad[1] = ff_sad8_mmxext;
603 
604  c->pix_abs[0][0] = ff_sad16_mmxext;
605  c->pix_abs[0][1] = ff_sad16_x2_mmxext;
606  c->pix_abs[0][2] = ff_sad16_y2_mmxext;
607  c->pix_abs[1][0] = ff_sad8_mmxext;
608  c->pix_abs[1][1] = ff_sad8_x2_mmxext;
609  c->pix_abs[1][2] = ff_sad8_y2_mmxext;
610 
611  c->vsad[4] = ff_vsad_intra16_mmxext;
612  c->vsad[5] = ff_vsad_intra8_mmxext;
613 
614  if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
615  c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
616  c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;
617 
618  c->vsad[0] = ff_vsad16_approx_mmxext;
619  c->vsad[1] = ff_vsad8_approx_mmxext;
620  }
621  }
622 
623  if (EXTERNAL_SSE2(cpu_flags)) {
624  c->sse[0] = ff_sse16_sse2;
625  c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
626 
627 #if HAVE_ALIGNED_STACK
628  c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
629  c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
630 #endif
631  if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
632  c->sad[0] = ff_sad16_sse2;
633  c->pix_abs[0][0] = ff_sad16_sse2;
634  c->pix_abs[0][1] = ff_sad16_x2_sse2;
635  c->pix_abs[0][2] = ff_sad16_y2_sse2;
636 
637  c->vsad[4] = ff_vsad_intra16_sse2;
638  if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
639  c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
640  c->vsad[0] = ff_vsad16_approx_sse2;
641  }
642  }
643  }
644 
645  if (EXTERNAL_SSSE3(cpu_flags)) {
646  c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
647 #if HAVE_ALIGNED_STACK
648  c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
649  c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
650 #endif
651  }
652 }
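For reference, a scalar sketch of the arithmetic implemented by the MMX SAD kernels above (an illustrative example with hypothetical helper names, not part of this file): full-pel SAD accumulates |cur - ref| over the block, and the x2 half-pel variant compares against the rounded average of horizontally adjacent reference pixels, using the same (a + b + 1) >> 1 rounding that sad8_2_mmx applies via round_tab[1].

#include <stdint.h>
#include <stddef.h>
#include <stdlib.h>

/* Hypothetical scalar equivalent of the full-pel SAD (cf. sad16_mmx/sad8_mmx). */
static int sad_scalar(const uint8_t *cur, const uint8_t *ref,
                      ptrdiff_t stride, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++)
        for (int x = 0; x < w; x++)
            sum += abs(cur[y * stride + x] - ref[y * stride + x]);
    return sum;
}

/* Hypothetical scalar equivalent of the x2 (horizontal half-pel) SAD:
 * the reference sample is the rounded average (a + b + 1) >> 1 of two
 * horizontally adjacent pixels, as in sad8_2_mmx with round_tab[1]. */
static int sad_x2_scalar(const uint8_t *cur, const uint8_t *ref,
                         ptrdiff_t stride, int w, int h)
{
    int sum = 0;
    for (int y = 0; y < h; y++)
        for (int x = 0; x < w; x++) {
            int avg = (ref[y * stride + x] + ref[y * stride + x + 1] + 1) >> 1;
            sum += abs(cur[y * stride + x] - avg);
        }
    return sum;
}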