FFmpeg
postprocess_template.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 /**
22  * @file
23  * mmx/mmx2/3dnow postprocess code.
24  */
25 
26 #include "libavutil/mem_internal.h"
27 #include "libavutil/x86/asm.h"
28 
29 /* A single TEMPLATE_PP_* should be defined (to 1) when this template is
30  * included. The following macros will define its dependencies to 1 as well
31  * (like MMX2 depending on MMX), and will define to 0 all the others. Every
32  * TEMPLATE_PP_* needs to be #undef'ed at the end. */
33 
/* Each section below follows the same pattern: when the variant was selected
 * by the includer, RENAME() is defined to append that variant's suffix to a
 * function name; otherwise the variant macro is defined to 0 so that it can be
 * tested with a plain #if further down. */

/* Plain C variant: suffix _C. */
34 #ifdef TEMPLATE_PP_C
35 # define RENAME(a) a ## _C
36 #else
37 # define TEMPLATE_PP_C 0
38 #endif
39 
/* AltiVec variant: suffix _altivec. */
40 #ifdef TEMPLATE_PP_ALTIVEC
41 # define RENAME(a) a ## _altivec
42 #else
43 # define TEMPLATE_PP_ALTIVEC 0
44 #endif
45 
/* MMX variant: suffix _MMX. */
46 #ifdef TEMPLATE_PP_MMX
47 # define RENAME(a) a ## _MMX
48 #else
49 # define TEMPLATE_PP_MMX 0
50 #endif
51 
/* MMXEXT variant: also forces TEMPLATE_PP_MMX to 1. Note that the function
 * suffix is _MMX2, not _MMXEXT. */
52 #ifdef TEMPLATE_PP_MMXEXT
53 # undef TEMPLATE_PP_MMX
54 # define TEMPLATE_PP_MMX 1
55 # define RENAME(a) a ## _MMX2
56 #else
57 # define TEMPLATE_PP_MMXEXT 0
58 #endif
59 
/* 3DNow! variant: also forces TEMPLATE_PP_MMX to 1; suffix _3DNow. */
60 #ifdef TEMPLATE_PP_3DNOW
61 # undef TEMPLATE_PP_MMX
62 # define TEMPLATE_PP_MMX 1
63 # define RENAME(a) a ## _3DNow
64 #else
65 # define TEMPLATE_PP_3DNOW 0
66 #endif
67 
/* SSE2 variant: forces both TEMPLATE_PP_MMX and TEMPLATE_PP_MMXEXT to 1;
 * suffix _SSE2. */
68 #ifdef TEMPLATE_PP_SSE2
69 # undef TEMPLATE_PP_MMX
70 # define TEMPLATE_PP_MMX 1
71 # undef TEMPLATE_PP_MMXEXT
72 # define TEMPLATE_PP_MMXEXT 1
73 # define RENAME(a) a ## _SSE2
74 #else
75 # define TEMPLATE_PP_SSE2 0
76 #endif
77 
/* Remove any previous definitions of the asm helper macros before redefining
 * them for the current variant (presumably left over from an earlier inclusion
 * of this template with a different TEMPLATE_PP_* selection). */
78 #undef REAL_PAVGB
79 #undef PAVGB
80 #undef PMINUB
81 #undef PMAXUB
82 
/* PAVGB(a,b): emits the packed byte average instruction of the selected
 * instruction set (pavgb for MMXEXT, pavgusb for 3DNow!), result in b.
 * REAL_PAVGB is not defined for any other variant, so PAVGB is only usable
 * when TEMPLATE_PP_MMXEXT or TEMPLATE_PP_3DNOW is set.
 * The PAVGB -> REAL_PAVGB indirection makes the preprocessor macro-expand the
 * arguments before '#' turns them into string literals. */
83 #if TEMPLATE_PP_MMXEXT
84 #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
85 #elif TEMPLATE_PP_3DNOW
86 #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
87 #endif
88 #define PAVGB(a,b) REAL_PAVGB(a,b)
89 
/* PMINUB(x,y,t): leaves the per-byte unsigned minimum of x and y in the SECOND
 * argument. MMXEXT uses the native pminub and ignores t. The plain MMX
 * fallback clobbers t as scratch and computes y -= saturate(y - x); note that
 * it names its parameters (b,a,t), so its "a" is the second argument. */
90 #if TEMPLATE_PP_MMXEXT
91 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
92 #elif TEMPLATE_PP_MMX
93 #define PMINUB(b,a,t) \
94  "movq " #a ", " #t " \n\t"\
95  "psubusb " #b ", " #t " \n\t"\
96  "psubb " #t ", " #a " \n\t"
97 #endif
98 
/* PMAXUB(a,b): leaves the per-byte unsigned maximum of a and b in b. MMXEXT
 * uses the native pmaxub; the plain MMX fallback computes
 * b = saturate(b - a) + a. */
99 #if TEMPLATE_PP_MMXEXT
100 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
101 #elif TEMPLATE_PP_MMX
102 #define PMAXUB(a,b) \
103  "psubusb " #a ", " #b " \n\t"\
104  "paddb " #a ", " #b " \n\t"
105 #endif
106 
107 //FIXME? |255-0| = 1 (should not be a problem ...)
108 #if TEMPLATE_PP_MMX
109 /**
110  * Check if the middle 8x8 Block in the given 8x16 block is flat
 *
 * Eight rows starting 4 rows below src are examined, 8 bytes per row.
 *
 * @param src    top of the 8x16 block; the function skips down 4 rows itself
 * @param stride offset in bytes between vertically adjacent rows
 * @param c      context; reads mmxDcOffset[nonBQP], mmxDcThreshold[nonBQP],
 *               pQPb and ppMode.flatnessThreshold
 * @return 2 if the number of counted row-to-row byte pairs (numEq) does not
 *         exceed ppMode.flatnessThreshold; otherwise 1 if in every column
 *         max - min over the 8 rows is <= 2*QP, and 0 if some column exceeds
 *         that range
111  */
112 static inline int RENAME(vertClassify)(const uint8_t src[], int stride, PPContext *c){
113  int numEq= 0, dcOk;
114  src+= stride*4; // src points to begin of the 8x8 Block
 // Preload the per-byte constants used by every comparison below:
 // mm7 = DC offset, mm6 = DC threshold (both selected by c->nonBQP).
115  __asm__ volatile(
116  "movq %0, %%mm7 \n\t"
117  "movq %1, %%mm6 \n\t"
118  : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
119  );
120 
 // Register roles in the statement below:
 //   mm0      accumulates the 0xFF (-1) compare masks, i.e. a negated per-column count
 //   mm3/mm4  running per-column minimum / maximum of the rows seen so far
 //   mm5      scratch for the PMINUB fallback
 //   mm1/mm2  alternate between "current row" and "previous row"
121  __asm__ volatile(
122  "lea (%2, %3), %%"FF_REG_a" \n\t"
 // NOTE: the addressing table below does not match the operand numbering of
 // this statement: here src is %2 and stride is %3, and ecx is not used.
123 // 0 1 2 3 4 5 6 7 8 9
124 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
125 
126  "movq (%2), %%mm0 \n\t"
127  "movq (%%"FF_REG_a"), %%mm1 \n\t"
128  "movq %%mm0, %%mm3 \n\t"
129  "movq %%mm0, %%mm4 \n\t"
130  PMAXUB(%%mm1, %%mm4)
131  PMINUB(%%mm1, %%mm3, %%mm5)
132  "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
133  "paddb %%mm7, %%mm0 \n\t"
134  "pcmpgtb %%mm6, %%mm0 \n\t" // 0xFF where (row0 - row1 + offset) > threshold (signed)
135 
 // The same difference/offset/compare step is repeated for each following
 // pair of adjacent rows; every resulting mask is added into mm0.
136  "movq (%%"FF_REG_a",%3), %%mm2 \n\t"
137  PMAXUB(%%mm2, %%mm4)
138  PMINUB(%%mm2, %%mm3, %%mm5)
139  "psubb %%mm2, %%mm1 \n\t"
140  "paddb %%mm7, %%mm1 \n\t"
141  "pcmpgtb %%mm6, %%mm1 \n\t"
142  "paddb %%mm1, %%mm0 \n\t"
143 
144  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
145  PMAXUB(%%mm1, %%mm4)
146  PMINUB(%%mm1, %%mm3, %%mm5)
147  "psubb %%mm1, %%mm2 \n\t"
148  "paddb %%mm7, %%mm2 \n\t"
149  "pcmpgtb %%mm6, %%mm2 \n\t"
150  "paddb %%mm2, %%mm0 \n\t"
151 
152  "lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t" // scratch register now addresses src + 5*stride
153 
154  "movq (%2, %3, 4), %%mm2 \n\t"
155  PMAXUB(%%mm2, %%mm4)
156  PMINUB(%%mm2, %%mm3, %%mm5)
157  "psubb %%mm2, %%mm1 \n\t"
158  "paddb %%mm7, %%mm1 \n\t"
159  "pcmpgtb %%mm6, %%mm1 \n\t"
160  "paddb %%mm1, %%mm0 \n\t"
161 
162  "movq (%%"FF_REG_a"), %%mm1 \n\t"
163  PMAXUB(%%mm1, %%mm4)
164  PMINUB(%%mm1, %%mm3, %%mm5)
165  "psubb %%mm1, %%mm2 \n\t"
166  "paddb %%mm7, %%mm2 \n\t"
167  "pcmpgtb %%mm6, %%mm2 \n\t"
168  "paddb %%mm2, %%mm0 \n\t"
169 
170  "movq (%%"FF_REG_a", %3), %%mm2 \n\t"
171  PMAXUB(%%mm2, %%mm4)
172  PMINUB(%%mm2, %%mm3, %%mm5)
173  "psubb %%mm2, %%mm1 \n\t"
174  "paddb %%mm7, %%mm1 \n\t"
175  "pcmpgtb %%mm6, %%mm1 \n\t"
176  "paddb %%mm1, %%mm0 \n\t"
177 
178  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
179  PMAXUB(%%mm1, %%mm4)
180  PMINUB(%%mm1, %%mm3, %%mm5)
181  "psubb %%mm1, %%mm2 \n\t"
182  "paddb %%mm7, %%mm2 \n\t"
183  "pcmpgtb %%mm6, %%mm2 \n\t"
184  "paddb %%mm2, %%mm0 \n\t"
185  "psubusb %%mm3, %%mm4 \n\t" // mm4 = per-column max - min
186 
187  " \n\t"
 // Horizontal sum of the 8 byte counters in mm0; only the low 8 bits of the
 // result are used by the C code after the statement.
188 #if TEMPLATE_PP_MMXEXT
189  "pxor %%mm7, %%mm7 \n\t"
190  "psadbw %%mm7, %%mm0 \n\t"
191 #else
 // No psadbw: fold the bytes together with shift + byte-add steps so that
 // the lowest byte ends up holding the sum of all 8 bytes (mod 256).
192  "movq %%mm0, %%mm1 \n\t"
193  "psrlw $8, %%mm0 \n\t"
194  "paddb %%mm1, %%mm0 \n\t"
195  "movq %%mm0, %%mm1 \n\t"
196  "psrlq $16, %%mm0 \n\t"
197  "paddb %%mm1, %%mm0 \n\t"
198  "movq %%mm0, %%mm1 \n\t"
199  "psrlq $32, %%mm0 \n\t"
200  "paddb %%mm1, %%mm0 \n\t"
201 #endif
202  "movq %4, %%mm7 \n\t" // QP,..., QP
203  "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
204  "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0
205  "packssdw %%mm4, %%mm4 \n\t" // squeeze all 8 bytes into the low 32 bits; nonzero stays nonzero
206  "movd %%mm0, %0 \n\t"
207  "movd %%mm4, %1 \n\t"
208 
209  : "=r" (numEq), "=r" (dcOk)
210  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
211  : "%"FF_REG_a
212  );
213 
 // Each matching byte contributed -1 to the sum, so negate and keep 8 bits
 // to obtain the count.
214  numEq= (-numEq) &0xFF;
215  if(numEq > c->ppMode.flatnessThreshold){
 // Despite its name, dcOk is NONZERO when some column's max-min exceeds 2*QP.
216  if(dcOk) return 0;
217  else return 1;
218  }else{
219  return 2;
220  }
221 }
222 #endif //TEMPLATE_PP_MMX
223 
224 /**
225  * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
226  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
 *
 * Two implementations are selected at compile time: an inline-asm version
 * built from PAVGB (MMXEXT / 3DNow!) and a scalar C fallback.
 * Both first advance src by 3 rows; relative to that position they read
 * rows 0..9 and overwrite rows 1..8.
 *
 * @param src    top of the 8x16 block
 * @param stride offset in bytes between vertically adjacent rows
 * @param c      context; the asm version reads c->pQPb, the C version c->QP
 *
 * NOTE(review): the edge test differs slightly between the two versions:
 * the asm keeps the outer row when |diff| <= QP, the C code when |diff| < QP.
227  */
228 #if !TEMPLATE_PP_ALTIVEC
229 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
230 {
231 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
232  src+= stride*3;
 // NOTE(review): this statement temporarily modifies %0 ("add %1, %0") even
 // though src is declared as an input-only operand; the final "sub %1, %0"
 // restores it. It also stores to memory without a "memory" clobber.
233  __asm__ volatile( //"movv %0 %1 %2\n\t"
234  "movq %2, %%mm0 \n\t" // QP,..., QP
235  "pxor %%mm4, %%mm4 \n\t"
236 
 // Top edge: mm6 = row 0 where |row0 - row1| <= QP, else row 1.
237  "movq (%0), %%mm6 \n\t"
238  "movq (%0, %1), %%mm5 \n\t"
239  "movq %%mm5, %%mm1 \n\t"
240  "movq %%mm6, %%mm2 \n\t"
241  "psubusb %%mm6, %%mm5 \n\t"
242  "psubusb %%mm1, %%mm2 \n\t"
243  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
244  "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
245  "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
246 
247  "pand %%mm2, %%mm6 \n\t"
248  "pandn %%mm1, %%mm2 \n\t"
249  "por %%mm2, %%mm6 \n\t"// First Line to Filter
250 
 // Bottom edge, same selection: mm7 = row 9 where |row8 - row9| <= QP,
 // else row 8. The scratch registers are set to row 4 and row 7 first, then
 // %0 is moved down one row for the rest of the statement.
251  "movq (%0, %1, 8), %%mm5 \n\t"
252  "lea (%0, %1, 4), %%"FF_REG_a" \n\t"
253  "lea (%0, %1, 8), %%"FF_REG_c" \n\t"
254  "sub %1, %%"FF_REG_c" \n\t"
255  "add %1, %0 \n\t" // %0 points to line 1 not 0
256  "movq (%0, %1, 8), %%mm7 \n\t"
257  "movq %%mm5, %%mm1 \n\t"
258  "movq %%mm7, %%mm2 \n\t"
259  "psubusb %%mm7, %%mm5 \n\t"
260  "psubusb %%mm1, %%mm2 \n\t"
261  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
262  "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
263  "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
264 
265  "pand %%mm2, %%mm7 \n\t"
266  "pandn %%mm1, %%mm2 \n\t"
267  "por %%mm2, %%mm7 \n\t" // First Line to Filter
268 
269 
 // From here on every output row is built as a tree of PAVGB averages; the
 // digit strings in the trailing comments give the resulting tap weights,
 // and each "X" marks a store of a finished output row.
270  // 1 2 3 4 5 6 7 8
271  // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1
272  // 6 4 2 2 1 1
273  // 6 4 4 2
274  // 6 8 2
275 
276  "movq (%0, %1), %%mm0 \n\t" // 1
277  "movq %%mm0, %%mm1 \n\t" // 1
278  PAVGB(%%mm6, %%mm0) //1 1 /2
279  PAVGB(%%mm6, %%mm0) //3 1 /4
280 
281  "movq (%0, %1, 4), %%mm2 \n\t" // 1
282  "movq %%mm2, %%mm5 \n\t" // 1
283  PAVGB((%%FF_REGa), %%mm2) // 11 /2
284  PAVGB((%0, %1, 2), %%mm2) // 211 /4
285  "movq %%mm2, %%mm3 \n\t" // 211 /4
286  "movq (%0), %%mm4 \n\t" // 1
287  PAVGB(%%mm4, %%mm3) // 4 211 /8
288  PAVGB(%%mm0, %%mm3) //642211 /16
289  "movq %%mm3, (%0) \n\t" // X
290  // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
291  "movq %%mm1, %%mm0 \n\t" // 1
292  PAVGB(%%mm6, %%mm0) //1 1 /2
293  "movq %%mm4, %%mm3 \n\t" // 1
294  PAVGB((%0,%1,2), %%mm3) // 1 1 /2
295  PAVGB((%%FF_REGa,%1,2), %%mm5) // 11 /2
296  PAVGB((%%FF_REGa), %%mm5) // 211 /4
297  PAVGB(%%mm5, %%mm3) // 2 2211 /8
298  PAVGB(%%mm0, %%mm3) //4242211 /16
299  "movq %%mm3, (%0,%1) \n\t" // X
300  // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
301  PAVGB(%%mm4, %%mm6) //11 /2
302  "movq (%%"FF_REG_c"), %%mm0 \n\t" // 1
303  PAVGB((%%FF_REGa, %1, 2), %%mm0) // 11/2
304  "movq %%mm0, %%mm3 \n\t" // 11/2
305  PAVGB(%%mm1, %%mm0) // 2 11/4
306  PAVGB(%%mm6, %%mm0) //222 11/8
307  PAVGB(%%mm2, %%mm0) //22242211/16
308  "movq (%0, %1, 2), %%mm2 \n\t" // 1
309  "movq %%mm0, (%0, %1, 2) \n\t" // X
310  // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
311  "movq (%%"FF_REG_a", %1, 4), %%mm0 \n\t" // 1
312  PAVGB((%%FF_REGc), %%mm0) // 11 /2
313  PAVGB(%%mm0, %%mm6) //11 11 /4
314  PAVGB(%%mm1, %%mm4) // 11 /2
315  PAVGB(%%mm2, %%mm1) // 11 /2
316  PAVGB(%%mm1, %%mm6) //1122 11 /8
317  PAVGB(%%mm5, %%mm6) //112242211 /16
318  "movq (%%"FF_REG_a"), %%mm5 \n\t" // 1
319  "movq %%mm6, (%%"FF_REG_a") \n\t" // X
320  // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
321  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t" // 1
322  PAVGB(%%mm7, %%mm6) // 11 /2
323  PAVGB(%%mm4, %%mm6) // 11 11 /4
324  PAVGB(%%mm3, %%mm6) // 11 2211 /8
325  PAVGB(%%mm5, %%mm2) // 11 /2
326  "movq (%0, %1, 4), %%mm4 \n\t" // 1
327  PAVGB(%%mm4, %%mm2) // 112 /4
328  PAVGB(%%mm2, %%mm6) // 112242211 /16
329  "movq %%mm6, (%0, %1, 4) \n\t" // X
330  // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
331  PAVGB(%%mm7, %%mm1) // 11 2 /4
332  PAVGB(%%mm4, %%mm5) // 11 /2
333  PAVGB(%%mm5, %%mm0) // 11 11 /4
334  "movq (%%"FF_REG_a", %1, 2), %%mm6 \n\t" // 1
335  PAVGB(%%mm6, %%mm1) // 11 4 2 /8
336  PAVGB(%%mm0, %%mm1) // 11224222 /16
337  "movq %%mm1, (%%"FF_REG_a", %1, 2) \n\t" // X
338  // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
339  PAVGB((%%FF_REGc), %%mm2) // 112 4 /8
340  "movq (%%"FF_REG_a", %1, 4), %%mm0 \n\t" // 1
341  PAVGB(%%mm0, %%mm6) // 1 1 /2
342  PAVGB(%%mm7, %%mm6) // 1 12 /4
343  PAVGB(%%mm2, %%mm6) // 1122424 /4
344  "movq %%mm6, (%%"FF_REG_c") \n\t" // X
345  // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
346  PAVGB(%%mm7, %%mm5) // 11 2 /4
347  PAVGB(%%mm7, %%mm5) // 11 6 /8
348 
349  PAVGB(%%mm3, %%mm0) // 112 /4
350  PAVGB(%%mm0, %%mm5) // 112246 /16
351  "movq %%mm5, (%%"FF_REG_a", %1, 4) \n\t" // X
352  "sub %1, %0 \n\t" // undo the earlier "add %1, %0"
353 
354  :
355  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
356  : "%"FF_REG_a, "%"FF_REG_c
357  );
358 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
 // Scalar fallback. l1..l9 are the byte offsets of rows 1..9 relative to src
 // after src has been advanced by 3 rows.
359  const int l1= stride;
360  const int l2= stride + l1;
361  const int l3= stride + l2;
362  const int l4= stride + l3;
363  const int l5= stride + l4;
364  const int l6= stride + l5;
365  const int l7= stride + l6;
366  const int l8= stride + l7;
367  const int l9= stride + l8;
368  int x;
369  src+= stride*3;
 // One column per iteration.
370  for(x=0; x<BLOCK_SIZE; x++){
 // Edge samples: use the outermost row only if it is close (< QP) to its
 // inner neighbour, otherwise repeat the neighbour.
371  const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
372  const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
373 
 // sums[i] is a running 7-sample window sum that slides down one row per
 // index, with "first"/"last" standing in for samples beyond the edges.
 // The +4 is carried into every entry; two entries are added per output,
 // giving the +8 rounding term for the final >>4.
374  int sums[10];
375  sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
376  sums[1] = sums[0] - first + src[l4];
377  sums[2] = sums[1] - first + src[l5];
378  sums[3] = sums[2] - first + src[l6];
379  sums[4] = sums[3] - first + src[l7];
380  sums[5] = sums[4] - src[l1] + src[l8];
381  sums[6] = sums[5] - src[l2] + last;
382  sums[7] = sums[6] - src[l3] + last;
383  sums[8] = sums[7] - src[l4] + last;
384  sums[9] = sums[8] - src[l5] + last;
385 
 // All sums were taken from the unfiltered samples above, so the rows can
 // now be overwritten in place.
386  src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
387  src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
388  src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
389  src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
390  src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
391  src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
392  src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
393  src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
394 
395  src++;
396  }
397 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
398 }
399 #endif //TEMPLATE_PP_ALTIVEC
400 
401 /**
402  * Experimental Filter 1
403  * will not damage linear gradients
404  * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
405  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
406  * MMX2 version does correct clipping C version does not
 *
 * Both versions first advance src by 3 rows. Relative to that position they
 * read rows 3..6, derive a step size d from the differences around the
 * row 4 / row 5 boundary, and then adjust rows 2..7 by 3d/8, d/4 and d/8
 * (largest correction next to the boundary).
 *
 * @param src    top of the block
 * @param stride offset in bytes between vertically adjacent rows
 * @param co     context; the asm version reads co->pQPb, the C version co->QP
407  */
408 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
409 {
410 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
411  src+= stride*3;
412 
413  __asm__ volatile(
414  "pxor %%mm7, %%mm7 \n\t" // 0
415  "lea (%0, %1), %%"FF_REG_a" \n\t"
416  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
417 // 0 1 2 3 4 5 6 7 8 9
418 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
419  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3
420  "movq (%0, %1, 4), %%mm1 \n\t" // line 4
421  "movq %%mm1, %%mm2 \n\t" // line 4
422  "psubusb %%mm0, %%mm1 \n\t"
423  "psubusb %%mm2, %%mm0 \n\t"
424  "por %%mm1, %%mm0 \n\t" // |l3 - l4|
425  "movq (%%"FF_REG_c"), %%mm3 \n\t" // line 5
426  "movq (%%"FF_REG_c", %1), %%mm4 \n\t" // line 6
427  "movq %%mm3, %%mm5 \n\t" // line 5
428  "psubusb %%mm4, %%mm3 \n\t"
429  "psubusb %%mm5, %%mm4 \n\t"
430  "por %%mm4, %%mm3 \n\t" // |l5 - l6|
431  PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2
432  "movq %%mm2, %%mm1 \n\t" // line 4
433  "psubusb %%mm5, %%mm2 \n\t"
434  "movq %%mm2, %%mm4 \n\t"
435  "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
436  "psubusb %%mm1, %%mm5 \n\t"
437  "por %%mm5, %%mm4 \n\t" // |l4 - l5|
438  "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
439  "movq %%mm4, %%mm3 \n\t" // d
440  "movq %2, %%mm0 \n\t" // QP,..., QP
441  "paddusb %%mm0, %%mm0 \n\t" // 2QP ... 2QP (saturating)
442  "psubusb %%mm0, %%mm4 \n\t"
443  "pcmpeqb %%mm7, %%mm4 \n\t" // d <= 2QP ? -1 : 0
444  "psubusb "MANGLE(b01)", %%mm3 \n\t" // d minus the b01 constant, saturating (presumably 1 per byte)
445  "pand %%mm4, %%mm3 \n\t" // d <= 2QP ? d : 0
446 
447  PAVGB(%%mm7, %%mm3) // d/2
448  "movq %%mm3, %%mm1 \n\t" // d/2
449  PAVGB(%%mm7, %%mm3) // d/4
450  PAVGB(%%mm1, %%mm3) // 3*d/8
451 
 // mm2 is the sign mask from above. XOR-ing a row with it before and after a
 // saturating add/sub flips the direction of the correction (and of the
 // clipping) for the bytes where l4 <= l5.
452  "movq (%0, %1, 4), %%mm0 \n\t" // line 4
453  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
454  "psubusb %%mm3, %%mm0 \n\t"
455  "pxor %%mm2, %%mm0 \n\t"
456  "movq %%mm0, (%0, %1, 4) \n\t" // line 4
457 
458  "movq (%%"FF_REG_c"), %%mm0 \n\t" // line 5
459  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
460  "paddusb %%mm3, %%mm0 \n\t"
461  "pxor %%mm2, %%mm0 \n\t"
462  "movq %%mm0, (%%"FF_REG_c") \n\t" // line 5
463 
464  PAVGB(%%mm7, %%mm1) // d/4
465 
466  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3
467  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
468  "psubusb %%mm1, %%mm0 \n\t"
469  "pxor %%mm2, %%mm0 \n\t"
470  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t" // line 3
471 
472  "movq (%%"FF_REG_c", %1), %%mm0 \n\t" // line 6
473  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
474  "paddusb %%mm1, %%mm0 \n\t"
475  "pxor %%mm2, %%mm0 \n\t"
476  "movq %%mm0, (%%"FF_REG_c", %1) \n\t" // line 6
477 
478  PAVGB(%%mm7, %%mm1) // d/8
479 
480  "movq (%%"FF_REG_a", %1), %%mm0 \n\t" // line 2
481  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
482  "psubusb %%mm1, %%mm0 \n\t"
483  "pxor %%mm2, %%mm0 \n\t"
484  "movq %%mm0, (%%"FF_REG_a", %1) \n\t" // line 2
485 
486  "movq (%%"FF_REG_c", %1, 2), %%mm0 \n\t" // line 7
487  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
488  "paddusb %%mm1, %%mm0 \n\t"
489  "pxor %%mm2, %%mm0 \n\t"
490  "movq %%mm0, (%%"FF_REG_c", %1, 2) \n\t" // line 7
491 
492  :
493  : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb)
 // The asm text references MANGLE(b01), so b01 has to be declared as a named
 // operand, as doVertDefFilter does for its MANGLE()d constants. Without it
 // the reference is undefined wherever MANGLE() expands to a named operand
 // (presumably toolchains without direct symbol references; see
 // libavutil/x86/asm.h).
494  NAMED_CONSTRAINTS_ADD(b01)
495  : "%"FF_REG_a, "%"FF_REG_c
496  );
497 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
498 
 // Scalar fallback. l1..l7 are the byte offsets of rows 1..7 relative to src
 // after src has been advanced by 3 rows.
499  const int l1= stride;
500  const int l2= stride + l1;
501  const int l3= stride + l2;
502  const int l4= stride + l3;
503  const int l5= stride + l4;
504  const int l6= stride + l5;
505  const int l7= stride + l6;
506 // const int l8= stride + l7;
507 // const int l9= stride + l8;
508  int x;
509 
510  src+= stride*3;
 // One column per iteration.
511  for(x=0; x<BLOCK_SIZE; x++){
 // b is the step across the row 4 / row 5 boundary, a and c are the steps
 // directly above and below it.
512  int a= src[l3] - src[l4];
513  int b= src[l4] - src[l5];
514  int c= src[l5] - src[l6];
515 
 // d: how much the boundary step exceeds the average neighbouring step.
516  int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
517  d= FFMAX(d, 0);
518 
519  if(d < co->QP*2){
 // v carries the sign opposite to b; the results are stored back into
 // uint8_t without clamping (see "C version does not" clip above).
520  int v = d * FFSIGN(-b);
521 
522  src[l2] +=v>>3;
523  src[l3] +=v>>2;
524  src[l4] +=(3*v)>>3;
525  src[l5] -=(3*v)>>3;
526  src[l6] -=v>>2;
527  src[l7] -=v>>3;
528  }
529  src++;
530  }
531 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
532 }
533 
534 #if !TEMPLATE_PP_ALTIVEC
535 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
536 {
537 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
538 /*
539  uint8_t tmp[16];
540  const int l1= stride;
541  const int l2= stride + l1;
542  const int l3= stride + l2;
543  const int l4= (int)tmp - (int)src - stride*3;
544  const int l5= (int)tmp - (int)src - stride*3 + 8;
545  const int l6= stride*3 + l3;
546  const int l7= stride + l6;
547  const int l8= stride + l7;
548 
549  memcpy(tmp, src+stride*7, 8);
550  memcpy(tmp+8, src+stride*8, 8);
551 */
552  src+= stride*4;
553  __asm__ volatile(
554 
555 #if 0 //slightly more accurate and slightly slower
556  "pxor %%mm7, %%mm7 \n\t" // 0
557  "lea (%0, %1), %%"FF_REG_a" \n\t"
558  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
559 // 0 1 2 3 4 5 6 7
560 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
561 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
562 
563 
564  "movq (%0, %1, 2), %%mm0 \n\t" // l2
565  "movq (%0), %%mm1 \n\t" // l0
566  "movq %%mm0, %%mm2 \n\t" // l2
567  PAVGB(%%mm7, %%mm0) // ~l2/2
568  PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
569  PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
570 
571  "movq (%%"FF_REG_a"), %%mm1 \n\t" // l1
572  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t" // l3
573  "movq %%mm1, %%mm4 \n\t" // l1
574  PAVGB(%%mm7, %%mm1) // ~l1/2
575  PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
576  PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
577 
578  "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
579  "psubusb %%mm1, %%mm0 \n\t"
580  "psubusb %%mm4, %%mm1 \n\t"
581  "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
582 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
583 
584  "movq (%0, %1, 4), %%mm0 \n\t" // l4
585  "movq %%mm0, %%mm4 \n\t" // l4
586  PAVGB(%%mm7, %%mm0) // ~l4/2
587  PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
588  PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
589 
590  "movq (%%"FF_REG_c"), %%mm2 \n\t" // l5
591  "movq %%mm3, %%mm5 \n\t" // l3
592  PAVGB(%%mm7, %%mm3) // ~l3/2
593  PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
594  PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
595 
596  "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
597  "psubusb %%mm3, %%mm0 \n\t"
598  "psubusb %%mm6, %%mm3 \n\t"
599  "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
600  "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
601 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
602 
603  "movq (%%"FF_REG_c", %1), %%mm6 \n\t" // l6
604  "movq %%mm6, %%mm5 \n\t" // l6
605  PAVGB(%%mm7, %%mm6) // ~l6/2
606  PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
607  PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
608 
609  "movq (%%"FF_REG_c", %1, 2), %%mm5 \n\t" // l7
610  "movq %%mm2, %%mm4 \n\t" // l5
611  PAVGB(%%mm7, %%mm2) // ~l5/2
612  PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
613  PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
614 
615  "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
616  "psubusb %%mm2, %%mm6 \n\t"
617  "psubusb %%mm4, %%mm2 \n\t"
618  "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
619 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
620 
621 
622  PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
623  "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ?
624  "paddusb "MANGLE(b01)", %%mm4 \n\t"
625  "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
626  "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
627  "pand %%mm4, %%mm3 \n\t"
628 
629  "movq %%mm3, %%mm1 \n\t"
630 // "psubusb "MANGLE(b01)", %%mm3 \n\t"
631  PAVGB(%%mm7, %%mm3)
632  PAVGB(%%mm7, %%mm3)
633  "paddusb %%mm1, %%mm3 \n\t"
634 // "paddusb "MANGLE(b01)", %%mm3 \n\t"
635 
636  "movq (%%"FF_REG_a", %1, 2), %%mm6 \n\t" //l3
637  "movq (%0, %1, 4), %%mm5 \n\t" //l4
638  "movq (%0, %1, 4), %%mm4 \n\t" //l4
639  "psubusb %%mm6, %%mm5 \n\t"
640  "psubusb %%mm4, %%mm6 \n\t"
641  "por %%mm6, %%mm5 \n\t" // |l3-l4|
642  "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
643  "pxor %%mm6, %%mm0 \n\t"
644  "pand %%mm0, %%mm3 \n\t"
645  PMINUB(%%mm5, %%mm3, %%mm0)
646 
647  "psubusb "MANGLE(b01)", %%mm3 \n\t"
648  PAVGB(%%mm7, %%mm3)
649 
650  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
651  "movq (%0, %1, 4), %%mm2 \n\t"
652  "pxor %%mm6, %%mm0 \n\t"
653  "pxor %%mm6, %%mm2 \n\t"
654  "psubb %%mm3, %%mm0 \n\t"
655  "paddb %%mm3, %%mm2 \n\t"
656  "pxor %%mm6, %%mm0 \n\t"
657  "pxor %%mm6, %%mm2 \n\t"
658  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
659  "movq %%mm2, (%0, %1, 4) \n\t"
660 #endif //0
661 
662  "lea (%0, %1), %%"FF_REG_a" \n\t"
663  "pcmpeqb %%mm6, %%mm6 \n\t" // -1
664 // 0 1 2 3 4 5 6 7
665 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
666 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
667 
668 
669  "movq (%%"FF_REG_a", %1, 2), %%mm1 \n\t" // l3
670  "movq (%0, %1, 4), %%mm0 \n\t" // l4
671  "pxor %%mm6, %%mm1 \n\t" // -l3-1
672  PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
673 // mm1=-l3-1, mm0=128-q
674 
675  "movq (%%"FF_REG_a", %1, 4), %%mm2 \n\t" // l5
676  "movq (%%"FF_REG_a", %1), %%mm3 \n\t" // l2
677  "pxor %%mm6, %%mm2 \n\t" // -l5-1
678  "movq %%mm2, %%mm5 \n\t" // -l5-1
679  "movq "MANGLE(b80)", %%mm4 \n\t" // 128
680  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
681  PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
682  PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
683  PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
684  PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
685 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
686 
687  "movq (%%"FF_REG_a"), %%mm2 \n\t" // l1
688  "pxor %%mm6, %%mm2 \n\t" // -l1-1
689  PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
690  PAVGB((%0), %%mm1) // (l0-l3+256)/2
691  "movq "MANGLE(b80)", %%mm3 \n\t" // 128
692  PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
693  PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
694  PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
695 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
696 
697  PAVGB((%%FF_REGc, %1), %%mm5) // (l6-l5+256)/2
698  "movq (%%"FF_REG_c", %1, 2), %%mm1 \n\t" // l7
699  "pxor %%mm6, %%mm1 \n\t" // -l7-1
700  PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
701  "movq "MANGLE(b80)", %%mm2 \n\t" // 128
702  PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
703  PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
704  PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
705 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
706 
707  "movq "MANGLE(b00)", %%mm1 \n\t" // 0
708  "movq "MANGLE(b00)", %%mm5 \n\t" // 0
709  "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
710  "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
711  PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
712  PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
713  PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
714 
715 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
716 
717  "movq "MANGLE(b00)", %%mm7 \n\t" // 0
718  "movq %2, %%mm2 \n\t" // QP
719  PAVGB(%%mm6, %%mm2) // 128 + QP/2
720  "psubb %%mm6, %%mm2 \n\t"
721 
722  "movq %%mm4, %%mm1 \n\t"
723  "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
724  "pxor %%mm1, %%mm4 \n\t"
725  "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
726  "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
727  "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
728 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
729 
730  "movq %%mm4, %%mm3 \n\t" // d
731  "psubusb "MANGLE(b01)", %%mm4 \n\t"
732  PAVGB(%%mm7, %%mm4) // d/32
733  PAVGB(%%mm7, %%mm4) // (d + 32)/64
734  "paddb %%mm3, %%mm4 \n\t" // 5d/64
735  "pand %%mm2, %%mm4 \n\t"
736 
737  "movq "MANGLE(b80)", %%mm5 \n\t" // 128
738  "psubb %%mm0, %%mm5 \n\t" // q
739  "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
740  "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
741  "pxor %%mm7, %%mm5 \n\t"
742 
743  PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
744  "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
745 
746  "pand %%mm7, %%mm4 \n\t"
747  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
748  "movq (%0, %1, 4), %%mm2 \n\t"
749  "pxor %%mm1, %%mm0 \n\t"
750  "pxor %%mm1, %%mm2 \n\t"
751  "paddb %%mm4, %%mm0 \n\t"
752  "psubb %%mm4, %%mm2 \n\t"
753  "pxor %%mm1, %%mm0 \n\t"
754  "pxor %%mm1, %%mm2 \n\t"
755  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
756  "movq %%mm2, (%0, %1, 4) \n\t"
757 
758  :
759  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
760  NAMED_CONSTRAINTS_ADD(b80,b00,b01)
761  : "%"FF_REG_a, "%"FF_REG_c
762  );
763 
764 /*
765  {
766  int x;
767  src-= stride;
768  for(x=0; x<BLOCK_SIZE; x++){
769  const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
770  if(FFABS(middleEnergy)< 8*QP){
771  const int q=(src[l4] - src[l5])/2;
772  const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
773  const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
774 
775  int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
776  d= FFMAX(d, 0);
777 
778  d= (5*d + 32) >> 6;
779  d*= FFSIGN(-middleEnergy);
780 
781  if(q>0){
782  d= d<0 ? 0 : d;
783  d= d>q ? q : d;
784  }else{
785  d= d>0 ? 0 : d;
786  d= d<q ? q : d;
787  }
788 
789  src[l4]-= d;
790  src[l5]+= d;
791  }
792  src++;
793  }
794  src-=8;
795  for(x=0; x<8; x++){
796  int y;
797  for(y=4; y<6; y++){
798  int d= src[x+y*stride] - tmp[x+(y-4)*8];
799  int ad= FFABS(d);
800  static int max=0;
801  static int sum=0;
802  static int num=0;
803  static int bias=0;
804 
805  if(max<ad) max=ad;
806  sum+= ad>3 ? 1 : 0;
807  if(ad>3){
808  src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
809  }
810  if(y==4) bias+=d;
811  num++;
812  if(num%1000000 == 0){
813  av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias);
814  }
815  }
816  }
817 }
818 */
819 #elif TEMPLATE_PP_MMX
820  DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars
821  src+= stride*4;
822  __asm__ volatile(
823  "pxor %%mm7, %%mm7 \n\t"
824 // 0 1 2 3 4 5 6 7
825 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1
826 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1
827 
828  "movq (%0), %%mm0 \n\t"
829  "movq %%mm0, %%mm1 \n\t"
830  "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
831  "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
832 
833  "movq (%0, %1), %%mm2 \n\t"
834  "lea (%0, %1, 2), %%"FF_REG_a" \n\t"
835  "movq %%mm2, %%mm3 \n\t"
836  "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
837  "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
838 
839  "movq (%%"FF_REG_a"), %%mm4 \n\t"
840  "movq %%mm4, %%mm5 \n\t"
841  "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
842  "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
843 
844  "paddw %%mm0, %%mm0 \n\t" // 2L0
845  "paddw %%mm1, %%mm1 \n\t" // 2H0
846  "psubw %%mm4, %%mm2 \n\t" // L1 - L2
847  "psubw %%mm5, %%mm3 \n\t" // H1 - H2
848  "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
849  "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
850 
851  "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
852  "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
853  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
854  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
855 
856  "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
857  "movq %%mm2, %%mm3 \n\t"
858  "punpcklbw %%mm7, %%mm2 \n\t" // L3
859  "punpckhbw %%mm7, %%mm3 \n\t" // H3
860 
861  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
862  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
863  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
864  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
865  "movq %%mm0, (%3) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
866  "movq %%mm1, 8(%3) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
867 
868  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
869  "movq %%mm0, %%mm1 \n\t"
870  "punpcklbw %%mm7, %%mm0 \n\t" // L4
871  "punpckhbw %%mm7, %%mm1 \n\t" // H4
872 
873  "psubw %%mm0, %%mm2 \n\t" // L3 - L4
874  "psubw %%mm1, %%mm3 \n\t" // H3 - H4
875  "movq %%mm2, 16(%3) \n\t" // L3 - L4
876  "movq %%mm3, 24(%3) \n\t" // H3 - H4
877  "paddw %%mm4, %%mm4 \n\t" // 2L2
878  "paddw %%mm5, %%mm5 \n\t" // 2H2
879  "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
880  "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
881 
882  "lea (%%"FF_REG_a", %1), %0 \n\t"
883  "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
884  "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
885  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
886  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
887 //50 opcodes so far
888  "movq (%0, %1, 2), %%mm2 \n\t"
889  "movq %%mm2, %%mm3 \n\t"
890  "punpcklbw %%mm7, %%mm2 \n\t" // L5
891  "punpckhbw %%mm7, %%mm3 \n\t" // H5
892  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
893  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
894  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
895  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
896 
897  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
898  "punpcklbw %%mm7, %%mm6 \n\t" // L6
899  "psubw %%mm6, %%mm2 \n\t" // L5 - L6
900  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
901  "punpckhbw %%mm7, %%mm6 \n\t" // H6
902  "psubw %%mm6, %%mm3 \n\t" // H5 - H6
903 
904  "paddw %%mm0, %%mm0 \n\t" // 2L4
905  "paddw %%mm1, %%mm1 \n\t" // 2H4
906  "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
907  "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
908 
909  "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
910  "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
911  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
912  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
913 
914  "movq (%0, %1, 4), %%mm2 \n\t"
915  "movq %%mm2, %%mm3 \n\t"
916  "punpcklbw %%mm7, %%mm2 \n\t" // L7
917  "punpckhbw %%mm7, %%mm3 \n\t" // H7
918 
919  "paddw %%mm2, %%mm2 \n\t" // 2L7
920  "paddw %%mm3, %%mm3 \n\t" // 2H7
921  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
922  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
923 
924  "movq (%3), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
925  "movq 8(%3), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
926 
927 #if TEMPLATE_PP_MMXEXT
928  "movq %%mm7, %%mm6 \n\t" // 0
929  "psubw %%mm0, %%mm6 \n\t"
930  "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
931  "movq %%mm7, %%mm6 \n\t" // 0
932  "psubw %%mm1, %%mm6 \n\t"
933  "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
934  "movq %%mm7, %%mm6 \n\t" // 0
935  "psubw %%mm2, %%mm6 \n\t"
936  "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
937  "movq %%mm7, %%mm6 \n\t" // 0
938  "psubw %%mm3, %%mm6 \n\t"
939  "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
940 #else
941  "movq %%mm7, %%mm6 \n\t" // 0
942  "pcmpgtw %%mm0, %%mm6 \n\t"
943  "pxor %%mm6, %%mm0 \n\t"
944  "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
945  "movq %%mm7, %%mm6 \n\t" // 0
946  "pcmpgtw %%mm1, %%mm6 \n\t"
947  "pxor %%mm6, %%mm1 \n\t"
948  "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
949  "movq %%mm7, %%mm6 \n\t" // 0
950  "pcmpgtw %%mm2, %%mm6 \n\t"
951  "pxor %%mm6, %%mm2 \n\t"
952  "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
953  "movq %%mm7, %%mm6 \n\t" // 0
954  "pcmpgtw %%mm3, %%mm6 \n\t"
955  "pxor %%mm6, %%mm3 \n\t"
956  "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
957 #endif
958 
959 #if TEMPLATE_PP_MMXEXT
960  "pminsw %%mm2, %%mm0 \n\t"
961  "pminsw %%mm3, %%mm1 \n\t"
962 #else
963  "movq %%mm0, %%mm6 \n\t"
964  "psubusw %%mm2, %%mm6 \n\t"
965  "psubw %%mm6, %%mm0 \n\t"
966  "movq %%mm1, %%mm6 \n\t"
967  "psubusw %%mm3, %%mm6 \n\t"
968  "psubw %%mm6, %%mm1 \n\t"
969 #endif
970 
971  "movd %2, %%mm2 \n\t" // QP
972  "punpcklbw %%mm7, %%mm2 \n\t"
973 
974  "movq %%mm7, %%mm6 \n\t" // 0
975  "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
976  "pxor %%mm6, %%mm4 \n\t"
977  "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
978  "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
979  "pxor %%mm7, %%mm5 \n\t"
980  "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
981 // 100 opcodes
982  "psllw $3, %%mm2 \n\t" // 8QP
983  "movq %%mm2, %%mm3 \n\t" // 8QP
984  "pcmpgtw %%mm4, %%mm2 \n\t"
985  "pcmpgtw %%mm5, %%mm3 \n\t"
986  "pand %%mm2, %%mm4 \n\t"
987  "pand %%mm3, %%mm5 \n\t"
988 
989 
990  "psubusw %%mm0, %%mm4 \n\t" // hd
991  "psubusw %%mm1, %%mm5 \n\t" // ld
992 
993 
994  "movq "MANGLE(w05)", %%mm2 \n\t" // 5
995  "pmullw %%mm2, %%mm4 \n\t"
996  "pmullw %%mm2, %%mm5 \n\t"
997  "movq "MANGLE(w20)", %%mm2 \n\t" // 32
998  "paddw %%mm2, %%mm4 \n\t"
999  "paddw %%mm2, %%mm5 \n\t"
1000  "psrlw $6, %%mm4 \n\t"
1001  "psrlw $6, %%mm5 \n\t"
1002 
1003  "movq 16(%3), %%mm0 \n\t" // L3 - L4
1004  "movq 24(%3), %%mm1 \n\t" // H3 - H4
1005 
1006  "pxor %%mm2, %%mm2 \n\t"
1007  "pxor %%mm3, %%mm3 \n\t"
1008 
1009  "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
1010  "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
1011  "pxor %%mm2, %%mm0 \n\t"
1012  "pxor %%mm3, %%mm1 \n\t"
1013  "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
1014  "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
1015  "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
1016  "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
1017 
1018  "pxor %%mm6, %%mm2 \n\t"
1019  "pxor %%mm7, %%mm3 \n\t"
1020  "pand %%mm2, %%mm4 \n\t"
1021  "pand %%mm3, %%mm5 \n\t"
1022 
1023 #if TEMPLATE_PP_MMXEXT
1024  "pminsw %%mm0, %%mm4 \n\t"
1025  "pminsw %%mm1, %%mm5 \n\t"
1026 #else
1027  "movq %%mm4, %%mm2 \n\t"
1028  "psubusw %%mm0, %%mm2 \n\t"
1029  "psubw %%mm2, %%mm4 \n\t"
1030  "movq %%mm5, %%mm2 \n\t"
1031  "psubusw %%mm1, %%mm2 \n\t"
1032  "psubw %%mm2, %%mm5 \n\t"
1033 #endif
1034  "pxor %%mm6, %%mm4 \n\t"
1035  "pxor %%mm7, %%mm5 \n\t"
1036  "psubw %%mm6, %%mm4 \n\t"
1037  "psubw %%mm7, %%mm5 \n\t"
1038  "packsswb %%mm5, %%mm4 \n\t"
1039  "movq (%0), %%mm0 \n\t"
1040  "paddb %%mm4, %%mm0 \n\t"
1041  "movq %%mm0, (%0) \n\t"
1042  "movq (%0, %1), %%mm0 \n\t"
1043  "psubb %%mm4, %%mm0 \n\t"
1044  "movq %%mm0, (%0, %1) \n\t"
1045 
1046  : "+r" (src)
1047  : "r" ((x86_reg)stride), "m" (c->pQPb), "r"(tmp)
1048  NAMED_CONSTRAINTS_ADD(w05,w20)
1049  : "%"FF_REG_a
1050  );
1051 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1052  const int l1= stride;
1053  const int l2= stride + l1;
1054  const int l3= stride + l2;
1055  const int l4= stride + l3;
1056  const int l5= stride + l4;
1057  const int l6= stride + l5;
1058  const int l7= stride + l6;
1059  const int l8= stride + l7;
1060 // const int l9= stride + l8;
1061  int x;
1062  src+= stride*3;
1063  for(x=0; x<BLOCK_SIZE; x++){
1064  const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1065  if(FFABS(middleEnergy) < 8*c->QP){
1066  const int q=(src[l4] - src[l5])/2;
1067  const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1068  const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1069 
1070  int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
1071  d= FFMAX(d, 0);
1072 
1073  d= (5*d + 32) >> 6;
1074  d*= FFSIGN(-middleEnergy);
1075 
1076  if(q>0){
1077  d = FFMAX(d, 0);
1078  d = FFMIN(d, q);
1079  }else{
1080  d = FFMIN(d, 0);
1081  d = FFMAX(d, q);
1082  }
1083 
1084  src[l4]-= d;
1085  src[l5]+= d;
1086  }
1087  src++;
1088  }
1089 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1090 }
1091 #endif //TEMPLATE_PP_ALTIVEC
1092 
1093 #if !TEMPLATE_PP_ALTIVEC
/**
 * Dering filter for one 8x8 block.
 *
 * C path: the minimum and maximum pixel are taken over rows 1..8 (relative to
 * src) and the block's own 8 columns. If max - min is below deringThreshold
 * the block is left untouched. Otherwise every pixel of those rows/columns
 * whose complete 3x3 neighbourhood lies on one side of
 * avg = (min + max + 1) >> 1 (all above, or all not above) is replaced by the
 * (1 2 1 / 2 4 2 / 1 2 1) / 16 weighted average of that neighbourhood, with
 * the change limited to +-(c->QP/2 + 1).
 *
 * The inline asm path is the MMX implementation of the same filter.
 * NOTE(review): bit-exact equivalence of the two paths is not established by
 * this code alone - confirm before relying on it.
 *
 * @param src    top-left pixel of the block; rows 0..9 and columns -1..8
 *               around it are read
 * @param stride distance in bytes between two consecutive rows
 * @param c      context; supplies QP (C path) or pQPb / pQPb2 (asm path)
 */
1094 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
1095 {
1096 #if HAVE_7REGS && (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW)
// tmp[0] receives the broadcast (max + min)/2 bytes, tmp[1] is scratch for DERING_CORE
1097  DECLARE_ALIGNED(8, uint64_t, tmp)[3];
1098  __asm__ volatile(
// mm6 = 0 (running maximum), mm7 = all ones (running minimum)
1099  "pxor %%mm6, %%mm6 \n\t"
1100  "pcmpeqb %%mm7, %%mm7 \n\t"
// %3 (c->pQPb2) = bytes of %2 (c->pQPb) halved, plus 1 (psubw of -1);
// this mirrors QP2 = QP/2 + 1 of the C path
1101  "movq %2, %%mm0 \n\t"
1102  "punpcklbw %%mm6, %%mm0 \n\t"
1103  "psrlw $1, %%mm0 \n\t"
1104  "psubw %%mm7, %%mm0 \n\t"
1105  "packuswb %%mm0, %%mm0 \n\t"
1106  "movq %%mm0, %3 \n\t"
1107 
1108  "lea (%0, %1), %%"FF_REG_a" \n\t"
1109  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1110 
1111 // 0 1 2 3 4 5 6 7 8 9
1112 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1113 
1114 #undef REAL_FIND_MIN_MAX
1115 #undef FIND_MIN_MAX
1116 #if TEMPLATE_PP_MMXEXT
1117 #define REAL_FIND_MIN_MAX(addr)\
1118  "movq " #addr ", %%mm0 \n\t"\
1119  "pminub %%mm0, %%mm7 \n\t"\
1120  "pmaxub %%mm0, %%mm6 \n\t"
1121 #else
1122 #define REAL_FIND_MIN_MAX(addr)\
1123  "movq " #addr ", %%mm0 \n\t"\
1124  "movq %%mm7, %%mm1 \n\t"\
1125  "psubusb %%mm0, %%mm6 \n\t"\
1126  "paddb %%mm0, %%mm6 \n\t"\
1127  "psubusb %%mm0, %%mm1 \n\t"\
1128  "psubb %%mm1, %%mm7 \n\t"
1129 #endif
1130 #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr)
1131 
// accumulate the per-column minimum (mm7) and maximum (mm6) over rows 1..8
1132 FIND_MIN_MAX((%%FF_REGa))
1133 FIND_MIN_MAX((%%FF_REGa, %1))
1134 FIND_MIN_MAX((%%FF_REGa, %1, 2))
1135 FIND_MIN_MAX((%0, %1, 4))
1136 FIND_MIN_MAX((%%FF_REGd))
1137 FIND_MIN_MAX((%%FF_REGd, %1))
1138 FIND_MIN_MAX((%%FF_REGd, %1, 2))
1139 FIND_MIN_MAX((%0, %1, 8))
1140 
// fold the 8 column minima in mm7 down to a single byte
1141  "movq %%mm7, %%mm4 \n\t"
1142  "psrlq $8, %%mm7 \n\t"
1143 #if TEMPLATE_PP_MMXEXT
1144  "pminub %%mm4, %%mm7 \n\t" // min of pixels
1145  "pshufw $0xF9, %%mm7, %%mm4 \n\t"
1146  "pminub %%mm4, %%mm7 \n\t" // min of pixels
1147  "pshufw $0xFE, %%mm7, %%mm4 \n\t"
1148  "pminub %%mm4, %%mm7 \n\t"
1149 #else
1150  "movq %%mm7, %%mm1 \n\t"
1151  "psubusb %%mm4, %%mm1 \n\t"
1152  "psubb %%mm1, %%mm7 \n\t"
1153  "movq %%mm7, %%mm4 \n\t"
1154  "psrlq $16, %%mm7 \n\t"
1155  "movq %%mm7, %%mm1 \n\t"
1156  "psubusb %%mm4, %%mm1 \n\t"
1157  "psubb %%mm1, %%mm7 \n\t"
1158  "movq %%mm7, %%mm4 \n\t"
1159  "psrlq $32, %%mm7 \n\t"
1160  "movq %%mm7, %%mm1 \n\t"
1161  "psubusb %%mm4, %%mm1 \n\t"
1162  "psubb %%mm1, %%mm7 \n\t"
1163 #endif
1164 
1165 
// fold the 8 column maxima in mm6 down to a single byte
1166  "movq %%mm6, %%mm4 \n\t"
1167  "psrlq $8, %%mm6 \n\t"
1168 #if TEMPLATE_PP_MMXEXT
1169  "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
1170  "pshufw $0xF9, %%mm6, %%mm4 \n\t"
1171  "pmaxub %%mm4, %%mm6 \n\t"
1172  "pshufw $0xFE, %%mm6, %%mm4 \n\t"
1173  "pmaxub %%mm4, %%mm6 \n\t"
1174 #else
1175  "psubusb %%mm4, %%mm6 \n\t"
1176  "paddb %%mm4, %%mm6 \n\t"
1177  "movq %%mm6, %%mm4 \n\t"
1178  "psrlq $16, %%mm6 \n\t"
1179  "psubusb %%mm4, %%mm6 \n\t"
1180  "paddb %%mm4, %%mm6 \n\t"
1181  "movq %%mm6, %%mm4 \n\t"
1182  "psrlq $32, %%mm6 \n\t"
1183  "psubusb %%mm4, %%mm6 \n\t"
1184  "paddb %%mm4, %%mm6 \n\t"
1185 #endif
1186  "movq %%mm6, %%mm0 \n\t" // max
1187  "psubb %%mm7, %%mm6 \n\t" // max - min
// the accumulator register still holds the row-1 address, so it is saved
// around the threshold compare.
// NOTE(review): push/pop moves the stack pointer behind the compiler's back;
// on ABIs with a red zone this may overwrite live stack data - confirm.
1188  "push %%"FF_REG_a" \n\t"
1189  "movd %%mm6, %%eax \n\t"
1190  "cmpb "MANGLE(deringThreshold)", %%al \n\t"
1191  "pop %%"FF_REG_a" \n\t"
1192  " jb 1f \n\t"
1193  PAVGB(%%mm0, %%mm7) // a=(max + min)/2
// replicate byte 0 of mm7 into all 8 bytes and keep it in tmp[0]
1194  "punpcklbw %%mm7, %%mm7 \n\t"
1195  "punpcklbw %%mm7, %%mm7 \n\t"
1196  "punpcklbw %%mm7, %%mm7 \n\t"
1197  "movq %%mm7, (%4) \n\t"
1198 
1199  "movq (%0), %%mm0 \n\t" // L10
1200  "movq %%mm0, %%mm1 \n\t" // L10
1201  "movq %%mm0, %%mm2 \n\t" // L10
1202  "psllq $8, %%mm1 \n\t"
1203  "psrlq $8, %%mm2 \n\t"
1204  "movd -4(%0), %%mm3 \n\t"
1205  "movd 8(%0), %%mm4 \n\t"
1206  "psrlq $24, %%mm3 \n\t"
1207  "psllq $56, %%mm4 \n\t"
1208  "por %%mm3, %%mm1 \n\t" // L00
1209  "por %%mm4, %%mm2 \n\t" // L20
1210  "movq %%mm1, %%mm3 \n\t" // L00
1211  PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
1212  PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
1213  "psubusb %%mm7, %%mm0 \n\t"
1214  "psubusb %%mm7, %%mm2 \n\t"
1215  "psubusb %%mm7, %%mm3 \n\t"
1216  "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1
1217  "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1
1218  "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1
1219  "paddb %%mm2, %%mm0 \n\t"
1220  "paddb %%mm3, %%mm0 \n\t"
1221 
1222  "movq (%%"FF_REG_a"), %%mm2 \n\t" // L11
1223  "movq %%mm2, %%mm3 \n\t" // L11
1224  "movq %%mm2, %%mm4 \n\t" // L11
1225  "psllq $8, %%mm3 \n\t"
1226  "psrlq $8, %%mm4 \n\t"
1227  "movd -4(%%"FF_REG_a"), %%mm5 \n\t"
1228  "movd 8(%%"FF_REG_a"), %%mm6 \n\t"
1229  "psrlq $24, %%mm5 \n\t"
1230  "psllq $56, %%mm6 \n\t"
1231  "por %%mm5, %%mm3 \n\t" // L01
1232  "por %%mm6, %%mm4 \n\t" // L21
1233  "movq %%mm3, %%mm5 \n\t" // L01
1234  PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
1235  PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
1236  "psubusb %%mm7, %%mm2 \n\t"
1237  "psubusb %%mm7, %%mm4 \n\t"
1238  "psubusb %%mm7, %%mm5 \n\t"
1239  "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1
1240  "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1
1241  "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1
1242  "paddb %%mm4, %%mm2 \n\t"
1243  "paddb %%mm5, %%mm2 \n\t"
1244 // 0, 2, 3, 1
1245 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1246  "movq " #src ", " #sx " \n\t" /* src[0] */\
1247  "movq " #sx ", " #lx " \n\t" /* src[0] */\
1248  "movq " #sx ", " #t0 " \n\t" /* src[0] */\
1249  "psllq $8, " #lx " \n\t"\
1250  "psrlq $8, " #t0 " \n\t"\
1251  "movd -4" #src ", " #t1 " \n\t"\
1252  "psrlq $24, " #t1 " \n\t"\
1253  "por " #t1 ", " #lx " \n\t" /* src[-1] */\
1254  "movd 8" #src ", " #t1 " \n\t"\
1255  "psllq $56, " #t1 " \n\t"\
1256  "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
1257  "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
1258  PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
1259  PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
1260  PAVGB(lx, pplx) \
1261  "movq " #lx ", 8(%4) \n\t"\
1262  "movq (%4), " #lx " \n\t"\
1263  "psubusb " #lx ", " #t1 " \n\t"\
1264  "psubusb " #lx ", " #t0 " \n\t"\
1265  "psubusb " #lx ", " #sx " \n\t"\
1266  "movq "MANGLE(b00)", " #lx " \n\t"\
1267  "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
1268  "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
1269  "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
1270  "paddb " #t1 ", " #t0 " \n\t"\
1271  "paddb " #t0 ", " #sx " \n\t"\
1272 \
1273  PAVGB(plx, pplx) /* filtered */\
1274  "movq " #dst ", " #t0 " \n\t" /* dst */\
1275  "movq " #t0 ", " #t1 " \n\t" /* dst */\
1276  "psubusb %3, " #t0 " \n\t"\
1277  "paddusb %3, " #t1 " \n\t"\
1278  PMAXUB(t0, pplx)\
1279  PMINUB(t1, pplx, t0)\
1280  "paddb " #sx ", " #ppsx " \n\t"\
1281  "paddb " #psx ", " #ppsx " \n\t"\
1282  "#paddb "MANGLE(b02)", " #ppsx " \n\t"\
1283  "pand "MANGLE(b08)", " #ppsx " \n\t"\
1284  "pcmpeqb " #lx ", " #ppsx " \n\t"\
1285  "pand " #ppsx ", " #pplx " \n\t"\
1286  "pandn " #dst ", " #ppsx " \n\t"\
1287  "por " #pplx ", " #ppsx " \n\t"\
1288  "movq " #ppsx ", " #dst " \n\t"\
1289  "movq 8(%4), " #lx " \n\t"
1290 
1291 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1292  REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
1293 /*
1294 0000000
1295 1111111
1296 
1297 1111110
1298 1111101
1299 1111100
1300 1111011
1301 1111010
1302 1111001
1303 
1304 1111000
1305 1110111
1306 
1307 */
// one invocation per output row; the mmX roles rotate so that the data of the
// two previous rows is reused instead of reloaded
1308 //DERING_CORE(dst ,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
1309 DERING_CORE((%%FF_REGa) ,(%%FF_REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1310 DERING_CORE((%%FF_REGa, %1) ,(%%FF_REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1311 DERING_CORE((%%FF_REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1312 DERING_CORE((%0, %1, 4) ,(%%FF_REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1313 DERING_CORE((%%FF_REGd) ,(%%FF_REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1314 DERING_CORE((%%FF_REGd, %1) ,(%%FF_REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1315 DERING_CORE((%%FF_REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1316 DERING_CORE((%0, %1, 8) ,(%%FF_REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1317 
1318  "1: \n\t"
// NOTE(review): %3 (c->pQPb2) is stored to above although it is declared as an
// input operand, and the stores through %0 / %4 are not covered by a "memory"
// clobber - confirm this is safe with the compilers in use.
1319  : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2), "q"(tmp)
1320  NAMED_CONSTRAINTS_ADD(deringThreshold,b00,b02,b08)
1321  : "%"FF_REG_a, "%"FF_REG_d
1322  );
1323 #else // HAVE_7REGS && (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW)
1324  int y;
1325  int min=255;
1326  int max=0;
1327  int avg;
1328  uint8_t *p;
1329  int s[10];
1330  const int QP2= c->QP/2 + 1;
1331 
// step one column to the left so that column index 0 is the pixel left of the block
1332  src --;
// minimum / maximum over rows 1..8 and the 8 columns of the block itself
1333  for(y=1; y<9; y++){
1334  int x;
1335  p= src + stride*y;
1336  for(x=1; x<9; x++){
1337  p++;
1338  if(*p > max) max= *p;
1339  if(*p < min) min= *p;
1340  }
1341  }
1342  avg= (min + max + 1)>>1;
1343 
// not enough contrast inside the block: leave it untouched
1344  if(max - min <deringThreshold) return;
1345 
// per row: bit x of t is set when pixel x is above avg
1346  for(y=0; y<10; y++){
1347  int t = 0;
1348 
1349  if(src[stride*y + 0] > avg) t+= 1;
1350  if(src[stride*y + 1] > avg) t+= 2;
1351  if(src[stride*y + 2] > avg) t+= 4;
1352  if(src[stride*y + 3] > avg) t+= 8;
1353  if(src[stride*y + 4] > avg) t+= 16;
1354  if(src[stride*y + 5] > avg) t+= 32;
1355  if(src[stride*y + 6] > avg) t+= 64;
1356  if(src[stride*y + 7] > avg) t+= 128;
1357  if(src[stride*y + 8] > avg) t+= 256;
1358  if(src[stride*y + 9] > avg) t+= 512;
1359 
// upper 16 bits get the complementary mask (pixel not above avg) ...
1360  t |= (~t)<<16;
// ... then a bit survives only if its left and right neighbours carry it too
1361  t &= (t<<1) & (t>>1);
1362  s[y] = t;
1363  }
1364 
// combine three consecutive rows; afterwards bit x of s[y-1] says that the whole
// 3x3 neighbourhood of pixel (x, y) is on one side of avg
1365  for(y=1; y<9; y++){
1366  int t = s[y-1] & s[y] & s[y+1];
1367  t|= t>>16;
1368  s[y-1]= t;
1369  }
1370 
1371  for(y=1; y<9; y++){
1372  int x;
1373  int t = s[y-1];
1374 
1375  p= src + stride*y;
1376  for(x=1; x<9; x++){
1377  p++;
1378  if(t & (1<<x)){
// 3x3 kernel (1 2 1 / 2 4 2 / 1 2 1), normalised by 16 with rounding
1379  int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1380  +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
1381  +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1382  f= (f + 8)>>4;
1383 
1384 #ifdef DEBUG_DERING_THRESHOLD
1385  __asm__ volatile("emms\n\t":);
1386  {
1387  static uint64_t numPixels=0;
1388  if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1389 // if((max-min)<20 || (max-min)*QP<200)
1390 // if((max-min)*QP < 500)
1391 // if(max-min<QP/2)
1392  if(max-min < 20){
1393  static int numSkipped=0;
1394  static int errorSum=0;
1395  static int worstQP=0;
1396  static int worstRange=0;
1397  static int worstDiff=0;
1398  int diff= (f - *p);
1399  int absDiff= FFABS(diff);
1400  int error= diff*diff;
1401 
1402  if(x==1 || x==8 || y==1 || y==8) continue;
1403 
1404  numSkipped++;
1405  if(absDiff > worstDiff){
1406  worstDiff= absDiff;
1407  worstQP= c->QP;
1408  worstRange= max-min;
1409  }
1410  errorSum+= error;
1411 
1412  if(1024LL*1024LL*1024LL % numSkipped == 0){
1413  av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
1414  "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1415  (float)errorSum/numSkipped, numSkipped, worstQP, worstRange,
1416  worstDiff, (float)numSkipped/numPixels);
1417  }
1418  }
1419  }
1420 #endif
// move the pixel towards the filtered value, but by at most QP2
1421  if (*p + QP2 < f) *p= *p + QP2;
1422  else if(*p - QP2 > f) *p= *p - QP2;
1423  else *p=f;
1424  }
1425  }
1426  }
1427 #ifdef DEBUG_DERING_THRESHOLD
// debug aid: brighten the blocks that fall under the experimental threshold
1428  if(max-min < 20){
1429  for(y=1; y<9; y++){
1430  int x;
1432  p= src + stride*y;
1433  for(x=1; x<9; x++){
1434  p++;
1435  *p = FFMIN(*p + 20, 255);
1436  }
1437  }
1438 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1439  }
1440 #endif
1441 #endif // HAVE_7REGS && (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW)
1442 }
1443 #endif //TEMPLATE_PP_ALTIVEC
1444 
1445 /**
1446  * Deinterlace the given block by linearly interpolating every second line.
1447  * will be called for every 8x8 block and can read & write from line 4-15
1448  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1449  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
 * this filter will read lines 4-12 and write 5-11: each of the lines 5, 7, 9
 * and 11 is replaced by the per-byte average of the line directly above and
 * the line directly below it; 8 pixels per line are processed.
 *
 * @param src    pointer to line 0 of the block
 * @param stride distance in bytes between two consecutive lines
1450  */
1451 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1452 {
1453 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
// from here on src points at line 4
1454  src+= 4*stride;
1455  __asm__ volatile(
1456  "lea (%0, %1), %%"FF_REG_a" \n\t"
1457  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
1458 // 0 1 2 3 4 5 6 7 8 9
1459 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
1460 
// mm0 / mm1 alternately hold the even row above and below the odd row being
// written, so every even row is loaded only once
1461  "movq (%0), %%mm0 \n\t"
1462  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
1463  PAVGB(%%mm1, %%mm0)
1464  "movq %%mm0, (%%"FF_REG_a") \n\t"
1465  "movq (%0, %1, 4), %%mm0 \n\t"
1466  PAVGB(%%mm0, %%mm1)
1467  "movq %%mm1, (%%"FF_REG_a", %1, 2) \n\t"
1468  "movq (%%"FF_REG_c", %1), %%mm1 \n\t"
1469  PAVGB(%%mm1, %%mm0)
1470  "movq %%mm0, (%%"FF_REG_c") \n\t"
1471  "movq (%0, %1, 8), %%mm0 \n\t"
1472  PAVGB(%%mm0, %%mm1)
1473  "movq %%mm1, (%%"FF_REG_c", %1, 2) \n\t"
1474 
// NOTE(review): the asm stores through %0 but lists no "memory" clobber -
// confirm this is safe with the compilers in use.
1475  : : "r" (src), "r" ((x86_reg)stride)
1476  : "%"FF_REG_a, "%"FF_REG_c
1477  );
1478 #else
1479  int a, b, x;
// from here on src points at line 4
1480  src+= 4*stride;
1481 
// two passes of 4 packed pixels each.
// (a|b) - (((a^b)&0xFEFEFEFE)>>1) is the per-byte average of a and b, rounded
// up; the mask keeps the shifted-out bit of one byte from leaking into the next
1482  for(x=0; x<2; x++){
1483  a= *(uint32_t*)&src[stride*0];
1484  b= *(uint32_t*)&src[stride*2];
1485  *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1486  a= *(uint32_t*)&src[stride*4];
1487  *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1488  b= *(uint32_t*)&src[stride*6];
1489  *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1490  a= *(uint32_t*)&src[stride*8];
1491  *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1492  src += 4;
1493  }
1494 #endif
1495 }
1496 
1497 /**
1498  * Deinterlace the given block by cubic interpolating every second line.
1499  * will be called for every 8x8 block and can read & write from line 4-15
1500  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1501  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1502  * this filter will read lines 3-15 and write 6-12 (lines 6, 8, 10 and 12)
 * each written line is (-a + 9b + 9d - e) / 16, where b and d are the lines
 * directly above and below it and a and e the lines three above and below.
 *
 * @param src    pointer to line 0 of the block
 * @param stride distance in bytes between two consecutive lines
1503  */
1504 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1505 {
1506 #if TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
// from here on src points at line 3
1507  src+= stride*3;
1508  __asm__ volatile(
1509  "lea (%0, %1), %%"FF_REG_a" \n\t"
1510  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1511  "lea (%%"FF_REG_d", %1, 4), %%"FF_REG_c"\n\t"
1512  "add %1, %%"FF_REG_c" \n\t"
1513 #if TEMPLATE_PP_SSE2
1514  "pxor %%xmm7, %%xmm7 \n\t"
1515 #define REAL_DEINT_CUBIC(a,b,c,d,e)\
1516  "movq " #a ", %%xmm0 \n\t"\
1517  "movq " #b ", %%xmm1 \n\t"\
1518  "movq " #d ", %%xmm2 \n\t"\
1519  "movq " #e ", %%xmm3 \n\t"\
1520  "pavgb %%xmm2, %%xmm1 \n\t"\
1521  "pavgb %%xmm3, %%xmm0 \n\t"\
1522  "punpcklbw %%xmm7, %%xmm0 \n\t"\
1523  "punpcklbw %%xmm7, %%xmm1 \n\t"\
1524  "psubw %%xmm1, %%xmm0 \n\t"\
1525  "psraw $3, %%xmm0 \n\t"\
1526  "psubw %%xmm0, %%xmm1 \n\t"\
1527  "packuswb %%xmm1, %%xmm1 \n\t"\
1528  "movlps %%xmm1, " #c " \n\t"
1529 #else //TEMPLATE_PP_SSE2
1530  "pxor %%mm7, %%mm7 \n\t"
1531 // 0 1 2 3 4 5 6 7 8 9 10
1532 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1533 
1534 #define REAL_DEINT_CUBIC(a,b,c,d,e)\
1535  "movq " #a ", %%mm0 \n\t"\
1536  "movq " #b ", %%mm1 \n\t"\
1537  "movq " #d ", %%mm2 \n\t"\
1538  "movq " #e ", %%mm3 \n\t"\
1539  PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
1540  PAVGB(%%mm3, %%mm0) /* (a+e) /2 */\
1541  "movq %%mm0, %%mm2 \n\t"\
1542  "punpcklbw %%mm7, %%mm0 \n\t"\
1543  "punpckhbw %%mm7, %%mm2 \n\t"\
1544  "movq %%mm1, %%mm3 \n\t"\
1545  "punpcklbw %%mm7, %%mm1 \n\t"\
1546  "punpckhbw %%mm7, %%mm3 \n\t"\
1547  "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
1548  "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
1549  "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
1550  "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
1551  "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
1552  "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
1553  "packuswb %%mm3, %%mm1 \n\t"\
1554  "movq %%mm1, " #c " \n\t"
1555 #endif //TEMPLATE_PP_SSE2
1556 #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e)
1557 
// arguments are rows relative to src: a, b = rows above, c = row written,
// d, e = rows below; the four calls write rows 3, 5, 7 and 9
1558 DEINT_CUBIC((%0) , (%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd, %1))
1559 DEINT_CUBIC((%%FF_REGa, %1), (%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1), (%0, %1, 8))
1560 DEINT_CUBIC((%0, %1, 4) , (%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGc))
1561 DEINT_CUBIC((%%FF_REGd, %1), (%0, %1, 8) , (%%FF_REGd, %1, 4), (%%FF_REGc) , (%%FF_REGc, %1, 2))
1562 
// NOTE(review): the asm stores through %0 but lists no "memory" clobber -
// confirm this is safe with the compilers in use.
1563  : : "r" (src), "r" ((x86_reg)stride)
1564  :
1565 #if TEMPLATE_PP_SSE2
1566  XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm7",)
1567 #endif
1568  "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c
1569  );
1570 #undef REAL_DEINT_CUBIC
1571 #else //TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1572  int x;
// from here on src points at line 3
1573  src+= stride*3;
// one column per iteration; rows 3, 5, 7 and 9 (relative to src) are rebuilt
// from their even neighbours, the sum is shifted without a rounding term
1574  for(x=0; x<8; x++){
1575  src[stride*3] = av_clip_uint8((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1576  src[stride*5] = av_clip_uint8((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1577  src[stride*7] = av_clip_uint8((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1578  src[stride*9] = av_clip_uint8((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
1579  src++;
1580  }
1581 #endif //TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1582 }
1583 
1584 /**
1585  * Deinterlace the given block by filtering every second line with a (-1 4 2 4 -1) filter.
1586  * will be called for every 8x8 block and can read & write from line 4-15
1587  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1588  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1589  * this filter will read lines 4-13 and write 5-11
 *
 * @param src    pointer to line 0 of the block
 * @param stride distance in bytes between two consecutive lines
 * @param tmp    8 bytes of history: on entry the unfiltered content of the
 *               line two above the first written line (line 3), on return the
 *               unfiltered content of the last written line (line 11), which
 *               plays that role for the block below
1590  */
1591 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1592 {
1593 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
// from here on src points at line 4
1594  src+= stride*4;
1595  __asm__ volatile(
1596  "lea (%0, %1), %%"FF_REG_a" \n\t"
1597  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1598  "pxor %%mm7, %%mm7 \n\t"
1599  "movq (%2), %%mm0 \n\t"
1600 // 0 1 2 3 4 5 6 7 8 9 10
1601 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1602 
1603 #define REAL_DEINT_FF(a,b,c,d)\
1604  "movq " #a ", %%mm1 \n\t"\
1605  "movq " #b ", %%mm2 \n\t"\
1606  "movq " #c ", %%mm3 \n\t"\
1607  "movq " #d ", %%mm4 \n\t"\
1608  PAVGB(%%mm3, %%mm1) \
1609  PAVGB(%%mm4, %%mm0) \
1610  "movq %%mm0, %%mm3 \n\t"\
1611  "punpcklbw %%mm7, %%mm0 \n\t"\
1612  "punpckhbw %%mm7, %%mm3 \n\t"\
1613  "movq %%mm1, %%mm4 \n\t"\
1614  "punpcklbw %%mm7, %%mm1 \n\t"\
1615  "punpckhbw %%mm7, %%mm4 \n\t"\
1616  "psllw $2, %%mm1 \n\t"\
1617  "psllw $2, %%mm4 \n\t"\
1618  "psubw %%mm0, %%mm1 \n\t"\
1619  "psubw %%mm3, %%mm4 \n\t"\
1620  "movq %%mm2, %%mm5 \n\t"\
1621  "movq %%mm2, %%mm0 \n\t"\
1622  "punpcklbw %%mm7, %%mm2 \n\t"\
1623  "punpckhbw %%mm7, %%mm5 \n\t"\
1624  "paddw %%mm2, %%mm1 \n\t"\
1625  "paddw %%mm5, %%mm4 \n\t"\
1626  "psraw $2, %%mm1 \n\t"\
1627  "psraw $2, %%mm4 \n\t"\
1628  "packuswb %%mm4, %%mm1 \n\t"\
1629  "movq %%mm1, " #b " \n\t"\
1630 
1631 #define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d)
1632 
// b is the row that gets rewritten; mm0 carries the unfiltered b of the
// previous call (initially tmp) as the upper outer tap
1633 DEINT_FF((%0) , (%%FF_REGa) , (%%FF_REGa, %1), (%%FF_REGa, %1, 2))
1634 DEINT_FF((%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd) )
1635 DEINT_FF((%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1), (%%FF_REGd, %1, 2))
1636 DEINT_FF((%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4))
1637 
1638  "movq %%mm0, (%2) \n\t"
// NOTE(review): the asm stores through %0 and %2 but lists no "memory"
// clobber - confirm this is safe with the compilers in use.
1639  : : "r" (src), "r" ((x86_reg)stride), "r"(tmp)
1640  : "%"FF_REG_a, "%"FF_REG_d
1641  );
1642 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1643  int x;
// from here on src points at line 4
1644  src+= stride*4;
// one column per iteration. t1 / t2 alternately keep the unfiltered value of
// the odd row that is about to be overwritten: it is the centre tap (weight 2)
// of that row and the upper outer tap (weight -1) of the next odd row.
1645  for(x=0; x<8; x++){
1646  int t1= tmp[x];
1647  int t2= src[stride*1];
1648 
1649  src[stride*1]= av_clip_uint8((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
1650  t1= src[stride*3];
1651  src[stride*3]= av_clip_uint8((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
1652  t2= src[stride*5];
1653  src[stride*5]= av_clip_uint8((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
1654  t1= src[stride*7];
1655  src[stride*7]= av_clip_uint8((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
// hand the unfiltered last odd row to the block below
1656  tmp[x]= t1;
1657 
1658  src++;
1659  }
1660 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1661 }
1662 
1663 /**
1664  * Deinterlace the given block by filtering every line with a (-1 2 6 2 -1) filter.
1665  * will be called for every 8x8 block and can read & write from line 4-15
1666  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1667  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1668  * this filter will read lines 4-13 and write 4-11
 *
 * @param src    pointer to line 0 of the block
 * @param stride distance in bytes between two consecutive lines
 * @param tmp    8 bytes of history: unfiltered content of the line two above
 *               the first written line; updated on return for the block below
 * @param tmp2   8 bytes of history: unfiltered content of the line directly
 *               above the first written line; updated on return likewise
1669  */
1670 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
1671 {
1672 #if (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
// from here on src points at line 4
1673  src+= stride*4;
1674  __asm__ volatile(
1675  "lea (%0, %1), %%"FF_REG_a" \n\t"
1676  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1677  "pxor %%mm7, %%mm7 \n\t"
1678  "movq (%2), %%mm0 \n\t"
1679  "movq (%3), %%mm1 \n\t"
1680 // 0 1 2 3 4 5 6 7 8 9 10
1681 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1682 
1683 #define REAL_DEINT_L5(t1,t2,a,b,c)\
1684  "movq " #a ", %%mm2 \n\t"\
1685  "movq " #b ", %%mm3 \n\t"\
1686  "movq " #c ", %%mm4 \n\t"\
1687  PAVGB(t2, %%mm3) \
1688  PAVGB(t1, %%mm4) \
1689  "movq %%mm2, %%mm5 \n\t"\
1690  "movq %%mm2, " #t1 " \n\t"\
1691  "punpcklbw %%mm7, %%mm2 \n\t"\
1692  "punpckhbw %%mm7, %%mm5 \n\t"\
1693  "movq %%mm2, %%mm6 \n\t"\
1694  "paddw %%mm2, %%mm2 \n\t"\
1695  "paddw %%mm6, %%mm2 \n\t"\
1696  "movq %%mm5, %%mm6 \n\t"\
1697  "paddw %%mm5, %%mm5 \n\t"\
1698  "paddw %%mm6, %%mm5 \n\t"\
1699  "movq %%mm3, %%mm6 \n\t"\
1700  "punpcklbw %%mm7, %%mm3 \n\t"\
1701  "punpckhbw %%mm7, %%mm6 \n\t"\
1702  "paddw %%mm3, %%mm3 \n\t"\
1703  "paddw %%mm6, %%mm6 \n\t"\
1704  "paddw %%mm3, %%mm2 \n\t"\
1705  "paddw %%mm6, %%mm5 \n\t"\
1706  "movq %%mm4, %%mm6 \n\t"\
1707  "punpcklbw %%mm7, %%mm4 \n\t"\
1708  "punpckhbw %%mm7, %%mm6 \n\t"\
1709  "psubw %%mm4, %%mm2 \n\t"\
1710  "psubw %%mm6, %%mm5 \n\t"\
1711  "psraw $2, %%mm2 \n\t"\
1712  "psraw $2, %%mm5 \n\t"\
1713  "packuswb %%mm5, %%mm2 \n\t"\
1714  "movq %%mm2, " #a " \n\t"\
1715 
1716 #define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c)
1717 
// a is the row that gets rewritten, b and c the two rows below it; t1 / t2
// hold the unfiltered rows two and one above a and swap roles on every call,
// because each call stores the unfiltered a into t1
1718 DEINT_L5(%%mm0, %%mm1, (%0) , (%%FF_REGa) , (%%FF_REGa, %1) )
1719 DEINT_L5(%%mm1, %%mm0, (%%FF_REGa) , (%%FF_REGa, %1) , (%%FF_REGa, %1, 2))
1720 DEINT_L5(%%mm0, %%mm1, (%%FF_REGa, %1) , (%%FF_REGa, %1, 2), (%0, %1, 4) )
1721 DEINT_L5(%%mm1, %%mm0, (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd) )
1722 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1) )
1723 DEINT_L5(%%mm1, %%mm0, (%%FF_REGd) , (%%FF_REGd, %1) , (%%FF_REGd, %1, 2))
1724 DEINT_L5(%%mm0, %%mm1, (%%FF_REGd, %1) , (%%FF_REGd, %1, 2), (%0, %1, 8) )
1725 DEINT_L5(%%mm1, %%mm0, (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4))
1726 
1727  "movq %%mm0, (%2) \n\t"
1728  "movq %%mm1, (%3) \n\t"
// NOTE(review): the asm stores through %0, %2 and %3 but lists no "memory"
// clobber - confirm this is safe with the compilers in use.
1729  : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2)
1730  : "%"FF_REG_a, "%"FF_REG_d
1731  );
1732 #else //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
1733  int x;
// from here on src points at line 4
1734  src+= stride*4;
// one column per iteration. t1, t2, t3 rotate through the unfiltered values of
// the three most recent rows, because the rows themselves are overwritten in
// place while later rows still need the original data as upper taps.
1735  for(x=0; x<8; x++){
1736  int t1= tmp[x];
1737  int t2= tmp2[x];
1738  int t3= src[0];
1739 
1740  src[stride*0]= av_clip_uint8((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
1741  t1= src[stride*1];
1742  src[stride*1]= av_clip_uint8((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
1743  t2= src[stride*2];
1744  src[stride*2]= av_clip_uint8((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
1745  t3= src[stride*3];
1746  src[stride*3]= av_clip_uint8((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
1747  t1= src[stride*4];
1748  src[stride*4]= av_clip_uint8((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
1749  t2= src[stride*5];
1750  src[stride*5]= av_clip_uint8((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
1751  t3= src[stride*6];
1752  src[stride*6]= av_clip_uint8((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
1753  t1= src[stride*7];
1754  src[stride*7]= av_clip_uint8((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
1755 
// unfiltered rows 6 and 7 become the history for the block below
1756  tmp[x]= t3;
1757  tmp2[x]= t1;
1758 
1759  src++;
1760  }
1761 #endif //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
1762 }
1763 
1764 /**
1765  * Deinterlace the given block by filtering all lines with a (1 2 1) filter.
1766  * will be called for every 8x8 block and can read & write from line 4-15
1767  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1768  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1769  * this filter will read lines 4-12 (plus the saved line in tmp) and write 4-11
 *
 * @param src    pointer to line 0 of the block
 * @param stride distance in bytes between two consecutive lines
 * @param tmp    8 bytes of history: on entry the unfiltered content of the
 *               line above the first written line, on return the unfiltered
 *               content of the last written line (line 11)
1770  */
1771 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
1772 {
1773 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
// from here on src points at line 4
1774  src+= 4*stride;
1775  __asm__ volatile(
1776  "lea (%0, %1), %%"FF_REG_a" \n\t"
1777  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1778 // 0 1 2 3 4 5 6 7 8 9
1779 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1780 
// in the comments below L0 is the saved line from tmp and L1..L9 are the rows
// at %0, %0 + stride, ...; each output is built from two chained PAVGB:
// first the two outer rows, then that result with the centre row
1781  "movq (%2), %%mm0 \n\t" // L0
1782  "movq (%%"FF_REG_a"), %%mm1 \n\t" // L2
1783  PAVGB(%%mm1, %%mm0) // L0+L2
1784  "movq (%0), %%mm2 \n\t" // L1
1785  PAVGB(%%mm2, %%mm0)
1786  "movq %%mm0, (%0) \n\t"
1787  "movq (%%"FF_REG_a", %1), %%mm0 \n\t" // L3
1788  PAVGB(%%mm0, %%mm2) // L1+L3
1789  PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
1790  "movq %%mm2, (%%"FF_REG_a") \n\t"
1791  "movq (%%"FF_REG_a", %1, 2), %%mm2 \n\t" // L4
1792  PAVGB(%%mm2, %%mm1) // L2+L4
1793  PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
1794  "movq %%mm1, (%%"FF_REG_a", %1) \n\t"
1795  "movq (%0, %1, 4), %%mm1 \n\t" // L5
1796  PAVGB(%%mm1, %%mm0) // L3+L5
1797  PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
1798  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
1799  "movq (%%"FF_REG_d"), %%mm0 \n\t" // L6
1800  PAVGB(%%mm0, %%mm2) // L4+L6
1801  PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
1802  "movq %%mm2, (%0, %1, 4) \n\t"
1803  "movq (%%"FF_REG_d", %1), %%mm2 \n\t" // L7
1804  PAVGB(%%mm2, %%mm1) // L5+L7
1805  PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
1806  "movq %%mm1, (%%"FF_REG_d") \n\t"
1807  "movq (%%"FF_REG_d", %1, 2), %%mm1 \n\t" // L8
1808  PAVGB(%%mm1, %%mm0) // L6+L8
1809  PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
1810  "movq %%mm0, (%%"FF_REG_d", %1) \n\t"
1811  "movq (%0, %1, 8), %%mm0 \n\t" // L9
1812  PAVGB(%%mm0, %%mm2) // L7+L9
1813  PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
1814  "movq %%mm2, (%%"FF_REG_d", %1, 2) \n\t"
1815  "movq %%mm1, (%2) \n\t"
1816 
// NOTE(review): the asm stores through %0 and %2 but lists no "memory"
// clobber - confirm this is safe with the compilers in use.
1817  : : "r" (src), "r" ((x86_reg)stride), "r" (tmp)
1818  : "%"FF_REG_a, "%"FF_REG_d
1819  );
1820 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1821  int a, b, c, x;
// from here on src points at line 4
1822  src+= 4*stride;
1823 
// two passes of 4 packed pixels each; a, b, c rotate through the unfiltered
// rows. Per byte, (p&q) + (((p^q)&0xFEFEFEFE)>>1) is the average of p and q
// rounded down and (p|q) - (((p^q)&0xFEFEFEFE)>>1) the average rounded up;
// every output row is the rounded-up average of its own row with the
// rounded-down average of the rows above and below.
1824  for(x=0; x<2; x++){
1825  a= *(uint32_t*)&tmp[stride*0];
1826  b= *(uint32_t*)&src[stride*0];
1827  c= *(uint32_t*)&src[stride*1];
1828  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1829  *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1830 
1831  a= *(uint32_t*)&src[stride*2];
1832  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1833  *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1834 
1835  b= *(uint32_t*)&src[stride*3];
1836  c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1837  *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1838 
1839  c= *(uint32_t*)&src[stride*4];
1840  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1841  *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1842 
1843  a= *(uint32_t*)&src[stride*5];
1844  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1845  *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1846 
1847  b= *(uint32_t*)&src[stride*6];
1848  c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1849  *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1850 
1851  c= *(uint32_t*)&src[stride*7];
1852  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1853  *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1854 
1855  a= *(uint32_t*)&src[stride*8];
1856  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1857  *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1858 
// c still holds the unfiltered row 7: keep it as history for the block below
1859  *(uint32_t*)&tmp[stride*0]= c;
1860  src += 4;
1861  tmp += 4;
1862  }
1863 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1864 }
1865 
1866 /**
1867  * Deinterlace the given block by applying a median filter to every second line.
1868  * will be called for every 8x8 block and can read & write from line 4-15,
1869  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1870  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1871  */
1872 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1873 {
1874 #if TEMPLATE_PP_MMX
1875  src+= 4*stride;
1876 #if TEMPLATE_PP_MMXEXT
1877  __asm__ volatile(
1878  "lea (%0, %1), %%"FF_REG_a" \n\t"
1879  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1880 // 0 1 2 3 4 5 6 7 8 9
1881 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1882 
1883  "movq (%0), %%mm0 \n\t"
1884  "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
1885  "movq (%%"FF_REG_a"), %%mm1 \n\t"
1886  "movq %%mm0, %%mm3 \n\t"
1887  "pmaxub %%mm1, %%mm0 \n\t"
1888  "pminub %%mm3, %%mm1 \n\t"
1889  "pmaxub %%mm2, %%mm1 \n\t"
1890  "pminub %%mm1, %%mm0 \n\t"
1891  "movq %%mm0, (%%"FF_REG_a") \n\t"
1892 
1893  "movq (%0, %1, 4), %%mm0 \n\t"
1894  "movq (%%"FF_REG_a", %1, 2), %%mm1 \n\t"
1895  "movq %%mm2, %%mm3 \n\t"
1896  "pmaxub %%mm1, %%mm2 \n\t"
1897  "pminub %%mm3, %%mm1 \n\t"
1898  "pmaxub %%mm0, %%mm1 \n\t"
1899  "pminub %%mm1, %%mm2 \n\t"
1900  "movq %%mm2, (%%"FF_REG_a", %1, 2) \n\t"
1901 
1902  "movq (%%"FF_REG_d"), %%mm2 \n\t"
1903  "movq (%%"FF_REG_d", %1), %%mm1 \n\t"
1904  "movq %%mm2, %%mm3 \n\t"
1905  "pmaxub %%mm0, %%mm2 \n\t"
1906  "pminub %%mm3, %%mm0 \n\t"
1907  "pmaxub %%mm1, %%mm0 \n\t"
1908  "pminub %%mm0, %%mm2 \n\t"
1909  "movq %%mm2, (%%"FF_REG_d") \n\t"
1910 
1911  "movq (%%"FF_REG_d", %1, 2), %%mm2 \n\t"
1912  "movq (%0, %1, 8), %%mm0 \n\t"
1913  "movq %%mm2, %%mm3 \n\t"
1914  "pmaxub %%mm0, %%mm2 \n\t"
1915  "pminub %%mm3, %%mm0 \n\t"
1916  "pmaxub %%mm1, %%mm0 \n\t"
1917  "pminub %%mm0, %%mm2 \n\t"
1918  "movq %%mm2, (%%"FF_REG_d", %1, 2) \n\t"
1919 
1920 
1921  : : "r" (src), "r" ((x86_reg)stride)
1922  : "%"FF_REG_a, "%"FF_REG_d
1923  );
1924 
1925 #else // MMX without MMX2
1926  __asm__ volatile(
1927  "lea (%0, %1), %%"FF_REG_a" \n\t"
1928  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1929 // 0 1 2 3 4 5 6 7 8 9
1930 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1931  "pxor %%mm7, %%mm7 \n\t"
1932 
1933 #define REAL_MEDIAN(a,b,c)\
1934  "movq " #a ", %%mm0 \n\t"\
1935  "movq " #b ", %%mm2 \n\t"\
1936  "movq " #c ", %%mm1 \n\t"\
1937  "movq %%mm0, %%mm3 \n\t"\
1938  "movq %%mm1, %%mm4 \n\t"\
1939  "movq %%mm2, %%mm5 \n\t"\
1940  "psubusb %%mm1, %%mm3 \n\t"\
1941  "psubusb %%mm2, %%mm4 \n\t"\
1942  "psubusb %%mm0, %%mm5 \n\t"\
1943  "pcmpeqb %%mm7, %%mm3 \n\t"\
1944  "pcmpeqb %%mm7, %%mm4 \n\t"\
1945  "pcmpeqb %%mm7, %%mm5 \n\t"\
1946  "movq %%mm3, %%mm6 \n\t"\
1947  "pxor %%mm4, %%mm3 \n\t"\
1948  "pxor %%mm5, %%mm4 \n\t"\
1949  "pxor %%mm6, %%mm5 \n\t"\
1950  "por %%mm3, %%mm1 \n\t"\
1951  "por %%mm4, %%mm2 \n\t"\
1952  "por %%mm5, %%mm0 \n\t"\
1953  "pand %%mm2, %%mm0 \n\t"\
1954  "pand %%mm1, %%mm0 \n\t"\
1955  "movq %%mm0, " #b " \n\t"
1956 #define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c)
1957 
1958 MEDIAN((%0) , (%%FF_REGa) , (%%FF_REGa, %1))
1959 MEDIAN((%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4))
1960 MEDIAN((%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1))
1961 MEDIAN((%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8))
1962 
1963  : : "r" (src), "r" ((x86_reg)stride)
1964  : "%"FF_REG_a, "%"FF_REG_d
1965  );
1966 #endif //TEMPLATE_PP_MMXEXT
1967 #else //TEMPLATE_PP_MMX
1968  int x, y;
1969  src+= 4*stride;
1970  // FIXME - there should be a way to do a few columns in parallel like w/mmx
1971  for(x=0; x<8; x++){
1972  uint8_t *colsrc = src;
1973  for (y=0; y<4; y++){
1974  int a, b, c, d, e, f;
1975  a = colsrc[0 ];
1976  b = colsrc[stride ];
1977  c = colsrc[stride*2];
1978  d = (a-b)>>31;
1979  e = (b-c)>>31;
1980  f = (c-a)>>31;
1981  colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
1982  colsrc += stride*2;
1983  }
1984  src++;
1985  }
1986 #endif //TEMPLATE_PP_MMX
1987 }
1988 
1989 #if TEMPLATE_PP_MMX
1990 /**
1991  * Transpose and shift the given 8x8 Block into dst1 and dst2.
1992  */
1993 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, const uint8_t *src, int srcStride)
1994 {
1995  __asm__(
1996  "lea (%0, %1), %%"FF_REG_a" \n\t"
1997 // 0 1 2 3 4 5 6 7 8 9
1998 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1999  "movq (%0), %%mm0 \n\t" // 12345678
2000  "movq (%%"FF_REG_a"), %%mm1 \n\t" // abcdefgh
2001  "movq %%mm0, %%mm2 \n\t" // 12345678
2002  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2003  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2004 
2005  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
2006  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t"
2007  "movq %%mm1, %%mm4 \n\t"
2008  "punpcklbw %%mm3, %%mm1 \n\t"
2009  "punpckhbw %%mm3, %%mm4 \n\t"
2010 
2011  "movq %%mm0, %%mm3 \n\t"
2012  "punpcklwd %%mm1, %%mm0 \n\t"
2013  "punpckhwd %%mm1, %%mm3 \n\t"
2014  "movq %%mm2, %%mm1 \n\t"
2015  "punpcklwd %%mm4, %%mm2 \n\t"
2016  "punpckhwd %%mm4, %%mm1 \n\t"
2017 
2018  "movd %%mm0, 128(%2) \n\t"
2019  "psrlq $32, %%mm0 \n\t"
2020  "movd %%mm0, 144(%2) \n\t"
2021  "movd %%mm3, 160(%2) \n\t"
2022  "psrlq $32, %%mm3 \n\t"
2023  "movd %%mm3, 176(%2) \n\t"
2024  "movd %%mm3, 48(%3) \n\t"
2025  "movd %%mm2, 192(%2) \n\t"
2026  "movd %%mm2, 64(%3) \n\t"
2027  "psrlq $32, %%mm2 \n\t"
2028  "movd %%mm2, 80(%3) \n\t"
2029  "movd %%mm1, 96(%3) \n\t"
2030  "psrlq $32, %%mm1 \n\t"
2031  "movd %%mm1, 112(%3) \n\t"
2032 
2033  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_a"\n\t"
2034 
2035  "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
2036  "movq (%%"FF_REG_a"), %%mm1 \n\t" // abcdefgh
2037  "movq %%mm0, %%mm2 \n\t" // 12345678
2038  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2039  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2040 
2041  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
2042  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t"
2043  "movq %%mm1, %%mm4 \n\t"
2044  "punpcklbw %%mm3, %%mm1 \n\t"
2045  "punpckhbw %%mm3, %%mm4 \n\t"
2046 
2047  "movq %%mm0, %%mm3 \n\t"
2048  "punpcklwd %%mm1, %%mm0 \n\t"
2049  "punpckhwd %%mm1, %%mm3 \n\t"
2050  "movq %%mm2, %%mm1 \n\t"
2051  "punpcklwd %%mm4, %%mm2 \n\t"
2052  "punpckhwd %%mm4, %%mm1 \n\t"
2053 
2054  "movd %%mm0, 132(%2) \n\t"
2055  "psrlq $32, %%mm0 \n\t"
2056  "movd %%mm0, 148(%2) \n\t"
2057  "movd %%mm3, 164(%2) \n\t"
2058  "psrlq $32, %%mm3 \n\t"
2059  "movd %%mm3, 180(%2) \n\t"
2060  "movd %%mm3, 52(%3) \n\t"
2061  "movd %%mm2, 196(%2) \n\t"
2062  "movd %%mm2, 68(%3) \n\t"
2063  "psrlq $32, %%mm2 \n\t"
2064  "movd %%mm2, 84(%3) \n\t"
2065  "movd %%mm1, 100(%3) \n\t"
2066  "psrlq $32, %%mm1 \n\t"
2067  "movd %%mm1, 116(%3) \n\t"
2068 
2069 
2070  :: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2)
2071  : "%"FF_REG_a
2072  );
2073 }
2074 
2075 /**
2076  * Transpose the given 8x8 block.
2077  */
2078 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, const uint8_t *src)
2079 {
2080  __asm__(
2081  "lea (%0, %1), %%"FF_REG_a" \n\t"
2082  "lea (%%"FF_REG_a",%1,4), %%"FF_REG_d" \n\t"
2083 // 0 1 2 3 4 5 6 7 8 9
2084 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2085  "movq (%2), %%mm0 \n\t" // 12345678
2086  "movq 16(%2), %%mm1 \n\t" // abcdefgh
2087  "movq %%mm0, %%mm2 \n\t" // 12345678
2088  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2089  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2090 
2091  "movq 32(%2), %%mm1 \n\t"
2092  "movq 48(%2), %%mm3 \n\t"
2093  "movq %%mm1, %%mm4 \n\t"
2094  "punpcklbw %%mm3, %%mm1 \n\t"
2095  "punpckhbw %%mm3, %%mm4 \n\t"
2096 
2097  "movq %%mm0, %%mm3 \n\t"
2098  "punpcklwd %%mm1, %%mm0 \n\t"
2099  "punpckhwd %%mm1, %%mm3 \n\t"
2100  "movq %%mm2, %%mm1 \n\t"
2101  "punpcklwd %%mm4, %%mm2 \n\t"
2102  "punpckhwd %%mm4, %%mm1 \n\t"
2103 
2104  "movd %%mm0, (%0) \n\t"
2105  "psrlq $32, %%mm0 \n\t"
2106  "movd %%mm0, (%%"FF_REG_a") \n\t"
2107  "movd %%mm3, (%%"FF_REG_a", %1) \n\t"
2108  "psrlq $32, %%mm3 \n\t"
2109  "movd %%mm3, (%%"FF_REG_a", %1, 2) \n\t"
2110  "movd %%mm2, (%0, %1, 4) \n\t"
2111  "psrlq $32, %%mm2 \n\t"
2112  "movd %%mm2, (%%"FF_REG_d") \n\t"
2113  "movd %%mm1, (%%"FF_REG_d", %1) \n\t"
2114  "psrlq $32, %%mm1 \n\t"
2115  "movd %%mm1, (%%"FF_REG_d", %1, 2) \n\t"
2116 
2117 
2118  "movq 64(%2), %%mm0 \n\t" // 12345678
2119  "movq 80(%2), %%mm1 \n\t" // abcdefgh
2120  "movq %%mm0, %%mm2 \n\t" // 12345678
2121  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2122  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2123 
2124  "movq 96(%2), %%mm1 \n\t"
2125  "movq 112(%2), %%mm3 \n\t"
2126  "movq %%mm1, %%mm4 \n\t"
2127  "punpcklbw %%mm3, %%mm1 \n\t"
2128  "punpckhbw %%mm3, %%mm4 \n\t"
2129 
2130  "movq %%mm0, %%mm3 \n\t"
2131  "punpcklwd %%mm1, %%mm0 \n\t"
2132  "punpckhwd %%mm1, %%mm3 \n\t"
2133  "movq %%mm2, %%mm1 \n\t"
2134  "punpcklwd %%mm4, %%mm2 \n\t"
2135  "punpckhwd %%mm4, %%mm1 \n\t"
2136 
2137  "movd %%mm0, 4(%0) \n\t"
2138  "psrlq $32, %%mm0 \n\t"
2139  "movd %%mm0, 4(%%"FF_REG_a") \n\t"
2140  "movd %%mm3, 4(%%"FF_REG_a", %1) \n\t"
2141  "psrlq $32, %%mm3 \n\t"
2142  "movd %%mm3, 4(%%"FF_REG_a", %1, 2) \n\t"
2143  "movd %%mm2, 4(%0, %1, 4) \n\t"
2144  "psrlq $32, %%mm2 \n\t"
2145  "movd %%mm2, 4(%%"FF_REG_d") \n\t"
2146  "movd %%mm1, 4(%%"FF_REG_d", %1) \n\t"
2147  "psrlq $32, %%mm1 \n\t"
2148  "movd %%mm1, 4(%%"FF_REG_d", %1, 2) \n\t"
2149 
2150  :: "r" (dst), "r" ((x86_reg)dstStride), "r" (src)
2151  : "%"FF_REG_a, "%"FF_REG_d
2152  );
2153 }
2154 #endif //TEMPLATE_PP_MMX
2155 //static long test=0;
2156 
2157 #if !TEMPLATE_PP_ALTIVEC
2158 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2159  uint8_t *tempBlurred, uint32_t *tempBlurredPast, const int *maxNoise)
2160 {
2161  // to save a register (FIXME do this outside of the loops)
2162  tempBlurredPast[127]= maxNoise[0];
2163  tempBlurredPast[128]= maxNoise[1];
2164  tempBlurredPast[129]= maxNoise[2];
2165 
2166 #define FAST_L2_DIFF
2167 //#define L1_DIFF //u should change the thresholds too if u try that one
2168 #if (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
2169  __asm__ volatile(
2170  "lea (%2, %2, 2), %%"FF_REG_a" \n\t" // 3*stride
2171  "lea (%2, %2, 4), %%"FF_REG_d" \n\t" // 5*stride
2172  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2173 // 0 1 2 3 4 5 6 7 8 9
2174 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2
2175 //FIXME reorder?
2176 #ifdef L1_DIFF //needs mmx2
2177  "movq (%0), %%mm0 \n\t" // L0
2178  "psadbw (%1), %%mm0 \n\t" // |L0-R0|
2179  "movq (%0, %2), %%mm1 \n\t" // L1
2180  "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
2181  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2182  "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
2183  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2184  "psadbw (%1, %%"FF_REG_a"), %%mm3 \n\t" // |L3-R3|
2185 
2186  "movq (%0, %2, 4), %%mm4 \n\t" // L4
2187  "paddw %%mm1, %%mm0 \n\t"
2188  "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
2189  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
2190  "paddw %%mm2, %%mm0 \n\t"
2191  "psadbw (%1, %%"FF_REG_d"), %%mm5 \n\t" // |L5-R5|
2192  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
2193  "paddw %%mm3, %%mm0 \n\t"
2194  "psadbw (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // |L6-R6|
2195  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
2196  "paddw %%mm4, %%mm0 \n\t"
2197  "psadbw (%1, %%"FF_REG_c"), %%mm7 \n\t" // |L7-R7|
2198  "paddw %%mm5, %%mm6 \n\t"
2199  "paddw %%mm7, %%mm6 \n\t"
2200  "paddw %%mm6, %%mm0 \n\t"
2201 #else //L1_DIFF
2202 #if defined (FAST_L2_DIFF)
2203  "pcmpeqb %%mm7, %%mm7 \n\t"
2204  "movq "MANGLE(b80)", %%mm6 \n\t"
2205  "pxor %%mm0, %%mm0 \n\t"
2206 #define REAL_L2_DIFF_CORE(a, b)\
2207  "movq " #a ", %%mm5 \n\t"\
2208  "movq " #b ", %%mm2 \n\t"\
2209  "pxor %%mm7, %%mm2 \n\t"\
2210  PAVGB(%%mm2, %%mm5)\
2211  "paddb %%mm6, %%mm5 \n\t"\
2212  "movq %%mm5, %%mm2 \n\t"\
2213  "psllw $8, %%mm5 \n\t"\
2214  "pmaddwd %%mm5, %%mm5 \n\t"\
2215  "pmaddwd %%mm2, %%mm2 \n\t"\
2216  "paddd %%mm2, %%mm5 \n\t"\
2217  "psrld $14, %%mm5 \n\t"\
2218  "paddd %%mm5, %%mm0 \n\t"
2219 
2220 #else //defined (FAST_L2_DIFF)
2221  "pxor %%mm7, %%mm7 \n\t"
2222  "pxor %%mm0, %%mm0 \n\t"
2223 #define REAL_L2_DIFF_CORE(a, b)\
2224  "movq " #a ", %%mm5 \n\t"\
2225  "movq " #b ", %%mm2 \n\t"\
2226  "movq %%mm5, %%mm1 \n\t"\
2227  "movq %%mm2, %%mm3 \n\t"\
2228  "punpcklbw %%mm7, %%mm5 \n\t"\
2229  "punpckhbw %%mm7, %%mm1 \n\t"\
2230  "punpcklbw %%mm7, %%mm2 \n\t"\
2231  "punpckhbw %%mm7, %%mm3 \n\t"\
2232  "psubw %%mm2, %%mm5 \n\t"\
2233  "psubw %%mm3, %%mm1 \n\t"\
2234  "pmaddwd %%mm5, %%mm5 \n\t"\
2235  "pmaddwd %%mm1, %%mm1 \n\t"\
2236  "paddd %%mm1, %%mm5 \n\t"\
2237  "paddd %%mm5, %%mm0 \n\t"
2238 
2239 #endif //defined (FAST_L2_DIFF)
2240 
2241 #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b)
2242 
2243 L2_DIFF_CORE((%0) , (%1))
2244 L2_DIFF_CORE((%0, %2) , (%1, %2))
2245 L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2))
2246 L2_DIFF_CORE((%0, %%FF_REGa) , (%1, %%FF_REGa))
2247 L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4))
2248 L2_DIFF_CORE((%0, %%FF_REGd) , (%1, %%FF_REGd))
2249 L2_DIFF_CORE((%0, %%FF_REGa,2), (%1, %%FF_REGa,2))
2250 L2_DIFF_CORE((%0, %%FF_REGc) , (%1, %%FF_REGc))
2251 
2252 #endif //L1_DIFF
2253 
2254  "movq %%mm0, %%mm4 \n\t"
2255  "psrlq $32, %%mm0 \n\t"
2256  "paddd %%mm0, %%mm4 \n\t"
2257  "movd %%mm4, %%ecx \n\t"
2258  "shll $2, %%ecx \n\t"
2259  "mov %3, %%"FF_REG_d" \n\t"
2260  "addl -4(%%"FF_REG_d"), %%ecx \n\t"
2261  "addl 4(%%"FF_REG_d"), %%ecx \n\t"
2262  "addl -1024(%%"FF_REG_d"), %%ecx \n\t"
2263  "addl $4, %%ecx \n\t"
2264  "addl 1024(%%"FF_REG_d"), %%ecx \n\t"
2265  "shrl $3, %%ecx \n\t"
2266  "movl %%ecx, (%%"FF_REG_d") \n\t"
2267 
2268 // "mov %3, %%"FF_REG_c" \n\t"
2269 // "mov %%"FF_REG_c", test \n\t"
2270 // "jmp 4f \n\t"
2271  "cmpl 512(%%"FF_REG_d"), %%ecx \n\t"
2272  " jb 2f \n\t"
2273  "cmpl 516(%%"FF_REG_d"), %%ecx \n\t"
2274  " jb 1f \n\t"
2275 
2276  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
2277  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2278  "movq (%0), %%mm0 \n\t" // L0
2279  "movq (%0, %2), %%mm1 \n\t" // L1
2280  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2281  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2282  "movq (%0, %2, 4), %%mm4 \n\t" // L4
2283  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
2284  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
2285  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
2286  "movq %%mm0, (%1) \n\t" // L0
2287  "movq %%mm1, (%1, %2) \n\t" // L1
2288  "movq %%mm2, (%1, %2, 2) \n\t" // L2
2289  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // L3
2290  "movq %%mm4, (%1, %2, 4) \n\t" // L4
2291  "movq %%mm5, (%1, %%"FF_REG_d") \n\t" // L5
2292  "movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // L6
2293  "movq %%mm7, (%1, %%"FF_REG_c") \n\t" // L7
2294  "jmp 4f \n\t"
2295 
2296  "1: \n\t"
2297  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
2298  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2299  "movq (%0), %%mm0 \n\t" // L0
2300  PAVGB((%1), %%mm0) // L0
2301  "movq (%0, %2), %%mm1 \n\t" // L1
2302  PAVGB((%1, %2), %%mm1) // L1
2303  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2304  PAVGB((%1, %2, 2), %%mm2) // L2
2305  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2306  PAVGB((%1, %%FF_REGa), %%mm3) // L3
2307  "movq (%0, %2, 4), %%mm4 \n\t" // L4
2308  PAVGB((%1, %2, 4), %%mm4) // L4
2309  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
2310  PAVGB((%1, %%FF_REGd), %%mm5) // L5
2311  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
2312  PAVGB((%1, %%FF_REGa, 2), %%mm6) // L6
2313  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
2314  PAVGB((%1, %%FF_REGc), %%mm7) // L7
2315  "movq %%mm0, (%1) \n\t" // R0
2316  "movq %%mm1, (%1, %2) \n\t" // R1
2317  "movq %%mm2, (%1, %2, 2) \n\t" // R2
2318  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
2319  "movq %%mm4, (%1, %2, 4) \n\t" // R4
2320  "movq %%mm5, (%1, %%"FF_REG_d") \n\t" // R5
2321  "movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // R6
2322  "movq %%mm7, (%1, %%"FF_REG_c") \n\t" // R7
2323  "movq %%mm0, (%0) \n\t" // L0
2324  "movq %%mm1, (%0, %2) \n\t" // L1
2325  "movq %%mm2, (%0, %2, 2) \n\t" // L2
2326  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
2327  "movq %%mm4, (%0, %2, 4) \n\t" // L4
2328  "movq %%mm5, (%0, %%"FF_REG_d") \n\t" // L5
2329  "movq %%mm6, (%0, %%"FF_REG_a", 2) \n\t" // L6
2330  "movq %%mm7, (%0, %%"FF_REG_c") \n\t" // L7
2331  "jmp 4f \n\t"
2332 
2333  "2: \n\t"
2334  "cmpl 508(%%"FF_REG_d"), %%ecx \n\t"
2335  " jb 3f \n\t"
2336 
2337  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
2338  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2339  "movq (%0), %%mm0 \n\t" // L0
2340  "movq (%0, %2), %%mm1 \n\t" // L1
2341  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2342  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2343  "movq (%1), %%mm4 \n\t" // R0
2344  "movq (%1, %2), %%mm5 \n\t" // R1
2345  "movq (%1, %2, 2), %%mm6 \n\t" // R2
2346  "movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
2347  PAVGB(%%mm4, %%mm0)
2348  PAVGB(%%mm5, %%mm1)
2349  PAVGB(%%mm6, %%mm2)
2350  PAVGB(%%mm7, %%mm3)
2351  PAVGB(%%mm4, %%mm0)
2352  PAVGB(%%mm5, %%mm1)
2353  PAVGB(%%mm6, %%mm2)
2354  PAVGB(%%mm7, %%mm3)
2355  "movq %%mm0, (%1) \n\t" // R0
2356  "movq %%mm1, (%1, %2) \n\t" // R1
2357  "movq %%mm2, (%1, %2, 2) \n\t" // R2
2358  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
2359  "movq %%mm0, (%0) \n\t" // L0
2360  "movq %%mm1, (%0, %2) \n\t" // L1
2361  "movq %%mm2, (%0, %2, 2) \n\t" // L2
2362  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
2363 
2364  "movq (%0, %2, 4), %%mm0 \n\t" // L4
2365  "movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
2366  "movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
2367  "movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
2368  "movq (%1, %2, 4), %%mm4 \n\t" // R4
2369  "movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
2370  "movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
2371  "movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
2372  PAVGB(%%mm4, %%mm0)
2373  PAVGB(%%mm5, %%mm1)
2374  PAVGB(%%mm6, %%mm2)
2375  PAVGB(%%mm7, %%mm3)
2376  PAVGB(%%mm4, %%mm0)
2377  PAVGB(%%mm5, %%mm1)
2378  PAVGB(%%mm6, %%mm2)
2379  PAVGB(%%mm7, %%mm3)
2380  "movq %%mm0, (%1, %2, 4) \n\t" // R4
2381  "movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
2382  "movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
2383  "movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
2384  "movq %%mm0, (%0, %2, 4) \n\t" // L4
2385  "movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
2386  "movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
2387  "movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7
2388  "jmp 4f \n\t"
2389 
2390  "3: \n\t"
2391  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
2392  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2393  "movq (%0), %%mm0 \n\t" // L0
2394  "movq (%0, %2), %%mm1 \n\t" // L1
2395  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2396  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2397  "movq (%1), %%mm4 \n\t" // R0
2398  "movq (%1, %2), %%mm5 \n\t" // R1
2399  "movq (%1, %2, 2), %%mm6 \n\t" // R2
2400  "movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
2401  PAVGB(%%mm4, %%mm0)
2402  PAVGB(%%mm5, %%mm1)
2403  PAVGB(%%mm6, %%mm2)
2404  PAVGB(%%mm7, %%mm3)
2405  PAVGB(%%mm4, %%mm0)
2406  PAVGB(%%mm5, %%mm1)
2407  PAVGB(%%mm6, %%mm2)
2408  PAVGB(%%mm7, %%mm3)
2409  PAVGB(%%mm4, %%mm0)
2410  PAVGB(%%mm5, %%mm1)
2411  PAVGB(%%mm6, %%mm2)
2412  PAVGB(%%mm7, %%mm3)
2413  "movq %%mm0, (%1) \n\t" // R0
2414  "movq %%mm1, (%1, %2) \n\t" // R1
2415  "movq %%mm2, (%1, %2, 2) \n\t" // R2
2416  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
2417  "movq %%mm0, (%0) \n\t" // L0
2418  "movq %%mm1, (%0, %2) \n\t" // L1
2419  "movq %%mm2, (%0, %2, 2) \n\t" // L2
2420  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
2421 
2422  "movq (%0, %2, 4), %%mm0 \n\t" // L4
2423  "movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
2424  "movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
2425  "movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
2426  "movq (%1, %2, 4), %%mm4 \n\t" // R4
2427  "movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
2428  "movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
2429  "movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
2430  PAVGB(%%mm4, %%mm0)
2431  PAVGB(%%mm5, %%mm1)
2432  PAVGB(%%mm6, %%mm2)
2433  PAVGB(%%mm7, %%mm3)
2434  PAVGB(%%mm4, %%mm0)
2435  PAVGB(%%mm5, %%mm1)
2436  PAVGB(%%mm6, %%mm2)
2437  PAVGB(%%mm7, %%mm3)
2438  PAVGB(%%mm4, %%mm0)
2439  PAVGB(%%mm5, %%mm1)
2440  PAVGB(%%mm6, %%mm2)
2441  PAVGB(%%mm7, %%mm3)
2442  "movq %%mm0, (%1, %2, 4) \n\t" // R4
2443  "movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
2444  "movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
2445  "movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
2446  "movq %%mm0, (%0, %2, 4) \n\t" // L4
2447  "movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
2448  "movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
2449  "movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7
2450 
2451  "4: \n\t"
2452 
2453  :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast)
2455  : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c, "memory"
2456  );
2457 #else //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
2458 {
2459  int y;
2460  int d=0;
2461 // int sysd=0;
2462  int i;
2463 
2464  for(y=0; y<8; y++){
2465  int x;
2466  for(x=0; x<8; x++){
2467  int ref= tempBlurred[ x + y*stride ];
2468  int cur= src[ x + y*stride ];
2469  int d1=ref - cur;
2470 // if(x==0 || x==7) d1+= d1>>1;
2471 // if(y==0 || y==7) d1+= d1>>1;
2472 // d+= FFABS(d1);
2473  d+= d1*d1;
2474 // sysd+= d1;
2475  }
2476  }
2477  i=d;
2478  d= (
2479  4*d
2480  +(*(tempBlurredPast-256))
2481  +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
2482  +(*(tempBlurredPast+256))
2483  +4)>>3;
2484  *tempBlurredPast=i;
2485 // ((*tempBlurredPast)*3 + d + 2)>>2;
2486 
2487 /*
2488 Switch between
2489  1 0 0 0 0 0 0 (0)
2490 64 32 16 8 4 2 1 (1)
2491 64 48 36 27 20 15 11 (33) (approx)
2492 64 56 49 43 37 33 29 (200) (approx)
2493 */
2494  if(d > maxNoise[1]){
2495  if(d < maxNoise[2]){
2496  for(y=0; y<8; y++){
2497  int x;
2498  for(x=0; x<8; x++){
2499  int ref= tempBlurred[ x + y*stride ];
2500  int cur= src[ x + y*stride ];
2501  tempBlurred[ x + y*stride ]=
2502  src[ x + y*stride ]=
2503  (ref + cur + 1)>>1;
2504  }
2505  }
2506  }else{
2507  for(y=0; y<8; y++){
2508  int x;
2509  for(x=0; x<8; x++){
2510  tempBlurred[ x + y*stride ]= src[ x + y*stride ];
2511  }
2512  }
2513  }
2514  }else{
2515  if(d < maxNoise[0]){
2516  for(y=0; y<8; y++){
2517  int x;
2518  for(x=0; x<8; x++){
2519  int ref= tempBlurred[ x + y*stride ];
2520  int cur= src[ x + y*stride ];
2521  tempBlurred[ x + y*stride ]=
2522  src[ x + y*stride ]=
2523  (ref*7 + cur + 4)>>3;
2524  }
2525  }
2526  }else{
2527  for(y=0; y<8; y++){
2528  int x;
2529  for(x=0; x<8; x++){
2530  int ref= tempBlurred[ x + y*stride ];
2531  int cur= src[ x + y*stride ];
2532  tempBlurred[ x + y*stride ]=
2533  src[ x + y*stride ]=
2534  (ref*3 + cur + 2)>>2;
2535  }
2536  }
2537  }
2538  }
2539 }
2540 #endif //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
2541 }
2542 #endif //TEMPLATE_PP_ALTIVEC
2543 
2544 #if TEMPLATE_PP_MMX
2545 /**
2546  * accurate deblock filter
2547  */
2548 static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, const PPContext *c, int mode){
2549  int64_t dc_mask, eq_mask, both_masks;
2550  int64_t sums[10*8*2];
2551  src+= step*3; // src points to begin of the 8x8 Block
2552 
2553  __asm__ volatile(
2554  "movq %0, %%mm7 \n\t"
2555  "movq %1, %%mm6 \n\t"
2556  : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
2557  );
2558 
2559  __asm__ volatile(
2560  "lea (%2, %3), %%"FF_REG_a" \n\t"
2561 // 0 1 2 3 4 5 6 7 8 9
2562 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
2563 
2564  "movq (%2), %%mm0 \n\t"
2565  "movq (%%"FF_REG_a"), %%mm1 \n\t"
2566  "movq %%mm1, %%mm3 \n\t"
2567  "movq %%mm1, %%mm4 \n\t"
2568  "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
2569  "paddb %%mm7, %%mm0 \n\t"
2570  "pcmpgtb %%mm6, %%mm0 \n\t"
2571 
2572  "movq (%%"FF_REG_a",%3), %%mm2 \n\t"
2573  PMAXUB(%%mm2, %%mm4)
2574  PMINUB(%%mm2, %%mm3, %%mm5)
2575  "psubb %%mm2, %%mm1 \n\t"
2576  "paddb %%mm7, %%mm1 \n\t"
2577  "pcmpgtb %%mm6, %%mm1 \n\t"
2578  "paddb %%mm1, %%mm0 \n\t"
2579 
2580  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
2581  PMAXUB(%%mm1, %%mm4)
2582  PMINUB(%%mm1, %%mm3, %%mm5)
2583  "psubb %%mm1, %%mm2 \n\t"
2584  "paddb %%mm7, %%mm2 \n\t"
2585  "pcmpgtb %%mm6, %%mm2 \n\t"
2586  "paddb %%mm2, %%mm0 \n\t"
2587 
2588  "lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t"
2589 
2590  "movq (%2, %3, 4), %%mm2 \n\t"
2591  PMAXUB(%%mm2, %%mm4)
2592  PMINUB(%%mm2, %%mm3, %%mm5)
2593  "psubb %%mm2, %%mm1 \n\t"
2594  "paddb %%mm7, %%mm1 \n\t"
2595  "pcmpgtb %%mm6, %%mm1 \n\t"
2596  "paddb %%mm1, %%mm0 \n\t"
2597 
2598  "movq (%%"FF_REG_a"), %%mm1 \n\t"
2599  PMAXUB(%%mm1, %%mm4)
2600  PMINUB(%%mm1, %%mm3, %%mm5)
2601  "psubb %%mm1, %%mm2 \n\t"
2602  "paddb %%mm7, %%mm2 \n\t"
2603  "pcmpgtb %%mm6, %%mm2 \n\t"
2604  "paddb %%mm2, %%mm0 \n\t"
2605 
2606  "movq (%%"FF_REG_a", %3), %%mm2 \n\t"
2607  PMAXUB(%%mm2, %%mm4)
2608  PMINUB(%%mm2, %%mm3, %%mm5)
2609  "psubb %%mm2, %%mm1 \n\t"
2610  "paddb %%mm7, %%mm1 \n\t"
2611  "pcmpgtb %%mm6, %%mm1 \n\t"
2612  "paddb %%mm1, %%mm0 \n\t"
2613 
2614  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
2615  PMAXUB(%%mm1, %%mm4)
2616  PMINUB(%%mm1, %%mm3, %%mm5)
2617  "psubb %%mm1, %%mm2 \n\t"
2618  "paddb %%mm7, %%mm2 \n\t"
2619  "pcmpgtb %%mm6, %%mm2 \n\t"
2620  "paddb %%mm2, %%mm0 \n\t"
2621 
2622  "movq (%2, %3, 8), %%mm2 \n\t"
2623  PMAXUB(%%mm2, %%mm4)
2624  PMINUB(%%mm2, %%mm3, %%mm5)
2625  "psubb %%mm2, %%mm1 \n\t"
2626  "paddb %%mm7, %%mm1 \n\t"
2627  "pcmpgtb %%mm6, %%mm1 \n\t"
2628  "paddb %%mm1, %%mm0 \n\t"
2629 
2630  "movq (%%"FF_REG_a", %3, 4), %%mm1 \n\t"
2631  "psubb %%mm1, %%mm2 \n\t"
2632  "paddb %%mm7, %%mm2 \n\t"
2633  "pcmpgtb %%mm6, %%mm2 \n\t"
2634  "paddb %%mm2, %%mm0 \n\t"
2635  "psubusb %%mm3, %%mm4 \n\t"
2636 
2637  "pxor %%mm6, %%mm6 \n\t"
2638  "movq %4, %%mm7 \n\t" // QP,..., QP
2639  "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
2640  "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0
2641  "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2642  "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2643  "movq %%mm7, %1 \n\t"
2644 
2645  "movq %5, %%mm7 \n\t"
2646  "punpcklbw %%mm7, %%mm7 \n\t"
2647  "punpcklbw %%mm7, %%mm7 \n\t"
2648  "punpcklbw %%mm7, %%mm7 \n\t"
2649  "psubb %%mm0, %%mm6 \n\t"
2650  "pcmpgtb %%mm7, %%mm6 \n\t"
2651  "movq %%mm6, %0 \n\t"
2652 
2653  : "=m" (eq_mask), "=m" (dc_mask)
2654  : "r" (src), "r" ((x86_reg)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
2655  : "%"FF_REG_a
2656  );
2657 
2658  both_masks = dc_mask & eq_mask;
2659 
2660  if(both_masks){
2661  x86_reg offset= -8*step;
2662  int64_t *temp_sums= sums;
2663 
2664  __asm__ volatile(
2665  "movq %2, %%mm0 \n\t" // QP,..., QP
2666  "pxor %%mm4, %%mm4 \n\t"
2667 
2668  "movq (%0), %%mm6 \n\t"
2669  "movq (%0, %1), %%mm5 \n\t"
2670  "movq %%mm5, %%mm1 \n\t"
2671  "movq %%mm6, %%mm2 \n\t"
2672  "psubusb %%mm6, %%mm5 \n\t"
2673  "psubusb %%mm1, %%mm2 \n\t"
2674  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2675  "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2676  "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2677 
2678  "pxor %%mm6, %%mm1 \n\t"
2679  "pand %%mm0, %%mm1 \n\t"
2680  "pxor %%mm1, %%mm6 \n\t"
2681  // 0:QP 6:First
2682 
2683  "movq (%0, %1, 8), %%mm5 \n\t"
2684  "add %1, %0 \n\t" // %0 points to line 1 not 0
2685  "movq (%0, %1, 8), %%mm7 \n\t"
2686  "movq %%mm5, %%mm1 \n\t"
2687  "movq %%mm7, %%mm2 \n\t"
2688  "psubusb %%mm7, %%mm5 \n\t"
2689  "psubusb %%mm1, %%mm2 \n\t"
2690  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2691  "movq %2, %%mm0 \n\t" // QP,..., QP
2692  "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2693  "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2694 
2695  "pxor %%mm7, %%mm1 \n\t"
2696  "pand %%mm0, %%mm1 \n\t"
2697  "pxor %%mm1, %%mm7 \n\t"
2698 
2699  "movq %%mm6, %%mm5 \n\t"
2700  "punpckhbw %%mm4, %%mm6 \n\t"
2701  "punpcklbw %%mm4, %%mm5 \n\t"
2702  // 4:0 5/6:First 7:Last
2703 
2704  "movq %%mm5, %%mm0 \n\t"
2705  "movq %%mm6, %%mm1 \n\t"
2706  "psllw $2, %%mm0 \n\t"
2707  "psllw $2, %%mm1 \n\t"
2708  "paddw "MANGLE(w04)", %%mm0 \n\t"
2709  "paddw "MANGLE(w04)", %%mm1 \n\t"
2710 
2711 #define NEXT\
2712  "movq (%0), %%mm2 \n\t"\
2713  "movq (%0), %%mm3 \n\t"\
2714  "add %1, %0 \n\t"\
2715  "punpcklbw %%mm4, %%mm2 \n\t"\
2716  "punpckhbw %%mm4, %%mm3 \n\t"\
2717  "paddw %%mm2, %%mm0 \n\t"\
2718  "paddw %%mm3, %%mm1 \n\t"
2719 
2720 #define PREV\
2721  "movq (%0), %%mm2 \n\t"\
2722  "movq (%0), %%mm3 \n\t"\
2723  "add %1, %0 \n\t"\
2724  "punpcklbw %%mm4, %%mm2 \n\t"\
2725  "punpckhbw %%mm4, %%mm3 \n\t"\
2726  "psubw %%mm2, %%mm0 \n\t"\
2727  "psubw %%mm3, %%mm1 \n\t"
2728 
2729 
2730  NEXT //0
2731  NEXT //1
2732  NEXT //2
2733  "movq %%mm0, (%3) \n\t"
2734  "movq %%mm1, 8(%3) \n\t"
2735 
2736  NEXT //3
2737  "psubw %%mm5, %%mm0 \n\t"
2738  "psubw %%mm6, %%mm1 \n\t"
2739  "movq %%mm0, 16(%3) \n\t"
2740  "movq %%mm1, 24(%3) \n\t"
2741 
2742  NEXT //4
2743  "psubw %%mm5, %%mm0 \n\t"
2744  "psubw %%mm6, %%mm1 \n\t"
2745  "movq %%mm0, 32(%3) \n\t"
2746  "movq %%mm1, 40(%3) \n\t"
2747 
2748  NEXT //5
2749  "psubw %%mm5, %%mm0 \n\t"
2750  "psubw %%mm6, %%mm1 \n\t"
2751  "movq %%mm0, 48(%3) \n\t"
2752  "movq %%mm1, 56(%3) \n\t"
2753 
2754  NEXT //6
2755  "psubw %%mm5, %%mm0 \n\t"
2756  "psubw %%mm6, %%mm1 \n\t"
2757  "movq %%mm0, 64(%3) \n\t"
2758  "movq %%mm1, 72(%3) \n\t"
2759 
2760  "movq %%mm7, %%mm6 \n\t"
2761  "punpckhbw %%mm4, %%mm7 \n\t"
2762  "punpcklbw %%mm4, %%mm6 \n\t"
2763 
2764  NEXT //7
2765  "mov %4, %0 \n\t"
2766  "add %1, %0 \n\t"
2767  PREV //0
2768  "movq %%mm0, 80(%3) \n\t"
2769  "movq %%mm1, 88(%3) \n\t"
2770 
2771  PREV //1
2772  "paddw %%mm6, %%mm0 \n\t"
2773  "paddw %%mm7, %%mm1 \n\t"
2774  "movq %%mm0, 96(%3) \n\t"
2775  "movq %%mm1, 104(%3) \n\t"
2776 
2777  PREV //2
2778  "paddw %%mm6, %%mm0 \n\t"
2779  "paddw %%mm7, %%mm1 \n\t"
2780  "movq %%mm0, 112(%3) \n\t"
2781  "movq %%mm1, 120(%3) \n\t"
2782 
2783  PREV //3
2784  "paddw %%mm6, %%mm0 \n\t"
2785  "paddw %%mm7, %%mm1 \n\t"
2786  "movq %%mm0, 128(%3) \n\t"
2787  "movq %%mm1, 136(%3) \n\t"
2788 
2789  PREV //4
2790  "paddw %%mm6, %%mm0 \n\t"
2791  "paddw %%mm7, %%mm1 \n\t"
2792  "movq %%mm0, 144(%3) \n\t"
2793  "movq %%mm1, 152(%3) \n\t"
2794 
2795  "mov %4, %0 \n\t" //FIXME
2796 
2797  : "+&r"(src)
2798  : "r" ((x86_reg)step), "m" (c->pQPb), "r"(sums), "g"(src)
2800  );
2801 
2802  src+= step; // src points to begin of the 8x8 Block
2803 
2804  __asm__ volatile(
2805  "movq %4, %%mm6 \n\t"
2806  "pcmpeqb %%mm5, %%mm5 \n\t"
2807  "pxor %%mm6, %%mm5 \n\t"
2808  "pxor %%mm7, %%mm7 \n\t"
2809 
2810  "1: \n\t"
2811  "movq (%1), %%mm0 \n\t"
2812  "movq 8(%1), %%mm1 \n\t"
2813  "paddw 32(%1), %%mm0 \n\t"
2814  "paddw 40(%1), %%mm1 \n\t"
2815  "movq (%0, %3), %%mm2 \n\t"
2816  "movq %%mm2, %%mm3 \n\t"
2817  "movq %%mm2, %%mm4 \n\t"
2818  "punpcklbw %%mm7, %%mm2 \n\t"
2819  "punpckhbw %%mm7, %%mm3 \n\t"
2820  "paddw %%mm2, %%mm0 \n\t"
2821  "paddw %%mm3, %%mm1 \n\t"
2822  "paddw %%mm2, %%mm0 \n\t"
2823  "paddw %%mm3, %%mm1 \n\t"
2824  "psrlw $4, %%mm0 \n\t"
2825  "psrlw $4, %%mm1 \n\t"
2826  "packuswb %%mm1, %%mm0 \n\t"
2827  "pand %%mm6, %%mm0 \n\t"
2828  "pand %%mm5, %%mm4 \n\t"
2829  "por %%mm4, %%mm0 \n\t"
2830  "movq %%mm0, (%0, %3) \n\t"
2831  "add $16, %1 \n\t"
2832  "add %2, %0 \n\t"
2833  " js 1b \n\t"
2834 
2835  : "+r"(offset), "+r"(temp_sums)
2836  : "r" ((x86_reg)step), "r"(src - offset), "m"(both_masks)
2837  );
2838  }else
2839  src+= step; // src points to begin of the 8x8 Block
2840 
2841  if(eq_mask != -1LL){
2842  uint8_t *temp_src= src;
2843  DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars
2844  __asm__ volatile(
2845  "pxor %%mm7, %%mm7 \n\t"
2846 // 0 1 2 3 4 5 6 7 8 9
2847 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1
2848 
2849  "movq (%0), %%mm0 \n\t"
2850  "movq %%mm0, %%mm1 \n\t"
2851  "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
2852  "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
2853 
2854  "movq (%0, %1), %%mm2 \n\t"
2855  "lea (%0, %1, 2), %%"FF_REG_a" \n\t"
2856  "movq %%mm2, %%mm3 \n\t"
2857  "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
2858  "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
2859 
2860  "movq (%%"FF_REG_a"), %%mm4 \n\t"
2861  "movq %%mm4, %%mm5 \n\t"
2862  "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
2863  "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
2864 
2865  "paddw %%mm0, %%mm0 \n\t" // 2L0
2866  "paddw %%mm1, %%mm1 \n\t" // 2H0
2867  "psubw %%mm4, %%mm2 \n\t" // L1 - L2
2868  "psubw %%mm5, %%mm3 \n\t" // H1 - H2
2869  "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
2870  "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
2871 
2872  "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
2873  "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
2874  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
2875  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
2876 
2877  "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
2878  "movq %%mm2, %%mm3 \n\t"
2879  "punpcklbw %%mm7, %%mm2 \n\t" // L3
2880  "punpckhbw %%mm7, %%mm3 \n\t" // H3
2881 
2882  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
2883  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
2884  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2885  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2886  "movq %%mm0, (%4) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2887  "movq %%mm1, 8(%4) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2888 
2889  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
2890  "movq %%mm0, %%mm1 \n\t"
2891  "punpcklbw %%mm7, %%mm0 \n\t" // L4
2892  "punpckhbw %%mm7, %%mm1 \n\t" // H4
2893 
2894  "psubw %%mm0, %%mm2 \n\t" // L3 - L4
2895  "psubw %%mm1, %%mm3 \n\t" // H3 - H4
2896  "movq %%mm2, 16(%4) \n\t" // L3 - L4
2897  "movq %%mm3, 24(%4) \n\t" // H3 - H4
2898  "paddw %%mm4, %%mm4 \n\t" // 2L2
2899  "paddw %%mm5, %%mm5 \n\t" // 2H2
2900  "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
2901  "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
2902 
2903  "lea (%%"FF_REG_a", %1), %0 \n\t"
2904  "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
2905  "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
2906  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
2907  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
2908 //50 opcodes so far
2909  "movq (%0, %1, 2), %%mm2 \n\t"
2910  "movq %%mm2, %%mm3 \n\t"
2911  "punpcklbw %%mm7, %%mm2 \n\t" // L5
2912  "punpckhbw %%mm7, %%mm3 \n\t" // H5
2913  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
2914  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
2915  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
2916  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
2917 
2918  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
2919  "punpcklbw %%mm7, %%mm6 \n\t" // L6
2920  "psubw %%mm6, %%mm2 \n\t" // L5 - L6
2921  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
2922  "punpckhbw %%mm7, %%mm6 \n\t" // H6
2923  "psubw %%mm6, %%mm3 \n\t" // H5 - H6
2924 
2925  "paddw %%mm0, %%mm0 \n\t" // 2L4
2926  "paddw %%mm1, %%mm1 \n\t" // 2H4
2927  "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
2928  "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
2929 
2930  "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
2931  "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
2932  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
2933  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
2934 
2935  "movq (%0, %1, 4), %%mm2 \n\t"
2936  "movq %%mm2, %%mm3 \n\t"
2937  "punpcklbw %%mm7, %%mm2 \n\t" // L7
2938  "punpckhbw %%mm7, %%mm3 \n\t" // H7
2939 
2940  "paddw %%mm2, %%mm2 \n\t" // 2L7
2941  "paddw %%mm3, %%mm3 \n\t" // 2H7
2942  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
2943  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
2944 
2945  "movq (%4), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2946  "movq 8(%4), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2947 
2948 #if TEMPLATE_PP_MMXEXT
2949  "movq %%mm7, %%mm6 \n\t" // 0
2950  "psubw %%mm0, %%mm6 \n\t"
2951  "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
2952  "movq %%mm7, %%mm6 \n\t" // 0
2953  "psubw %%mm1, %%mm6 \n\t"
2954  "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
2955  "movq %%mm7, %%mm6 \n\t" // 0
2956  "psubw %%mm2, %%mm6 \n\t"
2957  "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
2958  "movq %%mm7, %%mm6 \n\t" // 0
2959  "psubw %%mm3, %%mm6 \n\t"
2960  "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
2961 #else
2962  "movq %%mm7, %%mm6 \n\t" // 0
2963  "pcmpgtw %%mm0, %%mm6 \n\t"
2964  "pxor %%mm6, %%mm0 \n\t"
2965  "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
2966  "movq %%mm7, %%mm6 \n\t" // 0
2967  "pcmpgtw %%mm1, %%mm6 \n\t"
2968  "pxor %%mm6, %%mm1 \n\t"
2969  "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
2970  "movq %%mm7, %%mm6 \n\t" // 0
2971  "pcmpgtw %%mm2, %%mm6 \n\t"
2972  "pxor %%mm6, %%mm2 \n\t"
2973  "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
2974  "movq %%mm7, %%mm6 \n\t" // 0
2975  "pcmpgtw %%mm3, %%mm6 \n\t"
2976  "pxor %%mm6, %%mm3 \n\t"
2977  "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
2978 #endif
2979 
2980 #if TEMPLATE_PP_MMXEXT
2981  "pminsw %%mm2, %%mm0 \n\t"
2982  "pminsw %%mm3, %%mm1 \n\t"
2983 #else
2984  "movq %%mm0, %%mm6 \n\t"
2985  "psubusw %%mm2, %%mm6 \n\t"
2986  "psubw %%mm6, %%mm0 \n\t"
2987  "movq %%mm1, %%mm6 \n\t"
2988  "psubusw %%mm3, %%mm6 \n\t"
2989  "psubw %%mm6, %%mm1 \n\t"
2990 #endif
2991 
2992  "movd %2, %%mm2 \n\t" // QP
2993  "punpcklbw %%mm7, %%mm2 \n\t"
2994 
2995  "movq %%mm7, %%mm6 \n\t" // 0
2996  "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
2997  "pxor %%mm6, %%mm4 \n\t"
2998  "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
2999  "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
3000  "pxor %%mm7, %%mm5 \n\t"
3001  "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
3002 // 100 opcodes
3003  "psllw $3, %%mm2 \n\t" // 8QP
3004  "movq %%mm2, %%mm3 \n\t" // 8QP
3005  "pcmpgtw %%mm4, %%mm2 \n\t"
3006  "pcmpgtw %%mm5, %%mm3 \n\t"
3007  "pand %%mm2, %%mm4 \n\t"
3008  "pand %%mm3, %%mm5 \n\t"
3009 
3010 
3011  "psubusw %%mm0, %%mm4 \n\t" // hd
3012  "psubusw %%mm1, %%mm5 \n\t" // ld
3013 
3014 
3015  "movq "MANGLE(w05)", %%mm2 \n\t" // 5
3016  "pmullw %%mm2, %%mm4 \n\t"
3017  "pmullw %%mm2, %%mm5 \n\t"
3018  "movq "MANGLE(w20)", %%mm2 \n\t" // 32
3019  "paddw %%mm2, %%mm4 \n\t"
3020  "paddw %%mm2, %%mm5 \n\t"
3021  "psrlw $6, %%mm4 \n\t"
3022  "psrlw $6, %%mm5 \n\t"
3023 
3024  "movq 16(%4), %%mm0 \n\t" // L3 - L4
3025  "movq 24(%4), %%mm1 \n\t" // H3 - H4
3026 
3027  "pxor %%mm2, %%mm2 \n\t"
3028  "pxor %%mm3, %%mm3 \n\t"
3029 
3030  "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
3031  "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
3032  "pxor %%mm2, %%mm0 \n\t"
3033  "pxor %%mm3, %%mm1 \n\t"
3034  "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
3035  "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
3036  "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
3037  "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
3038 
3039  "pxor %%mm6, %%mm2 \n\t"
3040  "pxor %%mm7, %%mm3 \n\t"
3041  "pand %%mm2, %%mm4 \n\t"
3042  "pand %%mm3, %%mm5 \n\t"
3043 
3044 #if TEMPLATE_PP_MMXEXT
3045  "pminsw %%mm0, %%mm4 \n\t"
3046  "pminsw %%mm1, %%mm5 \n\t"
3047 #else
3048  "movq %%mm4, %%mm2 \n\t"
3049  "psubusw %%mm0, %%mm2 \n\t"
3050  "psubw %%mm2, %%mm4 \n\t"
3051  "movq %%mm5, %%mm2 \n\t"
3052  "psubusw %%mm1, %%mm2 \n\t"
3053  "psubw %%mm2, %%mm5 \n\t"
3054 #endif
3055  "pxor %%mm6, %%mm4 \n\t"
3056  "pxor %%mm7, %%mm5 \n\t"
3057  "psubw %%mm6, %%mm4 \n\t"
3058  "psubw %%mm7, %%mm5 \n\t"
3059  "packsswb %%mm5, %%mm4 \n\t"
3060  "movq %3, %%mm1 \n\t"
3061  "pandn %%mm4, %%mm1 \n\t"
3062  "movq (%0), %%mm0 \n\t"
3063  "paddb %%mm1, %%mm0 \n\t"
3064  "movq %%mm0, (%0) \n\t"
3065  "movq (%0, %1), %%mm0 \n\t"
3066  "psubb %%mm1, %%mm0 \n\t"
3067  "movq %%mm0, (%0, %1) \n\t"
3068 
3069  : "+r" (temp_src)
3070  : "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask), "r"(tmp)
3071  NAMED_CONSTRAINTS_ADD(w05,w20)
3072  : "%"FF_REG_a
3073  );
3074  }
3075 }
3076 #endif //TEMPLATE_PP_MMX
3077 
3078 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3079  const int8_t QPs[], int QPStride, int isColor, PPContext *c);
3080 
3081 /**
3082  * Copy a block from src to dst and fixes the blacklevel.
3083  * levelFix == 0 -> do not touch the brightness & contrast
3084  */
3085 #undef REAL_SCALED_CPY
3086 #undef SCALED_CPY
3087 
3088 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
3089  int levelFix, int64_t *packedOffsetAndScale)
3090 {
3091 #if !TEMPLATE_PP_MMX || !HAVE_6REGS
3092  int i;
3093 #endif
3094  if(levelFix){
3095 #if TEMPLATE_PP_MMX && HAVE_6REGS
3096  __asm__ volatile(
3097  "movq (%%"FF_REG_a"), %%mm2 \n\t" // packedYOffset
3098  "movq 8(%%"FF_REG_a"), %%mm3 \n\t" // packedYScale
3099  "lea (%2,%4), %%"FF_REG_a" \n\t"
3100  "lea (%3,%5), %%"FF_REG_d" \n\t"
3101  "pxor %%mm4, %%mm4 \n\t"
3102 #if TEMPLATE_PP_MMXEXT
3103 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
3104  "movq " #src1 ", %%mm0 \n\t"\
3105  "movq " #src1 ", %%mm5 \n\t"\
3106  "movq " #src2 ", %%mm1 \n\t"\
3107  "movq " #src2 ", %%mm6 \n\t"\
3108  "punpcklbw %%mm0, %%mm0 \n\t"\
3109  "punpckhbw %%mm5, %%mm5 \n\t"\
3110  "punpcklbw %%mm1, %%mm1 \n\t"\
3111  "punpckhbw %%mm6, %%mm6 \n\t"\
3112  "pmulhuw %%mm3, %%mm0 \n\t"\
3113  "pmulhuw %%mm3, %%mm5 \n\t"\
3114  "pmulhuw %%mm3, %%mm1 \n\t"\
3115  "pmulhuw %%mm3, %%mm6 \n\t"\
3116  "psubw %%mm2, %%mm0 \n\t"\
3117  "psubw %%mm2, %%mm5 \n\t"\
3118  "psubw %%mm2, %%mm1 \n\t"\
3119  "psubw %%mm2, %%mm6 \n\t"\
3120  "packuswb %%mm5, %%mm0 \n\t"\
3121  "packuswb %%mm6, %%mm1 \n\t"\
3122  "movq %%mm0, " #dst1 " \n\t"\
3123  "movq %%mm1, " #dst2 " \n\t"\
3124 
3125 #else //TEMPLATE_PP_MMXEXT
3126 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
3127  "movq " #src1 ", %%mm0 \n\t"\
3128  "movq " #src1 ", %%mm5 \n\t"\
3129  "punpcklbw %%mm4, %%mm0 \n\t"\
3130  "punpckhbw %%mm4, %%mm5 \n\t"\
3131  "psubw %%mm2, %%mm0 \n\t"\
3132  "psubw %%mm2, %%mm5 \n\t"\
3133  "movq " #src2 ", %%mm1 \n\t"\
3134  "psllw $6, %%mm0 \n\t"\
3135  "psllw $6, %%mm5 \n\t"\
3136  "pmulhw %%mm3, %%mm0 \n\t"\
3137  "movq " #src2 ", %%mm6 \n\t"\
3138  "pmulhw %%mm3, %%mm5 \n\t"\
3139  "punpcklbw %%mm4, %%mm1 \n\t"\
3140  "punpckhbw %%mm4, %%mm6 \n\t"\
3141  "psubw %%mm2, %%mm1 \n\t"\
3142  "psubw %%mm2, %%mm6 \n\t"\
3143  "psllw $6, %%mm1 \n\t"\
3144  "psllw $6, %%mm6 \n\t"\
3145  "pmulhw %%mm3, %%mm1 \n\t"\
3146  "pmulhw %%mm3, %%mm6 \n\t"\
3147  "packuswb %%mm5, %%mm0 \n\t"\
3148  "packuswb %%mm6, %%mm1 \n\t"\
3149  "movq %%mm0, " #dst1 " \n\t"\
3150  "movq %%mm1, " #dst2 " \n\t"\
3151 
3152 #endif //TEMPLATE_PP_MMXEXT
3153 #define SCALED_CPY(src1, src2, dst1, dst2)\
3154  REAL_SCALED_CPY(src1, src2, dst1, dst2)
3155 
3156 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
3157 SCALED_CPY((%2, %4, 2), (%%FF_REGa, %4, 2), (%3, %5, 2), (%%FF_REGd, %5, 2))
3158 SCALED_CPY((%2, %4, 4), (%%FF_REGa, %4, 4), (%3, %5, 4), (%%FF_REGd, %5, 4))
3159  "lea (%%"FF_REG_a",%4,4), %%"FF_REG_a" \n\t"
3160  "lea (%%"FF_REG_d",%5,4), %%"FF_REG_d" \n\t"
3161 SCALED_CPY((%%FF_REGa, %4), (%%FF_REGa, %4, 2), (%%FF_REGd, %5), (%%FF_REGd, %5, 2))
3162 
3163 
3164  : "=&a" (packedOffsetAndScale)
3165  : "0" (packedOffsetAndScale),
3166  "r"(src),
3167  "r"(dst),
3168  "r" ((x86_reg)srcStride),
3169  "r" ((x86_reg)dstStride)
3170  : "%"FF_REG_d
3171  );
3172 #else //TEMPLATE_PP_MMX && HAVE_6REGS
3173  for(i=0; i<8; i++)
3174  memcpy( &(dst[dstStride*i]),
3175  &(src[srcStride*i]), BLOCK_SIZE);
3176 #endif //TEMPLATE_PP_MMX && HAVE_6REGS
3177  }else{
3178 #if TEMPLATE_PP_MMX && HAVE_6REGS
3179  __asm__ volatile(
3180  "lea (%0,%2), %%"FF_REG_a" \n\t"
3181  "lea (%1,%3), %%"FF_REG_d" \n\t"
3182 
3183 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
3184  "movq " #src1 ", %%mm0 \n\t"\
3185  "movq " #src2 ", %%mm1 \n\t"\
3186  "movq %%mm0, " #dst1 " \n\t"\
3187  "movq %%mm1, " #dst2 " \n\t"\
3188 
3189 #define SIMPLE_CPY(src1, src2, dst1, dst2)\
3190  REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
3191 
3192 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
3193 SIMPLE_CPY((%0, %2, 2), (%%FF_REGa, %2, 2), (%1, %3, 2), (%%FF_REGd, %3, 2))
3194 SIMPLE_CPY((%0, %2, 4), (%%FF_REGa, %2, 4), (%1, %3, 4), (%%FF_REGd, %3, 4))
3195  "lea (%%"FF_REG_a",%2,4), %%"FF_REG_a" \n\t"
3196  "lea (%%"FF_REG_d",%3,4), %%"FF_REG_d" \n\t"
3197 SIMPLE_CPY((%%FF_REGa, %2), (%%FF_REGa, %2, 2), (%%FF_REGd, %3), (%%FF_REGd, %3, 2))
3198 
3199  : : "r" (src),
3200  "r" (dst),
3201  "r" ((x86_reg)srcStride),
3202  "r" ((x86_reg)dstStride)
3203  : "%"FF_REG_a, "%"FF_REG_d
3204  );
3205 #else //TEMPLATE_PP_MMX && HAVE_6REGS
3206  for(i=0; i<8; i++)
3207  memcpy( &(dst[dstStride*i]),
3208  &(src[srcStride*i]), BLOCK_SIZE);
3209 #endif //TEMPLATE_PP_MMX && HAVE_6REGS
3210  }
3211 }
3212 
3213 /**
3214  * Duplicate the given 8 src pixels ? times upward
3215  */
3216 static inline void RENAME(duplicate)(uint8_t src[], int stride)
3217 {
3218 #if TEMPLATE_PP_MMX
3219  __asm__ volatile(
3220  "movq (%0), %%mm0 \n\t"
3221  "movq %%mm0, (%0, %1, 4) \n\t"
3222  "add %1, %0 \n\t"
3223  "movq %%mm0, (%0) \n\t"
3224  "movq %%mm0, (%0, %1) \n\t"
3225  "movq %%mm0, (%0, %1, 2) \n\t"
3226  "movq %%mm0, (%0, %1, 4) \n\t"
3227  : "+r" (src)
3228  : "r" ((x86_reg)-stride)
3229  );
3230 #else
3231  int i;
3232  uint8_t *p=src;
3233  for(i=0; i<5; i++){
3234  p-= stride;
3235  memcpy(p, src, 8);
3236  }
3237 #endif
3238 }
3239 
3240 #if ARCH_X86 && TEMPLATE_PP_MMXEXT
3241 static inline void RENAME(prefetchnta)(const void *p)
3242 {
3243  __asm__ volatile( "prefetchnta (%0)\n\t"
3244  : : "r" (p)
3245  );
3246 }
3247 
3248 static inline void RENAME(prefetcht0)(const void *p)
3249 {
3250  __asm__ volatile( "prefetcht0 (%0)\n\t"
3251  : : "r" (p)
3252  );
3253 }
3254 
3255 static inline void RENAME(prefetcht1)(const void *p)
3256 {
3257  __asm__ volatile( "prefetcht1 (%0)\n\t"
3258  : : "r" (p)
3259  );
3260 }
3261 
3262 static inline void RENAME(prefetcht2)(const void *p)
3263 {
3264  __asm__ volatile( "prefetcht2 (%0)\n\t"
3265  : : "r" (p)
3266  );
3267 }
3268 #elif !ARCH_X86 && AV_GCC_VERSION_AT_LEAST(3,2)
3269 static inline void RENAME(prefetchnta)(const void *p)
3270 {
3271  __builtin_prefetch(p,0,0);
3272 }
3273 static inline void RENAME(prefetcht0)(const void *p)
3274 {
3275  __builtin_prefetch(p,0,1);
3276 }
3277 static inline void RENAME(prefetcht1)(const void *p)
3278 {
3279  __builtin_prefetch(p,0,2);
3280 }
3281 static inline void RENAME(prefetcht2)(const void *p)
3282 {
3283  __builtin_prefetch(p,0,3);
3284 }
3285 #else
3286 static inline void RENAME(prefetchnta)(const void *p)
3287 {
3288  return;
3289 }
3290 static inline void RENAME(prefetcht0)(const void *p)
3291 {
3292  return;
3293 }
3294 static inline void RENAME(prefetcht1)(const void *p)
3295 {
3296  return;
3297 }
3298 static inline void RENAME(prefetcht2)(const void *p)
3299 {
3300  return;
3301 }
3302 #endif
3303 /**
3304  * Filter array of bytes (Y or U or V values)
3305  */
3306 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3307  const int8_t QPs[], int QPStride, int isColor, PPContext *c2)
3308 {
3309  DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
3310  int x,y;
3311 #ifdef TEMPLATE_PP_TIME_MODE
3312  const int mode= TEMPLATE_PP_TIME_MODE;
3313 #else
3314  const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
3315 #endif
3316  int black=0, white=255; // blackest black and whitest white in the picture
3317  int QPCorrecture= 256*256;
3318 
3319  int copyAhead;
3320 #if TEMPLATE_PP_MMX
3321  int i;
3322 #endif
3323 
3324  const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
3325  const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
3326 
3327  //FIXME remove
3328  uint64_t * const yHistogram= c.yHistogram;
3329  uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
3330  uint8_t * const tempDst= (dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride) + 32;
3331  //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
3332 
3333  if (mode & VISUALIZE){
3334  if(!(mode & (V_A_DEBLOCK | H_A_DEBLOCK)) || TEMPLATE_PP_MMX) {
3335  av_log(c2, AV_LOG_WARNING, "Visualization is currently only supported with the accurate deblock filter without SIMD\n");
3336  }
3337  }
3338 
3339 #if TEMPLATE_PP_MMX
3340  for(i=0; i<57; i++){
3341  int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
3342  int threshold= offset*2 + 1;
3343  c.mmxDcOffset[i]= 0x7F - offset;
3344  c.mmxDcThreshold[i]= 0x7F - threshold;
3345  c.mmxDcOffset[i]*= 0x0101010101010101LL;
3346  c.mmxDcThreshold[i]*= 0x0101010101010101LL;
3347  }
3348 #endif
3349 
3350  if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
3351  else if( (mode & LINEAR_BLEND_DEINT_FILTER)
3352  || (mode & FFMPEG_DEINT_FILTER)
3353  || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
3354  else if( (mode & V_DEBLOCK)
3356  || (mode & MEDIAN_DEINT_FILTER)
3357  || (mode & V_A_DEBLOCK)) copyAhead=13;
3358  else if(mode & V_X1_FILTER) copyAhead=11;
3359 // else if(mode & V_RK1_FILTER) copyAhead=10;
3360  else if(mode & DERING) copyAhead=9;
3361  else copyAhead=8;
3362 
3363  copyAhead-= 8;
3364 
3365  if(!isColor){
3366  uint64_t sum= 0;
3367  int i;
3368  uint64_t maxClipped;
3369  uint64_t clipped;
3370  AVRational scale;
3371 
3372  c.frameNum++;
3373  // first frame is fscked so we ignore it
3374  if(c.frameNum == 1) yHistogram[0]= width*(uint64_t)height/64*15/256;
3375 
3376  for(i=0; i<256; i++){
3377  sum+= yHistogram[i];
3378  }
3379 
3380  /* We always get a completely black picture first. */
3381  maxClipped= av_rescale(sum, c.ppMode.maxClippedThreshold.num, c.ppMode.maxClippedThreshold.den);
3382 
3383  clipped= sum;
3384  for(black=255; black>0; black--){
3385  if(clipped < maxClipped) break;
3386  clipped-= yHistogram[black];
3387  }
3388 
3389  clipped= sum;
3390  for(white=0; white<256; white++){
3391  if(clipped < maxClipped) break;
3392  clipped-= yHistogram[white];
3393  }
3394 
3395  scale = (AVRational){c.ppMode.maxAllowedY - c.ppMode.minAllowedY, white - black};
3396 
3397 #if TEMPLATE_PP_MMXEXT
3398  c.packedYScale = (uint16_t)av_rescale(scale.num, 256, scale.den);
3399  c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
3400 #else
3401  c.packedYScale = (uint16_t)av_rescale(scale.num, 1024, scale.den);
3402  c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
3403 #endif
3404 
3405  c.packedYOffset|= c.packedYOffset<<32;
3406  c.packedYOffset|= c.packedYOffset<<16;
3407 
3408  c.packedYScale|= c.packedYScale<<32;
3409  c.packedYScale|= c.packedYScale<<16;
3410 
3411  if(mode & LEVEL_FIX) QPCorrecture= (int)av_rescale(scale.num, 256*256, scale.den);
3412  else QPCorrecture= 256*256;
3413  }else{
3414  c.packedYScale= 0x0100010001000100LL;
3415  c.packedYOffset= 0;
3416  QPCorrecture= 256*256;
3417  }
3418 
3419  /* copy & deinterlace first row of blocks */
3420  y=-BLOCK_SIZE;
3421  {
3422  const uint8_t *srcBlock= &(src[y*srcStride]);
3423  uint8_t *dstBlock= tempDst + dstStride;
3424 
3425  // From this point on it is guaranteed that we can read and write 16 lines downward
3426  // finish 1 block before the next otherwise we might have a problem
3427  // with the L1 Cache of the P4 ... or only a few blocks at a time or something
3428  for(x=0; x<width; x+=BLOCK_SIZE){
3429  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
3430  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
3431  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
3432  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
3433 
3434  RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
3435  srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3436 
3437  RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
3438 
3440  RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3441  else if(mode & LINEAR_BLEND_DEINT_FILTER)
3442  RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3443  else if(mode & MEDIAN_DEINT_FILTER)
3444  RENAME(deInterlaceMedian)(dstBlock, dstStride);
3445  else if(mode & CUBIC_IPOL_DEINT_FILTER)
3446  RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3447  else if(mode & FFMPEG_DEINT_FILTER)
3448  RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3449  else if(mode & LOWPASS5_DEINT_FILTER)
3450  RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3451 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3452  RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3453 */
3454  dstBlock+=8;
3455  srcBlock+=8;
3456  }
3457  if(width==FFABS(dstStride))
3458  linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
3459  else{
3460  int i;
3461  for(i=0; i<copyAhead; i++){
3462  memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
3463  }
3464  }
3465  }
3466 
3467  for(y=0; y<height; y+=BLOCK_SIZE){
3468  //1% speedup if these are here instead of the inner loop
3469  const uint8_t *srcBlock= &(src[y*srcStride]);
3470  uint8_t *dstBlock= &(dst[y*dstStride]);
3471 #if TEMPLATE_PP_MMX
3472  uint8_t *tempBlock1= c.tempBlocks;
3473  uint8_t *tempBlock2= c.tempBlocks + 8;
3474 #endif
3475  const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
3476  int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
3477  int QP=0, nonBQP=0;
3478  /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3479  if not than use a temporary buffer */
3480  if(y+15 >= height){
3481  int i;
3482  /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3483  blockcopy to dst later */
3484  linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3485  FFMAX(height-y-copyAhead, 0), srcStride);
3486 
3487  /* duplicate last line of src to fill the void up to line (copyAhead+7) */
3488  for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
3489  memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
3490 
3491  /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3492  linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
3493 
3494  /* duplicate last line of dst to fill the void up to line (copyAhead) */
3495  for(i=height-y+1; i<=copyAhead; i++)
3496  memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
3497 
3498  dstBlock= tempDst + dstStride;
3499  srcBlock= tempSrc;
3500  }
3501 
3502  // From this point on it is guaranteed that we can read and write 16 lines downward
3503  // finish 1 block before the next otherwise we might have a problem
3504  // with the L1 Cache of the P4 ... or only a few blocks at a time or something
3505  for(x=0; x<width; ){
3506  int startx = x;
3507  int endx = FFMIN(width, x+32);
3508  uint8_t *dstBlockStart = dstBlock;
3509  const uint8_t *srcBlockStart = srcBlock;
3510  int qp_index = 0;
3511  for(qp_index=0; qp_index < (endx-startx)/BLOCK_SIZE; qp_index++){
3512  QP = QPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
3513  nonBQP = nonBQPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
3514  if(!isColor){
3515  QP= (QP* QPCorrecture + 256*128)>>16;
3516  nonBQP= (nonBQP* QPCorrecture + 256*128)>>16;
3517  yHistogram[(srcBlock+qp_index*8)[srcStride*12 + 4]]++;
3518  }
3519  c.QP_block[qp_index] = QP;
3520  c.nonBQP_block[qp_index] = nonBQP;
3521 #if TEMPLATE_PP_MMX
3522  __asm__ volatile(
3523  "movd %1, %%mm7 \n\t"
3524  "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3525  "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3526  "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
3527  "movq %%mm7, %0 \n\t"
3528  : "=m" (c.pQPb_block[qp_index])
3529  : "r" (QP)
3530  );
3531 #endif
3532  }
3533  for(; x < endx; x+=BLOCK_SIZE){
3534  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
3535  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
3536  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
3537  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
3538 
3539  RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
3540  srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3541 
3543  RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3544  else if(mode & LINEAR_BLEND_DEINT_FILTER)
3545  RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3546  else if(mode & MEDIAN_DEINT_FILTER)
3547  RENAME(deInterlaceMedian)(dstBlock, dstStride);
3548  else if(mode & CUBIC_IPOL_DEINT_FILTER)
3549  RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3550  else if(mode & FFMPEG_DEINT_FILTER)
3551  RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3552  else if(mode & LOWPASS5_DEINT_FILTER)
3553  RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3554 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3555  RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3556 */
3557  dstBlock+=8;
3558  srcBlock+=8;
3559  }
3560 
3561  dstBlock = dstBlockStart;
3562  srcBlock = srcBlockStart;
3563 
3564  for(x = startx, qp_index = 0; x < endx; x+=BLOCK_SIZE, qp_index++){
3565  const int stride= dstStride;
3566  //temporary while changing QP stuff to make things continue to work
3567  //eventually QP,nonBQP,etc will be arrays and this will be unnecessary
3568  c.QP = c.QP_block[qp_index];
3569  c.nonBQP = c.nonBQP_block[qp_index];
3570  c.pQPb = c.pQPb_block[qp_index];
3571  c.pQPb2 = c.pQPb2_block[qp_index];
3572 
3573  /* only deblock if we have 2 blocks */
3574  if(y + 8 < height){
3575  if(mode & V_X1_FILTER)
3576  RENAME(vertX1Filter)(dstBlock, stride, &c);
3577  else if(mode & V_DEBLOCK){
3578  const int t= RENAME(vertClassify)(dstBlock, stride, &c);
3579 
3580  if(t==1)
3581  RENAME(doVertLowPass)(dstBlock, stride, &c);
3582  else if(t==2)
3583  RENAME(doVertDefFilter)(dstBlock, stride, &c);
3584  }else if(mode & V_A_DEBLOCK){
3585  RENAME(do_a_deblock)(dstBlock, stride, 1, &c, mode);
3586  }
3587  }
3588 
3589  dstBlock+=8;
3590  srcBlock+=8;
3591  }
3592 
3593  dstBlock = dstBlockStart;
3594  srcBlock = srcBlockStart;
3595 
3596  for(x = startx, qp_index=0; x < endx; x+=BLOCK_SIZE, qp_index++){
3597  const int stride= dstStride;
3598  c.QP = c.QP_block[qp_index];
3599  c.nonBQP = c.nonBQP_block[qp_index];
3600  c.pQPb = c.pQPb_block[qp_index];
3601  c.pQPb2 = c.pQPb2_block[qp_index];
3602 #if TEMPLATE_PP_MMX
3603  RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
3604 #endif
3605  /* check if we have a previous block to deblock it with dstBlock */
3606  if(x - 8 >= 0){
3607 #if TEMPLATE_PP_MMX
3608  if(mode & H_X1_FILTER)
3609  RENAME(vertX1Filter)(tempBlock1, 16, &c);
3610  else if(mode & H_DEBLOCK){
3611  const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
3612  if(t==1)
3613  RENAME(doVertLowPass)(tempBlock1, 16, &c);
3614  else if(t==2)
3615  RENAME(doVertDefFilter)(tempBlock1, 16, &c);
3616  }else if(mode & H_A_DEBLOCK){
3617  RENAME(do_a_deblock)(tempBlock1, 16, 1, &c, mode);
3618  }
3619 
3620  RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3621 
3622 #else
3623  if(mode & H_X1_FILTER)
3624  horizX1Filter(dstBlock-4, stride, c.QP);
3625  else if(mode & H_DEBLOCK){
3626 #if TEMPLATE_PP_ALTIVEC
3627  DECLARE_ALIGNED(16, unsigned char, tempBlock)[272];
3628  int t;
3629  transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
3630 
3631  t = vertClassify_altivec(tempBlock-48, 16, &c);
3632  if(t==1) {
3633  doVertLowPass_altivec(tempBlock-48, 16, &c);
3634  transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3635  }
3636  else if(t==2) {
3637  doVertDefFilter_altivec(tempBlock-48, 16, &c);
3638  transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3639  }
3640 #else
3641  const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
3642 
3643  if(t==1)
3644  RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
3645  else if(t==2)
3646  RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
3647 #endif
3648  }else if(mode & H_A_DEBLOCK){
3649  RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c, mode);
3650  }
3651 #endif //TEMPLATE_PP_MMX
3652  if(mode & DERING){
3653  //FIXME filter first line
3654  if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
3655  }
3656 
3657  if(mode & TEMP_NOISE_FILTER)
3658  {
3659  RENAME(tempNoiseReducer)(dstBlock-8, stride,
3660  c.tempBlurred[isColor] + y*dstStride + x,
3661  c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
3662  c.ppMode.maxTmpNoise);
3663  }
3664  }
3665 
3666  dstBlock+=8;
3667  srcBlock+=8;
3668 
3669 #if TEMPLATE_PP_MMX
3670  FFSWAP(uint8_t *, tempBlock1, tempBlock2);
3671 #endif
3672  }
3673  }
3674 
3675  if(mode & DERING){
3676  if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
3677  }
3678 
3679  if((mode & TEMP_NOISE_FILTER)){
3680  RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3681  c.tempBlurred[isColor] + y*dstStride + x,
3682  c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
3683  c.ppMode.maxTmpNoise);
3684  }
3685 
3686  /* did we use a tmp buffer for the last lines*/
3687  if(y+15 >= height){
3688  uint8_t *dstBlock= &(dst[y*dstStride]);
3689  if(width==FFABS(dstStride))
3690  linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
3691  else{
3692  int i;
3693  for(i=0; i<height-y; i++){
3694  memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
3695  }
3696  }
3697  }
3698  }
3699 #if TEMPLATE_PP_3DNOW
3700  __asm__ volatile("femms");
3701 #elif TEMPLATE_PP_MMX
3702  __asm__ volatile("emms");
3703 #endif
3704 
3705 #ifdef DEBUG_BRIGHTNESS
3706  if(!isColor){
3707  int max=1;
3708  int i;
3709  for(i=0; i<256; i++)
3710  if(yHistogram[i] > max) max=yHistogram[i];
3711 
3712  for(i=1; i<256; i++){
3713  int x;
3714  int start=yHistogram[i-1]/(max/256+1);
3715  int end=yHistogram[i]/(max/256+1);
3716  int inc= end > start ? 1 : -1;
3717  for(x=start; x!=end+inc; x+=inc)
3718  dst[ i*dstStride + x]+=128;
3719  }
3720 
3721  for(i=0; i<100; i+=2){
3722  dst[ (white)*dstStride + i]+=128;
3723  dst[ (black)*dstStride + i]+=128;
3724  }
3725  }
3726 #endif
3727 
3728  *c2= c; //copy local context back
3729 
3730 }
3731 
3732 #undef RENAME
3733 #undef TEMPLATE_PP_C
3734 #undef TEMPLATE_PP_ALTIVEC
3735 #undef TEMPLATE_PP_MMX
3736 #undef TEMPLATE_PP_MMXEXT
3737 #undef TEMPLATE_PP_3DNOW
3738 #undef TEMPLATE_PP_SSE2
error
static void error(const char *err)
Definition: target_bsf_fuzzer.c:31
FFMPEG_DEINT_FILTER
#define FFMPEG_DEINT_FILTER
Definition: postprocess_internal.h:67
stride
int stride
Definition: mace.c:144
AV_LOG_WARNING
#define AV_LOG_WARNING
Something somehow does not look correct.
Definition: log.h:186
mem_internal.h
PPContext
postprocess context.
Definition: postprocess_internal.h:116
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:26
step
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But a word about which is also called distortion Distortion can be quantified by almost any quality measurement one chooses the sum of squared differences is used but more complex methods that consider psychovisual effects can be used as well It makes no difference in this discussion First step
Definition: rate_distortion.txt:58
LOWPASS5_DEINT_FILTER
#define LOWPASS5_DEINT_FILTER
Definition: postprocess_internal.h:68
b
#define b
Definition: input.c:40
NAMED_CONSTRAINTS_ADD
#define NAMED_CONSTRAINTS_ADD(...)
Definition: asm.h:145
horizX1Filter
static void horizX1Filter(uint8_t *src, int stride, int QP)
Experimental Filter 1 (Horizontal) will not damage linear gradients Flat blocks should look like they...
Definition: postprocess.c:347
doVertLowPass_altivec
static void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
Definition: postprocess_altivec_template.c:214
MEDIAN
@ MEDIAN
Definition: huffyuv.h:52
t1
#define t1
Definition: regdef.h:29
max
#define max(a, b)
Definition: cuda_runtime.h:33
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
H_A_DEBLOCK
#define H_A_DEBLOCK
Definition: postprocess_internal.h:56
FFSIGN
#define FFSIGN(a)
Definition: common.h:66
QP
#define QP(qP, depth)
Definition: h264data.c:190
scale
static av_always_inline float scale(float x, float s)
Definition: vf_v360.c:1388
first
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But first
Definition: rate_distortion.txt:12
postProcess
static void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, const int8_t QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
Definition: postprocess.c:559
width
#define width
s
#define s(width, name)
Definition: cbs_vp9.c:257
V_A_DEBLOCK
#define V_A_DEBLOCK
Definition: postprocess_internal.h:52
V_DEBLOCK
#define V_DEBLOCK
Definition: postprocess_internal.h:36
TEMP_NOISE_FILTER
#define TEMP_NOISE_FILTER
Definition: postprocess_internal.h:70
XMM_CLOBBERS
#define XMM_CLOBBERS(...)
Definition: asm.h:98
f
#define f(width, name)
Definition: cbs_vp9.c:255
FFABS
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
Definition: common.h:65
TEMPLATE_PP_SSE2
#define TEMPLATE_PP_SSE2
Definition: postprocess_template.c:75
AVRational
Rational number (pair of numerator and denominator).
Definition: rational.h:58
MEDIAN_DEINT_FILTER
#define MEDIAN_DEINT_FILTER
Definition: postprocess_internal.h:66
src
#define src
Definition: vp8dsp.c:255
linecpy
static void linecpy(void *dest, const void *src, int lines, int stride)
Definition: postprocess_internal.h:177
V_X1_FILTER
#define V_X1_FILTER
Definition: postprocess_internal.h:51
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
transpose_16x8_char_toPackedAlign_altivec
static void transpose_16x8_char_toPackedAlign_altivec(unsigned char *dst, unsigned char *src, int stride)
Definition: postprocess_altivec_template.c:1016
PAVGB
#define PAVGB(a, b)
Definition: postprocess_template.c:88
asm.h
transpose_8x16_char_fromPackedAlign_altivec
static void transpose_8x16_char_fromPackedAlign_altivec(unsigned char *dst, unsigned char *src, int stride)
Definition: postprocess_altivec_template.c:1121
avg
#define avg(a, b, c, d)
Definition: colorspacedsp_template.c:28
PREV
@ PREV
Definition: vf_fftdnoiz.c:32
height
#define height
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
H_DEBLOCK
#define H_DEBLOCK
Definition: postprocess_internal.h:37
AV_LOG_INFO
#define AV_LOG_INFO
Standard information.
Definition: log.h:191
DERING
#define DERING
Definition: postprocess_internal.h:38
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem.h:116
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:271
VISUALIZE
#define VISUALIZE
Definition: postprocess_internal.h:73
t3
#define t3
Definition: regdef.h:31
RENAME
#define RENAME(name)
Definition: ffv1.h:195
av_always_inline
#define av_always_inline
Definition: attributes.h:49
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
av_rescale
int64_t av_rescale(int64_t a, int64_t b, int64_t c)
Rescale a 64-bit integer with rounding to nearest.
Definition: mathematics.c:128
NEXT
@ NEXT
Definition: vf_fftdnoiz.c:33
CUBIC_IPOL_DEINT_FILTER
#define CUBIC_IPOL_DEINT_FILTER
Definition: postprocess_internal.h:65
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
FFSWAP
#define FFSWAP(type, a, b)
Definition: macros.h:52
vertClassify_altivec
static int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
Definition: postprocess_altivec_template.c:59
TEMPLATE_PP_MMX
#define TEMPLATE_PP_MMX
Definition: postprocess_template.c:49
c2
static const uint64_t c2
Definition: murmur3.c:52
t2
#define t2
Definition: regdef.h:30
doVertDefFilter_altivec
static void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c)
Definition: postprocess_altivec_template.c:412
mode
mode
Definition: ebur128.h:83
ref
static int ref[MAX_W *MAX_W]
Definition: jpeg2000dwt.c:107
LINEAR_BLEND_DEINT_FILTER
#define LINEAR_BLEND_DEINT_FILTER
Definition: postprocess_internal.h:63
av_clip_uint8
#define av_clip_uint8
Definition: common.h:102
LINEAR_IPOL_DEINT_FILTER
#define LINEAR_IPOL_DEINT_FILTER
Definition: postprocess_internal.h:62
MANGLE
#define MANGLE(a)
Definition: asm.h:127
diff
static av_always_inline int diff(const uint32_t a, const uint32_t b)
Definition: vf_palettegen.c:139
H_X1_FILTER
#define H_X1_FILTER
Definition: postprocess_internal.h:55
x86_reg
int x86_reg
Definition: asm.h:72
d
d
Definition: ffmpeg_filter.c:153
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:28
BLOCK_SIZE
#define BLOCK_SIZE
Definition: adx.h:53
int
int
Definition: ffmpeg_filter.c:153
LEVEL_FIX
#define LEVEL_FIX
Brightness & Contrast.
Definition: postprocess_internal.h:39
min
float min
Definition: vorbis_enc_data.h:429