FFmpeg
postprocess_template.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 /**
22  * @file
23  * mmx/mmx2/3dnow postprocess code.
24  */
25 
26 #include "libavutil/x86/asm.h"
27 
28 /* A single TEMPLATE_PP_* should be defined (to 1) when this template is
29  * included. The following macros will define its dependencies to 1 as well
30  * (like MMX2 depending on MMX), and will define to 0 all the others. Every
31  * TEMPLATE_PP_* needs to be #undef'ed at the end.
 *
 * The selected variant also defines RENAME(), which pastes a per-variant
 * suffix onto the name of every function defined by this template. */
32 
// Plain C variant: functions get the _C suffix.
33 #ifdef TEMPLATE_PP_C
34 # define RENAME(a) a ## _C
35 #else
36 # define TEMPLATE_PP_C 0
37 #endif
38 
// TEMPLATE_PP_ALTIVEC variant: functions get the _altivec suffix.
39 #ifdef TEMPLATE_PP_ALTIVEC
40 # define RENAME(a) a ## _altivec
41 #else
42 # define TEMPLATE_PP_ALTIVEC 0
43 #endif
44 
// TEMPLATE_PP_MMX variant: functions get the _MMX suffix.
45 #ifdef TEMPLATE_PP_MMX
46 # define RENAME(a) a ## _MMX
47 #else
48 # define TEMPLATE_PP_MMX 0
49 #endif
50 
// TEMPLATE_PP_MMXEXT variant: forces TEMPLATE_PP_MMX to 1; functions get the _MMX2 suffix.
51 #ifdef TEMPLATE_PP_MMXEXT
52 # undef TEMPLATE_PP_MMX
53 # define TEMPLATE_PP_MMX 1
54 # define RENAME(a) a ## _MMX2
55 #else
56 # define TEMPLATE_PP_MMXEXT 0
57 #endif
58 
// TEMPLATE_PP_3DNOW variant: forces TEMPLATE_PP_MMX to 1; functions get the _3DNow suffix.
59 #ifdef TEMPLATE_PP_3DNOW
60 # undef TEMPLATE_PP_MMX
61 # define TEMPLATE_PP_MMX 1
62 # define RENAME(a) a ## _3DNow
63 #else
64 # define TEMPLATE_PP_3DNOW 0
65 #endif
66 
// TEMPLATE_PP_SSE2 variant: forces both TEMPLATE_PP_MMX and TEMPLATE_PP_MMXEXT to 1;
// functions get the _SSE2 suffix.
67 #ifdef TEMPLATE_PP_SSE2
68 # undef TEMPLATE_PP_MMX
69 # define TEMPLATE_PP_MMX 1
70 # undef TEMPLATE_PP_MMXEXT
71 # define TEMPLATE_PP_MMXEXT 1
72 # define RENAME(a) a ## _SSE2
73 #else
74 # define TEMPLATE_PP_SSE2 0
75 #endif
76 
// Remove any earlier definitions of the helper macros before redefining them
// (presumably left over from a previous inclusion of this template for
// another variant).
77 #undef REAL_PAVGB
78 #undef PAVGB
79 #undef PMINUB
80 #undef PMAXUB
81 
// PAVGB(a,b): emits a packed byte average of a into b, using pavgb when
// MMXEXT is enabled and pavgusb when only 3DNow is enabled. REAL_PAVGB has no
// definition when neither is enabled, so PAVGB can only be used in code
// guarded by TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW.
// PAVGB forwards to REAL_PAVGB so that macro arguments are expanded before
// they are stringified.
82 #if TEMPLATE_PP_MMXEXT
83 #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
84 #elif TEMPLATE_PP_3DNOW
85 #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
86 #endif
87 #define PAVGB(a,b) REAL_PAVGB(a,b)
88 
// PMINUB(x,y,t): unsigned byte minimum of x and y, result left in y (the
// second argument). With MMXEXT this is a single pminub and t is unused.
// The plain MMX fallback names its parameters (b,a,t), so "a" below is the
// second argument; it computes t = a; t = saturate(t - b); a = a - t,
// i.e. a - max(a - b, 0) = min(a, b), and overwrites the scratch register t.
89 #if TEMPLATE_PP_MMXEXT
90 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
91 #elif TEMPLATE_PP_MMX
92 #define PMINUB(b,a,t) \
93  "movq " #a ", " #t " \n\t"\
94  "psubusb " #b ", " #t " \n\t"\
95  "psubb " #t ", " #a " \n\t"
96 #endif
97 
// PMAXUB(a,b): unsigned byte maximum of a and b, result left in b.
// The plain MMX fallback computes b = saturate(b - a) + a = max(a, b).
98 #if TEMPLATE_PP_MMXEXT
99 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
100 #elif TEMPLATE_PP_MMX
101 #define PMAXUB(a,b) \
102  "psubusb " #a ", " #b " \n\t"\
103  "paddb " #a ", " #b " \n\t"
104 #endif
105 
106 //FIXME? |255-0| = 1 (should not be a problem ...)
107 #if TEMPLATE_PP_MMX
108 /**
109  * Check if the middle 8x8 Block in the given 8x16 block is flat
 *
 * The 8 rows starting at src + 4*stride are examined, 8 bytes per row.
 *
 * numEq: for every pair of vertically adjacent rows (7 pairs) and every
 * column, the signed byte test (upper - lower + offset) > threshold is made,
 * with offset = c->mmxDcOffset[c->nonBQP] and
 * threshold = c->mmxDcThreshold[c->nonBQP]; numEq is the number of tests
 * that succeed.
 *
 * dcOk: zero only if, in every column, max - min over the 8 rows is at most
 * twice the corresponding byte of c->pQPb. Despite its name, a nonzero dcOk
 * therefore means that this range check failed.
 *
 * @param src    top-left byte of the 8x16 block
 * @param stride distance in bytes between two consecutive rows
 * @param c      context; the fields named above and
 *               ppMode.flatnessThreshold are read
 * @return 2 if numEq does not exceed c->ppMode.flatnessThreshold;
 *         otherwise 1 if dcOk is zero and 0 if it is nonzero
110  */
111 static inline int RENAME(vertClassify)(const uint8_t src[], int stride, PPContext *c){
112  int numEq= 0, dcOk;
113  src+= stride*4; // src points to begin of the 8x8 Block
 // Load the per-byte offset into mm7 and the threshold into mm6.
 // NOTE(review): the next asm statement relies on mm6/mm7 still holding
 // these values; nothing in the constraints expresses that dependency.
114  __asm__ volatile(
115  "movq %0, %%mm7 \n\t"
116  "movq %1, %%mm6 \n\t"
117  : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
118  );
119 
120  __asm__ volatile(
121  "lea (%2, %3), %%"FF_REG_a" \n\t"
122 // 0 1 2 3 4 5 6 7 8 9
123 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
// NOTE: the table above uses operand numbers that do not match this
// statement: here src is %2 and stride is %3, and only the "a" register is
// used (it is advanced by 4 rows half-way through instead of using ecx).
// mm0 accumulates the compare masks (0 or -1 per byte), mm4 tracks the
// running per-column maximum and mm3 the running per-column minimum.
124 
125  "movq (%2), %%mm0 \n\t"
126  "movq (%%"FF_REG_a"), %%mm1 \n\t"
127  "movq %%mm0, %%mm3 \n\t"
128  "movq %%mm0, %%mm4 \n\t"
129  PMAXUB(%%mm1, %%mm4)
130  PMINUB(%%mm1, %%mm3, %%mm5)
131  "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
132  "paddb %%mm7, %%mm0 \n\t"
133  "pcmpgtb %%mm6, %%mm0 \n\t"
134 
135  "movq (%%"FF_REG_a",%3), %%mm2 \n\t"
136  PMAXUB(%%mm2, %%mm4)
137  PMINUB(%%mm2, %%mm3, %%mm5)
138  "psubb %%mm2, %%mm1 \n\t"
139  "paddb %%mm7, %%mm1 \n\t"
140  "pcmpgtb %%mm6, %%mm1 \n\t"
141  "paddb %%mm1, %%mm0 \n\t"
142 
143  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
144  PMAXUB(%%mm1, %%mm4)
145  PMINUB(%%mm1, %%mm3, %%mm5)
146  "psubb %%mm1, %%mm2 \n\t"
147  "paddb %%mm7, %%mm2 \n\t"
148  "pcmpgtb %%mm6, %%mm2 \n\t"
149  "paddb %%mm2, %%mm0 \n\t"
150 
// advance the "a" register by 4 rows: it now addresses row 5
151  "lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t"
152 
153  "movq (%2, %3, 4), %%mm2 \n\t"
154  PMAXUB(%%mm2, %%mm4)
155  PMINUB(%%mm2, %%mm3, %%mm5)
156  "psubb %%mm2, %%mm1 \n\t"
157  "paddb %%mm7, %%mm1 \n\t"
158  "pcmpgtb %%mm6, %%mm1 \n\t"
159  "paddb %%mm1, %%mm0 \n\t"
160 
161  "movq (%%"FF_REG_a"), %%mm1 \n\t"
162  PMAXUB(%%mm1, %%mm4)
163  PMINUB(%%mm1, %%mm3, %%mm5)
164  "psubb %%mm1, %%mm2 \n\t"
165  "paddb %%mm7, %%mm2 \n\t"
166  "pcmpgtb %%mm6, %%mm2 \n\t"
167  "paddb %%mm2, %%mm0 \n\t"
168 
169  "movq (%%"FF_REG_a", %3), %%mm2 \n\t"
170  PMAXUB(%%mm2, %%mm4)
171  PMINUB(%%mm2, %%mm3, %%mm5)
172  "psubb %%mm2, %%mm1 \n\t"
173  "paddb %%mm7, %%mm1 \n\t"
174  "pcmpgtb %%mm6, %%mm1 \n\t"
175  "paddb %%mm1, %%mm0 \n\t"
176 
177  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
178  PMAXUB(%%mm1, %%mm4)
179  PMINUB(%%mm1, %%mm3, %%mm5)
180  "psubb %%mm1, %%mm2 \n\t"
181  "paddb %%mm7, %%mm2 \n\t"
182  "pcmpgtb %%mm6, %%mm2 \n\t"
183  "paddb %%mm2, %%mm0 \n\t"
184  "psubusb %%mm3, %%mm4 \n\t"
// mm4 = per-column (max - min)
185 
186  " \n\t"
// Horizontal sum of the 8 mask bytes of mm0; only the low byte of the
// result is used by the C code below.
187 #if TEMPLATE_PP_MMXEXT
188  "pxor %%mm7, %%mm7 \n\t"
189  "psadbw %%mm7, %%mm0 \n\t"
190 #else
191  "movq %%mm0, %%mm1 \n\t"
192  "psrlw $8, %%mm0 \n\t"
193  "paddb %%mm1, %%mm0 \n\t"
194  "movq %%mm0, %%mm1 \n\t"
195  "psrlq $16, %%mm0 \n\t"
196  "paddb %%mm1, %%mm0 \n\t"
197  "movq %%mm0, %%mm1 \n\t"
198  "psrlq $32, %%mm0 \n\t"
199  "paddb %%mm1, %%mm0 \n\t"
200 #endif
201  "movq %4, %%mm7 \n\t" // QP,..., QP
202  "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
203  "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0
// fold the 8 bytes into 32 bits so that one movd can test all columns
204  "packssdw %%mm4, %%mm4 \n\t"
205  "movd %%mm0, %0 \n\t"
206  "movd %%mm4, %1 \n\t"
207 
208  : "=r" (numEq), "=r" (dcOk)
209  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
210  : "%"FF_REG_a
211  );
212 
 // Every successful compare contributed -1 to a byte, so negating the low
 // byte of the sum yields the number of successful compares.
213  numEq= (-numEq) &0xFF;
214  if(numEq > c->ppMode.flatnessThreshold){
215  if(dcOk) return 0;
216  else return 1;
217  }else{
218  return 2;
219  }
220 }
221 #endif //TEMPLATE_PP_MMX
222 
223 /**
224  * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle)
225  * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16
 *
 * Rows are counted from src + 3*stride (row 0). Rows 0..9 are read and
 * rows 1..8 are overwritten, 8 bytes per row in the asm path and BLOCK_SIZE
 * columns in the C path.
 *
 * The samples beyond the two ends of the filtered range are replaced by a
 * "first" and a "last" value: row 0 (resp. row 9) if it differs from row 1
 * (resp. row 8) by little compared with QP, otherwise row 1 (resp. row 8).
 * The C path tests "< c->QP"; the asm path tests "<=" against the bytes of
 * c->pQPb (see the comments on the compares).
 *
 * @param src    top-left byte of the 8x16 block
 * @param stride distance in bytes between two consecutive rows
 * @param c      context; c->pQPb (asm path) or c->QP (C path) is read
226  */
227 #if !TEMPLATE_PP_ALTIVEC
228 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
229 {
230 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
231  src+= stride*3;
 // NOTE(review): operand %0 is declared as an input but is incremented by
 // the "add" below and restored by the final "sub"; the statement also
 // stores to memory without declaring a "memory" clobber.
232  __asm__ volatile( //"movv %0 %1 %2\n\t"
233  "movq %2, %%mm0 \n\t" // QP,..., QP
234  "pxor %%mm4, %%mm4 \n\t"
235 
236  "movq (%0), %%mm6 \n\t"
237  "movq (%0, %1), %%mm5 \n\t"
238  "movq %%mm5, %%mm1 \n\t"
239  "movq %%mm6, %%mm2 \n\t"
240  "psubusb %%mm6, %%mm5 \n\t"
241  "psubusb %%mm1, %%mm2 \n\t"
242  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
243  "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
244  "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
245 
246  "pand %%mm2, %%mm6 \n\t"
247  "pandn %%mm1, %%mm2 \n\t"
248  "por %%mm2, %%mm6 \n\t"// First Line to Filter
249 
250  "movq (%0, %1, 8), %%mm5 \n\t"
251  "lea (%0, %1, 4), %%"FF_REG_a" \n\t"
252  "lea (%0, %1, 8), %%"FF_REG_c" \n\t"
253  "sub %1, %%"FF_REG_c" \n\t"
254  "add %1, %0 \n\t" // %0 points to line 1 not 0
255  "movq (%0, %1, 8), %%mm7 \n\t"
256  "movq %%mm5, %%mm1 \n\t"
257  "movq %%mm7, %%mm2 \n\t"
258  "psubusb %%mm7, %%mm5 \n\t"
259  "psubusb %%mm1, %%mm2 \n\t"
260  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
261  "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0
262  "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF
263 
264  "pand %%mm2, %%mm7 \n\t"
265  "pandn %%mm1, %%mm2 \n\t"
266  "por %%mm2, %%mm7 \n\t" // Last Line to Filter (row 9 or row 8)
267 
268 
269  // 1 2 3 4 5 6 7 8
270  // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1
271  // 6 4 2 2 1 1
272  // 6 4 4 2
273  // 6 8 2
 // The table above gives the address of each output row (after %0 was
 // advanced to row 1); every "X" below marks the store of one filtered row.
274 
275  "movq (%0, %1), %%mm0 \n\t" // 1
276  "movq %%mm0, %%mm1 \n\t" // 1
277  PAVGB(%%mm6, %%mm0) //1 1 /2
278  PAVGB(%%mm6, %%mm0) //3 1 /4
279 
280  "movq (%0, %1, 4), %%mm2 \n\t" // 1
281  "movq %%mm2, %%mm5 \n\t" // 1
282  PAVGB((%%FF_REGa), %%mm2) // 11 /2
283  PAVGB((%0, %1, 2), %%mm2) // 211 /4
284  "movq %%mm2, %%mm3 \n\t" // 211 /4
285  "movq (%0), %%mm4 \n\t" // 1
286  PAVGB(%%mm4, %%mm3) // 4 211 /8
287  PAVGB(%%mm0, %%mm3) //642211 /16
288  "movq %%mm3, (%0) \n\t" // X
289  // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9
290  "movq %%mm1, %%mm0 \n\t" // 1
291  PAVGB(%%mm6, %%mm0) //1 1 /2
292  "movq %%mm4, %%mm3 \n\t" // 1
293  PAVGB((%0,%1,2), %%mm3) // 1 1 /2
294  PAVGB((%%FF_REGa,%1,2), %%mm5) // 11 /2
295  PAVGB((%%FF_REGa), %%mm5) // 211 /4
296  PAVGB(%%mm5, %%mm3) // 2 2211 /8
297  PAVGB(%%mm0, %%mm3) //4242211 /16
298  "movq %%mm3, (%0,%1) \n\t" // X
299  // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9
300  PAVGB(%%mm4, %%mm6) //11 /2
301  "movq (%%"FF_REG_c"), %%mm0 \n\t" // 1
302  PAVGB((%%FF_REGa, %1, 2), %%mm0) // 11/2
303  "movq %%mm0, %%mm3 \n\t" // 11/2
304  PAVGB(%%mm1, %%mm0) // 2 11/4
305  PAVGB(%%mm6, %%mm0) //222 11/8
306  PAVGB(%%mm2, %%mm0) //22242211/16
307  "movq (%0, %1, 2), %%mm2 \n\t" // 1
308  "movq %%mm0, (%0, %1, 2) \n\t" // X
309  // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9
310  "movq (%%"FF_REG_a", %1, 4), %%mm0 \n\t" // 1
311  PAVGB((%%FF_REGc), %%mm0) // 11 /2
312  PAVGB(%%mm0, %%mm6) //11 11 /4
313  PAVGB(%%mm1, %%mm4) // 11 /2
314  PAVGB(%%mm2, %%mm1) // 11 /2
315  PAVGB(%%mm1, %%mm6) //1122 11 /8
316  PAVGB(%%mm5, %%mm6) //112242211 /16
317  "movq (%%"FF_REG_a"), %%mm5 \n\t" // 1
318  "movq %%mm6, (%%"FF_REG_a") \n\t" // X
319  // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9
320  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t" // 1
321  PAVGB(%%mm7, %%mm6) // 11 /2
322  PAVGB(%%mm4, %%mm6) // 11 11 /4
323  PAVGB(%%mm3, %%mm6) // 11 2211 /8
324  PAVGB(%%mm5, %%mm2) // 11 /2
325  "movq (%0, %1, 4), %%mm4 \n\t" // 1
326  PAVGB(%%mm4, %%mm2) // 112 /4
327  PAVGB(%%mm2, %%mm6) // 112242211 /16
328  "movq %%mm6, (%0, %1, 4) \n\t" // X
329  // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9
330  PAVGB(%%mm7, %%mm1) // 11 2 /4
331  PAVGB(%%mm4, %%mm5) // 11 /2
332  PAVGB(%%mm5, %%mm0) // 11 11 /4
333  "movq (%%"FF_REG_a", %1, 2), %%mm6 \n\t" // 1
334  PAVGB(%%mm6, %%mm1) // 11 4 2 /8
335  PAVGB(%%mm0, %%mm1) // 11224222 /16
336  "movq %%mm1, (%%"FF_REG_a", %1, 2) \n\t" // X
337  // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9
338  PAVGB((%%FF_REGc), %%mm2) // 112 4 /8
339  "movq (%%"FF_REG_a", %1, 4), %%mm0 \n\t" // 1
340  PAVGB(%%mm0, %%mm6) // 1 1 /2
341  PAVGB(%%mm7, %%mm6) // 1 12 /4
342  PAVGB(%%mm2, %%mm6) // 1122424 /4
343  "movq %%mm6, (%%"FF_REG_c") \n\t" // X
344  // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9
345  PAVGB(%%mm7, %%mm5) // 11 2 /4
346  PAVGB(%%mm7, %%mm5) // 11 6 /8
347 
348  PAVGB(%%mm3, %%mm0) // 112 /4
349  PAVGB(%%mm0, %%mm5) // 112246 /16
350  "movq %%mm5, (%%"FF_REG_a", %1, 4) \n\t" // X
351  "sub %1, %0 \n\t" // undo the "add %1, %0" above
352 
353  :
354  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
355  : "%"FF_REG_a, "%"FF_REG_c
356  );
357 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
 // lN is the offset of row N relative to row 0 (src + 3*stride).
358  const int l1= stride;
359  const int l2= stride + l1;
360  const int l3= stride + l2;
361  const int l4= stride + l3;
362  const int l5= stride + l4;
363  const int l6= stride + l5;
364  const int l7= stride + l6;
365  const int l8= stride + l7;
366  const int l9= stride + l8;
367  int x;
368  src+= stride*3;
369  for(x=0; x<BLOCK_SIZE; x++){
370  const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
371  const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
372 
 // sums[k] is a sliding 4-sample sum (plus a rounding term of 4), with
 // "first"/"last" standing in for samples outside rows 1..8; each is
 // derived from the previous one by dropping one sample and adding one.
373  int sums[10];
374  sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
375  sums[1] = sums[0] - first + src[l4];
376  sums[2] = sums[1] - first + src[l5];
377  sums[3] = sums[2] - first + src[l6];
378  sums[4] = sums[3] - first + src[l7];
379  sums[5] = sums[4] - src[l1] + src[l8];
380  sums[6] = sums[5] - src[l2] + last;
381  sums[7] = sums[6] - src[l3] + last;
382  sums[8] = sums[7] - src[l4] + last;
383  sums[9] = sums[8] - src[l5] + last;
384 
 // All sums were computed from the unfiltered samples above, so writing
 // the rows in place here does not feed filtered data back in.
385  src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
386  src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
387  src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
388  src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
389  src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
390  src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
391  src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
392  src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
393 
394  src++;
395  }
396 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
397 }
398 #endif //TEMPLATE_PP_ALTIVEC
399 
400 /**
401  * Experimental Filter 1
402  * will not damage linear gradients
403  * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter
404  * can only smooth blocks at the expected locations (it cannot smooth them if they did move)
405  * MMX2 version does correct clipping C version does not
 *
 * Rows are counted from src + 3*stride (row 0). Rows 3..6 are read to compute
 *   d = max(0, |l4 - l5| - (|l3 - l4| + |l5 - l6|)/2)
 * and, when d is small compared with 2*QP, rows 2..7 are adjusted in place:
 * rows 4/5 by about 3d/8, rows 3/6 by about d/4 and rows 2/7 by about d/8,
 * the sign depending on the sign of l4 - l5.
 * The C path tests "d < 2*co->QP"; the asm path tests "d <= 2*QP" against
 * the bytes of co->pQPb and subtracts the b01 constant from d first.
 *
 * @param src    top-left byte of the 8x16 block
 * @param stride distance in bytes between two consecutive rows
 * @param co     context; co->pQPb (asm path) or co->QP (C path) is read
406  */
407 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
408 {
409 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
410  src+= stride*3;
411 
412  __asm__ volatile(
413  "pxor %%mm7, %%mm7 \n\t" // 0
414  "lea (%0, %1), %%"FF_REG_a" \n\t"
415  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
416 // 0 1 2 3 4 5 6 7 8 9
417 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
418  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3
419  "movq (%0, %1, 4), %%mm1 \n\t" // line 4
420  "movq %%mm1, %%mm2 \n\t" // line 4
421  "psubusb %%mm0, %%mm1 \n\t"
422  "psubusb %%mm2, %%mm0 \n\t"
423  "por %%mm1, %%mm0 \n\t" // |l3 - l4|
424  "movq (%%"FF_REG_c"), %%mm3 \n\t" // line 5
425  "movq (%%"FF_REG_c", %1), %%mm4 \n\t" // line 6
426  "movq %%mm3, %%mm5 \n\t" // line 5
427  "psubusb %%mm4, %%mm3 \n\t"
428  "psubusb %%mm5, %%mm4 \n\t"
429  "por %%mm4, %%mm3 \n\t" // |l5 - l6|
430  PAVGB(%%mm3, %%mm0) // (|l3 - l4| + |l5 - l6|)/2
431  "movq %%mm2, %%mm1 \n\t" // line 4
432  "psubusb %%mm5, %%mm2 \n\t"
433  "movq %%mm2, %%mm4 \n\t"
434  "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0
435  "psubusb %%mm1, %%mm5 \n\t"
436  "por %%mm5, %%mm4 \n\t" // |l4 - l5|
437  "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l3-l4| + |l5-l6|)/2)
438  "movq %%mm4, %%mm3 \n\t" // d
439  "movq %2, %%mm0 \n\t" // QP
440  "paddusb %%mm0, %%mm0 \n\t" // 2QP
441  "psubusb %%mm0, %%mm4 \n\t"
442  "pcmpeqb %%mm7, %%mm4 \n\t" // d <= 2QP ? -1 : 0
443  "psubusb "MANGLE(b01)", %%mm3 \n\t"
444  "pand %%mm4, %%mm3 \n\t" // d <= 2QP ? d : 0
445 
446  PAVGB(%%mm7, %%mm3) // d/2
447  "movq %%mm3, %%mm1 \n\t" // d/2
448  PAVGB(%%mm7, %%mm3) // d/4
449  PAVGB(%%mm1, %%mm3) // 3*d/8
450 
// For each row below, the sample is conditionally complemented with the
// sign mask in mm2, adjusted with a saturating add/sub, and complemented
// back, so that the correction is applied in the right direction.
451  "movq (%0, %1, 4), %%mm0 \n\t" // line 4
452  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4
453  "psubusb %%mm3, %%mm0 \n\t"
454  "pxor %%mm2, %%mm0 \n\t"
455  "movq %%mm0, (%0, %1, 4) \n\t" // line 4
456 
457  "movq (%%"FF_REG_c"), %%mm0 \n\t" // line 5
458  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5
459  "paddusb %%mm3, %%mm0 \n\t"
460  "pxor %%mm2, %%mm0 \n\t"
461  "movq %%mm0, (%%"FF_REG_c") \n\t" // line 5
462 
463  PAVGB(%%mm7, %%mm1) // d/4
464 
465  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t" // line 3
466  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l3-1 : l3
467  "psubusb %%mm1, %%mm0 \n\t"
468  "pxor %%mm2, %%mm0 \n\t"
469  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t" // line 3
470 
471  "movq (%%"FF_REG_c", %1), %%mm0 \n\t" // line 6
472  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l6-1 : l6
473  "paddusb %%mm1, %%mm0 \n\t"
474  "pxor %%mm2, %%mm0 \n\t"
475  "movq %%mm0, (%%"FF_REG_c", %1) \n\t" // line 6
476 
477  PAVGB(%%mm7, %%mm1) // d/8
478 
479  "movq (%%"FF_REG_a", %1), %%mm0 \n\t" // line 2
480  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2
481  "psubusb %%mm1, %%mm0 \n\t"
482  "pxor %%mm2, %%mm0 \n\t"
483  "movq %%mm0, (%%"FF_REG_a", %1) \n\t" // line 2
484 
485  "movq (%%"FF_REG_c", %1, 2), %%mm0 \n\t" // line 7
486  "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7
487  "paddusb %%mm1, %%mm0 \n\t"
488  "pxor %%mm2, %%mm0 \n\t"
489  "movq %%mm0, (%%"FF_REG_c", %1, 2) \n\t" // line 7
490 
491  :
492  : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb)
 // b01 is referenced through MANGLE() in the asm text above, so it must
 // also be declared as a named operand (as doVertDefFilter does for the
 // constants it uses); otherwise builds in which MANGLE() expands to a
 // named-operand reference have no operand to bind it to.
493  NAMED_CONSTRAINTS_ADD(b01)
494  : "%"FF_REG_a, "%"FF_REG_c
495  );
496 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
497 
 // lN is the offset of row N relative to row 0 (src + 3*stride).
498  const int l1= stride;
499  const int l2= stride + l1;
500  const int l3= stride + l2;
501  const int l4= stride + l3;
502  const int l5= stride + l4;
503  const int l6= stride + l5;
504  const int l7= stride + l6;
505 // const int l8= stride + l7;
506 // const int l9= stride + l8;
507  int x;
508 
509  src+= stride*3;
510  for(x=0; x<BLOCK_SIZE; x++){
511  int a= src[l3] - src[l4];
512  int b= src[l4] - src[l5];
513  int c= src[l5] - src[l6];
514 
 // d: how much the step across rows 4/5 exceeds the average step of
 // its two neighbouring row pairs, clamped at 0.
515  int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
516  d= FFMAX(d, 0);
517 
518  if(d < co->QP*2){
519  int v = d * FFSIGN(-b);
520 
 // Spread the correction over rows 2..7; the results are stored
 // without clipping (see the note in the function comment).
521  src[l2] +=v>>3;
522  src[l3] +=v>>2;
523  src[l4] +=(3*v)>>3;
524  src[l5] -=(3*v)>>3;
525  src[l6] -=v>>2;
526  src[l7] -=v>>3;
527  }
528  src++;
529  }
530 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
531 }
532 
533 #if !TEMPLATE_PP_ALTIVEC
534 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
535 {
536 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
537 /*
538  uint8_t tmp[16];
539  const int l1= stride;
540  const int l2= stride + l1;
541  const int l3= stride + l2;
542  const int l4= (int)tmp - (int)src - stride*3;
543  const int l5= (int)tmp - (int)src - stride*3 + 8;
544  const int l6= stride*3 + l3;
545  const int l7= stride + l6;
546  const int l8= stride + l7;
547 
548  memcpy(tmp, src+stride*7, 8);
549  memcpy(tmp+8, src+stride*8, 8);
550 */
551  src+= stride*4;
552  __asm__ volatile(
553 
554 #if 0 //slightly more accurate and slightly slower
555  "pxor %%mm7, %%mm7 \n\t" // 0
556  "lea (%0, %1), %%"FF_REG_a" \n\t"
557  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
558 // 0 1 2 3 4 5 6 7
559 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
560 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
561 
562 
563  "movq (%0, %1, 2), %%mm0 \n\t" // l2
564  "movq (%0), %%mm1 \n\t" // l0
565  "movq %%mm0, %%mm2 \n\t" // l2
566  PAVGB(%%mm7, %%mm0) // ~l2/2
567  PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4
568  PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8
569 
570  "movq (%%"FF_REG_a"), %%mm1 \n\t" // l1
571  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t" // l3
572  "movq %%mm1, %%mm4 \n\t" // l1
573  PAVGB(%%mm7, %%mm1) // ~l1/2
574  PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4
575  PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8
576 
577  "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8
578  "psubusb %%mm1, %%mm0 \n\t"
579  "psubusb %%mm4, %%mm1 \n\t"
580  "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8
581 // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0
582 
583  "movq (%0, %1, 4), %%mm0 \n\t" // l4
584  "movq %%mm0, %%mm4 \n\t" // l4
585  PAVGB(%%mm7, %%mm0) // ~l4/2
586  PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4
587  PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8
588 
589  "movq (%%"FF_REG_c"), %%mm2 \n\t" // l5
590  "movq %%mm3, %%mm5 \n\t" // l3
591  PAVGB(%%mm7, %%mm3) // ~l3/2
592  PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4
593  PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8
594 
595  "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8
596  "psubusb %%mm3, %%mm0 \n\t"
597  "psubusb %%mm6, %%mm3 \n\t"
598  "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8
599  "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5)
600 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0
601 
602  "movq (%%"FF_REG_c", %1), %%mm6 \n\t" // l6
603  "movq %%mm6, %%mm5 \n\t" // l6
604  PAVGB(%%mm7, %%mm6) // ~l6/2
605  PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4
606  PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8
607 
608  "movq (%%"FF_REG_c", %1, 2), %%mm5 \n\t" // l7
609  "movq %%mm2, %%mm4 \n\t" // l5
610  PAVGB(%%mm7, %%mm2) // ~l5/2
611  PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4
612  PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8
613 
614  "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8
615  "psubusb %%mm2, %%mm6 \n\t"
616  "psubusb %%mm4, %%mm2 \n\t"
617  "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8
618 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0
619 
620 
621  PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8
622  "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ?
623  "paddusb "MANGLE(b01)", %%mm4 \n\t"
624  "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP
625  "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8
626  "pand %%mm4, %%mm3 \n\t"
627 
628  "movq %%mm3, %%mm1 \n\t"
629 // "psubusb "MANGLE(b01)", %%mm3 \n\t"
630  PAVGB(%%mm7, %%mm3)
631  PAVGB(%%mm7, %%mm3)
632  "paddusb %%mm1, %%mm3 \n\t"
633 // "paddusb "MANGLE(b01)", %%mm3 \n\t"
634 
635  "movq (%%"FF_REG_a", %1, 2), %%mm6 \n\t" //l3
636  "movq (%0, %1, 4), %%mm5 \n\t" //l4
637  "movq (%0, %1, 4), %%mm4 \n\t" //l4
638  "psubusb %%mm6, %%mm5 \n\t"
639  "psubusb %%mm4, %%mm6 \n\t"
640  "por %%mm6, %%mm5 \n\t" // |l3-l4|
641  "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4)
642  "pxor %%mm6, %%mm0 \n\t"
643  "pand %%mm0, %%mm3 \n\t"
644  PMINUB(%%mm5, %%mm3, %%mm0)
645 
646  "psubusb "MANGLE(b01)", %%mm3 \n\t"
647  PAVGB(%%mm7, %%mm3)
648 
649  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
650  "movq (%0, %1, 4), %%mm2 \n\t"
651  "pxor %%mm6, %%mm0 \n\t"
652  "pxor %%mm6, %%mm2 \n\t"
653  "psubb %%mm3, %%mm0 \n\t"
654  "paddb %%mm3, %%mm2 \n\t"
655  "pxor %%mm6, %%mm0 \n\t"
656  "pxor %%mm6, %%mm2 \n\t"
657  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
658  "movq %%mm2, (%0, %1, 4) \n\t"
659 #endif //0
660 
661  "lea (%0, %1), %%"FF_REG_a" \n\t"
662  "pcmpeqb %%mm6, %%mm6 \n\t" // -1
663 // 0 1 2 3 4 5 6 7
664 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1
665 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1
666 
667 
668  "movq (%%"FF_REG_a", %1, 2), %%mm1 \n\t" // l3
669  "movq (%0, %1, 4), %%mm0 \n\t" // l4
670  "pxor %%mm6, %%mm1 \n\t" // -l3-1
671  PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2
672 // mm1=-l3-1, mm0=128-q
673 
674  "movq (%%"FF_REG_a", %1, 4), %%mm2 \n\t" // l5
675  "movq (%%"FF_REG_a", %1), %%mm3 \n\t" // l2
676  "pxor %%mm6, %%mm2 \n\t" // -l5-1
677  "movq %%mm2, %%mm5 \n\t" // -l5-1
678  "movq "MANGLE(b80)", %%mm4 \n\t" // 128
679  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
680  PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2
681  PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128
682  PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128
683  PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128
684 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1
685 
686  "movq (%%"FF_REG_a"), %%mm2 \n\t" // l1
687  "pxor %%mm6, %%mm2 \n\t" // -l1-1
688  PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2
689  PAVGB((%0), %%mm1) // (l0-l3+256)/2
690  "movq "MANGLE(b80)", %%mm3 \n\t" // 128
691  PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128
692  PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128
693  PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128
694 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1
695 
696  PAVGB((%%FF_REGc, %1), %%mm5) // (l6-l5+256)/2
697  "movq (%%"FF_REG_c", %1, 2), %%mm1 \n\t" // l7
698  "pxor %%mm6, %%mm1 \n\t" // -l7-1
699  PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2
700  "movq "MANGLE(b80)", %%mm2 \n\t" // 128
701  PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128
702  PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128
703  PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128
704 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128
705 
706  "movq "MANGLE(b00)", %%mm1 \n\t" // 0
707  "movq "MANGLE(b00)", %%mm5 \n\t" // 0
708  "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16
709  "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16
710  PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16|
711  PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16|
712  PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16
713 
714 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128
715 
716  "movq "MANGLE(b00)", %%mm7 \n\t" // 0
717  "movq %2, %%mm2 \n\t" // QP
718  PAVGB(%%mm6, %%mm2) // 128 + QP/2
719  "psubb %%mm6, %%mm2 \n\t"
720 
721  "movq %%mm4, %%mm1 \n\t"
722  "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy)
723  "pxor %%mm1, %%mm4 \n\t"
724  "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16
725  "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2
726  "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16
727 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16
728 
729  "movq %%mm4, %%mm3 \n\t" // d
730  "psubusb "MANGLE(b01)", %%mm4 \n\t"
731  PAVGB(%%mm7, %%mm4) // d/32
732  PAVGB(%%mm7, %%mm4) // (d + 32)/64
733  "paddb %%mm3, %%mm4 \n\t" // 5d/64
734  "pand %%mm2, %%mm4 \n\t"
735 
736  "movq "MANGLE(b80)", %%mm5 \n\t" // 128
737  "psubb %%mm0, %%mm5 \n\t" // q
738  "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding
739  "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q)
740  "pxor %%mm7, %%mm5 \n\t"
741 
742  PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64)
743  "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q)
744 
745  "pand %%mm7, %%mm4 \n\t"
746  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
747  "movq (%0, %1, 4), %%mm2 \n\t"
748  "pxor %%mm1, %%mm0 \n\t"
749  "pxor %%mm1, %%mm2 \n\t"
750  "paddb %%mm4, %%mm0 \n\t"
751  "psubb %%mm4, %%mm2 \n\t"
752  "pxor %%mm1, %%mm0 \n\t"
753  "pxor %%mm1, %%mm2 \n\t"
754  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
755  "movq %%mm2, (%0, %1, 4) \n\t"
756 
757  :
758  : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
759  NAMED_CONSTRAINTS_ADD(b80,b00,b01)
760  : "%"FF_REG_a, "%"FF_REG_c
761  );
762 
763 /*
764  {
765  int x;
766  src-= stride;
767  for(x=0; x<BLOCK_SIZE; x++){
768  const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
769  if(FFABS(middleEnergy)< 8*QP){
770  const int q=(src[l4] - src[l5])/2;
771  const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
772  const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
773 
774  int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
775  d= FFMAX(d, 0);
776 
777  d= (5*d + 32) >> 6;
778  d*= FFSIGN(-middleEnergy);
779 
780  if(q>0){
781  d= d<0 ? 0 : d;
782  d= d>q ? q : d;
783  }else{
784  d= d>0 ? 0 : d;
785  d= d<q ? q : d;
786  }
787 
788  src[l4]-= d;
789  src[l5]+= d;
790  }
791  src++;
792  }
793  src-=8;
794  for(x=0; x<8; x++){
795  int y;
796  for(y=4; y<6; y++){
797  int d= src[x+y*stride] - tmp[x+(y-4)*8];
798  int ad= FFABS(d);
799  static int max=0;
800  static int sum=0;
801  static int num=0;
802  static int bias=0;
803 
804  if(max<ad) max=ad;
805  sum+= ad>3 ? 1 : 0;
806  if(ad>3){
807  src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255;
808  }
809  if(y==4) bias+=d;
810  num++;
811  if(num%1000000 == 0){
812  av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias);
813  }
814  }
815  }
816 }
817 */
818 #elif TEMPLATE_PP_MMX
819  DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars
820  src+= stride*4;
821  __asm__ volatile(
822  "pxor %%mm7, %%mm7 \n\t"
823 // 0 1 2 3 4 5 6 7
824 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1
825 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1
826 
827  "movq (%0), %%mm0 \n\t"
828  "movq %%mm0, %%mm1 \n\t"
829  "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
830  "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
831 
832  "movq (%0, %1), %%mm2 \n\t"
833  "lea (%0, %1, 2), %%"FF_REG_a" \n\t"
834  "movq %%mm2, %%mm3 \n\t"
835  "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
836  "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
837 
838  "movq (%%"FF_REG_a"), %%mm4 \n\t"
839  "movq %%mm4, %%mm5 \n\t"
840  "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
841  "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
842 
843  "paddw %%mm0, %%mm0 \n\t" // 2L0
844  "paddw %%mm1, %%mm1 \n\t" // 2H0
845  "psubw %%mm4, %%mm2 \n\t" // L1 - L2
846  "psubw %%mm5, %%mm3 \n\t" // H1 - H2
847  "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
848  "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
849 
850  "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
851  "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
852  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
853  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
854 
855  "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
856  "movq %%mm2, %%mm3 \n\t"
857  "punpcklbw %%mm7, %%mm2 \n\t" // L3
858  "punpckhbw %%mm7, %%mm3 \n\t" // H3
859 
860  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
861  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
862  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
863  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
864  "movq %%mm0, (%3) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
865  "movq %%mm1, 8(%3) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
866 
867  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
868  "movq %%mm0, %%mm1 \n\t"
869  "punpcklbw %%mm7, %%mm0 \n\t" // L4
870  "punpckhbw %%mm7, %%mm1 \n\t" // H4
871 
872  "psubw %%mm0, %%mm2 \n\t" // L3 - L4
873  "psubw %%mm1, %%mm3 \n\t" // H3 - H4
874  "movq %%mm2, 16(%3) \n\t" // L3 - L4
875  "movq %%mm3, 24(%3) \n\t" // H3 - H4
876  "paddw %%mm4, %%mm4 \n\t" // 2L2
877  "paddw %%mm5, %%mm5 \n\t" // 2H2
878  "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
879  "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
880 
881  "lea (%%"FF_REG_a", %1), %0 \n\t"
882  "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
883  "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
884  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
885  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
886 //50 opcodes so far
887  "movq (%0, %1, 2), %%mm2 \n\t"
888  "movq %%mm2, %%mm3 \n\t"
889  "punpcklbw %%mm7, %%mm2 \n\t" // L5
890  "punpckhbw %%mm7, %%mm3 \n\t" // H5
891  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
892  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
893  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
894  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
895 
896  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
897  "punpcklbw %%mm7, %%mm6 \n\t" // L6
898  "psubw %%mm6, %%mm2 \n\t" // L5 - L6
899  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
900  "punpckhbw %%mm7, %%mm6 \n\t" // H6
901  "psubw %%mm6, %%mm3 \n\t" // H5 - H6
902 
903  "paddw %%mm0, %%mm0 \n\t" // 2L4
904  "paddw %%mm1, %%mm1 \n\t" // 2H4
905  "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
906  "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
907 
908  "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
909  "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
910  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
911  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
912 
913  "movq (%0, %1, 4), %%mm2 \n\t"
914  "movq %%mm2, %%mm3 \n\t"
915  "punpcklbw %%mm7, %%mm2 \n\t" // L7
916  "punpckhbw %%mm7, %%mm3 \n\t" // H7
917 
918  "paddw %%mm2, %%mm2 \n\t" // 2L7
919  "paddw %%mm3, %%mm3 \n\t" // 2H7
920  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
921  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
922 
923  "movq (%3), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
924  "movq 8(%3), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
925 
926 #if TEMPLATE_PP_MMXEXT
927  "movq %%mm7, %%mm6 \n\t" // 0
928  "psubw %%mm0, %%mm6 \n\t"
929  "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
930  "movq %%mm7, %%mm6 \n\t" // 0
931  "psubw %%mm1, %%mm6 \n\t"
932  "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
933  "movq %%mm7, %%mm6 \n\t" // 0
934  "psubw %%mm2, %%mm6 \n\t"
935  "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
936  "movq %%mm7, %%mm6 \n\t" // 0
937  "psubw %%mm3, %%mm6 \n\t"
938  "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
939 #else
940  "movq %%mm7, %%mm6 \n\t" // 0
941  "pcmpgtw %%mm0, %%mm6 \n\t"
942  "pxor %%mm6, %%mm0 \n\t"
943  "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
944  "movq %%mm7, %%mm6 \n\t" // 0
945  "pcmpgtw %%mm1, %%mm6 \n\t"
946  "pxor %%mm6, %%mm1 \n\t"
947  "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
948  "movq %%mm7, %%mm6 \n\t" // 0
949  "pcmpgtw %%mm2, %%mm6 \n\t"
950  "pxor %%mm6, %%mm2 \n\t"
951  "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
952  "movq %%mm7, %%mm6 \n\t" // 0
953  "pcmpgtw %%mm3, %%mm6 \n\t"
954  "pxor %%mm6, %%mm3 \n\t"
955  "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
956 #endif
957 
958 #if TEMPLATE_PP_MMXEXT
959  "pminsw %%mm2, %%mm0 \n\t"
960  "pminsw %%mm3, %%mm1 \n\t"
961 #else
962  "movq %%mm0, %%mm6 \n\t"
963  "psubusw %%mm2, %%mm6 \n\t"
964  "psubw %%mm6, %%mm0 \n\t"
965  "movq %%mm1, %%mm6 \n\t"
966  "psubusw %%mm3, %%mm6 \n\t"
967  "psubw %%mm6, %%mm1 \n\t"
968 #endif
969 
970  "movd %2, %%mm2 \n\t" // QP
971  "punpcklbw %%mm7, %%mm2 \n\t"
972 
973  "movq %%mm7, %%mm6 \n\t" // 0
974  "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
975  "pxor %%mm6, %%mm4 \n\t"
976  "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
977  "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
978  "pxor %%mm7, %%mm5 \n\t"
979  "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
980 // 100 opcodes
981  "psllw $3, %%mm2 \n\t" // 8QP
982  "movq %%mm2, %%mm3 \n\t" // 8QP
983  "pcmpgtw %%mm4, %%mm2 \n\t"
984  "pcmpgtw %%mm5, %%mm3 \n\t"
985  "pand %%mm2, %%mm4 \n\t"
986  "pand %%mm3, %%mm5 \n\t"
987 
988 
989  "psubusw %%mm0, %%mm4 \n\t" // hd
990  "psubusw %%mm1, %%mm5 \n\t" // ld
991 
992 
993  "movq "MANGLE(w05)", %%mm2 \n\t" // 5
994  "pmullw %%mm2, %%mm4 \n\t"
995  "pmullw %%mm2, %%mm5 \n\t"
996  "movq "MANGLE(w20)", %%mm2 \n\t" // 32
997  "paddw %%mm2, %%mm4 \n\t"
998  "paddw %%mm2, %%mm5 \n\t"
999  "psrlw $6, %%mm4 \n\t"
1000  "psrlw $6, %%mm5 \n\t"
1001 
1002  "movq 16(%3), %%mm0 \n\t" // L3 - L4
1003  "movq 24(%3), %%mm1 \n\t" // H3 - H4
1004 
1005  "pxor %%mm2, %%mm2 \n\t"
1006  "pxor %%mm3, %%mm3 \n\t"
1007 
1008  "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
1009  "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
1010  "pxor %%mm2, %%mm0 \n\t"
1011  "pxor %%mm3, %%mm1 \n\t"
1012  "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
1013  "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
1014  "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
1015  "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
1016 
1017  "pxor %%mm6, %%mm2 \n\t"
1018  "pxor %%mm7, %%mm3 \n\t"
1019  "pand %%mm2, %%mm4 \n\t"
1020  "pand %%mm3, %%mm5 \n\t"
1021 
1022 #if TEMPLATE_PP_MMXEXT
1023  "pminsw %%mm0, %%mm4 \n\t"
1024  "pminsw %%mm1, %%mm5 \n\t"
1025 #else
1026  "movq %%mm4, %%mm2 \n\t"
1027  "psubusw %%mm0, %%mm2 \n\t"
1028  "psubw %%mm2, %%mm4 \n\t"
1029  "movq %%mm5, %%mm2 \n\t"
1030  "psubusw %%mm1, %%mm2 \n\t"
1031  "psubw %%mm2, %%mm5 \n\t"
1032 #endif
1033  "pxor %%mm6, %%mm4 \n\t"
1034  "pxor %%mm7, %%mm5 \n\t"
1035  "psubw %%mm6, %%mm4 \n\t"
1036  "psubw %%mm7, %%mm5 \n\t"
1037  "packsswb %%mm5, %%mm4 \n\t"
1038  "movq (%0), %%mm0 \n\t"
1039  "paddb %%mm4, %%mm0 \n\t"
1040  "movq %%mm0, (%0) \n\t"
1041  "movq (%0, %1), %%mm0 \n\t"
1042  "psubb %%mm4, %%mm0 \n\t"
1043  "movq %%mm0, (%0, %1) \n\t"
1044 
1045  : "+r" (src)
1046  : "r" ((x86_reg)stride), "m" (c->pQPb), "r"(tmp)
1047  NAMED_CONSTRAINTS_ADD(w05,w20)
1048  : "%"FF_REG_a
1049  );
1050 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1051  const int l1= stride;
1052  const int l2= stride + l1;
1053  const int l3= stride + l2;
1054  const int l4= stride + l3;
1055  const int l5= stride + l4;
1056  const int l6= stride + l5;
1057  const int l7= stride + l6;
1058  const int l8= stride + l7;
1059 // const int l9= stride + l8;
1060  int x;
1061  src+= stride*3;
1062  for(x=0; x<BLOCK_SIZE; x++){
1063  const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
1064  if(FFABS(middleEnergy) < 8*c->QP){
1065  const int q=(src[l4] - src[l5])/2;
1066  const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
1067  const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
1068 
1069  int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
1070  d= FFMAX(d, 0);
1071 
1072  d= (5*d + 32) >> 6;
1073  d*= FFSIGN(-middleEnergy);
1074 
1075  if(q>0){
1076  d = FFMAX(d, 0);
1077  d = FFMIN(d, q);
1078  }else{
1079  d = FFMIN(d, 0);
1080  d = FFMAX(d, q);
1081  }
1082 
1083  src[l4]-= d;
1084  src[l5]+= d;
1085  }
1086  src++;
1087  }
1088 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1089 }
1090 #endif //TEMPLATE_PP_ALTIVEC
1091 
1092 #if !TEMPLATE_PP_ALTIVEC
1093 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
1094 {
/*
 * Dering filter for one block.
 *
 * src    - pixel data; both code paths read one column left of src (offset -1)
 *          and rows 0..9, and modify rows 1..8.
 * stride - distance in bytes between two consecutive rows.
 * c      - context; the asm path reads c->pQPb and uses c->pQPb2, the C path
 *          reads c->QP.
 *
 * The block is left untouched when (max - min) of the examined pixels is
 * below deringThreshold.
 */
1095 #if HAVE_7REGS && (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW)
1096  DECLARE_ALIGNED(8, uint64_t, tmp)[3];
1097  __asm__ volatile(
// Operands: %0 = src, %1 = stride, %2 = c->pQPb, %3 = c->pQPb2, %4 = tmp.
// mm6 = 0, mm7 = all ones. mm0 = (low bytes of c->pQPb widened to words >> 1)
// minus (-1), i.e. QP/2 + 1 per word, packed back to bytes and stored to %3.
// NOTE(review): %3 is written here although it is listed as an input operand
// and the statement has no "memory" clobber -- confirm this is intended.
1098  "pxor %%mm6, %%mm6 \n\t"
1099  "pcmpeqb %%mm7, %%mm7 \n\t"
1100  "movq %2, %%mm0 \n\t"
1101  "punpcklbw %%mm6, %%mm0 \n\t"
1102  "psrlw $1, %%mm0 \n\t"
1103  "psubw %%mm7, %%mm0 \n\t"
1104  "packuswb %%mm0, %%mm0 \n\t"
1105  "movq %%mm0, %3 \n\t"
1106 
1107  "lea (%0, %1), %%"FF_REG_a" \n\t"
1108  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1109 
1110 // 0 1 2 3 4 5 6 7 8 9
1111 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1112 
// FIND_MIN_MAX(addr): fold the 8 bytes at addr into the running per-column
// maximum (mm6, starts at 0) and minimum (mm7, starts at all ones).
1113 #undef REAL_FIND_MIN_MAX
1114 #undef FIND_MIN_MAX
1115 #if TEMPLATE_PP_MMXEXT
1116 #define REAL_FIND_MIN_MAX(addr)\
1117  "movq " #addr ", %%mm0 \n\t"\
1118  "pminub %%mm0, %%mm7 \n\t"\
1119  "pmaxub %%mm0, %%mm6 \n\t"
1120 #else
1121 #define REAL_FIND_MIN_MAX(addr)\
1122  "movq " #addr ", %%mm0 \n\t"\
1123  "movq %%mm7, %%mm1 \n\t"\
1124  "psubusb %%mm0, %%mm6 \n\t"\
1125  "paddb %%mm0, %%mm6 \n\t"\
1126  "psubusb %%mm0, %%mm1 \n\t"\
1127  "psubb %%mm1, %%mm7 \n\t"
1128 #endif
1129 #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr)
1130 
// Rows 1..8 of the block (row 0 is %0 itself and is not included).
1131 FIND_MIN_MAX((%%FF_REGa))
1132 FIND_MIN_MAX((%%FF_REGa, %1))
1133 FIND_MIN_MAX((%%FF_REGa, %1, 2))
1134 FIND_MIN_MAX((%0, %1, 4))
1135 FIND_MIN_MAX((%%FF_REGd))
1136 FIND_MIN_MAX((%%FF_REGd, %1))
1137 FIND_MIN_MAX((%%FF_REGd, %1, 2))
1138 FIND_MIN_MAX((%0, %1, 8))
1139 
// Horizontal reduction of the 8 column minima in mm7 (shift by 1, 2, 4 bytes
// and take the minimum each time); the low byte ends up as the block minimum.
1140  "movq %%mm7, %%mm4 \n\t"
1141  "psrlq $8, %%mm7 \n\t"
1142 #if TEMPLATE_PP_MMXEXT
1143  "pminub %%mm4, %%mm7 \n\t" // min of pixels
1144  "pshufw $0xF9, %%mm7, %%mm4 \n\t"
1145  "pminub %%mm4, %%mm7 \n\t" // min of pixels
1146  "pshufw $0xFE, %%mm7, %%mm4 \n\t"
1147  "pminub %%mm4, %%mm7 \n\t"
1148 #else
1149  "movq %%mm7, %%mm1 \n\t"
1150  "psubusb %%mm4, %%mm1 \n\t"
1151  "psubb %%mm1, %%mm7 \n\t"
1152  "movq %%mm7, %%mm4 \n\t"
1153  "psrlq $16, %%mm7 \n\t"
1154  "movq %%mm7, %%mm1 \n\t"
1155  "psubusb %%mm4, %%mm1 \n\t"
1156  "psubb %%mm1, %%mm7 \n\t"
1157  "movq %%mm7, %%mm4 \n\t"
1158  "psrlq $32, %%mm7 \n\t"
1159  "movq %%mm7, %%mm1 \n\t"
1160  "psubusb %%mm4, %%mm1 \n\t"
1161  "psubb %%mm1, %%mm7 \n\t"
1162 #endif
1163 
1164 
// Same reduction for the column maxima in mm6.
1165  "movq %%mm6, %%mm4 \n\t"
1166  "psrlq $8, %%mm6 \n\t"
1167 #if TEMPLATE_PP_MMXEXT
1168  "pmaxub %%mm4, %%mm6 \n\t" // max of pixels
1169  "pshufw $0xF9, %%mm6, %%mm4 \n\t"
1170  "pmaxub %%mm4, %%mm6 \n\t"
1171  "pshufw $0xFE, %%mm6, %%mm4 \n\t"
1172  "pmaxub %%mm4, %%mm6 \n\t"
1173 #else
1174  "psubusb %%mm4, %%mm6 \n\t"
1175  "paddb %%mm4, %%mm6 \n\t"
1176  "movq %%mm6, %%mm4 \n\t"
1177  "psrlq $16, %%mm6 \n\t"
1178  "psubusb %%mm4, %%mm6 \n\t"
1179  "paddb %%mm4, %%mm6 \n\t"
1180  "movq %%mm6, %%mm4 \n\t"
1181  "psrlq $32, %%mm6 \n\t"
1182  "psubusb %%mm4, %%mm6 \n\t"
1183  "paddb %%mm4, %%mm6 \n\t"
1184 #endif
1185  "movq %%mm6, %%mm0 \n\t" // max
1186  "psubb %%mm7, %%mm6 \n\t" // max - min
// The row pointer in the a register is saved on the stack because eax is
// needed for the byte compare; skip the whole filter (label 1) when
// (max - min) is below deringThreshold.
// NOTE(review): push/pop inside inline asm moves the stack pointer behind the
// compiler's back -- confirm this is safe for all supported ABIs.
1187  "push %%"FF_REG_a" \n\t"
1188  "movd %%mm6, %%eax \n\t"
1189  "cmpb "MANGLE(deringThreshold)", %%al \n\t"
1190  "pop %%"FF_REG_a" \n\t"
1191  " jb 1f \n\t"
1192  PAVGB(%%mm0, %%mm7) // a=(max + min)/2
// Replicate the low byte of mm7 into all 8 bytes and keep it in tmp[0].
1193  "punpcklbw %%mm7, %%mm7 \n\t"
1194  "punpcklbw %%mm7, %%mm7 \n\t"
1195  "punpcklbw %%mm7, %%mm7 \n\t"
1196  "movq %%mm7, (%4) \n\t"
1197 
// Row 0: build the left (L00) and right (L20) neighbour vectors from the
// bytes just outside the 8-byte row, the horizontally smoothed row in mm1 and
// the summed "not above a" masks in mm0.
1198  "movq (%0), %%mm0 \n\t" // L10
1199  "movq %%mm0, %%mm1 \n\t" // L10
1200  "movq %%mm0, %%mm2 \n\t" // L10
1201  "psllq $8, %%mm1 \n\t"
1202  "psrlq $8, %%mm2 \n\t"
1203  "movd -4(%0), %%mm3 \n\t"
1204  "movd 8(%0), %%mm4 \n\t"
1205  "psrlq $24, %%mm3 \n\t"
1206  "psllq $56, %%mm4 \n\t"
1207  "por %%mm3, %%mm1 \n\t" // L00
1208  "por %%mm4, %%mm2 \n\t" // L20
1209  "movq %%mm1, %%mm3 \n\t" // L00
1210  PAVGB(%%mm2, %%mm1) // (L20 + L00)/2
1211  PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4
1212  "psubusb %%mm7, %%mm0 \n\t"
1213  "psubusb %%mm7, %%mm2 \n\t"
1214  "psubusb %%mm7, %%mm3 \n\t"
1215  "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1
1216  "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1
1217  "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1
1218  "paddb %%mm2, %%mm0 \n\t"
1219  "paddb %%mm3, %%mm0 \n\t"
1220 
// Row 1: same computation, results in mm3 (smoothed row) and mm2 (mask sum).
1221  "movq (%%"FF_REG_a"), %%mm2 \n\t" // L11
1222  "movq %%mm2, %%mm3 \n\t" // L11
1223  "movq %%mm2, %%mm4 \n\t" // L11
1224  "psllq $8, %%mm3 \n\t"
1225  "psrlq $8, %%mm4 \n\t"
1226  "movd -4(%%"FF_REG_a"), %%mm5 \n\t"
1227  "movd 8(%%"FF_REG_a"), %%mm6 \n\t"
1228  "psrlq $24, %%mm5 \n\t"
1229  "psllq $56, %%mm6 \n\t"
1230  "por %%mm5, %%mm3 \n\t" // L01
1231  "por %%mm6, %%mm4 \n\t" // L21
1232  "movq %%mm3, %%mm5 \n\t" // L01
1233  PAVGB(%%mm4, %%mm3) // (L21 + L01)/2
1234  PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4
1235  "psubusb %%mm7, %%mm2 \n\t"
1236  "psubusb %%mm7, %%mm4 \n\t"
1237  "psubusb %%mm7, %%mm5 \n\t"
1238  "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1
1239  "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1
1240  "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1
1241  "paddb %%mm4, %%mm2 \n\t"
1242  "paddb %%mm5, %%mm2 \n\t"
1243 // 0, 2, 3, 1
// DERING_CORE(dst, src, ...): processes row src the same way as rows 0/1
// above, combines it with the two previous rows (pplx/plx smoothed rows,
// ppsx/psx mask sums) and conditionally rewrites row dst. The candidate value
// is bounded using dst -/+ %3 (via PMAXUB/PMINUB, defined outside this
// function). (%4) holds the replicated threshold average, 8(%4) is scratch.
1244 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1245  "movq " #src ", " #sx " \n\t" /* src[0] */\
1246  "movq " #sx ", " #lx " \n\t" /* src[0] */\
1247  "movq " #sx ", " #t0 " \n\t" /* src[0] */\
1248  "psllq $8, " #lx " \n\t"\
1249  "psrlq $8, " #t0 " \n\t"\
1250  "movd -4" #src ", " #t1 " \n\t"\
1251  "psrlq $24, " #t1 " \n\t"\
1252  "por " #t1 ", " #lx " \n\t" /* src[-1] */\
1253  "movd 8" #src ", " #t1 " \n\t"\
1254  "psllq $56, " #t1 " \n\t"\
1255  "por " #t1 ", " #t0 " \n\t" /* src[+1] */\
1256  "movq " #lx ", " #t1 " \n\t" /* src[-1] */\
1257  PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\
1258  PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\
1259  PAVGB(lx, pplx) \
1260  "movq " #lx ", 8(%4) \n\t"\
1261  "movq (%4), " #lx " \n\t"\
1262  "psubusb " #lx ", " #t1 " \n\t"\
1263  "psubusb " #lx ", " #t0 " \n\t"\
1264  "psubusb " #lx ", " #sx " \n\t"\
1265  "movq "MANGLE(b00)", " #lx " \n\t"\
1266  "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\
1267  "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\
1268  "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\
1269  "paddb " #t1 ", " #t0 " \n\t"\
1270  "paddb " #t0 ", " #sx " \n\t"\
1271 \
1272  PAVGB(plx, pplx) /* filtered */\
1273  "movq " #dst ", " #t0 " \n\t" /* dst */\
1274  "movq " #t0 ", " #t1 " \n\t" /* dst */\
1275  "psubusb %3, " #t0 " \n\t"\
1276  "paddusb %3, " #t1 " \n\t"\
1277  PMAXUB(t0, pplx)\
1278  PMINUB(t1, pplx, t0)\
1279  "paddb " #sx ", " #ppsx " \n\t"\
1280  "paddb " #psx ", " #ppsx " \n\t"\
1281  "#paddb "MANGLE(b02)", " #ppsx " \n\t"\
1282  "pand "MANGLE(b08)", " #ppsx " \n\t"\
1283  "pcmpeqb " #lx ", " #ppsx " \n\t"\
1284  "pand " #ppsx ", " #pplx " \n\t"\
1285  "pandn " #dst ", " #ppsx " \n\t"\
1286  "por " #pplx ", " #ppsx " \n\t"\
1287  "movq " #ppsx ", " #dst " \n\t"\
1288  "movq 8(%4), " #lx " \n\t"
1289 
1290 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
1291  REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
1292 /*
1293 0000000
1294 1111111
1295 
1296 1111110
1297 1111101
1298 1111100
1299 1111011
1300 1111010
1301 1111001
1302 
1303 1111000
1304 1110111
1305 
1306 */
// Eight invocations, one per written row (rows 1..8); the register roles
// rotate so that each call reuses the two previous rows' results.
1307 //DERING_CORE(dst ,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1)
1308 DERING_CORE((%%FF_REGa) ,(%%FF_REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1309 DERING_CORE((%%FF_REGa, %1) ,(%%FF_REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1310 DERING_CORE((%%FF_REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1311 DERING_CORE((%0, %1, 4) ,(%%FF_REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1312 DERING_CORE((%%FF_REGd) ,(%%FF_REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1313 DERING_CORE((%%FF_REGd, %1) ,(%%FF_REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
1314 DERING_CORE((%%FF_REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
1315 DERING_CORE((%0, %1, 8) ,(%%FF_REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
1316 
1317  "1: \n\t"
1318  : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2), "q"(tmp)
1319  NAMED_CONSTRAINTS_ADD(deringThreshold,b00,b02,b08)
1320  : "%"FF_REG_a, "%"FF_REG_d
1321  );
1322 #else // HAVE_7REGS && (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW)
1323  int y;
1324  int min=255;
1325  int max=0;
1326  int avg;
1327  uint8_t *p;
1328  int s[10];
// Maximum distance a filtered pixel may move away from its original value.
1329  const int QP2= c->QP/2 + 1;
1330 
// Shift the origin one pixel to the left so that column index 1 below is the
// first column of the block and index 0 is its left neighbour.
1331  src --;
// Pass 1: min/max over the 8x8 area at rows 1..8, (shifted) columns 1..8.
1332  for(y=1; y<9; y++){
1333  int x;
1334  p= src + stride*y;
1335  for(x=1; x<9; x++){
1336  p++;
1337  if(*p > max) max= *p;
1338  if(*p < min) min= *p;
1339  }
1340  }
1341  avg= (min + max + 1)>>1;
1342 
1343  if(max - min <deringThreshold) return;
1344 
// Pass 2: for each of the 10 rows build a bitmask over 10 columns. Bits 0..9
// mark pixels above avg, bits 16..25 mark pixels not above avg; a bit is then
// kept only if its left and right neighbours are set in the same half.
1345  for(y=0; y<10; y++){
1346  int t = 0;
1347 
1348  if(src[stride*y + 0] > avg) t+= 1;
1349  if(src[stride*y + 1] > avg) t+= 2;
1350  if(src[stride*y + 2] > avg) t+= 4;
1351  if(src[stride*y + 3] > avg) t+= 8;
1352  if(src[stride*y + 4] > avg) t+= 16;
1353  if(src[stride*y + 5] > avg) t+= 32;
1354  if(src[stride*y + 6] > avg) t+= 64;
1355  if(src[stride*y + 7] > avg) t+= 128;
1356  if(src[stride*y + 8] > avg) t+= 256;
1357  if(src[stride*y + 9] > avg) t+= 512;
1358 
1359  t |= (~t)<<16;
1360  t &= (t<<1) & (t>>1);
1361  s[y] = t;
1362  }
1363 
// Pass 3: AND three consecutive rows and merge both halves, so bit x of
// s[y-1] is set when the whole 3x3 neighbourhood of (x, y) lies on the same
// side of avg. s[y-1] is only overwritten after its last use as an input.
1364  for(y=1; y<9; y++){
1365  int t = s[y-1] & s[y] & s[y+1];
1366  t|= t>>16;
1367  s[y-1]= t;
1368  }
1369 
// Pass 4: filter the selected pixels in place.
1370  for(y=1; y<9; y++){
1371  int x;
1372  int t = s[y-1];
1373 
1374  p= src + stride*y;
1375  for(x=1; x<9; x++){
1376  p++;
1377  if(t & (1<<x)){
// 3x3 kernel (1 2 1 / 2 4 2 / 1 2 1) with rounding, divided by 16.
1378  int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
1379  +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
1380  +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
1381  f= (f + 8)>>4;
1382 
// Debug-only statistics.
// NOTE(review): `QP` below is not declared anywhere in this function (only
// QP2 and c->QP are); unless it is defined elsewhere this code will not
// compile with DEBUG_DERING_THRESHOLD set -- confirm.
1383 #ifdef DEBUG_DERING_THRESHOLD
1384  __asm__ volatile("emms\n\t":);
1385  {
1386  static uint64_t numPixels=0;
1387  if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
1388 // if((max-min)<20 || (max-min)*QP<200)
1389 // if((max-min)*QP < 500)
1390 // if(max-min<QP/2)
1391  if(max-min < 20){
1392  static int numSkipped=0;
1393  static int errorSum=0;
1394  static int worstQP=0;
1395  static int worstRange=0;
1396  static int worstDiff=0;
1397  int diff= (f - *p);
1398  int absDiff= FFABS(diff);
1399  int error= diff*diff;
1400 
1401  if(x==1 || x==8 || y==1 || y==8) continue;
1402 
1403  numSkipped++;
1404  if(absDiff > worstDiff){
1405  worstDiff= absDiff;
1406  worstQP= QP;
1407  worstRange= max-min;
1408  }
1409  errorSum+= error;
1410 
1411  if(1024LL*1024LL*1024LL % numSkipped == 0){
1412  av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
1413  "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
1414  (float)errorSum/numSkipped, numSkipped, worstQP, worstRange,
1415  worstDiff, (float)numSkipped/numPixels);
1416  }
1417  }
1418  }
1419 #endif
// Limit the change to at most QP2 in either direction.
1420  if (*p + QP2 < f) *p= *p + QP2;
1421  else if(*p - QP2 > f) *p= *p - QP2;
1422  else *p=f;
1423  }
1424  }
1425  }
// Debug-only visualisation: brighten blocks whose range is below 20.
// The local `t` in this loop is never used.
1426 #ifdef DEBUG_DERING_THRESHOLD
1427  if(max-min < 20){
1428  for(y=1; y<9; y++){
1429  int x;
1430  int t = 0;
1431  p= src + stride*y;
1432  for(x=1; x<9; x++){
1433  p++;
1434  *p = FFMIN(*p + 20, 255);
1435  }
1436  }
1437 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255;
1438  }
1439 #endif
1440 #endif // HAVE_7REGS && (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW)
1441 }
1442 #endif //TEMPLATE_PP_ALTIVEC
1443 
1444 /**
1445  * Deinterlace the given block by linearly interpolating every second line.
1446  * will be called for every 8x8 block and can read & write from line 4-15
1447  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1448  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1449  */
1450 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
1451 {
/*
 * After advancing src by 4 rows, rows 1, 3, 5 and 7 are overwritten with the
 * byte-wise average of the rows directly above and below them (0/2, 2/4, 4/6
 * and 6/8). The block is 8 pixels wide.
 */
1452 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1453  src+= 4*stride;
1454  __asm__ volatile(
// Operands: %0 = src, %1 = stride. a = row 1, c = row 5.
1455  "lea (%0, %1), %%"FF_REG_a" \n\t"
1456  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_c"\n\t"
1457 // 0 1 2 3 4 5 6 7 8 9
1458 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1
1459 
// mm0 and mm1 alternate as "row above" / "row below"; each even row is loaded
// once and reused for the next output row.
1460  "movq (%0), %%mm0 \n\t"
1461  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
1462  PAVGB(%%mm1, %%mm0)
1463  "movq %%mm0, (%%"FF_REG_a") \n\t"
1464  "movq (%0, %1, 4), %%mm0 \n\t"
1465  PAVGB(%%mm0, %%mm1)
1466  "movq %%mm1, (%%"FF_REG_a", %1, 2) \n\t"
1467  "movq (%%"FF_REG_c", %1), %%mm1 \n\t"
1468  PAVGB(%%mm1, %%mm0)
1469  "movq %%mm0, (%%"FF_REG_c") \n\t"
1470  "movq (%0, %1, 8), %%mm0 \n\t"
1471  PAVGB(%%mm0, %%mm1)
1472  "movq %%mm1, (%%"FF_REG_c", %1, 2) \n\t"
1473 
1474  : : "r" (src), "r" ((x86_reg)stride)
1475  : "%"FF_REG_a, "%"FF_REG_c
1476  );
1477 #else
1478  int a, b, x;
1479  src+= 4*stride;
1480 
// The 8 pixels of a row are handled as two 32-bit words (x = 0, 1).
// (a|b) - (((a^b)&0xFEFEFEFE)>>1) is the average of each byte pair rounded
// up; the mask keeps the shifted bits from leaking into the neighbouring byte.
// NOTE(review): the uint32_t casts type-pun uint8_t data and assume the
// resulting addresses are suitably aligned -- confirm for all callers.
1481  for(x=0; x<2; x++){
1482  a= *(uint32_t*)&src[stride*0];
1483  b= *(uint32_t*)&src[stride*2];
1484  *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1485  a= *(uint32_t*)&src[stride*4];
1486  *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1487  b= *(uint32_t*)&src[stride*6];
1488  *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1489  a= *(uint32_t*)&src[stride*8];
1490  *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1491  src += 4;
1492  }
1493 #endif
1494 }
1495 
1496 /**
1497  * Deinterlace the given block by cubic interpolating every second line.
1498  * will be called for every 8x8 block and can read & write from line 4-15
1499  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1500  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1501  * this filter will read lines 3-15 and write lines 6, 8, 10 and 12
1502  */
1503 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
1504 {
/*
 * After advancing src by 3 rows, rows 3, 5, 7 and 9 are overwritten with a
 * (-1 9 9 -1)/16 combination of the rows 3 above, 1 above, 1 below and
 * 3 below (see the C fallback for the exact taps). Rows 0..12 relative to
 * the advanced src are read. The block is 8 pixels wide.
 */
1505 #if TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1506  src+= stride*3;
1507  __asm__ volatile(
// Operands: %0 = src, %1 = stride. a = row 1, d = row 5, c = row 10.
1508  "lea (%0, %1), %%"FF_REG_a" \n\t"
1509  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1510  "lea (%%"FF_REG_d", %1, 4), %%"FF_REG_c"\n\t"
1511  "add %1, %%"FF_REG_c" \n\t"
// REAL_DEINT_CUBIC(a,b,c,d,e): c = (b+d)/2 - ((a+e)/2 - (b+d)/2)/8, computed
// on 16-bit words and packed back to bytes with unsigned saturation.
1512 #if TEMPLATE_PP_SSE2
1513  "pxor %%xmm7, %%xmm7 \n\t"
1514 #define REAL_DEINT_CUBIC(a,b,c,d,e)\
1515  "movq " #a ", %%xmm0 \n\t"\
1516  "movq " #b ", %%xmm1 \n\t"\
1517  "movq " #d ", %%xmm2 \n\t"\
1518  "movq " #e ", %%xmm3 \n\t"\
1519  "pavgb %%xmm2, %%xmm1 \n\t"\
1520  "pavgb %%xmm3, %%xmm0 \n\t"\
1521  "punpcklbw %%xmm7, %%xmm0 \n\t"\
1522  "punpcklbw %%xmm7, %%xmm1 \n\t"\
1523  "psubw %%xmm1, %%xmm0 \n\t"\
1524  "psraw $3, %%xmm0 \n\t"\
1525  "psubw %%xmm0, %%xmm1 \n\t"\
1526  "packuswb %%xmm1, %%xmm1 \n\t"\
1527  "movlps %%xmm1, " #c " \n\t"
1528 #else //TEMPLATE_PP_SSE2
1529  "pxor %%mm7, %%mm7 \n\t"
1530 // 0 1 2 3 4 5 6 7 8 9 10
1531 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1532 
1533 #define REAL_DEINT_CUBIC(a,b,c,d,e)\
1534  "movq " #a ", %%mm0 \n\t"\
1535  "movq " #b ", %%mm1 \n\t"\
1536  "movq " #d ", %%mm2 \n\t"\
1537  "movq " #e ", %%mm3 \n\t"\
1538  PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\
1539  PAVGB(%%mm3, %%mm0) /* (a+e) /2 */\
1540  "movq %%mm0, %%mm2 \n\t"\
1541  "punpcklbw %%mm7, %%mm0 \n\t"\
1542  "punpckhbw %%mm7, %%mm2 \n\t"\
1543  "movq %%mm1, %%mm3 \n\t"\
1544  "punpcklbw %%mm7, %%mm1 \n\t"\
1545  "punpckhbw %%mm7, %%mm3 \n\t"\
1546  "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\
1547  "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\
1548  "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\
1549  "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\
1550  "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\
1551  "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\
1552  "packuswb %%mm3, %%mm1 \n\t"\
1553  "movq %%mm1, " #c " \n\t"
1554 #endif //TEMPLATE_PP_SSE2
1555 #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e)
1556 
// Output rows 3, 5, 7 and 9; arguments are (row-3, row-1, dst, row+1, row+3).
1557 DEINT_CUBIC((%0) , (%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd, %1))
1558 DEINT_CUBIC((%%FF_REGa, %1), (%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1), (%0, %1, 8))
1559 DEINT_CUBIC((%0, %1, 4) , (%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGc))
1560 DEINT_CUBIC((%%FF_REGd, %1), (%0, %1, 8) , (%%FF_REGd, %1, 4), (%%FF_REGc) , (%%FF_REGc, %1, 2))
1561 
1562  : : "r" (src), "r" ((x86_reg)stride)
1563  :
1564 #if TEMPLATE_PP_SSE2
1565  XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm7",)
1566 #endif
1567  "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c
1568  );
// Only REAL_DEINT_CUBIC is undefined here; DEINT_CUBIC stays defined.
1569 #undef REAL_DEINT_CUBIC
1570 #else //TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1571  int x;
1572  src+= stride*3;
// One column per iteration; each output is (-a + 9b + 9d - e) >> 4 clipped to
// the 0..255 range by av_clip_uint8().
1573  for(x=0; x<8; x++){
1574  src[stride*3] = av_clip_uint8((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
1575  src[stride*5] = av_clip_uint8((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
1576  src[stride*7] = av_clip_uint8((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
1577  src[stride*9] = av_clip_uint8((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
1578  src++;
1579  }
1580 #endif //TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1581 }
1582 
1583 /**
1584  * Deinterlace the given block by filtering every second line with a (-1 4 2 4 -1) filter.
1585  * will be called for every 8x8 block and can read & write from line 4-15
1586  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1587  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1588  * this filter will read lines 4-13 and write 5-11
1589  */
1590 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
1591 {
/*
 * After advancing src by 4 rows, rows 1, 3, 5 and 7 are replaced by a
 * (-1 4 2 4 -1)/8 vertical filter centred on the row being written. The
 * -1 tap two rows above must use the value that row had BEFORE it was
 * filtered, so the unfiltered odd rows are carried along:
 *
 * src    - block to filter, 8 pixels wide; rows 0..9 (after the advance) are
 *          read.
 * stride - distance in bytes between two consecutive rows.
 * tmp    - 8 bytes of state: on entry the unfiltered row 2 above the first
 *          output row (saved by the previous call), on exit the unfiltered
 *          row 7 of this block.
 */
1592 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1593  src+= stride*4;
1594  __asm__ volatile(
// Operands: %0 = src, %1 = stride, %2 = tmp. a = row 1, d = row 5.
// mm0 carries the unfiltered previous odd row between macro invocations.
1595  "lea (%0, %1), %%"FF_REG_a" \n\t"
1596  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1597  "pxor %%mm7, %%mm7 \n\t"
1598  "movq (%2), %%mm0 \n\t"
1599 // 0 1 2 3 4 5 6 7 8 9 10
1600 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1601 
// REAL_DEINT_FF(a,b,c,d): b = (4*avg(a,c) - avg(mm0,d) + b) >> 2, then mm0 is
// set to the unfiltered b for the next invocation.
1602 #define REAL_DEINT_FF(a,b,c,d)\
1603  "movq " #a ", %%mm1 \n\t"\
1604  "movq " #b ", %%mm2 \n\t"\
1605  "movq " #c ", %%mm3 \n\t"\
1606  "movq " #d ", %%mm4 \n\t"\
1607  PAVGB(%%mm3, %%mm1) \
1608  PAVGB(%%mm4, %%mm0) \
1609  "movq %%mm0, %%mm3 \n\t"\
1610  "punpcklbw %%mm7, %%mm0 \n\t"\
1611  "punpckhbw %%mm7, %%mm3 \n\t"\
1612  "movq %%mm1, %%mm4 \n\t"\
1613  "punpcklbw %%mm7, %%mm1 \n\t"\
1614  "punpckhbw %%mm7, %%mm4 \n\t"\
1615  "psllw $2, %%mm1 \n\t"\
1616  "psllw $2, %%mm4 \n\t"\
1617  "psubw %%mm0, %%mm1 \n\t"\
1618  "psubw %%mm3, %%mm4 \n\t"\
1619  "movq %%mm2, %%mm5 \n\t"\
1620  "movq %%mm2, %%mm0 \n\t"\
1621  "punpcklbw %%mm7, %%mm2 \n\t"\
1622  "punpckhbw %%mm7, %%mm5 \n\t"\
1623  "paddw %%mm2, %%mm1 \n\t"\
1624  "paddw %%mm5, %%mm4 \n\t"\
1625  "psraw $2, %%mm1 \n\t"\
1626  "psraw $2, %%mm4 \n\t"\
1627  "packuswb %%mm4, %%mm1 \n\t"\
1628  "movq %%mm1, " #b " \n\t"\
1629 
1630 #define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d)
1631 
// Output rows 1, 3, 5 and 7; arguments are (row-1, dst, row+1, row+2).
1632 DEINT_FF((%0) , (%%FF_REGa) , (%%FF_REGa, %1), (%%FF_REGa, %1, 2))
1633 DEINT_FF((%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd) )
1634 DEINT_FF((%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1), (%%FF_REGd, %1, 2))
1635 DEINT_FF((%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4))
1636 
1637  "movq %%mm0, (%2) \n\t"
1638  : : "r" (src), "r" ((x86_reg)stride), "r"(tmp)
1639  : "%"FF_REG_a, "%"FF_REG_d
1640  );
1641 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1642  int x;
1643  src+= stride*4;
// t1 and t2 alternately hold the unfiltered value of the row being written
// (centre tap, weight 2) and of the odd row two above it (weight -1). Each is
// read from the odd row immediately before that row is overwritten, which
// mirrors what the asm path keeps in mm0.
1644  for(x=0; x<8; x++){
1645  int t1= tmp[x];
1646  int t2= src[stride*1];
1647 
1648  src[stride*1]= av_clip_uint8((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
1649  t1= src[stride*3];
1650  src[stride*3]= av_clip_uint8((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
1651  t2= src[stride*5];
1652  src[stride*5]= av_clip_uint8((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
1653  t1= src[stride*7];
1654  src[stride*7]= av_clip_uint8((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
// Hand the unfiltered row 7 to the next call (block below).
1655  tmp[x]= t1;
1656 
1657  src++;
1658  }
1659 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1660 }
1661 
1662 /**
1663  * Deinterlace the given block by filtering every line with a (-1 2 6 2 -1) filter.
1664  * will be called for every 8x8 block and can read & write from line 4-15
1665  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1666  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1667  * this filter will read lines 4-13 and write 4-11
1668  */
1669 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
1670 {
/*
 * After advancing src by 4 rows, rows 0..7 are each replaced by a
 * (-1 2 6 2 -1)/8 vertical filter centred on that row, always using the
 * unfiltered values of the two rows above.
 *
 * tmp  - 8 bytes of state: unfiltered row two above row 0 on entry,
 *        unfiltered row 6 on exit.
 * tmp2 - 8 bytes of state: unfiltered row one above row 0 on entry,
 *        unfiltered row 7 on exit.
 */
1671 #if (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
1672  src+= stride*4;
1673  __asm__ volatile(
// Operands: %0 = src, %1 = stride, %2 = tmp, %3 = tmp2. a = row 1, d = row 5.
1674  "lea (%0, %1), %%"FF_REG_a" \n\t"
1675  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1676  "pxor %%mm7, %%mm7 \n\t"
1677  "movq (%2), %%mm0 \n\t"
1678  "movq (%3), %%mm1 \n\t"
1679 // 0 1 2 3 4 5 6 7 8 9 10
1680 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx
1681 
// REAL_DEINT_L5(t1,t2,a,b,c): a = (3*a + 2*avg(t2,b) - avg(t1,c)) >> 2, and
// t1 is overwritten with the unfiltered a so it can serve as the next call's
// "one row above" value.
1682 #define REAL_DEINT_L5(t1,t2,a,b,c)\
1683  "movq " #a ", %%mm2 \n\t"\
1684  "movq " #b ", %%mm3 \n\t"\
1685  "movq " #c ", %%mm4 \n\t"\
1686  PAVGB(t2, %%mm3) \
1687  PAVGB(t1, %%mm4) \
1688  "movq %%mm2, %%mm5 \n\t"\
1689  "movq %%mm2, " #t1 " \n\t"\
1690  "punpcklbw %%mm7, %%mm2 \n\t"\
1691  "punpckhbw %%mm7, %%mm5 \n\t"\
1692  "movq %%mm2, %%mm6 \n\t"\
1693  "paddw %%mm2, %%mm2 \n\t"\
1694  "paddw %%mm6, %%mm2 \n\t"\
1695  "movq %%mm5, %%mm6 \n\t"\
1696  "paddw %%mm5, %%mm5 \n\t"\
1697  "paddw %%mm6, %%mm5 \n\t"\
1698  "movq %%mm3, %%mm6 \n\t"\
1699  "punpcklbw %%mm7, %%mm3 \n\t"\
1700  "punpckhbw %%mm7, %%mm6 \n\t"\
1701  "paddw %%mm3, %%mm3 \n\t"\
1702  "paddw %%mm6, %%mm6 \n\t"\
1703  "paddw %%mm3, %%mm2 \n\t"\
1704  "paddw %%mm6, %%mm5 \n\t"\
1705  "movq %%mm4, %%mm6 \n\t"\
1706  "punpcklbw %%mm7, %%mm4 \n\t"\
1707  "punpckhbw %%mm7, %%mm6 \n\t"\
1708  "psubw %%mm4, %%mm2 \n\t"\
1709  "psubw %%mm6, %%mm5 \n\t"\
1710  "psraw $2, %%mm2 \n\t"\
1711  "psraw $2, %%mm5 \n\t"\
1712  "packuswb %%mm5, %%mm2 \n\t"\
1713  "movq %%mm2, " #a " \n\t"\
1714 
1715 #define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c)
1716 
// Rows 0..7 in order; mm0 and mm1 swap roles on every row.
1717 DEINT_L5(%%mm0, %%mm1, (%0) , (%%FF_REGa) , (%%FF_REGa, %1) )
1718 DEINT_L5(%%mm1, %%mm0, (%%FF_REGa) , (%%FF_REGa, %1) , (%%FF_REGa, %1, 2))
1719 DEINT_L5(%%mm0, %%mm1, (%%FF_REGa, %1) , (%%FF_REGa, %1, 2), (%0, %1, 4) )
1720 DEINT_L5(%%mm1, %%mm0, (%%FF_REGa, %1, 2), (%0, %1, 4) , (%%FF_REGd) )
1721 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1) )
1722 DEINT_L5(%%mm1, %%mm0, (%%FF_REGd) , (%%FF_REGd, %1) , (%%FF_REGd, %1, 2))
1723 DEINT_L5(%%mm0, %%mm1, (%%FF_REGd, %1) , (%%FF_REGd, %1, 2), (%0, %1, 8) )
1724 DEINT_L5(%%mm1, %%mm0, (%%FF_REGd, %1, 2), (%0, %1, 8) , (%%FF_REGd, %1, 4))
1725 
1726  "movq %%mm0, (%2) \n\t"
1727  "movq %%mm1, (%3) \n\t"
1728  : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2)
1729  : "%"FF_REG_a, "%"FF_REG_d
1730  );
1731 #else //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
1732  int x;
1733  src+= stride*4;
// t1, t2, t3 rotate through "two above", "one above" and "current" unfiltered
// values; each is reloaded from a row just before that row is overwritten.
1734  for(x=0; x<8; x++){
1735  int t1= tmp[x];
1736  int t2= tmp2[x];
1737  int t3= src[0];
1738 
1739  src[stride*0]= av_clip_uint8((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
1740  t1= src[stride*1];
1741  src[stride*1]= av_clip_uint8((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
1742  t2= src[stride*2];
1743  src[stride*2]= av_clip_uint8((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
1744  t3= src[stride*3];
1745  src[stride*3]= av_clip_uint8((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
1746  t1= src[stride*4];
1747  src[stride*4]= av_clip_uint8((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
1748  t2= src[stride*5];
1749  src[stride*5]= av_clip_uint8((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
1750  t3= src[stride*6];
1751  src[stride*6]= av_clip_uint8((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
1752  t1= src[stride*7];
1753  src[stride*7]= av_clip_uint8((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
1754 
// Save unfiltered rows 6 and 7 for the next call.
1755  tmp[x]= t3;
1756  tmp2[x]= t1;
1757 
1758  src++;
1759  }
1760 #endif //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
1761 }
1762 
1763 /**
1764  * Deinterlace the given block by filtering all lines with a (1 2 1) filter.
1765  * will be called for every 8x8 block and can read & write from line 4-15
1766  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1767  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1768  * this filter will read lines 4-12 (plus the line saved in tmp) and write 4-11
1769  */
1770 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
1771 {
/*
 * After advancing src by 4 rows, rows 0..7 are each replaced by a (1 2 1)/4
 * blend of the unfiltered row above, the row itself and the row below, built
 * from two successive byte averages.
 *
 * tmp - 8 bytes of state: the unfiltered row above row 0 on entry, the
 *       unfiltered row 7 on exit.
 */
1772 #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1773  src+= 4*stride;
1774  __asm__ volatile(
// Operands: %0 = src, %1 = stride, %2 = tmp. a = row 1, d = row 5.
// The Ln names in the comments below count from the saved row in tmp (L0),
// so L1 is row 0 of the block, L2 is row 1, and so on.
1775  "lea (%0, %1), %%"FF_REG_a" \n\t"
1776  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1777 // 0 1 2 3 4 5 6 7 8 9
1778 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1779 
1780  "movq (%2), %%mm0 \n\t" // L0
1781  "movq (%%"FF_REG_a"), %%mm1 \n\t" // L2
1782  PAVGB(%%mm1, %%mm0) // L0+L2
1783  "movq (%0), %%mm2 \n\t" // L1
1784  PAVGB(%%mm2, %%mm0)
1785  "movq %%mm0, (%0) \n\t"
1786  "movq (%%"FF_REG_a", %1), %%mm0 \n\t" // L3
1787  PAVGB(%%mm0, %%mm2) // L1+L3
1788  PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3
1789  "movq %%mm2, (%%"FF_REG_a") \n\t"
1790  "movq (%%"FF_REG_a", %1, 2), %%mm2 \n\t" // L4
1791  PAVGB(%%mm2, %%mm1) // L2+L4
1792  PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4
1793  "movq %%mm1, (%%"FF_REG_a", %1) \n\t"
1794  "movq (%0, %1, 4), %%mm1 \n\t" // L5
1795  PAVGB(%%mm1, %%mm0) // L3+L5
1796  PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5
1797  "movq %%mm0, (%%"FF_REG_a", %1, 2) \n\t"
1798  "movq (%%"FF_REG_d"), %%mm0 \n\t" // L6
1799  PAVGB(%%mm0, %%mm2) // L4+L6
1800  PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6
1801  "movq %%mm2, (%0, %1, 4) \n\t"
1802  "movq (%%"FF_REG_d", %1), %%mm2 \n\t" // L7
1803  PAVGB(%%mm2, %%mm1) // L5+L7
1804  PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7
1805  "movq %%mm1, (%%"FF_REG_d") \n\t"
1806  "movq (%%"FF_REG_d", %1, 2), %%mm1 \n\t" // L8
1807  PAVGB(%%mm1, %%mm0) // L6+L8
1808  PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8
1809  "movq %%mm0, (%%"FF_REG_d", %1) \n\t"
1810  "movq (%0, %1, 8), %%mm0 \n\t" // L9
1811  PAVGB(%%mm0, %%mm2) // L7+L9
1812  PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9
1813  "movq %%mm2, (%%"FF_REG_d", %1, 2) \n\t"
// mm1 still holds the unfiltered last written row (L8); save it for the next
// call.
1814  "movq %%mm1, (%2) \n\t"
1815 
1816  : : "r" (src), "r" ((x86_reg)stride), "r" (tmp)
1817  : "%"FF_REG_a, "%"FF_REG_d
1818  );
1819 #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1820  int a, b, c, x;
1821  src+= 4*stride;
1822 
// The 8 pixels of a row are handled as two 32-bit words (x = 0, 1).
// (p&q) + (((p^q)&0xFEFEFEFE)>>1) is the per-byte average rounded down and
// (p|q) - (((p^q)&0xFEFEFEFE)>>1) the per-byte average rounded up; a, b and c
// rotate through the three rows involved, each row being loaded before it is
// overwritten.
// NOTE(review): the uint32_t casts type-pun uint8_t data and assume the
// resulting addresses are suitably aligned -- confirm for all callers.
1823  for(x=0; x<2; x++){
1824  a= *(uint32_t*)&tmp[stride*0];
1825  b= *(uint32_t*)&src[stride*0];
1826  c= *(uint32_t*)&src[stride*1];
1827  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1828  *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1829 
1830  a= *(uint32_t*)&src[stride*2];
1831  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1832  *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1833 
1834  b= *(uint32_t*)&src[stride*3];
1835  c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1836  *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1837 
1838  c= *(uint32_t*)&src[stride*4];
1839  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1840  *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1841 
1842  a= *(uint32_t*)&src[stride*5];
1843  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1844  *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1845 
1846  b= *(uint32_t*)&src[stride*6];
1847  c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
1848  *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
1849 
1850  c= *(uint32_t*)&src[stride*7];
1851  a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
1852  *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
1853 
1854  a= *(uint32_t*)&src[stride*8];
1855  b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
1856  *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
1857 
// c is the unfiltered row 7 loaded above; save it for the next call.
1858  *(uint32_t*)&tmp[stride*0]= c;
1859  src += 4;
1860  tmp += 4;
1861  }
1862 #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW
1863 }
1864 
1865 /**
1866  * Deinterlace the given block by applying a median filter to every second line.
1867  * will be called for every 8x8 block and can read & write from line 4-15,
1868  * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too.
1869  * lines 4-12 will be read into the deblocking filter and should be deinterlaced
1870  */
1871 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
1872 {
1873 #if TEMPLATE_PP_MMX
1874  src+= 4*stride;
1875 #if TEMPLATE_PP_MMXEXT
1876  __asm__ volatile(
1877  "lea (%0, %1), %%"FF_REG_a" \n\t"
1878  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1879 // 0 1 2 3 4 5 6 7 8 9
1880 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1881 
1882  "movq (%0), %%mm0 \n\t"
1883  "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
1884  "movq (%%"FF_REG_a"), %%mm1 \n\t"
1885  "movq %%mm0, %%mm3 \n\t"
1886  "pmaxub %%mm1, %%mm0 \n\t"
1887  "pminub %%mm3, %%mm1 \n\t"
1888  "pmaxub %%mm2, %%mm1 \n\t"
1889  "pminub %%mm1, %%mm0 \n\t"
1890  "movq %%mm0, (%%"FF_REG_a") \n\t"
1891 
1892  "movq (%0, %1, 4), %%mm0 \n\t"
1893  "movq (%%"FF_REG_a", %1, 2), %%mm1 \n\t"
1894  "movq %%mm2, %%mm3 \n\t"
1895  "pmaxub %%mm1, %%mm2 \n\t"
1896  "pminub %%mm3, %%mm1 \n\t"
1897  "pmaxub %%mm0, %%mm1 \n\t"
1898  "pminub %%mm1, %%mm2 \n\t"
1899  "movq %%mm2, (%%"FF_REG_a", %1, 2) \n\t"
1900 
1901  "movq (%%"FF_REG_d"), %%mm2 \n\t"
1902  "movq (%%"FF_REG_d", %1), %%mm1 \n\t"
1903  "movq %%mm2, %%mm3 \n\t"
1904  "pmaxub %%mm0, %%mm2 \n\t"
1905  "pminub %%mm3, %%mm0 \n\t"
1906  "pmaxub %%mm1, %%mm0 \n\t"
1907  "pminub %%mm0, %%mm2 \n\t"
1908  "movq %%mm2, (%%"FF_REG_d") \n\t"
1909 
1910  "movq (%%"FF_REG_d", %1, 2), %%mm2 \n\t"
1911  "movq (%0, %1, 8), %%mm0 \n\t"
1912  "movq %%mm2, %%mm3 \n\t"
1913  "pmaxub %%mm0, %%mm2 \n\t"
1914  "pminub %%mm3, %%mm0 \n\t"
1915  "pmaxub %%mm1, %%mm0 \n\t"
1916  "pminub %%mm0, %%mm2 \n\t"
1917  "movq %%mm2, (%%"FF_REG_d", %1, 2) \n\t"
1918 
1919 
1920  : : "r" (src), "r" ((x86_reg)stride)
1921  : "%"FF_REG_a, "%"FF_REG_d
1922  );
1923 
1924 #else // MMX without MMX2
1925  __asm__ volatile(
1926  "lea (%0, %1), %%"FF_REG_a" \n\t"
1927  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_d"\n\t"
1928 // 0 1 2 3 4 5 6 7 8 9
1929 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1930  "pxor %%mm7, %%mm7 \n\t"
1931 
1932 #define REAL_MEDIAN(a,b,c)\
1933  "movq " #a ", %%mm0 \n\t"\
1934  "movq " #b ", %%mm2 \n\t"\
1935  "movq " #c ", %%mm1 \n\t"\
1936  "movq %%mm0, %%mm3 \n\t"\
1937  "movq %%mm1, %%mm4 \n\t"\
1938  "movq %%mm2, %%mm5 \n\t"\
1939  "psubusb %%mm1, %%mm3 \n\t"\
1940  "psubusb %%mm2, %%mm4 \n\t"\
1941  "psubusb %%mm0, %%mm5 \n\t"\
1942  "pcmpeqb %%mm7, %%mm3 \n\t"\
1943  "pcmpeqb %%mm7, %%mm4 \n\t"\
1944  "pcmpeqb %%mm7, %%mm5 \n\t"\
1945  "movq %%mm3, %%mm6 \n\t"\
1946  "pxor %%mm4, %%mm3 \n\t"\
1947  "pxor %%mm5, %%mm4 \n\t"\
1948  "pxor %%mm6, %%mm5 \n\t"\
1949  "por %%mm3, %%mm1 \n\t"\
1950  "por %%mm4, %%mm2 \n\t"\
1951  "por %%mm5, %%mm0 \n\t"\
1952  "pand %%mm2, %%mm0 \n\t"\
1953  "pand %%mm1, %%mm0 \n\t"\
1954  "movq %%mm0, " #b " \n\t"
1955 #define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c)
1956 
1957 MEDIAN((%0) , (%%FF_REGa) , (%%FF_REGa, %1))
1958 MEDIAN((%%FF_REGa, %1), (%%FF_REGa, %1, 2), (%0, %1, 4))
1959 MEDIAN((%0, %1, 4) , (%%FF_REGd) , (%%FF_REGd, %1))
1960 MEDIAN((%%FF_REGd, %1), (%%FF_REGd, %1, 2), (%0, %1, 8))
1961 
1962  : : "r" (src), "r" ((x86_reg)stride)
1963  : "%"FF_REG_a, "%"FF_REG_d
1964  );
1965 #endif //TEMPLATE_PP_MMXEXT
1966 #else //TEMPLATE_PP_MMX
1967  int x, y;
1968  src+= 4*stride;
1969  // FIXME - there should be a way to do a few columns in parallel like w/mmx
1970  for(x=0; x<8; x++){
1971  uint8_t *colsrc = src;
1972  for (y=0; y<4; y++){
1973  int a, b, c, d, e, f;
1974  a = colsrc[0 ];
1975  b = colsrc[stride ];
1976  c = colsrc[stride*2];
1977  d = (a-b)>>31;
1978  e = (b-c)>>31;
1979  f = (c-a)>>31;
1980  colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
1981  colsrc += stride*2;
1982  }
1983  src++;
1984  }
1985 #endif //TEMPLATE_PP_MMX
1986 }
1987 
1988 #if TEMPLATE_PP_MMX
1989 /**
1990  * Transpose and shift the given 8x8 Block into dst1 and dst2.
1991  */
1992 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, const uint8_t *src, int srcStride)
1993 {
1994  __asm__(
1995  "lea (%0, %1), %%"FF_REG_a" \n\t"
1996 // 0 1 2 3 4 5 6 7 8 9
1997 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
1998  "movq (%0), %%mm0 \n\t" // 12345678
1999  "movq (%%"FF_REG_a"), %%mm1 \n\t" // abcdefgh
2000  "movq %%mm0, %%mm2 \n\t" // 12345678
2001  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2002  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2003 
2004  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
2005  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t"
2006  "movq %%mm1, %%mm4 \n\t"
2007  "punpcklbw %%mm3, %%mm1 \n\t"
2008  "punpckhbw %%mm3, %%mm4 \n\t"
2009 
2010  "movq %%mm0, %%mm3 \n\t"
2011  "punpcklwd %%mm1, %%mm0 \n\t"
2012  "punpckhwd %%mm1, %%mm3 \n\t"
2013  "movq %%mm2, %%mm1 \n\t"
2014  "punpcklwd %%mm4, %%mm2 \n\t"
2015  "punpckhwd %%mm4, %%mm1 \n\t"
2016 
2017  "movd %%mm0, 128(%2) \n\t"
2018  "psrlq $32, %%mm0 \n\t"
2019  "movd %%mm0, 144(%2) \n\t"
2020  "movd %%mm3, 160(%2) \n\t"
2021  "psrlq $32, %%mm3 \n\t"
2022  "movd %%mm3, 176(%2) \n\t"
2023  "movd %%mm3, 48(%3) \n\t"
2024  "movd %%mm2, 192(%2) \n\t"
2025  "movd %%mm2, 64(%3) \n\t"
2026  "psrlq $32, %%mm2 \n\t"
2027  "movd %%mm2, 80(%3) \n\t"
2028  "movd %%mm1, 96(%3) \n\t"
2029  "psrlq $32, %%mm1 \n\t"
2030  "movd %%mm1, 112(%3) \n\t"
2031 
2032  "lea (%%"FF_REG_a", %1, 4), %%"FF_REG_a"\n\t"
2033 
2034  "movq (%0, %1, 4), %%mm0 \n\t" // 12345678
2035  "movq (%%"FF_REG_a"), %%mm1 \n\t" // abcdefgh
2036  "movq %%mm0, %%mm2 \n\t" // 12345678
2037  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2038  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2039 
2040  "movq (%%"FF_REG_a", %1), %%mm1 \n\t"
2041  "movq (%%"FF_REG_a", %1, 2), %%mm3 \n\t"
2042  "movq %%mm1, %%mm4 \n\t"
2043  "punpcklbw %%mm3, %%mm1 \n\t"
2044  "punpckhbw %%mm3, %%mm4 \n\t"
2045 
2046  "movq %%mm0, %%mm3 \n\t"
2047  "punpcklwd %%mm1, %%mm0 \n\t"
2048  "punpckhwd %%mm1, %%mm3 \n\t"
2049  "movq %%mm2, %%mm1 \n\t"
2050  "punpcklwd %%mm4, %%mm2 \n\t"
2051  "punpckhwd %%mm4, %%mm1 \n\t"
2052 
2053  "movd %%mm0, 132(%2) \n\t"
2054  "psrlq $32, %%mm0 \n\t"
2055  "movd %%mm0, 148(%2) \n\t"
2056  "movd %%mm3, 164(%2) \n\t"
2057  "psrlq $32, %%mm3 \n\t"
2058  "movd %%mm3, 180(%2) \n\t"
2059  "movd %%mm3, 52(%3) \n\t"
2060  "movd %%mm2, 196(%2) \n\t"
2061  "movd %%mm2, 68(%3) \n\t"
2062  "psrlq $32, %%mm2 \n\t"
2063  "movd %%mm2, 84(%3) \n\t"
2064  "movd %%mm1, 100(%3) \n\t"
2065  "psrlq $32, %%mm1 \n\t"
2066  "movd %%mm1, 116(%3) \n\t"
2067 
2068 
2069  :: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2)
2070  : "%"FF_REG_a
2071  );
2072 }
2073 
2074 /**
2075  * Transpose the given 8x8 block.
2076  */
2077 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, const uint8_t *src)
2078 {
2079  __asm__(
2080  "lea (%0, %1), %%"FF_REG_a" \n\t"
2081  "lea (%%"FF_REG_a",%1,4), %%"FF_REG_d" \n\t"
2082 // 0 1 2 3 4 5 6 7 8 9
2083 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1
2084  "movq (%2), %%mm0 \n\t" // 12345678
2085  "movq 16(%2), %%mm1 \n\t" // abcdefgh
2086  "movq %%mm0, %%mm2 \n\t" // 12345678
2087  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2088  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2089 
2090  "movq 32(%2), %%mm1 \n\t"
2091  "movq 48(%2), %%mm3 \n\t"
2092  "movq %%mm1, %%mm4 \n\t"
2093  "punpcklbw %%mm3, %%mm1 \n\t"
2094  "punpckhbw %%mm3, %%mm4 \n\t"
2095 
2096  "movq %%mm0, %%mm3 \n\t"
2097  "punpcklwd %%mm1, %%mm0 \n\t"
2098  "punpckhwd %%mm1, %%mm3 \n\t"
2099  "movq %%mm2, %%mm1 \n\t"
2100  "punpcklwd %%mm4, %%mm2 \n\t"
2101  "punpckhwd %%mm4, %%mm1 \n\t"
2102 
2103  "movd %%mm0, (%0) \n\t"
2104  "psrlq $32, %%mm0 \n\t"
2105  "movd %%mm0, (%%"FF_REG_a") \n\t"
2106  "movd %%mm3, (%%"FF_REG_a", %1) \n\t"
2107  "psrlq $32, %%mm3 \n\t"
2108  "movd %%mm3, (%%"FF_REG_a", %1, 2) \n\t"
2109  "movd %%mm2, (%0, %1, 4) \n\t"
2110  "psrlq $32, %%mm2 \n\t"
2111  "movd %%mm2, (%%"FF_REG_d") \n\t"
2112  "movd %%mm1, (%%"FF_REG_d", %1) \n\t"
2113  "psrlq $32, %%mm1 \n\t"
2114  "movd %%mm1, (%%"FF_REG_d", %1, 2) \n\t"
2115 
2116 
2117  "movq 64(%2), %%mm0 \n\t" // 12345678
2118  "movq 80(%2), %%mm1 \n\t" // abcdefgh
2119  "movq %%mm0, %%mm2 \n\t" // 12345678
2120  "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d
2121  "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h
2122 
2123  "movq 96(%2), %%mm1 \n\t"
2124  "movq 112(%2), %%mm3 \n\t"
2125  "movq %%mm1, %%mm4 \n\t"
2126  "punpcklbw %%mm3, %%mm1 \n\t"
2127  "punpckhbw %%mm3, %%mm4 \n\t"
2128 
2129  "movq %%mm0, %%mm3 \n\t"
2130  "punpcklwd %%mm1, %%mm0 \n\t"
2131  "punpckhwd %%mm1, %%mm3 \n\t"
2132  "movq %%mm2, %%mm1 \n\t"
2133  "punpcklwd %%mm4, %%mm2 \n\t"
2134  "punpckhwd %%mm4, %%mm1 \n\t"
2135 
2136  "movd %%mm0, 4(%0) \n\t"
2137  "psrlq $32, %%mm0 \n\t"
2138  "movd %%mm0, 4(%%"FF_REG_a") \n\t"
2139  "movd %%mm3, 4(%%"FF_REG_a", %1) \n\t"
2140  "psrlq $32, %%mm3 \n\t"
2141  "movd %%mm3, 4(%%"FF_REG_a", %1, 2) \n\t"
2142  "movd %%mm2, 4(%0, %1, 4) \n\t"
2143  "psrlq $32, %%mm2 \n\t"
2144  "movd %%mm2, 4(%%"FF_REG_d") \n\t"
2145  "movd %%mm1, 4(%%"FF_REG_d", %1) \n\t"
2146  "psrlq $32, %%mm1 \n\t"
2147  "movd %%mm1, 4(%%"FF_REG_d", %1, 2) \n\t"
2148 
2149  :: "r" (dst), "r" ((x86_reg)dstStride), "r" (src)
2150  : "%"FF_REG_a, "%"FF_REG_d
2151  );
2152 }
2153 #endif //TEMPLATE_PP_MMX
2154 //static long test=0;
2155 
2156 #if !TEMPLATE_PP_ALTIVEC
/**
 * Temporal noise reducer for one 8x8 block.
 *
 * Measures how much src differs from tempBlurred over the block, smooths that
 * measure with the four neighbouring entries stored around tempBlurredPast and
 * compares the result with the three thresholds in maxNoise. Depending on the
 * outcome, src is either copied into tempBlurred or src and tempBlurred are
 * blended (with different weights) and the blend is written to both.
 *
 * @param src             8x8 block, lines stride bytes apart (read, and written when blending)
 * @param stride          distance in bytes between two lines, used for both src and tempBlurred
 * @param tempBlurred     8x8 block of the temporally blurred picture (read and written)
 * @param tempBlurredPast this block's entry in the difference history; entries at
 *                        -256, -1, +1 and +256 are read, the entry itself is written,
 *                        and entries 127..129 are overwritten with maxNoise[0..2]
 * @param maxNoise        three thresholds selecting the amount of blending
 */
2157 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
2158  uint8_t *tempBlurred, uint32_t *tempBlurredPast, const int *maxNoise)
2159 {
2160  // to save a register (FIXME do this outside of the loops)
      // The asm path reads these back at byte offsets 508/512/516 from tempBlurredPast;
      // the C path reads maxNoise[] directly.
2161  tempBlurredPast[127]= maxNoise[0];
2162  tempBlurredPast[128]= maxNoise[1];
2163  tempBlurredPast[129]= maxNoise[2];
2164 
2165 #define FAST_L2_DIFF
2166 //#define L1_DIFF //you should change the thresholds too if you try that one
2167 #if (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
2168  __asm__ volatile(
2169  "lea (%2, %2, 2), %%"FF_REG_a" \n\t" // 3*stride
2170  "lea (%2, %2, 4), %%"FF_REG_d" \n\t" // 5*stride
2171  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2172 // 0 1 2 3 4 5 6 7 8 9
2173 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2
2174 //FIXME reorder?
2175 #ifdef L1_DIFF //needs mmx2
2176  "movq (%0), %%mm0 \n\t" // L0
2177  "psadbw (%1), %%mm0 \n\t" // |L0-R0|
2178  "movq (%0, %2), %%mm1 \n\t" // L1
2179  "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1|
2180  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2181  "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2|
2182  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2183  "psadbw (%1, %%"FF_REG_a"), %%mm3 \n\t" // |L3-R3|
2184 
2185  "movq (%0, %2, 4), %%mm4 \n\t" // L4
2186  "paddw %%mm1, %%mm0 \n\t"
2187  "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4|
2188  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
2189  "paddw %%mm2, %%mm0 \n\t"
2190  "psadbw (%1, %%"FF_REG_d"), %%mm5 \n\t" // |L5-R5|
2191  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
2192  "paddw %%mm3, %%mm0 \n\t"
2193  "psadbw (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // |L6-R6|
2194  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
2195  "paddw %%mm4, %%mm0 \n\t"
2196  "psadbw (%1, %%"FF_REG_c"), %%mm7 \n\t" // |L7-R7|
2197  "paddw %%mm5, %%mm6 \n\t"
2198  "paddw %%mm7, %%mm6 \n\t"
2199  "paddw %%mm6, %%mm0 \n\t"
2200 #else //L1_DIFF
2201 #if defined (FAST_L2_DIFF)
2202  "pcmpeqb %%mm7, %%mm7 \n\t"
2203  "movq "MANGLE(b80)", %%mm6 \n\t"
2204  "pxor %%mm0, %%mm0 \n\t"
2205 #define REAL_L2_DIFF_CORE(a, b)\
2206  "movq " #a ", %%mm5 \n\t"\
2207  "movq " #b ", %%mm2 \n\t"\
2208  "pxor %%mm7, %%mm2 \n\t"\
2209  PAVGB(%%mm2, %%mm5)\
2210  "paddb %%mm6, %%mm5 \n\t"\
2211  "movq %%mm5, %%mm2 \n\t"\
2212  "psllw $8, %%mm5 \n\t"\
2213  "pmaddwd %%mm5, %%mm5 \n\t"\
2214  "pmaddwd %%mm2, %%mm2 \n\t"\
2215  "paddd %%mm2, %%mm5 \n\t"\
2216  "psrld $14, %%mm5 \n\t"\
2217  "paddd %%mm5, %%mm0 \n\t"
2218 
2219 #else //defined (FAST_L2_DIFF)
2220  "pxor %%mm7, %%mm7 \n\t"
2221  "pxor %%mm0, %%mm0 \n\t"
2222 #define REAL_L2_DIFF_CORE(a, b)\
2223  "movq " #a ", %%mm5 \n\t"\
2224  "movq " #b ", %%mm2 \n\t"\
2225  "movq %%mm5, %%mm1 \n\t"\
2226  "movq %%mm2, %%mm3 \n\t"\
2227  "punpcklbw %%mm7, %%mm5 \n\t"\
2228  "punpckhbw %%mm7, %%mm1 \n\t"\
2229  "punpcklbw %%mm7, %%mm2 \n\t"\
2230  "punpckhbw %%mm7, %%mm3 \n\t"\
2231  "psubw %%mm2, %%mm5 \n\t"\
2232  "psubw %%mm3, %%mm1 \n\t"\
2233  "pmaddwd %%mm5, %%mm5 \n\t"\
2234  "pmaddwd %%mm1, %%mm1 \n\t"\
2235  "paddd %%mm1, %%mm5 \n\t"\
2236  "paddd %%mm5, %%mm0 \n\t"
2237 
2238 #endif //defined (FAST_L2_DIFF)
2239 
      // Extra expansion level so FF_REGa / FF_REGd / FF_REGc are macro-expanded before being stringified.
2240 #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b)
2241 
      // Accumulate the difference measure of all 8 lines into mm0.
2242 L2_DIFF_CORE((%0) , (%1))
2243 L2_DIFF_CORE((%0, %2) , (%1, %2))
2244 L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2))
2245 L2_DIFF_CORE((%0, %%FF_REGa) , (%1, %%FF_REGa))
2246 L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4))
2247 L2_DIFF_CORE((%0, %%FF_REGd) , (%1, %%FF_REGd))
2248 L2_DIFF_CORE((%0, %%FF_REGa,2), (%1, %%FF_REGa,2))
2249 L2_DIFF_CORE((%0, %%FF_REGc) , (%1, %%FF_REGc))
2250 
2251 #endif //L1_DIFF
2252 
      // Add the two 32-bit halves of mm0 and move the total into ecx.
2253  "movq %%mm0, %%mm4 \n\t"
2254  "psrlq $32, %%mm0 \n\t"
2255  "paddd %%mm0, %%mm4 \n\t"
2256  "movd %%mm4, %%ecx \n\t"
      // ecx = (4*ecx + the entries at -4, +4, -1024 and +1024 bytes from
      // tempBlurredPast + 4) >> 3; the result is stored at tempBlurredPast.
2257  "shll $2, %%ecx \n\t"
2258  "mov %3, %%"FF_REG_d" \n\t"
2259  "addl -4(%%"FF_REG_d"), %%ecx \n\t"
2260  "addl 4(%%"FF_REG_d"), %%ecx \n\t"
2261  "addl -1024(%%"FF_REG_d"), %%ecx \n\t"
2262  "addl $4, %%ecx \n\t"
2263  "addl 1024(%%"FF_REG_d"), %%ecx \n\t"
2264  "shrl $3, %%ecx \n\t"
2265  "movl %%ecx, (%%"FF_REG_d") \n\t"
2266 
2267 // "mov %3, %%"FF_REG_c" \n\t"
2268 // "mov %%"FF_REG_c", test \n\t"
2269 // "jmp 4f \n\t"
      // Byte offsets 512 / 516 / 508 are entries 128 / 129 / 127, i.e. the copies of
      // maxNoise[1] / maxNoise[2] / maxNoise[0] stored at the top of the function.
      // jb is an unsigned "below" comparison.
2270  "cmpl 512(%%"FF_REG_d"), %%ecx \n\t"
2271  " jb 2f \n\t"
2272  "cmpl 516(%%"FF_REG_d"), %%ecx \n\t"
2273  " jb 1f \n\t"
2274 
      // Not below maxNoise[1] nor maxNoise[2]: copy src (L) into tempBlurred (R) unchanged.
      // edx and ecx were reused above, so 5*stride and 7*stride are recomputed first.
2275  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
2276  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2277  "movq (%0), %%mm0 \n\t" // L0
2278  "movq (%0, %2), %%mm1 \n\t" // L1
2279  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2280  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2281  "movq (%0, %2, 4), %%mm4 \n\t" // L4
2282  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
2283  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
2284  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
2285  "movq %%mm0, (%1) \n\t" // L0
2286  "movq %%mm1, (%1, %2) \n\t" // L1
2287  "movq %%mm2, (%1, %2, 2) \n\t" // L2
2288  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // L3
2289  "movq %%mm4, (%1, %2, 4) \n\t" // L4
2290  "movq %%mm5, (%1, %%"FF_REG_d") \n\t" // L5
2291  "movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // L6
2292  "movq %%mm7, (%1, %%"FF_REG_c") \n\t" // L7
2293  "jmp 4f \n\t"
2294 
      // Label 1 (not below maxNoise[1], below maxNoise[2]): one PAVGB of src with
      // tempBlurred per line, result written to both.
      // (PAVGB is a macro defined outside this block; presumably a per-byte average — confirm.)
2295  "1: \n\t"
2296  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
2297  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2298  "movq (%0), %%mm0 \n\t" // L0
2299  PAVGB((%1), %%mm0) // L0
2300  "movq (%0, %2), %%mm1 \n\t" // L1
2301  PAVGB((%1, %2), %%mm1) // L1
2302  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2303  PAVGB((%1, %2, 2), %%mm2) // L2
2304  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2305  PAVGB((%1, %%FF_REGa), %%mm3) // L3
2306  "movq (%0, %2, 4), %%mm4 \n\t" // L4
2307  PAVGB((%1, %2, 4), %%mm4) // L4
2308  "movq (%0, %%"FF_REG_d"), %%mm5 \n\t" // L5
2309  PAVGB((%1, %%FF_REGd), %%mm5) // L5
2310  "movq (%0, %%"FF_REG_a", 2), %%mm6 \n\t" // L6
2311  PAVGB((%1, %%FF_REGa, 2), %%mm6) // L6
2312  "movq (%0, %%"FF_REG_c"), %%mm7 \n\t" // L7
2313  PAVGB((%1, %%FF_REGc), %%mm7) // L7
2314  "movq %%mm0, (%1) \n\t" // R0
2315  "movq %%mm1, (%1, %2) \n\t" // R1
2316  "movq %%mm2, (%1, %2, 2) \n\t" // R2
2317  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
2318  "movq %%mm4, (%1, %2, 4) \n\t" // R4
2319  "movq %%mm5, (%1, %%"FF_REG_d") \n\t" // R5
2320  "movq %%mm6, (%1, %%"FF_REG_a", 2) \n\t" // R6
2321  "movq %%mm7, (%1, %%"FF_REG_c") \n\t" // R7
2322  "movq %%mm0, (%0) \n\t" // L0
2323  "movq %%mm1, (%0, %2) \n\t" // L1
2324  "movq %%mm2, (%0, %2, 2) \n\t" // L2
2325  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
2326  "movq %%mm4, (%0, %2, 4) \n\t" // L4
2327  "movq %%mm5, (%0, %%"FF_REG_d") \n\t" // L5
2328  "movq %%mm6, (%0, %%"FF_REG_a", 2) \n\t" // L6
2329  "movq %%mm7, (%0, %%"FF_REG_c") \n\t" // L7
2330  "jmp 4f \n\t"
2331 
      // Label 2 (below maxNoise[1]): edx still holds tempBlurredPast here, so compare
      // against maxNoise[0] before it is overwritten with 5*stride.
      // If not below maxNoise[0]: PAVGB with tempBlurred applied twice per line,
      // four lines at a time, result written to both.
2332  "2: \n\t"
2333  "cmpl 508(%%"FF_REG_d"), %%ecx \n\t"
2334  " jb 3f \n\t"
2335 
2336  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
2337  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2338  "movq (%0), %%mm0 \n\t" // L0
2339  "movq (%0, %2), %%mm1 \n\t" // L1
2340  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2341  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2342  "movq (%1), %%mm4 \n\t" // R0
2343  "movq (%1, %2), %%mm5 \n\t" // R1
2344  "movq (%1, %2, 2), %%mm6 \n\t" // R2
2345  "movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
2346  PAVGB(%%mm4, %%mm0)
2347  PAVGB(%%mm5, %%mm1)
2348  PAVGB(%%mm6, %%mm2)
2349  PAVGB(%%mm7, %%mm3)
2350  PAVGB(%%mm4, %%mm0)
2351  PAVGB(%%mm5, %%mm1)
2352  PAVGB(%%mm6, %%mm2)
2353  PAVGB(%%mm7, %%mm3)
2354  "movq %%mm0, (%1) \n\t" // R0
2355  "movq %%mm1, (%1, %2) \n\t" // R1
2356  "movq %%mm2, (%1, %2, 2) \n\t" // R2
2357  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
2358  "movq %%mm0, (%0) \n\t" // L0
2359  "movq %%mm1, (%0, %2) \n\t" // L1
2360  "movq %%mm2, (%0, %2, 2) \n\t" // L2
2361  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
2362 
2363  "movq (%0, %2, 4), %%mm0 \n\t" // L4
2364  "movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
2365  "movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
2366  "movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
2367  "movq (%1, %2, 4), %%mm4 \n\t" // R4
2368  "movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
2369  "movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
2370  "movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
2371  PAVGB(%%mm4, %%mm0)
2372  PAVGB(%%mm5, %%mm1)
2373  PAVGB(%%mm6, %%mm2)
2374  PAVGB(%%mm7, %%mm3)
2375  PAVGB(%%mm4, %%mm0)
2376  PAVGB(%%mm5, %%mm1)
2377  PAVGB(%%mm6, %%mm2)
2378  PAVGB(%%mm7, %%mm3)
2379  "movq %%mm0, (%1, %2, 4) \n\t" // R4
2380  "movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
2381  "movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
2382  "movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
2383  "movq %%mm0, (%0, %2, 4) \n\t" // L4
2384  "movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
2385  "movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
2386  "movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7
2387  "jmp 4f \n\t"
2388 
      // Label 3 (below maxNoise[0]): same as label 2 but PAVGB with tempBlurred is
      // applied three times per line.
2389  "3: \n\t"
2390  "lea (%%"FF_REG_a", %2, 2), %%"FF_REG_d"\n\t" // 5*stride
2391  "lea (%%"FF_REG_d", %2, 2), %%"FF_REG_c"\n\t" // 7*stride
2392  "movq (%0), %%mm0 \n\t" // L0
2393  "movq (%0, %2), %%mm1 \n\t" // L1
2394  "movq (%0, %2, 2), %%mm2 \n\t" // L2
2395  "movq (%0, %%"FF_REG_a"), %%mm3 \n\t" // L3
2396  "movq (%1), %%mm4 \n\t" // R0
2397  "movq (%1, %2), %%mm5 \n\t" // R1
2398  "movq (%1, %2, 2), %%mm6 \n\t" // R2
2399  "movq (%1, %%"FF_REG_a"), %%mm7 \n\t" // R3
2400  PAVGB(%%mm4, %%mm0)
2401  PAVGB(%%mm5, %%mm1)
2402  PAVGB(%%mm6, %%mm2)
2403  PAVGB(%%mm7, %%mm3)
2404  PAVGB(%%mm4, %%mm0)
2405  PAVGB(%%mm5, %%mm1)
2406  PAVGB(%%mm6, %%mm2)
2407  PAVGB(%%mm7, %%mm3)
2408  PAVGB(%%mm4, %%mm0)
2409  PAVGB(%%mm5, %%mm1)
2410  PAVGB(%%mm6, %%mm2)
2411  PAVGB(%%mm7, %%mm3)
2412  "movq %%mm0, (%1) \n\t" // R0
2413  "movq %%mm1, (%1, %2) \n\t" // R1
2414  "movq %%mm2, (%1, %2, 2) \n\t" // R2
2415  "movq %%mm3, (%1, %%"FF_REG_a") \n\t" // R3
2416  "movq %%mm0, (%0) \n\t" // L0
2417  "movq %%mm1, (%0, %2) \n\t" // L1
2418  "movq %%mm2, (%0, %2, 2) \n\t" // L2
2419  "movq %%mm3, (%0, %%"FF_REG_a") \n\t" // L3
2420 
2421  "movq (%0, %2, 4), %%mm0 \n\t" // L4
2422  "movq (%0, %%"FF_REG_d"), %%mm1 \n\t" // L5
2423  "movq (%0, %%"FF_REG_a", 2), %%mm2 \n\t" // L6
2424  "movq (%0, %%"FF_REG_c"), %%mm3 \n\t" // L7
2425  "movq (%1, %2, 4), %%mm4 \n\t" // R4
2426  "movq (%1, %%"FF_REG_d"), %%mm5 \n\t" // R5
2427  "movq (%1, %%"FF_REG_a", 2), %%mm6 \n\t" // R6
2428  "movq (%1, %%"FF_REG_c"), %%mm7 \n\t" // R7
2429  PAVGB(%%mm4, %%mm0)
2430  PAVGB(%%mm5, %%mm1)
2431  PAVGB(%%mm6, %%mm2)
2432  PAVGB(%%mm7, %%mm3)
2433  PAVGB(%%mm4, %%mm0)
2434  PAVGB(%%mm5, %%mm1)
2435  PAVGB(%%mm6, %%mm2)
2436  PAVGB(%%mm7, %%mm3)
2437  PAVGB(%%mm4, %%mm0)
2438  PAVGB(%%mm5, %%mm1)
2439  PAVGB(%%mm6, %%mm2)
2440  PAVGB(%%mm7, %%mm3)
2441  "movq %%mm0, (%1, %2, 4) \n\t" // R4
2442  "movq %%mm1, (%1, %%"FF_REG_d") \n\t" // R5
2443  "movq %%mm2, (%1, %%"FF_REG_a", 2) \n\t" // R6
2444  "movq %%mm3, (%1, %%"FF_REG_c") \n\t" // R7
2445  "movq %%mm0, (%0, %2, 4) \n\t" // L4
2446  "movq %%mm1, (%0, %%"FF_REG_d") \n\t" // L5
2447  "movq %%mm2, (%0, %%"FF_REG_a", 2) \n\t" // L6
2448  "movq %%mm3, (%0, %%"FF_REG_c") \n\t" // L7
2449 
2450  "4: \n\t"
2451 
      // %0 = src, %1 = tempBlurred, %2 = stride, %3 = tempBlurredPast (as a memory operand).
      // NOTE(review): MANGLE(b80) is used above but b80 is not listed as an operand here,
      // and the listing's own numbering skips a line at this point — confirm against the
      // upstream file whether a constraint line for b80 belongs in this list.
2452  :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast)
2454  : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_c, "memory"
2455  );
2456 #else //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
2457 {
2458  int y;
2459  int d=0;
2460 // int sysd=0;
2461  int i;
2462 
      // d = sum of squared differences between tempBlurred and src over the 8x8 block
2463  for(y=0; y<8; y++){
2464  int x;
2465  for(x=0; x<8; x++){
2466  int ref= tempBlurred[ x + y*stride ];
2467  int cur= src[ x + y*stride ];
2468  int d1=ref - cur;
2469 // if(x==0 || x==7) d1+= d1>>1;
2470 // if(y==0 || y==7) d1+= d1>>1;
2471 // d+= FFABS(d1);
2472  d+= d1*d1;
2473 // sysd+= d1;
2474  }
2475  }
      // Smooth d with the four neighbouring entries around tempBlurredPast (weights 4:1:1:1:1, rounded).
2476  i=d;
2477  d= (
2478  4*d
2479  +(*(tempBlurredPast-256))
2480  +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
2481  +(*(tempBlurredPast+256))
2482  +4)>>3;
      // NOTE(review): this stores the unsmoothed sum i, whereas the asm path stores the
      // smoothed value (ecx after the shift) — confirm which one is intended.
2483  *tempBlurredPast=i;
2484 // ((*tempBlurredPast)*3 + d + 2)>>2;
2485 
2486 /*
2487 Switch between
2488  1 0 0 0 0 0 0 (0)
2489 64 32 16 8 4 2 1 (1)
2490 64 48 36 27 20 15 11 (33) (approx)
2491 64 56 49 43 37 33 29 (200) (approx)
2492 */
2493  if(d > maxNoise[1]){
2494  if(d < maxNoise[2]){
      // equal-weight blend of tempBlurred and src, written to both
2495  for(y=0; y<8; y++){
2496  int x;
2497  for(x=0; x<8; x++){
2498  int ref= tempBlurred[ x + y*stride ];
2499  int cur= src[ x + y*stride ];
2500  tempBlurred[ x + y*stride ]=
2501  src[ x + y*stride ]=
2502  (ref + cur + 1)>>1;
2503  }
2504  }
2505  }else{
      // difference too large: take src as the new tempBlurred, src itself is left untouched
2506  for(y=0; y<8; y++){
2507  int x;
2508  for(x=0; x<8; x++){
2509  tempBlurred[ x + y*stride ]= src[ x + y*stride ];
2510  }
2511  }
2512  }
2513  }else{
2514  if(d < maxNoise[0]){
      // smallest difference: 7/8 tempBlurred + 1/8 src, written to both
2515  for(y=0; y<8; y++){
2516  int x;
2517  for(x=0; x<8; x++){
2518  int ref= tempBlurred[ x + y*stride ];
2519  int cur= src[ x + y*stride ];
2520  tempBlurred[ x + y*stride ]=
2521  src[ x + y*stride ]=
2522  (ref*7 + cur + 4)>>3;
2523  }
2524  }
2525  }else{
      // 3/4 tempBlurred + 1/4 src, written to both
2526  for(y=0; y<8; y++){
2527  int x;
2528  for(x=0; x<8; x++){
2529  int ref= tempBlurred[ x + y*stride ];
2530  int cur= src[ x + y*stride ];
2531  tempBlurred[ x + y*stride ]=
2532  src[ x + y*stride ]=
2533  (ref*3 + cur + 2)>>2;
2534  }
2535  }
2536  }
2537  }
2538 }
2539 #endif //(TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) && HAVE_6REGS
2540 }
2541 #endif //TEMPLATE_PP_ALTIVEC
2542 
2543 #if TEMPLATE_PP_MMX
2544 /**
2545  * accurate deblock filter
2546  */
2547 static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, const PPContext *c, int mode){
2548  int64_t dc_mask, eq_mask, both_masks;
2549  int64_t sums[10*8*2];
2550  src+= step*3; // src points to begin of the 8x8 Block
2551  //{ START_TIMER
2552  __asm__ volatile(
2553  "movq %0, %%mm7 \n\t"
2554  "movq %1, %%mm6 \n\t"
2555  : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
2556  );
2557 
2558  __asm__ volatile(
2559  "lea (%2, %3), %%"FF_REG_a" \n\t"
2560 // 0 1 2 3 4 5 6 7 8 9
2561 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2
2562 
2563  "movq (%2), %%mm0 \n\t"
2564  "movq (%%"FF_REG_a"), %%mm1 \n\t"
2565  "movq %%mm1, %%mm3 \n\t"
2566  "movq %%mm1, %%mm4 \n\t"
2567  "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
2568  "paddb %%mm7, %%mm0 \n\t"
2569  "pcmpgtb %%mm6, %%mm0 \n\t"
2570 
2571  "movq (%%"FF_REG_a",%3), %%mm2 \n\t"
2572  PMAXUB(%%mm2, %%mm4)
2573  PMINUB(%%mm2, %%mm3, %%mm5)
2574  "psubb %%mm2, %%mm1 \n\t"
2575  "paddb %%mm7, %%mm1 \n\t"
2576  "pcmpgtb %%mm6, %%mm1 \n\t"
2577  "paddb %%mm1, %%mm0 \n\t"
2578 
2579  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
2580  PMAXUB(%%mm1, %%mm4)
2581  PMINUB(%%mm1, %%mm3, %%mm5)
2582  "psubb %%mm1, %%mm2 \n\t"
2583  "paddb %%mm7, %%mm2 \n\t"
2584  "pcmpgtb %%mm6, %%mm2 \n\t"
2585  "paddb %%mm2, %%mm0 \n\t"
2586 
2587  "lea (%%"FF_REG_a", %3, 4), %%"FF_REG_a"\n\t"
2588 
2589  "movq (%2, %3, 4), %%mm2 \n\t"
2590  PMAXUB(%%mm2, %%mm4)
2591  PMINUB(%%mm2, %%mm3, %%mm5)
2592  "psubb %%mm2, %%mm1 \n\t"
2593  "paddb %%mm7, %%mm1 \n\t"
2594  "pcmpgtb %%mm6, %%mm1 \n\t"
2595  "paddb %%mm1, %%mm0 \n\t"
2596 
2597  "movq (%%"FF_REG_a"), %%mm1 \n\t"
2598  PMAXUB(%%mm1, %%mm4)
2599  PMINUB(%%mm1, %%mm3, %%mm5)
2600  "psubb %%mm1, %%mm2 \n\t"
2601  "paddb %%mm7, %%mm2 \n\t"
2602  "pcmpgtb %%mm6, %%mm2 \n\t"
2603  "paddb %%mm2, %%mm0 \n\t"
2604 
2605  "movq (%%"FF_REG_a", %3), %%mm2 \n\t"
2606  PMAXUB(%%mm2, %%mm4)
2607  PMINUB(%%mm2, %%mm3, %%mm5)
2608  "psubb %%mm2, %%mm1 \n\t"
2609  "paddb %%mm7, %%mm1 \n\t"
2610  "pcmpgtb %%mm6, %%mm1 \n\t"
2611  "paddb %%mm1, %%mm0 \n\t"
2612 
2613  "movq (%%"FF_REG_a", %3, 2), %%mm1 \n\t"
2614  PMAXUB(%%mm1, %%mm4)
2615  PMINUB(%%mm1, %%mm3, %%mm5)
2616  "psubb %%mm1, %%mm2 \n\t"
2617  "paddb %%mm7, %%mm2 \n\t"
2618  "pcmpgtb %%mm6, %%mm2 \n\t"
2619  "paddb %%mm2, %%mm0 \n\t"
2620 
2621  "movq (%2, %3, 8), %%mm2 \n\t"
2622  PMAXUB(%%mm2, %%mm4)
2623  PMINUB(%%mm2, %%mm3, %%mm5)
2624  "psubb %%mm2, %%mm1 \n\t"
2625  "paddb %%mm7, %%mm1 \n\t"
2626  "pcmpgtb %%mm6, %%mm1 \n\t"
2627  "paddb %%mm1, %%mm0 \n\t"
2628 
2629  "movq (%%"FF_REG_a", %3, 4), %%mm1 \n\t"
2630  "psubb %%mm1, %%mm2 \n\t"
2631  "paddb %%mm7, %%mm2 \n\t"
2632  "pcmpgtb %%mm6, %%mm2 \n\t"
2633  "paddb %%mm2, %%mm0 \n\t"
2634  "psubusb %%mm3, %%mm4 \n\t"
2635 
2636  "pxor %%mm6, %%mm6 \n\t"
2637  "movq %4, %%mm7 \n\t" // QP,..., QP
2638  "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
2639  "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0
2640  "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2641  "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0
2642  "movq %%mm7, %1 \n\t"
2643 
2644  "movq %5, %%mm7 \n\t"
2645  "punpcklbw %%mm7, %%mm7 \n\t"
2646  "punpcklbw %%mm7, %%mm7 \n\t"
2647  "punpcklbw %%mm7, %%mm7 \n\t"
2648  "psubb %%mm0, %%mm6 \n\t"
2649  "pcmpgtb %%mm7, %%mm6 \n\t"
2650  "movq %%mm6, %0 \n\t"
2651 
2652  : "=m" (eq_mask), "=m" (dc_mask)
2653  : "r" (src), "r" ((x86_reg)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
2654  : "%"FF_REG_a
2655  );
2656 
2657  both_masks = dc_mask & eq_mask;
2658 
2659  if(both_masks){
2660  x86_reg offset= -8*step;
2661  int64_t *temp_sums= sums;
2662 
2663  __asm__ volatile(
2664  "movq %2, %%mm0 \n\t" // QP,..., QP
2665  "pxor %%mm4, %%mm4 \n\t"
2666 
2667  "movq (%0), %%mm6 \n\t"
2668  "movq (%0, %1), %%mm5 \n\t"
2669  "movq %%mm5, %%mm1 \n\t"
2670  "movq %%mm6, %%mm2 \n\t"
2671  "psubusb %%mm6, %%mm5 \n\t"
2672  "psubusb %%mm1, %%mm2 \n\t"
2673  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2674  "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2675  "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2676 
2677  "pxor %%mm6, %%mm1 \n\t"
2678  "pand %%mm0, %%mm1 \n\t"
2679  "pxor %%mm1, %%mm6 \n\t"
2680  // 0:QP 6:First
2681 
2682  "movq (%0, %1, 8), %%mm5 \n\t"
2683  "add %1, %0 \n\t" // %0 points to line 1 not 0
2684  "movq (%0, %1, 8), %%mm7 \n\t"
2685  "movq %%mm5, %%mm1 \n\t"
2686  "movq %%mm7, %%mm2 \n\t"
2687  "psubusb %%mm7, %%mm5 \n\t"
2688  "psubusb %%mm1, %%mm2 \n\t"
2689  "por %%mm5, %%mm2 \n\t" // ABS Diff of lines
2690  "movq %2, %%mm0 \n\t" // QP,..., QP
2691  "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0
2692  "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF
2693 
2694  "pxor %%mm7, %%mm1 \n\t"
2695  "pand %%mm0, %%mm1 \n\t"
2696  "pxor %%mm1, %%mm7 \n\t"
2697 
2698  "movq %%mm6, %%mm5 \n\t"
2699  "punpckhbw %%mm4, %%mm6 \n\t"
2700  "punpcklbw %%mm4, %%mm5 \n\t"
2701  // 4:0 5/6:First 7:Last
2702 
2703  "movq %%mm5, %%mm0 \n\t"
2704  "movq %%mm6, %%mm1 \n\t"
2705  "psllw $2, %%mm0 \n\t"
2706  "psllw $2, %%mm1 \n\t"
2707  "paddw "MANGLE(w04)", %%mm0 \n\t"
2708  "paddw "MANGLE(w04)", %%mm1 \n\t"
2709 
2710 #define NEXT\
2711  "movq (%0), %%mm2 \n\t"\
2712  "movq (%0), %%mm3 \n\t"\
2713  "add %1, %0 \n\t"\
2714  "punpcklbw %%mm4, %%mm2 \n\t"\
2715  "punpckhbw %%mm4, %%mm3 \n\t"\
2716  "paddw %%mm2, %%mm0 \n\t"\
2717  "paddw %%mm3, %%mm1 \n\t"
2718 
2719 #define PREV\
2720  "movq (%0), %%mm2 \n\t"\
2721  "movq (%0), %%mm3 \n\t"\
2722  "add %1, %0 \n\t"\
2723  "punpcklbw %%mm4, %%mm2 \n\t"\
2724  "punpckhbw %%mm4, %%mm3 \n\t"\
2725  "psubw %%mm2, %%mm0 \n\t"\
2726  "psubw %%mm3, %%mm1 \n\t"
2727 
2728 
2729  NEXT //0
2730  NEXT //1
2731  NEXT //2
2732  "movq %%mm0, (%3) \n\t"
2733  "movq %%mm1, 8(%3) \n\t"
2734 
2735  NEXT //3
2736  "psubw %%mm5, %%mm0 \n\t"
2737  "psubw %%mm6, %%mm1 \n\t"
2738  "movq %%mm0, 16(%3) \n\t"
2739  "movq %%mm1, 24(%3) \n\t"
2740 
2741  NEXT //4
2742  "psubw %%mm5, %%mm0 \n\t"
2743  "psubw %%mm6, %%mm1 \n\t"
2744  "movq %%mm0, 32(%3) \n\t"
2745  "movq %%mm1, 40(%3) \n\t"
2746 
2747  NEXT //5
2748  "psubw %%mm5, %%mm0 \n\t"
2749  "psubw %%mm6, %%mm1 \n\t"
2750  "movq %%mm0, 48(%3) \n\t"
2751  "movq %%mm1, 56(%3) \n\t"
2752 
2753  NEXT //6
2754  "psubw %%mm5, %%mm0 \n\t"
2755  "psubw %%mm6, %%mm1 \n\t"
2756  "movq %%mm0, 64(%3) \n\t"
2757  "movq %%mm1, 72(%3) \n\t"
2758 
2759  "movq %%mm7, %%mm6 \n\t"
2760  "punpckhbw %%mm4, %%mm7 \n\t"
2761  "punpcklbw %%mm4, %%mm6 \n\t"
2762 
2763  NEXT //7
2764  "mov %4, %0 \n\t"
2765  "add %1, %0 \n\t"
2766  PREV //0
2767  "movq %%mm0, 80(%3) \n\t"
2768  "movq %%mm1, 88(%3) \n\t"
2769 
2770  PREV //1
2771  "paddw %%mm6, %%mm0 \n\t"
2772  "paddw %%mm7, %%mm1 \n\t"
2773  "movq %%mm0, 96(%3) \n\t"
2774  "movq %%mm1, 104(%3) \n\t"
2775 
2776  PREV //2
2777  "paddw %%mm6, %%mm0 \n\t"
2778  "paddw %%mm7, %%mm1 \n\t"
2779  "movq %%mm0, 112(%3) \n\t"
2780  "movq %%mm1, 120(%3) \n\t"
2781 
2782  PREV //3
2783  "paddw %%mm6, %%mm0 \n\t"
2784  "paddw %%mm7, %%mm1 \n\t"
2785  "movq %%mm0, 128(%3) \n\t"
2786  "movq %%mm1, 136(%3) \n\t"
2787 
2788  PREV //4
2789  "paddw %%mm6, %%mm0 \n\t"
2790  "paddw %%mm7, %%mm1 \n\t"
2791  "movq %%mm0, 144(%3) \n\t"
2792  "movq %%mm1, 152(%3) \n\t"
2793 
2794  "mov %4, %0 \n\t" //FIXME
2795 
2796  : "+&r"(src)
2797  : "r" ((x86_reg)step), "m" (c->pQPb), "r"(sums), "g"(src)
2799  );
2800 
2801  src+= step; // src points to begin of the 8x8 Block
2802 
2803  __asm__ volatile(
2804  "movq %4, %%mm6 \n\t"
2805  "pcmpeqb %%mm5, %%mm5 \n\t"
2806  "pxor %%mm6, %%mm5 \n\t"
2807  "pxor %%mm7, %%mm7 \n\t"
2808 
2809  "1: \n\t"
2810  "movq (%1), %%mm0 \n\t"
2811  "movq 8(%1), %%mm1 \n\t"
2812  "paddw 32(%1), %%mm0 \n\t"
2813  "paddw 40(%1), %%mm1 \n\t"
2814  "movq (%0, %3), %%mm2 \n\t"
2815  "movq %%mm2, %%mm3 \n\t"
2816  "movq %%mm2, %%mm4 \n\t"
2817  "punpcklbw %%mm7, %%mm2 \n\t"
2818  "punpckhbw %%mm7, %%mm3 \n\t"
2819  "paddw %%mm2, %%mm0 \n\t"
2820  "paddw %%mm3, %%mm1 \n\t"
2821  "paddw %%mm2, %%mm0 \n\t"
2822  "paddw %%mm3, %%mm1 \n\t"
2823  "psrlw $4, %%mm0 \n\t"
2824  "psrlw $4, %%mm1 \n\t"
2825  "packuswb %%mm1, %%mm0 \n\t"
2826  "pand %%mm6, %%mm0 \n\t"
2827  "pand %%mm5, %%mm4 \n\t"
2828  "por %%mm4, %%mm0 \n\t"
2829  "movq %%mm0, (%0, %3) \n\t"
2830  "add $16, %1 \n\t"
2831  "add %2, %0 \n\t"
2832  " js 1b \n\t"
2833 
2834  : "+r"(offset), "+r"(temp_sums)
2835  : "r" ((x86_reg)step), "r"(src - offset), "m"(both_masks)
2836  );
2837  }else
2838  src+= step; // src points to begin of the 8x8 Block
2839 
2840  if(eq_mask != -1LL){
2841  uint8_t *temp_src= src;
2842  DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars
2843  __asm__ volatile(
2844  "pxor %%mm7, %%mm7 \n\t"
2845 // 0 1 2 3 4 5 6 7 8 9
2846 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1
2847 
2848  "movq (%0), %%mm0 \n\t"
2849  "movq %%mm0, %%mm1 \n\t"
2850  "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0
2851  "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0
2852 
2853  "movq (%0, %1), %%mm2 \n\t"
2854  "lea (%0, %1, 2), %%"FF_REG_a" \n\t"
2855  "movq %%mm2, %%mm3 \n\t"
2856  "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1
2857  "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1
2858 
2859  "movq (%%"FF_REG_a"), %%mm4 \n\t"
2860  "movq %%mm4, %%mm5 \n\t"
2861  "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2
2862  "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2
2863 
2864  "paddw %%mm0, %%mm0 \n\t" // 2L0
2865  "paddw %%mm1, %%mm1 \n\t" // 2H0
2866  "psubw %%mm4, %%mm2 \n\t" // L1 - L2
2867  "psubw %%mm5, %%mm3 \n\t" // H1 - H2
2868  "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2
2869  "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2
2870 
2871  "psllw $2, %%mm2 \n\t" // 4L1 - 4L2
2872  "psllw $2, %%mm3 \n\t" // 4H1 - 4H2
2873  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2
2874  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2
2875 
2876  "movq (%%"FF_REG_a", %1), %%mm2 \n\t"
2877  "movq %%mm2, %%mm3 \n\t"
2878  "punpcklbw %%mm7, %%mm2 \n\t" // L3
2879  "punpckhbw %%mm7, %%mm3 \n\t" // H3
2880 
2881  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3
2882  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3
2883  "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2884  "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2885  "movq %%mm0, (%4) \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2886  "movq %%mm1, 8(%4) \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2887 
2888  "movq (%%"FF_REG_a", %1, 2), %%mm0 \n\t"
2889  "movq %%mm0, %%mm1 \n\t"
2890  "punpcklbw %%mm7, %%mm0 \n\t" // L4
2891  "punpckhbw %%mm7, %%mm1 \n\t" // H4
2892 
2893  "psubw %%mm0, %%mm2 \n\t" // L3 - L4
2894  "psubw %%mm1, %%mm3 \n\t" // H3 - H4
2895  "movq %%mm2, 16(%4) \n\t" // L3 - L4
2896  "movq %%mm3, 24(%4) \n\t" // H3 - H4
2897  "paddw %%mm4, %%mm4 \n\t" // 2L2
2898  "paddw %%mm5, %%mm5 \n\t" // 2H2
2899  "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4
2900  "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4
2901 
2902  "lea (%%"FF_REG_a", %1), %0 \n\t"
2903  "psllw $2, %%mm2 \n\t" // 4L3 - 4L4
2904  "psllw $2, %%mm3 \n\t" // 4H3 - 4H4
2905  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4
2906  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4
2907 //50 opcodes so far
2908  "movq (%0, %1, 2), %%mm2 \n\t"
2909  "movq %%mm2, %%mm3 \n\t"
2910  "punpcklbw %%mm7, %%mm2 \n\t" // L5
2911  "punpckhbw %%mm7, %%mm3 \n\t" // H5
2912  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5
2913  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5
2914  "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5
2915  "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5
2916 
2917  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
2918  "punpcklbw %%mm7, %%mm6 \n\t" // L6
2919  "psubw %%mm6, %%mm2 \n\t" // L5 - L6
2920  "movq (%%"FF_REG_a", %1, 4), %%mm6 \n\t"
2921  "punpckhbw %%mm7, %%mm6 \n\t" // H6
2922  "psubw %%mm6, %%mm3 \n\t" // H5 - H6
2923 
2924  "paddw %%mm0, %%mm0 \n\t" // 2L4
2925  "paddw %%mm1, %%mm1 \n\t" // 2H4
2926  "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6
2927  "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6
2928 
2929  "psllw $2, %%mm2 \n\t" // 4L5 - 4L6
2930  "psllw $2, %%mm3 \n\t" // 4H5 - 4H6
2931  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6
2932  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6
2933 
2934  "movq (%0, %1, 4), %%mm2 \n\t"
2935  "movq %%mm2, %%mm3 \n\t"
2936  "punpcklbw %%mm7, %%mm2 \n\t" // L7
2937  "punpckhbw %%mm7, %%mm3 \n\t" // H7
2938 
2939  "paddw %%mm2, %%mm2 \n\t" // 2L7
2940  "paddw %%mm3, %%mm3 \n\t" // 2H7
2941  "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7
2942  "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7
2943 
2944  "movq (%4), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3
2945  "movq 8(%4), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3
2946 
2947 #if TEMPLATE_PP_MMXEXT
2948  "movq %%mm7, %%mm6 \n\t" // 0
2949  "psubw %%mm0, %%mm6 \n\t"
2950  "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
2951  "movq %%mm7, %%mm6 \n\t" // 0
2952  "psubw %%mm1, %%mm6 \n\t"
2953  "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
2954  "movq %%mm7, %%mm6 \n\t" // 0
2955  "psubw %%mm2, %%mm6 \n\t"
2956  "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
2957  "movq %%mm7, %%mm6 \n\t" // 0
2958  "psubw %%mm3, %%mm6 \n\t"
2959  "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
2960 #else
2961  "movq %%mm7, %%mm6 \n\t" // 0
2962  "pcmpgtw %%mm0, %%mm6 \n\t"
2963  "pxor %%mm6, %%mm0 \n\t"
2964  "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7|
2965  "movq %%mm7, %%mm6 \n\t" // 0
2966  "pcmpgtw %%mm1, %%mm6 \n\t"
2967  "pxor %%mm6, %%mm1 \n\t"
2968  "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7|
2969  "movq %%mm7, %%mm6 \n\t" // 0
2970  "pcmpgtw %%mm2, %%mm6 \n\t"
2971  "pxor %%mm6, %%mm2 \n\t"
2972  "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3|
2973  "movq %%mm7, %%mm6 \n\t" // 0
2974  "pcmpgtw %%mm3, %%mm6 \n\t"
2975  "pxor %%mm6, %%mm3 \n\t"
2976  "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3|
2977 #endif
2978 
2979 #if TEMPLATE_PP_MMXEXT
2980  "pminsw %%mm2, %%mm0 \n\t"
2981  "pminsw %%mm3, %%mm1 \n\t"
2982 #else
2983  "movq %%mm0, %%mm6 \n\t"
2984  "psubusw %%mm2, %%mm6 \n\t"
2985  "psubw %%mm6, %%mm0 \n\t"
2986  "movq %%mm1, %%mm6 \n\t"
2987  "psubusw %%mm3, %%mm6 \n\t"
2988  "psubw %%mm6, %%mm1 \n\t"
2989 #endif
2990 
2991  "movd %2, %%mm2 \n\t" // QP
2992  "punpcklbw %%mm7, %%mm2 \n\t"
2993 
2994  "movq %%mm7, %%mm6 \n\t" // 0
2995  "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5)
2996  "pxor %%mm6, %%mm4 \n\t"
2997  "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5|
2998  "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5)
2999  "pxor %%mm7, %%mm5 \n\t"
3000  "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5|
3001 // 100 opcodes
3002  "psllw $3, %%mm2 \n\t" // 8QP
3003  "movq %%mm2, %%mm3 \n\t" // 8QP
3004  "pcmpgtw %%mm4, %%mm2 \n\t"
3005  "pcmpgtw %%mm5, %%mm3 \n\t"
3006  "pand %%mm2, %%mm4 \n\t"
3007  "pand %%mm3, %%mm5 \n\t"
3008 
3009 
3010  "psubusw %%mm0, %%mm4 \n\t" // hd
3011  "psubusw %%mm1, %%mm5 \n\t" // ld
3012 
3013 
3014  "movq "MANGLE(w05)", %%mm2 \n\t" // 5
3015  "pmullw %%mm2, %%mm4 \n\t"
3016  "pmullw %%mm2, %%mm5 \n\t"
3017  "movq "MANGLE(w20)", %%mm2 \n\t" // 32
3018  "paddw %%mm2, %%mm4 \n\t"
3019  "paddw %%mm2, %%mm5 \n\t"
3020  "psrlw $6, %%mm4 \n\t"
3021  "psrlw $6, %%mm5 \n\t"
3022 
3023  "movq 16(%4), %%mm0 \n\t" // L3 - L4
3024  "movq 24(%4), %%mm1 \n\t" // H3 - H4
3025 
3026  "pxor %%mm2, %%mm2 \n\t"
3027  "pxor %%mm3, %%mm3 \n\t"
3028 
3029  "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4)
3030  "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4)
3031  "pxor %%mm2, %%mm0 \n\t"
3032  "pxor %%mm3, %%mm1 \n\t"
3033  "psubw %%mm2, %%mm0 \n\t" // |L3-L4|
3034  "psubw %%mm3, %%mm1 \n\t" // |H3-H4|
3035  "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2
3036  "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2
3037 
3038  "pxor %%mm6, %%mm2 \n\t"
3039  "pxor %%mm7, %%mm3 \n\t"
3040  "pand %%mm2, %%mm4 \n\t"
3041  "pand %%mm3, %%mm5 \n\t"
3042 
3043 #if TEMPLATE_PP_MMXEXT
3044  "pminsw %%mm0, %%mm4 \n\t"
3045  "pminsw %%mm1, %%mm5 \n\t"
3046 #else
3047  "movq %%mm4, %%mm2 \n\t"
3048  "psubusw %%mm0, %%mm2 \n\t"
3049  "psubw %%mm2, %%mm4 \n\t"
3050  "movq %%mm5, %%mm2 \n\t"
3051  "psubusw %%mm1, %%mm2 \n\t"
3052  "psubw %%mm2, %%mm5 \n\t"
3053 #endif
3054  "pxor %%mm6, %%mm4 \n\t"
3055  "pxor %%mm7, %%mm5 \n\t"
3056  "psubw %%mm6, %%mm4 \n\t"
3057  "psubw %%mm7, %%mm5 \n\t"
3058  "packsswb %%mm5, %%mm4 \n\t"
3059  "movq %3, %%mm1 \n\t"
3060  "pandn %%mm4, %%mm1 \n\t"
3061  "movq (%0), %%mm0 \n\t"
3062  "paddb %%mm1, %%mm0 \n\t"
3063  "movq %%mm0, (%0) \n\t"
3064  "movq (%0, %1), %%mm0 \n\t"
3065  "psubb %%mm1, %%mm0 \n\t"
3066  "movq %%mm0, (%0, %1) \n\t"
3067 
3068  : "+r" (temp_src)
3069  : "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask), "r"(tmp)
3070  NAMED_CONSTRAINTS_ADD(w05,w20)
3071  : "%"FF_REG_a
3072  );
3073  }
3074 /*if(step==16){
3075  STOP_TIMER("step16")
3076 }else{
3077  STOP_TIMER("stepX")
3078 }
3079  } */
3080 }
3081 #endif //TEMPLATE_PP_MMX
3082 
3083 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3084  const int8_t QPs[], int QPStride, int isColor, PPContext *c);
3085 
3086 /**
3087  * Copy a block from src to dst and fixes the blacklevel.
3088  * levelFix == 0 -> do not touch the brightness & contrast
3089  */
3090 #undef REAL_SCALED_CPY
3091 #undef SCALED_CPY
3092 
3093 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
3094  int levelFix, int64_t *packedOffsetAndScale)
3095 {
3096 #if !TEMPLATE_PP_MMX || !HAVE_6REGS
3097  int i;
3098 #endif
3099  if(levelFix){
3100 #if TEMPLATE_PP_MMX && HAVE_6REGS
3101  __asm__ volatile(
3102  "movq (%%"FF_REG_a"), %%mm2 \n\t" // packedYOffset
3103  "movq 8(%%"FF_REG_a"), %%mm3 \n\t" // packedYScale
3104  "lea (%2,%4), %%"FF_REG_a" \n\t"
3105  "lea (%3,%5), %%"FF_REG_d" \n\t"
3106  "pxor %%mm4, %%mm4 \n\t"
3107 #if TEMPLATE_PP_MMXEXT
3108 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
3109  "movq " #src1 ", %%mm0 \n\t"\
3110  "movq " #src1 ", %%mm5 \n\t"\
3111  "movq " #src2 ", %%mm1 \n\t"\
3112  "movq " #src2 ", %%mm6 \n\t"\
3113  "punpcklbw %%mm0, %%mm0 \n\t"\
3114  "punpckhbw %%mm5, %%mm5 \n\t"\
3115  "punpcklbw %%mm1, %%mm1 \n\t"\
3116  "punpckhbw %%mm6, %%mm6 \n\t"\
3117  "pmulhuw %%mm3, %%mm0 \n\t"\
3118  "pmulhuw %%mm3, %%mm5 \n\t"\
3119  "pmulhuw %%mm3, %%mm1 \n\t"\
3120  "pmulhuw %%mm3, %%mm6 \n\t"\
3121  "psubw %%mm2, %%mm0 \n\t"\
3122  "psubw %%mm2, %%mm5 \n\t"\
3123  "psubw %%mm2, %%mm1 \n\t"\
3124  "psubw %%mm2, %%mm6 \n\t"\
3125  "packuswb %%mm5, %%mm0 \n\t"\
3126  "packuswb %%mm6, %%mm1 \n\t"\
3127  "movq %%mm0, " #dst1 " \n\t"\
3128  "movq %%mm1, " #dst2 " \n\t"\
3129 
3130 #else //TEMPLATE_PP_MMXEXT
3131 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
3132  "movq " #src1 ", %%mm0 \n\t"\
3133  "movq " #src1 ", %%mm5 \n\t"\
3134  "punpcklbw %%mm4, %%mm0 \n\t"\
3135  "punpckhbw %%mm4, %%mm5 \n\t"\
3136  "psubw %%mm2, %%mm0 \n\t"\
3137  "psubw %%mm2, %%mm5 \n\t"\
3138  "movq " #src2 ", %%mm1 \n\t"\
3139  "psllw $6, %%mm0 \n\t"\
3140  "psllw $6, %%mm5 \n\t"\
3141  "pmulhw %%mm3, %%mm0 \n\t"\
3142  "movq " #src2 ", %%mm6 \n\t"\
3143  "pmulhw %%mm3, %%mm5 \n\t"\
3144  "punpcklbw %%mm4, %%mm1 \n\t"\
3145  "punpckhbw %%mm4, %%mm6 \n\t"\
3146  "psubw %%mm2, %%mm1 \n\t"\
3147  "psubw %%mm2, %%mm6 \n\t"\
3148  "psllw $6, %%mm1 \n\t"\
3149  "psllw $6, %%mm6 \n\t"\
3150  "pmulhw %%mm3, %%mm1 \n\t"\
3151  "pmulhw %%mm3, %%mm6 \n\t"\
3152  "packuswb %%mm5, %%mm0 \n\t"\
3153  "packuswb %%mm6, %%mm1 \n\t"\
3154  "movq %%mm0, " #dst1 " \n\t"\
3155  "movq %%mm1, " #dst2 " \n\t"\
3156 
3157 #endif //TEMPLATE_PP_MMXEXT
3158 #define SCALED_CPY(src1, src2, dst1, dst2)\
3159  REAL_SCALED_CPY(src1, src2, dst1, dst2)
3160 
3161 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
3162 SCALED_CPY((%2, %4, 2), (%%FF_REGa, %4, 2), (%3, %5, 2), (%%FF_REGd, %5, 2))
3163 SCALED_CPY((%2, %4, 4), (%%FF_REGa, %4, 4), (%3, %5, 4), (%%FF_REGd, %5, 4))
3164  "lea (%%"FF_REG_a",%4,4), %%"FF_REG_a" \n\t"
3165  "lea (%%"FF_REG_d",%5,4), %%"FF_REG_d" \n\t"
3166 SCALED_CPY((%%FF_REGa, %4), (%%FF_REGa, %4, 2), (%%FF_REGd, %5), (%%FF_REGd, %5, 2))
3167 
3168 
3169  : "=&a" (packedOffsetAndScale)
3170  : "0" (packedOffsetAndScale),
3171  "r"(src),
3172  "r"(dst),
3173  "r" ((x86_reg)srcStride),
3174  "r" ((x86_reg)dstStride)
3175  : "%"FF_REG_d
3176  );
3177 #else //TEMPLATE_PP_MMX && HAVE_6REGS
3178  for(i=0; i<8; i++)
3179  memcpy( &(dst[dstStride*i]),
3180  &(src[srcStride*i]), BLOCK_SIZE);
3181 #endif //TEMPLATE_PP_MMX && HAVE_6REGS
3182  }else{
3183 #if TEMPLATE_PP_MMX && HAVE_6REGS
3184  __asm__ volatile(
3185  "lea (%0,%2), %%"FF_REG_a" \n\t"
3186  "lea (%1,%3), %%"FF_REG_d" \n\t"
3187 
3188 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
3189  "movq " #src1 ", %%mm0 \n\t"\
3190  "movq " #src2 ", %%mm1 \n\t"\
3191  "movq %%mm0, " #dst1 " \n\t"\
3192  "movq %%mm1, " #dst2 " \n\t"\
3193 
3194 #define SIMPLE_CPY(src1, src2, dst1, dst2)\
3195  REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
3196 
3197 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
3198 SIMPLE_CPY((%0, %2, 2), (%%FF_REGa, %2, 2), (%1, %3, 2), (%%FF_REGd, %3, 2))
3199 SIMPLE_CPY((%0, %2, 4), (%%FF_REGa, %2, 4), (%1, %3, 4), (%%FF_REGd, %3, 4))
3200  "lea (%%"FF_REG_a",%2,4), %%"FF_REG_a" \n\t"
3201  "lea (%%"FF_REG_d",%3,4), %%"FF_REG_d" \n\t"
3202 SIMPLE_CPY((%%FF_REGa, %2), (%%FF_REGa, %2, 2), (%%FF_REGd, %3), (%%FF_REGd, %3, 2))
3203 
3204  : : "r" (src),
3205  "r" (dst),
3206  "r" ((x86_reg)srcStride),
3207  "r" ((x86_reg)dstStride)
3208  : "%"FF_REG_a, "%"FF_REG_d
3209  );
3210 #else //TEMPLATE_PP_MMX && HAVE_6REGS
3211  for(i=0; i<8; i++)
3212  memcpy( &(dst[dstStride*i]),
3213  &(src[srcStride*i]), BLOCK_SIZE);
3214 #endif //TEMPLATE_PP_MMX && HAVE_6REGS
3215  }
3216 }
3217 
3218 /**
3219  * Duplicate the given 8 src pixels ? times upward
3220  */
3221 static inline void RENAME(duplicate)(uint8_t src[], int stride)
3222 {
3223 #if TEMPLATE_PP_MMX
3224  __asm__ volatile(
3225  "movq (%0), %%mm0 \n\t"
3226  "movq %%mm0, (%0, %1, 4) \n\t"
3227  "add %1, %0 \n\t"
3228  "movq %%mm0, (%0) \n\t"
3229  "movq %%mm0, (%0, %1) \n\t"
3230  "movq %%mm0, (%0, %1, 2) \n\t"
3231  "movq %%mm0, (%0, %1, 4) \n\t"
3232  : "+r" (src)
3233  : "r" ((x86_reg)-stride)
3234  );
3235 #else
3236  int i;
3237  uint8_t *p=src;
3238  for(i=0; i<5; i++){
3239  p-= stride;
3240  memcpy(p, src, 8);
3241  }
3242 #endif
3243 }
3244 
3245 #if ARCH_X86 && TEMPLATE_PP_MMXEXT
3246 static inline void RENAME(prefetchnta)(const void *p)
3247 {
3248  __asm__ volatile( "prefetchnta (%0)\n\t"
3249  : : "r" (p)
3250  );
3251 }
3252 
3253 static inline void RENAME(prefetcht0)(const void *p)
3254 {
3255  __asm__ volatile( "prefetcht0 (%0)\n\t"
3256  : : "r" (p)
3257  );
3258 }
3259 
3260 static inline void RENAME(prefetcht1)(const void *p)
3261 {
3262  __asm__ volatile( "prefetcht1 (%0)\n\t"
3263  : : "r" (p)
3264  );
3265 }
3266 
3267 static inline void RENAME(prefetcht2)(const void *p)
3268 {
3269  __asm__ volatile( "prefetcht2 (%0)\n\t"
3270  : : "r" (p)
3271  );
3272 }
3273 #elif !ARCH_X86 && AV_GCC_VERSION_AT_LEAST(3,2)
3274 static inline void RENAME(prefetchnta)(const void *p)
3275 {
3276  __builtin_prefetch(p,0,0);
3277 }
3278 static inline void RENAME(prefetcht0)(const void *p)
3279 {
3280  __builtin_prefetch(p,0,1);
3281 }
3282 static inline void RENAME(prefetcht1)(const void *p)
3283 {
3284  __builtin_prefetch(p,0,2);
3285 }
3286 static inline void RENAME(prefetcht2)(const void *p)
3287 {
3288  __builtin_prefetch(p,0,3);
3289 }
3290 #else
3291 static inline void RENAME(prefetchnta)(const void *p)
3292 {
3293  return;
3294 }
3295 static inline void RENAME(prefetcht0)(const void *p)
3296 {
3297  return;
3298 }
3299 static inline void RENAME(prefetcht1)(const void *p)
3300 {
3301  return;
3302 }
3303 static inline void RENAME(prefetcht2)(const void *p)
3304 {
3305  return;
3306 }
3307 #endif
3308 /**
3309  * Filter array of bytes (Y or U or V values)
3310  */
3311 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
3312  const int8_t QPs[], int QPStride, int isColor, PPContext *c2)
3313 {
3314  DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access
3315  int x,y;
3316 #ifdef TEMPLATE_PP_TIME_MODE
3317  const int mode= TEMPLATE_PP_TIME_MODE;
3318 #else
3319  const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
3320 #endif
3321  int black=0, white=255; // blackest black and whitest white in the picture
3322  int QPCorrecture= 256*256;
3323 
3324  int copyAhead;
3325 #if TEMPLATE_PP_MMX
3326  int i;
3327 #endif
3328 
3329  const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
3330  const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
3331 
3332  //FIXME remove
3333  uint64_t * const yHistogram= c.yHistogram;
3334  uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
3335  uint8_t * const tempDst= (dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride) + 32;
3336  //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4;
3337 
3338  if (mode & VISUALIZE){
3339  if(!(mode & (V_A_DEBLOCK | H_A_DEBLOCK)) || TEMPLATE_PP_MMX) {
3340  av_log(c2, AV_LOG_WARNING, "Visualization is currently only supported with the accurate deblock filter without SIMD\n");
3341  }
3342  }
3343 
3344 #if TEMPLATE_PP_MMX
3345  for(i=0; i<57; i++){
3346  int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
3347  int threshold= offset*2 + 1;
3348  c.mmxDcOffset[i]= 0x7F - offset;
3349  c.mmxDcThreshold[i]= 0x7F - threshold;
3350  c.mmxDcOffset[i]*= 0x0101010101010101LL;
3351  c.mmxDcThreshold[i]*= 0x0101010101010101LL;
3352  }
3353 #endif
3354 
3355  if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
3356  else if( (mode & LINEAR_BLEND_DEINT_FILTER)
3357  || (mode & FFMPEG_DEINT_FILTER)
3358  || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
3359  else if( (mode & V_DEBLOCK)
3361  || (mode & MEDIAN_DEINT_FILTER)
3362  || (mode & V_A_DEBLOCK)) copyAhead=13;
3363  else if(mode & V_X1_FILTER) copyAhead=11;
3364 // else if(mode & V_RK1_FILTER) copyAhead=10;
3365  else if(mode & DERING) copyAhead=9;
3366  else copyAhead=8;
3367 
3368  copyAhead-= 8;
3369 
3370  if(!isColor){
3371  uint64_t sum= 0;
3372  int i;
3373  uint64_t maxClipped;
3374  uint64_t clipped;
3375  AVRational scale;
3376 
3377  c.frameNum++;
3378  // first frame is fscked so we ignore it
3379  if(c.frameNum == 1) yHistogram[0]= width*(uint64_t)height/64*15/256;
3380 
3381  for(i=0; i<256; i++){
3382  sum+= yHistogram[i];
3383  }
3384 
3385  /* We always get a completely black picture first. */
3386  maxClipped= av_rescale(sum, c.ppMode.maxClippedThreshold.num, c.ppMode.maxClippedThreshold.den);
3387 
3388  clipped= sum;
3389  for(black=255; black>0; black--){
3390  if(clipped < maxClipped) break;
3391  clipped-= yHistogram[black];
3392  }
3393 
3394  clipped= sum;
3395  for(white=0; white<256; white++){
3396  if(clipped < maxClipped) break;
3397  clipped-= yHistogram[white];
3398  }
3399 
3400  scale = (AVRational){c.ppMode.maxAllowedY - c.ppMode.minAllowedY, white - black};
3401 
3402 #if TEMPLATE_PP_MMXEXT
3403  c.packedYScale = (uint16_t)av_rescale(scale.num, 256, scale.den);
3404  c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
3405 #else
3406  c.packedYScale = (uint16_t)av_rescale(scale.num, 1024, scale.den);
3407  c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
3408 #endif
3409 
3410  c.packedYOffset|= c.packedYOffset<<32;
3411  c.packedYOffset|= c.packedYOffset<<16;
3412 
3413  c.packedYScale|= c.packedYScale<<32;
3414  c.packedYScale|= c.packedYScale<<16;
3415 
3416  if(mode & LEVEL_FIX) QPCorrecture= (int)av_rescale(scale.num, 256*256, scale.den);
3417  else QPCorrecture= 256*256;
3418  }else{
3419  c.packedYScale= 0x0100010001000100LL;
3420  c.packedYOffset= 0;
3421  QPCorrecture= 256*256;
3422  }
3423 
3424  /* copy & deinterlace first row of blocks */
3425  y=-BLOCK_SIZE;
3426  {
3427  const uint8_t *srcBlock= &(src[y*srcStride]);
3428  uint8_t *dstBlock= tempDst + dstStride;
3429 
3430  // From this point on it is guaranteed that we can read and write 16 lines downward
3431  // finish 1 block before the next otherwise we might have a problem
3432  // with the L1 Cache of the P4 ... or only a few blocks at a time or something
3433  for(x=0; x<width; x+=BLOCK_SIZE){
3434  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
3435  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
3436  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
3437  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
3438 
3439  RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
3440  srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3441 
3442  RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
3443 
3445  RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3446  else if(mode & LINEAR_BLEND_DEINT_FILTER)
3447  RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3448  else if(mode & MEDIAN_DEINT_FILTER)
3449  RENAME(deInterlaceMedian)(dstBlock, dstStride);
3450  else if(mode & CUBIC_IPOL_DEINT_FILTER)
3451  RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3452  else if(mode & FFMPEG_DEINT_FILTER)
3453  RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3454  else if(mode & LOWPASS5_DEINT_FILTER)
3455  RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3456 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3457  RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3458 */
3459  dstBlock+=8;
3460  srcBlock+=8;
3461  }
3462  if(width==FFABS(dstStride))
3463  linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
3464  else{
3465  int i;
3466  for(i=0; i<copyAhead; i++){
3467  memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
3468  }
3469  }
3470  }
3471 
3472  for(y=0; y<height; y+=BLOCK_SIZE){
3473  //1% speedup if these are here instead of the inner loop
3474  const uint8_t *srcBlock= &(src[y*srcStride]);
3475  uint8_t *dstBlock= &(dst[y*dstStride]);
3476 #if TEMPLATE_PP_MMX
3477  uint8_t *tempBlock1= c.tempBlocks;
3478  uint8_t *tempBlock2= c.tempBlocks + 8;
3479 #endif
3480  const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
3481  int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
3482  int QP=0, nonBQP=0;
3483  /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards
3484  if not than use a temporary buffer */
3485  if(y+15 >= height){
3486  int i;
3487  /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with
3488  blockcopy to dst later */
3489  linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
3490  FFMAX(height-y-copyAhead, 0), srcStride);
3491 
3492  /* duplicate last line of src to fill the void up to line (copyAhead+7) */
3493  for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
3494  memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
3495 
3496  /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/
3497  linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
3498 
3499  /* duplicate last line of dst to fill the void up to line (copyAhead) */
3500  for(i=height-y+1; i<=copyAhead; i++)
3501  memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
3502 
3503  dstBlock= tempDst + dstStride;
3504  srcBlock= tempSrc;
3505  }
3506 
3507  // From this point on it is guaranteed that we can read and write 16 lines downward
3508  // finish 1 block before the next otherwise we might have a problem
3509  // with the L1 Cache of the P4 ... or only a few blocks at a time or something
3510  for(x=0; x<width; ){
3511  int startx = x;
3512  int endx = FFMIN(width, x+32);
3513  uint8_t *dstBlockStart = dstBlock;
3514  const uint8_t *srcBlockStart = srcBlock;
3515  int qp_index = 0;
3516  for(qp_index=0; qp_index < (endx-startx)/BLOCK_SIZE; qp_index++){
3517  QP = QPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
3518  nonBQP = nonBQPptr[(x+qp_index*BLOCK_SIZE)>>qpHShift];
3519  if(!isColor){
3520  QP= (QP* QPCorrecture + 256*128)>>16;
3521  nonBQP= (nonBQP* QPCorrecture + 256*128)>>16;
3522  yHistogram[(srcBlock+qp_index*8)[srcStride*12 + 4]]++;
3523  }
3524  c.QP_block[qp_index] = QP;
3525  c.nonBQP_block[qp_index] = nonBQP;
3526 #if TEMPLATE_PP_MMX
3527  __asm__ volatile(
3528  "movd %1, %%mm7 \n\t"
3529  "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
3530  "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
3531  "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP
3532  "movq %%mm7, %0 \n\t"
3533  : "=m" (c.pQPb_block[qp_index])
3534  : "r" (QP)
3535  );
3536 #endif
3537  }
3538  for(; x < endx; x+=BLOCK_SIZE){
3539  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
3540  RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
3541  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
3542  RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
3543 
3544  RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
3545  srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
3546 
3548  RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
3549  else if(mode & LINEAR_BLEND_DEINT_FILTER)
3550  RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
3551  else if(mode & MEDIAN_DEINT_FILTER)
3552  RENAME(deInterlaceMedian)(dstBlock, dstStride);
3553  else if(mode & CUBIC_IPOL_DEINT_FILTER)
3554  RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
3555  else if(mode & FFMPEG_DEINT_FILTER)
3556  RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
3557  else if(mode & LOWPASS5_DEINT_FILTER)
3558  RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
3559 /* else if(mode & CUBIC_BLEND_DEINT_FILTER)
3560  RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
3561 */
3562  dstBlock+=8;
3563  srcBlock+=8;
3564  }
3565 
3566  dstBlock = dstBlockStart;
3567  srcBlock = srcBlockStart;
3568 
3569  for(x = startx, qp_index = 0; x < endx; x+=BLOCK_SIZE, qp_index++){
3570  const int stride= dstStride;
3571  //temporary while changing QP stuff to make things continue to work
3572  //eventually QP,nonBQP,etc will be arrays and this will be unnecessary
3573  c.QP = c.QP_block[qp_index];
3574  c.nonBQP = c.nonBQP_block[qp_index];
3575  c.pQPb = c.pQPb_block[qp_index];
3576  c.pQPb2 = c.pQPb2_block[qp_index];
3577 
3578  /* only deblock if we have 2 blocks */
3579  if(y + 8 < height){
3580  if(mode & V_X1_FILTER)
3581  RENAME(vertX1Filter)(dstBlock, stride, &c);
3582  else if(mode & V_DEBLOCK){
3583  const int t= RENAME(vertClassify)(dstBlock, stride, &c);
3584 
3585  if(t==1)
3586  RENAME(doVertLowPass)(dstBlock, stride, &c);
3587  else if(t==2)
3588  RENAME(doVertDefFilter)(dstBlock, stride, &c);
3589  }else if(mode & V_A_DEBLOCK){
3590  RENAME(do_a_deblock)(dstBlock, stride, 1, &c, mode);
3591  }
3592  }
3593 
3594  dstBlock+=8;
3595  srcBlock+=8;
3596  }
3597 
3598  dstBlock = dstBlockStart;
3599  srcBlock = srcBlockStart;
3600 
3601  for(x = startx, qp_index=0; x < endx; x+=BLOCK_SIZE, qp_index++){
3602  const int stride= dstStride;
3603  av_unused uint8_t *tmpXchg;
3604  c.QP = c.QP_block[qp_index];
3605  c.nonBQP = c.nonBQP_block[qp_index];
3606  c.pQPb = c.pQPb_block[qp_index];
3607  c.pQPb2 = c.pQPb2_block[qp_index];
3608 #if TEMPLATE_PP_MMX
3609  RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
3610 #endif
3611  /* check if we have a previous block to deblock it with dstBlock */
3612  if(x - 8 >= 0){
3613 #if TEMPLATE_PP_MMX
3614  if(mode & H_X1_FILTER)
3615  RENAME(vertX1Filter)(tempBlock1, 16, &c);
3616  else if(mode & H_DEBLOCK){
3617  const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
3618  if(t==1)
3619  RENAME(doVertLowPass)(tempBlock1, 16, &c);
3620  else if(t==2)
3621  RENAME(doVertDefFilter)(tempBlock1, 16, &c);
3622  }else if(mode & H_A_DEBLOCK){
3623  RENAME(do_a_deblock)(tempBlock1, 16, 1, &c, mode);
3624  }
3625 
3626  RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
3627 
3628 #else
3629  if(mode & H_X1_FILTER)
3630  horizX1Filter(dstBlock-4, stride, c.QP);
3631  else if(mode & H_DEBLOCK){
3632 #if TEMPLATE_PP_ALTIVEC
3633  DECLARE_ALIGNED(16, unsigned char, tempBlock)[272];
3634  int t;
3635  transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
3636 
3637  t = vertClassify_altivec(tempBlock-48, 16, &c);
3638  if(t==1) {
3639  doVertLowPass_altivec(tempBlock-48, 16, &c);
3640  transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3641  }
3642  else if(t==2) {
3643  doVertDefFilter_altivec(tempBlock-48, 16, &c);
3644  transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
3645  }
3646 #else
3647  const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
3648 
3649  if(t==1)
3650  RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
3651  else if(t==2)
3652  RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
3653 #endif
3654  }else if(mode & H_A_DEBLOCK){
3655  RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c, mode);
3656  }
3657 #endif //TEMPLATE_PP_MMX
3658  if(mode & DERING){
3659  //FIXME filter first line
3660  if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
3661  }
3662 
3663  if(mode & TEMP_NOISE_FILTER)
3664  {
3665  RENAME(tempNoiseReducer)(dstBlock-8, stride,
3666  c.tempBlurred[isColor] + y*dstStride + x,
3667  c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
3668  c.ppMode.maxTmpNoise);
3669  }
3670  }
3671 
3672  dstBlock+=8;
3673  srcBlock+=8;
3674 
3675 #if TEMPLATE_PP_MMX
3676  tmpXchg= tempBlock1;
3677  tempBlock1= tempBlock2;
3678  tempBlock2 = tmpXchg;
3679 #endif
3680  }
3681  }
3682 
3683  if(mode & DERING){
3684  if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
3685  }
3686 
3687  if((mode & TEMP_NOISE_FILTER)){
3688  RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
3689  c.tempBlurred[isColor] + y*dstStride + x,
3690  c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256,
3691  c.ppMode.maxTmpNoise);
3692  }
3693 
3694  /* did we use a tmp buffer for the last lines*/
3695  if(y+15 >= height){
3696  uint8_t *dstBlock= &(dst[y*dstStride]);
3697  if(width==FFABS(dstStride))
3698  linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
3699  else{
3700  int i;
3701  for(i=0; i<height-y; i++){
3702  memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
3703  }
3704  }
3705  }
3706  }
3707 #if TEMPLATE_PP_3DNOW
3708  __asm__ volatile("femms");
3709 #elif TEMPLATE_PP_MMX
3710  __asm__ volatile("emms");
3711 #endif
3712 
3713 #ifdef DEBUG_BRIGHTNESS
3714  if(!isColor){
3715  int max=1;
3716  int i;
3717  for(i=0; i<256; i++)
3718  if(yHistogram[i] > max) max=yHistogram[i];
3719 
3720  for(i=1; i<256; i++){
3721  int x;
3722  int start=yHistogram[i-1]/(max/256+1);
3723  int end=yHistogram[i]/(max/256+1);
3724  int inc= end > start ? 1 : -1;
3725  for(x=start; x!=end+inc; x+=inc)
3726  dst[ i*dstStride + x]+=128;
3727  }
3728 
3729  for(i=0; i<100; i+=2){
3730  dst[ (white)*dstStride + i]+=128;
3731  dst[ (black)*dstStride + i]+=128;
3732  }
3733  }
3734 #endif
3735 
3736  *c2= c; //copy local context back
3737 
3738 }
3739 
3740 #undef RENAME
3741 #undef TEMPLATE_PP_C
3742 #undef TEMPLATE_PP_ALTIVEC
3743 #undef TEMPLATE_PP_MMX
3744 #undef TEMPLATE_PP_MMXEXT
3745 #undef TEMPLATE_PP_3DNOW
3746 #undef TEMPLATE_PP_SSE2
FFMPEG_DEINT_FILTER
#define FFMPEG_DEINT_FILTER
Definition: postprocess_internal.h:66
stride
int stride
Definition: mace.c:144
AV_LOG_WARNING
#define AV_LOG_WARNING
Something somehow does not look correct.
Definition: log.h:182
PPContext
postprocess context.
Definition: postprocess_internal.h:115
av_unused
#define av_unused
Definition: attributes.h:125
end
static av_cold int end(AVCodecContext *avctx)
Definition: avrndec.c:90
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:26
step
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But a word about which is also called distortion Distortion can be quantified by almost any quality measurement one chooses the sum of squared differences is used but more complex methods that consider psychovisual effects can be used as well It makes no difference in this discussion First step
Definition: rate_distortion.txt:58
LOWPASS5_DEINT_FILTER
#define LOWPASS5_DEINT_FILTER
Definition: postprocess_internal.h:67
b
#define b
Definition: input.c:41
NAMED_CONSTRAINTS_ADD
#define NAMED_CONSTRAINTS_ADD(...)
Definition: asm.h:145
horizX1Filter
static void horizX1Filter(uint8_t *src, int stride, int QP)
Experimental Filter 1 (Horizontal) will not damage linear gradients Flat blocks should look like they...
Definition: postprocess.c:346
doVertLowPass_altivec
static void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c)
Definition: postprocess_altivec_template.c:213
MEDIAN
@ MEDIAN
Definition: huffyuv.h:52
t1
#define t1
Definition: regdef.h:29
max
#define max(a, b)
Definition: cuda_runtime.h:33
H_A_DEBLOCK
#define H_A_DEBLOCK
Definition: postprocess_internal.h:55
start
void INT64 start
Definition: avisynth_c.h:767
FFSIGN
#define FFSIGN(a)
Definition: common.h:73
QP
#define QP(qP, depth)
Definition: h264data.c:190
AVRational::num
int num
Numerator.
Definition: rational.h:59
src
#define src
Definition: vp8dsp.c:254
first
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But first
Definition: rate_distortion.txt:12
postProcess
static void postProcess(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, const int8_t QPs[], int QPStride, int isColor, pp_mode *vm, pp_context *vc)
Definition: postprocess.c:563
width
#define width
s
#define s(width, name)
Definition: cbs_vp9.c:257
V_A_DEBLOCK
#define V_A_DEBLOCK
Definition: postprocess_internal.h:51
V_DEBLOCK
#define V_DEBLOCK
Definition: postprocess_internal.h:35
TEMP_NOISE_FILTER
#define TEMP_NOISE_FILTER
Definition: postprocess_internal.h:69
XMM_CLOBBERS
#define XMM_CLOBBERS(...)
Definition: asm.h:98
f
#define f(width, name)
Definition: cbs_vp9.c:255
FFABS
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
Definition: common.h:72
TEMPLATE_PP_SSE2
#define TEMPLATE_PP_SSE2
Definition: postprocess_template.c:74
AVRational
Rational number (pair of numerator and denominator).
Definition: rational.h:58
MEDIAN_DEINT_FILTER
#define MEDIAN_DEINT_FILTER
Definition: postprocess_internal.h:65
linecpy
static void linecpy(void *dest, const void *src, int lines, int stride)
Definition: postprocess_internal.h:176
V_X1_FILTER
#define V_X1_FILTER
Definition: postprocess_internal.h:50
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
error
static void error(const char *err)
Definition: target_dec_fuzzer.c:61
transpose_16x8_char_toPackedAlign_altivec
static void transpose_16x8_char_toPackedAlign_altivec(unsigned char *dst, unsigned char *src, int stride)
Definition: postprocess_altivec_template.c:1015
PAVGB
#define PAVGB(a, b)
Definition: postprocess_template.c:87
FFMAX
#define FFMAX(a, b)
Definition: common.h:94
asm.h
transpose_8x16_char_fromPackedAlign_altivec
static void transpose_8x16_char_fromPackedAlign_altivec(unsigned char *dst, unsigned char *src, int stride)
Definition: postprocess_altivec_template.c:1120
avg
#define avg(a, b, c, d)
Definition: colorspacedsp_template.c:28
PREV
@ PREV
Definition: vf_fftdnoiz.c:31
height
#define height
FFMIN
#define FFMIN(a, b)
Definition: common.h:96
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
H_DEBLOCK
#define H_DEBLOCK
Definition: postprocess_internal.h:36
AV_LOG_INFO
#define AV_LOG_INFO
Standard information.
Definition: log.h:187
DERING
#define DERING
Definition: postprocess_internal.h:37
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem.h:112
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:259
VISUALIZE
#define VISUALIZE
Definition: postprocess_internal.h:72
t3
#define t3
Definition: regdef.h:31
RENAME
#define RENAME(name)
Definition: ffv1.h:197
av_always_inline
#define av_always_inline
Definition: attributes.h:43
uint8_t
uint8_t
Definition: audio_convert.c:194
av_rescale
int64_t av_rescale(int64_t a, int64_t b, int64_t c)
Rescale a 64-bit integer with rounding to nearest.
Definition: mathematics.c:129
NEXT
@ NEXT
Definition: vf_fftdnoiz.c:32
CUBIC_IPOL_DEINT_FILTER
#define CUBIC_IPOL_DEINT_FILTER
Definition: postprocess_internal.h:64
vertClassify_altivec
static int vertClassify_altivec(uint8_t src[], int stride, PPContext *c)
Definition: postprocess_altivec_template.c:58
TEMPLATE_PP_MMX
#define TEMPLATE_PP_MMX
Definition: postprocess_template.c:48
c2
static const uint64_t c2
Definition: murmur3.c:50
t2
#define t2
Definition: regdef.h:30
doVertDefFilter_altivec
static void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c)
Definition: postprocess_altivec_template.c:411
AVRational::den
int den
Denominator.
Definition: rational.h:60
mode
mode
Definition: ebur128.h:83
ref
static int ref[MAX_W *MAX_W]
Definition: jpeg2000dwt.c:107
LINEAR_BLEND_DEINT_FILTER
#define LINEAR_BLEND_DEINT_FILTER
Definition: postprocess_internal.h:62
LINEAR_IPOL_DEINT_FILTER
#define LINEAR_IPOL_DEINT_FILTER
Definition: postprocess_internal.h:61
MANGLE
#define MANGLE(a)
Definition: asm.h:127
diff
static av_always_inline int diff(const uint32_t a, const uint32_t b)
Definition: vf_palettegen.c:136
H_X1_FILTER
#define H_X1_FILTER
Definition: postprocess_internal.h:54
x86_reg
int x86_reg
Definition: asm.h:72
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:28
BLOCK_SIZE
#define BLOCK_SIZE
Definition: adx.h:53
int
int
Definition: ffmpeg_filter.c:191
LEVEL_FIX
#define LEVEL_FIX
Brightness & Contrast.
Definition: postprocess_internal.h:38
min
float min
Definition: vorbis_enc_data.h:456