00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00026 #include "libavutil/x86_cpu.h"
00027
/* Drop any earlier definitions so the instruction wrappers below can be
 * (re)defined for the currently selected CPU feature set. */
00028 #undef REAL_PAVGB
00029 #undef PAVGB
00030 #undef PMINUB
00031 #undef PMAXUB
00032
/*
 * PAVGB(a,b): asm text for a packed unsigned byte average, written to b.
 * Uses "pavgb" when HAVE_MMX2 is set, "pavgusb" when only HAVE_AMD3DNOW is
 * set; REAL_PAVGB stays undefined when neither is set.
 * Because PAVGB forwards to REAL_PAVGB, its arguments are macro-expanded
 * before REAL_PAVGB stringifies them with '#'.
 */
00033 #if HAVE_MMX2
00034 #define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
00035 #elif HAVE_AMD3DNOW
00036 #define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
00037 #endif
00038 #define PAVGB(a,b) REAL_PAVGB(a,b)
00039
/*
 * PMINUB(x,y,t): asm text leaving the per-byte unsigned minimum of x and y
 * in the SECOND argument. With HAVE_MMX2 this is a single "pminub" and t is
 * unused. The plain-MMX fallback names its parameters (b,a,t) so that "a" is
 * the second argument; it computes a - saturate(a - b) and clobbers t.
 */
00040 #if HAVE_MMX2
00041 #define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
00042 #elif HAVE_MMX
00043 #define PMINUB(b,a,t) \
00044 "movq " #a ", " #t " \n\t"\
00045 "psubusb " #b ", " #t " \n\t"\
00046 "psubb " #t ", " #a " \n\t"
00047 #endif
00048
/*
 * PMAXUB(a,b): asm text leaving the per-byte unsigned maximum of a and b in
 * b. With HAVE_MMX2 this is "pmaxub"; the plain-MMX fallback computes
 * saturate(b - a) + a in place and needs no scratch register.
 */
00049 #if HAVE_MMX2
00050 #define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
00051 #elif HAVE_MMX
00052 #define PMAXUB(a,b) \
00053 "psubusb " #a ", " #b " \n\t"\
00054 "paddb " #a ", " #b " \n\t"
00055 #endif
00056
00057
00058 #if HAVE_MMX
00059
/**
 * Classify an 8-byte-wide, 8-row strip for the vertical filter decision (MMX).
 *
 * Starting 4 rows below src, the asm walks rows 0..7. For each of the 7
 * adjacent row pairs it tests, per byte, whether
 * (row[i] - row[i+1] + mmxDcOffset) > mmxDcThreshold (signed byte compare)
 * and counts the hits over all 8 byte columns (numEq). In parallel it keeps
 * the per-byte minimum and maximum over the 8 rows and checks whether any
 * byte's (max - min) exceeds twice the matching byte of c->pQPb
 * (saturating unsigned arithmetic).
 *
 * @param src    pixel data; 8 bytes are read from each examined row
 * @param stride offset in bytes between consecutive rows
 * @param c      context; reads mmxDcOffset[nonBQP], mmxDcThreshold[nonBQP],
 *               pQPb and ppMode.flatnessThreshold
 * @return 2 when numEq <= flatnessThreshold; otherwise 0 when dcOk is
 *         non-zero and 1 when it is zero. What the caller does with these
 *         codes is not visible in this block.
 *
 * NOTE(review): "dcOk" ends up non-zero when some byte's range EXCEEDS
 * 2*pQPb, so the name reads inverted relative to the value - confirm.
 */
00062 static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){
00063 int numEq= 0, dcOk;
00064 src+= stride*4;
// Preload the per-byte offset (mm7) and threshold (mm6) used below.
// NOTE(review): relies on mm6/mm7 staying intact between the two asm
// statements; nothing in the constraints tells the compiler about that.
00065 __asm__ volatile(
00066 "movq %0, %%mm7 \n\t"
00067 "movq %1, %%mm6 \n\t"
00068 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
00069 );
00070
00071 __asm__ volatile(
// REG_a = address of row 1
00072 "lea (%2, %3), %%"REG_a" \n\t"
00073
00074
00075
// rows 0 and 1: seed running min (mm3) / max (mm4); mm0 = first compare mask
00076 "movq (%2), %%mm0 \n\t"
00077 "movq (%%"REG_a"), %%mm1 \n\t"
00078 "movq %%mm0, %%mm3 \n\t"
00079 "movq %%mm0, %%mm4 \n\t"
00080 PMAXUB(%%mm1, %%mm4)
00081 PMINUB(%%mm1, %%mm3, %%mm5)
00082 "psubb %%mm1, %%mm0 \n\t"
00083 "paddb %%mm7, %%mm0 \n\t"
00084 "pcmpgtb %%mm6, %%mm0 \n\t"
00085
// row 2: update min/max, add the (row1 - row2) mask into mm0
00086 "movq (%%"REG_a",%3), %%mm2 \n\t"
00087 PMAXUB(%%mm2, %%mm4)
00088 PMINUB(%%mm2, %%mm3, %%mm5)
00089 "psubb %%mm2, %%mm1 \n\t"
00090 "paddb %%mm7, %%mm1 \n\t"
00091 "pcmpgtb %%mm6, %%mm1 \n\t"
00092 "paddb %%mm1, %%mm0 \n\t"
00093
// row 3
00094 "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
00095 PMAXUB(%%mm1, %%mm4)
00096 PMINUB(%%mm1, %%mm3, %%mm5)
00097 "psubb %%mm1, %%mm2 \n\t"
00098 "paddb %%mm7, %%mm2 \n\t"
00099 "pcmpgtb %%mm6, %%mm2 \n\t"
00100 "paddb %%mm2, %%mm0 \n\t"
00101
// REG_a advances by 4 rows: it now addresses row 5
00102 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t"
00103
// row 4
00104 "movq (%2, %3, 4), %%mm2 \n\t"
00105 PMAXUB(%%mm2, %%mm4)
00106 PMINUB(%%mm2, %%mm3, %%mm5)
00107 "psubb %%mm2, %%mm1 \n\t"
00108 "paddb %%mm7, %%mm1 \n\t"
00109 "pcmpgtb %%mm6, %%mm1 \n\t"
00110 "paddb %%mm1, %%mm0 \n\t"
00111
// row 5
00112 "movq (%%"REG_a"), %%mm1 \n\t"
00113 PMAXUB(%%mm1, %%mm4)
00114 PMINUB(%%mm1, %%mm3, %%mm5)
00115 "psubb %%mm1, %%mm2 \n\t"
00116 "paddb %%mm7, %%mm2 \n\t"
00117 "pcmpgtb %%mm6, %%mm2 \n\t"
00118 "paddb %%mm2, %%mm0 \n\t"
00119
// row 6
00120 "movq (%%"REG_a", %3), %%mm2 \n\t"
00121 PMAXUB(%%mm2, %%mm4)
00122 PMINUB(%%mm2, %%mm3, %%mm5)
00123 "psubb %%mm2, %%mm1 \n\t"
00124 "paddb %%mm7, %%mm1 \n\t"
00125 "pcmpgtb %%mm6, %%mm1 \n\t"
00126 "paddb %%mm1, %%mm0 \n\t"
00127
// row 7; afterwards mm4 = per-byte (max - min) over the 8 rows
00128 "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
00129 PMAXUB(%%mm1, %%mm4)
00130 PMINUB(%%mm1, %%mm3, %%mm5)
00131 "psubb %%mm1, %%mm2 \n\t"
00132 "paddb %%mm7, %%mm2 \n\t"
00133 "pcmpgtb %%mm6, %%mm2 \n\t"
00134 "paddb %%mm2, %%mm0 \n\t"
00135 "psubusb %%mm3, %%mm4 \n\t"
00136
00137 " \n\t"
// Horizontal sum of the 8 mask-count bytes of mm0. Each byte holds minus the
// number of hits for its column, hence the negation in C after the asm.
00138 #if HAVE_MMX2
00139 "pxor %%mm7, %%mm7 \n\t"
00140 "psadbw %%mm7, %%mm0 \n\t"
00141 #else
// without psadbw: fold with shifts; only the low byte holds the full sum
00142 "movq %%mm0, %%mm1 \n\t"
00143 "psrlw $8, %%mm0 \n\t"
00144 "paddb %%mm1, %%mm0 \n\t"
00145 "movq %%mm0, %%mm1 \n\t"
00146 "psrlq $16, %%mm0 \n\t"
00147 "paddb %%mm1, %%mm0 \n\t"
00148 "movq %%mm0, %%mm1 \n\t"
00149 "psrlq $32, %%mm0 \n\t"
00150 "paddb %%mm1, %%mm0 \n\t"
00151 #endif
// mm4 = saturate((max - min) - 2*pQPb); packed so that the low 32 bits moved
// into dcOk are non-zero iff any of the 8 bytes is non-zero
00152 "movq %4, %%mm7 \n\t"
00153 "paddusb %%mm7, %%mm7 \n\t"
00154 "psubusb %%mm7, %%mm4 \n\t"
00155 "packssdw %%mm4, %%mm4 \n\t"
00156 "movd %%mm0, %0 \n\t"
00157 "movd %%mm4, %1 \n\t"
00158
00159 : "=r" (numEq), "=r" (dcOk)
00160 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
00161 : "%"REG_a
00162 );
00163
// the accumulated bytes are negative counts: negate and keep the low byte
00164 numEq= (-numEq) &0xFF;
00165 if(numEq > c->ppMode.flatnessThreshold){
00166 if(dcOk) return 0;
00167 else return 1;
00168 }else{
00169 return 2;
00170 }
00171 }
00172 #endif //HAVE_MMX
00173
00178 #if !HAVE_ALTIVEC
/**
 * Vertical low-pass filter across a block edge.
 *
 * All row numbers below are relative to src + 3*stride. Rows 1..8 are
 * rewritten in place; rows 0 and 9 are only read. Two edge references are
 * derived first:
 *   first = row 0 if |row0 - row1| is within QP, otherwise row 1
 *   last  = row 9 if |row8 - row9| is within QP, otherwise row 8
 *
 * C path: for each of BLOCK_SIZE columns, builds running window sums padded
 * with first/last and writes (sums[i-1] + sums[i+1] + 2*row[i]) >> 4.
 *
 * MMX2/3DNow! path: processes 8 bytes per row at once and builds the output
 * from cascaded PAVGB byte averages.
 * NOTE(review): the asm compares with "<= pQPb" where the C path uses
 * "< QP", and the averaging cascade presumably rounds differently from the
 * C path - confirm whether bit-exactness between the paths is expected.
 *
 * @param src    pixel data (see row numbering above)
 * @param stride offset in bytes between consecutive rows
 * @param c      context; the asm path reads c->pQPb, the C path reads c->QP
 */
00179 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c)
00180 {
00181 #if HAVE_MMX2 || HAVE_AMD3DNOW
00182 src+= stride*3;
00183 __asm__ volatile(
00184 "movq %2, %%mm0 \n\t"
00185 "pxor %%mm4, %%mm4 \n\t"
00186
// first (mm6): mask = (|row0 - row1| <= pQPb); mm6 = mask ? row0 : row1
00187 "movq (%0), %%mm6 \n\t"
00188 "movq (%0, %1), %%mm5 \n\t"
00189 "movq %%mm5, %%mm1 \n\t"
00190 "movq %%mm6, %%mm2 \n\t"
00191 "psubusb %%mm6, %%mm5 \n\t"
00192 "psubusb %%mm1, %%mm2 \n\t"
00193 "por %%mm5, %%mm2 \n\t"
00194 "psubusb %%mm0, %%mm2 \n\t"
00195 "pcmpeqb %%mm4, %%mm2 \n\t"
00196
00197 "pand %%mm2, %%mm6 \n\t"
00198 "pandn %%mm1, %%mm2 \n\t"
00199 "por %%mm2, %%mm6 \n\t"
00200
// last (mm7): mm5 = row 8; REG_a = row 4, REG_c = row 7; then %0 is advanced
// by one row so that (%0, %1, 8) addresses row 9.
// mask = (|row8 - row9| <= pQPb); mm7 = mask ? row9 : row8
00201 "movq (%0, %1, 8), %%mm5 \n\t"
00202 "lea (%0, %1, 4), %%"REG_a" \n\t"
00203 "lea (%0, %1, 8), %%"REG_c" \n\t"
00204 "sub %1, %%"REG_c" \n\t"
00205 "add %1, %0 \n\t"
00206 "movq (%0, %1, 8), %%mm7 \n\t"
00207 "movq %%mm5, %%mm1 \n\t"
00208 "movq %%mm7, %%mm2 \n\t"
00209 "psubusb %%mm7, %%mm5 \n\t"
00210 "psubusb %%mm1, %%mm2 \n\t"
00211 "por %%mm5, %%mm2 \n\t"
00212 "psubusb %%mm0, %%mm2 \n\t"
00213 "pcmpeqb %%mm4, %%mm2 \n\t"
00214
00215 "pand %%mm2, %%mm7 \n\t"
00216 "pandn %%mm1, %%mm2 \n\t"
00217 "por %%mm2, %%mm7 \n\t"
00218
00219
00220
00221
00222
00223
00224
00225
// From here on %0 addresses row 1:
//   (%0)=row1 (%0,%1)=row2 (%0,%1,2)=row3 (REG_a)=row4
//   (%0,%1,4)=row5 (REG_a,%1,2)=row6 (REG_c)=row7 (REG_a,%1,4)=row8
00226 "movq (%0, %1), %%mm0 \n\t"
00227 "movq %%mm0, %%mm1 \n\t"
00228 PAVGB(%%mm6, %%mm0)
00229 PAVGB(%%mm6, %%mm0)
00230
00231 "movq (%0, %1, 4), %%mm2 \n\t"
00232 "movq %%mm2, %%mm5 \n\t"
00233 PAVGB((%%REGa), %%mm2)
00234 PAVGB((%0, %1, 2), %%mm2)
00235 "movq %%mm2, %%mm3 \n\t"
00236 "movq (%0), %%mm4 \n\t"
00237 PAVGB(%%mm4, %%mm3)
00238 PAVGB(%%mm0, %%mm3)
// store filtered row 1
00239 "movq %%mm3, (%0) \n\t"
00240
00241 "movq %%mm1, %%mm0 \n\t"
00242 PAVGB(%%mm6, %%mm0)
00243 "movq %%mm4, %%mm3 \n\t"
00244 PAVGB((%0,%1,2), %%mm3)
00245 PAVGB((%%REGa,%1,2), %%mm5)
00246 PAVGB((%%REGa), %%mm5)
00247 PAVGB(%%mm5, %%mm3)
00248 PAVGB(%%mm0, %%mm3)
// store filtered row 2
00249 "movq %%mm3, (%0,%1) \n\t"
00250
00251 PAVGB(%%mm4, %%mm6)
00252 "movq (%%"REG_c"), %%mm0 \n\t"
00253 PAVGB((%%REGa, %1, 2), %%mm0)
00254 "movq %%mm0, %%mm3 \n\t"
00255 PAVGB(%%mm1, %%mm0)
00256 PAVGB(%%mm6, %%mm0)
00257 PAVGB(%%mm2, %%mm0)
// keep the unfiltered row 3 in mm2, then store filtered row 3
00258 "movq (%0, %1, 2), %%mm2 \n\t"
00259 "movq %%mm0, (%0, %1, 2) \n\t"
00260
00261 "movq (%%"REG_a", %1, 4), %%mm0 \n\t"
00262 PAVGB((%%REGc), %%mm0)
00263 PAVGB(%%mm0, %%mm6)
00264 PAVGB(%%mm1, %%mm4)
00265 PAVGB(%%mm2, %%mm1)
00266 PAVGB(%%mm1, %%mm6)
00267 PAVGB(%%mm5, %%mm6)
// keep the unfiltered row 4 in mm5, then store filtered row 4
00268 "movq (%%"REG_a"), %%mm5 \n\t"
00269 "movq %%mm6, (%%"REG_a") \n\t"
00270
00271 "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
00272 PAVGB(%%mm7, %%mm6)
00273 PAVGB(%%mm4, %%mm6)
00274 PAVGB(%%mm3, %%mm6)
00275 PAVGB(%%mm5, %%mm2)
00276 "movq (%0, %1, 4), %%mm4 \n\t"
00277 PAVGB(%%mm4, %%mm2)
00278 PAVGB(%%mm2, %%mm6)
// store filtered row 5 (unfiltered copy stays in mm4)
00279 "movq %%mm6, (%0, %1, 4) \n\t"
00280
00281 PAVGB(%%mm7, %%mm1)
00282 PAVGB(%%mm4, %%mm5)
00283 PAVGB(%%mm5, %%mm0)
00284 "movq (%%"REG_a", %1, 2), %%mm6 \n\t"
00285 PAVGB(%%mm6, %%mm1)
00286 PAVGB(%%mm0, %%mm1)
// store filtered row 6 (unfiltered copy stays in mm6)
00287 "movq %%mm1, (%%"REG_a", %1, 2) \n\t"
00288
00289 PAVGB((%%REGc), %%mm2)
00290 "movq (%%"REG_a", %1, 4), %%mm0 \n\t"
00291 PAVGB(%%mm0, %%mm6)
00292 PAVGB(%%mm7, %%mm6)
00293 PAVGB(%%mm2, %%mm6)
// store filtered row 7
00294 "movq %%mm6, (%%"REG_c") \n\t"
00295
00296 PAVGB(%%mm7, %%mm5)
00297 PAVGB(%%mm7, %%mm5)
00298
00299 PAVGB(%%mm3, %%mm0)
00300 PAVGB(%%mm0, %%mm5)
// store filtered row 8, then undo the one-row advance of %0
00301 "movq %%mm5, (%%"REG_a", %1, 4) \n\t"
00302 "sub %1, %0 \n\t"
00303
// src is declared read-write ("+r") because the asm temporarily advances %0
// (add/sub above); GCC does not allow an input-only operand to be modified.
// Operand numbering is unchanged: %0 = src, %1 = stride, %2 = c->pQPb.
00304 : "+r" (src)
00305 : "r" ((x86_reg)stride), "m" (c->pQPb)
00306 : "%"REG_a, "%"REG_c
00307 );
00308 #else //HAVE_MMX2 || HAVE_AMD3DNOW
// byte offsets of rows 1..9 from the adjusted src
00309 const int l1= stride;
00310 const int l2= stride + l1;
00311 const int l3= stride + l2;
00312 const int l4= stride + l3;
00313 const int l5= stride + l4;
00314 const int l6= stride + l5;
00315 const int l7= stride + l6;
00316 const int l8= stride + l7;
00317 const int l9= stride + l8;
00318 int x;
00319 src+= stride*3;
00320 for(x=0; x<BLOCK_SIZE; x++){
// edge references: the outer row when it is close to its inner neighbour,
// otherwise the inner row itself
00321 const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1];
00322 const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8];
00323
// running window sums; sums[0] carries the +4 rounding term into all others
00324 int sums[10];
00325 sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4;
00326 sums[1] = sums[0] - first + src[l4];
00327 sums[2] = sums[1] - first + src[l5];
00328 sums[3] = sums[2] - first + src[l6];
00329 sums[4] = sums[3] - first + src[l7];
00330 sums[5] = sums[4] - src[l1] + src[l8];
00331 sums[6] = sums[5] - src[l2] + last;
00332 sums[7] = sums[6] - src[l3] + last;
00333 sums[8] = sums[7] - src[l4] + last;
00334 sums[9] = sums[8] - src[l5] + last;
00335
// each sums[] entry was computed from the unfiltered rows above, so writing
// the rows in place here does not feed back into later outputs
00336 src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4;
00337 src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4;
00338 src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4;
00339 src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4;
00340 src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4;
00341 src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4;
00342 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4;
00343 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4;
00344
00345 src++;
00346 }
00347 #endif //HAVE_MMX2 || HAVE_AMD3DNOW
00348 }
00349 #endif //!HAVE_ALTIVEC
00350
/**
 * Vertical "X1" deblocking filter.
 *
 * Rows are numbered relative to src + 3*stride. With
 *   a = row3 - row4,  b = row4 - row5,  c = row5 - row6
 * the C path computes d = max(|b| - ((|a| + |c|) >> 1), 0) and, when
 * d < 2*QP, moves rows 2..4 one way and rows 5..7 the other way by v>>3,
 * v>>2 and (3*v)>>3 (mirrored), where v = d * FFSIGN(-b). It loops over
 * BLOCK_SIZE columns.
 *
 * The MMX2/3DNow! path handles 8 bytes per row with saturating byte
 * arithmetic and PAVGB-based scaling.
 * NOTE(review): the asm tests "d <= 2*pQPb", subtracts b01 from d before
 * scaling and rounds through PAVGB, so its results presumably differ
 * slightly from the C path - confirm.
 *
 * @param src    pixel data (see row numbering above)
 * @param stride offset in bytes between consecutive rows
 * @param co     context; the asm path reads co->pQPb, the C path co->QP
 */
00358 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co)
00359 {
00360 #if HAVE_MMX2 || HAVE_AMD3DNOW
00361 src+= stride*3;
00362
00363 __asm__ volatile(
// mm7 = 0; REG_a = row 1, REG_c = row 5
00364 "pxor %%mm7, %%mm7 \n\t"
00365 "lea (%0, %1), %%"REG_a" \n\t"
00366 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
00367
00368
// mm0 = |row3 - row4|, mm2 = copy of row 4
00369 "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
00370 "movq (%0, %1, 4), %%mm1 \n\t"
00371 "movq %%mm1, %%mm2 \n\t"
00372 "psubusb %%mm0, %%mm1 \n\t"
00373 "psubusb %%mm2, %%mm0 \n\t"
00374 "por %%mm1, %%mm0 \n\t"
// mm3 = |row5 - row6|, mm5 = copy of row 5
00375 "movq (%%"REG_c"), %%mm3 \n\t"
00376 "movq (%%"REG_c", %1), %%mm4 \n\t"
00377 "movq %%mm3, %%mm5 \n\t"
00378 "psubusb %%mm4, %%mm3 \n\t"
00379 "psubusb %%mm5, %%mm4 \n\t"
00380 "por %%mm4, %%mm3 \n\t"
// mm0 = average of the two outer differences
00381 PAVGB(%%mm3, %%mm0)
// mm2 = 0xFF where row4 <= row5 (direction mask), mm4 = |row4 - row5|
00382 "movq %%mm2, %%mm1 \n\t"
00383 "psubusb %%mm5, %%mm2 \n\t"
00384 "movq %%mm2, %%mm4 \n\t"
00385 "pcmpeqb %%mm7, %%mm2 \n\t"
00386 "psubusb %%mm1, %%mm5 \n\t"
00387 "por %%mm5, %%mm4 \n\t"
// mm4 = mm3 = d = saturate(|row4 - row5| - average)
00388 "psubusb %%mm0, %%mm4 \n\t"
00389 "movq %%mm4, %%mm3 \n\t"
// mm4 = 0xFF where d <= 2*pQPb; mm3 = saturate(d - b01), zeroed elsewhere
00390 "movq %2, %%mm0 \n\t"
00391 "paddusb %%mm0, %%mm0 \n\t"
00392 "psubusb %%mm0, %%mm4 \n\t"
00393 "pcmpeqb %%mm7, %%mm4 \n\t"
00394 "psubusb "MANGLE(b01)", %%mm3 \n\t"
00395 "pand %%mm4, %%mm3 \n\t"
00396
// mm1 = about half of mm3; mm3 = average of half and quarter (about 3/8)
00397 PAVGB(%%mm7, %%mm3)
00398 "movq %%mm3, %%mm1 \n\t"
00399 PAVGB(%%mm7, %%mm3)
00400 PAVGB(%%mm1, %%mm3)
00401
// Rows 4 and 5 get the largest correction. XORing with the direction mask
// mm2 before and after the saturating sub/add reverses the direction of the
// adjustment for the bytes where the mask is set.
00402 "movq (%0, %1, 4), %%mm0 \n\t"
00403 "pxor %%mm2, %%mm0 \n\t"
00404 "psubusb %%mm3, %%mm0 \n\t"
00405 "pxor %%mm2, %%mm0 \n\t"
00406 "movq %%mm0, (%0, %1, 4) \n\t"
00407
00408 "movq (%%"REG_c"), %%mm0 \n\t"
00409 "pxor %%mm2, %%mm0 \n\t"
00410 "paddusb %%mm3, %%mm0 \n\t"
00411 "pxor %%mm2, %%mm0 \n\t"
00412 "movq %%mm0, (%%"REG_c") \n\t"
00413
// halve the correction kept in mm1, then apply it to rows 3 and 6
00414 PAVGB(%%mm7, %%mm1)
00415
00416 "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
00417 "pxor %%mm2, %%mm0 \n\t"
00418 "psubusb %%mm1, %%mm0 \n\t"
00419 "pxor %%mm2, %%mm0 \n\t"
00420 "movq %%mm0, (%%"REG_a", %1, 2) \n\t"
00421
00422 "movq (%%"REG_c", %1), %%mm0 \n\t"
00423 "pxor %%mm2, %%mm0 \n\t"
00424 "paddusb %%mm1, %%mm0 \n\t"
00425 "pxor %%mm2, %%mm0 \n\t"
00426 "movq %%mm0, (%%"REG_c", %1) \n\t"
00427
// halve again, then apply to rows 2 and 7
00428 PAVGB(%%mm7, %%mm1)
00429
00430 "movq (%%"REG_a", %1), %%mm0 \n\t"
00431 "pxor %%mm2, %%mm0 \n\t"
00432 "psubusb %%mm1, %%mm0 \n\t"
00433 "pxor %%mm2, %%mm0 \n\t"
00434 "movq %%mm0, (%%"REG_a", %1) \n\t"
00435
00436 "movq (%%"REG_c", %1, 2), %%mm0 \n\t"
00437 "pxor %%mm2, %%mm0 \n\t"
00438 "paddusb %%mm1, %%mm0 \n\t"
00439 "pxor %%mm2, %%mm0 \n\t"
00440 "movq %%mm0, (%%"REG_c", %1, 2) \n\t"
00441
00442 :
00443 : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb)
00444 : "%"REG_a, "%"REG_c
00445 );
00446 #else //HAVE_MMX2 || HAVE_AMD3DNOW
00447
// byte offsets of rows 1..7 from the adjusted src (l1 is not used below)
00448 const int l1= stride;
00449 const int l2= stride + l1;
00450 const int l3= stride + l2;
00451 const int l4= stride + l3;
00452 const int l5= stride + l4;
00453 const int l6= stride + l5;
00454 const int l7= stride + l6;
00455
00456
00457 int x;
00458
00459 src+= stride*3;
00460 for(x=0; x<BLOCK_SIZE; x++){
// differences between neighbouring rows around the row4/row5 boundary
00461 int a= src[l3] - src[l4];
00462 int b= src[l4] - src[l5];
00463 int c= src[l5] - src[l6];
00464
// d = how much the middle step exceeds the mean of its neighbours, >= 0
00465 int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1);
00466 d= FFMAX(d, 0);
00467
00468 if(d < co->QP*2){
// v carries the opposite sign of b, so the updates below shrink the step
00469 int v = d * FFSIGN(-b);
00470
00471 src[l2] +=v>>3;
00472 src[l3] +=v>>2;
00473 src[l4] +=(3*v)>>3;
00474 src[l5] -=(3*v)>>3;
00475 src[l6] -=v>>2;
00476 src[l7] -=v>>3;
00477 }
00478 src++;
00479 }
00480 #endif //HAVE_MMX2 || HAVE_AMD3DNOW
00481 }
00482
00483 #if !HAVE_ALTIVEC
/**
 * Default vertical deblocking filter; three configuration-dependent paths.
 *
 * C path (rows relative to src + 3*stride, BLOCK_SIZE columns):
 *   middleEnergy = 5*(row5 - row4) + 2*(row3 - row6)
 *   if |middleEnergy| < 8*QP:
 *     d = max(|middleEnergy| - min(|leftEnergy|, |rightEnergy|), 0)
 *     d = (5*d + 32) >> 6, signed by FFSIGN(-middleEnergy), then clamped to
 *     the range between 0 and q = (row4 - row5)/2
 *     row4 -= d; row5 += d
 *
 * HAVE_MMX path (rows relative to src + 4*stride, 8 bytes per row): widens
 * bytes to 16-bit words and evaluates 2*r0 - 5*r1 + 5*r2 - 2*r3 style terms
 * for rows 0..3, 2..5 and 4..7, then adjusts rows 3 and 4.
 *
 * HAVE_MMX2 || HAVE_AMD3DNOW path (rows relative to src + 4*stride): a
 * byte-wise PAVGB-based variant that also writes only rows 3 and 4.
 * NOTE(review): presumably an approximation of the same computation - the
 * disabled "#if 0" variant is labelled "slightly more accurate" - confirm.
 *
 * Both asm paths offset src by 4 rows and write rows 3 and 4, i.e. the same
 * two memory rows the C path writes as row4/row5 after its 3-row offset.
 *
 * @param src    pixel data
 * @param stride offset in bytes between consecutive rows
 * @param c      context; asm paths read c->pQPb, the C path reads c->QP
 */
00484 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c)
00485 {
00486 #if HAVE_MMX2 || HAVE_AMD3DNOW
00487
00488
00489
00490
00491
00492
00493
00494
00495
00496
00497
00498
00499
00500
00501 src+= stride*4;
00502 __asm__ volatile(
00503
// Disabled alternative implementation, kept for reference only.
00504 #if 0 //slightly more accurate and slightly slower
00505 "pxor %%mm7, %%mm7 \n\t"
00506 "lea (%0, %1), %%"REG_a" \n\t"
00507 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
00508
00509
00510
00511
00512
00513 "movq (%0, %1, 2), %%mm0 \n\t"
00514 "movq (%0), %%mm1 \n\t"
00515 "movq %%mm0, %%mm2 \n\t"
00516 PAVGB(%%mm7, %%mm0)
00517 PAVGB(%%mm1, %%mm0)
00518 PAVGB(%%mm2, %%mm0)
00519
00520 "movq (%%"REG_a"), %%mm1 \n\t"
00521 "movq (%%"REG_a", %1, 2), %%mm3 \n\t"
00522 "movq %%mm1, %%mm4 \n\t"
00523 PAVGB(%%mm7, %%mm1)
00524 PAVGB(%%mm3, %%mm1)
00525 PAVGB(%%mm4, %%mm1)
00526
00527 "movq %%mm0, %%mm4 \n\t"
00528 "psubusb %%mm1, %%mm0 \n\t"
00529 "psubusb %%mm4, %%mm1 \n\t"
00530 "por %%mm0, %%mm1 \n\t"
00531
00532
00533 "movq (%0, %1, 4), %%mm0 \n\t"
00534 "movq %%mm0, %%mm4 \n\t"
00535 PAVGB(%%mm7, %%mm0)
00536 PAVGB(%%mm2, %%mm0)
00537 PAVGB(%%mm4, %%mm0)
00538
00539 "movq (%%"REG_c"), %%mm2 \n\t"
00540 "movq %%mm3, %%mm5 \n\t"
00541 PAVGB(%%mm7, %%mm3)
00542 PAVGB(%%mm2, %%mm3)
00543 PAVGB(%%mm5, %%mm3)
00544
00545 "movq %%mm0, %%mm6 \n\t"
00546 "psubusb %%mm3, %%mm0 \n\t"
00547 "psubusb %%mm6, %%mm3 \n\t"
00548 "por %%mm0, %%mm3 \n\t"
00549 "pcmpeqb %%mm7, %%mm0 \n\t"
00550
00551
00552 "movq (%%"REG_c", %1), %%mm6 \n\t"
00553 "movq %%mm6, %%mm5 \n\t"
00554 PAVGB(%%mm7, %%mm6)
00555 PAVGB(%%mm4, %%mm6)
00556 PAVGB(%%mm5, %%mm6)
00557
00558 "movq (%%"REG_c", %1, 2), %%mm5 \n\t"
00559 "movq %%mm2, %%mm4 \n\t"
00560 PAVGB(%%mm7, %%mm2)
00561 PAVGB(%%mm5, %%mm2)
00562 PAVGB(%%mm4, %%mm2)
00563
00564 "movq %%mm6, %%mm4 \n\t"
00565 "psubusb %%mm2, %%mm6 \n\t"
00566 "psubusb %%mm4, %%mm2 \n\t"
00567 "por %%mm6, %%mm2 \n\t"
00568
00569
00570
00571 PMINUB(%%mm2, %%mm1, %%mm4)
00572 "movq %2, %%mm4 \n\t"
00573 "paddusb "MANGLE(b01)", %%mm4 \n\t"
00574 "pcmpgtb %%mm3, %%mm4 \n\t"
00575 "psubusb %%mm1, %%mm3 \n\t"
00576 "pand %%mm4, %%mm3 \n\t"
00577
00578 "movq %%mm3, %%mm1 \n\t"
00579
00580 PAVGB(%%mm7, %%mm3)
00581 PAVGB(%%mm7, %%mm3)
00582 "paddusb %%mm1, %%mm3 \n\t"
00583
00584
00585 "movq (%%"REG_a", %1, 2), %%mm6 \n\t"
00586 "movq (%0, %1, 4), %%mm5 \n\t"
00587 "movq (%0, %1, 4), %%mm4 \n\t"
00588 "psubusb %%mm6, %%mm5 \n\t"
00589 "psubusb %%mm4, %%mm6 \n\t"
00590 "por %%mm6, %%mm5 \n\t"
00591 "pcmpeqb %%mm7, %%mm6 \n\t"
00592 "pxor %%mm6, %%mm0 \n\t"
00593 "pand %%mm0, %%mm3 \n\t"
00594 PMINUB(%%mm5, %%mm3, %%mm0)
00595
00596 "psubusb "MANGLE(b01)", %%mm3 \n\t"
00597 PAVGB(%%mm7, %%mm3)
00598
00599 "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
00600 "movq (%0, %1, 4), %%mm2 \n\t"
00601 "pxor %%mm6, %%mm0 \n\t"
00602 "pxor %%mm6, %%mm2 \n\t"
00603 "psubb %%mm3, %%mm0 \n\t"
00604 "paddb %%mm3, %%mm2 \n\t"
00605 "pxor %%mm6, %%mm0 \n\t"
00606 "pxor %%mm6, %%mm2 \n\t"
00607 "movq %%mm0, (%%"REG_a", %1, 2) \n\t"
00608 "movq %%mm2, (%0, %1, 4) \n\t"
00609 #endif //0
00610
// Active byte-wise implementation. REG_a = row 1; mm6 = all ones, used via
// pxor to complement rows before they are averaged.
00611 "lea (%0, %1), %%"REG_a" \n\t"
00612 "pcmpeqb %%mm6, %%mm6 \n\t"
00613
00614
00615
00616
00617
// mm1 = ~row3, mm0 = avg(~row3, row4)
00618 "movq (%%"REG_a", %1, 2), %%mm1 \n\t"
00619 "movq (%0, %1, 4), %%mm0 \n\t"
00620 "pxor %%mm6, %%mm1 \n\t"
00621 PAVGB(%%mm1, %%mm0)
00622
00623
// mm4: term built from rows 2..5 around the b80 bias; REG_c = row 5
00624 "movq (%%"REG_a", %1, 4), %%mm2 \n\t"
00625 "movq (%%"REG_a", %1), %%mm3 \n\t"
00626 "pxor %%mm6, %%mm2 \n\t"
00627 "movq %%mm2, %%mm5 \n\t"
00628 "movq "MANGLE(b80)", %%mm4 \n\t"
00629 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
00630 PAVGB(%%mm3, %%mm2)
00631 PAVGB(%%mm0, %%mm4)
00632 PAVGB(%%mm2, %%mm4)
00633 PAVGB(%%mm0, %%mm4)
00634
00635
// mm3: corresponding term built from rows 0..3
00636 "movq (%%"REG_a"), %%mm2 \n\t"
00637 "pxor %%mm6, %%mm2 \n\t"
00638 PAVGB(%%mm3, %%mm2)
00639 PAVGB((%0), %%mm1)
00640 "movq "MANGLE(b80)", %%mm3 \n\t"
00641 PAVGB(%%mm2, %%mm3)
00642 PAVGB(%%mm1, %%mm3)
00643 PAVGB(%%mm2, %%mm3)
00644
00645
// mm2: corresponding term built from rows 4..7
00646 PAVGB((%%REGc, %1), %%mm5)
00647 "movq (%%"REG_c", %1, 2), %%mm1 \n\t"
00648 "pxor %%mm6, %%mm1 \n\t"
00649 PAVGB((%0, %1, 4), %%mm1)
00650 "movq "MANGLE(b80)", %%mm2 \n\t"
00651 PAVGB(%%mm5, %%mm2)
00652 PAVGB(%%mm1, %%mm2)
00653 PAVGB(%%mm5, %%mm2)
00654
00655
// fold mm2 and mm3 with their negations (PMAXUB), then mm3 = min of the two
00656 "movq "MANGLE(b00)", %%mm1 \n\t"
00657 "movq "MANGLE(b00)", %%mm5 \n\t"
00658 "psubb %%mm2, %%mm1 \n\t"
00659 "psubb %%mm3, %%mm5 \n\t"
00660 PMAXUB(%%mm1, %%mm2)
00661 PMAXUB(%%mm5, %%mm3)
00662 PMINUB(%%mm2, %%mm3, %%mm1)
00663
00664
00665
// mm2 = threshold derived from pQPb; mm1 = sign mask of mm4; mm4 = |mm4|;
// mm2 becomes the "below threshold" mask; mm4 = saturate(|mm4| - mm3)
00666 "movq "MANGLE(b00)", %%mm7 \n\t"
00667 "movq %2, %%mm2 \n\t"
00668 PAVGB(%%mm6, %%mm2)
00669 "psubb %%mm6, %%mm2 \n\t"
00670
00671 "movq %%mm4, %%mm1 \n\t"
00672 "pcmpgtb %%mm7, %%mm1 \n\t"
00673 "pxor %%mm1, %%mm4 \n\t"
00674 "psubb %%mm1, %%mm4 \n\t"
00675 "pcmpgtb %%mm4, %%mm2 \n\t"
00676 "psubusb %%mm3, %%mm4 \n\t"
00677
00678
// scale the correction and keep it only where the threshold mask is set
00679 "movq %%mm4, %%mm3 \n\t"
00680 "psubusb "MANGLE(b01)", %%mm4 \n\t"
00681 PAVGB(%%mm7, %%mm4)
00682 PAVGB(%%mm7, %%mm4)
00683 "paddb %%mm3, %%mm4 \n\t"
00684 "pand %%mm2, %%mm4 \n\t"
00685
// mm5 = limit derived from mm0 (the row3/row4 average), mm7 = its sign mask
00686 "movq "MANGLE(b80)", %%mm5 \n\t"
00687 "psubb %%mm0, %%mm5 \n\t"
00688 "paddsb %%mm6, %%mm5 \n\t"
00689 "pcmpgtb %%mm5, %%mm7 \n\t"
00690 "pxor %%mm7, %%mm5 \n\t"
00691
// clamp the correction to that limit; drop it where the sign masks in mm1
// and mm7 agree (mm7 ^ mm1 is zero for those bytes)
00692 PMINUB(%%mm5, %%mm4, %%mm3)
00693 "pxor %%mm1, %%mm7 \n\t"
00694
// apply: row 3 is increased and row 4 decreased by mm4, with the direction
// reversed through the mm1 sign mask (pxor before and after)
00695 "pand %%mm7, %%mm4 \n\t"
00696 "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
00697 "movq (%0, %1, 4), %%mm2 \n\t"
00698 "pxor %%mm1, %%mm0 \n\t"
00699 "pxor %%mm1, %%mm2 \n\t"
00700 "paddb %%mm4, %%mm0 \n\t"
00701 "psubb %%mm4, %%mm2 \n\t"
00702 "pxor %%mm1, %%mm0 \n\t"
00703 "pxor %%mm1, %%mm2 \n\t"
00704 "movq %%mm0, (%%"REG_a", %1, 2) \n\t"
00705 "movq %%mm2, (%0, %1, 4) \n\t"
00706
00707 :
00708 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
00709 : "%"REG_a, "%"REG_c
00710 );
00711
00712
00713
00714
00715
00716
00717
00718
00719
00720
00721
00722
00723
00724
00725
00726
00727
00728
00729
00730
00731
00732
00733
00734
00735
00736
00737
00738
00739
00740
00741
00742
00743
00744
00745
00746
00747
00748
00749
00750
00751
00752
00753
00754
00755
00756
00757
00758
00759
00760
00761
00762
00763
00764
00765
00766
// Plain-MMX path: 16-bit word arithmetic. Each row is split into a low half
// (even-numbered mm register) and a high half (odd-numbered one).
// NOTE(review): this branch is only compiled when HAVE_MMX2 is 0 (see the
// enclosing #if), so the inner "#if HAVE_MMX2" alternatives below look
// unreachable here - confirm before relying on them.
00767 #elif HAVE_MMX
// scratch: tmp[0..1] = first energy term, tmp[2..3] = row3 - row4
00768 DECLARE_ALIGNED(8, uint64_t, tmp)[4];
00769 src+= stride*4;
00770 __asm__ volatile(
00771 "pxor %%mm7, %%mm7 \n\t"
00772
00773
00774
00775
// mm0/mm1 = row 0 widened to words
00776 "movq (%0), %%mm0 \n\t"
00777 "movq %%mm0, %%mm1 \n\t"
00778 "punpcklbw %%mm7, %%mm0 \n\t"
00779 "punpckhbw %%mm7, %%mm1 \n\t"
00780
// mm2/mm3 = row 1; REG_a = row 2
00781 "movq (%0, %1), %%mm2 \n\t"
00782 "lea (%0, %1, 2), %%"REG_a" \n\t"
00783 "movq %%mm2, %%mm3 \n\t"
00784 "punpcklbw %%mm7, %%mm2 \n\t"
00785 "punpckhbw %%mm7, %%mm3 \n\t"
00786
// mm4/mm5 = row 2
00787 "movq (%%"REG_a"), %%mm4 \n\t"
00788 "movq %%mm4, %%mm5 \n\t"
00789 "punpcklbw %%mm7, %%mm4 \n\t"
00790 "punpckhbw %%mm7, %%mm5 \n\t"
00791
// mm0/mm1 = 2*row0 - (row1 - row2)
00792 "paddw %%mm0, %%mm0 \n\t"
00793 "paddw %%mm1, %%mm1 \n\t"
00794 "psubw %%mm4, %%mm2 \n\t"
00795 "psubw %%mm5, %%mm3 \n\t"
00796 "psubw %%mm2, %%mm0 \n\t"
00797 "psubw %%mm3, %%mm1 \n\t"
00798
// ... - 4*(row1 - row2)  =>  2*row0 - 5*row1 + 5*row2
00799 "psllw $2, %%mm2 \n\t"
00800 "psllw $2, %%mm3 \n\t"
00801 "psubw %%mm2, %%mm0 \n\t"
00802 "psubw %%mm3, %%mm1 \n\t"
00803
// mm2/mm3 = row 3
00804 "movq (%%"REG_a", %1), %%mm2 \n\t"
00805 "movq %%mm2, %%mm3 \n\t"
00806 "punpcklbw %%mm7, %%mm2 \n\t"
00807 "punpckhbw %%mm7, %%mm3 \n\t"
00808
// first term = 2*row0 - 5*row1 + 5*row2 - 2*row3, saved to tmp[0..1]
00809 "psubw %%mm2, %%mm0 \n\t"
00810 "psubw %%mm3, %%mm1 \n\t"
00811 "psubw %%mm2, %%mm0 \n\t"
00812 "psubw %%mm3, %%mm1 \n\t"
00813 "movq %%mm0, (%3) \n\t"
00814 "movq %%mm1, 8(%3) \n\t"
00815
// mm0/mm1 = row 4
00816 "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
00817 "movq %%mm0, %%mm1 \n\t"
00818 "punpcklbw %%mm7, %%mm0 \n\t"
00819 "punpckhbw %%mm7, %%mm1 \n\t"
00820
// mm2/mm3 = row3 - row4, saved to tmp[2..3]; mm4/mm5 = 2*row2 - (row3 - row4)
00821 "psubw %%mm0, %%mm2 \n\t"
00822 "psubw %%mm1, %%mm3 \n\t"
00823 "movq %%mm2, 16(%3) \n\t"
00824 "movq %%mm3, 24(%3) \n\t"
00825 "paddw %%mm4, %%mm4 \n\t"
00826 "paddw %%mm5, %%mm5 \n\t"
00827 "psubw %%mm2, %%mm4 \n\t"
00828 "psubw %%mm3, %%mm5 \n\t"
00829
// %0 now addresses row 3 (src is a read-write operand); mm4/mm5 becomes
// 2*row2 - 5*row3 + 5*row4
00830 "lea (%%"REG_a", %1), %0 \n\t"
00831 "psllw $2, %%mm2 \n\t"
00832 "psllw $2, %%mm3 \n\t"
00833 "psubw %%mm2, %%mm4 \n\t"
00834 "psubw %%mm3, %%mm5 \n\t"
00835
// mm2/mm3 = row 5; middle term = 2*row2 - 5*row3 + 5*row4 - 2*row5
00836 "movq (%0, %1, 2), %%mm2 \n\t"
00837 "movq %%mm2, %%mm3 \n\t"
00838 "punpcklbw %%mm7, %%mm2 \n\t"
00839 "punpckhbw %%mm7, %%mm3 \n\t"
00840 "psubw %%mm2, %%mm4 \n\t"
00841 "psubw %%mm3, %%mm5 \n\t"
00842 "psubw %%mm2, %%mm4 \n\t"
00843 "psubw %%mm3, %%mm5 \n\t"
00844
// mm2/mm3 = row5 - row6 (row 6 is loaded twice, once per half)
00845 "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
00846 "punpcklbw %%mm7, %%mm6 \n\t"
00847 "psubw %%mm6, %%mm2 \n\t"
00848 "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
00849 "punpckhbw %%mm7, %%mm6 \n\t"
00850 "psubw %%mm6, %%mm3 \n\t"
00851
// mm0/mm1 = 2*row4 - (row5 - row6)
00852 "paddw %%mm0, %%mm0 \n\t"
00853 "paddw %%mm1, %%mm1 \n\t"
00854 "psubw %%mm2, %%mm0 \n\t"
00855 "psubw %%mm3, %%mm1 \n\t"
00856
// ... - 4*(row5 - row6)
00857 "psllw $2, %%mm2 \n\t"
00858 "psllw $2, %%mm3 \n\t"
00859 "psubw %%mm2, %%mm0 \n\t"
00860 "psubw %%mm3, %%mm1 \n\t"
00861
// mm2/mm3 = row 7
00862 "movq (%0, %1, 4), %%mm2 \n\t"
00863 "movq %%mm2, %%mm3 \n\t"
00864 "punpcklbw %%mm7, %%mm2 \n\t"
00865 "punpckhbw %%mm7, %%mm3 \n\t"
00866
// last term = 2*row4 - 5*row5 + 5*row6 - 2*row7 in mm0/mm1
00867 "paddw %%mm2, %%mm2 \n\t"
00868 "paddw %%mm3, %%mm3 \n\t"
00869 "psubw %%mm2, %%mm0 \n\t"
00870 "psubw %%mm3, %%mm1 \n\t"
00871
// reload the first term into mm2/mm3
00872 "movq (%3), %%mm2 \n\t"
00873 "movq 8(%3), %%mm3 \n\t"
00874
// absolute values of the first and last terms
00875 #if HAVE_MMX2
00876 "movq %%mm7, %%mm6 \n\t"
00877 "psubw %%mm0, %%mm6 \n\t"
00878 "pmaxsw %%mm6, %%mm0 \n\t"
00879 "movq %%mm7, %%mm6 \n\t"
00880 "psubw %%mm1, %%mm6 \n\t"
00881 "pmaxsw %%mm6, %%mm1 \n\t"
00882 "movq %%mm7, %%mm6 \n\t"
00883 "psubw %%mm2, %%mm6 \n\t"
00884 "pmaxsw %%mm6, %%mm2 \n\t"
00885 "movq %%mm7, %%mm6 \n\t"
00886 "psubw %%mm3, %%mm6 \n\t"
00887 "pmaxsw %%mm6, %%mm3 \n\t"
00888 #else
// abs via sign mask: (x ^ mask) - mask
00889 "movq %%mm7, %%mm6 \n\t"
00890 "pcmpgtw %%mm0, %%mm6 \n\t"
00891 "pxor %%mm6, %%mm0 \n\t"
00892 "psubw %%mm6, %%mm0 \n\t"
00893 "movq %%mm7, %%mm6 \n\t"
00894 "pcmpgtw %%mm1, %%mm6 \n\t"
00895 "pxor %%mm6, %%mm1 \n\t"
00896 "psubw %%mm6, %%mm1 \n\t"
00897 "movq %%mm7, %%mm6 \n\t"
00898 "pcmpgtw %%mm2, %%mm6 \n\t"
00899 "pxor %%mm6, %%mm2 \n\t"
00900 "psubw %%mm6, %%mm2 \n\t"
00901 "movq %%mm7, %%mm6 \n\t"
00902 "pcmpgtw %%mm3, %%mm6 \n\t"
00903 "pxor %%mm6, %%mm3 \n\t"
00904 "psubw %%mm6, %%mm3 \n\t"
00905 #endif
00906
// mm0/mm1 = min(|first term|, |last term|)
00907 #if HAVE_MMX2
00908 "pminsw %%mm2, %%mm0 \n\t"
00909 "pminsw %%mm3, %%mm1 \n\t"
00910 #else
00911 "movq %%mm0, %%mm6 \n\t"
00912 "psubusw %%mm2, %%mm6 \n\t"
00913 "psubw %%mm6, %%mm0 \n\t"
00914 "movq %%mm1, %%mm6 \n\t"
00915 "psubusw %%mm3, %%mm6 \n\t"
00916 "psubw %%mm6, %%mm1 \n\t"
00917 #endif
00918
// mm2 = low 4 bytes of pQPb widened to words; it is reused for the high
// half too. NOTE(review): assumes all pQPb bytes carry the same value - confirm.
00919 "movd %2, %%mm2 \n\t"
00920 "punpcklbw %%mm7, %%mm2 \n\t"
00921
// mm4/mm5 = |middle term|; mm6/mm7 keep its sign masks (mm7 is no longer 0)
00922 "movq %%mm7, %%mm6 \n\t"
00923 "pcmpgtw %%mm4, %%mm6 \n\t"
00924 "pxor %%mm6, %%mm4 \n\t"
00925 "psubw %%mm6, %%mm4 \n\t"
00926 "pcmpgtw %%mm5, %%mm7 \n\t"
00927 "pxor %%mm7, %%mm5 \n\t"
00928 "psubw %%mm7, %%mm5 \n\t"
00929
// zero the middle term where it is not below 8*QP
00930 "psllw $3, %%mm2 \n\t"
00931 "movq %%mm2, %%mm3 \n\t"
00932 "pcmpgtw %%mm4, %%mm2 \n\t"
00933 "pcmpgtw %%mm5, %%mm3 \n\t"
00934 "pand %%mm2, %%mm4 \n\t"
00935 "pand %%mm3, %%mm5 \n\t"
00936
00937
// d = saturate(|middle| - min(|first|, |last|))
00938 "psubusw %%mm0, %%mm4 \n\t"
00939 "psubusw %%mm1, %%mm5 \n\t"
00940
00941
// d = (d * w05 + w20) >> 6
// (presumably w05 = 5 and w20 = 0x20 per word, matching the C path's
// (5*d + 32) >> 6 - confirm against the constant definitions)
00942 "movq "MANGLE(w05)", %%mm2 \n\t"
00943 "pmullw %%mm2, %%mm4 \n\t"
00944 "pmullw %%mm2, %%mm5 \n\t"
00945 "movq "MANGLE(w20)", %%mm2 \n\t"
00946 "paddw %%mm2, %%mm4 \n\t"
00947 "paddw %%mm2, %%mm5 \n\t"
00948 "psrlw $6, %%mm4 \n\t"
00949 "psrlw $6, %%mm5 \n\t"
00950
// mm0/mm1 = |row3 - row4| >> 1, mm2/mm3 = sign masks of (row3 - row4)
00951 "movq 16(%3), %%mm0 \n\t"
00952 "movq 24(%3), %%mm1 \n\t"
00953
00954 "pxor %%mm2, %%mm2 \n\t"
00955 "pxor %%mm3, %%mm3 \n\t"
00956
00957 "pcmpgtw %%mm0, %%mm2 \n\t"
00958 "pcmpgtw %%mm1, %%mm3 \n\t"
00959 "pxor %%mm2, %%mm0 \n\t"
00960 "pxor %%mm3, %%mm1 \n\t"
00961 "psubw %%mm2, %%mm0 \n\t"
00962 "psubw %%mm3, %%mm1 \n\t"
00963 "psrlw $1, %%mm0 \n\t"
00964 "psrlw $1, %%mm1 \n\t"
00965
// keep d only where the sign of (row3 - row4) differs from the middle term's
00966 "pxor %%mm6, %%mm2 \n\t"
00967 "pxor %%mm7, %%mm3 \n\t"
00968 "pand %%mm2, %%mm4 \n\t"
00969 "pand %%mm3, %%mm5 \n\t"
00970
// d = min(d, |row3 - row4| >> 1)
00971 #if HAVE_MMX2
00972 "pminsw %%mm0, %%mm4 \n\t"
00973 "pminsw %%mm1, %%mm5 \n\t"
00974 #else
00975 "movq %%mm4, %%mm2 \n\t"
00976 "psubusw %%mm0, %%mm2 \n\t"
00977 "psubw %%mm2, %%mm4 \n\t"
00978 "movq %%mm5, %%mm2 \n\t"
00979 "psubusw %%mm1, %%mm2 \n\t"
00980 "psubw %%mm2, %%mm5 \n\t"
00981 #endif
// give d the sign of the middle term, pack to signed bytes, then
// row3 += d and row4 -= d (%0 addresses row 3 here)
00982 "pxor %%mm6, %%mm4 \n\t"
00983 "pxor %%mm7, %%mm5 \n\t"
00984 "psubw %%mm6, %%mm4 \n\t"
00985 "psubw %%mm7, %%mm5 \n\t"
00986 "packsswb %%mm5, %%mm4 \n\t"
00987 "movq (%0), %%mm0 \n\t"
00988 "paddb %%mm4, %%mm0 \n\t"
00989 "movq %%mm0, (%0) \n\t"
00990 "movq (%0, %1), %%mm0 \n\t"
00991 "psubb %%mm4, %%mm0 \n\t"
00992 "movq %%mm0, (%0, %1) \n\t"
00993
00994 : "+r" (src)
00995 : "r" ((x86_reg)stride), "m" (c->pQPb), "r"(tmp)
00996 : "%"REG_a
00997 );
00998 #else //!(HAVE_MMX2 || HAVE_AMD3DNOW) && !HAVE_MMX
// byte offsets of rows 1..8 from the adjusted src
00999 const int l1= stride;
01000 const int l2= stride + l1;
01001 const int l3= stride + l2;
01002 const int l4= stride + l3;
01003 const int l5= stride + l4;
01004 const int l6= stride + l5;
01005 const int l7= stride + l6;
01006 const int l8= stride + l7;
01007
01008 int x;
01009 src+= stride*3;
01010 for(x=0; x<BLOCK_SIZE; x++){
01011 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]);
// only filter where the step across the row4/row5 boundary is small vs. QP
01012 if(FFABS(middleEnergy) < 8*c->QP){
// q bounds the correction: at most half the row4/row5 difference
01013 const int q=(src[l4] - src[l5])/2;
01014 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]);
01015 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]);
01016
01017 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) );
01018 d= FFMAX(d, 0);
01019
01020 d= (5*d + 32) >> 6;
01021 d*= FFSIGN(-middleEnergy);
01022
// clamp d to the range between 0 and q, whichever sign q has
01023 if(q>0){
01024 d= d<0 ? 0 : d;
01025 d= d>q ? q : d;
01026 }else{
01027 d= d>0 ? 0 : d;
01028 d= d<q ? q : d;
01029 }
01030
01031 src[l4]-= d;
01032 src[l5]+= d;
01033 }
01034 src++;
01035 }
01036 #endif //HAVE_MMX2 || HAVE_AMD3DNOW, HAVE_MMX, or plain C
01037 }
01038 #endif //!HAVE_ALTIVEC
01039
01040 #if !HAVE_ALTIVEC
01041 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c)
01042 {
01043 #if HAVE_MMX2 || HAVE_AMD3DNOW
01044 DECLARE_ALIGNED(8, uint64_t, tmp)[3];
01045 __asm__ volatile(
01046 "pxor %%mm6, %%mm6 \n\t"
01047 "pcmpeqb %%mm7, %%mm7 \n\t"
01048 "movq %2, %%mm0 \n\t"
01049 "punpcklbw %%mm6, %%mm0 \n\t"
01050 "psrlw $1, %%mm0 \n\t"
01051 "psubw %%mm7, %%mm0 \n\t"
01052 "packuswb %%mm0, %%mm0 \n\t"
01053 "movq %%mm0, %3 \n\t"
01054
01055 "lea (%0, %1), %%"REG_a" \n\t"
01056 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
01057
01058
01059
01060
01061 #undef REAL_FIND_MIN_MAX
01062 #undef FIND_MIN_MAX
01063 #if HAVE_MMX2
01064 #define REAL_FIND_MIN_MAX(addr)\
01065 "movq " #addr ", %%mm0 \n\t"\
01066 "pminub %%mm0, %%mm7 \n\t"\
01067 "pmaxub %%mm0, %%mm6 \n\t"
01068 #else
01069 #define REAL_FIND_MIN_MAX(addr)\
01070 "movq " #addr ", %%mm0 \n\t"\
01071 "movq %%mm7, %%mm1 \n\t"\
01072 "psubusb %%mm0, %%mm6 \n\t"\
01073 "paddb %%mm0, %%mm6 \n\t"\
01074 "psubusb %%mm0, %%mm1 \n\t"\
01075 "psubb %%mm1, %%mm7 \n\t"
01076 #endif
01077 #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr)
01078
01079 FIND_MIN_MAX((%%REGa))
01080 FIND_MIN_MAX((%%REGa, %1))
01081 FIND_MIN_MAX((%%REGa, %1, 2))
01082 FIND_MIN_MAX((%0, %1, 4))
01083 FIND_MIN_MAX((%%REGd))
01084 FIND_MIN_MAX((%%REGd, %1))
01085 FIND_MIN_MAX((%%REGd, %1, 2))
01086 FIND_MIN_MAX((%0, %1, 8))
01087
01088 "movq %%mm7, %%mm4 \n\t"
01089 "psrlq $8, %%mm7 \n\t"
01090 #if HAVE_MMX2
01091 "pminub %%mm4, %%mm7 \n\t"
01092 "pshufw $0xF9, %%mm7, %%mm4 \n\t"
01093 "pminub %%mm4, %%mm7 \n\t"
01094 "pshufw $0xFE, %%mm7, %%mm4 \n\t"
01095 "pminub %%mm4, %%mm7 \n\t"
01096 #else
01097 "movq %%mm7, %%mm1 \n\t"
01098 "psubusb %%mm4, %%mm1 \n\t"
01099 "psubb %%mm1, %%mm7 \n\t"
01100 "movq %%mm7, %%mm4 \n\t"
01101 "psrlq $16, %%mm7 \n\t"
01102 "movq %%mm7, %%mm1 \n\t"
01103 "psubusb %%mm4, %%mm1 \n\t"
01104 "psubb %%mm1, %%mm7 \n\t"
01105 "movq %%mm7, %%mm4 \n\t"
01106 "psrlq $32, %%mm7 \n\t"
01107 "movq %%mm7, %%mm1 \n\t"
01108 "psubusb %%mm4, %%mm1 \n\t"
01109 "psubb %%mm1, %%mm7 \n\t"
01110 #endif
01111
01112
01113 "movq %%mm6, %%mm4 \n\t"
01114 "psrlq $8, %%mm6 \n\t"
01115 #if HAVE_MMX2
01116 "pmaxub %%mm4, %%mm6 \n\t"
01117 "pshufw $0xF9, %%mm6, %%mm4 \n\t"
01118 "pmaxub %%mm4, %%mm6 \n\t"
01119 "pshufw $0xFE, %%mm6, %%mm4 \n\t"
01120 "pmaxub %%mm4, %%mm6 \n\t"
01121 #else
01122 "psubusb %%mm4, %%mm6 \n\t"
01123 "paddb %%mm4, %%mm6 \n\t"
01124 "movq %%mm6, %%mm4 \n\t"
01125 "psrlq $16, %%mm6 \n\t"
01126 "psubusb %%mm4, %%mm6 \n\t"
01127 "paddb %%mm4, %%mm6 \n\t"
01128 "movq %%mm6, %%mm4 \n\t"
01129 "psrlq $32, %%mm6 \n\t"
01130 "psubusb %%mm4, %%mm6 \n\t"
01131 "paddb %%mm4, %%mm6 \n\t"
01132 #endif
01133 "movq %%mm6, %%mm0 \n\t"
01134 "psubb %%mm7, %%mm6 \n\t"
01135 "push %4 \n\t"
01136 "movd %%mm6, %k4 \n\t"
01137 "cmpb "MANGLE(deringThreshold)", %b4 \n\t"
01138 "pop %4 \n\t"
01139 " jb 1f \n\t"
01140 PAVGB(%%mm0, %%mm7)
01141 "punpcklbw %%mm7, %%mm7 \n\t"
01142 "punpcklbw %%mm7, %%mm7 \n\t"
01143 "punpcklbw %%mm7, %%mm7 \n\t"
01144 "movq %%mm7, (%4) \n\t"
01145
01146 "movq (%0), %%mm0 \n\t"
01147 "movq %%mm0, %%mm1 \n\t"
01148 "movq %%mm0, %%mm2 \n\t"
01149 "psllq $8, %%mm1 \n\t"
01150 "psrlq $8, %%mm2 \n\t"
01151 "movd -4(%0), %%mm3 \n\t"
01152 "movd 8(%0), %%mm4 \n\t"
01153 "psrlq $24, %%mm3 \n\t"
01154 "psllq $56, %%mm4 \n\t"
01155 "por %%mm3, %%mm1 \n\t"
01156 "por %%mm4, %%mm2 \n\t"
01157 "movq %%mm1, %%mm3 \n\t"
01158 PAVGB(%%mm2, %%mm1)
01159 PAVGB(%%mm0, %%mm1)
01160 "psubusb %%mm7, %%mm0 \n\t"
01161 "psubusb %%mm7, %%mm2 \n\t"
01162 "psubusb %%mm7, %%mm3 \n\t"
01163 "pcmpeqb "MANGLE(b00)", %%mm0 \n\t"
01164 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t"
01165 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t"
01166 "paddb %%mm2, %%mm0 \n\t"
01167 "paddb %%mm3, %%mm0 \n\t"
01168
01169 "movq (%%"REG_a"), %%mm2 \n\t"
01170 "movq %%mm2, %%mm3 \n\t"
01171 "movq %%mm2, %%mm4 \n\t"
01172 "psllq $8, %%mm3 \n\t"
01173 "psrlq $8, %%mm4 \n\t"
01174 "movd -4(%%"REG_a"), %%mm5 \n\t"
01175 "movd 8(%%"REG_a"), %%mm6 \n\t"
01176 "psrlq $24, %%mm5 \n\t"
01177 "psllq $56, %%mm6 \n\t"
01178 "por %%mm5, %%mm3 \n\t"
01179 "por %%mm6, %%mm4 \n\t"
01180 "movq %%mm3, %%mm5 \n\t"
01181 PAVGB(%%mm4, %%mm3)
01182 PAVGB(%%mm2, %%mm3)
01183 "psubusb %%mm7, %%mm2 \n\t"
01184 "psubusb %%mm7, %%mm4 \n\t"
01185 "psubusb %%mm7, %%mm5 \n\t"
01186 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t"
01187 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t"
01188 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t"
01189 "paddb %%mm4, %%mm2 \n\t"
01190 "paddb %%mm5, %%mm2 \n\t"
01191
01192 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
01193 "movq " #src ", " #sx " \n\t" \
01194 "movq " #sx ", " #lx " \n\t" \
01195 "movq " #sx ", " #t0 " \n\t" \
01196 "psllq $8, " #lx " \n\t"\
01197 "psrlq $8, " #t0 " \n\t"\
01198 "movd -4" #src ", " #t1 " \n\t"\
01199 "psrlq $24, " #t1 " \n\t"\
01200 "por " #t1 ", " #lx " \n\t" \
01201 "movd 8" #src ", " #t1 " \n\t"\
01202 "psllq $56, " #t1 " \n\t"\
01203 "por " #t1 ", " #t0 " \n\t" \
01204 "movq " #lx ", " #t1 " \n\t" \
01205 PAVGB(t0, lx) \
01206 PAVGB(sx, lx) \
01207 PAVGB(lx, pplx) \
01208 "movq " #lx ", 8(%4) \n\t"\
01209 "movq (%4), " #lx " \n\t"\
01210 "psubusb " #lx ", " #t1 " \n\t"\
01211 "psubusb " #lx ", " #t0 " \n\t"\
01212 "psubusb " #lx ", " #sx " \n\t"\
01213 "movq "MANGLE(b00)", " #lx " \n\t"\
01214 "pcmpeqb " #lx ", " #t1 " \n\t" \
01215 "pcmpeqb " #lx ", " #t0 " \n\t" \
01216 "pcmpeqb " #lx ", " #sx " \n\t" \
01217 "paddb " #t1 ", " #t0 " \n\t"\
01218 "paddb " #t0 ", " #sx " \n\t"\
01219 \
01220 PAVGB(plx, pplx) \
01221 "movq " #dst ", " #t0 " \n\t" \
01222 "movq " #t0 ", " #t1 " \n\t" \
01223 "psubusb %3, " #t0 " \n\t"\
01224 "paddusb %3, " #t1 " \n\t"\
01225 PMAXUB(t0, pplx)\
01226 PMINUB(t1, pplx, t0)\
01227 "paddb " #sx ", " #ppsx " \n\t"\
01228 "paddb " #psx ", " #ppsx " \n\t"\
01229 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\
01230 "pand "MANGLE(b08)", " #ppsx " \n\t"\
01231 "pcmpeqb " #lx ", " #ppsx " \n\t"\
01232 "pand " #ppsx ", " #pplx " \n\t"\
01233 "pandn " #dst ", " #ppsx " \n\t"\
01234 "por " #pplx ", " #ppsx " \n\t"\
01235 "movq " #ppsx ", " #dst " \n\t"\
01236 "movq 8(%4), " #lx " \n\t"
01237
01238 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \
01239 REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1)
01240
01241
01242
01243
01244
01245
01246
01247
01248
01249
01250
01251
01252
01253
01254
01255
01256 DERING_CORE((%%REGa) ,(%%REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
01257 DERING_CORE((%%REGa, %1) ,(%%REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
01258 DERING_CORE((%%REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
01259 DERING_CORE((%0, %1, 4) ,(%%REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
01260 DERING_CORE((%%REGd) ,(%%REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
01261 DERING_CORE((%%REGd, %1) ,(%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7)
01262 DERING_CORE((%%REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7)
01263 DERING_CORE((%0, %1, 8) ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7)
01264
01265 "1: \n\t"
01266 : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2), "q"(tmp)
01267 : "%"REG_a, "%"REG_d
01268 );
01269 #else //HAVE_MMX2 || HAVE_AMD3DNOW
01270 int y;
01271 int min=255;
01272 int max=0;
01273 int avg;
01274 uint8_t *p;
01275 int s[10];
01276 const int QP2= c->QP/2 + 1;
01277
01278 for(y=1; y<9; y++){
01279 int x;
01280 p= src + stride*y;
01281 for(x=1; x<9; x++){
01282 p++;
01283 if(*p > max) max= *p;
01284 if(*p < min) min= *p;
01285 }
01286 }
01287 avg= (min + max + 1)>>1;
01288
01289 if(max - min <deringThreshold) return;
01290
01291 for(y=0; y<10; y++){
01292 int t = 0;
01293
01294 if(src[stride*y + 0] > avg) t+= 1;
01295 if(src[stride*y + 1] > avg) t+= 2;
01296 if(src[stride*y + 2] > avg) t+= 4;
01297 if(src[stride*y + 3] > avg) t+= 8;
01298 if(src[stride*y + 4] > avg) t+= 16;
01299 if(src[stride*y + 5] > avg) t+= 32;
01300 if(src[stride*y + 6] > avg) t+= 64;
01301 if(src[stride*y + 7] > avg) t+= 128;
01302 if(src[stride*y + 8] > avg) t+= 256;
01303 if(src[stride*y + 9] > avg) t+= 512;
01304
01305 t |= (~t)<<16;
01306 t &= (t<<1) & (t>>1);
01307 s[y] = t;
01308 }
01309
01310 for(y=1; y<9; y++){
01311 int t = s[y-1] & s[y] & s[y+1];
01312 t|= t>>16;
01313 s[y-1]= t;
01314 }
01315
01316 for(y=1; y<9; y++){
01317 int x;
01318 int t = s[y-1];
01319
01320 p= src + stride*y;
01321 for(x=1; x<9; x++){
01322 p++;
01323 if(t & (1<<x)){
01324 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1))
01325 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1))
01326 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1));
01327 f= (f + 8)>>4;
01328
01329 #ifdef DEBUG_DERING_THRESHOLD
01330 __asm__ volatile("emms\n\t":);
01331 {
01332 static long long numPixels=0;
01333 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++;
01334
01335
01336
01337 if(max-min < 20){
01338 static int numSkipped=0;
01339 static int errorSum=0;
01340 static int worstQP=0;
01341 static int worstRange=0;
01342 static int worstDiff=0;
01343 int diff= (f - *p);
01344 int absDiff= FFABS(diff);
01345 int error= diff*diff;
01346
01347 if(x==1 || x==8 || y==1 || y==8) continue;
01348
01349 numSkipped++;
01350 if(absDiff > worstDiff){
01351 worstDiff= absDiff;
01352 worstQP= QP;
01353 worstRange= max-min;
01354 }
01355 errorSum+= error;
01356
01357 if(1024LL*1024LL*1024LL % numSkipped == 0){
01358 av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, "
01359 "wRange:%d, wDiff:%d, relSkip:%1.3f\n",
01360 (float)errorSum/numSkipped, numSkipped, worstQP, worstRange,
01361 worstDiff, (float)numSkipped/numPixels);
01362 }
01363 }
01364 }
01365 #endif
01366 if (*p + QP2 < f) *p= *p + QP2;
01367 else if(*p - QP2 > f) *p= *p - QP2;
01368 else *p=f;
01369 }
01370 }
01371 }
01372 #ifdef DEBUG_DERING_THRESHOLD
01373 if(max-min < 20){
01374 for(y=1; y<9; y++){
01375 int x;
01376 int t = 0;
01377 p= src + stride*y;
01378 for(x=1; x<9; x++){
01379 p++;
01380 *p = FFMIN(*p + 20, 255);
01381 }
01382 }
01383
01384 }
01385 #endif
01386 #endif //HAVE_MMX2 || HAVE_AMD3DNOW
01387 }
01388 #endif //HAVE_ALTIVEC
01389
/**
 * Deinterlace an 8-pixel-wide block by linear interpolation.
 *
 * Rows are counted from src + 4*stride (the function advances src itself).
 * Rows 1, 3, 5 and 7 are overwritten with the per-byte average of the row
 * directly above and the row directly below; rows 0, 2, 4, 6 and 8 are only
 * read.
 *
 * @param src    pointer to the block; 8 bytes per row are processed
 * @param stride distance in bytes between the starts of consecutive rows
 *
 * NOTE(review): the asm path stores to memory but its clobber list names only
 * registers (no "memory") - confirm this is safe with the compilers in use.
 */
01396 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride)
01397 {
01398 #if HAVE_MMX2 || HAVE_AMD3DNOW
01399 src+= 4*stride;
01400 __asm__ volatile(
/* REG_a = row 1, REG_c = row 5 */
01401 "lea (%0, %1), %%"REG_a" \n\t"
01402 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t"
01403
01404
01405
/* PAVGB(x, y) leaves the per-byte average of x and y in y.
   row 1 = avg(row 0, row 2) */
01406 "movq (%0), %%mm0 \n\t"
01407 "movq (%%"REG_a", %1), %%mm1 \n\t"
01408 PAVGB(%%mm1, %%mm0)
01409 "movq %%mm0, (%%"REG_a") \n\t"
/* row 3 = avg(row 2, row 4); mm1 still holds row 2 */
01410 "movq (%0, %1, 4), %%mm0 \n\t"
01411 PAVGB(%%mm0, %%mm1)
01412 "movq %%mm1, (%%"REG_a", %1, 2) \n\t"
/* row 5 = avg(row 4, row 6); mm0 still holds row 4 */
01413 "movq (%%"REG_c", %1), %%mm1 \n\t"
01414 PAVGB(%%mm1, %%mm0)
01415 "movq %%mm0, (%%"REG_c") \n\t"
/* row 7 = avg(row 6, row 8); mm1 still holds row 6 */
01416 "movq (%0, %1, 8), %%mm0 \n\t"
01417 PAVGB(%%mm0, %%mm1)
01418 "movq %%mm1, (%%"REG_c", %1, 2) \n\t"
01419
01420 : : "r" (src), "r" ((x86_reg)stride)
01421 : "%"REG_a, "%"REG_c
01422 );
01423 #else
/* Portable path: each iteration handles 4 horizontally adjacent pixels packed
   into one 32-bit word; two iterations cover the 8-pixel width.
   (a|b) - (((a^b)&0xFEFEFEFE)>>1) is the per-byte average of a and b rounded
   up; the mask stops the shifted bits from leaking into the neighbouring byte.
   NOTE(review): the uint32_t casts assume the row addresses may be accessed
   as 32-bit words - confirm alignment with the callers. */
01424 int a, b, x;
01425 src+= 4*stride;
01426
01427 for(x=0; x<2; x++){
01428 a= *(uint32_t*)&src[stride*0];
01429 b= *(uint32_t*)&src[stride*2];
01430 *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
01431 a= *(uint32_t*)&src[stride*4];
01432 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
01433 b= *(uint32_t*)&src[stride*6];
01434 *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
01435 a= *(uint32_t*)&src[stride*8];
01436 *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
01437 src += 4;
01438 }
01439 #endif
01440 }
01441
/**
 * Deinterlace an 8-pixel-wide block by cubic interpolation.
 *
 * Rows are counted from src + 3*stride (the function advances src itself).
 * Rows 3, 5, 7 and 9 are overwritten with a value interpolated from the two
 * nearest even rows above and the two nearest even rows below, with weights
 * (-1, 9, 9, -1)/16 in the C path. Rows 0, 2, 4, 6, 8, 10 and 12 are only
 * read.
 *
 * @param src    pointer to the block; 8 bytes per row are processed
 * @param stride distance in bytes between the starts of consecutive rows
 *
 * NOTE(review): the asm path stores to memory but its clobber list names only
 * registers (no "memory") - confirm this is safe with the compilers in use.
 */
01449 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride)
01450 {
01451 #if HAVE_MMX2 || HAVE_AMD3DNOW
01452 src+= stride*3;
01453 __asm__ volatile(
/* REG_a = row 1, REG_d = row 5, REG_c = row 10; mm7 = 0 for byte->word unpacking */
01454 "lea (%0, %1), %%"REG_a" \n\t"
01455 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
01456 "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t"
01457 "add %1, %%"REG_c" \n\t"
01458 "pxor %%mm7, %%mm7 \n\t"
01459
01460
01461
/* DEINT_CUBIC(a,b,c,d,e): a and e are the outer rows, b and d the inner rows,
   c is the destination. With in = avg(b,d) and out = avg(a,e) it stores
   in - ((out - in) >> 3), computed on 16-bit words and packed back to bytes
   with unsigned saturation. That is the (-1, 9, 9, -1)/16 weighting up to
   rounding. */
01462 #define REAL_DEINT_CUBIC(a,b,c,d,e)\
01463 "movq " #a ", %%mm0 \n\t"\
01464 "movq " #b ", %%mm1 \n\t"\
01465 "movq " #d ", %%mm2 \n\t"\
01466 "movq " #e ", %%mm3 \n\t"\
01467 PAVGB(%%mm2, %%mm1) \
01468 PAVGB(%%mm3, %%mm0) \
01469 "movq %%mm0, %%mm2 \n\t"\
01470 "punpcklbw %%mm7, %%mm0 \n\t"\
01471 "punpckhbw %%mm7, %%mm2 \n\t"\
01472 "movq %%mm1, %%mm3 \n\t"\
01473 "punpcklbw %%mm7, %%mm1 \n\t"\
01474 "punpckhbw %%mm7, %%mm3 \n\t"\
01475 "psubw %%mm1, %%mm0 \n\t" \
01476 "psubw %%mm3, %%mm2 \n\t" \
01477 "psraw $3, %%mm0 \n\t" \
01478 "psraw $3, %%mm2 \n\t" \
01479 "psubw %%mm0, %%mm1 \n\t" \
01480 "psubw %%mm2, %%mm3 \n\t" \
01481 "packuswb %%mm3, %%mm1 \n\t"\
01482 "movq %%mm1, " #c " \n\t"
/* Extra macro level so that the arguments are macro-expanded before being
   stringified. */
01483 #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e)
01484
/* Destinations, in order: rows 3, 5, 7 and 9 */
01485 DEINT_CUBIC((%0) , (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd, %1))
01486 DEINT_CUBIC((%%REGa, %1), (%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%0, %1, 8))
01487 DEINT_CUBIC((%0, %1, 4) , (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGc))
01488 DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2))
01489
01490 : : "r" (src), "r" ((x86_reg)stride)
01491 : "%"REG_a, "%"REG_d, "%"REG_c
01492 );
01493 #else //HAVE_MMX2 || HAVE_AMD3DNOW
/* Portable path: one column per iteration, 8 columns.
   CLIP is defined outside this block; presumably it clamps to 0..255 - confirm. */
01494 int x;
01495 src+= stride*3;
01496 for(x=0; x<8; x++){
01497 src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4);
01498 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4);
01499 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4);
01500 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4);
01501 src++;
01502 }
01503 #endif //HAVE_MMX2 || HAVE_AMD3DNOW
01504 }
01505
/**
 * Deinterlace an 8-pixel-wide block by filtering every second row with a
 * vertical (-1 4 2 4 -1)/8 filter.
 *
 * Rows are counted from src + 4*stride (the function advances src itself).
 * Rows 1, 3, 5 and 7 are overwritten; rows 0, 2, 4, 6, 8 and 9 are only read.
 * The "-1" tap two rows above and the "2" centre tap always use the
 * unfiltered odd rows.
 *
 * @param src    pointer to the block; 8 bytes per row are processed
 * @param stride distance in bytes between the starts of consecutive rows
 * @param tmp    8 bytes of state: on entry the unfiltered odd row two rows
 *               above row 1, on exit the unfiltered row 7 of this block
 *
 * NOTE(review): the asm path stores to memory but its clobber list names only
 * registers (no "memory") - confirm this is safe with the compilers in use.
 */
01513 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp)
01514 {
01515 #if HAVE_MMX2 || HAVE_AMD3DNOW
01516 src+= stride*4;
01517 __asm__ volatile(
/* REG_a = row 1, REG_d = row 5; mm7 = 0 for unpacking; mm0 = saved row from tmp */
01518 "lea (%0, %1), %%"REG_a" \n\t"
01519 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
01520 "pxor %%mm7, %%mm7 \n\t"
01521 "movq (%2), %%mm0 \n\t"
01522
01523
01524
/* DEINT_FF(a,b,c,d): b is the row being filtered, a and c its direct
   neighbours, d the row two below; mm0 carries the unfiltered odd row two
   above. Stores (4*avg(a,c) - avg(mm0,d) + b) >> 2 into b (word arithmetic,
   packed with unsigned saturation) and leaves the unfiltered b in mm0 for the
   next invocation. */
01525 #define REAL_DEINT_FF(a,b,c,d)\
01526 "movq " #a ", %%mm1 \n\t"\
01527 "movq " #b ", %%mm2 \n\t"\
01528 "movq " #c ", %%mm3 \n\t"\
01529 "movq " #d ", %%mm4 \n\t"\
01530 PAVGB(%%mm3, %%mm1) \
01531 PAVGB(%%mm4, %%mm0) \
01532 "movq %%mm0, %%mm3 \n\t"\
01533 "punpcklbw %%mm7, %%mm0 \n\t"\
01534 "punpckhbw %%mm7, %%mm3 \n\t"\
01535 "movq %%mm1, %%mm4 \n\t"\
01536 "punpcklbw %%mm7, %%mm1 \n\t"\
01537 "punpckhbw %%mm7, %%mm4 \n\t"\
01538 "psllw $2, %%mm1 \n\t"\
01539 "psllw $2, %%mm4 \n\t"\
01540 "psubw %%mm0, %%mm1 \n\t"\
01541 "psubw %%mm3, %%mm4 \n\t"\
01542 "movq %%mm2, %%mm5 \n\t"\
01543 "movq %%mm2, %%mm0 \n\t"\
01544 "punpcklbw %%mm7, %%mm2 \n\t"\
01545 "punpckhbw %%mm7, %%mm5 \n\t"\
01546 "paddw %%mm2, %%mm1 \n\t"\
01547 "paddw %%mm5, %%mm4 \n\t"\
01548 "psraw $2, %%mm1 \n\t"\
01549 "psraw $2, %%mm4 \n\t"\
01550 "packuswb %%mm4, %%mm1 \n\t"\
01551 "movq %%mm1, " #b " \n\t"\
01552
/* Extra macro level so that the arguments are macro-expanded before being
   stringified. */
01553 #define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d)
01554
/* Filtered rows, in order: 1, 3, 5 and 7 */
01555 DEINT_FF((%0) , (%%REGa) , (%%REGa, %1), (%%REGa, %1, 2))
01556 DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) )
01557 DEINT_FF((%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%%REGd, %1, 2))
01558 DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
01559
/* save the unfiltered row 7 for the next block */
01560 "movq %%mm0, (%2) \n\t"
01561 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp)
01562 : "%"REG_a, "%"REG_d
01563 );
01564 #else //HAVE_MMX2 || HAVE_AMD3DNOW
/* Portable path: one column per iteration, 8 columns.
   t1 and t2 alternate between "unfiltered odd row two above" (weight -1) and
   "unfiltered row being filtered" (weight 2). Each is read from the odd row
   just before that row is overwritten, which matches the SIMD path above.
   Fix: these reloads previously read the even rows 4, 6 and 8, so the centre
   and far-above taps used the wrong rows and tmp received row 8 instead of
   the unfiltered row 7.
   CLIP is defined outside this block; presumably it clamps to 0..255 - confirm. */
01565 int x;
01566 src+= stride*4;
01567 for(x=0; x<8; x++){
01568 int t1= tmp[x];
01569 int t2= src[stride*1];
01570
01571 src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3);
01572 t1= src[stride*3];
01573 src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3);
01574 t2= src[stride*5];
01575 src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3);
01576 t1= src[stride*7];
01577 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3);
/* t1 is the unfiltered row 7: state for the block below */
01578 tmp[x]= t1;
01579
01580 src++;
01581 }
01582 #endif //HAVE_MMX2 || HAVE_AMD3DNOW
01583 }
01584
/**
 * Deinterlace an 8-pixel-wide block by filtering every row with a vertical
 * (-1 2 6 2 -1)/8 filter.
 *
 * Rows are counted from src + 4*stride (the function advances src itself).
 * Rows 0..7 are overwritten; rows 8 and 9 are only read. Taps that refer to
 * rows above the current one use the unfiltered values of those rows.
 *
 * @param src    pointer to the block; 8 bytes per row are processed
 * @param stride distance in bytes between the starts of consecutive rows
 * @param tmp    8 bytes of state: on entry the unfiltered row two above row 0,
 *               on exit the unfiltered row 6
 * @param tmp2   8 bytes of state: on entry the unfiltered row one above row 0,
 *               on exit the unfiltered row 7
 *
 * NOTE(review): the asm path stores to memory but its clobber list names only
 * registers (no "memory") - confirm this is safe with the compilers in use.
 */
01592 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2)
01593 {
01594 #if HAVE_MMX2 || HAVE_AMD3DNOW
01595 src+= stride*4;
01596 __asm__ volatile(
/* REG_a = row 1, REG_d = row 5; mm7 = 0 for unpacking;
   mm0 = row two above (tmp), mm1 = row one above (tmp2) */
01597 "lea (%0, %1), %%"REG_a" \n\t"
01598 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
01599 "pxor %%mm7, %%mm7 \n\t"
01600 "movq (%2), %%mm0 \n\t"
01601 "movq (%3), %%mm1 \n\t"
01602
01603
01604
/* DEINT_L5(t1,t2,a,b,c): a is the row being filtered, b and c the rows one and
   two below, t2 and t1 the registers holding the unfiltered rows one and two
   above. Stores (3*a + 2*avg(b,t2) - avg(c,t1)) >> 2 into a (word arithmetic,
   packed with unsigned saturation) and leaves the unfiltered a in t1, so the
   next invocation swaps the t1/t2 registers. */
01605 #define REAL_DEINT_L5(t1,t2,a,b,c)\
01606 "movq " #a ", %%mm2 \n\t"\
01607 "movq " #b ", %%mm3 \n\t"\
01608 "movq " #c ", %%mm4 \n\t"\
01609 PAVGB(t2, %%mm3) \
01610 PAVGB(t1, %%mm4) \
01611 "movq %%mm2, %%mm5 \n\t"\
01612 "movq %%mm2, " #t1 " \n\t"\
01613 "punpcklbw %%mm7, %%mm2 \n\t"\
01614 "punpckhbw %%mm7, %%mm5 \n\t"\
01615 "movq %%mm2, %%mm6 \n\t"\
01616 "paddw %%mm2, %%mm2 \n\t"\
01617 "paddw %%mm6, %%mm2 \n\t"\
01618 "movq %%mm5, %%mm6 \n\t"\
01619 "paddw %%mm5, %%mm5 \n\t"\
01620 "paddw %%mm6, %%mm5 \n\t"\
01621 "movq %%mm3, %%mm6 \n\t"\
01622 "punpcklbw %%mm7, %%mm3 \n\t"\
01623 "punpckhbw %%mm7, %%mm6 \n\t"\
01624 "paddw %%mm3, %%mm3 \n\t"\
01625 "paddw %%mm6, %%mm6 \n\t"\
01626 "paddw %%mm3, %%mm2 \n\t"\
01627 "paddw %%mm6, %%mm5 \n\t"\
01628 "movq %%mm4, %%mm6 \n\t"\
01629 "punpcklbw %%mm7, %%mm4 \n\t"\
01630 "punpckhbw %%mm7, %%mm6 \n\t"\
01631 "psubw %%mm4, %%mm2 \n\t"\
01632 "psubw %%mm6, %%mm5 \n\t"\
01633 "psraw $2, %%mm2 \n\t"\
01634 "psraw $2, %%mm5 \n\t"\
01635 "packuswb %%mm5, %%mm2 \n\t"\
01636 "movq %%mm2, " #a " \n\t"\
01637
/* Extra macro level so that the arguments are macro-expanded before being
   stringified. */
01638 #define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c)
01639
/* Filtered rows, in order: 0 .. 7 */
01640 DEINT_L5(%%mm0, %%mm1, (%0) , (%%REGa) , (%%REGa, %1) )
01641 DEINT_L5(%%mm1, %%mm0, (%%REGa) , (%%REGa, %1) , (%%REGa, %1, 2))
01642 DEINT_L5(%%mm0, %%mm1, (%%REGa, %1) , (%%REGa, %1, 2), (%0, %1, 4) )
01643 DEINT_L5(%%mm1, %%mm0, (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) )
01644 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%REGd) , (%%REGd, %1) )
01645 DEINT_L5(%%mm1, %%mm0, (%%REGd) , (%%REGd, %1) , (%%REGd, %1, 2))
01646 DEINT_L5(%%mm0, %%mm1, (%%REGd, %1) , (%%REGd, %1, 2), (%0, %1, 8) )
01647 DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4))
01648
/* mm0 = unfiltered row 6, mm1 = unfiltered row 7: state for the block below */
01649 "movq %%mm0, (%2) \n\t"
01650 "movq %%mm1, (%3) \n\t"
01651 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2)
01652 : "%"REG_a, "%"REG_d
01653 );
01654 #else //HAVE_MMX2 || HAVE_AMD3DNOW
/* Portable path: one column per iteration, 8 columns.
   t1, t2 and t3 rotate through the three most recent unfiltered values; each
   is captured from its row just before that row is overwritten.
   CLIP is defined outside this block; presumably it clamps to 0..255 - confirm. */
01655 int x;
01656 src+= stride*4;
01657 for(x=0; x<8; x++){
01658 int t1= tmp[x];
01659 int t2= tmp2[x];
01660 int t3= src[0];
01661
01662 src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3);
01663 t1= src[stride*1];
01664 src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3);
01665 t2= src[stride*2];
01666 src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3);
01667 t3= src[stride*3];
01668 src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3);
01669 t1= src[stride*4];
01670 src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3);
01671 t2= src[stride*5];
01672 src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3);
01673 t3= src[stride*6];
01674 src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3);
01675 t1= src[stride*7];
01676 src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3);
01677
/* t3 = unfiltered row 6, t1 = unfiltered row 7 */
01678 tmp[x]= t3;
01679 tmp2[x]= t1;
01680
01681 src++;
01682 }
01683 #endif //HAVE_MMX2 || HAVE_AMD3DNOW
01684 }
01685
/**
 * Deinterlace an 8-pixel-wide block by blending each row with its vertical
 * neighbours, approximately (above + 2*current + below)/4.
 *
 * Rows are counted from src + 4*stride (the function advances src itself).
 * Rows 0..7 are overwritten; row 8 is only read. Each result is formed from
 * two nested byte averages, avg(avg(above, below), current), always using the
 * unfiltered neighbour values.
 *
 * @param src    pointer to the block; 8 bytes per row are processed
 * @param stride distance in bytes between the starts of consecutive rows
 * @param tmp    8 bytes of state: on entry the row used as "above" for row 0,
 *               on exit the unfiltered row 7
 *
 * NOTE(review): the asm path stores to memory but its clobber list names only
 * registers (no "memory") - confirm this is safe with the compilers in use.
 */
01693 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp)
01694 {
01695 #if HAVE_MMX2 || HAVE_AMD3DNOW
01696 src+= 4*stride;
01697 __asm__ volatile(
/* REG_a = row 1, REG_d = row 5 */
01698 "lea (%0, %1), %%"REG_a" \n\t"
01699 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
01700
01701
01702
/* mm0, mm1 and mm2 rotate through the unfiltered rows. Each step loads the
   next row, averages it with the row two above, averages that with the row in
   between, and stores the result over the row in between. */
/* row 0 = avg(avg(tmp, row 1), row 0) */
01703 "movq (%2), %%mm0 \n\t"
01704 "movq (%%"REG_a"), %%mm1 \n\t"
01705 PAVGB(%%mm1, %%mm0)
01706 "movq (%0), %%mm2 \n\t"
01707 PAVGB(%%mm2, %%mm0)
01708 "movq %%mm0, (%0) \n\t"
/* row 1 */
01709 "movq (%%"REG_a", %1), %%mm0 \n\t"
01710 PAVGB(%%mm0, %%mm2)
01711 PAVGB(%%mm1, %%mm2)
01712 "movq %%mm2, (%%"REG_a") \n\t"
/* row 2 */
01713 "movq (%%"REG_a", %1, 2), %%mm2 \n\t"
01714 PAVGB(%%mm2, %%mm1)
01715 PAVGB(%%mm0, %%mm1)
01716 "movq %%mm1, (%%"REG_a", %1) \n\t"
/* row 3 */
01717 "movq (%0, %1, 4), %%mm1 \n\t"
01718 PAVGB(%%mm1, %%mm0)
01719 PAVGB(%%mm2, %%mm0)
01720 "movq %%mm0, (%%"REG_a", %1, 2) \n\t"
/* row 4 */
01721 "movq (%%"REG_d"), %%mm0 \n\t"
01722 PAVGB(%%mm0, %%mm2)
01723 PAVGB(%%mm1, %%mm2)
01724 "movq %%mm2, (%0, %1, 4) \n\t"
/* row 5 */
01725 "movq (%%"REG_d", %1), %%mm2 \n\t"
01726 PAVGB(%%mm2, %%mm1)
01727 PAVGB(%%mm0, %%mm1)
01728 "movq %%mm1, (%%"REG_d") \n\t"
/* row 6 */
01729 "movq (%%"REG_d", %1, 2), %%mm1 \n\t"
01730 PAVGB(%%mm1, %%mm0)
01731 PAVGB(%%mm2, %%mm0)
01732 "movq %%mm0, (%%"REG_d", %1) \n\t"
/* row 7; mm1 (unfiltered row 7) is then saved to tmp for the block below */
01733 "movq (%0, %1, 8), %%mm0 \n\t"
01734 PAVGB(%%mm0, %%mm2)
01735 PAVGB(%%mm1, %%mm2)
01736 "movq %%mm2, (%%"REG_d", %1, 2) \n\t"
01737 "movq %%mm1, (%2) \n\t"
01738
01739 : : "r" (src), "r" ((x86_reg)stride), "r" (tmp)
01740 : "%"REG_a, "%"REG_d
01741 );
01742 #else //HAVE_MMX2 || HAVE_AMD3DNOW
/* Portable path: each iteration handles 4 horizontally adjacent pixels packed
   into one 32-bit word; two iterations cover the 8-pixel width.
   (x&y) + (((x^y)&0xFEFEFEFE)>>1) is the per-byte average rounded down,
   (x|y) - (((x^y)&0xFEFEFEFE)>>1) is the per-byte average rounded up.
   a, b and c rotate through the unfiltered rows, as mm0..mm2 do in the asm
   path. tmp[stride*0] is simply tmp[0].
   NOTE(review): the uint32_t casts assume src, tmp and stride allow 32-bit
   word accesses - confirm alignment with the callers. */
01743 int a, b, c, x;
01744 src+= 4*stride;
01745
01746 for(x=0; x<2; x++){
01747 a= *(uint32_t*)&tmp[stride*0];
01748 b= *(uint32_t*)&src[stride*0];
01749 c= *(uint32_t*)&src[stride*1];
01750 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
01751 *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
01752
01753 a= *(uint32_t*)&src[stride*2];
01754 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
01755 *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
01756
01757 b= *(uint32_t*)&src[stride*3];
01758 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
01759 *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
01760
01761 c= *(uint32_t*)&src[stride*4];
01762 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
01763 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
01764
01765 a= *(uint32_t*)&src[stride*5];
01766 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
01767 *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
01768
01769 b= *(uint32_t*)&src[stride*6];
01770 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1);
01771 *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1);
01772
01773 c= *(uint32_t*)&src[stride*7];
01774 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1);
01775 *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1);
01776
01777 a= *(uint32_t*)&src[stride*8];
01778 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1);
01779 *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1);
01780
/* c still holds the unfiltered row 7 */
01781 *(uint32_t*)&tmp[stride*0]= c;
01782 src += 4;
01783 tmp += 4;
01784 }
01785 #endif //HAVE_MMX2 || HAVE_AMD3DNOW
01786 }
01787
/**
 * Deinterlace an 8-pixel-wide block with a vertical 3-tap median filter.
 *
 * Rows are counted from src + 4*stride (the function advances src itself).
 * Rows 1, 3, 5 and 7 are overwritten with the per-pixel median of that row
 * and the rows directly above and below it; rows 0, 2, 4, 6 and 8 are only
 * read.
 *
 * Three implementations are selected at compile time: MMX2 (pminub/pmaxub),
 * plain MMX (comparison masks) and portable C.
 *
 * @param src    pointer to the block; 8 bytes per row are processed
 * @param stride distance in bytes between the starts of consecutive rows
 *
 * NOTE(review): the asm paths store to memory but their clobber lists name
 * only registers (no "memory") - confirm this is safe with the compilers in use.
 */
01794 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
01795 {
01796 #if HAVE_MMX
01797 src+= 4*stride;
01798 #if HAVE_MMX2
01799 __asm__ volatile(
/* REG_a = row 1, REG_d = row 5 */
01800 "lea (%0, %1), %%"REG_a" \n\t"
01801 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
01802
01803
01804
/* Each group computes min(max(x,y), max(min(x,y), z)), which is the median of
   x, y and z. Row 1 = median(row 0, row 1, row 2). */
01805 "movq (%0), %%mm0 \n\t"
01806 "movq (%%"REG_a", %1), %%mm2 \n\t"
01807 "movq (%%"REG_a"), %%mm1 \n\t"
01808 "movq %%mm0, %%mm3 \n\t"
01809 "pmaxub %%mm1, %%mm0 \n\t"
01810 "pminub %%mm3, %%mm1 \n\t"
01811 "pmaxub %%mm2, %%mm1 \n\t"
01812 "pminub %%mm1, %%mm0 \n\t"
01813 "movq %%mm0, (%%"REG_a") \n\t"
01814
/* row 3 = median(row 2, row 3, row 4); mm2 still holds row 2 */
01815 "movq (%0, %1, 4), %%mm0 \n\t"
01816 "movq (%%"REG_a", %1, 2), %%mm1 \n\t"
01817 "movq %%mm2, %%mm3 \n\t"
01818 "pmaxub %%mm1, %%mm2 \n\t"
01819 "pminub %%mm3, %%mm1 \n\t"
01820 "pmaxub %%mm0, %%mm1 \n\t"
01821 "pminub %%mm1, %%mm2 \n\t"
01822 "movq %%mm2, (%%"REG_a", %1, 2) \n\t"
01823
/* row 5 = median(row 4, row 5, row 6); mm0 still holds row 4 */
01824 "movq (%%"REG_d"), %%mm2 \n\t"
01825 "movq (%%"REG_d", %1), %%mm1 \n\t"
01826 "movq %%mm2, %%mm3 \n\t"
01827 "pmaxub %%mm0, %%mm2 \n\t"
01828 "pminub %%mm3, %%mm0 \n\t"
01829 "pmaxub %%mm1, %%mm0 \n\t"
01830 "pminub %%mm0, %%mm2 \n\t"
01831 "movq %%mm2, (%%"REG_d") \n\t"
01832
/* row 7 = median(row 6, row 7, row 8); mm1 still holds row 6 */
01833 "movq (%%"REG_d", %1, 2), %%mm2 \n\t"
01834 "movq (%0, %1, 8), %%mm0 \n\t"
01835 "movq %%mm2, %%mm3 \n\t"
01836 "pmaxub %%mm0, %%mm2 \n\t"
01837 "pminub %%mm3, %%mm0 \n\t"
01838 "pmaxub %%mm1, %%mm0 \n\t"
01839 "pminub %%mm0, %%mm2 \n\t"
01840 "movq %%mm2, (%%"REG_d", %1, 2) \n\t"
01841
01842
01843 : : "r" (src), "r" ((x86_reg)stride)
01844 : "%"REG_a, "%"REG_d
01845 );
01846
01847 #else // MMX without MMX2
01848 __asm__ volatile(
/* REG_a = row 1, REG_d = row 5; mm7 = 0 as the comparison reference */
01849 "lea (%0, %1), %%"REG_a" \n\t"
01850 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t"
01851
01852
01853 "pxor %%mm7, %%mm7 \n\t"
01854
/* MEDIAN(a,b,c): stores the per-byte median of rows a, b and c into b.
   psubusb followed by pcmpeqb against zero yields "less-or-equal" byte masks
   for the three pairs. XOR-ing the masks pairwise gives, for each input, an
   all-ones byte where that input is not the median. OR-ing it into the input
   and AND-ing the three results leaves the median. This has the same
   structure as the C fallback below. */
01855 #define REAL_MEDIAN(a,b,c)\
01856 "movq " #a ", %%mm0 \n\t"\
01857 "movq " #b ", %%mm2 \n\t"\
01858 "movq " #c ", %%mm1 \n\t"\
01859 "movq %%mm0, %%mm3 \n\t"\
01860 "movq %%mm1, %%mm4 \n\t"\
01861 "movq %%mm2, %%mm5 \n\t"\
01862 "psubusb %%mm1, %%mm3 \n\t"\
01863 "psubusb %%mm2, %%mm4 \n\t"\
01864 "psubusb %%mm0, %%mm5 \n\t"\
01865 "pcmpeqb %%mm7, %%mm3 \n\t"\
01866 "pcmpeqb %%mm7, %%mm4 \n\t"\
01867 "pcmpeqb %%mm7, %%mm5 \n\t"\
01868 "movq %%mm3, %%mm6 \n\t"\
01869 "pxor %%mm4, %%mm3 \n\t"\
01870 "pxor %%mm5, %%mm4 \n\t"\
01871 "pxor %%mm6, %%mm5 \n\t"\
01872 "por %%mm3, %%mm1 \n\t"\
01873 "por %%mm4, %%mm2 \n\t"\
01874 "por %%mm5, %%mm0 \n\t"\
01875 "pand %%mm2, %%mm0 \n\t"\
01876 "pand %%mm1, %%mm0 \n\t"\
01877 "movq %%mm0, " #b " \n\t"
/* Extra macro level so that the arguments are macro-expanded before being
   stringified. */
01878 #define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c)
01879
/* Filtered rows, in order: 1, 3, 5 and 7 */
01880 MEDIAN((%0) , (%%REGa) , (%%REGa, %1))
01881 MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4))
01882 MEDIAN((%0, %1, 4) , (%%REGd) , (%%REGd, %1))
01883 MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8))
01884
01885 : : "r" (src), "r" ((x86_reg)stride)
01886 : "%"REG_a, "%"REG_d
01887 );
01888 #endif //HAVE_MMX2
01889 #else //HAVE_MMX
/* Portable path: outer loop over the 8 columns, inner loop over the four
   odd rows of each column. */
01890 int x, y;
01891 src+= 4*stride;
01892
01893 for(x=0; x<8; x++){
01894 uint8_t *colsrc = src;
01895 for (y=0; y<4; y++){
01896 int a, b, c, d, e, f;
01897 a = colsrc[0 ];
01898 b = colsrc[stride ];
01899 c = colsrc[stride*2];
/* d, e and f are -1 when the difference is negative and 0 otherwise.
   This assumes a 32-bit int and an arithmetic right shift of negative
   values - confirm for the target compilers. */
01900 d = (a-b)>>31;
01901 e = (b-c)>>31;
01902 f = (c-a)>>31;
/* A value that is not the median gets OR-ed with -1 (all ones), so the AND of
   the three terms is the median. */
01903 colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f));
01904 colsrc += stride*2;
01905 }
01906 src++;
01907 }
01908 #endif //HAVE_MMX
01909 }
01910
01911 #if HAVE_MMX
01912
/**
 * Transpose an 8x8 byte block from src into two destination buffers that use
 * a 16-byte row pitch.
 *
 * Source column k (8 bytes, one from each of the 8 source rows) becomes a
 * destination row:
 *   - columns 0..4 are written to dst1 + 128 + 16*k
 *   - columns 3..7 are written to dst2 + 16*k
 * so columns 3 and 4 are written to both buffers.
 *
 * @param dst1      first destination buffer (written at offsets 128..199)
 * @param dst2      second destination buffer (written at offsets 48..119)
 * @param src       top-left byte of the 8x8 source block
 * @param srcStride distance in bytes between consecutive source rows
 *
 * NOTE(review): the asm stores through dst1 and dst2, which are passed as
 * input operands, and lists no "memory" clobber - confirm the compiler cannot
 * cache or reorder accesses to those buffers around this statement.
 */
01915 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride)
01916 {
01917 __asm__(
/* Operands: %0 = src, %1 = srcStride, %2 = dst1, %3 = dst2. REG_a = source row 1 */
01918 "lea (%0, %1), %%"REG_a" \n\t"
01919
01920
/* Source rows 0..3. Interleave bytes of rows 0/1: mm0 = columns 0..3,
   mm2 = columns 4..7 */
01921 "movq (%0), %%mm0 \n\t"
01922 "movq (%%"REG_a"), %%mm1 \n\t"
01923 "movq %%mm0, %%mm2 \n\t"
01924 "punpcklbw %%mm1, %%mm0 \n\t"
01925 "punpckhbw %%mm1, %%mm2 \n\t"
01926
/* same for rows 2/3: mm1 = columns 0..3, mm4 = columns 4..7 */
01927 "movq (%%"REG_a", %1), %%mm1 \n\t"
01928 "movq (%%"REG_a", %1, 2), %%mm3 \n\t"
01929 "movq %%mm1, %%mm4 \n\t"
01930 "punpcklbw %%mm3, %%mm1 \n\t"
01931 "punpckhbw %%mm3, %%mm4 \n\t"
01932
/* Interleave 16-bit pairs. Each 32-bit half now holds rows 0..3 of one column:
   mm0 = columns 0|1, mm3 = columns 2|3, mm2 = columns 4|5, mm1 = columns 6|7 */
01933 "movq %%mm0, %%mm3 \n\t"
01934 "punpcklwd %%mm1, %%mm0 \n\t"
01935 "punpckhwd %%mm1, %%mm3 \n\t"
01936 "movq %%mm2, %%mm1 \n\t"
01937 "punpcklwd %%mm4, %%mm2 \n\t"
01938 "punpckhwd %%mm4, %%mm1 \n\t"
01939
/* Store the first 4 bytes of each destination row; psrlq $32 moves the second
   column of a register into the low half. */
01940 "movd %%mm0, 128(%2) \n\t"
01941 "psrlq $32, %%mm0 \n\t"
01942 "movd %%mm0, 144(%2) \n\t"
01943 "movd %%mm3, 160(%2) \n\t"
01944 "psrlq $32, %%mm3 \n\t"
01945 "movd %%mm3, 176(%2) \n\t"
01946 "movd %%mm3, 48(%3) \n\t"
01947 "movd %%mm2, 192(%2) \n\t"
01948 "movd %%mm2, 64(%3) \n\t"
01949 "psrlq $32, %%mm2 \n\t"
01950 "movd %%mm2, 80(%3) \n\t"
01951 "movd %%mm1, 96(%3) \n\t"
01952 "psrlq $32, %%mm1 \n\t"
01953 "movd %%mm1, 112(%3) \n\t"
01954
/* REG_a = source row 5 */
01955 "lea (%%"REG_a", %1, 4), %%"REG_a" \n\t"
01956
/* Source rows 4..7: same sequence, stored 4 bytes further into each
   destination row */
01957 "movq (%0, %1, 4), %%mm0 \n\t"
01958 "movq (%%"REG_a"), %%mm1 \n\t"
01959 "movq %%mm0, %%mm2 \n\t"
01960 "punpcklbw %%mm1, %%mm0 \n\t"
01961 "punpckhbw %%mm1, %%mm2 \n\t"
01962
01963 "movq (%%"REG_a", %1), %%mm1 \n\t"
01964 "movq (%%"REG_a", %1, 2), %%mm3 \n\t"
01965 "movq %%mm1, %%mm4 \n\t"
01966 "punpcklbw %%mm3, %%mm1 \n\t"
01967 "punpckhbw %%mm3, %%mm4 \n\t"
01968
01969 "movq %%mm0, %%mm3 \n\t"
01970 "punpcklwd %%mm1, %%mm0 \n\t"
01971 "punpckhwd %%mm1, %%mm3 \n\t"
01972 "movq %%mm2, %%mm1 \n\t"
01973 "punpcklwd %%mm4, %%mm2 \n\t"
01974 "punpckhwd %%mm4, %%mm1 \n\t"
01975
01976 "movd %%mm0, 132(%2) \n\t"
01977 "psrlq $32, %%mm0 \n\t"
01978 "movd %%mm0, 148(%2) \n\t"
01979 "movd %%mm3, 164(%2) \n\t"
01980 "psrlq $32, %%mm3 \n\t"
01981 "movd %%mm3, 180(%2) \n\t"
01982 "movd %%mm3, 52(%3) \n\t"
01983 "movd %%mm2, 196(%2) \n\t"
01984 "movd %%mm2, 68(%3) \n\t"
01985 "psrlq $32, %%mm2 \n\t"
01986 "movd %%mm2, 84(%3) \n\t"
01987 "movd %%mm1, 100(%3) \n\t"
01988 "psrlq $32, %%mm1 \n\t"
01989 "movd %%mm1, 116(%3) \n\t"
01990
01991
01992 :: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2)
01993 : "%"REG_a
01994 );
01995 }
01996
/**
 * Transpose an 8x8 byte block stored with a 16-byte row pitch in src into dst,
 * whose rows are dstStride bytes apart.
 *
 * Source rows are read from src + 16*r (r = 0..7, 8 bytes each); source
 * column k becomes destination row k.
 *
 * @param dst       top-left byte of the 8x8 destination block
 * @param dstStride distance in bytes between consecutive destination rows
 * @param src       source buffer with 16-byte row pitch (offsets 0..119 read)
 *
 * NOTE(review): the asm stores through dst, which is passed as an input
 * operand, and lists no "memory" clobber - confirm the compiler cannot cache
 * or reorder accesses to that buffer around this statement.
 */
02000 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src)
02001 {
02002 __asm__(
/* Operands: %0 = dst, %1 = dstStride, %2 = src.
   REG_a = destination row 1, REG_d = destination row 5 */
02003 "lea (%0, %1), %%"REG_a" \n\t"
02004 "lea (%%"REG_a",%1,4), %%"REG_d" \n\t"
02005
02006
/* Source rows 0..3. Interleave bytes of rows 0/1: mm0 = columns 0..3,
   mm2 = columns 4..7 */
02007 "movq (%2), %%mm0 \n\t"
02008 "movq 16(%2), %%mm1 \n\t"
02009 "movq %%mm0, %%mm2 \n\t"
02010 "punpcklbw %%mm1, %%mm0 \n\t"
02011 "punpckhbw %%mm1, %%mm2 \n\t"
02012
/* same for rows 2/3: mm1 = columns 0..3, mm4 = columns 4..7 */
02013 "movq 32(%2), %%mm1 \n\t"
02014 "movq 48(%2), %%mm3 \n\t"
02015 "movq %%mm1, %%mm4 \n\t"
02016 "punpcklbw %%mm3, %%mm1 \n\t"
02017 "punpckhbw %%mm3, %%mm4 \n\t"
02018
/* Interleave 16-bit pairs. Each 32-bit half now holds rows 0..3 of one column:
   mm0 = columns 0|1, mm3 = columns 2|3, mm2 = columns 4|5, mm1 = columns 6|7 */
02019 "movq %%mm0, %%mm3 \n\t"
02020 "punpcklwd %%mm1, %%mm0 \n\t"
02021 "punpckhwd %%mm1, %%mm3 \n\t"
02022 "movq %%mm2, %%mm1 \n\t"
02023 "punpcklwd %%mm4, %%mm2 \n\t"
02024 "punpckhwd %%mm4, %%mm1 \n\t"
02025
/* Write bytes 0..3 of destination rows 0..7 */
02026 "movd %%mm0, (%0) \n\t"
02027 "psrlq $32, %%mm0 \n\t"
02028 "movd %%mm0, (%%"REG_a") \n\t"
02029 "movd %%mm3, (%%"REG_a", %1) \n\t"
02030 "psrlq $32, %%mm3 \n\t"
02031 "movd %%mm3, (%%"REG_a", %1, 2) \n\t"
02032 "movd %%mm2, (%0, %1, 4) \n\t"
02033 "psrlq $32, %%mm2 \n\t"
02034 "movd %%mm2, (%%"REG_d") \n\t"
02035 "movd %%mm1, (%%"REG_d", %1) \n\t"
02036 "psrlq $32, %%mm1 \n\t"
02037 "movd %%mm1, (%%"REG_d", %1, 2) \n\t"
02038
02039
/* Source rows 4..7: same sequence, written to bytes 4..7 of each destination
   row */
02040 "movq 64(%2), %%mm0 \n\t"
02041 "movq 80(%2), %%mm1 \n\t"
02042 "movq %%mm0, %%mm2 \n\t"
02043 "punpcklbw %%mm1, %%mm0 \n\t"
02044 "punpckhbw %%mm1, %%mm2 \n\t"
02045
02046 "movq 96(%2), %%mm1 \n\t"
02047 "movq 112(%2), %%mm3 \n\t"
02048 "movq %%mm1, %%mm4 \n\t"
02049 "punpcklbw %%mm3, %%mm1 \n\t"
02050 "punpckhbw %%mm3, %%mm4 \n\t"
02051
02052 "movq %%mm0, %%mm3 \n\t"
02053 "punpcklwd %%mm1, %%mm0 \n\t"
02054 "punpckhwd %%mm1, %%mm3 \n\t"
02055 "movq %%mm2, %%mm1 \n\t"
02056 "punpcklwd %%mm4, %%mm2 \n\t"
02057 "punpckhwd %%mm4, %%mm1 \n\t"
02058
02059 "movd %%mm0, 4(%0) \n\t"
02060 "psrlq $32, %%mm0 \n\t"
02061 "movd %%mm0, 4(%%"REG_a") \n\t"
02062 "movd %%mm3, 4(%%"REG_a", %1) \n\t"
02063 "psrlq $32, %%mm3 \n\t"
02064 "movd %%mm3, 4(%%"REG_a", %1, 2) \n\t"
02065 "movd %%mm2, 4(%0, %1, 4) \n\t"
02066 "psrlq $32, %%mm2 \n\t"
02067 "movd %%mm2, 4(%%"REG_d") \n\t"
02068 "movd %%mm1, 4(%%"REG_d", %1) \n\t"
02069 "psrlq $32, %%mm1 \n\t"
02070 "movd %%mm1, 4(%%"REG_d", %1, 2) \n\t"
02071
02072 :: "r" (dst), "r" ((x86_reg)dstStride), "r" (src)
02073 : "%"REG_a, "%"REG_d
02074 );
02075 }
02076 #endif //HAVE_MMX
02077
02078
02079 #if !HAVE_ALTIVEC
02080 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
02081 uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise)
02082 {
02083
02084 tempBlurredPast[127]= maxNoise[0];
02085 tempBlurredPast[128]= maxNoise[1];
02086 tempBlurredPast[129]= maxNoise[2];
02087
02088 #define FAST_L2_DIFF
02089
02090 #if HAVE_MMX2 || HAVE_AMD3DNOW
02091 __asm__ volatile(
02092 "lea (%2, %2, 2), %%"REG_a" \n\t"
02093 "lea (%2, %2, 4), %%"REG_d" \n\t"
02094 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t"
02095
02096
02097
02098 #ifdef L1_DIFF //needs mmx2
02099 "movq (%0), %%mm0 \n\t"
02100 "psadbw (%1), %%mm0 \n\t"
02101 "movq (%0, %2), %%mm1 \n\t"
02102 "psadbw (%1, %2), %%mm1 \n\t"
02103 "movq (%0, %2, 2), %%mm2 \n\t"
02104 "psadbw (%1, %2, 2), %%mm2 \n\t"
02105 "movq (%0, %%"REG_a"), %%mm3 \n\t"
02106 "psadbw (%1, %%"REG_a"), %%mm3 \n\t"
02107
02108 "movq (%0, %2, 4), %%mm4 \n\t"
02109 "paddw %%mm1, %%mm0 \n\t"
02110 "psadbw (%1, %2, 4), %%mm4 \n\t"
02111 "movq (%0, %%"REG_d"), %%mm5 \n\t"
02112 "paddw %%mm2, %%mm0 \n\t"
02113 "psadbw (%1, %%"REG_d"), %%mm5 \n\t"
02114 "movq (%0, %%"REG_a", 2), %%mm6 \n\t"
02115 "paddw %%mm3, %%mm0 \n\t"
02116 "psadbw (%1, %%"REG_a", 2), %%mm6 \n\t"
02117 "movq (%0, %%"REG_c"), %%mm7 \n\t"
02118 "paddw %%mm4, %%mm0 \n\t"
02119 "psadbw (%1, %%"REG_c"), %%mm7 \n\t"
02120 "paddw %%mm5, %%mm6 \n\t"
02121 "paddw %%mm7, %%mm6 \n\t"
02122 "paddw %%mm6, %%mm0 \n\t"
02123 #else //L1_DIFF
02124 #if defined (FAST_L2_DIFF)
02125 "pcmpeqb %%mm7, %%mm7 \n\t"
02126 "movq "MANGLE(b80)", %%mm6 \n\t"
02127 "pxor %%mm0, %%mm0 \n\t"
02128 #define REAL_L2_DIFF_CORE(a, b)\
02129 "movq " #a ", %%mm5 \n\t"\
02130 "movq " #b ", %%mm2 \n\t"\
02131 "pxor %%mm7, %%mm2 \n\t"\
02132 PAVGB(%%mm2, %%mm5)\
02133 "paddb %%mm6, %%mm5 \n\t"\
02134 "movq %%mm5, %%mm2 \n\t"\
02135 "psllw $8, %%mm5 \n\t"\
02136 "pmaddwd %%mm5, %%mm5 \n\t"\
02137 "pmaddwd %%mm2, %%mm2 \n\t"\
02138 "paddd %%mm2, %%mm5 \n\t"\
02139 "psrld $14, %%mm5 \n\t"\
02140 "paddd %%mm5, %%mm0 \n\t"
02141
02142 #else //defined (FAST_L2_DIFF)
02143 "pxor %%mm7, %%mm7 \n\t"
02144 "pxor %%mm0, %%mm0 \n\t"
02145 #define REAL_L2_DIFF_CORE(a, b)\
02146 "movq " #a ", %%mm5 \n\t"\
02147 "movq " #b ", %%mm2 \n\t"\
02148 "movq %%mm5, %%mm1 \n\t"\
02149 "movq %%mm2, %%mm3 \n\t"\
02150 "punpcklbw %%mm7, %%mm5 \n\t"\
02151 "punpckhbw %%mm7, %%mm1 \n\t"\
02152 "punpcklbw %%mm7, %%mm2 \n\t"\
02153 "punpckhbw %%mm7, %%mm3 \n\t"\
02154 "psubw %%mm2, %%mm5 \n\t"\
02155 "psubw %%mm3, %%mm1 \n\t"\
02156 "pmaddwd %%mm5, %%mm5 \n\t"\
02157 "pmaddwd %%mm1, %%mm1 \n\t"\
02158 "paddd %%mm1, %%mm5 \n\t"\
02159 "paddd %%mm5, %%mm0 \n\t"
02160
02161 #endif //defined (FAST_L2_DIFF)
02162
02163 #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b)
02164
02165 L2_DIFF_CORE((%0) , (%1))
02166 L2_DIFF_CORE((%0, %2) , (%1, %2))
02167 L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2))
02168 L2_DIFF_CORE((%0, %%REGa) , (%1, %%REGa))
02169 L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4))
02170 L2_DIFF_CORE((%0, %%REGd) , (%1, %%REGd))
02171 L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2))
02172 L2_DIFF_CORE((%0, %%REGc) , (%1, %%REGc))
02173
02174 #endif //L1_DIFF
02175
02176 "movq %%mm0, %%mm4 \n\t"
02177 "psrlq $32, %%mm0 \n\t"
02178 "paddd %%mm0, %%mm4 \n\t"
02179 "movd %%mm4, %%ecx \n\t"
02180 "shll $2, %%ecx \n\t"
02181 "mov %3, %%"REG_d" \n\t"
02182 "addl -4(%%"REG_d"), %%ecx \n\t"
02183 "addl 4(%%"REG_d"), %%ecx \n\t"
02184 "addl -1024(%%"REG_d"), %%ecx \n\t"
02185 "addl $4, %%ecx \n\t"
02186 "addl 1024(%%"REG_d"), %%ecx \n\t"
02187 "shrl $3, %%ecx \n\t"
02188 "movl %%ecx, (%%"REG_d") \n\t"
02189
02190
02191
02192
02193 "cmpl 512(%%"REG_d"), %%ecx \n\t"
02194 " jb 2f \n\t"
02195 "cmpl 516(%%"REG_d"), %%ecx \n\t"
02196 " jb 1f \n\t"
02197
02198 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t"
02199 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t"
02200 "movq (%0), %%mm0 \n\t"
02201 "movq (%0, %2), %%mm1 \n\t"
02202 "movq (%0, %2, 2), %%mm2 \n\t"
02203 "movq (%0, %%"REG_a"), %%mm3 \n\t"
02204 "movq (%0, %2, 4), %%mm4 \n\t"
02205 "movq (%0, %%"REG_d"), %%mm5 \n\t"
02206 "movq (%0, %%"REG_a", 2), %%mm6 \n\t"
02207 "movq (%0, %%"REG_c"), %%mm7 \n\t"
02208 "movq %%mm0, (%1) \n\t"
02209 "movq %%mm1, (%1, %2) \n\t"
02210 "movq %%mm2, (%1, %2, 2) \n\t"
02211 "movq %%mm3, (%1, %%"REG_a") \n\t"
02212 "movq %%mm4, (%1, %2, 4) \n\t"
02213 "movq %%mm5, (%1, %%"REG_d") \n\t"
02214 "movq %%mm6, (%1, %%"REG_a", 2) \n\t"
02215 "movq %%mm7, (%1, %%"REG_c") \n\t"
02216 "jmp 4f \n\t"
02217
02218 "1: \n\t"
02219 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t"
02220 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t"
02221 "movq (%0), %%mm0 \n\t"
02222 PAVGB((%1), %%mm0)
02223 "movq (%0, %2), %%mm1 \n\t"
02224 PAVGB((%1, %2), %%mm1)
02225 "movq (%0, %2, 2), %%mm2 \n\t"
02226 PAVGB((%1, %2, 2), %%mm2)
02227 "movq (%0, %%"REG_a"), %%mm3 \n\t"
02228 PAVGB((%1, %%REGa), %%mm3)
02229 "movq (%0, %2, 4), %%mm4 \n\t"
02230 PAVGB((%1, %2, 4), %%mm4)
02231 "movq (%0, %%"REG_d"), %%mm5 \n\t"
02232 PAVGB((%1, %%REGd), %%mm5)
02233 "movq (%0, %%"REG_a", 2), %%mm6 \n\t"
02234 PAVGB((%1, %%REGa, 2), %%mm6)
02235 "movq (%0, %%"REG_c"), %%mm7 \n\t"
02236 PAVGB((%1, %%REGc), %%mm7)
02237 "movq %%mm0, (%1) \n\t"
02238 "movq %%mm1, (%1, %2) \n\t"
02239 "movq %%mm2, (%1, %2, 2) \n\t"
02240 "movq %%mm3, (%1, %%"REG_a") \n\t"
02241 "movq %%mm4, (%1, %2, 4) \n\t"
02242 "movq %%mm5, (%1, %%"REG_d") \n\t"
02243 "movq %%mm6, (%1, %%"REG_a", 2) \n\t"
02244 "movq %%mm7, (%1, %%"REG_c") \n\t"
02245 "movq %%mm0, (%0) \n\t"
02246 "movq %%mm1, (%0, %2) \n\t"
02247 "movq %%mm2, (%0, %2, 2) \n\t"
02248 "movq %%mm3, (%0, %%"REG_a") \n\t"
02249 "movq %%mm4, (%0, %2, 4) \n\t"
02250 "movq %%mm5, (%0, %%"REG_d") \n\t"
02251 "movq %%mm6, (%0, %%"REG_a", 2) \n\t"
02252 "movq %%mm7, (%0, %%"REG_c") \n\t"
02253 "jmp 4f \n\t"
02254
02255 "2: \n\t"
02256 "cmpl 508(%%"REG_d"), %%ecx \n\t"
02257 " jb 3f \n\t"
02258
02259 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t"
02260 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t"
02261 "movq (%0), %%mm0 \n\t"
02262 "movq (%0, %2), %%mm1 \n\t"
02263 "movq (%0, %2, 2), %%mm2 \n\t"
02264 "movq (%0, %%"REG_a"), %%mm3 \n\t"
02265 "movq (%1), %%mm4 \n\t"
02266 "movq (%1, %2), %%mm5 \n\t"
02267 "movq (%1, %2, 2), %%mm6 \n\t"
02268 "movq (%1, %%"REG_a"), %%mm7 \n\t"
02269 PAVGB(%%mm4, %%mm0)
02270 PAVGB(%%mm5, %%mm1)
02271 PAVGB(%%mm6, %%mm2)
02272 PAVGB(%%mm7, %%mm3)
02273 PAVGB(%%mm4, %%mm0)
02274 PAVGB(%%mm5, %%mm1)
02275 PAVGB(%%mm6, %%mm2)
02276 PAVGB(%%mm7, %%mm3)
02277 "movq %%mm0, (%1) \n\t"
02278 "movq %%mm1, (%1, %2) \n\t"
02279 "movq %%mm2, (%1, %2, 2) \n\t"
02280 "movq %%mm3, (%1, %%"REG_a") \n\t"
02281 "movq %%mm0, (%0) \n\t"
02282 "movq %%mm1, (%0, %2) \n\t"
02283 "movq %%mm2, (%0, %2, 2) \n\t"
02284 "movq %%mm3, (%0, %%"REG_a") \n\t"
02285
02286 "movq (%0, %2, 4), %%mm0 \n\t"
02287 "movq (%0, %%"REG_d"), %%mm1 \n\t"
02288 "movq (%0, %%"REG_a", 2), %%mm2 \n\t"
02289 "movq (%0, %%"REG_c"), %%mm3 \n\t"
02290 "movq (%1, %2, 4), %%mm4 \n\t"
02291 "movq (%1, %%"REG_d"), %%mm5 \n\t"
02292 "movq (%1, %%"REG_a", 2), %%mm6 \n\t"
02293 "movq (%1, %%"REG_c"), %%mm7 \n\t"
02294 PAVGB(%%mm4, %%mm0)
02295 PAVGB(%%mm5, %%mm1)
02296 PAVGB(%%mm6, %%mm2)
02297 PAVGB(%%mm7, %%mm3)
02298 PAVGB(%%mm4, %%mm0)
02299 PAVGB(%%mm5, %%mm1)
02300 PAVGB(%%mm6, %%mm2)
02301 PAVGB(%%mm7, %%mm3)
02302 "movq %%mm0, (%1, %2, 4) \n\t"
02303 "movq %%mm1, (%1, %%"REG_d") \n\t"
02304 "movq %%mm2, (%1, %%"REG_a", 2) \n\t"
02305 "movq %%mm3, (%1, %%"REG_c") \n\t"
02306 "movq %%mm0, (%0, %2, 4) \n\t"
02307 "movq %%mm1, (%0, %%"REG_d") \n\t"
02308 "movq %%mm2, (%0, %%"REG_a", 2) \n\t"
02309 "movq %%mm3, (%0, %%"REG_c") \n\t"
02310 "jmp 4f \n\t"
02311
02312 "3: \n\t"
02313 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t"
02314 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t"
02315 "movq (%0), %%mm0 \n\t"
02316 "movq (%0, %2), %%mm1 \n\t"
02317 "movq (%0, %2, 2), %%mm2 \n\t"
02318 "movq (%0, %%"REG_a"), %%mm3 \n\t"
02319 "movq (%1), %%mm4 \n\t"
02320 "movq (%1, %2), %%mm5 \n\t"
02321 "movq (%1, %2, 2), %%mm6 \n\t"
02322 "movq (%1, %%"REG_a"), %%mm7 \n\t"
02323 PAVGB(%%mm4, %%mm0)
02324 PAVGB(%%mm5, %%mm1)
02325 PAVGB(%%mm6, %%mm2)
02326 PAVGB(%%mm7, %%mm3)
02327 PAVGB(%%mm4, %%mm0)
02328 PAVGB(%%mm5, %%mm1)
02329 PAVGB(%%mm6, %%mm2)
02330 PAVGB(%%mm7, %%mm3)
02331 PAVGB(%%mm4, %%mm0)
02332 PAVGB(%%mm5, %%mm1)
02333 PAVGB(%%mm6, %%mm2)
02334 PAVGB(%%mm7, %%mm3)
02335 "movq %%mm0, (%1) \n\t"
02336 "movq %%mm1, (%1, %2) \n\t"
02337 "movq %%mm2, (%1, %2, 2) \n\t"
02338 "movq %%mm3, (%1, %%"REG_a") \n\t"
02339 "movq %%mm0, (%0) \n\t"
02340 "movq %%mm1, (%0, %2) \n\t"
02341 "movq %%mm2, (%0, %2, 2) \n\t"
02342 "movq %%mm3, (%0, %%"REG_a") \n\t"
02343
02344 "movq (%0, %2, 4), %%mm0 \n\t"
02345 "movq (%0, %%"REG_d"), %%mm1 \n\t"
02346 "movq (%0, %%"REG_a", 2), %%mm2 \n\t"
02347 "movq (%0, %%"REG_c"), %%mm3 \n\t"
02348 "movq (%1, %2, 4), %%mm4 \n\t"
02349 "movq (%1, %%"REG_d"), %%mm5 \n\t"
02350 "movq (%1, %%"REG_a", 2), %%mm6 \n\t"
02351 "movq (%1, %%"REG_c"), %%mm7 \n\t"
02352 PAVGB(%%mm4, %%mm0)
02353 PAVGB(%%mm5, %%mm1)
02354 PAVGB(%%mm6, %%mm2)
02355 PAVGB(%%mm7, %%mm3)
02356 PAVGB(%%mm4, %%mm0)
02357 PAVGB(%%mm5, %%mm1)
02358 PAVGB(%%mm6, %%mm2)
02359 PAVGB(%%mm7, %%mm3)
02360 PAVGB(%%mm4, %%mm0)
02361 PAVGB(%%mm5, %%mm1)
02362 PAVGB(%%mm6, %%mm2)
02363 PAVGB(%%mm7, %%mm3)
02364 "movq %%mm0, (%1, %2, 4) \n\t"
02365 "movq %%mm1, (%1, %%"REG_d") \n\t"
02366 "movq %%mm2, (%1, %%"REG_a", 2) \n\t"
02367 "movq %%mm3, (%1, %%"REG_c") \n\t"
02368 "movq %%mm0, (%0, %2, 4) \n\t"
02369 "movq %%mm1, (%0, %%"REG_d") \n\t"
02370 "movq %%mm2, (%0, %%"REG_a", 2) \n\t"
02371 "movq %%mm3, (%0, %%"REG_c") \n\t"
02372
02373 "4: \n\t"
02374
02375 :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast)
02376 : "%"REG_a, "%"REG_d, "%"REG_c, "memory"
02377 );
02378 #else //HAVE_MMX2 || HAVE_AMD3DNOW
02379 {
02380 int y;
02381 int d=0;
02382
02383 int i;
02384
02385 for(y=0; y<8; y++){
02386 int x;
02387 for(x=0; x<8; x++){
02388 int ref= tempBlurred[ x + y*stride ];
02389 int cur= src[ x + y*stride ];
02390 int d1=ref - cur;
02391
02392
02393
02394 d+= d1*d1;
02395
02396 }
02397 }
02398 i=d;
02399 d= (
02400 4*d
02401 +(*(tempBlurredPast-256))
02402 +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1))
02403 +(*(tempBlurredPast+256))
02404 +4)>>3;
02405 *tempBlurredPast=i;
02406
02407
02408
02409
02410
02411
02412
02413
02414
02415 if(d > maxNoise[1]){
02416 if(d < maxNoise[2]){
02417 for(y=0; y<8; y++){
02418 int x;
02419 for(x=0; x<8; x++){
02420 int ref= tempBlurred[ x + y*stride ];
02421 int cur= src[ x + y*stride ];
02422 tempBlurred[ x + y*stride ]=
02423 src[ x + y*stride ]=
02424 (ref + cur + 1)>>1;
02425 }
02426 }
02427 }else{
02428 for(y=0; y<8; y++){
02429 int x;
02430 for(x=0; x<8; x++){
02431 tempBlurred[ x + y*stride ]= src[ x + y*stride ];
02432 }
02433 }
02434 }
02435 }else{
02436 if(d < maxNoise[0]){
02437 for(y=0; y<8; y++){
02438 int x;
02439 for(x=0; x<8; x++){
02440 int ref= tempBlurred[ x + y*stride ];
02441 int cur= src[ x + y*stride ];
02442 tempBlurred[ x + y*stride ]=
02443 src[ x + y*stride ]=
02444 (ref*7 + cur + 4)>>3;
02445 }
02446 }
02447 }else{
02448 for(y=0; y<8; y++){
02449 int x;
02450 for(x=0; x<8; x++){
02451 int ref= tempBlurred[ x + y*stride ];
02452 int cur= src[ x + y*stride ];
02453 tempBlurred[ x + y*stride ]=
02454 src[ x + y*stride ]=
02455 (ref*3 + cur + 2)>>2;
02456 }
02457 }
02458 }
02459 }
02460 }
02461 #endif //HAVE_MMX2 || HAVE_AMD3DNOW
02462 }
02463 #endif //HAVE_ALTIVEC
02464
02465 #if HAVE_MMX
02466
/**
 * "Accurate" deblocking of one 8-byte-wide edge using MMX inline assembly.
 *
 * src is first advanced by 3*step. In the comments below "row k" means the
 * 8 bytes at (advanced src) + k*step; every MMX register holds one row, so
 * all decisions are made independently per byte lane.
 *
 * Stages:
 *  1. Classification over rows 0..9 produces two per-lane masks:
 *       dc_mask - set where (max - min) of rows 1..8 is below 2*pQPb,
 *       eq_mask - set where the number of adjacent-row pairs that pass the
 *                 mmxDcOffset/mmxDcThreshold test exceeds the low byte of
 *                 ppMode.flatnessThreshold.
 *  2. If any lane has both masks set, rows 1..8 are low-pass filtered in
 *     those lanes (running sums are built in sums[], then applied).
 *  3. If any lane has eq_mask clear, the default filter corrects rows 4 and 5
 *     in those lanes.
 *
 * @param src    pointer into the pixel data; read and modified in place
 * @param step   distance in bytes between consecutive rows across the edge
 * @param stride not referenced anywhere in this function body
 * @param c      context supplying mmxDcOffset, mmxDcThreshold, nonBQP, pQPb
 *               and ppMode.flatnessThreshold
 *
 * NOTE(review): the asm statements store to sums[], tmp[] and the pixel rows
 * through register operands without a "memory" clobber, and mm6/mm7 are
 * loaded in one asm statement and consumed in the next - confirm the
 * compiler cannot reorder or cache around these.
 */
02469 static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){
02470 int64_t dc_mask, eq_mask, both_masks;
02471 int64_t sums[10*8*2];
02472 src+= step*3;
02473 // from here on "row k" is the 8 bytes at src + k*step
02474 __asm__ volatile(
02475 "movq %0, %%mm7 \n\t"
02476 "movq %1, %%mm6 \n\t"
02477 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
02478 );
02479 // stage 1: classification; mm7 (offset) and mm6 (threshold) come from the statement above
02480 __asm__ volatile(
02481 "lea (%2, %3), %%"REG_a" \n\t"
02482 // REG_a = row 1. For every adjacent pair (a, b): mm0 += ((a - b + mm7) > mm6) ? -1 : 0 per byte
02483 // mm4 / mm3 hold the running per-byte max / min, seeded with row 1
02484
02485 "movq (%2), %%mm0 \n\t"
02486 "movq (%%"REG_a"), %%mm1 \n\t"
02487 "movq %%mm1, %%mm3 \n\t"
02488 "movq %%mm1, %%mm4 \n\t"
02489 "psubb %%mm1, %%mm0 \n\t"
02490 "paddb %%mm7, %%mm0 \n\t"
02491 "pcmpgtb %%mm6, %%mm0 \n\t"
02492 // row 2
02493 "movq (%%"REG_a",%3), %%mm2 \n\t"
02494 PMAXUB(%%mm2, %%mm4)
02495 PMINUB(%%mm2, %%mm3, %%mm5)
02496 "psubb %%mm2, %%mm1 \n\t"
02497 "paddb %%mm7, %%mm1 \n\t"
02498 "pcmpgtb %%mm6, %%mm1 \n\t"
02499 "paddb %%mm1, %%mm0 \n\t"
02500 // row 3
02501 "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
02502 PMAXUB(%%mm1, %%mm4)
02503 PMINUB(%%mm1, %%mm3, %%mm5)
02504 "psubb %%mm1, %%mm2 \n\t"
02505 "paddb %%mm7, %%mm2 \n\t"
02506 "pcmpgtb %%mm6, %%mm2 \n\t"
02507 "paddb %%mm2, %%mm0 \n\t"
02508
02509 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t"
02510 // REG_a = row 5 from here on; the next load is row 4
02511 "movq (%2, %3, 4), %%mm2 \n\t"
02512 PMAXUB(%%mm2, %%mm4)
02513 PMINUB(%%mm2, %%mm3, %%mm5)
02514 "psubb %%mm2, %%mm1 \n\t"
02515 "paddb %%mm7, %%mm1 \n\t"
02516 "pcmpgtb %%mm6, %%mm1 \n\t"
02517 "paddb %%mm1, %%mm0 \n\t"
02518 // row 5
02519 "movq (%%"REG_a"), %%mm1 \n\t"
02520 PMAXUB(%%mm1, %%mm4)
02521 PMINUB(%%mm1, %%mm3, %%mm5)
02522 "psubb %%mm1, %%mm2 \n\t"
02523 "paddb %%mm7, %%mm2 \n\t"
02524 "pcmpgtb %%mm6, %%mm2 \n\t"
02525 "paddb %%mm2, %%mm0 \n\t"
02526 // row 6
02527 "movq (%%"REG_a", %3), %%mm2 \n\t"
02528 PMAXUB(%%mm2, %%mm4)
02529 PMINUB(%%mm2, %%mm3, %%mm5)
02530 "psubb %%mm2, %%mm1 \n\t"
02531 "paddb %%mm7, %%mm1 \n\t"
02532 "pcmpgtb %%mm6, %%mm1 \n\t"
02533 "paddb %%mm1, %%mm0 \n\t"
02534 // row 7
02535 "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
02536 PMAXUB(%%mm1, %%mm4)
02537 PMINUB(%%mm1, %%mm3, %%mm5)
02538 "psubb %%mm1, %%mm2 \n\t"
02539 "paddb %%mm7, %%mm2 \n\t"
02540 "pcmpgtb %%mm6, %%mm2 \n\t"
02541 "paddb %%mm2, %%mm0 \n\t"
02542 // row 8
02543 "movq (%2, %3, 8), %%mm2 \n\t"
02544 PMAXUB(%%mm2, %%mm4)
02545 PMINUB(%%mm2, %%mm3, %%mm5)
02546 "psubb %%mm2, %%mm1 \n\t"
02547 "paddb %%mm7, %%mm1 \n\t"
02548 "pcmpgtb %%mm6, %%mm1 \n\t"
02549 "paddb %%mm1, %%mm0 \n\t"
02550 // row 9: compared against row 8 but not folded into max/min
02551 "movq (%%"REG_a", %3, 4), %%mm1 \n\t"
02552 "psubb %%mm1, %%mm2 \n\t"
02553 "paddb %%mm7, %%mm2 \n\t"
02554 "pcmpgtb %%mm6, %%mm2 \n\t"
02555 "paddb %%mm2, %%mm0 \n\t"
02556 "psubusb %%mm3, %%mm4 \n\t" // mm4 = max - min
02557 // dc_mask: 0xFF in lanes where (max - min) < 2*pQPb (the doubling saturates)
02558 "pxor %%mm6, %%mm6 \n\t"
02559 "movq %4, %%mm7 \n\t"
02560 "paddusb %%mm7, %%mm7 \n\t"
02561 "psubusb %%mm4, %%mm7 \n\t"
02562 "pcmpeqb %%mm6, %%mm7 \n\t"
02563 "pcmpeqb %%mm6, %%mm7 \n\t" // second compare against zero inverts the mask
02564 "movq %%mm7, %1 \n\t"
02565 // eq_mask: 0xFF in lanes where the pass count (0 - mm0) > low byte of flatnessThreshold (signed)
02566 "movq %5, %%mm7 \n\t"
02567 "punpcklbw %%mm7, %%mm7 \n\t"
02568 "punpcklbw %%mm7, %%mm7 \n\t"
02569 "punpcklbw %%mm7, %%mm7 \n\t" // low byte now replicated into all 8 lanes
02570 "psubb %%mm0, %%mm6 \n\t"
02571 "pcmpgtb %%mm7, %%mm6 \n\t"
02572 "movq %%mm6, %0 \n\t"
02573
02574 : "=m" (eq_mask), "=m" (dc_mask)
02575 : "r" (src), "r" ((x86_reg)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold)
02576 : "%"REG_a
02577 );
02578
02579 both_masks = dc_mask & eq_mask;
02580 // stage 2: low-pass filter, only needed when at least one lane has both masks set
02581 if(both_masks){
02582 x86_reg offset= -8*step;
02583 int64_t *temp_sums= sums;
02584 // pass 1: build running sums; each sum occupies two quadwords (low 4 lanes, high 4 lanes as 16-bit words)
02585 __asm__ volatile(
02586 "movq %2, %%mm0 \n\t"
02587 "pxor %%mm4, %%mm4 \n\t"
02588 // mm6 = "first": row 0 in lanes where |row 1 - row 0| < pQPb, otherwise row 1
02589 "movq (%0), %%mm6 \n\t"
02590 "movq (%0, %1), %%mm5 \n\t"
02591 "movq %%mm5, %%mm1 \n\t"
02592 "movq %%mm6, %%mm2 \n\t"
02593 "psubusb %%mm6, %%mm5 \n\t"
02594 "psubusb %%mm1, %%mm2 \n\t"
02595 "por %%mm5, %%mm2 \n\t"
02596 "psubusb %%mm2, %%mm0 \n\t"
02597 "pcmpeqb %%mm4, %%mm0 \n\t"
02598
02599 "pxor %%mm6, %%mm1 \n\t"
02600 "pand %%mm0, %%mm1 \n\t"
02601 "pxor %%mm1, %%mm6 \n\t"
02602 // mm7 = "last": row 9 in lanes where |row 8 - row 9| < pQPb, otherwise row 8
02603 // (the add below leaves %0 pointing at row 1)
02604 "movq (%0, %1, 8), %%mm5 \n\t"
02605 "add %1, %0 \n\t"
02606 "movq (%0, %1, 8), %%mm7 \n\t"
02607 "movq %%mm5, %%mm1 \n\t"
02608 "movq %%mm7, %%mm2 \n\t"
02609 "psubusb %%mm7, %%mm5 \n\t"
02610 "psubusb %%mm1, %%mm2 \n\t"
02611 "por %%mm5, %%mm2 \n\t"
02612 "movq %2, %%mm0 \n\t"
02613 "psubusb %%mm2, %%mm0 \n\t"
02614 "pcmpeqb %%mm4, %%mm0 \n\t"
02615
02616 "pxor %%mm7, %%mm1 \n\t"
02617 "pand %%mm0, %%mm1 \n\t"
02618 "pxor %%mm1, %%mm7 \n\t"
02619 // widen "first" to 16-bit words: mm5 = low 4 lanes, mm6 = high 4 lanes
02620 "movq %%mm6, %%mm5 \n\t"
02621 "punpckhbw %%mm4, %%mm6 \n\t"
02622 "punpcklbw %%mm4, %%mm5 \n\t"
02623 // accumulator mm0/mm1 starts at 4*first + w04
02624 // (w04 is defined elsewhere; presumably four words of 4 used for rounding - confirm)
02625 "movq %%mm5, %%mm0 \n\t"
02626 "movq %%mm6, %%mm1 \n\t"
02627 "psllw $2, %%mm0 \n\t"
02628 "psllw $2, %%mm1 \n\t"
02629 "paddw "MANGLE(w04)", %%mm0 \n\t"
02630 "paddw "MANGLE(w04)", %%mm1 \n\t"
02631 // NEXT: add the row at %0 (widened to words) to mm0/mm1, then advance %0 by step
02632 #define NEXT\
02633 "movq (%0), %%mm2 \n\t"\
02634 "movq (%0), %%mm3 \n\t"\
02635 "add %1, %0 \n\t"\
02636 "punpcklbw %%mm4, %%mm2 \n\t"\
02637 "punpckhbw %%mm4, %%mm3 \n\t"\
02638 "paddw %%mm2, %%mm0 \n\t"\
02639 "paddw %%mm3, %%mm1 \n\t"
02640 // PREV: like NEXT, but subtracts the row from mm0/mm1
02641 #define PREV\
02642 "movq (%0), %%mm2 \n\t"\
02643 "movq (%0), %%mm3 \n\t"\
02644 "add %1, %0 \n\t"\
02645 "punpcklbw %%mm4, %%mm2 \n\t"\
02646 "punpckhbw %%mm4, %%mm3 \n\t"\
02647 "psubw %%mm2, %%mm0 \n\t"\
02648 "psubw %%mm3, %%mm1 \n\t"
02649 // sum 0 (bytes 0..15 of sums): 4*first + w04 + rows 1..3
02650 // sums 1..4: add the next row (4..7) and drop one copy of "first" each time
02651 NEXT
02652 NEXT
02653 NEXT
02654 "movq %%mm0, (%3) \n\t"
02655 "movq %%mm1, 8(%3) \n\t"
02656
02657 NEXT
02658 "psubw %%mm5, %%mm0 \n\t"
02659 "psubw %%mm6, %%mm1 \n\t"
02660 "movq %%mm0, 16(%3) \n\t"
02661 "movq %%mm1, 24(%3) \n\t"
02662
02663 NEXT
02664 "psubw %%mm5, %%mm0 \n\t"
02665 "psubw %%mm6, %%mm1 \n\t"
02666 "movq %%mm0, 32(%3) \n\t"
02667 "movq %%mm1, 40(%3) \n\t"
02668
02669 NEXT
02670 "psubw %%mm5, %%mm0 \n\t"
02671 "psubw %%mm6, %%mm1 \n\t"
02672 "movq %%mm0, 48(%3) \n\t"
02673 "movq %%mm1, 56(%3) \n\t"
02674
02675 NEXT
02676 "psubw %%mm5, %%mm0 \n\t"
02677 "psubw %%mm6, %%mm1 \n\t"
02678 "movq %%mm0, 64(%3) \n\t"
02679 "movq %%mm1, 72(%3) \n\t"
02680 // widen "last": mm6 = low 4 lanes, mm7 = high 4 lanes ("first" is no longer needed)
02681 "movq %%mm7, %%mm6 \n\t"
02682 "punpckhbw %%mm4, %%mm7 \n\t"
02683 "punpcklbw %%mm4, %%mm6 \n\t"
02684 // sums 5..9: row 8 is added once, then rows 1..5 are removed while "last" is added in
02685 NEXT
02686 "mov %4, %0 \n\t" // rewind %0 to row 0 (operand 4 is the untouched src)
02687 "add %1, %0 \n\t" // %0 = row 1
02688 PREV
02689 "movq %%mm0, 80(%3) \n\t"
02690 "movq %%mm1, 88(%3) \n\t"
02691
02692 PREV
02693 "paddw %%mm6, %%mm0 \n\t"
02694 "paddw %%mm7, %%mm1 \n\t"
02695 "movq %%mm0, 96(%3) \n\t"
02696 "movq %%mm1, 104(%3) \n\t"
02697
02698 PREV
02699 "paddw %%mm6, %%mm0 \n\t"
02700 "paddw %%mm7, %%mm1 \n\t"
02701 "movq %%mm0, 112(%3) \n\t"
02702 "movq %%mm1, 120(%3) \n\t"
02703
02704 PREV
02705 "paddw %%mm6, %%mm0 \n\t"
02706 "paddw %%mm7, %%mm1 \n\t"
02707 "movq %%mm0, 128(%3) \n\t"
02708 "movq %%mm1, 136(%3) \n\t"
02709
02710 PREV
02711 "paddw %%mm6, %%mm0 \n\t"
02712 "paddw %%mm7, %%mm1 \n\t"
02713 "movq %%mm0, 144(%3) \n\t"
02714 "movq %%mm1, 152(%3) \n\t"
02715
02716 "mov %4, %0 \n\t" // restore src so the C code sees it unchanged
02717
02718 : "+&r"(src)
02719 : "r" ((x86_reg)step), "m" (c->pQPb), "r"(sums), "g"(src)
02720 );
02721
02722 src+= step;
02723 // pass 2: row = (sum[j] + sum[j+2] + 2*row) >> 4, written back only in lanes where both_masks is set
02724 __asm__ volatile(
02725 "movq %4, %%mm6 \n\t"
02726 "pcmpeqb %%mm5, %%mm5 \n\t"
02727 "pxor %%mm6, %%mm5 \n\t" // mm5 = ~both_masks
02728 "pxor %%mm7, %%mm7 \n\t"
02729
02730 "1: \n\t"
02731 "movq (%1), %%mm0 \n\t"
02732 "movq 8(%1), %%mm1 \n\t"
02733 "paddw 32(%1), %%mm0 \n\t"
02734 "paddw 40(%1), %%mm1 \n\t"
02735 "movq (%0, %3), %%mm2 \n\t" // %3 is (src - initial offset), so %0 + %3 starts at src
02736 "movq %%mm2, %%mm3 \n\t"
02737 "movq %%mm2, %%mm4 \n\t"
02738 "punpcklbw %%mm7, %%mm2 \n\t"
02739 "punpckhbw %%mm7, %%mm3 \n\t"
02740 "paddw %%mm2, %%mm0 \n\t"
02741 "paddw %%mm3, %%mm1 \n\t"
02742 "paddw %%mm2, %%mm0 \n\t"
02743 "paddw %%mm3, %%mm1 \n\t"
02744 "psrlw $4, %%mm0 \n\t"
02745 "psrlw $4, %%mm1 \n\t"
02746 "packuswb %%mm1, %%mm0 \n\t"
02747 "pand %%mm6, %%mm0 \n\t"
02748 "pand %%mm5, %%mm4 \n\t"
02749 "por %%mm4, %%mm0 \n\t"
02750 "movq %%mm0, (%0, %3) \n\t"
02751 "add $16, %1 \n\t"
02752 "add %2, %0 \n\t"
02753 " js 1b \n\t" // repeat while the updated offset is still negative
02754
02755 : "+r"(offset), "+r"(temp_sums)
02756 : "r" ((x86_reg)step), "r"(src - offset), "m"(both_masks)
02757 );
02758 }else
02759 src+= step;
02760 // stage 3: default filter, needed when at least one lane has eq_mask clear
02761 if(eq_mask != -1LL){
02762 uint8_t *temp_src= src;
02763 DECLARE_ALIGNED(8, uint64_t, tmp)[4];
02764 __asm__ volatile(
02765 "pxor %%mm7, %%mm7 \n\t"
02766 // L0..L7 below are the rows at temp_src + 0..7*step, widened to 16-bit words;
02767 // register pairs hold (low 4 lanes, high 4 lanes)
02768
02769 "movq (%0), %%mm0 \n\t"
02770 "movq %%mm0, %%mm1 \n\t"
02771 "punpcklbw %%mm7, %%mm0 \n\t"
02772 "punpckhbw %%mm7, %%mm1 \n\t"
02773
02774 "movq (%0, %1), %%mm2 \n\t"
02775 "lea (%0, %1, 2), %%"REG_a" \n\t"
02776 "movq %%mm2, %%mm3 \n\t"
02777 "punpcklbw %%mm7, %%mm2 \n\t"
02778 "punpckhbw %%mm7, %%mm3 \n\t"
02779
02780 "movq (%%"REG_a"), %%mm4 \n\t"
02781 "movq %%mm4, %%mm5 \n\t"
02782 "punpcklbw %%mm7, %%mm4 \n\t"
02783 "punpckhbw %%mm7, %%mm5 \n\t"
02784 // mm0/mm1 = 2*L0 - 5*L1 + 5*L2 (then - 2*L3 below): the "left" term
02785 "paddw %%mm0, %%mm0 \n\t"
02786 "paddw %%mm1, %%mm1 \n\t"
02787 "psubw %%mm4, %%mm2 \n\t"
02788 "psubw %%mm5, %%mm3 \n\t"
02789 "psubw %%mm2, %%mm0 \n\t"
02790 "psubw %%mm3, %%mm1 \n\t"
02791
02792 "psllw $2, %%mm2 \n\t"
02793 "psllw $2, %%mm3 \n\t"
02794 "psubw %%mm2, %%mm0 \n\t"
02795 "psubw %%mm3, %%mm1 \n\t"
02796
02797 "movq (%%"REG_a", %1), %%mm2 \n\t"
02798 "movq %%mm2, %%mm3 \n\t"
02799 "punpcklbw %%mm7, %%mm2 \n\t"
02800 "punpckhbw %%mm7, %%mm3 \n\t"
02801
02802 "psubw %%mm2, %%mm0 \n\t"
02803 "psubw %%mm3, %%mm1 \n\t"
02804 "psubw %%mm2, %%mm0 \n\t"
02805 "psubw %%mm3, %%mm1 \n\t"
02806 "movq %%mm0, (%4) \n\t" // tmp[0], tmp[1] = 2*L0 - 5*L1 + 5*L2 - 2*L3
02807 "movq %%mm1, 8(%4) \n\t"
02808
02809 "movq (%%"REG_a", %1, 2), %%mm0 \n\t"
02810 "movq %%mm0, %%mm1 \n\t"
02811 "punpcklbw %%mm7, %%mm0 \n\t"
02812 "punpckhbw %%mm7, %%mm1 \n\t"
02813
02814 "psubw %%mm0, %%mm2 \n\t"
02815 "psubw %%mm1, %%mm3 \n\t"
02816 "movq %%mm2, 16(%4) \n\t" // tmp[2], tmp[3] = L3 - L4
02817 "movq %%mm3, 24(%4) \n\t"
02818 "paddw %%mm4, %%mm4 \n\t"
02819 "paddw %%mm5, %%mm5 \n\t"
02820 "psubw %%mm2, %%mm4 \n\t"
02821 "psubw %%mm3, %%mm5 \n\t"
02822
02823 "lea (%%"REG_a", %1), %0 \n\t" // %0 now points at L3
02824 "psllw $2, %%mm2 \n\t"
02825 "psllw $2, %%mm3 \n\t"
02826 "psubw %%mm2, %%mm4 \n\t"
02827 "psubw %%mm3, %%mm5 \n\t"
02828 // mm4/mm5 = 2*L2 - 5*L3 + 5*L4 - 2*L5: the "middle" term
02829 "movq (%0, %1, 2), %%mm2 \n\t"
02830 "movq %%mm2, %%mm3 \n\t"
02831 "punpcklbw %%mm7, %%mm2 \n\t"
02832 "punpckhbw %%mm7, %%mm3 \n\t"
02833 "psubw %%mm2, %%mm4 \n\t"
02834 "psubw %%mm3, %%mm5 \n\t"
02835 "psubw %%mm2, %%mm4 \n\t"
02836 "psubw %%mm3, %%mm5 \n\t"
02837 // mm0/mm1 = 2*L4 - 5*L5 + 5*L6 - 2*L7: the "right" term
02838 "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
02839 "punpcklbw %%mm7, %%mm6 \n\t"
02840 "psubw %%mm6, %%mm2 \n\t"
02841 "movq (%%"REG_a", %1, 4), %%mm6 \n\t"
02842 "punpckhbw %%mm7, %%mm6 \n\t"
02843 "psubw %%mm6, %%mm3 \n\t"
02844
02845 "paddw %%mm0, %%mm0 \n\t"
02846 "paddw %%mm1, %%mm1 \n\t"
02847 "psubw %%mm2, %%mm0 \n\t"
02848 "psubw %%mm3, %%mm1 \n\t"
02849
02850 "psllw $2, %%mm2 \n\t"
02851 "psllw $2, %%mm3 \n\t"
02852 "psubw %%mm2, %%mm0 \n\t"
02853 "psubw %%mm3, %%mm1 \n\t"
02854
02855 "movq (%0, %1, 4), %%mm2 \n\t"
02856 "movq %%mm2, %%mm3 \n\t"
02857 "punpcklbw %%mm7, %%mm2 \n\t"
02858 "punpckhbw %%mm7, %%mm3 \n\t"
02859
02860 "paddw %%mm2, %%mm2 \n\t"
02861 "paddw %%mm3, %%mm3 \n\t"
02862 "psubw %%mm2, %%mm0 \n\t"
02863 "psubw %%mm3, %%mm1 \n\t"
02864
02865 "movq (%4), %%mm2 \n\t" // reload the "left" term
02866 "movq 8(%4), %%mm3 \n\t"
02867 // absolute value of the "right" (mm0/mm1) and "left" (mm2/mm3) terms
02868 #if HAVE_MMX2
02869 "movq %%mm7, %%mm6 \n\t"
02870 "psubw %%mm0, %%mm6 \n\t"
02871 "pmaxsw %%mm6, %%mm0 \n\t"
02872 "movq %%mm7, %%mm6 \n\t"
02873 "psubw %%mm1, %%mm6 \n\t"
02874 "pmaxsw %%mm6, %%mm1 \n\t"
02875 "movq %%mm7, %%mm6 \n\t"
02876 "psubw %%mm2, %%mm6 \n\t"
02877 "pmaxsw %%mm6, %%mm2 \n\t"
02878 "movq %%mm7, %%mm6 \n\t"
02879 "psubw %%mm3, %%mm6 \n\t"
02880 "pmaxsw %%mm6, %%mm3 \n\t"
02881 #else
02882 "movq %%mm7, %%mm6 \n\t"
02883 "pcmpgtw %%mm0, %%mm6 \n\t"
02884 "pxor %%mm6, %%mm0 \n\t"
02885 "psubw %%mm6, %%mm0 \n\t"
02886 "movq %%mm7, %%mm6 \n\t"
02887 "pcmpgtw %%mm1, %%mm6 \n\t"
02888 "pxor %%mm6, %%mm1 \n\t"
02889 "psubw %%mm6, %%mm1 \n\t"
02890 "movq %%mm7, %%mm6 \n\t"
02891 "pcmpgtw %%mm2, %%mm6 \n\t"
02892 "pxor %%mm6, %%mm2 \n\t"
02893 "psubw %%mm6, %%mm2 \n\t"
02894 "movq %%mm7, %%mm6 \n\t"
02895 "pcmpgtw %%mm3, %%mm6 \n\t"
02896 "pxor %%mm6, %%mm3 \n\t"
02897 "psubw %%mm6, %%mm3 \n\t"
02898 #endif
02899 // mm0/mm1 = min(|left|, |right|)
02900 #if HAVE_MMX2
02901 "pminsw %%mm2, %%mm0 \n\t"
02902 "pminsw %%mm3, %%mm1 \n\t"
02903 #else
02904 "movq %%mm0, %%mm6 \n\t"
02905 "psubusw %%mm2, %%mm6 \n\t"
02906 "psubw %%mm6, %%mm0 \n\t"
02907 "movq %%mm1, %%mm6 \n\t"
02908 "psubusw %%mm3, %%mm6 \n\t"
02909 "psubw %%mm6, %%mm1 \n\t"
02910 #endif
02911 // mm2 = pQPb widened to four words
02912 "movd %2, %%mm2 \n\t"
02913 "punpcklbw %%mm7, %%mm2 \n\t"
02914 // mm6/mm7 = sign masks of the "middle" term, mm4/mm5 = |middle| (mm7 stops being zero here)
02915 "movq %%mm7, %%mm6 \n\t"
02916 "pcmpgtw %%mm4, %%mm6 \n\t"
02917 "pxor %%mm6, %%mm4 \n\t"
02918 "psubw %%mm6, %%mm4 \n\t"
02919 "pcmpgtw %%mm5, %%mm7 \n\t"
02920 "pxor %%mm7, %%mm5 \n\t"
02921 "psubw %%mm7, %%mm5 \n\t"
02922 // keep |middle| only where it is below 8*QP, zero it elsewhere
02923 "psllw $3, %%mm2 \n\t"
02924 "movq %%mm2, %%mm3 \n\t"
02925 "pcmpgtw %%mm4, %%mm2 \n\t"
02926 "pcmpgtw %%mm5, %%mm3 \n\t"
02927 "pand %%mm2, %%mm4 \n\t"
02928 "pand %%mm3, %%mm5 \n\t"
02929
02930 // d = |middle| - min(|left|, |right|), saturating at 0
02931 "psubusw %%mm0, %%mm4 \n\t"
02932 "psubusw %%mm1, %%mm5 \n\t"
02933
02934 // d = (d * w05 + w20) >> 6 (both constants are defined elsewhere; presumably 5 and 0x20 per word - confirm)
02935 "movq "MANGLE(w05)", %%mm2 \n\t"
02936 "pmullw %%mm2, %%mm4 \n\t"
02937 "pmullw %%mm2, %%mm5 \n\t"
02938 "movq "MANGLE(w20)", %%mm2 \n\t"
02939 "paddw %%mm2, %%mm4 \n\t"
02940 "paddw %%mm2, %%mm5 \n\t"
02941 "psrlw $6, %%mm4 \n\t"
02942 "psrlw $6, %%mm5 \n\t"
02943
02944 "movq 16(%4), %%mm0 \n\t" // reload L3 - L4
02945 "movq 24(%4), %%mm1 \n\t"
02946
02947 "pxor %%mm2, %%mm2 \n\t"
02948 "pxor %%mm3, %%mm3 \n\t"
02949 // mm2/mm3 = sign masks of (L3 - L4); mm0/mm1 = |L3 - L4| >> 1
02950 "pcmpgtw %%mm0, %%mm2 \n\t"
02951 "pcmpgtw %%mm1, %%mm3 \n\t"
02952 "pxor %%mm2, %%mm0 \n\t"
02953 "pxor %%mm3, %%mm1 \n\t"
02954 "psubw %%mm2, %%mm0 \n\t"
02955 "psubw %%mm3, %%mm1 \n\t"
02956 "psrlw $1, %%mm0 \n\t"
02957 "psrlw $1, %%mm1 \n\t"
02958 // d survives only where (L3 - L4) and the "middle" term have different signs
02959 "pxor %%mm6, %%mm2 \n\t"
02960 "pxor %%mm7, %%mm3 \n\t"
02961 "pand %%mm2, %%mm4 \n\t"
02962 "pand %%mm3, %%mm5 \n\t"
02963 // d = min(d, |L3 - L4| >> 1)
02964 #if HAVE_MMX2
02965 "pminsw %%mm0, %%mm4 \n\t"
02966 "pminsw %%mm1, %%mm5 \n\t"
02967 #else
02968 "movq %%mm4, %%mm2 \n\t"
02969 "psubusw %%mm0, %%mm2 \n\t"
02970 "psubw %%mm2, %%mm4 \n\t"
02971 "movq %%mm5, %%mm2 \n\t"
02972 "psubusw %%mm1, %%mm2 \n\t"
02973 "psubw %%mm2, %%mm5 \n\t"
02974 #endif
02975 "pxor %%mm6, %%mm4 \n\t" // give d the sign of the "middle" term
02976 "pxor %%mm7, %%mm5 \n\t"
02977 "psubw %%mm6, %%mm4 \n\t"
02978 "psubw %%mm7, %%mm5 \n\t"
02979 "packsswb %%mm5, %%mm4 \n\t"
02980 "movq %3, %%mm1 \n\t"
02981 "pandn %%mm4, %%mm1 \n\t" // mm1 = d in lanes where eq_mask is clear, 0 elsewhere
02982 "movq (%0), %%mm0 \n\t"
02983 "paddb %%mm1, %%mm0 \n\t" // L3 += d
02984 "movq %%mm0, (%0) \n\t"
02985 "movq (%0, %1), %%mm0 \n\t"
02986 "psubb %%mm1, %%mm0 \n\t" // L4 -= d
02987 "movq %%mm0, (%0, %1) \n\t"
02988
02989 : "+r" (temp_src)
02990 : "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask), "r"(tmp)
02991 : "%"REG_a
02992 );
02993 }
02994
02995
02996
02997
02998
02999 }
03000 #endif //HAVE_MMX
03001
// Forward declaration of the main per-plane filter driver; the definition follows later in this file.
03002 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
03003 const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c);
03004
03009 #undef REAL_SCALED_CPY
03010 #undef SCALED_CPY
03011
/**
 * Copies a block of 8 rows from src to dst.
 *
 * MMX builds move 8 bytes per row; the C fallback copies BLOCK_SIZE bytes per row.
 *
 * @param dst                  destination of the first row
 * @param dstStride            distance in bytes between destination rows
 * @param src                  source of the first row
 * @param srcStride            distance in bytes between source rows
 * @param levelFix             non-zero selects the rescaling copy (MMX builds only);
 *                             without MMX both branches are a plain memcpy loop
 * @param packedOffsetAndScale two consecutive quadwords: the first is subtracted
 *                             (offset), the second is the multiplier (scale)
 *
 * NOTE(review): both asm statements store through dst without a "memory" clobber - confirm this is safe.
 */
03012 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride,
03013 int levelFix, int64_t *packedOffsetAndScale)
03014 {
03015 #if !HAVE_MMX
03016 int i;
03017 #endif
03018 if(levelFix){
03019 #if HAVE_MMX
03020 __asm__ volatile(
03021 "movq (%%"REG_a"), %%mm2 \n\t" // REG_a enters holding packedOffsetAndScale: mm2 = offset
03022 "movq 8(%%"REG_a"), %%mm3 \n\t" // mm3 = scale
03023 "lea (%2,%4), %%"REG_a" \n\t" // REG_a = source row 1
03024 "lea (%3,%5), %%"REG_d" \n\t" // REG_d = destination row 1
03025 "pxor %%mm4, %%mm4 \n\t"
03026 #if HAVE_MMX2
03027 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
03028 "movq " #src1 ", %%mm0 \n\t"\
03029 "movq " #src1 ", %%mm5 \n\t"\
03030 "movq " #src2 ", %%mm1 \n\t"\
03031 "movq " #src2 ", %%mm6 \n\t"\
03032 "punpcklbw %%mm0, %%mm0 \n\t"\
03033 "punpckhbw %%mm5, %%mm5 \n\t"\
03034 "punpcklbw %%mm1, %%mm1 \n\t"\
03035 "punpckhbw %%mm6, %%mm6 \n\t"\
03036 "pmulhuw %%mm3, %%mm0 \n\t"\
03037 "pmulhuw %%mm3, %%mm5 \n\t"\
03038 "pmulhuw %%mm3, %%mm1 \n\t"\
03039 "pmulhuw %%mm3, %%mm6 \n\t"\
03040 "psubw %%mm2, %%mm0 \n\t"\
03041 "psubw %%mm2, %%mm5 \n\t"\
03042 "psubw %%mm2, %%mm1 \n\t"\
03043 "psubw %%mm2, %%mm6 \n\t"\
03044 "packuswb %%mm5, %%mm0 \n\t"\
03045 "packuswb %%mm6, %%mm1 \n\t"\
03046 "movq %%mm0, " #dst1 " \n\t"\
03047 "movq %%mm1, " #dst2 " \n\t"\
03048
03049 #else //HAVE_MMX2
03050 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \
03051 "movq " #src1 ", %%mm0 \n\t"\
03052 "movq " #src1 ", %%mm5 \n\t"\
03053 "punpcklbw %%mm4, %%mm0 \n\t"\
03054 "punpckhbw %%mm4, %%mm5 \n\t"\
03055 "psubw %%mm2, %%mm0 \n\t"\
03056 "psubw %%mm2, %%mm5 \n\t"\
03057 "movq " #src2 ", %%mm1 \n\t"\
03058 "psllw $6, %%mm0 \n\t"\
03059 "psllw $6, %%mm5 \n\t"\
03060 "pmulhw %%mm3, %%mm0 \n\t"\
03061 "movq " #src2 ", %%mm6 \n\t"\
03062 "pmulhw %%mm3, %%mm5 \n\t"\
03063 "punpcklbw %%mm4, %%mm1 \n\t"\
03064 "punpckhbw %%mm4, %%mm6 \n\t"\
03065 "psubw %%mm2, %%mm1 \n\t"\
03066 "psubw %%mm2, %%mm6 \n\t"\
03067 "psllw $6, %%mm1 \n\t"\
03068 "psllw $6, %%mm6 \n\t"\
03069 "pmulhw %%mm3, %%mm1 \n\t"\
03070 "pmulhw %%mm3, %%mm6 \n\t"\
03071 "packuswb %%mm5, %%mm0 \n\t"\
03072 "packuswb %%mm6, %%mm1 \n\t"\
03073 "movq %%mm0, " #dst1 " \n\t"\
03074 "movq %%mm1, " #dst2 " \n\t"\
03075
03076 #endif //HAVE_MMX2
03077 #define SCALED_CPY(src1, src2, dst1, dst2)\
03078 REAL_SCALED_CPY(src1, src2, dst1, dst2)
03079 // rows are handled in pairs: (0,1), (2,3), (4,5), then REG_a/REG_d move from row 1 to row 5 for (6,7)
03080 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5))
03081 SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2))
03082 SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4))
03083 "lea (%%"REG_a",%4,4), %%"REG_a" \n\t"
03084 "lea (%%"REG_d",%5,4), %%"REG_d" \n\t"
03085 SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2))
03086
03087 // operand 0 is tied to REG_a: it carries the pointer in and is overwritten as scratch
03088 : "=&a" (packedOffsetAndScale)
03089 : "0" (packedOffsetAndScale),
03090 "r"(src),
03091 "r"(dst),
03092 "r" ((x86_reg)srcStride),
03093 "r" ((x86_reg)dstStride)
03094 : "%"REG_d
03095 );
03096 #else //HAVE_MMX
03097 for(i=0; i<8; i++) // C fallback: plain copy, levelFix has no effect here
03098 memcpy( &(dst[dstStride*i]),
03099 &(src[srcStride*i]), BLOCK_SIZE);
03100 #endif //HAVE_MMX
03101 }else{
03102 #if HAVE_MMX
03103 __asm__ volatile(
03104 "lea (%0,%2), %%"REG_a" \n\t" // REG_a = source row 1
03105 "lea (%1,%3), %%"REG_d" \n\t" // REG_d = destination row 1
03106
03107 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \
03108 "movq " #src1 ", %%mm0 \n\t"\
03109 "movq " #src2 ", %%mm1 \n\t"\
03110 "movq %%mm0, " #dst1 " \n\t"\
03111 "movq %%mm1, " #dst2 " \n\t"\
03112
03113 #define SIMPLE_CPY(src1, src2, dst1, dst2)\
03114 REAL_SIMPLE_CPY(src1, src2, dst1, dst2)
03115 // same row pairing as the scaled copy: (0,1), (2,3), (4,5), re-base to row 5, (6,7)
03116 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3))
03117 SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2))
03118 SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4))
03119 "lea (%%"REG_a",%2,4), %%"REG_a" \n\t"
03120 "lea (%%"REG_d",%3,4), %%"REG_d" \n\t"
03121 SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2))
03122
03123 : : "r" (src),
03124 "r" (dst),
03125 "r" ((x86_reg)srcStride),
03126 "r" ((x86_reg)dstStride)
03127 : "%"REG_a, "%"REG_d
03128 );
03129 #else //HAVE_MMX
03130 for(i=0; i<8; i++)
03131 memcpy( &(dst[dstStride*i]),
03132 &(src[srcStride*i]), BLOCK_SIZE);
03133 #endif //HAVE_MMX
03134 }
03135 }
03136
/**
 * Replicates the 8 bytes at src into the three rows directly before it,
 * i.e. to src - stride, src - 2*stride and src - 3*stride.
 *
 * @param src    row to replicate; the three preceding rows are overwritten
 * @param stride distance in bytes between rows
 *
 * NOTE(review): the MMX path stores through %0 without a "memory" clobber - confirm this is safe.
 */
03140 static inline void RENAME(duplicate)(uint8_t src[], int stride)
03141 {
03142 #if HAVE_MMX
03143 __asm__ volatile(
03144 "movq (%0), %%mm0 \n\t" // mm0 = the 8 source bytes
03145 "add %1, %0 \n\t" // operand 1 is -stride: move one row back
03146 "movq %%mm0, (%0) \n\t" // src - stride
03147 "movq %%mm0, (%0, %1) \n\t" // src - 2*stride
03148 "movq %%mm0, (%0, %1, 2) \n\t" // src - 3*stride
03149 : "+r" (src)
03150 : "r" ((x86_reg)-stride)
03151 );
03152 #else
03153 int i;
03154 uint8_t *p=src;
03155 for(i=0; i<3; i++){
03156 p-= stride; // walk back one row per iteration
03157 memcpy(p, src, 8);
03158 }
03159 #endif
03160 }
03161
03165 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height,
03166 const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2)
03167 {
03168 DECLARE_ALIGNED(8, PPContext, c)= *c2;
03169 int x,y;
03170 #ifdef COMPILE_TIME_MODE
03171 const int mode= COMPILE_TIME_MODE;
03172 #else
03173 const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode;
03174 #endif
03175 int black=0, white=255;
03176 int QPCorrecture= 256*256;
03177
03178 int copyAhead;
03179 #if HAVE_MMX
03180 int i;
03181 #endif
03182
03183 const int qpHShift= isColor ? 4-c.hChromaSubSample : 4;
03184 const int qpVShift= isColor ? 4-c.vChromaSubSample : 4;
03185
03186
03187 uint64_t * const yHistogram= c.yHistogram;
03188 uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride;
03189 uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride;
03190
03191
03192 #if HAVE_MMX
03193 for(i=0; i<57; i++){
03194 int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1;
03195 int threshold= offset*2 + 1;
03196 c.mmxDcOffset[i]= 0x7F - offset;
03197 c.mmxDcThreshold[i]= 0x7F - threshold;
03198 c.mmxDcOffset[i]*= 0x0101010101010101LL;
03199 c.mmxDcThreshold[i]*= 0x0101010101010101LL;
03200 }
03201 #endif
03202
03203 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16;
03204 else if( (mode & LINEAR_BLEND_DEINT_FILTER)
03205 || (mode & FFMPEG_DEINT_FILTER)
03206 || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14;
03207 else if( (mode & V_DEBLOCK)
03208 || (mode & LINEAR_IPOL_DEINT_FILTER)
03209 || (mode & MEDIAN_DEINT_FILTER)
03210 || (mode & V_A_DEBLOCK)) copyAhead=13;
03211 else if(mode & V_X1_FILTER) copyAhead=11;
03212
03213 else if(mode & DERING) copyAhead=9;
03214 else copyAhead=8;
03215
03216 copyAhead-= 8;
03217
03218 if(!isColor){
03219 uint64_t sum= 0;
03220 int i;
03221 uint64_t maxClipped;
03222 uint64_t clipped;
03223 double scale;
03224
03225 c.frameNum++;
03226
03227 if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256;
03228
03229 for(i=0; i<256; i++){
03230 sum+= yHistogram[i];
03231 }
03232
03233
03234 maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold);
03235
03236 clipped= sum;
03237 for(black=255; black>0; black--){
03238 if(clipped < maxClipped) break;
03239 clipped-= yHistogram[black];
03240 }
03241
03242 clipped= sum;
03243 for(white=0; white<256; white++){
03244 if(clipped < maxClipped) break;
03245 clipped-= yHistogram[white];
03246 }
03247
03248 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black);
03249
03250 #if HAVE_MMX2
03251 c.packedYScale= (uint16_t)(scale*256.0 + 0.5);
03252 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF;
03253 #else
03254 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5);
03255 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF;
03256 #endif
03257
03258 c.packedYOffset|= c.packedYOffset<<32;
03259 c.packedYOffset|= c.packedYOffset<<16;
03260
03261 c.packedYScale|= c.packedYScale<<32;
03262 c.packedYScale|= c.packedYScale<<16;
03263
03264 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5);
03265 else QPCorrecture= 256*256;
03266 }else{
03267 c.packedYScale= 0x0100010001000100LL;
03268 c.packedYOffset= 0;
03269 QPCorrecture= 256*256;
03270 }
03271
03272
03273 y=-BLOCK_SIZE;
03274 {
03275 const uint8_t *srcBlock= &(src[y*srcStride]);
03276 uint8_t *dstBlock= tempDst + dstStride;
03277
03278
03279
03280
03281 for(x=0; x<width; x+=BLOCK_SIZE){
03282
03283 #if HAVE_MMX2
03284
03285
03286
03287
03288
03289
03290
03291 __asm__(
03292 "mov %4, %%"REG_a" \n\t"
03293 "shr $2, %%"REG_a" \n\t"
03294 "and $6, %%"REG_a" \n\t"
03295 "add %5, %%"REG_a" \n\t"
03296 "mov %%"REG_a", %%"REG_d" \n\t"
03297 "imul %1, %%"REG_a" \n\t"
03298 "imul %3, %%"REG_d" \n\t"
03299 "prefetchnta 32(%%"REG_a", %0) \n\t"
03300 "prefetcht0 32(%%"REG_d", %2) \n\t"
03301 "add %1, %%"REG_a" \n\t"
03302 "add %3, %%"REG_d" \n\t"
03303 "prefetchnta 32(%%"REG_a", %0) \n\t"
03304 "prefetcht0 32(%%"REG_d", %2) \n\t"
03305 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
03306 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
03307 : "%"REG_a, "%"REG_d
03308 );
03309
03310 #elif HAVE_AMD3DNOW
03311
03312
03313
03314
03315
03316
03317 #endif
03318
03319 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
03320 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
03321
03322 RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
03323
03324 if(mode & LINEAR_IPOL_DEINT_FILTER)
03325 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
03326 else if(mode & LINEAR_BLEND_DEINT_FILTER)
03327 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
03328 else if(mode & MEDIAN_DEINT_FILTER)
03329 RENAME(deInterlaceMedian)(dstBlock, dstStride);
03330 else if(mode & CUBIC_IPOL_DEINT_FILTER)
03331 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
03332 else if(mode & FFMPEG_DEINT_FILTER)
03333 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
03334 else if(mode & LOWPASS5_DEINT_FILTER)
03335 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
03336
03337
03338
03339 dstBlock+=8;
03340 srcBlock+=8;
03341 }
03342 if(width==FFABS(dstStride))
03343 linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride);
03344 else{
03345 int i;
03346 for(i=0; i<copyAhead; i++){
03347 memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width);
03348 }
03349 }
03350 }
03351
03352 for(y=0; y<height; y+=BLOCK_SIZE){
03353
03354 const uint8_t *srcBlock= &(src[y*srcStride]);
03355 uint8_t *dstBlock= &(dst[y*dstStride]);
03356 #if HAVE_MMX
03357 uint8_t *tempBlock1= c.tempBlocks;
03358 uint8_t *tempBlock2= c.tempBlocks + 8;
03359 #endif
03360 const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride];
03361 int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)];
03362 int QP=0;
03363
03364
03365 if(y+15 >= height){
03366 int i;
03367
03368
03369 linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead,
03370 FFMAX(height-y-copyAhead, 0), srcStride);
03371
03372
03373 for(i=FFMAX(height-y, 8); i<copyAhead+8; i++)
03374 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride));
03375
03376
03377 linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride);
03378
03379
03380 for(i=height-y+1; i<=copyAhead; i++)
03381 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride));
03382
03383 dstBlock= tempDst + dstStride;
03384 srcBlock= tempSrc;
03385 }
03386
03387
03388
03389
03390 for(x=0; x<width; x+=BLOCK_SIZE){
03391 const int stride= dstStride;
03392 #if HAVE_MMX
03393 uint8_t *tmpXchg;
03394 #endif
03395 if(isColor){
03396 QP= QPptr[x>>qpHShift];
03397 c.nonBQP= nonBQPptr[x>>qpHShift];
03398 }else{
03399 QP= QPptr[x>>4];
03400 QP= (QP* QPCorrecture + 256*128)>>16;
03401 c.nonBQP= nonBQPptr[x>>4];
03402 c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16;
03403 yHistogram[ srcBlock[srcStride*12 + 4] ]++;
03404 }
03405 c.QP= QP;
03406 #if HAVE_MMX
03407 __asm__ volatile(
03408 "movd %1, %%mm7 \n\t"
03409 "packuswb %%mm7, %%mm7 \n\t"
03410 "packuswb %%mm7, %%mm7 \n\t"
03411 "packuswb %%mm7, %%mm7 \n\t"
03412 "movq %%mm7, %0 \n\t"
03413 : "=m" (c.pQPb)
03414 : "r" (QP)
03415 );
03416 #endif
03417
03418
03419 #if HAVE_MMX2
03420
03421
03422
03423
03424
03425
03426
03427 __asm__(
03428 "mov %4, %%"REG_a" \n\t"
03429 "shr $2, %%"REG_a" \n\t"
03430 "and $6, %%"REG_a" \n\t"
03431 "add %5, %%"REG_a" \n\t"
03432 "mov %%"REG_a", %%"REG_d" \n\t"
03433 "imul %1, %%"REG_a" \n\t"
03434 "imul %3, %%"REG_d" \n\t"
03435 "prefetchnta 32(%%"REG_a", %0) \n\t"
03436 "prefetcht0 32(%%"REG_d", %2) \n\t"
03437 "add %1, %%"REG_a" \n\t"
03438 "add %3, %%"REG_d" \n\t"
03439 "prefetchnta 32(%%"REG_a", %0) \n\t"
03440 "prefetcht0 32(%%"REG_d", %2) \n\t"
03441 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride),
03442 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead)
03443 : "%"REG_a, "%"REG_d
03444 );
03445
03446 #elif HAVE_AMD3DNOW
03447
03448
03449
03450
03451
03452
03453 #endif
03454
03455 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
03456 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
03457
03458 if(mode & LINEAR_IPOL_DEINT_FILTER)
03459 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
03460 else if(mode & LINEAR_BLEND_DEINT_FILTER)
03461 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
03462 else if(mode & MEDIAN_DEINT_FILTER)
03463 RENAME(deInterlaceMedian)(dstBlock, dstStride);
03464 else if(mode & CUBIC_IPOL_DEINT_FILTER)
03465 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
03466 else if(mode & FFMPEG_DEINT_FILTER)
03467 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
03468 else if(mode & LOWPASS5_DEINT_FILTER)
03469 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
03470
03471
03472
03473
03474
03475 if(y + 8 < height){
03476 if(mode & V_X1_FILTER)
03477 RENAME(vertX1Filter)(dstBlock, stride, &c);
03478 else if(mode & V_DEBLOCK){
03479 const int t= RENAME(vertClassify)(dstBlock, stride, &c);
03480
03481 if(t==1)
03482 RENAME(doVertLowPass)(dstBlock, stride, &c);
03483 else if(t==2)
03484 RENAME(doVertDefFilter)(dstBlock, stride, &c);
03485 }else if(mode & V_A_DEBLOCK){
03486 RENAME(do_a_deblock)(dstBlock, stride, 1, &c);
03487 }
03488 }
03489
03490 #if HAVE_MMX
03491 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride);
03492 #endif
03493
03494 if(x - 8 >= 0){
03495 #if HAVE_MMX
03496 if(mode & H_X1_FILTER)
03497 RENAME(vertX1Filter)(tempBlock1, 16, &c);
03498 else if(mode & H_DEBLOCK){
03499
03500 const int t= RENAME(vertClassify)(tempBlock1, 16, &c);
03501
03502 if(t==1)
03503 RENAME(doVertLowPass)(tempBlock1, 16, &c);
03504 else if(t==2)
03505 RENAME(doVertDefFilter)(tempBlock1, 16, &c);
03506 }else if(mode & H_A_DEBLOCK){
03507 RENAME(do_a_deblock)(tempBlock1, 16, 1, &c);
03508 }
03509
03510 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16);
03511
03512 #else
03513 if(mode & H_X1_FILTER)
03514 horizX1Filter(dstBlock-4, stride, QP);
03515 else if(mode & H_DEBLOCK){
03516 #if HAVE_ALTIVEC
03517 DECLARE_ALIGNED(16, unsigned char, tempBlock)[272];
03518 transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride);
03519
03520 const int t=vertClassify_altivec(tempBlock-48, 16, &c);
03521 if(t==1) {
03522 doVertLowPass_altivec(tempBlock-48, 16, &c);
03523 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
03524 }
03525 else if(t==2) {
03526 doVertDefFilter_altivec(tempBlock-48, 16, &c);
03527 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride);
03528 }
03529 #else
03530 const int t= RENAME(horizClassify)(dstBlock-4, stride, &c);
03531
03532 if(t==1)
03533 RENAME(doHorizLowPass)(dstBlock-4, stride, &c);
03534 else if(t==2)
03535 RENAME(doHorizDefFilter)(dstBlock-4, stride, &c);
03536 #endif
03537 }else if(mode & H_A_DEBLOCK){
03538 RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c);
03539 }
03540 #endif //HAVE_MMX
03541 if(mode & DERING){
03542
03543 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c);
03544 }
03545
03546 if(mode & TEMP_NOISE_FILTER)
03547 {
03548 RENAME(tempNoiseReducer)(dstBlock-8, stride,
03549 c.tempBlurred[isColor] + y*dstStride + x,
03550 c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3),
03551 c.ppMode.maxTmpNoise);
03552 }
03553 }
03554
03555 dstBlock+=8;
03556 srcBlock+=8;
03557
03558 #if HAVE_MMX
03559 tmpXchg= tempBlock1;
03560 tempBlock1= tempBlock2;
03561 tempBlock2 = tmpXchg;
03562 #endif
03563 }
03564
03565 if(mode & DERING){
03566 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c);
03567 }
03568
03569 if((mode & TEMP_NOISE_FILTER)){
03570 RENAME(tempNoiseReducer)(dstBlock-8, dstStride,
03571 c.tempBlurred[isColor] + y*dstStride + x,
03572 c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3),
03573 c.ppMode.maxTmpNoise);
03574 }
03575
03576
03577 if(y+15 >= height){
03578 uint8_t *dstBlock= &(dst[y*dstStride]);
03579 if(width==FFABS(dstStride))
03580 linecpy(dstBlock, tempDst + dstStride, height-y, dstStride);
03581 else{
03582 int i;
03583 for(i=0; i<height-y; i++){
03584 memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width);
03585 }
03586 }
03587 }
03588
03589
03590
03591
03592
03593
03594
03595
03596
03597 }
03598 #if HAVE_AMD3DNOW
03599 __asm__ volatile("femms");
03600 #elif HAVE_MMX
03601 __asm__ volatile("emms");
03602 #endif
03603
03604 #ifdef DEBUG_BRIGHTNESS
03605 if(!isColor){
03606 int max=1;
03607 int i;
03608 for(i=0; i<256; i++)
03609 if(yHistogram[i] > max) max=yHistogram[i];
03610
03611 for(i=1; i<256; i++){
03612 int x;
03613 int start=yHistogram[i-1]/(max/256+1);
03614 int end=yHistogram[i]/(max/256+1);
03615 int inc= end > start ? 1 : -1;
03616 for(x=start; x!=end+inc; x+=inc)
03617 dst[ i*dstStride + x]+=128;
03618 }
03619
03620 for(i=0; i<100; i+=2){
03621 dst[ (white)*dstStride + i]+=128;
03622 dst[ (black)*dstStride + i]+=128;
03623 }
03624 }
03625 #endif
03626
03627 *c2= c;
03628
03629 }