00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00028 #include <stdlib.h>
00029 #include <stdio.h>
00030 #include <string.h>
00031 #include <sys/time.h>
00032 #include <unistd.h>
00033 #include <math.h>
00034
00035 #include "libavutil/common.h"
00036
00037 #include "simple_idct.h"
00038 #include "aandcttab.h"
00039 #include "faandct.h"
00040 #include "faanidct.h"
00041 #include "x86/idct_xvid.h"
00042
00043 #undef printf
00044 #undef random
00045
/*
 * Plain memcpy() wrapper exported under the name fast_memcpy.
 *
 * Copies c bytes from b to a and returns a.  The regions must not overlap
 * (same contract as memcpy()).
 * NOTE(review): presumably provided so that objects referencing a
 * fast_memcpy symbol link into this test program -- confirm.
 */
void *fast_memcpy(void *a, const void *b, size_t c)
{
    return memcpy(a, b, c);
}
00047
00048
00049 void ff_ref_fdct(DCTELEM *block);
00050 void ff_ref_idct(DCTELEM *block);
00051 void ff_ref_dct_init(void);
00052
00053 void ff_mmx_idct(DCTELEM *data);
00054 void ff_mmxext_idct(DCTELEM *data);
00055
00056 void odivx_idct_c(short *block);
00057
00058
00059 void ff_bfin_idct(DCTELEM *block);
00060 void ff_bfin_fdct(DCTELEM *block);
00061
00062
00063 void fdct_altivec(DCTELEM *block);
00064
00065
00066
00067 void j_rev_dct_ARM(DCTELEM *data);
00068 void simple_idct_ARM(DCTELEM *data);
00069 void simple_idct_armv5te(DCTELEM *data);
00070 void ff_simple_idct_armv6(DCTELEM *data);
00071 void ff_simple_idct_neon(DCTELEM *data);
00072
00073 void ff_simple_idct_axp(DCTELEM *data);
00074
00075 struct algo {
00076 const char *name;
00077 enum { FDCT, IDCT } is_idct;
00078 void (* func) (DCTELEM *block);
00079 void (* ref) (DCTELEM *block);
00080 enum formattag { NO_PERM,MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM, SSE2_PERM, PARTTRANS_PERM } format;
00081 int mm_support;
00082 };
00083
/*
 * Output format of the FAAN forward DCT entry in algos[]: its result needs
 * the AAN rescaling pass (SCALE_PERM) unless FAAN_POSTSCALE is defined.
 */
#ifdef FAAN_POSTSCALE
#define FAAN_SCALE NO_PERM
#else
#define FAAN_SCALE SCALE_PERM
#endif

/* CPU capability bits; main() assigns the result of mm_support() here. */
static int cpu_flags;
00091
00092 struct algo algos[] = {
00093 {"REF-DBL", 0, ff_ref_fdct, ff_ref_fdct, NO_PERM},
00094 {"FAAN", 0, ff_faandct, ff_ref_fdct, FAAN_SCALE},
00095 {"FAANI", 1, ff_faanidct, ff_ref_idct, NO_PERM},
00096 {"IJG-AAN-INT", 0, fdct_ifast, ff_ref_fdct, SCALE_PERM},
00097 {"IJG-LLM-INT", 0, ff_jpeg_fdct_islow, ff_ref_fdct, NO_PERM},
00098 {"REF-DBL", 1, ff_ref_idct, ff_ref_idct, NO_PERM},
00099 {"INT", 1, j_rev_dct, ff_ref_idct, MMX_PERM},
00100 {"SIMPLE-C", 1, ff_simple_idct, ff_ref_idct, NO_PERM},
00101
00102 #if HAVE_MMX
00103 {"MMX", 0, ff_fdct_mmx, ff_ref_fdct, NO_PERM, FF_MM_MMX},
00104 #if HAVE_MMX2
00105 {"MMX2", 0, ff_fdct_mmx2, ff_ref_fdct, NO_PERM, FF_MM_MMXEXT},
00106 {"SSE2", 0, ff_fdct_sse2, ff_ref_fdct, NO_PERM, FF_MM_SSE2},
00107 #endif
00108
00109 #if CONFIG_GPL
00110 {"LIBMPEG2-MMX", 1, ff_mmx_idct, ff_ref_idct, MMX_PERM, FF_MM_MMX},
00111 {"LIBMPEG2-MMXEXT", 1, ff_mmxext_idct, ff_ref_idct, MMX_PERM, FF_MM_MMXEXT},
00112 #endif
00113 {"SIMPLE-MMX", 1, ff_simple_idct_mmx, ff_ref_idct, MMX_SIMPLE_PERM, FF_MM_MMX},
00114 {"XVID-MMX", 1, ff_idct_xvid_mmx, ff_ref_idct, NO_PERM, FF_MM_MMX},
00115 {"XVID-MMX2", 1, ff_idct_xvid_mmx2, ff_ref_idct, NO_PERM, FF_MM_MMXEXT},
00116 {"XVID-SSE2", 1, ff_idct_xvid_sse2, ff_ref_idct, SSE2_PERM, FF_MM_SSE2},
00117 #endif
00118
00119 #if HAVE_ALTIVEC
00120 {"altivecfdct", 0, fdct_altivec, ff_ref_fdct, NO_PERM, FF_MM_ALTIVEC},
00121 #endif
00122
00123 #if ARCH_BFIN
00124 {"BFINfdct", 0, ff_bfin_fdct, ff_ref_fdct, NO_PERM},
00125 {"BFINidct", 1, ff_bfin_idct, ff_ref_idct, NO_PERM},
00126 #endif
00127
00128 #if ARCH_ARM
00129 {"SIMPLE-ARM", 1, simple_idct_ARM, ff_ref_idct, NO_PERM },
00130 {"INT-ARM", 1, j_rev_dct_ARM, ff_ref_idct, MMX_PERM },
00131 #if HAVE_ARMV5TE
00132 {"SIMPLE-ARMV5TE", 1, simple_idct_armv5te, ff_ref_idct, NO_PERM },
00133 #endif
00134 #if HAVE_ARMV6
00135 {"SIMPLE-ARMV6", 1, ff_simple_idct_armv6, ff_ref_idct, MMX_PERM },
00136 #endif
00137 #if HAVE_NEON
00138 {"SIMPLE-NEON", 1, ff_simple_idct_neon, ff_ref_idct, PARTTRANS_PERM },
00139 #endif
00140 #endif
00141
00142 #if ARCH_ALPHA
00143 {"SIMPLE-ALPHA", 1, ff_simple_idct_axp, ff_ref_idct, NO_PERM },
00144 #endif
00145
00146 { 0 }
00147 };
00148
00149 #define AANSCALE_BITS 12
00150
00151 uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
00152
/* Returns the current gettimeofday() time expressed in microseconds. */
int64_t gettime(void)
{
    struct timeval now;
    int64_t usec;

    gettimeofday(&now, NULL);

    /* Widen before multiplying so the seconds part cannot overflow. */
    usec  = (int64_t)now.tv_sec * 1000000;
    usec += now.tv_usec;
    return usec;
}
00159
/* Number of blocks run through the accuracy comparison. */
#define NB_ITS 20000
/* Number of transform calls per timing batch in the speed tests. */
#define NB_ITS_SPEED 50000

/* Destination index for each coefficient under MMX_PERM; see idct_mmx_init(). */
static short idct_mmx_perm[64];

/* Destination index for each coefficient under MMX_SIMPLE_PERM. */
static short idct_simple_mmx_perm[64] = {
    /* source  0.. 7 */ 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    /* source  8..15 */ 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    /* source 16..23 */ 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    /* source 24..31 */ 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    /* source 32..39 */ 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    /* source 40..47 */ 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    /* source 48..55 */ 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    /* source 56..63 */ 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Column reordering applied within each row under SSE2_PERM. */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
00177
00178 void idct_mmx_init(void)
00179 {
00180 int i;
00181
00182
00183 for (i = 0; i < 64; i++) {
00184 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
00185
00186 }
00187 }
00188
00189 static DCTELEM block[64] __attribute__ ((aligned (16)));
00190 static DCTELEM block1[64] __attribute__ ((aligned (8)));
00191 static DCTELEM block_org[64] __attribute__ ((aligned (8)));
00192
/*
 * Executes the x86 "emms" instruction when the build has MMX support and the
 * running CPU reports FF_MM_MMX; does nothing otherwise.  Called after the
 * tested transform has run.
 */
static inline void mmx_emms(void)
{
#if HAVE_MMX
    if ((cpu_flags & FF_MM_MMX) != 0) {
        __asm__ volatile ("emms\n\t");
    }
#endif
}
00200
00201 void dct_error(const char *name, int is_idct,
00202 void (*fdct_func)(DCTELEM *block),
00203 void (*fdct_ref)(DCTELEM *block), int form, int test)
00204 {
00205 int it, i, scale;
00206 int err_inf, v;
00207 int64_t err2, ti, ti1, it1;
00208 int64_t sysErr[64], sysErrMax=0;
00209 int maxout=0;
00210 int blockSumErrMax=0, blockSumErr;
00211
00212 srandom(0);
00213
00214 err_inf = 0;
00215 err2 = 0;
00216 for(i=0; i<64; i++) sysErr[i]=0;
00217 for(it=0;it<NB_ITS;it++) {
00218 for(i=0;i<64;i++)
00219 block1[i] = 0;
00220 switch(test){
00221 case 0:
00222 for(i=0;i<64;i++)
00223 block1[i] = (random() % 512) -256;
00224 if (is_idct){
00225 ff_ref_fdct(block1);
00226
00227 for(i=0;i<64;i++)
00228 block1[i]>>=3;
00229 }
00230 break;
00231 case 1:{
00232 int num= (random()%10)+1;
00233 for(i=0;i<num;i++)
00234 block1[random()%64] = (random() % 512) -256;
00235 }break;
00236 case 2:
00237 block1[0]= (random()%4096)-2048;
00238 block1[63]= (block1[0]&1)^1;
00239 break;
00240 }
00241
00242 #if 0 // simulate mismatch control
00243 { int sum=0;
00244 for(i=0;i<64;i++)
00245 sum+=block1[i];
00246
00247 if((sum&1)==0) block1[63]^=1;
00248 }
00249 #endif
00250
00251 for(i=0; i<64; i++)
00252 block_org[i]= block1[i];
00253
00254 if (form == MMX_PERM) {
00255 for(i=0;i<64;i++)
00256 block[idct_mmx_perm[i]] = block1[i];
00257 } else if (form == MMX_SIMPLE_PERM) {
00258 for(i=0;i<64;i++)
00259 block[idct_simple_mmx_perm[i]] = block1[i];
00260
00261 } else if (form == SSE2_PERM) {
00262 for(i=0; i<64; i++)
00263 block[(i&0x38) | idct_sse2_row_perm[i&7]] = block1[i];
00264 } else if (form == PARTTRANS_PERM) {
00265 for(i=0; i<64; i++)
00266 block[(i&0x24) | ((i&3)<<3) | ((i>>3)&3)] = block1[i];
00267 } else {
00268 for(i=0; i<64; i++)
00269 block[i]= block1[i];
00270 }
00271 #if 0 // simulate mismatch control for tested IDCT but not the ref
00272 { int sum=0;
00273 for(i=0;i<64;i++)
00274 sum+=block[i];
00275
00276 if((sum&1)==0) block[63]^=1;
00277 }
00278 #endif
00279
00280 fdct_func(block);
00281 mmx_emms();
00282
00283 if (form == SCALE_PERM) {
00284 for(i=0; i<64; i++) {
00285 scale = 8*(1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
00286 block[i] = (block[i] * scale ) >> AANSCALE_BITS;
00287 }
00288 }
00289
00290 fdct_ref(block1);
00291
00292 blockSumErr=0;
00293 for(i=0;i<64;i++) {
00294 v = abs(block[i] - block1[i]);
00295 if (v > err_inf)
00296 err_inf = v;
00297 err2 += v * v;
00298 sysErr[i] += block[i] - block1[i];
00299 blockSumErr += v;
00300 if( abs(block[i])>maxout) maxout=abs(block[i]);
00301 }
00302 if(blockSumErrMax < blockSumErr) blockSumErrMax= blockSumErr;
00303 #if 0 // print different matrix pairs
00304 if(blockSumErr){
00305 printf("\n");
00306 for(i=0; i<64; i++){
00307 if((i&7)==0) printf("\n");
00308 printf("%4d ", block_org[i]);
00309 }
00310 for(i=0; i<64; i++){
00311 if((i&7)==0) printf("\n");
00312 printf("%4d ", block[i] - block1[i]);
00313 }
00314 }
00315 #endif
00316 }
00317 for(i=0; i<64; i++) sysErrMax= FFMAX(sysErrMax, FFABS(sysErr[i]));
00318
00319 #if 1 // dump systematic errors
00320 for(i=0; i<64; i++){
00321 if(i%8==0) printf("\n");
00322 printf("%5d ", (int)sysErr[i]);
00323 }
00324 printf("\n");
00325 #endif
00326
00327 printf("%s %s: err_inf=%d err2=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
00328 is_idct ? "IDCT" : "DCT",
00329 name, err_inf, (double)err2 / NB_ITS / 64.0, (double)sysErrMax / NB_ITS, maxout, blockSumErrMax);
00330 #if 1 //Speed test
00331
00332 for(i=0;i<64;i++)
00333 block1[i] = 0;
00334 switch(test){
00335 case 0:
00336 for(i=0;i<64;i++)
00337 block1[i] = (random() % 512) -256;
00338 if (is_idct){
00339 ff_ref_fdct(block1);
00340
00341 for(i=0;i<64;i++)
00342 block1[i]>>=3;
00343 }
00344 break;
00345 case 1:{
00346 case 2:
00347 block1[0] = (random() % 512) -256;
00348 block1[1] = (random() % 512) -256;
00349 block1[2] = (random() % 512) -256;
00350 block1[3] = (random() % 512) -256;
00351 }break;
00352 }
00353
00354 if (form == MMX_PERM) {
00355 for(i=0;i<64;i++)
00356 block[idct_mmx_perm[i]] = block1[i];
00357 } else if(form == MMX_SIMPLE_PERM) {
00358 for(i=0;i<64;i++)
00359 block[idct_simple_mmx_perm[i]] = block1[i];
00360 } else {
00361 for(i=0; i<64; i++)
00362 block[i]= block1[i];
00363 }
00364
00365 ti = gettime();
00366 it1 = 0;
00367 do {
00368 for(it=0;it<NB_ITS_SPEED;it++) {
00369 for(i=0; i<64; i++)
00370 block[i]= block1[i];
00371
00372
00373 fdct_func(block);
00374 }
00375 it1 += NB_ITS_SPEED;
00376 ti1 = gettime() - ti;
00377 } while (ti1 < 1000000);
00378 mmx_emms();
00379
00380 printf("%s %s: %0.1f kdct/s\n",
00381 is_idct ? "IDCT" : "DCT",
00382 name, (double)it1 * 1000.0 / (double)ti1);
00383 #endif
00384 }
00385
/* 8x8 pixel output written by the idct248_put implementation under test. */
static uint8_t img_dest[64]  __attribute__ ((aligned (8)));
/* 8x8 pixel output written by the idct248_ref() reference. */
static uint8_t img_dest1[64] __attribute__ ((aligned (8)));
00388
00389 void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
00390 {
00391 static int init;
00392 static double c8[8][8];
00393 static double c4[4][4];
00394 double block1[64], block2[64], block3[64];
00395 double s, sum, v;
00396 int i, j, k;
00397
00398 if (!init) {
00399 init = 1;
00400
00401 for(i=0;i<8;i++) {
00402 sum = 0;
00403 for(j=0;j<8;j++) {
00404 s = (i==0) ? sqrt(1.0/8.0) : sqrt(1.0/4.0);
00405 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
00406 sum += c8[i][j] * c8[i][j];
00407 }
00408 }
00409
00410 for(i=0;i<4;i++) {
00411 sum = 0;
00412 for(j=0;j<4;j++) {
00413 s = (i==0) ? sqrt(1.0/4.0) : sqrt(1.0/2.0);
00414 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
00415 sum += c4[i][j] * c4[i][j];
00416 }
00417 }
00418 }
00419
00420
00421 s = 0.5 * sqrt(2.0);
00422 for(i=0;i<4;i++) {
00423 for(j=0;j<8;j++) {
00424 block1[8*(2*i)+j] = (block[8*(2*i)+j] + block[8*(2*i+1)+j]) * s;
00425 block1[8*(2*i+1)+j] = (block[8*(2*i)+j] - block[8*(2*i+1)+j]) * s;
00426 }
00427 }
00428
00429
00430 for(i=0;i<8;i++) {
00431 for(j=0;j<8;j++) {
00432 sum = 0;
00433 for(k=0;k<8;k++)
00434 sum += c8[k][j] * block1[8*i+k];
00435 block2[8*i+j] = sum;
00436 }
00437 }
00438
00439
00440 for(i=0;i<8;i++) {
00441 for(j=0;j<4;j++) {
00442
00443 sum = 0;
00444 for(k=0;k<4;k++)
00445 sum += c4[k][j] * block2[8*(2*k)+i];
00446 block3[8*(2*j)+i] = sum;
00447
00448
00449 sum = 0;
00450 for(k=0;k<4;k++)
00451 sum += c4[k][j] * block2[8*(2*k+1)+i];
00452 block3[8*(2*j+1)+i] = sum;
00453 }
00454 }
00455
00456
00457 for(i=0;i<8;i++) {
00458 for(j=0;j<8;j++) {
00459 v = block3[8*i+j];
00460 if (v < 0)
00461 v = 0;
00462 else if (v > 255)
00463 v = 255;
00464 dest[i * linesize + j] = (int)rint(v);
00465 }
00466 }
00467 }
00468
00469 void idct248_error(const char *name,
00470 void (*idct248_put)(uint8_t *dest, int line_size, int16_t *block))
00471 {
00472 int it, i, it1, ti, ti1, err_max, v;
00473
00474 srandom(0);
00475
00476
00477
00478 err_max = 0;
00479 for(it=0;it<NB_ITS;it++) {
00480
00481
00482 for(i=0;i<64;i++)
00483 block1[i] = (random() % 256) - 128;
00484 block1[0] += 1024;
00485
00486 for(i=0; i<64; i++)
00487 block[i]= block1[i];
00488 idct248_ref(img_dest1, 8, block);
00489
00490 for(i=0; i<64; i++)
00491 block[i]= block1[i];
00492 idct248_put(img_dest, 8, block);
00493
00494 for(i=0;i<64;i++) {
00495 v = abs((int)img_dest[i] - (int)img_dest1[i]);
00496 if (v == 255)
00497 printf("%d %d\n", img_dest[i], img_dest1[i]);
00498 if (v > err_max)
00499 err_max = v;
00500 }
00501 #if 0
00502 printf("ref=\n");
00503 for(i=0;i<8;i++) {
00504 int j;
00505 for(j=0;j<8;j++) {
00506 printf(" %3d", img_dest1[i*8+j]);
00507 }
00508 printf("\n");
00509 }
00510
00511 printf("out=\n");
00512 for(i=0;i<8;i++) {
00513 int j;
00514 for(j=0;j<8;j++) {
00515 printf(" %3d", img_dest[i*8+j]);
00516 }
00517 printf("\n");
00518 }
00519 #endif
00520 }
00521 printf("%s %s: err_inf=%d\n",
00522 1 ? "IDCT248" : "DCT248",
00523 name, err_max);
00524
00525 ti = gettime();
00526 it1 = 0;
00527 do {
00528 for(it=0;it<NB_ITS_SPEED;it++) {
00529 for(i=0; i<64; i++)
00530 block[i]= block1[i];
00531
00532
00533 idct248_put(img_dest, 8, block);
00534 }
00535 it1 += NB_ITS_SPEED;
00536 ti1 = gettime() - ti;
00537 } while (ti1 < 1000000);
00538 mmx_emms();
00539
00540 printf("%s %s: %0.1f kdct/s\n",
00541 1 ? "IDCT248" : "DCT248",
00542 name, (double)it1 * 1000.0 / (double)ti1);
00543 }
00544
/*
 * Prints the command-line usage summary to stdout.  The synopsis lists every
 * option that main() passes to getopt(): -i, -4 and -h.
 */
void help(void)
{
    printf("dct-test [-i] [-4] [-h] [<test-number>]\n"
           "test-number 0 -> test with random matrixes\n"
           " 1 -> test with random sparse matrixes\n"
           " 2 -> do 3. test from mpeg4 std\n"
           "-i test IDCT implementations\n"
           "-4 test IDCT248 implementations\n"
           "-h show this help\n");
}
00554
00555 int main(int argc, char **argv)
00556 {
00557 int test_idct = 0, test_248_dct = 0;
00558 int c,i;
00559 int test=1;
00560 cpu_flags = mm_support();
00561
00562 ff_ref_dct_init();
00563 idct_mmx_init();
00564
00565 for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
00566 for(i=0;i<MAX_NEG_CROP;i++) {
00567 cropTbl[i] = 0;
00568 cropTbl[i + MAX_NEG_CROP + 256] = 255;
00569 }
00570
00571 for(;;) {
00572 c = getopt(argc, argv, "ih4");
00573 if (c == -1)
00574 break;
00575 switch(c) {
00576 case 'i':
00577 test_idct = 1;
00578 break;
00579 case '4':
00580 test_248_dct = 1;
00581 break;
00582 default :
00583 case 'h':
00584 help();
00585 return 0;
00586 }
00587 }
00588
00589 if(optind <argc) test= atoi(argv[optind]);
00590
00591 printf("ffmpeg DCT/IDCT test\n");
00592
00593 if (test_248_dct) {
00594 idct248_error("SIMPLE-C", ff_simple_idct248_put);
00595 } else {
00596 for (i=0;algos[i].name;i++)
00597 if (algos[i].is_idct == test_idct && !(~cpu_flags & algos[i].mm_support)) {
00598 dct_error (algos[i].name, algos[i].is_idct, algos[i].func, algos[i].ref, algos[i].format, test);
00599 }
00600 }
00601 return 0;
00602 }