00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00028 #include <stdlib.h>
00029 #include <stdio.h>
00030 #include <string.h>
00031 #include <sys/time.h>
00032 #include <unistd.h>
00033 #include <math.h>
00034
00035 #include "libavutil/cpu.h"
00036 #include "libavutil/common.h"
00037 #include "libavutil/lfg.h"
00038
00039 #include "simple_idct.h"
00040 #include "aandcttab.h"
00041 #include "faandct.h"
00042 #include "faanidct.h"
00043 #include "x86/idct_xvid.h"
00044 #include "dctref.h"
00045
00046 #undef printf
00047
00048 void ff_mmx_idct(DCTELEM *data);
00049 void ff_mmxext_idct(DCTELEM *data);
00050
00051
00052 void ff_bfin_idct(DCTELEM *block);
00053 void ff_bfin_fdct(DCTELEM *block);
00054
00055
00056 void ff_fdct_altivec(DCTELEM *block);
00057
00058
00059
00060 void ff_j_rev_dct_arm(DCTELEM *data);
00061 void ff_simple_idct_arm(DCTELEM *data);
00062 void ff_simple_idct_armv5te(DCTELEM *data);
00063 void ff_simple_idct_armv6(DCTELEM *data);
00064 void ff_simple_idct_neon(DCTELEM *data);
00065
00066 void ff_simple_idct_axp(DCTELEM *data);
00067
00068 struct algo {
00069 const char *name;
00070 void (*func)(DCTELEM *block);
00071 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
00072 SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
00073 int mm_support;
00074 int nonspec;
00075 };
00076
00077 static int cpu_flags;
00078
00079 static const struct algo fdct_tab[] = {
00080 { "REF-DBL", ff_ref_fdct, NO_PERM },
00081 { "FAAN", ff_faandct, NO_PERM },
00082 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
00083 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
00084
00085 #if HAVE_MMX
00086 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
00087 { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 },
00088 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
00089 #endif
00090
00091 #if HAVE_ALTIVEC
00092 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
00093 #endif
00094
00095 #if ARCH_BFIN
00096 { "BFINfdct", ff_bfin_fdct, NO_PERM },
00097 #endif
00098
00099 { 0 }
00100 };
00101
/*
 * The wrapper is only referenced by the "PR-SSE2" table entry, which is
 * compiled under HAVE_MMX plus ARCH_X86_64 && HAVE_YASM.  The guard here
 * matches that condition so other configurations do not get an unused
 * static function that references the external routine.
 */
#if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
                                DCTELEM *block, int16_t *qmat);

/**
 * Adapt ff_prores_idct_put_10_sse2() to the in-place
 * void (*)(DCTELEM *) signature used by struct algo.
 *
 * The coefficients are copied to a scratch buffer, every entry of the
 * quantisation matrix is set to 4, and the result is written back over
 * @p dst with a line size of 16.
 *
 * @param dst 64 coefficients on entry; overwritten with the output.
 */
static void ff_prores_idct_put_10_sse2_wrap(DCTELEM *dst)
{
    /* NOTE(review): the SSE2 routine presumably wants 16-byte aligned
     * operands; plain stack arrays give no such guarantee, so the scratch
     * buffers are declared like the file's other aligned buffers. */
    DECLARE_ALIGNED(16, static int16_t, qmat)[64];
    DECLARE_ALIGNED(16, static int16_t, tmp)[64];
    int i;

    for (i = 0; i < 64; i++) {
        qmat[i] = 4;
        tmp[i]  = dst[i];
    }

    /* The output overwrites the coefficient buffer; the pointer types are
     * not compatible, so the conversion is made explicit. */
    ff_prores_idct_put_10_sse2((uint16_t *) dst, 16, tmp, qmat);
}
#endif
00117
00118 static const struct algo idct_tab[] = {
00119 { "FAANI", ff_faanidct, NO_PERM },
00120 { "REF-DBL", ff_ref_idct, NO_PERM },
00121 { "INT", ff_j_rev_dct, MMX_PERM },
00122 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
00123
00124 #if HAVE_MMX
00125 #if CONFIG_GPL
00126 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
00127 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
00128 #endif
00129 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
00130 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
00131 { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
00132 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
00133 #if ARCH_X86_64 && HAVE_YASM
00134 { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
00135 #endif
00136 #endif
00137
00138 #if ARCH_BFIN
00139 { "BFINidct", ff_bfin_idct, NO_PERM },
00140 #endif
00141
00142 #if ARCH_ARM
00143 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
00144 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
00145 #endif
00146 #if HAVE_ARMV5TE
00147 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
00148 #endif
00149 #if HAVE_ARMV6
00150 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
00151 #endif
00152 #if HAVE_NEON
00153 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
00154 #endif
00155
00156 #if ARCH_ALPHA
00157 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
00158 #endif
00159
00160 { 0 }
00161 };
00162
/* Fixed-point shift used by dct_error() when rescaling SCALE_PERM output. */
#define AANSCALE_BITS 12

/**
 * Read the wall clock.
 *
 * @return the gettimeofday() time expressed in microseconds.
 */
static int64_t gettime(void)
{
    struct timeval now;
    int64_t usec;

    gettimeofday(&now, NULL);

    usec  = now.tv_sec;     /* widen before scaling so the product is 64-bit */
    usec *= 1000000;
    usec += now.tv_usec;
    return usec;
}

/* Number of blocks processed by each accuracy measurement. */
#define NB_ITS 20000
/* Number of transform calls between two clock reads in the speed tests. */
#define NB_ITS_SPEED 50000
00174
/* Destination index per source coefficient for MMX_PERM; filled in by
 * idct_mmx_init(). */
static short idct_mmx_perm[64];

/* Destination index per source coefficient for MMX_SIMPLE_PERM. */
static short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Destination column per source column for SSE2_PERM (rows stay in place). */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };

/**
 * Build idct_mmx_perm[].
 *
 * The row bits of every index are kept; within the row, column c is sent
 * to (c >> 1) | ((c & 1) << 2).
 */
static void idct_mmx_init(void)
{
    int idx;

    for (idx = 0; idx < 64; idx++) {
        const int row = idx & 0x38;
        const int col = idx & 7;

        idct_mmx_perm[idx] = row | (col >> 1) | ((col & 1) << 2);
    }
}
00199
00200 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
00201 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
00202
/**
 * Execute the x86 "emms" instruction when the build has MMX support and
 * the running CPU reports AV_CPU_FLAG_MMX; otherwise do nothing.
 */
static inline void mmx_emms(void)
{
#if HAVE_MMX
    if (!(cpu_flags & AV_CPU_FLAG_MMX))
        return;

    __asm__ volatile ("emms\n\t");
#endif
}
00210
00211 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng, int vals)
00212 {
00213 int i, j;
00214
00215 memset(block, 0, 64 * sizeof(*block));
00216
00217 switch (test) {
00218 case 0:
00219 for (i = 0; i < 64; i++)
00220 block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
00221 if (is_idct) {
00222 ff_ref_fdct(block);
00223 for (i = 0; i < 64; i++)
00224 block[i] >>= 3;
00225 }
00226 break;
00227 case 1:
00228 j = av_lfg_get(prng) % 10 + 1;
00229 for (i = 0; i < j; i++)
00230 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % (2*vals) -vals;
00231 break;
00232 case 2:
00233 block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
00234 block[63] = (block[0] & 1) ^ 1;
00235 break;
00236 }
00237 }
00238
00239 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
00240 {
00241 int i;
00242
00243 if (perm == MMX_PERM) {
00244 for (i = 0; i < 64; i++)
00245 dst[idct_mmx_perm[i]] = src[i];
00246 } else if (perm == MMX_SIMPLE_PERM) {
00247 for (i = 0; i < 64; i++)
00248 dst[idct_simple_mmx_perm[i]] = src[i];
00249 } else if (perm == SSE2_PERM) {
00250 for (i = 0; i < 64; i++)
00251 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
00252 } else if (perm == PARTTRANS_PERM) {
00253 for (i = 0; i < 64; i++)
00254 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
00255 } else if (perm == TRANSPOSE_PERM) {
00256 for (i = 0; i < 64; i++)
00257 dst[(i>>3) | ((i<<3)&0x38)] = src[i];
00258 } else {
00259 for (i = 0; i < 64; i++)
00260 dst[i] = src[i];
00261 }
00262 }
00263
00264 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
00265 {
00266 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
00267 int it, i, scale;
00268 int err_inf, v;
00269 int64_t err2, ti, ti1, it1, err_sum = 0;
00270 int64_t sysErr[64], sysErrMax = 0;
00271 int maxout = 0;
00272 int blockSumErrMax = 0, blockSumErr;
00273 AVLFG prng;
00274 const int vals=1<<bits;
00275 double omse, ome;
00276 int spec_err;
00277
00278 av_lfg_init(&prng, 1);
00279
00280 err_inf = 0;
00281 err2 = 0;
00282 for (i = 0; i < 64; i++)
00283 sysErr[i] = 0;
00284 for (it = 0; it < NB_ITS; it++) {
00285 init_block(block1, test, is_idct, &prng, vals);
00286 permute(block, block1, dct->format);
00287
00288 dct->func(block);
00289 mmx_emms();
00290
00291 if (dct->format == SCALE_PERM) {
00292 for (i = 0; i < 64; i++) {
00293 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
00294 block[i] = (block[i] * scale) >> AANSCALE_BITS;
00295 }
00296 }
00297
00298 ref(block1);
00299
00300 blockSumErr = 0;
00301 for (i = 0; i < 64; i++) {
00302 int err = block[i] - block1[i];
00303 err_sum += err;
00304 v = abs(err);
00305 if (v > err_inf)
00306 err_inf = v;
00307 err2 += v * v;
00308 sysErr[i] += block[i] - block1[i];
00309 blockSumErr += v;
00310 if (abs(block[i]) > maxout)
00311 maxout = abs(block[i]);
00312 }
00313 if (blockSumErrMax < blockSumErr)
00314 blockSumErrMax = blockSumErr;
00315 }
00316 for (i = 0; i < 64; i++)
00317 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
00318
00319 for (i = 0; i < 64; i++) {
00320 if (i % 8 == 0)
00321 printf("\n");
00322 printf("%7d ", (int) sysErr[i]);
00323 }
00324 printf("\n");
00325
00326 omse = (double) err2 / NB_ITS / 64;
00327 ome = (double) err_sum / NB_ITS / 64;
00328
00329 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
00330
00331 printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
00332 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
00333 omse, ome, (double) sysErrMax / NB_ITS,
00334 maxout, blockSumErrMax);
00335
00336 if (spec_err && !dct->nonspec)
00337 return 1;
00338
00339 if (!speed)
00340 return 0;
00341
00342
00343
00344 init_block(block, test, is_idct, &prng, vals);
00345 permute(block1, block, dct->format);
00346
00347 ti = gettime();
00348 it1 = 0;
00349 do {
00350 for (it = 0; it < NB_ITS_SPEED; it++) {
00351 memcpy(block, block1, sizeof(block));
00352 dct->func(block);
00353 }
00354 it1 += NB_ITS_SPEED;
00355 ti1 = gettime() - ti;
00356 } while (ti1 < 1000000);
00357 mmx_emms();
00358
00359 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
00360 (double) it1 * 1000.0 / (double) ti1);
00361
00362 return 0;
00363 }
00364
00365 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
00366 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
00367
00368 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
00369 {
00370 static int init;
00371 static double c8[8][8];
00372 static double c4[4][4];
00373 double block1[64], block2[64], block3[64];
00374 double s, sum, v;
00375 int i, j, k;
00376
00377 if (!init) {
00378 init = 1;
00379
00380 for (i = 0; i < 8; i++) {
00381 sum = 0;
00382 for (j = 0; j < 8; j++) {
00383 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
00384 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
00385 sum += c8[i][j] * c8[i][j];
00386 }
00387 }
00388
00389 for (i = 0; i < 4; i++) {
00390 sum = 0;
00391 for (j = 0; j < 4; j++) {
00392 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
00393 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
00394 sum += c4[i][j] * c4[i][j];
00395 }
00396 }
00397 }
00398
00399
00400 s = 0.5 * sqrt(2.0);
00401 for (i = 0; i < 4; i++) {
00402 for (j = 0; j < 8; j++) {
00403 block1[8 * (2 * i) + j] =
00404 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
00405 block1[8 * (2 * i + 1) + j] =
00406 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
00407 }
00408 }
00409
00410
00411 for (i = 0; i < 8; i++) {
00412 for (j = 0; j < 8; j++) {
00413 sum = 0;
00414 for (k = 0; k < 8; k++)
00415 sum += c8[k][j] * block1[8 * i + k];
00416 block2[8 * i + j] = sum;
00417 }
00418 }
00419
00420
00421 for (i = 0; i < 8; i++) {
00422 for (j = 0; j < 4; j++) {
00423
00424 sum = 0;
00425 for (k = 0; k < 4; k++)
00426 sum += c4[k][j] * block2[8 * (2 * k) + i];
00427 block3[8 * (2 * j) + i] = sum;
00428
00429
00430 sum = 0;
00431 for (k = 0; k < 4; k++)
00432 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
00433 block3[8 * (2 * j + 1) + i] = sum;
00434 }
00435 }
00436
00437
00438 for (i = 0; i < 8; i++) {
00439 for (j = 0; j < 8; j++) {
00440 v = block3[8 * i + j];
00441 if (v < 0) v = 0;
00442 else if (v > 255) v = 255;
00443 dest[i * linesize + j] = (int) rint(v);
00444 }
00445 }
00446 }
00447
00448 static void idct248_error(const char *name,
00449 void (*idct248_put)(uint8_t *dest, int line_size,
00450 int16_t *block),
00451 int speed)
00452 {
00453 int it, i, it1, ti, ti1, err_max, v;
00454 AVLFG prng;
00455
00456 av_lfg_init(&prng, 1);
00457
00458
00459
00460 err_max = 0;
00461 for (it = 0; it < NB_ITS; it++) {
00462
00463 for (i = 0; i < 64; i++)
00464 block1[i] = av_lfg_get(&prng) % 256 - 128;
00465 block1[0] += 1024;
00466
00467 for (i = 0; i < 64; i++)
00468 block[i] = block1[i];
00469 idct248_ref(img_dest1, 8, block);
00470
00471 for (i = 0; i < 64; i++)
00472 block[i] = block1[i];
00473 idct248_put(img_dest, 8, block);
00474
00475 for (i = 0; i < 64; i++) {
00476 v = abs((int) img_dest[i] - (int) img_dest1[i]);
00477 if (v == 255)
00478 printf("%d %d\n", img_dest[i], img_dest1[i]);
00479 if (v > err_max)
00480 err_max = v;
00481 }
00482 #if 0
00483 printf("ref=\n");
00484 for(i=0;i<8;i++) {
00485 int j;
00486 for(j=0;j<8;j++) {
00487 printf(" %3d", img_dest1[i*8+j]);
00488 }
00489 printf("\n");
00490 }
00491
00492 printf("out=\n");
00493 for(i=0;i<8;i++) {
00494 int j;
00495 for(j=0;j<8;j++) {
00496 printf(" %3d", img_dest[i*8+j]);
00497 }
00498 printf("\n");
00499 }
00500 #endif
00501 }
00502 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
00503
00504 if (!speed)
00505 return;
00506
00507 ti = gettime();
00508 it1 = 0;
00509 do {
00510 for (it = 0; it < NB_ITS_SPEED; it++) {
00511 for (i = 0; i < 64; i++)
00512 block[i] = block1[i];
00513 idct248_put(img_dest, 8, block);
00514 }
00515 it1 += NB_ITS_SPEED;
00516 ti1 = gettime() - ti;
00517 } while (ti1 < 1000000);
00518 mmx_emms();
00519
00520 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
00521 (double) it1 * 1000.0 / (double) ti1);
00522 }
00523
/** Print the command line summary to standard output. */
static void help(void)
{
    static const char usage[] =
        "dct-test [-i] [<test-number>] [<bits>]\n"
        "test-number 0 -> test with random matrixes\n"
        " 1 -> test with random sparse matrixes\n"
        " 2 -> do 3. test from mpeg4 std\n"
        "bits Number of time domain bits to use, 8 is default\n"
        "-i test IDCT implementations\n"
        "-4 test IDCT248 implementations\n"
        "-t speed test\n";

    printf("%s", usage);
}
00535
00536 int main(int argc, char **argv)
00537 {
00538 int test_idct = 0, test_248_dct = 0;
00539 int c, i;
00540 int test = 1;
00541 int speed = 0;
00542 int err = 0;
00543 int bits=8;
00544
00545 cpu_flags = av_get_cpu_flags();
00546
00547 ff_ref_dct_init();
00548 idct_mmx_init();
00549
00550 for (;;) {
00551 c = getopt(argc, argv, "ih4t");
00552 if (c == -1)
00553 break;
00554 switch (c) {
00555 case 'i':
00556 test_idct = 1;
00557 break;
00558 case '4':
00559 test_248_dct = 1;
00560 break;
00561 case 't':
00562 speed = 1;
00563 break;
00564 default:
00565 case 'h':
00566 help();
00567 return 0;
00568 }
00569 }
00570
00571 if (optind < argc)
00572 test = atoi(argv[optind]);
00573 if(optind+1 < argc) bits= atoi(argv[optind+1]);
00574
00575 printf("ffmpeg DCT/IDCT test\n");
00576
00577 if (test_248_dct) {
00578 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
00579 } else {
00580 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
00581 for (i = 0; algos[i].name; i++)
00582 if (!(~cpu_flags & algos[i].mm_support)) {
00583 err |= dct_error(&algos[i], test, test_idct, speed, bits);
00584 }
00585 }
00586
00587 return err;
00588 }