00001 
00002 
00003 
00004 
00005 
00006 
00007 
00008 
00009 
00010 
00011 
00012 
00013 
00014 
00015 
00016 
00017 
00018 
00019 
00020 
00021 
00028 #include "config.h"
00029 #include <stdlib.h>
00030 #include <stdio.h>
00031 #include <string.h>
00032 #if HAVE_UNISTD_H
00033 #include <unistd.h>
00034 #endif
00035 #include <math.h>
00036 
00037 #include "libavutil/cpu.h"
00038 #include "libavutil/common.h"
00039 #include "libavutil/lfg.h"
00040 #include "libavutil/time.h"
00041 
00042 #include "simple_idct.h"
00043 #include "aandcttab.h"
00044 #include "faandct.h"
00045 #include "faanidct.h"
00046 #include "x86/idct_xvid.h"
00047 #include "dctref.h"
00048 
00049 #undef printf
00050 
/* Prototypes for platform-specific transforms referenced by the algorithm
 * tables below. */

/* x86: listed as "LIBMPEG2-MMX" / "LIBMPEG2-MMX2" in idct_tab */
void ff_mmx_idct(DCTELEM *data);
void ff_mmxext_idct(DCTELEM *data);

/* Blackfin (ARCH_BFIN table entries) */
void ff_bfin_idct(DCTELEM *block);
void ff_bfin_fdct(DCTELEM *block);

/* AltiVec (HAVE_ALTIVEC table entry) */
void ff_fdct_altivec(DCTELEM *block);

/* ARM family (ARCH_ARM / HAVE_ARMV5TE / HAVE_ARMV6 / HAVE_NEON table entries) */
void ff_j_rev_dct_arm(DCTELEM *data);
void ff_simple_idct_arm(DCTELEM *data);
void ff_simple_idct_armv5te(DCTELEM *data);
void ff_simple_idct_armv6(DCTELEM *data);
void ff_simple_idct_neon(DCTELEM *data);

/* Alpha (ARCH_ALPHA table entry) */
void ff_simple_idct_axp(DCTELEM *data);
00069 
00070 struct algo {
00071     const char *name;
00072     void (*func)(DCTELEM *block);
00073     enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
00074                      SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
00075     int mm_support;
00076     int nonspec;
00077 };
00078 
/* CPU feature flags obtained from av_get_cpu_flags() in main(); each table
 * entry's mm_support is checked against them before the entry is run. */
static int cpu_flags;
00080 
00081 static const struct algo fdct_tab[] = {
00082     { "REF-DBL",        ff_ref_fdct,           NO_PERM    },
00083     { "FAAN",           ff_faandct,            NO_PERM    },
00084     { "IJG-AAN-INT",    ff_fdct_ifast,         SCALE_PERM },
00085     { "IJG-LLM-INT",    ff_jpeg_fdct_islow_8,  NO_PERM    },
00086 
00087 #if HAVE_MMX_INLINE
00088     { "MMX",            ff_fdct_mmx,           NO_PERM,   AV_CPU_FLAG_MMX     },
00089     { "MMXEXT",         ff_fdct_mmx2,          NO_PERM,   AV_CPU_FLAG_MMXEXT  },
00090     { "SSE2",           ff_fdct_sse2,          NO_PERM,   AV_CPU_FLAG_SSE2    },
00091 #endif
00092 
00093 #if HAVE_ALTIVEC
00094     { "altivecfdct",    ff_fdct_altivec,       NO_PERM,   AV_CPU_FLAG_ALTIVEC },
00095 #endif
00096 
00097 #if ARCH_BFIN
00098     { "BFINfdct",       ff_bfin_fdct,          NO_PERM  },
00099 #endif
00100 
00101     { 0 }
00102 };
00103 
#if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
/* Defined outside this file. */
void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
                                DCTELEM *block, int16_t *qmat);

/**
 * Adapter that gives ff_prores_idct_put_10_sse2() the in-place
 * void (*)(DCTELEM *) signature required by struct algo.
 *
 * The coefficients in dst are copied to a scratch buffer, which is passed as
 * the input block together with a quantisation matrix filled with 4; the
 * result is written back over dst.
 *
 * @param dst 64 coefficients in, transform output out
 */
static void ff_prores_idct_put_10_sse2_wrap(DCTELEM *dst){
    DECLARE_ALIGNED(16, static int16_t, qmat)[64];
    DECLARE_ALIGNED(16, static int16_t, tmp)[64];
    int i;

    for(i=0; i<64; i++){
        qmat[i]=4;
        tmp[i]= dst[i];
    }
    /* The output parameter is uint16_t *, which is not compatible with
     * DCTELEM *; convert explicitly instead of relying on an implicit
     * pointer conversion.
     * NOTE(review): linesize 16 presumably means 8 samples * 2 bytes — confirm. */
    ff_prores_idct_put_10_sse2((uint16_t *)dst, 16, tmp, qmat);
}
#endif
00120 
00121 static const struct algo idct_tab[] = {
00122     { "FAANI",          ff_faanidct,           NO_PERM  },
00123     { "REF-DBL",        ff_ref_idct,           NO_PERM  },
00124     { "INT",            ff_j_rev_dct,          MMX_PERM },
00125     { "SIMPLE-C",       ff_simple_idct_8,      NO_PERM  },
00126 
00127 #if HAVE_MMX_INLINE
00128 #if CONFIG_GPL
00129     { "LIBMPEG2-MMX",   ff_mmx_idct,           MMX_PERM,  AV_CPU_FLAG_MMX,  1 },
00130     { "LIBMPEG2-MMX2",  ff_mmxext_idct,        MMX_PERM,  AV_CPU_FLAG_MMX2, 1 },
00131 #endif
00132     { "SIMPLE-MMX",     ff_simple_idct_mmx,  MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
00133     { "XVID-MMX",       ff_idct_xvid_mmx,      NO_PERM,   AV_CPU_FLAG_MMX,  1 },
00134     { "XVID-MMXEXT",    ff_idct_xvid_mmx2,     NO_PERM,   AV_CPU_FLAG_MMXEXT, 1 },
00135     { "XVID-SSE2",      ff_idct_xvid_sse2,     SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
00136 #if ARCH_X86_64 && HAVE_YASM
00137     { "PR-SSE2",        ff_prores_idct_put_10_sse2_wrap,     TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
00138 #endif
00139 #endif
00140 
00141 #if ARCH_BFIN
00142     { "BFINidct",       ff_bfin_idct,          NO_PERM  },
00143 #endif
00144 
00145 #if ARCH_ARM
00146     { "SIMPLE-ARM",     ff_simple_idct_arm,    NO_PERM  },
00147     { "INT-ARM",        ff_j_rev_dct_arm,      MMX_PERM },
00148 #endif
00149 #if HAVE_ARMV5TE
00150     { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM  },
00151 #endif
00152 #if HAVE_ARMV6
00153     { "SIMPLE-ARMV6",   ff_simple_idct_armv6,  MMX_PERM },
00154 #endif
00155 #if HAVE_NEON
00156     { "SIMPLE-NEON",    ff_simple_idct_neon,   PARTTRANS_PERM },
00157 #endif
00158 
00159 #if ARCH_ALPHA
00160     { "SIMPLE-ALPHA",   ff_simple_idct_axp,    NO_PERM },
00161 #endif
00162 
00163     { 0 }
00164 };
00165 
/* Fixed-point precision used by dct_error() when rescaling SCALE_PERM output
 * with ff_aanscales[]. */
#define AANSCALE_BITS 12

/* Number of blocks evaluated per accuracy test. */
#define NB_ITS 20000
/* Number of transform calls per timing batch; batches repeat until one
 * second has elapsed. */
#define NB_ITS_SPEED 50000
00170 
/* Destination index for each coefficient under MMX_PERM; filled at run time
 * by idct_mmx_init(). */
static short idct_mmx_perm[64];

/* Destination index for each coefficient under MMX_SIMPLE_PERM: permute()
 * stores source element i at index idct_simple_mmx_perm[i]. The table is
 * only ever read, so it is const like idct_sse2_row_perm below. */
static const short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Destination column for each source column under SSE2_PERM; the row part
 * of the index is left unchanged by permute(). */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
00185 
00186 static void idct_mmx_init(void)
00187 {
00188     int i;
00189 
00190     
00191     for (i = 0; i < 64; i++) {
00192         idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
00193     }
00194 }
00195 
/* Scratch coefficient buffers shared by dct_error() and idct248_error().
 * block is 16-byte aligned, block1 8-byte aligned. */
DECLARE_ALIGNED(16, static DCTELEM, block)[64];
DECLARE_ALIGNED(8,  static DCTELEM, block1)[64];
00198 
00199 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng, int vals)
00200 {
00201     int i, j;
00202 
00203     memset(block, 0, 64 * sizeof(*block));
00204 
00205     switch (test) {
00206     case 0:
00207         for (i = 0; i < 64; i++)
00208             block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
00209         if (is_idct) {
00210             ff_ref_fdct(block);
00211             for (i = 0; i < 64; i++)
00212                 block[i] >>= 3;
00213         }
00214         break;
00215     case 1:
00216         j = av_lfg_get(prng) % 10 + 1;
00217         for (i = 0; i < j; i++) {
00218             int idx = av_lfg_get(prng) % 64;
00219             block[idx] = av_lfg_get(prng) % (2*vals) -vals;
00220         }
00221         break;
00222     case 2:
00223         block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
00224         block[63] = (block[0] & 1) ^ 1;
00225         break;
00226     }
00227 }
00228 
00229 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
00230 {
00231     int i;
00232 
00233     if (perm == MMX_PERM) {
00234         for (i = 0; i < 64; i++)
00235             dst[idct_mmx_perm[i]] = src[i];
00236     } else if (perm == MMX_SIMPLE_PERM) {
00237         for (i = 0; i < 64; i++)
00238             dst[idct_simple_mmx_perm[i]] = src[i];
00239     } else if (perm == SSE2_PERM) {
00240         for (i = 0; i < 64; i++)
00241             dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
00242     } else if (perm == PARTTRANS_PERM) {
00243         for (i = 0; i < 64; i++)
00244             dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
00245     } else if (perm == TRANSPOSE_PERM) {
00246         for (i = 0; i < 64; i++)
00247             dst[(i>>3) | ((i<<3)&0x38)] = src[i];
00248     } else {
00249         for (i = 0; i < 64; i++)
00250             dst[i] = src[i];
00251     }
00252 }
00253 
00254 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
00255 {
00256     void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
00257     int it, i, scale;
00258     int err_inf, v;
00259     int64_t err2, ti, ti1, it1, err_sum = 0;
00260     int64_t sysErr[64], sysErrMax = 0;
00261     int maxout = 0;
00262     int blockSumErrMax = 0, blockSumErr;
00263     AVLFG prng;
00264     const int vals=1<<bits;
00265     double omse, ome;
00266     int spec_err;
00267 
00268     av_lfg_init(&prng, 1);
00269 
00270     err_inf = 0;
00271     err2 = 0;
00272     for (i = 0; i < 64; i++)
00273         sysErr[i] = 0;
00274     for (it = 0; it < NB_ITS; it++) {
00275         init_block(block1, test, is_idct, &prng, vals);
00276         permute(block, block1, dct->format);
00277 
00278         dct->func(block);
00279         emms_c();
00280 
00281         if (dct->format == SCALE_PERM) {
00282             for (i = 0; i < 64; i++) {
00283                 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
00284                 block[i] = (block[i] * scale) >> AANSCALE_BITS;
00285             }
00286         }
00287 
00288         ref(block1);
00289 
00290         blockSumErr = 0;
00291         for (i = 0; i < 64; i++) {
00292             int err = block[i] - block1[i];
00293             err_sum += err;
00294             v = abs(err);
00295             if (v > err_inf)
00296                 err_inf = v;
00297             err2 += v * v;
00298             sysErr[i] += block[i] - block1[i];
00299             blockSumErr += v;
00300             if (abs(block[i]) > maxout)
00301                 maxout = abs(block[i]);
00302         }
00303         if (blockSumErrMax < blockSumErr)
00304             blockSumErrMax = blockSumErr;
00305     }
00306     for (i = 0; i < 64; i++)
00307         sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
00308 
00309     for (i = 0; i < 64; i++) {
00310         if (i % 8 == 0)
00311             printf("\n");
00312         printf("%7d ", (int) sysErr[i]);
00313     }
00314     printf("\n");
00315 
00316     omse = (double) err2 / NB_ITS / 64;
00317     ome  = (double) err_sum / NB_ITS / 64;
00318 
00319     spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
00320 
00321     printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
00322            is_idct ? "IDCT" : "DCT", dct->name, err_inf,
00323            omse, ome, (double) sysErrMax / NB_ITS,
00324            maxout, blockSumErrMax);
00325 
00326     if (spec_err && !dct->nonspec)
00327         return 1;
00328 
00329     if (!speed)
00330         return 0;
00331 
00332     
00333 
00334     init_block(block, test, is_idct, &prng, vals);
00335     permute(block1, block, dct->format);
00336 
00337     ti = av_gettime();
00338     it1 = 0;
00339     do {
00340         for (it = 0; it < NB_ITS_SPEED; it++) {
00341             memcpy(block, block1, sizeof(block));
00342             dct->func(block);
00343         }
00344         emms_c();
00345         it1 += NB_ITS_SPEED;
00346         ti1 = av_gettime() - ti;
00347     } while (ti1 < 1000000);
00348 
00349     printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
00350            (double) it1 * 1000.0 / (double) ti1);
00351 
00352     return 0;
00353 }
00354 
/* 8x8 pixel outputs compared in idct248_error(): img_dest receives the
 * result of the implementation under test, img_dest1 the reference result. */
DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
00357 
00358 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
00359 {
00360     static int init;
00361     static double c8[8][8];
00362     static double c4[4][4];
00363     double block1[64], block2[64], block3[64];
00364     double s, sum, v;
00365     int i, j, k;
00366 
00367     if (!init) {
00368         init = 1;
00369 
00370         for (i = 0; i < 8; i++) {
00371             sum = 0;
00372             for (j = 0; j < 8; j++) {
00373                 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
00374                 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
00375                 sum += c8[i][j] * c8[i][j];
00376             }
00377         }
00378 
00379         for (i = 0; i < 4; i++) {
00380             sum = 0;
00381             for (j = 0; j < 4; j++) {
00382                 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
00383                 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
00384                 sum += c4[i][j] * c4[i][j];
00385             }
00386         }
00387     }
00388 
00389     
00390     s = 0.5 * sqrt(2.0);
00391     for (i = 0; i < 4; i++) {
00392         for (j = 0; j < 8; j++) {
00393             block1[8 * (2 * i) + j] =
00394                 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
00395             block1[8 * (2 * i + 1) + j] =
00396                 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
00397         }
00398     }
00399 
00400     
00401     for (i = 0; i < 8; i++) {
00402         for (j = 0; j < 8; j++) {
00403             sum = 0;
00404             for (k = 0; k < 8; k++)
00405                 sum += c8[k][j] * block1[8 * i + k];
00406             block2[8 * i + j] = sum;
00407         }
00408     }
00409 
00410     
00411     for (i = 0; i < 8; i++) {
00412         for (j = 0; j < 4; j++) {
00413             
00414             sum = 0;
00415             for (k = 0; k < 4; k++)
00416                 sum += c4[k][j] * block2[8 * (2 * k) + i];
00417             block3[8 * (2 * j) + i] = sum;
00418 
00419             
00420             sum = 0;
00421             for (k = 0; k < 4; k++)
00422                 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
00423             block3[8 * (2 * j + 1) + i] = sum;
00424         }
00425     }
00426 
00427     
00428     for (i = 0; i < 8; i++) {
00429         for (j = 0; j < 8; j++) {
00430             v = block3[8 * i + j];
00431             if      (v < 0)   v = 0;
00432             else if (v > 255) v = 255;
00433             dest[i * linesize + j] = (int) rint(v);
00434         }
00435     }
00436 }
00437 
00438 static void idct248_error(const char *name,
00439                           void (*idct248_put)(uint8_t *dest, int line_size,
00440                                               int16_t *block),
00441                           int speed)
00442 {
00443     int it, i, it1, ti, ti1, err_max, v;
00444     AVLFG prng;
00445 
00446     av_lfg_init(&prng, 1);
00447 
00448     
00449 
00450     err_max = 0;
00451     for (it = 0; it < NB_ITS; it++) {
00452         
00453         for (i = 0; i < 64; i++)
00454             block1[i] = av_lfg_get(&prng) % 256 - 128;
00455         block1[0] += 1024;
00456 
00457         for (i = 0; i < 64; i++)
00458             block[i] = block1[i];
00459         idct248_ref(img_dest1, 8, block);
00460 
00461         for (i = 0; i < 64; i++)
00462             block[i] = block1[i];
00463         idct248_put(img_dest, 8, block);
00464 
00465         for (i = 0; i < 64; i++) {
00466             v = abs((int) img_dest[i] - (int) img_dest1[i]);
00467             if (v == 255)
00468                 printf("%d %d\n", img_dest[i], img_dest1[i]);
00469             if (v > err_max)
00470                 err_max = v;
00471         }
00472 #if 0
00473         printf("ref=\n");
00474         for(i=0;i<8;i++) {
00475             int j;
00476             for(j=0;j<8;j++) {
00477                 printf(" %3d", img_dest1[i*8+j]);
00478             }
00479             printf("\n");
00480         }
00481 
00482         printf("out=\n");
00483         for(i=0;i<8;i++) {
00484             int j;
00485             for(j=0;j<8;j++) {
00486                 printf(" %3d", img_dest[i*8+j]);
00487             }
00488             printf("\n");
00489         }
00490 #endif
00491     }
00492     printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
00493 
00494     if (!speed)
00495         return;
00496 
00497     ti = av_gettime();
00498     it1 = 0;
00499     do {
00500         for (it = 0; it < NB_ITS_SPEED; it++) {
00501             for (i = 0; i < 64; i++)
00502                 block[i] = block1[i];
00503             idct248_put(img_dest, 8, block);
00504         }
00505         emms_c();
00506         it1 += NB_ITS_SPEED;
00507         ti1 = av_gettime() - ti;
00508     } while (ti1 < 1000000);
00509 
00510     printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
00511            (double) it1 * 1000.0 / (double) ti1);
00512 }
00513 
/**
 * Print the command-line usage. The synopsis lists every option accepted by
 * the getopt() string in main().
 */
static void help(void)
{
    printf("dct-test [-i] [-4] [-t] [-h] [<test-number>] [<bits>]\n"
           "test-number 0 -> test with random matrixes\n"
           "            1 -> test with random sparse matrixes\n"
           "            2 -> do 3. test from mpeg4 std\n"
           "bits        Number of time domain bits to use, 8 is default\n"
           "-i          test IDCT implementations\n"
           "-4          test IDCT248 implementations\n"
           "-t          speed test\n"
           "-h          print this help\n");
}
00525 
00526 #if !HAVE_GETOPT
00527 #include "compat/getopt.c"
00528 #endif
00529 
00530 int main(int argc, char **argv)
00531 {
00532     int test_idct = 0, test_248_dct = 0;
00533     int c, i;
00534     int test = 1;
00535     int speed = 0;
00536     int err = 0;
00537     int bits=8;
00538 
00539     cpu_flags = av_get_cpu_flags();
00540 
00541     ff_ref_dct_init();
00542     idct_mmx_init();
00543 
00544     for (;;) {
00545         c = getopt(argc, argv, "ih4t");
00546         if (c == -1)
00547             break;
00548         switch (c) {
00549         case 'i':
00550             test_idct = 1;
00551             break;
00552         case '4':
00553             test_248_dct = 1;
00554             break;
00555         case 't':
00556             speed = 1;
00557             break;
00558         default:
00559         case 'h':
00560             help();
00561             return 0;
00562         }
00563     }
00564 
00565     if (optind < argc)
00566         test = atoi(argv[optind]);
00567     if(optind+1 < argc) bits= atoi(argv[optind+1]);
00568 
00569     printf("ffmpeg DCT/IDCT test\n");
00570 
00571     if (test_248_dct) {
00572         idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
00573     } else {
00574         const struct algo *algos = test_idct ? idct_tab : fdct_tab;
00575         for (i = 0; algos[i].name; i++)
00576             if (!(~cpu_flags & algos[i].mm_support)) {
00577                 err |= dct_error(&algos[i], test, test_idct, speed, bits);
00578             }
00579     }
00580 
00581     return err;
00582 }