FFmpeg: libavcodec/simple_idct.c Source File

00001 /*
00002  * Simple IDCT
00003  *
00004  * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
00005  *
00006  * This file is part of FFmpeg.
00007  *
00008  * FFmpeg is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU Lesser General Public
00010  * License as published by the Free Software Foundation; either
00011  * version 2.1 of the License, or (at your option) any later version.
00012  *
00013  * FFmpeg is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  * Lesser General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU Lesser General Public
00019  * License along with FFmpeg; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00021  */
00022 
/*
  Based upon some commented-out C code from mpeg2dec (idct_mmx.c,
  written by Aaron Holtzman <aholtzma@ess.engr.uvic.ca>).
 */
00032 #include "avcodec.h"
00033 #include "dsputil.h"
00034 #include "mathops.h"
00035 #include "simple_idct.h"
00036 
/*
 * Fixed-point cosine tables for the 8-point IDCT.
 * Wk is derived from cos(k*pi/16)*sqrt(2) at a given scale; ROW_SHIFT and
 * COL_SHIFT are the right shifts applied after the row and column passes.
 */
#if 0
/* Disabled alternative table, scale 2048: Wk = 2048*sqrt(2)*cos(k*pi/16). */
#define W1        2841
#define W2        2676
#define W3        2408
#define W4        2048
#define W5        1609
#define W6        1108
#define W7        565
#define ROW_SHIFT 8
#define COL_SHIFT 17
#else
/* Active table, scale 1<<14: Wk = cos(k*M_PI/16)*sqrt(2)*(1<<14) + 0.5. */
#define W1        22725
#define W2        21407
#define W3        19266
/* NOTE(review): the formula above yields 16384 for k = 4; the table uses
 * 16383. Presumably a deliberate tuning -- confirm before changing. */
#define W4        16383
#define W5        12873
#define W6        8867
#define W7        4520
#define ROW_SHIFT 11
#define COL_SHIFT 20 /* the original listing carried a trailing "6" note here */
#endif
00058 
00059 static inline void idctRowCondDC (DCTELEM * row)
00060 {
00061         int a0, a1, a2, a3, b0, b1, b2, b3;
00062 #if HAVE_FAST_64BIT
00063         uint64_t temp;
00064 #else
00065         uint32_t temp;
00066 #endif
00067 
00068 #if HAVE_FAST_64BIT
00069 #if HAVE_BIGENDIAN
00070 #define ROW0_MASK 0xffff000000000000LL
00071 #else
00072 #define ROW0_MASK 0xffffLL
00073 #endif
00074         if(sizeof(DCTELEM)==2){
00075             if ( ((((uint64_t *)row)[0] & ~ROW0_MASK) |
00076                   ((uint64_t *)row)[1]) == 0) {
00077                 temp = (row[0] << 3) & 0xffff;
00078                 temp += temp << 16;
00079                 temp += temp << 32;
00080                 ((uint64_t *)row)[0] = temp;
00081                 ((uint64_t *)row)[1] = temp;
00082                 return;
00083             }
00084         }else{
00085             if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) {
00086                 row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] << 3;
00087                 return;
00088             }
00089         }
00090 #else
00091         if(sizeof(DCTELEM)==2){
00092             if (!(((uint32_t*)row)[1] |
00093                   ((uint32_t*)row)[2] |
00094                   ((uint32_t*)row)[3] |
00095                   row[1])) {
00096                 temp = (row[0] << 3) & 0xffff;
00097                 temp += temp << 16;
00098                 ((uint32_t*)row)[0]=((uint32_t*)row)[1] =
00099                 ((uint32_t*)row)[2]=((uint32_t*)row)[3] = temp;
00100                 return;
00101             }
00102         }else{
00103             if (!(row[1]|row[2]|row[3]|row[4]|row[5]|row[6]|row[7])) {
00104                 row[0]=row[1]=row[2]=row[3]=row[4]=row[5]=row[6]=row[7]= row[0] << 3;
00105                 return;
00106             }
00107         }
00108 #endif
00109 
00110         a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
00111         a1 = a0;
00112         a2 = a0;
00113         a3 = a0;
00114 
00115         /* no need to optimize : gcc does it */
00116         a0 += W2 * row[2];
00117         a1 += W6 * row[2];
00118         a2 -= W6 * row[2];
00119         a3 -= W2 * row[2];
00120 
00121         b0 = MUL16(W1, row[1]);
00122         MAC16(b0, W3, row[3]);
00123         b1 = MUL16(W3, row[1]);
00124         MAC16(b1, -W7, row[3]);
00125         b2 = MUL16(W5, row[1]);
00126         MAC16(b2, -W1, row[3]);
00127         b3 = MUL16(W7, row[1]);
00128         MAC16(b3, -W5, row[3]);
00129 
00130 #if HAVE_FAST_64BIT
00131         temp = ((uint64_t*)row)[1];
00132 #else
00133         temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
00134 #endif
00135         if (temp != 0) {
00136             a0 += W4*row[4] + W6*row[6];
00137             a1 += - W4*row[4] - W2*row[6];
00138             a2 += - W4*row[4] + W2*row[6];
00139             a3 += W4*row[4] - W6*row[6];
00140 
00141             MAC16(b0, W5, row[5]);
00142             MAC16(b0, W7, row[7]);
00143 
00144             MAC16(b1, -W1, row[5]);
00145             MAC16(b1, -W5, row[7]);
00146 
00147             MAC16(b2, W7, row[5]);
00148             MAC16(b2, W3, row[7]);
00149 
00150             MAC16(b3, W3, row[5]);
00151             MAC16(b3, -W1, row[7]);
00152         }
00153 
00154         row[0] = (a0 + b0) >> ROW_SHIFT;
00155         row[7] = (a0 - b0) >> ROW_SHIFT;
00156         row[1] = (a1 + b1) >> ROW_SHIFT;
00157         row[6] = (a1 - b1) >> ROW_SHIFT;
00158         row[2] = (a2 + b2) >> ROW_SHIFT;
00159         row[5] = (a2 - b2) >> ROW_SHIFT;
00160         row[3] = (a3 + b3) >> ROW_SHIFT;
00161         row[4] = (a3 - b3) >> ROW_SHIFT;
00162 }
00163 
00164 static inline void idctSparseColPut (uint8_t *dest, int line_size,
00165                                      DCTELEM * col)
00166 {
00167         int a0, a1, a2, a3, b0, b1, b2, b3;
00168         uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00169 
00170         /* XXX: I did that only to give same values as previous code */
00171         a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
00172         a1 = a0;
00173         a2 = a0;
00174         a3 = a0;
00175 
00176         a0 +=  + W2*col[8*2];
00177         a1 +=  + W6*col[8*2];
00178         a2 +=  - W6*col[8*2];
00179         a3 +=  - W2*col[8*2];
00180 
00181         b0 = MUL16(W1, col[8*1]);
00182         b1 = MUL16(W3, col[8*1]);
00183         b2 = MUL16(W5, col[8*1]);
00184         b3 = MUL16(W7, col[8*1]);
00185 
00186         MAC16(b0, + W3, col[8*3]);
00187         MAC16(b1, - W7, col[8*3]);
00188         MAC16(b2, - W1, col[8*3]);
00189         MAC16(b3, - W5, col[8*3]);
00190 
00191         if(col[8*4]){
00192             a0 += + W4*col[8*4];
00193             a1 += - W4*col[8*4];
00194             a2 += - W4*col[8*4];
00195             a3 += + W4*col[8*4];
00196         }
00197 
00198         if (col[8*5]) {
00199             MAC16(b0, + W5, col[8*5]);
00200             MAC16(b1, - W1, col[8*5]);
00201             MAC16(b2, + W7, col[8*5]);
00202             MAC16(b3, + W3, col[8*5]);
00203         }
00204 
00205         if(col[8*6]){
00206             a0 += + W6*col[8*6];
00207             a1 += - W2*col[8*6];
00208             a2 += + W2*col[8*6];
00209             a3 += - W6*col[8*6];
00210         }
00211 
00212         if (col[8*7]) {
00213             MAC16(b0, + W7, col[8*7]);
00214             MAC16(b1, - W5, col[8*7]);
00215             MAC16(b2, + W3, col[8*7]);
00216             MAC16(b3, - W1, col[8*7]);
00217         }
00218 
00219         dest[0] = cm[(a0 + b0) >> COL_SHIFT];
00220         dest += line_size;
00221         dest[0] = cm[(a1 + b1) >> COL_SHIFT];
00222         dest += line_size;
00223         dest[0] = cm[(a2 + b2) >> COL_SHIFT];
00224         dest += line_size;
00225         dest[0] = cm[(a3 + b3) >> COL_SHIFT];
00226         dest += line_size;
00227         dest[0] = cm[(a3 - b3) >> COL_SHIFT];
00228         dest += line_size;
00229         dest[0] = cm[(a2 - b2) >> COL_SHIFT];
00230         dest += line_size;
00231         dest[0] = cm[(a1 - b1) >> COL_SHIFT];
00232         dest += line_size;
00233         dest[0] = cm[(a0 - b0) >> COL_SHIFT];
00234 }
00235 
00236 static inline void idctSparseColAdd (uint8_t *dest, int line_size,
00237                                      DCTELEM * col)
00238 {
00239         int a0, a1, a2, a3, b0, b1, b2, b3;
00240         uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00241 
00242         /* XXX: I did that only to give same values as previous code */
00243         a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
00244         a1 = a0;
00245         a2 = a0;
00246         a3 = a0;
00247 
00248         a0 +=  + W2*col[8*2];
00249         a1 +=  + W6*col[8*2];
00250         a2 +=  - W6*col[8*2];
00251         a3 +=  - W2*col[8*2];
00252 
00253         b0 = MUL16(W1, col[8*1]);
00254         b1 = MUL16(W3, col[8*1]);
00255         b2 = MUL16(W5, col[8*1]);
00256         b3 = MUL16(W7, col[8*1]);
00257 
00258         MAC16(b0, + W3, col[8*3]);
00259         MAC16(b1, - W7, col[8*3]);
00260         MAC16(b2, - W1, col[8*3]);
00261         MAC16(b3, - W5, col[8*3]);
00262 
00263         if(col[8*4]){
00264             a0 += + W4*col[8*4];
00265             a1 += - W4*col[8*4];
00266             a2 += - W4*col[8*4];
00267             a3 += + W4*col[8*4];
00268         }
00269 
00270         if (col[8*5]) {
00271             MAC16(b0, + W5, col[8*5]);
00272             MAC16(b1, - W1, col[8*5]);
00273             MAC16(b2, + W7, col[8*5]);
00274             MAC16(b3, + W3, col[8*5]);
00275         }
00276 
00277         if(col[8*6]){
00278             a0 += + W6*col[8*6];
00279             a1 += - W2*col[8*6];
00280             a2 += + W2*col[8*6];
00281             a3 += - W6*col[8*6];
00282         }
00283 
00284         if (col[8*7]) {
00285             MAC16(b0, + W7, col[8*7]);
00286             MAC16(b1, - W5, col[8*7]);
00287             MAC16(b2, + W3, col[8*7]);
00288             MAC16(b3, - W1, col[8*7]);
00289         }
00290 
00291         dest[0] = cm[dest[0] + ((a0 + b0) >> COL_SHIFT)];
00292         dest += line_size;
00293         dest[0] = cm[dest[0] + ((a1 + b1) >> COL_SHIFT)];
00294         dest += line_size;
00295         dest[0] = cm[dest[0] + ((a2 + b2) >> COL_SHIFT)];
00296         dest += line_size;
00297         dest[0] = cm[dest[0] + ((a3 + b3) >> COL_SHIFT)];
00298         dest += line_size;
00299         dest[0] = cm[dest[0] + ((a3 - b3) >> COL_SHIFT)];
00300         dest += line_size;
00301         dest[0] = cm[dest[0] + ((a2 - b2) >> COL_SHIFT)];
00302         dest += line_size;
00303         dest[0] = cm[dest[0] + ((a1 - b1) >> COL_SHIFT)];
00304         dest += line_size;
00305         dest[0] = cm[dest[0] + ((a0 - b0) >> COL_SHIFT)];
00306 }
00307 
00308 static inline void idctSparseCol (DCTELEM * col)
00309 {
00310         int a0, a1, a2, a3, b0, b1, b2, b3;
00311 
00312         /* XXX: I did that only to give same values as previous code */
00313         a0 = W4 * (col[8*0] + ((1<<(COL_SHIFT-1))/W4));
00314         a1 = a0;
00315         a2 = a0;
00316         a3 = a0;
00317 
00318         a0 +=  + W2*col[8*2];
00319         a1 +=  + W6*col[8*2];
00320         a2 +=  - W6*col[8*2];
00321         a3 +=  - W2*col[8*2];
00322 
00323         b0 = MUL16(W1, col[8*1]);
00324         b1 = MUL16(W3, col[8*1]);
00325         b2 = MUL16(W5, col[8*1]);
00326         b3 = MUL16(W7, col[8*1]);
00327 
00328         MAC16(b0, + W3, col[8*3]);
00329         MAC16(b1, - W7, col[8*3]);
00330         MAC16(b2, - W1, col[8*3]);
00331         MAC16(b3, - W5, col[8*3]);
00332 
00333         if(col[8*4]){
00334             a0 += + W4*col[8*4];
00335             a1 += - W4*col[8*4];
00336             a2 += - W4*col[8*4];
00337             a3 += + W4*col[8*4];
00338         }
00339 
00340         if (col[8*5]) {
00341             MAC16(b0, + W5, col[8*5]);
00342             MAC16(b1, - W1, col[8*5]);
00343             MAC16(b2, + W7, col[8*5]);
00344             MAC16(b3, + W3, col[8*5]);
00345         }
00346 
00347         if(col[8*6]){
00348             a0 += + W6*col[8*6];
00349             a1 += - W2*col[8*6];
00350             a2 += + W2*col[8*6];
00351             a3 += - W6*col[8*6];
00352         }
00353 
00354         if (col[8*7]) {
00355             MAC16(b0, + W7, col[8*7]);
00356             MAC16(b1, - W5, col[8*7]);
00357             MAC16(b2, + W3, col[8*7]);
00358             MAC16(b3, - W1, col[8*7]);
00359         }
00360 
00361         col[0 ] = ((a0 + b0) >> COL_SHIFT);
00362         col[8 ] = ((a1 + b1) >> COL_SHIFT);
00363         col[16] = ((a2 + b2) >> COL_SHIFT);
00364         col[24] = ((a3 + b3) >> COL_SHIFT);
00365         col[32] = ((a3 - b3) >> COL_SHIFT);
00366         col[40] = ((a2 - b2) >> COL_SHIFT);
00367         col[48] = ((a1 - b1) >> COL_SHIFT);
00368         col[56] = ((a0 - b0) >> COL_SHIFT);
00369 }
00370 
00371 void ff_simple_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
00372 {
00373     int i;
00374     for(i=0; i<8; i++)
00375         idctRowCondDC(block + i*8);
00376 
00377     for(i=0; i<8; i++)
00378         idctSparseColPut(dest + i, line_size, block + i);
00379 }
00380 
00381 void ff_simple_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
00382 {
00383     int i;
00384     for(i=0; i<8; i++)
00385         idctRowCondDC(block + i*8);
00386 
00387     for(i=0; i<8; i++)
00388         idctSparseColAdd(dest + i, line_size, block + i);
00389 }
00390 
00391 void ff_simple_idct(DCTELEM *block)
00392 {
00393     int i;
00394     for(i=0; i<8; i++)
00395         idctRowCondDC(block + i*8);
00396 
00397     for(i=0; i<8; i++)
00398         idctSparseCol(block + i);
00399 }
00400 
00401 /* 2x4x8 idct */
00402 
00403 #define CN_SHIFT 12
00404 #define C_FIX(x) ((int)((x) * (1 << CN_SHIFT) + 0.5))
00405 #define C1 C_FIX(0.6532814824)
00406 #define C2 C_FIX(0.2705980501)
00407 
00408 /* row idct is multiple by 16 * sqrt(2.0), col idct4 is normalized,
00409    and the butterfly must be multiplied by 0.5 * sqrt(2.0) */
00410 #define C_SHIFT (4+1+12)
00411 
00412 static inline void idct4col_put(uint8_t *dest, int line_size, const DCTELEM *col)
00413 {
00414     int c0, c1, c2, c3, a0, a1, a2, a3;
00415     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00416 
00417     a0 = col[8*0];
00418     a1 = col[8*2];
00419     a2 = col[8*4];
00420     a3 = col[8*6];
00421     c0 = ((a0 + a2) << (CN_SHIFT - 1)) + (1 << (C_SHIFT - 1));
00422     c2 = ((a0 - a2) << (CN_SHIFT - 1)) + (1 << (C_SHIFT - 1));
00423     c1 = a1 * C1 + a3 * C2;
00424     c3 = a1 * C2 - a3 * C1;
00425     dest[0] = cm[(c0 + c1) >> C_SHIFT];
00426     dest += line_size;
00427     dest[0] = cm[(c2 + c3) >> C_SHIFT];
00428     dest += line_size;
00429     dest[0] = cm[(c2 - c3) >> C_SHIFT];
00430     dest += line_size;
00431     dest[0] = cm[(c0 - c1) >> C_SHIFT];
00432 }
00433 
00434 #define BF(k) \
00435 {\
00436     int a0, a1;\
00437     a0 = ptr[k];\
00438     a1 = ptr[8 + k];\
00439     ptr[k] = a0 + a1;\
00440     ptr[8 + k] = a0 - a1;\
00441 }
00442 
00443 /* only used by DV codec. The input must be interlaced. 128 is added
00444    to the pixels before clamping to avoid systematic error
00445    (1024*sqrt(2)) offset would be needed otherwise. */
00446 /* XXX: I think a 1.0/sqrt(2) normalization should be needed to
00447    compensate the extra butterfly stage - I don't have the full DV
00448    specification */
00449 void ff_simple_idct248_put(uint8_t *dest, int line_size, DCTELEM *block)
00450 {
00451     int i;
00452     DCTELEM *ptr;
00453 
00454     /* butterfly */
00455     ptr = block;
00456     for(i=0;i<4;i++) {
00457         BF(0);
00458         BF(1);
00459         BF(2);
00460         BF(3);
00461         BF(4);
00462         BF(5);
00463         BF(6);
00464         BF(7);
00465         ptr += 2 * 8;
00466     }
00467 
00468     /* IDCT8 on each line */
00469     for(i=0; i<8; i++) {
00470         idctRowCondDC(block + i*8);
00471     }
00472 
00473     /* IDCT4 and store */
00474     for(i=0;i<8;i++) {
00475         idct4col_put(dest + i, 2 * line_size, block + i);
00476         idct4col_put(dest + line_size + i, 2 * line_size, block + 8 + i);
00477     }
00478 }
00479 
00480 /* 8x4 & 4x8 WMV2 IDCT */
00481 #undef CN_SHIFT
00482 #undef C_SHIFT
00483 #undef C_FIX
00484 #undef C1
00485 #undef C2
00486 #define CN_SHIFT 12
00487 #define C_FIX(x) ((int)((x) * 1.414213562 * (1 << CN_SHIFT) + 0.5))
00488 #define C1 C_FIX(0.6532814824)
00489 #define C2 C_FIX(0.2705980501)
00490 #define C3 C_FIX(0.5)
00491 #define C_SHIFT (4+1+12)
00492 static inline void idct4col_add(uint8_t *dest, int line_size, const DCTELEM *col)
00493 {
00494     int c0, c1, c2, c3, a0, a1, a2, a3;
00495     const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00496 
00497     a0 = col[8*0];
00498     a1 = col[8*1];
00499     a2 = col[8*2];
00500     a3 = col[8*3];
00501     c0 = (a0 + a2)*C3 + (1 << (C_SHIFT - 1));
00502     c2 = (a0 - a2)*C3 + (1 << (C_SHIFT - 1));
00503     c1 = a1 * C1 + a3 * C2;
00504     c3 = a1 * C2 - a3 * C1;
00505     dest[0] = cm[dest[0] + ((c0 + c1) >> C_SHIFT)];
00506     dest += line_size;
00507     dest[0] = cm[dest[0] + ((c2 + c3) >> C_SHIFT)];
00508     dest += line_size;
00509     dest[0] = cm[dest[0] + ((c2 - c3) >> C_SHIFT)];
00510     dest += line_size;
00511     dest[0] = cm[dest[0] + ((c0 - c1) >> C_SHIFT)];
00512 }
00513 
00514 #define RN_SHIFT 15
00515 #define R_FIX(x) ((int)((x) * 1.414213562 * (1 << RN_SHIFT) + 0.5))
00516 #define R1 R_FIX(0.6532814824)
00517 #define R2 R_FIX(0.2705980501)
00518 #define R3 R_FIX(0.5)
00519 #define R_SHIFT 11
00520 static inline void idct4row(DCTELEM *row)
00521 {
00522     int c0, c1, c2, c3, a0, a1, a2, a3;
00523     //const uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00524 
00525     a0 = row[0];
00526     a1 = row[1];
00527     a2 = row[2];
00528     a3 = row[3];
00529     c0 = (a0 + a2)*R3 + (1 << (R_SHIFT - 1));
00530     c2 = (a0 - a2)*R3 + (1 << (R_SHIFT - 1));
00531     c1 = a1 * R1 + a3 * R2;
00532     c3 = a1 * R2 - a3 * R1;
00533     row[0]= (c0 + c1) >> R_SHIFT;
00534     row[1]= (c2 + c3) >> R_SHIFT;
00535     row[2]= (c2 - c3) >> R_SHIFT;
00536     row[3]= (c0 - c1) >> R_SHIFT;
00537 }
00538 
00539 void ff_simple_idct84_add(uint8_t *dest, int line_size, DCTELEM *block)
00540 {
00541     int i;
00542 
00543     /* IDCT8 on each line */
00544     for(i=0; i<4; i++) {
00545         idctRowCondDC(block + i*8);
00546     }
00547 
00548     /* IDCT4 and store */
00549     for(i=0;i<8;i++) {
00550         idct4col_add(dest + i, line_size, block + i);
00551     }
00552 }
00553 
00554 void ff_simple_idct48_add(uint8_t *dest, int line_size, DCTELEM *block)
00555 {
00556     int i;
00557 
00558     /* IDCT4 on each line */
00559     for(i=0; i<8; i++) {
00560         idct4row(block + i*8);
00561     }
00562 
00563     /* IDCT8 and store */
00564     for(i=0; i<4; i++){
00565         idctSparseColAdd(dest + i, line_size, block + i);
00566     }
00567 }
00568 
00569 void ff_simple_idct44_add(uint8_t *dest, int line_size, DCTELEM *block)
00570 {
00571     int i;
00572 
00573     /* IDCT4 on each line */
00574     for(i=0; i<4; i++) {
00575         idct4row(block + i*8);
00576     }
00577 
00578     /* IDCT4 and store */
00579     for(i=0; i<4; i++){
00580         idct4col_add(dest + i, line_size, block + i);
00581     }
00582 }