00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00030 #include "avcodec.h"
00031 #include "dsputil.h"
00032 #include "simple_idct.h"
00033 #include "faandct.h"
00034 #include "faanidct.h"
00035 #include "mathops.h"
00036 #include "mpegvideo.h"
00037 #include "config.h"
00038 #include "lpc.h"
00039 #include "ac3dec.h"
00040 #include "vorbis.h"
00041 #include "png.h"
00042
/* Clipping lookup table: indexed as ff_cropTbl[MAX_NEG_CROP + v] to clamp a
 * signed intermediate v into the 0..255 pixel range without branching.
 * Defined zeroed here; presumably filled at runtime by an init routine
 * elsewhere in this file — TODO confirm. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Square table: indexed as (ff_squareTbl + 256)[x] for x in -256..255 by the
 * SSE/variance routines below; also zeroed here and filled at init time. */
uint32_t ff_squareTbl[512] = {0, };


/* Byte-parallel word constants: every byte of the machine word is 0x7f/0x80.
 * Used for SIMD-within-a-register arithmetic on plain unsigned longs. */
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
00049
/* Classic 8x8 zigzag scan order (as in JPEG/MPEG): entry i is the raster
 * index (row*8 + column) of the i-th coefficient in scan order. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
00060
00061
00062
/* Alternate zigzag scan that pairs rows vertically (note each step visits
 * raster indices 8 apart) — appears intended for interlaced/field-coded
 * 2-4-8 DCT blocks; confirm against the codecs that select it. */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};

/* Inverse zigzag as 16-bit entries, 16-byte aligned for SIMD use; defined
 * uninitialized here — presumably filled at init time elsewhere (TODO confirm). */
DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
00076
/* Alternate horizontal scan order: favors runs along rows first.
 * NOTE(review): matches the MPEG-4 alternate-horizontal scan table —
 * confirm against the spec before relying on that name. */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};

/* Alternate vertical scan order: favors runs down columns first (used for
 * interlaced content; same caveat as above regarding the exact spec table). */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
00098
00099
00100
/* Reciprocal table for division by small constants: ff_inverse[i] is
 * ceil(2^32 / i) (entry 0 unused, entry 1 saturated to 2^32-1), so a
 * division x/i can be done as a 32x32->64 multiply plus shift.
 * Presumably consumed by a FASTDIV-style macro elsewhere — TODO confirm. */
const uint32_t ff_inverse[257]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
  16777216
};
00136
00137
/* Coefficient permutation applied to scan tables when the simple MMX IDCT
 * is in use (values are destination indices within the 8x8 block).
 * NOTE(review): the exact layout is dictated by the MMX IDCT's internal
 * ordering — verify against the matching IDCT implementation. */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Row permutation used by the SSE2 IDCT path (interleaves row halves). */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
00150
00151 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
00152 int i;
00153 int end;
00154
00155 st->scantable= src_scantable;
00156
00157 for(i=0; i<64; i++){
00158 int j;
00159 j = src_scantable[i];
00160 st->permutated[i] = permutation[j];
00161 #if ARCH_PPC
00162 st->inverse[j] = i;
00163 #endif
00164 }
00165
00166 end=-1;
00167 for(i=0; i<64; i++){
00168 int j;
00169 j = st->permutated[i];
00170 if(j>end) end=j;
00171 st->raster_end[i]= end;
00172 }
00173 }
00174
00175 static int pix_sum_c(uint8_t * pix, int line_size)
00176 {
00177 int s, i, j;
00178
00179 s = 0;
00180 for (i = 0; i < 16; i++) {
00181 for (j = 0; j < 16; j += 8) {
00182 s += pix[0];
00183 s += pix[1];
00184 s += pix[2];
00185 s += pix[3];
00186 s += pix[4];
00187 s += pix[5];
00188 s += pix[6];
00189 s += pix[7];
00190 pix += 8;
00191 }
00192 pix += line_size - 16;
00193 }
00194 return s;
00195 }
00196
00197 static int pix_norm1_c(uint8_t * pix, int line_size)
00198 {
00199 int s, i, j;
00200 uint32_t *sq = ff_squareTbl + 256;
00201
00202 s = 0;
00203 for (i = 0; i < 16; i++) {
00204 for (j = 0; j < 16; j += 8) {
00205 #if 0
00206 s += sq[pix[0]];
00207 s += sq[pix[1]];
00208 s += sq[pix[2]];
00209 s += sq[pix[3]];
00210 s += sq[pix[4]];
00211 s += sq[pix[5]];
00212 s += sq[pix[6]];
00213 s += sq[pix[7]];
00214 #else
00215 #if LONG_MAX > 2147483647
00216 register uint64_t x=*(uint64_t*)pix;
00217 s += sq[x&0xff];
00218 s += sq[(x>>8)&0xff];
00219 s += sq[(x>>16)&0xff];
00220 s += sq[(x>>24)&0xff];
00221 s += sq[(x>>32)&0xff];
00222 s += sq[(x>>40)&0xff];
00223 s += sq[(x>>48)&0xff];
00224 s += sq[(x>>56)&0xff];
00225 #else
00226 register uint32_t x=*(uint32_t*)pix;
00227 s += sq[x&0xff];
00228 s += sq[(x>>8)&0xff];
00229 s += sq[(x>>16)&0xff];
00230 s += sq[(x>>24)&0xff];
00231 x=*(uint32_t*)(pix+4);
00232 s += sq[x&0xff];
00233 s += sq[(x>>8)&0xff];
00234 s += sq[(x>>16)&0xff];
00235 s += sq[(x>>24)&0xff];
00236 #endif
00237 #endif
00238 pix += 8;
00239 }
00240 pix += line_size - 16;
00241 }
00242 return s;
00243 }
00244
00245 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
00246 int i;
00247
00248 for(i=0; i+8<=w; i+=8){
00249 dst[i+0]= bswap_32(src[i+0]);
00250 dst[i+1]= bswap_32(src[i+1]);
00251 dst[i+2]= bswap_32(src[i+2]);
00252 dst[i+3]= bswap_32(src[i+3]);
00253 dst[i+4]= bswap_32(src[i+4]);
00254 dst[i+5]= bswap_32(src[i+5]);
00255 dst[i+6]= bswap_32(src[i+6]);
00256 dst[i+7]= bswap_32(src[i+7]);
00257 }
00258 for(;i<w; i++){
00259 dst[i+0]= bswap_32(src[i+0]);
00260 }
00261 }
00262
00263 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
00264 {
00265 int s, i;
00266 uint32_t *sq = ff_squareTbl + 256;
00267
00268 s = 0;
00269 for (i = 0; i < h; i++) {
00270 s += sq[pix1[0] - pix2[0]];
00271 s += sq[pix1[1] - pix2[1]];
00272 s += sq[pix1[2] - pix2[2]];
00273 s += sq[pix1[3] - pix2[3]];
00274 pix1 += line_size;
00275 pix2 += line_size;
00276 }
00277 return s;
00278 }
00279
00280 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
00281 {
00282 int s, i;
00283 uint32_t *sq = ff_squareTbl + 256;
00284
00285 s = 0;
00286 for (i = 0; i < h; i++) {
00287 s += sq[pix1[0] - pix2[0]];
00288 s += sq[pix1[1] - pix2[1]];
00289 s += sq[pix1[2] - pix2[2]];
00290 s += sq[pix1[3] - pix2[3]];
00291 s += sq[pix1[4] - pix2[4]];
00292 s += sq[pix1[5] - pix2[5]];
00293 s += sq[pix1[6] - pix2[6]];
00294 s += sq[pix1[7] - pix2[7]];
00295 pix1 += line_size;
00296 pix2 += line_size;
00297 }
00298 return s;
00299 }
00300
00301 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
00302 {
00303 int s, i;
00304 uint32_t *sq = ff_squareTbl + 256;
00305
00306 s = 0;
00307 for (i = 0; i < h; i++) {
00308 s += sq[pix1[ 0] - pix2[ 0]];
00309 s += sq[pix1[ 1] - pix2[ 1]];
00310 s += sq[pix1[ 2] - pix2[ 2]];
00311 s += sq[pix1[ 3] - pix2[ 3]];
00312 s += sq[pix1[ 4] - pix2[ 4]];
00313 s += sq[pix1[ 5] - pix2[ 5]];
00314 s += sq[pix1[ 6] - pix2[ 6]];
00315 s += sq[pix1[ 7] - pix2[ 7]];
00316 s += sq[pix1[ 8] - pix2[ 8]];
00317 s += sq[pix1[ 9] - pix2[ 9]];
00318 s += sq[pix1[10] - pix2[10]];
00319 s += sq[pix1[11] - pix2[11]];
00320 s += sq[pix1[12] - pix2[12]];
00321 s += sq[pix1[13] - pix2[13]];
00322 s += sq[pix1[14] - pix2[14]];
00323 s += sq[pix1[15] - pix2[15]];
00324
00325 pix1 += line_size;
00326 pix2 += line_size;
00327 }
00328 return s;
00329 }
00330
00331
00332
00333 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
00334 {
00335 uint8_t *ptr, *last_line;
00336 int i;
00337
00338 last_line = buf + (height - 1) * wrap;
00339 for(i=0;i<w;i++) {
00340
00341 memcpy(buf - (i + 1) * wrap, buf, width);
00342 memcpy(last_line + (i + 1) * wrap, last_line, width);
00343 }
00344
00345 ptr = buf;
00346 for(i=0;i<height;i++) {
00347 memset(ptr - w, ptr[0], w);
00348 memset(ptr + width, ptr[width-1], w);
00349 ptr += wrap;
00350 }
00351
00352 for(i=0;i<w;i++) {
00353 memset(buf - (i + 1) * wrap - w, buf[0], w);
00354 memset(buf - (i + 1) * wrap + width, buf[width-1], w);
00355 memset(last_line + (i + 1) * wrap - w, last_line[0], w);
00356 memset(last_line + (i + 1) * wrap + width, last_line[width-1], w);
00357 }
00358 }
00359
/**
 * Motion compensation with emulated edges: copy a block_w x block_h block
 * from src into buf, where the requested rectangle may extend outside the
 * valid w x h picture area; missing pixels are produced by replicating the
 * nearest border pixel of the copied region.
 *
 * @param buf       destination (block_w x block_h, stride linesize)
 * @param src       source picture pointer at the block position (stride linesize)
 * @param linesize  stride of both buf and src, in bytes
 * @param block_w   block width
 * @param block_h   block height
 * @param src_x     x of the block within the source picture
 * @param src_y     y of the block within the source picture
 * @param w         width of the valid source area
 * @param h         height of the valid source area
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    /* Clamp the source position so the block overlaps at least one real
     * row/column; the pointer moves together with the coordinate so that
     * src + src_y*linesize + src_x stays invariant. */
    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    /* Portion of the block covered by real picture data. */
    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    /* copy the existing (in-picture) part */
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    /* top band: replicate the first copied row */
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    /* bottom band: replicate the last copied row */
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    /* left and right bands (including corners, since the top/bottom bands
     * above only span start_x..end_x): replicate the edge columns */
    for(y=0; y<block_h; y++){
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
00430
00431 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
00432 {
00433 int i;
00434
00435
00436 for(i=0;i<8;i++) {
00437 block[0] = pixels[0];
00438 block[1] = pixels[1];
00439 block[2] = pixels[2];
00440 block[3] = pixels[3];
00441 block[4] = pixels[4];
00442 block[5] = pixels[5];
00443 block[6] = pixels[6];
00444 block[7] = pixels[7];
00445 pixels += line_size;
00446 block += 8;
00447 }
00448 }
00449
00450 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
00451 const uint8_t *s2, int stride){
00452 int i;
00453
00454
00455 for(i=0;i<8;i++) {
00456 block[0] = s1[0] - s2[0];
00457 block[1] = s1[1] - s2[1];
00458 block[2] = s1[2] - s2[2];
00459 block[3] = s1[3] - s2[3];
00460 block[4] = s1[4] - s2[4];
00461 block[5] = s1[5] - s2[5];
00462 block[6] = s1[6] - s2[6];
00463 block[7] = s1[7] - s2[7];
00464 s1 += stride;
00465 s2 += stride;
00466 block += 8;
00467 }
00468 }
00469
00470
00471 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
00472 int line_size)
00473 {
00474 int i;
00475 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00476
00477
00478 for(i=0;i<8;i++) {
00479 pixels[0] = cm[block[0]];
00480 pixels[1] = cm[block[1]];
00481 pixels[2] = cm[block[2]];
00482 pixels[3] = cm[block[3]];
00483 pixels[4] = cm[block[4]];
00484 pixels[5] = cm[block[5]];
00485 pixels[6] = cm[block[6]];
00486 pixels[7] = cm[block[7]];
00487
00488 pixels += line_size;
00489 block += 8;
00490 }
00491 }
00492
00493 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
00494 int line_size)
00495 {
00496 int i;
00497 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00498
00499
00500 for(i=0;i<4;i++) {
00501 pixels[0] = cm[block[0]];
00502 pixels[1] = cm[block[1]];
00503 pixels[2] = cm[block[2]];
00504 pixels[3] = cm[block[3]];
00505
00506 pixels += line_size;
00507 block += 8;
00508 }
00509 }
00510
00511 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
00512 int line_size)
00513 {
00514 int i;
00515 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00516
00517
00518 for(i=0;i<2;i++) {
00519 pixels[0] = cm[block[0]];
00520 pixels[1] = cm[block[1]];
00521
00522 pixels += line_size;
00523 block += 8;
00524 }
00525 }
00526
00527 static void put_signed_pixels_clamped_c(const DCTELEM *block,
00528 uint8_t *restrict pixels,
00529 int line_size)
00530 {
00531 int i, j;
00532
00533 for (i = 0; i < 8; i++) {
00534 for (j = 0; j < 8; j++) {
00535 if (*block < -128)
00536 *pixels = 0;
00537 else if (*block > 127)
00538 *pixels = 255;
00539 else
00540 *pixels = (uint8_t)(*block + 128);
00541 block++;
00542 pixels++;
00543 }
00544 pixels += (line_size - 8);
00545 }
00546 }
00547
00548 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
00549 int line_size)
00550 {
00551 int i;
00552
00553
00554 for(i=0;i<8;i++) {
00555 pixels[0] = block[0];
00556 pixels[1] = block[1];
00557 pixels[2] = block[2];
00558 pixels[3] = block[3];
00559 pixels[4] = block[4];
00560 pixels[5] = block[5];
00561 pixels[6] = block[6];
00562 pixels[7] = block[7];
00563
00564 pixels += line_size;
00565 block += 8;
00566 }
00567 }
00568
00569 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
00570 int line_size)
00571 {
00572 int i;
00573 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00574
00575
00576 for(i=0;i<8;i++) {
00577 pixels[0] = cm[pixels[0] + block[0]];
00578 pixels[1] = cm[pixels[1] + block[1]];
00579 pixels[2] = cm[pixels[2] + block[2]];
00580 pixels[3] = cm[pixels[3] + block[3]];
00581 pixels[4] = cm[pixels[4] + block[4]];
00582 pixels[5] = cm[pixels[5] + block[5]];
00583 pixels[6] = cm[pixels[6] + block[6]];
00584 pixels[7] = cm[pixels[7] + block[7]];
00585 pixels += line_size;
00586 block += 8;
00587 }
00588 }
00589
00590 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
00591 int line_size)
00592 {
00593 int i;
00594 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00595
00596
00597 for(i=0;i<4;i++) {
00598 pixels[0] = cm[pixels[0] + block[0]];
00599 pixels[1] = cm[pixels[1] + block[1]];
00600 pixels[2] = cm[pixels[2] + block[2]];
00601 pixels[3] = cm[pixels[3] + block[3]];
00602 pixels += line_size;
00603 block += 8;
00604 }
00605 }
00606
00607 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
00608 int line_size)
00609 {
00610 int i;
00611 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
00612
00613
00614 for(i=0;i<2;i++) {
00615 pixels[0] = cm[pixels[0] + block[0]];
00616 pixels[1] = cm[pixels[1] + block[1]];
00617 pixels += line_size;
00618 block += 8;
00619 }
00620 }
00621
00622 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
00623 {
00624 int i;
00625 for(i=0;i<8;i++) {
00626 pixels[0] += block[0];
00627 pixels[1] += block[1];
00628 pixels[2] += block[2];
00629 pixels[3] += block[3];
00630 pixels[4] += block[4];
00631 pixels[5] += block[5];
00632 pixels[6] += block[6];
00633 pixels[7] += block[7];
00634 pixels += line_size;
00635 block += 8;
00636 }
00637 }
00638
00639 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
00640 {
00641 int i;
00642 for(i=0;i<4;i++) {
00643 pixels[0] += block[0];
00644 pixels[1] += block[1];
00645 pixels[2] += block[2];
00646 pixels[3] += block[3];
00647 pixels += line_size;
00648 block += 4;
00649 }
00650 }
00651
00652 static int sum_abs_dctelem_c(DCTELEM *block)
00653 {
00654 int sum=0, i;
00655 for(i=0; i<64; i++)
00656 sum+= FFABS(block[i]);
00657 return sum;
00658 }
00659
00660 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
00661 {
00662 int i;
00663
00664 for (i = 0; i < h; i++) {
00665 memset(block, value, 16);
00666 block += line_size;
00667 }
00668 }
00669
00670 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
00671 {
00672 int i;
00673
00674 for (i = 0; i < h; i++) {
00675 memset(block, value, 8);
00676 block += line_size;
00677 }
00678 }
00679
00680 static void scale_block_c(const uint8_t src[64], uint8_t *dst, int linesize)
00681 {
00682 int i, j;
00683 uint16_t *dst1 = (uint16_t *) dst;
00684 uint16_t *dst2 = (uint16_t *)(dst + linesize);
00685
00686 for (j = 0; j < 8; j++) {
00687 for (i = 0; i < 8; i++) {
00688 dst1[i] = dst2[i] = src[i] * 0x0101;
00689 }
00690 src += 8;
00691 dst1 += linesize;
00692 dst2 += linesize;
00693 }
00694 }
00695
00696 #if 0
00697
00698 #define PIXOP2(OPNAME, OP) \
00699 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
00700 {\
00701 int i;\
00702 for(i=0; i<h; i++){\
00703 OP(*((uint64_t*)block), AV_RN64(pixels));\
00704 pixels+=line_size;\
00705 block +=line_size;\
00706 }\
00707 }\
00708 \
00709 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
00710 {\
00711 int i;\
00712 for(i=0; i<h; i++){\
00713 const uint64_t a= AV_RN64(pixels );\
00714 const uint64_t b= AV_RN64(pixels+1);\
00715 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
00716 pixels+=line_size;\
00717 block +=line_size;\
00718 }\
00719 }\
00720 \
00721 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
00722 {\
00723 int i;\
00724 for(i=0; i<h; i++){\
00725 const uint64_t a= AV_RN64(pixels );\
00726 const uint64_t b= AV_RN64(pixels+1);\
00727 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
00728 pixels+=line_size;\
00729 block +=line_size;\
00730 }\
00731 }\
00732 \
00733 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
00734 {\
00735 int i;\
00736 for(i=0; i<h; i++){\
00737 const uint64_t a= AV_RN64(pixels );\
00738 const uint64_t b= AV_RN64(pixels+line_size);\
00739 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
00740 pixels+=line_size;\
00741 block +=line_size;\
00742 }\
00743 }\
00744 \
00745 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
00746 {\
00747 int i;\
00748 for(i=0; i<h; i++){\
00749 const uint64_t a= AV_RN64(pixels );\
00750 const uint64_t b= AV_RN64(pixels+line_size);\
00751 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
00752 pixels+=line_size;\
00753 block +=line_size;\
00754 }\
00755 }\
00756 \
00757 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
00758 {\
00759 int i;\
00760 const uint64_t a= AV_RN64(pixels );\
00761 const uint64_t b= AV_RN64(pixels+1);\
00762 uint64_t l0= (a&0x0303030303030303ULL)\
00763 + (b&0x0303030303030303ULL)\
00764 + 0x0202020202020202ULL;\
00765 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
00766 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
00767 uint64_t l1,h1;\
00768 \
00769 pixels+=line_size;\
00770 for(i=0; i<h; i+=2){\
00771 uint64_t a= AV_RN64(pixels );\
00772 uint64_t b= AV_RN64(pixels+1);\
00773 l1= (a&0x0303030303030303ULL)\
00774 + (b&0x0303030303030303ULL);\
00775 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
00776 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
00777 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
00778 pixels+=line_size;\
00779 block +=line_size;\
00780 a= AV_RN64(pixels );\
00781 b= AV_RN64(pixels+1);\
00782 l0= (a&0x0303030303030303ULL)\
00783 + (b&0x0303030303030303ULL)\
00784 + 0x0202020202020202ULL;\
00785 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
00786 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
00787 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
00788 pixels+=line_size;\
00789 block +=line_size;\
00790 }\
00791 }\
00792 \
00793 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
00794 {\
00795 int i;\
00796 const uint64_t a= AV_RN64(pixels );\
00797 const uint64_t b= AV_RN64(pixels+1);\
00798 uint64_t l0= (a&0x0303030303030303ULL)\
00799 + (b&0x0303030303030303ULL)\
00800 + 0x0101010101010101ULL;\
00801 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
00802 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
00803 uint64_t l1,h1;\
00804 \
00805 pixels+=line_size;\
00806 for(i=0; i<h; i+=2){\
00807 uint64_t a= AV_RN64(pixels );\
00808 uint64_t b= AV_RN64(pixels+1);\
00809 l1= (a&0x0303030303030303ULL)\
00810 + (b&0x0303030303030303ULL);\
00811 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
00812 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
00813 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
00814 pixels+=line_size;\
00815 block +=line_size;\
00816 a= AV_RN64(pixels );\
00817 b= AV_RN64(pixels+1);\
00818 l0= (a&0x0303030303030303ULL)\
00819 + (b&0x0303030303030303ULL)\
00820 + 0x0101010101010101ULL;\
00821 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
00822 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
00823 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
00824 pixels+=line_size;\
00825 block +=line_size;\
00826 }\
00827 }\
00828 \
00829 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
00830 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
00831 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
00832 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
00833 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
00834 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
00835 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
00836
00837 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
00838 #else // 64 bit variant
00839
00840 #define PIXOP2(OPNAME, OP) \
00841 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00842 int i;\
00843 for(i=0; i<h; i++){\
00844 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
00845 pixels+=line_size;\
00846 block +=line_size;\
00847 }\
00848 }\
00849 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00850 int i;\
00851 for(i=0; i<h; i++){\
00852 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
00853 pixels+=line_size;\
00854 block +=line_size;\
00855 }\
00856 }\
00857 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00858 int i;\
00859 for(i=0; i<h; i++){\
00860 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
00861 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
00862 pixels+=line_size;\
00863 block +=line_size;\
00864 }\
00865 }\
00866 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00867 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
00868 }\
00869 \
00870 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
00871 int src_stride1, int src_stride2, int h){\
00872 int i;\
00873 for(i=0; i<h; i++){\
00874 uint32_t a,b;\
00875 a= AV_RN32(&src1[i*src_stride1 ]);\
00876 b= AV_RN32(&src2[i*src_stride2 ]);\
00877 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
00878 a= AV_RN32(&src1[i*src_stride1+4]);\
00879 b= AV_RN32(&src2[i*src_stride2+4]);\
00880 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
00881 }\
00882 }\
00883 \
00884 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
00885 int src_stride1, int src_stride2, int h){\
00886 int i;\
00887 for(i=0; i<h; i++){\
00888 uint32_t a,b;\
00889 a= AV_RN32(&src1[i*src_stride1 ]);\
00890 b= AV_RN32(&src2[i*src_stride2 ]);\
00891 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
00892 a= AV_RN32(&src1[i*src_stride1+4]);\
00893 b= AV_RN32(&src2[i*src_stride2+4]);\
00894 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
00895 }\
00896 }\
00897 \
00898 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
00899 int src_stride1, int src_stride2, int h){\
00900 int i;\
00901 for(i=0; i<h; i++){\
00902 uint32_t a,b;\
00903 a= AV_RN32(&src1[i*src_stride1 ]);\
00904 b= AV_RN32(&src2[i*src_stride2 ]);\
00905 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
00906 }\
00907 }\
00908 \
00909 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
00910 int src_stride1, int src_stride2, int h){\
00911 int i;\
00912 for(i=0; i<h; i++){\
00913 uint32_t a,b;\
00914 a= AV_RN16(&src1[i*src_stride1 ]);\
00915 b= AV_RN16(&src2[i*src_stride2 ]);\
00916 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
00917 }\
00918 }\
00919 \
00920 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
00921 int src_stride1, int src_stride2, int h){\
00922 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
00923 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
00924 }\
00925 \
00926 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
00927 int src_stride1, int src_stride2, int h){\
00928 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
00929 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
00930 }\
00931 \
00932 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00933 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
00934 }\
00935 \
00936 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00937 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
00938 }\
00939 \
00940 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00941 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
00942 }\
00943 \
00944 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00945 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
00946 }\
00947 \
00948 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
00949 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
00950 int i;\
00951 for(i=0; i<h; i++){\
00952 uint32_t a, b, c, d, l0, l1, h0, h1;\
00953 a= AV_RN32(&src1[i*src_stride1]);\
00954 b= AV_RN32(&src2[i*src_stride2]);\
00955 c= AV_RN32(&src3[i*src_stride3]);\
00956 d= AV_RN32(&src4[i*src_stride4]);\
00957 l0= (a&0x03030303UL)\
00958 + (b&0x03030303UL)\
00959 + 0x02020202UL;\
00960 h0= ((a&0xFCFCFCFCUL)>>2)\
00961 + ((b&0xFCFCFCFCUL)>>2);\
00962 l1= (c&0x03030303UL)\
00963 + (d&0x03030303UL);\
00964 h1= ((c&0xFCFCFCFCUL)>>2)\
00965 + ((d&0xFCFCFCFCUL)>>2);\
00966 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
00967 a= AV_RN32(&src1[i*src_stride1+4]);\
00968 b= AV_RN32(&src2[i*src_stride2+4]);\
00969 c= AV_RN32(&src3[i*src_stride3+4]);\
00970 d= AV_RN32(&src4[i*src_stride4+4]);\
00971 l0= (a&0x03030303UL)\
00972 + (b&0x03030303UL)\
00973 + 0x02020202UL;\
00974 h0= ((a&0xFCFCFCFCUL)>>2)\
00975 + ((b&0xFCFCFCFCUL)>>2);\
00976 l1= (c&0x03030303UL)\
00977 + (d&0x03030303UL);\
00978 h1= ((c&0xFCFCFCFCUL)>>2)\
00979 + ((d&0xFCFCFCFCUL)>>2);\
00980 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
00981 }\
00982 }\
00983 \
00984 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00985 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
00986 }\
00987 \
00988 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00989 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
00990 }\
00991 \
00992 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00993 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
00994 }\
00995 \
00996 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
00997 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
00998 }\
00999 \
01000 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
01001 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
01002 int i;\
01003 for(i=0; i<h; i++){\
01004 uint32_t a, b, c, d, l0, l1, h0, h1;\
01005 a= AV_RN32(&src1[i*src_stride1]);\
01006 b= AV_RN32(&src2[i*src_stride2]);\
01007 c= AV_RN32(&src3[i*src_stride3]);\
01008 d= AV_RN32(&src4[i*src_stride4]);\
01009 l0= (a&0x03030303UL)\
01010 + (b&0x03030303UL)\
01011 + 0x01010101UL;\
01012 h0= ((a&0xFCFCFCFCUL)>>2)\
01013 + ((b&0xFCFCFCFCUL)>>2);\
01014 l1= (c&0x03030303UL)\
01015 + (d&0x03030303UL);\
01016 h1= ((c&0xFCFCFCFCUL)>>2)\
01017 + ((d&0xFCFCFCFCUL)>>2);\
01018 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
01019 a= AV_RN32(&src1[i*src_stride1+4]);\
01020 b= AV_RN32(&src2[i*src_stride2+4]);\
01021 c= AV_RN32(&src3[i*src_stride3+4]);\
01022 d= AV_RN32(&src4[i*src_stride4+4]);\
01023 l0= (a&0x03030303UL)\
01024 + (b&0x03030303UL)\
01025 + 0x01010101UL;\
01026 h0= ((a&0xFCFCFCFCUL)>>2)\
01027 + ((b&0xFCFCFCFCUL)>>2);\
01028 l1= (c&0x03030303UL)\
01029 + (d&0x03030303UL);\
01030 h1= ((c&0xFCFCFCFCUL)>>2)\
01031 + ((d&0xFCFCFCFCUL)>>2);\
01032 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
01033 }\
01034 }\
01035 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
01036 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
01037 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
01038 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
01039 }\
01040 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
01041 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
01042 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
01043 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
01044 }\
01045 \
01046 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
01047 {\
01048 int i, a0, b0, a1, b1;\
01049 a0= pixels[0];\
01050 b0= pixels[1] + 2;\
01051 a0 += b0;\
01052 b0 += pixels[2];\
01053 \
01054 pixels+=line_size;\
01055 for(i=0; i<h; i+=2){\
01056 a1= pixels[0];\
01057 b1= pixels[1];\
01058 a1 += b1;\
01059 b1 += pixels[2];\
01060 \
01061 block[0]= (a1+a0)>>2; \
01062 block[1]= (b1+b0)>>2;\
01063 \
01064 pixels+=line_size;\
01065 block +=line_size;\
01066 \
01067 a0= pixels[0];\
01068 b0= pixels[1] + 2;\
01069 a0 += b0;\
01070 b0 += pixels[2];\
01071 \
01072 block[0]= (a1+a0)>>2;\
01073 block[1]= (b1+b0)>>2;\
01074 pixels+=line_size;\
01075 block +=line_size;\
01076 }\
01077 }\
01078 \
01079 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
01080 {\
01081 int i;\
01082 const uint32_t a= AV_RN32(pixels );\
01083 const uint32_t b= AV_RN32(pixels+1);\
01084 uint32_t l0= (a&0x03030303UL)\
01085 + (b&0x03030303UL)\
01086 + 0x02020202UL;\
01087 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
01088 + ((b&0xFCFCFCFCUL)>>2);\
01089 uint32_t l1,h1;\
01090 \
01091 pixels+=line_size;\
01092 for(i=0; i<h; i+=2){\
01093 uint32_t a= AV_RN32(pixels );\
01094 uint32_t b= AV_RN32(pixels+1);\
01095 l1= (a&0x03030303UL)\
01096 + (b&0x03030303UL);\
01097 h1= ((a&0xFCFCFCFCUL)>>2)\
01098 + ((b&0xFCFCFCFCUL)>>2);\
01099 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
01100 pixels+=line_size;\
01101 block +=line_size;\
01102 a= AV_RN32(pixels );\
01103 b= AV_RN32(pixels+1);\
01104 l0= (a&0x03030303UL)\
01105 + (b&0x03030303UL)\
01106 + 0x02020202UL;\
01107 h0= ((a&0xFCFCFCFCUL)>>2)\
01108 + ((b&0xFCFCFCFCUL)>>2);\
01109 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
01110 pixels+=line_size;\
01111 block +=line_size;\
01112 }\
01113 }\
01114 \
01115 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
01116 {\
01117 int j;\
01118 for(j=0; j<2; j++){\
01119 int i;\
01120 const uint32_t a= AV_RN32(pixels );\
01121 const uint32_t b= AV_RN32(pixels+1);\
01122 uint32_t l0= (a&0x03030303UL)\
01123 + (b&0x03030303UL)\
01124 + 0x02020202UL;\
01125 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
01126 + ((b&0xFCFCFCFCUL)>>2);\
01127 uint32_t l1,h1;\
01128 \
01129 pixels+=line_size;\
01130 for(i=0; i<h; i+=2){\
01131 uint32_t a= AV_RN32(pixels );\
01132 uint32_t b= AV_RN32(pixels+1);\
01133 l1= (a&0x03030303UL)\
01134 + (b&0x03030303UL);\
01135 h1= ((a&0xFCFCFCFCUL)>>2)\
01136 + ((b&0xFCFCFCFCUL)>>2);\
01137 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
01138 pixels+=line_size;\
01139 block +=line_size;\
01140 a= AV_RN32(pixels );\
01141 b= AV_RN32(pixels+1);\
01142 l0= (a&0x03030303UL)\
01143 + (b&0x03030303UL)\
01144 + 0x02020202UL;\
01145 h0= ((a&0xFCFCFCFCUL)>>2)\
01146 + ((b&0xFCFCFCFCUL)>>2);\
01147 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
01148 pixels+=line_size;\
01149 block +=line_size;\
01150 }\
01151 pixels+=4-line_size*(h+1);\
01152 block +=4-line_size*h;\
01153 }\
01154 }\
01155 \
01156 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
01157 {\
01158 int j;\
01159 for(j=0; j<2; j++){\
01160 int i;\
01161 const uint32_t a= AV_RN32(pixels );\
01162 const uint32_t b= AV_RN32(pixels+1);\
01163 uint32_t l0= (a&0x03030303UL)\
01164 + (b&0x03030303UL)\
01165 + 0x01010101UL;\
01166 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
01167 + ((b&0xFCFCFCFCUL)>>2);\
01168 uint32_t l1,h1;\
01169 \
01170 pixels+=line_size;\
01171 for(i=0; i<h; i+=2){\
01172 uint32_t a= AV_RN32(pixels );\
01173 uint32_t b= AV_RN32(pixels+1);\
01174 l1= (a&0x03030303UL)\
01175 + (b&0x03030303UL);\
01176 h1= ((a&0xFCFCFCFCUL)>>2)\
01177 + ((b&0xFCFCFCFCUL)>>2);\
01178 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
01179 pixels+=line_size;\
01180 block +=line_size;\
01181 a= AV_RN32(pixels );\
01182 b= AV_RN32(pixels+1);\
01183 l0= (a&0x03030303UL)\
01184 + (b&0x03030303UL)\
01185 + 0x01010101UL;\
01186 h0= ((a&0xFCFCFCFCUL)>>2)\
01187 + ((b&0xFCFCFCFCUL)>>2);\
01188 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
01189 pixels+=line_size;\
01190 block +=line_size;\
01191 }\
01192 pixels+=4-line_size*(h+1);\
01193 block +=4-line_size*h;\
01194 }\
01195 }\
01196 \
01197 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
01198 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
01199 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
01200 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
01201 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
01202 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
01203 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
01204 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
01205
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif  /* NOTE(review): closes a conditional opened above this chunk — verify pairing */
#define op_put(a, b) a = b

/* Instantiate the full family of pixel copy/average primitives for both
 * store operators (averaging and plain put). */
PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put

/* Rounding 2-way and 4-way scalar averages used by the chroma MC code below. */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
01217
/* Bind the generic three-stride 16-wide no-rounding averager to the common
 * case where dst and both sources share a single stride. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
01221
/* Bind the generic three-stride 8-wide no-rounding averager to the common
 * case where dst and both sources share a single stride. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
01225
/**
 * 1-warp-point global motion compensation (MPEG-4 GMC) for an
 * 8-pixel-wide block: bilinear interpolation of each output sample from
 * a 2x2 source neighbourhood with 1/16-pel weights.
 *
 * @param x16,y16  fractional position in 1/16 pel (0..16)
 * @param rounder  value added before the final >>8
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            /* weights A+B+C+D == 256, hence the >>8 normalisation */
            dst[col] = (A * src[col]          + B * src[col + 1] +
                        C * src[stride + col] + D * src[stride + col + 1] +
                        rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
01248
/**
 * Global motion compensation with a full affine transform.  For each of
 * the 8 output columns per row the source position is tracked in fixed
 * point (vx, vy advance by dxx/dyx per column, ox/oy by dxy/dyy per row),
 * split into an integer pixel position plus a sub-pel fraction, and the
 * output sample is produced by bilinear interpolation.  Positions outside
 * the picture are clamped to the nearest edge sample, degrading the
 * interpolation to 1-D or a plain copy at the borders.
 *
 * @param shift  fractional precision of the motion vectors; s = 1<<shift
 * @param r      rounding constant added before the final >>(shift*2)
 * @param width  picture width in pixels
 * @param height picture height in pixels
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;   /* number of fractional positions per pixel */

    /* switch to inclusive maximum coordinates for the range tests below */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){
            int src_x, src_y, frac_x, frac_y, index;

            /* NOTE(review): vx/vy look like 16.16-style fixed point; after
             * >>16 the low 'shift' bits are the sub-pel fraction — confirm
             * against the callers' vector encoding. */
            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside the picture: 2-D bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp y, interpolate in x only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index  ]*(s-frac_x)
                                          + src[index+1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp x, interpolate in y only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index       ]*(s-frac_y)
                                          + src[index+stride]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* both outside: plain clamped copy, no interpolation */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
01306
/* Full-pel thirdpel copy: dispatch on block width to the plain copy of
 * matching size; unsupported widths are silently ignored. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
01315
/* Thirdpel MC, horizontal offset 1/3: out = round((2*a + b)/3),
 * computed in fixed point as (683*(2*a + b + 1)) >> 11. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683*(2*src[x] + src[x+1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
01326
/* Thirdpel MC, horizontal offset 2/3: out = round((a + 2*b)/3),
 * computed in fixed point as (683*(a + 2*b + 1)) >> 11. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683*(src[x] + 2*src[x+1] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
01337
/* Thirdpel MC, vertical offset 1/3: out = round((2*top + bottom)/3),
 * computed in fixed point as (683*(2*top + bottom + 1)) >> 11. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683*(2*src[x] + src[x+stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
01348
/* Thirdpel MC at (1/3, 1/3): weighted 2x2 bilinear, weights 4/3/3/2 over
 * a 12-denominator, computed as (2731*(...+6)) >> 15. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731*(4*src[x] + 3*src[x+1] +
                            3*src[x+stride] + 2*src[x+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
01359
/* Thirdpel MC at (1/3, 2/3): weighted 2x2 bilinear, weights 3/2/4/3,
 * computed as (2731*(...+6)) >> 15. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731*(3*src[x] + 2*src[x+1] +
                            4*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
01370
/* Thirdpel MC, vertical offset 2/3: out = round((top + 2*bottom)/3),
 * computed in fixed point as (683*(top + 2*bottom + 1)) >> 11. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (683*(src[x] + 2*src[x+stride] + 1)) >> 11;
        src += stride;
        dst += stride;
    }
}
01381
/* Thirdpel MC at (2/3, 1/3): weighted 2x2 bilinear, weights 3/4/2/3,
 * computed as (2731*(...+6)) >> 15. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731*(3*src[x] + 4*src[x+1] +
                            2*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
01392
/* Thirdpel MC at (2/3, 2/3): weighted 2x2 bilinear, weights 2/3/3/4,
 * computed as (2731*(...+6)) >> 15. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (2731*(2*src[x] + 3*src[x+1] +
                            3*src[x+stride] + 4*src[x+stride+1] + 6)) >> 15;
        src += stride;
        dst += stride;
    }
}
01403
/* Full-pel thirdpel average: dispatch on block width to the plain
 * averager of matching size; unsupported widths are silently ignored. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c(dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c(dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c(dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
01412
/* Thirdpel MC at horizontal 1/3 with rounding average against the
 * existing destination: dst = (dst + filtered + 1) >> 1. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683*(2*src[x] + src[x+1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
01423
/* Thirdpel MC at horizontal 2/3 with rounding average against the
 * existing destination: dst = (dst + filtered + 1) >> 1. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683*(src[x] + 2*src[x+1] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
01434
/* Thirdpel MC at vertical 1/3 with rounding average against the
 * existing destination: dst = (dst + filtered + 1) >> 1. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683*(2*src[x] + src[x+stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
01445
/* Thirdpel MC at (1/3, 1/3) with rounding average against the existing
 * destination; same 4/3/3/2 weighting as the put variant. */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731*(4*src[x] + 3*src[x+1] +
                                       3*src[x+stride] + 2*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
01456
/* Thirdpel MC at (1/3, 2/3) with rounding average against the existing
 * destination; same 3/2/4/3 weighting as the put variant. */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731*(3*src[x] + 2*src[x+1] +
                                       4*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
01467
/* Thirdpel MC at vertical 2/3 with rounding average against the
 * existing destination: dst = (dst + filtered + 1) >> 1. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((683*(src[x] + 2*src[x+stride] + 1)) >> 11) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
01478
/* Thirdpel MC at (2/3, 1/3) with rounding average against the existing
 * destination; same 3/4/2/3 weighting as the put variant. */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731*(3*src[x] + 4*src[x+1] +
                                       2*src[x+stride] + 3*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
01489
/* Thirdpel MC at (2/3, 2/3) with rounding average against the existing
 * destination; same 2/3/3/4 weighting as the put variant. */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int x, y;
    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (dst[x] + ((2731*(2*src[x] + 3*src[x+1] +
                                       3*src[x+stride] + 4*src[x+stride+1] + 6)) >> 15) + 1) >> 1;
        src += stride;
        dst += stride;
    }
}
#if 0 /* dead code: disabled TPEL_WIDTH wrappers; the bodies are not even
       * valid C (note the stray 'void' before each call) — kept only as
       * a historical reference */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    void put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#endif
01521
/**
 * Generator for the H.264 chroma motion-compensation C functions for
 * block widths 2, 4 and 8.  x and y are the eighth-pel fractional
 * offsets (0..7); the four bilinear weights A..D always sum to 64.
 * OP is a put or avg store macro that performs the final rounding.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        /* generic 2-D bilinear case */\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        /* x or y is 0: only two taps remain, 1-D filter along one axis */\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        /* generic 2-D bilinear case */\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        /* x or y is 0: only two taps remain, 1-D filter along one axis */\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
    \
    if(D){\
        /* generic 2-D bilinear case */\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        /* x or y is 0: only two taps remain, 1-D filter along one axis */\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
01622
/* The A..D weights above sum to 64, so (v + 32) >> 6 is a rounded
 * 6-bit normalisation; op_avg additionally averages with dst. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
01630
/**
 * VC-1 8-wide chroma MC, "no rounding" variant: same eighth-pel
 * bilinear filter as the H.264 chroma MC but with a bias of
 * 32 - 4 = 28 before the >>6 normalisation.
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){
    const int A = (8 - x) * (8 - y);
    const int B = (    x) * (8 - y);
    const int C = (8 - x) * (    y);
    const int D = (    x) * (    y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            dst[col] = (A * src[col]          + B * src[col + 1] +
                        C * src[stride + col] + D * src[stride + col + 1] +
                        32 - 4) >> 6;
        }
        dst += stride;
        src += stride;
    }
}
01654
/**
 * VC-1 8-wide chroma MC, "no rounding" variant with rounding average
 * against the existing destination.  The avg2() helper is expanded
 * inline: avg2(a,b) == (a + b + 1) >> 1.
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y){
    const int A = (8 - x) * (8 - y);
    const int B = (    x) * (8 - y);
    const int C = (8 - x) * (    y);
    const int D = (    x) * (    y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            int t = (A * src[col]          + B * src[col + 1] +
                     C * src[stride + col] + D * src[stride + col + 1] +
                     32 - 4) >> 6;
            dst[col] = (dst[col] + t + 1) >> 1;
        }
        dst += stride;
        src += stride;
    }
}
01678
01679 #define QPEL_MC(r, OPNAME, RND, OP) \
01680 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01681 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
01682 int i;\
01683 for(i=0; i<h; i++)\
01684 {\
01685 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
01686 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
01687 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
01688 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
01689 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
01690 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
01691 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
01692 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
01693 dst+=dstStride;\
01694 src+=srcStride;\
01695 }\
01696 }\
01697 \
01698 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01699 const int w=8;\
01700 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
01701 int i;\
01702 for(i=0; i<w; i++)\
01703 {\
01704 const int src0= src[0*srcStride];\
01705 const int src1= src[1*srcStride];\
01706 const int src2= src[2*srcStride];\
01707 const int src3= src[3*srcStride];\
01708 const int src4= src[4*srcStride];\
01709 const int src5= src[5*srcStride];\
01710 const int src6= src[6*srcStride];\
01711 const int src7= src[7*srcStride];\
01712 const int src8= src[8*srcStride];\
01713 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
01714 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
01715 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
01716 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
01717 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
01718 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
01719 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
01720 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
01721 dst++;\
01722 src++;\
01723 }\
01724 }\
01725 \
01726 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01727 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
01728 int i;\
01729 \
01730 for(i=0; i<h; i++)\
01731 {\
01732 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
01733 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
01734 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
01735 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
01736 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
01737 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
01738 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
01739 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
01740 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
01741 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
01742 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
01743 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
01744 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
01745 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
01746 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
01747 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
01748 dst+=dstStride;\
01749 src+=srcStride;\
01750 }\
01751 }\
01752 \
01753 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01754 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
01755 int i;\
01756 const int w=16;\
01757 for(i=0; i<w; i++)\
01758 {\
01759 const int src0= src[0*srcStride];\
01760 const int src1= src[1*srcStride];\
01761 const int src2= src[2*srcStride];\
01762 const int src3= src[3*srcStride];\
01763 const int src4= src[4*srcStride];\
01764 const int src5= src[5*srcStride];\
01765 const int src6= src[6*srcStride];\
01766 const int src7= src[7*srcStride];\
01767 const int src8= src[8*srcStride];\
01768 const int src9= src[9*srcStride];\
01769 const int src10= src[10*srcStride];\
01770 const int src11= src[11*srcStride];\
01771 const int src12= src[12*srcStride];\
01772 const int src13= src[13*srcStride];\
01773 const int src14= src[14*srcStride];\
01774 const int src15= src[15*srcStride];\
01775 const int src16= src[16*srcStride];\
01776 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
01777 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
01778 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
01779 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
01780 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
01781 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
01782 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
01783 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
01784 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
01785 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
01786 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
01787 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
01788 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
01789 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
01790 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
01791 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
01792 dst++;\
01793 src++;\
01794 }\
01795 }\
01796 \
01797 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
01798 OPNAME ## pixels8_c(dst, src, stride, 8);\
01799 }\
01800 \
01801 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
01802 uint8_t half[64];\
01803 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
01804 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
01805 }\
01806 \
01807 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
01808 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
01809 }\
01810 \
01811 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
01812 uint8_t half[64];\
01813 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
01814 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
01815 }\
01816 \
01817 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
01818 uint8_t full[16*9];\
01819 uint8_t half[64];\
01820 copy_block9(full, src, 16, stride, 9);\
01821 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
01822 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
01823 }\
01824 \
01825 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
01826 uint8_t full[16*9];\
01827 copy_block9(full, src, 16, stride, 9);\
01828 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
01829 }\
01830 \
01831 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
01832 uint8_t full[16*9];\
01833 uint8_t half[64];\
01834 copy_block9(full, src, 16, stride, 9);\
01835 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
01836 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
01837 }\
01838 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
01839 uint8_t full[16*9];\
01840 uint8_t halfH[72];\
01841 uint8_t halfV[64];\
01842 uint8_t halfHV[64];\
01843 copy_block9(full, src, 16, stride, 9);\
01844 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01845 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
01846 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01847 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
01848 }\
01849 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
01850 uint8_t full[16*9];\
01851 uint8_t halfH[72];\
01852 uint8_t halfHV[64];\
01853 copy_block9(full, src, 16, stride, 9);\
01854 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01855 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
01856 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01857 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
01858 }\
01859 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
01860 uint8_t full[16*9];\
01861 uint8_t halfH[72];\
01862 uint8_t halfV[64];\
01863 uint8_t halfHV[64];\
01864 copy_block9(full, src, 16, stride, 9);\
01865 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01866 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
01867 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01868 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
01869 }\
01870 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
01871 uint8_t full[16*9];\
01872 uint8_t halfH[72];\
01873 uint8_t halfHV[64];\
01874 copy_block9(full, src, 16, stride, 9);\
01875 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01876 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
01877 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01878 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
01879 }\
01880 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
01881 uint8_t full[16*9];\
01882 uint8_t halfH[72];\
01883 uint8_t halfV[64];\
01884 uint8_t halfHV[64];\
01885 copy_block9(full, src, 16, stride, 9);\
01886 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01887 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
01888 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01889 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
01890 }\
01891 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
01892 uint8_t full[16*9];\
01893 uint8_t halfH[72];\
01894 uint8_t halfHV[64];\
01895 copy_block9(full, src, 16, stride, 9);\
01896 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01897 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
01898 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01899 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
01900 }\
01901 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
01902 uint8_t full[16*9];\
01903 uint8_t halfH[72];\
01904 uint8_t halfV[64];\
01905 uint8_t halfHV[64];\
01906 copy_block9(full, src, 16, stride, 9);\
01907 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
01908 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
01909 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01910 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
01911 }\
01912 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
01913 uint8_t full[16*9];\
01914 uint8_t halfH[72];\
01915 uint8_t halfHV[64];\
01916 copy_block9(full, src, 16, stride, 9);\
01917 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01918 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
01919 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01920 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
01921 }\
01922 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
01923 uint8_t halfH[72];\
01924 uint8_t halfHV[64];\
01925 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
01926 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01927 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
01928 }\
01929 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
01930 uint8_t halfH[72];\
01931 uint8_t halfHV[64];\
01932 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
01933 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01934 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
01935 }\
01936 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
01937 uint8_t full[16*9];\
01938 uint8_t halfH[72];\
01939 uint8_t halfV[64];\
01940 uint8_t halfHV[64];\
01941 copy_block9(full, src, 16, stride, 9);\
01942 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01943 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
01944 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01945 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
01946 }\
01947 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
01948 uint8_t full[16*9];\
01949 uint8_t halfH[72];\
01950 copy_block9(full, src, 16, stride, 9);\
01951 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01952 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
01953 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
01954 }\
01955 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
01956 uint8_t full[16*9];\
01957 uint8_t halfH[72];\
01958 uint8_t halfV[64];\
01959 uint8_t halfHV[64];\
01960 copy_block9(full, src, 16, stride, 9);\
01961 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01962 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
01963 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
01964 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
01965 }\
01966 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
01967 uint8_t full[16*9];\
01968 uint8_t halfH[72];\
01969 copy_block9(full, src, 16, stride, 9);\
01970 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
01971 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
01972 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
01973 }\
01974 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
01975 uint8_t halfH[72];\
01976 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
01977 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
01978 }\
01979 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
01980 OPNAME ## pixels16_c(dst, src, stride, 16);\
01981 }\
01982 \
01983 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
01984 uint8_t half[256];\
01985 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
01986 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
01987 }\
01988 \
01989 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
01990 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
01991 }\
01992 \
01993 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
01994 uint8_t half[256];\
01995 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
01996 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
01997 }\
01998 \
01999 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
02000 uint8_t full[24*17];\
02001 uint8_t half[256];\
02002 copy_block17(full, src, 24, stride, 17);\
02003 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
02004 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
02005 }\
02006 \
02007 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
02008 uint8_t full[24*17];\
02009 copy_block17(full, src, 24, stride, 17);\
02010 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
02011 }\
02012 \
02013 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
02014 uint8_t full[24*17];\
02015 uint8_t half[256];\
02016 copy_block17(full, src, 24, stride, 17);\
02017 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
02018 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
02019 }\
02020 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
02021 uint8_t full[24*17];\
02022 uint8_t halfH[272];\
02023 uint8_t halfV[256];\
02024 uint8_t halfHV[256];\
02025 copy_block17(full, src, 24, stride, 17);\
02026 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02027 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
02028 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02029 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
02030 }\
02031 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
02032 uint8_t full[24*17];\
02033 uint8_t halfH[272];\
02034 uint8_t halfHV[256];\
02035 copy_block17(full, src, 24, stride, 17);\
02036 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02037 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
02038 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02039 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
02040 }\
02041 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
02042 uint8_t full[24*17];\
02043 uint8_t halfH[272];\
02044 uint8_t halfV[256];\
02045 uint8_t halfHV[256];\
02046 copy_block17(full, src, 24, stride, 17);\
02047 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02048 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
02049 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02050 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
02051 }\
02052 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
02053 uint8_t full[24*17];\
02054 uint8_t halfH[272];\
02055 uint8_t halfHV[256];\
02056 copy_block17(full, src, 24, stride, 17);\
02057 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02058 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
02059 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02060 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
02061 }\
02062 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
02063 uint8_t full[24*17];\
02064 uint8_t halfH[272];\
02065 uint8_t halfV[256];\
02066 uint8_t halfHV[256];\
02067 copy_block17(full, src, 24, stride, 17);\
02068 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02069 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
02070 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02071 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
02072 }\
02073 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
02074 uint8_t full[24*17];\
02075 uint8_t halfH[272];\
02076 uint8_t halfHV[256];\
02077 copy_block17(full, src, 24, stride, 17);\
02078 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02079 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
02080 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02081 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
02082 }\
02083 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
02084 uint8_t full[24*17];\
02085 uint8_t halfH[272];\
02086 uint8_t halfV[256];\
02087 uint8_t halfHV[256];\
02088 copy_block17(full, src, 24, stride, 17);\
02089 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
02090 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
02091 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02092 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
02093 }\
02094 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
02095 uint8_t full[24*17];\
02096 uint8_t halfH[272];\
02097 uint8_t halfHV[256];\
02098 copy_block17(full, src, 24, stride, 17);\
02099 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02100 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
02101 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02102 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
02103 }\
02104 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
02105 uint8_t halfH[272];\
02106 uint8_t halfHV[256];\
02107 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
02108 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02109 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
02110 }\
02111 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
02112 uint8_t halfH[272];\
02113 uint8_t halfHV[256];\
02114 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
02115 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02116 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
02117 }\
02118 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
02119 uint8_t full[24*17];\
02120 uint8_t halfH[272];\
02121 uint8_t halfV[256];\
02122 uint8_t halfHV[256];\
02123 copy_block17(full, src, 24, stride, 17);\
02124 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02125 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
02126 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02127 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
02128 }\
02129 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
02130 uint8_t full[24*17];\
02131 uint8_t halfH[272];\
02132 copy_block17(full, src, 24, stride, 17);\
02133 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02134 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
02135 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
02136 }\
02137 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
02138 uint8_t full[24*17];\
02139 uint8_t halfH[272];\
02140 uint8_t halfV[256];\
02141 uint8_t halfHV[256];\
02142 copy_block17(full, src, 24, stride, 17);\
02143 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02144 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
02145 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
02146 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
02147 }\
02148 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
02149 uint8_t full[24*17];\
02150 uint8_t halfH[272];\
02151 copy_block17(full, src, 24, stride, 17);\
02152 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
02153 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
02154 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
02155 }\
02156 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
02157 uint8_t halfH[272];\
02158 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
02159 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
02160 }
02161
/* Store/average operators plugged into QPEL_MC through its OP parameter.
 * 'b' is the raw 6-tap filter sum (taps total 32, see the *20/*6(-5)/*3(1)
 * terms in the lowpass kernels), so cm[((b) + 16)>>5] rounds to nearest and
 * clamps to 0..255 via the crop table ('cm' in each kernel points
 * MAX_NEG_CROP bytes into ff_cropTbl, so under/overshooting sums index
 * safely).  The _no_rnd variants bias by 15 instead of 16, i.e. round the
 * halfway case down — the "no rounding" prediction mode.
 * op_avg additionally averages the clipped result with the existing dst
 * pixel, rounding up (+1 before >>1). */
02162 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
02163 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
02164 #define op_put(a, b) a = cm[((b) + 16)>>5]
02165 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
02166
/* Instantiate the quarter-pel MC function families: rounding put,
 * no-rounding put, and rounding avg.  (No avg_no_rnd family is generated.) */
02167 QPEL_MC(0, put_ , _ , op_put)
02168 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
02169 QPEL_MC(0, avg_ , _ , op_avg)
02170
/* The operator macros are only consumed by the instantiations above. */
02171 #undef op_avg
02172 #undef op_avg_no_rnd
02173 #undef op_put
02174 #undef op_put_no_rnd
02175
02176 #if 1
/* H264_LOWPASS(OPNAME, OP, OP2) expands to the C reference implementations
 * of the H.264 6-tap (1,-5,20,20,-5,1) half-pel interpolation filters for
 * block widths/heights 2, 4, 8 and 16:
 *
 *   OPNAME##h264_qpelN_h_lowpass  - horizontal pass; each row reads
 *                                   src[-2 .. N+2].
 *   OPNAME##h264_qpelN_v_lowpass  - vertical pass; each column reads rows
 *                                   src[-2*srcStride .. (N+2)*srcStride].
 *   OPNAME##h264_qpelN_hv_lowpass - horizontal pass into the caller-supplied
 *                                   int16_t tmp[] (N+5 rows of unclipped
 *                                   16-bit sums), then a vertical pass over
 *                                   tmp.  OP2 is used for that second pass
 *                                   because its input is the widened
 *                                   intermediate, not a pixel value, and so
 *                                   needs a different normalization.
 *
 * OP/OP2 are statement macros of the form OP(dstpixel, filtersum) that
 * normalize, clip (via the 'cm' crop-table pointer declared in each kernel)
 * and either store or average into dst.  The qpel16 variants are composed
 * of four qpel8 calls covering the four 8x8 quadrants.
 * NOTE(review): callers must guarantee the 2-pixel/2-row context around the
 * block is readable — not checkable from this file. */
02177 #define H264_LOWPASS(OPNAME, OP, OP2) \
02178 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02179 const int h=2;\
02180 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
02181 int i;\
02182 for(i=0; i<h; i++)\
02183 {\
02184 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
02185 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
02186 dst+=dstStride;\
02187 src+=srcStride;\
02188 }\
02189 }\
02190 \
02191 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02192 const int w=2;\
02193 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
02194 int i;\
02195 for(i=0; i<w; i++)\
02196 {\
02197 const int srcB= src[-2*srcStride];\
02198 const int srcA= src[-1*srcStride];\
02199 const int src0= src[0 *srcStride];\
02200 const int src1= src[1 *srcStride];\
02201 const int src2= src[2 *srcStride];\
02202 const int src3= src[3 *srcStride];\
02203 const int src4= src[4 *srcStride];\
02204 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
02205 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
02206 dst++;\
02207 src++;\
02208 }\
02209 }\
02210 \
02211 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
02212 const int h=2;\
02213 const int w=2;\
02214 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
02215 int i;\
02216 src -= 2*srcStride; /* back up 2 rows: vertical pass needs that context */\
02217 for(i=0; i<h+5; i++)\
02218 {\
02219 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
02220 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
02221 tmp+=tmpStride;\
02222 src+=srcStride;\
02223 }\
02224 tmp -= tmpStride*(h+5-2); /* rewind so tmp[0] is the first output row (rows -2,-1 are context) */\
02225 for(i=0; i<w; i++)\
02226 {\
02227 const int tmpB= tmp[-2*tmpStride];\
02228 const int tmpA= tmp[-1*tmpStride];\
02229 const int tmp0= tmp[0 *tmpStride];\
02230 const int tmp1= tmp[1 *tmpStride];\
02231 const int tmp2= tmp[2 *tmpStride];\
02232 const int tmp3= tmp[3 *tmpStride];\
02233 const int tmp4= tmp[4 *tmpStride];\
02234 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
02235 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
02236 dst++;\
02237 tmp++;\
02238 }\
02239 }\
02240 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02241 const int h=4;\
02242 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
02243 int i;\
02244 for(i=0; i<h; i++)\
02245 {\
02246 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
02247 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
02248 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
02249 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
02250 dst+=dstStride;\
02251 src+=srcStride;\
02252 }\
02253 }\
02254 \
02255 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02256 const int w=4;\
02257 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
02258 int i;\
02259 for(i=0; i<w; i++)\
02260 {\
02261 const int srcB= src[-2*srcStride];\
02262 const int srcA= src[-1*srcStride];\
02263 const int src0= src[0 *srcStride];\
02264 const int src1= src[1 *srcStride];\
02265 const int src2= src[2 *srcStride];\
02266 const int src3= src[3 *srcStride];\
02267 const int src4= src[4 *srcStride];\
02268 const int src5= src[5 *srcStride];\
02269 const int src6= src[6 *srcStride];\
02270 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
02271 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
02272 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
02273 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
02274 dst++;\
02275 src++;\
02276 }\
02277 }\
02278 \
02279 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
02280 const int h=4;\
02281 const int w=4;\
02282 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
02283 int i;\
02284 src -= 2*srcStride; /* back up 2 rows: vertical pass needs that context */\
02285 for(i=0; i<h+5; i++)\
02286 {\
02287 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
02288 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
02289 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
02290 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
02291 tmp+=tmpStride;\
02292 src+=srcStride;\
02293 }\
02294 tmp -= tmpStride*(h+5-2); /* rewind so tmp[0] is the first output row */\
02295 for(i=0; i<w; i++)\
02296 {\
02297 const int tmpB= tmp[-2*tmpStride];\
02298 const int tmpA= tmp[-1*tmpStride];\
02299 const int tmp0= tmp[0 *tmpStride];\
02300 const int tmp1= tmp[1 *tmpStride];\
02301 const int tmp2= tmp[2 *tmpStride];\
02302 const int tmp3= tmp[3 *tmpStride];\
02303 const int tmp4= tmp[4 *tmpStride];\
02304 const int tmp5= tmp[5 *tmpStride];\
02305 const int tmp6= tmp[6 *tmpStride];\
02306 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
02307 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
02308 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
02309 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
02310 dst++;\
02311 tmp++;\
02312 }\
02313 }\
02314 \
02315 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02316 const int h=8;\
02317 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
02318 int i;\
02319 for(i=0; i<h; i++)\
02320 {\
02321 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
02322 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
02323 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
02324 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
02325 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
02326 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
02327 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
02328 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
02329 dst+=dstStride;\
02330 src+=srcStride;\
02331 }\
02332 }\
02333 \
02334 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02335 const int w=8;\
02336 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
02337 int i;\
02338 for(i=0; i<w; i++)\
02339 {\
02340 const int srcB= src[-2*srcStride];\
02341 const int srcA= src[-1*srcStride];\
02342 const int src0= src[0 *srcStride];\
02343 const int src1= src[1 *srcStride];\
02344 const int src2= src[2 *srcStride];\
02345 const int src3= src[3 *srcStride];\
02346 const int src4= src[4 *srcStride];\
02347 const int src5= src[5 *srcStride];\
02348 const int src6= src[6 *srcStride];\
02349 const int src7= src[7 *srcStride];\
02350 const int src8= src[8 *srcStride];\
02351 const int src9= src[9 *srcStride];\
02352 const int src10=src[10*srcStride];\
02353 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
02354 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
02355 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
02356 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
02357 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
02358 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
02359 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
02360 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
02361 dst++;\
02362 src++;\
02363 }\
02364 }\
02365 \
02366 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
02367 const int h=8;\
02368 const int w=8;\
02369 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
02370 int i;\
02371 src -= 2*srcStride; /* back up 2 rows: vertical pass needs that context */\
02372 for(i=0; i<h+5; i++)\
02373 {\
02374 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
02375 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
02376 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
02377 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
02378 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
02379 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
02380 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
02381 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
02382 tmp+=tmpStride;\
02383 src+=srcStride;\
02384 }\
02385 tmp -= tmpStride*(h+5-2); /* rewind so tmp[0] is the first output row */\
02386 for(i=0; i<w; i++)\
02387 {\
02388 const int tmpB= tmp[-2*tmpStride];\
02389 const int tmpA= tmp[-1*tmpStride];\
02390 const int tmp0= tmp[0 *tmpStride];\
02391 const int tmp1= tmp[1 *tmpStride];\
02392 const int tmp2= tmp[2 *tmpStride];\
02393 const int tmp3= tmp[3 *tmpStride];\
02394 const int tmp4= tmp[4 *tmpStride];\
02395 const int tmp5= tmp[5 *tmpStride];\
02396 const int tmp6= tmp[6 *tmpStride];\
02397 const int tmp7= tmp[7 *tmpStride];\
02398 const int tmp8= tmp[8 *tmpStride];\
02399 const int tmp9= tmp[9 *tmpStride];\
02400 const int tmp10=tmp[10*tmpStride];\
02401 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
02402 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
02403 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
02404 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
02405 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
02406 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
02407 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
02408 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
02409 dst++;\
02410 tmp++;\
02411 }\
02412 }\
02413 \
/* 16-wide variants: four 8x8 quadrant calls (top pair, then bottom pair). */\
02414 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02415 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
02416 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
02417 src += 8*srcStride;\
02418 dst += 8*dstStride;\
02419 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
02420 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
02421 }\
02422 \
02423 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02424 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
02425 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
02426 src += 8*srcStride;\
02427 dst += 8*dstStride;\
02428 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
02429 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
02430 }\
02431 \
02432 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
02433 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
02434 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
02435 src += 8*srcStride;\
02436 dst += 8*dstStride;\
02437 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
02438 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
02439 }\
02440
02441 #define H264_MC(OPNAME, SIZE) \
02442 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
02443 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
02444 }\
02445 \
02446 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
02447 uint8_t half[SIZE*SIZE];\
02448 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
02449 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
02450 }\
02451 \
02452 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
02453 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
02454 }\
02455 \
02456 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
02457 uint8_t half[SIZE*SIZE];\
02458 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
02459 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
02460 }\
02461 \
02462 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
02463 uint8_t full[SIZE*(SIZE+5)];\
02464 uint8_t * const full_mid= full + SIZE*2;\
02465 uint8_t half[SIZE*SIZE];\
02466 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
02467 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
02468 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
02469 }\
02470 \
02471 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
02472 uint8_t full[SIZE*(SIZE+5)];\
02473 uint8_t * const full_mid= full + SIZE*2;\
02474 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
02475 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
02476 }\
02477 \
02478 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
02479 uint8_t full[SIZE*(SIZE+5)];\
02480 uint8_t * const full_mid= full + SIZE*2;\
02481 uint8_t half[SIZE*SIZE];\
02482 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
02483 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
02484 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
02485 }\
02486 \
02487 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
02488 uint8_t full[SIZE*(SIZE+5)];\
02489 uint8_t * const full_mid= full + SIZE*2;\
02490 uint8_t halfH[SIZE*SIZE];\
02491 uint8_t halfV[SIZE*SIZE];\
02492 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
02493 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
02494 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
02495 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
02496 }\
02497 \
02498 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
02499 uint8_t full[SIZE*(SIZE+5)];\
02500 uint8_t * const full_mid= full + SIZE*2;\
02501 uint8_t halfH[SIZE*SIZE];\
02502 uint8_t halfV[SIZE*SIZE];\
02503 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
02504 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
02505 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
02506 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
02507 }\
02508 \
02509 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
02510 uint8_t full[SIZE*(SIZE+5)];\
02511 uint8_t * const full_mid= full + SIZE*2;\
02512 uint8_t halfH[SIZE*SIZE];\
02513 uint8_t halfV[SIZE*SIZE];\
02514 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
02515 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
02516 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
02517 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
02518 }\
02519 \
02520 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
02521 uint8_t full[SIZE*(SIZE+5)];\
02522 uint8_t * const full_mid= full + SIZE*2;\
02523 uint8_t halfH[SIZE*SIZE];\
02524 uint8_t halfV[SIZE*SIZE];\
02525 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
02526 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
02527 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
02528 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
02529 }\
02530 \
02531 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
02532 int16_t tmp[SIZE*(SIZE+5)];\
02533 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
02534 }\
02535 \
02536 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
02537 int16_t tmp[SIZE*(SIZE+5)];\
02538 uint8_t halfH[SIZE*SIZE];\
02539 uint8_t halfHV[SIZE*SIZE];\
02540 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
02541 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
02542 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
02543 }\
02544 \
02545 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
02546 int16_t tmp[SIZE*(SIZE+5)];\
02547 uint8_t halfH[SIZE*SIZE];\
02548 uint8_t halfHV[SIZE*SIZE];\
02549 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
02550 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
02551 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
02552 }\
02553 \
02554 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
02555 uint8_t full[SIZE*(SIZE+5)];\
02556 uint8_t * const full_mid= full + SIZE*2;\
02557 int16_t tmp[SIZE*(SIZE+5)];\
02558 uint8_t halfV[SIZE*SIZE];\
02559 uint8_t halfHV[SIZE*SIZE];\
02560 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
02561 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
02562 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
02563 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
02564 }\
02565 \
02566 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
02567 uint8_t full[SIZE*(SIZE+5)];\
02568 uint8_t * const full_mid= full + SIZE*2;\
02569 int16_t tmp[SIZE*(SIZE+5)];\
02570 uint8_t halfV[SIZE*SIZE];\
02571 uint8_t halfHV[SIZE*SIZE];\
02572 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
02573 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
02574 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
02575 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
02576 }\
02577
02578 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
02579
02580 #define op_put(a, b) a = cm[((b) + 16)>>5]
02581 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
02582 #define op2_put(a, b) a = cm[((b) + 512)>>10]
02583
02584 H264_LOWPASS(put_ , op_put, op2_put)
02585 H264_LOWPASS(avg_ , op_avg, op2_avg)
02586 H264_MC(put_, 2)
02587 H264_MC(put_, 4)
02588 H264_MC(put_, 8)
02589 H264_MC(put_, 16)
02590 H264_MC(avg_, 4)
02591 H264_MC(avg_, 8)
02592 H264_MC(avg_, 16)
02593
02594 #undef op_avg
02595 #undef op_put
02596 #undef op2_avg
02597 #undef op2_put
02598 #endif
02599
02600 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
02601 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
02602 int i;
02603
02604 for(i=0; i<h; i++){
02605 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
02606 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
02607 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
02608 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
02609 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
02610 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
02611 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
02612 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
02613 dst+=dstStride;
02614 src+=srcStride;
02615 }
02616 }
02617
02618 #if CONFIG_CAVS_DECODER
02619
/* CAVS qpel position (0,0): plain full-pel 8x8 block copy. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
/* CAVS qpel position (0,0): 8x8 copy averaged with the existing dst pixels. */
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
/* CAVS qpel position (0,0): plain full-pel 16x16 block copy. */
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
/* CAVS qpel position (0,0): 16x16 copy averaged with the existing dst pixels. */
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
02632 #endif
02633
02634 #if CONFIG_VC1_DECODER
02635
/* VC-1 mspel position (0,0): full-pel 8x8 copy; rnd is unused here. */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
/* VC-1 mspel position (0,0): averaging 8x8 copy; rnd is unused here. */
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    avg_pixels8_c(dst, src, stride, 8);
}
02642 #endif
02643
02644 #if CONFIG_RV40_DECODER
/* RV40 qpel position (3,3): implemented as the plain 2x2 bilinear average. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
/* RV40 qpel position (3,3), averaging variant: 2x2 bilinear, 16x16. */
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
/* RV40 qpel position (3,3): 2x2 bilinear average, 8x8. */
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
/* RV40 qpel position (3,3), averaging variant: 2x2 bilinear, 8x8. */
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
02657 #endif
02658
02659 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
02660 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
02661 int i;
02662
02663 for(i=0; i<w; i++){
02664 const int src_1= src[ -srcStride];
02665 const int src0 = src[0 ];
02666 const int src1 = src[ srcStride];
02667 const int src2 = src[2*srcStride];
02668 const int src3 = src[3*srcStride];
02669 const int src4 = src[4*srcStride];
02670 const int src5 = src[5*srcStride];
02671 const int src6 = src[6*srcStride];
02672 const int src7 = src[7*srcStride];
02673 const int src8 = src[8*srcStride];
02674 const int src9 = src[9*srcStride];
02675 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
02676 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
02677 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
02678 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
02679 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
02680 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
02681 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
02682 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
02683 src++;
02684 dst++;
02685 }
02686 }
02687
/* WMV2 mspel position (0,0): full-pel 8x8 copy. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}
02691
/* WMV2 mspel (1,0): average of the unfiltered source and the
 * horizontally lowpass-filtered block. */
static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}
02697
/* WMV2 mspel (2,0): horizontally lowpass-filtered block only. */
static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}
02701
/* WMV2 mspel (3,0): average of the source shifted one pixel right and the
 * horizontally lowpass-filtered block. */
static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}
02707
/* WMV2 mspel (0,2): vertically lowpass-filtered block only. */
static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}
02711
/* WMV2 mspel (1,2): average of the vertically filtered block and the
 * horizontally+vertically filtered block. halfH holds 11 filtered rows
 * (8 outputs + border), starting one row above src so the vertical pass
 * has context; halfH+8 skips that extra top row. */
static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* WMV2 mspel (3,2): like mc12 but the vertical-only term is taken one
 * pixel to the right (src+1). */
static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}
/* WMV2 mspel (2,2): horizontal lowpass (with one border row above),
 * then vertical lowpass of the filtered rows. */
static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
02735
/**
 * H.263 deblocking across a horizontal block edge (filters vertically).
 * src points at the first row below the edge; the two rows above and the
 * two rows below it (src-2*stride .. src+stride) are modified in place.
 * Filter strength is looked up from the quantizer.
 */
static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        int x;
        const int strength= ff_h263_loop_filter_strength[qscale];

        for(x=0; x<8; x++){
            int d1, d2, ad1;
            int p0= src[x-2*stride];
            int p1= src[x-1*stride];
            int p2= src[x+0*stride];
            int p3= src[x+1*stride];
            int d = (p0 - p3 + 4*(p2 - p1)) / 8;

            /* d1 ramps up with d until +-strength, then back down to 0,
             * so large discontinuities (real edges) are left untouched */
            if (d<-2*strength) d1= 0;
            else if(d<- strength) d1=-2*strength - d;
            else if(d< strength) d1= d;
            else if(d< 2*strength) d1= 2*strength - d;
            else d1= 0;

            p1 += d1;
            p2 -= d1;
            /* clip to 0..255: bit 8 set means out of range; ~(p>>31) is
             * 0 for negative values and ~0 (truncates to 255) otherwise */
            if(p1&256) p1= ~(p1>>31);
            if(p2&256) p2= ~(p2>>31);

            src[x-1*stride] = p1;
            src[x+0*stride] = p2;

            ad1= FFABS(d1)>>1;

            /* weaker correction for the outer pixel pair */
            d2= av_clip((p0-p3)/4, -ad1, ad1);

            src[x-2*stride] = p0 - d2;
            src[x+ stride] = p3 + d2;
        }
    }
}
02772
/**
 * H.263 deblocking across a vertical block edge (filters horizontally).
 * src points at the first column right of the edge; the two columns on
 * each side (offsets -2..+1) are modified in place for 8 rows.
 * Same filter as h263_v_loop_filter_c, transposed.
 */
static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        int y;
        const int strength= ff_h263_loop_filter_strength[qscale];

        for(y=0; y<8; y++){
            int d1, d2, ad1;
            int p0= src[y*stride-2];
            int p1= src[y*stride-1];
            int p2= src[y*stride+0];
            int p3= src[y*stride+1];
            int d = (p0 - p3 + 4*(p2 - p1)) / 8;

            /* d1 ramps up with d until +-strength, then back down to 0 */
            if (d<-2*strength) d1= 0;
            else if(d<- strength) d1=-2*strength - d;
            else if(d< strength) d1= d;
            else if(d< 2*strength) d1= 2*strength - d;
            else d1= 0;

            p1 += d1;
            p2 -= d1;
            /* branchless clip to 0..255 (see h263_v_loop_filter_c) */
            if(p1&256) p1= ~(p1>>31);
            if(p2&256) p2= ~(p2>>31);

            src[y*stride-1] = p1;
            src[y*stride+0] = p2;

            ad1= FFABS(d1)>>1;

            d2= av_clip((p0-p3)/4, -ad1, ad1);

            src[y*stride-2] = p0 - d2;
            src[y*stride+1] = p3 + d2;
        }
    }
}
02809
/**
 * H.261 in-place loop filter on an 8x8 block: separable [1 2 1]/4 smoothing,
 * vertical pass into a temporary, horizontal pass back into src.
 * Border rows/columns are passed through (only scaled/rounded, not smoothed).
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int tmp[64];
    int row, col;

    /* vertical [1 2 1] pass into tmp; top and bottom rows are just scaled
     * by 4 so the later >>2 / >>4 normalization leaves them unchanged */
    for (col = 0; col < 8; col++) {
        tmp[col]         = 4 * src[col];
        tmp[col + 7 * 8] = 4 * src[col + 7 * stride];
    }
    for (row = 1; row < 7; row++) {
        for (col = 0; col < 8; col++) {
            const int s = row * stride + col;
            tmp[row * 8 + col] = src[s - stride] + 2 * src[s] + src[s + stride];
        }
    }

    /* horizontal [1 2 1] pass back into src, with rounding */
    for (row = 0; row < 8; row++) {
        src[    row * stride] = (tmp[    row * 8] + 2) >> 2;
        src[7 + row * stride] = (tmp[7 + row * 8] + 2) >> 2;
        for (col = 1; col < 7; col++) {
            const int t = row * 8 + col;
            src[row * stride + col] = (tmp[t - 1] + 2 * tmp[t] + tmp[t + 1] + 8) >> 4;
        }
    }
}
02836
/**
 * Sum of absolute differences (SAD) between two 16-pixel-wide blocks,
 * h rows tall. v (the context) is unused in the C version.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, j;

    for (row = 0; row < h; row++) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - pix2[j]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
02864
/**
 * SAD of pix1 against pix2 interpolated to the horizontal half-pel position:
 * reference sample j is avg2(pix2[j], pix2[j+1]), so one extra column of
 * pix2 (index 16) is read per row.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, j;

    for (row = 0; row < h; row++) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - avg2(pix2[j], pix2[j + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
02892
/**
 * SAD of pix1 against pix2 interpolated to the vertical half-pel position:
 * reference sample j is avg2(pix2[j], row-below[j]), so one extra row of
 * pix2 is read.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, j;

    for (row = 0; row < h; row++) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - avg2(pix2[j], below[j]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
02922
/**
 * SAD of pix1 against pix2 interpolated to the diagonal half-pel position:
 * reference sample j is the rounded average of the 2x2 neighborhood
 * (pix2[j], pix2[j+1], row-below[j], row-below[j+1]); one extra row and
 * one extra column of pix2 are read.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, j;

    for (row = 0; row < h; row++) {
        for (j = 0; j < 16; j++)
            sum += abs(pix1[j] - avg4(pix2[j], pix2[j + 1], below[j], below[j + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
02952
/**
 * Sum of absolute differences (SAD) between two 8-pixel-wide blocks,
 * h rows tall. v (the context) is unused in the C version.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, j;

    for (row = 0; row < h; row++) {
        for (j = 0; j < 8; j++)
            sum += abs(pix1[j] - pix2[j]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
02972
/**
 * 8-wide SAD against the horizontal half-pel interpolation of pix2:
 * reference sample j is avg2(pix2[j], pix2[j+1]).
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sum = 0;
    int row, j;

    for (row = 0; row < h; row++) {
        for (j = 0; j < 8; j++)
            sum += abs(pix1[j] - avg2(pix2[j], pix2[j + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sum;
}
02992
/**
 * 8-wide SAD against the vertical half-pel interpolation of pix2:
 * reference sample j is avg2(pix2[j], row-below[j]).
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, j;

    for (row = 0; row < h; row++) {
        for (j = 0; j < 8; j++)
            sum += abs(pix1[j] - avg2(pix2[j], below[j]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
03014
/**
 * 8-wide SAD against the diagonal half-pel interpolation of pix2:
 * reference sample j averages the 2x2 neighborhood via avg4().
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sum = 0;
    int row, j;

    for (row = 0; row < h; row++) {
        for (j = 0; j < 8; j++)
            sum += abs(pix1[j] - avg4(pix2[j], pix2[j + 1], below[j], below[j + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sum;
}
03036
03037 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
03038 MpegEncContext *c = v;
03039 int score1=0;
03040 int score2=0;
03041 int x,y;
03042
03043 for(y=0; y<h; y++){
03044 for(x=0; x<16; x++){
03045 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
03046 }
03047 if(y+1<h){
03048 for(x=0; x<15; x++){
03049 score2+= FFABS( s1[x ] - s1[x +stride]
03050 - s1[x+1] + s1[x+1+stride])
03051 -FFABS( s2[x ] - s2[x +stride]
03052 - s2[x+1] + s2[x+1+stride]);
03053 }
03054 }
03055 s1+= stride;
03056 s2+= stride;
03057 }
03058
03059 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
03060 else return score1 + FFABS(score2)*8;
03061 }
03062
03063 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
03064 MpegEncContext *c = v;
03065 int score1=0;
03066 int score2=0;
03067 int x,y;
03068
03069 for(y=0; y<h; y++){
03070 for(x=0; x<8; x++){
03071 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
03072 }
03073 if(y+1<h){
03074 for(x=0; x<7; x++){
03075 score2+= FFABS( s1[x ] - s1[x +stride]
03076 - s1[x+1] + s1[x+1+stride])
03077 -FFABS( s2[x ] - s2[x +stride]
03078 - s2[x+1] + s2[x+1+stride]);
03079 }
03080 }
03081 s1+= stride;
03082 s2+= stride;
03083 }
03084
03085 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
03086 else return score1 + FFABS(score2)*8;
03087 }
03088
/**
 * Estimate the weighted squared error that results from adding
 * basis[]*scale to the residual rem[]. The basis term is rescaled from
 * BASIS_SHIFT to RECON_SHIFT precision with rounding before use.
 * Returns the accumulated (weight*value)^2 sum, downshifted.
 */
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        /* rounded rescale of the basis contribution, then add the residual */
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}
03103
/**
 * Add basis[]*scale into the residual rem[] in place, with the same
 * rounded BASIS_SHIFT -> RECON_SHIFT rescale as try_8x8basis_c.
 */
static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}
03111
03120 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
03121 {
03122 int i;
03123 DCTELEM temp[64];
03124
03125 if(last<=0) return;
03126
03127
03128 for(i=0; i<=last; i++){
03129 const int j= scantable[i];
03130 temp[j]= block[j];
03131 block[j]=0;
03132 }
03133
03134 for(i=0; i<=last; i++){
03135 const int j= scantable[i];
03136 const int perm_j= permutation[j];
03137 block[perm_j]= temp[j];
03138 }
03139 }
03140
/* Comparison function for FF_CMP_ZERO: every candidate scores 0,
 * i.e. motion estimation degenerates to "first candidate wins". */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}
03144
03145 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
03146 int i;
03147
03148 memset(cmp, 0, sizeof(void*)*6);
03149
03150 for(i=0; i<6; i++){
03151 switch(type&0xFF){
03152 case FF_CMP_SAD:
03153 cmp[i]= c->sad[i];
03154 break;
03155 case FF_CMP_SATD:
03156 cmp[i]= c->hadamard8_diff[i];
03157 break;
03158 case FF_CMP_SSE:
03159 cmp[i]= c->sse[i];
03160 break;
03161 case FF_CMP_DCT:
03162 cmp[i]= c->dct_sad[i];
03163 break;
03164 case FF_CMP_DCT264:
03165 cmp[i]= c->dct264_sad[i];
03166 break;
03167 case FF_CMP_DCTMAX:
03168 cmp[i]= c->dct_max[i];
03169 break;
03170 case FF_CMP_PSNR:
03171 cmp[i]= c->quant_psnr[i];
03172 break;
03173 case FF_CMP_BIT:
03174 cmp[i]= c->bit[i];
03175 break;
03176 case FF_CMP_RD:
03177 cmp[i]= c->rd[i];
03178 break;
03179 case FF_CMP_VSAD:
03180 cmp[i]= c->vsad[i];
03181 break;
03182 case FF_CMP_VSSE:
03183 cmp[i]= c->vsse[i];
03184 break;
03185 case FF_CMP_ZERO:
03186 cmp[i]= zero_cmp;
03187 break;
03188 case FF_CMP_NSSE:
03189 cmp[i]= c->nsse[i];
03190 break;
03191 #if CONFIG_DWT
03192 case FF_CMP_W53:
03193 cmp[i]= c->w53[i];
03194 break;
03195 case FF_CMP_W97:
03196 cmp[i]= c->w97[i];
03197 break;
03198 #endif
03199 default:
03200 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
03201 }
03202 }
03203 }
03204
/* Zero one 64-coefficient (8x8) DCT block. */
static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}
03209
/* Zero six consecutive 64-coefficient DCT blocks (one macroblock's worth). */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}
03217
/**
 * dst[i] += src[i] for w bytes, one machine word at a time (SWAR):
 * add the low 7 bits of every byte, then XOR in the carry-free top bit,
 * so byte sums wrap independently without cross-byte carries.
 * NOTE(review): word loads assume dst/src alignment is acceptable here —
 * same assumption as the original code.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    /* scalar tail for the remaining w % sizeof(long) bytes */
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
03228
/**
 * dst[i] = src1[i] + src2[i] for w bytes, using the same word-at-a-time
 * carry-free byte addition as add_bytes_c, writing to a third buffer.
 */
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    /* scalar tail */
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}
03239
/**
 * dst[i] = src1[i] - src2[i] for w bytes, word-at-a-time where possible.
 * The SWAR form computes per-byte subtraction without borrow propagation
 * between bytes. On targets without fast unaligned loads, a byte-wise
 * unrolled loop is used when src2 is misaligned.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        /* misaligned src2: plain byte loop, unrolled by 8 */
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    /* scalar tail */
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
03264
/**
 * HuffYUV median-prediction decode:
 * dst[i] = median(left, top, left + top - topleft) + diff[i], all mod 256.
 * src1 is the row above (top); *left / *left_top carry running state
 * across calls and are updated on return.
 */
static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t left_val = *left;
    uint8_t topleft  = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(left_val, src1[i], (left_val + src1[i] - topleft) & 0xFF);
        topleft  = src1[i];
        left_val = pred + diff[i];   /* uint8_t truncation == mod-256 wrap */
        dst[i]   = left_val;
    }

    *left     = left_val;
    *left_top = topleft;
}
03281
/**
 * HuffYUV median-prediction encode: the inverse of
 * add_hfyu_median_prediction_c. dst[i] = src2[i] - median prediction,
 * with src1 the row above and src2 the current row.
 * *left / *left_top carry running state across calls.
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t left_val = *left;
    uint8_t topleft  = *left_top;

    for (i = 0; i < w; i++) {
        const int pred = mid_pred(left_val, src1[i], (left_val + src1[i] - topleft) & 0xFF);
        topleft  = src1[i];
        left_val = src2[i];
        dst[i]   = left_val - pred;  /* uint8_t truncation == mod-256 wrap */
    }

    *left     = left_val;
    *left_top = topleft;
}
03299
/**
 * HuffYUV left-prediction decode: running prefix sum of src into dst,
 * seeded with acc. Stores wrap mod 256 (uint8_t), but the full integer
 * accumulator is returned so the caller can continue on the next run.
 */
static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for (i = 0; i < w; i++) {
        acc += src[i];
        dst[i] = acc;
    }

    return acc;
}
03318
/* Byte offsets of the B, G, R, A channels inside a packed 32-bit pixel,
 * endian-dependent so the channel order matches memory layout. */
#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
/**
 * HuffYUV left-prediction decode over packed 32-bit BGRA pixels: each
 * channel keeps its own running sum (mod 256 on store). The four state
 * pointers are both input seed and output state.
 */
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int sum_r = *red, sum_g = *green, sum_b = *blue, sum_a = *alpha;

    for (i = 0; i < w; i++) {
        const uint8_t *sp = src + 4 * i;
        uint8_t       *dp = dst + 4 * i;

        sum_b += sp[B]; dp[B] = sum_b;
        sum_g += sp[G]; dp[G] = sum_g;
        sum_r += sp[R]; dp[R] = sum_r;
        sum_a += sp[A]; dp[A] = sum_a;
    }

    *red   = sum_r;
    *green = sum_g;
    *blue  = sum_b;
    *alpha = sum_a;
}
#undef B
#undef G
#undef R
#undef A
03359
/* 2-point butterfly writing sum and difference to separate outputs. */
#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

/* In-place 2-point butterfly: x <- x+y, y <- x-y. */
#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

/* |x+y| + |x-y|: final butterfly stage folded into the absolute sum. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
03374
/**
 * SATD of the 8x8 residual src - dst: 2-D 8-point Hadamard transform of
 * the pixel differences, then the sum of absolute transform coefficients.
 * h must be 8.
 */
static int hadamard8_diff8x8_c( void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass: 8-point Hadamard on each row of differences */
    for(i=0; i<8; i++){

        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass down each column; last stage folded into BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }
#if 0
static int maxi=0;
if(sum>maxi){
    maxi=sum;
    printf("MAX:%d\n", maxi);
}
#endif
    return sum;
}
03426
/**
 * Intra SATD of an 8x8 block: 2-D Hadamard transform of the pixels
 * themselves (dummy is ignored), sum of absolute coefficients, minus the
 * DC-like term |temp[0]+temp[32]| so the block mean does not contribute.
 * h must be 8.
 */
static int hadamard8_intra8x8_c( void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    /* horizontal pass on raw pixels */
    for(i=0; i<8; i++){

        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    /* vertical pass; last stage folded into BUTTERFLYA */
    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    /* subtract the i==0 sum term added above, removing the mean's share */
    sum -= FFABS(temp[8*0] + temp[8*4]);

    return sum;
}
03474
/**
 * DCT-domain SAD: forward-transform the 8x8 residual src1 - src2 and
 * return the sum of absolute coefficients. h must be 8.
 */
static int dct_sad8x8_c( void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
03485
03486 #if CONFIG_GPL
/* 1-D 8-point integer DCT (H.264 8x8 style) used by dct264_sad8x8_c.
 * Reads through SRC(i) and writes through DST(i, value), so the caller
 * redefines those macros to select row or column access. */
#define DCT8_1D {\
    /* even part: sums of mirrored sample pairs */\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    /* odd part: differences of mirrored sample pairs */\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}
03513
/**
 * SAD in the H.264-style 8x8 DCT domain: transform the residual rows,
 * then accumulate absolute values while transforming the columns
 * (the column pass's DST() is redefined to sum instead of store).
 */
static int dct264_sad8x8_c( void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int i;
    int sum=0;

    s->dsp.diff_pixels(dct[0], src1, src2, stride);

/* row pass: in-place transform of each row */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

/* column pass: coefficients are not stored, just |v| accumulated */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
03537 #endif
03538
03539 static int dct_max8x8_c( void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
03540 MpegEncContext * const s= (MpegEncContext *)c;
03541 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
03542 int sum=0, i;
03543
03544 assert(h==8);
03545
03546 s->dsp.diff_pixels(temp, src1, src2, stride);
03547 s->dsp.fdct(temp);
03548
03549 for(i=0; i<64; i++)
03550 sum= FFMAX(sum, FFABS(temp[i]));
03551
03552 return sum;
03553 }
03554
/**
 * Estimate the squared quantization error for one 8x8 block:
 * the difference block is quantized, dequantized and inverse
 * transformed, then compared (SSE in the transform domain) against a
 * saved copy of the unprocessed coefficients.
 */
static int quant_psnr8x8_c( void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
    DCTELEM * const bak = temp+64;  /* pristine copy of the residual */
    int sum=0, i;

    assert(h==8);
    s->mb_intra=0; /* force the inter quantizer path below */

    s->dsp.diff_pixels(temp, src1, src2, stride);

    /* keep the original residual for comparison */
    memcpy(bak, temp, 64*sizeof(DCTELEM));

    /* simulate the encode/decode round trip */
    s->block_last_index[0]= s->fast_dct_quantize(s, temp, 0, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct(temp);

    /* sum of squared differences against the untouched residual */
    for(i=0; i<64; i++)
        sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);

    return sum;
}
03577
/**
 * Rate-distortion comparison metric for one 8x8 block.
 * The residual is quantized, its VLC cost counted (rate), then
 * dequantized, inverse transformed and compared against the source
 * (distortion).  Returns distortion + lambda*rate, with the rate term
 * weighted by qscale^2*109/128.
 */
static int rd8x8_c( void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    /* work on local aligned copies so the IDCT can add into lsrc2 */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0]= last= s->fast_dct_quantize(s, temp, 0, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;  /* DC coded separately below */
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; /* table biased by 256 */
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* count run/level VLC bits for all coefficients before the last */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64; /* bias so levels -64..63 index the table */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length; /* out-of-table level: escape code */
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64); /* the last coefficient must be nonzero */

        /* the final coefficient uses the "last" VLC table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;

    }

    /* reconstruct: dequantize and add back onto the prediction */
    if(last>=0){
        if(s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->dsp.idct_add(lsrc2, 8, temp);

    distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    /* lambda*rate with lambda ~ qscale^2 * 109/128 */
    return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
}
03653
/**
 * Rate-only comparison metric for one 8x8 block: quantize the residual
 * and return the number of VLC bits needed to code it (no distortion
 * term; see rd8x8_c for the combined version).
 */
static int bit8x8_c( void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    const uint8_t *scantable= s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length= s->ac_esc_length;
    uint8_t * length;
    uint8_t * last_length;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0]= last= s->fast_dct_quantize(s, temp, 0, s->qscale, &i);

    bits=0;

    if (s->mb_intra) {
        start_i = 1;  /* DC coded separately below */
        length     = s->intra_ac_vlc_length;
        last_length= s->intra_ac_vlc_last_length;
        bits+= s->luma_dc_vlc_length[temp[0] + 256]; /* table biased by 256 */
    } else {
        start_i = 0;
        length     = s->inter_ac_vlc_length;
        last_length= s->inter_ac_vlc_last_length;
    }

    if(last>=start_i){
        /* count run/level VLC bits for all coefficients before the last */
        run=0;
        for(i=start_i; i<last; i++){
            int j= scantable[i];
            level= temp[j];

            if(level){
                level+=64; /* bias so levels -64..63 index the table */
                if((level&(~127)) == 0){
                    bits+= length[UNI_AC_ENC_INDEX(run, level)];
                }else
                    bits+= esc_length; /* out-of-table level: escape code */
                run=0;
            }else
                run++;
        }
        i= scantable[last];

        level= temp[i] + 64;

        assert(level - 64); /* the last coefficient must be nonzero */

        /* the final coefficient uses the "last" VLC table */
        if((level&(~127)) == 0){
            bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
        }else
            bits+= esc_length;
    }

    return bits;
}
03712
/*
 * Vertical intra SAD: sum of |s[x,y] - s[x,y+1]| over the block, i.e.
 * the absolute difference between each row and the row below it.
 * Measures vertical high-frequency content of a single plane; the
 * second source operand is unused.  Instantiated for widths 8 and 16.
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c( void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
 \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
 \
    return score; \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
03730
/**
 * Vertical SAD of the residual between two planes: for each pair of
 * adjacent rows, accumulate the absolute change of (s1 - s2) from one
 * row to the next.
 */
static int vsad16_c( void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int x, y;
    int total = 0;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            const int d = s1[x] - s2[x] - s1[x+stride] + s2[x+stride];
            total += FFABS(d);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
03745
/* Squared difference, used by the vsse metrics below. */
#define SQ(a) ((a)*(a))
/*
 * Vertical intra SSE: like VSAD_INTRA but accumulates squared
 * row-to-row differences, weighting strong vertical gradients more
 * heavily.  The second source operand is unused.  Instantiated for
 * widths 8 and 16.
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c( void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0; \
    int x,y; \
 \
    for(y=1; y<h; y++){ \
        for(x=0; x<size; x+=4){ \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride]) \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
        } \
        s+= stride; \
    } \
 \
    return score; \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
03764
/**
 * Vertical SSE of the residual between two planes: for each pair of
 * adjacent rows, accumulate the squared change of (s1 - s2) from one
 * row to the next.
 */
static int vsse16_c( void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int x, y;
    int total = 0;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            const int d = s1[x] - s2[x] - s1[x+stride] + s2[x+stride];
            total += SQ(d);
        }
        s1 += stride;
        s2 += stride;
    }

    return total;
}
03779
/**
 * Sum of squared differences between an int8 vector and an int16
 * vector of the given length.
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int i, acc = 0;

    for (i = 0; i < size; i++) {
        const int d = pix1[i] - pix2[i];
        acc += d * d;
    }
    return acc;
}
03788
/*
 * Build 16x16 variants of the 8x8 comparison functions above via the
 * WRAPPER8_16_SQ macro (defined earlier in this file, not shown here);
 * presumably each 16x16 score combines the four constituent 8x8
 * scores -- confirm against the macro definition.
 */
WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
03799
/** In-place elementwise multiply: dst[i] *= src[i] for i in [0,len). */
static void vector_fmul_c(float *dst, const float *src, int len){
    int k;

    for (k = 0; k < len; k++)
        dst[k] = dst[k] * src[k];
}
03805
/** dst[i] = src0[i] * src1[len-1-i]: multiply src0 by src1 reversed. */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int k;

    /* src1 is walked backwards while src0 moves forwards */
    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[len - 1 - k];
}
03812
/** Fused multiply-add over vectors: dst[i] = src0[i]*src1[i] + src2[i]. */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k;

    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[k] + src2[k];
}
03818
/**
 * Overlap-add windowing used for MDCT output:
 *   dst[k]          = src0[k]*win[2*len-1-k] - src1[len-1-k]*win[k]          + add_bias
 *   dst[2*len-1-k]  = src0[k]*win[k]         + src1[len-1-k]*win[2*len-1-k]  + add_bias
 * dst and win hold 2*len floats; src0 and src1 hold len floats each.
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int k;

    for (k = 0; k < len; k++) {
        const int m = 2 * len - 1 - k;   /* mirrored index in the second half */
        const float a  = src0[k];
        const float b  = src1[len - 1 - k];
        const float w0 = win[k];
        const float w1 = win[m];

        dst[k] = a * w1 - b * w0 + add_bias;
        dst[m] = a * w0 + b * w1 + add_bias;
    }
}
03833
/** Scale a vector by a scalar: dst[i] = src[i] * mul. */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k;

    for (k = 0; k < len; k++)
        dst[k] = mul * src[k];
}
03841
/**
 * Multiply src by a sequence of 2-element sub-vectors and a scalar:
 * pair i of dst is src pair i times sv[i][0..1] times mul.
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i, j;

    for (i = 0, j = 0; i < len; i += 2, j++) {
        const float *v = sv[j];
        dst[i    ] = src[i    ] * v[0] * mul;
        dst[i + 1] = src[i + 1] * v[1] * mul;
    }
}
03851
/**
 * Multiply src by a sequence of 4-element sub-vectors and a scalar:
 * quad i of dst is src quad i times sv[i][0..3] times mul.
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i, j;

    for (i = 0, j = 0; i < len; i += 4, j++) {
        const float *v = sv[j];
        dst[i    ] = src[i    ] * v[0] * mul;
        dst[i + 1] = src[i + 1] * v[1] * mul;
        dst[i + 2] = src[i + 2] * v[2] * mul;
        dst[i + 3] = src[i + 3] * v[3] * mul;
    }
}
03863
/** Expand 2-element sub-vectors into dst, scaled by mul. */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i, j;

    for (i = 0, j = 0; i < len; i += 2, j++) {
        dst[i    ] = sv[j][0] * mul;
        dst[i + 1] = sv[j][1] * mul;
    }
}
03873
/** Expand 4-element sub-vectors into dst, scaled by mul. */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i, j;

    for (i = 0, j = 0; i < len; i += 4, j++) {
        const float *v = sv[j];
        dst[i    ] = v[0] * mul;
        dst[i + 1] = v[1] * mul;
        dst[i + 2] = v[2] * mul;
        dst[i + 3] = v[3] * mul;
    }
}
03885
/** Elementwise butterfly: (v1, v2) <- (v1 + v2, v1 - v2). */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float a = v1[k];
        const float b = v2[k];
        v1[k] = a + b;
        v2[k] = a - b;
    }
}
03896
/** Dot product of two float vectors, accumulated in order. */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    int k;
    float acc = 0.0f;

    for (k = 0; k < len; k++)
        acc += v1[k] * v2[k];

    return acc;
}
03907
/** Convert int samples to float while scaling: dst[i] = src[i] * mul. */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int k;

    for (k = 0; k < len; k++)
        dst[k] = mul * src[k];
}
03913
/**
 * Clip one float, given as its raw IEEE-754 bit pattern, for the
 * opposite-sign case (min < 0 < max, guaranteed by the caller
 * vector_clipf_c).  For negative floats the unsigned bit pattern grows
 * with magnitude and always exceeds any positive pattern, so
 * a > mini means "value below min".  Flipping the sign bit makes the
 * remaining (positive) patterns ordered the same way for the max test.
 *
 * Fix: the sign-bit mask is now 1U<<31; the previous 1<<31 left-shifted
 * into the sign bit of a signed int, which is undefined behavior in C.
 *
 * @param a        bit pattern of the input float
 * @param mini     bit pattern of the (negative) minimum
 * @param maxi     bit pattern of the (positive) maximum
 * @param maxisign maxi with the sign bit flipped, precomputed by the caller
 * @return bit pattern of the clipped value
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                        uint32_t maxi, uint32_t maxisign)
{

    if(a > mini) return mini;                     /* negative and below min */
    else if((a^(1U<<31)) > maxisign) return maxi; /* positive and above max */
    else return a;
}
03922
/**
 * Clip every float in src to [*min, *max] for the opposite-sign case
 * (*min < 0 < *max), comparing raw IEEE-754 bit patterns via
 * clipf_c_one().  len must be a multiple of 8 (the loop is unrolled
 * by 8).  The float->uint32 reinterpretation uses pointer casts,
 * matching the existing style of this file.
 *
 * Fix: the sign-bit mask is now 1U<<31; the previous 1<<31 left-shifted
 * into the sign bit of a signed int, which is undefined behavior in C.
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/**
 * Clip every float in src to [min, max].  When min and max straddle
 * zero the bit-pattern fast path is used; otherwise a plain av_clipf
 * loop, unrolled by 8 (len must be a multiple of 8).
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
        return;
    }

    for (i = 0; i < len; i += 8) {
        int k;
        /* same elements, same order as the hand-unrolled original */
        for (k = 0; k < 8; k++)
            dst[i + k] = av_clipf(src[i + k], min, max);
    }
}
03958
/*
 * Convert one float sample to a signed 16-bit value by reading its
 * IEEE-754 bit pattern directly (pointer-cast type pun, as elsewhere
 * in this file).  NOTE(review): this assumes the caller has already
 * scaled/biased the sample so that the low 16 mantissa bits around
 * 0x43c08000 hold the result -- confirm against the float_to_int16*
 * callers before reusing.
 */
static av_always_inline int float_to_int16_one(const float *src){
    int_fast32_t tmp = *(const int32_t*)src;
    if(tmp & 0xf0000){
        /* out of range: the subtraction's sign selects all-ones or zero,
         * so after "- 0x8000" the result saturates to +/-32767/-32768.
         * (>> of a negative value is arithmetic on supported targets.) */
        tmp = (0x43c0ffff - tmp)>>31;
    }
    return tmp - 0x8000;
}
03969
/** Convert a buffer of floats to int16 via float_to_int16_one(). */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
    int k;

    for (k = 0; k < len; k++)
        dst[k] = float_to_int16_one(src + k);
}
03975
/**
 * Convert planar float channels to interleaved int16.
 * Stereo keeps its dedicated pairwise loop; any other channel count
 * uses the generic strided store.
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
    int i, j, c;

    if (channels == 2) {
        /* common stereo case: write L/R pairs directly */
        for (i = 0; i < len; i++) {
            dst[2 * i    ] = float_to_int16_one(src[0] + i);
            dst[2 * i + 1] = float_to_int16_one(src[1] + i);
        }
    } else {
        for (c = 0; c < channels; c++)
            for (i = 0, j = c; i < len; i++, j += channels)
                dst[j] = float_to_int16_one(src[c] + i);
    }
}
03989
/**
 * Dot product of two int16 vectors, with each product shifted right
 * by 'shift' before accumulation.
 */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int k;
    int acc = 0;

    for (k = 0; k < order; k++)
        acc += (v1[k] * v2[k]) >> shift;

    return acc;
}
03999
/**
 * Return the dot product of v1 and v2 while simultaneously updating
 * v1 in place: v1[i] += mul * v3[i].  Each v1 element is read for the
 * product before it is updated.
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
{
    int k;
    int acc = 0;

    for (k = 0; k < order; k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }
    return acc;
}
04009
/*
 * Fixed-point cosine constants for the WMV2 IDCT below; the values
 * match round(2048*sqrt(2)*cos(i*pi/16)) for i=1..7 with W0=W4=2048
 * (11-bit scale) -- see wmv2_idct_row/col for the shift compensation.
 */
#define W0 2048
#define W1 2841
#define W2 2676
#define W3 2408
#define W4 2048
#define W5 1609
#define W6 1108
#define W7 565
04018
/*
 * One row pass of the WMV2 inverse DCT, operating on 8 consecutive
 * shorts.  Classic even/odd butterfly decomposition: a0/a4 come from
 * the even inputs, a1..a7 from the odd ones; 181/256 approximates
 * 1/sqrt(2) for the two cross terms.  Each output is rounded and
 * scaled down by 8 bits.
 */
static void wmv2_idct_row(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;

    /* odd-part and even-part butterflies */
    a1 = W1*b[1]+W7*b[7];
    a7 = W7*b[1]-W1*b[7];
    a5 = W5*b[5]+W3*b[3];
    a3 = W3*b[5]-W5*b[3];
    a2 = W2*b[2]+W6*b[6];
    a6 = W6*b[2]-W2*b[6];
    a0 = W0*b[0]+W0*b[4];
    a4 = W0*b[0]-W0*b[4];

    /* 181 = round(256/sqrt(2)) */
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;

    /* recombine with rounding, >>8 to drop the W scale */
    b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
    b[1] = (a4+a6 +s1   + (1<<7))>>8;
    b[2] = (a4-a6 +s2   + (1<<7))>>8;
    b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
    b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
    b[5] = (a4-a6 -s2   + (1<<7))>>8;
    b[6] = (a4+a6 -s1   + (1<<7))>>8;
    b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
}
/*
 * One column pass of the WMV2 inverse DCT (stride 8 between elements).
 * Same butterfly structure as wmv2_idct_row, but each product is
 * pre-rounded and shifted by 3 and the final outputs use >>14,
 * completing the overall transform scaling.
 */
static void wmv2_idct_col(short * b)
{
    int s1,s2;
    int a0,a1,a2,a3,a4,a5,a6,a7;

    /* odd-part and even-part butterflies, >>3 to keep headroom */
    a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
    a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
    a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
    a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
    a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
    a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
    a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
    a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;

    /* 181 = round(256/sqrt(2)) */
    s1 = (181*(a1-a5+a7-a3)+128)>>8;
    s2 = (181*(a1-a5-a7+a3)+128)>>8;

    /* recombine with rounding, >>14 finishes the scaling */
    b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
    b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
    b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
    b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;

    b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
    b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
    b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
    b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
}
/** Full 8x8 WMV2 inverse DCT: all row passes, then all column passes. */
void ff_wmv2_idct_c(short * block){
    int i;

    for (i = 0; i < 8; i++)
        wmv2_idct_row(block + 8 * i);
    for (i = 0; i < 8; i++)
        wmv2_idct_col(block + i);
}
04082
04083
/* WMV2 IDCT wrappers: inverse transform, then store / accumulate the
 * clamped result into the destination picture. */
static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_wmv2_idct_c(block);
    add_pixels_clamped_c(block, dest, line_size);
}
/* Reference (jrevdct) IDCT wrappers, full 8x8 resolution. */
static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    put_pixels_clamped_c(block, dest, line_size);
}
static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct (block);
    add_pixels_clamped_c(block, dest, line_size);
}

/* 4x4 reduced-resolution variants, used for lowres==1 decoding. */
static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    put_pixels_clamped4_c(block, dest, line_size);
}
static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct4 (block);
    add_pixels_clamped4_c(block, dest, line_size);
}

/* 2x2 reduced-resolution variants, used for lowres==2 decoding. */
static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    put_pixels_clamped2_c(block, dest, line_size);
}
static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    j_rev_dct2 (block);
    add_pixels_clamped2_c(block, dest, line_size);
}

/* 1x1 variants (lowres==3): only the DC term survives; the stored
 * pixel is the rounded, clamped (dc+4)>>3. */
static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* clamp to 0..255 */

    dest[0] = cm[(block[0] + 4)>>3];
}
static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP; /* clamp to 0..255 */

    dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
}
04139
04140 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
04141
04142
04143 av_cold void dsputil_static_init(void)
04144 {
04145 int i;
04146
04147 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
04148 for(i=0;i<MAX_NEG_CROP;i++) {
04149 ff_cropTbl[i] = 0;
04150 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
04151 }
04152
04153 for(i=0;i<512;i++) {
04154 ff_squareTbl[i] = (i - 256) * (i - 256);
04155 }
04156
04157 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
04158 }
04159
/*
 * Verify that the compiler honours 16-byte alignment of stack
 * variables, which the SIMD code paths rely on.  Logs a one-time
 * error (on MMX/AltiVec builds) and returns -1 if the stack is
 * misaligned, 0 otherwise.
 */
int ff_check_alignment(void){
    static int did_fail=0; /* warn only once per process */
    DECLARE_ALIGNED(16, int, aligned);

    if((intptr_t)&aligned & 15){
        if(!did_fail){
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail=1;
        }
        return -1;
    }
    return 0;
}
04179
/*
 * Fill a DSPContext with the portable C implementations, then let the
 * per-architecture init functions override entries with optimized
 * versions.  Selection of fDCT/IDCT honours avctx->dct_algo,
 * avctx->idct_algo and avctx->lowres.
 */
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    int i;

    ff_check_alignment();

#if CONFIG_ENCODERS
    /* forward DCT selection (encoders only) */
    if(avctx->dct_algo==FF_DCT_FASTINT) {
        c->fdct    = fdct_ifast;
        c->fdct248 = fdct_ifast248;
    }
    else if(avctx->dct_algo==FF_DCT_FAAN) {
        c->fdct    = ff_faandct;
        c->fdct248 = ff_faandct248;
    }
    else {
        c->fdct    = ff_jpeg_fdct_islow; //slow/accurate/default
        c->fdct248 = ff_fdct248_islow;
    }
#endif //CONFIG_ENCODERS

    /* IDCT selection: reduced-resolution variants for lowres decoding,
     * otherwise pick by avctx->idct_algo */
    if(avctx->lowres==1){
        if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
            c->idct_put= ff_jref_idct4_put;
            c->idct_add= ff_jref_idct4_add;
        }else{
            c->idct_put= ff_h264_lowres_idct_put_c;
            c->idct_add= ff_h264_lowres_idct_add_c;
        }
        c->idct    = j_rev_dct4;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==2){
        c->idct_put= ff_jref_idct2_put;
        c->idct_add= ff_jref_idct2_add;
        c->idct    = j_rev_dct2;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else if(avctx->lowres==3){
        c->idct_put= ff_jref_idct1_put;
        c->idct_add= ff_jref_idct1_add;
        c->idct    = j_rev_dct1;
        c->idct_permutation_type= FF_NO_IDCT_PERM;
    }else{
        if(avctx->idct_algo==FF_IDCT_INT){
            c->idct_put= ff_jref_idct_put;
            c->idct_add= ff_jref_idct_add;
            c->idct    = j_rev_dct;
            c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
        }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
                avctx->idct_algo==FF_IDCT_VP3){
            c->idct_put= ff_vp3_idct_put_c;
            c->idct_add= ff_vp3_idct_add_c;
            c->idct    = ff_vp3_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_WMV2){
            c->idct_put= ff_wmv2_idct_put_c;
            c->idct_add= ff_wmv2_idct_add_c;
            c->idct    = ff_wmv2_idct_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(avctx->idct_algo==FF_IDCT_FAAN){
            c->idct_put= ff_faanidct_put;
            c->idct_add= ff_faanidct_add;
            c->idct    = ff_faanidct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
            c->idct_put= ff_ea_idct_put_c;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
            c->idct     = ff_bink_idct_c;
            c->idct_add = ff_bink_idct_add_c;
            c->idct_put = ff_bink_idct_put_c;
            c->idct_permutation_type = FF_NO_IDCT_PERM;
        }else{ //accurate/default
            c->idct_put= ff_simple_idct_put;
            c->idct_add= ff_simple_idct_add;
            c->idct    = ff_simple_idct;
            c->idct_permutation_type= FF_NO_IDCT_PERM;
        }
    }

    /* basic pixel block operations */
    c->get_pixels = get_pixels_c;
    c->diff_pixels = diff_pixels_c;
    c->put_pixels_clamped = put_pixels_clamped_c;
    c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
    c->put_pixels_nonclamped = put_pixels_nonclamped_c;
    c->add_pixels_clamped = add_pixels_clamped_c;
    c->add_pixels8 = add_pixels8_c;
    c->add_pixels4 = add_pixels4_c;
    c->sum_abs_dctelem = sum_abs_dctelem_c;
    c->gmc1 = gmc1_c;
    c->gmc = ff_gmc_c;
    c->clear_block = clear_block_c;
    c->clear_blocks = clear_blocks_c;
    c->pix_sum = pix_sum_c;
    c->pix_norm1 = pix_norm1_c;

    c->fill_block_tab[0] = fill_block16_c;
    c->fill_block_tab[1] = fill_block8_c;
    c->scale_block = scale_block_c;

    /* SAD with half-pel interpolation variants ([0]=16x16, [1]=8x8) */
    c->pix_abs[0][0] = pix_abs16_c;
    c->pix_abs[0][1] = pix_abs16_x2_c;
    c->pix_abs[0][2] = pix_abs16_y2_c;
    c->pix_abs[0][3] = pix_abs16_xy2_c;
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

/* half-pel put/avg tables: [idx][0..3] = plain, x2, y2, xy2 */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
    c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
    c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
    c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

    dspfunc(put, 0, 16);
    dspfunc(put_no_rnd, 0, 16);
    dspfunc(put, 1, 8);
    dspfunc(put_no_rnd, 1, 8);
    dspfunc(put, 2, 4);
    dspfunc(put, 3, 2);

    dspfunc(avg, 0, 16);
    dspfunc(avg_no_rnd, 0, 16);
    dspfunc(avg, 1, 8);
    dspfunc(avg_no_rnd, 1, 8);
    dspfunc(avg, 2, 4);
    dspfunc(avg, 3, 2);
#undef dspfunc

    c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
    c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

    /* third-pel motion compensation (SVQ3); index = mc##xy */
    c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
    c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
    c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
    c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
    c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
    c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
    c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
    c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
    c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;

    c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
    c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
    c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
    c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
    c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
    c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
    c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
    c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
    c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

/* quarter-pel tables: all 16 fractional positions mc00..mc33 */
#define dspfunc(PFX, IDX, NUM) \
    c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
    c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
    c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
    c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
    c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
    c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
    c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
    c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
    c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
    c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
    c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
    c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
    c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
    c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
    c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
    c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

    dspfunc(put_qpel, 0, 16);
    dspfunc(put_no_rnd_qpel, 0, 16);

    dspfunc(avg_qpel, 0, 16);
    /* dspfunc(avg_no_rnd_qpel, 0, 16); */

    dspfunc(put_qpel, 1, 8);
    dspfunc(put_no_rnd_qpel, 1, 8);

    dspfunc(avg_qpel, 1, 8);
    /* dspfunc(avg_no_rnd_qpel, 1, 8); */

    dspfunc(put_h264_qpel, 0, 16);
    dspfunc(put_h264_qpel, 1, 8);
    dspfunc(put_h264_qpel, 2, 4);
    dspfunc(put_h264_qpel, 3, 2);
    dspfunc(avg_h264_qpel, 0, 16);
    dspfunc(avg_h264_qpel, 1, 8);
    dspfunc(avg_h264_qpel, 2, 4);

#undef dspfunc
    /* chroma MC for H.264 and VC-1 */
    c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
    c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
    c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
    c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
    c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
    c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
    c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
    c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;

    c->draw_edges = draw_edges_c;

    /* optional per-codec DSP extensions */
#if CONFIG_CAVS_DECODER
    ff_cavsdsp_init(c,avctx);
#endif

#if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
    ff_mlp_init(c, avctx);
#endif
#if CONFIG_VC1_DECODER
    ff_vc1dsp_init(c,avctx);
#endif
#if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
    ff_intrax8dsp_init(c,avctx);
#endif
#if CONFIG_RV30_DECODER
    ff_rv30dsp_init(c,avctx);
#endif
#if CONFIG_RV40_DECODER
    ff_rv40dsp_init(c,avctx);
    c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
    c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
    c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
    c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
#endif

    /* MSMPEG4/WMV1 special motion compensation */
    c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
    c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
    c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
    c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
    c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
    c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
    c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
    c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

/* comparison metrics: [0] = 16x16 variant, [1] = 8x8 variant */
#define SET_CMP_FUNC(name) \
    c->name[0]= name ## 16_c;\
    c->name[1]= name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4]= hadamard8_intra16_c;
    c->hadamard8_diff[5]= hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0]= pix_abs16_c;
    c->sad[1]= pix_abs8_c;
    c->sse[0]= sse16_c;
    c->sse[1]= sse8_c;
    c->sse[2]= sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0]= vsad16_c;
    c->vsad[4]= vsad_intra16_c;
    c->vsad[5]= vsad_intra8_c;
    c->vsse[0]= vsse16_c;
    c->vsse[4]= vsse_intra16_c;
    c->vsse[5]= vsse_intra8_c;
    c->nsse[0]= nsse16_c;
    c->nsse[1]= nsse8_c;
#if CONFIG_DWT
    ff_dsputil_init_dwt(c);
#endif

    c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

    /* lossless / HuffYUV helpers */
    c->add_bytes= add_bytes_c;
    c->add_bytes_l2= add_bytes_l2_c;
    c->diff_bytes= diff_bytes_c;
    c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
    c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
    c->add_hfyu_left_prediction  = add_hfyu_left_prediction_c;
    c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
    c->bswap_buf= bswap_buf;
#if CONFIG_PNG_DECODER
    c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
#endif

    /* loop filters */
    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_h_loop_filter= h263_h_loop_filter_c;
        c->h263_v_loop_filter= h263_v_loop_filter_c;
    }

    if (CONFIG_VP3_DECODER) {
        c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
        c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
        c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
    }
    if (CONFIG_VP6_DECODER) {
        c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
    }

    c->h261_loop_filter= h261_loop_filter_c;

    c->try_8x8basis= try_8x8basis_c;
    c->add_8x8basis= add_8x8basis_c;

    /* audio DSP */
#if CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling;
#endif
#if CONFIG_AC3_DECODER
    c->ac3_downmix = ff_ac3_downmix_c;
#endif
#if CONFIG_LPC
    c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
#endif
    c->vector_fmul = vector_fmul_c;
    c->vector_fmul_reverse = vector_fmul_reverse_c;
    c->vector_fmul_add = vector_fmul_add_c;
    c->vector_fmul_window = ff_vector_fmul_window_c;
    c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
    c->vector_clipf = vector_clipf_c;
    c->float_to_int16 = ff_float_to_int16_c;
    c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
    c->scalarproduct_int16 = scalarproduct_int16_c;
    c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
    c->scalarproduct_float = scalarproduct_float_c;
    c->butterflies_float = butterflies_float_c;
    c->vector_fmul_scalar = vector_fmul_scalar_c;

    c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
    c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;

    c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
    c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;

    /* image shrinking (shrink[n] halves each dimension n times) */
    c->shrink[0]= ff_img_copy_plane;
    c->shrink[1]= ff_shrink22;
    c->shrink[2]= ff_shrink44;
    c->shrink[3]= ff_shrink88;

    c->prefetch= just_return;

    memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
    memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

    /* architecture-specific overrides */
    if (HAVE_MMX)        dsputil_init_mmx   (c, avctx);
    if (ARCH_ARM)        dsputil_init_arm   (c, avctx);
    if (CONFIG_MLIB)     dsputil_init_mlib  (c, avctx);
    if (HAVE_VIS)        dsputil_init_vis   (c, avctx);
    if (ARCH_ALPHA)      dsputil_init_alpha (c, avctx);
    if (ARCH_PPC)        dsputil_init_ppc   (c, avctx);
    if (HAVE_MMI)        dsputil_init_mmi   (c, avctx);
    if (ARCH_SH4)        dsputil_init_sh4   (c, avctx);
    if (ARCH_BFIN)       dsputil_init_bfin  (c, avctx);

    /* fall back to the h264 qpel functions for any 2tap slot the
     * arch-specific code did not fill */
    for(i=0; i<64; i++){
        if(!c->put_2tap_qpel_pixels_tab[0][i])
            c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
        if(!c->avg_2tap_qpel_pixels_tab[0][i])
            c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
    }

    /* build the coefficient permutation matching the chosen IDCT */
    switch(c->idct_permutation_type){
    case FF_NO_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= i;
        break;
    case FF_LIBMPEG2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
        break;
    case FF_SIMPLE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= simple_mmx_permutation[i];
        break;
    case FF_TRANSPOSE_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= ((i&7)<<3) | (i>>3); /* transpose rows/cols */
        break;
    case FF_PARTTRANS_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
        break;
    case FF_SSE2_IDCT_PERM:
        for(i=0; i<64; i++)
            c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
        break;
    default:
        av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");
    }
}
04566