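/* AltiVec template for the H.264/VC-1 chroma motion compensation and the
 * H.264 16x16 quarter-pel luma lowpass filters. Nothing here is compiled
 * on its own: the including file is expected to define the PREFIX_*
 * function names and OP_U8_ALTIVEC (e.g. a "put" or "avg" store operation)
 * before including this template. */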
#ifdef DEBUG
#define ASSERT_ALIGNED(ptr) assert(!((unsigned long)(ptr)&0x0000000F));
#else
#define ASSERT_ALIGNED(ptr) ;
#endif


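/* One row of the 8-pixel-wide bilinear chroma filter:
 * psum = A*src[0] + B*src[1] + C*src[stride] + D*src[stride+1] + BIAS1,
 * then BIAS2() is applied and the sum is shifted right by 6. The result is
 * merged into the destination quadword through fperm and stored via
 * OP_U8_ALTIVEC. The bottom row becomes the next iteration's top row, so
 * only one new source line is loaded per output row. */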
#define CHROMA_MC8_ALTIVEC_CORE(BIAS1, BIAS2) \
        vsrc2ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc2uc);\
        vsrc3ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc3uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, BIAS1);\
        psum = vec_mladd(vB, vsrc1ssH, psum);\
        psum = vec_mladd(vC, vsrc2ssH, psum);\
        psum = vec_mladd(vD, vsrc3ssH, psum);\
        psum = BIAS2(psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        vsrc0ssH = vsrc2ssH;\
        vsrc1ssH = vsrc3ssH;\
\
        dst += stride;\
        src += stride;

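/* Reduced two-tap version used when x == 0 or y == 0 (so D == 0 and one of
 * B/C is zero): psum = A*src0 + E*src1 + 32, shifted right by 6, where E is
 * the remaining nonzero weight. */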
#define CHROMA_MC8_ALTIVEC_CORE_SIMPLE \
\
        vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);\
        vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);\
\
        psum = vec_mladd(vA, vsrc0ssH, v32ss);\
        psum = vec_mladd(vE, vsrc1ssH, psum);\
        psum = vec_sr(psum, v6us);\
\
        vdst = vec_ld(0, dst);\
        ppsum = (vec_u8)vec_pack(psum, psum);\
        vfdst = vec_perm(vdst, ppsum, fperm);\
\
        OP_U8_ALTIVEC(fsum, vfdst, vdst);\
\
        vec_st(fsum, 0, dst);\
\
        dst += stride;\
        src += stride;

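/* BIAS2 hooks for CHROMA_MC8_ALTIVEC_CORE: the H.264 calls pass the +32
 * rounding term as BIAS1 and leave the sum untouched here (noop), while the
 * VC-1 "no rounding" calls pass a zero BIAS1 and add 28 at this point
 * instead. */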
#define noop(a) a
#define add28(a) vec_add(v28ss, a)

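/* 8xh H.264 chroma MC: bilinear interpolation with the weights
 *   A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy,
 * rounded with +32 and shifted right by 6. Unaligned sources are handled
 * with vec_lvsl/vec_perm; a second 16-byte load is issued only when the
 * nine input bytes of a row can cross a quadword boundary, and the
 * degenerate cases x == 0 or y == 0 fall back to the two-tap core. */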
#ifdef PREFIX_h264_chroma_mc8_altivec
static void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src,
                                           int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v,(vec_u8)vsrc1uc);

    if (ABCD[3]) {
        if (!loadSecond) {
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        } else {
            vec_u8 vsrcDuc;
            for (i = 0 ; i < h ; i++) {
                vsrcCuc = vec_ld(stride + 0, src);
                vsrcDuc = vec_ld(stride + 16, src);
                vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                if (reallyBadAlign)
                    vsrc3uc = vsrcDuc;
                else
                    vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                CHROMA_MC8_ALTIVEC_CORE(v32ss, noop)
            }
        }
    } else {
        const vec_s16 vE = vec_add(vB, vC);
        if (ABCD[2]) {
            if (!loadSecond) {
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(stride + 0, src);
                    vsrcDuc = vec_ld(stride + 15, src);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE

                    vsrc0uc = vsrc1uc;
                }
            }
        } else {
            if (!loadSecond) {
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
                    vsrc1uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            } else {
                vec_u8 vsrcDuc;
                for (i = 0 ; i < h ; i++) {
                    vsrcCuc = vec_ld(0, src);
                    vsrcDuc = vec_ld(15, src);
                    vsrc0uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
                    if (reallyBadAlign)
                        vsrc1uc = vsrcDuc;
                    else
                        vsrc1uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

                    CHROMA_MC8_ALTIVEC_CORE_SIMPLE
                }
            }
        }
    }
}
#endif


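/* VC-1 "no rounding" variant of the 8xh chroma filter: same bilinear
 * interpolation as above, but with a bias of 28 instead of 32 and without
 * the simplified two-tap paths. */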
#ifdef PREFIX_no_rnd_vc1_chroma_mc8_altivec
static void PREFIX_no_rnd_vc1_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED(16, signed int, ABCD)[4] =
                        {((8 - x) * (8 - y)),
                         ((    x) * (8 - y)),
                         ((8 - x) * (    y)),
                         ((    x) * (    y))};
    register int i;
    vec_u8 fperm;
    const vec_s32 vABCD = vec_ld(0, ABCD);
    const vec_s16 vA = vec_splat((vec_s16)vABCD, 1);
    const vec_s16 vB = vec_splat((vec_s16)vABCD, 3);
    const vec_s16 vC = vec_splat((vec_s16)vABCD, 5);
    const vec_s16 vD = vec_splat((vec_s16)vABCD, 7);
    LOAD_ZERO;
    const vec_s16 v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vec_u16 v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vec_u8 vsrcAuc, av_uninit(vsrcBuc), vsrcperm0, vsrcperm1;
    vec_u8 vsrc0uc, vsrc1uc;
    vec_s16 vsrc0ssH, vsrc1ssH;
    vec_u8 vsrcCuc, vsrc2uc, vsrc3uc;
    vec_s16 vsrc2ssH, vsrc3ssH, psum;
    vec_u8 vdst, ppsum, vfdst, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vec_u8){0x10, 0x11, 0x12, 0x13,
                         0x14, 0x15, 0x16, 0x17,
                         0x08, 0x09, 0x0A, 0x0B,
                         0x0C, 0x0D, 0x0E, 0x0F};
    } else {
        fperm = (vec_u8){0x00, 0x01, 0x02, 0x03,
                         0x04, 0x05, 0x06, 0x07,
                         0x18, 0x19, 0x1A, 0x1B,
                         0x1C, 0x1D, 0x1E, 0x1F};
    }

    vsrcAuc = vec_ld(0, src);

    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc0uc);
    vsrc1ssH = (vec_s16)vec_mergeh(zero_u8v, (vec_u8)vsrc1uc);

    if (!loadSecond) {
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    } else {
        vec_u8 vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);
            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            CHROMA_MC8_ALTIVEC_CORE(vec_splat_s16(0), add28)
        }
    }
}
#endif

#undef noop
#undef add28
#undef CHROMA_MC8_ALTIVEC_CORE


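/* 16x16 horizontal luma lowpass (half-pel) filter with the 6-tap kernel
 * (1, -5, 20, 20, -5, 1):
 *   dst[x] = clip(((src[x-2]+src[x+3]) - 5*(src[x-1]+src[x+2])
 *                  + 20*(src[x]+src[x+1]) + 16) >> 5)
 * The switch on the source alignment below decides how many quadword loads
 * are needed to build the six shifted copies of each row. */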
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, vdst, fsum;

    for (i = 0 ; i < 16 ; i ++) {
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

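        /* align == ((src-2) % 16). The default case (alignments 0-10) builds
         * all six shifted vectors from the two loads above; at alignment 11
         * srcP3 is exactly srcR2, and at 12-15 a third load (srcR3) is
         * needed for the rightmost vectors. */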
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif


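/* 16x16 vertical luma lowpass (half-pel) filter: the same 6-tap kernel
 * (1, -5, 20, 20, -5, 1) applied along columns. Five source rows are kept
 * in registers and shifted down by one each iteration, so each output row
 * costs a single new (possibly unaligned) row load. */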
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
    register int i;

    LOAD_ZERO;
    const vec_u8 perm = vec_lvsl(0, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2a = vec_ld(0, srcbis);
    const vec_u8 srcM2b = vec_ld(16, srcbis);
    const vec_u8 srcM2 = vec_perm(srcM2a, srcM2b, perm);

    const vec_u8 srcM1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcM1b = vec_ld(16, srcbis);
    const vec_u8 srcM1 = vec_perm(srcM1a, srcM1b, perm);

    const vec_u8 srcP0a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP0b = vec_ld(16, srcbis);
    const vec_u8 srcP0 = vec_perm(srcP0a, srcP0b, perm);

    const vec_u8 srcP1a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP1b = vec_ld(16, srcbis);
    const vec_u8 srcP1 = vec_perm(srcP1a, srcP1b, perm);

    const vec_u8 srcP2a = vec_ld(0, srcbis += srcStride);
    const vec_u8 srcP2b = vec_ld(16, srcbis);
    const vec_u8 srcP2 = vec_perm(srcP2a, srcP2b, perm);


    vec_s16 srcM2ssA = (vec_s16) vec_mergeh(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) vec_mergel(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) vec_mergeh(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) vec_mergel(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) vec_mergeh(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) vec_mergel(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) vec_mergeh(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) vec_mergel(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) vec_mergeh(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) vec_mergel(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, vdst, fsum, srcP3a, srcP3b, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3a = vec_ld(0, srcbis += srcStride);
        srcP3b = vec_ld(16, srcbis);
        srcP3 = vec_perm(srcP3a, srcP3b, perm);
        srcP3ssA = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) vec_mergel(zero_u8v, srcP3);


        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

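        /* slide the five-row window down by one source row */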
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif


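/* 16x16 2D (horizontal then vertical) luma lowpass filter. Pass one runs
 * the horizontal 6-tap kernel over 21 rows and stores the unrounded 16-bit
 * results in tmp; pass two filters tmp vertically in 32-bit precision using
 * even/odd lane multiplies (vec_mule/vec_mulo), adds the combined rounding
 * term of 512 and shifts right by 10 before packing back to bytes. */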
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
    register int i;
    LOAD_ZERO;
    const vec_u8 permM2 = vec_lvsl(-2, src);
    const vec_u8 permM1 = vec_lvsl(-1, src);
    const vec_u8 permP0 = vec_lvsl(+0, src);
    const vec_u8 permP1 = vec_lvsl(+1, src);
    const vec_u8 permP2 = vec_lvsl(+2, src);
    const vec_u8 permP3 = vec_lvsl(+3, src);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum, vdst;
    vec_s16 ssume, ssumo;

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
        vec_u8 srcR1 = vec_ld(-2, src);
        vec_u8 srcR2 = vec_ld(14, src);

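        /* same source-alignment handling as in the horizontal-only filter above */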
        switch (align) {
        default: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = vec_perm(srcR1, srcR2, permP3);
        } break;
        case 11: {
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = vec_perm(srcR1, srcR2, permP2);
            srcP3 = srcR2;
        } break;
        case 12: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = vec_perm(srcR1, srcR2, permP1);
            srcP2 = srcR2;
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 13: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = vec_perm(srcR1, srcR2, permP0);
            srcP1 = srcR2;
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 14: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = vec_perm(srcR1, srcR2, permM1);
            srcP0 = srcR2;
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        case 15: {
            vec_u8 srcR3 = vec_ld(30, src);
            srcM2 = vec_perm(srcR1, srcR2, permM2);
            srcM1 = srcR2;
            srcP0 = vec_perm(srcR2, srcR3, permP0);
            srcP1 = vec_perm(srcR2, srcR3, permP1);
            srcP2 = vec_perm(srcR2, srcR3, permP2);
            srcP3 = vec_perm(srcR2, srcR3, permP3);
        } break;
        }

        srcP0A = (vec_s16) vec_mergeh(zero_u8v, srcP0);
        srcP0B = (vec_s16) vec_mergel(zero_u8v, srcP0);
        srcP1A = (vec_s16) vec_mergeh(zero_u8v, srcP1);
        srcP1B = (vec_s16) vec_mergel(zero_u8v, srcP1);

        srcP2A = (vec_s16) vec_mergeh(zero_u8v, srcP2);
        srcP2B = (vec_s16) vec_mergel(zero_u8v, srcP2);
        srcP3A = (vec_s16) vec_mergeh(zero_u8v, srcP3);
        srcP3B = (vec_s16) vec_mergel(zero_u8v, srcP3);

        srcM1A = (vec_s16) vec_mergeh(zero_u8v, srcM1);
        srcM1B = (vec_s16) vec_mergel(zero_u8v, srcM1);
        srcM2A = (vec_s16) vec_mergeh(zero_u8v, srcM2);
        srcM2B = (vec_s16) vec_mergel(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride;
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        const vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        const vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

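        /* slide the five-row window of intermediate rows down by one */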
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp3Bo = vec_mulo(sum3B, v1ss);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);
        vdst = vec_ld(0, dst);

        OP_U8_ALTIVEC(fsum, sum, vdst);

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif