#define ASSERT_ALIGNED(ptr) av_assert2(!((uintptr_t)(ptr) & 0x0000000F));
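/* load_alignment() fills srcM2 .. srcP3 with the six byte vectors starting at
 * s-2 .. s+3 that the 6-tap luma filter reads.  On big-endian AltiVec this is
 * done with aligned vec_ld() loads combined by vec_perm() using the permute
 * vectors pm2 .. pp3; 'ali' is the offset of s-2 within its 16-byte block, and
 * the largest offsets need a third load to cover srcP3.  On little-endian
 * builds the unaligned loads are issued directly with vec_vsx_ld(). */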
#if HAVE_BIGENDIAN
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    vec_u8 srcR1 = vec_ld(-2, s);\
    vec_u8 srcR2 = vec_ld(14, s);\
    switch (ali) {\
    default: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = vec_perm(srcR1, srcR2, pp3);\
    } break;\
    case 11: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = srcR2;\
    } break;\
    case 12: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = srcR2;\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 13: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = srcR2;\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 14: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = srcR2;\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 15: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = srcR2;\
        srcP0 = vec_perm(srcR2, srcR3, pp0);\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    }\
 }
#else
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    srcM2 = vec_vsx_ld(-2, s);\
    srcM1 = vec_vsx_ld(-1, s);\
    srcP0 = vec_vsx_ld( 0, s);\
    srcP1 = vec_vsx_ld( 1, s);\
    srcP2 = vec_vsx_ld( 2, s);\
    srcP3 = vec_vsx_ld( 3, s);\
 }
#endif /* HAVE_BIGENDIAN */
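/* Horizontal 6-tap lowpass over a 16x16 block:
 * dst[x] = clip8((src[x-2] + src[x+3] + 20*(src[x] + src[x+1])
 *                 - 5*(src[x-1] + src[x+2]) + 16) >> 5)
 * combined with OP_U8_ALTIVEC (put or avg).  Assumes dst is 16-byte aligned
 * and dstStride % 16 == 0. */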
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
    register int align = ((((unsigned long)src) - 2) % 16);
    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;
#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */
    for (i = 0 ; i < 16 ; i ++) {
        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);
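        /* Sketch of the unpack step that feeds the sums below, using FFmpeg's
         * VEC_MERGEH/VEC_MERGEL helpers and the LOAD_ZERO zero vector:
         * zero-extend each byte vector into signed 16-bit high (A) and
         * low (B) halves so the filter arithmetic cannot overflow. */
        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);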
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);
        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);
        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_h_lowpass_altivec */
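/* Vertical 6-tap lowpass over a 16x16 block: the same (1, -5, 20, 20, -5, 1)
 * kernel as the horizontal pass, applied along srcStride with a sliding window
 * of six rows.  Assumes dst is 16-byte aligned and dstStride % 16 == 0. */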
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 perm;
#if HAVE_BIGENDIAN
    perm = vec_lvsl(0, src);
#endif
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1), vec_splat_u16(4));
    const uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
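    /* Sketch of the priming unpack (FFmpeg's VEC_MERGEH/VEC_MERGEL with the
     * LOAD_ZERO zero vector): zero-extend the five preloaded rows into the
     * signed 16-bit high (A) / low (B) halves used by the sums in the loop. */
    vec_s16 srcM2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);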
    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3;
    for (i = 0 ; i < 16 ; i++) {
        srcP3 = load_with_perm_vec(0, srcbis, perm);
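        /* The freshly loaded row is zero-extended the same way, and srcbis
         * advances to the next source row (sketch). */
        srcP3ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);
        srcbis += srcStride;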
        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);
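        /* Sliding window (sketch): shift every row register up by one so that
         * only srcP3 has to be reloaded on the next iteration. */
        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;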
        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);
        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_v_lowpass_altivec */
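/* Combined horizontal + vertical 6-tap lowpass over a 16x16 block.  The first
 * loop filters 21 rows horizontally (without rounding) into the 16-bit
 * intermediate buffer tmp; the second loop filters tmp vertically and rounds
 * with (+512) >> 10 before packing back to 8 bits.  Assumes dst and tmp are
 * 16-byte aligned and dstStride % 16 == 0. */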
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  const uint8_t *src,
                                                  int dstStride, int tmpStride,
                                                  int srcStride)
{
    register int i;
    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5), vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1), vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
    register int align = ((((unsigned long)src) - 2) % 16);
    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;
    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;
    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;
    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;

    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;
#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);
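        /* The sums below expect srcM2 .. srcP3 zero-extended into the signed
         * 16-bit halves srcM2A/B .. srcP3A/B, exactly as in the horizontal
         * pass above. */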
        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);
        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);
        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is one row, so it's OK here */
    }
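    /* Second (vertical) pass: prime five rows of the 16-bit intermediate
     * buffer, then filter with a sliding six-row window. */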
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
        tmpbis += tmpStride;
        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
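        /* Sliding window over the intermediate buffer (sketch): shift every
         * row register up by one so only tmpP3 is reloaded next iteration. */
        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;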
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Bo = vec_mulo(sum3B, v1ss);
        sum3A = (vec_s16) vec_perm(sum3A, sum3A, vcswapi2s(0,1,2,3));
        sum3B = (vec_s16) vec_perm(sum3B, sum3B, vcswapi2s(0,1,2,3));

        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);
        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);
        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_hv_lowpass_altivec */