/*
 * Debug assertion that ptr is 16-byte aligned (low four address bits are 0).
 *
 * The argument is parenthesized before the cast so that expressions such as
 * "p + 8" are evaluated as a whole; without the parentheses the cast would
 * bind to "p" only and the offset would be added in bytes rather than in
 * elements. The trailing semicolon is kept so that existing uses, with or
 * without their own semicolon, continue to compile.
 */
#define ASSERT_ALIGNED(ptr) av_assert2(!((uintptr_t)(ptr) & 0x0000000F));
/*
 * load_alignment (vec_ld / vec_perm variant).
 *
 * Fills the caller's srcM2, srcM1, srcP0, srcP1, srcP2 and srcP3 vectors
 * (the macro assigns to them but does not declare them) with the 16 bytes
 * starting at s-2, s-1, s+0, s+1, s+2 and s+3 respectively, using the
 * permute vectors pm2..pp3 supplied by the caller.
 *
 * Two vectors are always loaded with vec_ld at offsets -2 and 14 from s.
 * In the later groups of assignments a third vector, vec_ld(30, s), is
 * loaded as well and the higher-offset outputs are permuted from
 * (srcR2, srcR3) instead of (srcR1, srcR2).
 *
 * NOTE(review): the code that selects between the groups (presumably a
 * switch on `ali`) and a few individual assignments are not visible in this
 * excerpt; confirm against the complete file.
 */
33 #define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
34 vec_u8 srcR1 = vec_ld(-2, s);\
35 vec_u8 srcR2 = vec_ld(14, s);\
38 srcM2 = vec_perm(srcR1, srcR2, pm2);\
39 srcM1 = vec_perm(srcR1, srcR2, pm1);\
40 srcP0 = vec_perm(srcR1, srcR2, pp0);\
41 srcP1 = vec_perm(srcR1, srcR2, pp1);\
42 srcP2 = vec_perm(srcR1, srcR2, pp2);\
43 srcP3 = vec_perm(srcR1, srcR2, pp3);\
46 srcM2 = vec_perm(srcR1, srcR2, pm2);\
47 srcM1 = vec_perm(srcR1, srcR2, pm1);\
48 srcP0 = vec_perm(srcR1, srcR2, pp0);\
49 srcP1 = vec_perm(srcR1, srcR2, pp1);\
50 srcP2 = vec_perm(srcR1, srcR2, pp2);\
54 vec_u8 srcR3 = vec_ld(30, s);\
55 srcM2 = vec_perm(srcR1, srcR2, pm2);\
56 srcM1 = vec_perm(srcR1, srcR2, pm1);\
57 srcP0 = vec_perm(srcR1, srcR2, pp0);\
58 srcP1 = vec_perm(srcR1, srcR2, pp1);\
60 srcP3 = vec_perm(srcR2, srcR3, pp3);\
63 vec_u8 srcR3 = vec_ld(30, s);\
64 srcM2 = vec_perm(srcR1, srcR2, pm2);\
65 srcM1 = vec_perm(srcR1, srcR2, pm1);\
66 srcP0 = vec_perm(srcR1, srcR2, pp0);\
68 srcP2 = vec_perm(srcR2, srcR3, pp2);\
69 srcP3 = vec_perm(srcR2, srcR3, pp3);\
72 vec_u8 srcR3 = vec_ld(30, s);\
73 srcM2 = vec_perm(srcR1, srcR2, pm2);\
74 srcM1 = vec_perm(srcR1, srcR2, pm1);\
76 srcP1 = vec_perm(srcR2, srcR3, pp1);\
77 srcP2 = vec_perm(srcR2, srcR3, pp2);\
78 srcP3 = vec_perm(srcR2, srcR3, pp3);\
81 vec_u8 srcR3 = vec_ld(30, s);\
82 srcM2 = vec_perm(srcR1, srcR2, pm2);\
84 srcP0 = vec_perm(srcR2, srcR3, pp0);\
85 srcP1 = vec_perm(srcR2, srcR3, pp1);\
86 srcP2 = vec_perm(srcR2, srcR3, pp2);\
87 srcP3 = vec_perm(srcR2, srcR3, pp3);\
/*
 * load_alignment (vec_vsx_ld variant).
 *
 * Same interface as the vec_ld/vec_perm variant, but each of the caller's
 * srcM2..srcP3 vectors is loaded directly with vec_vsx_ld at byte offsets
 * -2, -1, 0, 1, 2 and 3 from s. In the lines visible here the `ali`
 * argument and the permute-vector arguments pm2..pp3 are not used.
 *
 * NOTE(review): the end of the macro and the preprocessor condition that
 * chooses between the two variants are not visible in this excerpt.
 */
92 #define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
93 srcM2 = vec_vsx_ld(-2, s);\
94 srcM1 = vec_vsx_ld(-1, s);\
95 srcP0 = vec_vsx_ld(0, s);\
96 srcP1 = vec_vsx_ld(1, s);\
97 srcP2 = vec_vsx_ld(2, s);\
98 srcP3 = vec_vsx_ld(3, s);\
103 #ifdef PREFIX_h264_qpel16_h_lowpass_altivec
/*
 * Horizontal 6-tap lowpass for a 16-pixel-wide block.
 *
 * For each of the 16 loop iterations the visible arithmetic computes, on
 * 16-bit lanes,
 *     (20 * (P0 + P1) - 5 * (M1 + P2) + (M2 + P3) + 16) >> 5
 * where M2..P3 are the source bytes at offsets -2..+3. The result is packed
 * to unsigned bytes with saturation, passed through OP_U8_ALTIVEC together
 * with the current contents of dst, and stored to dst.
 *
 * NOTE(review): parts of the signature, the loads and widening of
 * srcM2..srcP3 into the *A / *B halves, the per-iteration pointer updates
 * and the closing of the function are not visible in this excerpt.
 */
104 static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *
dst,
106 int dstStride,
int srcStride)
111 vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
/* Filter constants: 5, shift count 5, 20 (= 5 << 2) and rounding 16 (= 1 << 4). */
112 const vec_s16 v5ss = vec_splat_s16(5);
113 const vec_u16 v5us = vec_splat_u16(5);
114 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
115 const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
117 vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
/* Offset of (src - 2) within its 16-byte line; presumably the `ali`
 * argument of load_alignment() -- the call site is not visible here. */
119 register int align = ((((
unsigned long)
src) - 2) % 16);
121 vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
122 srcP2A, srcP2B, srcP3A, srcP3B,
123 srcM1A, srcM1B, srcM2A, srcM2B,
124 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
125 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
126 psumA, psumB, sumA, sumB;
/* Permute control vectors derived from the addresses src-2 .. src+3. */
131 permM2 = vec_lvsl(-2,
src);
132 permM1 = vec_lvsl(-1,
src);
133 permP0 = vec_lvsl(+0,
src);
134 permP1 = vec_lvsl(+1,
src);
135 permP2 = vec_lvsl(+2,
src);
136 permP3 = vec_lvsl(+3,
src);
/* 16 iterations; each one ends with a single 16-byte store to dst. */
139 for (
i = 0 ;
i < 16 ;
i ++) {
/* Saturating sums of the three symmetric tap pairs (A and B halves). */
157 sum1A = vec_adds(srcP0A, srcP1A);
158 sum1B = vec_adds(srcP0B, srcP1B);
159 sum2A = vec_adds(srcM1A, srcP2A);
160 sum2B = vec_adds(srcM1B, srcP2B);
161 sum3A = vec_adds(srcM2A, srcP3A);
162 sum3B = vec_adds(srcM2B, srcP3B);
/* pp1 = 20 * sum1 + 16 (centre taps plus rounding term). */
164 pp1A = vec_mladd(sum1A, v20ss, v16ss);
165 pp1B = vec_mladd(sum1B, v20ss, v16ss);
/* pp2 = 5 * sum2. */
167 pp2A = vec_mladd(sum2A, v5ss,
zero_s16v);
168 pp2B = vec_mladd(sum2B, v5ss,
zero_s16v);
/* pp3 = sum3 + pp1; psum = pp3 - pp2. */
170 pp3A = vec_add(sum3A, pp1A);
171 pp3B = vec_add(sum3B, pp1B);
173 psumA = vec_sub(pp3A, pp2A);
174 psumB = vec_sub(pp3B, pp2B);
/* Arithmetic shift right by 5, then saturating pack to 16 unsigned bytes. */
176 sumA = vec_sra(psumA, v5us);
177 sumB = vec_sra(psumB, v5us);
179 sum = vec_packsu(sumA, sumB);
/* OP_U8_ALTIVEC is defined outside this excerpt; it produces fsum from the
 * filtered vector and the current contents of dst. */
183 OP_U8_ALTIVEC(fsum, sum, vec_ld(0,
dst));
185 vec_st(fsum, 0,
dst);
194 #ifdef PREFIX_h264_qpel16_v_lowpass_altivec
/*
 * Vertical 6-tap lowpass for a 16-pixel-wide block.
 *
 * Reading starts two rows above src (srcbis = src - 2 * srcStride). Five
 * source vectors (srcM2..srcP2) are loaded before the loop and a sixth
 * (srcP3) is loaded in each of the 16 iterations. The visible arithmetic
 * is, on 16-bit lanes,
 *     (20 * (P0 + P1) - 5 * (M1 + P2) + (M2 + P3) + 16) >> 5
 * followed by a saturating pack to unsigned bytes, OP_U8_ALTIVEC with the
 * current contents of dst, and a store to dst.
 *
 * NOTE(review): parts of the signature, the advancement of srcbis between
 * loads, the widening of the byte vectors into the *ssA / *ssB halves, the
 * rotation of the row registers at the end of each iteration and the
 * closing of the function are not visible in this excerpt.
 */
195 static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *
dst,
197 int dstStride,
int srcStride)
/* Filter constants: 20 (= 5 << 2), shift count 5, 5 and rounding 16 (= 1 << 4). */
206 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
207 const vec_u16 v5us = vec_splat_u16(5);
208 const vec_s16 v5ss = vec_splat_s16(5);
209 const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
/* Start two rows above the first output row. */
211 const uint8_t *srcbis =
src - (srcStride * 2);
/* Initial five rows. Presumably srcbis is advanced by srcStride between
 * these loads; those statements are not visible here. */
213 const vec_u8 srcM2 = load_with_perm_vec(0, srcbis,
perm);
215 const vec_u8 srcM1 = load_with_perm_vec(0, srcbis,
perm);
217 const vec_u8 srcP0 = load_with_perm_vec(0, srcbis,
perm);
219 const vec_u8 srcP1 = load_with_perm_vec(0, srcbis,
perm);
221 const vec_u8 srcP2 = load_with_perm_vec(0, srcbis,
perm);
235 vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
236 psumA, psumB, sumA, sumB,
238 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;
/* 16 iterations; each one loads one new row and stores one 16-byte result. */
242 for (
i = 0 ;
i < 16 ;
i++) {
243 srcP3 = load_with_perm_vec(0, srcbis,
perm);
/* Saturating sums of the three symmetric tap pairs (A and B halves). */
249 sum1A = vec_adds(srcP0ssA, srcP1ssA);
250 sum1B = vec_adds(srcP0ssB, srcP1ssB);
251 sum2A = vec_adds(srcM1ssA, srcP2ssA);
252 sum2B = vec_adds(srcM1ssB, srcP2ssB);
253 sum3A = vec_adds(srcM2ssA, srcP3ssA);
254 sum3B = vec_adds(srcM2ssB, srcP3ssB);
/* pp1 = 20 * sum1 + 16 (centre taps plus rounding term). */
267 pp1A = vec_mladd(sum1A, v20ss, v16ss);
268 pp1B = vec_mladd(sum1B, v20ss, v16ss);
/* pp2 = 5 * sum2. */
270 pp2A = vec_mladd(sum2A, v5ss,
zero_s16v);
271 pp2B = vec_mladd(sum2B, v5ss,
zero_s16v);
/* pp3 = sum3 + pp1; psum = pp3 - pp2. */
273 pp3A = vec_add(sum3A, pp1A);
274 pp3B = vec_add(sum3B, pp1B);
276 psumA = vec_sub(pp3A, pp2A);
277 psumB = vec_sub(pp3B, pp2B);
/* Arithmetic shift right by 5, then saturating pack to 16 unsigned bytes. */
279 sumA = vec_sra(psumA, v5us);
280 sumB = vec_sra(psumB, v5us);
282 sum = vec_packsu(sumA, sumB);
/* OP_U8_ALTIVEC is defined outside this excerpt; it produces fsum from the
 * filtered vector and the current contents of dst. */
286 OP_U8_ALTIVEC(fsum, sum, vec_ld(0,
dst));
288 vec_st(fsum, 0,
dst);
296 #ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
/*
 * Combined horizontal + vertical 6-tap lowpass for a 16-pixel-wide block,
 * done in two passes through the 16-bit scratch buffer tmp.
 *
 * Pass 1 (21 iterations, starting two rows above src): horizontal filter
 *     20 * (P0 + P1) - 5 * (M1 + P2) + (M2 + P3)
 * with no rounding and no shift; each result row is stored to tmp as two
 * vectors of 16-bit values (byte offsets 0 and 16).
 *
 * Pass 2 (16 iterations): vertical filter over the rows held in tmp,
 * evaluated in 32-bit precision on even/odd lanes as
 *     (20 * sum1 - 5 * sum2 + sum3 + 512) >> 10
 * then packed with saturation to unsigned bytes, passed through
 * OP_U8_ALTIVEC with the current contents of dst, and stored to dst.
 *
 * NOTE(review): parts of the signature and declarations, the loads and
 * widening of the source bytes in pass 1, all pointer advancement (src,
 * tmp, tmpbis, dst), the rotation of the tmp row registers in pass 2 and
 * the closing of the function are not visible in this excerpt.
 */
297 static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *
dst, int16_t *
tmp,
299 int dstStride,
int tmpStride,
304 vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
/* Constants: 20 (= 5 << 2), shift count 10, 5, 1, rounding 512 (= 1 << 9)
 * and shift count 16 (= 1 << 4). */
305 const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
306 const vec_u32 v10ui = vec_splat_u32(10);
307 const vec_s16 v5ss = vec_splat_s16(5);
308 const vec_s16 v1ss = vec_splat_s16(1);
309 const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
310 const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
/* Offset of (src - 2) within its 16-byte line; presumably the `ali`
 * argument of load_alignment() -- the call site is not visible here. */
312 register int align = ((((
unsigned long)
src) - 2) % 16);
314 vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
315 srcP2A, srcP2B, srcP3A, srcP3B,
316 srcM1A, srcM1B, srcM2A, srcM2B,
317 sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
318 pp1A, pp1B, pp2A, pp2B, psumA, psumB;
321 {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
322 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
/* Second cursor into the scratch buffer, used for the loads in pass 2. */
323 int16_t *tmpbis =
tmp;
325 vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
326 tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
329 vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
330 pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
331 pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
332 ssumAe, ssumAo, ssumBe, ssumBo;
/* Permute control vectors derived from the addresses src-2 .. src+3. */
337 permM2 = vec_lvsl(-2,
src);
338 permM1 = vec_lvsl(-1,
src);
339 permP0 = vec_lvsl(+0,
src);
340 permP1 = vec_lvsl(+1,
src);
341 permP2 = vec_lvsl(+2,
src);
342 permP3 = vec_lvsl(+3,
src);
/* Pass 1: begin two rows above src and run 21 iterations (16 + 5 extra
 * rows for the 6-tap vertical filter of pass 2). */
345 src -= (2 * srcStride);
346 for (
i = 0 ;
i < 21 ;
i ++) {
347 vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
/* Saturating sums of the three symmetric tap pairs (A and B halves). */
366 sum1A = vec_adds(srcP0A, srcP1A);
367 sum1B = vec_adds(srcP0B, srcP1B);
368 sum2A = vec_adds(srcM1A, srcP2A);
369 sum2B = vec_adds(srcM1B, srcP2B);
370 sum3A = vec_adds(srcM2A, srcP3A);
371 sum3B = vec_adds(srcM2B, srcP3B);
/* pp1 = 20 * sum1 + sum3 (no rounding term in this pass). */
373 pp1A = vec_mladd(sum1A, v20ss, sum3A);
374 pp1B = vec_mladd(sum1B, v20ss, sum3B);
/* pp2 = 5 * sum2. */
376 pp2A = vec_mladd(sum2A, v5ss,
zero_s16v);
377 pp2B = vec_mladd(sum2B, v5ss,
zero_s16v);
379 psumA = vec_sub(pp1A, pp2A);
380 psumB = vec_sub(pp1B, pp2B);
/* Store the unshifted 16-bit intermediate row into the scratch buffer. */
382 vec_st(psumA, 0,
tmp);
383 vec_st(psumB, 16,
tmp);
/* Pass 2 set-up: load the first five intermediate rows. Presumably tmpbis
 * is advanced between these loads; those statements are not visible here. */
389 tmpM2ssA = vec_ld(0, tmpbis);
390 tmpM2ssB = vec_ld(16, tmpbis);
392 tmpM1ssA = vec_ld(0, tmpbis);
393 tmpM1ssB = vec_ld(16, tmpbis);
395 tmpP0ssA = vec_ld(0, tmpbis);
396 tmpP0ssB = vec_ld(16, tmpbis);
398 tmpP1ssA = vec_ld(0, tmpbis);
399 tmpP1ssB = vec_ld(16, tmpbis);
401 tmpP2ssA = vec_ld(0, tmpbis);
402 tmpP2ssB = vec_ld(16, tmpbis);
/* Pass 2: 16 iterations; each one loads one new intermediate row and
 * stores one 16-byte result. */
405 for (
i = 0 ;
i < 16 ;
i++) {
406 const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
407 const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);
/* Saturating sums of the three symmetric tap pairs. */
409 const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
410 const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
411 const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
412 const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
413 vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
414 vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
/* 32-bit products of the even (e) and odd (o) 16-bit lanes: 20 * sum1. */
429 pp1Ae = vec_mule(sum1A, v20ss);
430 pp1Ao = vec_mulo(sum1A, v20ss);
431 pp1Be = vec_mule(sum1B, v20ss);
432 pp1Bo = vec_mulo(sum1B, v20ss);
/* 5 * sum2, even and odd lanes. */
434 pp2Ae = vec_mule(sum2A, v5ss);
435 pp2Ao = vec_mulo(sum2A, v5ss);
436 pp2Be = vec_mule(sum2B, v5ss);
437 pp2Bo = vec_mulo(sum2B, v5ss);
/* Odd lanes of sum3 widened to 32 bits by multiplying with 1. */
439 pp3Ao = vec_mulo(sum3A, v1ss);
440 pp3Bo = vec_mulo(sum3B, v1ss);
/* Even lanes of sum3 widened by reinterpreting as 32-bit and shifting
 * right arithmetically by 16. vcswapi2s is defined outside this excerpt;
 * presumably it reorders the halves where the byte order requires it. */
442 sum3A = (
vec_s16)vec_perm(sum3A, sum3A,vcswapi2s(0,1,2,3));
443 sum3B = (
vec_s16)vec_perm(sum3B, sum3B,vcswapi2s(0,1,2,3));
445 pp3Ae = vec_sra((
vec_s32)sum3A, v16ui);
446 pp3Be = vec_sra((
vec_s32)sum3B, v16ui);
/* Add the rounding term 512 to the 20 * sum1 products. */
448 pp1cAe = vec_add(pp1Ae, v512si);
449 pp1cAo = vec_add(pp1Ao, v512si);
450 pp1cBe = vec_add(pp1Be, v512si);
451 pp1cBo = vec_add(pp1Bo, v512si);
/* sum3 - 5 * sum2. */
453 pp32Ae = vec_sub(pp3Ae, pp2Ae);
454 pp32Ao = vec_sub(pp3Ao, pp2Ao);
455 pp32Be = vec_sub(pp3Be, pp2Be);
456 pp32Bo = vec_sub(pp3Bo, pp2Bo);
/* Full filter sum, then arithmetic shift right by 10. */
458 sumAe = vec_add(pp1cAe, pp32Ae);
459 sumAo = vec_add(pp1cAo, pp32Ao);
460 sumBe = vec_add(pp1cBe, pp32Be);
461 sumBo = vec_add(pp1cBo, pp32Bo);
463 ssumAe = vec_sra(sumAe, v10ui);
464 ssumAo = vec_sra(sumAo, v10ui);
465 ssumBe = vec_sra(sumBe, v10ui);
466 ssumBo = vec_sra(sumBo, v10ui);
/* Saturating packs: 32 -> 16 bit, then 16 -> unsigned 8 bit. After this
 * sumv holds the even-lane results in bytes 0..7 and the odd-lane results
 * in bytes 8..15; the mperm pattern (0, 8, 1, 9, ...) interleaves them. */
468 ssume = vec_packs(ssumAe, ssumBe);
469 ssumo = vec_packs(ssumAo, ssumBo);
471 sumv = vec_packsu(ssume, ssumo);
472 sum = vec_perm(sumv, sumv, mperm);
/* OP_U8_ALTIVEC is defined outside this excerpt; it produces fsum from the
 * filtered vector and the current contents of dst. */
476 OP_U8_ALTIVEC(fsum, sum, vec_ld(0,
dst));
478 vec_st(fsum, 0,
dst);