FFmpeg
h264qpel_template.c
/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

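/*
 * AltiVec/VSX template for the H.264 16x16 quarter-pel luma lowpass filters
 * (horizontal, vertical and combined horizontal+vertical 6-tap interpolation).
 * The file that includes this template is expected to define the PREFIX_*
 * function-name macros and OP_U8_ALTIVEC (plain store for "put", averaging
 * with the existing dst for "avg") before inclusion; see the sketch after the
 * last function below.
 */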
#include "config.h"
#if HAVE_UNISTD_H
#include <unistd.h>
#endif

#include "libavutil/avassert.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/util_altivec.h"

#define ASSERT_ALIGNED(ptr) av_assert2(!((uintptr_t)ptr&0x0000000F));

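/*
 * load_alignment() fetches the six byte vectors srcM2..srcP3, i.e. the pixels
 * at offsets -2..+3 around s that feed the 6-tap filter.  On big-endian
 * AltiVec this is done with aligned vec_ld() loads plus vec_perm() shuffles
 * selected by the alignment of (s - 2); cases 11-15 cover alignments where a
 * shifted vector coincides with or runs past the second aligned load.  On
 * little-endian VSX the unaligned vec_vsx_ld() loads are used directly.
 */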
#if HAVE_BIGENDIAN
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    vec_u8 srcR1 = vec_ld(-2, s);\
    vec_u8 srcR2 = vec_ld(14, s);\
    switch (ali) {\
    default: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = vec_perm(srcR1, srcR2, pp3);\
    } break;\
    case 11: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = srcR2;\
    } break;\
    case 12: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = srcR2;\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 13: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = srcR2;\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 14: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = srcR2;\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 15: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = srcR2;\
        srcP0 = vec_perm(srcR2, srcR3, pp0);\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    }\
}
#else
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    srcM2 = vec_vsx_ld(-2, s);\
    srcM1 = vec_vsx_ld(-1, s);\
    srcP0 = vec_vsx_ld(0, s);\
    srcP1 = vec_vsx_ld(1, s);\
    srcP2 = vec_vsx_ld(2, s);\
    srcP3 = vec_vsx_ld(3, s);\
}
#endif /* HAVE_BIGENDIAN */

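/*
 * Horizontal 6-tap lowpass: for each output pixel
 *   dst[x] = clip((src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1]
 *                  - 5*src[x+2] + src[x+3] + 16) >> 5)
 * computed on 16 pixels per row at a time.
 */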
/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

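    /*
     * Each of the 16 rows: widen the six byte vectors to 16-bit halves
     * (A = first 8 pixels, B = last 8), apply the (1, -5, 20, 20, -5, 1)
     * kernel with +16 rounding, shift right by 5, pack back to bytes with
     * unsigned saturation and store through OP_U8_ALTIVEC.
     */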
    for (i = 0 ; i < 16 ; i ++) {
        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_h_lowpass_altivec */

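/*
 * Vertical counterpart: the same 6-tap kernel applied down each column.  Six
 * consecutive rows are kept in registers and slid down by one row per
 * iteration, so each output row costs a single new row load.
 */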
/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 perm;
#if HAVE_BIGENDIAN
    perm = vec_lvsl(0, src);
#endif
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    const uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3;

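    /*
     * Sliding window: sum1 = P0+P1, sum2 = M1+P2 and sum3 = M2+P3 feed the
     * 20/-5/1 weights; after each row the registers are shifted down so only
     * srcP3 has to be reloaded.
     */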
    for (i = 0 ; i < 16 ; i++) {
        srcP3 = load_with_perm_vec(0, srcbis, perm);
        srcbis += srcStride;

        srcP3ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_v_lowpass_altivec */

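/*
 * The 2D (hv) case is split into two passes: a horizontal 6-tap filter whose
 * unrounded 16-bit results go to tmp[], followed by a vertical 6-tap filter
 * over tmp[] carried out in 32 bits, with a single final rounding of +512 and
 * a shift right by 10.
 */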
/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  const uint8_t *src,
                                                  int dstStride, int tmpStride,
                                                  int srcStride)
{
    register int i;
    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

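    /*
     * First pass: horizontal filter into tmp[], keeping the full (unrounded,
     * unshifted) 16-bit sums.  21 rows are produced: the 16 output rows plus
     * the 5 extra rows needed above and below by the vertical taps.
     */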
    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

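    /*
     * Second pass: vertical 6-tap filter over the 16-bit rows in tmp[].
     * The five rows M2..P2 are preloaded; the loop below pulls in P3 and
     * slides the window down one row per iteration.
     */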
    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

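    /*
     * The vertical accumulation can overflow 16 bits, so the products are
     * widened to 32-bit even/odd lanes with vec_mule/vec_mulo, rounded with
     * +512, shifted right by 10, packed with saturation and re-interleaved
     * with mperm so the bytes land back in source order.
     */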
    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;

        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Bo = vec_mulo(sum3B, v1ss);
#if !HAVE_BIGENDIAN
        sum3A = (vec_s16)vec_perm(sum3A, sum3A, vcswapi2s(0,1,2,3));
        sum3B = (vec_s16)vec_perm(sum3B, sum3B, vcswapi2s(0,1,2,3));
#endif
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif /* PREFIX_h264_qpel16_hv_lowpass_altivec */
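
/*
 * Illustrative sketch (not part of this file): how a wrapper such as the PPC
 * h264qpel C file would typically instantiate this template, once per store
 * flavour.  The macro bodies for the "put" and "avg" stores are assumptions
 * inferred from the OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst)) call sites
 * above, not copied from FFmpeg; the snippet is kept under #if 0 because the
 * real definitions belong in the including file.
 */
#if 0
#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s                /* plain store      */
#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(s, dst)  /* average with dst */

/* "put" flavour */
#define OP_U8_ALTIVEC                          PUT_OP_U8_ALTIVEC
#define PREFIX_h264_qpel16_h_lowpass_altivec   put_h264_qpel16_h_lowpass_altivec
#define PREFIX_h264_qpel16_v_lowpass_altivec   put_h264_qpel16_v_lowpass_altivec
#define PREFIX_h264_qpel16_hv_lowpass_altivec  put_h264_qpel16_hv_lowpass_altivec
#include "h264qpel_template.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_h264_qpel16_h_lowpass_altivec
#undef PREFIX_h264_qpel16_v_lowpass_altivec
#undef PREFIX_h264_qpel16_hv_lowpass_altivec

/* "avg" flavour: same template, averaging store */
#define OP_U8_ALTIVEC                          AVG_OP_U8_ALTIVEC
#define PREFIX_h264_qpel16_h_lowpass_altivec   avg_h264_qpel16_h_lowpass_altivec
#define PREFIX_h264_qpel16_v_lowpass_altivec   avg_h264_qpel16_v_lowpass_altivec
#define PREFIX_h264_qpel16_hv_lowpass_altivec  avg_h264_qpel16_hv_lowpass_altivec
#include "h264qpel_template.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_h264_qpel16_h_lowpass_altivec
#undef PREFIX_h264_qpel16_v_lowpass_altivec
#undef PREFIX_h264_qpel16_hv_lowpass_altivec
#endif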