h264qpel_template.c
/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#if HAVE_UNISTD_H
#include <unistd.h>
#endif

#include "libavutil/avassert.h"
#include "libavutil/mem.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"

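/* vec_st() below stores whole aligned quadwords, so the destination pointers
 * must be 16-byte aligned; ASSERT_ALIGNED() verifies this for dst under
 * av_assert2(). */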
#define ASSERT_ALIGNED(ptr) av_assert2(!((unsigned long)ptr&0x0000000F));
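
/* load_alignment(): fetch the six byte vectors src[-2] .. src[+3] for one row.
 * The big-endian AltiVec version combines aligned vec_ld loads with
 * vec_lvsl/vec_perm shuffles; the switch on the alignment reuses srcR2
 * directly when a source vector starts exactly on a 16-byte boundary and
 * loads a third quadword (srcR3) only for the alignments that need it.
 * The little-endian version simply uses VSX unaligned loads. */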

#if HAVE_BIGENDIAN
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    vec_u8 srcR1 = vec_ld(-2, s);\
    vec_u8 srcR2 = vec_ld(14, s);\
    switch (ali) {\
    default: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = vec_perm(srcR1, srcR2, pp3);\
    } break;\
    case 11: {\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = vec_perm(srcR1, srcR2, pp2);\
        srcP3 = srcR2;\
    } break;\
    case 12: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = vec_perm(srcR1, srcR2, pp1);\
        srcP2 = srcR2;\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 13: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = vec_perm(srcR1, srcR2, pp0);\
        srcP1 = srcR2;\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 14: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = vec_perm(srcR1, srcR2, pm1);\
        srcP0 = srcR2;\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    case 15: {\
        vec_u8 srcR3 = vec_ld(30, s);\
        srcM2 = vec_perm(srcR1, srcR2, pm2);\
        srcM1 = srcR2;\
        srcP0 = vec_perm(srcR2, srcR3, pp0);\
        srcP1 = vec_perm(srcR2, srcR3, pp1);\
        srcP2 = vec_perm(srcR2, srcR3, pp2);\
        srcP3 = vec_perm(srcR2, srcR3, pp3);\
    } break;\
    }\
}
#else
#define load_alignment(s, ali, pm2, pm1, pp0, pp1, pp2, pp3){\
    srcM2 = vec_vsx_ld(-2, s);\
    srcM1 = vec_vsx_ld(-1, s);\
    srcP0 = vec_vsx_ld(0, s);\
    srcP1 = vec_vsx_ld(1, s);\
    srcP2 = vec_vsx_ld(2, s);\
    srcP3 = vec_vsx_ld(3, s);\
}
#endif /* HAVE_BIGENDIAN */
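
/* The lowpass routines below implement the H.264 six-tap half-pel filter
 * with coefficients (1, -5, 20, 20, -5, 1).  The h and v versions round the
 * result with (+16) >> 5 and saturate to 8 bits (vec_packsu); the hv version
 * stores the unrounded horizontal pass in tmp and rounds once at the end
 * with (+512) >> 10.  Each iteration processes 16 pixels, split into two
 * 8-element 16-bit halves (the A and B variables). */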

/* this code assumes stride % 16 == 0 */
#ifdef PREFIX_h264_qpel16_h_lowpass_altivec
static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB;

    vec_u8 sum, fsum;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

    for (i = 0 ; i < 16 ; i ++) {
        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);
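
        /* zero-extend the 16 bytes to 16-bit: the A vectors hold pixels 0-7,
         * the B vectors pixels 8-15 */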

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        src += srcStride;
        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 */
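/* The vertical pass keeps the five previous rows in registers, so each
 * iteration loads only one new row (srcP3) and rotates the rest. */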
#ifdef PREFIX_h264_qpel16_v_lowpass_altivec
static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t *dst,
                                                 const uint8_t *src,
                                                 int dstStride, int srcStride)
{
    register int i;

    LOAD_ZERO;
    vec_u8 perm;
#if HAVE_BIGENDIAN
    perm = vec_lvsl(0, src);
#endif
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u16 v5us = vec_splat_u16(5);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));

    const uint8_t *srcbis = src - (srcStride * 2);

    const vec_u8 srcM2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcM1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP0 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP1 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;
    const vec_u8 srcP2 = load_with_perm_vec(0, srcbis, perm);
    srcbis += srcStride;

    vec_s16 srcM2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
    vec_s16 srcM2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);
    vec_s16 srcM1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
    vec_s16 srcM1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
    vec_s16 srcP0ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
    vec_s16 srcP0ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
    vec_s16 srcP1ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
    vec_s16 srcP1ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);
    vec_s16 srcP2ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
    vec_s16 srcP2ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);

    vec_s16 pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
            psumA, psumB, sumA, sumB,
            srcP3ssA, srcP3ssB,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;

    vec_u8 sum, fsum, srcP3;

    for (i = 0 ; i < 16 ; i++) {
        srcP3 = load_with_perm_vec(0, srcbis, perm);
        srcbis += srcStride;

        srcP3ssA = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3ssB = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        sum1A = vec_adds(srcP0ssA, srcP1ssA);
        sum1B = vec_adds(srcP0ssB, srcP1ssB);
        sum2A = vec_adds(srcM1ssA, srcP2ssA);
        sum2B = vec_adds(srcM1ssB, srcP2ssB);
        sum3A = vec_adds(srcM2ssA, srcP3ssA);
        sum3B = vec_adds(srcM2ssB, srcP3ssB);

        srcM2ssA = srcM1ssA;
        srcM2ssB = srcM1ssB;
        srcM1ssA = srcP0ssA;
        srcM1ssB = srcP0ssB;
        srcP0ssA = srcP1ssA;
        srcP0ssB = srcP1ssB;
        srcP1ssA = srcP2ssA;
        srcP1ssB = srcP2ssB;
        srcP2ssA = srcP3ssA;
        srcP2ssB = srcP3ssB;

        pp1A = vec_mladd(sum1A, v20ss, v16ss);
        pp1B = vec_mladd(sum1B, v20ss, v16ss);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        pp3A = vec_add(sum3A, pp1A);
        pp3B = vec_add(sum3B, pp1B);

        psumA = vec_sub(pp3A, pp2A);
        psumB = vec_sub(pp3B, pp2B);

        sumA = vec_sra(psumA, v5us);
        sumB = vec_sra(psumB, v5us);

        sum = vec_packsu(sumA, sumB);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif

/* this code assumes stride % 16 == 0 *and* tmp is properly aligned */
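/* 2D (hv) half-pel: the first loop runs the horizontal six-tap filter over
 * the 21 rows needed (16 output rows plus 5 extra for the vertical taps) and
 * stores the unrounded 16-bit results in tmp; the second loop applies the
 * vertical filter to tmp with 32-bit intermediates and rounds with
 * (+512) >> 10. */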
#ifdef PREFIX_h264_qpel16_hv_lowpass_altivec
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t *dst, int16_t *tmp,
                                                  const uint8_t *src,
                                                  int dstStride, int tmpStride,
                                                  int srcStride)
{
    register int i;
    LOAD_ZERO;
    vec_u8 permM2, permM1, permP0, permP1, permP2, permP3;
    const vec_s16 v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
    const vec_u32 v10ui = vec_splat_u32(10);
    const vec_s16 v5ss = vec_splat_s16(5);
    const vec_s16 v1ss = vec_splat_s16(1);
    const vec_s32 v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
    const vec_u32 v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));

    register int align = ((((unsigned long)src) - 2) % 16);

    vec_s16 srcP0A, srcP0B, srcP1A, srcP1B,
            srcP2A, srcP2B, srcP3A, srcP3B,
            srcM1A, srcM1B, srcM2A, srcM2B,
            sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
            pp1A, pp1B, pp2A, pp2B, psumA, psumB;

    const vec_u8 mperm = (const vec_u8)
        {0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
         0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F};
    int16_t *tmpbis = tmp;

    vec_s16 tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
            tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
            tmpP2ssA, tmpP2ssB;

    vec_s32 pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
            pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
            pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
            ssumAe, ssumAo, ssumBe, ssumBo;
    vec_u8 fsum, sumv, sum;
    vec_s16 ssume, ssumo;

#if HAVE_BIGENDIAN
    permM2 = vec_lvsl(-2, src);
    permM1 = vec_lvsl(-1, src);
    permP0 = vec_lvsl(+0, src);
    permP1 = vec_lvsl(+1, src);
    permP2 = vec_lvsl(+2, src);
    permP3 = vec_lvsl(+3, src);
#endif /* HAVE_BIGENDIAN */

    src -= (2 * srcStride);
    for (i = 0 ; i < 21 ; i ++) {
        vec_u8 srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;

        load_alignment(src, align, permM2, permM1, permP0, permP1, permP2, permP3);

        srcP0A = (vec_s16) VEC_MERGEH(zero_u8v, srcP0);
        srcP0B = (vec_s16) VEC_MERGEL(zero_u8v, srcP0);
        srcP1A = (vec_s16) VEC_MERGEH(zero_u8v, srcP1);
        srcP1B = (vec_s16) VEC_MERGEL(zero_u8v, srcP1);

        srcP2A = (vec_s16) VEC_MERGEH(zero_u8v, srcP2);
        srcP2B = (vec_s16) VEC_MERGEL(zero_u8v, srcP2);
        srcP3A = (vec_s16) VEC_MERGEH(zero_u8v, srcP3);
        srcP3B = (vec_s16) VEC_MERGEL(zero_u8v, srcP3);

        srcM1A = (vec_s16) VEC_MERGEH(zero_u8v, srcM1);
        srcM1B = (vec_s16) VEC_MERGEL(zero_u8v, srcM1);
        srcM2A = (vec_s16) VEC_MERGEH(zero_u8v, srcM2);
        srcM2B = (vec_s16) VEC_MERGEL(zero_u8v, srcM2);

        sum1A = vec_adds(srcP0A, srcP1A);
        sum1B = vec_adds(srcP0B, srcP1B);
        sum2A = vec_adds(srcM1A, srcP2A);
        sum2B = vec_adds(srcM1B, srcP2B);
        sum3A = vec_adds(srcM2A, srcP3A);
        sum3B = vec_adds(srcM2B, srcP3B);

        pp1A = vec_mladd(sum1A, v20ss, sum3A);
        pp1B = vec_mladd(sum1B, v20ss, sum3B);

        pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
        pp2B = vec_mladd(sum2B, v5ss, zero_s16v);

        psumA = vec_sub(pp1A, pp2A);
        psumB = vec_sub(pp1B, pp2B);

        vec_st(psumA, 0, tmp);
        vec_st(psumB, 16, tmp);

        src += srcStride;
        tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
    }

    tmpM2ssA = vec_ld(0, tmpbis);
    tmpM2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpM1ssA = vec_ld(0, tmpbis);
    tmpM1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP0ssA = vec_ld(0, tmpbis);
    tmpP0ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP1ssA = vec_ld(0, tmpbis);
    tmpP1ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;
    tmpP2ssA = vec_ld(0, tmpbis);
    tmpP2ssB = vec_ld(16, tmpbis);
    tmpbis += tmpStride;

    for (i = 0 ; i < 16 ; i++) {
        const vec_s16 tmpP3ssA = vec_ld(0, tmpbis);
        const vec_s16 tmpP3ssB = vec_ld(16, tmpbis);

        const vec_s16 sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
        const vec_s16 sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
        const vec_s16 sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
        const vec_s16 sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
        vec_s16 sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
        vec_s16 sum3B = vec_adds(tmpM2ssB, tmpP3ssB);

        tmpbis += tmpStride;

        tmpM2ssA = tmpM1ssA;
        tmpM2ssB = tmpM1ssB;
        tmpM1ssA = tmpP0ssA;
        tmpM1ssB = tmpP0ssB;
        tmpP0ssA = tmpP1ssA;
        tmpP0ssB = tmpP1ssB;
        tmpP1ssA = tmpP2ssA;
        tmpP1ssB = tmpP2ssB;
        tmpP2ssA = tmpP3ssA;
        tmpP2ssB = tmpP3ssB;
        pp1Ae = vec_mule(sum1A, v20ss);
        pp1Ao = vec_mulo(sum1A, v20ss);
        pp1Be = vec_mule(sum1B, v20ss);
        pp1Bo = vec_mulo(sum1B, v20ss);

        pp2Ae = vec_mule(sum2A, v5ss);
        pp2Ao = vec_mulo(sum2A, v5ss);
        pp2Be = vec_mule(sum2B, v5ss);
        pp2Bo = vec_mulo(sum2B, v5ss);

        pp3Ao = vec_mulo(sum3A, v1ss);
        pp3Bo = vec_mulo(sum3B, v1ss);
#if !HAVE_BIGENDIAN
        sum3A = (vec_s16)vec_perm(sum3A, sum3A,vcswapi2s(0,1,2,3));
        sum3B = (vec_s16)vec_perm(sum3B, sum3B,vcswapi2s(0,1,2,3));
#endif
        pp3Ae = vec_sra((vec_s32)sum3A, v16ui);
        pp3Be = vec_sra((vec_s32)sum3B, v16ui);

        pp1cAe = vec_add(pp1Ae, v512si);
        pp1cAo = vec_add(pp1Ao, v512si);
        pp1cBe = vec_add(pp1Be, v512si);
        pp1cBo = vec_add(pp1Bo, v512si);

        pp32Ae = vec_sub(pp3Ae, pp2Ae);
        pp32Ao = vec_sub(pp3Ao, pp2Ao);
        pp32Be = vec_sub(pp3Be, pp2Be);
        pp32Bo = vec_sub(pp3Bo, pp2Bo);

        sumAe = vec_add(pp1cAe, pp32Ae);
        sumAo = vec_add(pp1cAo, pp32Ao);
        sumBe = vec_add(pp1cBe, pp32Be);
        sumBo = vec_add(pp1cBo, pp32Bo);

        ssumAe = vec_sra(sumAe, v10ui);
        ssumAo = vec_sra(sumAo, v10ui);
        ssumBe = vec_sra(sumBe, v10ui);
        ssumBo = vec_sra(sumBo, v10ui);

        ssume = vec_packs(ssumAe, ssumBe);
        ssumo = vec_packs(ssumAo, ssumBo);

        sumv = vec_packsu(ssume, ssumo);
        sum = vec_perm(sumv, sumv, mperm);

        ASSERT_ALIGNED(dst);

        OP_U8_ALTIVEC(fsum, sum, vec_ld(0, dst));

        vec_st(fsum, 0, dst);

        dst += dstStride;
    }
}
#endif
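
This file is a template: every function above is guarded by an #ifdef on its PREFIX_* name, and the store operation OP_U8_ALTIVEC is supplied by the including file. The sketch below shows how a "put" instantiation of the template might look, assuming the usual FFmpeg pattern in libavcodec/ppc/h264qpel.c; the PUT/AVG macro spellings and the resulting function names are illustrative, not a verbatim copy of that file.

/* Illustrative sketch of an includer, not part of this file. */
#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s               /* plain store            */
#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s) /* average with old dst   */

#define OP_U8_ALTIVEC                         PUT_OP_U8_ALTIVEC
#define PREFIX_h264_qpel16_h_lowpass_altivec  put_h264_qpel16_h_lowpass_altivec
#define PREFIX_h264_qpel16_v_lowpass_altivec  put_h264_qpel16_v_lowpass_altivec
#define PREFIX_h264_qpel16_hv_lowpass_altivec put_h264_qpel16_hv_lowpass_altivec
#include "h264qpel_template.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_h264_qpel16_h_lowpass_altivec
#undef PREFIX_h264_qpel16_v_lowpass_altivec
#undef PREFIX_h264_qpel16_hv_lowpass_altivec

/* Repeating the block with OP_U8_ALTIVEC mapped to AVG_OP_U8_ALTIVEC and
 * avg_* function names would produce the averaging variants. */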