FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
hpeldsp_altivec.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2002 Brian Foley
3  * Copyright (c) 2002 Dieter Shirley
4  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
#include "config.h"

#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"

#include "libavcodec/hpeldsp.h"

#include "hpeldsp_altivec.h"
36 
37 #if HAVE_ALTIVEC
/* next one assumes that ((line_size % 16) == 0) */
/**
 * Copy a 16-wide block of pixels, h rows, from src to dst ("put", no
 * interpolation).  dst (block) must be 16-byte aligned; src (pixels) may be
 * unaligned and is realigned with the lvsl/perm idiom.
 */
void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    /* lvsl builds the shuffle mask that rotates two aligned loads into the
     * 16 bytes starting at the (possibly unaligned) pixels pointer. */
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register ptrdiff_t line_size_2 = line_size << 1;
    register ptrdiff_t line_size_3 = line_size + line_size_2;
    register ptrdiff_t line_size_4 = line_size << 2;

// hand-unrolling the loop by 4 gains about 15%
// mininum execution time goes from 74 to 60 cycles
// it's faster than -funroll-loops, but using
// -funroll-loops w/ this is bad - 74 cycles again.
// all this is on a 7450, tuning for the 7450
    /* NOTE(review): assumes h is a multiple of 4 — TODO confirm callers
     * guarantee this (hpeldsp is used with h = 8 or 16). */
    for (i = 0; i < h; i += 4) {
        /* Second load at offset 15 (not 16): vec_ld truncates the address to
         * 16-byte alignment, so offset 15 still fetches the quadword holding
         * the last needed byte without touching the next quadword when
         * pixels is already aligned. */
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(15, pixels);
        pixelsv1B = vec_ld(line_size, pixels);
        pixelsv2B = vec_ld(15 + line_size, pixels);
        pixelsv1C = vec_ld(line_size_2, pixels);
        pixelsv2C = vec_ld(15 + line_size_2, pixels);
        pixelsv1D = vec_ld(line_size_3, pixels);
        pixelsv2D = vec_ld(15 + line_size_3, pixels);
        /* vec_perm stitches each row's two aligned loads back into the
         * unaligned source data, then the aligned store writes it out. */
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels+=line_size_4;
        block +=line_size_4;
    }
}
78 
/* next one assumes that ((line_size % 16) == 0) */
/* NOTE(review): op_avg appears unused in this file — likely leftover from a
 * scalar fallback; the vector code uses vec_avg instead. */
#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
/**
 * Average a 16-wide block of pixels, h rows, into dst:
 * block[x] = rnd_avg(block[x], pixels[x]) via vec_avg (rounding average).
 * block must be 16-byte aligned; pixels may be unaligned.
 */
void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

    for (i = 0; i < h; i++) {
        /* NOTE(review): vec_ld(16, pixels) can touch the quadword after the
         * 16 needed bytes when pixels is aligned — relies on the usual
         * FFmpeg padded-buffer guarantee; confirm against allocators. */
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16,pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        /* vec_avg computes (a + b + 1) >> 1 per byte. */
        blockv = vec_avg(blockv,pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels+=line_size;
        block +=line_size;
    }
}
98 
/* next one assumes that ((line_size % 8) == 0) */
/**
 * Average an 8-wide block of pixels, h rows, into dst.
 * Stores are done as full 16-byte vectors, so the 8 untouched destination
 * bytes in the same quadword must be merged back in before the store.
 */
static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    for (i = 0; i < h; i++) {
        /* block is 8 bytes-aligned, so we're either in the
           left block (16 bytes-aligned) or in the right block (not) */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16, pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        /* Build a vector whose 8 "our half" bytes are the source pixels and
         * whose other 8 bytes are the destination's own bytes, so the
         * vec_avg below is a no-op (avg(x, x) == x) on the untouched half. */
        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        } else {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block += line_size;
    }
}
129 
/* next one assumes that ((line_size % 8) == 0) */
/**
 * 8-wide half-pel put with 2x2 bilinear interpolation (xy2):
 *   dst[x] = (p[x] + p[x+1] + p[x+line] + p[x+line+1] + 2) >> 2
 * The horizontal pair-sum of the previous row (pixelssum1) is carried across
 * loop iterations so each row's pairs are summed only once.  The +2 rounding
 * bias is folded into the carried sum.
 */
static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prologue: load row 0, build pixels[x] and pixels[x+1] vectors. */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    /* Special-case (pixels & 15) == 15: pixels+1 is then 16-byte aligned, so
     * vec_lvsl(1, pixels) wraps to 0 and would wrongly select from temp1;
     * the shifted data is exactly temp2. */
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    /* mergeh with zero widens the low 8 bytes to 8 unsigned shorts. */
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        /* Pair-sum of the next row, same unaligned-load dance as above. */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        /* prev-row sum (already +2) + this-row sum, then >>2 (vctwo doubles
         * as the shift count). */
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        /* Carry this row's sum (re-biased by +2) into the next iteration. */
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* Merge the 8 result bytes into the correct half of the 16-byte
         * destination quadword (block is only 8-byte aligned). */
        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
188 
/* next one assumes that ((line_size % 8) == 0) */
/**
 * 8-wide no-round half-pel put with 2x2 bilinear interpolation:
 *   dst[x] = (p[x] + p[x+1] + p[x+line] + p[x+line+1] + 1) >> 2
 * Identical structure to put_pixels8_xy2_altivec, except the rounding bias
 * folded into the carried row sum is +1 (vcone) instead of +2.
 */
static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prologue: horizontal pair-sum of row 0, biased by +1 (no-round). */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    /* (pixels & 15) == 15: lvsl(1, ...) wraps to 0, so use temp2 directly. */
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        /* >>2; vctwo is only the shift count here, not a bias. */
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* Write the 8 result bytes into the correct half of the quadword. */
        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
248 
/* next one assumes that ((line_size % 16) == 0) */
/**
 * 16-wide half-pel put with 2x2 bilinear interpolation (+2 rounding):
 *   dst[x] = (p[x] + p[x+1] + p[x+line] + p[x+line+1] + 2) >> 2
 * Same carried-row-sum scheme as the 8-wide version, but the 16 bytes are
 * widened into two short vectors (mergeh = low half, mergel = high half),
 * each with its own carried sum (pixelssum1 / pixelssum3).
 */
static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prologue: pair-sums of row 0 for both halves, +2 bias folded in. */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    /* (pixels & 15) == 15: lvsl(1, ...) wraps to 0, so use temp2 directly. */
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        /* (prev-row sum + this-row sum) >> 2, per half. */
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        /* Carry this row's sums (re-biased) into the next iteration. */
        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        /* Pack both short halves back into 16 bytes; block is 16-byte
         * aligned, so a straight store works. */
        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
316 
/* next one assumes that ((line_size % 16) == 0) */
/**
 * 16-wide no-round half-pel put with 2x2 bilinear interpolation:
 *   dst[x] = (p[x] + p[x+1] + p[x+line] + p[x+line+1] + 1) >> 2
 * Identical to put_pixels16_xy2_altivec except the bias folded into the
 * carried row sums is +1 (vcone) instead of +2.
 */
static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prologue: pair-sums of row 0 for both halves, +1 bias (no-round). */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    /* (pixels & 15) == 15: lvsl(1, ...) wraps to 0, so use temp2 directly. */
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        /* >>2; vctwo is only the shift count here. */
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
385 
/* next one assumes that ((line_size % 8) == 0) */
/**
 * 8-wide half-pel avg with 2x2 bilinear interpolation:
 *   tmp    = (p[x] + p[x+1] + p[x+line] + p[x+line+1] + 2) >> 2
 *   dst[x] = rnd_avg(dst[x], tmp)
 * Interpolation is identical to put_pixels8_xy2_altivec; the final vec_avg
 * blends the interpolated row with the existing destination.
 */
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;

    register const vector unsigned char vczero = (const vector unsigned char)
                                        vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)
                                        vec_splat_u16(2);

    /* Prologue: pair-sum of row 0, +2 rounding bias folded in. */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    /* (pixels & 15) == 15: lvsl(1, ...) wraps to 0, so use temp2 directly. */
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        /* Splice the 8 interpolated bytes into the correct half of the
         * destination quadword; the other half keeps blockv's own bytes so
         * the vec_avg below leaves it unchanged (avg(x, x) == x). */
        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
448 #endif /* HAVE_ALTIVEC */
449 
451 {
452 #if HAVE_ALTIVEC
454  return;
455 
457  c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
458  c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
459 
461  c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
462  c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
463 
465  c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
466  c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
467 #endif /* HAVE_ALTIVEC */
468 }