FFmpeg
hpeldsp_altivec.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2002 Brian Foley
3  * Copyright (c) 2002 Dieter Shirley
4  * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include "config.h"
24 
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/ppc/cpu.h"
29 
30 #include "libavcodec/hpeldsp.h"
31 
32 #include "hpeldsp_altivec.h"
33 
34 #if HAVE_ALTIVEC
35 /* next one assumes that ((line_size % 16) == 0) */
36 void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
37 {
38  register vector unsigned char pixelsv1;
39  register vector unsigned char pixelsv1B;
40  register vector unsigned char pixelsv1C;
41  register vector unsigned char pixelsv1D;
42 
43  int i;
44  register ptrdiff_t line_size_2 = line_size << 1;
45  register ptrdiff_t line_size_3 = line_size + line_size_2;
46  register ptrdiff_t line_size_4 = line_size << 2;
47 
48 // hand-unrolling the loop by 4 gains about 15%
49 // mininum execution time goes from 74 to 60 cycles
50 // it's faster than -funroll-loops, but using
51 // -funroll-loops w/ this is bad - 74 cycles again.
52 // all this is on a 7450, tuning for the 7450
53  for (i = 0; i < h; i += 4) {
54  pixelsv1 = unaligned_load( 0, pixels);
55  pixelsv1B = unaligned_load(line_size, pixels);
56  pixelsv1C = unaligned_load(line_size_2, pixels);
57  pixelsv1D = unaligned_load(line_size_3, pixels);
58  VEC_ST(pixelsv1, 0, (unsigned char*)block);
59  VEC_ST(pixelsv1B, line_size, (unsigned char*)block);
60  VEC_ST(pixelsv1C, line_size_2, (unsigned char*)block);
61  VEC_ST(pixelsv1D, line_size_3, (unsigned char*)block);
62  pixels+=line_size_4;
63  block +=line_size_4;
64  }
65 }
66 
67 /* next one assumes that ((line_size % 16) == 0) */
68 #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
69 void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
70 {
71  register vector unsigned char pixelsv, blockv;
72 
73  int i;
74  for (i = 0; i < h; i++) {
75  blockv = vec_ld(0, block);
76  pixelsv = VEC_LD( 0, pixels);
77  blockv = vec_avg(blockv,pixelsv);
78  vec_st(blockv, 0, (unsigned char*)block);
79  pixels+=line_size;
80  block +=line_size;
81  }
82 }
83 
84 /* next one assumes that ((line_size % 8) == 0) */
85 static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
86 {
87  register vector unsigned char pixelsv, blockv;
88  int i;
89 
90  for (i = 0; i < h; i++) {
91  /* block is 8 bytes-aligned, so we're either in the
92  left block (16 bytes-aligned) or in the right block (not) */
93  int rightside = ((unsigned long)block & 0x0000000F);
94 
95  blockv = vec_ld(0, block);
96  pixelsv = VEC_LD( 0, pixels);
97 
98  if (rightside) {
99  pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
100  } else {
101  pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
102  }
103 
104  blockv = vec_avg(blockv, pixelsv);
105 
106  vec_st(blockv, 0, block);
107 
108  pixels += line_size;
109  block += line_size;
110  }
111 }
112 
113 /* next one assumes that ((line_size % 8) == 0) */
114 static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
115 {
116  register int i;
117  register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
118  register vector unsigned char blockv;
119  register vector unsigned short pixelssum1, pixelssum2, temp3;
120  register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
121  register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
122 
123  pixelsv1 = VEC_LD(0, pixels);
124  pixelsv2 = VEC_LD(1, pixels);
125  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
126  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
127 
128  pixelssum1 = vec_add((vector unsigned short)pixelsv1,
129  (vector unsigned short)pixelsv2);
130  pixelssum1 = vec_add(pixelssum1, vctwo);
131 
132  for (i = 0; i < h ; i++) {
133  int rightside = ((unsigned long)block & 0x0000000F);
134  blockv = vec_ld(0, block);
135 
136  pixelsv1 = unaligned_load(line_size, pixels);
137  pixelsv2 = unaligned_load(line_size+1, pixels);
138  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
139  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
140  pixelssum2 = vec_add((vector unsigned short)pixelsv1,
141  (vector unsigned short)pixelsv2);
142  temp3 = vec_add(pixelssum1, pixelssum2);
143  temp3 = vec_sra(temp3, vctwo);
144  pixelssum1 = vec_add(pixelssum2, vctwo);
145  pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
146 
147  if (rightside) {
148  blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
149  } else {
150  blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
151  }
152 
153  vec_st(blockv, 0, block);
154 
155  block += line_size;
156  pixels += line_size;
157  }
158 }
159 
160 /* next one assumes that ((line_size % 8) == 0) */
161 static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
162 {
163  register int i;
164  register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
165  register vector unsigned char blockv;
166  register vector unsigned short pixelssum1, pixelssum2, temp3;
167  register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
168  register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
169  register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
170 
171  pixelsv1 = VEC_LD(0, pixels);
172  pixelsv2 = VEC_LD(1, pixels);
173  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
174  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
175  pixelssum1 = vec_add((vector unsigned short)pixelsv1,
176  (vector unsigned short)pixelsv2);
177  pixelssum1 = vec_add(pixelssum1, vcone);
178 
179  for (i = 0; i < h ; i++) {
180  int rightside = ((unsigned long)block & 0x0000000F);
181  blockv = vec_ld(0, block);
182 
183  pixelsv1 = unaligned_load(line_size, pixels);
184  pixelsv2 = unaligned_load(line_size+1, pixels);
185  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
186  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
187  pixelssum2 = vec_add((vector unsigned short)pixelsv1,
188  (vector unsigned short)pixelsv2);
189  temp3 = vec_add(pixelssum1, pixelssum2);
190  temp3 = vec_sra(temp3, vctwo);
191  pixelssum1 = vec_add(pixelssum2, vcone);
192  pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
193 
194  if (rightside) {
195  blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
196  } else {
197  blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
198  }
199 
200  vec_st(blockv, 0, block);
201 
202  block += line_size;
203  pixels += line_size;
204  }
205 }
206 
207 /* next one assumes that ((line_size % 16) == 0) */
208 static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
209 {
210  register int i;
211  register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
212  register vector unsigned char blockv;
213  register vector unsigned short temp3, temp4,
214  pixelssum1, pixelssum2, pixelssum3, pixelssum4;
215  register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
216  register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
217 
218  pixelsv1 = VEC_LD(0, pixels);
219  pixelsv2 = VEC_LD(1, pixels);
220  pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
221  pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
222  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
223  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
224  pixelssum3 = vec_add((vector unsigned short)pixelsv3,
225  (vector unsigned short)pixelsv4);
226  pixelssum3 = vec_add(pixelssum3, vctwo);
227  pixelssum1 = vec_add((vector unsigned short)pixelsv1,
228  (vector unsigned short)pixelsv2);
229  pixelssum1 = vec_add(pixelssum1, vctwo);
230 
231  for (i = 0; i < h ; i++) {
232  blockv = vec_ld(0, block);
233 
234  pixelsv1 = unaligned_load(line_size, pixels);
235  pixelsv2 = unaligned_load(line_size+1, pixels);
236 
237  pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
238  pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
239  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
240  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
241  pixelssum4 = vec_add((vector unsigned short)pixelsv3,
242  (vector unsigned short)pixelsv4);
243  pixelssum2 = vec_add((vector unsigned short)pixelsv1,
244  (vector unsigned short)pixelsv2);
245  temp4 = vec_add(pixelssum3, pixelssum4);
246  temp4 = vec_sra(temp4, vctwo);
247  temp3 = vec_add(pixelssum1, pixelssum2);
248  temp3 = vec_sra(temp3, vctwo);
249 
250  pixelssum3 = vec_add(pixelssum4, vctwo);
251  pixelssum1 = vec_add(pixelssum2, vctwo);
252 
253  blockv = vec_packsu(temp3, temp4);
254 
255  vec_st(blockv, 0, block);
256 
257  block += line_size;
258  pixels += line_size;
259  }
260 }
261 
262 /* next one assumes that ((line_size % 16) == 0) */
263 static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
264 {
265  register int i;
266  register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
267  register vector unsigned char blockv;
268  register vector unsigned short temp3, temp4,
269  pixelssum1, pixelssum2, pixelssum3, pixelssum4;
270  register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
271  register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
272  register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);
273 
274  pixelsv1 = VEC_LD(0, pixels);
275  pixelsv2 = VEC_LD(1, pixels);
276  pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
277  pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
278  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
279  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
280  pixelssum3 = vec_add((vector unsigned short)pixelsv3,
281  (vector unsigned short)pixelsv4);
282  pixelssum3 = vec_add(pixelssum3, vcone);
283  pixelssum1 = vec_add((vector unsigned short)pixelsv1,
284  (vector unsigned short)pixelsv2);
285  pixelssum1 = vec_add(pixelssum1, vcone);
286 
287  for (i = 0; i < h ; i++) {
288  pixelsv1 = unaligned_load(line_size, pixels);
289  pixelsv2 = unaligned_load(line_size+1, pixels);
290 
291  pixelsv3 = VEC_MERGEL(vczero, pixelsv1);
292  pixelsv4 = VEC_MERGEL(vczero, pixelsv2);
293  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
294  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
295  pixelssum4 = vec_add((vector unsigned short)pixelsv3,
296  (vector unsigned short)pixelsv4);
297  pixelssum2 = vec_add((vector unsigned short)pixelsv1,
298  (vector unsigned short)pixelsv2);
299  temp4 = vec_add(pixelssum3, pixelssum4);
300  temp4 = vec_sra(temp4, vctwo);
301  temp3 = vec_add(pixelssum1, pixelssum2);
302  temp3 = vec_sra(temp3, vctwo);
303 
304  pixelssum3 = vec_add(pixelssum4, vcone);
305  pixelssum1 = vec_add(pixelssum2, vcone);
306 
307  blockv = vec_packsu(temp3, temp4);
308 
309  VEC_ST(blockv, 0, block);
310 
311  block += line_size;
312  pixels += line_size;
313  }
314 }
315 
316 /* next one assumes that ((line_size % 8) == 0) */
317 static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
318 {
319  register int i;
320  register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
321  register vector unsigned char blockv, blocktemp;
322  register vector unsigned short pixelssum1, pixelssum2, temp3;
323 
324  register const vector unsigned char vczero = (const vector unsigned char)
325  vec_splat_u8(0);
326  register const vector unsigned short vctwo = (const vector unsigned short)
327  vec_splat_u16(2);
328 
329  pixelsv1 = VEC_LD(0, pixels);
330  pixelsv2 = VEC_LD(1, pixels);
331  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
332  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
333  pixelssum1 = vec_add((vector unsigned short)pixelsv1,
334  (vector unsigned short)pixelsv2);
335  pixelssum1 = vec_add(pixelssum1, vctwo);
336 
337  for (i = 0; i < h ; i++) {
338  int rightside = ((unsigned long)block & 0x0000000F);
339  blockv = vec_ld(0, block);
340 
341  pixelsv1 = unaligned_load(line_size, pixels);
342  pixelsv2 = unaligned_load(line_size+1, pixels);
343 
344  pixelsv1 = VEC_MERGEH(vczero, pixelsv1);
345  pixelsv2 = VEC_MERGEH(vczero, pixelsv2);
346  pixelssum2 = vec_add((vector unsigned short)pixelsv1,
347  (vector unsigned short)pixelsv2);
348  temp3 = vec_add(pixelssum1, pixelssum2);
349  temp3 = vec_sra(temp3, vctwo);
350  pixelssum1 = vec_add(pixelssum2, vctwo);
351  pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
352 
353  if (rightside) {
354  blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
355  } else {
356  blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
357  }
358 
359  blockv = vec_avg(blocktemp, blockv);
360  vec_st(blockv, 0, block);
361 
362  block += line_size;
363  pixels += line_size;
364  }
365 }
366 #endif /* HAVE_ALTIVEC */
367 
369 {
370 #if HAVE_ALTIVEC
372  return;
373 
374  c->avg_pixels_tab[0][0] = ff_avg_pixels16_altivec;
375  c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
376  c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
377 
378  c->put_pixels_tab[0][0] = ff_put_pixels16_altivec;
379  c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
380  c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
381 
382  c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec;
383  c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
384  c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
385 #endif /* HAVE_ALTIVEC */
386 }
ff_avg_pixels16_altivec
void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:101
ff_hpeldsp_init_ppc
av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
Definition: hpeldsp_altivec.c:368
av_cold
#define av_cold
Definition: attributes.h:90
s1
#define s1
Definition: regdef.h:38
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
ff_put_pixels16_altivec
void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
HpelDSPContext
Half-pel DSP context.
Definition: hpeldsp.h:45
PPC_ALTIVEC
#define PPC_ALTIVEC(flags)
Definition: cpu.h:25
cpu.h
attributes.h
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:269
hpeldsp_altivec.h
s0
#define s0
Definition: regdef.h:37
util_altivec.h
cpu.h
hpeldsp.h
flags
#define flags(name, subs,...)
Definition: cbs_av1.c:561
block
The exact code depends on how similar the blocks are and how related they are to the block
Definition: filter_design.txt:207
h
h
Definition: vp9dsp_template.c:2038
short
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf default minimum maximum flags name is the option keep it simple and lowercase description are short
Definition: writing_filters.txt:89