fft_vsx.h
#ifndef AVCODEC_PPC_FFT_VSX_H
#define AVCODEC_PPC_FFT_VSX_H
/*
 * FFT transform, optimized with VSX built-in functions
 * Copyright (c) 2014 Rong Yan  Copyright (c) 2009 Loren Merritt
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein, and fft_altivec_s.S.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
#include "libavcodec/fft-internal.h"

#if HAVE_VSX

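/* Entry points, selected at init time when VSX is available: both compute an
 * in-place FFT of 1 << s->nbits points. ff_fft_calc_interleave_vsx() keeps
 * the standard interleaved FFTComplex layout, while ff_fft_calc_vsx() works
 * with the split re/im block layout used by the non-interleaved kernels
 * below. */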
void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z);

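/* Byte offsets of the 2nd, 4th, ..., 14th FFTComplex element, used as
 * displacements for vec_ld()/vec_st(). */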
#define byte_2complex (2*sizeof(FFTComplex))
#define byte_4complex (4*sizeof(FFTComplex))
#define byte_6complex (6*sizeof(FFTComplex))
#define byte_8complex (8*sizeof(FFTComplex))
#define byte_10complex (10*sizeof(FFTComplex))
#define byte_12complex (12*sizeof(FFTComplex))
#define byte_14complex (14*sizeof(FFTComplex))

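/* One split-radix recombination pass over interleaved FFTComplex data for a
 * transform of length 8*n: the quarter-size blocks at z, z+2*n, z+4*n and
 * z+6*n are merged using the twiddle factors wre (cosines) and
 * wim = wre + 2*n (mirrored sines). */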
inline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n)
{
    int o1 = n<<1;
    int o2 = n<<2;
    int o3 = o1+o2;
    int i1, i2, i3;
    FFTSample* out = (FFTSample*)z;
    const FFTSample *wim = wre+o1;
    vec_f vz0, vzo1, vzo2, vzo3;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1;
    vec_f y0, y1, y2, y3;
    vec_f y4, y5, y8, y9;
    vec_f y10, y13, y14, y15;
    vec_f y16, y17, y18, y19;
    vec_f y20, y21, y22, y23;
    vec_f wr1, wi1, wr0, wi0;
    vec_f wr2, wi2, wr3, wi3;
    vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3;

    n = n-2;
    i1 = o1*sizeof(FFTComplex);
    i2 = o2*sizeof(FFTComplex);
    i3 = o3*sizeof(FFTComplex);
    vzo2 = vec_ld(i2, &(out[0]));      // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
    vzo2plus1 = vec_ld(i2+16, &(out[0]));
    vzo3 = vec_ld(i3, &(out[0]));      // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
    vzo3plus1 = vec_ld(i3+16, &(out[0]));
    vz0 = vec_ld(0, &(out[0]));        // z0.r  z0.i  z1.r  z1.i
    vz0plus1 = vec_ld(16, &(out[0]));
    vzo1 = vec_ld(i1, &(out[0]));      // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
    vzo1plus1 = vec_ld(i1+16, &(out[0]));

    x0 = vec_add(vzo2, vzo3);
    x1 = vec_sub(vzo2, vzo3);
    y0 = vec_add(vzo2plus1, vzo3plus1);
    y1 = vec_sub(vzo2plus1, vzo3plus1);

    wr1 = vec_splats(wre[1]);
    wi1 = vec_splats(wim[-1]);
    wi2 = vec_splats(wim[-2]);
    wi3 = vec_splats(wim[-3]);
    wr2 = vec_splats(wre[2]);
    wr3 = vec_splats(wre[3]);

    x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
    x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2));

    y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
    y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
    y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
    y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));

    ymulwi2 = vec_mul(y4, wi2);
    ymulwi3 = vec_mul(y5, wi3);
    x4 = vec_mul(x2, wr1);
    x5 = vec_mul(x3, wi1);
    y8 = vec_madd(y2, wr2, ymulwi2);
    y9 = vec_msub(y2, wr2, ymulwi2);
    x6 = vec_add(x4, x5);
    x7 = vec_sub(x4, x5);
    y13 = vec_madd(y3, wr3, ymulwi3);
    y14 = vec_msub(y3, wr3, ymulwi3);

    x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3));
    y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
    y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));

    x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2));
    x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1));

    y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
    y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));

    x11 = vec_add(vz0, x9);
    x12 = vec_sub(vz0, x9);
    x13 = vec_add(vzo1, x10);
    x14 = vec_sub(vzo1, x10);

    y18 = vec_add(vz0plus1, y16);
    y19 = vec_sub(vz0plus1, y16);
    y20 = vec_add(vzo1plus1, y17);
    y21 = vec_sub(vzo1plus1, y17);

    x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3));
    x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3));
    y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
    y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));


    vec_st(x11, 0, &(out[0]));
    vec_st(y18, 16, &(out[0]));
    vec_st(x15, i1, &(out[0]));
    vec_st(y22, i1+16, &(out[0]));
    vec_st(x12, i2, &(out[0]));
    vec_st(y19, i2+16, &(out[0]));
    vec_st(x16, i3, &(out[0]));
    vec_st(y23, i3+16, &(out[0]));

    do {
        out += 8;
        wre += 4;
        wim -= 4;
        wr0 = vec_splats(wre[0]);
        wr1 = vec_splats(wre[1]);
        wi0 = vec_splats(wim[0]);
        wi1 = vec_splats(wim[-1]);

        wr2 = vec_splats(wre[2]);
        wr3 = vec_splats(wre[3]);
        wi2 = vec_splats(wim[-2]);
        wi3 = vec_splats(wim[-3]);

        vzo2 = vec_ld(i2, &(out[0]));      // zo2.r  zo2.i  z(o2+1).r  z(o2+1).i
        vzo2plus1 = vec_ld(i2+16, &(out[0]));
        vzo3 = vec_ld(i3, &(out[0]));      // zo3.r  zo3.i  z(o3+1).r  z(o3+1).i
        vzo3plus1 = vec_ld(i3+16, &(out[0]));
        vz0 = vec_ld(0, &(out[0]));        // z0.r  z0.i  z1.r  z1.i
        vz0plus1 = vec_ld(16, &(out[0]));
        vzo1 = vec_ld(i1, &(out[0]));      // zo1.r  zo1.i  z(o1+1).r  z(o1+1).i
        vzo1plus1 = vec_ld(i1+16, &(out[0]));

        x0 = vec_add(vzo2, vzo3);
        x1 = vec_sub(vzo2, vzo3);

        y0 = vec_add(vzo2plus1, vzo3plus1);
        y1 = vec_sub(vzo2plus1, vzo3plus1);

        x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0));
        x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
        x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1));
        x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3));

        y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
        y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
        xmulwi0 = vec_mul(x4, wi0);
        xmulwi1 = vec_mul(x5, wi1);

        y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
        y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));

        x8 = vec_madd(x2, wr0, xmulwi0);
        x9 = vec_msub(x2, wr0, xmulwi0);
        ymulwi2 = vec_mul(y4, wi2);
        ymulwi3 = vec_mul(y5, wi3);

        x13 = vec_madd(x3, wr1, xmulwi1);
        x14 = vec_msub(x3, wr1, xmulwi1);

        y8 = vec_madd(y2, wr2, ymulwi2);
        y9 = vec_msub(y2, wr2, ymulwi2);
        y13 = vec_madd(y3, wr3, ymulwi3);
        y14 = vec_msub(y3, wr3, ymulwi3);

        x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3));
        x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3));

        y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
        y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));

        x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2));
        x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1));

        y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
        y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));

        x18 = vec_add(vz0, x16);
        x19 = vec_sub(vz0, x16);
        x20 = vec_add(vzo1, x17);
        x21 = vec_sub(vzo1, x17);

        y18 = vec_add(vz0plus1, y16);
        y19 = vec_sub(vz0plus1, y16);
        y20 = vec_add(vzo1plus1, y17);
        y21 = vec_sub(vzo1plus1, y17);

        x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3));
        x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3));

        y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
        y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));

        vec_st(x18, 0, &(out[0]));
        vec_st(y18, 16, &(out[0]));
        vec_st(x22, i1, &(out[0]));
        vec_st(y22, i1+16, &(out[0]));
        vec_st(x19, i2, &(out[0]));
        vec_st(y19, i2+16, &(out[0]));
        vec_st(x23, i3, &(out[0]));
        vec_st(y23, i3+16, &(out[0]));
    } while (n-=2);
}

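/* 2-point FFT: a single butterfly on interleaved FFTComplex data. */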
inline static void fft2_vsx_interleave(FFTComplex *z)
{
    FFTSample r1, i1;

    r1 = z[0].re - z[1].re;
    z[0].re += z[1].re;
    z[1].re = r1;

    i1 = z[0].im - z[1].im;
    z[0].im += z[1].im;
    z[1].im = i1;
}

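/* In-place 4-point FFT on interleaved FFTComplex data: two radix-2 stages
 * implemented purely with vector permutes, adds and subtracts. */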
inline static void fft4_vsx_interleave(FFTComplex *z)
{
    vec_f a, b, c, d;
    float* out= (float*)z;
    a = vec_ld(0, &(out[0]));
    b = vec_ld(byte_2complex, &(out[0]));

    c = vec_perm(a, b, vcprm(0,1,s2,s1));
    d = vec_perm(a, b, vcprm(2,3,s0,s3));
    a = vec_add(c, d);
    b = vec_sub(c, d);

    c = vec_perm(a, b, vcprm(0,1,s0,s1));
    d = vec_perm(a, b, vcprm(2,3,s3,s2));

    a = vec_add(c, d);
    b = vec_sub(c, d);
    vec_st(a, 0, &(out[0]));
    vec_st(b, byte_2complex, &(out[0]));
}

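/* In-place 8-point FFT on interleaved FFTComplex data; the only non-trivial
 * twiddle factor needed is 1/sqrt(2). */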
inline static void fft8_vsx_interleave(FFTComplex *z)
{
    vec_f vz0, vz1, vz2, vz3;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f x24, x25, x26, x27;
    vec_f x28, x29, x30, x31;
    vec_f x32, x33, x34;

    float* out= (float*)z;
    vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));

    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1));
    x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3));

    x4 = vec_add(x0, x1);
    x5 = vec_sub(x0, x1);
    x6 = vec_add(x2, x3);
    x7 = vec_sub(x2, x3);

    x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1));
    x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2));
    x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1));
    x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3));

    x12 = vec_add(x8, x9);
    x13 = vec_sub(x8, x9);
    x14 = vec_add(x10, x11);
    x15 = vec_sub(x10, x11);
    x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1));
    x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1));
    x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1));
    x19 = vec_add(x16, x18); // z0.r  z2.r  z0.i  z2.i
    x20 = vec_sub(x16, x18); // z4.r  z6.r  z4.i  z6.i

    x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3));
    x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3));
    x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2));
    x24 = vec_add(x22, x23);
    x25 = vec_sub(x22, x23);
    x26 = vec_mul( vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1);

    x27 = vec_add(x21, x26); // z1.r  z7.r  z1.i  z3.i
    x28 = vec_sub(x21, x26); // z5.r  z3.r  z5.i  z7.i

    x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2));  // z0.r  z0.i  z1.r  z1.i
    x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3));  // z2.r  z2.i  z7.r  z3.i
    x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2));  // z4.r  z4.i  z5.r  z5.i
    x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3));  // z6.r  z6.i  z3.r  z7.i
    x33 = vec_perm(x30, x32, vcprm(0,1,s2,3));   // z2.r  z2.i  z3.r  z3.i
    x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r  z6.i  z7.r  z7.i

    vec_st(x29, 0, &(out[0]));
    vec_st(x33, byte_2complex, &(out[0]));
    vec_st(x31, byte_4complex, &(out[0]));
    vec_st(x34, byte_6complex, &(out[0]));
}

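/* In-place 16-point FFT on interleaved FFTComplex data, using the 1/sqrt(2)
 * twiddle and the ff_cos_16[1]/ff_cos_16[3] entries of the shared cosine
 * table. */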
inline static void fft16_vsx_interleave(FFTComplex *z)
{
    float* out= (float*)z;
    vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
    vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]};
    vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]};
    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f x24, x25, x26, x27;
    vec_f x28, x29, x30, x31;
    vec_f x32, x33, x34, x35;
    vec_f x36, x37, x38, x39;
    vec_f x40, x41, x42, x43;
    vec_f x44, x45, x46, x47;
    vec_f x48, x49, x50, x51;
    vec_f x52, x53, x54, x55;
    vec_f x56, x57, x58, x59;
    vec_f x60, x61, x62, x63;
    vec_f x64, x65, x66, x67;
    vec_f x68, x69, x70, x71;
    vec_f x72, x73, x74, x75;
    vec_f x76, x77, x78, x79;
    vec_f x80, x81, x82, x83;
    vec_f x84, x85, x86;

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));
    vz4 = vec_ld(byte_8complex, &(out[0]));
    vz5 = vec_ld(byte_10complex, &(out[0]));
    vz6 = vec_ld(byte_12complex, &(out[0]));
    vz7 = vec_ld(byte_14complex, &(out[0]));

    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
    x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));

    x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1));
    x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3));
    x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1));
    x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3));

    x8 = vec_add(x0, x1);
    x9 = vec_sub(x0, x1);
    x10 = vec_add(x2, x3);
    x11 = vec_sub(x2, x3);

    x12 = vec_add(x4, x5);
    x13 = vec_sub(x4, x5);
    x14 = vec_add(x6, x7);
    x15 = vec_sub(x6, x7);

    x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1));
    x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2));
    x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2));
    x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3));
    x20 = vec_perm(x12, x14, vcprm(0,1,s0, s1));
    x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3));
    x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1));
    x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2));

    x24 = vec_add(x16, x17);
    x25 = vec_sub(x16, x17);
    x26 = vec_add(x18, x19);
    x27 = vec_sub(x18, x19);
    x28 = vec_add(x20, x21);
    x29 = vec_sub(x20, x21);
    x30 = vec_add(x22, x23);
    x31 = vec_sub(x22, x23);

    x32 = vec_add(x24, x26);
    x33 = vec_sub(x24, x26);
    x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1));

    x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2));
    x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3));
    x37 = vec_add(x35, x36);
    x38 = vec_sub(x35, x36);
    x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0));

    x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3));
    x41 = vec_perm(x26, x37, vcprm(2,3,s3,s2));
    x42 = vec_add(x40, x41);
    x43 = vec_sub(x40, x41);
    x44 = vec_mul(x42, vc0);
    x45 = vec_mul(x43, vc0);

    x46 = vec_add(x34, x39); // z0.r  z0.i  z4.r  z4.i
    x47 = vec_sub(x34, x39); // z8.r  z8.i  z12.r  z12.i

    x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2));
    x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0));
    x50 = vec_add(x48, x49);
    x51 = vec_sub(x48, x49);
    x52 = vec_mul(x50, vc1);
    x53 = vec_mul(x50, vc2);
    x54 = vec_mul(x51, vc1);
    x55 = vec_mul(x51, vc2);

    x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3));
    x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0));
    x58 = vec_add(x56, x57);
    x59 = vec_sub(x56, x57);

    x60 = vec_perm(x54, x55, vcprm(1,0,3,2));
    x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2));
    x62 = vec_add(x52, x61);
    x63 = vec_sub(x52, x61);
    x64 = vec_add(x60, x53);
    x65 = vec_sub(x60, x53);
    x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2));
    x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2));

    x68 = vec_add(x58, x66); // z1.r  z1.i  z3.r  z3.i
    x69 = vec_sub(x58, x66); // z9.r  z9.i  z11.r  z11.i
    x70 = vec_add(x59, x67); // z5.r  z5.i  z15.r  z15.i
    x71 = vec_sub(x59, x67); // z13.r  z13.i  z7.r  z7.i

    x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3));
    x73 = vec_add(x25, x72);
    x74 = vec_sub(x25, x72);
    x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1));
    x76 = vec_perm(x44, x45, vcprm(3,2,s2,s3));
    x77 = vec_add(x75, x76); // z2.r  z2.i  z6.r  z6.i
    x78 = vec_sub(x75, x76); // z10.r  z10.i  z14.r  z14.i

    x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r  z0.i  z1.r  z1.i
    x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r  z2.i  z3.r  z3.i
    x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r  z4.i  z5.r  z5.i
    x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r  z6.i  z7.r  z7.i
    vec_st(x79, 0, &(out[0]));
    vec_st(x80, byte_2complex, &(out[0]));
    vec_st(x81, byte_4complex, &(out[0]));
    vec_st(x82, byte_6complex, &(out[0]));
    x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r  z8.i  z9.r  z9.i
    x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r  z10.i  z11.r  z11.i
    x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r  z12.i  z13.r  z13.i
    x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r  z14.i  z15.r  z15.i
    vec_st(x83, byte_8complex, &(out[0]));
    vec_st(x84, byte_10complex, &(out[0]));
    vec_st(x85, byte_12complex, &(out[0]));
    vec_st(x86, byte_14complex, &(out[0]));
}

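/* 4-point FFT that reads interleaved FFTComplex input and writes the split
 * layout used by ff_fft_calc_vsx: one vector of four real parts followed by
 * one vector of four imaginary parts. */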
inline static void fft4_vsx(FFTComplex *z)
{
    vec_f a, b, c, d;
    float* out= (float*)z;
    a = vec_ld(0, &(out[0]));
    b = vec_ld(byte_2complex, &(out[0]));

    c = vec_perm(a, b, vcprm(0,1,s2,s1));
    d = vec_perm(a, b, vcprm(2,3,s0,s3));
    a = vec_add(c, d);
    b = vec_sub(c, d);

    c = vec_perm(a, b, vcprm(0,s0,1,s1));
    d = vec_perm(a, b, vcprm(2,s3,3,s2));

    a = vec_add(c, d);
    b = vec_sub(c, d);

    c = vec_perm(a, b, vcprm(0,1,s0,s1));
    d = vec_perm(a, b, vcprm(2,3,s2,s3));

    vec_st(c, 0, &(out[0]));
    vec_st(d, byte_2complex, &(out[0]));
    return;
}

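/* 8-point FFT for the non-interleaved path; the +-1/sqrt(2) twiddles are
 * applied with fused multiply-adds (vc1, vc2). */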
inline static void fft8_vsx(FFTComplex *z)
{
    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7, vz8;

    float* out= (float*)z;
    vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));

    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));

    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);
    vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);

    vz3 = vec_madd(vz3, vc1, vc0);
    vz3 = vec_madd(vz8, vc2, vz3);

    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
    vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
    vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
    vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));


    vz2 = vec_sub(vz4, vz6);
    vz3 = vec_sub(vz5, vz7);

    vz0 = vec_add(vz4, vz6);
    vz1 = vec_add(vz5, vz7);

    vec_st(vz0, 0, &(out[0]));
    vec_st(vz1, byte_2complex, &(out[0]));
    vec_st(vz2, byte_4complex, &(out[0]));
    vec_st(vz3, byte_6complex, &(out[0]));
    return;
}

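/* 16-point FFT for the non-interleaved path: an 8-point transform of the
 * first half is combined with two 4-point transforms of the second half
 * using the length-16 twiddle factors in vc3/vc4/vc5. */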
inline static void fft16_vsx(FFTComplex *z)
{
    float* out= (float*)z;
    vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
    vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343};
    vec_f vc4 = {0.0, 0.38268343, sqrthalf, 0.92387953};
    vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953};

    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7;
    vec_f vz8, vz9, vz10, vz11;
    vec_f vz12, vz13;

    vz0 = vec_ld(byte_8complex, &(out[0]));
    vz1 = vec_ld(byte_10complex, &(out[0]));
    vz2 = vec_ld(byte_12complex, &(out[0]));
    vz3 = vec_ld(byte_14complex, &(out[0]));

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));

    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));
    vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));

    vz2 = vec_add(vz10, vz11);
    vz3 = vec_sub(vz10, vz11);
    vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
    vz0 = vec_add(vz8, vz9);
    vz1 = vec_sub(vz8, vz9);

    vz3 = vec_madd(vz3, vc1, vc0);
    vz3 = vec_madd(vz12, vc2, vz3);
    vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
    vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));

    vz0 = vec_add(vz8, vz9);
    vz1 = vec_sub(vz8, vz9);
    vz2 = vec_add(vz10, vz11);
    vz3 = vec_sub(vz10, vz11);

    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
    vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
    vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));

    vz2 = vec_sub(vz8, vz10);
    vz3 = vec_sub(vz9, vz11);
    vz0 = vec_add(vz8, vz10);
    vz1 = vec_add(vz9, vz11);

    vz8 = vec_madd(vz4, vc3, vc0);
    vz9 = vec_madd(vz5, vc3, vc0);
    vz10 = vec_madd(vz6, vc3, vc0);
    vz11 = vec_madd(vz7, vc3, vc0);

    vz8 = vec_madd(vz5, vc4, vz8);
    vz9 = vec_madd(vz4, vc5, vz9);
    vz10 = vec_madd(vz7, vc5, vz10);
    vz11 = vec_madd(vz6, vc4, vz11);

    vz12 = vec_sub(vz10, vz8);
    vz10 = vec_add(vz10, vz8);

    vz13 = vec_sub(vz9, vz11);
    vz11 = vec_add(vz9, vz11);

    vz4 = vec_sub(vz0, vz10);
    vz0 = vec_add(vz0, vz10);

    vz7 = vec_sub(vz3, vz12);
    vz3 = vec_add(vz3, vz12);

    vz5 = vec_sub(vz1, vz11);
    vz1 = vec_add(vz1, vz11);

    vz6 = vec_sub(vz2, vz13);
    vz2 = vec_add(vz2, vz13);

    vec_st(vz0, 0, &(out[0]));
    vec_st(vz1, byte_2complex, &(out[0]));
    vec_st(vz2, byte_4complex, &(out[0]));
    vec_st(vz3, byte_6complex, &(out[0]));
    vec_st(vz4, byte_8complex, &(out[0]));
    vec_st(vz5, byte_10complex, &(out[0]));
    vec_st(vz6, byte_12complex, &(out[0]));
    vec_st(vz7, byte_14complex, &(out[0]));
    return;
}

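/* Split-radix recombination pass for the non-interleaved layout: the same
 * butterflies as pass_vsx_interleave, but operating on blocks of four real
 * parts followed by four imaginary parts, with four twiddle factors loaded
 * per vector from wre/wim. */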
inline static void pass_vsx(FFTComplex * z, const FFTSample * wre, unsigned int n)
{
    int o1 = n<<1;
    int o2 = n<<2;
    int o3 = o1+o2;
    int i1, i2, i3;
    FFTSample* out = (FFTSample*)z;
    const FFTSample *wim = wre+o1;
    vec_f v0, v1, v2, v3;
    vec_f v4, v5, v6, v7;
    vec_f v8, v9, v10, v11;
    vec_f v12, v13;

    n = n-2;
    i1 = o1*sizeof(FFTComplex);
    i2 = o2*sizeof(FFTComplex);
    i3 = o3*sizeof(FFTComplex);

    v8 = vec_ld(0, &(wre[0]));
    v10 = vec_ld(0, &(wim[0]));
    v9 = vec_ld(0, &(wim[-4]));
    v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));

    v4 = vec_ld(i2, &(out[0]));
    v5 = vec_ld(i2+16, &(out[0]));
    v6 = vec_ld(i3, &(out[0]));
    v7 = vec_ld(i3+16, &(out[0]));
    v10 = vec_mul(v4, v8); // r2*wre
    v11 = vec_mul(v5, v8); // i2*wre
    v12 = vec_mul(v6, v8); // r3*wre
    v13 = vec_mul(v7, v8); // i3*wre

    v0 = vec_ld(0, &(out[0]));     // r0
    v3 = vec_ld(i1+16, &(out[0])); // i1
    v10 = vec_madd(v5, v9, v10);  // r2*wim
    v11 = vec_nmsub(v4, v9, v11); // i2*wim
    v12 = vec_nmsub(v7, v9, v12); // r3*wim
    v13 = vec_madd(v6, v9, v13);  // i3*wim

    v1 = vec_ld(16, &(out[0])); // i0
    v2 = vec_ld(i1, &(out[0])); // r1
    v8 = vec_sub(v12, v10);
    v12 = vec_add(v12, v10);
    v9 = vec_sub(v11, v13);
    v13 = vec_add(v11, v13);
    v4 = vec_sub(v0, v12);
    v0 = vec_add(v0, v12);
    v7 = vec_sub(v3, v8);
    v3 = vec_add(v3, v8);

    vec_st(v0, 0, &(out[0]));      // r0
    vec_st(v3, i1+16, &(out[0])); // i1
    vec_st(v4, i2, &(out[0]));    // r2
    vec_st(v7, i3+16, &(out[0])); // i3

    v5 = vec_sub(v1, v13);
    v1 = vec_add(v1, v13);
    v6 = vec_sub(v2, v9);
    v2 = vec_add(v2, v9);

    vec_st(v1, 16, &(out[0]));    // i0
    vec_st(v2, i1, &(out[0]));    // r1
    vec_st(v5, i2+16, &(out[0])); // i2
    vec_st(v6, i3, &(out[0]));    // r3

    do {
        out += 8;
        wre += 4;
        wim -= 4;

        v8 = vec_ld(0, &(wre[0]));
        v10 = vec_ld(0, &(wim[0]));
        v9 = vec_ld(0, &(wim[-4]));
        v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));

        v4 = vec_ld(i2, &(out[0]));    // r2
        v5 = vec_ld(i2+16, &(out[0])); // i2
        v6 = vec_ld(i3, &(out[0]));    // r3
        v7 = vec_ld(i3+16, &(out[0])); // i3
        v10 = vec_mul(v4, v8); // r2*wre
        v11 = vec_mul(v5, v8); // i2*wre
        v12 = vec_mul(v6, v8); // r3*wre
        v13 = vec_mul(v7, v8); // i3*wre

        v0 = vec_ld(0, &(out[0]));     // r0
        v3 = vec_ld(i1+16, &(out[0])); // i1
        v10 = vec_madd(v5, v9, v10);  // r2*wim
        v11 = vec_nmsub(v4, v9, v11); // i2*wim
        v12 = vec_nmsub(v7, v9, v12); // r3*wim
        v13 = vec_madd(v6, v9, v13);  // i3*wim

        v1 = vec_ld(16, &(out[0])); // i0
        v2 = vec_ld(i1, &(out[0])); // r1
        v8 = vec_sub(v12, v10);
        v12 = vec_add(v12, v10);
        v9 = vec_sub(v11, v13);
        v13 = vec_add(v11, v13);
        v4 = vec_sub(v0, v12);
        v0 = vec_add(v0, v12);
        v7 = vec_sub(v3, v8);
        v3 = vec_add(v3, v8);

        vec_st(v0, 0, &(out[0]));     // r0
        vec_st(v3, i1+16, &(out[0])); // i1
        vec_st(v4, i2, &(out[0]));    // r2
        vec_st(v7, i3+16, &(out[0])); // i3

        v5 = vec_sub(v1, v13);
        v1 = vec_add(v1, v13);
        v6 = vec_sub(v2, v9);
        v2 = vec_add(v2, v9);

        vec_st(v1, 16, &(out[0]));    // i0
        vec_st(v2, i1, &(out[0]));    // r1
        vec_st(v5, i2+16, &(out[0])); // i2
        vec_st(v6, i3, &(out[0]));    // r3
    } while (n-=2);
}

#endif

#endif /* AVCODEC_PPC_FFT_VSX_H */