fft_vsx.h
#ifndef AVCODEC_PPC_FFT_VSX_H
#define AVCODEC_PPC_FFT_VSX_H
/*
 * FFT transform, optimized with VSX built-in functions
 * Copyright (c) 2014 Rong Yan
 * Copyright (c) 2009 Loren Merritt
 *
 * This algorithm (though not any of the implementation details) is
 * based on libdjbfft by D. J. Bernstein, and fft_altivec_s.S.
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */


#include "config.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/types_altivec.h"
#include "libavutil/ppc/util_altivec.h"
#include "libavcodec/fft.h"
#include "libavcodec/fft-internal.h"

#if HAVE_VSX

void ff_fft_calc_interleave_vsx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_vsx(FFTContext *s, FFTComplex *z);

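/*
 * Callers do not normally invoke these two entry points directly: on a
 * VSX-capable machine, ff_fft_init() installs them behind FFTContext's
 * function pointers.  A minimal usage sketch through the generic
 * libavcodec/fft.h API (error handling omitted):
 *
 *     FFTContext ctx;
 *     ff_fft_init(&ctx, 4, 0);      // 2^4 = 16-point forward transform
 *     ctx.fft_permute(&ctx, z);     // bit-reversal reordering of z[]
 *     ctx.fft_calc(&ctx, z);        // in-place FFT, dispatches here
 *     ff_fft_end(&ctx);
 */
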
#define byte_2complex  (2*sizeof(FFTComplex))
#define byte_4complex  (4*sizeof(FFTComplex))
#define byte_6complex  (6*sizeof(FFTComplex))
#define byte_8complex  (8*sizeof(FFTComplex))
#define byte_10complex (10*sizeof(FFTComplex))
#define byte_12complex (12*sizeof(FFTComplex))
#define byte_14complex (14*sizeof(FFTComplex))

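/*
 * pass_vsx_interleave() performs one split-radix combine pass over
 * interleaved FFTComplex data, merging the sub-transforms at z, z+o1,
 * z+o2 and z+o3 with the twiddle factors wre[]/wim[] (wim is walked
 * backwards).  vec_ld()/vec_st() address memory as base pointer plus a
 * byte displacement, which is what i1/i2/i3 and the byte_*complex macros
 * above provide.  Rough scalar sketch of the per-column butterfly (my
 * paraphrase of TRANSFORM() in libavcodec/fft_template.c, with
 * w = wre[k] + I*wim[-k]):
 *
 *     ta = z[o2+k] * conj(w);
 *     tb = z[o3+k] * w;
 *     z[o2+k] = z[k]    - (ta + tb);    z[k]     += (ta + tb);
 *     z[o3+k] = z[o1+k] + I*(ta - tb);  z[o1+k] -= I*(ta - tb);
 */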
inline static void pass_vsx_interleave(FFTComplex *z, const FFTSample *wre, unsigned int n)
{
    int o1 = n<<1;
    int o2 = n<<2;
    int o3 = o1+o2;
    int i1, i2, i3;
    FFTSample* out = (FFTSample*)z;
    const FFTSample *wim = wre+o1;
    vec_f vz0, vzo1, vzo2, vzo3;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f vz0plus1, vzo1plus1, vzo2plus1, vzo3plus1;
    vec_f y0, y1, y2, y3;
    vec_f y4, y5, y8, y9;
    vec_f y10, y13, y14, y15;
    vec_f y16, y17, y18, y19;
    vec_f y20, y21, y22, y23;
    vec_f wr1, wi1, wr0, wi0;
    vec_f wr2, wi2, wr3, wi3;
    vec_f xmulwi0, xmulwi1, ymulwi2, ymulwi3;

    n = n-2;
    i1 = o1*sizeof(FFTComplex);
    i2 = o2*sizeof(FFTComplex);
    i3 = o3*sizeof(FFTComplex);
    vzo2 = vec_ld(i2, &(out[0]));  // zo2.r zo2.i z(o2+1).r z(o2+1).i
    vzo2plus1 = vec_ld(i2+16, &(out[0]));
    vzo3 = vec_ld(i3, &(out[0]));  // zo3.r zo3.i z(o3+1).r z(o3+1).i
    vzo3plus1 = vec_ld(i3+16, &(out[0]));
    vz0 = vec_ld(0, &(out[0]));    // z0.r z0.i z1.r z1.i
    vz0plus1 = vec_ld(16, &(out[0]));
    vzo1 = vec_ld(i1, &(out[0]));  // zo1.r zo1.i z(o1+1).r z(o1+1).i
    vzo1plus1 = vec_ld(i1+16, &(out[0]));

    x0 = vec_add(vzo2, vzo3);
    x1 = vec_sub(vzo2, vzo3);
    y0 = vec_add(vzo2plus1, vzo3plus1);
    y1 = vec_sub(vzo2plus1, vzo3plus1);

    wr1 = vec_splats(wre[1]);
    wi1 = vec_splats(wim[-1]);
    wi2 = vec_splats(wim[-2]);
    wi3 = vec_splats(wim[-3]);
    wr2 = vec_splats(wre[2]);
    wr3 = vec_splats(wre[3]);

    x2 = vec_perm(x0, x1, vcprm(2,s2,3,s3));
    x3 = vec_perm(x0, x1, vcprm(s3,3,s2,2));

    y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
    y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));
    y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
    y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));

    ymulwi2 = vec_mul(y4, wi2);
    ymulwi3 = vec_mul(y5, wi3);
    x4 = vec_mul(x2, wr1);
    x5 = vec_mul(x3, wi1);
    y8 = vec_madd(y2, wr2, ymulwi2);
    y9 = vec_msub(y2, wr2, ymulwi2);
    x6 = vec_add(x4, x5);
    x7 = vec_sub(x4, x5);
    y13 = vec_madd(y3, wr3, ymulwi3);
    y14 = vec_msub(y3, wr3, ymulwi3);

    x8 = vec_perm(x6, x7, vcprm(0,1,s2,s3));
    y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
    y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));

    x9 = vec_perm(x0, x8, vcprm(0,1,s0,s2));
    x10 = vec_perm(x1, x8, vcprm(1,0,s3,s1));

    y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
    y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));

    x11 = vec_add(vz0, x9);
    x12 = vec_sub(vz0, x9);
    x13 = vec_add(vzo1, x10);
    x14 = vec_sub(vzo1, x10);

    y18 = vec_add(vz0plus1, y16);
    y19 = vec_sub(vz0plus1, y16);
    y20 = vec_add(vzo1plus1, y17);
    y21 = vec_sub(vzo1plus1, y17);

    x15 = vec_perm(x13, x14, vcprm(0,s1,2,s3));
    x16 = vec_perm(x13, x14, vcprm(s0,1,s2,3));
    y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
    y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));

    vec_st(x11, 0, &(out[0]));
    vec_st(y18, 16, &(out[0]));
    vec_st(x15, i1, &(out[0]));
    vec_st(y22, i1+16, &(out[0]));
    vec_st(x12, i2, &(out[0]));
    vec_st(y19, i2+16, &(out[0]));
    vec_st(x16, i3, &(out[0]));
    vec_st(y23, i3+16, &(out[0]));

    do {
        out += 8;
        wre += 4;
        wim -= 4;
        wr0 = vec_splats(wre[0]);
        wr1 = vec_splats(wre[1]);
        wi0 = vec_splats(wim[0]);
        wi1 = vec_splats(wim[-1]);

        wr2 = vec_splats(wre[2]);
        wr3 = vec_splats(wre[3]);
        wi2 = vec_splats(wim[-2]);
        wi3 = vec_splats(wim[-3]);

        vzo2 = vec_ld(i2, &(out[0]));  // zo2.r zo2.i z(o2+1).r z(o2+1).i
        vzo2plus1 = vec_ld(i2+16, &(out[0]));
        vzo3 = vec_ld(i3, &(out[0]));  // zo3.r zo3.i z(o3+1).r z(o3+1).i
        vzo3plus1 = vec_ld(i3+16, &(out[0]));
        vz0 = vec_ld(0, &(out[0]));    // z0.r z0.i z1.r z1.i
        vz0plus1 = vec_ld(16, &(out[0]));
        vzo1 = vec_ld(i1, &(out[0]));  // zo1.r zo1.i z(o1+1).r z(o1+1).i
        vzo1plus1 = vec_ld(i1+16, &(out[0]));

        x0 = vec_add(vzo2, vzo3);
        x1 = vec_sub(vzo2, vzo3);

        y0 = vec_add(vzo2plus1, vzo3plus1);
        y1 = vec_sub(vzo2plus1, vzo3plus1);

        x4 = vec_perm(x0, x1, vcprm(s1,1,s0,0));
        x5 = vec_perm(x0, x1, vcprm(s3,3,s2,2));
        x2 = vec_perm(x0, x1, vcprm(0,s0,1,s1));
        x3 = vec_perm(x0, x1, vcprm(2,s2,3,s3));

        y2 = vec_perm(y0, y1, vcprm(0,s0,1,s1));
        y3 = vec_perm(y0, y1, vcprm(2,s2,3,s3));
        xmulwi0 = vec_mul(x4, wi0);
        xmulwi1 = vec_mul(x5, wi1);

        y4 = vec_perm(y0, y1, vcprm(s1,1,s0,0));
        y5 = vec_perm(y0, y1, vcprm(s3,3,s2,2));

        x8 = vec_madd(x2, wr0, xmulwi0);
        x9 = vec_msub(x2, wr0, xmulwi0);
        ymulwi2 = vec_mul(y4, wi2);
        ymulwi3 = vec_mul(y5, wi3);

        x13 = vec_madd(x3, wr1, xmulwi1);
        x14 = vec_msub(x3, wr1, xmulwi1);

        y8 = vec_madd(y2, wr2, ymulwi2);
        y9 = vec_msub(y2, wr2, ymulwi2);
        y13 = vec_madd(y3, wr3, ymulwi3);
        y14 = vec_msub(y3, wr3, ymulwi3);

        x10 = vec_perm(x8, x9, vcprm(0,1,s2,s3));
        x15 = vec_perm(x13, x14, vcprm(0,1,s2,s3));

        y10 = vec_perm(y8, y9, vcprm(0,1,s2,s3));
        y15 = vec_perm(y13, y14, vcprm(0,1,s2,s3));

        x16 = vec_perm(x10, x15, vcprm(0,2,s0,s2));
        x17 = vec_perm(x10, x15, vcprm(3,1,s3,s1));

        y16 = vec_perm(y10, y15, vcprm(0,2,s0,s2));
        y17 = vec_perm(y10, y15, vcprm(3,1,s3,s1));

        x18 = vec_add(vz0, x16);
        x19 = vec_sub(vz0, x16);
        x20 = vec_add(vzo1, x17);
        x21 = vec_sub(vzo1, x17);

        y18 = vec_add(vz0plus1, y16);
        y19 = vec_sub(vz0plus1, y16);
        y20 = vec_add(vzo1plus1, y17);
        y21 = vec_sub(vzo1plus1, y17);

        x22 = vec_perm(x20, x21, vcprm(0,s1,2,s3));
        x23 = vec_perm(x20, x21, vcprm(s0,1,s2,3));

        y22 = vec_perm(y20, y21, vcprm(0,s1,2,s3));
        y23 = vec_perm(y20, y21, vcprm(s0,1,s2,3));

        vec_st(x18, 0, &(out[0]));
        vec_st(y18, 16, &(out[0]));
        vec_st(x22, i1, &(out[0]));
        vec_st(y22, i1+16, &(out[0]));
        vec_st(x19, i2, &(out[0]));
        vec_st(y19, i2+16, &(out[0]));
        vec_st(x23, i3, &(out[0]));
        vec_st(y23, i3+16, &(out[0]));
    } while (n-=2);
}

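/*
 * fft2/4/8/16_vsx_interleave() below are fully unrolled base cases: each
 * (except the scalar 2-point case) loads the whole sub-transform into
 * vector registers, runs the butterflies with vec_perm/vec_add/vec_sub
 * (sqrthalf and ff_cos_16[] twiddles appear from size 8 up), and stores
 * the result back in interleaved order.  Larger sizes are composed from
 * these plus pass_vsx_interleave() by the dispatcher in fft_vsx.c.
 */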
inline static void fft2_vsx_interleave(FFTComplex *z)
{
    FFTSample r1, i1;

    r1 = z[0].re - z[1].re;
    z[0].re += z[1].re;
    z[1].re = r1;

    i1 = z[0].im - z[1].im;
    z[0].im += z[1].im;
    z[1].im = i1;
}

inline static void fft4_vsx_interleave(FFTComplex *z)
{
    vec_f a, b, c, d;
    float* out = (float*)z;
    a = vec_ld(0, &(out[0]));
    b = vec_ld(byte_2complex, &(out[0]));

    c = vec_perm(a, b, vcprm(0,1,s2,s1));
    d = vec_perm(a, b, vcprm(2,3,s0,s3));
    a = vec_add(c, d);
    b = vec_sub(c, d);

    c = vec_perm(a, b, vcprm(0,1,s0,s1));
    d = vec_perm(a, b, vcprm(2,3,s3,s2));

    a = vec_add(c, d);
    b = vec_sub(c, d);
    vec_st(a, 0, &(out[0]));
    vec_st(b, byte_2complex, &(out[0]));
}

inline static void fft8_vsx_interleave(FFTComplex *z)
{
    vec_f vz0, vz1, vz2, vz3;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f x24, x25, x26, x27;
    vec_f x28, x29, x30, x31;
    vec_f x32, x33, x34;

    float* out = (float*)z;
    vec_f vc1 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));

    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    x2 = vec_perm(vz2, vz3, vcprm(2,1,s0,s1));
    x3 = vec_perm(vz2, vz3, vcprm(0,3,s2,s3));

    x4 = vec_add(x0, x1);
    x5 = vec_sub(x0, x1);
    x6 = vec_add(x2, x3);
    x7 = vec_sub(x2, x3);

    x8 = vec_perm(x4, x5, vcprm(0,1,s0,s1));
    x9 = vec_perm(x4, x5, vcprm(2,3,s3,s2));
    x10 = vec_perm(x6, x7, vcprm(2,1,s2,s1));
    x11 = vec_perm(x6, x7, vcprm(0,3,s0,s3));

    x12 = vec_add(x8, x9);
    x13 = vec_sub(x8, x9);
    x14 = vec_add(x10, x11);
    x15 = vec_sub(x10, x11);
    x16 = vec_perm(x12, x13, vcprm(0,s0,1,s1));
    x17 = vec_perm(x14, x15, vcprm(0,s0,1,s1));
    x18 = vec_perm(x16, x17, vcprm(s0,s3,s2,s1));
    x19 = vec_add(x16, x18); // z0.r z2.r z0.i z2.i
    x20 = vec_sub(x16, x18); // z4.r z6.r z4.i z6.i

    x21 = vec_perm(x12, x13, vcprm(2,s2,3,s3));
    x22 = vec_perm(x14, x15, vcprm(2,3,s2,s3));
    x23 = vec_perm(x14, x15, vcprm(3,2,s3,s2));
    x24 = vec_add(x22, x23);
    x25 = vec_sub(x22, x23);
    x26 = vec_mul(vec_perm(x24, x25, vcprm(2,s2,0,s0)), vc1);

    x27 = vec_add(x21, x26); // z1.r z7.r z1.i z3.i
    x28 = vec_sub(x21, x26); // z5.r z3.r z5.i z7.i

    x29 = vec_perm(x19, x27, vcprm(0,2,s0,s2)); // z0.r z0.i z1.r z1.i
    x30 = vec_perm(x19, x27, vcprm(1,3,s1,s3)); // z2.r z2.i z7.r z3.i
    x31 = vec_perm(x20, x28, vcprm(0,2,s0,s2)); // z4.r z4.i z5.r z5.i
    x32 = vec_perm(x20, x28, vcprm(1,3,s1,s3)); // z6.r z6.i z3.r z7.i
    x33 = vec_perm(x30, x32, vcprm(0,1,s2,3));  // z2.r z2.i z3.r z3.i
    x34 = vec_perm(x30, x32, vcprm(s0,s1,2,s3)); // z6.r z6.i z7.r z7.i

    vec_st(x29, 0, &(out[0]));
    vec_st(x33, byte_2complex, &(out[0]));
    vec_st(x31, byte_4complex, &(out[0]));
    vec_st(x34, byte_6complex, &(out[0]));
}

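/*
 * Note on the constants below: ff_cos_16[] is the shared size-16 cosine
 * table declared in libavcodec/fft.h, so ff_cos_16[1] = cos(pi/8) and
 * ff_cos_16[3] = cos(3*pi/8); together with sqrthalf = cos(pi/4) these
 * are the nontrivial 16-point twiddle factors.  The table must have been
 * initialized (ff_fft_init() takes care of that) before this runs.
 */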
inline static void fft16_vsx_interleave(FFTComplex *z)
{
    float* out = (float*)z;
    vec_f vc0 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
    vec_f vc1 = {ff_cos_16[1], ff_cos_16[1], ff_cos_16[1], ff_cos_16[1]};
    vec_f vc2 = {ff_cos_16[3], ff_cos_16[3], ff_cos_16[3], ff_cos_16[3]};
    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7;
    vec_f x0, x1, x2, x3;
    vec_f x4, x5, x6, x7;
    vec_f x8, x9, x10, x11;
    vec_f x12, x13, x14, x15;
    vec_f x16, x17, x18, x19;
    vec_f x20, x21, x22, x23;
    vec_f x24, x25, x26, x27;
    vec_f x28, x29, x30, x31;
    vec_f x32, x33, x34, x35;
    vec_f x36, x37, x38, x39;
    vec_f x40, x41, x42, x43;
    vec_f x44, x45, x46, x47;
    vec_f x48, x49, x50, x51;
    vec_f x52, x53, x54, x55;
    vec_f x56, x57, x58, x59;
    vec_f x60, x61, x62, x63;
    vec_f x64, x65, x66, x67;
    vec_f x68, x69, x70, x71;
    vec_f x72, x73, x74, x75;
    vec_f x76, x77, x78, x79;
    vec_f x80, x81, x82, x83;
    vec_f x84, x85, x86;

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));
    vz4 = vec_ld(byte_8complex, &(out[0]));
    vz5 = vec_ld(byte_10complex, &(out[0]));
    vz6 = vec_ld(byte_12complex, &(out[0]));
    vz7 = vec_ld(byte_14complex, &(out[0]));

    x0 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    x1 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    x2 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
    x3 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));

    x4 = vec_perm(vz4, vz5, vcprm(0,1,s2,s1));
    x5 = vec_perm(vz4, vz5, vcprm(2,3,s0,s3));
    x6 = vec_perm(vz6, vz7, vcprm(0,1,s2,s1));
    x7 = vec_perm(vz6, vz7, vcprm(2,3,s0,s3));

    x8 = vec_add(x0, x1);
    x9 = vec_sub(x0, x1);
    x10 = vec_add(x2, x3);
    x11 = vec_sub(x2, x3);

    x12 = vec_add(x4, x5);
    x13 = vec_sub(x4, x5);
    x14 = vec_add(x6, x7);
    x15 = vec_sub(x6, x7);

    x16 = vec_perm(x8, x9, vcprm(0,1,s0,s1));
    x17 = vec_perm(x8, x9, vcprm(2,3,s3,s2));
    x18 = vec_perm(x10, x11, vcprm(2,1,s1,s2));
    x19 = vec_perm(x10, x11, vcprm(0,3,s0,s3));
    x20 = vec_perm(x12, x14, vcprm(0,1,s0,s1));
    x21 = vec_perm(x12, x14, vcprm(2,3,s2,s3));
    x22 = vec_perm(x13, x15, vcprm(0,1,s0,s1));
    x23 = vec_perm(x13, x15, vcprm(3,2,s3,s2));

    x24 = vec_add(x16, x17);
    x25 = vec_sub(x16, x17);
    x26 = vec_add(x18, x19);
    x27 = vec_sub(x18, x19);
    x28 = vec_add(x20, x21);
    x29 = vec_sub(x20, x21);
    x30 = vec_add(x22, x23);
    x31 = vec_sub(x22, x23);

    x32 = vec_add(x24, x26);
    x33 = vec_sub(x24, x26);
    x34 = vec_perm(x32, x33, vcprm(0,1,s0,s1));

    x35 = vec_perm(x28, x29, vcprm(2,1,s1,s2));
    x36 = vec_perm(x28, x29, vcprm(0,3,s0,s3));
    x37 = vec_add(x35, x36);
    x38 = vec_sub(x35, x36);
    x39 = vec_perm(x37, x38, vcprm(0,1,s1,s0));

    x40 = vec_perm(x27, x38, vcprm(3,2,s2,s3));
    x41 = vec_perm(x26, x37, vcprm(2,3,s3,s2));
    x42 = vec_add(x40, x41);
    x43 = vec_sub(x40, x41);
    x44 = vec_mul(x42, vc0);
    x45 = vec_mul(x43, vc0);

    x46 = vec_add(x34, x39); // z0.r z0.i z4.r z4.i
    x47 = vec_sub(x34, x39); // z8.r z8.i z12.r z12.i

    x48 = vec_perm(x30, x31, vcprm(2,1,s1,s2));
    x49 = vec_perm(x30, x31, vcprm(0,3,s3,s0));
    x50 = vec_add(x48, x49);
    x51 = vec_sub(x48, x49);
    x52 = vec_mul(x50, vc1);
    x53 = vec_mul(x50, vc2);
    x54 = vec_mul(x51, vc1);
    x55 = vec_mul(x51, vc2);

    x56 = vec_perm(x24, x25, vcprm(2,3,s2,s3));
    x57 = vec_perm(x44, x45, vcprm(0,1,s1,s0));
    x58 = vec_add(x56, x57);
    x59 = vec_sub(x56, x57);

    x60 = vec_perm(x54, x55, vcprm(1,0,3,2));
    x61 = vec_perm(x54, x55, vcprm(s1,s0,s3,s2));
    x62 = vec_add(x52, x61);
    x63 = vec_sub(x52, x61);
    x64 = vec_add(x60, x53);
    x65 = vec_sub(x60, x53);
    x66 = vec_perm(x62, x64, vcprm(0,1,s3,s2));
    x67 = vec_perm(x63, x65, vcprm(s0,s1,3,2));

    x68 = vec_add(x58, x66); // z1.r z1.i z3.r z3.i
    x69 = vec_sub(x58, x66); // z9.r z9.i z11.r z11.i
    x70 = vec_add(x59, x67); // z5.r z5.i z15.r z15.i
    x71 = vec_sub(x59, x67); // z13.r z13.i z7.r z7.i

    x72 = vec_perm(x25, x27, vcprm(s1,s0,s2,s3));
    x73 = vec_add(x25, x72);
    x74 = vec_sub(x25, x72);
    x75 = vec_perm(x73, x74, vcprm(0,1,s0,s1));
    x76 = vec_perm(x44, x45, vcprm(3,2,s2,s3));
    x77 = vec_add(x75, x76); // z2.r z2.i z6.r z6.i
    x78 = vec_sub(x75, x76); // z10.r z10.i z14.r z14.i

    x79 = vec_perm(x46, x68, vcprm(0,1,s0,s1)); // z0.r z0.i z1.r z1.i
    x80 = vec_perm(x77, x68, vcprm(0,1,s2,s3)); // z2.r z2.i z3.r z3.i
    x81 = vec_perm(x46, x70, vcprm(2,3,s0,s1)); // z4.r z4.i z5.r z5.i
    x82 = vec_perm(x71, x77, vcprm(s2,s3,2,3)); // z6.r z6.i z7.r z7.i
    vec_st(x79, 0, &(out[0]));
    vec_st(x80, byte_2complex, &(out[0]));
    vec_st(x81, byte_4complex, &(out[0]));
    vec_st(x82, byte_6complex, &(out[0]));
    x83 = vec_perm(x47, x69, vcprm(0,1,s0,s1)); // z8.r z8.i z9.r z9.i
    x84 = vec_perm(x78, x69, vcprm(0,1,s2,s3)); // z10.r z10.i z11.r z11.i
    x85 = vec_perm(x47, x71, vcprm(2,3,s0,s1)); // z12.r z12.i z13.r z13.i
    x86 = vec_perm(x70, x78, vcprm(s2,s3,2,3)); // z14.r z14.i z15.r z15.i
    vec_st(x83, byte_8complex, &(out[0]));
    vec_st(x84, byte_10complex, &(out[0]));
    vec_st(x85, byte_12complex, &(out[0]));
    vec_st(x86, byte_14complex, &(out[0]));
}

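/*
 * The remaining routines are the non-interleaved counterparts used by
 * ff_fft_calc_vsx().  Judging by the r0/i0/r1/i1 load annotations in
 * pass_vsx() below, they operate on a split layout in which each vector
 * holds four real parts or four imaginary parts rather than interleaved
 * re/im pairs.
 */
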
inline static void fft4_vsx(FFTComplex *z)
{
    vec_f a, b, c, d;
    float* out = (float*)z;
    a = vec_ld(0, &(out[0]));
    b = vec_ld(byte_2complex, &(out[0]));

    c = vec_perm(a, b, vcprm(0,1,s2,s1));
    d = vec_perm(a, b, vcprm(2,3,s0,s3));
    a = vec_add(c, d);
    b = vec_sub(c, d);

    c = vec_perm(a, b, vcprm(0,s0,1,s1));
    d = vec_perm(a, b, vcprm(2,s3,3,s2));

    a = vec_add(c, d);
    b = vec_sub(c, d);

    c = vec_perm(a, b, vcprm(0,1,s0,s1));
    d = vec_perm(a, b, vcprm(2,3,s2,s3));

    vec_st(c, 0, &(out[0]));
    vec_st(d, byte_2complex, &(out[0]));
}

inline static void fft8_vsx(FFTComplex *z)
{
    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7, vz8;

    float* out = (float*)z;
    vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));

    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));

    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);
    vz8 = vec_perm(vz3, vz3, vcprm(2,3,0,1));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);

    vz3 = vec_madd(vz3, vc1, vc0);
    vz3 = vec_madd(vz8, vc2, vz3);

    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz6 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
    vz7 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
    vz6 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
    vz7 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));

    vz2 = vec_sub(vz4, vz6);
    vz3 = vec_sub(vz5, vz7);

    vz0 = vec_add(vz4, vz6);
    vz1 = vec_add(vz5, vz7);

    vec_st(vz0, 0, &(out[0]));
    vec_st(vz1, byte_2complex, &(out[0]));
    vec_st(vz2, byte_4complex, &(out[0]));
    vec_st(vz3, byte_6complex, &(out[0]));
}

inline static void fft16_vsx(FFTComplex *z)
{
    float* out = (float*)z;
    vec_f vc0 = {0.0, 0.0, 0.0, 0.0};
    vec_f vc1 = {-sqrthalf, sqrthalf, sqrthalf, -sqrthalf};
    vec_f vc2 = {sqrthalf, sqrthalf, sqrthalf, sqrthalf};
    vec_f vc3 = {1.0, 0.92387953, sqrthalf, 0.38268343};
    vec_f vc4 = {0.0, 0.38268343, sqrthalf, 0.92387953};
    vec_f vc5 = {-0.0, -0.38268343, -sqrthalf, -0.92387953};

    vec_f vz0, vz1, vz2, vz3;
    vec_f vz4, vz5, vz6, vz7;
    vec_f vz8, vz9, vz10, vz11;
    vec_f vz12, vz13;

    vz0 = vec_ld(byte_8complex, &(out[0]));
    vz1 = vec_ld(byte_10complex, &(out[0]));
    vz2 = vec_ld(byte_12complex, &(out[0]));
    vz3 = vec_ld(byte_14complex, &(out[0]));

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));
    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s2,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s0,s3));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz6 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,s3,3,s2));

    vz0 = vec_add(vz4, vz5);
    vz1 = vec_sub(vz4, vz5);
    vz2 = vec_add(vz6, vz7);
    vz3 = vec_sub(vz6, vz7);

    vz4 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz5 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));

    vz6 = vec_perm(vz2, vz3, vcprm(0,1,s0,s1));
    vz7 = vec_perm(vz2, vz3, vcprm(2,3,s2,s3));

    vz0 = vec_ld(0, &(out[0]));
    vz1 = vec_ld(byte_2complex, &(out[0]));
    vz2 = vec_ld(byte_4complex, &(out[0]));
    vz3 = vec_ld(byte_6complex, &(out[0]));
    vz10 = vec_perm(vz2, vz3, vcprm(0,s0,1,s1));
    vz11 = vec_perm(vz2, vz3, vcprm(2,s2,3,s3));
    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s2,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s0,s3));

    vz2 = vec_add(vz10, vz11);
    vz3 = vec_sub(vz10, vz11);
    vz12 = vec_perm(vz3, vz3, vcprm(2,3,0,1));
    vz0 = vec_add(vz8, vz9);
    vz1 = vec_sub(vz8, vz9);

    vz3 = vec_madd(vz3, vc1, vc0);
    vz3 = vec_madd(vz12, vc2, vz3);
    vz8 = vec_perm(vz0, vz1, vcprm(0,s0,1,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,s3,3,s2));
    vz10 = vec_perm(vz2, vz3, vcprm(1,2,s3,s0));
    vz11 = vec_perm(vz2, vz3, vcprm(0,3,s2,s1));

    vz0 = vec_add(vz8, vz9);
    vz1 = vec_sub(vz8, vz9);
    vz2 = vec_add(vz10, vz11);
    vz3 = vec_sub(vz10, vz11);

    vz8 = vec_perm(vz0, vz1, vcprm(0,1,s0,s1));
    vz9 = vec_perm(vz0, vz1, vcprm(2,3,s2,s3));
    vz10 = vec_perm(vz2, vz3, vcprm(0,2,s1,s3));
    vz11 = vec_perm(vz2, vz3, vcprm(1,3,s0,s2));

    vz2 = vec_sub(vz8, vz10);
    vz3 = vec_sub(vz9, vz11);
    vz0 = vec_add(vz8, vz10);
    vz1 = vec_add(vz9, vz11);

    vz8 = vec_madd(vz4, vc3, vc0);
    vz9 = vec_madd(vz5, vc3, vc0);
    vz10 = vec_madd(vz6, vc3, vc0);
    vz11 = vec_madd(vz7, vc3, vc0);

    vz8 = vec_madd(vz5, vc4, vz8);
    vz9 = vec_madd(vz4, vc5, vz9);
    vz10 = vec_madd(vz7, vc5, vz10);
    vz11 = vec_madd(vz6, vc4, vz11);

    vz12 = vec_sub(vz10, vz8);
    vz10 = vec_add(vz10, vz8);

    vz13 = vec_sub(vz9, vz11);
    vz11 = vec_add(vz9, vz11);

    vz4 = vec_sub(vz0, vz10);
    vz0 = vec_add(vz0, vz10);

    vz7 = vec_sub(vz3, vz12);
    vz3 = vec_add(vz3, vz12);

    vz5 = vec_sub(vz1, vz11);
    vz1 = vec_add(vz1, vz11);

    vz6 = vec_sub(vz2, vz13);
    vz2 = vec_add(vz2, vz13);

    vec_st(vz0, 0, &(out[0]));
    vec_st(vz1, byte_2complex, &(out[0]));
    vec_st(vz2, byte_4complex, &(out[0]));
    vec_st(vz3, byte_6complex, &(out[0]));
    vec_st(vz4, byte_8complex, &(out[0]));
    vec_st(vz5, byte_10complex, &(out[0]));
    vec_st(vz6, byte_12complex, &(out[0]));
    vec_st(vz7, byte_14complex, &(out[0]));
}
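
/*
 * Split-layout counterpart of pass_vsx_interleave().  Per column, with
 * w = wre[k] + I*wim[-k], the four multiply/fma steps below compute
 * (sketch): v10 + I*v11 = z[o2]*conj(w) and v12 + I*v13 = z[o3]*w,
 * which the subsequent add/sub butterflies combine with r0/i0 (z[0])
 * and r1/i1 (z[o1]).
 */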
inline static void pass_vsx(FFTComplex *z, const FFTSample *wre, unsigned int n)
{
    int o1 = n<<1;
    int o2 = n<<2;
    int o3 = o1+o2;
    int i1, i2, i3;
    FFTSample* out = (FFTSample*)z;
    const FFTSample *wim = wre+o1;
    vec_f v0, v1, v2, v3;
    vec_f v4, v5, v6, v7;
    vec_f v8, v9, v10, v11;
    vec_f v12, v13;

    n = n-2;
    i1 = o1*sizeof(FFTComplex);
    i2 = o2*sizeof(FFTComplex);
    i3 = o3*sizeof(FFTComplex);

    v8 = vec_ld(0, &(wre[0]));
    v10 = vec_ld(0, &(wim[0]));
    v9 = vec_ld(0, &(wim[-4]));
    v9 = vec_perm(v9, v10, vcprm(s0,3,2,1)); // wim[0] wim[-1] wim[-2] wim[-3]

    v4 = vec_ld(i2, &(out[0]));
    v5 = vec_ld(i2+16, &(out[0]));
    v6 = vec_ld(i3, &(out[0]));
    v7 = vec_ld(i3+16, &(out[0]));
    v10 = vec_mul(v4, v8); // r2*wre
    v11 = vec_mul(v5, v8); // i2*wre
    v12 = vec_mul(v6, v8); // r3*wre
    v13 = vec_mul(v7, v8); // i3*wre

    v0 = vec_ld(0, &(out[0])); // r0
    v3 = vec_ld(i1+16, &(out[0])); // i1
    v10 = vec_madd(v5, v9, v10);  // r2*wre + i2*wim
    v11 = vec_nmsub(v4, v9, v11); // i2*wre - r2*wim
    v12 = vec_nmsub(v7, v9, v12); // r3*wre - i3*wim
    v13 = vec_madd(v6, v9, v13);  // i3*wre + r3*wim

    v1 = vec_ld(16, &(out[0])); // i0
    v2 = vec_ld(i1, &(out[0])); // r1
    v8 = vec_sub(v12, v10);
    v12 = vec_add(v12, v10);
    v9 = vec_sub(v11, v13);
    v13 = vec_add(v11, v13);
    v4 = vec_sub(v0, v12);
    v0 = vec_add(v0, v12);
    v7 = vec_sub(v3, v8);
    v3 = vec_add(v3, v8);

    vec_st(v0, 0, &(out[0]));      // r0
    vec_st(v3, i1+16, &(out[0]));  // i1
    vec_st(v4, i2, &(out[0]));     // r2
    vec_st(v7, i3+16, &(out[0]));  // i3

    v5 = vec_sub(v1, v13);
    v1 = vec_add(v1, v13);
    v6 = vec_sub(v2, v9);
    v2 = vec_add(v2, v9);

    vec_st(v1, 16, &(out[0]));     // i0
    vec_st(v2, i1, &(out[0]));     // r1
    vec_st(v5, i2+16, &(out[0]));  // i2
    vec_st(v6, i3, &(out[0]));     // r3

    do {
        out += 8;
        wre += 4;
        wim -= 4;

        v8 = vec_ld(0, &(wre[0]));
        v10 = vec_ld(0, &(wim[0]));
        v9 = vec_ld(0, &(wim[-4]));
        v9 = vec_perm(v9, v10, vcprm(s0,3,2,1));

        v4 = vec_ld(i2, &(out[0]));    // r2
        v5 = vec_ld(i2+16, &(out[0])); // i2
        v6 = vec_ld(i3, &(out[0]));    // r3
        v7 = vec_ld(i3+16, &(out[0])); // i3
        v10 = vec_mul(v4, v8); // r2*wre
        v11 = vec_mul(v5, v8); // i2*wre
        v12 = vec_mul(v6, v8); // r3*wre
        v13 = vec_mul(v7, v8); // i3*wre

        v0 = vec_ld(0, &(out[0])); // r0
        v3 = vec_ld(i1+16, &(out[0])); // i1
        v10 = vec_madd(v5, v9, v10);  // r2*wre + i2*wim
        v11 = vec_nmsub(v4, v9, v11); // i2*wre - r2*wim
        v12 = vec_nmsub(v7, v9, v12); // r3*wre - i3*wim
        v13 = vec_madd(v6, v9, v13);  // i3*wre + r3*wim

        v1 = vec_ld(16, &(out[0])); // i0
        v2 = vec_ld(i1, &(out[0])); // r1
        v8 = vec_sub(v12, v10);
        v12 = vec_add(v12, v10);
        v9 = vec_sub(v11, v13);
        v13 = vec_add(v11, v13);
        v4 = vec_sub(v0, v12);
        v0 = vec_add(v0, v12);
        v7 = vec_sub(v3, v8);
        v3 = vec_add(v3, v8);

        vec_st(v0, 0, &(out[0]));      // r0
        vec_st(v3, i1+16, &(out[0]));  // i1
        vec_st(v4, i2, &(out[0]));     // r2
        vec_st(v7, i3+16, &(out[0]));  // i3

        v5 = vec_sub(v1, v13);
        v1 = vec_add(v1, v13);
        v6 = vec_sub(v2, v9);
        v2 = vec_add(v2, v9);

        vec_st(v1, 16, &(out[0]));     // i0
        vec_st(v2, i1, &(out[0]));     // r1
        vec_st(v5, i2+16, &(out[0]));  // i2
        vec_st(v6, i3, &(out[0]));     // r3
    } while (n-=2);
}

#endif /* HAVE_VSX */

#endif /* AVCODEC_PPC_FFT_VSX_H */