FFmpeg
fft_mips.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2012
3  * MIPS Technologies, Inc., California.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  * notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  * notice, this list of conditions and the following disclaimer in the
12  * documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
14  * contributors may be used to endorse or promote products derived from
15  * this software without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  * Author: Stanislav Ocovaj (socovaj@mips.com)
30  * Author: Zoran Lukic (zoranl@mips.com)
31  *
32  * Optimized MDCT/IMDCT and FFT transforms
33  *
34  * This file is part of FFmpeg.
35  *
36  * FFmpeg is free software; you can redistribute it and/or
37  * modify it under the terms of the GNU Lesser General Public
38  * License as published by the Free Software Foundation; either
39  * version 2.1 of the License, or (at your option) any later version.
40  *
41  * FFmpeg is distributed in the hope that it will be useful,
42  * but WITHOUT ANY WARRANTY; without even the implied warranty of
43  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
44  * Lesser General Public License for more details.
45  *
46  * You should have received a copy of the GNU Lesser General Public
47  * License along with FFmpeg; if not, write to the Free Software
48  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
49  */
50 #include "config.h"
51 #include "libavutil/attributes.h"
52 #include "libavcodec/fft.h"
53 #include "libavcodec/fft_table.h"
54 #include "libavutil/mips/asmdefs.h"
55 
56 /**
57  * FFT transform
58  */
59 
60 #if HAVE_INLINE_ASM
61 #if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
/**
 * In-place FFT of z, using lookup tables and MIPS FPU inline assembly.
 *
 * The routine runs three stages in sequence:
 *  1. a 4-element sum/difference step written in plain C;
 *  2. an 8-element step in inline asm (the function returns before this
 *     stage when 1 << s->nbits is smaller than 8);
 *  3. for nbits = 4 .. s->nbits, a combining pass that multiplies by
 *     factors read from ff_cos_131072.
 *
 * Which sub-blocks of z are processed in each stage is taken from
 * ff_fft_offsets_lut.
 *
 * @param s context; only s->nbits is read here
 * @param z complex data, transformed in place
 *          (presumably 1 << s->nbits elements -- TODO confirm against callers)
 */
static void ff_fft_calc_mips(FFTContext *s, FFTComplex *z)
{
    int nbits, i, n, num_transforms, offset, step;
    int n4, n2, n34;
    FFTSample tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
    FFTComplex *tmpz;
    float w_re, w_im;
    float *w_re_ptr, *w_im_ptr;
    const int fft_size = (1 << s->nbits);
    float pom, pom1, pom2, pom3;
    float temp, temp1, temp3, temp4;
    FFTComplex * tmpz_n2, * tmpz_n34, * tmpz_n4;
    FFTComplex * tmpz_n2_i, * tmpz_n34_i, * tmpz_n4_i, * tmpz_i;
    /* approximately 1/sqrt(2); multiplier used in the 8-element stage */
    float f1 = 0.7071067812;

    /* 21845 is the length of ff_fft_offsets_lut; the count is scaled down by
     * the shift and "| 1" forces it to be odd (so at least 1). */
    num_transforms = (21845 >> (17 - s->nbits)) | 1;

    /* Stage 1: 4-element step on each sub-block tmpz[0..3]. */
    for (n=0; n<num_transforms; n++) {
        offset = ff_fft_offsets_lut[n] << 2;
        tmpz = z + offset;

        tmp1 = tmpz[0].re + tmpz[1].re;
        tmp5 = tmpz[2].re + tmpz[3].re;
        tmp2 = tmpz[0].im + tmpz[1].im;
        tmp6 = tmpz[2].im + tmpz[3].im;
        tmp3 = tmpz[0].re - tmpz[1].re;
        tmp8 = tmpz[2].im - tmpz[3].im;
        tmp4 = tmpz[0].im - tmpz[1].im;
        tmp7 = tmpz[2].re - tmpz[3].re;

        tmpz[0].re = tmp1 + tmp5;
        tmpz[2].re = tmp1 - tmp5;
        tmpz[0].im = tmp2 + tmp6;
        tmpz[2].im = tmp2 - tmp6;
        tmpz[1].re = tmp3 + tmp8;
        tmpz[3].re = tmp3 - tmp8;
        tmpz[1].im = tmp4 - tmp7;
        tmpz[3].im = tmp4 + tmp7;

    }

    /* Nothing more to do for a 4-element transform. */
    if (fft_size < 8)
        return;

    /* Each later stage halves the sub-block count (kept odd). */
    num_transforms = (num_transforms >> 1) | 1;

    /* Stage 2: 8-element step on each sub-block tmpz[0..7].
     * Byte offsets in the asm address FFTComplex fields: element k has
     * .re at 8*k and .im at 8*k + 4 (e.g. 32 -> tmpz[4].re, 36 -> tmpz[4].im). */
    for (n=0; n<num_transforms; n++) {
        offset = ff_fft_offsets_lut[n] << 3;
        tmpz = z + offset;

        __asm__ volatile (
            "lwc1 %[tmp1], 32(%[tmpz]) \n\t"
            "lwc1 %[pom], 40(%[tmpz]) \n\t"
            "lwc1 %[tmp3], 48(%[tmpz]) \n\t"
            "lwc1 %[pom1], 56(%[tmpz]) \n\t"
            "lwc1 %[tmp2], 36(%[tmpz]) \n\t"
            "lwc1 %[pom2], 44(%[tmpz]) \n\t"
            "lwc1 %[pom3], 60(%[tmpz]) \n\t"
            "lwc1 %[tmp4], 52(%[tmpz]) \n\t"
            "add.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re + tmpz[5].re;
            "add.s %[tmp3], %[tmp3], %[pom1] \n\t" // tmp3 = tmpz[6].re + tmpz[7].re;
            "add.s %[tmp2], %[tmp2], %[pom2] \n\t" // tmp2 = tmpz[4].im + tmpz[5].im;
            "lwc1 %[pom], 40(%[tmpz]) \n\t"
            "add.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im + tmpz[7].im;
            "add.s %[tmp5], %[tmp1], %[tmp3] \n\t" // tmp5 = tmp1 + tmp3;
            "sub.s %[tmp7], %[tmp1], %[tmp3] \n\t" // tmp7 = tmp1 - tmp3;
            "lwc1 %[tmp1], 32(%[tmpz]) \n\t"
            "lwc1 %[pom1], 44(%[tmpz]) \n\t"
            "add.s %[tmp6], %[tmp2], %[tmp4] \n\t" // tmp6 = tmp2 + tmp4;
            "sub.s %[tmp8], %[tmp2], %[tmp4] \n\t" // tmp8 = tmp2 - tmp4;
            "lwc1 %[tmp2], 36(%[tmpz]) \n\t"
            "lwc1 %[pom2], 56(%[tmpz]) \n\t"
            "lwc1 %[pom3], 60(%[tmpz]) \n\t"
            "lwc1 %[tmp3], 48(%[tmpz]) \n\t"
            "lwc1 %[tmp4], 52(%[tmpz]) \n\t"
            "sub.s %[tmp1], %[tmp1], %[pom] \n\t" // tmp1 = tmpz[4].re - tmpz[5].re;
            "lwc1 %[pom], 0(%[tmpz]) \n\t"
            "sub.s %[tmp2], %[tmp2], %[pom1] \n\t" // tmp2 = tmpz[4].im - tmpz[5].im;
            "sub.s %[tmp3], %[tmp3], %[pom2] \n\t" // tmp3 = tmpz[6].re - tmpz[7].re;
            "lwc1 %[pom2], 4(%[tmpz]) \n\t"
            "sub.s %[pom1], %[pom], %[tmp5] \n\t"
            "sub.s %[tmp4], %[tmp4], %[pom3] \n\t" // tmp4 = tmpz[6].im - tmpz[7].im;
            "add.s %[pom3], %[pom], %[tmp5] \n\t"
            "sub.s %[pom], %[pom2], %[tmp6] \n\t"
            "add.s %[pom2], %[pom2], %[tmp6] \n\t"
            "swc1 %[pom1], 32(%[tmpz]) \n\t" // tmpz[4].re = tmpz[0].re - tmp5;
            "swc1 %[pom3], 0(%[tmpz]) \n\t" // tmpz[0].re = tmpz[0].re + tmp5;
            "swc1 %[pom], 36(%[tmpz]) \n\t" // tmpz[4].im = tmpz[0].im - tmp6;
            "swc1 %[pom2], 4(%[tmpz]) \n\t" // tmpz[0].im = tmpz[0].im + tmp6;
            "lwc1 %[pom1], 16(%[tmpz]) \n\t"
            "lwc1 %[pom3], 20(%[tmpz]) \n\t"
            "add.s %[temp1],%[tmp1], %[tmp2] \n\t"
            "sub.s %[temp], %[pom1], %[tmp8] \n\t"
            "add.s %[pom2], %[pom3], %[tmp7] \n\t"
            "sub.s %[temp3],%[tmp3], %[tmp4] \n\t"
            "sub.s %[temp4],%[tmp2], %[tmp1] \n\t"
            "swc1 %[temp], 48(%[tmpz]) \n\t" // tmpz[6].re = tmpz[2].re - tmp8;
            "swc1 %[pom2], 52(%[tmpz]) \n\t" // tmpz[6].im = tmpz[2].im + tmp7;
            "add.s %[pom1], %[pom1], %[tmp8] \n\t"
            "sub.s %[pom3], %[pom3], %[tmp7] \n\t"
            "add.s %[tmp3], %[tmp3], %[tmp4] \n\t"
            "mul.s %[tmp5], %[f1], %[temp1] \n\t" // tmp5 = f1 * (tmp1 + tmp2);
            "mul.s %[tmp7], %[f1], %[temp3] \n\t" // tmp7 = f1 * (tmp3 - tmp4);
            "mul.s %[tmp6], %[f1], %[temp4] \n\t" // tmp6 = f1 * (tmp2 - tmp1);
            "mul.s %[tmp8], %[f1], %[tmp3] \n\t" // tmp8 = f1 * (tmp3 + tmp4);
            "swc1 %[pom1], 16(%[tmpz]) \n\t" // tmpz[2].re = tmpz[2].re + tmp8;
            "swc1 %[pom3], 20(%[tmpz]) \n\t" // tmpz[2].im = tmpz[2].im - tmp7;
            "add.s %[tmp1], %[tmp5], %[tmp7] \n\t" // tmp1 = tmp5 + tmp7;
            "sub.s %[tmp3], %[tmp5], %[tmp7] \n\t" // tmp3 = tmp5 - tmp7;
            "add.s %[tmp2], %[tmp6], %[tmp8] \n\t" // tmp2 = tmp6 + tmp8;
            "sub.s %[tmp4], %[tmp6], %[tmp8] \n\t" // tmp4 = tmp6 - tmp8;
            "lwc1 %[temp], 8(%[tmpz]) \n\t"
            "lwc1 %[temp1],12(%[tmpz]) \n\t"
            "lwc1 %[pom], 24(%[tmpz]) \n\t"
            "lwc1 %[pom2], 28(%[tmpz]) \n\t"
            "sub.s %[temp4],%[temp], %[tmp1] \n\t"
            "sub.s %[temp3],%[temp1], %[tmp2] \n\t"
            "add.s %[temp], %[temp], %[tmp1] \n\t"
            "add.s %[temp1],%[temp1], %[tmp2] \n\t"
            "sub.s %[pom1], %[pom], %[tmp4] \n\t"
            "add.s %[pom3], %[pom2], %[tmp3] \n\t"
            "add.s %[pom], %[pom], %[tmp4] \n\t"
            "sub.s %[pom2], %[pom2], %[tmp3] \n\t"
            "swc1 %[temp4],40(%[tmpz]) \n\t" // tmpz[5].re = tmpz[1].re - tmp1;
            "swc1 %[temp3],44(%[tmpz]) \n\t" // tmpz[5].im = tmpz[1].im - tmp2;
            "swc1 %[temp], 8(%[tmpz]) \n\t" // tmpz[1].re = tmpz[1].re + tmp1;
            "swc1 %[temp1],12(%[tmpz]) \n\t" // tmpz[1].im = tmpz[1].im + tmp2;
            "swc1 %[pom1], 56(%[tmpz]) \n\t" // tmpz[7].re = tmpz[3].re - tmp4;
            "swc1 %[pom3], 60(%[tmpz]) \n\t" // tmpz[7].im = tmpz[3].im + tmp3;
            "swc1 %[pom], 24(%[tmpz]) \n\t" // tmpz[3].re = tmpz[3].re + tmp4;
            "swc1 %[pom2], 28(%[tmpz]) \n\t" // tmpz[3].im = tmpz[3].im - tmp3;
            /* All scratch values are early-clobber FP outputs; data is
             * reached only through tmpz, hence the "memory" clobber. */
            : [tmp1]"=&f"(tmp1), [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
              [tmp3]"=&f"(tmp3), [tmp2]"=&f"(tmp2), [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5), [tmp7]"=&f"(tmp7),
              [tmp6]"=&f"(tmp6), [tmp8]"=&f"(tmp8), [pom3]"=&f"(pom3),[temp]"=&f"(temp), [temp1]"=&f"(temp1),
              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
            : [tmpz]"r"(tmpz), [f1]"f"(f1)
            : "memory"
        );
    }

    /* Stage 3: one pass per nbits from 4 up to s->nbits.
     * step is the stride into ff_cos_131072 and is halved after every pass;
     * n4 (quarter of the sub-block length) doubles after every pass. */
    step = 1 << (MAX_LOG2_NFFT - 4);
    n4 = 4;

    for (nbits=4; nbits<=s->nbits; nbits++) {
        num_transforms = (num_transforms >> 1) | 1;
        n2 = 2 * n4;
        n34 = 3 * n4;

        for (n=0; n<num_transforms; n++) {
            offset = ff_fft_offsets_lut[n] << nbits;
            tmpz = z + offset;

            /* Start of the 2nd, 3rd and 4th quarter of this sub-block. */
            tmpz_n2 = tmpz + n2;
            tmpz_n4 = tmpz + n4;
            tmpz_n34 = tmpz + n34;

            /* i == 0 case: handled separately, with no multiplication by
             * w_re / w_im. */
            __asm__ volatile (
                "lwc1 %[pom1], 0(%[tmpz_n2]) \n\t"
                "lwc1 %[pom], 0(%[tmpz_n34]) \n\t"
                "lwc1 %[pom2], 4(%[tmpz_n2]) \n\t"
                "lwc1 %[pom3], 4(%[tmpz_n34]) \n\t"
                "lwc1 %[temp1],0(%[tmpz]) \n\t"
                "lwc1 %[temp3],4(%[tmpz]) \n\t"
                "add.s %[tmp5], %[pom1], %[pom] \n\t" // tmp5 = tmpz[ n2].re + tmpz[n34].re;
                "sub.s %[tmp1], %[pom1], %[pom] \n\t" // tmp1 = tmpz[ n2].re - tmpz[n34].re;
                "add.s %[tmp6], %[pom2], %[pom3] \n\t" // tmp6 = tmpz[ n2].im + tmpz[n34].im;
                "sub.s %[tmp2], %[pom2], %[pom3] \n\t" // tmp2 = tmpz[ n2].im - tmpz[n34].im;
                "sub.s %[temp], %[temp1], %[tmp5] \n\t"
                "add.s %[temp1],%[temp1], %[tmp5] \n\t"
                "sub.s %[temp4],%[temp3], %[tmp6] \n\t"
                "add.s %[temp3],%[temp3], %[tmp6] \n\t"
                "swc1 %[temp], 0(%[tmpz_n2]) \n\t" // tmpz[ n2].re = tmpz[ 0].re - tmp5;
                "swc1 %[temp1],0(%[tmpz]) \n\t" // tmpz[ 0].re = tmpz[ 0].re + tmp5;
                "lwc1 %[pom1], 0(%[tmpz_n4]) \n\t"
                "swc1 %[temp4],4(%[tmpz_n2]) \n\t" // tmpz[ n2].im = tmpz[ 0].im - tmp6;
                "lwc1 %[temp], 4(%[tmpz_n4]) \n\t"
                "swc1 %[temp3],4(%[tmpz]) \n\t" // tmpz[ 0].im = tmpz[ 0].im + tmp6;
                "sub.s %[pom], %[pom1], %[tmp2] \n\t"
                "add.s %[pom1], %[pom1], %[tmp2] \n\t"
                "add.s %[temp1],%[temp], %[tmp1] \n\t"
                "sub.s %[temp], %[temp], %[tmp1] \n\t"
                "swc1 %[pom], 0(%[tmpz_n34]) \n\t" // tmpz[n34].re = tmpz[n4].re - tmp2;
                "swc1 %[pom1], 0(%[tmpz_n4]) \n\t" // tmpz[ n4].re = tmpz[n4].re + tmp2;
                "swc1 %[temp1],4(%[tmpz_n34]) \n\t" // tmpz[n34].im = tmpz[n4].im + tmp1;
                "swc1 %[temp], 4(%[tmpz_n4]) \n\t" // tmpz[ n4].im = tmpz[n4].im - tmp1;
                : [tmp5]"=&f"(tmp5),
                  [tmp1]"=&f"(tmp1), [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2),
                  [tmp2]"=&f"(tmp2), [tmp6]"=&f"(tmp6), [pom3]"=&f"(pom3),
                  [temp]"=&f"(temp), [temp1]"=&f"(temp1), [temp3]"=&f"(temp3), [temp4]"=&f"(temp4)
                : [tmpz]"r"(tmpz), [tmpz_n2]"r"(tmpz_n2), [tmpz_n34]"r"(tmpz_n34), [tmpz_n4]"r"(tmpz_n4)
                : "memory"
            );

            /* w_re is read walking forward from the table start and w_im
             * walking backward from entry MAX_FFT_SIZE/4, both in units of
             * step (presumably cos/sin of the same angle -- TODO confirm
             * against the table definition). */
            w_re_ptr = (float*)(ff_cos_131072 + step);
            w_im_ptr = (float*)(ff_cos_131072 + MAX_FFT_SIZE/4 - step);

            /* Remaining i = 1 .. n4-1: same combination as above, but the
             * n2+i and n34+i elements are first multiplied by (w_re, w_im). */
            for (i=1; i<n4; i++) {
                w_re = w_re_ptr[0];
                w_im = w_im_ptr[0];
                tmpz_n2_i = tmpz_n2 + i;
                tmpz_n4_i = tmpz_n4 + i;
                tmpz_n34_i= tmpz_n34 + i;
                tmpz_i = tmpz + i;

                __asm__ volatile (
                    "lwc1 %[temp], 0(%[tmpz_n2_i]) \n\t"
                    "lwc1 %[temp1], 4(%[tmpz_n2_i]) \n\t"
                    "lwc1 %[pom], 0(%[tmpz_n34_i]) \n\t"
                    "lwc1 %[pom1], 4(%[tmpz_n34_i]) \n\t"
                    "mul.s %[temp3], %[w_im], %[temp] \n\t"
                    "mul.s %[temp4], %[w_im], %[temp1] \n\t"
                    "mul.s %[pom2], %[w_im], %[pom1] \n\t"
                    "mul.s %[pom3], %[w_im], %[pom] \n\t"
                    "msub.s %[tmp2], %[temp3], %[w_re], %[temp1] \n\t" // tmp2 = w_re * tmpz[ n2+i].im - w_im * tmpz[ n2+i].re;
                    "madd.s %[tmp1], %[temp4], %[w_re], %[temp] \n\t" // tmp1 = w_re * tmpz[ n2+i].re + w_im * tmpz[ n2+i].im;
                    "msub.s %[tmp3], %[pom2], %[w_re], %[pom] \n\t" // tmp3 = w_re * tmpz[n34+i].re - w_im * tmpz[n34+i].im;
                    "madd.s %[tmp4], %[pom3], %[w_re], %[pom1] \n\t" // tmp4 = w_re * tmpz[n34+i].im + w_im * tmpz[n34+i].re;
                    "lwc1 %[temp], 0(%[tmpz_i]) \n\t"
                    "lwc1 %[pom], 4(%[tmpz_i]) \n\t"
                    "add.s %[tmp5], %[tmp1], %[tmp3] \n\t" // tmp5 = tmp1 + tmp3;
                    "sub.s %[tmp1], %[tmp1], %[tmp3] \n\t" // tmp1 = tmp1 - tmp3;
                    "add.s %[tmp6], %[tmp2], %[tmp4] \n\t" // tmp6 = tmp2 + tmp4;
                    "sub.s %[tmp2], %[tmp2], %[tmp4] \n\t" // tmp2 = tmp2 - tmp4;
                    "sub.s %[temp1], %[temp], %[tmp5] \n\t"
                    "add.s %[temp], %[temp], %[tmp5] \n\t"
                    "sub.s %[pom1], %[pom], %[tmp6] \n\t"
                    "add.s %[pom], %[pom], %[tmp6] \n\t"
                    "lwc1 %[temp3], 0(%[tmpz_n4_i]) \n\t"
                    "lwc1 %[pom2], 4(%[tmpz_n4_i]) \n\t"
                    "swc1 %[temp1], 0(%[tmpz_n2_i]) \n\t" // tmpz[ n2+i].re = tmpz[ i].re - tmp5;
                    "swc1 %[temp], 0(%[tmpz_i]) \n\t" // tmpz[ i].re = tmpz[ i].re + tmp5;
                    "swc1 %[pom1], 4(%[tmpz_n2_i]) \n\t" // tmpz[ n2+i].im = tmpz[ i].im - tmp6;
                    "swc1 %[pom] , 4(%[tmpz_i]) \n\t" // tmpz[ i].im = tmpz[ i].im + tmp6;
                    "sub.s %[temp4], %[temp3], %[tmp2] \n\t"
                    "add.s %[pom3], %[pom2], %[tmp1] \n\t"
                    "add.s %[temp3], %[temp3], %[tmp2] \n\t"
                    "sub.s %[pom2], %[pom2], %[tmp1] \n\t"
                    "swc1 %[temp4], 0(%[tmpz_n34_i]) \n\t" // tmpz[n34+i].re = tmpz[n4+i].re - tmp2;
                    "swc1 %[pom3], 4(%[tmpz_n34_i]) \n\t" // tmpz[n34+i].im = tmpz[n4+i].im + tmp1;
                    "swc1 %[temp3], 0(%[tmpz_n4_i]) \n\t" // tmpz[ n4+i].re = tmpz[n4+i].re + tmp2;
                    "swc1 %[pom2], 4(%[tmpz_n4_i]) \n\t" // tmpz[ n4+i].im = tmpz[n4+i].im - tmp1;
                    : [tmp1]"=&f"(tmp1), [tmp2]"=&f" (tmp2), [temp]"=&f"(temp), [tmp3]"=&f"(tmp3),
                      [tmp4]"=&f"(tmp4), [tmp5]"=&f"(tmp5), [tmp6]"=&f"(tmp6),
                      [temp1]"=&f"(temp1), [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
                      [pom]"=&f"(pom), [pom1]"=&f"(pom1), [pom2]"=&f"(pom2), [pom3]"=&f"(pom3)
                    : [w_re]"f"(w_re), [w_im]"f"(w_im),
                      [tmpz_i]"r"(tmpz_i),[tmpz_n2_i]"r"(tmpz_n2_i),
                      [tmpz_n34_i]"r"(tmpz_n34_i), [tmpz_n4_i]"r"(tmpz_n4_i)
                    : "memory"
                );
                /* Advance the two table cursors in opposite directions. */
                w_re_ptr += step;
                w_im_ptr -= step;
            }
        }
        step >>= 1;
        n4 <<= 1;
    }
}
320 
321 /**
322  * MDCT/IMDCT transforms.
323  */
324 
/**
 * Half inverse MDCT: pre-rotation, FFT through s->fft_calc, then
 * post-rotation with reordering.
 *
 * @param s      context; reads mdct_bits, revtab, tcos and tsin, and calls
 *               s->fft_calc
 * @param output destination, accessed here as an array of FFTComplex (z);
 *               the pre-rotation stores to z[revtab[k]] and the FFT and
 *               post-rotation then work on z in place
 * @param input  source samples, read with two cursors moving forward from
 *               input[0] / input[2] and two moving backward from
 *               input[n2 - 1] / input[n2 - 3], where n2 = (1 << mdct_bits) / 2
 */
static void ff_imdct_half_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
{
    int k, n8, n4, n2, n, j;
    const uint16_t *revtab = s->revtab;
    const FFTSample *tcos = s->tcos;
    const FFTSample *tsin = s->tsin;
    const FFTSample *in1, *in2, *in3, *in4;
    FFTComplex *z = (FFTComplex *)output;

    int j1;
    const float *tcos1, *tsin1, *tcos2, *tsin2;
    float temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
          temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
    FFTComplex *z1, *z2;

    n = 1 << s->mdct_bits;
    n2 = n >> 1;
    n4 = n >> 2;
    n8 = n >> 3;

    /* pre rotation */
    in1 = input;
    in2 = input + n2 - 1;
    in3 = input + 2;
    in4 = input + n2 - 3;

    tcos1 = tcos;
    tsin1 = tsin;

    /* n4 = 64 or 128 */
    /* Two outputs per iteration (k and k + 1). The asm itself advances the
     * cursors: tcos1/tsin1 by 8 bytes, in1/in3 by +16 bytes and in2/in4 by
     * -16 bytes, which is why they are "+r" operands. */
    for(k = 0; k < n4; k += 2) {
        j = revtab[k ];
        j1 = revtab[k + 1];

        /* nmsub.s d, a, b, c yields a - b*c and madd.s d, a, b, c yields
         * a + b*c, so on exit:
         *   temp9  = in2[0]*tcos1[0] - in1[0]*tsin1[0]
         *   temp10 = in2[0]*tsin1[0] + in1[0]*tcos1[0]
         *   temp11 = in4[0]*tcos1[1] - in3[0]*tsin1[1]
         *   temp12 = in4[0]*tsin1[1] + in3[0]*tcos1[1]
         * (pointer values as they were on entry to the asm). */
        __asm__ volatile (
            "lwc1 %[temp1], 0(%[in2]) \t\n"
            "lwc1 %[temp2], 0(%[tcos1]) \t\n"
            "lwc1 %[temp3], 0(%[tsin1]) \t\n"
            "lwc1 %[temp4], 0(%[in1]) \t\n"
            "lwc1 %[temp5], 0(%[in4]) \t\n"
            "mul.s %[temp9], %[temp1], %[temp2] \t\n"
            "mul.s %[temp10], %[temp1], %[temp3] \t\n"
            "lwc1 %[temp6], 4(%[tcos1]) \t\n"
            "lwc1 %[temp7], 4(%[tsin1]) \t\n"
            "nmsub.s %[temp9], %[temp9], %[temp4], %[temp3] \t\n"
            "madd.s %[temp10], %[temp10], %[temp4], %[temp2] \t\n"
            "mul.s %[temp11], %[temp5], %[temp6] \t\n"
            "mul.s %[temp12], %[temp5], %[temp7] \t\n"
            "lwc1 %[temp8], 0(%[in3]) \t\n"
            PTR_ADDIU " %[tcos1], %[tcos1], 8 \t\n"
            PTR_ADDIU " %[tsin1], %[tsin1], 8 \t\n"
            PTR_ADDIU " %[in1], %[in1], 16 \t\n"
            "nmsub.s %[temp11], %[temp11], %[temp8], %[temp7] \t\n"
            "madd.s %[temp12], %[temp12], %[temp8], %[temp6] \t\n"
            PTR_ADDIU " %[in2], %[in2], -16 \t\n"
            PTR_ADDIU " %[in3], %[in3], 16 \t\n"
            PTR_ADDIU " %[in4], %[in4], -16 \t\n"

            : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
              [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
              [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
              [temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
              [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
              [tsin1]"+r"(tsin1), [tcos1]"+r"(tcos1),
              [in1]"+r"(in1), [in2]"+r"(in2),
              [in3]"+r"(in3), [in4]"+r"(in4)
            :
            : "memory"
        );

        /* Scatter the two results to the positions given by revtab. */
        z[j ].re = temp9;
        z[j ].im = temp10;
        z[j1].re = temp11;
        z[j1].im = temp12;
    }

    s->fft_calc(s, z);

    /* post rotation + reordering */
    /* n8 = 32 or 64 */
    /* Each iteration handles two elements below the midpoint n8 (z1[0],
     * z1[1]) and two at/above it (z2[0], z2[1]), with matching tcos/tsin
     * entries. */
    for(k = 0; k < n8; k += 2) {
        tcos1 = &tcos[n8 - k - 2];
        tsin1 = &tsin[n8 - k - 2];
        tcos2 = &tcos[n8 + k];
        tsin2 = &tsin[n8 + k];
        z1 = &z[n8 - k - 2];
        z2 = &z[n8 + k ];

        /* Byte offsets into z1/z2: 0 = [0].re, 4 = [0].im, 8 = [1].re,
         * 12 = [1].im. Results on exit:
         *   temp9  = z1[1].im*tsin1[1] - z1[1].re*tcos1[1]
         *   temp10 = z1[1].im*tcos1[1] + z1[1].re*tsin1[1]
         *   temp11 = z1[0].im*tsin1[0] - z1[0].re*tcos1[0]
         *   temp12 = z1[0].im*tcos1[0] + z1[0].re*tsin1[0]
         *   temp13 = z2[0].im*tsin2[0] - z2[0].re*tcos2[0]
         *   temp14 = z2[0].im*tcos2[0] + z2[0].re*tsin2[0]
         *   temp15 = z2[1].im*tsin2[1] - z2[1].re*tcos2[1]
         *   temp16 = z2[1].im*tcos2[1] + z2[1].re*tsin2[1]
         * temp1..temp8 are reused as scratch for the second half. */
        __asm__ volatile (
            "lwc1 %[temp1], 12(%[z1]) \t\n"
            "lwc1 %[temp2], 4(%[tsin1]) \t\n"
            "lwc1 %[temp3], 4(%[tcos1]) \t\n"
            "lwc1 %[temp4], 8(%[z1]) \t\n"
            "lwc1 %[temp5], 4(%[z1]) \t\n"
            "mul.s %[temp9], %[temp1], %[temp2] \t\n"
            "mul.s %[temp10], %[temp1], %[temp3] \t\n"
            "lwc1 %[temp6], 0(%[tsin1]) \t\n"
            "lwc1 %[temp7], 0(%[tcos1]) \t\n"
            "nmsub.s %[temp9], %[temp9], %[temp4], %[temp3] \t\n"
            "madd.s %[temp10], %[temp10], %[temp4], %[temp2] \t\n"
            "mul.s %[temp11], %[temp5], %[temp6] \t\n"
            "mul.s %[temp12], %[temp5], %[temp7] \t\n"
            "lwc1 %[temp8], 0(%[z1]) \t\n"
            "lwc1 %[temp1], 4(%[z2]) \t\n"
            "lwc1 %[temp2], 0(%[tsin2]) \t\n"
            "lwc1 %[temp3], 0(%[tcos2]) \t\n"
            "nmsub.s %[temp11], %[temp11], %[temp8], %[temp7] \t\n"
            "madd.s %[temp12], %[temp12], %[temp8], %[temp6] \t\n"
            "mul.s %[temp13], %[temp1], %[temp2] \t\n"
            "mul.s %[temp14], %[temp1], %[temp3] \t\n"
            "lwc1 %[temp4], 0(%[z2]) \t\n"
            "lwc1 %[temp5], 12(%[z2]) \t\n"
            "lwc1 %[temp6], 4(%[tsin2]) \t\n"
            "lwc1 %[temp7], 4(%[tcos2]) \t\n"
            "nmsub.s %[temp13], %[temp13], %[temp4], %[temp3] \t\n"
            "madd.s %[temp14], %[temp14], %[temp4], %[temp2] \t\n"
            "mul.s %[temp15], %[temp5], %[temp6] \t\n"
            "mul.s %[temp16], %[temp5], %[temp7] \t\n"
            "lwc1 %[temp8], 8(%[z2]) \t\n"
            "nmsub.s %[temp15], %[temp15], %[temp8], %[temp7] \t\n"
            "madd.s %[temp16], %[temp16], %[temp8], %[temp6] \t\n"
            : [temp1]"=&f"(temp1), [temp2]"=&f"(temp2),
              [temp3]"=&f"(temp3), [temp4]"=&f"(temp4),
              [temp5]"=&f"(temp5), [temp6]"=&f"(temp6),
              [temp7]"=&f"(temp7), [temp8]"=&f"(temp8),
              [temp9]"=&f"(temp9), [temp10]"=&f"(temp10),
              [temp11]"=&f"(temp11), [temp12]"=&f"(temp12),
              [temp13]"=&f"(temp13), [temp14]"=&f"(temp14),
              [temp15]"=&f"(temp15), [temp16]"=&f"(temp16)
            : [z1]"r"(z1), [z2]"r"(z2),
              [tsin1]"r"(tsin1), [tcos1]"r"(tcos1),
              [tsin2]"r"(tsin2), [tcos2]"r"(tcos2)
            : "memory"
        );

        /* Reordering: each .re keeps the value computed from its own
         * element, while the .im values are swapped between the mirrored
         * pairs (z1[1] <-> z2[0] and z1[0] <-> z2[1]). */
        z1[1].re = temp9;
        z1[1].im = temp14;
        z2[0].re = temp13;
        z2[0].im = temp10;

        z1[0].re = temp11;
        z1[0].im = temp16;
        z2[1].re = temp15;
        z2[1].im = temp12;
    }
}
472 
473 /**
474  * Compute inverse MDCT of size N = 2^nbits
475  * @param output N samples
476  * @param input N/2 samples
477  */
478 static void ff_imdct_calc_mips(FFTContext *s, FFTSample *output, const FFTSample *input)
479 {
480  int k;
481  int n = 1 << s->mdct_bits;
482  int n2 = n >> 1;
483  int n4 = n >> 2;
484 
485  ff_imdct_half_mips(s, output+n4, input);
486 
487  for(k = 0; k < n4; k+=4) {
488  output[k] = -output[n2-k-1];
489  output[k+1] = -output[n2-k-2];
490  output[k+2] = -output[n2-k-3];
491  output[k+3] = -output[n2-k-4];
492 
493  output[n-k-1] = output[n2+k];
494  output[n-k-2] = output[n2+k+1];
495  output[n-k-3] = output[n2+k+2];
496  output[n-k-4] = output[n2+k+3];
497  }
498 }
499 #endif /* !HAVE_MIPS32R6 && !HAVE_MIPS64R6 */
500 #endif /* HAVE_INLINE_ASM */
501 
503 {
504  ff_fft_lut_init();
506 
507 #if HAVE_INLINE_ASM
508 #if !HAVE_MIPS32R6 && !HAVE_MIPS64R6
509  s->fft_calc = ff_fft_calc_mips;
510 #if CONFIG_MDCT
511  s->imdct_calc = ff_imdct_calc_mips;
512  s->imdct_half = ff_imdct_half_mips;
513 #endif
514 #endif
515 #endif
516 }
ff_fft_lut_init
void ff_fft_lut_init(void)
Definition: fft_init_table.c:339
ff_init_ff_cos_tabs
#define ff_init_ff_cos_tabs
Definition: fft.h:108
output
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce output
Definition: filter_design.txt:225
ff_fft_init_mips
av_cold void ff_fft_init_mips(FFTContext *s)
FFT transform.
Definition: fft_mips.c:502
step
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But a word about which is also called distortion Distortion can be quantified by almost any quality measurement one chooses the sum of squared differences is used but more complex methods that consider psychovisual effects can be used as well It makes no difference in this discussion First step
Definition: rate_distortion.txt:58
asmdefs.h
MAX_FFT_SIZE
#define MAX_FFT_SIZE
Definition: fft_table.h:60
av_cold
#define av_cold
Definition: attributes.h:90
s
#define s(width, name)
Definition: cbs_vp9.c:256
MAX_LOG2_NFFT
#define MAX_LOG2_NFFT
Specifies maximum allowed fft size.
Definition: fft_table.h:59
FFTSample
float FFTSample
Definition: avfft.h:35
fft_table.h
FFTComplex::im
FFTSample im
Definition: avfft.h:38
FFTComplex::re
FFTSample re
Definition: avfft.h:38
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
attributes.h
input
and forward the test the status of outputs and forward it to the corresponding return FFERROR_NOT_READY If the filters stores internally one or a few frame for some input
Definition: filter_design.txt:172
FFTContext
Definition: fft.h:76
ff_fft_offsets_lut
uint16_t ff_fft_offsets_lut[21845]
Definition: fft_init_table.c:317
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:269
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
fft.h
temp
else temp
Definition: vf_mcdeint.c:248
PTR_ADDIU
#define PTR_ADDIU
Definition: asmdefs.h:50
FFTComplex
Definition: avfft.h:37