/*
 * Copyright (c) Lynne
 *
 * Power of two FFT:
 * Copyright (c) Lynne
 * Copyright (c) 2008 Loren Merritt
 * Copyright (c) 2002 Fabrice Bellard
 * Partly based on libdjbfft by D. J. Bernstein
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
26 
27 #define TABLE_DEF(name, size) \
28  DECLARE_ALIGNED(32, TXSample, TX_TAB(ff_tx_tab_ ##name))[size]
29 
30 #define SR_POW2_TABLES \
31  SR_TABLE(8) \
32  SR_TABLE(16) \
33  SR_TABLE(32) \
34  SR_TABLE(64) \
35  SR_TABLE(128) \
36  SR_TABLE(256) \
37  SR_TABLE(512) \
38  SR_TABLE(1024) \
39  SR_TABLE(2048) \
40  SR_TABLE(4096) \
41  SR_TABLE(8192) \
42  SR_TABLE(16384) \
43  SR_TABLE(32768) \
44  SR_TABLE(65536) \
45  SR_TABLE(131072) \
46  SR_TABLE(262144) \
47  SR_TABLE(524288) \
48  SR_TABLE(1048576) \
49  SR_TABLE(2097152) \
50 
51 #define SR_TABLE(len) \
52  TABLE_DEF(len, len/4 + 1);
53 /* Power of two tables */
55 #undef SR_TABLE
56 
57 /* Other factors' tables */
58 TABLE_DEF(53, 12);
59 TABLE_DEF( 7, 6);
60 TABLE_DEF( 9, 8);
61 
62 typedef struct FFTabInitData {
63  void (*func)(void);
64  int factors[TX_MAX_SUB]; /* Must be sorted high -> low */
66 
67 #define SR_TABLE(len) \
68 static av_cold void TX_TAB(ff_tx_init_tab_ ##len)(void) \
69 { \
70  double freq = 2*M_PI/len; \
71  TXSample *tab = TX_TAB(ff_tx_tab_ ##len); \
72  \
73  for (int i = 0; i < len/4; i++) \
74  *tab++ = RESCALE(cos(i*freq)); \
75  \
76  *tab = 0; \
77 }
79 #undef SR_TABLE
80 
81 static void (*const sr_tabs_init_funcs[])(void) = {
82 #define SR_TABLE(len) TX_TAB(ff_tx_init_tab_ ##len),
84 #undef SR_TABLE
85 };
86 
88 #define SR_TABLE(len) AV_ONCE_INIT,
90 #undef SR_TABLE
91 };
92 
93 static av_cold void TX_TAB(ff_tx_init_tab_53)(void)
94 {
95  /* 5pt, doubled to eliminate AVX lane shuffles */
96  TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI / 5));
97  TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI / 5));
98  TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10));
99  TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10));
100  TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI / 5));
101  TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI / 5));
102  TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10));
103  TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10));
104 
105  /* 3pt */
106  TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12));
107  TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12));
108  TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI / 6));
109  TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI / 6));
110 }
111 
112 static av_cold void TX_TAB(ff_tx_init_tab_7)(void)
113 {
114  TX_TAB(ff_tx_tab_7)[0] = RESCALE(cos(2 * M_PI / 7));
115  TX_TAB(ff_tx_tab_7)[1] = RESCALE(sin(2 * M_PI / 7));
116  TX_TAB(ff_tx_tab_7)[2] = RESCALE(sin(2 * M_PI / 28));
117  TX_TAB(ff_tx_tab_7)[3] = RESCALE(cos(2 * M_PI / 28));
118  TX_TAB(ff_tx_tab_7)[4] = RESCALE(cos(2 * M_PI / 14));
119  TX_TAB(ff_tx_tab_7)[5] = RESCALE(sin(2 * M_PI / 14));
120 }
121 
122 static av_cold void TX_TAB(ff_tx_init_tab_9)(void)
123 {
124  TX_TAB(ff_tx_tab_9)[0] = RESCALE(cos(2 * M_PI / 3));
125  TX_TAB(ff_tx_tab_9)[1] = RESCALE(sin(2 * M_PI / 3));
126  TX_TAB(ff_tx_tab_9)[2] = RESCALE(cos(2 * M_PI / 9));
127  TX_TAB(ff_tx_tab_9)[3] = RESCALE(sin(2 * M_PI / 9));
128  TX_TAB(ff_tx_tab_9)[4] = RESCALE(cos(2 * M_PI / 36));
129  TX_TAB(ff_tx_tab_9)[5] = RESCALE(sin(2 * M_PI / 36));
130  TX_TAB(ff_tx_tab_9)[6] = TX_TAB(ff_tx_tab_9)[2] + TX_TAB(ff_tx_tab_9)[5];
131  TX_TAB(ff_tx_tab_9)[7] = TX_TAB(ff_tx_tab_9)[3] - TX_TAB(ff_tx_tab_9)[4];
132 }
133 
135  { TX_TAB(ff_tx_init_tab_53), { 15, 5, 3 } },
136  { TX_TAB(ff_tx_init_tab_9), { 9 } },
137  { TX_TAB(ff_tx_init_tab_7), { 7 } },
138 };
139 
141  AV_ONCE_INIT,
142  AV_ONCE_INIT,
143  AV_ONCE_INIT,
144 };
145 
146 av_cold void TX_TAB(ff_tx_init_tabs)(int len)
147 {
148  int factor_2 = ff_ctz(len);
149  if (factor_2) {
150  int idx = factor_2 - 3;
151  for (int i = 0; i <= idx; i++)
154  len >>= factor_2;
155  }
156 
157  for (int i = 0; i < FF_ARRAY_ELEMS(nptwo_tabs_init_data); i++) {
158  int f, f_idx = 0;
159 
160  if (len <= 1)
161  return;
162 
163  while ((f = nptwo_tabs_init_data[i].factors[f_idx++])) {
164  if (f % len)
165  continue;
166 
169  len /= f;
170  break;
171  }
172  }
173 }
174 
176  ptrdiff_t stride)
177 {
178  TXComplex tmp[3];
179  const TXSample *tab = TX_TAB(ff_tx_tab_53);
180 #ifdef TX_INT32
181  int64_t mtmp[4];
182 #endif
183 
184  tmp[0] = in[0];
185  BF(tmp[1].re, tmp[2].im, in[1].im, in[2].im);
186  BF(tmp[1].im, tmp[2].re, in[1].re, in[2].re);
187 
188 #ifdef TX_INT32
189  out[0*stride].re = (int64_t)tmp[0].re + tmp[2].re;
190  out[0*stride].im = (int64_t)tmp[0].im + tmp[2].im;
191  mtmp[0] = (int64_t)tab[ 8] * tmp[1].re;
192  mtmp[1] = (int64_t)tab[ 9] * tmp[1].im;
193  mtmp[2] = (int64_t)tab[10] * tmp[2].re;
194  mtmp[3] = (int64_t)tab[10] * tmp[2].im;
195  out[1*stride].re = tmp[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31);
196  out[1*stride].im = tmp[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31);
197  out[2*stride].re = tmp[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31);
198  out[2*stride].im = tmp[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31);
199 #else
200  out[0*stride].re = tmp[0].re + tmp[2].re;
201  out[0*stride].im = tmp[0].im + tmp[2].im;
202  tmp[1].re = tab[ 8] * tmp[1].re;
203  tmp[1].im = tab[ 9] * tmp[1].im;
204  tmp[2].re = tab[10] * tmp[2].re;
205  tmp[2].im = tab[10] * tmp[2].im;
206  out[1*stride].re = tmp[0].re - tmp[2].re + tmp[1].re;
207  out[1*stride].im = tmp[0].im - tmp[2].im - tmp[1].im;
208  out[2*stride].re = tmp[0].re - tmp[2].re - tmp[1].re;
209  out[2*stride].im = tmp[0].im - tmp[2].im + tmp[1].im;
210 #endif
211 }
212 
213 #define DECL_FFT5(NAME, D0, D1, D2, D3, D4) \
214 static av_always_inline void NAME(TXComplex *out, TXComplex *in, \
215  ptrdiff_t stride) \
216 { \
217  TXComplex dc, z0[4], t[6]; \
218  const TXSample *tab = TX_TAB(ff_tx_tab_53); \
219  \
220  dc = in[0]; \
221  BF(t[1].im, t[0].re, in[1].re, in[4].re); \
222  BF(t[1].re, t[0].im, in[1].im, in[4].im); \
223  BF(t[3].im, t[2].re, in[2].re, in[3].re); \
224  BF(t[3].re, t[2].im, in[2].im, in[3].im); \
225  \
226  out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re; \
227  out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im; \
228  \
229  SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re); \
230  SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im); \
231  CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re); \
232  CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im); \
233  \
234  BF(z0[0].re, z0[3].re, t[0].re, t[1].re); \
235  BF(z0[0].im, z0[3].im, t[0].im, t[1].im); \
236  BF(z0[2].re, z0[1].re, t[4].re, t[5].re); \
237  BF(z0[2].im, z0[1].im, t[4].im, t[5].im); \
238  \
239  out[D1*stride].re = dc.re + (TXUSample)z0[3].re; \
240  out[D1*stride].im = dc.im + (TXUSample)z0[0].im; \
241  out[D2*stride].re = dc.re + (TXUSample)z0[2].re; \
242  out[D2*stride].im = dc.im + (TXUSample)z0[1].im; \
243  out[D3*stride].re = dc.re + (TXUSample)z0[1].re; \
244  out[D3*stride].im = dc.im + (TXUSample)z0[2].im; \
245  out[D4*stride].re = dc.re + (TXUSample)z0[0].re; \
246  out[D4*stride].im = dc.im + (TXUSample)z0[3].im; \
247 }
248 
249 DECL_FFT5(fft5, 0, 1, 2, 3, 4)
250 DECL_FFT5(fft5_m1, 0, 6, 12, 3, 9)
251 DECL_FFT5(fft5_m2, 10, 1, 7, 13, 4)
252 DECL_FFT5(fft5_m3, 5, 11, 2, 8, 14)
253 
255  ptrdiff_t stride)
256 {
257  TXComplex dc, t[6], z[3];
258  const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_7);
259 #ifdef TX_INT32
260  int64_t mtmp[12];
261 #endif
262 
263  dc = in[0];
264  BF(t[1].re, t[0].re, in[1].re, in[6].re);
265  BF(t[1].im, t[0].im, in[1].im, in[6].im);
266  BF(t[3].re, t[2].re, in[2].re, in[5].re);
267  BF(t[3].im, t[2].im, in[2].im, in[5].im);
268  BF(t[5].re, t[4].re, in[3].re, in[4].re);
269  BF(t[5].im, t[4].im, in[3].im, in[4].im);
270 
271  out[0*stride].re = dc.re + t[0].re + t[2].re + t[4].re;
272  out[0*stride].im = dc.im + t[0].im + t[2].im + t[4].im;
273 
274 #ifdef TX_INT32 /* NOTE: it's possible to do this with 16 mults but 72 adds */
275  mtmp[ 0] = ((int64_t)tab[0].re)*t[0].re - ((int64_t)tab[2].re)*t[4].re;
276  mtmp[ 1] = ((int64_t)tab[0].re)*t[4].re - ((int64_t)tab[1].re)*t[0].re;
277  mtmp[ 2] = ((int64_t)tab[0].re)*t[2].re - ((int64_t)tab[2].re)*t[0].re;
278  mtmp[ 3] = ((int64_t)tab[0].re)*t[0].im - ((int64_t)tab[1].re)*t[2].im;
279  mtmp[ 4] = ((int64_t)tab[0].re)*t[4].im - ((int64_t)tab[1].re)*t[0].im;
280  mtmp[ 5] = ((int64_t)tab[0].re)*t[2].im - ((int64_t)tab[2].re)*t[0].im;
281 
282  mtmp[ 6] = ((int64_t)tab[2].im)*t[1].im + ((int64_t)tab[1].im)*t[5].im;
283  mtmp[ 7] = ((int64_t)tab[0].im)*t[5].im + ((int64_t)tab[2].im)*t[3].im;
284  mtmp[ 8] = ((int64_t)tab[2].im)*t[5].im + ((int64_t)tab[1].im)*t[3].im;
285  mtmp[ 9] = ((int64_t)tab[0].im)*t[1].re + ((int64_t)tab[1].im)*t[3].re;
286  mtmp[10] = ((int64_t)tab[2].im)*t[3].re + ((int64_t)tab[0].im)*t[5].re;
287  mtmp[11] = ((int64_t)tab[2].im)*t[1].re + ((int64_t)tab[1].im)*t[5].re;
288 
289  z[0].re = (int32_t)(mtmp[ 0] - ((int64_t)tab[1].re)*t[2].re + 0x40000000 >> 31);
290  z[1].re = (int32_t)(mtmp[ 1] - ((int64_t)tab[2].re)*t[2].re + 0x40000000 >> 31);
291  z[2].re = (int32_t)(mtmp[ 2] - ((int64_t)tab[1].re)*t[4].re + 0x40000000 >> 31);
292  z[0].im = (int32_t)(mtmp[ 3] - ((int64_t)tab[2].re)*t[4].im + 0x40000000 >> 31);
293  z[1].im = (int32_t)(mtmp[ 4] - ((int64_t)tab[2].re)*t[2].im + 0x40000000 >> 31);
294  z[2].im = (int32_t)(mtmp[ 5] - ((int64_t)tab[1].re)*t[4].im + 0x40000000 >> 31);
295 
296  t[0].re = (int32_t)(mtmp[ 6] - ((int64_t)tab[0].im)*t[3].im + 0x40000000 >> 31);
297  t[2].re = (int32_t)(mtmp[ 7] - ((int64_t)tab[1].im)*t[1].im + 0x40000000 >> 31);
298  t[4].re = (int32_t)(mtmp[ 8] + ((int64_t)tab[0].im)*t[1].im + 0x40000000 >> 31);
299  t[0].im = (int32_t)(mtmp[ 9] + ((int64_t)tab[2].im)*t[5].re + 0x40000000 >> 31);
300  t[2].im = (int32_t)(mtmp[10] - ((int64_t)tab[1].im)*t[1].re + 0x40000000 >> 31);
301  t[4].im = (int32_t)(mtmp[11] - ((int64_t)tab[0].im)*t[3].re + 0x40000000 >> 31);
302 #else
303  z[0].re = tab[0].re*t[0].re - tab[2].re*t[4].re - tab[1].re*t[2].re;
304  z[1].re = tab[0].re*t[4].re - tab[1].re*t[0].re - tab[2].re*t[2].re;
305  z[2].re = tab[0].re*t[2].re - tab[2].re*t[0].re - tab[1].re*t[4].re;
306  z[0].im = tab[0].re*t[0].im - tab[1].re*t[2].im - tab[2].re*t[4].im;
307  z[1].im = tab[0].re*t[4].im - tab[1].re*t[0].im - tab[2].re*t[2].im;
308  z[2].im = tab[0].re*t[2].im - tab[2].re*t[0].im - tab[1].re*t[4].im;
309 
310  /* It's possible to do t[4].re and t[0].im with 2 multiplies only by
311  * multiplying the sum of all with the average of the twiddles */
312 
313  t[0].re = tab[2].im*t[1].im + tab[1].im*t[5].im - tab[0].im*t[3].im;
314  t[2].re = tab[0].im*t[5].im + tab[2].im*t[3].im - tab[1].im*t[1].im;
315  t[4].re = tab[2].im*t[5].im + tab[1].im*t[3].im + tab[0].im*t[1].im;
316  t[0].im = tab[0].im*t[1].re + tab[1].im*t[3].re + tab[2].im*t[5].re;
317  t[2].im = tab[2].im*t[3].re + tab[0].im*t[5].re - tab[1].im*t[1].re;
318  t[4].im = tab[2].im*t[1].re + tab[1].im*t[5].re - tab[0].im*t[3].re;
319 #endif
320 
321  BF(t[1].re, z[0].re, z[0].re, t[4].re);
322  BF(t[3].re, z[1].re, z[1].re, t[2].re);
323  BF(t[5].re, z[2].re, z[2].re, t[0].re);
324  BF(t[1].im, z[0].im, z[0].im, t[0].im);
325  BF(t[3].im, z[1].im, z[1].im, t[2].im);
326  BF(t[5].im, z[2].im, z[2].im, t[4].im);
327 
328  out[1*stride].re = dc.re + z[0].re;
329  out[1*stride].im = dc.im + t[1].im;
330  out[2*stride].re = dc.re + t[3].re;
331  out[2*stride].im = dc.im + z[1].im;
332  out[3*stride].re = dc.re + z[2].re;
333  out[3*stride].im = dc.im + t[5].im;
334  out[4*stride].re = dc.re + t[5].re;
335  out[4*stride].im = dc.im + z[2].im;
336  out[5*stride].re = dc.re + z[1].re;
337  out[5*stride].im = dc.im + t[3].im;
338  out[6*stride].re = dc.re + t[1].re;
339  out[6*stride].im = dc.im + z[0].im;
340 }
341 
343  ptrdiff_t stride)
344 {
345  const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_9);
346  TXComplex dc, t[16], w[4], x[5], y[5], z[2];
347 #ifdef TX_INT32
348  int64_t mtmp[12];
349 #endif
350 
351  dc = in[0];
352  BF(t[1].re, t[0].re, in[1].re, in[8].re);
353  BF(t[1].im, t[0].im, in[1].im, in[8].im);
354  BF(t[3].re, t[2].re, in[2].re, in[7].re);
355  BF(t[3].im, t[2].im, in[2].im, in[7].im);
356  BF(t[5].re, t[4].re, in[3].re, in[6].re);
357  BF(t[5].im, t[4].im, in[3].im, in[6].im);
358  BF(t[7].re, t[6].re, in[4].re, in[5].re);
359  BF(t[7].im, t[6].im, in[4].im, in[5].im);
360 
361  w[0].re = t[0].re - t[6].re;
362  w[0].im = t[0].im - t[6].im;
363  w[1].re = t[2].re - t[6].re;
364  w[1].im = t[2].im - t[6].im;
365  w[2].re = t[1].re - t[7].re;
366  w[2].im = t[1].im - t[7].im;
367  w[3].re = t[3].re + t[7].re;
368  w[3].im = t[3].im + t[7].im;
369 
370  z[0].re = dc.re + t[4].re;
371  z[0].im = dc.im + t[4].im;
372 
373  z[1].re = t[0].re + t[2].re + t[6].re;
374  z[1].im = t[0].im + t[2].im + t[6].im;
375 
376  out[0*stride].re = z[0].re + z[1].re;
377  out[0*stride].im = z[0].im + z[1].im;
378 
379 #ifdef TX_INT32
380  mtmp[0] = t[1].re - t[3].re + t[7].re;
381  mtmp[1] = t[1].im - t[3].im + t[7].im;
382 
383  y[3].re = (int32_t)(((int64_t)tab[0].im)*mtmp[0] + 0x40000000 >> 31);
384  y[3].im = (int32_t)(((int64_t)tab[0].im)*mtmp[1] + 0x40000000 >> 31);
385 
386  mtmp[0] = (int32_t)(((int64_t)tab[0].re)*z[1].re + 0x40000000 >> 31);
387  mtmp[1] = (int32_t)(((int64_t)tab[0].re)*z[1].im + 0x40000000 >> 31);
388  mtmp[2] = (int32_t)(((int64_t)tab[0].re)*t[4].re + 0x40000000 >> 31);
389  mtmp[3] = (int32_t)(((int64_t)tab[0].re)*t[4].im + 0x40000000 >> 31);
390 
391  x[3].re = z[0].re + (int32_t)mtmp[0];
392  x[3].im = z[0].im + (int32_t)mtmp[1];
393  z[0].re = in[0].re + (int32_t)mtmp[2];
394  z[0].im = in[0].im + (int32_t)mtmp[3];
395 
396  mtmp[0] = ((int64_t)tab[1].re)*w[0].re;
397  mtmp[1] = ((int64_t)tab[1].re)*w[0].im;
398  mtmp[2] = ((int64_t)tab[2].im)*w[0].re;
399  mtmp[3] = ((int64_t)tab[2].im)*w[0].im;
400  mtmp[4] = ((int64_t)tab[1].im)*w[2].re;
401  mtmp[5] = ((int64_t)tab[1].im)*w[2].im;
402  mtmp[6] = ((int64_t)tab[2].re)*w[2].re;
403  mtmp[7] = ((int64_t)tab[2].re)*w[2].im;
404 
405  x[1].re = (int32_t)(mtmp[0] + ((int64_t)tab[2].im)*w[1].re + 0x40000000 >> 31);
406  x[1].im = (int32_t)(mtmp[1] + ((int64_t)tab[2].im)*w[1].im + 0x40000000 >> 31);
407  x[2].re = (int32_t)(mtmp[2] - ((int64_t)tab[3].re)*w[1].re + 0x40000000 >> 31);
408  x[2].im = (int32_t)(mtmp[3] - ((int64_t)tab[3].re)*w[1].im + 0x40000000 >> 31);
409  y[1].re = (int32_t)(mtmp[4] + ((int64_t)tab[2].re)*w[3].re + 0x40000000 >> 31);
410  y[1].im = (int32_t)(mtmp[5] + ((int64_t)tab[2].re)*w[3].im + 0x40000000 >> 31);
411  y[2].re = (int32_t)(mtmp[6] - ((int64_t)tab[3].im)*w[3].re + 0x40000000 >> 31);
412  y[2].im = (int32_t)(mtmp[7] - ((int64_t)tab[3].im)*w[3].im + 0x40000000 >> 31);
413 
414  y[0].re = (int32_t)(((int64_t)tab[0].im)*t[5].re + 0x40000000 >> 31);
415  y[0].im = (int32_t)(((int64_t)tab[0].im)*t[5].im + 0x40000000 >> 31);
416 
417 #else
418  y[3].re = tab[0].im*(t[1].re - t[3].re + t[7].re);
419  y[3].im = tab[0].im*(t[1].im - t[3].im + t[7].im);
420 
421  x[3].re = z[0].re + tab[0].re*z[1].re;
422  x[3].im = z[0].im + tab[0].re*z[1].im;
423  z[0].re = dc.re + tab[0].re*t[4].re;
424  z[0].im = dc.im + tab[0].re*t[4].im;
425 
426  x[1].re = tab[1].re*w[0].re + tab[2].im*w[1].re;
427  x[1].im = tab[1].re*w[0].im + tab[2].im*w[1].im;
428  x[2].re = tab[2].im*w[0].re - tab[3].re*w[1].re;
429  x[2].im = tab[2].im*w[0].im - tab[3].re*w[1].im;
430  y[1].re = tab[1].im*w[2].re + tab[2].re*w[3].re;
431  y[1].im = tab[1].im*w[2].im + tab[2].re*w[3].im;
432  y[2].re = tab[2].re*w[2].re - tab[3].im*w[3].re;
433  y[2].im = tab[2].re*w[2].im - tab[3].im*w[3].im;
434 
435  y[0].re = tab[0].im*t[5].re;
436  y[0].im = tab[0].im*t[5].im;
437 #endif
438 
439  x[4].re = x[1].re + x[2].re;
440  x[4].im = x[1].im + x[2].im;
441 
442  y[4].re = y[1].re - y[2].re;
443  y[4].im = y[1].im - y[2].im;
444  x[1].re = z[0].re + x[1].re;
445  x[1].im = z[0].im + x[1].im;
446  y[1].re = y[0].re + y[1].re;
447  y[1].im = y[0].im + y[1].im;
448  x[2].re = z[0].re + x[2].re;
449  x[2].im = z[0].im + x[2].im;
450  y[2].re = y[2].re - y[0].re;
451  y[2].im = y[2].im - y[0].im;
452  x[4].re = z[0].re - x[4].re;
453  x[4].im = z[0].im - x[4].im;
454  y[4].re = y[0].re - y[4].re;
455  y[4].im = y[0].im - y[4].im;
456 
457  out[1*stride] = (TXComplex){ x[1].re + y[1].im, x[1].im - y[1].re };
458  out[2*stride] = (TXComplex){ x[2].re + y[2].im, x[2].im - y[2].re };
459  out[3*stride] = (TXComplex){ x[3].re + y[3].im, x[3].im - y[3].re };
460  out[4*stride] = (TXComplex){ x[4].re + y[4].im, x[4].im - y[4].re };
461  out[5*stride] = (TXComplex){ x[4].re - y[4].im, x[4].im + y[4].re };
462  out[6*stride] = (TXComplex){ x[3].re - y[3].im, x[3].im + y[3].re };
463  out[7*stride] = (TXComplex){ x[2].re - y[2].im, x[2].im + y[2].re };
464  out[8*stride] = (TXComplex){ x[1].re - y[1].im, x[1].im + y[1].re };
465 }
466 
468  ptrdiff_t stride)
469 {
470  TXComplex tmp[15];
471 
472  for (int i = 0; i < 5; i++)
473  fft3(tmp + i, in + i*3, 5);
474 
475  fft5_m1(out, tmp + 0, stride);
476  fft5_m2(out, tmp + 5, stride);
477  fft5_m3(out, tmp + 10, stride);
478 }
479 
481  const FFTXCodelet *cd,
482  uint64_t flags,
484  int len, int inv,
485  const void *scale)
486 {
487  int ret = 0;
488  TX_TAB(ff_tx_init_tabs)(len);
489 
490  if (len == 15)
491  ret = ff_tx_gen_pfa_input_map(s, opts, 3, 5);
492  else if (flags & FF_TX_PRESHUFFLE)
494 
495  return ret;
496 }
497 
498 #define DECL_FACTOR_S(n) \
499 static void TX_NAME(ff_tx_fft##n)(AVTXContext *s, void *dst, \
500  void *src, ptrdiff_t stride) \
501 { \
502  fft##n((TXComplex *)dst, (TXComplex *)src, stride / sizeof(TXComplex)); \
503 } \
504 static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = { \
505  .name = TX_NAME_STR("fft" #n "_ns"), \
506  .function = TX_NAME(ff_tx_fft##n), \
507  .type = TX_TYPE(FFT), \
508  .flags = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE | \
509  AV_TX_UNALIGNED | FF_TX_PRESHUFFLE, \
510  .factors[0] = n, \
511  .nb_factors = 1, \
512  .min_len = n, \
513  .max_len = n, \
514  .init = TX_NAME(ff_tx_fft_factor_init), \
515  .cpu_flags = FF_TX_CPU_FLAGS_ALL, \
516  .prio = FF_TX_PRIO_BASE, \
517 };
518 
519 #define DECL_FACTOR_F(n) \
520 DECL_FACTOR_S(n) \
521 static const FFTXCodelet TX_NAME(ff_tx_fft##n##_fwd_def) = { \
522  .name = TX_NAME_STR("fft" #n "_fwd"), \
523  .function = TX_NAME(ff_tx_fft##n), \
524  .type = TX_TYPE(FFT), \
525  .flags = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE | \
526  AV_TX_UNALIGNED | FF_TX_FORWARD_ONLY, \
527  .factors[0] = n, \
528  .nb_factors = 1, \
529  .min_len = n, \
530  .max_len = n, \
531  .init = TX_NAME(ff_tx_fft_factor_init), \
532  .cpu_flags = FF_TX_CPU_FLAGS_ALL, \
533  .prio = FF_TX_PRIO_BASE, \
534 };
535 
536 DECL_FACTOR_F(3)
537 DECL_FACTOR_F(5)
538 DECL_FACTOR_F(7)
539 DECL_FACTOR_F(9)
540 DECL_FACTOR_S(15)
541 
/* Split-radix butterfly core. Expects t1/t2 (and t5/t6 via TRANSFORM or
 * the caller) to already hold the twiddled inputs; overwrites a0..a3. */
#define BUTTERFLIES(a0, a1, a2, a3)    \
    do {                               \
        r0=a0.re;                      \
        i0=a0.im;                      \
        r1=a1.re;                      \
        i1=a1.im;                      \
        BF(t3, t5, t5, t1);            \
        BF(a2.re, a0.re, r0, t5);      \
        BF(a3.im, a1.im, i1, t3);      \
        BF(t4, t6, t2, t6);            \
        BF(a3.re, a1.re, r1, t4);      \
        BF(a2.im, a0.im, i0, t6);      \
    } while (0)

/* Twiddle a2/a3 by (wre, ±wim) then run the butterfly above. */
#define TRANSFORM(a0, a1, a2, a3, wre, wim)   \
    do {                                      \
        CMUL(t1, t2, a2.re, a2.im, wre, -wim); \
        CMUL(t5, t6, a3.re, a3.im, wre,  wim); \
        BUTTERFLIES(a0, a1, a2, a3);          \
    } while (0)
562 
563 /* z[0...8n-1], w[1...2n-1] */
564 static inline void TX_NAME(ff_tx_fft_sr_combine)(TXComplex *z,
565  const TXSample *cos, int len)
566 {
567  int o1 = 2*len;
568  int o2 = 4*len;
569  int o3 = 6*len;
570  const TXSample *wim = cos + o1 - 7;
571  TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
572 
573  for (int i = 0; i < len; i += 4) {
574  TRANSFORM(z[0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]);
575  TRANSFORM(z[2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[2], wim[5]);
576  TRANSFORM(z[4], z[o1 + 4], z[o2 + 4], z[o3 + 4], cos[4], wim[3]);
577  TRANSFORM(z[6], z[o1 + 6], z[o2 + 6], z[o3 + 6], cos[6], wim[1]);
578 
579  TRANSFORM(z[1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[1], wim[6]);
580  TRANSFORM(z[3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[3], wim[4]);
581  TRANSFORM(z[5], z[o1 + 5], z[o2 + 5], z[o3 + 5], cos[5], wim[2]);
582  TRANSFORM(z[7], z[o1 + 7], z[o2 + 7], z[o3 + 7], cos[7], wim[0]);
583 
584  z += 2*4;
585  cos += 2*4;
586  wim -= 2*4;
587  }
588 }
589 
591  const FFTXCodelet *cd,
592  uint64_t flags,
594  int len, int inv,
595  const void *scale)
596 {
597  TX_TAB(ff_tx_init_tabs)(len);
598  return ff_tx_gen_ptwo_revtab(s, opts);
599 }
600 
/* Codelet definition for a split-radix power-of-two transform of length n. */
#define DECL_SR_CODELET_DEF(n)                              \
static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = { \
    .name       = TX_NAME_STR("fft" #n "_ns"),              \
    .function   = TX_NAME(ff_tx_fft##n##_ns),               \
    .type       = TX_TYPE(FFT),                             \
    .flags      = FF_TX_OUT_OF_PLACE | AV_TX_INPLACE |      \
                  AV_TX_UNALIGNED | FF_TX_PRESHUFFLE,       \
    .factors[0] = 2,                                        \
    .nb_factors = 1,                                        \
    .min_len    = n,                                        \
    .max_len    = n,                                        \
    .init       = TX_NAME(ff_tx_fft_sr_codelet_init),       \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                      \
    .prio       = FF_TX_PRIO_BASE,                          \
};

/* Declare a length-n split-radix step: one half-size and two quarter-size
 * sub-transforms followed by the combine pass, plus its codelet def. */
#define DECL_SR_CODELET(n, n2, n4)                                       \
static void TX_NAME(ff_tx_fft##n##_ns)(AVTXContext *s, void *_dst,       \
                                       void *_src, ptrdiff_t stride)     \
{                                                                        \
    TXComplex *src = _src;                                               \
    TXComplex *dst = _dst;                                               \
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                         \
                                                                         \
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);     \
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);     \
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);     \
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                   \
}                                                                        \
                                                                         \
DECL_SR_CODELET_DEF(n)
632 
633 static void TX_NAME(ff_tx_fft2_ns)(AVTXContext *s, void *_dst,
634  void *_src, ptrdiff_t stride)
635 {
636  TXComplex *src = _src;
637  TXComplex *dst = _dst;
638  TXComplex tmp;
639 
640  BF(tmp.re, dst[0].re, src[0].re, src[1].re);
641  BF(tmp.im, dst[0].im, src[0].im, src[1].im);
642  dst[1] = tmp;
643 }
644 
645 static void TX_NAME(ff_tx_fft4_ns)(AVTXContext *s, void *_dst,
646  void *_src, ptrdiff_t stride)
647 {
648  TXComplex *src = _src;
649  TXComplex *dst = _dst;
650  TXSample t1, t2, t3, t4, t5, t6, t7, t8;
651 
652  BF(t3, t1, src[0].re, src[1].re);
653  BF(t8, t6, src[3].re, src[2].re);
654  BF(dst[2].re, dst[0].re, t1, t6);
655  BF(t4, t2, src[0].im, src[1].im);
656  BF(t7, t5, src[2].im, src[3].im);
657  BF(dst[3].im, dst[1].im, t4, t8);
658  BF(dst[3].re, dst[1].re, t3, t7);
659  BF(dst[2].im, dst[0].im, t2, t5);
660 }
661 
662 static void TX_NAME(ff_tx_fft8_ns)(AVTXContext *s, void *_dst,
663  void *_src, ptrdiff_t stride)
664 {
665  TXComplex *src = _src;
666  TXComplex *dst = _dst;
667  TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
668  const TXSample cos = TX_TAB(ff_tx_tab_8)[1];
669 
670  TX_NAME(ff_tx_fft4_ns)(s, dst, src, stride);
671 
672  BF(t1, dst[5].re, src[4].re, -src[5].re);
673  BF(t2, dst[5].im, src[4].im, -src[5].im);
674  BF(t5, dst[7].re, src[6].re, -src[7].re);
675  BF(t6, dst[7].im, src[6].im, -src[7].im);
676 
677  BUTTERFLIES(dst[0], dst[2], dst[4], dst[6]);
678  TRANSFORM(dst[1], dst[3], dst[5], dst[7], cos, cos);
679 }
680 
681 static void TX_NAME(ff_tx_fft16_ns)(AVTXContext *s, void *_dst,
682  void *_src, ptrdiff_t stride)
683 {
684  TXComplex *src = _src;
685  TXComplex *dst = _dst;
686  const TXSample *cos = TX_TAB(ff_tx_tab_16);
687 
688  TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
689  TXSample cos_16_1 = cos[1];
690  TXSample cos_16_2 = cos[2];
691  TXSample cos_16_3 = cos[3];
692 
693  TX_NAME(ff_tx_fft8_ns)(s, dst + 0, src + 0, stride);
694  TX_NAME(ff_tx_fft4_ns)(s, dst + 8, src + 8, stride);
695  TX_NAME(ff_tx_fft4_ns)(s, dst + 12, src + 12, stride);
696 
697  t1 = dst[ 8].re;
698  t2 = dst[ 8].im;
699  t5 = dst[12].re;
700  t6 = dst[12].im;
701  BUTTERFLIES(dst[0], dst[4], dst[8], dst[12]);
702 
703  TRANSFORM(dst[ 2], dst[ 6], dst[10], dst[14], cos_16_2, cos_16_2);
704  TRANSFORM(dst[ 1], dst[ 5], dst[ 9], dst[13], cos_16_1, cos_16_3);
705  TRANSFORM(dst[ 3], dst[ 7], dst[11], dst[15], cos_16_3, cos_16_1);
706 }
707 
712 DECL_SR_CODELET(32,16,8)
713 DECL_SR_CODELET(64,32,16)
714 DECL_SR_CODELET(128,64,32)
715 DECL_SR_CODELET(256,128,64)
716 DECL_SR_CODELET(512,256,128)
717 DECL_SR_CODELET(1024,512,256)
718 DECL_SR_CODELET(2048,1024,512)
719 DECL_SR_CODELET(4096,2048,1024)
720 DECL_SR_CODELET(8192,4096,2048)
721 DECL_SR_CODELET(16384,8192,4096)
722 DECL_SR_CODELET(32768,16384,8192)
723 DECL_SR_CODELET(65536,32768,16384)
724 DECL_SR_CODELET(131072,65536,32768)
725 DECL_SR_CODELET(262144,131072,65536)
726 DECL_SR_CODELET(524288,262144,131072)
727 DECL_SR_CODELET(1048576,524288,262144)
728 DECL_SR_CODELET(2097152,1048576,524288)
729 
731  const FFTXCodelet *cd,
732  uint64_t flags,
734  int len, int inv,
735  const void *scale)
736 {
737  int ret;
738  int is_inplace = !!(flags & AV_TX_INPLACE);
739  FFTXCodeletOptions sub_opts = {
740  .map_dir = is_inplace ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
741  };
742 
743  flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
744  flags |= AV_TX_INPLACE; /* in-place */
745  flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
746 
747  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len, inv, scale)))
748  return ret;
749 
750  if (is_inplace && (ret = ff_tx_gen_inplace_map(s, len)))
751  return ret;
752 
753  return 0;
754 }
755 
757  const FFTXCodelet *cd,
758  uint64_t flags,
760  int len, int inv,
761  const void *scale)
762 {
763  if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
764  return AVERROR(ENOMEM);
765  flags &= ~AV_TX_INPLACE;
766  return TX_NAME(ff_tx_fft_init)(s, cd, flags, opts, len, inv, scale);
767 }
768 
769 static void TX_NAME(ff_tx_fft)(AVTXContext *s, void *_dst,
770  void *_src, ptrdiff_t stride)
771 {
772  TXComplex *src = _src;
773  TXComplex *dst1 = s->flags & AV_TX_INPLACE ? s->tmp : _dst;
774  TXComplex *dst2 = _dst;
775  int *map = s->sub[0].map;
776  int len = s->len;
777 
778  /* Compilers can't vectorize this anyway without assuming AVX2, which they
779  * generally don't, at least without -march=native -mtune=native */
780  for (int i = 0; i < len; i++)
781  dst1[i] = src[map[i]];
782 
783  s->fn[0](&s->sub[0], dst2, dst1, stride);
784 }
785 
786 static void TX_NAME(ff_tx_fft_inplace)(AVTXContext *s, void *_dst,
787  void *_src, ptrdiff_t stride)
788 {
789  TXComplex *src = _src;
790  TXComplex *dst = _dst;
791  TXComplex tmp;
792  const int *map = s->sub->map;
793  const int *inplace_idx = s->map;
794  int src_idx, dst_idx;
795 
796  src_idx = *inplace_idx++;
797  do {
798  tmp = src[src_idx];
799  dst_idx = map[src_idx];
800  do {
801  FFSWAP(TXComplex, tmp, src[dst_idx]);
802  dst_idx = map[dst_idx];
803  } while (dst_idx != src_idx); /* Can be > as well, but was less predictable */
804  src[dst_idx] = tmp;
805  } while ((src_idx = *inplace_idx++));
806 
807  s->fn[0](&s->sub[0], dst, src, stride);
808 }
809 
810 static const FFTXCodelet TX_NAME(ff_tx_fft_def) = {
811  .name = TX_NAME_STR("fft"),
812  .function = TX_NAME(ff_tx_fft),
813  .type = TX_TYPE(FFT),
815  .factors[0] = TX_FACTOR_ANY,
816  .nb_factors = 1,
817  .min_len = 2,
818  .max_len = TX_LEN_UNLIMITED,
819  .init = TX_NAME(ff_tx_fft_init),
821  .prio = FF_TX_PRIO_BASE,
822 };
823 
824 static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_small_def) = {
825  .name = TX_NAME_STR("fft_inplace_small"),
826  .function = TX_NAME(ff_tx_fft),
827  .type = TX_TYPE(FFT),
829  .factors[0] = TX_FACTOR_ANY,
830  .nb_factors = 1,
831  .min_len = 2,
832  .max_len = 65536,
835  .prio = FF_TX_PRIO_BASE - 256,
836 };
837 
838 static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_def) = {
839  .name = TX_NAME_STR("fft_inplace"),
840  .function = TX_NAME(ff_tx_fft_inplace),
841  .type = TX_TYPE(FFT),
843  .factors[0] = TX_FACTOR_ANY,
844  .nb_factors = 1,
845  .min_len = 2,
846  .max_len = TX_LEN_UNLIMITED,
847  .init = TX_NAME(ff_tx_fft_init),
849  .prio = FF_TX_PRIO_BASE - 512,
850 };
851 
853  const FFTXCodelet *cd,
854  uint64_t flags,
856  int len, int inv,
857  const void *scale)
858 {
859  const double phase = s->inv ? 2.0*M_PI/len : -2.0*M_PI/len;
860 
861  if (!(s->exp = av_malloc(len*len*sizeof(*s->exp))))
862  return AVERROR(ENOMEM);
863 
864  for (int i = 0; i < len; i++) {
865  for (int j = 0; j < len; j++) {
866  const double factor = phase*i*j;
867  s->exp[i*j] = (TXComplex){
868  RESCALE(cos(factor)),
869  RESCALE(sin(factor)),
870  };
871  }
872  }
873 
874  return 0;
875 }
876 
877 static void TX_NAME(ff_tx_fft_naive)(AVTXContext *s, void *_dst, void *_src,
878  ptrdiff_t stride)
879 {
880  TXComplex *src = _src;
881  TXComplex *dst = _dst;
882  const int n = s->len;
883  double phase = s->inv ? 2.0*M_PI/n : -2.0*M_PI/n;
884 
885  stride /= sizeof(*dst);
886 
887  for (int i = 0; i < n; i++) {
888  TXComplex tmp = { 0 };
889  for (int j = 0; j < n; j++) {
890  const double factor = phase*i*j;
891  const TXComplex mult = {
892  RESCALE(cos(factor)),
893  RESCALE(sin(factor)),
894  };
895  TXComplex res;
896  CMUL3(res, src[j], mult);
897  tmp.re += res.re;
898  tmp.im += res.im;
899  }
900  dst[i*stride] = tmp;
901  }
902 }
903 
904 static void TX_NAME(ff_tx_fft_naive_small)(AVTXContext *s, void *_dst, void *_src,
905  ptrdiff_t stride)
906 {
907  TXComplex *src = _src;
908  TXComplex *dst = _dst;
909  const int n = s->len;
910 
911  stride /= sizeof(*dst);
912 
913  for (int i = 0; i < n; i++) {
914  TXComplex tmp = { 0 };
915  for (int j = 0; j < n; j++) {
916  TXComplex res;
917  const TXComplex mult = s->exp[i*j];
918  CMUL3(res, src[j], mult);
919  tmp.re += res.re;
920  tmp.im += res.im;
921  }
922  dst[i*stride] = tmp;
923  }
924 }
925 
926 static const FFTXCodelet TX_NAME(ff_tx_fft_naive_small_def) = {
927  .name = TX_NAME_STR("fft_naive_small"),
928  .function = TX_NAME(ff_tx_fft_naive_small),
929  .type = TX_TYPE(FFT),
931  .factors[0] = TX_FACTOR_ANY,
932  .nb_factors = 1,
933  .min_len = 2,
934  .max_len = 1024,
937  .prio = FF_TX_PRIO_MIN/2,
938 };
939 
940 static const FFTXCodelet TX_NAME(ff_tx_fft_naive_def) = {
941  .name = TX_NAME_STR("fft_naive"),
942  .function = TX_NAME(ff_tx_fft_naive),
943  .type = TX_TYPE(FFT),
945  .factors[0] = TX_FACTOR_ANY,
946  .nb_factors = 1,
947  .min_len = 2,
948  .max_len = TX_LEN_UNLIMITED,
949  .init = NULL,
950  .cpu_flags = FF_TX_CPU_FLAGS_ALL,
951  .prio = FF_TX_PRIO_MIN,
952 };
953 
955  const FFTXCodelet *cd,
956  uint64_t flags,
958  int len, int inv,
959  const void *scale)
960 {
961  int ret, *tmp, ps = flags & FF_TX_PRESHUFFLE;
962  FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
963  size_t extra_tmp_len = 0;
964  int len_list[TX_MAX_DECOMPOSITIONS];
965 
966  if ((ret = ff_tx_decompose_length(len_list, TX_TYPE(FFT), len, inv)) < 0)
967  return ret;
968 
969  /* Two iterations to test both orderings. */
970  for (int i = 0; i < ret; i++) {
971  int len1 = len_list[i];
972  int len2 = len / len1;
973 
974  /* Our ptwo transforms don't support striding the output. */
975  if (len2 & (len2 - 1))
976  FFSWAP(int, len1, len2);
977 
979 
980  /* First transform */
981  sub_opts.map_dir = FF_TX_MAP_GATHER;
982  flags &= ~AV_TX_INPLACE;
984  flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
985  ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
986  len1, inv, scale);
987 
988  if (ret == AVERROR(ENOMEM)) {
989  return ret;
990  } else if (ret < 0) { /* Try again without a preshuffle flag */
992  ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
993  len1, inv, scale);
994  if (ret == AVERROR(ENOMEM))
995  return ret;
996  else if (ret < 0)
997  continue;
998  }
999 
1000  /* Second transform. */
1001  sub_opts.map_dir = FF_TX_MAP_SCATTER;
1003 retry:
1005  flags |= AV_TX_INPLACE;
1006  ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1007  len2, inv, scale);
1008 
1009  if (ret == AVERROR(ENOMEM)) {
1010  return ret;
1011  } else if (ret < 0) { /* Try again with an out-of-place transform */
1013  flags &= ~AV_TX_INPLACE;
1014  ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1015  len2, inv, scale);
1016  if (ret == AVERROR(ENOMEM)) {
1017  return ret;
1018  } else if (ret < 0) {
1019  if (flags & FF_TX_PRESHUFFLE) { /* Retry again without a preshuf flag */
1020  flags &= ~FF_TX_PRESHUFFLE;
1021  goto retry;
1022  } else {
1023  continue;
1024  }
1025  }
1026  }
1027 
1028  /* Success */
1029  break;
1030  }
1031 
1032  /* If nothing was sucessful, error out */
1033  if (ret < 0)
1034  return ret;
1035 
1036  /* Generate PFA map */
1037  if ((ret = ff_tx_gen_compound_mapping(s, opts, 0,
1038  s->sub[0].len, s->sub[1].len)))
1039  return ret;
1040 
1041  if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1042  return AVERROR(ENOMEM);
1043 
1044  /* Flatten input map */
1045  tmp = (int *)s->tmp;
1046  for (int k = 0; k < len; k += s->sub[0].len) {
1047  memcpy(tmp, &s->map[k], s->sub[0].len*sizeof(*tmp));
1048  for (int i = 0; i < s->sub[0].len; i++)
1049  s->map[k + i] = tmp[s->sub[0].map[i]];
1050  }
1051 
1052  /* Only allocate extra temporary memory if we need it */
1053  if (!(s->sub[1].flags & AV_TX_INPLACE))
1054  extra_tmp_len = len;
1055  else if (!ps)
1056  extra_tmp_len = s->sub[0].len;
1057 
1058  if (extra_tmp_len && !(s->exp = av_malloc(extra_tmp_len*sizeof(*s->exp))))
1059  return AVERROR(ENOMEM);
1060 
1061  return 0;
1062 }
1063 
1064 static void TX_NAME(ff_tx_fft_pfa)(AVTXContext *s, void *_out,
1065  void *_in, ptrdiff_t stride)
1066 {
1067  const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
1068  const int *in_map = s->map, *out_map = in_map + l;
1069  const int *sub_map = s->sub[1].map;
1070  TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp;
1071  TXComplex *in = _in, *out = _out;
1072 
1073  stride /= sizeof(*out);
1074 
1075  for (int i = 0; i < m; i++) {
1076  for (int j = 0; j < n; j++)
1077  s->exp[j] = in[in_map[i*n + j]];
1078  s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], s->exp, m*sizeof(TXComplex));
1079  }
1080 
1081  for (int i = 0; i < n; i++)
1082  s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));
1083 
1084  for (int i = 0; i < l; i++)
1085  out[i*stride] = tmp1[out_map[i]];
1086 }
1087 
1088 static void TX_NAME(ff_tx_fft_pfa_ns)(AVTXContext *s, void *_out,
1089  void *_in, ptrdiff_t stride)
1090 {
1091  const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
1092  const int *in_map = s->map, *out_map = in_map + l;
1093  const int *sub_map = s->sub[1].map;
1094  TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp;
1095  TXComplex *in = _in, *out = _out;
1096 
1097  stride /= sizeof(*out);
1098 
1099  for (int i = 0; i < m; i++)
1100  s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], &in[i*n], m*sizeof(TXComplex));
1101 
1102  for (int i = 0; i < n; i++)
1103  s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));
1104 
1105  for (int i = 0; i < l; i++)
1106  out[i*stride] = tmp1[out_map[i]];
1107 }
1108 
/* Codelet descriptor: general PFA FFT (gathering variant). */
static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_def) = {
    .name       = TX_NAME_STR("fft_pfa"),
    .function   = TX_NAME(ff_tx_fft_pfa),
    .type       = TX_TYPE(FFT),
    .factors    = { 7, 5, 3, 2, TX_FACTOR_ANY },
    .nb_factors = 2,
    .min_len    = 2*3,
    .max_len    = TX_LEN_UNLIMITED,
    .init       = TX_NAME(ff_tx_fft_pfa_init),
    .prio       = FF_TX_PRIO_BASE,
};
1122 
/* Codelet descriptor: PFA FFT, pre-shuffled input variant. */
static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_ns_def) = {
    .name       = TX_NAME_STR("fft_pfa_ns"),
    .function   = TX_NAME(ff_tx_fft_pfa_ns),
    .type       = TX_TYPE(FFT),
    .factors    = { 7, 5, 3, 2, TX_FACTOR_ANY },
    .nb_factors = 2,
    .min_len    = 2*3,
    .max_len    = TX_LEN_UNLIMITED,
    .init       = TX_NAME(ff_tx_fft_pfa_init),
    .prio       = FF_TX_PRIO_BASE,
};
1137 
                                          const FFTXCodelet *cd,
                                          uint64_t flags,
                                          int len, int inv,
                                          const void *scale)
{
    /* Init for the naive MDCT codelets: just record the user-supplied
     * scale in both double and float precision. */
    s->scale_d = *((SCALE_TYPE *)scale);
    s->scale_f = s->scale_d;
    return 0;
}
1149 
1150 static void TX_NAME(ff_tx_mdct_naive_fwd)(AVTXContext *s, void *_dst,
1151  void *_src, ptrdiff_t stride)
1152 {
1153  TXSample *src = _src;
1154  TXSample *dst = _dst;
1155  double scale = s->scale_d;
1156  int len = s->len;
1157  const double phase = M_PI/(4.0*len);
1158 
1159  stride /= sizeof(*dst);
1160 
1161  for (int i = 0; i < len; i++) {
1162  double sum = 0.0;
1163  for (int j = 0; j < len*2; j++) {
1164  int a = (2*j + 1 + len) * (2*i + 1);
1165  sum += UNSCALE(src[j]) * cos(a * phase);
1166  }
1167  dst[i*stride] = RESCALE(sum*scale);
1168  }
1169 }
1170 
1171 static void TX_NAME(ff_tx_mdct_naive_inv)(AVTXContext *s, void *_dst,
1172  void *_src, ptrdiff_t stride)
1173 {
1174  TXSample *src = _src;
1175  TXSample *dst = _dst;
1176  double scale = s->scale_d;
1177  int len = s->len >> 1;
1178  int len2 = len*2;
1179  const double phase = M_PI/(4.0*len2);
1180 
1181  stride /= sizeof(*src);
1182 
1183  for (int i = 0; i < len; i++) {
1184  double sum_d = 0.0;
1185  double sum_u = 0.0;
1186  double i_d = phase * (4*len - 2*i - 1);
1187  double i_u = phase * (3*len2 + 2*i + 1);
1188  for (int j = 0; j < len2; j++) {
1189  double a = (2 * j + 1);
1190  double a_d = cos(a * i_d);
1191  double a_u = cos(a * i_u);
1192  double val = UNSCALE(src[j*stride]);
1193  sum_d += a_d * val;
1194  sum_u += a_u * val;
1195  }
1196  dst[i + 0] = RESCALE( sum_d*scale);
1197  dst[i + len] = RESCALE(-sum_u*scale);
1198  }
1199 }
1200 
/* Codelet descriptor: naive forward MDCT fallback, minimum priority. */
static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_fwd_def) = {
    .name       = TX_NAME_STR("mdct_naive_fwd"),
    .function   = TX_NAME(ff_tx_mdct_naive_fwd),
    .type       = TX_TYPE(MDCT),
    .factors    = { 2, TX_FACTOR_ANY }, /* MDCTs need an even length */
    .nb_factors = 2,
    .min_len    = 2,
    .max_len    = TX_LEN_UNLIMITED,
    .init       = TX_NAME(ff_tx_mdct_naive_init),
    .prio       = FF_TX_PRIO_MIN,
};
1214 
/* Codelet descriptor: naive inverse MDCT fallback, minimum priority. */
static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_inv_def) = {
    .name       = TX_NAME_STR("mdct_naive_inv"),
    .function   = TX_NAME(ff_tx_mdct_naive_inv),
    .type       = TX_TYPE(MDCT),
    .factors    = { 2, TX_FACTOR_ANY },
    .nb_factors = 2,
    .min_len    = 2,
    .max_len    = TX_LEN_UNLIMITED,
    .init       = TX_NAME(ff_tx_mdct_naive_init),
    .prio       = FF_TX_PRIO_MIN,
};
1228 
                                    const FFTXCodelet *cd,
                                    uint64_t flags,
                                    int len, int inv,
                                    const void *scale)
{
    /* Init for the FFT-based MDCT codelets: set up a half-length FFT
     * sub-transform, a len/2 index map, and the twiddle table. */
    int ret;
    FFTXCodeletOptions sub_opts = {
    };

    s->scale_d = *((SCALE_TYPE *)scale);
    s->scale_f = s->scale_d;

    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
    flags |=  AV_TX_INPLACE;      /* in-place */
    flags |=  FF_TX_PRESHUFFLE;   /* First try with an in-place transform */

    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
                                inv, scale))) {
        flags &= ~FF_TX_PRESHUFFLE; /* Now try with a generic FFT */
        if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
                                    inv, scale)))
            return ret;
    }

    s->map = av_malloc((len >> 1)*sizeof(*s->map));
    if (!s->map)
        return AVERROR(ENOMEM);

    /* If we need to preshuffle copy the map from the subcontext */
    if (s->sub[0].flags & FF_TX_PRESHUFFLE) {
        memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map));
    } else {
        for (int i = 0; i < len >> 1; i++)
            s->map[i] = i;
    }

    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
        return ret;

    /* Saves a multiply in a hot path. */
    if (inv)
        for (int i = 0; i < (s->len >> 1); i++)
            s->map[i] <<= 1;

    return 0;
}
1278 
/* Forward MDCT via a half-length complex FFT: fold the 2*len input samples
 * into len/2 complex values with pre-twiddle, FFT in-place, then post-twiddle
 * while writing the interleaved strided output. Note: z aliases dst. */
static void TX_NAME(ff_tx_mdct_fwd)(AVTXContext *s, void *_dst, void *_src,
                                    ptrdiff_t stride)
{
    TXSample *src = _src, *dst = _dst;
    TXComplex *exp = s->exp, tmp, *z = _dst;
    const int len2 = s->len >> 1;
    const int len4 = s->len >> 2;
    const int len3 = len2 * 3;
    const int *sub_map = s->map;

    stride /= sizeof(*dst);

    for (int i = 0; i < len2; i++) { /* Folding and pre-reindexing */
        const int k = 2*i;
        const int idx = sub_map[i]; /* FFT input permutation baked in here */
        if (k < len2) {
            tmp.re = FOLD(-src[ len2 + k],  src[1*len2 - 1 - k]);
            tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);
        } else {
            tmp.re = FOLD(-src[ len2 + k], -src[5*len2 - 1 - k]);
            tmp.im = FOLD( src[-len2 + k], -src[1*len3 - 1 - k]);
        }
        CMUL(z[idx].im, z[idx].re, tmp.re, tmp.im, exp[i].re, exp[i].im);
    }

    /* In-place half-length FFT */
    s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));

    /* Post-twiddle: each iteration consumes the symmetric pair (i0, i1)
     * and writes both before the next pair is read */
    for (int i = 0; i < len4; i++) {
        const int i0 = len4 + i, i1 = len4 - i - 1;
        TXComplex src1 = { z[i1].re, z[i1].im };
        TXComplex src0 = { z[i0].re, z[i0].im };

        CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im,
             exp[i0].im, exp[i0].re);
        CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im,
             exp[i1].im, exp[i1].re);
    }
}
1317 
/* Inverse MDCT via a half-length complex FFT: pre-twiddle the strided input
 * (read from both ends), FFT in-place, then post-twiddle symmetric pairs.
 * The map was pre-doubled in init, hence the direct k*stride indexing. */
static void TX_NAME(ff_tx_mdct_inv)(AVTXContext *s, void *_dst, void *_src,
                                    ptrdiff_t stride)
{
    TXComplex *z = _dst, *exp = s->exp;
    const TXSample *src = _src, *in1, *in2;
    const int len2 = s->len >> 1;
    const int len4 = s->len >> 2;
    const int *sub_map = s->map;

    stride /= sizeof(*src);
    in1 = src;                           /* walks forward from the start */
    in2 = src + ((len2*2) - 1) * stride; /* walks backward from the end */

    for (int i = 0; i < len2; i++) {
        int k = sub_map[i];
        TXComplex tmp = { in2[-k*stride], in1[k*stride] };
        CMUL3(z[i], tmp, exp[i]);
    }

    s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));

    /* Second half of the exp table holds the post-twiddle factors */
    exp += len2;
    for (int i = 0; i < len4; i++) {
        const int i0 = len4 + i, i1 = len4 - i - 1;
        TXComplex src1 = { z[i1].im, z[i1].re };
        TXComplex src0 = { z[i0].im, z[i0].re };

        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);
    }
}
1349 
/* Codelet descriptor: FFT-based forward MDCT. */
static const FFTXCodelet TX_NAME(ff_tx_mdct_fwd_def) = {
    .name       = TX_NAME_STR("mdct_fwd"),
    .function   = TX_NAME(ff_tx_mdct_fwd),
    .type       = TX_TYPE(MDCT),
    .factors    = { 2, TX_FACTOR_ANY },
    .nb_factors = 2,
    .min_len    = 2,
    .max_len    = TX_LEN_UNLIMITED,
    .init       = TX_NAME(ff_tx_mdct_init),
    .prio       = FF_TX_PRIO_BASE,
};
1363 
/* Codelet descriptor: FFT-based inverse MDCT. */
static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_def) = {
    .name       = TX_NAME_STR("mdct_inv"),
    .function   = TX_NAME(ff_tx_mdct_inv),
    .type       = TX_TYPE(MDCT),
    .factors    = { 2, TX_FACTOR_ANY },
    .nb_factors = 2,
    .min_len    = 2,
    .max_len    = TX_LEN_UNLIMITED,
    .init       = TX_NAME(ff_tx_mdct_init),
    .prio       = FF_TX_PRIO_BASE,
};
1377 
                                             const FFTXCodelet *cd,
                                             uint64_t flags,
                                             int len, int inv,
                                             const void *scale)
{
    /* Init for the full-window inverse MDCT: wraps a regular (half-output)
     * inverse MDCT sub-transform; the mirroring happens in the codelet. */
    int ret;

    s->scale_d = *((SCALE_TYPE *)scale);
    s->scale_f = s->scale_d;

    /* Strip the flag so the sub-transform doesn't recurse into us */
    flags &= ~AV_TX_FULL_IMDCT;

    if ((ret = ff_tx_init_subtx(s, TX_TYPE(MDCT), flags, NULL, len, 1, scale)))
        return ret;

    return 0;
}
1397 
1398 static void TX_NAME(ff_tx_mdct_inv_full)(AVTXContext *s, void *_dst,
1399  void *_src, ptrdiff_t stride)
1400 {
1401  int len = s->len << 1;
1402  int len2 = len >> 1;
1403  int len4 = len >> 2;
1404  TXSample *dst = _dst;
1405 
1406  s->fn[0](&s->sub[0], dst + len4, _src, stride);
1407 
1408  stride /= sizeof(*dst);
1409 
1410  for (int i = 0; i < len4; i++) {
1411  dst[ i*stride] = -dst[(len2 - i - 1)*stride];
1412  dst[(len - i - 1)*stride] = dst[(len2 + i + 0)*stride];
1413  }
1414 }
1415 
/* Codelet descriptor: full-window inverse MDCT wrapper. */
static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_full_def) = {
    .name       = TX_NAME_STR("mdct_inv_full"),
    .function   = TX_NAME(ff_tx_mdct_inv_full),
    .type       = TX_TYPE(MDCT),
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE |
    .factors    = { 2, TX_FACTOR_ANY },
    .nb_factors = 2,
    .min_len    = 2,
    .max_len    = TX_LEN_UNLIMITED,
    .prio       = FF_TX_PRIO_BASE,
};
1430 
                                        const FFTXCodelet *cd,
                                        uint64_t flags,
                                        int len, int inv,
                                        const void *scale)
{
    /* Init for the compound (N x M) MDCT codelets: a power-of-two FFT
     * sub-transform of length len/(2*N), a compound map, and twiddles. */
    int ret, sub_len;
    FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER };

    len >>= 1;
    sub_len = len / cd->factors[0];

    s->scale_d = *((SCALE_TYPE *)scale);
    s->scale_f = s->scale_d;

    flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
    flags |=  AV_TX_INPLACE;      /* in-place */
    flags |=  FF_TX_PRESHUFFLE;   /* This function handles the permute step */

    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
                                sub_len, inv, scale)))
        return ret;

    if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len)))
        return ret;

    /* Our 15-point transform is also a compound one, so embed its input map */
    if (cd->factors[0] == 15)
        TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5);

    if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
        return ret;

    /* Saves multiplies in loops. */
    for (int i = 0; i < len; i++)
        s->map[i] <<= 1;

    if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
        return AVERROR(ENOMEM);

    /* Ensure the factor tables (e.g. 3/5/7/9-point) are generated */
    TX_TAB(ff_tx_init_tabs)(len / sub_len);

    return 0;
}
1476 
/* Declares a compound inverse MDCT codelet: an N-point fixed transform
 * (fft##N) combined with an M-point sub-transform via the compound map,
 * with MDCT pre/post twiddling. Instantiated below for N = 3,5,7,9,15. */
#define DECL_COMP_IMDCT(N)                                                     \
static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst,    \
                                                void *_src, ptrdiff_t stride)  \
{                                                                              \
    TXComplex fft##N##in[N];                                                   \
    TXComplex *z = _dst, *exp = s->exp;                                        \
    const TXSample *src = _src, *in1, *in2;                                    \
    const int len4 = s->len >> 2;                                              \
    const int len2 = s->len >> 1;                                              \
    const int m = s->sub->len;                                                 \
    const int *in_map = s->map, *out_map = in_map + N*m;                       \
    const int *sub_map = s->sub->map;                                          \
                                                                               \
    stride /= sizeof(*src); /* To convert it from bytes */                     \
    in1 = src;                                                                 \
    in2 = src + ((N*m*2) - 1) * stride;                                        \
                                                                               \
    for (int i = 0; i < len2; i += N) {                                        \
        for (int j = 0; j < N; j++) {                                          \
            const int k = in_map[j];                                           \
            TXComplex tmp = { in2[-k*stride], in1[k*stride] };                 \
            CMUL3(fft##N##in[j], tmp, exp[j]);                                 \
        }                                                                      \
        fft##N(s->tmp + *(sub_map++), fft##N##in, m);                          \
        exp += N;                                                              \
        in_map += N;                                                           \
    }                                                                          \
                                                                               \
    for (int i = 0; i < N; i++)                                                \
        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex));   \
                                                                               \
    for (int i = 0; i < len4; i++) {                                           \
        const int i0 = len4 + i, i1 = len4 - i - 1;                            \
        const int s0 = out_map[i0], s1 = out_map[i1];                          \
        TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re };                     \
        TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re };                     \
                                                                               \
        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);    \
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);    \
    }                                                                          \
}                                                                              \
                                                                               \
static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_inv_def) = {           \
    .name       = TX_NAME_STR("mdct_pfa_" #N "xM_inv"),                        \
    .function   = TX_NAME(ff_tx_mdct_pfa_##N##xM_inv),                         \
    .type       = TX_TYPE(MDCT),                                               \
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,   \
    .factors    = { N, TX_FACTOR_ANY },                                        \
    .nb_factors = 2,                                                           \
    .min_len    = N*2,                                                         \
    .max_len    = TX_LEN_UNLIMITED,                                            \
    .init       = TX_NAME(ff_tx_mdct_pfa_init),                                \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
    .prio       = FF_TX_PRIO_BASE,                                             \
};
1532 
/* Instantiate the compound inverse MDCT codelets. */
DECL_COMP_IMDCT(3)
DECL_COMP_IMDCT(5)
DECL_COMP_IMDCT(7)
DECL_COMP_IMDCT(9)
DECL_COMP_IMDCT(15)
1538 
/* Declares a compound forward MDCT codelet, the mirror image of
 * DECL_COMP_IMDCT: fold + pre-twiddle, N-point fixed transform per chunk,
 * M-point sub-transform, post-twiddle. Instantiated for N = 3,5,7,9,15. */
#define DECL_COMP_MDCT(N)                                                      \
static void TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd)(AVTXContext *s, void *_dst,    \
                                                void *_src, ptrdiff_t stride)  \
{                                                                              \
    TXComplex fft##N##in[N];                                                   \
    TXSample *src = _src, *dst = _dst;                                         \
    TXComplex *exp = s->exp, tmp;                                              \
    const int m = s->sub->len;                                                 \
    const int len4 = N*m;                                                      \
    const int len3 = len4 * 3;                                                 \
    const int len8 = s->len >> 2;                                              \
    const int *in_map = s->map, *out_map = in_map + N*m;                       \
    const int *sub_map = s->sub->map;                                          \
                                                                               \
    stride /= sizeof(*dst);                                                    \
                                                                               \
    for (int i = 0; i < m; i++) { /* Folding and pre-reindexing */             \
        for (int j = 0; j < N; j++) {                                          \
            const int k = in_map[i*N + j];                                     \
            if (k < len4) {                                                    \
                tmp.re = FOLD(-src[ len4 + k],  src[1*len4 - 1 - k]);          \
                tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);          \
            } else {                                                           \
                tmp.re = FOLD(-src[ len4 + k], -src[5*len4 - 1 - k]);          \
                tmp.im = FOLD( src[-len4 + k], -src[1*len3 - 1 - k]);          \
            }                                                                  \
            CMUL(fft##N##in[j].im, fft##N##in[j].re, tmp.re, tmp.im,           \
                 exp[k >> 1].re, exp[k >> 1].im);                              \
        }                                                                      \
        fft##N(s->tmp + sub_map[i], fft##N##in, m);                            \
    }                                                                          \
                                                                               \
    for (int i = 0; i < N; i++)                                                \
        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex));   \
                                                                               \
    for (int i = 0; i < len8; i++) {                                           \
        const int i0 = len8 + i, i1 = len8 - i - 1;                            \
        const int s0 = out_map[i0], s1 = out_map[i1];                          \
        TXComplex src1 = { s->tmp[s1].re, s->tmp[s1].im };                     \
        TXComplex src0 = { s->tmp[s0].re, s->tmp[s0].im };                     \
                                                                               \
        CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im,    \
             exp[i0].im, exp[i0].re);                                          \
        CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im,    \
             exp[i1].im, exp[i1].re);                                          \
    }                                                                          \
}                                                                              \
                                                                               \
static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd_def) = {           \
    .name       = TX_NAME_STR("mdct_pfa_" #N "xM_fwd"),                        \
    .function   = TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd),                         \
    .type       = TX_TYPE(MDCT),                                               \
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,   \
    .factors    = { N, TX_FACTOR_ANY },                                        \
    .nb_factors = 2,                                                           \
    .min_len    = N*2,                                                         \
    .max_len    = TX_LEN_UNLIMITED,                                            \
    .init       = TX_NAME(ff_tx_mdct_pfa_init),                                \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
    .prio       = FF_TX_PRIO_BASE,                                             \
};
1600 
/* Instantiate the compound forward MDCT codelets. */
DECL_COMP_MDCT(3)
DECL_COMP_MDCT(5)
DECL_COMP_MDCT(7)
DECL_COMP_MDCT(9)
DECL_COMP_MDCT(15)
1606 
                                   const FFTXCodelet *cd,
                                   uint64_t flags,
                                   int len, int inv,
                                   const void *scale)
{
    /* Init for the RDFT codelets: a half-length FFT sub-transform plus a
     * table in s->exp laid out as 8 scale/sign factors followed by the
     * cos and sin twiddle halves. */
    int ret;
    double f, m;
    TXSample *tab;
    uint64_t r2r = flags & AV_TX_REAL_TO_REAL;
    int len4 = FFALIGN(len, 4) / 4;

    s->scale_d = *((SCALE_TYPE *)scale);
    s->scale_f = s->scale_d;


    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, NULL, len >> 1, inv, scale)))
        return ret;

    if (!(s->exp = av_mallocz((8 + 2*len4)*sizeof(*s->exp))))
        return AVERROR(ENOMEM);

    tab = (TXSample *)s->exp;

    f = 2*M_PI/len;

    /* Inverse transforms fold the 2x FFT gain into the factors */
    m = (inv ? 2*s->scale_d : s->scale_d);

    *tab++ = RESCALE((inv ? 0.5 : 1.0) * m);
    *tab++ = RESCALE(inv ? 0.5*m : 1.0*m);
    *tab++ = RESCALE( m);
    *tab++ = RESCALE(-m);

    *tab++ = RESCALE( (0.5 - 0.0) * m);
    if (r2r)
        *tab++ = 1 / s->scale_f;
    else
        *tab++ = RESCALE( (0.0 - 0.5) * m);
    *tab++ = RESCALE( (0.5 - inv) * m);
    *tab++ = RESCALE(-(0.5 - inv) * m);

    for (int i = 0; i < len4; i++)
        *tab++ = RESCALE(cos(i*f));

    tab = ((TXSample *)s->exp) + len4 + 8;

    /* Sine half; sign flipped for the forward transform */
    for (int i = 0; i < len4; i++)
        *tab++ = RESCALE(cos(((len - i*4)/4.0)*f)) * (inv ? 1 : -1);

    return 0;
}
1660 
/* Declares a full-output real DFT codelet on top of a half-length complex
 * FFT (forward: FFT then untangle; inverse: untangle then FFT). */
#define DECL_RDFT(n, inv)                                                      \
static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst,               \
                                     void *_src, ptrdiff_t stride)             \
{                                                                              \
    const int len2 = s->len >> 1;                                              \
    const int len4 = s->len >> 2;                                              \
    const TXSample *fact = (void *)s->exp;                                     \
    const TXSample *tcos = fact + 8;                                           \
    const TXSample *tsin = tcos + len4;                                        \
    TXComplex *data = inv ? _src : _dst;                                       \
    TXComplex t[3];                                                            \
                                                                               \
    if (!inv)                                                                  \
        s->fn[0](&s->sub[0], data, _src, sizeof(TXComplex));                   \
    else                                                                       \
        data[0].im = data[len2].re;                                            \
                                                                               \
    /* The DC value's both components are real, but we need to change them \
     * into complex values. Also, the middle of the array is special-cased. \
     * These operations can be done before or after the loop. */               \
    t[0].re = data[0].re;                                                      \
    data[0].re = t[0].re + data[0].im;                                         \
    data[0].im = t[0].re - data[0].im;                                         \
    data[ 0].re = MULT(fact[0], data[ 0].re);                                  \
    data[ 0].im = MULT(fact[1], data[ 0].im);                                  \
    data[len4].re = MULT(fact[2], data[len4].re);                              \
    data[len4].im = MULT(fact[3], data[len4].im);                              \
                                                                               \
    for (int i = 1; i < len4; i++) {                                           \
        /* Separate even and odd FFTs */                                       \
        t[0].re = MULT(fact[4], (data[i].re + data[len2 - i].re));             \
        t[0].im = MULT(fact[5], (data[i].im - data[len2 - i].im));             \
        t[1].re = MULT(fact[6], (data[i].im + data[len2 - i].im));             \
        t[1].im = MULT(fact[7], (data[i].re - data[len2 - i].re));             \
                                                                               \
        /* Apply twiddle factors to the odd FFT and add to the even FFT */     \
        CMUL(t[2].re, t[2].im, t[1].re, t[1].im, tcos[i], tsin[i]);            \
                                                                               \
        data[ i].re = t[0].re + t[2].re;                                       \
        data[ i].im = t[2].im - t[0].im;                                       \
        data[len2 - i].re = t[0].re - t[2].re;                                 \
        data[len2 - i].im = t[2].im + t[0].im;                                 \
    }                                                                          \
                                                                               \
    if (inv) {                                                                 \
        s->fn[0](&s->sub[0], _dst, data, sizeof(TXComplex));                   \
    } else {                                                                   \
        /* Move [0].im to the last position, as convention requires */         \
        data[len2].re = data[0].im;                                            \
        data[ 0].im = data[len2].im = 0;                                       \
    }                                                                          \
}                                                                              \
                                                                               \
static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = {                   \
    .name       = TX_NAME_STR("rdft_" #n),                                     \
    .function   = TX_NAME(ff_tx_rdft_ ##n),                                    \
    .type       = TX_TYPE(RDFT),                                               \
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |       \
                  (inv ? FF_TX_INVERSE_ONLY : FF_TX_FORWARD_ONLY),             \
    .factors    = { 4, TX_FACTOR_ANY },                                        \
    .nb_factors = 2,                                                           \
    .min_len    = 4,                                                           \
    .max_len    = TX_LEN_UNLIMITED,                                            \
    .init       = TX_NAME(ff_tx_rdft_init),                                    \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
    .prio       = FF_TX_PRIO_BASE,                                             \
};
1728 
/* Forward real-to-complex RDFT instantiation. */
DECL_RDFT(r2c, 0)
1731 
/* Declares a half-output (real-to-real or real-to-imaginary) forward RDFT
 * codelet; mod2 selects the variant for lengths that are 2 mod 4, which
 * needs the extra middle-bin handling. */
#define DECL_RDFT_HALF(n, mode, mod2)                                          \
static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst,               \
                                     void *_src, ptrdiff_t stride)             \
{                                                                              \
    const int len = s->len;                                                    \
    const int len2 = len >> 1;                                                 \
    const int len4 = len >> 2;                                                 \
    const int aligned_len4 = FFALIGN(len, 4)/4;                                \
    const TXSample *fact = (void *)s->exp;                                     \
    const TXSample *tcos = fact + 8;                                           \
    const TXSample *tsin = tcos + aligned_len4;                                \
    TXComplex *data = _dst;                                                    \
    TXSample *out = _dst; /* Half-complex is forward-only */                   \
    TXSample tmp_dc;                                                           \
    av_unused TXSample tmp_mid;                                                \
    TXSample tmp[4];                                                           \
    TXComplex sf, sl;                                                          \
                                                                               \
    s->fn[0](&s->sub[0], _dst, _src, sizeof(TXComplex));                       \
                                                                               \
    tmp_dc = data[0].re;                                                       \
    data[ 0].re = tmp_dc + data[0].im;                                         \
    tmp_dc = tmp_dc - data[0].im;                                              \
                                                                               \
    data[ 0].re = MULT(fact[0], data[ 0].re);                                  \
    tmp_dc      = MULT(fact[1],      tmp_dc);                                  \
    data[len4].re = MULT(fact[2], data[len4].re);                              \
                                                                               \
    if (!mod2) {                                                               \
        data[len4].im = MULT(fact[3], data[len4].im);                          \
    } else {                                                                   \
        sf = data[len4];                                                       \
        sl = data[len4 + 1];                                                   \
        if (mode == AV_TX_REAL_TO_REAL)                                        \
            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
        else                                                                   \
            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
                                                                               \
        if (mode == AV_TX_REAL_TO_REAL) {                                      \
            tmp[3] = tmp[1]*tcos[len4] - tmp[2]*tsin[len4];                    \
            tmp_mid = (tmp[0] - tmp[3]);                                       \
        } else {                                                               \
            tmp[3] = tmp[1]*tsin[len4] + tmp[2]*tcos[len4];                    \
            tmp_mid = (tmp[0] + tmp[3]);                                       \
        }                                                                      \
    }                                                                          \
                                                                               \
    /* NOTE: unrolling this breaks non-mod8 lengths */                         \
    for (int i = 1; i <= len4; i++) {                                          \
        TXSample tmp[4];                                                       \
        TXComplex sf = data[i];                                                \
        TXComplex sl = data[len2 - i];                                         \
                                                                               \
        if (mode == AV_TX_REAL_TO_REAL)                                        \
            tmp[0] = MULT(fact[4], (sf.re + sl.re));                           \
        else                                                                   \
            tmp[0] = MULT(fact[5], (sf.im - sl.im));                           \
                                                                               \
        tmp[1] = MULT(fact[6], (sf.im + sl.im));                               \
        tmp[2] = MULT(fact[7], (sf.re - sl.re));                               \
                                                                               \
        if (mode == AV_TX_REAL_TO_REAL) {                                      \
            tmp[3] = tmp[1]*tcos[i] - tmp[2]*tsin[i];                          \
            out[i] = (tmp[0] + tmp[3]);                                        \
            out[len - i] = (tmp[0] - tmp[3]);                                  \
        } else {                                                               \
            tmp[3] = tmp[1]*tsin[i] + tmp[2]*tcos[i];                          \
            out[i - 1] = (tmp[3] - tmp[0]);                                    \
            out[len - i - 1] = (tmp[0] + tmp[3]);                              \
        }                                                                      \
    }                                                                          \
                                                                               \
    for (int i = 1; i < (len4 + (mode == AV_TX_REAL_TO_IMAGINARY)); i++)       \
        out[len2 - i] = out[len - i];                                          \
                                                                               \
    if (mode == AV_TX_REAL_TO_REAL) {                                          \
        out[len2] = tmp_dc;                                                    \
        if (mod2)                                                              \
            out[len4 + 1] = tmp_mid * fact[5];                                 \
    } else if (mod2) {                                                         \
        out[len4] = tmp_mid;                                                   \
    }                                                                          \
}                                                                              \
                                                                               \
static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = {                   \
    .name       = TX_NAME_STR("rdft_" #n),                                     \
    .function   = TX_NAME(ff_tx_rdft_ ##n),                                    \
    .type       = TX_TYPE(RDFT),                                               \
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | mode |                     \
                  FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,                     \
    .factors    = { 2 + 2*(!mod2), TX_FACTOR_ANY },                            \
    .nb_factors = 2,                                                           \
    .min_len    = 2 + 2*(!mod2),                                               \
    .max_len    = TX_LEN_UNLIMITED,                                            \
    .init       = TX_NAME(ff_tx_rdft_init),                                    \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
    .prio       = FF_TX_PRIO_BASE,                                             \
};
1832 
/* Half-output RDFT instantiation: real-to-real, length 2 mod 4. */
DECL_RDFT_HALF(r2r_mod2, AV_TX_REAL_TO_REAL, 1)
1837 
                                  const FFTXCodelet *cd,
                                  uint64_t flags,
                                  int len, int inv,
                                  const void *scale)
{
    /* Init for the DCT-II/III codelets: an RDFT sub-transform plus a table
     * of cosine factors (first len entries) and per-bin scale factors
     * (next len/2 entries) in s->exp. */
    int ret;
    double freq;
    TXSample *tab;
    SCALE_TYPE rsc = *((SCALE_TYPE *)scale);

    /* Inverse (DCT-III) runs at double length with halved scale */
    if (inv) {
        len *= 2;
        s->len *= 2;
        rsc *= 0.5;
    }

    if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL, len, inv, &rsc)))
        return ret;

    s->exp = av_malloc((len/2)*3*sizeof(TXSample));
    if (!s->exp)
        return AVERROR(ENOMEM);

    tab = (TXSample *)s->exp;

    freq = M_PI/(len*2);

    for (int i = 0; i < len; i++)
        tab[i] = RESCALE(cos(i*freq)*(!inv + 1));

    if (inv) {
        for (int i = 0; i < len/2; i++)
            tab[len + i] = RESCALE(0.5 / sin((2*i + 1)*freq));
    } else {
        for (int i = 0; i < len/2; i++)
            tab[len + i] = RESCALE(cos((len - 2*i - 1)*freq));
    }

    return 0;
1880 
/* DCT-II via a forward RDFT: butterfly/scale the input in-place, run the
 * sub-transform, then recombine the spectrum with a reverse sweep. The
 * TX_INT32 paths use 64-bit intermediates with round-to-nearest (Q31). */
static void TX_NAME(ff_tx_dctII)(AVTXContext *s, void *_dst,
                                 void *_src, ptrdiff_t stride)
{
    TXSample *dst = _dst;
    TXSample *src = _src;
    const int len = s->len;
    const int len2 = len >> 1;
    const TXSample *exp = (void *)s->exp;
    TXSample next;
#ifdef TX_INT32
    int64_t tmp1, tmp2;
#else
    TXSample tmp1, tmp2;
#endif

    for (int i = 0; i < len2; i++) {
        TXSample in1 = src[i];
        TXSample in2 = src[len - i - 1];
        TXSample s = exp[len + i]; /* per-bin factor; shadows the context */

#ifdef TX_INT32
        tmp1 = in1 + in2;
        tmp2 = in1 - in2;

        tmp1 >>= 1;
        tmp2 *= s;

        tmp2 = (tmp2 + 0x40000000) >> 31; /* round-to-nearest Q31 */
#else
        tmp1 = (in1 + in2)*0.5;
        tmp2 = (in1 - in2)*s;
#endif

        src[i] = tmp1 + tmp2;
        src[len - i - 1] = tmp1 - tmp2;
    }

    s->fn[0](&s->sub[0], dst, src, sizeof(TXComplex));

    next = dst[len];

    /* Reverse sweep: odd outputs are a running sum of the twiddled bins */
    for (int i = len - 2; i > 0; i -= 2) {
        TXSample tmp;

        CMUL(tmp, dst[i], exp[len - i], exp[i], dst[i + 0], dst[i + 1]);

        dst[i + 1] = next;

        next += tmp;
    }

#ifdef TX_INT32
    tmp1 = ((int64_t)exp[0]) * ((int64_t)dst[0]);
    dst[0] = (tmp1 + 0x40000000) >> 31;
#else
    dst[0] = exp[0] * dst[0];
#endif
    dst[1] = next;
1940 
1941 static void TX_NAME(ff_tx_dctIII)(AVTXContext *s, void *_dst,
1942  void *_src, ptrdiff_t stride)
1943 {
1944  TXSample *dst = _dst;
1945  TXSample *src = _src;
1946  const int len = s->len;
1947  const int len2 = len >> 1;
1948  const TXSample *exp = (void *)s->exp;
1949 #ifdef TX_INT32
1950  int64_t tmp1, tmp2 = src[len - 1];
1951  tmp2 = (2*tmp2 + 0x40000000) >> 31;
1952 #else
1953  TXSample tmp1, tmp2 = 2*src[len - 1];
1954 #endif
1955 
1956  src[len] = tmp2;
1957 
1958  for (int i = len - 2; i >= 2; i -= 2) {
1959  TXSample val1 = src[i - 0];
1960  TXSample val2 = src[i - 1] - src[i + 1];
1961 
1962  CMUL(src[i + 1], src[i], exp[len - i], exp[i], val1, val2);
1963  }
1964 
1965  s->fn[0](&s->sub[0], dst, src, sizeof(float));
1966 
1967  for (int i = 0; i < len2; i++) {
1968  TXSample in1 = dst[i];
1969  TXSample in2 = dst[len - i - 1];
1970  TXSample c = exp[len + i];
1971 
1972  tmp1 = in1 + in2;
1973  tmp2 = in1 - in2;
1974  tmp2 *= c;
1975 #ifdef TX_INT32
1976  tmp2 = (tmp2 + 0x40000000) >> 31;
1977 #endif
1978 
1979  dst[i] = tmp1 + tmp2;
1980  dst[len - i - 1] = tmp1 - tmp2;
1981  }
1982 }
1983 
/* Codelet descriptor: DCT-II via RDFT. */
static const FFTXCodelet TX_NAME(ff_tx_dctII_def) = {
    .name       = TX_NAME_STR("dctII"),
    .function   = TX_NAME(ff_tx_dctII),
    .type       = TX_TYPE(DCT),
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE |
    .factors    = { 2, TX_FACTOR_ANY },
    .min_len    = 2,
    .max_len    = TX_LEN_UNLIMITED,
    .init       = TX_NAME(ff_tx_dct_init),
    .prio       = FF_TX_PRIO_BASE,
};
1997 
/* Codelet descriptor: DCT-III (inverse DCT-II) via RDFT. */
static const FFTXCodelet TX_NAME(ff_tx_dctIII_def) = {
    .name       = TX_NAME_STR("dctIII"),
    .function   = TX_NAME(ff_tx_dctIII),
    .type       = TX_TYPE(DCT),
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE |
    .factors    = { 2, TX_FACTOR_ANY },
    .min_len    = 2,
    .max_len    = TX_LEN_UNLIMITED,
    .init       = TX_NAME(ff_tx_dct_init),
    .prio       = FF_TX_PRIO_BASE,
};
2011 
                                    const FFTXCodelet *cd,
                                    uint64_t flags,
                                    int len, int inv,
                                    const void *scale)
{
    /* Shared init for DCT-I and DST-I: both are computed via a half-complex
     * forward RDFT over a mirrored input built in s->tmp. */
    int ret;
    SCALE_TYPE rsc = *((SCALE_TYPE *)scale);

    if (inv) {
        len *= 2;
        s->len *= 2;
        rsc *= 0.5;
    }

    /* We want a half-complex RDFT */
    flags |= cd->type == TX_TYPE(DCT_I) ? AV_TX_REAL_TO_REAL :

    if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL,
                                (len - 1 + 2*(cd->type == TX_TYPE(DST_I)))*2,
                                0, &rsc)))
        return ret;

    s->tmp = av_mallocz((len + 1)*2*sizeof(TXSample));
    if (!s->tmp)
        return AVERROR(ENOMEM);

    return 0;
2043 
/* DCT-I: mirror the input symmetrically into s->tmp (even extension) and
 * run the half-complex forward RDFT sub-transform over it. */
static void TX_NAME(ff_tx_dctI)(AVTXContext *s, void *_dst,
                                void *_src, ptrdiff_t stride)
{
    TXSample *dst = _dst;
    TXSample *src = _src;
    const int len = s->len - 1;
    TXSample *tmp = (TXSample *)s->tmp;

    stride /= sizeof(TXSample);

    for (int i = 0; i < len; i++)
        tmp[i] = tmp[2*len - i] = src[i * stride];

    tmp[len] = src[len * stride]; /* Middle */

    s->fn[0](&s->sub[0], dst, tmp, sizeof(TXSample));
2061 
2062 static void TX_NAME(ff_tx_dstI)(AVTXContext *s, void *_dst,
2063  void *_src, ptrdiff_t stride)
2064 {
2065  TXSample *dst = _dst;
2066  TXSample *src = _src;
2067  const int len = s->len + 1;
2068  TXSample *tmp = (void *)s->tmp;
2069 
2070  stride /= sizeof(TXSample);
2071 
2072  tmp[0] = 0;
2073 
2074  for (int i = 1; i < len; i++) {
2075  TXSample a = src[(i - 1) * stride];
2076  tmp[i] = -a;
2077  tmp[2*len - i] = a;
2078  }
2079 
2080  tmp[len] = 0; /* i == n, Nyquist */
2081 
2082  s->fn[0](&s->sub[0], dst, tmp, sizeof(float));
2083 }
2084 
/* Codelet descriptor for the DCT-I transform; shares ff_tx_dcstI_init
 * with the DST-I codelet below.
 * NOTE(review): the `.flags` line appears to have been lost in
 * extraction — verify against the upstream file. */
 2085 static const FFTXCodelet TX_NAME(ff_tx_dctI_def) = {
 2086  .name = TX_NAME_STR("dctI"),
 2087  .function = TX_NAME(ff_tx_dctI),
 2088  .type = TX_TYPE(DCT_I),
 2090  .factors = { 2, TX_FACTOR_ANY },
 2091  .nb_factors = 2,
 2092  .min_len = 2,
 2093  .max_len = TX_LEN_UNLIMITED,
 2094  .init = TX_NAME(ff_tx_dcstI_init),
 2096  .prio = FF_TX_PRIO_BASE,
 2097 };
2098 
/* Codelet descriptor for the DST-I transform; shares ff_tx_dcstI_init
 * with the DCT-I codelet above.
 * NOTE(review): the `.flags` line appears to have been lost in
 * extraction — verify against the upstream file. */
 2099 static const FFTXCodelet TX_NAME(ff_tx_dstI_def) = {
 2100  .name = TX_NAME_STR("dstI"),
 2101  .function = TX_NAME(ff_tx_dstI),
 2102  .type = TX_TYPE(DST_I),
 2104  .factors = { 2, TX_FACTOR_ANY },
 2105  .nb_factors = 2,
 2106  .min_len = 2,
 2107  .max_len = TX_LEN_UNLIMITED,
 2108  .init = TX_NAME(ff_tx_dcstI_init),
 2110  .prio = FF_TX_PRIO_BASE,
 2111 };
2112 
2113 int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s, int *pre_tab)
2114 {
2115  int off = 0;
2116  int len4 = s->len >> 1;
2117  double scale = s->scale_d;
2118  const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0;
2119  size_t alloc = pre_tab ? 2*len4 : len4;
2120 
2121  if (!(s->exp = av_malloc_array(alloc, sizeof(*s->exp))))
2122  return AVERROR(ENOMEM);
2123 
2124  scale = sqrt(fabs(scale));
2125 
2126  if (pre_tab)
2127  off = len4;
2128 
2129  for (int i = 0; i < len4; i++) {
2130  const double alpha = M_PI_2 * (i + theta) / len4;
2131  s->exp[off + i] = (TXComplex){ RESCALE(cos(alpha) * scale),
2132  RESCALE(sin(alpha) * scale) };
2133  }
2134 
2135  if (pre_tab)
2136  for (int i = 0; i < len4; i++)
2137  s->exp[i] = s->exp[len4 + pre_tab[i]];
2138 
2139  return 0;
2140 }
2141 
/* Master registry of every codelet compiled for this sample format
 * (this template is included once per TXSample type — presumably the
 * generic dispatcher in tx.c scans this list and selects codelets by
 * type, flags and priority; confirm against tx.c). The list is
 * NULL-terminated. */
 2142 const FFTXCodelet * const TX_NAME(ff_tx_codelet_list)[] = {
 2143  /* Split-Radix codelets */
 2144  &TX_NAME(ff_tx_fft2_ns_def),
 2145  &TX_NAME(ff_tx_fft4_ns_def),
 2146  &TX_NAME(ff_tx_fft8_ns_def),
 2147  &TX_NAME(ff_tx_fft16_ns_def),
 2148  &TX_NAME(ff_tx_fft32_ns_def),
 2149  &TX_NAME(ff_tx_fft64_ns_def),
 2150  &TX_NAME(ff_tx_fft128_ns_def),
 2151  &TX_NAME(ff_tx_fft256_ns_def),
 2152  &TX_NAME(ff_tx_fft512_ns_def),
 2153  &TX_NAME(ff_tx_fft1024_ns_def),
 2154  &TX_NAME(ff_tx_fft2048_ns_def),
 2155  &TX_NAME(ff_tx_fft4096_ns_def),
 2156  &TX_NAME(ff_tx_fft8192_ns_def),
 2157  &TX_NAME(ff_tx_fft16384_ns_def),
 2158  &TX_NAME(ff_tx_fft32768_ns_def),
 2159  &TX_NAME(ff_tx_fft65536_ns_def),
 2160  &TX_NAME(ff_tx_fft131072_ns_def),
 2161  &TX_NAME(ff_tx_fft262144_ns_def),
 2162  &TX_NAME(ff_tx_fft524288_ns_def),
 2163  &TX_NAME(ff_tx_fft1048576_ns_def),
 2164  &TX_NAME(ff_tx_fft2097152_ns_def),
 2165 
 2166  /* Prime factor codelets */
 2167  &TX_NAME(ff_tx_fft3_ns_def),
 2168  &TX_NAME(ff_tx_fft5_ns_def),
 2169  &TX_NAME(ff_tx_fft7_ns_def),
 2170  &TX_NAME(ff_tx_fft9_ns_def),
 2171  &TX_NAME(ff_tx_fft15_ns_def),
 2172 
 2173  /* We get these for free */
 2174  &TX_NAME(ff_tx_fft3_fwd_def),
 2175  &TX_NAME(ff_tx_fft5_fwd_def),
 2176  &TX_NAME(ff_tx_fft7_fwd_def),
 2177  &TX_NAME(ff_tx_fft9_fwd_def),
 2178 
 2179  /* Standalone transforms */
 2180  &TX_NAME(ff_tx_fft_def),
 2181  &TX_NAME(ff_tx_fft_inplace_def),
 2182  &TX_NAME(ff_tx_fft_inplace_small_def),
 2183  &TX_NAME(ff_tx_fft_pfa_def),
 2184  &TX_NAME(ff_tx_fft_pfa_ns_def),
 2185  &TX_NAME(ff_tx_fft_naive_def),
 2186  &TX_NAME(ff_tx_fft_naive_small_def),
 2187  &TX_NAME(ff_tx_mdct_fwd_def),
 2188  &TX_NAME(ff_tx_mdct_inv_def),
 2189  &TX_NAME(ff_tx_mdct_pfa_3xM_fwd_def),
 2190  &TX_NAME(ff_tx_mdct_pfa_5xM_fwd_def),
 2191  &TX_NAME(ff_tx_mdct_pfa_7xM_fwd_def),
 2192  &TX_NAME(ff_tx_mdct_pfa_9xM_fwd_def),
 2193  &TX_NAME(ff_tx_mdct_pfa_15xM_fwd_def),
 2194  &TX_NAME(ff_tx_mdct_pfa_3xM_inv_def),
 2195  &TX_NAME(ff_tx_mdct_pfa_5xM_inv_def),
 2196  &TX_NAME(ff_tx_mdct_pfa_7xM_inv_def),
 2197  &TX_NAME(ff_tx_mdct_pfa_9xM_inv_def),
 2198  &TX_NAME(ff_tx_mdct_pfa_15xM_inv_def),
 2199  &TX_NAME(ff_tx_mdct_naive_fwd_def),
 2200  &TX_NAME(ff_tx_mdct_naive_inv_def),
 2201  &TX_NAME(ff_tx_mdct_inv_full_def),
 2202  &TX_NAME(ff_tx_rdft_r2c_def),
 2203  &TX_NAME(ff_tx_rdft_r2r_def),
 2204  &TX_NAME(ff_tx_rdft_r2r_mod2_def),
 2205  &TX_NAME(ff_tx_rdft_r2i_def),
 2206  &TX_NAME(ff_tx_rdft_r2i_mod2_def),
 2207  &TX_NAME(ff_tx_rdft_c2r_def),
 2208  &TX_NAME(ff_tx_dctII_def),
 2209  &TX_NAME(ff_tx_dctIII_def),
 2210  &TX_NAME(ff_tx_dctI_def),
 2211  &TX_NAME(ff_tx_dstI_def),
 2212 
/* Sentinel: list consumers stop at the first NULL entry */
 2213  NULL,
 2214 };
func
int(* func)(AVBPrint *dst, const char *in, const char *arg)
Definition: jacosubdec.c:68
DCT_I
@ DCT_I
Definition: avfft.h:121
ff_tx_fft_sr_combine
static void TX_NAME() ff_tx_fft_sr_combine(TXComplex *z, const TXSample *cos, int len)
Definition: tx_template.c:564
ff_tx_dct_init
static av_cold int TX_NAME() ff_tx_dct_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1836
AV_TX_REAL_TO_REAL
@ AV_TX_REAL_TO_REAL
Perform a real to half-complex RDFT.
Definition: tx.h:184
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
out
FILE * out
Definition: movenc.c:54
ff_ctz
#define ff_ctz
Definition: intmath.h:107
TRANSFORM
#define TRANSFORM(a0, a1, a2, a3, wre, wim)
Definition: tx_template.c:556
src1
const pixel * src1
Definition: h264pred_template.c:421
AVTXContext
Definition: tx_priv.h:235
int64_t
long long int64_t
Definition: coverity.c:34
ff_tx_fft
static void TX_NAME() ff_tx_fft(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:769
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
FFTXCodeletOptions
Definition: tx_priv.h:183
w
uint8_t w
Definition: llviddspenc.c:38
M_PI_2
#define M_PI_2
Definition: mathematics.h:73
TX_MAX_DECOMPOSITIONS
#define TX_MAX_DECOMPOSITIONS
Definition: tx_priv.h:197
SR_POW2_TABLES
#define SR_POW2_TABLES
Definition: tx_template.c:30
ff_tx_fft_pfa
static void TX_NAME() ff_tx_fft_pfa(AVTXContext *s, void *_out, void *_in, ptrdiff_t stride)
Definition: tx_template.c:1064
ff_tx_fft16_ns
static void TX_NAME() ff_tx_fft16_ns(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:681
ff_tx_gen_inplace_map
int ff_tx_gen_inplace_map(AVTXContext *s, int len)
Definition: tx.c:155
t1
#define t1
Definition: regdef.h:29
fft15
static av_always_inline void fft15(TXComplex *out, TXComplex *in, ptrdiff_t stride)
Definition: tx_template.c:467
FF_TX_CPU_FLAGS_ALL
#define FF_TX_CPU_FLAGS_ALL
Definition: tx_priv.h:230
ff_tx_gen_compound_mapping
int ff_tx_gen_compound_mapping(AVTXContext *s, FFTXCodeletOptions *opts, int inv, int n, int m)
Definition: tx.c:74
ff_tx_dctI
static void TX_NAME() ff_tx_dctI(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:2042
ff_tx_fft_naive
static void TX_NAME() ff_tx_fft_naive(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:877
av_malloc
#define av_malloc(s)
Definition: tableprint_vlc.h:30
DECL_FFT5
#define DECL_FFT5(NAME, D0, D1, D2, D3, D4)
Definition: tx_template.c:213
ff_tx_mdct_naive_fwd
static void TX_NAME() ff_tx_mdct_naive_fwd(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1150
ff_tx_rdft_init
static av_cold int TX_NAME() ff_tx_rdft_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1607
DECL_SR_CODELET_DEF
#define DECL_SR_CODELET_DEF(n)
Definition: tx_template.c:601
FFTabInitData::func
void(* func)(void)
Definition: tx_template.c:63
sr_tabs_init_funcs
static SR_POW2_TABLES void(*const sr_tabs_init_funcs[])(void)
Definition: tx_template.c:81
tab
static const struct twinvq_data tab
Definition: twinvq_data.h:10345
TX_NAME
static const FFTXCodelet TX_NAME(ff_tx_fft_def)
FF_TX_MAP_GATHER
@ FF_TX_MAP_GATHER
Definition: tx_priv.h:176
sum_d
static void sum_d(const int *input, int *output, int len)
Definition: dcadct.c:51
TX_INT32
#define TX_INT32
Definition: tx_int32.c:19
sr_tabs_init_once
static AVOnce sr_tabs_init_once[]
Definition: tx_template.c:87
val
static double val(void *priv, double ch)
Definition: aeval.c:78
DECL_FACTOR_F
#define DECL_FACTOR_F(n)
Definition: tx_template.c:519
TX_MAX_SUB
#define TX_MAX_SUB
Definition: tx_priv.h:194
TABLE_DEF
#define TABLE_DEF(name, size)
Definition: tx_template.c:27
FFTXCodelet::type
enum AVTXType type
Definition: tx_priv.h:202
FFTXCodeletOptions::map_dir
FFTXMapDirection map_dir
Definition: tx_priv.h:187
mult
static int16_t mult(Float11 *f1, Float11 *f2)
Definition: g726.c:60
ff_thread_once
static int ff_thread_once(char *control, void(*routine)(void))
Definition: thread.h:205
FF_ARRAY_ELEMS
#define FF_ARRAY_ELEMS(a)
Definition: sinewin_tablegen.c:29
av_cold
#define av_cold
Definition: attributes.h:90
FFTabInitData
Definition: tx_template.c:62
float
float
Definition: af_crystalizer.c:121
c2r
static void c2r(float *buffer, int size)
Definition: af_apsyclip.c:386
s
#define s(width, name)
Definition: cbs_vp9.c:198
ff_tx_fft_factor_init
static av_cold int TX_NAME() ff_tx_fft_factor_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:480
ff_tx_mdct_fwd
static void TX_NAME() ff_tx_mdct_fwd(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1279
t7
#define t7
Definition: regdef.h:35
ff_tx_mdct_naive_init
static av_cold int TX_NAME() ff_tx_mdct_naive_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1138
FF_TX_FORWARD_ONLY
#define FF_TX_FORWARD_ONLY
Definition: tx_priv.h:158
FFTXCodelet::cpu_flags
int cpu_flags
Definition: tx_priv.h:227
DECL_FACTOR_S
#define DECL_FACTOR_S(n)
Definition: tx_template.c:498
ff_tx_dstI
static void TX_NAME() ff_tx_dstI(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:2060
if
if(ret)
Definition: filter_design.txt:179
AV_TX_FULL_IMDCT
@ AV_TX_FULL_IMDCT
Performs a full inverse MDCT rather than leaving out samples that can be derived through symmetry.
Definition: tx.h:175
opts
AVDictionary * opts
Definition: movenc.c:50
AV_ONCE_INIT
#define AV_ONCE_INIT
Definition: thread.h:203
fabs
static __device__ float fabs(float a)
Definition: cuda_runtime.h:182
AV_TX_REAL_TO_IMAGINARY
@ AV_TX_REAL_TO_IMAGINARY
Definition: tx.h:185
NULL
#define NULL
Definition: coverity.c:32
t5
#define t5
Definition: regdef.h:33
ff_tx_mdct_init
static av_cold int TX_NAME() ff_tx_mdct_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1229
t6
#define t6
Definition: regdef.h:34
AV_TX_INPLACE
@ AV_TX_INPLACE
Allows for in-place transformations, where input == output.
Definition: tx.h:161
ff_tx_gen_ptwo_revtab
int ff_tx_gen_ptwo_revtab(AVTXContext *s, FFTXCodeletOptions *opts)
Definition: tx.c:135
r2c
static void r2c(float *buffer, int size)
Definition: af_apsyclip.c:377
FF_TX_OUT_OF_PLACE
#define FF_TX_OUT_OF_PLACE
Definition: tx_priv.h:154
CMUL3
#define CMUL3(c, a, b)
Definition: tx_priv.h:150
AV_TX_UNALIGNED
@ AV_TX_UNALIGNED
Relaxes alignment requirement for the in and out arrays of av_tx_fn().
Definition: tx.h:167
exp
int8_t exp
Definition: eval.c:74
ff_tx_dctIII
static void TX_NAME() ff_tx_dctIII(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1939
DECL_COMP_MDCT
#define DECL_COMP_MDCT(N)
Definition: tx_template.c:1539
AVOnce
#define AVOnce
Definition: thread.h:202
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
ff_tx_fft_pfa_init
static av_cold int TX_NAME() ff_tx_fft_pfa_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:954
ff_tx_clear_ctx
void ff_tx_clear_ctx(AVTXContext *s)
Definition: tx.c:289
ff_tx_fft2_ns
static void TX_NAME() ff_tx_fft2_ns(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:633
FF_TX_PRESHUFFLE
#define FF_TX_PRESHUFFLE
Definition: tx_priv.h:156
ff_tx_fft_sr_codelet_init
static av_cold int TX_NAME() ff_tx_fft_sr_codelet_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:590
ff_tx_gen_default_map
int ff_tx_gen_default_map(AVTXContext *s, FFTXCodeletOptions *opts)
Definition: tx.c:524
f
f
Definition: af_crystalizer.c:121
ff_tx_init_tab_53
static av_cold void TX_TAB() ff_tx_init_tab_53(void)
Definition: tx_template.c:93
scale
static void scale(int *out, const int *in, const int w, const int h, const int shift)
Definition: vvc_intra.c:291
dc
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled top and top right vectors is used as motion vector prediction the used motion vector is the sum of the predictor and(mvx_diff, mvy_diff) *mv_scale Intra DC Prediction block[y][x] dc[1]
Definition: snow.txt:400
FF_TX_PRIO_BASE
@ FF_TX_PRIO_BASE
Definition: tx_priv.h:162
for
for(k=2;k<=8;++k)
Definition: h264pred_template.c:425
ff_tx_fft8_ns
static void TX_NAME() ff_tx_fft8_ns(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:662
fft9
static av_always_inline void fft9(TXComplex *out, TXComplex *in, ptrdiff_t stride)
Definition: tx_template.c:342
t8
#define t8
Definition: regdef.h:53
BF
#define BF(a, b, c, s)
Definition: dct32_template.c:90
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
TX_EMBED_INPUT_PFA_MAP
#define TX_EMBED_INPUT_PFA_MAP(map, tot_len, d1, d2)
Definition: tx_priv.h:271
ff_tx_fft_inplace
static void TX_NAME() ff_tx_fft_inplace(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:786
DECL_RDFT_HALF
#define DECL_RDFT_HALF(n, mode, mod2)
Definition: tx_template.c:1730
M_PI
#define M_PI
Definition: mathematics.h:67
ff_tx_fft_init
static av_cold int TX_NAME() ff_tx_fft_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:730
DST_I
@ DST_I
Definition: avfft.h:122
TXComplex
void TXComplex
Definition: tx_priv.h:65
ff_tx_mdct_inv
static void TX_NAME() ff_tx_mdct_inv(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1318
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:255
t4
#define t4
Definition: regdef.h:32
t3
#define t3
Definition: regdef.h:31
av_malloc_array
#define av_malloc_array(a, b)
Definition: tableprint_vlc.h:31
nptwo_tabs_init_once
static AVOnce nptwo_tabs_init_once[]
Definition: tx_template.c:140
av_always_inline
#define av_always_inline
Definition: attributes.h:49
ff_tx_fft_init_naive_small
static av_cold int TX_NAME() ff_tx_fft_init_naive_small(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:852
DECL_SR_CODELET
#define DECL_SR_CODELET(n, n2, n4)
Definition: tx_template.c:617
DECL_COMP_IMDCT
#define DECL_COMP_IMDCT(N)
Definition: tx_template.c:1477
av_mallocz
void * av_mallocz(size_t size)
Allocate a memory block with alignment suitable for all memory accesses (including vectors if availab...
Definition: mem.c:254
len
int len
Definition: vorbis_enc_data.h:426
fft3
static av_always_inline void fft3(TXComplex *out, TXComplex *in, ptrdiff_t stride)
Definition: tx_template.c:175
FF_TX_MAP_SCATTER
@ FF_TX_MAP_SCATTER
Definition: tx_priv.h:179
TX_LEN_UNLIMITED
#define TX_LEN_UNLIMITED
Definition: tx_priv.h:216
stride
#define stride
Definition: h264pred_template.c:537
nptwo_tabs_init_data
static const FFTabInitData nptwo_tabs_init_data[]
Definition: tx_template.c:134
ret
ret
Definition: filter_design.txt:187
ff_tx_init_subtx
av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx.c:711
FFSWAP
#define FFSWAP(type, a, b)
Definition: macros.h:52
ff_tx_init_tab_7
static av_cold void TX_TAB() ff_tx_init_tab_7(void)
Definition: tx_template.c:112
TX_FACTOR_ANY
#define TX_FACTOR_ANY
Definition: tx_priv.h:209
FF_TX_INVERSE_ONLY
#define FF_TX_INVERSE_ONLY
Definition: tx_priv.h:157
ff_tx_fft_naive_small
static void TX_NAME() ff_tx_fft_naive_small(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:904
ff_tx_init_tab_9
static av_cold void TX_TAB() ff_tx_init_tab_9(void)
Definition: tx_template.c:122
FFTXCodelet
Definition: tx_priv.h:199
ff_tx_init_tabs
av_cold void TX_TAB() ff_tx_init_tabs(int len)
Definition: tx_template.c:146
t2
#define t2
Definition: regdef.h:30
ff_tx_mdct_naive_inv
static void TX_NAME() ff_tx_mdct_naive_inv(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1171
FFTabInitData::factors
int factors[TX_MAX_SUB]
Definition: tx_template.c:64
ff_tx_dctII
static void TX_NAME() ff_tx_dctII(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1879
BUTTERFLIES
#define BUTTERFLIES(a0, a1, a2, a3)
Definition: tx_template.c:542
ff_tx_fft_pfa_ns
static void TX_NAME() ff_tx_fft_pfa_ns(AVTXContext *s, void *_out, void *_in, ptrdiff_t stride)
Definition: tx_template.c:1088
src0
const pixel *const src0
Definition: h264pred_template.c:420
FFTXCodelet::name
const char * name
Definition: tx_priv.h:200
factor
static const int factor[16]
Definition: vf_pp7.c:78
ff_tx_dcstI_init
static av_cold int TX_NAME() ff_tx_dcstI_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:2010
ff_tx_fft_inplace_small_init
static av_cold int TX_NAME() ff_tx_fft_inplace_small_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:756
map
const VDPAUPixFmtMap * map
Definition: hwcontext_vdpau.c:71
FFALIGN
#define FFALIGN(x, a)
Definition: macros.h:78
alpha
static const int16_t alpha[]
Definition: ilbcdata.h:55
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
fft7
static av_always_inline void fft7(TXComplex *out, TXComplex *in, ptrdiff_t stride)
Definition: tx_template.c:254
int32_t
int32_t
Definition: audioconvert.c:56
flags
#define flags(name, subs,...)
Definition: cbs_av1.c:482
ff_tx_mdct_gen_exp
int TX_TAB() ff_tx_mdct_gen_exp(AVTXContext *s, int *pre_tab)
Definition: tx_template.c:2111
ff_tx_gen_pfa_input_map
int ff_tx_gen_pfa_input_map(AVTXContext *s, FFTXCodeletOptions *opts, int d1, int d2)
Definition: tx.c:43
DECL_RDFT
#define DECL_RDFT(n, inv)
Definition: tx_template.c:1661
ff_tx_mdct_pfa_init
static av_cold int TX_NAME() ff_tx_mdct_pfa_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1431
ff_tx_fft4_ns
static void TX_NAME() ff_tx_fft4_ns(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:645
ff_tx_mdct_inv_full_init
static av_cold int TX_NAME() ff_tx_mdct_inv_full_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1378
ff_tx_decompose_length
int ff_tx_decompose_length(int dst[TX_MAX_DECOMPOSITIONS], enum AVTXType type, int len, int inv)
Definition: tx.c:411
TX_TYPE
#define TX_TYPE
Definition: aacdec.c:36
ff_tx_mdct_inv_full
static void TX_NAME() ff_tx_mdct_inv_full(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1398
FF_TX_PRIO_MIN
@ FF_TX_PRIO_MIN
Definition: tx_priv.h:167