#define TABLE_DEF(name, size) \
    DECLARE_ALIGNED(32, TXSample, TX_TAB(ff_tx_tab_ ##name))[size]
#define SR_POW2_TABLES \
#define SR_TABLE(len) \
    TABLE_DEF(len, len/4 + 1);
#define SR_TABLE(len) \
static av_cold void TX_TAB(ff_tx_init_tab_ ##len)(void) \
{ \
    double freq = 2*M_PI/len; \
    TXSample *tab = TX_TAB(ff_tx_tab_ ##len); \
    for (int i = 0; i < len/4; i++) \
        *tab++ = RESCALE(cos(i*freq)); \
#define SR_TABLE(len) TX_TAB(ff_tx_init_tab_ ##len),

#define SR_TABLE(len) AV_ONCE_INIT,
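/* SR_TABLE is deliberately redefined before each expansion of SR_POW2_TABLES:
 * the same list of lengths is expanded once to declare the tables, once to
 * define their init functions, once to build the array of init-function
 * pointers, and once to build the matching array of AVOnce controls. A
 * sketch of the expansion (the largest length follows the biggest split
 * radix codelet registered at the bottom of this file):
 *
 *     #define SR_POW2_TABLES \
 *         SR_TABLE(8)        \
 *         SR_TABLE(16)       \
 *         ...                \
 *         SR_TABLE(2097152)
 */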
    TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI /  5));
    TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI /  5));
    TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10));
    TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10));
    TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI /  5));
    TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI /  5));
    TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10));
    TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10));
    TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12));
    TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12));
    TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI /  6));
    TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI /  6));
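    /* Entries [0..7] are the 5-point twiddles, each value stored twice so
     * matching re/im pairs sit next to each other (convenient for vectorized
     * loads); [8..11] are the 3-point constants, i.e. cos(pi/6), cos(pi/3)
     * and cos(4*pi/3) = -0.5. */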
    TX_TAB(ff_tx_tab_7)[0] = RESCALE(cos(2 * M_PI /  7));
    TX_TAB(ff_tx_tab_7)[1] = RESCALE(sin(2 * M_PI /  7));
    TX_TAB(ff_tx_tab_7)[2] = RESCALE(sin(2 * M_PI / 28));
    TX_TAB(ff_tx_tab_7)[3] = RESCALE(cos(2 * M_PI / 28));
    TX_TAB(ff_tx_tab_7)[4] = RESCALE(cos(2 * M_PI / 14));
    TX_TAB(ff_tx_tab_7)[5] = RESCALE(sin(2 * M_PI / 14));
    TX_TAB(ff_tx_tab_9)[0] = RESCALE(cos(2 * M_PI /  3));
    TX_TAB(ff_tx_tab_9)[1] = RESCALE(sin(2 * M_PI /  3));
    TX_TAB(ff_tx_tab_9)[2] = RESCALE(cos(2 * M_PI /  9));
    TX_TAB(ff_tx_tab_9)[3] = RESCALE(sin(2 * M_PI /  9));
    TX_TAB(ff_tx_tab_9)[4] = RESCALE(cos(2 * M_PI / 36));
    TX_TAB(ff_tx_tab_9)[5] = RESCALE(sin(2 * M_PI / 36));
    TX_TAB(ff_tx_tab_9)[6] = TX_TAB(ff_tx_tab_9)[2] + TX_TAB(ff_tx_tab_9)[5];
    TX_TAB(ff_tx_tab_9)[7] = TX_TAB(ff_tx_tab_9)[3] - TX_TAB(ff_tx_tab_9)[4];
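    /* [6] and [7] precompute a sum and a difference of the constants above,
     * saving fft9 from recomputing them on every call. */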
    int idx = factor_2 - 3;
    for (int i = 0; i <= idx; i++)
    const TXSample *tab = TX_TAB(ff_tx_tab_53);
    BF(tmp[1].re, tmp[2].im, in[1].im, in[2].im);
    BF(tmp[1].im, tmp[2].re, in[1].re, in[2].re);
    mtmp[0] = (int64_t)tab[ 8] * tmp[1].re;
    mtmp[1] = (int64_t)tab[ 9] * tmp[1].im;
    mtmp[2] = (int64_t)tab[10] * tmp[2].re;
    mtmp[3] = (int64_t)tab[10] * tmp[2].im;
    out[1*stride].re = tmp[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31);
    out[1*stride].im = tmp[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31);
    out[2*stride].re = tmp[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31);
    out[2*stride].im = tmp[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31);
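    /* Fixed-point path: the 64-bit products are Q31; adding 0x40000000
     * before the >> 31 shift rounds to nearest rather than truncating. */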
#define DECL_FFT5(NAME, D0, D1, D2, D3, D4) \
static av_always_inline void NAME(TXComplex *out, TXComplex *in, \
    TXComplex dc, z0[4], t[6]; \
    const TXSample *tab = TX_TAB(ff_tx_tab_53); \
    BF(t[1].im, t[0].re, in[1].re, in[4].re); \
    BF(t[1].re, t[0].im, in[1].im, in[4].im); \
    BF(t[3].im, t[2].re, in[2].re, in[3].re); \
    BF(t[3].re, t[2].im, in[2].im, in[3].im); \
    out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re; \
    out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im; \
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re); \
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im); \
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re); \
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im); \
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re); \
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im); \
    BF(z0[2].re, z0[1].re, t[4].re, t[5].re); \
    BF(z0[2].im, z0[1].im, t[4].im, t[5].im); \
    out[D1*stride].re = dc.re + (TXUSample)z0[3].re; \
    out[D1*stride].im = dc.im + (TXUSample)z0[0].im; \
    out[D2*stride].re = dc.re + (TXUSample)z0[2].re; \
    out[D2*stride].im = dc.im + (TXUSample)z0[1].im; \
    out[D3*stride].re = dc.re + (TXUSample)z0[1].re; \
    out[D3*stride].im = dc.im + (TXUSample)z0[2].im; \
    out[D4*stride].re = dc.re + (TXUSample)z0[0].re; \
    out[D4*stride].im = dc.im + (TXUSample)z0[3].im; \
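/* The D0..D4 parameters permute the output indices, so the same 5-point
 * kernel can write directly into PFA order. The plain instantiation is
 * DECL_FFT5(fft5, 0, 1, 2, 3, 4); the full file also instantiates permuted
 * variants that fft15 uses to scatter its outputs. */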
    BF(t[1].re, t[0].re, in[1].re, in[6].re);
    BF(t[1].im, t[0].im, in[1].im, in[6].im);
    BF(t[3].re, t[2].re, in[2].re, in[5].re);
    BF(t[3].im, t[2].im, in[2].im, in[5].im);
    BF(t[5].re, t[4].re, in[3].re, in[4].re);
    BF(t[5].im, t[4].im, in[3].im, in[4].im);
    out[0*stride].re = dc.re + t[0].re + t[2].re + t[4].re;
    out[0*stride].im = dc.im + t[0].im + t[2].im + t[4].im;
    mtmp[ 0] = ((int64_t)tab[0].re)*t[0].re - ((int64_t)tab[2].re)*t[4].re;
    mtmp[ 1] = ((int64_t)tab[0].re)*t[4].re - ((int64_t)tab[1].re)*t[0].re;
    mtmp[ 2] = ((int64_t)tab[0].re)*t[2].re - ((int64_t)tab[2].re)*t[0].re;
    mtmp[ 3] = ((int64_t)tab[0].re)*t[0].im - ((int64_t)tab[1].re)*t[2].im;
    mtmp[ 4] = ((int64_t)tab[0].re)*t[4].im - ((int64_t)tab[1].re)*t[0].im;
    mtmp[ 5] = ((int64_t)tab[0].re)*t[2].im - ((int64_t)tab[2].re)*t[0].im;

    mtmp[ 6] = ((int64_t)tab[2].im)*t[1].im + ((int64_t)tab[1].im)*t[5].im;
    mtmp[ 7] = ((int64_t)tab[0].im)*t[5].im + ((int64_t)tab[2].im)*t[3].im;
    mtmp[ 8] = ((int64_t)tab[2].im)*t[5].im + ((int64_t)tab[1].im)*t[3].im;
    mtmp[ 9] = ((int64_t)tab[0].im)*t[1].re + ((int64_t)tab[1].im)*t[3].re;
    mtmp[10] = ((int64_t)tab[2].im)*t[3].re + ((int64_t)tab[0].im)*t[5].re;
    mtmp[11] = ((int64_t)tab[2].im)*t[1].re + ((int64_t)tab[1].im)*t[5].re;
    z[0].re = (int32_t)(mtmp[ 0] - ((int64_t)tab[1].re)*t[2].re + 0x40000000 >> 31);
    z[1].re = (int32_t)(mtmp[ 1] - ((int64_t)tab[2].re)*t[2].re + 0x40000000 >> 31);
    z[2].re = (int32_t)(mtmp[ 2] - ((int64_t)tab[1].re)*t[4].re + 0x40000000 >> 31);
    z[0].im = (int32_t)(mtmp[ 3] - ((int64_t)tab[2].re)*t[4].im + 0x40000000 >> 31);
    z[1].im = (int32_t)(mtmp[ 4] - ((int64_t)tab[2].re)*t[2].im + 0x40000000 >> 31);
    z[2].im = (int32_t)(mtmp[ 5] - ((int64_t)tab[1].re)*t[4].im + 0x40000000 >> 31);

    t[0].re = (int32_t)(mtmp[ 6] - ((int64_t)tab[0].im)*t[3].im + 0x40000000 >> 31);
    t[2].re = (int32_t)(mtmp[ 7] - ((int64_t)tab[1].im)*t[1].im + 0x40000000 >> 31);
    t[4].re = (int32_t)(mtmp[ 8] + ((int64_t)tab[0].im)*t[1].im + 0x40000000 >> 31);
    t[0].im = (int32_t)(mtmp[ 9] + ((int64_t)tab[2].im)*t[5].re + 0x40000000 >> 31);
    t[2].im = (int32_t)(mtmp[10] - ((int64_t)tab[1].im)*t[1].re + 0x40000000 >> 31);
    t[4].im = (int32_t)(mtmp[11] - ((int64_t)tab[0].im)*t[3].re + 0x40000000 >> 31);
    z[0].re = tab[0].re*t[0].re - tab[2].re*t[4].re - tab[1].re*t[2].re;
    z[1].re = tab[0].re*t[4].re - tab[1].re*t[0].re - tab[2].re*t[2].re;
    z[2].re = tab[0].re*t[2].re - tab[2].re*t[0].re - tab[1].re*t[4].re;
    z[0].im = tab[0].re*t[0].im - tab[1].re*t[2].im - tab[2].re*t[4].im;
    z[1].im = tab[0].re*t[4].im - tab[1].re*t[0].im - tab[2].re*t[2].im;
    z[2].im = tab[0].re*t[2].im - tab[2].re*t[0].im - tab[1].re*t[4].im;
    t[0].re = tab[2].im*t[1].im + tab[1].im*t[5].im - tab[0].im*t[3].im;
    t[2].re = tab[0].im*t[5].im + tab[2].im*t[3].im - tab[1].im*t[1].im;
    t[4].re = tab[2].im*t[5].im + tab[1].im*t[3].im + tab[0].im*t[1].im;
    t[0].im = tab[0].im*t[1].re + tab[1].im*t[3].re + tab[2].im*t[5].re;
    t[2].im = tab[2].im*t[3].re + tab[0].im*t[5].re - tab[1].im*t[1].re;
    t[4].im = tab[2].im*t[1].re + tab[1].im*t[5].re - tab[0].im*t[3].re;
    BF(t[1].re, z[0].re, z[0].re, t[4].re);
    BF(t[3].re, z[1].re, z[1].re, t[2].re);
    BF(t[5].re, z[2].re, z[2].re, t[0].re);
    BF(t[1].im, z[0].im, z[0].im, t[0].im);
    BF(t[3].im, z[1].im, z[1].im, t[2].im);
    BF(t[5].im, z[2].im, z[2].im, t[4].im);
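    /* Final butterflies of fft7: recombine the symmetric (z) and
     * antisymmetric (t) parts into the six non-DC outputs. */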
    BF(t[1].re, t[0].re, in[1].re, in[8].re);
    BF(t[1].im, t[0].im, in[1].im, in[8].im);
    BF(t[3].re, t[2].re, in[2].re, in[7].re);
    BF(t[3].im, t[2].im, in[2].im, in[7].im);
    BF(t[5].re, t[4].re, in[3].re, in[6].re);
    BF(t[5].im, t[4].im, in[3].im, in[6].im);
    BF(t[7].re, t[6].re, in[4].re, in[5].re);
    BF(t[7].im, t[6].im, in[4].im, in[5].im);
    w[0].re = t[0].re - t[6].re;
    w[0].im = t[0].im - t[6].im;
    w[1].re = t[2].re - t[6].re;
    w[1].im = t[2].im - t[6].im;
    w[2].re = t[1].re - t[7].re;
    w[2].im = t[1].im - t[7].im;
    w[3].re = t[3].re + t[7].re;
    w[3].im = t[3].im + t[7].im;
    z[0].re = dc.re + t[4].re;
    z[0].im = dc.im + t[4].im;

    z[1].re = t[0].re + t[2].re + t[6].re;
    z[1].im = t[0].im + t[2].im + t[6].im;
    mtmp[0] = t[1].re - t[3].re + t[7].re;
    mtmp[1] = t[1].im - t[3].im + t[7].im;

    y[3].re = (int32_t)(((int64_t)tab[0].im)*mtmp[0] + 0x40000000 >> 31);
    y[3].im = (int32_t)(((int64_t)tab[0].im)*mtmp[1] + 0x40000000 >> 31);

    mtmp[0] = (int32_t)(((int64_t)tab[0].re)*z[1].re + 0x40000000 >> 31);
    mtmp[1] = (int32_t)(((int64_t)tab[0].re)*z[1].im + 0x40000000 >> 31);
    mtmp[2] = (int32_t)(((int64_t)tab[0].re)*t[4].re + 0x40000000 >> 31);
    mtmp[3] = (int32_t)(((int64_t)tab[0].re)*t[4].im + 0x40000000 >> 31);

    x[3].re = z[0].re + (int32_t)mtmp[0];
    x[3].im = z[0].im + (int32_t)mtmp[1];
    z[0].re = in[0].re + (int32_t)mtmp[2];
    z[0].im = in[0].im + (int32_t)mtmp[3];
    mtmp[0] = ((int64_t)tab[1].re)*w[0].re;
    mtmp[1] = ((int64_t)tab[1].re)*w[0].im;
    mtmp[2] = ((int64_t)tab[2].im)*w[0].re;
    mtmp[3] = ((int64_t)tab[2].im)*w[0].im;
    mtmp[4] = ((int64_t)tab[1].im)*w[2].re;
    mtmp[5] = ((int64_t)tab[1].im)*w[2].im;
    mtmp[6] = ((int64_t)tab[2].re)*w[2].re;
    mtmp[7] = ((int64_t)tab[2].re)*w[2].im;
    x[1].re = (int32_t)(mtmp[0] + ((int64_t)tab[2].im)*w[1].re + 0x40000000 >> 31);
    x[1].im = (int32_t)(mtmp[1] + ((int64_t)tab[2].im)*w[1].im + 0x40000000 >> 31);
    x[2].re = (int32_t)(mtmp[2] - ((int64_t)tab[3].re)*w[1].re + 0x40000000 >> 31);
    x[2].im = (int32_t)(mtmp[3] - ((int64_t)tab[3].re)*w[1].im + 0x40000000 >> 31);
    y[1].re = (int32_t)(mtmp[4] + ((int64_t)tab[2].re)*w[3].re + 0x40000000 >> 31);
    y[1].im = (int32_t)(mtmp[5] + ((int64_t)tab[2].re)*w[3].im + 0x40000000 >> 31);
    y[2].re = (int32_t)(mtmp[6] - ((int64_t)tab[3].im)*w[3].re + 0x40000000 >> 31);
    y[2].im = (int32_t)(mtmp[7] - ((int64_t)tab[3].im)*w[3].im + 0x40000000 >> 31);

    y[0].re = (int32_t)(((int64_t)tab[0].im)*t[5].re + 0x40000000 >> 31);
    y[0].im = (int32_t)(((int64_t)tab[0].im)*t[5].im + 0x40000000 >> 31);
    y[3].re = tab[0].im*(t[1].re - t[3].re + t[7].re);
    y[3].im = tab[0].im*(t[1].im - t[3].im + t[7].im);

    x[3].re = z[0].re + tab[0].re*z[1].re;
    x[3].im = z[0].im + tab[0].re*z[1].im;
    z[0].re = dc.re + tab[0].re*t[4].re;
    z[0].im = dc.im + tab[0].re*t[4].im;
    x[1].re = tab[1].re*w[0].re + tab[2].im*w[1].re;
    x[1].im = tab[1].re*w[0].im + tab[2].im*w[1].im;
    x[2].re = tab[2].im*w[0].re - tab[3].re*w[1].re;
    x[2].im = tab[2].im*w[0].im - tab[3].re*w[1].im;
    y[1].re = tab[1].im*w[2].re + tab[2].re*w[3].re;
    y[1].im = tab[1].im*w[2].im + tab[2].re*w[3].im;
    y[2].re = tab[2].re*w[2].re - tab[3].im*w[3].re;
    y[2].im = tab[2].re*w[2].im - tab[3].im*w[3].im;

    y[0].re = tab[0].im*t[5].re;
    y[0].im = tab[0].im*t[5].im;
    x[4].re = x[1].re + x[2].re;
    x[4].im = x[1].im + x[2].im;

    y[4].re = y[1].re - y[2].re;
    y[4].im = y[1].im - y[2].im;
    x[1].re = z[0].re + x[1].re;
    x[1].im = z[0].im + x[1].im;
    y[1].re = y[0].re + y[1].re;
    y[1].im = y[0].im + y[1].im;
    x[2].re = z[0].re + x[2].re;
    x[2].im = z[0].im + x[2].im;
    y[2].re = y[2].re - y[0].re;
    y[2].im = y[2].im - y[0].im;
    x[4].re = z[0].re - x[4].re;
    x[4].im = z[0].im - x[4].im;
    y[4].re = y[0].re - y[4].re;
    y[4].im = y[0].im - y[4].im;
    for (int i = 0; i < 5; i++)
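    /* fft15 = 5 x fft3 followed by 3 x fft5: since 3 and 5 are coprime, the
     * prime-factor mapping needs no twiddle multiplies between stages. */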
#define DECL_FACTOR_S(n) \
static void TX_NAME(ff_tx_fft##n)(AVTXContext *s, void *dst, \
                                  void *src, ptrdiff_t stride) \
    fft##n((TXComplex *)dst, (TXComplex *)src, stride / sizeof(TXComplex)); \
static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = { \
    .name       = TX_NAME_STR("fft" #n "_ns"), \
    .function   = TX_NAME(ff_tx_fft##n), \
    .type       = TX_TYPE(FFT), \
    .flags      = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE | \
                  AV_TX_UNALIGNED | FF_TX_PRESHUFFLE, \
    .init       = TX_NAME(ff_tx_fft_factor_init), \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL, \
    .prio       = FF_TX_PRIO_BASE, \
#define DECL_FACTOR_F(n) \
static const FFTXCodelet TX_NAME(ff_tx_fft##n##_fwd_def) = { \
    .name       = TX_NAME_STR("fft" #n "_fwd"), \
    .function   = TX_NAME(ff_tx_fft##n), \
    .type       = TX_TYPE(FFT), \
    .flags      = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE | \
                  AV_TX_UNALIGNED | FF_TX_FORWARD_ONLY, \
    .init       = TX_NAME(ff_tx_fft_factor_init), \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL, \
    .prio       = FF_TX_PRIO_BASE, \
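/* These macros are instantiated once per small factor kernel (3, 5, 7, 9 and
 * 15 in the full file); the _ns ("no shuffle") codelets expect pre-permuted
 * input (FF_TX_PRESHUFFLE), while the _fwd ones are forward-only entry
 * points for the same kernels. */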
#define BUTTERFLIES(a0, a1, a2, a3) \
    BF(t3, t5, t5, t1); \
    BF(a2.re, a0.re, r0, t5); \
    BF(a3.im, a1.im, i1, t3); \
    BF(t4, t6, t2, t6); \
    BF(a3.re, a1.re, r1, t4); \
    BF(a2.im, a0.im, i0, t6); \
#define TRANSFORM(a0, a1, a2, a3, wre, wim) \
    CMUL(t1, t2, a2.re, a2.im, wre, -wim); \
    CMUL(t5, t6, a3.re, a3.im, wre,  wim); \
    BUTTERFLIES(a0, a1, a2, a3); \
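/* TRANSFORM is the split-radix butterfly: the two quarter-length outputs are
 * rotated by conjugate twiddles (wre, -wim) and (wre, wim), then merged with
 * the half-length outputs by BUTTERFLIES. */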
static void TX_NAME(ff_tx_fft_sr_combine)(TXComplex *z,
                                          const TXSample *cos, int len)
    const TXSample *wim = cos + o1 - 7;

    for (int i = 0; i < len; i += 4) {
        TRANSFORM(z[0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]);
        TRANSFORM(z[2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[2], wim[5]);
        TRANSFORM(z[4], z[o1 + 4], z[o2 + 4], z[o3 + 4], cos[4], wim[3]);
        TRANSFORM(z[6], z[o1 + 6], z[o2 + 6], z[o3 + 6], cos[6], wim[1]);

        TRANSFORM(z[1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[1], wim[6]);
        TRANSFORM(z[3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[3], wim[4]);
        TRANSFORM(z[5], z[o1 + 5], z[o2 + 5], z[o3 + 5], cos[5], wim[2]);
        TRANSFORM(z[7], z[o1 + 7], z[o2 + 7], z[o3 + 7], cos[7], wim[0]);
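        /* The cos table is read forwards while wim indexes it backwards,
         * using sin(x) = cos(pi/2 - x) so a single quarter-length table
         * provides both factors; eight butterflies per iteration. */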
#define DECL_SR_CODELET_DEF(n) \
static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = { \
    .name       = TX_NAME_STR("fft" #n "_ns"), \
    .function   = TX_NAME(ff_tx_fft##n##_ns), \
    .type       = TX_TYPE(FFT), \
    .flags      = FF_TX_OUT_OF_PLACE | AV_TX_INPLACE | \
                  AV_TX_UNALIGNED | FF_TX_PRESHUFFLE, \
    .init       = TX_NAME(ff_tx_fft_sr_codelet_init), \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL, \
    .prio       = FF_TX_PRIO_BASE, \
#define DECL_SR_CODELET(n, n2, n4) \
static void TX_NAME(ff_tx_fft##n##_ns)(AVTXContext *s, void *_dst, \
                                       void *_src, ptrdiff_t stride) \
    TXComplex *src = _src; \
    TXComplex *dst = _dst; \
    const TXSample *cos = TX_TAB(ff_tx_tab_##n); \
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride); \
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride); \
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride); \
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1); \
DECL_SR_CODELET_DEF(n)
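/* Recursive split radix: one half-length transform plus two quarter-length
 * ones, then a combine pass over the results. The full file builds the whole
 * power-of-two chain this way, starting from DECL_SR_CODELET(32, 16, 8). */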
static void TX_NAME(ff_tx_fft2_ns)(AVTXContext *s, void *_dst,
                                   void *_src, ptrdiff_t stride)

static void TX_NAME(ff_tx_fft4_ns)(AVTXContext *s, void *_dst,
                                   void *_src, ptrdiff_t stride)
    BF(dst[2].re, dst[0].re, t1, t6);
    BF(dst[3].im, dst[1].im, t4, t8);
    BF(dst[3].re, dst[1].re, t3, t7);
    BF(dst[2].im, dst[0].im, t2, t5);
static void TX_NAME(ff_tx_fft8_ns)(AVTXContext *s, void *_dst,
                                   void *_src, ptrdiff_t stride)
    const TXSample cos = TX_TAB(ff_tx_tab_8)[1];

    TRANSFORM(dst[1], dst[3], dst[5], dst[7], cos, cos);
static void TX_NAME(ff_tx_fft16_ns)(AVTXContext *s, void *_dst,
                                    void *_src, ptrdiff_t stride)
    const TXSample *cos = TX_TAB(ff_tx_tab_16);

    TXSample cos_16_1 = cos[1];
    TXSample cos_16_2 = cos[2];
    TXSample cos_16_3 = cos[3];

    TRANSFORM(dst[ 2], dst[ 6], dst[10], dst[14], cos_16_2, cos_16_2);
    TRANSFORM(dst[ 1], dst[ 5], dst[ 9], dst[13], cos_16_1, cos_16_3);
    TRANSFORM(dst[ 3], dst[ 7], dst[11], dst[15], cos_16_3, cos_16_1);
static void TX_NAME(ff_tx_fft)(AVTXContext *s, void *_dst,
                               void *_src, ptrdiff_t stride)
    int *map = s->sub[0].map;

    for (int i = 0; i < len; i++)

    s->fn[0](&s->sub[0], dst2, dst1, stride);
static void TX_NAME(ff_tx_fft_inplace)(AVTXContext *s, void *_dst,
                                       void *_src, ptrdiff_t stride)
    const int *map = s->sub->map;
    const int *inplace_idx = s->map;
    int src_idx, dst_idx;

    src_idx = *inplace_idx++;

        dst_idx = map[src_idx];

            dst_idx = map[dst_idx];
        } while (dst_idx != src_idx);

    } while ((src_idx = *inplace_idx++));
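/* The in-place variant permutes by following each cycle of the reverse
 * table: s->map lists one starting index per cycle, and elements are swapped
 * along map[] until the cycle returns to its start, so no scratch buffer is
 * required. */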
    .name = TX_NAME_STR("fft"),

    .name = TX_NAME_STR("fft_inplace_small"),

    .name = TX_NAME_STR("fft_inplace"),
    for (int i = 0; i < len; i++) {
        for (int j = 0; j < len; j++) {
            const double factor = phase*i*j;
    const int n = s->len;
    double phase = s->inv ? 2.0*M_PI/n : -2.0*M_PI/n;

    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            const double factor = phase*i*j;
    const int n = s->len;

    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
    .name = TX_NAME_STR("fft_naive_small"),

    .name = TX_NAME_STR("fft_naive"),
    size_t extra_tmp_len = 0;
    for (int i = 0; i < ret; i++) {
        int len1 = len_list[i];
        int len2 = len / len1;

        if (len2 & (len2 - 1))

    } else if (ret < 0) {

    } else if (ret < 0) {

    } else if (ret < 0) {
                                      s->sub[0].len, s->sub[1].len)))
    tmp = (int *)s->tmp;
    for (int k = 0; k < len; k += s->sub[0].len) {
        memcpy(tmp, &s->map[k], s->sub[0].len*sizeof(*tmp));
        for (int i = 0; i < s->sub[0].len; i++)
            s->map[k + i] = tmp[s->sub[0].map[i]];
        extra_tmp_len = len;

        extra_tmp_len = s->sub[0].len;

    if (extra_tmp_len && !(s->exp = av_malloc(extra_tmp_len*sizeof(*s->exp))))
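    /* s->exp doubles as scratch here: ff_tx_fft_pfa below gathers permuted
     * inputs into it, needing either a full-length buffer or one
     * sub-transform's worth, depending on the branch taken above. */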
static void TX_NAME(ff_tx_fft_pfa)(AVTXContext *s, void *_out,
                                   void *_in, ptrdiff_t stride)
    const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
    const int *in_map = s->map, *out_map = in_map + l;
    const int *sub_map = s->sub[1].map;

    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++)
            s->exp[j] = in[in_map[i*n + j]];
        s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], s->exp, m*sizeof(TXComplex));
    for (int i = 0; i < n; i++)
        s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));

    for (int i = 0; i < l; i++)
static void TX_NAME(ff_tx_fft_pfa_ns)(AVTXContext *s, void *_out,
                                      void *_in, ptrdiff_t stride)
    const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
    const int *in_map = s->map, *out_map = in_map + l;
    const int *sub_map = s->sub[1].map;

    for (int i = 0; i < m; i++)
        s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], &in[i*n], m*sizeof(TXComplex));

    for (int i = 0; i < n; i++)
        s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));
    for (int i = 0; i < l; i++)

    .name = TX_NAME_STR("fft_pfa"),

    .name = TX_NAME_STR("fft_pfa_ns"),
    s->scale_d = *((SCALE_TYPE *)scale);
    s->scale_f = s->scale_d;
static void TX_NAME(ff_tx_mdct_naive_fwd)(AVTXContext *s, void *_dst,
                                          void *_src, ptrdiff_t stride)
    TXSample *src = _src;
    TXSample *dst = _dst;
    double scale = s->scale_d;
    const double phase = M_PI/(4.0*len);

    for (int i = 0; i < len; i++) {
        for (int j = 0; j < len*2; j++) {
            int a = (2*j + 1 + len) * (2*i + 1);
            sum += UNSCALE(src[j]) * cos(a * phase);
static void TX_NAME(ff_tx_mdct_naive_inv)(AVTXContext *s, void *_dst,
                                          void *_src, ptrdiff_t stride)
    TXSample *src = _src;
    TXSample *dst = _dst;
    double scale = s->scale_d;
    int len = s->len >> 1;
    const double phase = M_PI/(4.0*len2);

    for (int i = 0; i < len; i++) {
        double i_d = phase * (4*len - 2*i - 1);
        double i_u = phase * (3*len2 + 2*i + 1);
        for (int j = 0; j < len2; j++) {
            double a = (2 * j + 1);
            double a_d = cos(a * i_d);
            double a_u = cos(a * i_u);
    .name = TX_NAME_STR("mdct_naive_fwd"),

    .name = TX_NAME_STR("mdct_naive_inv"),
    s->scale_d = *((SCALE_TYPE *)scale);
    s->scale_f = s->scale_d;

    memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map));

    for (int i = 0; i < len >> 1; i++)
    for (int i = 0; i < (s->len >> 1); i++)
    TXSample *src = _src, *dst = _dst;
    const int len2 = s->len >> 1;
    const int len4 = s->len >> 2;
    const int len3 = len2 * 3;
    const int *sub_map = s->map;

    for (int i = 0; i < len2; i++) {
        const int idx = sub_map[i];

            tmp.re = FOLD(-src[ len2 + k],  src[1*len2 - 1 - k]);
            tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);

            tmp.re = FOLD(-src[ len2 + k], -src[5*len2 - 1 - k]);
            tmp.im = FOLD( src[-len2 + k], -src[1*len3 - 1 - k]);

        CMUL(z[idx].im, z[idx].re, tmp.re, tmp.im, exp[i].re, exp[i].im);
    for (int i = 0; i < len4; i++) {
        const int i0 = len4 + i, i1 = len4 - i - 1;
    const TXSample *src = _src, *in1, *in2;
    const int len2 = s->len >> 1;
    const int len4 = s->len >> 2;
    const int *sub_map = s->map;

    for (int i = 0; i < len2; i++) {

    for (int i = 0; i < len4; i++) {
        const int i0 = len4 + i, i1 = len4 - i - 1;

        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);
    .name = TX_NAME_STR("mdct_fwd"),

    .name = TX_NAME_STR("mdct_inv"),
    s->scale_d = *((SCALE_TYPE *)scale);
    s->scale_f = s->scale_d;

static void TX_NAME(ff_tx_mdct_inv_full)(AVTXContext *s, void *_dst,
                                         void *_src, ptrdiff_t stride)
    int len = s->len << 1;
    int len2 = len >> 1;
    int len4 = len >> 2;
    TXSample *dst = _dst;

    s->fn[0](&s->sub[0], dst + len4, _src, stride);

    for (int i = 0; i < len4; i++) {
    .name = TX_NAME_STR("mdct_inv_full"),
    sub_len = len / cd->factors[0];

    s->scale_d = *((SCALE_TYPE *)scale);
    s->scale_f = s->scale_d;

                               sub_len, inv, scale)))

    if (cd->factors[0] == 15)

    for (int i = 0; i < len; i++)
#define DECL_COMP_IMDCT(N) \
static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst, \
                                                void *_src, ptrdiff_t stride) \
    TXComplex fft##N##in[N]; \
    TXComplex *z = _dst, *exp = s->exp; \
    const TXSample *src = _src, *in1, *in2; \
    const int len4 = s->len >> 2; \
    const int len2 = s->len >> 1; \
    const int m = s->sub->len; \
    const int *in_map = s->map, *out_map = in_map + N*m; \
    const int *sub_map = s->sub->map; \
    stride /= sizeof(*src); \
    in2 = src + ((N*m*2) - 1) * stride; \
    for (int i = 0; i < len2; i += N) { \
        for (int j = 0; j < N; j++) { \
            const int k = in_map[j]; \
            TXComplex tmp = { in2[-k*stride], in1[k*stride] }; \
            CMUL3(fft##N##in[j], tmp, exp[j]); \
        fft##N(s->tmp + *(sub_map++), fft##N##in, m); \
    for (int i = 0; i < N; i++) \
        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex)); \
    for (int i = 0; i < len4; i++) { \
        const int i0 = len4 + i, i1 = len4 - i - 1; \
        const int s0 = out_map[i0], s1 = out_map[i1]; \
        TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re }; \
        TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re }; \
        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re); \
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re); \
static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_inv_def) = { \
    .name       = TX_NAME_STR("mdct_pfa_" #N "xM_inv"), \
    .function   = TX_NAME(ff_tx_mdct_pfa_##N##xM_inv), \
    .type       = TX_TYPE(MDCT), \
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY, \
    .factors    = { N, TX_FACTOR_ANY }, \
    .max_len    = TX_LEN_UNLIMITED, \
    .init       = TX_NAME(ff_tx_mdct_pfa_init), \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL, \
    .prio       = FF_TX_PRIO_BASE, \
#define DECL_COMP_MDCT(N) \
static void TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd)(AVTXContext *s, void *_dst, \
                                                void *_src, ptrdiff_t stride) \
    TXComplex fft##N##in[N]; \
    TXSample *src = _src, *dst = _dst; \
    TXComplex *exp = s->exp, tmp; \
    const int m = s->sub->len; \
    const int len4 = N*m; \
    const int len3 = len4 * 3; \
    const int len8 = s->len >> 2; \
    const int *in_map = s->map, *out_map = in_map + N*m; \
    const int *sub_map = s->sub->map; \
    stride /= sizeof(*dst); \
    for (int i = 0; i < m; i++) { \
        for (int j = 0; j < N; j++) { \
            const int k = in_map[i*N + j]; \
            tmp.re = FOLD(-src[ len4 + k],  src[1*len4 - 1 - k]); \
            tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]); \
            tmp.re = FOLD(-src[ len4 + k], -src[5*len4 - 1 - k]); \
            tmp.im = FOLD( src[-len4 + k], -src[1*len3 - 1 - k]); \
            CMUL(fft##N##in[j].im, fft##N##in[j].re, tmp.re, tmp.im, \
                 exp[k >> 1].re, exp[k >> 1].im); \
        fft##N(s->tmp + sub_map[i], fft##N##in, m); \
    for (int i = 0; i < N; i++) \
        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex)); \
    for (int i = 0; i < len8; i++) { \
        const int i0 = len8 + i, i1 = len8 - i - 1; \
        const int s0 = out_map[i0], s1 = out_map[i1]; \
        TXComplex src1 = { s->tmp[s1].re, s->tmp[s1].im }; \
        TXComplex src0 = { s->tmp[s0].re, s->tmp[s0].im }; \
        CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im, \
             exp[i0].im, exp[i0].re); \
        CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im, \
             exp[i1].im, exp[i1].re); \
static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd_def) = { \
    .name       = TX_NAME_STR("mdct_pfa_" #N "xM_fwd"), \
    .function   = TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd), \
    .type       = TX_TYPE(MDCT), \
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY, \
    .factors    = { N, TX_FACTOR_ANY }, \
    .max_len    = TX_LEN_UNLIMITED, \
    .init       = TX_NAME(ff_tx_mdct_pfa_init), \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL, \
    .prio       = FF_TX_PRIO_BASE, \
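/* Instantiated for each supported leading factor; the codelet table at the
 * bottom of this file references mdct_pfa_3xM through mdct_pfa_15xM, in both
 * forward and inverse variants. */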
    s->scale_d = *((SCALE_TYPE *)scale);
    s->scale_f = s->scale_d;

    if (!(s->exp = av_mallocz((8 + 2*len4)*sizeof(*s->exp))))

    tab = (TXSample *)s->exp;

    m = (inv ? 2*s->scale_d : s->scale_d);

    *tab++ = RESCALE((inv ? 0.5 : 1.0) * m);
    *tab++ = RESCALE(inv ? 0.5*m : 1.0*m);
    *tab++ = RESCALE( m);
    *tab++ = RESCALE(-m);

    *tab++ = RESCALE( (0.5 - 0.0) * m);

    *tab++ = 1 / s->scale_f;

    *tab++ = RESCALE( (0.0 - 0.5) * m);
    *tab++ = RESCALE( (0.5 - inv) * m);
    *tab++ = RESCALE(-(0.5 - inv) * m);

    for (int i = 0; i < len4; i++)
        *tab++ = RESCALE(cos(i*f));

    tab = ((TXSample *)s->exp) + len4 + 8;

    for (int i = 0; i < len4; i++)
        *tab++ = RESCALE(cos(((len - i*4)/4.0)*f)) * (inv ? 1 : -1);
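    /* Layout of s->exp for the RDFTs: eight scale/sign factors (fact[0..7]
     * in the transforms below) followed by a quarter-length cos table and a
     * second table derived from it for the sine terms. */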
#define DECL_RDFT(n, inv) \
static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst, \
                                     void *_src, ptrdiff_t stride) \
    const int len2 = s->len >> 1; \
    const int len4 = s->len >> 2; \
    const TXSample *fact = (void *)s->exp; \
    const TXSample *tcos = fact + 8; \
    const TXSample *tsin = tcos + len4; \
    TXComplex *data = inv ? _src : _dst; \
    s->fn[0](&s->sub[0], data, _src, sizeof(TXComplex)); \
        data[0].im = data[len2].re; \
    t[0].re = data[0].re; \
    data[0].re = t[0].re + data[0].im; \
    data[0].im = t[0].re - data[0].im; \
    data[   0].re = MULT(fact[0], data[   0].re); \
    data[   0].im = MULT(fact[1], data[   0].im); \
    data[len4].re = MULT(fact[2], data[len4].re); \
    data[len4].im = MULT(fact[3], data[len4].im); \
    for (int i = 1; i < len4; i++) { \
        t[0].re = MULT(fact[4], (data[i].re + data[len2 - i].re)); \
        t[0].im = MULT(fact[5], (data[i].im - data[len2 - i].im)); \
        t[1].re = MULT(fact[6], (data[i].im + data[len2 - i].im)); \
        t[1].im = MULT(fact[7], (data[i].re - data[len2 - i].re)); \
        CMUL(t[2].re, t[2].im, t[1].re, t[1].im, tcos[i], tsin[i]); \
        data[       i].re = t[0].re + t[2].re; \
        data[       i].im = t[2].im - t[0].im; \
        data[len2 - i].re = t[0].re - t[2].re; \
        data[len2 - i].im = t[2].im + t[0].im; \
        s->fn[0](&s->sub[0], _dst, data, sizeof(TXComplex)); \
        data[len2].re = data[0].im; \
        data[   0].im = data[len2].im = 0; \
static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = { \
    .name       = TX_NAME_STR("rdft_" #n), \
    .function   = TX_NAME(ff_tx_rdft_ ##n), \
    .type       = TX_TYPE(RDFT), \
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE | \
                  (inv ? FF_TX_INVERSE_ONLY : FF_TX_FORWARD_ONLY), \
    .factors    = { 4, TX_FACTOR_ANY }, \
    .max_len    = TX_LEN_UNLIMITED, \
    .init       = TX_NAME(ff_tx_rdft_init), \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL, \
    .prio       = FF_TX_PRIO_BASE, \
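/* The full file instantiates this as DECL_RDFT(r2c, 0) for the forward
 * real-to-complex transform and DECL_RDFT(c2r, 1) for its inverse. */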
#define DECL_RDFT_HALF(n, mode, mod2) \
static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst, \
                                     void *_src, ptrdiff_t stride) \
    const int len = s->len; \
    const int len2 = len >> 1; \
    const int len4 = len >> 2; \
    const int aligned_len4 = FFALIGN(len, 4)/4; \
    const TXSample *fact = (void *)s->exp; \
    const TXSample *tcos = fact + 8; \
    const TXSample *tsin = tcos + aligned_len4; \
    TXComplex *data = _dst; \
    TXSample *out = _dst; \
    av_unused TXSample tmp_mid; \
    s->fn[0](&s->sub[0], _dst, _src, sizeof(TXComplex)); \
    tmp_dc = data[0].re; \
    data[   0].re = tmp_dc + data[0].im; \
    tmp_dc        = tmp_dc - data[0].im; \
    data[   0].re = MULT(fact[0], data[   0].re); \
    tmp_dc        = MULT(fact[1], tmp_dc); \
    data[len4].re = MULT(fact[2], data[len4].re); \
    data[len4].im = MULT(fact[3], data[len4].im); \
    sl = data[len4 + 1]; \
    if (mode == AV_TX_REAL_TO_REAL) \
        tmp[0] = MULT(fact[4], (sf.re + sl.re)); \
    else \
        tmp[0] = MULT(fact[5], (sf.im - sl.im)); \
    tmp[1] = MULT(fact[6], (sf.im + sl.im)); \
    tmp[2] = MULT(fact[7], (sf.re - sl.re)); \
    if (mode == AV_TX_REAL_TO_REAL) { \
        tmp[3]  = tmp[1]*tcos[len4] - tmp[2]*tsin[len4]; \
        tmp_mid = (tmp[0] - tmp[3]); \
    } else { \
        tmp[3]  = tmp[1]*tsin[len4] + tmp[2]*tcos[len4]; \
        tmp_mid = (tmp[0] + tmp[3]); \
    } \
    for (int i = 1; i <= len4; i++) { \
        TXComplex sf = data[i]; \
        TXComplex sl = data[len2 - i]; \
        if (mode == AV_TX_REAL_TO_REAL) \
            tmp[0] = MULT(fact[4], (sf.re + sl.re)); \
        else \
            tmp[0] = MULT(fact[5], (sf.im - sl.im)); \
        tmp[1] = MULT(fact[6], (sf.im + sl.im)); \
        tmp[2] = MULT(fact[7], (sf.re - sl.re)); \
        if (mode == AV_TX_REAL_TO_REAL) { \
            tmp[3] = tmp[1]*tcos[i] - tmp[2]*tsin[i]; \
            out[i]       = (tmp[0] + tmp[3]); \
            out[len - i] = (tmp[0] - tmp[3]); \
        } else { \
            tmp[3] = tmp[1]*tsin[i] + tmp[2]*tcos[i]; \
            out[i - 1]       = (tmp[3] - tmp[0]); \
            out[len - i - 1] = (tmp[0] + tmp[3]); \
        } \
    } \
    for (int i = 1; i < (len4 + (mode == AV_TX_REAL_TO_IMAGINARY)); i++) \
        out[len2 - i] = out[len - i]; \
    if (mode == AV_TX_REAL_TO_REAL) { \
        out[len2] = tmp_dc; \
        out[len4 + 1] = tmp_mid * fact[5]; \
    } else if (mod2) { \
        out[len4] = tmp_mid; \
    } \
static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = { \
    .name       = TX_NAME_STR("rdft_" #n), \
    .function   = TX_NAME(ff_tx_rdft_ ##n), \
    .type       = TX_TYPE(RDFT), \
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | mode | \
                  FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY, \
    .factors    = { 2 + 2*(!mod2), TX_FACTOR_ANY }, \
    .min_len    = 2 + 2*(!mod2), \
    .max_len    = TX_LEN_UNLIMITED, \
    .init       = TX_NAME(ff_tx_rdft_init), \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL, \
    .prio       = FF_TX_PRIO_BASE, \
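/* Half-output RDFT variants: AV_TX_REAL_TO_REAL returns only the real
 * coefficients, AV_TX_REAL_TO_IMAGINARY only the imaginary ones; the mod2
 * forms (referenced in the codelet table below) handle lengths that are
 * even but not divisible by 4, per the factors/min_len fields above. */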
    SCALE_TYPE rsc = *((SCALE_TYPE *)scale);

    tab = (TXSample *)s->exp;

    for (int i = 0; i < len; i++)
        tab[i] = RESCALE(cos(i*freq)*(!inv + 1));

    for (int i = 0; i < len/2; i++)
        tab[len + i] = RESCALE(0.5 / sin((2*i + 1)*freq));

    for (int i = 0; i < len/2; i++)
        tab[len + i] = RESCALE(cos((len - 2*i - 1)*freq));
static void TX_NAME(ff_tx_dctII)(AVTXContext *s, void *_dst,
                                 void *_src, ptrdiff_t stride)
    TXSample *dst = _dst;
    TXSample *src = _src;
    const int len = s->len;
    const int len2 = len >> 1;
    const TXSample *exp = (void *)s->exp;

    TXSample tmp1, tmp2;

    for (int i = 0; i < len2; i++) {
        TXSample in1 = src[i];
        TXSample in2 = src[len - i - 1];

        tmp2 = (tmp2 + 0x40000000) >> 31;

        tmp1 = (in1 + in2)*0.5;
        tmp2 = (in1 - in2)*s;

        src[i]           = tmp1 + tmp2;
        src[len - i - 1] = tmp1 - tmp2;
    for (int i = len - 2; i > 0; i -= 2) {

    tmp1 = ((int64_t)exp[0]) * ((int64_t)dst[0]);
    dst[0] = (tmp1 + 0x40000000) >> 31;

    dst[0] = exp[0] * dst[0];
static void TX_NAME(ff_tx_dctIII)(AVTXContext *s, void *_dst,
                                  void *_src, ptrdiff_t stride)
    TXSample *dst = _dst;
    TXSample *src = _src;
    const int len = s->len;
    const int len2 = len >> 1;
    const TXSample *exp = (void *)s->exp;

    int64_t tmp1, tmp2 = src[len - 1];
    tmp2 = (2*tmp2 + 0x40000000) >> 31;

    TXSample tmp1, tmp2 = 2*src[len - 1];

    for (int i = len - 2; i >= 2; i -= 2) {
        TXSample val1 = src[i - 0];
        TXSample val2 = src[i - 1] - src[i + 1];

    s->fn[0](&s->sub[0], dst, src, sizeof(float));

    for (int i = 0; i < len2; i++) {
        TXSample in1 = dst[i];
        TXSample in2 = dst[len - i - 1];

        tmp2 = (tmp2 + 0x40000000) >> 31;

        dst[i]           = tmp1 + tmp2;
        dst[len - i - 1] = tmp1 - tmp2;
    .name = TX_NAME_STR("dctII"),

    .name = TX_NAME_STR("dctIII"),
    SCALE_TYPE rsc = *((SCALE_TYPE *)scale);
static void TX_NAME(ff_tx_dctI)(AVTXContext *s, void *_dst,
                                void *_src, ptrdiff_t stride)
    TXSample *dst = _dst;
    TXSample *src = _src;
    const int len = s->len - 1;
    TXSample *tmp = (TXSample *)s->tmp;

    stride /= sizeof(TXSample);

    for (int i = 0; i < len; i++)

    s->fn[0](&s->sub[0], dst, tmp, sizeof(TXSample));
static void TX_NAME(ff_tx_dstI)(AVTXContext *s, void *_dst,
                                void *_src, ptrdiff_t stride)
    TXSample *dst = _dst;
    TXSample *src = _src;
    const int len = s->len + 1;
    TXSample *tmp = (void *)s->tmp;

    stride /= sizeof(TXSample);

    for (int i = 1; i < len; i++) {

    s->fn[0](&s->sub[0], dst, tmp, sizeof(float));
    .name = TX_NAME_STR("dctI"),

    .name = TX_NAME_STR("dstI"),
    int len4 = s->len >> 1;
    double scale = s->scale_d;
    const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0;
    size_t alloc = pre_tab ? 2*len4 : len4;

    for (int i = 0; i < len4; i++) {

    for (int i = 0; i < len4; i++)
        s->exp[i] = s->exp[len4 + pre_tab[i]];
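    /* Twiddles for the fast MDCT: the 1/8 in theta is the standard MDCT
     * phase offset, and a negative scale offsets the angle by len4 for the
     * opposite transform direction. */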
    &TX_NAME(ff_tx_fft128_ns_def),
    &TX_NAME(ff_tx_fft256_ns_def),
    &TX_NAME(ff_tx_fft512_ns_def),
    &TX_NAME(ff_tx_fft1024_ns_def),
    &TX_NAME(ff_tx_fft2048_ns_def),
    &TX_NAME(ff_tx_fft4096_ns_def),
    &TX_NAME(ff_tx_fft8192_ns_def),
    &TX_NAME(ff_tx_fft16384_ns_def),
    &TX_NAME(ff_tx_fft32768_ns_def),
    &TX_NAME(ff_tx_fft65536_ns_def),
    &TX_NAME(ff_tx_fft131072_ns_def),
    &TX_NAME(ff_tx_fft262144_ns_def),
    &TX_NAME(ff_tx_fft524288_ns_def),
    &TX_NAME(ff_tx_fft1048576_ns_def),
    &TX_NAME(ff_tx_fft2097152_ns_def),
    &TX_NAME(ff_tx_fft_inplace_def),
    &TX_NAME(ff_tx_fft_inplace_small_def),
    &TX_NAME(ff_tx_fft_pfa_ns_def),
    &TX_NAME(ff_tx_fft_naive_def),
    &TX_NAME(ff_tx_fft_naive_small_def),
    &TX_NAME(ff_tx_mdct_pfa_3xM_fwd_def),
    &TX_NAME(ff_tx_mdct_pfa_5xM_fwd_def),
    &TX_NAME(ff_tx_mdct_pfa_7xM_fwd_def),
    &TX_NAME(ff_tx_mdct_pfa_9xM_fwd_def),
    &TX_NAME(ff_tx_mdct_pfa_15xM_fwd_def),
    &TX_NAME(ff_tx_mdct_pfa_3xM_inv_def),
    &TX_NAME(ff_tx_mdct_pfa_5xM_inv_def),
    &TX_NAME(ff_tx_mdct_pfa_7xM_inv_def),
    &TX_NAME(ff_tx_mdct_pfa_9xM_inv_def),
    &TX_NAME(ff_tx_mdct_pfa_15xM_inv_def),
    &TX_NAME(ff_tx_mdct_naive_fwd_def),
    &TX_NAME(ff_tx_mdct_naive_inv_def),
    &TX_NAME(ff_tx_mdct_inv_full_def),
    &TX_NAME(ff_tx_rdft_r2r_mod2_def),
    &TX_NAME(ff_tx_rdft_r2i_mod2_def),
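Below is a minimal usage sketch (not part of this file) showing how these
codelets are reached through the public libavutil/tx.h API: av_tx_init()
picks the highest-priority codelet whose type, flags and factors match the
request, and returns a function pointer that dispatches to it.

    #include <libavutil/error.h>
    #include <libavutil/mem.h>
    #include <libavutil/tx.h>

    int example_fft(void)
    {
        AVTXContext *ctx = NULL;
        av_tx_fn tx;
        float scale = 1.0f;
        AVComplexFloat *in  = av_calloc(1024, sizeof(*in));
        AVComplexFloat *out = av_calloc(1024, sizeof(*out));
        int ret = (in && out) ? 0 : AVERROR(ENOMEM);

        /* forward (inv = 0) single-precision FFT of length 1024 */
        if (ret >= 0)
            ret = av_tx_init(&ctx, &tx, AV_TX_FLOAT_FFT, 0, 1024, &scale, 0);
        if (ret >= 0) {
            /* ... fill in[] with samples ... */
            tx(ctx, out, in, sizeof(AVComplexFloat)); /* stride in bytes */
        }

        av_tx_uninit(&ctx);
        av_free(in);
        av_free(out);
        return ret;
    }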