#define TABLE_DEF(name, size) \
    DECLARE_ALIGNED(32, TXSample, TX_TAB(ff_tx_tab_ ##name))[size]

#define SR_POW2_TABLES \
    /* ... SR_TABLE(len) invocations for each power-of-two length, elided ... */

/* First expansion: declare one twiddle table per power-of-two length */
#define SR_TABLE(len) \
    TABLE_DEF(len, len/4 + 1);
/* ... */

/* Second expansion: define one init function per table */
#define SR_TABLE(len)                                    \
static av_cold void TX_TAB(ff_tx_init_tab_ ##len)(void)  \
{                                                        \
    double freq = 2*M_PI/len;                            \
    TXSample *tab = TX_TAB(ff_tx_tab_ ##len);            \
                                                         \
    for (int i = 0; i < len/4; i++)                      \
        *tab++ = RESCALE(cos(i*freq));                   \
    /* ... */                                            \
}
/* ... */

/* Third expansion: an array of the init functions (sr_tabs_init_funcs) */
#define SR_TABLE(len) TX_TAB(ff_tx_init_tab_ ##len),
/* ... */

/* Fourth expansion: one AVOnce ticket per table (sr_tabs_init_once) */
#define SR_TABLE(len) AV_ONCE_INIT,
/* ... */
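/* [Editor's note] The SR_TABLE/SR_POW2_TABLES pair above is an X-macro:
 * a single list of lengths is expanded several times with different
 * definitions of SR_TABLE to emit table declarations, init functions, a
 * function-pointer array and a once-control array. A minimal,
 * self-contained sketch of the same pattern (all demo_* names are
 * hypothetical, not part of this file): */
#include <stdio.h>

#define DEMO_LENGTHS \
    DEMO_ENTRY(8)    \
    DEMO_ENTRY(16)   \
    DEMO_ENTRY(32)

/* Expansion 1: one init function per length */
#define DEMO_ENTRY(len) \
    static void demo_init_##len(void) { printf("init %d\n", len); }
DEMO_LENGTHS
#undef DEMO_ENTRY

/* Expansion 2: an array of those functions, in the same order */
#define DEMO_ENTRY(len) demo_init_##len,
static void (*const demo_init_funcs[])(void) = { DEMO_LENGTHS };
#undef DEMO_ENTRY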
static av_cold void TX_TAB(ff_tx_init_tab_53)(void)
{
    /* 5-point twiddles, each value stored twice (a layout convenience
     * for the SIMD codelets) */
    TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI /  5));
    TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI /  5));
    TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10));
    TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10));
    TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI /  5));
    TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI /  5));
    TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10));
    TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10));

    /* 3-point twiddles */
    TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12));
    TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12));
    TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI /  6));
    TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI /  6));
}
static av_cold void TX_TAB(ff_tx_init_tab_7)(void)
{
    TX_TAB(ff_tx_tab_7)[0] = RESCALE(cos(2 * M_PI /  7));
    TX_TAB(ff_tx_tab_7)[1] = RESCALE(sin(2 * M_PI /  7));
    TX_TAB(ff_tx_tab_7)[2] = RESCALE(sin(2 * M_PI / 28));
    TX_TAB(ff_tx_tab_7)[3] = RESCALE(cos(2 * M_PI / 28));
    TX_TAB(ff_tx_tab_7)[4] = RESCALE(cos(2 * M_PI / 14));
    TX_TAB(ff_tx_tab_7)[5] = RESCALE(sin(2 * M_PI / 14));
}
static av_cold void TX_TAB(ff_tx_init_tab_9)(void)
{
    TX_TAB(ff_tx_tab_9)[0] = RESCALE(cos(2 * M_PI /  3));
    TX_TAB(ff_tx_tab_9)[1] = RESCALE(sin(2 * M_PI /  3));
    TX_TAB(ff_tx_tab_9)[2] = RESCALE(cos(2 * M_PI /  9));
    TX_TAB(ff_tx_tab_9)[3] = RESCALE(sin(2 * M_PI /  9));
    TX_TAB(ff_tx_tab_9)[4] = RESCALE(cos(2 * M_PI / 36));
    TX_TAB(ff_tx_tab_9)[5] = RESCALE(sin(2 * M_PI / 36));
    TX_TAB(ff_tx_tab_9)[6] = TX_TAB(ff_tx_tab_9)[2] + TX_TAB(ff_tx_tab_9)[5];
    TX_TAB(ff_tx_tab_9)[7] = TX_TAB(ff_tx_tab_9)[3] - TX_TAB(ff_tx_tab_9)[4];
}
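/* [Editor's note] This file is a template: TXSample, TX_TAB() and
 * RESCALE() are supplied by the file that includes it, once per
 * precision (float, double, int32). For the floating-point variants
 * RESCALE is essentially a cast; the fixed-point variant must map a
 * [-1.0, 1.0] twiddle into Q31. A hypothetical stand-in showing the
 * idea (not the template's actual definition): */
#include <math.h>
#include <stdint.h>

static int32_t demo_rescale_q31(double x)
{
    double v = x * 2147483648.0;               /* scale by 2^31 */
    if (v >  2147483647.0) v =  2147483647.0;  /* clip to int32 range */
    if (v < -2147483648.0) v = -2147483648.0;
    return (int32_t)llrint(v);
}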
av_cold void TX_TAB(ff_tx_init_tabs)(int len)
{
    /* ... power-of-two tables, initialized once each; the smallest
     * table covers length 8 = 2^3, hence the -3 bias ... */
    int idx = factor_2 - 3;
    for (int i = 0; i <= idx; i++)
        ff_thread_once(&sr_tabs_init_once[i], sr_tabs_init_funcs[i]);
    /* ... */
}
static av_always_inline void fft3(TXComplex *out, TXComplex *in,
                                  ptrdiff_t stride)
{
    /* ... */
    const TXSample *tab = TX_TAB(ff_tx_tab_53);
    /* ... input butterflies and DC output, elided ... */

#ifdef TX_INT32
    /* Fixed-point path: widen to 64 bits, round back to Q31 on store */
    mtmp[0] = (int64_t)tab[ 8] * tmp[1].re;
    mtmp[1] = (int64_t)tab[ 9] * tmp[1].im;
    mtmp[2] = (int64_t)tab[10] * tmp[2].re;
    mtmp[3] = (int64_t)tab[10] * tmp[2].im;
    out[1*stride].re = tmp[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31);
    out[1*stride].im = tmp[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31);
    out[2*stride].re = tmp[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31);
    out[2*stride].im = tmp[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31);
#else
    /* ... floating-point path, elided ... */
#endif
}
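/* [Editor's note] The `(acc + 0x40000000) >> 31` idiom above is a
 * round-to-nearest narrowing of a Q31*Q31 product held in 64 bits: the
 * product is in Q62, and adding 2^30 before shifting right by 31
 * rounds instead of truncating. Standalone sketch: */
#include <stdint.h>

static int32_t demo_q31_mul(int32_t a, int32_t b)
{
    int64_t acc = (int64_t)a * b;               /* Q62 product */
    return (int32_t)((acc + 0x40000000) >> 31); /* back to Q31, rounded */
}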
#define DECL_FFT5(NAME, D0, D1, D2, D3, D4)                         \
static av_always_inline void NAME(TXComplex *out, TXComplex *in,    \
                                  ptrdiff_t stride)                 \
{                                                                   \
    TXComplex dc, z0[4], t[6];                                      \
    const TXSample *tab = TX_TAB(ff_tx_tab_53);                     \
                                                                    \
    dc = in[0];                                                     \
    BF(t[1].im, t[0].re, in[1].re, in[4].re);                       \
    BF(t[1].re, t[0].im, in[1].im, in[4].im);                       \
    BF(t[3].im, t[2].re, in[2].re, in[3].re);                       \
    BF(t[3].re, t[2].im, in[2].im, in[3].im);                       \
                                                                    \
    out[D0*stride].re = dc.re + t[0].re + t[2].re;                  \
    out[D0*stride].im = dc.im + t[0].im + t[2].im;                  \
                                                                    \
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
                                                                    \
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                       \
    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                       \
                                                                    \
    out[D1*stride].re = dc.re + z0[3].re;                           \
    out[D1*stride].im = dc.im + z0[0].im;                           \
    out[D2*stride].re = dc.re + z0[2].re;                           \
    out[D2*stride].im = dc.im + z0[1].im;                           \
    out[D3*stride].re = dc.re + z0[1].re;                           \
    out[D3*stride].im = dc.im + z0[2].im;                           \
    out[D4*stride].re = dc.re + z0[0].re;                           \
    out[D4*stride].im = dc.im + z0[3].im;                           \
}
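/* [Editor's note] D0..D4 bake an output permutation into the generated
 * 5-point transform, so PFA composites (e.g. the 15-point transform)
 * can scatter each 5-point result straight to its final interleaved
 * slot. A hypothetical instantiation with the identity layout: */
DECL_FFT5(demo_fft5, 0, 1, 2, 3, 4)
/* demo_fft5(out, in, stride) now writes out[0]..out[4] in natural
 * order; permuted D values write the same results scattered. */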
static av_always_inline void fft7(TXComplex *out, TXComplex *in,
                                  ptrdiff_t stride)
{
    /* ... declarations and input butterflies, elided ... */

    out[0*stride].re = dc.re + t[0].re + t[2].re + t[4].re;
    out[0*stride].im = dc.im + t[0].im + t[2].im + t[4].im;

#ifdef TX_INT32
    mtmp[ 0] = ((int64_t)tab[0].re)*t[0].re - ((int64_t)tab[2].re)*t[4].re;
    mtmp[ 1] = ((int64_t)tab[0].re)*t[4].re - ((int64_t)tab[1].re)*t[0].re;
    mtmp[ 2] = ((int64_t)tab[0].re)*t[2].re - ((int64_t)tab[2].re)*t[0].re;
    mtmp[ 3] = ((int64_t)tab[0].re)*t[0].im - ((int64_t)tab[1].re)*t[2].im;
    mtmp[ 4] = ((int64_t)tab[0].re)*t[4].im - ((int64_t)tab[1].re)*t[0].im;
    mtmp[ 5] = ((int64_t)tab[0].re)*t[2].im - ((int64_t)tab[2].re)*t[0].im;

    mtmp[ 6] = ((int64_t)tab[2].im)*t[1].im + ((int64_t)tab[1].im)*t[5].im;
    mtmp[ 7] = ((int64_t)tab[0].im)*t[5].im + ((int64_t)tab[2].im)*t[3].im;
    mtmp[ 8] = ((int64_t)tab[2].im)*t[5].im + ((int64_t)tab[1].im)*t[3].im;
    mtmp[ 9] = ((int64_t)tab[0].im)*t[1].re + ((int64_t)tab[1].im)*t[3].re;
    mtmp[10] = ((int64_t)tab[2].im)*t[3].re + ((int64_t)tab[0].im)*t[5].re;
    mtmp[11] = ((int64_t)tab[2].im)*t[1].re + ((int64_t)tab[1].im)*t[5].re;

    z[0].re = (int32_t)(mtmp[ 0] - ((int64_t)tab[1].re)*t[2].re + 0x40000000 >> 31);
    z[1].re = (int32_t)(mtmp[ 1] - ((int64_t)tab[2].re)*t[2].re + 0x40000000 >> 31);
    z[2].re = (int32_t)(mtmp[ 2] - ((int64_t)tab[1].re)*t[4].re + 0x40000000 >> 31);
    z[0].im = (int32_t)(mtmp[ 3] - ((int64_t)tab[2].re)*t[4].im + 0x40000000 >> 31);
    z[1].im = (int32_t)(mtmp[ 4] - ((int64_t)tab[2].re)*t[2].im + 0x40000000 >> 31);
    z[2].im = (int32_t)(mtmp[ 5] - ((int64_t)tab[1].re)*t[4].im + 0x40000000 >> 31);

    t[0].re = (int32_t)(mtmp[ 6] - ((int64_t)tab[0].im)*t[3].im + 0x40000000 >> 31);
    t[2].re = (int32_t)(mtmp[ 7] - ((int64_t)tab[1].im)*t[1].im + 0x40000000 >> 31);
    t[4].re = (int32_t)(mtmp[ 8] + ((int64_t)tab[0].im)*t[1].im + 0x40000000 >> 31);
    t[0].im = (int32_t)(mtmp[ 9] + ((int64_t)tab[2].im)*t[5].re + 0x40000000 >> 31);
    t[2].im = (int32_t)(mtmp[10] - ((int64_t)tab[1].im)*t[1].re + 0x40000000 >> 31);
    t[4].im = (int32_t)(mtmp[11] - ((int64_t)tab[0].im)*t[3].re + 0x40000000 >> 31);
#else
    z[0].re = tab[0].re*t[0].re - tab[2].re*t[4].re - tab[1].re*t[2].re;
    z[1].re = tab[0].re*t[4].re - tab[1].re*t[0].re - tab[2].re*t[2].re;
    z[2].re = tab[0].re*t[2].re - tab[2].re*t[0].re - tab[1].re*t[4].re;
    z[0].im = tab[0].re*t[0].im - tab[1].re*t[2].im - tab[2].re*t[4].im;
    z[1].im = tab[0].re*t[4].im - tab[1].re*t[0].im - tab[2].re*t[2].im;
    z[2].im = tab[0].re*t[2].im - tab[2].re*t[0].im - tab[1].re*t[4].im;

    t[0].re = tab[2].im*t[1].im + tab[1].im*t[5].im - tab[0].im*t[3].im;
    t[2].re = tab[0].im*t[5].im + tab[2].im*t[3].im - tab[1].im*t[1].im;
    t[4].re = tab[2].im*t[5].im + tab[1].im*t[3].im + tab[0].im*t[1].im;
    t[0].im = tab[0].im*t[1].re + tab[1].im*t[3].re + tab[2].im*t[5].re;
    t[2].im = tab[2].im*t[3].re + tab[0].im*t[5].re - tab[1].im*t[1].re;
    t[4].im = tab[2].im*t[1].re + tab[1].im*t[5].re - tab[0].im*t[3].re;
#endif
    /* ... final output butterflies, elided ... */
}
static av_always_inline void fft9(TXComplex *out, TXComplex *in,
                                  ptrdiff_t stride)
{
    /* ... declarations and input butterflies, elided ... */

    w[0].re = t[0].re - t[6].re;
    w[0].im = t[0].im - t[6].im;
    w[1].re = t[2].re - t[6].re;
    w[1].im = t[2].im - t[6].im;
    w[2].re = t[1].re - t[7].re;
    w[2].im = t[1].im - t[7].im;
    w[3].re = t[3].re + t[7].re;
    w[3].im = t[3].im + t[7].im;

    z[0].re = dc.re + t[4].re;
    z[0].im = dc.im + t[4].im;

    z[1].re = t[0].re + t[2].re + t[6].re;
    z[1].im = t[0].im + t[2].im + t[6].im;

    /* ... */

#ifdef TX_INT32
    mtmp[0] = t[1].re - t[3].re + t[7].re;
    mtmp[1] = t[1].im - t[3].im + t[7].im;

    y[3].re = (int32_t)(((int64_t)tab[0].im)*mtmp[0] + 0x40000000 >> 31);
    y[3].im = (int32_t)(((int64_t)tab[0].im)*mtmp[1] + 0x40000000 >> 31);

    mtmp[0] = (int32_t)(((int64_t)tab[0].re)*z[1].re + 0x40000000 >> 31);
    mtmp[1] = (int32_t)(((int64_t)tab[0].re)*z[1].im + 0x40000000 >> 31);
    mtmp[2] = (int32_t)(((int64_t)tab[0].re)*t[4].re + 0x40000000 >> 31);
    mtmp[3] = (int32_t)(((int64_t)tab[0].re)*t[4].im + 0x40000000 >> 31);

    x[3].re = z[0].re  + (int32_t)mtmp[0];
    x[3].im = z[0].im  + (int32_t)mtmp[1];
    z[0].re = in[0].re + (int32_t)mtmp[2];
    z[0].im = in[0].im + (int32_t)mtmp[3];

    mtmp[0] = ((int64_t)tab[1].re)*w[0].re;
    mtmp[1] = ((int64_t)tab[1].re)*w[0].im;
    mtmp[2] = ((int64_t)tab[2].im)*w[0].re;
    mtmp[3] = ((int64_t)tab[2].im)*w[0].im;
    mtmp[4] = ((int64_t)tab[1].im)*w[2].re;
    mtmp[5] = ((int64_t)tab[1].im)*w[2].im;
    mtmp[6] = ((int64_t)tab[2].re)*w[2].re;
    mtmp[7] = ((int64_t)tab[2].re)*w[2].im;

    x[1].re = (int32_t)(mtmp[0] + ((int64_t)tab[2].im)*w[1].re + 0x40000000 >> 31);
    x[1].im = (int32_t)(mtmp[1] + ((int64_t)tab[2].im)*w[1].im + 0x40000000 >> 31);
    x[2].re = (int32_t)(mtmp[2] - ((int64_t)tab[3].re)*w[1].re + 0x40000000 >> 31);
    x[2].im = (int32_t)(mtmp[3] - ((int64_t)tab[3].re)*w[1].im + 0x40000000 >> 31);
    y[1].re = (int32_t)(mtmp[4] + ((int64_t)tab[2].re)*w[3].re + 0x40000000 >> 31);
    y[1].im = (int32_t)(mtmp[5] + ((int64_t)tab[2].re)*w[3].im + 0x40000000 >> 31);
    y[2].re = (int32_t)(mtmp[6] - ((int64_t)tab[3].im)*w[3].re + 0x40000000 >> 31);
    y[2].im = (int32_t)(mtmp[7] - ((int64_t)tab[3].im)*w[3].im + 0x40000000 >> 31);

    y[0].re = (int32_t)(((int64_t)tab[0].im)*t[5].re + 0x40000000 >> 31);
    y[0].im = (int32_t)(((int64_t)tab[0].im)*t[5].im + 0x40000000 >> 31);
#else
    y[3].re = tab[0].im*(t[1].re - t[3].re + t[7].re);
    y[3].im = tab[0].im*(t[1].im - t[3].im + t[7].im);

    x[3].re = z[0].re + tab[0].re*z[1].re;
    x[3].im = z[0].im + tab[0].re*z[1].im;
    z[0].re = dc.re   + tab[0].re*t[4].re;
    z[0].im = dc.im   + tab[0].re*t[4].im;

    x[1].re = tab[1].re*w[0].re + tab[2].im*w[1].re;
    x[1].im = tab[1].re*w[0].im + tab[2].im*w[1].im;
    x[2].re = tab[2].im*w[0].re - tab[3].re*w[1].re;
    x[2].im = tab[2].im*w[0].im - tab[3].re*w[1].im;
    y[1].re = tab[1].im*w[2].re + tab[2].re*w[3].re;
    y[1].im = tab[1].im*w[2].im + tab[2].re*w[3].im;
    y[2].re = tab[2].re*w[2].re - tab[3].im*w[3].re;
    y[2].im = tab[2].re*w[2].im - tab[3].im*w[3].im;

    y[0].re = tab[0].im*t[5].re;
    y[0].im = tab[0].im*t[5].im;
#endif

    x[4].re = x[1].re + x[2].re;
    x[4].im = x[1].im + x[2].im;

    y[4].re = y[1].re - y[2].re;
    y[4].im = y[1].im - y[2].im;
    x[1].re = z[0].re + x[1].re;
    x[1].im = z[0].im + x[1].im;
    y[1].re = y[0].re + y[1].re;
    y[1].im = y[0].im + y[1].im;
    x[2].re = z[0].re + x[2].re;
    x[2].im = z[0].im + x[2].im;
    y[2].re = y[2].re - y[0].re;
    y[2].im = y[2].im - y[0].im;
    x[4].re = z[0].re - x[4].re;
    x[4].im = z[0].im - x[4].im;
    y[4].re = y[0].re - y[4].re;
    y[4].im = y[0].im - y[4].im;

    /* ... output stores, elided ... */
}
static av_always_inline void fft15(TXComplex *out, TXComplex *in,
                                   ptrdiff_t stride)
{
    /* ... */
    for (int i = 0; i < 5; i++)
        fft3(tmp + i, in + i*3, 5);
    /* ... permuted 5-point passes, elided ... */
}
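/* [Editor's note] fft15 above composes fft3 and the fft5 variants via
 * the prime-factor (Good-Thomas) algorithm: because gcd(3, 5) = 1, a
 * 15-point DFT splits into a 3x5 grid of smaller DFTs with no twiddle
 * multiplies between stages, only index permutations. A standalone
 * sketch of the classic CRT-style input mapping (an illustration, not
 * the exact map this file generates): */
#include <stdio.h>

int main(void)
{
    /* n = (5*n1 + 3*n2) mod 15 visits every input index exactly once */
    for (int n1 = 0; n1 < 3; n1++)
        for (int n2 = 0; n2 < 5; n2++)
            printf("in[%2d] -> row %d, col %d\n", (5*n1 + 3*n2) % 15, n1, n2);
    return 0;
}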
#define DECL_FACTOR_S(n)                                                     \
static void TX_NAME(ff_tx_fft##n)(AVTXContext *s, void *dst,                 \
                                  void *src, ptrdiff_t stride)               \
{                                                                            \
    fft##n((TXComplex *)dst, (TXComplex *)src, stride / sizeof(TXComplex));  \
}                                                                            \
static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = {                  \
    .name       = TX_NAME_STR("fft" #n "_ns"),                               \
    .function   = TX_NAME(ff_tx_fft##n),                                     \
    .type       = TX_TYPE(FFT),                                              \
    .flags      = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |                       \
                  AV_TX_UNALIGNED | FF_TX_PRESHUFFLE,                        \
    /* ... factors/lengths, elided ... */                                    \
    .init       = TX_NAME(ff_tx_fft_factor_init),                            \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                       \
    .prio       = FF_TX_PRIO_BASE,                                           \
};

#define DECL_FACTOR_F(n)                                                     \
DECL_FACTOR_S(n)                                                             \
static const FFTXCodelet TX_NAME(ff_tx_fft##n##_fwd_def) = {                 \
    .name       = TX_NAME_STR("fft" #n "_fwd"),                              \
    .function   = TX_NAME(ff_tx_fft##n),                                     \
    .type       = TX_TYPE(FFT),                                              \
    .flags      = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |                       \
                  AV_TX_UNALIGNED | FF_TX_FORWARD_ONLY,                      \
    /* ... factors/lengths, elided ... */                                    \
    .init       = TX_NAME(ff_tx_fft_factor_init),                            \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                       \
    .prio       = FF_TX_PRIO_BASE,                                           \
};
#define BUTTERFLIES(a0, a1, a2, a3)                          \
    do {                                                     \
        /* ... register loads (r0/i0/r1/i1), elided ... */   \
        BF(t3, t5, t5, t1);                                  \
        BF(a2.re, a0.re, r0, t5);                            \
        BF(a3.im, a1.im, i1, t3);                            \
        BF(t4, t6, t2, t6);                                  \
        BF(a3.re, a1.re, r1, t4);                            \
        BF(a2.im, a0.im, i0, t6);                            \
    } while (0)

#define TRANSFORM(a0, a1, a2, a3, wre, wim)                  \
    do {                                                     \
        CMUL(t1, t2, a2.re, a2.im, wre, -wim);               \
        CMUL(t5, t6, a3.re, a3.im, wre, wim);                \
        BUTTERFLIES(a0, a1, a2, a3);                         \
    } while (0)
static void TX_NAME(ff_tx_fft_sr_combine)(TXComplex *z,
                                          const TXSample *cos, int len)
{
    /* ... offsets o1/o2/o3, elided ... */
    const TXSample *wim = cos + o1 - 7;
    /* ... */

    for (int i = 0; i < len; i += 4) {
        TRANSFORM(z[0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]);
        TRANSFORM(z[2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[2], wim[5]);
        TRANSFORM(z[4], z[o1 + 4], z[o2 + 4], z[o3 + 4], cos[4], wim[3]);
        TRANSFORM(z[6], z[o1 + 6], z[o2 + 6], z[o3 + 6], cos[6], wim[1]);

        TRANSFORM(z[1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[1], wim[6]);
        TRANSFORM(z[3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[3], wim[4]);
        TRANSFORM(z[5], z[o1 + 5], z[o2 + 5], z[o3 + 5], cos[5], wim[2]);
        TRANSFORM(z[7], z[o1 + 7], z[o2 + 7], z[o3 + 7], cos[7], wim[0]);

        /* ... pointer advances, elided ... */
    }
}
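/* [Editor's note] The combine pass above is the split-radix step: it
 * merges one half-size and two quarter-size sub-FFTs using the cos
 * table and its mirrored image (wim). For contrast, the simpler
 * radix-2 combine of two half-size FFTs looks like this (double
 * precision sketch, unrelated to the macros above): */
#include <complex.h>
#include <math.h>

static void demo_radix2_combine(double complex *z, int n)
{
    /* z[0..n/2) and z[n/2..n) hold the even/odd sub-FFTs */
    for (int k = 0; k < n / 2; k++) {
        double complex w = cexp(-2.0 * I * M_PI * k / n);
        double complex e = z[k], o = w * z[k + n / 2];
        z[k]         = e + o;
        z[k + n / 2] = e - o;
    }
}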
#define DECL_SR_CODELET_DEF(n)                               \
static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = {  \
    .name       = TX_NAME_STR("fft" #n "_ns"),               \
    .function   = TX_NAME(ff_tx_fft##n##_ns),                \
    .type       = TX_TYPE(FFT),                              \
    .flags      = FF_TX_OUT_OF_PLACE | AV_TX_INPLACE |       \
                  AV_TX_UNALIGNED | FF_TX_PRESHUFFLE,        \
    /* ... factors/lengths, elided ... */                    \
    .init       = TX_NAME(ff_tx_fft_sr_codelet_init),        \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                       \
    .prio       = FF_TX_PRIO_BASE,                           \
};

/* Generate a split-radix codelet of length n from lengths n2 and n4 */
#define DECL_SR_CODELET(n, n2, n4)                                     \
static void TX_NAME(ff_tx_fft##n##_ns)(AVTXContext *s, void *_dst,     \
                                       void *_src, ptrdiff_t stride)   \
{                                                                      \
    TXComplex *src = _src;                                             \
    TXComplex *dst = _dst;                                             \
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                       \
                                                                       \
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);    \
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);    \
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);    \
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                  \
}                                                                      \
                                                                       \
DECL_SR_CODELET_DEF(n)
static void TX_NAME(ff_tx_fft2_ns)(AVTXContext *s, void *_dst,
                                   void *_src, ptrdiff_t stride)
{
    /* ... single butterfly, elided ... */
}

static void TX_NAME(ff_tx_fft4_ns)(AVTXContext *s, void *_dst,
                                   void *_src, ptrdiff_t stride)
{
    /* ... */
}

static void TX_NAME(ff_tx_fft8_ns)(AVTXContext *s, void *_dst,
                                   void *_src, ptrdiff_t stride)
{
    /* ... */
    const TXSample cos = TX_TAB(ff_tx_tab_8)[1]; /* cos(2*pi/8) */

    /* ... 4-point pass and outer butterflies, elided ... */
    TRANSFORM(dst[1], dst[3], dst[5], dst[7], cos, cos);
}

static void TX_NAME(ff_tx_fft16_ns)(AVTXContext *s, void *_dst,
                                    void *_src, ptrdiff_t stride)
{
    /* ... */
    const TXSample *cos = TX_TAB(ff_tx_tab_16);
    /* ... */
    TXSample cos_16_1 = cos[1];
    TXSample cos_16_2 = cos[2];
    TXSample cos_16_3 = cos[3];

    /* ... 8-point and two 4-point passes, elided ... */
    TRANSFORM(dst[ 2], dst[ 6], dst[10], dst[14], cos_16_2, cos_16_2);
    TRANSFORM(dst[ 1], dst[ 5], dst[ 9], dst[13], cos_16_1, cos_16_3);
    TRANSFORM(dst[ 3], dst[ 7], dst[11], dst[15], cos_16_3, cos_16_1);
}
static void TX_NAME(ff_tx_fft)(AVTXContext *s, void *_dst,
                               void *_src, ptrdiff_t stride)
{
    /* ... */
    int *map = s->sub[0].map;
    /* ... */

    /* Reorder the input via the subtransform's map, then run it */
    for (int i = 0; i < len; i++)
        dst1[i] = src[map[i]];

    s->fn[0](&s->sub[0], dst2, dst1, stride);
}

static void TX_NAME(ff_tx_fft_inplace)(AVTXContext *s, void *_dst,
                                       void *_src, ptrdiff_t stride)
{
    /* ... */
    const int *map = s->sub->map;
    const int *inplace_idx = s->map;
    int src_idx, dst_idx;
    /* ... */

    /* Apply the permutation in place by following its cycles; the
     * 0-terminated inplace_idx list holds one start index per cycle */
    src_idx = *inplace_idx++;
    do {
        /* ... save the cycle's first element ... */
        dst_idx = map[src_idx];
        do {
            /* ... swap along the cycle ... */
            dst_idx = map[dst_idx];
        } while (dst_idx != src_idx);
        /* ... store the saved element ... */
    } while ((src_idx = *inplace_idx++));

    /* ... */
}

static const FFTXCodelet TX_NAME(ff_tx_fft_def) = {
    .name = TX_NAME_STR("fft"),
    /* ... */
};

static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_small_def) = {
    .name = TX_NAME_STR("fft_inplace_small"),
    /* ... */
};

static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_def) = {
    .name = TX_NAME_STR("fft_inplace"),
    /* ... */
};
static av_cold int TX_NAME(ff_tx_fft_init_naive_small)(AVTXContext *s,
                                                       const FFTXCodelet *cd,
                                                       uint64_t flags,
                                                       FFTXCodeletOptions *opts,
                                                       int len, int inv,
                                                       const void *scale)
{
    /* ... allocate the len*len exponent table, elided ... */
    for (int i = 0; i < len; i++) {
        for (int j = 0; j < len; j++) {
            const double factor = phase*i*j;
            /* ... store cos/sin(factor) into s->exp, elided ... */
        }
    }
    /* ... */
}

static void TX_NAME(ff_tx_fft_naive)(AVTXContext *s, void *_dst,
                                     void *_src, ptrdiff_t stride)
{
    /* ... */
    const int n = s->len;
    double phase = s->inv ? 2.0*M_PI/n : -2.0*M_PI/n;
    /* ... */

    for (int i = 0; i < n; i++) {
        /* ... */
        for (int j = 0; j < n; j++) {
            const double factor = phase*i*j;
            /* ... accumulate src[j] rotated by factor, elided ... */
        }
        /* ... */
    }
}

static void TX_NAME(ff_tx_fft_naive_small)(AVTXContext *s, void *_dst,
                                           void *_src, ptrdiff_t stride)
{
    /* ... */
    const int n = s->len;
    /* ... */

    for (int i = 0; i < n; i++) {
        /* ... */
        for (int j = 0; j < n; j++) {
            /* ... same loop, reading the precomputed s->exp table ... */
        }
        /* ... */
    }
}

static const FFTXCodelet TX_NAME(ff_tx_fft_naive_small_def) = {
    .name = TX_NAME_STR("fft_naive_small"),
    /* ... */
};

static const FFTXCodelet TX_NAME(ff_tx_fft_naive_def) = {
    .name = TX_NAME_STR("fft_naive"),
    /* ... */
};
static av_cold int TX_NAME(ff_tx_fft_pfa_init)(AVTXContext *s,
                                               const FFTXCodelet *cd,
                                               uint64_t flags,
                                               FFTXCodeletOptions *opts,
                                               int len, int inv,
                                               const void *scale)
{
    /* ... */
    size_t extra_tmp_len = 0;
    int len_list[TX_MAX_DECOMPOSITIONS];
    /* ... ret = ff_tx_decompose_length(len_list, ...), elided ... */

    /* Try each decomposition of len into (odd factor) x (power of two) */
    for (int i = 0; i < ret; i++) {
        int len1 = len_list[i];
        int len2 = len / len1;

        /* The remaining factor must be a power of two */
        if (len2 & (len2 - 1))
            continue;

        /* ... init both sub-transforms; each attempt follows the
         * pattern below, retrying with laxer options first: */
        if (/* attempt failed recoverably */) {
            /* ... retry or move to the next decomposition ... */
        } else if (ret < 0) {
            /* ... hard error, propagate it ... */
        }
        /* (this pattern appears three times in the full source) */
    }

    /* ... */
    if ((ret = ff_tx_gen_compound_mapping(s, opts, inv,
                                          s->sub[0].len, s->sub[1].len)))
        return ret;

    /* ... */
    if (/* the power-of-two half expects a pre-shuffled input */) {
        tmp = (int *)s->tmp;
        for (int k = 0; k < len; k += s->sub[0].len) {
            memcpy(tmp, &s->map[k], s->sub[0].len*sizeof(*tmp));
            for (int i = 0; i < s->sub[0].len; i++)
                s->map[k + i] = tmp[s->sub[0].map[i]];
        }
    }

    /* ... */
        extra_tmp_len = len;
    /* ... */
        extra_tmp_len = s->sub[0].len;

    if (extra_tmp_len && !(s->exp = av_malloc(extra_tmp_len*sizeof(*s->exp))))
        return AVERROR(ENOMEM);

    /* ... */
}
static void TX_NAME(ff_tx_fft_pfa)(AVTXContext *s, void *_out,
                                   void *_in, ptrdiff_t stride)
{
    const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
    const int *in_map = s->map, *out_map = in_map + l;
    const int *sub_map = s->sub[1].map;
    /* ... in/out/tmp1 pointer setup, elided ... */

    /* Gather each length-n column through the input map, then run the
     * first sub-transform on it */
    for (int i = 0; i < m; i++) {
        for (int j = 0; j < n; j++)
            s->exp[j] = in[in_map[i*n + j]];
        s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], s->exp, m*sizeof(TXComplex));
    }

    for (int i = 0; i < n; i++)
        s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));

    for (int i = 0; i < l; i++)
        out[i*stride] = tmp1[out_map[i]];
}

static void TX_NAME(ff_tx_fft_pfa_ns)(AVTXContext *s, void *_out,
                                      void *_in, ptrdiff_t stride)
{
    const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
    const int *in_map = s->map, *out_map = in_map + l;
    const int *sub_map = s->sub[1].map;
    /* ... */

    /* Input is already pre-shuffled, so the gather loop is skipped */
    for (int i = 0; i < m; i++)
        s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], &in[i*n], m*sizeof(TXComplex));

    for (int i = 0; i < n; i++)
        s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));

    for (int i = 0; i < l; i++)
        out[i*stride] = tmp1[out_map[i]];
}

static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_def) = {
    .name = TX_NAME_STR("fft_pfa"),
    /* ... */
};

static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_ns_def) = {
    .name = TX_NAME_STR("fft_pfa_ns"),
    /* ... */
};
static av_cold int TX_NAME(ff_tx_mdct_naive_init)(AVTXContext *s,
                                                  const FFTXCodelet *cd,
                                                  uint64_t flags,
                                                  FFTXCodeletOptions *opts,
                                                  int len, int inv,
                                                  const void *scale)
{
    s->scale_d = *((SCALE_TYPE *)scale);
    s->scale_f = s->scale_d;
    return 0;
}

static void TX_NAME(ff_tx_mdct_naive_fwd)(AVTXContext *s, void *_dst,
                                          void *_src, ptrdiff_t stride)
{
    TXSample *src = _src;
    TXSample *dst = _dst;
    double scale = s->scale_d;
    int len = s->len;
    const double phase = M_PI/(4.0*len);

    stride /= sizeof(*dst);

    for (int i = 0; i < len; i++) {
        double sum = 0.0;
        for (int j = 0; j < len*2; j++) {
            int a = (2*j + 1 + len) * (2*i + 1);
            sum += UNSCALE(src[j]) * cos(a * phase);
        }
        dst[i*stride] = RESCALE(sum*scale);
    }
}

static void TX_NAME(ff_tx_mdct_naive_inv)(AVTXContext *s, void *_dst,
                                          void *_src, ptrdiff_t stride)
{
    TXSample *src = _src;
    TXSample *dst = _dst;
    double scale = s->scale_d;
    int len = s->len >> 1;
    int len2 = len*2;
    const double phase = M_PI/(4.0*len2);

    stride /= sizeof(*src);

    for (int i = 0; i < len; i++) {
        double sum_d = 0.0;
        double sum_u = 0.0;
        double i_d = phase * (4*len  - 2*i - 1);
        double i_u = phase * (3*len2 + 2*i + 1);
        for (int j = 0; j < len2; j++) {
            double a = (2 * j + 1);
            double a_d = cos(a * i_d);
            double a_u = cos(a * i_u);
            /* ... accumulate into sum_d/sum_u, elided ... */
        }
        /* ... write the two output halves, elided ... */
    }
}

static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_fwd_def) = {
    .name = TX_NAME_STR("mdct_naive_fwd"),
    /* ... */
};

static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_inv_def) = {
    .name = TX_NAME_STR("mdct_naive_inv"),
    /* ... */
};
static av_cold int TX_NAME(ff_tx_mdct_init)(AVTXContext *s,
                                            const FFTXCodelet *cd,
                                            uint64_t flags,
                                            FFTXCodeletOptions *opts,
                                            int len, int inv,
                                            const void *scale)
{
    /* ... */
    s->scale_d = *((SCALE_TYPE *)scale);
    s->scale_f = s->scale_d;

    /* ... set up the half-size FFT subtransform and twiddles, elided ... */

    memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map));
    for (int i = 0; i < len >> 1; i++) {
        /* ... */
    }

    /* ... */
    for (int i = 0; i < (s->len >> 1); i++) {
        /* ... */
    }

    /* ... */
}

static void TX_NAME(ff_tx_mdct_fwd)(AVTXContext *s, void *_dst,
                                    void *_src, ptrdiff_t stride)
{
    TXSample *src = _src, *dst = _dst;
    /* ... */
    const int len2 = s->len >> 1;
    const int len4 = s->len >> 2;
    const int len3 = len2 * 3;
    const int *sub_map = s->map;

    /* ... */

    for (int i = 0; i < len2; i++) { /* folding and pre-reindexing */
        const int idx = sub_map[i];
        /* ... */
        if (/* k in the first half */) {
            tmp.re = FOLD(-src[ len2 + k],  src[1*len2 - 1 - k]);
            tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);
        } else {
            tmp.re = FOLD(-src[ len2 + k], -src[5*len2 - 1 - k]);
            tmp.im = FOLD( src[-len2 + k], -src[1*len3 - 1 - k]);
        }
        /* ... twiddle by exp[] and store at idx, elided ... */
    }

    /* ... half-size FFT, then post-rotation: ... */

    for (int i = 0; i < len4; i++) {
        const int i0 = len4 + i, i1 = len4 - i - 1;
        /* ... */
    }
}

static void TX_NAME(ff_tx_mdct_inv)(AVTXContext *s, void *_dst,
                                    void *_src, ptrdiff_t stride)
{
    /* ... */
    const TXSample *src = _src, *in1, *in2;
    const int len2 = s->len >> 1;
    const int len4 = s->len >> 2;
    const int *sub_map = s->map;

    /* ... set up in1/in2 at the two ends of the input, elided ... */

    for (int i = 0; i < len2; i++) {
        /* ... gather and pre-rotate one input pair, elided ... */
    }

    /* ... half-size FFT, then post-rotation: ... */

    for (int i = 0; i < len4; i++) {
        const int i0 = len4 + i, i1 = len4 - i - 1;
        /* ... */
    }
}

static const FFTXCodelet TX_NAME(ff_tx_mdct_fwd_def) = {
    .name = TX_NAME_STR("mdct_fwd"),
    /* ... */
};

static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_def) = {
    .name = TX_NAME_STR("mdct_inv"),
    /* ... */
};
static av_cold int TX_NAME(ff_tx_mdct_inv_full_init)(AVTXContext *s,
                                                     const FFTXCodelet *cd,
                                                     uint64_t flags,
                                                     FFTXCodeletOptions *opts,
                                                     int len, int inv,
                                                     const void *scale)
{
    s->scale_d = *((SCALE_TYPE *)scale);
    s->scale_f = s->scale_d;
    /* ... init the regular inverse MDCT subtransform, elided ... */
}

static void TX_NAME(ff_tx_mdct_inv_full)(AVTXContext *s, void *_dst,
                                         void *_src, ptrdiff_t stride)
{
    int len  = s->len << 1;
    int len2 = len >> 1;
    int len4 = len >> 2;
    TXSample *dst = _dst;

    s->fn[0](&s->sub[0], dst + len4, _src, stride);

    /* ... */

    /* Mirror out the halves the regular inverse MDCT omits, since they
     * follow from symmetry (see AV_TX_FULL_IMDCT) */
    for (int i = 0; i < len4; i++) {
        /* ... */
    }
}

static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_full_def) = {
    .name = TX_NAME_STR("mdct_inv_full"),
    /* ... */
};
static av_cold int TX_NAME(ff_tx_mdct_pfa_init)(AVTXContext *s,
                                                const FFTXCodelet *cd,
                                                uint64_t flags,
                                                FFTXCodeletOptions *opts,
                                                int len, int inv,
                                                const void *scale)
{
    /* ... */
    sub_len = len / cd->factors[0];

    s->scale_d = *((SCALE_TYPE *)scale);
    s->scale_f = s->scale_d;

    /* ... */
    if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
                                sub_len, inv, scale)))
        return ret;

    /* ... */
    if (cd->factors[0] == 15) {
        /* ... the composite 15 = 3*5 case gets its own input map ... */
    }
    /* ... */

    for (int i = 0; i < len; i++) {
        /* ... */
    }
    /* ... */
}
#define DECL_COMP_IMDCT(N)                                                    \
static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst,   \
                                                void *_src, ptrdiff_t stride) \
{                                                                             \
    TXComplex fft##N##in[N];                                                  \
    TXComplex *z = _dst, *exp = s->exp;                                       \
    const TXSample *src = _src, *in1, *in2;                                   \
    const int len4 = s->len >> 2;                                             \
    const int len2 = s->len >> 1;                                             \
    const int m = s->sub->len;                                                \
    const int *in_map = s->map, *out_map = in_map + N*m;                      \
    const int *sub_map = s->sub->map;                                         \
                                                                              \
    stride /= sizeof(*src);                                                   \
    in1 = src;                                                                \
    in2 = src + ((N*m*2) - 1) * stride;                                       \
                                                                              \
    for (int i = 0; i < len2; i += N) {                                       \
        for (int j = 0; j < N; j++) {                                         \
            const int k = in_map[j];                                          \
            TXComplex tmp = { in2[-k*stride], in1[k*stride] };                \
            CMUL3(fft##N##in[j], tmp, exp[j]);                                \
        }                                                                     \
        fft##N(s->tmp + *(sub_map++), fft##N##in, m);                         \
        /* ... advance exp/in_map, elided ... */                              \
    }                                                                         \
                                                                              \
    for (int i = 0; i < N; i++)                                               \
        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex));  \
                                                                              \
    for (int i = 0; i < len4; i++) {                                          \
        const int i0 = len4 + i, i1 = len4 - i - 1;                           \
        const int s0 = out_map[i0], s1 = out_map[i1];                         \
        TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re };                    \
        TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re };                    \
                                                                              \
        CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);   \
        CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);   \
    }                                                                         \
}                                                                             \
                                                                              \
static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_inv_def) = {          \
    .name       = TX_NAME_STR("mdct_pfa_" #N "xM_inv"),                       \
    .function   = TX_NAME(ff_tx_mdct_pfa_##N##xM_inv),                        \
    .type       = TX_TYPE(MDCT),                                              \
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY,  \
    .factors    = { N, TX_FACTOR_ANY },                                       \
    /* ... */                                                                 \
    .max_len    = TX_LEN_UNLIMITED,                                           \
    .init       = TX_NAME(ff_tx_mdct_pfa_init),                               \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                        \
    .prio       = FF_TX_PRIO_BASE,                                            \
};
#define DECL_COMP_MDCT(N)                                                     \
static void TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd)(AVTXContext *s, void *_dst,   \
                                                void *_src, ptrdiff_t stride) \
{                                                                             \
    TXComplex fft##N##in[N];                                                  \
    TXSample *src = _src, *dst = _dst;                                        \
    TXComplex *exp = s->exp, tmp;                                             \
    const int m = s->sub->len;                                                \
    const int len4 = N*m;                                                     \
    const int len3 = len4 * 3;                                                \
    const int len8 = s->len >> 2;                                             \
    const int *in_map = s->map, *out_map = in_map + N*m;                      \
    const int *sub_map = s->sub->map;                                         \
                                                                              \
    stride /= sizeof(*dst);                                                   \
                                                                              \
    for (int i = 0; i < m; i++) { /* folding and pre-reindexing */            \
        for (int j = 0; j < N; j++) {                                         \
            const int k = in_map[i*N + j];                                    \
            if (k < len4) {                                                   \
                tmp.re = FOLD(-src[ len4 + k],  src[1*len4 - 1 - k]);         \
                tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);         \
            } else {                                                          \
                tmp.re = FOLD(-src[ len4 + k], -src[5*len4 - 1 - k]);         \
                tmp.im = FOLD( src[-len4 + k], -src[1*len3 - 1 - k]);         \
            }                                                                 \
            CMUL(fft##N##in[j].im, fft##N##in[j].re, tmp.re, tmp.im,          \
                 exp[k >> 1].re, exp[k >> 1].im);                             \
        }                                                                     \
        fft##N(s->tmp + sub_map[i], fft##N##in, m);                           \
    }                                                                         \
                                                                              \
    for (int i = 0; i < N; i++)                                               \
        s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex));  \
                                                                              \
    for (int i = 0; i < len8; i++) {                                          \
        const int i0 = len8 + i, i1 = len8 - i - 1;                           \
        const int s0 = out_map[i0], s1 = out_map[i1];                         \
        TXComplex src1 = { s->tmp[s1].re, s->tmp[s1].im };                    \
        TXComplex src0 = { s->tmp[s0].re, s->tmp[s0].im };                    \
                                                                              \
        CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im,   \
             exp[i0].im, exp[i0].re);                                         \
        CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im,   \
             exp[i1].im, exp[i1].re);                                         \
    }                                                                         \
}                                                                             \
                                                                              \
static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd_def) = {          \
    .name       = TX_NAME_STR("mdct_pfa_" #N "xM_fwd"),                       \
    .function   = TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd),                        \
    .type       = TX_TYPE(MDCT),                                              \
    .flags      = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,  \
    .factors    = { N, TX_FACTOR_ANY },                                       \
    /* ... */                                                                 \
    .max_len    = TX_LEN_UNLIMITED,                                           \
    .init       = TX_NAME(ff_tx_mdct_pfa_init),                               \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                        \
    .prio       = FF_TX_PRIO_BASE,                                            \
};
static av_cold int TX_NAME(ff_tx_rdft_init)(AVTXContext *s,
                                            const FFTXCodelet *cd,
                                            uint64_t flags,
                                            FFTXCodeletOptions *opts,
                                            int len, int inv,
                                            const void *scale)
{
    /* ... */
    s->scale_d = *((SCALE_TYPE *)scale);
    s->scale_f = s->scale_d;

    /* ... init the half-size complex FFT subtransform, elided ... */

    if (!(s->exp = av_mallocz((8 + 2*len4)*sizeof(*s->exp))))
        return AVERROR(ENOMEM);

    tab = (TXSample *)s->exp;

    /* ... */
    m = (inv ? 2*s->scale_d : s->scale_d);

    /* The first 8 entries hold fixed post-processing factors */
    *tab++ = RESCALE((inv ? 0.5 : 1.0) * m);
    *tab++ = RESCALE(inv ? 0.5*m : 1.0*m);
    *tab++ = RESCALE( m);
    *tab++ = RESCALE(-m);

    *tab++ = RESCALE( (0.5 - 0.0) * m);
    if (r2r) /* AV_TX_REAL_TO_REAL */
        *tab++ = 1 / s->scale_f;
    else
        *tab++ = RESCALE( (0.0 - 0.5) * m);
    *tab++ = RESCALE( (0.5 - inv) * m);
    *tab++ = RESCALE(-(0.5 - inv) * m);

    /* Followed by the cos table and a mirrored second table */
    for (int i = 0; i < len4; i++)
        *tab++ = RESCALE(cos(i*f));

    tab = ((TXSample *)s->exp) + len4 + 8;

    for (int i = 0; i < len4; i++)
        *tab++ = RESCALE(cos(((len - i*4)/4.0)*f)) * (inv ? 1 : -1);

    /* ... */
}
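/* [Editor's note] The factors and tables built above serve the classic
 * real-FFT trick: treat N reals as N/2 complex samples, run a
 * half-size complex FFT, then untangle the spectrum. The untangling
 * identity for bin k (0 < k < N/2), as a standalone sketch: */
#include <complex.h>
#include <math.h>

static double complex demo_rdft_bin(const double complex *Z, int n, int k)
{
    /* Z = half-size complex FFT of z[m] = x[2m] + I*x[2m+1] */
    double complex even = (Z[k] + conj(Z[n/2 - k])) *  0.5;
    double complex odd  = (Z[k] - conj(Z[n/2 - k])) * -0.5 * I;

    return even + odd * cexp(-2.0 * I * M_PI * k / n);
}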
#define DECL_RDFT(n, inv)                                                \
static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst,         \
                                     void *_src, ptrdiff_t stride)       \
{                                                                        \
    const int len2 = s->len >> 1;                                        \
    const int len4 = s->len >> 2;                                        \
    const TXSample *fact = (void *)s->exp;                               \
    const TXSample *tcos = fact + 8;                                     \
    const TXSample *tsin = tcos + len4;                                  \
    TXComplex *data = inv ? _src : _dst;                                 \
    TXComplex t[3];                                                      \
                                                                         \
    if (!inv)                                                            \
        s->fn[0](&s->sub[0], data, _src, sizeof(TXComplex));             \
    else                                                                 \
        data[0].im = data[len2].re;                                      \
                                                                         \
    /* DC and Nyquist bins are both purely real; derive them from       \
     * bin 0 of the half-size FFT */                                     \
    t[0].re = data[0].re;                                                \
    data[0].re = t[0].re + data[0].im;                                   \
    data[0].im = t[0].re - data[0].im;                                   \
    data[   0].re = MULT(fact[0], data[   0].re);                        \
    data[   0].im = MULT(fact[1], data[   0].im);                        \
    data[len4].re = MULT(fact[2], data[len4].re);                        \
    data[len4].im = MULT(fact[3], data[len4].im);                        \
                                                                         \
    for (int i = 1; i < len4; i++) {                                     \
        t[0].re = MULT(fact[4], (data[i].re + data[len2 - i].re));       \
        t[0].im = MULT(fact[5], (data[i].im - data[len2 - i].im));       \
        t[1].re = MULT(fact[6], (data[i].im + data[len2 - i].im));       \
        t[1].im = MULT(fact[7], (data[i].re - data[len2 - i].re));       \
                                                                         \
        /* Rotate the odd part by the twiddle */                         \
        CMUL(t[2].re, t[2].im, t[1].re, t[1].im, tcos[i], tsin[i]);      \
                                                                         \
        data[       i].re = t[0].re + t[2].re;                           \
        data[       i].im = t[2].im - t[0].im;                           \
        data[len2 - i].re = t[0].re - t[2].re;                           \
        data[len2 - i].im = t[2].im + t[0].im;                           \
    }                                                                    \
                                                                         \
    if (inv) {                                                           \
        s->fn[0](&s->sub[0], _dst, data, sizeof(TXComplex));             \
    } else {                                                             \
        /* Move [0].im to the last position, as convention requires */   \
        data[len2].re = data[0].im;                                      \
        data[   0].im = data[len2].im = 0;                               \
    }                                                                    \
}                                                                        \
                                                                         \
static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = {             \
    .name       = TX_NAME_STR("rdft_" #n),                               \
    .function   = TX_NAME(ff_tx_rdft_ ##n),                              \
    .type       = TX_TYPE(RDFT),                                         \
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE | \
                  (inv ? FF_TX_INVERSE_ONLY : FF_TX_FORWARD_ONLY),       \
    .factors    = { 4, TX_FACTOR_ANY },                                  \
    /* ... */                                                            \
    .max_len    = TX_LEN_UNLIMITED,                                      \
    .init       = TX_NAME(ff_tx_rdft_init),                              \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                   \
    .prio       = FF_TX_PRIO_BASE,                                       \
};
#define DECL_RDFT_HALF(n, mode, mod2)                                    \
static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst,         \
                                     void *_src, ptrdiff_t stride)       \
{                                                                        \
    const int len = s->len;                                              \
    const int len2 = len >> 1;                                           \
    const int len4 = len >> 2;                                           \
    const int aligned_len4 = FFALIGN(len, 4)/4;                          \
    const TXSample *fact = (void *)s->exp;                               \
    const TXSample *tcos = fact + 8;                                     \
    const TXSample *tsin = tcos + aligned_len4;                          \
    TXComplex *data = _dst;                                              \
    TXSample *out = _dst;                                                \
    TXSample tmp_dc, tmp[4];                                             \
    av_unused TXSample tmp_mid;                                          \
    TXComplex sf, sl;                                                    \
                                                                         \
    s->fn[0](&s->sub[0], _dst, _src, sizeof(TXComplex));                 \
                                                                         \
    tmp_dc = data[0].re;                                                 \
    data[   0].re = tmp_dc + data[0].im;                                 \
    tmp_dc        = tmp_dc - data[0].im;                                 \
                                                                         \
    data[   0].re = MULT(fact[0], data[   0].re);                        \
    tmp_dc        = MULT(fact[1], tmp_dc);                               \
    data[len4].re = MULT(fact[2], data[len4].re);                        \
                                                                         \
    if (!mod2) {                                                         \
        data[len4].im = MULT(fact[3], data[len4].im);                    \
    } else {                                                             \
        sf = data[len4];                                                 \
        sl = data[len4 + 1];                                             \
        if (mode == AV_TX_REAL_TO_REAL)                                  \
            tmp[0] = MULT(fact[4], (sf.re + sl.re));                     \
        else                                                             \
            tmp[0] = MULT(fact[5], (sf.im - sl.im));                     \
        tmp[1] = MULT(fact[6], (sf.im + sl.im));                         \
        tmp[2] = MULT(fact[7], (sf.re - sl.re));                         \
                                                                         \
        if (mode == AV_TX_REAL_TO_REAL) {                                \
            tmp[3]  = tmp[1]*tcos[len4] - tmp[2]*tsin[len4];             \
            tmp_mid = (tmp[0] - tmp[3]);                                 \
        } else {                                                         \
            tmp[3]  = tmp[1]*tsin[len4] + tmp[2]*tcos[len4];             \
            tmp_mid = (tmp[0] + tmp[3]);                                 \
        }                                                                \
    }                                                                    \
                                                                         \
    for (int i = 1; i <= len4; i++) {                                    \
        TXComplex sf = data[i];                                          \
        TXComplex sl = data[len2 - i];                                   \
                                                                         \
        if (mode == AV_TX_REAL_TO_REAL)                                  \
            tmp[0] = MULT(fact[4], (sf.re + sl.re));                     \
        else                                                             \
            tmp[0] = MULT(fact[5], (sf.im - sl.im));                     \
                                                                         \
        tmp[1] = MULT(fact[6], (sf.im + sl.im));                         \
        tmp[2] = MULT(fact[7], (sf.re - sl.re));                         \
                                                                         \
        if (mode == AV_TX_REAL_TO_REAL) {                                \
            tmp[3]           = tmp[1]*tcos[i] - tmp[2]*tsin[i];          \
            out[i]           = (tmp[0] + tmp[3]);                        \
            out[len - i]     = (tmp[0] - tmp[3]);                        \
        } else {                                                         \
            tmp[3]           = tmp[1]*tsin[i] + tmp[2]*tcos[i];          \
            out[i - 1]       = (tmp[3] - tmp[0]);                        \
            out[len - i - 1] = (tmp[0] + tmp[3]);                        \
        }                                                                \
    }                                                                    \
                                                                         \
    for (int i = 1; i < (len4 + (mode == AV_TX_REAL_TO_IMAGINARY)); i++) \
        out[len2 - i] = out[len - i];                                    \
                                                                         \
    if (mode == AV_TX_REAL_TO_REAL) {                                    \
        out[len2] = tmp_dc;                                              \
        if (mod2)                                                        \
            out[len4 + 1] = tmp_mid * fact[5];                           \
    } else if (mod2) {                                                   \
        out[len4] = tmp_mid;                                             \
    }                                                                    \
}                                                                        \
                                                                         \
static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = {             \
    .name       = TX_NAME_STR("rdft_" #n),                               \
    .function   = TX_NAME(ff_tx_rdft_ ##n),                              \
    .type       = TX_TYPE(RDFT),                                         \
    .flags      = AV_TX_UNALIGNED | AV_TX_INPLACE | mode |               \
                  FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY,               \
    .factors    = { 2 + 2*(!mod2), TX_FACTOR_ANY },                      \
    /* ... */                                                            \
    .min_len    = 2 + 2*(!mod2),                                         \
    .max_len    = TX_LEN_UNLIMITED,                                      \
    .init       = TX_NAME(ff_tx_rdft_init),                              \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                   \
    .prio       = FF_TX_PRIO_BASE,                                       \
};
static av_cold int TX_NAME(ff_tx_dct_init)(AVTXContext *s,
                                           const FFTXCodelet *cd,
                                           uint64_t flags,
                                           FFTXCodeletOptions *opts,
                                           int len, int inv,
                                           const void *scale)
{
    SCALE_TYPE rsc = *((SCALE_TYPE *)scale);
    /* ... init the underlying RDFT subtransform, elided ... */

    tab = (TXSample *)s->exp;

    /* ... freq and table allocation, elided ... */

    for (int i = 0; i < len; i++)
        tab[i] = RESCALE(cos(i*freq)*(!inv + 1));

    if (inv) {
        for (int i = 0; i < len/2; i++)
            tab[len + i] = RESCALE(0.5 / sin((2*i + 1)*freq));
    } else {
        for (int i = 0; i < len/2; i++)
            tab[len + i] = RESCALE(cos((len - 2*i - 1)*freq));
    }

    /* ... */
}
static void TX_NAME(ff_tx_dctII)(AVTXContext *s, void *_dst,
                                 void *_src, ptrdiff_t stride)
{
    TXSample *dst = _dst;
    TXSample *src = _src;
    const int len = s->len;
    const int len2 = len >> 1;
    const TXSample *exp = (void *)s->exp;
#ifdef TX_INT32
    int64_t tmp1, tmp2;
#else
    TXSample tmp1, tmp2;
#endif

    for (int i = 0; i < len2; i++) {
        TXSample in1 = src[i];
        TXSample in2 = src[len - i - 1];
        TXSample s   = exp[len + i];

#ifdef TX_INT32
        /* ... 64-bit butterfly, elided ... */
        tmp2 = (tmp2 + 0x40000000) >> 31;
#else
        tmp1 = (in1 + in2)*0.5;
        tmp2 = (in1 - in2)*s;
#endif

        src[i]           = tmp1 + tmp2;
        src[len - i - 1] = tmp1 - tmp2;
    }

    /* ... run the underlying RDFT on the folded input ... */

    /* Recombine coefficients back-to-front */
    for (int i = len - 2; i > 0; i -= 2) {
        /* ... */
    }

#ifdef TX_INT32
    tmp1 = ((int64_t)exp[0]) * ((int64_t)dst[0]);
    dst[0] = (tmp1 + 0x40000000) >> 31;
#else
    dst[0] = exp[0] * dst[0];
#endif
}
static void TX_NAME(ff_tx_dctIII)(AVTXContext *s, void *_dst,
                                  void *_src, ptrdiff_t stride)
{
    TXSample *dst = _dst;
    TXSample *src = _src;
    const int len = s->len;
    const int len2 = len >> 1;
    const TXSample *exp = (void *)s->exp;
#ifdef TX_INT32
    int64_t  tmp1, tmp2 = src[len - 1];
    tmp2 = (2*tmp2 + 0x40000000) >> 31;
#else
    TXSample tmp1, tmp2 = 2*src[len - 1];
#endif

    /* ... */

    for (int i = len - 2; i >= 2; i -= 2) {
        TXSample val1 = src[i - 0];
        TXSample val2 = src[i - 1] - src[i + 1];
        /* ... rotate (val1, val2) by the twiddle into src, elided ... */
    }

    s->fn[0](&s->sub[0], dst, src, sizeof(float));

    for (int i = 0; i < len2; i++) {
        TXSample in1 = dst[i];
        TXSample in2 = dst[len - i - 1];
        /* ... */
#ifdef TX_INT32
        /* ... */
        tmp2 = (tmp2 + 0x40000000) >> 31;
#endif
        dst[i]           = tmp1 + tmp2;
        dst[len - i - 1] = tmp1 - tmp2;
    }
}
static const FFTXCodelet TX_NAME(ff_tx_dctII_def) = {
    .name = TX_NAME_STR("dctII"),
    /* ... */
};

static const FFTXCodelet TX_NAME(ff_tx_dctIII_def) = {
    .name = TX_NAME_STR("dctIII"),
    /* ... */
};
static av_cold int TX_NAME(ff_tx_dcstI_init)(AVTXContext *s,
                                             const FFTXCodelet *cd,
                                             uint64_t flags,
                                             FFTXCodeletOptions *opts,
                                             int len, int inv,
                                             const void *scale)
{
    SCALE_TYPE rsc = *((SCALE_TYPE *)scale);
    /* ... init the underlying RDFT, elided ... */
}

static void TX_NAME(ff_tx_dctI)(AVTXContext *s, void *_dst,
                                void *_src, ptrdiff_t stride)
{
    TXSample *dst = _dst;
    TXSample *src = _src;
    const int len = s->len - 1;
    TXSample *tmp = (TXSample *)s->tmp;

    stride /= sizeof(TXSample);

    /* Mirror the input into a symmetric extension, then run the RDFT */
    for (int i = 0; i < len; i++) {
        /* ... */
    }

    s->fn[0](&s->sub[0], dst, tmp, sizeof(TXSample));
}

static void TX_NAME(ff_tx_dstI)(AVTXContext *s, void *_dst,
                                void *_src, ptrdiff_t stride)
{
    TXSample *dst = _dst;
    TXSample *src = _src;
    const int len = s->len + 1;
    TXSample *tmp = (void *)s->tmp;

    stride /= sizeof(TXSample);

    /* Mirror the input into an antisymmetric extension, then run the RDFT */
    for (int i = 1; i < len; i++) {
        /* ... */
    }

    s->fn[0](&s->sub[0], dst, tmp, sizeof(float));
}

static const FFTXCodelet TX_NAME(ff_tx_dctI_def) = {
    .name = TX_NAME_STR("dctI"),
    /* ... */
};

static const FFTXCodelet TX_NAME(ff_tx_dstI_def) = {
    .name = TX_NAME_STR("dstI"),
    /* ... */
};
int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s, int *pre_tab)
{
    int len4 = s->len >> 1;
    double scale = s->scale_d;
    const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0;
    size_t alloc = pre_tab ? 2*len4 : len4;

    /* ... allocate s->exp (alloc entries), elided ... */

    /* The 1/8 phase offset is what turns the half-size complex FFT
     * into an MDCT; a negative scale folds a sign flip into theta */
    for (int i = 0; i < len4; i++) {
        /* ... fill the twiddle for (i + theta), elided ... */
    }

    /* Pre-permute the twiddles if a pre-shuffle table was requested */
    if (pre_tab)
        for (int i = 0; i < len4; i++)
            s->exp[i] = s->exp[len4 + pre_tab[i]];

    return 0;
}
/* List of codelets this template registers with the dispatcher */
const FFTXCodelet * const TX_NAME(ff_tx_codelet_list)[] = {
    /* Split-radix codelets */
    /* ... */
    &TX_NAME(ff_tx_fft128_ns_def),
    &TX_NAME(ff_tx_fft256_ns_def),
    &TX_NAME(ff_tx_fft512_ns_def),
    &TX_NAME(ff_tx_fft1024_ns_def),
    &TX_NAME(ff_tx_fft2048_ns_def),
    &TX_NAME(ff_tx_fft4096_ns_def),
    &TX_NAME(ff_tx_fft8192_ns_def),
    &TX_NAME(ff_tx_fft16384_ns_def),
    &TX_NAME(ff_tx_fft32768_ns_def),
    &TX_NAME(ff_tx_fft65536_ns_def),
    &TX_NAME(ff_tx_fft131072_ns_def),
    &TX_NAME(ff_tx_fft262144_ns_def),
    &TX_NAME(ff_tx_fft524288_ns_def),
    &TX_NAME(ff_tx_fft1048576_ns_def),
    &TX_NAME(ff_tx_fft2097152_ns_def),

    /* Standalone transforms */
    /* ... */
    &TX_NAME(ff_tx_fft_inplace_def),
    &TX_NAME(ff_tx_fft_inplace_small_def),
    /* ... */
    &TX_NAME(ff_tx_fft_pfa_ns_def),
    &TX_NAME(ff_tx_fft_naive_def),
    &TX_NAME(ff_tx_fft_naive_small_def),
    /* ... */
    &TX_NAME(ff_tx_mdct_pfa_3xM_fwd_def),
    &TX_NAME(ff_tx_mdct_pfa_5xM_fwd_def),
    &TX_NAME(ff_tx_mdct_pfa_7xM_fwd_def),
    &TX_NAME(ff_tx_mdct_pfa_9xM_fwd_def),
    &TX_NAME(ff_tx_mdct_pfa_15xM_fwd_def),
    &TX_NAME(ff_tx_mdct_pfa_3xM_inv_def),
    &TX_NAME(ff_tx_mdct_pfa_5xM_inv_def),
    &TX_NAME(ff_tx_mdct_pfa_7xM_inv_def),
    &TX_NAME(ff_tx_mdct_pfa_9xM_inv_def),
    &TX_NAME(ff_tx_mdct_pfa_15xM_inv_def),
    &TX_NAME(ff_tx_mdct_naive_fwd_def),
    &TX_NAME(ff_tx_mdct_naive_inv_def),
    &TX_NAME(ff_tx_mdct_inv_full_def),
    /* ... */
    &TX_NAME(ff_tx_rdft_r2r_mod2_def),
    /* ... */
    &TX_NAME(ff_tx_rdft_r2i_mod2_def),
    /* ... */

    NULL,
};
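/* [Editor's note] All of the codelets in this list are reached through
 * the public libavutil API; a caller never selects one directly.
 * Minimal usage sketch (float forward FFT of length 1024): */
#include <libavutil/tx.h>
#include <libavutil/mem.h>

static int demo_fft_1024(void)
{
    AVTXContext *ctx = NULL;
    av_tx_fn fn;
    float scale = 1.0f;
    AVComplexFloat *in  = av_calloc(1024, sizeof(*in));
    AVComplexFloat *out = av_calloc(1024, sizeof(*out));
    int ret = av_tx_init(&ctx, &fn, AV_TX_FLOAT_FFT, 0 /* forward */,
                         1024, &scale, 0 /* flags */);

    if (ret >= 0 && in && out) {
        /* ... fill in[] ... */
        fn(ctx, out, in, sizeof(AVComplexFloat)); /* stride in bytes */
    }

    av_freep(&in);
    av_freep(&out);
    av_tx_uninit(&ctx);
    return ret;
}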