FFmpeg
tx_template.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) Lynne
3  *
4  * Power of two FFT:
5  * Copyright (c) Lynne
6  * Copyright (c) 2008 Loren Merritt
7  * Copyright (c) 2002 Fabrice Bellard
8  * Partly based on libdjbfft by D. J. Bernstein
9  *
10  * This file is part of FFmpeg.
11  *
12  * FFmpeg is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public
14  * License as published by the Free Software Foundation; either
15  * version 2.1 of the License, or (at your option) any later version.
16  *
17  * FFmpeg is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with FFmpeg; if not, write to the Free Software
24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25  */
26 
/* Declares a 32-byte-aligned TXSample twiddle table named
 * ff_tx_tab_<name>, with the type suffix applied by TX_TAB(). */
#define TABLE_DEF(name, size) \
    DECLARE_ALIGNED(32, TXSample, TX_TAB(ff_tx_tab_ ##name))[size]
29 
/* X-macro list: expands SR_TABLE(len) once for every power-of-two
 * split-radix length supported (8 .. 2097152). Redefine SR_TABLE
 * before expanding to generate declarations, definitions, etc. */
#define SR_POW2_TABLES \
    SR_TABLE(8)        \
    SR_TABLE(16)       \
    SR_TABLE(32)       \
    SR_TABLE(64)       \
    SR_TABLE(128)      \
    SR_TABLE(256)      \
    SR_TABLE(512)      \
    SR_TABLE(1024)     \
    SR_TABLE(2048)     \
    SR_TABLE(4096)     \
    SR_TABLE(8192)     \
    SR_TABLE(16384)    \
    SR_TABLE(32768)    \
    SR_TABLE(65536)    \
    SR_TABLE(131072)   \
    SR_TABLE(262144)   \
    SR_TABLE(524288)   \
    SR_TABLE(1048576)  \
    SR_TABLE(2097152)  \

51 #define SR_TABLE(len) \
52  TABLE_DEF(len, len/4 + 1);
53 /* Power of two tables */
55 #undef SR_TABLE
56 
/* Other factors' tables */
TABLE_DEF(53, 12); /* shared by the 5-point and 3-point transforms */
TABLE_DEF( 7,  6); /* 7-point transform */
TABLE_DEF( 9,  8); /* 9-point transform */
61 
62 typedef struct FFTabInitData {
63  void (*func)(void);
64  int factors[TX_MAX_SUB]; /* Must be sorted high -> low */
66 
67 #define SR_TABLE(len) \
68 static av_cold void TX_TAB(ff_tx_init_tab_ ##len)(void) \
69 { \
70  double freq = 2*M_PI/len; \
71  TXSample *tab = TX_TAB(ff_tx_tab_ ##len); \
72  \
73  for (int i = 0; i < len/4; i++) \
74  *tab++ = RESCALE(cos(i*freq)); \
75  \
76  *tab = 0; \
77 }
79 #undef SR_TABLE
80 
81 static void (*const sr_tabs_init_funcs[])(void) = {
82 #define SR_TABLE(len) TX_TAB(ff_tx_init_tab_ ##len),
84 #undef SR_TABLE
85 };
86 
88 #define SR_TABLE(len) AV_ONCE_INIT,
90 #undef SR_TABLE
91 };
92 
93 static av_cold void TX_TAB(ff_tx_init_tab_53)(void)
94 {
95  /* 5pt, doubled to eliminate AVX lane shuffles */
96  TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI / 5));
97  TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI / 5));
98  TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10));
99  TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10));
100  TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI / 5));
101  TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI / 5));
102  TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10));
103  TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10));
104 
105  /* 3pt */
106  TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12));
107  TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12));
108  TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI / 6));
109  TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI / 6));
110 }
111 
112 static av_cold void TX_TAB(ff_tx_init_tab_7)(void)
113 {
114  TX_TAB(ff_tx_tab_7)[0] = RESCALE(cos(2 * M_PI / 7));
115  TX_TAB(ff_tx_tab_7)[1] = RESCALE(sin(2 * M_PI / 7));
116  TX_TAB(ff_tx_tab_7)[2] = RESCALE(sin(2 * M_PI / 28));
117  TX_TAB(ff_tx_tab_7)[3] = RESCALE(cos(2 * M_PI / 28));
118  TX_TAB(ff_tx_tab_7)[4] = RESCALE(cos(2 * M_PI / 14));
119  TX_TAB(ff_tx_tab_7)[5] = RESCALE(sin(2 * M_PI / 14));
120 }
121 
122 static av_cold void TX_TAB(ff_tx_init_tab_9)(void)
123 {
124  TX_TAB(ff_tx_tab_9)[0] = RESCALE(cos(2 * M_PI / 3));
125  TX_TAB(ff_tx_tab_9)[1] = RESCALE(sin(2 * M_PI / 3));
126  TX_TAB(ff_tx_tab_9)[2] = RESCALE(cos(2 * M_PI / 9));
127  TX_TAB(ff_tx_tab_9)[3] = RESCALE(sin(2 * M_PI / 9));
128  TX_TAB(ff_tx_tab_9)[4] = RESCALE(cos(2 * M_PI / 36));
129  TX_TAB(ff_tx_tab_9)[5] = RESCALE(sin(2 * M_PI / 36));
130  TX_TAB(ff_tx_tab_9)[6] = TX_TAB(ff_tx_tab_9)[2] + TX_TAB(ff_tx_tab_9)[5];
131  TX_TAB(ff_tx_tab_9)[7] = TX_TAB(ff_tx_tab_9)[3] - TX_TAB(ff_tx_tab_9)[4];
132 }
133 
135  { TX_TAB(ff_tx_init_tab_53), { 15, 5, 3 } },
136  { TX_TAB(ff_tx_init_tab_9), { 9 } },
137  { TX_TAB(ff_tx_init_tab_7), { 7 } },
138 };
139 
141  AV_ONCE_INIT,
142  AV_ONCE_INIT,
143  AV_ONCE_INIT,
144 };
145 
146 av_cold void TX_TAB(ff_tx_init_tabs)(int len)
147 {
148  int factor_2 = ff_ctz(len);
149  if (factor_2) {
150  int idx = factor_2 - 3;
151  for (int i = 0; i <= idx; i++)
154  len >>= factor_2;
155  }
156 
157  for (int i = 0; i < FF_ARRAY_ELEMS(nptwo_tabs_init_data); i++) {
158  int f, f_idx = 0;
159 
160  if (len <= 1)
161  return;
162 
163  while ((f = nptwo_tabs_init_data[i].factors[f_idx++])) {
164  if (f % len)
165  continue;
166 
169  len /= f;
170  break;
171  }
172  }
173 }
174 
176  ptrdiff_t stride)
177 {
178  TXComplex tmp[3];
179  const TXSample *tab = TX_TAB(ff_tx_tab_53);
180 #ifdef TX_INT32
181  int64_t mtmp[4];
182 #endif
183 
184  tmp[0] = in[0];
185  BF(tmp[1].re, tmp[2].im, in[1].im, in[2].im);
186  BF(tmp[1].im, tmp[2].re, in[1].re, in[2].re);
187 
188  out[0*stride].re = tmp[0].re + tmp[2].re;
189  out[0*stride].im = tmp[0].im + tmp[2].im;
190 
191 #ifdef TX_INT32
192  mtmp[0] = (int64_t)tab[ 8] * tmp[1].re;
193  mtmp[1] = (int64_t)tab[ 9] * tmp[1].im;
194  mtmp[2] = (int64_t)tab[10] * tmp[2].re;
195  mtmp[3] = (int64_t)tab[10] * tmp[2].im;
196  out[1*stride].re = tmp[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31);
197  out[1*stride].im = tmp[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31);
198  out[2*stride].re = tmp[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31);
199  out[2*stride].im = tmp[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31);
200 #else
201  tmp[1].re = tab[ 8] * tmp[1].re;
202  tmp[1].im = tab[ 9] * tmp[1].im;
203  tmp[2].re = tab[10] * tmp[2].re;
204  tmp[2].im = tab[10] * tmp[2].im;
205  out[1*stride].re = tmp[0].re - tmp[2].re + tmp[1].re;
206  out[1*stride].im = tmp[0].im - tmp[2].im - tmp[1].im;
207  out[2*stride].re = tmp[0].re - tmp[2].re - tmp[1].re;
208  out[2*stride].im = tmp[0].im - tmp[2].im + tmp[1].im;
209 #endif
210 }
211 
/* Generates a 5-point FFT whose five outputs go to out[D0..D4 * stride].
 * The D0..D4 permutations let the same kernel be reused for the PFA
 * 15-point transform, where each 5-point stage scatters its outputs. */
#define DECL_FFT5(NAME, D0, D1, D2, D3, D4)                         \
static av_always_inline void NAME(TXComplex *out, TXComplex *in,    \
                                  ptrdiff_t stride)                 \
{                                                                   \
    TXComplex dc, z0[4], t[6];                                      \
    const TXSample *tab = TX_TAB(ff_tx_tab_53);                     \
                                                                    \
    dc = in[0];                                                     \
    BF(t[1].im, t[0].re, in[1].re, in[4].re);                       \
    BF(t[1].re, t[0].im, in[1].im, in[4].im);                       \
    BF(t[3].im, t[2].re, in[2].re, in[3].re);                       \
    BF(t[3].re, t[2].im, in[2].im, in[3].im);                       \
                                                                    \
    out[D0*stride].re = dc.re + t[0].re + t[2].re;                  \
    out[D0*stride].im = dc.im + t[0].im + t[2].im;                  \
                                                                    \
    SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re);       \
    SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im);       \
    CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re);       \
    CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im);       \
                                                                    \
    BF(z0[0].re, z0[3].re, t[0].re, t[1].re);                       \
    BF(z0[0].im, z0[3].im, t[0].im, t[1].im);                       \
    BF(z0[2].re, z0[1].re, t[4].re, t[5].re);                       \
    BF(z0[2].im, z0[1].im, t[4].im, t[5].im);                       \
                                                                    \
    out[D1*stride].re = dc.re + z0[3].re;                           \
    out[D1*stride].im = dc.im + z0[0].im;                           \
    out[D2*stride].re = dc.re + z0[2].re;                           \
    out[D2*stride].im = dc.im + z0[1].im;                           \
    out[D3*stride].re = dc.re + z0[1].re;                           \
    out[D3*stride].im = dc.im + z0[2].im;                           \
    out[D4*stride].re = dc.re + z0[0].re;                           \
    out[D4*stride].im = dc.im + z0[3].im;                           \
}
247 
/* fft5: natural output order; fft5_m1/m2/m3: output maps for the three
 * 5-point stages of the 15-point PFA transform (indices mod 15). */
DECL_FFT5(fft5,    0,  1,  2,  3,  4)
DECL_FFT5(fft5_m1, 0,  6, 12,  3,  9)
DECL_FFT5(fft5_m2, 10, 1,  7, 13,  4)
DECL_FFT5(fft5_m3, 5, 11,  2,  8, 14)
252 
254  ptrdiff_t stride)
255 {
256  TXComplex dc, t[6], z[3];
257  const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_7);
258 #ifdef TX_INT32
259  int64_t mtmp[12];
260 #endif
261 
262  dc = in[0];
263  BF(t[1].re, t[0].re, in[1].re, in[6].re);
264  BF(t[1].im, t[0].im, in[1].im, in[6].im);
265  BF(t[3].re, t[2].re, in[2].re, in[5].re);
266  BF(t[3].im, t[2].im, in[2].im, in[5].im);
267  BF(t[5].re, t[4].re, in[3].re, in[4].re);
268  BF(t[5].im, t[4].im, in[3].im, in[4].im);
269 
270  out[0*stride].re = dc.re + t[0].re + t[2].re + t[4].re;
271  out[0*stride].im = dc.im + t[0].im + t[2].im + t[4].im;
272 
273 #ifdef TX_INT32 /* NOTE: it's possible to do this with 16 mults but 72 adds */
274  mtmp[ 0] = ((int64_t)tab[0].re)*t[0].re - ((int64_t)tab[2].re)*t[4].re;
275  mtmp[ 1] = ((int64_t)tab[0].re)*t[4].re - ((int64_t)tab[1].re)*t[0].re;
276  mtmp[ 2] = ((int64_t)tab[0].re)*t[2].re - ((int64_t)tab[2].re)*t[0].re;
277  mtmp[ 3] = ((int64_t)tab[0].re)*t[0].im - ((int64_t)tab[1].re)*t[2].im;
278  mtmp[ 4] = ((int64_t)tab[0].re)*t[4].im - ((int64_t)tab[1].re)*t[0].im;
279  mtmp[ 5] = ((int64_t)tab[0].re)*t[2].im - ((int64_t)tab[2].re)*t[0].im;
280 
281  mtmp[ 6] = ((int64_t)tab[2].im)*t[1].im + ((int64_t)tab[1].im)*t[5].im;
282  mtmp[ 7] = ((int64_t)tab[0].im)*t[5].im + ((int64_t)tab[2].im)*t[3].im;
283  mtmp[ 8] = ((int64_t)tab[2].im)*t[5].im + ((int64_t)tab[1].im)*t[3].im;
284  mtmp[ 9] = ((int64_t)tab[0].im)*t[1].re + ((int64_t)tab[1].im)*t[3].re;
285  mtmp[10] = ((int64_t)tab[2].im)*t[3].re + ((int64_t)tab[0].im)*t[5].re;
286  mtmp[11] = ((int64_t)tab[2].im)*t[1].re + ((int64_t)tab[1].im)*t[5].re;
287 
288  z[0].re = (int32_t)(mtmp[ 0] - ((int64_t)tab[1].re)*t[2].re + 0x40000000 >> 31);
289  z[1].re = (int32_t)(mtmp[ 1] - ((int64_t)tab[2].re)*t[2].re + 0x40000000 >> 31);
290  z[2].re = (int32_t)(mtmp[ 2] - ((int64_t)tab[1].re)*t[4].re + 0x40000000 >> 31);
291  z[0].im = (int32_t)(mtmp[ 3] - ((int64_t)tab[2].re)*t[4].im + 0x40000000 >> 31);
292  z[1].im = (int32_t)(mtmp[ 4] - ((int64_t)tab[2].re)*t[2].im + 0x40000000 >> 31);
293  z[2].im = (int32_t)(mtmp[ 5] - ((int64_t)tab[1].re)*t[4].im + 0x40000000 >> 31);
294 
295  t[0].re = (int32_t)(mtmp[ 6] - ((int64_t)tab[0].im)*t[3].im + 0x40000000 >> 31);
296  t[2].re = (int32_t)(mtmp[ 7] - ((int64_t)tab[1].im)*t[1].im + 0x40000000 >> 31);
297  t[4].re = (int32_t)(mtmp[ 8] + ((int64_t)tab[0].im)*t[1].im + 0x40000000 >> 31);
298  t[0].im = (int32_t)(mtmp[ 9] + ((int64_t)tab[2].im)*t[5].re + 0x40000000 >> 31);
299  t[2].im = (int32_t)(mtmp[10] - ((int64_t)tab[1].im)*t[1].re + 0x40000000 >> 31);
300  t[4].im = (int32_t)(mtmp[11] - ((int64_t)tab[0].im)*t[3].re + 0x40000000 >> 31);
301 #else
302  z[0].re = tab[0].re*t[0].re - tab[2].re*t[4].re - tab[1].re*t[2].re;
303  z[1].re = tab[0].re*t[4].re - tab[1].re*t[0].re - tab[2].re*t[2].re;
304  z[2].re = tab[0].re*t[2].re - tab[2].re*t[0].re - tab[1].re*t[4].re;
305  z[0].im = tab[0].re*t[0].im - tab[1].re*t[2].im - tab[2].re*t[4].im;
306  z[1].im = tab[0].re*t[4].im - tab[1].re*t[0].im - tab[2].re*t[2].im;
307  z[2].im = tab[0].re*t[2].im - tab[2].re*t[0].im - tab[1].re*t[4].im;
308 
309  /* It's possible to do t[4].re and t[0].im with 2 multiplies only by
310  * multiplying the sum of all with the average of the twiddles */
311 
312  t[0].re = tab[2].im*t[1].im + tab[1].im*t[5].im - tab[0].im*t[3].im;
313  t[2].re = tab[0].im*t[5].im + tab[2].im*t[3].im - tab[1].im*t[1].im;
314  t[4].re = tab[2].im*t[5].im + tab[1].im*t[3].im + tab[0].im*t[1].im;
315  t[0].im = tab[0].im*t[1].re + tab[1].im*t[3].re + tab[2].im*t[5].re;
316  t[2].im = tab[2].im*t[3].re + tab[0].im*t[5].re - tab[1].im*t[1].re;
317  t[4].im = tab[2].im*t[1].re + tab[1].im*t[5].re - tab[0].im*t[3].re;
318 #endif
319 
320  BF(t[1].re, z[0].re, z[0].re, t[4].re);
321  BF(t[3].re, z[1].re, z[1].re, t[2].re);
322  BF(t[5].re, z[2].re, z[2].re, t[0].re);
323  BF(t[1].im, z[0].im, z[0].im, t[0].im);
324  BF(t[3].im, z[1].im, z[1].im, t[2].im);
325  BF(t[5].im, z[2].im, z[2].im, t[4].im);
326 
327  out[1*stride].re = dc.re + z[0].re;
328  out[1*stride].im = dc.im + t[1].im;
329  out[2*stride].re = dc.re + t[3].re;
330  out[2*stride].im = dc.im + z[1].im;
331  out[3*stride].re = dc.re + z[2].re;
332  out[3*stride].im = dc.im + t[5].im;
333  out[4*stride].re = dc.re + t[5].re;
334  out[4*stride].im = dc.im + z[2].im;
335  out[5*stride].re = dc.re + z[1].re;
336  out[5*stride].im = dc.im + t[3].im;
337  out[6*stride].re = dc.re + t[1].re;
338  out[6*stride].im = dc.im + z[0].im;
339 }
340 
342  ptrdiff_t stride)
343 {
344  const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_9);
345  TXComplex dc, t[16], w[4], x[5], y[5], z[2];
346 #ifdef TX_INT32
347  int64_t mtmp[12];
348 #endif
349 
350  dc = in[0];
351  BF(t[1].re, t[0].re, in[1].re, in[8].re);
352  BF(t[1].im, t[0].im, in[1].im, in[8].im);
353  BF(t[3].re, t[2].re, in[2].re, in[7].re);
354  BF(t[3].im, t[2].im, in[2].im, in[7].im);
355  BF(t[5].re, t[4].re, in[3].re, in[6].re);
356  BF(t[5].im, t[4].im, in[3].im, in[6].im);
357  BF(t[7].re, t[6].re, in[4].re, in[5].re);
358  BF(t[7].im, t[6].im, in[4].im, in[5].im);
359 
360  w[0].re = t[0].re - t[6].re;
361  w[0].im = t[0].im - t[6].im;
362  w[1].re = t[2].re - t[6].re;
363  w[1].im = t[2].im - t[6].im;
364  w[2].re = t[1].re - t[7].re;
365  w[2].im = t[1].im - t[7].im;
366  w[3].re = t[3].re + t[7].re;
367  w[3].im = t[3].im + t[7].im;
368 
369  z[0].re = dc.re + t[4].re;
370  z[0].im = dc.im + t[4].im;
371 
372  z[1].re = t[0].re + t[2].re + t[6].re;
373  z[1].im = t[0].im + t[2].im + t[6].im;
374 
375  out[0*stride].re = z[0].re + z[1].re;
376  out[0*stride].im = z[0].im + z[1].im;
377 
378 #ifdef TX_INT32
379  mtmp[0] = t[1].re - t[3].re + t[7].re;
380  mtmp[1] = t[1].im - t[3].im + t[7].im;
381 
382  y[3].re = (int32_t)(((int64_t)tab[0].im)*mtmp[0] + 0x40000000 >> 31);
383  y[3].im = (int32_t)(((int64_t)tab[0].im)*mtmp[1] + 0x40000000 >> 31);
384 
385  mtmp[0] = (int32_t)(((int64_t)tab[0].re)*z[1].re + 0x40000000 >> 31);
386  mtmp[1] = (int32_t)(((int64_t)tab[0].re)*z[1].im + 0x40000000 >> 31);
387  mtmp[2] = (int32_t)(((int64_t)tab[0].re)*t[4].re + 0x40000000 >> 31);
388  mtmp[3] = (int32_t)(((int64_t)tab[0].re)*t[4].im + 0x40000000 >> 31);
389 
390  x[3].re = z[0].re + (int32_t)mtmp[0];
391  x[3].im = z[0].im + (int32_t)mtmp[1];
392  z[0].re = in[0].re + (int32_t)mtmp[2];
393  z[0].im = in[0].im + (int32_t)mtmp[3];
394 
395  mtmp[0] = ((int64_t)tab[1].re)*w[0].re;
396  mtmp[1] = ((int64_t)tab[1].re)*w[0].im;
397  mtmp[2] = ((int64_t)tab[2].im)*w[0].re;
398  mtmp[3] = ((int64_t)tab[2].im)*w[0].im;
399  mtmp[4] = ((int64_t)tab[1].im)*w[2].re;
400  mtmp[5] = ((int64_t)tab[1].im)*w[2].im;
401  mtmp[6] = ((int64_t)tab[2].re)*w[2].re;
402  mtmp[7] = ((int64_t)tab[2].re)*w[2].im;
403 
404  x[1].re = (int32_t)(mtmp[0] + ((int64_t)tab[2].im)*w[1].re + 0x40000000 >> 31);
405  x[1].im = (int32_t)(mtmp[1] + ((int64_t)tab[2].im)*w[1].im + 0x40000000 >> 31);
406  x[2].re = (int32_t)(mtmp[2] - ((int64_t)tab[3].re)*w[1].re + 0x40000000 >> 31);
407  x[2].im = (int32_t)(mtmp[3] - ((int64_t)tab[3].re)*w[1].im + 0x40000000 >> 31);
408  y[1].re = (int32_t)(mtmp[4] + ((int64_t)tab[2].re)*w[3].re + 0x40000000 >> 31);
409  y[1].im = (int32_t)(mtmp[5] + ((int64_t)tab[2].re)*w[3].im + 0x40000000 >> 31);
410  y[2].re = (int32_t)(mtmp[6] - ((int64_t)tab[3].im)*w[3].re + 0x40000000 >> 31);
411  y[2].im = (int32_t)(mtmp[7] - ((int64_t)tab[3].im)*w[3].im + 0x40000000 >> 31);
412 
413  y[0].re = (int32_t)(((int64_t)tab[0].im)*t[5].re + 0x40000000 >> 31);
414  y[0].im = (int32_t)(((int64_t)tab[0].im)*t[5].im + 0x40000000 >> 31);
415 
416 #else
417  y[3].re = tab[0].im*(t[1].re - t[3].re + t[7].re);
418  y[3].im = tab[0].im*(t[1].im - t[3].im + t[7].im);
419 
420  x[3].re = z[0].re + tab[0].re*z[1].re;
421  x[3].im = z[0].im + tab[0].re*z[1].im;
422  z[0].re = dc.re + tab[0].re*t[4].re;
423  z[0].im = dc.im + tab[0].re*t[4].im;
424 
425  x[1].re = tab[1].re*w[0].re + tab[2].im*w[1].re;
426  x[1].im = tab[1].re*w[0].im + tab[2].im*w[1].im;
427  x[2].re = tab[2].im*w[0].re - tab[3].re*w[1].re;
428  x[2].im = tab[2].im*w[0].im - tab[3].re*w[1].im;
429  y[1].re = tab[1].im*w[2].re + tab[2].re*w[3].re;
430  y[1].im = tab[1].im*w[2].im + tab[2].re*w[3].im;
431  y[2].re = tab[2].re*w[2].re - tab[3].im*w[3].re;
432  y[2].im = tab[2].re*w[2].im - tab[3].im*w[3].im;
433 
434  y[0].re = tab[0].im*t[5].re;
435  y[0].im = tab[0].im*t[5].im;
436 #endif
437 
438  x[4].re = x[1].re + x[2].re;
439  x[4].im = x[1].im + x[2].im;
440 
441  y[4].re = y[1].re - y[2].re;
442  y[4].im = y[1].im - y[2].im;
443  x[1].re = z[0].re + x[1].re;
444  x[1].im = z[0].im + x[1].im;
445  y[1].re = y[0].re + y[1].re;
446  y[1].im = y[0].im + y[1].im;
447  x[2].re = z[0].re + x[2].re;
448  x[2].im = z[0].im + x[2].im;
449  y[2].re = y[2].re - y[0].re;
450  y[2].im = y[2].im - y[0].im;
451  x[4].re = z[0].re - x[4].re;
452  x[4].im = z[0].im - x[4].im;
453  y[4].re = y[0].re - y[4].re;
454  y[4].im = y[0].im - y[4].im;
455 
456  out[1*stride] = (TXComplex){ x[1].re + y[1].im, x[1].im - y[1].re };
457  out[2*stride] = (TXComplex){ x[2].re + y[2].im, x[2].im - y[2].re };
458  out[3*stride] = (TXComplex){ x[3].re + y[3].im, x[3].im - y[3].re };
459  out[4*stride] = (TXComplex){ x[4].re + y[4].im, x[4].im - y[4].re };
460  out[5*stride] = (TXComplex){ x[4].re - y[4].im, x[4].im + y[4].re };
461  out[6*stride] = (TXComplex){ x[3].re - y[3].im, x[3].im + y[3].re };
462  out[7*stride] = (TXComplex){ x[2].re - y[2].im, x[2].im + y[2].re };
463  out[8*stride] = (TXComplex){ x[1].re - y[1].im, x[1].im + y[1].re };
464 }
465 
467  ptrdiff_t stride)
468 {
469  TXComplex tmp[15];
470 
471  for (int i = 0; i < 5; i++)
472  fft3(tmp + i, in + i*3, 5);
473 
474  fft5_m1(out, tmp + 0, stride);
475  fft5_m2(out, tmp + 5, stride);
476  fft5_m3(out, tmp + 10, stride);
477 }
478 
480  const FFTXCodelet *cd,
481  uint64_t flags,
483  int len, int inv,
484  const void *scale)
485 {
486  int ret = 0;
487  TX_TAB(ff_tx_init_tabs)(len);
488 
489  if (len == 15)
490  ret = ff_tx_gen_pfa_input_map(s, opts, 3, 5);
491  else if (flags & FF_TX_PRESHUFFLE)
493 
494  return ret;
495 }
496 
/* Declares the wrapper function and the "_ns" (no-shuffle, preshuffled
 * input) codelet definition for a single-factor transform of length n. */
#define DECL_FACTOR_S(n)                                                       \
static void TX_NAME(ff_tx_fft##n)(AVTXContext *s, void *dst,                   \
                                  void *src, ptrdiff_t stride)                 \
{                                                                              \
    fft##n((TXComplex *)dst, (TXComplex *)src, stride / sizeof(TXComplex));    \
}                                                                              \
static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = {                    \
    .name       = TX_NAME_STR("fft" #n "_ns"),                                 \
    .function   = TX_NAME(ff_tx_fft##n),                                       \
    .type       = TX_TYPE(FFT),                                                \
    .flags      = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |                         \
                  AV_TX_UNALIGNED | FF_TX_PRESHUFFLE,                          \
    .factors[0] = n,                                                           \
    .nb_factors = 1,                                                           \
    .min_len    = n,                                                           \
    .max_len    = n,                                                           \
    .init       = TX_NAME(ff_tx_fft_factor_init),                              \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
    .prio       = FF_TX_PRIO_BASE,                                             \
};
517 
/* Like DECL_FACTOR_S, but additionally declares a forward-only
 * codelet that takes unpermuted input. */
#define DECL_FACTOR_F(n)                                                       \
DECL_FACTOR_S(n)                                                               \
static const FFTXCodelet TX_NAME(ff_tx_fft##n##_fwd_def) = {                   \
    .name       = TX_NAME_STR("fft" #n "_fwd"),                                \
    .function   = TX_NAME(ff_tx_fft##n),                                       \
    .type       = TX_TYPE(FFT),                                                \
    .flags      = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |                         \
                  AV_TX_UNALIGNED | FF_TX_FORWARD_ONLY,                        \
    .factors[0] = n,                                                           \
    .nb_factors = 1,                                                           \
    .min_len    = n,                                                           \
    .max_len    = n,                                                           \
    .init       = TX_NAME(ff_tx_fft_factor_init),                              \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
    .prio       = FF_TX_PRIO_BASE,                                             \
};
534 
/* Single-factor codelets; 15 only has the preshuffled (_ns) variant. */
DECL_FACTOR_F(3)
DECL_FACTOR_F(5)
DECL_FACTOR_F(7)
DECL_FACTOR_F(9)
DECL_FACTOR_S(15)
540 
/* Split-radix butterfly core. Uses/clobbers the locals t1..t6, r0, i0,
 * r1, i1 which must be declared in the enclosing scope; t1/t2/t5/t6
 * must hold the twiddled values before BUTTERFLIES is invoked. */
#define BUTTERFLIES(a0, a1, a2, a3)            \
    do {                                       \
        r0=a0.re;                              \
        i0=a0.im;                              \
        r1=a1.re;                              \
        i1=a1.im;                              \
        BF(t3, t5, t5, t1);                    \
        BF(a2.re, a0.re, r0, t5);              \
        BF(a3.im, a1.im, i1, t3);              \
        BF(t4, t6, t2, t6);                    \
        BF(a3.re, a1.re, r1, t4);              \
        BF(a2.im, a0.im, i0, t6);              \
    } while (0)

/* Twiddles a2/a3 by (wre, -/+wim), then applies the butterfly. */
#define TRANSFORM(a0, a1, a2, a3, wre, wim)    \
    do {                                       \
        CMUL(t1, t2, a2.re, a2.im, wre, -wim); \
        CMUL(t5, t6, a3.re, a3.im, wre, wim);  \
        BUTTERFLIES(a0, a1, a2, a3);           \
    } while (0)
561 
/* z[0...8n-1], w[1...2n-1] */
/* Split-radix recombination pass: merges one half-size and two
 * quarter-size sub-transforms living at z, z+o1/o2/o3 into a full
 * transform. cos is the forward quarter-wave table; wim walks it
 * backwards to produce the sine (imaginary) parts. Processes 8
 * complex values per iteration, interleaved even/odd to help
 * pipelining. */
static inline void TX_NAME(ff_tx_fft_sr_combine)(TXComplex *z,
                                                 const TXSample *cos, int len)
{
    int o1 = 2*len;
    int o2 = 4*len;
    int o3 = 6*len;
    const TXSample *wim = cos + o1 - 7;
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;

    for (int i = 0; i < len; i += 4) {
        TRANSFORM(z[0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]);
        TRANSFORM(z[2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[2], wim[5]);
        TRANSFORM(z[4], z[o1 + 4], z[o2 + 4], z[o3 + 4], cos[4], wim[3]);
        TRANSFORM(z[6], z[o1 + 6], z[o2 + 6], z[o3 + 6], cos[6], wim[1]);

        TRANSFORM(z[1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[1], wim[6]);
        TRANSFORM(z[3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[3], wim[4]);
        TRANSFORM(z[5], z[o1 + 5], z[o2 + 5], z[o3 + 5], cos[5], wim[2]);
        TRANSFORM(z[7], z[o1 + 7], z[o2 + 7], z[o3 + 7], cos[7], wim[0]);

        z   += 2*4;
        cos += 2*4;
        wim -= 2*4;
    }
}
588 
590  const FFTXCodelet *cd,
591  uint64_t flags,
593  int len, int inv,
594  const void *scale)
595 {
596  TX_TAB(ff_tx_init_tabs)(len);
597  return ff_tx_gen_ptwo_revtab(s, opts);
598 }
599 
/* Declares the codelet definition for a power-of-two split-radix
 * transform of length n (function declared separately). */
#define DECL_SR_CODELET_DEF(n)                              \
static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = { \
    .name       = TX_NAME_STR("fft" #n "_ns"),              \
    .function   = TX_NAME(ff_tx_fft##n##_ns),               \
    .type       = TX_TYPE(FFT),                             \
    .flags      = FF_TX_OUT_OF_PLACE | AV_TX_INPLACE |      \
                  AV_TX_UNALIGNED | FF_TX_PRESHUFFLE,       \
    .factors[0] = 2,                                        \
    .nb_factors = 1,                                        \
    .min_len    = n,                                        \
    .max_len    = n,                                        \
    .init       = TX_NAME(ff_tx_fft_sr_codelet_init),       \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                      \
    .prio       = FF_TX_PRIO_BASE,                          \
};
615 
/* Declares a length-n split-radix step built from one length-n2 and
 * two length-n4 sub-transforms plus a recombination pass, and its
 * codelet definition. */
#define DECL_SR_CODELET(n, n2, n4)                                    \
static void TX_NAME(ff_tx_fft##n##_ns)(AVTXContext *s, void *_dst,    \
                                       void *_src, ptrdiff_t stride)  \
{                                                                     \
    TXComplex *src = _src;                                            \
    TXComplex *dst = _dst;                                            \
    const TXSample *cos = TX_TAB(ff_tx_tab_##n);                      \
                                                                      \
    TX_NAME(ff_tx_fft##n2##_ns)(s, dst,        src,        stride);   \
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride);   \
    TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride);   \
    TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1);                 \
}                                                                     \
                                                                      \
DECL_SR_CODELET_DEF(n)
631 
/* 2-point FFT base case. The tmp variable makes in-place operation
 * (src == dst) safe: src[1] must be read before dst[1] is written. */
static void TX_NAME(ff_tx_fft2_ns)(AVTXContext *s, void *_dst,
                                   void *_src, ptrdiff_t stride)
{
    TXComplex *src = _src;
    TXComplex *dst = _dst;
    TXComplex tmp;

    BF(tmp.re, dst[0].re, src[0].re, src[1].re);
    BF(tmp.im, dst[0].im, src[0].im, src[1].im);
    dst[1] = tmp;
}
643 
/* 4-point FFT base case (radix-2 butterflies only, no multiplies).
 * All src values are loaded into t1..t8 before dst is written, so
 * in-place operation is safe. */
static void TX_NAME(ff_tx_fft4_ns)(AVTXContext *s, void *_dst,
                                   void *_src, ptrdiff_t stride)
{
    TXComplex *src = _src;
    TXComplex *dst = _dst;
    TXSample t1, t2, t3, t4, t5, t6, t7, t8;

    BF(t3, t1, src[0].re, src[1].re);
    BF(t8, t6, src[3].re, src[2].re);
    BF(dst[2].re, dst[0].re, t1, t6);
    BF(t4, t2, src[0].im, src[1].im);
    BF(t7, t5, src[2].im, src[3].im);
    BF(dst[3].im, dst[1].im, t4, t8);
    BF(dst[3].re, dst[1].re, t3, t7);
    BF(dst[2].im, dst[0].im, t2, t5);
}
660 
/* 8-point FFT: a 4-point sub-transform on the first half plus a
 * hand-unrolled combine of the second half. The single twiddle needed
 * is cos(2pi/8) == sin(2pi/8), read from the len-8 table. */
static void TX_NAME(ff_tx_fft8_ns)(AVTXContext *s, void *_dst,
                                   void *_src, ptrdiff_t stride)
{
    TXComplex *src = _src;
    TXComplex *dst = _dst;
    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
    const TXSample cos = TX_TAB(ff_tx_tab_8)[1];

    TX_NAME(ff_tx_fft4_ns)(s, dst, src, stride);

    BF(t1, dst[5].re, src[4].re, -src[5].re);
    BF(t2, dst[5].im, src[4].im, -src[5].im);
    BF(t5, dst[7].re, src[6].re, -src[7].re);
    BF(t6, dst[7].im, src[6].im, -src[7].im);

    BUTTERFLIES(dst[0], dst[2], dst[4], dst[6]);
    TRANSFORM(dst[1], dst[3], dst[5], dst[7], cos, cos);
}
679 
/* 16-point FFT: 8-point plus two 4-point sub-transforms, then a
 * hand-unrolled split-radix combine using the len-16 twiddle table. */
static void TX_NAME(ff_tx_fft16_ns)(AVTXContext *s, void *_dst,
                                    void *_src, ptrdiff_t stride)
{
    TXComplex *src = _src;
    TXComplex *dst = _dst;
    const TXSample *cos = TX_TAB(ff_tx_tab_16);

    TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
    TXSample cos_16_1 = cos[1];
    TXSample cos_16_2 = cos[2];
    TXSample cos_16_3 = cos[3];

    TX_NAME(ff_tx_fft8_ns)(s, dst +  0, src +  0, stride);
    TX_NAME(ff_tx_fft4_ns)(s, dst +  8, src +  8, stride);
    TX_NAME(ff_tx_fft4_ns)(s, dst + 12, src + 12, stride);

    /* First butterfly needs no twiddle; preload its inputs. */
    t1 = dst[ 8].re;
    t2 = dst[ 8].im;
    t5 = dst[12].re;
    t6 = dst[12].im;
    BUTTERFLIES(dst[0], dst[4], dst[8], dst[12]);

    TRANSFORM(dst[ 2], dst[ 6], dst[10], dst[14], cos_16_2, cos_16_2);
    TRANSFORM(dst[ 1], dst[ 5], dst[ 9], dst[13], cos_16_1, cos_16_3);
    TRANSFORM(dst[ 3], dst[ 7], dst[11], dst[15], cos_16_3, cos_16_1);
}
706 
711 DECL_SR_CODELET(32,16,8)
712 DECL_SR_CODELET(64,32,16)
713 DECL_SR_CODELET(128,64,32)
714 DECL_SR_CODELET(256,128,64)
715 DECL_SR_CODELET(512,256,128)
716 DECL_SR_CODELET(1024,512,256)
717 DECL_SR_CODELET(2048,1024,512)
718 DECL_SR_CODELET(4096,2048,1024)
719 DECL_SR_CODELET(8192,4096,2048)
720 DECL_SR_CODELET(16384,8192,4096)
721 DECL_SR_CODELET(32768,16384,8192)
722 DECL_SR_CODELET(65536,32768,16384)
723 DECL_SR_CODELET(131072,65536,32768)
724 DECL_SR_CODELET(262144,131072,65536)
725 DECL_SR_CODELET(524288,262144,131072)
726 DECL_SR_CODELET(1048576,524288,262144)
727 DECL_SR_CODELET(2097152,1048576,524288)
728 
730  const FFTXCodelet *cd,
731  uint64_t flags,
733  int len, int inv,
734  const void *scale)
735 {
736  int ret;
737  int is_inplace = !!(flags & AV_TX_INPLACE);
738  FFTXCodeletOptions sub_opts = {
739  .map_dir = is_inplace ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
740  };
741 
742  flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
743  flags |= AV_TX_INPLACE; /* in-place */
744  flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
745 
746  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len, inv, scale)))
747  return ret;
748 
749  if (is_inplace && (ret = ff_tx_gen_inplace_map(s, len)))
750  return ret;
751 
752  return 0;
753 }
754 
756  const FFTXCodelet *cd,
757  uint64_t flags,
759  int len, int inv,
760  const void *scale)
761 {
762  if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
763  return AVERROR(ENOMEM);
764  flags &= ~AV_TX_INPLACE;
765  return TX_NAME(ff_tx_fft_init)(s, cd, flags, opts, len, inv, scale);
766 }
767 
768 static void TX_NAME(ff_tx_fft)(AVTXContext *s, void *_dst,
769  void *_src, ptrdiff_t stride)
770 {
771  TXComplex *src = _src;
772  TXComplex *dst1 = s->flags & AV_TX_INPLACE ? s->tmp : _dst;
773  TXComplex *dst2 = _dst;
774  int *map = s->sub[0].map;
775  int len = s->len;
776 
777  /* Compilers can't vectorize this anyway without assuming AVX2, which they
778  * generally don't, at least without -march=native -mtune=native */
779  for (int i = 0; i < len; i++)
780  dst1[i] = src[map[i]];
781 
782  s->fn[0](&s->sub[0], dst2, dst1, stride);
783 }
784 
/* In-place FFT: permutes src in place by following the permutation
 * cycles listed in s->map (one start index per cycle, 0-terminated),
 * then runs the preshuffled subtransform. */
static void TX_NAME(ff_tx_fft_inplace)(AVTXContext *s, void *_dst,
                                       void *_src, ptrdiff_t stride)
{
    TXComplex *src = _src;
    TXComplex *dst = _dst;
    TXComplex tmp;
    const int *map = s->sub->map;
    const int *inplace_idx = s->map;
    int src_idx, dst_idx;

    src_idx = *inplace_idx++;
    do {
        /* Rotate one whole cycle, carrying the displaced value in tmp. */
        tmp = src[src_idx];
        dst_idx = map[src_idx];
        do {
            FFSWAP(TXComplex, tmp, src[dst_idx]);
            dst_idx = map[dst_idx];
        } while (dst_idx != src_idx); /* Can be > as well, but was less predictable */
        src[dst_idx] = tmp;
    } while ((src_idx = *inplace_idx++)); /* 0 terminates the cycle list */

    s->fn[0](&s->sub[0], dst, src, stride);
}
808 
809 static const FFTXCodelet TX_NAME(ff_tx_fft_def) = {
810  .name = TX_NAME_STR("fft"),
811  .function = TX_NAME(ff_tx_fft),
812  .type = TX_TYPE(FFT),
814  .factors[0] = TX_FACTOR_ANY,
815  .nb_factors = 1,
816  .min_len = 2,
817  .max_len = TX_LEN_UNLIMITED,
818  .init = TX_NAME(ff_tx_fft_init),
820  .prio = FF_TX_PRIO_BASE,
821 };
822 
823 static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_small_def) = {
824  .name = TX_NAME_STR("fft_inplace_small"),
825  .function = TX_NAME(ff_tx_fft),
826  .type = TX_TYPE(FFT),
828  .factors[0] = TX_FACTOR_ANY,
829  .nb_factors = 1,
830  .min_len = 2,
831  .max_len = 65536,
834  .prio = FF_TX_PRIO_BASE - 256,
835 };
836 
837 static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_def) = {
838  .name = TX_NAME_STR("fft_inplace"),
839  .function = TX_NAME(ff_tx_fft_inplace),
840  .type = TX_TYPE(FFT),
842  .factors[0] = TX_FACTOR_ANY,
843  .nb_factors = 1,
844  .min_len = 2,
845  .max_len = TX_LEN_UNLIMITED,
846  .init = TX_NAME(ff_tx_fft_init),
848  .prio = FF_TX_PRIO_BASE - 512,
849 };
850 
852  const FFTXCodelet *cd,
853  uint64_t flags,
855  int len, int inv,
856  const void *scale)
857 {
858  const double phase = s->inv ? 2.0*M_PI/len : -2.0*M_PI/len;
859 
860  if (!(s->exp = av_malloc(len*len*sizeof(*s->exp))))
861  return AVERROR(ENOMEM);
862 
863  for (int i = 0; i < len; i++) {
864  for (int j = 0; j < len; j++) {
865  const double factor = phase*i*j;
866  s->exp[i*j] = (TXComplex){
867  RESCALE(cos(factor)),
868  RESCALE(sin(factor)),
869  };
870  }
871  }
872 
873  return 0;
874 }
875 
/* Naive O(n^2) DFT, computing each twiddle on the fly. Fallback for
 * any length; direction is taken from s->inv. stride is in bytes on
 * entry and converted to complex elements. */
static void TX_NAME(ff_tx_fft_naive)(AVTXContext *s, void *_dst, void *_src,
                                     ptrdiff_t stride)
{
    TXComplex *src = _src;
    TXComplex *dst = _dst;
    const int n = s->len;
    double phase = s->inv ? 2.0*M_PI/n : -2.0*M_PI/n;

    stride /= sizeof(*dst);

    for (int i = 0; i < n; i++) {
        TXComplex tmp = { 0 };
        for (int j = 0; j < n; j++) {
            const double factor = phase*i*j;
            const TXComplex mult = {
                RESCALE(cos(factor)),
                RESCALE(sin(factor)),
            };
            TXComplex res;
            CMUL3(res, src[j], mult);
            tmp.re += res.re;
            tmp.im += res.im;
        }
        dst[i*stride] = tmp;
    }
}
902 
/* Naive O(n^2) DFT using the twiddles precomputed into s->exp by
 * ff_tx_fft_init_naive_small (indexed by i*j). */
static void TX_NAME(ff_tx_fft_naive_small)(AVTXContext *s, void *_dst, void *_src,
                                           ptrdiff_t stride)
{
    TXComplex *src = _src;
    TXComplex *dst = _dst;
    const int n = s->len;

    stride /= sizeof(*dst);

    for (int i = 0; i < n; i++) {
        TXComplex tmp = { 0 };
        for (int j = 0; j < n; j++) {
            TXComplex res;
            const TXComplex mult = s->exp[i*j];
            CMUL3(res, src[j], mult);
            tmp.re += res.re;
            tmp.im += res.im;
        }
        dst[i*stride] = tmp;
    }
}
924 
925 static const FFTXCodelet TX_NAME(ff_tx_fft_naive_small_def) = {
926  .name = TX_NAME_STR("fft_naive_small"),
927  .function = TX_NAME(ff_tx_fft_naive_small),
928  .type = TX_TYPE(FFT),
930  .factors[0] = TX_FACTOR_ANY,
931  .nb_factors = 1,
932  .min_len = 2,
933  .max_len = 1024,
936  .prio = FF_TX_PRIO_MIN/2,
937 };
938 
939 static const FFTXCodelet TX_NAME(ff_tx_fft_naive_def) = {
940  .name = TX_NAME_STR("fft_naive"),
941  .function = TX_NAME(ff_tx_fft_naive),
942  .type = TX_TYPE(FFT),
944  .factors[0] = TX_FACTOR_ANY,
945  .nb_factors = 1,
946  .min_len = 2,
947  .max_len = TX_LEN_UNLIMITED,
948  .init = NULL,
949  .cpu_flags = FF_TX_CPU_FLAGS_ALL,
950  .prio = FF_TX_PRIO_MIN,
951 };
952 
/* Initializer for the compound (prime-factor) FFT codelets
 * (TX_NAME(ff_tx_fft_pfa_init), referenced by the codelet descriptors
 * below): decomposes len into two sub-lengths, instantiates both
 * sub-transforms — preferring preshuffled/in-place variants with
 * fallbacks — then builds the combined permutation and scratch buffers. */
954  const FFTXCodelet *cd,
955  uint64_t flags,
957  int len, int inv,
958  const void *scale)
959 {
960  int ret, *tmp, ps = flags & FF_TX_PRESHUFFLE;
961  FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
962  size_t extra_tmp_len = 0;
963  int len_list[TX_MAX_DECOMPOSITIONS];
964 
965  if ((ret = ff_tx_decompose_length(len_list, TX_TYPE(FFT), len, inv)) < 0)
966  return ret;
967 
968  /* Two iterations to test both orderings. */
969  for (int i = 0; i < ret; i++) {
970  int len1 = len_list[i];
971  int len2 = len / len1;
972 
973  /* Our ptwo transforms don't support striding the output. */
974  if (len2 & (len2 - 1))
975  FFSWAP(int, len1, len2);
976 
978 
979  /* First transform */
980  sub_opts.map_dir = FF_TX_MAP_GATHER;
981  flags &= ~AV_TX_INPLACE;
983  flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
984  ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
985  len1, inv, scale);
986 
987  if (ret == AVERROR(ENOMEM)) {
988  return ret;
989  } else if (ret < 0) { /* Try again without a preshuffle flag */
991  ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
992  len1, inv, scale);
993  if (ret == AVERROR(ENOMEM))
994  return ret;
995  else if (ret < 0)
996  continue;
997  }
998 
999  /* Second transform. */
1000  sub_opts.map_dir = FF_TX_MAP_SCATTER;
1002 retry:
1004  flags |= AV_TX_INPLACE;
1005  ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1006  len2, inv, scale);
1007 
1008  if (ret == AVERROR(ENOMEM)) {
1009  return ret;
1010  } else if (ret < 0) { /* Try again with an out-of-place transform */
1012  flags &= ~AV_TX_INPLACE;
1013  ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1014  len2, inv, scale);
1015  if (ret == AVERROR(ENOMEM)) {
1016  return ret;
1017  } else if (ret < 0) {
1018  if (flags & FF_TX_PRESHUFFLE) { /* Retry again without a preshuf flag */
1019  flags &= ~FF_TX_PRESHUFFLE;
1020  goto retry;
1021  } else {
1022  continue;
1023  }
1024  }
1025  }
1026 
1027  /* Success */
1028  break;
1029  }
1030 
1031  /* If nothing was successful, error out */
1032  if (ret < 0)
1033  return ret;
1034 
1035  /* Generate PFA map */
1036  if ((ret = ff_tx_gen_compound_mapping(s, opts, 0,
1037  s->sub[0].len, s->sub[1].len)))
1038  return ret;
1039 
1040  if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1041  return AVERROR(ENOMEM);
1042 
1043  /* Flatten input map */
1044  tmp = (int *)s->tmp;
1045  for (int k = 0; k < len; k += s->sub[0].len) {
1046  memcpy(tmp, &s->map[k], s->sub[0].len*sizeof(*tmp));
1047  for (int i = 0; i < s->sub[0].len; i++)
1048  s->map[k + i] = tmp[s->sub[0].map[i]];
1049  }
1050 
1051  /* Only allocate extra temporary memory if we need it */
1052  if (!(s->sub[1].flags & AV_TX_INPLACE))
1053  extra_tmp_len = len;
1054  else if (!ps)
1055  extra_tmp_len = s->sub[0].len;
1056 
1057  if (extra_tmp_len && !(s->exp = av_malloc(extra_tmp_len*sizeof(*s->exp))))
1058  return AVERROR(ENOMEM);
1059 
1060  return 0;
1061 }
1062 
1063 static void TX_NAME(ff_tx_fft_pfa)(AVTXContext *s, void *_out,
1064  void *_in, ptrdiff_t stride)
1065 {
1066  const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
1067  const int *in_map = s->map, *out_map = in_map + l;
1068  const int *sub_map = s->sub[1].map;
1069  TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp;
1070  TXComplex *in = _in, *out = _out;
1071 
1072  stride /= sizeof(*out);
1073 
1074  for (int i = 0; i < m; i++) {
1075  for (int j = 0; j < n; j++)
1076  s->exp[j] = in[in_map[i*n + j]];
1077  s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], s->exp, m*sizeof(TXComplex));
1078  }
1079 
1080  for (int i = 0; i < n; i++)
1081  s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));
1082 
1083  for (int i = 0; i < l; i++)
1084  out[i*stride] = tmp1[out_map[i]];
1085 }
1086 
1087 static void TX_NAME(ff_tx_fft_pfa_ns)(AVTXContext *s, void *_out,
1088  void *_in, ptrdiff_t stride)
1089 {
1090  const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
1091  const int *in_map = s->map, *out_map = in_map + l;
1092  const int *sub_map = s->sub[1].map;
1093  TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp;
1094  TXComplex *in = _in, *out = _out;
1095 
1096  stride /= sizeof(*out);
1097 
1098  for (int i = 0; i < m; i++)
1099  s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], &in[i*n], m*sizeof(TXComplex));
1100 
1101  for (int i = 0; i < n; i++)
1102  s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));
1103 
1104  for (int i = 0; i < l; i++)
1105  out[i*stride] = tmp1[out_map[i]];
1106 }
1107 
/* Codelet descriptor for the self-shuffling PFA FFT. */
1108 static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_def) = {
1109  .name = TX_NAME_STR("fft_pfa"),
1110  .function = TX_NAME(ff_tx_fft_pfa),
1111  .type = TX_TYPE(FFT),
1113  .factors = { 7, 5, 3, 2, TX_FACTOR_ANY },
1114  .nb_factors = 2,
1115  .min_len = 2*3,
1116  .max_len = TX_LEN_UNLIMITED,
1117  .init = TX_NAME(ff_tx_fft_pfa_init),
1119  .prio = FF_TX_PRIO_BASE,
1120 };
1121 
/* Codelet descriptor for the PFA FFT variant with pre-shuffled input. */
1122 static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_ns_def) = {
1123  .name = TX_NAME_STR("fft_pfa_ns"),
1124  .function = TX_NAME(ff_tx_fft_pfa_ns),
1125  .type = TX_TYPE(FFT),
1128  .factors = { 7, 5, 3, 2, TX_FACTOR_ANY },
1129  .nb_factors = 2,
1130  .min_len = 2*3,
1131  .max_len = TX_LEN_UNLIMITED,
1132  .init = TX_NAME(ff_tx_fft_pfa_init),
1134  .prio = FF_TX_PRIO_BASE,
1135 };
1136 
/* Initializer for the naive MDCT codelets (TX_NAME(ff_tx_mdct_naive_init),
 * referenced by the descriptors below): only records the scale factor. */
1138  const FFTXCodelet *cd,
1139  uint64_t flags,
1141  int len, int inv,
1142  const void *scale)
1143 {
1144  s->scale_d = *((SCALE_TYPE *)scale);
1145  s->scale_f = s->scale_d;
1146  return 0;
1147 }
1148 
/* Direct O(n^2) forward MDCT, used as a reference/fallback: each of the
 * len outputs is a full cosine sum over the 2*len input samples. */
1149 static void TX_NAME(ff_tx_mdct_naive_fwd)(AVTXContext *s, void *_dst,
1150  void *_src, ptrdiff_t stride)
1151 {
1152  TXSample *src = _src;
1153  TXSample *dst = _dst;
1154  double scale = s->scale_d;
1155  int len = s->len;
1156  const double phase = M_PI/(4.0*len);
1157 
1158  stride /= sizeof(*dst); /* stride is passed in bytes */
1159 
1160  for (int i = 0; i < len; i++) {
1161  double sum = 0.0;
1162  for (int j = 0; j < len*2; j++) {
1163  int a = (2*j + 1 + len) * (2*i + 1);
1164  sum += UNSCALE(src[j]) * cos(a * phase);
1165  }
1166  dst[i*stride] = RESCALE(sum*scale);
1167  }
1168 }
1169 
/* Direct O(n^2) inverse MDCT reference: computes the lower and upper
 * output halves simultaneously from a single pass over the input. */
1170 static void TX_NAME(ff_tx_mdct_naive_inv)(AVTXContext *s, void *_dst,
1171  void *_src, ptrdiff_t stride)
1172 {
1173  TXSample *src = _src;
1174  TXSample *dst = _dst;
1175  double scale = s->scale_d;
1176  int len = s->len >> 1;
1177  int len2 = len*2;
1178  const double phase = M_PI/(4.0*len2);
1179 
1180  stride /= sizeof(*src); /* stride is passed in bytes */
1181 
1182  for (int i = 0; i < len; i++) {
1183  double sum_d = 0.0; /* accumulator for the lower (down) half */
1184  double sum_u = 0.0; /* accumulator for the upper (up) half */
1185  double i_d = phase * (4*len - 2*i - 1);
1186  double i_u = phase * (3*len2 + 2*i + 1);
1187  for (int j = 0; j < len2; j++) {
1188  double a = (2 * j + 1);
1189  double a_d = cos(a * i_d);
1190  double a_u = cos(a * i_u);
1191  double val = UNSCALE(src[j*stride]);
1192  sum_d += a_d * val;
1193  sum_u += a_u * val;
1194  }
1195  dst[i + 0] = RESCALE( sum_d*scale);
1196  dst[i + len] = RESCALE(-sum_u*scale);
1197  }
1198 }
1199 
/* Codelet descriptor for the naive forward MDCT fallback. */
1201 static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_fwd_def) = {
1202  .name = TX_NAME_STR("mdct_naive_fwd"),
1203  .function = TX_NAME(ff_tx_mdct_naive_fwd),
1204  .type = TX_TYPE(MDCT),
1205  .factors = { 2, TX_FACTOR_ANY }, /* MDCTs need an even length */
1206  .nb_factors = 2,
1207  .min_len = 2,
1208  .max_len = TX_LEN_UNLIMITED,
1209  .init = TX_NAME(ff_tx_mdct_naive_init),
1211  .prio = FF_TX_PRIO_MIN,
1212 };
1213 
/* Codelet descriptor for the naive inverse MDCT fallback. */
1214 static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_inv_def) = {
1215  .name = TX_NAME_STR("mdct_naive_inv"),
1216  .function = TX_NAME(ff_tx_mdct_naive_inv),
1217  .type = TX_TYPE(MDCT),
1219  .factors = { 2, TX_FACTOR_ANY },
1220  .nb_factors = 2,
1221  .min_len = 2,
1222  .max_len = TX_LEN_UNLIMITED,
1223  .init = TX_NAME(ff_tx_mdct_naive_init),
1225  .prio = FF_TX_PRIO_MIN,
1226 };
1227 
/* Initializer for the FFT-based MDCT codelets (TX_NAME(ff_tx_mdct_init),
 * referenced by the descriptors below): creates a half-length complex FFT
 * sub-transform, the permutation map and the twiddle/exponent table. */
1229  const FFTXCodelet *cd,
1230  uint64_t flags,
1232  int len, int inv,
1233  const void *scale)
1234 {
1235  int ret;
1236  FFTXCodeletOptions sub_opts = {
1238  };
1239 
1240  s->scale_d = *((SCALE_TYPE *)scale);
1241  s->scale_f = s->scale_d;
1242 
1243  flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
1244  flags |= AV_TX_INPLACE; /* in-place */
1245  flags |= FF_TX_PRESHUFFLE; /* First try with an in-place transform */
1246 
1247  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
1248  inv, scale))) {
1249  flags &= ~FF_TX_PRESHUFFLE; /* Now try with a generic FFT */
1250  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
1251  inv, scale)))
1252  return ret;
1253  }
1254 
1255  s->map = av_malloc((len >> 1)*sizeof(*s->map));
1256  if (!s->map)
1257  return AVERROR(ENOMEM);
1258 
1259  /* If we need to preshuffle copy the map from the subcontext */
1260  if (s->sub[0].flags & FF_TX_PRESHUFFLE) {
1261  memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map));
1262  } else {
1263  for (int i = 0; i < len >> 1; i++)
1264  s->map[i] = i; /* identity map */
1265  }
1266 
1267  if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
1268  return ret;
1269 
1270  /* Saves a multiply in a hot path. */
1271  if (inv)
1272  for (int i = 0; i < (s->len >> 1); i++)
1273  s->map[i] <<= 1;
1274 
1275  return 0;
1276 }
1277 
/* Forward MDCT via a half-length complex FFT: fold the 2*len input into
 * len/2 complex values with pre-rotation, run the FFT in-place, then
 * post-rotate into the interleaved, strided output. */
1278 static void TX_NAME(ff_tx_mdct_fwd)(AVTXContext *s, void *_dst, void *_src,
1279  ptrdiff_t stride)
1280 {
1281  TXSample *src = _src, *dst = _dst;
1282  TXComplex *exp = s->exp, tmp, *z = _dst;
1283  const int len2 = s->len >> 1;
1284  const int len4 = s->len >> 2;
1285  const int len3 = len2 * 3;
1286  const int *sub_map = s->map;
1287 
1288  stride /= sizeof(*dst); /* stride is passed in bytes */
1289 
1290  for (int i = 0; i < len2; i++) { /* Folding and pre-reindexing */
1291  const int k = 2*i;
1292  const int idx = sub_map[i];
1293  if (k < len2) {
1294  tmp.re = FOLD(-src[ len2 + k], src[1*len2 - 1 - k]);
1295  tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);
1296  } else {
1297  tmp.re = FOLD(-src[ len2 + k], -src[5*len2 - 1 - k]);
1298  tmp.im = FOLD( src[-len2 + k], -src[1*len3 - 1 - k]);
1299  }
1300  CMUL(z[idx].im, z[idx].re, tmp.re, tmp.im, exp[i].re, exp[i].im);
1301  }
1302 
1303  s->fn[0](&s->sub[0], z, z, sizeof(TXComplex)); /* in-place FFT */
1304 
1305  for (int i = 0; i < len4; i++) { /* Post-rotation, mirrored pairs */
1306  const int i0 = len4 + i, i1 = len4 - i - 1;
1307  TXComplex src1 = { z[i1].re, z[i1].im };
1308  TXComplex src0 = { z[i0].re, z[i0].im };
1309 
1310  CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im,
1311  exp[i0].im, exp[i0].re);
1312  CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im,
1313  exp[i1].im, exp[i1].re);
1314  }
1315 }
1316 
/* Inverse MDCT via a half-length complex FFT: gather mirrored input pairs
 * with pre-rotation (via the doubled index map), run the FFT in-place,
 * then post-rotate the two halves against the second exponent table. */
1317 static void TX_NAME(ff_tx_mdct_inv)(AVTXContext *s, void *_dst, void *_src,
1318  ptrdiff_t stride)
1319 {
1320  TXComplex *z = _dst, *exp = s->exp;
1321  const TXSample *src = _src, *in1, *in2;
1322  const int len2 = s->len >> 1;
1323  const int len4 = s->len >> 2;
1324  const int *sub_map = s->map;
1325 
1326  stride /= sizeof(*src); /* stride is passed in bytes */
1327  in1 = src;
1328  in2 = src + ((len2*2) - 1) * stride; /* last input sample */
1329 
1330  for (int i = 0; i < len2; i++) {
1331  int k = sub_map[i]; /* map entries were pre-doubled at init */
1332  TXComplex tmp = { in2[-k*stride], in1[k*stride] };
1333  CMUL3(z[i], tmp, exp[i]);
1334  }
1335 
1336  s->fn[0](&s->sub[0], z, z, sizeof(TXComplex)); /* in-place FFT */
1337 
1338  exp += len2; /* switch to the post-rotation half of the table */
1339  for (int i = 0; i < len4; i++) {
1340  const int i0 = len4 + i, i1 = len4 - i - 1;
1341  TXComplex src1 = { z[i1].im, z[i1].re };
1342  TXComplex src0 = { z[i0].im, z[i0].re };
1343 
1344  CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);
1345  CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);
1346  }
1347 }
1348 
/* Codelet descriptor for the FFT-based forward MDCT. */
1349 static const FFTXCodelet TX_NAME(ff_tx_mdct_fwd_def) = {
1350  .name = TX_NAME_STR("mdct_fwd"),
1351  .function = TX_NAME(ff_tx_mdct_fwd),
1352  .type = TX_TYPE(MDCT),
1354  .factors = { 2, TX_FACTOR_ANY },
1355  .nb_factors = 2,
1356  .min_len = 2,
1357  .max_len = TX_LEN_UNLIMITED,
1358  .init = TX_NAME(ff_tx_mdct_init),
1360  .prio = FF_TX_PRIO_BASE,
1361 };
1362 
/* Codelet descriptor for the FFT-based inverse MDCT. */
1363 static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_def) = {
1364  .name = TX_NAME_STR("mdct_inv"),
1365  .function = TX_NAME(ff_tx_mdct_inv),
1366  .type = TX_TYPE(MDCT),
1368  .factors = { 2, TX_FACTOR_ANY },
1369  .nb_factors = 2,
1370  .min_len = 2,
1371  .max_len = TX_LEN_UNLIMITED,
1372  .init = TX_NAME(ff_tx_mdct_init),
1374  .prio = FF_TX_PRIO_BASE,
1375 };
1376 
/* Initializer for the full-length inverse MDCT wrapper (referenced below
 * as part of ff_tx_mdct_inv_full_def): strips AV_TX_FULL_IMDCT and
 * creates a regular (half-output) inverse MDCT sub-transform. */
1378  const FFTXCodelet *cd,
1379  uint64_t flags,
1381  int len, int inv,
1382  const void *scale)
1383 {
1384  int ret;
1385 
1386  s->scale_d = *((SCALE_TYPE *)scale);
1387  s->scale_f = s->scale_d;
1388 
1389  flags &= ~AV_TX_FULL_IMDCT;
1390 
1391  if ((ret = ff_tx_init_subtx(s, TX_TYPE(MDCT), flags, NULL, len, 1, scale)))
1392  return ret;
1393 
1394  return 0;
1395 }
1396 
1397 static void TX_NAME(ff_tx_mdct_inv_full)(AVTXContext *s, void *_dst,
1398  void *_src, ptrdiff_t stride)
1399 {
1400  int len = s->len << 1;
1401  int len2 = len >> 1;
1402  int len4 = len >> 2;
1403  TXSample *dst = _dst;
1404 
1405  s->fn[0](&s->sub[0], dst + len4, _src, stride);
1406 
1407  stride /= sizeof(*dst);
1408 
1409  for (int i = 0; i < len4; i++) {
1410  dst[ i*stride] = -dst[(len2 - i - 1)*stride];
1411  dst[(len - i - 1)*stride] = dst[(len2 + i + 0)*stride];
1412  }
1413 }
1414 
/* Codelet descriptor for the full-length inverse MDCT wrapper. */
1416 static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_full_def) = {
1417  .name = TX_NAME_STR("mdct_inv_full"),
1418  .function = TX_NAME(ff_tx_mdct_inv_full),
1419  .type = TX_TYPE(MDCT),
1420  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE |
1421  .factors = { 2, TX_FACTOR_ANY },
1422  .nb_factors = 2,
1423  .min_len = 2,
1424  .max_len = TX_LEN_UNLIMITED,
1427  .prio = FF_TX_PRIO_BASE,
1428 };
1429 
/* Initializer for the compound MDCT codelets
 * (TX_NAME(ff_tx_mdct_pfa_init), referenced by the DECL_COMP_* codelets
 * below): splits the half-length into factor * sub_len, sets up the FFT
 * sub-transform, compound map, exponent table and scratch buffer. */
1431  const FFTXCodelet *cd,
1432  uint64_t flags,
1434  int len, int inv,
1435  const void *scale)
1436 {
1437  int ret, sub_len;
1438  FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER };
1439 
1440  len >>= 1;
1441  sub_len = len / cd->factors[0];
1442 
1443  s->scale_d = *((SCALE_TYPE *)scale);
1444  s->scale_f = s->scale_d;
1445 
1446  flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
1447  flags |= AV_TX_INPLACE; /* in-place */
1448  flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
1449 
1450  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1451  sub_len, inv, scale)))
1452  return ret;
1453 
1454  if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len)))
1455  return ret;
1456 
1457  /* Our 15-point transform is also a compound one, so embed its input map */
1458  if (cd->factors[0] == 15)
1459  TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5);
1460 
1461  if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
1462  return ret;
1463 
1464  /* Saves multiplies in loops. */
1465  for (int i = 0; i < len; i++)
1466  s->map[i] <<= 1;
1467 
1468  if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1469  return AVERROR(ENOMEM);
1470 
1471  TX_TAB(ff_tx_init_tabs)(len / sub_len);
1472 
1473  return 0;
1474 }
1475 
/* Declares a compound inverse MDCT codelet for prime factor N: pre-rotates
 * N-point groups, runs the fixed fft##N kernels plus the M-point FFT
 * sub-transform, then post-rotates mirrored pairs into the output. */
1476 #define DECL_COMP_IMDCT(N) \
1477 static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst, \
1478  void *_src, ptrdiff_t stride) \
1479 { \
1480  TXComplex fft##N##in[N]; \
1481  TXComplex *z = _dst, *exp = s->exp; \
1482  const TXSample *src = _src, *in1, *in2; \
1483  const int len4 = s->len >> 2; \
1484  const int len2 = s->len >> 1; \
1485  const int m = s->sub->len; \
1486  const int *in_map = s->map, *out_map = in_map + N*m; \
1487  const int *sub_map = s->sub->map; \
1488  \
1489  stride /= sizeof(*src); /* To convert it from bytes */ \
1490  in1 = src; \
1491  in2 = src + ((N*m*2) - 1) * stride; \
1492  \
1493  for (int i = 0; i < len2; i += N) { \
1494  for (int j = 0; j < N; j++) { \
1495  const int k = in_map[j]; \
1496  TXComplex tmp = { in2[-k*stride], in1[k*stride] }; \
1497  CMUL3(fft##N##in[j], tmp, exp[j]); \
1498  } \
1499  fft##N(s->tmp + *(sub_map++), fft##N##in, m); \
1500  exp += N; \
1501  in_map += N; \
1502  } \
1503  \
1504  for (int i = 0; i < N; i++) \
1505  s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex)); \
1506  \
1507  for (int i = 0; i < len4; i++) { \
1508  const int i0 = len4 + i, i1 = len4 - i - 1; \
1509  const int s0 = out_map[i0], s1 = out_map[i1]; \
1510  TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re }; \
1511  TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re }; \
1512  \
1513  CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re); \
1514  CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re); \
1515  } \
1516 } \
1517  \
1518 static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_inv_def) = { \
1519  .name = TX_NAME_STR("mdct_pfa_" #N "xM_inv"), \
1520  .function = TX_NAME(ff_tx_mdct_pfa_##N##xM_inv), \
1521  .type = TX_TYPE(MDCT), \
1522  .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY, \
1523  .factors = { N, TX_FACTOR_ANY }, \
1524  .nb_factors = 2, \
1525  .min_len = N*2, \
1526  .max_len = TX_LEN_UNLIMITED, \
1527  .init = TX_NAME(ff_tx_mdct_pfa_init), \
1528  .cpu_flags = FF_TX_CPU_FLAGS_ALL, \
1529  .prio = FF_TX_PRIO_BASE, \
1530 };
1531 
/* Instantiate compound inverse MDCT codelets for each supported factor. */
1532 DECL_COMP_IMDCT(3)
1533 DECL_COMP_IMDCT(5)
1534 DECL_COMP_IMDCT(7)
1535 DECL_COMP_IMDCT(9)
1536 DECL_COMP_IMDCT(15)
1537 
/* Declares a compound forward MDCT codelet for prime factor N: folds and
 * pre-rotates the input into N-point groups, runs the fixed fft##N
 * kernels plus the M-point FFT sub-transform, then post-rotates mirrored
 * pairs into the interleaved, strided output. */
1538 #define DECL_COMP_MDCT(N) \
1539 static void TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd)(AVTXContext *s, void *_dst, \
1540  void *_src, ptrdiff_t stride) \
1541 { \
1542  TXComplex fft##N##in[N]; \
1543  TXSample *src = _src, *dst = _dst; \
1544  TXComplex *exp = s->exp, tmp; \
1545  const int m = s->sub->len; \
1546  const int len4 = N*m; \
1547  const int len3 = len4 * 3; \
1548  const int len8 = s->len >> 2; \
1549  const int *in_map = s->map, *out_map = in_map + N*m; \
1550  const int *sub_map = s->sub->map; \
1551  \
1552  stride /= sizeof(*dst); \
1553  \
1554  for (int i = 0; i < m; i++) { /* Folding and pre-reindexing */ \
1555  for (int j = 0; j < N; j++) { \
1556  const int k = in_map[i*N + j]; \
1557  if (k < len4) { \
1558  tmp.re = FOLD(-src[ len4 + k], src[1*len4 - 1 - k]); \
1559  tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]); \
1560  } else { \
1561  tmp.re = FOLD(-src[ len4 + k], -src[5*len4 - 1 - k]); \
1562  tmp.im = FOLD( src[-len4 + k], -src[1*len3 - 1 - k]); \
1563  } \
1564  CMUL(fft##N##in[j].im, fft##N##in[j].re, tmp.re, tmp.im, \
1565  exp[k >> 1].re, exp[k >> 1].im); \
1566  } \
1567  fft##N(s->tmp + sub_map[i], fft##N##in, m); \
1568  } \
1569  \
1570  for (int i = 0; i < N; i++) \
1571  s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex)); \
1572  \
1573  for (int i = 0; i < len8; i++) { \
1574  const int i0 = len8 + i, i1 = len8 - i - 1; \
1575  const int s0 = out_map[i0], s1 = out_map[i1]; \
1576  TXComplex src1 = { s->tmp[s1].re, s->tmp[s1].im }; \
1577  TXComplex src0 = { s->tmp[s0].re, s->tmp[s0].im }; \
1578  \
1579  CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im, \
1580  exp[i0].im, exp[i0].re); \
1581  CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im, \
1582  exp[i1].im, exp[i1].re); \
1583  } \
1584 } \
1585  \
1586 static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd_def) = { \
1587  .name = TX_NAME_STR("mdct_pfa_" #N "xM_fwd"), \
1588  .function = TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd), \
1589  .type = TX_TYPE(MDCT), \
1590  .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY, \
1591  .factors = { N, TX_FACTOR_ANY }, \
1592  .nb_factors = 2, \
1593  .min_len = N*2, \
1594  .max_len = TX_LEN_UNLIMITED, \
1595  .init = TX_NAME(ff_tx_mdct_pfa_init), \
1596  .cpu_flags = FF_TX_CPU_FLAGS_ALL, \
1597  .prio = FF_TX_PRIO_BASE, \
1598 };
1599 
/* Instantiate compound forward MDCT codelets for each supported factor. */
1600 DECL_COMP_MDCT(3)
1601 DECL_COMP_MDCT(5)
1602 DECL_COMP_MDCT(7)
1603 DECL_COMP_MDCT(9)
1604 DECL_COMP_MDCT(15)
1605 
/* Initializer for the RDFT codelets (TX_NAME(ff_tx_rdft_init), referenced
 * by the DECL_RDFT* codelets below): creates a half-length complex FFT
 * sub-transform and fills the factor table: 8 scale/combination factors
 * followed by len4 cosines and len4 (sign-adjusted) sines. */
1607  const FFTXCodelet *cd,
1608  uint64_t flags,
1610  int len, int inv,
1611  const void *scale)
1612 {
1613  int ret;
1614  double f, m;
1615  TXSample *tab;
1616  uint64_t r2r = flags & AV_TX_REAL_TO_REAL;
1617  int len4 = FFALIGN(len, 4) / 4;
1618 
1619  s->scale_d = *((SCALE_TYPE *)scale);
1620  s->scale_f = s->scale_d;
1621 
1623 
1624  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, NULL, len >> 1, inv, scale)))
1625  return ret;
1626 
1627  if (!(s->exp = av_mallocz((8 + 2*len4)*sizeof(*s->exp))))
1628  return AVERROR(ENOMEM);
1629 
1630  tab = (TXSample *)s->exp;
1631 
1632  f = 2*M_PI/len;
1633 
1634  m = (inv ? 2*s->scale_d : s->scale_d);
1635 
1636  *tab++ = RESCALE((inv ? 0.5 : 1.0) * m); /* fact[0]: DC re scale */
1637  *tab++ = RESCALE(inv ? 0.5*m : 1.0*m); /* fact[1]: DC im scale */
1638  *tab++ = RESCALE( m); /* fact[2]: mid re scale */
1639  *tab++ = RESCALE(-m); /* fact[3]: mid im scale */
1640 
1641  *tab++ = RESCALE( (0.5 - 0.0) * m); /* fact[4] */
1642  if (r2r)
1643  *tab++ = 1 / s->scale_f; /* fact[5]: unscale for r2r mid output */
1644  else
1645  *tab++ = RESCALE( (0.0 - 0.5) * m); /* fact[5] */
1646  *tab++ = RESCALE( (0.5 - inv) * m); /* fact[6] */
1647  *tab++ = RESCALE(-(0.5 - inv) * m); /* fact[7] */
1648 
1649  for (int i = 0; i < len4; i++)
1650  *tab++ = RESCALE(cos(i*f)); /* twiddle cosines */
1651 
1652  tab = ((TXSample *)s->exp) + len4 + 8;
1653 
1654  for (int i = 0; i < len4; i++)
1655  *tab++ = RESCALE(cos(((len - i*4)/4.0)*f)) * (inv ? 1 : -1); /* sines */
1656 
1657  return 0;
1658 }
1659 
/* Declares a full-output RDFT codelet: computes a real-to-complex (or
 * complex-to-real, when inv) DFT via a half-length complex FFT plus an
 * even/odd split-and-twiddle recombination pass. */
1660 #define DECL_RDFT(n, inv) \
1661 static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst, \
1662  void *_src, ptrdiff_t stride) \
1663 { \
1664  const int len2 = s->len >> 1; \
1665  const int len4 = s->len >> 2; \
1666  const TXSample *fact = (void *)s->exp; \
1667  const TXSample *tcos = fact + 8; \
1668  const TXSample *tsin = tcos + len4; \
1669  TXComplex *data = inv ? _src : _dst; \
1670  TXComplex t[3]; \
1671  \
1672  if (!inv) \
1673  s->fn[0](&s->sub[0], data, _src, sizeof(TXComplex)); \
1674  else \
1675  data[0].im = data[len2].re; \
1676  \
1677  /* The DC value's both components are real, but we need to change them \
1678  * into complex values. Also, the middle of the array is special-cased. \
1679  * These operations can be done before or after the loop. */ \
1680  t[0].re = data[0].re; \
1681  data[0].re = t[0].re + data[0].im; \
1682  data[0].im = t[0].re - data[0].im; \
1683  data[ 0].re = MULT(fact[0], data[ 0].re); \
1684  data[ 0].im = MULT(fact[1], data[ 0].im); \
1685  data[len4].re = MULT(fact[2], data[len4].re); \
1686  data[len4].im = MULT(fact[3], data[len4].im); \
1687  \
1688  for (int i = 1; i < len4; i++) { \
1689  /* Separate even and odd FFTs */ \
1690  t[0].re = MULT(fact[4], (data[i].re + data[len2 - i].re)); \
1691  t[0].im = MULT(fact[5], (data[i].im - data[len2 - i].im)); \
1692  t[1].re = MULT(fact[6], (data[i].im + data[len2 - i].im)); \
1693  t[1].im = MULT(fact[7], (data[i].re - data[len2 - i].re)); \
1694  \
1695  /* Apply twiddle factors to the odd FFT and add to the even FFT */ \
1696  CMUL(t[2].re, t[2].im, t[1].re, t[1].im, tcos[i], tsin[i]); \
1697  \
1698  data[ i].re = t[0].re + t[2].re; \
1699  data[ i].im = t[2].im - t[0].im; \
1700  data[len2 - i].re = t[0].re - t[2].re; \
1701  data[len2 - i].im = t[2].im + t[0].im; \
1702  } \
1703  \
1704  if (inv) { \
1705  s->fn[0](&s->sub[0], _dst, data, sizeof(TXComplex)); \
1706  } else { \
1707  /* Move [0].im to the last position, as convention requires */ \
1708  data[len2].re = data[0].im; \
1709  data[ 0].im = data[len2].im = 0; \
1710  } \
1711 } \
1712  \
1713 static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = { \
1714  .name = TX_NAME_STR("rdft_" #n), \
1715  .function = TX_NAME(ff_tx_rdft_ ##n), \
1716  .type = TX_TYPE(RDFT), \
1717  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE | \
1718  (inv ? FF_TX_INVERSE_ONLY : FF_TX_FORWARD_ONLY), \
1719  .factors = { 4, TX_FACTOR_ANY }, \
1720  .nb_factors = 2, \
1721  .min_len = 4, \
1722  .max_len = TX_LEN_UNLIMITED, \
1723  .init = TX_NAME(ff_tx_rdft_init), \
1724  .cpu_flags = FF_TX_CPU_FLAGS_ALL, \
1725  .prio = FF_TX_PRIO_BASE, \
1726 };
1727 
/* Instantiate the forward (real-to-complex) RDFT codelet. */
1728 DECL_RDFT(r2c, 0)
1730 
/* Declares a half-output, forward-only RDFT codelet returning either only
 * the real (AV_TX_REAL_TO_REAL) or only the imaginary
 * (AV_TX_REAL_TO_IMAGINARY) coefficients; mod2 selects the variant for
 * lengths that are 2 mod 4, which special-cases the middle bin. */
1731 #define DECL_RDFT_HALF(n, mode, mod2) \
1732 static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst, \
1733  void *_src, ptrdiff_t stride) \
1734 { \
1735  const int len = s->len; \
1736  const int len2 = len >> 1; \
1737  const int len4 = len >> 2; \
1738  const int aligned_len4 = FFALIGN(len, 4)/4; \
1739  const TXSample *fact = (void *)s->exp; \
1740  const TXSample *tcos = fact + 8; \
1741  const TXSample *tsin = tcos + aligned_len4; \
1742  TXComplex *data = _dst; \
1743  TXSample *out = _dst; /* Half-complex is forward-only */ \
1744  TXSample tmp_dc; \
1745  av_unused TXSample tmp_mid; \
1746  TXSample tmp[4]; \
1747  TXComplex sf, sl; \
1748  \
1749  s->fn[0](&s->sub[0], _dst, _src, sizeof(TXComplex)); \
1750  \
1751  tmp_dc = data[0].re; \
1752  data[ 0].re = tmp_dc + data[0].im; \
1753  tmp_dc = tmp_dc - data[0].im; \
1754  \
1755  data[ 0].re = MULT(fact[0], data[ 0].re); \
1756  tmp_dc = MULT(fact[1], tmp_dc); \
1757  data[len4].re = MULT(fact[2], data[len4].re); \
1758  \
1759  if (!mod2) { \
1760  data[len4].im = MULT(fact[3], data[len4].im); \
1761  } else { \
1762  sf = data[len4]; \
1763  sl = data[len4 + 1]; \
1764  if (mode == AV_TX_REAL_TO_REAL) \
1765  tmp[0] = MULT(fact[4], (sf.re + sl.re)); \
1766  else \
1767  tmp[0] = MULT(fact[5], (sf.im - sl.im)); \
1768  tmp[1] = MULT(fact[6], (sf.im + sl.im)); \
1769  tmp[2] = MULT(fact[7], (sf.re - sl.re)); \
1770  \
1771  if (mode == AV_TX_REAL_TO_REAL) { \
1772  tmp[3] = tmp[1]*tcos[len4] - tmp[2]*tsin[len4]; \
1773  tmp_mid = (tmp[0] - tmp[3]); \
1774  } else { \
1775  tmp[3] = tmp[1]*tsin[len4] + tmp[2]*tcos[len4]; \
1776  tmp_mid = (tmp[0] + tmp[3]); \
1777  } \
1778  } \
1779  \
1780  /* NOTE: unrolling this breaks non-mod8 lengths */ \
1781  for (int i = 1; i <= len4; i++) { \
1782  TXSample tmp[4]; \
1783  TXComplex sf = data[i]; \
1784  TXComplex sl = data[len2 - i]; \
1785  \
1786  if (mode == AV_TX_REAL_TO_REAL) \
1787  tmp[0] = MULT(fact[4], (sf.re + sl.re)); \
1788  else \
1789  tmp[0] = MULT(fact[5], (sf.im - sl.im)); \
1790  \
1791  tmp[1] = MULT(fact[6], (sf.im + sl.im)); \
1792  tmp[2] = MULT(fact[7], (sf.re - sl.re)); \
1793  \
1794  if (mode == AV_TX_REAL_TO_REAL) { \
1795  tmp[3] = tmp[1]*tcos[i] - tmp[2]*tsin[i]; \
1796  out[i] = (tmp[0] + tmp[3]); \
1797  out[len - i] = (tmp[0] - tmp[3]); \
1798  } else { \
1799  tmp[3] = tmp[1]*tsin[i] + tmp[2]*tcos[i]; \
1800  out[i - 1] = (tmp[3] - tmp[0]); \
1801  out[len - i - 1] = (tmp[0] + tmp[3]); \
1802  } \
1803  } \
1804  \
1805  for (int i = 1; i < (len4 + (mode == AV_TX_REAL_TO_IMAGINARY)); i++) \
1806  out[len2 - i] = out[len - i]; \
1807  \
1808  if (mode == AV_TX_REAL_TO_REAL) { \
1809  out[len2] = tmp_dc; \
1810  if (mod2) \
1811  out[len4 + 1] = tmp_mid * fact[5]; \
1812  } else if (mod2) { \
1813  out[len4] = tmp_mid; \
1814  } \
1815 } \
1816  \
1817 static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = { \
1818  .name = TX_NAME_STR("rdft_" #n), \
1819  .function = TX_NAME(ff_tx_rdft_ ##n), \
1820  .type = TX_TYPE(RDFT), \
1821  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | mode | \
1822  FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY, \
1823  .factors = { 2 + 2*(!mod2), TX_FACTOR_ANY }, \
1824  .nb_factors = 2, \
1825  .min_len = 2 + 2*(!mod2), \
1826  .max_len = TX_LEN_UNLIMITED, \
1827  .init = TX_NAME(ff_tx_rdft_init), \
1828  .cpu_flags = FF_TX_CPU_FLAGS_ALL, \
1829  .prio = FF_TX_PRIO_BASE, \
1830 };
1831 
/* Instantiate the half-output real-to-real RDFT for mod-2 lengths. */
1833 DECL_RDFT_HALF(r2r_mod2, AV_TX_REAL_TO_REAL, 1)
1836 
/* Initializer for the DCT-II/DCT-III codelets (TX_NAME(ff_tx_dct_init),
 * referenced by the descriptors below): creates an RDFT sub-transform and
 * builds a table of len cosines followed by len/2 combination factors
 * (cosines for the forward case, half-cosecants for the inverse). */
1838  const FFTXCodelet *cd,
1839  uint64_t flags,
1841  int len, int inv,
1842  const void *scale)
1843 {
1844  int ret;
1845  double freq;
1846  TXSample *tab;
1847  SCALE_TYPE rsc = *((SCALE_TYPE *)scale);
1848 
1849  if (inv) {
1850  len *= 2;
1851  s->len *= 2;
1852  rsc *= 0.5; /* compensate for the doubled length */
1853  }
1854 
1855  if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL, len, inv, &rsc)))
1856  return ret;
1857 
1858  s->exp = av_malloc((len/2)*3*sizeof(TXSample));
1859  if (!s->exp)
1860  return AVERROR(ENOMEM);
1861 
1862  tab = (TXSample *)s->exp;
1863 
1864  freq = M_PI/(len*2);
1865 
1866  for (int i = 0; i < len; i++)
1867  tab[i] = RESCALE(cos(i*freq)*(!inv + 1));
1868 
1869  if (inv) {
1870  for (int i = 0; i < len/2; i++)
1871  tab[len + i] = RESCALE(0.5 / sin((2*i + 1)*freq));
1872  } else {
1873  for (int i = 0; i < len/2; i++)
1874  tab[len + i] = RESCALE(cos((len - 2*i - 1)*freq));
1875  }
1876 
1877  return 0;
1879 
/* DCT-II via an RDFT: folds input pairs into the buffer in-place, runs
 * the RDFT sub-transform, then recombines coefficients with a backwards
 * running-sum pass. Note: this transform modifies its input buffer. */
1880 static void TX_NAME(ff_tx_dctII)(AVTXContext *s, void *_dst,
1881  void *_src, ptrdiff_t stride)
1882 {
1883  TXSample *dst = _dst;
1884  TXSample *src = _src;
1885  const int len = s->len;
1886  const int len2 = len >> 1;
1887  const TXSample *exp = (void *)s->exp;
1888  TXSample next;
1889 #ifdef TX_INT32
1890  int64_t tmp1, tmp2; /* 64-bit intermediates to avoid overflow */
1891 #else
1892  TXSample tmp1, tmp2;
1893 #endif
1894 
1895  for (int i = 0; i < len2; i++) {
1896  TXSample in1 = src[i];
1897  TXSample in2 = src[len - i - 1];
1898  TXSample s = exp[len + i]; /* NB: shadows the context pointer here */
1899 
1900 #ifdef TX_INT32
1901  tmp1 = in1 + in2;
1902  tmp2 = in1 - in2;
1903 
1904  tmp1 >>= 1;
1905  tmp2 *= s;
1906 
1907  tmp2 = (tmp2 + 0x40000000) >> 31; /* round back to 32 bits */
1908 #else
1909  tmp1 = (in1 + in2)*0.5;
1910  tmp2 = (in1 - in2)*s;
1911 #endif
1912 
1913  src[i] = tmp1 + tmp2;
1914  src[len - i - 1] = tmp1 - tmp2;
1915  }
1916 
1917  s->fn[0](&s->sub[0], dst, src, sizeof(TXComplex));
1918 
1919  next = dst[len];
1920 
1921  for (int i = len - 2; i > 0; i -= 2) {
1922  TXSample tmp;
1923 
1924  CMUL(tmp, dst[i], exp[len - i], exp[i], dst[i + 0], dst[i + 1]);
1925 
1926  dst[i + 1] = next;
1927 
1928  next += tmp; /* running sum of the odd coefficients */
1929  }
1930 
1931 #ifdef TX_INT32
1932  tmp1 = ((int64_t)exp[0]) * ((int64_t)dst[0]);
1933  dst[0] = (tmp1 + 0x40000000) >> 31;
1934 #else
1935  dst[0] = exp[0] * dst[0];
1936 #endif
1937  dst[1] = next;
1939 
/* DCT-III via an RDFT: pre-rotates coefficient pairs in-place (writing
 * one slot past len, so the input buffer must hold len + 1 samples), runs
 * the RDFT sub-transform, then recombines mirrored output pairs. */
1940 static void TX_NAME(ff_tx_dctIII)(AVTXContext *s, void *_dst,
1941  void *_src, ptrdiff_t stride)
1942 {
1943  TXSample *dst = _dst;
1944  TXSample *src = _src;
1945  const int len = s->len;
1946  const int len2 = len >> 1;
1947  const TXSample *exp = (void *)s->exp;
1948 #ifdef TX_INT32
1949  int64_t tmp1, tmp2 = src[len - 1];
1950  tmp2 = (2*tmp2 + 0x40000000) >> 31; /* round back to 32 bits */
1951 #else
1952  TXSample tmp1, tmp2 = 2*src[len - 1];
1953 #endif
1954 
1955  src[len] = tmp2;
1956 
1957  for (int i = len - 2; i >= 2; i -= 2) {
1958  TXSample val1 = src[i - 0];
1959  TXSample val2 = src[i - 1] - src[i + 1];
1960 
1961  CMUL(src[i + 1], src[i], exp[len - i], exp[i], val1, val2);
1962  }
1963 
1964  /* NOTE(review): sizeof(float) as the stride element size looks
1965  * inconsistent with ff_tx_dctI below, which passes sizeof(TXSample);
1966  * for the non-float templates these differ — verify intent. */
1964  s->fn[0](&s->sub[0], dst, src, sizeof(float));
1965 
1966  for (int i = 0; i < len2; i++) {
1967  TXSample in1 = dst[i];
1968  TXSample in2 = dst[len - i - 1];
1969  TXSample c = exp[len + i];
1970 
1971  tmp1 = in1 + in2;
1972  tmp2 = in1 - in2;
1973  tmp2 *= c;
1974 #ifdef TX_INT32
1975  tmp2 = (tmp2 + 0x40000000) >> 31;
1976 #endif
1977 
1978  dst[i] = tmp1 + tmp2;
1979  dst[len - i - 1] = tmp1 - tmp2;
1980  }
1981 }
1982 
/* Codelet descriptor for the RDFT-based DCT-II. */
1983 static const FFTXCodelet TX_NAME(ff_tx_dctII_def) = {
1984  .name = TX_NAME_STR("dctII"),
1985  .function = TX_NAME(ff_tx_dctII),
1986  .type = TX_TYPE(DCT),
1987  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE |
1989  .factors = { 2, TX_FACTOR_ANY },
1990  .min_len = 2,
1991  .max_len = TX_LEN_UNLIMITED,
1992  .init = TX_NAME(ff_tx_dct_init),
1994  .prio = FF_TX_PRIO_BASE,
1995 };
1996 
/* Codelet descriptor for the RDFT-based DCT-III. */
1997 static const FFTXCodelet TX_NAME(ff_tx_dctIII_def) = {
1998  .name = TX_NAME_STR("dctIII"),
1999  .function = TX_NAME(ff_tx_dctIII),
2000  .type = TX_TYPE(DCT),
2001  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE |
2003  .factors = { 2, TX_FACTOR_ANY },
2004  .min_len = 2,
2005  .max_len = TX_LEN_UNLIMITED,
2006  .init = TX_NAME(ff_tx_dct_init),
2008  .prio = FF_TX_PRIO_BASE,
2009 };
2010 
/* Shared initializer for the DCT-I and DST-I codelets
 * (TX_NAME(ff_tx_dcstI_init), referenced by the descriptors below):
 * creates a half-complex RDFT over the symmetrically-extended length and
 * allocates the extension scratch buffer. */
2012  const FFTXCodelet *cd,
2013  uint64_t flags,
2015  int len, int inv,
2016  const void *scale)
2017 {
2018  int ret;
2019  SCALE_TYPE rsc = *((SCALE_TYPE *)scale);
2020 
2021  if (inv) {
2022  len *= 2;
2023  s->len *= 2;
2024  rsc *= 0.5; /* compensate for the doubled length */
2025  }
2026 
2027  /* We want a half-complex RDFT */
2028  flags |= cd->type == TX_TYPE(DCT_I) ? AV_TX_REAL_TO_REAL :
2030 
2031  if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL,
2032  (len - 1 + 2*(cd->type == TX_TYPE(DST_I)))*2,
2033  0, &rsc)))
2034  return ret;
2035 
2036  s->tmp = av_mallocz((len + 1)*2*sizeof(TXSample));
2037  if (!s->tmp)
2038  return AVERROR(ENOMEM);
2039 
2040  return 0;
2042 
2042 
/* DCT-I via an RDFT of the evenly (mirror-) extended input: copies the
 * strided input into the scratch buffer with even symmetry, then runs
 * the half-complex RDFT sub-transform. */
2043 static void TX_NAME(ff_tx_dctI)(AVTXContext *s, void *_dst,
2044  void *_src, ptrdiff_t stride)
2045 {
2046  TXSample *dst = _dst;
2047  TXSample *src = _src;
2048  const int len = s->len - 1;
2049  TXSample *tmp = (TXSample *)s->tmp;
2050 
2051  stride /= sizeof(TXSample); /* stride is passed in bytes */
2052 
2053  for (int i = 0; i < len; i++)
2054  tmp[i] = tmp[2*len - i] = src[i * stride];
2055 
2056  tmp[len] = src[len * stride]; /* Middle */
2057 
2058  s->fn[0](&s->sub[0], dst, tmp, sizeof(TXSample));
2060 
2061 static void TX_NAME(ff_tx_dstI)(AVTXContext *s, void *_dst,
2062  void *_src, ptrdiff_t stride)
2063 {
2064  TXSample *dst = _dst;
2065  TXSample *src = _src;
2066  const int len = s->len + 1;
2067  TXSample *tmp = (void *)s->tmp;
2068 
2069  stride /= sizeof(TXSample);
2070 
2071  tmp[0] = 0;
2072 
2073  for (int i = 1; i < len; i++) {
2074  TXSample a = src[(i - 1) * stride];
2075  tmp[i] = -a;
2076  tmp[2*len - i] = a;
2077  }
2078 
2079  tmp[len] = 0; /* i == n, Nyquist */
2080 
2081  s->fn[0](&s->sub[0], dst, tmp, sizeof(float));
2082 }
2083 
2084 static const FFTXCodelet TX_NAME(ff_tx_dctI_def) = {
2085  .name = TX_NAME_STR("dctI"),
2086  .function = TX_NAME(ff_tx_dctI),
2087  .type = TX_TYPE(DCT_I),
2089  .factors = { 2, TX_FACTOR_ANY },
2090  .nb_factors = 2,
2091  .min_len = 2,
2092  .max_len = TX_LEN_UNLIMITED,
2093  .init = TX_NAME(ff_tx_dcstI_init),
2095  .prio = FF_TX_PRIO_BASE,
2096 };
2097 
2098 static const FFTXCodelet TX_NAME(ff_tx_dstI_def) = {
2099  .name = TX_NAME_STR("dstI"),
2100  .function = TX_NAME(ff_tx_dstI),
2101  .type = TX_TYPE(DST_I),
2103  .factors = { 2, TX_FACTOR_ANY },
2104  .nb_factors = 2,
2105  .min_len = 2,
2106  .max_len = TX_LEN_UNLIMITED,
2107  .init = TX_NAME(ff_tx_dcstI_init),
2109  .prio = FF_TX_PRIO_BASE,
2110 };
2111 
2112 int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s, int *pre_tab)
2113 {
2114  int off = 0;
2115  int len4 = s->len >> 1;
2116  double scale = s->scale_d;
2117  const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0;
2118  size_t alloc = pre_tab ? 2*len4 : len4;
2119 
2120  if (!(s->exp = av_malloc_array(alloc, sizeof(*s->exp))))
2121  return AVERROR(ENOMEM);
2122 
2123  scale = sqrt(fabs(scale));
2124 
2125  if (pre_tab)
2126  off = len4;
2127 
2128  for (int i = 0; i < len4; i++) {
2129  const double alpha = M_PI_2 * (i + theta) / len4;
2130  s->exp[off + i] = (TXComplex){ RESCALE(cos(alpha) * scale),
2131  RESCALE(sin(alpha) * scale) };
2132  }
2133 
2134  if (pre_tab)
2135  for (int i = 0; i < len4; i++)
2136  s->exp[i] = s->exp[len4 + pre_tab[i]];
2137 
2138  return 0;
2139 }
2140 
2141 const FFTXCodelet * const TX_NAME(ff_tx_codelet_list)[] = {
2142  /* Split-Radix codelets */
2143  &TX_NAME(ff_tx_fft2_ns_def),
2144  &TX_NAME(ff_tx_fft4_ns_def),
2145  &TX_NAME(ff_tx_fft8_ns_def),
2146  &TX_NAME(ff_tx_fft16_ns_def),
2147  &TX_NAME(ff_tx_fft32_ns_def),
2148  &TX_NAME(ff_tx_fft64_ns_def),
2149  &TX_NAME(ff_tx_fft128_ns_def),
2150  &TX_NAME(ff_tx_fft256_ns_def),
2151  &TX_NAME(ff_tx_fft512_ns_def),
2152  &TX_NAME(ff_tx_fft1024_ns_def),
2153  &TX_NAME(ff_tx_fft2048_ns_def),
2154  &TX_NAME(ff_tx_fft4096_ns_def),
2155  &TX_NAME(ff_tx_fft8192_ns_def),
2156  &TX_NAME(ff_tx_fft16384_ns_def),
2157  &TX_NAME(ff_tx_fft32768_ns_def),
2158  &TX_NAME(ff_tx_fft65536_ns_def),
2159  &TX_NAME(ff_tx_fft131072_ns_def),
2160  &TX_NAME(ff_tx_fft262144_ns_def),
2161  &TX_NAME(ff_tx_fft524288_ns_def),
2162  &TX_NAME(ff_tx_fft1048576_ns_def),
2163  &TX_NAME(ff_tx_fft2097152_ns_def),
2164 
2165  /* Prime factor codelets */
2166  &TX_NAME(ff_tx_fft3_ns_def),
2167  &TX_NAME(ff_tx_fft5_ns_def),
2168  &TX_NAME(ff_tx_fft7_ns_def),
2169  &TX_NAME(ff_tx_fft9_ns_def),
2170  &TX_NAME(ff_tx_fft15_ns_def),
2171 
2172  /* We get these for free */
2173  &TX_NAME(ff_tx_fft3_fwd_def),
2174  &TX_NAME(ff_tx_fft5_fwd_def),
2175  &TX_NAME(ff_tx_fft7_fwd_def),
2176  &TX_NAME(ff_tx_fft9_fwd_def),
2177 
2178  /* Standalone transforms */
2179  &TX_NAME(ff_tx_fft_def),
2180  &TX_NAME(ff_tx_fft_inplace_def),
2181  &TX_NAME(ff_tx_fft_inplace_small_def),
2182  &TX_NAME(ff_tx_fft_pfa_def),
2183  &TX_NAME(ff_tx_fft_pfa_ns_def),
2184  &TX_NAME(ff_tx_fft_naive_def),
2185  &TX_NAME(ff_tx_fft_naive_small_def),
2186  &TX_NAME(ff_tx_mdct_fwd_def),
2187  &TX_NAME(ff_tx_mdct_inv_def),
2188  &TX_NAME(ff_tx_mdct_pfa_3xM_fwd_def),
2189  &TX_NAME(ff_tx_mdct_pfa_5xM_fwd_def),
2190  &TX_NAME(ff_tx_mdct_pfa_7xM_fwd_def),
2191  &TX_NAME(ff_tx_mdct_pfa_9xM_fwd_def),
2192  &TX_NAME(ff_tx_mdct_pfa_15xM_fwd_def),
2193  &TX_NAME(ff_tx_mdct_pfa_3xM_inv_def),
2194  &TX_NAME(ff_tx_mdct_pfa_5xM_inv_def),
2195  &TX_NAME(ff_tx_mdct_pfa_7xM_inv_def),
2196  &TX_NAME(ff_tx_mdct_pfa_9xM_inv_def),
2197  &TX_NAME(ff_tx_mdct_pfa_15xM_inv_def),
2198  &TX_NAME(ff_tx_mdct_naive_fwd_def),
2199  &TX_NAME(ff_tx_mdct_naive_inv_def),
2200  &TX_NAME(ff_tx_mdct_inv_full_def),
2201  &TX_NAME(ff_tx_rdft_r2c_def),
2202  &TX_NAME(ff_tx_rdft_r2r_def),
2203  &TX_NAME(ff_tx_rdft_r2r_mod2_def),
2204  &TX_NAME(ff_tx_rdft_r2i_def),
2205  &TX_NAME(ff_tx_rdft_r2i_mod2_def),
2206  &TX_NAME(ff_tx_rdft_c2r_def),
2207  &TX_NAME(ff_tx_dctII_def),
2208  &TX_NAME(ff_tx_dctIII_def),
2209  &TX_NAME(ff_tx_dctI_def),
2210  &TX_NAME(ff_tx_dstI_def),
2211 
2212  NULL,
2213 };
func
int(* func)(AVBPrint *dst, const char *in, const char *arg)
Definition: jacosubdec.c:68
DCT_I
@ DCT_I
Definition: avfft.h:121
ff_tx_fft_sr_combine
static void TX_NAME() ff_tx_fft_sr_combine(TXComplex *z, const TXSample *cos, int len)
Definition: tx_template.c:563
ff_tx_dct_init
static av_cold int TX_NAME() ff_tx_dct_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1835
AV_TX_REAL_TO_REAL
@ AV_TX_REAL_TO_REAL
Perform a real to half-complex RDFT.
Definition: tx.h:184
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
out
FILE * out
Definition: movenc.c:54
ff_ctz
#define ff_ctz
Definition: intmath.h:107
TRANSFORM
#define TRANSFORM(a0, a1, a2, a3, wre, wim)
Definition: tx_template.c:555
src1
const pixel * src1
Definition: h264pred_template.c:421
AVTXContext
Definition: tx_priv.h:235
ff_tx_fft
static void TX_NAME() ff_tx_fft(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:768
im
float im
Definition: fft.c:83
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
FFTXCodeletOptions
Definition: tx_priv.h:183
w
uint8_t w
Definition: llviddspenc.c:38
M_PI_2
#define M_PI_2
Definition: mathematics.h:73
TX_MAX_DECOMPOSITIONS
#define TX_MAX_DECOMPOSITIONS
Definition: tx_priv.h:197
SR_POW2_TABLES
#define SR_POW2_TABLES
Definition: tx_template.c:30
ff_tx_fft_pfa
static void TX_NAME() ff_tx_fft_pfa(AVTXContext *s, void *_out, void *_in, ptrdiff_t stride)
Definition: tx_template.c:1063
ff_tx_fft16_ns
static void TX_NAME() ff_tx_fft16_ns(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:680
ff_tx_gen_inplace_map
int ff_tx_gen_inplace_map(AVTXContext *s, int len)
Definition: tx.c:155
t1
#define t1
Definition: regdef.h:29
fft15
static av_always_inline void fft15(TXComplex *out, TXComplex *in, ptrdiff_t stride)
Definition: tx_template.c:466
FF_TX_CPU_FLAGS_ALL
#define FF_TX_CPU_FLAGS_ALL
Definition: tx_priv.h:230
ff_tx_gen_compound_mapping
int ff_tx_gen_compound_mapping(AVTXContext *s, FFTXCodeletOptions *opts, int inv, int n, int m)
Definition: tx.c:74
ff_tx_dctI
static void TX_NAME() ff_tx_dctI(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:2041
ff_tx_fft_naive
static void TX_NAME() ff_tx_fft_naive(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:876
av_malloc
#define av_malloc(s)
Definition: tableprint_vlc.h:30
DECL_FFT5
#define DECL_FFT5(NAME, D0, D1, D2, D3, D4)
Definition: tx_template.c:212
ff_tx_mdct_naive_fwd
static void TX_NAME() ff_tx_mdct_naive_fwd(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1149
ff_tx_rdft_init
static av_cold int TX_NAME() ff_tx_rdft_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1606
DECL_SR_CODELET_DEF
#define DECL_SR_CODELET_DEF(n)
Definition: tx_template.c:600
FFTabInitData::func
void(* func)(void)
Definition: tx_template.c:63
sr_tabs_init_funcs
static SR_POW2_TABLES void(*const sr_tabs_init_funcs[])(void)
Definition: tx_template.c:81
TX_NAME
static const FFTXCodelet TX_NAME(ff_tx_fft_def)
FF_TX_MAP_GATHER
@ FF_TX_MAP_GATHER
Definition: tx_priv.h:176
sum_d
static void sum_d(const int *input, int *output, int len)
Definition: dcadct.c:51
TX_INT32
#define TX_INT32
Definition: tx_int32.c:19
sr_tabs_init_once
static AVOnce sr_tabs_init_once[]
Definition: tx_template.c:87
val
static double val(void *priv, double ch)
Definition: aeval.c:78
DECL_FACTOR_F
#define DECL_FACTOR_F(n)
Definition: tx_template.c:518
scale
static av_always_inline float scale(float x, float s)
Definition: vf_v360.c:1389
TX_MAX_SUB
#define TX_MAX_SUB
Definition: tx_priv.h:194
TABLE_DEF
#define TABLE_DEF(name, size)
Definition: tx_template.c:27
FFTXCodelet::type
enum AVTXType type
Definition: tx_priv.h:202
FFTXCodeletOptions::map_dir
FFTXMapDirection map_dir
Definition: tx_priv.h:187
mult
static int16_t mult(Float11 *f1, Float11 *f2)
Definition: g726.c:60
ff_thread_once
static int ff_thread_once(char *control, void(*routine)(void))
Definition: thread.h:203
FF_ARRAY_ELEMS
#define FF_ARRAY_ELEMS(a)
Definition: sinewin_tablegen.c:29
av_cold
#define av_cold
Definition: attributes.h:90
FFTabInitData
Definition: tx_template.c:62
float
float
Definition: af_crystalizer.c:121
c2r
static void c2r(float *buffer, int size)
Definition: af_apsyclip.c:386
s
#define s(width, name)
Definition: cbs_vp9.c:198
ff_tx_fft_factor_init
static av_cold int TX_NAME() ff_tx_fft_factor_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:479
ff_tx_mdct_fwd
static void TX_NAME() ff_tx_mdct_fwd(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1278
t7
#define t7
Definition: regdef.h:35
ff_tx_mdct_naive_init
static av_cold int TX_NAME() ff_tx_mdct_naive_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1137
FF_TX_FORWARD_ONLY
#define FF_TX_FORWARD_ONLY
Definition: tx_priv.h:158
FFTXCodelet::cpu_flags
int cpu_flags
Definition: tx_priv.h:227
DECL_FACTOR_S
#define DECL_FACTOR_S(n)
Definition: tx_template.c:497
ff_tx_dstI
static void TX_NAME() ff_tx_dstI(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:2059
if
if(ret)
Definition: filter_design.txt:179
AV_TX_FULL_IMDCT
@ AV_TX_FULL_IMDCT
Performs a full inverse MDCT rather than leaving out samples that can be derived through symmetry.
Definition: tx.h:175
opts
AVDictionary * opts
Definition: movenc.c:50
AV_ONCE_INIT
#define AV_ONCE_INIT
Definition: thread.h:201
fabs
static __device__ float fabs(float a)
Definition: cuda_runtime.h:182
AV_TX_REAL_TO_IMAGINARY
@ AV_TX_REAL_TO_IMAGINARY
Definition: tx.h:185
NULL
#define NULL
Definition: coverity.c:32
t5
#define t5
Definition: regdef.h:33
ff_tx_mdct_init
static av_cold int TX_NAME() ff_tx_mdct_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1228
t6
#define t6
Definition: regdef.h:34
AV_TX_INPLACE
@ AV_TX_INPLACE
Allows for in-place transformations, where input == output.
Definition: tx.h:161
ff_tx_gen_ptwo_revtab
int ff_tx_gen_ptwo_revtab(AVTXContext *s, FFTXCodeletOptions *opts)
Definition: tx.c:135
r2c
static void r2c(float *buffer, int size)
Definition: af_apsyclip.c:377
FF_TX_OUT_OF_PLACE
#define FF_TX_OUT_OF_PLACE
Definition: tx_priv.h:154
CMUL3
#define CMUL3(c, a, b)
Definition: tx_priv.h:150
AV_TX_UNALIGNED
@ AV_TX_UNALIGNED
Relaxes alignment requirement for the in and out arrays of av_tx_fn().
Definition: tx.h:167
exp
int8_t exp
Definition: eval.c:72
ff_tx_dctIII
static void TX_NAME() ff_tx_dctIII(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1938
DECL_COMP_MDCT
#define DECL_COMP_MDCT(N)
Definition: tx_template.c:1538
AVOnce
#define AVOnce
Definition: thread.h:200
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
tab
static const uint8_t tab[16]
Definition: rka.c:668
ff_tx_fft_pfa_init
static av_cold int TX_NAME() ff_tx_fft_pfa_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:953
ff_tx_clear_ctx
void ff_tx_clear_ctx(AVTXContext *s)
Definition: tx.c:289
ff_tx_fft2_ns
static void TX_NAME() ff_tx_fft2_ns(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:632
FF_TX_PRESHUFFLE
#define FF_TX_PRESHUFFLE
Definition: tx_priv.h:156
ff_tx_fft_sr_codelet_init
static av_cold int TX_NAME() ff_tx_fft_sr_codelet_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:589
ff_tx_gen_default_map
int ff_tx_gen_default_map(AVTXContext *s, FFTXCodeletOptions *opts)
Definition: tx.c:524
f
f
Definition: af_crystalizer.c:121
ff_tx_init_tab_53
static av_cold void TX_TAB() ff_tx_init_tab_53(void)
Definition: tx_template.c:93
dc
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled top and top right vectors is used as motion vector prediction the used motion vector is the sum of the predictor and(mvx_diff, mvy_diff) *mv_scale Intra DC Prediction block[y][x] dc[1]
Definition: snow.txt:400
FF_TX_PRIO_BASE
@ FF_TX_PRIO_BASE
Definition: tx_priv.h:162
for
for(k=2;k<=8;++k)
Definition: h264pred_template.c:425
ff_tx_fft8_ns
static void TX_NAME() ff_tx_fft8_ns(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:661
fft9
static av_always_inline void fft9(TXComplex *out, TXComplex *in, ptrdiff_t stride)
Definition: tx_template.c:341
t8
#define t8
Definition: regdef.h:53
BF
#define BF(a, b, c, s)
Definition: dct32_template.c:90
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
TX_EMBED_INPUT_PFA_MAP
#define TX_EMBED_INPUT_PFA_MAP(map, tot_len, d1, d2)
Definition: tx_priv.h:271
ff_tx_fft_inplace
static void TX_NAME() ff_tx_fft_inplace(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:785
DECL_RDFT_HALF
#define DECL_RDFT_HALF(n, mode, mod2)
Definition: tx_template.c:1729
M_PI
#define M_PI
Definition: mathematics.h:67
ff_tx_fft_init
static av_cold int TX_NAME() ff_tx_fft_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:729
DST_I
@ DST_I
Definition: avfft.h:122
TXComplex
void TXComplex
Definition: tx_priv.h:65
ff_tx_mdct_inv
static void TX_NAME() ff_tx_mdct_inv(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1317
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:244
t4
#define t4
Definition: regdef.h:32
t3
#define t3
Definition: regdef.h:31
av_malloc_array
#define av_malloc_array(a, b)
Definition: tableprint_vlc.h:31
nptwo_tabs_init_once
static AVOnce nptwo_tabs_init_once[]
Definition: tx_template.c:140
av_always_inline
#define av_always_inline
Definition: attributes.h:49
ff_tx_fft_init_naive_small
static av_cold int TX_NAME() ff_tx_fft_init_naive_small(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:851
DECL_SR_CODELET
#define DECL_SR_CODELET(n, n2, n4)
Definition: tx_template.c:616
DECL_COMP_IMDCT
#define DECL_COMP_IMDCT(N)
Definition: tx_template.c:1476
av_mallocz
void * av_mallocz(size_t size)
Allocate a memory block with alignment suitable for all memory accesses (including vectors if availab...
Definition: mem.c:254
len
int len
Definition: vorbis_enc_data.h:426
fft3
static av_always_inline void fft3(TXComplex *out, TXComplex *in, ptrdiff_t stride)
Definition: tx_template.c:175
FF_TX_MAP_SCATTER
@ FF_TX_MAP_SCATTER
Definition: tx_priv.h:179
TX_LEN_UNLIMITED
#define TX_LEN_UNLIMITED
Definition: tx_priv.h:216
stride
#define stride
Definition: h264pred_template.c:537
nptwo_tabs_init_data
static const FFTabInitData nptwo_tabs_init_data[]
Definition: tx_template.c:134
ret
ret
Definition: filter_design.txt:187
ff_tx_init_subtx
av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx.c:710
FFSWAP
#define FFSWAP(type, a, b)
Definition: macros.h:52
ff_tx_init_tab_7
static av_cold void TX_TAB() ff_tx_init_tab_7(void)
Definition: tx_template.c:112
TX_FACTOR_ANY
#define TX_FACTOR_ANY
Definition: tx_priv.h:209
FF_TX_INVERSE_ONLY
#define FF_TX_INVERSE_ONLY
Definition: tx_priv.h:157
ff_tx_fft_naive_small
static void TX_NAME() ff_tx_fft_naive_small(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:903
ff_tx_init_tab_9
static av_cold void TX_TAB() ff_tx_init_tab_9(void)
Definition: tx_template.c:122
FFTXCodelet
Definition: tx_priv.h:199
ff_tx_init_tabs
av_cold void TX_TAB() ff_tx_init_tabs(int len)
Definition: tx_template.c:146
t2
#define t2
Definition: regdef.h:30
ff_tx_mdct_naive_inv
static void TX_NAME() ff_tx_mdct_naive_inv(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1170
FFTabInitData::factors
int factors[TX_MAX_SUB]
Definition: tx_template.c:64
ff_tx_dctII
static void TX_NAME() ff_tx_dctII(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1878
BUTTERFLIES
#define BUTTERFLIES(a0, a1, a2, a3)
Definition: tx_template.c:541
ff_tx_fft_pfa_ns
static void TX_NAME() ff_tx_fft_pfa_ns(AVTXContext *s, void *_out, void *_in, ptrdiff_t stride)
Definition: tx_template.c:1087
src0
const pixel *const src0
Definition: h264pred_template.c:420
FFTXCodelet::name
const char * name
Definition: tx_priv.h:200
factor
static const int factor[16]
Definition: vf_pp7.c:78
ff_tx_dcstI_init
static av_cold int TX_NAME() ff_tx_dcstI_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:2009
ff_tx_fft_inplace_small_init
static av_cold int TX_NAME() ff_tx_fft_inplace_small_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:755
map
const VDPAUPixFmtMap * map
Definition: hwcontext_vdpau.c:71
FFALIGN
#define FFALIGN(x, a)
Definition: macros.h:78
alpha
static const int16_t alpha[]
Definition: ilbcdata.h:55
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
fft7
static av_always_inline void fft7(TXComplex *out, TXComplex *in, ptrdiff_t stride)
Definition: tx_template.c:253
int32_t
int32_t
Definition: audioconvert.c:56
flags
#define flags(name, subs,...)
Definition: cbs_av1.c:467
ff_tx_mdct_gen_exp
int TX_TAB() ff_tx_mdct_gen_exp(AVTXContext *s, int *pre_tab)
Definition: tx_template.c:2110
ff_tx_gen_pfa_input_map
int ff_tx_gen_pfa_input_map(AVTXContext *s, FFTXCodeletOptions *opts, int d1, int d2)
Definition: tx.c:43
DECL_RDFT
#define DECL_RDFT(n, inv)
Definition: tx_template.c:1660
ff_tx_mdct_pfa_init
static av_cold int TX_NAME() ff_tx_mdct_pfa_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1430
ff_tx_fft4_ns
static void TX_NAME() ff_tx_fft4_ns(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:644
ff_tx_mdct_inv_full_init
static av_cold int TX_NAME() ff_tx_mdct_inv_full_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1377
ff_tx_decompose_length
int ff_tx_decompose_length(int dst[TX_MAX_DECOMPOSITIONS], enum AVTXType type, int len, int inv)
Definition: tx.c:411
CMUL
#define CMUL(dre, dim, are, aim, bre, bim)
Definition: fft-internal.h:35
TX_TYPE
#define TX_TYPE
Definition: aacdec.c:36
re
float re
Definition: fft.c:83
ff_tx_mdct_inv_full
static void TX_NAME() ff_tx_mdct_inv_full(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1397
FF_TX_PRIO_MIN
@ FF_TX_PRIO_MIN
Definition: tx_priv.h:167