FFmpeg
tx_template.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) Lynne
3  *
4  * Power of two FFT:
5  * Copyright (c) Lynne
6  * Copyright (c) 2008 Loren Merritt
7  * Copyright (c) 2002 Fabrice Bellard
8  * Partly based on libdjbfft by D. J. Bernstein
9  *
10  * This file is part of FFmpeg.
11  *
12  * FFmpeg is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public
14  * License as published by the Free Software Foundation; either
15  * version 2.1 of the License, or (at your option) any later version.
16  *
17  * FFmpeg is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with FFmpeg; if not, write to the Free Software
24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25  */
26 
27 #include "mem.h"
28 
/* Declares the twiddle table ff_tx_tab_<tab_id> (name routed through TX_TAB)
 * as an array of nb_entries TXSample values, using DECLARE_ALIGNED with an
 * alignment argument of 32. */
#define TABLE_DEF(tab_id, nb_entries) \
    DECLARE_ALIGNED(32, TXSample, TX_TAB(ff_tx_tab_ ##tab_id))[nb_entries]
31 
/* X-macro list of every power-of-two length (8 ... 2097152) that gets a
 * table. Each entry goes through SR_TABLE(), which this file defines and
 * undefines several times with different expansions, so expanding
 * SR_POW2_TABLES yields one item per length for whichever SR_TABLE is
 * current. The trailing backslash on the last entry continues the macro onto
 * the blank line that follows, so that line must stay empty. */
32 #define SR_POW2_TABLES \
33  SR_TABLE(8) \
34  SR_TABLE(16) \
35  SR_TABLE(32) \
36  SR_TABLE(64) \
37  SR_TABLE(128) \
38  SR_TABLE(256) \
39  SR_TABLE(512) \
40  SR_TABLE(1024) \
41  SR_TABLE(2048) \
42  SR_TABLE(4096) \
43  SR_TABLE(8192) \
44  SR_TABLE(16384) \
45  SR_TABLE(32768) \
46  SR_TABLE(65536) \
47  SR_TABLE(131072) \
48  SR_TABLE(262144) \
49  SR_TABLE(524288) \
50  SR_TABLE(1048576) \
51  SR_TABLE(2097152) \
52 
53 #define SR_TABLE(len) \
54  TABLE_DEF(len, len/4 + 1);
55 /* Power of two tables */
57 #undef SR_TABLE
58 
59 /* Other factors' tables */
/* Entry counts mirror what the init functions further down write:
 * ff_tx_tab_53 holds 8 five-point values followed by 4 three-point values,
 * ff_tx_tab_7 holds 6 values and ff_tx_tab_9 holds 8. */
60 TABLE_DEF(53, 12);
61 TABLE_DEF( 7, 6);
62 TABLE_DEF( 9, 8);
63 
64 typedef struct FFTabInitData {
65  void (*func)(void);
66  int factors[TX_MAX_SUB]; /* Must be sorted high -> low */
68 
/* Expands to ff_tx_init_tab_<n>(), which fills ff_tx_tab_<n> with
 * RESCALE(cos(2*pi*k/n)) for k in [0, n/4) and then stores a final 0,
 * i.e. n/4 + 1 values in total. */
#define SR_TABLE(n)                                                 \
static av_cold void TX_TAB(ff_tx_init_tab_ ##n)(void)               \
{                                                                   \
    TXSample *const dst = TX_TAB(ff_tx_tab_ ##n);                   \
    double step = 2*M_PI/n;                                         \
    int k;                                                          \
                                                                    \
    for (k = 0; k < n/4; k++)                                       \
        dst[k] = RESCALE(cos(k*step));                              \
                                                                    \
    /* k == n/4 here: terminating entry */                          \
    dst[k] = 0;                                                     \
}
81 #undef SR_TABLE
82 
83 static void (*const sr_tabs_init_funcs[])(void) = {
84 #define SR_TABLE(len) TX_TAB(ff_tx_init_tab_ ##len),
86 #undef SR_TABLE
87 };
88 
90 #define SR_TABLE(len) AV_ONCE_INIT,
92 #undef SR_TABLE
93 };
94 
95 static av_cold void TX_TAB(ff_tx_init_tab_53)(void)
96 {
97  /* 5pt, doubled to eliminate AVX lane shuffles */
98  TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI / 5));
99  TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI / 5));
100  TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10));
101  TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10));
102  TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI / 5));
103  TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI / 5));
104  TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10));
105  TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10));
106 
107  /* 3pt */
108  TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12));
109  TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12));
110  TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI / 6));
111  TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI / 6));
112 }
113 
114 static av_cold void TX_TAB(ff_tx_init_tab_7)(void)
115 {
116  TX_TAB(ff_tx_tab_7)[0] = RESCALE(cos(2 * M_PI / 7));
117  TX_TAB(ff_tx_tab_7)[1] = RESCALE(sin(2 * M_PI / 7));
118  TX_TAB(ff_tx_tab_7)[2] = RESCALE(sin(2 * M_PI / 28));
119  TX_TAB(ff_tx_tab_7)[3] = RESCALE(cos(2 * M_PI / 28));
120  TX_TAB(ff_tx_tab_7)[4] = RESCALE(cos(2 * M_PI / 14));
121  TX_TAB(ff_tx_tab_7)[5] = RESCALE(sin(2 * M_PI / 14));
122 }
123 
124 static av_cold void TX_TAB(ff_tx_init_tab_9)(void)
125 {
126  TX_TAB(ff_tx_tab_9)[0] = RESCALE(cos(2 * M_PI / 3));
127  TX_TAB(ff_tx_tab_9)[1] = RESCALE(sin(2 * M_PI / 3));
128  TX_TAB(ff_tx_tab_9)[2] = RESCALE(cos(2 * M_PI / 9));
129  TX_TAB(ff_tx_tab_9)[3] = RESCALE(sin(2 * M_PI / 9));
130  TX_TAB(ff_tx_tab_9)[4] = RESCALE(cos(2 * M_PI / 36));
131  TX_TAB(ff_tx_tab_9)[5] = RESCALE(sin(2 * M_PI / 36));
132  TX_TAB(ff_tx_tab_9)[6] = TX_TAB(ff_tx_tab_9)[2] + TX_TAB(ff_tx_tab_9)[5];
133  TX_TAB(ff_tx_tab_9)[7] = TX_TAB(ff_tx_tab_9)[3] - TX_TAB(ff_tx_tab_9)[4];
134 }
135 
137  { TX_TAB(ff_tx_init_tab_53), { 15, 5, 3 } },
138  { TX_TAB(ff_tx_init_tab_9), { 9 } },
139  { TX_TAB(ff_tx_init_tab_7), { 7 } },
140 };
141 
143  AV_ONCE_INIT,
144  AV_ONCE_INIT,
145  AV_ONCE_INIT,
146 };
147 
148 av_cold void TX_TAB(ff_tx_init_tabs)(int len)
149 {
150  int factor_2 = ff_ctz(len);
151  if (factor_2) {
152  int idx = factor_2 - 3;
153  for (int i = 0; i <= idx; i++)
156  len >>= factor_2;
157  }
158 
159  for (int i = 0; i < FF_ARRAY_ELEMS(nptwo_tabs_init_data); i++) {
160  int f, f_idx = 0;
161 
162  if (len <= 1)
163  return;
164 
165  while ((f = nptwo_tabs_init_data[i].factors[f_idx++])) {
166  if (f % len)
167  continue;
168 
171  len /= f;
172  break;
173  }
174  }
175 }
176 
178  ptrdiff_t stride)
179 {
180  TXComplex tmp[3];
181  const TXSample *tab = TX_TAB(ff_tx_tab_53);
182 #ifdef TX_INT32
183  int64_t mtmp[4];
184 #endif
185 
186  tmp[0] = in[0];
187  BF(tmp[1].re, tmp[2].im, in[1].im, in[2].im);
188  BF(tmp[1].im, tmp[2].re, in[1].re, in[2].re);
189 
190 #ifdef TX_INT32
191  out[0*stride].re = (int64_t)tmp[0].re + tmp[2].re;
192  out[0*stride].im = (int64_t)tmp[0].im + tmp[2].im;
193  mtmp[0] = (int64_t)tab[ 8] * tmp[1].re;
194  mtmp[1] = (int64_t)tab[ 9] * tmp[1].im;
195  mtmp[2] = (int64_t)tab[10] * tmp[2].re;
196  mtmp[3] = (int64_t)tab[10] * tmp[2].im;
197  out[1*stride].re = tmp[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31);
198  out[1*stride].im = tmp[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31);
199  out[2*stride].re = tmp[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31);
200  out[2*stride].im = tmp[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31);
201 #else
202  out[0*stride].re = tmp[0].re + tmp[2].re;
203  out[0*stride].im = tmp[0].im + tmp[2].im;
204  tmp[1].re = tab[ 8] * tmp[1].re;
205  tmp[1].im = tab[ 9] * tmp[1].im;
206  tmp[2].re = tab[10] * tmp[2].re;
207  tmp[2].im = tab[10] * tmp[2].im;
208  out[1*stride].re = tmp[0].re - tmp[2].re + tmp[1].re;
209  out[1*stride].im = tmp[0].im - tmp[2].im - tmp[1].im;
210  out[2*stride].re = tmp[0].re - tmp[2].re - tmp[1].re;
211  out[2*stride].im = tmp[0].im - tmp[2].im + tmp[1].im;
212 #endif
213 }
214 
/* Generates a static always-inline kernel NAME(out, in, stride) that reads
 * in[0..4] and writes five results to out[D0*stride] ... out[D4*stride]; the
 * D0..D4 arguments therefore select the output slot of each result.
 * Constants come from entries 0, 2, 4 and 6 of ff_tx_tab_53.
 * The statement order matters: several BF/SMUL/CMUL calls name the same
 * t[] element as both an input and an output.
 * NOTE(review): the (TXUSample) casts presumably force the final additions
 * into unsigned arithmetic for the integer builds - confirm against the
 * TXUSample definition. */
215 #define DECL_FFT5(NAME, D0, D1, D2, D3, D4) \
216 static av_always_inline void NAME(TXComplex *out, TXComplex *in, \
217  ptrdiff_t stride) \
218 { \
219  TXComplex dc, z0[4], t[6]; \
220  const TXSample *tab = TX_TAB(ff_tx_tab_53); \
221  \
222  dc = in[0]; \
223  BF(t[1].im, t[0].re, in[1].re, in[4].re); \
224  BF(t[1].re, t[0].im, in[1].im, in[4].im); \
225  BF(t[3].im, t[2].re, in[2].re, in[3].re); \
226  BF(t[3].re, t[2].im, in[2].im, in[3].im); \
227  \
228  out[D0*stride].re = dc.re + (TXUSample)t[0].re + t[2].re; \
229  out[D0*stride].im = dc.im + (TXUSample)t[0].im + t[2].im; \
230  \
231  SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re); \
232  SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im); \
233  CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re); \
234  CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im); \
235  \
236  BF(z0[0].re, z0[3].re, t[0].re, t[1].re); \
237  BF(z0[0].im, z0[3].im, t[0].im, t[1].im); \
238  BF(z0[2].re, z0[1].re, t[4].re, t[5].re); \
239  BF(z0[2].im, z0[1].im, t[4].im, t[5].im); \
240  \
241  out[D1*stride].re = dc.re + (TXUSample)z0[3].re; \
242  out[D1*stride].im = dc.im + (TXUSample)z0[0].im; \
243  out[D2*stride].re = dc.re + (TXUSample)z0[2].re; \
244  out[D2*stride].im = dc.im + (TXUSample)z0[1].im; \
245  out[D3*stride].re = dc.re + (TXUSample)z0[1].re; \
246  out[D3*stride].im = dc.im + (TXUSample)z0[2].im; \
247  out[D4*stride].re = dc.re + (TXUSample)z0[0].re; \
248  out[D4*stride].im = dc.im + (TXUSample)z0[3].im; \
249 }
250 
/* fft5 writes its outputs to consecutive slots 0..4. The _m1/_m2/_m3 variants
 * write to permuted slots; together the three permutations cover every index
 * in 0..14 exactly once, and the 15-point kernel calls one variant per group
 * of five intermediate values. */
251 DECL_FFT5(fft5, 0, 1, 2, 3, 4)
252 DECL_FFT5(fft5_m1, 0, 6, 12, 3, 9)
253 DECL_FFT5(fft5_m2, 10, 1, 7, 13, 4)
254 DECL_FFT5(fft5_m3, 5, 11, 2, 8, 14)
255 
257  ptrdiff_t stride)
258 {
259  TXComplex dc, t[6], z[3];
260  const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_7);
261 #ifdef TX_INT32
262  int64_t mtmp[12];
263 #endif
264 
265  dc = in[0];
266  BF(t[1].re, t[0].re, in[1].re, in[6].re);
267  BF(t[1].im, t[0].im, in[1].im, in[6].im);
268  BF(t[3].re, t[2].re, in[2].re, in[5].re);
269  BF(t[3].im, t[2].im, in[2].im, in[5].im);
270  BF(t[5].re, t[4].re, in[3].re, in[4].re);
271  BF(t[5].im, t[4].im, in[3].im, in[4].im);
272 
273  out[0*stride].re = dc.re + t[0].re + t[2].re + t[4].re;
274  out[0*stride].im = dc.im + t[0].im + t[2].im + t[4].im;
275 
276 #ifdef TX_INT32 /* NOTE: it's possible to do this with 16 mults but 72 adds */
277  mtmp[ 0] = ((int64_t)tab[0].re)*t[0].re - ((int64_t)tab[2].re)*t[4].re;
278  mtmp[ 1] = ((int64_t)tab[0].re)*t[4].re - ((int64_t)tab[1].re)*t[0].re;
279  mtmp[ 2] = ((int64_t)tab[0].re)*t[2].re - ((int64_t)tab[2].re)*t[0].re;
280  mtmp[ 3] = ((int64_t)tab[0].re)*t[0].im - ((int64_t)tab[1].re)*t[2].im;
281  mtmp[ 4] = ((int64_t)tab[0].re)*t[4].im - ((int64_t)tab[1].re)*t[0].im;
282  mtmp[ 5] = ((int64_t)tab[0].re)*t[2].im - ((int64_t)tab[2].re)*t[0].im;
283 
284  mtmp[ 6] = ((int64_t)tab[2].im)*t[1].im + ((int64_t)tab[1].im)*t[5].im;
285  mtmp[ 7] = ((int64_t)tab[0].im)*t[5].im + ((int64_t)tab[2].im)*t[3].im;
286  mtmp[ 8] = ((int64_t)tab[2].im)*t[5].im + ((int64_t)tab[1].im)*t[3].im;
287  mtmp[ 9] = ((int64_t)tab[0].im)*t[1].re + ((int64_t)tab[1].im)*t[3].re;
288  mtmp[10] = ((int64_t)tab[2].im)*t[3].re + ((int64_t)tab[0].im)*t[5].re;
289  mtmp[11] = ((int64_t)tab[2].im)*t[1].re + ((int64_t)tab[1].im)*t[5].re;
290 
291  z[0].re = (int32_t)(mtmp[ 0] - ((int64_t)tab[1].re)*t[2].re + 0x40000000 >> 31);
292  z[1].re = (int32_t)(mtmp[ 1] - ((int64_t)tab[2].re)*t[2].re + 0x40000000 >> 31);
293  z[2].re = (int32_t)(mtmp[ 2] - ((int64_t)tab[1].re)*t[4].re + 0x40000000 >> 31);
294  z[0].im = (int32_t)(mtmp[ 3] - ((int64_t)tab[2].re)*t[4].im + 0x40000000 >> 31);
295  z[1].im = (int32_t)(mtmp[ 4] - ((int64_t)tab[2].re)*t[2].im + 0x40000000 >> 31);
296  z[2].im = (int32_t)(mtmp[ 5] - ((int64_t)tab[1].re)*t[4].im + 0x40000000 >> 31);
297 
298  t[0].re = (int32_t)(mtmp[ 6] - ((int64_t)tab[0].im)*t[3].im + 0x40000000 >> 31);
299  t[2].re = (int32_t)(mtmp[ 7] - ((int64_t)tab[1].im)*t[1].im + 0x40000000 >> 31);
300  t[4].re = (int32_t)(mtmp[ 8] + ((int64_t)tab[0].im)*t[1].im + 0x40000000 >> 31);
301  t[0].im = (int32_t)(mtmp[ 9] + ((int64_t)tab[2].im)*t[5].re + 0x40000000 >> 31);
302  t[2].im = (int32_t)(mtmp[10] - ((int64_t)tab[1].im)*t[1].re + 0x40000000 >> 31);
303  t[4].im = (int32_t)(mtmp[11] - ((int64_t)tab[0].im)*t[3].re + 0x40000000 >> 31);
304 #else
305  z[0].re = tab[0].re*t[0].re - tab[2].re*t[4].re - tab[1].re*t[2].re;
306  z[1].re = tab[0].re*t[4].re - tab[1].re*t[0].re - tab[2].re*t[2].re;
307  z[2].re = tab[0].re*t[2].re - tab[2].re*t[0].re - tab[1].re*t[4].re;
308  z[0].im = tab[0].re*t[0].im - tab[1].re*t[2].im - tab[2].re*t[4].im;
309  z[1].im = tab[0].re*t[4].im - tab[1].re*t[0].im - tab[2].re*t[2].im;
310  z[2].im = tab[0].re*t[2].im - tab[2].re*t[0].im - tab[1].re*t[4].im;
311 
312  /* It's possible to do t[4].re and t[0].im with 2 multiplies only by
313  * multiplying the sum of all with the average of the twiddles */
314 
315  t[0].re = tab[2].im*t[1].im + tab[1].im*t[5].im - tab[0].im*t[3].im;
316  t[2].re = tab[0].im*t[5].im + tab[2].im*t[3].im - tab[1].im*t[1].im;
317  t[4].re = tab[2].im*t[5].im + tab[1].im*t[3].im + tab[0].im*t[1].im;
318  t[0].im = tab[0].im*t[1].re + tab[1].im*t[3].re + tab[2].im*t[5].re;
319  t[2].im = tab[2].im*t[3].re + tab[0].im*t[5].re - tab[1].im*t[1].re;
320  t[4].im = tab[2].im*t[1].re + tab[1].im*t[5].re - tab[0].im*t[3].re;
321 #endif
322 
323  BF(t[1].re, z[0].re, z[0].re, t[4].re);
324  BF(t[3].re, z[1].re, z[1].re, t[2].re);
325  BF(t[5].re, z[2].re, z[2].re, t[0].re);
326  BF(t[1].im, z[0].im, z[0].im, t[0].im);
327  BF(t[3].im, z[1].im, z[1].im, t[2].im);
328  BF(t[5].im, z[2].im, z[2].im, t[4].im);
329 
330  out[1*stride].re = dc.re + z[0].re;
331  out[1*stride].im = dc.im + t[1].im;
332  out[2*stride].re = dc.re + t[3].re;
333  out[2*stride].im = dc.im + z[1].im;
334  out[3*stride].re = dc.re + z[2].re;
335  out[3*stride].im = dc.im + t[5].im;
336  out[4*stride].re = dc.re + t[5].re;
337  out[4*stride].im = dc.im + z[2].im;
338  out[5*stride].re = dc.re + z[1].re;
339  out[5*stride].im = dc.im + t[3].im;
340  out[6*stride].re = dc.re + t[1].re;
341  out[6*stride].im = dc.im + z[0].im;
342 }
343 
345  ptrdiff_t stride)
346 {
347  const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_9);
348  TXComplex dc, t[16], w[4], x[5], y[5], z[2];
349 #ifdef TX_INT32
350  int64_t mtmp[12];
351 #endif
352 
353  dc = in[0];
354  BF(t[1].re, t[0].re, in[1].re, in[8].re);
355  BF(t[1].im, t[0].im, in[1].im, in[8].im);
356  BF(t[3].re, t[2].re, in[2].re, in[7].re);
357  BF(t[3].im, t[2].im, in[2].im, in[7].im);
358  BF(t[5].re, t[4].re, in[3].re, in[6].re);
359  BF(t[5].im, t[4].im, in[3].im, in[6].im);
360  BF(t[7].re, t[6].re, in[4].re, in[5].re);
361  BF(t[7].im, t[6].im, in[4].im, in[5].im);
362 
363  w[0].re = t[0].re - t[6].re;
364  w[0].im = t[0].im - t[6].im;
365  w[1].re = t[2].re - t[6].re;
366  w[1].im = t[2].im - t[6].im;
367  w[2].re = t[1].re - t[7].re;
368  w[2].im = t[1].im - t[7].im;
369  w[3].re = t[3].re + t[7].re;
370  w[3].im = t[3].im + t[7].im;
371 
372  z[0].re = dc.re + t[4].re;
373  z[0].im = dc.im + t[4].im;
374 
375  z[1].re = t[0].re + t[2].re + t[6].re;
376  z[1].im = t[0].im + t[2].im + t[6].im;
377 
378  out[0*stride].re = z[0].re + z[1].re;
379  out[0*stride].im = z[0].im + z[1].im;
380 
381 #ifdef TX_INT32
382  mtmp[0] = t[1].re - t[3].re + t[7].re;
383  mtmp[1] = t[1].im - t[3].im + t[7].im;
384 
385  y[3].re = (int32_t)(((int64_t)tab[0].im)*mtmp[0] + 0x40000000 >> 31);
386  y[3].im = (int32_t)(((int64_t)tab[0].im)*mtmp[1] + 0x40000000 >> 31);
387 
388  mtmp[0] = (int32_t)(((int64_t)tab[0].re)*z[1].re + 0x40000000 >> 31);
389  mtmp[1] = (int32_t)(((int64_t)tab[0].re)*z[1].im + 0x40000000 >> 31);
390  mtmp[2] = (int32_t)(((int64_t)tab[0].re)*t[4].re + 0x40000000 >> 31);
391  mtmp[3] = (int32_t)(((int64_t)tab[0].re)*t[4].im + 0x40000000 >> 31);
392 
393  x[3].re = z[0].re + (int32_t)mtmp[0];
394  x[3].im = z[0].im + (int32_t)mtmp[1];
395  z[0].re = in[0].re + (int32_t)mtmp[2];
396  z[0].im = in[0].im + (int32_t)mtmp[3];
397 
398  mtmp[0] = ((int64_t)tab[1].re)*w[0].re;
399  mtmp[1] = ((int64_t)tab[1].re)*w[0].im;
400  mtmp[2] = ((int64_t)tab[2].im)*w[0].re;
401  mtmp[3] = ((int64_t)tab[2].im)*w[0].im;
402  mtmp[4] = ((int64_t)tab[1].im)*w[2].re;
403  mtmp[5] = ((int64_t)tab[1].im)*w[2].im;
404  mtmp[6] = ((int64_t)tab[2].re)*w[2].re;
405  mtmp[7] = ((int64_t)tab[2].re)*w[2].im;
406 
407  x[1].re = (int32_t)(mtmp[0] + ((int64_t)tab[2].im)*w[1].re + 0x40000000 >> 31);
408  x[1].im = (int32_t)(mtmp[1] + ((int64_t)tab[2].im)*w[1].im + 0x40000000 >> 31);
409  x[2].re = (int32_t)(mtmp[2] - ((int64_t)tab[3].re)*w[1].re + 0x40000000 >> 31);
410  x[2].im = (int32_t)(mtmp[3] - ((int64_t)tab[3].re)*w[1].im + 0x40000000 >> 31);
411  y[1].re = (int32_t)(mtmp[4] + ((int64_t)tab[2].re)*w[3].re + 0x40000000 >> 31);
412  y[1].im = (int32_t)(mtmp[5] + ((int64_t)tab[2].re)*w[3].im + 0x40000000 >> 31);
413  y[2].re = (int32_t)(mtmp[6] - ((int64_t)tab[3].im)*w[3].re + 0x40000000 >> 31);
414  y[2].im = (int32_t)(mtmp[7] - ((int64_t)tab[3].im)*w[3].im + 0x40000000 >> 31);
415 
416  y[0].re = (int32_t)(((int64_t)tab[0].im)*t[5].re + 0x40000000 >> 31);
417  y[0].im = (int32_t)(((int64_t)tab[0].im)*t[5].im + 0x40000000 >> 31);
418 
419 #else
420  y[3].re = tab[0].im*(t[1].re - t[3].re + t[7].re);
421  y[3].im = tab[0].im*(t[1].im - t[3].im + t[7].im);
422 
423  x[3].re = z[0].re + tab[0].re*z[1].re;
424  x[3].im = z[0].im + tab[0].re*z[1].im;
425  z[0].re = dc.re + tab[0].re*t[4].re;
426  z[0].im = dc.im + tab[0].re*t[4].im;
427 
428  x[1].re = tab[1].re*w[0].re + tab[2].im*w[1].re;
429  x[1].im = tab[1].re*w[0].im + tab[2].im*w[1].im;
430  x[2].re = tab[2].im*w[0].re - tab[3].re*w[1].re;
431  x[2].im = tab[2].im*w[0].im - tab[3].re*w[1].im;
432  y[1].re = tab[1].im*w[2].re + tab[2].re*w[3].re;
433  y[1].im = tab[1].im*w[2].im + tab[2].re*w[3].im;
434  y[2].re = tab[2].re*w[2].re - tab[3].im*w[3].re;
435  y[2].im = tab[2].re*w[2].im - tab[3].im*w[3].im;
436 
437  y[0].re = tab[0].im*t[5].re;
438  y[0].im = tab[0].im*t[5].im;
439 #endif
440 
441  x[4].re = x[1].re + x[2].re;
442  x[4].im = x[1].im + x[2].im;
443 
444  y[4].re = y[1].re - y[2].re;
445  y[4].im = y[1].im - y[2].im;
446  x[1].re = z[0].re + x[1].re;
447  x[1].im = z[0].im + x[1].im;
448  y[1].re = y[0].re + y[1].re;
449  y[1].im = y[0].im + y[1].im;
450  x[2].re = z[0].re + x[2].re;
451  x[2].im = z[0].im + x[2].im;
452  y[2].re = y[2].re - y[0].re;
453  y[2].im = y[2].im - y[0].im;
454  x[4].re = z[0].re - x[4].re;
455  x[4].im = z[0].im - x[4].im;
456  y[4].re = y[0].re - y[4].re;
457  y[4].im = y[0].im - y[4].im;
458 
459  out[1*stride] = (TXComplex){ x[1].re + y[1].im, x[1].im - y[1].re };
460  out[2*stride] = (TXComplex){ x[2].re + y[2].im, x[2].im - y[2].re };
461  out[3*stride] = (TXComplex){ x[3].re + y[3].im, x[3].im - y[3].re };
462  out[4*stride] = (TXComplex){ x[4].re + y[4].im, x[4].im - y[4].re };
463  out[5*stride] = (TXComplex){ x[4].re - y[4].im, x[4].im + y[4].re };
464  out[6*stride] = (TXComplex){ x[3].re - y[3].im, x[3].im + y[3].re };
465  out[7*stride] = (TXComplex){ x[2].re - y[2].im, x[2].im + y[2].re };
466  out[8*stride] = (TXComplex){ x[1].re - y[1].im, x[1].im + y[1].re };
467 }
468 
470  ptrdiff_t stride)
471 {
472  TXComplex tmp[15];
473 
474  for (int i = 0; i < 5; i++)
475  fft3(tmp + i, in + i*3, 5);
476 
477  fft5_m1(out, tmp + 0, stride);
478  fft5_m2(out, tmp + 5, stride);
479  fft5_m3(out, tmp + 10, stride);
480 }
481 
483  const FFTXCodelet *cd,
484  uint64_t flags,
486  int len, int inv,
487  const void *scale)
488 {
489  int ret = 0;
490  TX_TAB(ff_tx_init_tabs)(len);
491 
492  if (len == 15)
493  ret = ff_tx_gen_pfa_input_map(s, opts, 3, 5);
494  else if (flags & FF_TX_PRESHUFFLE)
496 
497  return ret;
498 }
499 
/* For a factor n this emits two things:
 *  - ff_tx_fft<n>(): a wrapper with the generic (context, void *dst,
 *    void *src, stride) signature that forwards to the fft<n>() kernel,
 *    dividing stride by sizeof(TXComplex) first;
 *  - ff_tx_fft<n>_ns_def: the codelet descriptor for exactly n points,
 *    flagged FF_TX_PRESHUFFLE. */
#define DECL_FACTOR_S(n)                                                       \
static void TX_NAME(ff_tx_fft##n)(AVTXContext *s, void *dst,                   \
                                  void *src, ptrdiff_t stride)                 \
{                                                                              \
    TXComplex *out = (TXComplex *)dst;                                         \
    TXComplex *in  = (TXComplex *)src;                                         \
                                                                               \
    fft##n(out, in, stride / sizeof(TXComplex));                               \
}                                                                              \
static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = {                    \
    .name       = TX_NAME_STR("fft" #n "_ns"),                                 \
    .type       = TX_TYPE(FFT),                                                \
    .function   = TX_NAME(ff_tx_fft##n),                                       \
    .init       = TX_NAME(ff_tx_fft_factor_init),                              \
    .flags      = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |                         \
                  AV_TX_UNALIGNED | FF_TX_PRESHUFFLE,                          \
    .nb_factors = 1,                                                           \
    .factors[0] = n,                                                           \
    .min_len    = n,                                                           \
    .max_len    = n,                                                           \
    .prio       = FF_TX_PRIO_BASE,                                             \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
};
520 
/* Everything DECL_FACTOR_S(n) emits, plus a second descriptor
 * ff_tx_fft<n>_fwd_def for the same function that carries
 * FF_TX_FORWARD_ONLY instead of FF_TX_PRESHUFFLE. */
#define DECL_FACTOR_F(n)                                                       \
DECL_FACTOR_S(n)                                                               \
static const FFTXCodelet TX_NAME(ff_tx_fft##n##_fwd_def) = {                   \
    .name       = TX_NAME_STR("fft" #n "_fwd"),                                \
    .type       = TX_TYPE(FFT),                                                \
    .function   = TX_NAME(ff_tx_fft##n),                                       \
    .init       = TX_NAME(ff_tx_fft_factor_init),                              \
    .flags      = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE |                         \
                  AV_TX_UNALIGNED | FF_TX_FORWARD_ONLY,                        \
    .nb_factors = 1,                                                           \
    .factors[0] = n,                                                           \
    .min_len    = n,                                                           \
    .max_len    = n,                                                           \
    .prio       = FF_TX_PRIO_BASE,                                             \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                                         \
};
537 
/* Factors 3, 5, 7 and 9 get both the "_ns" and the "_fwd" descriptor;
 * 15 only gets the "_ns" one because it uses DECL_FACTOR_S. */
538 DECL_FACTOR_F(3)
539 DECL_FACTOR_F(5)
540 DECL_FACTOR_F(7)
541 DECL_FACTOR_F(9)
542 DECL_FACTOR_S(15)
543 
/* Combines four complex lvalues a0..a3 using scratch variables that the
 * CALLER must have in scope: r0, i0, r1, i1 and t1..t6.
 * Assuming BF()'s first two arguments are its outputs: a0 and a1 are read
 * (through r0/i0/r1/i1) and then overwritten, a2 and a3 are only written, and
 * t1, t2, t5, t6 must hold the input values on entry (callers in this file set
 * them directly or through TRANSFORM). t3..t6 are clobbered.
 * The arguments are used unparenthesised with ".re"/".im" appended, so they
 * must be plain lvalue expressions such as dst[4]. */
544 #define BUTTERFLIES(a0, a1, a2, a3) \
545  do { \
546  r0=a0.re; \
547  i0=a0.im; \
548  r1=a1.re; \
549  i1=a1.im; \
550  BF(t3, t5, t5, t1); \
551  BF(a2.re, a0.re, r0, t5); \
552  BF(a3.im, a1.im, i1, t3); \
553  BF(t4, t6, t2, t6); \
554  BF(a3.re, a1.re, r1, t4); \
555  BF(a2.im, a0.im, i0, t6); \
556  } while (0)
557 
/* Twiddle-then-butterfly step: loads t1/t2 from a2 and t5/t6 from a3 via
 * CMUL() with the pair (wre, -wim) and (wre, wim) respectively, then runs
 * BUTTERFLIES on the same four elements. Needs the same caller-provided
 * scratch variables as BUTTERFLIES. wim is used as "-wim" without
 * parentheses, so pass a simple operand. */
558 #define TRANSFORM(a0, a1, a2, a3, wre, wim) \
559  do { \
560  CMUL(t1, t2, a2.re, a2.im, wre, -wim); \
561  CMUL(t5, t6, a3.re, a3.im, wre, wim); \
562  BUTTERFLIES(a0, a1, a2, a3); \
563  } while (0)
564 
/* Combine pass applied by the power-of-two codelets after their sub-transforms.
 * z is treated as four regions starting at offsets 0, 2*len, 4*len and 6*len;
 * each TRANSFORM mixes one element from every region.
 * The cos pointer walks forward through the table while wim starts at
 * cos + 2*len - 7 and walks backward, so the second twiddle operand is read
 * from the same table in reverse order.
 * Each loop iteration handles 8 consecutive positions (even ones first, then
 * odd ones) and the loop runs for i = 0, 4, 8, ... < len.
 * Note: the parameter named cos hides the libm cos() inside this function. */
565 /* z[0...8n-1], w[1...2n-1] */
566 static inline void TX_NAME(ff_tx_fft_sr_combine)(TXComplex *z,
567  const TXSample *cos, int len)
568 {
569  int o1 = 2*len;
570  int o2 = 4*len;
571  int o3 = 6*len;
572  const TXSample *wim = cos + o1 - 7;
573  TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
574 
575  for (int i = 0; i < len; i += 4) {
576  TRANSFORM(z[0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]);
577  TRANSFORM(z[2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[2], wim[5]);
578  TRANSFORM(z[4], z[o1 + 4], z[o2 + 4], z[o3 + 4], cos[4], wim[3]);
579  TRANSFORM(z[6], z[o1 + 6], z[o2 + 6], z[o3 + 6], cos[6], wim[1]);
580 
581  TRANSFORM(z[1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[1], wim[6]);
582  TRANSFORM(z[3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[3], wim[4]);
583  TRANSFORM(z[5], z[o1 + 5], z[o2 + 5], z[o3 + 5], cos[5], wim[2]);
584  TRANSFORM(z[7], z[o1 + 7], z[o2 + 7], z[o3 + 7], cos[7], wim[0]);
585 
586  z += 2*4;
587  cos += 2*4;
588  wim -= 2*4;
589  }
590 }
591 
593  const FFTXCodelet *cd,
594  uint64_t flags,
596  int len, int inv,
597  const void *scale)
598 {
599  TX_TAB(ff_tx_init_tabs)(len);
600  return ff_tx_gen_ptwo_revtab(s, opts);
601 }
602 
/* Emits the descriptor ff_tx_fft<n>_ns_def for the power-of-two codelet
 * ff_tx_fft<n>_ns: an FFT of exactly n points (min_len == max_len == n)
 * with the single factor 2, initialised by ff_tx_fft_sr_codelet_init. */
#define DECL_SR_CODELET_DEF(n)                                  \
static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = {     \
    .name       = TX_NAME_STR("fft" #n "_ns"),                  \
    .type       = TX_TYPE(FFT),                                 \
    .function   = TX_NAME(ff_tx_fft##n##_ns),                   \
    .init       = TX_NAME(ff_tx_fft_sr_codelet_init),           \
    .flags      = FF_TX_OUT_OF_PLACE | AV_TX_INPLACE |          \
                  AV_TX_UNALIGNED | FF_TX_PRESHUFFLE,           \
    .nb_factors = 1,                                            \
    .factors[0] = 2,                                            \
    .min_len    = n,                                            \
    .max_len    = n,                                            \
    .prio       = FF_TX_PRIO_BASE,                              \
    .cpu_flags  = FF_TX_CPU_FLAGS_ALL,                          \
};
618 
/* Emits ff_tx_fft<n>_ns() and its descriptor. The generated function runs
 * the n2-point codelet on the start of the buffer, the n4-point codelet at
 * offsets 2*n4 and 3*n4, and then the combine pass over dst using table
 * ff_tx_tab_<n> with a length argument of n4/2. The n2- and n4-point
 * codelets must already be defined where this macro is expanded. */
#define DECL_SR_CODELET(n, n2, n4)                                         \
static void TX_NAME(ff_tx_fft##n##_ns)(AVTXContext *s, void *_dst,         \
                                       void *_src, ptrdiff_t stride)       \
{                                                                          \
    const TXSample *twiddles = TX_TAB(ff_tx_tab_##n);                      \
    TXComplex *out = _dst;                                                 \
    TXComplex *in  = _src;                                                 \
                                                                           \
    TX_NAME(ff_tx_fft##n2##_ns)(s, out,        in,        stride);         \
    TX_NAME(ff_tx_fft##n4##_ns)(s, out + n4*2, in + n4*2, stride);         \
    TX_NAME(ff_tx_fft##n4##_ns)(s, out + n4*3, in + n4*3, stride);         \
    TX_NAME(ff_tx_fft_sr_combine)(out, twiddles, n4 >> 1);                 \
}                                                                          \
                                                                           \
DECL_SR_CODELET_DEF(n)
634 
/* 2-point codelet. Combines src[0] and src[1] with two BF() calls and stores
 * the results in dst[0] and dst[1]. The value destined for dst[1] is parked in
 * tmp and only stored after both BF() calls, so src[1] is completely read
 * before dst[1] is written (presumably so that dst may alias src - confirm).
 * s and stride are not used. */
635 static void TX_NAME(ff_tx_fft2_ns)(AVTXContext *s, void *_dst,
636  void *_src, ptrdiff_t stride)
637 {
638  TXComplex *src = _src;
639  TXComplex *dst = _dst;
640  TXComplex tmp;
641 
642  BF(tmp.re, dst[0].re, src[0].re, src[1].re);
643  BF(tmp.im, dst[0].im, src[0].im, src[1].im);
644  dst[1] = tmp;
645 }
646 
/* 4-point codelet built from eight BF() calls over src[0..3], writing
 * dst[0..3]. Note the swapped operand order in the second call (src[3] before
 * src[2]) compared with the matching .im call.
 * Assuming BF()'s first two arguments are its outputs, every src field is read
 * before the dst field at the same position is written, so the statement order
 * must be kept if dst may alias src. s and stride are not used. */
647 static void TX_NAME(ff_tx_fft4_ns)(AVTXContext *s, void *_dst,
648  void *_src, ptrdiff_t stride)
649 {
650  TXComplex *src = _src;
651  TXComplex *dst = _dst;
652  TXSample t1, t2, t3, t4, t5, t6, t7, t8;
653 
654  BF(t3, t1, src[0].re, src[1].re);
655  BF(t8, t6, src[3].re, src[2].re);
656  BF(dst[2].re, dst[0].re, t1, t6);
657  BF(t4, t2, src[0].im, src[1].im);
658  BF(t7, t5, src[2].im, src[3].im);
659  BF(dst[3].im, dst[1].im, t4, t8);
660  BF(dst[3].re, dst[1].re, t3, t7);
661  BF(dst[2].im, dst[0].im, t2, t5);
662 }
663 
/* 8-point codelet.
 *  1. Runs the 4-point codelet on elements 0..3.
 *  2. Four BF() calls read src[4..7] (second operand negated), filling
 *     dst[5], dst[7] and the scratch values t1, t2, t5, t6.
 *  3. BUTTERFLIES on elements 0, 2, 4, 6 consumes those scratch values.
 *  4. TRANSFORM on elements 1, 3, 5, 7 uses the same constant for both
 *     twiddle operands.
 * The local named cos is a single table value (ff_tx_tab_8[1]) and hides the
 * libm cos() inside this function. t1..t6, r0, i0, r1, i1 are the scratch
 * variables the two macros expect to find in scope. */
664 static void TX_NAME(ff_tx_fft8_ns)(AVTXContext *s, void *_dst,
665  void *_src, ptrdiff_t stride)
666 {
667  TXComplex *src = _src;
668  TXComplex *dst = _dst;
669  TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
670  const TXSample cos = TX_TAB(ff_tx_tab_8)[1];
671 
672  TX_NAME(ff_tx_fft4_ns)(s, dst, src, stride);
673 
674  BF(t1, dst[5].re, src[4].re, -src[5].re);
675  BF(t2, dst[5].im, src[4].im, -src[5].im);
676  BF(t5, dst[7].re, src[6].re, -src[7].re);
677  BF(t6, dst[7].im, src[6].im, -src[7].im);
678 
679  BUTTERFLIES(dst[0], dst[2], dst[4], dst[6]);
680  TRANSFORM(dst[1], dst[3], dst[5], dst[7], cos, cos);
681 }
682 
/* 16-point codelet.
 *  1. Runs the 8-point codelet on elements 0..7 and the 4-point codelet on
 *     elements 8..11 and 12..15.
 *  2. Seeds t1/t2 from dst[8] and t5/t6 from dst[12], then applies
 *     BUTTERFLIES (no twiddle multiplication) to elements 0, 4, 8, 12.
 *  3. Three TRANSFORM steps handle the remaining element groups using
 *     entries 1..3 of ff_tx_tab_16; groups 1 and 3 use the same two entries
 *     with the operands swapped.
 * The local pointer named cos hides the libm cos() inside this function. */
683 static void TX_NAME(ff_tx_fft16_ns)(AVTXContext *s, void *_dst,
684  void *_src, ptrdiff_t stride)
685 {
686  TXComplex *src = _src;
687  TXComplex *dst = _dst;
688  const TXSample *cos = TX_TAB(ff_tx_tab_16);
689 
690  TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
691  TXSample cos_16_1 = cos[1];
692  TXSample cos_16_2 = cos[2];
693  TXSample cos_16_3 = cos[3];
694 
695  TX_NAME(ff_tx_fft8_ns)(s, dst + 0, src + 0, stride);
696  TX_NAME(ff_tx_fft4_ns)(s, dst + 8, src + 8, stride);
697  TX_NAME(ff_tx_fft4_ns)(s, dst + 12, src + 12, stride);
698 
699  t1 = dst[ 8].re;
700  t2 = dst[ 8].im;
701  t5 = dst[12].re;
702  t6 = dst[12].im;
703  BUTTERFLIES(dst[0], dst[4], dst[8], dst[12]);
704 
705  TRANSFORM(dst[ 2], dst[ 6], dst[10], dst[14], cos_16_2, cos_16_2);
706  TRANSFORM(dst[ 1], dst[ 5], dst[ 9], dst[13], cos_16_1, cos_16_3);
707  TRANSFORM(dst[ 3], dst[ 7], dst[11], dst[15], cos_16_3, cos_16_1);
708 }
709 
/* One codelet per power-of-two length from 32 to 2097152. The arguments are
 * (n, n/2, n/4); each generated function calls the n/2 and n/4 codelets, so
 * the list has to stay in ascending order for those names to be defined
 * before they are used. */
714 DECL_SR_CODELET(32,16,8)
715 DECL_SR_CODELET(64,32,16)
716 DECL_SR_CODELET(128,64,32)
717 DECL_SR_CODELET(256,128,64)
718 DECL_SR_CODELET(512,256,128)
719 DECL_SR_CODELET(1024,512,256)
720 DECL_SR_CODELET(2048,1024,512)
721 DECL_SR_CODELET(4096,2048,1024)
722 DECL_SR_CODELET(8192,4096,2048)
723 DECL_SR_CODELET(16384,8192,4096)
724 DECL_SR_CODELET(32768,16384,8192)
725 DECL_SR_CODELET(65536,32768,16384)
726 DECL_SR_CODELET(131072,65536,32768)
727 DECL_SR_CODELET(262144,131072,65536)
728 DECL_SR_CODELET(524288,262144,131072)
729 DECL_SR_CODELET(1048576,524288,262144)
730 DECL_SR_CODELET(2097152,1048576,524288)
731 
733  const FFTXCodelet *cd,
734  uint64_t flags,
736  int len, int inv,
737  const void *scale)
738 {
739  int ret;
740  int is_inplace = !!(flags & AV_TX_INPLACE);
741  FFTXCodeletOptions sub_opts = {
742  .map_dir = is_inplace ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
743  };
744 
745  flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
746  flags |= AV_TX_INPLACE; /* in-place */
747  flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
748 
749  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len, inv, scale)))
750  return ret;
751 
752  if (is_inplace && (ret = ff_tx_gen_inplace_map(s, len)))
753  return ret;
754 
755  return 0;
756 }
757 
759  const FFTXCodelet *cd,
760  uint64_t flags,
762  int len, int inv,
763  const void *scale)
764 {
765  if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
766  return AVERROR(ENOMEM);
767  flags &= ~AV_TX_INPLACE;
768  return TX_NAME(ff_tx_fft_init)(s, cd, flags, opts, len, inv, scale);
769 }
770 
771 static void TX_NAME(ff_tx_fft)(AVTXContext *s, void *_dst,
772  void *_src, ptrdiff_t stride)
773 {
774  TXComplex *src = _src;
775  TXComplex *dst1 = s->flags & AV_TX_INPLACE ? s->tmp : _dst;
776  TXComplex *dst2 = _dst;
777  int *map = s->sub[0].map;
778  int len = s->len;
779 
780  /* Compilers can't vectorize this anyway without assuming AVX2, which they
781  * generally don't, at least without -march=native -mtune=native */
782  for (int i = 0; i < len; i++)
783  dst1[i] = src[map[i]];
784 
785  s->fn[0](&s->sub[0], dst2, dst1, stride);
786 }
787 
788 static void TX_NAME(ff_tx_fft_inplace)(AVTXContext *s, void *_dst,
789  void *_src, ptrdiff_t stride)
790 {
791  TXComplex *src = _src;
792  TXComplex *dst = _dst;
793  TXComplex tmp;
794  const int *map = s->sub->map;
795  const int *inplace_idx = s->map;
796  int src_idx, dst_idx;
797 
798  src_idx = *inplace_idx++;
799  do {
800  tmp = src[src_idx];
801  dst_idx = map[src_idx];
802  do {
803  FFSWAP(TXComplex, tmp, src[dst_idx]);
804  dst_idx = map[dst_idx];
805  } while (dst_idx != src_idx); /* Can be > as well, but was less predictable */
806  src[dst_idx] = tmp;
807  } while ((src_idx = *inplace_idx++));
808 
809  s->fn[0](&s->sub[0], dst, src, stride);
810 }
811 
812 static const FFTXCodelet TX_NAME(ff_tx_fft_def) = {
813  .name = TX_NAME_STR("fft"),
814  .function = TX_NAME(ff_tx_fft),
815  .type = TX_TYPE(FFT),
817  .factors[0] = TX_FACTOR_ANY,
818  .nb_factors = 1,
819  .min_len = 2,
820  .max_len = TX_LEN_UNLIMITED,
821  .init = TX_NAME(ff_tx_fft_init),
823  .prio = FF_TX_PRIO_BASE,
824 };
825 
826 static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_small_def) = {
827  .name = TX_NAME_STR("fft_inplace_small"),
828  .function = TX_NAME(ff_tx_fft),
829  .type = TX_TYPE(FFT),
831  .factors[0] = TX_FACTOR_ANY,
832  .nb_factors = 1,
833  .min_len = 2,
834  .max_len = 65536,
837  .prio = FF_TX_PRIO_BASE - 256,
838 };
839 
840 static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_def) = {
841  .name = TX_NAME_STR("fft_inplace"),
842  .function = TX_NAME(ff_tx_fft_inplace),
843  .type = TX_TYPE(FFT),
845  .factors[0] = TX_FACTOR_ANY,
846  .nb_factors = 1,
847  .min_len = 2,
848  .max_len = TX_LEN_UNLIMITED,
849  .init = TX_NAME(ff_tx_fft_init),
851  .prio = FF_TX_PRIO_BASE - 512,
852 };
853 
855  const FFTXCodelet *cd,
856  uint64_t flags,
858  int len, int inv,
859  const void *scale)
860 {
861  const double phase = s->inv ? 2.0*M_PI/len : -2.0*M_PI/len;
862 
863  if (!(s->exp = av_malloc(len*len*sizeof(*s->exp))))
864  return AVERROR(ENOMEM);
865 
866  for (int i = 0; i < len; i++) {
867  for (int j = 0; j < len; j++) {
868  const double factor = phase*i*j;
869  s->exp[i*j] = (TXComplex){
870  RESCALE(cos(factor)),
871  RESCALE(sin(factor)),
872  };
873  }
874  }
875 
876  return 0;
877 }
878 
879 static void TX_NAME(ff_tx_fft_naive)(AVTXContext *s, void *_dst, void *_src,
880  ptrdiff_t stride)
881 {
882  TXComplex *src = _src;
883  TXComplex *dst = _dst;
884  const int n = s->len;
885  double phase = s->inv ? 2.0*M_PI/n : -2.0*M_PI/n;
886 
887  stride /= sizeof(*dst);
888 
889  for (int i = 0; i < n; i++) {
890  TXComplex tmp = { 0 };
891  for (int j = 0; j < n; j++) {
892  const double factor = phase*i*j;
893  const TXComplex mult = {
894  RESCALE(cos(factor)),
895  RESCALE(sin(factor)),
896  };
897  TXComplex res;
898  CMUL3(res, src[j], mult);
899  tmp.re += res.re;
900  tmp.im += res.im;
901  }
902  dst[i*stride] = tmp;
903  }
904 }
905 
906 static void TX_NAME(ff_tx_fft_naive_small)(AVTXContext *s, void *_dst, void *_src,
907  ptrdiff_t stride)
908 {
909  TXComplex *src = _src;
910  TXComplex *dst = _dst;
911  const int n = s->len;
912 
913  stride /= sizeof(*dst);
914 
915  for (int i = 0; i < n; i++) {
916  TXComplex tmp = { 0 };
917  for (int j = 0; j < n; j++) {
918  TXComplex res;
919  const TXComplex mult = s->exp[i*j];
920  CMUL3(res, src[j], mult);
921  tmp.re += res.re;
922  tmp.im += res.im;
923  }
924  dst[i*stride] = tmp;
925  }
926 }
927 
928 static const FFTXCodelet TX_NAME(ff_tx_fft_naive_small_def) = {
929  .name = TX_NAME_STR("fft_naive_small"),
930  .function = TX_NAME(ff_tx_fft_naive_small),
931  .type = TX_TYPE(FFT),
933  .factors[0] = TX_FACTOR_ANY,
934  .nb_factors = 1,
935  .min_len = 2,
936  .max_len = 1024,
939  .prio = FF_TX_PRIO_MIN/2,
940 };
941 
942 static const FFTXCodelet TX_NAME(ff_tx_fft_naive_def) = {
943  .name = TX_NAME_STR("fft_naive"),
944  .function = TX_NAME(ff_tx_fft_naive),
945  .type = TX_TYPE(FFT),
947  .factors[0] = TX_FACTOR_ANY,
948  .nb_factors = 1,
949  .min_len = 2,
950  .max_len = TX_LEN_UNLIMITED,
951  .init = NULL,
952  .cpu_flags = FF_TX_CPU_FLAGS_ALL,
953  .prio = FF_TX_PRIO_MIN,
954 };
955 
/* Init callback referenced by the fft_pfa and fft_pfa_ns codelets
 * (TX_NAME(ff_tx_fft_pfa_init)).
 *
 * Visible behaviour: the length is decomposed into candidate factors; for
 * each candidate two sub-transforms (len1 and len2 = len/len1) are
 * initialised, with fallbacks that drop flags and retry. On success a
 * compound map is generated, s->tmp is allocated, the first sub-transform's
 * map is folded into the input map, and s->exp is allocated only when extra
 * scratch space is needed.
 *
 * Returns 0 on success or a negative error code.
 *
 * NOTE(review): the embedded listing numbers skip 956, 959, 980, 985, 993,
 * 1004, 1006 and 1014. The function header, a parameter (opts is used below
 * but not declared in the visible text) and several statements appear to be
 * missing - confirm against the upstream file. */
957  const FFTXCodelet *cd,
958  uint64_t flags,
960  int len, int inv,
961  const void *scale)
962 {
963  int ret, *tmp, ps = flags & FF_TX_PRESHUFFLE;
964  FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
965  size_t extra_tmp_len = 0;
966  int len_list[TX_MAX_DECOMPOSITIONS];
967 
968  if ((ret = ff_tx_decompose_length(len_list, TX_TYPE(FFT), len, inv)) < 0)
969  return ret;
970 
971  /* Two iterations to test both orderings. */
972  for (int i = 0; i < ret; i++) {
973  int len1 = len_list[i];
974  int len2 = len / len1;
975 
976  /* Our ptwo transforms don't support striding the output. */
977  if (len2 & (len2 - 1)) /* len2 is not a power of two */
978  FFSWAP(int, len1, len2);
979 
981 
982  /* First transform */
983  sub_opts.map_dir = FF_TX_MAP_GATHER;
984  flags &= ~AV_TX_INPLACE;
986  flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
987  ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
988  len1, inv, scale);
989 
990  if (ret == AVERROR(ENOMEM)) {
991  return ret;
992  } else if (ret < 0) { /* Try again without a preshuffle flag */
994  ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
995  len1, inv, scale);
996  if (ret == AVERROR(ENOMEM))
997  return ret;
998  else if (ret < 0)
999  continue;
1000  }
1001 
1002  /* Second transform. */
1003  sub_opts.map_dir = FF_TX_MAP_SCATTER;
1005 retry:
1007  flags |= AV_TX_INPLACE;
1008  ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1009  len2, inv, scale);
1010 
1011  if (ret == AVERROR(ENOMEM)) {
1012  return ret;
1013  } else if (ret < 0) { /* Try again with an out-of-place transform */
1015  flags &= ~AV_TX_INPLACE;
1016  ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1017  len2, inv, scale);
1018  if (ret == AVERROR(ENOMEM)) {
1019  return ret;
1020  } else if (ret < 0) {
1021  if (flags & FF_TX_PRESHUFFLE) { /* Retry again without a preshuf flag */
1022  flags &= ~FF_TX_PRESHUFFLE;
1023  goto retry;
1024  } else {
1025  continue;
1026  }
1027  }
1028  }
1029 
1030  /* Success */
1031  break;
1032  }
1033 
1034  /* If nothing was successful, error out */
1035  if (ret < 0)
1036  return ret;
1037 
1038  /* Generate PFA map */
1039  if ((ret = ff_tx_gen_compound_mapping(s, opts, 0,
1040  s->sub[0].len, s->sub[1].len)))
1041  return ret;
1042 
1043  if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1044  return AVERROR(ENOMEM);
1045 
1046  /* Flatten input map */
1047  tmp = (int *)s->tmp; /* s->tmp is borrowed as integer scratch space here */
1048  for (int k = 0; k < len; k += s->sub[0].len) {
1049  memcpy(tmp, &s->map[k], s->sub[0].len*sizeof(*tmp));
1050  for (int i = 0; i < s->sub[0].len; i++)
1051  s->map[k + i] = tmp[s->sub[0].map[i]];
1052  }
1053 
1054  /* Only allocate extra temporary memory if we need it */
1055  if (!(s->sub[1].flags & AV_TX_INPLACE))
1056  extra_tmp_len = len;
1057  else if (!ps) /* ps: FF_TX_PRESHUFFLE was set in the flags passed in */
1058  extra_tmp_len = s->sub[0].len;
1059 
1060  if (extra_tmp_len && !(s->exp = av_malloc(extra_tmp_len*sizeof(*s->exp))))
1061  return AVERROR(ENOMEM);
1062 
1063  return 0;
1064 }
1065 
1066 static void TX_NAME(ff_tx_fft_pfa)(AVTXContext *s, void *_out,
1067  void *_in, ptrdiff_t stride)
1068 {
1069  const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
1070  const int *in_map = s->map, *out_map = in_map + l;
1071  const int *sub_map = s->sub[1].map;
1072  TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp;
1073  TXComplex *in = _in, *out = _out;
1074 
1075  stride /= sizeof(*out);
1076 
1077  for (int i = 0; i < m; i++) {
1078  for (int j = 0; j < n; j++)
1079  s->exp[j] = in[in_map[i*n + j]];
1080  s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], s->exp, m*sizeof(TXComplex));
1081  }
1082 
1083  for (int i = 0; i < n; i++)
1084  s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));
1085 
1086  for (int i = 0; i < l; i++)
1087  out[i*stride] = tmp1[out_map[i]];
1088 }
1089 
1090 static void TX_NAME(ff_tx_fft_pfa_ns)(AVTXContext *s, void *_out,
1091  void *_in, ptrdiff_t stride)
1092 {
1093  const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
1094  const int *in_map = s->map, *out_map = in_map + l;
1095  const int *sub_map = s->sub[1].map;
1096  TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp;
1097  TXComplex *in = _in, *out = _out;
1098 
1099  stride /= sizeof(*out);
1100 
1101  for (int i = 0; i < m; i++)
1102  s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], &in[i*n], m*sizeof(TXComplex));
1103 
1104  for (int i = 0; i < n; i++)
1105  s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));
1106 
1107  for (int i = 0; i < l; i++)
1108  out[i*stride] = tmp1[out_map[i]];
1109 }
1110 
/* Codelet descriptor for the compound FFT with input gathering. Lists the
 * factors 7, 5, 3, 2 and "any", requires two of them and a length of at
 * least 6.
 * NOTE(review): listing numbers 1115 and 1121 are skipped, so some
 * initializers appear to be missing from this text - confirm upstream. */
1111 static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_def) = {
1112  .name = TX_NAME_STR("fft_pfa"),
1113  .function = TX_NAME(ff_tx_fft_pfa),
1114  .type = TX_TYPE(FFT),
1116  .factors = { 7, 5, 3, 2, TX_FACTOR_ANY },
1117  .nb_factors = 2,
1118  .min_len = 2*3,
1119  .max_len = TX_LEN_UNLIMITED,
1120  .init = TX_NAME(ff_tx_fft_pfa_init),
1122  .prio = FF_TX_PRIO_BASE,
1123 };
1124 
/* Codelet descriptor for the compound FFT variant that does not gather its
 * input; it shares the init callback and factor list with fft_pfa.
 * NOTE(review): listing numbers 1129, 1130 and 1136 are skipped, so some
 * initializers appear to be missing from this text - confirm upstream. */
1125 static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_ns_def) = {
1126  .name = TX_NAME_STR("fft_pfa_ns"),
1127  .function = TX_NAME(ff_tx_fft_pfa_ns),
1128  .type = TX_TYPE(FFT),
1131  .factors = { 7, 5, 3, 2, TX_FACTOR_ANY },
1132  .nb_factors = 2,
1133  .min_len = 2*3,
1134  .max_len = TX_LEN_UNLIMITED,
1135  .init = TX_NAME(ff_tx_fft_pfa_init),
1137  .prio = FF_TX_PRIO_BASE,
1138 };
1139 
/* Init callback referenced by the naive MDCT codelets
 * (TX_NAME(ff_tx_mdct_naive_init)): it only stores the scale factor in the
 * context, as double and as float, and returns 0.
 * NOTE(review): listing numbers 1140 and 1143 are skipped; the function
 * header and one parameter line appear to be missing from this text. */
1141  const FFTXCodelet *cd,
1142  uint64_t flags,
1144  int len, int inv,
1145  const void *scale)
1146 {
1147  s->scale_d = *((SCALE_TYPE *)scale); /* scale is read as a SCALE_TYPE */
1148  s->scale_f = s->scale_d;
1149  return 0;
1150 }
1151 
1152 static void TX_NAME(ff_tx_mdct_naive_fwd)(AVTXContext *s, void *_dst,
1153  void *_src, ptrdiff_t stride)
1154 {
1155  TXSample *src = _src;
1156  TXSample *dst = _dst;
1157  double scale = s->scale_d;
1158  int len = s->len;
1159  const double phase = M_PI/(4.0*len);
1160 
1161  stride /= sizeof(*dst);
1162 
1163  for (int i = 0; i < len; i++) {
1164  double sum = 0.0;
1165  for (int j = 0; j < len*2; j++) {
1166  int a = (2*j + 1 + len) * (2*i + 1);
1167  sum += UNSCALE(src[j]) * cos(a * phase);
1168  }
1169  dst[i*stride] = RESCALE(sum*scale);
1170  }
1171 }
1172 
1173 static void TX_NAME(ff_tx_mdct_naive_inv)(AVTXContext *s, void *_dst,
1174  void *_src, ptrdiff_t stride)
1175 {
1176  TXSample *src = _src;
1177  TXSample *dst = _dst;
1178  double scale = s->scale_d;
1179  int len = s->len >> 1;
1180  int len2 = len*2;
1181  const double phase = M_PI/(4.0*len2);
1182 
1183  stride /= sizeof(*src);
1184 
1185  for (int i = 0; i < len; i++) {
1186  double sum_d = 0.0;
1187  double sum_u = 0.0;
1188  double i_d = phase * (4*len - 2*i - 1);
1189  double i_u = phase * (3*len2 + 2*i + 1);
1190  for (int j = 0; j < len2; j++) {
1191  double a = (2 * j + 1);
1192  double a_d = cos(a * i_d);
1193  double a_u = cos(a * i_u);
1194  double val = UNSCALE(src[j*stride]);
1195  sum_d += a_d * val;
1196  sum_u += a_u * val;
1197  }
1198  dst[i + 0] = RESCALE( sum_d*scale);
1199  dst[i + len] = RESCALE(-sum_u*scale);
1200  }
1201 }
1202 
/* Codelet descriptor for the naive forward MDCT, lowest priority.
 * NOTE(review): listing numbers 1207 and 1213 are skipped, so some
 * initializers appear to be missing from this text - confirm upstream. */
1203 static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_fwd_def) = {
1204  .name = TX_NAME_STR("mdct_naive_fwd"),
1205  .function = TX_NAME(ff_tx_mdct_naive_fwd),
1206  .type = TX_TYPE(MDCT),
1208  .factors = { 2, TX_FACTOR_ANY }, /* MDCTs need an even length */
1209  .nb_factors = 2,
1210  .min_len = 2,
1211  .max_len = TX_LEN_UNLIMITED,
1212  .init = TX_NAME(ff_tx_mdct_naive_init),
1214  .prio = FF_TX_PRIO_MIN,
1215 };
1216 
/* Codelet descriptor for the naive inverse MDCT, lowest priority.
 * NOTE(review): listing numbers 1221 and 1227 are skipped, so some
 * initializers appear to be missing from this text - confirm upstream. */
1217 static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_inv_def) = {
1218  .name = TX_NAME_STR("mdct_naive_inv"),
1219  .function = TX_NAME(ff_tx_mdct_naive_inv),
1220  .type = TX_TYPE(MDCT),
1222  .factors = { 2, TX_FACTOR_ANY },
1223  .nb_factors = 2,
1224  .min_len = 2,
1225  .max_len = TX_LEN_UNLIMITED,
1226  .init = TX_NAME(ff_tx_mdct_naive_init),
1228  .prio = FF_TX_PRIO_MIN,
1229 };
1230 
/* Init callback referenced by the mdct_fwd / mdct_inv codelets
 * (TX_NAME(ff_tx_mdct_init)).
 *
 * Visible behaviour: stores the scale, initialises an in-place FFT of
 * len/2 points (first with FF_TX_PRESHUFFLE, then without it), builds
 * s->map either as a copy of the sub-transform's map or as the identity,
 * generates the twiddle table and, for inverse transforms, doubles every
 * map entry.
 *
 * Returns 0 on success or a negative error code.
 *
 * NOTE(review): listing numbers 1231, 1234 and 1240 are skipped; the
 * function header, one parameter line and the body of the sub_opts
 * initializer appear to be missing from this text - confirm upstream. */
1232  const FFTXCodelet *cd,
1233  uint64_t flags,
1235  int len, int inv,
1236  const void *scale)
1237 {
1238  int ret;
1239  FFTXCodeletOptions sub_opts = {
1241  };
1242 
1243  s->scale_d = *((SCALE_TYPE *)scale);
1244  s->scale_f = s->scale_d;
1245 
1246  flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
1247  flags |= AV_TX_INPLACE; /* in-place */
1248  flags |= FF_TX_PRESHUFFLE; /* First try with an in-place transform */
1249 
1250  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
1251  inv, scale))) {
1252  flags &= ~FF_TX_PRESHUFFLE; /* Now try with a generic FFT */
1253  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
1254  inv, scale)))
1255  return ret;
1256  }
1257 
1258  s->map = av_malloc((len >> 1)*sizeof(*s->map));
1259  if (!s->map)
1260  return AVERROR(ENOMEM);
1261 
1262  /* If we need to preshuffle copy the map from the subcontext */
1263  if (s->sub[0].flags & FF_TX_PRESHUFFLE) {
1264  memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map));
1265  } else {
1266  for (int i = 0; i < len >> 1; i++)
1267  s->map[i] = i; /* identity map */
1268  }
1269 
1270  if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
1271  return ret;
1272 
1273  /* Saves a multiply in a hot path. */
1274  if (inv)
1275  for (int i = 0; i < (s->len >> 1); i++)
1276  s->map[i] <<= 1;
1277 
1278  return 0;
1279 }
1280 
1281 static void TX_NAME(ff_tx_mdct_fwd)(AVTXContext *s, void *_dst, void *_src,
1282  ptrdiff_t stride)
1283 {
1284  TXSample *src = _src, *dst = _dst;
1285  TXComplex *exp = s->exp, tmp, *z = _dst;
1286  const int len2 = s->len >> 1;
1287  const int len4 = s->len >> 2;
1288  const int len3 = len2 * 3;
1289  const int *sub_map = s->map;
1290 
1291  stride /= sizeof(*dst);
1292 
1293  for (int i = 0; i < len2; i++) { /* Folding and pre-reindexing */
1294  const int k = 2*i;
1295  const int idx = sub_map[i];
1296  if (k < len2) {
1297  tmp.re = FOLD(-src[ len2 + k], src[1*len2 - 1 - k]);
1298  tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);
1299  } else {
1300  tmp.re = FOLD(-src[ len2 + k], -src[5*len2 - 1 - k]);
1301  tmp.im = FOLD( src[-len2 + k], -src[1*len3 - 1 - k]);
1302  }
1303  CMUL(z[idx].im, z[idx].re, tmp.re, tmp.im, exp[i].re, exp[i].im);
1304  }
1305 
1306  s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
1307 
1308  for (int i = 0; i < len4; i++) {
1309  const int i0 = len4 + i, i1 = len4 - i - 1;
1310  TXComplex src1 = { z[i1].re, z[i1].im };
1311  TXComplex src0 = { z[i0].re, z[i0].im };
1312 
1313  CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im,
1314  exp[i0].im, exp[i0].re);
1315  CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im,
1316  exp[i1].im, exp[i1].re);
1317  }
1318 }
1319 
1320 static void TX_NAME(ff_tx_mdct_inv)(AVTXContext *s, void *_dst, void *_src,
1321  ptrdiff_t stride)
1322 {
1323  TXComplex *z = _dst, *exp = s->exp;
1324  const TXSample *src = _src, *in1, *in2;
1325  const int len2 = s->len >> 1;
1326  const int len4 = s->len >> 2;
1327  const int *sub_map = s->map;
1328 
1329  stride /= sizeof(*src);
1330  in1 = src;
1331  in2 = src + ((len2*2) - 1) * stride;
1332 
1333  for (int i = 0; i < len2; i++) {
1334  int k = sub_map[i];
1335  TXComplex tmp = { in2[-k*stride], in1[k*stride] };
1336  CMUL3(z[i], tmp, exp[i]);
1337  }
1338 
1339  s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
1340 
1341  exp += len2;
1342  for (int i = 0; i < len4; i++) {
1343  const int i0 = len4 + i, i1 = len4 - i - 1;
1344  TXComplex src1 = { z[i1].im, z[i1].re };
1345  TXComplex src0 = { z[i0].im, z[i0].re };
1346 
1347  CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);
1348  CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);
1349  }
1350 }
1351 
/* Codelet descriptor for the FFT-based forward MDCT.
 * NOTE(review): listing numbers 1356 and 1362 are skipped, so some
 * initializers appear to be missing from this text - confirm upstream. */
1352 static const FFTXCodelet TX_NAME(ff_tx_mdct_fwd_def) = {
1353  .name = TX_NAME_STR("mdct_fwd"),
1354  .function = TX_NAME(ff_tx_mdct_fwd),
1355  .type = TX_TYPE(MDCT),
1357  .factors = { 2, TX_FACTOR_ANY },
1358  .nb_factors = 2,
1359  .min_len = 2,
1360  .max_len = TX_LEN_UNLIMITED,
1361  .init = TX_NAME(ff_tx_mdct_init),
1363  .prio = FF_TX_PRIO_BASE,
1364 };
1365 
/* Codelet descriptor for the FFT-based inverse MDCT.
 * NOTE(review): listing numbers 1370 and 1376 are skipped, so some
 * initializers appear to be missing from this text - confirm upstream. */
1366 static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_def) = {
1367  .name = TX_NAME_STR("mdct_inv"),
1368  .function = TX_NAME(ff_tx_mdct_inv),
1369  .type = TX_TYPE(MDCT),
1371  .factors = { 2, TX_FACTOR_ANY },
1372  .nb_factors = 2,
1373  .min_len = 2,
1374  .max_len = TX_LEN_UNLIMITED,
1375  .init = TX_NAME(ff_tx_mdct_init),
1377  .prio = FF_TX_PRIO_BASE,
1378 };
1379 
/* Init callback, presumably for the mdct_inv_full codelet (its .init line
 * is not visible in this text - verify). It stores the scale, clears
 * AV_TX_FULL_IMDCT from the flags and initialises an inverse MDCT
 * sub-transform of the same length.
 * Returns 0 on success or the sub-transform's error code.
 * NOTE(review): listing numbers 1380 and 1383 are skipped; the function
 * header and one parameter line appear to be missing from this text. */
1381  const FFTXCodelet *cd,
1382  uint64_t flags,
1384  int len, int inv,
1385  const void *scale)
1386 {
1387  int ret;
1388 
1389  s->scale_d = *((SCALE_TYPE *)scale);
1390  s->scale_f = s->scale_d;
1391 
1392  flags &= ~AV_TX_FULL_IMDCT; /* the sub-transform must not be a full IMDCT again */
1393 
1394  if ((ret = ff_tx_init_subtx(s, TX_TYPE(MDCT), flags, NULL, len, 1, scale)))
1395  return ret;
1396 
1397  return 0;
1398 }
1399 
1400 static void TX_NAME(ff_tx_mdct_inv_full)(AVTXContext *s, void *_dst,
1401  void *_src, ptrdiff_t stride)
1402 {
1403  int len = s->len << 1;
1404  int len2 = len >> 1;
1405  int len4 = len >> 2;
1406  TXSample *dst = _dst;
1407 
1408  s->fn[0](&s->sub[0], dst + len4, _src, stride);
1409 
1410  stride /= sizeof(*dst);
1411 
1412  for (int i = 0; i < len4; i++) {
1413  dst[ i*stride] = -dst[(len2 - i - 1)*stride];
1414  dst[(len - i - 1)*stride] = dst[(len2 + i + 0)*stride];
1415  }
1416 }
1417 
/* Codelet descriptor for the full inverse MDCT.
 * NOTE(review): listing numbers 1423, 1428 and 1429 are skipped. The .flags
 * expression below ends in a dangling '|' and no .init entry is visible, so
 * lines appear to be missing from this text - confirm upstream. */
1418 static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_full_def) = {
1419  .name = TX_NAME_STR("mdct_inv_full"),
1420  .function = TX_NAME(ff_tx_mdct_inv_full),
1421  .type = TX_TYPE(MDCT),
1422  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE |
1424  .factors = { 2, TX_FACTOR_ANY },
1425  .nb_factors = 2,
1426  .min_len = 2,
1427  .max_len = TX_LEN_UNLIMITED,
1430  .prio = FF_TX_PRIO_BASE,
1431 };
1432 
/* Init callback referenced by the compound MDCT codelets
 * (TX_NAME(ff_tx_mdct_pfa_init)).
 *
 * Visible behaviour: halves len, derives the sub-transform length by
 * dividing by the codelet's first factor, initialises an in-place
 * preshuffled FFT of that length, generates the compound map (embedding the
 * 3x5 input map when the factor is 15), generates the twiddle table,
 * doubles every input map entry, allocates s->tmp and initialises the
 * tables for the small factor.
 *
 * Returns 0 on success or a negative error code.
 *
 * NOTE(review): listing numbers 1433 and 1436 are skipped; the function
 * header and one parameter line (opts is used below) appear to be missing
 * from this text - confirm upstream. */
1434  const FFTXCodelet *cd,
1435  uint64_t flags,
1437  int len, int inv,
1438  const void *scale)
1439 {
1440  int ret, sub_len;
1441  FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER };
1442 
1443  len >>= 1; /* from here on len is half the requested length */
1444  sub_len = len / cd->factors[0];
1445 
1446  s->scale_d = *((SCALE_TYPE *)scale);
1447  s->scale_f = s->scale_d;
1448 
1449  flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
1450  flags |= AV_TX_INPLACE; /* in-place */
1451  flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
1452 
1453  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1454  sub_len, inv, scale)))
1455  return ret;
1456 
1457  if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len)))
1458  return ret;
1459 
1460  /* Our 15-point transform is also a compound one, so embed its input map */
1461  if (cd->factors[0] == 15)
1462  TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5);
1463 
1464  if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
1465  return ret;
1466 
1467  /* Saves multiplies in loops. */
1468  for (int i = 0; i < len; i++)
1469  s->map[i] <<= 1;
1470 
1471  if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1472  return AVERROR(ENOMEM);
1473 
1474  TX_TAB(ff_tx_init_tabs)(len / sub_len);
1475 
1476  return 0;
1477 }
1478 
/* DECL_COMP_IMDCT(N) expands to an inverse MDCT function for lengths built
 * from an N-point FFT and an M-point sub-transform (M = s->sub->len), plus
 * the matching inverse-only codelet descriptor.
 *
 * Generated function, in order:
 *  1. For each group of N inputs, pairs of samples (one read forwards, one
 *     backwards from the end, at the offsets in the input map) are
 *     multiplied by the twiddles and passed to fft##N, which writes into
 *     s->tmp at the offset taken from the sub-transform's map. exp and
 *     in_map are advanced by N per group, so after this loop exp points
 *     past the entries consumed so far.
 *  2. The M-point sub-transform is run in place N times on s->tmp.
 *  3. The result is read through out_map (component-swapped) and
 *     post-rotated into the destination with the remaining twiddles.
 *
 * The statement order matters: step 3 relies on exp having been advanced in
 * step 1. */
1479 #define DECL_COMP_IMDCT(N) \
1480 static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst, \
1481  void *_src, ptrdiff_t stride) \
1482 { \
1483  TXComplex fft##N##in[N]; \
1484  TXComplex *z = _dst, *exp = s->exp; \
1485  const TXSample *src = _src, *in1, *in2; \
1486  const int len4 = s->len >> 2; \
1487  const int len2 = s->len >> 1; \
1488  const int m = s->sub->len; \
1489  const int *in_map = s->map, *out_map = in_map + N*m; \
1490  const int *sub_map = s->sub->map; \
1491  \
1492  stride /= sizeof(*src); /* To convert it from bytes */ \
1493  in1 = src; \
1494  in2 = src + ((N*m*2) - 1) * stride; \
1495  \
1496  for (int i = 0; i < len2; i += N) { \
1497  for (int j = 0; j < N; j++) { \
1498  const int k = in_map[j]; \
1499  TXComplex tmp = { in2[-k*stride], in1[k*stride] }; \
1500  CMUL3(fft##N##in[j], tmp, exp[j]); \
1501  } \
1502  fft##N(s->tmp + *(sub_map++), fft##N##in, m); \
1503  exp += N; \
1504  in_map += N; \
1505  } \
1506  \
1507  for (int i = 0; i < N; i++) \
1508  s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex)); \
1509  \
1510  for (int i = 0; i < len4; i++) { \
1511  const int i0 = len4 + i, i1 = len4 - i - 1; \
1512  const int s0 = out_map[i0], s1 = out_map[i1]; \
1513  TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re }; \
1514  TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re }; \
1515  \
1516  CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re); \
1517  CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re); \
1518  } \
1519 } \
1520  \
1521 static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_inv_def) = { \
1522  .name = TX_NAME_STR("mdct_pfa_" #N "xM_inv"), \
1523  .function = TX_NAME(ff_tx_mdct_pfa_##N##xM_inv), \
1524  .type = TX_TYPE(MDCT), \
1525  .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY, \
1526  .factors = { N, TX_FACTOR_ANY }, \
1527  .nb_factors = 2, \
1528  .min_len = N*2, \
1529  .max_len = TX_LEN_UNLIMITED, \
1530  .init = TX_NAME(ff_tx_mdct_pfa_init), \
1531  .cpu_flags = FF_TX_CPU_FLAGS_ALL, \
1532  .prio = FF_TX_PRIO_BASE, \
1533 };
1534 
/* Instantiate the compound inverse MDCT for the small factors 3, 5, 7, 9
 * and 15. */
1535 DECL_COMP_IMDCT(3)
1536 DECL_COMP_IMDCT(5)
1537 DECL_COMP_IMDCT(7)
1538 DECL_COMP_IMDCT(9)
1539 DECL_COMP_IMDCT(15)
1540 
/* DECL_COMP_MDCT(N) expands to a forward MDCT function for lengths built
 * from an N-point FFT and an M-point sub-transform (M = s->sub->len), plus
 * the matching forward-only codelet descriptor.
 *
 * Generated function, in order:
 *  1. For each of the M groups, N input positions are taken from the input
 *     map, folded (two different sample combinations depending on whether
 *     the position is below N*M), pre-rotated with the twiddle at half the
 *     position, and passed to fft##N, which writes into s->tmp at the
 *     offset given by the sub-transform's map.
 *  2. The M-point sub-transform is run in place N times on s->tmp.
 *  3. The result is read through out_map and post-rotated into the strided
 *     real output.
 *
 * The input is read contiguously; stride (bytes) applies to the output. */
1541 #define DECL_COMP_MDCT(N) \
1542 static void TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd)(AVTXContext *s, void *_dst, \
1543  void *_src, ptrdiff_t stride) \
1544 { \
1545  TXComplex fft##N##in[N]; \
1546  TXSample *src = _src, *dst = _dst; \
1547  TXComplex *exp = s->exp, tmp; \
1548  const int m = s->sub->len; \
1549  const int len4 = N*m; \
1550  const int len3 = len4 * 3; \
1551  const int len8 = s->len >> 2; \
1552  const int *in_map = s->map, *out_map = in_map + N*m; \
1553  const int *sub_map = s->sub->map; \
1554  \
1555  stride /= sizeof(*dst); \
1556  \
1557  for (int i = 0; i < m; i++) { /* Folding and pre-reindexing */ \
1558  for (int j = 0; j < N; j++) { \
1559  const int k = in_map[i*N + j]; \
1560  if (k < len4) { \
1561  tmp.re = FOLD(-src[ len4 + k], src[1*len4 - 1 - k]); \
1562  tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]); \
1563  } else { \
1564  tmp.re = FOLD(-src[ len4 + k], -src[5*len4 - 1 - k]); \
1565  tmp.im = FOLD( src[-len4 + k], -src[1*len3 - 1 - k]); \
1566  } \
1567  CMUL(fft##N##in[j].im, fft##N##in[j].re, tmp.re, tmp.im, \
1568  exp[k >> 1].re, exp[k >> 1].im); \
1569  } \
1570  fft##N(s->tmp + sub_map[i], fft##N##in, m); \
1571  } \
1572  \
1573  for (int i = 0; i < N; i++) \
1574  s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex)); \
1575  \
1576  for (int i = 0; i < len8; i++) { \
1577  const int i0 = len8 + i, i1 = len8 - i - 1; \
1578  const int s0 = out_map[i0], s1 = out_map[i1]; \
1579  TXComplex src1 = { s->tmp[s1].re, s->tmp[s1].im }; \
1580  TXComplex src0 = { s->tmp[s0].re, s->tmp[s0].im }; \
1581  \
1582  CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im, \
1583  exp[i0].im, exp[i0].re); \
1584  CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im, \
1585  exp[i1].im, exp[i1].re); \
1586  } \
1587 } \
1588  \
1589 static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd_def) = { \
1590  .name = TX_NAME_STR("mdct_pfa_" #N "xM_fwd"), \
1591  .function = TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd), \
1592  .type = TX_TYPE(MDCT), \
1593  .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY, \
1594  .factors = { N, TX_FACTOR_ANY }, \
1595  .nb_factors = 2, \
1596  .min_len = N*2, \
1597  .max_len = TX_LEN_UNLIMITED, \
1598  .init = TX_NAME(ff_tx_mdct_pfa_init), \
1599  .cpu_flags = FF_TX_CPU_FLAGS_ALL, \
1600  .prio = FF_TX_PRIO_BASE, \
1601 };
1602 
/* Instantiate the compound forward MDCT for the small factors 3, 5, 7, 9
 * and 15. */
1603 DECL_COMP_MDCT(3)
1604 DECL_COMP_MDCT(5)
1605 DECL_COMP_MDCT(7)
1606 DECL_COMP_MDCT(9)
1607 DECL_COMP_MDCT(15)
1608 
/* Init callback referenced by the RDFT codelets (TX_NAME(ff_tx_rdft_init)).
 *
 * Visible behaviour: stores the scale, initialises a complex FFT of len/2
 * points and builds a table of real samples in s->exp laid out as
 *   [0..7]               eight scaling factors,
 *   [8 .. 8+len4)        cos(i*2*pi/len),
 *   [8+len4 .. 8+2*len4) cos(((len - 4*i)/4.0)*2*pi/len), negated for
 *                        forward transforms,
 * with len4 = FFALIGN(len, 4)/4.
 *
 * Returns 0 on success or a negative error code.
 *
 * NOTE(review): listing numbers 1609, 1612 and 1625 are skipped; the
 * function header, one parameter line and one statement before the
 * sub-transform init appear to be missing from this text - confirm
 * upstream. */
1610  const FFTXCodelet *cd,
1611  uint64_t flags,
1613  int len, int inv,
1614  const void *scale)
1615 {
1616  int ret;
1617  double f, m;
1618  TXSample *tab;
1619  uint64_t r2r = flags & AV_TX_REAL_TO_REAL;
1620  int len4 = FFALIGN(len, 4) / 4;
1621 
1622  s->scale_d = *((SCALE_TYPE *)scale);
1623  s->scale_f = s->scale_d;
1624 
1626 
1627  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, NULL, len >> 1, inv, scale)))
1628  return ret;
1629 
1630  if (!(s->exp = av_mallocz((8 + 2*len4)*sizeof(*s->exp))))
1631  return AVERROR(ENOMEM);
1632 
1633  tab = (TXSample *)s->exp; /* the table is filled as real samples */
1634 
1635  f = 2*M_PI/len;
1636 
1637  m = (inv ? 2*s->scale_d : s->scale_d);
1638 
1639  *tab++ = RESCALE((inv ? 0.5 : 1.0) * m);
1640  *tab++ = RESCALE(inv ? 0.5*m : 1.0*m);
1641  *tab++ = RESCALE( m);
1642  *tab++ = RESCALE(-m);
1643 
1644  *tab++ = RESCALE( (0.5 - 0.0) * m);
1645  if (r2r)
1646  *tab++ = 1 / s->scale_f; /* real-to-real mode stores the inverse scale in slot 5 */
1647  else
1648  *tab++ = RESCALE( (0.0 - 0.5) * m);
1649  *tab++ = RESCALE( (0.5 - inv) * m);
1650  *tab++ = RESCALE(-(0.5 - inv) * m);
1651 
1652  for (int i = 0; i < len4; i++)
1653  *tab++ = RESCALE(cos(i*f));
1654 
1655  tab = ((TXSample *)s->exp) + len4 + 8;
1656 
1657  for (int i = 0; i < len4; i++)
1658  *tab++ = RESCALE(cos(((len - i*4)/4.0)*f)) * (inv ? 1 : -1);
1659 
1660  return 0;
1661 }
1662 
/* DECL_RDFT(n, inv) expands to a real DFT function named ff_tx_rdft_<n> and
 * its codelet descriptor. inv selects the direction at expansion time.
 *
 * Generated function:
 *  - forward (inv == 0): the complex sub-transform is run from _src into
 *    _dst first and the even/odd separation then works on _dst;
 *  - inverse (inv != 0): the separation works directly on _src (which is
 *    therefore modified) and the sub-transform runs afterwards into _dst.
 * The table in s->exp is read as eight factors followed by a cosine table
 * and, len4 entries later, a second table used as the sine input of the
 * twiddle multiply. The incoming stride argument is not used; the
 * sub-transform is always called with sizeof(TXComplex). */
1663 #define DECL_RDFT(n, inv) \
1664 static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst, \
1665  void *_src, ptrdiff_t stride) \
1666 { \
1667  const int len2 = s->len >> 1; \
1668  const int len4 = s->len >> 2; \
1669  const TXSample *fact = (void *)s->exp; \
1670  const TXSample *tcos = fact + 8; \
1671  const TXSample *tsin = tcos + len4; \
1672  TXComplex *data = inv ? _src : _dst; \
1673  TXComplex t[3]; \
1674  \
1675  if (!inv) \
1676  s->fn[0](&s->sub[0], data, _src, sizeof(TXComplex)); \
1677  else \
1678  data[0].im = data[len2].re; \
1679  \
1680  /* The DC value's both components are real, but we need to change them \
1681  * into complex values. Also, the middle of the array is special-cased. \
1682  * These operations can be done before or after the loop. */ \
1683  t[0].re = data[0].re; \
1684  data[0].re = t[0].re + data[0].im; \
1685  data[0].im = t[0].re - data[0].im; \
1686  data[ 0].re = MULT(fact[0], data[ 0].re); \
1687  data[ 0].im = MULT(fact[1], data[ 0].im); \
1688  data[len4].re = MULT(fact[2], data[len4].re); \
1689  data[len4].im = MULT(fact[3], data[len4].im); \
1690  \
1691  for (int i = 1; i < len4; i++) { \
1692  /* Separate even and odd FFTs */ \
1693  t[0].re = MULT(fact[4], (data[i].re + data[len2 - i].re)); \
1694  t[0].im = MULT(fact[5], (data[i].im - data[len2 - i].im)); \
1695  t[1].re = MULT(fact[6], (data[i].im + data[len2 - i].im)); \
1696  t[1].im = MULT(fact[7], (data[i].re - data[len2 - i].re)); \
1697  \
1698  /* Apply twiddle factors to the odd FFT and add to the even FFT */ \
1699  CMUL(t[2].re, t[2].im, t[1].re, t[1].im, tcos[i], tsin[i]); \
1700  \
1701  data[ i].re = t[0].re + t[2].re; \
1702  data[ i].im = t[2].im - t[0].im; \
1703  data[len2 - i].re = t[0].re - t[2].re; \
1704  data[len2 - i].im = t[2].im + t[0].im; \
1705  } \
1706  \
1707  if (inv) { \
1708  s->fn[0](&s->sub[0], _dst, data, sizeof(TXComplex)); \
1709  } else { \
1710  /* Move [0].im to the last position, as convention requires */ \
1711  data[len2].re = data[0].im; \
1712  data[ 0].im = data[len2].im = 0; \
1713  } \
1714 } \
1715  \
1716 static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = { \
1717  .name = TX_NAME_STR("rdft_" #n), \
1718  .function = TX_NAME(ff_tx_rdft_ ##n), \
1719  .type = TX_TYPE(RDFT), \
1720  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | FF_TX_OUT_OF_PLACE | \
1721  (inv ? FF_TX_INVERSE_ONLY : FF_TX_FORWARD_ONLY), \
1722  .factors = { 4, TX_FACTOR_ANY }, \
1723  .nb_factors = 2, \
1724  .min_len = 4, \
1725  .max_len = TX_LEN_UNLIMITED, \
1726  .init = TX_NAME(ff_tx_rdft_init), \
1727  .cpu_flags = FF_TX_CPU_FLAGS_ALL, \
1728  .prio = FF_TX_PRIO_BASE, \
1729 };
1730 
/* Forward (real-to-complex) RDFT instance.
 * NOTE(review): listing number 1732 is skipped right after this line; a
 * second instantiation (presumably the inverse one) appears to be missing
 * from this text - confirm upstream. */
1731 DECL_RDFT(r2c, 0)
1733 
/* DECL_RDFT_HALF(n, mode, mod2) expands to a forward-only RDFT function
 * named ff_tx_rdft_<n> that keeps only one real value per bin, plus its
 * codelet descriptor.
 *
 *  - mode is compared against AV_TX_REAL_TO_REAL and
 *    AV_TX_REAL_TO_IMAGINARY to choose which combination of the
 *    intermediate terms is stored and at which output index
 *    (out[i] / out[len - i] versus out[i - 1] / out[len - i - 1]).
 *  - mod2 != 0 adds special handling of the middle pair of bins
 *    (data[len4], data[len4 + 1]) and lowers the required factor and
 *    minimum length from 4 to 2.
 *
 * The complex sub-transform writes into _dst, which is then rewritten in
 * place as real samples, so the order of loads and stores below matters
 * (see also the unrolling note inside the macro). The incoming stride
 * argument is not used. */
1734 #define DECL_RDFT_HALF(n, mode, mod2) \
1735 static void TX_NAME(ff_tx_rdft_ ##n)(AVTXContext *s, void *_dst, \
1736  void *_src, ptrdiff_t stride) \
1737 { \
1738  const int len = s->len; \
1739  const int len2 = len >> 1; \
1740  const int len4 = len >> 2; \
1741  const int aligned_len4 = FFALIGN(len, 4)/4; \
1742  const TXSample *fact = (void *)s->exp; \
1743  const TXSample *tcos = fact + 8; \
1744  const TXSample *tsin = tcos + aligned_len4; \
1745  TXComplex *data = _dst; \
1746  TXSample *out = _dst; /* Half-complex is forward-only */ \
1747  TXSample tmp_dc; \
1748  av_unused TXSample tmp_mid; \
1749  TXSample tmp[4]; \
1750  TXComplex sf, sl; \
1751  \
1752  s->fn[0](&s->sub[0], _dst, _src, sizeof(TXComplex)); \
1753  \
1754  tmp_dc = data[0].re; \
1755  data[ 0].re = tmp_dc + data[0].im; \
1756  tmp_dc = tmp_dc - data[0].im; \
1757  \
1758  data[ 0].re = MULT(fact[0], data[ 0].re); \
1759  tmp_dc = MULT(fact[1], tmp_dc); \
1760  data[len4].re = MULT(fact[2], data[len4].re); \
1761  \
1762  if (!mod2) { \
1763  data[len4].im = MULT(fact[3], data[len4].im); \
1764  } else { \
1765  sf = data[len4]; \
1766  sl = data[len4 + 1]; \
1767  if (mode == AV_TX_REAL_TO_REAL) \
1768  tmp[0] = MULT(fact[4], (sf.re + sl.re)); \
1769  else \
1770  tmp[0] = MULT(fact[5], (sf.im - sl.im)); \
1771  tmp[1] = MULT(fact[6], (sf.im + sl.im)); \
1772  tmp[2] = MULT(fact[7], (sf.re - sl.re)); \
1773  \
1774  if (mode == AV_TX_REAL_TO_REAL) { \
1775  tmp[3] = tmp[1]*tcos[len4] - tmp[2]*tsin[len4]; \
1776  tmp_mid = (tmp[0] - tmp[3]); \
1777  } else { \
1778  tmp[3] = tmp[1]*tsin[len4] + tmp[2]*tcos[len4]; \
1779  tmp_mid = (tmp[0] + tmp[3]); \
1780  } \
1781  } \
1782  \
1783  /* NOTE: unrolling this breaks non-mod8 lengths */ \
1784  for (int i = 1; i <= len4; i++) { \
1785  TXSample tmp[4]; \
1786  TXComplex sf = data[i]; \
1787  TXComplex sl = data[len2 - i]; \
1788  \
1789  if (mode == AV_TX_REAL_TO_REAL) \
1790  tmp[0] = MULT(fact[4], (sf.re + sl.re)); \
1791  else \
1792  tmp[0] = MULT(fact[5], (sf.im - sl.im)); \
1793  \
1794  tmp[1] = MULT(fact[6], (sf.im + sl.im)); \
1795  tmp[2] = MULT(fact[7], (sf.re - sl.re)); \
1796  \
1797  if (mode == AV_TX_REAL_TO_REAL) { \
1798  tmp[3] = tmp[1]*tcos[i] - tmp[2]*tsin[i]; \
1799  out[i] = (tmp[0] + tmp[3]); \
1800  out[len - i] = (tmp[0] - tmp[3]); \
1801  } else { \
1802  tmp[3] = tmp[1]*tsin[i] + tmp[2]*tcos[i]; \
1803  out[i - 1] = (tmp[3] - tmp[0]); \
1804  out[len - i - 1] = (tmp[0] + tmp[3]); \
1805  } \
1806  } \
1807  \
1808  for (int i = 1; i < (len4 + (mode == AV_TX_REAL_TO_IMAGINARY)); i++) \
1809  out[len2 - i] = out[len - i]; \
1810  \
1811  if (mode == AV_TX_REAL_TO_REAL) { \
1812  out[len2] = tmp_dc; \
1813  if (mod2) \
1814  out[len4 + 1] = tmp_mid * fact[5]; \
1815  } else if (mod2) { \
1816  out[len4] = tmp_mid; \
1817  } \
1818 } \
1819  \
1820 static const FFTXCodelet TX_NAME(ff_tx_rdft_ ##n## _def) = { \
1821  .name = TX_NAME_STR("rdft_" #n), \
1822  .function = TX_NAME(ff_tx_rdft_ ##n), \
1823  .type = TX_TYPE(RDFT), \
1824  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE | mode | \
1825  FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY, \
1826  .factors = { 2 + 2*(!mod2), TX_FACTOR_ANY }, \
1827  .nb_factors = 2, \
1828  .min_len = 2 + 2*(!mod2), \
1829  .max_len = TX_LEN_UNLIMITED, \
1830  .init = TX_NAME(ff_tx_rdft_init), \
1831  .cpu_flags = FF_TX_CPU_FLAGS_ALL, \
1832  .prio = FF_TX_PRIO_BASE, \
1833 };
1834 
/* Real-to-real half RDFT instance with middle-bin handling enabled.
 * NOTE(review): listing numbers 1835, 1837 and 1838 are skipped around this
 * line; further instantiations appear to be missing from this text -
 * confirm upstream. */
1836 DECL_RDFT_HALF(r2r_mod2, AV_TX_REAL_TO_REAL, 1)
1839 
/* Init callback referenced by the dctII / dctIII codelets
 * (TX_NAME(ff_tx_dct_init)).
 *
 * Visible behaviour: for inverse transforms the length (both the local
 * copy and s->len) is doubled and the scale halved; an RDFT sub-transform
 * is initialised; a table of 3*len/2 real samples is stored in s->exp:
 *   [0 .. len)         cos(i*pi/(2*len)), doubled for forward transforms,
 *   [len .. len+len/2) 0.5/sin((2*i+1)*pi/(2*len)) for inverse transforms,
 *                      cos((len-2*i-1)*pi/(2*len)) for forward ones.
 *
 * Returns 0 on success or a negative error code.
 *
 * NOTE(review): listing numbers 1840, 1843 and 1881 are skipped; the
 * function header, one parameter line and the closing brace appear to be
 * missing from this text - confirm upstream. */
1841  const FFTXCodelet *cd,
1842  uint64_t flags,
1844  int len, int inv,
1845  const void *scale)
1846 {
1847  int ret;
1848  double freq;
1849  TXSample *tab;
1850  SCALE_TYPE rsc = *((SCALE_TYPE *)scale);
1851 
1852  if (inv) {
1853  len *= 2;
1854  s->len *= 2;
1855  rsc *= 0.5;
1856  }
1857 
1858  if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL, len, inv, &rsc)))
1859  return ret;
1860 
1861  s->exp = av_malloc((len/2)*3*sizeof(TXSample));
1862  if (!s->exp)
1863  return AVERROR(ENOMEM);
1864 
1865  tab = (TXSample *)s->exp; /* the table is filled as real samples */
1866 
1867  freq = M_PI/(len*2);
1868 
1869  for (int i = 0; i < len; i++)
1870  tab[i] = RESCALE(cos(i*freq)*(!inv + 1)); /* factor is 2 when forward, 1 when inverse */
1871 
1872  if (inv) {
1873  for (int i = 0; i < len/2; i++)
1874  tab[len + i] = RESCALE(0.5 / sin((2*i + 1)*freq));
1875  } else {
1876  for (int i = 0; i < len/2; i++)
1877  tab[len + i] = RESCALE(cos((len - 2*i - 1)*freq));
1878  }
1879 
1880  return 0;
1882 
1883 static void TX_NAME(ff_tx_dctII)(AVTXContext *s, void *_dst,
1884  void *_src, ptrdiff_t stride)
1885 {
1886  TXSample *dst = _dst;
1887  TXSample *src = _src;
1888  const int len = s->len;
1889  const int len2 = len >> 1;
1890  const TXSample *exp = (void *)s->exp;
1891  TXSample next;
1892 #ifdef TX_INT32
1893  int64_t tmp1, tmp2;
1894 #else
1895  TXSample tmp1, tmp2;
1896 #endif
1897 
1898  for (int i = 0; i < len2; i++) {
1899  TXSample in1 = src[i];
1900  TXSample in2 = src[len - i - 1];
1901  TXSample s = exp[len + i];
1902 
1903 #ifdef TX_INT32
1904  tmp1 = in1 + in2;
1905  tmp2 = in1 - in2;
1906 
1907  tmp1 >>= 1;
1908  tmp2 *= s;
1909 
1910  tmp2 = (tmp2 + 0x40000000) >> 31;
1911 #else
1912  tmp1 = (in1 + in2)*0.5;
1913  tmp2 = (in1 - in2)*s;
1914 #endif
1915 
1916  src[i] = tmp1 + tmp2;
1917  src[len - i - 1] = tmp1 - tmp2;
1918  }
1919 
1920  s->fn[0](&s->sub[0], dst, src, sizeof(TXComplex));
1921 
1922  next = dst[len];
1923 
1924  for (int i = len - 2; i > 0; i -= 2) {
1925  TXSample tmp;
1926 
1927  CMUL(tmp, dst[i], exp[len - i], exp[i], dst[i + 0], dst[i + 1]);
1928 
1929  dst[i + 1] = next;
1930 
1931  next += tmp;
1932  }
1933 
1934 #ifdef TX_INT32
1935  tmp1 = ((int64_t)exp[0]) * ((int64_t)dst[0]);
1936  dst[0] = (tmp1 + 0x40000000) >> 31;
1937 #else
1938  dst[0] = exp[0] * dst[0];
1939 #endif
1940  dst[1] = next;
1942 
1943 static void TX_NAME(ff_tx_dctIII)(AVTXContext *s, void *_dst,
1944  void *_src, ptrdiff_t stride)
1945 {
1946  TXSample *dst = _dst;
1947  TXSample *src = _src;
1948  const int len = s->len;
1949  const int len2 = len >> 1;
1950  const TXSample *exp = (void *)s->exp;
1951 #ifdef TX_INT32
1952  int64_t tmp1, tmp2 = src[len - 1];
1953  tmp2 = (2*tmp2 + 0x40000000) >> 31;
1954 #else
1955  TXSample tmp1, tmp2 = 2*src[len - 1];
1956 #endif
1957 
1958  src[len] = tmp2;
1959 
1960  for (int i = len - 2; i >= 2; i -= 2) {
1961  TXSample val1 = src[i - 0];
1962  TXSample val2 = src[i - 1] - src[i + 1];
1963 
1964  CMUL(src[i + 1], src[i], exp[len - i], exp[i], val1, val2);
1965  }
1966 
1967  s->fn[0](&s->sub[0], dst, src, sizeof(float));
1968 
1969  for (int i = 0; i < len2; i++) {
1970  TXSample in1 = dst[i];
1971  TXSample in2 = dst[len - i - 1];
1972  TXSample c = exp[len + i];
1973 
1974  tmp1 = in1 + in2;
1975  tmp2 = in1 - in2;
1976  tmp2 *= c;
1977 #ifdef TX_INT32
1978  tmp2 = (tmp2 + 0x40000000) >> 31;
1979 #endif
1980 
1981  dst[i] = tmp1 + tmp2;
1982  dst[len - i - 1] = tmp1 - tmp2;
1983  }
1984 }
1985 
/* Codelet descriptor for the DCT-II.
 * NOTE(review): listing numbers 1991 and 1996 are skipped; the .flags
 * expression below ends in a dangling '|', so lines appear to be missing
 * from this text. Unlike the neighbouring descriptors, no .nb_factors entry
 * is visible here - confirm upstream whether that is intended. */
1986 static const FFTXCodelet TX_NAME(ff_tx_dctII_def) = {
1987  .name = TX_NAME_STR("dctII"),
1988  .function = TX_NAME(ff_tx_dctII),
1989  .type = TX_TYPE(DCT),
1990  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE |
1992  .factors = { 2, TX_FACTOR_ANY },
1993  .min_len = 2,
1994  .max_len = TX_LEN_UNLIMITED,
1995  .init = TX_NAME(ff_tx_dct_init),
1997  .prio = FF_TX_PRIO_BASE,
1998 };
1999 
/* Codelet descriptor for the DCT-III; it shares the init callback with the
 * DCT-II.
 * NOTE(review): listing numbers 2005 and 2010 are skipped; the .flags
 * expression below ends in a dangling '|', so lines appear to be missing
 * from this text. No .nb_factors entry is visible here either - confirm
 * upstream. */
2000 static const FFTXCodelet TX_NAME(ff_tx_dctIII_def) = {
2001  .name = TX_NAME_STR("dctIII"),
2002  .function = TX_NAME(ff_tx_dctIII),
2003  .type = TX_TYPE(DCT),
2004  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE |
2006  .factors = { 2, TX_FACTOR_ANY },
2007  .min_len = 2,
2008  .max_len = TX_LEN_UNLIMITED,
2009  .init = TX_NAME(ff_tx_dct_init),
2011  .prio = FF_TX_PRIO_BASE,
2012 };
2013 
/* Init callback referenced by the dctI / dstI codelets
 * (TX_NAME(ff_tx_dcstI_init)).
 *
 * Visible behaviour: for inverse transforms the length (local copy and
 * s->len) is doubled and the scale halved; a forward RDFT sub-transform of
 * 2*(len - 1) points (DCT-I) or 2*(len + 1) points (DST-I) is initialised,
 * and a zeroed scratch buffer of 2*(len + 1) samples is allocated in
 * s->tmp.
 *
 * Returns 0 on success or a negative error code.
 *
 * NOTE(review): listing numbers 2014, 2017, 2032 and 2044 are skipped; the
 * function header, one parameter line, the second half of the conditional
 * expression that sets the RDFT mode flag, and the closing brace appear to
 * be missing from this text - confirm upstream. */
2015  const FFTXCodelet *cd,
2016  uint64_t flags,
2018  int len, int inv,
2019  const void *scale)
2020 {
2021  int ret;
2022  SCALE_TYPE rsc = *((SCALE_TYPE *)scale);
2023 
2024  if (inv) {
2025  len *= 2;
2026  s->len *= 2;
2027  rsc *= 0.5;
2028  }
2029 
2030  /* We want a half-complex RDFT */
2031  flags |= cd->type == TX_TYPE(DCT_I) ? AV_TX_REAL_TO_REAL :
2033 
2034  if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL,
2035  (len - 1 + 2*(cd->type == TX_TYPE(DST_I)))*2,
2036  0, &rsc)))
2037  return ret;
2038 
2039  s->tmp = av_mallocz((len + 1)*2*sizeof(TXSample));
2040  if (!s->tmp)
2041  return AVERROR(ENOMEM);
2042 
2043  return 0;
2045 
2046 static void TX_NAME(ff_tx_dctI)(AVTXContext *s, void *_dst,
2047  void *_src, ptrdiff_t stride)
2048 {
2049  TXSample *dst = _dst;
2050  TXSample *src = _src;
2051  const int len = s->len - 1;
2052  TXSample *tmp = (TXSample *)s->tmp;
2053 
2054  stride /= sizeof(TXSample);
2055 
2056  for (int i = 0; i < len; i++)
2057  tmp[i] = tmp[2*len - i] = src[i * stride];
2058 
2059  tmp[len] = src[len * stride]; /* Middle */
2060 
2061  s->fn[0](&s->sub[0], dst, tmp, sizeof(TXSample));
2063 
2064 static void TX_NAME(ff_tx_dstI)(AVTXContext *s, void *_dst,
2065  void *_src, ptrdiff_t stride)
2066 {
2067  TXSample *dst = _dst;
2068  TXSample *src = _src;
2069  const int len = s->len + 1;
2070  TXSample *tmp = (void *)s->tmp;
2071 
2072  stride /= sizeof(TXSample);
2073 
2074  tmp[0] = 0;
2075 
2076  for (int i = 1; i < len; i++) {
2077  TXSample a = src[(i - 1) * stride];
2078  tmp[i] = -a;
2079  tmp[2*len - i] = a;
2080  }
2081 
2082  tmp[len] = 0; /* i == n, Nyquist */
2083 
2084  s->fn[0](&s->sub[0], dst, tmp, sizeof(float));
2085 }
2086 
2087 static const FFTXCodelet TX_NAME(ff_tx_dctI_def) = {
2088  .name = TX_NAME_STR("dctI"),
2089  .function = TX_NAME(ff_tx_dctI),
2090  .type = TX_TYPE(DCT_I),
2092  .factors = { 2, TX_FACTOR_ANY },
2093  .nb_factors = 2,
2094  .min_len = 2,
2095  .max_len = TX_LEN_UNLIMITED,
2096  .init = TX_NAME(ff_tx_dcstI_init),
2098  .prio = FF_TX_PRIO_BASE,
2099 };
2100 
2101 static const FFTXCodelet TX_NAME(ff_tx_dstI_def) = {
2102  .name = TX_NAME_STR("dstI"),
2103  .function = TX_NAME(ff_tx_dstI),
2104  .type = TX_TYPE(DST_I),
2106  .factors = { 2, TX_FACTOR_ANY },
2107  .nb_factors = 2,
2108  .min_len = 2,
2109  .max_len = TX_LEN_UNLIMITED,
2110  .init = TX_NAME(ff_tx_dcstI_init),
2112  .prio = FF_TX_PRIO_BASE,
2113 };
2114 
2115 int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s, int *pre_tab)
2116 {
2117  int off = 0;
2118  int len4 = s->len >> 1;
2119  double scale = s->scale_d;
2120  const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0;
2121  size_t alloc = pre_tab ? 2*len4 : len4;
2122 
2123  if (!(s->exp = av_malloc_array(alloc, sizeof(*s->exp))))
2124  return AVERROR(ENOMEM);
2125 
2126  scale = sqrt(fabs(scale));
2127 
2128  if (pre_tab)
2129  off = len4;
2130 
2131  for (int i = 0; i < len4; i++) {
2132  const double alpha = M_PI_2 * (i + theta) / len4;
2133  s->exp[off + i] = (TXComplex){ RESCALE(cos(alpha) * scale),
2134  RESCALE(sin(alpha) * scale) };
2135  }
2136 
2137  if (pre_tab)
2138  for (int i = 0; i < len4; i++)
2139  s->exp[i] = s->exp[len4 + pre_tab[i]];
2140 
2141  return 0;
2142 }
2143 
2144 const FFTXCodelet * const TX_NAME(ff_tx_codelet_list)[] = {
2145  /* Split-Radix codelets */
2146  &TX_NAME(ff_tx_fft2_ns_def),
2147  &TX_NAME(ff_tx_fft4_ns_def),
2148  &TX_NAME(ff_tx_fft8_ns_def),
2149  &TX_NAME(ff_tx_fft16_ns_def),
2150  &TX_NAME(ff_tx_fft32_ns_def),
2151  &TX_NAME(ff_tx_fft64_ns_def),
2152  &TX_NAME(ff_tx_fft128_ns_def),
2153  &TX_NAME(ff_tx_fft256_ns_def),
2154  &TX_NAME(ff_tx_fft512_ns_def),
2155  &TX_NAME(ff_tx_fft1024_ns_def),
2156  &TX_NAME(ff_tx_fft2048_ns_def),
2157  &TX_NAME(ff_tx_fft4096_ns_def),
2158  &TX_NAME(ff_tx_fft8192_ns_def),
2159  &TX_NAME(ff_tx_fft16384_ns_def),
2160  &TX_NAME(ff_tx_fft32768_ns_def),
2161  &TX_NAME(ff_tx_fft65536_ns_def),
2162  &TX_NAME(ff_tx_fft131072_ns_def),
2163  &TX_NAME(ff_tx_fft262144_ns_def),
2164  &TX_NAME(ff_tx_fft524288_ns_def),
2165  &TX_NAME(ff_tx_fft1048576_ns_def),
2166  &TX_NAME(ff_tx_fft2097152_ns_def),
2167 
2168  /* Prime factor codelets */
2169  &TX_NAME(ff_tx_fft3_ns_def),
2170  &TX_NAME(ff_tx_fft5_ns_def),
2171  &TX_NAME(ff_tx_fft7_ns_def),
2172  &TX_NAME(ff_tx_fft9_ns_def),
2173  &TX_NAME(ff_tx_fft15_ns_def),
2174 
2175  /* We get these for free */
2176  &TX_NAME(ff_tx_fft3_fwd_def),
2177  &TX_NAME(ff_tx_fft5_fwd_def),
2178  &TX_NAME(ff_tx_fft7_fwd_def),
2179  &TX_NAME(ff_tx_fft9_fwd_def),
2180 
2181  /* Standalone transforms */
2182  &TX_NAME(ff_tx_fft_def),
2183  &TX_NAME(ff_tx_fft_inplace_def),
2184  &TX_NAME(ff_tx_fft_inplace_small_def),
2185  &TX_NAME(ff_tx_fft_pfa_def),
2186  &TX_NAME(ff_tx_fft_pfa_ns_def),
2187  &TX_NAME(ff_tx_fft_naive_def),
2188  &TX_NAME(ff_tx_fft_naive_small_def),
2189  &TX_NAME(ff_tx_mdct_fwd_def),
2190  &TX_NAME(ff_tx_mdct_inv_def),
2191  &TX_NAME(ff_tx_mdct_pfa_3xM_fwd_def),
2192  &TX_NAME(ff_tx_mdct_pfa_5xM_fwd_def),
2193  &TX_NAME(ff_tx_mdct_pfa_7xM_fwd_def),
2194  &TX_NAME(ff_tx_mdct_pfa_9xM_fwd_def),
2195  &TX_NAME(ff_tx_mdct_pfa_15xM_fwd_def),
2196  &TX_NAME(ff_tx_mdct_pfa_3xM_inv_def),
2197  &TX_NAME(ff_tx_mdct_pfa_5xM_inv_def),
2198  &TX_NAME(ff_tx_mdct_pfa_7xM_inv_def),
2199  &TX_NAME(ff_tx_mdct_pfa_9xM_inv_def),
2200  &TX_NAME(ff_tx_mdct_pfa_15xM_inv_def),
2201  &TX_NAME(ff_tx_mdct_naive_fwd_def),
2202  &TX_NAME(ff_tx_mdct_naive_inv_def),
2203  &TX_NAME(ff_tx_mdct_inv_full_def),
2204  &TX_NAME(ff_tx_rdft_r2c_def),
2205  &TX_NAME(ff_tx_rdft_r2r_def),
2206  &TX_NAME(ff_tx_rdft_r2r_mod2_def),
2207  &TX_NAME(ff_tx_rdft_r2i_def),
2208  &TX_NAME(ff_tx_rdft_r2i_mod2_def),
2209  &TX_NAME(ff_tx_rdft_c2r_def),
2210  &TX_NAME(ff_tx_dctII_def),
2211  &TX_NAME(ff_tx_dctIII_def),
2212  &TX_NAME(ff_tx_dctI_def),
2213  &TX_NAME(ff_tx_dstI_def),
2214 
2215  NULL,
2216 };
func
int(* func)(AVBPrint *dst, const char *in, const char *arg)
Definition: jacosubdec.c:68
DCT_I
@ DCT_I
Definition: avfft.h:121
ff_tx_fft_sr_combine
static void TX_NAME() ff_tx_fft_sr_combine(TXComplex *z, const TXSample *cos, int len)
Definition: tx_template.c:566
ff_tx_dct_init
static av_cold int TX_NAME() ff_tx_dct_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1838
AV_TX_REAL_TO_REAL
@ AV_TX_REAL_TO_REAL
Perform a real to half-complex RDFT.
Definition: tx.h:184
AVERROR
Filter design: the word “frame” indicates either a video frame or a group of audio samples, as stored in an AVFrame structure. Format negotiation: for each input and each output, the filter provides the list of supported formats. For video that means pixel format; for audio that means channel layout and sample format. The lists are not plain lists: they are references to shared objects. When the negotiation mechanism computes the intersection of the formats supported at each end of a link, all references to both lists are replaced with a reference to the intersection. And when a single format is eventually chosen for a link amongst the remaining formats, all references to the list are updated. That means that if a filter requires that its input and output have the same format amongst a supported list, all it has to do is use a reference to the same list of formats. query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism to try again later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
out
FILE * out
Definition: movenc.c:55
ff_ctz
#define ff_ctz
Definition: intmath.h:107
TRANSFORM
#define TRANSFORM(a0, a1, a2, a3, wre, wim)
Definition: tx_template.c:558
src1
const pixel * src1
Definition: h264pred_template.c:421
AVTXContext
Definition: tx_priv.h:235
ff_tx_fft
static void TX_NAME() ff_tx_fft(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:771
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
FFTXCodeletOptions
Definition: tx_priv.h:183
w
uint8_t w
Definition: llviddspenc.c:38
M_PI_2
#define M_PI_2
Definition: mathematics.h:73
TX_MAX_DECOMPOSITIONS
#define TX_MAX_DECOMPOSITIONS
Definition: tx_priv.h:197
SR_POW2_TABLES
#define SR_POW2_TABLES
Definition: tx_template.c:32
ff_tx_fft_pfa
static void TX_NAME() ff_tx_fft_pfa(AVTXContext *s, void *_out, void *_in, ptrdiff_t stride)
Definition: tx_template.c:1066
ff_tx_fft16_ns
static void TX_NAME() ff_tx_fft16_ns(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:683
ff_tx_gen_inplace_map
int ff_tx_gen_inplace_map(AVTXContext *s, int len)
Definition: tx.c:156
t1
#define t1
Definition: regdef.h:29
fft15
static av_always_inline void fft15(TXComplex *out, TXComplex *in, ptrdiff_t stride)
Definition: tx_template.c:469
FF_TX_CPU_FLAGS_ALL
#define FF_TX_CPU_FLAGS_ALL
Definition: tx_priv.h:230
ff_tx_gen_compound_mapping
int ff_tx_gen_compound_mapping(AVTXContext *s, FFTXCodeletOptions *opts, int inv, int n, int m)
Definition: tx.c:75
ff_tx_dctI
static void TX_NAME() ff_tx_dctI(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:2044
ff_tx_fft_naive
static void TX_NAME() ff_tx_fft_naive(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:879
av_malloc
#define av_malloc(s)
Definition: tableprint_vlc.h:30
DECL_FFT5
#define DECL_FFT5(NAME, D0, D1, D2, D3, D4)
Definition: tx_template.c:215
ff_tx_mdct_naive_fwd
static void TX_NAME() ff_tx_mdct_naive_fwd(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1152
ff_tx_rdft_init
static av_cold int TX_NAME() ff_tx_rdft_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1609
DECL_SR_CODELET_DEF
#define DECL_SR_CODELET_DEF(n)
Definition: tx_template.c:603
FFTabInitData::func
void(* func)(void)
Definition: tx_template.c:65
sr_tabs_init_funcs
static SR_POW2_TABLES void(*const sr_tabs_init_funcs[])(void)
Definition: tx_template.c:83
tab
static const struct twinvq_data tab
Definition: twinvq_data.h:10345
TX_NAME
static const FFTXCodelet TX_NAME(ff_tx_fft_def)
FF_TX_MAP_GATHER
@ FF_TX_MAP_GATHER
Definition: tx_priv.h:176
sum_d
static void sum_d(const int *input, int *output, int len)
Definition: dcadct.c:51
TX_INT32
#define TX_INT32
Definition: tx_int32.c:19
sr_tabs_init_once
static AVOnce sr_tabs_init_once[]
Definition: tx_template.c:89
val
static double val(void *priv, double ch)
Definition: aeval.c:78
DECL_FACTOR_F
#define DECL_FACTOR_F(n)
Definition: tx_template.c:521
TX_MAX_SUB
#define TX_MAX_SUB
Definition: tx_priv.h:194
TABLE_DEF
#define TABLE_DEF(name, size)
Definition: tx_template.c:29
FFTXCodelet::type
enum AVTXType type
Definition: tx_priv.h:202
FFTXCodeletOptions::map_dir
FFTXMapDirection map_dir
Definition: tx_priv.h:187
mult
static int16_t mult(Float11 *f1, Float11 *f2)
Definition: g726.c:60
ff_thread_once
static int ff_thread_once(char *control, void(*routine)(void))
Definition: thread.h:205
FF_ARRAY_ELEMS
#define FF_ARRAY_ELEMS(a)
Definition: sinewin_tablegen.c:29
av_cold
#define av_cold
Definition: attributes.h:90
FFTabInitData
Definition: tx_template.c:64
float
float
Definition: af_crystalizer.c:121
c2r
static void c2r(float *buffer, int size)
Definition: af_apsyclip.c:387
s
#define s(width, name)
Definition: cbs_vp9.c:198
ff_tx_fft_factor_init
static av_cold int TX_NAME() ff_tx_fft_factor_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:482
ff_tx_mdct_fwd
static void TX_NAME() ff_tx_mdct_fwd(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1281
t7
#define t7
Definition: regdef.h:35
ff_tx_mdct_naive_init
static av_cold int TX_NAME() ff_tx_mdct_naive_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1140
FF_TX_FORWARD_ONLY
#define FF_TX_FORWARD_ONLY
Definition: tx_priv.h:158
FFTXCodelet::cpu_flags
int cpu_flags
Definition: tx_priv.h:227
DECL_FACTOR_S
#define DECL_FACTOR_S(n)
Definition: tx_template.c:500
ff_tx_dstI
static void TX_NAME() ff_tx_dstI(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:2062
if
if(ret)
Definition: filter_design.txt:179
AV_TX_FULL_IMDCT
@ AV_TX_FULL_IMDCT
Performs a full inverse MDCT rather than leaving out samples that can be derived through symmetry.
Definition: tx.h:175
opts
AVDictionary * opts
Definition: movenc.c:51
AV_ONCE_INIT
#define AV_ONCE_INIT
Definition: thread.h:203
fabs
static __device__ float fabs(float a)
Definition: cuda_runtime.h:182
AV_TX_REAL_TO_IMAGINARY
@ AV_TX_REAL_TO_IMAGINARY
Definition: tx.h:185
NULL
#define NULL
Definition: coverity.c:32
t5
#define t5
Definition: regdef.h:33
ff_tx_mdct_init
static av_cold int TX_NAME() ff_tx_mdct_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1231
t6
#define t6
Definition: regdef.h:34
AV_TX_INPLACE
@ AV_TX_INPLACE
Allows for in-place transformations, where input == output.
Definition: tx.h:161
ff_tx_gen_ptwo_revtab
int ff_tx_gen_ptwo_revtab(AVTXContext *s, FFTXCodeletOptions *opts)
Definition: tx.c:136
r2c
static void r2c(float *buffer, int size)
Definition: af_apsyclip.c:378
FF_TX_OUT_OF_PLACE
#define FF_TX_OUT_OF_PLACE
Definition: tx_priv.h:154
CMUL3
#define CMUL3(c, a, b)
Definition: tx_priv.h:150
AV_TX_UNALIGNED
@ AV_TX_UNALIGNED
Relaxes alignment requirement for the in and out arrays of av_tx_fn().
Definition: tx.h:167
exp
int8_t exp
Definition: eval.c:73
ff_tx_dctIII
static void TX_NAME() ff_tx_dctIII(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1941
DECL_COMP_MDCT
#define DECL_COMP_MDCT(N)
Definition: tx_template.c:1541
AVOnce
#define AVOnce
Definition: thread.h:202
c
Undefined Behavior: in the C language, some operations are undefined, like signed integer overflow, dereferencing freed pointers, or accessing outside allocated space. Undefined Behavior must not occur in a C program; it is not safe even if the output of undefined operations is unused. The unsafety may seem like nitpicking, but optimizing compilers have in fact optimized code on the assumption that no undefined behavior occurs. Optimizing code based on wrong assumptions can lead, and in some cases has led, to effects beyond the output of computations. The signed integer overflow problem in speed-critical code: code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
ff_tx_fft_pfa_init
static av_cold int TX_NAME() ff_tx_fft_pfa_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:956
ff_tx_clear_ctx
void ff_tx_clear_ctx(AVTXContext *s)
Definition: tx.c:290
ff_tx_fft2_ns
static void TX_NAME() ff_tx_fft2_ns(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:635
FF_TX_PRESHUFFLE
#define FF_TX_PRESHUFFLE
Definition: tx_priv.h:156
ff_tx_fft_sr_codelet_init
static av_cold int TX_NAME() ff_tx_fft_sr_codelet_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:592
ff_tx_gen_default_map
int ff_tx_gen_default_map(AVTXContext *s, FFTXCodeletOptions *opts)
Definition: tx.c:525
f
f
Definition: af_crystalizer.c:121
ff_tx_init_tab_53
static av_cold void TX_TAB() ff_tx_init_tab_53(void)
Definition: tx_template.c:95
dc
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled top and top right vectors is used as motion vector prediction the used motion vector is the sum of the predictor and(mvx_diff, mvy_diff) *mv_scale Intra DC Prediction block[y][x] dc[1]
Definition: snow.txt:400
FF_TX_PRIO_BASE
@ FF_TX_PRIO_BASE
Definition: tx_priv.h:162
for
for(k=2;k<=8;++k)
Definition: h264pred_template.c:425
ff_tx_fft8_ns
static void TX_NAME() ff_tx_fft8_ns(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:664
fft9
static av_always_inline void fft9(TXComplex *out, TXComplex *in, ptrdiff_t stride)
Definition: tx_template.c:344
t8
#define t8
Definition: regdef.h:53
BF
#define BF(a, b, c, s)
Definition: dct32_template.c:90
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
TX_EMBED_INPUT_PFA_MAP
#define TX_EMBED_INPUT_PFA_MAP(map, tot_len, d1, d2)
Definition: tx_priv.h:271
ff_tx_fft_inplace
static void TX_NAME() ff_tx_fft_inplace(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:788
DECL_RDFT_HALF
#define DECL_RDFT_HALF(n, mode, mod2)
Definition: tx_template.c:1732
M_PI
#define M_PI
Definition: mathematics.h:67
ff_tx_fft_init
static av_cold int TX_NAME() ff_tx_fft_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:732
DST_I
@ DST_I
Definition: avfft.h:122
TXComplex
void TXComplex
Definition: tx_priv.h:65
ff_tx_mdct_inv
static void TX_NAME() ff_tx_mdct_inv(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1320
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
t4
#define t4
Definition: regdef.h:32
t3
#define t3
Definition: regdef.h:31
av_malloc_array
#define av_malloc_array(a, b)
Definition: tableprint_vlc.h:31
nptwo_tabs_init_once
static AVOnce nptwo_tabs_init_once[]
Definition: tx_template.c:142
av_always_inline
#define av_always_inline
Definition: attributes.h:49
ff_tx_fft_init_naive_small
static av_cold int TX_NAME() ff_tx_fft_init_naive_small(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:854
DECL_SR_CODELET
#define DECL_SR_CODELET(n, n2, n4)
Definition: tx_template.c:619
DECL_COMP_IMDCT
#define DECL_COMP_IMDCT(N)
Definition: tx_template.c:1479
av_mallocz
void * av_mallocz(size_t size)
Allocate a memory block with alignment suitable for all memory accesses (including vectors if availab...
Definition: mem.c:256
len
int len
Definition: vorbis_enc_data.h:426
fft3
static av_always_inline void fft3(TXComplex *out, TXComplex *in, ptrdiff_t stride)
Definition: tx_template.c:177
FF_TX_MAP_SCATTER
@ FF_TX_MAP_SCATTER
Definition: tx_priv.h:179
TX_LEN_UNLIMITED
#define TX_LEN_UNLIMITED
Definition: tx_priv.h:216
stride
#define stride
Definition: h264pred_template.c:537
nptwo_tabs_init_data
static const FFTabInitData nptwo_tabs_init_data[]
Definition: tx_template.c:136
ret
ret
Definition: filter_design.txt:187
ff_tx_init_subtx
av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx.c:712
FFSWAP
#define FFSWAP(type, a, b)
Definition: macros.h:52
ff_tx_init_tab_7
static av_cold void TX_TAB() ff_tx_init_tab_7(void)
Definition: tx_template.c:114
TX_FACTOR_ANY
#define TX_FACTOR_ANY
Definition: tx_priv.h:209
FF_TX_INVERSE_ONLY
#define FF_TX_INVERSE_ONLY
Definition: tx_priv.h:157
ff_tx_fft_naive_small
static void TX_NAME() ff_tx_fft_naive_small(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:906
ff_tx_init_tab_9
static av_cold void TX_TAB() ff_tx_init_tab_9(void)
Definition: tx_template.c:124
FFTXCodelet
Definition: tx_priv.h:199
ff_tx_init_tabs
av_cold void TX_TAB() ff_tx_init_tabs(int len)
Definition: tx_template.c:148
t2
#define t2
Definition: regdef.h:30
ff_tx_mdct_naive_inv
static void TX_NAME() ff_tx_mdct_naive_inv(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1173
FFTabInitData::factors
int factors[TX_MAX_SUB]
Definition: tx_template.c:66
ff_tx_dctII
static void TX_NAME() ff_tx_dctII(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1881
BUTTERFLIES
#define BUTTERFLIES(a0, a1, a2, a3)
Definition: tx_template.c:544
ff_tx_fft_pfa_ns
static void TX_NAME() ff_tx_fft_pfa_ns(AVTXContext *s, void *_out, void *_in, ptrdiff_t stride)
Definition: tx_template.c:1090
src0
const pixel *const src0
Definition: h264pred_template.c:420
FFTXCodelet::name
const char * name
Definition: tx_priv.h:200
factor
static const int factor[16]
Definition: vf_pp7.c:79
ff_tx_dcstI_init
static av_cold int TX_NAME() ff_tx_dcstI_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:2012
mem.h
ff_tx_fft_inplace_small_init
static av_cold int TX_NAME() ff_tx_fft_inplace_small_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:758
map
const VDPAUPixFmtMap * map
Definition: hwcontext_vdpau.c:71
scale
static void scale(int *out, const int *in, const int w, const int h, const int shift)
Definition: intra.c:291
FFALIGN
#define FFALIGN(x, a)
Definition: macros.h:78
alpha
static const int16_t alpha[]
Definition: ilbcdata.h:55
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
fft7
static av_always_inline void fft7(TXComplex *out, TXComplex *in, ptrdiff_t stride)
Definition: tx_template.c:256
int32_t
int32_t
Definition: audioconvert.c:56
flags
#define flags(name, subs,...)
Definition: cbs_av1.c:474
ff_tx_mdct_gen_exp
int TX_TAB() ff_tx_mdct_gen_exp(AVTXContext *s, int *pre_tab)
Definition: tx_template.c:2113
ff_tx_gen_pfa_input_map
int ff_tx_gen_pfa_input_map(AVTXContext *s, FFTXCodeletOptions *opts, int d1, int d2)
Definition: tx.c:44
DECL_RDFT
#define DECL_RDFT(n, inv)
Definition: tx_template.c:1663
ff_tx_mdct_pfa_init
static av_cold int TX_NAME() ff_tx_mdct_pfa_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1433
ff_tx_fft4_ns
static void TX_NAME() ff_tx_fft4_ns(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:647
ff_tx_mdct_inv_full_init
static av_cold int TX_NAME() ff_tx_mdct_inv_full_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1380
ff_tx_decompose_length
int ff_tx_decompose_length(int dst[TX_MAX_DECOMPOSITIONS], enum AVTXType type, int len, int inv)
Definition: tx.c:412
TX_TYPE
#define TX_TYPE
Definition: aacdec.c:36
ff_tx_mdct_inv_full
static void TX_NAME() ff_tx_mdct_inv_full(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1400
FF_TX_PRIO_MIN
@ FF_TX_PRIO_MIN
Definition: tx_priv.h:167