FFmpeg
tx_template.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) Lynne
3  *
4  * Power of two FFT:
5  * Copyright (c) Lynne
6  * Copyright (c) 2008 Loren Merritt
7  * Copyright (c) 2002 Fabrice Bellard
8  * Partly based on libdjbfft by D. J. Bernstein
9  *
10  * This file is part of FFmpeg.
11  *
12  * FFmpeg is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public
14  * License as published by the Free Software Foundation; either
15  * version 2.1 of the License, or (at your option) any later version.
16  *
17  * FFmpeg is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with FFmpeg; if not, write to the Free Software
24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25  */
26 
/* Declares the TXSample array TX_TAB(ff_tx_tab_<name>) with `size` entries,
 * passing 32 as the alignment argument of DECLARE_ALIGNED. */
27 #define TABLE_DEF(name, size) \
28  DECLARE_ALIGNED(32, TXSample, TX_TAB(ff_tx_tab_ ##name))[size]
29 
/* List of the power-of-two lengths, 8 through 131072, written as one
 * SR_TABLE(len) entry per length. SR_TABLE is not defined here: this file
 * defines it temporarily (and #undefs it again) around each use, so the same
 * list can produce different per-length code. */
30 #define SR_POW2_TABLES \
31  SR_TABLE(8) \
32  SR_TABLE(16) \
33  SR_TABLE(32) \
34  SR_TABLE(64) \
35  SR_TABLE(128) \
36  SR_TABLE(256) \
37  SR_TABLE(512) \
38  SR_TABLE(1024) \
39  SR_TABLE(2048) \
40  SR_TABLE(4096) \
41  SR_TABLE(8192) \
42  SR_TABLE(16384) \
43  SR_TABLE(32768) \
44  SR_TABLE(65536) \
45  SR_TABLE(131072) \
46 
/* Table-declaring form of SR_TABLE: declares ff_tx_tab_<len> with
 * len/4 + 1 entries via TABLE_DEF. */
47 #define SR_TABLE(len) \
48  TABLE_DEF(len, len/4 + 1);
49 /* Power of two tables */
51 #undef SR_TABLE
52 
53 /* Other factors' tables */
54 TABLE_DEF(53, 12);
55 TABLE_DEF( 7, 6);
56 TABLE_DEF( 9, 8);
57 
58 typedef struct FFTabInitData {
59  void (*func)(void);
60  int factors[TX_MAX_SUB]; /* Must be sorted high -> low */
62 
/* Init-function form of SR_TABLE: generates ff_tx_init_tab_<len>(), which
 * fills ff_tx_tab_<len> with RESCALE(cos(2*pi*i/len)) for i in [0, len/4)
 * and then writes a final 0, i.e. len/4 + 1 entries in total. */
63 #define SR_TABLE(len) \
64 static av_cold void TX_TAB(ff_tx_init_tab_ ##len)(void) \
65 { \
66  double freq = 2*M_PI/len; \
67  TXSample *tab = TX_TAB(ff_tx_tab_ ##len); \
68  \
69  for (int i = 0; i < len/4; i++) \
70  *tab++ = RESCALE(cos(i*freq)); \
71  \
72  *tab = 0; \
73 }
75 #undef SR_TABLE
76 
77 static void (*const sr_tabs_init_funcs[])(void) = {
78 #define SR_TABLE(len) TX_TAB(ff_tx_init_tab_ ##len),
80 #undef SR_TABLE
81 };
82 
84 #define SR_TABLE(len) AV_ONCE_INIT,
86 #undef SR_TABLE
87 };
88 
89 static av_cold void TX_TAB(ff_tx_init_tab_53)(void)
90 {
91  /* 5pt, doubled to eliminate AVX lane shuffles */
92  TX_TAB(ff_tx_tab_53)[0] = RESCALE(cos(2 * M_PI / 5));
93  TX_TAB(ff_tx_tab_53)[1] = RESCALE(cos(2 * M_PI / 5));
94  TX_TAB(ff_tx_tab_53)[2] = RESCALE(cos(2 * M_PI / 10));
95  TX_TAB(ff_tx_tab_53)[3] = RESCALE(cos(2 * M_PI / 10));
96  TX_TAB(ff_tx_tab_53)[4] = RESCALE(sin(2 * M_PI / 5));
97  TX_TAB(ff_tx_tab_53)[5] = RESCALE(sin(2 * M_PI / 5));
98  TX_TAB(ff_tx_tab_53)[6] = RESCALE(sin(2 * M_PI / 10));
99  TX_TAB(ff_tx_tab_53)[7] = RESCALE(sin(2 * M_PI / 10));
100 
101  /* 3pt */
102  TX_TAB(ff_tx_tab_53)[ 8] = RESCALE(cos(2 * M_PI / 12));
103  TX_TAB(ff_tx_tab_53)[ 9] = RESCALE(cos(2 * M_PI / 12));
104  TX_TAB(ff_tx_tab_53)[10] = RESCALE(cos(2 * M_PI / 6));
105  TX_TAB(ff_tx_tab_53)[11] = RESCALE(cos(8 * M_PI / 6));
106 }
107 
108 static av_cold void TX_TAB(ff_tx_init_tab_7)(void)
109 {
110  TX_TAB(ff_tx_tab_7)[0] = RESCALE(cos(2 * M_PI / 7));
111  TX_TAB(ff_tx_tab_7)[1] = RESCALE(sin(2 * M_PI / 7));
112  TX_TAB(ff_tx_tab_7)[2] = RESCALE(sin(2 * M_PI / 28));
113  TX_TAB(ff_tx_tab_7)[3] = RESCALE(cos(2 * M_PI / 28));
114  TX_TAB(ff_tx_tab_7)[4] = RESCALE(cos(2 * M_PI / 14));
115  TX_TAB(ff_tx_tab_7)[5] = RESCALE(sin(2 * M_PI / 14));
116 }
117 
118 static av_cold void TX_TAB(ff_tx_init_tab_9)(void)
119 {
120  TX_TAB(ff_tx_tab_9)[0] = RESCALE(cos(2 * M_PI / 3));
121  TX_TAB(ff_tx_tab_9)[1] = RESCALE(sin(2 * M_PI / 3));
122  TX_TAB(ff_tx_tab_9)[2] = RESCALE(cos(2 * M_PI / 9));
123  TX_TAB(ff_tx_tab_9)[3] = RESCALE(sin(2 * M_PI / 9));
124  TX_TAB(ff_tx_tab_9)[4] = RESCALE(cos(2 * M_PI / 36));
125  TX_TAB(ff_tx_tab_9)[5] = RESCALE(sin(2 * M_PI / 36));
126  TX_TAB(ff_tx_tab_9)[6] = TX_TAB(ff_tx_tab_9)[2] + TX_TAB(ff_tx_tab_9)[5];
127  TX_TAB(ff_tx_tab_9)[7] = TX_TAB(ff_tx_tab_9)[3] - TX_TAB(ff_tx_tab_9)[4];
128 }
129 
131  { TX_TAB(ff_tx_init_tab_53), { 15, 5, 3 } },
132  { TX_TAB(ff_tx_init_tab_9), { 9 } },
133  { TX_TAB(ff_tx_init_tab_7), { 7 } },
134 };
135 
137  AV_ONCE_INIT,
138  AV_ONCE_INIT,
139  AV_ONCE_INIT,
140 };
141 
142 av_cold void TX_TAB(ff_tx_init_tabs)(int len)
143 {
144  int factor_2 = ff_ctz(len);
145  if (factor_2) {
146  int idx = factor_2 - 3;
147  for (int i = 0; i <= idx; i++)
150  len >>= factor_2;
151  }
152 
153  for (int i = 0; i < FF_ARRAY_ELEMS(nptwo_tabs_init_data); i++) {
154  int f, f_idx = 0;
155 
156  if (len <= 1)
157  return;
158 
159  while ((f = nptwo_tabs_init_data[i].factors[f_idx++])) {
160  if (f % len)
161  continue;
162 
165  len /= f;
166  break;
167  }
168  }
169 }
170 
172  ptrdiff_t stride)
173 {
174  TXComplex tmp[3];
175  const TXSample *tab = TX_TAB(ff_tx_tab_53);
176 #ifdef TX_INT32
177  int64_t mtmp[4];
178 #endif
179 
180  tmp[0] = in[0];
181  BF(tmp[1].re, tmp[2].im, in[1].im, in[2].im);
182  BF(tmp[1].im, tmp[2].re, in[1].re, in[2].re);
183 
184  out[0*stride].re = tmp[0].re + tmp[2].re;
185  out[0*stride].im = tmp[0].im + tmp[2].im;
186 
187 #ifdef TX_INT32
188  mtmp[0] = (int64_t)tab[ 8] * tmp[1].re;
189  mtmp[1] = (int64_t)tab[ 9] * tmp[1].im;
190  mtmp[2] = (int64_t)tab[10] * tmp[2].re;
191  mtmp[3] = (int64_t)tab[10] * tmp[2].im;
192  out[1*stride].re = tmp[0].re - (mtmp[2] + mtmp[0] + 0x40000000 >> 31);
193  out[1*stride].im = tmp[0].im - (mtmp[3] - mtmp[1] + 0x40000000 >> 31);
194  out[2*stride].re = tmp[0].re - (mtmp[2] - mtmp[0] + 0x40000000 >> 31);
195  out[2*stride].im = tmp[0].im - (mtmp[3] + mtmp[1] + 0x40000000 >> 31);
196 #else
197  tmp[1].re = tab[ 8] * tmp[1].re;
198  tmp[1].im = tab[ 9] * tmp[1].im;
199  tmp[2].re = tab[10] * tmp[2].re;
200  tmp[2].im = tab[10] * tmp[2].im;
201  out[1*stride].re = tmp[0].re - tmp[2].re + tmp[1].re;
202  out[1*stride].im = tmp[0].im - tmp[2].im - tmp[1].im;
203  out[2*stride].re = tmp[0].re - tmp[2].re - tmp[1].re;
204  out[2*stride].im = tmp[0].im - tmp[2].im + tmp[1].im;
205 #endif
206 }
207 
/* Generates an always-inline 5-point transform called NAME.
 *
 * The generated function reads in[0..4] and writes five outputs to
 * out[D0*stride] .. out[D4*stride], so D0..D4 select where each output
 * lands. The constants come from entries 0, 2, 4 and 6 of ff_tx_tab_53.
 * in[0] is copied to `dc` first and added to every output.
 *
 * BF, SMUL and CMUL are defined outside this listing. Several of the
 * SMUL/CMUL calls overwrite one of their own inputs (t[0], t[1]), so the
 * statement order must be preserved. */
208 #define DECL_FFT5(NAME, D0, D1, D2, D3, D4) \
209 static av_always_inline void NAME(TXComplex *out, TXComplex *in, \
210  ptrdiff_t stride) \
211 { \
212  TXComplex dc, z0[4], t[6]; \
213  const TXSample *tab = TX_TAB(ff_tx_tab_53); \
214  \
215  dc = in[0]; \
216  BF(t[1].im, t[0].re, in[1].re, in[4].re); \
217  BF(t[1].re, t[0].im, in[1].im, in[4].im); \
218  BF(t[3].im, t[2].re, in[2].re, in[3].re); \
219  BF(t[3].re, t[2].im, in[2].im, in[3].im); \
220  \
221  out[D0*stride].re = dc.re + t[0].re + t[2].re; \
222  out[D0*stride].im = dc.im + t[0].im + t[2].im; \
223  \
224  SMUL(t[4].re, t[0].re, tab[0], tab[2], t[2].re, t[0].re); \
225  SMUL(t[4].im, t[0].im, tab[0], tab[2], t[2].im, t[0].im); \
226  CMUL(t[5].re, t[1].re, tab[4], tab[6], t[3].re, t[1].re); \
227  CMUL(t[5].im, t[1].im, tab[4], tab[6], t[3].im, t[1].im); \
228  \
229  BF(z0[0].re, z0[3].re, t[0].re, t[1].re); \
230  BF(z0[0].im, z0[3].im, t[0].im, t[1].im); \
231  BF(z0[2].re, z0[1].re, t[4].re, t[5].re); \
232  BF(z0[2].im, z0[1].im, t[4].im, t[5].im); \
233  \
234  out[D1*stride].re = dc.re + z0[3].re; \
235  out[D1*stride].im = dc.im + z0[0].im; \
236  out[D2*stride].re = dc.re + z0[2].re; \
237  out[D2*stride].im = dc.im + z0[1].im; \
238  out[D3*stride].re = dc.re + z0[1].re; \
239  out[D3*stride].im = dc.im + z0[2].im; \
240  out[D4*stride].re = dc.re + z0[0].re; \
241  out[D4*stride].im = dc.im + z0[3].im; \
242 }
243 
/* fft5 writes its outputs in natural order (0..4). The _m1/_m2/_m3 variants
 * write to permuted positions instead; together their index lists cover
 * each of 0..14 exactly once, and the 15-point transform in this file calls
 * all three on the same output buffer. */
244 DECL_FFT5(fft5, 0, 1, 2, 3, 4)
245 DECL_FFT5(fft5_m1, 0, 6, 12, 3, 9)
246 DECL_FFT5(fft5_m2, 10, 1, 7, 13, 4)
247 DECL_FFT5(fft5_m3, 5, 11, 2, 8, 14)
248 
250  ptrdiff_t stride)
251 {
252  TXComplex dc, t[6], z[3];
253  const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_7);
254 #ifdef TX_INT32
255  int64_t mtmp[12];
256 #endif
257 
258  dc = in[0];
259  BF(t[1].re, t[0].re, in[1].re, in[6].re);
260  BF(t[1].im, t[0].im, in[1].im, in[6].im);
261  BF(t[3].re, t[2].re, in[2].re, in[5].re);
262  BF(t[3].im, t[2].im, in[2].im, in[5].im);
263  BF(t[5].re, t[4].re, in[3].re, in[4].re);
264  BF(t[5].im, t[4].im, in[3].im, in[4].im);
265 
266  out[0*stride].re = dc.re + t[0].re + t[2].re + t[4].re;
267  out[0*stride].im = dc.im + t[0].im + t[2].im + t[4].im;
268 
269 #ifdef TX_INT32 /* NOTE: it's possible to do this with 16 mults but 72 adds */
270  mtmp[ 0] = ((int64_t)tab[0].re)*t[0].re - ((int64_t)tab[2].re)*t[4].re;
271  mtmp[ 1] = ((int64_t)tab[0].re)*t[4].re - ((int64_t)tab[1].re)*t[0].re;
272  mtmp[ 2] = ((int64_t)tab[0].re)*t[2].re - ((int64_t)tab[2].re)*t[0].re;
273  mtmp[ 3] = ((int64_t)tab[0].re)*t[0].im - ((int64_t)tab[1].re)*t[2].im;
274  mtmp[ 4] = ((int64_t)tab[0].re)*t[4].im - ((int64_t)tab[1].re)*t[0].im;
275  mtmp[ 5] = ((int64_t)tab[0].re)*t[2].im - ((int64_t)tab[2].re)*t[0].im;
276 
277  mtmp[ 6] = ((int64_t)tab[2].im)*t[1].im + ((int64_t)tab[1].im)*t[5].im;
278  mtmp[ 7] = ((int64_t)tab[0].im)*t[5].im + ((int64_t)tab[2].im)*t[3].im;
279  mtmp[ 8] = ((int64_t)tab[2].im)*t[5].im + ((int64_t)tab[1].im)*t[3].im;
280  mtmp[ 9] = ((int64_t)tab[0].im)*t[1].re + ((int64_t)tab[1].im)*t[3].re;
281  mtmp[10] = ((int64_t)tab[2].im)*t[3].re + ((int64_t)tab[0].im)*t[5].re;
282  mtmp[11] = ((int64_t)tab[2].im)*t[1].re + ((int64_t)tab[1].im)*t[5].re;
283 
284  z[0].re = (int32_t)(mtmp[ 0] - ((int64_t)tab[1].re)*t[2].re + 0x40000000 >> 31);
285  z[1].re = (int32_t)(mtmp[ 1] - ((int64_t)tab[2].re)*t[2].re + 0x40000000 >> 31);
286  z[2].re = (int32_t)(mtmp[ 2] - ((int64_t)tab[1].re)*t[4].re + 0x40000000 >> 31);
287  z[0].im = (int32_t)(mtmp[ 3] - ((int64_t)tab[2].re)*t[4].im + 0x40000000 >> 31);
288  z[1].im = (int32_t)(mtmp[ 4] - ((int64_t)tab[2].re)*t[2].im + 0x40000000 >> 31);
289  z[2].im = (int32_t)(mtmp[ 5] - ((int64_t)tab[1].re)*t[4].im + 0x40000000 >> 31);
290 
291  t[0].re = (int32_t)(mtmp[ 6] - ((int64_t)tab[0].im)*t[3].im + 0x40000000 >> 31);
292  t[2].re = (int32_t)(mtmp[ 7] - ((int64_t)tab[1].im)*t[1].im + 0x40000000 >> 31);
293  t[4].re = (int32_t)(mtmp[ 8] + ((int64_t)tab[0].im)*t[1].im + 0x40000000 >> 31);
294  t[0].im = (int32_t)(mtmp[ 9] + ((int64_t)tab[2].im)*t[5].re + 0x40000000 >> 31);
295  t[2].im = (int32_t)(mtmp[10] - ((int64_t)tab[1].im)*t[1].re + 0x40000000 >> 31);
296  t[4].im = (int32_t)(mtmp[11] - ((int64_t)tab[0].im)*t[3].re + 0x40000000 >> 31);
297 #else
298  z[0].re = tab[0].re*t[0].re - tab[2].re*t[4].re - tab[1].re*t[2].re;
299  z[1].re = tab[0].re*t[4].re - tab[1].re*t[0].re - tab[2].re*t[2].re;
300  z[2].re = tab[0].re*t[2].re - tab[2].re*t[0].re - tab[1].re*t[4].re;
301  z[0].im = tab[0].re*t[0].im - tab[1].re*t[2].im - tab[2].re*t[4].im;
302  z[1].im = tab[0].re*t[4].im - tab[1].re*t[0].im - tab[2].re*t[2].im;
303  z[2].im = tab[0].re*t[2].im - tab[2].re*t[0].im - tab[1].re*t[4].im;
304 
305  /* It's possible to do t[4].re and t[0].im with 2 multiplies only by
306  * multiplying the sum of all with the average of the twiddles */
307 
308  t[0].re = tab[2].im*t[1].im + tab[1].im*t[5].im - tab[0].im*t[3].im;
309  t[2].re = tab[0].im*t[5].im + tab[2].im*t[3].im - tab[1].im*t[1].im;
310  t[4].re = tab[2].im*t[5].im + tab[1].im*t[3].im + tab[0].im*t[1].im;
311  t[0].im = tab[0].im*t[1].re + tab[1].im*t[3].re + tab[2].im*t[5].re;
312  t[2].im = tab[2].im*t[3].re + tab[0].im*t[5].re - tab[1].im*t[1].re;
313  t[4].im = tab[2].im*t[1].re + tab[1].im*t[5].re - tab[0].im*t[3].re;
314 #endif
315 
316  BF(t[1].re, z[0].re, z[0].re, t[4].re);
317  BF(t[3].re, z[1].re, z[1].re, t[2].re);
318  BF(t[5].re, z[2].re, z[2].re, t[0].re);
319  BF(t[1].im, z[0].im, z[0].im, t[0].im);
320  BF(t[3].im, z[1].im, z[1].im, t[2].im);
321  BF(t[5].im, z[2].im, z[2].im, t[4].im);
322 
323  out[1*stride].re = dc.re + z[0].re;
324  out[1*stride].im = dc.im + t[1].im;
325  out[2*stride].re = dc.re + t[3].re;
326  out[2*stride].im = dc.im + z[1].im;
327  out[3*stride].re = dc.re + z[2].re;
328  out[3*stride].im = dc.im + t[5].im;
329  out[4*stride].re = dc.re + t[5].re;
330  out[4*stride].im = dc.im + z[2].im;
331  out[5*stride].re = dc.re + z[1].re;
332  out[5*stride].im = dc.im + t[3].im;
333  out[6*stride].re = dc.re + t[1].re;
334  out[6*stride].im = dc.im + z[0].im;
335 }
336 
338  ptrdiff_t stride)
339 {
340  const TXComplex *tab = (const TXComplex *)TX_TAB(ff_tx_tab_9);
341  TXComplex dc, t[16], w[4], x[5], y[5], z[2];
342 #ifdef TX_INT32
343  int64_t mtmp[12];
344 #endif
345 
346  dc = in[0];
347  BF(t[1].re, t[0].re, in[1].re, in[8].re);
348  BF(t[1].im, t[0].im, in[1].im, in[8].im);
349  BF(t[3].re, t[2].re, in[2].re, in[7].re);
350  BF(t[3].im, t[2].im, in[2].im, in[7].im);
351  BF(t[5].re, t[4].re, in[3].re, in[6].re);
352  BF(t[5].im, t[4].im, in[3].im, in[6].im);
353  BF(t[7].re, t[6].re, in[4].re, in[5].re);
354  BF(t[7].im, t[6].im, in[4].im, in[5].im);
355 
356  w[0].re = t[0].re - t[6].re;
357  w[0].im = t[0].im - t[6].im;
358  w[1].re = t[2].re - t[6].re;
359  w[1].im = t[2].im - t[6].im;
360  w[2].re = t[1].re - t[7].re;
361  w[2].im = t[1].im - t[7].im;
362  w[3].re = t[3].re + t[7].re;
363  w[3].im = t[3].im + t[7].im;
364 
365  z[0].re = dc.re + t[4].re;
366  z[0].im = dc.im + t[4].im;
367 
368  z[1].re = t[0].re + t[2].re + t[6].re;
369  z[1].im = t[0].im + t[2].im + t[6].im;
370 
371  out[0*stride].re = z[0].re + z[1].re;
372  out[0*stride].im = z[0].im + z[1].im;
373 
374 #ifdef TX_INT32
375  mtmp[0] = t[1].re - t[3].re + t[7].re;
376  mtmp[1] = t[1].im - t[3].im + t[7].im;
377 
378  y[3].re = (int32_t)(((int64_t)tab[0].im)*mtmp[0] + 0x40000000 >> 31);
379  y[3].im = (int32_t)(((int64_t)tab[0].im)*mtmp[1] + 0x40000000 >> 31);
380 
381  mtmp[0] = (int32_t)(((int64_t)tab[0].re)*z[1].re + 0x40000000 >> 31);
382  mtmp[1] = (int32_t)(((int64_t)tab[0].re)*z[1].im + 0x40000000 >> 31);
383  mtmp[2] = (int32_t)(((int64_t)tab[0].re)*t[4].re + 0x40000000 >> 31);
384  mtmp[3] = (int32_t)(((int64_t)tab[0].re)*t[4].im + 0x40000000 >> 31);
385 
386  x[3].re = z[0].re + (int32_t)mtmp[0];
387  x[3].im = z[0].im + (int32_t)mtmp[1];
388  z[0].re = in[0].re + (int32_t)mtmp[2];
389  z[0].im = in[0].im + (int32_t)mtmp[3];
390 
391  mtmp[0] = ((int64_t)tab[1].re)*w[0].re;
392  mtmp[1] = ((int64_t)tab[1].re)*w[0].im;
393  mtmp[2] = ((int64_t)tab[2].im)*w[0].re;
394  mtmp[3] = ((int64_t)tab[2].im)*w[0].im;
395  mtmp[4] = ((int64_t)tab[1].im)*w[2].re;
396  mtmp[5] = ((int64_t)tab[1].im)*w[2].im;
397  mtmp[6] = ((int64_t)tab[2].re)*w[2].re;
398  mtmp[7] = ((int64_t)tab[2].re)*w[2].im;
399 
400  x[1].re = (int32_t)(mtmp[0] + ((int64_t)tab[2].im)*w[1].re + 0x40000000 >> 31);
401  x[1].im = (int32_t)(mtmp[1] + ((int64_t)tab[2].im)*w[1].im + 0x40000000 >> 31);
402  x[2].re = (int32_t)(mtmp[2] - ((int64_t)tab[3].re)*w[1].re + 0x40000000 >> 31);
403  x[2].im = (int32_t)(mtmp[3] - ((int64_t)tab[3].re)*w[1].im + 0x40000000 >> 31);
404  y[1].re = (int32_t)(mtmp[4] + ((int64_t)tab[2].re)*w[3].re + 0x40000000 >> 31);
405  y[1].im = (int32_t)(mtmp[5] + ((int64_t)tab[2].re)*w[3].im + 0x40000000 >> 31);
406  y[2].re = (int32_t)(mtmp[6] - ((int64_t)tab[3].im)*w[3].re + 0x40000000 >> 31);
407  y[2].im = (int32_t)(mtmp[7] - ((int64_t)tab[3].im)*w[3].im + 0x40000000 >> 31);
408 
409  y[0].re = (int32_t)(((int64_t)tab[0].im)*t[5].re + 0x40000000 >> 31);
410  y[0].im = (int32_t)(((int64_t)tab[0].im)*t[5].im + 0x40000000 >> 31);
411 
412 #else
413  y[3].re = tab[0].im*(t[1].re - t[3].re + t[7].re);
414  y[3].im = tab[0].im*(t[1].im - t[3].im + t[7].im);
415 
416  x[3].re = z[0].re + tab[0].re*z[1].re;
417  x[3].im = z[0].im + tab[0].re*z[1].im;
418  z[0].re = dc.re + tab[0].re*t[4].re;
419  z[0].im = dc.im + tab[0].re*t[4].im;
420 
421  x[1].re = tab[1].re*w[0].re + tab[2].im*w[1].re;
422  x[1].im = tab[1].re*w[0].im + tab[2].im*w[1].im;
423  x[2].re = tab[2].im*w[0].re - tab[3].re*w[1].re;
424  x[2].im = tab[2].im*w[0].im - tab[3].re*w[1].im;
425  y[1].re = tab[1].im*w[2].re + tab[2].re*w[3].re;
426  y[1].im = tab[1].im*w[2].im + tab[2].re*w[3].im;
427  y[2].re = tab[2].re*w[2].re - tab[3].im*w[3].re;
428  y[2].im = tab[2].re*w[2].im - tab[3].im*w[3].im;
429 
430  y[0].re = tab[0].im*t[5].re;
431  y[0].im = tab[0].im*t[5].im;
432 #endif
433 
434  x[4].re = x[1].re + x[2].re;
435  x[4].im = x[1].im + x[2].im;
436 
437  y[4].re = y[1].re - y[2].re;
438  y[4].im = y[1].im - y[2].im;
439  x[1].re = z[0].re + x[1].re;
440  x[1].im = z[0].im + x[1].im;
441  y[1].re = y[0].re + y[1].re;
442  y[1].im = y[0].im + y[1].im;
443  x[2].re = z[0].re + x[2].re;
444  x[2].im = z[0].im + x[2].im;
445  y[2].re = y[2].re - y[0].re;
446  y[2].im = y[2].im - y[0].im;
447  x[4].re = z[0].re - x[4].re;
448  x[4].im = z[0].im - x[4].im;
449  y[4].re = y[0].re - y[4].re;
450  y[4].im = y[0].im - y[4].im;
451 
452  out[1*stride] = (TXComplex){ x[1].re + y[1].im, x[1].im - y[1].re };
453  out[2*stride] = (TXComplex){ x[2].re + y[2].im, x[2].im - y[2].re };
454  out[3*stride] = (TXComplex){ x[3].re + y[3].im, x[3].im - y[3].re };
455  out[4*stride] = (TXComplex){ x[4].re + y[4].im, x[4].im - y[4].re };
456  out[5*stride] = (TXComplex){ x[4].re - y[4].im, x[4].im + y[4].re };
457  out[6*stride] = (TXComplex){ x[3].re - y[3].im, x[3].im + y[3].re };
458  out[7*stride] = (TXComplex){ x[2].re - y[2].im, x[2].im + y[2].re };
459  out[8*stride] = (TXComplex){ x[1].re - y[1].im, x[1].im + y[1].re };
460 }
461 
463  ptrdiff_t stride)
464 {
465  TXComplex tmp[15];
466 
467  for (int i = 0; i < 5; i++)
468  fft3(tmp + i, in + i*3, 5);
469 
470  fft5_m1(out, tmp + 0, stride);
471  fft5_m2(out, tmp + 5, stride);
472  fft5_m3(out, tmp + 10, stride);
473 }
474 
476  const FFTXCodelet *cd,
477  uint64_t flags,
479  int len, int inv,
480  const void *scale)
481 {
482  int ret = 0;
483  TX_TAB(ff_tx_init_tabs)(len);
484 
485  if (len == 15)
486  ret = ff_tx_gen_pfa_input_map(s, opts, 3, 5);
487  else if (flags & FF_TX_PRESHUFFLE)
489 
490  return ret;
491 }
492 
/* For a fixed length n, generates:
 *  - ff_tx_fft<n>(): a wrapper with the codelet function signature that
 *    casts dst/src to TXComplex and calls fft<n>(), passing
 *    stride / sizeof(TXComplex) as the stride;
 *  - the "fft<n>_ns" codelet definition, which accepts only length n
 *    (min_len == max_len == n) and carries the FF_TX_PRESHUFFLE flag. */
493 #define DECL_FACTOR_S(n) \
494 static void TX_NAME(ff_tx_fft##n)(AVTXContext *s, void *dst, \
495  void *src, ptrdiff_t stride) \
496 { \
497  fft##n((TXComplex *)dst, (TXComplex *)src, stride / sizeof(TXComplex)); \
498 } \
499 static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = { \
500  .name = TX_NAME_STR("fft" #n "_ns"), \
501  .function = TX_NAME(ff_tx_fft##n), \
502  .type = TX_TYPE(FFT), \
503  .flags = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE | \
504  AV_TX_UNALIGNED | FF_TX_PRESHUFFLE, \
505  .factors[0] = n, \
506  .nb_factors = 1, \
507  .min_len = n, \
508  .max_len = n, \
509  .init = TX_NAME(ff_tx_fft_factor_init), \
510  .cpu_flags = FF_TX_CPU_FLAGS_ALL, \
511  .prio = FF_TX_PRIO_BASE, \
512 };
513 
/* Everything DECL_FACTOR_S(n) generates, plus a second codelet definition
 * "fft<n>_fwd" for the same wrapper function. The "_fwd" definition carries
 * FF_TX_FORWARD_ONLY where the "_ns" one carries FF_TX_PRESHUFFLE; all
 * other fields are the same. */
514 #define DECL_FACTOR_F(n) \
515 DECL_FACTOR_S(n) \
516 static const FFTXCodelet TX_NAME(ff_tx_fft##n##_fwd_def) = { \
517  .name = TX_NAME_STR("fft" #n "_fwd"), \
518  .function = TX_NAME(ff_tx_fft##n), \
519  .type = TX_TYPE(FFT), \
520  .flags = AV_TX_INPLACE | FF_TX_OUT_OF_PLACE | \
521  AV_TX_UNALIGNED | FF_TX_FORWARD_ONLY, \
522  .factors[0] = n, \
523  .nb_factors = 1, \
524  .min_len = n, \
525  .max_len = n, \
526  .init = TX_NAME(ff_tx_fft_factor_init), \
527  .cpu_flags = FF_TX_CPU_FLAGS_ALL, \
528  .prio = FF_TX_PRIO_BASE, \
529 };
530 
/* Lengths 3, 5, 7 and 9 get both the "_ns" and the "_fwd" codelet;
 * length 15 gets only the "_ns" codelet. */
531 DECL_FACTOR_F(3)
532 DECL_FACTOR_F(5)
533 DECL_FACTOR_F(7)
534 DECL_FACTOR_F(9)
535 DECL_FACTOR_S(15)
536 
/* Butterfly stage over four complex lvalues a0..a3, updated in place.
 *
 * The macro uses variables from the enclosing scope rather than arguments:
 * it reads t1, t2, t5 and t6 (which the caller must have set beforehand),
 * and uses t3, t4, r0, i0, r1 and i1 as scratch. a0 and a1 are saved into
 * r0/i0/r1/i1 first because the BF calls overwrite them. BF is defined
 * outside this listing. */
537 #define BUTTERFLIES(a0, a1, a2, a3) \
538  do { \
539  r0=a0.re; \
540  i0=a0.im; \
541  r1=a1.re; \
542  i1=a1.im; \
543  BF(t3, t5, t5, t1); \
544  BF(a2.re, a0.re, r0, t5); \
545  BF(a3.im, a1.im, i1, t3); \
546  BF(t4, t6, t2, t6); \
547  BF(a3.re, a1.re, r1, t4); \
548  BF(a2.im, a0.im, i0, t6); \
549  } while (0)
550 
/* Twiddle-then-butterfly step: applies CMUL to a2 with (wre, -wim) and to
 * a3 with (wre, wim), leaving the results in t1/t2 and t5/t6, then runs
 * BUTTERFLIES on a0..a3. The t* variables must exist in the enclosing
 * scope. CMUL is defined outside this listing (presumably a complex
 * multiply - confirm against its definition). */
551 #define TRANSFORM(a0, a1, a2, a3, wre, wim) \
552  do { \
553  CMUL(t1, t2, a2.re, a2.im, wre, -wim); \
554  CMUL(t5, t6, a3.re, a3.im, wre, wim); \
555  BUTTERFLIES(a0, a1, a2, a3); \
556  } while (0)
557 
/* Combine step used by the generated power-of-two codelets.
 *
 * z is treated as four sub-blocks starting at offsets 0, 2*len, 4*len and
 * 6*len. Each loop iteration runs TRANSFORM on 8 consecutive elements of
 * every sub-block, and the loop executes len/4 times (i advances by 4).
 * `cos` points at the factor table for the transform being combined.
 *
 * The second factor of each TRANSFORM is read from the same table in
 * reverse: wim starts at cos + 2*len - 7 and moves back by 8 per iteration
 * while cos moves forward by 8. */
558 /* z[0...8n-1], w[1...2n-1] */
559 static inline void TX_NAME(ff_tx_fft_sr_combine)(TXComplex *z,
560  const TXSample *cos, int len)
561 {
562  int o1 = 2*len;
563  int o2 = 4*len;
564  int o3 = 6*len;
565  const TXSample *wim = cos + o1 - 7;
    /* Scratch variables required by the TRANSFORM/BUTTERFLIES macros */
566  TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
567 
568  for (int i = 0; i < len; i += 4) {
    /* Even offsets first, then odd offsets; cos[k] pairs with wim[7 - k] */
569  TRANSFORM(z[0], z[o1 + 0], z[o2 + 0], z[o3 + 0], cos[0], wim[7]);
570  TRANSFORM(z[2], z[o1 + 2], z[o2 + 2], z[o3 + 2], cos[2], wim[5]);
571  TRANSFORM(z[4], z[o1 + 4], z[o2 + 4], z[o3 + 4], cos[4], wim[3]);
572  TRANSFORM(z[6], z[o1 + 6], z[o2 + 6], z[o3 + 6], cos[6], wim[1]);
573 
574  TRANSFORM(z[1], z[o1 + 1], z[o2 + 1], z[o3 + 1], cos[1], wim[6]);
575  TRANSFORM(z[3], z[o1 + 3], z[o2 + 3], z[o3 + 3], cos[3], wim[4]);
576  TRANSFORM(z[5], z[o1 + 5], z[o2 + 5], z[o3 + 5], cos[5], wim[2]);
577  TRANSFORM(z[7], z[o1 + 7], z[o2 + 7], z[o3 + 7], cos[7], wim[0]);
578 
579  z += 2*4;
580  cos += 2*4;
581  wim -= 2*4;
582  }
583 }
584 
586  const FFTXCodelet *cd,
587  uint64_t flags,
589  int len, int inv,
590  const void *scale)
591 {
592  TX_TAB(ff_tx_init_tabs)(len);
593  return ff_tx_gen_ptwo_revtab(s, opts);
594 }
595 
/* Generates the "fft<n>_ns" codelet definition for the power-of-two
 * function ff_tx_fft<n>_ns. The definition accepts only length n
 * (min_len == max_len == n), lists 2 as its single factor, carries the
 * FF_TX_PRESHUFFLE flag and uses ff_tx_fft_sr_codelet_init as its init
 * callback. */
596 #define DECL_SR_CODELET_DEF(n) \
597 static const FFTXCodelet TX_NAME(ff_tx_fft##n##_ns_def) = { \
598  .name = TX_NAME_STR("fft" #n "_ns"), \
599  .function = TX_NAME(ff_tx_fft##n##_ns), \
600  .type = TX_TYPE(FFT), \
601  .flags = FF_TX_OUT_OF_PLACE | AV_TX_INPLACE | \
602  AV_TX_UNALIGNED | FF_TX_PRESHUFFLE, \
603  .factors[0] = 2, \
604  .nb_factors = 1, \
605  .min_len = n, \
606  .max_len = n, \
607  .init = TX_NAME(ff_tx_fft_sr_codelet_init), \
608  .cpu_flags = FF_TX_CPU_FLAGS_ALL, \
609  .prio = FF_TX_PRIO_BASE, \
610 };
611 
/* Generates ff_tx_fft<n>_ns() and its codelet definition.
 *
 * n2 and n4 are expected to be n/2 and n/4 (see the instantiations). The
 * generated function runs the n2-point codelet at the start of the buffer,
 * the n4-point codelet at offsets n4*2 and n4*3, and then merges the three
 * results in dst with ff_tx_fft_sr_combine(), using the table for length n
 * and a combine length of n4 >> 1. */
612 #define DECL_SR_CODELET(n, n2, n4) \
613 static void TX_NAME(ff_tx_fft##n##_ns)(AVTXContext *s, void *_dst, \
614  void *_src, ptrdiff_t stride) \
615 { \
616  TXComplex *src = _src; \
617  TXComplex *dst = _dst; \
618  const TXSample *cos = TX_TAB(ff_tx_tab_##n); \
619  \
620  TX_NAME(ff_tx_fft##n2##_ns)(s, dst, src, stride); \
621  TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*2, src + n4*2, stride); \
622  TX_NAME(ff_tx_fft##n4##_ns)(s, dst + n4*3, src + n4*3, stride); \
623  TX_NAME(ff_tx_fft_sr_combine)(dst, cos, n4 >> 1); \
624 } \
625  \
626 DECL_SR_CODELET_DEF(n)
627 
628 static void TX_NAME(ff_tx_fft2_ns)(AVTXContext *s, void *_dst,
629  void *_src, ptrdiff_t stride)
630 {
631  TXComplex *src = _src;
632  TXComplex *dst = _dst;
633  TXComplex tmp;
634 
635  BF(tmp.re, dst[0].re, src[0].re, src[1].re);
636  BF(tmp.im, dst[0].im, src[0].im, src[1].im);
637  dst[1] = tmp;
638 }
639 
/* 4-point transform built from eight BF() operations; s and stride are not
 * used.
 *
 * Ordering note: the real parts of all four inputs are read into t1/t3/
 * t6/t8 before any real part of dst is written, and the imaginary parts
 * are likewise read into t2/t4/t5/t7 before any imaginary part of dst is
 * written. Keep this order if _dst and _src may alias. */
640 static void TX_NAME(ff_tx_fft4_ns)(AVTXContext *s, void *_dst,
641  void *_src, ptrdiff_t stride)
642 {
643  TXComplex *src = _src;
644  TXComplex *dst = _dst;
645  TXSample t1, t2, t3, t4, t5, t6, t7, t8;
646 
    /* Note the operand order on the next-but-one line: src[3] before src[2] */
647  BF(t3, t1, src[0].re, src[1].re);
648  BF(t8, t6, src[3].re, src[2].re);
649  BF(dst[2].re, dst[0].re, t1, t6);
650  BF(t4, t2, src[0].im, src[1].im);
651  BF(t7, t5, src[2].im, src[3].im);
652  BF(dst[3].im, dst[1].im, t4, t8);
653  BF(dst[3].re, dst[1].re, t3, t7);
654  BF(dst[2].im, dst[0].im, t2, t5);
655 }
656 
/* 8-point transform.
 *
 * Runs the 4-point codelet on entries 0..3, pre-combines entries 4..7 with
 * BF() (second operand negated) into t1/t2/t5/t6 and dst[5]/dst[7], then
 * finishes with BUTTERFLIES on the even outputs and TRANSFORM on the odd
 * outputs. Entry 1 of ff_tx_tab_8 is used as both factors of that
 * TRANSFORM. */
657 static void TX_NAME(ff_tx_fft8_ns)(AVTXContext *s, void *_dst,
658  void *_src, ptrdiff_t stride)
659 {
660  TXComplex *src = _src;
661  TXComplex *dst = _dst;
    /* Scratch variables required by the BUTTERFLIES/TRANSFORM macros */
662  TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
663  const TXSample cos = TX_TAB(ff_tx_tab_8)[1];
664 
665  TX_NAME(ff_tx_fft4_ns)(s, dst, src, stride);
666 
    /* These set t1, t2, t5, t6, which BUTTERFLIES below reads */
667  BF(t1, dst[5].re, src[4].re, -src[5].re);
668  BF(t2, dst[5].im, src[4].im, -src[5].im);
669  BF(t5, dst[7].re, src[6].re, -src[7].re);
670  BF(t6, dst[7].im, src[6].im, -src[7].im);
671 
672  BUTTERFLIES(dst[0], dst[2], dst[4], dst[6]);
673  TRANSFORM(dst[1], dst[3], dst[5], dst[7], cos, cos);
674 }
675 
/* 16-point transform.
 *
 * Runs the 8-point codelet on entries 0..7 and the 4-point codelet on
 * entries 8..11 and 12..15, then merges the three results in dst. The
 * first merge step uses BUTTERFLIES directly, with t1/t2/t5/t6 loaded from
 * dst[8] and dst[12] and no factor applied. The remaining three steps use
 * TRANSFORM with entries 1..3 of ff_tx_tab_16. */
676 static void TX_NAME(ff_tx_fft16_ns)(AVTXContext *s, void *_dst,
677  void *_src, ptrdiff_t stride)
678 {
679  TXComplex *src = _src;
680  TXComplex *dst = _dst;
681  const TXSample *cos = TX_TAB(ff_tx_tab_16);
682 
    /* Scratch variables required by the BUTTERFLIES/TRANSFORM macros */
683  TXUSample t1, t2, t3, t4, t5, t6, r0, i0, r1, i1;
684  TXSample cos_16_1 = cos[1];
685  TXSample cos_16_2 = cos[2];
686  TXSample cos_16_3 = cos[3];
687 
688  TX_NAME(ff_tx_fft8_ns)(s, dst + 0, src + 0, stride);
689  TX_NAME(ff_tx_fft4_ns)(s, dst + 8, src + 8, stride);
690  TX_NAME(ff_tx_fft4_ns)(s, dst + 12, src + 12, stride);
691 
692  t1 = dst[ 8].re;
693  t2 = dst[ 8].im;
694  t5 = dst[12].re;
695  t6 = dst[12].im;
696  BUTTERFLIES(dst[0], dst[4], dst[8], dst[12]);
697 
    /* Factor pairs: (2,2) for offset 2, (1,3) for offset 1, (3,1) for offset 3 */
698  TRANSFORM(dst[ 2], dst[ 6], dst[10], dst[14], cos_16_2, cos_16_2);
699  TRANSFORM(dst[ 1], dst[ 5], dst[ 9], dst[13], cos_16_1, cos_16_3);
700  TRANSFORM(dst[ 3], dst[ 7], dst[11], dst[15], cos_16_3, cos_16_1);
701 }
702 
/* Generated codelets for lengths 32 through 131072. Each one is built from
 * the codelets for half and a quarter of its length, so every line depends
 * on the definitions above it. */
707 DECL_SR_CODELET(32,16,8)
708 DECL_SR_CODELET(64,32,16)
709 DECL_SR_CODELET(128,64,32)
710 DECL_SR_CODELET(256,128,64)
711 DECL_SR_CODELET(512,256,128)
712 DECL_SR_CODELET(1024,512,256)
713 DECL_SR_CODELET(2048,1024,512)
714 DECL_SR_CODELET(4096,2048,1024)
715 DECL_SR_CODELET(8192,4096,2048)
716 DECL_SR_CODELET(16384,8192,4096)
717 DECL_SR_CODELET(32768,16384,8192)
718 DECL_SR_CODELET(65536,32768,16384)
719 DECL_SR_CODELET(131072,65536,32768)
720 
722  const FFTXCodelet *cd,
723  uint64_t flags,
725  int len, int inv,
726  const void *scale)
727 {
728  int ret;
729  int is_inplace = !!(flags & AV_TX_INPLACE);
730  FFTXCodeletOptions sub_opts = {
731  .map_dir = is_inplace ? FF_TX_MAP_SCATTER : FF_TX_MAP_GATHER,
732  };
733 
734  flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
735  flags |= AV_TX_INPLACE; /* in-place */
736  flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
737 
738  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len, inv, scale)))
739  return ret;
740 
741  if (is_inplace && (ret = ff_tx_gen_inplace_map(s, len)))
742  return ret;
743 
744  return 0;
745 }
746 
748  const FFTXCodelet *cd,
749  uint64_t flags,
751  int len, int inv,
752  const void *scale)
753 {
754  if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
755  return AVERROR(ENOMEM);
756  flags &= ~AV_TX_INPLACE;
757  return TX_NAME(ff_tx_fft_init)(s, cd, flags, opts, len, inv, scale);
758 }
759 
760 static void TX_NAME(ff_tx_fft)(AVTXContext *s, void *_dst,
761  void *_src, ptrdiff_t stride)
762 {
763  TXComplex *src = _src;
764  TXComplex *dst1 = s->flags & AV_TX_INPLACE ? s->tmp : _dst;
765  TXComplex *dst2 = _dst;
766  int *map = s->sub[0].map;
767  int len = s->len;
768 
769  /* Compilers can't vectorize this anyway without assuming AVX2, which they
770  * generally don't, at least without -march=native -mtune=native */
771  for (int i = 0; i < len; i++)
772  dst1[i] = src[map[i]];
773 
774  s->fn[0](&s->sub[0], dst2, dst1, stride);
775 }
776 
/* In-place variant: permutes the contents of _src without a second buffer,
 * then runs the first subtransform from _src to _dst.
 *
 * Two tables drive the permutation:
 *  - s->sub->map gives, for an index, the next index in its cycle;
 *  - s->map is a list of cycle starting indices. The first entry is always
 *    consumed; after that the list ends at the first 0 entry.
 * For each starting index the inner loop walks the cycle, swapping the
 * carried value with each slot it visits until it returns to the start. */
777 static void TX_NAME(ff_tx_fft_inplace)(AVTXContext *s, void *_dst,
778  void *_src, ptrdiff_t stride)
779 {
780  TXComplex *src = _src;
781  TXComplex *dst = _dst;
782  TXComplex tmp;
783  const int *map = s->sub->map;
784  const int *inplace_idx = s->map;
785  int src_idx, dst_idx;
786 
787  src_idx = *inplace_idx++;
788  do {
    /* tmp carries the value being moved along the current cycle */
789  tmp = src[src_idx];
790  dst_idx = map[src_idx];
791  do {
792  FFSWAP(TXComplex, tmp, src[dst_idx]);
793  dst_idx = map[dst_idx];
794  } while (dst_idx != src_idx); /* Can be > as well, but was less predictable */
    /* Here dst_idx == src_idx: close the cycle */
795  src[dst_idx] = tmp;
796  } while ((src_idx = *inplace_idx++));
797 
798  s->fn[0](&s->sub[0], dst, src, stride);
799 }
800 
801 static const FFTXCodelet TX_NAME(ff_tx_fft_def) = {
802  .name = TX_NAME_STR("fft"),
803  .function = TX_NAME(ff_tx_fft),
804  .type = TX_TYPE(FFT),
806  .factors[0] = TX_FACTOR_ANY,
807  .nb_factors = 1,
808  .min_len = 2,
809  .max_len = TX_LEN_UNLIMITED,
810  .init = TX_NAME(ff_tx_fft_init),
812  .prio = FF_TX_PRIO_BASE,
813 };
814 
815 static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_small_def) = {
816  .name = TX_NAME_STR("fft_inplace_small"),
817  .function = TX_NAME(ff_tx_fft),
818  .type = TX_TYPE(FFT),
820  .factors[0] = TX_FACTOR_ANY,
821  .nb_factors = 1,
822  .min_len = 2,
823  .max_len = 65536,
826  .prio = FF_TX_PRIO_BASE - 256,
827 };
828 
829 static const FFTXCodelet TX_NAME(ff_tx_fft_inplace_def) = {
830  .name = TX_NAME_STR("fft_inplace"),
831  .function = TX_NAME(ff_tx_fft_inplace),
832  .type = TX_TYPE(FFT),
834  .factors[0] = TX_FACTOR_ANY,
835  .nb_factors = 1,
836  .min_len = 2,
837  .max_len = TX_LEN_UNLIMITED,
838  .init = TX_NAME(ff_tx_fft_init),
840  .prio = FF_TX_PRIO_BASE - 512,
841 };
842 
844  const FFTXCodelet *cd,
845  uint64_t flags,
847  int len, int inv,
848  const void *scale)
849 {
850  const double phase = s->inv ? 2.0*M_PI/len : -2.0*M_PI/len;
851 
852  if (!(s->exp = av_malloc(len*len*sizeof(*s->exp))))
853  return AVERROR(ENOMEM);
854 
855  for (int i = 0; i < len; i++) {
856  for (int j = 0; j < len; j++) {
857  const double factor = phase*i*j;
858  s->exp[i*j] = (TXComplex){
859  RESCALE(cos(factor)),
860  RESCALE(sin(factor)),
861  };
862  }
863  }
864 
865  return 0;
866 }
867 
868 static void TX_NAME(ff_tx_fft_naive)(AVTXContext *s, void *_dst, void *_src,
869  ptrdiff_t stride)
870 {
871  TXComplex *src = _src;
872  TXComplex *dst = _dst;
873  const int n = s->len;
874  double phase = s->inv ? 2.0*M_PI/n : -2.0*M_PI/n;
875 
876  stride /= sizeof(*dst);
877 
878  for (int i = 0; i < n; i++) {
879  TXComplex tmp = { 0 };
880  for (int j = 0; j < n; j++) {
881  const double factor = phase*i*j;
882  const TXComplex mult = {
883  RESCALE(cos(factor)),
884  RESCALE(sin(factor)),
885  };
886  TXComplex res;
887  CMUL3(res, src[j], mult);
888  tmp.re += res.re;
889  tmp.im += res.im;
890  }
891  dst[i*stride] = tmp;
892  }
893 }
894 
895 static void TX_NAME(ff_tx_fft_naive_small)(AVTXContext *s, void *_dst, void *_src,
896  ptrdiff_t stride)
897 {
898  TXComplex *src = _src;
899  TXComplex *dst = _dst;
900  const int n = s->len;
901 
902  stride /= sizeof(*dst);
903 
904  for (int i = 0; i < n; i++) {
905  TXComplex tmp = { 0 };
906  for (int j = 0; j < n; j++) {
907  TXComplex res;
908  const TXComplex mult = s->exp[i*j];
909  CMUL3(res, src[j], mult);
910  tmp.re += res.re;
911  tmp.im += res.im;
912  }
913  dst[i*stride] = tmp;
914  }
915 }
916 
917 static const FFTXCodelet TX_NAME(ff_tx_fft_naive_small_def) = {
918  .name = TX_NAME_STR("fft_naive_small"),
919  .function = TX_NAME(ff_tx_fft_naive_small),
920  .type = TX_TYPE(FFT),
922  .factors[0] = TX_FACTOR_ANY,
923  .nb_factors = 1,
924  .min_len = 2,
925  .max_len = 1024,
928  .prio = FF_TX_PRIO_MIN/2,
929 };
930 
931 static const FFTXCodelet TX_NAME(ff_tx_fft_naive_def) = {
932  .name = TX_NAME_STR("fft_naive"),
933  .function = TX_NAME(ff_tx_fft_naive),
934  .type = TX_TYPE(FFT),
936  .factors[0] = TX_FACTOR_ANY,
937  .nb_factors = 1,
938  .min_len = 2,
939  .max_len = TX_LEN_UNLIMITED,
940  .init = NULL,
941  .cpu_flags = FF_TX_CPU_FLAGS_ALL,
942  .prio = FF_TX_PRIO_MIN,
943 };
944 
946  const FFTXCodelet *cd,
947  uint64_t flags,
949  int len, int inv,
950  const void *scale)
951 {
952  int ret, *tmp, ps = flags & FF_TX_PRESHUFFLE;
953  FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
954  size_t extra_tmp_len = 0;
955  int len_list[TX_MAX_DECOMPOSITIONS];
956 
957  if ((ret = ff_tx_decompose_length(len_list, TX_TYPE(FFT), len, inv)) < 0)
958  return ret;
959 
960  /* Two iterations to test both orderings. */
961  for (int i = 0; i < ret; i++) {
962  int len1 = len_list[i];
963  int len2 = len / len1;
964 
965  /* Our ptwo transforms don't support striding the output. */
966  if (len2 & (len2 - 1))
967  FFSWAP(int, len1, len2);
968 
970 
971  /* First transform */
972  sub_opts.map_dir = FF_TX_MAP_GATHER;
973  flags &= ~AV_TX_INPLACE;
975  flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
976  ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
977  len1, inv, scale);
978 
979  if (ret == AVERROR(ENOMEM)) {
980  return ret;
981  } else if (ret < 0) { /* Try again without a preshuffle flag */
983  ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
984  len1, inv, scale);
985  if (ret == AVERROR(ENOMEM))
986  return ret;
987  else if (ret < 0)
988  continue;
989  }
990 
991  /* Second transform. */
992  sub_opts.map_dir = FF_TX_MAP_SCATTER;
994 retry:
996  flags |= AV_TX_INPLACE;
997  ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
998  len2, inv, scale);
999 
1000  if (ret == AVERROR(ENOMEM)) {
1001  return ret;
1002  } else if (ret < 0) { /* Try again with an out-of-place transform */
1004  flags &= ~AV_TX_INPLACE;
1005  ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1006  len2, inv, scale);
1007  if (ret == AVERROR(ENOMEM)) {
1008  return ret;
1009  } else if (ret < 0) {
1010  if (flags & FF_TX_PRESHUFFLE) { /* Retry again without a preshuf flag */
1011  flags &= ~FF_TX_PRESHUFFLE;
1012  goto retry;
1013  } else {
1014  continue;
1015  }
1016  }
1017  }
1018 
1019  /* Success */
1020  break;
1021  }
1022 
1023  /* If nothing was successful, error out */
1024  if (ret < 0)
1025  return ret;
1026 
1027  /* Generate PFA map */
1028  if ((ret = ff_tx_gen_compound_mapping(s, opts, 0,
1029  s->sub[0].len, s->sub[1].len)))
1030  return ret;
1031 
1032  if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1033  return AVERROR(ENOMEM);
1034 
1035  /* Flatten input map */
1036  tmp = (int *)s->tmp;
1037  for (int k = 0; k < len; k += s->sub[0].len) {
1038  memcpy(tmp, &s->map[k], s->sub[0].len*sizeof(*tmp));
1039  for (int i = 0; i < s->sub[0].len; i++)
1040  s->map[k + i] = tmp[s->sub[0].map[i]];
1041  }
1042 
1043  /* Only allocate extra temporary memory if we need it */
1044  if (!(s->sub[1].flags & AV_TX_INPLACE))
1045  extra_tmp_len = len;
1046  else if (!ps)
1047  extra_tmp_len = s->sub[0].len;
1048 
1049  if (extra_tmp_len && !(s->exp = av_malloc(extra_tmp_len*sizeof(*s->exp))))
1050  return AVERROR(ENOMEM);
1051 
1052  return 0;
1053 }
1054 
1055 static void TX_NAME(ff_tx_fft_pfa)(AVTXContext *s, void *_out,
1056  void *_in, ptrdiff_t stride)
1057 {
1058  const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
1059  const int *in_map = s->map, *out_map = in_map + l;
1060  const int *sub_map = s->sub[1].map;
1061  TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp;
1062  TXComplex *in = _in, *out = _out;
1063 
1064  stride /= sizeof(*out);
1065 
1066  for (int i = 0; i < m; i++) {
1067  for (int j = 0; j < n; j++)
1068  s->exp[j] = in[in_map[i*n + j]];
1069  s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], s->exp, m*sizeof(TXComplex));
1070  }
1071 
1072  for (int i = 0; i < n; i++)
1073  s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));
1074 
1075  for (int i = 0; i < l; i++)
1076  out[i*stride] = tmp1[out_map[i]];
1077 }
1078 
1079 static void TX_NAME(ff_tx_fft_pfa_ns)(AVTXContext *s, void *_out,
1080  void *_in, ptrdiff_t stride)
1081 {
1082  const int n = s->sub[0].len, m = s->sub[1].len, l = s->len;
1083  const int *in_map = s->map, *out_map = in_map + l;
1084  const int *sub_map = s->sub[1].map;
1085  TXComplex *tmp1 = s->sub[1].flags & AV_TX_INPLACE ? s->tmp : s->exp;
1086  TXComplex *in = _in, *out = _out;
1087 
1088  stride /= sizeof(*out);
1089 
1090  for (int i = 0; i < m; i++)
1091  s->fn[0](&s->sub[0], &s->tmp[sub_map[i]], &in[i*n], m*sizeof(TXComplex));
1092 
1093  for (int i = 0; i < n; i++)
1094  s->fn[1](&s->sub[1], &tmp1[m*i], &s->tmp[m*i], sizeof(TXComplex));
1095 
1096  for (int i = 0; i < l; i++)
1097  out[i*stride] = tmp1[out_map[i]];
1098 }
1099 
1100 static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_def) = {
1101  .name = TX_NAME_STR("fft_pfa"),
1102  .function = TX_NAME(ff_tx_fft_pfa),
1103  .type = TX_TYPE(FFT),
1105  .factors = { 7, 5, 3, 2, TX_FACTOR_ANY },
1106  .nb_factors = 2,
1107  .min_len = 2*3,
1108  .max_len = TX_LEN_UNLIMITED,
1109  .init = TX_NAME(ff_tx_fft_pfa_init),
1111  .prio = FF_TX_PRIO_BASE,
1112 };
1113 
1114 static const FFTXCodelet TX_NAME(ff_tx_fft_pfa_ns_def) = {
1115  .name = TX_NAME_STR("fft_pfa_ns"),
1116  .function = TX_NAME(ff_tx_fft_pfa_ns),
1117  .type = TX_TYPE(FFT),
1120  .factors = { 7, 5, 3, 2, TX_FACTOR_ANY },
1121  .nb_factors = 2,
1122  .min_len = 2*3,
1123  .max_len = TX_LEN_UNLIMITED,
1124  .init = TX_NAME(ff_tx_fft_pfa_init),
1126  .prio = FF_TX_PRIO_BASE,
1127 };
1128 
1130  const FFTXCodelet *cd,
1131  uint64_t flags,
1133  int len, int inv,
1134  const void *scale)
1135 {
1136  s->scale_d = *((SCALE_TYPE *)scale);
1137  s->scale_f = s->scale_d;
1138  return 0;
1139 }
1140 
1141 static void TX_NAME(ff_tx_mdct_naive_fwd)(AVTXContext *s, void *_dst,
1142  void *_src, ptrdiff_t stride)
1143 {
1144  TXSample *src = _src;
1145  TXSample *dst = _dst;
1146  double scale = s->scale_d;
1147  int len = s->len;
1148  const double phase = M_PI/(4.0*len);
1149 
1150  stride /= sizeof(*dst);
1151 
1152  for (int i = 0; i < len; i++) {
1153  double sum = 0.0;
1154  for (int j = 0; j < len*2; j++) {
1155  int a = (2*j + 1 + len) * (2*i + 1);
1156  sum += UNSCALE(src[j]) * cos(a * phase);
1157  }
1158  dst[i*stride] = RESCALE(sum*scale);
1159  }
1160 }
1161 
1162 static void TX_NAME(ff_tx_mdct_naive_inv)(AVTXContext *s, void *_dst,
1163  void *_src, ptrdiff_t stride)
1164 {
1165  TXSample *src = _src;
1166  TXSample *dst = _dst;
1167  double scale = s->scale_d;
1168  int len = s->len >> 1;
1169  int len2 = len*2;
1170  const double phase = M_PI/(4.0*len2);
1171 
1172  stride /= sizeof(*src);
1173 
1174  for (int i = 0; i < len; i++) {
1175  double sum_d = 0.0;
1176  double sum_u = 0.0;
1177  double i_d = phase * (4*len - 2*i - 1);
1178  double i_u = phase * (3*len2 + 2*i + 1);
1179  for (int j = 0; j < len2; j++) {
1180  double a = (2 * j + 1);
1181  double a_d = cos(a * i_d);
1182  double a_u = cos(a * i_u);
1183  double val = UNSCALE(src[j*stride]);
1184  sum_d += a_d * val;
1185  sum_u += a_u * val;
1186  }
1187  dst[i + 0] = RESCALE( sum_d*scale);
1188  dst[i + len] = RESCALE(-sum_u*scale);
1189  }
1190 }
1191 
1192 static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_fwd_def) = {
1193  .name = TX_NAME_STR("mdct_naive_fwd"),
1194  .function = TX_NAME(ff_tx_mdct_naive_fwd),
1195  .type = TX_TYPE(MDCT),
1197  .factors = { 2, TX_FACTOR_ANY }, /* MDCTs need an even length */
1198  .nb_factors = 2,
1199  .min_len = 2,
1200  .max_len = TX_LEN_UNLIMITED,
1201  .init = TX_NAME(ff_tx_mdct_naive_init),
1203  .prio = FF_TX_PRIO_MIN,
1204 };
1205 
1206 static const FFTXCodelet TX_NAME(ff_tx_mdct_naive_inv_def) = {
1207  .name = TX_NAME_STR("mdct_naive_inv"),
1208  .function = TX_NAME(ff_tx_mdct_naive_inv),
1209  .type = TX_TYPE(MDCT),
1211  .factors = { 2, TX_FACTOR_ANY },
1212  .nb_factors = 2,
1213  .min_len = 2,
1214  .max_len = TX_LEN_UNLIMITED,
1215  .init = TX_NAME(ff_tx_mdct_naive_init),
1217  .prio = FF_TX_PRIO_MIN,
1218 };
1219 
1221  const FFTXCodelet *cd,
1222  uint64_t flags,
1224  int len, int inv,
1225  const void *scale)
1226 {
1227  int ret;
1228  FFTXCodeletOptions sub_opts = {
1230  };
1231 
1232  s->scale_d = *((SCALE_TYPE *)scale);
1233  s->scale_f = s->scale_d;
1234 
1235  flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
1236  flags |= AV_TX_INPLACE; /* in-place */
1237  flags |= FF_TX_PRESHUFFLE; /* First try with an in-place transform */
1238 
1239  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
1240  inv, scale))) {
1241  flags &= ~FF_TX_PRESHUFFLE; /* Now try with a generic FFT */
1242  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
1243  inv, scale)))
1244  return ret;
1245  }
1246 
1247  s->map = av_malloc((len >> 1)*sizeof(*s->map));
1248  if (!s->map)
1249  return AVERROR(ENOMEM);
1250 
1251  /* If we need to preshuffle copy the map from the subcontext */
1252  if (s->sub[0].flags & FF_TX_PRESHUFFLE) {
1253  memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map));
1254  } else {
1255  for (int i = 0; i < len >> 1; i++)
1256  s->map[i] = i;
1257  }
1258 
1259  if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
1260  return ret;
1261 
1262  /* Saves a multiply in a hot path. */
1263  if (inv)
1264  for (int i = 0; i < (s->len >> 1); i++)
1265  s->map[i] <<= 1;
1266 
1267  return 0;
1268 }
1269 
1270 static void TX_NAME(ff_tx_mdct_fwd)(AVTXContext *s, void *_dst, void *_src,
1271  ptrdiff_t stride)
1272 {
1273  TXSample *src = _src, *dst = _dst;
1274  TXComplex *exp = s->exp, tmp, *z = _dst;
1275  const int len2 = s->len >> 1;
1276  const int len4 = s->len >> 2;
1277  const int len3 = len2 * 3;
1278  const int *sub_map = s->map;
1279 
1280  stride /= sizeof(*dst);
1281 
1282  for (int i = 0; i < len2; i++) { /* Folding and pre-reindexing */
1283  const int k = 2*i;
1284  const int idx = sub_map[i];
1285  if (k < len2) {
1286  tmp.re = FOLD(-src[ len2 + k], src[1*len2 - 1 - k]);
1287  tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]);
1288  } else {
1289  tmp.re = FOLD(-src[ len2 + k], -src[5*len2 - 1 - k]);
1290  tmp.im = FOLD( src[-len2 + k], -src[1*len3 - 1 - k]);
1291  }
1292  CMUL(z[idx].im, z[idx].re, tmp.re, tmp.im, exp[i].re, exp[i].im);
1293  }
1294 
1295  s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
1296 
1297  for (int i = 0; i < len4; i++) {
1298  const int i0 = len4 + i, i1 = len4 - i - 1;
1299  TXComplex src1 = { z[i1].re, z[i1].im };
1300  TXComplex src0 = { z[i0].re, z[i0].im };
1301 
1302  CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im,
1303  exp[i0].im, exp[i0].re);
1304  CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im,
1305  exp[i1].im, exp[i1].re);
1306  }
1307 }
1308 
1309 static void TX_NAME(ff_tx_mdct_inv)(AVTXContext *s, void *_dst, void *_src,
1310  ptrdiff_t stride)
1311 {
1312  TXComplex *z = _dst, *exp = s->exp;
1313  const TXSample *src = _src, *in1, *in2;
1314  const int len2 = s->len >> 1;
1315  const int len4 = s->len >> 2;
1316  const int *sub_map = s->map;
1317 
1318  stride /= sizeof(*src);
1319  in1 = src;
1320  in2 = src + ((len2*2) - 1) * stride;
1321 
1322  for (int i = 0; i < len2; i++) {
1323  int k = sub_map[i];
1324  TXComplex tmp = { in2[-k*stride], in1[k*stride] };
1325  CMUL3(z[i], tmp, exp[i]);
1326  }
1327 
1328  s->fn[0](&s->sub[0], z, z, sizeof(TXComplex));
1329 
1330  exp += len2;
1331  for (int i = 0; i < len4; i++) {
1332  const int i0 = len4 + i, i1 = len4 - i - 1;
1333  TXComplex src1 = { z[i1].im, z[i1].re };
1334  TXComplex src0 = { z[i0].im, z[i0].re };
1335 
1336  CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re);
1337  CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re);
1338  }
1339 }
1340 
1341 static const FFTXCodelet TX_NAME(ff_tx_mdct_fwd_def) = {
1342  .name = TX_NAME_STR("mdct_fwd"),
1343  .function = TX_NAME(ff_tx_mdct_fwd),
1344  .type = TX_TYPE(MDCT),
1346  .factors = { 2, TX_FACTOR_ANY },
1347  .nb_factors = 2,
1348  .min_len = 2,
1349  .max_len = TX_LEN_UNLIMITED,
1350  .init = TX_NAME(ff_tx_mdct_init),
1352  .prio = FF_TX_PRIO_BASE,
1353 };
1354 
1355 static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_def) = {
1356  .name = TX_NAME_STR("mdct_inv"),
1357  .function = TX_NAME(ff_tx_mdct_inv),
1358  .type = TX_TYPE(MDCT),
1360  .factors = { 2, TX_FACTOR_ANY },
1361  .nb_factors = 2,
1362  .min_len = 2,
1363  .max_len = TX_LEN_UNLIMITED,
1364  .init = TX_NAME(ff_tx_mdct_init),
1366  .prio = FF_TX_PRIO_BASE,
1367 };
1368 
1370  const FFTXCodelet *cd,
1371  uint64_t flags,
1373  int len, int inv,
1374  const void *scale)
1375 {
1376  int ret;
1377 
1378  s->scale_d = *((SCALE_TYPE *)scale);
1379  s->scale_f = s->scale_d;
1380 
1381  flags &= ~AV_TX_FULL_IMDCT;
1382 
1383  if ((ret = ff_tx_init_subtx(s, TX_TYPE(MDCT), flags, NULL, len, 1, scale)))
1384  return ret;
1385 
1386  return 0;
1387 }
1388 
1389 static void TX_NAME(ff_tx_mdct_inv_full)(AVTXContext *s, void *_dst,
1390  void *_src, ptrdiff_t stride)
1391 {
1392  int len = s->len << 1;
1393  int len2 = len >> 1;
1394  int len4 = len >> 2;
1395  TXSample *dst = _dst;
1396 
1397  s->fn[0](&s->sub[0], dst + len4, _src, stride);
1398 
1399  stride /= sizeof(*dst);
1400 
1401  for (int i = 0; i < len4; i++) {
1402  dst[ i*stride] = -dst[(len2 - i - 1)*stride];
1403  dst[(len - i - 1)*stride] = dst[(len2 + i + 0)*stride];
1404  }
1405 }
1406 
1407 static const FFTXCodelet TX_NAME(ff_tx_mdct_inv_full_def) = {
1408  .name = TX_NAME_STR("mdct_inv_full"),
1409  .function = TX_NAME(ff_tx_mdct_inv_full),
1410  .type = TX_TYPE(MDCT),
1411  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE |
1413  .factors = { 2, TX_FACTOR_ANY },
1414  .nb_factors = 2,
1415  .min_len = 2,
1416  .max_len = TX_LEN_UNLIMITED,
1419  .prio = FF_TX_PRIO_BASE,
1420 };
1421 
1423  const FFTXCodelet *cd,
1424  uint64_t flags,
1426  int len, int inv,
1427  const void *scale)
1428 {
1429  int ret, sub_len;
1430  FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER };
1431 
1432  len >>= 1;
1433  sub_len = len / cd->factors[0];
1434 
1435  s->scale_d = *((SCALE_TYPE *)scale);
1436  s->scale_f = s->scale_d;
1437 
1438  flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
1439  flags |= AV_TX_INPLACE; /* in-place */
1440  flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
1441 
1442  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
1443  sub_len, inv, scale)))
1444  return ret;
1445 
1446  if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len)))
1447  return ret;
1448 
1449  /* Our 15-point transform is also a compound one, so embed its input map */
1450  if (cd->factors[0] == 15)
1451  TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5);
1452 
1453  if ((ret = TX_TAB(ff_tx_mdct_gen_exp)(s, inv ? s->map : NULL)))
1454  return ret;
1455 
1456  /* Saves multiplies in loops. */
1457  for (int i = 0; i < len; i++)
1458  s->map[i] <<= 1;
1459 
1460  if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
1461  return AVERROR(ENOMEM);
1462 
1463  TX_TAB(ff_tx_init_tabs)(len / sub_len);
1464 
1465  return 0;
1466 }
1467 
/*
 * DECL_COMP_IMDCT(N) declares, for a fixed factor N, the function
 * ff_tx_mdct_pfa_<N>xM_inv together with its codelet definition.
 *
 * The generated function works in three steps, with m = s->sub->len:
 *  1. For every group of N entries of s->map, pair the input at offset k
 *     (counted from the start) with the one at the same offset counted back
 *     from element N*m*2 - 1, multiply by the matching s->exp entries, and
 *     run fft<N> on the group, writing into s->tmp at the offset given by
 *     the subtransform's map with an output stride of m.
 *  2. Run the subtransform s->fn[0] in place on each of the N runs of m
 *     values in s->tmp.
 *  3. Walk outwards from the middle, read s->tmp through the second half of
 *     s->map (starting at entry N*m), multiply by the following s->exp
 *     entries and store into the output as TXComplex values.
 *
 * `stride` is the input stride in bytes; the output is written contiguously.
 */
1468 #define DECL_COMP_IMDCT(N) \
1469 static void TX_NAME(ff_tx_mdct_pfa_##N##xM_inv)(AVTXContext *s, void *_dst, \
1470  void *_src, ptrdiff_t stride) \
1471 { \
1472  TXComplex fft##N##in[N]; \
1473  TXComplex *z = _dst, *exp = s->exp; \
1474  const TXSample *src = _src, *in1, *in2; \
1475  const int len4 = s->len >> 2; \
1476  const int len2 = s->len >> 1; \
1477  const int m = s->sub->len; \
1478  const int *in_map = s->map, *out_map = in_map + N*m; \
1479  const int *sub_map = s->sub->map; \
1480  \
1481  stride /= sizeof(*src); /* To convert it from bytes */ \
1482  in1 = src; \
1483  in2 = src + ((N*m*2) - 1) * stride; \
1484  \
1485  for (int i = 0; i < len2; i += N) { \
1486  for (int j = 0; j < N; j++) { \
1487  const int k = in_map[j]; \
1488  TXComplex tmp = { in2[-k*stride], in1[k*stride] }; \
1489  CMUL3(fft##N##in[j], tmp, exp[j]); \
1490  } \
1491  fft##N(s->tmp + *(sub_map++), fft##N##in, m); \
1492  exp += N; \
1493  in_map += N; \
1494  } \
1495  \
1496  for (int i = 0; i < N; i++) \
1497  s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex)); \
1498  \
1499  for (int i = 0; i < len4; i++) { \
1500  const int i0 = len4 + i, i1 = len4 - i - 1; \
1501  const int s0 = out_map[i0], s1 = out_map[i1]; \
1502  TXComplex src1 = { s->tmp[s1].im, s->tmp[s1].re }; \
1503  TXComplex src0 = { s->tmp[s0].im, s->tmp[s0].re }; \
1504  \
1505  CMUL(z[i1].re, z[i0].im, src1.re, src1.im, exp[i1].im, exp[i1].re); \
1506  CMUL(z[i0].re, z[i1].im, src0.re, src0.im, exp[i0].im, exp[i0].re); \
1507  } \
1508 } \
1509  \
1510 static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_inv_def) = { \
1511  .name = TX_NAME_STR("mdct_pfa_" #N "xM_inv"), \
1512  .function = TX_NAME(ff_tx_mdct_pfa_##N##xM_inv), \
1513  .type = TX_TYPE(MDCT), \
1514  .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_INVERSE_ONLY, \
1515  .factors = { N, TX_FACTOR_ANY }, \
1516  .nb_factors = 2, \
1517  .min_len = N*2, \
1518  .max_len = TX_LEN_UNLIMITED, \
1519  .init = TX_NAME(ff_tx_mdct_pfa_init), \
1520  .cpu_flags = FF_TX_CPU_FLAGS_ALL, \
1521  .prio = FF_TX_PRIO_BASE, \
1522 };
1523 
1524 DECL_COMP_IMDCT(3)
1525 DECL_COMP_IMDCT(5)
1526 DECL_COMP_IMDCT(7)
1527 DECL_COMP_IMDCT(9)
1528 DECL_COMP_IMDCT(15)
1529 
/*
 * DECL_COMP_MDCT(N) declares, for a fixed factor N, the function
 * ff_tx_mdct_pfa_<N>xM_fwd together with its codelet definition.
 *
 * The generated function works in three steps, with m = s->sub->len:
 *  1. For each of the m groups of N entries in s->map, fold the contiguous
 *     input around the indices stored in the map, multiply by
 *     s->exp[k >> 1], and run fft<N> on the group, writing into s->tmp at
 *     the offset given by the subtransform's map with an output stride of m.
 *  2. Run the subtransform s->fn[0] in place on each of the N runs of m
 *     values in s->tmp.
 *  3. Walk outwards from the middle, read s->tmp through the second half of
 *     s->map (starting at entry N*m), multiply by s->exp and write the real
 *     outputs.
 *
 * `stride` is the output stride in bytes; the input is read contiguously.
 */
1530 #define DECL_COMP_MDCT(N) \
1531 static void TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd)(AVTXContext *s, void *_dst, \
1532  void *_src, ptrdiff_t stride) \
1533 { \
1534  TXComplex fft##N##in[N]; \
1535  TXSample *src = _src, *dst = _dst; \
1536  TXComplex *exp = s->exp, tmp; \
1537  const int m = s->sub->len; \
1538  const int len4 = N*m; \
1539  const int len3 = len4 * 3; \
1540  const int len8 = s->len >> 2; \
1541  const int *in_map = s->map, *out_map = in_map + N*m; \
1542  const int *sub_map = s->sub->map; \
1543  \
1544  stride /= sizeof(*dst); \
1545  \
1546  for (int i = 0; i < m; i++) { /* Folding and pre-reindexing */ \
1547  for (int j = 0; j < N; j++) { \
1548  const int k = in_map[i*N + j]; \
1549  if (k < len4) { \
1550  tmp.re = FOLD(-src[ len4 + k], src[1*len4 - 1 - k]); \
1551  tmp.im = FOLD(-src[ len3 + k], -src[1*len3 - 1 - k]); \
1552  } else { \
1553  tmp.re = FOLD(-src[ len4 + k], -src[5*len4 - 1 - k]); \
1554  tmp.im = FOLD( src[-len4 + k], -src[1*len3 - 1 - k]); \
1555  } \
1556  CMUL(fft##N##in[j].im, fft##N##in[j].re, tmp.re, tmp.im, \
1557  exp[k >> 1].re, exp[k >> 1].im); \
1558  } \
1559  fft##N(s->tmp + sub_map[i], fft##N##in, m); \
1560  } \
1561  \
1562  for (int i = 0; i < N; i++) \
1563  s->fn[0](&s->sub[0], s->tmp + m*i, s->tmp + m*i, sizeof(TXComplex)); \
1564  \
1565  for (int i = 0; i < len8; i++) { \
1566  const int i0 = len8 + i, i1 = len8 - i - 1; \
1567  const int s0 = out_map[i0], s1 = out_map[i1]; \
1568  TXComplex src1 = { s->tmp[s1].re, s->tmp[s1].im }; \
1569  TXComplex src0 = { s->tmp[s0].re, s->tmp[s0].im }; \
1570  \
1571  CMUL(dst[2*i1*stride + stride], dst[2*i0*stride], src0.re, src0.im, \
1572  exp[i0].im, exp[i0].re); \
1573  CMUL(dst[2*i0*stride + stride], dst[2*i1*stride], src1.re, src1.im, \
1574  exp[i1].im, exp[i1].re); \
1575  } \
1576 } \
1577  \
1578 static const FFTXCodelet TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd_def) = { \
1579  .name = TX_NAME_STR("mdct_pfa_" #N "xM_fwd"), \
1580  .function = TX_NAME(ff_tx_mdct_pfa_##N##xM_fwd), \
1581  .type = TX_TYPE(MDCT), \
1582  .flags = AV_TX_UNALIGNED | FF_TX_OUT_OF_PLACE | FF_TX_FORWARD_ONLY, \
1583  .factors = { N, TX_FACTOR_ANY }, \
1584  .nb_factors = 2, \
1585  .min_len = N*2, \
1586  .max_len = TX_LEN_UNLIMITED, \
1587  .init = TX_NAME(ff_tx_mdct_pfa_init), \
1588  .cpu_flags = FF_TX_CPU_FLAGS_ALL, \
1589  .prio = FF_TX_PRIO_BASE, \
1590 };
1591 
1592 DECL_COMP_MDCT(3)
1593 DECL_COMP_MDCT(5)
1594 DECL_COMP_MDCT(7)
1595 DECL_COMP_MDCT(9)
1596 DECL_COMP_MDCT(15)
1597 
1599  const FFTXCodelet *cd,
1600  uint64_t flags,
1602  int len, int inv,
1603  const void *scale)
1604 {
1605  int ret;
1606  double f, m;
1607  TXSample *tab;
1608 
1609  s->scale_d = *((SCALE_TYPE *)scale);
1610  s->scale_f = s->scale_d;
1611 
1612  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, NULL, len >> 1, inv, scale)))
1613  return ret;
1614 
1615  if (!(s->exp = av_mallocz((8 + (len >> 2) - 1)*sizeof(*s->exp))))
1616  return AVERROR(ENOMEM);
1617 
1618  tab = (TXSample *)s->exp;
1619 
1620  f = 2*M_PI/len;
1621 
1622  m = (inv ? 2*s->scale_d : s->scale_d);
1623 
1624  *tab++ = RESCALE((inv ? 0.5 : 1.0) * m);
1625  *tab++ = RESCALE(inv ? 0.5*m : 1.0*m);
1626  *tab++ = RESCALE( m);
1627  *tab++ = RESCALE(-m);
1628 
1629  *tab++ = RESCALE( (0.5 - 0.0) * m);
1630  *tab++ = RESCALE( (0.0 - 0.5) * m);
1631  *tab++ = RESCALE( (0.5 - inv) * m);
1632  *tab++ = RESCALE(-(0.5 - inv) * m);
1633 
1634  for (int i = 0; i < len >> 2; i++)
1635  *tab++ = RESCALE(cos(i*f));
1636  for (int i = len >> 2; i >= 0; i--)
1637  *tab++ = RESCALE(cos(i*f) * (inv ? +1.0 : -1.0));
1638 
1639  return 0;
1640 }
1641 
/*
 * DECL_RDFT(name, inv) declares ff_tx_rdft_<name>.
 *
 * s->exp is read as a TXSample table: eight scaling factors, followed by
 * the cosine table and, s->len/4 entries later, the sine table.
 *
 * inv == 0: the subtransform s->fn[0] runs first, from _src into _dst, and
 *           the recombination below is then applied to _dst in place.
 *           Afterwards data[0].im is moved to data[len2].re and both
 *           imaginary parts are zeroed, so _dst must hold len2 + 1 complex
 *           values.
 * inv != 0: the recombination is applied to _src in place (the input buffer
 *           is modified, and data[len2].re is read), and the subtransform
 *           then runs from _src into _dst.
 *
 * The `stride` argument is not read by the generated function.
 */
1642 #define DECL_RDFT(name, inv) \
1643 static void TX_NAME(ff_tx_rdft_ ##name)(AVTXContext *s, void *_dst, \
1644  void *_src, ptrdiff_t stride) \
1645 { \
1646  const int len2 = s->len >> 1; \
1647  const int len4 = s->len >> 2; \
1648  const TXSample *fact = (void *)s->exp; \
1649  const TXSample *tcos = fact + 8; \
1650  const TXSample *tsin = tcos + len4; \
1651  TXComplex *data = inv ? _src : _dst; \
1652  TXComplex t[3]; \
1653  \
1654  if (!inv) \
1655  s->fn[0](&s->sub[0], data, _src, sizeof(TXComplex)); \
1656  else \
1657  data[0].im = data[len2].re; \
1658  \
1659  /* The DC value's both components are real, but we need to change them \
1660  * into complex values. Also, the middle of the array is special-cased. \
1661  * These operations can be done before or after the loop. */ \
1662  t[0].re = data[0].re; \
1663  data[0].re = t[0].re + data[0].im; \
1664  data[0].im = t[0].re - data[0].im; \
1665  data[ 0].re = MULT(fact[0], data[ 0].re); \
1666  data[ 0].im = MULT(fact[1], data[ 0].im); \
1667  data[len4].re = MULT(fact[2], data[len4].re); \
1668  data[len4].im = MULT(fact[3], data[len4].im); \
1669  \
1670  for (int i = 1; i < len4; i++) { \
1671  /* Separate even and odd FFTs */ \
1672  t[0].re = MULT(fact[4], (data[i].re + data[len2 - i].re)); \
1673  t[0].im = MULT(fact[5], (data[i].im - data[len2 - i].im)); \
1674  t[1].re = MULT(fact[6], (data[i].im + data[len2 - i].im)); \
1675  t[1].im = MULT(fact[7], (data[i].re - data[len2 - i].re)); \
1676  \
1677  /* Apply twiddle factors to the odd FFT and add to the even FFT */ \
1678  CMUL(t[2].re, t[2].im, t[1].re, t[1].im, tcos[i], tsin[i]); \
1679  \
1680  data[ i].re = t[0].re + t[2].re; \
1681  data[ i].im = t[2].im - t[0].im; \
1682  data[len2 - i].re = t[0].re - t[2].re; \
1683  data[len2 - i].im = t[2].im + t[0].im; \
1684  } \
1685  \
1686  if (inv) { \
1687  s->fn[0](&s->sub[0], _dst, data, sizeof(TXComplex)); \
1688  } else { \
1689  /* Move [0].im to the last position, as convention requires */ \
1690  data[len2].re = data[0].im; \
1691  data[ 0].im = data[len2].im = 0; \
1692  } \
1693 }
1694 
1695 DECL_RDFT(r2c, 0)
1696 DECL_RDFT(c2r, 1)
1697 
1698 static const FFTXCodelet TX_NAME(ff_tx_rdft_r2c_def) = {
1699  .name = TX_NAME_STR("rdft_r2c"),
1700  .function = TX_NAME(ff_tx_rdft_r2c),
1701  .type = TX_TYPE(RDFT),
1702  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE |
1704  .factors = { 2, TX_FACTOR_ANY },
1705  .nb_factors = 2,
1706  .min_len = 2,
1707  .max_len = TX_LEN_UNLIMITED,
1708  .init = TX_NAME(ff_tx_rdft_init),
1710  .prio = FF_TX_PRIO_BASE,
1711 };
1712 
1713 static const FFTXCodelet TX_NAME(ff_tx_rdft_c2r_def) = {
1714  .name = TX_NAME_STR("rdft_c2r"),
1715  .function = TX_NAME(ff_tx_rdft_c2r),
1716  .type = TX_TYPE(RDFT),
1717  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE |
1719  .factors = { 2, TX_FACTOR_ANY },
1720  .nb_factors = 2,
1721  .min_len = 2,
1722  .max_len = TX_LEN_UNLIMITED,
1723  .init = TX_NAME(ff_tx_rdft_init),
1725  .prio = FF_TX_PRIO_BASE,
1726 };
1727 
1729  const FFTXCodelet *cd,
1730  uint64_t flags,
1732  int len, int inv,
1733  const void *scale)
1734 {
1735  int ret;
1736  double freq;
1737  TXSample *tab;
1738  SCALE_TYPE rsc = *((SCALE_TYPE *)scale);
1739 
1740  if (inv) {
1741  len *= 2;
1742  s->len *= 2;
1743  rsc *= 0.5;
1744  }
1745 
1746  if ((ret = ff_tx_init_subtx(s, TX_TYPE(RDFT), flags, NULL, len, inv, &rsc)))
1747  return ret;
1748 
1749  s->exp = av_malloc((len/2)*3*sizeof(TXSample));
1750  if (!s->exp)
1751  return AVERROR(ENOMEM);
1752 
1753  tab = (TXSample *)s->exp;
1754 
1755  freq = M_PI/(len*2);
1756 
1757  for (int i = 0; i < len; i++)
1758  tab[i] = RESCALE(cos(i*freq)*(!inv + 1));
1759 
1760  if (inv) {
1761  for (int i = 0; i < len/2; i++)
1762  tab[len + i] = RESCALE(0.5 / sin((2*i + 1)*freq));
1763  } else {
1764  for (int i = 0; i < len/2; i++)
1765  tab[len + i] = RESCALE(cos((len - 2*i - 1)*freq));
1766  }
1767 
1768  return 0;
1770 
1771 static void TX_NAME(ff_tx_dctII)(AVTXContext *s, void *_dst,
1772  void *_src, ptrdiff_t stride)
1773 {
1774  TXSample *dst = _dst;
1775  TXSample *src = _src;
1776  const int len = s->len;
1777  const int len2 = len >> 1;
1778  const TXSample *exp = (void *)s->exp;
1779  TXSample next;
1780 #ifdef TX_INT32
1781  int64_t tmp1, tmp2;
1782 #else
1783  TXSample tmp1, tmp2;
1784 #endif
1785 
1786  for (int i = 0; i < len2; i++) {
1787  TXSample in1 = src[i];
1788  TXSample in2 = src[len - i - 1];
1789  TXSample s = exp[len + i];
1790 
1791 #ifdef TX_INT32
1792  tmp1 = in1 + in2;
1793  tmp2 = in1 - in2;
1794 
1795  tmp1 >>= 1;
1796  tmp2 *= s;
1797 
1798  tmp2 = (tmp2 + 0x40000000) >> 31;
1799 #else
1800  tmp1 = (in1 + in2)*0.5;
1801  tmp2 = (in1 - in2)*s;
1802 #endif
1803 
1804  src[i] = tmp1 + tmp2;
1805  src[len - i - 1] = tmp1 - tmp2;
1806  }
1807 
1808  s->fn[0](&s->sub[0], dst, src, sizeof(TXComplex));
1809 
1810  next = dst[len];
1811 
1812  for (int i = len - 2; i > 0; i -= 2) {
1813  TXSample tmp;
1814 
1815  CMUL(tmp, dst[i], exp[len - i], exp[i], dst[i + 0], dst[i + 1]);
1816 
1817  dst[i + 1] = next;
1818 
1819  next += tmp;
1820  }
1821 
1822 #ifdef TX_INT32
1823  tmp1 = ((int64_t)exp[0]) * ((int64_t)dst[0]);
1824  dst[0] = (tmp1 + 0x40000000) >> 31;
1825 #else
1826  dst[0] = exp[0] * dst[0];
1827 #endif
1828  dst[1] = next;
1830 
1831 static void TX_NAME(ff_tx_dctIII)(AVTXContext *s, void *_dst,
1832  void *_src, ptrdiff_t stride)
1833 {
1834  TXSample *dst = _dst;
1835  TXSample *src = _src;
1836  const int len = s->len;
1837  const int len2 = len >> 1;
1838  const TXSample *exp = (void *)s->exp;
1839 #ifdef TX_INT32
1840  int64_t tmp1, tmp2 = src[len - 1];
1841  tmp2 = (2*tmp2 + 0x40000000) >> 31;
1842 #else
1843  TXSample tmp1, tmp2 = 2*src[len - 1];
1844 #endif
1845 
1846  src[len] = tmp2;
1847 
1848  for (int i = len - 2; i >= 2; i -= 2) {
1849  TXSample val1 = src[i - 0];
1850  TXSample val2 = src[i - 1] - src[i + 1];
1851 
1852  CMUL(src[i + 1], src[i], exp[len - i], exp[i], val1, val2);
1853  }
1854 
1855  s->fn[0](&s->sub[0], dst, src, sizeof(float));
1856 
1857  for (int i = 0; i < len2; i++) {
1858  TXSample in1 = dst[i];
1859  TXSample in2 = dst[len - i - 1];
1860  TXSample c = exp[len + i];
1861 
1862  tmp1 = in1 + in2;
1863  tmp2 = in1 - in2;
1864  tmp2 *= c;
1865 #ifdef TX_INT32
1866  tmp2 = (tmp2 + 0x40000000) >> 31;
1867 #endif
1868 
1869  dst[i] = tmp1 + tmp2;
1870  dst[len - i - 1] = tmp1 - tmp2;
1871  }
1872 }
1873 
1874 static const FFTXCodelet TX_NAME(ff_tx_dctII_def) = {
1875  .name = TX_NAME_STR("dctII"),
1876  .function = TX_NAME(ff_tx_dctII),
1877  .type = TX_TYPE(DCT),
1878  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE |
1880  .factors = { 2, TX_FACTOR_ANY },
1881  .min_len = 2,
1882  .max_len = TX_LEN_UNLIMITED,
1883  .init = TX_NAME(ff_tx_dct_init),
1885  .prio = FF_TX_PRIO_BASE,
1886 };
1887 
1888 static const FFTXCodelet TX_NAME(ff_tx_dctIII_def) = {
1889  .name = TX_NAME_STR("dctIII"),
1890  .function = TX_NAME(ff_tx_dctIII),
1891  .type = TX_TYPE(DCT),
1892  .flags = AV_TX_UNALIGNED | AV_TX_INPLACE |
1894  .factors = { 2, TX_FACTOR_ANY },
1895  .min_len = 2,
1896  .max_len = TX_LEN_UNLIMITED,
1897  .init = TX_NAME(ff_tx_dct_init),
1899  .prio = FF_TX_PRIO_BASE,
1900 };
1901 
1902 int TX_TAB(ff_tx_mdct_gen_exp)(AVTXContext *s, int *pre_tab)
1903 {
1904  int off = 0;
1905  int len4 = s->len >> 1;
1906  double scale = s->scale_d;
1907  const double theta = (scale < 0 ? len4 : 0) + 1.0/8.0;
1908  size_t alloc = pre_tab ? 2*len4 : len4;
1909 
1910  if (!(s->exp = av_malloc_array(alloc, sizeof(*s->exp))))
1911  return AVERROR(ENOMEM);
1912 
1913  scale = sqrt(fabs(scale));
1914 
1915  if (pre_tab)
1916  off = len4;
1917 
1918  for (int i = 0; i < len4; i++) {
1919  const double alpha = M_PI_2 * (i + theta) / len4;
1920  s->exp[off + i] = (TXComplex){ RESCALE(cos(alpha) * scale),
1921  RESCALE(sin(alpha) * scale) };
1922  }
1923 
1924  if (pre_tab)
1925  for (int i = 0; i < len4; i++)
1926  s->exp[i] = s->exp[len4 + pre_tab[i]];
1927 
1928  return 0;
1929 }
1930 
/* NULL-terminated table of pointers to every codelet definition in this
 * template; one instance exists per sample type through TX_NAME(). */
1931 const FFTXCodelet * const TX_NAME(ff_tx_codelet_list)[] = {
1932  /* Split-Radix codelets */
1933  &TX_NAME(ff_tx_fft2_ns_def),
1934  &TX_NAME(ff_tx_fft4_ns_def),
1935  &TX_NAME(ff_tx_fft8_ns_def),
1936  &TX_NAME(ff_tx_fft16_ns_def),
1937  &TX_NAME(ff_tx_fft32_ns_def),
1938  &TX_NAME(ff_tx_fft64_ns_def),
1939  &TX_NAME(ff_tx_fft128_ns_def),
1940  &TX_NAME(ff_tx_fft256_ns_def),
1941  &TX_NAME(ff_tx_fft512_ns_def),
1942  &TX_NAME(ff_tx_fft1024_ns_def),
1943  &TX_NAME(ff_tx_fft2048_ns_def),
1944  &TX_NAME(ff_tx_fft4096_ns_def),
1945  &TX_NAME(ff_tx_fft8192_ns_def),
1946  &TX_NAME(ff_tx_fft16384_ns_def),
1947  &TX_NAME(ff_tx_fft32768_ns_def),
1948  &TX_NAME(ff_tx_fft65536_ns_def),
1949  &TX_NAME(ff_tx_fft131072_ns_def),
1950 
1951  /* Prime factor codelets */
1952  &TX_NAME(ff_tx_fft3_ns_def),
1953  &TX_NAME(ff_tx_fft5_ns_def),
1954  &TX_NAME(ff_tx_fft7_ns_def),
1955  &TX_NAME(ff_tx_fft9_ns_def),
1956  &TX_NAME(ff_tx_fft15_ns_def),
1957 
1958  /* We get these for free */
1959  &TX_NAME(ff_tx_fft3_fwd_def),
1960  &TX_NAME(ff_tx_fft5_fwd_def),
1961  &TX_NAME(ff_tx_fft7_fwd_def),
1962  &TX_NAME(ff_tx_fft9_fwd_def),
1963 
1964  /* Standalone transforms */
1965  &TX_NAME(ff_tx_fft_def),
1966  &TX_NAME(ff_tx_fft_inplace_def),
1967  &TX_NAME(ff_tx_fft_inplace_small_def),
1968  &TX_NAME(ff_tx_fft_pfa_def),
1969  &TX_NAME(ff_tx_fft_pfa_ns_def),
1970  &TX_NAME(ff_tx_fft_naive_def),
1971  &TX_NAME(ff_tx_fft_naive_small_def),
1972  &TX_NAME(ff_tx_mdct_fwd_def),
1973  &TX_NAME(ff_tx_mdct_inv_def),
1974  &TX_NAME(ff_tx_mdct_pfa_3xM_fwd_def),
1975  &TX_NAME(ff_tx_mdct_pfa_5xM_fwd_def),
1976  &TX_NAME(ff_tx_mdct_pfa_7xM_fwd_def),
1977  &TX_NAME(ff_tx_mdct_pfa_9xM_fwd_def),
1978  &TX_NAME(ff_tx_mdct_pfa_15xM_fwd_def),
1979  &TX_NAME(ff_tx_mdct_pfa_3xM_inv_def),
1980  &TX_NAME(ff_tx_mdct_pfa_5xM_inv_def),
1981  &TX_NAME(ff_tx_mdct_pfa_7xM_inv_def),
1982  &TX_NAME(ff_tx_mdct_pfa_9xM_inv_def),
1983  &TX_NAME(ff_tx_mdct_pfa_15xM_inv_def),
1984  &TX_NAME(ff_tx_mdct_naive_fwd_def),
1985  &TX_NAME(ff_tx_mdct_naive_inv_def),
1986  &TX_NAME(ff_tx_mdct_inv_full_def),
1987  &TX_NAME(ff_tx_rdft_r2c_def),
1988  &TX_NAME(ff_tx_rdft_c2r_def),
1989  &TX_NAME(ff_tx_dctII_def),
1990  &TX_NAME(ff_tx_dctIII_def),
1991 
1992  NULL,
1993 };
func
int(* func)(AVBPrint *dst, const char *in, const char *arg)
Definition: jacosubdec.c:68
ff_tx_fft_sr_combine
static void TX_NAME() ff_tx_fft_sr_combine(TXComplex *z, const TXSample *cos, int len)
Definition: tx_template.c:559
ff_tx_dct_init
static av_cold int TX_NAME() ff_tx_dct_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1726
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
out
FILE * out
Definition: movenc.c:54
ff_ctz
#define ff_ctz
Definition: intmath.h:107
TRANSFORM
#define TRANSFORM(a0, a1, a2, a3, wre, wim)
Definition: tx_template.c:551
src1
const pixel * src1
Definition: h264pred_template.c:421
AVTXContext
Definition: tx_priv.h:228
ff_tx_fft
static void TX_NAME() ff_tx_fft(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:760
im
float im
Definition: fft.c:79
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
FFTXCodeletOptions
Definition: tx_priv.h:176
w
uint8_t w
Definition: llviddspenc.c:38
M_PI_2
#define M_PI_2
Definition: mathematics.h:55
TX_MAX_DECOMPOSITIONS
#define TX_MAX_DECOMPOSITIONS
Definition: tx_priv.h:190
SR_POW2_TABLES
#define SR_POW2_TABLES
Definition: tx_template.c:30
ff_tx_fft_pfa
static void TX_NAME() ff_tx_fft_pfa(AVTXContext *s, void *_out, void *_in, ptrdiff_t stride)
Definition: tx_template.c:1055
ff_tx_fft16_ns
static void TX_NAME() ff_tx_fft16_ns(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:676
DECL_RDFT
#define DECL_RDFT(name, inv)
Definition: tx_template.c:1642
ff_tx_gen_inplace_map
int ff_tx_gen_inplace_map(AVTXContext *s, int len)
Definition: tx.c:155
t1
#define t1
Definition: regdef.h:29
fft15
static av_always_inline void fft15(TXComplex *out, TXComplex *in, ptrdiff_t stride)
Definition: tx_template.c:462
FF_TX_CPU_FLAGS_ALL
#define FF_TX_CPU_FLAGS_ALL
Definition: tx_priv.h:223
ff_tx_gen_compound_mapping
int ff_tx_gen_compound_mapping(AVTXContext *s, FFTXCodeletOptions *opts, int inv, int n, int m)
Definition: tx.c:74
ff_tx_fft_naive
static void TX_NAME() ff_tx_fft_naive(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:868
av_malloc
#define av_malloc(s)
Definition: tableprint_vlc.h:30
DECL_FFT5
#define DECL_FFT5(NAME, D0, D1, D2, D3, D4)
Definition: tx_template.c:208
ff_tx_mdct_naive_fwd
static void TX_NAME() ff_tx_mdct_naive_fwd(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1141
ff_tx_rdft_init
static av_cold int TX_NAME() ff_tx_rdft_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1598
DECL_SR_CODELET_DEF
#define DECL_SR_CODELET_DEF(n)
Definition: tx_template.c:596
FFTabInitData::func
void(* func)(void)
Definition: tx_template.c:59
sr_tabs_init_funcs
static SR_POW2_TABLES void(*const sr_tabs_init_funcs[])(void)
Definition: tx_template.c:77
TX_NAME
static const FFTXCodelet TX_NAME(ff_tx_fft_def)
FF_TX_MAP_GATHER
@ FF_TX_MAP_GATHER
Definition: tx_priv.h:169
sum_d
static void sum_d(const int *input, int *output, int len)
Definition: dcadct.c:51
TX_INT32
#define TX_INT32
Definition: tx_int32.c:19
sr_tabs_init_once
static AVOnce sr_tabs_init_once[]
Definition: tx_template.c:83
val
static double val(void *priv, double ch)
Definition: aeval.c:77
DECL_FACTOR_F
#define DECL_FACTOR_F(n)
Definition: tx_template.c:514
scale
static av_always_inline float scale(float x, float s)
Definition: vf_v360.c:1389
TX_MAX_SUB
#define TX_MAX_SUB
Definition: tx_priv.h:187
TABLE_DEF
#define TABLE_DEF(name, size)
Definition: tx_template.c:27
FFTXCodelet::type
enum AVTXType type
Definition: tx_priv.h:195
FFTXCodeletOptions::map_dir
FFTXMapDirection map_dir
Definition: tx_priv.h:180
mult
static int16_t mult(Float11 *f1, Float11 *f2)
Definition: g726.c:60
ff_thread_once
static int ff_thread_once(char *control, void(*routine)(void))
Definition: thread.h:184
FF_ARRAY_ELEMS
#define FF_ARRAY_ELEMS(a)
Definition: sinewin_tablegen.c:29
av_cold
#define av_cold
Definition: attributes.h:90
FFTabInitData
Definition: tx_template.c:58
float
float
Definition: af_crystalizer.c:122
c2r
static void c2r(float *buffer, int size)
Definition: af_apsyclip.c:386
s
#define s(width, name)
Definition: cbs_vp9.c:256
ff_tx_fft_factor_init
static av_cold int TX_NAME() ff_tx_fft_factor_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:475
ff_tx_mdct_fwd
static void TX_NAME() ff_tx_mdct_fwd(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1270
t7
#define t7
Definition: regdef.h:35
ff_tx_mdct_naive_init
static av_cold int TX_NAME() ff_tx_mdct_naive_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1129
FF_TX_FORWARD_ONLY
#define FF_TX_FORWARD_ONLY
Definition: tx_priv.h:151
FFTXCodelet::cpu_flags
int cpu_flags
Definition: tx_priv.h:220
DECL_FACTOR_S
#define DECL_FACTOR_S(n)
Definition: tx_template.c:493
if
if(ret)
Definition: filter_design.txt:179
AV_TX_FULL_IMDCT
@ AV_TX_FULL_IMDCT
Performs a full inverse MDCT rather than leaving out samples that can be derived through symmetry.
Definition: tx.h:151
opts
AVDictionary * opts
Definition: movenc.c:50
AV_ONCE_INIT
#define AV_ONCE_INIT
Definition: thread.h:182
fabs
static __device__ float fabs(float a)
Definition: cuda_runtime.h:182
NULL
#define NULL
Definition: coverity.c:32
t5
#define t5
Definition: regdef.h:33
ff_tx_mdct_init
static av_cold int TX_NAME() ff_tx_mdct_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1220
t6
#define t6
Definition: regdef.h:34
AV_TX_INPLACE
@ AV_TX_INPLACE
Allows for in-place transformations, where input == output.
Definition: tx.h:137
ff_tx_gen_ptwo_revtab
int ff_tx_gen_ptwo_revtab(AVTXContext *s, FFTXCodeletOptions *opts)
Definition: tx.c:135
r2c
static void r2c(float *buffer, int size)
Definition: af_apsyclip.c:377
FF_TX_OUT_OF_PLACE
#define FF_TX_OUT_OF_PLACE
Definition: tx_priv.h:147
CMUL3
#define CMUL3(c, a, b)
Definition: tx_priv.h:143
AV_TX_UNALIGNED
@ AV_TX_UNALIGNED
Relaxes alignment requirement for the in and out arrays of av_tx_fn().
Definition: tx.h:143
exp
int8_t exp
Definition: eval.c:72
ff_tx_dctIII
static void TX_NAME() ff_tx_dctIII(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1829
DECL_COMP_MDCT
#define DECL_COMP_MDCT(N)
Definition: tx_template.c:1530
AVOnce
#define AVOnce
Definition: thread.h:181
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
tab
static const uint8_t tab[16]
Definition: rka.c:668
ff_tx_fft_pfa_init
static av_cold int TX_NAME() ff_tx_fft_pfa_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:945
ff_tx_clear_ctx
void ff_tx_clear_ctx(AVTXContext *s)
Definition: tx.c:289
ff_tx_fft2_ns
static void TX_NAME() ff_tx_fft2_ns(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:628
FF_TX_PRESHUFFLE
#define FF_TX_PRESHUFFLE
Definition: tx_priv.h:149
ff_tx_fft_sr_codelet_init
static av_cold int TX_NAME() ff_tx_fft_sr_codelet_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:585
ff_tx_gen_default_map
int ff_tx_gen_default_map(AVTXContext *s, FFTXCodeletOptions *opts)
Definition: tx.c:522
f
f
Definition: af_crystalizer.c:122
ff_tx_init_tab_53
static av_cold void TX_TAB() ff_tx_init_tab_53(void)
Definition: tx_template.c:89
dc
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled top and top right vectors is used as motion vector prediction the used motion vector is the sum of the predictor and(mvx_diff, mvy_diff) *mv_scale Intra DC Prediction block[y][x] dc[1]
Definition: snow.txt:400
FF_TX_PRIO_BASE
@ FF_TX_PRIO_BASE
Definition: tx_priv.h:155
for
for(k=2;k<=8;++k)
Definition: h264pred_template.c:425
ff_tx_fft8_ns
static void TX_NAME() ff_tx_fft8_ns(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:657
fft9
static av_always_inline void fft9(TXComplex *out, TXComplex *in, ptrdiff_t stride)
Definition: tx_template.c:337
t8
#define t8
Definition: regdef.h:53
BF
#define BF(a, b, c, s)
Definition: dct32_template.c:90
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
TX_EMBED_INPUT_PFA_MAP
#define TX_EMBED_INPUT_PFA_MAP(map, tot_len, d1, d2)
Definition: tx_priv.h:264
ff_tx_fft_inplace
static void TX_NAME() ff_tx_fft_inplace(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:777
M_PI
#define M_PI
Definition: mathematics.h:52
ff_tx_fft_init
static av_cold int TX_NAME() ff_tx_fft_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:721
TXComplex
void TXComplex
Definition: tx_priv.h:64
ff_tx_mdct_inv
static void TX_NAME() ff_tx_mdct_inv(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1309
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:269
t4
#define t4
Definition: regdef.h:32
t3
#define t3
Definition: regdef.h:31
av_malloc_array
#define av_malloc_array(a, b)
Definition: tableprint_vlc.h:31
nptwo_tabs_init_once
static AVOnce nptwo_tabs_init_once[]
Definition: tx_template.c:136
av_always_inline
#define av_always_inline
Definition: attributes.h:49
ff_tx_fft_init_naive_small
static av_cold int TX_NAME() ff_tx_fft_init_naive_small(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:843
DECL_SR_CODELET
#define DECL_SR_CODELET(n, n2, n4)
Definition: tx_template.c:612
DECL_COMP_IMDCT
#define DECL_COMP_IMDCT(N)
Definition: tx_template.c:1468
av_mallocz
void * av_mallocz(size_t size)
Allocate a memory block with alignment suitable for all memory accesses (including vectors if available on the CPU) and zero all the bytes of the block.
Definition: mem.c:254
len
int len
Definition: vorbis_enc_data.h:426
fft3
static av_always_inline void fft3(TXComplex *out, TXComplex *in, ptrdiff_t stride)
Definition: tx_template.c:171
FF_TX_MAP_SCATTER
@ FF_TX_MAP_SCATTER
Definition: tx_priv.h:172
TX_LEN_UNLIMITED
#define TX_LEN_UNLIMITED
Definition: tx_priv.h:209
stride
#define stride
Definition: h264pred_template.c:537
nptwo_tabs_init_data
static const FFTabInitData nptwo_tabs_init_data[]
Definition: tx_template.c:130
ret
ret
Definition: filter_design.txt:187
ff_tx_init_subtx
av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx.c:698
FFSWAP
#define FFSWAP(type, a, b)
Definition: macros.h:52
ff_tx_init_tab_7
static av_cold void TX_TAB() ff_tx_init_tab_7(void)
Definition: tx_template.c:108
TX_FACTOR_ANY
#define TX_FACTOR_ANY
Definition: tx_priv.h:202
FF_TX_INVERSE_ONLY
#define FF_TX_INVERSE_ONLY
Definition: tx_priv.h:150
ff_tx_fft_naive_small
static void TX_NAME() ff_tx_fft_naive_small(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:895
ff_tx_init_tab_9
static av_cold void TX_TAB() ff_tx_init_tab_9(void)
Definition: tx_template.c:118
FFTXCodelet
Definition: tx_priv.h:192
ff_tx_init_tabs
av_cold void TX_TAB() ff_tx_init_tabs(int len)
Definition: tx_template.c:142
t2
#define t2
Definition: regdef.h:30
ff_tx_mdct_naive_inv
static void TX_NAME() ff_tx_mdct_naive_inv(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1162
FFTabInitData::factors
int factors[TX_MAX_SUB]
Definition: tx_template.c:60
ff_tx_dctII
static void TX_NAME() ff_tx_dctII(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1769
BUTTERFLIES
#define BUTTERFLIES(a0, a1, a2, a3)
Definition: tx_template.c:537
ff_tx_fft_pfa_ns
static void TX_NAME() ff_tx_fft_pfa_ns(AVTXContext *s, void *_out, void *_in, ptrdiff_t stride)
Definition: tx_template.c:1079
src0
const pixel *const src0
Definition: h264pred_template.c:420
FFTXCodelet::name
const char * name
Definition: tx_priv.h:193
factor
static const int factor[16]
Definition: vf_pp7.c:76
ff_tx_fft_inplace_small_init
static av_cold int TX_NAME() ff_tx_fft_inplace_small_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:747
map
const VDPAUPixFmtMap * map
Definition: hwcontext_vdpau.c:71
alpha
static const int16_t alpha[]
Definition: ilbcdata.h:55
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
fft7
static av_always_inline void fft7(TXComplex *out, TXComplex *in, ptrdiff_t stride)
Definition: tx_template.c:249
int32_t
int32_t
Definition: audioconvert.c:56
flags
#define flags(name, subs,...)
Definition: cbs_av1.c:561
ff_tx_mdct_gen_exp
int TX_TAB() ff_tx_mdct_gen_exp(AVTXContext *s, int *pre_tab)
Definition: tx_template.c:1900
ff_tx_gen_pfa_input_map
int ff_tx_gen_pfa_input_map(AVTXContext *s, FFTXCodeletOptions *opts, int d1, int d2)
Definition: tx.c:43
ff_tx_mdct_pfa_init
static av_cold int TX_NAME() ff_tx_mdct_pfa_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1422
ff_tx_fft4_ns
static void TX_NAME() ff_tx_fft4_ns(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:640
ff_tx_mdct_inv_full_init
static av_cold int TX_NAME() ff_tx_mdct_inv_full_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_template.c:1369
ff_tx_decompose_length
int ff_tx_decompose_length(int dst[TX_MAX_DECOMPOSITIONS], enum AVTXType type, int len, int inv)
Definition: tx.c:411
CMUL
#define CMUL(dre, dim, are, aim, bre, bim)
Definition: fft-internal.h:35
TX_TYPE
#define TX_TYPE
Definition: aacdec.c:36
re
float re
Definition: fft.c:79
ff_tx_mdct_inv_full
static void TX_NAME() ff_tx_mdct_inv_full(AVTXContext *s, void *_dst, void *_src, ptrdiff_t stride)
Definition: tx_template.c:1389
FF_TX_PRIO_MIN
@ FF_TX_PRIO_MIN
Definition: tx_priv.h:160