FFmpeg
tx_float_init.c
Go to the documentation of this file.
1 /*
2  * This file is part of FFmpeg.
3  *
4  * FFmpeg is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public
6  * License as published by the Free Software Foundation; either
7  * version 2.1 of the License, or (at your option) any later version.
8  *
9  * FFmpeg is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  * Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public
15  * License along with FFmpeg; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 
19 #define TX_FLOAT
20 #include "libavutil/tx_priv.h"
21 #include "libavutil/attributes.h"
22 #include "libavutil/mem.h"
23 #include "libavutil/x86/cpu.h"
24 
25 #include "config.h"
26 
/*
 * Per-function, per-CPU-extension declarations made through the
 * TX_DECL_FN(fn, suffix) macro from tx_priv.h.  The same (fn, suffix) pairs
 * are registered in the TX_DEF codelet table later in this file.
 * NOTE(review): the macro expansion is not visible here; presumably each line
 * declares the externally implemented transform function and its codelet
 * definition -- confirm against tx_priv.h.
 *
 * The *_ns names are the ones the table registers with FF_TX_PRESHUFFLE
 * (e.g. fft8_ns, fft16_ns).
 */
27 TX_DECL_FN(fft2, sse3)
28 TX_DECL_FN(fft4_fwd, sse2)
29 TX_DECL_FN(fft4_inv, sse2)
30 TX_DECL_FN(fft8, sse3)
31 TX_DECL_FN(fft8_ns, sse3)
32 TX_DECL_FN(fft8, avx)
33 TX_DECL_FN(fft8_ns, avx)
34 TX_DECL_FN(fft15, avx2)
35 TX_DECL_FN(fft15_ns, avx2)
36 TX_DECL_FN(fft16, avx)
37 TX_DECL_FN(fft16_ns, avx)
38 TX_DECL_FN(fft16, fma3)
39 TX_DECL_FN(fft16_ns, fma3)
40 TX_DECL_FN(fft32, avx)
41 TX_DECL_FN(fft32_ns, avx)
42 TX_DECL_FN(fft32, fma3)
43 TX_DECL_FN(fft32_ns, fma3)
44 TX_DECL_FN(fft_sr, avx)
45 TX_DECL_FN(fft_sr_ns, avx)
46 TX_DECL_FN(fft_sr, fma3)
47 TX_DECL_FN(fft_sr_ns, fma3)
48 TX_DECL_FN(fft_sr, avx2)
49 TX_DECL_FN(fft_sr_ns, avx2)
50 
/* Compound 15xM transforms; initialised by fft_pfa_init in the table. */
51 TX_DECL_FN(fft_pfa_15xM, avx2)
52 TX_DECL_FN(fft_pfa_15xM_ns, avx2)
53 
/* Inverse MDCT; initialised by m_inv_init in the table. */
54 TX_DECL_FN(mdct_inv, avx2)
55 
/*
 * *_asm variants.
 * NOTE(review): presumably the entry points selected when a caller requests
 * FF_TX_ASM_CALL, as m_inv_init and fft_pfa_init do -- confirm against the
 * full codelet table.
 */
56 TX_DECL_FN(fft2_asm, sse3)
57 TX_DECL_FN(fft4_fwd_asm, sse2)
58 TX_DECL_FN(fft4_inv_asm, sse2)
59 TX_DECL_FN(fft8_asm, sse3)
60 TX_DECL_FN(fft8_asm, avx)
61 TX_DECL_FN(fft16_asm, avx)
62 TX_DECL_FN(fft16_asm, fma3)
63 TX_DECL_FN(fft32_asm, avx)
64 TX_DECL_FN(fft32_asm, fma3)
65 TX_DECL_FN(fft_sr_asm, avx)
66 TX_DECL_FN(fft_sr_asm, fma3)
67 TX_DECL_FN(fft_sr_asm, avx2)
68 
69 TX_DECL_FN(fft_pfa_15xM_asm, avx2)
70 
71 #define DECL_INIT_FN(basis, interleave) \
72 static av_cold int b ##basis## _i ##interleave(AVTXContext *s, \
73  const FFTXCodelet *cd, \
74  uint64_t flags, \
75  FFTXCodeletOptions *opts, \
76  int len, int inv, \
77  const void *scale) \
78 { \
79  ff_tx_init_tabs_float(len); \
80  if (cd->max_len == 2) \
81  return ff_tx_gen_ptwo_revtab(s, opts); \
82  else \
83  return ff_tx_gen_split_radix_parity_revtab(s, len, inv, opts, \
84  basis, interleave); \
85 }
86 
87 DECL_INIT_FN(8, 0)
88 DECL_INIT_FN(8, 2)
89 
90 static av_cold int factor_init(AVTXContext *s, const FFTXCodelet *cd,
91  uint64_t flags, FFTXCodeletOptions *opts,
92  int len, int inv, const void *scale)
93 {
94  int ret;
95 
96  /* The transformations below are performed in the gather domain,
97  * so override the option and let the infrastructure convert the map
98  * to SCATTER if needed. */
100 
101  TX_TAB(ff_tx_init_tabs)(len);
102 
103  if (len == 15)
104  ret = ff_tx_gen_pfa_input_map(s, &sub_opts, 3, 5);
105  else
106  ret = ff_tx_gen_default_map(s, &sub_opts);
107 
108  if (ret < 0)
109  return ret;
110 
111  if (len == 15) {
112  int cnt = 0, tmp[15];
113 
114  /* Special permutation to simplify loads in the pre-permuted version */
115  memcpy(tmp, s->map, 15*sizeof(*tmp));
116  for (int i = 1; i < 15; i += 3) {
117  s->map[cnt] = tmp[i];
118  cnt++;
119  }
120  for (int i = 2; i < 15; i += 3) {
121  s->map[cnt] = tmp[i];
122  cnt++;
123  }
124  for (int i = 0; i < 15; i += 3) {
125  s->map[cnt] = tmp[i];
126  cnt++;
127  }
128  memmove(&s->map[7], &s->map[6], 4*sizeof(int));
129  memmove(&s->map[3], &s->map[1], 4*sizeof(int));
130  s->map[1] = tmp[2];
131  s->map[2] = tmp[0];
132  }
133 
134  return 0;
135 }
136 
137 static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd,
138  uint64_t flags, FFTXCodeletOptions *opts,
139  int len, int inv, const void *scale)
140 {
141  int ret;
142  FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_GATHER };
143 
144  s->scale_d = *((SCALE_TYPE *)scale);
145  s->scale_f = s->scale_d;
146 
147  flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
148  flags |= AV_TX_INPLACE; /* in-place */
149  flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
150  flags |= FF_TX_ASM_CALL; /* We want an assembly function, not C */
151 
152  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts, len >> 1,
153  inv, scale)))
154  return ret;
155 
156  s->map = av_malloc(len*sizeof(*s->map));
157  if (!s->map)
158  return AVERROR(ENOMEM);
159 
160  memcpy(s->map, s->sub->map, (len >> 1)*sizeof(*s->map));
161  /* Invert lookup table for unstrided path */
162  for (int i = 0; i < (len >> 1); i++)
163  s->map[(len >> 1) + s->map[i]] = i;
164 
165  if ((ret = ff_tx_mdct_gen_exp_float(s, s->map)))
166  return ret;
167 
168  return 0;
169 }
170 
172  const FFTXCodelet *cd,
173  uint64_t flags,
175  int len, int inv,
176  const void *scale)
177 {
178  int ret;
179  int sub_len = len / cd->factors[0];
180  FFTXCodeletOptions sub_opts = { .map_dir = FF_TX_MAP_SCATTER };
181 
182  flags &= ~FF_TX_OUT_OF_PLACE; /* We want the subtransform to be */
183  flags |= AV_TX_INPLACE; /* in-place */
184  flags |= FF_TX_PRESHUFFLE; /* This function handles the permute step */
185  flags |= FF_TX_ASM_CALL; /* We want an assembly function, not C */
186 
187  if ((ret = ff_tx_init_subtx(s, TX_TYPE(FFT), flags, &sub_opts,
188  sub_len, inv, scale)))
189  return ret;
190 
191  if ((ret = ff_tx_gen_compound_mapping(s, opts, s->inv, cd->factors[0], sub_len)))
192  return ret;
193 
194  if (cd->factors[0] == 15) {
195  int tmp[15];
196 
197  /* Our 15-point transform is also a compound one, so embed its input map */
198  TX_EMBED_INPUT_PFA_MAP(s->map, len, 3, 5);
199 
200  /* Special permutation to simplify loads in the pre-permuted version */
201  for (int k = 0; k < s->sub[0].len; k++) {
202  int cnt = 0;
203  memcpy(tmp, &s->map[k*15], 15*sizeof(*tmp));
204  for (int i = 1; i < 15; i += 3) {
205  s->map[k*15 + cnt] = tmp[i];
206  cnt++;
207  }
208  for (int i = 2; i < 15; i += 3) {
209  s->map[k*15 + cnt] = tmp[i];
210  cnt++;
211  }
212  for (int i = 0; i < 15; i += 3) {
213  s->map[k*15 + cnt] = tmp[i];
214  cnt++;
215  }
216  memmove(&s->map[k*15 + 7], &s->map[k*15 + 6], 4*sizeof(int));
217  memmove(&s->map[k*15 + 3], &s->map[k*15 + 1], 4*sizeof(int));
218  s->map[k*15 + 1] = tmp[2];
219  s->map[k*15 + 2] = tmp[0];
220  }
221  }
222 
223  if (!(s->tmp = av_malloc(len*sizeof(*s->tmp))))
224  return AVERROR(ENOMEM);
225 
226  TX_TAB(ff_tx_init_tabs)(len / sub_len);
227 
228  return 0;
229 }
230 
/*
 * Entries of the x86 float codelet table.  Per the TX_DEF prototype the
 * arguments are, in order: fn, tx_type, len_min, len_max, f1, f2, p,
 * init_fn, suffix, cf, cd_flags, cf2.  The list is NULL-terminated.
 *
 * NOTE(review): this listing is incomplete.  The array header (the
 * cross-reference names it `const FFTXCodelet *const
 * ff_tx_codelet_list_float_x86[]`) is not present above the first entry, and
 * every entry whose line ends with a comma instead of `),` continues on a
 * line that is missing here, so its remaining cd_flags / cf2 arguments cannot
 * be read from this text.  Restore those lines from the original file rather
 * than guessing the flag values.
 */
232  TX_DEF(fft2, FFT, 2, 2, 2, 0, 128, NULL, sse3, SSE3, AV_TX_INPLACE, 0),
233  TX_DEF(fft2_asm, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3,
235  TX_DEF(fft2, FFT, 2, 2, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
236  TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_FORWARD_ONLY, 0),
237  TX_DEF(fft4_fwd_asm, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2,
239  TX_DEF(fft4_inv_asm, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2,
241  TX_DEF(fft4_fwd, FFT, 4, 4, 2, 0, 192, b8_i0, sse2, SSE2, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
242  TX_DEF(fft4_inv, FFT, 4, 4, 2, 0, 128, NULL, sse2, SSE2, AV_TX_INPLACE | FF_TX_INVERSE_ONLY, 0),
243  TX_DEF(fft8, FFT, 8, 8, 2, 0, 128, b8_i0, sse3, SSE3, AV_TX_INPLACE, 0),
244  TX_DEF(fft8_asm, FFT, 8, 8, 2, 0, 192, b8_i0, sse3, SSE3,
246  TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 192, b8_i0, sse3, SSE3, AV_TX_INPLACE | FF_TX_PRESHUFFLE, 0),
247  TX_DEF(fft8, FFT, 8, 8, 2, 0, 256, b8_i0, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
248  TX_DEF(fft8_asm, FFT, 8, 8, 2, 0, 320, b8_i0, avx, AVX,
250  TX_DEF(fft8_ns, FFT, 8, 8, 2, 0, 320, b8_i0, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
252  TX_DEF(fft16, FFT, 16, 16, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
253  TX_DEF(fft16_asm, FFT, 16, 16, 2, 0, 320, b8_i2, avx, AVX,
255  TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
257  TX_DEF(fft16, FFT, 16, 16, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
258  TX_DEF(fft16_asm, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3,
260  TX_DEF(fft16_ns, FFT, 16, 16, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
262 
/* The 32-point and split-radix (64 .. 2097152 point) entries are only
 * compiled when ARCH_X86_64 is set. */
263 #if ARCH_X86_64
264  TX_DEF(fft32, FFT, 32, 32, 2, 0, 256, b8_i2, avx, AVX, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
265  TX_DEF(fft32_asm, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX,
267  TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
269  TX_DEF(fft32, FFT, 32, 32, 2, 0, 288, b8_i2, fma3, FMA3, AV_TX_INPLACE, AV_CPU_FLAG_AVXSLOW),
270  TX_DEF(fft32_asm, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3,
272  TX_DEF(fft32_ns, FFT, 32, 32, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
274  TX_DEF(fft_sr, FFT, 64, 2097152, 2, 0, 256, b8_i2, avx, AVX, 0, AV_CPU_FLAG_AVXSLOW),
275  TX_DEF(fft_sr_asm, FFT, 64, 2097152, 2, 0, 320, b8_i2, avx, AVX,
277  TX_DEF(fft_sr_ns, FFT, 64, 2097152, 2, 0, 320, b8_i2, avx, AVX, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
279  TX_DEF(fft_sr, FFT, 64, 2097152, 2, 0, 288, b8_i2, fma3, FMA3, 0, AV_CPU_FLAG_AVXSLOW),
280  TX_DEF(fft_sr_asm, FFT, 64, 2097152, 2, 0, 352, b8_i2, fma3, FMA3,
282  TX_DEF(fft_sr_ns, FFT, 64, 2097152, 2, 0, 352, b8_i2, fma3, FMA3, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
284 
/* AVX2 entries (15-point, split-radix, 15xM compound and inverse MDCT) are
 * additionally gated on HAVE_AVX2_EXTERNAL. */
285 #if HAVE_AVX2_EXTERNAL
286  TX_DEF(fft15, FFT, 15, 15, 15, 0, 320, factor_init, avx2, AVX2,
288  TX_DEF(fft15_ns, FFT, 15, 15, 15, 0, 384, factor_init, avx2, AVX2,
290 
291  TX_DEF(fft_sr, FFT, 64, 2097152, 2, 0, 320, b8_i2, avx2, AVX2, 0,
293  TX_DEF(fft_sr_asm, FFT, 64, 2097152, 2, 0, 384, b8_i2, avx2, AVX2,
295  TX_DEF(fft_sr_ns, FFT, 64, 2097152, 2, 0, 384, b8_i2, avx2, AVX2, AV_TX_INPLACE | FF_TX_PRESHUFFLE,
297 
298  TX_DEF(fft_pfa_15xM, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 320, fft_pfa_init, avx2, AVX2,
300  TX_DEF(fft_pfa_15xM_asm, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 384, fft_pfa_init, avx2, AVX2,
302  TX_DEF(fft_pfa_15xM_ns, FFT, 60, TX_LEN_UNLIMITED, 15, 2, 384, fft_pfa_init, avx2, AVX2,
304 
305  TX_DEF(mdct_inv, MDCT, 16, TX_LEN_UNLIMITED, 2, TX_FACTOR_ANY, 384, m_inv_init, avx2, AVX2,
307 #endif
308 #endif
309 
310  NULL,
311 };
cpu.h
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism to try again later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
TX_TYPE
#define TX_TYPE
Definition: afir_template.c:49
AVTXContext
Definition: tx_priv.h:235
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
FFTXCodeletOptions
Definition: tx_priv.h:183
TX_DEF
#define TX_DEF(fn, tx_type, len_min, len_max, f1, f2, p, init_fn, suffix, cf, cd_flags, cf2)
Definition: tx_priv.h:71
fft15
static av_always_inline void fft15(TXComplex *out, TXComplex *in, ptrdiff_t stride)
Definition: tx_template.c:469
ff_tx_gen_compound_mapping
int ff_tx_gen_compound_mapping(AVTXContext *s, FFTXCodeletOptions *opts, int inv, int n, int m)
Definition: tx.c:75
av_malloc
#define av_malloc(s)
Definition: tableprint_vlc.h:30
FF_TX_MAP_GATHER
@ FF_TX_MAP_GATHER
Definition: tx_priv.h:176
FFTXCodeletOptions::map_dir
FFTXMapDirection map_dir
Definition: tx_priv.h:187
AV_CPU_FLAG_SLOW_GATHER
#define AV_CPU_FLAG_SLOW_GATHER
CPU has slow gathers.
Definition: cpu.h:58
FFTXCodelet::factors
int factors[TX_MAX_FACTORS]
Definition: tx_priv.h:208
av_cold
#define av_cold
Definition: attributes.h:90
s
#define s(width, name)
Definition: cbs_vp9.c:198
ff_tx_mdct_gen_exp_float
int ff_tx_mdct_gen_exp_float(AVTXContext *s, int *pre_tab)
FF_TX_FORWARD_ONLY
#define FF_TX_FORWARD_ONLY
Definition: tx_priv.h:158
factor_init
static av_cold int factor_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_float_init.c:90
opts
AVDictionary * opts
Definition: movenc.c:51
NULL
#define NULL
Definition: coverity.c:32
AV_TX_INPLACE
@ AV_TX_INPLACE
Allows for in-place transformations, where input == output.
Definition: tx.h:161
FF_TX_OUT_OF_PLACE
#define FF_TX_OUT_OF_PLACE
Definition: tx_priv.h:154
FF_TX_PRESHUFFLE
#define FF_TX_PRESHUFFLE
Definition: tx_priv.h:156
ff_tx_gen_default_map
int ff_tx_gen_default_map(AVTXContext *s, FFTXCodeletOptions *opts)
Definition: tx.c:525
AV_CPU_FLAG_AVXSLOW
#define AV_CPU_FLAG_AVXSLOW
AVX supported, but slow when using YMM registers (e.g. Bulldozer)
Definition: cpu.h:48
attributes.h
TX_EMBED_INPUT_PFA_MAP
#define TX_EMBED_INPUT_PFA_MAP(map, tot_len, d1, d2)
Definition: tx_priv.h:271
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
len
int len
Definition: vorbis_enc_data.h:426
FF_TX_MAP_SCATTER
@ FF_TX_MAP_SCATTER
Definition: tx_priv.h:179
TX_LEN_UNLIMITED
#define TX_LEN_UNLIMITED
Definition: tx_priv.h:216
tx_priv.h
ret
ret
Definition: filter_design.txt:187
ff_tx_init_subtx
av_cold int ff_tx_init_subtx(AVTXContext *s, enum AVTXType type, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx.c:712
TX_FACTOR_ANY
#define TX_FACTOR_ANY
Definition: tx_priv.h:209
FF_TX_INVERSE_ONLY
#define FF_TX_INVERSE_ONLY
Definition: tx_priv.h:157
fft_pfa_init
static av_cold int fft_pfa_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_float_init.c:171
FFTXCodelet
Definition: tx_priv.h:199
ff_tx_init_tabs
av_cold void TX_TAB() ff_tx_init_tabs(int len)
Definition: tx_template.c:148
m_inv_init
static av_cold int m_inv_init(AVTXContext *s, const FFTXCodelet *cd, uint64_t flags, FFTXCodeletOptions *opts, int len, int inv, const void *scale)
Definition: tx_float_init.c:137
FF_TX_ASM_CALL
#define FF_TX_ASM_CALL
Definition: tx_priv.h:159
mem.h
scale
static void scale(int *out, const int *in, const int w, const int h, const int shift)
Definition: intra.c:291
ff_tx_codelet_list_float_x86
const FFTXCodelet *const ff_tx_codelet_list_float_x86[]
Definition: tx_float_init.c:231
DECL_INIT_FN
#define DECL_INIT_FN(basis, interleave)
Definition: tx_float_init.c:71
flags
#define flags(name, subs,...)
Definition: cbs_av1.c:474
ff_tx_gen_pfa_input_map
int ff_tx_gen_pfa_input_map(AVTXContext *s, FFTXCodeletOptions *opts, int d1, int d2)
Definition: tx.c:44
TX_DECL_FN
#define TX_DECL_FN(fn, suffix)
Definition: tx_priv.h:68