mpegaudiodsp.c
/*
 * SIMD-optimized MP3 decoding functions
 * Copyright (c) 2010 Vitor Sessak
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegaudiodsp.h"

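/* For a given instruction-set suffix, declare the static dispatcher defined
 * further below and the external assembly kernel computing one 36-point
 * IMDCT. */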
#define DECL(CPU)\
static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);

#if ARCH_X86_32
DECL(sse)
#endif
DECL(sse2)
DECL(sse3)
DECL(ssse3)
DECL(avx)

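/* Assembly kernels that compute four 36-point IMDCTs per call; they take one
 * of the interleaved window tables built into mdct_win_sse below plus a
 * scratch buffer. */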
void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);

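/* Window tables for the four-block kernels, filled in at init time from
 * ff_mdct_win_float with the values interleaved four-wide for SIMD loads.
 * Entry [1] holds the variant used at a block-switch point. */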
DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];

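/* Multiply-accumulate helpers and an 8-tap sum that matches the 64-float
 * stride used by the synthesis window and the sample buffer. */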
#if HAVE_6REGS && HAVE_SSE_INLINE

#define MACS(rt, ra, rb) rt+=(ra)*(rb)
#define MLSS(rt, ra, rb) rt-=(ra)*(rb)

#define SUM8(op, sum, w, p)               \
{                                         \
    op(sum, (w)[0 * 64], (p)[0 * 64]);    \
    op(sum, (w)[1 * 64], (p)[1 * 64]);    \
    op(sum, (w)[2 * 64], (p)[2 * 64]);    \
    op(sum, (w)[3 * 64], (p)[3 * 64]);    \
    op(sum, (w)[4 * 64], (p)[4 * 64]);    \
    op(sum, (w)[5 * 64], (p)[5 * 64]);    \
    op(sum, (w)[6 * 64], (p)[6 * 64]);    \
    op(sum, (w)[7 * 64], (p)[7 * 64]);    \
}

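/* For each group of four output samples, compute two negated 8-tap windowed
 * sums (one per window) with SSE.  The loop counter starts at -4*len bytes
 * and counts up to zero, so every pointer is pre-advanced by len elements and
 * indexed with the negative offset. */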
static void apply_window(const float *buf, const float *win1,
                         const float *win2, float *sum1, float *sum2, int len)
{
    x86_reg count = - 4*len;
    const float *win1a = win1+len;
    const float *win2a = win2+len;
    const float *bufa  = buf+len;
    float *sum1a = sum1+len;
    float *sum2a = sum2+len;


#define MULT(a, b)                              \
    "movaps " #a "(%1,%0), %%xmm1 \n\t"         \
    "movaps " #a "(%3,%0), %%xmm2 \n\t"         \
    "mulps         %%xmm2, %%xmm1 \n\t"         \
    "subps         %%xmm1, %%xmm0 \n\t"         \
    "mulps  " #b "(%2,%0), %%xmm2 \n\t"         \
    "subps         %%xmm2, %%xmm4 \n\t"

    __asm__ volatile(
            "1:                        \n\t"
            "xorps     %%xmm0, %%xmm0  \n\t"
            "xorps     %%xmm4, %%xmm4  \n\t"

            MULT(   0,   0)
            MULT( 256,  64)
            MULT( 512, 128)
            MULT( 768, 192)
            MULT(1024, 256)
            MULT(1280, 320)
            MULT(1536, 384)
            MULT(1792, 448)

            "movaps    %%xmm0, (%4,%0) \n\t"
            "movaps    %%xmm4, (%5,%0) \n\t"
            "add          $16,  %0     \n\t"
            "jl 1b                     \n\t"
            :"+&r"(count)
            :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
            );

#undef MULT
}

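/* SSE implementation of the MP3 synthesis window: produces the 32 output
 * samples of one synthesis step, writing them either contiguously
 * (incr == 1) or strided. */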
static void apply_window_mp3(float *in, float *win, int *unused, float *out,
                             int incr)
{
    LOCAL_ALIGNED_16(float, suma, [17]);
    LOCAL_ALIGNED_16(float, sumb, [17]);
    LOCAL_ALIGNED_16(float, sumc, [17]);
    LOCAL_ALIGNED_16(float, sumd, [17]);

    float sum;

    /* copy to avoid wrap */
    __asm__ volatile(
            "movaps    0(%0), %%xmm0   \n\t"
            "movaps   16(%0), %%xmm1   \n\t"
            "movaps   32(%0), %%xmm2   \n\t"
            "movaps   48(%0), %%xmm3   \n\t"
            "movaps   %%xmm0,   0(%1)  \n\t"
            "movaps   %%xmm1,  16(%1)  \n\t"
            "movaps   %%xmm2,  32(%1)  \n\t"
            "movaps   %%xmm3,  48(%1)  \n\t"
            "movaps   64(%0), %%xmm0   \n\t"
            "movaps   80(%0), %%xmm1   \n\t"
            "movaps   96(%0), %%xmm2   \n\t"
            "movaps  112(%0), %%xmm3   \n\t"
            "movaps   %%xmm0,  64(%1)  \n\t"
            "movaps   %%xmm1,  80(%1)  \n\t"
            "movaps   %%xmm2,  96(%1)  \n\t"
            "movaps   %%xmm3, 112(%1)  \n\t"
            ::"r"(in), "r"(in+512)
            :"memory"
            );

    apply_window(in + 16, win     , win + 512, suma, sumc, 16);
    apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);

    SUM8(MACS, suma[0], win + 32, in + 48);

    sumc[ 0] = 0;
    sumb[16] = 0;
    sumd[16] = 0;

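/* Produce eight output samples from the partial sums: sumd and sumc are
 * loaded, reversed with shufps, then combined as reverse(sumd) - suma and
 * reverse(sumc) + sumb, matching the scalar fallback below. */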
#define SUMS(suma, sumb, sumc, sumd, out1, out2)       \
            "movups " #sumd "(%4),      %%xmm0  \n\t"  \
            "shufps     $0x1b, %%xmm0,  %%xmm0  \n\t"  \
            "subps  " #suma "(%1),      %%xmm0  \n\t"  \
            "movaps    %%xmm0, " #out1 "(%0)    \n\t"  \
\
            "movups " #sumc "(%3),      %%xmm0  \n\t"  \
            "shufps     $0x1b, %%xmm0,  %%xmm0  \n\t"  \
            "addps  " #sumb "(%2),      %%xmm0  \n\t"  \
            "movaps    %%xmm0, " #out2 "(%0)    \n\t"

    if (incr == 1) {
        __asm__ volatile(
            SUMS( 0, 48,  4, 52,  0, 112)
            SUMS(16, 32, 20, 36, 16,  96)
            SUMS(32, 16, 36, 20, 32,  80)
            SUMS(48,  0, 52,  4, 48,  64)

            :"+&r"(out)
            :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
            :"memory"
            );
        out += 16*incr;
    } else {
        int j;
        float *out2 = out + 32 * incr;
        out[0] = -suma[0];
        out  += incr;
        out2 -= incr;
        for (j = 1; j < 16; j++) {
            *out  = -suma[j]    + sumd[16-j];
            *out2 =  sumb[16-j] + sumc[j];
            out  += incr;
            out2 -= incr;
        }
    }

    sum = 0;
    SUM8(MLSS, sum, win + 16 + 32, in + 32);
    *out = sum;
}

#endif /* HAVE_6REGS && HAVE_SSE_INLINE */

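/* Dispatcher generator: run the four-block assembly kernel on groups of four
 * IMDCT36 blocks, then fall back to the single-block kernel for the
 * remainder, selecting the per-block window the same way the C code does. */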
#if HAVE_YASM
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,     \
                                    int count, int switch_point, int block_type) \
{                                                                           \
    int align_end = count - (count & 3);                                    \
    int j;                                                                  \
    for (j = 0; j < align_end; j += 4) {                                    \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                            \
        float *win = mdct_win_sse[switch_point && j < 4][block_type];       \
        /* apply window & overlap with previous buffer */                   \
                                                                            \
        /* select window */                                                 \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);          \
        in  += 4*18;                                                        \
        buf += 4*18;                                                        \
        out += 4;                                                           \
    }                                                                       \
    for (; j < count; j++) {                                                \
        /* apply window & overlap with previous buffer */                   \
                                                                            \
        /* select window */                                                 \
        int win_idx = (switch_point && j < 2) ? 0 : block_type;             \
        float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))];           \
                                                                            \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                       \
                                                                            \
        in += 18;                                                           \
        buf++;                                                              \
        out++;                                                              \
    }                                                                       \
}

#if HAVE_SSE
#if ARCH_X86_32
DECL_IMDCT_BLOCKS(sse,sse)
#endif
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif /* HAVE_YASM */

av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
{
    int cpu_flags = av_get_cpu_flags();

    int i, j;
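    /* Build the four-wide window tables: lane k of each entry holds the
     * window value for block k of a group, so even lanes use
     * ff_mdct_win_float[j] and odd lanes the [j + 4] variant.  Entry [1]
     * uses the long window for the first two lanes, as needed at a
     * block-switch point. */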
    for (j = 0; j < 4; j++) {
        for (i = 0; i < 40; i ++) {
            mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
            mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
            mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
            mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
            mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
            mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
        }
    }

#if HAVE_6REGS && HAVE_SSE_INLINE
    if (INLINE_SSE(cpu_flags)) {
        s->apply_window_float = apply_window_mp3;
    }
#endif /* HAVE_6REGS && HAVE_SSE_INLINE */

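    /* The checks below go from least to most capable flavor; a later match
     * overwrites the earlier pointer, so the best supported version wins. */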
#if HAVE_YASM
#if HAVE_SSE
#if ARCH_X86_32
    if (EXTERNAL_SSE(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse;
    }
#endif
    if (EXTERNAL_SSE2(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse2;
    }
    if (EXTERNAL_SSE3(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_sse3;
    }
    if (EXTERNAL_SSSE3(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_ssse3;
    }
#endif
#if HAVE_AVX_EXTERNAL
    if (EXTERNAL_AVX(cpu_flags)) {
        s->imdct36_blocks_float = imdct36_blocks_avx;
    }
#endif
#endif /* HAVE_YASM */
}