FFmpeg
idctdsp.c
/*
 * Copyright (c) 2001 Michel Lespinasse
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/* NOTE: This code is based on GPL code from the libmpeg2 project. The
 * author, Michel Lespinasse, has given explicit permission to release
 * under LGPL as part of FFmpeg.
 *
 * FFmpeg integration by Dieter Shirley
 *
 * This file is a direct copy of the AltiVec IDCT module from the libmpeg2
 * project. I've deleted all of the libmpeg2-specific code, renamed the
 * functions and reordered the function parameters. The only change to the
 * IDCT function itself was to factor out the partial transposition, and to
 * perform a full transpose at the end of the function. */
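
/* Explanatory note (added here, not part of the original source): the "full
 * transpose" mentioned above is done with vec_mergeh/vec_mergel. Each vec_s16
 * register holds one row of eight 16-bit coefficients, and three rounds of
 * merge operations interleave the halfwords until rows and columns are
 * swapped, so the same 1-D pass (IDCT_HALF below) can be reused for both
 * dimensions. */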

#include "config.h"

#include <stdlib.h>
#include <string.h>

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/ppc/util_altivec.h"

#include "libavcodec/avcodec.h"
#include "libavcodec/idctdsp.h"

#if HAVE_ALTIVEC

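/* Explanatory note (added, not part of the original source): IDCT_HALF is one
 * 1-D pass of an 8-point IDCT, evaluated on all eight vectors at once, so a
 * single expansion transforms every row (or, after the transpose, every
 * column) of the block. vec_mradds(a, b, c) is roughly a saturating Q15
 * multiply-accumulate: each halfword becomes
 * saturate(((a * b + 0x4000) >> 15) + c), which is why the multipliers in
 * constants[] are stored as Q15 fixed-point values. */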
#define IDCT_HALF \
    /* 1st stage */ \
    t1 = vec_mradds(a1, vx7, vx1); \
    t8 = vec_mradds(a1, vx1, vec_subs(zero, vx7)); \
    t7 = vec_mradds(a2, vx5, vx3); \
    t3 = vec_mradds(ma2, vx3, vx5); \
    \
    /* 2nd stage */ \
    t5 = vec_adds(vx0, vx4); \
    t0 = vec_subs(vx0, vx4); \
    t2 = vec_mradds(a0, vx6, vx2); \
    t4 = vec_mradds(a0, vx2, vec_subs(zero, vx6)); \
    t6 = vec_adds(t8, t3); \
    t3 = vec_subs(t8, t3); \
    t8 = vec_subs(t1, t7); \
    t1 = vec_adds(t1, t7); \
    \
    /* 3rd stage */ \
    t7 = vec_adds(t5, t2); \
    t2 = vec_subs(t5, t2); \
    t5 = vec_adds(t0, t4); \
    t0 = vec_subs(t0, t4); \
    t4 = vec_subs(t8, t3); \
    t3 = vec_adds(t8, t3); \
    \
    /* 4th stage */ \
    vy0 = vec_adds(t7, t1); \
    vy7 = vec_subs(t7, t1); \
    vy1 = vec_mradds(c4, t3, t5); \
    vy6 = vec_mradds(mc4, t3, t5); \
    vy2 = vec_mradds(c4, t4, t0); \
    vy5 = vec_mradds(mc4, t4, t0); \
    vy3 = vec_adds(t2, t6); \
    vy4 = vec_subs(t2, t6)

#define IDCT \
    vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \
    vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \
    \
    vec_s16 c4   = vec_splat(constants[0], 0); \
    vec_s16 a0   = vec_splat(constants[0], 1); \
    vec_s16 a1   = vec_splat(constants[0], 2); \
    vec_s16 a2   = vec_splat(constants[0], 3); \
    vec_s16 mc4  = vec_splat(constants[0], 4); \
    vec_s16 ma2  = vec_splat(constants[0], 5); \
    vec_s16 bias = (vec_s16) vec_splat((vec_s32) constants[0], 3); \
    \
    vec_s16 zero  = vec_splat_s16(0); \
    vec_u16 shift = vec_splat_u16(4); \
    \
    vec_s16 vx0 = vec_mradds(vec_sl(block[0], shift), constants[1], zero); \
    vec_s16 vx1 = vec_mradds(vec_sl(block[1], shift), constants[2], zero); \
    vec_s16 vx2 = vec_mradds(vec_sl(block[2], shift), constants[3], zero); \
    vec_s16 vx3 = vec_mradds(vec_sl(block[3], shift), constants[4], zero); \
    vec_s16 vx4 = vec_mradds(vec_sl(block[4], shift), constants[1], zero); \
    vec_s16 vx5 = vec_mradds(vec_sl(block[5], shift), constants[4], zero); \
    vec_s16 vx6 = vec_mradds(vec_sl(block[6], shift), constants[3], zero); \
    vec_s16 vx7 = vec_mradds(vec_sl(block[7], shift), constants[2], zero); \
    \
    IDCT_HALF; \
    \
    vx0 = vec_mergeh(vy0, vy4); \
    vx1 = vec_mergel(vy0, vy4); \
    vx2 = vec_mergeh(vy1, vy5); \
    vx3 = vec_mergel(vy1, vy5); \
    vx4 = vec_mergeh(vy2, vy6); \
    vx5 = vec_mergel(vy2, vy6); \
    vx6 = vec_mergeh(vy3, vy7); \
    vx7 = vec_mergel(vy3, vy7); \
    \
    vy0 = vec_mergeh(vx0, vx4); \
    vy1 = vec_mergel(vx0, vx4); \
    vy2 = vec_mergeh(vx1, vx5); \
    vy3 = vec_mergel(vx1, vx5); \
    vy4 = vec_mergeh(vx2, vx6); \
    vy5 = vec_mergel(vx2, vx6); \
    vy6 = vec_mergeh(vx3, vx7); \
    vy7 = vec_mergel(vx3, vx7); \
    \
    vx0 = vec_adds(vec_mergeh(vy0, vy4), bias); \
    vx1 = vec_mergel(vy0, vy4); \
    vx2 = vec_mergeh(vy1, vy5); \
    vx3 = vec_mergel(vy1, vy5); \
    vx4 = vec_mergeh(vy2, vy6); \
    vx5 = vec_mergel(vy2, vy6); \
    vx6 = vec_mergeh(vy3, vy7); \
    vx7 = vec_mergel(vy3, vy7); \
    \
    IDCT_HALF; \
    \
    shift = vec_splat_u16(6); \
    vx0 = vec_sra(vy0, shift); \
    vx1 = vec_sra(vy1, shift); \
    vx2 = vec_sra(vy2, shift); \
    vx3 = vec_sra(vy3, shift); \
    vx4 = vec_sra(vy4, shift); \
    vx5 = vec_sra(vy5, shift); \
    vx6 = vec_sra(vy6, shift); \
    vx7 = vec_sra(vy7, shift)
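
/* Explanatory note (added, not part of the original source): the IDCT macro
 * above expands to the complete 2-D transform. The input coefficients are
 * shifted left by 4 and prescaled with constants[1]..constants[4], the first
 * 1-D pass runs, the block is transposed with three rounds of
 * vec_mergeh/vec_mergel, a small rounding bias is folded into the first
 * transposed vector, the second 1-D pass runs, and the results are scaled
 * back down with an arithmetic shift right by 6. */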

static const vec_s16 constants[5] = {
    { 23170, 13573,  6518, 21895, -23170, -21895,    32,    31 },
    { 16384, 22725, 21407, 19266,  16384,  19266, 21407, 22725 },
    { 22725, 31521, 29692, 26722,  22725,  26722, 29692, 31521 },
    { 21407, 29692, 27969, 25172,  21407,  25172, 27969, 29692 },
    { 19266, 26722, 25172, 22654,  19266,  22654, 25172, 26722 }
};
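
/* Explanatory note (added, not part of the original source): constants[0]
 * packs the IDCT_HALF multipliers (c4 = 23170 is roughly 2^15 / sqrt(2)),
 * their negated counterparts mc4 and ma2, and the halfword pair {32, 31},
 * which the IDCT macro splats as a 32-bit pattern to form the rounding bias.
 * constants[1]..constants[4] appear to be per-row prescale tables that fold
 * the per-frequency DCT scale factors into the coefficients before the first
 * pass. */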

static void idct_altivec(int16_t *blk)
{
    vec_s16 *block = (vec_s16 *) blk;

    IDCT;

    block[0] = vx0;
    block[1] = vx1;
    block[2] = vx2;
    block[3] = vx3;
    block[4] = vx4;
    block[5] = vx5;
    block[6] = vx6;
    block[7] = vx7;
}

static void idct_put_altivec(uint8_t *dest, ptrdiff_t stride, int16_t *blk)
{
    vec_s16 *block = (vec_s16 *) blk;
    vec_u8 tmp;

    IDCT;

#define COPY(dest, src) \
    tmp = vec_packsu(src, src); \
    vec_ste((vec_u32) tmp, 0, (unsigned int *) dest); \
    vec_ste((vec_u32) tmp, 4, (unsigned int *) dest)
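
/* Explanatory note (added, not part of the original source): COPY saturates
 * one row of results to unsigned 8-bit pixels with vec_packsu (the row is
 * packed into both halves of the vector) and stores eight bytes to dest as
 * two 32-bit element stores via vec_ste, so nothing beyond the 8-pixel row
 * is written. */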

    COPY(dest, vx0);
    dest += stride;
    COPY(dest, vx1);
    dest += stride;
    COPY(dest, vx2);
    dest += stride;
    COPY(dest, vx3);
    dest += stride;
    COPY(dest, vx4);
    dest += stride;
    COPY(dest, vx5);
    dest += stride;
    COPY(dest, vx6);
    dest += stride;
    COPY(dest, vx7);
}

static void idct_add_altivec(uint8_t *dest, ptrdiff_t stride, int16_t *blk)
{
    vec_s16 *block = (vec_s16 *) blk;
    vec_u8 tmp;
    vec_s16 tmp2, tmp3;
    vec_u8 perm0;
    vec_u8 perm1;
    vec_u8 p0, p1, p;

    IDCT;

#if HAVE_BIGENDIAN
    p0    = vec_lvsl(0, dest);
    p1    = vec_lvsl(stride, dest);
    p     = vec_splat_u8(-1);
    perm0 = vec_mergeh(p, p0);
    perm1 = vec_mergeh(p, p1);
#endif

#if HAVE_BIGENDIAN
#define GET_TMP2(dest, prm) \
    tmp  = vec_ld(0, dest); \
    tmp2 = (vec_s16) vec_perm(tmp, (vec_u8) zero, prm)
#else
#define GET_TMP2(dest, prm) \
    tmp  = vec_vsx_ld(0, dest); \
    tmp2 = (vec_s16) vec_mergeh(tmp, (vec_u8) zero)
#endif
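
/* Explanatory note (added, not part of the original source): GET_TMP2 loads
 * eight destination pixels and widens them to signed 16-bit values. The
 * big-endian path uses a vec_lvsl-derived permute in which the 0xFF indices
 * select zero bytes, interleaving zeros with the pixel bytes; the
 * little-endian/VSX path gets the same widening from vec_vsx_ld followed by
 * vec_mergeh with the zero vector. ADD below then adds the IDCT output with
 * saturation and stores the row the same way as idct_put_altivec. */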

#define ADD(dest, src, perm) \
    GET_TMP2(dest, perm); \
    tmp3 = vec_adds(tmp2, src); \
    tmp  = vec_packsu(tmp3, tmp3); \
    vec_ste((vec_u32) tmp, 0, (unsigned int *) dest); \
    vec_ste((vec_u32) tmp, 4, (unsigned int *) dest)

    ADD(dest, vx0, perm0);
    dest += stride;
    ADD(dest, vx1, perm1);
    dest += stride;
    ADD(dest, vx2, perm0);
    dest += stride;
    ADD(dest, vx3, perm1);
    dest += stride;
    ADD(dest, vx4, perm0);
    dest += stride;
    ADD(dest, vx5, perm1);
    dest += stride;
    ADD(dest, vx6, perm0);
    dest += stride;
    ADD(dest, vx7, perm1);
}

#endif /* HAVE_ALTIVEC */

av_cold void ff_idctdsp_init_ppc(IDCTDSPContext *c, AVCodecContext *avctx,
                                 unsigned high_bit_depth)
{
#if HAVE_ALTIVEC
    if (!PPC_ALTIVEC(av_get_cpu_flags()))
        return;

    if (!high_bit_depth && avctx->lowres == 0) {
        if ((avctx->idct_algo == FF_IDCT_AUTO && !(avctx->flags & AV_CODEC_FLAG_BITEXACT)) ||
            (avctx->idct_algo == FF_IDCT_ALTIVEC)) {
            c->idct      = idct_altivec;
            c->idct_add  = idct_add_altivec;
            c->idct_put  = idct_put_altivec;
            c->perm_type = FF_IDCT_PERM_TRANSPOSE;
        }
    }
#endif /* HAVE_ALTIVEC */
}
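
/* Explanatory note (added, not part of the original source): this entry point
 * is presumably called from the generic IDCT DSP initialization when FFmpeg
 * is built for PowerPC. The AltiVec routines are installed only when the CPU
 * reports AltiVec support, the codec uses 8-bit depth, lowres is disabled,
 * and either FF_IDCT_ALTIVEC was requested explicitly or FF_IDCT_AUTO is in
 * effect without AV_CODEC_FLAG_BITEXACT. FF_IDCT_PERM_TRANSPOSE appears to
 * tell the rest of the decoder that coefficients are stored in transposed
 * order for this IDCT. */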