#define REPT4(...) { __VA_ARGS__, __VA_ARGS__, __VA_ARGS__, __VA_ARGS__ }

// horizontal subpel filters: the four inner taps of each filter, repeated so
// that a single vec_msum applies them to four neighbouring pixels at once
static const vec_s8 h_subpel_filters_inner[7] =
{
    REPT4( -6, 123,  12,  -1),
    REPT4(-11, 108,  36,  -8),
    REPT4( -9,  93,  50,  -6),
    REPT4(-16,  77,  77, -16),
    REPT4( -6,  50,  93,  -9),
    REPT4( -8,  36, 108, -11),
    REPT4( -1,  12, 123,  -6),
};
// for the 6-tap filters these are the two outer taps; the zero entries mask
// off the half of the vector handled by the other filter_outer variant
static const vec_s8 h_subpel_filters_outer[3] =
{
    REPT4(0, 0, 2, 1),
    REPT4(0, 0, 3, 3),
    REPT4(0, 0, 1, 2),
};
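
/*
 * Scalar reference for what FILTER_H below computes; e.g. for the mx == 2
 * filter (inner taps -11,108,36,-8 plus outer taps 2,1):
 *
 *   dst[x] = clip_uint8((  2*src[x-2] - 11*src[x-1] + 108*src[x]
 *                        + 36*src[x+1] -  8*src[x+2] +   1*src[x+3]
 *                        + 64) >> 7)
 *
 * The +64 bias and >>7 appear below as c64 and c7; the inner/outer table
 * split exists so that vec_msum can handle four taps per multiply-sum, with
 * a second msum adding the two outer taps in the 6-tap case.
 */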

#define LOAD_H_SUBPEL_FILTER(i) \
    vec_s8 filter_inner  = h_subpel_filters_inner[i]; \
    vec_s8 filter_outerh = h_subpel_filters_outer[(i)>>1]; \
    vec_s8 filter_outerl = vec_sld(filter_outerh, filter_outerh, 2)

#if HAVE_BIGENDIAN
#define GET_PIXHL(offset)                      \
    a    = vec_ld((offset)-is6tap-1, src);     \
    b    = vec_ld((offset)-is6tap-1+15, src);  \
    pixh = vec_perm(a, b, permh##offset);      \
    pixl = vec_perm(a, b, perml##offset)

#define GET_OUTER(offset) outer = vec_perm(a, b, perm_6tap##offset)
#else
#define GET_PIXHL(offset)                      \
    a    = vec_vsx_ld((offset)-is6tap-1, src); \
    pixh = vec_perm(a, a, perm_inner);         \
    pixl = vec_perm(a, a, vec_add(perm_inner, vec_splat_u8(4)))

#define GET_OUTER(offset) outer = vec_perm(a, a, perm_outer)
#endif

#define FILTER_H(dstv, off) \
    GET_PIXHL(off);                            \
    filth = vec_msum(filter_inner, pixh, c64); \
    filtl = vec_msum(filter_inner, pixl, c64); \
\
    if (is6tap) {                                      \
        GET_OUTER(off);                                \
        filth = vec_msum(filter_outerh, outer, filth); \
        filtl = vec_msum(filter_outerl, outer, filtl); \
    }                                                  \
    if (w == 4)                                        \
        filtl = filth; /* discard pixels 4-7 */        \
    dstv = vec_packs(filth, filtl); \
    dstv = vec_sra(dstv, c7)

static av_always_inline
void put_vp8_epel_h_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                 uint8_t *src, ptrdiff_t src_stride,
                                 int h, int mx, int w, int is6tap)
{
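    /* Filter h rows of w pixels horizontally with the 4- or 6-tap subpel
     * filter selected by mx; the 6-tap case reads src[x-2..x+3] around each
     * output pixel, the 4-tap case src[x-1..x+2]. */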
    LOAD_H_SUBPEL_FILTER(mx-1);
#if HAVE_BIGENDIAN
    vec_u8 align_vec0, align_vec8, permh0, permh8;
    vec_u8 perm_6tap0, perm_6tap8, perml0, perml8;
    vec_u8 b;
#endif
    vec_u8 filt, a, pixh, pixl, outer;
    vec_s16 f16h, f16l;
    vec_s32 filth, filtl;

    vec_u8 perm_inner6 = { 1,2,3,4, 2,3,4,5, 3,4,5,6, 4,5,6,7 };
    vec_u8 perm_inner4 = { 0,1,2,3, 1,2,3,4, 2,3,4,5, 3,4,5,6 };
    vec_u8 perm_inner  = is6tap ? perm_inner6 : perm_inner4;
    vec_u8 perm_outer  = { 4,9, 0,5, 5,10, 1,6, 6,11, 2,7, 7,12, 3,8 };
    vec_s32 c64 = vec_sl(vec_splat_s32(1), vec_splat_u32(6)); /* rounding bias of 64 */
    vec_u16 c7  = vec_splat_u16(7);                           /* final >> 7 */

#if HAVE_BIGENDIAN
    align_vec0 = vec_lvsl( -is6tap-1, src);
    align_vec8 = vec_lvsl(8-is6tap-1, src);

    permh0     = vec_perm(align_vec0, align_vec0, perm_inner);
    permh8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_inner = vec_add(perm_inner, vec_splat_u8(4));
    perml0     = vec_perm(align_vec0, align_vec0, perm_inner);
    perml8     = vec_perm(align_vec8, align_vec8, perm_inner);
    perm_6tap0 = vec_perm(align_vec0, align_vec0, perm_outer);
    perm_6tap8 = vec_perm(align_vec8, align_vec8, perm_outer);
#endif
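
    /* On big endian the vec_lvsl alignment permutes are folded into the
     * pixel-selection permutes once, outside the pixel loop; the
     * little-endian path instead relies on vec_vsx_ld handling unaligned
     * addresses and permutes within the single loaded vector. */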

    while (h-- > 0) {
        FILTER_H(f16h, 0);

        if (w == 16) {
            FILTER_H(f16l, 8);
            filt = vec_packsu(f16h, f16l);
            vec_st(filt, 0, dst);
        } else {
            filt = vec_packsu(f16h, f16h);
            vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
            if (w == 8)
                vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
        }
        src += src_stride;
        dst += dst_stride;
    }
}

// vertical subpel filter taps, stored as magnitudes (one byte per tap)
static const vec_u8 v_subpel_filters[7] =
{
    { 0,   6, 123,  12,   1,   0 },
    { 2,  11, 108,  36,   8,   1 },
    { 0,   9,  93,  50,   6,   0 },
    { 3,  16,  77,  77,  16,   3 },
    { 0,   6,  50,  93,   9,   0 },
    { 1,   8,  36, 108,  11,   2 },
    { 0,   1,  12, 123,   6,   0 },
};
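
/*
 * FILTER_V below applies the signs omitted from the table: in VP8's filters
 * taps 1 and 4 are negative, so the s1*f1 and s4*f4 products are subtracted
 * (vec_subs) while the remaining products are added (vec_adds), followed by
 * the same +64 bias and >>7 as in the horizontal filter.
 */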

#define LOAD_V_SUBPEL_FILTER(i) \
    vec_u8 subpel_filter = v_subpel_filters[i]; \
    vec_u8 f0 = vec_splat(subpel_filter, 0); \
    vec_u8 f1 = vec_splat(subpel_filter, 1); \
    vec_u8 f2 = vec_splat(subpel_filter, 2); \
    vec_u8 f3 = vec_splat(subpel_filter, 3); \
    vec_u8 f4 = vec_splat(subpel_filter, 4); \
    vec_u8 f5 = vec_splat(subpel_filter, 5)

#define FILTER_V(dstv, vec_mul) \
    s1f = (vec_s16)vec_mul(s1, f1); \
    s2f = (vec_s16)vec_mul(s2, f2); \
    s3f = (vec_s16)vec_mul(s3, f3); \
    s4f = (vec_s16)vec_mul(s4, f4); \
    s2f = vec_subs(s2f, s1f); \
    s3f = vec_subs(s3f, s4f); \
    if (is6tap) { \
        s0f = (vec_s16)vec_mul(s0, f0); \
        s5f = (vec_s16)vec_mul(s5, f5); \
        s2f = vec_adds(s2f, s0f); \
        s3f = vec_adds(s3f, s5f); \
    } \
    dstv = vec_adds(s2f, s3f); \
    dstv = vec_adds(dstv, c64); \
    dstv = vec_sra(dstv, c7)

#if HAVE_BIGENDIAN
#define LOAD_HL(off, s, perm) load_with_perm_vec(off, s, perm)
#else
#define LOAD_HL(off, s, perm) vec_mergeh(vec_vsx_ld(off,s), vec_vsx_ld(off+8,s))
#endif

static av_always_inline
void put_vp8_epel_v_altivec_core(uint8_t *dst, ptrdiff_t dst_stride,
                                 uint8_t *src, ptrdiff_t src_stride,
                                 int h, int my, int w, int is6tap)
{
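    /* Filter h rows of w pixels vertically with the 4- or 6-tap subpel filter
     * selected by my; the 6-tap case reads rows y-2..y+3 around each output
     * row, the 4-tap case rows y-1..y+2. */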
    LOAD_V_SUBPEL_FILTER(my-1);
    vec_u8 s0, s1, s2, s3, s4, s5, filt, align_vech, perm_vec, align_vecl;
    vec_s16 s0f, s1f, s2f, s3f, s4f, s5f, f16h, f16l;
    vec_s16 c64 = vec_sl(vec_splat_s16(1), vec_splat_u16(6)); /* rounding bias of 64 */
    vec_u16 c7  = vec_splat_u16(7);                           /* final >> 7 */

#if HAVE_BIGENDIAN
    // interleave pixels 0-7 with pixels 8-15 so that vec_mule/vec_mulo
    // split them back into the low and high halves after filtering
    align_vech = vec_lvsl(0, src);
    align_vecl = vec_sld(align_vech, align_vech, 8);
    if (w == 16)
        perm_vec = vec_mergeh(align_vech, align_vecl);
    else
        perm_vec = vec_mergeh(align_vech, align_vech);
#endif

    if (is6tap)
        s0 = LOAD_HL(-2*src_stride, src, perm_vec);
    s1 = LOAD_HL(-1*src_stride, src, perm_vec);
    s2 = LOAD_HL( 0*src_stride, src, perm_vec);
    s3 = LOAD_HL( 1*src_stride, src, perm_vec);
    if (is6tap)
        s4 = LOAD_HL( 2*src_stride, src, perm_vec);

    src += (2+is6tap)*src_stride;

    while (h-- > 0) {
        if (is6tap)
            s5 = LOAD_HL(0, src, perm_vec);
        else
            s4 = LOAD_HL(0, src, perm_vec);

        FILTER_V(f16h, vec_mule);

        if (w == 16) {
            FILTER_V(f16l, vec_mulo);
            filt = vec_packsu(f16h, f16l);
            vec_st(filt, 0, dst);
        } else {
            filt = vec_packsu(f16h, f16h);
            if (w == 4)
                filt = (vec_u8)vec_splat((vec_u32)filt, 0);
            else
                vec_ste((vec_u32)filt, 4, (uint32_t*)dst);
            vec_ste((vec_u32)filt, 0, (uint32_t*)dst);
        }

        s0 = s1;
        s1 = s2;
        s2 = s3;
        s3 = s4;
        s4 = s5;

        dst += dst_stride;
        src += src_stride;
    }
}

#define EPEL_FUNCS(WIDTH, TAPS) \
static av_noinline \
void put_vp8_epel ## WIDTH ## _h ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
{ \
    put_vp8_epel_h_altivec_core(dst, dst_stride, src, src_stride, h, mx, WIDTH, TAPS == 6); \
} \
\
static av_noinline \
void put_vp8_epel ## WIDTH ## _v ## TAPS ## _altivec(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int h, int mx, int my) \
{ \
    put_vp8_epel_v_altivec_core(dst, dst_stride, src, src_stride, h, my, WIDTH, TAPS == 6); \
}

#define EPEL_HV(WIDTH, HTAPS, VTAPS) \
static void put_vp8_epel ## WIDTH ## _h ## HTAPS ## v ## VTAPS ## _altivec(uint8_t *dst, ptrdiff_t dstride, uint8_t *src, ptrdiff_t sstride, int h, int mx, int my) \
{ \
    DECLARE_ALIGNED(16, uint8_t, tmp)[(2*WIDTH+5)*16]; \
    if (VTAPS == 6) { \
        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,      src-2*sstride, sstride, h+5, mx, my); \
        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+2*16,      16,      h,   mx, my); \
    } else { \
        put_vp8_epel ## WIDTH ## _h ## HTAPS ## _altivec(tmp, 16,      src-sstride,   sstride, h+4, mx, my); \
        put_vp8_epel ## WIDTH ## _v ## VTAPS ## _altivec(dst, dstride, tmp+16,        16,      h,   mx, my); \
    } \
}
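
/* The wrappers are instantiated for the width/tap combinations this file
 * implements. The list below is an assumption (the usual set: width 16 with
 * the 6-tap filter only, widths 8 and 4 with both tap counts); combinations
 * not instantiated here fall back to the C functions. */
EPEL_FUNCS(16,6)
EPEL_FUNCS(8, 6)
EPEL_FUNCS(8, 4)
EPEL_FUNCS(4, 6)
EPEL_FUNCS(4, 4)

EPEL_HV(16, 6,6)
EPEL_HV(8,  6,6)
EPEL_HV(8,  4,6)
EPEL_HV(8,  6,4)
EPEL_HV(8,  4,4)
EPEL_HV(4,  6,6)
EPEL_HV(4,  4,6)
EPEL_HV(4,  6,4)
EPEL_HV(4,  4,4)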

static void put_vp8_pixels16_altivec(uint8_t *dst, ptrdiff_t dstride,
                                     uint8_t *src, ptrdiff_t sstride,
                                     int h, int mx, int my)
{
    register vector unsigned char perm;
    int i;
    register ptrdiff_t dstride2 = dstride << 1, sstride2 = sstride << 1;
    register ptrdiff_t dstride3 = dstride2 + dstride, sstride3 = sstride + sstride2;
    register ptrdiff_t dstride4 = dstride << 2, sstride4 = sstride << 2;

#if HAVE_BIGENDIAN
    perm = vec_lvsl(0, src);
#endif

    /* copy four 16-pixel rows per iteration */
    for (i = 0; i < h; i += 4) {
        vec_st(load_with_perm_vec(0,        src, perm), 0,        dst);
        vec_st(load_with_perm_vec(sstride,  src, perm), dstride,  dst);
        vec_st(load_with_perm_vec(sstride2, src, perm), dstride2, dst);
        vec_st(load_with_perm_vec(sstride3, src, perm), dstride3, dst);
        src += sstride4;
        dst += dstride4;
    }
}
av_cold void ff_vp78dsp_init_ppc(VP8DSPContext *c)
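{
#if HAVE_ALTIVEC
    /* A sketch of the usual wiring, assuming the put_vp8_epel_pixels_tab
     * layout documented in vp8dsp.h ([4 - log2(width)][vertical taps]
     * [horizontal taps], where 0 = no interpolation, 1 = 4-tap, 2 = 6-tap)
     * and the wrapper names generated by the EPEL macros above; the exact
     * set of assignments in the real init may differ. */
    int cpu_flags = av_get_cpu_flags();

    if (!(cpu_flags & AV_CPU_FLAG_ALTIVEC))
        return;

    c->put_vp8_epel_pixels_tab[0][0][0] = put_vp8_pixels16_altivec;
    c->put_vp8_epel_pixels_tab[0][0][2] = put_vp8_epel16_h6_altivec;
    c->put_vp8_epel_pixels_tab[0][2][0] = put_vp8_epel16_v6_altivec;
    c->put_vp8_epel_pixels_tab[0][2][2] = put_vp8_epel16_h6v6_altivec;

    c->put_vp8_epel_pixels_tab[1][0][2] = put_vp8_epel8_h6_altivec;
    c->put_vp8_epel_pixels_tab[1][2][0] = put_vp8_epel8_v6_altivec;
    c->put_vp8_epel_pixels_tab[1][0][1] = put_vp8_epel8_h4_altivec;
    c->put_vp8_epel_pixels_tab[1][1][0] = put_vp8_epel8_v4_altivec;

    c->put_vp8_epel_pixels_tab[2][0][2] = put_vp8_epel4_h6_altivec;
    c->put_vp8_epel_pixels_tab[2][2][0] = put_vp8_epel4_v6_altivec;
    c->put_vp8_epel_pixels_tab[2][0][1] = put_vp8_epel4_h4_altivec;
    c->put_vp8_epel_pixels_tab[2][1][0] = put_vp8_epel4_v4_altivec;
    /* ... plus the combined h*v* entries generated by EPEL_HV ... */
#endif /* HAVE_ALTIVEC */
}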