FFmpeg
h264idct_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
22 #include "h264dsp_mips.h"
24 
/*
 * One pass of the 4-point transform used by the 4x4 IDCT in this file,
 * applied lane-wise to v8i16 vectors.
 *
 * From the four inputs it forms
 *     in0 + in2,  in0 - in2,  (in1 >> 1) - in3,  in1 + (in3 >> 1)
 * and hands those four terms to BUTTERFLY_4 (defined outside this file),
 * which writes out0..out3.
 *
 * Every argument is parenthesized so that callers may pass expressions, and
 * the body is wrapped in do { } while (0) so the macro behaves as a single
 * statement (safe in an unbraced if/else). Input arguments are evaluated
 * more than once, so they must be free of side effects.
 */
#define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3)          \
do {                                                                      \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
                                                                          \
    tmp0_m = (in0) + (in2);                                               \
    tmp1_m = (in0) - (in2);                                               \
    tmp2_m = ((in1) >> 1) - (in3);                                        \
    tmp3_m = (in1) + ((in3) >> 1);                                        \
                                                                          \
    BUTTERFLY_4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3);  \
} while (0)
38 
39 static void avc_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src,
40  int32_t de_q_val)
41 {
42 #define DC_DEST_STRIDE 16
43  int16_t out0, out1, out2, out3, out4, out5, out6, out7;
44  v8i16 src1, src3;
45  v8i16 vec0, vec1, vec2, vec3;
46  v8i16 tmp0, tmp1, tmp2, tmp3;
47  v8i16 hres0, hres1, hres2, hres3;
48  v8i16 vres0, vres1, vres2, vres3;
49  v4i32 vres0_r, vres1_r, vres2_r, vres3_r;
50  const v4i32 de_q_vec = __msa_fill_w(de_q_val);
51  const v8i16 src0 = LD_SH(src);
52  const v8i16 src2 = LD_SH(src + 8);
53 
54  ILVL_D2_SH(src0, src0, src2, src2, src1, src3);
55  TRANSPOSE4x4_SH_SH(src0, src1, src2, src3, tmp0, tmp1, tmp2, tmp3);
56  BUTTERFLY_4(tmp0, tmp2, tmp3, tmp1, vec0, vec3, vec2, vec1);
57  BUTTERFLY_4(vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1);
58  TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
59  BUTTERFLY_4(hres0, hres1, hres3, hres2, vec0, vec3, vec2, vec1);
60  BUTTERFLY_4(vec0, vec1, vec2, vec3, vres0, vres1, vres2, vres3);
61  UNPCK_R_SH_SW(vres0, vres0_r);
62  UNPCK_R_SH_SW(vres1, vres1_r);
63  UNPCK_R_SH_SW(vres2, vres2_r);
64  UNPCK_R_SH_SW(vres3, vres3_r);
65 
66  vres0_r *= de_q_vec;
67  vres1_r *= de_q_vec;
68  vres2_r *= de_q_vec;
69  vres3_r *= de_q_vec;
70 
71  SRARI_W4_SW(vres0_r, vres1_r, vres2_r, vres3_r, 8);
72  PCKEV_H2_SH(vres1_r, vres0_r, vres3_r, vres2_r, vec0, vec1);
73 
74  out0 = __msa_copy_s_h(vec0, 0);
75  out1 = __msa_copy_s_h(vec0, 1);
76  out2 = __msa_copy_s_h(vec0, 2);
77  out3 = __msa_copy_s_h(vec0, 3);
78  out4 = __msa_copy_s_h(vec0, 4);
79  out5 = __msa_copy_s_h(vec0, 5);
80  out6 = __msa_copy_s_h(vec0, 6);
81  out7 = __msa_copy_s_h(vec0, 7);
82  SH(out0, (dst + 0 * DC_DEST_STRIDE));
83  SH(out1, (dst + 2 * DC_DEST_STRIDE));
84  SH(out2, (dst + 8 * DC_DEST_STRIDE));
85  SH(out3, (dst + 10 * DC_DEST_STRIDE));
86  SH(out4, (dst + 1 * DC_DEST_STRIDE));
87  SH(out5, (dst + 3 * DC_DEST_STRIDE));
88  SH(out6, (dst + 9 * DC_DEST_STRIDE));
89  SH(out7, (dst + 11 * DC_DEST_STRIDE));
90 
91  out0 = __msa_copy_s_h(vec1, 0);
92  out1 = __msa_copy_s_h(vec1, 1);
93  out2 = __msa_copy_s_h(vec1, 2);
94  out3 = __msa_copy_s_h(vec1, 3);
95  out4 = __msa_copy_s_h(vec1, 4);
96  out5 = __msa_copy_s_h(vec1, 5);
97  out6 = __msa_copy_s_h(vec1, 6);
98  out7 = __msa_copy_s_h(vec1, 7);
99  SH(out0, (dst + 4 * DC_DEST_STRIDE));
100  SH(out1, (dst + 6 * DC_DEST_STRIDE));
101  SH(out2, (dst + 12 * DC_DEST_STRIDE));
102  SH(out3, (dst + 14 * DC_DEST_STRIDE));
103  SH(out4, (dst + 5 * DC_DEST_STRIDE));
104  SH(out5, (dst + 7 * DC_DEST_STRIDE));
105  SH(out6, (dst + 13 * DC_DEST_STRIDE));
106  SH(out7, (dst + 15 * DC_DEST_STRIDE));
107 
108 #undef DC_DEST_STRIDE
109 }
110 
111 static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
112 {
113  v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
114  v8i16 vec0, vec1, vec2, vec3;
115  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
116  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
117  v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
118  v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
119  v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
120  v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
121  v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
122  v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
123  v8i16 zeros = { 0 };
124 
125  src[0] += 32;
126 
127  LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
128  ST_SH8(zeros, zeros, zeros, zeros, zeros, zeros, zeros, zeros, src, 8);
129 
130  vec0 = src0 + src4;
131  vec1 = src0 - src4;
132  vec2 = src2 >> 1;
133  vec2 = vec2 - src6;
134  vec3 = src6 >> 1;
135  vec3 = src2 + vec3;
136 
137  BUTTERFLY_4(vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3);
138 
139  vec0 = src7 >> 1;
140  vec0 = src5 - vec0 - src3 - src7;
141  vec1 = src3 >> 1;
142  vec1 = src1 - vec1 + src7 - src3;
143  vec2 = src5 >> 1;
144  vec2 = vec2 - src1 + src7 + src5;
145  vec3 = src1 >> 1;
146  vec3 = vec3 + src3 + src5 + src1;
147  tmp4 = vec3 >> 2;
148  tmp4 += vec0;
149  tmp5 = vec2 >> 2;
150  tmp5 += vec1;
151  tmp6 = vec1 >> 2;
152  tmp6 -= vec2;
153  tmp7 = vec0 >> 2;
154  tmp7 = vec3 - tmp7;
155 
156  BUTTERFLY_8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
157  res0, res1, res2, res3, res4, res5, res6, res7);
158  TRANSPOSE8x8_SH_SH(res0, res1, res2, res3, res4, res5, res6, res7,
159  res0, res1, res2, res3, res4, res5, res6, res7);
160  UNPCK_SH_SW(res0, tmp0_r, tmp0_l);
161  UNPCK_SH_SW(res1, tmp1_r, tmp1_l);
162  UNPCK_SH_SW(res2, tmp2_r, tmp2_l);
163  UNPCK_SH_SW(res3, tmp3_r, tmp3_l);
164  UNPCK_SH_SW(res4, tmp4_r, tmp4_l);
165  UNPCK_SH_SW(res5, tmp5_r, tmp5_l);
166  UNPCK_SH_SW(res6, tmp6_r, tmp6_l);
167  UNPCK_SH_SW(res7, tmp7_r, tmp7_l);
168  BUTTERFLY_4(tmp0_r, tmp0_l, tmp4_l, tmp4_r, vec0_r, vec0_l, vec1_l, vec1_r);
169 
170  vec2_r = tmp2_r >> 1;
171  vec2_l = tmp2_l >> 1;
172  vec2_r -= tmp6_r;
173  vec2_l -= tmp6_l;
174  vec3_r = tmp6_r >> 1;
175  vec3_l = tmp6_l >> 1;
176  vec3_r += tmp2_r;
177  vec3_l += tmp2_l;
178 
179  BUTTERFLY_4(vec0_r, vec1_r, vec2_r, vec3_r, tmp0_r, tmp2_r, tmp4_r, tmp6_r);
180  BUTTERFLY_4(vec0_l, vec1_l, vec2_l, vec3_l, tmp0_l, tmp2_l, tmp4_l, tmp6_l);
181 
182  vec0_r = tmp7_r >> 1;
183  vec0_l = tmp7_l >> 1;
184  vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
185  vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
186  vec1_r = tmp3_r >> 1;
187  vec1_l = tmp3_l >> 1;
188  vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
189  vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
190  vec2_r = tmp5_r >> 1;
191  vec2_l = tmp5_l >> 1;
192  vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
193  vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
194  vec3_r = tmp1_r >> 1;
195  vec3_l = tmp1_l >> 1;
196  vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
197  vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
198  tmp1_r = vec3_r >> 2;
199  tmp1_l = vec3_l >> 2;
200  tmp1_r += vec0_r;
201  tmp1_l += vec0_l;
202  tmp3_r = vec2_r >> 2;
203  tmp3_l = vec2_l >> 2;
204  tmp3_r += vec1_r;
205  tmp3_l += vec1_l;
206  tmp5_r = vec1_r >> 2;
207  tmp5_l = vec1_l >> 2;
208  tmp5_r -= vec2_r;
209  tmp5_l -= vec2_l;
210  tmp7_r = vec0_r >> 2;
211  tmp7_l = vec0_l >> 2;
212  tmp7_r = vec3_r - tmp7_r;
213  tmp7_l = vec3_l - tmp7_l;
214 
215  BUTTERFLY_4(tmp0_r, tmp0_l, tmp7_l, tmp7_r, res0_r, res0_l, res7_l, res7_r);
216  BUTTERFLY_4(tmp2_r, tmp2_l, tmp5_l, tmp5_r, res1_r, res1_l, res6_l, res6_r);
217  BUTTERFLY_4(tmp4_r, tmp4_l, tmp3_l, tmp3_r, res2_r, res2_l, res5_l, res5_r);
218  BUTTERFLY_4(tmp6_r, tmp6_l, tmp1_l, tmp1_r, res3_r, res3_l, res4_l, res4_r);
219  SRA_4V(res0_r, res0_l, res1_r, res1_l, 6);
220  SRA_4V(res2_r, res2_l, res3_r, res3_l, 6);
221  SRA_4V(res4_r, res4_l, res5_r, res5_l, 6);
222  SRA_4V(res6_r, res6_l, res7_r, res7_l, 6);
223  PCKEV_H4_SH(res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r,
224  res0, res1, res2, res3);
225  PCKEV_H4_SH(res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r,
226  res4, res5, res6, res7);
227  LD_SB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
228  ILVR_B4_SH(zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
229  tmp0, tmp1, tmp2, tmp3);
230  ILVR_B4_SH(zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
231  tmp4, tmp5, tmp6, tmp7);
232  ADD4(res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
233  res0, res1, res2, res3);
234  ADD4(res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
235  res4, res5, res6, res7);
236  CLIP_SH8_0_255(res0, res1, res2, res3, res4, res5, res6, res7);
237  PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
238  dst0, dst1, dst2, dst3);
239  ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride)
240 }
241 
242 static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
243  int32_t dst_stride)
244 {
245  int32_t dc_val;
246  v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
247  v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
248  v8i16 dc;
249  v16i8 zeros = { 0 };
250 
251  dc_val = (src[0] + 32) >> 6;
252  dc = __msa_fill_h(dc_val);
253 
254  src[0] = 0;
255 
256  LD_SB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
257  ILVR_B4_SH(zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
258  dst0_r, dst1_r, dst2_r, dst3_r);
259  ILVR_B4_SH(zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
260  dst4_r, dst5_r, dst6_r, dst7_r);
261  ADD4(dst0_r, dc, dst1_r, dc, dst2_r, dc, dst3_r, dc,
262  dst0_r, dst1_r, dst2_r, dst3_r);
263  ADD4(dst4_r, dc, dst5_r, dc, dst6_r, dc, dst7_r, dc,
264  dst4_r, dst5_r, dst6_r, dst7_r);
265  CLIP_SH8_0_255(dst0_r, dst1_r, dst2_r, dst3_r,
266  dst4_r, dst5_r, dst6_r, dst7_r);
267  PCKEV_B4_SB(dst1_r, dst0_r, dst3_r, dst2_r, dst5_r, dst4_r, dst7_r, dst6_r,
268  dst0, dst1, dst2, dst3);
269  ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride)
270 }
271 
272 void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
273 {
274  uint32_t src0_m, src1_m, src2_m, src3_m, out0_m, out1_m, out2_m, out3_m;
275  v16i8 dst0_m = { 0 };
276  v16i8 dst1_m = { 0 };
277  v8i16 hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3;
278  v8i16 inp0_m, inp1_m, res0_m, res1_m, src1, src3;
279  const v8i16 src0 = LD_SH(src);
280  const v8i16 src2 = LD_SH(src + 8);
281  const v8i16 zero = { 0 };
282  const uint8_t *dst1 = dst + dst_stride;
283  const uint8_t *dst2 = dst + 2 * dst_stride;
284  const uint8_t *dst3 = dst + 3 * dst_stride;
285 
286  ILVL_D2_SH(src0, src0, src2, src2, src1, src3);
287  ST_SH2(zero, zero, src, 8);
288  AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3);
289  TRANSPOSE4x4_SH_SH(hres0, hres1, hres2, hres3, hres0, hres1, hres2, hres3);
290  AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3);
291  src0_m = LW(dst);
292  src1_m = LW(dst1);
293  SRARI_H4_SH(vres0, vres1, vres2, vres3, 6);
294  src2_m = LW(dst2);
295  src3_m = LW(dst3);
296  ILVR_D2_SH(vres1, vres0, vres3, vres2, inp0_m, inp1_m);
297  INSERT_W2_SB(src0_m, src1_m, dst0_m);
298  INSERT_W2_SB(src2_m, src3_m, dst1_m);
299  ILVR_B2_SH(zero, dst0_m, zero, dst1_m, res0_m, res1_m);
300  ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);
301  CLIP_SH2_0_255(res0_m, res1_m);
302  PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);
303  out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);
304  out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);
305  out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);
306  out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);
307  SW(out0_m, dst);
308  SW(out1_m, dst1);
309  SW(out2_m, dst2);
310  SW(out3_m, dst3);
311 }
312 
/*
 * Public entry point for the 8x8 inverse transform + add.
 * Forwards all three arguments unchanged to avc_idct8_addblk_msa().
 */
void ff_h264_idct8_addblk_msa(uint8_t *dst,
                              int16_t *src,
                              int32_t dst_stride)
{
    avc_idct8_addblk_msa(dst, src, dst_stride);
}
318 
319 void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src,
320  int32_t dst_stride)
321 {
322  v16u8 pred = { 0 };
323  v16i8 out;
324  v8i16 pred_r, pred_l;
325  const uint32_t src0 = LW(dst);
326  const uint32_t src1 = LW(dst + dst_stride);
327  const uint32_t src2 = LW(dst + 2 * dst_stride);
328  const uint32_t src3 = LW(dst + 3 * dst_stride);
329  const int16_t dc = (src[0] + 32) >> 6;
330  const v8i16 input_dc = __msa_fill_h(dc);
331 
332  src[0] = 0;
333  INSERT_W4_UB(src0, src1, src2, src3, pred);
334  UNPCK_UB_SH(pred, pred_r, pred_l);
335  ADD2(pred_r, input_dc, pred_l, input_dc, pred_r, pred_l);
336  CLIP_SH2_0_255(pred_r, pred_l);
337  out = __msa_pckev_b((v16i8) pred_l, (v16i8) pred_r);
338  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
339 }
340 
/*
 * Public entry point for the DC-only 8x8 residual add.
 * Forwards all three arguments unchanged to avc_idct8_dc_addblk_msa().
 */
void ff_h264_idct8_dc_addblk_msa(uint8_t *dst,
                                 int16_t *src,
                                 int32_t dst_stride)
{
    avc_idct8_dc_addblk_msa(dst, src, dst_stride);
}
346 
347 void ff_h264_idct_add16_msa(uint8_t *dst,
348  const int32_t *blk_offset,
349  int16_t *block, int32_t dst_stride,
350  const uint8_t nzc[5 * 8])
351 {
352  int32_t i;
353 
354  for (i = 0; i < 16; i++) {
355  int32_t nnz = nzc[scan8[i]];
356 
357  if (nnz) {
358  if (nnz == 1 && ((dctcoef *) block)[i * 16])
359  ff_h264_idct4x4_addblk_dc_msa(dst + blk_offset[i],
360  block + i * 16 * sizeof(pixel),
361  dst_stride);
362  else
363  ff_h264_idct_add_msa(dst + blk_offset[i],
364  block + i * 16 * sizeof(pixel),
365  dst_stride);
366  }
367  }
368 }
369 
370 void ff_h264_idct8_add4_msa(uint8_t *dst, const int32_t *blk_offset,
371  int16_t *block, int32_t dst_stride,
372  const uint8_t nzc[5 * 8])
373 {
374  int32_t cnt;
375 
376  for (cnt = 0; cnt < 16; cnt += 4) {
377  int32_t nnz = nzc[scan8[cnt]];
378 
379  if (nnz) {
380  if (nnz == 1 && ((dctcoef *) block)[cnt * 16])
381  ff_h264_idct8_dc_addblk_msa(dst + blk_offset[cnt],
382  block + cnt * 16 * sizeof(pixel),
383  dst_stride);
384  else
385  ff_h264_idct8_addblk_msa(dst + blk_offset[cnt],
386  block + cnt * 16 * sizeof(pixel),
387  dst_stride);
388  }
389  }
390 }
391 
392 void ff_h264_idct_add8_msa(uint8_t **dst,
393  const int32_t *blk_offset,
394  int16_t *block, int32_t dst_stride,
395  const uint8_t nzc[15 * 8])
396 {
397  int32_t i, j;
398 
399  for (j = 1; j < 3; j++) {
400  for (i = (j * 16); i < (j * 16 + 4); i++) {
401  if (nzc[scan8[i]])
402  ff_h264_idct_add_msa(dst[j - 1] + blk_offset[i],
403  block + i * 16 * sizeof(pixel),
404  dst_stride);
405  else if (((dctcoef *) block)[i * 16])
406  ff_h264_idct4x4_addblk_dc_msa(dst[j - 1] + blk_offset[i],
407  block + i * 16 * sizeof(pixel),
408  dst_stride);
409  }
410  }
411 }
412 
413 void ff_h264_idct_add8_422_msa(uint8_t **dst,
414  const int32_t *blk_offset,
415  int16_t *block, int32_t dst_stride,
416  const uint8_t nzc[15 * 8])
417 {
418  int32_t i, j;
419 
420  for (j = 1; j < 3; j++) {
421  for (i = (j * 16); i < (j * 16 + 4); i++) {
422  if (nzc[scan8[i]])
423  ff_h264_idct_add_msa(dst[j - 1] + blk_offset[i],
424  block + i * 16 * sizeof(pixel),
425  dst_stride);
426  else if (((dctcoef *) block)[i * 16])
427  ff_h264_idct4x4_addblk_dc_msa(dst[j - 1] + blk_offset[i],
428  block + i * 16 * sizeof(pixel),
429  dst_stride);
430  }
431  }
432 
433  for (j = 1; j < 3; j++) {
434  for (i = (j * 16 + 4); i < (j * 16 + 8); i++) {
435  if (nzc[scan8[i + 4]])
436  ff_h264_idct_add_msa(dst[j - 1] + blk_offset[i + 4],
437  block + i * 16 * sizeof(pixel),
438  dst_stride);
439  else if (((dctcoef *) block)[i * 16])
440  ff_h264_idct4x4_addblk_dc_msa(dst[j - 1] + blk_offset[i + 4],
441  block + i * 16 * sizeof(pixel),
442  dst_stride);
443  }
444  }
445 }
446 
448  const int32_t *blk_offset,
449  int16_t *block,
450  int32_t dst_stride,
451  const uint8_t nzc[5 * 8])
452 {
453  int32_t i;
454 
455  for (i = 0; i < 16; i++) {
456  if (nzc[scan8[i]])
457  ff_h264_idct_add_msa(dst + blk_offset[i],
458  block + i * 16 * sizeof(pixel), dst_stride);
459  else if (((dctcoef *) block)[i * 16])
460  ff_h264_idct4x4_addblk_dc_msa(dst + blk_offset[i],
461  block + i * 16 * sizeof(pixel),
462  dst_stride);
463  }
464 }
465 
/*
 * Public entry point for the luma DC transform and scaling.
 * Forwards dst, src and de_qval unchanged to avc_deq_idct_luma_dc_msa().
 */
void ff_h264_deq_idct_luma_dc_msa(int16_t *dst,
                                  int16_t *src,
                                  int32_t de_qval)
{
    avc_deq_idct_luma_dc_msa(dst, src, de_qval);
}
ff_h264_idct_add16_msa
void ff_h264_idct_add16_msa(uint8_t *dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[5 *8])
Definition: h264idct_msa.c:347
out
FILE * out
Definition: movenc.c:55
PCKEV_B4_SB
#define PCKEV_B4_SB(...)
Definition: generic_macros_msa.h:1738
src1
const pixel * src1
Definition: h264pred_template.c:421
SRARI_H4_SH
#define SRARI_H4_SH(...)
Definition: generic_macros_msa.h:2067
avc_deq_idct_luma_dc_msa
static void avc_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src, int32_t de_q_val)
Definition: h264idct_msa.c:39
AVC_ITRANS_H
#define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3)
Definition: h264idct_msa.c:25
LD_SH
#define LD_SH(...)
Definition: generic_macros_msa.h:35
ff_h264_idct8_add4_msa
void ff_h264_idct8_add4_msa(uint8_t *dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[5 *8])
Definition: h264idct_msa.c:370
ff_h264_idct8_dc_addblk_msa
void ff_h264_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
Definition: h264idct_msa.c:341
ADD4
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
Definition: generic_macros_msa.h:2123
ff_h264_idct_add_msa
void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
Definition: h264idct_msa.c:272
SH
#define SH(val, pdst)
Definition: generic_macros_msa.h:154
INSERT_W2_SB
#define INSERT_W2_SB(...)
Definition: generic_macros_msa.h:1144
ADD2
#define ADD2(in0, in1, in2, in3, out0, out1)
Definition: generic_macros_msa.h:2118
generic_macros_msa.h
ST_W4
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
Definition: vp8_lpf_lsx.c:234
dctcoef
#define dctcoef
Definition: bit_depth_template.c:84
scan8
static const uint8_t scan8[16 *3+3]
Definition: h264_parse.h:40
TRANSPOSE8x8_SH_SH
#define TRANSPOSE8x8_SH_SH(...)
Definition: generic_macros_msa.h:2505
SW
#define SW(val, pdst)
Definition: generic_macros_msa.h:167
SRA_4V
#define SRA_4V(in0, in1, in2, in3, shift)
Definition: generic_macros_msa.h:1939
PCKEV_H4_SH
#define PCKEV_H4_SH(...)
Definition: generic_macros_msa.h:1768
ILVL_D2_SH
#define ILVL_D2_SH(...)
Definition: generic_macros_msa.h:1479
ST_D8
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:511
ff_h264_idct8_addblk_msa
void ff_h264_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
Definition: h264idct_msa.c:313
ff_h264_deq_idct_luma_dc_msa
void ff_h264_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src, int32_t de_qval)
Definition: h264idct_msa.c:466
h264dsp_mips.h
ST_SH2
#define ST_SH2(...)
Definition: generic_macros_msa.h:366
pixel
uint8_t pixel
Definition: tiny_ssim.c:41
UNPCK_SH_SW
#define UNPCK_SH_SW(in, out0, out1)
Definition: generic_macros_msa.h:2224
ff_h264_idct_add8_422_msa
void ff_h264_idct_add8_422_msa(uint8_t **dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[15 *8])
Definition: h264idct_msa.c:413
avc_idct8_addblk_msa
static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
Definition: h264idct_msa.c:111
TRANSPOSE4x4_SH_SH
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)
Definition: generic_macros_msa.h:2466
bit_depth_template.c
CLIP_SH8_0_255
#define CLIP_SH8_0_255(in0, in1, in2, in3, in4, in5, in6, in7)
Definition: generic_macros_msa.h:953
CLIP_SH2_0_255
#define CLIP_SH2_0_255(in0, in1)
Definition: generic_macros_msa.h:941
LW
#define LW(psrc)
Definition: generic_macros_msa.h:104
dc
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled top and top right vectors is used as motion vector prediction the used motion vector is the sum of the predictor and(mvx_diff, mvy_diff) *mv_scale Intra DC Prediction block[y][x] dc[1]
Definition: snow.txt:400
ff_h264_idct_add16_intra_msa
void ff_h264_idct_add16_intra_msa(uint8_t *dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[5 *8])
Definition: h264idct_msa.c:447
ST_SH8
#define ST_SH8(...)
Definition: generic_macros_msa.h:392
LD_SH8
#define LD_SH8(...)
Definition: generic_macros_msa.h:338
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
DC_DEST_STRIDE
#define DC_DEST_STRIDE
BUTTERFLY_8
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, out5, out6, out7)
Definition: generic_macros_msa.h:2263
src2
const pixel * src2
Definition: h264pred_template.c:422
UNPCK_UB_SH
#define UNPCK_UB_SH(in, out0, out1)
Definition: generic_macros_msa.h:2206
INSERT_W4_UB
#define INSERT_W4_UB(...)
Definition: generic_macros_msa.h:1153
ff_h264_idct4x4_addblk_dc_msa
void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
Definition: h264idct_msa.c:319
pred
static const float pred[4]
Definition: siprdata.h:259
ILVR_D2_SH
#define ILVR_D2_SH(...)
Definition: generic_macros_msa.h:1445
LD_SB8
#define LD_SB8(...)
Definition: generic_macros_msa.h:336
SRARI_W4_SW
#define SRARI_W4_SW(...)
Definition: generic_macros_msa.h:2092
BUTTERFLY_4
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)
Definition: generic_macros_msa.h:2249
ILVR_B4_SH
#define ILVR_B4_SH(...)
Definition: generic_macros_msa.h:1362
src0
const pixel *const src0
Definition: h264pred_template.c:420
ff_h264_idct_add8_msa
void ff_h264_idct_add8_msa(uint8_t **dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[15 *8])
Definition: h264idct_msa.c:392
zero
#define zero
Definition: regdef.h:64
UNPCK_R_SH_SW
#define UNPCK_R_SH_SW(in, out)
Definition: generic_macros_msa.h:2172
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
PCKEV_B2_SB
#define PCKEV_B2_SB(...)
Definition: generic_macros_msa.h:1719
int32_t
int32_t
Definition: audioconvert.c:56
block
The exact code depends on how similar the blocks are and how related they are to the block
Definition: filter_design.txt:207
ILVR_B2_SH
#define ILVR_B2_SH(...)
Definition: generic_macros_msa.h:1340
PCKEV_H2_SH
#define PCKEV_H2_SH(...)
Definition: generic_macros_msa.h:1759
avc_idct8_dc_addblk_msa
static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
Definition: h264idct_msa.c:242