25 #define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3) \ 27 v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \ 32 tmp2_m = tmp2_m - in3; \ 34 tmp3_m = in1 + tmp3_m; \ 36 BUTTERFLY_4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, out0, out1, out2, out3); \ 42 #define DC_DEST_STRIDE 16 43 int16_t out0, out1, out2, out3, out4, out5, out6, out7;
45 v8i16 vec0, vec1, vec2, vec3;
46 v8i16 tmp0, tmp1, tmp2, tmp3;
47 v8i16 hres0, hres1, hres2, hres3;
48 v8i16 vres0, vres1, vres2, vres3;
49 v4i32 vres0_r, vres1_r, vres2_r, vres3_r;
50 const v4i32 de_q_vec = __msa_fill_w(de_q_val);
52 const v8i16 src2 =
LD_SH(src + 8);
54 ILVL_D2_SH(src0, src0, src2, src2, src1, src3);
56 BUTTERFLY_4(tmp0, tmp2, tmp3, tmp1, vec0, vec3, vec2, vec1);
57 BUTTERFLY_4(vec0, vec1, vec2, vec3, hres0, hres3, hres2, hres1);
59 BUTTERFLY_4(hres0, hres1, hres3, hres2, vec0, vec3, vec2, vec1);
60 BUTTERFLY_4(vec0, vec1, vec2, vec3, vres0, vres1, vres2, vres3);
72 PCKEV_H2_SH(vres1_r, vres0_r, vres3_r, vres2_r, vec0, vec1);
74 out0 = __msa_copy_s_h(vec0, 0);
75 out1 = __msa_copy_s_h(vec0, 1);
76 out2 = __msa_copy_s_h(vec0, 2);
77 out3 = __msa_copy_s_h(vec0, 3);
78 out4 = __msa_copy_s_h(vec0, 4);
79 out5 = __msa_copy_s_h(vec0, 5);
80 out6 = __msa_copy_s_h(vec0, 6);
81 out7 = __msa_copy_s_h(vec0, 7);
91 out0 = __msa_copy_s_h(vec1, 0);
92 out1 = __msa_copy_s_h(vec1, 1);
93 out2 = __msa_copy_s_h(vec1, 2);
94 out3 = __msa_copy_s_h(vec1, 3);
95 out4 = __msa_copy_s_h(vec1, 4);
96 out5 = __msa_copy_s_h(vec1, 5);
97 out6 = __msa_copy_s_h(vec1, 6);
98 out7 = __msa_copy_s_h(vec1, 7);
108 #undef DC_DEST_STRIDE 113 v8i16
src0,
src1, src2, src3, src4, src5, src6, src7;
114 v8i16 vec0, vec1, vec2, vec3;
115 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
116 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
117 v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
118 v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
119 v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
120 v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
121 v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
122 v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
127 LD_SH8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
128 ST_SH8(zeros, zeros, zeros, zeros, zeros, zeros, zeros, zeros, src, 8);
137 BUTTERFLY_4(vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3);
140 vec0 = src5 - vec0 - src3 - src7;
142 vec1 = src1 - vec1 + src7 - src3;
144 vec2 = vec2 - src1 + src7 + src5;
146 vec3 = vec3 + src3 + src5 +
src1;
156 BUTTERFLY_8(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
157 res0, res1, res2, res3, res4, res5, res6, res7);
159 res0, res1, res2, res3, res4, res5, res6, res7);
168 BUTTERFLY_4(tmp0_r, tmp0_l, tmp4_l, tmp4_r, vec0_r, vec0_l, vec1_l, vec1_r);
170 vec2_r = tmp2_r >> 1;
171 vec2_l = tmp2_l >> 1;
174 vec3_r = tmp6_r >> 1;
175 vec3_l = tmp6_l >> 1;
179 BUTTERFLY_4(vec0_r, vec1_r, vec2_r, vec3_r, tmp0_r, tmp2_r, tmp4_r, tmp6_r);
180 BUTTERFLY_4(vec0_l, vec1_l, vec2_l, vec3_l, tmp0_l, tmp2_l, tmp4_l, tmp6_l);
182 vec0_r = tmp7_r >> 1;
183 vec0_l = tmp7_l >> 1;
184 vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
185 vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
186 vec1_r = tmp3_r >> 1;
187 vec1_l = tmp3_l >> 1;
188 vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
189 vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
190 vec2_r = tmp5_r >> 1;
191 vec2_l = tmp5_l >> 1;
192 vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
193 vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
194 vec3_r = tmp1_r >> 1;
195 vec3_l = tmp1_l >> 1;
196 vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
197 vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
198 tmp1_r = vec3_r >> 2;
199 tmp1_l = vec3_l >> 2;
202 tmp3_r = vec2_r >> 2;
203 tmp3_l = vec2_l >> 2;
206 tmp5_r = vec1_r >> 2;
207 tmp5_l = vec1_l >> 2;
210 tmp7_r = vec0_r >> 2;
211 tmp7_l = vec0_l >> 2;
212 tmp7_r = vec3_r - tmp7_r;
213 tmp7_l = vec3_l - tmp7_l;
215 BUTTERFLY_4(tmp0_r, tmp0_l, tmp7_l, tmp7_r, res0_r, res0_l, res7_l, res7_r);
216 BUTTERFLY_4(tmp2_r, tmp2_l, tmp5_l, tmp5_r, res1_r, res1_l, res6_l, res6_r);
217 BUTTERFLY_4(tmp4_r, tmp4_l, tmp3_l, tmp3_r, res2_r, res2_l, res5_l, res5_r);
218 BUTTERFLY_4(tmp6_r, tmp6_l, tmp1_l, tmp1_r, res3_r, res3_l, res4_l, res4_r);
219 SRA_4V(res0_r, res0_l, res1_r, res1_l, 6);
220 SRA_4V(res2_r, res2_l, res3_r, res3_l, 6);
221 SRA_4V(res4_r, res4_l, res5_r, res5_l, 6);
222 SRA_4V(res6_r, res6_l, res7_r, res7_l, 6);
223 PCKEV_H4_SH(res0_l, res0_r, res1_l, res1_r, res2_l, res2_r, res3_l, res3_r,
224 res0, res1, res2, res3);
225 PCKEV_H4_SH(res4_l, res4_r, res5_l, res5_r, res6_l, res6_r, res7_l, res7_r,
226 res4, res5, res6, res7);
227 LD_SB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
228 ILVR_B4_SH(zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
229 tmp0, tmp1, tmp2, tmp3);
230 ILVR_B4_SH(zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
231 tmp4, tmp5, tmp6, tmp7);
232 ADD4(res0, tmp0, res1, tmp1, res2, tmp2, res3, tmp3,
233 res0, res1, res2, res3);
234 ADD4(res4, tmp4, res5, tmp5, res6, tmp6, res7, tmp7,
235 res4, res5, res6, res7);
237 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
238 dst0, dst1, dst2, dst3);
239 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride)
246 v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
247 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
251 dc_val = (src[0] + 32) >> 6;
252 dc = __msa_fill_h(dc_val);
256 LD_SB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
257 ILVR_B4_SH(zeros, dst0, zeros, dst1, zeros, dst2, zeros, dst3,
258 dst0_r, dst1_r, dst2_r, dst3_r);
259 ILVR_B4_SH(zeros, dst4, zeros, dst5, zeros, dst6, zeros, dst7,
260 dst4_r, dst5_r, dst6_r, dst7_r);
261 ADD4(dst0_r, dc, dst1_r, dc, dst2_r, dc, dst3_r, dc,
262 dst0_r, dst1_r, dst2_r, dst3_r);
263 ADD4(dst4_r, dc, dst5_r, dc, dst6_r, dc, dst7_r, dc,
264 dst4_r, dst5_r, dst6_r, dst7_r);
266 dst4_r, dst5_r, dst6_r, dst7_r);
267 PCKEV_B4_SB(dst1_r, dst0_r, dst3_r, dst2_r, dst5_r, dst4_r, dst7_r, dst6_r,
268 dst0, dst1, dst2, dst3);
269 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride)
274 uint32_t src0_m, src1_m, src2_m, src3_m, out0_m, out1_m, out2_m, out3_m;
275 v16i8 dst0_m = { 0 };
276 v16i8 dst1_m = { 0 };
277 v8i16 hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3;
278 v8i16 inp0_m, inp1_m, res0_m, res1_m,
src1, src3;
280 const v8i16 src2 =
LD_SH(src + 8);
281 const v8i16
zero = { 0 };
282 const uint8_t *dst1 = dst + dst_stride;
283 const uint8_t *dst2 = dst + 2 * dst_stride;
284 const uint8_t *dst3 = dst + 3 * dst_stride;
286 ILVL_D2_SH(src0, src0, src2, src2, src1, src3);
287 ST_SH2(zero, zero, src, 8);
288 AVC_ITRANS_H(src0, src1, src2, src3, hres0, hres1, hres2, hres3);
290 AVC_ITRANS_H(hres0, hres1, hres2, hres3, vres0, vres1, vres2, vres3);
296 ILVR_D2_SH(vres1, vres0, vres3, vres2, inp0_m, inp1_m);
299 ILVR_B2_SH(zero, dst0_m, zero, dst1_m, res0_m, res1_m);
300 ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);
302 PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);
303 out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);
304 out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);
305 out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);
306 out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);
324 v8i16 pred_r, pred_l;
325 const uint32_t
src0 =
LW(dst);
326 const uint32_t
src1 =
LW(dst + dst_stride);
327 const uint32_t src2 =
LW(dst + 2 * dst_stride);
328 const uint32_t src3 =
LW(dst + 3 * dst_stride);
329 const int16_t
dc = (src[0] + 32) >> 6;
330 const v8i16 input_dc = __msa_fill_h(dc);
335 ADD2(pred_r, input_dc, pred_l, input_dc, pred_r, pred_l);
337 out = __msa_pckev_b((v16i8) pred_l, (v16i8) pred_r);
338 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
354 for (i = 0; i < 16; i++) {
358 if (nnz == 1 && ((
dctcoef *) block)[i * 16])
360 block + i * 16 *
sizeof(
pixel),
364 block + i * 16 *
sizeof(
pixel),
376 for (cnt = 0; cnt < 16; cnt += 4) {
380 if (nnz == 1 && ((
dctcoef *) block)[cnt * 16])
382 block + cnt * 16 *
sizeof(
pixel),
386 block + cnt * 16 *
sizeof(
pixel),
399 for (j = 1; j < 3; j++) {
400 for (i = (j * 16); i < (j * 16 + 4); i++) {
403 block + i * 16 *
sizeof(
pixel),
405 else if (((
dctcoef *) block)[i * 16])
407 block + i * 16 *
sizeof(
pixel),
420 for (j = 1; j < 3; j++) {
421 for (i = (j * 16); i < (j * 16 + 4); i++) {
424 block + i * 16 *
sizeof(
pixel),
426 else if (((
dctcoef *) block)[i * 16])
428 block + i * 16 *
sizeof(
pixel),
433 for (j = 1; j < 3; j++) {
434 for (i = (j * 16 + 4); i < (j * 16 + 8); i++) {
435 if (nzc[
scan8[i + 4]])
437 block + i * 16 *
sizeof(
pixel),
439 else if (((
dctcoef *) block)[i * 16])
441 block + i * 16 *
sizeof(
pixel),
455 for (i = 0; i < 16; i++) {
458 block + i * 16 *
sizeof(
pixel), dst_stride);
459 else if (((
dctcoef *) block)[i * 16])
461 block + i * 16 *
sizeof(
pixel),
void ff_h264_idct_add8_msa(uint8_t **dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[15 *8])
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3,idx4, idx5, idx6, idx7, pdst, stride)
static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
static void avc_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src, int32_t de_q_val)
#define AVC_ITRANS_H(in0, in1, in2, in3, out0, out1, out2, out3)
#define SRA_4V(in0, in1, in2, in3, shift)
The exact code depends on how similar the blocks are and how related they are to the block
#define UNPCK_UB_SH(in, out0, out1)
#define CLIP_SH8_0_255(in0, in1, in2, in3,in4, in5, in6, in7)
#define CLIP_SH2_0_255(in0, in1)
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)
void ff_h264_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
#define UNPCK_R_SH_SW(in, out)
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
void ff_h264_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
void ff_h264_idct_add8_422_msa(uint8_t **dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[15 *8])
void ff_h264_idct_add16_intra_msa(uint8_t *dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[15 *8])
#define TRANSPOSE8x8_SH_SH(...)
void ff_h264_deq_idct_luma_dc_msa(int16_t *dst, int16_t *src, int32_t de_qval)
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
void ff_h264_idct8_add4_msa(uint8_t *dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[15 *8])
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,out0, out1, out2, out3, out4, out5, out6, out7)
#define INSERT_W4_UB(...)
void ff_h264_idct4x4_addblk_dc_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
static const float pred[4]
#define UNPCK_SH_SW(in, out0, out1)
static void avc_idct8_addblk_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2]...the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so...,+,-,+,-,+,+,-,+,-,+,...hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32-hcoeff[1]-hcoeff[2]-...a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2}an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||.........intra?||||:Block01:yes no||||:Block02:.................||||:Block03::y DC::ref index:||||:Block04::cb DC::motion x:||||.........:cr DC::motion y:||||.................|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------------------------------|||Y subbands||Cb subbands||Cr subbands||||------||------||------|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||------||------||------||||------||------||------|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||------||------||------||||------||------||------|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||------||------||------||||------||------||------|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------------------------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction------------|\Dequantization-------------------\||Reference frames|\IDWT|--------------|Motion\|||Frame 0||Frame 1||Compensation.OBMC v-------|--------------|--------------.\------> Frame n output Frame Frame<----------------------------------/|...|-------------------Range Coder:============Binary Range Coder:-------------------The implemented range coder is an adapted version based upon"Range encoding: an algorithm for removing redundancy from a digitised message."by G.N.N.Martin.The symbols encoded by the Snow range coder are bits(0|1).The associated probabilities are not fix but change depending on the symbol mix seen so far.bit seen|new state---------+-----------------------------------------------0|256-state_transition_table[256-old_state];1|state_transition_table[old_state];state_transition_table={0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:-------------------------FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector Prediction:=========================1.the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled top and top right vectors is used as motion vector prediction the used motion vector is the sum of the predictor and(mvx_diff, mvy_diff)*mv_scale Intra DC Prediction block[y][x] dc[1]
static const uint8_t scan8[16 *3+3]
#define ADD2(in0, in1, in2, in3, out0, out1)
#define INSERT_W2_SB(...)
void ff_h264_idct_add16_msa(uint8_t *dst, const int32_t *blk_offset, int16_t *block, int32_t dst_stride, const uint8_t nzc[15 *8])
#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)