26 -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
30 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26
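The two rows of values above are the contents of intra_pred_angle_up[17] (modes 18..34) and intra_pred_angle_low[16] (modes 2..17): the per-row/column displacement of each angular mode in 1/32-pel units. As a hedged scalar sketch (helper name and layout are illustrative, not part of this file), an accumulated angle value splits into an integer reference index and a 5-bit blend weight like this:

#include <stdint.h>

/* Illustrative only: blend the two reference samples straddling the
 * projected 1/32-pel position for row y of a vertical-class mode. */
static uint8_t angular_sample_sketch(const uint8_t *ref, int y, int angle)
{
    int pos  = (y + 1) * angle;   /* accumulated displacement, 1/32 pel    */
    int idx  = pos >> 5;          /* integer part: which reference sample  */
    int fact = pos & 31;          /* fractional part: interpolation weight */

    return (uint8_t) (((32 - fact) * ref[idx + 1] +
                       fact * ref[idx + 2] + 16) >> 5);
}

This matches the idx0/fact_val0 and fact0/fact1 = 32 - fact_val0 pairs computed in the angular kernels below.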
33 #define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,         \
34                               mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3,  \
35                               res0, res1, mul_val_b0, mul_val_b1, round)       \
37     v8i16 res0_m, res1_m, res2_m, res3_m;                                      \
39     MUL4(mul_val_h0, vec0, mul_val_h2, vec0, mul_val_h0, vec1,                 \
40          mul_val_h2, vec1, res0_m, res1_m, res2_m, res3_m);                    \
42     res0_m += mul_val_h1 * tmp0;                                               \
43     res1_m += mul_val_h3 * tmp0;                                               \
44     res2_m += mul_val_h1 * tmp0;                                               \
45     res3_m += mul_val_h3 * tmp0;                                               \
47     res0_m += mul_val_b0 * src0_r;                                             \
48     res1_m += mul_val_b0 * src0_l;                                             \
49     res2_m += (mul_val_b0 - 1) * src0_r;                                       \
50     res3_m += (mul_val_b0 - 1) * src0_l;                                       \
52     res0_m += mul_val_b1 * tmp1;                                               \
53     res1_m += mul_val_b1 * tmp1;                                               \
54     res2_m += (mul_val_b1 + 1) * tmp1;                                         \
55     res3_m += (mul_val_b1 + 1) * tmp1;                                         \
57     SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round);                        \
58     PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1);                   \
68 v8i16 vec0, vec1, vec2;
71 src_data = LW(src_top);
72 SW4(src_data, src_data, src_data, src_data, dst, stride);
75 src_data = LW(src_left);
77 vec2 = (v8i16) __msa_insert_w((v4i32) vec2, 0, src_data);
79 vec0 = __msa_fill_h(src_left[-1]);
80 vec1 = __msa_fill_h(src_top[0]);
82 vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
88 for (col = 0; col < 4; col++) {
89 dst[stride * col] = (uint8_t) vec2[col];
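The block above is the 4x4 vertical predictor: the top row is replicated into all four rows, and on the luma path (flag set) the first column is additionally filtered against the left neighbours. A hedged scalar equivalent (the clipping is spelled out rather than using av_clip_uint8):

#include <stdint.h>

static void vert_4x4_sketch(uint8_t *dst, int stride,
                            const uint8_t *src_top, const uint8_t *src_left,
                            int flag)
{
    int x, y;

    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++)
            dst[y * stride + x] = src_top[x];

    if (flag) {  /* luma blocks smaller than 32x32: smooth the first column */
        for (y = 0; y < 4; y++) {
            int v = src_top[0] + ((src_left[y] - src_left[-1]) >> 1);
            dst[y * stride] = v < 0 ? 0 : v > 255 ? 255 : (uint8_t) v;
        }
    }
}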
101 uint16_t val0, val1, val2, val3;
103 v8i16 vec0, vec1, vec2;
106 src_data1 = LD(src_top);
108 for (row = 8; row--;) {
109 SD(src_data1, tmp_dst);
114 src_data1 = LD(src_left);
116 vec2 = (v8i16) __msa_insert_d((v2i64) zero, 0, src_data1);
118 vec0 = __msa_fill_h(src_left[-1]);
119 vec1 = __msa_fill_h(src_top[0]);
121 vec2 = (v8i16) __msa_ilvr_b(zero, (v16i8) vec2);
158 v8i16 vec0, vec1, vec2, vec3;
160 src = LD_UB(src_top);
162 for (row = 16; row--;) {
168 src = LD_UB(src_left);
170 vec0 = __msa_fill_h(src_left[-1]);
171 vec1 = __msa_fill_h(src_top[0]);
174 SUB2(vec2, vec0, vec3, vec0, vec2, vec3);
179 ADD2(vec2, vec1, vec3, vec1, vec2, vec3);
182 src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
184 for (col = 0; col < 16; col++) {
185 dst[stride * col] = src[col];
195 uint32_t val0, val1, val2, val3;
197 v8i16 src0_r, src_top_val, src_left_val;
200 val0 = src_left[0] * 0x01010101;
201 val1 = src_left[1] * 0x01010101;
202 val2 = src_left[2] * 0x01010101;
203 val3 = src_left[3] * 0x01010101;
204 SW4(val0, val1, val2, val3, dst, stride);
208 src0 = (v16i8) __msa_insert_w((v4i32) src0, 0, val0);
209 src_top_val = __msa_fill_h(src_top[-1]);
210 src_left_val = __msa_fill_h(src_left[0]);
212 src0_r = (v8i16) __msa_ilvr_b(zero, src0);
214 src0_r -= src_top_val;
216 src0_r += src_left_val;
218 src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
219 val0 = __msa_copy_s_w((v4i32) src0, 0);
229 uint64_t val0, val1, val2, val3;
231 v8i16 src0_r, src_top_val, src_left_val;
234 val0 = src_left[0] * 0x0101010101010101;
235 val1 = src_left[1] * 0x0101010101010101;
236 val2 = src_left[2] * 0x0101010101010101;
237 val3 = src_left[3] * 0x0101010101010101;
238 SD4(val0, val1, val2, val3, dst, stride);
240 val0 = src_left[4] * 0x0101010101010101;
241 val1 = src_left[5] * 0x0101010101010101;
242 val2 = src_left[6] * 0x0101010101010101;
243 val3 = src_left[7] * 0x0101010101010101;
244 SD4(val0, val1, val2, val3, dst + 4 * stride, stride);
248 src0 = (v16i8) __msa_insert_d((v2i64) src0, 0, val0);
249 src_top_val = __msa_fill_h(src_top[-1]);
250 src_left_val = __msa_fill_h(src_left[0]);
252 src0_r = (v8i16) __msa_ilvr_b(zero, src0);
254 src0_r -= src_top_val;
256 src0_r += src_left_val;
258 src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
259 val0 = __msa_copy_s_d((v2i64) src0, 0);
271 uint8_t inp0, inp1, inp2, inp3;
273 v8i16 src0_r, src0_l, src_left_val, src_top_val;
275 src_left_val = __msa_fill_h(src_left[0]);
277 for (row = 4; row--;) {
284 src0 = __msa_fill_b(inp0);
285 src1 = __msa_fill_b(inp1);
286 src2 = __msa_fill_b(inp2);
287 src3 = __msa_fill_b(inp3);
289 ST_SB4(src0, src1, src2, src3, tmp_dst, stride);
294 src0 = LD_SB(src_top);
295 src_top_val = __msa_fill_h(src_top[-1]);
298 SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l);
303 ADD2(src0_r, src_left_val, src0_l, src_left_val, src0_r, src0_l);
305 src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r);
315 uint8_t inp0, inp1, inp2, inp3;
318 for (row = 0; row < 8; row++) {
319 inp0 = src_left[row * 4];
320 inp1 = src_left[row * 4 + 1];
321 inp2 = src_left[row * 4 + 2];
322 inp3 = src_left[row * 4 + 3];
324 src0 = __msa_fill_b(inp0);
325 src1 = __msa_fill_b(inp1);
326 src2 = __msa_fill_b(inp2);
327 src3 = __msa_fill_b(inp3);
329 ST_SB2(src0, src0, dst, 16);
331 ST_SB2(src1, src1, dst, 16);
333 ST_SB2(src2, src2, dst, 16);
335 ST_SB2(src3, src3, dst, 16);
346 uint32_t addition = 0;
347 uint32_t val0, val1, val2;
351 v8u16 sum, vec0, vec1;
356 sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
357 sum = (v8u16) __msa_hadd_u_w(sum, sum);
358 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
359 sum = (v8u16) __msa_srari_w((v4i32) sum, 3);
360 addition = __msa_copy_u_w((v4i32) sum, 0);
361 store = (v16u8) __msa_fill_b(addition);
362 val0 = __msa_copy_u_w((v4i32) store, 0);
363 SW4(val0, val0, val0, val0, dst, stride)
366 ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
372 vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
373 store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
374 val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
375 store = (v16u8) __msa_insert_b((v16i8) store, 0, val1);
376 val0 = __msa_copy_u_w((v4i32) store, 0);
385 ADD2(val0, addition, val1, addition, val0, val1);
395 tmp_dst[stride * 1] = val0;
396 tmp_dst[stride * 2] = val1;
397 tmp_dst[stride * 3] = val2;
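In scalar terms, the 4x4 DC path above computes the rounded mean of the four top and four left neighbours, fills the block with it, and (luma only) smooths the first row and column towards the neighbours. A hedged sketch of the same arithmetic:

#include <stdint.h>

static void dc_4x4_sketch(uint8_t *dst, int stride,
                          const uint8_t *top, const uint8_t *left)
{
    int x, y, sum = 0, dc;

    for (x = 0; x < 4; x++)
        sum += top[x] + left[x];
    dc = (sum + 4) >> 3;                       /* rounded mean of 8 samples */

    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++)
            dst[y * stride + x] = (uint8_t) dc;

    /* boundary smoothing, applied only to luma blocks smaller than 32x32 */
    dst[0] = (uint8_t) ((left[0] + 2 * dc + top[0] + 2) >> 2);
    for (x = 1; x < 4; x++)
        dst[x] = (uint8_t) ((top[x] + 3 * dc + 2) >> 2);
    for (y = 1; y < 4; y++)
        dst[y * stride] = (uint8_t) ((left[y] + 3 * dc + 2) >> 2);
}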
407 uint32_t row, col, val;
408 uint32_t addition = 0;
412 v8u16 sum, vec0, vec1;
418 sum = __msa_hadd_u_h((v16u8) src, (v16u8) src);
419 sum = (v8u16) __msa_hadd_u_w(sum, sum);
420 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
421 sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
422 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
423 sum = (v8u16) __msa_srari_w((v4i32) sum, 4);
424 addition = __msa_copy_u_w((v4i32) sum, 0);
425 store = (v16u8) __msa_fill_b(addition);
426 val0 = __msa_copy_u_d((v2i64) store, 0);
428 for (row = 8; row--;) {
434 ILVR_B2_UH(zero, store, zero, src, vec0, vec1);
439 vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
440 store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
441 val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
442 store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
443 val0 = __msa_copy_u_d((v2i64) store, 0);
447 src = (v16u8) __msa_insert_d((v2i64) src, 0, val0);
448 vec1 = (v8u16) __msa_ilvr_b(zero, (v16i8) src);
449 vec0 = (v8u16) __msa_fill_h(addition);
452 vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
454 for (col = 1; col < 8; col++) {
455 tmp_dst[stride * col] = vec1[col];
466 uint32_t row, col, val;
467 uint32_t addition = 0;
468 v16u8 src_above1, store, src_left1;
469 v8u16 sum, sum_above, sum_left;
470 v8u16 vec0, vec1, vec2;
473 src_above1 = LD_UB(src_top);
474 src_left1 = LD_UB(src_left);
476 HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left);
477 sum = sum_above + sum_left;
478 sum = (v8u16) __msa_hadd_u_w(sum, sum);
479 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
480 sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
481 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
482 sum = (v8u16) __msa_srari_w((v4i32) sum, 5);
483 addition = __msa_copy_u_w((v4i32) sum, 0);
484 store = (v16u8) __msa_fill_b(addition);
486 for (row = 16; row--;) {
492 vec0 = (v8u16) __msa_ilvr_b(zero, (v16i8) store);
494 ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
496 ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
498 store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
499 val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
500 store = (v16u8) __msa_insert_b((v16i8) store, 0, val);
501 ST_UB(store, tmp_dst);
504 vec0 = (v8u16) __msa_fill_h(addition);
506 ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
508 store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
510 for (col = 1; col < 16; col++) {
511 tmp_dst[stride * col] = store[col];
521 v16u8 src_above1, src_above2, store, src_left1, src_left2;
522 v8u16 sum_above1, sum_above2;
523 v8u16 sum_left1, sum_left2;
524 v8u16 sum, sum_above, sum_left;
526 LD_UB2(src_top, 16, src_above1, src_above2);
527 LD_UB2(src_left, 16, src_left1, src_left2);
528 HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
529 HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2);
530 sum_above = sum_above1 + sum_above2;
531 sum_left = sum_left1 + sum_left2;
532 sum = sum_above + sum_left;
533 sum = (v8u16) __msa_hadd_u_w(sum, sum);
534 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
535 sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
536 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
537 sum = (v8u16) __msa_srari_w((v4i32) sum, 6);
538 store = (v16u8) __msa_splati_b((v16i8) sum, 0);
540 for (row = 16; row--;) {
541 ST_UB2(store, store, dst, 16);
543 ST_UB2(store, store, dst, 16);
553 v16i8 src_vec0, src_vec1;
554 v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1;
555 v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3;
556 v8i16 mul_val0 = { 3, 2, 1, 0, 1, 2, 3, 4 };
562 mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0);
564 src_vec0 = (v16i8) __msa_insert_w((v4i32) zero, 0, src0);
565 src_vec1 = (v16i8) __msa_insert_w((v4i32) zero, 0, src1);
567 ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src1_r);
568 SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
570 tmp0 = __msa_fill_h(src_top[4]);
571 tmp1 = __msa_fill_h(src_left[4]);
573 MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
574 res0, res1, res2, res3);
576 res0 += mul_val1 * tmp0;
577 res1 += mul_val1 * tmp0;
578 res2 += mul_val1 * tmp0;
579 res3 += mul_val1 * tmp0;
581 res0 += 3 * src_vec0_r;
582 res1 += 2 * src_vec0_r;
591 src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
592 ST_W4(src_vec0, 0, 1, 2, 3, dst, stride);
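The planar kernel above evaluates, for each pixel, a blend of a horizontal interpolation (between left[y] and top[4]) and a vertical one (between top[x] and left[4]); mul_val0/mul_val1 hold the (3 - x) and (x + 1) weights. A hedged scalar sketch of the 4x4 case, rounding by 4 and shifting by log2(4) + 1:

#include <stdint.h>

static void planar_4x4_sketch(uint8_t *dst, int stride,
                              const uint8_t *top, const uint8_t *left)
{
    int x, y;

    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++)
            dst[y * stride + x] = (uint8_t)
                (((3 - x) * left[y] + (x + 1) * top[4] +
                  (3 - y) * top[x]  + (y + 1) * left[4] + 4) >> 3);
}

The larger sizes only change the weights and the final shift (5 for 16x16, 6 for 32x32, as the HEVC_PRED_PLANAR_16x2 round arguments below show).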
600 v16i8 src_vec0, src_vec1, src_vec2, src_vec3;
601 v8i16 src_vec0_r, src_vec1_r;
602 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
603 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
604 v8i16 tmp0, tmp1, tmp2;
605 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
606 v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 };
612 src_vec0 = (v16i8) __msa_insert_d((v2i64) zero, 0, src0);
613 src_vec1 = (v16i8) __msa_insert_d((v2i64) zero, 0, src1);
615 ILVR_B2_SH(zero, src_vec0, zero, src_vec1, src_vec0_r, src_vec1_r);
616 SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
617 SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7);
619 tmp0 = __msa_fill_h(src_top[8]);
620 tmp1 = __msa_fill_h(src_left[8]);
622 MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
623 res0, res1, res2, res3);
624 MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7,
625 res4, res5, res6, res7);
627 tmp2 = mul_val1 * tmp0;
637 res0 += 7 * src_vec0_r;
638 res1 += 6 * src_vec0_r;
639 res2 += 5 * src_vec0_r;
640 res3 += 4 * src_vec0_r;
641 res4 += 3 * src_vec0_r;
642 res5 += 2 * src_vec0_r;
656 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
657 src_vec0, src_vec1, src_vec2, src_vec3);
659 ST_D8(src_vec0, src_vec1, src_vec2, src_vec3, 0, 1, 0, 1,
660 0, 1, 0, 1, dst, stride);
668 v8i16 src0_r, src1_r, src0_l, src1_l;
670 v8i16 res0, res1, tmp0, tmp1;
671 v8i16 mul_val2, mul_val3;
672 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
673 v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 };
675 src0 = LD_UB(src_top);
676 src1 = LD_UB(src_left);
681 mul_val2 = mul_val0 - 8;
682 mul_val3 = mul_val1 + 8;
684 tmp0 = __msa_fill_h(src_top[16]);
685 tmp1 = __msa_fill_h(src_left[16]);
689 mul_val0, mul_val1, mul_val2, mul_val3,
690 res0, res1, 15, 1, 5);
691 ST_SH2(res0, res1, dst, stride);
696 mul_val0, mul_val1, mul_val2, mul_val3,
697 res0, res1, 13, 3, 5);
698 ST_SH2(res0, res1, dst, stride);
703 mul_val0, mul_val1, mul_val2, mul_val3,
704 res0, res1, 11, 5, 5);
705 ST_SH2(res0, res1, dst, stride);
710 mul_val0, mul_val1, mul_val2, mul_val3,
711 res0, res1, 9, 7, 5);
712 ST_SH2(res0, res1, dst, stride);
717 mul_val0, mul_val1, mul_val2, mul_val3,
718 res0, res1, 7, 9, 5);
719 ST_SH2(res0, res1, dst, stride);
724 mul_val0, mul_val1, mul_val2, mul_val3,
725 res0, res1, 5, 11, 5);
726 ST_SH2(res0, res1, dst, stride);
731 mul_val0, mul_val1, mul_val2, mul_val3,
732 res0, res1, 3, 13, 5);
733 ST_SH2(res0, res1, dst, stride);
738 mul_val0, mul_val1, mul_val2, mul_val3,
739 res0, res1, 1, 15, 5);
740 ST_SH2(res0, res1, dst, stride);
749 v8i16 src0_r, src1_r, src0_l, src1_l;
750 v8i16 vec0, vec1, res0, res1;
752 v8i16 mul_val2, mul_val3;
753 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
754 v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
756 tmp0 = __msa_fill_h(src_top[32 - offset]);
757 tmp1 = __msa_fill_h(src_left[32]);
759 src0 = LD_SB(src_top);
760 src1 = LD_SB(src_left);
767 mul_val2 = mul_val0 - 8;
768 mul_val3 = mul_val1 + 8;
772 mul_val0, mul_val1, mul_val2, mul_val3,
773 res0, res1, 31, 1, 6);
774 ST_SH2(res0, res1, dst, stride);
779 mul_val0, mul_val1, mul_val2, mul_val3,
780 res0, res1, 29, 3, 6);
781 ST_SH2(res0, res1, dst, stride);
786 mul_val0, mul_val1, mul_val2, mul_val3,
787 res0, res1, 27, 5, 6);
788 ST_SH2(res0, res1, dst, stride);
793 mul_val0, mul_val1, mul_val2, mul_val3,
794 res0, res1, 25, 7, 6);
795 ST_SH2(res0, res1, dst, stride);
800 mul_val0, mul_val1, mul_val2, mul_val3,
801 res0, res1, 23, 9, 6);
802 ST_SH2(res0, res1, dst, stride);
807 mul_val0, mul_val1, mul_val2, mul_val3,
808 res0, res1, 21, 11, 6);
809 ST_SH2(res0, res1, dst, stride);
814 mul_val0, mul_val1, mul_val2, mul_val3,
815 res0, res1, 19, 13, 6);
816 ST_SH2(res0, res1, dst, stride);
821 mul_val0, mul_val1, mul_val2, mul_val3,
822 res0, res1, 17, 15, 6);
823 ST_SH2(res0, res1, dst, stride);
832 v8i16 src0_r, src1_r, src0_l, src1_l;
833 v8i16 vec0, vec1, res0, res1, tmp0, tmp1;
834 v8i16 mul_val2, mul_val3;
835 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
836 v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
838 tmp0 = __msa_fill_h(src_top[32 - offset]);
839 tmp1 = __msa_fill_h(src_left[16]);
841 src0 = LD_SB(src_top);
842 src1 = LD_SB(src_left);
849 mul_val2 = mul_val0 - 8;
850 mul_val3 = mul_val1 + 8;
854 mul_val0, mul_val1, mul_val2, mul_val3,
855 res0, res1, 15, 17, 6);
856 ST_SH2(res0, res1, dst, stride);
861 mul_val0, mul_val1, mul_val2, mul_val3,
862 res0, res1, 13, 19, 6);
863 ST_SH2(res0, res1, dst, stride);
868 mul_val0, mul_val1, mul_val2, mul_val3,
869 res0, res1, 11, 21, 6);
870 ST_SH2(res0, res1, dst, stride);
875 mul_val0, mul_val1, mul_val2, mul_val3,
876 res0, res1, 9, 23, 6);
877 ST_SH2(res0, res1, dst, stride);
882 mul_val0, mul_val1, mul_val2, mul_val3,
883 res0, res1, 7, 25, 6);
884 ST_SH2(res0, res1, dst, stride);
889 mul_val0, mul_val1, mul_val2, mul_val3,
890 res0, res1, 5, 27, 6);
891 ST_SH2(res0, res1, dst, stride);
896 mul_val0, mul_val1, mul_val2, mul_val3,
897 res0, res1, 3, 29, 6);
898 ST_SH2(res0, res1, dst, stride);
903 mul_val0, mul_val1, mul_val2, mul_val3,
904 res0, res1, 1, 31, 6);
905 ST_SH2(res0, res1, dst, stride);
914 (dst + 16), stride, 16);
920 (dst + 16), stride, 16);
929 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
931 uint8_t *ref_tmp = ref_array + 4;
934 int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
935 int32_t idx2, fact_val2, idx3, fact_val3;
939 v16i8 top0, top1, top2, top3;
942 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
943 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
946 inv_angle_val = inv_angle[mode - 18];
951 if (angle < 0 && last < -1) {
952 inv_angle_val = inv_angle[mode - 18];
957 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
958 offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
959 ref_tmp[h_cnt] = src_left[offset];
965 idx0 = angle_loop >> 5;
966 fact_val0 = angle_loop & 31;
969 idx1 = angle_loop >> 5;
970 fact_val1 = angle_loop & 31;
973 idx2 = angle_loop >> 5;
974 fact_val2 = angle_loop & 31;
977 idx3 = angle_loop >> 5;
978 fact_val3 = angle_loop & 31;
980 top0 = LD_SB(ref + idx0 + 1);
981 top1 = LD_SB(ref + idx1 + 1);
982 top2 = LD_SB(ref + idx2 + 1);
983 top3 = LD_SB(ref + idx3 + 1);
985 fact0 = __msa_fill_h(fact_val0);
986 fact1 = __msa_fill_h(32 - fact_val0);
988 fact2 = __msa_fill_h(fact_val1);
989 fact3 = __msa_fill_h(32 - fact_val1);
991 fact4 = __msa_fill_h(fact_val2);
992 fact5 = __msa_fill_h(32 - fact_val2);
994 fact6 = __msa_fill_h(fact_val3);
995 fact7 = __msa_fill_h(32 - fact_val3);
997 ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
998 ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
999 ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
1000 diff0, diff2, diff4, diff6);
1001 SLDI_B4_SH(zero, diff0, zero, diff2, zero, diff4, zero, diff6, 2,
1002 diff1, diff3, diff5, diff7);
1003 ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
1004 ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
1005 MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
1007 diff1 += diff0 * fact1;
1008 diff3 += diff2 * fact3;
1011 dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
1012 ST_W4(dst_val0, 0, 1, 2, 3, dst, stride);
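Putting the 4-wide angular (upper, modes 18..34) kernel above into scalar form: the top reference is optionally extended to the left by projecting left samples through the inverse angle, then each row is interpolated at 1/32-pel accuracy. A hedged sketch (buffer sizes and the padding sample are assumptions made to keep the sketch self-contained):

#include <stdint.h>

static void angular_upper_sketch(uint8_t *dst, int stride, int size,
                                 const uint8_t *top, const uint8_t *left,
                                 int angle, int inv_angle)
{
    uint8_t ref_array[3 * 32 + 4];
    uint8_t *ref = ref_array + size;
    int x, y, last = (size * angle) >> 5;

    for (x = 0; x < 2 * size + 1; x++)       /* ref[0] is the corner sample */
        ref[x] = top[x - 1];
    ref[2 * size + 1] = top[2 * size - 1];   /* pad; its weight is 0 when read */

    if (angle < 0 && last < -1)              /* project left samples above */
        for (y = last; y <= -1; y++)
            ref[y] = left[-1 + ((y * inv_angle + 128) >> 8)];

    for (y = 0; y < size; y++) {
        int pos  = (y + 1) * angle;
        int idx  = pos >> 5;
        int fact = pos & 31;
        for (x = 0; x < size; x++)
            dst[y * stride + x] = (uint8_t)
                (((32 - fact) * ref[x + idx + 1] +
                  fact * ref[x + idx + 2] + 16) >> 5);
    }
}

The 8-, 16- and 32-wide versions below vectorize exactly this inner loop, processing several rows per iteration.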
1021 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1022 uint8_t ref_array[3 * 32 + 4];
1023 uint8_t *ref_tmp = ref_array + 8;
1025 const uint8_t *src_left_tmp = src_left - 1;
1027 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1028 int32_t idx2, fact_val2, idx3, fact_val3;
1030 int32_t inv_angle_val, inv_angle_val_loop;
1032 v16i8 top0, top1, top2, top3;
1033 v16u8 dst_val0, dst_val1;
1034 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1035 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1038 inv_angle_val = inv_angle[mode - 18];
1039 last = (angle) >> 2;
1044 inv_angle_val_loop = inv_angle_val * last;
1050 SW(tmp1, ref_tmp + 4);
1051 SW(tmp2, ref_tmp + 8);
1053 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1054 offset = (inv_angle_val_loop + 128) >> 8;
1055 ref_tmp[h_cnt] = src_left_tmp[offset];
1056 inv_angle_val_loop += inv_angle_val;
1061 for (v_cnt = 0; v_cnt < 2; v_cnt++) {
1062 idx0 = (angle_loop) >> 5;
1063 fact_val0 = (angle_loop) & 31;
1064 angle_loop += angle;
1066 idx1 = (angle_loop) >> 5;
1067 fact_val1 = (angle_loop) & 31;
1068 angle_loop += angle;
1070 idx2 = (angle_loop) >> 5;
1071 fact_val2 = (angle_loop) & 31;
1072 angle_loop += angle;
1074 idx3 = (angle_loop) >> 5;
1075 fact_val3 = (angle_loop) & 31;
1076 angle_loop += angle;
1078 top0 = LD_SB(ref + idx0 + 1);
1079 top1 = LD_SB(ref + idx1 + 1);
1080 top2 = LD_SB(ref + idx2 + 1);
1081 top3 = LD_SB(ref + idx3 + 1);
1083 fact0 = __msa_fill_h(fact_val0);
1084 fact1 = __msa_fill_h(32 - fact_val0);
1085 fact2 = __msa_fill_h(fact_val1);
1086 fact3 = __msa_fill_h(32 - fact_val1);
1087 fact4 = __msa_fill_h(fact_val2);
1088 fact5 = __msa_fill_h(32 - fact_val2);
1089 fact6 = __msa_fill_h(fact_val3);
1090 fact7 = __msa_fill_h(32 - fact_val3);
1097 SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
1098 diff1, diff3, diff5, diff7);
1099 MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
1100 diff1, diff3, diff5, diff7);
1102 diff1 += diff0 * fact1;
1103 diff3 += diff2 * fact3;
1104 diff5 += diff4 * fact5;
1105 diff7 += diff6 * fact7;
1108 PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
1109 ST_D4(dst_val0, dst_val1, 0, 1, 0, 1, dst, stride);
1120 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1121 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1122 int32_t idx2, fact_val2, idx3, fact_val3;
1125 int32_t inv_angle_val, inv_angle_val_loop;
1126 uint8_t ref_array[3 * 32 + 4];
1127 uint8_t *ref_tmp = ref_array + 16;
1129 const uint8_t *src_left_tmp = src_left - 1;
1131 v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
1132 v16i8 dst0, dst1, dst2, dst3;
1133 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1134 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1135 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1138 inv_angle_val = inv_angle[mode - 18];
1144 inv_angle_val_loop = inv_angle_val * last;
1147 tmp0 = LW(ref + 16);
1148 ST_UB(top0, ref_tmp);
1149 SW(tmp0, ref_tmp + 16);
1151 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1152 offset = (inv_angle_val_loop + 128) >> 8;
1153 ref_tmp[h_cnt] = src_left_tmp[offset];
1154 inv_angle_val_loop += inv_angle_val;
1159 for (v_cnt = 4; v_cnt--;) {
1160 idx0 = (angle_loop) >> 5;
1161 fact_val0 = (angle_loop) & 31;
1162 angle_loop += angle;
1164 idx1 = (angle_loop) >> 5;
1165 fact_val1 = (angle_loop) & 31;
1166 angle_loop += angle;
1168 idx2 = (angle_loop) >> 5;
1169 fact_val2 = (angle_loop) & 31;
1170 angle_loop += angle;
1172 idx3 = (angle_loop) >> 5;
1173 fact_val3 = (angle_loop) & 31;
1174 angle_loop += angle;
1176 LD_UB2(ref + idx0 + 1, 16, top0, top1);
1177 LD_UB2(ref + idx1 + 1, 16, top2, top3);
1178 LD_UB2(ref + idx2 + 1, 16, top4, top5);
1179 LD_UB2(ref + idx3 + 1, 16, top6, top7);
1181 fact0 = __msa_fill_h(fact_val0);
1182 fact1 = __msa_fill_h(32 - fact_val0);
1183 fact2 = __msa_fill_h(fact_val1);
1184 fact3 = __msa_fill_h(32 - fact_val1);
1185 fact4 = __msa_fill_h(fact_val2);
1186 fact5 = __msa_fill_h(32 - fact_val2);
1187 fact6 = __msa_fill_h(fact_val3);
1188 fact7 = __msa_fill_h(32 - fact_val3);
1190 SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1191 top1, top3, top5, top7);
1201 MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
1202 diff2, diff3, diff6, diff7);
1203 MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
1204 diff10, diff11, diff14, diff15);
1206 diff2 += diff0 * fact1;
1207 diff3 += diff1 * fact1;
1208 diff6 += diff4 * fact3;
1209 diff7 += diff5 * fact3;
1210 diff10 += diff8 * fact5;
1211 diff11 += diff9 * fact5;
1212 diff14 += diff12 * fact7;
1213 diff15 += diff13 * fact7;
1217 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1218 dst0, dst1, dst2, dst3);
1219 ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
1230 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1231 uint8_t ref_array[3 * 32 + 4];
1234 const uint8_t *src_left_tmp = src_left - 1;
1235 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1236 int32_t tmp0, tmp1, tmp2, tmp3;
1238 int32_t inv_angle_val, inv_angle_val_loop;
1240 v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
1241 v16i8 dst0, dst1, dst2, dst3;
1242 v8i16 fact0, fact1, fact2, fact3;
1243 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1244 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1246 ref_tmp = ref_array + 32;
1249 inv_angle_val = inv_angle[mode - 18];
1255 inv_angle_val_loop = inv_angle_val * last;
1256 LD_UB2(ref, 16, top0, top1);
1262 ST_UB2(top0, top1, ref_tmp, 16);
1268 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1269 offset = (inv_angle_val_loop + 128) >> 8;
1270 ref_tmp[h_cnt] = src_left_tmp[offset];
1271 inv_angle_val_loop += inv_angle_val;
1277 for (v_cnt = 16; v_cnt--;) {
1278 idx0 = (angle_loop) >> 5;
1279 fact_val0 = (angle_loop) & 31;
1280 angle_loop += angle;
1282 idx1 = (angle_loop) >> 5;
1283 fact_val1 = (angle_loop) & 31;
1284 angle_loop += angle;
1286 top0 = LD_UB(ref + idx0 + 1);
1287 top4 = LD_UB(ref + idx1 + 1);
1288 top1 = LD_UB(ref + idx0 + 17);
1289 top5 = LD_UB(ref + idx1 + 17);
1290 top3 = LD_UB(ref + idx0 + 33);
1291 top7 = LD_UB(ref + idx1 + 33);
1293 fact0 = __msa_fill_h(fact_val0);
1294 fact1 = __msa_fill_h(32 - fact_val0);
1295 fact2 = __msa_fill_h(fact_val1);
1296 fact3 = __msa_fill_h(32 - fact_val1);
1301 SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1302 top1, top3, top5, top7);
1312 MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
1313 diff2, diff3, diff6, diff7);
1314 MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
1315 diff10, diff11, diff14, diff15);
1317 diff2 += diff0 * fact1;
1318 diff3 += diff1 * fact1;
1319 diff6 += diff4 * fact1;
1320 diff7 += diff5 * fact1;
1321 diff10 += diff8 * fact3;
1322 diff11 += diff9 * fact3;
1323 diff14 += diff12 * fact3;
1324 diff15 += diff13 * fact3;
1328 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1329 dst0, dst1, dst2, dst3);
1331 ST_SB2(dst0, dst1, dst, 16);
1333 ST_SB2(dst2, dst3, dst, 16);
1344 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1345 uint8_t ref_array[3 * 32 + 4];
1346 uint8_t *ref_tmp = ref_array + 4;
1349 int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
1350 int32_t idx2, fact_val2, idx3, fact_val3;
1351 int32_t angle, angle_loop, inv_angle_val;
1353 v16i8 dst_val0, dst_val1;
1354 v16u8 top0, top1, top2, top3;
1356 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1357 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1365 inv_angle_val = inv_angle[mode - 11];
1370 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1371 offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
1372 ref_tmp[h_cnt] = src_top[offset];
1378 idx0 = angle_loop >> 5;
1379 fact_val0 = angle_loop & 31;
1380 angle_loop += angle;
1382 idx1 = angle_loop >> 5;
1383 fact_val1 = angle_loop & 31;
1384 angle_loop += angle;
1386 idx2 = angle_loop >> 5;
1387 fact_val2 = angle_loop & 31;
1388 angle_loop += angle;
1390 idx3 = angle_loop >> 5;
1391 fact_val3 = angle_loop & 31;
1393 top0 = LD_UB(ref + idx0 + 1);
1394 top1 = LD_UB(ref + idx1 + 1);
1395 top2 = LD_UB(ref + idx2 + 1);
1396 top3 = LD_UB(ref + idx3 + 1);
1398 fact0 = __msa_fill_h(fact_val0);
1399 fact1 = __msa_fill_h(32 - fact_val0);
1400 fact2 = __msa_fill_h(fact_val1);
1401 fact3 = __msa_fill_h(32 - fact_val1);
1402 fact4 = __msa_fill_h(fact_val2);
1403 fact5 = __msa_fill_h(32 - fact_val2);
1404 fact6 = __msa_fill_h(fact_val3);
1405 fact7 = __msa_fill_h(32 - fact_val3);
1407 ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
1408 ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
1409 ILVR_B4_SH(zero, top0, zero, top1, zero, top2, zero, top3,
1410 diff0, diff2, diff4, diff6);
1411 SLDI_B4_SH(zero, diff0, zero, diff2, zero, diff4, zero, diff6, 2,
1412 diff1, diff3, diff5, diff7);
1413 ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
1414 ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
1415 MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
1417 diff1 += diff0 * fact1;
1418 diff3 += diff2 * fact3;
1421 PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1);
1423 diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0);
1424 diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0);
1426 diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0);
1428 dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
1429 dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);
1431 ST_W2(dst_val0, 0, 1, dst, stride);
1432 ST_W2(dst_val1, 0, 1, dst + 2 * stride, stride);
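The lower-mode (2..17) kernels, of which the 4-wide variant ends above, run the same interpolation but advance the fractional position once per column over a reference built from the left neighbours, then transpose the result with pack/unpack operations before storing. A hedged scalar fragment showing the swapped roles of x and y (ref here would be built from src_left, with src_top used for the projection step):

#include <stdint.h>

static void angular_lower_sketch(uint8_t *dst, int stride, int size,
                                 const uint8_t *ref, int angle)
{
    int x, y;

    for (x = 0; x < size; x++) {
        int pos  = (x + 1) * angle;
        int idx  = pos >> 5;
        int fact = pos & 31;
        for (y = 0; y < size; y++)
            dst[y * stride + x] = (uint8_t)
                (((32 - fact) * ref[y + idx + 1] +
                  fact * ref[y + idx + 2] + 16) >> 5);
    }
}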
1441 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1442 uint8_t ref_array[3 * 32 + 4];
1443 uint8_t *ref_tmp = ref_array + 8;
1445 const uint8_t *src_top_tmp = src_top - 1;
1448 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1449 int32_t idx2, fact_val2, idx3, fact_val3;
1450 int32_t angle, angle_loop, inv_angle_val;
1451 v16i8 top0, top1, top2, top3;
1452 v16i8 dst_val0, dst_val1, dst_val2, dst_val3;
1453 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1454 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1457 last = (angle) >> 2;
1462 inv_angle_val = inv_angle[mode - 11];
1468 SW(tmp1, ref_tmp + 4);
1469 SW(tmp2, ref_tmp + 8);
1471 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1472 offset = (h_cnt * inv_angle_val + 128) >> 8;
1473 ref_tmp[h_cnt] = src_top_tmp[offset];
1479 for (v_cnt = 0; v_cnt < 2; v_cnt++) {
1482 idx0 = angle_loop >> 5;
1483 fact_val0 = angle_loop & 31;
1484 angle_loop += angle;
1486 idx1 = angle_loop >> 5;
1487 fact_val1 = angle_loop & 31;
1488 angle_loop += angle;
1490 idx2 = angle_loop >> 5;
1491 fact_val2 = angle_loop & 31;
1492 angle_loop += angle;
1494 idx3 = angle_loop >> 5;
1495 fact_val3 = angle_loop & 31;
1496 angle_loop += angle;
1498 top0 = LD_SB(ref + idx0 + 1);
1499 top1 = LD_SB(ref + idx1 + 1);
1500 top2 = LD_SB(ref + idx2 + 1);
1501 top3 = LD_SB(ref + idx3 + 1);
1503 fact0 = __msa_fill_h(fact_val0);
1504 fact1 = __msa_fill_h(32 - fact_val0);
1505 fact2 = __msa_fill_h(fact_val1);
1506 fact3 = __msa_fill_h(32 - fact_val1);
1507 fact4 = __msa_fill_h(fact_val2);
1508 fact5 = __msa_fill_h(32 - fact_val2);
1509 fact6 = __msa_fill_h(fact_val3);
1510 fact7 = __msa_fill_h(32 - fact_val3);
1516 SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
1517 diff1, diff3, diff5, diff7);
1518 MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
1519 diff1, diff3, diff5, diff7);
1521 diff1 += diff0 * fact1;
1522 diff3 += diff2 * fact3;
1523 diff5 += diff4 * fact5;
1524 diff7 += diff6 * fact7;
1527 PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7,
1528 dst_val0, dst_val1, dst_val2, dst_val3);
1529 ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
1531 ST_W8(diff3, diff4, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
1542 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1543 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1544 int32_t idx2, fact_val2, idx3, fact_val3, tmp0;
1545 v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
1546 v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
1547 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1548 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1549 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1551 uint8_t ref_array[3 * 32 + 4];
1552 uint8_t *ref_tmp = ref_array + 16;
1553 const uint8_t *ref, *src_top_tmp = src_top - 1;
1558 last = (angle) >> 1;
1563 inv_angle_val = inv_angle[mode - 11];
1566 tmp0 = LW(ref + 16);
1567 ST_SB(top0, ref_tmp);
1568 SW(tmp0, ref_tmp + 16);
1570 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1571 offset = (h_cnt * inv_angle_val + 128) >> 8;
1572 ref_tmp[h_cnt] = src_top_tmp[offset];
1578 for (v_cnt = 0; v_cnt < 4; v_cnt++) {
1581 idx0 = angle_loop >> 5;
1582 fact_val0 = angle_loop & 31;
1583 angle_loop += angle;
1585 idx1 = angle_loop >> 5;
1586 fact_val1 = angle_loop & 31;
1587 angle_loop += angle;
1589 idx2 = angle_loop >> 5;
1590 fact_val2 = angle_loop & 31;
1591 angle_loop += angle;
1593 idx3 = angle_loop >> 5;
1594 fact_val3 = angle_loop & 31;
1595 angle_loop += angle;
1597 LD_SB2(ref + idx0 + 1, 16, top0, top1);
1598 LD_SB2(ref + idx1 + 1, 16, top2, top3);
1599 LD_SB2(ref + idx2 + 1, 16, top4, top5);
1600 LD_SB2(ref + idx3 + 1, 16, top6, top7);
1602 fact0 = __msa_fill_h(fact_val0);
1603 fact1 = __msa_fill_h(32 - fact_val0);
1604 fact2 = __msa_fill_h(fact_val1);
1605 fact3 = __msa_fill_h(32 - fact_val1);
1606 fact4 = __msa_fill_h(fact_val2);
1607 fact5 = __msa_fill_h(32 - fact_val2);
1608 fact6 = __msa_fill_h(fact_val3);
1609 fact7 = __msa_fill_h(32 - fact_val3);
1611 SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1612 top1, top3, top5, top7);
1623 MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
1624 diff2, diff3, diff6, diff7);
1625 MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
1626 diff10, diff11, diff14, diff15);
1628 diff2 += diff0 * fact1;
1629 diff3 += diff1 * fact1;
1630 diff6 += diff4 * fact3;
1631 diff7 += diff5 * fact3;
1632 diff10 += diff8 * fact5;
1633 diff11 += diff9 * fact5;
1634 diff14 += diff12 * fact7;
1635 diff15 += diff13 * fact7;
1639 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1640 dst_val0, dst_val1, dst_val2, dst_val3);
1641 ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
1642 ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
1645 ST_W8(diff4, diff5, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
1647 ST_W8(diff6, diff7, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
1658 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1659 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0;
1660 v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
1661 v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
1662 v8i16 fact0, fact1, fact2, fact3;
1663 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1664 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1666 uint8_t ref_array[3 * 32 + 4];
1667 uint8_t *ref_tmp = ref_array + 32;
1668 const uint8_t *ref, *src_top_tmp = src_top - 1;
1678 inv_angle_val = inv_angle[mode - 11];
1680 LD_SB2(ref, 16, top0, top1);
1681 tmp0 = LW(ref + 32);
1682 ST_SB2(top0, top1, ref_tmp, 16);
1683 SW(tmp0, ref_tmp + 32);
1685 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1686 offset = (h_cnt * inv_angle_val + 128) >> 8;
1687 ref_tmp[h_cnt] = src_top_tmp[offset];
1693 for (v_cnt = 0; v_cnt < 16; v_cnt++) {
1695 idx0 = angle_loop >> 5;
1696 fact_val0 = angle_loop & 31;
1697 angle_loop += angle;
1699 idx1 = angle_loop >> 5;
1700 fact_val1 = angle_loop & 31;
1701 angle_loop += angle;
1703 top0 = LD_SB(ref + idx0 + 1);
1704 top4 = LD_SB(ref + idx1 + 1);
1705 top1 = LD_SB(ref + idx0 + 17);
1706 top5 = LD_SB(ref + idx1 + 17);
1707 top3 = LD_SB(ref + idx0 + 33);
1708 top7 = LD_SB(ref + idx1 + 33);
1710 fact0 = __msa_fill_h(fact_val0);
1711 fact1 = __msa_fill_h(32 - fact_val0);
1712 fact2 = __msa_fill_h(fact_val1);
1713 fact3 = __msa_fill_h(32 - fact_val1);
1718 SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1719 top1, top3, top5, top7);
1730 MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
1731 diff2, diff3, diff6, diff7);
1732 MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
1733 diff10, diff11, diff14, diff15);
1735 diff2 += diff0 * fact1;
1736 diff3 += diff1 * fact1;
1737 diff6 += diff4 * fact1;
1738 diff7 += diff5 * fact1;
1739 diff10 += diff8 * fact3;
1740 diff11 += diff9 * fact3;
1741 diff14 += diff12 * fact3;
1742 diff15 += diff13 * fact3;
1746 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1747 dst_val0, dst_val1, dst_val2, dst_val3);
1751 ST_H8(diff0, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1753 ST_H8(diff1, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1755 ST_H8(diff2, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1757 ST_H8(diff3, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride)
1771 src2 = LD_UB(src + 16);
1773 for (row = 32; row--;) {
1774 ST_UB2(src1, src2, dst, 16);
1841 } else if (mode == 26) {
1843 } else if (mode >= 18) {
1859 } else if (mode == 26) {
1861 } else if (mode >= 18) {
1877 } else if (mode == 26) {
1879 } else if (mode >= 18) {
1895 } else if (mode == 26) {
1897 } else if (mode >= 18) {
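The repeated } else if (mode == 26) { / } else if (mode >= 18) { chains above belong to the four ff_pred_intra_pred_angular_N_msa() wrappers, which all dispatch the same way: mode 10 is pure horizontal, mode 26 pure vertical, modes 18..34 go to the 'upper' kernels and modes 2..17 to the 'lower' ones. A hedged outline for the 4x4 case (the flag derivation from c_idx is an assumption; the kernel names are the static functions of this file):

#include <stdint.h>

static void pred_angular_4x4_dispatch_sketch(uint8_t *dst,
                                             const uint8_t *src_top,
                                             const uint8_t *src_left,
                                             int32_t stride,
                                             int c_idx, int mode)
{
    int flag = (c_idx == 0);   /* assumed: boundary filtering for luma only */

    if (mode == 10)
        hevc_intra_pred_horiz_4x4_msa(src_top, src_left, dst, stride, flag);
    else if (mode == 26)
        hevc_intra_pred_vert_4x4_msa(src_top, src_left, dst, stride, flag);
    else if (mode >= 18)
        hevc_intra_pred_angular_upper_4width_msa(src_top, src_left,
                                                 dst, stride, mode);
    else
        hevc_intra_pred_angular_lower_4width_msa(src_top, src_left,
                                                 dst, stride, mode);
}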
1913 int size_in_luma_h = 16 << hshift;
1915 int size_in_luma_v = 16 << vshift;
1917 int x = x0 >> hshift;
1918 int y = y0 >> vshift;
1933 uint8_t left_array[2 * 32 + 1];
1934 uint8_t filtered_left_array[2 * 32 + 1];
1935 uint8_t top_array[2 * 32 + 1];
1936 uint8_t filtered_top_array[2 * 32 + 1];
1940 uint8_t *filtered_left = filtered_left_array + 1;
1941 uint8_t *filtered_top = filtered_top_array + 1;
1954 int bottom_left_size =
1955 (((y0 + 2 * size_in_luma_v) >
1957 2 * size_in_luma_v)) -
1958 (y0 + size_in_luma_v)) >> vshift;
1959 int top_right_size =
1960 (((x0 + 2 * size_in_luma_h) >
1962 (x0 + size_in_luma_h)) >> hshift;
1969 if (!size_in_luma_pu_h)
1970 size_in_luma_pu_h++;
1971 if (cand_bottom_left == 1 && on_pu_edge_x) {
1976 ((size_in_luma_pu_v) >
1979 y_bottom_pu) : (size_in_luma_pu_v));
1980 cand_bottom_left = 0;
1981 for (i = 0; i < max; i += 2)
1985 i) * min_pu_width]).pred_flag ==
1988 if (cand_left == 1 && on_pu_edge_x) {
1992 ((size_in_luma_pu_v) >
1995 y_left_pu) : (size_in_luma_pu_v));
1997 for (i = 0; i < max; i += 2)
2001 i) * min_pu_width]).pred_flag ==
2004 if (cand_up_left == 1) {
2009 (y_top_pu) * min_pu_width]).pred_flag ==
2012 if (cand_up == 1 && on_pu_edge_y) {
2016 ((size_in_luma_pu_h) >
2019 x_top_pu) : (size_in_luma_pu_h));
2021 for (i = 0; i < max; i += 2)
2025 min_pu_width]).pred_flag == PF_INTRA);
2027 if (cand_up_right == 1 && on_pu_edge_y) {
2032 ((size_in_luma_pu_h) >
2035 x_right_pu) : (size_in_luma_pu_h));
2037 for (i = 0; i < max; i += 2)
2041 min_pu_width]).pred_flag == PF_INTRA);
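The loops above rebuild the cand_* flags when constrained intra prediction is enabled: each candidate region is first cleared and then re-enabled by OR-ing, over the covering min-PUs, whether that PU was intra coded (pred_flag == PF_INTRA). A hedged, simplified sketch of one such check (the flat pred_flag table and parameter names are illustrative; the real code indexes an MvField array and clamps the PU count first):

#include <stdint.h>

static int left_cand_sketch(const uint8_t *pred_flag_tab, int x_left_pu,
                            int y_left_pu, int num_pu, int min_pu_width,
                            uint8_t pf_intra)
{
    int i, cand = 0;

    /* available as soon as one covering min-PU along the edge is intra */
    for (i = 0; i < num_pu; i += 2)
        cand |= (pred_flag_tab[x_left_pu +
                               (y_left_pu + i) * min_pu_width] == pf_intra);
    return cand;
}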
2044 vec0 = (v16u8) __msa_ldi_b(128);
2048 ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2053 left[-1] = src[(-1) + stride * (-1)];
2057 vec0 = LD_UB(src - stride);
2060 if (cand_up_right) {
2061 vec0 = LD_UB(src - stride + 16);
2062 ST_UB(vec0, (top + 16));
2066 ((src[(16 + top_right_size - 1) + stride * (-1)]) *
2068 for (i = 0; i < (16 - top_right_size); i += 4)
2074 for (i = 0; i < 16; i++)
2075 left[i] = src[(-1) + stride * (i)];
2076 if (cand_bottom_left) {
2077 for (i = 16; i < 16 + bottom_left_size; i++)
2078 left[i] = src[(-1) + stride * (i)];
2081 ((src[(-1) + stride * (16 + bottom_left_size - 1)]) *
2083 for (i = 0; i < (16 - bottom_left_size); i += 4)
2090 if (cand_bottom_left || cand_left || cand_up_left || cand_up
2093 x0 + ((2 * 16) << hshift) <
2096 y0 + ((2 * 16) << vshift) <
2098 int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2099 if (!cand_up_right) {
2100 size_max_x = x0 + ((16) << hshift) < s->ps.sps->width ?
2103 if (!cand_bottom_left) {
2104 size_max_y = y0 + ((16) << vshift) < s->ps.sps->height ?
2107 if (cand_bottom_left || cand_left || cand_up_left) {
2111 ((-1) << hshift)) >> s->ps.sps->
2112 log2_min_pu_size)) + (((y0 +
2117 * min_pu_width]).pred_flag ==
2122 ((-1) << hshift)) >> s->ps.sps->
2123 log2_min_pu_size)) + (((y0 + ((j)
2128 * min_pu_width]).pred_flag == PF_INTRA)) {
2130 while (j < size_max_x
2133 ((j) << hshift)) >> s->ps.sps->
2134 log2_min_pu_size)) + (((y0 +
2140 * min_pu_width]).pred_flag ==
2143 for (i = j; i > (j) - (j + 1); i--)
2147 1) << hshift)) >> s->ps.sps->
2148 log2_min_pu_size)) + (((y0 +
2154 * min_pu_width]).pred_flag ==
2156 top[i - 1] = top[i];
2161 while (j < size_max_x
2164 ((j) << hshift)) >> s->ps.sps->
2165 log2_min_pu_size)) + (((y0 + ((-1)
2170 * min_pu_width]).pred_flag ==
2175 for (i = j; i > (j) - (j + 1); i--)
2186 min_pu_width]).pred_flag ==
2188 top[i - 1] = top[i];
2190 for (i = j; i > (j) - (j); i--)
2201 min_pu_width]).pred_flag ==
2203 top[i - 1] = top[i];
2209 if (cand_bottom_left || cand_left) {
2210 a = ((left[-1]) * 0x01010101U);
2211 for (i = 0; i < (0) + (size_max_y); i += 4)
2214 ((-1) << hshift)) >> s->ps.sps->
2215 log2_min_pu_size)) + (((y0 +
2220 * min_pu_width]).pred_flag ==
2224 a = ((left[i + 3]) * 0x01010101U);
2227 vec0 = (v16u8) __msa_fill_b(left[-1]);
2231 if (!cand_bottom_left) {
2233 vec0 = (v16u8) __msa_fill_b(left[15]);
2237 if (x0 != 0 && y0 != 0) {
2238 a = ((left[size_max_y - 1]) * 0x01010101U);
2239 for (i = (size_max_y - 1);
2240 i > (size_max_y - 1) - (size_max_y); i -= 4)
2243 ((-1) << hshift)) >> s->ps.sps->
2244 log2_min_pu_size)) + (((y0 +
2250 * min_pu_width]).pred_flag ==
2254 a = ((left[i - 3]) * 0x01010101U);
2257 ((-1) << hshift)) >> s->ps.sps->
2258 log2_min_pu_size)) + (((y0 + ((-1)
2263 * min_pu_width]).pred_flag == PF_INTRA))
2265 } else if (x0 == 0) {
2267 uint32_t pix = ((0) * 0x01010101U);
2268 for (i = 0; i < (size_max_y); i += 4)
2272 a = ((left[size_max_y - 1]) * 0x01010101U);
2273 for (i = (size_max_y - 1);
2274 i > (size_max_y - 1) - (size_max_y); i -= 4)
2277 ((-1) << hshift)) >> s->ps.sps->
2278 log2_min_pu_size)) + (((y0 +
2284 * min_pu_width]).pred_flag ==
2288 a = ((left[i - 3]) * 0x01010101U);
2292 a = ((left[-1]) * 0x01010101U);
2293 for (i = 0; i < (0) + (size_max_x); i += 4)
2296 ((i) << hshift)) >> s->ps.sps->
2297 log2_min_pu_size)) + (((y0 + ((-1)
2302 * min_pu_width]).pred_flag ==
2306 a = ((top[i + 3]) * 0x01010101U);
2311 if (!cand_bottom_left) {
2313 vec0 = (v16u8) __msa_fill_b(left[15]);
2317 } else if (cand_up_left) {
2318 vec0 = (v16u8) __msa_fill_b(left[-1]);
2323 } else if (cand_up) {
2326 vec0 = (v16u8) __msa_fill_b(left[-1]);
2332 } else if (cand_up_right) {
2333 vec0 = (v16u8) __msa_fill_b(top[16]);
2346 vec0 = (v16u8) __msa_ldi_b(128);
2348 ST_UB2(vec0, vec0, top, 16);
2354 vec0 = (v16u8) __msa_fill_b(left[16]);
2357 if (!cand_up_left) {
2361 vec0 = (v16u8) __msa_fill_b(left[-1]);
2364 if (!cand_up_right) {
2365 vec0 = (v16u8) __msa_fill_b(top[15]);
2366 ST_UB(vec0, (top + 16));
2375 int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2376 int min_dist_vert_hor =
2385 if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) {
2386 filtered_left[2 * 16 - 1] = left[2 * 16 - 1];
2387 filtered_top[2 * 16 - 1] = top[2 * 16 - 1];
2388 for (i = 2 * 16 - 2; i >= 0; i--)
2389 filtered_left[i] = (left[i + 1] + 2 * left[i] +
2390 left[i - 1] + 2) >> 2;
2393 (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
2394 for (i = 2 * 16 - 2; i >= 0; i--)
2395 filtered_top[i] = (top[i + 1] + 2 * top[i] +
2396 top[i - 1] + 2) >> 2;
2397 left = filtered_left;
2410 (uint8_t *) left, stride, 4, c_idx);
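Before handing the references to pred_planar above, the 16x16 path decides whether to smooth them: the mode's distance from the pure horizontal (10) and vertical (26) modes is compared with intra_hor_ver_dist_thresh[log2_size - 3] (index 1, threshold 1, for 16x16), and if smoothing applies, both reference arrays are run through a [1 2 1]/4 filter. A hedged scalar sketch of that filter, matching the loops above:

#include <stdint.h>

/* left[-1] and top[-1] are expected to hold the corner sample, as in the
 * left_array/top_array buffers built earlier in this function. */
static void filter_ref_sketch(uint8_t *f_left, uint8_t *f_top,
                              const uint8_t *left, const uint8_t *top,
                              int size)
{
    int i;

    f_left[2 * size - 1] = left[2 * size - 1];
    f_top[2 * size - 1]  = top[2 * size - 1];
    for (i = 2 * size - 2; i >= 0; i--) {
        f_left[i] = (left[i + 1] + 2 * left[i] + left[i - 1] + 2) >> 2;
        f_top[i]  = (top[i + 1] + 2 * top[i] + top[i - 1] + 2) >> 2;
    }
    f_top[-1] = f_left[-1] = (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
}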
2422 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2423 v8i16 res0, res1, res2, res3;
2424 v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 };
2425 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
2430 int size_in_luma_h = 32 << hshift;
2432 int size_in_luma_v = 32 << vshift;
2434 int x = x0 >> hshift;
2435 int y = y0 >> vshift;
2450 uint8_t left_array[2 * 32 + 1];
2451 uint8_t filtered_left_array[2 * 32 + 1];
2452 uint8_t top_array[2 * 32 + 1];
2453 uint8_t filtered_top_array[2 * 32 + 1];
2457 uint8_t *filtered_left = filtered_left_array + 1;
2458 uint8_t *filtered_top = filtered_top_array + 1;
2471 int bottom_left_size =
2472 (((y0 + 2 * size_in_luma_v) >
2474 2 * size_in_luma_v)) -
2475 (y0 + size_in_luma_v)) >> vshift;
2476 int top_right_size =
2477 (((x0 + 2 * size_in_luma_h) >
2479 (x0 + size_in_luma_h)) >> hshift;
2486 if (!size_in_luma_pu_h)
2487 size_in_luma_pu_h++;
2488 if (cand_bottom_left == 1 && on_pu_edge_x) {
2493 ((size_in_luma_pu_v) >
2496 y_bottom_pu) : (size_in_luma_pu_v));
2497 cand_bottom_left = 0;
2498 for (i = 0; i < max; i += 2)
2502 i) * min_pu_width]).pred_flag ==
2505 if (cand_left == 1 && on_pu_edge_x) {
2509 ((size_in_luma_pu_v) >
2512 y_left_pu) : (size_in_luma_pu_v));
2514 for (i = 0; i < max; i += 2)
2518 i) * min_pu_width]).pred_flag ==
2521 if (cand_up_left == 1) {
2526 (y_top_pu) * min_pu_width]).pred_flag ==
2529 if (cand_up == 1 && on_pu_edge_y) {
2533 ((size_in_luma_pu_h) >
2536 x_top_pu) : (size_in_luma_pu_h));
2538 for (i = 0; i < max; i += 2)
2542 min_pu_width]).pred_flag == PF_INTRA);
2544 if (cand_up_right == 1 && on_pu_edge_y) {
2549 ((size_in_luma_pu_h) >
2552 x_right_pu) : (size_in_luma_pu_h));
2554 for (i = 0; i < max; i += 2)
2558 min_pu_width]).pred_flag == PF_INTRA);
2560 vec0 = (v16u8) __msa_ldi_b(128);
2563 ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2568 left[-1] = src[(-1) + stride * (-1)];
2572 LD_UB2(src - stride, 16, vec0, vec1);
2573 ST_UB2(vec0, vec1, top, 16);
2576 if (cand_up_right) {
2577 LD_UB2(src - stride + 32, 16, vec0, vec1);
2578 ST_UB2(vec0, vec1, (top + 32), 16);
2581 ((src[(32 + top_right_size - 1) + stride * (-1)]) *
2583 for (i = 0; i < (32 - top_right_size); i += 4)
2589 for (i = 0; i < 32; i++)
2590 left[i] = src[(-1) + stride * (i)];
2591 if (cand_bottom_left) {
2592 for (i = 32; i < 32 + bottom_left_size; i++)
2593 left[i] = src[(-1) + stride * (i)];
2596 ((src[(-1) + stride * (32 + bottom_left_size - 1)]) *
2598 for (i = 0; i < (32 - bottom_left_size); i += 4)
2605 if (cand_bottom_left || cand_left || cand_up_left || cand_up
2608 x0 + ((2 * 32) << hshift) <
2611 y0 + ((2 * 32) << vshift) <
2613 int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2614 if (!cand_up_right) {
2615 size_max_x = x0 + ((32) << hshift) < s->ps.sps->width ?
2618 if (!cand_bottom_left) {
2619 size_max_y = y0 + ((32) << vshift) < s->ps.sps->height ?
2622 if (cand_bottom_left || cand_left || cand_up_left) {
2626 ((-1) << hshift)) >> s->ps.sps->
2627 log2_min_pu_size)) + (((y0 +
2632 * min_pu_width]).pred_flag ==
2637 ((-1) << hshift)) >> s->ps.sps->
2638 log2_min_pu_size)) + (((y0 + ((j)
2643 * min_pu_width]).pred_flag == PF_INTRA)) {
2645 while (j < size_max_x
2648 ((j) << hshift)) >> s->ps.sps->
2649 log2_min_pu_size)) + (((y0 +
2655 * min_pu_width]).pred_flag ==
2658 for (i = j; i > (j) - (j + 1); i--)
2662 1) << hshift)) >> s->ps.sps->
2663 log2_min_pu_size)) + (((y0 +
2669 * min_pu_width]).pred_flag ==
2671 top[i - 1] = top[i];
2676 while (j < size_max_x
2679 ((j) << hshift)) >> s->ps.sps->
2680 log2_min_pu_size)) + (((y0 + ((-1)
2685 * min_pu_width]).pred_flag ==
2690 for (i = j; i > (j) - (j + 1); i--)
2701 min_pu_width]).pred_flag ==
2703 top[i - 1] = top[i];
2705 for (i = j; i > (j) - (j); i--)
2716 min_pu_width]).pred_flag ==
2718 top[i - 1] = top[i];
2724 if (cand_bottom_left || cand_left) {
2725 a = ((left[-1]) * 0x01010101U);
2726 for (i = 0; i < (0) + (size_max_y); i += 4)
2729 ((-1) << hshift)) >> s->ps.sps->
2730 log2_min_pu_size)) + (((y0 +
2735 * min_pu_width]).pred_flag ==
2739 a = ((left[i + 3]) * 0x01010101U);
2742 vec0 = (v16u8) __msa_fill_b(left[-1]);
2746 if (!cand_bottom_left) {
2747 vec0 = (v16u8) __msa_fill_b(left[31]);
2751 if (x0 != 0 && y0 != 0) {
2752 a = ((left[size_max_y - 1]) * 0x01010101U);
2753 for (i = (size_max_y - 1);
2754 i > (size_max_y - 1) - (size_max_y); i -= 4)
2757 ((-1) << hshift)) >> s->ps.sps->
2758 log2_min_pu_size)) + (((y0 +
2764 * min_pu_width]).pred_flag ==
2768 a = ((left[i - 3]) * 0x01010101U);
2771 ((-1) << hshift)) >> s->ps.sps->
2772 log2_min_pu_size)) + (((y0 + ((-1)
2777 * min_pu_width]).pred_flag == PF_INTRA))
2779 } else if (x0 == 0) {
2781 uint32_t pix = ((0) * 0x01010101U);
2782 for (i = 0; i < (size_max_y); i += 4)
2786 a = ((left[size_max_y - 1]) * 0x01010101U);
2787 for (i = (size_max_y - 1);
2788 i > (size_max_y - 1) - (size_max_y); i -= 4)
2791 ((-1) << hshift)) >> s->ps.sps->
2792 log2_min_pu_size)) + (((y0 +
2798 * min_pu_width]).pred_flag ==
2802 a = ((left[i - 3]) * 0x01010101U);
2806 a = ((left[-1]) * 0x01010101U);
2807 for (i = 0; i < (0) + (size_max_x); i += 4)
2810 ((i) << hshift)) >> s->ps.sps->
2811 log2_min_pu_size)) + (((y0 + ((-1)
2816 * min_pu_width]).pred_flag ==
2820 a = ((top[i + 3]) * 0x01010101U);
2825 if (!cand_bottom_left) {
2827 vec0 = (v16u8) __msa_fill_b(left[31]);
2830 } else if (cand_up_left) {
2831 vec0 = (v16u8) __msa_fill_b(left[-1]);
2836 } else if (cand_up) {
2839 vec0 = (v16u8) __msa_fill_b(left[-1]);
2845 } else if (cand_up_right) {
2846 vec0 = (v16u8) __msa_fill_b(top[32]);
2848 ST_UB2(vec0, vec0, top, 16);
2860 vec0 = (v16u8) __msa_ldi_b(128);
2862 ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2868 vec0 = (v16u8) __msa_fill_b(left[32]);
2872 if (!cand_up_left) {
2876 vec0 = (v16u8) __msa_fill_b(left[-1]);
2878 ST_UB2(vec0, vec0, top, 16);
2880 if (!cand_up_right) {
2881 vec0 = (v16u8) __msa_fill_b(top[31]);
2883 ST_UB2(vec0, vec0, (top + 32), 16);
2892 int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
2893 int min_dist_vert_hor =
2902 if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) {
2903 int threshold = 1 << (8 - 5);
2906 && ((top[-1] + top[63] - 2 * top[31]) >=
2907 0 ? (top[-1] + top[63] -
2908 2 * top[31]) : (-(top[-1] + top[63] -
2909 2 * top[31]))) < threshold
2913 2 * left[31]))) < threshold) {
2916 filtered_top[-1] = top[-1];
2917 filtered_top[63] = top[63];
2920 for (i = 0; i < 63; i++) {
2922 ((63 - i) * top[-1] + (i + 1) * top[63] + 32) >> 6;
2925 tmp0 = __msa_fill_h(top[-1]);
2926 tmp1 = __msa_fill_h(top[63]);
2928 tmp2 = mul_val0 - 8;
2929 tmp3 = mul_val0 - 16;
2930 tmp4 = mul_val0 - 24;
2931 tmp5 = mul_val1 + 8;
2932 tmp6 = mul_val1 + 16;
2933 tmp7 = mul_val1 + 24;
2935 res0 = mul_val0 * tmp0;
2939 res0 += mul_val1 * tmp1;
2940 res1 += tmp5 * tmp1;
2941 res2 += tmp6 * tmp1;
2942 res3 += tmp7 * tmp1;
2944 res0 = __msa_srari_h(res0, 6);
2945 res1 = __msa_srari_h(res1, 6);
2946 res2 = __msa_srari_h(res2, 6);
2947 res3 = __msa_srari_h(res3, 6);
2949 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2950 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2952 ST_UB2(vec0, vec1, filtered_top, 16);
2954 res0 = mul_val0 - 32;
2955 tmp2 = mul_val0 - 40;
2956 tmp3 = mul_val0 - 48;
2957 tmp4 = mul_val0 - 56;
2958 res3 = mul_val1 + 32;
2959 tmp5 = mul_val1 + 40;
2960 tmp6 = mul_val1 + 48;
2961 tmp7 = mul_val1 + 56;
2966 res0 += res3 * tmp1;
2968 res1 += tmp5 * tmp1;
2969 res2 += tmp6 * tmp1;
2970 res3 += tmp7 * tmp1;
2972 res0 = __msa_srari_h(res0, 6);
2973 res1 = __msa_srari_h(res1, 6);
2974 res2 = __msa_srari_h(res2, 6);
2975 res3 = __msa_srari_h(res3, 6);
2977 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2978 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
2980 ST_UB2(vec0, vec1, (filtered_top + 32), 16);
2982 filtered_top[63] = top[63];
2984 tmp0 = __msa_fill_h(left[-1]);
2985 tmp1 = __msa_fill_h(left[63]);
2987 tmp2 = mul_val0 - 8;
2988 tmp3 = mul_val0 - 16;
2989 tmp4 = mul_val0 - 24;
2990 tmp5 = mul_val1 + 8;
2991 tmp6 = mul_val1 + 16;
2992 tmp7 = mul_val1 + 24;
2994 res0 = mul_val0 * tmp0;
2998 res0 += mul_val1 * tmp1;
2999 res1 += tmp5 * tmp1;
3000 res2 += tmp6 * tmp1;
3001 res3 += tmp7 * tmp1;
3003 res0 = __msa_srari_h(res0, 6);
3004 res1 = __msa_srari_h(res1, 6);
3005 res2 = __msa_srari_h(res2, 6);
3006 res3 = __msa_srari_h(res3, 6);
3008 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3009 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3013 res0 = mul_val0 - 32;
3014 tmp2 = mul_val0 - 40;
3015 tmp3 = mul_val0 - 48;
3016 tmp4 = mul_val0 - 56;
3017 res3 = mul_val1 + 32;
3018 tmp5 = mul_val1 + 40;
3019 tmp6 = mul_val1 + 48;
3020 tmp7 = mul_val1 + 56;
3025 res0 += res3 * tmp1;
3027 res1 += tmp5 * tmp1;
3028 res2 += tmp6 * tmp1;
3029 res3 += tmp7 * tmp1;
3031 res0 = __msa_srari_h(res0, 6);
3032 res1 = __msa_srari_h(res1, 6);
3033 res2 = __msa_srari_h(res2, 6);
3034 res3 = __msa_srari_h(res3, 6);
3036 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3037 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
3045 filtered_left[2 * 32 - 1] = left[2 * 32 - 1];
3046 filtered_top[2 * 32 - 1] = top[2 * 32 - 1];
3047 for (i = 2 * 32 - 2; i >= 0; i--)
3048 filtered_left[i] = (left[i + 1] + 2 * left[i] +
3049 left[i - 1] + 2) >> 2;
3052 (left[0] + 2 * left[-1] + top[0] + 2) >> 2;
3053 for (i = 2 * 32 - 2; i >= 0; i--)
3054 filtered_top[i] = (top[i + 1] + 2 * top[i] +
3055 top[i - 1] + 2) >> 2;
3056 left = filtered_left;
3070 (uint8_t *) left, stride, 5, c_idx);
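For 32x32 luma with sps_strong_intra_smoothing_enable_flag set, the function above replaces the [1 2 1] filter with bilinear "strong" smoothing whenever both edges are flat enough, i.e. |top[-1] + top[63] - 2*top[31]| and |left[-1] + left[63] - 2*left[31]| are below 1 << (8 - 5). The vector code computes the same values as this hedged scalar sketch:

#include <stdint.h>

static void strong_smoothing_sketch(uint8_t *f_left, uint8_t *f_top,
                                    const uint8_t *left, const uint8_t *top)
{
    int i;

    f_top[-1]  = top[-1];
    f_top[63]  = top[63];
    f_left[63] = left[63];
    for (i = 0; i < 63; i++) {
        f_top[i]  = ((63 - i) * top[-1]  + (i + 1) * top[63]  + 32) >> 6;
        f_left[i] = ((63 - i) * left[-1] + (i + 1) * left[63] + 32) >> 6;
    }
}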
void ff_hevc_intra_pred_planar_2_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride)
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3,idx4, idx5, idx6, idx7, pdst, stride)
static void hevc_intra_pred_horiz_4x4_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
#define MUL2(in0, in1, in2, in3, out0, out1)
static const int8_t intra_pred_angle_up[17]
static void hevc_intra_pred_dc_8x8_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
#define SPLATI_H2_SH(...)
static void hevc_intra_pred_vert_8x8_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
static void hevc_intra_pred_plane_32x32_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride)
#define UNPCK_UB_SH(in, out0, out1)
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
#define CLIP_SH_0_255(in)
#define SPLATI_H4_SH(...)
void ff_hevc_intra_pred_dc_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride, int log2, int c_idx)
#define CLIP_SH2_0_255(in0, in1)
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5,idx6, idx7, pdst, stride)
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
#define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1,mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3,res0, res1, mul_val_b0, mul_val_b1, round)
static const int8_t intra_pred_angle_low[16]
static void hevc_intra_pred_horiz_8x8_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
static void hevc_intra_pred_angular_lower_16width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst, int32_t dst_stride)
#define SW4(in0, in1, in2, in3, pdst, stride)
static void hevc_intra_pred_angular_lower_4width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
static void hevc_intra_pred_horiz_16x16_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride)
static void hevc_intra_pred_plane_4x4_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride)
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3,idx4, idx5, idx6, idx7, pdst, stride)
static void hevc_intra_pred_angular_lower_32width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
uint8_t constrained_intra_pred_flag
static void hevc_intra_pred_horiz_32x32_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride)
void ff_hevc_intra_pred_planar_1_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride)
int linesize[AV_NUM_DATA_POINTERS]
For video, size in bytes of each picture line.
unsigned int log2_min_pu_size
static void hevc_intra_pred_angular_upper_32width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
void(* pred_angular[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int c_idx, int mode)
static void hevc_intra_pred_vert_16x16_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
void(* pred_dc)(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride, int log2_size, int c_idx)
void ff_intra_pred_8_32x32_msa(HEVCContext *s, int x0, int y0, int c_idx)
static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride)
static void hevc_intra_pred_dc_4x4_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
unsigned int log2_min_tb_size
#define ADD2(in0, in1, in2, in3, out0, out1)
#define INSERT_W2_SB(...)
static void hevc_intra_pred_vert_4x4_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
#define SD4(in0, in1, in2, in3, pdst, stride)
static void hevc_intra_pred_dc_32x32_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride)
void ff_hevc_intra_pred_planar_3_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride)
void ff_intra_pred_8_16x16_msa(HEVCContext *s, int x0, int y0, int c_idx)
HEVCLocalContext * HEVClc
static void process_intra_lower_16x16_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, uint8_t offset)
void ff_pred_intra_pred_angular_1_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride, int c_idx, int mode)
static void hevc_intra_pred_angular_upper_16width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
uint8_t * data[AV_NUM_DATA_POINTERS]
pointer to the picture/channel planes.
#define INSERT_D2_UB(...)
void ff_pred_intra_pred_angular_2_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride, int c_idx, int mode)
#define SUB2(in0, in1, in2, in3, out0, out1)
static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)
void ff_hevc_intra_pred_planar_0_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride)
uint8_t sps_strong_intra_smoothing_enable_flag
void ff_pred_intra_pred_angular_3_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride, int c_idx, int mode)
void ff_pred_intra_pred_angular_0_msa(uint8_t *dst, const uint8_t *src_top, const uint8_t *src_left, ptrdiff_t stride, int c_idx, int mode)
static void hevc_intra_pred_dc_16x16_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t flag)
void(* pred_planar[4])(uint8_t *src, const uint8_t *top, const uint8_t *left, ptrdiff_t stride)
static void process_intra_upper_16x16_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, uint8_t offset)
#define ST_W2(in, idx0, idx1, pdst, stride)
int * min_tb_addr_zs
MinTbAddrZS.
int intra_smoothing_disabled_flag
static void hevc_intra_pred_angular_upper_4width_msa(const uint8_t *src_top, const uint8_t *src_left, uint8_t *dst, int32_t stride, int32_t mode)