    v8i16 r0, r1, r2, r3, r4, r5, r6, r7, sign;
    v4i32 r0_r, r0_l, r1_r, r1_l, r2_r, r2_l, r3_r, r3_l,
          r4_r, r4_l, r5_r, r5_l, r6_r, r6_l, r7_r, r7_l;
    v4i32 A, B, C, D, Ad, Bd, Cd, Dd, E, F, G, H;
    v4i32 Ed, Gd, Add, Bdd, Fd, Hd;

    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
    v4i32 c0, c1, c2, c3, c4, c5, c6, c7;
    v4i32 f0, f1, f2, f3, f4, f5, f6, f7;

    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
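
    /* IDCT constants in 16.16 fixed point, round(cos(i * PI / 16) * 65536)
     * for i = 1..7 (46341 is cos(PI / 4), i.e. sqrt(2) / 2); cnst8w,
     * cnst2048w and cnst128w are rounding/bias terms for the output stage */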
 
    v4i32 cnst64277w = {64277, 64277, 64277, 64277};
    v4i32 cnst60547w = {60547, 60547, 60547, 60547};
    v4i32 cnst54491w = {54491, 54491, 54491, 54491};
    v4i32 cnst46341w = {46341, 46341, 46341, 46341};
    v4i32 cnst36410w = {36410, 36410, 36410, 36410};
    v4i32 cnst25080w = {25080, 25080, 25080, 25080};
    v4i32 cnst12785w = {12785, 12785, 12785, 12785};
    v4i32 cnst8w = {8, 8, 8, 8};
    v4i32 cnst2048w = {2048, 2048, 2048, 2048};
    v4i32 cnst128w = {128, 128, 128, 128};
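
    /* Sign-extend each 8x16-bit coefficient row into two 4x32-bit halves:
     * clti_s_h yields all-ones in negative lanes, and ilvr_h/ilvl_h
     * interleave that mask in as the high half of each 32-bit element */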
 
    sign = __msa_clti_s_h(r0, 0);
    r0_r = (v4i32) __msa_ilvr_h(sign, r0);
    r0_l = (v4i32) __msa_ilvl_h(sign, r0);
    sign = __msa_clti_s_h(r1, 0);
    r1_r = (v4i32) __msa_ilvr_h(sign, r1);
    r1_l = (v4i32) __msa_ilvl_h(sign, r1);
    sign = __msa_clti_s_h(r2, 0);
    r2_r = (v4i32) __msa_ilvr_h(sign, r2);
    r2_l = (v4i32) __msa_ilvl_h(sign, r2);
    sign = __msa_clti_s_h(r3, 0);
    r3_r = (v4i32) __msa_ilvr_h(sign, r3);
    r3_l = (v4i32) __msa_ilvl_h(sign, r3);
    sign = __msa_clti_s_h(r4, 0);
    r4_r = (v4i32) __msa_ilvr_h(sign, r4);
    r4_l = (v4i32) __msa_ilvl_h(sign, r4);
    sign = __msa_clti_s_h(r5, 0);
    r5_r = (v4i32) __msa_ilvr_h(sign, r5);
    r5_l = (v4i32) __msa_ilvl_h(sign, r5);
    sign = __msa_clti_s_h(r6, 0);
    r6_r = (v4i32) __msa_ilvr_h(sign, r6);
    r6_l = (v4i32) __msa_ilvl_h(sign, r6);
    sign = __msa_clti_s_h(r7, 0);
    r7_r = (v4i32) __msa_ilvr_h(sign, r7);
    r7_l = (v4i32) __msa_ilvl_h(sign, r7);
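
    /* First 1D butterfly over the right halves (columns 0-3): rotations of
     * the odd rows into A..D plus the shared C4-scaled differences Ad/Bd,
     * then the even part E..H from rows 0, 2, 4 and 6 */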
 
    A = ((r1_r * cnst64277w) >> 16) + ((r7_r * cnst12785w) >> 16);
    B = ((r1_r * cnst12785w) >> 16) - ((r7_r * cnst64277w) >> 16);
    C = ((r3_r * cnst54491w) >> 16) + ((r5_r * cnst36410w) >> 16);
    D = ((r5_r * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;

    E = ((r0_r + r4_r) * cnst46341w) >> 16;
    F = ((r0_r - r4_r) * cnst46341w) >> 16;
    G = ((r2_r * cnst60547w) >> 16) + ((r6_r * cnst25080w) >> 16);
    H = ((r2_r * cnst25080w) >> 16) - ((r6_r * cnst60547w) >> 16);
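
    /* Same butterfly over the left halves (columns 4-7) */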
 
    A = ((r1_l * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16);
    B = ((r1_l * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16);
    C = ((r3_l * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16);
    D = ((r5_l * cnst54491w) >> 16) - ((r3_l * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;

    E = ((r0_l + r4_l) * cnst46341w) >> 16;
    F = ((r0_l - r4_l) * cnst46341w) >> 16;
    G = ((r2_l * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16);
    H = ((r2_l * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16);
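
    /* Transpose the 4x4 sub-blocks of rows 0-3 and run the second 1D pass
     * along them; after the transpose, coefficients 0-3 live in r0_r..r3_r
     * and coefficients 4-7 in r0_l..r3_l, so the halves mix below */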
 
    TRANSPOSE4x4_SW_SW(r0_r, r1_r, r2_r, r3_r,
                       r0_r, r1_r, r2_r, r3_r);
    TRANSPOSE4x4_SW_SW(r0_l, r1_l, r2_l, r3_l,
                       r0_l, r1_l, r2_l, r3_l);
    A = ((r1_r * cnst64277w) >> 16) + ((r3_l * cnst12785w) >> 16);
    B = ((r1_r * cnst12785w) >> 16) - ((r3_l * cnst64277w) >> 16);
    C = ((r3_r * cnst54491w) >> 16) + ((r1_l * cnst36410w) >> 16);
    D = ((r1_l * cnst54491w) >> 16) - ((r3_r * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;

    E = ((r0_r + r0_l) * cnst46341w) >> 16;

    F = ((r0_r - r0_l) * cnst46341w) >> 16;

    G = ((r2_r * cnst60547w) >> 16) + ((r2_l * cnst25080w) >> 16);
    H = ((r2_r * cnst25080w) >> 16) - ((r2_l * cnst60547w) >> 16);
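
    /* Per-lane zero test: OR together every input of these rows except the
     * DC term.  Lanes where that is zero take the cheap DC-only result via
     * the sign_t mask; all other lanes keep the full butterfly output
     * through the complementary mask */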
 
    sign_l = __msa_or_v((v16u8)r1_r, (v16u8)r2_r);
    sign_l = __msa_or_v(sign_l, (v16u8)r3_r);
    sign_l = __msa_or_v(sign_l, (v16u8)r0_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r1_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r2_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r3_l);
    sign_t = __msa_ceqi_w((v4i32)sign_l, 0);
    Add = ((r0_r * cnst46341w) + (8 << 16)) >> 20;

        Bdd = Add + cnst128w;

    Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
    Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
    Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t);
    Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t);
    Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t);
    Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t);
    Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t);
    Hd = (v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t);
    sign_t = __msa_ceqi_w(sign_t, 0);
    A = (v4i32)__msa_and_v((v16u8)A, (v16u8)sign_t);
    B = (v4i32)__msa_and_v((v16u8)B, (v16u8)sign_t);
    C = (v4i32)__msa_and_v((v16u8)C, (v16u8)sign_t);
    D = (v4i32)__msa_and_v((v16u8)D, (v16u8)sign_t);
    E = (v4i32)__msa_and_v((v16u8)E, (v16u8)sign_t);
    F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t);
    G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t);
    H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t);
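
    /* Repeat transpose, second pass and zero-lane select for rows 4-7 */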
 
    TRANSPOSE4x4_SW_SW(r4_r, r5_r, r6_r, r7_r,
                       r4_r, r5_r, r6_r, r7_r);
    TRANSPOSE4x4_SW_SW(r4_l, r5_l, r6_l, r7_l,
                       r4_l, r5_l, r6_l, r7_l);
    A = ((r5_r * cnst64277w) >> 16) + ((r7_l * cnst12785w) >> 16);
    B = ((r5_r * cnst12785w) >> 16) - ((r7_l * cnst64277w) >> 16);
    C = ((r7_r * cnst54491w) >> 16) + ((r5_l * cnst36410w) >> 16);
    D = ((r5_l * cnst54491w) >> 16) - ((r7_r * cnst36410w) >> 16);
    Ad = ((A - C) * cnst46341w) >> 16;
    Bd = ((B - D) * cnst46341w) >> 16;

    E = ((r4_r + r4_l) * cnst46341w) >> 16;

    F = ((r4_r - r4_l) * cnst46341w) >> 16;

    G = ((r6_r * cnst60547w) >> 16) + ((r6_l * cnst25080w) >> 16);
    H = ((r6_r * cnst25080w) >> 16) - ((r6_l * cnst60547w) >> 16);
 
    sign_l = __msa_or_v((v16u8)r5_r, (v16u8)r6_r);
    sign_l = __msa_or_v(sign_l, (v16u8)r7_r);
    sign_l = __msa_or_v(sign_l, (v16u8)r4_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r5_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r6_l);
    sign_l = __msa_or_v(sign_l, (v16u8)r7_l);
    sign_t = __msa_ceqi_w((v4i32)sign_l, 0);
    Add = ((r4_r * cnst46341w) + (8 << 16)) >> 20;

        Bdd = Add + cnst128w;

    Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
    Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
    Cd = (v4i32)__msa_and_v((v16u8)Cd, (v16u8)sign_t);
    Dd = (v4i32)__msa_and_v((v16u8)Dd, (v16u8)sign_t);
    Ed = (v4i32)__msa_and_v((v16u8)Ed, (v16u8)sign_t);
    Fd = (v4i32)__msa_and_v((v16u8)Fd, (v16u8)sign_t);
    Gd = (v4i32)__msa_and_v((v16u8)Gd, (v16u8)sign_t);
    Hd = (v4i32)__msa_and_v((v16u8)Hd, (v16u8)sign_t);
    sign_t = __msa_ceqi_w(sign_t, 0);
    A = (v4i32)__msa_and_v((v16u8)A, (v16u8)sign_t);
    B = (v4i32)__msa_and_v((v16u8)B, (v16u8)sign_t);
    C = (v4i32)__msa_and_v((v16u8)C, (v16u8)sign_t);
    D = (v4i32)__msa_and_v((v16u8)D, (v16u8)sign_t);
    E = (v4i32)__msa_and_v((v16u8)E, (v16u8)sign_t);
    F = (v4i32)__msa_and_v((v16u8)F, (v16u8)sign_t);
    G = (v4i32)__msa_and_v((v16u8)G, (v16u8)sign_t);
    H = (v4i32)__msa_and_v((v16u8)H, (v16u8)sign_t);
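
/* DC-only add (ff_vp3_idct_dc_add_msa): the DC coefficient is scaled to
 * (block[0] + 15) >> 5 and added, with clipping, to all 64 pixels */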
 
    int i = (block[0] + 15) >> 5;

    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
    v4i32 c0, c1, c2, c3, c4, c5, c6, c7;
    v4i32 e0, e1, e2, e3, e4, e5, e6, e7;
    v4i32 r0, r1, r2, r3, r4, r5, r6, r7;
    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};

    LD_SB8(dest, line_size, d0, d1, d2, d3, d4, d5, d6, d7);
 
    ST_D1(d1, 0, dest + line_size);
    ST_D1(d2, 0, dest + 2 * line_size);
    ST_D1(d3, 0, dest + 3 * line_size);
    ST_D1(d4, 0, dest + 4 * line_size);
    ST_D1(d5, 0, dest + 5 * line_size);
    ST_D1(d6, 0, dest + 6 * line_size);
    ST_D1(d7, 0, dest + 7 * line_size);
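
/* Vertical loop filter: smooths the two rows on either side of a horizontal
 * block edge */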
 
void ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
                              int *bounding_values)
{
 
    v4i32 e0, e1, f0, f1, g0, g1;

    v16i8 d0, d1, d2, d3;
    v8i16 c0, c1, c2, c3;

    v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3},
          cnst4h = {4, 4, 4, 4, 4, 4, 4, 4};
    v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
 
    LD_SB4(first_pixel + nstride * 2, stride, d0, d1, d2, d3);
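
    /* VP3 filter value across the edge: (p0 - p3) + 3 * (p2 - p1), with
     * rows c0..c3 holding the four lines around the edge */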
 
    r0 = (c0 - c3) + (c2 - c1) * cnst3h;
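
    /* Clamp each filter value through the bounding_values[] table with
     * scalar lookups, then reload the results as two 4x32-bit vectors */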
 
    for (int i = 0; i < 8; i++)
        temp_32[i] = bounding_values[temp_16[i]];
    LD_SW2(temp_32, 4, e0, e1);
 
    ST_D1(d1, 0, first_pixel + nstride);
    ST_D1(d2, 0, first_pixel);
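
/* Horizontal loop filter: the same filter across a vertical edge; the loaded
 * 8x4 pixel tile is transposed first so the edge columns become rows */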
 
void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
                              int *bounding_values)
{
 
    v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
    v8i16 c0, c1, c2, c3, c4, c5, c6, c7;

    v4i32 e0, e1, f0, f1, g0, g1;

    v8i16 cnst3h = {3, 3, 3, 3, 3, 3, 3, 3},
          cnst4h = {4, 4, 4, 4, 4, 4, 4, 4};
    v16i8 mask = {0, 16, 4, 20, 8, 24, 12, 28, 0, 0, 0, 0, 0, 0, 0, 0};
 
    LD_SB8(first_pixel - 2, stride, d0, d1, d2, d3, d4, d5, d6, d7);

                       c0, c1, c2, c3, c4, c5, c6, c7);
    r0 = (c0 - c3) + (c2 - c1) * cnst3h;
 
    for (int i = 0; i < 8; i++)
        temp_32[i] = bounding_values[temp_16[i]];
    LD_SW2(temp_32, 4, e0, e1);
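
/* No-rounding average used for half-pel prediction: (a + b) >> 1 is computed
 * as (a & b) + (((a ^ b) & fmask) >> 1), where fmask clears the low bit of
 * each byte so the word-wide shift cannot leak bits across byte boundaries */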
 
static void put_no_rnd_pixels_l2(uint8_t *dst, const uint8_t *src1,
                                 const uint8_t *src2, ptrdiff_t stride, int h)
{
 
        v16i8 d0, d1, d2, d3, d4, d5, d6, d7;
        v16i8 c0, c1, c2, c3;

        v16i8 mask = {0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23};
 
        a0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0);
        a2 = (v4i32) __msa_pckod_d((v2i64)c1, (v2i64)c0);
        a1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2);
        a3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2);

        b0 = (v4i32) __msa_pckev_d((v2i64)c1, (v2i64)c0);
        b2 = (v4i32) __msa_pckod_d((v2i64)c1, (v2i64)c0);
        b1 = (v4i32) __msa_pckev_d((v2i64)c3, (v2i64)c2);
        b3 = (v4i32) __msa_pckod_d((v2i64)c3, (v2i64)c2);
 
        e0 = (v4i32) __msa_xor_v((v16u8)a0, (v16u8)b0);
        e0 = (v4i32) __msa_and_v((v16u8)e0, (v16u8)fmask);
        t0 = ((v4u32)e0) >> 1;
        e2 = (v4i32) __msa_and_v((v16u8)a0, (v16u8)b0);

        e1 = (v4i32) __msa_xor_v((v16u8)a1, (v16u8)b1);
        e1 = (v4i32) __msa_and_v((v16u8)e1, (v16u8)fmask);
        t1 = ((v4u32)e1) >> 1;
        e2 = (v4i32) __msa_and_v((v16u8)a1, (v16u8)b1);

        f0 = (v4i32) __msa_xor_v((v16u8)a2, (v16u8)b2);
        f0 = (v4i32) __msa_and_v((v16u8)f0, (v16u8)fmask);
        t2 = ((v4u32)f0) >> 1;
        f2 = (v4i32) __msa_and_v((v16u8)a2, (v16u8)b2);

        f1 = (v4i32) __msa_xor_v((v16u8)a3, (v16u8)b3);
        f1 = (v4i32) __msa_and_v((v16u8)f1, (v16u8)fmask);
        t3 = ((v4u32)f1) >> 1;
        f2 = (v4i32) __msa_and_v((v16u8)a3, (v16u8)b3);
 
        ST_W8(t0, t1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
        ST_W8(t2, t3, 0, 1, 2, 3, 0, 1, 2, 3, dst + 4, stride);
 
        for (i = 0; i < h; i++) {