    __m256i in0, in1, in2, in3;
    __m256i temp0, temp1, temp2, temp3, t1, t2, t3, t4, t5, t6, t7, t8;
    __m256i const_1  = {0x000c000c000c000c, 0x000c000c000c000c,
                        0x000c000c000c000c, 0x000c000c000c000c};
    __m256i const_2  = {0xfff4000cfff4000c, 0xfff4000cfff4000c,
                        0xfff4000cfff4000c, 0xfff4000cfff4000c};
    __m256i const_3  = {0x0006001000060010, 0x0006001000060010,
                        0x0006001000060010, 0x0006001000060010};
    __m256i const_4  = {0xfff00006fff00006, 0xfff00006fff00006,
                        0xfff00006fff00006, 0xfff00006fff00006};
    __m256i const_5  = {0x000f0010000f0010, 0x000f0010000f0010,
                        0x000f0010000f0010, 0x000f0010000f0010};
    __m256i const_6  = {0x0004000900040009, 0x0004000900040009,
                        0x0004000900040009, 0x0004000900040009};
    __m256i const_7  = {0xfffc000ffffc000f, 0xfffc000ffffc000f,
                        0xfffc000ffffc000f, 0xfffc000ffffc000f};
    __m256i const_8  = {0xfff7fff0fff7fff0, 0xfff7fff0fff7fff0,
                        0xfff7fff0fff7fff0, 0xfff7fff0fff7fff0};
    __m256i const_9  = {0xfff00009fff00009, 0xfff00009fff00009,
                        0xfff00009fff00009, 0xfff00009fff00009};
    __m256i const_10 = {0x000f0004000f0004, 0x000f0004000f0004,
                        0x000f0004000f0004, 0x000f0004000f0004};
    __m256i const_11 = {0xfff70004fff70004, 0xfff70004fff70004,
                        0xfff70004fff70004, 0xfff70004fff70004};
    __m256i const_12 = {0xfff0000ffff0000f, 0xfff0000ffff0000f,
                        0xfff0000ffff0000f, 0xfff0000ffff0000f};
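
    /* const_1 .. const_12 pack the VC-1 8-point inverse-transform taps
     * (12, 16, 15, 9, 6, 4 and their negations) as 16-bit pairs for the
     * __lasx_xvdp2* dot-product instructions.  First pass: rows. */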
 
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvilvl_h, in2, in0, in3, in1, temp0, temp1);
    t2 = __lasx_xvreplgr2vr_w(con_4);
    DUP2_ARG3(__lasx_xvdp2add_w_h, t2, temp0, const_1, t2, temp0, const_2,
              t1, t2);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_3, temp1, const_4, t3, t4);
    t5 = __lasx_xvadd_w(t1, t3);
    t6 = __lasx_xvadd_w(t2, t4);
    t7 = __lasx_xvsub_w(t2, t4);
    t8 = __lasx_xvsub_w(t1, t3);
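
    /* Odd half of the row transform: accumulate the 16/15/9/4 taps
     * (const_5 .. const_12) into t1 .. t4. */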
 
    DUP2_ARG2(__lasx_xvilvh_h, in1, in0, in3, in2, temp0, temp1);
    temp2 = __lasx_xvdp2_w_h(const_5, temp0);
    t1 = __lasx_xvdp2add_w_h(temp2, temp1, const_6);
    temp2 = __lasx_xvdp2_w_h(const_7, temp0);
    t2 = __lasx_xvdp2add_w_h(temp2, temp1, const_8);
    temp2 = __lasx_xvdp2_w_h(const_9, temp0);
    t3 = __lasx_xvdp2add_w_h(temp2, temp1, const_10);
    temp2 = __lasx_xvdp2_w_h(const_11, temp0);
    t4 = __lasx_xvdp2add_w_h(temp2, temp1, const_12);
 
              temp0, temp1, temp2, temp3);
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 3, temp1, 3, temp2, 3, temp3, 3,
              temp0, temp1, temp2, temp3);
    DUP4_ARG2(__lasx_xvsrai_w, in0, 3, in1, 3, in2, 3, in3, 3,
              in0, in1, in2, in3);
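
    /* Second pass: transpose the intermediate 8x8 back into column order. */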
 
    DUP4_ARG2(__lasx_xvpackev_h, temp1, temp0, temp3, temp2, in1, in0,
              in3, in2, temp0, temp1, temp2, temp3);
    DUP2_ARG2(__lasx_xvilvl_w, temp1, temp0, temp3, temp2, t1, t3);
    DUP2_ARG2(__lasx_xvilvh_w, temp1, temp0, temp3, temp2, t2, t4);
    DUP4_ARG3(__lasx_xvpermi_q, t3, t1, 0x20, t3, t1, 0x31, t4, t2, 0x20,
              t4, t2, 0x31, in0, in1, in2, in3);
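
    /* Column transform: same even/odd dot products, with the rounding
     * bias of 64 folded into the accumulator. */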
 
    DUP2_ARG2(__lasx_xvilvl_h, in1, in0, in3, in2, temp0, temp1);
    t3    = __lasx_xvreplgr2vr_w(con_64);
    DUP2_ARG3(__lasx_xvdp2add_w_h, t3, temp0, const_1, t3, temp0,
              const_2, t1, t2);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_3, temp1, const_4, t3, t4);
    t5    = __lasx_xvadd_w(t1, t3);
    t6    = __lasx_xvadd_w(t2, t4);
    t7    = __lasx_xvsub_w(t2, t4);
    t8    = __lasx_xvsub_w(t1, t3);
 
    DUP2_ARG2(__lasx_xvilvh_h, in2, in0, in3, in1, temp0, temp1);
    temp2 = __lasx_xvdp2_w_h(const_5, temp0);
    t1 = __lasx_xvdp2add_w_h(temp2, temp1, const_6);
    temp2 = __lasx_xvdp2_w_h(const_7, temp0);
    t2 = __lasx_xvdp2add_w_h(temp2, temp1, const_8);
    temp2 = __lasx_xvdp2_w_h(const_9, temp0);
    t3 = __lasx_xvdp2add_w_h(temp2, temp1, const_10);
    temp2 = __lasx_xvdp2_w_h(const_11, temp0);
    t4 = __lasx_xvdp2add_w_h(temp2, temp1, const_12);
 
              temp0, temp1, temp2, temp3);
    DUP4_ARG2(__lasx_xvaddi_wu, in0, 1, in1, 1, in2, 1, in3, 1,
              in0, in1, in2, in3);
    DUP4_ARG3(__lasx_xvsrani_h_w, temp1, temp0, 7, temp3, temp2, 7,
              in1, in0, 7, in3, in2, 7, t1, t2, t3, t4);
 
    __lasx_xvst(in0, block, 0);
    __lasx_xvst(in1, block, 32);
    __lasx_xvst(in2, block, 64);
    __lasx_xvst(in3, block, 96);
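
/* DC-only 8x8 inverse transform: scale the DC value, replicate it and add
 * it to all 64 destination pixels with unsigned saturation. */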
 
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    uint8_t *dst = dest + (stride2 << 1);
    __m256i in0, in1, in2, in3, in4, in5, in6, in7;
    __m256i const_dc, temp0, temp1, temp2, temp3;
    __m256i reg0, reg1, reg2, reg3;

    dc = (3 * dc +  1) >> 1;
    dc = (3 * dc + 16) >> 5;
    const_dc = __lasx_xvreplgr2vr_h(dc);
 
    DUP4_ARG2(__lasx_xvldrepl_d, dest, 0, dest + stride, 0, dest + stride2,
              0, dest + stride3, 0, in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvldrepl_d, dst, 0, dst + stride, 0, dst + stride2,
              0, dst + stride3, 0, in4, in5, in6, in7);

    DUP4_ARG2(__lasx_xvilvl_d, in1, in0, in3, in2, in5, in4, in7, in6,
              temp0, temp1, temp2, temp3);
    DUP4_ARG1(__lasx_vext2xv_hu_bu, temp0, temp1, temp2, temp3,
              temp0, temp1, temp2, temp3);

    DUP4_ARG2(__lasx_xvadd_h, temp0, const_dc, temp1, const_dc, temp2,
              const_dc, temp3, const_dc, reg0, reg1, reg2, reg3);
    DUP2_ARG3(__lasx_xvssrarni_bu_h, reg1, reg0, 0, reg3, reg2, 0,
              temp0, temp1);
 
    __lasx_xvstelm_d(temp0, dest, 0, 0);
    __lasx_xvstelm_d(temp0, dest + stride, 0, 2);
    __lasx_xvstelm_d(temp0, dest + stride2, 0, 1);
    __lasx_xvstelm_d(temp0, dest + stride3, 0, 3);
    __lasx_xvstelm_d(temp1, dst, 0, 0);
    __lasx_xvstelm_d(temp1, dst + stride, 0, 2);
    __lasx_xvstelm_d(temp1, dst + stride2, 0, 1);
    __lasx_xvstelm_d(temp1, dst + stride3, 0, 3);
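
/* 8x4 inverse transform: 8-point transform along the rows, 4-point
 * transform (taps 17, 22, 10) down the columns, result added to dest. */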
 
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    __m256i shift    = {0x0000000400000000, 0x0000000500000001,
                        0x0000000600000002, 0x0000000700000003};
    __m256i const_64 = {0x0000004000000040, 0x0000004000000040,
                        0x0000004000000040, 0x0000004000000040};
    __m256i const_1  = {0x00060010000C000C, 0x00060010000C000C,
                        0x00060010000C000C, 0x00060010000C000C};
    __m256i const_2  = {0xFFF00006FFF4000C, 0xFFF00006FFF4000C,
                        0xFFF00006FFF4000C, 0xFFF00006FFF4000C};
    __m256i const_3  = {0x0004000F00090010, 0x0004000F00090010,
                        0x0004000F00090010, 0x0004000F00090010};
    __m256i const_4  = {0xFFF7FFFCFFF0000F, 0xFFF7FFFCFFF0000F,
                        0xFFF7FFFCFFF0000F, 0xFFF7FFFCFFF0000F};
    __m256i const_5  = {0x000FFFF000040009, 0x000FFFF000040009,
                        0x000FFFF000040009, 0x000FFFF000040009};
    __m256i const_6  = {0xFFF0FFF7000F0004, 0xFFF0FFF7000F0004,
                        0xFFF0FFF7000F0004, 0xFFF0FFF7000F0004};
    __m256i const_7  = {0x0000000000000004, 0x0000000000000004,
                        0x0000000000000004, 0x0000000000000004};
    __m256i const_8  = {0x0011001100110011, 0x0011001100110011,
                        0x0011001100110011, 0x0011001100110011};
    __m256i const_9  = {0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011,
                        0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011};
    __m256i const_10 = {0x000A0016000A0016, 0x000A0016000A0016,
                        0x000A0016000A0016, 0x000A0016000A0016};
    __m256i const_11 = {0x0016FFF60016FFF6, 0x0016FFF60016FFF6,
                        0x0016FFF60016FFF6, 0x0016FFF60016FFF6};

    __m256i temp0, temp1, temp2, temp3, t1, t2, t3, t4;
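
    /* Row pass: pair even/odd columns, apply the 8-point dot products;
     * const_7 (= 4) is the first-pass rounding term. */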
 
    temp0 = __lasx_xvpermi_d(in0, 0xB1);
    temp1 = __lasx_xvpermi_d(in1, 0xB1);
    DUP2_ARG2(__lasx_xvilvl_h, temp0, in0, temp1, in1, temp0, temp1);
    temp2 = __lasx_xvpickev_w(temp1, temp0);
    temp3 = __lasx_xvpickod_w(temp1, temp0);

    DUP2_ARG2(__lasx_xvdp2_w_h, temp2, const_1, temp2, const_2, temp0, temp1);
    t1    = __lasx_xvadd_w(temp0, const_7);
    t2    = __lasx_xvadd_w(temp1, const_7);
    temp0 = __lasx_xvpickev_w(t2, t1);
    temp1 = __lasx_xvpickod_w(t2, t1);
    t3    = __lasx_xvadd_w(temp0, temp1);
    t4    = __lasx_xvsub_w(temp0, temp1);
    t4    = __lasx_xvpermi_d(t4, 0xB1);

    DUP4_ARG2(__lasx_xvdp4_d_h, temp3, const_3, temp3, const_4, temp3,
              const_5, temp3, const_6, t1, t2, temp0, temp1);
    temp2 = __lasx_xvpickev_w(t2, t1);
    temp3 = __lasx_xvpickev_w(temp1, temp0);

    t1    = __lasx_xvadd_w(temp2, t3);
    t2    = __lasx_xvadd_w(temp3, t4);
    temp0 = __lasx_xvsub_w(t4, temp3);
    temp1 = __lasx_xvsub_w(t3, temp2);

    DUP2_ARG3(__lasx_xvsrani_h_w, t2, t1, 3, temp1, temp0, 3, temp2, temp3);
    temp3 = __lasx_xvshuf4i_h(temp3, 0x4E);
    temp0 = __lasx_xvpermi_q(temp3, temp2, 0x20);
    temp1 = __lasx_xvpermi_q(temp3, temp2, 0x31);
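
    /* Column pass: 4-point transform (17, 22, 10) with rounding bias 64,
     * then >> 7 and add to the destination pixels. */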
 
    DUP2_ARG3(__lasx_xvdp2add_w_h, const_64, temp0, const_8, const_64, temp0,
              const_9, t1, t2);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_10, temp1, const_11, t3, t4);
    temp0 = __lasx_xvadd_w(t1, t3);
    temp1 = __lasx_xvsub_w(t2, t4);
    temp2 = __lasx_xvadd_w(t2, t4);
    temp3 = __lasx_xvsub_w(t1, t3);
 
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 7, temp1, 7, temp2, 7, temp3, 7,
              temp0, temp1, temp2, temp3);

    temp0 = __lasx_xvldrepl_d(dest, 0);
    DUP4_ARG2(__lasx_xvldrepl_d, dest, 0, dest + stride, 0, dest + stride2, 0,
              dest + stride3, 0, temp0, temp1, temp2, temp3);
    DUP4_ARG1(__lasx_vext2xv_wu_bu, temp0, temp1, temp2, temp3,
              temp0, temp1, temp2, temp3);

    temp2 = __lasx_xvpickev_b(temp1, temp0);
    temp0 = __lasx_xvperm_w(temp2, shift);
    __lasx_xvstelm_d(temp0, dest, 0, 0);
    __lasx_xvstelm_d(temp0, dest + stride, 0, 1);
    __lasx_xvstelm_d(temp0, dest + stride2, 0, 2);
    __lasx_xvstelm_d(temp0, dest + stride3, 0, 3);
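
/* DC-only 8x4 inverse transform. */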
 
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    __m256i in0, in1, in2, in3;
    __m256i const_dc, temp0, temp1, reg0, reg1;

    dc = (3  * dc + 1) >> 1;
    dc = (17 * dc + 64) >> 7;
    const_dc = __lasx_xvreplgr2vr_h(dc);

    DUP4_ARG2(__lasx_xvldrepl_d, dest, 0, dest + stride, 0, dest + stride2,
              0, dest + stride3, 0, in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvilvl_d, in1, in0, in3, in2, temp0, temp1);
    DUP2_ARG1(__lasx_vext2xv_hu_bu, temp0, temp1, temp0, temp1);
    DUP2_ARG2(__lasx_xvadd_h, temp0, const_dc, temp1, const_dc, reg0, reg1);
    temp0 = __lasx_xvssrarni_bu_h(reg1, reg0, 0);
    __lasx_xvstelm_d(temp0, dest, 0, 0);
    __lasx_xvstelm_d(temp0, dest + stride, 0, 2);
    __lasx_xvstelm_d(temp0, dest + stride2, 0, 1);
    __lasx_xvstelm_d(temp0, dest + stride3, 0, 3);
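
/* DC-only 4x8 inverse transform. */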
 
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    uint8_t *dst = dest + (stride2 << 1);
    __m256i in0, in1, in2, in3, in4, in5, in6, in7;
    __m256i const_dc, temp0, temp1, temp2, temp3, reg0, reg1;

    dc = (17 * dc +  4) >> 3;
    dc = (12 * dc + 64) >> 7;
    const_dc = __lasx_xvreplgr2vr_h(dc);

    DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dest + stride, 0, dest + stride2,
              0, dest + stride3, 0, in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvldrepl_w, dst, 0, dst + stride, 0, dst + stride2,
              0, dst + stride3, 0, in4, in5, in6, in7);

    DUP4_ARG2(__lasx_xvilvl_w, in1, in0, in3, in2, in5, in4, in7, in6,
              temp0, temp1, temp2, temp3);
    DUP2_ARG2(__lasx_xvilvl_d, temp1, temp0, temp3, temp2, reg0, reg1);
    DUP2_ARG1(__lasx_vext2xv_hu_bu, reg0, reg1, temp0, temp1);
    DUP2_ARG2(__lasx_xvadd_h, temp0, const_dc, temp1, const_dc, reg0, reg1);
    temp0 = __lasx_xvssrarni_bu_h(reg1, reg0, 0);
    __lasx_xvstelm_w(temp0, dest, 0, 0);
    __lasx_xvstelm_w(temp0, dest + stride, 0, 1);
    __lasx_xvstelm_w(temp0, dest + stride2, 0, 4);
    __lasx_xvstelm_w(temp0, dest + stride3, 0, 5);
    __lasx_xvstelm_w(temp0, dst, 0, 2);
    __lasx_xvstelm_w(temp0, dst + stride, 0, 3);
    __lasx_xvstelm_w(temp0, dst + stride2, 0, 6);
    __lasx_xvstelm_w(temp0, dst + stride3, 0, 7);
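
/* 4x8 inverse transform: 4-point rows (17, 22, 10), 8-point columns. */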
 
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride3 = stride2 + stride;
    uint8_t *dst = dest + (stride2 << 1);
    __m256i in0, in1, in2, in3;
    __m256i temp0, temp1, temp2, temp3, t1, t2, t3, t4;
    __m256i const_1  = {0x0011001100110011, 0x0011001100110011,
                        0x0011001100110011, 0x0011001100110011};
    __m256i const_2  = {0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011,
                        0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011};
    __m256i const_3  = {0x000A0016000A0016, 0x000A0016000A0016,
                        0x000A0016000A0016, 0x000A0016000A0016};
    __m256i const_4  = {0x0016FFF60016FFF6, 0x0016FFF60016FFF6,
                        0x0016FFF60016FFF6, 0x0016FFF60016FFF6};
    __m256i const_5  = {0x0000000400000004, 0x0000000400000004,
                        0x0000000400000004, 0x0000000400000004};
    __m256i const_6  = {0x0000004000000040, 0x0000004000000040,
                        0x0000004000000040, 0x0000004000000040};
    __m256i const_7  = {0x000C000C000C000C, 0x000C000C000C000C,
                        0xFFF4000CFFF4000C, 0xFFF4000CFFF4000C};
    __m256i const_8  = {0x0006001000060010, 0x0006001000060010,
                        0xFFF00006FFF00006, 0xFFF00006FFF00006};
    __m256i const_9  = {0x0009001000090010, 0x0009001000090010,
                        0x0004000F0004000F, 0x0004000F0004000F};
    __m256i const_10 = {0xFFF0000FFFF0000F, 0xFFF0000FFFF0000F,
                        0xFFF7FFFCFFF7FFFC, 0xFFF7FFFCFFF7FFFC};
    __m256i const_11 = {0x0004000900040009, 0x0004000900040009,
                        0x000FFFF0000FFFF0, 0x000FFFF0000FFFF0};
    __m256i const_12 = {0x000F0004000F0004, 0x000F0004000F0004,
                        0xFFF0FFF7FFF0FFF7, 0xFFF0FFF7FFF0FFF7};
    __m256i shift    = {0x0000000400000000, 0x0000000600000002,
                        0x0000000500000001, 0x0000000700000003};
 
    in0   = __lasx_xvilvl_d(in1, in0);
    in1   = __lasx_xvilvl_d(in3, in2);
    temp0 = __lasx_xvpickev_h(in1, in0);
    temp1 = __lasx_xvpickod_h(in1, in0);
    temp0 = __lasx_xvperm_w(temp0, shift);
    temp1 = __lasx_xvperm_w(temp1, shift);

    DUP2_ARG3(__lasx_xvdp2add_w_h, const_5, temp0, const_1, const_5, temp0,
              const_2, t1, t2);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_3, temp1, const_4, t3, t4);

    temp0 = __lasx_xvadd_w(t1, t3);
    temp1 = __lasx_xvsub_w(t2, t4);
    temp2 = __lasx_xvadd_w(t2, t4);
    temp3 = __lasx_xvsub_w(t1, t3);
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 3, temp1, 3, temp2, 3, temp3, 3,
              temp0, temp1, temp2, temp3);
 
    t1    = __lasx_xvpickev_w(temp1, temp0);
    t2    = __lasx_xvpickev_w(temp3, temp2);
    t1    = __lasx_xvpickev_h(t2, t1);
    t3    = __lasx_xvpickod_w(temp1, temp0);
    t4    = __lasx_xvpickod_w(temp3, temp2);
    temp1 = __lasx_xvpickev_h(t4, t3);
    temp2 = __lasx_xvpermi_q(t1, t1, 0x00);
    temp3 = __lasx_xvpermi_q(t1, t1, 0x11);
    t1 = __lasx_xvdp2add_w_h(const_6, temp2, const_7);
    t2 = __lasx_xvdp2_w_h(temp3, const_8);
    t3    = __lasx_xvadd_w(t1, t2);
    t4    = __lasx_xvsub_w(t1, t2);
    t4    = __lasx_xvpermi_d(t4, 0x4E);
 
    DUP4_ARG2(__lasx_xvdp2_w_h, temp1, const_9, temp1, const_10, temp1,
              const_11, temp1, const_12, t1, t2, temp2, temp3);

    temp0 = __lasx_xvpermi_q(t2, t1, 0x20);
    temp1 = __lasx_xvpermi_q(t2, t1, 0x31);
    t1    = __lasx_xvadd_w(temp0, temp1);
    temp0 = __lasx_xvpermi_q(temp3, temp2, 0x20);
    temp1 = __lasx_xvpermi_q(temp3, temp2, 0x31);
    t2    = __lasx_xvadd_w(temp1, temp0);
    temp0 = __lasx_xvadd_w(t1, t3);
    temp1 = __lasx_xvadd_w(t2, t4);
    temp2 = __lasx_xvsub_w(t4, t2);
    temp3 = __lasx_xvsub_w(t3, t1);
    temp2 = __lasx_xvaddi_wu(temp2, 1);
    temp3 = __lasx_xvaddi_wu(temp3, 1);
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 7, temp1, 7, temp2, 7, temp3, 7,
              temp0, temp1, temp2, temp3);

    DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dest + stride, 0, dest + stride2, 0,
              dest + stride3, 0, const_1, const_2, const_3, const_4);
    DUP4_ARG2(__lasx_xvldrepl_w, dst, 0, dst + stride, 0, dst + stride2, 0,
              dst + stride3, 0, const_5, const_6, const_7, const_8);

    DUP4_ARG2(__lasx_xvilvl_w, const_2, const_1, const_4, const_3, const_5,
              const_6, const_7, const_8, const_1, const_2, const_3, const_4);
    DUP4_ARG1(__lasx_vext2xv_wu_bu, const_1, const_2, const_3, const_4,
              const_1, const_2, const_3, const_4);
    DUP4_ARG2(__lasx_xvadd_w, temp0, const_1, temp1, const_2, temp2, const_3,
              temp3, const_4, temp0, temp1, temp2, temp3);
    DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3,
              temp0, temp1, temp2, temp3);
    DUP2_ARG2(__lasx_xvpickev_h, temp1, temp0, temp3, temp2, temp0, temp1);
    temp0   = __lasx_xvpickev_b(temp1, temp0);
    __lasx_xvstelm_w(temp0, dest, 0, 0);
    __lasx_xvstelm_w(temp0, dest + stride, 0, 4);
    __lasx_xvstelm_w(temp0, dest + stride2, 0, 1);
    __lasx_xvstelm_w(temp0, dest + stride3, 0, 5);
    __lasx_xvstelm_w(temp0, dst, 0, 6);
    __lasx_xvstelm_w(temp0, dst + stride, 0, 2);
    __lasx_xvstelm_w(temp0, dst + stride2, 0, 7);
    __lasx_xvstelm_w(temp0, dst + stride3, 0, 3);
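
/* DC-only 4x4 inverse transform. */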
 
    uint8_t *dst1 = dest + stride;
    uint8_t *dst2 = dst1 + stride;
    uint8_t *dst3 = dst2 + stride;
    __m256i in0, in1, in2, in3, temp0, temp1, const_dc;

    dc = (17 * dc +  4) >> 3;
    dc = (17 * dc + 64) >> 7;
    const_dc = __lasx_xvreplgr2vr_h(dc);

    DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dst1, 0, dst2, 0, dst3, 0,
              in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvilvl_w, in1, in0, in3, in2, temp0, temp1);
    in0   = __lasx_xvpermi_q(temp1, temp0, 0x20);
    temp0 = __lasx_xvilvl_b(zero, in0);
    in0   = __lasx_xvadd_h(temp0, const_dc);
    temp0 = __lasx_xvssrarni_bu_h(in0, in0, 0);
    __lasx_xvstelm_w(temp0, dest, 0, 0);
    __lasx_xvstelm_w(temp0, dst1, 0, 1);
    __lasx_xvstelm_w(temp0, dst2, 0, 4);
    __lasx_xvstelm_w(temp0, dst3, 0, 5);
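
/* 4x4 inverse transform: 4-point transform on both rows and columns. */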
 
    uint8_t *dst1 = dest + stride;
    uint8_t *dst2 = dst1 + stride;
    uint8_t *dst3 = dst2 + stride;
    __m256i in0, in1, in2, in3;
    __m256i temp0, temp1, temp2, temp3, t1, t2;
    __m256i const_1  = {0x0011001100110011, 0xFFEF0011FFEF0011,
                        0x0011001100110011, 0xFFEF0011FFEF0011};
    __m256i const_2  = {0x000A0016000A0016, 0x0016FFF60016FFF6,
                        0x000A0016000A0016, 0x0016FFF60016FFF6};
    __m256i const_64 = {0x0000004000000040, 0x0000004000000040,
                        0x0000004000000040, 0x0000004000000040};

    temp0 = __lasx_xvilvl_d(in1, in0);
    temp1 = __lasx_xvpickev_h(temp0, temp0);
    temp2 = __lasx_xvpickod_h(temp0, temp0);
    DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_1, temp2, const_2, t1, t2);
    t1    = __lasx_xvaddi_wu(t1, 4);
    in0   = __lasx_xvadd_w(t1, t2);
    in1   = __lasx_xvsub_w(t1, t2);
    DUP2_ARG2(__lasx_xvsrai_w, in0, 3, in1, 3, in0, in1);

    temp0   = __lasx_xvpickev_h(in1, in0);
    temp1   = __lasx_xvpermi_q(temp0, temp0, 0x00);
    temp2   = __lasx_xvpermi_q(temp0, temp0, 0x11);
    const_1 = __lasx_xvpermi_d(const_1, 0xD8);
    const_2 = __lasx_xvpermi_d(const_2, 0xD8);
    t1 = __lasx_xvdp2add_w_h(const_64, temp1, const_1);
    t2 = __lasx_xvdp2_w_h(temp2, const_2);
    in0     = __lasx_xvadd_w(t1, t2);
    in1     = __lasx_xvsub_w(t1, t2);
    DUP2_ARG2(__lasx_xvsrai_w, in0, 7, in1, 7, in0, in1);
    temp0   = __lasx_xvshuf4i_w(in0, 0x9C);
    temp1   = __lasx_xvshuf4i_w(in1, 0x9C);

    DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dst1, 0, dst2, 0, dst3, 0,
              in0, in1, in2, in3);
    temp2   = __lasx_xvilvl_w(in2, in0);
    temp2   = __lasx_vext2xv_wu_bu(temp2);
    temp3   = __lasx_xvilvl_w(in1, in3);
    temp3   = __lasx_vext2xv_wu_bu(temp3);
    temp0   = __lasx_xvadd_w(temp0, temp2);
    temp1   = __lasx_xvadd_w(temp1, temp3);
    DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1);
    temp1   = __lasx_xvpickev_h(temp1, temp0);
    temp0   = __lasx_xvpickev_b(temp1, temp1);
    __lasx_xvstelm_w(temp0, dest, 0, 0);
    __lasx_xvstelm_w(temp0, dst1, 0, 5);
    __lasx_xvstelm_w(temp0, dst2, 0, 4);
    __lasx_xvstelm_w(temp0, dst3, 0, 1);
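
/* Combined horizontal+vertical sub-pel MC for one 8x8 block: a 4-tap
 * bicubic filter is applied vertically first, then horizontally on the
 * 16-bit intermediate. */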
 
static void put_vc1_mspel_mc_h_v_lasx(uint8_t *dst, const uint8_t *src,
                                      ptrdiff_t stride, int hmode, int vmode,
                                      int rnd)
{
    __m256i in0, in1, in2, in3;
    __m256i t0, t1, t2, t3, t4, t5, t6, t7;
    __m256i temp0, temp1, const_para1_2, const_para0_3;
    __m256i const_r, const_sh;
    __m256i sh = {0x0000000400000000, 0x0000000500000001,
                  0x0000000600000002, 0x0000000700000003};
    static const uint8_t para_value[][4] = {{4, 3, 53, 18},
                                            {1, 1, 9, 9},
                                            {3, 4, 18, 53}};
    static const int shift_value[] = {0, 5, 1, 5};
    int shift = (shift_value[hmode] + shift_value[vmode]) >> 1;
    int r = (1 << (shift - 1)) + rnd - 1;
    const uint8_t *para_v = para_value[vmode - 1];
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride4 = stride << 2;
    ptrdiff_t stride3 = stride2 + stride;

    const_r  = __lasx_xvreplgr2vr_h(r);
    const_sh = __lasx_xvreplgr2vr_h(shift);

    const_para0_3 = __lasx_xvldrepl_h(para_v, 0);
    const_para1_2 = __lasx_xvldrepl_h(para_v, 2);
    DUP4_ARG2(__lasx_xvld, src, 0, src + stride, 0, src + stride2, 0,
              src + stride3, 0, in0, in1, in2, in3);
 
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, temp0, temp1);
    t0 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t0 = __lasx_xvdp2sub_h_bu(t0, temp1, const_para0_3);
    src += stride4;
    in0   = __lasx_xvld(src, 0);
    in0   = __lasx_xvpermi_d(in0, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in3, in2, in0, in1, temp0, temp1);
    t1 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t1 = __lasx_xvdp2sub_h_bu(t1, temp1, const_para0_3);
    src += stride;
    in1   = __lasx_xvld(src, 0);
    in1   = __lasx_xvpermi_d(in1, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in0, in3, in1, in2, temp0, temp1);
    t2 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t2 = __lasx_xvdp2sub_h_bu(t2, temp1, const_para0_3);
    src += stride;
    in2   = __lasx_xvld(src, 0);
    in2   = __lasx_xvpermi_d(in2, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in1, in0, in2, in3, temp0, temp1);
    t3 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t3 = __lasx_xvdp2sub_h_bu(t3, temp1, const_para0_3);
    src += stride;
    in3   = __lasx_xvld(src, 0);
    in3   = __lasx_xvpermi_d(in3, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, temp0, temp1);
    t4 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t4 = __lasx_xvdp2sub_h_bu(t4, temp1, const_para0_3);
    src += stride;
    in0   = __lasx_xvld(src, 0);
    in0   = __lasx_xvpermi_d(in0, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in3, in2, in0, in1, temp0, temp1);
    t5 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t5 = __lasx_xvdp2sub_h_bu(t5, temp1, const_para0_3);
    src += stride;
    in1   = __lasx_xvld(src, 0);
    in1   = __lasx_xvpermi_d(in1, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in0, in3, in1, in2, temp0, temp1);
    t6 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t6 = __lasx_xvdp2sub_h_bu(t6, temp1, const_para0_3);
    src += stride;
    in2   = __lasx_xvld(src, 0);
    in2   = __lasx_xvpermi_d(in2, 0xD8);
    DUP2_ARG2(__lasx_xvilvl_b, in1, in0, in2, in3, temp0, temp1);
    t7 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
    t7 = __lasx_xvdp2sub_h_bu(t7, temp1, const_para0_3);
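
    /* Horizontal pass over the 16-bit intermediate rows, widening to
     * 32 bits through the dot products. */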
 
    para_v  = para_value[hmode - 1];
    const_para0_3 = __lasx_xvldrepl_h(para_v, 0);
    const_para1_2 = __lasx_xvldrepl_h(para_v, 2);
    const_para0_3 = __lasx_vext2xv_h_b(const_para0_3);
    const_para1_2 = __lasx_vext2xv_h_b(const_para1_2);

    const_r = __lasx_xvreplgr2vr_w(r);
 
    t7      = __lasx_xvpermi_d(t7, 0xD8);

    t0 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t0 = __lasx_xvdp2sub_w_h(t0, temp1, const_para0_3);

    t1 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t1 = __lasx_xvdp2sub_w_h(t1, temp1, const_para0_3);

    t2 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t2 = __lasx_xvdp2sub_w_h(t2, temp1, const_para0_3);

    t3 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t3 = __lasx_xvdp2sub_w_h(t3, temp1, const_para0_3);

    t4 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t4 = __lasx_xvdp2sub_w_h(t4, temp1, const_para0_3);

    t5 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t5 = __lasx_xvdp2sub_w_h(t5, temp1, const_para0_3);

    t6 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t6 = __lasx_xvdp2sub_w_h(t6, temp1, const_para0_3);
    DUP2_ARG2(__lasx_xvilvl_h, in1, in0, in2, t7, temp0, temp1);
    t7 = __lasx_xvdp2_w_h(temp0, const_para1_2);
    t7 = __lasx_xvdp2sub_w_h(t7, temp1, const_para0_3);

    DUP4_ARG2(__lasx_xvsrai_w, t0, 7, t1, 7, t2, 7, t3, 7, t0, t1, t2, t3);
    DUP4_ARG2(__lasx_xvsrai_w, t4, 7, t5, 7, t6, 7, t7, 7, t4, t5, t6, t7);
 
    t0 = __lasx_xvperm_w(t0, sh);
    t1 = __lasx_xvperm_w(t1, sh);
    __lasx_xvstelm_d(t0, dst, 0, 0);
    __lasx_xvstelm_d(t0, dst + stride, 0, 1);
    __lasx_xvstelm_d(t0, dst + stride2, 0, 2);
    __lasx_xvstelm_d(t0, dst + stride3, 0, 3);
    dst += stride4;
    __lasx_xvstelm_d(t1, dst, 0, 0);
    __lasx_xvstelm_d(t1, dst + stride, 0, 1);
    __lasx_xvstelm_d(t1, dst + stride2, 0, 2);
    __lasx_xvstelm_d(t1, dst + stride3, 0, 3);
}
 
#define PUT_VC1_MSPEL_MC_LASX(hmode, vmode)                                   \
void ff_put_vc1_mspel_mc ## hmode ## vmode ## _lasx(uint8_t *dst,             \
                                                const uint8_t *src,           \
                                                ptrdiff_t stride, int rnd)    \
{                                                                             \
    put_vc1_mspel_mc_h_v_lasx(dst, src, stride, hmode, vmode, rnd);           \
}                                                                             \
void ff_put_vc1_mspel_mc ## hmode ## vmode ## _16_lasx(uint8_t *dst,          \
                                                   const uint8_t *src,        \
                                                   ptrdiff_t stride, int rnd) \
{                                                                             \
    put_vc1_mspel_mc_h_v_lasx(dst, src, stride, hmode, vmode, rnd);           \
    put_vc1_mspel_mc_h_v_lasx(dst + 8, src + 8, stride, hmode, vmode, rnd);   \
    dst += 8 * stride, src += 8 * stride;                                     \
    put_vc1_mspel_mc_h_v_lasx(dst, src, stride, hmode, vmode, rnd);           \
    put_vc1_mspel_mc_h_v_lasx(dst + 8, src + 8, stride, hmode, vmode, rnd);   \
}
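
/* 8x8 "no rounding" chroma MC: bilinear weights A = (8-x)(8-y),
 * B = x(8-y), C = (8-x)y, D = xy, output (sum + 32 - 4) >> 6. */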
void ff_put_no_rnd_vc1_chroma_mc8_lasx(uint8_t *dst, const uint8_t *src,
                                       ptrdiff_t stride, int h, int x, int y)
{
    const int intA = (8 - x) * (8 - y);
    const int intB =     (x) * (8 - y);
    const int intC = (8 - x) *     (y);
    const int intD =     (x) *     (y);
    __m256i src00, src01, src10, src11;
    __m256i A, B, C, D;
    int i;

    av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);

    A = __lasx_xvreplgr2vr_h(intA);
    B = __lasx_xvreplgr2vr_h(intB);
    C = __lasx_xvreplgr2vr_h(intC);
    D = __lasx_xvreplgr2vr_h(intD);
    for (i = 0; i < h; i++) {

        DUP4_ARG1(__lasx_vext2xv_hu_bu, src00, src01, src10, src11,
                  src00, src01, src10, src11);
        DUP4_ARG2(__lasx_xvmul_h, src00, A, src01, B, src10, C, src11, D,
                  src00, src01, src10, src11);
        src00 = __lasx_xvadd_h(src00, src01);
        src10 = __lasx_xvadd_h(src10, src11);
        src00 = __lasx_xvadd_h(src00, src10);
        src00 = __lasx_xvaddi_hu(src00, 28);
        src00 = __lasx_xvsrli_h(src00, 6);
        src00 = __lasx_xvpickev_b(src00, src00);
        __lasx_xvstelm_d(src00, dst, 0, 0);
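
/* Vertical-only sub-pel MC, 16 pixels wide per iteration. */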
 
static void put_vc1_mspel_mc_v_lasx(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride, int vmode, int rnd)
{
    __m256i in0, in1, in2, in3, temp0, temp1, t0;
    __m256i const_para0_3, const_para1_2, const_r, const_sh;
    static const uint16_t para_value[][2] = {{0x0304, 0x1235},
                                             {0x0101, 0x0909},
                                             {0x0403, 0x3512}};
    const uint16_t *para_v = para_value[vmode - 1];
    static const int shift_value[] = {0, 6, 4, 6};
    static int add_value[3];
    ptrdiff_t stride_2x = stride << 1;
    int i = 0;

    add_value[2] = add_value[0] = 31 + rnd, add_value[1] = 7 + rnd;

    const_r  = __lasx_xvreplgr2vr_h(add_value[vmode - 1]);
    const_sh = __lasx_xvreplgr2vr_h(shift_value[vmode]);
    const_para0_3 = __lasx_xvreplgr2vr_h(*para_v);
    const_para1_2 = __lasx_xvreplgr2vr_h(*(para_v + 1));

    in0   = __lasx_xvpermi_d(in0, 0xD8);
    in1   = __lasx_xvpermi_d(in1, 0xD8);
    in2   = __lasx_xvpermi_d(in2, 0xD8);
    for (; i < 16; i++) {
        in3 = __lasx_xvld(src + stride_2x, 0);
        in3 = __lasx_xvpermi_d(in3, 0xD8);
        DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, temp0, temp1);
        t0 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
        t0 = __lasx_xvdp2sub_h_bu(t0, temp1, const_para0_3);
        t0 = __lasx_xvadd_h(t0, const_r);
        t0 = __lasx_xvsra_h(t0, const_sh);
        t0 = __lasx_xvclip255_h(t0);
        t0 = __lasx_xvpickev_b(t0, t0);
        __lasx_xvstelm_d(t0, dst, 0, 0);
        __lasx_xvstelm_d(t0, dst, 8, 2);
 
#define PUT_VC1_MSPEL_MC_V_LASX(vmode)                                    \
void ff_put_vc1_mspel_mc0 ## vmode ## _16_lasx(uint8_t *dst,              \
                                               const uint8_t *src,        \
                                               ptrdiff_t stride, int rnd) \
{                                                                         \
    put_vc1_mspel_mc_v_lasx(dst, src, stride, vmode, rnd);                \
}
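
/* One horizontal filter row: centre-tap dot product minus outer-tap dot
 * product, add rounding constant, shift, clip to 0..255, pack to bytes. */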
#define ROW_LASX(in0, in1, in2, in3, out0)                                \
    DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, tmp0_m, tmp1_m);       \
    out0 = __lasx_xvdp2_h_bu(tmp0_m, const_para1_2);                      \
    out0 = __lasx_xvdp2sub_h_bu(out0, tmp1_m, const_para0_3);             \
    out0 = __lasx_xvadd_h(out0, const_r);                                 \
    out0 = __lasx_xvsra_h(out0, const_sh);                                \
    out0 = __lasx_xvclip255_h(out0);                                      \
    out0 = __lasx_xvpickev_b(out0, out0);                                 \
    out0 = __lasx_xvpermi_d(out0, 0xd8);
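
/* Horizontal-only sub-pel MC for a 16x16 block: load 16 rows, transpose
 * bytes to columns, run ROW_LASX on each, transpose back and store. */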
static void put_vc1_mspel_mc_h_lasx(uint8_t *dst, const uint8_t *src,
                                    ptrdiff_t stride, int hmode, int rnd)
{
    __m256i in0, in1, in2, in3, in4, in5, in6, in7,
            in8, in9, in10, in11, in12, in13, in14, in15;
    __m256i out0, out1, out2, out3, out4, out5, out6, out7, out8, out9,
            out10, out11, out12, out13, out14, out15, out16, out17, out18;
    __m256i const_para0_3, const_para1_2, const_r, const_sh;
    __m256i tmp0_m, tmp1_m, tmp2_m, tmp3_m;
    __m256i tmp4_m, tmp5_m, tmp6_m, tmp7_m;
    __m256i t0, t1, t2, t3, t4, t5, t6, t7;
    ptrdiff_t stride2 = stride << 1;
    ptrdiff_t stride4 = stride << 2;
    ptrdiff_t stride3 = stride2 + stride;
    static const uint16_t para_value[][2] = {{0x0304, 0x1235},
                                             {0x0101, 0x0909},
                                             {0x0403, 0x3512}};
    const uint16_t *para_v = para_value[hmode - 1];
    static const int shift_value[] = {0, 6, 4, 6};
    static int add_value[3];
    uint8_t *_src = (uint8_t*)src - 1;

    add_value[2] = add_value[0] = 32 - rnd, add_value[1] = 8 - rnd;

    const_r  = __lasx_xvreplgr2vr_h(add_value[hmode - 1]);
    const_sh = __lasx_xvreplgr2vr_h(shift_value[hmode]);
    const_para0_3 = __lasx_xvreplgr2vr_h(*para_v);
    const_para1_2 = __lasx_xvreplgr2vr_h(*(para_v + 1));

    in0 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in1, in2);
    in3 = __lasx_xvldx(_src, stride3);
    _src += stride4;
    in4 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in5, in6);
    in7 = __lasx_xvldx(_src, stride3);
    _src += stride4;
    in8 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in9, in10);
    in11 = __lasx_xvldx(_src, stride3);
    _src += stride4;
    in12 = __lasx_xvld(_src, 0);
    DUP2_ARG2(__lasx_xvldx, _src, stride, _src, stride2, in13, in14);
    in15 = __lasx_xvldx(_src, stride3);
 
    DUP4_ARG2(__lasx_xvilvl_b, in2, in0, in3, in1, in6, in4, in7, in5,
              tmp0_m, tmp1_m, tmp2_m, tmp3_m);
    DUP4_ARG2(__lasx_xvilvl_b, in10, in8, in11, in9, in14, in12, in15, in13,
              tmp4_m, tmp5_m, tmp6_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t0, t2, t4, t6);
    DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t1, t3, t5, t7);
    DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
              tmp1_m, tmp5_m);
    DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
              tmp3_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out0, out2, out4, out6);
    DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out1, out3, out5, out7);
 
    DUP4_ARG2(__lasx_xvilvh_b, in2, in0, in3, in1, in6, in4, in7, in5,
              tmp0_m, tmp1_m, tmp2_m, tmp3_m);
    DUP4_ARG2(__lasx_xvilvh_b, in10, in8, in11, in9, in14, in12, in15, in13,
              tmp4_m, tmp5_m, tmp6_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t0, t2, t4, t6);
    DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t1, t3, t5, t7);
    DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
              tmp1_m, tmp5_m);
    DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
              tmp3_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out8, out10, out12, out14);
    DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out9, out11, out13, out15);
 
    DUP2_ARG3(__lasx_xvpermi_q, out0, out0, 0x31, out1, out1, 0x31, out16, out17);
    out18 = __lasx_xvpermi_q(out2, out2, 0x31);

    DUP4_ARG2(__lasx_xvpermi_d, out0, 0xD8, out1, 0xD8, out2, 0xD8, out3, 0xD8,
              out0, out1, out2, out3);
    DUP4_ARG2(__lasx_xvpermi_d, out4, 0xD8, out5, 0xD8, out6, 0xD8, out7, 0xD8,
              out4, out5, out6, out7);
    DUP4_ARG2(__lasx_xvpermi_d, out8, 0xD8, out9, 0xD8, out10, 0xD8, out11,
              0xD8, out8, out9, out10, out11);
    DUP4_ARG2(__lasx_xvpermi_d, out12, 0xD8, out13, 0xD8, out14, 0xD8, out15,
              0xD8, out12, out13, out14, out15);
    out16 = __lasx_xvpermi_d(out16, 0xD8);
    out17 = __lasx_xvpermi_d(out17, 0xD8);
    out18 = __lasx_xvpermi_d(out18, 0xD8);

    ROW_LASX(out0,  out1,  out2,  out3,  in0);
    ROW_LASX(out1,  out2,  out3,  out4,  in1);
    ROW_LASX(out2,  out3,  out4,  out5,  in2);
    ROW_LASX(out3,  out4,  out5,  out6,  in3);
    ROW_LASX(out4,  out5,  out6,  out7,  in4);
    ROW_LASX(out5,  out6,  out7,  out8,  in5);
    ROW_LASX(out6,  out7,  out8,  out9,  in6);
    ROW_LASX(out7,  out8,  out9,  out10, in7);
    ROW_LASX(out8,  out9,  out10, out11, in8);
    ROW_LASX(out9,  out10, out11, out12, in9);
    ROW_LASX(out10, out11, out12, out13, in10);
    ROW_LASX(out11, out12, out13, out14, in11);
    ROW_LASX(out12, out13, out14, out15, in12);
    ROW_LASX(out13, out14, out15, out16, in13);
    ROW_LASX(out14, out15, out16, out17, in14);
    ROW_LASX(out15, out16, out17, out18, in15);
 
    DUP4_ARG2(__lasx_xvilvl_b, in2, in0, in3, in1, in6, in4, in7, in5,
              tmp0_m, tmp1_m, tmp2_m, tmp3_m);
    DUP4_ARG2(__lasx_xvilvl_b, in10, in8, in11, in9, in14, in12, in15, in13,
              tmp4_m, tmp5_m, tmp6_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t0, t2, t4, t6);
    DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t1, t3, t5, t7);
    DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
              tmp1_m, tmp5_m);
    DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
              tmp3_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out0, out2, out4, out6);
    DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out1, out3, out5, out7);
 
    DUP4_ARG2(__lasx_xvilvh_b, in2, in0, in3, in1, in6, in4, in7, in5,
              tmp0_m, tmp1_m, tmp2_m, tmp3_m);
    DUP4_ARG2(__lasx_xvilvh_b, in10, in8, in11, in9, in14, in12, in15, in13,
              tmp4_m, tmp5_m, tmp6_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t0, t2, t4, t6);
    DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, t1, t3, t5, t7);
    DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
              tmp1_m, tmp5_m);
    DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
              tmp3_m, tmp7_m);
    DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out8, out10, out12, out14);
    DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
              tmp7_m, tmp6_m, out9, out11, out13, out15);
 
    __lasx_xvstelm_d(out0, dst, 0, 0);
    __lasx_xvstelm_d(out0, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out1, dst, 0, 0);
    __lasx_xvstelm_d(out1, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out2, dst, 0, 0);
    __lasx_xvstelm_d(out2, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out3, dst, 0, 0);
    __lasx_xvstelm_d(out3, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out4, dst, 0, 0);
    __lasx_xvstelm_d(out4, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out5, dst, 0, 0);
    __lasx_xvstelm_d(out5, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out6, dst, 0, 0);
    __lasx_xvstelm_d(out6, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out7, dst, 0, 0);
    __lasx_xvstelm_d(out7, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out8, dst, 0, 0);
    __lasx_xvstelm_d(out8, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out9, dst, 0, 0);
    __lasx_xvstelm_d(out9, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out10, dst, 0, 0);
    __lasx_xvstelm_d(out10, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out11, dst, 0, 0);
    __lasx_xvstelm_d(out11, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out12, dst, 0, 0);
    __lasx_xvstelm_d(out12, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out13, dst, 0, 0);
    __lasx_xvstelm_d(out13, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out14, dst, 0, 0);
    __lasx_xvstelm_d(out14, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(out15, dst, 0, 0);
    __lasx_xvstelm_d(out15, dst, 8, 1);
}
 
#define PUT_VC1_MSPEL_MC_H_LASX(hmode)                                    \
void ff_put_vc1_mspel_mc ## hmode ## 0_16_lasx(uint8_t *dst,              \
                                               const uint8_t *src,        \
                                               ptrdiff_t stride, int rnd) \
{                                                                         \
    put_vc1_mspel_mc_h_lasx(dst, src, stride, hmode, rnd);                \
}