#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,   \
                         p1_or_q1_org_in, p2_or_q2_org_in,   \
                         neg_tc_in, tc_in, p1_or_q1_out)     \
{                                                            \
    __m256i clip3, temp;                                     \
                                                             \
    clip3 = __lasx_xvavgr_hu(p0_or_q0_org_in,                \
                             q0_or_p0_org_in);               \
    temp = __lasx_xvslli_h(p1_or_q1_org_in, 1);              \
    clip3 = __lasx_xvsub_h(clip3, temp);                     \
    clip3 = __lasx_xvavg_h(p2_or_q2_org_in, clip3);          \
    clip3 = __lasx_xvclip_h(clip3, neg_tc_in, tc_in);        \
    p1_or_q1_out = __lasx_xvadd_h(p1_or_q1_org_in, clip3);   \
}
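
/* Scalar view of the two helpers above/below:
 *   AVC_LPF_P1_OR_Q1 (above):
 *       p1' = p1 + clip3((p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1, -tc0, tc0)
 *   AVC_LPF_P0Q0 (below):
 *       delta = clip3((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, -tc, tc)
 *       p0'   = clip255(p0 + delta),  q0' = clip255(q0 - delta)
 * These are the H.264 bS < 4 ("normal") filter equations, evaluated here
 * on 16-bit lanes. */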
#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,       \
                     p1_or_q1_org_in, q1_or_p1_org_in,       \
                     neg_threshold_in, threshold_in,         \
                     p0_or_q0_out, q0_or_p0_out)             \
{                                                            \
    __m256i q0_sub_p0, p1_sub_q1, delta;                     \
                                                             \
    q0_sub_p0 = __lasx_xvsub_h(q0_or_p0_org_in,              \
                               p0_or_q0_org_in);             \
    p1_sub_q1 = __lasx_xvsub_h(p1_or_q1_org_in,              \
                               q1_or_p1_org_in);             \
    q0_sub_p0 = __lasx_xvslli_h(q0_sub_p0, 2);               \
    p1_sub_q1 = __lasx_xvaddi_hu(p1_sub_q1, 4);              \
    delta = __lasx_xvadd_h(q0_sub_p0, p1_sub_q1);            \
    delta = __lasx_xvsrai_h(delta, 3);                       \
    delta = __lasx_xvclip_h(delta, neg_threshold_in,         \
                            threshold_in);                   \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, delta);   \
    q0_or_p0_out = __lasx_xvsub_h(q0_or_p0_org_in, delta);   \
                                                             \
    p0_or_q0_out = __lasx_xvclip255_h(p0_or_q0_out);         \
    q0_or_p0_out = __lasx_xvclip255_h(q0_or_p0_out);         \
}
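
/* ff_h264_h_lpf_luma_8_lasx below filters one vertical luma edge
 * (16 rows). tc holds four tc0 values, one per group of 4 rows; a
 * negative tc0 encodes bS == 0, which is how bs_vec is derived. The
 * 16x8 pixel neighbourhood is transposed into p3..q3 row vectors,
 * filtered where |p0-q0| < alpha, |p1-p0| < beta and |q1-q0| < beta,
 * and transposed back by the store sequence. */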
void ff_h264_h_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                               int alpha_in, int beta_in, int8_t *tc)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_8x = img_width << 3;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
                      0x0101010100000000, 0x0303030302020202};

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
    bs_vec   = __lasx_xvandi_b(bs_vec, 1);

    if (__lasx_xbnz_v(bs_vec)) {
        uint8_t *src = data - 4;
        __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i is_bs_greater_than0;
        __m256i zero = __lasx_xvldi(0);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);

        {
            uint8_t *src_tmp = src + img_width_8x;
            __m256i row0, row1, row2, row3, row4, row5, row6, row7;
            __m256i row8, row9, row10, row11, row12, row13, row14, row15;

            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row0, row1, row2, row3);
            src += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row4, row5, row6, row7);
            src -= img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
                      img_width_2x, src_tmp, img_width_3x,
                      row8, row9, row10, row11);
            src_tmp += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
                      img_width_2x, src_tmp, img_width_3x,
                      row12, row13, row14, row15);
            src_tmp -= img_width_4x;

            LASX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6,
                                 row7, row8, row9, row10, row11,
                                 row12, row13, row14, row15,
                                 p3_org, p2_org, p1_org, p0_org,
                                 q0_org, q1_org, q2_org, q3_org);
        }

        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta  = __lasx_xvreplgr2vr_b(beta_in);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than       = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than       = is_less_than_beta & is_less_than;
        is_less_than       = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i neg_tc_h, tc_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
            __m256i p2_asub_p0, q2_asub_q0;

            neg_tc_h = __lasx_xvneg_b(tc_vec);
            neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
            tc_h     = __lasx_vext2xv_hu_bu(tc_vec);
            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);

            p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
            is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i p2_org_h, p1_h;

                p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
                                 neg_tc_h, tc_h, p1_h);
                p1_h = __lasx_xvpickev_b(p1_h, p1_h);
                p1_h = __lasx_xvpermi_d(p1_h, 0xd8);
                p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
            is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i q2_org_h, q1_h;

                q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
                                 neg_tc_h, tc_h, q1_h);
                q1_h = __lasx_xvpickev_b(q1_h, q1_h);
                q1_h = __lasx_xvpermi_d(q1_h, 0xd8);
                q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            {
                __m256i neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
            }

            {
                __m256i row0, row1, row2, row3, row4, row5, row6, row7;
                __m256i control = {0x0000000400000000, 0x0000000500000001,
                                   0x0000000600000002, 0x0000000700000003};

                DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org,
                          q2_org, 0x02, p2_org, q1_org, 0x02, p3_org,
                          q0_org, 0x02, p0_org, p1_org, p2_org, p3_org);
                DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
                          row0, row2);
                DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
                          row1, row3);
                DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
                DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
                DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
                          control, row7, control, row4, row5, row6, row7);
                __lasx_xvstelm_d(row4, src, 0, 0);
                __lasx_xvstelm_d(row4, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row4, src, 0, 2);
                __lasx_xvstelm_d(row4, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row5, src, 0, 0);
                __lasx_xvstelm_d(row5, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row5, src, 0, 2);
                __lasx_xvstelm_d(row5, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row6, src, 0, 0);
                __lasx_xvstelm_d(row6, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row6, src, 0, 2);
                __lasx_xvstelm_d(row6, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row7, src, 0, 0);
                __lasx_xvstelm_d(row7, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row7, src, 0, 2);
                __lasx_xvstelm_d(row7, src + img_width, 0, 3);
            }
        }
    }
}
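
/* ff_h264_v_lpf_luma_8_lasx below is the horizontal-edge variant: the
 * p/q rows are contiguous in memory, so they are loaded directly at
 * +/- img_width offsets and written back with full-width __lasx_xvst.
 * The upper vector halves are merged from the unfiltered data first
 * (__lasx_xvpermi_q, 0x30), so bytes beyond the 16-pixel edge are
 * rewritten unchanged. */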
 
void ff_h264_v_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                               int alpha_in, int beta_in, int8_t *tc)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_3x = img_width + img_width_2x;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
                      0x0101010100000000, 0x0303030302020202};

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
    bs_vec   = __lasx_xvandi_b(bs_vec, 1);

    if (__lasx_xbnz_v(bs_vec)) {
        __m256i p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i is_bs_greater_than0;
        __m256i zero = __lasx_xvldi(0);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta  = __lasx_xvreplgr2vr_b(beta_in);

        p1_org = __lasx_xvldx(data, -img_width_2x);
        p0_org = __lasx_xvldx(data, -img_width);
        q0_org = __lasx_xvld(data, 0);
        q1_org = __lasx_xvldx(data, img_width);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than       = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than       = is_less_than_beta & is_less_than;
        is_less_than       = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i neg_tc_h, tc_h, p2_asub_p0, q2_asub_q0;

            p2_org = __lasx_xvldx(data, -img_width_3x);
            q2_org = __lasx_xvldx(data, img_width_2x);

            neg_tc_h = __lasx_xvneg_b(tc_vec);
            neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
            tc_h     = __lasx_vext2xv_hu_bu(tc_vec);
            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);

            p2_asub_p0        = __lasx_xvabsd_bu(p2_org, p0_org);
            is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i p1_h, p2_org_h;

                p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
                                 neg_tc_h, tc_h, p1_h);
                p1_h = __lasx_xvpickev_b(p1_h, p1_h);
                p1_h = __lasx_xvpermi_d(p1_h, 0xd8);
                p1_h   = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
                p1_org = __lasx_xvpermi_q(p1_org, p1_h, 0x30);
                __lasx_xvst(p1_org, data - img_width_2x, 0);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
            is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i q1_h, q2_org_h;

                q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
                                 neg_tc_h, tc_h, q1_h);
                q1_h = __lasx_xvpickev_b(q1_h, q1_h);
                q1_h = __lasx_xvpermi_d(q1_h, 0xd8);
                q1_h = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
                q1_org = __lasx_xvpermi_q(q1_org, q1_h, 0x30);
                __lasx_xvst(q1_org, data + img_width, 0);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            {
                __m256i neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
                p0_org = __lasx_xvpermi_q(p0_org, p0_h, 0x30);
                q0_org = __lasx_xvpermi_q(q0_org, q0_h, 0x30);
                __lasx_xvst(p0_org, data - img_width, 0);
                __lasx_xvst(q0_org, data, 0);
            }
        }
    }
}
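
/* Chroma edges are 8 pixels wide and only p1/p0/q0/q1 participate.
 * ff_h264_h_lpf_chroma_8_lasx below transposes an 8x4 tile, updates
 * p0/q0 with the AVC_LPF_P0Q0 delta, and stores the two middle bytes
 * of each row back with 16-bit element stores. */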
 
void ff_h264_h_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                 int alpha_in, int beta_in, int8_t *tc)
{
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
    __m256i zero = __lasx_xvldi(0);
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    uint8_t *src = data - 2;

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
    bs_vec   = __lasx_xvandi_b(bs_vec, 1);
    bs_vec   = __lasx_xvpermi_q(zero, bs_vec, 0x30);

    if (__lasx_xbnz_v(bs_vec)) {
        __m256i p1_org, p0_org, q0_org, q1_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i is_bs_greater_than0;

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);

        {
            __m256i row0, row1, row2, row3, row4, row5, row6, row7;

            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row0, row1, row2, row3);
            src += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row4, row5, row6, row7);
            src -= img_width_4x;

            DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4,
                      row7, row5, p1_org, p0_org, q0_org, q1_org);
            row0 = __lasx_xvilvl_b(p0_org, p1_org);
            row1 = __lasx_xvilvl_b(q1_org, q0_org);
            row3 = __lasx_xvilvh_w(row1, row0);
            row2 = __lasx_xvilvl_w(row1, row0);
            p1_org = __lasx_xvpermi_d(row2, 0x00);
            p0_org = __lasx_xvpermi_d(row2, 0x55);
            q0_org = __lasx_xvpermi_d(row3, 0x00);
            q1_org = __lasx_xvpermi_d(row3, 0x55);
        }

        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta  = __lasx_xvreplgr2vr_b(beta_in);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than       = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than       = is_less_than_beta & is_less_than;
        is_less_than       = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;

            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            {
                __m256i tc_h, neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
            }

            p0_org = __lasx_xvilvl_b(q0_org, p0_org);
            src = data - 1;
            __lasx_xvstelm_h(p0_org, src, 0, 0);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 1);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 2);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 3);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 4);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 5);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 6);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 7);
        }
    }
}
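
/* ff_h264_v_lpf_chroma_8_lasx below handles the horizontal-edge chroma
 * case: p1..q1 are plain row loads around data and only the low 8 bytes
 * of the filtered p0/q0 rows are stored back. */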
 
void ff_h264_v_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                 int alpha_in, int beta_in, int8_t *tc)
{
    int img_width_2x = img_width << 1;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
    __m256i zero = __lasx_xvldi(0);

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
    bs_vec   = __lasx_xvandi_b(bs_vec, 1);
    bs_vec   = __lasx_xvpermi_q(zero, bs_vec, 0x30);

    if (__lasx_xbnz_v(bs_vec)) {
        __m256i p1_org, p0_org, q0_org, q1_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i is_bs_greater_than0;

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta  = __lasx_xvreplgr2vr_b(beta_in);

        p1_org = __lasx_xvldx(data, -img_width_2x);
        p0_org = __lasx_xvldx(data, -img_width);
        q0_org = __lasx_xvld(data, 0);
        q1_org = __lasx_xvldx(data, img_width);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than       = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than       = is_less_than_beta & is_less_than;
        is_less_than       = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;

            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            {
                __m256i neg_thresh_h, tc_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
                __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
                __lasx_xvstelm_d(q0_h, data, 0, 0);
            }
        }
    }
}
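
/* The intra ("strong", bS == 4) filter macros below implement the
 * H.264 strong-filter formulas. For the p side, per element:
 *     p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
 *     p1' = (p2 + p1 + p0 + q0 + 2) >> 2
 *     p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
 * The rounding terms come from __lasx_xvsrar_h (shift right with
 * rounding). The same macro serves the q side with mirrored arguments. */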
 
#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,          \
                                 q3_or_p3_org_in, p1_or_q1_org_in,          \
                                 p2_or_q2_org_in, q1_or_p1_org_in,          \
                                 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)  \
{                                                                           \
    __m256i threshold;                                                      \
    __m256i const2, const3 = __lasx_xvldi(0);                               \
                                                                            \
    const2 = __lasx_xvaddi_hu(const3, 2);                                   \
    const3 = __lasx_xvaddi_hu(const3, 3);                                   \
    threshold = __lasx_xvadd_h(p0_or_q0_org_in, q3_or_p3_org_in);           \
    threshold = __lasx_xvadd_h(p1_or_q1_org_in, threshold);                 \
                                                                            \
    p0_or_q0_out = __lasx_xvslli_h(threshold, 1);                           \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p2_or_q2_org_in);           \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, q1_or_p1_org_in);           \
    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const3);                   \
                                                                            \
    p1_or_q1_out = __lasx_xvadd_h(p2_or_q2_org_in, threshold);              \
    p1_or_q1_out = __lasx_xvsrar_h(p1_or_q1_out, const2);                   \
                                                                            \
    p2_or_q2_out = __lasx_xvmul_h(p2_or_q2_org_in, const3);                 \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);           \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);           \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, threshold);                 \
    p2_or_q2_out = __lasx_xvsrar_h(p2_or_q2_out, const3);                   \
}
#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,             \
                         p1_or_q1_org_in, p0_or_q0_out)                \
{                                                                      \
    __m256i const2 = __lasx_xvldi(0);                                  \
    const2 = __lasx_xvaddi_hu(const2, 2);                              \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, q1_or_p1_org_in);   \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);      \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);      \
    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const2);              \
}
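
/* AVC_LPF_P0_OR_Q0 above is the bS == 4 fallback when the strong-filter
 * condition fails: p0' = (2*p1 + p0 + q1 + 2) >> 2. The intra luma
 * functions below choose between the two per pixel: the 3-tap set runs
 * only where |p0-q0| < (alpha >> 2) + 2 (the less_alpha_shift2_add2
 * mask) and |p2-p0| < beta; elsewhere the fallback is selected through
 * negate_is_less_than_beta. */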
void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                     int alpha_in, int beta_in)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    uint8_t *src = data - 4;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
    __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
    __m256i zero = __lasx_xvldi(0);

    {
        __m256i row0, row1, row2, row3, row4, row5, row6, row7;
        __m256i row8, row9, row10, row11, row12, row13, row14, row15;

        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row0, row1, row2, row3);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row4, row5, row6, row7);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row8, row9, row10, row11);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row12, row13, row14, row15);

        LASX_TRANSPOSE16x8_B(row0, row1, row2, row3,
                             row4, row5, row6, row7,
                             row8, row9, row10, row11,
                             row12, row13, row14, row15,
                             p3_org, p2_org, p1_org, p0_org,
                             q0_org, q1_org, q2_org, q3_org);
    }

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta  = __lasx_xvreplgr2vr_b(beta_in);
    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than       = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than       = is_less_than_beta & is_less_than;
    is_less_than       = __lasx_xvpermi_q(zero, is_less_than, 0x30);

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);

        less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
        less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
                                                 less_alpha_shift2_add2);

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        p2_asub_p0               = __lasx_xvabsd_bu(p2_org, p0_org);
        is_less_than_beta        = __lasx_xvslt_bu(p2_asub_p0, beta);
        is_less_than_beta        = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta        = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* strong filter for the p side */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i p2_org_h, p3_org_h, p1_h, p2_h;

            p2_org_h   = __lasx_vext2xv_hu_bu(p2_org);
            p3_org_h   = __lasx_vext2xv_hu_bu(p3_org);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
                                     p2_org_h, q1_org_h, p0_h, p1_h, p2_h);

            p0_h = __lasx_xvpickev_b(p0_h, p0_h);
            p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
            DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
            p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
            p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
            p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);
        }

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        p0_h = __lasx_xvpickev_b(p0_h, p0_h);
        p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);

        q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
        is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* strong filter for the q side */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i q2_org_h, q3_org_h, q1_h, q2_h;

            q2_org_h   = __lasx_vext2xv_hu_bu(q2_org);
            q3_org_h   = __lasx_vext2xv_hu_bu(q3_org);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
                                     q2_org_h, p1_org_h, q0_h, q1_h, q2_h);

            q0_h = __lasx_xvpickev_b(q0_h, q0_h);
            q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
            DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
            q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
            q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
            q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);
        }

        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
        q0_h = __lasx_xvpickev_b(q0_h, q0_h);
        q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);

        {
            __m256i row0, row1, row2, row3, row4, row5, row6, row7;
            __m256i control = {0x0000000400000000, 0x0000000500000001,
                               0x0000000600000002, 0x0000000700000003};

            src = data - 4; /* rebase: the loads above advanced src */
            DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org, q2_org,
                      0x02, p2_org, q1_org, 0x02, p3_org, q0_org, 0x02,
                      p0_org, p1_org, p2_org, p3_org);
            DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
                      row0, row2);
            DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
                      row1, row3);
            DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
            DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
            DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
                      control, row7, control, row4, row5, row6, row7);

            __lasx_xvstelm_d(row4, src, 0, 0);
            __lasx_xvstelm_d(row4, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row4, src, 0, 2);
            __lasx_xvstelm_d(row4, src + img_width, 0, 3);
            src += img_width_2x;
            __lasx_xvstelm_d(row5, src, 0, 0);
            __lasx_xvstelm_d(row5, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row5, src, 0, 2);
            __lasx_xvstelm_d(row5, src + img_width, 0, 3);
            src += img_width_2x;
            __lasx_xvstelm_d(row6, src, 0, 0);
            __lasx_xvstelm_d(row6, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row6, src, 0, 2);
            __lasx_xvstelm_d(row6, src + img_width, 0, 3);
            src += img_width_2x;
            __lasx_xvstelm_d(row7, src, 0, 0);
            __lasx_xvstelm_d(row7, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row7, src, 0, 2);
            __lasx_xvstelm_d(row7, src + img_width, 0, 3);
        }
    }
}
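
/* ff_h264_v_lpf_luma_intra_8_lasx below: the horizontal-edge intra
 * filter. No transpose is needed; p3..q3 come from row loads around
 * data, and each result row is written back with a full-width
 * __lasx_xvst as soon as it is final. */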
 
void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                     int alpha_in, int beta_in)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    uint8_t *src = data - img_width_2x;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
    __m256i p1_org, p0_org, q0_org, q1_org;
    __m256i zero = __lasx_xvldi(0);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
              src, img_width_3x, p1_org, p0_org, q0_org, q1_org);
    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta  = __lasx_xvreplgr2vr_b(beta_in);
    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than       = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than       = is_less_than_beta & is_less_than;
    is_less_than       = __lasx_xvpermi_q(zero, is_less_than, 0x30);

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i p2_org = __lasx_xvldx(src, -img_width);
        __m256i q2_org = __lasx_xvldx(data, img_width_2x);
        __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);

        less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
        less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
                                                 less_alpha_shift2_add2);

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        p2_asub_p0               = __lasx_xvabsd_bu(p2_org, p0_org);
        is_less_than_beta        = __lasx_xvslt_bu(p2_asub_p0, beta);
        is_less_than_beta        = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta        = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* strong filter for the p side */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i p2_org_h, p3_org_h, p1_h, p2_h;
            __m256i p3_org = __lasx_xvldx(src, -img_width_2x);

            p2_org_h   = __lasx_vext2xv_hu_bu(p2_org);
            p3_org_h   = __lasx_vext2xv_hu_bu(p3_org);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
                                     p2_org_h, q1_org_h, p0_h, p1_h, p2_h);

            p0_h = __lasx_xvpickev_b(p0_h, p0_h);
            p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
            DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
            p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
            p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
            p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);

            __lasx_xvst(p1_org, src, 0);
            __lasx_xvst(p2_org, src - img_width, 0);
        }

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        p0_h = __lasx_xvpickev_b(p0_h, p0_h);
        p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);
        __lasx_xvst(p0_org, data - img_width, 0);

        q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
        is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* strong filter for the q side */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i q2_org_h, q3_org_h, q1_h, q2_h;
            __m256i q3_org = __lasx_xvldx(data, img_width_2x + img_width);

            q2_org_h   = __lasx_vext2xv_hu_bu(q2_org);
            q3_org_h   = __lasx_vext2xv_hu_bu(q3_org);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
                                     q2_org_h, p1_org_h, q0_h, q1_h, q2_h);

            q0_h = __lasx_xvpickev_b(q0_h, q0_h);
            q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
            DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
            q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
            q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
            q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);

            __lasx_xvst(q1_org, data + img_width, 0);
            __lasx_xvst(q2_org, data + img_width_2x, 0);
        }

        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
        q0_h = __lasx_xvpickev_b(q0_h, q0_h);
        q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);

        __lasx_xvst(q0_org, data, 0);
    }
}
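
/* The chroma intra filters below update only p0/q0 (the bS == 4 chroma
 * rule): p0' = (2*p1 + p0 + q1 + 2) >> 2, and symmetrically for q0,
 * wherever the alpha/beta tests pass. */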
 
void ff_h264_h_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                       int alpha_in, int beta_in)
{
    uint8_t *src = data - 2;
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    __m256i p1_org, p0_org, q0_org, q1_org;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;

    {
        __m256i row0, row1, row2, row3, row4, row5, row6, row7;

        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
                  img_width_3x, row0, row1, row2, row3);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
                  img_width_3x, row4, row5, row6, row7);
        src -= img_width_4x;

        DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4, row7, row5,
                  p1_org, p0_org, q0_org, q1_org);
        row0 = __lasx_xvilvl_b(p0_org, p1_org);
        row1 = __lasx_xvilvl_b(q1_org, q0_org);
        row3 = __lasx_xvilvh_w(row1, row0);
        row2 = __lasx_xvilvl_w(row1, row0);
        p1_org = __lasx_xvpermi_d(row2, 0x00);
        p0_org = __lasx_xvpermi_d(row2, 0x55);
        q0_org = __lasx_xvpermi_d(row3, 0x00);
        q1_org = __lasx_xvpermi_d(row3, 0x55);
    }

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta  = __lasx_xvreplgr2vr_b(beta_in);

    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than       = is_less_than_alpha & is_less_than_beta;
    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than       = is_less_than_beta & is_less_than;

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
        DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
        DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
    }
    p0_org = __lasx_xvilvl_b(q0_org, p0_org);
    src = data - 1;
    __lasx_xvstelm_h(p0_org, src, 0, 0);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 1);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 2);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 3);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 4);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 5);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 6);
    src += img_width;
    __lasx_xvstelm_h(p0_org, src, 0, 7);
}
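
/* ff_h264_v_lpf_chroma_intra_8_lasx below: same p0/q0-only update for a
 * horizontal chroma edge, with 8-byte stores of the two filtered rows. */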
 
void ff_h264_v_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                       int alpha_in, int beta_in)
{
    ptrdiff_t img_width_2x = img_width << 1;
    __m256i p1_org, p0_org, q0_org, q1_org;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta  = __lasx_xvreplgr2vr_b(beta_in);

    p1_org = __lasx_xvldx(data, -img_width_2x);
    p0_org = __lasx_xvldx(data, -img_width);
    q0_org = __lasx_xvld(data, 0);
    q1_org = __lasx_xvldx(data, img_width);

    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than       = is_less_than_alpha & is_less_than_beta;
    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than       = is_less_than_beta & is_less_than;

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
        DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
        DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
        p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
        q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
        __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
        __lasx_xvstelm_d(q0_h, data, 0, 0);
    }
}
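
/* Bi-weighted prediction. Per pixel the functions below compute
 *     out = clip255((src*wsrc + dst*wdst + (((offset+1)|1) << log2_denom))
 *                   >> (log2_denom + 1))
 * matching the scalar reference implementation. Both inputs are XORed
 * with 128 so the signed __lasx_xvdp2add_h_b pairwise multiply-accumulate
 * can be used; the 128*(wsrc+wdst) term folded into the offset vector
 * cancels that bias again. */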
 
void ff_biweight_h264_pixels16_8_lasx(uint8_t *dst, uint8_t *src,
                                      ptrdiff_t stride, int height,
                                      int log2_denom, int weight_dst,
                                      int weight_src, int offset_in)
{
    __m256i wgt;
    __m256i src0, src1, src2, src3;
    __m256i dst0, dst1, dst2, dst3;
    __m256i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m256i denom, offset;
    int stride_2x = stride << 1;
    int stride_4x = stride << 2;
    int stride_3x = stride_2x + stride;

    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in  += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
    src += stride_4x;
    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
              0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
    dst += stride_4x;
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
    dst -= stride_4x;
    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
              0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);

    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
              dst0, dst1, dst2, dst3);
    DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
              dst3, src3, vec0, vec2, vec4, vec6);
    DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
              dst3, src3, vec1, vec3, vec5, vec7);
    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
              offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);

    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    tmp2 = __lasx_xvsra_h(tmp2, denom);
    tmp3 = __lasx_xvsra_h(tmp3, denom);
    tmp4 = __lasx_xvsra_h(tmp4, denom);
    tmp5 = __lasx_xvsra_h(tmp5, denom);
    tmp6 = __lasx_xvsra_h(tmp6, denom);
    tmp7 = __lasx_xvsra_h(tmp7, denom);

    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
                                  tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
                                  tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
              dst0, dst1, dst2, dst3);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst0, dst, 0, 2);
    __lasx_xvstelm_d(dst0, dst, 8, 3);
    dst += stride;
    __lasx_xvstelm_d(dst1, dst, 0, 0);
    __lasx_xvstelm_d(dst1, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst1, dst, 0, 2);
    __lasx_xvstelm_d(dst1, dst, 8, 3);
    dst += stride;
    __lasx_xvstelm_d(dst2, dst, 0, 0);
    __lasx_xvstelm_d(dst2, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst2, dst, 0, 2);
    __lasx_xvstelm_d(dst2, dst, 8, 3);
    dst += stride;
    __lasx_xvstelm_d(dst3, dst, 0, 0);
    __lasx_xvstelm_d(dst3, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst3, dst, 0, 2);
    __lasx_xvstelm_d(dst3, dst, 8, 3);
    dst += stride;

    if (16 == height) {
        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
                  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
        src += stride_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
                  src, stride_3x, tmp4, tmp5, tmp6, tmp7);

        DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
                  tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
        DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
                  dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
        dst += stride_4x;
        DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
                  dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
        dst -= stride_4x;
        DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
                  tmp4, 0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);

        DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
                  dst0, dst1, dst2, dst3);
        DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
                  dst3, src3, vec0, vec2, vec4, vec6);
        DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
                  dst3, src3, vec1, vec3, vec5, vec7);
        DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
                  offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
                  offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);

        tmp0 = __lasx_xvsra_h(tmp0, denom);
        tmp1 = __lasx_xvsra_h(tmp1, denom);
        tmp2 = __lasx_xvsra_h(tmp2, denom);
        tmp3 = __lasx_xvsra_h(tmp3, denom);
        tmp4 = __lasx_xvsra_h(tmp4, denom);
        tmp5 = __lasx_xvsra_h(tmp5, denom);
        tmp6 = __lasx_xvsra_h(tmp6, denom);
        tmp7 = __lasx_xvsra_h(tmp7, denom);

        DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
                                      tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
                                      tmp4, tmp5, tmp6, tmp7);
        DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7,
                  tmp6, dst0, dst1, dst2, dst3);
        __lasx_xvstelm_d(dst0, dst, 0, 0);
        __lasx_xvstelm_d(dst0, dst, 8, 1);
        dst += stride;
        __lasx_xvstelm_d(dst0, dst, 0, 2);
        __lasx_xvstelm_d(dst0, dst, 8, 3);
        dst += stride;
        __lasx_xvstelm_d(dst1, dst, 0, 0);
        __lasx_xvstelm_d(dst1, dst, 8, 1);
        dst += stride;
        __lasx_xvstelm_d(dst1, dst, 0, 2);
        __lasx_xvstelm_d(dst1, dst, 8, 3);
        dst += stride;
        __lasx_xvstelm_d(dst2, dst, 0, 0);
        __lasx_xvstelm_d(dst2, dst, 8, 1);
        dst += stride;
        __lasx_xvstelm_d(dst2, dst, 0, 2);
        __lasx_xvstelm_d(dst2, dst, 8, 3);
        dst += stride;
        __lasx_xvstelm_d(dst3, dst, 0, 0);
        __lasx_xvstelm_d(dst3, dst, 8, 1);
        dst += stride;
        __lasx_xvstelm_d(dst3, dst, 0, 2);
        __lasx_xvstelm_d(dst3, dst, 8, 3);
    }
}
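
/* 8-pixel-wide helpers: avc_biwgt_8x4/8x8/8x16_lasx pack pairs of 8-byte
 * rows into vector halves with __lasx_xvilvl_d plus __lasx_xvpermi_q and
 * then run the same xor-128 / dot-product / shift / clip pipeline as the
 * 16-wide function above. */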
 
static void avc_biwgt_8x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int log2_denom, int weight_src,
                               int weight_dst, int offset_in)
{
    __m256i wgt, vec0, vec1;
    __m256i src0, dst0;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in  += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
    vec0 = __lasx_xvilvl_b(dst0, src0);
    vec1 = __lasx_xvilvh_b(dst0, src0);

    DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              tmp0, tmp1);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
    dst0 = __lasx_xvpickev_b(tmp1, tmp0);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
}
 
static void avc_biwgt_8x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int log2_denom, int weight_src,
                               int weight_dst, int offset_in)
{
    __m256i wgt, vec0, vec1, vec2, vec3;
    __m256i src0, src1, dst0, dst1;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;
    uint8_t* dst_tmp = dst;

    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in  += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    tmp0 = __lasx_xvld(dst_tmp, 0);
    DUP2_ARG2(__lasx_xvldx, dst_tmp, stride, dst_tmp, stride_2x, tmp1, tmp2);
    tmp3 = __lasx_xvldx(dst_tmp, stride_3x);
    dst_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);

    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, dst0, 128, dst1, 128,
              src0, src1, dst0, dst1);
    DUP2_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, vec0, vec2);
    DUP2_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, vec1, vec3);
    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    tmp2 = __lasx_xvsra_h(tmp2, denom);
    tmp3 = __lasx_xvsra_h(tmp3, denom);
    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
                                  tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, dst0, dst1);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_d(dst1, dst, 0, 0);
    __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
}
 
 1286     __m256i wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
 
 1287     __m256i 
src0, 
src1, src2, src3, dst0, dst1, dst2, dst3;
 
 1288     __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, 
offset;
 
 1289     ptrdiff_t stride_2x = 
stride << 1;
 
 1290     ptrdiff_t stride_4x = 
stride << 2;
 
 1291     ptrdiff_t stride_3x = stride_2x + 
stride;
 
 1292     uint8_t* dst_tmp = dst;
 
 1294     offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
 
 1295     offset_in  += ((weight_src + weight_dst) << 7);
 
 1298     tmp0   = __lasx_xvreplgr2vr_b(weight_src);
 
 1299     tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
 
 1300     wgt    = __lasx_xvilvh_b(tmp1, tmp0);
 
 1301     offset = __lasx_xvreplgr2vr_h(offset_in);
 
 1302     denom  = __lasx_xvreplgr2vr_h(log2_denom);
 
 1305               src, stride_3x, tmp0, tmp1, tmp2, tmp3);
 
 1307     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
 
 1308     src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
 
 1310               src, stride_3x, tmp0, tmp1, tmp2, tmp3);
 
 1312     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
 
 1313     src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
 
 1315               src, stride_3x, tmp0, tmp1, tmp2, tmp3);
 
 1317     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
 
 1318     src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
 
 1320               src, stride_3x, tmp0, tmp1, tmp2, tmp3);
 
 1321     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
 
 1322     src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
 
 1324     DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, 
stride, dst_tmp, stride_2x,
 
 1325               dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
 
 1326     dst_tmp += stride_4x;
 
 1327     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
 
 1328     dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
 
 1329     DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, 
stride, dst_tmp, stride_2x,
 
 1330               dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
 
 1331     dst_tmp += stride_4x;
 
 1332     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
 
 1333     dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
 
 1334     DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, 
stride, dst_tmp, stride_2x,
 
 1335               dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
 
 1336     dst_tmp += stride_4x;
 
 1337     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
 
 1338     dst2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
 
 1339     DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, 
stride, dst_tmp, stride_2x,
 
 1340               dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
 
 1341     DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
 
 1342     dst3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
 
 1346     DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
 
 1347               dst0, dst1, dst2, dst3);
 
 1349               dst3, src3, vec0, vec2, vec4, vec6);
 
 1351               dst3, src3, vec1, vec3, vec5, vec7);
 
 1353               offset, wgt, vec2, 
offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
 
 1355               offset, wgt, vec6, 
offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
 
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    tmp2 = __lasx_xvsra_h(tmp2, denom);
    tmp3 = __lasx_xvsra_h(tmp3, denom);
    tmp4 = __lasx_xvsra_h(tmp4, denom);
    tmp5 = __lasx_xvsra_h(tmp5, denom);
    tmp6 = __lasx_xvsra_h(tmp6, denom);
    tmp7 = __lasx_xvsra_h(tmp7, denom);
    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
              tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
              tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
              dst0, dst1, dst2, dst3);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_d(dst1, dst, 0, 0);
    __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_d(dst2, dst, 0, 0);
    __lasx_xvstelm_d(dst2, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst2, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst2, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_d(dst3, dst, 0, 0);
    __lasx_xvstelm_d(dst3, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst3, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst3, dst + stride_3x, 0, 3);
}

                                     int log2_denom, int weight_dst,
                                     int weight_src, int offset)

    } else if (8 == height) {
    __m256i tmp0, tmp1, denom, offset;

    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in  += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);
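
    /* Same biweight arithmetic as the wider paths above; only the
     * gathering of the two 4-byte rows into one vector differs. */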
 
    DUP2_ARG2(__lasx_xvldrepl_w, src, 0, src + stride, 0, tmp0, tmp1);
    src0 = __lasx_xvilvl_w(tmp1, tmp0);
    DUP2_ARG2(__lasx_xvldrepl_w, dst, 0, dst + stride, 0, tmp0, tmp1);
    dst0 = __lasx_xvilvl_w(tmp1, tmp0);
    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
    vec0 = __lasx_xvilvl_b(dst0, src0);
    tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp0 = __lasx_xvclip255_h(tmp0);
    tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
    __lasx_xvstelm_w(tmp0, dst, 0, 0);
    __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
}
    __m256i wgt, vec0, src0, dst0;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in  += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
    src0 = __lasx_xvilvl_w(tmp1, tmp0);
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
    dst0 = __lasx_xvilvl_w(tmp1, tmp0);
    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
    vec0 = __lasx_xvilvl_b(dst0, src0);
    dst0 = __lasx_xvilvh_b(dst0, src0);
    vec0 = __lasx_xvpermi_q(vec0, dst0, 0x02);
    tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp0 = __lasx_xvclip255_h(tmp0);
    tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
    __lasx_xvstelm_w(tmp0, dst, 0, 0);
    __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
    __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 4);
    __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 5);
}
    __m256i wgt, vec0, vec1;
    __m256i src0, dst0;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_in   = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in  += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0   = __lasx_xvreplgr2vr_b(weight_src);
    tmp1   = __lasx_xvreplgr2vr_b(weight_dst);
    wgt    = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
    dst += stride_4x;
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
    dst -= stride_4x;
    DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
    vec0 = __lasx_xvilvl_b(dst0, src0);
    vec1 = __lasx_xvilvh_b(dst0, src0);
    DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              tmp0, tmp1);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
    tmp0 = __lasx_xvpickev_b(tmp1, tmp0);
    __lasx_xvstelm_w(tmp0, dst, 0, 0);
    __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
    __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 2);
    __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_w(tmp0, dst, 0, 4);
    __lasx_xvstelm_w(tmp0, dst + stride, 0, 5);
    __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 6);
    __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 7);
}

                                     int log2_denom, int weight_dst,
                                     int weight_src, int offset)

    } else if (4 == height) {

                                    int height, int log2_denom,
                                    int weight_src, int offset_in)
{
    uint32_t offset_val;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;
    __m256i zero = __lasx_xvldi(0);
    __m256i src0, src1, src2, src3;
    __m256i src0_l, src1_l, src2_l, src3_l, src0_h, src1_h, src2_h, src3_h;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m256i wgt, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt    = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);
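
    /* Scalar sketch of the weight op implemented below (for
     * log2_denom > 0):
     *   out = clip_uint8(((src * weight_src + (1 << (log2_denom - 1)))
     *                     >> log2_denom) + offset_in)
     * offset_val is pre-shifted left by log2_denom so the single rounding
     * narrow __lasx_xvssrlrn_bu_h applies offset and shift together; the
     * __lasx_xvmaxi_h clamp keeps negative sums out of the logical
     * shift. */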
 
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
    src -= stride_4x;
    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
              0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
              zero, src3, src0_l, src1_l, src2_l, src3_l);
    DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
              zero, src3, src0_h, src1_h, src2_h, src3_h);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src1_l = __lasx_xvmul_h(wgt, src1_l);
    src1_h = __lasx_xvmul_h(wgt, src1_h);
    src2_l = __lasx_xvmul_h(wgt, src2_l);
    src2_h = __lasx_xvmul_h(wgt, src2_h);
    src3_l = __lasx_xvmul_h(wgt, src3_l);
    src3_h = __lasx_xvmul_h(wgt, src3_h);
    DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
              src1_h, offset, src0_l, src0_h, src1_l, src1_h);
    DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
              src3_h, offset, src2_l, src2_h, src3_l, src3_h);
    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src1_l = __lasx_xvmaxi_h(src1_l, 0);
    src1_h = __lasx_xvmaxi_h(src1_h, 0);
    src2_l = __lasx_xvmaxi_h(src2_l, 0);
    src2_h = __lasx_xvmaxi_h(src2_h, 0);
    src3_l = __lasx_xvmaxi_h(src3_l, 0);
    src3_h = __lasx_xvmaxi_h(src3_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
    src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
    src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
    src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
    src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
    src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
    src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
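
    /* Each 256-bit register covers two 16-pixel rows, one per 128-bit
     * lane, and __lasx_xvssrlrn_bu_h leaves the narrowed bytes in the low
     * half of each lane, hence the element indices 0 and 2 in the
     * in-place stores below. */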
 
    __lasx_xvstelm_d(src0_l, src, 0, 0);
    __lasx_xvstelm_d(src0_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src0_l, src, 0, 2);
    __lasx_xvstelm_d(src0_h, src, 8, 2);
    src += stride;
    __lasx_xvstelm_d(src1_l, src, 0, 0);
    __lasx_xvstelm_d(src1_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src1_l, src, 0, 2);
    __lasx_xvstelm_d(src1_h, src, 8, 2);
    src += stride;
    __lasx_xvstelm_d(src2_l, src, 0, 0);
    __lasx_xvstelm_d(src2_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src2_l, src, 0, 2);
    __lasx_xvstelm_d(src2_h, src, 8, 2);
    src += stride;
    __lasx_xvstelm_d(src3_l, src, 0, 0);
    __lasx_xvstelm_d(src3_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src3_l, src, 0, 2);
    __lasx_xvstelm_d(src3_h, src, 8, 2);
    if (16 == height) {
        src += stride;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
                  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
        src += stride_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
                  src, stride_3x, tmp4, tmp5, tmp6, tmp7);
        src -= stride_4x;
        DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
                  tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
        DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
                  zero, src3, src0_l, src1_l, src2_l, src3_l);
        DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
                  zero, src3, src0_h, src1_h, src2_h, src3_h);
        src0_l = __lasx_xvmul_h(wgt, src0_l);
        src0_h = __lasx_xvmul_h(wgt, src0_h);
        src1_l = __lasx_xvmul_h(wgt, src1_l);
        src1_h = __lasx_xvmul_h(wgt, src1_h);
        src2_l = __lasx_xvmul_h(wgt, src2_l);
        src2_h = __lasx_xvmul_h(wgt, src2_h);
        src3_l = __lasx_xvmul_h(wgt, src3_l);
        src3_h = __lasx_xvmul_h(wgt, src3_h);
        DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l,
                  offset, src1_h, offset, src0_l, src0_h, src1_l, src1_h);
        DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l,
                  offset, src3_h, offset, src2_l, src2_h, src3_l, src3_h);
        src0_l = __lasx_xvmaxi_h(src0_l, 0);
        src0_h = __lasx_xvmaxi_h(src0_h, 0);
        src1_l = __lasx_xvmaxi_h(src1_l, 0);
        src1_h = __lasx_xvmaxi_h(src1_h, 0);
        src2_l = __lasx_xvmaxi_h(src2_l, 0);
        src2_h = __lasx_xvmaxi_h(src2_h, 0);
        src3_l = __lasx_xvmaxi_h(src3_l, 0);
        src3_h = __lasx_xvmaxi_h(src3_h, 0);
        src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
        src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
        src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
        src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
        src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
        src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
        src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
        src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
        __lasx_xvstelm_d(src0_l, src, 0, 0);
        __lasx_xvstelm_d(src0_h, src, 8, 0);
        src += stride;
        __lasx_xvstelm_d(src0_l, src, 0, 2);
        __lasx_xvstelm_d(src0_h, src, 8, 2);
        src += stride;
        __lasx_xvstelm_d(src1_l, src, 0, 0);
        __lasx_xvstelm_d(src1_h, src, 8, 0);
        src += stride;
        __lasx_xvstelm_d(src1_l, src, 0, 2);
        __lasx_xvstelm_d(src1_h, src, 8, 2);
        src += stride;
        __lasx_xvstelm_d(src2_l, src, 0, 0);
        __lasx_xvstelm_d(src2_h, src, 8, 0);
        src += stride;
        __lasx_xvstelm_d(src2_l, src, 0, 2);
        __lasx_xvstelm_d(src2_h, src, 8, 2);
        src += stride;
        __lasx_xvstelm_d(src3_l, src, 0, 0);
        __lasx_xvstelm_d(src3_h, src, 8, 0);
        src += stride;
        __lasx_xvstelm_d(src3_l, src, 0, 2);
        __lasx_xvstelm_d(src3_h, src, 8, 2);
    }
}
    uint32_t offset_val;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;
    __m256i wgt, zero = __lasx_xvldi(0);
    __m256i src0, src0_h, src0_l;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt    = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    src0_l = __lasx_xvilvl_b(zero, src0);
    src0_h = __lasx_xvilvh_b(zero, src0);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src0_l = __lasx_xvsadd_h(src0_l, offset);
    src0_h = __lasx_xvsadd_h(src0_h, offset);
    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);

    src0 = __lasx_xvpickev_d(src0_h, src0_l);
    __lasx_xvstelm_d(src0, src, 0, 0);
    __lasx_xvstelm_d(src0, src + stride, 0, 1);
    __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
}
    __m256i src0, src1, src0_h, src0_l, src1_h, src1_l, zero = __lasx_xvldi(0);
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
    uint32_t offset_val;
    uint8_t* src_tmp = src;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt    = __lasx_xvreplgr2vr_h(src_weight);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);
    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP2_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, src0_l, src1_l);
    DUP2_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, src0_h, src1_h);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src1_l = __lasx_xvmul_h(wgt, src1_l);
    src1_h = __lasx_xvmul_h(wgt, src1_h);
    DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
              src1_h, offset, src0_l, src0_h, src1_l, src1_h);
    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src1_l = __lasx_xvmaxi_h(src1_l, 0);
    src1_h = __lasx_xvmaxi_h(src1_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
    src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
    src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
    src0 = __lasx_xvpickev_d(src0_h, src0_l);
    src1 = __lasx_xvpickev_d(src1_h, src1_l);
    __lasx_xvstelm_d(src0, src, 0, 0);
    __lasx_xvstelm_d(src0, src + stride, 0, 1);
    __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
    src += stride_4x;
    __lasx_xvstelm_d(src1, src, 0, 0);
    __lasx_xvstelm_d(src1, src + stride, 0, 1);
    __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
}
    __m256i src0, src1, src2, src3;
    __m256i src0_h, src0_l, src1_h, src1_l, src2_h, src2_l, src3_h, src3_l;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
    __m256i zero = __lasx_xvldi(0);
    uint32_t offset_val;
    uint8_t* src_tmp = src;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt    = __lasx_xvreplgr2vr_h(src_weight);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);
    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);

    DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2, zero, src3,
              src0_l, src1_l, src2_l, src3_l);
    DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
              src0_h, src1_h, src2_h, src3_h);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src1_l = __lasx_xvmul_h(wgt, src1_l);
    src1_h = __lasx_xvmul_h(wgt, src1_h);
    src2_l = __lasx_xvmul_h(wgt, src2_l);
    src2_h = __lasx_xvmul_h(wgt, src2_h);
    src3_l = __lasx_xvmul_h(wgt, src3_l);
    src3_h = __lasx_xvmul_h(wgt, src3_h);

    DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
              src1_h, offset, src0_l, src0_h, src1_l, src1_h);
    DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
              src3_h, offset, src2_l, src2_h, src3_l, src3_h);

    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src1_l = __lasx_xvmaxi_h(src1_l, 0);
    src1_h = __lasx_xvmaxi_h(src1_h, 0);
    src2_l = __lasx_xvmaxi_h(src2_l, 0);
    src2_h = __lasx_xvmaxi_h(src2_h, 0);
    src3_l = __lasx_xvmaxi_h(src3_l, 0);
    src3_h = __lasx_xvmaxi_h(src3_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
    src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
    src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
    src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
    src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
    src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
    src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
    DUP4_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src2_h, src2_l,
              src3_h, src3_l, src0, src1, src2, src3);
    __lasx_xvstelm_d(src0, src, 0, 0);
    __lasx_xvstelm_d(src0, src + stride, 0, 1);
    __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
    src += stride_4x;
    __lasx_xvstelm_d(src1, src, 0, 0);
    __lasx_xvstelm_d(src1, src + stride, 0, 1);
    __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
    src += stride_4x;
    __lasx_xvstelm_d(src2, src, 0, 0);
    __lasx_xvstelm_d(src2, src + stride, 0, 1);
    __lasx_xvstelm_d(src2, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src2, src + stride_3x, 0, 3);
    src += stride_4x;
    __lasx_xvstelm_d(src3, src, 0, 0);
    __lasx_xvstelm_d(src3, src + stride, 0, 1);
    __lasx_xvstelm_d(src3, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src3, src + stride_3x, 0, 3);
}
 1904                                    int height, 
int log2_denom,
 
 1905                                    int weight_src, 
int offset)
 
 1909     } 
else if (8 == 
height) {
 
    uint32_t offset_val;
    __m256i wgt, zero = __lasx_xvldi(0);
    __m256i src0, tmp0, tmp1, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt    = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);
    DUP2_ARG2(__lasx_xvldrepl_w, src, 0, src + stride, 0, tmp0, tmp1);
    src0 = __lasx_xvilvl_w(tmp1, tmp0);
    src0 = __lasx_xvilvl_b(zero, src0);
    src0 = __lasx_xvmul_h(wgt, src0);
    src0 = __lasx_xvsadd_h(src0, offset);
    src0 = __lasx_xvmaxi_h(src0, 0);
    src0 = __lasx_xvssrlrn_bu_h(src0, denom);
    __lasx_xvstelm_w(src0, src, 0, 0);
    __lasx_xvstelm_w(src0, src + stride, 0, 1);
}
    __m256i wgt;
    __m256i src0, tmp0, tmp1, tmp2, tmp3, denom, offset;
    uint32_t offset_val;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt    = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
    src0 = __lasx_xvilvl_w(tmp1, tmp0);
    src0 = __lasx_vext2xv_hu_bu(src0);
    src0 = __lasx_xvmul_h(wgt, src0);
    src0 = __lasx_xvsadd_h(src0, offset);
    src0 = __lasx_xvmaxi_h(src0, 0);
    src0 = __lasx_xvssrlrn_bu_h(src0, denom);
    __lasx_xvstelm_w(src0, src, 0, 0);
    __lasx_xvstelm_w(src0, src + stride, 0, 1);
    __lasx_xvstelm_w(src0, src + stride_2x, 0, 4);
    __lasx_xvstelm_w(src0, src + stride_3x, 0, 5);
}
    __m256i src0, src0_h, src0_l;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
    __m256i wgt, zero = __lasx_xvldi(0);
    uint32_t offset_val;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt    = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom  = __lasx_xvreplgr2vr_h(log2_denom);
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
    src -= stride_4x;
    DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7,
              tmp5, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    src0_l = __lasx_xvilvl_b(zero, src0);
    src0_h = __lasx_xvilvh_b(zero, src0);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src0_l = __lasx_xvsadd_h(src0_l, offset);
    src0_h = __lasx_xvsadd_h(src0_h, offset);
    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
    __lasx_xvstelm_w(src0_l, src, 0, 0);
    __lasx_xvstelm_w(src0_l, src + stride, 0, 1);
    __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 0);
    __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 1);
    src += stride_4x;
    __lasx_xvstelm_w(src0_l, src, 0, 4);
    __lasx_xvstelm_w(src0_l, src + stride, 0, 5);
    __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 4);
    __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 5);
}

                                   int height, int log2_denom,
                                   int weight_src, int offset)

    } else if (4 == height) {
    __m256i src0, dst0, dst1, dst2, dst3, zero;
    __m256i tmp0, tmp1;
    uint8_t* _dst1 = _dst + stride;
    uint8_t* _dst2 = _dst1 + stride;
    uint8_t* _dst3 = _dst2 + stride;

    src0 = __lasx_xvld(_src, 0);
    dst0 = __lasx_xvldrepl_w(_dst, 0);
    dst1 = __lasx_xvldrepl_w(_dst1, 0);
    dst2 = __lasx_xvldrepl_w(_dst2, 0);
    dst3 = __lasx_xvldrepl_w(_dst3, 0);
    tmp0 = __lasx_xvilvl_w(dst1, dst0);
    tmp1 = __lasx_xvilvl_w(dst3, dst2);
    dst0 = __lasx_xvilvl_d(tmp1, tmp0);
    tmp0 = __lasx_vext2xv_hu_bu(dst0);
    zero = __lasx_xvldi(0);
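
    /* Residual add for the 4x4 block: the int16_t coefficients in _src
     * are added to the widened destination pixels and packed back to
     * bytes. No clipping is applied here; the sums are assumed to stay
     * within 8-bit range, as in the C add-pixels reference. The residual
     * buffer is cleared afterwards. */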
 
    tmp1 = __lasx_xvadd_h(src0, tmp0);
    dst0 = __lasx_xvpickev_b(tmp1, tmp1);
    __lasx_xvstelm_w(dst0, _dst, 0, 0);
    __lasx_xvstelm_w(dst0, _dst1, 0, 1);
    __lasx_xvstelm_w(dst0, _dst2, 0, 4);
    __lasx_xvstelm_w(dst0, _dst3, 0, 5);
    __lasx_xvst(zero, _src, 0);
}

    __m256i src0, src1, src2, src3;
    __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m256i tmp0, tmp1, tmp2, tmp3;
    __m256i zero = __lasx_xvldi(0);
    uint8_t *_dst1 = _dst + stride;
    uint8_t *_dst2 = _dst1 + stride;
    uint8_t *_dst3 = _dst2 + stride;
    uint8_t *_dst4 = _dst3 + stride;
    uint8_t *_dst5 = _dst4 + stride;
    uint8_t *_dst6 = _dst5 + stride;
    uint8_t *_dst7 = _dst6 + stride;

    src0 = __lasx_xvld(_src, 0);
    src1 = __lasx_xvld(_src, 32);
    src2 = __lasx_xvld(_src, 64);
    src3 = __lasx_xvld(_src, 96);
    dst0 = __lasx_xvldrepl_d(_dst, 0);
    dst1 = __lasx_xvldrepl_d(_dst1, 0);
    dst2 = __lasx_xvldrepl_d(_dst2, 0);
    dst3 = __lasx_xvldrepl_d(_dst3, 0);
    dst4 = __lasx_xvldrepl_d(_dst4, 0);
    dst5 = __lasx_xvldrepl_d(_dst5, 0);
    dst6 = __lasx_xvldrepl_d(_dst6, 0);
    dst7 = __lasx_xvldrepl_d(_dst7, 0);
    tmp0 = __lasx_xvilvl_d(dst1, dst0);
    tmp1 = __lasx_xvilvl_d(dst3, dst2);
    tmp2 = __lasx_xvilvl_d(dst5, dst4);
    tmp3 = __lasx_xvilvl_d(dst7, dst6);
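
    /* __lasx_vext2xv_hu_bu widens the low 128 bits across both 128-bit
     * lanes, so each widened register holds two rows, one per lane; after
     * __lasx_xvpickev_b the rows land in the order 0, 2, 1, 3, which the
     * store element indices at the end undo. */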
 
    dst0 = __lasx_vext2xv_hu_bu(tmp0);
    dst1 = __lasx_vext2xv_hu_bu(tmp1);
    dst2 = __lasx_vext2xv_hu_bu(tmp2);
    dst3 = __lasx_vext2xv_hu_bu(tmp3);
    tmp0 = __lasx_xvadd_h(src0, dst0);
    tmp1 = __lasx_xvadd_h(src1, dst1);
    tmp2 = __lasx_xvadd_h(src2, dst2);
    tmp3 = __lasx_xvadd_h(src3, dst3);
    dst1 = __lasx_xvpickev_b(tmp1, tmp0);
    dst2 = __lasx_xvpickev_b(tmp3, tmp2);
    __lasx_xvst(zero, _src, 0);
    __lasx_xvst(zero, _src, 32);
    __lasx_xvst(zero, _src, 64);
    __lasx_xvst(zero, _src, 96);
    __lasx_xvstelm_d(dst1, _dst, 0, 0);
    __lasx_xvstelm_d(dst1, _dst1, 0, 2);
    __lasx_xvstelm_d(dst1, _dst2, 0, 1);
    __lasx_xvstelm_d(dst1, _dst3, 0, 3);
    __lasx_xvstelm_d(dst2, _dst4, 0, 0);
    __lasx_xvstelm_d(dst2, _dst5, 0, 2);
    __lasx_xvstelm_d(dst2, _dst6, 0, 1);
    __lasx_xvstelm_d(dst2, _dst7, 0, 3);
}