hevc_mc_biw_msa.c
/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

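/* VSHF.B index tables for the horizontal 8-tap filters: the first row
 * gathers overlapping byte pairs from a single source vector (8-wide
 * cases); the second row gathers from two concatenated source vectors
 * (4-wide cases), where indices >= 16 select bytes from the second one. */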
static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};

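/* Core of the bi-directional weighted prediction. Per 16-bit sample:
 *     out = clip_0_255((vec * weight0 + in * weight1 + offset) >> (rnd + 1))
 * The 'in*' and 'vec*' predictions are interleaved halfword-wise and
 * combined with a signed dot product (DPADD) accumulating onto the
 * broadcast offset; 'wgt' packs weight0 in the low halfword and weight1
 * in the high halfword of every 32-bit lane. */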
#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,          \
                           out0, out1)                                      \
{                                                                           \
    v4i32 out0_r, out1_r, out0_l, out1_l;                                   \
                                                                            \
    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);                       \
    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);                       \
                                                                            \
    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);          \
    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);          \
    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);          \
    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);          \
                                                                            \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                        \
    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);                \
    CLIP_SH2_0_255(out0, out1);                                             \
}

#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3,      \
                           wgt, rnd, offset, out0, out1, out2, out3)        \
{                                                                           \
    HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1); \
    HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset, out2, out3); \
}

#define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd,         \
                                    offset, out0, out1)                     \
{                                                                           \
    v4i32 out0_r, out1_r, out0_l, out1_l;                                   \
                                                                            \
    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);                       \
    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);                       \
    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);          \
    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);          \
    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);          \
    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);          \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                        \
    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);                \
    CLIP_SH2_0_255(out0, out1);                                             \
}

#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,   \
                                    vec3, wgt, rnd, offset, out0, out1,     \
                                    out2, out3)                             \
{                                                                           \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, offset,     \
                                out0, out1);                                \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, wgt, rnd, offset,     \
                                out2, out3);                                \
}

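/* Bi-weighted "copy" (no interpolation) variants: source pixels are
 * zero-extended to 16 bits and shifted left by 6 to match the 14-bit
 * intermediate precision of the src1_ptr prediction before blending. */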
static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
                                   int32_t src_stride,
                                   int16_t *src1_ptr,
                                   int32_t src2_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   int32_t height,
                                   int32_t weight0,
                                   int32_t weight1,
                                   int32_t offset0,
                                   int32_t offset1,
                                   int32_t rnd_val)
{
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    uint64_t tpd0, tpd1, tpd2, tpd3;
    int32_t offset, weight;
    v16u8 out0, out1;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 };
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst0, dst1, dst2, dst3, weight_vec;
    v4i32 dst0_r, dst0_l, offset_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = (v8i16) __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    if (2 == height) {
        LW2(src0_ptr, src_stride, tp0, tp1);
        INSERT_W2_SB(tp0, tp1, src0);
        LD2(src1_ptr, src2_stride, tpd0, tpd1);
        INSERT_D2_SH(tpd0, tpd1, in0);

        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
        dst0 <<= 6;

        ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
        dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, weight_vec);
        dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec);
        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
        dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        CLIP_SH_0_255(dst0);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
        ST_W2(out0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
        LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
        INSERT_D2_SH(tpd0, tpd1, in0);
        INSERT_D2_SH(tpd2, tpd3, in1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst0, dst1, in0, in1, weight_vec, rnd_vec,
                                    offset_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in0);
            INSERT_D2_SH(tpd2, tpd3, in1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in2);
            INSERT_D2_SH(tpd2, tpd3, in3);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
                                        in3, weight_vec, rnd_vec, offset_vec,
                                        dst0, dst1, dst2, dst3);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
            dst += (8 * dst_stride);
        }
    }
}

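/* 6-wide variant: four rows per iteration; each 6-byte row is stored as
 * one 4-byte word (ST_W2) followed by one 2-byte halfword (ST_H2). */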
static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr,
                                   int32_t src_stride,
                                   int16_t *src1_ptr,
                                   int32_t src2_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   int32_t height,
                                   int32_t weight0,
                                   int32_t weight1,
                                   int32_t offset0,
                                   int32_t offset1,
                                   int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 };
    v8i16 in0, in1, in2, in3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3,
                                    in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec,
                                    dst0, dst1, dst2, dst3);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_W2(out0, 0, 2, dst, dst_stride);
        ST_H2(out0, 2, 6, dst + 4, dst_stride);
        ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
    }
}

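/* 8-wide variant: heights 2 and 6 are handled separately; all other
 * heights (multiples of 4) go through the unrolled 4-row loop. */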
static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr,
                                   int32_t src_stride,
                                   int16_t *src1_ptr,
                                   int32_t src2_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   int32_t height,
                                   int32_t weight0,
                                   int32_t weight1,
                                   int32_t offset0,
                                   int32_t offset1,
                                   int32_t rnd_val)
{
    uint64_t tp0, tp1, tp2, tp3;
    int32_t offset, weight;
    v16u8 out0, out1, out2;
    v16i8 zero = { 0 };
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    if (2 == height) {
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src0);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           dst0, dst1);

        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_D2(out0, 0, 1, dst, dst_stride);
    } else if (6 == height) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += 4 * src_stride;
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src2);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                    offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
    } else if (0 == height % 4) {
        uint32_t loop_cnt;

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += (4 * src_stride);
            INSERT_D2_SB(tp0, tp1, src0);
            INSERT_D2_SB(tp2, tp3, src1);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
            src1_ptr += (4 * src2_stride);

            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
                                        in3, weight_vec, rnd_vec, offset_vec,
                                        dst0, dst1, dst2, dst3);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

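/* 12-wide variant: the left 8 columns and the right 4 columns of four
 * rows are weighted separately; the iteration count is fixed at (16 >> 2)
 * because 12-wide partitions are 16 rows tall. */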
static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (16 >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);

        dst4 <<= 6;
        dst5 <<= 6;
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                    offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

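/* 16-wide variant: four full vectors per iteration, widened into low/high
 * byte halves with ILVR/ILVL before weighting. */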
static void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
                   tmp2, tmp3);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
                   tmp6, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp1, tmp4, tmp5, in0, in1, in4, in5,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp1,
                                    tmp4, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp3, tmp6, tmp7, in2, in3, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp3,
                                    tmp6, tmp7);
        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

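/* 24-wide variant: handled as a 16-column and an 8-column part over four
 * rows per iteration; the fixed 8 iterations times 4 rows cover the
 * 32-row height used for 24-wide blocks. */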
static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
        LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
        src1_ptr += (4 * src2_stride);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
        ILVRL_B2_SH(zero, src4, dst6, dst7);
        ILVRL_B2_SH(zero, src5, dst8, dst9);
        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in4, in1, in5,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst4, dst5, dst6, dst7, in8, in9, in2, in6,
                                    weight_vec, rnd_vec, offset_vec, dst4, dst5,
                                    dst6, dst7);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst8, dst9, dst10, dst11, in3, in7, in10,
                                    in11, weight_vec, rnd_vec, offset_vec,
                                    dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 16, src2, src3);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, tmp0, tmp4);
        ILVRL_B2_SH(zero, src1, tmp1, tmp5);
        ILVRL_B2_SH(zero, src2, tmp2, tmp6);
        ILVRL_B2_SH(zero, src3, tmp3, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
                                    tmp1, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
                                    tmp3, tmp7);
        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
        ST_UB2(out0, out1, dst, 16);
        dst += dst_stride;
        ST_UB2(out2, out3, dst, 16);
        dst += dst_stride;
    }
}

static void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, in0, in1, in2, in3, in4, in5;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB3(src0_ptr, 16, src0, src1, src2);
        src0_ptr += src_stride;
        LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, dst0, dst1,
                                    dst2, dst3);
        HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
                                    offset_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
    }
}

static void hevc_biwgt_copy_64w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16u8 out0, out1, out2, out3;
    v16i8 zero = { 0 };
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
        src0_ptr += src_stride;
        LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
                   tmp2, tmp3);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
                   tmp6, tmp7);
        SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
                                    weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
                                    tmp1, tmp5);
        HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
                                    weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
                                    tmp3, tmp7);
        PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
        PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, 16);
        dst += dst_stride;
    }
}

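/* Horizontal 8-tap bi-weighted filters. Source bytes are XORed with 128
 * so they fit signed 8-bit dot products; since the 8-tap coefficients sum
 * to 64, this biases each filtered sample by -128 * 64, which is
 * compensated by adding (128 * weight1) << 6 to the offset. src0_ptr is
 * rewound by 3 columns to cover the filter's left taps. */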
static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, out0, out1;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);

    src0_ptr -= 3;
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v8i16 filt0, filt1, filt2, filt3, out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, in0, in1, in2, in3, filter_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;

    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset = (offset0 + offset1) << rnd_val;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = LD_SB(&ff_hevc_mask_arr[16]);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec, out0, out1, out2,
                           out3);
        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

        LD_SB4(src0_ptr + 8, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr + 8, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec,
                           offset_vec, out0, out1);
        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}

static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t dst_val0;
    int32_t offset, weight, constant;
    v16i8 src0, src1;
    v8i16 in0, in1, in2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2;
    v4i32 dst2_r, dst2_l;
    v8i16 filter_vec, out0, out1, out2;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr = src0_ptr - 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    LD_SB2(src0_ptr, 16, src0, src1);
    src0_ptr += src_stride;
    LD_SH2(src1_ptr, 8, in0, in1);
    in2 = LD_SH(src1_ptr + 16);
    src1_ptr += src2_stride;
    XORI_B2_128_SB(src0, src1);

    for (loop_cnt = 31; loop_cnt--;) {
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
                                 (v8i16) weight_vec);
        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
                                 (v8i16) weight_vec);
        SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
        out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
        CLIP_SH_0_255(out2);

        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        in2 = LD_SH(src1_ptr + 16);
        src1_ptr += src2_stride;
        XORI_B2_128_SB(src0, src1);
        PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
        dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
        ST_SH(out0, dst);
        SD(dst_val0, dst + 16);
        dst += dst_stride;
    }

    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, offset_vec,
                       out0, out1);
    ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
    dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec);
    dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec);
    SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
    out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
    CLIP_SH_0_255(out2);
    PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
    dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
    ST_SH(out0, dst);
    SD(dst_val0, dst + 16);
    dst += dst_stride;
}

static void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;

        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, 16);
        dst += dst_stride;
    }
}

static void hevc_hz_biwgt_8t_48w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        XORI_B3_128_SB(src0, src1, src2);
        LD_SB2(src0_ptr + 32, 8, src3, src4);
        src0_ptr += src_stride;
        XORI_B2_128_SB(src3, src4);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_SH2(out0, out1, dst, 16);

        LD_SH2(src1_ptr + 32, 8, in2, in3);
        src1_ptr += src2_stride;

        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        HEVC_BIW_RND_CLIP2(dst0, dst1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1);

        out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_SH(out0, dst + 32);
        dst += dst_stride;
    }
}

static void hevc_hz_biwgt_8t_64w_msa(uint8_t *src0_ptr,
                                     int32_t src_stride,
                                     int16_t *src1_ptr,
                                     int32_t src2_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight0,
                                     int32_t weight1,
                                     int32_t offset0,
                                     int32_t offset1,
                                     int32_t rnd_val)
{
    uint8_t *src0_ptr_tmp;
    uint8_t *dst_tmp;
    int16_t *src1_ptr_tmp;
    uint32_t loop_cnt, cnt;
    int32_t offset, weight, constant;
    v16i8 src0, src1, src2;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);
    constant = 128 * weight1;
    constant <<= 6;
    offset += constant;

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        for (cnt = 2; cnt--;) {
            LD_SB2(src0_ptr_tmp, 16, src0, src1);
            src2 = LD_SB(src0_ptr_tmp + 24);
            src0_ptr_tmp += 32;
            LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
            src1_ptr_tmp += 32;
            XORI_B3_128_SB(src0, src1, src2);

            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                       vec0, vec1, vec2, vec3);
            dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);

            HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
                               in0, in1, in2, in3,
                               weight_vec, rnd_vec, offset_vec,
                               out0, out1, out2, out3);

            PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
            ST_SH2(out0, out1, dst_tmp, 16);
            dst_tmp += 32;
        }

        src0_ptr += src_stride;
        src1_ptr += src2_stride;
        dst += dst_stride;
    }
}

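/* Vertical 8-tap bi-weighted filters. Seven rows are preloaded and kept
 * as interleaved row pairs that slide down the column each iteration;
 * the XOR-by-128 bias on source bytes is compensated by adding
 * const_vec (128 << 6) scaled by weight1 to the offset vector. */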
static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);

        DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
                    filt0, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
                     filt1, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
                     filt2, filt2, dst10, dst32, dst54, dst76);
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
                     filt3, filt3, dst10, dst32, dst54, dst76);

        HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}

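/* 8-wide vertical variant: computes four rows per iteration and rotates
 * the interleaved row pairs so the next iteration reuses the six newest. */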
static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr,
                                    int32_t src_stride,
                                    int16_t *src1_ptr,
                                    int32_t src2_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    const int8_t *filter,
                                    int32_t height,
                                    int32_t weight0,
                                    int32_t weight1,
                                    int32_t offset0,
                                    int32_t offset1,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    int32_t offset, weight;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, out0, out1, out2, out3;
    v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_w(128);
    const_vec <<= 6;
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    weight1_vec = __msa_fill_w(weight1);
    offset_vec += const_vec * weight1_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);

        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
                    filt0, tmp0, tmp1, tmp2, tmp3);
        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
                     filt1, tmp0, tmp1, tmp2, tmp3);
        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
                     filt2, tmp0, tmp1, tmp2, tmp3);
        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
                     filt3, tmp0, tmp1, tmp2, tmp3);

        HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                           in0, in1, in2, in3,
                           weight_vec, rnd_vec, offset_vec,
                           out0, out1, out2, out3);

        PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

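/* 12-wide vertical variant: the left 8 columns use the right-interleaved
 * row pairs; the right 4 columns come from the left-interleaved pairs
 * packed two rows per vector and blended with a separate dot product. */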
1587 static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,
1588  int32_t src_stride,
1589  int16_t *src1_ptr,
1590  int32_t src2_stride,
1591  uint8_t *dst,
1592  int32_t dst_stride,
1593  const int8_t *filter,
1594  int32_t height,
1595  int32_t weight0,
1596  int32_t weight1,
1597  int32_t offset0,
1598  int32_t offset1,
1599  int32_t rnd_val)
1600 {
1601  uint32_t loop_cnt;
1603  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1604  v8i16 in0, in1, in2, in3;
1605  v16i8 src10_r, src32_r, src54_r, src76_r;
1606  v16i8 src21_r, src43_r, src65_r, src87_r;
1607  v8i16 tmp0, tmp1, tmp2;
1608  v16i8 src10_l, src32_l, src54_l, src76_l;
1609  v16i8 src21_l, src43_l, src65_l, src87_l;
1610  v16i8 src2110, src4332, src6554, src8776;
1611  v8i16 filt0, filt1, filt2, filt3;
1612  v8i16 out0, out1, out2, filter_vec;
1613  v4i32 dst2_r, dst2_l;
1614  v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1615 
1616  src0_ptr -= (3 * src_stride);
1617  offset = (offset0 + offset1) << rnd_val;
1618  weight0 = weight0 & 0x0000FFFF;
1619  weight = weight0 | (weight1 << 16);
1620 
1621  const_vec = __msa_ldi_w(128);
1622  const_vec <<= 6;
1623  offset_vec = __msa_fill_w(offset);
1624  weight_vec = __msa_fill_w(weight);
1625  rnd_vec = __msa_fill_w(rnd_val + 1);
1626  weight1_vec = __msa_fill_w(weight1);
1627  offset_vec += const_vec * weight1_vec;
1628 
1629  filter_vec = LD_SH(filter);
1630  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1631 
1632  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1633  src0_ptr += (7 * src_stride);
1634  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1635 
1636  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1637  src10_r, src32_r, src54_r, src21_r);
1638  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1639  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1640  src10_l, src32_l, src54_l, src21_l);
1641  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1642  ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1643  src2110, src4332, src6554);
1644 
1645  for (loop_cnt = 8; loop_cnt--;) {
1646  LD_SB2(src0_ptr, src_stride, src7, src8);
1647  src0_ptr += (2 * src_stride);
1648  LD_SH2(src1_ptr, src2_stride, in0, in1);
1649  LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
1650  src1_ptr += (2 * src2_stride);
1651  in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);
1652  XORI_B2_128_SB(src7, src8);
1653 
1654  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1655  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1656  src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
1657 
1658  DOTP_SB3_SH(src10_r, src21_r, src2110, filt0, filt0, filt0,
1659  tmp0, tmp1, tmp2);
1660  DPADD_SB2_SH(src32_r, src43_r, filt1, filt1, tmp0, tmp1);
1661  tmp2 = __msa_dpadd_s_h(tmp2, src4332, (v16i8) filt1);
1662  DPADD_SB2_SH(src54_r, src65_r, filt2, filt2, tmp0, tmp1);
1663  tmp2 = __msa_dpadd_s_h(tmp2, src6554, (v16i8) filt2);
1664  DPADD_SB2_SH(src76_r, src87_r, filt3, filt3, tmp0, tmp1);
1665  tmp2 = __msa_dpadd_s_h(tmp2, src8776, (v16i8) filt3);
1666 
1667  HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
1668  weight_vec, rnd_vec, offset_vec,
1669  out0, out1);
1670 
1671  ILVRL_H2_SW(tmp2, in2, dst2_r, dst2_l);
1672  dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1673  (v8i16) weight_vec);
1674  dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1675  (v8i16) weight_vec);
1676  SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
1677  out2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1678  CLIP_SH_0_255(out2);
1679  PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
1680  ST_D2(out0, 0, 1, dst, dst_stride);
1681  ST_W2(out2, 0, 1, dst + 8, dst_stride);
1682  dst += (2 * dst_stride);
1683 
1684  src10_r = src32_r;
1685  src32_r = src54_r;
1686  src54_r = src76_r;
1687  src21_r = src43_r;
1688  src43_r = src65_r;
1689  src65_r = src87_r;
1690  src2110 = src4332;
1691  src4332 = src6554;
1692  src6554 = src8776;
1693  src6 = src8;
1694  }
1695 }
1696 
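/* Vertical kernel for widths that are multiples of 16. The outer loop
 * steps across the block in 16-column strips; the inner loop produces
 * two rows per pass, keeping the six older interleaved row pairs (right
 * and left byte halves) live and sliding them down after each store. */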
1697 static void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr,
1698  int32_t src_stride,
1699  int16_t *src1_ptr,
1700  int32_t src2_stride,
1701  uint8_t *dst,
1702  int32_t dst_stride,
1703  const int8_t *filter,
1704  int32_t height,
1705  int32_t weight0,
1706  int32_t weight1,
1707  int32_t offset0,
1708  int32_t offset1,
1709  int32_t rnd_val,
1710  int32_t width)
1711 {
1712  uint8_t *src0_ptr_tmp;
1713  int16_t *src1_ptr_tmp;
1714  uint8_t *dst_tmp;
1715  uint32_t loop_cnt, cnt;
1716  int32_t offset, weight;
1717  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1718  v8i16 in0, in1, in2, in3;
1719  v16i8 src10_r, src32_r, src54_r, src76_r;
1720  v16i8 src21_r, src43_r, src65_r, src87_r;
1721  v16i8 src10_l, src32_l, src54_l, src76_l;
1722  v16i8 src21_l, src43_l, src65_l, src87_l;
1723  v8i16 tmp0, tmp1, tmp2, tmp3;
1724  v8i16 filt0, filt1, filt2, filt3;
1725  v8i16 filter_vec;
1726  v8i16 out0, out1, out2, out3;
1727  v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1728 
1729  src0_ptr -= (3 * src_stride);
1730 
1731  offset = (offset0 + offset1) << rnd_val;
1732  weight0 = weight0 & 0x0000FFFF;
1733  weight = weight0 | (weight1 << 16);
1734 
1735  const_vec = __msa_ldi_w(128);
1736  const_vec <<= 6;
1737  offset_vec = __msa_fill_w(offset);
1738  weight_vec = __msa_fill_w(weight);
1739  rnd_vec = __msa_fill_w(rnd_val + 1);
1740  weight1_vec = __msa_fill_w(weight1);
1741  offset_vec += const_vec * weight1_vec;
1742 
1743  filter_vec = LD_SH(filter);
1744  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1745 
1746  for (cnt = (width >> 4); cnt--;) {
1747  src0_ptr_tmp = src0_ptr;
1748  src1_ptr_tmp = src1_ptr;
1749  dst_tmp = dst;
1750 
1751  LD_SB7(src0_ptr_tmp, src_stride,
1752  src0, src1, src2, src3, src4, src5, src6);
1753  src0_ptr_tmp += (7 * src_stride);
1754 
1755  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1756  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1757  src10_r, src32_r, src54_r, src21_r);
1758  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1759  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1760  src10_l, src32_l, src54_l, src21_l);
1761  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1762 
1763  for (loop_cnt = (height >> 1); loop_cnt--;) {
1764  LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1765  src0_ptr_tmp += (2 * src_stride);
1766  LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1767  LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
1768  src1_ptr_tmp += (2 * src2_stride);
1769 
1770  XORI_B2_128_SB(src7, src8);
1771  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1772  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1773 
1774  DOTP_SB4_SH(src10_r, src21_r, src10_l, src21_l, filt0, filt0,
1775  filt0, filt0, tmp0, tmp1, tmp2, tmp3);
1776  DPADD_SB4_SH(src32_r, src43_r, src32_l, src43_l, filt1, filt1,
1777  filt1, filt1, tmp0, tmp1, tmp2, tmp3);
1778  DPADD_SB4_SH(src54_r, src65_r, src54_l, src65_l, filt2, filt2,
1779  filt2, filt2, tmp0, tmp1, tmp2, tmp3);
1780  DPADD_SB4_SH(src76_r, src87_r, src76_l, src87_l, filt3, filt3,
1781  filt3, filt3, tmp0, tmp1, tmp2, tmp3);
1782 
1783  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
1784  in0, in1, in2, in3,
1785  weight_vec, rnd_vec, offset_vec,
1786  out0, out1, out2, out3);
1787 
1788  PCKEV_B2_SH(out2, out0, out3, out1, out0, out1);
1789  ST_SH2(out0, out1, dst_tmp, dst_stride);
1790  dst_tmp += (2 * dst_stride);
1791 
1792  src10_r = src32_r;
1793  src32_r = src54_r;
1794  src54_r = src76_r;
1795  src21_r = src43_r;
1796  src43_r = src65_r;
1797  src65_r = src87_r;
1798  src10_l = src32_l;
1799  src32_l = src54_l;
1800  src54_l = src76_l;
1801  src21_l = src43_l;
1802  src43_l = src65_l;
1803  src65_l = src87_l;
1804  src6 = src8;
1805  }
1806 
1807  src0_ptr += 16;
1808  src1_ptr += 16;
1809  dst += 16;
1810  }
1811 }
1812 
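/* The 16/32/48/64-wide vertical wrappers below map straight onto the
 * 16-column kernel; the 24-wide one runs the kernel on the left 16
 * columns and an 8-wide pass on the remaining 8. */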
1813 static void hevc_vt_biwgt_8t_16w_msa(uint8_t *src0_ptr,
1814  int32_t src_stride,
1815  int16_t *src1_ptr,
1816  int32_t src2_stride,
1817  uint8_t *dst,
1818  int32_t dst_stride,
1819  const int8_t *filter,
1820  int32_t height,
1821  int32_t weight0,
1822  int32_t weight1,
1823  int32_t offset0,
1824  int32_t offset1,
1825  int32_t rnd_val)
1826 {
1827  hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1828  src1_ptr, src2_stride,
1829  dst, dst_stride, filter, height,
1830  weight0, weight1, offset0, offset1,
1831  rnd_val, 16);
1832 }
1833 
1834 static void hevc_vt_biwgt_8t_24w_msa(uint8_t *src0_ptr,
1835  int32_t src_stride,
1836  int16_t *src1_ptr,
1837  int32_t src2_stride,
1838  uint8_t *dst,
1839  int32_t dst_stride,
1840  const int8_t *filter,
1841  int32_t height,
1842  int32_t weight0,
1843  int32_t weight1,
1844  int32_t offset0,
1845  int32_t offset1,
1846  int32_t rnd_val)
1847 {
1848  hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1849  src1_ptr, src2_stride,
1850  dst, dst_stride, filter, height,
1851  weight0, weight1, offset0, offset1,
1852  rnd_val, 16);
1853  hevc_vt_biwgt_8t_8w_msa(src0_ptr + 16, src_stride,
1854  src1_ptr + 16, src2_stride,
1855  dst + 16, dst_stride, filter, height,
1856  weight0, weight1, offset0, offset1, rnd_val);
1857 }
1858 
1859 static void hevc_vt_biwgt_8t_32w_msa(uint8_t *src0_ptr,
1860  int32_t src_stride,
1861  int16_t *src1_ptr,
1862  int32_t src2_stride,
1863  uint8_t *dst,
1864  int32_t dst_stride,
1865  const int8_t *filter,
1866  int32_t height,
1867  int32_t weight0,
1868  int32_t weight1,
1869  int32_t offset0,
1870  int32_t offset1,
1871  int32_t rnd_val)
1872 {
1873  hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1874  src1_ptr, src2_stride,
1875  dst, dst_stride, filter, height,
1876  weight0, weight1, offset0, offset1,
1877  rnd_val, 32);
1878 }
1879 
1880 static void hevc_vt_biwgt_8t_48w_msa(uint8_t *src0_ptr,
1881  int32_t src_stride,
1882  int16_t *src1_ptr,
1883  int32_t src2_stride,
1884  uint8_t *dst,
1885  int32_t dst_stride,
1886  const int8_t *filter,
1887  int32_t height,
1888  int32_t weight0,
1889  int32_t weight1,
1890  int32_t offset0,
1891  int32_t offset1,
1892  int32_t rnd_val)
1893 {
1894  hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1895  src1_ptr, src2_stride,
1896  dst, dst_stride, filter, height,
1897  weight0, weight1, offset0, offset1,
1898  rnd_val, 48);
1899 }
1900 
1901 static void hevc_vt_biwgt_8t_64w_msa(uint8_t *src0_ptr,
1902  int32_t src_stride,
1903  int16_t *src1_ptr,
1904  int32_t src2_stride,
1905  uint8_t *dst,
1906  int32_t dst_stride,
1907  const int8_t *filter,
1908  int32_t height,
1909  int32_t weight0,
1910  int32_t weight1,
1911  int32_t offset0,
1912  int32_t offset1,
1913  int32_t rnd_val)
1914 {
1915  hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1916  src1_ptr, src2_stride,
1917  dst, dst_stride, filter, height,
1918  weight0, weight1, offset0, offset1,
1919  rnd_val, 64);
1920 }
1921 
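/* 2D (horizontal then vertical) 8-tap biweighted paths. The horizontal
 * filter is applied first and kept at 16-bit precision; the vertical
 * filter then runs on interleaved pairs of those intermediates at
 * 32-bit precision and is scaled down by 6 bits before the weighted
 * round/clip. The 4-wide version uses the two-source shuffle masks at
 * ff_hevc_mask_arr + 16 to pack two rows into each vector. */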
1922 static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
1923  int32_t src_stride,
1924  int16_t *src1_ptr,
1925  int32_t src2_stride,
1926  uint8_t *dst,
1927  int32_t dst_stride,
1928  const int8_t *filter_x,
1929  const int8_t *filter_y,
1930  int32_t height,
1931  int32_t weight0,
1932  int32_t weight1,
1933  int32_t offset0,
1934  int32_t offset1,
1935  int32_t rnd_val)
1936 {
1937  uint32_t loop_cnt;
1938  uint64_t tp0, tp1;
1939  int32_t offset, weight;
1940  v16u8 out;
1941  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1942  v8i16 in0 = { 0 }, in1 = { 0 };
1943  v8i16 filt0, filt1, filt2, filt3;
1944  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1945  v16i8 mask1, mask2, mask3;
1946  v8i16 filter_vec, weight_vec;
1947  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1948  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1949  v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
1950  v8i16 tmp0, tmp1, tmp2, tmp3;
1951  v8i16 dst10, dst32, dst54, dst76;
1952  v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98;
1953  v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
1954  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1955 
1956  src0_ptr -= ((3 * src_stride) + 3);
1957 
1958  filter_vec = LD_SH(filter_x);
1959  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1960 
1961  filter_vec = LD_SH(filter_y);
1962  UNPCK_R_SB_SH(filter_vec, filter_vec);
1963 
1964  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1965 
1966  mask1 = mask0 + 2;
1967  mask2 = mask0 + 4;
1968  mask3 = mask0 + 6;
1969 
1970  offset = (offset0 + offset1) << rnd_val;
1971  weight0 = weight0 & 0x0000FFFF;
1972  weight = weight0 | (weight1 << 16);
1973 
1974  const_vec = __msa_fill_w((128 * weight1));
1975  const_vec <<= 6;
1976  offset_vec = __msa_fill_w(offset);
1977  rnd_vec = __msa_fill_w(rnd_val + 1);
1978  offset_vec += const_vec;
1979  weight_vec = (v8i16) __msa_fill_w(weight);
1980 
1981  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1982  src0_ptr += (7 * src_stride);
1983 
1984  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1985 
1986  VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1987  VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1988  VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1989  vec8, vec9, vec10, vec11);
1990  VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1991  vec12, vec13, vec14, vec15);
1992 
1993  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1994  filt3);
1995  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1996  filt3);
1997  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1998  filt3);
1999  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2000  filt3);
2001 
2002  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2003  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2004  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2005 
2006  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2007 
2008  for (loop_cnt = height >> 2; loop_cnt--;) {
2009  LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2010  src0_ptr += (4 * src_stride);
2011  XORI_B4_128_SB(src7, src8, src9, src10);
2012 
2013  LD2(src1_ptr, src2_stride, tp0, tp1);
2014  INSERT_D2_SH(tp0, tp1, in0);
2015  src1_ptr += (2 * src2_stride);
2016  LD2(src1_ptr, src2_stride, tp0, tp1);
2017  INSERT_D2_SH(tp0, tp1, in1);
2018  src1_ptr += (2 * src2_stride);
2019 
2020  VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
2021  vec0, vec1, vec2, vec3);
2022  VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
2023  vec4, vec5, vec6, vec7);
2024  dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2025  filt3);
2026  dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2027  filt3);
2028 
2029  dst76 = __msa_ilvr_h(dst97, dst66);
2030  ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2031  dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2032  dst98 = __msa_ilvr_h(dst66, dst108);
2033 
2034  dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2035  filt_h2, filt_h3);
2036  dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2037  filt_h2, filt_h3);
2038  dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2039  filt_h2, filt_h3);
2040  dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2041  filt_h2, filt_h3);
2042  SRA_4V(dst0, dst1, dst2, dst3, 6);
2043  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2044  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2045  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2046  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2047  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2048  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2049  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2050  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2051  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
2052  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2053  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2054  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2055  dst += (4 * dst_stride);
2056 
2057  dst10 = dst54;
2058  dst32 = dst76;
2059  dst54 = dst98;
2060  dst21 = dst65;
2061  dst43 = dst87;
2062  dst65 = dst109;
2063  dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2064  }
2065 }
2066 
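/* 8-column 2D kernel shared by the 8/16/24/32/48/64-wide wrappers
 * further down (width8mult = width / 8). Seven rows of horizontal
 * results stay live; each inner iteration filters two fresh rows
 * vertically and then rotates the seven-row window. */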
2067 static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr,
2068  int32_t src_stride,
2069  int16_t *src1_ptr,
2070  int32_t src2_stride,
2071  uint8_t *dst,
2072  int32_t dst_stride,
2073  const int8_t *filter_x,
2074  const int8_t *filter_y,
2075  int32_t height,
2076  int32_t weight0,
2077  int32_t weight1,
2078  int32_t offset0,
2079  int32_t offset1,
2080  int32_t rnd_val,
2081  int32_t width8mult)
2082 {
2083  uint32_t loop_cnt, cnt;
2084  int32_t offset, weight;
2085  uint8_t *src0_ptr_tmp;
2086  int16_t *src1_ptr_tmp;
2087  uint8_t *dst_tmp;
2088  v16u8 out;
2089  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2090  v8i16 in0, in1;
2091  v8i16 filt0, filt1, filt2, filt3;
2092  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
2093  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2094  v16i8 mask1, mask2, mask3;
2095  v8i16 filter_vec, weight_vec;
2096  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2097  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2098  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
2099  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
2100  v8i16 tmp0, tmp1, tmp2, tmp3;
2101  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
2102  v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
2103  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
2104  v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
2105  v4i32 offset_vec, rnd_vec, const_vec;
2106 
2107  src0_ptr -= ((3 * src_stride) + 3);
2108 
2109  offset = (offset0 + offset1) << rnd_val;
2110  weight0 = weight0 & 0x0000FFFF;
2111  weight = weight0 | (weight1 << 16);
2112 
2113  const_vec = __msa_fill_w((128 * weight1));
2114  const_vec <<= 6;
2115  offset_vec = __msa_fill_w(offset);
2116  rnd_vec = __msa_fill_w(rnd_val + 1);
2117  offset_vec += const_vec;
2118  weight_vec = (v8i16) __msa_fill_w(weight);
2119 
2120  filter_vec = LD_SH(filter_x);
2121  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2122 
2123  filter_vec = LD_SH(filter_y);
2124  UNPCK_R_SB_SH(filter_vec, filter_vec);
2125 
2126  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2127 
2128  mask1 = mask0 + 2;
2129  mask2 = mask0 + 4;
2130  mask3 = mask0 + 6;
2131 
2132  for (cnt = width8mult; cnt--;) {
2133  src0_ptr_tmp = src0_ptr;
2134  src1_ptr_tmp = src1_ptr;
2135  dst_tmp = dst;
2136 
2137  LD_SB7(src0_ptr_tmp, src_stride,
2138  src0, src1, src2, src3, src4, src5, src6);
2139  src0_ptr_tmp += (7 * src_stride);
2140 
2141  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2142 
2143  /* row 0 row 1 row 2 row 3 */
2144  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
2145  vec0, vec1, vec2, vec3);
2146  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
2147  vec4, vec5, vec6, vec7);
2148  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
2149  vec8, vec9, vec10, vec11);
2150  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
2151  vec12, vec13, vec14, vec15);
2152 
2153  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2154  filt3);
2155  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2156  filt3);
2157  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2158  filt3);
2159  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2160  filt2, filt3);
2161 
2162  /* row 4 row 5 row 6 */
2163  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
2164  vec0, vec1, vec2, vec3);
2165  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
2166  vec4, vec5, vec6, vec7);
2167  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
2168  vec8, vec9, vec10, vec11);
2169 
2170  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2171  filt3);
2172  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2173  filt3);
2174  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2175  filt3);
2176 
2177  for (loop_cnt = height >> 1; loop_cnt--;) {
2178  LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2179  XORI_B2_128_SB(src7, src8);
2180  src0_ptr_tmp += 2 * src_stride;
2181 
2182  LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2183  src1_ptr_tmp += (2 * src2_stride);
2184 
2185  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
2186  dst32_r, dst54_r, dst21_r);
2187  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
2188  dst32_l, dst54_l, dst21_l);
2189  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
2190  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
2191 
2192  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
2193  vec0, vec1, vec2, vec3);
2194  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2195  filt2, filt3);
2196 
2197  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
2198  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
2199  filt_h0, filt_h1, filt_h2, filt_h3);
2200  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
2201  filt_h0, filt_h1, filt_h2, filt_h3);
2202 
2203  dst0_r >>= 6;
2204  dst0_l >>= 6;
2205 
2206  /* row 8 */
2207  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
2208  vec0, vec1, vec2, vec3);
2209  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2210  filt2, filt3);
2211 
2212  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
2213  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
2214  filt_h0, filt_h1, filt_h2, filt_h3);
2215  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
2216  filt_h0, filt_h1, filt_h2, filt_h3);
2217 
2218  dst1_r >>= 6;
2219  dst1_l >>= 6;
2220 
2221  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
2222  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2223  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2224  dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2225  dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2226  dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2227  dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2228  SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec);
2229  CLIP_SW4_0_255(dst0_l, dst0_r, dst1_l, dst1_r);
2230  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
2231  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2232  ST_D2(out, 0, 1, dst_tmp, dst_stride);
2233  dst_tmp += (2 * dst_stride);
2234 
2235  dst0 = dst2;
2236  dst1 = dst3;
2237  dst2 = dst4;
2238  dst3 = dst5;
2239  dst4 = dst6;
2240  dst5 = dst7;
2241  dst6 = dst8;
2242  }
2243 
2244  src0_ptr += 8;
2245  src1_ptr += 8;
2246  dst += 8;
2247  }
2248 }
2249 
2250 static void hevc_hv_biwgt_8t_8w_msa(uint8_t *src0_ptr,
2251  int32_t src_stride,
2252  int16_t *src1_ptr,
2253  int32_t src2_stride,
2254  uint8_t *dst,
2255  int32_t dst_stride,
2256  const int8_t *filter_x,
2257  const int8_t *filter_y,
2258  int32_t height,
2259  int32_t weight0,
2260  int32_t weight1,
2261  int32_t offset0,
2262  int32_t offset1,
2263  int32_t rnd_val)
2264 {
2265  hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2266  src1_ptr, src2_stride,
2267  dst, dst_stride, filter_x, filter_y,
2268  height, weight0, weight1, offset0,
2269  offset1, rnd_val, 1);
2270 }
2271 
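/* 2D 12-wide path: the left 8 columns follow the 8-column scheme
 * above, after which the right 4 columns are processed with the
 * two-source masks (ff_hevc_mask_arr + 16), two rows per vector.
 * Both halves are unrolled for a fixed 16 rows. */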
2272 static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
2273  int32_t src_stride,
2274  int16_t *src1_ptr,
2275  int32_t src2_stride,
2276  uint8_t *dst,
2277  int32_t dst_stride,
2278  const int8_t *filter_x,
2279  const int8_t *filter_y,
2280  int32_t height,
2281  int32_t weight0,
2282  int32_t weight1,
2283  int32_t offset0,
2284  int32_t offset1,
2285  int32_t rnd_val)
2286 {
2287  uint32_t loop_cnt;
2288  uint8_t *src0_ptr_tmp, *dst_tmp;
2289  int16_t *src1_ptr_tmp;
2290  int32_t offset, weight;
2291  uint64_t tp0, tp1;
2292  v16u8 out;
2293  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2294  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2295  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2296  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
2297  v8i16 in0 = { 0 }, in1 = { 0 };
2298  v8i16 filter_vec, weight_vec, tmp0, tmp1, tmp2, tmp3;
2299  v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
2300  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
2301  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
2302  v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
2303  v8i16 dst30, dst41, dst52, dst63, dst66, dst87, dst10, dst32, dst54, dst76;
2304  v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98, dst87_r, dst87_l;
2305  v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
2306 
2307  src0_ptr -= ((3 * src_stride) + 3);
2308 
2309  offset = (offset0 + offset1) << rnd_val;
2310  weight0 = weight0 & 0x0000FFFF;
2311  weight = weight0 | (weight1 << 16);
2312 
2313  const_vec = __msa_fill_w((128 * weight1));
2314  const_vec <<= 6;
2315  offset_vec = __msa_fill_w(offset);
2316  rnd_vec = __msa_fill_w(rnd_val + 1);
2317  offset_vec += const_vec;
2318  weight_vec = (v8i16) __msa_fill_w(weight);
2319 
2320  filter_vec = LD_SH(filter_x);
2321  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2322 
2323  filter_vec = LD_SH(filter_y);
2324  UNPCK_R_SB_SH(filter_vec, filter_vec);
2325 
2326  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2327 
2328  mask0 = LD_SB(ff_hevc_mask_arr);
2329  mask1 = mask0 + 2;
2330  mask2 = mask0 + 4;
2331  mask3 = mask0 + 6;
2332 
2333  src0_ptr_tmp = src0_ptr;
2334  src1_ptr_tmp = src1_ptr;
2335  dst_tmp = dst;
2336 
2337  LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
2338  src0_ptr_tmp += (7 * src_stride);
2339  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2340 
2341  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2342  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2343  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2344  vec11);
2345  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
2346  vec15);
2347  dsth0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2348  filt3);
2349  dsth1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2350  filt3);
2351  dsth2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2352  filt3);
2353  dsth3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2354  filt2, filt3);
2355  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2356  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2357  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2358  vec11);
2359  dsth4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2360  filt3);
2361  dsth5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2362  filt3);
2363  dsth6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2364  filt3);
2365 
2366  for (loop_cnt = 8; loop_cnt--;) {
2367  LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2368  src0_ptr_tmp += (2 * src_stride);
2369  XORI_B2_128_SB(src7, src8);
2370 
2371  LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2372  src1_ptr_tmp += (2 * src2_stride);
2373 
2374  ILVR_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2375  dst10_r, dst32_r, dst54_r, dst21_r);
2376  ILVL_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2377  dst10_l, dst32_l, dst54_l, dst21_l);
2378  ILVR_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_r, dst65_r);
2379  ILVL_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_l, dst65_l);
2380 
2381  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2382  vec3);
2383  dsth7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2384  filt3);
2385 
2386  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
2387  dst0 = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2388  filt_h1, filt_h2, filt_h3);
2389  dst1 = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
2390  filt_h1, filt_h2, filt_h3);
2391  dst0 >>= 6;
2392  dst1 >>= 6;
2393 
2394  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2395  vec3);
2396  dsth8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2397  filt3);
2398 
2399  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
2400  dst2 = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
2401  filt_h1, filt_h2, filt_h3);
2402  dst3 = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, filt_h0,
2403  filt_h1, filt_h2, filt_h3);
2404  dst2 >>= 6;
2405  dst3 >>= 6;
2406 
2407  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2408  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2409  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2410  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2411  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2412  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2413  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2414  SRAR_W4_SW(dst1, dst0, dst3, dst2, rnd_vec);
2415  CLIP_SW4_0_255(dst1, dst0, dst3, dst2);
2416  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2417  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2418  ST_D2(out, 0, 1, dst_tmp, dst_stride);
2419  dst_tmp += (2 * dst_stride);
2420 
2421  dsth0 = dsth2;
2422  dsth1 = dsth3;
2423  dsth2 = dsth4;
2424  dsth3 = dsth5;
2425  dsth4 = dsth6;
2426  dsth5 = dsth7;
2427  dsth6 = dsth8;
2428  }
2429 
2430  src0_ptr += 8;
2431  src1_ptr += 8;
2432  dst += 8;
2433 
2434  mask4 = LD_SB(ff_hevc_mask_arr + 16);
2435  mask5 = mask4 + 2;
2436  mask6 = mask4 + 4;
2437  mask7 = mask4 + 6;
2438 
2439  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
2440  src0_ptr += (7 * src_stride);
2441  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2442 
2443  VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2444  VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2445  VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
2446  vec11);
2447  VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
2448  vec15);
2449  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2450  filt3);
2451  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2452  filt3);
2453  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2454  filt3);
2455  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2456  filt3);
2457  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2458  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2459  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2460 
2461  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2462 
2463  for (loop_cnt = 4; loop_cnt--;) {
2464  LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2465  src0_ptr += (4 * src_stride);
2466  XORI_B4_128_SB(src7, src8, src9, src10);
2467 
2468  LD2(src1_ptr, src2_stride, tp0, tp1);
2469  INSERT_D2_SH(tp0, tp1, in0);
2470  src1_ptr += (2 * src2_stride);
2471  LD2(src1_ptr, src2_stride, tp0, tp1);
2472  INSERT_D2_SH(tp0, tp1, in1);
2473  src1_ptr += (2 * src2_stride);
2474 
2475  VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2476  vec3);
2477  VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2478  vec7);
2479  dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2480  filt3);
2481  dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2482  filt3);
2483 
2484  dst76 = __msa_ilvr_h(dst97, dst66);
2485  ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2486  dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2487  dst98 = __msa_ilvr_h(dst66, dst108);
2488 
2489  dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2490  filt_h2, filt_h3);
2491  dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2492  filt_h2, filt_h3);
2493  dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2494  filt_h2, filt_h3);
2495  dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2496  filt_h2, filt_h3);
2497  SRA_4V(dst0, dst1, dst2, dst3, 6);
2498  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2499  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2500  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2501  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2502  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2503  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2504  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2505  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2506  CLIP_SW4_0_255(dst0, dst1, dst2, dst3);
2507  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2508  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2509  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2510  dst += (4 * dst_stride);
2511 
2512  dst10 = dst54;
2513  dst32 = dst76;
2514  dst54 = dst98;
2515  dst21 = dst65;
2516  dst43 = dst87;
2517  dst65 = dst109;
2518  dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2519  }
2520 }
2521 
2522 static void hevc_hv_biwgt_8t_16w_msa(uint8_t *src0_ptr,
2523  int32_t src_stride,
2524  int16_t *src1_ptr,
2525  int32_t src2_stride,
2526  uint8_t *dst,
2527  int32_t dst_stride,
2528  const int8_t *filter_x,
2529  const int8_t *filter_y,
2530  int32_t height,
2531  int32_t weight0,
2532  int32_t weight1,
2533  int32_t offset0,
2534  int32_t offset1,
2535  int32_t rnd_val)
2536 {
2537  hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2538  src1_ptr, src2_stride,
2539  dst, dst_stride, filter_x, filter_y,
2540  height, weight0, weight1, offset0,
2541  offset1, rnd_val, 2);
2542 }
2543 
2544 static void hevc_hv_biwgt_8t_24w_msa(uint8_t *src0_ptr,
2545  int32_t src_stride,
2546  int16_t *src1_ptr,
2547  int32_t src2_stride,
2548  uint8_t *dst,
2549  int32_t dst_stride,
2550  const int8_t *filter_x,
2551  const int8_t *filter_y,
2552  int32_t height,
2553  int32_t weight0,
2554  int32_t weight1,
2555  int32_t offset0,
2556  int32_t offset1,
2557  int32_t rnd_val)
2558 {
2559  hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2560  src1_ptr, src2_stride,
2561  dst, dst_stride, filter_x, filter_y,
2562  height, weight0, weight1, offset0,
2563  offset1, rnd_val, 3);
2564 }
2565 
2566 static void hevc_hv_biwgt_8t_32w_msa(uint8_t *src0_ptr,
2567  int32_t src_stride,
2568  int16_t *src1_ptr,
2569  int32_t src2_stride,
2570  uint8_t *dst,
2571  int32_t dst_stride,
2572  const int8_t *filter_x,
2573  const int8_t *filter_y,
2574  int32_t height,
2575  int32_t weight0,
2576  int32_t weight1,
2577  int32_t offset0,
2578  int32_t offset1,
2579  int32_t rnd_val)
2580 {
2581  hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2582  src1_ptr, src2_stride,
2583  dst, dst_stride, filter_x, filter_y,
2584  height, weight0, weight1, offset0,
2585  offset1, rnd_val, 4);
2586 }
2587 
2588 static void hevc_hv_biwgt_8t_48w_msa(uint8_t *src0_ptr,
2589  int32_t src_stride,
2590  int16_t *src1_ptr,
2591  int32_t src2_stride,
2592  uint8_t *dst,
2593  int32_t dst_stride,
2594  const int8_t *filter_x,
2595  const int8_t *filter_y,
2596  int32_t height,
2597  int32_t weight0,
2598  int32_t weight1,
2599  int32_t offset0,
2600  int32_t offset1,
2601  int32_t rnd_val)
2602 {
2603  hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2604  src1_ptr, src2_stride,
2605  dst, dst_stride, filter_x, filter_y,
2606  height, weight0, weight1, offset0,
2607  offset1, rnd_val, 6);
2608 }
2609 
2610 static void hevc_hv_biwgt_8t_64w_msa(uint8_t *src0_ptr,
2611  int32_t src_stride,
2612  int16_t *src1_ptr,
2613  int32_t src2_stride,
2614  uint8_t *dst,
2615  int32_t dst_stride,
2616  const int8_t *filter_x,
2617  const int8_t *filter_y,
2618  int32_t height,
2619  int32_t weight0,
2620  int32_t weight1,
2621  int32_t offset0,
2622  int32_t offset1,
2623  int32_t rnd_val)
2624 {
2625  hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2626  src1_ptr, src2_stride,
2627  dst, dst_stride, filter_x, filter_y,
2628  height, weight0, weight1, offset0,
2629  offset1, rnd_val, 8);
2630 }
2631 
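/* 4-tap horizontal biweighted paths (chroma) follow. The scalar
 * precompute mirrors the vector setup used above: weight0 and weight1
 * are packed into one 32-bit lane for the dot product, and the
 * "constant" term (128 * weight1) << 6 folded into the offset cancels
 * the -8192 bias that XORing the 8-bit source with 128 leaves in each
 * filtered value (the filter taps sum to 64). */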
2632 static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
2633  int32_t src_stride,
2634  int16_t *src1_ptr,
2635  int32_t src2_stride,
2636  uint8_t *dst,
2637  int32_t dst_stride,
2638  const int8_t *filter,
2639  int32_t weight0,
2640  int32_t weight1,
2641  int32_t offset0,
2642  int32_t offset1,
2643  int32_t rnd_val)
2644 {
2645  int32_t offset, weight, constant;
2646  v8i16 filt0, filt1;
2647  v16i8 src0, src1;
2648  v8i16 in0, in1;
2649  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2650  v16i8 mask1, vec0, vec1;
2651  v8i16 dst0;
2652  v4i32 dst0_r, dst0_l;
2653  v8i16 out0, filter_vec;
2654  v4i32 weight_vec, offset_vec, rnd_vec;
2655 
2656  src0_ptr -= 1;
2657 
2658  filter_vec = LD_SH(filter);
2659  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2660 
2661  mask1 = mask0 + 2;
2662 
2663  offset = (offset0 + offset1) << rnd_val;
2664  weight0 = weight0 & 0x0000FFFF;
2665  weight = weight0 | (weight1 << 16);
2666  constant = 128 * weight1;
2667  constant <<= 6;
2668  offset += constant;
2669 
2670  offset_vec = __msa_fill_w(offset);
2671  weight_vec = __msa_fill_w(weight);
2672  rnd_vec = __msa_fill_w(rnd_val + 1);
2673 
2674  LD_SB2(src0_ptr, src_stride, src0, src1);
2675  LD_SH2(src1_ptr, src2_stride, in0, in1);
2676  in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2677  XORI_B2_128_SB(src0, src1);
2678 
2679  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2680  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2681 
2682  ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
2683  dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
2684  dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
2685  SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
2686  out0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2687  CLIP_SH_0_255(out0);
2688  out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
2689  ST_W2(out0, 0, 1, dst, dst_stride);
2690 }
2691 
2692 static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
2693  int32_t src_stride,
2694  int16_t *src1_ptr,
2695  int32_t src2_stride,
2696  uint8_t *dst,
2697  int32_t dst_stride,
2698  const int8_t *filter,
2699  int32_t weight0,
2700  int32_t weight1,
2701  int32_t offset0,
2702  int32_t offset1,
2703  int32_t rnd_val)
2704 {
2705  int32_t offset, weight, constant;
2706  v8i16 filt0, filt1;
2707  v16i8 src0, src1, src2, src3;
2708  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2709  v16i8 mask1;
2710  v8i16 dst0, dst1;
2711  v16i8 vec0, vec1;
2712  v8i16 in0, in1, in2, in3;
2713  v8i16 filter_vec;
2714  v4i32 weight_vec, offset_vec, rnd_vec;
2715 
2716  src0_ptr -= 1;
2717 
2718  /* rearranging filter */
2719  filter_vec = LD_SH(filter);
2720  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2721 
2722  mask1 = mask0 + 2;
2723 
2724  offset = (offset0 + offset1) << rnd_val;
2725  weight0 = weight0 & 0x0000FFFF;
2726  weight = weight0 | (weight1 << 16);
2727  constant = 128 * weight1;
2728  constant <<= 6;
2729  offset += constant;
2730 
2731  offset_vec = __msa_fill_w(offset);
2732  weight_vec = __msa_fill_w(weight);
2733  rnd_vec = __msa_fill_w(rnd_val + 1);
2734 
2735  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2736  XORI_B4_128_SB(src0, src1, src2, src3);
2737  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2738  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2739 
2740  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2741  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2742  VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2743  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2744  HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
2745  weight_vec, rnd_vec, offset_vec,
2746  dst0, dst1);
2747 
2748  dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2749  ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
2750 }
2751 
2752 static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
2753  int32_t src_stride,
2754  int16_t *src1_ptr,
2755  int32_t src2_stride,
2756  uint8_t *dst,
2757  int32_t dst_stride,
2758  const int8_t *filter,
2759  int32_t height,
2760  int32_t weight0,
2761  int32_t weight1,
2762  int32_t offset0,
2763  int32_t offset1,
2764  int32_t rnd_val)
2765 {
2766  uint32_t loop_cnt;
2767  int32_t weight, offset, constant;
2768  v8i16 filt0, filt1;
2769  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2770  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2771  v16i8 mask1;
2772  v16i8 vec0, vec1;
2773  v8i16 dst0, dst1, dst2, dst3;
2774  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2775  v8i16 filter_vec;
2776  v4i32 weight_vec, offset_vec, rnd_vec;
2777 
2778  src0_ptr -= 1;
2779 
2780  filter_vec = LD_SH(filter);
2781  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2782 
2783  offset = (offset0 + offset1) << rnd_val;
2784  weight0 = weight0 & 0x0000FFFF;
2785  weight = weight0 | (weight1 << 16);
2786  constant = 128 * weight1;
2787  constant <<= 6;
2788  offset += constant;
2789 
2790  offset_vec = __msa_fill_w(offset);
2791  weight_vec = __msa_fill_w(weight);
2792  rnd_vec = __msa_fill_w(rnd_val + 1);
2793 
2794  mask1 = mask0 + 2;
2795 
2796  for (loop_cnt = (height >> 3); loop_cnt--;) {
2797  LD_SB8(src0_ptr, src_stride,
2798  src0, src1, src2, src3, src4, src5, src6, src7);
2799  src0_ptr += (8 * src_stride);
2800  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2801  src1_ptr += (4 * src2_stride);
2802  LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
2803  src1_ptr += (4 * src2_stride);
2804  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2805  ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
2806  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2807 
2808  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2809  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2810  VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2811  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2812  VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
2813  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2814  VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
2815  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2816  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2817  in0, in1, in2, in3,
2818  weight_vec, rnd_vec, offset_vec,
2819  dst0, dst1, dst2, dst3);
2820 
2821  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2822  ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2823  dst += (8 * dst_stride);
2824  }
2825 }
2826 
2827 static void hevc_hz_biwgt_4t_4w_msa(uint8_t *src0_ptr,
2828  int32_t src_stride,
2829  int16_t *src1_ptr,
2830  int32_t src2_stride,
2831  uint8_t *dst,
2832  int32_t dst_stride,
2833  const int8_t *filter,
2834  int32_t height,
2835  int32_t weight0,
2836  int32_t weight1,
2837  int32_t offset0,
2838  int32_t offset1,
2839  int32_t rnd_val)
2840 {
2841  if (2 == height) {
2842  hevc_hz_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2843  dst, dst_stride, filter,
2844  weight0, weight1, offset0, offset1, rnd_val);
2845  } else if (4 == height) {
2846  hevc_hz_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2847  dst, dst_stride, filter,
2848  weight0, weight1, offset0, offset1, rnd_val);
2849  } else if (0 == (height % 8)) {
2850  hevc_hz_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
2851  src1_ptr, src2_stride,
2852  dst, dst_stride, filter, height,
2853  weight0, weight1, offset0, offset1,
2854  rnd_val);
2855  }
2856 }
2857 
2858 static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr,
2859  int32_t src_stride,
2860  int16_t *src1_ptr,
2861  int32_t src2_stride,
2862  uint8_t *dst,
2863  int32_t dst_stride,
2864  const int8_t *filter,
2865  int32_t height,
2866  int32_t weight0,
2867  int32_t weight1,
2868  int32_t offset0,
2869  int32_t offset1,
2870  int32_t rnd_val)
2871 {
2872  uint32_t loop_cnt;
2873  int32_t offset, weight, constant;
2874  v8i16 filt0, filt1;
2875  v16i8 src0, src1, src2, src3;
2876  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2877  v16i8 mask1;
2878  v16i8 vec0, vec1;
2879  v8i16 in0, in1, in2, in3;
2880  v8i16 dst0, dst1, dst2, dst3;
2881  v8i16 filter_vec;
2882  v4i32 weight_vec, offset_vec, rnd_vec;
2883 
2884  src0_ptr -= 1;
2885 
2886  filter_vec = LD_SH(filter);
2887  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2888 
2889  offset = (offset0 + offset1) << rnd_val;
2890  weight0 = weight0 & 0x0000FFFF;
2891  weight = weight0 | (weight1 << 16);
2892  constant = 128 * weight1;
2893  constant <<= 6;
2894  offset += constant;
2895 
2896  offset_vec = __msa_fill_w(offset);
2897  weight_vec = __msa_fill_w(weight);
2898  rnd_vec = __msa_fill_w(rnd_val + 1);
2899 
2900  mask1 = mask0 + 2;
2901 
2902  for (loop_cnt = 2; loop_cnt--;) {
2903  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2904  src0_ptr += (4 * src_stride);
2905  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2906  src1_ptr += (4 * src2_stride);
2907  XORI_B4_128_SB(src0, src1, src2, src3);
2908 
2909  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2910  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2911  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2912  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2913  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2914  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2915  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2916  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2917 
2918  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2919  in0, in1, in2, in3,
2920  weight_vec, rnd_vec, offset_vec,
2921  dst0, dst1, dst2, dst3);
2922 
2923  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2924  ST_W2(dst0, 0, 2, dst, dst_stride);
2925  ST_H2(dst0, 2, 6, dst + 4, dst_stride);
2926  ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
2927  ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2928  dst += (4 * dst_stride);
2929  }
2930 }
2931 
2932 static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
2933  int32_t src_stride,
2934  int16_t *src1_ptr,
2935  int32_t src2_stride,
2936  uint8_t *dst,
2937  int32_t dst_stride,
2938  const int8_t *filter,
2939  int32_t weight0,
2940  int32_t weight1,
2941  int32_t offset0,
2942  int32_t offset1,
2943  int32_t rnd_val)
2944 {
2945  int32_t offset, weight, constant;
2946  v8i16 filt0, filt1;
2947  v16i8 src0, src1;
2948  v8i16 in0, in1;
2949  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2950  v16i8 mask1, vec0, vec1;
2951  v8i16 dst0, dst1;
2952  v8i16 filter_vec;
2953  v4i32 weight_vec, offset_vec, rnd_vec;
2954 
2955  src0_ptr -= 1;
2956 
2957  filter_vec = LD_SH(filter);
2958  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2959 
2960  offset = (offset0 + offset1) << rnd_val;
2961  weight0 = weight0 & 0x0000FFFF;
2962  weight = weight0 | (weight1 << 16);
2963  constant = 128 * weight1;
2964  constant <<= 6;
2965  offset += constant;
2966 
2967  offset_vec = __msa_fill_w(offset);
2968  weight_vec = __msa_fill_w(weight);
2969  rnd_vec = __msa_fill_w(rnd_val + 1);
2970 
2971  mask1 = mask0 + 2;
2972 
2973  LD_SB2(src0_ptr, src_stride, src0, src1);
2974  LD_SH2(src1_ptr, src2_stride, in0, in1);
2975  XORI_B2_128_SB(src0, src1);
2976  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2977  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2978  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2979  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2980  HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
2981  weight_vec, rnd_vec, offset_vec,
2982  dst0, dst1);
2983 
2984  dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2985  ST_D2(dst0, 0, 1, dst, dst_stride);
2986 }
2987 
2988 static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
2989  int32_t src_stride,
2990  int16_t *src1_ptr,
2991  int32_t src2_stride,
2992  uint8_t *dst,
2993  int32_t dst_stride,
2994  const int8_t *filter,
2995  int32_t weight0,
2996  int32_t weight1,
2997  int32_t offset0,
2998  int32_t offset1,
2999  int32_t rnd_val)
3000 {
3001  int32_t weight, offset, constant;
3002  v8i16 filt0, filt1;
3003  v16i8 src0, src1, src2, src3, src4, src5;
3004  v8i16 in0, in1, in2, in3, in4, in5;
3005  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3006  v16i8 mask1;
3007  v16i8 vec0, vec1;
3008  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3009  v8i16 filter_vec;
3010  v4i32 weight_vec, offset_vec, rnd_vec;
3011 
3012  src0_ptr -= 1;
3013 
3014  filter_vec = LD_SH(filter);
3015  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3016 
3017  offset = (offset0 + offset1) << rnd_val;
3018  weight0 = weight0 & 0x0000FFFF;
3019  weight = weight0 | (weight1 << 16);
3020  constant = 128 * weight1;
3021  constant <<= 6;
3022  offset += constant;
3023 
3024  offset_vec = __msa_fill_w(offset);
3025  weight_vec = __msa_fill_w(weight);
3026  rnd_vec = __msa_fill_w(rnd_val + 1);
3027 
3028  mask1 = mask0 + 2;
3029 
3030  LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
3031 
3032  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3033  src1_ptr += (4 * src2_stride);
3034  LD_SH2(src1_ptr, src2_stride, in4, in5);
3035  XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
3036  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3037  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3038  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3039  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3040  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3041  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3042  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3043  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3044  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3045  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3046  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3047  dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3048  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3049  in0, in1, in2, in3,
3050  weight_vec, rnd_vec, offset_vec,
3051  dst0, dst1, dst2, dst3);
3052  HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
3053  weight_vec, rnd_vec, offset_vec,
3054  dst4, dst5);
3055 
3056  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3057  dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3058  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3059  ST_D2(dst3, 0, 1, dst + 4 * dst_stride, dst_stride);
3060 }
3061 
3062 static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
3063  int32_t src_stride,
3064  int16_t *src1_ptr,
3065  int32_t src2_stride,
3066  uint8_t *dst,
3067  int32_t dst_stride,
3068  const int8_t *filter,
3069  int32_t height,
3070  int32_t weight0,
3071  int32_t weight1,
3072  int32_t offset0,
3073  int32_t offset1,
3074  int32_t rnd_val)
3075 {
3076  uint32_t loop_cnt;
3077  int32_t offset, weight, constant;
3078  v8i16 filt0, filt1;
3079  v16i8 src0, src1, src2, src3;
3080  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3081  v16i8 mask1;
3082  v16i8 vec0, vec1;
3083  v8i16 in0, in1, in2, in3;
3084  v8i16 dst0, dst1, dst2, dst3;
3085  v8i16 filter_vec;
3086  v4i32 weight_vec, offset_vec, rnd_vec;
3087 
3088  src0_ptr -= 1;
3089 
3090  filter_vec = LD_SH(filter);
3091  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3092 
3093  offset = (offset0 + offset1) << rnd_val;
3094  weight0 = weight0 & 0x0000FFFF;
3095  weight = weight0 | (weight1 << 16);
3096  constant = 128 * weight1;
3097  constant <<= 6;
3098  offset += constant;
3099 
3100  offset_vec = __msa_fill_w(offset);
3101  weight_vec = __msa_fill_w(weight);
3102  rnd_vec = __msa_fill_w(rnd_val + 1);
3103 
3104  mask1 = mask0 + 2;
3105 
3106  for (loop_cnt = (height >> 2); loop_cnt--;) {
3107  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
3108  src0_ptr += (4 * src_stride);
3109  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3110  src1_ptr += (4 * src2_stride);
3111  XORI_B4_128_SB(src0, src1, src2, src3);
3112 
3113  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3114  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3115  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3116  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3117  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3118  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3119  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3120  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3121  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3122  in0, in1, in2, in3,
3123  weight_vec, rnd_vec, offset_vec,
3124  dst0, dst1, dst2, dst3);
3125 
3126  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3127  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3128  dst += (4 * dst_stride);
3129  }
3130 }
3131 
3132 static void hevc_hz_biwgt_4t_8w_msa(uint8_t *src0_ptr,
3133  int32_t src_stride,
3134  int16_t *src1_ptr,
3135  int32_t src2_stride,
3136  uint8_t *dst,
3137  int32_t dst_stride,
3138  const int8_t *filter,
3139  int32_t height,
3140  int32_t weight0,
3141  int32_t weight1,
3142  int32_t offset0,
3143  int32_t offset1,
3144  int32_t rnd_val)
3145 {
3146  if (2 == height) {
3147  hevc_hz_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3148  dst, dst_stride, filter,
3149  weight0, weight1, offset0, offset1, rnd_val);
3150  } else if (6 == height) {
3151  hevc_hz_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3152  dst, dst_stride, filter,
3153  weight0, weight1, offset0, offset1, rnd_val);
3154  } else if (0 == (height % 4)) {
3155  hevc_hz_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
3156  src1_ptr, src2_stride,
3157  dst, dst_stride, filter, height,
3158  weight0, weight1, offset0, offset1,
3159  rnd_val);
3160  }
3161 }
3162 
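/* 12-wide horizontal path: mask0/mask1 filter the left 8 columns of
 * each row, while mask2/mask3 pull bytes 8..12 from two consecutive
 * source vectors so the right 4 columns of two rows are filtered in a
 * single vector. */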
3163 static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr,
3164  int32_t src_stride,
3165  int16_t *src1_ptr,
3166  int32_t src2_stride,
3167  uint8_t *dst,
3168  int32_t dst_stride,
3169  const int8_t *filter,
3170  int32_t height,
3171  int32_t weight0,
3172  int32_t weight1,
3173  int32_t offset0,
3174  int32_t offset1,
3175  int32_t rnd_val)
3176 {
3177  uint32_t loop_cnt;
3178  int32_t offset, weight, constant;
3179  v8i16 filt0, filt1;
3180  v16i8 src0, src1, src2, src3;
3181  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3182  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3183  v16i8 mask2 = {
3184  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
3185  };
3186  v16i8 mask1, mask3;
3187  v16i8 vec0, vec1;
3188  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3189  v8i16 filter_vec;
3190  v4i32 weight_vec, offset_vec, rnd_vec;
3191 
3192  src0_ptr -= 1;
3193 
3194  filter_vec = LD_SH(filter);
3195  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3196 
3197  offset = (offset0 + offset1) << rnd_val;
3198  weight0 = weight0 & 0x0000FFFF;
3199  weight = weight0 | (weight1 << 16);
3200  constant = 128 * weight1;
3201  constant <<= 6;
3202  offset += constant;
3203 
3204  offset_vec = __msa_fill_w(offset);
3205  weight_vec = __msa_fill_w(weight);
3206  rnd_vec = __msa_fill_w(rnd_val + 1);
3207 
3208  mask1 = mask0 + 2;
3209  mask3 = mask2 + 2;
3210 
3211  for (loop_cnt = 4; loop_cnt--;) {
3212  LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
3213  src0_ptr += (4 * src_stride);
3214  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3215  LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
3216  src1_ptr += (4 * src2_stride);
3217  ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
3218  XORI_B4_128_SB(src0, src1, src2, src3);
3219 
3220  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3221  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3222  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3223  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3224  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3225  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3226  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3227  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3228  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3229  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3230  VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3231  dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3232 
3233  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3234  in0, in1, in2, in3,
3235  weight_vec, rnd_vec, offset_vec,
3236  dst0, dst1, dst2, dst3);
3237  HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
3238  weight_vec, rnd_vec, offset_vec,
3239  dst4, dst5);
3240 
3241  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3242  dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3243  ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
3244  ST_W4(dst3, 0, 1, 2, 3, dst + 8, dst_stride);
3245  dst += (4 * dst_stride);
3246  }
3247 }
3248 
3249 static void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr,
3250  int32_t src_stride,
3251  int16_t *src1_ptr,
3252  int32_t src2_stride,
3253  uint8_t *dst,
3254  int32_t dst_stride,
3255  const int8_t *filter,
3256  int32_t height,
3257  int32_t weight0,
3258  int32_t weight1,
3259  int32_t offset0,
3260  int32_t offset1,
3261  int32_t rnd_val)
3262 {
3263  uint32_t loop_cnt;
3264  int32_t offset, weight, constant;
3265  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
3266  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3267  v8i16 filt0, filt1;
3268  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3269  v16i8 mask1;
3270  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3271  v16i8 vec0, vec1;
3272  v8i16 filter_vec;
3273  v4i32 weight_vec, offset_vec, rnd_vec;
3274 
3275  src0_ptr -= 1;
3276 
3277  filter_vec = LD_SH(filter);
3278  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3279 
3280  offset = (offset0 + offset1) << rnd_val;
3281  weight0 = weight0 & 0x0000FFFF;
3282  weight = weight0 | (weight1 << 16);
3283  constant = 128 * weight1;
3284  constant <<= 6;
3285  offset += constant;
3286 
3287  offset_vec = __msa_fill_w(offset);
3288  weight_vec = __msa_fill_w(weight);
3289  rnd_vec = __msa_fill_w(rnd_val + 1);
3290 
3291  mask1 = mask0 + 2;
3292 
3293  for (loop_cnt = (height >> 2); loop_cnt--;) {
3294  LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
3295  LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
3296  src0_ptr += (4 * src_stride);
3297  LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
3298  LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
3299  src1_ptr += (4 * src2_stride);
3300  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
3301 
3302  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3303  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3304  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3305  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3306  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3307  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3308  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3309  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3310  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3311  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3312  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3313  dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3314  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3315  dst6 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3316  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3317  dst7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3318  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3319  in0, in1, in2, in3,
3320  weight_vec, rnd_vec, offset_vec,
3321  dst0, dst1, dst2, dst3);
3322 
3323  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3324  ST_SH2(dst0, dst1, dst, dst_stride);
3325  dst += (2 * dst_stride);
3326 
3327  HEVC_BIW_RND_CLIP4(dst4, dst5, dst6, dst7,
3328  in4, in5, in6, in7,
3329  weight_vec, rnd_vec, offset_vec,
3330  dst0, dst1, dst2, dst3);
3331 
3332  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3333  ST_SH2(dst0, dst1, dst, dst_stride);
3334  dst += (2 * dst_stride);
3335  }
3336 }
3337 
3338 static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr,
3339  int32_t src_stride,
3340  int16_t *src1_ptr,
3341  int32_t src2_stride,
3342  uint8_t *dst,
3343  int32_t dst_stride,
3344  const int8_t *filter,
3345  int32_t height,
3346  int32_t weight0,
3347  int32_t weight1,
3348  int32_t offset0,
3349  int32_t offset1,
3350  int32_t rnd_val)
3351 {
3352  uint32_t loop_cnt;
3353  int32_t offset, weight, constant;
3354  v16i8 src0, src1, src2, src3;
3355  v8i16 filt0, filt1;
3356  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3357  v16i8 mask1, mask2, mask3;
3358  v16i8 vec0, vec1;
3359  v8i16 dst0, dst1, dst2, dst3;
3360  v8i16 in0, in1, in2, in3, in4, in5;
3361  v8i16 filter_vec;
3362  v4i32 weight_vec, offset_vec, rnd_vec;
3363 
3364  src0_ptr -= 1;
3365 
3366  filter_vec = LD_SH(filter);
3367  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3368 
3369  offset = (offset0 + offset1) << rnd_val;
3370  weight0 = weight0 & 0x0000FFFF;
3371  weight = weight0 | (weight1 << 16);
3372  constant = 128 * weight1;
3373  constant <<= 6;
3374  offset += constant;
3375 
3376  offset_vec = __msa_fill_w(offset);
3377  weight_vec = __msa_fill_w(weight);
3378  rnd_vec = __msa_fill_w(rnd_val + 1);
3379 
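    /* mask2/mask3 contain indices past 15, which VSHF_B resolves from its
     * second source operand; they cover the output pixels whose taps
     * straddle the 16-byte boundary between the two loaded registers. */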
3380  mask1 = mask0 + 2;
3381  mask2 = mask0 + 8;
3382  mask3 = mask0 + 10;
3383 
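    /* Two rows per iteration with a fixed count of 16: the 24-wide blocks
     * handled here are 32 rows tall. */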
3384  for (loop_cnt = 16; loop_cnt--;) {
3385  LD_SB2(src0_ptr, src_stride, src0, src2);
3386  LD_SB2(src0_ptr + 16, src_stride, src1, src3);
3387  src0_ptr += (2 * src_stride);
3388  LD_SH2(src1_ptr, src2_stride, in0, in2);
3389  LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
3390  LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
3391  src1_ptr += (2 * src2_stride);
3392  XORI_B4_128_SB(src0, src1, src2, src3);
3393 
3394  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3395  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3396  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3397  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3398  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3399  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3400  VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3401  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3402  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3403  in0, in1, in2, in3,
3404  weight_vec, rnd_vec, offset_vec,
3405  dst0, dst1, dst2, dst3);
3406 
3407  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3408  ST_SH2(dst0, dst1, dst, dst_stride);
3409 
3410  /* 8 width */
3411  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3412  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3413  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3414  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3415  HEVC_BIW_RND_CLIP2(dst0, dst1, in4, in5,
3416  weight_vec, rnd_vec, offset_vec,
3417  dst0, dst1);
3418 
3419  dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3420  ST_D2(dst0, 0, 1, (dst + 16), dst_stride);
3421  dst += (2 * dst_stride);
3422  }
3423 }
3424 
3425 static void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr,
3426  int32_t src_stride,
3427  int16_t *src1_ptr,
3428  int32_t src2_stride,
3429  uint8_t *dst,
3430  int32_t dst_stride,
3431  const int8_t *filter,
3432  int32_t height,
3433  int32_t weight0,
3434  int32_t weight1,
3435  int32_t offset0,
3436  int32_t offset1,
3437  int32_t rnd_val)
3438 {
3439  uint32_t loop_cnt;
3440  int32_t offset, weight, constant;
3441  v16i8 src0, src1, src2;
3442  v8i16 filt0, filt1;
3443  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3444  v16i8 mask1, mask2, mask3;
3445  v8i16 dst0, dst1, dst2, dst3;
3446  v16i8 vec0, vec1;
3447  v8i16 in0, in1, in2, in3;
3448  v8i16 filter_vec;
3449  v4i32 weight_vec, offset_vec, rnd_vec;
3450 
3451  src0_ptr -= 1;
3452 
3453  filter_vec = LD_SH(filter);
3454  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3455 
3456  offset = (offset0 + offset1) << rnd_val;
3457  weight0 = weight0 & 0x0000FFFF;
3458  weight = weight0 | (weight1 << 16);
3459  constant = 128 * weight1;
3460  constant <<= 6;
3461  offset += constant;
3462 
3463  offset_vec = __msa_fill_w(offset);
3464  weight_vec = __msa_fill_w(weight);
3465  rnd_vec = __msa_fill_w(rnd_val + 1);
3466 
3467  mask1 = mask0 + 2;
3468  mask2 = mask0 + 8;
3469  mask3 = mask0 + 10;
3470 
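    /* 32 outputs need 35 input bytes (src0_ptr is already shifted left by
     * one).  src2 is loaded from +24, overlapping src1, so that outputs
     * 24..31 fall entirely within one register and the single-register
     * masks can be reused for them. */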
3471  for (loop_cnt = height; loop_cnt--;) {
3472  LD_SB2(src0_ptr, 16, src0, src1);
3473  src2 = LD_SB(src0_ptr + 24);
3474  src0_ptr += src_stride;
3475  LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
3476  src1_ptr += src2_stride;
3477  XORI_B3_128_SB(src0, src1, src2);
3478 
3479  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3480  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3481  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3482  dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3483  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3484  dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3485  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3486  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3487  HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3488  in0, in1, in2, in3,
3489  weight_vec, rnd_vec, offset_vec,
3490  dst0, dst1, dst2, dst3);
3491 
3492  PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3493  ST_SH2(dst0, dst1, dst, 16);
3494  dst += dst_stride;
3495  }
3496 }
3497 
3498 static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
3499  int32_t src_stride,
3500  int16_t *src1_ptr,
3501  int32_t src2_stride,
3502  uint8_t *dst,
3503  int32_t dst_stride,
3504  const int8_t *filter,
3505  int32_t weight0,
3506  int32_t weight1,
3507  int32_t offset0,
3508  int32_t offset1,
3509  int32_t rnd_val)
3510 {
3511  int32_t weight, offset, constant;
3512  v16i8 src0, src1, src2, src3, src4;
3513  v8i16 in0, in1, dst10;
3514  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
3515  v4i32 dst10_r, dst10_l;
3516  v8i16 filt0, filt1;
3517  v8i16 filter_vec, out;
3518  v4i32 weight_vec, offset_vec, rnd_vec;
3519 
3520  src0_ptr -= src_stride;
3521 
3522  offset = (offset0 + offset1) << rnd_val;
3523  weight0 = weight0 & 0x0000FFFF;
3524  weight = weight0 | (weight1 << 16);
3525  constant = 128 * weight1;
3526  constant <<= 6;
3527  offset += constant;
3528 
3529  offset_vec = __msa_fill_w(offset);
3530  weight_vec = __msa_fill_w(weight);
3531  rnd_vec = __msa_fill_w(rnd_val + 1);
3532 
3533  filter_vec = LD_SH(filter);
3534  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3535 
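    /* For 4-wide columns two rows' right-interleaved byte pairs are packed
     * into a single register (src2110, src4332), so each 4-tap call
     * filters two output rows at once. */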
3536  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3537  src0_ptr += (3 * src_stride);
3538  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3539  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3540  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3541  LD_SB2(src0_ptr, src_stride, src3, src4);
3542  src0_ptr += (2 * src_stride);
3543  LD_SH2(src1_ptr, src2_stride, in0, in1);
3544  src1_ptr += (2 * src2_stride);
3545 
3546  in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
3547  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3548  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
3549  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
3550 
3551  dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3552 
3553  ILVRL_H2_SW(dst10, in0, dst10_r, dst10_l);
3554  dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
3555  dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
3556  SRAR_W2_SW(dst10_r, dst10_l, rnd_vec);
3557  out = __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
3558  CLIP_SH_0_255(out);
3559  out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out);
3560  ST_W2(out, 0, 1, dst, dst_stride);
3561 }
3562 
3563 static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
3564  int32_t src_stride,
3565  int16_t *src1_ptr,
3566  int32_t src2_stride,
3567  uint8_t *dst,
3568  int32_t dst_stride,
3569  const int8_t *filter,
3570  int32_t weight0,
3571  int32_t weight1,
3572  int32_t offset0,
3573  int32_t offset1,
3574  int32_t rnd_val)
3575 {
3576  int32_t weight, offset, constant;
3577  v16i8 src0, src1, src2, src3, src4, src5, src6;
3578  v8i16 in0, in1, in2, in3;
3579  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3580  v16i8 src2110, src4332, src6554;
3581  v8i16 dst10, dst32;
3582  v8i16 filt0, filt1;
3583  v8i16 filter_vec;
3584  v4i32 weight_vec, offset_vec, rnd_vec;
3585 
3586  src0_ptr -= src_stride;
3587 
3588  offset = (offset0 + offset1) << rnd_val;
3589  weight0 = weight0 & 0x0000FFFF;
3590  weight = weight0 | (weight1 << 16);
3591  constant = 128 * weight1;
3592  constant <<= 6;
3593  offset += constant;
3594 
3595  offset_vec = __msa_fill_w(offset);
3596  weight_vec = __msa_fill_w(weight);
3597  rnd_vec = __msa_fill_w(rnd_val + 1);
3598 
3599  filter_vec = LD_SH(filter);
3600  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3601 
3602  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3603  src0_ptr += (3 * src_stride);
3604  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3605  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3606  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3607 
3608  LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3609  src0_ptr += (4 * src_stride);
3610  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3611  src1_ptr += (4 * src2_stride);
3612  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3613  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3614  src32_r, src43_r, src54_r, src65_r);
3615  ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
3616  XORI_B2_128_SB(src4332, src6554);
3617 
3618  dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3619  dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3620 
3621  HEVC_BIW_RND_CLIP2(dst10, dst32, in0, in1,
3622  weight_vec, rnd_vec, offset_vec,
3623  dst10, dst32);
3624 
3625  dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
3626  ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
3627  dst += (4 * dst_stride);
3628 }
3629 
3630 static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
3631  int32_t src_stride,
3632  int16_t *src1_ptr,
3633  int32_t src2_stride,
3634  uint8_t *dst,
3635  int32_t dst_stride,
3636  const int8_t *filter,
3637  int32_t height,
3638  int32_t weight0,
3639  int32_t weight1,
3640  int32_t offset0,
3641  int32_t offset1,
3642  int32_t rnd_val)
3643 {
3644  uint32_t loop_cnt;
3645  int32_t weight, offset, constant;
3646  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
3647  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3648  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3649  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3650  v16i8 src2110, src4332, src6554, src8776;
3651  v8i16 dst10, dst32, dst54, dst76;
3652  v8i16 filt0, filt1;
3653  v8i16 filter_vec;
3654  v4i32 weight_vec, offset_vec, rnd_vec;
3655 
3656  src0_ptr -= src_stride;
3657 
3658  offset = (offset0 + offset1) << rnd_val;
3659  weight0 = weight0 & 0x0000FFFF;
3660  weight = weight0 | (weight1 << 16);
3661  constant = 128 * weight1;
3662  constant <<= 6;
3663  offset += constant;
3664 
3665  offset_vec = __msa_fill_w(offset);
3666  weight_vec = __msa_fill_w(weight);
3667  rnd_vec = __msa_fill_w(rnd_val + 1);
3668 
3669  filter_vec = LD_SH(filter);
3670  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3671 
3672  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3673  src0_ptr += (3 * src_stride);
3674  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3675  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3676  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3677 
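    /* Eight rows per iteration: six rows are loaded up front, the last two
     * after the first three filter calls; src2110 carries the two newest
     * interleaved rows across iterations as the vertical filter history. */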
3678  for (loop_cnt = (height >> 3); loop_cnt--;) {
3679  LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3680  src0_ptr += (6 * src_stride);
3681  LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3682  src1_ptr += (8 * src2_stride);
3683 
3684  ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3685  ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
3686 
3687  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3688  src32_r, src43_r, src54_r, src65_r);
3689  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3690  ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3691  src4332, src6554, src8776);
3692  XORI_B3_128_SB(src4332, src6554, src8776);
3693 
3694  dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3695  dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3696  dst54 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
3697 
3698  LD_SB2(src0_ptr, src_stride, src9, src2);
3699  src0_ptr += (2 * src_stride);
3700  ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3701  src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3702  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3703 
3704  dst76 = HEVC_FILT_4TAP_SH(src8776, src2110, filt0, filt1);
3705  HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
3706  in0, in1, in2, in3,
3707  weight_vec, rnd_vec, offset_vec,
3708  dst10, dst32, dst54, dst76);
3709 
3710  PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
3711  ST_W8(dst10, dst32, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3712  dst += (8 * dst_stride);
3713  }
3714 }
3715 
3716 static void hevc_vt_biwgt_4t_4w_msa(uint8_t *src0_ptr,
3717  int32_t src_stride,
3718  int16_t *src1_ptr,
3719  int32_t src2_stride,
3720  uint8_t *dst,
3721  int32_t dst_stride,
3722  const int8_t *filter,
3723  int32_t height,
3724  int32_t weight0,
3725  int32_t weight1,
3726  int32_t offset0,
3727  int32_t offset1,
3728  int32_t rnd_val)
3729 {
3730  if (2 == height) {
3731  hevc_vt_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3732  dst, dst_stride, filter,
3733  weight0, weight1, offset0, offset1, rnd_val);
3734  } else if (4 == height) {
3735  hevc_vt_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3736  dst, dst_stride, filter,
3737  weight0, weight1, offset0, offset1, rnd_val);
3738  } else if (0 == (height % 8)) {
3739  hevc_vt_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
3740  src1_ptr, src2_stride,
3741  dst, dst_stride, filter, height,
3742  weight0, weight1, offset0, offset1,
3743  rnd_val);
3744  }
3745 }
3746 
3747 static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr,
3748  int32_t src_stride,
3749  int16_t *src1_ptr,
3750  int32_t src2_stride,
3751  uint8_t *dst,
3752  int32_t dst_stride,
3753  const int8_t *filter,
3754  int32_t height,
3755  int32_t weight0,
3756  int32_t weight1,
3757  int32_t offset0,
3758  int32_t offset1,
3759  int32_t rnd_val)
3760 {
3761  uint32_t loop_cnt;
3762  int32_t offset, weight, constant;
3763  v16i8 src0, src1, src2, src3, src4;
3764  v8i16 in0, in1, in2, in3;
3765  v16i8 src10_r, src32_r, src21_r, src43_r;
3766  v8i16 tmp0, tmp1, tmp2, tmp3;
3767  v8i16 filt0, filt1;
3768  v8i16 filter_vec;
3769  v4i32 weight_vec, offset_vec, rnd_vec;
3770 
3771  src0_ptr -= src_stride;
3772 
3773  offset = (offset0 + offset1) << rnd_val;
3774  weight0 = weight0 & 0x0000FFFF;
3775  weight = weight0 | (weight1 << 16);
3776  constant = 128 * weight1;
3777  constant <<= 6;
3778  offset += constant;
3779 
3780  offset_vec = __msa_fill_w(offset);
3781  weight_vec = __msa_fill_w(weight);
3782  rnd_vec = __msa_fill_w(rnd_val + 1);
3783 
3784  filter_vec = LD_SH(filter);
3785  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3786 
3787  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3788  src0_ptr += (3 * src_stride);
3789  XORI_B3_128_SB(src0, src1, src2);
3790  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3791 
3792  for (loop_cnt = (height >> 2); loop_cnt--;) {
3793  LD_SB2(src0_ptr, src_stride, src3, src4);
3794  src0_ptr += (2 * src_stride);
3795  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3796  src1_ptr += (4 * src2_stride);
3797  XORI_B2_128_SB(src3, src4);
3798  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3799 
3800  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3801  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3802 
3803  LD_SB2(src0_ptr, src_stride, src1, src2);
3804  src0_ptr += (2 * src_stride);
3805  XORI_B2_128_SB(src1, src2);
3806  ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3807 
3808  tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
3809  tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
3810  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3811  in0, in1, in2, in3,
3812  weight_vec, rnd_vec, offset_vec,
3813  tmp0, tmp1, tmp2, tmp3);
3814 
3815  PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
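    /* 6-wide store: one 4-byte word at columns 0..3 plus a 2-byte halfword
     * at columns 4..5 per row. */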
3816  ST_W2(tmp0, 0, 2, dst, dst_stride);
3817  ST_H2(tmp0, 2, 6, dst + 4, dst_stride);
3818  ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
3819  ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3820  dst += (4 * dst_stride);
3821  }
3822 }
3823 
3824 static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
3825  int32_t src_stride,
3826  int16_t *src1_ptr,
3827  int32_t src2_stride,
3828  uint8_t *dst,
3829  int32_t dst_stride,
3830  const int8_t *filter,
3831  int32_t weight0,
3832  int32_t weight1,
3833  int32_t offset0,
3834  int32_t offset1,
3835  int32_t rnd_val)
3836 {
3837  int32_t offset, weight, constant;
3838  v16i8 src0, src1, src2, src3, src4;
3839  v8i16 in0, in1, tmp0, tmp1;
3840  v16i8 src10_r, src32_r, src21_r, src43_r;
3841  v8i16 filt0, filt1;
3842  v8i16 filter_vec;
3843  v4i32 weight_vec, offset_vec, rnd_vec;
3844 
3845  src0_ptr -= src_stride;
3846 
3847  offset = (offset0 + offset1) << rnd_val;
3848  weight0 = weight0 & 0x0000FFFF;
3849  weight = weight0 | (weight1 << 16);
3850  constant = 128 * weight1;
3851  constant <<= 6;
3852  offset += constant;
3853 
3854  offset_vec = __msa_fill_w(offset);
3855  weight_vec = __msa_fill_w(weight);
3856  rnd_vec = __msa_fill_w(rnd_val + 1);
3857 
3858  filter_vec = LD_SH(filter);
3859  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3860 
3861  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3862  src0_ptr += (3 * src_stride);
3863  XORI_B3_128_SB(src0, src1, src2);
3864  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3865 
3866  LD_SB2(src0_ptr, src_stride, src3, src4);
3867  LD_SH2(src1_ptr, src2_stride, in0, in1);
3868  XORI_B2_128_SB(src3, src4);
3869  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3870 
3871  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3872  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3873  HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
3874  weight_vec, rnd_vec, offset_vec,
3875  tmp0, tmp1);
3876 
3877  tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3878  ST_D2(tmp0, 0, 1, dst, dst_stride);
3879 }
3880 
3881 static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
3882  int32_t src_stride,
3883  int16_t *src1_ptr,
3884  int32_t src2_stride,
3885  uint8_t *dst,
3886  int32_t dst_stride,
3887  const int8_t *filter,
3888  int32_t weight0,
3889  int32_t weight1,
3890  int32_t offset0,
3891  int32_t offset1,
3892  int32_t rnd_val)
3893 {
3894  int32_t offset, weight, constant;
3895  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3896  v8i16 in0, in1, in2, in3, in4, in5;
3897  v16i8 src10_r, src32_r, src54_r, src76_r;
3898  v16i8 src21_r, src43_r, src65_r, src87_r;
3899  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3900  v8i16 filt0, filt1;
3901  v8i16 filter_vec;
3902  v4i32 weight_vec, offset_vec, rnd_vec;
3903 
3904  src0_ptr -= src_stride;
3905 
3906  offset = (offset0 + offset1) << rnd_val;
3907  weight0 = weight0 & 0x0000FFFF;
3908  weight = weight0 | (weight1 << 16);
3909  constant = 128 * weight1;
3910  constant <<= 6;
3911  offset += constant;
3912 
3913  offset_vec = __msa_fill_w(offset);
3914  weight_vec = __msa_fill_w(weight);
3915  rnd_vec = __msa_fill_w(rnd_val + 1);
3916 
3917  filter_vec = LD_SH(filter);
3918  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3919 
3920  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3921  src0_ptr += (3 * src_stride);
3922  XORI_B3_128_SB(src0, src1, src2);
3923  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3924 
3925  LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3926  LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
3927  XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
3928  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3929  src32_r, src43_r, src54_r, src65_r);
3930  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3931 
3932  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3933  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3934  tmp2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3935  tmp3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3936  tmp4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
3937  tmp5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
3938  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3939  in0, in1, in2, in3,
3940  weight_vec, rnd_vec, offset_vec,
3941  tmp0, tmp1, tmp2, tmp3);
3942  HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
3943  weight_vec, rnd_vec, offset_vec,
3944  tmp4, tmp5);
3945 
3946  PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
3947  tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
3948  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
3949  ST_D2(tmp3, 0, 1, dst + 4 * dst_stride, dst_stride);
3950 }
3951 
3952 static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
3953  int32_t src_stride,
3954  int16_t *src1_ptr,
3955  int32_t src2_stride,
3956  uint8_t *dst,
3957  int32_t dst_stride,
3958  const int8_t *filter,
3959  int32_t height,
3960  int32_t weight0,
3961  int32_t weight1,
3962  int32_t offset0,
3963  int32_t offset1,
3964  int32_t rnd_val)
3965 {
3966  uint32_t loop_cnt;
3967  int32_t offset, weight, constant;
3968  v16i8 src0, src1, src2, src3, src4;
3969  v8i16 in0, in1, in2, in3;
3970  v16i8 src10_r, src32_r, src21_r, src43_r;
3971  v8i16 tmp0, tmp1, tmp2, tmp3;
3972  v8i16 filt0, filt1;
3973  v8i16 filter_vec;
3974  v4i32 weight_vec, offset_vec, rnd_vec;
3975 
3976  src0_ptr -= src_stride;
3977 
3978  offset = (offset0 + offset1) << rnd_val;
3979  weight0 = weight0 & 0x0000FFFF;
3980  weight = weight0 | (weight1 << 16);
3981  constant = 128 * weight1;
3982  constant <<= 6;
3983  offset += constant;
3984 
3985  offset_vec = __msa_fill_w(offset);
3986  weight_vec = __msa_fill_w(weight);
3987  rnd_vec = __msa_fill_w(rnd_val + 1);
3988 
3989  filter_vec = LD_SH(filter);
3990  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3991 
3992  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3993  src0_ptr += (3 * src_stride);
3994  XORI_B3_128_SB(src0, src1, src2);
3995  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3996 
3997  for (loop_cnt = (height >> 2); loop_cnt--;) {
3998  LD_SB2(src0_ptr, src_stride, src3, src4);
3999  src0_ptr += (2 * src_stride);
4000  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4001  src1_ptr += (4 * src2_stride);
4002  XORI_B2_128_SB(src3, src4);
4003  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4004 
4005  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4006  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4007 
4008  LD_SB2(src0_ptr, src_stride, src1, src2);
4009  src0_ptr += (2 * src_stride);
4010  XORI_B2_128_SB(src1, src2);
4011  ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
4012 
4013  tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4014  tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4015  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4016  in0, in1, in2, in3,
4017  weight_vec, rnd_vec, offset_vec,
4018  tmp0, tmp1, tmp2, tmp3);
4019 
4020  PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
4021  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
4022  dst += (4 * dst_stride);
4023  }
4024 }
4025 
4026 static void hevc_vt_biwgt_4t_8w_msa(uint8_t *src0_ptr,
4027  int32_t src_stride,
4028  int16_t *src1_ptr,
4029  int32_t src2_stride,
4030  uint8_t *dst,
4031  int32_t dst_stride,
4032  const int8_t *filter,
4033  int32_t height,
4034  int32_t weight0,
4035  int32_t weight1,
4036  int32_t offset0,
4037  int32_t offset1,
4038  int32_t rnd_val)
4039 {
4040  if (2 == height) {
4041  hevc_vt_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4042  dst, dst_stride, filter,
4043  weight0, weight1, offset0, offset1, rnd_val);
4044  } else if (6 == height) {
4045  hevc_vt_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4046  dst, dst_stride, filter,
4047  weight0, weight1, offset0, offset1, rnd_val);
4048  } else {
4049  hevc_vt_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
4050  src1_ptr, src2_stride,
4051  dst, dst_stride, filter, height,
4052  weight0, weight1, offset0, offset1,
4053  rnd_val);
4054  }
4055 }
4056 
4057 static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr,
4058  int32_t src_stride,
4059  int16_t *src1_ptr,
4060  int32_t src2_stride,
4061  uint8_t *dst,
4062  int32_t dst_stride,
4063  const int8_t *filter,
4064  int32_t height,
4065  int32_t weight0,
4066  int32_t weight1,
4067  int32_t offset0,
4068  int32_t offset1,
4069  int32_t rnd_val)
4070 {
4071  uint32_t loop_cnt;
4072  int32_t offset, weight, constant;
4073  v16i8 src0, src1, src2, src3, src4, src5;
4074  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4075  v16i8 src10_r, src32_r, src21_r, src43_r;
4076  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4077  v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
4078  v16i8 src2110, src4332;
4079  v8i16 filt0, filt1;
4080  v8i16 filter_vec;
4081  v4i32 weight_vec, offset_vec, rnd_vec;
4082 
4083  src0_ptr -= (1 * src_stride);
4084 
4085  offset = (offset0 + offset1) << rnd_val;
4086  weight0 = weight0 & 0x0000FFFF;
4087  weight = weight0 | (weight1 << 16);
4088  constant = 128 * weight1;
4089  constant <<= 6;
4090  offset += constant;
4091 
4092  offset_vec = __msa_fill_w(offset);
4093  weight_vec = __msa_fill_w(weight);
4094  rnd_vec = __msa_fill_w(rnd_val + 1);
4095 
4096  filter_vec = LD_SH(filter);
4097  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4098 
4099  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4100  src0_ptr += (3 * src_stride);
4101  XORI_B3_128_SB(src0, src1, src2);
4102  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4103  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4104  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
4105 
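    /* The 12-wide block is split into an 8-wide part (right interleaves)
     * and a 4-wide part: the left interleaves of two row pairs are packed
     * into one register (src2110, src4332) and filtered together. */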
4106  for (loop_cnt = (height >> 2); loop_cnt--;) {
4107  LD_SB2(src0_ptr, src_stride, src3, src4);
4108  src0_ptr += (2 * src_stride);
4109  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4110  LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
4111  src1_ptr += (4 * src2_stride);
4112  ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
4113  XORI_B2_128_SB(src3, src4);
4114 
4115  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4116  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4117  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
4118 
4119  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4120  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4121  tmp4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
4122 
4123  LD_SB2(src0_ptr, src_stride, src5, src2);
4124  src0_ptr += (2 * src_stride);
4125  XORI_B2_128_SB(src5, src2);
4126  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4127  ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
4128  src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
4129 
4130  tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4131  tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4132  tmp5 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
4133  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4134  in0, in1, in2, in3,
4135  weight_vec, rnd_vec, offset_vec,
4136  tmp0, tmp1, tmp2, tmp3);
4137  HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
4138  weight_vec, rnd_vec, offset_vec,
4139  tmp4, tmp5);
4140 
4141  PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
4142  tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4143  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
4144  ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
4145  dst += (4 * dst_stride);
4146  }
4147 }
4148 
4149 static void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr,
4150  int32_t src_stride,
4151  int16_t *src1_ptr,
4152  int32_t src2_stride,
4153  uint8_t *dst,
4154  int32_t dst_stride,
4155  const int8_t *filter,
4156  int32_t height,
4157  int32_t weight0,
4158  int32_t weight1,
4159  int32_t offset0,
4160  int32_t offset1,
4161  int32_t rnd_val)
4162 {
4163  uint32_t loop_cnt;
4164  int32_t offset, weight, constant;
4165  v16i8 src0, src1, src2, src3, src4, src5;
4166  v8i16 in0, in1, in2, in3;
4167  v16i8 src10_r, src32_r, src21_r, src43_r;
4168  v16i8 src10_l, src32_l, src21_l, src43_l;
4169  v8i16 tmp0, tmp1, tmp2, tmp3;
4170  v8i16 filt0, filt1;
4171  v8i16 filter_vec;
4172  v4i32 weight_vec, offset_vec, rnd_vec;
4173 
4174  src0_ptr -= src_stride;
4175 
4176  offset = (offset0 + offset1) << rnd_val;
4177  weight0 = weight0 & 0x0000FFFF;
4178  weight = weight0 | (weight1 << 16);
4179  constant = 128 * weight1;
4180  constant <<= 6;
4181  offset += constant;
4182 
4183  offset_vec = __msa_fill_w(offset);
4184  weight_vec = __msa_fill_w(weight);
4185  rnd_vec = __msa_fill_w(rnd_val + 1);
4186 
4187  filter_vec = LD_SH(filter);
4188  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4189 
4190  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4191  src0_ptr += (3 * src_stride);
4192  XORI_B3_128_SB(src0, src1, src2);
4193  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4194  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4195 
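    /* Four rows per iteration, processed as two row pairs; the interleaved
     * row history (src10_r/src21_r and their _l counterparts) and the last
     * source row rotate between the two halves of the loop body. */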
4196  for (loop_cnt = (height >> 2); loop_cnt--;) {
4197  LD_SB2(src0_ptr, src_stride, src3, src4);
4198  src0_ptr += (2 * src_stride);
4199  LD_SH2(src1_ptr, src2_stride, in0, in1);
4200  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4201  src1_ptr += (2 * src2_stride);
4202  XORI_B2_128_SB(src3, src4);
4203  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4204  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4205 
4206  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4207  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4208  tmp2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4209  tmp3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4210 
4211  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4212  in0, in1, in2, in3,
4213  weight_vec, rnd_vec, offset_vec,
4214  tmp0, tmp1, tmp2, tmp3);
4215  PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
4216  ST_SH2(tmp0, tmp1, dst, dst_stride);
4217  dst += (2 * dst_stride);
4218  LD_SB2(src0_ptr, src_stride, src5, src2);
4219  src0_ptr += (2 * src_stride);
4220 
4221  LD_SH2(src1_ptr, src2_stride, in0, in1);
4222  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4223  src1_ptr += (2 * src2_stride);
4224  XORI_B2_128_SB(src5, src2);
4225  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4226  ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4227 
4228  tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4229  tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4230  tmp2 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
4231  tmp3 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
4232  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4233  in0, in1, in2, in3,
4234  weight_vec, rnd_vec, offset_vec,
4235  tmp0, tmp1, tmp2, tmp3);
4236 
4237  PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
4238  ST_SH2(tmp0, tmp1, dst, dst_stride);
4239  dst += (2 * dst_stride);
4240  }
4241 }
4242 
4243 static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr,
4244  int32_t src_stride,
4245  int16_t *src1_ptr,
4246  int32_t src2_stride,
4247  uint8_t *dst,
4248  int32_t dst_stride,
4249  const int8_t *filter,
4250  int32_t height,
4251  int32_t weight0,
4252  int32_t weight1,
4253  int32_t offset0,
4254  int32_t offset1,
4255  int32_t rnd_val)
4256 {
4257  uint32_t loop_cnt;
4258  int32_t offset, weight, constant;
4259  v16i8 src0, src1, src2, src3, src4, src5;
4260  v16i8 src6, src7, src8, src9, src10, src11;
4261  v8i16 in0, in1, in2, in3, in4, in5;
4262  v16i8 src10_r, src32_r, src76_r, src98_r;
4263  v16i8 src10_l, src32_l, src21_l, src43_l;
4264  v16i8 src21_r, src43_r, src87_r, src109_r;
4265  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4266  v8i16 filt0, filt1;
4267  v8i16 filter_vec;
4268  v4i32 weight_vec, offset_vec, rnd_vec;
4269 
4270  src0_ptr -= src_stride;
4271 
4272  offset = (offset0 + offset1) << rnd_val;
4273  weight0 = weight0 & 0x0000FFFF;
4274  weight = weight0 | (weight1 << 16);
4275  constant = 128 * weight1;
4276  constant <<= 6;
4277  offset += constant;
4278 
4279  offset_vec = __msa_fill_w(offset);
4280  weight_vec = __msa_fill_w(weight);
4281  rnd_vec = __msa_fill_w(rnd_val + 1);
4282 
4283  filter_vec = LD_SH(filter);
4284  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4285 
4286  /* 16width */
4287  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4288  XORI_B3_128_SB(src0, src1, src2);
4289  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4290  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4291  /* 8width */
4292  LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4293  src0_ptr += (3 * src_stride);
4294  XORI_B3_128_SB(src6, src7, src8);
4295  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4296 
4297  for (loop_cnt = (height >> 2); loop_cnt--;) {
4298  /* 16width */
4299  LD_SB2(src0_ptr, src_stride, src3, src4);
4300  LD_SH2(src1_ptr, src2_stride, in0, in1);
4301  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4302  XORI_B2_128_SB(src3, src4);
4303  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4304  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4305 
4306  /* 8width */
4307  LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4308  src0_ptr += (2 * src_stride);
4309  LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4310  src1_ptr += (2 * src2_stride);
4311  XORI_B2_128_SB(src9, src10);
4312  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4313  /* 16width */
4314  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4315  tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4316  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4317  tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4318  /* 8width */
4319  tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
4320  tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
4321  /* 16width */
4322  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4323  in0, in1, in2, in3,
4324  weight_vec, rnd_vec, offset_vec,
4325  tmp0, tmp1, tmp4, tmp5);
4326  /* 8width */
4327  HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
4328  weight_vec, rnd_vec, offset_vec,
4329  tmp2, tmp3);
4330  /* 16width */
4331  PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4332  /* 8width */
4333  tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4334  ST_SH2(tmp0, tmp1, dst, dst_stride);
4335  ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
4336  dst += (2 * dst_stride);
4337 
4338  /* 16width */
4339  LD_SB2(src0_ptr, src_stride, src5, src2);
4340  LD_SH2(src1_ptr, src2_stride, in0, in1);
4341  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4342  XORI_B2_128_SB(src5, src2);
4343  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4344  ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4345  /* 8width */
4346  LD_SB2(src0_ptr + 16, src_stride, src11, src8);
4347  src0_ptr += (2 * src_stride);
4348  LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4349  src1_ptr += (2 * src2_stride);
4350  XORI_B2_128_SB(src11, src8);
4351  ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
4352  /* 16width */
4353  tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4354  tmp4 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
4355  tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4356  tmp5 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
4357  /* 8width */
4358  tmp2 = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
4359  tmp3 = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
4360  /* 16width */
4361  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4362  in0, in1, in2, in3,
4363  weight_vec, rnd_vec, offset_vec,
4364  tmp0, tmp1, tmp4, tmp5);
4365  /* 8width */
4366  HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
4367  weight_vec, rnd_vec, offset_vec,
4368  tmp2, tmp3);
4369  /* 16width */
4370  PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4371 
4372  /* 8width */
4373  tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4374  ST_SH2(tmp0, tmp1, dst, dst_stride);
4375  ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
4376  dst += (2 * dst_stride);
4377  }
4378 }
4379 
4380 static void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr,
4381  int32_t src_stride,
4382  int16_t *src1_ptr,
4383  int32_t src2_stride,
4384  uint8_t *dst,
4385  int32_t dst_stride,
4386  const int8_t *filter,
4387  int32_t height,
4388  int32_t weight0,
4389  int32_t weight1,
4390  int32_t offset0,
4391  int32_t offset1,
4392  int32_t rnd_val)
4393 {
4394  uint32_t loop_cnt;
4395  uint8_t *dst_tmp = dst + 16;
4396  int32_t offset, weight, constant;
4397  v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
4398  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4399  v16i8 src10_r, src32_r, src76_r, src98_r;
4400  v16i8 src21_r, src43_r, src87_r, src109_r;
4401  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4402  v16i8 src10_l, src32_l, src76_l, src98_l;
4403  v16i8 src21_l, src43_l, src87_l, src109_l;
4404  v8i16 filt0, filt1;
4405  v8i16 filter_vec;
4406  v4i32 weight_vec, offset_vec, rnd_vec;
4407 
4408  src0_ptr -= src_stride;
4409 
4410  offset = (offset0 + offset1) << rnd_val;
4411  weight0 = weight0 & 0x0000FFFF;
4412  weight = weight0 | (weight1 << 16);
4413  constant = 128 * weight1;
4414  constant <<= 6;
4415  offset += constant;
4416 
4417  offset_vec = __msa_fill_w(offset);
4418  weight_vec = __msa_fill_w(weight);
4419  rnd_vec = __msa_fill_w(rnd_val + 1);
4420 
4421  filter_vec = LD_SH(filter);
4422  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4423 
4424  /* 16width */
4425  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4426  XORI_B3_128_SB(src0, src1, src2);
4427  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4428  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4429  /* next 16width */
4430  LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4431  src0_ptr += (3 * src_stride);
4432  XORI_B3_128_SB(src6, src7, src8);
4433  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4434  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
4435 
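    /* The left and right 16-column halves are filtered independently:
     * dst walks the left half, dst_tmp (dst + 16) the right half. */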
4436  for (loop_cnt = (height >> 1); loop_cnt--;) {
4437  /* 16width */
4438  LD_SB2(src0_ptr, src_stride, src3, src4);
4439  LD_SH2(src1_ptr, src2_stride, in0, in1);
4440  LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4441  XORI_B2_128_SB(src3, src4);
4442  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4443  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4444 
4445  /* 16width */
4446  tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4447  tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4448  tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4449  tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4450  /* 16width */
4451  HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4452  in0, in1, in2, in3,
4453  weight_vec, rnd_vec, offset_vec,
4454  tmp0, tmp1, tmp4, tmp5);
4455  /* 16width */
4456  PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4457  ST_SH2(tmp0, tmp1, dst, dst_stride);
4458  dst += (2 * dst_stride);
4459 
4460  src10_r = src32_r;
4461  src21_r = src43_r;
4462  src10_l = src32_l;
4463  src21_l = src43_l;
4464  src2 = src4;
4465 
4466  /* next 16width */
4467  LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4468  src0_ptr += (2 * src_stride);
4469  LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4470  LD_SH2(src1_ptr + 24, src2_stride, in6, in7);
4471  src1_ptr += (2 * src2_stride);
4472  XORI_B2_128_SB(src9, src10);
4473  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4474  ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
4475  /* next 16width */
4476  tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
4477  tmp6 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
4478  tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
4479  tmp7 = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
4480  /* next 16width */
4481  HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
4482  in4, in5, in6, in7,
4483  weight_vec, rnd_vec, offset_vec,
4484  tmp2, tmp3, tmp6, tmp7);
4485 
4486  /* next 16width */
4487  PCKEV_B2_SH(tmp6, tmp2, tmp7, tmp3, tmp2, tmp3);
4488  ST_SH2(tmp2, tmp3, dst_tmp, dst_stride);
4489  dst_tmp += (2 * dst_stride);
4490 
4491  src76_r = src98_r;
4492  src87_r = src109_r;
4493  src76_l = src98_l;
4494  src87_l = src109_l;
4495  src8 = src10;
4496  }
4497 }
4498 
4499 static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
4500  int32_t src_stride,
4501  int16_t *src1_ptr,
4502  int32_t src2_stride,
4503  uint8_t *dst,
4504  int32_t dst_stride,
4505  const int8_t *filter_x,
4506  const int8_t *filter_y,
4507  int32_t weight0,
4508  int32_t weight1,
4509  int32_t offset0,
4510  int32_t offset1,
4511  int32_t rnd_val)
4512 {
4513  uint64_t tp0, tp1;
 4514  int32_t offset, weight;
4515  v8i16 in0 = { 0 };
4516  v16u8 out;
4517  v16i8 src0, src1, src2, src3, src4;
4518  v8i16 filt0, filt1;
4519  v8i16 filt_h0, filt_h1;
4520  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4521  v16i8 mask1;
4522  v8i16 filter_vec, tmp, weight_vec;
4523  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4524  v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp0, tmp1;
4525  v4i32 dst0, dst1, offset_vec, rnd_vec, const_vec;
4526 
4527  src0_ptr -= (src_stride + 1);
4528 
4529  filter_vec = LD_SH(filter_x);
4530  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4531 
4532  filter_vec = LD_SH(filter_y);
4533  UNPCK_R_SB_SH(filter_vec, filter_vec);
4534 
4535  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4536 
4537  mask1 = mask0 + 2;
4538 
4539  offset = (offset0 + offset1) << rnd_val;
4540  weight0 = weight0 & 0x0000FFFF;
4541  weight = weight0 | (weight1 << 16);
4542 
4543  const_vec = __msa_fill_w((128 * weight1));
4544  const_vec <<= 6;
4545  offset_vec = __msa_fill_w(offset);
4546  weight_vec = (v8i16) __msa_fill_w(weight);
4547  rnd_vec = __msa_fill_w(rnd_val + 1);
4548  offset_vec += const_vec;
4549 
4550  LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
4551  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4552 
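    /* Horizontal pass: mask0 (second half of ff_hevc_mask_arr) gathers
     * 4-wide windows from two rows per shuffle, so each result holds two
     * filtered rows.  The vertical pass below re-interleaves those rows;
     * its 32-bit sums are shifted right by 6 to return to the same
     * intermediate scale as the src1_ptr reference samples. */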
4553  VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
4554  VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
4555  VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
4556 
4557  dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4558  dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4559  dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4560 
4561  ILVRL_H2_SH(dst31, dst20, dst10, dst32);
4562  ILVRL_H2_SH(dst42, dst31, dst21, dst43);
4563 
4564  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4565  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4566  dst0 >>= 6;
4567  dst1 >>= 6;
4568  dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4569 
4570  LD2(src1_ptr, src2_stride, tp0, tp1);
4571  INSERT_D2_SH(tp0, tp1, in0);
4572 
4573  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4574  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4575  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4576  SRAR_W2_SW(dst0, dst1, rnd_vec);
4577  tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4578  CLIP_SH_0_255(tmp);
4579  out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
4580  ST_W2(out, 0, 1, dst, dst_stride);
4581 }
4582 
4583 static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
4584  int32_t src_stride,
4585  int16_t *src1_ptr,
4586  int32_t src2_stride,
4587  uint8_t *dst,
4588  int32_t dst_stride,
4589  const int8_t *filter_x,
4590  const int8_t *filter_y,
4591  int32_t weight0,
4592  int32_t weight1,
4593  int32_t offset0,
4594  int32_t offset1,
4595  int32_t rnd_val)
4596 {
4597  uint64_t tp0, tp1;
 4598  int32_t offset, weight;
4599  v16u8 out;
4600  v8i16 in0 = { 0 }, in1 = { 0 };
4601  v16i8 src0, src1, src2, src3, src4, src5, src6;
4602  v8i16 filt0, filt1;
4603  v8i16 filt_h0, filt_h1;
4604  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4605  v16i8 mask1;
4606  v8i16 filter_vec, weight_vec;
4607  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4608  v8i16 tmp0, tmp1, tmp2, tmp3;
4609  v8i16 dst30, dst41, dst52, dst63;
4610  v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
4611  v4i32 offset_vec, rnd_vec, const_vec;
4612  v4i32 dst0, dst1, dst2, dst3;
4613 
4614  src0_ptr -= (src_stride + 1);
4615 
4616  filter_vec = LD_SH(filter_x);
4617  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4618 
4619  filter_vec = LD_SH(filter_y);
4620  UNPCK_R_SB_SH(filter_vec, filter_vec);
4621 
4622  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4623 
4624  mask1 = mask0 + 2;
4625 
4626  offset = (offset0 + offset1) << rnd_val;
4627  weight0 = weight0 & 0x0000FFFF;
4628  weight = weight0 | (weight1 << 16);
4629 
4630  const_vec = __msa_fill_w((128 * weight1));
4631  const_vec <<= 6;
4632  offset_vec = __msa_fill_w(offset);
4633  weight_vec = (v8i16) __msa_fill_w(weight);
4634  rnd_vec = __msa_fill_w(rnd_val + 1);
4635  offset_vec += const_vec;
4636 
4637  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
4638  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
4639 
4640  VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
4641  VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
4642  VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
4643  VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
4644 
4645  dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4646  dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4647  dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4648  dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4649 
4650  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
4651  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
4652  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
4653  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4654  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4655  dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
4656  dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
4657  SRA_4V(dst0, dst1, dst2, dst3, 6);
4658  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
4659 
4660  LD2(src1_ptr, src2_stride, tp0, tp1);
4661  INSERT_D2_SH(tp0, tp1, in0);
4662  src1_ptr += (2 * src2_stride);
4663  LD2(src1_ptr, src2_stride, tp0, tp1);
4664  INSERT_D2_SH(tp0, tp1, in1);
4665 
4666  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
4667  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
4668 
4669  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4670  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4671  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4672  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4673  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4674  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4675  CLIP_SH2_0_255(tmp0, tmp1);
4676  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4677  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
4678 }
4679 
4680 static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
4681  int32_t src_stride,
4682  int16_t *src1_ptr,
4683  int32_t src2_stride,
4684  uint8_t *dst,
4685  int32_t dst_stride,
4686  const int8_t *filter_x,
4687  const int8_t *filter_y,
4688  int32_t height,
4689  int32_t weight0,
4690  int32_t weight1,
4691  int32_t offset0,
4692  int32_t offset1,
4693  int32_t rnd_val)
4694 {
4695  uint32_t loop_cnt;
4696  uint64_t tp0, tp1;
 4697  int32_t offset, weight;
4698  v16u8 out0, out1;
4699  v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4700  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4701  v8i16 filt0, filt1;
4702  v8i16 filt_h0, filt_h1;
4703  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4704  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4705  v16i8 mask1;
4706  v8i16 filter_vec, weight_vec;
4707  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4708  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4709  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4710  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4711  v8i16 dst98_r, dst109_r;
4712  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4713  v4i32 offset_vec, rnd_vec, const_vec;
4714 
4715  src0_ptr -= (src_stride + 1);
4716 
4717  filter_vec = LD_SH(filter_x);
4718  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4719 
4720  filter_vec = LD_SH(filter_y);
4721  UNPCK_R_SB_SH(filter_vec, filter_vec);
4722 
4723  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4724 
4725  mask1 = mask0 + 2;
4726 
4727  offset = (offset0 + offset1) << rnd_val;
4728  weight0 = weight0 & 0x0000FFFF;
4729  weight = weight0 | (weight1 << 16);
4730 
4731  const_vec = __msa_fill_w((128 * weight1));
4732  const_vec <<= 6;
4733  offset_vec = __msa_fill_w(offset);
4734  weight_vec = (v8i16) __msa_fill_w(weight);
4735  rnd_vec = __msa_fill_w(rnd_val + 1);
4736  offset_vec += const_vec;
4737 
4738  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4739  src0_ptr += (3 * src_stride);
4740  XORI_B3_128_SB(src0, src1, src2);
4741 
4742  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
4743  VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
4744  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4745  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4746  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
4747  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4748 
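    /* dst10_r/dst21_r hold the interleaved vertical history between
     * iterations; dst22 keeps the newest horizontal result duplicated into
     * both doublewords so it can be re-interleaved with the first rows of
     * the next 8-row batch. */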
4749  for (loop_cnt = height >> 3; loop_cnt--;) {
4750  LD_SB8(src0_ptr, src_stride,
4751  src3, src4, src5, src6, src7, src8, src9, src10);
4752  src0_ptr += (8 * src_stride);
4753  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4754  VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
4755  VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
4756  VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
4757  VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
4758 
4759  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4760  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4761  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4762  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4763 
4764  dst32_r = __msa_ilvr_h(dst73, dst22);
4765  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
4766  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4767  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4768  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4769  dst76_r = __msa_ilvr_h(dst22, dst106);
4770 
4771  LD2(src1_ptr, src2_stride, tp0, tp1);
4772  src1_ptr += 2 * src2_stride;
4773  INSERT_D2_SH(tp0, tp1, in0);
4774  LD2(src1_ptr, src2_stride, tp0, tp1);
4775  src1_ptr += 2 * src2_stride;
4776  INSERT_D2_SH(tp0, tp1, in1);
4777 
4778  LD2(src1_ptr, src2_stride, tp0, tp1);
4779  src1_ptr += 2 * src2_stride;
4780  INSERT_D2_SH(tp0, tp1, in2);
4781  LD2(src1_ptr, src2_stride, tp0, tp1);
4782  src1_ptr += 2 * src2_stride;
4783  INSERT_D2_SH(tp0, tp1, in3);
4784 
4785  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4786  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4787  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4788  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4789  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4790  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4791  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4792  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4793  SRA_4V(dst0, dst1, dst2, dst3, 6);
4794  SRA_4V(dst4, dst5, dst6, dst7, 6);
4795  PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
4796  dst2, dst3);
4797  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4798  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
4799  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
4800  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
4801  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4802  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4803  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4804  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4805  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
4806  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
4807  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
4808  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
4809  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4810  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
4811  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
4812  tmp2, tmp3);
4813  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4814  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4815  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4816  dst += (8 * dst_stride);
4817 
4818  dst10_r = dst98_r;
4819  dst21_r = dst109_r;
4820  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
4821  }
4822 }
4823 
4824 static void hevc_hv_biwgt_4t_4w_msa(uint8_t *src0_ptr,
4825  int32_t src_stride,
4826  int16_t *src1_ptr,
4827  int32_t src2_stride,
4828  uint8_t *dst,
4829  int32_t dst_stride,
4830  const int8_t *filter_x,
4831  const int8_t *filter_y,
4832  int32_t height,
4833  int32_t weight0,
4834  int32_t weight1,
4835  int32_t offset0,
4836  int32_t offset1,
4837  int32_t rnd_val)
4838 {
4839  if (2 == height) {
4840  hevc_hv_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4841  dst, dst_stride, filter_x, filter_y,
4842  weight0, weight1, offset0, offset1, rnd_val);
4843  } else if (4 == height) {
4844  hevc_hv_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4845  dst, dst_stride, filter_x, filter_y,
4846  weight0, weight1, offset0, offset1, rnd_val);
4847  } else if (0 == (height % 8)) {
4848  hevc_hv_biwgt_4t_4multx8mult_msa(src0_ptr, src_stride,
4849  src1_ptr, src2_stride,
4850  dst, dst_stride, filter_x, filter_y,
4851  height, weight0, weight1,
4852  offset0, offset1, rnd_val);
4853  }
4854 }
4855 
4856 static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
4857  int32_t src_stride,
4858  int16_t *src1_ptr,
4859  int32_t src2_stride,
4860  uint8_t *dst,
4861  int32_t dst_stride,
4862  const int8_t *filter_x,
4863  const int8_t *filter_y,
4864  int32_t height,
4865  int32_t weight0,
4866  int32_t weight1,
4867  int32_t offset0,
4868  int32_t offset1,
4869  int32_t rnd_val)
4870 {
4871  uint32_t tpw0, tpw1, tpw2, tpw3;
4872  uint64_t tp0, tp1;
 4873  int32_t offset, weight;
4874  v16u8 out0, out1, out2;
4875  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4876  v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4877  v8i16 in4 = { 0 }, in5 = { 0 };
4878  v8i16 filt0, filt1;
4879  v8i16 filt_h0, filt_h1, filter_vec;
4880  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4881  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4882  v16i8 mask1;
4883  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4884  v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, weight_vec;
4885  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
4886  v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
4887  v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
4888  v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4889  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4890  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4891  v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
4892  v4i32 offset_vec, rnd_vec, const_vec;
4893 
4894  src0_ptr -= (src_stride + 1);
4895 
4896  filter_vec = LD_SH(filter_x);
4897  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4898 
4899  filter_vec = LD_SH(filter_y);
4900  UNPCK_R_SB_SH(filter_vec, filter_vec);
4901 
4902  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4903 
4904  mask1 = mask0 + 2;
4905 
4906  offset = (offset0 + offset1) << rnd_val;
4907  weight0 = weight0 & 0x0000FFFF;
4908  weight = weight0 | (weight1 << 16);
4909 
4910  const_vec = __msa_fill_w((128 * weight1));
4911  const_vec <<= 6;
4912  offset_vec = __msa_fill_w(offset);
4913  weight_vec = (v8i16) __msa_fill_w(weight);
4914  rnd_vec = __msa_fill_w(rnd_val + 1);
4915  offset_vec += const_vec;
4916 
4917  LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4918  src0_ptr += (3 * src_stride);
4919  XORI_B3_128_SB(src0, src1, src2);
4920 
4921  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4922  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4923  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4924  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4925  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4926  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4927 
4928  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
4929  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
4930 
4931  LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
4932  src10);
4933  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4934 
4935  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4936  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4937  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4938  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4939 
4940  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4941  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4942  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4943  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4944 
4945  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4946  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
4947  VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
4948  VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
4949 
4950  dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4951  dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4952  dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4953  dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4954 
4955  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
4956  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
4957  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
4958  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
4959  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
4960  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
4961  ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
4962  ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
4963  PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
4964  PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
4965  dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
4966 
4967  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4968  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4969  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4970  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4971  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4972  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4973  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4974  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4975  dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
4976  dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
4977  dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
4978  dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
4979  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4980  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4981  SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
4982  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0, dst1);
4983  PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst2, dst3);
4984 
4985  LD2(src1_ptr, src2_stride, tp0, tp1);
4986  INSERT_D2_SH(tp0, tp1, in0);
4987  LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
4988  INSERT_D2_SH(tp0, tp1, in1);
4989 
4990  LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
4991  INSERT_D2_SH(tp0, tp1, in2);
4992  LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
4993  INSERT_D2_SH(tp0, tp1, in3);
4994 
4995  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4996  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
4997  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
4998  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
4999  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5000  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5001  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5002  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5003  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5004  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5005  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5006  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5007  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5008  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5009  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
5010  tmp2, tmp3);
5011  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5012  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5013  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5014 
5015  PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);
5016 
5017  LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5018  src1_ptr += (4 * src2_stride);
5019  INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4);
5020  LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5021  INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5);
5022 
5023  ILVRL_H2_SH(dst4, in4, tmp0, tmp1);
5024  ILVRL_H2_SH(dst5, in5, tmp2, tmp3);
5025 
5026  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5027  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5028  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5029  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5030  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5031  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
5032 
5033  CLIP_SH2_0_255(tmp4, tmp5);
5034  out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5035  ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
5036 }
5037 
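/* Illustrative sketch (hypothetical helper, not a libavcodec API): how a
 * 6-pixel-wide row is written without touching bytes 6 and 7, mirroring the
 * ST_W8 (left four columns) plus ST_H8 at dst + 4 (right two columns) split
 * that closes hevc_hv_biwgt_4t_6w_msa above. */
static inline void store_6w_row_sketch(uint8_t *dst, const uint8_t *row8)
{
    int32_t i;

    for (i = 0; i < 4; i++)      /* word-sized store: columns 0..3 */
        dst[i] = row8[i];
    for (i = 4; i < 6; i++)      /* halfword store:   columns 4..5 */
        dst[i] = row8[i];
    /* columns 6..7 of the 8-wide vector result are computed but dropped */
}
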
5038 static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
5039  int32_t src_stride,
5040  int16_t *src1_ptr,
5041  int32_t src2_stride,
5042  uint8_t *dst,
5043  int32_t dst_stride,
5044  const int8_t *filter_x,
5045  const int8_t *filter_y,
5046  int32_t weight0,
5047  int32_t weight1,
5048  int32_t offset0,
5049  int32_t offset1,
5050  int32_t rnd_val)
5051 {
5052  uint32_t offset, weight;
5053  v16u8 out;
5054  v16i8 src0, src1, src2, src3, src4;
5055  v8i16 filt0, filt1;
5056  v8i16 filt_h0, filt_h1;
5057  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5058  v16i8 mask1;
5059  v8i16 filter_vec, weight_vec;
5060  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5061  v8i16 dst0, dst1, dst2, dst3, dst4;
5062  v8i16 in0, in1;
5063  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
5064  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
5065  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
5066  v8i16 tmp0, tmp1, tmp2, tmp3;
5067  v4i32 offset_vec, rnd_vec, const_vec;
5068 
5069  src0_ptr -= (src_stride + 1);
5070 
5071  filter_vec = LD_SH(filter_x);
5072  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5073 
5074  filter_vec = LD_SH(filter_y);
5075  UNPCK_R_SB_SH(filter_vec, filter_vec);
5076 
5077  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5078 
5079  mask1 = mask0 + 2;
5080 
5081  offset = (offset0 + offset1) << rnd_val;
5082  weight0 = weight0 & 0x0000FFFF;
5083  weight = weight0 | (weight1 << 16);
5084 
5085  const_vec = __msa_fill_w((128 * weight1));
5086  const_vec <<= 6;
5087  offset_vec = __msa_fill_w(offset);
5088  weight_vec = (v8i16) __msa_fill_w(weight);
5089  rnd_vec = __msa_fill_w(rnd_val + 1);
5090  offset_vec += const_vec;
5091 
5092  LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
5093  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5094 
5095  LD_SH2(src1_ptr, src2_stride, in0, in1);
5096 
5097  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5098  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5099  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5100  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5101  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5102 
5103  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5104  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5105  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5106  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5107  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
5108 
5109  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
5110  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
5111  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
5112  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
5113  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5114  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5115  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5116  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5117  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5118  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
5119 
5120  ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
5121  ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
5122 
5123  dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5124  dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5125  dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5126  dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5127  SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
5128  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
5129  CLIP_SH2_0_255(tmp0, tmp1);
5130  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
5131  ST_D2(out, 0, 1, dst, dst_stride);
5132 }
5133 
5134 static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr,
5135  int32_t src_stride,
5136  int16_t *src1_ptr,
5137  int32_t src2_stride,
5138  uint8_t *dst,
5139  int32_t dst_stride,
5140  const int8_t *filter_x,
5141  const int8_t *filter_y,
5142  int32_t weight0,
5143  int32_t weight1,
5144  int32_t offset0,
5145  int32_t offset1,
5146  int32_t rnd_val,
5147  int32_t width8mult)
5148 {
5149  uint32_t offset, weight;
5150  uint32_t cnt;
5151  v16u8 out0, out1;
5152  v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
5153  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5154  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, weight_vec;
5155  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5156  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in0, in1, in2, in3;
5157  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5158  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5159  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5160  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5161  v4i32 offset_vec, rnd_vec, const_vec;
5162 
5163  src0_ptr -= (src_stride + 1);
5164 
5165  filter_vec = LD_SH(filter_x);
5166  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5167 
5168  filter_vec = LD_SH(filter_y);
5169  UNPCK_R_SB_SH(filter_vec, filter_vec);
5170 
5171  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5172 
5173  mask0 = LD_SB(ff_hevc_mask_arr);
5174  mask1 = mask0 + 2;
5175 
5176  offset = (offset0 + offset1) << rnd_val;
5177  weight0 = weight0 & 0x0000FFFF;
5178  weight = weight0 | (weight1 << 16);
5179 
5180  const_vec = __msa_fill_w((128 * weight1));
5181  const_vec <<= 6;
5182  offset_vec = __msa_fill_w(offset);
5183  rnd_vec = __msa_fill_w(rnd_val + 1);
5184  offset_vec += const_vec;
5185  weight_vec = (v8i16) __msa_fill_w(weight);
5186 
5187  for (cnt = width8mult; cnt--;) {
5188  LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
5189  src0_ptr += 8;
5190  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
5191 
5192  LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
5193  src1_ptr += 8;
5194 
5195  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5196  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5197  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5198 
5199  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5200  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5201  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5202 
5203  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5204  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5205 
5206  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5207  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5208  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5209  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5210 
5211  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5212  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5213  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5214  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5215 
5216  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5217  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5218  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5219  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5220 
5221  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5222  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5223  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5224  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5225  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5226  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5227  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5228  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5229 
5230  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5231  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5232  PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5233  dst3_r, dst0, dst1, dst2, dst3);
5234 
5235  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5236  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5237  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5238  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5239  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5240  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5241  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5242  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5243  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5244  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5245  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5246  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5247  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5248  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5249  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5250  tmp0, tmp1, tmp2, tmp3);
5251  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5252  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5253  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
5254  dst += 8;
5255  }
5256 }
5257 
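/* Illustrative scalar sketch (hypothetical helper, not a libavcodec API):
 * the width8mult strip pattern.  Wider blocks are processed as independent
 * 8-column strips, with src0_ptr, src1_ptr and dst all advanced by 8 per
 * iteration, as in the cnt loop of hevc_hv_biwgt_4t_8multx4_msa above;
 * pred0/pred1 stand in for the two already-filtered predictions. */
static void bi_weight_8col_strips_sketch(const int16_t *pred0, int32_t stride0,
                                         const int16_t *pred1, int32_t stride1,
                                         uint8_t *dst, int32_t dst_stride,
                                         int32_t height, int32_t width8mult,
                                         int32_t w0, int32_t w1,
                                         int32_t offset, int32_t shift)
{
    int32_t x, y;

    while (width8mult--) {
        for (y = 0; y < height; y++) {
            for (x = 0; x < 8; x++) {
                int32_t val = (pred0[y * stride0 + x] * w0 +
                               pred1[y * stride1 + x] * w1 + offset) >> shift;

                dst[y * dst_stride + x] = val < 0 ? 0 : (val > 255 ? 255 : val);
            }
        }
        pred0 += 8;   /* next 8-column strip of both sources */
        pred1 += 8;
        dst   += 8;
    }
}
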
5258 static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
5259  int32_t src_stride,
5260  int16_t *src1_ptr,
5261  int32_t src2_stride,
5262  uint8_t *dst,
5263  int32_t dst_stride,
5264  const int8_t *filter_x,
5265  const int8_t *filter_y,
5266  int32_t weight0,
5267  int32_t weight1,
5268  int32_t offset0,
5269  int32_t offset1,
5270  int32_t rnd_val)
5271 {
5272  uint32_t offset, weight;
5273  v16u8 out0, out1, out2;
5274  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5275  v8i16 filt0, filt1;
5276  v8i16 filt_h0, filt_h1;
5277  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5278  v16i8 mask1;
5279  v8i16 filter_vec, weight_vec;
5280  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5281  v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
5282  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
5283  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5284  v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
5285  v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
5286  v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
5287  v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
5288  v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
5289  v8i16 in0, in1, in2, in3, in4, in5;
5290  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5291  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5292  v4i32 offset_vec, rnd_vec, const_vec;
5293 
5294  src0_ptr -= (src_stride + 1);
5295 
5296  filter_vec = LD_SH(filter_x);
5297  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5298 
5299  filter_vec = LD_SH(filter_y);
5300  UNPCK_R_SB_SH(filter_vec, filter_vec);
5301 
5302  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5303 
5304  mask1 = mask0 + 2;
5305 
5306  offset = (offset0 + offset1) << rnd_val;
5307  weight0 = weight0 & 0x0000FFFF;
5308  weight = weight0 | (weight1 << 16);
5309 
5310  const_vec = __msa_fill_w((128 * weight1));
5311  const_vec <<= 6;
5312  offset_vec = __msa_fill_w(offset);
5313  weight_vec = (v8i16) __msa_fill_w(weight);
5314  rnd_vec = __msa_fill_w(rnd_val + 1);
5315  offset_vec += const_vec;
5316 
5317  LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
5318  src0_ptr += (5 * src_stride);
5319  LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
5320 
5321  XORI_B5_128_SB(src0, src1, src2, src3, src4);
5322  XORI_B4_128_SB(src5, src6, src7, src8);
5323 
5324  LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
5325 
5326  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5327  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5328  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5329  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5330  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5331  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
5332  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
5333  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
5334  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
5335 
5336  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5337  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5338  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5339  dsth3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5340  dsth4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
5341  dsth5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
5342  dsth6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
5343  dsth7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
5344  dsth8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
5345 
5346  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5347  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5348  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5349  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5350  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5351  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5352  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
5353  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
5354 
5355  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5356  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5357  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5358  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5359  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5360  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5361  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5362  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5363  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5364  dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
5365  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5366  dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
5367 
5368  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5369  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5370  SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
5371  PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
5372  dst0, dst1, dst2, dst3);
5373 
5374  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5375  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5376  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5377  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5378  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5379  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5380  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5381  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5382  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5383  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5384  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5385  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5386  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5387  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5388  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5389  tmp0, tmp1, tmp2, tmp3);
5390  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5391  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5392 
5393  PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1);
5394  ILVRL_H2_SH(dst0, in4, tmp0, tmp1);
5395  ILVRL_H2_SH(dst1, in5, tmp2, tmp3);
5396  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5397  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5398  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5399  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5400  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5401  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
5402  CLIP_SH2_0_255(tmp4, tmp5);
5403  out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5404  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
5405  ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
5406 }
5407 
5408 static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
5409  int32_t src_stride,
5410  int16_t *src1_ptr,
5411  int32_t src2_stride,
5412  uint8_t *dst,
5413  int32_t dst_stride,
5414  const int8_t *filter_x,
5415  const int8_t *filter_y,
5416  int32_t height,
5417  int32_t weight0,
5418  int32_t weight1,
5419  int32_t offset0,
5420  int32_t offset1,
5421  int32_t rnd_val,
5422  int32_t width)
5423 {
5424  uint32_t loop_cnt;
5425  uint32_t cnt;
5426  uint32_t offset, weight;
5427  uint8_t *src0_ptr_tmp;
5428  int16_t *src1_ptr_tmp;
5429  uint8_t *dst_tmp;
5430  v16u8 out0, out1;
5431  v16i8 src0, src1, src2, src3, src4, src5, src6;
5432  v8i16 in0, in1, in2, in3;
5433  v8i16 filt0, filt1;
5434  v8i16 filt_h0, filt_h1;
5435  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5436  v16i8 mask1;
5437  v8i16 filter_vec;
5438  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5439  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5440  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5441  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5442  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5443  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5444  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l, weight_vec;
5445  v4i32 offset_vec, rnd_vec, const_vec;
5446 
5447  src0_ptr -= (src_stride + 1);
5448 
5449  filter_vec = LD_SH(filter_x);
5450  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5451 
5452  filter_vec = LD_SH(filter_y);
5453  UNPCK_R_SB_SH(filter_vec, filter_vec);
5454 
5455  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5456 
5457  mask1 = mask0 + 2;
5458 
5459  offset = (offset0 + offset1) << rnd_val;
5460  weight0 = weight0 & 0x0000FFFF;
5461  weight = weight0 | (weight1 << 16);
5462 
5463  const_vec = __msa_fill_w((128 * weight1));
5464  const_vec <<= 6;
5465  offset_vec = __msa_fill_w(offset);
5466  weight_vec = (v8i16) __msa_fill_w(weight);
5467  rnd_vec = __msa_fill_w(rnd_val + 1);
5468  offset_vec += const_vec;
5469 
5470  for (cnt = width >> 3; cnt--;) {
5471  src0_ptr_tmp = src0_ptr;
5472  src1_ptr_tmp = src1_ptr;
5473  dst_tmp = dst;
5474 
5475  LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
5476  src0_ptr_tmp += (3 * src_stride);
5477  XORI_B3_128_SB(src0, src1, src2);
5478 
5479  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5480  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5481  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5482  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5483  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5484  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5485 
5486  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5487  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5488 
5489  for (loop_cnt = height >> 2; loop_cnt--;) {
5490  LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5491  src0_ptr_tmp += (4 * src_stride);
5492  LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5493  src1_ptr_tmp += (4 * src2_stride);
5494  XORI_B4_128_SB(src3, src4, src5, src6);
5495 
5496  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5497  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5498  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5499  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5500 
5501  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5502  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5503  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5504  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5505 
5506  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5507  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5508  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5509  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5510 
5511  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5512  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5513  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5514  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5515  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5516  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5517  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5518  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5519 
5520  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5521  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5522  PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5523  dst3_r, dst0, dst1, dst2, dst3);
5524  ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5525  ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5526  ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5527  ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5528  dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5529  dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5530  dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5531  dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5532  dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5533  dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5534  dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5535  dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5536  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5537  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5538  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5539  tmp0, tmp1, tmp2, tmp3);
5540  CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
5541  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5542  ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
5543  dst_tmp += (4 * dst_stride);
5544 
5545  dst10_r = dst54_r;
5546  dst10_l = dst54_l;
5547  dst21_r = dst65_r;
5548  dst21_l = dst65_l;
5549  dsth2 = dsth6;
5550  }
5551 
5552  src0_ptr += 8;
5553  dst += 8;
5554  src1_ptr += 8;
5555  }
5556 }
5557 
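/* Illustrative sketch (hypothetical helper, not a libavcodec API): the
 * vertical 4-tap recurrence keeps three rows of history between iterations,
 * which is why the loop above filters only four new rows horizontally and
 * then rotates its state (dst10_* = dst54_*, dst21_* = dst65_*,
 * dsth2 = dsth6).  One value per row is shown for brevity. */
static void vert_4tap_sliding_window_sketch(const int16_t *rows,
                                            int32_t n_rows,
                                            const int8_t *filt_y, int32_t *out)
{
    int16_t r0 = rows[0], r1 = rows[1], r2 = rows[2]; /* carried history */
    int32_t i;

    for (i = 3; i < n_rows; i++) {
        int16_t r3 = rows[i];                         /* the only new row */

        out[i - 3] = r0 * filt_y[0] + r1 * filt_y[1] +
                     r2 * filt_y[2] + r3 * filt_y[3];
        r0 = r1;                                      /* slide the window */
        r1 = r2;
        r2 = r3;
    }
}
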
5558 static void hevc_hv_biwgt_4t_8w_msa(uint8_t *src0_ptr,
5559  int32_t src_stride,
5560  int16_t *src1_ptr,
5561  int32_t src2_stride,
5562  uint8_t *dst,
5563  int32_t dst_stride,
5564  const int8_t *filter_x,
5565  const int8_t *filter_y,
5566  int32_t height,
5567  int32_t weight0,
5568  int32_t weight1,
5569  int32_t offset0,
5570  int32_t offset1,
5571  int32_t rnd_val)
5572 {
5573  if (2 == height) {
5574  hevc_hv_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
5575  dst, dst_stride, filter_x, filter_y,
5576  weight0, weight1, offset0, offset1, rnd_val);
5577  } else if (4 == height) {
5578  hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
5579  src2_stride, dst, dst_stride, filter_x,
5580  filter_y, weight0, weight1, offset0,
5581  offset1, rnd_val, 1);
5582  } else if (6 == height) {
5583  hevc_hv_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
5584  dst, dst_stride, filter_x, filter_y,
5585  weight0, weight1, offset0, offset1, rnd_val);
5586  } else if (0 == (height % 4)) {
5587  hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
5588  src1_ptr, src2_stride,
5589  dst, dst_stride, filter_x, filter_y,
5590  height, weight0,
5591  weight1, offset0, offset1, rnd_val, 8);
5592  }
5593 }
5594 
5595 static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
5596  int32_t src_stride,
5597  int16_t *src1_ptr,
5598  int32_t src2_stride,
5599  uint8_t *dst,
5600  int32_t dst_stride,
5601