/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

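/* hevc_mc_uniw_msa.c: MSA (MIPS SIMD Architecture) implementations of
 * HEVC uni-directional weighted-prediction MC: weighted copy plus 8-tap
 * horizontal, vertical and 2-D (hv) luma interpolation.  Every path ends
 * in the same per-sample operation:
 *
 *     dst = clip_0_255(((sample * weight) >> rnd_val) + offset)
 */
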
static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};
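/* The arrays above are VSHF.B control vectors for the horizontal filters:
 * each mask gathers the overlapping byte pairs consumed by one pair of
 * filter taps.  In the VSHF_B* macros, indices 0..15 select bytes from the
 * first source vector and 16..31 from the second, which is how the
 * 4-width masks pull two 4-pixel rows out of a single shuffle. */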

#define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,  \
                                       out0_h, out1_h)                        \
{                                                                             \
    v4i32 in0_r_m, in0_l_m, in1_r_m, in1_l_m;                                 \
                                                                              \
    ILVRL_H2_SW(in0_h, in0_h, in0_r_m, in0_l_m);                              \
    ILVRL_H2_SW(in1_h, in1_h, in1_r_m, in1_l_m);                              \
    DOTP_SH4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, wgt_w, wgt_w, wgt_w,      \
                wgt_w, in0_r_m, in1_r_m, in0_l_m, in1_l_m);                   \
    SRAR_W4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, rnd_w);                    \
    PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h);          \
    ADDS_SH2_SH(out0_h, offset_h, out1_h, offset_h, out0_h, out1_h);          \
    CLIP_SH2_0_255_MAX_SATU(out0_h, out1_h);                                  \
}
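/* Per lane, the macro above (and its 4-vector wrapper below) computes,
 * in 32-bit intermediate precision:
 *
 *     out = clip_0_255(((in * weight + (1 << (rnd - 1))) >> rnd) + offset)
 *
 * The ILVRL/DOTP pair does the widening multiply: each 16-bit sample is
 * duplicated and dot-multiplied against (weight, 0) packed in a 32-bit
 * lane, yielding in * weight; SRAR is the rounding right shift and ADDS
 * the saturating offset add. */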

#define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w,  \
                                       offset_h, rnd_w, out0_h, out1_h,    \
                                       out2_h, out3_h)                     \
{                                                                          \
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,   \
                                   out0_h, out1_h);                        \
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in2_h, in3_h, wgt_w, offset_h, rnd_w,   \
                                   out2_h, out3_h);                        \
}

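/* Weighted-copy paths (widths 4..64).  Scalar model of what the MSA code
 * below computes per 8-bit sample (the << 6 scales to HEVC's 14-bit
 * intermediate precision for 8-bit content):
 *
 *     int val = (px << 6) * weight;
 *     val     = (val + (1 << (rnd_val - 1))) >> rnd_val;
 *     dst[x]  = av_clip_uint8(val + offset);
 */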
static void hevc_uniwgt_copy_4w_msa(uint8_t *src,
                                    int32_t src_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight,
                                    int32_t offset,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    v16i8 zero = { 0 };
    v16u8 out0, out1;
    v16i8 src0 = { 0 }, src1 = { 0 };
    v8i16 dst0, dst1, dst2, dst3, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    if (2 == height) {
        v4i32 dst0_r, dst0_l;

        LW2(src, src_stride, tp0, tp1);
        INSERT_W2_SB(tp0, tp1, src0);
        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
        dst0 <<= 6;

        ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
        DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
        SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
        dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        dst0 += offset_vec;
        dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
        ST4x2_UB(out0, dst, dst_stride);
    } else if (4 == height) {
        LW4(src, src_stride, tp0, tp1, tp2, tp3);
        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
                                       rnd_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == (height % 8)) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
            LW4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST4x8_UB(out0, out1, dst, dst_stride);
            dst += 8 * dst_stride;
        }
    }
}

static void hevc_uniwgt_copy_6w_msa(uint8_t *src,
                                    int32_t src_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight,
                                    int32_t offset,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src2);
        INSERT_D2_SB(tp2, tp3, src3);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);

        ST6x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
        ST6x4_UB(out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_uniwgt_copy_8w_msa(uint8_t *src,
                                    int32_t src_stride,
                                    uint8_t *dst,
                                    int32_t dst_stride,
                                    int32_t height,
                                    int32_t weight,
                                    int32_t offset,
                                    int32_t rnd_val)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v16i8 zero = { 0 };
    v16u8 out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    if (2 == height) {
        LD2(src, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src0);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
                                       rnd_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST8x2_UB(out0, dst, dst_stride);
    } else if (4 == height) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
    } else if (6 == height) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += 4 * src_stride;
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD2(src, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src2);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
        ST8x2_UB(out2, dst, dst_stride);
    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_D2_SB(tp0, tp1, src0);
            INSERT_D2_SB(tp2, tp3, src1);
            LD4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            INSERT_D2_SB(tp0, tp1, src2);
            INSERT_D2_SB(tp2, tp3, src3);

            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            ILVRL_B2_SH(zero, src2, dst4, dst5);
            ILVRL_B2_SH(zero, src3, dst6, dst7);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            SLLI_4V(dst4, dst5, dst6, dst7, 6);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                           offset_vec, rnd_vec, dst4, dst5,
                                           dst6, dst7);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
            ST8x4_UB(out0, out1, dst, dst_stride);
            dst += (4 * dst_stride);
            ST8x4_UB(out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void hevc_uniwgt_copy_12w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 offset_vec;
    v16i8 zero = { 0 };
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);

        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST12x4_UB(out0, out1, out2, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_uniwgt_copy_16w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_uniwgt_copy_24w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v8i16 dst8, dst9, dst10, dst11;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src4, src5);
        LD_SB4(src + 16, src_stride, src2, src3, src6, src7);
        src += (4 * src_stride);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
        ILVRL_B2_SH(zero, src4, dst6, dst7);
        ILVRL_B2_SH(zero, src5, dst8, dst9);
        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST8x4_UB(out2, out5, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_uniwgt_copy_32w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src1);
        LD_SB2(src + 16, src_stride, src2, src3);
        src += (2 * src_stride);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_UB2(out0, out1, dst, dst_stride);
        ST_UB2(out2, out3, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}

static void hevc_uniwgt_copy_48w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, offset_vec;
    v8i16 dst6, dst7, dst8, dst9, dst10, dst11;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src += src_stride;
        LD_SB3(src, 16, src3, src4, src5);
        src += src_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        ILVRL_B2_SH(zero, src4, dst8, dst9);
        ILVRL_B2_SH(zero, src5, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
        ST_UB2(out3, out4, dst, 16);
        ST_UB(out5, dst + 32);
        dst += dst_stride;
    }
}

static void hevc_uniwgt_copy_64w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3, out4, out5, out6, out7;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v8i16 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_SB4(src, 16, src4, src5, src6, src7);
        src += src_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        ILVRL_B2_SH(zero, src4, dst8, dst9);
        ILVRL_B2_SH(zero, src5, dst10, dst11);
        ILVRL_B2_SH(zero, src6, dst12, dst13);
        ILVRL_B2_SH(zero, src7, dst14, dst15);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        SLLI_4V(dst12, dst13, dst14, dst15, 6);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       dst11);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst12, dst13, dst14, dst15, weight_vec,
                                       offset_vec, rnd_vec, dst12, dst13, dst14,
                                       dst15);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
        PCKEV_B2_UB(dst13, dst12, dst15, dst14, out6, out7);
        ST_UB4(out0, out1, out2, out3, dst, 16);
        dst += dst_stride;
        ST_UB4(out4, out5, out6, out7, dst, 16);
        dst += dst_stride;
    }
}

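/* Horizontal 8-tap weighted paths.  XORI_B*_128 flips the sign bit, i.e.
 * subtracts 128 from every source byte, so the convolution can use
 * signed-byte multiplies.  The HEVC 8-tap luma filter taps sum to 64, so
 * each filtered sample comes out 128 * 64 = 8192 too small, and the
 * weighted sample (8192 * weight) >> rnd_val too small.  The setup code
 * below compensates by folding the constant into the offset:
 *
 *     weight *= 128;                     weight is now 128 * w
 *     rnd_val -= 6;                      the factor 2^6 = 64 absorbed here
 *     offset += weight >> rnd_val;       == (8192 * w) >> original rnd_val
 *
 * done with __msa_srar_h / __msa_adds_s_h (rounded shift, saturating add).
 */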
static void hevc_hz_uniwgt_8t_4w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 mask0, mask1, mask2, mask3, vec11, vec12, vec13, vec14, vec15;
    v8i16 filter_vec, dst01, dst23, dst45, dst67;
    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;
    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst01 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                  filt3);
        dst23 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                  filt3);
        dst45 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                  filt3);
        dst67 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                  filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst01, dst23, dst45, dst67, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST4x8_UB(out0, out1, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}

static void hevc_hz_uniwgt_8t_8w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;
    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

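/* 12 width: the left 8 columns are filtered row-by-row with the 8-width
 * masks (mask0..3); the remaining 4 columns of two consecutive rows are
 * packed into one register via the 4-width masks (mask4..7), so two rows
 * share each shuffle and filter call. */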
static void hevc_hz_uniwgt_8t_12w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 filter_vec;
    v8i16 dst01, dst23, dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;
    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = LD_SB(&ff_hevc_mask_arr[16]);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);
        VSHF_B4_SB(src4, src5, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src6, src7, mask4, mask5, mask6, mask7,
                   vec4, vec5, vec6, vec7);
        dst01 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                  filt3);
        dst23 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                  filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst01, dst23, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST8x4_UB(out0, out1, dst, dst_stride);
        ST4x4_UB(out2, out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void hevc_hz_uniwgt_8t_16w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST_UB2(out0, out1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}

static void hevc_hz_uniwgt_8t_24w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += src_stride;
        LD_SB2(src, 16, src2, src3);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst4, dst3, dst5, dst2, out0, out1, out2);
        ST_UB2(out0, out1, dst, dst_stride);
        ST8x2_UB(out2, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}

static void hevc_hz_uniwgt_8t_32w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 filter_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height >> 1; loop_cnt--;) {
        LD_SB4(src, 8, src0, src1, src2, src3);
        src += src_stride;
        LD_SB4(src, 8, src4, src5, src6, src7);
        src += src_stride;
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst7 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       dst7);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_UB2(out0, out1, dst, 16);
        dst += dst_stride;
        ST_UB2(out2, out3, dst, 16);
        dst += dst_stride;
    }
}

static void hevc_hz_uniwgt_8t_48w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src3 = LD_SB(src + 40);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
    }
}

static void hevc_hz_uniwgt_8t_64w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= 3;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        for (cnt = 2; cnt--;) {
            LD_SB2(src_tmp, 16, src0, src1);
            src2 = LD_SB(src_tmp + 24);
            src_tmp += 32;
            XORI_B3_128_SB(src0, src1, src2);

            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                       vec4, vec5, vec6, vec7);
            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                       vec8, vec9, vec10, vec11);
            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                       vec12, vec13, vec14, vec15);
            dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);
            dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1,
                                     filt2, filt3);
            dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                     filt2, filt3);
            dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                     filt2, filt3);

            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);

            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            ST_UB2(out0, out1, dst_tmp, 16);
            dst_tmp += 32;
        }

        src += src_stride;
        dst += dst_stride;
    }
}

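/* Vertical 8-tap weighted paths.  The taps run down a column, so rows are
 * byte-interleaved (ILVR/ILVL) into (row n, row n+1) pairs that the
 * dot-product based HEVC_FILT_8TAP_SH macro can consume.  At the end of
 * each iteration the oldest interleaved pairs are discarded and the
 * newest kept, so only the rows not yet seen are loaded next time (a
 * sliding window over the height). */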
static void hevc_vt_uniwgt_8t_4w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    int32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 filter_vec, dst01, dst23, dst45, dst67;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (3 * src_stride);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);

    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    ILVR_D3_SB(src21_r, src10_r, src43_r,
               src32_r, src65_r, src54_r, src2110, src4332, src6554);

    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src += (8 * src_stride);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
        dst01 = HEVC_FILT_8TAP_SH(src2110, src4332, src6554, src8776, filt0,
                                  filt1, filt2, filt3);
        dst23 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0,
                                  filt1, filt2, filt3);
        dst45 = HEVC_FILT_8TAP_SH(src6554, src8776, src10998, src12111110,
                                  filt0, filt1, filt2, filt3);
        dst67 = HEVC_FILT_8TAP_SH(src8776, src10998, src12111110, src14131312,
                                  filt0, filt1, filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst01, dst23, dst45, dst67, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST4x8_UB(out0, out1, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}

static void hevc_vt_uniwgt_8t_8w_msa(uint8_t *src,
                                     int32_t src_stride,
                                     uint8_t *dst,
                                     int32_t dst_stride,
                                     const int8_t *filter,
                                     int32_t height,
                                     int32_t weight,
                                     int32_t offset,
                                     int32_t rnd_val)
{
    int32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec;
    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (3 * src_stride);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                 filt1, filt2, filt3);
        dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                 filt1, filt2, filt3);
        dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                 filt1, filt2, filt3);
        dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                 filt1, filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);

        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

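/* 12 width, vertical: right interleaves (ILVR_B*) feed the left 8 columns;
 * the left interleaves are repacked with ILVR_D* so the extra 4 columns of
 * two rows share one vector, mirroring the split used by the horizontal
 * 12-width path. */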
static void hevc_vt_uniwgt_8t_12w_msa(uint8_t *src,
                                      int32_t src_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter,
                                      int32_t height,
                                      int32_t weight,
                                      int32_t offset,
                                      int32_t rnd_val)
{
    int32_t loop_cnt;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 weight_vec_h, offset_vec, denom_vec, filter_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (3 * src_stride);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        XORI_B4_128_SB(src7, src8, src9, src10);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);

        dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                 filt1, filt2, filt3);
        dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                 filt1, filt2, filt3);
        dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                 filt1, filt2, filt3);
        dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                 filt1, filt2, filt3);
        dst4 = HEVC_FILT_8TAP_SH(src2110, src4332, src6554, src8776, filt0,
                                 filt1, filt2, filt3);
        dst5 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0,
                                 filt1, filt2, filt3);

        HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       dst3);
        HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
                                       rnd_vec, dst4, dst5);

        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST8x4_UB(out0, out1, dst, dst_stride);
        ST4x4_UB(out2, out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src2110 = src6554;
        src4332 = src8776;
        src6554 = src10998;
        src6 = src10;
    }
}

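/* Width-multiple-of-16 vertical kernel: the image is processed in column
 * blocks of 16 (weightmul16 = width / 16), four rows per inner iteration,
 * with the 7-row filter history kept in registers for each block. */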
static void hevc_vt_uniwgt_8t_16multx4mult_msa(uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst,
                                               int32_t dst_stride,
                                               const int8_t *filter,
                                               int32_t height,
                                               int32_t weight,
                                               int32_t offset,
                                               int32_t rnd_val,
                                               int32_t weightmul16)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    int32_t loop_cnt, cnt;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v16i8 src98_r, src109_r, src98_l, src109_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    src -= (3 * src_stride);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight *= 128;
    rnd_val -= 6;

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = weightmul16; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            XORI_B4_128_SB(src7, src8, src9, src10);

            ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                       src10_r, src32_r, src54_r, src21_r);
            ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
            ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                       src10_l, src32_l, src54_l, src21_l);
            ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_r, src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_l, src87_l, src98_l, src109_l);

            dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                     filt1, filt2, filt3);
            dst1 = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                     filt1, filt2, filt3);
            dst2 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                     filt1, filt2, filt3);
            dst3 = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                     filt1, filt2, filt3);
            dst4 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                     filt1, filt2, filt3);
            dst5 = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                     filt1, filt2, filt3);
            dst6 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r,
                                     filt0, filt1, filt2, filt3);
            dst7 = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l,
                                     filt0, filt1, filt2, filt3);

            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
                                           offset_vec, rnd_vec, dst0, dst1,
                                           dst2, dst3);
            HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
                                           offset_vec, rnd_vec, dst4, dst5,
                                           dst6, dst7);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
            ST_UB4(out0, out1, out2, out3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            src0 = src4;
            src1 = src5;
            src2 = src6;
            src3 = src7;
            src4 = src8;
            src5 = src9;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}

1674 static void hevc_vt_uniwgt_8t_16w_msa(uint8_t *src,
1675  int32_t src_stride,
1676  uint8_t *dst,
1677  int32_t dst_stride,
1678  const int8_t *filter,
1679  int32_t height,
1680  int32_t weight,
1681  int32_t offset,
1682  int32_t rnd_val)
1683 {
1684  hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1685  filter, height, weight,
1686  offset, rnd_val, 1);
1687 }
1688 
1689 static void hevc_vt_uniwgt_8t_24w_msa(uint8_t *src,
1690  int32_t src_stride,
1691  uint8_t *dst,
1692  int32_t dst_stride,
1693  const int8_t *filter,
1694  int32_t height,
1695  int32_t weight,
1696  int32_t offset,
1697  int32_t rnd_val)
1698 {
1699  hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1700  filter, 32, weight,
1701  offset, rnd_val, 1);
1702 
1703  hevc_vt_uniwgt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
1704  filter, 32, weight, offset, rnd_val);
1705 }
1706 
1707 static void hevc_vt_uniwgt_8t_32w_msa(uint8_t *src,
1708  int32_t src_stride,
1709  uint8_t *dst,
1710  int32_t dst_stride,
1711  const int8_t *filter,
1712  int32_t height,
1713  int32_t weight,
1714  int32_t offset,
1715  int32_t rnd_val)
1716 {
1717  hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1718  filter, height, weight,
1719  offset, rnd_val, 2);
1720 }
1721 
1722 static void hevc_vt_uniwgt_8t_48w_msa(uint8_t *src,
1723  int32_t src_stride,
1724  uint8_t *dst,
1725  int32_t dst_stride,
1726  const int8_t *filter,
1727  int32_t height,
1728  int32_t weight,
1729  int32_t offset,
1730  int32_t rnd_val)
1731 {
1732  hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1733  filter, 64, weight,
1734  offset, rnd_val, 3);
1735 }
1736 
1737 static void hevc_vt_uniwgt_8t_64w_msa(uint8_t *src,
1738  int32_t src_stride,
1739  uint8_t *dst,
1740  int32_t dst_stride,
1741  const int8_t *filter,
1742  int32_t height,
1743  int32_t weight,
1744  int32_t offset,
1745  int32_t rnd_val)
1746 {
1747  hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1748  filter, height, weight,
1749  offset, rnd_val, 4);
1750 }
1751 
1752 static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src,
1753  int32_t src_stride,
1754  uint8_t *dst,
1755  int32_t dst_stride,
1756  const int8_t *filter_x,
1757  const int8_t *filter_y,
1758  int32_t height,
1759  int32_t weight,
1760  int32_t offset,
1761  int32_t rnd_val)
1762 {
1763  uint32_t loop_cnt;
1764  v16u8 out;
1765  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1766  v8i16 filt0, filt1, filt2, filt3;
1767  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1768  v16i8 mask1, mask2, mask3;
1769  v8i16 filter_vec;
1770  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1771  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1772  v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1773  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
1774  v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
1775  v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
1776  v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
1777  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1778 
1779  src -= ((3 * src_stride) + 3);
1780  filter_vec = LD_SH(filter_x);
1781  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1782 
1783  filter_vec = LD_SH(filter_y);
1784  UNPCK_R_SB_SH(filter_vec, filter_vec);
1785 
1786  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1787 
1788  mask1 = mask0 + 2;
1789  mask2 = mask0 + 4;
1790  mask3 = mask0 + 6;
1791 
1792  weight_vec = __msa_fill_w(weight);
1793  offset_vec = __msa_fill_w(offset);
1794  rnd_vec = __msa_fill_w(rnd_val);
1795  denom_vec = rnd_vec - 6;
1796 
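  /* same +/-128 source bias compensation as the 16-bit paths, done in
   * 32-bit lanes: offset += (128 * weight) >> (rnd_val - 6) */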
1797  const_128 = __msa_ldi_w(128);
1798  const_128 *= weight_vec;
1799  offset_vec += __msa_srar_w(const_128, denom_vec);
1800 
1801  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1802  src += (7 * src_stride);
1803  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1804 
1805  /* horizontal pass for rows 0-3, each packed with the row three below */
1806  VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1807  VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1808  VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1809  vec8, vec9, vec10, vec11);
1810  VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1811  vec12, vec13, vec14, vec15);
1812  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1813  filt3);
1814  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1815  filt3);
1816  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1817  filt3);
1818  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1819  filt3);
1820 
1821  ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1822  ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1823  ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1824 
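  /* the 4-width path packs two rows per vector (dst63 = rows 3 and 6);
   * replicate row 6 so it can pair with row 7 on the next iteration */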
1825  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1826 
1827  for (loop_cnt = height >> 2; loop_cnt--;) {
1828  LD_SB4(src, src_stride, src7, src8, src9, src10);
1829  src += (4 * src_stride);
1830  XORI_B4_128_SB(src7, src8, src9, src10);
1831 
1832  VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
1833  vec0, vec1, vec2, vec3);
1834  VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
1835  vec4, vec5, vec6, vec7);
1836  dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1837  filt3);
1838  dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1839  filt3);
1840 
1841  dst76_r = __msa_ilvr_h(dst97, dst66);
1842  ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
1843  dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
1844  dst98_r = __msa_ilvr_h(dst66, dst108);
1845 
1846  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1847  filt_h1, filt_h2, filt_h3);
1848  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1849  filt_h1, filt_h2, filt_h3);
1850  dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1851  filt_h1, filt_h2, filt_h3);
1852  dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1853  filt_h1, filt_h2, filt_h3);
1854 
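  /* drop the 6 bits of gain from the vertical tap sum (coefficients
   * sum to 64) before applying the weight */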
1855  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1856  MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
1857  MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
1858  SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
1859  ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
1860  ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
1861  CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst2_r, dst3_r);
1862  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1863  out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
1864  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1865  dst += (4 * dst_stride);
1866 
1867  dst10_r = dst54_r;
1868  dst32_r = dst76_r;
1869  dst54_r = dst98_r;
1870  dst21_r = dst65_r;
1871  dst43_r = dst87_r;
1872  dst65_r = dst109_r;
1873  dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
1874  }
1875 }
1876 
1877 static void hevc_hv_uniwgt_8t_8multx2mult_msa(uint8_t *src,
1878  int32_t src_stride,
1879  uint8_t *dst,
1880  int32_t dst_stride,
1881  const int8_t *filter_x,
1882  const int8_t *filter_y,
1883  int32_t height,
1884  int32_t weight,
1885  int32_t offset,
1886  int32_t rnd_val,
1887  int32_t width)
1888 {
1889  uint32_t loop_cnt, cnt;
1890  uint8_t *src_tmp;
1891  uint8_t *dst_tmp;
1892  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1893  v8i16 filt0, filt1, filt2, filt3;
1894  v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1895  v16i8 mask1, mask2, mask3;
1896  v8i16 filter_vec;
1897  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1898  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1899  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1900  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1901  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1902  v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1903  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1904  v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1905  v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
1906  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1907 
1908  src -= ((3 * src_stride) + 3);
1909 
1910  weight_vec = __msa_fill_w(weight);
1911  offset_vec = __msa_fill_w(offset);
1912  rnd_vec = __msa_fill_w(rnd_val);
1913  denom_vec = rnd_vec - 6;
1914 
1915  const_128 = __msa_ldi_w(128);
1916  const_128 *= weight_vec;
1917  offset_vec += __msa_srar_w(const_128, denom_vec);
1918 
1919  filter_vec = LD_SH(filter_x);
1920  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1921 
1922  filter_vec = LD_SH(filter_y);
1923  UNPCK_R_SB_SH(filter_vec, filter_vec);
1924  SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1925 
1926  mask1 = mask0 + 2;
1927  mask2 = mask0 + 4;
1928  mask3 = mask0 + 6;
1929 
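  /* process the block in 8-column strips, two output rows per pass */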
1930  for (cnt = width >> 3; cnt--;) {
1931  src_tmp = src;
1932  dst_tmp = dst;
1933 
1934  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1935  src_tmp += (7 * src_stride);
1936  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1937 
1938  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1939  vec0, vec1, vec2, vec3);
1940  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1941  vec4, vec5, vec6, vec7);
1942  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1943  vec8, vec9, vec10, vec11);
1944  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1945  vec12, vec13, vec14, vec15);
1946  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1947  filt3);
1948  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1949  filt3);
1950  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1951  filt3);
1952  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1953  filt2, filt3);
1954 
1955  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1956  vec0, vec1, vec2, vec3);
1957  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1958  vec4, vec5, vec6, vec7);
1959  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1960  vec8, vec9, vec10, vec11);
1961  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1962  filt3);
1963  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1964  filt3);
1965  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1966  filt3);
1967 
1968  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1969  dst10_r, dst32_r, dst54_r, dst21_r);
1970  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1971  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1972  dst10_l, dst32_l, dst54_l, dst21_l);
1973  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1974 
1975  for (loop_cnt = height >> 1; loop_cnt--;) {
1976  LD_SB2(src_tmp, src_stride, src7, src8);
1977  src_tmp += 2 * src_stride;
1978  XORI_B2_128_SB(src7, src8);
1979 
1980  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1981  vec0, vec1, vec2, vec3);
1982  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1983  filt2, filt3);
1984 
1985  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1986  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1987  filt_h0, filt_h1, filt_h2, filt_h3);
1988  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1989  filt_h0, filt_h1, filt_h2, filt_h3);
1990  dst0_r >>= 6;
1991  dst0_l >>= 6;
1992 
1993  /* row 8 */
1994  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1995  vec0, vec1, vec2, vec3);
1996  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1997  filt2, filt3);
1998 
1999  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
2000  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
2001  filt_h0, filt_h1, filt_h2, filt_h3);
2002  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
2003  filt_h0, filt_h1, filt_h2, filt_h3);
2004  dst1_r >>= 6;
2005  dst1_l >>= 6;
2006 
2007  MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
2008  MUL2(dst1_r, weight_vec, dst1_l, weight_vec, dst1_r, dst1_l);
2009  SRAR_W4_SW(dst0_r, dst1_r, dst0_l, dst1_l, rnd_vec);
2010  ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
2011  ADD2(dst1_r, offset_vec, dst1_l, offset_vec, dst1_r, dst1_l);
2012  CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst0_l, dst1_l);
2013 
2014  PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
2015  dst0_r = (v4i32) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
2016  ST8x2_UB(dst0_r, dst_tmp, dst_stride);
2017  dst_tmp += (2 * dst_stride);
2018 
2019  dst10_r = dst32_r;
2020  dst32_r = dst54_r;
2021  dst54_r = dst76_r;
2022  dst10_l = dst32_l;
2023  dst32_l = dst54_l;
2024  dst54_l = dst76_l;
2025  dst21_r = dst43_r;
2026  dst43_r = dst65_r;
2027  dst65_r = dst87_r;
2028  dst21_l = dst43_l;
2029  dst43_l = dst65_l;
2030  dst65_l = dst87_l;
2031  dst6 = dst8;
2032  }
2033 
2034  src += 8;
2035  dst += 8;
2036  }
2037 }
2038 
2039 static void hevc_hv_uniwgt_8t_8w_msa(uint8_t *src,
2040  int32_t src_stride,
2041  uint8_t *dst,
2042  int32_t dst_stride,
2043  const int8_t *filter_x,
2044  const int8_t *filter_y,
2045  int32_t height,
2046  int32_t weight,
2047  int32_t offset,
2048  int32_t rnd_val)
2049 {
2050  hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2051  filter_x, filter_y, height, weight,
2052  offset, rnd_val, 8);
2053 }
2054 
2055 static void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src,
2056  int32_t src_stride,
2057  uint8_t *dst,
2058  int32_t dst_stride,
2059  const int8_t *filter_x,
2060  const int8_t *filter_y,
2061  int32_t height,
2062  int32_t weight,
2063  int32_t offset,
2064  int32_t rnd_val)
2065 {
2066  uint32_t loop_cnt;
2067  uint8_t *src_tmp, *dst_tmp;
2068  v16u8 out;
2069  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2070  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
2071  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2072  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2073  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2074  v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
2075  v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
2076  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst10_l, dst32_l, dst54_l;
2077  v8i16 dst98_r, dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
2078  v8i16 dst76_l, filter_vec;
2079  v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
2080  v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
2081 
2082  src -= ((3 * src_stride) + 3);
2083 
2084  filter_vec = LD_SH(filter_x);
2085  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2086 
2087  filter_vec = LD_SH(filter_y);
2088  UNPCK_R_SB_SH(filter_vec, filter_vec);
2089 
2090  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2091 
2092  weight_vec = __msa_fill_w(weight);
2093  offset_vec = __msa_fill_w(offset);
2094  rnd_vec = __msa_fill_w(rnd_val);
2095  denom_vec = rnd_vec - 6;
2096 
2097  const_128 = __msa_ldi_w(128);
2098  const_128 *= weight_vec;
2099  offset_vec += __msa_srar_w(const_128, denom_vec);
2100 
2101  mask0 = LD_SB(ff_hevc_mask_arr);
2102  mask1 = mask0 + 2;
2103  mask2 = mask0 + 4;
2104  mask3 = mask0 + 6;
2105 
2106  src_tmp = src;
2107  dst_tmp = dst;
2108 
2109  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
2110  src_tmp += (7 * src_stride);
2111  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2112 
2113  /* horizontal pass for rows 0-3 */
2114  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2115  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2116  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2117  vec11);
2118  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
2119  vec15);
2120  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2121  filt3);
2122  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2123  filt3);
2124  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2125  filt3);
2126  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2127  filt2, filt3);
2128  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2129  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2130  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2131  vec11);
2132  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2133  filt3);
2134  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2135  filt3);
2136  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2137  filt3);
2138 
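  /* left 8x16 block: one row per iteration, sliding an 8-row window */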
2139  for (loop_cnt = 16; loop_cnt--;) {
2140  src7 = LD_SB(src_tmp);
2141  src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
2142  src_tmp += src_stride;
2143 
2144  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2145  vec3);
2146  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2147  filt3);
2148  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
2149  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
2150  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
2151  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
2152 
2153  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
2154  filt_h0, filt_h1, filt_h2, filt_h3);
2155  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
2156  filt_h0, filt_h1, filt_h2, filt_h3);
2157  dst0_r >>= 6;
2158  dst0_l >>= 6;
2159 
2160  MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
2161  SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
2162  ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
2163  CLIP_SW2_0_255_MAX_SATU(dst0_r, dst0_l);
2164  dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2165  out = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
2166  ST8x1_UB(out, dst_tmp);
2167  dst_tmp += dst_stride;
2168 
2169  dst0 = dst1;
2170  dst1 = dst2;
2171  dst2 = dst3;
2172  dst3 = dst4;
2173  dst4 = dst5;
2174  dst5 = dst6;
2175  dst6 = dst7;
2176  }
2177 
2178  src += 8;
2179  dst += 8;
2180 
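  /* right 4x16 block: switch to the two-source 4-width masks and emit
   * four rows per iteration */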
2181  mask4 = LD_SB(ff_hevc_mask_arr + 16);
2182  mask5 = mask4 + 2;
2183  mask6 = mask4 + 4;
2184  mask7 = mask4 + 6;
2185 
2186  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
2187  src += (7 * src_stride);
2188  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2189 
2190  VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2191  VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2192  VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
2193  vec11);
2194  VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
2195  vec15);
2196  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2197  filt3);
2198  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2199  filt3);
2200  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2201  filt3);
2202  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2203  filt3);
2204  ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
2205  ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
2206  ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
2207 
2208  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2209 
2210  for (loop_cnt = 4; loop_cnt--;) {
2211  LD_SB4(src, src_stride, src7, src8, src9, src10);
2212  src += (4 * src_stride);
2213  XORI_B4_128_SB(src7, src8, src9, src10);
2214 
2215  VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2216  vec3);
2217  VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2218  vec7);
2219  dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2220  filt3);
2221  dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2222  filt3);
2223 
2224  dst76_r = __msa_ilvr_h(dst97, dst66);
2225  ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
2226  dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2227  dst98_r = __msa_ilvr_h(dst66, dst108);
2228 
2229  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2230  filt_h1, filt_h2, filt_h3);
2231  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
2232  filt_h1, filt_h2, filt_h3);
2233  dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
2234  filt_h1, filt_h2, filt_h3);
2235  dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
2236  filt_h1, filt_h2, filt_h3);
2237 
2238  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2239  MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
2240  MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
2241  SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
2242  ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
2243  ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
2244  CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst2_r, dst3_r);
2245  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
2246  out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
2247  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2248  dst += (4 * dst_stride);
2249 
2250  dst10_r = dst54_r;
2251  dst32_r = dst76_r;
2252  dst54_r = dst98_r;
2253  dst21_r = dst65_r;
2254  dst43_r = dst87_r;
2255  dst65_r = dst109_r;
2256  dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2257  }
2258 }
2259 
2260 static void hevc_hv_uniwgt_8t_16w_msa(uint8_t *src,
2261  int32_t src_stride,
2262  uint8_t *dst,
2263  int32_t dst_stride,
2264  const int8_t *filter_x,
2265  const int8_t *filter_y,
2266  int32_t height,
2267  int32_t weight,
2268  int32_t offset,
2269  int32_t rnd_val)
2270 {
2271  hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2272  filter_x, filter_y, height, weight,
2273  offset, rnd_val, 16);
2274 }
2275 
2276 static void hevc_hv_uniwgt_8t_24w_msa(uint8_t *src,
2277  int32_t src_stride,
2278  uint8_t *dst,
2279  int32_t dst_stride,
2280  const int8_t *filter_x,
2281  const int8_t *filter_y,
2282  int32_t height,
2283  int32_t weight,
2284  int32_t offset,
2285  int32_t rnd_val)
2286 {
2287  hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2288  filter_x, filter_y, height, weight,
2289  offset, rnd_val, 24);
2290 }
2291 
2292 static void hevc_hv_uniwgt_8t_32w_msa(uint8_t *src,
2293  int32_t src_stride,
2294  uint8_t *dst,
2295  int32_t dst_stride,
2296  const int8_t *filter_x,
2297  const int8_t *filter_y,
2298  int32_t height,
2299  int32_t weight,
2300  int32_t offset,
2301  int32_t rnd_val)
2302 {
2303  hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2304  filter_x, filter_y, height, weight,
2305  offset, rnd_val, 32);
2306 }
2307 
2308 static void hevc_hv_uniwgt_8t_48w_msa(uint8_t *src,
2309  int32_t src_stride,
2310  uint8_t *dst,
2311  int32_t dst_stride,
2312  const int8_t *filter_x,
2313  const int8_t *filter_y,
2314  int32_t height,
2315  int32_t weight,
2316  int32_t offset,
2317  int32_t rnd_val)
2318 {
2319  hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2320  filter_x, filter_y, height, weight,
2321  offset, rnd_val, 48);
2322 }
2323 
2324 static void hevc_hv_uniwgt_8t_64w_msa(uint8_t *src,
2325  int32_t src_stride,
2326  uint8_t *dst,
2327  int32_t dst_stride,
2328  const int8_t *filter_x,
2329  const int8_t *filter_y,
2330  int32_t height,
2331  int32_t weight,
2332  int32_t offset,
2333  int32_t rnd_val)
2334 {
2335  hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2336  filter_x, filter_y, height, weight,
2337  offset, rnd_val, 64);
2338 }
2339 
2340 static void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src,
2341  int32_t src_stride,
2342  uint8_t *dst,
2343  int32_t dst_stride,
2344  const int8_t *filter,
2345  int32_t weight,
2346  int32_t offset,
2347  int32_t rnd_val)
2348 {
2349  v16u8 out;
2350  v8i16 filt0, filt1;
2351  v16i8 src0, src1, vec0, vec1;
2352  v16i8 mask1;
2353  v8i16 dst0;
2354  v4i32 dst0_r, dst0_l;
2355  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2356  v4i32 weight_vec, rnd_vec;
2357  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2358 
2359  src -= 1;
2360 
2361  filter_vec = LD_SH(filter);
2362  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2363 
2364  mask1 = mask0 + 2;
2365 
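  /* keep only the low halfword of weight so that fill_w() puts the
   * halfword pair {weight, 0} in each 32-bit lane; the self-interleaved
   * DOTP_SH below then computes plain x * weight per lane */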
2366  weight = weight & 0x0000FFFF;
2367 
2368  weight_vec = __msa_fill_w(weight);
2369  rnd_vec = __msa_fill_w(rnd_val);
2370 
2371  weight *= 128;
2372  rnd_val -= 6;
2373 
2374  weight_vec_h = __msa_fill_h(weight);
2375  offset_vec = __msa_fill_h(offset);
2376  denom_vec = __msa_fill_h(rnd_val);
2377 
2378  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2379  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2380 
2381  LD_SB2(src, src_stride, src0, src1);
2382  XORI_B2_128_SB(src0, src1);
2383 
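  /* 4-width masks gather from two source vectors, so each shuffle/filter
   * pair covers both rows at once */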
2384  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2385  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2386 
2387  ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
2388  DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
2389  SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
2390  dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2391  dst0 = __msa_adds_s_h(dst0, offset_vec);
2392  dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
2393  out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
2394  ST4x2_UB(out, dst, dst_stride);
2395  dst += (4 * dst_stride);
2396 }
2397 
2398 static void hevc_hz_uniwgt_4t_4x4_msa(uint8_t *src,
2399  int32_t src_stride,
2400  uint8_t *dst,
2401  int32_t dst_stride,
2402  const int8_t *filter,
2403  int32_t weight,
2404  int32_t offset,
2405  int32_t rnd_val)
2406 {
2407  v16u8 out;
2408  v8i16 filt0, filt1;
2409  v16i8 src0, src1, src2, src3;
2410  v16i8 mask1, vec0, vec1, vec2, vec3;
2411  v8i16 dst0, dst1;
2412  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2413  v4i32 weight_vec, rnd_vec;
2414  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2415 
2416  src -= 1;
2417 
2418  /* load the filter and splat the two taps */
2419  filter_vec = LD_SH(filter);
2420  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2421 
2422  mask1 = mask0 + 2;
2423 
2424  weight = weight & 0x0000FFFF;
2425 
2426  weight_vec = __msa_fill_w(weight);
2427  rnd_vec = __msa_fill_w(rnd_val);
2428 
2429  weight *= 128;
2430  rnd_val -= 6;
2431 
2432  weight_vec_h = __msa_fill_h(weight);
2433  offset_vec = __msa_fill_h(offset);
2434  denom_vec = __msa_fill_h(rnd_val);
2435 
2436  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2437  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2438 
2439  LD_SB4(src, src_stride, src0, src1, src2, src3);
2440  XORI_B4_128_SB(src0, src1, src2, src3);
2441 
2442  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2443  VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);
2444  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2445  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2446 
2447  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
2448  dst0, dst1);
2449 
2450  out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2451  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2452  dst += (4 * dst_stride);
2453 }
2454 
2455 static void hevc_hz_uniwgt_4t_4x8multiple_msa(uint8_t *src,
2456  int32_t src_stride,
2457  uint8_t *dst,
2458  int32_t dst_stride,
2459  const int8_t *filter,
2460  int32_t height,
2461  int32_t weight,
2462  int32_t offset,
2463  int32_t rnd_val)
2464 {
2465  uint32_t loop_cnt;
2466  v16u8 out0, out1;
2467  v8i16 filt0, filt1;
2468  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2469  v16i8 mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2470  v8i16 dst0, dst1, dst2, dst3;
2471  v8i16 filter_vec;
2472  v8i16 weight_vec_h, offset_vec, denom_vec;
2473  v4i32 weight_vec, rnd_vec;
2474  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2475 
2476  src -= 1;
2477 
2478  filter_vec = LD_SH(filter);
2479  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2480 
2481  weight = weight & 0x0000FFFF;
2482 
2483  weight_vec = __msa_fill_w(weight);
2484  rnd_vec = __msa_fill_w(rnd_val);
2485 
2486  weight *= 128;
2487  rnd_val -= 6;
2488 
2489  weight_vec_h = __msa_fill_h(weight);
2490  offset_vec = __msa_fill_h(offset);
2491  denom_vec = __msa_fill_h(rnd_val);
2492 
2493  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2494  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2495 
2496  mask1 = mask0 + 2;
2497 
2498  for (loop_cnt = (height >> 3); loop_cnt--;) {
2499  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2500  src += (8 * src_stride);
2501 
2502  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2503 
2504  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2505  VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);
2506  VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec4, vec5);
2507  VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec6, vec7);
2508  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2509  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2510  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2511  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2512 
2513  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
2514  weight_vec, offset_vec, rnd_vec,
2515  dst0, dst1, dst2, dst3);
2516 
2517  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
2518  ST4x8_UB(out0, out1, dst, dst_stride);
2519  dst += (8 * dst_stride);
2520  }
2521 }
2522 
2523 static void hevc_hz_uniwgt_4t_4w_msa(uint8_t *src,
2524  int32_t src_stride,
2525  uint8_t *dst,
2526  int32_t dst_stride,
2527  const int8_t *filter,
2528  int32_t height,
2529  int32_t weight,
2530  int32_t offset,
2531  int32_t rnd_val)
2532 {
2533  if (2 == height) {
2534  hevc_hz_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
2535  filter, weight, offset, rnd_val);
2536  } else if (4 == height) {
2537  hevc_hz_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
2538  filter, weight, offset, rnd_val);
2539  } else if (8 == height || 16 == height) {
2540  hevc_hz_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
2541  filter, height, weight,
2542  offset, rnd_val);
2543  }
2544 }
2545 
2546 static void hevc_hz_uniwgt_4t_6w_msa(uint8_t *src,
2547  int32_t src_stride,
2548  uint8_t *dst,
2549  int32_t dst_stride,
2550  const int8_t *filter,
2551  int32_t height,
2552  int32_t weight,
2553  int32_t offset,
2554  int32_t rnd_val)
2555 {
2556  v16u8 out0, out1, out2, out3;
2557  v8i16 filt0, filt1;
2558  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2559  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2560  v16i8 mask1;
2561  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2562  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2563  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2564  v4i32 weight_vec, rnd_vec;
2565 
2566  src -= 1;
2567 
2568  filter_vec = LD_SH(filter);
2569  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2570 
2571  weight = weight & 0x0000FFFF;
2572 
2573  weight_vec = __msa_fill_w(weight);
2574  rnd_vec = __msa_fill_w(rnd_val);
2575 
2576  weight *= 128;
2577  rnd_val -= 6;
2578 
2579  weight_vec_h = __msa_fill_h(weight);
2580  offset_vec = __msa_fill_h(offset);
2581  denom_vec = __msa_fill_h(rnd_val);
2582 
2583  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2584  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2585 
2586  mask1 = mask0 + 2;
2587 
2588  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2589  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2590  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2591  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2592  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2593  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2594  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2595  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2596  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2597  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2598  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2599  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
2600  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
2601  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
2602  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2603  dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2604  dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2605  dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2606 
2607  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
2608  weight_vec, offset_vec, rnd_vec,
2609  dst0, dst1, dst2, dst3);
2610  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
2611  weight_vec, offset_vec, rnd_vec,
2612  dst4, dst5, dst6, dst7);
2613 
2614  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
2615  PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
2616  ST6x4_UB(out0, out1, dst, dst_stride);
2617  dst += (4 * dst_stride);
2618  ST6x4_UB(out2, out3, dst, dst_stride);
2619 }
2620 
2621 static void hevc_hz_uniwgt_4t_8x2_msa(uint8_t *src,
2622  int32_t src_stride,
2623  uint8_t *dst,
2624  int32_t dst_stride,
2625  const int8_t *filter,
2626  int32_t weight,
2627  int32_t offset,
2628  int32_t rnd_val)
2629 {
2630  v16u8 out;
2631  v8i16 filt0, filt1, dst0, dst1;
2632  v16i8 src0, src1;
2633  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2634  v16i8 mask1;
2635  v16i8 vec0, vec1, vec2, vec3;
2636  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2637  v4i32 weight_vec, rnd_vec;
2638 
2639  src -= 1;
2640 
2641  filter_vec = LD_SH(filter);
2642  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2643 
2644  weight = weight & 0x0000FFFF;
2645 
2646  weight_vec = __msa_fill_w(weight);
2647  rnd_vec = __msa_fill_w(rnd_val);
2648 
2649  weight *= 128;
2650  rnd_val -= 6;
2651 
2652  weight_vec_h = __msa_fill_h(weight);
2653  offset_vec = __msa_fill_h(offset);
2654  denom_vec = __msa_fill_h(rnd_val);
2655 
2656  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2657  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2658 
2659  mask1 = mask0 + 2;
2660 
2661  LD_SB2(src, src_stride, src0, src1);
2662  XORI_B2_128_SB(src0, src1);
2663 
2664  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2665  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2666  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2667  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2668 
2669  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
2670  dst0, dst1);
2671 
2672  out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2673  ST8x2_UB(out, dst, dst_stride);
2674 }
2675 
2676 static void hevc_hz_uniwgt_4t_8x4_msa(uint8_t *src,
2677  int32_t src_stride,
2678  uint8_t *dst,
2679  int32_t dst_stride,
2680  const int8_t *filter,
2681  int32_t weight,
2682  int32_t offset,
2683  int32_t rnd_val)
2684 {
2685  v16u8 out0, out1;
2686  v16i8 src0, src1, src2, src3;
2687  v16i8 mask0, mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2688  v8i16 filt0, filt1, dst0, dst1, dst2, dst3;
2689  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2690  v4i32 weight_vec, rnd_vec;
2691 
2692  src -= 1;
2693 
2694  filter_vec = LD_SH(filter);
2695  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2696 
2697  weight = weight & 0x0000FFFF;
2698  weight_vec = __msa_fill_w(weight);
2699  rnd_vec = __msa_fill_w(rnd_val);
2700 
2701  weight *= 128;
2702  rnd_val -= 6;
2703 
2704  weight_vec_h = __msa_fill_h(weight);
2705  offset_vec = __msa_fill_h(offset);
2706  denom_vec = __msa_fill_h(rnd_val);
2707 
2708  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2709  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2710 
2711  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2712  mask1 = mask0 + 2;
2713 
2714  LD_SB4(src, src_stride, src0, src1, src2, src3);
2715  XORI_B4_128_SB(src0, src1, src2, src3);
2716  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2717  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2718  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2719  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2720  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2721  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2722  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2723  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2724 
2725  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
2726  weight_vec, offset_vec, rnd_vec,
2727  dst0, dst1, dst2, dst3);
2728 
2729  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
2730  ST8x4_UB(out0, out1, dst, dst_stride);
2731 }
2732 
2733 static void hevc_hz_uniwgt_4t_8x6_msa(uint8_t *src,
2734  int32_t src_stride,
2735  uint8_t *dst,
2736  int32_t dst_stride,
2737  const int8_t *filter,
2738  int32_t weight,
2739  int32_t offset,
2740  int32_t rnd_val)
2741 {
2742  v16u8 out0, out1, out2;
2743  v8i16 filt0, filt1;
2744  v16i8 src0, src1, src2, src3, src4, src5;
2745  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2746  v16i8 mask1;
2747  v16i8 vec11;
2748  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
2749  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2750  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2751  v4i32 weight_vec, rnd_vec;
2752 
2753  src -= 1;
2754 
2755  filter_vec = LD_SH(filter);
2756  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2757 
2758  weight = weight & 0x0000FFFF;
2759 
2760  weight_vec = __msa_fill_w(weight);
2761  rnd_vec = __msa_fill_w(rnd_val);
2762 
2763  weight *= 128;
2764  rnd_val -= 6;
2765 
2766  weight_vec_h = __msa_fill_h(weight);
2767  offset_vec = __msa_fill_h(offset);
2768  denom_vec = __msa_fill_h(rnd_val);
2769 
2770  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2771  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2772 
2773  mask1 = mask0 + 2;
2774 
2775  LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
2776  XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
2777 
2778  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2779  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2780  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2781  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2782  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
2783  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
2784  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2785  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2786  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2787  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2788  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
2789  dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
2790 
2791  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
2792  weight_vec, offset_vec, rnd_vec,
2793  dst0, dst1, dst2, dst3);
2794 
2795  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, rnd_vec,
2796  dst4, dst5);
2797 
2798  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
2799  ST8x4_UB(out0, out1, dst, dst_stride);
2800  dst += (4 * dst_stride);
2801  ST8x2_UB(out2, dst, dst_stride);
2802 }
2803 
2804 static void hevc_hz_uniwgt_4t_8x8multiple_msa(uint8_t *src,
2805  int32_t src_stride,
2806  uint8_t *dst,
2807  int32_t dst_stride,
2808  const int8_t *filter,
2809  int32_t height,
2810  int32_t weight,
2811  int32_t offset,
2812  int32_t rnd_val)
2813 {
2814  uint32_t loop_cnt;
2815  v8i16 filt0, filt1;
2816  v16u8 out0, out1, out2, out3;
2817  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2818  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2819  v16i8 mask1;
2820  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2821  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2822  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2823  v4i32 weight_vec, rnd_vec;
2824 
2825  src -= 1;
2826 
2827  filter_vec = LD_SH(filter);
2828  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2829 
2830  weight = weight & 0x0000FFFF;
2831 
2832  weight_vec = __msa_fill_w(weight);
2833  rnd_vec = __msa_fill_w(rnd_val);
2834 
2835  weight *= 128;
2836  rnd_val -= 6;
2837 
2838  weight_vec_h = __msa_fill_h(weight);
2839  offset_vec = __msa_fill_h(offset);
2840  denom_vec = __msa_fill_h(rnd_val);
2841 
2842  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2843  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2844 
2845  mask1 = mask0 + 2;
2846 
2847  for (loop_cnt = (height >> 3); loop_cnt--;) {
2848  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2849  src += (8 * src_stride);
2850  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2851 
2852  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2853  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2854  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2855  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2856  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2857  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2858  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2859  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2860  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2861  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
2862  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
2863  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
2864  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2865  dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2866  dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2867  dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2868 
2869  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
2870  weight_vec, offset_vec, rnd_vec,
2871  dst0, dst1, dst2, dst3);
2872 
2873  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
2874  weight_vec, offset_vec, rnd_vec,
2875  dst4, dst5, dst6, dst7);
2876 
2877  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
2878  PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
2879  ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
2880  dst += (8 * dst_stride);
2881  }
2882 }
2883 
2884 static void hevc_hz_uniwgt_4t_8w_msa(uint8_t *src,
2885  int32_t src_stride,
2886  uint8_t *dst,
2887  int32_t dst_stride,
2888  const int8_t *filter,
2889  int32_t height,
2890  int32_t weight,
2891  int32_t offset,
2892  int32_t rnd_val)
2893 {
2894  if (2 == height) {
2895  hevc_hz_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
2896  filter, weight, offset, rnd_val);
2897  } else if (4 == height) {
2898  hevc_hz_uniwgt_4t_8x4_msa(src, src_stride, dst, dst_stride,
2899  filter, weight, offset, rnd_val);
2900  } else if (6 == height) {
2901  hevc_hz_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
2902  filter, weight, offset, rnd_val);
2903  } else {
2904  hevc_hz_uniwgt_4t_8x8multiple_msa(src, src_stride, dst, dst_stride,
2905  filter, height, weight, offset,
2906  rnd_val);
2907  }
2908 }
2909 
2910 static void hevc_hz_uniwgt_4t_12w_msa(uint8_t *src,
2911  int32_t src_stride,
2912  uint8_t *dst,
2913  int32_t dst_stride,
2914  const int8_t *filter,
2915  int32_t height,
2916  int32_t weight,
2917  int32_t offset,
2918  int32_t rnd_val)
2919 {
2920  uint32_t loop_cnt;
2921  v16u8 out0, out1, out2;
2922  v8i16 filt0, filt1;
2923  v16i8 src0, src1, src2, src3;
2924  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2925  v16i8 mask2 = { 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
2926  };
2927  v16i8 mask1;
2928  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
2929  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2930  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2931  v16i8 mask3, vec11;
2932  v4i32 weight_vec, rnd_vec;
2933 
2934  src -= 1;
2935 
2936  filter_vec = LD_SH(filter);
2937  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2938 
2939  weight = weight & 0x0000FFFF;
2940 
2941  weight_vec = __msa_fill_w(weight);
2942  rnd_vec = __msa_fill_w(rnd_val);
2943 
2944  weight *= 128;
2945  rnd_val -= 6;
2946 
2947  weight_vec_h = __msa_fill_h(weight);
2948  offset_vec = __msa_fill_h(offset);
2949  denom_vec = __msa_fill_h(rnd_val);
2950 
2951  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2952  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2953 
2954  mask1 = mask0 + 2;
2955  mask3 = mask2 + 2;
2956 
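  /* mask2/mask3 pull columns 8..11 of a row pair, so vec8..vec11 cover
   * the rightmost third of the 12-wide block */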
2957  for (loop_cnt = 4; loop_cnt--;) {
2958  LD_SB4(src, src_stride, src0, src1, src2, src3);
2959  src += (4 * src_stride);
2960 
2961  XORI_B4_128_SB(src0, src1, src2, src3);
2962 
2963  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2964  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2965  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2966  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2967  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec8, vec9);
2968  VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec10, vec11);
2969  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2970  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2971  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
2972  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
2973  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
2974  dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
2975 
2976  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
2977  weight_vec, offset_vec, rnd_vec,
2978  dst0, dst1, dst2, dst3);
2979 
2980  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
2981  rnd_vec, dst4, dst5);
2982 
2983  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
2984  ST12x4_UB(out0, out1, out2, dst, dst_stride);
2985  dst += (4 * dst_stride);
2986  }
2987 }
2988 
2989 static void hevc_hz_uniwgt_4t_16w_msa(uint8_t *src,
2990  int32_t src_stride,
2991  uint8_t *dst,
2992  int32_t dst_stride,
2993  const int8_t *filter,
2994  int32_t height,
2995  int32_t weight,
2996  int32_t offset,
2997  int32_t rnd_val)
2998 {
2999  uint32_t loop_cnt;
3000  v16u8 out0, out1, out2, out3;
3001  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
3002  v8i16 filt0, filt1;
3003  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3004  v16i8 mask1;
3005  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3006  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3007  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3008  v4i32 weight_vec, rnd_vec;
3009 
3010  src -= 1;
3011 
3012  filter_vec = LD_SH(filter);
3013  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3014 
3015  weight = weight & 0x0000FFFF;
3016 
3017  weight_vec = __msa_fill_w(weight);
3018  rnd_vec = __msa_fill_w(rnd_val);
3019 
3020  weight *= 128;
3021  rnd_val -= 6;
3022 
3023  weight_vec_h = __msa_fill_h(weight);
3024  offset_vec = __msa_fill_h(offset);
3025  denom_vec = __msa_fill_h(rnd_val);
3026 
3027  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3028  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3029 
3030  mask1 = mask0 + 2;
3031 
3032  for (loop_cnt = (height >> 2); loop_cnt--;) {
3033  LD_SB4(src, src_stride, src0, src2, src4, src6);
3034  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3035  src += (4 * src_stride);
3036 
3037  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
3038 
3039  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3040  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3041  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3042  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3043  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3044  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3045  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3046  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3047  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3048  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
3049  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
3050  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
3051  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3052  dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3053  dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3054  dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3055 
3056  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
3057  weight_vec, offset_vec, rnd_vec,
3058  dst0, dst1, dst2, dst3);
3059 
3060  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
3061  weight_vec, offset_vec, rnd_vec,
3062  dst4, dst5, dst6, dst7);
3063 
3064  PCKEV_B4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
3065  out0, out1, out2, out3);
3066 
3067  ST_UB4(out0, out1, out2, out3, dst, dst_stride);
3068  dst += (4 * dst_stride);
3069  }
3070 }
3071 
3072 static void hevc_hz_uniwgt_4t_24w_msa(uint8_t *src,
3073  int32_t src_stride,
3074  uint8_t *dst,
3075  int32_t dst_stride,
3076  const int8_t *filter,
3077  int32_t height,
3078  int32_t weight,
3079  int32_t offset,
3080  int32_t rnd_val)
3081 {
3082  uint32_t loop_cnt;
3083  v16u8 out0, out1, out2;
3084  v16i8 src0, src1, src2, src3;
3085  v8i16 filt0, filt1;
3086  v16i8 mask0, mask1, mask2, mask3;
3087  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3088  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3089  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3090  v4i32 weight_vec, rnd_vec;
3091 
3092  src -= 1;
3093 
3094  filter_vec = LD_SH(filter);
3095  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3096 
3097  weight = weight & 0x0000FFFF;
3098  weight_vec = __msa_fill_w(weight);
3099  rnd_vec = __msa_fill_w(rnd_val);
3100 
3101  weight *= 128;
3102  rnd_val -= 6;
3103 
3104  weight_vec_h = __msa_fill_h(weight);
3105  offset_vec = __msa_fill_h(offset);
3106  denom_vec = __msa_fill_h(rnd_val);
3107 
3108  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3109  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3110 
3111  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3112  mask1 = mask0 + 2;
3113  mask2 = mask0 + 8;
3114  mask3 = mask0 + 10;
3115 
3116  for (loop_cnt = 16; loop_cnt--;) {
3117  LD_SB2(src, src_stride, src0, src2);
3118  LD_SB2(src + 16, src_stride, src1, src3);
3119  src += (2 * src_stride);
3120 
3121  XORI_B4_128_SB(src0, src1, src2, src3);
3122 
3123  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3124  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3);
3125  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3126  VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec6, vec7);
3127  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3128  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3129  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3130  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3131  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3132  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec2, vec3);
3133  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3134  dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3135 
3136  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
3137  weight_vec, offset_vec, rnd_vec,
3138  dst0, dst1, dst2, dst3);
3139 
3140  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
3141  rnd_vec, dst4, dst5);
3142 
3143  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
3144  ST_UB2(out0, out1, dst, dst_stride);
3145  ST8x2_UB(out2, dst + 16, dst_stride);
3146  dst += (2 * dst_stride);
3147  }
3148 }
3149 
3150 static void hevc_hz_uniwgt_4t_32w_msa(uint8_t *src,
3151  int32_t src_stride,
3152  uint8_t *dst,
3153  int32_t dst_stride,
3154  const int8_t *filter,
3155  int32_t height,
3156  int32_t weight,
3157  int32_t offset,
3158  int32_t rnd_val)
3159 {
3160  uint32_t loop_cnt;
3161  v16u8 out0, out1, out2, out3;
3162  v16i8 src0, src1, src2, src3, src4, src5;
3163  v8i16 filt0, filt1;
3164  v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3165  v16i8 mask1, mask2, mask3;
3166  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3167  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3168  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3169  v4i32 weight_vec, rnd_vec;
3170 
3171  src -= 1;
3172 
3173  filter_vec = LD_SH(filter);
3174  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3175 
3176  weight = weight & 0x0000FFFF;
3177 
3178  weight_vec = __msa_fill_w(weight);
3179  rnd_vec = __msa_fill_w(rnd_val);
3180 
3181  weight *= 128;
3182  rnd_val -= 6;
3183 
3184  weight_vec_h = __msa_fill_h(weight);
3185  offset_vec = __msa_fill_h(offset);
3186  denom_vec = __msa_fill_h(rnd_val);
3187 
3188  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3189  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3190 
3191  mask1 = mask0 + 2;
3192  mask2 = mask0 + 8;
3193  mask3 = mask0 + 10;
3194 
3195  for (loop_cnt = (height >> 1); loop_cnt--;) {
3196  LD_SB2(src, 16, src0, src1);
3197  src2 = LD_SB(src + 24);
3198  src += src_stride;
3199  LD_SB2(src, 16, src3, src4);
3200  src5 = LD_SB(src + 24);
3201  src += src_stride;
3202  XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
3203  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3204  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3);
3205  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec4, vec5);
3206  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec6, vec7);
3207  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3208  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3209  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3210  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3211  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3212  VSHF_B2_SB(src3, src4, src3, src4, mask2, mask3, vec2, vec3);
3213  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec4, vec5);
3214  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec6, vec7);
3215  dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3216  dst5 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3217  dst6 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3218  dst7 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3219 
3220  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
3221  weight_vec, offset_vec, rnd_vec,
3222  dst0, dst1, dst2, dst3);
3223 
3224  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
3225  weight_vec, offset_vec, rnd_vec,
3226  dst4, dst5, dst6, dst7);
3227 
3228  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
3229  PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
3230  ST_UB2(out0, out1, dst, 16);
3231  dst += dst_stride;
3232  ST_UB2(out2, out3, dst, 16);
3233  dst += dst_stride;
3234  }
3235 }
3236 
3238  int32_t src_stride,
3239  uint8_t *dst,
3240  int32_t dst_stride,
3241  const int8_t *filter,
3242  int32_t weight,
3243  int32_t offset,
3244  int32_t rnd_val)
3245 {
3246  v16u8 out;
3247  v16i8 src0, src1, src2, src3, src4;
3248  v16i8 src10_r, src32_r, src21_r, src43_r;
3249  v16i8 src2110, src4332;
3250  v8i16 dst0;
3251  v4i32 dst0_r, dst0_l;
3252  v8i16 filt0, filt1;
3253  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3254  v4i32 weight_vec, rnd_vec;
3255 
3256  src -= src_stride;
3257 
3258  weight = weight & 0x0000FFFF;
3259 
3260  weight_vec = __msa_fill_w(weight);
3261  rnd_vec = __msa_fill_w(rnd_val);
3262 
3263  weight *= 128;
3264  rnd_val -= 6;
3265 
3266  weight_vec_h = __msa_fill_h(weight);
3267  offset_vec = __msa_fill_h(offset);
3268  denom_vec = __msa_fill_h(rnd_val);
3269 
3270  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3271  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3272 
3273  filter_vec = LD_SH(filter);
3274  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3275 
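  /* byte-interleave consecutive rows, then pack two interleaves per
   * vector so a single 4-tap filter call covers two output rows */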
3276  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3277  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3278  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3279  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3280  XORI_B2_128_SB(src2110, src4332);
3281  dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3282  ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
3283  DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
3284  SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
3285  dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
3286  dst0 = __msa_adds_s_h(dst0, offset_vec);
3287  dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
3288  out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
3289  ST4x2_UB(out, dst, dst_stride);
3290 }
3291 
3292 static void hevc_vt_uniwgt_4t_4x4_msa(uint8_t *src,
3293  int32_t src_stride,
3294  uint8_t *dst,
3295  int32_t dst_stride,
3296  const int8_t *filter,
3297  int32_t weight,
3298  int32_t offset,
3299  int32_t rnd_val)
3300 {
3301  v16u8 out;
3302  v16i8 src0, src1, src2, src3, src4, src5, src6;
3303  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3304  v16i8 src2110, src4332, src6554;
3305  v8i16 dst0, dst1;
3306  v8i16 filt0, filt1;
3307  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3308  v4i32 weight_vec, rnd_vec;
3309 
3310  src -= src_stride;
3311 
3312  weight = weight & 0x0000FFFF;
3313 
3314  weight_vec = __msa_fill_w(weight);
3315  rnd_vec = __msa_fill_w(rnd_val);
3316 
3317  weight *= 128;
3318  rnd_val -= 6;
3319 
3320  weight_vec_h = __msa_fill_h(weight);
3321  offset_vec = __msa_fill_h(offset);
3322  denom_vec = __msa_fill_h(rnd_val);
3323 
3324  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3325  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3326 
3327  filter_vec = LD_SH(filter);
3328  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3329 
3330  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3331  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3332  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3333  src32_r, src43_r, src54_r, src65_r);
3334  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
3335  src2110, src4332, src6554);
3336  XORI_B3_128_SB(src2110, src4332, src6554);
3337  dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3338  dst1 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3339  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
3340  dst0, dst1);
3341 
3342  out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3343  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
3344 }
3345 
3346 static void hevc_vt_uniwgt_4t_4x8multiple_msa(uint8_t *src,
3347  int32_t src_stride,
3348  uint8_t *dst,
3349  int32_t dst_stride,
3350  const int8_t *filter,
3351  int32_t height,
3352  int32_t weight,
3353  int32_t offset,
3354  int32_t rnd_val)
3355 {
3356  int32_t loop_cnt;
3357  v16u8 out0, out1;
3358  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3359  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3360  v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3361  v16i8 src2110, src4332, src6554, src8776;
3362  v16i8 src10998;
3363  v8i16 dst0, dst1, dst2, dst3, filt0, filt1;
3364  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3365  v4i32 weight_vec, rnd_vec;
3366 
3367  src -= src_stride;
3368 
3369  weight = weight & 0x0000FFFF;
3370 
3371  weight_vec = __msa_fill_w(weight);
3372  rnd_vec = __msa_fill_w(rnd_val);
3373 
3374  weight *= 128;
3375  rnd_val -= 6;
3376 
3377  weight_vec_h = __msa_fill_h(weight);
3378  offset_vec = __msa_fill_h(offset);
3379  denom_vec = __msa_fill_h(rnd_val);
3380 
3381  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3382  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3383 
3384  filter_vec = LD_SH(filter);
3385  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3386 
3387  LD_SB3(src, src_stride, src0, src1, src2);
3388  src += (3 * src_stride);
3389  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3390  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3391  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3392 
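 /* 8 rows per iteration; src10 and the packed interleave src10998 are
  * carried over as filter history for the next iteration. */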
3393  for (loop_cnt = (height >> 3); loop_cnt--;) {
3394  LD_SB8(src, src_stride,
3395  src3, src4, src5, src6, src7, src8, src9, src10);
3396  src += (8 * src_stride);
3397  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3398  src32_r, src43_r, src54_r, src65_r);
3399  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3400  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3401  ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3402  src109_r, src98_r, src4332, src6554, src8776, src10998);
3403  XORI_B4_128_SB(src4332, src6554, src8776, src10998);
3404  dst0 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3405  dst1 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3406  dst2 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
3407  dst3 = HEVC_FILT_4TAP_SH(src8776, src10998, filt0, filt1);
3408 
3409  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
3410  weight_vec, offset_vec, rnd_vec,
3411  dst0, dst1, dst2, dst3);
3412 
3413  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
3414  ST4x8_UB(out0, out1, dst, dst_stride);
3415  dst += (8 * dst_stride);
3416 
3417  src2 = src10;
3418  src2110 = src10998;
3419  }
3420 }
3421 
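 /* Width-4 vertical dispatcher: specialized kernels for heights 2 and
  * 4, and an 8-rows-per-iteration loop for heights that are multiples
  * of 8. */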
3422 static void hevc_vt_uniwgt_4t_4w_msa(uint8_t *src,
3423  int32_t src_stride,
3424  uint8_t *dst,
3425  int32_t dst_stride,
3426  const int8_t *filter,
3427  int32_t height,
3428  int32_t weight,
3429  int32_t offset,
3430  int32_t rnd_val)
3431 {
3432  if (2 == height) {
3433  hevc_vt_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
3434  filter, weight, offset, rnd_val);
3435  } else if (4 == height) {
3436  hevc_vt_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
3437  filter, weight, offset, rnd_val);
3438  } else if (0 == (height % 8)) {
3439  hevc_vt_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
3440  filter, height, weight, offset,
3441  rnd_val);
3442  }
3443 }
3444 
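 /* Width 6: the rows are filtered 8 samples wide and ST6x4_UB stores
  * only the leftmost 6 bytes of each row. */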
3445 static void hevc_vt_uniwgt_4t_6w_msa(uint8_t *src,
3446  int32_t src_stride,
3447  uint8_t *dst,
3448  int32_t dst_stride,
3449  const int8_t *filter,
3450  int32_t height,
3451  int32_t weight,
3452  int32_t offset,
3453  int32_t rnd_val)
3454 {
3455  v16u8 out0, out1, out2, out3;
3456  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3457  v16i8 src10_r, src32_r, src21_r, src43_r;
3458  v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;
3459  v8i16 filt0, filt1;
3460  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3461  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3462  v4i32 weight_vec, rnd_vec;
3463 
3464  src -= src_stride;
3465 
3466  weight = weight & 0x0000FFFF;
3467 
3468  weight_vec = __msa_fill_w(weight);
3469  rnd_vec = __msa_fill_w(rnd_val);
3470 
3471  weight *= 128;
3472  rnd_val -= 6;
3473 
3474  weight_vec_h = __msa_fill_h(weight);
3475  offset_vec = __msa_fill_h(offset);
3476  denom_vec = __msa_fill_h(rnd_val);
3477 
3478  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3479  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3480 
3481  filter_vec = LD_SH(filter);
3482  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3483 
3484  LD_SB3(src, src_stride, src0, src1, src2);
3485  src += (3 * src_stride);
3486  LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3487  XORI_B3_128_SB(src0, src1, src2);
3488  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3489  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3490  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3491  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3492  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3493  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3494  dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3495  dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3496  dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3497  dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3498  dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
3499  dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
3500  dst6 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3501  dst7 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3502 
3503  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3,
3504  weight_vec, offset_vec, rnd_vec,
3505  dst0, dst1, dst2, dst3);
3506  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7,
3507  weight_vec, offset_vec, rnd_vec,
3508  dst4, dst5, dst6, dst7);
3509 
3510  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
3511  PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
3512  ST6x4_UB(out0, out1, dst, dst_stride);
3513  dst += (4 * dst_stride);
3514  ST6x4_UB(out2, out3, dst, dst_stride);
3515 }
3516 
3517 static void hevc_vt_uniwgt_4t_8x2_msa(uint8_t *src,
3518  int32_t src_stride,
3519  uint8_t *dst,
3520  int32_t dst_stride,
3521  const int8_t *filter,
3522  int32_t weight,
3523  int32_t offset,
3524  int32_t rnd_val)
3525 {
3526  v16u8 out;
3527  v16i8 src0, src1, src2, src3, src4;
3528  v16i8 src10_r, src32_r, src21_r, src43_r;
3529  v8i16 dst0, dst1;
3530  v8i16 filt0, filt1;
3531  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3532  v4i32 weight_vec, rnd_vec;
3533 
3534  src -= src_stride;
3535 
3536  weight = weight & 0x0000FFFF;
3537 
3538  weight_vec = __msa_fill_w(weight);
3539  rnd_vec = __msa_fill_w(rnd_val);
3540 
3541  weight *= 128;
3542  rnd_val -= 6;
3543 
3544  weight_vec_h = __msa_fill_h(weight);
3545  offset_vec = __msa_fill_h(offset);
3546  denom_vec = __msa_fill_h(rnd_val);
3547 
3548  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3549  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3550 
3551  filter_vec = LD_SH(filter);
3552  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3553 
3554  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3555  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3556  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3557  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3558  dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3559  dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3560 
3561  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec, rnd_vec,
3562  dst0, dst1);
3563 
3564  out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3565  ST8x2_UB(out, dst, dst_stride);
3566 }
3567 
3568 static void hevc_vt_uniwgt_4t_8x4_msa(uint8_t *src,
3569  int32_t src_stride,
3570  uint8_t *dst,
3571  int32_t dst_stride,
3572  const int8_t *filter,
3573  int32_t weight,
3574  int32_t offset,
3575  int32_t rnd_val)
3576 {
3577  v16u8 out0, out1;
3578  v16i8 src0, src1, src2, src3, src4;
3579  v16i8 src10_r, src32_r, src21_r, src43_r;
3580  v16i8 src5, src6, src54_r, src65_r;
3581  v8i16 filt0, filt1;
3582  v8i16 dst0, dst1, dst2, dst3;
3583  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3584  v4i32 weight_vec, rnd_vec;
3585 
3586  src -= src_stride;
3587 
3588  weight = weight & 0x0000FFFF;
3589 
3590  weight_vec = __msa_fill_w(weight);
3591  rnd_vec = __msa_fill_w(rnd_val);
3592 
3593  weight *= 128;
3594  rnd_val -= 6;
3595 
3596  weight_vec_h = __msa_fill_h(weight);
3597  offset_vec = __msa_fill_h(offset);
3598  denom_vec = __msa_fill_h(rnd_val);
3599 
3600  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3601  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3602 
3603  filter_vec = LD_SH(filter);
3604  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3605 
3606  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3607  src += (3 * src_stride);
3608  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3609  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3610  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3611  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3612  dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3613  dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3614  dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3615  dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3616  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
3617  offset_vec, rnd_vec, dst0, dst1, dst2,
3618  dst3);
3619  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
3620  ST8x4_UB(out0, out1, dst, dst_stride);
3621 }
3622 
3623 static void hevc_vt_uniwgt_4t_8x6_msa(uint8_t *src,
3624  int32_t src_stride,
3625  uint8_t *dst,
3626  int32_t dst_stride,
3627  const int8_t *filter,
3628  int32_t weight,
3629  int32_t offset,
3630  int32_t rnd_val)
3631 {
3632  v16u8 out0, out1, out2;
3633  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3634  v16i8 src10_r, src32_r, src54_r, src76_r;
3635  v16i8 src21_r, src43_r, src65_r, src87_r;
3636  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3637  v8i16 filt0, filt1;
3638  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3639  v4i32 weight_vec, rnd_vec;
3640 
3641  src -= src_stride;
3642 
3643  weight = weight & 0x0000FFFF;
3644 
3645  weight_vec = __msa_fill_w(weight);
3646  rnd_vec = __msa_fill_w(rnd_val);
3647 
3648  weight *= 128;
3649  rnd_val -= 6;
3650 
3651  weight_vec_h = __msa_fill_h(weight);
3652  offset_vec = __msa_fill_h(offset);
3653  denom_vec = __msa_fill_h(rnd_val);
3654 
3655  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3656  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3657 
3658  filter_vec = LD_SH(filter);
3659  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3660 
3661  LD_SB3(src, src_stride, src0, src1, src2);
3662  src += (3 * src_stride);
3663  LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
3664 
3665  XORI_B3_128_SB(src0, src1, src2);
3666  XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
3667  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3668  src32_r, src43_r);
3669  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3670  src76_r, src87_r);
3671  dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3672  dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3673  dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3674  dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3675  dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
3676  dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
3677  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
3678  offset_vec, rnd_vec, dst0, dst1, dst2, dst3);
3679  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, rnd_vec,
3680  dst4, dst5);
3681  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
3682  ST8x4_UB(out0, out1, dst, dst_stride);
3683  dst += (4 * dst_stride);
3684  ST8x2_UB(out2, dst, dst_stride);
3685 }
3686 
3687 static void hevc_vt_uniwgt_4t_8x8mult_msa(uint8_t *src,
3688  int32_t src_stride,
3689  uint8_t *dst,
3690  int32_t dst_stride,
3691  const int8_t *filter,
3692  int32_t height,
3693  int32_t weight,
3694  int32_t offset,
3695  int32_t rnd_val)
3696 {
3697  int32_t loop_cnt;
3698  v16u8 out0, out1, out2, out3;
3699  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3700  v16i8 src10_r, src32_r, src21_r, src43_r;
3701  v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;
3702  v8i16 filt0, filt1;
3703  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3704  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3705  v4i32 weight_vec, rnd_vec;
3706 
3707  src -= src_stride;
3708 
3709  weight = weight & 0x0000FFFF;
3710 
3711  weight_vec = __msa_fill_w(weight);
3712  rnd_vec = __msa_fill_w(rnd_val);
3713 
3714  weight *= 128;
3715  rnd_val -= 6;
3716 
3717  weight_vec_h = __msa_fill_h(weight);
3718  offset_vec = __msa_fill_h(offset);
3719  denom_vec = __msa_fill_h(rnd_val);
3720 
3721  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3722  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3723 
3724  filter_vec = LD_SH(filter);
3725  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3726 
3727  LD_SB3(src, src_stride, src0, src1, src2);
3728  src += (3 * src_stride);
3729  XORI_B3_128_SB(src0, src1, src2);
3730  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3731 
3732  for (loop_cnt = (height >> 3); loop_cnt--;) {
3733  LD_SB8(src, src_stride,
3734  src3, src4, src5, src6, src7, src8, src9, src10);
3735  src += (8 * src_stride);
3736  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3737  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3738  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3739  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3740  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3741  dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3742  dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3743  dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3744  dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3745  dst4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
3746  dst5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
3747  dst6 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3748  dst7 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3749  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
3750  offset_vec, rnd_vec, dst0, dst1, dst2,
3751  dst3);
3752  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
3753  offset_vec, rnd_vec, dst4, dst5, dst6,
3754  dst7);
3755  PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
3756  PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
3757  ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
3758  dst += (8 * dst_stride);
3759 
3760  src2 = src10;
3761  src10_r = src98_r;
3762  src21_r = src109_r;
3763  }
3764 }
3765 
3766 static void hevc_vt_uniwgt_4t_8w_msa(uint8_t *src,
3767  int32_t src_stride,
3768  uint8_t *dst,
3769  int32_t dst_stride,
3770  const int8_t *filter,
3771  int32_t height,
3772  int32_t weight,
3773  int32_t offset,
3774  int32_t rnd_val)
3775 {
3776  if (2 == height) {
3777  hevc_vt_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
3778  filter, weight, offset, rnd_val);
3779  } else if (4 == height) {
3780  hevc_vt_uniwgt_4t_8x4_msa(src, src_stride, dst, dst_stride,
3781  filter, weight, offset, rnd_val);
3782  } else if (6 == height) {
3783  hevc_vt_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
3784  filter, weight, offset, rnd_val);
3785  } else {
3786  hevc_vt_uniwgt_4t_8x8mult_msa(src, src_stride, dst, dst_stride,
3787  filter, height, weight, offset,
3788  rnd_val);
3789  }
3790 }
3791 
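 /* Width 12: columns 0..7 use the right (low) interleaves, columns
  * 8..11 the left (high) interleaves packed two row pairs per vector
  * (src2110 etc.); ST12x4_UB stores 8 + 4 bytes per row. */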
3792 static void hevc_vt_uniwgt_4t_12w_msa(uint8_t *src,
3793  int32_t src_stride,
3794  uint8_t *dst,
3795  int32_t dst_stride,
3796  const int8_t *filter,
3797  int32_t height,
3798  int32_t weight,
3799  int32_t offset,
3800  int32_t rnd_val)
3801 {
3802  int32_t loop_cnt;
3803  v16u8 out0, out1, out2, out3, out4, out5;
3804  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3805  v16i8 src10_r, src32_r, src21_r, src43_r;
3806  v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
3807  v16i8 src2110, src4332;
3808  v16i8 src54_r, src76_r, src98_r, src65_r, src87_r, src109_r;
3809  v16i8 src76_l, src98_l, src87_l, src109_l, src6554, src8776, src10998;
3810  v8i16 filt0, filt1;
3811  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3812  v8i16 dst9, dst10, dst11, filter_vec, weight_vec_h, offset_vec, denom_vec;
3813  v4i32 weight_vec, rnd_vec;
3814 
3815  src -= src_stride;
3816 
3817  weight = weight & 0x0000FFFF;
3818 
3819  weight_vec = __msa_fill_w(weight);
3820  rnd_vec = __msa_fill_w(rnd_val);
3821 
3822  weight *= 128;
3823  rnd_val -= 6;
3824 
3825  weight_vec_h = __msa_fill_h(weight);
3826  offset_vec = __msa_fill_h(offset);
3827  denom_vec = __msa_fill_h(rnd_val);
3828 
3829  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3830  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3831 
3832  filter_vec = LD_SH(filter);
3833  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3834 
3835  LD_SB3(src, src_stride, src0, src1, src2);
3836  src += (3 * src_stride);
3837  XORI_B3_128_SB(src0, src1, src2);
3838  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3839  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3840  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
3841 
3842  for (loop_cnt = 2; loop_cnt--;) {
3843  LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3844  src += (8 * src_stride);
3845  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3846  ILVRL_B2_SB(src3, src2, src32_r, src32_l);
3847  ILVRL_B2_SB(src4, src3, src43_r, src43_l);
3848  ILVRL_B2_SB(src5, src4, src54_r, src54_l);
3849  ILVRL_B2_SB(src6, src5, src65_r, src65_l);
3850  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
3851  src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
3852  dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3853  dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3854  dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3855  dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3856  dst4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3857  dst5 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3858  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
3859  offset_vec, rnd_vec, dst0, dst1, dst2,
3860  dst3);
3861  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
3862  rnd_vec, dst4, dst5);
3863  PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
3864  ST12x4_UB(out0, out1, out2, dst, dst_stride);
3865  dst += (4 * dst_stride);
3866 
3867  ILVRL_B2_SB(src7, src6, src76_r, src76_l);
3868  ILVRL_B2_SB(src8, src7, src87_r, src87_l);
3869  ILVRL_B2_SB(src9, src8, src98_r, src98_l);
3870  ILVRL_B2_SB(src10, src9, src109_r, src109_l);
3871  src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
3872  src10998 = (v16i8) __msa_ilvr_d((v2i64) src109_l, (v2i64) src98_l);
3873  dst6 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
3874  dst7 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
3875  dst8 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3876  dst9 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3877  dst10 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
3878  dst11 = HEVC_FILT_4TAP_SH(src8776, src10998, filt0, filt1);
3879  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst6, dst7, dst8, dst9, weight_vec,
3880  offset_vec, rnd_vec, dst6, dst7, dst8,
3881  dst9);
3882  HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst10, dst11, weight_vec, offset_vec,
3883  rnd_vec, dst10, dst11);
3884  PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
3885  ST12x4_UB(out3, out4, out5, dst, dst_stride);
3886  dst += (4 * dst_stride);
3887 
3888  src2 = src10;
3889  src10_r = src98_r;
3890  src21_r = src109_r;
3891  src2110 = src10998;
3892  }
3893 }
3894 
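 /* Width 16: the right (low) and left (high) byte interleaves are
  * filtered separately and re-packed into full 16-byte rows. */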
3895 static void hevc_vt_uniwgt_4t_16w_msa(uint8_t *src,
3896  int32_t src_stride,
3897  uint8_t *dst,
3898  int32_t dst_stride,
3899  const int8_t *filter,
3900  int32_t height,
3901  int32_t weight,
3902  int32_t offset,
3903  int32_t rnd_val)
3904 {
3905  int32_t loop_cnt;
3906  v16u8 out0, out1, out2, out3;
3907  v16i8 src0, src1, src2, src3, src4, src5;
3908  v16i8 src10_r, src32_r, src21_r, src43_r;
3909  v16i8 src10_l, src32_l, src21_l, src43_l;
3910  v16i8 src54_r, src54_l, src65_r, src65_l, src6;
3911  v8i16 filt0, filt1;
3912  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3913  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3914  v4i32 weight_vec, rnd_vec;
3915 
3916  src -= src_stride;
3917 
3918  weight = weight & 0x0000FFFF;
3919 
3920  weight_vec = __msa_fill_w(weight);
3921  rnd_vec = __msa_fill_w(rnd_val);
3922 
3923  weight *= 128;
3924  rnd_val -= 6;
3925 
3926  weight_vec_h = __msa_fill_h(weight);
3927  offset_vec = __msa_fill_h(offset);
3928  denom_vec = __msa_fill_h(rnd_val);
3929 
3930  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3931  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3932 
3933  filter_vec = LD_SH(filter);
3934  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3935 
3936  LD_SB3(src, src_stride, src0, src1, src2);
3937  src += (3 * src_stride);
3938  XORI_B3_128_SB(src0, src1, src2);
3939  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3940  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3941 
3942  for (loop_cnt = (height >> 2); loop_cnt--;) {
3943  LD_SB4(src, src_stride, src3, src4, src5, src6);
3944  src += (4 * src_stride);
3945  XORI_B4_128_SB(src3, src4, src5, src6);
3946  ILVRL_B2_SB(src3, src2, src32_r, src32_l);
3947  ILVRL_B2_SB(src4, src3, src43_r, src43_l);
3948  ILVRL_B2_SB(src5, src4, src54_r, src54_l);
3949  ILVRL_B2_SB(src6, src5, src65_r, src65_l);
3950  dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3951  dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3952  dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3953  dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3954  dst4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
3955  dst5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
3956  dst6 = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
3957  dst7 = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
3958  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
3959  offset_vec, rnd_vec, dst0, dst1, dst2,
3960  dst3);
3961  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
3962  offset_vec, rnd_vec, dst4, dst5, dst6,
3963  dst7);
3964  PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1,
3965  out2, out3);
3966  ST_UB4(out0, out1, out2, out3, dst, dst_stride);
3967  dst += (4 * dst_stride);
3968 
3969  src2 = src6;
3970  src10_r = src54_r;
3971  src21_r = src65_r;
3972  src10_l = src54_l;
3973  src21_l = src65_l;
3974  }
3975 }
3976 
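 /* Width 24: a full 16-wide band plus an 8-wide band at src + 16 that
  * needs only the right (low) interleaves. */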
3977 static void hevc_vt_uniwgt_4t_24w_msa(uint8_t *src,
3978  int32_t src_stride,
3979  uint8_t *dst,
3980  int32_t dst_stride,
3981  const int8_t *filter,
3982  int32_t height,
3983  int32_t weight,
3984  int32_t offset,
3985  int32_t rnd_val)
3986 {
3987  uint32_t loop_cnt;
3988  v16u8 out0, out1, out2, out3, out4, out5;
3989  v16i8 src0, src1, src2, src3, src4, src5;
3990  v16i8 src6, src7, src8, src9, src10, src11, src12, src13;
3991  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3992  v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
3993  v16i8 src87_r, src98_r, src109_r, src1110_r, src1211_r, src1312_r;
3994  v8i16 filt0, filt1;
3995  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
3996  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec, dst11;
3997  v4i32 weight_vec, rnd_vec;
3998 
3999  src -= src_stride;
4000 
4001  weight = weight & 0x0000FFFF;
4002 
4003  weight_vec = __msa_fill_w(weight);
4004  rnd_vec = __msa_fill_w(rnd_val);
4005 
4006  weight *= 128;
4007  rnd_val -= 6;
4008 
4009  weight_vec_h = __msa_fill_h(weight);
4010  offset_vec = __msa_fill_h(offset);
4011  denom_vec = __msa_fill_h(rnd_val);
4012 
4013  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
4014  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
4015 
4016  filter_vec = LD_SH(filter);
4017  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4018 
4019  LD_SB3(src, src_stride, src0, src1, src2);
4020  LD_SB3(src + 16, src_stride, src7, src8, src9);
4021  src += (3 * src_stride);
4022  XORI_B3_128_SB(src0, src1, src2);
4023  XORI_B3_128_SB(src7, src8, src9);
4024  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4025  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4026  ILVR_B2_SB(src8, src7, src9, src8, src87_r, src98_r);
4027 
4028  for (loop_cnt = 8; loop_cnt--;) {
4029  LD_SB4(src, src_stride, src3, src4, src5, src6);
4030  LD_SB4(src + 16, src_stride, src10, src11, src12, src13);
4031  src += (4 * src_stride);
4032  XORI_B4_128_SB(src3, src4, src5, src6);
4033  XORI_B4_128_SB(src10, src11, src12, src13);
4034  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4035  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4036  ILVRL_B2_SB(src5, src4, src54_r, src54_l);
4037  ILVRL_B2_SB(src6, src5, src65_r, src65_l);
4038  ILVR_B2_SB(src10, src9, src11, src10, src109_r, src1110_r);
4039  ILVR_B2_SB(src12, src11, src13, src12, src1211_r, src1312_r);
4040  dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4041  dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4042  dst2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
4043  dst3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
4044  dst4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4045  dst5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4046  dst6 = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
4047  dst7 = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
4048  dst8 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
4049  dst9 = HEVC_FILT_4TAP_SH(src98_r, src1110_r, filt0, filt1);
4050  dst10 = HEVC_FILT_4TAP_SH(src109_r, src1211_r, filt0, filt1);
4051  dst11 = HEVC_FILT_4TAP_SH(src1110_r, src1312_r, filt0, filt1);
4052  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
4053  offset_vec, rnd_vec, dst0, dst1, dst2,
4054  dst3);
4055  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
4056  offset_vec, rnd_vec, dst4, dst5, dst6,
4057  dst7);
4058  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
4059  offset_vec, rnd_vec, dst8, dst9, dst10,
4060  dst11);
4061  PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1,
4062  out2, out3);
4063  PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
4064  ST_UB4(out0, out1, out2, out3, dst, dst_stride);
4065  ST8x4_UB(out4, out5, dst + 16, dst_stride);
4066  dst += (4 * dst_stride);
4067 
4068  src2 = src6;
4069  src9 = src13;
4070  src10_r = src54_r;
4071  src21_r = src65_r;
4072  src10_l = src54_l;
4073  src21_l = src65_l;
4074  src87_r = src1211_r;
4075  src98_r = src1312_r;
4076  }
4077 }
4078 
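 /* Width 32: two independent 16-wide bands (src and src + 16) are
  * filtered per row pair and stored with ST_UB2 at a 16-byte offset. */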
4079 static void hevc_vt_uniwgt_4t_32w_msa(uint8_t *src,
4080  int32_t src_stride,
4081  uint8_t *dst,
4082  int32_t dst_stride,
4083  const int8_t *filter,
4084  int32_t height,
4085  int32_t weight,
4086  int32_t offset,
4087  int32_t rnd_val)
4088 {
4089  uint32_t loop_cnt;
4090  v16u8 out0, out1, out2, out3;
4091  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
4092  v16i8 src10_r, src32_r, src76_r, src98_r;
4093  v16i8 src21_r, src43_r, src65_r, src87_r;
4094  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4095  v16i8 src10_l, src32_l, src76_l, src98_l;
4096  v16i8 src21_l, src43_l, src65_l, src87_l;
4097  v8i16 filt0, filt1;
4098  v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
4099  v4i32 weight_vec, rnd_vec;
4100 
4101  src -= src_stride;
4102 
4103  weight = weight & 0x0000FFFF;
4104 
4105  weight_vec = __msa_fill_w(weight);
4106  rnd_vec = __msa_fill_w(rnd_val);
4107 
4108  weight *= 128;
4109  rnd_val -= 6;
4110 
4111  weight_vec_h = __msa_fill_h(weight);
4112  offset_vec = __msa_fill_h(offset);
4113  denom_vec = __msa_fill_h(rnd_val);
4114 
4115  weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
4116  offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
4117 
4118  filter_vec = LD_SH(filter);
4119  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4120 
4121  LD_SB3(src, src_stride, src0, src1, src2);
4122  LD_SB3(src + 16, src_stride, src5, src6, src7);
4123  src += (3 * src_stride);
4124  XORI_B6_128_SB(src0, src1, src2, src5, src6, src7);
4125  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4126  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4127  ILVR_B2_SB(src6, src5, src7, src6, src65_r, src76_r);
4128  ILVL_B2_SB(src6, src5, src7, src6, src65_l, src76_l);
4129 
4130  for (loop_cnt = (height >> 1); loop_cnt--;) {
4131  LD_SB2(src, src_stride, src3, src4);
4132  LD_SB2(src + 16, src_stride, src8, src9);
4133  src += (2 * src_stride);
4134  XORI_B4_128_SB(src3, src4, src8, src9);
4135  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4136  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4137  ILVRL_B2_SB(src8, src7, src87_r, src87_l);
4138  ILVRL_B2_SB(src9, src8, src98_r, src98_l);
4139  dst0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4140  dst1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4141  dst2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4142  dst3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4143  dst4 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
4144  dst5 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
4145  dst6 = HEVC_FILT_4TAP_SH(src65_l, src87_l, filt0, filt1);
4146  dst7 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
4147  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
4148  offset_vec, rnd_vec, dst0, dst1, dst2,
4149  dst3);
4150  HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
4151  offset_vec, rnd_vec, dst4, dst5, dst6,
4152  dst7);
4153  PCKEV_B4_UB(dst2, dst0, dst3, dst1, dst6, dst4, dst7, dst5, out0, out1,
4154  out2, out3);
4155  ST_UB2(out0, out2, dst, 16);
4156  dst += dst_stride;
4157  ST_UB2(out1, out3, dst, 16);
4158  dst += dst_stride;
4159 
4160  src2 = src4;
4161  src7 = src9;
4162  src10_r = src32_r;
4163  src21_r = src43_r;
4164  src10_l = src32_l;
4165  src21_l = src43_l;
4166  src65_r = src87_r;
4167  src76_r = src98_r;
4168  src65_l = src87_l;
4169  src76_l = src98_l;
4170  }
4171 }
4172 
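 /* hv kernels: a horizontal 4-tap pass yields 16-bit intermediates,
  * the vertical 4-tap pass over them is scaled down by 6 bits, then
  * weighting, rounding shift, offset add and clip to [0, 255] follow. */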
4173 static void hevc_hv_uniwgt_4t_4x2_msa(uint8_t *src,
4174  int32_t src_stride,
4175  uint8_t *dst,
4176  int32_t dst_stride,
4177  const int8_t *filter_x,
4178  const int8_t *filter_y,
4179  int32_t weight,
4180  int32_t offset,
4181  int32_t rnd_val)
4182 {
4183  v16u8 out;
4184  v16i8 src0, src1, src2, src3, src4;
4185  v8i16 filt0, filt1;
4186  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4187  v16i8 mask1;
4188  v8i16 filt_h0, filt_h1, filter_vec, tmp;
4189  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4190  v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
4191  v8i16 offset_vec, const_128, denom_vec;
4192  v4i32 dst0, dst1, weight_vec, rnd_vec;
4193 
4194  src -= (src_stride + 1);
4195 
4196  filter_vec = LD_SH(filter_x);
4197  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4198 
4199  filter_vec = LD_SH(filter_y);
4200  UNPCK_R_SB_SH(filter_vec, filter_vec);
4201 
4202  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4203 
4204  mask1 = mask0 + 2;
4205 
4206  weight_vec = __msa_fill_w(weight);
4207  rnd_vec = __msa_fill_w(rnd_val);
4208 
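 /* Same -128 source-bias compensation as the vertical kernels:
  * (128 * weight) >> (rnd_val - 6) is folded into offset_vec. */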
4209  offset_vec = __msa_fill_h(offset);
4210  denom_vec = __msa_fill_h(rnd_val - 6);
4211  const_128 = __msa_fill_h((128 * weight));
4212  offset_vec += __msa_srar_h(const_128, denom_vec);
4213 
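 /* The 4-width shuffle masks (ff_hevc_mask_arr + 16) gather from two
  * source vectors (indices >= 16 select the second operand), so each
  * horizontal filter call processes a pair of rows. */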
4214  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
4215  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4216  VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
4217  VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
4218  VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
4219  dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4220  dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4221  dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4222  ILVRL_H2_SH(dst31, dst20, dst10, dst32);
4223  ILVRL_H2_SH(dst42, dst31, dst21, dst43);
4224  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4225  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4226  dst0 >>= 6;
4227  dst1 >>= 6;
4228  MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
4229  SRAR_W2_SW(dst0, dst1, rnd_vec);
4230  tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4231  tmp += offset_vec;
4232  tmp = CLIP_SH_0_255_MAX_SATU(tmp);
4233  out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
4234  ST4x2_UB(out, dst, dst_stride);
4235 }
4236 
4237 static void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src,
4238  int32_t src_stride,
4239  uint8_t *dst,
4240  int32_t dst_stride,
4241  const int8_t *filter_x,
4242  const int8_t *filter_y,
4243  int32_t weight,
4244  int32_t offset,
4245  int32_t rnd_val)
4246 {
4247  v16u8 out;
4248  v16i8 src0, src1, src2, src3, src4, src5, src6;
4249  v8i16 filt0, filt1;
4250  v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1;
4251  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4252  v16i8 mask1;
4253  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4254  v8i16 dst30, dst41, dst52, dst63, dst10, dst32, dst54, dst21, dst43, dst65;
4255  v8i16 offset_vec, const_128, denom_vec;
4256  v4i32 dst0, dst1, dst2, dst3, weight_vec, rnd_vec;
4257 
4258  src -= (src_stride + 1);
4259 
4260  filter_vec = LD_SH(filter_x);
4261  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4262 
4263  filter_vec = LD_SH(filter_y);
4264  UNPCK_R_SB_SH(filter_vec, filter_vec);
4265 
4266  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4267 
4268  mask1 = mask0 + 2;
4269 
4270  weight_vec = __msa_fill_w(weight);
4271  rnd_vec = __msa_fill_w(rnd_val);
4272 
4273  offset_vec = __msa_fill_h(offset);
4274  denom_vec = __msa_fill_h(rnd_val - 6);
4275  const_128 = __msa_fill_h((128 * weight));
4276  offset_vec += __msa_srar_h(const_128, denom_vec);
4277 
4278  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
4279  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
4280  VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
4281  VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
4282  VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
4283  VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
4284  dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4285  dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4286  dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4287  dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4288  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
4289  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
4290  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
4291  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4292  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4293  dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
4294  dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
4295  SRA_4V(dst0, dst1, dst2, dst3, 6);
4296  MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
4297  MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
4298  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4299  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4300  ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4301  CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
4302  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4303  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
4304 }
4305 
4306 static void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src,
4307  int32_t src_stride,
4308  uint8_t *dst,
4309  int32_t dst_stride,
4310  const int8_t *filter_x,
4311  const int8_t *filter_y,
4312  int32_t height,
4313  int32_t weight,
4314  int32_t offset,
4315  int32_t rnd_val)
4316 {
4317  uint32_t loop_cnt;
4318  v16u8 out0, out1;
4319  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4320  v8i16 filt0, filt1;
4321  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4322  v16i8 mask1;
4323  v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
4324  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4325  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4326  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4327  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4328  v8i16 dst98_r, dst109_r, offset_vec, const_128, denom_vec;
4329  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec;
4330 
4331  src -= (src_stride + 1);
4332 
4333  filter_vec = LD_SH(filter_x);
4334  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4335 
4336  filter_vec = LD_SH(filter_y);
4337  UNPCK_R_SB_SH(filter_vec, filter_vec);
4338 
4339  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4340 
4341  mask1 = mask0 + 2;
4342 
4343  weight_vec = __msa_fill_w(weight);
4344  rnd_vec = __msa_fill_w(rnd_val);
4345 
4346  offset_vec = __msa_fill_h(offset);
4347  denom_vec = __msa_fill_h(rnd_val - 6);
4348  const_128 = __msa_fill_h((128 * weight));
4349  offset_vec += __msa_srar_h(const_128, denom_vec);
4350 
4351  LD_SB3(src, src_stride, src0, src1, src2);
4352  src += (3 * src_stride);
4353  XORI_B3_128_SB(src0, src1, src2);
4354 
4355  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
4356  VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
4357  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4358  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4359  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
4360  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4361 
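 /* 8 rows per iteration; dst10_r, dst21_r and the dst22 splat carry
  * the vertical filter history into the next iteration. */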
4362  for (loop_cnt = height >> 3; loop_cnt--;) {
4363  LD_SB8(src, src_stride,
4364  src3, src4, src5, src6, src7, src8, src9, src10);
4365  src += (8 * src_stride);
4366  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4367 
4368  VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
4369  VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
4370  VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
4371  VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
4372  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4373  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4374  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4375  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4376  dst32_r = __msa_ilvr_h(dst73, dst22);
4377  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
4378  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4379  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4380  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4381  dst76_r = __msa_ilvr_h(dst22, dst106);
4382  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4383  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4384  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4385  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4386  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4387  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4388  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4389  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4390  SRA_4V(dst0, dst1, dst2, dst3, 6);
4391  SRA_4V(dst4, dst5, dst6, dst7, 6);
4392  MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
4393  MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
4394  MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
4395  MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
4396  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4397  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
4398  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
4399  tmp2, tmp3);
4400  ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4401  ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
4402  CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
4403  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4404  ST4x8_UB(out0, out1, dst, dst_stride);
4405  dst += (8 * dst_stride);
4406 
4407  dst10_r = dst98_r;
4408  dst21_r = dst109_r;
4409  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
4410  }
4411 }
4412 
4413 static void hevc_hv_uniwgt_4t_4w_msa(uint8_t *src,
4414  int32_t src_stride,
4415  uint8_t *dst,
4416  int32_t dst_stride,
4417  const int8_t *filter_x,
4418  const int8_t *filter_y,
4419  int32_t height,
4420  int32_t weight,
4421  int32_t offset,
4422  int32_t rnd_val)
4423 {
4424  if (2 == height) {
4425  hevc_hv_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
4426  filter_x, filter_y, weight,
4427  offset, rnd_val);
4428  } else if (4 == height) {
4429  hevc_hv_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
4430  filter_x, filter_y, weight,
4431  offset, rnd_val);
4432  } else if (0 == (height % 8)) {
4433  hevc_hv_uniwgt_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
4434  filter_x, filter_y, height, weight,
4435  offset, rnd_val);
4436  }
4437 }
4438 
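 /* hv width 6: the vertical pass produces columns 0..3 row by row
  * (dstN_r) and columns 4..5 two rows at a time from the packed left
  * interleaves (dstN_l); ST4x8 + ST2x4 store 4 + 2 bytes per row. */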
4439 static void hevc_hv_uniwgt_4t_6w_msa(uint8_t *src,
4440  int32_t src_stride,
4441  uint8_t *dst,
4442  int32_t dst_stride,
4443  const int8_t *filter_x,
4444  const int8_t *filter_y,
4445  int32_t height,
4446  int32_t weight,
4447  int32_t offset,
4448  int32_t rnd_val)
4449 {
4450  v16u8 out0, out1, out2;
4451  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4452  v8i16 filt0, filt1;
4453  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4454  v16i8 mask1;
4455  v8i16 filt_h0, filt_h1, filter_vec;
4456  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4457  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4458  v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4459  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
4460  v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
4461  v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
4462  v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4463  v8i16 offset_vec, const_128, denom_vec;
4464  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
4465  v4i32 dst0_l, dst1_l, dst2_l, dst3_l, weight_vec, rnd_vec;
4466 
4467  src -= (src_stride + 1);
4468 
4469  filter_vec = LD_SH(filter_x);
4470  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4471 
4472  filter_vec = LD_SH(filter_y);
4473  UNPCK_R_SB_SH(filter_vec, filter_vec);
4474 
4475  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4476 
4477  mask1 = mask0 + 2;
4478 
4479  weight_vec = __msa_fill_w(weight);
4480  rnd_vec = __msa_fill_w(rnd_val);
4481 
4482  offset_vec = __msa_fill_h(offset);
4483  denom_vec = __msa_fill_h(rnd_val - 6);
4484  const_128 = __msa_fill_h((128 * weight));
4485  offset_vec += __msa_srar_h(const_128, denom_vec);
4486 
4487  LD_SB3(src, src_stride, src0, src1, src2);
4488  src += (3 * src_stride);
4489  XORI_B3_128_SB(src0, src1, src2);
4490 
4491  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4492  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4493  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4494  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4495  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4496  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4497  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
4498  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
4499 
4500  LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
4501  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4502  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4503  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4504  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4505  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4506  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4507  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4508  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4509  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4510  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4511  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
4512  VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
4513  VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
4514  dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4515  dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4516  dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4517  dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4518  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
4519  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
4520  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
4521  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
4522  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
4523  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
4524  ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
4525  ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
4526  PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
4527  PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
4528  dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
4529  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4530  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4531  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4532  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4533  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4534  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4535  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4536  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4537  dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
4538  dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
4539  dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
4540  dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
4541  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4542  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4543  SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
4544  MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
4545  MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
4546  MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
4547  MUL2(dst6_r, weight_vec, dst7_r, weight_vec, dst6_r, dst7_r);
4548  MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
4549  MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
4550  SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
4551  SRAR_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, rnd_vec);
4552  SRAR_W4_SW(dst0_l, dst1_l, dst2_l, dst3_l, rnd_vec);
4553  PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
4554  PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
4555  PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
4556  ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4557  ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
4558  ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
4559  CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
4560  CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
4561  PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
4562  ST4x8_UB(out0, out1, dst, dst_stride);
4563  ST2x4_UB(out2, 0, dst + 4, dst_stride);
4564  dst += 4 * dst_stride;
4565  ST2x4_UB(out2, 4, dst + 4, dst_stride);
4566 }
4567 
4568 static void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src,
4569  int32_t src_stride,
4570  uint8_t *dst,
4571  int32_t dst_stride,
4572  const int8_t *filter_x,
4573  const int8_t *filter_y,
4574  int32_t weight,
4575  int32_t offset,
4576  int32_t rnd_val)
4577 {
4578  v16u8 out;
4579  v16i8 src0, src1, src2, src3, src4;
4580  v8i16 filt0, filt1;
4581  v8i16 filt_h0, filt_h1, filter_vec;
4582  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4583  v16i8 mask1;
4584  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
4585  v8i16 dst0, dst1, dst2, dst3, dst4;
4586  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4587  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4588  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4589  v8i16 tmp0, tmp1;
4590  v8i16 offset_vec, const_128, denom_vec;
4591  v4i32 weight_vec, rnd_vec;
4592 
4593  src -= (src_stride + 1);
4594 
4595  filter_vec = LD_SH(filter_x);
4596  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4597 
4598  filter_vec = LD_SH(filter_y);
4599  UNPCK_R_SB_SH(filter_vec, filter_vec);
4600 
4601  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4602 
4603  mask1 = mask0 + 2;
4604 
4605  weight_vec = __msa_fill_w(weight);
4606  rnd_vec = __msa_fill_w(rnd_val);
4607 
4608  offset_vec = __msa_fill_h(offset);
4609  denom_vec = __msa_fill_h(rnd_val - 6);
4610  const_128 = __msa_fill_h((128 * weight));
4611  offset_vec += __msa_srar_h(const_128, denom_vec);
4612 
4613  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
4614  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4615  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4616  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4617  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4618  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
4619  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
4620  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4621  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4622  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4623  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4624  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
4625  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4626  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4627  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4628  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4629  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4630  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4631  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4632  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4633  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4634  MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
4635  MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
4636  SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
4637  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
4638  ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4639  CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
4640  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4641  ST8x2_UB(out, dst, dst_stride);
4642 }
4643 
4644 static void hevc_hv_uniwgt_4t_8multx4_msa(uint8_t *src,
4645  int32_t src_stride,
4646  uint8_t *dst,
4647  int32_t dst_stride,
4648  const int8_t *filter_x,
4649  const int8_t *filter_y,
4650  int32_t width8mult,
4651  int32_t weight,
4652  int32_t offset,
4653  int32_t rnd_val)
4654 {
4655  uint32_t cnt;
4656  v16u8 out0, out1;
4657  v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
4658  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4659  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
4660  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
4661  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4662  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
4663  v8i16 offset_vec, const_128, denom_vec;
4664  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4665  v4i32 weight_vec, rnd_vec;
4666 
4667  src -= (src_stride + 1);
4668 
4669  filter_vec = LD_SH(filter_x);
4670  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4671 
4672  filter_vec = LD_SH(filter_y);
4673  UNPCK_R_SB_SH(filter_vec, filter_vec);
4674 
4675  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4676 
4677  mask0 = LD_SB(ff_hevc_mask_arr);
4678  mask1 = mask0 + 2;
4679 
4680  weight_vec = __msa_fill_w(weight);
4681  rnd_vec = __msa_fill_w(rnd_val);
4682 
4683  offset_vec = __msa_fill_h(offset);
4684  denom_vec = __msa_fill_h(rnd_val - 6);
4685  const_128 = __msa_fill_h((128 * weight));
4686  offset_vec += __msa_srar_h(const_128, denom_vec);
4687 
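 /* One pass per 8-wide column band (width8mult = width / 8), each
  * producing 4 output rows; src and dst advance 8 bytes per pass. */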
4688  for (cnt = width8mult; cnt--;) {
4689  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
4690  src += 8;
4691  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
4692  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4693  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4694  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4695  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4696  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4697  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4698  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4699  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4700  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4701  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4702  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4703  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4704  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4705  dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4706  dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4707  dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4708  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4709  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4710  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4711  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
4712  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4713  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4714  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4715  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4716  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4717  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4718  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4719  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
4720  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4721  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4722  MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
4723  MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
4724  MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
4725  MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
4726  SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
4727  SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
4728  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4729  dst3_r, tmp0, tmp1, tmp2, tmp3);
4730  ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4731  ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
4732  CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
4733  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4734  ST8x4_UB(out0, out1, dst, dst_stride);
4735  dst += 8;
4736  }
4737 }
4738 
4739 static void hevc_hv_uniwgt_4t_8x6_msa(uint8_t *src,
4740  int32_t src_stride,
4741  uint8_t *dst,
4742  int32_t dst_stride,
4743  const int8_t *filter_x,
4744  const int8_t *filter_y,
4745  int32_t weight,
4746  int32_t offset,
4747  int32_t rnd_val)
4748 {
4749  v16u8 out0, out1, out2;
4750  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4751  v8i16 filt0, filt1;
4752  v8i16 filt_h0, filt_h1, filter_vec;
4753  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4754  v16i8 mask1;
4755  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
4756  v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
4757  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
4758  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4759  v4i32 dst4_r, dst4_l, dst5_r, dst5_l, weight_vec, rnd_vec;
4760  v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
4761  v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
4762  v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
4763  v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
4764  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4765  v8i16 offset_vec, const_128, denom_vec;
4766 
4767  src -= (src_stride + 1);
4768 
4769  filter_vec = LD_SH(filter_x);
4770  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4771 
4772  filter_vec = LD_SH(filter_y);
4773  UNPCK_R_SB_SH(filter_vec, filter_vec);
4774 
4775  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4776 
4777  mask1 = mask0 + 2;
4778 
4779  weight_vec = __msa_fill_w(weight);
4780  rnd_vec = __msa_fill_w(rnd_val);
4781 
4782  offset_vec = __msa_fill_h(offset);
4783  denom_vec = __msa_fill_h(rnd_val - 6);
4784  const_128 = __msa_fill_h((128 * weight));
4785  offset_vec += __msa_srar_h(const_128, denom_vec);
4786 
4787  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
4788  src += (5 * src_stride);
4789  LD_SB4(src, src_stride, src5, src6, src7, src8);
4790  XORI_B5_128_SB(src0, src1, src2, src3, src4);
4791  XORI_B4_128_SB(src5, src6, src7, src8);
4792  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4793  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4794  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4795  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
4796  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
4797  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
4798  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
4799  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
4800  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
4801  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4802  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4803  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4804  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4805  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
4806  dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
4807  dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
4808  dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
4809  dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
4810  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4811  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4812  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4813  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4814  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4815  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
4816  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
4817  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
4818  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4819  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4820  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4821  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4822  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4823  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4824  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4825  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
4826  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4827  dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
4828  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4829  dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
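 /* Second-stage results carry 6 fractional bits: shift those out, then
  * weight, round, add the offset and clip to the 8-bit range. */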
4830  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4831  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4832  SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
4833  MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
4834  MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
4835  MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
4836  MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
4837  MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
4838  MUL2(dst4_l, weight_vec, dst5_l, weight_vec, dst4_l, dst5_l);
4839  SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
4840  SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
4841  SRAR_W4_SW(dst4_r, dst4_l, dst5_r, dst5_l, rnd_vec);
4842  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
4843  tmp0, tmp1, tmp2, tmp3);
4844  PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
4845  ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4846  ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
4847  ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
4848  CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
4849  CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
4850  PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
4851  ST8x4_UB(out0, out1, dst, dst_stride);
4852  dst += (4 * dst_stride);
4853  ST8x2_UB(out2, dst, dst_stride);
4854 }
4855 
4856 static void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src,
4857  int32_t src_stride,
4858  uint8_t *dst,
4859  int32_t dst_stride,
4860  const int8_t *filter_x,
4861  const int8_t *filter_y,
4862  int32_t height,
4863  int32_t weight,
4864  int32_t offset,
4865  int32_t rnd_val,
4866  int32_t width8mult)
4867 {
4868  uint32_t loop_cnt, cnt;
4869  uint8_t *src_tmp;
4870  uint8_t *dst_tmp;
4871  v16u8 out0, out1;
4872  v16i8 src0, src1, src2, src3, src4, src5, src6;
4873  v8i16 filt0, filt1;
4874  v8i16 filt_h0, filt_h1, filter_vec;
4875  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4876  v16i8 mask1;
4877  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4878  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
4879  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4880  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
4881  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4882  v8i16 offset_vec, const_128, denom_vec;
4883  v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
4884  v4i32 weight_vec, rnd_vec;
4885 
4886  src -= (src_stride + 1);
4887 
4888  filter_vec = LD_SH(filter_x);
4889  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4890 
4891  filter_vec = LD_SH(filter_y);
4892  UNPCK_R_SB_SH(filter_vec, filter_vec);
4893 
4894  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4895 
4896  mask1 = mask0 + 2;
4897 
4898  weight_vec = __msa_fill_w(weight);
4899  rnd_vec = __msa_fill_w(rnd_val);
4900 
4901  offset_vec = __msa_fill_h(offset);
4902  denom_vec = __msa_fill_h(rnd_val - 6);
4903  const_128 = __msa_fill_h((128 * weight));
4904  offset_vec += __msa_srar_h(const_128, denom_vec);
4905 
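 /* Walk the block as width8mult independent 8-pixel-wide columns; each
  * column keeps its own three rows of vertical filter history. */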
4906  for (cnt = width8mult; cnt--;) {
4907  src_tmp = src;
4908  dst_tmp = dst;
4909 
4910  LD_SB3(src_tmp, src_stride, src0, src1, src2);
4911  src_tmp += (3 * src_stride);
4912  XORI_B3_128_SB(src0, src1, src2);
4913 
4914  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4915  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4916  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4917  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4918  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4919  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4920 
4921  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4922  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4923 
4924  for (loop_cnt = height >> 2; loop_cnt--;) {
4925  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
4926  src_tmp += (4 * src_stride);
4927  XORI_B4_128_SB(src3, src4, src5, src6);
4928 
4929  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4930  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4931  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4932  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4933  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4934  dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4935  dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4936  dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4937  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4938  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4939  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4940  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
4941  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4942  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4943  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4944  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4945  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4946  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4947  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4948  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
4949  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4950  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4951  MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
4952  MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
4953  MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
4954  MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
4955  SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
4956  SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
4957  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4958  dst3_r, tmp0, tmp1, tmp2, tmp3);
4959  ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4960  ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
4961  CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
4962  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4963  ST8x4_UB(out0, out1, dst_tmp, dst_stride);
4964  dst_tmp += (4 * dst_stride);
4965 
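 /* Slide the vertical history down by four rows so the next iteration
  * only has to load and filter four fresh input rows. */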
4966  dst10_r = dst54_r;
4967  dst10_l = dst54_l;
4968  dst21_r = dst65_r;
4969  dst21_l = dst65_l;
4970  dst2 = dst6;
4971  }
4972 
4973  src += 8;
4974  dst += 8;
4975  }
4976 }
4977 
4978 static void hevc_hv_uniwgt_4t_8w_msa(uint8_t *src,
4979  int32_t src_stride,
4980  uint8_t *dst,
4981  int32_t dst_stride,
4982  const int8_t *filter_x,
4983  const int8_t *filter_y,
4984  int32_t height,
4985  int32_t weight,
4986  int32_t offset,
4987  int32_t rnd_val)
4988 {
4989 
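 /* Dispatch on height: 2 and 6 use fixed-size kernels; any other
  * multiple of 4 goes through the generic 8-wide path (width8mult = 1). */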
4990  if (2 == height) {
4991  hevc_hv_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
4992  filter_x, filter_y, weight,
4993  offset, rnd_val);
4994  } else if (4 == height) {
4995  hevc_hv_uniwgt_4t_8multx4_msa(src, src_stride, dst, dst_stride,
4996  filter_x, filter_y, 1, weight,
4997  offset, rnd_val);
4998  } else if (6 == height) {
4999  hevc_hv_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
5000  filter_x, filter_y, weight,
5001  offset, rnd_val);
5002  } else if (0 == (height % 4)) {
5003  hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
5004  filter_x, filter_y, height, weight,
5005  offset, rnd_val, 1);
5006  }
5007 }
5008 
5009 static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src,
5010  int32_t src_stride,
5011  uint8_t *dst,
5012  int32_t dst_stride,
5013  const int8_t *filter_x,
5014  const int8_t *filter_y,
5015  int32_t height,
5016  int32_t weight,
5017  int32_t offset,
5018  int32_t rnd_val)
5019 {
5020  uint32_t loop_cnt;
5021  uint8_t *src_tmp, *dst_tmp;
5022  v16u8 out0, out1;
5023  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5024  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5025  v16i8 mask0, mask1, mask2, mask3;
5026  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
5027  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5028  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
5029  v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
5030  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5031  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5032  v8i16 offset_vec, const_128, denom_vec;
5033  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5034  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec;
5035 
5036  src -= (src_stride + 1);
5037 
5038  filter_vec = LD_SH(filter_x);
5039  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5040 
5041  filter_vec = LD_SH(filter_y);
5042  UNPCK_R_SB_SH(filter_vec, filter_vec);
5043 
5044  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5045 
5046  mask0 = LD_SB(ff_hevc_mask_arr);
5047  mask1 = mask0 + 2;
5048 
5049  weight_vec = __msa_fill_w(weight);
5050  rnd_vec = __msa_fill_w(rnd_val);
5051 
5052  offset_vec = __msa_fill_h(offset);
5053  denom_vec = __msa_fill_h(rnd_val - 6);
5054  const_128 = __msa_fill_h((128 * weight));
5055  offset_vec += __msa_srar_h(const_128, denom_vec);
5056 
5057  src_tmp = src;
5058  dst_tmp = dst;
5059 
5060  LD_SB3(src_tmp, src_stride, src0, src1, src2);
5061  src_tmp += (3 * src_stride);
5062  XORI_B3_128_SB(src0, src1, src2);
5063  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5064  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5065  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5066  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5067  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5068  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5069  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5070  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5071 
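 /* 8-pixel column first; the 12w kernel hard-codes four 4-row passes,
  * i.e. it relies on 12-wide chroma blocks being 12x16. */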
5072  for (loop_cnt = 4; loop_cnt--;) {
5073  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
5074  src_tmp += (4 * src_stride);
5075  XORI_B4_128_SB(src3, src4, src5, src6);
5076  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5077  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5078  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5079  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5080  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5081  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5082  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5083  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5084  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5085  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5086  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5087  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5088  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5089  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5090  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5091  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5092  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5093  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5094  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5095  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5096  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5097  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5098  MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
5099  MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
5100  MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
5101  MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
5102  SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
5103  SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
5104  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5105  dst3_r, tmp0, tmp1, tmp2, tmp3);
5106  ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
5107  ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
5108  CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
5109  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5110  ST8x4_UB(out0, out1, dst_tmp, dst_stride);
5111  dst_tmp += (4 * dst_stride);
5112 
5113  dst10_r = dst54_r;
5114  dst10_l = dst54_l;
5115  dst21_r = dst65_r;
5116  dst21_l = dst65_l;
5117  dsth2 = dsth6;
5118  }
5119 
5120  src += 8;
5121  dst += 8;
5122 
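 /* Remaining 4-pixel column: mask2/mask3 come from the second half of
  * ff_hevc_mask_arr and shuffle two input rows into one vector, so each
  * inner-loop pass below produces eight output rows. */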
5123  mask2 = LD_SB(ff_hevc_mask_arr + 16);
5124  mask3 = mask2 + 2;
5125 
5126  LD_SB3(src, src_stride, src0, src1, src2);
5127  src += (3 * src_stride);
5128  XORI_B3_128_SB(src0, src1, src2);
5129  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
5130  VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
5131  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5132  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5133  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
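 /* dst21 packs filtered rows 1 and 2; duplicating its upper doubleword
  * isolates row 2 so it can pair with row 3 of the next batch. */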
5134  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
5135 
5136  for (loop_cnt = 2; loop_cnt--;) {
5137  LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9,
5138  src10);
5139  src += (8 * src_stride);
5140  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
5141  VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
5142  VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
5143  VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
5144  VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
5145  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5146  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5147  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5148  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5149  dst32_r = __msa_ilvr_h(dst73, dst22);
5150  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
5151  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
5152  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
5153  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
5154  dst76_r = __msa_ilvr_h(dst22, dst106);
5155  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5156  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5157  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5158  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5159  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5160  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5161  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
5162  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
5163  SRA_4V(dst0, dst1, dst2, dst3, 6);
5164  SRA_4V(dst4, dst5, dst6, dst7, 6);
5165  MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
5166  MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
5167  MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
5168  MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
5169  SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5170  SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5171  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
5172  tmp2, tmp3);
5173  ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
5174  ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
5175  CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
5176  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5177  ST4x8_UB(out0, out1, dst, dst_stride);
5178  dst += (8 * dst_stride);
5179 
5180  dst10_r = dst98_r;
5181  dst21_r = dst109_r;
5182  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
5183  }
5184 }
5185 
5186 static void hevc_hv_uniwgt_4t_16w_msa(uint8_t *src,
5187  int32_t src_stride,
5188  uint8_t *dst,
5189  int32_t dst_stride,
5190  const int8_t *filter_x,
5191  const int8_t *filter_y,
5192  int32_t height,
5193  int32_t weight,
5194  int32_t offset,
5195  int32_t rnd_val)
5196 {
5197  if (4 == height) {
5198  hevc_hv_uniwgt_4t_8multx4_msa(src, src_stride, dst, dst_stride,
5199  filter_x, filter_y, 2, weight, offset,
5200  rnd_val);
5201  } else {
5202  hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
5203  filter_x, filter_y, height, weight,
5204  offset, rnd_val, 2);
5205  }
5206 }
5207 
5208 static void hevc_hv_uniwgt_4t_24w_msa(uint8_t *src,
5209  int32_t src_stride,
5210  uint8_t *dst,
5211  int32_t dst_stride,
5212  const int8_t *filter_x,
5213  const int8_t *filter_y,
5214  int32_t height,
5215  int32_t weight,
5216  int32_t offset,
5217  int32_t rnd_val)
5218 {
5219  hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
5220  filter_x, filter_y, height, weight,
5221  offset, rnd_val, 3);
5222 }
5223 
5224 static void hevc_hv_uniwgt_4t_32w_msa(uint8_t *src,
5225  int32_t src_stride,
5226  uint8_t *dst,
5227  int32_t dst_stride,
5228  const int8_t *filter_x,
5229  const int8_t *filter_y,
5230  int32_t height,
5231  int32_t weight,
5232  int32_t offset,
5233  int32_t rnd_val)
5234 {
5235  hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
5236  filter_x, filter_y, height, weight,
5237  offset, rnd_val, 4);
5238 }
5239 
5240 #define UNIWGT_MC_COPY(WIDTH) \
5241 void ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
5242  ptrdiff_t dst_stride, \
5243  uint8_t *src, \
5244  ptrdiff_t src_stride, \
5245  int height, \
5246  int denom, \
5247  int weight, \
5248  int offset, \
5249  intptr_t mx, \
5250  intptr_t my, \
5251  int width) \
5252 { \
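 /* HEVC keeps 14-bit intermediates; for 8-bit output the uni weighted \
  * prediction shift is denom + 14 - 8. */ \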
5253  int shift = denom + 14 - 8; \
5254  hevc_uniwgt_copy_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
5255  height, weight, offset, shift); \
5256 }
5257 
5258 UNIWGT_MC_COPY(4);
5259 UNIWGT_MC_COPY(6);
5260 UNIWGT_MC_COPY(8);
5261 UNIWGT_MC_COPY(12);
5262 UNIWGT_MC_COPY(16);
5263 UNIWGT_MC_COPY(24);
5264 UNIWGT_MC_COPY(32);
5265 UNIWGT_MC_COPY(48);
5266 UNIWGT_MC_COPY(64);
5267 
5268 #undef UNIWGT_MC_COPY
5269 
5270 #define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
5271 void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
5272  ptrdiff_t \
5273  dst_stride, \
5274  uint8_t *src, \
5275  ptrdiff_t \
5276  src_stride, \
5277  int height, \
5278  int denom, \
5279  int weight, \
5280  int offset, \
5281  intptr_t mx, \
5282  intptr_t my, \
5283  int width) \
5284 { \
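 /* FILT_DIR expands to mx or my; the 1-based fractional position picks \
  * the coefficient set from the ff_hevc_*_filters table. */ \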
5285  const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
5286  int shift = denom + 14 - 8; \
5287  \
5288  hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \
5289  dst_stride, filter, height, \
5290  weight, offset, shift); \
5291 }
5292 
5293 UNI_W_MC(qpel, h, 4, 8, hz, mx);
5294 UNI_W_MC(qpel, h, 8, 8, hz, mx);
5295 UNI_W_MC(qpel, h, 12, 8, hz, mx);
5296 UNI_W_MC(qpel, h, 16, 8, hz, mx);
5297 UNI_W_MC(qpel, h, 24, 8, hz, mx);
5298 UNI_W_MC(qpel, h, 32, 8, hz, mx);
5299 UNI_W_MC(qpel, h, 48, 8, hz, mx);
5300 UNI_W_MC(qpel, h, 64, 8, hz, mx);
5301 
5302 UNI_W_MC(qpel, v, 4, 8, vt, my);
5303 UNI_W_MC(qpel, v, 8, 8, vt, my);
5304 UNI_W_MC(qpel, v, 12, 8, vt, my);
5305 UNI_W_MC(qpel, v, 16, 8, vt, my);
5306 UNI_W_MC(qpel, v, 24, 8, vt, my);
5307 UNI_W_MC(qpel, v, 32, 8, vt, my);
5308 UNI_W_MC(qpel, v, 48, 8, vt, my);
5309 UNI_W_MC(qpel, v, 64, 8, vt, my);
5310 
5311 UNI_W_MC(epel, h, 4, 4, hz, mx);
5312 UNI_W_MC(epel, h, 6, 4, hz, mx);
5313 UNI_W_MC(epel, h, 8, 4, hz, mx);
5314 UNI_W_MC(epel, h, 12, 4, hz, mx);
5315 UNI_W_MC(epel, h, 16, 4, hz, mx);
5316 UNI_W_MC(epel, h, 24, 4, hz, mx);
5317 UNI_W_MC(epel, h, 32, 4, hz, mx);
5318 
5319 UNI_W_MC(epel, v, 4, 4, vt, my);
5320 UNI_W_MC(epel, v, 6, 4, vt, my);
5321 UNI_W_MC(epel, v, 8, 4, vt, my);
5322 UNI_W_MC(epel, v, 12, 4, vt, my);
5323 UNI_W_MC(epel, v, 16, 4, vt, my);
5324 UNI_W_MC(epel, v, 24, 4, vt, my);
5325 UNI_W_MC(epel, v, 32, 4, vt, my);
5326 
5327 #undef UNI_W_MC
5328 
5329 #define UNI_W_MC_HV(PEL, WIDTH, TAP) \
5330 void ff_hevc_put_hevc_uni_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
5331  ptrdiff_t dst_stride, \
5332  uint8_t *src, \
5333  ptrdiff_t src_stride, \
5334  int height, \
5335  int denom, \
5336  int weight, \
5337  int offset, \
5338  intptr_t mx, \
5339  intptr_t my, \
5340  int width) \
5341 { \
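 /* hv case: mx selects the horizontal filter, my the vertical one */ \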
5342  const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
5343  const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
5344  int shift = denom + 14 - 8; \
5345  \
5346  hevc_hv_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
5347  filter_x, filter_y, height, \
5348  weight, offset, shift); \
5349 }
5350 
5351 UNI_W_MC_HV(qpel, 4, 8);
5352 UNI_W_MC_HV(qpel, 8, 8);
5353 UNI_W_MC_HV(qpel, 12, 8);
5354 UNI_W_MC_HV(qpel, 16, 8);
5355 UNI_W_MC_HV(qpel, 24, 8);
5356 UNI_W_MC_HV(qpel, 32, 8);
5357 UNI_W_MC_HV(qpel, 48, 8);
5358 UNI_W_MC_HV(qpel, 64, 8);
5359 
5360 UNI_W_MC_HV(epel, 4, 4);
5361 UNI_W_MC_HV(epel, 6, 4);
5362 UNI_W_MC_HV(epel, 8, 4);
5363 UNI_W_MC_HV(epel, 12, 4);
5364 UNI_W_MC_HV(epel, 16, 4);
5365 UNI_W_MC_HV(epel, 24, 4);
5366 UNI_W_MC_HV(epel, 32, 4);
5367 
5368 #undef UNI_W_MC_HV