hevc_mc_uni_msa.c
1 /*
2  * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "libavcodec/mips/hevcdsp_mips.h"
23 #include "libavcodec/mips/hevc_macros_msa.h"
24 
25 static void copy_width8_msa(uint8_t *src, int32_t src_stride,
26  uint8_t *dst, int32_t dst_stride,
27  int32_t height)
28 {
29  int32_t cnt;
30  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
31  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
32 
33  if (0 == height % 12) {
34  for (cnt = (height / 12); cnt--;) {
35  LD_UB8(src, src_stride,
36  src0, src1, src2, src3, src4, src5, src6, src7);
37  src += (8 * src_stride);
38 
39  out0 = __msa_copy_u_d((v2i64) src0, 0);
40  out1 = __msa_copy_u_d((v2i64) src1, 0);
41  out2 = __msa_copy_u_d((v2i64) src2, 0);
42  out3 = __msa_copy_u_d((v2i64) src3, 0);
43  out4 = __msa_copy_u_d((v2i64) src4, 0);
44  out5 = __msa_copy_u_d((v2i64) src5, 0);
45  out6 = __msa_copy_u_d((v2i64) src6, 0);
46  out7 = __msa_copy_u_d((v2i64) src7, 0);
47 
48  SD4(out0, out1, out2, out3, dst, dst_stride);
49  dst += (4 * dst_stride);
50  SD4(out4, out5, out6, out7, dst, dst_stride);
51  dst += (4 * dst_stride);
52 
53  LD_UB4(src, src_stride, src0, src1, src2, src3);
54  src += (4 * src_stride);
55 
56  out0 = __msa_copy_u_d((v2i64) src0, 0);
57  out1 = __msa_copy_u_d((v2i64) src1, 0);
58  out2 = __msa_copy_u_d((v2i64) src2, 0);
59  out3 = __msa_copy_u_d((v2i64) src3, 0);
60 
61  SD4(out0, out1, out2, out3, dst, dst_stride);
62  dst += (4 * dst_stride);
63  }
64  } else if (0 == height % 8) {
65  for (cnt = height >> 3; cnt--;) {
66  LD_UB8(src, src_stride,
67  src0, src1, src2, src3, src4, src5, src6, src7);
68  src += (8 * src_stride);
69 
70  out0 = __msa_copy_u_d((v2i64) src0, 0);
71  out1 = __msa_copy_u_d((v2i64) src1, 0);
72  out2 = __msa_copy_u_d((v2i64) src2, 0);
73  out3 = __msa_copy_u_d((v2i64) src3, 0);
74  out4 = __msa_copy_u_d((v2i64) src4, 0);
75  out5 = __msa_copy_u_d((v2i64) src5, 0);
76  out6 = __msa_copy_u_d((v2i64) src6, 0);
77  out7 = __msa_copy_u_d((v2i64) src7, 0);
78 
79  SD4(out0, out1, out2, out3, dst, dst_stride);
80  dst += (4 * dst_stride);
81  SD4(out4, out5, out6, out7, dst, dst_stride);
82  dst += (4 * dst_stride);
83  }
84  } else if (0 == height % 4) {
85  for (cnt = (height / 4); cnt--;) {
86  LD_UB4(src, src_stride, src0, src1, src2, src3);
87  src += (4 * src_stride);
88  out0 = __msa_copy_u_d((v2i64) src0, 0);
89  out1 = __msa_copy_u_d((v2i64) src1, 0);
90  out2 = __msa_copy_u_d((v2i64) src2, 0);
91  out3 = __msa_copy_u_d((v2i64) src3, 0);
92 
93  SD4(out0, out1, out2, out3, dst, dst_stride);
94  dst += (4 * dst_stride);
95  }
96  } else if (0 == height % 2) {
97  for (cnt = (height / 2); cnt--;) {
98  LD_UB2(src, src_stride, src0, src1);
99  src += (2 * src_stride);
100  out0 = __msa_copy_u_d((v2i64) src0, 0);
101  out1 = __msa_copy_u_d((v2i64) src1, 0);
102 
103  SD(out0, dst);
104  dst += dst_stride;
105  SD(out1, dst);
106  dst += dst_stride;
107  }
108  }
109 }
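/* Editor's note: copy_width8_msa dispatches on the largest height multiple
 * (12, 8, 4, 2) so each iteration amortizes the vector loads, and each store
 * writes the low 64 bits of a vector (__msa_copy_u_d, lane 0).  A minimal
 * scalar sketch of the same behaviour, assuming <string.h> is available: */
static void copy_width8_ref(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t row;

    for (row = 0; row < height; row++) {
        memcpy(dst, src, 8);          /* one 8-byte row per iteration */
        src += src_stride;
        dst += dst_stride;
    }
}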
110 
111 static void copy_width12_msa(uint8_t *src, int32_t src_stride,
112  uint8_t *dst, int32_t dst_stride,
113  int32_t height)
114 {
115  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
116 
117  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
118  src += (8 * src_stride);
119  ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
120  dst += (8 * dst_stride);
121  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
122  ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
123 }
124 
125 static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride,
126  uint8_t *dst, int32_t dst_stride,
127  int32_t height, int32_t width)
128 {
129  int32_t cnt, loop_cnt;
130  uint8_t *src_tmp, *dst_tmp;
131  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
132 
133  for (cnt = (width >> 4); cnt--;) {
134  src_tmp = src;
135  dst_tmp = dst;
136 
137  for (loop_cnt = (height >> 3); loop_cnt--;) {
138  LD_UB8(src_tmp, src_stride,
139  src0, src1, src2, src3, src4, src5, src6, src7);
140  src_tmp += (8 * src_stride);
141 
142  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
143  dst_tmp, dst_stride);
144  dst_tmp += (8 * dst_stride);
145  }
146 
147  src += 16;
148  dst += 16;
149  }
150 }
151 
152 static void copy_width16_msa(uint8_t *src, int32_t src_stride,
153  uint8_t *dst, int32_t dst_stride,
154  int32_t height)
155 {
156  int32_t cnt;
157  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
158 
159  if (0 == height % 12) {
160  for (cnt = (height / 12); cnt--;) {
161  LD_UB8(src, src_stride,
162  src0, src1, src2, src3, src4, src5, src6, src7);
163  src += (8 * src_stride);
164  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
165  dst, dst_stride);
166  dst += (8 * dst_stride);
167 
168  LD_UB4(src, src_stride, src0, src1, src2, src3);
169  src += (4 * src_stride);
170  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
171  dst += (4 * dst_stride);
172  }
173  } else if (0 == height % 8) {
174  copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
175  } else if (0 == height % 4) {
176  for (cnt = (height >> 2); cnt--;) {
177  LD_UB4(src, src_stride, src0, src1, src2, src3);
178  src += (4 * src_stride);
179 
180  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
181  dst += (4 * dst_stride);
182  }
183  }
184 }
185 
186 static void copy_width24_msa(uint8_t *src, int32_t src_stride,
187  uint8_t *dst, int32_t dst_stride,
188  int32_t height)
189 {
190  copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
191  copy_width8_msa(src + 16, src_stride, dst + 16, dst_stride, height);
192 }
193 
194 static void copy_width32_msa(uint8_t *src, int32_t src_stride,
195  uint8_t *dst, int32_t dst_stride,
196  int32_t height)
197 {
198  int32_t cnt;
199  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
200 
201  if (0 == height % 12) {
202  for (cnt = (height / 12); cnt--;) {
203  LD_UB4(src, src_stride, src0, src1, src2, src3);
204  LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
205  src += (4 * src_stride);
206  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
207  ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
208  dst += (4 * dst_stride);
209 
210  LD_UB4(src, src_stride, src0, src1, src2, src3);
211  LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
212  src += (4 * src_stride);
213  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
214  ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
215  dst += (4 * dst_stride);
216 
217  LD_UB4(src, src_stride, src0, src1, src2, src3);
218  LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
219  src += (4 * src_stride);
220  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
221  ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
222  dst += (4 * dst_stride);
223  }
224  } else if (0 == height % 8) {
225  copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
226  } else if (0 == height % 4) {
227  for (cnt = (height >> 2); cnt--;) {
228  LD_UB4(src, src_stride, src0, src1, src2, src3);
229  LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
230  src += (4 * src_stride);
231  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
232  ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
233  dst += (4 * dst_stride);
234  }
235  }
236 }
237 
238 static void copy_width48_msa(uint8_t *src, int32_t src_stride,
239  uint8_t *dst, int32_t dst_stride,
240  int32_t height)
241 {
242  copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 48);
243 }
244 
245 static void copy_width64_msa(uint8_t *src, int32_t src_stride,
246  uint8_t *dst, int32_t dst_stride,
247  int32_t height)
248 {
249  copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
250 }
251 
252 static const uint8_t mc_filt_mask_arr[16 * 3] = {
253  /* 8 width cases */
254  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
255  /* 4 width cases */
256  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
257  /* 4 width cases */
258  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
259 };
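/* Editor's note: mc_filt_mask_arr feeds the VSHF_B byte shuffles below.  Each
 * mask byte selects one source byte: values 0-15 index the first shuffle
 * operand and 16-31 the second, which is how the 4-width rows pair pixels
 * from two different source vectors.  Deriving mask1..mask3 as mask0 + 2/4/6
 * slides the same pairing pattern along to the next filter tap pair. */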
260 
261 #define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \
262  filt0, filt1, filt2, filt3) \
263 ( { \
264  v8i16 tmp0, tmp1; \
265  \
266  tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \
267  tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \
268  tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2); \
269  tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3); \
270  tmp0 = __msa_adds_s_h(tmp0, tmp1); \
271  \
272  tmp0; \
273 } )
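/* Editor's sketch (an interpretation, not code from the original file):
 * FILT_8TAP_DPADD_S_H evaluates one 8-tap filter as two 4-tap dot products
 * joined by a saturating add.  Ignoring the int16 wrap-around of the
 * intermediate accumulators, a per-pixel scalar equivalent (px holds the
 * source bytes after the XOR-128 signedness flip) would be: */
static inline int16_t filt_8tap_ref(const int8_t *px, const int8_t *coef)
{
    int32_t lo = 0, hi = 0, sum, k;

    for (k = 0; k < 4; k++)
        lo += px[k] * coef[k];    /* taps 0..3: the filt0/filt1 dotp + dpadd */
    for (k = 4; k < 8; k++)
        hi += px[k] * coef[k];    /* taps 4..7: the filt2/filt3 dotp + dpadd */
    sum = lo + hi;                /* __msa_adds_s_h saturates this final add */
    if (sum < INT16_MIN)
        sum = INT16_MIN;
    if (sum > INT16_MAX)
        sum = INT16_MAX;
    return (int16_t) sum;
}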
274 
275 #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
276  mask0, mask1, mask2, mask3, \
277  filt0, filt1, filt2, filt3, \
278  out0, out1) \
279 { \
280  v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
281  v8i16 res0_m, res1_m, res2_m, res3_m; \
282  \
283  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
284  DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \
285  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
286  DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \
287  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
288  DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \
289  VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
290  DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \
291  ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \
292 }
293 
294 #define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
295  mask0, mask1, mask2, mask3, \
296  filt0, filt1, filt2, filt3, \
297  out0, out1, out2, out3) \
298 { \
299  v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
300  v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
301  \
302  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
303  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
304  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
305  res0_m, res1_m, res2_m, res3_m); \
306  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
307  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
308  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
309  res4_m, res5_m, res6_m, res7_m); \
310  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
311  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
312  DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
313  res0_m, res1_m, res2_m, res3_m); \
314  VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
315  VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
316  DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
317  res4_m, res5_m, res6_m, res7_m); \
318  ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
319  res7_m, out0, out1, out2, out3); \
320 }
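/* Editor's note: the HORIZ_8TAP_*WID_4VECS_FILT macros vectorize the same
 * arithmetic for four source vectors at a time: VSHF_B2_SB gathers each tap
 * window through the masks, DOTP_* starts the partial sums, DPADD_* folds in
 * the next tap pair, and ADDS_SH* merges the tap-0..3 and tap-4..7 halves
 * with saturation, exactly as FILT_8TAP_DPADD_S_H does for one vector. */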
321 
322 #define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \
323 ( { \
324  v8i16 tmp0; \
325  \
326  tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \
327  tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \
328  \
329  tmp0; \
330 } )
331 
332 #define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
333  mask0, mask1, filt0, filt1, \
334  out0, out1) \
335 { \
336  v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
337  \
338  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
339  DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \
340  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
341  DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \
342 }
343 
344 #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
345  mask0, mask1, filt0, filt1, \
346  out0, out1, out2, out3) \
347 { \
348  v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
349  \
350  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
351  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
352  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
353  out0, out1, out2, out3); \
354  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \
355  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \
356  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
357  out0, out1, out2, out3); \
358 }
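/* Editor's note: the 4-tap variants follow the same scheme with only two
 * coefficient pairs (filt0/filt1) and two masks, so one DOTP plus one DPADD
 * per output vector suffices and no saturating merge of halves is needed. */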
359 
360 static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
361  uint8_t *dst, int32_t dst_stride,
362  const int8_t *filter, uint8_t rnd_val)
363 {
364  v16u8 mask0, mask1, mask2, mask3, out;
365  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
366  v8i16 filt, out0, out1;
367  v8i16 rnd_vec;
368 
369  mask0 = LD_UB(&mc_filt_mask_arr[16]);
370  src -= 3;
371  rnd_vec = __msa_fill_h(rnd_val);
372 
373  /* rearranging filter */
374  filt = LD_SH(filter);
375  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
376 
377  mask1 = mask0 + 2;
378  mask2 = mask0 + 4;
379  mask3 = mask0 + 6;
380 
381  LD_SB4(src, src_stride, src0, src1, src2, src3);
382  XORI_B4_128_SB(src0, src1, src2, src3);
383  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
384  mask3, filt0, filt1, filt2, filt3, out0, out1);
385  SRAR_H2_SH(out0, out1, rnd_vec);
386  SAT_SH2_SH(out0, out1, 7);
387  out = PCKEV_XORI128_UB(out0, out1);
388  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
389 }
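/* Editor's sketch (an interpretation): every common_hz_8t_* function ends
 * with the same round/saturate/pack sequence.  For a single filtered sum
 * `acc`, assuming rnd_val >= 1, it behaves like: */
static inline uint8_t hz_8t_store_ref(int32_t acc, uint8_t rnd_val)
{
    int32_t v = (acc + (1 << (rnd_val - 1))) >> rnd_val; /* SRAR_H*: rounded
                                                            arithmetic shift */
    if (v < -128)
        v = -128;                 /* SAT_SH*(.., 7): clamp to signed 8 bits */
    if (v > 127)
        v = 127;
    return (uint8_t) (v + 128);   /* PCKEV_XORI128_UB: pack bytes and undo
                                     the XOR-128 bias applied on load */
}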
390 
391 static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
392  uint8_t *dst, int32_t dst_stride,
393  const int8_t *filter, uint8_t rnd_val)
394 {
395  v16i8 filt0, filt1, filt2, filt3;
396  v16i8 src0, src1, src2, src3;
397  v16u8 mask0, mask1, mask2, mask3, out;
398  v8i16 filt, out0, out1, out2, out3;
399  v8i16 rnd_vec;
400 
401  mask0 = LD_UB(&mc_filt_mask_arr[16]);
402  src -= 3;
403  rnd_vec = __msa_fill_h(rnd_val);
404 
405  /* rearranging filter */
406  filt = LD_SH(filter);
407  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
408 
409  mask1 = mask0 + 2;
410  mask2 = mask0 + 4;
411  mask3 = mask0 + 6;
412 
413  LD_SB4(src, src_stride, src0, src1, src2, src3);
414  XORI_B4_128_SB(src0, src1, src2, src3);
415  src += (4 * src_stride);
416  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
417  mask3, filt0, filt1, filt2, filt3, out0, out1);
418  LD_SB4(src, src_stride, src0, src1, src2, src3);
419  XORI_B4_128_SB(src0, src1, src2, src3);
420  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
421  mask3, filt0, filt1, filt2, filt3, out2, out3);
422  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
423  SAT_SH4_SH(out0, out1, out2, out3, 7);
424  out = PCKEV_XORI128_UB(out0, out1);
425  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
426  dst += (4 * dst_stride);
427  out = PCKEV_XORI128_UB(out2, out3);
428  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
429 }
430 
431 static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
432  uint8_t *dst, int32_t dst_stride,
433  const int8_t *filter, uint8_t rnd_val)
434 {
435  v16u8 mask0, mask1, mask2, mask3, out;
436  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
437  v8i16 filt, out0, out1, out2, out3;
438  v8i16 rnd_vec;
439 
440  mask0 = LD_UB(&mc_filt_mask_arr[16]);
441  src -= 3;
442  rnd_vec = __msa_fill_h(rnd_val);
443 
444  /* rearranging filter */
445  filt = LD_SH(filter);
446  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
447 
448  mask1 = mask0 + 2;
449  mask2 = mask0 + 4;
450  mask3 = mask0 + 6;
451 
452  LD_SB4(src, src_stride, src0, src1, src2, src3);
453  XORI_B4_128_SB(src0, src1, src2, src3);
454  src += (4 * src_stride);
455  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
456  mask3, filt0, filt1, filt2, filt3, out0, out1);
457  LD_SB4(src, src_stride, src0, src1, src2, src3);
458  XORI_B4_128_SB(src0, src1, src2, src3);
459  src += (4 * src_stride);
460  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
461  mask3, filt0, filt1, filt2, filt3, out2, out3);
462  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
463  SAT_SH4_SH(out0, out1, out2, out3, 7);
464  out = PCKEV_XORI128_UB(out0, out1);
465  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
466  dst += (4 * dst_stride);
467  out = PCKEV_XORI128_UB(out2, out3);
468  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
469  dst += (4 * dst_stride);
470 
471  LD_SB4(src, src_stride, src0, src1, src2, src3);
472  XORI_B4_128_SB(src0, src1, src2, src3);
473  src += (4 * src_stride);
474  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
475  mask3, filt0, filt1, filt2, filt3, out0, out1);
476  LD_SB4(src, src_stride, src0, src1, src2, src3);
477  XORI_B4_128_SB(src0, src1, src2, src3);
478  src += (4 * src_stride);
479  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
480  mask3, filt0, filt1, filt2, filt3, out2, out3);
481 
482  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
483  SAT_SH4_SH(out0, out1, out2, out3, 7);
484  out = PCKEV_XORI128_UB(out0, out1);
485  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
486  dst += (4 * dst_stride);
487  out = PCKEV_XORI128_UB(out2, out3);
488  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
489 }
490 
491 static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
492  uint8_t *dst, int32_t dst_stride,
493  const int8_t *filter, int32_t height, uint8_t rnd_val)
494 {
495  if (4 == height) {
496  common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
497  } else if (8 == height) {
498  common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
499  } else if (16 == height) {
500  common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter,
501  rnd_val);
502  }
503 }
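/* Editor's note: only heights 4, 8 and 16 are dispatched here; any other
 * height falls through without storing, presumably because these are the
 * only 4-wide block heights the HEVC callers ever request. */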
504 
505 static void common_hz_8t_8x4_msa(uint8_t *src, int32_t src_stride,
506  uint8_t *dst, int32_t dst_stride,
507  const int8_t *filter, uint8_t rnd_val)
508 {
509  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
510  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
511  v8i16 filt, out0, out1, out2, out3;
512  v8i16 rnd_vec;
513 
514  mask0 = LD_UB(&mc_filt_mask_arr[0]);
515  src -= 3;
516  rnd_vec = __msa_fill_h(rnd_val);
517 
518  /* rearranging filter */
519  filt = LD_SH(filter);
520  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
521 
522  mask1 = mask0 + 2;
523  mask2 = mask0 + 4;
524  mask3 = mask0 + 6;
525 
526  LD_SB4(src, src_stride, src0, src1, src2, src3);
527  XORI_B4_128_SB(src0, src1, src2, src3);
528  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
529  mask3, filt0, filt1, filt2, filt3, out0, out1,
530  out2, out3);
531  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
532  SAT_SH4_SH(out0, out1, out2, out3, 7);
533  tmp0 = PCKEV_XORI128_UB(out0, out1);
534  tmp1 = PCKEV_XORI128_UB(out2, out3);
535  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
536 }
537 
538 static void common_hz_8t_8x8mult_msa(uint8_t *src, int32_t src_stride,
539  uint8_t *dst, int32_t dst_stride,
540  const int8_t *filter, int32_t height,
541  uint8_t rnd_val)
542 {
543  uint32_t loop_cnt;
544  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
545  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
546  v8i16 filt, out0, out1, out2, out3;
547  v8i16 rnd_vec;
548 
549  mask0 = LD_UB(&mc_filt_mask_arr[0]);
550  src -= 3;
551  rnd_vec = __msa_fill_h(rnd_val);
552 
553  /* rearranging filter */
554  filt = LD_SH(filter);
555  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
556 
557  mask1 = mask0 + 2;
558  mask2 = mask0 + 4;
559  mask3 = mask0 + 6;
560 
561  for (loop_cnt = (height >> 2); loop_cnt--;) {
562  LD_SB4(src, src_stride, src0, src1, src2, src3);
563  XORI_B4_128_SB(src0, src1, src2, src3);
564  src += (4 * src_stride);
565  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
566  mask3, filt0, filt1, filt2, filt3, out0,
567  out1, out2, out3);
568  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
569  SAT_SH4_SH(out0, out1, out2, out3, 7);
570  tmp0 = PCKEV_XORI128_UB(out0, out1);
571  tmp1 = PCKEV_XORI128_UB(out2, out3);
572  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
573  dst += (4 * dst_stride);
574  }
575 }
576 
577 static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
578  uint8_t *dst, int32_t dst_stride,
579  const int8_t *filter, int32_t height,
580  uint8_t rnd_val)
581 {
582  if (4 == height) {
583  common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
584  } else {
585  common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
586  height, rnd_val);
587  }
588 }
589 
590 static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
591  uint8_t *dst, int32_t dst_stride,
592  const int8_t *filter, int32_t height,
593  uint8_t rnd_val)
594 {
595  uint8_t *src1_ptr, *dst1;
596  uint32_t loop_cnt;
597  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
598  v8i16 filt, out0, out1, out2, out3;
599  v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00, tmp0, tmp1;
600  v8i16 rnd_vec;
601 
602  mask00 = LD_UB(&mc_filt_mask_arr[0]);
603  mask0 = LD_UB(&mc_filt_mask_arr[16]);
604  rnd_vec = __msa_fill_h(rnd_val);
605 
606  src1_ptr = src - 3;
607  dst1 = dst;
608 
609  dst = dst1 + 8;
610  src = src1_ptr + 8;
611 
612  /* rearranging filter */
613  filt = LD_SH(filter);
614  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
615 
616  mask1 = mask00 + 2;
617  mask2 = mask00 + 4;
618  mask3 = mask00 + 6;
619  mask4 = mask0 + 2;
620  mask5 = mask0 + 4;
621  mask6 = mask0 + 6;
622 
623  for (loop_cnt = (height >> 2); loop_cnt--;) {
624  /* 8 width */
625  LD_SB4(src1_ptr, src_stride, src0, src1, src2, src3);
626  XORI_B4_128_SB(src0, src1, src2, src3);
627  src1_ptr += (4 * src_stride);
628  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask00, mask1, mask2,
629  mask3, filt0, filt1, filt2, filt3, out0,
630  out1, out2, out3);
631  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
632  SAT_SH4_SH(out0, out1, out2, out3, 7);
633  tmp0 = PCKEV_XORI128_UB(out0, out1);
634  tmp1 = PCKEV_XORI128_UB(out2, out3);
635  ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
636  dst1 += (4 * dst_stride);
637 
638  /* 4 width */
639  LD_SB4(src, src_stride, src0, src1, src2, src3);
640  XORI_B4_128_SB(src0, src1, src2, src3);
641  src += (4 * src_stride);
642  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask4, mask5,
643  mask6, filt0, filt1, filt2, filt3, out0,
644  out1);
645  SRAR_H2_SH(out0, out1, rnd_vec);
646  SAT_SH2_SH(out0, out1, 7);
647  tmp0 = PCKEV_XORI128_UB(out0, out1);
648  ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst, dst_stride);
649  dst += (4 * dst_stride);
650  }
651 }
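/* Editor's note: the 12-wide case runs as a side-by-side 8 + 4 split:
 * src1_ptr/dst1 cover the left 8 columns with the 8-width masks
 * (mask00, mask1..mask3), while src/dst cover columns 8..11 with the
 * 4-width masks (mask0, mask4..mask6). */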
652 
653 static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
654  uint8_t *dst, int32_t dst_stride,
655  const int8_t *filter, int32_t height,
656  uint8_t rnd_val)
657 {
658  uint32_t loop_cnt;
659  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
660  v16u8 mask0, mask1, mask2, mask3, out;
661  v8i16 filt, out0, out1, out2, out3;
662  v8i16 rnd_vec;
663 
664  mask0 = LD_UB(&mc_filt_mask_arr[0]);
665  src -= 3;
666  rnd_vec = __msa_fill_h(rnd_val);
667 
668  /* rearranging filter */
669  filt = LD_SH(filter);
670  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
671 
672  mask1 = mask0 + 2;
673  mask2 = mask0 + 4;
674  mask3 = mask0 + 6;
675 
676  for (loop_cnt = (height >> 1); loop_cnt--;) {
677  LD_SB2(src, src_stride, src0, src2);
678  LD_SB2(src + 8, src_stride, src1, src3);
679  XORI_B4_128_SB(src0, src1, src2, src3);
680  src += (2 * src_stride);
681  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
682  mask3, filt0, filt1, filt2, filt3, out0,
683  out1, out2, out3);
684  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
685  SAT_SH4_SH(out0, out1, out2, out3, 7);
686  out = PCKEV_XORI128_UB(out0, out1);
687  ST_UB(out, dst);
688  dst += dst_stride;
689  out = PCKEV_XORI128_UB(out2, out3);
690  ST_UB(out, dst);
691  dst += dst_stride;
692  }
693 }
694 
695 static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
696  uint8_t *dst, int32_t dst_stride,
697  const int8_t *filter, int32_t height,
698  uint8_t rnd_val)
699 {
700  uint32_t loop_cnt;
701  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
702  v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
703  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
704  v16i8 vec11;
705  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10;
706  v8i16 out11, filt;
707  v8i16 rnd_vec;
708 
709  mask0 = LD_UB(&mc_filt_mask_arr[0]);
710  src -= 3;
711  rnd_vec = __msa_fill_h(rnd_val);
712 
713  /* rearranging filter */
714  filt = LD_SH(filter);
715  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
716 
717  mask1 = mask0 + 2;
718  mask2 = mask0 + 4;
719  mask3 = mask0 + 6;
720  mask4 = mask0 + 8;
721  mask5 = mask0 + 10;
722  mask6 = mask0 + 12;
723  mask7 = mask0 + 14;
724 
725  for (loop_cnt = (height >> 1); loop_cnt--;) {
726  LD_SB2(src, src_stride, src0, src2);
727  LD_SB2(src + 16, src_stride, src1, src3);
728  XORI_B4_128_SB(src0, src1, src2, src3);
729  src += (2 * src_stride);
730  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
731  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
732  VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
733  DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
734  out8, out2, out9);
735  DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
736  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
737  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
738  VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
739  DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2, out4,
740  out10, out6, out11);
741  DOTP_SB2_SH(vec1, vec3, filt2, filt2, out5, out7);
742  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
743  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
744  VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
745  DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
746  out0, out8, out2, out9);
747  DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
748  VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
749  VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
750  VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
751  DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
752  out4, out10, out6, out11);
753  DPADD_SB2_SH(vec5, vec7, filt3, filt3, out5, out7);
754  ADDS_SH4_SH(out0, out4, out8, out10, out2, out6, out9, out11, out0,
755  out8, out2, out9);
756  ADDS_SH2_SH(out1, out5, out3, out7, out1, out3);
757  SRAR_H4_SH(out0, out8, out2, out9, rnd_vec);
758  SRAR_H2_SH(out1, out3, rnd_vec);
759  SAT_SH4_SH(out0, out8, out2, out9, 7);
760  SAT_SH2_SH(out1, out3, 7);
761  out = PCKEV_XORI128_UB(out8, out9);
762  ST8x2_UB(out, dst + 16, dst_stride);
763  out = PCKEV_XORI128_UB(out0, out1);
764  ST_UB(out, dst);
765  dst += dst_stride;
766  out = PCKEV_XORI128_UB(out2, out3);
767  ST_UB(out, dst);
768  dst += dst_stride;
769  }
770 }
771 
772 static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
773  uint8_t *dst, int32_t dst_stride,
774  const int8_t *filter, int32_t height,
775  uint8_t rnd_val)
776 {
777  uint32_t loop_cnt;
778  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
779  v16u8 mask0, mask1, mask2, mask3, out;
780  v8i16 filt, out0, out1, out2, out3;
781  v8i16 rnd_vec;
782 
783  mask0 = LD_UB(&mc_filt_mask_arr[0]);
784  src -= 3;
785  rnd_vec = __msa_fill_h(rnd_val);
786 
787  /* rearranging filter */
788  filt = LD_SH(filter);
789  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
790 
791  mask1 = mask0 + 2;
792  mask2 = mask0 + 4;
793  mask3 = mask0 + 6;
794 
795  for (loop_cnt = (height >> 1); loop_cnt--;) {
796  src0 = LD_SB(src);
797  src2 = LD_SB(src + 16);
798  src3 = LD_SB(src + 24);
799  src1 = __msa_sldi_b(src2, src0, 8);
800  src += src_stride;
801  XORI_B4_128_SB(src0, src1, src2, src3);
802  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
803  mask3, filt0, filt1, filt2, filt3, out0,
804  out1, out2, out3);
805  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
806  SAT_SH4_SH(out0, out1, out2, out3, 7);
807 
808  src0 = LD_SB(src);
809  src2 = LD_SB(src + 16);
810  src3 = LD_SB(src + 24);
811  src1 = __msa_sldi_b(src2, src0, 8);
812  src += src_stride;
813 
814  out = PCKEV_XORI128_UB(out0, out1);
815  ST_UB(out, dst);
816  out = PCKEV_XORI128_UB(out2, out3);
817  ST_UB(out, dst + 16);
818  dst += dst_stride;
819 
820  XORI_B4_128_SB(src0, src1, src2, src3);
821  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
822  mask3, filt0, filt1, filt2, filt3, out0,
823  out1, out2, out3);
824  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
825  SAT_SH4_SH(out0, out1, out2, out3, 7);
826  out = PCKEV_XORI128_UB(out0, out1);
827  ST_UB(out, dst);
828  out = PCKEV_XORI128_UB(out2, out3);
829  ST_UB(out, dst + 16);
830  dst += dst_stride;
831  }
832 }
833 
834 static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
835  uint8_t *dst, int32_t dst_stride,
836  const int8_t *filter, int32_t height,
837  uint8_t rnd_val)
838 {
839  uint32_t loop_cnt;
840  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
841  v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
842  v8i16 filt, out0, out1, out2, out3, out4, out5, out6;
843  v8i16 rnd_vec;
844 
845  mask0 = LD_UB(&mc_filt_mask_arr[0]);
846  src -= 3;
847  rnd_vec = __msa_fill_h(rnd_val);
848 
849  /* rearranging filter */
850  filt = LD_SH(filter);
851  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
852 
853  mask1 = mask0 + 2;
854  mask2 = mask0 + 4;
855  mask3 = mask0 + 6;
856  mask4 = mask0 + 8;
857  mask5 = mask0 + 10;
858  mask6 = mask0 + 12;
859  mask7 = mask0 + 14;
860 
861  for (loop_cnt = height; loop_cnt--;) {
862  LD_SB3(src, 16, src0, src2, src3);
863  src1 = __msa_sldi_b(src2, src0, 8);
864 
865  XORI_B4_128_SB(src0, src1, src2, src3);
866  VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
867  vec0, vec1, vec2);
868  DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
869  VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
870  vec0, vec1, vec2);
871  DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
872  out2 = __msa_dpadd_s_h(out2, vec2, filt1);
873  VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
874  vec0, vec1, vec2);
875  DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
876  VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
877  vec0, vec1, vec2);
878  DPADD_SB2_SH(vec0, vec1, filt3, filt3, out3, out4);
879  out5 = __msa_dpadd_s_h(out5, vec2, filt3);
880  ADDS_SH2_SH(out0, out3, out1, out4, out0, out1);
881  out2 = __msa_adds_s_h(out2, out5);
882  SRAR_H2_SH(out0, out1, rnd_vec);
883  out6 = __msa_srar_h(out2, rnd_vec);
884  SAT_SH3_SH(out0, out1, out6, 7);
885  out = PCKEV_XORI128_UB(out0, out1);
886  ST_UB(out, dst);
887 
888  src1 = LD_SB(src + 40);
889  src += src_stride;
890  src1 = (v16i8) __msa_xori_b((v16u8) src1, 128);
891 
892  VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask4, mask0, mask0,
893  vec0, vec1, vec2);
894  DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
895  VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask5, mask1, mask1,
896  vec0, vec1, vec2);
897  DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
898  out2 = __msa_dpadd_s_h(out2, vec2, filt1);
899  VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask6, mask2, mask2,
900  vec0, vec1, vec2);
901  DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
902  VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask7, mask3, mask3,
903  vec0, vec1, vec2);
904  DPADD_SB2_SH(vec0, vec1, filt3, filt3, out3, out4);
905  out5 = __msa_dpadd_s_h(out5, vec2, filt3);
906  ADDS_SH2_SH(out0, out3, out1, out4, out3, out4);
907  out5 = __msa_adds_s_h(out2, out5);
908  SRAR_H3_SH(out3, out4, out5, rnd_vec);
909  SAT_SH3_SH(out3, out4, out5, 7);
910  out = PCKEV_XORI128_UB(out6, out3);
911  ST_UB(out, dst + 16);
912  out = PCKEV_XORI128_UB(out4, out5);
913  ST_UB(out, dst + 32);
914  dst += dst_stride;
915  }
916 }
917 
918 static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
919  uint8_t *dst, int32_t dst_stride,
920  const int8_t *filter, int32_t height,
921  uint8_t rnd_val)
922 {
923  int32_t loop_cnt;
924  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
925  v16u8 mask0, mask1, mask2, mask3, out;
926  v8i16 filt, out0, out1, out2, out3;
927  v8i16 rnd_vec;
928 
929  mask0 = LD_UB(&mc_filt_mask_arr[0]);
930  src -= 3;
931  rnd_vec = __msa_fill_h(rnd_val);
932 
933  /* rearranging filter */
934  filt = LD_SH(filter);
935  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
936 
937  mask1 = mask0 + 2;
938  mask2 = mask0 + 4;
939  mask3 = mask0 + 6;
940 
941  for (loop_cnt = height; loop_cnt--;) {
942  src0 = LD_SB(src);
943  src2 = LD_SB(src + 16);
944  src3 = LD_SB(src + 24);
945  src1 = __msa_sldi_b(src2, src0, 8);
946 
947  XORI_B4_128_SB(src0, src1, src2, src3);
948  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
949  mask2, mask3, filt0, filt1, filt2, filt3,
950  out0, out1, out2, out3);
951  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
952  SAT_SH4_SH(out0, out1, out2, out3, 7);
953  out = PCKEV_XORI128_UB(out0, out1);
954  ST_UB(out, dst);
955  out = PCKEV_XORI128_UB(out2, out3);
956  ST_UB(out, dst + 16);
957 
958  src0 = LD_SB(src + 32);
959  src2 = LD_SB(src + 48);
960  src3 = LD_SB(src + 56);
961  src1 = __msa_sldi_b(src2, src0, 8);
962  src += src_stride;
963 
964  XORI_B4_128_SB(src0, src1, src2, src3);
965  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
966  mask2, mask3, filt0, filt1, filt2, filt3,
967  out0, out1, out2, out3);
968  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
969  SAT_SH4_SH(out0, out1, out2, out3, 7);
970  out = PCKEV_XORI128_UB(out0, out1);
971  ST_UB(out, dst + 32);
972  out = PCKEV_XORI128_UB(out2, out3);
973  ST_UB(out, dst + 48);
974  dst += dst_stride;
975  }
976 }
977 
978 static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
979  uint8_t *dst, int32_t dst_stride,
980  const int8_t *filter, int32_t height,
981  uint8_t rnd_val)
982 {
983  uint32_t loop_cnt;
984  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
985  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
986  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
987  v16i8 src10998, filt0, filt1, filt2, filt3;
988  v16u8 out;
989  v8i16 filt, out10, out32;
990  v8i16 rnd_vec;
991 
992  src -= (3 * src_stride);
993  rnd_vec = __msa_fill_h(rnd_val);
994 
995  filt = LD_SH(filter);
996  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
997 
998  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
999  src += (7 * src_stride);
1000 
1001  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1002  src54_r, src21_r);
1003  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1004  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
1005  src4332, src6554);
1006  XORI_B3_128_SB(src2110, src4332, src6554);
1007 
1008  for (loop_cnt = (height >> 2); loop_cnt--;) {
1009  LD_SB4(src, src_stride, src7, src8, src9, src10);
1010  src += (4 * src_stride);
1011 
1012  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1013  src87_r, src98_r, src109_r);
1014  ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
1015  XORI_B2_128_SB(src8776, src10998);
1016  out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
1017  filt1, filt2, filt3);
1018  out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
1019  filt1, filt2, filt3);
1020  SRAR_H2_SH(out10, out32, rnd_vec);
1021  SAT_SH2_SH(out10, out32, 7);
1022  out = PCKEV_XORI128_UB(out10, out32);
1023  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1024  dst += (4 * dst_stride);
1025 
1026  src2110 = src6554;
1027  src4332 = src8776;
1028  src6554 = src10998;
1029  src6 = src10;
1030  }
1031 }
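/* Editor's sketch (an interpretation): the common_vt_8t_* functions keep a
 * sliding window of interleaved rows in registers and rotate it at the end
 * of each iteration (src2110 = src6554, ...) so only four new rows are
 * loaded per pass.  A plain-C reference for the 4-wide vertical filter,
 * assuming the taps sum to 1 << rnd_val (true for HEVC's filters) and that
 * av_clip_uint8() from libavutil is in scope: */
static void vt_8t_4w_ref(const uint8_t *src, int32_t src_stride,
                         uint8_t *dst, int32_t dst_stride,
                         const int8_t *coef, int32_t height, uint8_t rnd_val)
{
    int32_t row, col, k, acc;

    src -= 3 * src_stride;        /* same pre-offset as the MSA version */
    for (row = 0; row < height; row++) {
        for (col = 0; col < 4; col++) {
            acc = 0;
            for (k = 0; k < 8; k++)
                acc += src[(row + k) * src_stride + col] * coef[k];
            acc = (acc + (1 << (rnd_val - 1))) >> rnd_val;
            dst[row * dst_stride + col] = av_clip_uint8(acc);
        }
    }
}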
1032 
1033 static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
1034  uint8_t *dst, int32_t dst_stride,
1035  const int8_t *filter, int32_t height,
1036  uint8_t rnd_val)
1037 {
1038  uint32_t loop_cnt;
1039  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1040  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1041  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
1042  v16u8 tmp0, tmp1;
1043  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
1044  v8i16 rnd_vec;
1045 
1046  src -= (3 * src_stride);
1047  rnd_vec = __msa_fill_h(rnd_val);
1048 
1049  filt = LD_SH(filter);
1050  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1051 
1052  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1053  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1054  src += (7 * src_stride);
1055  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1056  src54_r, src21_r);
1057  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1058 
1059  for (loop_cnt = (height >> 2); loop_cnt--;) {
1060  LD_SB4(src, src_stride, src7, src8, src9, src10);
1061  XORI_B4_128_SB(src7, src8, src9, src10);
1062  src += (4 * src_stride);
1063 
1064  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1065  src87_r, src98_r, src109_r);
1066  out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
1067  filt1, filt2, filt3);
1068  out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
1069  filt1, filt2, filt3);
1070  out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
1071  filt1, filt2, filt3);
1072  out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
1073  filt1, filt2, filt3);
1074  SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
1075  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1076  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
1077  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
1078  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
1079  dst += (4 * dst_stride);
1080 
1081  src10_r = src54_r;
1082  src32_r = src76_r;
1083  src54_r = src98_r;
1084  src21_r = src65_r;
1085  src43_r = src87_r;
1086  src65_r = src109_r;
1087  src6 = src10;
1088  }
1089 }
1090 
1091 static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
1092  uint8_t *dst, int32_t dst_stride,
1093  const int8_t *filter, int32_t height,
1094  uint8_t rnd_val)
1095 {
1096  int32_t loop_cnt;
1097  uint32_t out2, out3;
1098  uint64_t out0, out1;
1099  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
1100  v16i8 res2, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1101  v8i16 vec01, vec23, vec45, vec67, tmp0, tmp1, tmp2;
1102  v8i16 filt, filt0, filt1, filt2, filt3;
1103  v8i16 rnd_vec;
1104  v4i32 mask = { 2, 6, 2, 6 };
1105 
1106  src -= (3 * src_stride);
1107  rnd_vec = __msa_fill_h(rnd_val);
1108 
1109  /* rearranging filter_y */
1110  filt = LD_SH(filter);
1111  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1112 
1113  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1114  src += (7 * src_stride);
1115 
1116  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1117 
1118  /* 4 width */
1119  VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);
1120  VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
1121  VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);
1122 
1123  for (loop_cnt = (height >> 1); loop_cnt--;) {
1124  LD_SB2(src, src_stride, src7, src8);
1125  XORI_B2_128_SB(src7, src8);
1126  src += (2 * src_stride);
1127 
1128  ILVR_B4_SH(src1, src0, src3, src2, src5, src4, src7, src6,
1129  vec01, vec23, vec45, vec67);
1130  tmp0 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
1131  filt2, filt3);
1132  ILVR_B4_SH(src2, src1, src4, src3, src6, src5, src8, src7, vec01, vec23,
1133  vec45, vec67);
1134  tmp1 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
1135  filt2, filt3);
1136 
1137  /* 4 width */
1138  VSHF_W2_SB(src6, src7, src7, src8, mask, mask, vec6, vec7);
1139  ILVR_B4_SH(vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec01, vec23,
1140  vec45, vec67);
1141  tmp2 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
1142  filt2, filt3);
1143  SRAR_H3_SH(tmp0, tmp1, tmp2, rnd_vec);
1144  SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
1145  PCKEV_B3_SB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, res0, res1, res2);
1146  XORI_B3_128_SB(res0, res1, res2);
1147 
1148  out0 = __msa_copy_u_d((v2i64) res0, 0);
1149  out1 = __msa_copy_u_d((v2i64) res1, 0);
1150  out2 = __msa_copy_u_w((v4i32) res2, 0);
1151  out3 = __msa_copy_u_w((v4i32) res2, 1);
1152  SD(out0, dst);
1153  SW(out2, (dst + 8));
1154  dst += dst_stride;
1155  SD(out1, dst);
1156  SW(out3, (dst + 8));
1157  dst += dst_stride;
1158 
1159  src0 = src2;
1160  src1 = src3;
1161  src2 = src4;
1162  src3 = src5;
1163  src4 = src6;
1164  src5 = src7;
1165  src6 = src8;
1166  vec0 = vec2;
1167  vec1 = vec3;
1168  vec2 = vec4;
1169  vec3 = vec5;
1170  vec4 = vec6;
1171  vec5 = vec7;
1172  }
1173 }
1174 
1175 static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
1176  uint8_t *dst, int32_t dst_stride,
1177  const int8_t *filter, int32_t height,
1178  uint8_t rnd_val)
1179 {
1180  uint32_t loop_cnt;
1181  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1182  v16i8 filt0, filt1, filt2, filt3;
1183  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1184  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1185  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1186  v16u8 tmp0, tmp1, tmp2, tmp3;
1187  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1188  v8i16 rnd_vec;
1189 
1190  src -= (3 * src_stride);
1191  rnd_vec = __msa_fill_h(rnd_val);
1192 
1193  filt = LD_SH(filter);
1194  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1195 
1196  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1197  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1198  src += (7 * src_stride);
1199  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1200  src54_r, src21_r);
1201  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1202  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
1203  src54_l, src21_l);
1204  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1205 
1206  for (loop_cnt = (height >> 2); loop_cnt--;) {
1207  LD_SB4(src, src_stride, src7, src8, src9, src10);
1208  XORI_B4_128_SB(src7, src8, src9, src10);
1209  src += (4 * src_stride);
1210 
1211  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1212  src87_r, src98_r, src109_r);
1213  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1214  src87_l, src98_l, src109_l);
1215  out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
1216  filt1, filt2, filt3);
1217  out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
1218  filt1, filt2, filt3);
1219  out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
1220  filt1, filt2, filt3);
1221  out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
1222  filt1, filt2, filt3);
1223  out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
1224  filt1, filt2, filt3);
1225  out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
1226  filt1, filt2, filt3);
1227  out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
1228  filt1, filt2, filt3);
1229  out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
1230  filt1, filt2, filt3);
1231  SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
1232  SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
1233  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1234  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1235  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1236  out3_r, tmp0, tmp1, tmp2, tmp3);
1237  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1238  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
1239  dst += (4 * dst_stride);
1240 
1241  src10_r = src54_r;
1242  src32_r = src76_r;
1243  src54_r = src98_r;
1244  src21_r = src65_r;
1245  src43_r = src87_r;
1246  src65_r = src109_r;
1247  src10_l = src54_l;
1248  src32_l = src76_l;
1249  src54_l = src98_l;
1250  src21_l = src65_l;
1251  src43_l = src87_l;
1252  src65_l = src109_l;
1253  src6 = src10;
1254  }
1255 }
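/* Editor's note: the 16-wide vertical filter interleaves each new row with
 * its predecessor twice, once for the right half (ILVR_B*) and once for the
 * left half (ILVL_B*), so the same 8-row dot product covers all 16 columns;
 * both halves share one register-rotating sliding window. */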
1256 
1257 static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
1258  uint8_t *dst, int32_t dst_stride,
1259  const int8_t *filter, int32_t height,
1260  uint8_t rnd_val, int32_t width)
1261 {
1262  uint8_t *src_tmp;
1263  uint8_t *dst_tmp;
1264  uint32_t loop_cnt, cnt;
1265  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1266  v16i8 filt0, filt1, filt2, filt3;
1267  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1268  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1269  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1270  v16u8 tmp0, tmp1, tmp2, tmp3;
1271  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1272  v8i16 rnd_vec;
1273 
1274  src -= (3 * src_stride);
1275  rnd_vec = __msa_fill_h(rnd_val);
1276 
1277  filt = LD_SH(filter);
1278  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1279 
1280  for (cnt = (width >> 4); cnt--;) {
1281  src_tmp = src;
1282  dst_tmp = dst;
1283 
1284  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1285  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1286  src_tmp += (7 * src_stride);
1287  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
1288  src32_r, src54_r, src21_r);
1289  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1290  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
1291  src32_l, src54_l, src21_l);
1292  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1293 
1294  for (loop_cnt = (height >> 2); loop_cnt--;) {
1295  LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1296  XORI_B4_128_SB(src7, src8, src9, src10);
1297  src_tmp += (4 * src_stride);
1298  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1299  src87_r, src98_r, src109_r);
1300  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1301  src87_l, src98_l, src109_l);
1302  out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
1303  filt0, filt1, filt2, filt3);
1304  out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
1305  filt0, filt1, filt2, filt3);
1306  out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
1307  filt0, filt1, filt2, filt3);
1308  out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
1309  filt0, filt1, filt2, filt3);
1310  out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
1311  filt0, filt1, filt2, filt3);
1312  out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
1313  filt0, filt1, filt2, filt3);
1314  out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
1315  filt0, filt1, filt2, filt3);
1316  out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
1317  filt0, filt1, filt2, filt3);
1318  SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
1319  SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
1320  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1321  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1322  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1323  out3_r, tmp0, tmp1, tmp2, tmp3);
1324  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1325  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
1326  dst_tmp += (4 * dst_stride);
1327 
1328  src10_r = src54_r;
1329  src32_r = src76_r;
1330  src54_r = src98_r;
1331  src21_r = src65_r;
1332  src43_r = src87_r;
1333  src65_r = src109_r;
1334  src10_l = src54_l;
1335  src32_l = src76_l;
1336  src54_l = src98_l;
1337  src21_l = src65_l;
1338  src43_l = src87_l;
1339  src65_l = src109_l;
1340  src6 = src10;
1341  }
1342 
1343  src += 16;
1344  dst += 16;
1345  }
1346 }
1347 
1348 static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
1349  uint8_t *dst, int32_t dst_stride,
1350  const int8_t *filter, int32_t height, uint8_t rnd_val)
1351 {
1352  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
1353  rnd_val, 16);
1354 
1355  common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
1356  height, rnd_val);
1357 }
1358 
1359 static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
1360  uint8_t *dst, int32_t dst_stride,
1361  const int8_t *filter, int32_t height, uint8_t rnd_val)
1362 {
1363  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
1364  rnd_val, 32);
1365 }
1366 
1367 static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
1368  uint8_t *dst, int32_t dst_stride,
1369  const int8_t *filter, int32_t height, uint8_t rnd_val)
1370 {
1371  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
1372  rnd_val, 48);
1373 }
1374 
1375 static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
1376  uint8_t *dst, int32_t dst_stride,
1377  const int8_t *filter, int32_t height, uint8_t rnd_val)
1378 {
1379  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
1380  rnd_val, 64);
1381 }
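/* Editor's note: every wider vertical case reduces to the 16-column kernel:
 * widths 32/48/64 are plain multiples, and width 24 composes the 16-multiple
 * kernel with the standalone 8-wide path for the remaining columns. */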
1382 
1383 static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
1384  int32_t src_stride,
1385  uint8_t *dst,
1386  int32_t dst_stride,
1387  const int8_t *filter_x,
1388  const int8_t *filter_y,
1389  int32_t height)
1390 {
1391  uint32_t loop_cnt;
1392  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1393  v8i16 filt0, filt1, filt2, filt3;
1394  v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1395  v16i8 mask1, mask2, mask3;
1396  v8i16 filter_vec, const_vec;
1397  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1398  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1399  v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
1400  v4i32 dst0_r, dst1_r;
1401  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1402  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1403  v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1404  v8i16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
1405 
1406  src -= ((3 * src_stride) + 3);
1407  filter_vec = LD_SH(filter_x);
1408  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1409 
1410  filter_vec = LD_SH(filter_y);
1411  vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1412  filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1413 
1414  SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1415 
1416  mask1 = mask0 + 2;
1417  mask2 = mask0 + 4;
1418  mask3 = mask0 + 6;
1419 
1420  const_vec = __msa_ldi_h(128);
1421  const_vec <<= 6;
1422 
1423  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1424  src += (7 * src_stride);
1425  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1426 
1427  VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1428  VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1429  VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1430  vec8, vec9, vec10, vec11);
1431  VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1432  vec12, vec13, vec14, vec15);
1433 
1434  dst30 = const_vec;
1435  DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1436  dst30, dst30, dst30, dst30);
1437  dst41 = const_vec;
1438  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1439  dst41, dst41, dst41, dst41);
1440  dst52 = const_vec;
1441  DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1442  dst52, dst52, dst52, dst52);
1443  dst63 = const_vec;
1444  DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1445  dst63, dst63, dst63, dst63);
1446 
1447  ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
1448  dst10_r, dst21_r, dst32_r);
1449  dst43_r = __msa_ilvl_h(dst41, dst30);
1450  dst54_r = __msa_ilvl_h(dst52, dst41);
1451  dst65_r = __msa_ilvl_h(dst63, dst52);
1452  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1453 
1454  for (loop_cnt = height >> 1; loop_cnt--;) {
1455  LD_SB2(src, src_stride, src7, src8);
1456  src += 2 * src_stride;
1457  XORI_B2_128_SB(src7, src8);
1458 
1459  VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
1460  vec0, vec1, vec2, vec3);
1461  dst87 = const_vec;
1462  DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1463  dst87, dst87, dst87, dst87);
1464 
1465  dst76_r = __msa_ilvr_h(dst87, dst66);
1466  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1467  filt_h0, filt_h1, filt_h2, filt_h3);
1468  dst87_r = __msa_vshf_h(mask4, dst87, dst87);
1469  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1470  filt_h0, filt_h1, filt_h2, filt_h3);
1471 
1472  dst0_r >>= 6;
1473  dst1_r >>= 6;
1474  SRARI_W2_SW(dst0_r, dst1_r, 6);
1475  dst0_r = CLIP_SW_0_255(dst0_r);
1476  dst1_r = CLIP_SW_0_255(dst1_r);
1477 
1478  HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
1479  ST4x2_UB(dst0_r, dst, dst_stride);
1480  dst += (2 * dst_stride);
1481 
1482  dst10_r = dst32_r;
1483  dst32_r = dst54_r;
1484  dst54_r = dst76_r;
1485  dst21_r = dst43_r;
1486  dst43_r = dst65_r;
1487  dst65_r = dst87_r;
1488  dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
1489  }
1490 }
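/* Editor's sketch (an interpretation): the hevc_hv_uni_8t_* paths implement
 * HEVC's separable interpolation in two stages: an 8-tap horizontal pass
 * into 16-bit intermediates (const_vec = 128 << 6 cancels the XOR-128 bias),
 * then an 8-tap vertical pass over those intermediates.  For one output
 * pixel, with src pre-offset by -3 rows and -3 columns as above: */
static uint8_t hv_8t_ref(const uint8_t *src, int32_t stride,
                         const int8_t *fx, const int8_t *fy)
{
    int16_t mid[8];
    int32_t i, k, acc;

    for (i = 0; i < 8; i++) {     /* stage 1: horizontal filter, one per row */
        acc = 0;
        for (k = 0; k < 8; k++)
            acc += src[i * stride + k] * fx[k];
        mid[i] = (int16_t) acc;   /* fits int16 for HEVC's coefficient range */
    }
    acc = 0;                      /* stage 2: vertical filter on mid[] */
    for (k = 0; k < 8; k++)
        acc += mid[k] * fy[k];
    acc >>= 6;                    /* dst0_r >>= 6 (arithmetic shift) */
    acc = (acc + 32) >> 6;        /* SRARI_W*(.., 6): rounded shift */
    return (uint8_t) (acc < 0 ? 0 : acc > 255 ? 255 : acc); /* CLIP_SW_0_255 */
}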
1491 
1492 static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src,
1493  int32_t src_stride,
1494  uint8_t *dst,
1495  int32_t dst_stride,
1496  const int8_t *filter_x,
1497  const int8_t *filter_y,
1498  int32_t height, int32_t width)
1499 {
1500  uint32_t loop_cnt, cnt;
1501  uint8_t *src_tmp;
1502  uint8_t *dst_tmp;
1503  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1504  v8i16 filt0, filt1, filt2, filt3;
1505  v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1506  v16i8 mask1, mask2, mask3;
1507  v8i16 filter_vec, const_vec;
1508  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1509  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1510  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1511  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1512  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1513  v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1514  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1515  v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1516  v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1517 
1518  src -= ((3 * src_stride) + 3);
1519  const_vec = __msa_ldi_h(128);
1520  const_vec <<= 6;
1521 
1522  filter_vec = LD_SH(filter_x);
1523  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1524 
1525  filter_vec = LD_SH(filter_y);
1526  vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1527  filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1528 
1529  SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1530 
1531  mask1 = mask0 + 2;
1532  mask2 = mask0 + 4;
1533  mask3 = mask0 + 6;
1534 
1535  for (cnt = width >> 3; cnt--;) {
1536  src_tmp = src;
1537  dst_tmp = dst;
1538 
1539  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1540  src_tmp += (7 * src_stride);
1541  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1542 
1543  /* row 0 row 1 row 2 row 3 */
1544  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1545  vec0, vec1, vec2, vec3);
1546  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1547  vec4, vec5, vec6, vec7);
1548  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1549  vec8, vec9, vec10, vec11);
1550  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1551  vec12, vec13, vec14, vec15);
1552  dst0 = const_vec;
1553  DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1554  dst0, dst0, dst0, dst0);
1555  dst1 = const_vec;
1556  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1557  dst1, dst1, dst1, dst1);
1558  dst2 = const_vec;
1559  DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1560  dst2, dst2, dst2, dst2);
1561  dst3 = const_vec;
1562  DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1563  dst3, dst3, dst3, dst3);
1564 
1565  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1566  vec0, vec1, vec2, vec3);
1567  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1568  vec4, vec5, vec6, vec7);
1569  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1570  vec8, vec9, vec10, vec11);
1571  dst4 = const_vec;
1572  DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1573  dst4, dst4, dst4, dst4);
1574  dst5 = const_vec;
1575  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1576  dst5, dst5, dst5, dst5);
1577  dst6 = const_vec;
1578  DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1579  dst6, dst6, dst6, dst6);
1580 
1581  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1582  dst10_r, dst32_r, dst54_r, dst21_r);
1583  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1584  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1585  dst10_l, dst32_l, dst54_l, dst21_l);
1586  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1587 
1588  for (loop_cnt = height >> 1; loop_cnt--;) {
1589  LD_SB2(src_tmp, src_stride, src7, src8);
1590  XORI_B2_128_SB(src7, src8);
1591  src_tmp += 2 * src_stride;
1592 
1593  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1594  vec0, vec1, vec2, vec3);
1595  dst7 = const_vec;
1596  DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1597  dst7, dst7, dst7, dst7);
1598 
1599  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1600  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1601  filt_h0, filt_h1, filt_h2, filt_h3);
1602  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1603  filt_h0, filt_h1, filt_h2, filt_h3);
1604  dst0_r >>= 6;
1605  dst0_l >>= 6;
1606 
1607  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1608  vec0, vec1, vec2, vec3);
1609  dst8 = const_vec;
1610  DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1611  dst8, dst8, dst8, dst8);
1612 
1613  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1614  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1615  filt_h0, filt_h1, filt_h2, filt_h3);
1616  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1617  filt_h0, filt_h1, filt_h2, filt_h3);
1618  dst1_r >>= 6;
1619  dst1_l >>= 6;
1620  SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1621  dst0_r = CLIP_SW_0_255(dst0_r);
1622  dst0_l = CLIP_SW_0_255(dst0_l);
1623  dst1_r = CLIP_SW_0_255(dst1_r);
1624  dst1_l = CLIP_SW_0_255(dst1_l);
1625 
1626  HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
1627  ST8x2_UB(dst0_r, dst_tmp, dst_stride);
1628  dst_tmp += (2 * dst_stride);
1629 
1630  dst10_r = dst32_r;
1631  dst32_r = dst54_r;
1632  dst54_r = dst76_r;
1633  dst10_l = dst32_l;
1634  dst32_l = dst54_l;
1635  dst54_l = dst76_l;
1636  dst21_r = dst43_r;
1637  dst43_r = dst65_r;
1638  dst65_r = dst87_r;
1639  dst21_l = dst43_l;
1640  dst43_l = dst65_l;
1641  dst65_l = dst87_l;
1642  dst6 = dst8;
1643  }
1644 
1645  src += 8;
1646  dst += 8;
1647  }
1648 }
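/* Scalar model of one output pixel from the separable 8-tap kernel above
 * (an illustrative sketch, assuming the usual HEVC luma coefficients whose
 * taps sum to 64):
 *
 *   h[j]  = 8192 + sum_k fx[k] * (src[j][x + k - 3] - 128);   16-bit
 *   v     = sum_j fy[j] * h[j];                               32-bit
 *   pixel = clip_0_255(((v >> 6) + 32) >> 6);
 *
 * The XOR with 128 turns unsigned pixels into signed bytes for the MSA dot
 * products; const_vec = 128 << 6 re-adds that offset scaled by the 6-bit
 * filter gain. Width is processed in 8-column stripes (cnt = width >> 3),
 * height two rows at a time.
 */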
1649 
1650 static void hevc_hv_uni_8t_8w_msa(uint8_t *src,
1651  int32_t src_stride,
1652  uint8_t *dst,
1653  int32_t dst_stride,
1654  const int8_t *filter_x,
1655  const int8_t *filter_y,
1656  int32_t height)
1657 {
1658  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1659  filter_x, filter_y, height, 8);
1660 }
1661 
1662 static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
1663  int32_t src_stride,
1664  uint8_t *dst,
1665  int32_t dst_stride,
1666  const int8_t *filter_x,
1667  const int8_t *filter_y,
1668  int32_t height)
1669 {
1670  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1671  filter_x, filter_y, height, 8);
1672 
1673  hevc_hv_uni_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
1674  filter_x, filter_y, height);
1675 }
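/* 12-wide blocks are composed from the kernels above: an 8-column stripe
 * through hevc_hv_uni_8t_8multx2mult_msa plus the 4-wide kernel applied to
 * columns 8..11.
 */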
1676 
1677 static void hevc_hv_uni_8t_16w_msa(uint8_t *src,
1678  int32_t src_stride,
1679  uint8_t *dst,
1680  int32_t dst_stride,
1681  const int8_t *filter_x,
1682  const int8_t *filter_y,
1683  int32_t height)
1684 {
1685  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1686  filter_x, filter_y, height, 16);
1687 }
1688 
1689 static void hevc_hv_uni_8t_24w_msa(uint8_t *src,
1690  int32_t src_stride,
1691  uint8_t *dst,
1692  int32_t dst_stride,
1693  const int8_t *filter_x,
1694  const int8_t *filter_y,
1695  int32_t height)
1696 {
1697  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1698  filter_x, filter_y, height, 24);
1699 }
1700 
1701 static void hevc_hv_uni_8t_32w_msa(uint8_t *src,
1702  int32_t src_stride,
1703  uint8_t *dst,
1704  int32_t dst_stride,
1705  const int8_t *filter_x,
1706  const int8_t *filter_y,
1707  int32_t height)
1708 {
1709  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1710  filter_x, filter_y, height, 32);
1711 }
1712 
1713 static void hevc_hv_uni_8t_48w_msa(uint8_t *src,
1714  int32_t src_stride,
1715  uint8_t *dst,
1716  int32_t dst_stride,
1717  const int8_t *filter_x,
1718  const int8_t *filter_y,
1719  int32_t height)
1720 {
1721  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1722  filter_x, filter_y, height, 48);
1723 }
1724 
1725 static void hevc_hv_uni_8t_64w_msa(uint8_t *src,
1726  int32_t src_stride,
1727  uint8_t *dst,
1728  int32_t dst_stride,
1729  const int8_t *filter_x,
1730  const int8_t *filter_y,
1731  int32_t height)
1732 {
1733  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1734  filter_x, filter_y, height, 64);
1735 }
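/* The remaining HV widths (16/24/32/48/64) are all multiples of 8 and map
 * directly onto the 8-column stripe kernel via its width argument.
 */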
1736 
1737 static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
1738  uint8_t *dst, int32_t dst_stride,
1739  const int8_t *filter, uint8_t rnd_val)
1740 {
1741  v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
1742  v16u8 out;
1743  v8i16 filt, res0;
1744  v8i16 rnd_vec;
1745 
1746  mask0 = LD_SB(&mc_filt_mask_arr[16]);
1747  src -= 1;
1748  rnd_vec = __msa_fill_h(rnd_val);
1749 
1750  /* rearranging filter */
1751  filt = LD_SH(filter);
1752  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1753 
1754  mask1 = mask0 + 2;
1755 
1756  LD_SB2(src, src_stride, src0, src1);
1757  XORI_B2_128_SB(src0, src1);
1758  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1759  res0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1);
1760  res0 = __msa_srar_h(res0, rnd_vec);
1761  res0 = __msa_sat_s_h(res0, 7);
1762  out = PCKEV_XORI128_UB(res0, res0);
1763  ST4x2_UB(out, dst, dst_stride);
1764 }
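/* For 4-wide columns the shuffle masks come from &mc_filt_mask_arr[16]:
 * they gather pixels from two source rows held in one register pair, so a
 * single dot-product pass yields both output rows.
 */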
1765 
1766 static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
1767  uint8_t *dst, int32_t dst_stride,
1768  const int8_t *filter, uint8_t rnd_val)
1769 {
1770  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1771  v8i16 filt, out0, out1;
1772  v16u8 out;
1773  v8i16 rnd_vec;
1774 
1775  mask0 = LD_SB(&mc_filt_mask_arr[16]);
1776  src -= 1;
1777  rnd_vec = __msa_fill_h(rnd_val);
1778 
1779  /* rearranging filter */
1780  filt = LD_SH(filter);
1781  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1782 
1783  mask1 = mask0 + 2;
1784 
1785  LD_SB4(src, src_stride, src0, src1, src2, src3);
1786  XORI_B4_128_SB(src0, src1, src2, src3);
1787  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1788  filt0, filt1, out0, out1);
1789  SRAR_H2_SH(out0, out1, rnd_vec);
1790  SAT_SH2_SH(out0, out1, 7);
1791  out = PCKEV_XORI128_UB(out0, out1);
1792  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1793 }
1794 
1795 static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
1796  uint8_t *dst, int32_t dst_stride,
1797  const int8_t *filter, uint8_t rnd_val)
1798 {
1799  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1800  v16u8 out;
1801  v8i16 filt, out0, out1, out2, out3;
1802  v8i16 rnd_vec;
1803 
1804  mask0 = LD_SB(&mc_filt_mask_arr[16]);
1805  src -= 1;
1806  rnd_vec = __msa_fill_h(rnd_val);
1807 
1808  /* rearranging filter */
1809  filt = LD_SH(filter);
1810  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1811 
1812  mask1 = mask0 + 2;
1813 
1814  LD_SB4(src, src_stride, src0, src1, src2, src3);
1815  src += (4 * src_stride);
1816 
1817  XORI_B4_128_SB(src0, src1, src2, src3);
1818  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1819  filt0, filt1, out0, out1);
1820  LD_SB4(src, src_stride, src0, src1, src2, src3);
1821  XORI_B4_128_SB(src0, src1, src2, src3);
1822  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1823  filt0, filt1, out2, out3);
1824  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
1825  SAT_SH4_SH(out0, out1, out2, out3, 7);
1826  out = PCKEV_XORI128_UB(out0, out1);
1827  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1828  dst += (4 * dst_stride);
1829  out = PCKEV_XORI128_UB(out2, out3);
1830  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1831 }
1832 
1833 static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
1834  uint8_t *dst, int32_t dst_stride,
1835  const int8_t *filter, uint8_t rnd_val)
1836 {
1837  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1838  v16i8 filt0, filt1, mask0, mask1;
1839  v16u8 out;
1840  v8i16 filt, out0, out1, out2, out3;
1841  v8i16 rnd_vec;
1842 
1843  mask0 = LD_SB(&mc_filt_mask_arr[16]);
1844  src -= 1;
1845  rnd_vec = __msa_fill_h(rnd_val);
1846 
1847  /* rearranging filter */
1848  filt = LD_SH(filter);
1849  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1850 
1851  mask1 = mask0 + 2;
1852 
1853  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1854  src += (8 * src_stride);
1855  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1856  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1857  filt0, filt1, out0, out1);
1858  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
1859  filt0, filt1, out2, out3);
1860  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
1861  SAT_SH4_SH(out0, out1, out2, out3, 7);
1862  out = PCKEV_XORI128_UB(out0, out1);
1863  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1864  dst += (4 * dst_stride);
1865  out = PCKEV_XORI128_UB(out2, out3);
1866  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1867  dst += (4 * dst_stride);
1868 
1869  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1870  src += (8 * src_stride);
1871  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1872  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1873  filt0, filt1, out0, out1);
1874  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
1875  filt0, filt1, out2, out3);
1876  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
1877  SAT_SH4_SH(out0, out1, out2, out3, 7);
1878  out = PCKEV_XORI128_UB(out0, out1);
1879  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1880  dst += (4 * dst_stride);
1881  out = PCKEV_XORI128_UB(out2, out3);
1882  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1883 }
1884 
1885 static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
1886  uint8_t *dst, int32_t dst_stride,
1887  const int8_t *filter, int32_t height,
1888  uint8_t rnd_val)
1889 {
1890  if (2 == height) {
1891  common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
1892  } else if (4 == height) {
1893  common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
1894  } else if (8 == height) {
1895  common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
1896  } else if (16 == height) {
1897  common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter,
1898  rnd_val);
1899  }
1900 }
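/* Height dispatch only: the 2/4/8/16-row variants above are fully
 * unrolled, so no generic loop is needed for 4-wide blocks.
 */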
1901 
1902 static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
1903  uint8_t *dst, int32_t dst_stride,
1904  const int8_t *filter, int32_t height,
1905  uint8_t rnd_val)
1906 {
1907  uint32_t loop_cnt;
1908  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1909  v16u8 out4, out5;
1910  v8i16 filt, out0, out1, out2, out3;
1911  v8i16 rnd_vec;
1912 
1913  mask0 = LD_SB(&mc_filt_mask_arr[0]);
1914  src -= 1;
1915  rnd_vec = __msa_fill_h(rnd_val);
1916 
1917  /* rearranging filter */
1918  filt = LD_SH(filter);
1919  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1920 
1921  mask1 = mask0 + 2;
1922 
1923  for (loop_cnt = (height >> 2); loop_cnt--;) {
1924  LD_SB4(src, src_stride, src0, src1, src2, src3);
1925  src += (4 * src_stride);
1926 
1927  XORI_B4_128_SB(src0, src1, src2, src3);
1928  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
1929  filt1, out0, out1, out2, out3);
1930  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
1931  SAT_SH4_SH(out0, out1, out2, out3, 7);
1932 
1933  out4 = PCKEV_XORI128_UB(out0, out1);
1934  out5 = PCKEV_XORI128_UB(out2, out3);
1935  ST6x4_UB(out4, out5, dst, dst_stride);
1936  dst += (4 * dst_stride);
1937  }
1938 }
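/* 6-wide columns are filtered as full 8-wide vectors; ST6x4_UB then writes
 * only the six leftmost bytes of each of the four rows.
 */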
1939 
1940 static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
1941  uint8_t *dst, int32_t dst_stride,
1942  const int8_t *filter, int32_t height,
1943  uint8_t rnd_val)
1944 {
1945  uint32_t loop_cnt;
1946  v16i8 src0, src1, filt0, filt1, mask0, mask1;
1947  v16u8 out;
1948  v8i16 filt, vec0, vec1, vec2, vec3;
1949  v8i16 rnd_vec;
1950 
1951  mask0 = LD_SB(&mc_filt_mask_arr[0]);
1952  src -= 1;
1953  rnd_vec = __msa_fill_h(rnd_val);
1954 
1955  filt = LD_SH(filter);
1956  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1957 
1958  mask1 = mask0 + 2;
1959 
1960  for (loop_cnt = (height >> 1); loop_cnt--;) {
1961  LD_SB2(src, src_stride, src0, src1);
1962  src += (2 * src_stride);
1963 
1964  XORI_B2_128_SB(src0, src1);
1965  VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1966  DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
1967  VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
1968  DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
1969  SRAR_H2_SH(vec0, vec1, rnd_vec);
1970  SAT_SH2_SH(vec0, vec1, 7);
1971  out = PCKEV_XORI128_UB(vec0, vec1);
1972  ST8x2_UB(out, dst, dst_stride);
1973  dst += (2 * dst_stride);
1974  }
1975 }
1976 
1977 static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
1978  uint8_t *dst, int32_t dst_stride,
1979  const int8_t *filter, int32_t height,
1980  uint8_t rnd_val)
1981 {
1982  uint32_t loop_cnt;
1983  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1984  v16u8 tmp0, tmp1;
1985  v8i16 filt, out0, out1, out2, out3;
1986  v8i16 rnd_vec;
1987 
1988  mask0 = LD_SB(&mc_filt_mask_arr[0]);
1989  src -= 1;
1990  rnd_vec = __msa_fill_h(rnd_val);
1991 
1992  /* rearranging filter */
1993  filt = LD_SH(filter);
1994  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1995 
1996  mask1 = mask0 + 2;
1997 
1998  for (loop_cnt = (height >> 2); loop_cnt--;) {
1999  LD_SB4(src, src_stride, src0, src1, src2, src3);
2000  src += (4 * src_stride);
2001 
2002  XORI_B4_128_SB(src0, src1, src2, src3);
2003  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2004  filt1, out0, out1, out2, out3);
2005  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
2006  SAT_SH4_SH(out0, out1, out2, out3, 7);
2007  tmp0 = PCKEV_XORI128_UB(out0, out1);
2008  tmp1 = PCKEV_XORI128_UB(out2, out3);
2009  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2010  dst += (4 * dst_stride);
2011  }
2012 }
2013 
2014 static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
2015  uint8_t *dst, int32_t dst_stride,
2016  const int8_t *filter, int32_t height,
2017  uint8_t rnd_val)
2018 {
2019  if ((2 == height) || (6 == height)) {
2020  common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
2021  height, rnd_val);
2022  } else {
2023  common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
2024  height, rnd_val);
2025  }
2026 }
2027 
2028 static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
2029  uint8_t *dst, int32_t dst_stride,
2030  const int8_t *filter, int32_t height,
2031  uint8_t rnd_val)
2032 {
2033  uint32_t loop_cnt;
2034  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
2035  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2036  v16i8 vec10, vec11;
2037  v16u8 tmp0, tmp1;
2038  v8i16 filt, out0, out1, out2, out3, out4, out5;
2039  v8i16 rnd_vec;
2040 
2041  mask0 = LD_SB(&mc_filt_mask_arr[0]);
2042  mask2 = LD_SB(&mc_filt_mask_arr[32]);
2043 
2044  src -= 1;
2045 
2046  /* rearranging filter */
2047  filt = LD_SH(filter);
2048  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2049 
2050  mask1 = mask0 + 2;
2051  mask3 = mask2 + 2;
2052 
2053  rnd_vec = __msa_fill_h(rnd_val);
2054 
2055  for (loop_cnt = (height >> 2); loop_cnt--;) {
2056  LD_SB4(src, src_stride, src0, src1, src2, src3);
2057  src += (4 * src_stride);
2058 
2059  XORI_B4_128_SB(src0, src1, src2, src3);
2060  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
2061  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
2062  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
2063  DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2064  out2, out3, out4, out5);
2065  DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
2066  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
2067  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
2068  VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
2069  DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
2070  out2, out3, out4, out5);
2071  DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
2072  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
2073  SRAR_H2_SH(out4, out5, rnd_vec);
2074  SAT_SH4_SH(out0, out1, out2, out3, 7);
2075  SAT_SH2_SH(out4, out5, 7);
2076  tmp0 = PCKEV_XORI128_UB(out2, out3);
2077  tmp1 = PCKEV_XORI128_UB(out4, out5);
2078  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2079  tmp0 = PCKEV_XORI128_UB(out0, out1);
2080  ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
2081  dst += (4 * dst_stride);
2082  }
2083 }
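/* 12-wide split: mask0/mask1 cover the left 8 columns of each row, while
 * mask2/mask3 (from &mc_filt_mask_arr[32]) gather the right 4 columns of
 * two consecutive rows into one vector so a single dot product covers both
 * rows at once.
 */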
2084 
2085 static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride,
2086  uint8_t *dst, int32_t dst_stride,
2087  const int8_t *filter, int32_t height,
2088  uint8_t rnd_val)
2089 {
2090  uint32_t loop_cnt;
2091  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2092  v16i8 filt0, filt1, mask0, mask1;
2093  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2094  v16u8 out;
2095  v8i16 rnd_vec;
2096 
2097  mask0 = LD_SB(&mc_filt_mask_arr[0]);
2098  src -= 1;
2099  rnd_vec = __msa_fill_h(rnd_val);
2100 
2101  /* rearranging filter */
2102  filt = LD_SH(filter);
2103  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2104 
2105  mask1 = mask0 + 2;
2106 
2107  for (loop_cnt = (height >> 2); loop_cnt--;) {
2108  LD_SB4(src, src_stride, src0, src2, src4, src6);
2109  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2110  src += (4 * src_stride);
2111 
2112  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2113  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2114  filt1, out0, out1, out2, out3);
2115  HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
2116  filt1, out4, out5, out6, out7);
2117  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
2118  SRAR_H4_SH(out4, out5, out6, out7, rnd_vec);
2119  SAT_SH4_SH(out0, out1, out2, out3, 7);
2120  SAT_SH4_SH(out4, out5, out6, out7, 7);
2121  out = PCKEV_XORI128_UB(out0, out1);
2122  ST_UB(out, dst);
2123  dst += dst_stride;
2124  out = PCKEV_XORI128_UB(out2, out3);
2125  ST_UB(out, dst);
2126  dst += dst_stride;
2127  out = PCKEV_XORI128_UB(out4, out5);
2128  ST_UB(out, dst);
2129  dst += dst_stride;
2130  out = PCKEV_XORI128_UB(out6, out7);
2131  ST_UB(out, dst);
2132  dst += dst_stride;
2133  }
2134 }
2135 
2136 static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
2137  uint8_t *dst, int32_t dst_stride,
2138  const int8_t *filter, int32_t height,
2139  uint8_t rnd_val)
2140 {
2141  uint8_t *dst1 = dst + 16;
2142  uint32_t loop_cnt;
2143  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2144  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2145  v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2146  v8i16 filt, out0, out1, out2, out3;
2147  v16u8 tmp0, tmp1;
2148  v8i16 rnd_vec;
2149 
2150  mask0 = LD_SB(&mc_filt_mask_arr[0]);
2151  src -= 1;
2152  rnd_vec = __msa_fill_h(rnd_val);
2153 
2154  /* rearranging filter */
2155  filt = LD_SH(filter);
2156  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2157 
2158  mask1 = mask0 + 2;
2159  mask00 = mask0 + 8;
2160  mask11 = mask0 + 10;
2161 
2162  for (loop_cnt = (height >> 2); loop_cnt--;) {
2163  LD_SB4(src, src_stride, src0, src2, src4, src6);
2164  LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2165  src += (4 * src_stride);
2166 
2167  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2168  VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
2169  VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
2170  VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
2171  VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
2172  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2173  out0, out1, out2, out3);
2174  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2175  out0, out1, out2, out3);
2176  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
2177  SAT_SH4_SH(out0, out1, out2, out3, 7);
2178  tmp0 = PCKEV_XORI128_UB(out0, out1);
2179  ST_UB(tmp0, dst);
2180  dst += dst_stride;
2181  tmp0 = PCKEV_XORI128_UB(out2, out3);
2182  ST_UB(tmp0, dst);
2183  dst += dst_stride;
2184 
2185  VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2186  VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2187  VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2188  VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2189  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2190  out0, out1, out2, out3);
2191  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2192  out0, out1, out2, out3);
2193  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
2194  SAT_SH4_SH(out0, out1, out2, out3, 7);
2195  tmp0 = PCKEV_XORI128_UB(out0, out1);
2196  ST_UB(tmp0, dst);
2197  dst += dst_stride;
2198  tmp0 = PCKEV_XORI128_UB(out2, out3);
2199  ST_UB(tmp0, dst);
2200  dst += dst_stride;
2201 
2202  /* 8 width */
2203  VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2204  VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2205  VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
2206  VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2207 
2208  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2209  out0, out1, out2, out3);
2210  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2211  out0, out1, out2, out3);
2212 
2213  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
2214  SAT_SH4_SH(out0, out1, out2, out3, 7);
2215  tmp0 = PCKEV_XORI128_UB(out0, out1);
2216  tmp1 = PCKEV_XORI128_UB(out2, out3);
2217  ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
2218  dst1 += (4 * dst_stride);
2219  }
2220 }
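/* 24-wide split: mask00/mask11 (mask0 + 8 / mask0 + 10) select pixels that
 * straddle the boundary between the two 16-byte loads, so columns 0..15
 * are filtered without unaligned reloads; the last 8 columns are filtered
 * separately and stored through dst1 = dst + 16.
 */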
2221 
2222 static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride,
2223  uint8_t *dst, int32_t dst_stride,
2224  const int8_t *filter, int32_t height,
2225  uint8_t rnd_val)
2226 {
2227  uint32_t loop_cnt;
2228  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2229  v16i8 filt0, filt1, mask0, mask1;
2230  v16u8 out;
2231  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2232  v8i16 rnd_vec;
2233 
2234  mask0 = LD_SB(&mc_filt_mask_arr[0]);
2235  src -= 1;
2236  rnd_vec = __msa_fill_h(rnd_val);
2237 
2238  /* rearranging filter */
2239  filt = LD_SH(filter);
2240  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2241 
2242  mask1 = mask0 + 2;
2243 
2244  for (loop_cnt = (height >> 1); loop_cnt--;) {
2245  src0 = LD_SB(src);
2246  src2 = LD_SB(src + 16);
2247  src3 = LD_SB(src + 24);
2248  src += src_stride;
2249  src4 = LD_SB(src);
2250  src6 = LD_SB(src + 16);
2251  src7 = LD_SB(src + 24);
2252  SLDI_B2_SB(src2, src6, src0, src4, src1, src5, 8);
2253  src += src_stride;
2254 
2255  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2256  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2257  filt0, filt1, out0, out1, out2, out3);
2258  HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2259  filt0, filt1, out4, out5, out6, out7);
2260  SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
2261  SRAR_H4_SH(out4, out5, out6, out7, rnd_vec);
2262  SAT_SH4_SH(out0, out1, out2, out3, 7);
2263  SAT_SH4_SH(out4, out5, out6, out7, 7);
2264  out = PCKEV_XORI128_UB(out0, out1);
2265  ST_UB(out, dst);
2266  out = PCKEV_XORI128_UB(out2, out3);
2267  ST_UB(out, dst + 16);
2268  dst += dst_stride;
2269  out = PCKEV_XORI128_UB(out4, out5);
2270  ST_UB(out, dst);
2271  out = PCKEV_XORI128_UB(out6, out7);
2272  ST_UB(out, dst + 16);
2273  dst += dst_stride;
2274  }
2275 }
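/* 32-wide rows load three aligned 16-byte vectors per row; SLDI_B2_SB then
 * splices bytes 8..23 out of the adjacent loads (into src1 and src5), so
 * all four 8-column groups are filtered without unaligned loads.
 */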
2276 
2277 static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
2278  uint8_t *dst, int32_t dst_stride,
2279  const int8_t *filter, uint8_t rnd_val)
2280 {
2281  v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
2282  v16i8 src2110, src4332, filt0, filt1;
2283  v16u8 out;
2284  v8i16 filt, out10;
2285  v8i16 rnd_vec;
2286 
2287  src -= src_stride;
2288  rnd_vec = __msa_fill_h(rnd_val);
2289 
2290  filt = LD_SH(filter);
2291  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2292 
2293  LD_SB3(src, src_stride, src0, src1, src2);
2294  src += (3 * src_stride);
2295 
2296  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2297  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2298  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2299  LD_SB2(src, src_stride, src3, src4);
2300  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2301  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2302  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2303  out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
2304  out10 = __msa_srar_h(out10, rnd_vec);
2305  out10 = __msa_sat_s_h(out10, 7);
2306  out = PCKEV_XORI128_UB(out10, out10);
2307  ST4x2_UB(out, dst, dst_stride);
2308 }
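/* Two 4-wide interleaved row pairs are packed into one vector with
 * __msa_ilvr_d, so a single FILT_4TAP_DPADD_S_H evaluates the vertical
 * filter for both output rows at once.
 */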
2309 
2310 static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
2311  uint8_t *dst, int32_t dst_stride,
2312  const int8_t *filter, int32_t height,
2313  uint8_t rnd_val)
2314 {
2315  uint32_t loop_cnt;
2316  v16i8 src0, src1, src2, src3, src4, src5;
2317  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2318  v16i8 src2110, src4332, filt0, filt1;
2319  v8i16 filt, out10, out32;
2320  v16u8 out;
2321  v8i16 rnd_vec;
2322 
2323  src -= src_stride;
2324  rnd_vec = __msa_fill_h(rnd_val);
2325 
2326  filt = LD_SH(filter);
2327  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2328 
2329  LD_SB3(src, src_stride, src0, src1, src2);
2330  src += (3 * src_stride);
2331 
2332  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2333 
2334  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2335  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2336 
2337  for (loop_cnt = (height >> 2); loop_cnt--;) {
2338  LD_SB3(src, src_stride, src3, src4, src5);
2339  src += (3 * src_stride);
2340  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2341  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2342  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2343  out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
2344 
2345  src2 = LD_SB(src);
2346  src += (src_stride);
2347  ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
2348  src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2349  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2350  out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
2351  SRAR_H2_SH(out10, out32, rnd_vec);
2352  SAT_SH2_SH(out10, out32, 7);
2353  out = PCKEV_XORI128_UB(out10, out32);
2354  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2355  dst += (4 * dst_stride);
2356  }
2357 }
2358 
2359 static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
2360  uint8_t *dst, int32_t dst_stride,
2361  const int8_t *filter, int32_t height,
2362  uint8_t rnd_val)
2363 {
2364  if (2 == height) {
2365  common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
2366  } else {
2367  common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
2368  height, rnd_val);
2369  }
2370 }
2371 
2372 static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
2373  uint8_t *dst, int32_t dst_stride,
2374  const int8_t *filter, int32_t height,
2375  uint8_t rnd_val)
2376 {
2377  uint32_t loop_cnt;
2378  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, out0, out1;
2379  v8i16 vec01, vec12, vec23, vec30, tmp0, tmp1, tmp2, tmp3;
2380  v8i16 filt, filt0, filt1;
2381  v8i16 rnd_vec;
2382 
2383  src -= src_stride;
2384  rnd_vec = __msa_fill_h(rnd_val);
2385 
2386  /* rearranging filter_y */
2387  filt = LD_SH(filter);
2388  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2389 
2390  LD_UB3(src, src_stride, src0, src1, src2);
2391  src += (3 * src_stride);
2392 
2393  vec0 = (v16u8) __msa_xori_b((v16u8) src0, 128);
2394  vec1 = (v16u8) __msa_xori_b((v16u8) src1, 128);
2395  vec2 = (v16u8) __msa_xori_b((v16u8) src2, 128);
2396 
2397  for (loop_cnt = (height >> 2); loop_cnt--;) {
2398  LD_UB4(src, src_stride, src3, src0, src1, src2);
2399  src += (4 * src_stride);
2400 
2401  vec3 = (v16u8) __msa_xori_b((v16u8) src3, 128);
2402  ILVR_B2_SH(vec1, vec0, vec3, vec2, vec01, vec23);
2403  tmp0 = FILT_4TAP_DPADD_S_H(vec01, vec23, filt0, filt1);
2404 
2405  vec0 = __msa_xori_b((v16u8) src0, 128);
2406  ILVR_B2_SH(vec2, vec1, vec0, vec3, vec12, vec30);
2407  tmp1 = FILT_4TAP_DPADD_S_H(vec12, vec30, filt0, filt1);
2408 
2409  vec1 = __msa_xori_b((v16u8) src1, 128);
2410  vec01 = (v8i16) __msa_ilvr_b((v16i8) vec1, (v16i8) vec0);
2411  tmp2 = FILT_4TAP_DPADD_S_H(vec23, vec01, filt0, filt1);
2412 
2413  vec2 = __msa_xori_b((v16u8) src2, 128);
2414  vec12 = (v8i16) __msa_ilvr_b((v16i8) vec2, (v16i8) vec1);
2415  tmp3 = FILT_4TAP_DPADD_S_H(vec30, vec12, filt0, filt1);
2416 
2417  SRAR_H4_SH(tmp0, tmp1, tmp2, tmp3, rnd_vec);
2418  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
2419  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
2420  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
2421  ST6x4_UB(out0, out1, dst, dst_stride);
2422  dst += (4 * dst_stride);
2423  }
2424 }
2425 
2426 static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
2427  uint8_t *dst, int32_t dst_stride,
2428  const int8_t *filter, uint8_t rnd_val)
2429 {
2430  v16i8 src0, src1, src2, src3, src4;
2431  v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
2432  v16u8 out;
2433  v8i16 rnd_vec;
2434 
2435  src -= src_stride;
2436  rnd_vec = __msa_fill_h(rnd_val);
2437 
2438  /* rearranging filter_y */
2439  filt = LD_SH(filter);
2440  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2441 
2442  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2443  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2444  ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
2445  tmp0 = FILT_4TAP_DPADD_S_H(src01, src23, filt0, filt1);
2446  ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
2447  tmp1 = FILT_4TAP_DPADD_S_H(src12, src34, filt0, filt1);
2448  SRAR_H2_SH(tmp0, tmp1, rnd_vec);
2449  SAT_SH2_SH(tmp0, tmp1, 7);
2450  out = PCKEV_XORI128_UB(tmp0, tmp1);
2451  ST8x2_UB(out, dst, dst_stride);
2452 }
2453 
2454 static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
2455  uint8_t *dst, int32_t dst_stride,
2456  const int8_t *filter, uint8_t rnd_val)
2457 {
2458  uint32_t loop_cnt;
2459  uint64_t out0, out1, out2;
2460  v16i8 src0, src1, src2, src3, src4, src5;
2461  v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2462  v8i16 filt, filt0, filt1;
2463  v8i16 rnd_vec;
2464 
2465  src -= src_stride;
2466  rnd_vec = __msa_fill_h(rnd_val);
2467 
2468  /* rearranging filter_y */
2469  filt = LD_SH(filter);
2470  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2471 
2472  LD_SB3(src, src_stride, src0, src1, src2);
2473  src += (3 * src_stride);
2474 
2475  XORI_B3_128_SB(src0, src1, src2);
2476  ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
2477 
2478  for (loop_cnt = 2; loop_cnt--;) {
2479  LD_SB3(src, src_stride, src3, src4, src5);
2480  src += (3 * src_stride);
2481 
2482  XORI_B3_128_SB(src3, src4, src5);
2483  ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
2484  tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1);
2485  tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt0, filt1);
2486  tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt0, filt1);
2487  SRAR_H3_SH(tmp0, tmp1, tmp2, rnd_vec);
2488  SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
2489  PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
2490  XORI_B2_128_SH(tmp0, tmp2);
2491 
2492  out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2493  out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2494  out2 = __msa_copy_u_d((v2i64) tmp2, 0);
2495  SD(out0, dst);
2496  dst += dst_stride;
2497  SD(out1, dst);
2498  dst += dst_stride;
2499  SD(out2, dst);
2500  dst += dst_stride;
2501 
2502  src2 = src5;
2503  vec0 = vec3;
2504  vec2 = vec4;
2505  }
2506 }
2507 
2508 static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2509  uint8_t *dst, int32_t dst_stride,
2510  const int8_t *filter, int32_t height,
2511  uint8_t rnd_val)
2512 {
2513  uint32_t loop_cnt;
2514  v16i8 src0, src1, src2, src7, src8, src9, src10;
2515  v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2516  v16u8 tmp0, tmp1;
2517  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
2518  v8i16 rnd_vec;
2519 
2520  src -= src_stride;
2521  rnd_vec = __msa_fill_h(rnd_val);
2522 
2523  filt = LD_SH(filter);
2524  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2525 
2526  LD_SB3(src, src_stride, src0, src1, src2);
2527  src += (3 * src_stride);
2528 
2529  XORI_B3_128_SB(src0, src1, src2);
2530  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2531 
2532  for (loop_cnt = (height >> 2); loop_cnt--;) {
2533  LD_SB4(src, src_stride, src7, src8, src9, src10);
2534  src += (4 * src_stride);
2535 
2536  XORI_B4_128_SB(src7, src8, src9, src10);
2537  ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
2538  src72_r, src87_r, src98_r, src109_r);
2539  out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
2540  out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
2541  out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
2542  out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
2543  SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
2544  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2545  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
2546  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
2547  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2548  dst += (4 * dst_stride);
2549 
2550  src10_r = src98_r;
2551  src21_r = src109_r;
2552  src2 = src10;
2553  }
2554 }
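/* Sliding window over the rows: after each group of four outputs only the
 * interleaved pairs src98_r/src109_r and the last source row are kept, so
 * each iteration loads and interleaves just four new rows.
 */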
2555 
2556 static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
2557  uint8_t *dst, int32_t dst_stride,
2558  const int8_t *filter, int32_t height,
2559  uint8_t rnd_val)
2560 {
2561  if (2 == height) {
2562  common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
2563  } else if (6 == height) {
2564  common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
2565  } else {
2566  common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
2567  filter, height, rnd_val);
2568  }
2569 }
2570 
2571 static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
2572  uint8_t *dst, int32_t dst_stride,
2573  const int8_t *filter, int32_t height,
2574  uint8_t rnd_val)
2575 {
2576  uint32_t loop_cnt;
2577  v16i8 src0, src1, src2, src3, src4, src5, src6;
2578  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2579  v16u8 out0, out1;
2580  v8i16 src10, src21, src32, src43, src54, src65, src87, src109, src1211;
2581  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, filt, filt0, filt1;
2582  v4u32 mask = { 2, 6, 2, 6 };
2583  v8i16 rnd_vec;
2584 
2585  /* rearranging filter_y */
2586  filt = LD_SH(filter);
2587  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2588 
2589  rnd_vec = __msa_fill_h(rnd_val);
2590 
2591  src -= src_stride;
2592 
2593  LD_SB3(src, src_stride, src0, src1, src2);
2594  src += (3 * src_stride);
2595 
2596  XORI_B3_128_SB(src0, src1, src2);
2597  VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);
2598 
2599  for (loop_cnt = (height >> 2); loop_cnt--;) {
2600  LD_SB4(src, src_stride, src3, src4, src5, src6);
2601  src += (4 * src_stride);
2602 
2603  XORI_B4_128_SB(src3, src4, src5, src6);
2604  ILVR_B2_SH(src1, src0, src3, src2, src10, src32);
2605  VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
2606  VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);
2607  tmp0 = FILT_4TAP_DPADD_S_H(src10, src32, filt0, filt1);
2608  ILVR_B4_SH(src2, src1, src4, src3, src5, src4, src6, src5,
2609  src21, src43, src54, src65);
2610  tmp1 = FILT_4TAP_DPADD_S_H(src21, src43, filt0, filt1);
2611  tmp2 = FILT_4TAP_DPADD_S_H(src32, src54, filt0, filt1);
2612  tmp3 = FILT_4TAP_DPADD_S_H(src43, src65, filt0, filt1);
2613  ILVR_B3_SH(vec1, vec0, vec3, vec2, vec5, vec4, src87, src109, src1211);
2614  tmp4 = FILT_4TAP_DPADD_S_H(src87, src109, filt0, filt1);
2615  tmp5 = FILT_4TAP_DPADD_S_H(src109, src1211, filt0, filt1);
2616  SRAR_H4_SH(tmp0, tmp1, tmp2, tmp3, rnd_vec);
2617  SRAR_H2_SH(tmp4, tmp5, rnd_vec);
2618  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
2619  SAT_SH2_SH(tmp4, tmp5, 7);
2620  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
2621  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
2622  ST8x4_UB(out0, out1, dst, dst_stride);
2623  out0 = PCKEV_XORI128_UB(tmp4, tmp5);
2624  ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
2625  dst += (4 * dst_stride);
2626 
2627  src0 = src4;
2628  src1 = src5;
2629  src2 = src6;
2630  vec0 = vec4;
2631  vec1 = vec5;
2633  }
2634 }
2635 
2636 static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
2637  uint8_t *dst, int32_t dst_stride,
2638  const int8_t *filter, int32_t height,
2639  uint8_t rnd_val)
2640 {
2641  uint32_t loop_cnt;
2642  v16i8 src0, src1, src2, src3, src4, src5, src6;
2643  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
2644  v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
2645  v16u8 tmp0, tmp1, tmp2, tmp3;
2646  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2647  v8i16 rnd_vec;
2648 
2649  src -= src_stride;
2650  rnd_vec = __msa_fill_h(rnd_val);
2651 
2652  filt = LD_SH(filter);
2653  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2654 
2655  LD_SB3(src, src_stride, src0, src1, src2);
2656  src += (3 * src_stride);
2657 
2658  XORI_B3_128_SB(src0, src1, src2);
2659  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2660  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2661 
2662  for (loop_cnt = (height >> 2); loop_cnt--;) {
2663  LD_SB4(src, src_stride, src3, src4, src5, src6);
2664  src += (4 * src_stride);
2665 
2666  XORI_B4_128_SB(src3, src4, src5, src6);
2667  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2668  src32_r, src43_r, src54_r, src65_r);
2669  ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2670  src32_l, src43_l, src54_l, src65_l);
2671  out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
2672  out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
2673  out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
2674  out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
2675  out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
2676  out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
2677  out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
2678  out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
2679  SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
2680  SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
2681  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2682  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2683  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2684  out3_r, tmp0, tmp1, tmp2, tmp3);
2685  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
2686  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
2687  dst += (4 * dst_stride);
2688 
2689  src10_r = src54_r;
2690  src21_r = src65_r;
2691  src10_l = src54_l;
2692  src21_l = src65_l;
2693  src2 = src6;
2694  }
2695 }
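/* 16-wide vertical filtering keeps 16-bit precision by splitting each row
 * into low/high 8-lane halves with ILVR/ILVL; the halves are filtered
 * independently and re-packed with PCKEV_B4_UB before the final XOR back
 * to unsigned pixels.
 */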
2696 
2697 static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
2698  uint8_t *dst, int32_t dst_stride,
2699  const int8_t *filter, int32_t height,
2700  uint8_t rnd_val)
2701 {
2702  uint32_t loop_cnt;
2703  uint64_t out0, out1;
2704  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2705  v16i8 src11, filt0, filt1;
2706  v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
2707  v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
2708  v16u8 out;
2709  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
2710  v8i16 rnd_vec;
2711 
2712  src -= src_stride;
2713 
2714  filt = LD_SH(filter);
2715  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2716 
2717  rnd_vec = __msa_fill_h(rnd_val);
2718 
2719  /* 16 width */
2720  LD_SB3(src, src_stride, src0, src1, src2);
2721  XORI_B3_128_SB(src0, src1, src2);
2722  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2723  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2724 
2725  /* 8 width */
2726  LD_SB3(src + 16, src_stride, src6, src7, src8);
2727  src += (3 * src_stride);
2728  XORI_B3_128_SB(src6, src7, src8);
2729  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2730 
2731  for (loop_cnt = (height >> 2); loop_cnt--;) {
2732  /* 16 width */
2733  LD_SB2(src, src_stride, src3, src4);
2734  XORI_B2_128_SB(src3, src4);
2735  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2736  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2737 
2738  /* 8 width */
2739  LD_SB2(src + 16, src_stride, src9, src10);
2740  src += (2 * src_stride);
2741  XORI_B2_128_SB(src9, src10);
2742  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2743 
2744  /* 16 width */
2745  out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
2746  out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
2747  out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
2748  out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
2749 
2750  /* 8 width */
2751  out2_r = FILT_4TAP_DPADD_S_H(src76_r, src98_r, filt0, filt1);
2752  out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
2753 
2754  /* 16 + 8 width */
2755  SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
2756  SRAR_H2_SH(out0_l, out1_l, rnd_vec);
2757  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2758  SAT_SH2_SH(out0_l, out1_l, 7);
2759  out = PCKEV_XORI128_UB(out0_r, out0_l);
2760  ST_UB(out, dst);
2761  PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
2762  XORI_B2_128_SH(out2_r, out3_r);
2763  out0 = __msa_copy_u_d((v2i64) out2_r, 0);
2764  out1 = __msa_copy_u_d((v2i64) out3_r, 0);
2765  SD(out0, dst + 16);
2766  dst += dst_stride;
2767  out = PCKEV_XORI128_UB(out1_r, out1_l);
2768  ST_UB(out, dst);
2769  SD(out1, dst + 16);
2770  dst += dst_stride;
2771 
2772  /* 16 width */
2773  LD_SB2(src, src_stride, src5, src2);
2774  XORI_B2_128_SB(src5, src2);
2775  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2776  ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
2777 
2778  /* 8 width */
2779  LD_SB2(src + 16, src_stride, src11, src8);
2780  src += (2 * src_stride);
2781  XORI_B2_128_SB(src11, src8);
2782  ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
2783 
2784  /* 16 width */
2785  out0_r = FILT_4TAP_DPADD_S_H(src32_r, src10_r, filt0, filt1);
2786  out0_l = FILT_4TAP_DPADD_S_H(src32_l, src10_l, filt0, filt1);
2787  out1_r = FILT_4TAP_DPADD_S_H(src43_r, src21_r, filt0, filt1);
2788  out1_l = FILT_4TAP_DPADD_S_H(src43_l, src21_l, filt0, filt1);
2789 
2790  /* 8 width */
2791  out2_r = FILT_4TAP_DPADD_S_H(src98_r, src76_r, filt0, filt1);
2792  out3_r = FILT_4TAP_DPADD_S_H(src109_r, src87_r, filt0, filt1);
2793 
2794  /* 16 + 8 width */
2795  SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
2796  SRAR_H2_SH(out0_l, out1_l, rnd_vec);
2797  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2798  SAT_SH2_SH(out0_l, out1_l, 7);
2799  out = PCKEV_XORI128_UB(out0_r, out0_l);
2800  ST_UB(out, dst);
2801  out = PCKEV_XORI128_UB(out2_r, out2_r);
2802  ST8x1_UB(out, dst + 16);
2803  dst += dst_stride;
2804  out = PCKEV_XORI128_UB(out1_r, out1_l);
2805  ST_UB(out, dst);
2806  out = PCKEV_XORI128_UB(out3_r, out3_r);
2807  ST8x1_UB(out, dst + 16);
2808  dst += dst_stride;
2809  }
2810 }
2811 
2812 static void common_vt_4t_32w_mult_msa(uint8_t *src, int32_t src_stride,
2813  uint8_t *dst, int32_t dst_stride,
2814  const int8_t *filter, int32_t height,
2815  uint8_t rnd_val, int32_t width)
2816 {
2817  uint32_t loop_cnt, cnt;
2818  uint8_t *dst_tmp, *src_tmp;
2819  v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
2820  v16i8 src10_r, src32_r, src76_r, src98_r;
2821  v16i8 src21_r, src43_r, src87_r, src109_r;
2822  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2823  v16i8 src10_l, src32_l, src76_l, src98_l;
2824  v16i8 src21_l, src43_l, src87_l, src109_l;
2825  v8i16 filt;
2826  v16i8 filt0, filt1;
2827  v8i16 rnd_vec;
2828  v16u8 out;
2829 
2830  src -= src_stride;
2831  rnd_vec = __msa_fill_h(rnd_val);
2832 
2833  filt = LD_SH(filter);
2834  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2835 
2836  for (cnt = (width >> 5); cnt--;) {
2837  dst_tmp = dst;
2838  src_tmp = src;
2839 
2840  /* 16 width */
2841  LD_SB3(src_tmp, src_stride, src0, src1, src2);
2842  XORI_B3_128_SB(src0, src1, src2);
2843 
2844  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2845  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2846 
2847  /* next 16 width */
2848  LD_SB3(src_tmp + 16, src_stride, src6, src7, src8);
2849  src_tmp += (3 * src_stride);
2850 
2851  XORI_B3_128_SB(src6, src7, src8);
2852  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2853  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
2854 
2855  for (loop_cnt = (height >> 1); loop_cnt--;) {
2856  /* 16 width */
2857  LD_SB2(src_tmp, src_stride, src3, src4);
2858  XORI_B2_128_SB(src3, src4);
2859  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2860  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2861 
2862  /* 16 width */
2863  out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
2864  out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
2865  out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
2866  out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
2867 
2868  /* 16 width */
2869  SRAR_H4_SH(out0_r, out1_r, out0_l, out1_l, rnd_vec);
2870  SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
2871  out = PCKEV_XORI128_UB(out0_r, out0_l);
2872  ST_UB(out, dst_tmp);
2873  out = PCKEV_XORI128_UB(out1_r, out1_l);
2874  ST_UB(out, dst_tmp + dst_stride);
2875 
2876  src10_r = src32_r;
2877  src21_r = src43_r;
2878  src10_l = src32_l;
2879  src21_l = src43_l;
2880  src2 = src4;
2881 
2882  /* next 16 width */
2883  LD_SB2(src_tmp + 16, src_stride, src9, src10);
2884  src_tmp += (2 * src_stride);
2885  XORI_B2_128_SB(src9, src10);
2886  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2887  ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
2888 
2889  /* next 16 width */
2890  out2_r = FILT_4TAP_DPADD_S_H(src76_r, src98_r, filt0, filt1);
2891  out2_l = FILT_4TAP_DPADD_S_H(src76_l, src98_l, filt0, filt1);
2892  out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
2893  out3_l = FILT_4TAP_DPADD_S_H(src87_l, src109_l, filt0, filt1);
2894 
2895  /* next 16 width */
2896  SRAR_H4_SH(out2_r, out3_r, out2_l, out3_l, rnd_vec);
2897  SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
2898  out = PCKEV_XORI128_UB(out2_r, out2_l);
2899  ST_UB(out, dst_tmp + 16);
2900  out = PCKEV_XORI128_UB(out3_r, out3_l);
2901  ST_UB(out, dst_tmp + 16 + dst_stride);
2902 
2903  dst_tmp += 2 * dst_stride;
2904 
2905  src76_r = src98_r;
2906  src87_r = src109_r;
2907  src76_l = src98_l;
2908  src87_l = src109_l;
2909  src8 = src10;
2910  }
2911 
2912  src += 32;
2913  dst += 32;
2914  }
2915 }
2916 
2917 static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
2918  uint8_t *dst, int32_t dst_stride,
2919  const int8_t *filter, int32_t height,
2920  uint8_t rnd_val)
2921 {
2922  common_vt_4t_32w_mult_msa(src, src_stride, dst, dst_stride,
2923  filter, height, rnd_val, 32);
2924 }
2925 
2926 static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
2927  int32_t src_stride,
2928  uint8_t *dst,
2929  int32_t dst_stride,
2930  const int8_t *filter_x,
2931  const int8_t *filter_y,
2932  int32_t height)
2933 {
2934  v16i8 src0, src1, src2, src3, src4;
2935  v8i16 filt0, filt1;
2936  v4i32 filt_h0, filt_h1;
2937  v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2938  v16i8 mask1;
2939  v8i16 filter_vec, const_vec;
2940  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2941  v8i16 dst0, dst1, dst2, dst3, dst4;
2942  v4i32 dst0_r, dst1_r;
2943  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
2944 
2945  src -= (src_stride + 1);
2946 
2947  filter_vec = LD_SH(filter_x);
2948  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2949 
2950  filter_vec = LD_SH(filter_y);
2951  vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
2952  filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
2953 
2954  SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
2955 
2956  mask1 = mask0 + 2;
2957 
2958  const_vec = __msa_ldi_h(128);
2959  const_vec <<= 6;
2960 
2961  LD_SB3(src, src_stride, src0, src1, src2);
2962  src += (3 * src_stride);
2963 
2964  XORI_B3_128_SB(src0, src1, src2);
2965 
2966  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2967  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2968  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2969 
2970  dst0 = const_vec;
2971  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2972  dst1 = const_vec;
2973  DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
2974  dst2 = const_vec;
2975  DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
2976 
2977  ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
2978  LD_SB2(src, src_stride, src3, src4);
2979  XORI_B2_128_SB(src3, src4);
2980 
2981  /* row 3 */
2982  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2983  dst3 = const_vec;
2984  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2985 
2986  dst32_r = __msa_ilvr_h(dst3, dst2);
2987  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
2988  dst0_r >>= 6;
2989 
2990  /* row 4 */
2991  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2992  dst4 = const_vec;
2993  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2994 
2995  dst43_r = __msa_ilvr_h(dst4, dst3);
2996  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
2997  dst1_r >>= 6;
2998 
2999  dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
3000  dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 6);
3001  dst0_r = (v4i32) CLIP_SH_0_255(dst0_r);
3002  dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
3003 
3004  ST4x2_UB(dst0_r, dst, dst_stride);
3005 }
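/* Scalar model of the two-stage 4-tap kernel above (an illustrative
 * sketch, assuming the HEVC chroma coefficients whose taps sum to 64):
 *
 *   h[j]  = 8192 + sum_k fx[k] * (src[j][x + k - 1] - 128);   16-bit
 *   v     = sum_j fy[j] * h[j];                               32-bit
 *   pixel = clip_0_255(((v >> 6) + 32) >> 6);
 *
 * As in the 8-tap case, the +8192 bias cancels the signed-byte XOR once
 * the two 6-bit shifts are applied.
 */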
3006 
3007 static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
3008  int32_t src_stride,
3009  uint8_t *dst,
3010  int32_t dst_stride,
3011  const int8_t *filter_x,
3012  const int8_t *filter_y,
3013  int32_t height)
3014 {
3015  v16i8 src0, src1, src2, src3, src4, src5, src6;
3016  v8i16 filt0, filt1;
3017  v4i32 filt_h0, filt_h1;
3018  v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3019  v16i8 mask1;
3020  v8i16 filter_vec, const_vec;
3021  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3022  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3023  v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
3024  v8i16 out0_r, out1_r;
3025  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3026 
3027  src -= (src_stride + 1);
3028 
3029  filter_vec = LD_SH(filter_x);
3030  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3031 
3032  filter_vec = LD_SH(filter_y);
3033  vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3034  filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3035 
3036  SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3037 
3038  mask1 = mask0 + 2;
3039 
3040  const_vec = __msa_ldi_h(128);
3041  const_vec <<= 6;
3042 
3043  LD_SB3(src, src_stride, src0, src1, src2);
3044  src += (3 * src_stride);
3045 
3046  XORI_B3_128_SB(src0, src1, src2);
3047 
3048  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3049  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3050  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3051 
3052  dst0 = const_vec;
3053  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3054  dst1 = const_vec;
3055  DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3056  dst2 = const_vec;
3057  DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3058 
3059  ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
3060  LD_SB4(src, src_stride, src3, src4, src5, src6);
3061  XORI_B4_128_SB(src3, src4, src5, src6);
3062 
3063  /* row 3 */
3064  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3065  dst3 = const_vec;
3066  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3067 
3068  dst32_r = __msa_ilvr_h(dst3, dst2);
3069  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3070  dst0_r >>= 6;
3071 
3072  /* row 4 */
3073  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3074  dst4 = const_vec;
3075  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3076 
3077  dst43_r = __msa_ilvr_h(dst4, dst3);
3078  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3079  dst1_r >>= 6;
3080 
3081  /* row 5 */
3082  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3083  dst5 = const_vec;
3084  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3085 
3086  dst10_r = __msa_ilvr_h(dst5, dst4);
3087  dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
3088  dst2_r >>= 6;
3089 
3090  /* row 6 */
3091  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3092  dst2 = const_vec;
3093  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3094 
3095  dst21_r = __msa_ilvr_h(dst2, dst5);
3096  dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
3097  dst3_r >>= 6;
3098 
3099  PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, out0_r, out1_r);
3100  SRARI_H2_SH(out0_r, out1_r, 6);
3101  CLIP_SH2_0_255(out0_r, out1_r);
3102  out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);
3103 
3104  ST4x4_UB(out0_r, out0_r, 0, 1, 2, 3, dst, dst_stride);
3105 }
3106 
3107 static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
3108  int32_t src_stride,
3109  uint8_t *dst,
3110  int32_t dst_stride,
3111  const int8_t *filter_x,
3112  const int8_t *filter_y,
3113  int32_t height)
3114 {
3115  uint32_t loop_cnt;
3116  v16i8 src0, src1, src2, src3, src4, src5;
3117  v16i8 src6, src7, src8, src9, src10;
3118  v8i16 filt0, filt1;
3119  v4i32 filt_h0, filt_h1;
3120  v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3121  v16i8 mask1;
3122  v8i16 filter_vec, const_vec;
3123  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3124  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
3125  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3126  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3127  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3128  v8i16 out0_r, out1_r, out2_r, out3_r;
3129 
3130  src -= (src_stride + 1);
3131 
3132  filter_vec = LD_SH(filter_x);
3133  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3134 
3135  filter_vec = LD_SH(filter_y);
3136  vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3137  filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3138 
3139  SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3140 
3141  mask1 = mask0 + 2;
3142 
3143  const_vec = __msa_ldi_h(128);
3144  const_vec <<= 6;
3145 
3146  LD_SB3(src, src_stride, src0, src1, src2);
3147  src += (3 * src_stride);
3148 
3149  XORI_B3_128_SB(src0, src1, src2);
3150 
3151  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3152  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3153  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3154 
3155  dst0 = const_vec;
3156  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3157  dst1 = const_vec;
3158  DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3159  dst2 = const_vec;
3160  DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3161 
3162  ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
3163 
3164  for (loop_cnt = height >> 3; loop_cnt--;) {
3165  LD_SB8(src, src_stride,
3166  src3, src4, src5, src6, src7, src8, src9, src10);
3167  src += (8 * src_stride);
3168 
3169  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3170 
3171  /* row 3 */
3172  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3173  dst3 = const_vec;
3174  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3175 
3176  dst32_r = __msa_ilvr_h(dst3, dst2);
3177  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3178  dst0_r >>= 6;
3179 
3180  /* row 4 */
3181  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3182  dst4 = const_vec;
3183  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3184 
3185  dst43_r = __msa_ilvr_h(dst4, dst3);
3186  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3187  dst1_r >>= 6;
3188 
3189  /* row 5 */
3190  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3191  dst5 = const_vec;
3192  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3193 
3194  dst54_r = __msa_ilvr_h(dst5, dst4);
3195  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3196  dst2_r >>= 6;
3197 
3198  /* row 6 */
3199  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3200  dst6 = const_vec;
3201  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
3202 
3203  dst65_r = __msa_ilvr_h(dst6, dst5);
3204  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3205  dst3_r >>= 6;
3206 
3207  /* row 7 */
3208  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3209  dst7 = const_vec;
3210  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
3211 
3212  dst76_r = __msa_ilvr_h(dst7, dst6);
3213  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3214  dst4_r >>= 6;
3215 
3216  /* row 8 */
3217  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
3218  dst8 = const_vec;
3219  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
3220 
3221  dst87_r = __msa_ilvr_h(dst8, dst7);
3222  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3223  dst5_r >>= 6;
3224 
3225  /* row 9 */
3226  VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
3227  dst9 = const_vec;
3228  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
3229 
3230  dst10_r = __msa_ilvr_h(dst9, dst8);
3231  dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
3232  dst6_r >>= 6;
3233 
3234  /* row 10 */
3235  VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
3236  dst2 = const_vec;
3237  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3238 
3239  dst21_r = __msa_ilvr_h(dst2, dst9);
3240  dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
3241  dst7_r >>= 6;
3242 
3243  PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
3244  dst5_r, dst4_r, dst7_r, dst6_r,
3245  out0_r, out1_r, out2_r, out3_r);
3246 
3247  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3248  CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
3249 
3250  PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3251  ST4x8_UB(out0_r, out1_r, dst, dst_stride);
3252  dst += (8 * dst_stride);
3253  }
3254 }
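
/* Annotation (not in the original source): rows 9 and 10 above deliberately
 * recycle dst10_r/dst21_r and dst2, so the two interleaved row pairs and the
 * last horizontal result that the vertical filter needs are already in place
 * when the next batch of eight rows starts, with no reload or re-filtering. */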
3255 
3256 static void hevc_hv_uni_4t_4w_msa(uint8_t *src,
3257  int32_t src_stride,
3258  uint8_t *dst,
3259  int32_t dst_stride,
3260  const int8_t *filter_x,
3261  const int8_t *filter_y,
3262  int32_t height)
3263 {
3264  if (2 == height) {
3265  hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
3266  filter_x, filter_y, height);
3267  } else if (4 == height) {
3268  hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
3269  filter_x, filter_y, height);
3270  } else if (0 == (height % 8)) {
3271  hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
3272  filter_x, filter_y, height);
3273  }
3274 }
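
/* Annotation (not in the original source): only heights of 2, 4 and
 * multiples of 8 are dispatched here; any other height falls through all
 * three branches and writes nothing, so callers are assumed to request only
 * the block heights that can occur for 4-pixel-wide HEVC blocks. */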
3275 
3276 static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
3277  int32_t src_stride,
3278  uint8_t *dst,
3279  int32_t dst_stride,
3280  const int8_t *filter_x,
3281  const int8_t *filter_y,
3282  int32_t height)
3283 {
3284  uint32_t loop_cnt;
3285  v16i8 src0, src1, src2, src3, src4, src5, src6;
3286  v8i16 filt0, filt1;
3287  v4i32 filt_h0, filt_h1;
3288  v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3289  v16i8 mask1;
3290  v8i16 filter_vec, const_vec;
3291  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3292  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3293  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3294  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3295  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3296  v8i16 out0_r, out1_r, out2_r, out3_r;
3297 
3298  src -= (src_stride + 1);
3299 
3300  filter_vec = LD_SH(filter_x);
3301  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3302 
3303  filter_vec = LD_SH(filter_y);
3304  vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3305  filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3306 
3307  SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3308 
3309  mask1 = mask0 + 2;
3310 
3311  const_vec = __msa_ldi_h(128);
3312  const_vec <<= 6;
3313 
3314  LD_SB3(src, src_stride, src0, src1, src2);
3315  src += (3 * src_stride);
3316 
3317  XORI_B3_128_SB(src0, src1, src2);
3318 
3319  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3320  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3321  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3322 
3323  dst0 = const_vec;
3324  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3325  dst1 = const_vec;
3326  DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3327  dst2 = const_vec;
3328  DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3329 
3330  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3331  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3332 
3333  for (loop_cnt = height >> 2; loop_cnt--;) {
3334  LD_SB4(src, src_stride, src3, src4, src5, src6);
3335  src += (4 * src_stride);
3336 
3337  XORI_B4_128_SB(src3, src4, src5, src6);
3338 
3339  /* row 3 */
3340  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3341  dst3 = const_vec;
3342  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3343 
3344  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3345  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3346  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3347  dst0_r >>= 6;
3348  dst0_l >>= 6;
3349 
3350  /* row 4 */
3351  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3352  dst4 = const_vec;
3353  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3354 
3355  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3356  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3357  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3358  dst1_r >>= 6;
3359  dst1_l >>= 6;
3360 
3361  /* row 5 */
3362  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3363  dst5 = const_vec;
3364  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3365 
3366  ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
3367  dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
3368  dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
3369 
3370  dst2_r >>= 6;
3371  dst2_l >>= 6;
3372 
3373  /* row 6 */
3374  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3375  dst2 = const_vec;
3376  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3377 
3378  ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
3379  dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
3380  dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
3381 
3382  dst3_r >>= 6;
3383  dst3_l >>= 6;
3384 
3385  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3386  dst2_l, dst2_r, dst3_l, dst3_r,
3387  out0_r, out1_r, out2_r, out3_r);
3388 
3389  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3390  CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
3391 
3392  PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3393  ST6x4_UB(out0_r, out1_r, dst, dst_stride);
3394  dst += (4 * dst_stride);
3395  }
3396 }
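
/* Annotation (not in the original source): the 6-wide variant runs the same
 * 8-column pipeline as the 8-wide kernels but stores only six bytes per row
 * via ST6x4_UB, discarding the two rightmost computed pixels. */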
3397 
3398 static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
3399  int32_t src_stride,
3400  uint8_t *dst,
3401  int32_t dst_stride,
3402  const int8_t *filter_x,
3403  const int8_t *filter_y,
3404  int32_t height)
3405 {
3406  v16i8 src0, src1, src2, src3, src4;
3407  v8i16 filt0, filt1;
3408  v4i32 filt_h0, filt_h1;
3409  v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3410  v16i8 mask1;
3411  v8i16 filter_vec, const_vec;
3412  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3413  v8i16 dst0, dst1, dst2, dst3, dst4;
3414  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3415  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3416  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3417  v8i16 out0_r, out1_r;
3418 
3419  src -= (src_stride + 1);
3420 
3421  filter_vec = LD_SH(filter_x);
3422  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3423 
3424  filter_vec = LD_SH(filter_y);
3425  vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3426  filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3427 
3428  SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3429 
3430  mask1 = mask0 + 2;
3431 
3432  const_vec = __msa_ldi_h(128);
3433  const_vec <<= 6;
3434 
3435  LD_SB3(src, src_stride, src0, src1, src2);
3436  src += (3 * src_stride);
3437 
3438  XORI_B3_128_SB(src0, src1, src2);
3439 
3440  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3441  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3442  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3443 
3444  dst0 = const_vec;
3445  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3446  dst1 = const_vec;
3447  DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3448  dst2 = const_vec;
3449  DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3450 
3451  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3452  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3453 
3454  LD_SB2(src, src_stride, src3, src4);
3455  XORI_B2_128_SB(src3, src4);
3456 
3457  /* row 3 */
3458  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3459  dst3 = const_vec;
3460  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3461 
3462  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3463  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3464  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3465  dst0_r >>= 6;
3466  dst0_l >>= 6;
3467 
3468  /* row 4 */
3469  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3470  dst4 = const_vec;
3471  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3472 
3473  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3474  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3475  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3476  dst1_r >>= 6;
3477  dst1_l >>= 6;
3478 
3479  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
3480  SRARI_H2_SH(out0_r, out1_r, 6);
3481  CLIP_SH2_0_255(out0_r, out1_r);
3482  out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);
3483 
3484  ST8x2_UB(out0_r, dst, dst_stride);
3485 }
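
/* Annotation (not in the original source): the 8x2 case needs no loop; two
 * output rows require only the three priming rows plus src3/src4, and the
 * right/left interleaves from ILVRL_H2_SH cover all eight output columns. */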
3486 
3487 static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
3488  int32_t src_stride,
3489  uint8_t *dst,
3490  int32_t dst_stride,
3491  const int8_t *filter_x,
3492  const int8_t *filter_y,
3493  int32_t height)
3494 {
3495  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3496  v8i16 filt0, filt1;
3497  v4i32 filt_h0, filt_h1;
3498  v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3499  v16i8 mask1;
3500  v8i16 filter_vec, const_vec;
3501  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3502  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3503  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3504  v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3505  v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3506  v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3507  v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3508  v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3509  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
3510 
3511  src -= (src_stride + 1);
3512 
3513  filter_vec = LD_SH(filter_x);
3514  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3515 
3516  filter_vec = LD_SH(filter_y);
3517  vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3518  filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3519 
3520  SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3521 
3522  mask1 = mask0 + 2;
3523 
3524  const_vec = __msa_ldi_h(128);
3525  const_vec <<= 6;
3526 
3527  LD_SB3(src, src_stride, src0, src1, src2);
3528  src += (3 * src_stride);
3529 
3530  XORI_B3_128_SB(src0, src1, src2);
3531 
3532  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3533  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3534  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3535 
3536  dst0 = const_vec;
3537  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3538  dst1 = const_vec;
3539  DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3540  dst2 = const_vec;
3541  DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3542 
3543  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3544  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3545 
3546  LD_SB2(src, src_stride, src3, src4);
3547  src += (2 * src_stride);
3548 
3549  XORI_B2_128_SB(src3, src4);
3550 
3551  /* row 3 */
3552  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3553  dst3 = const_vec;
3554  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3555 
3556  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3557  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3558  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3559 
3560  dst0_r >>= 6;
3561  dst0_l >>= 6;
3562 
3563  /* row 4 */
3564  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3565  dst4 = const_vec;
3566  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3567 
3568  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3569  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3570  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3571  dst1_r >>= 6;
3572  dst1_l >>= 6;
3573 
3574  LD_SB2(src, src_stride, src5, src6);
3575  src += (2 * src_stride);
3576 
3577  XORI_B2_128_SB(src5, src6);
3578 
3579  /* row 5 */
3580  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3581  dst5 = const_vec;
3582  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3583 
3584  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3585  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3586  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3587  dst2_r >>= 6;
3588  dst2_l >>= 6;
3589 
3590  /* row 6 */
3591  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3592  dst6 = const_vec;
3593  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
3594 
3595  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3596  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3597  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3598  dst3_r >>= 6;
3599  dst3_l >>= 6;
3600 
3601  LD_SB2(src, src_stride, src7, src8);
3602  src += (2 * src_stride);
3603 
3604  XORI_B2_128_SB(src7, src8);
3605 
3606  /* row 7 */
3607  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3608  dst7 = const_vec;
3609  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
3610 
3611  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
3612  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3613  dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
3614 
3615  dst4_r >>= 6;
3616  dst4_l >>= 6;
3617 
3618  /* row 8 */
3619  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
3620  dst8 = const_vec;
3621  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
3622 
3623  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
3624  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3625  dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
3626  dst5_r >>= 6;
3627  dst5_l >>= 6;
3628 
3629  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3630  dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
3631  PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
3632  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3633  SRARI_H2_SH(out4_r, out5_r, 6);
3634  CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
3635  CLIP_SH2_0_255(out4_r, out5_r);
3636 
3637  PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3638  out2_r = (v8i16) __msa_pckev_b((v16i8) out5_r, (v16i8) out4_r);
3639 
3640  ST8x4_UB(out0_r, out1_r, dst, dst_stride);
3641  dst += (4 * dst_stride);
3642  ST8x2_UB(out2_r, dst, dst_stride);
3643 }
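
/* Annotation (not in the original source): this 8x6 variant is fully
 * unrolled; it streams the six filter rows in as three LD_SB2 pairs and
 * writes the result as a 4-row store followed by a 2-row store. */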
3644 
3645 static void hevc_hv_uni_4t_8w_mult_msa(uint8_t *src,
3646  int32_t src_stride,
3647  uint8_t *dst,
3648  int32_t dst_stride,
3649  const int8_t *filter_x,
3650  const int8_t *filter_y,
3651  int32_t height,
3652  int32_t width)
3653 {
3654  uint32_t loop_cnt, cnt;
3655  uint8_t *src_tmp;
3656  uint8_t *dst_tmp;
3657  v16i8 src0, src1, src2, src3, src4, src5, src6;
3658  v8i16 filt0, filt1;
3659  v4i32 filt_h0, filt_h1;
3660  v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3661  v16i8 mask1;
3662  v8i16 filter_vec, const_vec;
3663  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3664  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3665  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3666  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3667  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3668  v8i16 out0_r, out1_r, out2_r, out3_r;
3669 
3670  src -= (src_stride + 1);
3671 
3672  filter_vec = LD_SH(filter_x);
3673  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3674 
3675  filter_vec = LD_SH(filter_y);
3676  vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3677  filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3678 
3679  SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3680 
3681  mask1 = mask0 + 2;
3682 
3683  const_vec = __msa_ldi_h(128);
3684  const_vec <<= 6;
3685 
3686  for (cnt = width >> 3; cnt--;) {
3687  src_tmp = src;
3688  dst_tmp = dst;
3689 
3690  LD_SB3(src_tmp, src_stride, src0, src1, src2);
3691  src_tmp += (3 * src_stride);
3692 
3693  XORI_B3_128_SB(src0, src1, src2);
3694 
3695  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3696  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3697  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3698 
3699  dst0 = const_vec;
3700  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3701  dst1 = const_vec;
3702  DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3703  dst2 = const_vec;
3704  DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3705 
3706  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3707  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3708 
3709  for (loop_cnt = height >> 2; loop_cnt--;) {
3710  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3711  src_tmp += (4 * src_stride);
3712 
3713  XORI_B4_128_SB(src3, src4, src5, src6);
3714 
3715  /* row 3 */
3716  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3717  dst3 = const_vec;
3718  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3719 
3720  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3721  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3722  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3723 
3724  dst0_r >>= 6;
3725  dst0_l >>= 6;
3726 
3727  /* row 4 */
3728  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3729  dst4 = const_vec;
3730  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3731 
3732  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3733  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3734  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3735  dst1_r >>= 6;
3736  dst1_l >>= 6;
3737 
3738  /* row 5 */
3739  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3740  dst5 = const_vec;
3741  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3742 
3743  ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
3744  dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
3745  dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
3746 
3747  dst2_r >>= 6;
3748  dst2_l >>= 6;
3749 
3750  /* row 6 */
3751  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3752  dst2 = const_vec;
3753  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3754 
3755  ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
3756  dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
3757  dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
3758 
3759  dst3_r >>= 6;
3760  dst3_l >>= 6;
3761 
3762  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3763  dst2_l, dst2_r, dst3_l, dst3_r,
3764  out0_r, out1_r, out2_r, out3_r);
3765 
3766  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3767  CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
3768 
3769  PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3770  ST8x4_UB(out0_r, out1_r, dst_tmp, dst_stride);
3771  dst_tmp += (4 * dst_stride);
3772  }
3773 
3774  src += 8;
3775  dst += 8;
3776  }
3777 }
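
/* Annotation (not in the original source): the outer cnt loop tiles the
 * block into 8-pixel-wide columns (width >> 3) and advances src/dst by 8
 * bytes per column, which is how the 16-, 24- and 32-wide wrappers below
 * reuse this single kernel with different width arguments. */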
3778 
3779 static void hevc_hv_uni_4t_8w_msa(uint8_t *src,
3780  int32_t src_stride,
3781  uint8_t *dst,
3782  int32_t dst_stride,
3783  const int8_t *filter_x,
3784  const int8_t *filter_y,
3785  int32_t height)
3786 {
3787  if (2 == height) {
3788  hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
3789  filter_x, filter_y, height);
3790  } else if (6 == height) {
3791  hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
3792  filter_x, filter_y, height);
3793  } else if (0 == (height % 4)) {
3794  hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3795  filter_x, filter_y, height, 8);
3796  }
3797 }
3798 
3799 static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
3800  int32_t src_stride,
3801  uint8_t *dst,
3802  int32_t dst_stride,
3803  const int8_t *filter_x,
3804  const int8_t *filter_y,
3805  int32_t height)
3806 {
3807  hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3808  filter_x, filter_y, height, 8);
3809 
3810  hevc_hv_uni_4t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
3811  filter_x, filter_y, height);
3812 }
3813 
3814 static void hevc_hv_uni_4t_16w_msa(uint8_t *src,
3815  int32_t src_stride,
3816  uint8_t *dst,
3817  int32_t dst_stride,
3818  const int8_t *filter_x,
3819  const int8_t *filter_y,
3820  int32_t height)
3821 {
3822  hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3823  filter_x, filter_y, height, 16);
3824 }
3825 
3826 static void hevc_hv_uni_4t_24w_msa(uint8_t *src,
3827  int32_t src_stride,
3828  uint8_t *dst,
3829  int32_t dst_stride,
3830  const int8_t *filter_x,
3831  const int8_t *filter_y,
3832  int32_t height)
3833 {
3834  hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3835  filter_x, filter_y, height, 24);
3836 }
3837 
3838 static void hevc_hv_uni_4t_32w_msa(uint8_t *src,
3839  int32_t src_stride,
3840  uint8_t *dst,
3841  int32_t dst_stride,
3842  const int8_t *filter_x,
3843  const int8_t *filter_y,
3844  int32_t height)
3845 {
3846  hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3847  filter_x, filter_y, height, 32);
3848 }
3849 
3850 #define UNI_MC_COPY(WIDTH) \
3851 void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
3852  ptrdiff_t dst_stride, \
3853  uint8_t *src, \
3854  ptrdiff_t src_stride, \
3855  int height, \
3856  intptr_t mx, \
3857  intptr_t my, \
3858  int width) \
3859 { \
3860  copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height); \
3861 }
3862 
3863 UNI_MC_COPY(8);
3864 UNI_MC_COPY(12);
3865 UNI_MC_COPY(16);
3866 UNI_MC_COPY(24);
3867 UNI_MC_COPY(32);
3868 UNI_MC_COPY(48);
3869 UNI_MC_COPY(64);
3870 
3871 #undef UNI_MC_COPY
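
/* Annotation (not in the original source): for reference, UNI_MC_COPY(8)
 * above expands to (reformatted):
 *
 *   void ff_hevc_put_hevc_uni_pel_pixels8_8_msa(uint8_t *dst,
 *                                               ptrdiff_t dst_stride,
 *                                               uint8_t *src,
 *                                               ptrdiff_t src_stride,
 *                                               int height, intptr_t mx,
 *                                               intptr_t my, int width)
 *   {
 *       copy_width8_msa(src, src_stride, dst, dst_stride, height);
 *   }
 *
 * mx, my and width are part of the common function-pointer signature and
 * are simply ignored by the copy paths. */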
3872 
3873 #define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
3874 void ff_hevc_put_hevc_uni_##PEL##_##DIR####WIDTH##_8_msa(uint8_t *dst, \
3875  ptrdiff_t \
3876  dst_stride, \
3877  uint8_t *src, \
3878  ptrdiff_t \
3879  src_stride, \
3880  int height, \
3881  intptr_t mx, \
3882  intptr_t my, \
3883  int width) \
3884 { \
3885  const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
3886  \
3887  common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
3888  filter, height, 6); \
3889 }
3890 
3891 UNI_MC(qpel, h, 4, 8, hz, mx);
3892 UNI_MC(qpel, h, 8, 8, hz, mx);
3893 UNI_MC(qpel, h, 12, 8, hz, mx);
3894 UNI_MC(qpel, h, 16, 8, hz, mx);
3895 UNI_MC(qpel, h, 24, 8, hz, mx);
3896 UNI_MC(qpel, h, 32, 8, hz, mx);
3897 UNI_MC(qpel, h, 48, 8, hz, mx);
3898 UNI_MC(qpel, h, 64, 8, hz, mx);
3899 
3900 UNI_MC(qpel, v, 4, 8, vt, my);
3901 UNI_MC(qpel, v, 8, 8, vt, my);
3902 UNI_MC(qpel, v, 12, 8, vt, my);
3903 UNI_MC(qpel, v, 16, 8, vt, my);
3904 UNI_MC(qpel, v, 24, 8, vt, my);
3905 UNI_MC(qpel, v, 32, 8, vt, my);
3906 UNI_MC(qpel, v, 48, 8, vt, my);
3907 UNI_MC(qpel, v, 64, 8, vt, my);
3908 
3909 UNI_MC(epel, h, 4, 4, hz, mx);
3910 UNI_MC(epel, h, 6, 4, hz, mx);
3911 UNI_MC(epel, h, 8, 4, hz, mx);
3912 UNI_MC(epel, h, 12, 4, hz, mx);
3913 UNI_MC(epel, h, 16, 4, hz, mx);
3914 UNI_MC(epel, h, 24, 4, hz, mx);
3915 UNI_MC(epel, h, 32, 4, hz, mx);
3916 
3917 UNI_MC(epel, v, 4, 4, vt, my);
3918 UNI_MC(epel, v, 6, 4, vt, my);
3919 UNI_MC(epel, v, 8, 4, vt, my);
3920 UNI_MC(epel, v, 12, 4, vt, my);
3921 UNI_MC(epel, v, 16, 4, vt, my);
3922 UNI_MC(epel, v, 24, 4, vt, my);
3923 UNI_MC(epel, v, 32, 4, vt, my);
3924 
3925 #undef UNI_MC
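
/* Annotation (not in the original source): as an example, UNI_MC(qpel, h, 4,
 * 8, hz, mx) expands to ff_hevc_put_hevc_uni_qpel_h4_8_msa(), which selects
 * the horizontal filter as ff_hevc_qpel_filters[mx - 1] and calls
 * common_hz_8t_4w_msa(src, src_stride, dst, dst_stride, filter, height, 6);
 * the rounding value is fixed at 6 for every uni wrapper emitted here. */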
3926 
3927 #define UNI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \
3928 void ff_hevc_put_hevc_uni_##PEL##_##DIR####WIDTH##_8_msa(uint8_t *dst, \
3929  ptrdiff_t \
3930  dst_stride, \
3931  uint8_t *src, \
3932  ptrdiff_t \
3933  src_stride, \
3934  int height, \
3935  intptr_t mx, \
3936  intptr_t my, \
3937  int width) \
3938 { \
3939  const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
3940  const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
3941  \
3942  hevc_##DIR1##_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \
3943  dst_stride, filter_x, \
3944  filter_y, height); \
3945 }
3946 
3947 UNI_MC_HV(qpel, hv, 4, 8, hv);
3948 UNI_MC_HV(qpel, hv, 8, 8, hv);
3949 UNI_MC_HV(qpel, hv, 12, 8, hv);
3950 UNI_MC_HV(qpel, hv, 16, 8, hv);
3951 UNI_MC_HV(qpel, hv, 24, 8, hv);
3952 UNI_MC_HV(qpel, hv, 32, 8, hv);
3953 UNI_MC_HV(qpel, hv, 48, 8, hv);
3954 UNI_MC_HV(qpel, hv, 64, 8, hv);
3955 
3956 UNI_MC_HV(epel, hv, 4, 4, hv);
3957 UNI_MC_HV(epel, hv, 6, 4, hv);
3958 UNI_MC_HV(epel, hv, 8, 4, hv);
3959 UNI_MC_HV(epel, hv, 12, 4, hv);
3960 UNI_MC_HV(epel, hv, 16, 4, hv);
3961 UNI_MC_HV(epel, hv, 24, 4, hv);
3962 UNI_MC_HV(epel, hv, 32, 4, hv);
3963 
3964 #undef UNI_MC_HV
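
/* Annotation (not in the original source): the HV wrappers differ from
 * UNI_MC only in taking two filters, e.g. UNI_MC_HV(epel, hv, 4, 4, hv)
 * expands to ff_hevc_put_hevc_uni_epel_hv4_8_msa(), which looks up
 * ff_hevc_epel_filters[mx - 1] and ff_hevc_epel_filters[my - 1] and calls
 * hevc_hv_uni_4t_4w_msa() with both. */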