FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
vp9_mc_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavcodec/vp9dsp.h"
23 #include "vp9dsp_mips.h"
24 
/*
 * Byte-shuffle index table: three rows of 16 indices each.
 *
 * The kernels in this file load one row as a vector (offset 0 for the
 * 8-pixel-and-wider paths, offset 16 for the 4-pixel-wide paths) and derive
 * the masks for the other tap pairs by adding 2, 4 and 6 to every entry.
 * Each row lists overlapping neighbour pairs (0,1), (1,2), (2,3), ...
 * The row at offset 32 is not referenced in the visible part of the file.
 */
25 static const uint8_t mc_filt_mask_arr[16 * 3] = {
26  /* 8 width cases */
27  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
28  /* 4 width cases */
 /* NOTE(review): indices >= 16 presumably select bytes from the second
  * shuffle operand, so one vector covers two source rows - confirm against
  * the VSHF.B definition. */
29  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
30  /* 4 width cases */
31  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
32 };
33 
/*
 * Two-tap filter coefficient table with 15 rows.
 *
 * Row k (0-based) holds { 120 - 8 * k, 8 + 8 * k }, so every pair sums to 128.
 * The table is not referenced in the visible part of the file; presumably it
 * is indexed by (sub-pixel position - 1) in the bilinear kernels further
 * down - TODO confirm against the users.
 */
34 static const int8_t vp9_bilinear_filters_msa[15][2] = {
35  {120, 8},
36  {112, 16},
37  {104, 24},
38  {96, 32},
39  {88, 40},
40  {80, 48},
41  {72, 56},
42  {64, 64},
43  {56, 72},
44  {48, 80},
45  {40, 88},
46  {32, 96},
47  {24, 104},
48  {16, 112},
49  {8, 120}
50 };
51 
/*
 * Statement-expression macro that evaluates to a v8i16.
 *
 * Computes two partial sums and adds them:
 *   tmp0 = dotp_s_h(vec0, filt0), then dpadd_s_h(tmp0, vec1, filt1)
 *   tmp1 = dotp_s_h(vec2, filt2), then dpadd_s_h(tmp1, vec3, filt3)
 *   result = adds_s_h(tmp0, tmp1)
 * All vector arguments are cast to v16i8 before use. Per the MSA intrinsic
 * names these are signed byte dot products accumulated into halfwords and a
 * saturating signed add - confirm against the MSA reference.
 * No rounding or shifting is done here; callers apply it afterwards.
 */
52 #define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \
53  filt0, filt1, filt2, filt3) \
54 ( { \
55  v8i16 tmp0, tmp1; \
56  \
57  tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \
58  tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \
59  tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2); \
60  tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3); \
61  tmp0 = __msa_adds_s_h(tmp0, tmp1); \
62  \
63  tmp0; \
64 } )
65 
/*
 * Statement-expression macro that evaluates to a v8i16 of filtered values.
 *
 * Steps:
 *   1. VSHF_B4_SB shuffles (src0, src1) with mask0..mask3 into four vectors.
 *   2. FILT_8TAP_DPADD_S_H combines them with filt_h0..filt_h3.
 *   3. The sum goes through __msa_srari_h(.., 7) and __msa_sat_s_h(.., 7).
 * Unlike the *_4VECS_FILT macros, the shift and saturate steps are included
 * here, so callers must not repeat them.
 * NOTE(review): srari is presumably a rounding arithmetic right shift and
 * sat_s a signed saturate to 8 bits - confirm against the MSA reference.
 */
66 #define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, \
67  filt_h0, filt_h1, filt_h2, filt_h3) \
68 ( { \
69  v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
70  v8i16 hz_out_m; \
71  \
72  VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3, \
73  vec0_m, vec1_m, vec2_m, vec3_m); \
74  hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m, \
75  filt_h0, filt_h1, filt_h2, filt_h3); \
76  \
77  hz_out_m = __msa_srari_h(hz_out_m, 7); \
78  hz_out_m = __msa_sat_s_h(hz_out_m, 7); \
79  \
80  hz_out_m; \
81 } )
82 
/*
 * Block macro (not an expression) writing two v8i16 results to out0 and out1.
 *
 * The source vectors are shuffled in pairs: (src0, src1) feeds out0 and
 * (src2, src3) feeds out1. For each pair:
 *   res(0/1) = dotp(shuffle(mask0), filt0) + dpadd(shuffle(mask1), filt1)
 *   res(2/3) = dotp(shuffle(mask2), filt2) + dpadd(shuffle(mask3), filt3)
 * and ADDS_SH2_SH adds the two halves into out0/out1.
 * No shift or saturate is applied; the callers in this file follow up with
 * SRARI and SAT macros.
 */
83 #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
84  mask0, mask1, mask2, mask3, \
85  filt0, filt1, filt2, filt3, \
86  out0, out1) \
87 { \
88  v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
89  v8i16 res0_m, res1_m, res2_m, res3_m; \
90  \
91  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
92  DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \
93  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
94  DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \
95  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
96  DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \
97  VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
98  DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \
99  ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \
100 }
101 
/*
 * Block macro (not an expression) writing four v8i16 results to out0..out3,
 * one per source vector src0..src3.
 *
 * Each source vector is shuffled against itself (both shuffle operands are
 * the same register). Two accumulator sets are built:
 *   res0..3 = dotp(shuffle(mask0), filt0), then dpadd(shuffle(mask1), filt1)
 *   res4..7 = dotp(shuffle(mask2), filt2), then dpadd(shuffle(mask3), filt3)
 * Note the issue order is mask0, mask2, mask1, mask3: both DOTP steps come
 * first so the DPADD steps have initialised accumulators to add into.
 * ADDS_SH4_SH then adds res0+res4 .. res3+res7 into out0..out3.
 * No shift or saturate is applied; callers do that afterwards.
 */
102 #define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
103  mask0, mask1, mask2, mask3, \
104  filt0, filt1, filt2, filt3, \
105  out0, out1, out2, out3) \
106 { \
107  v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
108  v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
109  \
110  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
111  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
112  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
113  res0_m, res1_m, res2_m, res3_m); \
114  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
115  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
116  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
117  res4_m, res5_m, res6_m, res7_m); \
118  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
119  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
120  DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
121  res0_m, res1_m, res2_m, res3_m); \
122  VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
123  VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
124  DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
125  res4_m, res5_m, res6_m, res7_m); \
126  ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
127  res7_m, out0, out1, out2, out3); \
128 }
129 
/*
 * Block macro: packs in0/in1 to bytes via PCKEV_XORI128_UB, averages the
 * result with dst using __msa_aver_u_b, and stores the vector at pdst.
 *
 * Note that the arguments are handed to PCKEV_XORI128_UB in swapped order
 * (in1 first, then in0). The macro is not used in the visible part of the
 * file.
 * NOTE(review): aver_u_b is presumably a rounding unsigned byte average -
 * confirm against the MSA reference.
 */
130 #define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst) \
131 { \
132  v16u8 tmp_m; \
133  \
134  tmp_m = PCKEV_XORI128_UB(in1, in0); \
135  tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst); \
136  ST_UB(tmp_m, (pdst)); \
137 }
138 
/*
 * Block macro: packs in0/in1 with __msa_pckev_b (no XOR-128 step, unlike
 * PCKEV_XORI128_AVG_ST_UB), averages the result with dst using
 * __msa_aver_u_b, and stores the vector at pdst.
 *
 * The macro is not used in the visible part of the file.
 * NOTE(review): pckev_b presumably keeps the even-indexed bytes of both
 * operands - confirm against the MSA reference.
 */
139 #define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) \
140 { \
141  v16u8 tmp_m; \
142  \
143  tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1); \
144  tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst); \
145  ST_UB(tmp_m, (pdst)); \
146 }
147 
/*
 * Block macro: packs four filtered vectors (in1..in4) into two byte vectors,
 * packs the four destination vectors (dst0..dst3) into two vectors with
 * PCKEV_D2_UB, averages the two sets with AVER_UB2_UB and stores the result
 * through ST8x4_UB at pdst with the given stride.
 *
 * The parameter list interleaves inputs and destinations:
 * (in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, stride).
 * pdst is evaluated once, into the local pdst_m. The macro is not used in
 * the visible part of the file.
 */
148 #define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, \
149  pdst, stride) \
150 { \
151  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
152  uint8_t *pdst_m = (uint8_t *) (pdst); \
153  \
154  PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \
155  PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
156  AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
157  ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
158 }
159 
/*
 * Horizontal 8-tap filter for one 4x4 block (four source rows in, one
 * ST4x4_UB store out).
 *
 * src, src_stride: source pointer and row pitch handed to LD_SB4.
 * dst, dst_stride: destination pointer and row pitch handed to ST4x4_UB.
 * filter:          tap array; halfwords 0..3 (bytes 0..7) are used.
 * NOTE(review): LD_SH presumably reads a full 16-byte vector from filter even
 * though only 8 bytes are used - confirm the caller's buffer allows that.
 */
160 static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
161  uint8_t *dst, int32_t dst_stride,
162  const int8_t *filter)
163 {
164  v16u8 mask0, mask1, mask2, mask3, out;
165  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
166  v8i16 filt, out0, out1;
167 
 /* 4-wide shuffle row of the mask table */
168  mask0 = LD_UB(&mc_filt_mask_arr[16]);
 /* step back 3 bytes; presumably the 8-tap window starts 3 pixels left of
  * the output position */
169  src -= 3;
170 
171  /* rearranging filter */
172  filt = LD_SH(filter);
173  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
174 
 /* masks for tap pairs 1..3 are the base mask shifted by 2, 4 and 6 bytes */
175  mask1 = mask0 + 2;
176  mask2 = mask0 + 4;
177  mask3 = mask0 + 6;
178 
179  LD_SB4(src, src_stride, src0, src1, src2, src3);
 /* XOR with 128; presumably maps unsigned pixels to signed bytes for the
  * signed dot products, undone by PCKEV_XORI128_UB below */
180  XORI_B4_128_SB(src0, src1, src2, src3);
181  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
182  mask3, filt0, filt1, filt2, filt3, out0, out1);
 /* shift by 7 then saturate; presumably a rounding shift matching taps
  * that sum to 128 - TODO confirm */
183  SRARI_H2_SH(out0, out1, 7);
184  SAT_SH2_SH(out0, out1, 7);
185  out = PCKEV_XORI128_UB(out0, out1);
186  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
187 }
188 
/*
 * Horizontal 8-tap filter for one 4x8 block, handled as two groups of four
 * source rows and two ST4x4_UB stores.
 *
 * src, src_stride: source pointer and row pitch handed to LD_SB4.
 * dst, dst_stride: destination pointer and row pitch handed to ST4x4_UB.
 * filter:          tap array; halfwords 0..3 (bytes 0..7) are used.
 * NOTE(review): LD_SH presumably reads a full 16-byte vector from filter -
 * confirm the caller's buffer allows that.
 */
189 static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
190  uint8_t *dst, int32_t dst_stride,
191  const int8_t *filter)
192 {
193  v16i8 filt0, filt1, filt2, filt3;
194  v16i8 src0, src1, src2, src3;
195  v16u8 mask0, mask1, mask2, mask3, out;
196  v8i16 filt, out0, out1, out2, out3;
197 
 /* 4-wide shuffle row of the mask table */
198  mask0 = LD_UB(&mc_filt_mask_arr[16]);
 /* step back 3 bytes to the presumed start of the 8-tap window */
199  src -= 3;
200 
201  /* rearranging filter */
202  filt = LD_SH(filter);
203  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
204 
205  mask1 = mask0 + 2;
206  mask2 = mask0 + 4;
207  mask3 = mask0 + 6;
208 
 /* first four rows -> out0, out1 */
209  LD_SB4(src, src_stride, src0, src1, src2, src3);
210  XORI_B4_128_SB(src0, src1, src2, src3);
211  src += (4 * src_stride);
212  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
213  mask3, filt0, filt1, filt2, filt3, out0, out1);
 /* next four rows -> out2, out3 (src0..src3 are reused) */
214  LD_SB4(src, src_stride, src0, src1, src2, src3);
215  XORI_B4_128_SB(src0, src1, src2, src3);
216  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
217  mask3, filt0, filt1, filt2, filt3, out2, out3);
 /* shift by 7 and saturate all four accumulators together */
218  SRARI_H4_SH(out0, out1, out2, out3, 7);
219  SAT_SH4_SH(out0, out1, out2, out3, 7);
220  out = PCKEV_XORI128_UB(out0, out1);
221  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
222  dst += (4 * dst_stride);
223  out = PCKEV_XORI128_UB(out2, out3);
224  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
225 }
226 
/*
 * Horizontal 8-tap filter for one 4x16 block. The body is the 8-row sequence
 * (two 4-row loads, filter, two ST4x4_UB stores) written out twice.
 *
 * src, src_stride: source pointer and row pitch handed to LD_SB4.
 * dst, dst_stride: destination pointer and row pitch handed to ST4x4_UB.
 * filter:          tap array; halfwords 0..3 (bytes 0..7) are used.
 * NOTE(review): LD_SH presumably reads a full 16-byte vector from filter -
 * confirm the caller's buffer allows that.
 */
227 static void common_hz_8t_4x16_msa(const uint8_t *src, int32_t src_stride,
228  uint8_t *dst, int32_t dst_stride,
229  const int8_t *filter)
230 {
231  v16u8 mask0, mask1, mask2, mask3, out;
232  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
233  v8i16 filt, out0, out1, out2, out3;
234 
 /* 4-wide shuffle row of the mask table */
235  mask0 = LD_UB(&mc_filt_mask_arr[16]);
 /* step back 3 bytes to the presumed start of the 8-tap window */
236  src -= 3;
237 
238  /* rearranging filter */
239  filt = LD_SH(filter);
240  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
241 
242  mask1 = mask0 + 2;
243  mask2 = mask0 + 4;
244  mask3 = mask0 + 6;
245 
 /* rows 0..7: two groups of four rows */
246  LD_SB4(src, src_stride, src0, src1, src2, src3);
247  XORI_B4_128_SB(src0, src1, src2, src3);
248  src += (4 * src_stride);
249  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
250  mask3, filt0, filt1, filt2, filt3, out0, out1);
251  LD_SB4(src, src_stride, src0, src1, src2, src3);
252  XORI_B4_128_SB(src0, src1, src2, src3);
253  src += (4 * src_stride);
254  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
255  mask3, filt0, filt1, filt2, filt3, out2, out3);
256  SRARI_H4_SH(out0, out1, out2, out3, 7);
257  SAT_SH4_SH(out0, out1, out2, out3, 7);
258  out = PCKEV_XORI128_UB(out0, out1);
259  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
260  dst += (4 * dst_stride);
261  out = PCKEV_XORI128_UB(out2, out3);
262  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
263  dst += (4 * dst_stride);
264 
 /* rows 8..15: same sequence again */
265  LD_SB4(src, src_stride, src0, src1, src2, src3);
266  XORI_B4_128_SB(src0, src1, src2, src3);
267  src += (4 * src_stride);
268  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
269  mask3, filt0, filt1, filt2, filt3, out0, out1);
270  LD_SB4(src, src_stride, src0, src1, src2, src3);
271  XORI_B4_128_SB(src0, src1, src2, src3);
 /* this final advance of src is never read afterwards */
272  src += (4 * src_stride);
273  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
274  mask3, filt0, filt1, filt2, filt3, out2, out3);
275 
276  SRARI_H4_SH(out0, out1, out2, out3, 7);
277  SAT_SH4_SH(out0, out1, out2, out3, 7);
278  out = PCKEV_XORI128_UB(out0, out1);
279  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
280  dst += (4 * dst_stride);
281  out = PCKEV_XORI128_UB(out2, out3);
282  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
283 }
284 
/*
 * Dispatches a 4-pixel-wide horizontal 8-tap filter to the fixed-height
 * kernel for height 4, 8 or 16.
 *
 * All other arguments are forwarded unchanged.
 * Any other height falls through every branch and writes nothing to dst.
 */
285 static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
286  uint8_t *dst, int32_t dst_stride,
287  const int8_t *filter, int32_t height)
288 {
289  if (4 == height) {
290  common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
291  } else if (8 == height) {
292  common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
293  } else if (16 == height) {
294  common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
295  }
296 }
297 
/*
 * Horizontal 8-tap filter for one 8x4 block (four source rows in, one
 * ST8x4_UB store out).
 *
 * src, src_stride: source pointer and row pitch handed to LD_SB4.
 * dst, dst_stride: destination pointer and row pitch handed to ST8x4_UB.
 * filter:          tap array; halfwords 0..3 (bytes 0..7) are used.
 * NOTE(review): LD_SH presumably reads a full 16-byte vector from filter -
 * confirm the caller's buffer allows that.
 */
298 static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride,
299  uint8_t *dst, int32_t dst_stride,
300  const int8_t *filter)
301 {
302  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
303  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
304  v8i16 filt, out0, out1, out2, out3;
305 
 /* 8-wide shuffle row of the mask table */
306  mask0 = LD_UB(&mc_filt_mask_arr[0]);
 /* step back 3 bytes to the presumed start of the 8-tap window */
307  src -= 3;
308 
309  /* rearranging filter */
310  filt = LD_SH(filter);
311  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
312 
313  mask1 = mask0 + 2;
314  mask2 = mask0 + 4;
315  mask3 = mask0 + 6;
316 
 /* one vector per row; each yields one output accumulator */
317  LD_SB4(src, src_stride, src0, src1, src2, src3);
318  XORI_B4_128_SB(src0, src1, src2, src3);
319  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
320  mask3, filt0, filt1, filt2, filt3, out0, out1,
321  out2, out3);
322  SRARI_H4_SH(out0, out1, out2, out3, 7);
323  SAT_SH4_SH(out0, out1, out2, out3, 7);
324  tmp0 = PCKEV_XORI128_UB(out0, out1);
325  tmp1 = PCKEV_XORI128_UB(out2, out3);
326  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
327 }
328 
/*
 * Horizontal 8-tap filter for an 8-pixel-wide block, processed four rows per
 * loop iteration.
 *
 * src, src_stride: source pointer and row pitch handed to LD_SB4.
 * dst, dst_stride: destination pointer and row pitch handed to ST8x4_UB.
 * filter:          tap array; halfwords 0..3 (bytes 0..7) are used.
 * height:          row count; the loop runs height >> 2 times, so any
 *                  remainder modulo 4 is not processed.
 * NOTE(review): LD_SH presumably reads a full 16-byte vector from filter -
 * confirm the caller's buffer allows that.
 */
329 static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
330  uint8_t *dst, int32_t dst_stride,
331  const int8_t *filter, int32_t height)
332 {
333  uint32_t loop_cnt;
334  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
335  v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
336  v8i16 filt, out0, out1, out2, out3;
337 
 /* 8-wide shuffle row of the mask table */
338  mask0 = LD_UB(&mc_filt_mask_arr[0]);
 /* step back 3 bytes to the presumed start of the 8-tap window */
339  src -= 3;
340 
341  /* rearranging filter */
342  filt = LD_SH(filter);
343  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
344 
345  mask1 = mask0 + 2;
346  mask2 = mask0 + 4;
347  mask3 = mask0 + 6;
348 
 /* post-decrement test: the body runs exactly (height >> 2) times */
349  for (loop_cnt = (height >> 2); loop_cnt--;) {
350  LD_SB4(src, src_stride, src0, src1, src2, src3);
351  XORI_B4_128_SB(src0, src1, src2, src3);
352  src += (4 * src_stride);
353  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
354  mask3, filt0, filt1, filt2, filt3, out0,
355  out1, out2, out3);
356  SRARI_H4_SH(out0, out1, out2, out3, 7);
357  SAT_SH4_SH(out0, out1, out2, out3, 7);
358  tmp0 = PCKEV_XORI128_UB(out0, out1);
359  tmp1 = PCKEV_XORI128_UB(out2, out3);
360  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
361  dst += (4 * dst_stride);
362  }
363 }
364 
/*
 * Dispatches an 8-pixel-wide horizontal 8-tap filter: height 4 goes to the
 * single-shot 8x4 kernel, every other height goes to the looping kernel,
 * which handles height >> 2 groups of four rows.
 *
 * All other arguments are forwarded unchanged.
 */
365 static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
366  uint8_t *dst, int32_t dst_stride,
367  const int8_t *filter, int32_t height)
368 {
369  if (4 == height) {
370  common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
371  } else {
372  common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
373  height);
374  }
375 }
376 
/*
 * Horizontal 8-tap filter for a 16-pixel-wide block, two rows per loop
 * iteration.
 *
 * src, src_stride: source pointer and row pitch.
 * dst, dst_stride: destination pointer and row pitch; one 16-byte vector is
 *                  stored per row.
 * filter:          tap array; halfwords 0..3 (bytes 0..7) are used.
 * height:          row count; the loop runs height >> 1 times, so an odd
 *                  final row is not processed.
 * NOTE(review): LD_SH presumably reads a full 16-byte vector from filter -
 * confirm the caller's buffer allows that.
 */
377 static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
378  uint8_t *dst, int32_t dst_stride,
379  const int8_t *filter, int32_t height)
380 {
381  uint32_t loop_cnt;
382  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
383  v16u8 mask0, mask1, mask2, mask3, out;
384  v8i16 filt, out0, out1, out2, out3;
385 
 /* 8-wide shuffle row of the mask table */
386  mask0 = LD_UB(&mc_filt_mask_arr[0]);
 /* step back 3 bytes to the presumed start of the 8-tap window */
387  src -= 3;
388 
389  /* rearranging filter */
390  filt = LD_SH(filter);
391  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
392 
393  mask1 = mask0 + 2;
394  mask2 = mask0 + 4;
395  mask3 = mask0 + 6;
396 
397  for (loop_cnt = (height >> 1); loop_cnt--;) {
 /* src0/src1: first row at offsets 0 and 8; src2/src3: second row */
398  LD_SB2(src, src_stride, src0, src2);
399  LD_SB2(src + 8, src_stride, src1, src3);
400  XORI_B4_128_SB(src0, src1, src2, src3);
401  src += (2 * src_stride);
402  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
403  mask3, filt0, filt1, filt2, filt3, out0,
404  out1, out2, out3);
405  SRARI_H4_SH(out0, out1, out2, out3, 7);
406  SAT_SH4_SH(out0, out1, out2, out3, 7);
 /* out0/out1 form the first output row, out2/out3 the second */
407  out = PCKEV_XORI128_UB(out0, out1);
408  ST_UB(out, dst);
409  dst += dst_stride;
410  out = PCKEV_XORI128_UB(out2, out3);
411  ST_UB(out, dst);
412  dst += dst_stride;
413  }
414 }
415 
/*
 * Horizontal 8-tap filter for a 32-pixel-wide block, two rows per loop
 * iteration with one row handled per half of the loop body.
 *
 * src, src_stride: source pointer and row pitch.
 * dst, dst_stride: destination pointer and row pitch; two 16-byte vectors
 *                  (dst and dst + 16) are stored per row.
 * filter:          tap array; halfwords 0..3 (bytes 0..7) are used.
 * height:          row count; the loop runs height >> 1 times, so an odd
 *                  final row is not processed.
 * NOTE(review): LD_SH presumably reads a full 16-byte vector from filter -
 * confirm the caller's buffer allows that.
 */
416 static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
417  uint8_t *dst, int32_t dst_stride,
418  const int8_t *filter, int32_t height)
419 {
420  uint32_t loop_cnt;
421  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
422  v16u8 mask0, mask1, mask2, mask3, out;
423  v8i16 filt, out0, out1, out2, out3;
424 
 /* 8-wide shuffle row of the mask table */
425  mask0 = LD_UB(&mc_filt_mask_arr[0]);
 /* step back 3 bytes to the presumed start of the 8-tap window */
426  src -= 3;
427 
428  /* rearranging filter */
429  filt = LD_SH(filter);
430  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
431 
432  mask1 = mask0 + 2;
433  mask2 = mask0 + 4;
434  mask3 = mask0 + 6;
435 
436  for (loop_cnt = (height >> 1); loop_cnt--;) {
 /* first row: loads at offsets 0, 16 and 24; src1 is built from src0 and
  * src2 with sldi_b (presumably the bytes at offset 8) instead of a
  * fourth load */
437  src0 = LD_SB(src);
438  src2 = LD_SB(src + 16);
439  src3 = LD_SB(src + 24);
440  src1 = __msa_sldi_b(src2, src0, 8);
441  src += src_stride;
442  XORI_B4_128_SB(src0, src1, src2, src3);
443  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
444  mask3, filt0, filt1, filt2, filt3, out0,
445  out1, out2, out3);
446  SRARI_H4_SH(out0, out1, out2, out3, 7);
447  SAT_SH4_SH(out0, out1, out2, out3, 7);
448 
 /* second row is loaded before the first row's results are stored */
449  src0 = LD_SB(src);
450  src2 = LD_SB(src + 16);
451  src3 = LD_SB(src + 24);
452  src1 = __msa_sldi_b(src2, src0, 8);
453  src += src_stride;
454 
455  out = PCKEV_XORI128_UB(out0, out1);
456  ST_UB(out, dst);
457  out = PCKEV_XORI128_UB(out2, out3);
458  ST_UB(out, dst + 16);
459  dst += dst_stride;
460 
461  XORI_B4_128_SB(src0, src1, src2, src3);
462  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
463  mask3, filt0, filt1, filt2, filt3, out0,
464  out1, out2, out3);
465  SRARI_H4_SH(out0, out1, out2, out3, 7);
466  SAT_SH4_SH(out0, out1, out2, out3, 7);
467  out = PCKEV_XORI128_UB(out0, out1);
468  ST_UB(out, dst);
469  out = PCKEV_XORI128_UB(out2, out3);
470  ST_UB(out, dst + 16);
471  dst += dst_stride;
472  }
473 }
474 
/*
 * Horizontal 8-tap filter for a 64-pixel-wide block, one row per loop
 * iteration, handled as two 32-pixel halves.
 *
 * src, src_stride: source pointer and row pitch.
 * dst, dst_stride: destination pointer and row pitch; four 16-byte vectors
 *                  (dst, dst + 16, dst + 32, dst + 48) are stored per row.
 * filter:          tap array; halfwords 0..3 (bytes 0..7) are used.
 * height:          row count; the loop runs exactly height times.
 * NOTE(review): LD_SH presumably reads a full 16-byte vector from filter -
 * confirm the caller's buffer allows that.
 */
475 static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
476  uint8_t *dst, int32_t dst_stride,
477  const int8_t *filter, int32_t height)
478 {
 /* NOTE(review): signed counter here, while the other horizontal kernels
  * in this file declare loop_cnt as uint32_t; behaviour is the same for
  * non-negative heights */
479  int32_t loop_cnt;
480  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
481  v16u8 mask0, mask1, mask2, mask3, out;
482  v8i16 filt, out0, out1, out2, out3;
483 
 /* 8-wide shuffle row of the mask table */
484  mask0 = LD_UB(&mc_filt_mask_arr[0]);
 /* step back 3 bytes to the presumed start of the 8-tap window */
485  src -= 3;
486 
487  /* rearranging filter */
488  filt = LD_SH(filter);
489  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
490 
491  mask1 = mask0 + 2;
492  mask2 = mask0 + 4;
493  mask3 = mask0 + 6;
494 
495  for (loop_cnt = height; loop_cnt--;) {
 /* left half: loads at offsets 0, 16, 24; src1 built with sldi_b */
496  src0 = LD_SB(src);
497  src2 = LD_SB(src + 16);
498  src3 = LD_SB(src + 24);
499  src1 = __msa_sldi_b(src2, src0, 8);
500 
501  XORI_B4_128_SB(src0, src1, src2, src3);
502  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
503  mask2, mask3, filt0, filt1, filt2, filt3,
504  out0, out1, out2, out3);
505  SRARI_H4_SH(out0, out1, out2, out3, 7);
506  SAT_SH4_SH(out0, out1, out2, out3, 7);
507  out = PCKEV_XORI128_UB(out0, out1);
508  ST_UB(out, dst);
509  out = PCKEV_XORI128_UB(out2, out3);
510  ST_UB(out, dst + 16);
511 
 /* right half: same pattern at offsets 32, 48, 56 */
512  src0 = LD_SB(src + 32);
513  src2 = LD_SB(src + 48);
514  src3 = LD_SB(src + 56);
515  src1 = __msa_sldi_b(src2, src0, 8);
516  src += src_stride;
517 
518  XORI_B4_128_SB(src0, src1, src2, src3);
519  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
520  mask2, mask3, filt0, filt1, filt2, filt3,
521  out0, out1, out2, out3);
522  SRARI_H4_SH(out0, out1, out2, out3, 7);
523  SAT_SH4_SH(out0, out1, out2, out3, 7);
524  out = PCKEV_XORI128_UB(out0, out1);
525  ST_UB(out, dst + 32);
526  out = PCKEV_XORI128_UB(out2, out3);
527  ST_UB(out, dst + 48);
528  dst += dst_stride;
529  }
530 }
531 
/*
 * Vertical 8-tap filter for a 4-pixel-wide block, four output rows per loop
 * iteration.
 *
 * src, src_stride: source pointer and row pitch; src is moved back three
 *                  rows before the first load.
 * dst, dst_stride: destination pointer and row pitch handed to ST4x4_UB.
 * filter:          tap array; halfwords 0..3 (bytes 0..7) are used.
 * height:          row count; the loop runs height >> 2 times, so any
 *                  remainder modulo 4 is not processed.
 *
 * Naming: srcAB_r is the byte interleave of rows A and B; srcABCD joins two
 * such interleaves (rows A,B and C,D) into one vector with ILVR_D.
 * NOTE(review): LD_SH presumably reads a full 16-byte vector from filter -
 * confirm the caller's buffer allows that.
 */
532 static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
533  uint8_t *dst, int32_t dst_stride,
534  const int8_t *filter, int32_t height)
535 {
536  uint32_t loop_cnt;
537  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
538  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
539  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
540  v16i8 src10998, filt0, filt1, filt2, filt3;
541  v16u8 out;
542  v8i16 filt, out10, out32;
543 
 /* start three rows above the first output row */
544  src -= (3 * src_stride);
545 
546  filt = LD_SH(filter);
547  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
548 
 /* prime the window with the first seven rows */
549  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
550  src += (7 * src_stride);
551 
552  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
553  src54_r, src21_r);
554  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
555  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
556  src4332, src6554);
 /* the XOR-128 is applied to the combined vectors, not to the raw rows */
557  XORI_B3_128_SB(src2110, src4332, src6554);
558 
559  for (loop_cnt = (height >> 2); loop_cnt--;) {
560  LD_SB4(src, src_stride, src7, src8, src9, src10);
561  src += (4 * src_stride);
562 
 /* src6 is the last row kept from the previous iteration (or prologue) */
563  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
564  src87_r, src98_r, src109_r);
565  ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
566  XORI_B2_128_SB(src8776, src10998);
 /* out10 and out32 presumably hold output rows 0-1 and 2-3 */
567  out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
568  filt1, filt2, filt3);
569  out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
570  filt1, filt2, filt3);
571  SRARI_H2_SH(out10, out32, 7);
572  SAT_SH2_SH(out10, out32, 7);
573  out = PCKEV_XORI128_UB(out10, out32);
574  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
575  dst += (4 * dst_stride);
576 
 /* slide the window down by four rows for the next iteration */
577  src2110 = src6554;
578  src4332 = src8776;
579  src6554 = src10998;
580  src6 = src10;
581  }
582 }
583 
/*
 * Vertical 8-tap filter for an 8-pixel-wide block, four output rows per loop
 * iteration.
 *
 * src, src_stride: source pointer and row pitch; src is moved back three
 *                  rows before the first load.
 * dst, dst_stride: destination pointer and row pitch handed to ST8x4_UB.
 * filter:          tap array; halfwords 0..3 (bytes 0..7) are used.
 * height:          row count; the loop runs height >> 2 times, so any
 *                  remainder modulo 4 is not processed.
 *
 * Naming: srcAB_r is the ILVR_B byte interleave of rows A and B.
 * NOTE(review): LD_SH presumably reads a full 16-byte vector from filter -
 * confirm the caller's buffer allows that.
 */
584 static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
585  uint8_t *dst, int32_t dst_stride,
586  const int8_t *filter, int32_t height)
587 {
588  uint32_t loop_cnt;
589  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
590  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
591  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
592  v16u8 tmp0, tmp1;
593  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
594 
 /* start three rows above the first output row */
595  src -= (3 * src_stride);
596 
597  filt = LD_SH(filter);
598  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
599 
 /* prime the window with the first seven rows; here the XOR-128 is applied
  * to the raw rows before interleaving */
600  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
601  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
602  src += (7 * src_stride);
603  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
604  src54_r, src21_r);
605  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
606 
607  for (loop_cnt = (height >> 2); loop_cnt--;) {
608  LD_SB4(src, src_stride, src7, src8, src9, src10);
609  XORI_B4_128_SB(src7, src8, src9, src10);
610  src += (4 * src_stride);
611 
 /* src6 is the last row kept from the previous iteration (or prologue) */
612  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
613  src87_r, src98_r, src109_r);
 /* one accumulator per output row; successive rows alternate between the
  * even-start (10/32/54/76/98) and odd-start (21/43/65/87/109) pairs */
614  out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
615  filt1, filt2, filt3);
616  out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
617  filt1, filt2, filt3);
618  out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
619  filt1, filt2, filt3);
620  out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
621  filt1, filt2, filt3);
622  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
623  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
624  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
625  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
626  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
627  dst += (4 * dst_stride);
628 
 /* slide the window down by four rows for the next iteration */
629  src10_r = src54_r;
630  src32_r = src76_r;
631  src54_r = src98_r;
632  src21_r = src65_r;
633  src43_r = src87_r;
634  src65_r = src109_r;
635  src6 = src10;
636  }
637 }
638 
/*
 * Vertical 8-tap filter for a 16-pixel-wide block, four output rows per loop
 * iteration.
 *
 * src, src_stride: source pointer and row pitch; src is moved back three
 *                  rows before the first load.
 * dst, dst_stride: destination pointer and row pitch handed to ST_UB4.
 * filter:          tap array; halfwords 0..3 (bytes 0..7) are used.
 * height:          row count; the loop runs height >> 2 times, so any
 *                  remainder modulo 4 is not processed.
 *
 * Naming: srcAB_r / srcAB_l are the ILVR_B / ILVL_B byte interleaves of rows
 * A and B (presumably the right and left halves of the 16-byte rows).
 * NOTE(review): LD_SH presumably reads a full 16-byte vector from filter -
 * confirm the caller's buffer allows that.
 */
639 static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
640  uint8_t *dst, int32_t dst_stride,
641  const int8_t *filter, int32_t height)
642 {
643  uint32_t loop_cnt;
644  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
645  v16i8 filt0, filt1, filt2, filt3;
646  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
647  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
648  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
649  v16u8 tmp0, tmp1, tmp2, tmp3;
650  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
651 
 /* start three rows above the first output row */
652  src -= (3 * src_stride);
653 
654  filt = LD_SH(filter);
655  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
656 
 /* prime the window with the first seven rows */
657  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
658  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
659  src += (7 * src_stride);
660  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
661  src54_r, src21_r);
662  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
663  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
664  src54_l, src21_l);
665  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
666 
667  for (loop_cnt = (height >> 2); loop_cnt--;) {
668  LD_SB4(src, src_stride, src7, src8, src9, src10);
669  XORI_B4_128_SB(src7, src8, src9, src10);
670  src += (4 * src_stride);
671 
 /* src6 is the last row kept from the previous iteration (or prologue) */
672  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
673  src87_r, src98_r, src109_r);
674  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
675  src87_l, src98_l, src109_l);
 /* four output rows, each as an _r and an _l accumulator */
676  out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
677  filt1, filt2, filt3);
678  out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
679  filt1, filt2, filt3);
680  out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
681  filt1, filt2, filt3);
682  out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
683  filt1, filt2, filt3);
684  out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
685  filt1, filt2, filt3);
686  out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
687  filt1, filt2, filt3);
688  out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
689  filt1, filt2, filt3);
690  out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
691  filt1, filt2, filt3);
692  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
693  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
694  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
695  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
 /* pack each row's _l/_r halves into one vector, then undo the XOR-128 */
696  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
697  out3_r, tmp0, tmp1, tmp2, tmp3);
698  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
699  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
700  dst += (4 * dst_stride);
701 
 /* slide the window down by four rows for the next iteration */
702  src10_r = src54_r;
703  src32_r = src76_r;
704  src54_r = src98_r;
705  src21_r = src65_r;
706  src43_r = src87_r;
707  src65_r = src109_r;
708  src10_l = src54_l;
709  src32_l = src76_l;
710  src54_l = src98_l;
711  src21_l = src65_l;
712  src43_l = src87_l;
713  src65_l = src109_l;
714  src6 = src10;
715  }
716 }
717 
/*
 * Vertical 8-tap filter for a block whose width is a multiple of 16. The
 * block is processed as width >> 4 column strips of 16 pixels; inside each
 * strip, four output rows are produced per inner-loop iteration.
 *
 * src, src_stride: source pointer and row pitch; src is moved back three
 *                  rows before the first load.
 * dst, dst_stride: destination pointer and row pitch handed to ST_UB4.
 * filter:          tap array; halfwords 0..3 (bytes 0..7) are used.
 * height:          row count; the inner loop runs height >> 2 times, so any
 *                  remainder modulo 4 is not processed.
 * width:           column count; any remainder modulo 16 is not processed.
 *
 * Naming: srcAB_r / srcAB_l are the ILVR_B / ILVL_B byte interleaves of rows
 * A and B.
 * NOTE(review): LD_SH presumably reads a full 16-byte vector from filter -
 * confirm the caller's buffer allows that.
 */
718 static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
719  uint8_t *dst, int32_t dst_stride,
720  const int8_t *filter, int32_t height,
721  int32_t width)
722 {
723  const uint8_t *src_tmp;
724  uint8_t *dst_tmp;
725  uint32_t loop_cnt, cnt;
726  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
727  v16i8 filt0, filt1, filt2, filt3;
728  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
729  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
730  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
731  v16u8 tmp0, tmp1, tmp2, tmp3;
732  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
733 
 /* start three rows above the first output row */
734  src -= (3 * src_stride);
735 
736  filt = LD_SH(filter);
737  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
738 
 /* outer loop: one 16-pixel-wide column strip per iteration */
739  for (cnt = (width >> 4); cnt--;) {
 /* per-strip cursors; src and dst themselves only move sideways */
740  src_tmp = src;
741  dst_tmp = dst;
742 
 /* prime the window with the first seven rows of this strip */
743  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
744  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
745  src_tmp += (7 * src_stride);
746  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
747  src32_r, src54_r, src21_r);
748  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
749  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
750  src32_l, src54_l, src21_l);
751  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
752 
753  for (loop_cnt = (height >> 2); loop_cnt--;) {
754  LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
755  XORI_B4_128_SB(src7, src8, src9, src10);
756  src_tmp += (4 * src_stride);
757  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
758  src87_r, src98_r, src109_r);
759  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
760  src87_l, src98_l, src109_l);
 /* four output rows, each as an _r and an _l accumulator */
761  out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
762  filt0, filt1, filt2, filt3);
763  out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
764  filt0, filt1, filt2, filt3);
765  out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
766  filt0, filt1, filt2, filt3);
767  out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
768  filt0, filt1, filt2, filt3);
769  out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
770  filt0, filt1, filt2, filt3);
771  out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
772  filt0, filt1, filt2, filt3);
773  out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
774  filt0, filt1, filt2, filt3);
775  out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
776  filt0, filt1, filt2, filt3);
777  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
778  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
779  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
780  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
781  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
782  out3_r, tmp0, tmp1, tmp2, tmp3);
783  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
784  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
785  dst_tmp += (4 * dst_stride);
786 
 /* slide the window down by four rows for the next iteration */
787  src10_r = src54_r;
788  src32_r = src76_r;
789  src54_r = src98_r;
790  src21_r = src65_r;
791  src43_r = src87_r;
792  src65_r = src109_r;
793  src10_l = src54_l;
794  src32_l = src76_l;
795  src54_l = src98_l;
796  src21_l = src65_l;
797  src43_l = src87_l;
798  src65_l = src109_l;
799  src6 = src10;
800  }
801 
 /* move to the next 16-pixel column strip */
802  src += 16;
803  dst += 16;
804  }
805 }
806 
/*
 * Vertical 8-tap filter for a 32-pixel-wide block: forwards every argument to
 * common_vt_8t_16w_mult_msa with the width fixed at 32.
 */
807 static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
808  uint8_t *dst, int32_t dst_stride,
809  const int8_t *filter, int32_t height)
810 {
811  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
812  32);
813 }
814 
/*
 * Vertical 8-tap filter for a 64-pixel-wide block: forwards every argument to
 * common_vt_8t_16w_mult_msa with the width fixed at 64.
 */
815 static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
816  uint8_t *dst, int32_t dst_stride,
817  const int8_t *filter, int32_t height)
818 {
819  common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
820  64);
821 }
822 
/*
 * Combined horizontal + vertical 8-tap filter for a 4-pixel-wide block, four
 * output rows per loop iteration. Rows are filtered horizontally first
 * (HORIZ_8TAP_FILT, which already shifts and saturates), and the
 * intermediate rows are then filtered vertically.
 *
 * src, src_stride: source pointer and row pitch; src is moved back three
 *                  bytes and three rows before the first load.
 * dst, dst_stride: destination pointer and row pitch handed to ST4x4_UB.
 * filter_horiz:    horizontal taps; halfwords 0..3 are splatted as bytes.
 * filter_vert:     vertical taps; halfwords 0..3 are splatted as halfwords.
 * height:          row count; the loop runs height >> 2 times, so any
 *                  remainder modulo 4 is not processed.
 * NOTE(review): each LD_SH presumably reads a full 16-byte vector from its
 * filter pointer - confirm the callers' buffers allow that.
 */
823 static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride,
824  uint8_t *dst, int32_t dst_stride,
825  const int8_t *filter_horiz,
826  const int8_t *filter_vert,
827  int32_t height)
828 {
829  uint32_t loop_cnt;
830  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
831  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
832  v16u8 mask0, mask1, mask2, mask3, out;
833  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
834  v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
835  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
836 
 /* 4-wide shuffle row of the mask table */
837  mask0 = LD_UB(&mc_filt_mask_arr[16]);
 /* back up 3 columns and 3 rows to the start of both filter windows */
838  src -= (3 + 3 * src_stride);
839 
840  /* rearranging filter */
841  filt = LD_SH(filter_horiz);
842  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
843 
844  mask1 = mask0 + 2;
845  mask2 = mask0 + 4;
846  mask3 = mask0 + 6;
847 
 /* prime the vertical window with the first seven rows */
848  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
849  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
850  src += (7 * src_stride);
851 
 /* each call filters a pair of rows; src5 appears in two calls (with src4
  * and with src6) */
852  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
853  filt_hz1, filt_hz2, filt_hz3);
854  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
855  filt_hz1, filt_hz2, filt_hz3);
856  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
857  filt_hz1, filt_hz2, filt_hz3);
858  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
859  filt_hz1, filt_hz2, filt_hz3);
 /* hz_out1 / hz_out3 are built by sliding across neighbouring results
  * (presumably the row pairs offset by one row) */
860  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
861 
862  filt = LD_SH(filter_vert);
863  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
864 
 /* interleave the even bytes of adjacent intermediates for the vertical
  * dot products */
865  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
866  out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
867 
868  for (loop_cnt = (height >> 2); loop_cnt--;) {
869  LD_SB4(src, src_stride, src7, src8, src9, src10);
870  XORI_B4_128_SB(src7, src8, src9, src10);
871  src += (4 * src_stride);
872 
 /* first half of the four new rows */
873  hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
874  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
875  hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
876  out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
877  tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
878  filt_vt2, filt_vt3);
879 
 /* second half of the four new rows */
880  hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
881  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
882  hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
883  out4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
884  tmp1 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out4, filt_vt0, filt_vt1,
885  filt_vt2, filt_vt3);
 /* shift/saturate of the vertical pass */
886  SRARI_H2_SH(tmp0, tmp1, 7);
887  SAT_SH2_SH(tmp0, tmp1, 7);
888  out = PCKEV_XORI128_UB(tmp0, tmp1);
889  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
890  dst += (4 * dst_stride);
891 
 /* slide the vertical window down by four rows */
892  hz_out5 = hz_out9;
893  out0 = out2;
894  out1 = out3;
895  out2 = out4;
896  }
897 }
898 
/*
 * 8-pixel-wide 2-D filter: an 8-tap horizontal pass (HORIZ_8TAP_FILT)
 * followed by an 8-tap vertical pass (FILT_8TAP_DPADD_S_H); the result is
 * written to dst.
 *
 * src, src_stride   source pointer and row pitch; the pointer is moved back
 *                   by 3 columns and 3 rows before the first load
 * dst, dst_stride   destination pointer and row pitch
 * filter_horiz      horizontal coefficients, read with LD_SH
 * filter_vert       vertical coefficients, read with LD_SH
 * height            rows to produce; the loop runs (height >> 2) times and
 *                   advances dst by 4 rows per pass
 */
899 static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride,
900  uint8_t *dst, int32_t dst_stride,
901  const int8_t *filter_horiz,
902  const int8_t *filter_vert,
903  int32_t height)
904 {
905  uint32_t loop_cnt;
906  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
907  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
908  v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
909  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
910  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
911  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
912  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
913 
    /* mask taken from the "8 width cases" row of mc_filt_mask_arr */
914  mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* step back 3 columns and 3 rows so the taps surround the output pixel */
915  src -= (3 + 3 * src_stride);
916 
917  /* rearranging filter */
918  filt = LD_SH(filter_horiz);
919  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
920 
921  mask1 = mask0 + 2;
922  mask2 = mask0 + 4;
923  mask3 = mask0 + 6;
924 
    /* prime the vertical filter with the first 7 source rows */
925  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
926  src += (7 * src_stride);
927 
928  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    /* unlike the 4-wide variant, each row is filtered on its own (the same
     * vector is passed as both row operands) */
929  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
930  filt_hz1, filt_hz2, filt_hz3);
931  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
932  filt_hz1, filt_hz2, filt_hz3);
933  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
934  filt_hz1, filt_hz2, filt_hz3);
935  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
936  filt_hz1, filt_hz2, filt_hz3);
937  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
938  filt_hz1, filt_hz2, filt_hz3);
939  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
940  filt_hz1, filt_hz2, filt_hz3);
941  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
942  filt_hz1, filt_hz2, filt_hz3);
943 
944  filt = LD_SH(filter_vert);
945  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
946 
    /* out0/out1/out2 pair rows (0,1) (2,3) (4,5); out4/out5/out6 pair rows
     * (1,2) (3,4) (5,6) */
947  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
948  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
949  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
950 
951  for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* four new source rows per pass */
952  LD_SB4(src, src_stride, src7, src8, src9, src10);
953  src += (4 * src_stride);
954 
955  XORI_B4_128_SB(src7, src8, src9, src10);
956 
957  hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
958  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
959  out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
960  tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
961  filt_vt2, filt_vt3);
962 
963  hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
964  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
965  out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
966  tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
967  filt_vt2, filt_vt3);
968 
969  hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
970  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
971  out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
972  tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0,
973  filt_vt1, filt_vt2, filt_vt3);
974 
975  hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
976  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
977  out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
978  tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
979  filt_vt2, filt_vt3);
980  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
981  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
982  vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
983  vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
984  ST8x4_UB(vec0, vec1, dst, dst_stride);
985  dst += (4 * dst_stride);
986 
    /* slide both row-pair histories down by four rows for the next pass */
987  hz_out6 = hz_out10;
988  out0 = out2;
989  out1 = out3;
990  out2 = out8;
991  out4 = out6;
992  out5 = out7;
993  out6 = out9;
994  }
995 }
996 
/*
 * 16-pixel-wide 8-tap horizontal + 8-tap vertical filter.
 *
 * The block is processed as two side-by-side 8-column strips, each handed
 * to common_hv_8ht_8vt_8w_msa() with the same strides, filters and height.
 */
static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col;

    /* strips start at byte offsets 0 and 8 */
    for (col = 0; col < 16; col += 8) {
        common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                                 filter_horiz, filter_vert, height);
    }
}
1013 
/*
 * 32-pixel-wide 8-tap horizontal + 8-tap vertical filter.
 *
 * The block is processed as four side-by-side 8-column strips, each handed
 * to common_hv_8ht_8vt_8w_msa() with the same strides, filters and height.
 */
static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col;

    /* strips start at byte offsets 0, 8, 16 and 24 */
    for (col = 0; col < 32; col += 8) {
        common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                                 filter_horiz, filter_vert, height);
    }
}
1030 
/*
 * 64-pixel-wide 8-tap horizontal + 8-tap vertical filter.
 *
 * The block is processed as eight side-by-side 8-column strips, each handed
 * to common_hv_8ht_8vt_8w_msa() with the same strides, filters and height.
 */
static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
                                      int32_t height)
{
    int32_t col;

    /* strips start at byte offsets 0, 8, ..., 56 */
    for (col = 0; col < 64; col += 8) {
        common_hv_8ht_8vt_8w_msa(src + col, src_stride, dst + col, dst_stride,
                                 filter_horiz, filter_vert, height);
    }
}
1047 
/*
 * NOTE(review): the line carrying this function's name and first parameter
 * is absent from the listing (numbering jumps from 1047 to 1049). The
 * height dispatcher further down calls common_hz_8t_and_aver_dst_4x4_msa()
 * with (src, src_stride, dst, dst_stride, filter), which matches this
 * parameter list -- confirm against the original file.
 *
 * Horizontal 8-tap filter for one 4x4 block. Four source rows and four dst
 * rows are loaded; the filtered result is combined with the dst rows through
 * AVER_UB2_UB and stored back to dst.
 */
1049  int32_t src_stride,
1050  uint8_t *dst, int32_t dst_stride,
1051  const int8_t *filter)
1052 {
1053  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1054  v16u8 dst0, dst1, dst2, dst3, res2, res3;
1055  v16u8 mask0, mask1, mask2, mask3;
1056  v8i16 filt, res0, res1;
1057 
    /* mask taken from the first "4 width cases" row of mc_filt_mask_arr */
1058  mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* start 3 pixels to the left of the output column */
1059  src -= 3;
1060 
1061  /* rearranging filter */
1062  filt = LD_SH(filter);
1063  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1064 
1065  mask1 = mask0 + 2;
1066  mask2 = mask0 + 4;
1067  mask3 = mask0 + 6;
1068 
1069  LD_SB4(src, src_stride, src0, src1, src2, src3);
1070  XORI_B4_128_SB(src0, src1, src2, src3);
1071  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1072  mask3, filt0, filt1, filt2, filt3, res0, res1);
    /* current destination rows, needed for the averaging step */
1073  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1074  SRARI_H2_SH(res0, res1, 7);
1075  SAT_SH2_SH(res0, res1, 7);
1076  PCKEV_B2_UB(res0, res0, res1, res1, res2, res3);
    /* dst rows are combined pairwise (0 with 1, 2 with 3) to line up with
     * res2/res3 */
1077  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
1078  XORI_B2_128_UB(res2, res3);
1079  AVER_UB2_UB(res2, dst0, res3, dst2, res2, res3);
1080  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
1081 }
1082 
/*
 * NOTE(review): the line carrying this function's name and first parameter
 * is absent from the listing (numbering jumps from 1082 to 1084). The
 * height dispatcher further down calls common_hz_8t_and_aver_dst_4x8_msa()
 * with (src, src_stride, dst, dst_stride, filter), which matches this
 * parameter list -- confirm against the original file.
 *
 * Horizontal 8-tap filter for one 4x8 block. Eight source rows are filtered
 * in two groups of four; the result is combined with the eight dst rows
 * through AVER_UB2_UB and stored back to dst.
 */
1084  int32_t src_stride,
1085  uint8_t *dst, int32_t dst_stride,
1086  const int8_t *filter)
1087 {
1088  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1089  v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
1090  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1091  v8i16 filt, vec0, vec1, vec2, vec3;
1092 
    /* mask taken from the first "4 width cases" row of mc_filt_mask_arr */
1093  mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* start 3 pixels to the left of the output column */
1094  src -= 3;
1095 
1096  /* rearranging filter */
1097  filt = LD_SH(filter);
1098  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1099 
1100  mask1 = mask0 + 2;
1101  mask2 = mask0 + 4;
1102  mask3 = mask0 + 6;
1103 
    /* first group of four source rows */
1104  LD_SB4(src, src_stride, src0, src1, src2, src3);
1105  XORI_B4_128_SB(src0, src1, src2, src3);
1106  src += (4 * src_stride);
    /* all eight destination rows, needed for the averaging step */
1107  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
1108  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1109  mask3, filt0, filt1, filt2, filt3, vec0, vec1);
    /* second group of four source rows; src0..src3 are reused */
1110  LD_SB4(src, src_stride, src0, src1, src2, src3);
1111  XORI_B4_128_SB(src0, src1, src2, src3);
1112  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1113  mask3, filt0, filt1, filt2, filt3, vec2, vec3);
1114  SRARI_H4_SH(vec0, vec1, vec2, vec3, 7);
1115  SAT_SH4_SH(vec0, vec1, vec2, vec3, 7);
1116  PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
1117  res0, res1, res2, res3);
1118  ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
1119  XORI_B2_128_UB(res0, res2);
    /* gather the eight dst rows into two vectors (dst0, dst4) to line up
     * with res0/res2 */
1120  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
1121  dst0, dst2, dst4, dst6);
1122  ILVR_D2_UB(dst2, dst0, dst6, dst4, dst0, dst4);
1123  AVER_UB2_UB(res0, dst0, res2, dst4, res0, res2);
1124  ST4x8_UB(res0, res2, dst, dst_stride);
1125 }
1126 
/*
 * NOTE(review): the line carrying this function's name and first parameter
 * is absent from the listing (numbering jumps from 1126 to 1128);
 * presumably common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, ... --
 * confirm against the original file.
 *
 * Height dispatcher for the 4-wide horizontal filter-and-average kernels:
 * height 4 goes to the 4x4 kernel and height 8 to the 4x8 kernel. Any other
 * height falls through both tests and nothing is written.
 */
1128  int32_t src_stride,
1129  uint8_t *dst, int32_t dst_stride,
1130  const int8_t *filter,
1131  int32_t height)
1132 {
1133  if (4 == height) {
1134  common_hz_8t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
1135  filter);
1136  } else if (8 == height) {
1137  common_hz_8t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
1138  filter);
1139  }
1140 }
1141 
/*
 * NOTE(review): the line carrying this function's name and first parameter
 * is absent from the listing (numbering jumps from 1141 to 1143);
 * presumably common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, ... --
 * confirm against the original file.
 *
 * 8-wide horizontal 8-tap filter whose result is averaged with dst and
 * stored back (CONVERT_UB_AVG_ST8x4_UB). The loop runs (height >> 2) times
 * and handles 4 rows per pass.
 */
1143  int32_t src_stride,
1144  uint8_t *dst, int32_t dst_stride,
1145  const int8_t *filter,
1146  int32_t height)
1147 {
1148  int32_t loop_cnt;
1149  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1150  v16u8 mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
1151  v8i16 filt, out0, out1, out2, out3;
1152 
    /* mask taken from the "8 width cases" row of mc_filt_mask_arr */
1153  mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* start 3 pixels to the left of the output column */
1154  src -= 3;
1155 
1156  /* rearranging filter */
1157  filt = LD_SH(filter);
1158  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1159 
1160  mask1 = mask0 + 2;
1161  mask2 = mask0 + 4;
1162  mask3 = mask0 + 6;
1163 
1164  for (loop_cnt = (height >> 2); loop_cnt--;) {
1165  LD_SB4(src, src_stride, src0, src1, src2, src3);
1166  XORI_B4_128_SB(src0, src1, src2, src3);
1167  src += (4 * src_stride);
1168  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
1169  mask3, filt0, filt1, filt2, filt3, out0,
1170  out1, out2, out3);
    /* current destination rows, needed for the averaging step */
1171  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1172  SRARI_H4_SH(out0, out1, out2, out3, 7);
1173  SAT_SH4_SH(out0, out1, out2, out3, 7);
1174  CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
1175  dst, dst_stride);
1176  dst += (4 * dst_stride);
1177  }
1178 }
1179 
/*
 * NOTE(review): the line carrying this function's name and first parameter
 * is absent from the listing (numbering jumps from 1179 to 1181);
 * presumably common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, ... --
 * confirm against the original file.
 *
 * 16-wide horizontal 8-tap filter whose result is averaged with dst and
 * stored back (PCKEV_XORI128_AVG_ST_UB). The loop runs (height >> 1) times
 * and handles 2 rows per pass.
 */
1181  int32_t src_stride,
1182  uint8_t *dst, int32_t dst_stride,
1183  const int8_t *filter,
1184  int32_t height)
1185 {
1186  int32_t loop_cnt;
1187  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1188  v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
1189  v8i16 filt, out0, out1, out2, out3;
1190  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1191  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1192 
    /* mask taken from the "8 width cases" row of mc_filt_mask_arr */
1193  mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* start 3 pixels to the left of the output column */
1194  src -= 3;
1195 
1196  /* rearranging filter */
1197  filt = LD_SH(filter);
1198  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1199 
1200  mask1 = mask0 + 2;
1201  mask2 = mask0 + 4;
1202  mask3 = mask0 + 6;
1203 
1204  for (loop_cnt = height >> 1; loop_cnt--;) {
    /* two rows, each read as a vector at src and another at src + 8 */
1205  LD_SB2(src, src_stride, src0, src2);
1206  LD_SB2(src + 8, src_stride, src1, src3);
1207  src += (2 * src_stride);
1208 
1209  XORI_B4_128_SB(src0, src1, src2, src3);
1210  VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
1211  vec12);
1212  VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
1213  vec13);
1214  VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
1215  vec14);
1216  VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
1217  vec15);
    /* two partial sums per vector: filt0 + filt1 terms in vec0..vec3 and
     * filt2 + filt3 terms in vec8..vec11, combined by ADDS_SH4_SH */
1218  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
1219  vec1, vec2, vec3);
1220  DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
1221  vec9, vec10, vec11);
1222  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
1223  vec1, vec2, vec3);
1224  DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
1225  vec8, vec9, vec10, vec11);
1226  ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
1227  out1, out2, out3);
    /* current destination rows, needed for the averaging step */
1228  LD_UB2(dst, dst_stride, dst0, dst1);
1229  SRARI_H4_SH(out0, out1, out2, out3, 7);
1230  SAT_SH4_SH(out0, out1, out2, out3, 7);
1231  PCKEV_XORI128_AVG_ST_UB(out1, out0, dst0, dst);
1232  dst += dst_stride;
1233  PCKEV_XORI128_AVG_ST_UB(out3, out2, dst1, dst);
1234  dst += dst_stride;
1235  }
1236 }
1237 
/*
 * NOTE(review): the line carrying this function's name and first parameter
 * is absent from the listing (numbering jumps from 1237 to 1239);
 * presumably common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src, ... --
 * confirm against the original file.
 *
 * 32-wide horizontal 8-tap filter whose result is averaged with dst and
 * stored back (PCKEV_XORI128_AVG_ST_UB). The loop runs `height` times and
 * handles one row per pass.
 */
1239  int32_t src_stride,
1240  uint8_t *dst, int32_t dst_stride,
1241  const int8_t *filter,
1242  int32_t height)
1243 {
1244  uint32_t loop_cnt;
1245  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1246  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
1247  v8i16 filt, out0, out1, out2, out3;
1248  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1249  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1250 
    /* mask taken from the "8 width cases" row of mc_filt_mask_arr */
1251  mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* start 3 pixels to the left of the output column */
1252  src -= 3;
1253 
1254  /* rearranging filter */
1255  filt = LD_SH(filter);
1256  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1257 
1258  mask1 = mask0 + 2;
1259  mask2 = mask0 + 4;
1260  mask3 = mask0 + 6;
1261 
1262  for (loop_cnt = height; loop_cnt--;) {
    /* vectors are read at offsets 0, 16 and 24; src1 is built from
     * src0/src2 with __msa_sldi_b(.., 8) instead of a fourth load
     * (presumably the bytes starting at offset 8 -- confirm) */
1263  src0 = LD_SB(src);
1264  src2 = LD_SB(src + 16);
1265  src3 = LD_SB(src + 24);
1266  src1 = __msa_sldi_b(src2, src0, 8);
1267  src += src_stride;
1268 
1269  XORI_B4_128_SB(src0, src1, src2, src3);
1270  VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
1271  vec12);
1272  VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
1273  vec13);
1274  VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
1275  vec14);
1276  VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
1277  vec15);
1278  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
1279  vec1, vec2, vec3);
1280  DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
1281  vec9, vec10, vec11);
1282  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
1283  vec1, vec2, vec3);
1284  DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
1285  vec8, vec9, vec10, vec11);
1286  ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
1287  out1, out2, out3);
1288  SRARI_H4_SH(out0, out1, out2, out3, 7);
1289  SAT_SH4_SH(out0, out1, out2, out3, 7);
    /* the two 16-byte halves of the current dst row (pitch 16 within the
     * row, not dst_stride) */
1290  LD_UB2(dst, 16, dst1, dst2);
1291  PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, dst);
1292  PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, dst + 16);
1293  dst += dst_stride;
1294  }
1295 }
1296 
/*
 * NOTE(review): the line carrying this function's name and first parameter
 * is absent from the listing (numbering jumps from 1296 to 1298);
 * presumably common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, ... --
 * confirm against the original file.
 *
 * 64-wide horizontal 8-tap filter whose result is averaged with dst and
 * stored back (PCKEV_XORI128_AVG_ST_UB). The outer loop runs `height` times
 * (one row per pass); the inner loop handles the row as two 32-byte halves.
 */
1298  int32_t src_stride,
1299  uint8_t *dst, int32_t dst_stride,
1300  const int8_t *filter,
1301  int32_t height)
1302 {
1303  uint32_t loop_cnt, cnt;
1304  v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
1305  v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
1306  v8i16 filt, out0, out1, out2, out3;
1307  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1308  v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1309 
    /* mask taken from the "8 width cases" row of mc_filt_mask_arr */
1310  mask0 = LD_UB(&mc_filt_mask_arr[0]);
    /* start 3 pixels to the left of the output column */
1311  src -= 3;
1312 
1313  /* rearranging filter */
1314  filt = LD_SH(filter);
1315  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1316 
1317  mask1 = mask0 + 2;
1318  mask2 = mask0 + 4;
1319  mask3 = mask0 + 6;
1320 
1321  for (loop_cnt = height; loop_cnt--;) {
    /* cnt << 5 is the byte offset of the half being processed (0 or 32) */
1322  for (cnt = 0; cnt < 2; ++cnt) {
1323  src0 = LD_SB(&src[cnt << 5]);
1324  src2 = LD_SB(&src[16 + (cnt << 5)]);
1325  src3 = LD_SB(&src[24 + (cnt << 5)]);
1326  src1 = __msa_sldi_b(src2, src0, 8);
1327 
1328  XORI_B4_128_SB(src0, src1, src2, src3);
1329  VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
1330  vec12);
1331  VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
1332  vec13);
1333  VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6,
1334  vec10, vec14);
1335  VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7,
1336  vec11, vec15);
1337  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1338  vec0, vec1, vec2, vec3);
1339  DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2,
1340  vec8, vec9, vec10, vec11);
1341  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
1342  vec0, vec1, vec2, vec3);
1343  DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
1344  vec8, vec9, vec10, vec11);
1345  ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
1346  out1, out2, out3);
1347  SRARI_H4_SH(out0, out1, out2, out3, 7);
1348  SAT_SH4_SH(out0, out1, out2, out3, 7);
1349  LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
1350  PCKEV_XORI128_AVG_ST_UB(out1, out0, dst1, &dst[cnt << 5]);
1351  PCKEV_XORI128_AVG_ST_UB(out3, out2, dst2, &dst[16 + (cnt << 5)]);
1352  }
1353 
    /* both halves done: move to the next row */
1354  src += src_stride;
1355  dst += dst_stride;
1356  }
1357 }
1358 
/*
 * NOTE(review): the line carrying this function's name and first parameter
 * is absent from the listing (numbering jumps from 1358 to 1360);
 * presumably common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, ... --
 * confirm against the original file.
 *
 * 4-wide vertical 8-tap filter (FILT_8TAP_DPADD_S_H) whose result is
 * averaged with dst (__msa_aver_u_b) and stored back. The loop runs
 * (height >> 2) times and handles 4 rows per pass.
 */
1360  int32_t src_stride,
1361  uint8_t *dst, int32_t dst_stride,
1362  const int8_t *filter,
1363  int32_t height)
1364 {
1365  uint32_t loop_cnt;
1366  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1367  v16u8 dst0, dst1, dst2, dst3, out;
1368  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1369  v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
1370  v16i8 src10998, filt0, filt1, filt2, filt3;
1371  v8i16 filt, out10, out32;
1372 
    /* start 3 rows above the first output row */
1373  src -= (3 * src_stride);
1374 
1375  filt = LD_SH(filter);
1376  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1377 
    /* prime the filter with the first 7 source rows */
1378  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1379  src += (7 * src_stride);
1380 
    /* interleave neighbouring rows, then pack two row pairs per vector */
1381  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1382  src54_r, src21_r);
1383  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1384  ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
1385  src4332, src6554);
    /* here the XOR-128 step is applied after interleaving, on the packed
     * vectors, not on the freshly loaded rows */
1386  XORI_B3_128_SB(src2110, src4332, src6554);
1387 
1388  for (loop_cnt = (height >> 2); loop_cnt--;) {
1389  LD_SB4(src, src_stride, src7, src8, src9, src10);
1390  src += (4 * src_stride);
1391 
    /* current destination rows, needed for the averaging step */
1392  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1393  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1394  src87_r, src98_r, src109_r);
1395  ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
1396  XORI_B2_128_SB(src8776, src10998);
1397  out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
1398  filt1, filt2, filt3);
1399  out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
1400  filt1, filt2, filt3);
1401  SRARI_H2_SH(out10, out32, 7);
1402  SAT_SH2_SH(out10, out32, 7);
1403  out = PCKEV_XORI128_UB(out10, out32);
    /* gather the four dst rows into one vector to line up with `out` */
1404  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
1405 
1406  dst0 = (v16u8) __msa_ilvr_d((v2i64) dst2, (v2i64) dst0);
1407  out = __msa_aver_u_b(out, dst0);
1408 
1409  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1410  dst += (4 * dst_stride);
1411 
    /* slide the row history down by four rows for the next pass */
1412  src2110 = src6554;
1413  src4332 = src8776;
1414  src6554 = src10998;
1415  src6 = src10;
1416  }
1417 }
1418 
/*
 * NOTE(review): the line carrying this function's name and first parameter
 * is absent from the listing (numbering jumps from 1418 to 1420);
 * presumably common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, ... --
 * confirm against the original file.
 *
 * 8-wide vertical 8-tap filter (FILT_8TAP_DPADD_S_H) whose result is
 * averaged with dst and stored back (CONVERT_UB_AVG_ST8x4_UB). The loop
 * runs (height >> 2) times and handles 4 rows per pass.
 */
1420  int32_t src_stride,
1421  uint8_t *dst, int32_t dst_stride,
1422  const int8_t *filter,
1423  int32_t height)
1424 {
1425  uint32_t loop_cnt;
1426  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1427  v16u8 dst0, dst1, dst2, dst3;
1428  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1429  v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
1430  v8i16 filt, out0, out1, out2, out3;
1431 
    /* start 3 rows above the first output row */
1432  src -= (3 * src_stride);
1433 
1434  filt = LD_SH(filter);
1435  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1436 
    /* prime the filter with the first 7 source rows */
1437  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1438  src += (7 * src_stride);
1439 
1440  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    /* interleave neighbouring rows: (0,1) (2,3) (4,5) (1,2) (3,4) (5,6) */
1441  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1442  src54_r, src21_r);
1443  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1444 
1445  for (loop_cnt = (height >> 2); loop_cnt--;) {
1446  LD_SB4(src, src_stride, src7, src8, src9, src10);
1447  src += (4 * src_stride);
1448 
    /* current destination rows, needed for the averaging step */
1449  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1450  XORI_B4_128_SB(src7, src8, src9, src10);
1451  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1452  src87_r, src98_r, src109_r);
    /* one filter evaluation per output row, each using four row pairs */
1453  out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
1454  filt1, filt2, filt3);
1455  out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
1456  filt1, filt2, filt3);
1457  out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
1458  filt1, filt2, filt3);
1459  out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
1460  filt1, filt2, filt3);
1461  SRARI_H4_SH(out0, out1, out2, out3, 7);
1462  SAT_SH4_SH(out0, out1, out2, out3, 7);
1463  CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
1464  dst, dst_stride);
1465  dst += (4 * dst_stride);
1466 
    /* slide the row-pair history down by four rows for the next pass */
1467  src10_r = src54_r;
1468  src32_r = src76_r;
1469  src54_r = src98_r;
1470  src21_r = src65_r;
1471  src43_r = src87_r;
1472  src65_r = src109_r;
1473  src6 = src10;
1474  }
1475 }
1476 
/*
 * NOTE(review): the line carrying this function's name and first parameter
 * is absent from the listing (numbering jumps from 1476 to 1478). The
 * wrappers that follow call common_vt_8t_and_aver_dst_16w_mult_msa() with
 * (src, src_stride, dst, dst_stride, filter, height, <width>), which matches
 * this parameter list -- confirm against the original file.
 *
 * Vertical 8-tap filter (FILT_8TAP_DPADD_S_H) for blocks whose width is a
 * multiple of 16; the result is averaged with dst (AVER_UB4_UB) and stored
 * back. The outer loop walks (width >> 4) strips of 16 columns; the inner
 * loop runs (height >> 2) times and handles 4 rows per pass.
 */
1478  int32_t src_stride,
1479  uint8_t *dst,
1480  int32_t dst_stride,
1481  const int8_t *filter,
1482  int32_t height,
1483  int32_t width)
1484 {
1485  const uint8_t *src_tmp;
1486  uint8_t *dst_tmp;
1487  uint32_t loop_cnt, cnt;
1488  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1489  v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1490  v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1491  v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1492  v16i8 filt0, filt1, filt2, filt3;
1493  v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
1494  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;
1495 
    /* start 3 rows above the first output row */
1496  src -= (3 * src_stride);
1497 
1498  filt = LD_SH(filter);
1499  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1500 
1501  for (cnt = (width >> 4); cnt--;) {
    /* per-strip cursors; src/dst themselves only move sideways */
1502  src_tmp = src;
1503  dst_tmp = dst;
1504 
    /* prime the filter with the first 7 source rows of this strip */
1505  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1506  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1507  src_tmp += (7 * src_stride);
1508 
    /* row pairs are interleaved twice: ILVR into the _r set and ILVL into
     * the _l set (presumably the low and high 8 columns -- confirm) */
1509  ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
1510  src32_r, src54_r, src21_r);
1511  ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1512  ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
1513  src32_l, src54_l, src21_l);
1514  ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1515 
1516  for (loop_cnt = (height >> 2); loop_cnt--;) {
1517  LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1518  src_tmp += (4 * src_stride);
1519 
    /* current destination rows, needed for the averaging step */
1520  LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
1521  XORI_B4_128_SB(src7, src8, src9, src10);
1522  ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1523  src87_r, src98_r, src109_r);
1524  ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1525  src87_l, src98_l, src109_l);
1526  out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
1527  filt0, filt1, filt2, filt3);
1528  out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
1529  filt0, filt1, filt2, filt3);
1530  out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
1531  filt0, filt1, filt2, filt3);
1532  out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
1533  filt0, filt1, filt2, filt3);
1534  out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
1535  filt0, filt1, filt2, filt3);
1536  out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
1537  filt0, filt1, filt2, filt3);
1538  out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
1539  filt0, filt1, filt2, filt3);
1540  out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
1541  filt0, filt1, filt2, filt3);
1542  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1543  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1544  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1545  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
    /* merge the _l and _r results into one vector per output row */
1546  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1547  out3_r, tmp0, tmp1, tmp2, tmp3);
1548  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1549  AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
1550  dst0, dst1, dst2, dst3);
1551  ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
1552  dst_tmp += (4 * dst_stride);
1553 
    /* slide both row-pair histories down by four rows */
1554  src10_r = src54_r;
1555  src32_r = src76_r;
1556  src54_r = src98_r;
1557  src21_r = src65_r;
1558  src43_r = src87_r;
1559  src65_r = src109_r;
1560  src10_l = src54_l;
1561  src32_l = src76_l;
1562  src54_l = src98_l;
1563  src21_l = src65_l;
1564  src43_l = src87_l;
1565  src65_l = src109_l;
1566  src6 = src10;
1567  }
1568 
    /* next strip of 16 columns */
1569  src += 16;
1570  dst += 16;
1571  }
1572 }
1573 
/*
 * NOTE(review): the line carrying this function's name and first parameter
 * is absent from the listing (numbering jumps from 1573 to 1575);
 * presumably common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src, ... --
 * confirm against the original file.
 *
 * Forwards every argument to common_vt_8t_and_aver_dst_16w_mult_msa() with
 * the width fixed at 16.
 */
1575  int32_t src_stride,
1576  uint8_t *dst, int32_t dst_stride,
1577  const int8_t *filter,
1578  int32_t height)
1579 {
1580  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
1581  filter, height, 16);
1582 }
1583 
/*
 * NOTE(review): the line carrying this function's name and first parameter
 * is absent from the listing (numbering jumps from 1583 to 1585);
 * presumably common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src, ... --
 * confirm against the original file.
 *
 * Forwards every argument to common_vt_8t_and_aver_dst_16w_mult_msa() with
 * the width fixed at 32.
 */
1585  int32_t src_stride,
1586  uint8_t *dst, int32_t dst_stride,
1587  const int8_t *filter,
1588  int32_t height)
1589 {
1590  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
1591  filter, height, 32);
1592 }
1593 
/*
 * NOTE(review): the line carrying this function's name and first parameter
 * is absent from the listing (numbering jumps from 1593 to 1595);
 * presumably common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src, ... --
 * confirm against the original file.
 *
 * Forwards every argument to common_vt_8t_and_aver_dst_16w_mult_msa() with
 * the width fixed at 64.
 */
1595  int32_t src_stride,
1596  uint8_t *dst, int32_t dst_stride,
1597  const int8_t *filter,
1598  int32_t height)
1599 {
1600  common_vt_8t_and_aver_dst_16w_mult_msa(src, src_stride, dst, dst_stride,
1601  filter, height, 64);
1602 }
1603 
/*
 * NOTE(review): the line carrying this function's name and first parameter
 * is absent from the listing (numbering jumps from 1603 to 1605);
 * presumably common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src, ...
 * -- confirm against the original file.
 *
 * 4-wide 2-D filter: an 8-tap horizontal pass (HORIZ_8TAP_FILT) followed by
 * an 8-tap vertical pass (FILT_8TAP_DPADD_S_H); the result is averaged with
 * dst (AVER_UB2_UB) and stored back. The loop runs (height >> 2) times and
 * handles 4 rows per pass.
 */
1605  int32_t src_stride,
1606  uint8_t *dst,
1607  int32_t dst_stride,
1608  const int8_t *filter_horiz,
1609  const int8_t *filter_vert,
1610  int32_t height)
1611 {
1612  uint32_t loop_cnt;
1613  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1614  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3, tmp0, tmp1;
1615  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
1616  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1617  v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
1618  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
1619 
    /* mask taken from the first "4 width cases" row of mc_filt_mask_arr */
1620  mask0 = LD_UB(&mc_filt_mask_arr[16]);
    /* step back 3 columns and 3 rows so the taps surround the output pixel */
1621  src -= (3 + 3 * src_stride);
1622 
1623  /* rearranging filter */
1624  filt = LD_SH(filter_horiz);
1625  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1626 
1627  mask1 = mask0 + 2;
1628  mask2 = mask0 + 4;
1629  mask3 = mask0 + 6;
1630 
    /* prime the vertical filter with the first 7 source rows */
1631  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1632  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1633  src += (7 * src_stride);
1634 
    /* each horizontal call is handed a pair of rows */
1635  hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
1636  filt_hz1, filt_hz2, filt_hz3);
1637  hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
1638  filt_hz1, filt_hz2, filt_hz3);
1639  hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
1640  filt_hz1, filt_hz2, filt_hz3);
1641  hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
1642  filt_hz1, filt_hz2, filt_hz3);
1643  SLDI_B2_SH(hz_out2, hz_out4, hz_out0, hz_out2, hz_out1, hz_out3, 8);
1644 
1645  filt = LD_SH(filter_vert);
1646  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
1647 
    /* byte-interleave the horizontal results to form the vertical operands */
1648  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1649  vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1650 
1651  for (loop_cnt = (height >> 2); loop_cnt--;) {
1652  LD_SB4(src, src_stride, src7, src8, src9, src10);
1653  XORI_B4_128_SB(src7, src8, src9, src10);
1654  src += (4 * src_stride);
1655 
    /* current destination rows, needed for the averaging step */
1656  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1657  hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
1658  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1659  hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
1660  vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1661  res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
1662  filt_vt2, filt_vt3);
1663 
1664  hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
1665  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1666  hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
1667  vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
1668  res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
1669  filt_vt2, filt_vt3);
    /* dst rows are combined pairwise (0 with 1, 2 with 3) to line up with
     * tmp0/tmp1 */
1670  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
1671 
1672  SRARI_H2_SH(res0, res1, 7);
1673  SAT_SH2_SH(res0, res1, 7);
1674  PCKEV_B2_UB(res0, res0, res1, res1, tmp0, tmp1);
1675  XORI_B2_128_UB(tmp0, tmp1);
1676  AVER_UB2_UB(tmp0, dst0, tmp1, dst2, tmp0, tmp1);
1677  ST4x4_UB(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
1678  dst += (4 * dst_stride);
1679 
    /* carry the newest horizontal results into the next pass */
1680  hz_out5 = hz_out9;
1681  vec0 = vec2;
1682  vec1 = vec3;
1683  vec2 = vec4;
1684  }
1685 }
1686 
1688  int32_t src_stride,
1689  uint8_t *dst,
1690  int32_t dst_stride,
1691  const int8_t *filter_horiz,
1692  const int8_t *filter_vert,
1693  int32_t height)
1694 {
1695  uint32_t loop_cnt;
1696  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1697  v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
1698  v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
1699  v16u8 dst0, dst1, dst2, dst3, mask0, mask1, mask2, mask3;
1700  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1701  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
1702  v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;
1703 
1704  mask0 = LD_UB(&mc_filt_mask_arr[0]);
1705  src -= (3 + 3 * src_stride);
1706 
1707  /* rearranging filter */
1708  filt = LD_SH(filter_horiz);
1709  SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1710 
1711  mask1 = mask0 + 2;
1712  mask2 = mask0 + 4;
1713  mask3 = mask0 + 6;
1714 
1715  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1716  src += (7 * src_stride);
1717 
1718  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1719  hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
1720  filt_hz1, filt_hz2, filt_hz3);
1721  hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
1722  filt_hz1, filt_hz2, filt_hz3);
1723  hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
1724  filt_hz1, filt_hz2, filt_hz3);
1725  hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
1726  filt_hz1, filt_hz2, filt_hz3);
1727  hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
1728  filt_hz1, filt_hz2, filt_hz3);
1729  hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
1730  filt_hz1, filt_hz2, filt_hz3);
1731  hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
1732  filt_hz1, filt_hz2, filt_hz3);
1733 
1734  filt = LD_SH(filter_vert);
1735  SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
1736 
1737  ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1738  ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
1739  ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
1740 
1741  for (loop_cnt = (height >> 2); loop_cnt--;) {
1742  LD_SB4(src, src_stride, src7, src8, src9, src10);
1743  XORI_B4_128_SB(src7, src8, src9, src10);
1744  src += (4 * src_stride);
1745 
1746  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1747 
1748  hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
1749  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1750  out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1751  tmp0 = FILT_8TAP_DPADD_S_H(out0, out1, out2, out3, filt_vt0, filt_vt1,
1752  filt_vt2, filt_vt3);
1753 
1754  hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
1755  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1756  out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
1757  tmp1 = FILT_8TAP_DPADD_S_H(out4, out5, out6, out7, filt_vt0, filt_vt1,
1758  filt_vt2, filt_vt3);
1759 
1760  hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
1761  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1762  out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
1763  tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0, filt_vt1,
1764  filt_vt2, filt_vt3);
1765 
1766  hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
1767  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1768  out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
1769  tmp3 = FILT_8TAP_DPADD_S_H(out5, out6, out7, out9, filt_vt0, filt_vt1,
1770  filt_vt2, filt_vt3);
1771 
1772  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1773  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
1774  CONVERT_UB_AVG_ST8x4_UB(tmp0, tmp1, tmp2, tmp3, dst0, dst1, dst2, dst3,
1775  dst, dst_stride);
1776  dst += (4 * dst_stride);
1777 
1778  hz_out6 = hz_out10;
1779  out0 = out2;
1780  out1 = out3;
1781  out2 = out8;
1782  out4 = out6;
1783  out5 = out7;
1784  out6 = out9;
1785  }
1786 }
1787 
/*
 * NOTE(review): the line carrying this function's name and its first
 * parameter (presumably "const uint8_t *src") is absent from this listing;
 * only the remaining parameters and the body are visible here.
 *
 * The body walks the block as two side-by-side vertical strips: each pass
 * hands the current src/dst pointers, both strides, both filter pointers and
 * the height to common_hv_8ht_8vt_and_aver_dst_8w_msa(), then advances src
 * and dst by 8 bytes. Two passes therefore cover 16 bytes per row
 * (presumably the helper handles an 8-pixel-wide strip, per its "8w" name).
 */
1789  int32_t src_stride,
1790  uint8_t *dst,
1791  int32_t dst_stride,
1792  const int8_t *filter_horiz,
1793  const int8_t *filter_vert,
1794  int32_t height)
1795 {
1796  int32_t multiple8_cnt;
1797 
/* counts down from 2: one iteration per strip */
1798  for (multiple8_cnt = 2; multiple8_cnt--;) {
1799  common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
1800  filter_horiz, filter_vert,
1801  height);
1802 
/* step to the next strip, 8 bytes to the right in both buffers */
1803  src += 8;
1804  dst += 8;
1805  }
1806 }
1807 
/*
 * NOTE(review): the line carrying this function's name and its first
 * parameter (presumably "const uint8_t *src") is absent from this listing;
 * only the remaining parameters and the body are visible here.
 *
 * The body walks the block as four side-by-side vertical strips: each pass
 * hands the current src/dst pointers, both strides, both filter pointers and
 * the height to common_hv_8ht_8vt_and_aver_dst_8w_msa(), then advances src
 * and dst by 8 bytes. Four passes therefore cover 32 bytes per row
 * (presumably the helper handles an 8-pixel-wide strip, per its "8w" name).
 */
1809  int32_t src_stride,
1810  uint8_t *dst,
1811  int32_t dst_stride,
1812  const int8_t *filter_horiz,
1813  const int8_t *filter_vert,
1814  int32_t height)
1815 {
1816  int32_t multiple8_cnt;
1817 
/* counts down from 4: one iteration per strip */
1818  for (multiple8_cnt = 4; multiple8_cnt--;) {
1819  common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
1820  filter_horiz, filter_vert,
1821  height);
1822 
/* step to the next strip, 8 bytes to the right in both buffers */
1823  src += 8;
1824  dst += 8;
1825  }
1826 }
1827 
/*
 * NOTE(review): the line carrying this function's name and its first
 * parameter (presumably "const uint8_t *src") is absent from this listing;
 * only the remaining parameters and the body are visible here.
 *
 * The body walks the block as eight side-by-side vertical strips: each pass
 * hands the current src/dst pointers, both strides, both filter pointers and
 * the height to common_hv_8ht_8vt_and_aver_dst_8w_msa(), then advances src
 * and dst by 8 bytes. Eight passes therefore cover 64 bytes per row
 * (presumably the helper handles an 8-pixel-wide strip, per its "8w" name).
 */
1829  int32_t src_stride,
1830  uint8_t *dst,
1831  int32_t dst_stride,
1832  const int8_t *filter_horiz,
1833  const int8_t *filter_vert,
1834  int32_t height)
1835 {
1836  int32_t multiple8_cnt;
1837 
/* counts down from 8: one iteration per strip */
1838  for (multiple8_cnt = 8; multiple8_cnt--;) {
1839  common_hv_8ht_8vt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
1840  filter_horiz, filter_vert,
1841  height);
1842 
/* step to the next strip, 8 bytes to the right in both buffers */
1843  src += 8;
1844  dst += 8;
1845  }
1846 }
1847 
1848 static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
1849  uint8_t *dst, int32_t dst_stride,
1850  const int8_t *filter)
1851 {
1852  v16i8 src0, src1, src2, src3, mask;
1853  v16u8 filt0, vec0, vec1, res0, res1;
1854  v8u16 vec2, vec3, filt;
1855 
1856  mask = LD_SB(&mc_filt_mask_arr[16]);
1857 
1858  /* rearranging filter */
1859  filt = LD_UH(filter);
1860  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1861 
1862  LD_SB4(src, src_stride, src0, src1, src2, src3);
1863  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1864  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
1865  SRARI_H2_UH(vec2, vec3, 7);
1866  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
1867  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1868 }
1869 
1870 static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
1871  uint8_t *dst, int32_t dst_stride,
1872  const int8_t *filter)
1873 {
1874  v16u8 vec0, vec1, vec2, vec3, filt0;
1875  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
1876  v16i8 res0, res1, res2, res3;
1877  v8u16 vec4, vec5, vec6, vec7, filt;
1878 
1879  mask = LD_SB(&mc_filt_mask_arr[16]);
1880 
1881  /* rearranging filter */
1882  filt = LD_UH(filter);
1883  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1884 
1885  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1886  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1887  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
1888  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1889  vec4, vec5, vec6, vec7);
1890  SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
1891  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
1892  res0, res1, res2, res3);
1893  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1894  dst += (4 * dst_stride);
1895  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
1896 }
1897 
1898 void ff_put_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1899  const uint8_t *src, ptrdiff_t src_stride,
1900  int height, int mx, int my)
1901 {
1902  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
1903 
1904  if (4 == height) {
1905  common_hz_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1906  } else if (8 == height) {
1907  common_hz_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1908  }
1909 }
1910 
1911 static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
1912  uint8_t *dst, int32_t dst_stride,
1913  const int8_t *filter)
1914 {
1915  v16u8 filt0;
1916  v16i8 src0, src1, src2, src3, mask;
1917  v8u16 vec0, vec1, vec2, vec3, filt;
1918 
1919  mask = LD_SB(&mc_filt_mask_arr[0]);
1920 
1921  /* rearranging filter */
1922  filt = LD_UH(filter);
1923  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1924 
1925  LD_SB4(src, src_stride, src0, src1, src2, src3);
1926  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1927  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1928  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1929  vec0, vec1, vec2, vec3);
1930  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1931  PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
1932  ST8x4_UB(src0, src1, dst, dst_stride);
1933 }
1934 
1935 static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
1936  uint8_t *dst, int32_t dst_stride,
1937  const int8_t *filter, int32_t height)
1938 {
1939  v16u8 filt0;
1940  v16i8 src0, src1, src2, src3, mask, out0, out1;
1941  v8u16 vec0, vec1, vec2, vec3, filt;
1942 
1943  mask = LD_SB(&mc_filt_mask_arr[0]);
1944 
1945  /* rearranging filter */
1946  filt = LD_UH(filter);
1947  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1948 
1949  LD_SB4(src, src_stride, src0, src1, src2, src3);
1950  src += (4 * src_stride);
1951 
1952  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1953  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1954  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1955  vec0, vec1, vec2, vec3);
1956  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1957  LD_SB4(src, src_stride, src0, src1, src2, src3);
1958  src += (4 * src_stride);
1959 
1960  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1961  ST8x4_UB(out0, out1, dst, dst_stride);
1962  dst += (4 * dst_stride);
1963 
1964  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1965  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1966  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1967  vec0, vec1, vec2, vec3);
1968  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1969  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1970  ST8x4_UB(out0, out1, dst, dst_stride);
1971  dst += (4 * dst_stride);
1972 
1973  if (16 == height) {
1974  LD_SB4(src, src_stride, src0, src1, src2, src3);
1975  src += (4 * src_stride);
1976 
1977  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1978  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1979  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1980  vec0, vec1, vec2, vec3);
1981  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1982  LD_SB4(src, src_stride, src0, src1, src2, src3);
1983  src += (4 * src_stride);
1984 
1985  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1986  ST8x4_UB(out0, out1, dst, dst_stride);
1987 
1988  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1989  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1990  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1991  vec0, vec1, vec2, vec3);
1992  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
1993  PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
1994  ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
1995  }
1996 }
1997 
1998 void ff_put_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride,
1999  const uint8_t *src, ptrdiff_t src_stride,
2000  int height, int mx, int my)
2001 {
2002  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
2003 
2004  if (4 == height) {
2005  common_hz_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
2006  } else {
2007  common_hz_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
2008  height);
2009  }
2010 }
2011 
2012 void ff_put_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2013  const uint8_t *src, ptrdiff_t src_stride,
2014  int height, int mx, int my)
2015 {
2016  uint32_t loop_cnt;
2017  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
2018  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2019  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2020  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
2021 
2022  mask = LD_SB(&mc_filt_mask_arr[0]);
2023 
2024  loop_cnt = (height >> 2) - 1;
2025 
2026  /* rearranging filter */
2027  filt = LD_UH(filter);
2028  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2029 
2030  LD_SB4(src, src_stride, src0, src2, src4, src6);
2031  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2032  src += (4 * src_stride);
2033 
2034  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
2035  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
2036  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
2037  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
2038  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2039  out0, out1, out2, out3);
2040  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2041  out4, out5, out6, out7);
2042  SRARI_H4_UH(out0, out1, out2, out3, 7);
2043  SRARI_H4_UH(out4, out5, out6, out7, 7);
2044  PCKEV_ST_SB(out0, out1, dst);
2045  dst += dst_stride;
2046  PCKEV_ST_SB(out2, out3, dst);
2047  dst += dst_stride;
2048  PCKEV_ST_SB(out4, out5, dst);
2049  dst += dst_stride;
2050  PCKEV_ST_SB(out6, out7, dst);
2051  dst += dst_stride;
2052 
2053  for (; loop_cnt--;) {
2054  LD_SB4(src, src_stride, src0, src2, src4, src6);
2055  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2056  src += (4 * src_stride);
2057 
2058  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
2059  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
2060  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
2061  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
2062  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2063  out0, out1, out2, out3);
2064  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2065  out4, out5, out6, out7);
2066  SRARI_H4_UH(out0, out1, out2, out3, 7);
2067  SRARI_H4_UH(out4, out5, out6, out7, 7);
2068  PCKEV_ST_SB(out0, out1, dst);
2069  dst += dst_stride;
2070  PCKEV_ST_SB(out2, out3, dst);
2071  dst += dst_stride;
2072  PCKEV_ST_SB(out4, out5, dst);
2073  dst += dst_stride;
2074  PCKEV_ST_SB(out6, out7, dst);
2075  dst += dst_stride;
2076  }
2077 }
2078 
2079 void ff_put_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2080  const uint8_t *src, ptrdiff_t src_stride,
2081  int height, int mx, int my)
2082 {
2083  uint32_t loop_cnt;
2084  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
2085  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2086  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2087  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
2088 
2089  mask = LD_SB(&mc_filt_mask_arr[0]);
2090 
2091  /* rearranging filter */
2092  filt = LD_UH(filter);
2093  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2094 
2095  for (loop_cnt = height >> 1; loop_cnt--;) {
2096  src0 = LD_SB(src);
2097  src2 = LD_SB(src + 16);
2098  src3 = LD_SB(src + 24);
2099  src1 = __msa_sldi_b(src2, src0, 8);
2100  src += src_stride;
2101  src4 = LD_SB(src);
2102  src6 = LD_SB(src + 16);
2103  src7 = LD_SB(src + 24);
2104  src5 = __msa_sldi_b(src6, src4, 8);
2105  src += src_stride;
2106 
2107  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
2108  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
2109  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
2110  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
2111  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2112  out0, out1, out2, out3);
2113  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2114  out4, out5, out6, out7);
2115  SRARI_H4_UH(out0, out1, out2, out3, 7);
2116  SRARI_H4_UH(out4, out5, out6, out7, 7);
2117  PCKEV_ST_SB(out0, out1, dst);
2118  PCKEV_ST_SB(out2, out3, dst + 16);
2119  dst += dst_stride;
2120  PCKEV_ST_SB(out4, out5, dst);
2121  PCKEV_ST_SB(out6, out7, dst + 16);
2122  dst += dst_stride;
2123  }
2124 }
2125 
2126 void ff_put_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2127  const uint8_t *src, ptrdiff_t src_stride,
2128  int height, int mx, int my)
2129 {
2130  uint32_t loop_cnt;
2131  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
2132  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2133  v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2134  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
2135 
2136  mask = LD_SB(&mc_filt_mask_arr[0]);
2137 
2138  /* rearranging filter */
2139  filt = LD_UH(filter);
2140  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2141 
2142  for (loop_cnt = height; loop_cnt--;) {
2143  src0 = LD_SB(src);
2144  src2 = LD_SB(src + 16);
2145  src4 = LD_SB(src + 32);
2146  src6 = LD_SB(src + 48);
2147  src7 = LD_SB(src + 56);
2148  SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
2149  src += src_stride;
2150 
2151  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
2152  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
2153  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
2154  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
2155  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2156  out0, out1, out2, out3);
2157  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2158  out4, out5, out6, out7);
2159  SRARI_H4_UH(out0, out1, out2, out3, 7);
2160  SRARI_H4_UH(out4, out5, out6, out7, 7);
2161  PCKEV_ST_SB(out0, out1, dst);
2162  PCKEV_ST_SB(out2, out3, dst + 16);
2163  PCKEV_ST_SB(out4, out5, dst + 32);
2164  PCKEV_ST_SB(out6, out7, dst + 48);
2165  dst += dst_stride;
2166  }
2167 }
2168 
2169 static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride,
2170  uint8_t *dst, int32_t dst_stride,
2171  const int8_t *filter)
2172 {
2173  v16i8 src0, src1, src2, src3, src4;
2174  v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
2175  v16u8 filt0;
2176  v8i16 filt;
2177  v8u16 tmp0, tmp1;
2178 
2179  filt = LD_SH(filter);
2180  filt0 = (v16u8) __msa_splati_h(filt, 0);
2181 
2182  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2183  src += (5 * src_stride);
2184 
2185  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2186  src10_r, src21_r, src32_r, src43_r);
2187  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
2188  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
2189  SRARI_H2_UH(tmp0, tmp1, 7);
2190  SAT_UH2_UH(tmp0, tmp1, 7);
2191  src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2192  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
2193 }
2194 
2195 static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
2196  uint8_t *dst, int32_t dst_stride,
2197  const int8_t *filter)
2198 {
2199  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2200  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
2201  v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
2202  v8u16 tmp0, tmp1, tmp2, tmp3;
2203  v16u8 filt0;
2204  v8i16 filt;
2205 
2206  filt = LD_SH(filter);
2207  filt0 = (v16u8) __msa_splati_h(filt, 0);
2208 
2209  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2210  src += (8 * src_stride);
2211 
2212  src8 = LD_SB(src);
2213  src += src_stride;
2214 
2215  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2216  src32_r, src43_r);
2217  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2218  src76_r, src87_r);
2219  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
2220  src87_r, src76_r, src2110, src4332, src6554, src8776);
2221  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
2222  tmp0, tmp1, tmp2, tmp3);
2223  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2224  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2225  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
2226  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
2227  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2228 }
2229 
2230 void ff_put_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2231  const uint8_t *src, ptrdiff_t src_stride,
2232  int height, int mx, int my)
2233 {
2234  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2235 
2236  if (4 == height) {
2237  common_vt_2t_4x4_msa(src, src_stride, dst, dst_stride, filter);
2238  } else if (8 == height) {
2239  common_vt_2t_4x8_msa(src, src_stride, dst, dst_stride, filter);
2240  }
2241 }
2242 
2243 static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride,
2244  uint8_t *dst, int32_t dst_stride,
2245  const int8_t *filter)
2246 {
2247  v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
2248  v16i8 out0, out1;
2249  v8u16 tmp0, tmp1, tmp2, tmp3;
2250  v8i16 filt;
2251 
2252  /* rearranging filter_y */
2253  filt = LD_SH(filter);
2254  filt0 = (v16u8) __msa_splati_h(filt, 0);
2255 
2256  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
2257  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
2258  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
2259  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2260  tmp0, tmp1, tmp2, tmp3);
2261  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2262  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2263  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2264  ST8x4_UB(out0, out1, dst, dst_stride);
2265 }
2266 
2267 static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
2268  uint8_t *dst, int32_t dst_stride,
2269  const int8_t *filter, int32_t height)
2270 {
2271  uint32_t loop_cnt;
2272  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2273  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2274  v16i8 out0, out1;
2275  v8u16 tmp0, tmp1, tmp2, tmp3;
2276  v8i16 filt;
2277 
2278  /* rearranging filter_y */
2279  filt = LD_SH(filter);
2280  filt0 = (v16u8) __msa_splati_h(filt, 0);
2281 
2282  src0 = LD_UB(src);
2283  src += src_stride;
2284 
2285  for (loop_cnt = (height >> 3); loop_cnt--;) {
2286  LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
2287  src += (8 * src_stride);
2288 
2289  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
2290  vec0, vec1, vec2, vec3);
2291  ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
2292  vec4, vec5, vec6, vec7);
2293  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2294  tmp0, tmp1, tmp2, tmp3);
2295  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2296  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2297  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2298  ST8x4_UB(out0, out1, dst, dst_stride);
2299  dst += (4 * dst_stride);
2300 
2301  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2302  tmp0, tmp1, tmp2, tmp3);
2303  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2304  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2305  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2306  ST8x4_UB(out0, out1, dst, dst_stride);
2307  dst += (4 * dst_stride);
2308 
2309  src0 = src8;
2310  }
2311 }
2312 
2313 void ff_put_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2314  const uint8_t *src, ptrdiff_t src_stride,
2315  int height, int mx, int my)
2316 {
2317  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2318 
2319  if (4 == height) {
2320  common_vt_2t_8x4_msa(src, src_stride, dst, dst_stride, filter);
2321  } else {
2322  common_vt_2t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
2323  height);
2324  }
2325 }
2326 
2327 void ff_put_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2328  const uint8_t *src, ptrdiff_t src_stride,
2329  int height, int mx, int my)
2330 {
2331  uint32_t loop_cnt;
2332  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2333  v16u8 src0, src1, src2, src3, src4;
2334  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2335  v8u16 tmp0, tmp1, tmp2, tmp3;
2336  v8i16 filt;
2337 
2338  /* rearranging filter_y */
2339  filt = LD_SH(filter);
2340  filt0 = (v16u8) __msa_splati_h(filt, 0);
2341 
2342  src0 = LD_UB(src);
2343  src += src_stride;
2344 
2345  for (loop_cnt = (height >> 2); loop_cnt--;) {
2346  LD_UB4(src, src_stride, src1, src2, src3, src4);
2347  src += (4 * src_stride);
2348 
2349  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
2350  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
2351  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2352  SRARI_H2_UH(tmp0, tmp1, 7);
2353  SAT_UH2_UH(tmp0, tmp1, 7);
2354  PCKEV_ST_SB(tmp0, tmp1, dst);
2355  dst += dst_stride;
2356 
2357  ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
2358  ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
2359  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2360  SRARI_H2_UH(tmp2, tmp3, 7);
2361  SAT_UH2_UH(tmp2, tmp3, 7);
2362  PCKEV_ST_SB(tmp2, tmp3, dst);
2363  dst += dst_stride;
2364 
2365  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
2366  SRARI_H2_UH(tmp0, tmp1, 7);
2367  SAT_UH2_UH(tmp0, tmp1, 7);
2368  PCKEV_ST_SB(tmp0, tmp1, dst);
2369  dst += dst_stride;
2370 
2371  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
2372  SRARI_H2_UH(tmp2, tmp3, 7);
2373  SAT_UH2_UH(tmp2, tmp3, 7);
2374  PCKEV_ST_SB(tmp2, tmp3, dst);
2375  dst += dst_stride;
2376 
2377  src0 = src4;
2378  }
2379 }
2380 
2381 void ff_put_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2382  const uint8_t *src, ptrdiff_t src_stride,
2383  int height, int mx, int my)
2384 {
2385  uint32_t loop_cnt;
2386  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2387  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
2388  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2389  v8u16 tmp0, tmp1, tmp2, tmp3;
2390  v8i16 filt;
2391 
2392  /* rearranging filter_y */
2393  filt = LD_SH(filter);
2394  filt0 = (v16u8) __msa_splati_h(filt, 0);
2395 
2396  src0 = LD_UB(src);
2397  src5 = LD_UB(src + 16);
2398  src += src_stride;
2399 
2400  for (loop_cnt = (height >> 2); loop_cnt--;) {
2401  LD_UB4(src, src_stride, src1, src2, src3, src4);
2402  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
2403  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
2404 
2405  LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
2406  src += (4 * src_stride);
2407 
2408  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2409  SRARI_H2_UH(tmp0, tmp1, 7);
2410  SAT_UH2_UH(tmp0, tmp1, 7);
2411  PCKEV_ST_SB(tmp0, tmp1, dst);
2412  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2413  SRARI_H2_UH(tmp2, tmp3, 7);
2414  SAT_UH2_UH(tmp2, tmp3, 7);
2415  PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
2416 
2417  ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
2418  ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
2419  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
2420  SRARI_H2_UH(tmp0, tmp1, 7);
2421  SAT_UH2_UH(tmp0, tmp1, 7);
2422  PCKEV_ST_SB(tmp0, tmp1, dst + 2 * dst_stride);
2423 
2424  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
2425  SRARI_H2_UH(tmp2, tmp3, 7);
2426  SAT_UH2_UH(tmp2, tmp3, 7);
2427  PCKEV_ST_SB(tmp2, tmp3, dst + 3 * dst_stride);
2428 
2429  ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
2430  ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
2431  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2432  SRARI_H2_UH(tmp0, tmp1, 7);
2433  SAT_UH2_UH(tmp0, tmp1, 7);
2434  PCKEV_ST_SB(tmp0, tmp1, dst + 16);
2435 
2436  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2437  SRARI_H2_UH(tmp2, tmp3, 7);
2438  SAT_UH2_UH(tmp2, tmp3, 7);
2439  PCKEV_ST_SB(tmp2, tmp3, dst + 16 + dst_stride);
2440 
2441  ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
2442  ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
2443  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
2444  SRARI_H2_UH(tmp0, tmp1, 7);
2445  SAT_UH2_UH(tmp0, tmp1, 7);
2446  PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
2447 
2448  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
2449  SRARI_H2_UH(tmp2, tmp3, 7);
2450  SAT_UH2_UH(tmp2, tmp3, 7);
2451  PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
2452  dst += (4 * dst_stride);
2453 
2454  src0 = src4;
2455  src5 = src9;
2456  }
2457 }
2458 
2459 void ff_put_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
2460  const uint8_t *src, ptrdiff_t src_stride,
2461  int height, int mx, int my)
2462 {
2463  uint32_t loop_cnt;
2464  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
2465  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2466  v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
2467  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2468  v8i16 filt;
2469 
2470  /* rearranging filter_y */
2471  filt = LD_SH(filter);
2472  filt0 = (v16u8) __msa_splati_h(filt, 0);
2473 
2474  LD_UB4(src, 16, src0, src3, src6, src9);
2475  src += src_stride;
2476 
2477  for (loop_cnt = (height >> 1); loop_cnt--;) {
2478  LD_UB2(src, src_stride, src1, src2);
2479  LD_UB2(src + 16, src_stride, src4, src5);
2480  LD_UB2(src + 32, src_stride, src7, src8);
2481  LD_UB2(src + 48, src_stride, src10, src11);
2482  src += (2 * src_stride);
2483 
2484  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
2485  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
2486  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2487  SRARI_H2_UH(tmp0, tmp1, 7);
2488  SAT_UH2_UH(tmp0, tmp1, 7);
2489  PCKEV_ST_SB(tmp0, tmp1, dst);
2490 
2491  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2492  SRARI_H2_UH(tmp2, tmp3, 7);
2493  SAT_UH2_UH(tmp2, tmp3, 7);
2494  PCKEV_ST_SB(tmp2, tmp3, dst + dst_stride);
2495 
2496  ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
2497  ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
2498  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
2499  SRARI_H2_UH(tmp4, tmp5, 7);
2500  SAT_UH2_UH(tmp4, tmp5, 7);
2501  PCKEV_ST_SB(tmp4, tmp5, dst + 16);
2502 
2503  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
2504  SRARI_H2_UH(tmp6, tmp7, 7);
2505  SAT_UH2_UH(tmp6, tmp7, 7);
2506  PCKEV_ST_SB(tmp6, tmp7, dst + 16 + dst_stride);
2507 
2508  ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
2509  ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
2510  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2511  SRARI_H2_UH(tmp0, tmp1, 7);
2512  SAT_UH2_UH(tmp0, tmp1, 7);
2513  PCKEV_ST_SB(tmp0, tmp1, dst + 32);
2514 
2515  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2516  SRARI_H2_UH(tmp2, tmp3, 7);
2517  SAT_UH2_UH(tmp2, tmp3, 7);
2518  PCKEV_ST_SB(tmp2, tmp3, dst + 32 + dst_stride);
2519 
2520  ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
2521  ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
2522  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
2523  SRARI_H2_UH(tmp4, tmp5, 7);
2524  SAT_UH2_UH(tmp4, tmp5, 7);
2525  PCKEV_ST_SB(tmp4, tmp5, dst + 48);
2526 
2527  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
2528  SRARI_H2_UH(tmp6, tmp7, 7);
2529  SAT_UH2_UH(tmp6, tmp7, 7);
2530  PCKEV_ST_SB(tmp6, tmp7, dst + 48 + dst_stride);
2531  dst += (2 * dst_stride);
2532 
2533  src0 = src2;
2534  src3 = src5;
2535  src6 = src8;
2536  src9 = src11;
2537  }
2538 }
2539 
2540 static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride,
2541  uint8_t *dst, int32_t dst_stride,
2542  const int8_t *filter_horiz, const int8_t *filter_vert)
2543 {
2544  v16i8 src0, src1, src2, src3, src4, mask;
2545  v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
2546  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;
2547 
2548  mask = LD_SB(&mc_filt_mask_arr[16]);
2549 
2550  /* rearranging filter */
2551  filt = LD_UH(filter_horiz);
2552  filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
2553 
2554  filt = LD_UH(filter_vert);
2555  filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
2556 
2557  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2558  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
2559  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
2560  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2561  hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
2562  hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
2563 
2564  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2565  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
2566  SRARI_H2_UH(tmp0, tmp1, 7);
2567  SAT_UH2_UH(tmp0, tmp1, 7);
2568  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
2569  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2570 }
2571 
2572 static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
2573  uint8_t *dst, int32_t dst_stride,
2574  const int8_t *filter_horiz, const int8_t *filter_vert)
2575 {
2576  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
2577  v16i8 res0, res1, res2, res3;
2578  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2579  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2580  v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;
2581 
2582  mask = LD_SB(&mc_filt_mask_arr[16]);
2583 
2584  /* rearranging filter */
2585  filt = LD_UH(filter_horiz);
2586  filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
2587 
2588  filt = LD_UH(filter_vert);
2589  filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
2590 
2591  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2592  src += (8 * src_stride);
2593  src8 = LD_SB(src);
2594 
2595  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
2596  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
2597  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
2598  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
2599  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
2600  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
2601  hz_out3, hz_out5, 8);
2602  hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
2603 
2604  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2605  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
2606  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
2607  vec4, vec5, vec6, vec7);
2608  SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
2609  SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
2610  PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
2611  res0, res1, res2, res3);
2612  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2613  dst += (4 * dst_stride);
2614  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
2615 }
2616 
2617 void ff_put_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2618  const uint8_t *src, ptrdiff_t src_stride,
2619  int height, int mx, int my)
2620 {
2621  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
2622  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
2623 
2624  if (4 == height) {
2625  common_hv_2ht_2vt_4x4_msa(src, src_stride, dst, dst_stride,
2626  filter_horiz, filter_vert);
2627  } else if (8 == height) {
2628  common_hv_2ht_2vt_4x8_msa(src, src_stride, dst, dst_stride,
2629  filter_horiz, filter_vert);
2630  }
2631 }
2632 
2633 static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride,
2634  uint8_t *dst, int32_t dst_stride,
2635  const int8_t *filter_horiz, const int8_t *filter_vert)
2636 {
2637  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2638  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2639  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
2640  v8i16 filt;
2641 
2642  mask = LD_SB(&mc_filt_mask_arr[0]);
2643 
2644  /* rearranging filter */
2645  filt = LD_SH(filter_horiz);
2646  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2647 
2648  filt = LD_SH(filter_vert);
2649  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2650 
2651  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2652 
2653  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2654  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2655  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2656  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
2657 
2658  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2659  vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2660  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
2661 
2662  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2663  vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2664  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
2665 
2666  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2667  vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2668  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
2669 
2670  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2671  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
2672  PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
2673  ST8x4_UB(out0, out1, dst, dst_stride);
2674 }
2675 
2676 static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride,
2677  uint8_t *dst, int32_t dst_stride,
2678  const int8_t *filter_horiz, const int8_t *filter_vert,
2679  int32_t height)
2680 {
2681  uint32_t loop_cnt;
2682  v16i8 src0, src1, src2, src3, src4, mask, out0, out1;
2683  v16u8 filt_hz, filt_vt, vec0;
2684  v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
2685  v8i16 filt;
2686 
2687  mask = LD_SB(&mc_filt_mask_arr[0]);
2688 
2689  /* rearranging filter */
2690  filt = LD_SH(filter_horiz);
2691  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2692 
2693  filt = LD_SH(filter_vert);
2694  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2695 
2696  src0 = LD_SB(src);
2697  src += src_stride;
2698 
2699  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2700 
2701  for (loop_cnt = (height >> 3); loop_cnt--;) {
2702  LD_SB4(src, src_stride, src1, src2, src3, src4);
2703  src += (4 * src_stride);
2704 
2705  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2706  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2707  tmp1 = __msa_dotp_u_h(vec0, filt_vt);
2708 
2709  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2710  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2711  tmp2 = __msa_dotp_u_h(vec0, filt_vt);
2712 
2713  SRARI_H2_UH(tmp1, tmp2, 7);
2714  SAT_UH2_UH(tmp1, tmp2, 7);
2715 
2716  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2717  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2718  tmp3 = __msa_dotp_u_h(vec0, filt_vt);
2719 
2720  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2721  LD_SB4(src, src_stride, src1, src2, src3, src4);
2722  src += (4 * src_stride);
2723  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2724  tmp4 = __msa_dotp_u_h(vec0, filt_vt);
2725 
2726  SRARI_H2_UH(tmp3, tmp4, 7);
2727  SAT_UH2_UH(tmp3, tmp4, 7);
2728  PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
2729  ST8x4_UB(out0, out1, dst, dst_stride);
2730  dst += (4 * dst_stride);
2731 
2732  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2733  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2734  tmp5 = __msa_dotp_u_h(vec0, filt_vt);
2735 
2736  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2737  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2738  tmp6 = __msa_dotp_u_h(vec0, filt_vt);
2739 
2740  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2741  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2742  tmp7 = __msa_dotp_u_h(vec0, filt_vt);
2743 
2744  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2745  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2746  tmp8 = __msa_dotp_u_h(vec0, filt_vt);
2747 
2748  SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2749  SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
2750  PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
2751  ST8x4_UB(out0, out1, dst, dst_stride);
2752  dst += (4 * dst_stride);
2753  }
2754 }
2755 
2756 void ff_put_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2757  const uint8_t *src, ptrdiff_t src_stride,
2758  int height, int mx, int my)
2759 {
2760  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
2761  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
2762 
2763  if (4 == height) {
2764  common_hv_2ht_2vt_8x4_msa(src, src_stride, dst, dst_stride,
2765  filter_horiz, filter_vert);
2766  } else {
2767  common_hv_2ht_2vt_8x8mult_msa(src, src_stride, dst, dst_stride,
2768  filter_horiz, filter_vert, height);
2769  }
2770 }
2771 
2772 void ff_put_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
2773  const uint8_t *src, ptrdiff_t src_stride,
2774  int height, int mx, int my)
2775 {
2776  uint32_t loop_cnt;
2777  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
2778  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
2779  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2780  v16u8 filt_hz, filt_vt, vec0, vec1;
2781  v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
2782  v8i16 filt;
2783 
2784  mask = LD_SB(&mc_filt_mask_arr[0]);
2785 
2786  /* rearranging filter */
2787  filt = LD_SH(filter_horiz);
2788  filt_hz = (v16u8) __msa_splati_h(filt, 0);
2789 
2790  filt = LD_SH(filter_vert);
2791  filt_vt = (v16u8) __msa_splati_h(filt, 0);
2792 
2793  LD_SB2(src, 8, src0, src1);
2794  src += src_stride;
2795 
2796  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2797  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2798 
2799 
2800  for (loop_cnt = (height >> 2); loop_cnt--;) {
2801  LD_SB4(src, src_stride, src0, src2, src4, src6);
2802  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2803  src += (4 * src_stride);
2804 
2805  hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
2806  hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
2807  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2808  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2809  SRARI_H2_UH(tmp1, tmp2, 7);
2810  SAT_UH2_UH(tmp1, tmp2, 7);
2811  PCKEV_ST_SB(tmp1, tmp2, dst);
2812  dst += dst_stride;
2813 
2814  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
2815  hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
2816  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2817  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2818  SRARI_H2_UH(tmp1, tmp2, 7);
2819  SAT_UH2_UH(tmp1, tmp2, 7);
2820  PCKEV_ST_SB(tmp1, tmp2, dst);
2821  dst += dst_stride;
2822 
2823  hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
2824  hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
2825  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2826  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2827  SRARI_H2_UH(tmp1, tmp2, 7);
2828  SAT_UH2_UH(tmp1, tmp2, 7);
2829  PCKEV_ST_SB(tmp1, tmp2, dst);
2830  dst += dst_stride;
2831 
2832  hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
2833  hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
2834  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2835  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2836  SRARI_H2_UH(tmp1, tmp2, 7);
2837  SAT_UH2_UH(tmp1, tmp2, 7);
2838  PCKEV_ST_SB(tmp1, tmp2, dst);
2839  dst += dst_stride;
2840  }
2841 }
2842 
/*
 * Bilinear horizontal + vertical "put" for 32-pixel-wide blocks, done as two
 * side-by-side 16-wide columns via ff_put_bilin_16hv_msa().
 */
void ff_put_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t col;

    for (col = 0; col < 2; col++) {
        ff_put_bilin_16hv_msa(dst + 16 * col, dst_stride,
                              src + 16 * col, src_stride,
                              height, mx, my);
    }
}
2856 
/*
 * Bilinear horizontal + vertical "put" for 64-pixel-wide blocks, done as four
 * side-by-side 16-wide columns via ff_put_bilin_16hv_msa().
 */
void ff_put_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int32_t col;

    for (col = 0; col < 4; col++) {
        ff_put_bilin_16hv_msa(dst + 16 * col, dst_stride,
                              src + 16 * col, src_stride,
                              height, mx, my);
    }
}
2870 
2872  int32_t src_stride,
2873  uint8_t *dst, int32_t dst_stride,
2874  const int8_t *filter)
2875 {
2876  v16i8 src0, src1, src2, src3, mask;
2877  v16u8 filt0, dst0, dst1, dst2, dst3, vec0, vec1, res0, res1;
2878  v8u16 vec2, vec3, filt;
2879 
2880  mask = LD_SB(&mc_filt_mask_arr[16]);
2881 
2882  /* rearranging filter */
2883  filt = LD_UH(filter);
2884  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2885 
2886  LD_SB4(src, src_stride, src0, src1, src2, src3);
2887  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2888  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
2889  DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
2890  SRARI_H2_UH(vec2, vec3, 7);
2891  PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
2892  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
2893  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
2894  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2895 }
2896 
2898  int32_t src_stride,
2899  uint8_t *dst, int32_t dst_stride,
2900  const int8_t *filter)
2901 {
2902  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
2903  v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
2904  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2905  v8u16 vec4, vec5, vec6, vec7, filt;
2906 
2907  mask = LD_SB(&mc_filt_mask_arr[16]);
2908 
2909  /* rearranging filter */
2910  filt = LD_UH(filter);
2911  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2912 
2913  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2914  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2915  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
2916  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
2917  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
2918  vec6, vec7);
2919  SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
2920  PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
2921  res2, res3);
2922  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
2923  dst4, dst6);
2924  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
2925  res2, res3);
2926  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
2927  dst += (4 * dst_stride);
2928  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
2929 }
2930 
2931 void ff_avg_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
2932  const uint8_t *src, ptrdiff_t src_stride,
2933  int height, int mx, int my)
2934 {
2935  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
2936 
2937  if (4 == height) {
2938  common_hz_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
2939  filter);
2940  } else if (8 == height) {
2941  common_hz_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
2942  filter);
2943  }
2944 }
2945 
2947  int32_t src_stride,
2948  uint8_t *dst, int32_t dst_stride,
2949  const int8_t *filter)
2950 {
2951  v16i8 src0, src1, src2, src3, mask;
2952  v16u8 filt0, dst0, dst1, dst2, dst3;
2953  v8u16 vec0, vec1, vec2, vec3, filt;
2954 
2955  mask = LD_SB(&mc_filt_mask_arr[0]);
2956 
2957  /* rearranging filter */
2958  filt = LD_UH(filter);
2959  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2960 
2961  LD_SB4(src, src_stride, src0, src1, src2, src3);
2962  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2963  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2964  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2965  vec0, vec1, vec2, vec3);
2966  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2967  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2968  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
2969  dst, dst_stride);
2970 }
2971 
2973  int32_t src_stride,
2974  uint8_t *dst,
2975  int32_t dst_stride,
2976  const int8_t *filter,
2977  int32_t height)
2978 {
2979  v16i8 src0, src1, src2, src3, mask;
2980  v16u8 filt0, dst0, dst1, dst2, dst3;
2981  v8u16 vec0, vec1, vec2, vec3, filt;
2982 
2983  mask = LD_SB(&mc_filt_mask_arr[0]);
2984 
2985  /* rearranging filter */
2986  filt = LD_UH(filter);
2987  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2988 
2989  LD_SB4(src, src_stride, src0, src1, src2, src3);
2990  src += (4 * src_stride);
2991  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2992  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2993  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
2994  vec2, vec3);
2995  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
2996  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2997  LD_SB4(src, src_stride, src0, src1, src2, src3);
2998  src += (4 * src_stride);
2999  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
3000  dst, dst_stride);
3001  dst += (4 * dst_stride);
3002 
3003  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
3004  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
3005  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
3006  vec2, vec3);
3007  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
3008  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3009  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
3010  dst, dst_stride);
3011  dst += (4 * dst_stride);
3012 
3013  if (16 == height) {
3014  LD_SB4(src, src_stride, src0, src1, src2, src3);
3015  src += (4 * src_stride);
3016 
3017  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
3018  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
3019  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
3020  vec1, vec2, vec3);
3021  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
3022  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3023  LD_SB4(src, src_stride, src0, src1, src2, src3);
3024  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
3025  dst, dst_stride);
3026  dst += (4 * dst_stride);
3027 
3028  VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
3029  VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
3030  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
3031  vec1, vec2, vec3);
3032  SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
3033  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3034  PCKEV_AVG_ST8x4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
3035  dst, dst_stride);
3036  }
3037 }
3038 
3039 void ff_avg_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride,
3040  const uint8_t *src, ptrdiff_t src_stride,
3041  int height, int mx, int my)
3042 {
3043  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3044 
3045  if (4 == height) {
3046  common_hz_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
3047  filter);
3048  } else {
3049  common_hz_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
3050  filter, height);
3051  }
3052 }
3053 
3054 void ff_avg_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride,
3055  const uint8_t *src, ptrdiff_t src_stride,
3056  int height, int mx, int my)
3057 {
3058  uint32_t loop_cnt;
3059  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3060  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3061  v16u8 filt0, dst0, dst1, dst2, dst3;
3062  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3063  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
3064 
3065  mask = LD_SB(&mc_filt_mask_arr[0]);
3066 
3067  /* rearranging filter */
3068  filt = LD_UH(filter);
3069  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3070 
3071  LD_SB4(src, src_stride, src0, src2, src4, src6);
3072  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3073  src += (4 * src_stride);
3074 
3075  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3076  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3077  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3078  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3079  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
3080  res2, res3);
3081  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
3082  res6, res7);
3083  SRARI_H4_UH(res0, res1, res2, res3, 7);
3084  SRARI_H4_UH(res4, res5, res6, res7, 7);
3085  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3086  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
3087  dst += dst_stride;
3088  PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
3089  dst += dst_stride;
3090  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
3091  dst += dst_stride;
3092  PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
3093  dst += dst_stride;
3094 
3095  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
3096  LD_SB4(src, src_stride, src0, src2, src4, src6);
3097  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3098  src += (4 * src_stride);
3099 
3100  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3101  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3102  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3103  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3104  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
3105  res1, res2, res3);
3106  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4,
3107  res5, res6, res7);
3108  SRARI_H4_UH(res0, res1, res2, res3, 7);
3109  SRARI_H4_UH(res4, res5, res6, res7, 7);
3110  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3111  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
3112  dst += dst_stride;
3113  PCKEV_AVG_ST_UB(res3, res2, dst1, dst);
3114  dst += dst_stride;
3115  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
3116  dst += dst_stride;
3117  PCKEV_AVG_ST_UB(res7, res6, dst3, dst);
3118  dst += dst_stride;
3119  }
3120 }
3121 
3122 void ff_avg_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride,
3123  const uint8_t *src, ptrdiff_t src_stride,
3124  int height, int mx, int my)
3125 {
3126  uint32_t loop_cnt;
3127  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3128  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3129  v16u8 filt0, dst0, dst1, dst2, dst3;
3130  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3131  v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
3132 
3133  mask = LD_SB(&mc_filt_mask_arr[0]);
3134 
3135  /* rearranging filter */
3136  filt = LD_UH(filter);
3137  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3138 
3139  for (loop_cnt = (height >> 1); loop_cnt--;) {
3140  src0 = LD_SB(src);
3141  src2 = LD_SB(src + 16);
3142  src3 = LD_SB(src + 24);
3143  src1 = __msa_sldi_b(src2, src0, 8);
3144  src += src_stride;
3145  src4 = LD_SB(src);
3146  src6 = LD_SB(src + 16);
3147  src7 = LD_SB(src + 24);
3148  src5 = __msa_sldi_b(src6, src4, 8);
3149  src += src_stride;
3150 
3151  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3152  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3153  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3154  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3155  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3156  res0, res1, res2, res3);
3157  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
3158  res4, res5, res6, res7);
3159  SRARI_H4_UH(res0, res1, res2, res3, 7);
3160  SRARI_H4_UH(res4, res5, res6, res7, 7);
3161  LD_UB2(dst, 16, dst0, dst1);
3162  PCKEV_AVG_ST_UB(res1, res0, dst0, dst);
3163  PCKEV_AVG_ST_UB(res3, res2, dst1, (dst + 16));
3164  dst += dst_stride;
3165  LD_UB2(dst, 16, dst2, dst3);
3166  PCKEV_AVG_ST_UB(res5, res4, dst2, dst);
3167  PCKEV_AVG_ST_UB(res7, res6, dst3, (dst + 16));
3168  dst += dst_stride;
3169  }
3170 }
3171 
3172 void ff_avg_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride,
3173  const uint8_t *src, ptrdiff_t src_stride,
3174  int height, int mx, int my)
3175 {
3176  uint32_t loop_cnt;
3177  const int8_t *filter = vp9_bilinear_filters_msa[mx - 1];
3178  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3179  v16u8 filt0, dst0, dst1, dst2, dst3;
3180  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3181  v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
3182 
3183  mask = LD_SB(&mc_filt_mask_arr[0]);
3184 
3185  /* rearranging filter */
3186  filt = LD_UH(filter);
3187  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3188 
3189  for (loop_cnt = height; loop_cnt--;) {
3190  LD_SB4(src, 16, src0, src2, src4, src6);
3191  src7 = LD_SB(src + 56);
3192  SLDI_B3_SB(src2, src4, src6, src0, src2, src4, src1, src3, src5, 8);
3193  src += src_stride;
3194 
3195  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3196  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3197  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3198  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3199  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3200  out0, out1, out2, out3);
3201  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
3202  out4, out5, out6, out7);
3203  SRARI_H4_UH(out0, out1, out2, out3, 7);
3204  SRARI_H4_UH(out4, out5, out6, out7, 7);
3205  LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
3206  PCKEV_AVG_ST_UB(out1, out0, dst0, dst);
3207  PCKEV_AVG_ST_UB(out3, out2, dst1, dst + 16);
3208  PCKEV_AVG_ST_UB(out5, out4, dst2, dst + 32);
3209  PCKEV_AVG_ST_UB(out7, out6, dst3, dst + 48);
3210  dst += dst_stride;
3211  }
3212 }
3213 
3215  int32_t src_stride,
3216  uint8_t *dst, int32_t dst_stride,
3217  const int8_t *filter)
3218 {
3219  v16i8 src0, src1, src2, src3, src4;
3220  v16u8 dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
3221  v16i8 src10_r, src32_r, src21_r, src43_r;
3222  v8i16 filt;
3223  v8u16 tmp0, tmp1;
3224 
3225  filt = LD_SH(filter);
3226  filt0 = (v16u8) __msa_splati_h(filt, 0);
3227 
3228  LD_SB4(src, src_stride, src0, src1, src2, src3);
3229  src += (4 * src_stride);
3230 
3231  src4 = LD_SB(src);
3232  src += src_stride;
3233 
3234  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3235  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
3236  dst0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
3237  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
3238  src10_r, src21_r, src32_r, src43_r);
3239  ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3240  DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
3241  SRARI_H2_UH(tmp0, tmp1, 7);
3242  SAT_UH2_UH(tmp0, tmp1, 7);
3243 
3244  out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3245  out = __msa_aver_u_b(out, dst0);
3246 
3247  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
3248 }
3249 
3251  int32_t src_stride,
3252  uint8_t *dst, int32_t dst_stride,
3253  const int8_t *filter)
3254 {
3255  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3256  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
3257  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3258  v16u8 src2110, src4332, src6554, src8776, filt0;
3259  v8u16 tmp0, tmp1, tmp2, tmp3;
3260  v8i16 filt;
3261 
3262  filt = LD_SH(filter);
3263  filt0 = (v16u8) __msa_splati_h(filt, 0);
3264 
3265  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3266  src += (8 * src_stride);
3267  src8 = LD_SB(src);
3268 
3269  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
3270  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
3271  dst2, dst3);
3272  ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
3273  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3274  src32_r, src43_r);
3275  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3276  src76_r, src87_r);
3277  ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
3278  src87_r, src76_r, src2110, src4332, src6554, src8776);
3279  DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
3280  tmp0, tmp1, tmp2, tmp3);
3281  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3282  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3283  PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
3284  AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
3285  ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
3286  dst += (4 * dst_stride);
3287  ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst, dst_stride);
3288 }
3289 
3290 void ff_avg_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3291  const uint8_t *src, ptrdiff_t src_stride,
3292  int height, int mx, int my)
3293 {
3294  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3295 
3296  if (4 == height) {
3297  common_vt_2t_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
3298  filter);
3299  } else if (8 == height) {
3300  common_vt_2t_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
3301  filter);
3302  }
3303 }
3304 
3306  int32_t src_stride,
3307  uint8_t *dst,
3308  int32_t dst_stride,
3309  const int8_t *filter)
3310 {
3311  v16u8 src0, src1, src2, src3, src4;
3312  v16u8 dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
3313  v8u16 tmp0, tmp1, tmp2, tmp3;
3314  v8i16 filt;
3315 
3316  /* rearranging filter_y */
3317  filt = LD_SH(filter);
3318  filt0 = (v16u8) __msa_splati_h(filt, 0);
3319 
3320  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
3321  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3322  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
3323  ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
3324  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3325  tmp0, tmp1, tmp2, tmp3);
3326  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3327  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3328  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
3329  dst, dst_stride);
3330 }
3331 
3333  int32_t src_stride,
3334  uint8_t *dst,
3335  int32_t dst_stride,
3336  const int8_t *filter,
3337  int32_t height)
3338 {
3339  uint32_t loop_cnt;
3340  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3341  v16u8 dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3342  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
3343  v8u16 tmp0, tmp1, tmp2, tmp3;
3344  v8i16 filt;
3345 
3346  /* rearranging filter_y */
3347  filt = LD_SH(filter);
3348  filt0 = (v16u8) __msa_splati_h(filt, 0);
3349 
3350  src0 = LD_UB(src);
3351  src += src_stride;
3352 
3353  for (loop_cnt = (height >> 3); loop_cnt--;) {
3354  LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
3355  src += (8 * src_stride);
3356  LD_UB8(dst, dst_stride, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8);
3357 
3358  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
3359  vec0, vec1, vec2, vec3);
3360  ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
3361  vec4, vec5, vec6, vec7);
3362  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3363  tmp0, tmp1, tmp2, tmp3);
3364  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3365  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3366  PCKEV_AVG_ST8x4_UB(tmp0, dst1, tmp1, dst2, tmp2, dst3, tmp3,
3367  dst4, dst, dst_stride);
3368  dst += (4 * dst_stride);
3369 
3370  DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
3371  tmp0, tmp1, tmp2, tmp3);
3372  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3373  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3374  PCKEV_AVG_ST8x4_UB(tmp0, dst5, tmp1, dst6, tmp2, dst7, tmp3,
3375  dst8, dst, dst_stride);
3376  dst += (4 * dst_stride);
3377 
3378  src0 = src8;
3379  }
3380 }
3381 
3382 void ff_avg_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3383  const uint8_t *src, ptrdiff_t src_stride,
3384  int height, int mx, int my)
3385 {
3386  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3387 
3388  if (4 == height) {
3389  common_vt_2t_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
3390  filter);
3391  } else {
3392  common_vt_2t_and_aver_dst_8x8mult_msa(src, src_stride, dst, dst_stride,
3393  filter, height);
3394  }
3395 }
3396 
3397 void ff_avg_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3398  const uint8_t *src, ptrdiff_t src_stride,
3399  int height, int mx, int my)
3400 {
3401  uint32_t loop_cnt;
3402  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3403  v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
3404  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3405  v8u16 tmp0, tmp1, tmp2, tmp3, filt;
3406 
3407  /* rearranging filter_y */
3408  filt = LD_UH(filter);
3409  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3410 
3411  src0 = LD_UB(src);
3412  src += src_stride;
3413 
3414  for (loop_cnt = (height >> 2); loop_cnt--;) {
3415  LD_UB4(src, src_stride, src1, src2, src3, src4);
3416  src += (4 * src_stride);
3417 
3418  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3419  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
3420  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
3421  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3422  SRARI_H2_UH(tmp0, tmp1, 7);
3423  SAT_UH2_UH(tmp0, tmp1, 7);
3424  PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3425  dst += dst_stride;
3426 
3427  ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
3428  ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
3429  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3430  SRARI_H2_UH(tmp2, tmp3, 7);
3431  SAT_UH2_UH(tmp2, tmp3, 7);
3432  PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst);
3433  dst += dst_stride;
3434 
3435  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
3436  SRARI_H2_UH(tmp0, tmp1, 7);
3437  SAT_UH2_UH(tmp0, tmp1, 7);
3438  PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
3439  dst += dst_stride;
3440 
3441  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
3442  SRARI_H2_UH(tmp2, tmp3, 7);
3443  SAT_UH2_UH(tmp2, tmp3, 7);
3444  PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst);
3445  dst += dst_stride;
3446 
3447  src0 = src4;
3448  }
3449 }
3450 
/*
 * Vertical 2-tap (bilinear) prediction for a 32-byte-wide block, combined
 * with the pixels already in dst.
 *
 * dst, dst_stride: destination block and its row pitch; dst rows are read
 *                  (LD_UB4) before they are written back.
 * src, src_stride: source block and its row pitch; one row more than the
 *                  processed height is read.
 * height:          rows to produce; the loop runs height >> 2 times and emits
 *                  four rows per pass, so a remainder of height % 4 rows is
 *                  not processed.
 * mx:              not used by this function.
 * my:              selects the tap pair vp9_bilinear_filters_msa[my - 1].
 *
 * The block is handled as two 16-byte columns (byte offsets 0 and 16).
 * NOTE(review): the exact semantics of the LD_/ILV/DOTP/SRARI/SAT/PCKEV_AVG_ST
 * macros are defined outside this excerpt; the comments below go by their
 * names - confirm against the macro header.
 */
3451 void ff_avg_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3452  const uint8_t *src, ptrdiff_t src_stride,
3453  int height, int mx, int my)
3454 {
3455  uint32_t loop_cnt;
3456  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3457  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
3458  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3459  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
3460  v8u16 tmp0, tmp1, tmp2, tmp3, filt;
3461 
3462  /* rearranging filter_y */
3463  filt = LD_UH(filter);
 /* replicate halfword 0 (the two 8-bit taps) across the whole vector */
3464  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3465 
 /* first source row: src0 = left 16 bytes, src5 = right 16 bytes */
3466  LD_UB2(src, 16, src0, src5);
3467  src += src_stride;
3468 
3469  for (loop_cnt = (height >> 2); loop_cnt--;) {
 /* left column: next four source rows and the four current dst rows */
3470  LD_UB4(src, src_stride, src1, src2, src3, src4);
3471  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3472  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
3473  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
3474 
 /* right column: same four rows at byte offset 16 */
3475  LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
3476  LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
3477  src += (4 * src_stride);
3478 
 /* shift by 7: the tap pairs in vp9_bilinear_filters_msa sum to 128 */
3479  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3480  SRARI_H2_UH(tmp0, tmp1, 7);
3481  SAT_UH2_UH(tmp0, tmp1, 7);
3482  PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3483 
3484  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3485  SRARI_H2_UH(tmp2, tmp3, 7);
3486  SAT_UH2_UH(tmp2, tmp3, 7);
3487  PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
3488 
3489  ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
3490  ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
3491  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
3492  SRARI_H2_UH(tmp0, tmp1, 7);
3493  SAT_UH2_UH(tmp0, tmp1, 7);
3494  PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst + 2 * dst_stride);
3495 
3496  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
3497  SRARI_H2_UH(tmp2, tmp3, 7);
3498  SAT_UH2_UH(tmp2, tmp3, 7);
3499  PCKEV_AVG_ST_UB(tmp3, tmp2, dst3, dst + 3 * dst_stride);
3500 
 /* right column rows 0..3; vec0..vec7 are reused from here on */
3501  ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
3502  ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
3503  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3504  SRARI_H2_UH(tmp0, tmp1, 7);
3505  SAT_UH2_UH(tmp0, tmp1, 7);
3506  PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 16);
3507 
3508  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3509  SRARI_H2_UH(tmp2, tmp3, 7);
3510  SAT_UH2_UH(tmp2, tmp3, 7);
3511  PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 16 + dst_stride);
3512 
3513  ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
3514  ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
3515  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
3516  SRARI_H2_UH(tmp0, tmp1, 7);
3517  SAT_UH2_UH(tmp0, tmp1, 7);
3518  PCKEV_AVG_ST_UB(tmp1, tmp0, dst6, dst + 16 + 2 * dst_stride);
3519 
3520  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
3521  SRARI_H2_UH(tmp2, tmp3, 7);
3522  SAT_UH2_UH(tmp2, tmp3, 7);
3523  PCKEV_AVG_ST_UB(tmp3, tmp2, dst7, dst + 16 + 3 * dst_stride);
3524  dst += (4 * dst_stride);
3525 
 /* the last row of this pass is the first row of the next pass */
3526  src0 = src4;
3527  src5 = src9;
3528  }
3529 }
3530 
/*
 * Vertical 2-tap (bilinear) prediction for a 64-byte-wide block, combined
 * with the pixels already in dst.
 *
 * dst, dst_stride: destination block and its row pitch; dst rows are read
 *                  (LD_UB2) before they are written back.
 * src, src_stride: source block and its row pitch; one row more than the
 *                  processed height is read.
 * height:          rows to produce; the loop runs height >> 1 times and emits
 *                  two rows per pass, so an odd trailing row is not processed.
 * mx:              not used by this function.
 * my:              selects the tap pair vp9_bilinear_filters_msa[my - 1].
 *
 * The block is handled as four 16-byte columns (byte offsets 0, 16, 32, 48).
 * NOTE(review): the exact semantics of the LD_/ILV/DOTP/SRARI/SAT/PCKEV_AVG_ST
 * macros are defined outside this excerpt - confirm against the macro header.
 */
3531 void ff_avg_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride,
3532  const uint8_t *src, ptrdiff_t src_stride,
3533  int height, int mx, int my)
3534 {
3535  uint32_t loop_cnt;
3536  const int8_t *filter = vp9_bilinear_filters_msa[my - 1];
3537  v16u8 src0, src1, src2, src3, src4, src5;
3538  v16u8 src6, src7, src8, src9, src10, src11, filt0;
3539  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3540  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3541  v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3542  v8u16 filt;
3543 
3544  /* rearranging filter_y */
3545  filt = LD_UH(filter);
 /* replicate halfword 0 (the two 8-bit taps) across the whole vector */
3546  filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3547 
 /* first source row, one vector per 16-byte column */
3548  LD_UB4(src, 16, src0, src3, src6, src9);
3549  src += src_stride;
3550 
3551  for (loop_cnt = (height >> 1); loop_cnt--;) {
 /* two new source rows and the two current dst rows for every column */
3552  LD_UB2(src, src_stride, src1, src2);
3553  LD_UB2(dst, dst_stride, dst0, dst1);
3554  LD_UB2(src + 16, src_stride, src4, src5);
3555  LD_UB2(dst + 16, dst_stride, dst2, dst3);
3556  LD_UB2(src + 32, src_stride, src7, src8);
3557  LD_UB2(dst + 32, dst_stride, dst4, dst5);
3558  LD_UB2(src + 48, src_stride, src10, src11);
3559  LD_UB2(dst + 48, dst_stride, dst6, dst7);
3560  src += (2 * src_stride);
3561 
 /* column 0; shift by 7 because the bilinear tap pairs sum to 128 */
3562  ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
3563  ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
3564  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3565  SRARI_H2_UH(tmp0, tmp1, 7);
3566  SAT_UH2_UH(tmp0, tmp1, 7);
3567  PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3568 
3569  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3570  SRARI_H2_UH(tmp2, tmp3, 7);
3571  SAT_UH2_UH(tmp2, tmp3, 7);
3572  PCKEV_AVG_ST_UB(tmp3, tmp2, dst1, dst + dst_stride);
3573 
 /* column 1 (byte offset 16) */
3574  ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
3575  ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
3576  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
3577  SRARI_H2_UH(tmp4, tmp5, 7);
3578  SAT_UH2_UH(tmp4, tmp5, 7);
3579  PCKEV_AVG_ST_UB(tmp5, tmp4, dst2, dst + 16);
3580 
3581  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
3582  SRARI_H2_UH(tmp6, tmp7, 7);
3583  SAT_UH2_UH(tmp6, tmp7, 7);
3584  PCKEV_AVG_ST_UB(tmp7, tmp6, dst3, dst + 16 + dst_stride);
3585 
 /* column 2 (byte offset 32) */
3586  ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
3587  ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
3588  DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3589  SRARI_H2_UH(tmp0, tmp1, 7);
3590  SAT_UH2_UH(tmp0, tmp1, 7);
3591  PCKEV_AVG_ST_UB(tmp1, tmp0, dst4, dst + 32);
3592 
3593  DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3594  SRARI_H2_UH(tmp2, tmp3, 7);
3595  SAT_UH2_UH(tmp2, tmp3, 7);
3596  PCKEV_AVG_ST_UB(tmp3, tmp2, dst5, dst + 32 + dst_stride);
3597 
 /* column 3 (byte offset 48) */
3598  ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
3599  ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
3600  DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
3601  SRARI_H2_UH(tmp4, tmp5, 7);
3602  SAT_UH2_UH(tmp4, tmp5, 7);
3603  PCKEV_AVG_ST_UB(tmp5, tmp4, dst6, (dst + 48));
3604 
3605  DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
3606  SRARI_H2_UH(tmp6, tmp7, 7);
3607  SAT_UH2_UH(tmp6, tmp7, 7);
3608  PCKEV_AVG_ST_UB(tmp7, tmp6, dst7, dst + 48 + dst_stride);
3609  dst += (2 * dst_stride);
3610 
 /* the second row of this pass is the first row of the next pass */
3611  src0 = src2;
3612  src3 = src5;
3613  src6 = src8;
3614  src9 = src11;
3615  }
3616 }
3617 
3619  int32_t src_stride,
3620  uint8_t *dst,
3621  int32_t dst_stride,
3622  const int8_t *filter_horiz,
3623  const int8_t *filter_vert)
3624 {
3625  v16i8 src0, src1, src2, src3, src4, mask;
3626  v16u8 filt_hz, filt_vt, vec0, vec1;
3627  v16u8 dst0, dst1, dst2, dst3, res0, res1;
3628  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
3629 
3630  mask = LD_SB(&mc_filt_mask_arr[16]);
3631 
3632  /* rearranging filter */
3633  filt = LD_UH(filter_horiz);
3634  filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
3635 
3636  filt = LD_UH(filter_vert);
3637  filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
3638 
3639  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3640 
3641  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
3642  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
3643  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3644  hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
3645  hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
3646  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3647 
3648  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3649  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst2);
3650  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3651  SRARI_H2_UH(tmp0, tmp1, 7);
3652  SAT_UH2_UH(tmp0, tmp1, 7);
3653  PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
3654  AVER_UB2_UB(res0, dst0, res1, dst2, res0, res1);
3655  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
3656 }
3657 
3659  int32_t src_stride,
3660  uint8_t *dst,
3661  int32_t dst_stride,
3662  const int8_t *filter_horiz,
3663  const int8_t *filter_vert)
3664 {
3665  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
3666  v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
3667  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3668  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
3669  v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
3670  v8i16 filt;
3671 
3672  mask = LD_SB(&mc_filt_mask_arr[16]);
3673 
3674  /* rearranging filter */
3675  filt = LD_SH(filter_horiz);
3676  filt_hz = (v16u8) __msa_splati_h(filt, 0);
3677 
3678  filt = LD_SH(filter_vert);
3679  filt_vt = (v16u8) __msa_splati_h(filt, 0);
3680 
3681  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3682  src += (8 * src_stride);
3683  src8 = LD_SB(src);
3684 
3685  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, 7);
3686  hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, 7);
3687  hz_out4 = HORIZ_2TAP_FILT_UH(src4, src5, mask, filt_hz, 7);
3688  hz_out6 = HORIZ_2TAP_FILT_UH(src6, src7, mask, filt_hz, 7);
3689  hz_out8 = HORIZ_2TAP_FILT_UH(src8, src8, mask, filt_hz, 7);
3690  SLDI_B3_UH(hz_out2, hz_out4, hz_out6, hz_out0, hz_out2, hz_out4, hz_out1,
3691  hz_out3, hz_out5, 8);
3692  hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
3693 
3694  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
3695  ILVR_W4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst2,
3696  dst4, dst6);
3697  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3698  ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
3699  DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
3700  tmp0, tmp1, tmp2, tmp3);
3701  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3702  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3703  PCKEV_B4_UB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3, res0, res1,
3704  res2, res3);
3705  AVER_UB4_UB(res0, dst0, res1, dst2, res2, dst4, res3, dst6, res0, res1,
3706  res2, res3);
3707  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
3708  dst += (4 * dst_stride);
3709  ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
3710 }
3711 
3712 void ff_avg_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
3713  const uint8_t *src, ptrdiff_t src_stride,
3714  int height, int mx, int my)
3715 {
3716  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
3717  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
3718 
3719  if (4 == height) {
3720  common_hv_2ht_2vt_and_aver_dst_4x4_msa(src, src_stride, dst, dst_stride,
3721  filter_horiz, filter_vert);
3722  } else if (8 == height) {
3723  common_hv_2ht_2vt_and_aver_dst_4x8_msa(src, src_stride, dst, dst_stride,
3724  filter_horiz, filter_vert);
3725  }
3726 }
3727 
3729  int32_t src_stride,
3730  uint8_t *dst,
3731  int32_t dst_stride,
3732  const int8_t *filter_horiz,
3733  const int8_t *filter_vert)
3734 {
3735  v16i8 src0, src1, src2, src3, src4, mask;
3736  v16u8 filt_hz, filt_vt, dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3;
3737  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
3738  v8i16 filt;
3739 
3740  mask = LD_SB(&mc_filt_mask_arr[0]);
3741 
3742  /* rearranging filter */
3743  filt = LD_SH(filter_horiz);
3744  filt_hz = (v16u8) __msa_splati_h(filt, 0);
3745 
3746  filt = LD_SH(filter_vert);
3747  filt_vt = (v16u8) __msa_splati_h(filt, 0);
3748 
3749  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3750  src += (5 * src_stride);
3751 
3752  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3753  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3754  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3755  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3756  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
3757 
3758  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
3759  vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3760  tmp1 = __msa_dotp_u_h(vec1, filt_vt);
3761 
3762  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
3763  vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3764  tmp2 = __msa_dotp_u_h(vec2, filt_vt);
3765 
3766  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3767  vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3768  tmp3 = __msa_dotp_u_h(vec3, filt_vt);
3769 
3770  SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3771  SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
3772  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
3773  dst, dst_stride);
3774 }
3775 
3777  int32_t src_stride,
3778  uint8_t *dst,
3779  int32_t dst_stride,
3780  const int8_t *filter_horiz,
3781  const int8_t *filter_vert,
3782  int32_t height)
3783 {
3784  uint32_t loop_cnt;
3785  v16i8 src0, src1, src2, src3, src4, mask;
3786  v16u8 filt_hz, filt_vt, vec0, dst0, dst1, dst2, dst3;
3787  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
3788  v8i16 filt;
3789 
3790  mask = LD_SB(&mc_filt_mask_arr[0]);
3791 
3792  /* rearranging filter */
3793  filt = LD_SH(filter_horiz);
3794  filt_hz = (v16u8) __msa_splati_h(filt, 0);
3795 
3796  filt = LD_SH(filter_vert);
3797  filt_vt = (v16u8) __msa_splati_h(filt, 0);
3798 
3799  src0 = LD_SB(src);
3800  src += src_stride;
3801 
3802  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3803 
3804  for (loop_cnt = (height >> 2); loop_cnt--;) {
3805  LD_SB4(src, src_stride, src1, src2, src3, src4);
3806  src += (4 * src_stride);
3807 
3808  hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3809  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3810  tmp0 = __msa_dotp_u_h(vec0, filt_vt);
3811 
3812  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
3813  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3814  tmp1 = __msa_dotp_u_h(vec0, filt_vt);
3815 
3816  SRARI_H2_UH(tmp0, tmp1, 7);
3817  SAT_UH2_UH(tmp0, tmp1, 7);
3818 
3819  hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
3820  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3821  tmp2 = __msa_dotp_u_h(vec0, filt_vt);
3822 
3823  hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3824  vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3825  tmp3 = __msa_dotp_u_h(vec0, filt_vt);
3826 
3827  SRARI_H2_UH(tmp2, tmp3, 7);
3828  SAT_UH2_UH(tmp2, tmp3, 7);
3829  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3830  PCKEV_AVG_ST8x4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3,
3831  dst3, dst, dst_stride);
3832  dst += (4 * dst_stride);
3833  }
3834 }
3835 
3836 void ff_avg_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
3837  const uint8_t *src, ptrdiff_t src_stride,
3838  int height, int mx, int my)
3839 {
3840  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
3841  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
3842 
3843  if (4 == height) {
3844  common_hv_2ht_2vt_and_aver_dst_8x4_msa(src, src_stride, dst, dst_stride,
3845  filter_horiz, filter_vert);
3846  } else {
3848  dst, dst_stride,
3849  filter_horiz, filter_vert,
3850  height);
3851  }
3852 }
3853 
/*
 * 16-wide bilinear prediction in both directions (2-tap horizontal, then
 * 2-tap vertical), combined with the pixels already in dst.
 *
 * dst, dst_stride: destination block and its row pitch; dst rows are read
 *                  (LD_UB4) before they are written back.
 * src, src_stride: source block and its row pitch; one row more than the
 *                  processed height is read, and each row is fetched as two
 *                  16-byte loads at byte offsets 0 and 8.
 * height:          rows to produce; the loop runs height >> 2 times and emits
 *                  four rows per pass, so a remainder of height % 4 rows is
 *                  not processed.
 * mx, my:          select the horizontal / vertical tap pairs
 *                  vp9_bilinear_filters_msa[mx - 1] and [my - 1].
 *
 * NOTE(review): HORIZ_2TAP_FILT_UH and the LD_/ILVEV/DOTP/SRARI/SAT/
 * PCKEV_AVG_ST macros are defined outside this excerpt - confirm their exact
 * semantics against the macro header.
 */
3854 void ff_avg_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
3855  const uint8_t *src, ptrdiff_t src_stride,
3856  int height, int mx, int my)
3857 {
3858  uint32_t loop_cnt;
3859  const int8_t *filter_horiz = vp9_bilinear_filters_msa[mx - 1];
3860  const int8_t *filter_vert = vp9_bilinear_filters_msa[my - 1];
3861  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3862  v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
3863  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
3864  v8i16 filt;
3865 
 /* first row of mc_filt_mask_arr (labelled "8 width cases" in the table) */
3866  mask = LD_SB(&mc_filt_mask_arr[0]);
3867 
3868  /* rearranging filter */
3869  filt = LD_SH(filter_horiz);
3870  filt_hz = (v16u8) __msa_splati_h(filt, 0);
3871 
3872  filt = LD_SH(filter_vert);
3873  filt_vt = (v16u8) __msa_splati_h(filt, 0);
3874 
 /* first source row: src0 at offset 0, src1 at offset 8 */
3875  LD_SB2(src, 8, src0, src1);
3876  src += src_stride;
3877 
 /* horizontal results of the first row seed the vertical filter */
3878  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3879  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3880 
3881  for (loop_cnt = (height >> 2); loop_cnt--;) {
 /* even-numbered vectors: offset 0; odd-numbered vectors: offset 8 */
3882  LD_SB4(src, src_stride, src0, src2, src4, src6);
3883  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3884  src += (4 * src_stride);
3885  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3886 
 /* row 0: (hz_out0, hz_out2) is the row above, (hz_out1, hz_out3) below */
3887  hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, 7);
3888  hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
3889  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3890  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3891  SRARI_H2_UH(tmp0, tmp1, 7);
3892  SAT_UH2_UH(tmp0, tmp1, 7);
3893  PCKEV_AVG_ST_UB(tmp1, tmp0, dst0, dst);
3894  dst += dst_stride;
3895 
 /* row 1: the two register pairs swap roles */
3896  hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, 7);
3897  hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, 7);
3898  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
3899  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3900  SRARI_H2_UH(tmp0, tmp1, 7);
3901  SAT_UH2_UH(tmp0, tmp1, 7);
3902  PCKEV_AVG_ST_UB(tmp1, tmp0, dst1, dst);
3903  dst += dst_stride;
3904 
 /* row 2 */
3905  hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, 7);
3906  hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, 7);
3907  ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3908  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3909  SRARI_H2_UH(tmp0, tmp1, 7);
3910  SAT_UH2_UH(tmp0, tmp1, 7);
3911  PCKEV_AVG_ST_UB(tmp1, tmp0, dst2, dst);
3912  dst += dst_stride;
3913 
 /* row 3: hz_out0/hz_out2 end the pass holding the last row's results */
3914  hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, 7);
3915  hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, 7);
3916  ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
3917  DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3918  SRARI_H2_UH(tmp0, tmp1, 7);
3919  SAT_UH2_UH(tmp0, tmp1, 7);
3920  PCKEV_AVG_ST_UB(tmp1, tmp0, dst3, dst);
3921  dst += dst_stride;
3922  }
3923 }
3924 
/*
 * 32-wide bilinear prediction in both directions, combined with dst.
 * Implemented as two side-by-side 16-wide calls to ff_avg_bilin_16hv_msa,
 * left column first.
 */
void ff_avg_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int col;

    for (col = 0; col < 32; col += 16) {
        ff_avg_bilin_16hv_msa(dst + col, dst_stride, src + col, src_stride,
                              height, mx, my);
    }
}
3938 
/*
 * 64-wide bilinear prediction in both directions, combined with dst.
 * Implemented as four side-by-side 16-wide calls to ff_avg_bilin_16hv_msa,
 * leftmost column first.
 */
void ff_avg_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int height, int mx, int my)
{
    int col;

    for (col = 0; col < 64; col += 16) {
        ff_avg_bilin_16hv_msa(dst + col, dst_stride, src + col, src_stride,
                              height, mx, my);
    }
}
3952 
3953 static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
3954  uint8_t *dst, int32_t dst_stride,
3955  int32_t height)
3956 {
3957  int32_t cnt;
3958  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
3959  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
3960 
3961  if (0 == height % 12) {
3962  for (cnt = (height / 12); cnt--;) {
3963  LD_UB8(src, src_stride,
3964  src0, src1, src2, src3, src4, src5, src6, src7);
3965  src += (8 * src_stride);
3966 
3967  out0 = __msa_copy_u_d((v2i64) src0, 0);
3968  out1 = __msa_copy_u_d((v2i64) src1, 0);
3969  out2 = __msa_copy_u_d((v2i64) src2, 0);
3970  out3 = __msa_copy_u_d((v2i64) src3, 0);
3971  out4 = __msa_copy_u_d((v2i64) src4, 0);
3972  out5 = __msa_copy_u_d((v2i64) src5, 0);
3973  out6 = __msa_copy_u_d((v2i64) src6, 0);
3974  out7 = __msa_copy_u_d((v2i64) src7, 0);
3975 
3976  SD4(out0, out1, out2, out3, dst, dst_stride);
3977  dst += (4 * dst_stride);
3978  SD4(out4, out5, out6, out7, dst, dst_stride);
3979  dst += (4 * dst_stride);
3980 
3981  LD_UB4(src, src_stride, src0, src1, src2, src3);
3982  src += (4 * src_stride);
3983 
3984  out0 = __msa_copy_u_d((v2i64) src0, 0);
3985  out1 = __msa_copy_u_d((v2i64) src1, 0);
3986  out2 = __msa_copy_u_d((v2i64) src2, 0);
3987  out3 = __msa_copy_u_d((v2i64) src3, 0);
3988 
3989  SD4(out0, out1, out2, out3, dst, dst_stride);
3990  dst += (4 * dst_stride);
3991  }
3992  } else if (0 == height % 8) {
3993  for (cnt = height >> 3; cnt--;) {
3994  LD_UB8(src, src_stride,
3995  src0, src1, src2, src3, src4, src5, src6, src7);
3996  src += (8 * src_stride);
3997 
3998  out0 = __msa_copy_u_d((v2i64) src0, 0);
3999  out1 = __msa_copy_u_d((v2i64) src1, 0);
4000  out2 = __msa_copy_u_d((v2i64) src2, 0);
4001  out3 = __msa_copy_u_d((v2i64) src3, 0);
4002  out4 = __msa_copy_u_d((v2i64) src4, 0);
4003  out5 = __msa_copy_u_d((v2i64) src5, 0);
4004  out6 = __msa_copy_u_d((v2i64) src6, 0);
4005  out7 = __msa_copy_u_d((v2i64) src7, 0);
4006 
4007  SD4(out0, out1, out2, out3, dst, dst_stride);
4008  dst += (4 * dst_stride);
4009  SD4(out4, out5, out6, out7, dst, dst_stride);
4010  dst += (4 * dst_stride);
4011  }
4012  } else if (0 == height % 4) {
4013  for (cnt = (height / 4); cnt--;) {
4014  LD_UB4(src, src_stride, src0, src1, src2, src3);
4015  src += (4 * src_stride);
4016  out0 = __msa_copy_u_d((v2i64) src0, 0);
4017  out1 = __msa_copy_u_d((v2i64) src1, 0);
4018  out2 = __msa_copy_u_d((v2i64) src2, 0);
4019  out3 = __msa_copy_u_d((v2i64) src3, 0);
4020 
4021  SD4(out0, out1, out2, out3, dst, dst_stride);
4022  dst += (4 * dst_stride);
4023  }
4024  } else if (0 == height % 2) {
4025  for (cnt = (height / 2); cnt--;) {
4026  LD_UB2(src, src_stride, src0, src1);
4027  src += (2 * src_stride);
4028  out0 = __msa_copy_u_d((v2i64) src0, 0);
4029  out1 = __msa_copy_u_d((v2i64) src1, 0);
4030 
4031  SD(out0, dst);
4032  dst += dst_stride;
4033  SD(out1, dst);
4034  dst += dst_stride;
4035  }
4036  }
4037 }
4038 
4039 static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
4040  uint8_t *dst, int32_t dst_stride,
4042 {
4043  int32_t cnt, loop_cnt;
4044  const uint8_t *src_tmp;
4045  uint8_t *dst_tmp;
4046  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4047 
4048  for (cnt = (width >> 4); cnt--;) {
4049  src_tmp = src;
4050  dst_tmp = dst;
4051 
4052  for (loop_cnt = (height >> 3); loop_cnt--;) {
4053  LD_UB8(src_tmp, src_stride,
4054  src0, src1, src2, src3, src4, src5, src6, src7);
4055  src_tmp += (8 * src_stride);
4056 
4057  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
4058  dst_tmp, dst_stride);
4059  dst_tmp += (8 * dst_stride);
4060  }
4061 
4062  src += 16;
4063  dst += 16;
4064  }
4065 }
4066 
4067 static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
4068  uint8_t *dst, int32_t dst_stride,
4069  int32_t height)
4070 {
4071  int32_t cnt;
4072  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4073 
4074  if (0 == height % 12) {
4075  for (cnt = (height / 12); cnt--;) {
4076  LD_UB8(src, src_stride,
4077  src0, src1, src2, src3, src4, src5, src6, src7);
4078  src += (8 * src_stride);
4079  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
4080  dst, dst_stride);
4081  dst += (8 * dst_stride);
4082 
4083  LD_UB4(src, src_stride, src0, src1, src2, src3);
4084  src += (4 * src_stride);
4085  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
4086  dst += (4 * dst_stride);
4087  }
4088  } else if (0 == height % 8) {
4089  copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
4090  } else if (0 == height % 4) {
4091  for (cnt = (height >> 2); cnt--;) {
4092  LD_UB4(src, src_stride, src0, src1, src2, src3);
4093  src += (4 * src_stride);
4094 
4095  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
4096  dst += (4 * dst_stride);
4097  }
4098  }
4099 }
4100 
4101 static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
4102  uint8_t *dst, int32_t dst_stride,
4103  int32_t height)
4104 {
4105  int32_t cnt;
4106  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4107 
4108  if (0 == height % 12) {
4109  for (cnt = (height / 12); cnt--;) {
4110  LD_UB4(src, src_stride, src0, src1, src2, src3);
4111  LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
4112  src += (4 * src_stride);
4113  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
4114  ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
4115  dst += (4 * dst_stride);
4116 
4117  LD_UB4(src, src_stride, src0, src1, src2, src3);
4118  LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
4119  src += (4 * src_stride);
4120  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
4121  ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
4122  dst += (4 * dst_stride);
4123 
4124  LD_UB4(src, src_stride, src0, src1, src2, src3);
4125  LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
4126  src += (4 * src_stride);
4127  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
4128  ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
4129  dst += (4 * dst_stride);
4130  }
4131  } else if (0 == height % 8) {
4132  copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
4133  } else if (0 == height % 4) {
4134  for (cnt = (height >> 2); cnt--;) {
4135  LD_UB4(src, src_stride, src0, src1, src2, src3);
4136  LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
4137  src += (4 * src_stride);
4138  ST_UB4(src0, src1, src2, src3, dst, dst_stride);
4139  ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
4140  dst += (4 * dst_stride);
4141  }
4142  }
4143 }
4144 
/*
 * Copy a 64-byte-wide block from src to dst by delegating to
 * copy_16multx8mult_msa with a fixed width of 64.
 */
static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    const int32_t block_width = 64;

    copy_16multx8mult_msa(src, src_stride, dst, dst_stride,
                          height, block_width);
}
4151 
4152 static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
4153  uint8_t *dst, int32_t dst_stride,
4154  int32_t height)
4155 {
4156  int32_t cnt;
4157  uint32_t out0, out1, out2, out3;
4158  v16u8 src0, src1, src2, src3;
4159  v16u8 dst0, dst1, dst2, dst3;
4160 
4161  if (0 == (height % 4)) {
4162  for (cnt = (height / 4); cnt--;) {
4163  LD_UB4(src, src_stride, src0, src1, src2, src3);
4164  src += (4 * src_stride);
4165 
4166  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
4167 
4168  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4169  dst0, dst1, dst2, dst3);
4170 
4171  out0 = __msa_copy_u_w((v4i32) dst0, 0);
4172  out1 = __msa_copy_u_w((v4i32) dst1, 0);
4173  out2 = __msa_copy_u_w((v4i32) dst2, 0);
4174  out3 = __msa_copy_u_w((v4i32) dst3, 0);
4175  SW4(out0, out1, out2, out3, dst, dst_stride);
4176  dst += (4 * dst_stride);
4177  }
4178  } else if (0 == (height % 2)) {
4179  for (cnt = (height / 2); cnt--;) {
4180  LD_UB2(src, src_stride, src0, src1);
4181  src += (2 * src_stride);
4182 
4183  LD_UB2(dst, dst_stride, dst0, dst1);
4184 
4185  AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
4186 
4187  out0 = __msa_copy_u_w((v4i32) dst0, 0);
4188  out1 = __msa_copy_u_w((v4i32) dst1, 0);
4189  SW(out0, dst);
4190  dst += dst_stride;
4191  SW(out1, dst);
4192  dst += dst_stride;
4193  }
4194  }
4195 }
4196 
4197 static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
4198  uint8_t *dst, int32_t dst_stride,
4199  int32_t height)
4200 {
4201  int32_t cnt;
4202  uint64_t out0, out1, out2, out3;
4203  v16u8 src0, src1, src2, src3;
4204  v16u8 dst0, dst1, dst2, dst3;
4205 
4206  for (cnt = (height / 4); cnt--;) {
4207  LD_UB4(src, src_stride, src0, src1, src2, src3);
4208  src += (4 * src_stride);
4209  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
4210 
4211  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4212  dst0, dst1, dst2, dst3);
4213 
4214  out0 = __msa_copy_u_d((v2i64) dst0, 0);
4215  out1 = __msa_copy_u_d((v2i64) dst1, 0);
4216  out2 = __msa_copy_u_d((v2i64) dst2, 0);
4217  out3 = __msa_copy_u_d((v2i64) dst3, 0);
4218  SD4(out0, out1, out2, out3, dst, dst_stride);
4219  dst += (4 * dst_stride);
4220  }
4221 }
4222 
4223 static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
4224  uint8_t *dst, int32_t dst_stride,
4225  int32_t height)
4226 {
4227  int32_t cnt;
4228  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4229  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4230 
4231  for (cnt = (height / 8); cnt--;) {
4232  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
4233  src += (8 * src_stride);
4234  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
4235 
4236  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4237  dst0, dst1, dst2, dst3);
4238  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4239  dst4, dst5, dst6, dst7);
4240  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
4241  dst += (8 * dst_stride);
4242  }
4243 }
4244 
4245 static void avg_width32_msa(const uint8_t *src, int32_t src_stride,
4246  uint8_t *dst, int32_t dst_stride,
4247  int32_t height)
4248 {
4249  int32_t cnt;
4250  uint8_t *dst_dup = dst;
4251  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4252  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
4253  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4254  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
4255 
4256  for (cnt = (height / 8); cnt--;) {
4257  LD_UB4(src, src_stride, src0, src2, src4, src6);
4258  LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
4259  src += (4 * src_stride);
4260  LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
4261  LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
4262  dst_dup += (4 * dst_stride);
4263  LD_UB4(src, src_stride, src8, src10, src12, src14);
4264  LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
4265  src += (4 * src_stride);
4266  LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
4267  LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
4268  dst_dup += (4 * dst_stride);
4269 
4270  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4271  dst0, dst1, dst2, dst3);
4272  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4273  dst4, dst5, dst6, dst7);
4274  AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
4275  dst8, dst9, dst10, dst11);
4276  AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
4277  dst12, dst13, dst14, dst15);
4278 
4279  ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
4280  ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
4281  dst += (4 * dst_stride);
4282  ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
4283  ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
4284  dst += (4 * dst_stride);
4285  }
4286 }
4287 
4288 static void avg_width64_msa(const uint8_t *src, int32_t src_stride,
4289  uint8_t *dst, int32_t dst_stride,
4290  int32_t height)
4291 {
4292  int32_t cnt;
4293  uint8_t *dst_dup = dst;
4294  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4295  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
4296  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4297  v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
4298 
4299  for (cnt = (height / 4); cnt--;) {
4300  LD_UB4(src, 16, src0, src1, src2, src3);
4301  src += src_stride;
4302  LD_UB4(src, 16, src4, src5, src6, src7);
4303  src += src_stride;
4304  LD_UB4(src, 16, src8, src9, src10, src11);
4305  src += src_stride;
4306  LD_UB4(src, 16, src12, src13, src14, src15);
4307  src += src_stride;
4308 
4309  LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
4310  dst_dup += dst_stride;
4311  LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
4312  dst_dup += dst_stride;
4313  LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
4314  dst_dup += dst_stride;
4315  LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
4316  dst_dup += dst_stride;
4317 
4318  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4319  dst0, dst1, dst2, dst3);
4320  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4321  dst4, dst5, dst6, dst7);
4322  AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
4323  dst8, dst9, dst10, dst11);
4324  AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
4325  dst12, dst13, dst14, dst15);
4326 
4327  ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
4328  dst += dst_stride;
4329  ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
4330  dst += dst_stride;
4331  ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
4332  dst += dst_stride;
4333  ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
4334  dst += dst_stride;
4335  }
4336 }
4337 
/*
 * 8-tap sub-pixel interpolation coefficients.
 *
 * First index:  filter set, addressed by the FILTER_8TAP_REGULAR /
 *               FILTER_8TAP_SHARP / FILTER_8TAP_SMOOTH designators (defined
 *               outside this excerpt).
 * Second index: fractional position minus one; the 8-tap wrappers index it
 *               as [type_idx][mx - 1], so there are 15 rows per set.
 * Third index:  the eight signed 8-bit taps of one filter.
 *
 * Spot-checked rows sum to 128 (e.g. the first row of each set).
 */
4338 static const int8_t vp9_subpel_filters_msa[3][15][8] = {
4339  [FILTER_8TAP_REGULAR] = {
4340  {0, 1, -5, 126, 8, -3, 1, 0},
4341  {-1, 3, -10, 122, 18, -6, 2, 0},
4342  {-1, 4, -13, 118, 27, -9, 3, -1},
4343  {-1, 4, -16, 112, 37, -11, 4, -1},
4344  {-1, 5, -18, 105, 48, -14, 4, -1},
4345  {-1, 5, -19, 97, 58, -16, 5, -1},
4346  {-1, 6, -19, 88, 68, -18, 5, -1},
4347  {-1, 6, -19, 78, 78, -19, 6, -1},
4348  {-1, 5, -18, 68, 88, -19, 6, -1},
4349  {-1, 5, -16, 58, 97, -19, 5, -1},
4350  {-1, 4, -14, 48, 105, -18, 5, -1},
4351  {-1, 4, -11, 37, 112, -16, 4, -1},
4352  {-1, 3, -9, 27, 118, -13, 4, -1},
4353  {0, 2, -6, 18, 122, -10, 3, -1},
4354  {0, 1, -3, 8, 126, -5, 1, 0},
4355  }, [FILTER_8TAP_SHARP] = {
4356  {-1, 3, -7, 127, 8, -3, 1, 0},
4357  {-2, 5, -13, 125, 17, -6, 3, -1},
4358  {-3, 7, -17, 121, 27, -10, 5, -2},
4359  {-4, 9, -20, 115, 37, -13, 6, -2},
4360  {-4, 10, -23, 108, 48, -16, 8, -3},
4361  {-4, 10, -24, 100, 59, -19, 9, -3},
4362  {-4, 11, -24, 90, 70, -21, 10, -4},
4363  {-4, 11, -23, 80, 80, -23, 11, -4},
4364  {-4, 10, -21, 70, 90, -24, 11, -4},
4365  {-3, 9, -19, 59, 100, -24, 10, -4},
4366  {-3, 8, -16, 48, 108, -23, 10, -4},
4367  {-2, 6, -13, 37, 115, -20, 9, -4},
4368  {-2, 5, -10, 27, 121, -17, 7, -3},
4369  {-1, 3, -6, 17, 125, -13, 5, -2},
4370  {0, 1, -3, 8, 127, -7, 3, -1},
4371  }, [FILTER_8TAP_SMOOTH] = {
4372  {-3, -1, 32, 64, 38, 1, -3, 0},
4373  {-2, -2, 29, 63, 41, 2, -3, 0},
4374  {-2, -2, 26, 63, 43, 4, -4, 0},
4375  {-2, -3, 24, 62, 46, 5, -4, 0},
4376  {-2, -3, 21, 60, 49, 7, -4, 0},
4377  {-1, -4, 18, 59, 51, 9, -4, 0},
4378  {-1, -4, 16, 57, 53, 12, -4, -1},
4379  {-1, -4, 14, 55, 55, 14, -4, -1},
4380  {-1, -4, 12, 53, 57, 16, -4, -1},
4381  {0, -4, 9, 51, 59, 18, -4, -1},
4382  {0, -4, 7, 49, 60, 21, -3, -2},
4383  {0, -4, 5, 46, 62, 24, -3, -2},
4384  {0, -4, 4, 43, 63, 26, -2, -2},
4385  {0, -3, 2, 41, 63, 29, -2, -2},
4386  {0, -3, 1, 38, 64, 32, -1, -3},
4387  }
4388 };
4389 
/*
 * Generate the six exported 8-tap motion-compensation entry points for one
 * block width and one filter type:
 *
 *   ff_{put,avg}_8tap_<type>_<SIZE>{h,v,hv}_msa()
 *
 * SIZE     - block width; pasted into both the exported name and the name of
 *            the static common_*_<SIZE>w_msa() worker that is called.
 * type     - filter name fragment pasted into the exported name.
 * type_idx - first index into vp9_subpel_filters_msa (a FILTER_8TAP_* value).
 *
 * Every generated function takes (dst, dststride, src, srcstride, h, mx, my)
 * and forwards to its worker in (src, srcstride, dst, dststride, ...) order.
 * The horizontal kernel is row [mx - 1] and the vertical kernel is row
 * [my - 1] of the selected filter type; the "h" variants ignore my and the
 * "v" variants ignore mx.  No range check is done here: the table has 15
 * rows per type, so the coordinate that is used must lie in 1..15.
 *
 * All kernel pointers are const int8_t *, matching both the element type of
 * vp9_subpel_filters_msa and the filter parameters of the workers (the taps
 * are signed).
 */
#define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx)                           \
void ff_put_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,     \
                                        const uint8_t *src,                    \
                                        ptrdiff_t srcstride,                   \
                                        int h, int mx, int my)                 \
{                                                                              \
    const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1];             \
                                                                               \
    common_hz_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h);     \
}                                                                              \
                                                                               \
void ff_put_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,     \
                                        const uint8_t *src,                    \
                                        ptrdiff_t srcstride,                   \
                                        int h, int mx, int my)                 \
{                                                                              \
    const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1];             \
                                                                               \
    common_vt_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h);     \
}                                                                              \
                                                                               \
void ff_put_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,    \
                                         const uint8_t *src,                   \
                                         ptrdiff_t srcstride,                  \
                                         int h, int mx, int my)                \
{                                                                              \
    /* signed taps: keep the table's int8_t element type */                    \
    const int8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1];            \
    const int8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1];            \
                                                                               \
    common_hv_8ht_8vt_##SIZE##w_msa(src, srcstride, dst, dststride, hfilter,   \
                                    vfilter, h);                               \
}                                                                              \
                                                                               \
void ff_avg_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride,     \
                                        const uint8_t *src,                    \
                                        ptrdiff_t srcstride,                   \
                                        int h, int mx, int my)                 \
{                                                                              \
    const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1];             \
                                                                               \
    common_hz_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst,               \
                                            dststride, filter, h);             \
}                                                                              \
                                                                               \
void ff_avg_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride,     \
                                        const uint8_t *src,                    \
                                        ptrdiff_t srcstride,                   \
                                        int h, int mx, int my)                 \
{                                                                              \
    const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1];             \
                                                                               \
    common_vt_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, dststride,    \
                                            filter, h);                        \
}                                                                              \
                                                                               \
void ff_avg_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride,    \
                                         const uint8_t *src,                   \
                                         ptrdiff_t srcstride,                  \
                                         int h, int mx, int my)                \
{                                                                              \
    /* signed taps: keep the table's int8_t element type */                    \
    const int8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1];            \
    const int8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1];            \
                                                                               \
    common_hv_8ht_8vt_and_aver_dst_##SIZE##w_msa(src, srcstride, dst,          \
                                                 dststride, hfilter,           \
                                                 vfilter, h);                  \
}
4457 
/*
 * Emit ff_copy<SIZE>_msa() and ff_avg<SIZE>_msa() for one block width.
 *
 * Both are thin exported wrappers: they take (dst, dststride, src, srcstride,
 * h, mx, my) and hand (src, srcstride, dst, dststride, h) to the static
 * copy_width<SIZE>_msa() / avg_width<SIZE>_msa() helper.  mx and my are
 * accepted but not used.
 */
#define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE)                                       \
void ff_copy##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,                    \
                         const uint8_t *src, ptrdiff_t srcstride,              \
                         int h, int mx, int my)                                \
{                                                                              \
    copy_width##SIZE##_msa(src, srcstride, dst, dststride, h);                 \
}                                                                              \
                                                                               \
void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,                     \
                        const uint8_t *src, ptrdiff_t srcstride,               \
                        int h, int mx, int my)                                 \
{                                                                              \
    avg_width##SIZE##_msa(src, srcstride, dst, dststride, h);                  \
}
4474 
/*
 * Emit only ff_avg<SIZE>_msa() for one block width: an exported wrapper that
 * takes (dst, dststride, src, srcstride, h, mx, my) and forwards
 * (src, srcstride, dst, dststride, h) to the static avg_width<SIZE>_msa()
 * helper.  mx and my are accepted but not used.
 */
#define VP9_AVG_MIPS_MSA_FUNC(SIZE)                                            \
void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride,                     \
                        const uint8_t *src, ptrdiff_t srcstride,               \
                        int h, int mx, int my)                                 \
{                                                                              \
    avg_width##SIZE##_msa(src, srcstride, dst, dststride, h);                  \
}
4483 
4489 
4495 
4501 
4507 
4508 #undef VP9_8TAP_MIPS_MSA_FUNC
4509 #undef VP9_COPY_AVG_MIPS_MSA_FUNC
4510 #undef VP9_AVG_MIPS_MSA_FUNC
void ff_put_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2313
void ff_avg_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3172
static void common_hv_8ht_8vt_and_aver_dst_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1828
static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:1083
void ff_avg_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3712
static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:807
#define SLDI_B2_SH(...)
#define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE)
Definition: vp9_mc_msa.c:4458
#define VSHF_B4_SH(...)
static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:2676
void ff_avg_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3836
static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:584
#define XORI_B2_128_SB(...)
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:160
static void copy_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:3953
#define PCKEV_XORI128_UB(in0, in1)
#define SD
Definition: ccaption_dec.c:647
static void common_hv_2ht_2vt_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:3618
static void common_hz_2t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2871
#define LD_SB(...)
#define XORI_B3_128_SB(...)
#define SLDI_B3_UH(...)
#define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx)
Definition: vp9_mc_msa.c:4390
#define ILVR_D2_UB(...)
void ff_put_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2617
void ff_put_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2079
#define LD_UB4(...)
void ff_avg_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3397
#define DPADD_SB4_SH(...)
#define ILVR_B2_SB(...)
static void copy_width64_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4145
static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:3250
static void common_hv_8ht_8vt_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1604
static void common_hv_8ht_8vt_and_aver_dst_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1808
#define LD_SB2(...)
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1127
static const int8_t vp9_subpel_filters_msa[3][15][8]
Definition: vp9_mc_msa.c:4338
#define XORI_B4_128_UB(...)
static void common_hz_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:2972
#define PCKEV_ST_SB(in0, in1, pdst)
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, mask2, mask3,filt0, filt1, filt2, filt3,out0, out1, out2, out3)
Definition: vp9_mc_msa.c:102
static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:1870
#define ILVR_D2_SB(...)
static void common_hv_2ht_2vt_and_aver_dst_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:3776
uint8_t
#define LD_UB2(...)
#define SRARI_H4_SH(...)
#define XORI_B2_128_UB(...)
void ff_put_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2857
static void filter(int16_t *output, ptrdiff_t out_stride, int16_t *low, ptrdiff_t low_stride, int16_t *high, ptrdiff_t high_stride, int len, uint8_t clip)
Definition: cfhd.c:82
#define SPLATI_H4_SH(...)
#define ILVL_B2_SB(...)
#define LD_SH(...)
static void common_vt_2t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:3214
static void common_vt_2t_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2243
static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:815
static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:329
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)
Definition: vp9_mc_msa.c:139
#define LD_UB5(...)
void ff_put_bilin_8hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2756
#define ILVR_D3_SB(...)
#define ILVR_D4_SB(...)
#define LD_SB8(...)
void ff_put_bilin_16v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2327
static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t width)
Definition: vp9_mc_msa.c:718
#define PCKEV_B2_SB(...)
static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:532
void ff_put_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2772
static void common_hv_8ht_8vt_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1031
void ff_put_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2459
static void common_hz_2t_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2946
static const uint16_t mask[17]
Definition: lzw.c:38
static void common_hv_8ht_8vt_and_aver_dst_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1687
static void common_hz_2t_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:1911
#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,filt_h0, filt_h1, filt_h2, filt_h3)
Definition: vp9_mc_msa.c:66
static void common_hv_8ht_8vt_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:823
static void common_vt_8t_and_aver_dst_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1419
#define XORI_B7_128_SB(...)
void ff_avg_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3925
#define SW4(in0, in1, in2, in3, pdst, stride)
#define XORI_B4_128_SB(...)
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:365
static void common_hv_8ht_8vt_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:997
static void common_hz_8t_and_aver_dst_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1180
static void common_vt_8t_and_aver_dst_16w_mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t width)
Definition: vp9_mc_msa.c:1477
static void common_hz_8t_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:298
static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:475
static void common_hv_2ht_2vt_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:2633
#define SRARI_H2_SH(...)
#define ILVR_B4_UB(...)
void ff_avg_bilin_32h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3122
static void common_hv_8ht_8vt_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1014
static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2195
#define LD_UB8(...)
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)
void ff_put_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2230
#define SRARI_H2_UH(...)
#define VSHF_B2_UH(...)
int32_t
void ff_avg_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3039
void ff_avg_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3451
#define PCKEV_B4_SB(...)
#define AVER_UB2_UB(...)
static void common_hv_2ht_2vt_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:2540
static void avg_width16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4223
static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:377
#define ST_UB(...)
static void common_hv_8ht_8vt_and_aver_dst_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:1788
static void common_vt_8t_and_aver_dst_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1584
#define SAT_SH4_SH(...)
#define SPLATI_H4_SB(...)
#define LD_SB4(...)
#define src
Definition: vp9dsp.c:530
static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t width)
Definition: vp9_mc_msa.c:4039
#define PCKEV_B4_UB(...)
static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:2572
void ff_put_bilin_64h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2126
#define ST_UB8(...)
FILE * out
Definition: movenc-test.c:54
#define AVER_UB4_UB(...)
void ff_put_bilin_8h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:1998
#define ST_UB4(...)
#define ILVR_W4_UB(...)
static void common_hz_8t_and_aver_dst_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1238
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,dst0, dst1, dst2, dst3, pdst, stride)
static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:639
#define src1
Definition: h264pred.c:139
static void common_hz_8t_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:1048
void ff_put_bilin_32v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2381
#define ILVL_B4_SB(...)
#define SAT_SH2_SH(...)
static void common_vt_2t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2169
static void copy_width16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4067
static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:416
static const int8_t vp9_bilinear_filters_msa[15][2]
Definition: vp9_mc_msa.c:34
static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4197
static void avg_width64_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4288
#define ILVR_D4_UB(...)
void ff_put_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:1898
BYTE int const BYTE int int int height
Definition: avisynth_c.h:676
static void avg_width32_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4245
#define DOTP_SB4_SH(...)
#define DOTP_UB2_UH(...)
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,filt0, filt1, filt2, filt3)
Definition: vp9_mc_msa.c:52
void ff_put_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2012
static void copy_width32_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4101
#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst)
Definition: vp9_mc_msa.c:130
#define SRARI_H4_UH(...)
#define src0
Definition: h264pred.c:138
static void common_vt_2t_and_aver_dst_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:3332
static void common_hv_8ht_8vt_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
Definition: vp9_mc_msa.c:899
static void common_hz_8t_4x16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:227
#define SD4(in0, in1, in2, in3, pdst, stride)
void ff_avg_bilin_16h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3054
#define SW(val, pdst)
static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:2267
static void common_vt_2t_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:3305
static const int8_t filt[NUMTAPS]
Definition: af_earwax.c:39
#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,pdst, stride)
Definition: vp9_mc_msa.c:148
static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:189
static void common_vt_8t_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1359
#define ILVR_W2_UB(...)
#define ST4x8_UB(in0, in1, pdst, stride)
#define LD_SB7(...)
#define LD_SB5(...)
void ff_avg_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3290
static void common_hz_8t_and_aver_dst_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1297
static void common_hv_2ht_2vt_and_aver_dst_8x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:3728
#define ILVEV_B2_SH(...)
#define ILVEV_B2_UB(...)
#define ST8x4_UB(in0, in1, pdst, stride)
#define ILVL_B2_UB(...)
void ff_avg_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2931
#define SAT_UH2_UH(...)
void ff_avg_bilin_8v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3382
#define SAT_UH4_UH(...)
void ff_avg_bilin_64v_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3531
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, mask2, mask3,filt0, filt1, filt2, filt3,out0, out1)
Definition: vp9_mc_msa.c:83
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:285
#define SLDI_B3_SB(...)
#define LD_UB(...)
static void common_hz_8t_and_aver_dst_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1142
#define DOTP_UB4_UH(...)
#define VSHF_B2_UB(...)
void ff_avg_bilin_64hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3939
#define ILVR_B4_SB(...)
static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1935
static void common_vt_8t_and_aver_dst_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1574
static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
Definition: vp9_mc_msa.c:3658
void ff_avg_bilin_16hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:3854
static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:2897
static const uint8_t mc_filt_mask_arr[16 *3]
Definition: vp9_mc_msa.c:25
static void common_hz_2t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: vp9_mc_msa.c:1848
#define LD_UH(...)
#define VP9_AVG_MIPS_MSA_FUNC(SIZE)
Definition: vp9_mc_msa.c:4475
void ff_put_bilin_32hv_msa(uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp9_mc_msa.c:2843
#define PCKEV_B2_UB(...)
static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: vp9_mc_msa.c:4152
#define ILVR_B2_UB(...)
static void common_vt_8t_and_aver_dst_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: vp9_mc_msa.c:1594
#define ADDS_SH4_SH(...)
static int width