FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
qpeldsp_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
22 #include "qpeldsp_mips.h"
23 
/*
 * Horizontal quarter-pel filter producing one 16-byte result vector
 * (rounded variant).
 *
 * inp0 and inp1 are byte vectors taken from the same row; the callers in
 * this file load inp1 from one byte after inp0. mask is a byte-shuffle
 * control handed to VSHF_B2_UB to build tmp0/tmp1, which __msa_sldi_b then
 * combines with inp0/inp1 to form the shifted tap vectors data0..data5.
 * Callers pass a full byte-reversal mask, presumably so that samples are
 * mirrored at the row ends - TODO confirm against VSHF_B2_UB.
 *
 * With pairN denoting the sum of the two taps interleaved into
 * sumN_r/sumN_l, each 16-bit lane is computed as
 *     coef0 * pair0 + coef2 * pair2 - (coef1 * pair1 + pair3)
 * and then passed through SRARI_H2_SH(..., 5) and CLIP_SH2_0_255 before the
 * even bytes of both halves are packed into the result.
 *
 * coef0 is multiplied in as a v8u16 vector, so it must hold halfword lanes;
 * coef1 and coef2 are handed to DPADD_UB2_UH (callers pass v16u8 vectors).
 * The callers in this file use the values 20, 6 and 3.
 *
 * The macro is a GNU statement expression and evaluates to the v16u8 `out`.
 */
24 #define APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, coef0, coef1, coef2) \
25 ( { \
26  v16u8 out, tmp0, tmp1; \
27  v16u8 data0, data1, data2, data3, data4, data5; \
28  v8i16 res_r, res_l; \
29  v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
30  v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
31  \
32  VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1); \
33  ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l); \
34  data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15); \
35  data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1); \
36  HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l); \
37  ILVRL_B2_UH(data3, data0, sum1_r, sum1_l); \
38  data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14); \
39  data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2); \
40  sum0_r *= (v8u16) (coef0); \
41  sum0_l *= (v8u16) (coef0); \
42  ILVRL_B2_UH(data4, data1, sum2_r, sum2_l); \
43  data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13); \
44  data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3); \
45  DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
46  ILVRL_B2_UH(data5, data2, sum3_r, sum3_l); \
47  HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
48  DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
49  res_r = (v8i16) (sum0_r - sum3_r); \
50  res_l = (v8i16) (sum0_l - sum3_l); \
51  SRARI_H2_SH(res_r, res_l, 5); \
52  CLIP_SH2_0_255(res_r, res_l); \
53  out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
54  \
55  out; \
56 } )
57 
/*
 * Horizontal quarter-pel filter for two independent rows, 8 output samples
 * each (rounded variant).
 *
 * inp0 and inp1 are the byte vectors of the two rows. mask0..mask3 are
 * byte-shuffle controls; each one gathers, for every output lane, the pair
 * of source bytes belonging to one tap pair (the callers' mask tables only
 * reference byte indices 0..8 of the row).
 *
 * Per 16-bit lane and per row the value is
 *     coef0 * pair(mask0) + coef2 * pair(mask2)
 *         - (coef1 * pair(mask1) + pair(mask3))
 * followed by SRARI_H2_SH(..., 5) and CLIP_SH2_0_255. All three
 * coefficients are consumed by DOTP_UB2_UH / DPADD_UB2_UH, so they are byte
 * vectors here (callers pass v16u8 splats of 20, 6 and 3).
 *
 * Evaluates to a v16u8 whose low 8 bytes come from inp0's row and whose
 * high 8 bytes come from inp1's row (__msa_pckev_b of res1_r over res0_r).
 */
58 #define APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, \
59  mask0, mask1, mask2, mask3, \
60  coef0, coef1, coef2) \
61 ( { \
62  v16u8 out; \
63  v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
64  v8u16 sum4_r, sum5_r, sum6_r, sum7_r; \
65  v8i16 res0_r, res1_r; \
66  \
67  VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r); \
68  VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r); \
69  HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r); \
70  DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r); \
71  VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r); \
72  VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r); \
73  DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r); \
74  DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r); \
75  res0_r = (v8i16) (sum0_r - sum3_r); \
76  res1_r = (v8i16) (sum4_r - sum7_r); \
77  SRARI_H2_SH(res0_r, res1_r, 5); \
78  CLIP_SH2_0_255(res0_r, res1_r); \
79  out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r); \
80  \
81  out; \
82 } )
83 
/*
 * Single-row form of the 8-sample horizontal quarter-pel filter (rounded
 * variant).
 *
 * inp0 is the row's byte vector; mask0..mask3 are the byte-shuffle controls
 * that gather the tap pairs for each output lane. Per 16-bit lane:
 *     coef0 * pair(mask0) + coef2 * pair(mask2)
 *         - (coef1 * pair(mask1) + pair(mask3))
 * then __msa_srari_h(..., 5) (rounding arithmetic shift) and CLIP_SH_0_255.
 * coef0, coef1 and coef2 are all used as byte vectors.
 *
 * Evaluates to a v16u8 in which the 8 result bytes appear in both the low
 * and the high half, because res0_r is packed against itself.
 */
84 #define APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, \
85  mask0, mask1, mask2, mask3, \
86  coef0, coef1, coef2) \
87 ( { \
88  v16u8 out; \
89  v8i16 res0_r; \
90  v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
91  \
92  VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r); \
93  sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r); \
94  sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0); \
95  VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r); \
96  DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r); \
97  res0_r = (v8i16) (sum0_r - sum3_r); \
98  res0_r = __msa_srari_h(res0_r, 5); \
99  res0_r = CLIP_SH_0_255(res0_r); \
100  out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \
101  \
102  out; \
103 } )
104 
/*
 * Single-row, 8-sample horizontal quarter-pel filter, "no round" variant.
 *
 * The tap gathering and weighting match the rounded single-row macro:
 *     coef0 * pair(mask0) + coef2 * pair(mask2)
 *         - (coef1 * pair(mask1) + pair(mask3))
 * The only difference is the final scaling: 15 is added and the signed
 * lanes are shifted right by 5 with a plain `>>=`, instead of using the
 * rounding shift __msa_srari_h(..., 5). The result is then clipped with
 * CLIP_SH_0_255.
 *
 * Evaluates to a v16u8 in which the 8 result bytes appear in both halves.
 */
105 #define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, \
106  mask2, mask3, coef0, \
107  coef1, coef2) \
108 ( { \
109  v16u8 out; \
110  v8i16 res0_r; \
111  v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
112  \
113  VSHF_B2_UH(inp0, inp0, inp0, inp0, mask0, mask3, sum0_r, sum3_r); \
114  sum3_r = __msa_hadd_u_h((v16u8) sum3_r, (v16u8) sum3_r); \
115  sum0_r = __msa_dotp_u_h((v16u8) sum0_r, (v16u8) coef0); \
116  VSHF_B2_UH(inp0, inp0, inp0, inp0, mask2, mask1, sum2_r, sum1_r); \
117  DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r); \
118  res0_r = (v8i16) (sum0_r - sum3_r); \
119  res0_r += 15; \
120  res0_r >>= 5; \
121  res0_r = CLIP_SH_0_255(res0_r); \
122  out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \
123  \
124  out; \
125 } )
126 
/*
 * Horizontal quarter-pel filter producing one 16-byte result vector,
 * "no round" variant.
 *
 * Inputs and tap construction match APPLY_HORIZ_QPEL_FILTER: inp0/inp1 are
 * byte vectors of the same row (callers load inp1 one byte after inp0),
 * mask is the shuffle control used to build tmp0/tmp1, and data0..data5 are
 * the shifted tap vectors produced with __msa_sldi_b.
 *
 * With pairN the sum of the two taps interleaved into sumN_r/sumN_l, each
 * 16-bit lane is
 *     coef0 * pair0 + coef2 * pair2 - (coef1 * pair1 + pair3)
 * Scaling differs from the rounded macro: 15 is added and the signed lanes
 * are shifted right by 5 with `>>=` (no SRARI), then clipped with
 * CLIP_SH2_0_255.
 *
 * coef0 must be a halfword vector (it is multiplied as v8u16); coef1 and
 * coef2 go to DPADD_UB2_UH. Evaluates to the v16u8 `out`.
 */
127 #define APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask, \
128  coef0, coef1, coef2) \
129 ( { \
130  v16u8 out, tmp0, tmp1; \
131  v16u8 data0, data1, data2, data3, data4, data5; \
132  v8i16 res_r, res_l; \
133  v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
134  v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
135  \
136  VSHF_B2_UB(inp0, inp0, inp1, inp1, mask, mask, tmp0, tmp1); \
137  ILVRL_B2_UH(inp1, inp0, sum0_r, sum0_l); \
138  data0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 15); \
139  data3 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 1); \
140  HADD_UB2_UH(sum0_r, sum0_l, sum0_r, sum0_l); \
141  ILVRL_B2_UH(data3, data0, sum1_r, sum1_l); \
142  data1 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 14); \
143  data4 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 2); \
144  sum0_r *= (v8u16) (coef0); \
145  sum0_l *= (v8u16) (coef0); \
146  ILVRL_B2_UH(data4, data1, sum2_r, sum2_l); \
147  data2 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) tmp0, 13); \
148  data5 = (v16u8) __msa_sldi_b((v16i8) tmp1, (v16i8) inp1, 3); \
149  DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
150  ILVRL_B2_UH(data5, data2, sum3_r, sum3_l); \
151  HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
152  DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
153  res_r = (v8i16) (sum0_r - sum3_r); \
154  res_l = (v8i16) (sum0_l - sum3_l); \
155  res_r += 15; \
156  res_l += 15; \
157  res_r >>= 5; \
158  res_l >>= 5; \
159  CLIP_SH2_0_255(res_r, res_l); \
160  out = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
161  \
162  out; \
163 } )
164 
/*
 * Horizontal quarter-pel filter for two independent rows, 8 output samples
 * each, "no round" variant.
 *
 * inp0 and inp1 are the byte vectors of the two rows; mask0..mask3 gather
 * the tap pairs for each output lane. Per 16-bit lane and per row:
 *     coef0 * pair(mask0) + coef2 * pair(mask2)
 *         - (coef1 * pair(mask1) + pair(mask3))
 * Scaling adds 15 and shifts the signed lanes right by 5 with `>>=` rather
 * than using SRARI_H2_SH; the result is clipped with CLIP_SH2_0_255.
 * All three coefficients are used as byte vectors.
 *
 * Evaluates to a v16u8: low 8 bytes from inp0's row, high 8 bytes from
 * inp1's row.
 */
165 #define APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, \
166  mask0, mask1, mask2, mask3, \
167  coef0, coef1, coef2) \
168 ( { \
169  v16u8 out; \
170  v8i16 res0_r, res1_r; \
171  v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
172  v8u16 sum4_r, sum5_r, sum6_r, sum7_r; \
173  \
174  VSHF_B2_UH(inp0, inp0, inp1, inp1, mask0, mask0, sum0_r, sum4_r); \
175  VSHF_B2_UH(inp0, inp0, inp1, inp1, mask3, mask3, sum3_r, sum7_r); \
176  HADD_UB2_UH(sum3_r, sum7_r, sum3_r, sum7_r); \
177  DOTP_UB2_UH(sum0_r, sum4_r, coef0, coef0, sum0_r, sum4_r); \
178  VSHF_B2_UH(inp0, inp0, inp1, inp1, mask2, mask2, sum2_r, sum6_r); \
179  VSHF_B2_UH(inp0, inp0, inp1, inp1, mask1, mask1, sum1_r, sum5_r); \
180  DPADD_UB2_UH(sum2_r, sum6_r, coef2, coef2, sum0_r, sum4_r); \
181  DPADD_UB2_UH(sum1_r, sum5_r, coef1, coef1, sum3_r, sum7_r); \
182  res0_r = (v8i16) (sum0_r - sum3_r); \
183  res1_r = (v8i16) (sum4_r - sum7_r); \
184  res0_r += 15; \
185  res1_r += 15; \
186  res0_r >>= 5; \
187  res1_r >>= 5; \
188  CLIP_SH2_0_255(res0_r, res1_r); \
189  out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r); \
190  \
191  out; \
192 } )
193 
/*
 * Vertical quarter-pel filter across eight input vectors, producing one
 * 16-byte result vector (rounded variant).
 *
 * inp0..inp7 are byte vectors (presumably one per source row - TODO confirm
 * against the callers, which are outside this listing). They are
 * interleaved in the pairs (inp0, inp4), (inp1, inp5), (inp2, inp6) and
 * (inp3, inp7), and each 16-bit lane is computed as
 *     coef0 * (inp0 + inp4) + coef2 * (inp2 + inp6)
 *         - (coef1 * (inp1 + inp5) + (inp3 + inp7))
 * followed by SRARI_H2_SH(..., 5) and CLIP_SH2_0_255. The right and left
 * halves are processed separately and packed back into bytes.
 *
 * coef0, coef1 and coef2 are consumed by DOTP_UB2_UH / DPADD_UB2_UH, i.e.
 * as byte vectors. Evaluates to the v16u8 `res`.
 */
194 #define APPLY_VERT_QPEL_FILTER(inp0, inp1, inp2, inp3, \
195  inp4, inp5, inp6, inp7, \
196  coef0, coef1, coef2) \
197 ( { \
198  v16u8 res; \
199  v8i16 res_r, res_l; \
200  v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
201  v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
202  \
203  ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l); \
204  ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l); \
205  DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l); \
206  HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
207  ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l); \
208  ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l); \
209  DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
210  DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
211  res_r = (v8i16) (sum0_r - sum3_r); \
212  res_l = (v8i16) (sum0_l - sum3_l); \
213  SRARI_H2_SH(res_r, res_l, 5); \
214  CLIP_SH2_0_255(res_r, res_l); \
215  res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
216  \
217  res; \
218 } )
219 
/*
 * Vertical quarter-pel filter for two 8-sample outputs at once (rounded
 * variant).
 *
 * inp00..inp07 are the eight input vectors for the first output and
 * inp10..inp17 those for the second. Only the right (low) halves are used
 * (ILVR_B4_UH). For each set, per 16-bit lane:
 *     coef0 * (inpX0 + inpX4) + coef2 * (inpX2 + inpX6)
 *         - (coef1 * (inpX1 + inpX5) + (inpX3 + inpX7))
 * followed by SRARI_H2_SH(..., 5) and CLIP_SH2_0_255.
 *
 * Evaluates to a v16u8 whose low 8 bytes are the first output and whose
 * high 8 bytes are the second (__msa_pckev_b of val1 over val0).
 */
220 #define APPLY_VERT_QPEL_FILTER_8BYTE(inp00, inp01, inp02, inp03, \
221  inp04, inp05, inp06, inp07, \
222  inp10, inp11, inp12, inp13, \
223  inp14, inp15, inp16, inp17, \
224  coef0, coef1, coef2) \
225 ( { \
226  v16u8 res; \
227  v8i16 val0, val1; \
228  v8u16 sum00, sum01, sum02, sum03; \
229  v8u16 sum10, sum11, sum12, sum13; \
230  \
231  ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13, \
232  sum00, sum10, sum03, sum13); \
233  DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10); \
234  HADD_UB2_UH(sum03, sum13, sum03, sum13); \
235  ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11, \
236  sum02, sum12, sum01, sum11); \
237  DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10); \
238  DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13); \
239  val0 = (v8i16) (sum00 - sum03); \
240  val1 = (v8i16) (sum10 - sum13); \
241  SRARI_H2_SH(val0, val1, 5); \
242  CLIP_SH2_0_255(val0, val1); \
243  res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0); \
244  \
245  res; \
246 } )
247 
/*
 * Vertical quarter-pel filter across eight input vectors, producing one
 * 16-byte result vector, "no round" variant.
 *
 * The inputs are interleaved in the pairs (inp0, inp4), (inp1, inp5),
 * (inp2, inp6) and (inp3, inp7); each 16-bit lane is
 *     coef0 * (inp0 + inp4) + coef2 * (inp2 + inp6)
 *         - (coef1 * (inp1 + inp5) + (inp3 + inp7))
 * Scaling adds 15 and shifts the signed lanes right by 5 with `>>=`
 * (instead of SRARI_H2_SH), then clips with CLIP_SH2_0_255.
 *
 * coef0, coef1 and coef2 are used as byte vectors. Evaluates to the v16u8
 * `res`.
 */
248 #define APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp1, inp2, inp3, \
249  inp4, inp5, inp6, inp7, \
250  coef0, coef1, coef2) \
251 ( { \
252  v16u8 res; \
253  v8i16 res_r, res_l; \
254  v8u16 sum0_r, sum1_r, sum2_r, sum3_r; \
255  v8u16 sum0_l, sum1_l, sum2_l, sum3_l; \
256  \
257  ILVRL_B2_UH(inp4, inp0, sum0_r, sum0_l); \
258  ILVRL_B2_UH(inp7, inp3, sum3_r, sum3_l); \
259  DOTP_UB2_UH(sum0_r, sum0_l, coef0, coef0, sum0_r, sum0_l); \
260  HADD_UB2_UH(sum3_r, sum3_l, sum3_r, sum3_l); \
261  ILVRL_B2_UH(inp6, inp2, sum2_r, sum2_l); \
262  ILVRL_B2_UH(inp5, inp1, sum1_r, sum1_l); \
263  DPADD_UB2_UH(sum2_r, sum2_l, coef2, coef2, sum0_r, sum0_l); \
264  DPADD_UB2_UH(sum1_r, sum1_l, coef1, coef1, sum3_r, sum3_l); \
265  res_r = (v8i16) (sum0_r - sum3_r); \
266  res_l = (v8i16) (sum0_l - sum3_l); \
267  res_r += 15; \
268  res_l += 15; \
269  res_r >>= 5; \
270  res_l >>= 5; \
271  CLIP_SH2_0_255(res_r, res_l); \
272  res = (v16u8) __msa_pckev_b((v16i8) res_l, (v16i8) res_r); \
273  \
274  res; \
275 } )
276 
/*
 * Vertical quarter-pel filter for two 8-sample outputs at once, "no round"
 * variant.
 *
 * inp00..inp07 feed the first output and inp10..inp17 the second; only the
 * right (low) halves are interleaved (ILVR_B4_UH). Per 16-bit lane:
 *     coef0 * (inpX0 + inpX4) + coef2 * (inpX2 + inpX6)
 *         - (coef1 * (inpX1 + inpX5) + (inpX3 + inpX7))
 * Scaling adds 15 and shifts the signed lanes right by 5 with `>>=`
 * (instead of SRARI_H2_SH), then clips with CLIP_SH2_0_255.
 *
 * Evaluates to a v16u8: low 8 bytes are the first output, high 8 bytes the
 * second.
 */
277 #define APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp00, inp01, inp02, inp03, \
278  inp04, inp05, inp06, inp07, \
279  inp10, inp11, inp12, inp13, \
280  inp14, inp15, inp16, inp17, \
281  coef0, coef1, coef2) \
282 ( { \
283  v16u8 res; \
284  v8i16 val0, val1; \
285  v8u16 sum00, sum01, sum02, sum03; \
286  v8u16 sum10, sum11, sum12, sum13; \
287  \
288  ILVR_B4_UH(inp04, inp00, inp14, inp10, inp07, inp03, inp17, inp13, \
289  sum00, sum10, sum03, sum13); \
290  DOTP_UB2_UH(sum00, sum10, coef0, coef0, sum00, sum10); \
291  HADD_UB2_UH(sum03, sum13, sum03, sum13); \
292  ILVR_B4_UH(inp06, inp02, inp16, inp12, inp05, inp01, inp15, inp11, \
293  sum02, sum12, sum01, sum11); \
294  DPADD_UB2_UH(sum02, sum12, coef2, coef2, sum00, sum10); \
295  DPADD_UB2_UH(sum01, sum11, coef1, coef1, sum03, sum13); \
296  val0 = (v8i16) (sum00 - sum03); \
297  val1 = (v8i16) (sum10 - sum13); \
298  val0 += 15; \
299  val1 += 15; \
300  val0 >>= 5; \
301  val1 >>= 5; \
302  CLIP_SH2_0_255(val0, val1); \
303  res = (v16u8) __msa_pckev_b((v16i8) val1, (v16i8) val0); \
304  \
305  res; \
306 } )
307 
309  int32_t src_stride,
310  uint8_t *dst,
311  int32_t dst_stride,
312  int32_t height)
313 {
    /*
     * 8-sample-wide horizontal quarter-pel filter (rounded), averaged with
     * the unshifted source samples.
     *
     * NOTE(review): the line carrying the return type, function name and
     * first parameter is absent from this listing; `src` used below is
     * presumably that first parameter.
     *
     * Each iteration consumes four source rows and writes four destination
     * rows; the loop runs height >> 2 times, so rows beyond a multiple of
     * four are not processed. loop_count is a uint8_t, so height >> 2 is
     * truncated to 8 bits.
     */
314  uint8_t loop_count;
315  v16u8 inp0, inp1, inp2, inp3;
316  v16u8 res0, res1;
    /* Shuffle tables gathering the tap pair for each of the 8 outputs. */
317  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
318  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
319  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
320  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Filter weights as byte splats, as required by the 8BYTE macro. */
321  v16u8 const20 = (v16u8) __msa_ldi_b(20);
322  v16u8 const6 = (v16u8) __msa_ldi_b(6);
323  v16u8 const3 = (v16u8) __msa_ldi_b(3);
324 
325  for (loop_count = (height >> 2); loop_count--;) {
326  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
327  src += (4 * src_stride);
    /* Each macro call filters two rows and packs them into one vector. */
328  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
329  mask0, mask1, mask2, mask3,
330  const20, const6, const3);
331  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
332  mask0, mask1, mask2, mask3,
333  const20, const6, const3);
    /* Pack the low doublewords of two source rows to match res0/res1. */
334  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
335  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
    /* Average source with filtered output (presumably rounded, per the AVER name). */
336  AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
337  ST8x4_UB(res0, res1, dst, dst_stride);
338  dst += (4 * dst_stride);
339  }
340 }
341 
343  int32_t src_stride,
344  uint8_t *dst,
345  int32_t dst_stride,
346  int32_t height)
347 {
    /*
     * 16-sample-wide horizontal quarter-pel filter (rounded), averaged with
     * the unshifted source samples.
     *
     * NOTE(review): the line carrying the return type, function name and
     * first parameter is absent from this listing; `src` used below is
     * presumably that first parameter.
     *
     * Four rows are handled per iteration (height >> 2 iterations). Every
     * row is loaded twice, at src and at src + 1, because the filter macro
     * takes both vectors. loop_count is a uint8_t, so height >> 2 is
     * truncated to 8 bits.
     */
348  uint8_t loop_count;
349  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
350  v16u8 res;
    /* Full byte-reversal shuffle passed to the filter macro. */
351  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
352  v16u8 const6 = (v16u8) __msa_ldi_b(6);
353  v16u8 const3 = (v16u8) __msa_ldi_b(3);
    /* Halfword splat: the 16-wide macro multiplies coef0 as v8u16. */
354  v8u16 const20 = (v8u16) __msa_ldi_h(20);
355 
356  for (loop_count = (height >> 2); loop_count--;) {
357  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
358  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
359  src += (4 * src_stride);
    /* Row 0: filter, then rounded average with the source row. */
360  res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
361  const20, const6, const3);
362  res = __msa_aver_u_b(inp0, res);
363  ST_UB(res, dst);
364  dst += dst_stride;
365 
    /* Row 1. */
366  res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
367  const20, const6, const3);
368  res = __msa_aver_u_b(inp2, res);
369  ST_UB(res, dst);
370  dst += dst_stride;
371 
    /* Row 2. */
372  res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
373  const20, const6, const3);
374  res = __msa_aver_u_b(inp4, res);
375  ST_UB(res, dst);
376  dst += dst_stride;
377 
    /* Row 3. */
378  res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
379  const20, const6, const3);
380  res = __msa_aver_u_b(inp6, res);
381  ST_UB(res, dst);
382  dst += dst_stride;
383  }
384 }
385 
387  int32_t src_stride,
388  uint8_t *dst,
389  int32_t dst_stride,
390  int32_t height)
391 {
    /*
     * 8-sample-wide horizontal quarter-pel filter (rounded); the filtered
     * result is stored directly, with no averaging step.
     *
     * NOTE(review): the line carrying the return type, function name and
     * first parameter is absent from this listing; `src` used below is
     * presumably that first parameter.
     *
     * Four rows per iteration, height >> 2 iterations; loop_count is a
     * uint8_t, so height >> 2 is truncated to 8 bits.
     */
392  uint8_t loop_count;
393  v16u8 inp0, inp1, inp2, inp3;
394  v16u8 res0, res1;
    /* Shuffle tables gathering the tap pair for each of the 8 outputs. */
395  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
396  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
397  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
398  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Filter weights as byte splats. */
399  v16u8 const20 = (v16u8) __msa_ldi_b(20);
400  v16u8 const6 = (v16u8) __msa_ldi_b(6);
401  v16u8 const3 = (v16u8) __msa_ldi_b(3);
402 
403  for (loop_count = (height >> 2); loop_count--;) {
404  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
405  src += (4 * src_stride);
    /* Each macro call filters two rows and packs them into one vector. */
406  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
407  mask0, mask1, mask2, mask3,
408  const20, const6, const3);
409  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
410  mask0, mask1, mask2, mask3,
411  const20, const6, const3);
412  ST8x4_UB(res0, res1, dst, dst_stride);
413  dst += (4 * dst_stride);
414  }
415 }
416 
418  int32_t src_stride,
419  uint8_t *dst,
420  int32_t dst_stride,
421  int32_t height)
422 {
    /*
     * 16-sample-wide horizontal quarter-pel filter (rounded); the filtered
     * result is stored directly, with no averaging step.
     *
     * NOTE(review): the line carrying the return type, function name and
     * first parameter is absent from this listing; `src` used below is
     * presumably that first parameter.
     *
     * Four rows per iteration (height >> 2 iterations); each row is loaded
     * at src and at src + 1 for the filter macro. loop_count is a uint8_t,
     * so height >> 2 is truncated to 8 bits.
     */
423  uint8_t loop_count;
424  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
425  v16u8 res;
    /* Full byte-reversal shuffle passed to the filter macro. */
426  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    /* Halfword splat: the 16-wide macro multiplies coef0 as v8u16. */
427  v8u16 const20 = (v8u16) __msa_ldi_h(20);
428  v16u8 const6 = (v16u8) __msa_ldi_b(6);
429  v16u8 const3 = (v16u8) __msa_ldi_b(3);
430 
431  for (loop_count = (height >> 2); loop_count--;) {
432  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
433  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
434  src += (4 * src_stride);
    /* Row 0. */
435  res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
436  const20, const6, const3);
437  ST_UB(res, dst);
438  dst += dst_stride;
439 
    /* Row 1. */
440  res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
441  const20, const6, const3);
442  ST_UB(res, dst);
443  dst += dst_stride;
444 
    /* Row 2. */
445  res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
446  const20, const6, const3);
447  ST_UB(res, dst);
448  dst += dst_stride;
449 
    /* Row 3. */
450  res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
451  const20, const6, const3);
452  ST_UB(res, dst);
453  dst += dst_stride;
454  }
455 }
456 
458  int32_t src_stride,
459  uint8_t *dst,
460  int32_t dst_stride,
461  int32_t height)
462 {
    /*
     * 8-sample-wide horizontal quarter-pel filter (rounded), averaged with
     * the source samples after they have been passed through SLDI_B2_UB
     * with a shift of 1 (presumably the source one sample to the right -
     * TODO confirm against SLDI_B2_UB).
     *
     * NOTE(review): the line carrying the return type, function name and
     * first parameter is absent from this listing; `src` used below is
     * presumably that first parameter.
     *
     * Four rows per iteration, height >> 2 iterations; loop_count is a
     * uint8_t, so height >> 2 is truncated to 8 bits.
     */
463  uint8_t loop_count;
464  v16u8 inp0, inp1, inp2, inp3;
465  v16u8 res0, res1;
    /* Shuffle tables gathering the tap pair for each of the 8 outputs. */
466  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
467  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
468  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
469  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Filter weights as byte splats. */
470  v16u8 const20 = (v16u8) __msa_ldi_b(20);
471  v16u8 const6 = (v16u8) __msa_ldi_b(6);
472  v16u8 const3 = (v16u8) __msa_ldi_b(3);
473 
474  for (loop_count = (height >> 2); loop_count--;) {
475  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
476  src += (4 * src_stride);
    /* Each macro call filters two rows and packs them into one vector. */
477  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
478  mask0, mask1, mask2, mask3,
479  const20, const6, const3);
480  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
481  mask0, mask1, mask2, mask3,
482  const20, const6, const3);
    /* Shift the source rows by one byte before they are averaged in. */
483  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
484  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
    /* Pack the low doublewords of two rows to match res0/res1. */
485  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
486  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
487  AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
488  ST8x4_UB(res0, res1, dst, dst_stride);
489  dst += (4 * dst_stride);
490  }
491 }
492 
494  int32_t src_stride,
495  uint8_t *dst,
496  int32_t dst_stride,
497  int32_t height)
498 {
    /*
     * 16-sample-wide horizontal quarter-pel filter (rounded), averaged with
     * the row vector loaded from src + 1.
     *
     * NOTE(review): the line carrying the return type, function name and
     * first parameter is absent from this listing; `src` used below is
     * presumably that first parameter.
     *
     * Four rows per iteration (height >> 2 iterations); each row is loaded
     * at src (even-numbered inp vectors) and src + 1 (odd-numbered ones).
     * loop_count is a uint8_t, so height >> 2 is truncated to 8 bits.
     */
499  uint8_t loop_count;
500  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
501  v16u8 res;
    /* Full byte-reversal shuffle passed to the filter macro. */
502  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    /* Halfword splat: the 16-wide macro multiplies coef0 as v8u16. */
503  v8u16 const20 = (v8u16) __msa_ldi_h(20);
504  v16u8 const6 = (v16u8) __msa_ldi_b(6);
505  v16u8 const3 = (v16u8) __msa_ldi_b(3);
506 
507  for (loop_count = (height >> 2); loop_count--;) {
508  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
509  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
510  src += (4 * src_stride);
    /* Row 0: filter, then rounded average with the src + 1 vector. */
511  res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
512  const20, const6, const3);
513  res = __msa_aver_u_b(res, inp1);
514  ST_UB(res, dst);
515  dst += dst_stride;
516 
    /* Row 1. */
517  res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
518  const20, const6, const3);
519  res = __msa_aver_u_b(res, inp3);
520  ST_UB(res, dst);
521  dst += dst_stride;
522 
    /* Row 2. */
523  res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
524  const20, const6, const3);
525  res = __msa_aver_u_b(res, inp5);
526  ST_UB(res, dst);
527  dst += dst_stride;
528 
    /* Row 3. */
529  res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
530  const20, const6, const3);
531  res = __msa_aver_u_b(res, inp7);
532  ST_UB(res, dst);
533  dst += dst_stride;
534  }
535 }
536 
538  int32_t src_stride,
539  uint8_t *dst,
540  int32_t dst_stride,
541  int32_t height)
542 {
    /*
     * 8-sample-wide horizontal quarter-pel filter, "no round" variant,
     * averaged with the unshifted source samples using __msa_ave_u_b
     * (the non-rounding average) instead of the AVER form.
     *
     * NOTE(review): the line carrying the return type, function name and
     * first parameter is absent from this listing; `src` used below is
     * presumably that first parameter.
     *
     * Four rows per iteration, height >> 2 iterations; loop_count is a
     * uint8_t, so height >> 2 is truncated to 8 bits.
     */
543  uint8_t loop_count;
544  v16u8 inp0, inp1, inp2, inp3;
545  v16u8 res0, res1;
    /* Shuffle tables gathering the tap pair for each of the 8 outputs. */
546  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
547  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
548  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
549  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Filter weights as byte splats. */
550  v16u8 const20 = (v16u8) __msa_ldi_b(20);
551  v16u8 const6 = (v16u8) __msa_ldi_b(6);
552  v16u8 const3 = (v16u8) __msa_ldi_b(3);
553 
554  for (loop_count = (height >> 2); loop_count--;) {
555  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
556  src += (4 * src_stride);
    /* Each macro call filters two rows and packs them into one vector. */
557  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
558  mask2, mask3, const20,
559  const6, const3);
560  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
561  mask2, mask3, const20,
562  const6, const3);
    /* Pack the low doublewords of two source rows to match res0/res1. */
563  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
564  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
565  res0 = __msa_ave_u_b(inp0, res0);
566  res1 = __msa_ave_u_b(inp2, res1);
567  ST8x4_UB(res0, res1, dst, dst_stride);
568  dst += (4 * dst_stride);
569  }
570 }
571 
573  int32_t src_stride,
574  uint8_t *dst,
575  int32_t dst_stride,
576  int32_t height)
577 {
    /*
     * 16-sample-wide horizontal quarter-pel filter, "no round" variant,
     * averaged with the unshifted source row using __msa_ave_u_b (the
     * non-rounding average).
     *
     * NOTE(review): the line carrying the return type, function name and
     * first parameter is absent from this listing; `src` used below is
     * presumably that first parameter.
     *
     * Four rows per iteration (height >> 2 iterations); each row is loaded
     * at src and at src + 1 for the filter macro. loop_count is a uint8_t,
     * so height >> 2 is truncated to 8 bits.
     */
578  uint8_t loop_count;
579  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
580  v16u8 res;
    /* Full byte-reversal shuffle passed to the filter macro. */
581  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    /* Halfword splat: the 16-wide macro multiplies coef0 as v8u16. */
582  v8u16 const20 = (v8u16) __msa_ldi_h(20);
583  v16u8 const6 = (v16u8) __msa_ldi_b(6);
584  v16u8 const3 = (v16u8) __msa_ldi_b(3);
585 
586  for (loop_count = (height >> 2); loop_count--;) {
587  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
588  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
589  src += (4 * src_stride);
    /* Row 0: filter, then non-rounding average with the source row. */
590  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
591  const20, const6, const3);
592  res = __msa_ave_u_b(inp0, res);
593  ST_UB(res, dst);
594  dst += dst_stride;
595 
    /* Row 1. */
596  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
597  const20, const6, const3);
598  res = __msa_ave_u_b(inp2, res);
599  ST_UB(res, dst);
600  dst += dst_stride;
601 
    /* Row 2. */
602  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
603  const20, const6, const3);
604  res = __msa_ave_u_b(inp4, res);
605  ST_UB(res, dst);
606  dst += dst_stride;
607 
    /* Row 3. */
608  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
609  const20, const6, const3);
610  res = __msa_ave_u_b(inp6, res);
611  ST_UB(res, dst);
612  dst += dst_stride;
613  }
614 }
615 
617  int32_t src_stride,
618  uint8_t *dst,
619  int32_t dst_stride,
620  int32_t height)
621 {
    /*
     * 8-sample-wide horizontal quarter-pel filter, "no round" variant; the
     * filtered result is stored directly, with no averaging step.
     *
     * NOTE(review): the line carrying the return type, function name and
     * first parameter is absent from this listing; `src` used below is
     * presumably that first parameter.
     *
     * Four rows per iteration, height >> 2 iterations; loop_count is a
     * uint8_t, so height >> 2 is truncated to 8 bits.
     */
622  uint8_t loop_count;
623  v16u8 inp0, inp1, inp2, inp3;
624  v16u8 res0, res1;
    /* Shuffle tables gathering the tap pair for each of the 8 outputs. */
625  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
626  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
627  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
628  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Filter weights as byte splats. */
629  v16u8 const20 = (v16u8) __msa_ldi_b(20);
630  v16u8 const6 = (v16u8) __msa_ldi_b(6);
631  v16u8 const3 = (v16u8) __msa_ldi_b(3);
632 
633  for (loop_count = (height >> 2); loop_count--;) {
634  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
635  src += (4 * src_stride);
    /* Each macro call filters two rows and packs them into one vector. */
636  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
637  mask2, mask3, const20,
638  const6, const3);
639  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
640  mask2, mask3, const20,
641  const6, const3);
642  ST8x4_UB(res0, res1, dst, dst_stride);
643  dst += (4 * dst_stride);
644  }
645 }
646 
648  int32_t src_stride,
649  uint8_t *dst,
650  int32_t dst_stride,
651  int32_t height)
652 {
    /*
     * 16-sample-wide horizontal quarter-pel filter, "no round" variant; the
     * filtered result is stored directly, with no averaging step.
     *
     * NOTE(review): the line carrying the return type, function name and
     * first parameter is absent from this listing; `src` used below is
     * presumably that first parameter.
     *
     * Four rows per iteration (height >> 2 iterations); each row is loaded
     * at src and at src + 1 for the filter macro. loop_count is a uint8_t,
     * so height >> 2 is truncated to 8 bits.
     */
653  uint8_t loop_count;
654  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
655  v16u8 res;
    /* Full byte-reversal shuffle passed to the filter macro. */
656  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
657  v16u8 const6 = (v16u8) __msa_ldi_b(6);
658  v16u8 const3 = (v16u8) __msa_ldi_b(3);
    /* Halfword splat: the 16-wide macro multiplies coef0 as v8u16. */
659  v8u16 const20 = (v8u16) __msa_ldi_h(20);
660 
661  for (loop_count = (height >> 2); loop_count--;) {
662  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
663  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
664  src += (4 * src_stride);
    /* Row 0. */
665  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
666  const20, const6, const3);
667  ST_UB(res, dst);
668  dst += dst_stride;
669 
    /* Row 1. */
670  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
671  const20, const6, const3);
672  ST_UB(res, dst);
673  dst += dst_stride;
674 
    /* Row 2. */
675  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
676  const20, const6, const3);
677  ST_UB(res, dst);
678  dst += dst_stride;
679 
    /* Row 3. */
680  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
681  const20, const6, const3);
682  ST_UB(res, dst);
683  dst += dst_stride;
684  }
685 }
686 
688  int32_t src_stride,
689  uint8_t *dst,
690  int32_t dst_stride,
691  int32_t height)
692 {
    /*
     * 8-sample-wide horizontal quarter-pel filter, "no round" variant,
     * averaged (non-rounding __msa_ave_u_b) with the source samples after
     * they have been passed through SLDI_B2_UB with a shift of 1
     * (presumably the source one sample to the right - TODO confirm
     * against SLDI_B2_UB).
     *
     * NOTE(review): the line carrying the return type, function name and
     * first parameter is absent from this listing; `src` used below is
     * presumably that first parameter.
     *
     * Four rows per iteration, height >> 2 iterations; loop_count is a
     * uint8_t, so height >> 2 is truncated to 8 bits.
     */
693  uint8_t loop_count;
694  v16u8 inp0, inp1, inp2, inp3;
695  v16u8 res0, res1;
    /* Shuffle tables gathering the tap pair for each of the 8 outputs. */
696  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
697  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
698  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
699  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Filter weights as byte splats. */
700  v16u8 const20 = (v16u8) __msa_ldi_b(20);
701  v16u8 const6 = (v16u8) __msa_ldi_b(6);
702  v16u8 const3 = (v16u8) __msa_ldi_b(3);
703 
704  for (loop_count = (height >> 2); loop_count--;) {
705  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
706  src += (4 * src_stride);
    /* Each macro call filters two rows and packs them into one vector. */
707  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
708  mask2, mask3, const20,
709  const6, const3);
710  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
711  mask2, mask3, const20,
712  const6, const3);
    /* Shift the source rows by one byte before they are averaged in. */
713  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
714  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
    /* Pack the low doublewords of two rows to match res0/res1. */
715  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
716  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
717  res0 = __msa_ave_u_b(inp0, res0);
718  res1 = __msa_ave_u_b(inp2, res1);
719  ST8x4_UB(res0, res1, dst, dst_stride);
720  dst += (4 * dst_stride);
721  }
722 }
723 
725  int32_t src_stride,
726  uint8_t *dst,
727  int32_t dst_stride,
728  int32_t height)
729 {
    /*
     * 16-sample-wide horizontal quarter-pel filter, "no round" variant,
     * averaged (non-rounding __msa_ave_u_b) with the row vector loaded
     * from src + 1.
     *
     * NOTE(review): the line carrying the return type, function name and
     * first parameter is absent from this listing; `src` used below is
     * presumably that first parameter.
     *
     * Four rows per iteration (height >> 2 iterations); each row is loaded
     * at src (even-numbered inp vectors) and src + 1 (odd-numbered ones).
     * loop_count is a uint8_t, so height >> 2 is truncated to 8 bits.
     */
730  uint8_t loop_count;
731  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
732  v16u8 res;
    /* Full byte-reversal shuffle passed to the filter macro. */
733  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
734  v16u8 const6 = (v16u8) __msa_ldi_b(6);
735  v16u8 const3 = (v16u8) __msa_ldi_b(3);
    /* Halfword splat: the 16-wide macro multiplies coef0 as v8u16. */
736  v8u16 const20 = (v8u16) __msa_ldi_h(20);
737 
738  for (loop_count = (height >> 2); loop_count--;) {
739  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
740  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
741  src += (4 * src_stride);
    /* Row 0: filter, then non-rounding average with the src + 1 vector. */
742  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
743  const20, const6, const3);
744  res = __msa_ave_u_b(res, inp1);
745  ST_UB(res, dst);
746  dst += dst_stride;
747 
    /* Row 1. */
748  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
749  const20, const6, const3);
750  res = __msa_ave_u_b(res, inp3);
751  ST_UB(res, dst);
752  dst += dst_stride;
753 
    /* Row 2. */
754  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
755  const20, const6, const3);
756  res = __msa_ave_u_b(res, inp5);
757  ST_UB(res, dst);
758  dst += dst_stride;
759 
    /* Row 3. */
760  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
761  const20, const6, const3);
762  res = __msa_ave_u_b(res, inp7);
763  ST_UB(res, dst);
764  dst += dst_stride;
765  }
766 }
767 
769  int32_t src_stride,
770  uint8_t *dst,
771  int32_t dst_stride,
772  int32_t height)
773 {
    /*
     * 8-sample-wide horizontal quarter-pel filter (rounded) with two
     * averaging passes: first with the unshifted source samples, then with
     * the data already present in dst, before the result is written back.
     *
     * NOTE(review): the line carrying the return type, function name and
     * first parameter is absent from this listing; `src` used below is
     * presumably that first parameter.
     *
     * Four rows per iteration, height >> 2 iterations; loop_count is a
     * uint8_t, so height >> 2 is truncated to 8 bits.
     */
774  uint8_t loop_count;
775  v16u8 inp0, inp1, inp2, inp3;
776  v16u8 dst0, dst1, dst2, dst3;
777  v16u8 res0, res1;
    /* Shuffle tables gathering the tap pair for each of the 8 outputs. */
778  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
779  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
780  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
781  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Filter weights as byte splats. */
782  v16u8 const20 = (v16u8) __msa_ldi_b(20);
783  v16u8 const6 = (v16u8) __msa_ldi_b(6);
784  v16u8 const3 = (v16u8) __msa_ldi_b(3);
785 
786  for (loop_count = (height >> 2); loop_count--;) {
787  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
788  src += (4 * src_stride);
    /* Each macro call filters two rows and packs them into one vector. */
789  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
790  mask0, mask1, mask2, mask3,
791  const20, const6, const3);
792  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
793  mask0, mask1, mask2, mask3,
794  const20, const6, const3);
    /* Fetch the current destination rows for the second average. */
795  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    /* Pack the low doublewords of row pairs to match res0/res1. */
796  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
797  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
798  dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
799  dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
    /* First average with the source, then with the old destination. */
800  AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
801  AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
802  ST8x4_UB(res0, res1, dst, dst_stride);
803  dst += (4 * dst_stride);
804  }
805 }
806 
808  int32_t src_stride,
809  uint8_t *dst,
810  int32_t dst_stride,
811  int32_t height)
812 {
    /*
     * 16-sample-wide horizontal quarter-pel filter (rounded) with two
     * averaging passes: first with the unshifted source row, then with the
     * data already present in dst, before the result is written back.
     *
     * NOTE(review): the line carrying the return type, function name and
     * first parameter is absent from this listing; `src` used below is
     * presumably that first parameter.
     *
     * Four source rows are loaded per iteration (height >> 2 iterations)
     * and processed as two groups of two rows. loop_count is a uint8_t, so
     * height >> 2 is truncated to 8 bits.
     */
813  uint8_t loop_count;
814  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
815  v16u8 res0, res1;
816  v16u8 dst0, dst1;
    /* Full byte-reversal shuffle passed to the filter macro. */
817  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
818  v16u8 const6 = (v16u8) __msa_ldi_b(6);
819  v16u8 const3 = (v16u8) __msa_ldi_b(3);
    /* Halfword splat: the 16-wide macro multiplies coef0 as v8u16. */
820  v8u16 const20 = (v8u16) __msa_ldi_h(20);
821 
822  for (loop_count = (height >> 2); loop_count--;) {
823  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
824  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
825  src += (4 * src_stride);
    /* Rows 0 and 1: filter, average with source, average with dst. */
826  res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
827  const20, const6, const3);
828  res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
829  const20, const6, const3);
830  LD_UB2(dst, dst_stride, dst0, dst1);
831  AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
832  AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
833  ST_UB2(res0, res1, dst, dst_stride);
834  dst += (2 * dst_stride);
835 
    /* Rows 2 and 3: same sequence. */
836  res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
837  const20, const6, const3);
838  res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
839  const20, const6, const3);
840  LD_UB2(dst, dst_stride, dst0, dst1);
841  AVER_UB2_UB(inp4, res0, inp6, res1, res0, res1);
842  AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
843  ST_UB2(res0, res1, dst, dst_stride);
844  dst += (2 * dst_stride);
845  }
846 }
847 
849  int32_t src_stride,
850  uint8_t *dst,
851  int32_t dst_stride,
852  int32_t height)
853 {
    /*
     * 8-sample-wide horizontal quarter-pel filter (rounded) whose output is
     * averaged with the data already present in dst before being written
     * back. There is no averaging with the source here.
     *
     * NOTE(review): the line carrying the return type, function name and
     * first parameter is absent from this listing; `src` used below is
     * presumably that first parameter.
     *
     * Four rows per iteration, height >> 2 iterations; loop_count is a
     * uint8_t, so height >> 2 is truncated to 8 bits.
     */
854  uint8_t loop_count;
855  v16u8 inp0, inp1, inp2, inp3;
856  v16u8 dst0, dst1, dst2, dst3;
857  v16u8 res0, res1;
    /* Shuffle tables gathering the tap pair for each of the 8 outputs. */
858  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
859  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
860  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
861  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
    /* Filter weights as byte splats. */
862  v16u8 const20 = (v16u8) __msa_ldi_b(20);
863  v16u8 const6 = (v16u8) __msa_ldi_b(6);
864  v16u8 const3 = (v16u8) __msa_ldi_b(3);
865 
866  for (loop_count = (height >> 2); loop_count--;) {
867  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
868  src += (4 * src_stride);
    /* Each macro call filters two rows and packs them into one vector. */
869  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
870  mask0, mask1, mask2, mask3,
871  const20, const6, const3);
872  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
873  mask0, mask1, mask2, mask3,
874  const20, const6, const3);
    /* Fetch the current destination rows and pack them like res0/res1. */
875  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
876  dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
877  dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
878  AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
879  ST8x4_UB(res0, res1, dst, dst_stride);
880  dst += (4 * dst_stride);
881  }
882 }
883 
885  int32_t src_stride,
886  uint8_t *dst,
887  int32_t dst_stride,
888  int32_t height)
889 {
    /*
     * 16-sample-wide horizontal quarter-pel filter (rounded) whose output
     * is averaged with the data already present in dst before being
     * written back. There is no averaging with the source here.
     *
     * NOTE(review): the line carrying the return type, function name and
     * first parameter is absent from this listing; `src` used below is
     * presumably that first parameter.
     *
     * Four source rows are loaded per iteration (height >> 2 iterations)
     * and processed as two groups of two rows. loop_count is a uint8_t, so
     * height >> 2 is truncated to 8 bits.
     */
890  uint8_t loop_count;
891  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
892  v16u8 res0, res1;
893  v16u8 dst0, dst1;
    /* Full byte-reversal shuffle passed to the filter macro. */
894  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
895  v16u8 const6 = (v16u8) __msa_ldi_b(6);
896  v16u8 const3 = (v16u8) __msa_ldi_b(3);
    /* Halfword splat: the 16-wide macro multiplies coef0 as v8u16. */
897  v8u16 const20 = (v8u16) __msa_ldi_h(20);
898 
899  for (loop_count = (height >> 2); loop_count--;) {
900  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
901  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
902  src += (4 * src_stride);
    /* Rows 0 and 1: filter, then average with the old destination. */
903  res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
904  const20, const6, const3);
905  res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
906  const20, const6, const3);
907  LD_UB2(dst, dst_stride, dst0, dst1);
908  AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
909  ST_UB2(res0, res1, dst, dst_stride);
910  dst += (2 * dst_stride);
911 
    /* Rows 2 and 3: same sequence. */
912  res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
913  const20, const6, const3);
914  res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
915  const20, const6, const3);
916  LD_UB2(dst, dst_stride, dst0, dst1);
917  AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
918  ST_UB2(res0, res1, dst, dst_stride);
919  dst += (2 * dst_stride);
920  }
921 }
922 
/*
 * NOTE(review): the opening line of this definition (return type, function
 * name and the `src` parameter used below) is not present in this listing.
 *
 * Works on four rows per iteration for (height >> 2) iterations. The output
 * of APPLY_HORIZ_QPEL_FILTER_8BYTE is first combined with the source rows
 * after SLDI_B2_UB(..., 1) has been applied to them, then combined with the
 * rows already in dst, and finally written with ST8x4_UB.
 * Rows left over when height is not a multiple of four are not processed.
 * NOTE(review): SLDI_B2_UB is defined elsewhere; with a count of 1 it
 * presumably slides each row by one byte so that the average uses the
 * source pixel one position to the right -- confirm.
 */
924  int32_t src_stride,
925  uint8_t *dst,
926  int32_t dst_stride,
927  int32_t height)
928 {
929  uint8_t loop_count;
930  v16u8 inp0, inp1, inp2, inp3;
931  v16u8 dst0, dst1, dst2, dst3;
932  v16u8 res0, res1;
933  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
934  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
935  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
936  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
937  v16u8 const20 = (v16u8) __msa_ldi_b(20);
938  v16u8 const6 = (v16u8) __msa_ldi_b(6);
939  v16u8 const3 = (v16u8) __msa_ldi_b(3);
940 
941  for (loop_count = (height >> 2); loop_count--;) {
942  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
943  src += (4 * src_stride);
944  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
945  mask0, mask1, mask2, mask3,
946  const20, const6, const3);
947  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
948  mask0, mask1, mask2, mask3,
949  const20, const6, const3);
950  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
     /* The filter has already consumed the unshifted rows; inp0..inp3 are
      * overwritten here and reused as the averaging operand. */
951  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
952  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
     /* Pack two 8-byte rows per vector: doubleword 0 of the odd row goes
      * into doubleword 1 of the even row, for both source and dst. */
953  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
954  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
955  dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
956  dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
     /* First combine with the shifted source, then with dst. */
957  AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
958  AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
959  ST8x4_UB(res0, res1, dst, dst_stride);
960  dst += (4 * dst_stride);
961  }
962 }
963 
/*
 * NOTE(review): the opening line of this definition (return type, function
 * name and the `src` parameter used below) is not present in this listing.
 *
 * Handles four rows per iteration for (height >> 2) iterations, in two
 * groups of two rows. Each row is loaded from src and from src + 1; both
 * vectors feed APPLY_HORIZ_QPEL_FILTER. The filter output is combined with
 * the src + 1 vector, then with the rows already in dst, and stored with
 * ST_UB2. Rows left over when height is not a multiple of four are not
 * processed.
 */
965  int32_t src_stride,
966  uint8_t *dst,
967  int32_t dst_stride,
968  int32_t height)
969 {
970  uint8_t loop_count;
971  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
972  v16u8 res0, res1, dst0, dst1;
973  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
974  v16u8 const6 = (v16u8) __msa_ldi_b(6);
975  v16u8 const3 = (v16u8) __msa_ldi_b(3);
     /* 20 is loaded per halfword: the macro multiplies v8u16 sums by it. */
976  v8u16 const20 = (v8u16) __msa_ldi_h(20);
977 
978  for (loop_count = (height >> 2); loop_count--;) {
     /* Even-numbered vectors start at src, odd-numbered ones at src + 1. */
979  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
980  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
981  src += (4 * src_stride);
982  res0 = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
983  const20, const6, const3);
984  res1 = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
985  const20, const6, const3);
986  LD_UB2(dst, dst_stride, dst0, dst1);
     /* Combine with the src + 1 vectors first, then with dst. */
987  AVER_UB2_UB(res0, inp1, res1, inp3, res0, res1);
988  AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
989  ST_UB2(res0, res1, dst, dst_stride);
990  dst += (2 * dst_stride);
     /* Second pair of rows from the same four-row load. */
991  res0 = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
992  const20, const6, const3);
993  res1 = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
994  const20, const6, const3);
995  LD_UB2(dst, dst_stride, dst0, dst1);
996  AVER_UB2_UB(res0, inp5, res1, inp7, res0, res1);
997  AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
998  ST_UB2(res0, res1, dst, dst_stride);
999  dst += (2 * dst_stride);
1000  }
1001 }
1002 
1003 
/*
 * NOTE(review): the opening line of this definition (return type, function
 * name and the `src` parameter used below) is not present in this listing.
 *
 * Fully unrolled: loads nine source rows (inp0..inp8), calls
 * APPLY_VERT_QPEL_FILTER_8BYTE four times, combines each result with the
 * source row pairs 0/1, 2/3, 4/5 and 6/7 through AVER_UB2_UB, and writes
 * eight rows to dst with two ST8x4_UB calls.
 * Each filter call receives two groups of eight row arguments; rows near
 * the first and last loaded row are passed more than once (inp0 and inp8
 * appear repeatedly) rather than reading outside inp0..inp8.
 * NOTE(review): this repetition looks like edge mirroring -- confirm
 * against the macro definition, which is not visible here.
 */
1005  int32_t src_stride,
1006  uint8_t *dst,
1007  int32_t dst_stride)
1008 {
1009  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1010  v16u8 tmp0, tmp1, res0, res1;
1011  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1012  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1013  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1014 
1015  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1016  src += (4 * src_stride);
1017  LD_UB2(src, src_stride, inp4, inp5);
1018  src += (2 * src_stride);
1019  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1020  inp1, inp2, inp3, inp4,
1021  inp1, inp0, inp0, inp1,
1022  inp2, inp3, inp4, inp5,
1023  const20, const6, const3);
1024  LD_UB2(src, src_stride, inp6, inp7);
1025  src += (2 * src_stride);
1026  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1027  inp3, inp4, inp5, inp6,
1028  inp3, inp2, inp1, inp0,
1029  inp4, inp5, inp6, inp7,
1030  const20, const6, const3);
      /* Pack source rows 0/1 and 2/3 two per vector for the average. */
1031  tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
1032  tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
1033  AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1034  ST8x4_UB(res0, res1, dst, dst_stride);
1035  dst += (4 * dst_stride);
1036 
      /* Second half: one more row is loaded, nine rows in total. */
1037  inp8 = LD_UB(src);
1038  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1039  inp5, inp6, inp7, inp8,
1040  inp5, inp4, inp3, inp2,
1041  inp6, inp7, inp8, inp8,
1042  const20, const6, const3);
1043  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1044  inp7, inp8, inp8, inp7,
1045  inp7, inp6, inp5, inp4,
1046  inp8, inp8, inp7, inp6,
1047  const20, const6, const3);
      /* Pack source rows 4/5 and 6/7 two per vector for the average. */
1048  tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
1049  tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
1050  AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1051  ST8x4_UB(res0, res1, dst, dst_stride);
      /* dst is not read again after this update. */
1052  dst += (4 * dst_stride);
1053 }
1054 
/*
 * NOTE(review): the opening line of this definition (return type, function
 * name and the `src` parameter used below) is not present in this listing.
 *
 * Fully unrolled: produces sixteen output rows from seventeen source rows
 * (inp0..inp16). For output row k the code calls APPLY_VERT_QPEL_FILTER
 * with eight row arguments, averages the result with source row k using
 * __msa_aver_u_b (unsigned byte average with rounding), and stores one
 * 16-byte vector with ST_UB before advancing dst by dst_stride.
 * Source rows are loaded just before they are first needed.
 * The first four filter arguments step backwards from row k and the last
 * four step forwards from row k + 1.
 */
1056  int32_t src_stride,
1057  uint8_t *dst,
1058  int32_t dst_stride)
1059 {
1060  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1061  v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1062  v16u8 res0;
1063  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1064  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1065  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1066 
      /* Output rows 0..2: inp0 (and inp1) are passed again in place of
       * rows above inp0. NOTE(review): looks like edge mirroring. */
1067  LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
1068  src += (5 * src_stride);
1069  res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
1070  inp1, inp2, inp3, inp4,
1071  const20, const6, const3);
1072  res0 = __msa_aver_u_b(res0, inp0);
1073  ST_UB(res0, dst);
1074  dst += dst_stride;
1075 
1076  inp5 = LD_UB(src);
1077  src += src_stride;
1078  res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
1079  inp2, inp3, inp4, inp5,
1080  const20, const6, const3);
1081  res0 = __msa_aver_u_b(res0, inp1);
1082  ST_UB(res0, dst);
1083  dst += dst_stride;
1084 
1085  inp6 = LD_UB(src);
1086  src += src_stride;
1087  res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
1088  inp3, inp4, inp5, inp6,
1089  const20, const6, const3);
1090  res0 = __msa_aver_u_b(res0, inp2);
1091  ST_UB(res0, dst);
1092  dst += dst_stride;
1093 
      /* Output rows 3..12: every argument is a distinct loaded row. */
1094  inp7 = LD_UB(src);
1095  src += src_stride;
1096  res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
1097  inp4, inp5, inp6, inp7,
1098  const20, const6, const3);
1099  res0 = __msa_aver_u_b(res0, inp3);
1100  ST_UB(res0, dst);
1101  dst += dst_stride;
1102 
1103  LD_UB2(src, src_stride, inp8, inp9);
1104  src += (2 * src_stride);
1105  res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
1106  inp5, inp6, inp7, inp8,
1107  const20, const6, const3);
1108  res0 = __msa_aver_u_b(res0, inp4);
1109  ST_UB(res0, dst);
1110  dst += dst_stride;
1111 
1112  res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
1113  inp6, inp7, inp8, inp9,
1114  const20, const6, const3);
1115  res0 = __msa_aver_u_b(res0, inp5);
1116  ST_UB(res0, dst);
1117  dst += dst_stride;
1118 
1119  LD_UB2(src, src_stride, inp10, inp11);
1120  src += (2 * src_stride);
1121  res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
1122  inp7, inp8, inp9, inp10,
1123  const20, const6, const3);
1124  res0 = __msa_aver_u_b(res0, inp6);
1125  ST_UB(res0, dst);
1126  dst += dst_stride;
1127 
1128  res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
1129  inp8, inp9, inp10, inp11,
1130  const20, const6, const3);
1131  res0 = __msa_aver_u_b(res0, inp7);
1132  ST_UB(res0, dst);
1133  dst += dst_stride;
1134 
1135  LD_UB2(src, src_stride, inp12, inp13);
1136  src += (2 * src_stride);
1137  res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
1138  inp9, inp10, inp11, inp12,
1139  const20, const6, const3);
1140  res0 = __msa_aver_u_b(res0, inp8);
1141  ST_UB(res0, dst);
1142  dst += dst_stride;
1143 
1144  res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
1145  inp10, inp11, inp12, inp13,
1146  const20, const6, const3);
1147  res0 = __msa_aver_u_b(res0, inp9);
1148  ST_UB(res0, dst);
1149  dst += dst_stride;
1150 
1151  LD_UB2(src, src_stride, inp14, inp15);
1152  src += (2 * src_stride);
1153  res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
1154  inp11, inp12, inp13, inp14,
1155  const20, const6, const3);
1156  res0 = __msa_aver_u_b(res0, inp10);
1157  ST_UB(res0, dst);
1158  dst += dst_stride;
1159 
1160  res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
1161  inp12, inp13, inp14, inp15,
1162  const20, const6, const3);
1163  res0 = __msa_aver_u_b(res0, inp11);
1164  ST_UB(res0, dst);
1165  dst += dst_stride;
1166 
      /* inp16 is the last row loaded; src is not advanced afterwards. */
1167  inp16 = LD_UB(src);
1168  res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
1169  inp13, inp14, inp15, inp16,
1170  const20, const6, const3);
1171  res0 = __msa_aver_u_b(res0, inp12);
1172  ST_UB(res0, dst);
1173  dst += dst_stride;
1174 
      /* Output rows 13..15: inp16, inp15 and inp14 are passed again in
       * place of rows below inp16. NOTE(review): looks like edge
       * mirroring. */
1175  res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
1176  inp14, inp15, inp16, inp16,
1177  const20, const6, const3);
1178  res0 = __msa_aver_u_b(res0, inp13);
1179  ST_UB(res0, dst);
1180  dst += dst_stride;
1181 
1182  res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
1183  inp15, inp16, inp16, inp15,
1184  const20, const6, const3);
1185  res0 = __msa_aver_u_b(res0, inp14);
1186  ST_UB(res0, dst);
1187  dst += dst_stride;
1188 
1189  res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
1190  inp16, inp16, inp15, inp14,
1191  const20, const6, const3);
1192  res0 = __msa_aver_u_b(res0, inp15);
1193  ST_UB(res0, dst);
1194 }
1195 
/*
 * Fully unrolled vertical filter pass writing eight rows to dst.
 *
 * Loads nine source rows (inp0..inp8), calls APPLY_VERT_QPEL_FILTER_8BYTE
 * four times and stores the results unmodified with two ST8x4_UB calls.
 * Each filter call receives two groups of eight row arguments; rows near
 * the first and last loaded row are passed more than once (inp0 and inp8
 * appear repeatedly) rather than reading outside inp0..inp8.
 * NOTE(review): this repetition looks like edge mirroring -- confirm
 * against the macro definition, which is not visible here.
 *
 * src        - first source row
 * src_stride - byte distance between source rows
 * dst        - first destination row
 * dst_stride - byte distance between destination rows
 */
1196 static void vert_mc_qpel_8x8_msa(const uint8_t *src,
1197  int32_t src_stride,
1198  uint8_t *dst,
1199  int32_t dst_stride)
1200 {
1201  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1202  v16u8 res0, res1;
      /* Filter coefficients 20, 6 and 3, splatted into every byte lane. */
1203  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1204  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1205  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1206 
1207  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1208  src += (4 * src_stride);
1209  LD_UB2(src, src_stride, inp4, inp5);
1210  src += (2 * src_stride);
1211  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1212  inp1, inp2, inp3, inp4,
1213  inp1, inp0, inp0, inp1,
1214  inp2, inp3, inp4, inp5,
1215  const20, const6, const3);
1216  LD_UB2(src, src_stride, inp6, inp7);
1217  src += (2 * src_stride);
1218  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1219  inp3, inp4, inp5, inp6,
1220  inp3, inp2, inp1, inp0,
1221  inp4, inp5, inp6, inp7,
1222  const20, const6, const3);
1223  ST8x4_UB(res0, res1, dst, dst_stride);
1224  dst += (4 * dst_stride);
1225 
      /* Second half: one more row is loaded, nine rows in total. */
1226  inp8 = LD_UB(src);
1227  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1228  inp5, inp6, inp7, inp8,
1229  inp5, inp4, inp3, inp2,
1230  inp6, inp7, inp8, inp8,
1231  const20, const6, const3);
1232  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1233  inp7, inp8, inp8, inp7,
1234  inp7, inp6, inp5, inp4,
1235  inp8, inp8, inp7, inp6,
1236  const20, const6, const3);
1237  ST8x4_UB(res0, res1, dst, dst_stride);
      /* dst is not read again after this update. */
1238  dst += (4 * dst_stride);
1239 }
1240 
/*
 * NOTE(review): the opening line of this definition (return type, function
 * name and the `src` parameter used below) is not present in this listing.
 *
 * Fully unrolled: produces sixteen output rows from seventeen source rows
 * (inp0..inp16). For output row k the code calls APPLY_VERT_QPEL_FILTER
 * with eight row arguments and stores the result unmodified as one 16-byte
 * vector with ST_UB before advancing dst by dst_stride.
 * Source rows are loaded just before they are first needed.
 * The first four filter arguments step backwards from row k and the last
 * four step forwards from row k + 1.
 */
1242  int32_t src_stride,
1243  uint8_t *dst,
1244  int32_t dst_stride)
1245 {
1246  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1247  v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1248  v16u8 res0;
1249  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1250  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1251  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1252 
      /* Output rows 0..2: inp0 (and inp1) are passed again in place of
       * rows above inp0. NOTE(review): looks like edge mirroring. */
1253  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1254  src += (4 * src_stride);
1255  inp4 = LD_UB(src);
1256  src += src_stride;
1257  res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
1258  inp1, inp2, inp3, inp4,
1259  const20, const6, const3);
1260  ST_UB(res0, dst);
1261  dst += dst_stride;
1262 
1263  inp5 = LD_UB(src);
1264  src += src_stride;
1265  res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
1266  inp2, inp3, inp4, inp5,
1267  const20, const6, const3);
1268  ST_UB(res0, dst);
1269  dst += dst_stride;
1270 
1271  inp6 = LD_UB(src);
1272  src += src_stride;
1273  res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
1274  inp3, inp4, inp5, inp6,
1275  const20, const6, const3);
1276  ST_UB(res0, dst);
1277  dst += dst_stride;
1278 
      /* Output rows 3..12: every argument is a distinct loaded row. */
1279  inp7 = LD_UB(src);
1280  src += src_stride;
1281  res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
1282  inp4, inp5, inp6, inp7,
1283  const20, const6, const3);
1284  ST_UB(res0, dst);
1285  dst += dst_stride;
1286 
1287  inp8 = LD_UB(src);
1288  src += src_stride;
1289  res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
1290  inp5, inp6, inp7, inp8,
1291  const20, const6, const3);
1292  ST_UB(res0, dst);
1293  dst += dst_stride;
1294 
1295  inp9 = LD_UB(src);
1296  src += src_stride;
1297  res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
1298  inp6, inp7, inp8, inp9,
1299  const20, const6, const3);
1300  ST_UB(res0, dst);
1301  dst += dst_stride;
1302 
1303  inp10 = LD_UB(src);
1304  src += src_stride;
1305  res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
1306  inp7, inp8, inp9, inp10,
1307  const20, const6, const3);
1308  ST_UB(res0, dst);
1309  dst += dst_stride;
1310 
1311  inp11 = LD_UB(src);
1312  src += src_stride;
1313  res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
1314  inp8, inp9, inp10, inp11,
1315  const20, const6, const3);
1316  ST_UB(res0, dst);
1317  dst += dst_stride;
1318 
1319  inp12 = LD_UB(src);
1320  src += src_stride;
1321  res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
1322  inp9, inp10, inp11, inp12,
1323  const20, const6, const3);
1324  ST_UB(res0, dst);
1325  dst += dst_stride;
1326 
1327  inp13 = LD_UB(src);
1328  src += src_stride;
1329  res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
1330  inp10, inp11, inp12, inp13,
1331  const20, const6, const3);
1332  ST_UB(res0, dst);
1333  dst += dst_stride;
1334 
1335  inp14 = LD_UB(src);
1336  src += src_stride;
1337  res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
1338  inp11, inp12, inp13, inp14,
1339  const20, const6, const3);
1340  ST_UB(res0, dst);
1341  dst += dst_stride;
1342 
1343  inp15 = LD_UB(src);
1344  src += src_stride;
1345  res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
1346  inp12, inp13, inp14, inp15,
1347  const20, const6, const3);
1348  ST_UB(res0, dst);
1349  dst += dst_stride;
1350 
      /* inp16 is the last row loaded; src is not advanced afterwards. */
1351  inp16 = LD_UB(src);
1352  res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
1353  inp13, inp14, inp15, inp16,
1354  const20, const6, const3);
1355  ST_UB(res0, dst);
1356  dst += dst_stride;
1357 
      /* Output rows 13..15: inp16, inp15 and inp14 are passed again in
       * place of rows below inp16. NOTE(review): looks like edge
       * mirroring. */
1358  res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
1359  inp14, inp15, inp16, inp16,
1360  const20, const6, const3);
1361  ST_UB(res0, dst);
1362  dst += dst_stride;
1363 
1364  res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
1365  inp15, inp16, inp16, inp15,
1366  const20, const6, const3);
1367  ST_UB(res0, dst);
1368  dst += dst_stride;
1369 
1370  res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
1371  inp16, inp16, inp15, inp14,
1372  const20, const6, const3);
1373  ST_UB(res0, dst);
      /* dst is not read again after this update. */
1374  dst += dst_stride;
1375 }
1376 
/*
 * NOTE(review): the opening line of this definition (return type, function
 * name and the `src` parameter used below) is not present in this listing.
 *
 * Fully unrolled: loads nine source rows (inp0..inp8), calls
 * APPLY_VERT_QPEL_FILTER_8BYTE four times, combines each result with the
 * source row pairs 1/2, 3/4, 5/6 and 7/8 through AVER_UB2_UB (that is,
 * the row one below each output row), and writes eight rows to dst with
 * two ST8x4_UB calls.
 * Rows near the first and last loaded row are passed to the filter more
 * than once rather than reading outside inp0..inp8.
 * NOTE(review): this repetition looks like edge mirroring -- confirm
 * against the macro definition, which is not visible here.
 */
1378  int32_t src_stride,
1379  uint8_t *dst,
1380  int32_t dst_stride)
1381 {
1382  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1383  v16u8 tmp0, tmp1, res0, res1;
1384  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1385  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1386  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1387 
1388  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1389  src += (4 * src_stride);
1390  LD_UB2(src, src_stride, inp4, inp5);
1391  src += (2 * src_stride);
1392  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1393  inp1, inp2, inp3, inp4,
1394  inp1, inp0, inp0, inp1,
1395  inp2, inp3, inp4, inp5,
1396  const20, const6, const3);
1397 
1398  LD_UB2(src, src_stride, inp6, inp7);
1399  src += (2 * src_stride);
1400  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1401  inp3, inp4, inp5, inp6,
1402  inp3, inp2, inp1, inp0,
1403  inp4, inp5, inp6, inp7,
1404  const20, const6, const3);
      /* Pack source rows 1/2 and 3/4 two per vector for the average. */
1405  tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
1406  tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
1407  AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1408  ST8x4_UB(res0, res1, dst, dst_stride);
1409  dst += (4 * dst_stride);
1410 
      /* Second half: one more row is loaded, nine rows in total. */
1411  inp8 = LD_UB(src);
1412  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1413  inp5, inp6, inp7, inp8,
1414  inp5, inp4, inp3, inp2,
1415  inp6, inp7, inp8, inp8,
1416  const20, const6, const3);
1417  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1418  inp7, inp8, inp8, inp7,
1419  inp7, inp6, inp5, inp4,
1420  inp8, inp8, inp7, inp6,
1421  const20, const6, const3);
      /* Pack source rows 5/6 and 7/8 two per vector for the average. */
1422  tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
1423  tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
1424  AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
1425  ST8x4_UB(res0, res1, dst, dst_stride);
1426 }
1427 
/*
 * NOTE(review): the opening line of this definition (return type, function
 * name and the `src` parameter used below) is not present in this listing.
 *
 * Fully unrolled: produces sixteen output rows from seventeen source rows
 * (inp0..inp16). For output row k the code calls APPLY_VERT_QPEL_FILTER
 * with eight row arguments, averages the result with source row k + 1
 * using __msa_aver_u_b (unsigned byte average with rounding), and stores
 * one 16-byte vector with ST_UB before advancing dst by dst_stride.
 * Source rows are loaded just before they are first needed.
 * The first four filter arguments step backwards from row k and the last
 * four step forwards from row k + 1.
 */
1429  int32_t src_stride,
1430  uint8_t *dst,
1431  int32_t dst_stride)
1432 {
1433  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1434  v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1435  v16u8 res0;
1436  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1437  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1438  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1439 
      /* Output rows 0..2: inp0 (and inp1) are passed again in place of
       * rows above inp0. NOTE(review): looks like edge mirroring. */
1440  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1441  src += (4 * src_stride);
1442  inp4 = LD_UB(src);
1443  src += src_stride;
1444  res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
1445  inp1, inp2, inp3, inp4,
1446  const20, const6, const3);
1447  res0 = __msa_aver_u_b(res0, inp1);
1448  ST_UB(res0, dst);
1449  dst += dst_stride;
1450 
1451  inp5 = LD_UB(src);
1452  src += src_stride;
1453  res0 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
1454  inp2, inp3, inp4, inp5,
1455  const20, const6, const3);
1456  res0 = __msa_aver_u_b(res0, inp2);
1457  ST_UB(res0, dst);
1458  dst += dst_stride;
1459 
1460  inp6 = LD_UB(src);
1461  src += src_stride;
1462  res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
1463  inp3, inp4, inp5, inp6,
1464  const20, const6, const3);
1465  res0 = __msa_aver_u_b(res0, inp3);
1466  ST_UB(res0, dst);
1467  dst += dst_stride;
1468 
      /* Output rows 3..12: every argument is a distinct loaded row. */
1469  inp7 = LD_UB(src);
1470  src += src_stride;
1471  res0 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
1472  inp4, inp5, inp6, inp7,
1473  const20, const6, const3);
1474  res0 = __msa_aver_u_b(res0, inp4);
1475  ST_UB(res0, dst);
1476  dst += dst_stride;
1477 
1478  inp8 = LD_UB(src);
1479  src += src_stride;
1480  res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
1481  inp5, inp6, inp7, inp8,
1482  const20, const6, const3);
1483  res0 = __msa_aver_u_b(res0, inp5);
1484  ST_UB(res0, dst);
1485  dst += dst_stride;
1486 
1487  inp9 = LD_UB(src);
1488  src += src_stride;
1489  res0 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
1490  inp6, inp7, inp8, inp9,
1491  const20, const6, const3);
1492  res0 = __msa_aver_u_b(res0, inp6);
1493  ST_UB(res0, dst);
1494  dst += dst_stride;
1495 
1496  inp10 = LD_UB(src);
1497  src += src_stride;
1498  res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
1499  inp7, inp8, inp9, inp10,
1500  const20, const6, const3);
1501  res0 = __msa_aver_u_b(res0, inp7);
1502  ST_UB(res0, dst);
1503  dst += dst_stride;
1504 
1505  inp11 = LD_UB(src);
1506  src += src_stride;
1507  res0 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
1508  inp8, inp9, inp10, inp11,
1509  const20, const6, const3);
1510  res0 = __msa_aver_u_b(res0, inp8);
1511  ST_UB(res0, dst);
1512  dst += dst_stride;
1513 
1514  inp12 = LD_UB(src);
1515  src += src_stride;
1516  res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
1517  inp9, inp10, inp11, inp12,
1518  const20, const6, const3);
1519  res0 = __msa_aver_u_b(res0, inp9);
1520  ST_UB(res0, dst);
1521  dst += dst_stride;
1522 
1523  inp13 = LD_UB(src);
1524  src += src_stride;
1525  res0 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
1526  inp10, inp11, inp12, inp13,
1527  const20, const6, const3);
1528  res0 = __msa_aver_u_b(res0, inp10);
1529  ST_UB(res0, dst);
1530  dst += dst_stride;
1531 
1532  inp14 = LD_UB(src);
1533  src += src_stride;
1534  res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
1535  inp11, inp12, inp13, inp14,
1536  const20, const6, const3);
1537  res0 = __msa_aver_u_b(res0, inp11);
1538  ST_UB(res0, dst);
1539  dst += dst_stride;
1540 
1541  inp15 = LD_UB(src);
1542  src += src_stride;
1543  res0 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
1544  inp12, inp13, inp14, inp15,
1545  const20, const6, const3);
1546  res0 = __msa_aver_u_b(res0, inp12);
1547  ST_UB(res0, dst);
1548  dst += dst_stride;
1549 
      /* inp16 is the last row loaded; src is not advanced afterwards. */
1550  inp16 = LD_UB(src);
1551  res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
1552  inp13, inp14, inp15, inp16,
1553  const20, const6, const3);
1554  res0 = __msa_aver_u_b(res0, inp13);
1555  ST_UB(res0, dst);
1556  dst += dst_stride;
1557 
      /* Output rows 13..15: inp16, inp15 and inp14 are passed again in
       * place of rows below inp16. NOTE(review): looks like edge
       * mirroring. */
1558  res0 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
1559  inp14, inp15, inp16, inp16,
1560  const20, const6, const3);
1561  res0 = __msa_aver_u_b(res0, inp14);
1562  ST_UB(res0, dst);
1563  dst += dst_stride;
1564 
1565  res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
1566  inp15, inp16, inp16, inp15,
1567  const20, const6, const3);
1568  res0 = __msa_aver_u_b(res0, inp15);
1569  ST_UB(res0, dst);
1570  dst += dst_stride;
1571 
1572  res0 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
1573  inp16, inp16, inp15, inp14,
1574  const20, const6, const3);
1575  res0 = __msa_aver_u_b(res0, inp16);
1576  ST_UB(res0, dst);
1577 }
1578 
/*
 * NOTE(review): the opening line of this definition (return type, function
 * name and the `src` parameter used below) is not present in this listing.
 *
 * Fully unrolled: loads nine source rows (inp0..inp8), calls
 * APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE four times, averages each result
 * with the source row pairs 0/1, 2/3, 4/5 and 6/7 using __msa_ave_u_b
 * (unsigned byte average without the rounding increment), and writes eight
 * rows to dst with two ST8x4_UB calls.
 * Rows near the first and last loaded row are passed to the filter more
 * than once rather than reading outside inp0..inp8.
 * NOTE(review): this repetition looks like edge mirroring -- confirm
 * against the macro definition, which is not visible here.
 */
1580  int32_t src_stride,
1581  uint8_t *dst,
1582  int32_t dst_stride)
1583 {
1584  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1585  v16u8 tmp0, tmp1, res0, res1;
1586  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1587  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1588  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1589 
1590  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1591  src += (4 * src_stride);
1592  LD_UB2(src, src_stride, inp4, inp5);
1593  src += (2 * src_stride);
1594  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1595  inp1, inp2, inp3, inp4,
1596  inp1, inp0, inp0, inp1,
1597  inp2, inp3, inp4, inp5,
1598  const20, const6, const3);
1599  LD_UB2(src, src_stride, inp6, inp7);
1600  src += (2 * src_stride);
1601  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1602  inp3, inp4, inp5, inp6,
1603  inp3, inp2, inp1, inp0,
1604  inp4, inp5, inp6, inp7,
1605  const20, const6, const3);
      /* Pack source rows 0/1 and 2/3 two per vector for the average. */
1606  tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
1607  tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
1608  res0 = __msa_ave_u_b(res0, tmp0);
1609  res1 = __msa_ave_u_b(res1, tmp1);
1610  ST8x4_UB(res0, res1, dst, dst_stride);
1611  dst += (4 * dst_stride);
1612 
      /* Second half: one more row is loaded, nine rows in total. */
1613  inp8 = LD_UB(src);
1614  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1615  inp5, inp6, inp7, inp8,
1616  inp5, inp4, inp3, inp2,
1617  inp6, inp7, inp8, inp8,
1618  const20, const6, const3);
1619  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1620  inp7, inp8, inp8, inp7,
1621  inp7, inp6, inp5, inp4,
1622  inp8, inp8, inp7, inp6,
1623  const20, const6, const3);
      /* Pack source rows 4/5 and 6/7 two per vector for the average. */
1624  tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
1625  tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
1626  res0 = __msa_ave_u_b(res0, tmp0);
1627  res1 = __msa_ave_u_b(res1, tmp1);
1628  ST8x4_UB(res0, res1, dst, dst_stride);
      /* dst is not read again after this update. */
1629  dst += (4 * dst_stride);
1630 }
1631 
/*
 * NOTE(review): the opening line of this definition (return type, function
 * name and the `src` parameter used below) is not present in this listing.
 *
 * Fully unrolled: produces sixteen output rows from seventeen source rows
 * (inp0..inp16). For output row k the code calls
 * APPLY_VERT_QPEL_NO_ROUND_FILTER with eight row arguments, averages the
 * result with source row k using __msa_ave_u_b (unsigned byte average
 * without the rounding increment), and stores one 16-byte vector with
 * ST_UB before advancing dst by dst_stride.
 * Source rows are loaded just before they are first needed.
 * The first four filter arguments step backwards from row k and the last
 * four step forwards from row k + 1.
 */
1633  int32_t src_stride,
1634  uint8_t *dst,
1635  int32_t dst_stride)
1636 {
1637  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1638  v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1639  v16u8 res0;
1640  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1641  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1642  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1643 
      /* Output rows 0..2: inp0 (and inp1) are passed again in place of
       * rows above inp0. NOTE(review): looks like edge mirroring. */
1644  LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
1645  src += (5 * src_stride);
1646  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
1647  inp1, inp2, inp3, inp4,
1648  const20, const6, const3);
1649  res0 = __msa_ave_u_b(res0, inp0);
1650  ST_UB(res0, dst);
1651  dst += dst_stride;
1652 
1653  inp5 = LD_UB(src);
1654  src += src_stride;
1655  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
1656  inp2, inp3, inp4, inp5,
1657  const20, const6, const3);
1658  res0 = __msa_ave_u_b(res0, inp1);
1659  ST_UB(res0, dst);
1660  dst += dst_stride;
1661 
1662  inp6 = LD_UB(src);
1663  src += src_stride;
1664  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
1665  inp3, inp4, inp5, inp6,
1666  const20, const6, const3);
1667  res0 = __msa_ave_u_b(res0, inp2);
1668  ST_UB(res0, dst);
1669  dst += dst_stride;
1670 
      /* Output rows 3..12: every argument is a distinct loaded row. */
1671  inp7 = LD_UB(src);
1672  src += src_stride;
1673  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
1674  inp4, inp5, inp6, inp7,
1675  const20, const6, const3);
1676  res0 = __msa_ave_u_b(res0, inp3);
1677  ST_UB(res0, dst);
1678  dst += dst_stride;
1679 
1680  inp8 = LD_UB(src);
1681  src += src_stride;
1682  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
1683  inp5, inp6, inp7, inp8,
1684  const20, const6, const3);
1685  res0 = __msa_ave_u_b(res0, inp4);
1686  ST_UB(res0, dst);
1687  dst += dst_stride;
1688 
1689  inp9 = LD_UB(src);
1690  src += src_stride;
1691  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
1692  inp6, inp7, inp8, inp9,
1693  const20, const6, const3);
1694  res0 = __msa_ave_u_b(res0, inp5);
1695  ST_UB(res0, dst);
1696  dst += dst_stride;
1697 
1698  inp10 = LD_UB(src);
1699  src += src_stride;
1700  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
1701  inp7, inp8, inp9, inp10,
1702  const20, const6, const3);
1703  res0 = __msa_ave_u_b(res0, inp6);
1704  ST_UB(res0, dst);
1705  dst += dst_stride;
1706 
1707  inp11 = LD_UB(src);
1708  src += src_stride;
1709  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
1710  inp8, inp9, inp10, inp11,
1711  const20, const6, const3);
1712  res0 = __msa_ave_u_b(res0, inp7);
1713  ST_UB(res0, dst);
1714  dst += dst_stride;
1715 
1716  inp12 = LD_UB(src);
1717  src += src_stride;
1718  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
1719  inp9, inp10, inp11, inp12,
1720  const20, const6, const3);
1721  res0 = __msa_ave_u_b(res0, inp8);
1722  ST_UB(res0, dst);
1723  dst += dst_stride;
1724 
1725  inp13 = LD_UB(src);
1726  src += src_stride;
1727  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
1728  inp10, inp11, inp12, inp13,
1729  const20, const6, const3);
1730  res0 = __msa_ave_u_b(res0, inp9);
1731  ST_UB(res0, dst);
1732  dst += dst_stride;
1733 
1734  inp14 = LD_UB(src);
1735  src += src_stride;
1736  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
1737  inp11, inp12, inp13, inp14,
1738  const20, const6, const3);
1739  res0 = __msa_ave_u_b(res0, inp10);
1740  ST_UB(res0, dst);
1741  dst += dst_stride;
1742 
1743  inp15 = LD_UB(src);
1744  src += src_stride;
1745  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
1746  inp12, inp13, inp14, inp15,
1747  const20, const6, const3);
1748  res0 = __msa_ave_u_b(res0, inp11);
1749  ST_UB(res0, dst);
1750  dst += dst_stride;
1751 
      /* inp16 is the last row loaded; src is not advanced afterwards. */
1752  inp16 = LD_UB(src);
1753  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
1754  inp13, inp14, inp15, inp16,
1755  const20, const6, const3);
1756  res0 = __msa_ave_u_b(res0, inp12);
1757  ST_UB(res0, dst);
1758  dst += dst_stride;
1759 
      /* Output rows 13..15: inp16, inp15 and inp14 are passed again in
       * place of rows below inp16. NOTE(review): looks like edge
       * mirroring. */
1760  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
1761  inp14, inp15, inp16, inp16,
1762  const20, const6, const3);
1763  res0 = __msa_ave_u_b(res0, inp13);
1764  ST_UB(res0, dst);
1765  dst += dst_stride;
1766 
1767  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
1768  inp15, inp16, inp16, inp15,
1769  const20, const6, const3);
1770  res0 = __msa_ave_u_b(res0, inp14);
1771  ST_UB(res0, dst);
1772  dst += dst_stride;
1773 
1774  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
1775  inp16, inp16, inp15, inp14,
1776  const20, const6, const3);
1777  res0 = __msa_ave_u_b(res0, inp15);
1778  ST_UB(res0, dst);
      /* dst is not read again after this update. */
1779  dst += dst_stride;
1780 }
1781 
/*
 * NOTE(review): the opening line of this definition (return type, function
 * name and the `src` parameter used below) is not present in this listing.
 *
 * Fully unrolled: loads nine source rows (inp0..inp8), calls
 * APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE four times and stores the results
 * unmodified, eight rows in total, with two ST8x4_UB calls.
 * Rows near the first and last loaded row are passed to the filter more
 * than once (inp0 and inp8 appear repeatedly) rather than reading outside
 * inp0..inp8.
 * NOTE(review): this repetition looks like edge mirroring -- confirm
 * against the macro definition, which is not visible here.
 */
1783  int32_t src_stride,
1784  uint8_t *dst,
1785  int32_t dst_stride)
1786 {
1787  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1788  v16u8 res0, res1;
1789  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1790  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1791  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1792 
1793  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1794  src += (4 * src_stride);
1795  LD_UB2(src, src_stride, inp4, inp5);
1796  src += (2 * src_stride);
1797  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1798  inp1, inp2, inp3, inp4,
1799  inp1, inp0, inp0, inp1,
1800  inp2, inp3, inp4, inp5,
1801  const20, const6, const3);
1802  LD_UB2(src, src_stride, inp6, inp7);
1803  src += (2 * src_stride);
1804  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1805  inp3, inp4, inp5, inp6,
1806  inp3, inp2, inp1, inp0,
1807  inp4, inp5, inp6, inp7,
1808  const20, const6, const3);
1809  ST8x4_UB(res0, res1, dst, dst_stride);
1810  dst += (4 * dst_stride);
1811 
      /* Second half: one more row is loaded, nine rows in total. */
1812  inp8 = LD_UB(src);
1813  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1814  inp5, inp6, inp7, inp8,
1815  inp5, inp4, inp3, inp2,
1816  inp6, inp7, inp8, inp8,
1817  const20, const6, const3);
1818  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
1819  inp7, inp8, inp8, inp7,
1820  inp7, inp6, inp5, inp4,
1821  inp8, inp8, inp7, inp6,
1822  const20, const6, const3);
1823  ST8x4_UB(res0, res1, dst, dst_stride);
      /* dst is not read again after this update. */
1824  dst += (4 * dst_stride);
1825 }
1826 
1828  int32_t src_stride,
1829  uint8_t *dst,
1830  int32_t dst_stride)
1831 {
 /*
  * Vertical quarter-pel filtering of 16 rows, one v16u8 vector per row,
  * using the no-round filter macro and no further averaging.
  *
  * NOTE(review): the first line of the signature (function name and the
  * `src` parameter) is missing from this listing - confirm against the
  * original file.
  *
  * 17 source rows (inp0..inp16) are loaded while stepping src by
  * src_stride; 16 result rows are stored with ST_UB while stepping dst by
  * dst_stride. Every result row hands eight row vectors plus the 20/6/3
  * coefficient vectors to APPLY_VERT_QPEL_NO_ROUND_FILTER (defined outside
  * this chunk; presumably the 8-tap qpel kernel - TODO confirm).
  * Nothing above inp0 or below inp16 is read: for the first three and the
  * last three result rows, already-loaded rows are passed again in
  * mirrored order.
  */
1832  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1833  v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
1834  v16u8 res0;
1835  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1836  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1837  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1838 
 /* top edge: inp0/inp1 are reused in place of rows above the block */
1839  LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
1840  src += (5 * src_stride);
1841  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
1842  inp1, inp2, inp3, inp4,
1843  const20, const6, const3);
1844  ST_UB(res0, dst);
1845  dst += dst_stride;
1846 
1847  inp5 = LD_UB(src);
1848  src += src_stride;
1849  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
1850  inp2, inp3, inp4, inp5,
1851  const20, const6, const3);
1852  ST_UB(res0, dst);
1853  dst += dst_stride;
1854 
1855  inp6 = LD_UB(src);
1856  src += src_stride;
1857  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
1858  inp3, inp4, inp5, inp6,
1859  const20, const6, const3);
1860  ST_UB(res0, dst);
1861  dst += dst_stride;
1862 
 /* interior rows: one new source row is loaded per result row */
1863  inp7 = LD_UB(src);
1864  src += src_stride;
1865  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
1866  inp4, inp5, inp6, inp7,
1867  const20, const6, const3);
1868  ST_UB(res0, dst);
1869  dst += dst_stride;
1870 
1871  inp8 = LD_UB(src);
1872  src += src_stride;
1873  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
1874  inp5, inp6, inp7, inp8,
1875  const20, const6, const3);
1876  ST_UB(res0, dst);
1877  dst += dst_stride;
1878 
1879  inp9 = LD_UB(src);
1880  src += src_stride;
1881  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
1882  inp6, inp7, inp8, inp9,
1883  const20, const6, const3);
1884  ST_UB(res0, dst);
1885  dst += dst_stride;
1886 
1887  inp10 = LD_UB(src);
1888  src += src_stride;
1889  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
1890  inp7, inp8, inp9, inp10,
1891  const20, const6, const3);
1892  ST_UB(res0, dst);
1893  dst += dst_stride;
1894 
1895  inp11 = LD_UB(src);
1896  src += src_stride;
1897  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
1898  inp8, inp9, inp10, inp11,
1899  const20, const6, const3);
1900  ST_UB(res0, dst);
1901  dst += dst_stride;
1902 
1903  inp12 = LD_UB(src);
1904  src += src_stride;
1905  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
1906  inp9, inp10, inp11, inp12,
1907  const20, const6, const3);
1908  ST_UB(res0, dst);
1909  dst += dst_stride;
1910 
1911  inp13 = LD_UB(src);
1912  src += src_stride;
1913  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
1914  inp10, inp11, inp12, inp13,
1915  const20, const6, const3);
1916  ST_UB(res0, dst);
1917  dst += dst_stride;
1918 
1919  inp14 = LD_UB(src);
1920  src += src_stride;
1921  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
1922  inp11, inp12, inp13, inp14,
1923  const20, const6, const3);
1924  ST_UB(res0, dst);
1925  dst += dst_stride;
1926 
1927  inp15 = LD_UB(src);
1928  src += src_stride;
1929  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
1930  inp12, inp13, inp14, inp15,
1931  const20, const6, const3);
1932  ST_UB(res0, dst);
1933  dst += dst_stride;
1934 
 /* inp16 is the last row loaded; src is not advanced any further */
1935  inp16 = LD_UB(src);
1936  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
1937  inp13, inp14, inp15, inp16,
1938  const20, const6, const3);
1939  ST_UB(res0, dst);
1940  dst += dst_stride;
1941 
 /* bottom edge: inp16/inp15/inp14 are reused instead of further loads */
1942  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
1943  inp14, inp15, inp16, inp16,
1944  const20, const6, const3);
1945  ST_UB(res0, dst);
1946  dst += dst_stride;
1947 
1948  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
1949  inp15, inp16, inp16, inp15,
1950  const20, const6, const3);
1951  ST_UB(res0, dst);
1952  dst += dst_stride;
1953 
1954  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
1955  inp16, inp16, inp15, inp14,
1956  const20, const6, const3);
1957  ST_UB(res0, dst);
1958 }
1959 
1961  int32_t src_stride,
1962  uint8_t *dst,
1963  int32_t dst_stride)
1964 {
 /*
  * Vertical quarter-pel filtering (no-round macro) of 8 rows, followed by
  * a truncating average with source rows inp1..inp8.
  *
  * NOTE(review): the first line of the signature (function name and the
  * `src` parameter) is missing from this listing - confirm against the
  * original file.
  *
  * Nine source rows (inp0..inp8) are loaded. Each call to
  * APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE (defined outside this chunk)
  * receives the row operands for two result rows at once; presumably its
  * result vector holds two 8-byte rows - TODO confirm. Rows outside
  * inp0..inp8 are never read; edge rows are passed again in mirrored
  * order. Results are written four rows at a time with ST8x4_UB.
  */
1965  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
1966  v16u8 tmp0, tmp1, res0, res1;
1967  v16u8 const20 = (v16u8) __msa_ldi_b(20);
1968  v16u8 const6 = (v16u8) __msa_ldi_b(6);
1969  v16u8 const3 = (v16u8) __msa_ldi_b(3);
1970 
1971  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
1972  src += (4 * src_stride);
1973  LD_UB2(src, src_stride, inp4, inp5);
1974  src += (2 * src_stride);
1975  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp0, inp1, inp2,
1976  inp1, inp2, inp3, inp4,
1977  inp1, inp0, inp0, inp1,
1978  inp2, inp3, inp4, inp5,
1979  const20, const6, const3);
1980  LD_UB2(src, src_stride, inp6, inp7);
1981  src += (2 * src_stride);
1982  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp1, inp0, inp0,
1983  inp3, inp4, inp5, inp6,
1984  inp3, inp2, inp1, inp0,
1985  inp4, inp5, inp6, inp7,
1986  const20, const6, const3);
 /* insve_d puts the low doubleword of the second row into doubleword 1 of
  * the first, packing two 8-byte source rows into one vector */
1987  tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
1988  tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
 /* ave_u_b: unsigned byte average without the rounding term */
1989  res0 = __msa_ave_u_b(res0, tmp0);
1990  res1 = __msa_ave_u_b(res1, tmp1);
1991  ST8x4_UB(res0, res1, dst, dst_stride);
1992  dst += (4 * dst_stride);
1993 
 /* second half: inp8 is the last row loaded, then reused at the edge */
1994  inp8 = LD_UB(src);
1995  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
1996  inp5, inp6, inp7, inp8,
1997  inp5, inp4, inp3, inp2,
1998  inp6, inp7, inp8, inp8,
1999  const20, const6, const3);
2000  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp6, inp5, inp4, inp3,
2001  inp7, inp8, inp8, inp7,
2002  inp7, inp6, inp5, inp4,
2003  inp8, inp8, inp7, inp6,
2004  const20, const6, const3);
2005  tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
2006  tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
2007  res0 = __msa_ave_u_b(res0, tmp0);
2008  res1 = __msa_ave_u_b(res1, tmp1);
2009  ST8x4_UB(res0, res1, dst, dst_stride);
2010 }
2011 
2013  int32_t src_stride,
2014  uint8_t *dst,
2015  int32_t dst_stride)
2016 {
 /*
  * Vertical quarter-pel filtering (no-round macro) of 16 rows, one v16u8
  * vector per row; result row i is then combined with source row
  * inp(i+1) by a truncating average (__msa_ave_u_b) before being stored.
  *
  * NOTE(review): the first line of the signature (function name and the
  * `src` parameter) is missing from this listing - confirm against the
  * original file.
  *
  * 17 source rows (inp0..inp16) are loaded while stepping src by
  * src_stride. Nothing above inp0 or below inp16 is read: the first three
  * and last three result rows pass already-loaded rows again in mirrored
  * order. APPLY_VERT_QPEL_NO_ROUND_FILTER is defined outside this chunk.
  */
2017  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2018  v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2019  v16u8 res0;
2020  v16u8 const20 = (v16u8) __msa_ldi_b(20);
2021  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2022  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2023 
2024  LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2025  src += (5 * src_stride);
2026  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp0, inp0, inp1, inp2,
2027  inp1, inp2, inp3, inp4,
2028  const20, const6, const3);
2029  res0 = __msa_ave_u_b(res0, inp1);
2030  ST_UB(res0, dst);
2031  dst += dst_stride;
2032 
2033  inp5 = LD_UB(src);
2034  src += src_stride;
2035  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp1, inp0, inp0, inp1,
2036  inp2, inp3, inp4, inp5,
2037  const20, const6, const3);
2038  res0 = __msa_ave_u_b(res0, inp2);
2039  ST_UB(res0, dst);
2040  dst += dst_stride;
2041 
2042  inp6 = LD_UB(src);
2043  src += src_stride;
2044  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp2, inp1, inp0, inp0,
2045  inp3, inp4, inp5, inp6,
2046  const20, const6, const3);
2047  res0 = __msa_ave_u_b(res0, inp3);
2048  ST_UB(res0, dst);
2049  dst += dst_stride;
2050 
2051  inp7 = LD_UB(src);
2052  src += src_stride;
2053  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp3, inp2, inp1, inp0,
2054  inp4, inp5, inp6, inp7,
2055  const20, const6, const3);
2056  res0 = __msa_ave_u_b(res0, inp4);
2057  ST_UB(res0, dst);
2058  dst += dst_stride;
2059 
2060  inp8 = LD_UB(src);
2061  src += src_stride;
2062  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp4, inp3, inp2, inp1,
2063  inp5, inp6, inp7, inp8,
2064  const20, const6, const3);
2065  res0 = __msa_ave_u_b(res0, inp5);
2066  ST_UB(res0, dst);
2067  dst += dst_stride;
2068 
2069  inp9 = LD_UB(src);
2070  src += src_stride;
2071  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp5, inp4, inp3, inp2,
2072  inp6, inp7, inp8, inp9,
2073  const20, const6, const3);
2074  res0 = __msa_ave_u_b(res0, inp6);
2075  ST_UB(res0, dst);
2076  dst += dst_stride;
2077 
2078  inp10 = LD_UB(src);
2079  src += src_stride;
2080  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp6, inp5, inp4, inp3,
2081  inp7, inp8, inp9, inp10,
2082  const20, const6, const3);
2083  res0 = __msa_ave_u_b(res0, inp7);
2084  ST_UB(res0, dst);
2085  dst += dst_stride;
2086 
2087  inp11 = LD_UB(src);
2088  src += src_stride;
2089  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp7, inp6, inp5, inp4,
2090  inp8, inp9, inp10, inp11,
2091  const20, const6, const3);
2092  res0 = __msa_ave_u_b(res0, inp8);
2093  ST_UB(res0, dst);
2094  dst += dst_stride;
2095 
2096  inp12 = LD_UB(src);
2097  src += src_stride;
2098  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp8, inp7, inp6, inp5,
2099  inp9, inp10, inp11, inp12,
2100  const20, const6, const3);
2101  res0 = __msa_ave_u_b(res0, inp9);
2102  ST_UB(res0, dst);
2103  dst += dst_stride;
2104 
2105  inp13 = LD_UB(src);
2106  src += src_stride;
2107  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp9, inp8, inp7, inp6,
2108  inp10, inp11, inp12, inp13,
2109  const20, const6, const3);
2110  res0 = __msa_ave_u_b(res0, inp10);
2111  ST_UB(res0, dst);
2112  dst += dst_stride;
2113 
2114  inp14 = LD_UB(src);
2115  src += src_stride;
2116  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp10, inp9, inp8, inp7,
2117  inp11, inp12, inp13, inp14,
2118  const20, const6, const3);
2119  res0 = __msa_ave_u_b(res0, inp11);
2120  ST_UB(res0, dst);
2121  dst += dst_stride;
2122 
2123  inp15 = LD_UB(src);
2124  src += src_stride;
2125  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp11, inp10, inp9, inp8,
2126  inp12, inp13, inp14, inp15,
2127  const20, const6, const3);
2128  res0 = __msa_ave_u_b(res0, inp12);
2129  ST_UB(res0, dst);
2130  dst += dst_stride;
2131 
 /* inp16 is the last row loaded; src is not advanced any further */
2132  inp16 = LD_UB(src);
2133  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp12, inp11, inp10, inp9,
2134  inp13, inp14, inp15, inp16,
2135  const20, const6, const3);
2136  res0 = __msa_ave_u_b(res0, inp13);
2137  ST_UB(res0, dst);
2138  dst += dst_stride;
2139 
 /* bottom edge: inp16/inp15/inp14 are reused instead of further loads */
2140  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp13, inp12, inp11, inp10,
2141  inp14, inp15, inp16, inp16,
2142  const20, const6, const3);
2143  res0 = __msa_ave_u_b(res0, inp14);
2144  ST_UB(res0, dst);
2145  dst += dst_stride;
2146 
2147  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp14, inp13, inp12, inp11,
2148  inp15, inp16, inp16, inp15,
2149  const20, const6, const3);
2150  res0 = __msa_ave_u_b(res0, inp15);
2151  ST_UB(res0, dst);
2152  dst += dst_stride;
2153 
2154  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER(inp15, inp14, inp13, inp12,
2155  inp16, inp16, inp15, inp14,
2156  const20, const6, const3);
2157  res0 = __msa_ave_u_b(res0, inp16);
2158  ST_UB(res0, dst);
2159 }
2160 
2162  int32_t src_stride,
2163  uint8_t *dst,
2164  int32_t dst_stride)
2165 {
 /*
  * Vertical quarter-pel filtering of 8 rows; the filtered rows are first
  * averaged with source rows inp0..inp7 and then with the rows already
  * present in dst, and the result is written back to dst.
  *
  * NOTE(review): the first line of the signature (function name and the
  * `src` parameter) is missing from this listing - confirm against the
  * original file.
  *
  * Nine source rows (inp0..inp8) are loaded; rows outside that range are
  * never read, edge rows are passed again in mirrored order.
  * APPLY_VERT_QPEL_FILTER_8BYTE and AVER_UB2_UB are defined outside this
  * chunk; presumably the former yields two 8-byte rows per vector and the
  * latter averages its operands pairwise into its last two arguments -
  * TODO confirm.
  */
2166  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2167  v16u8 dst0, dst1, dst2, dst3;
2168  v16u8 tmp0, tmp1, res0, res1;
2169  v16u8 const20 = (v16u8) __msa_ldi_b(20);
2170  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2171  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2172 
2173  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
2174  src += (4 * src_stride);
2175  LD_UB2(src, src_stride, inp4, inp5);
2176  src += (2 * src_stride);
2177  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
2178  inp1, inp2, inp3, inp4,
2179  inp1, inp0, inp0, inp1,
2180  inp2, inp3, inp4, inp5,
2181  const20, const6, const3);
2182 
2183  LD_UB2(src, src_stride, inp6, inp7);
2184  src += (2 * src_stride);
2185  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
2186  inp3, inp4, inp5, inp6,
2187  inp3, inp2, inp1, inp0,
2188  inp4, inp5, inp6, inp7,
2189  const20, const6, const3);
2190 
 /* dst is read before it is overwritten by the store below */
2191  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 /* insve_d packs the low doublewords of two rows into one vector */
2192  tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
2193  tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
2194  dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2195  dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2196  AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2197  AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2198  ST8x4_UB(res0, res1, dst, dst_stride);
2199  dst += (4 * dst_stride);
2200 
 /* second half: inp8 is the last row loaded, then reused at the edge */
2201  inp8 = LD_UB(src);
2202  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
2203  inp5, inp6, inp7, inp8,
2204  inp5, inp4, inp3, inp2,
2205  inp6, inp7, inp8, inp8,
2206  const20, const6, const3);
2207  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
2208  inp7, inp8, inp8, inp7,
2209  inp7, inp6, inp5, inp4,
2210  inp8, inp8, inp7, inp6,
2211  const20, const6, const3);
2212 
2213  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2214  tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
2215  tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
2216  dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2217  dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2218  AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2219  AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2220  ST8x4_UB(res0, res1, dst, dst_stride);
2221 }
2222 
2224  int32_t src_stride,
2225  uint8_t *dst,
2226  int32_t dst_stride)
2227 {
 /*
  * Vertical quarter-pel filtering of 16 rows, one v16u8 vector per row,
  * processed two rows at a time. Result row i is averaged with source row
  * inp(i), then with the row already present in dst, and written back.
  *
  * NOTE(review): the first line of the signature (function name and the
  * `src` parameter) is missing from this listing - confirm against the
  * original file.
  *
  * 17 source rows (inp0..inp16) are loaded; nothing outside that range is
  * read, edge rows are passed again in mirrored order.
  * APPLY_VERT_QPEL_FILTER and AVER_UB2_UB are defined outside this chunk;
  * AVER_UB2_UB presumably averages its operands pairwise into its last
  * two arguments - TODO confirm.
  */
2228  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2229  v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2230  v16u8 res0, res1, dst0, dst1;
2231  v16u8 const20 = (v16u8) __msa_ldi_b(20);
2232  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2233  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2234 
2235  LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2236  src += (5 * src_stride);
2237  res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
2238  inp1, inp2, inp3, inp4,
2239  const20, const6, const3);
2240 
2241  inp5 = LD_UB(src);
2242  src += src_stride;
2243  res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
2244  inp2, inp3, inp4, inp5,
2245  const20, const6, const3);
2246 
 /* dst rows are read before being overwritten by the store below */
2247  LD_UB2(dst, dst_stride, dst0, dst1);
2248  AVER_UB2_UB(res0, inp0, res1, inp1, res0, res1);
2249  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2250  ST_UB2(res0, res1, dst, dst_stride);
2251  dst += (2 * dst_stride);
2252 
2253  inp6 = LD_UB(src);
2254  src += src_stride;
2255  res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
2256  inp3, inp4, inp5, inp6,
2257  const20, const6, const3);
2258 
2259  inp7 = LD_UB(src);
2260  src += src_stride;
2261  res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
2262  inp4, inp5, inp6, inp7,
2263  const20, const6, const3);
2264 
2265  LD_UB2(dst, dst_stride, dst0, dst1);
2266  AVER_UB2_UB(res0, inp2, res1, inp3, res0, res1);
2267  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2268  ST_UB2(res0, res1, dst, dst_stride);
2269  dst += (2 * dst_stride);
2270 
2271  LD_UB2(src, src_stride, inp8, inp9);
2272  src += (2 * src_stride);
2273  res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
2274  inp5, inp6, inp7, inp8,
2275  const20, const6, const3);
2276  res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
2277  inp6, inp7, inp8, inp9,
2278  const20, const6, const3);
2279 
2280  LD_UB2(dst, dst_stride, dst0, dst1);
2281  AVER_UB2_UB(res0, inp4, res1, inp5, res0, res1);
2282  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2283  ST_UB2(res0, res1, dst, dst_stride);
2284  dst += (2 * dst_stride);
2285 
2286  LD_UB2(src, src_stride, inp10, inp11);
2287  src += (2 * src_stride);
2288  res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
2289  inp7, inp8, inp9, inp10,
2290  const20, const6, const3);
2291  res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
2292  inp8, inp9, inp10, inp11,
2293  const20, const6, const3);
2294 
2295  LD_UB2(dst, dst_stride, dst0, dst1);
2296  AVER_UB2_UB(res0, inp6, res1, inp7, res0, res1);
2297  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2298  ST_UB2(res0, res1, dst, dst_stride);
2299  dst += (2 * dst_stride);
2300 
2301  LD_UB2(src, src_stride, inp12, inp13);
2302  src += (2 * src_stride);
2303  res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
2304  inp9, inp10, inp11, inp12,
2305  const20, const6, const3);
2306  res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
2307  inp10, inp11, inp12, inp13,
2308  const20, const6, const3);
2309  LD_UB2(dst, dst_stride, dst0, dst1);
2310  AVER_UB2_UB(res0, inp8, res1, inp9, res0, res1);
2311  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2312  ST_UB2(res0, res1, dst, dst_stride);
2313  dst += (2 * dst_stride);
2314 
2315  LD_UB2(src, src_stride, inp14, inp15);
2316  src += (2 * src_stride);
2317  res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
2318  inp11, inp12, inp13, inp14,
2319  const20, const6, const3);
2320  res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
2321  inp12, inp13, inp14, inp15,
2322  const20, const6, const3);
2323 
2324  LD_UB2(dst, dst_stride, dst0, dst1);
2325  AVER_UB2_UB(res0, inp10, res1, inp11, res0, res1);
2326  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2327  ST_UB2(res0, res1, dst, dst_stride);
2328  dst += (2 * dst_stride);
2329 
 /* inp16 is the last row loaded; the remaining rows reuse it at the edge */
2330  inp16 = LD_UB(src);
2331  res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
2332  inp13, inp14, inp15, inp16,
2333  const20, const6, const3);
2334  res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
2335  inp14, inp15, inp16, inp16,
2336  const20, const6, const3);
2337  LD_UB2(dst, dst_stride, dst0, dst1);
2338  AVER_UB2_UB(res0, inp12, res1, inp13, res0, res1);
2339  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2340  ST_UB2(res0, res1, dst, dst_stride);
2341  dst += (2 * dst_stride);
2342 
2343  res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
2344  inp15, inp16, inp16, inp15,
2345  const20, const6, const3);
2346  res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
2347  inp16, inp16, inp15, inp14,
2348  const20, const6, const3);
2349  LD_UB2(dst, dst_stride, dst0, dst1);
2350  AVER_UB2_UB(res0, inp14, res1, inp15, res0, res1);
2351  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2352  ST_UB2(res0, res1, dst, dst_stride);
2353 }
2354 
2356  int32_t src_stride,
2357  uint8_t *dst,
2358  int32_t dst_stride)
2359 {
 /*
  * Vertical quarter-pel filtering of 8 rows; the filtered rows are
  * averaged with the rows already present in dst and written back.
  *
  * NOTE(review): the first line of the signature (function name and the
  * `src` parameter) is missing from this listing - confirm against the
  * original file.
  *
  * Nine source rows (inp0..inp8) are loaded; rows outside that range are
  * never read, edge rows are passed again in mirrored order.
  * APPLY_VERT_QPEL_FILTER_8BYTE and AVER_UB2_UB are defined outside this
  * chunk; presumably the former yields two 8-byte rows per vector and the
  * latter averages its operands pairwise into its last two arguments -
  * TODO confirm.
  */
2360  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2361  v16u8 dst0, dst1, dst2, dst3;
2362  v16u8 res0, res1;
2363  v16u8 const20 = (v16u8) __msa_ldi_b(20);
2364  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2365  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2366 
2367  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
2368  src += (4 * src_stride);
2369  LD_UB2(src, src_stride, inp4, inp5);
2370  src += (2 * src_stride);
2371  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
2372  inp1, inp2, inp3, inp4,
2373  inp1, inp0, inp0, inp1,
2374  inp2, inp3, inp4, inp5,
2375  const20, const6, const3);
2376  LD_UB2(src, src_stride, inp6, inp7);
2377  src += (2 * src_stride);
2378  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
2379  inp3, inp4, inp5, inp6,
2380  inp3, inp2, inp1, inp0,
2381  inp4, inp5, inp6, inp7,
2382  const20, const6, const3);
 /* dst is read before it is overwritten by the store below */
2383  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 /* insve_d packs the low doublewords of two dst rows into one vector */
2384  dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2385  dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2386  AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2387  ST8x4_UB(res0, res1, dst, dst_stride);
2388  dst += (4 * dst_stride);
2389 
 /* second half: inp8 is the last row loaded, then reused at the edge */
2390  inp8 = LD_UB(src);
2391  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
2392  inp5, inp6, inp7, inp8,
2393  inp5, inp4, inp3, inp2,
2394  inp6, inp7, inp8, inp8,
2395  const20, const6, const3);
2396  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
2397  inp7, inp8, inp8, inp7,
2398  inp7, inp6, inp5, inp4,
2399  inp8, inp8, inp7, inp6,
2400  const20, const6, const3);
2401  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2402  dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2403  dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2404  AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
 /* last store: dst is not advanced afterwards, nothing reads it again */
2405  ST8x4_UB(res0, res1, dst, dst_stride);
2407 }
2408 
2410  int32_t src_stride,
2411  uint8_t *dst,
2412  int32_t dst_stride)
2413 {
 /*
  * Vertical quarter-pel filtering of 16 rows, one v16u8 vector per row,
  * processed two rows at a time. Each filtered row is averaged with the
  * row already present in dst and written back.
  *
  * NOTE(review): the first line of the signature (function name and the
  * `src` parameter) is missing from this listing - confirm against the
  * original file.
  *
  * 17 source rows (inp0..inp16) are loaded; nothing outside that range is
  * read, edge rows are passed again in mirrored order.
  * APPLY_VERT_QPEL_FILTER and AVER_UB2_UB are defined outside this chunk;
  * AVER_UB2_UB presumably averages its operands pairwise into its last
  * two arguments - TODO confirm.
  */
2414  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2415  v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2416  v16u8 res0, res1, dst0, dst1;
2417  v16u8 const20 = (v16u8) __msa_ldi_b(20);
2418  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2419  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2420 
2421  LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2422  src += (5 * src_stride);
2423  res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
2424  inp1, inp2, inp3, inp4,
2425  const20, const6, const3);
2426  inp5 = LD_UB(src);
2427  src += src_stride;
2428  res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
2429  inp2, inp3, inp4, inp5,
2430  const20, const6, const3);
 /* dst rows are read before being overwritten by the store below */
2431  LD_UB2(dst, dst_stride, dst0, dst1);
2432  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2433  ST_UB2(res0, res1, dst, dst_stride);
2434  dst += (2 * dst_stride);
2435 
2436  inp6 = LD_UB(src);
2437  src += src_stride;
2438  res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
2439  inp3, inp4, inp5, inp6,
2440  const20, const6, const3);
2441  inp7 = LD_UB(src);
2442  src += src_stride;
2443  res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
2444  inp4, inp5, inp6, inp7,
2445  const20, const6, const3);
2446  LD_UB2(dst, dst_stride, dst0, dst1);
2447  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2448  ST_UB2(res0, res1, dst, dst_stride);
2449  dst += (2 * dst_stride);
2450 
2451  inp8 = LD_UB(src);
2452  src += src_stride;
2453  res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
2454  inp5, inp6, inp7, inp8,
2455  const20, const6, const3);
2456  inp9 = LD_UB(src);
2457  src += src_stride;
2458  res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
2459  inp6, inp7, inp8, inp9,
2460  const20, const6, const3);
2461  LD_UB2(dst, dst_stride, dst0, dst1);
2462  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2463  ST_UB2(res0, res1, dst, dst_stride);
2464  dst += (2 * dst_stride);
2465 
2466  inp10 = LD_UB(src);
2467  src += src_stride;
2468  res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
2469  inp7, inp8, inp9, inp10,
2470  const20, const6, const3);
2471  inp11 = LD_UB(src);
2472  src += src_stride;
2473  res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
2474  inp8, inp9, inp10, inp11,
2475  const20, const6, const3);
2476  LD_UB2(dst, dst_stride, dst0, dst1);
2477  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2478  ST_UB2(res0, res1, dst, dst_stride);
2479  dst += (2 * dst_stride);
2480 
2481  inp12 = LD_UB(src);
2482  src += src_stride;
2483  res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
2484  inp9, inp10, inp11, inp12,
2485  const20, const6, const3);
2486  inp13 = LD_UB(src);
2487  src += src_stride;
2488  res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
2489  inp10, inp11, inp12, inp13,
2490  const20, const6, const3);
2491  LD_UB2(dst, dst_stride, dst0, dst1);
2492  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2493  ST_UB2(res0, res1, dst, dst_stride);
2494  dst += (2 * dst_stride);
2495 
2496  inp14 = LD_UB(src);
2497  src += src_stride;
2498  res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
2499  inp11, inp12, inp13, inp14,
2500  const20, const6, const3);
2501  inp15 = LD_UB(src);
2502  src += src_stride;
2503  res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
2504  inp12, inp13, inp14, inp15,
2505  const20, const6, const3);
2506  LD_UB2(dst, dst_stride, dst0, dst1);
2507  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2508  ST_UB2(res0, res1, dst, dst_stride);
2509  dst += (2 * dst_stride);
2510 
 /* inp16 is the last row loaded; the remaining rows reuse it at the edge */
2511  inp16 = LD_UB(src);
2512  res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
2513  inp13, inp14, inp15, inp16,
2514  const20, const6, const3);
2515  res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
2516  inp14, inp15, inp16, inp16,
2517  const20, const6, const3);
2518  LD_UB2(dst, dst_stride, dst0, dst1);
2519  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2520  ST_UB2(res0, res1, dst, dst_stride);
2521  dst += (2 * dst_stride);
2522 
2523  res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
2524  inp15, inp16, inp16, inp15,
2525  const20, const6, const3);
2526  res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
2527  inp16, inp16, inp15, inp14,
2528  const20, const6, const3);
2529  LD_UB2(dst, dst_stride, dst0, dst1);
2530  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2531  ST_UB2(res0, res1, dst, dst_stride);
2532 }
2533 
2535  int32_t src_stride,
2536  uint8_t *dst,
2537  int32_t dst_stride)
2538 {
 /*
  * Vertical quarter-pel filtering of 8 rows; the filtered rows are first
  * averaged with source rows inp1..inp8 and then with the rows already
  * present in dst, and the result is written back to dst.
  *
  * NOTE(review): the first line of the signature (function name and the
  * `src` parameter) is missing from this listing - confirm against the
  * original file.
  *
  * Nine source rows (inp0..inp8) are loaded; rows outside that range are
  * never read, edge rows are passed again in mirrored order.
  * APPLY_VERT_QPEL_FILTER_8BYTE and AVER_UB2_UB are defined outside this
  * chunk; presumably the former yields two 8-byte rows per vector and the
  * latter averages its operands pairwise into its last two arguments -
  * TODO confirm.
  */
2539  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2540  v16u8 dst0, dst1, dst2, dst3;
2541  v16u8 tmp0, tmp1, res0, res1;
2542  v16u8 const20 = (v16u8) __msa_ldi_b(20);
2543  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2544  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2545 
2546  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
2547  src += (4 * src_stride);
2548  LD_UB2(src, src_stride, inp4, inp5);
2549  src += (2 * src_stride);
2550  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp0, inp0, inp1, inp2,
2551  inp1, inp2, inp3, inp4,
2552  inp1, inp0, inp0, inp1,
2553  inp2, inp3, inp4, inp5,
2554  const20, const6, const3);
2555  LD_UB2(src, src_stride, inp6, inp7);
2556  src += (2 * src_stride);
2557  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp2, inp1, inp0, inp0,
2558  inp3, inp4, inp5, inp6,
2559  inp3, inp2, inp1, inp0,
2560  inp4, inp5, inp6, inp7,
2561  const20, const6, const3);
 /* dst is read before it is overwritten by the store below */
2562  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
 /* insve_d packs the low doublewords of two rows into one vector */
2563  tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
2564  tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
2565  dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2566  dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2567  AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2568  AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2569  ST8x4_UB(res0, res1, dst, dst_stride);
2570  dst += (4 * dst_stride);
2571 
 /* second half: inp8 is the last row loaded, then reused at the edge */
2572  inp8 = LD_UB(src);
2573  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
2574  inp5, inp6, inp7, inp8,
2575  inp5, inp4, inp3, inp2,
2576  inp6, inp7, inp8, inp8,
2577  const20, const6, const3);
2578  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(inp6, inp5, inp4, inp3,
2579  inp7, inp8, inp8, inp7,
2580  inp7, inp6, inp5, inp4,
2581  inp8, inp8, inp7, inp6,
2582  const20, const6, const3);
2583  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2584  tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
2585  tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
2586  dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2587  dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
2588  AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
2589  AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
2590  ST8x4_UB(res0, res1, dst, dst_stride);
2591 }
2592 
2594  int32_t src_stride,
2595  uint8_t *dst,
2596  int32_t dst_stride)
2597 {
 /*
  * Vertical quarter-pel filtering of 16 rows, one v16u8 vector per row,
  * processed two rows at a time. Result row i is averaged with source row
  * inp(i+1), then with the row already present in dst, and written back.
  *
  * NOTE(review): the first line of the signature (function name and the
  * `src` parameter) is missing from this listing - confirm against the
  * original file.
  *
  * 17 source rows (inp0..inp16) are loaded; nothing outside that range is
  * read, edge rows are passed again in mirrored order.
  * APPLY_VERT_QPEL_FILTER and AVER_UB2_UB are defined outside this chunk;
  * AVER_UB2_UB presumably averages its operands pairwise into its last
  * two arguments - TODO confirm.
  */
2598  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7, inp8;
2599  v16u8 inp9, inp10, inp11, inp12, inp13, inp14, inp15, inp16;
2600  v16u8 res0, res1, dst0, dst1;
2601  v16u8 const20 = (v16u8) __msa_ldi_b(20);
2602  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2603  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2604 
2605  LD_UB5(src, src_stride, inp0, inp1, inp2, inp3, inp4);
2606  src += (5 * src_stride);
2607  res0 = APPLY_VERT_QPEL_FILTER(inp0, inp0, inp1, inp2,
2608  inp1, inp2, inp3, inp4,
2609  const20, const6, const3);
2610  inp5 = LD_UB(src);
2611  src += src_stride;
2612  res1 = APPLY_VERT_QPEL_FILTER(inp1, inp0, inp0, inp1,
2613  inp2, inp3, inp4, inp5,
2614  const20, const6, const3);
 /* dst rows are read before being overwritten by the store below */
2615  LD_UB2(dst, dst_stride, dst0, dst1);
2616  AVER_UB2_UB(res0, inp1, res1, inp2, res0, res1);
2617  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2618  ST_UB2(res0, res1, dst, dst_stride);
2619  dst += (2 * dst_stride);
2620 
2621  inp6 = LD_UB(src);
2622  src += src_stride;
2623  res0 = APPLY_VERT_QPEL_FILTER(inp2, inp1, inp0, inp0,
2624  inp3, inp4, inp5, inp6,
2625  const20, const6, const3);
2626  inp7 = LD_UB(src);
2627  src += src_stride;
2628  res1 = APPLY_VERT_QPEL_FILTER(inp3, inp2, inp1, inp0,
2629  inp4, inp5, inp6, inp7,
2630  const20, const6, const3);
2631  LD_UB2(dst, dst_stride, dst0, dst1);
2632  AVER_UB2_UB(res0, inp3, res1, inp4, res0, res1);
2633  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2634  ST_UB2(res0, res1, dst, dst_stride);
2635  dst += (2 * dst_stride);
2636 
2637  inp8 = LD_UB(src);
2638  src += src_stride;
2639  res0 = APPLY_VERT_QPEL_FILTER(inp4, inp3, inp2, inp1,
2640  inp5, inp6, inp7, inp8,
2641  const20, const6, const3);
2642  inp9 = LD_UB(src);
2643  src += src_stride;
2644  res1 = APPLY_VERT_QPEL_FILTER(inp5, inp4, inp3, inp2,
2645  inp6, inp7, inp8, inp9,
2646  const20, const6, const3);
2647  LD_UB2(dst, dst_stride, dst0, dst1);
2648  AVER_UB2_UB(res0, inp5, res1, inp6, res0, res1);
2649  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2650  ST_UB2(res0, res1, dst, dst_stride);
2651  dst += (2 * dst_stride);
2652 
2653  inp10 = LD_UB(src);
2654  src += src_stride;
2655  res0 = APPLY_VERT_QPEL_FILTER(inp6, inp5, inp4, inp3,
2656  inp7, inp8, inp9, inp10,
2657  const20, const6, const3);
2658  inp11 = LD_UB(src);
2659  src += src_stride;
2660  res1 = APPLY_VERT_QPEL_FILTER(inp7, inp6, inp5, inp4,
2661  inp8, inp9, inp10, inp11,
2662  const20, const6, const3);
2663  LD_UB2(dst, dst_stride, dst0, dst1);
2664  AVER_UB2_UB(res0, inp7, res1, inp8, res0, res1);
2665  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2666  ST_UB2(res0, res1, dst, dst_stride);
2667  dst += (2 * dst_stride);
2668 
2669  inp12 = LD_UB(src);
2670  src += src_stride;
2671  res0 = APPLY_VERT_QPEL_FILTER(inp8, inp7, inp6, inp5,
2672  inp9, inp10, inp11, inp12,
2673  const20, const6, const3);
2674  inp13 = LD_UB(src);
2675  src += src_stride;
2676  res1 = APPLY_VERT_QPEL_FILTER(inp9, inp8, inp7, inp6,
2677  inp10, inp11, inp12, inp13,
2678  const20, const6, const3);
2679  LD_UB2(dst, dst_stride, dst0, dst1);
2680  AVER_UB2_UB(res0, inp9, res1, inp10, res0, res1);
2681  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2682  ST_UB2(res0, res1, dst, dst_stride);
2683  dst += (2 * dst_stride);
2684 
2685  inp14 = LD_UB(src);
2686  src += src_stride;
2687  res0 = APPLY_VERT_QPEL_FILTER(inp10, inp9, inp8, inp7,
2688  inp11, inp12, inp13, inp14,
2689  const20, const6, const3);
2690  inp15 = LD_UB(src);
2691  src += src_stride;
2692  res1 = APPLY_VERT_QPEL_FILTER(inp11, inp10, inp9, inp8,
2693  inp12, inp13, inp14, inp15,
2694  const20, const6, const3);
2695  LD_UB2(dst, dst_stride, dst0, dst1);
2696  AVER_UB2_UB(res0, inp11, res1, inp12, res0, res1);
2697  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2698  ST_UB2(res0, res1, dst, dst_stride);
2699  dst += (2 * dst_stride);
2700 
 /* inp16 is the last row loaded; the remaining rows reuse it at the edge */
2701  inp16 = LD_UB(src);
2702  res0 = APPLY_VERT_QPEL_FILTER(inp12, inp11, inp10, inp9,
2703  inp13, inp14, inp15, inp16,
2704  const20, const6, const3);
2705  res1 = APPLY_VERT_QPEL_FILTER(inp13, inp12, inp11, inp10,
2706  inp14, inp15, inp16, inp16,
2707  const20, const6, const3);
2708  LD_UB2(dst, dst_stride, dst0, dst1);
2709  AVER_UB2_UB(res0, inp13, res1, inp14, res0, res1);
2710  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2711  ST_UB2(res0, res1, dst, dst_stride);
2712  dst += (2 * dst_stride);
2713 
2714  res0 = APPLY_VERT_QPEL_FILTER(inp14, inp13, inp12, inp11,
2715  inp15, inp16, inp16, inp15,
2716  const20, const6, const3);
2717  res1 = APPLY_VERT_QPEL_FILTER(inp15, inp14, inp13, inp12,
2718  inp16, inp16, inp15, inp14,
2719  const20, const6, const3);
2720  LD_UB2(dst, dst_stride, dst0, dst1);
2721  AVER_UB2_UB(res0, inp15, res1, inp16, res0, res1);
2722  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
2723  ST_UB2(res0, res1, dst, dst_stride);
2724 }
2725 
2727  int32_t src_stride,
2728  uint8_t *dst,
2729  int32_t dst_stride,
2730  int32_t height)
2731 {
 /*
  * Horizontal quarter-pel pass (no-round macro): every row is filtered
  * and then combined with the unshifted source row by a truncating
  * average (__msa_ave_u_b) before being stored to dst.
  *
  * NOTE(review): the first line of the signature (function name and the
  * `src` parameter) is missing from this listing - confirm against the
  * original file.
  *
  * The loop handles four rows per iteration and runs (height >> 2)
  * times; one additional row is processed after the loop, so for a
  * height that is a multiple of 4 a total of height + 1 rows are written.
  * For each row two vectors are loaded, one at src and one at src + 1.
  * APPLY_HORIZ_QPEL_NO_ROUND_FILTER is defined outside this chunk.
  */
 /* 8-bit counter: (height >> 2) has to fit into a uint8_t */
2732  uint8_t loop_count;
2733  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
2734  v16u8 res;
2735  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
2736  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2737  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2738  v8u16 const20 = (v8u16) __msa_ldi_h(20);
2739 
2740  for (loop_count = (height >> 2); loop_count--;) {
 /* even-numbered inpN: rows at src; odd-numbered inpN: same rows at src + 1 */
2741  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
2742  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
2743  src += (4 * src_stride);
2744  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2745  const20, const6, const3);
2746  res = __msa_ave_u_b(inp0, res);
2747  ST_UB(res, dst);
2748  dst += dst_stride;
2749 
2750  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
2751  const20, const6, const3);
2752  res = __msa_ave_u_b(inp2, res);
2753  ST_UB(res, dst);
2754  dst += dst_stride;
2755 
2756  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
2757  const20, const6, const3);
2758  res = __msa_ave_u_b(inp4, res);
2759  ST_UB(res, dst);
2760  dst += dst_stride;
2761 
2762  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
2763  const20, const6, const3);
2764  res = __msa_ave_u_b(inp6, res);
2765  ST_UB(res, dst);
2766  dst += dst_stride;
2767  }
2768 
 /* extra trailing row: stride 1 yields the vectors at src and src + 1 */
2769  LD_UB2(src, 1, inp0, inp1);
2770  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2771  const20, const6, const3);
2772  res = __msa_ave_u_b(inp0, res);
2773  ST_UB(res, dst);
2774 }
2775 
2777  int32_t src_stride,
2778  uint8_t *dst,
2779  int32_t dst_stride)
2780 {
 /*
  * Two-pass wrapper: the horizontal pass writes its rows into a local
  * scratch buffer with a stride of 16, and the vertical pass then reads
  * that buffer and writes the final rows to dst.
  *
  * NOTE(review): the first line of the signature (function name and the
  * `src` parameter) is missing from this listing - confirm against the
  * original file.
  *
  * The scratch buffer is 17 * 16 = 272 bytes: with height 16 the
  * horizontal pass stores 16 rows in its loop plus one trailing row.
  */
2781  uint8_t horiz_rows[17 * 16];
2782 
2783  hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, horiz_rows, 16, 16);
2784  vert_mc_qpel_no_rnd_aver_src0_16x16_msa(horiz_rows, 16, dst, dst_stride);
2785 }
2786 
2788  int32_t src_stride,
2789  uint8_t *dst,
2790  int32_t dst_stride)
2791 {
      /*
       * 8x8 two-pass case kept in registers.  Nine source rows are filtered
       * horizontally and averaged with the source bytes (horiz0..horiz8);
       * four vertical steps then yield two output rows each, which are
       * averaged with horizN/horizN+1 and written by ST8x2_UB.
       *
       * NOTE(review): the line with the return type, function name and first
       * parameter (used below as `src`) is missing from this listing.
       */
2792  v16u8 inp0, inp1, inp2, inp3;
2793  v16u8 res0, res1, avg0, avg1;
2794  v16u8 horiz0, horiz1, horiz2, horiz3;
2795  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
      /* Shuffle patterns passed to the horizontal filter macros. */
2796  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2797  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
2798  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
2799  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Filter coefficients, all as bytes here. */
2800  v16u8 const20 = (v16u8) __msa_ldi_b(20);
2801  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2802  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2803 
      /* Rows 0-1: filter both rows, then average with the source.  ilvr_d
       * presumably packs the low 8 bytes of inp0 and inp1 into one vector. */
2804  LD_UB2(src, src_stride, inp0, inp1);
2805  src += (2 * src_stride);
2806  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
2807  mask2, mask3, const20,
2808  const6, const3);
2809  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
2810  horiz0 = __msa_ave_u_b(inp0, res0);
      /* Doubleword 1 of horiz0 replicated - presumably row 1 on its own. */
2811  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
      /* Rows 2-3. */
2812  LD_UB2(src, src_stride, inp2, inp3);
2813  src += (2 * src_stride);
2814  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
2815  mask2, mask3, const20,
2816  const6, const3);
2817  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
2818  horiz2 = __msa_ave_u_b(inp2, res1);
2819  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
      /* Rows 4-5. */
2820  LD_UB2(src, src_stride, inp0, inp1);
2821  src += (2 * src_stride);
2822  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
2823  mask2, mask3, const20,
2824  const6, const3);
2825  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
2826  horiz4 = __msa_ave_u_b(inp0, res0);
2827  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* Vertical step for output rows 0-1.  horiz0/horiz1 are passed more
       * than once - presumably edge mirroring above the first row; verify
       * against the macro definition. */
2828  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
2829  horiz1, horiz2, horiz3, horiz4,
2830  horiz1, horiz0, horiz0, horiz1,
2831  horiz2, horiz3, horiz4, horiz5,
2832  const20, const6, const3);
      /* Average with horiz0/horiz1 packed into one vector, then store. */
2833  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
2834  res0 = __msa_ave_u_b(avg0, res0);
2835  ST8x2_UB(res0, dst, dst_stride);
2836  dst += (2 * dst_stride);
2837 
      /* Rows 6-7. */
2838  LD_UB2(src, src_stride, inp2, inp3);
2839  src += (2 * src_stride);
2840  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
2841  mask2, mask3, const20,
2842  const6, const3);
2843  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
2844  horiz6 = __msa_ave_u_b(inp2, res1);
2845  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
      /* Row 8: a single row, handled by the one-row filter macro. */
2846  inp0 = LD_UB(src);
2847  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
2848  mask2, mask3, const20,
2849  const6, const3);
2850  horiz8 = __msa_ave_u_b(inp0, res0);
      /* Output rows 2-3, averaged with horiz2/horiz3. */
2851  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
2852  horiz3, horiz4, horiz5, horiz6,
2853  horiz3, horiz2, horiz1, horiz0,
2854  horiz4, horiz5, horiz6, horiz7,
2855  const20, const6, const3);
2856  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
2857  res1 = __msa_ave_u_b(avg1, res1);
      /* Output rows 4-5 are computed before rows 2-3 are stored. */
2858  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
2859  horiz5, horiz6, horiz7, horiz8,
2860  horiz5, horiz4, horiz3, horiz2,
2861  horiz6, horiz7, horiz8, horiz8,
2862  const20, const6, const3);
2863  ST8x2_UB(res1, dst, dst_stride);
2864  dst += 2 * dst_stride;
2865 
2866  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
2867  res0 = __msa_ave_u_b(avg0, res0);
      /* Output rows 6-7; horiz8/horiz7 repeat at the bottom edge. */
2868  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
2869  horiz7, horiz8, horiz8, horiz7,
2870  horiz7, horiz6, horiz5, horiz4,
2871  horiz8, horiz8, horiz7, horiz6,
2872  const20, const6, const3);
2873  ST8x2_UB(res0, dst, dst_stride);
2874  dst += 2 * dst_stride;
2875 
2876  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
2877  res1 = __msa_ave_u_b(avg1, res1);
2878  ST8x2_UB(res1, dst, dst_stride);
2879 }
2880 
2882  int32_t src_stride,
2883  uint8_t *dst,
2884  int32_t dst_stride,
2885  int32_t height)
2886 {
      /*
       * Horizontal "no round" quarter-pel pass without any averaging: the
       * filter output is stored as is, one vector per row.
       *
       * NOTE(review): the line carrying the return type, the function name
       * and the first parameter (used below as `src`) is missing from this
       * listing.  Later code calls
       * hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16),
       * which fits this parameter list - presumably that is this function;
       * confirm against the original file.
       *
       * src, src_stride: source rows; every row is loaded at src and src + 1.
       * dst, dst_stride: destination; advanced by dst_stride after each row.
       * height:          (height >> 2) groups of four rows are processed and
       *                  then one extra row.
       */
      /* 8-bit counter: (height >> 2) is narrowed to uint8_t. */
2887  uint8_t loop_count;
2888  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
2889  v16u8 res;
      /* Byte-reversing shuffle pattern handed to the filter macro. */
2890  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
      /* Filter coefficients: 6 and 3 as bytes, 20 as halfwords. */
2891  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2892  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2893  v8u16 const20 = (v8u16) __msa_ldi_h(20);
2894 
2895  for (loop_count = (height >> 2); loop_count--;) {
      /* Four rows at src and the same four rows starting one byte later. */
2896  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
2897  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
2898  src += (4 * src_stride);
      /* Row 0. */
2899  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2900  const20, const6, const3);
2901  ST_UB(res, dst);
2902  dst += dst_stride;
2903 
      /* Row 1. */
2904  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
2905  const20, const6, const3);
2906  ST_UB(res, dst);
2907  dst += dst_stride;
2908 
      /* Row 2. */
2909  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
2910  const20, const6, const3);
2911  ST_UB(res, dst);
2912  dst += dst_stride;
2913 
      /* Row 3. */
2914  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
2915  const20, const6, const3);
2916  ST_UB(res, dst);
2917  dst += dst_stride;
2918  }
2919 
      /* One extra row after the loop; LD_UB2 with a stride of 1 presumably
       * loads at src and src + 1. */
2920  LD_UB2(src, 1, inp0, inp1);
2921  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
2922  const20, const6, const3);
2923  ST_UB(res, dst);
2924 }
2925 
2927  int32_t src_stride,
2928  uint8_t *dst,
2929  int32_t dst_stride)
2930 {
      /*
       * Two-pass 16x16 case: a horizontal pass into a stack buffer followed
       * by a vertical pass from that buffer into dst.
       *
       * NOTE(review): the line with the return type, function name and first
       * parameter (used below as `src`) is missing from this listing.
       */
      /* 272 = 17 * 16: the horizontal helper is called with height 16 and a
       * row pitch of 16, and it writes one extra row after its loop. */
2931  uint8_t buff[272];
2932 
      /* Horizontal pass (no averaging in the helper). */
2933  hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16);
      /* Vertical pass; this callee is defined outside the visible excerpt. */
2934  vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
2935 }
2936 
2938  int32_t src_stride,
2939  uint8_t *dst,
2940  int32_t dst_stride)
2941 {
      /*
       * 8x8 two-pass case kept in registers.  Nine source rows are filtered
       * horizontally with no averaging (horiz0..horiz8); four vertical steps
       * then yield two output rows each, which are averaged with
       * horizN/horizN+1 and written by ST8x2_UB.
       *
       * NOTE(review): the line with the return type, function name and first
       * parameter (used below as `src`) is missing from this listing.
       */
2942  v16u8 inp0, inp1, inp2, inp3;
2943  v16u8 res0, res1, avg0, avg1;
2944  v16u8 horiz0, horiz1, horiz2, horiz3;
2945  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
      /* Shuffle patterns passed to the horizontal filter macros. */
2946  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2947  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
2948  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
2949  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Filter coefficients, all as bytes here. */
2950  v16u8 const20 = (v16u8) __msa_ldi_b(20);
2951  v16u8 const6 = (v16u8) __msa_ldi_b(6);
2952  v16u8 const3 = (v16u8) __msa_ldi_b(3);
2953 
      /* Rows 0-1: the filter output is kept directly. */
2954  LD_UB2(src, src_stride, inp0, inp1);
2955  src += (2 * src_stride);
2956  horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
2957  mask2, mask3, const20,
2958  const6, const3);
      /* Doubleword 1 of horiz0 replicated - presumably row 1 on its own. */
2959  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
2960 
      /* Rows 2-3. */
2961  LD_UB2(src, src_stride, inp2, inp3);
2962  src += (2 * src_stride);
2963  horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
2964  mask2, mask3, const20,
2965  const6, const3);
2966  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
      /* Rows 4-5. */
2967  LD_UB2(src, src_stride, inp0, inp1);
2968  src += (2 * src_stride);
2969  horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
2970  mask2, mask3, const20,
2971  const6, const3);
2972  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* Vertical step for output rows 0-1.  horiz0/horiz1 are passed more
       * than once - presumably edge mirroring above the first row; verify
       * against the macro definition. */
2973  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
2974  horiz1, horiz2, horiz3, horiz4,
2975  horiz1, horiz0, horiz0, horiz1,
2976  horiz2, horiz3, horiz4, horiz5,
2977  const20, const6, const3);
      /* Average with horiz0/horiz1 packed into one vector, then store.
       * __msa_ave_u_b is the MSA average without the +1 rounding term. */
2978  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
2979  res0 = __msa_ave_u_b(avg0, res0);
2980  ST8x2_UB(res0, dst, dst_stride);
2981  dst += (2 * dst_stride);
2982 
      /* Rows 6-7. */
2983  LD_UB2(src, src_stride, inp2, inp3);
2984  src += (2 * src_stride);
2985  horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
2986  mask2, mask3, const20,
2987  const6, const3);
2988  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
      /* Row 8: a single row, handled by the one-row filter macro. */
2989  inp0 = LD_UB(src);
2990  horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
2991  mask2, mask3, const20,
2992  const6, const3);
      /* Output rows 2-3, averaged with horiz2/horiz3. */
2993  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
2994  horiz3, horiz4, horiz5, horiz6,
2995  horiz3, horiz2, horiz1, horiz0,
2996  horiz4, horiz5, horiz6, horiz7,
2997  const20, const6, const3);
2998  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
2999  res1 = __msa_ave_u_b(avg1, res1);
3002  ST8x2_UB(res1, dst, dst_stride);
3003  dst += (2 * dst_stride);
3004 
      /* Output rows 4-5, averaged with horiz4/horiz5. */
3005  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3006  horiz5, horiz6, horiz7, horiz8,
3007  horiz5, horiz4, horiz3, horiz2,
3008  horiz6, horiz7, horiz8, horiz8,
3009  const20, const6, const3);
3010  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3011  res0 = __msa_ave_u_b(avg0, res0);
3012  ST8x2_UB(res0, dst, dst_stride);
3013  dst += (2 * dst_stride);
3014 
      /* Output rows 6-7; horiz8/horiz7 repeat at the bottom edge. */
3015  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3016  horiz7, horiz8, horiz8, horiz7,
3017  horiz7, horiz6, horiz5, horiz4,
3018  horiz8, horiz8, horiz7, horiz6,
3019  const20, const6, const3);
3020  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3021  res1 = __msa_ave_u_b(avg1, res1);
3022  ST8x2_UB(res1, dst, dst_stride);
3023 }
3024 
3026  int32_t src_stride,
3027  uint8_t *dst,
3028  int32_t dst_stride,
3029  int32_t height)
3030 {
      /*
       * Horizontal "no round" quarter-pel pass that averages the filter
       * output with the source row loaded at src + 1; one vector is stored
       * per row.
       *
       * NOTE(review): the line carrying the return type, the function name
       * and the first parameter (used below as `src`) is missing from this
       * listing.  Later code calls
       * hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16),
       * which fits this parameter list - presumably that is this function;
       * confirm against the original file.
       *
       * src, src_stride: source rows; every row is loaded at src and src + 1.
       * dst, dst_stride: destination; advanced by dst_stride after each row.
       * height:          (height >> 2) groups of four rows are processed and
       *                  then one extra row.
       */
      /* 8-bit counter: (height >> 2) is narrowed to uint8_t. */
3031  uint8_t loop_count;
3032  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
3033  v16u8 res;
      /* Byte-reversing shuffle pattern handed to the filter macro. */
3034  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
      /* Filter coefficients: 6 and 3 as bytes, 20 as halfwords. */
3035  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3036  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3037  v8u16 const20 = (v8u16) __msa_ldi_h(20);
3038 
3039  for (loop_count = (height >> 2); loop_count--;) {
      /* Four rows at src and the same four rows starting one byte later. */
3040  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3041  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3042  src += (4 * src_stride);
      /* Row 0: filter, then average with the load taken at src + 1.
       * __msa_ave_u_b is the MSA average without the +1 rounding term. */
3043  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
3044  const20, const6, const3);
3045  res = __msa_ave_u_b(res, inp1);
3046  ST_UB(res, dst);
3047  dst += dst_stride;
3048 
      /* Row 1. */
3049  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp2, inp3, mask,
3050  const20, const6, const3);
3051  res = __msa_ave_u_b(res, inp3);
3052  ST_UB(res, dst);
3053  dst += dst_stride;
3054 
      /* Row 2. */
3055  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp4, inp5, mask,
3056  const20, const6, const3);
3057  res = __msa_ave_u_b(res, inp5);
3058  ST_UB(res, dst);
3059  dst += dst_stride;
3060 
      /* Row 3. */
3061  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp6, inp7, mask,
3062  const20, const6, const3);
3063  res = __msa_ave_u_b(res, inp7);
3064  ST_UB(res, dst);
3065  dst += dst_stride;
3066  }
3067 
      /* One extra row after the loop; LD_UB2 with a stride of 1 presumably
       * loads at src and src + 1.  Operand order of the average differs from
       * the loop body but the same two vectors are averaged. */
3068  LD_UB2(src, 1, inp0, inp1);
3069  res = APPLY_HORIZ_QPEL_NO_ROUND_FILTER(inp0, inp1, mask,
3070  const20, const6, const3);
3071  res = __msa_ave_u_b(inp1, res);
3072  ST_UB(res, dst);
3073 }
3074 
3076  int32_t src_stride,
3077  uint8_t *dst,
3078  int32_t dst_stride)
3079 {
      /*
       * Two-pass 16x16 case: a horizontal pass into a stack buffer followed
       * by a vertical pass from that buffer into dst.
       *
       * NOTE(review): the line with the return type, function name and first
       * parameter (used below as `src`) is missing from this listing.
       */
      /* 272 = 17 * 16: the horizontal helper is called with height 16 and a
       * row pitch of 16, and it writes one extra row after its loop. */
3080  uint8_t buff[272];
3081 
      /* Horizontal pass (result averaged with the source at src + 1). */
3082  hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
      /* Vertical pass; this callee is defined outside the visible excerpt. */
3083  vert_mc_qpel_no_rnd_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
3084 }
3085 
3087  int32_t src_stride,
3088  uint8_t *dst,
3089  int32_t dst_stride)
3090 {
      /*
       * 8x8 two-pass case kept in registers.  Nine source rows are filtered
       * horizontally and averaged with the source slid by one byte
       * (horiz0..horiz8); four vertical steps then yield two output rows
       * each, which are averaged with horizN/horizN+1 and written by
       * ST8x2_UB.
       *
       * NOTE(review): the line with the return type, function name and first
       * parameter (used below as `src`) is missing from this listing.
       */
3091  v16u8 inp0, inp1, inp2, inp3;
3092  v16u8 res0, res1, avg0, avg1;
3093  v16u8 horiz0, horiz1, horiz2, horiz3;
3094  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
      /* Shuffle patterns passed to the horizontal filter macros. */
3095  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3096  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3097  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3098  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Filter coefficients, all as bytes here. */
3099  v16u8 const20 = (v16u8) __msa_ldi_b(20);
3100  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3101  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3102 
      /* Rows 0-1: filter, then slide both source rows by one byte
       * (SLDI_B2_UB with 1 - presumably so the average uses the bytes from
       * src + 1) and put doubleword 0 of inp1 into doubleword 1 of inp0. */
3103  LD_UB2(src, src_stride, inp0, inp1);
3104  src += (2 * src_stride);
3105  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3106  mask2, mask3, const20,
3107  const6, const3);
3108  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3109 
3110  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3111  horiz0 = __msa_ave_u_b(inp0, res0);
      /* Doubleword 1 of horiz0 replicated - presumably row 1 on its own. */
3112  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
      /* Rows 2-3. */
3113  LD_UB2(src, src_stride, inp2, inp3);
3114  src += (2 * src_stride);
3115  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3116  mask2, mask3, const20,
3117  const6, const3);
3118  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3119 
3120  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3121  horiz2 = __msa_ave_u_b(inp2, res1);
3122  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
      /* Rows 4-5. */
3123  LD_UB2(src, src_stride, inp0, inp1);
3124  src += (2 * src_stride);
3125  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3126  mask2, mask3, const20,
3127  const6, const3);
3128  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3129 
3130  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3131  horiz4 = __msa_ave_u_b(inp0, res0);
3132  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* Vertical step for output rows 0-1.  horiz0/horiz1 are passed more
       * than once - presumably edge mirroring above the first row; verify
       * against the macro definition. */
3133  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3134  horiz1, horiz2, horiz3, horiz4,
3135  horiz1, horiz0, horiz0, horiz1,
3136  horiz2, horiz3, horiz4, horiz5,
3137  const20, const6, const3);
      /* Average with horiz0/horiz1 packed into one vector, then store. */
3138  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
3139  res0 = __msa_ave_u_b(avg0, res0);
3140  ST8x2_UB(res0, dst, dst_stride);
3141  dst += (2 * dst_stride);
3142 
      /* Rows 6-7. */
3143  LD_UB2(src, src_stride, inp2, inp3);
3144  src += (2 * src_stride);
3145  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3146  mask2, mask3, const20,
3147  const6, const3);
3148  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3149 
3150  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3151  horiz6 = __msa_ave_u_b(inp2, res1);
3152  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
      /* Row 8: a single row, filtered and averaged with itself slid by one
       * byte. */
3153  inp0 = LD_UB(src);
3154  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3155  mask2, mask3, const20,
3156  const6, const3);
3157  inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
3158  horiz8 = __msa_ave_u_b(inp0, res0);
      /* Output rows 2-3, averaged with horiz2/horiz3. */
3159  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3160  horiz3, horiz4, horiz5, horiz6,
3161  horiz3, horiz2, horiz1, horiz0,
3162  horiz4, horiz5, horiz6, horiz7,
3163  const20, const6, const3);
3164  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
3165  res1 = __msa_ave_u_b(avg1, res1);
3166  ST8x2_UB(res1, dst, dst_stride);
3167  dst += (2 * dst_stride);
3168 
      /* Output rows 4-5, averaged with horiz4/horiz5. */
3169  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3170  horiz5, horiz6, horiz7, horiz8,
3171  horiz5, horiz4, horiz3, horiz2,
3172  horiz6, horiz7, horiz8, horiz8,
3173  const20, const6, const3);
3174  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3175  res0 = __msa_ave_u_b(avg0, res0);
3176  ST8x2_UB(res0, dst, dst_stride);
3177  dst += (2 * dst_stride);
3178 
      /* Output rows 6-7; horiz8/horiz7 repeat at the bottom edge. */
3179  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3180  horiz7, horiz8, horiz8, horiz7,
3181  horiz7, horiz6, horiz5, horiz4,
3182  horiz8, horiz8, horiz7, horiz6,
3183  const20, const6, const3);
3184  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3185  res1 = __msa_ave_u_b(avg1, res1);
3186  ST8x2_UB(res1, dst, dst_stride);
3187 }
3188 
3190  int32_t src_stride,
3191  uint8_t *dst,
3192  int32_t dst_stride)
3193 {
      /*
       * Two-pass 16x16 case: a horizontal pass into a stack buffer followed
       * by a vertical pass from that buffer into dst.
       *
       * NOTE(review): the line with the return type, function name and first
       * parameter (used below as `src`) is missing from this listing.
       */
      /* 272 = 17 * 16: the horizontal helper is called with height 16 and a
       * row pitch of 16, and it writes one extra row after its loop. */
3194  uint8_t buff[272];
3195 
      /* Horizontal pass (result averaged with the unshifted source). */
3196  hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
      /* Vertical pass; this callee is defined outside the visible excerpt. */
3197  vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride);
3198 }
3199 
3201  int32_t src_stride,
3202  uint8_t *dst,
3203  int32_t dst_stride)
3204 {
      /*
       * 8x8 two-pass case kept in registers.  Nine source rows are filtered
       * horizontally and averaged with the source bytes (horiz0..horiz8);
       * four vertical steps then yield two output rows each, which are
       * written by ST8x2_UB without further averaging.
       *
       * NOTE(review): the line with the return type, function name and first
       * parameter (used below as `src`) is missing from this listing.
       */
3205  v16u8 inp0, inp1, inp2, inp3;
3206  v16u8 res0, res1;
3207  v16u8 horiz0, horiz1, horiz2, horiz3;
3208  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
      /* Shuffle patterns passed to the horizontal filter macros. */
3209  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3210  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3211  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3212  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Filter coefficients, all as bytes here. */
3213  v16u8 const20 = (v16u8) __msa_ldi_b(20);
3214  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3215  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3216 
      /* Rows 0-1: filter both rows, then average with the source.  ilvr_d
       * presumably packs the low 8 bytes of inp0 and inp1 into one vector. */
3217  LD_UB2(src, src_stride, inp0, inp1);
3218  src += (2 * src_stride);
3219  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3220  mask2, mask3, const20,
3221  const6, const3);
3222  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3223  horiz0 = __msa_ave_u_b(inp0, res0);
      /* Doubleword 1 of horiz0 replicated - presumably row 1 on its own. */
3224  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
      /* Rows 2-3. */
3225  LD_UB2(src, src_stride, inp2, inp3);
3226  src += (2 * src_stride);
3227  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3228  mask2, mask3, const20,
3229  const6, const3);
3230  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3231  horiz2 = __msa_ave_u_b(inp2, res1);
3232  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
      /* Rows 4-5. */
3233  LD_UB2(src, src_stride, inp0, inp1);
3234  src += (2 * src_stride);
3235  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3236  mask2, mask3, const20,
3237  const6, const3);
3238  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3239  horiz4 = __msa_ave_u_b(inp0, res0);
3240  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* Vertical step for output rows 0-1.  horiz0/horiz1 are passed more
       * than once - presumably edge mirroring above the first row; verify
       * against the macro definition. */
3241  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3242  horiz1, horiz2, horiz3, horiz4,
3243  horiz1, horiz0, horiz0, horiz1,
3244  horiz2, horiz3, horiz4, horiz5,
3245  const20, const6, const3);
3246 
      /* Rows 6-7 are loaded before output rows 0-1 are stored. */
3247  LD_UB2(src, src_stride, inp2, inp3);
3248  src += (2 * src_stride);
3249  ST8x2_UB(res0, dst, dst_stride);
3250  dst += 2 * dst_stride;
3251 
3252  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3253  mask2, mask3, const20,
3254  const6, const3);
3255  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3256  horiz6 = __msa_ave_u_b(inp2, res1);
3257  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
      /* Row 8: a single row, handled by the one-row filter macro. */
3258  inp0 = LD_UB(src);
3259  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3260  mask2, mask3, const20,
3261  const6, const3);
3262  horiz8 = __msa_ave_u_b(inp0, res0);
      /* Output rows 2-3. */
3263  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3264  horiz3, horiz4, horiz5, horiz6,
3265  horiz3, horiz2, horiz1, horiz0,
3266  horiz4, horiz5, horiz6, horiz7,
3267  const20, const6, const3);
      /* Output rows 4-5. */
3268  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3269  horiz5, horiz6, horiz7, horiz8,
3270  horiz5, horiz4, horiz3, horiz2,
3271  horiz6, horiz7, horiz8, horiz8,
3272  const20, const6, const3);
3273  ST8x2_UB(res1, dst, dst_stride);
3274  dst += 2 * dst_stride;
3275 
3276  ST8x2_UB(res0, dst, dst_stride);
3277  dst += (2 * dst_stride);
3278 
      /* Output rows 6-7; horiz8/horiz7 repeat at the bottom edge. */
3279  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3280  horiz7, horiz8, horiz8, horiz7,
3281  horiz7, horiz6, horiz5, horiz4,
3282  horiz8, horiz8, horiz7, horiz6,
3283  const20, const6, const3);
3284  ST8x2_UB(res1, dst, dst_stride);
3285 }
3286 
3288  int32_t src_stride,
3289  uint8_t *dst,
3290  int32_t dst_stride)
3291 {
      /*
       * Two-pass 16x16 case: a horizontal pass into a stack buffer followed
       * by a vertical pass from that buffer into dst.
       *
       * NOTE(review): the line with the return type, function name and first
       * parameter (used below as `src`) is missing from this listing.
       */
      /* 272 = 17 * 16: the horizontal helper is called with height 16 and a
       * row pitch of 16, and it writes one extra row after its loop. */
3292  uint8_t buff[272];
3293 
      /* Horizontal pass (no averaging in the helper). */
3294  hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16);
      /* Vertical pass; this callee is defined outside the visible excerpt. */
3295  vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride);
3296 }
3297 
3299  int32_t src_stride,
3300  uint8_t *dst,
3301  int32_t dst_stride)
3302 {
      /*
       * 8x8 two-pass case kept in registers, with no averaging at either
       * stage.  Nine source rows are filtered horizontally
       * (horiz0..horiz8); four vertical steps then yield two output rows
       * each, which are written by ST8x2_UB.
       *
       * NOTE(review): the line with the return type, function name and first
       * parameter (used below as `src`) is missing from this listing.
       */
3303  v16u8 inp0, inp1, inp2, inp3;
3304  v16u8 res0, res1;
3305  v16u8 horiz0, horiz1, horiz2, horiz3;
3306  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
      /* Shuffle patterns passed to the horizontal filter macros. */
3307  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3308  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3309  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3310  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Filter coefficients, all as bytes here. */
3311  v16u8 const20 = (v16u8) __msa_ldi_b(20);
3312  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3313  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3314 
      /* Rows 0-1: the filter output is kept directly. */
3315  LD_UB2(src, src_stride, inp0, inp1);
3316  src += (2 * src_stride);
3317  horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3318  mask2, mask3, const20,
3319  const6, const3);
      /* Doubleword 1 of horiz0 replicated - presumably row 1 on its own. */
3320  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
      /* Rows 2-3. */
3321  LD_UB2(src, src_stride, inp2, inp3);
3322  src += (2 * src_stride);
3323  horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3324  mask2, mask3, const20,
3325  const6, const3);
3326  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
      /* Rows 4-5. */
3327  LD_UB2(src, src_stride, inp0, inp1);
3328  src += (2 * src_stride);
3329  horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3330  mask2, mask3, const20,
3331  const6, const3);
3332  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* Vertical step for output rows 0-1.  horiz0/horiz1 are passed more
       * than once - presumably edge mirroring above the first row; verify
       * against the macro definition. */
3333  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3334  horiz1, horiz2, horiz3, horiz4,
3335  horiz1, horiz0, horiz0, horiz1,
3336  horiz2, horiz3, horiz4, horiz5,
3337  const20, const6, const3);
      /* Rows 6-7 are loaded before output rows 0-1 are stored. */
3338  LD_UB2(src, src_stride, inp2, inp3);
3339  src += (2 * src_stride);
3340  ST8x2_UB(res0, dst, dst_stride);
3341  dst += 2 * dst_stride;
3342 
3343  horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3344  mask2, mask3, const20,
3345  const6, const3);
3346  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
      /* Row 8: a single row, handled by the one-row filter macro. */
3347  inp0 = LD_UB(src);
3348  horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3349  mask2, mask3, const20,
3350  const6, const3);
      /* Output rows 2-3. */
3351  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3352  horiz3, horiz4, horiz5, horiz6,
3353  horiz3, horiz2, horiz1, horiz0,
3354  horiz4, horiz5, horiz6, horiz7,
3355  const20, const6, const3);
      /* Output rows 4-5. */
3356  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3357  horiz5, horiz6, horiz7, horiz8,
3358  horiz5, horiz4, horiz3, horiz2,
3359  horiz6, horiz7, horiz8, horiz8,
3360  const20, const6, const3);
3361  ST8x2_UB(res1, dst, dst_stride);
3362  dst += 2 * dst_stride;
3363 
3364 
      /* Output rows 6-7; horiz8/horiz7 repeat at the bottom edge. */
3365  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3366  horiz7, horiz8, horiz8, horiz7,
3367  horiz7, horiz6, horiz5, horiz4,
3368  horiz8, horiz8, horiz7, horiz6,
3369  const20, const6, const3);
3370  ST8x2_UB(res0, dst, dst_stride);
3371  dst += 2 * dst_stride;
3372  ST8x2_UB(res1, dst, dst_stride);
3373 }
3374 
3376  int32_t src_stride,
3377  uint8_t *dst,
3378  int32_t dst_stride)
3379 {
      /*
       * Two-pass 16x16 case: a horizontal pass into a stack buffer followed
       * by a vertical pass from that buffer into dst.
       *
       * NOTE(review): the line with the return type, function name and first
       * parameter (used below as `src`) is missing from this listing.
       */
      /* 272 = 17 * 16: the horizontal helper is called with height 16 and a
       * row pitch of 16, and it writes one extra row after its loop. */
3380  uint8_t buff[272];
3381 
      /* Horizontal pass (result averaged with the source at src + 1). */
3382  hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
      /* Vertical pass; this callee is defined outside the visible excerpt. */
3383  vert_mc_qpel_no_rnd_16x16_msa(buff, 16, dst, dst_stride);
3384 }
3385 
3387  int32_t src_stride,
3388  uint8_t *dst,
3389  int32_t dst_stride)
3390 {
      /*
       * 8x8 two-pass case kept in registers.  Nine source rows are filtered
       * horizontally and averaged with the source slid by one byte
       * (horiz0..horiz8); four vertical steps then yield two output rows
       * each, which are written by ST8x2_UB without further averaging.
       *
       * NOTE(review): the line with the return type, function name and first
       * parameter (used below as `src`) is missing from this listing.
       */
3391  v16u8 inp0, inp1, inp2, inp3;
3392  v16u8 res0, res1;
3393  v16u8 horiz0, horiz1, horiz2, horiz3;
3394  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
      /* Shuffle patterns passed to the horizontal filter macros. */
3395  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3396  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3397  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3398  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Filter coefficients, all as bytes here. */
3399  v16u8 const20 = (v16u8) __msa_ldi_b(20);
3400  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3401  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3402 
      /* Rows 0-1: filter, then slide both source rows by one byte
       * (SLDI_B2_UB with 1 - presumably so the average uses the bytes from
       * src + 1) and put doubleword 0 of inp1 into doubleword 1 of inp0. */
3403  LD_UB2(src, src_stride, inp0, inp1);
3404  src += (2 * src_stride);
3405  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3406  mask2, mask3, const20,
3407  const6, const3);
3408  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3409 
3410  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3411  horiz0 = __msa_ave_u_b(inp0, res0);
      /* Doubleword 1 of horiz0 replicated - presumably row 1 on its own. */
3412  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
      /* Rows 2-3. */
3413  LD_UB2(src, src_stride, inp2, inp3);
3414  src += (2 * src_stride);
3415  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3416  mask2, mask3, const20,
3417  const6, const3);
3418  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3419 
3420  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3421  horiz2 = __msa_ave_u_b(inp2, res1);
3422  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
      /* Rows 4-5. */
3423  LD_UB2(src, src_stride, inp0, inp1);
3424  src += (2 * src_stride);
3425  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3426  mask2, mask3, const20,
3427  const6, const3);
3428  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3429 
3430  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3431  horiz4 = __msa_ave_u_b(inp0, res0);
3432  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* Vertical step for output rows 0-1.  horiz0/horiz1 are passed more
       * than once - presumably edge mirroring above the first row; verify
       * against the macro definition. */
3433  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3434  horiz1, horiz2, horiz3, horiz4,
3435  horiz1, horiz0, horiz0, horiz1,
3436  horiz2, horiz3, horiz4, horiz5,
3437  const20, const6, const3);
      /* Rows 6-7 are loaded before output rows 0-1 are stored. */
3438  LD_UB2(src, src_stride, inp2, inp3);
3439  src += (2 * src_stride);
3440  ST8x2_UB(res0, dst, dst_stride);
3441  dst += 2 * dst_stride;
3442 
3443  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3444  mask2, mask3, const20,
3445  const6, const3);
3446  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3447 
3448  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3449  horiz6 = __msa_ave_u_b(inp2, res1);
3450  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
      /* Row 8: a single row, filtered and averaged with itself slid by one
       * byte. */
3451  inp0 = LD_UB(src);
3452  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3453  mask2, mask3, const20,
3454  const6, const3);
3455  inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
3456  horiz8 = __msa_ave_u_b(inp0, res0);
      /* Output rows 2-3. */
3457  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3458  horiz3, horiz4, horiz5, horiz6,
3459  horiz3, horiz2, horiz1, horiz0,
3460  horiz4, horiz5, horiz6, horiz7,
3461  const20, const6, const3);
      /* Output rows 4-5. */
3462  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3463  horiz5, horiz6, horiz7, horiz8,
3464  horiz5, horiz4, horiz3, horiz2,
3465  horiz6, horiz7, horiz8, horiz8,
3466  const20, const6, const3);
3467  ST8x2_UB(res1, dst, dst_stride);
3468  dst += 2 * dst_stride;
3469 
      /* Output rows 6-7; horiz8/horiz7 repeat at the bottom edge. */
3470  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3471  horiz7, horiz8, horiz8, horiz7,
3472  horiz7, horiz6, horiz5, horiz4,
3473  horiz8, horiz8, horiz7, horiz6,
3474  const20, const6, const3);
3475  ST8x2_UB(res0, dst, dst_stride);
3476  dst += 2 * dst_stride;
3477  ST8x2_UB(res1, dst, dst_stride);
3478 }
3479 
3481  int32_t src_stride,
3482  uint8_t *dst,
3483  int32_t dst_stride)
3484 {
      /*
       * Two-pass 16x16 case: a horizontal pass into a stack buffer followed
       * by a vertical pass from that buffer into dst.
       *
       * NOTE(review): the line with the return type, function name and first
       * parameter (used below as `src`) is missing from this listing.
       */
      /* 272 = 17 * 16: the horizontal helper is called with height 16 and a
       * row pitch of 16, and it writes one extra row after its loop. */
3485  uint8_t buff[272];
3486 
      /* Horizontal pass (result averaged with the unshifted source). */
3487  hv_mc_qpel_no_rnd_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
      /* Vertical pass; this callee is defined outside the visible excerpt. */
3488  vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
3489 }
3490 
3492  int32_t src_stride,
3493  uint8_t *dst,
3494  int32_t dst_stride)
3495 {
      /*
       * 8x8 two-pass case kept in registers.  Nine source rows are filtered
       * horizontally and averaged with the source bytes (horiz0..horiz8);
       * four vertical steps then yield two output rows each, which are
       * averaged with horizN+1/horizN+2 (one row further down than the
       * output row index) and written by ST8x2_UB.
       *
       * NOTE(review): the line with the return type, function name and first
       * parameter (used below as `src`) is missing from this listing.
       */
3496  v16u8 inp0, inp1, inp2, inp3;
3497  v16u8 res0, res1, avg0, avg1;
3498  v16u8 horiz0, horiz1, horiz2, horiz3;
3499  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
      /* Shuffle patterns passed to the horizontal filter macros. */
3500  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3501  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3502  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3503  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Filter coefficients, all as bytes here. */
3504  v16u8 const20 = (v16u8) __msa_ldi_b(20);
3505  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3506  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3507 
      /* Rows 0-1: filter both rows, then average with the source.  ilvr_d
       * presumably packs the low 8 bytes of inp0 and inp1 into one vector. */
3508  LD_UB2(src, src_stride, inp0, inp1);
3509  src += (2 * src_stride);
3510  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3511  mask2, mask3, const20,
3512  const6, const3);
3513  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3514  horiz0 = __msa_ave_u_b(inp0, res0);
      /* Doubleword 1 of horiz0 replicated - presumably row 1 on its own. */
3515  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
      /* Rows 2-3. */
3516  LD_UB2(src, src_stride, inp2, inp3);
3517  src += (2 * src_stride);
3518  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3519  mask2, mask3, const20,
3520  const6, const3);
3521  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3522  horiz2 = __msa_ave_u_b(inp2, res1);
3523  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
      /* Rows 4-5. */
3524  LD_UB2(src, src_stride, inp0, inp1);
3525  src += (2 * src_stride);
3526  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3527  mask2, mask3, const20,
3528  const6, const3);
3529  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3530  horiz4 = __msa_ave_u_b(inp0, res0);
3531  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* Vertical step for output rows 0-1.  horiz0/horiz1 are passed more
       * than once - presumably edge mirroring above the first row; verify
       * against the macro definition. */
3532  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3533  horiz1, horiz2, horiz3, horiz4,
3534  horiz1, horiz0, horiz0, horiz1,
3535  horiz2, horiz3, horiz4, horiz5,
3536  const20, const6, const3);
      /* Average with horiz1/horiz2 packed into one vector, then store. */
3537  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
3538  res0 = __msa_ave_u_b(avg0, res0);
3539  ST8x2_UB(res0, dst, dst_stride);
3540  dst += (2 * dst_stride);
3541 
      /* Rows 6-7. */
3542  LD_UB2(src, src_stride, inp2, inp3);
3543  src += (2 * src_stride);
3544  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3545  mask2, mask3, const20,
3546  const6, const3);
3547  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3548  horiz6 = __msa_ave_u_b(inp2, res1);
3549  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
      /* Row 8: a single row, handled by the one-row filter macro. */
3550  inp0 = LD_UB(src);
3551  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3552  mask2, mask3, const20,
3553  const6, const3);
3554  horiz8 = __msa_ave_u_b(inp0, res0);
      /* Output rows 2-3, averaged with horiz3/horiz4. */
3555  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3556  horiz3, horiz4, horiz5, horiz6,
3557  horiz3, horiz2, horiz1, horiz0,
3558  horiz4, horiz5, horiz6, horiz7,
3559  const20, const6, const3);
3560  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3561  res1 = __msa_ave_u_b(avg1, res1);
      /* Output rows 4-5 are computed before rows 2-3 are stored. */
3562  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3563  horiz5, horiz6, horiz7, horiz8,
3564  horiz5, horiz4, horiz3, horiz2,
3565  horiz6, horiz7, horiz8, horiz8,
3566  const20, const6, const3);
3567  ST8x2_UB(res1, dst, dst_stride);
3568  dst += 2 * dst_stride;
3569 
      /* Rows 4-5 averaged with horiz5/horiz6. */
3570  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3571  res0 = __msa_ave_u_b(avg0, res0);
3572 
      /* Output rows 6-7; horiz8/horiz7 repeat at the bottom edge. */
3573  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3574  horiz7, horiz8, horiz8, horiz7,
3575  horiz7, horiz6, horiz5, horiz4,
3576  horiz8, horiz8, horiz7, horiz6,
3577  const20, const6, const3);
3578  ST8x2_UB(res0, dst, dst_stride);
3579  dst += 2 * dst_stride;
3580 
      /* Rows 6-7 averaged with horiz7/horiz8. */
3581  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3582  res1 = __msa_ave_u_b(avg1, res1);
3583  ST8x2_UB(res1, dst, dst_stride);
3584 }
3585 
3587  int32_t src_stride,
3588  uint8_t *dst,
3589  int32_t dst_stride)
3590 {
      /*
       * Two-pass 16x16 case: a horizontal pass into a stack buffer followed
       * by a vertical pass from that buffer into dst.
       *
       * NOTE(review): the line with the return type, function name and first
       * parameter (used below as `src`) is missing from this listing.
       */
      /* 272 = 17 * 16: the horizontal helper is called with height 16 and a
       * row pitch of 16, and it writes one extra row after its loop. */
3591  uint8_t buff[272];
3592 
      /* Horizontal pass (no averaging in the helper). */
3593  hv_mc_qpel_no_rnd_horiz_16x16_msa(src, src_stride, buff, 16, 16);
      /* Vertical pass; this callee is defined outside the visible excerpt. */
3594  vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
3595 }
3596 
/*
 * 8-wide block: a horizontal no-round qpel filter is applied to the loaded
 * source rows (results horiz0..horiz8), a vertical no-round qpel filter is
 * applied to those results, and every vertical result is averaged with the
 * horizontal rows horiz1..horiz8 using __msa_ave_u_b (truncating average)
 * before being stored to dst.
 * The APPLY_*_FILTER_*, LD_UB* and ST8x2_UB helpers are macros defined outside
 * this section; remarks about them below are presumptions - TODO confirm.
 * NOTE(review): the line holding this function's name and first parameter is
 * absent from this listing; the body reads `src`, presumably a
 * `const uint8_t *` as in the complete definitions nearby - confirm.
 */
3598  int32_t src_stride,
3599  uint8_t *dst,
3600  int32_t dst_stride)
3601 {
3602  v16u8 inp0, inp1, inp2, inp3;
3603  v16u8 res0, res1, avg0, avg1;
3604  v16u8 horiz0, horiz1, horiz2, horiz3;
3605  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte index tables handed to the horizontal filter macros. */
3606  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3607  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3608  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3609  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter weights 20, 6 and 3 replicated into every byte lane. */
3610  v16u8 const20 = (v16u8) __msa_ldi_b(20);
3611  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3612  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3613 
/* Horizontal pass, two source rows per macro call. The upper doubleword of
 * each result is replicated into the odd-numbered horizN vector. */
3614  LD_UB2(src, src_stride, inp0, inp1);
3615  src += (2 * src_stride);
3616  horiz0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3617  mask2, mask3, const20,
3618  const6, const3);
3619  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3620  LD_UB2(src, src_stride, inp2, inp3);
3621  src += (2 * src_stride);
3622  horiz2 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3623  mask2, mask3, const20,
3624  const6, const3);
3625  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3626  LD_UB2(src, src_stride, inp0, inp1);
3627  src += (2 * src_stride);
3628  horiz4 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3629  mask2, mask3, const20,
3630  const6, const3);
3631  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* First vertical result; repeated horiz0/horiz1 arguments stand in for rows
 * above the block. */
3632  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3633  horiz1, horiz2, horiz3, horiz4,
3634  horiz1, horiz0, horiz0, horiz1,
3635  horiz2, horiz3, horiz4, horiz5,
3636  const20, const6, const3);
/* avg0 = low halves of horiz1 (low doubleword) and horiz2 (high doubleword). */
3637  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
3638  res0 = __msa_ave_u_b(avg0, res0);
3639  LD_UB2(src, src_stride, inp2, inp3);
3640  src += (2 * src_stride);
3641  ST8x2_UB(res0, dst, dst_stride);
3642  dst += 2 * dst_stride;
3643 
3644  horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3645  mask2, mask3, const20,
3646  const6, const3);
3647  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3648  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3649  horiz3, horiz4, horiz5, horiz6,
3650  horiz3, horiz2, horiz1, horiz0,
3651  horiz4, horiz5, horiz6, horiz7,
3652  const20, const6, const3);
3653  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3654  res1 = __msa_ave_u_b(avg1, res1);
/* Last source row is loaded and filtered on its own. */
3655  inp0 = LD_UB(src);
3656  horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3657  mask2, mask3, const20,
3658  const6, const3);
3659  ST8x2_UB(res1, dst, dst_stride);
3660  dst += 2 * dst_stride;
3661 
/* Remaining vertical results; repeated horiz8/horiz7 arguments stand in for
 * rows below the block. */
3662  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3663  horiz5, horiz6, horiz7, horiz8,
3664  horiz5, horiz4, horiz3, horiz2,
3665  horiz6, horiz7, horiz8, horiz8,
3666  const20, const6, const3);
3667  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3668  res0 = __msa_ave_u_b(avg0, res0);
3669  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3670  horiz7, horiz8, horiz8, horiz7,
3671  horiz7, horiz6, horiz5, horiz4,
3672  horiz8, horiz8, horiz7, horiz6,
3673  const20, const6, const3);
3674  ST8x2_UB(res0, dst, dst_stride);
3675  dst += 2 * dst_stride;
3676 
3677  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3678  res1 = __msa_ave_u_b(avg1, res1);
3679  ST8x2_UB(res1, dst, dst_stride);
3680 }
3681 
/*
 * 16x16 two-pass wrapper: hv_mc_qpel_no_rnd_horiz_src1_16x16_msa() processes
 * src into a local scratch buffer, then
 * vert_mc_qpel_no_rnd_aver_src1_16x16_msa() processes that buffer into dst.
 * NOTE(review): the line holding this function's name and first parameter is
 * absent from this listing; the body reads `src`, presumably a
 * `const uint8_t *` as in the complete definitions nearby - confirm.
 */
3683  int32_t src_stride,
3684  uint8_t *dst,
3685  int32_t dst_stride)
3686 {
/* 272 = 17 * 16 bytes. The first pass is given stride 16 and height 16, so it
 * presumably emits 17 rows of 16 bytes - TODO confirm in that helper. */
3687  uint8_t buff[272];
3688 
3689  hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
3690  vert_mc_qpel_no_rnd_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
3691 }
3692 
/*
 * 8-wide block, no-round variant with averaging in both passes:
 *  - each horizontal filter result is averaged (__msa_ave_u_b, truncating)
 *    with the source rows after they went through SLDI_B2_UB / __msa_sldi_b
 *    with a count of 1, giving horiz0..horiz8;
 *  - each vertical filter result is averaged (truncating) with the rows
 *    horiz1..horiz8 before being stored to dst.
 * The APPLY_*_FILTER_*, LD_UB*, SLDI_B2_UB and ST8x* helpers are macros
 * defined outside this section; remarks about them are presumptions - TODO
 * confirm.
 * NOTE(review): the line holding this function's name and first parameter is
 * absent from this listing; the body reads `src`, presumably a
 * `const uint8_t *` as in the complete definitions nearby - confirm.
 */
3694  int32_t src_stride,
3695  uint8_t *dst,
3696  int32_t dst_stride)
3697 {
3698  v16u8 inp0, inp1, inp2, inp3;
3699  v16u8 res0, res1, avg0, avg1;
3700  v16u8 horiz0, horiz1, horiz2, horiz3;
3701  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte index tables handed to the horizontal filter macros. */
3702  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3703  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3704  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3705  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter weights 20, 6 and 3 replicated into every byte lane. */
3706  v16u8 const20 = (v16u8) __msa_ldi_b(20);
3707  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3708  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3709 
3710  LD_UB2(src, src_stride, inp0, inp1);
3711  src += (2 * src_stride);
3712  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3713  mask2, mask3, const20,
3714  const6, const3);
/* Presumably slides both rows by one byte so they line up with src + 1. */
3715  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3716 
/* Pack the low doubleword of inp1 into the high doubleword of inp0. */
3717  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3718  horiz0 = __msa_ave_u_b(inp0, res0);
3719  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3720  LD_UB2(src, src_stride, inp2, inp3);
3721  src += (2 * src_stride);
3722  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3723  mask2, mask3, const20,
3724  const6, const3);
3725  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3726 
3727  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3728  horiz2 = __msa_ave_u_b(inp2, res1);
3729  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3730  LD_UB2(src, src_stride, inp0, inp1);
3731  src += (2 * src_stride);
3732  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
3733  mask2, mask3, const20,
3734  const6, const3);
3735 
3736  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
3737  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
3738  horiz4 = __msa_ave_u_b(inp0, res0);
3739  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* First vertical result; repeated horiz0/horiz1 arguments stand in for rows
 * above the block. */
3740  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3741  horiz1, horiz2, horiz3, horiz4,
3742  horiz1, horiz0, horiz0, horiz1,
3743  horiz2, horiz3, horiz4, horiz5,
3744  const20, const6, const3);
/* avg0 = low halves of horiz1 (low doubleword) and horiz2 (high doubleword). */
3745  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
3746  res0 = __msa_ave_u_b(avg0, res0);
3747  ST8x2_UB(res0, dst, dst_stride);
3748  dst += (2 * dst_stride);
3749 
3750  LD_UB2(src, src_stride, inp2, inp3);
3751  src += (2 * src_stride);
3752  res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
3753  mask2, mask3, const20,
3754  const6, const3);
3755  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
3756 
3757  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
3758  horiz6 = __msa_ave_u_b(inp2, res1);
3759  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3760  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3761  horiz3, horiz4, horiz5, horiz6,
3762  horiz3, horiz2, horiz1, horiz0,
3763  horiz4, horiz5, horiz6, horiz7,
3764  const20, const6, const3);
3765  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
3766  res1 = __msa_ave_u_b(avg1, res1);
3767  ST8x2_UB(res1, dst, dst_stride);
3768  dst += (2 * dst_stride);
3769 
/* Last source row: filtered alone, then averaged with itself slid by one
 * byte. */
3770  inp0 = LD_UB(src);
3771  res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
3772  mask2, mask3, const20,
3773  const6, const3);
3774  inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
3775  horiz8 = __msa_ave_u_b(inp0, res0);
/* Remaining vertical results; repeated horiz8/horiz7 arguments stand in for
 * rows below the block. */
3776  res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3777  horiz5, horiz6, horiz7, horiz8,
3778  horiz5, horiz4, horiz3, horiz2,
3779  horiz6, horiz7, horiz8, horiz8,
3780  const20, const6, const3);
3781  res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3782  horiz7, horiz8, horiz8, horiz7,
3783  horiz7, horiz6, horiz5, horiz4,
3784  horiz8, horiz8, horiz7, horiz6,
3785  const20, const6, const3);
3786  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
3787  res0 = __msa_ave_u_b(avg0, res0);
3788  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
3789  res1 = __msa_ave_u_b(avg1, res1);
3790  ST8x4_UB(res0, res1, dst, dst_stride);
3791 }
3792 
/*
 * Horizontal qpel pass over 16-byte-wide rows. For every row the vectors
 * loaded at src and src + 1 go through APPLY_HORIZ_QPEL_FILTER, and the
 * result is averaged with the vector loaded at src using __msa_aver_u_b
 * (rounding average) before being stored to dst.
 * The loop handles four rows per iteration for (height >> 2) iterations, and
 * one further row is processed after the loop, so height + 1 rows are written
 * when height is a multiple of four.
 * LD_UB4, LD_UB2 and ST_UB are macros defined outside this section.
 * NOTE(review): the line holding this function's name and first parameter is
 * absent from this listing; the body reads `src`, presumably a
 * `const uint8_t *` as in the complete definitions nearby - confirm.
 */
3794  int32_t src_stride,
3795  uint8_t *dst,
3796  int32_t dst_stride,
3797  int32_t height)
3798 {
3799  uint8_t loop_count;
3800  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
3801  v16u8 res;
/* Descending byte indices 15..0, passed to the filter macro as its shuffle
 * mask. */
3802  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
/* Weights 6 and 3 are per-byte; weight 20 is per-halfword. */
3803  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3804  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3805  v8u16 const20 = (v8u16) __msa_ldi_h(20);
3806 
3807  for (loop_count = (height >> 2); loop_count--;) {
/* Even-numbered inpN come from src, odd-numbered from src + 1. */
3808  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3809  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3810  src += (4 * src_stride);
3811  res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
3812  const20, const6, const3);
3813  res = __msa_aver_u_b(inp0, res);
3814  ST_UB(res, dst);
3815  dst += dst_stride;
3816 
3817  res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
3818  const20, const6, const3);
3819  res = __msa_aver_u_b(inp2, res);
3820  ST_UB(res, dst);
3821  dst += dst_stride;
3822 
3823  res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
3824  const20, const6, const3);
3825  res = __msa_aver_u_b(inp4, res);
3826  ST_UB(res, dst);
3827  dst += dst_stride;
3828 
3829  res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
3830  const20, const6, const3);
3831  res = __msa_aver_u_b(inp6, res);
3832  ST_UB(res, dst);
3833  dst += dst_stride;
3834  }
3835 
/* Trailing row: the stride argument of 1 makes the second load start at
 * src + 1. */
3836  LD_UB2(src, 1, inp0, inp1);
3837  res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
3838  res = __msa_aver_u_b(inp0, res);
3839  ST_UB(res, dst);
3840 }
3841 
/*
 * 16x16 two-pass wrapper: hv_mc_qpel_aver_horiz_src0_16x16_msa() processes
 * src into a local scratch buffer, then vert_mc_qpel_aver_src0_16x16_msa()
 * processes that buffer into dst.
 * NOTE(review): the line holding this function's name and first parameter is
 * absent from this listing; the body reads `src`, presumably a
 * `const uint8_t *` as in the complete definitions nearby - confirm.
 */
3843  int32_t src_stride,
3844  uint8_t *dst,
3845  int32_t dst_stride)
3846 {
/* 272 = 17 * 16 bytes. The first pass is given stride 16 and height 16, so it
 * presumably emits 17 rows of 16 bytes - TODO confirm in that helper. */
3847  uint8_t buff[272];
3848 
3849  hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
3850  vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
3851 }
3852 
/*
 * 8-wide block with rounding averages (__msa_aver_u_b) in both passes:
 *  - each horizontal filter result is averaged with the low halves of the
 *    source rows it was computed from, giving horiz0..horiz8;
 *  - each vertical filter result is averaged with the rows horiz0..horiz7
 *    before being stored to dst.
 * The APPLY_*_FILTER_*, LD_UB* and ST8x2_UB helpers are macros defined outside
 * this section; remarks about them are presumptions - TODO confirm.
 * NOTE(review): the line holding this function's name and first parameter is
 * absent from this listing; the body reads `src`, presumably a
 * `const uint8_t *` as in the complete definitions nearby - confirm.
 */
3854  int32_t src_stride,
3855  uint8_t *dst,
3856  int32_t dst_stride)
3857 {
3858  v16u8 inp0, inp1, inp2, inp3;
3859  v16u8 res0, res1, avg0, avg1;
3860  v16u8 horiz0, horiz1, horiz2, horiz3;
3861  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte index tables handed to the horizontal filter macros. */
3862  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3863  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
3864  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
3865  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter weights 20, 6 and 3 replicated into every byte lane. */
3866  v16u8 const20 = (v16u8) __msa_ldi_b(20);
3867  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3868  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3869 
3870  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
3871  src += (4 * src_stride);
3872  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
3873  const20, const6, const3);
3874  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
3875  const20, const6, const3);
/* Pack the low doublewords of inp0 (low) and inp1 (high) into one vector. */
3876  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3877  horiz0 = __msa_aver_u_b(inp0, res0);
3878  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
3879  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3880  horiz2 = __msa_aver_u_b(inp2, res1);
3881  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
3882  LD_UB2(src, src_stride, inp0, inp1);
3883  src += (2 * src_stride);
3884  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
3885  const20, const6, const3);
3886  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
3887  horiz4 = __msa_aver_u_b(inp0, res0);
3888  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* First vertical result; repeated horiz0/horiz1 arguments stand in for rows
 * above the block. */
3889  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
3890  horiz1, horiz2, horiz3, horiz4,
3891  horiz1, horiz0, horiz0, horiz1,
3892  horiz2, horiz3, horiz4, horiz5,
3893  const20, const6, const3);
/* avg0 = low halves of horiz0 (low doubleword) and horiz1 (high doubleword). */
3894  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
3895  res0 = __msa_aver_u_b(avg0, res0);
3896  ST8x2_UB(res0, dst, dst_stride);
3897  dst += (2 * dst_stride);
3898 
3899  LD_UB2(src, src_stride, inp2, inp3);
3900  src += (2 * src_stride);
3901  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
3902  const20, const6, const3);
3903  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
3904  horiz6 = __msa_aver_u_b(inp2, res1);
3905  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
3906  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
3907  horiz3, horiz4, horiz5, horiz6,
3908  horiz3, horiz2, horiz1, horiz0,
3909  horiz4, horiz5, horiz6, horiz7,
3910  const20, const6, const3);
3911  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
3912  res1 = __msa_aver_u_b(avg1, res1);
3913 
/* Last source row: filtered alone and averaged with the unshifted load. */
3914  inp0 = LD_UB(src);
3915  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
3916  const20, const6, const3);
3917  horiz8 = __msa_aver_u_b(inp0, res0);
3918  ST8x2_UB(res1, dst, dst_stride);
3919  dst += 2 * dst_stride;
3920 
/* Remaining vertical results; repeated horiz8/horiz7 arguments stand in for
 * rows below the block. */
3921  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
3922  horiz5, horiz6, horiz7, horiz8,
3923  horiz5, horiz4, horiz3, horiz2,
3924  horiz6, horiz7, horiz8, horiz8,
3925  const20, const6, const3);
3926  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
3927  res0 = __msa_aver_u_b(avg0, res0);
3928  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
3929  horiz7, horiz8, horiz8, horiz7,
3930  horiz7, horiz6, horiz5, horiz4,
3931  horiz8, horiz8, horiz7, horiz6,
3932  const20, const6, const3);
3933  ST8x2_UB(res0, dst, dst_stride);
3934  dst += 2 * dst_stride;
3935  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
3936  res1 = __msa_aver_u_b(avg1, res1);
3937  ST8x2_UB(res1, dst, dst_stride);
3938 }
3939 
/*
 * Horizontal qpel pass over 16-byte-wide rows with no extra averaging. For
 * every row the vectors loaded at src and src + 1 go through
 * APPLY_HORIZ_QPEL_FILTER and the result is stored to dst unchanged.
 * The loop handles four rows per iteration for (height >> 2) iterations, and
 * one further row is processed after the loop, so height + 1 rows are written
 * when height is a multiple of four.
 * LD_UB4, LD_UB2 and ST_UB are macros defined outside this section.
 * NOTE(review): the line holding this function's name and first parameter is
 * absent from this listing; the body reads `src`, presumably a
 * `const uint8_t *` as in the complete definitions nearby - confirm.
 */
3941  int32_t src_stride,
3942  uint8_t *dst,
3943  int32_t dst_stride,
3944  int32_t height)
3945 {
3946  uint8_t loop_count;
3947  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
3948  v16u8 res;
/* Descending byte indices 15..0, passed to the filter macro as its shuffle
 * mask. */
3949  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
/* Weights 6 and 3 are per-byte; weight 20 is per-halfword. */
3950  v16u8 const6 = (v16u8) __msa_ldi_b(6);
3951  v16u8 const3 = (v16u8) __msa_ldi_b(3);
3952  v8u16 const20 = (v8u16) __msa_ldi_h(20);
3953 
3954  for (loop_count = (height >> 2); loop_count--;) {
/* Even-numbered inpN come from src, odd-numbered from src + 1. */
3955  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
3956  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
3957  src += (4 * src_stride);
3958  res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
3959  const20, const6, const3);
3960  ST_UB(res, dst);
3961  dst += dst_stride;
3962 
3963  res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
3964  const20, const6, const3);
3965  ST_UB(res, dst);
3966  dst += dst_stride;
3967 
3968  res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
3969  const20, const6, const3);
3970  ST_UB(res, dst);
3971  dst += dst_stride;
3972 
3973  res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
3974  const20, const6, const3);
3975  ST_UB(res, dst);
3976  dst += dst_stride;
3977  }
3978 
/* Trailing row: the stride argument of 1 makes the second load start at
 * src + 1. */
3979  LD_UB2(src, 1, inp0, inp1);
3980  res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
3981  ST_UB(res, dst);
3982 }
3983 
/*
 * 16x16 two-pass wrapper: hv_mc_qpel_aver_horiz_16x16_msa() processes src
 * into a local scratch buffer, then vert_mc_qpel_aver_src0_16x16_msa()
 * processes that buffer into dst.
 * NOTE(review): the line holding this function's name and first parameter is
 * absent from this listing; the body reads `src`, presumably a
 * `const uint8_t *` as in the complete definitions nearby - confirm.
 */
3985  int32_t src_stride,
3986  uint8_t *dst,
3987  int32_t dst_stride)
3988 {
/* 272 = 17 * 16 bytes. The first pass is given stride 16 and height 16, so it
 * presumably emits 17 rows of 16 bytes - TODO confirm in that helper. */
3989  uint8_t buff[272];
3990 
3991  hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
3992  vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
3993 }
3994 
/*
 * 8-wide block: a horizontal qpel filter is applied to the loaded source rows
 * (results horiz0..horiz8, no averaging in this pass), a vertical qpel filter
 * is applied to those results, and every vertical result is averaged with the
 * rows horiz0..horiz7 using __msa_aver_u_b (rounding average) before being
 * stored to dst.
 * The APPLY_*_FILTER_*, LD_UB* and ST8x2_UB helpers are macros defined outside
 * this section; remarks about them are presumptions - TODO confirm.
 * NOTE(review): the line holding this function's name and first parameter is
 * absent from this listing; the body reads `src`, presumably a
 * `const uint8_t *` as in the complete definitions nearby - confirm.
 */
3996  int32_t src_stride,
3997  uint8_t *dst,
3998  int32_t dst_stride)
3999 {
4000  v16u8 inp0, inp1, inp2, inp3;
4001  v16u8 res0, res1, avg0, avg1;
4002  v16u8 horiz0, horiz1, horiz2, horiz3;
4003  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte index tables handed to the horizontal filter macros. */
4004  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4005  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4006  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4007  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter weights 20, 6 and 3 replicated into every byte lane. */
4008  v16u8 const20 = (v16u8) __msa_ldi_b(20);
4009  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4010  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4011 
/* Horizontal pass, two source rows per macro call. The upper doubleword of
 * each result is replicated into the odd-numbered horizN vector. */
4012  LD_UB2(src, src_stride, inp0, inp1);
4013  src += (2 * src_stride);
4014  horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4015  mask0, mask1, mask2, mask3,
4016  const20, const6, const3);
4017  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4018  LD_UB2(src, src_stride, inp2, inp3);
4019  src += (2 * src_stride);
4020  horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4021  mask0, mask1, mask2, mask3,
4022  const20, const6, const3);
4023  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4024  LD_UB2(src, src_stride, inp0, inp1);
4025  src += (2 * src_stride);
4026  horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4027  mask0, mask1, mask2, mask3,
4028  const20, const6, const3);
4029  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* First vertical result; repeated horiz0/horiz1 arguments stand in for rows
 * above the block. */
4030  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4031  horiz1, horiz2, horiz3, horiz4,
4032  horiz1, horiz0, horiz0, horiz1,
4033  horiz2, horiz3, horiz4, horiz5,
4034  const20, const6, const3);
/* avg0 = low halves of horiz0 (low doubleword) and horiz1 (high doubleword). */
4035  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4036  res0 = __msa_aver_u_b(avg0, res0);
4037  ST8x2_UB(res0, dst, dst_stride);
4038  dst += (2 * dst_stride);
4039 
4040  LD_UB2(src, src_stride, inp2, inp3);
4041  src += (2 * src_stride);
4042  horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4043  mask0, mask1, mask2, mask3,
4044  const20, const6, const3);
4045  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4046  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4047  horiz3, horiz4, horiz5, horiz6,
4048  horiz3, horiz2, horiz1, horiz0,
4049  horiz4, horiz5, horiz6, horiz7,
4050  const20, const6, const3);
/* Last source row is loaded and filtered on its own. */
4051  inp0 = LD_UB(src);
4052  horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
4053  mask0, mask1, mask2, mask3,
4054  const20, const6, const3);
4055  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4056  res1 = __msa_aver_u_b(avg1, res1);
/* Remaining vertical results; repeated horiz8/horiz7 arguments stand in for
 * rows below the block. */
4057  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4058  horiz5, horiz6, horiz7, horiz8,
4059  horiz5, horiz4, horiz3, horiz2,
4060  horiz6, horiz7, horiz8, horiz8,
4061  const20, const6, const3);
4062  ST8x2_UB(res1, dst, dst_stride);
4063  dst += 2 * dst_stride;
4064 
4065  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4066  res0 = __msa_aver_u_b(avg0, res0);
4067  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4068  horiz7, horiz8, horiz8, horiz7,
4069  horiz7, horiz6, horiz5, horiz4,
4070  horiz8, horiz8, horiz7, horiz6,
4071  const20, const6, const3);
4072  ST8x2_UB(res0, dst, dst_stride);
4073  dst += 2 * dst_stride;
4074  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4075  res1 = __msa_aver_u_b(avg1, res1);
4076  ST8x2_UB(res1, dst, dst_stride);
4077 }
4078 
/*
 * Horizontal qpel pass over 16-byte-wide rows. For every row the vectors
 * loaded at src and src + 1 go through APPLY_HORIZ_QPEL_FILTER, and the
 * result is averaged with the vector loaded at src + 1 using __msa_aver_u_b
 * (rounding average) before being stored to dst.
 * The loop handles four rows per iteration for (height >> 2) iterations, and
 * one further row is processed after the loop, so height + 1 rows are written
 * when height is a multiple of four.
 * LD_UB4, LD_UB2 and ST_UB are macros defined outside this section.
 * NOTE(review): the line holding this function's name and first parameter is
 * absent from this listing; the body reads `src`, presumably a
 * `const uint8_t *` as in the complete definitions nearby - confirm.
 */
4080  int32_t src_stride,
4081  uint8_t *dst,
4082  int32_t dst_stride,
4083  int32_t height)
4084 {
4085  uint8_t loop_count;
4086  v16u8 inp0, inp1, inp2, inp3, inp4, inp5, inp6, inp7;
4087  v16u8 res;
/* Descending byte indices 15..0, passed to the filter macro as its shuffle
 * mask. */
4088  v16u8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
/* Weights 6 and 3 are per-byte; weight 20 is per-halfword. */
4089  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4090  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4091  v8u16 const20 = (v8u16) __msa_ldi_h(20);
4092 
4093  for (loop_count = (height >> 2); loop_count--;) {
/* Even-numbered inpN come from src, odd-numbered from src + 1. */
4094  LD_UB4(src, src_stride, inp0, inp2, inp4, inp6);
4095  LD_UB4((src + 1), src_stride, inp1, inp3, inp5, inp7);
4096  src += (4 * src_stride);
4097  res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask,
4098  const20, const6, const3);
4099  res = __msa_aver_u_b(res, inp1);
4100  ST_UB(res, dst);
4101  dst += dst_stride;
4102 
4103  res = APPLY_HORIZ_QPEL_FILTER(inp2, inp3, mask,
4104  const20, const6, const3);
4105  res = __msa_aver_u_b(res, inp3);
4106  ST_UB(res, dst);
4107  dst += dst_stride;
4108 
4109  res = APPLY_HORIZ_QPEL_FILTER(inp4, inp5, mask,
4110  const20, const6, const3);
4111  res = __msa_aver_u_b(res, inp5);
4112  ST_UB(res, dst);
4113  dst += dst_stride;
4114 
4115  res = APPLY_HORIZ_QPEL_FILTER(inp6, inp7, mask,
4116  const20, const6, const3);
4117  res = __msa_aver_u_b(res, inp7);
4118  ST_UB(res, dst);
4119  dst += dst_stride;
4120  }
4121 
/* Trailing row: the stride argument of 1 makes the second load start at
 * src + 1. */
4122  LD_UB2(src, 1, inp0, inp1);
4123  res = APPLY_HORIZ_QPEL_FILTER(inp0, inp1, mask, const20, const6, const3);
4124  res = __msa_aver_u_b(inp1, res);
4125  ST_UB(res, dst);
4126 }
4127 
/*
 * 16x16 two-pass wrapper: hv_mc_qpel_aver_horiz_src1_16x16_msa() processes
 * src into a local scratch buffer, then vert_mc_qpel_aver_src0_16x16_msa()
 * processes that buffer into dst.
 * NOTE(review): the line holding this function's name and first parameter is
 * absent from this listing; the body reads `src`, presumably a
 * `const uint8_t *` as in the complete definitions nearby - confirm.
 */
4129  int32_t src_stride,
4130  uint8_t *dst,
4131  int32_t dst_stride)
4132 {
/* 272 = 17 * 16 bytes. The first pass is given stride 16 and height 16, so it
 * presumably emits 17 rows of 16 bytes - TODO confirm in that helper. */
4133  uint8_t buff[272];
4134 
4135  hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
4136  vert_mc_qpel_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
4137 }
4138 
/*
 * 8-wide block with rounding averages (__msa_aver_u_b) in both passes:
 *  - each horizontal filter result is averaged with the source rows after
 *    they went through SLDI_B2_UB / __msa_sldi_b with a count of 1, giving
 *    horiz0..horiz8;
 *  - each vertical filter result is averaged with the rows horiz0..horiz7
 *    before being stored to dst.
 * The APPLY_*_FILTER_*, LD_UB*, SLDI_B2_UB and ST8x2_UB helpers are macros
 * defined outside this section; remarks about them are presumptions - TODO
 * confirm.
 * NOTE(review): the line holding this function's name and first parameter is
 * absent from this listing; the body reads `src`, presumably a
 * `const uint8_t *` as in the complete definitions nearby - confirm.
 */
4140  int32_t src_stride,
4141  uint8_t *dst,
4142  int32_t dst_stride)
4143 {
4144  v16u8 inp0, inp1, inp2, inp3;
4145  v16u8 res0, res1, avg0, avg1;
4146  v16u8 horiz0, horiz1, horiz2, horiz3;
4147  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte index tables handed to the horizontal filter macros. */
4148  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4149  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4150  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4151  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter weights 20, 6 and 3 replicated into every byte lane. */
4152  v16u8 const20 = (v16u8) __msa_ldi_b(20);
4153  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4154  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4155 
4156  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4157  src += (4 * src_stride);
4158  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4159  const20, const6, const3);
4160  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4161  const20, const6, const3);
/* Presumably slides both rows by one byte so they line up with src + 1. */
4162  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4163 
/* Pack the low doubleword of inp1 into the high doubleword of inp0. */
4164  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4165  horiz0 = __msa_aver_u_b(inp0, res0);
4166  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4167  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4168 
4169  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4170  horiz2 = __msa_aver_u_b(inp2, res1);
4171  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4172  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4173  src += (4 * src_stride);
4174  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4175  const20, const6, const3);
4176  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4177  const20, const6, const3);
4178  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4179 
4180  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4181  horiz4 = __msa_aver_u_b(inp0, res0);
4182  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4183  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4184 
4185  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4186  horiz6 = __msa_aver_u_b(inp2, res1);
4187  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
/* First vertical result; repeated horiz0/horiz1 arguments stand in for rows
 * above the block. */
4188  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4189  horiz1, horiz2, horiz3, horiz4,
4190  horiz1, horiz0, horiz0, horiz1,
4191  horiz2, horiz3, horiz4, horiz5,
4192  const20, const6, const3);
/* avg0 = low halves of horiz0 (low doubleword) and horiz1 (high doubleword). */
4193  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4194  res0 = __msa_aver_u_b(avg0, res0);
4195  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4196  horiz3, horiz4, horiz5, horiz6,
4197  horiz3, horiz2, horiz1, horiz0,
4198  horiz4, horiz5, horiz6, horiz7,
4199  const20, const6, const3);
4200  ST8x2_UB(res0, dst, dst_stride);
4201  dst += 2 * dst_stride;
4202 
/* Last source row: filtered alone, then averaged with itself slid by one
 * byte. */
4203  inp0 = LD_UB(src);
4204  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4205  const20, const6, const3);
4206  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4207  res1 = __msa_aver_u_b(avg1, res1);
4208  inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4209  horiz8 = __msa_aver_u_b(inp0, res0);
/* Remaining vertical results; repeated horiz8/horiz7 arguments stand in for
 * rows below the block. */
4210  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4211  horiz5, horiz6, horiz7, horiz8,
4212  horiz5, horiz4, horiz3, horiz2,
4213  horiz6, horiz7, horiz8, horiz8,
4214  const20, const6, const3);
4215  ST8x2_UB(res1, dst, dst_stride);
4216  dst += 2 * dst_stride;
4217 
4218  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4219  res0 = __msa_aver_u_b(avg0, res0);
4220  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4221  horiz7, horiz8, horiz8, horiz7,
4222  horiz7, horiz6, horiz5, horiz4,
4223  horiz8, horiz8, horiz7, horiz6,
4224  const20, const6, const3);
4225  ST8x2_UB(res0, dst, dst_stride);
4226  dst += 2 * dst_stride;
4227 
4228  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4229  res1 = __msa_aver_u_b(avg1, res1);
4230  ST8x2_UB(res1, dst, dst_stride);
4231 }
4232 
/*
 * 16x16 two-pass wrapper: hv_mc_qpel_aver_horiz_src0_16x16_msa() processes
 * src into a local scratch buffer, then vert_mc_qpel_16x16_msa() processes
 * that buffer into dst.
 * NOTE(review): the line holding this function's name and first parameter is
 * absent from this listing; the body reads `src`, presumably a
 * `const uint8_t *` as in the complete definitions nearby - confirm.
 */
4234  int32_t src_stride,
4235  uint8_t *dst,
4236  int32_t dst_stride)
4237 {
/* 272 = 17 * 16 bytes. The first pass is given stride 16 and height 16, so it
 * presumably emits 17 rows of 16 bytes - TODO confirm in that helper. */
4238  uint8_t buff[272];
4239 
4240  hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
4241  vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride);
4242 }
4243 
/*
 * 8-wide block: each horizontal filter result is averaged (__msa_aver_u_b,
 * rounding) with the low halves of the source rows it was computed from,
 * giving horiz0..horiz8; the vertical filter results are stored to dst with
 * no further averaging.
 * The APPLY_*_FILTER_*, LD_UB* and ST8x2_UB helpers are macros defined outside
 * this section; remarks about them are presumptions - TODO confirm.
 * NOTE(review): the line holding this function's name and first parameter is
 * absent from this listing; the body reads `src`, presumably a
 * `const uint8_t *` as in the complete definitions nearby - confirm.
 */
4245  int32_t src_stride,
4246  uint8_t *dst,
4247  int32_t dst_stride)
4248 {
4249  v16u8 inp0, inp1, inp2, inp3;
4250  v16u8 res0, res1;
4251  v16u8 horiz0, horiz1, horiz2, horiz3;
4252  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte index tables handed to the horizontal filter macros. */
4253  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4254  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4255  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4256  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter weights 20, 6 and 3 replicated into every byte lane. */
4257  v16u8 const20 = (v16u8) __msa_ldi_b(20);
4258  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4259  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4260 
4261  LD_UB2(src, src_stride, inp0, inp1);
4262  src += (2 * src_stride);
4263  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4264  const20, const6, const3);
/* Pack the low doublewords of inp0 (low) and inp1 (high) into one vector. */
4265  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4266  horiz0 = __msa_aver_u_b(inp0, res0);
4267  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4268 
4269  LD_UB2(src, src_stride, inp2, inp3);
4270  src += (2 * src_stride);
4271  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4272  const20, const6, const3);
4273  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4274  horiz2 = __msa_aver_u_b(inp2, res1);
4275  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4276  LD_UB2(src, src_stride, inp0, inp1);
4277  src += (2 * src_stride);
4278  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4279  const20, const6, const3);
4280  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4281  horiz4 = __msa_aver_u_b(inp0, res0);
4282  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* First vertical result; repeated horiz0/horiz1 arguments stand in for rows
 * above the block. */
4283  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4284  horiz1, horiz2, horiz3, horiz4,
4285  horiz1, horiz0, horiz0, horiz1,
4286  horiz2, horiz3, horiz4, horiz5,
4287  const20, const6, const3);
4288  ST8x2_UB(res0, dst, dst_stride);
4289  dst += (2 * dst_stride);
4290 
4291  LD_UB2(src, src_stride, inp2, inp3);
4292  src += (2 * src_stride);
4293  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4294  const20, const6, const3);
4295  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4296  horiz6 = __msa_aver_u_b(inp2, res1);
4297  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4298  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4299  horiz3, horiz4, horiz5, horiz6,
4300  horiz3, horiz2, horiz1, horiz0,
4301  horiz4, horiz5, horiz6, horiz7,
4302  const20, const6, const3);
/* Last source row: filtered alone and averaged with the unshifted load. */
4303  inp0 = LD_UB(src);
4304  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4305  const20, const6, const3);
4306  horiz8 = __msa_aver_u_b(inp0, res0);
/* Remaining vertical results; repeated horiz8/horiz7 arguments stand in for
 * rows below the block. */
4307  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4308  horiz5, horiz6, horiz7, horiz8,
4309  horiz5, horiz4, horiz3, horiz2,
4310  horiz6, horiz7, horiz8, horiz8,
4311  const20, const6, const3);
4312  ST8x2_UB(res1, dst, dst_stride);
4313  dst += 2 * dst_stride;
4314 
4315  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4316  horiz7, horiz8, horiz8, horiz7,
4317  horiz7, horiz6, horiz5, horiz4,
4318  horiz8, horiz8, horiz7, horiz6,
4319  const20, const6, const3);
4320  ST8x2_UB(res0, dst, dst_stride);
4321  dst += 2 * dst_stride;
4322  ST8x2_UB(res1, dst, dst_stride);
4323 }
4324 
/*
 * 16x16 qpel interpolation in two passes with no extra source averaging:
 * hv_mc_qpel_aver_horiz_16x16_msa() fills a scratch buffer from src, then
 * vert_mc_qpel_16x16_msa() filters that buffer into dst.
 *
 * src / src_stride: source pixels and their row pitch in bytes.
 * dst / dst_stride: destination pixels and their row pitch in bytes.
 */
static void hv_mc_qpel_16x16_msa(const uint8_t *src,
                                 int32_t src_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride)
{
    /* 17 rows of 16 bytes: the horizontal pass writes height + 1 rows. */
    uint8_t horiz_rows[17 * 16];

    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, horiz_rows, 16, 16);
    vert_mc_qpel_16x16_msa(horiz_rows, 16, dst, dst_stride);
}
4335 
/*
 * 8-wide qpel interpolation with no extra source averaging: a horizontal
 * filter is applied to the loaded source rows (horiz0..horiz8), then a
 * vertical filter is applied to those results and stored straight to dst.
 * The APPLY_*_FILTER_*, LD_UB* and ST8x2_UB helpers are macros defined outside
 * this section; remarks about them are presumptions - TODO confirm.
 */
4336 static void hv_mc_qpel_8x8_msa(const uint8_t *src, int32_t src_stride,
4337  uint8_t *dst, int32_t dst_stride)
4338 {
4339  v16u8 inp0, inp1, inp2, inp3;
4340  v16u8 res0, res1;
4341  v16u8 horiz0, horiz1, horiz2, horiz3;
4342  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
/* Byte index tables handed to the horizontal filter macros. */
4343  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4344  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4345  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4346  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
/* Filter weights 20, 6 and 3 replicated into every byte lane. */
4347  v16u8 const20 = (v16u8) __msa_ldi_b(20);
4348  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4349  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4350 
/* Horizontal pass, two source rows per macro call. The upper doubleword of
 * each result is replicated into the odd-numbered horizN vector. */
4351  LD_UB2(src, src_stride, inp0, inp1);
4352  src += (2 * src_stride);
4353  horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4354  mask0, mask1, mask2, mask3,
4355  const20, const6, const3);
4356  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4357  LD_UB2(src, src_stride, inp2, inp3);
4358  src += (2 * src_stride);
4359  horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4360  mask0, mask1, mask2, mask3,
4361  const20, const6, const3);
4362  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4363  LD_UB2(src, src_stride, inp0, inp1);
4364  src += (2 * src_stride);
4365  horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4366  mask0, mask1, mask2, mask3,
4367  const20, const6, const3);
4368  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
/* First vertical result; repeated horiz0/horiz1 arguments stand in for rows
 * above the block. */
4369  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4370  horiz1, horiz2, horiz3, horiz4,
4371  horiz1, horiz0, horiz0, horiz1,
4372  horiz2, horiz3, horiz4, horiz5,
4373  const20, const6, const3);
4374  ST8x2_UB(res0, dst, dst_stride);
4375  dst += (2 * dst_stride);
4376 
4377  LD_UB2(src, src_stride, inp2, inp3);
4378  src += (2 * src_stride);
4379  horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4380  mask0, mask1, mask2, mask3,
4381  const20, const6, const3);
4382  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4383  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4384  horiz3, horiz4, horiz5, horiz6,
4385  horiz3, horiz2, horiz1, horiz0,
4386  horiz4, horiz5, horiz6, horiz7,
4387  const20, const6, const3);
/* Last source row is loaded and filtered on its own. */
4388  inp0 = LD_UB(src);
4389  horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
4390  mask0, mask1, mask2, mask3,
4391  const20, const6, const3);
4392  ST8x2_UB(res1, dst, dst_stride);
4393  dst += 2 * dst_stride;
4394 
/* Remaining vertical results; repeated horiz8/horiz7 arguments stand in for
 * rows below the block. */
4395  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4396  horiz5, horiz6, horiz7, horiz8,
4397  horiz5, horiz4, horiz3, horiz2,
4398  horiz6, horiz7, horiz8, horiz8,
4399  const20, const6, const3);
4400  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4401  horiz7, horiz8, horiz8, horiz7,
4402  horiz7, horiz6, horiz5, horiz4,
4403  horiz8, horiz8, horiz7, horiz6,
4404  const20, const6, const3);
4405  ST8x2_UB(res0, dst, dst_stride);
4406  dst += 2 * dst_stride;
4407  ST8x2_UB(res1, dst, dst_stride);
4408 }
4409 
/*
 * 16x16 two-pass wrapper: hv_mc_qpel_aver_horiz_src1_16x16_msa() processes
 * src into a local scratch buffer, then vert_mc_qpel_16x16_msa() processes
 * that buffer into dst.
 * NOTE(review): the line holding this function's name and first parameter is
 * absent from this listing; the body reads `src`, presumably a
 * `const uint8_t *` as in the complete definitions nearby - confirm.
 */
4411  int32_t src_stride,
4412  uint8_t *dst,
4413  int32_t dst_stride)
4414 {
/* 272 = 17 * 16 bytes. The first pass is given stride 16 and height 16, so it
 * presumably emits 17 rows of 16 bytes - TODO confirm in that helper. */
4415  uint8_t buff[272];
4416 
4417  hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
4418  vert_mc_qpel_16x16_msa(buff, 16, dst, dst_stride);
4419 }
4420 
4422  int32_t src_stride,
4423  uint8_t *dst,
4424  int32_t dst_stride)
4425 {
      /*
       * Register-only variant (presumably the 8x8 case): nine source vectors
       * are loaded and turned into intermediate rows horiz0..horiz8, which
       * are fed to APPLY_VERT_QPEL_FILTER_8BYTE four times; each result is
       * written through ST8x2_UB with dst advanced by two rows in between.
       * Every intermediate row is the horizontal macro's output averaged
       * (__msa_aver_u_b) with the source vector moved by one byte position.
       * NOTE(review): the line carrying this definition's return type, name
       * and leading parameter is missing from this listing; `src` used below
       * is presumably that parameter.
       */
4426  v16u8 inp0, inp1, inp2, inp3;
4427  v16u8 res0, res1;
4428  v16u8 horiz0, horiz1, horiz2, horiz3;
4429  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
      /* Byte index tables handed to the horizontal filter macros. */
4430  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4431  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4432  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4433  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Byte vectors filled with 20, 6 and 3, passed to every filter macro
       * (presumably the tap weights). */
4434  v16u8 const20 = (v16u8) __msa_ldi_b(20);
4435  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4436  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4437 
4438  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4439  src += (4 * src_stride);
4440 
4441  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4442  const20, const6, const3);
4443  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4444  const20, const6, const3);
      /* Shift amount 1: presumably moves each row by one byte so that the
       * average below pairs the filter output with src[x + 1]. */
4445  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4446 
      /* Put the low doubleword of inp1 into the high half of inp0 so one
       * vector carries both rows. */
4447  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4448  horiz0 = __msa_aver_u_b(inp0, res0);
      /* horiz1 = high doubleword of horiz0 replicated into both halves. */
4449  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4450  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4451 
4452  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4453  horiz2 = __msa_aver_u_b(inp2, res1);
4454  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4455  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4456  src += (4 * src_stride);
4457  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4458  const20, const6, const3);
4459  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4460  const20, const6, const3);
4461  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4462 
4463  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4464  horiz4 = __msa_aver_u_b(inp0, res0);
4465  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4466  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4467 
4468  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4469  horiz6 = __msa_aver_u_b(inp2, res1);
4470  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
      /* Ninth source vector, handled alone with the single-row macro. */
4471  inp0 = LD_UB(src);
4472  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4473  const20, const6, const3);
4474  inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4475  horiz8 = __msa_aver_u_b(inp0, res0);
      /* Vertical stage. horiz0 is repeated in the first calls' argument
       * lists and horiz8/horiz7 in the last ones (presumably edge
       * mirroring - confirm against the macro definition). */
4476  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4477  horiz1, horiz2, horiz3, horiz4,
4478  horiz1, horiz0, horiz0, horiz1,
4479  horiz2, horiz3, horiz4, horiz5,
4480  const20, const6, const3);
4481  ST8x2_UB(res0, dst, dst_stride);
4482  dst += (2 * dst_stride);
4483 
4484  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4485  horiz3, horiz4, horiz5, horiz6,
4486  horiz3, horiz2, horiz1, horiz0,
4487  horiz4, horiz5, horiz6, horiz7,
4488  const20, const6, const3);
4489  ST8x2_UB(res1, dst, dst_stride);
4490  dst += (2 * dst_stride);
4491 
4492  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4493  horiz5, horiz6, horiz7, horiz8,
4494  horiz5, horiz4, horiz3, horiz2,
4495  horiz6, horiz7, horiz8, horiz8,
4496  const20, const6, const3);
4497  ST8x2_UB(res0, dst, dst_stride);
4498  dst += (2 * dst_stride);
4499 
4500  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4501  horiz7, horiz8, horiz8, horiz7,
4502  horiz7, horiz6, horiz5, horiz4,
4503  horiz8, horiz8, horiz7, horiz6,
4504  const20, const6, const3);
4505  ST8x2_UB(res1, dst, dst_stride);
4506 }
4507 
4509  int32_t src_stride,
4510  uint8_t *dst,
4511  int32_t dst_stride)
4512 {
      /*
       * Two-step 16x16 path: hv_mc_qpel_aver_horiz_src0_16x16_msa() is given
       * src and the local scratch buffer (presumably filling it), then
       * vert_mc_qpel_aver_src1_16x16_msa() is given that buffer and dst.
       * NOTE(review): the line carrying this definition's return type, name
       * and leading parameter is missing from this listing; `src` used below
       * is presumably that parameter.
       */
      /* 272 = 17 * 16 bytes; presumably 17 rows of 16 bytes - confirm
       * against the horizontal helper. */
4513  uint8_t buff[272];
4514 
4515  hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
4516  vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
4517 }
4518 
4520  int32_t src_stride,
4521  uint8_t *dst,
4522  int32_t dst_stride)
4523 {
      /*
       * Register-only variant (presumably the 8x8 case): nine source vectors
       * become intermediate rows horiz0..horiz8, each being the horizontal
       * macro's output averaged (__msa_aver_u_b) with the unshifted source.
       * APPLY_VERT_QPEL_FILTER_8BYTE is then applied four times; each result
       * is averaged with a pair of intermediate rows (horiz1/horiz2,
       * horiz3/horiz4, horiz5/horiz6, horiz7/horiz8) before ST8x2_UB.
       * NOTE(review): the line carrying this definition's return type, name
       * and leading parameter is missing from this listing; `src` used below
       * is presumably that parameter.
       */
4524  v16u8 inp0, inp1, inp2, inp3;
4525  v16u8 res0, res1, avg0, avg1;
4526  v16u8 horiz0, horiz1, horiz2, horiz3;
4527  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
      /* Byte index tables handed to the horizontal filter macros. */
4528  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4529  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4530  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4531  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Byte vectors filled with 20, 6 and 3, passed to every filter macro
       * (presumably the tap weights). */
4532  v16u8 const20 = (v16u8) __msa_ldi_b(20);
4533  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4534  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4535 
4536  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4537  src += (4 * src_stride);
4538 
4539  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4540  const20, const6, const3);
4541  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4542  const20, const6, const3);
      /* Pack the low doublewords of inp0 and inp1 into one vector. */
4543  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4544  horiz0 = __msa_aver_u_b(inp0, res0);
      /* horiz1 = high doubleword of horiz0 replicated into both halves. */
4545  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4546  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4547  horiz2 = __msa_aver_u_b(inp2, res1);
4548  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4549  LD_UB2(src, src_stride, inp0, inp1);
4550  src += (2 * src_stride);
4551 
4552  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4553  const20, const6, const3);
4554  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4555  horiz4 = __msa_aver_u_b(inp0, res0);
4556  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4557  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4558  horiz1, horiz2, horiz3, horiz4,
4559  horiz1, horiz0, horiz0, horiz1,
4560  horiz2, horiz3, horiz4, horiz5,
4561  const20, const6, const3);
      /* avg0 = low doubleword of horiz1 followed by low doubleword of
       * horiz2; averaged with the vertical result. */
4562  avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
4563  res0 = __msa_aver_u_b(avg0, res0);
4564  ST8x2_UB(res0, dst, dst_stride);
4565  dst += (2 * dst_stride);
4566 
4567  LD_UB2(src, src_stride, inp2, inp3);
4568  src += (2 * src_stride);
4569  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4570  const20, const6, const3);
4571  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4572  horiz6 = __msa_aver_u_b(inp2, res1);
4573  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
      /* Ninth source vector, handled alone with the single-row macro. */
4574  inp0 = LD_UB(src);
4575  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4576  const20, const6, const3);
4577  horiz8 = __msa_aver_u_b(inp0, res0);
4578  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4579  horiz3, horiz4, horiz5, horiz6,
4580  horiz3, horiz2, horiz1, horiz0,
4581  horiz4, horiz5, horiz6, horiz7,
4582  const20, const6, const3);
4583  avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
4584  res1 = __msa_aver_u_b(avg1, res1);
4585  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4586  horiz5, horiz6, horiz7, horiz8,
4587  horiz5, horiz4, horiz3, horiz2,
4588  horiz6, horiz7, horiz8, horiz8,
4589  const20, const6, const3);
4590  ST8x2_UB(res1, dst, dst_stride);
4591  dst += 2 * dst_stride;
4592 
4593  avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
4594  res0 = __msa_aver_u_b(avg0, res0);
4595  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4596  horiz7, horiz8, horiz8, horiz7,
4597  horiz7, horiz6, horiz5, horiz4,
4598  horiz8, horiz8, horiz7, horiz6,
4599  const20, const6, const3);
4600  ST8x2_UB(res0, dst, dst_stride);
4601  dst += 2 * dst_stride;
4602 
4603  avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
4604  res1 = __msa_aver_u_b(avg1, res1);
4605  ST8x2_UB(res1, dst, dst_stride);
      /* NOTE(review): dst is not read after this increment; the statement
       * below has no effect and could be dropped. */
4606  dst += (2 * dst_stride);
4607 }
4608 
4610  int32_t src_stride,
4611  uint8_t *dst,
4612  int32_t dst_stride)
4613 {
      /*
       * Two-step 16x16 path: hv_mc_qpel_aver_horiz_16x16_msa() is given src
       * and the local scratch buffer (presumably filling it), then
       * vert_mc_qpel_aver_src1_16x16_msa() is given that buffer and dst.
       * NOTE(review): the line carrying this definition's return type, name
       * and leading parameter is missing from this listing; `src` used below
       * is presumably that parameter.
       */
      /* 272 = 17 * 16 bytes; presumably 17 rows of 16 bytes - confirm
       * against the horizontal helper. */
4614  uint8_t buff[272];
4615 
4616  hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
4617  vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
4618 }
4619 
4621  int32_t src_stride,
4622  uint8_t *dst,
4623  int32_t dst_stride)
4624 {
      /*
       * Register-only variant (presumably the 8x8 case): nine source vectors
       * become intermediate rows horiz0..horiz8 directly from the horizontal
       * macros (no averaging with the source here).
       * APPLY_VERT_QPEL_FILTER_8BYTE is then applied four times; each result
       * is averaged (__msa_aver_u_b) with a pair of intermediate rows
       * (horiz1/horiz2, horiz3/horiz4, horiz5/horiz6, horiz7/horiz8) before
       * ST8x2_UB.
       * NOTE(review): the line carrying this definition's return type, name
       * and leading parameter is missing from this listing; `src` used below
       * is presumably that parameter.
       */
4625  v16u8 inp0, inp1, inp2, inp3;
4626  v16u8 res0, res1, avg0, avg1;
4627  v16u8 horiz0, horiz1, horiz2, horiz3;
4628  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
      /* Byte index tables handed to the horizontal filter macros. */
4629  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4630  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4631  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4632  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Byte vectors filled with 20, 6 and 3, passed to every filter macro
       * (presumably the tap weights). */
4633  v16u8 const20 = (v16u8) __msa_ldi_b(20);
4634  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4635  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4636 
4637  LD_UB2(src, src_stride, inp0, inp1);
4638  src += (2 * src_stride);
4639  horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4640  mask0, mask1, mask2, mask3,
4641  const20, const6, const3);
      /* horiz1 = high doubleword of horiz0 replicated into both halves. */
4642  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4643  LD_UB2(src, src_stride, inp2, inp3);
4644  src += (2 * src_stride);
4645  horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4646  mask0, mask1, mask2, mask3,
4647  const20, const6, const3);
4648  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4649  LD_UB2(src, src_stride, inp0, inp1);
4650  src += (2 * src_stride);
4651  horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4652  mask0, mask1, mask2, mask3,
4653  const20, const6, const3);
4654  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* NOTE(review): the next statement repeats the one directly above
       * with identical operands; it is redundant. */
4655  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4656  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4657  horiz1, horiz2, horiz3, horiz4,
4658  horiz1, horiz0, horiz0, horiz1,
4659  horiz2, horiz3, horiz4, horiz5,
4660  const20, const6, const3);
      /* avg0 = low doubleword of horiz1 followed by low doubleword of
       * horiz2; averaged with the vertical result. */
4661  avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
4662  res0 = __msa_aver_u_b(avg0, res0);
4663  ST8x2_UB(res0, dst, dst_stride);
4664  dst += (2 * dst_stride);
4665 
4666  LD_UB2(src, src_stride, inp2, inp3);
4667  src += (2 * src_stride);
4668  horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4669  mask0, mask1, mask2, mask3,
4670  const20, const6, const3);
4671  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4672  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4673  horiz3, horiz4, horiz5, horiz6,
4674  horiz3, horiz2, horiz1, horiz0,
4675  horiz4, horiz5, horiz6, horiz7,
4676  const20, const6, const3);
      /* Ninth source vector, handled alone with the single-row macro. */
4677  inp0 = LD_UB(src);
4678  horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
4679  mask0, mask1, mask2, mask3,
4680  const20, const6, const3);
4681  avg1 = (v16u8) __msa_insve_d((v2i64) horiz3, 1, (v2i64) horiz4);
4682  res1 = __msa_aver_u_b(avg1, res1);
4683  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4684  horiz5, horiz6, horiz7, horiz8,
4685  horiz5, horiz4, horiz3, horiz2,
4686  horiz6, horiz7, horiz8, horiz8,
4687  const20, const6, const3);
4688  ST8x2_UB(res1, dst, dst_stride);
4689  dst += 2 * dst_stride;
4690  avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
4691  res0 = __msa_aver_u_b(avg0, res0);
4692 
4693  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4694  horiz7, horiz8, horiz8, horiz7,
4695  horiz7, horiz6, horiz5, horiz4,
4696  horiz8, horiz8, horiz7, horiz6,
4697  const20, const6, const3);
4698  ST8x2_UB(res0, dst, dst_stride);
4699  dst += 2 * dst_stride;
4700  avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
4701  res1 = __msa_aver_u_b(avg1, res1);
4702  ST8x2_UB(res1, dst, dst_stride);
4703 }
4704 
4706  int32_t src_stride,
4707  uint8_t *dst,
4708  int32_t dst_stride)
4709 {
      /*
       * Two-step 16x16 path: hv_mc_qpel_aver_horiz_src1_16x16_msa() is given
       * src and the local scratch buffer (presumably filling it), then
       * vert_mc_qpel_aver_src1_16x16_msa() is given that buffer and dst.
       * NOTE(review): the line carrying this definition's return type, name
       * and leading parameter is missing from this listing; `src` used below
       * is presumably that parameter.
       */
      /* 272 = 17 * 16 bytes; presumably 17 rows of 16 bytes - confirm
       * against the horizontal helper. */
4710  uint8_t buff[272];
4711 
4712  hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
4713  vert_mc_qpel_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
4714 }
4715 
4717  int32_t src_stride,
4718  uint8_t *dst, int32_t dst_stride)
4719 {
      /*
       * Register-only variant (presumably the 8x8 case): nine source vectors
       * become intermediate rows horiz0..horiz8, each being the horizontal
       * macro's output averaged (__msa_aver_u_b) with the source vector moved
       * by one byte position.
       * APPLY_VERT_QPEL_FILTER_8BYTE is then applied four times; each result
       * is averaged with a pair of intermediate rows (horiz1/horiz2,
       * horiz3/horiz4, horiz5/horiz6, horiz7/horiz8) before ST8x2_UB.
       * NOTE(review): the line carrying this definition's return type, name
       * and leading parameter is missing from this listing; `src` used below
       * is presumably that parameter.
       */
4720  v16u8 inp0, inp1, inp2, inp3;
4721  v16u8 res0, res1, avg0, avg1;
4722  v16u8 horiz0, horiz1, horiz2, horiz3;
4723  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
      /* Byte index tables handed to the horizontal filter macros. */
4724  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4725  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4726  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4727  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Byte vectors filled with 20, 6 and 3, passed to every filter macro
       * (presumably the tap weights). */
4728  v16u8 const20 = (v16u8) __msa_ldi_b(20);
4729  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4730  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4731 
4732  LD_UB4(src, src_stride, inp0, inp1, inp2, inp3);
4733  src += (4 * src_stride);
4734  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4735  mask0, mask1, mask2, mask3,
4736  const20, const6, const3);
      /* Shift amount 1: presumably moves each row by one byte so that the
       * average below pairs the filter output with src[x + 1]. */
4737  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4738 
      /* Put the low doubleword of inp1 into the high half of inp0 so one
       * vector carries both rows. */
4739  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4740  horiz0 = __msa_aver_u_b(inp0, res0);
      /* horiz1 = high doubleword of horiz0 replicated into both halves. */
4741  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4742  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4743  const20, const6, const3);
4744  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4745 
4746  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4747  horiz2 = __msa_aver_u_b(inp2, res1);
4748  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4749  LD_UB2(src, src_stride, inp0, inp1);
4750  src += (2 * src_stride);
4751  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4752  const20, const6, const3);
4753  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
4754 
4755  inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
4756  horiz4 = __msa_aver_u_b(inp0, res0);
4757  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
4758  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4759  horiz1, horiz2, horiz3, horiz4,
4760  horiz1, horiz0, horiz0, horiz1,
4761  horiz2, horiz3, horiz4, horiz5,
4762  const20, const6, const3);
      /* avg0 packs the low doublewords of horiz1 and horiz2; averaged with
       * the vertical result. */
4763  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
4764  res0 = __msa_aver_u_b(avg0, res0);
4765  LD_UB2(src, src_stride, inp2, inp3);
4766  src += (2 * src_stride);
4767  ST8x2_UB(res0, dst, dst_stride);
4768  dst += 2 * dst_stride;
4769 
4770  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4771  const20, const6, const3);
4772  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
4773 
4774  inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
4775  horiz6 = __msa_aver_u_b(inp2, res1);
4776  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4777  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4778  horiz3, horiz4, horiz5, horiz6,
4779  horiz3, horiz2, horiz1, horiz0,
4780  horiz4, horiz5, horiz6, horiz7,
4781  const20, const6, const3);
4782  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
4783  res1 = __msa_aver_u_b(avg1, res1);
      /* Ninth source vector, handled alone with the single-row macro. */
4784  inp0 = LD_UB(src);
4785  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4786  const20, const6, const3);
4787  inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
4788  horiz8 = __msa_aver_u_b(inp0, res0);
4789  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4790  horiz5, horiz6, horiz7, horiz8,
4791  horiz5, horiz4, horiz3, horiz2,
4792  horiz6, horiz7, horiz8, horiz8,
4793  const20, const6, const3);
4794  ST8x2_UB(res1, dst, dst_stride);
4795  dst += 2 * dst_stride;
4796 
4797  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
4798  res0 = __msa_aver_u_b(avg0, res0);
4799  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4800  horiz7, horiz8, horiz8, horiz7,
4801  horiz7, horiz6, horiz5, horiz4,
4802  horiz8, horiz8, horiz7, horiz6,
4803  const20, const6, const3);
4804  ST8x2_UB(res0, dst, dst_stride);
4805  dst += 2 * dst_stride;
4806 
4807  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
4808  res1 = __msa_aver_u_b(avg1, res1);
4809  ST8x2_UB(res1, dst, dst_stride);
4810 }
4811 
4813  int32_t src_stride,
4814  uint8_t *dst,
4815  int32_t dst_stride)
4816 {
      /*
       * Two-step 16x16 path: hv_mc_qpel_aver_horiz_src0_16x16_msa() is given
       * src and the local scratch buffer (presumably filling it), then
       * vert_mc_qpel_avg_dst_aver_src0_16x16_msa() is given that buffer and
       * dst.
       * NOTE(review): the line carrying this definition's return type, name
       * and leading parameter is missing from this listing; `src` used below
       * is presumably that parameter.
       */
      /* 272 = 17 * 16 bytes; presumably 17 rows of 16 bytes - confirm
       * against the horizontal helper. */
4817  uint8_t buff[272];
4818 
4819  hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
4820  vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
4821 }
4822 
4824  int32_t src_stride,
4825  uint8_t *dst,
4826  int32_t dst_stride)
4827 {
      /*
       * Register-only variant (presumably the 8x8 case): nine source vectors
       * become intermediate rows horiz0..horiz8, each being the horizontal
       * macro's output averaged (__msa_aver_u_b) with the unshifted source.
       * APPLY_VERT_QPEL_FILTER_8BYTE is then applied four times; each result
       * is averaged first with a pair of intermediate rows (horiz0/horiz1,
       * horiz2/horiz3, horiz4/horiz5, horiz6/horiz7) and then with the two
       * vectors just loaded from dst, before being stored back via ST8x2_UB.
       * NOTE(review): the line carrying this definition's return type, name
       * and leading parameter is missing from this listing; `src` used below
       * is presumably that parameter.
       */
4828  v16u8 inp0, inp1, inp2, inp3;
4829  v16u8 res0, res1, avg0, avg1;
4830  v16u8 horiz0, horiz1, horiz2, horiz3;
4831  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4832  v16u8 dst0, dst1;
      /* Byte index tables handed to the horizontal filter macros. */
4833  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4834  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4835  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4836  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Byte vectors filled with 20, 6 and 3, passed to every filter macro
       * (presumably the tap weights). */
4837  v16u8 const20 = (v16u8) __msa_ldi_b(20);
4838  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4839  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4840 
4841  LD_UB2(src, src_stride, inp0, inp1);
4842  src += (2 * src_stride);
4843  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4844  const20, const6, const3);
4845  LD_UB2(src, src_stride, inp2, inp3);
4846  src += (2 * src_stride);
      /* Pack the low doublewords of inp0 and inp1 into one vector. */
4847  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4848  horiz0 = __msa_aver_u_b(inp0, res0);
      /* horiz1 = high doubleword of horiz0 replicated into both halves. */
4849  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4850  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4851  const20, const6, const3);
4852  LD_UB2(src, src_stride, inp0, inp1);
4853  src += (2 * src_stride);
4854  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4855  horiz2 = __msa_aver_u_b(inp2, res1);
4856  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4857  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
4858  const20, const6, const3);
4859  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
4860  horiz4 = __msa_aver_u_b(inp0, res0);
4861  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* Current dst contents are read before each store so they can be
       * blended into the result. */
4862  LD_UB2(dst, dst_stride, dst0, dst1);
4863  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4864  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4865  horiz1, horiz2, horiz3, horiz4,
4866  horiz1, horiz0, horiz0, horiz1,
4867  horiz2, horiz3, horiz4, horiz5,
4868  const20, const6, const3);
4869  res0 = __msa_aver_u_b(avg0, res0);
      /* avg0 is reused: now the low doublewords of dst0 and dst1. */
4870  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4871  res0 = __msa_aver_u_b(avg0, res0);
4872  ST8x2_UB(res0, dst, dst_stride);
4873  dst += (2 * dst_stride);
4874 
4875  LD_UB2(src, src_stride, inp2, inp3);
4876  src += (2 * src_stride);
4877  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
4878  const20, const6, const3);
4879  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
4880  horiz6 = __msa_aver_u_b(inp2, res1);
4881  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4882  LD_UB2(dst, dst_stride, dst0, dst1);
4883  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4884  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4885  horiz3, horiz4, horiz5, horiz6,
4886  horiz3, horiz2, horiz1, horiz0,
4887  horiz4, horiz5, horiz6, horiz7,
4888  const20, const6, const3);
4889  res1 = __msa_aver_u_b(avg1, res1);
4890  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4891  res1 = __msa_aver_u_b(avg1, res1);
4892  ST8x2_UB(res1, dst, dst_stride);
4893  dst += (2 * dst_stride);
4894 
      /* Ninth source vector, handled alone with the single-row macro. */
4895  inp0 = LD_UB(src);
4896  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
4897  const20, const6, const3);
4898  horiz8 = __msa_aver_u_b(inp0, res0);
4899  LD_UB2(dst, dst_stride, dst0, dst1);
4900  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
4901  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
4902  horiz5, horiz6, horiz7, horiz8,
4903  horiz5, horiz4, horiz3, horiz2,
4904  horiz6, horiz7, horiz8, horiz8,
4905  const20, const6, const3);
4906  res0 = __msa_aver_u_b(avg0, res0);
4907  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4908  res0 = __msa_aver_u_b(avg0, res0);
4909  ST8x2_UB(res0, dst, dst_stride);
4910  dst += (2 * dst_stride);
4911 
4912  LD_UB2(dst, dst_stride, dst0, dst1);
4913  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
4914  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
4915  horiz7, horiz8, horiz8, horiz7,
4916  horiz7, horiz6, horiz5, horiz4,
4917  horiz8, horiz8, horiz7, horiz6,
4918  const20, const6, const3);
4919  res1 = __msa_aver_u_b(avg1, res1);
4920  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4921  res1 = __msa_aver_u_b(avg1, res1);
4922  ST8x2_UB(res1, dst, dst_stride);
4923 }
4924 
4926  int32_t src_stride,
4927  uint8_t *dst,
4928  int32_t dst_stride)
4929 {
      /*
       * Two-step 16x16 path: hv_mc_qpel_aver_horiz_16x16_msa() is given src
       * and the local scratch buffer (presumably filling it), then
       * vert_mc_qpel_avg_dst_aver_src0_16x16_msa() is given that buffer and
       * dst.
       * NOTE(review): the line carrying this definition's return type, name
       * and leading parameter is missing from this listing; `src` used below
       * is presumably that parameter.
       */
      /* 272 = 17 * 16 bytes; presumably 17 rows of 16 bytes - confirm
       * against the horizontal helper. */
4930  uint8_t buff[272];
4931 
4932  hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
4933  vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
4934 }
4935 
4937  int32_t src_stride,
4938  uint8_t *dst,
4939  int32_t dst_stride)
4940 {
      /*
       * Register-only variant (presumably the 8x8 case): nine source vectors
       * become intermediate rows horiz0..horiz8 directly from the horizontal
       * macros (no averaging with the source here).
       * APPLY_VERT_QPEL_FILTER_8BYTE is then applied four times; each result
       * is averaged (__msa_aver_u_b) first with a pair of intermediate rows
       * (horiz0/horiz1, horiz2/horiz3, horiz4/horiz5, horiz6/horiz7) and then
       * with the two vectors just loaded from dst, before being stored back
       * via ST8x2_UB.
       * NOTE(review): the line carrying this definition's return type, name
       * and leading parameter is missing from this listing; `src` used below
       * is presumably that parameter.
       */
4941  v16u8 inp0, inp1, inp2, inp3;
4942  v16u8 res0, res1, avg0, avg1;
4943  v16u8 horiz0, horiz1, horiz2, horiz3;
4944  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
4945  v16u8 dst0, dst1;
      /* Byte index tables handed to the horizontal filter macros. */
4946  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4947  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
4948  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
4949  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Byte vectors filled with 20, 6 and 3, passed to every filter macro
       * (presumably the tap weights). */
4950  v16u8 const20 = (v16u8) __msa_ldi_b(20);
4951  v16u8 const6 = (v16u8) __msa_ldi_b(6);
4952  v16u8 const3 = (v16u8) __msa_ldi_b(3);
4953 
4954  LD_UB2(src, src_stride, inp0, inp1);
4955  src += (2 * src_stride);
4956  horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4957  mask0, mask1, mask2, mask3,
4958  const20, const6, const3);
4959  LD_UB2(src, src_stride, inp2, inp3);
4960  src += (2 * src_stride);
      /* horiz1 = high doubleword of horiz0 replicated into both halves. */
4961  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
4962  horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4963  mask0, mask1, mask2, mask3,
4964  const20, const6, const3);
4965  LD_UB2(src, src_stride, inp0, inp1);
4966  src += (2 * src_stride);
4967  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
4968  horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
4969  mask0, mask1, mask2, mask3,
4970  const20, const6, const3);
4971  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* Current dst contents are read before each store so they can be
       * blended into the result. */
4972  LD_UB2(dst, dst_stride, dst0, dst1);
4973  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
4974  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
4975  horiz1, horiz2, horiz3, horiz4,
4976  horiz1, horiz0, horiz0, horiz1,
4977  horiz2, horiz3, horiz4, horiz5,
4978  const20, const6, const3);
4979  res0 = __msa_aver_u_b(avg0, res0);
      /* avg0 is reused: now the low doublewords of dst0 and dst1. */
4980  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
4981  res0 = __msa_aver_u_b(avg0, res0);
4982  ST8x2_UB(res0, dst, dst_stride);
4983  dst += (2 * dst_stride);
4984 
4985  LD_UB2(src, src_stride, inp2, inp3);
4986  src += (2 * src_stride);
4987  horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
4988  mask0, mask1, mask2, mask3,
4989  const20, const6, const3);
4990  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
4991  LD_UB2(dst, dst_stride, dst0, dst1);
4992  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
4993  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
4994  horiz3, horiz4, horiz5, horiz6,
4995  horiz3, horiz2, horiz1, horiz0,
4996  horiz4, horiz5, horiz6, horiz7,
4997  const20, const6, const3);
4998  res1 = __msa_aver_u_b(avg1, res1);
4999  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5000  res1 = __msa_aver_u_b(avg1, res1);
5001  ST8x2_UB(res1, dst, dst_stride);
5002  dst += (2 * dst_stride);
5003 
      /* Ninth source vector, handled alone with the single-row macro. */
5004  inp0 = LD_UB(src);
5005  horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
5006  mask0, mask1, mask2, mask3,
5007  const20, const6, const3);
5008  LD_UB2(dst, dst_stride, dst0, dst1);
5009  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
5010  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5011  horiz5, horiz6, horiz7, horiz8,
5012  horiz5, horiz4, horiz3, horiz2,
5013  horiz6, horiz7, horiz8, horiz8,
5014  const20, const6, const3);
5015  res0 = __msa_aver_u_b(avg0, res0);
5016  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5017  res0 = __msa_aver_u_b(avg0, res0);
5018  ST8x2_UB(res0, dst, dst_stride);
5019  dst += (2 * dst_stride);
5020 
5021  LD_UB2(dst, dst_stride, dst0, dst1);
5022  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
5023  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5024  horiz7, horiz8, horiz8, horiz7,
5025  horiz7, horiz6, horiz5, horiz4,
5026  horiz8, horiz8, horiz7, horiz6,
5027  const20, const6, const3);
5028  res1 = __msa_aver_u_b(avg1, res1);
5029  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5030  res1 = __msa_aver_u_b(avg1, res1);
5031  ST8x2_UB(res1, dst, dst_stride);
5032 }
5033 
5035  int32_t src_stride,
5036  uint8_t *dst,
5037  int32_t dst_stride)
5038 {
      /*
       * Two-step 16x16 path: hv_mc_qpel_aver_horiz_src1_16x16_msa() is given
       * src and the local scratch buffer (presumably filling it), then
       * vert_mc_qpel_avg_dst_aver_src0_16x16_msa() is given that buffer and
       * dst.
       * NOTE(review): the line carrying this definition's return type, name
       * and leading parameter is missing from this listing; `src` used below
       * is presumably that parameter.
       */
      /* 272 = 17 * 16 bytes; presumably 17 rows of 16 bytes - confirm
       * against the horizontal helper. */
5039  uint8_t buff[272];
5040 
5041  hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
5042  vert_mc_qpel_avg_dst_aver_src0_16x16_msa(buff, 16, dst, dst_stride);
5043 }
5044 
5046  int32_t src_stride,
5047  uint8_t *dst,
5048  int32_t dst_stride)
5049 {
      /*
       * Register-only variant (presumably the 8x8 case): nine source vectors
       * become intermediate rows horiz0..horiz8, each being the horizontal
       * macro's output averaged (__msa_aver_u_b) with the source vector moved
       * by one byte position.
       * APPLY_VERT_QPEL_FILTER_8BYTE is then applied four times; each result
       * is averaged first with a pair of intermediate rows (horiz0/horiz1,
       * horiz2/horiz3, horiz4/horiz5, horiz6/horiz7) and then with the two
       * vectors just loaded from dst, before being stored back via ST8x2_UB.
       * NOTE(review): the line carrying this definition's return type, name
       * and leading parameter is missing from this listing; `src` used below
       * is presumably that parameter.
       */
5050  v16u8 inp0, inp1, inp2, inp3;
5051  v16u8 res0, res1, avg0, avg1;
5052  v16u8 horiz0, horiz1, horiz2, horiz3;
5053  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5054  v16u8 dst0, dst1;
      /* Byte index tables handed to the horizontal filter macros. */
5055  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5056  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5057  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5058  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Byte vectors filled with 20, 6 and 3, passed to every filter macro
       * (presumably the tap weights). */
5059  v16u8 const20 = (v16u8) __msa_ldi_b(20);
5060  v16u8 const6 = (v16u8) __msa_ldi_b(6);
5061  v16u8 const3 = (v16u8) __msa_ldi_b(3);
5062 
5063  LD_UB2(src, src_stride, inp0, inp1);
5064  src += (2 * src_stride);
5065  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5066  const20, const6, const3);
5067 
5068  LD_UB2(src, src_stride, inp2, inp3);
5069  src += (2 * src_stride);
      /* Shift amount 1: presumably moves each row by one byte so that the
       * average below pairs the filter output with src[x + 1]. */
5070  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5071 
      /* Pack the low doublewords of inp0 and inp1 into one vector. */
5072  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5073  horiz0 = __msa_aver_u_b(inp0, res0);
      /* horiz1 = high doubleword of horiz0 replicated into both halves. */
5074  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5075  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5076  const20, const6, const3);
5077  LD_UB2(src, src_stride, inp0, inp1);
5078  src += (2 * src_stride);
5079  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5080 
5081  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5082  horiz2 = __msa_aver_u_b(inp2, res1);
5083  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5084  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5085  const20, const6, const3);
5086 
5087  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5088 
5089  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5090  horiz4 = __msa_aver_u_b(inp0, res0);
5091  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* Current dst contents are read before each store so they can be
       * blended into the result. */
5092  LD_UB2(dst, dst_stride, dst0, dst1);
5093  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
5094  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5095  horiz1, horiz2, horiz3, horiz4,
5096  horiz1, horiz0, horiz0, horiz1,
5097  horiz2, horiz3, horiz4, horiz5,
5098  const20, const6, const3);
5099  res0 = __msa_aver_u_b(avg0, res0);
      /* avg0 is reused: now the low doublewords of dst0 and dst1. */
5100  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5101  res0 = __msa_aver_u_b(avg0, res0);
5102  ST8x2_UB(res0, dst, dst_stride);
5103  dst += (2 * dst_stride);
5104 
5105  LD_UB2(src, src_stride, inp2, inp3);
5106  src += (2 * src_stride);
5107  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5108  const20, const6, const3);
5109 
5110  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5111 
5112  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5113  horiz6 = __msa_aver_u_b(inp2, res1);
5114  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5115  LD_UB2(dst, dst_stride, dst0, dst1);
5116  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
5117  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5118  horiz3, horiz4, horiz5, horiz6,
5119  horiz3, horiz2, horiz1, horiz0,
5120  horiz4, horiz5, horiz6, horiz7,
5121  const20, const6, const3);
5122  res1 = __msa_aver_u_b(avg1, res1);
5123  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5124  res1 = __msa_aver_u_b(avg1, res1);
5125  ST8x2_UB(res1, dst, dst_stride);
5126  dst += (2 * dst_stride);
5127 
      /* Ninth source vector, handled alone with the single-row macro. */
5128  inp0 = LD_UB(src);
5129  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5130  const20, const6, const3);
5131  inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
5132  horiz8 = __msa_aver_u_b(inp0, res0);
5133  LD_UB2(dst, dst_stride, dst0, dst1);
5134  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
5135  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5136  horiz5, horiz6, horiz7, horiz8,
5137  horiz5, horiz4, horiz3, horiz2,
5138  horiz6, horiz7, horiz8, horiz8,
5139  const20, const6, const3);
5140  res0 = __msa_aver_u_b(avg0, res0);
5141  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5142  res0 = __msa_aver_u_b(avg0, res0);
5143  ST8x2_UB(res0, dst, dst_stride);
5144  dst += (2 * dst_stride);
5145 
5146  LD_UB2(dst, dst_stride, dst0, dst1);
5147  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
5148  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5149  horiz7, horiz8, horiz8, horiz7,
5150  horiz7, horiz6, horiz5, horiz4,
5151  horiz8, horiz8, horiz7, horiz6,
5152  const20, const6, const3);
5153  res1 = __msa_aver_u_b(avg1, res1);
5154  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5155  res1 = __msa_aver_u_b(avg1, res1);
5156  ST8x2_UB(res1, dst, dst_stride);
5157 }
5158 
5160  int32_t src_stride,
5161  uint8_t *dst,
5162  int32_t dst_stride)
5163 {
      /*
       * Two-step 16x16 path: hv_mc_qpel_aver_horiz_src0_16x16_msa() is given
       * src and the local scratch buffer (presumably filling it), then
       * vert_mc_qpel_avg_dst_16x16_msa() is given that buffer and dst.
       * NOTE(review): the line carrying this definition's return type, name
       * and leading parameter is missing from this listing; `src` used below
       * is presumably that parameter.
       */
      /* 272 = 17 * 16 bytes; presumably 17 rows of 16 bytes - confirm
       * against the horizontal helper. */
5164  uint8_t buff[272];
5165 
5166  hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
5167  vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride);
5168 }
5169 
5171  int32_t src_stride,
5172  uint8_t *dst,
5173  int32_t dst_stride)
5174 {
      /*
       * Register-only variant (presumably the 8x8 case): nine source vectors
       * become intermediate rows horiz0..horiz8, each being the horizontal
       * macro's output averaged (__msa_aver_u_b) with the unshifted source.
       * APPLY_VERT_QPEL_FILTER_8BYTE is then applied four times; each result
       * is averaged with the two vectors just loaded from dst and stored
       * back via ST8x2_UB.
       * NOTE(review): the line carrying this definition's return type, name
       * and leading parameter is missing from this listing; `src` used below
       * is presumably that parameter.
       */
5175  v16u8 inp0, inp1, inp2, inp3;
      /* Despite their names, avg0/avg1 only ever hold packed dst rows here. */
5176  v16u8 res0, res1, avg0, avg1;
5177  v16u8 horiz0, horiz1, horiz2, horiz3;
5178  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5179  v16u8 dst0, dst1;
      /* Byte index tables handed to the horizontal filter macros. */
5180  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5181  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5182  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5183  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Byte vectors filled with 20, 6 and 3, passed to every filter macro
       * (presumably the tap weights). */
5184  v16u8 const20 = (v16u8) __msa_ldi_b(20);
5185  v16u8 const6 = (v16u8) __msa_ldi_b(6);
5186  v16u8 const3 = (v16u8) __msa_ldi_b(3);
5187 
5188  LD_UB2(src, src_stride, inp0, inp1);
5189  src += (2 * src_stride);
5190  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5191  const20, const6, const3);
5192  LD_UB2(src, src_stride, inp2, inp3);
5193  src += (2 * src_stride);
      /* Pack the low doublewords of inp0 and inp1 into one vector. */
5194  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5195  horiz0 = __msa_aver_u_b(inp0, res0);
      /* horiz1 = high doubleword of horiz0 replicated into both halves. */
5196  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5197  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5198  const20, const6, const3);
5199  LD_UB2(src, src_stride, inp0, inp1);
5200  src += (2 * src_stride);
5201  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5202  horiz2 = __msa_aver_u_b(inp2, res1);
5203  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5204  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5205  const20, const6, const3);
5206  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5207  horiz4 = __msa_aver_u_b(inp0, res0);
5208  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
      /* Current dst contents are read before each store so they can be
       * blended into the result. */
5209  LD_UB2(dst, dst_stride, dst0, dst1);
5210  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5211  horiz1, horiz2, horiz3, horiz4,
5212  horiz1, horiz0, horiz0, horiz1,
5213  horiz2, horiz3, horiz4, horiz5,
5214  const20, const6, const3);
5215  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5216  res0 = __msa_aver_u_b(avg0, res0);
5217  ST8x2_UB(res0, dst, dst_stride);
5218  dst += (2 * dst_stride);
5219 
5220  LD_UB2(src, src_stride, inp2, inp3);
5221  src += (2 * src_stride);
5222  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5223  const20, const6, const3);
5224  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5225  horiz6 = __msa_aver_u_b(inp2, res1);
5226  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5227  LD_UB2(dst, dst_stride, dst0, dst1);
5228  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5229  horiz3, horiz4, horiz5, horiz6,
5230  horiz3, horiz2, horiz1, horiz0,
5231  horiz4, horiz5, horiz6, horiz7,
5232  const20, const6, const3);
5233  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5234  res1 = __msa_aver_u_b(avg1, res1);
5235  ST8x2_UB(res1, dst, dst_stride);
5236  dst += (2 * dst_stride);
5237 
      /* Ninth source vector, handled alone with the single-row macro. */
5238  inp0 = LD_UB(src);
5239  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5240  const20, const6, const3);
5241  horiz8 = __msa_aver_u_b(inp0, res0);
5242  LD_UB2(dst, dst_stride, dst0, dst1);
5243  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5244  horiz5, horiz6, horiz7, horiz8,
5245  horiz5, horiz4, horiz3, horiz2,
5246  horiz6, horiz7, horiz8, horiz8,
5247  const20, const6, const3);
5248  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5249  res0 = __msa_aver_u_b(avg0, res0);
5250  ST8x2_UB(res0, dst, dst_stride);
5251  dst += (2 * dst_stride);
5252 
5253  LD_UB2(dst, dst_stride, dst0, dst1);
5254  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5255  horiz7, horiz8, horiz8, horiz7,
5256  horiz7, horiz6, horiz5, horiz4,
5257  horiz8, horiz8, horiz7, horiz6,
5258  const20, const6, const3);
5259  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5260  res1 = __msa_aver_u_b(avg1, res1);
5261  ST8x2_UB(res1, dst, dst_stride);
      /* NOTE(review): dst is not read after this increment; the statement
       * below has no effect and could be dropped. */
5262  dst += (2 * dst_stride);
5263 }
5264 
/*
 * 16x16 two-pass qpel with destination averaging.
 *
 * Runs the horizontal helper from src into a local scratch buffer, then
 * hands that buffer to the vertical "avg_dst" helper, which writes dst.
 *
 * src / src_stride: source block and its row pitch in bytes.
 * dst / dst_stride: destination block and its row pitch in bytes.
 *
 * NOTE(review): 272 bytes is 17 rows at the 16-byte pitch used for the
 * scratch buffer -- presumably one extra row for the vertical taps; confirm
 * against hv_mc_qpel_aver_horiz_16x16_msa().
 */
static void hv_mc_qpel_avg_dst_16x16_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    uint8_t horiz_rows[272];

    /* Pass 1: horizontal stage, output lands in the scratch buffer. */
    hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, horiz_rows, 16, 16);

    /* Pass 2: vertical stage reads the scratch buffer (pitch 16) into dst. */
    vert_mc_qpel_avg_dst_16x16_msa(horiz_rows, 16, dst, dst_stride);
}
5274 
/*
 * 8x8 qpel block: horizontal filter, then vertical filter, then average the
 * result with the bytes already present in dst.
 *
 * Nine source rows are read (four LD_UB2 row pairs plus one LD_UB) and each
 * is passed through a horizontal filter macro, giving horiz0..horiz8.  The
 * vertical filter macro is then invoked four times; each result is averaged
 * with two freshly loaded dst rows via __msa_aver_u_b and stored with
 * ST8x2_UB, after which dst advances by two rows.
 *
 * src / src_stride: first source row and the byte distance between rows.
 * dst / dst_stride: destination block (read and overwritten) and its pitch.
 *
 * NOTE(review): the filter macros are defined outside this function; the
 * 20 / 6 / 3 vectors are presumably the qpel tap weights -- confirm there.
 */
5275 static void hv_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, int32_t src_stride,
5276  uint8_t *dst, int32_t dst_stride)
5277 {
5278  v16u8 inp0, inp1, inp2, inp3;
5279  v16u8 res0, res1, avg0, avg1;
5280  v16u8 horiz0, horiz1, horiz2, horiz3;
5281  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5282  v16u8 dst0, dst1;
      /* Byte index tables handed to the horizontal filter macros. */
5283  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5284  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5285  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5286  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
      /* Constant vectors passed to both the horizontal and vertical macros. */
5287  v16u8 const20 = (v16u8) __msa_ldi_b(20);
5288  v16u8 const6 = (v16u8) __msa_ldi_b(6);
5289  v16u8 const3 = (v16u8) __msa_ldi_b(3);
5290 
      /*
       * Horizontal stage.  Each two-row macro call fills an even-numbered
       * horizN; __msa_splati_d(..., 1) then replicates that vector's upper
       * doubleword into horizN+1 (presumably the second row of the pair).
       * Loads of the next pair are interleaved with the filtering.
       */
5291  LD_UB2(src, src_stride, inp0, inp1);
5292  src += (2 * src_stride);
5293  horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
5294  mask0, mask1, mask2, mask3,
5295  const20, const6, const3);
5296  LD_UB2(src, src_stride, inp2, inp3);
5297  src += (2 * src_stride);
5298  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5299  horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
5300  mask0, mask1, mask2, mask3,
5301  const20, const6, const3);
5302  LD_UB2(src, src_stride, inp0, inp1);
5303  src += (2 * src_stride);
5304  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5305  horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
5306  mask0, mask1, mask2, mask3,
5307  const20, const6, const3);
5308  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5309  LD_UB2(src, src_stride, inp2, inp3);
5310  src += (2 * src_stride);
5311  horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
5312  mask0, mask1, mask2, mask3,
5313  const20, const6, const3);
5314  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
      /* Ninth row: loaded alone and filtered with the single-row macro. */
5315  inp0 = LD_UB(src);
5316  horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
5317  mask0, mask1, mask2, mask3,
5318  const20, const6, const3);
      /*
       * Vertical stage, first of four calls.  horiz0 and horiz1 are listed
       * more than once in the argument list -- presumably standing in for
       * rows above the block (edge mirroring); confirm against the macro.
       */
5319  LD_UB2(dst, dst_stride, dst0, dst1);
5320  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5321  horiz1, horiz2, horiz3, horiz4,
5322  horiz1, horiz0, horiz0, horiz1,
5323  horiz2, horiz3, horiz4, horiz5,
5324  const20, const6, const3);
      /* Combine the low doublewords of dst0 and dst1 into one vector, then average. */
5325  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5326  res0 = __msa_aver_u_b(avg0, res0);
5327  ST8x2_UB(res0, dst, dst_stride);
5328  dst += (2 * dst_stride);
5329 
      /* Second vertical call; same dst load / average / store pattern. */
5330  LD_UB2(dst, dst_stride, dst0, dst1);
5331  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5332  horiz3, horiz4, horiz5, horiz6,
5333  horiz3, horiz2, horiz1, horiz0,
5334  horiz4, horiz5, horiz6, horiz7,
5335  const20, const6, const3);
5336  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5337  res1 = __msa_aver_u_b(avg1, res1);
5338  ST8x2_UB(res1, dst, dst_stride);
5339  dst += (2 * dst_stride);
5340 
      /* Third vertical call; horiz8 is repeated in the last argument group. */
5341  LD_UB2(dst, dst_stride, dst0, dst1);
5342  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5343  horiz5, horiz6, horiz7, horiz8,
5344  horiz5, horiz4, horiz3, horiz2,
5345  horiz6, horiz7, horiz8, horiz8,
5346  const20, const6, const3);
5347  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5348  res0 = __msa_aver_u_b(avg0, res0);
5349  ST8x2_UB(res0, dst, dst_stride);
5350  dst += (2 * dst_stride);
5351 
      /*
       * Fourth vertical call: horiz8, horiz7 and horiz6 are reused in place
       * of rows past the ninth one.  dst is not advanced after this store.
       */
5352  LD_UB2(dst, dst_stride, dst0, dst1);
5353  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5354  horiz7, horiz8, horiz8, horiz7,
5355  horiz7, horiz6, horiz5, horiz4,
5356  horiz8, horiz8, horiz7, horiz6,
5357  const20, const6, const3);
5358  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5359  res1 = __msa_aver_u_b(avg1, res1);
5360  ST8x2_UB(res1, dst, dst_stride);
5361 }
5362 
5364  int32_t src_stride,
5365  uint8_t *dst,
5366  int32_t dst_stride)
5367 {
5368  uint8_t buff[272];
5369 
5370  hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
5371  vert_mc_qpel_avg_dst_16x16_msa(buff, 16, dst, dst_stride);
5372 }
5373 
5375  int32_t src_stride,
5376  uint8_t *dst,
5377  int32_t dst_stride)
5378 {
5379  v16u8 inp0, inp1, inp2, inp3;
5380  v16u8 res0, res1, avg0, avg1;
5381  v16u8 horiz0, horiz1, horiz2, horiz3;
5382  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5383  v16u8 dst0, dst1;
5384  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5385  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5386  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5387  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5388  v16u8 const20 = (v16u8) __msa_ldi_b(20);
5389  v16u8 const6 = (v16u8) __msa_ldi_b(6);
5390  v16u8 const3 = (v16u8) __msa_ldi_b(3);
5391 
5392  LD_UB2(src, src_stride, inp0, inp1);
5393  src += (2 * src_stride);
5394  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5395  const20, const6, const3);
5396  LD_UB2(src, src_stride, inp2, inp3);
5397  src += (2 * src_stride);
5398  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5399 
5400  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5401  horiz0 = __msa_aver_u_b(inp0, res0);
5402  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5403  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5404  const20, const6, const3);
5405  LD_UB2(src, src_stride, inp0, inp1);
5406  src += (2 * src_stride);
5407  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5408 
5409  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5410  horiz2 = __msa_aver_u_b(inp2, res1);
5411  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5412  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5413  const20, const6, const3);
5414 
5415  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5416 
5417  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5418  horiz4 = __msa_aver_u_b(inp0, res0);
5419  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5420  LD_UB2(dst, dst_stride, dst0, dst1);
5421  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5422  horiz1, horiz2, horiz3, horiz4,
5423  horiz1, horiz0, horiz0, horiz1,
5424  horiz2, horiz3, horiz4, horiz5,
5425  const20, const6, const3);
5426  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5427  res0 = __msa_aver_u_b(avg0, res0);
5428  ST8x2_UB(res0, dst, dst_stride);
5429  dst += (2 * dst_stride);
5430 
5431  LD_UB2(src, src_stride, inp2, inp3);
5432  src += (2 * src_stride);
5433  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5434  const20, const6, const3);
5435 
5436  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5437 
5438  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5439  horiz6 = __msa_aver_u_b(inp2, res1);
5440  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5441  LD_UB2(dst, dst_stride, dst0, dst1);
5442  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5443  horiz3, horiz4, horiz5, horiz6,
5444  horiz3, horiz2, horiz1, horiz0,
5445  horiz4, horiz5, horiz6, horiz7,
5446  const20, const6, const3);
5447  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5448  res1 = __msa_aver_u_b(avg1, res1);
5449  ST8x2_UB(res1, dst, dst_stride);
5450  dst += (2 * dst_stride);
5451 
5452  inp0 = LD_UB(src);
5453  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5454  const20, const6, const3);
5455  inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
5456  horiz8 = __msa_aver_u_b(inp0, res0);
5457  LD_UB2(dst, dst_stride, dst0, dst1);
5458  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5459  horiz5, horiz6, horiz7, horiz8,
5460  horiz5, horiz4, horiz3, horiz2,
5461  horiz6, horiz7, horiz8, horiz8,
5462  const20, const6, const3);
5463  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5464  res0 = __msa_aver_u_b(avg0, res0);
5465  ST8x2_UB(res0, dst, dst_stride);
5466  dst += (2 * dst_stride);
5467 
5468  LD_UB2(dst, dst_stride, dst0, dst1);
5469  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5470  horiz7, horiz8, horiz8, horiz7,
5471  horiz7, horiz6, horiz5, horiz4,
5472  horiz8, horiz8, horiz7, horiz6,
5473  const20, const6, const3);
5474  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5475  res1 = __msa_aver_u_b(avg1, res1);
5476  ST8x2_UB(res1, dst, dst_stride);
5477 }
5478 
5480  int32_t src_stride,
5481  uint8_t *dst,
5482  int32_t dst_stride)
5483 {
5484  uint8_t buff[272];
5485 
5486  hv_mc_qpel_aver_horiz_src0_16x16_msa(src, src_stride, buff, 16, 16);
5487  vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
5488 }
5489 
5491  int32_t src_stride,
5492  uint8_t *dst,
5493  int32_t dst_stride)
5494 {
5495  v16u8 inp0, inp1, inp2, inp3;
5496  v16u8 res0, res1, avg0, avg1;
5497  v16u8 horiz0, horiz1, horiz2, horiz3;
5498  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5499  v16u8 dst0, dst1;
5500  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5501  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5502  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5503  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5504  v16u8 const20 = (v16u8) __msa_ldi_b(20);
5505  v16u8 const6 = (v16u8) __msa_ldi_b(6);
5506  v16u8 const3 = (v16u8) __msa_ldi_b(3);
5507 
5508  LD_UB2(src, src_stride, inp0, inp1);
5509  src += (2 * src_stride);
5510 
5511  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5512  const20, const6, const3);
5513  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5514  horiz0 = __msa_aver_u_b(inp0, res0);
5515  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5516  LD_UB2(src, src_stride, inp2, inp3);
5517  src += (2 * src_stride);
5518  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5519  const20, const6, const3);
5520  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5521  horiz2 = __msa_aver_u_b(inp2, res1);
5522  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5523  LD_UB2(dst, dst_stride, dst0, dst1);
5524  LD_UB2(src, src_stride, inp0, inp1);
5525  src += (2 * src_stride);
5526  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5527  const20, const6, const3);
5528  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5529  horiz4 = __msa_aver_u_b(inp0, res0);
5530  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5531  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5532  horiz1, horiz2, horiz3, horiz4,
5533  horiz1, horiz0, horiz0, horiz1,
5534  horiz2, horiz3, horiz4, horiz5,
5535  const20, const6, const3);
5536  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
5537  res0 = __msa_aver_u_b(avg0, res0);
5538  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5539  res0 = __msa_aver_u_b(avg0, res0);
5540  ST8x2_UB(res0, dst, dst_stride);
5541  dst += (2 * dst_stride);
5542 
5543  LD_UB2(dst, dst_stride, dst0, dst1);
5544  LD_UB2(src, src_stride, inp2, inp3);
5545  src += (2 * src_stride);
5546  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5547  const20, const6, const3);
5548  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5549  horiz6 = __msa_aver_u_b(inp2, res1);
5550  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5551  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5552  horiz3, horiz4, horiz5, horiz6,
5553  horiz3, horiz2, horiz1, horiz0,
5554  horiz4, horiz5, horiz6, horiz7,
5555  const20, const6, const3);
5556  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
5557  res1 = __msa_aver_u_b(avg1, res1);
5558  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5559  res1 = __msa_aver_u_b(avg1, res1);
5560  ST8x2_UB(res1, dst, dst_stride);
5561  dst += (2 * dst_stride);
5562 
5563  inp0 = LD_UB(src);
5564  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5565  const20, const6, const3);
5566  horiz8 = __msa_aver_u_b(inp0, res0);
5567  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
5568  horiz5, horiz6, horiz7, horiz8,
5569  horiz5, horiz4, horiz3, horiz2,
5570  horiz6, horiz7, horiz8, horiz8,
5571  const20, const6, const3);
5572  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
5573  horiz7, horiz8, horiz8, horiz7,
5574  horiz7, horiz6, horiz5, horiz4,
5575  horiz8, horiz8, horiz7, horiz6,
5576  const20, const6, const3);
5577  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
5578  res0 = __msa_aver_u_b(avg0, res0);
5579  LD_UB2(dst, dst_stride, dst0, dst1);
5580  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5581  res0 = __msa_aver_u_b(avg0, res0);
5582  ST8x2_UB(res0, dst, dst_stride);
5583  dst += (2 * dst_stride);
5584 
5585  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
5586  res1 = __msa_aver_u_b(avg1, res1);
5587  LD_UB2(dst, dst_stride, dst0, dst1);
5588  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5589  res1 = __msa_aver_u_b(avg1, res1);
5590  ST8x2_UB(res1, dst, dst_stride);
5591 }
5592 
5594  int32_t src_stride,
5595  uint8_t *dst,
5596  int32_t dst_stride)
5597 {
5598  uint8_t buff[272];
5599 
5600  hv_mc_qpel_aver_horiz_16x16_msa(src, src_stride, buff, 16, 16);
5601  vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
5602 }
5603 
5605  int32_t src_stride,
5606  uint8_t *dst,
5607  int32_t dst_stride)
5608 {
5609  v16u8 inp0, inp1, inp2, inp3;
5610  v16u8 res0, res1, avg0, avg1;
5611  v16u8 horiz0, horiz1, horiz2, horiz3;
5612  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5613  v16u8 dst0, dst1;
5614  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5615  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5616  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5617  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5618  v16u8 const20 = (v16u8) __msa_ldi_b(20);
5619  v16u8 const6 = (v16u8) __msa_ldi_b(6);
5620  v16u8 const3 = (v16u8) __msa_ldi_b(3);
5621 
5622  LD_UB2(src, src_stride, inp0, inp1);
5623  src += (2 * src_stride);
5624  horiz0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
5625  mask0, mask1, mask2, mask3,
5626  const20, const6, const3);
5627  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5628  LD_UB2(src, src_stride, inp2, inp3);
5629  src += (2 * src_stride);
5630  horiz2 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
5631  mask0, mask1, mask2, mask3,
5632  const20, const6, const3);
5633  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5634  LD_UB2(dst, dst_stride, dst0, dst1);
5635  LD_UB2(src, src_stride, inp0, inp1);
5636  src += (2 * src_stride);
5637  horiz4 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
5638  mask0, mask1, mask2, mask3,
5639  const20, const6, const3);
5640  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5641  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2,
5642  horiz1, horiz2, horiz3, horiz4,
5643  horiz1, horiz0, horiz0, horiz1,
5644  horiz2, horiz3, horiz4, horiz5,
5645  const20, const6, const3);
5646  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
5647  res0 = __msa_aver_u_b(avg0, res0);
5648  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5649  res0 = __msa_aver_u_b(avg0, res0);
5650  ST8x2_UB(res0, dst, dst_stride);
5651  dst += (2 * dst_stride);
5652 
5653  LD_UB2(dst, dst_stride, dst0, dst1);
5654  LD_UB2(src, src_stride, inp2, inp3);
5655  src += (2 * src_stride);
5656  horiz6 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
5657  mask0, mask1, mask2, mask3,
5658  const20, const6, const3);
5659  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5660  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
5661  horiz3, horiz4, horiz5, horiz6,
5662  horiz3, horiz2, horiz1, horiz0,
5663  horiz4, horiz5, horiz6, horiz7,
5664  const20, const6, const3);
5665  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
5666  res1 = __msa_aver_u_b(avg1, res1);
5667  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5668  res1 = __msa_aver_u_b(avg1, res1);
5669  ST8x2_UB(res1, dst, dst_stride);
5670  dst += (2 * dst_stride);
5671 
5672  inp0 = LD_UB(src);
5673  horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
5674  mask0, mask1, mask2, mask3,
5675  const20, const6, const3);
5676  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5,
5677  horiz6, horiz7, horiz8, horiz5, horiz4,
5678  horiz3, horiz2, horiz6, horiz7, horiz8,
5679  horiz8, const20, const6, const3);
5680  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7,
5681  horiz8, horiz8, horiz7, horiz7, horiz6,
5682  horiz5, horiz4, horiz8, horiz8, horiz7,
5683  horiz6, const20, const6, const3);
5684  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
5685  res0 = __msa_aver_u_b(avg0, res0);
5686  LD_UB2(dst, dst_stride, dst0, dst1);
5687  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5688  res0 = __msa_aver_u_b(avg0, res0);
5689  ST8x2_UB(res0, dst, dst_stride);
5690  dst += (2 * dst_stride);
5691 
5692  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
5693  res1 = __msa_aver_u_b(avg1, res1);
5694  LD_UB2(dst, dst_stride, dst0, dst1);
5695  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5696  res1 = __msa_aver_u_b(avg1, res1);
5697  ST8x2_UB(res1, dst, dst_stride);
5698 }
5699 
5701  int32_t src_stride,
5702  uint8_t *dst,
5703  int32_t dst_stride)
5704 {
5705  uint8_t buff[272];
5706 
5707  hv_mc_qpel_aver_horiz_src1_16x16_msa(src, src_stride, buff, 16, 16);
5708  vert_mc_qpel_avg_dst_aver_src1_16x16_msa(buff, 16, dst, dst_stride);
5709 }
5710 
5712  int32_t src_stride,
5713  uint8_t *dst,
5714  int32_t dst_stride)
5715 {
5716  v16u8 inp0, inp1, inp2, inp3;
5717  v16u8 res0, res1, avg0, avg1;
5718  v16u8 horiz0, horiz1, horiz2, horiz3;
5719  v16u8 horiz4, horiz5, horiz6, horiz7, horiz8;
5720  v16u8 dst0, dst1;
5721  v16u8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5722  v16u8 mask1 = { 0, 2, 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 8 };
5723  v16u8 mask2 = { 1, 3, 0, 4, 0, 5, 1, 6, 2, 7, 3, 8, 4, 8, 5, 7 };
5724  v16u8 mask3 = { 2, 4, 1, 5, 0, 6, 0, 7, 1, 8, 2, 8, 3, 7, 4, 6 };
5725  v16u8 const20 = (v16u8) __msa_ldi_b(20);
5726  v16u8 const6 = (v16u8) __msa_ldi_b(6);
5727  v16u8 const3 = (v16u8) __msa_ldi_b(3);
5728 
5729  LD_UB2(src, src_stride, inp0, inp1);
5730  src += (2 * src_stride);
5731  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5732  const20, const6, const3);
5733  LD_UB2(src, src_stride, inp2, inp3);
5734  src += (2 * src_stride);
5735  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5736 
5737  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5738  horiz0 = __msa_aver_u_b(inp0, res0);
5739  horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
5740  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5741  const20, const6, const3);
5742  LD_UB2(src, src_stride, inp0, inp1);
5743  src += (2 * src_stride);
5744  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5745 
5746  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5747  horiz2 = __msa_aver_u_b(inp2, res1);
5748  horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
5749  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
5750  const20, const6, const3);
5751  SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
5752 
5753  inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
5754  horiz4 = __msa_aver_u_b(inp0, res0);
5755  horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
5756  LD_UB2(dst, dst_stride, dst0, dst1);
5757  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
5758  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz0, horiz0, horiz1, horiz2, horiz1,
5759  horiz2, horiz3, horiz4, horiz1, horiz0,
5760  horiz0, horiz1, horiz2, horiz3, horiz4,
5761  horiz5, const20, const6, const3);
5762  res0 = __msa_aver_u_b(avg0, res0);
5763  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5764  res0 = __msa_aver_u_b(avg0, res0);
5765  ST8x2_UB(res0, dst, dst_stride);
5766  dst += (2 * dst_stride);
5767 
5768  LD_UB2(src, src_stride, inp2, inp3);
5769  src += (2 * src_stride);
5770  res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
5771  const20, const6, const3);
5772  SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
5773 
5774  inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
5775  horiz6 = __msa_aver_u_b(inp2, res1);
5776  horiz7 = (v16u8) __msa_splati_d((v2i64) horiz6, 1);
5777  LD_UB2(dst, dst_stride, dst0, dst1);
5778  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
5779  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, horiz3,
5780  horiz4, horiz5, horiz6, horiz3, horiz2,
5781  horiz1, horiz0, horiz4, horiz5, horiz6,
5782  horiz7, const20, const6, const3);
5783  res1 = __msa_aver_u_b(avg1, res1);
5784  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5785  res1 = __msa_aver_u_b(avg1, res1);
5786  ST8x2_UB(res1, dst, dst_stride);
5787  dst += (2 * dst_stride);
5788 
5789  inp0 = LD_UB(src);
5790  res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
5791  const20, const6, const3);
5792  inp0 = (v16u8) __msa_sldi_b((v16i8) inp0, (v16i8) inp0, 1);
5793  horiz8 = __msa_aver_u_b(inp0, res0);
5794  LD_UB2(dst, dst_stride, dst0, dst1);
5795  avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
5796  res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5,
5797  horiz6, horiz7, horiz8, horiz5, horiz4,
5798  horiz3, horiz2, horiz6, horiz7, horiz8,
5799  horiz8, const20, const6, const3);
5800  res0 = __msa_aver_u_b(avg0, res0);
5801  avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5802  res0 = __msa_aver_u_b(avg0, res0);
5803  ST8x2_UB(res0, dst, dst_stride);
5804  dst += (2 * dst_stride);
5805 
5806  LD_UB2(dst, dst_stride, dst0, dst1);
5807  avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
5808  res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7,
5809  horiz8, horiz8, horiz7, horiz7, horiz6,
5810  horiz5, horiz4, horiz8, horiz8, horiz7,
5811  horiz6, const20, const6, const3);
5812  res1 = __msa_aver_u_b(avg1, res1);
5813  avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
5814  res1 = __msa_aver_u_b(avg1, res1);
5815  ST8x2_UB(res1, dst, dst_stride);
5816 }
5817 
/*
 * Copy eight rows from src to dst, one 64-bit load/store per row.
 *
 * The rows are handled as four pairs: both rows of a pair are read with
 * LD() before either is written with SD().
 *
 * src / src_stride: source rows and the byte step between them.
 * dst / dst_stride: destination rows and the byte step between them.
 */
static void copy_8x8_msa(const uint8_t *src, int32_t src_stride,
                         uint8_t *dst, int32_t dst_stride)
{
    int32_t pairs_left = 4;

    while (pairs_left--) {
        uint64_t row_a, row_b;

        /* Fetch the pair first ... */
        row_a = LD(src);
        src += src_stride;
        row_b = LD(src);
        src += src_stride;

        /* ... then write it out. */
        SD(row_a, dst);
        dst += dst_stride;
        SD(row_b, dst);
        dst += dst_stride;
    }
}
5836 
5837 static void copy_16x16_msa(const uint8_t *src, int32_t src_stride,
5838  uint8_t *dst, int32_t dst_stride)
5839 {
5840  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
5841  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
5842 
5843  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
5844  src += (8 * src_stride);
5845  LD_UB8(src, src_stride,
5846  src8, src9, src10, src11, src12, src13, src14, src15);
5847 
5848  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
5849  dst += (8 * dst_stride);
5850  ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
5851  dst, dst_stride);
5852 }
5853 
5854 static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
5855  uint8_t *dst, int32_t dst_stride,
5856  int32_t height)
5857 {
5858  int32_t cnt;
5859  uint64_t out0, out1, out2, out3;
5860  v16u8 src0, src1, src2, src3;
5861  v16u8 dst0, dst1, dst2, dst3;
5862 
5863  for (cnt = (height / 4); cnt--;) {
5864  LD_UB4(src, src_stride, src0, src1, src2, src3);
5865  src += (4 * src_stride);
5866  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
5867 
5868  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
5869  dst0, dst1, dst2, dst3);
5870 
5871  out0 = __msa_copy_u_d((v2i64) dst0, 0);
5872  out1 = __msa_copy_u_d((v2i64) dst1, 0);
5873  out2 = __msa_copy_u_d((v2i64) dst2, 0);
5874  out3 = __msa_copy_u_d((v2i64) dst3, 0);
5875  SD4(out0, out1, out2, out3, dst, dst_stride);
5876  dst += (4 * dst_stride);
5877  }
5878 }
5879 
5880 static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
5881  uint8_t *dst, int32_t dst_stride,
5882  int32_t height)
5883 {
5884  int32_t cnt;
5885  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
5886  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5887 
5888  for (cnt = (height / 8); cnt--;) {
5889  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
5890  src += (8 * src_stride);
5891  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
5892 
5893  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
5894  dst0, dst1, dst2, dst3);
5895  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
5896  dst4, dst5, dst6, dst7);
5897  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
5898  dst += (8 * dst_stride);
5899  }
5900 }
5901 
/*
 * Non-static entry point for the 16x16 copy: forwards to copy_16x16_msa()
 * with the single stride used for both source and destination.
 *
 * Argument order differs from the helper: this function takes (dest, src),
 * the helper takes (src, ..., dst, ...).  stride is ptrdiff_t here while the
 * helper's stride parameters are int32_t, so the value is converted at the
 * call.
 */
5902 void ff_copy_16x16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
5903 {
5904  copy_16x16_msa(src, stride, dest, stride);
5905 }
5906 
/*
 * Non-static entry point for the 8x8 copy: forwards to copy_8x8_msa() with
 * the single stride used for both source and destination.
 *
 * Argument order differs from the helper: this function takes (dest, src),
 * the helper takes (src, ..., dst, ...).  stride is ptrdiff_t here while the
 * helper's stride parameters are int32_t, so the value is converted at the
 * call.
 */
5907 void ff_copy_8x8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
5908 {
5909  copy_8x8_msa(src, stride, dest, stride);
5910 }
5911 
5913  const uint8_t *src,
5914  ptrdiff_t stride)
5915 {
5916  horiz_mc_qpel_aver_src0_8width_msa(src, stride, dest, stride, 8);
5917 }
5918 
5920  const uint8_t *src,
5921  ptrdiff_t stride)
5922 {
5923  horiz_mc_qpel_aver_src0_16width_msa(src, stride, dest, stride, 16);
5924 }
5925 
5927  ptrdiff_t stride)
5928 {
5929  horiz_mc_qpel_8width_msa(src, stride, dest, stride, 8);
5930 }
5931 
5933  const uint8_t *src, ptrdiff_t stride)
5934 {
5935  horiz_mc_qpel_16width_msa(src, stride, dest, stride, 16);
5936 }
5937 
5939  const uint8_t *src,
5940  ptrdiff_t stride)
5941 {
5942  horiz_mc_qpel_aver_src1_8width_msa(src, stride, dest, stride, 8);
5943 }
5944 
5946  const uint8_t *src,
5947  ptrdiff_t stride)
5948 {
5949  horiz_mc_qpel_aver_src1_16width_msa(src, stride, dest, stride, 16);
5950 }
5951 
5953  const uint8_t *src,
5954  ptrdiff_t stride)
5955 {
5956  horiz_mc_qpel_no_rnd_aver_src0_8width_msa(src, stride, dest, stride, 8);
5957 }
5958 
5960  const uint8_t *src,
5961  ptrdiff_t stride)
5962 {
5963  horiz_mc_qpel_no_rnd_aver_src0_16width_msa(src, stride, dest, stride, 16);
5964 }
5965 
5967  const uint8_t *src, ptrdiff_t stride)
5968 {
5969  horiz_mc_qpel_no_rnd_8width_msa(src, stride, dest, stride, 8);
5970 }
5971 
5973  const uint8_t *src, ptrdiff_t stride)
5974 {
5975  horiz_mc_qpel_no_rnd_16width_msa(src, stride, dest, stride, 16);
5976 }
5977 
5979  const uint8_t *src,
5980  ptrdiff_t stride)
5981 {
5982  horiz_mc_qpel_no_rnd_aver_src1_8width_msa(src, stride, dest, stride, 8);
5983 }
5984 
5986  const uint8_t *src,
5987  ptrdiff_t stride)
5988 {
5989  horiz_mc_qpel_no_rnd_aver_src1_16width_msa(src, stride, dest, stride, 16);
5990 }
5991 
/*
 * Non-static entry point for the 8-wide average: forwards to
 * avg_width8_msa() with a fixed height of 8 and the same stride for source
 * and destination.  dest is both read and written by the helper.
 *
 * stride is ptrdiff_t here while the helper's stride parameters are
 * int32_t, so the value is converted at the call.
 */
5992 void ff_avg_width8_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
5993 {
5994  avg_width8_msa(src, stride, dest, stride, 8);
5995 }
5996 
/*
 * Non-static entry point for the 16-wide average: forwards to
 * avg_width16_msa() with a fixed height of 16 and the same stride for source
 * and destination.  dest is both read and written by the helper.
 *
 * stride is ptrdiff_t here while the helper's stride parameters are
 * int32_t, so the value is converted at the call.
 */
5997 void ff_avg_width16_msa(uint8_t *dest, const uint8_t *src, ptrdiff_t stride)
5998 {
5999  avg_width16_msa(src, stride, dest, stride, 16);
6000 }
6001 
6003  const uint8_t *src,
6004  ptrdiff_t stride)
6005 {
6006  horiz_mc_qpel_avg_dst_aver_src0_8width_msa(src, stride, dest, stride, 8);
6007 }
6008 
6010  const uint8_t *src,
6011  ptrdiff_t stride)
6012 {
6013  horiz_mc_qpel_avg_dst_aver_src0_16width_msa(src, stride, dest, stride, 16);
6014 }
6015 
6017  const uint8_t *src, ptrdiff_t stride)
6018 {
6019  horiz_mc_qpel_avg_dst_8width_msa(src, stride, dest, stride, 8);
6020 }
6021 
6023  const uint8_t *src, ptrdiff_t stride)
6024 {
6025  horiz_mc_qpel_avg_dst_16width_msa(src, stride, dest, stride, 16);
6026 }
6027 
6029  const uint8_t *src,
6030  ptrdiff_t stride)
6031 {
6032  horiz_mc_qpel_avg_dst_aver_src1_8width_msa(src, stride, dest, stride, 8);
6033 }
6034 
6036  const uint8_t *src,
6037  ptrdiff_t stride)
6038 {
6039  horiz_mc_qpel_avg_dst_aver_src1_16width_msa(src, stride, dest, stride, 16);
6040 }
6041 
6042 
6044  const uint8_t *src, ptrdiff_t stride)
6045 {
6046  vert_mc_qpel_aver_src0_8x8_msa(src, stride, dest, stride);
6047 }
6048 
6050  const uint8_t *src, ptrdiff_t stride)
6051 {
6052  vert_mc_qpel_aver_src0_16x16_msa(src, stride, dest, stride);
6053 }
6054 
6056  ptrdiff_t stride)
6057 {
6058  vert_mc_qpel_8x8_msa(src, stride, dest, stride);
6059 }
6060 
6062  ptrdiff_t stride)
6063 {
6064  vert_mc_qpel_16x16_msa(src, stride, dest, stride);
6065 }
6066 
6068  const uint8_t *src, ptrdiff_t stride)
6069 {
6070  vert_mc_qpel_aver_src1_8x8_msa(src, stride, dest, stride);
6071 }
6072 
6074  const uint8_t *src, ptrdiff_t stride)
6075 {
6076  vert_mc_qpel_aver_src1_16x16_msa(src, stride, dest, stride);
6077 }
6078 
6080  const uint8_t *src,
6081  ptrdiff_t stride)
6082 {
6083  vert_mc_qpel_no_rnd_aver_src0_8x8_msa(src, stride, dest, stride);
6084 }
6085 
6087  const uint8_t *src,
6088  ptrdiff_t stride)
6089 {
6090  vert_mc_qpel_no_rnd_aver_src0_16x16_msa(src, stride, dest, stride);
6091 }
6092 
6094  const uint8_t *src, ptrdiff_t stride)
6095 {
6096  vert_mc_qpel_no_rnd_8x8_msa(src, stride, dest, stride);
6097 }
6098 
6100  const uint8_t *src, ptrdiff_t stride)
6101 {
6102  vert_mc_qpel_no_rnd_16x16_msa(src, stride, dest, stride);
6103 }
6104 
6106  const uint8_t *src,
6107  ptrdiff_t stride)
6108 {
6109  vert_mc_qpel_no_rnd_aver_src1_8x8_msa(src, stride, dest, stride);
6110 }
6111 
6113  const uint8_t *src,
6114  ptrdiff_t stride)
6115 {
6116  vert_mc_qpel_no_rnd_aver_src1_16x16_msa(src, stride, dest, stride);
6117 }
6118 
6120  const uint8_t *src,
6121  ptrdiff_t stride)
6122 {
6123  vert_mc_qpel_avg_dst_aver_src0_8x8_msa(src, stride, dest, stride);
6124 }
6125 
6127  const uint8_t *src,
6128  ptrdiff_t stride)
6129 {
6130  vert_mc_qpel_avg_dst_aver_src0_16x16_msa(src, stride, dest, stride);
6131 }
6132 
6134  const uint8_t *src, ptrdiff_t stride)
6135 {
6136  vert_mc_qpel_avg_dst_8x8_msa(src, stride, dest, stride);
6137 }
6138 
6140  const