h264qpel_msa.c
1 /*
2  * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "h264dsp_mips.h"
23 
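/* The macros below implement the H.264 6-tap luma interpolation filter
 * with coefficients (1, -5, 20, 20, -5, 1).
 * AVC_CALC_DPADD_H_6PIX_2COEFF_SH is the second (vertical) pass over
 * 16-bit intermediates: it accumulates the three coefficient pairs,
 * rounds with (x + 512) >> 10 and saturates to the signed 8-bit range
 * before packing.  A scalar sketch of the filter for one sample
 * (illustrative only; clip_uint8 stands for clamping to [0, 255]):
 *     sum      = p0 - 5 * p1 + 20 * p2 + 20 * p3 - 5 * p4 + p5;
 *     half_pel = clip_uint8((sum + 16) >> 5);       one filtering pass
 *     centre   = clip_uint8((sum2 + 512) >> 10);    two passes, sum2 built
 *                                                   from first-pass sums
 */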
24 #define AVC_CALC_DPADD_H_6PIX_2COEFF_SH(in0, in1, in2, in3, in4, in5) \
25 ( { \
26  v4i32 tmp0_m, tmp1_m; \
27  v8i16 out0_m, out1_m, out2_m, out3_m; \
28  v8i16 minus5h_m = __msa_ldi_h(-5); \
29  v8i16 plus20h_m = __msa_ldi_h(20); \
30  \
31  ILVRL_H2_SW(in5, in0, tmp0_m, tmp1_m); \
32  \
33  tmp0_m = __msa_hadd_s_w((v8i16) tmp0_m, (v8i16) tmp0_m); \
34  tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m); \
35  \
36  ILVRL_H2_SH(in1, in4, out0_m, out1_m); \
37  DPADD_SH2_SW(out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m); \
38  ILVRL_H2_SH(in2, in3, out2_m, out3_m); \
39  DPADD_SH2_SW(out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m); \
40  \
41  SRARI_W2_SW(tmp0_m, tmp1_m, 10); \
42  SAT_SW2_SW(tmp0_m, tmp1_m, 7); \
43  out0_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
44  \
45  out0_m; \
46 } )
47 
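/* Shuffle masks for the horizontal pass.  Each 16-byte row pairs pixels
 * that share a filter coefficient (p0 with p5, p1 with p4, p2 with p3) so
 * a single dot product per pair applies {1, -5, 20}.  Rows 0-2 serve the
 * 8-pixel-wide cases, rows 3-5 pack two 4-pixel rows into one vector
 * (indices >= 16 select from the second shuffle source), and the last two
 * rows gather the 16 integer pixels at offset 2 or 3, used when a
 * quarter-pel result is averaged with the nearest full-pel column. */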
48 static const uint8_t luma_mask_arr[16 * 8] = {
49  /* 8 width cases */
50  0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
51  1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
52  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
53 
54  /* 4 width cases */
55  0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
56  1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
57  2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
58 
59  2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25,
60  3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26
61 };
62 
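/* Byte-domain first pass of the 6-tap filter: the six input rows are
 * interleaved as (row0,row5), (row1,row4), (row2,row3) and accumulated
 * with dot products of +1, -5 and +20 into 16-bit sums.  No rounding or
 * shifting happens here; the caller applies it.  The _R_SH variant below
 * does the same for the low halves only and returns a single v8i16. */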
63 #define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5, \
64  out1, out2) \
65 { \
66  v16i8 tmp0_m, tmp1_m; \
67  v16i8 minus5b_m = __msa_ldi_b(-5); \
68  v16i8 plus20b_m = __msa_ldi_b(20); \
69  \
70  ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m); \
71  HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2); \
72  ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m); \
73  DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2); \
74  ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m); \
75  DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2); \
76 }
77 
78 #define AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5) \
79 ( { \
80  v8i16 tmp1_m; \
81  v16i8 tmp0_m, tmp2_m; \
82  v16i8 minus5b_m = __msa_ldi_b(-5); \
83  v16i8 plus20b_m = __msa_ldi_b(20); \
84  \
85  tmp1_m = (v8i16) __msa_ilvr_b((v16i8) vec5, (v16i8) vec0); \
86  tmp1_m = __msa_hadd_s_h((v16i8) tmp1_m, (v16i8) tmp1_m); \
87  \
88  ILVR_B2_SB(vec4, vec1, vec3, vec2, tmp0_m, tmp2_m); \
89  DPADD_SB2_SH(tmp0_m, tmp2_m, minus5b_m, plus20b_m, tmp1_m, tmp1_m); \
90  \
91  tmp1_m; \
92 } )
93 
94 #define AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5) \
95 ( { \
96  v4i32 tmp1_m; \
97  v8i16 tmp2_m, tmp3_m; \
98  v8i16 minus5h_m = __msa_ldi_h(-5); \
99  v8i16 plus20h_m = __msa_ldi_h(20); \
100  \
101  tmp1_m = (v4i32) __msa_ilvr_h((v8i16) vec5, (v8i16) vec0); \
102  tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m); \
103  \
104  ILVR_H2_SH(vec1, vec4, vec2, vec3, tmp2_m, tmp3_m); \
105  DPADD_SH2_SW(tmp2_m, tmp3_m, minus5h_m, plus20h_m, tmp1_m, tmp1_m); \
106  \
107  tmp1_m = __msa_srari_w(tmp1_m, 10); \
108  tmp1_m = __msa_sat_s_w(tmp1_m, 7); \
109  \
110  tmp2_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp1_m); \
111  \
112  tmp2_m; \
113 } )
114 
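/* Horizontal 6-tap pass: the mask vectors (from luma_mask_arr) shuffle the
 * source bytes into coefficient pairs, a horizontal add applies the +1
 * taps and two dot-product accumulations apply the -5 and +20 taps,
 * leaving unrounded 16-bit sums for the caller to shift or to feed into
 * the vertical pass. */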
115 #define AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1, \
116  mask0, mask1, mask2) \
117 ( { \
118  v8i16 hz_out_m; \
119  v16i8 vec0_m, vec1_m, vec2_m; \
120  v16i8 minus5b_m = __msa_ldi_b(-5); \
121  v16i8 plus20b_m = __msa_ldi_b(20); \
122  \
123  vec0_m = __msa_vshf_b((v16i8) mask0, (v16i8) src1, (v16i8) src0); \
124  hz_out_m = __msa_hadd_s_h(vec0_m, vec0_m); \
125  \
126  VSHF_B2_SB(src0, src1, src0, src1, mask1, mask2, vec1_m, vec2_m); \
127  DPADD_SB2_SH(vec1_m, vec2_m, minus5b_m, plus20b_m, hz_out_m, hz_out_m); \
128  \
129  hz_out_m; \
130 } )
131 
132 #define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2) \
133 ( { \
134  v8i16 out0_m; \
135  v16i8 tmp0_m; \
136  v16i8 minus5b = __msa_ldi_b(-5); \
137  v16i8 plus20b = __msa_ldi_b(20); \
138  \
139  tmp0_m = __msa_vshf_b((v16i8) mask0, in1, in0); \
140  out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m); \
141  \
142  tmp0_m = __msa_vshf_b((v16i8) mask1, in1, in0); \
143  out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m); \
144  \
145  tmp0_m = __msa_vshf_b((v16i8) mask2, in1, in0); \
146  out0_m = __msa_dpadd_s_h(out0_m, plus20b, tmp0_m); \
147  \
148  out0_m; \
149 } )
150 
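/* Three-term dot-product accumulators: multiply three vectors of
 * interleaved sample pairs by three packed coefficient-pair vectors and
 * sum the products.  The _SW3_SW form also rounds with (x + 512) >> 10
 * and saturates, finishing a second filtering pass over 16-bit
 * intermediates. */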
151 #define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
152 ( { \
153  v8i16 out0_m; \
154  \
155  out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \
156  out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \
157  out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2); \
158  \
159  out0_m; \
160 } )
161 
162 #define AVC_DOT_SW3_SW(in0, in1, in2, coeff0, coeff1, coeff2) \
163 ( { \
164  v4i32 out0_m; \
165  \
166  out0_m = __msa_dotp_s_w((v8i16) in0, (v8i16) coeff0); \
167  out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in1, (v8i16) coeff1); \
168  out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in2, (v8i16) coeff2); \
169  out0_m = __msa_srari_w(out0_m, 10); \
170  out0_m = __msa_sat_s_w(out0_m, 7); \
171  out0_m; \
172 } )
173 
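/* Centre ("mid") half-pel interpolation, 4 pixels wide: a horizontal
 * 6-tap pass produces 16-bit intermediates, then a vertical 6-tap pass
 * over a sliding window of those rows produces the output with
 * (x + 512) >> 10 rounding.  Four output rows are computed per loop
 * iteration and the window slides down by four rows. */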
174 static void avc_luma_mid_4w_msa(const uint8_t *src, int32_t src_stride,
175  uint8_t *dst, int32_t dst_stride,
176  int32_t height)
177 {
178  uint32_t loop_cnt;
179  v16i8 src0, src1, src2, src3, src4;
180  v16i8 mask0, mask1, mask2;
181  v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
182  v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
183  v8i16 dst0, dst1, dst2, dst3;
184 
185  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
186  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
187  src += (5 * src_stride);
188 
189  XORI_B5_128_SB(src0, src1, src2, src3, src4);
190 
191  hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
192  mask0, mask1, mask2);
193  hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
194  mask0, mask1, mask2);
195 
196  PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
197 
198  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
199 
200  for (loop_cnt = (height >> 2); loop_cnt--;) {
201  LD_SB4(src, src_stride, src0, src1, src2, src3);
202  src += (4 * src_stride);
203 
204  XORI_B4_128_SB(src0, src1, src2, src3);
205 
206  hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
207  mask0, mask1,
208  mask2);
209  hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
210  mask0, mask1,
211  mask2);
212 
213  PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
214 
215  dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
216  hz_out3, hz_out4, hz_out5);
217  dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
218  hz_out4, hz_out5, hz_out6);
219  dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out2, hz_out3, hz_out4,
220  hz_out5, hz_out6, hz_out7);
221  dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out3, hz_out4, hz_out5,
222  hz_out6, hz_out7, hz_out8);
223 
224  PCKEV_B2_SB(dst1, dst0, dst3, dst2, src0, src1);
225  XORI_B2_128_SB(src0, src1);
226 
227  ST4x4_UB(src0, src1, 0, 2, 0, 2, dst, dst_stride);
228 
229  dst += (4 * dst_stride);
230 
231  hz_out0 = hz_out4;
232  hz_out1 = hz_out5;
233  hz_out2 = hz_out6;
234  hz_out3 = hz_out7;
235  hz_out4 = hz_out8;
236  }
237 }
238 
239 static void avc_luma_mid_8w_msa(const uint8_t *src, int32_t src_stride,
240  uint8_t *dst, int32_t dst_stride,
241  int32_t height)
242 {
243  uint32_t loop_cnt;
244  v16i8 src0, src1, src2, src3, src4;
245  v16i8 mask0, mask1, mask2;
246  v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
247  v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
248  v8i16 dst0, dst1, dst2, dst3;
249  v16u8 out0, out1;
250 
251  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
252 
253  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
254  XORI_B5_128_SB(src0, src1, src2, src3, src4);
255  src += (5 * src_stride);
256 
257  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
258  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
259  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
260  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
261  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
262 
263  for (loop_cnt = (height >> 2); loop_cnt--;) {
264  LD_SB4(src, src_stride, src0, src1, src2, src3);
265  XORI_B4_128_SB(src0, src1, src2, src3);
266  src += (4 * src_stride);
267 
268  hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
269  hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
270  hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
271  hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
272  dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
273  hz_out3, hz_out4, hz_out5);
274  dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
275  hz_out4, hz_out5, hz_out6);
276  dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
277  hz_out5, hz_out6, hz_out7);
278  dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
279  hz_out6, hz_out7, hz_out8);
280  out0 = PCKEV_XORI128_UB(dst0, dst1);
281  out1 = PCKEV_XORI128_UB(dst2, dst3);
282  ST8x4_UB(out0, out1, dst, dst_stride);
283 
284  dst += (4 * dst_stride);
285  hz_out3 = hz_out7;
286  hz_out1 = hz_out5;
287  hz_out5 = hz_out4;
288  hz_out4 = hz_out8;
289  hz_out2 = hz_out6;
290  hz_out0 = hz_out5;
291  }
292 }
293 
294 static void avc_luma_mid_16w_msa(const uint8_t *src, int32_t src_stride,
295  uint8_t *dst, int32_t dst_stride,
296  int32_t height)
297 {
298  uint32_t multiple8_cnt;
299 
300  for (multiple8_cnt = 2; multiple8_cnt--;) {
301  avc_luma_mid_8w_msa(src, src_stride, dst, dst_stride, height);
302  src += 8;
303  dst += 8;
304  }
305 }
306 
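/* "midh" quarter-pel: vertical half-pel columns are computed first in the
 * byte domain, then filtered horizontally on the 16-bit results with
 * (x + 512) >> 10 rounding; the outcome is averaged with the vertical
 * half-pel value of the nearer integer column (rounded (x + 16) >> 5),
 * chosen by horiz_offset. */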
307 static void avc_luma_midh_qrt_4w_msa(const uint8_t *src, int32_t src_stride,
308  uint8_t *dst, int32_t dst_stride,
309  int32_t height, uint8_t horiz_offset)
310 {
311  uint32_t row;
312  v16i8 src0, src1, src2, src3, src4, src5, src6;
313  v8i16 vt_res0, vt_res1, vt_res2, vt_res3;
314  v4i32 hz_res0, hz_res1;
315  v8i16 dst0, dst1;
316  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5;
317  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
318  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
319  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
320  v8i16 minus5h = __msa_ldi_h(-5);
321  v8i16 plus20h = __msa_ldi_h(20);
322  v8i16 zeros = { 0 };
323  v16u8 out;
324 
325  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
326  src += (5 * src_stride);
327  XORI_B5_128_SB(src0, src1, src2, src3, src4);
328 
329  for (row = (height >> 1); row--;) {
330  LD_SB2(src, src_stride, src5, src6);
331  src += (2 * src_stride);
332 
333  XORI_B2_128_SB(src5, src6);
334  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
335  vt_res0, vt_res1);
336  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
337  vt_res2, vt_res3);
338  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1,
339  mask0, mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
340  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3,
341  mask0, mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
342  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
343  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
344  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
345  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
346 
347  SRARI_W2_SW(hz_res0, hz_res1, 10);
348  SAT_SW2_SW(hz_res0, hz_res1, 7);
349 
350  dst0 = __msa_srari_h(shf_vec2, 5);
351  dst1 = __msa_srari_h(shf_vec5, 5);
352 
353  SAT_SH2_SH(dst0, dst1, 7);
354 
355  if (horiz_offset) {
356  dst0 = __msa_ilvod_h(zeros, dst0);
357  dst1 = __msa_ilvod_h(zeros, dst1);
358  } else {
359  ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
360  }
361 
362  hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
363  hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
364  dst0 = __msa_pckev_h((v8i16) hz_res1, (v8i16) hz_res0);
365 
366  out = PCKEV_XORI128_UB(dst0, dst0);
367  ST4x2_UB(out, dst, dst_stride);
368 
369  dst += (2 * dst_stride);
370 
371  src0 = src2;
372  src1 = src3;
373  src2 = src4;
374  src3 = src5;
375  src4 = src6;
376  }
377 }
378 
379 static void avc_luma_midh_qrt_8w_msa(const uint8_t *src, int32_t src_stride,
380  uint8_t *dst, int32_t dst_stride,
381  int32_t height, uint8_t horiz_offset)
382 {
383  uint32_t multiple8_cnt;
384 
385  for (multiple8_cnt = 2; multiple8_cnt--;) {
386  avc_luma_midh_qrt_4w_msa(src, src_stride, dst, dst_stride, height,
387  horiz_offset);
388 
389  src += 4;
390  dst += 4;
391  }
392 }
393 
394 static void avc_luma_midh_qrt_16w_msa(const uint8_t *src, int32_t src_stride,
395  uint8_t *dst, int32_t dst_stride,
396  int32_t height, uint8_t horiz_offset)
397 {
398  uint32_t multiple8_cnt;
399 
400  for (multiple8_cnt = 4; multiple8_cnt--;) {
401  avc_luma_midh_qrt_4w_msa(src, src_stride, dst, dst_stride, height,
402  horiz_offset);
403 
404  src += 4;
405  dst += 4;
406  }
407 }
408 
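/* Diagonal quarter-pel ("hv"): a horizontal half-pel row (from src_x) and
 * a vertical half-pel column (from src_y) are filtered and rounded with
 * (x + 16) >> 5 separately, then averaged together with (a + b + 1) >> 1. */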
409 static void avc_luma_hv_qrt_4w_msa(const uint8_t *src_x, const uint8_t *src_y,
410  int32_t src_stride, uint8_t *dst,
411  int32_t dst_stride, int32_t height)
412 {
413  uint32_t loop_cnt;
414  v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
415  v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
416  v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
417  v16i8 mask0, mask1, mask2;
418  v8i16 hz_out0, hz_out1, vert_out0, vert_out1;
419  v8i16 out0, out1;
420  v16u8 out;
421 
422  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
423 
424  LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
425  src_y += (5 * src_stride);
426 
427  src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
428  src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
429  src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
430  src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
431 
432  XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
433 
434  for (loop_cnt = (height >> 2); loop_cnt--;) {
435  LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
436  src_x += (4 * src_stride);
437 
438  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
439 
440  hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz0,
441  src_hz1, mask0,
442  mask1, mask2);
443  hz_out1 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz2,
444  src_hz3, mask0,
445  mask1, mask2);
446 
447  SRARI_H2_SH(hz_out0, hz_out1, 5);
448  SAT_SH2_SH(hz_out0, hz_out1, 7);
449 
450  LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
451  src_y += (4 * src_stride);
452 
453  src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
454  src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
455  src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
456  src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
457 
458  XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
459 
460  /* filter calc */
461  vert_out0 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt0, src_vt1,
462  src_vt2, src_vt3,
463  src_vt4, src_vt5);
464  vert_out1 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt2, src_vt3,
465  src_vt4, src_vt5,
466  src_vt6, src_vt7);
467 
468  SRARI_H2_SH(vert_out0, vert_out1, 5);
469  SAT_SH2_SH(vert_out0, vert_out1, 7);
470 
471  out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
472  out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
473 
474  SAT_SH2_SH(out0, out1, 7);
475  out = PCKEV_XORI128_UB(out0, out1);
476  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
477  dst += (4 * dst_stride);
478 
479  src_vt3 = src_vt7;
480  src_vt1 = src_vt5;
481  src_vt0 = src_vt4;
482  src_vt4 = src_vt8;
483  src_vt2 = src_vt6;
484  }
485 }
486 
487 static void avc_luma_hv_qrt_8w_msa(const uint8_t *src_x, const uint8_t *src_y,
488  int32_t src_stride, uint8_t *dst,
489  int32_t dst_stride, int32_t height)
490 {
491  uint32_t loop_cnt;
492  v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
493  v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
494  v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
495  v16i8 mask0, mask1, mask2;
496  v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
497  v8i16 vert_out0, vert_out1, vert_out2, vert_out3;
498  v8i16 out0, out1, out2, out3;
499  v16u8 tmp0, tmp1;
500 
501  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
502  LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
503  src_y += (5 * src_stride);
504 
505  src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1);
506  src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2);
507  src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3);
508  src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4);
509 
510  XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
511 
512  for (loop_cnt = (height >> 2); loop_cnt--;) {
513  LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
514  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
515  src_x += (4 * src_stride);
516 
517  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
518  hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
519  hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
520  hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
521 
522  SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
523  SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
524 
525  LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
526  src_y += (4 * src_stride);
527 
528  src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5);
529  src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6);
530  src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7);
531  src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8);
532 
533  XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
534 
535  /* filter calc */
536  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt0, src_vt1, src_vt2, src_vt3,
537  src_vt4, src_vt5, vert_out0, vert_out1);
538  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt2, src_vt3, src_vt4, src_vt5,
539  src_vt6, src_vt7, vert_out2, vert_out3);
540 
541  SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5);
542  SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7);
543 
544  out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
545  out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
546  out2 = __msa_srari_h((hz_out2 + vert_out2), 1);
547  out3 = __msa_srari_h((hz_out3 + vert_out3), 1);
548 
549  SAT_SH4_SH(out0, out1, out2, out3, 7);
550  tmp0 = PCKEV_XORI128_UB(out0, out1);
551  tmp1 = PCKEV_XORI128_UB(out2, out3);
552  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
553 
554  dst += (4 * dst_stride);
555  src_vt3 = src_vt7;
556  src_vt1 = src_vt5;
557  src_vt5 = src_vt4;
558  src_vt4 = src_vt8;
559  src_vt2 = src_vt6;
560  src_vt0 = src_vt5;
561  }
562 }
563 
564 static void avc_luma_hv_qrt_16w_msa(const uint8_t *src_x, const uint8_t *src_y,
565  int32_t src_stride, uint8_t *dst,
566  int32_t dst_stride, int32_t height)
567 {
568  uint32_t multiple8_cnt;
569 
570  for (multiple8_cnt = 2; multiple8_cnt--;) {
571  avc_luma_hv_qrt_8w_msa(src_x, src_y, src_stride, dst, dst_stride,
572  height);
573 
574  src_x += 8;
575  src_y += 8;
576  dst += 8;
577  }
578 }
579 
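/* The *_and_aver_dst_* functions below repeat the interpolation patterns
 * above, but additionally average the interpolated result with the bytes
 * already stored at dst ((a + b + 1) >> 1), as needed for the averaging
 * ("avg") motion-compensation entry points. */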
580 static void avc_luma_hz_and_aver_dst_4x4_msa(const uint8_t *src,
581  int32_t src_stride,
582  uint8_t *dst, int32_t dst_stride)
583 {
584  v16i8 src0, src1, src2, src3;
585  v16u8 dst0, dst1, dst2, dst3, res;
586  v8i16 res0, res1;
587  v16i8 mask0, mask1, mask2;
588  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
589  v16i8 minus5b = __msa_ldi_b(-5);
590  v16i8 plus20b = __msa_ldi_b(20);
591 
592  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
593  LD_SB4(src, src_stride, src0, src1, src2, src3);
594 
595  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
596  XORI_B4_128_SB(src0, src1, src2, src3);
597  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
598  HADD_SB2_SH(vec0, vec1, res0, res1);
599  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
600  DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
601  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
602  DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
603  SRARI_H2_SH(res0, res1, 5);
604  SAT_SH2_SH(res0, res1, 7);
605  res = PCKEV_XORI128_UB(res0, res1);
606  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
607 
608  dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
609  res = __msa_aver_u_b(res, dst0);
610 
611  ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
612 }
613 
614 static void avc_luma_hz_and_aver_dst_8x8_msa(const uint8_t *src,
615  int32_t src_stride,
616  uint8_t *dst, int32_t dst_stride)
617 {
618  uint32_t loop_cnt;
619  v16i8 src0, src1, src2, src3;
620  v16u8 dst0, dst1, dst2, dst3;
621  v8i16 res0, res1, res2, res3;
622  v16i8 mask0, mask1, mask2;
623  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
624  v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
625  v16i8 minus5b = __msa_ldi_b(-5);
626  v16i8 plus20b = __msa_ldi_b(20);
627 
628  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
629 
630  for (loop_cnt = 2; loop_cnt--;) {
631  LD_SB4(src, src_stride, src0, src1, src2, src3);
632  src += (4 * src_stride);
633 
634  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
635 
636  XORI_B4_128_SB(src0, src1, src2, src3);
637  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
638  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
639  HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
640  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
641  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
642  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
643  res0, res1, res2, res3);
644  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
645  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
646  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
647  plus20b, res0, res1, res2, res3);
648  SRARI_H4_SH(res0, res1, res2, res3, 5);
649  SAT_SH4_SH(res0, res1, res2, res3, 7);
650  ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
651  CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1,
652  dst, dst_stride);
653 
654  dst += (4 * dst_stride);
655  }
656 }
657 
658 static void avc_luma_hz_and_aver_dst_16x16_msa(const uint8_t *src,
659  int32_t src_stride,
660  uint8_t *dst, int32_t dst_stride)
661 {
662  uint32_t loop_cnt;
663  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
664  v16u8 dst0, dst1, dst2, dst3;
665  v16i8 mask0, mask1, mask2;
666  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
667  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
668  v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
669  v16i8 minus5b = __msa_ldi_b(-5);
670  v16i8 plus20b = __msa_ldi_b(20);
671 
672  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
673 
674  for (loop_cnt = 4; loop_cnt--;) {
675  LD_SB2(src, 8, src0, src1);
676  src += src_stride;
677  LD_SB2(src, 8, src2, src3);
678  src += src_stride;
679 
680  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
681 
682  XORI_B4_128_SB(src0, src1, src2, src3);
683  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
684  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
685  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
686  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
687  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
688  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
689  HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
690  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
691  minus5b, res0, res1, res2, res3);
692  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
693  plus20b, res0, res1, res2, res3);
694  LD_SB2(src, 8, src4, src5);
695  src += src_stride;
696  LD_SB2(src, 8, src6, src7);
697  src += src_stride;
698  XORI_B4_128_SB(src4, src5, src6, src7);
699  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
700  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
701  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
702  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
703  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
704  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
705  HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
706  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
707  minus5b, res4, res5, res6, res7);
708  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
709  plus20b, res4, res5, res6, res7);
710  SRARI_H4_SH(res0, res1, res2, res3, 5);
711  SRARI_H4_SH(res4, res5, res6, res7, 5);
712  SAT_SH4_SH(res0, res1, res2, res3, 7);
713  SAT_SH4_SH(res4, res5, res6, res7, 7);
714  PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
715  vec0, vec1, vec2, vec3);
716  XORI_B4_128_SB(vec0, vec1, vec2, vec3);
717  AVER_UB4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
718  dst0, dst1, dst2, dst3);
719  ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
720  dst += (4 * dst_stride);
721  }
722 }
723 
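/* Horizontal quarter-pel with destination averaging: the half-pel result
 * is first averaged with the nearest integer pixel column (the source
 * slid by 2 or 3 bytes, selected by hor_offset) and then averaged with
 * the existing dst pixels. */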
724 static void avc_luma_hz_qrt_and_aver_dst_4x4_msa(const uint8_t *src,
725  int32_t src_stride,
726  uint8_t *dst,
727  int32_t dst_stride,
728  uint8_t hor_offset)
729 {
730  uint8_t slide;
731  v16i8 src0, src1, src2, src3;
732  v16u8 dst0, dst1, dst2, dst3;
733  v16i8 mask0, mask1, mask2;
734  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
735  v8i16 out0, out1;
736  v16i8 minus5b = __msa_ldi_b(-5);
737  v16i8 plus20b = __msa_ldi_b(20);
738  v16u8 res0, res1;
739 
740  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
741 
742  if (hor_offset) {
743  slide = 3;
744  } else {
745  slide = 2;
746  }
747 
748  LD_SB4(src, src_stride, src0, src1, src2, src3);
749  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
750 
751  XORI_B4_128_SB(src0, src1, src2, src3);
752  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
753  HADD_SB2_SH(vec0, vec1, out0, out1);
754  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
755  DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
756  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
757  DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
758  SRARI_H2_SH(out0, out1, 5);
759  SAT_SH2_SH(out0, out1, 7);
760 
761  PCKEV_B2_UB(out0, out0, out1, out1, res0, res1);
762 
763  src0 = __msa_sld_b(src0, src0, slide);
764  src1 = __msa_sld_b(src1, src1, slide);
765  src2 = __msa_sld_b(src2, src2, slide);
766  src3 = __msa_sld_b(src3, src3, slide);
767  src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
768  src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
769  res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src0);
770  res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src1);
771 
772  XORI_B2_128_UB(res0, res1);
773 
774  dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
775  dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
776 
777  AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
778 
779  ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
780 }
781 
782 static void avc_luma_hz_qrt_and_aver_dst_8x8_msa(const uint8_t *src,
783  int32_t src_stride,
784  uint8_t *dst,
785  int32_t dst_stride,
786  uint8_t hor_offset)
787 {
788  uint8_t slide;
789  uint32_t loop_cnt;
790  v16i8 src0, src1, src2, src3;
791  v16i8 mask0, mask1, mask2;
792  v16u8 dst0, dst1, dst2, dst3;
793  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
794  v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
795  v8i16 out0, out1, out2, out3;
796  v16i8 minus5b = __msa_ldi_b(-5);
797  v16i8 plus20b = __msa_ldi_b(20);
798  v16i8 res0, res1, res2, res3;
799 
800  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
801 
802  if (hor_offset) {
803  slide = 3;
804  } else {
805  slide = 2;
806  }
807 
808  for (loop_cnt = 2; loop_cnt--;) {
809  LD_SB4(src, src_stride, src0, src1, src2, src3);
810  src += (4 * src_stride);
811 
812  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
813 
814  XORI_B4_128_SB(src0, src1, src2, src3);
815  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
816  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
817  HADD_SB4_SH(vec0, vec1, vec2, vec3, out0, out1, out2, out3);
818  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
819  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
820  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
821  out0, out1, out2, out3);
822  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
823  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
824  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
825  plus20b, out0, out1, out2, out3);
826 
827  src0 = __msa_sld_b(src0, src0, slide);
828  src1 = __msa_sld_b(src1, src1, slide);
829  src2 = __msa_sld_b(src2, src2, slide);
830  src3 = __msa_sld_b(src3, src3, slide);
831 
832  SRARI_H4_SH(out0, out1, out2, out3, 5);
833  SAT_SH4_SH(out0, out1, out2, out3, 7);
834 
835  PCKEV_B4_SB(out0, out0, out1, out1, out2, out2, out3, out3,
836  res0, res1, res2, res3);
837 
838  res0 = __msa_aver_s_b(res0, src0);
839  res1 = __msa_aver_s_b(res1, src1);
840  res2 = __msa_aver_s_b(res2, src2);
841  res3 = __msa_aver_s_b(res3, src3);
842 
843  XORI_B4_128_SB(res0, res1, res2, res3);
844  AVER_ST8x4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
845  dst, dst_stride);
846 
847  dst += (4 * dst_stride);
848  }
849 }
850 
851 static void avc_luma_hz_qrt_and_aver_dst_16x16_msa(const uint8_t *src,
852  int32_t src_stride,
853  uint8_t *dst,
854  int32_t dst_stride,
855  uint8_t hor_offset)
856 {
857  uint32_t loop_cnt;
858  v16i8 out0, out1;
859  v16i8 src0, src1, src2, src3;
860  v16i8 mask0, mask1, mask2, vshf;
861  v16u8 dst0, dst1;
862  v8i16 res0, res1, res2, res3;
863  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
864  v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
865  v16i8 minus5b = __msa_ldi_b(-5);
866  v16i8 plus20b = __msa_ldi_b(20);
867 
868  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
869 
870  if (hor_offset) {
871  vshf = LD_SB(&luma_mask_arr[16 + 96]);
872  } else {
873  vshf = LD_SB(&luma_mask_arr[96]);
874  }
875 
876  for (loop_cnt = 8; loop_cnt--;) {
877  LD_SB2(src, 8, src0, src1);
878  src += src_stride;
879  LD_SB2(src, 8, src2, src3);
880  src += src_stride;
881 
882  LD_UB2(dst, dst_stride, dst0, dst1);
883 
884  XORI_B4_128_SB(src0, src1, src2, src3);
885  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
886  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
887  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
888  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
889  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
890  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
891  HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
892  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
893  minus5b, res0, res1, res2, res3);
894  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
895  plus20b, res0, res1, res2, res3);
896  VSHF_B2_SB(src0, src1, src2, src3, vshf, vshf, src0, src2);
897  SRARI_H4_SH(res0, res1, res2, res3, 5);
898  SAT_SH4_SH(res0, res1, res2, res3, 7);
899  PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
900 
901  out0 = __msa_aver_s_b(out0, src0);
902  out1 = __msa_aver_s_b(out1, src2);
903 
904  XORI_B2_128_SB(out0, out1);
905  AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
906  ST_UB2(dst0, dst1, dst, dst_stride);
907  dst += (2 * dst_stride);
908  }
909 }
910 
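/* Vertical half-pel with destination averaging: filt_const0/1/2 pack the
 * coefficient pairs {1,-5}, {20,20}, {-5,1} as halfwords (0xfb01, 0x1414,
 * 0x01fb), so each dot product over byte-interleaved rows applies two taps
 * at once; results are rounded with (x + 16) >> 5 and averaged with dst. */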
911 static void avc_luma_vt_and_aver_dst_4x4_msa(const uint8_t *src,
912  int32_t src_stride,
913  uint8_t *dst, int32_t dst_stride)
914 {
915  int16_t filt_const0 = 0xfb01;
916  int16_t filt_const1 = 0x1414;
917  int16_t filt_const2 = 0x1fb;
918  v16u8 dst0, dst1, dst2, dst3;
919  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
920  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
921  v16i8 src87_r, src2110, src4332, src6554, src8776;
922  v8i16 out10, out32;
923  v16i8 filt0, filt1, filt2;
924  v16u8 res;
925 
926  filt0 = (v16i8) __msa_fill_h(filt_const0);
927  filt1 = (v16i8) __msa_fill_h(filt_const1);
928  filt2 = (v16i8) __msa_fill_h(filt_const2);
929 
930  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
931  src += (5 * src_stride);
932 
933  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
934  src10_r, src21_r, src32_r, src43_r);
935  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
936  XORI_B2_128_SB(src2110, src4332);
937  LD_SB4(src, src_stride, src5, src6, src7, src8);
938  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
939  src54_r, src65_r, src76_r, src87_r);
940  ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
941  XORI_B2_128_SB(src6554, src8776);
942  out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
943  out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
944  SRARI_H2_SH(out10, out32, 5);
945  SAT_SH2_SH(out10, out32, 7);
946  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
947  res = PCKEV_XORI128_UB(out10, out32);
948 
949  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
950 
951  dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
952  dst0 = __msa_aver_u_b(res, dst0);
953 
954  ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
955 }
956 
957 static void avc_luma_vt_and_aver_dst_8x8_msa(const uint8_t *src,
958  int32_t src_stride,
959  uint8_t *dst, int32_t dst_stride)
960 {
961  int32_t loop_cnt;
962  int16_t filt_const0 = 0xfb01;
963  int16_t filt_const1 = 0x1414;
964  int16_t filt_const2 = 0x1fb;
965  v16u8 dst0, dst1, dst2, dst3;
966  v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
967  v16i8 src10_r, src32_r, src76_r, src98_r;
968  v16i8 src21_r, src43_r, src87_r, src109_r;
969  v8i16 out0, out1, out2, out3;
970  v16i8 filt0, filt1, filt2;
971 
972  filt0 = (v16i8) __msa_fill_h(filt_const0);
973  filt1 = (v16i8) __msa_fill_h(filt_const1);
974  filt2 = (v16i8) __msa_fill_h(filt_const2);
975 
976  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
977  src += (5 * src_stride);
978 
979  XORI_B5_128_SB(src0, src1, src2, src3, src4);
980  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
981  src10_r, src21_r, src32_r, src43_r);
982 
983  for (loop_cnt = 2; loop_cnt--;) {
984  LD_SB4(src, src_stride, src7, src8, src9, src10);
985  src += (4 * src_stride);
986 
987  XORI_B4_128_SB(src7, src8, src9, src10);
988  ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
989  src76_r, src87_r, src98_r, src109_r);
990  out0 = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
991  out1 = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
992  out2 = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
993  out3 = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
994  SRARI_H4_SH(out0, out1, out2, out3, 5);
995  SAT_SH4_SH(out0, out1, out2, out3, 7);
996  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
997  ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
998  CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
999  dst, dst_stride);
1000  dst += (4 * dst_stride);
1001 
1002  src10_r = src76_r;
1003  src32_r = src98_r;
1004  src21_r = src87_r;
1005  src43_r = src109_r;
1006  src4 = src10;
1007  }
1008 }
1009 
1010 static void avc_luma_vt_and_aver_dst_16x16_msa(const uint8_t *src,
1011  int32_t src_stride,
1012  uint8_t *dst, int32_t dst_stride)
1013 {
1014  int32_t loop_cnt;
1015  int16_t filt_const0 = 0xfb01;
1016  int16_t filt_const1 = 0x1414;
1017  int16_t filt_const2 = 0x1fb;
1018  v16u8 dst0, dst1, dst2, dst3;
1019  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1020  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1021  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
1022  v16i8 src65_l, src87_l;
1023  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1024  v16i8 filt0, filt1, filt2;
1025  v16u8 res0, res1, res2, res3;
1026 
1027  filt0 = (v16i8) __msa_fill_h(filt_const0);
1028  filt1 = (v16i8) __msa_fill_h(filt_const1);
1029  filt2 = (v16i8) __msa_fill_h(filt_const2);
1030 
1031  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1032  src += (5 * src_stride);
1033 
1034  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1035  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1036  src10_r, src21_r, src32_r, src43_r);
1037  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1038  src10_l, src21_l, src32_l, src43_l);
1039 
1040  for (loop_cnt = 4; loop_cnt--;) {
1041  LD_SB4(src, src_stride, src5, src6, src7, src8);
1042  src += (4 * src_stride);
1043 
1044  XORI_B4_128_SB(src5, src6, src7, src8);
1045  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
1046  src54_r, src65_r, src76_r, src87_r);
1047  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
1048  src54_l, src65_l, src76_l, src87_l);
1049  out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1050  out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1051  out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1052  out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1053  out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
1054  out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
1055  out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
1056  out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
1057  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1058  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
1059  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1060  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1061  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1062  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1063  out3_r, res0, res1, res2, res3);
1064  XORI_B4_128_UB(res0, res1, res2, res3);
1065  AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
1066  res0, res1, res2, res3);
1067  ST_UB4(res0, res1, res2, res3, dst, dst_stride);
1068  dst += (4 * dst_stride);
1069 
1070  src10_r = src54_r;
1071  src32_r = src76_r;
1072  src21_r = src65_r;
1073  src43_r = src87_r;
1074  src10_l = src54_l;
1075  src32_l = src76_l;
1076  src21_l = src65_l;
1077  src43_l = src87_l;
1078  src4 = src8;
1079  }
1080 }
1081 
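/* Vertical quarter-pel with destination averaging: as above, but the
 * half-pel result is first averaged with the nearer integer-pel row
 * (selected by ver_offset) before the final average with dst. */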
1082 static void avc_luma_vt_qrt_and_aver_dst_4x4_msa(const uint8_t *src,
1083  int32_t src_stride,
1084  uint8_t *dst,
1085  int32_t dst_stride,
1086  uint8_t ver_offset)
1087 {
1088  int16_t filt_const0 = 0xfb01;
1089  int16_t filt_const1 = 0x1414;
1090  int16_t filt_const2 = 0x1fb;
1091  v16u8 dst0, dst1, dst2, dst3;
1092  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1093  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1094  v16i8 src87_r, src2110, src4332, src6554, src8776;
1095  v8i16 out10, out32;
1096  v16i8 filt0, filt1, filt2;
1097  v16u8 res;
1098 
1099  filt0 = (v16i8) __msa_fill_h(filt_const0);
1100  filt1 = (v16i8) __msa_fill_h(filt_const1);
1101  filt2 = (v16i8) __msa_fill_h(filt_const2);
1102 
1103  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1104  src += (5 * src_stride);
1105 
1106  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1107  src10_r, src21_r, src32_r, src43_r);
1108  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1109  XORI_B2_128_SB(src2110, src4332);
1110  LD_SB4(src, src_stride, src5, src6, src7, src8);
1111  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
1112  src54_r, src65_r, src76_r, src87_r);
1113  ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
1114  XORI_B2_128_SB(src6554, src8776);
1115  out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
1116  out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
1117  SRARI_H2_SH(out10, out32, 5);
1118  SAT_SH2_SH(out10, out32, 7);
1119  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1120  res = PCKEV_XORI128_UB(out10, out32);
1121 
1122  if (ver_offset) {
1123  src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
1124  src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
1125  } else {
1126  src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1127  src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
1128  }
1129 
1130  src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
1131  res = __msa_aver_u_b(res, (v16u8) src32_r);
1132 
1133  ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1134 
1135  dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
1136  dst0 = __msa_aver_u_b(res, dst0);
1137 
1138  ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
1139 }
1140 
1141 static void avc_luma_vt_qrt_and_aver_dst_8x8_msa(const uint8_t *src,
1142  int32_t src_stride,
1143  uint8_t *dst,
1144  int32_t dst_stride,
1145  uint8_t ver_offset)
1146 {
1147  int32_t loop_cnt;
1148  int16_t filt_const0 = 0xfb01;
1149  int16_t filt_const1 = 0x1414;
1150  int16_t filt_const2 = 0x1fb;
1151  v16u8 dst0, dst1, dst2, dst3;
1152  v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
1153  v16i8 src10_r, src32_r, src76_r, src98_r;
1154  v16i8 src21_r, src43_r, src87_r, src109_r;
1155  v8i16 out0_r, out1_r, out2_r, out3_r;
1156  v16i8 res0, res1;
1157  v16u8 vec0, vec1;
1158  v16i8 filt0, filt1, filt2;
1159 
1160  filt0 = (v16i8) __msa_fill_h(filt_const0);
1161  filt1 = (v16i8) __msa_fill_h(filt_const1);
1162  filt2 = (v16i8) __msa_fill_h(filt_const2);
1163 
1164  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1165  src += (5 * src_stride);
1166 
1167  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1168  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1169  src10_r, src21_r, src32_r, src43_r);
1170 
1171  for (loop_cnt = 2; loop_cnt--;) {
1172  LD_SB4(src, src_stride, src7, src8, src9, src10);
1173  src += (4 * src_stride);
1174 
1175  XORI_B4_128_SB(src7, src8, src9, src10);
1176  ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
1177  src76_r, src87_r, src98_r, src109_r);
1178  out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
1179  out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
1180  out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
1181  out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
1182  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1183  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1184  PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, res0, res1);
1185 
1186  if (ver_offset) {
1187  PCKEV_D2_SB(src4, src3, src8, src7, src10_r, src32_r);
1188  } else {
1189  PCKEV_D2_SB(src3, src2, src7, src4, src10_r, src32_r);
1190  }
1191 
1192  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1193  ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1194 
1195  vec0 = (v16u8) __msa_aver_s_b(res0, src10_r);
1196  vec1 = (v16u8) __msa_aver_s_b(res1, src32_r);
1197 
1198  XORI_B2_128_UB(vec0, vec1);
1199  AVER_UB2_UB(vec0, dst0, vec1, dst1, vec0, vec1);
1200  ST8x4_UB(vec0, vec1, dst, dst_stride);
1201  dst += (4 * dst_stride);
1202 
1203  src10_r = src76_r;
1204  src32_r = src98_r;
1205  src21_r = src87_r;
1206  src43_r = src109_r;
1207  src2 = src8;
1208  src3 = src9;
1209  src4 = src10;
1210  }
1211 }
1212 
1213 static void avc_luma_vt_qrt_and_aver_dst_16x16_msa(const uint8_t *src,
1214  int32_t src_stride,
1215  uint8_t *dst,
1216  int32_t dst_stride,
1217  uint8_t ver_offset)
1218 {
1219  int32_t loop_cnt;
1220  int16_t filt_const0 = 0xfb01;
1221  int16_t filt_const1 = 0x1414;
1222  int16_t filt_const2 = 0x1fb;
1223  v16u8 dst0, dst1, dst2, dst3;
1224  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1225  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1226  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
1227  v16i8 src65_l, src87_l;
1228  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1229  v16i8 out0, out1, out2, out3;
1230  v16i8 filt0, filt1, filt2;
1231  v16u8 res0, res1, res2, res3;
1232 
1233  filt0 = (v16i8) __msa_fill_h(filt_const0);
1234  filt1 = (v16i8) __msa_fill_h(filt_const1);
1235  filt2 = (v16i8) __msa_fill_h(filt_const2);
1236 
1237  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1238  src += (5 * src_stride);
1239 
1240  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1241  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1242  src10_r, src21_r, src32_r, src43_r);
1243  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1244  src10_l, src21_l, src32_l, src43_l);
1245 
1246  for (loop_cnt = 4; loop_cnt--;) {
1247  LD_SB4(src, src_stride, src5, src6, src7, src8);
1248  src += (4 * src_stride);
1249 
1250  XORI_B4_128_SB(src5, src6, src7, src8);
1251  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
1252  src54_r, src65_r, src76_r, src87_r);
1253  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
1254  src54_l, src65_l, src76_l, src87_l);
1255  out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1256  out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1257  out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1258  out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1259  out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
1260  out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
1261  out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
1262  out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
1263  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1264  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
1265  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1266  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1267  PCKEV_B4_SB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1268  out3_r, out0, out1, out2, out3);
1269  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1270 
1271  if (ver_offset) {
1272  res0 = (v16u8) __msa_aver_s_b(out0, src3);
1273  res1 = (v16u8) __msa_aver_s_b(out1, src4);
1274  res2 = (v16u8) __msa_aver_s_b(out2, src5);
1275  res3 = (v16u8) __msa_aver_s_b(out3, src6);
1276  } else {
1277  res0 = (v16u8) __msa_aver_s_b(out0, src2);
1278  res1 = (v16u8) __msa_aver_s_b(out1, src3);
1279  res2 = (v16u8) __msa_aver_s_b(out2, src4);
1280  res3 = (v16u8) __msa_aver_s_b(out3, src5);
1281  }
1282 
1283  XORI_B4_128_UB(res0, res1, res2, res3);
1284  AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
1285  dst0, dst1, dst2, dst3);
1286  ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
1287  dst += (4 * dst_stride);
1288 
1289  src10_r = src54_r;
1290  src32_r = src76_r;
1291  src21_r = src65_r;
1292  src43_r = src87_r;
1293  src10_l = src54_l;
1294  src32_l = src76_l;
1295  src21_l = src65_l;
1296  src43_l = src87_l;
1297  src2 = src6;
1298  src3 = src7;
1299  src4 = src8;
1300  }
1301 }
1302 
1303 static void avc_luma_mid_and_aver_dst_4x4_msa(const uint8_t *src,
1304  int32_t src_stride,
1305  uint8_t *dst, int32_t dst_stride)
1306 {
1307  v16i8 src0, src1, src2, src3, src4;
1308  v16i8 mask0, mask1, mask2;
1309  v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1310  v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
1311  v8i16 res0, res1, res2, res3;
1312  v16u8 dst0, dst1, dst2, dst3;
1313  v16u8 tmp0, tmp1, tmp2, tmp3;
1314 
1315  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1316  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1317  src += (5 * src_stride);
1318 
1319  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1320 
1321  hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
1322  mask0, mask1, mask2);
1323  hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
1324  mask0, mask1, mask2);
1325 
1326  PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
1327 
1328  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1329 
1330  LD_SB4(src, src_stride, src0, src1, src2, src3);
1331  XORI_B4_128_SB(src0, src1, src2, src3);
1332 
1333  hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
1334  mask0, mask1, mask2);
1335  hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
1336  mask0, mask1, mask2);
1337 
1338  PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
1339 
1340  res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
1341  hz_out3, hz_out4, hz_out5);
1342  res1 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
1343  hz_out4, hz_out5, hz_out6);
1344  res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out2, hz_out3, hz_out4,
1345  hz_out5, hz_out6, hz_out7);
1346  res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out3, hz_out4, hz_out5,
1347  hz_out6, hz_out7, hz_out8);
1348  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1349  tmp0 = PCKEV_XORI128_UB(res0, res1);
1350  tmp1 = PCKEV_XORI128_UB(res2, res3);
1351  PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2, tmp3);
1352  AVER_UB2_UB(tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
1353 
1354  ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
1355 }
1356 
1357 static void avc_luma_mid_and_aver_dst_8w_msa(const uint8_t *src,
1358  int32_t src_stride,
1359  uint8_t *dst, int32_t dst_stride,
1360  int32_t height)
1361 {
1362  uint32_t loop_cnt;
1363  v16i8 src0, src1, src2, src3, src4;
1364  v16i8 mask0, mask1, mask2;
1365  v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1366  v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
1367  v16u8 dst0, dst1, dst2, dst3;
1368  v8i16 res0, res1, res2, res3;
1369 
1370  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1371 
1372  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1373  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1374  src += (5 * src_stride);
1375 
1376  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1377  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1378  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1379  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1380  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1381 
1382  for (loop_cnt = (height >> 2); loop_cnt--;) {
1383  LD_SB4(src, src_stride, src0, src1, src2, src3);
1384  XORI_B4_128_SB(src0, src1, src2, src3);
1385  src += (4 * src_stride);
1386 
1387  hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1388  hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1389  hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1390  hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1391 
1392  res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
1393  hz_out3, hz_out4, hz_out5);
1394  res1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
1395  hz_out4, hz_out5, hz_out6);
1396  res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
1397  hz_out5, hz_out6, hz_out7);
1398  res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
1399  hz_out6, hz_out7, hz_out8);
1400  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1401  ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1402  CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1,
1403  dst, dst_stride);
1404 
1405  dst += (4 * dst_stride);
1406  hz_out3 = hz_out7;
1407  hz_out1 = hz_out5;
1408  hz_out5 = hz_out4;
1409  hz_out4 = hz_out8;
1410  hz_out2 = hz_out6;
1411  hz_out0 = hz_out5;
1412  }
1413 }
1414 
1415 static void avc_luma_mid_and_aver_dst_16x16_msa(const uint8_t *src,
1416  int32_t src_stride,
1417  uint8_t *dst,
1418  int32_t dst_stride)
1419 {
1420  avc_luma_mid_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, 16);
1421  avc_luma_mid_and_aver_dst_8w_msa(src + 8, src_stride, dst + 8, dst_stride,
1422  16);
1423 }
1424 
1425 static void avc_luma_midh_qrt_and_aver_dst_4w_msa(const uint8_t *src,
1426  int32_t src_stride,
1427  uint8_t *dst,
1428  int32_t dst_stride,
1429  int32_t height,
1430  uint8_t horiz_offset)
1431 {
1432  uint32_t row;
1433  v16i8 src0, src1, src2, src3, src4, src5, src6;
1434  v16u8 dst0, dst1, res;
1435  v8i16 vt_res0, vt_res1, vt_res2, vt_res3;
1436  v4i32 hz_res0, hz_res1;
1437  v8i16 res0, res1;
1438  v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5;
1439  v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
1440  v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
1441  v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
1442  v8i16 minus5h = __msa_ldi_h(-5);
1443  v8i16 plus20h = __msa_ldi_h(20);
1444  v8i16 zeros = { 0 };
1445 
1446  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1447  src += (5 * src_stride);
1448 
1449  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1450 
1451  for (row = (height >> 1); row--;) {
1452  LD_SB2(src, src_stride, src5, src6);
1453  src += (2 * src_stride);
1454 
1455  XORI_B2_128_SB(src5, src6);
1456  LD_UB2(dst, dst_stride, dst0, dst1);
1457 
1458  dst0 = (v16u8) __msa_ilvr_w((v4i32) dst1, (v4i32) dst0);
1459 
1460  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
1461  vt_res0, vt_res1);
1462  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
1463  vt_res2, vt_res3);
1464  VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1,
1465  mask0, mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
1466  VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3,
1467  mask0, mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
1468 
1469  hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
1470  DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
1471 
1472  hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
1473  DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
1474 
1475  SRARI_W2_SW(hz_res0, hz_res1, 10);
1476  SAT_SW2_SW(hz_res0, hz_res1, 7);
1477 
1478  res0 = __msa_srari_h(shf_vec2, 5);
1479  res1 = __msa_srari_h(shf_vec5, 5);
1480 
1481  SAT_SH2_SH(res0, res1, 7);
1482 
1483  if (horiz_offset) {
1484  res0 = __msa_ilvod_h(zeros, res0);
1485  res1 = __msa_ilvod_h(zeros, res1);
1486  } else {
1487  ILVEV_H2_SH(res0, zeros, res1, zeros, res0, res1);
1488  }
1489  hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) res0);
1490  hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) res1);
1491  res0 = __msa_pckev_h((v8i16) hz_res1, (v8i16) hz_res0);
1492 
1493  res = PCKEV_XORI128_UB(res0, res0);
1494 
1495  dst0 = __msa_aver_u_b(res, dst0);
1496 
1497  ST4x2_UB(dst0, dst, dst_stride);
1498  dst += (2 * dst_stride);
1499 
1500  src0 = src2;
1501  src1 = src3;
1502  src2 = src4;
1503  src3 = src5;
1504  src4 = src6;
1505  }
1506 }
1507 
1508 static void avc_luma_midh_qrt_and_aver_dst_8w_msa(const uint8_t *src,
1509  int32_t src_stride,
1510  uint8_t *dst,
1511  int32_t dst_stride,
1512  int32_t height,
1513  uint8_t horiz_offset)
1514 {
1515  uint32_t multiple8_cnt;
1516 
1517  for (multiple8_cnt = 2; multiple8_cnt--;) {
1518  avc_luma_midh_qrt_and_aver_dst_4w_msa(src, src_stride, dst, dst_stride,
1519  height, horiz_offset);
1520 
1521  src += 4;
1522  dst += 4;
1523  }
1524 }
1525 
1526 static void avc_luma_midh_qrt_and_aver_dst_16w_msa(const uint8_t *src,
1527  int32_t src_stride,
1528  uint8_t *dst,
1529  int32_t dst_stride,
1530  int32_t height,
1531  uint8_t horiz_offset)
1532 {
1533  uint32_t multiple8_cnt;
1534 
1535  for (multiple8_cnt = 4; multiple8_cnt--;) {
1536  avc_luma_midh_qrt_and_aver_dst_4w_msa(src, src_stride, dst, dst_stride,
1537  height, horiz_offset);
1538 
1539  src += 4;
1540  dst += 4;
1541  }
1542 }
1543 
1544 static void avc_luma_midv_qrt_and_aver_dst_4w_msa(const uint8_t *src,
1545  int32_t src_stride,
1546  uint8_t *dst,
1547  int32_t dst_stride,
1548  int32_t height,
1549  uint8_t ver_offset)
1550 {
1551  int32_t loop_cnt;
1552  int32_t out0, out1;
1553  v16i8 src0, src1, src2, src3, src4;
1554  v16u8 dst0, dst1;
1555  v16i8 mask0, mask1, mask2;
1556  v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1557  v8i16 hz_out4, hz_out5, hz_out6;
1558  v8i16 res0, res1, res2, res3;
1559  v16u8 vec0, vec1;
1560 
1561  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1562  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1563  src += (5 * src_stride);
1564 
1565  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1566 
1567  hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
1568  mask0, mask1, mask2);
1569  hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
1570  mask0, mask1, mask2);
1571 
1572  PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
1573 
1574  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1575 
1576  for (loop_cnt = (height >> 1); loop_cnt--;) {
1577  LD_SB2(src, src_stride, src0, src1);
1578  src += (2 * src_stride);
1579 
1580  XORI_B2_128_SB(src0, src1);
1581  LD_UB2(dst, dst_stride, dst0, dst1);
1582  hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
1583  mask0, mask1,
1584  mask2);
1585  hz_out6 = (v8i16) __msa_pckod_d((v2i64) hz_out5, (v2i64) hz_out5);
1586  res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
1587  hz_out3, hz_out4, hz_out5);
1588  res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
1589  hz_out4, hz_out5, hz_out6);
1590 
1591  if (ver_offset) {
1592  res1 = __msa_srari_h(hz_out3, 5);
1593  res3 = __msa_srari_h(hz_out4, 5);
1594  } else {
1595  res1 = __msa_srari_h(hz_out2, 5);
1596  res3 = __msa_srari_h(hz_out3, 5);
1597  }
1598 
1599  SAT_SH2_SH(res1, res3, 7);
1600 
1601  res0 = __msa_aver_s_h(res0, res1);
1602  res1 = __msa_aver_s_h(res2, res3);
1603 
1604  vec0 = PCKEV_XORI128_UB(res0, res0);
1605  vec1 = PCKEV_XORI128_UB(res1, res1);
1606 
1607  AVER_UB2_UB(vec0, dst0, vec1, dst1, dst0, dst1);
1608 
1609  out0 = __msa_copy_u_w((v4i32) dst0, 0);
1610  out1 = __msa_copy_u_w((v4i32) dst1, 0);
1611  SW(out0, dst);
1612  dst += dst_stride;
1613  SW(out1, dst);
1614  dst += dst_stride;
1615 
1616  hz_out0 = hz_out2;
1617  hz_out1 = hz_out3;
1618  hz_out2 = hz_out4;
1619  hz_out3 = hz_out5;
1620  hz_out4 = hz_out6;
1621  }
1622 }
1623 
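/* Same vertical-pass averaging as above, but 8 columns wide with four output
 * rows per iteration, using the 8-width shuffle masks at luma_mask_arr[0]. */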
1624 static void avc_luma_midv_qrt_and_aver_dst_8w_msa(const uint8_t *src,
1625  int32_t src_stride,
1626  uint8_t *dst,
1627  int32_t dst_stride,
1628  int32_t height,
1629  uint8_t vert_offset)
1630 {
1631  int32_t loop_cnt;
1632  v16i8 src0, src1, src2, src3, src4;
1633  v16u8 dst0, dst1, dst2, dst3;
1634  v16i8 mask0, mask1, mask2;
1635  v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1636  v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
1637  v8i16 res0, res1, res2, res3;
1638  v8i16 res4, res5, res6, res7;
1639 
1640  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1641 
1642  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1643  XORI_B5_128_SB(src0, src1, src2, src3, src4);
1644  src += (5 * src_stride);
1645 
1646  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1647  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1648  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1649  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1650  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1651 
1652  for (loop_cnt = (height >> 2); loop_cnt--;) {
1653  LD_SB4(src, src_stride, src0, src1, src2, src3);
1654  XORI_B4_128_SB(src0, src1, src2, src3);
1655  src += (4 * src_stride);
1656 
1657  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1658 
1659  hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1660  hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1661  hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1662  hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1663 
1664  res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
1665  hz_out3, hz_out4, hz_out5);
1666  res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
1667  hz_out4, hz_out5, hz_out6);
1668  res4 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
1669  hz_out5, hz_out6, hz_out7);
1670  res6 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
1671  hz_out6, hz_out7, hz_out8);
1672 
1673  if (vert_offset) {
1674  res1 = __msa_srari_h(hz_out3, 5);
1675  res3 = __msa_srari_h(hz_out4, 5);
1676  res5 = __msa_srari_h(hz_out5, 5);
1677  res7 = __msa_srari_h(hz_out6, 5);
1678  } else {
1679  res1 = __msa_srari_h(hz_out2, 5);
1680  res3 = __msa_srari_h(hz_out3, 5);
1681  res5 = __msa_srari_h(hz_out4, 5);
1682  res7 = __msa_srari_h(hz_out5, 5);
1683  }
1684 
1685  SAT_SH4_SH(res1, res3, res5, res7, 7);
1686 
1687  res0 = __msa_aver_s_h(res0, res1);
1688  res1 = __msa_aver_s_h(res2, res3);
1689  res2 = __msa_aver_s_h(res4, res5);
1690  res3 = __msa_aver_s_h(res6, res7);
1691  ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1692  CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1,
1693  dst, dst_stride);
1694  dst += (4 * dst_stride);
1695 
1696  hz_out0 = hz_out4;
1697  hz_out1 = hz_out5;
1698  hz_out2 = hz_out6;
1699  hz_out3 = hz_out7;
1700  hz_out4 = hz_out8;
1701  }
1702 }
1703 
1704 static void avc_luma_midv_qrt_and_aver_dst_16w_msa(const uint8_t *src,
1705  int32_t src_stride,
1706  uint8_t *dst,
1707  int32_t dst_stride,
1708  int32_t height,
1709  uint8_t vert_offset)
1710 {
1711  int32_t multiple8_cnt;
1712 
1713  for (multiple8_cnt = 2; multiple8_cnt--;) {
1714  avc_luma_midv_qrt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
1715  height, vert_offset);
1716 
1717  src += 8;
1718  dst += 8;
1719  }
1720 }
1721 
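/* 4x4 diagonal quarter-pel with destination averaging: one 6-tap pass over
 * the rows addressed by src_x (horizontal half-pel) and an independent 6-tap
 * pass down the columns addressed by src_y (vertical half-pel); the two
 * rounded results are averaged with each other and then with dst. */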
1722 static void avc_luma_hv_qrt_and_aver_dst_4x4_msa(const uint8_t *src_x,
1723  const uint8_t *src_y,
1724  int32_t src_stride,
1725  uint8_t *dst,
1726  int32_t dst_stride)
1727 {
1728  v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
1729  v16u8 dst0, dst1, dst2, dst3;
1730  v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
1731  v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
1732  v16i8 mask0, mask1, mask2;
1733  v8i16 hz_out0, hz_out1, vert_out0, vert_out1;
1734  v8i16 res0, res1;
1735  v16u8 res;
1736 
1737  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1738  LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
1739  src_y += (5 * src_stride);
1740 
1741  src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
1742  src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
1743  src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
1744  src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
1745 
1746  XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
1747  LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
1748  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1749  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
1750  hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz0, src_hz1,
1751  mask0, mask1, mask2);
1752  hz_out1 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz2, src_hz3,
1753  mask0, mask1, mask2);
1754  SRARI_H2_SH(hz_out0, hz_out1, 5);
1755  SAT_SH2_SH(hz_out0, hz_out1, 7);
1756  LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
1757 
1758  src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
1759  src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
1760  src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
1761  src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
1762 
1763  XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
1764 
1765  /* filter calc */
1766  vert_out0 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt0, src_vt1, src_vt2,
1767  src_vt3, src_vt4, src_vt5);
1768  vert_out1 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt2, src_vt3, src_vt4,
1769  src_vt5, src_vt6, src_vt7);
1770  SRARI_H2_SH(vert_out0, vert_out1, 5);
1771  SAT_SH2_SH(vert_out0, vert_out1, 7);
1772 
1773  res1 = __msa_srari_h((hz_out1 + vert_out1), 1);
1774  res0 = __msa_srari_h((hz_out0 + vert_out0), 1);
1775 
1776  SAT_SH2_SH(res0, res1, 7);
1777  res = PCKEV_XORI128_UB(res0, res1);
1778 
1779  dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
1780  dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
1781  dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
1782  dst0 = __msa_aver_u_b(res, dst0);
1783 
1784  ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
1785 }
1786 
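/* 8x8 variant of the same horizontal/vertical combination; the vertical taps
 * work on byte vectors packed two rows per register via __msa_insve_d. */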
1787 static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
1788  const uint8_t *src_y,
1789  int32_t src_stride,
1790  uint8_t *dst,
1791  int32_t dst_stride)
1792 {
1793  uint32_t loop_cnt;
1794  v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
1795  v16u8 dst0, dst1, dst2, dst3;
1796  v16i8 src_vt0, src_vt1, src_vt2, src_vt3;
1797  v16i8 src_vt4, src_vt5, src_vt6, src_vt7, src_vt8;
1798  v16i8 mask0, mask1, mask2;
1799  v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1800  v8i16 vert_out0, vert_out1, vert_out2, vert_out3;
1801  v8i16 out0, out1, out2, out3;
1802 
1803  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1804 
1805  LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
1806  src_y += (5 * src_stride);
1807 
1808  src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1);
1809  src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2);
1810  src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3);
1811  src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4);
1812 
1813  XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
1814 
1815  for (loop_cnt = 2; loop_cnt--;) {
1816  LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
1817  XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
1818  src_x += (4 * src_stride);
1819 
1820  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1821  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
1822  hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
1823  hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
1824  hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
1825  SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
1826  SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
1827  LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
1828  src_y += (4 * src_stride);
1829 
1830  src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5);
1831  src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6);
1832  src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7);
1833  src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8);
1834 
1835  XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
1836  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt0, src_vt1, src_vt2, src_vt3,
1837  src_vt4, src_vt5, vert_out0, vert_out1);
1838  AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt2, src_vt3, src_vt4, src_vt5,
1839  src_vt6, src_vt7, vert_out2, vert_out3);
1840  SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5);
1841  SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7);
1842 
1843  out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
1844  out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
1845  out2 = __msa_srari_h((hz_out2 + vert_out2), 1);
1846  out3 = __msa_srari_h((hz_out3 + vert_out3), 1);
1847 
1848  SAT_SH4_SH(out0, out1, out2, out3, 7);
1849  ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1850  CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
1851  dst, dst_stride);
1852  dst += (4 * dst_stride);
1853 
1854  src_vt0 = src_vt4;
1855  src_vt1 = src_vt5;
1856  src_vt2 = src_vt6;
1857  src_vt3 = src_vt7;
1858  src_vt4 = src_vt8;
1859  }
1860 }
1861 
1862 static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x,
1863  const uint8_t *src_y,
1864  int32_t src_stride,
1865  uint8_t *dst,
1866  int32_t dst_stride)
1867 {
1868  uint32_t multiple8_cnt;
1869 
1870  for (multiple8_cnt = 2; multiple8_cnt--;) {
1871  avc_luma_hv_qrt_and_aver_dst_8x8_msa(src_x, src_y, src_stride,
1872  dst, dst_stride);
1873 
1874  src_x += 8;
1875  src_y += 8;
1876  dst += 8;
1877  }
1878 
1879  src_x += (8 * src_stride) - 16;
1880  src_y += (8 * src_stride) - 16;
1881  dst += (8 * dst_stride) - 16;
1882 
1883  for (multiple8_cnt = 2; multiple8_cnt--;) {
1884  avc_luma_hv_qrt_and_aver_dst_8x8_msa(src_x, src_y, src_stride,
1885  dst, dst_stride);
1886 
1887  src_x += 8;
1888  src_y += 8;
1889  dst += 8;
1890  }
1891 }
1892 
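/* Full-pel (mc00) 16x16 copy: sixteen rows are loaded and stored unchanged. */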
1893 void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
1894  ptrdiff_t stride)
1895 {
1896  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1897  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
1898 
1899  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1900  src += (8 * stride);
1901  LD_UB8(src, stride, src8, src9, src10, src11, src12, src13, src14, src15);
1902 
1903  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, stride);
1904  dst += (8 * stride);
1905  ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, stride);
1906 }
1907 
1908 void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
1909  ptrdiff_t stride)
1910 {
1911  uint64_t src0, src1, src2, src3, src4, src5, src6, src7;
1912 
1913  LD4(src, stride, src0, src1, src2, src3);
1914  src += 4 * stride;
1915  LD4(src, stride, src4, src5, src6, src7);
1916  SD4(src0, src1, src2, src3, dst, stride);
1917  dst += 4 * stride;
1918  SD4(src4, src5, src6, src7, dst, stride);
1919 }
1920 
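/* Full-pel 16x16 average: every source row is averaged with the existing dst
 * row ((a + b + 1) >> 1) and written back. */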
1921 void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
1922  ptrdiff_t stride)
1923 {
1924  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1925  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1926 
1927  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1928  src += (8 * stride);
1929  LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
1930 
1931  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
1932  dst2, dst3);
1933  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
1934  dst6, dst7);
1935  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
1936  dst += (8 * stride);
1937 
1938  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1939  LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
1940 
1941  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
1942  dst2, dst3);
1943  AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
1944  dst6, dst7);
1945  ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
1946 }
1947 
1948 void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
1949  ptrdiff_t stride)
1950 {
1951  uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
1952  v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
1953  v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
1954 
1955  LD4(src, stride, tp0, tp1, tp2, tp3);
1956  src += 4 * stride;
1957  LD4(src, stride, tp4, tp5, tp6, tp7);
1958  INSERT_D2_UB(tp0, tp1, src0);
1959  INSERT_D2_UB(tp2, tp3, src1);
1960  INSERT_D2_UB(tp4, tp5, src2);
1961  INSERT_D2_UB(tp6, tp7, src3);
1962 
1963  LD4(dst, stride, tp0, tp1, tp2, tp3);
1964  LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
1965  INSERT_D2_UB(tp0, tp1, dst0);
1966  INSERT_D2_UB(tp2, tp3, dst1);
1967  INSERT_D2_UB(tp4, tp5, dst2);
1968  INSERT_D2_UB(tp6, tp7, dst3);
1969 
1970  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
1971  dst2, dst3);
1972 
1973  ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
1974 }
1975 
1976 void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
1977  ptrdiff_t stride)
1978 {
1979  uint32_t tp0, tp1, tp2, tp3;
1980  v16u8 src0 = { 0 }, dst0 = { 0 };
1981 
1982  LW4(src, stride, tp0, tp1, tp2, tp3);
1983  INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
1984  LW4(dst, stride, tp0, tp1, tp2, tp3);
1985  INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1986 
1987  dst0 = __msa_aver_u_b(src0, dst0);
1988 
1989  ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
1990 }
1991 
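/* Horizontal quarter-pel, 16 columns: the 6-tap half-pel result (rounded and
 * saturated) is averaged with the nearest integer pixel; the SLDI_B shift of
 * 2 selects the left neighbour here, while the variant below shifts by 3 to
 * select the right one. */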
1992 void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
1993  ptrdiff_t stride)
1994 {
1995  uint32_t loop_cnt;
1996  v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
1997  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
1998  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
1999  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
2000  v16i8 minus5b = __msa_ldi_b(-5);
2001  v16i8 plus20b = __msa_ldi_b(20);
2002 
2003  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
2004  mask3 = mask0 + 8;
2005  mask4 = mask1 + 8;
2006  mask5 = mask2 + 8;
2007  src -= 2;
2008 
2009  for (loop_cnt = 4; loop_cnt--;) {
2010  LD_SB2(src, 16, src0, src1);
2011  src += stride;
2012  LD_SB2(src, 16, src2, src3);
2013  src += stride;
2014  LD_SB2(src, 16, src4, src5);
2015  src += stride;
2016  LD_SB2(src, 16, src6, src7);
2017  src += stride;
2018 
2019  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2020  VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
2021  VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
2022  VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
2023  VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
2024  VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
2025  VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
2026  HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
2027  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
2028  minus5b, res0, res1, res2, res3);
2029  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
2030  plus20b, res0, res1, res2, res3);
2031  VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
2032  VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
2033  VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
2034  VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
2035  VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
2036  VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
2037  HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
2038  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
2039  minus5b, res4, res5, res6, res7);
2040  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
2041  plus20b, res4, res5, res6, res7);
2042  SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2);
2043  SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2);
2044  SRARI_H4_SH(res0, res1, res2, res3, 5);
2045  SRARI_H4_SH(res4, res5, res6, res7, 5);
2046  SAT_SH4_SH(res0, res1, res2, res3, 7);
2047  SAT_SH4_SH(res4, res5, res6, res7, 7);
2048  PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
2049  PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
2050  dst0 = __msa_aver_s_b(dst0, src0);
2051  dst1 = __msa_aver_s_b(dst1, src2);
2052  dst2 = __msa_aver_s_b(dst2, src4);
2053  dst3 = __msa_aver_s_b(dst3, src6);
2054  XORI_B4_128_SB(dst0, dst1, dst2, dst3);
2055  ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
2056  dst += (4 * stride);
2057  }
2058 }
2059 
2060 void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
2061  ptrdiff_t stride)
2062 {
2063  uint32_t loop_cnt;
2064  v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
2065  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
2066  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
2067  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
2068  v16i8 minus5b = __msa_ldi_b(-5);
2069  v16i8 plus20b = __msa_ldi_b(20);
2070 
2071  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
2072  mask3 = mask0 + 8;
2073  mask4 = mask1 + 8;
2074  mask5 = mask2 + 8;
2075  src -= 2;
2076 
2077  for (loop_cnt = 4; loop_cnt--;) {
2078  LD_SB2(src, 16, src0, src1);
2079  src += stride;
2080  LD_SB2(src, 16, src2, src3);
2081  src += stride;
2082  LD_SB2(src, 16, src4, src5);
2083  src += stride;
2084  LD_SB2(src, 16, src6, src7);
2085  src += stride;
2086 
2087  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2088  VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
2089  VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
2090  VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
2091  VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
2092  VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
2093  VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
2094  HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
2095  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
2096  minus5b, res0, res1, res2, res3);
2097  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
2098  plus20b, res0, res1, res2, res3);
2099  VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
2100  VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
2101  VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
2102  VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
2103  VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
2104  VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
2105  HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
2106  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
2107  minus5b, res4, res5, res6, res7);
2108  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
2109  plus20b, res4, res5, res6, res7);
2110  SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 3);
2111  SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 3);
2112  SRARI_H4_SH(res0, res1, res2, res3, 5);
2113  SRARI_H4_SH(res4, res5, res6, res7, 5);
2114  SAT_SH4_SH(res0, res1, res2, res3, 7);
2115  SAT_SH4_SH(res4, res5, res6, res7, 7);
2116  PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
2117  PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
2118  dst0 = __msa_aver_s_b(dst0, src0);
2119  dst1 = __msa_aver_s_b(dst1, src2);
2120  dst2 = __msa_aver_s_b(dst2, src4);
2121  dst3 = __msa_aver_s_b(dst3, src6);
2122  XORI_B4_128_SB(dst0, dst1, dst2, dst3);
2123  ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
2124  dst += (4 * stride);
2125  }
2126 }
2127 
2128 void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
2129  ptrdiff_t stride)
2130 {
2131  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
2132  v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
2133  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
2134  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
2135  v16i8 minus5b = __msa_ldi_b(-5);
2136  v16i8 plus20b = __msa_ldi_b(20);
2137 
2138  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
2139  LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2140  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2141  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2142  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2143  HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
2144  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
2145  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
2146  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
2147  res0, res1, res2, res3);
2148  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
2149  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
2150  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
2151  res0, res1, res2, res3);
2152  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
2153  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
2154  HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
2155  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
2156  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
2157  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
2158  res4, res5, res6, res7);
2159  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
2160  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
2161  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
2162  res4, res5, res6, res7);
2163  SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
2164  SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
2165  SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 2);
2166  SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 2);
2167  PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
2168  PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
2169  SRARI_H4_SH(res0, res1, res2, res3, 5);
2170  SRARI_H4_SH(res4, res5, res6, res7, 5);
2171  SAT_SH4_SH(res0, res1, res2, res3, 7);
2172  SAT_SH4_SH(res4, res5, res6, res7, 7);
2173  PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
2174  PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
2175  tmp0 = __msa_aver_s_b(tmp0, src0);
2176  tmp1 = __msa_aver_s_b(tmp1, src1);
2177  tmp2 = __msa_aver_s_b(tmp2, src4);
2178  tmp3 = __msa_aver_s_b(tmp3, src5);
2179  XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
2180  ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
2181 }
2182 
2183 void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
2184  ptrdiff_t stride)
2185 {
2186  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
2187  v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
2188  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
2189  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
2190  v16i8 minus5b = __msa_ldi_b(-5);
2191  v16i8 plus20b = __msa_ldi_b(20);
2192 
2193  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
2194  LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2195  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2196  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2197  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2198  HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
2199  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
2200  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
2201  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
2202  res0, res1, res2, res3);
2203  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
2204  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
2205  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
2206  res0, res1, res2, res3);
2207  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
2208  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
2209  HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
2210  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
2211  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
2212  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
2213  res4, res5, res6, res7);
2214  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
2215  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
2216  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
2217  res4, res5, res6, res7);
2218  SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
2219  SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
2220  SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 3);
2221  SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 3);
2222  PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
2223  PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
2224  SRARI_H4_SH(res0, res1, res2, res3, 5);
2225  SRARI_H4_SH(res4, res5, res6, res7, 5);
2226  SAT_SH4_SH(res0, res1, res2, res3, 7);
2227  SAT_SH4_SH(res4, res5, res6, res7, 7);
2228  PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
2229  PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
2230  tmp0 = __msa_aver_s_b(tmp0, src0);
2231  tmp1 = __msa_aver_s_b(tmp1, src1);
2232  tmp2 = __msa_aver_s_b(tmp2, src4);
2233  tmp3 = __msa_aver_s_b(tmp3, src5);
2234  XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
2235  ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
2236 }
2237 
2238 void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
2239  ptrdiff_t stride)
2240 {
2241  v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
2242  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2243  v8i16 res0, res1;
2244  v16i8 minus5b = __msa_ldi_b(-5);
2245  v16i8 plus20b = __msa_ldi_b(20);
2246 
2247  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2248  LD_SB4(src - 2, stride, src0, src1, src2, src3);
2249  XORI_B4_128_SB(src0, src1, src2, src3);
2250  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
2251  HADD_SB2_SH(vec0, vec1, res0, res1);
2252  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
2253  DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
2254  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
2255  DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
2256  SRARI_H2_SH(res0, res1, 5);
2257  SAT_SH2_SH(res0, res1, 7);
2258  res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
2259  SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
2260  SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
2261  src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
2262  src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
2263  src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
2264  res = __msa_aver_s_b(res, src0);
2265  res = (v16i8) __msa_xori_b((v16u8) res, 128);
2266  ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
2267 }
2268 
2269 void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
2270  ptrdiff_t stride)
2271 {
2272  v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
2273  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2274  v8i16 res0, res1;
2275  v16i8 minus5b = __msa_ldi_b(-5);
2276  v16i8 plus20b = __msa_ldi_b(20);
2277 
2278  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2279  LD_SB4(src - 2, stride, src0, src1, src2, src3);
2280  XORI_B4_128_SB(src0, src1, src2, src3);
2281  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
2282  HADD_SB2_SH(vec0, vec1, res0, res1);
2283  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
2284  DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
2285  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
2286  DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
2287  SRARI_H2_SH(res0, res1, 5);
2288  SAT_SH2_SH(res0, res1, 7);
2289  res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
2290  SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
2291  SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
2292  src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
2293  src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
2294  src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
2295  res = __msa_aver_s_b(res, src0);
2296  res = (v16i8) __msa_xori_b((v16u8) res, 128);
2297  ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
2298 }
2299 
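/* Plain horizontal half-pel for a 16-wide block: 6-tap filter, round with
 * srari by 5 bits, clip to 8 bits and store, with no averaging against the
 * integer pixels. */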
2300 void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
2301  ptrdiff_t stride)
2302 {
2303  uint32_t loop_cnt;
2304  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
2305  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
2306  v16i8 vec11;
2307  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
2308  v16i8 minus5b = __msa_ldi_b(-5);
2309  v16i8 plus20b = __msa_ldi_b(20);
2310 
2311  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
2312  src -= 2;
2313 
2314  for (loop_cnt = 4; loop_cnt--;) {
2315  LD_SB2(src, 8, src0, src1);
2316  src += stride;
2317  LD_SB2(src, 8, src2, src3);
2318  src += stride;
2319  LD_SB2(src, 8, src4, src5);
2320  src += stride;
2321  LD_SB2(src, 8, src6, src7);
2322  src += stride;
2323 
2324  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2325  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
2326  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
2327  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
2328  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
2329  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
2330  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
2331  HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
2332  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
2333  minus5b, res0, res1, res2, res3);
2334  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
2335  plus20b, res0, res1, res2, res3);
2336  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
2337  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
2338  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
2339  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
2340  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
2341  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
2342  HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
2343  DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
2344  minus5b, res4, res5, res6, res7);
2345  DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
2346  plus20b, res4, res5, res6, res7);
2347  SRARI_H4_SH(res0, res1, res2, res3, 5);
2348  SRARI_H4_SH(res4, res5, res6, res7, 5);
2349  SAT_SH4_SH(res0, res1, res2, res3, 7);
2350  SAT_SH4_SH(res4, res5, res6, res7, 7);
2351  PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
2352  vec2, vec3);
2353  XORI_B4_128_SB(vec0, vec1, vec2, vec3);
2354  ST_SB4(vec0, vec1, vec2, vec3, dst, stride);
2355  dst += (4 * stride);
2356  }
2357 }
2358 
2359 void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
2360  ptrdiff_t stride)
2361 {
2362  v16u8 out0, out1, out2, out3;
2363  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
2364  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
2365  v16i8 vec11;
2366  v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
2367  v16i8 minus5b = __msa_ldi_b(-5);
2368  v16i8 plus20b = __msa_ldi_b(20);
2369 
2370  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
2371  LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2372  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2373  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2374  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2375  HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
2376  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
2377  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
2378  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
2379  res0, res1, res2, res3);
2380  VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
2381  VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
2382  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
2383  plus20b, res0, res1, res2, res3);
2384  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
2385  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
2386  HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
2387  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
2388  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
2389  DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
2390  res4, res5, res6, res7);
2391  VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
2392  VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
2393  DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
2394  plus20b, res4, res5, res6, res7);
2395  SRARI_H4_SH(res0, res1, res2, res3, 5);
2396  SRARI_H4_SH(res4, res5, res6, res7, 5);
2397  SAT_SH4_SH(res0, res1, res2, res3, 7);
2398  SAT_SH4_SH(res4, res5, res6, res7, 7);
2399  out0 = PCKEV_XORI128_UB(res0, res1);
2400  out1 = PCKEV_XORI128_UB(res2, res3);
2401  out2 = PCKEV_XORI128_UB(res4, res5);
2402  out3 = PCKEV_XORI128_UB(res6, res7);
2403  ST8x8_UB(out0, out1, out2, out3, dst, stride);
2404 }
2405 
2406 void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
2407  ptrdiff_t stride)
2408 {
2409  v16u8 out;
2410  v16i8 src0, src1, src2, src3, mask0, mask1, mask2;
2411  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2412  v8i16 res0, res1;
2413  v16i8 minus5b = __msa_ldi_b(-5);
2414  v16i8 plus20b = __msa_ldi_b(20);
2415 
2416  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2417  LD_SB4(src - 2, stride, src0, src1, src2, src3);
2418  XORI_B4_128_SB(src0, src1, src2, src3);
2419  VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
2420  HADD_SB2_SH(vec0, vec1, res0, res1);
2421  VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
2422  DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
2423  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
2424  DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
2425  SRARI_H2_SH(res0, res1, 5);
2426  SAT_SH2_SH(res0, res1, 7);
2427  out = PCKEV_XORI128_UB(res0, res1);
2428  ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
2429 }
2430 
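/* Vertical quarter-pel, 16 columns: filt_const0/1/2 pack the 6-tap
 * coefficient pairs (1, -5), (20, 20) and (-5, 1) for the dot-product
 * macros, and the filtered rows are averaged with the nearer integer rows
 * (src2..src5 in this variant). */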
2431 void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
2432  ptrdiff_t stride)
2433 {
2434  int32_t loop_cnt;
2435  int16_t filt_const0 = 0xfb01;
2436  int16_t filt_const1 = 0x1414;
2437  int16_t filt_const2 = 0x1fb;
2438  v16u8 res0, res1, res2, res3;
2439  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2440  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2441  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
2442  v16i8 src65_l, src87_l, filt0, filt1, filt2;
2443  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2444 
2445  filt0 = (v16i8) __msa_fill_h(filt_const0);
2446  filt1 = (v16i8) __msa_fill_h(filt_const1);
2447  filt2 = (v16i8) __msa_fill_h(filt_const2);
2448 
2449  src -= (stride * 2);
2450 
2451  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2452  src += (5 * stride);
2453 
2454  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2455  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2456  src32_r, src43_r);
2457  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
2458  src32_l, src43_l);
2459 
2460  for (loop_cnt = 4; loop_cnt--;) {
2461  LD_SB4(src, stride, src5, src6, src7, src8);
2462  src += (4 * stride);
2463 
2464  XORI_B4_128_SB(src5, src6, src7, src8);
2465  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
2466  src65_r, src76_r, src87_r);
2467  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
2468  src65_l, src76_l, src87_l);
2469  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2470  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2471  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2472  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2473  out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2474  out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
2475  out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2476  out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2477  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2478  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2479  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
2480  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2481  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2482  out3_r, res0, res1, res2, res3);
2483  res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
2484  res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
2485  res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
2486  res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
2487  XORI_B4_128_UB(res0, res1, res2, res3);
2488  ST_UB4(res0, res1, res2, res3, dst, stride);
2489  dst += (4 * stride);
2490 
2491  src10_r = src54_r;
2492  src32_r = src76_r;
2493  src21_r = src65_r;
2494  src43_r = src87_r;
2495  src10_l = src54_l;
2496  src32_l = src76_l;
2497  src21_l = src65_l;
2498  src43_l = src87_l;
2499  src2 = src6;
2500  src3 = src7;
2501  src4 = src8;
2502  }
2503 }
2504 
2505 void ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
2506  ptrdiff_t stride)
2507 {
2508  int32_t loop_cnt;
2509  int16_t filt_const0 = 0xfb01;
2510  int16_t filt_const1 = 0x1414;
2511  int16_t filt_const2 = 0x1fb;
2512  v16u8 res0, res1, res2, res3;
2513  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2514  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2515  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
2516  v16i8 src65_l, src87_l, filt0, filt1, filt2;
2517  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2518 
2519  filt0 = (v16i8) __msa_fill_h(filt_const0);
2520  filt1 = (v16i8) __msa_fill_h(filt_const1);
2521  filt2 = (v16i8) __msa_fill_h(filt_const2);
2522 
2523  src -= (stride * 2);
2524 
2525  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2526  src += (5 * stride);
2527 
2528  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2529  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2530  src32_r, src43_r);
2531  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
2532  src32_l, src43_l);
2533 
2534  for (loop_cnt = 4; loop_cnt--;) {
2535  LD_SB4(src, stride, src5, src6, src7, src8);
2536  src += (4 * stride);
2537 
2538  XORI_B4_128_SB(src5, src6, src7, src8);
2539  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
2540  src65_r, src76_r, src87_r);
2541  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
2542  src65_l, src76_l, src87_l);
2543  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2544  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2545  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2546  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2547  out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2548  out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
2549  out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2550  out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2551  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2552  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2553  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
2554  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2555  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2556  out3_r, res0, res1, res2, res3);
2557  res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
2558  res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
2559  res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
2560  res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
2561  XORI_B4_128_UB(res0, res1, res2, res3);
2562  ST_UB4(res0, res1, res2, res3, dst, stride);
2563  dst += (4 * stride);
2564 
2565  src10_r = src54_r;
2566  src32_r = src76_r;
2567  src21_r = src65_r;
2568  src43_r = src87_r;
2569  src10_l = src54_l;
2570  src32_l = src76_l;
2571  src21_l = src65_l;
2572  src43_l = src87_l;
2573  src3 = src7;
2574  src4 = src8;
2575  }
2576 }
2577 
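/* 8-wide vertical quarter-pel: the whole 8x8 output is produced from 13
 * input rows in one pass, with the rows used for averaging paired two per
 * register by PCKEV_D2_SB. */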
2578 void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
2579  ptrdiff_t stride)
2580 {
2581  const int16_t filt_const0 = 0xfb01;
2582  const int16_t filt_const1 = 0x1414;
2583  const int16_t filt_const2 = 0x1fb;
2584  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2585  v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
2586  v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
2587  v16i8 tmp0, tmp1, tmp2, tmp3, filt0, filt1, filt2, out0, out1, out2, out3;
2588  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
2589 
2590  filt0 = (v16i8) __msa_fill_h(filt_const0);
2591  filt1 = (v16i8) __msa_fill_h(filt_const1);
2592  filt2 = (v16i8) __msa_fill_h(filt_const2);
2593 
2594  src -= (stride * 2);
2595 
2596  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2597  src += (5 * stride);
2598  LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
2599  XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
2600  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2601  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2602  src32_r, src43_r);
2603  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2604  src76_r, src87_r);
2605  ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
2606  src109_r, src1110_r, src1211_r);
2607  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2608  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2609  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2610  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2611  out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
2612  out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
2613  out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
2614  out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
2615  PCKEV_D2_SB(src3, src2, src5, src4, tmp0, tmp1);
2616  PCKEV_D2_SB(src7, src6, src9, src8, tmp2, tmp3);
2617  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2618  SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
2619  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2620  SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
2621  PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
2622  PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
2623  out0 = __msa_aver_s_b(out0, tmp0);
2624  out1 = __msa_aver_s_b(out1, tmp1);
2625  out2 = __msa_aver_s_b(out2, tmp2);
2626  out3 = __msa_aver_s_b(out3, tmp3);
2627  XORI_B4_128_SB(out0, out1, out2, out3);
2628  ST8x8_UB(out0, out1, out2, out3, dst, stride);
2629 }
2630 
2631 void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
2632  ptrdiff_t stride)
2633 {
2634  const int16_t filt_const0 = 0xfb01;
2635  const int16_t filt_const1 = 0x1414;
2636  const int16_t filt_const2 = 0x1fb;
2637  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2638  v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
2639  v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
2640  v16i8 filt0, filt1, filt2, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
2641  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
2642 
2643  filt0 = (v16i8) __msa_fill_h(filt_const0);
2644  filt1 = (v16i8) __msa_fill_h(filt_const1);
2645  filt2 = (v16i8) __msa_fill_h(filt_const2);
2646 
2647  src -= (stride * 2);
2648 
2649  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2650  src += (5 * stride);
2651  LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
2652  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2653  XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
2654  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2655  src32_r, src43_r);
2656  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2657  src76_r, src87_r);
2658  ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
2659  src109_r, src1110_r, src1211_r);
2660  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2661  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2662  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2663  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2664  out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
2665  out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
2666  out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
2667  out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
2668  PCKEV_D2_SB(src4, src3, src6, src5, tmp0, tmp1);
2669  PCKEV_D2_SB(src8, src7, src10, src9, tmp2, tmp3);
2670  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2671  SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
2672  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2673  SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
2674  PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
2675  PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
2676  out0 = __msa_aver_s_b(out0, tmp0);
2677  out1 = __msa_aver_s_b(out1, tmp1);
2678  out2 = __msa_aver_s_b(out2, tmp2);
2679  out3 = __msa_aver_s_b(out3, tmp3);
2680  XORI_B4_128_SB(out0, out1, out2, out3);
2681  ST8x8_UB(out0, out1, out2, out3, dst, stride);
2682 }
2683 
2684 void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
2685  ptrdiff_t stride)
2686 {
2687  int16_t filt_const0 = 0xfb01;
2688  int16_t filt_const1 = 0x1414;
2689  int16_t filt_const2 = 0x1fb;
2690  v16u8 out;
2691  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2692  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2693  v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
2694  v8i16 out10, out32;
2695 
2696  filt0 = (v16i8) __msa_fill_h(filt_const0);
2697  filt1 = (v16i8) __msa_fill_h(filt_const1);
2698  filt2 = (v16i8) __msa_fill_h(filt_const2);
2699 
2700  src -= (stride * 2);
2701 
2702  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2703  src += (5 * stride);
2704  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2705  src32_r, src43_r);
2706  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
2707  XORI_B2_128_SB(src2110, src4332);
2708  LD_SB4(src, stride, src5, src6, src7, src8);
2709  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2710  src76_r, src87_r);
2711  ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
2712  XORI_B2_128_SB(src6554, src8776);
2713  out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
2714  out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
2715  SRARI_H2_SH(out10, out32, 5);
2716  SAT_SH2_SH(out10, out32, 7);
2717  out = PCKEV_XORI128_UB(out10, out32);
2718  src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
2719  src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
2720  src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
2721  out = __msa_aver_u_b(out, (v16u8) src32_r);
2722  ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
2723 }
2724 
2725 void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
2726  ptrdiff_t stride)
2727 {
2728  int16_t filt_const0 = 0xfb01;
2729  int16_t filt_const1 = 0x1414;
2730  int16_t filt_const2 = 0x1fb;
2731  v16u8 out;
2732  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2733  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2734  v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
2735  v8i16 out10, out32;
2736 
2737  filt0 = (v16i8) __msa_fill_h(filt_const0);
2738  filt1 = (v16i8) __msa_fill_h(filt_const1);
2739  filt2 = (v16i8) __msa_fill_h(filt_const2);
2740 
2741  src -= (stride * 2);
2742 
2743  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2744  src += (5 * stride);
2745  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2746  src32_r, src43_r);
2747  ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
2748  XORI_B2_128_SB(src2110, src4332);
2749  LD_SB4(src, stride, src5, src6, src7, src8);
2750  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2751  src76_r, src87_r);
2752  ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
2753  XORI_B2_128_SB(src6554, src8776);
2754  out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
2755  out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
2756  SRARI_H2_SH(out10, out32, 5);
2757  SAT_SH2_SH(out10, out32, 7);
2758  out = PCKEV_XORI128_UB(out10, out32);
2759  src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
2760  src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
2761  src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
2762  out = __msa_aver_u_b(out, (v16u8) src32_r);
2763  ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
2764 }
2765 
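/* The diagonal quarter-pel wrappers below differ only in which samples feed
 * the shared hv helper: src_x selects the row for the horizontal half-pel
 * pass (current row or the one below) and src_y selects the column for the
 * vertical half-pel pass (current column or the one to its right). */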
2766 void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
2767  ptrdiff_t stride)
2768 {
2769  avc_luma_hv_qrt_16w_msa(src - 2,
2770  src - (stride * 2), stride, dst, stride, 16);
2771 }
2772 
2773 void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
2774  ptrdiff_t stride)
2775 {
2776  avc_luma_hv_qrt_16w_msa(src - 2,
2777  src - (stride * 2) +
2778  sizeof(uint8_t), stride, dst, stride, 16);
2779 }
2780 
2781 void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
2782  ptrdiff_t stride)
2783 {
2784  avc_luma_hv_qrt_16w_msa(src + stride - 2,
2785  src - (stride * 2), stride, dst, stride, 16);
2786 }
2787 
2788 void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
2789  ptrdiff_t stride)
2790 {
2791  avc_luma_hv_qrt_16w_msa(src + stride - 2,
2792  src - (stride * 2) +
2793  sizeof(uint8_t), stride, dst, stride, 16);
2794 }
2795 
2796 void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
2797  ptrdiff_t stride)
2798 {
2799  avc_luma_hv_qrt_8w_msa(src - 2, src - (stride * 2), stride, dst, stride, 8);
2800 }
2801 
2802 void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
2803  ptrdiff_t stride)
2804 {
2805  avc_luma_hv_qrt_8w_msa(src - 2,
2806  src - (stride * 2) +
2807  sizeof(uint8_t), stride, dst, stride, 8);
2808 }
2809 
2810 void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
2811  ptrdiff_t stride)
2812 {
2813  avc_luma_hv_qrt_8w_msa(src + stride - 2,
2814  src - (stride * 2), stride, dst, stride, 8);
2815 }
2816 
2817 void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
2818  ptrdiff_t stride)
2819 {
2820  avc_luma_hv_qrt_8w_msa(src + stride - 2,
2821  src - (stride * 2) +
2822  sizeof(uint8_t), stride, dst, stride, 8);
2823 }
2824 
2825 
2826 void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
2827  ptrdiff_t stride)
2828 {
2829  avc_luma_hv_qrt_4w_msa(src - 2, src - (stride * 2), stride, dst, stride, 4);
2830 }
2831 
2832 void ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
2833  ptrdiff_t stride)
2834 {
2835  avc_luma_hv_qrt_4w_msa(src - 2,
2836  src - (stride * 2) +
2837  sizeof(uint8_t), stride, dst, stride, 4);
2838 }
2839 
2840 void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
2841  ptrdiff_t stride)
2842 {
2843  avc_luma_hv_qrt_4w_msa(src + stride - 2,
2844  src - (stride * 2), stride, dst, stride, 4);
2845 }
2846 
2847 void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
2848  ptrdiff_t stride)
2849 {
2850  avc_luma_hv_qrt_4w_msa(src + stride - 2,
2851  src - (stride * 2) +
2852  sizeof(uint8_t), stride, dst, stride, 4);
2853 }
2854 
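/* Centre ("mid") filtering for a 16-wide block, handled as two 8-wide
 * halves: the horizontal 6-tap pass keeps 16-bit precision, the vertical
 * pass accumulates in 32 bits, and the packed result is averaged with the
 * rounded horizontal half-pel rows (hz_out2..hz_out5 here, hz_out3..hz_out6
 * in the variant that follows). */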
2855 void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
2856  ptrdiff_t stride)
2857 {
2858  uint8_t *dst_tmp = dst;
2859  const uint8_t *src_tmp = src - (2 * stride) - 2;
2860  uint32_t multiple8_cnt, loop_cnt;
2861  const int32_t filt_const0 = 0xfffb0001;
2862  const int32_t filt_const1 = 0x140014;
2863  const int32_t filt_const2 = 0x1fffb;
2864  v16u8 out0, out1;
2865  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
2866  v16i8 mask2;
2867  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2868  v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2869  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2870  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
2871  v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
2872  v8i16 hz_out87_l, filt0, filt1, filt2;
2873  v4i32 tmp0, tmp1;
2874 
2875  filt0 = (v8i16) __msa_fill_w(filt_const0);
2876  filt1 = (v8i16) __msa_fill_w(filt_const1);
2877  filt2 = (v8i16) __msa_fill_w(filt_const2);
2878 
2879  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
2880 
2881  for (multiple8_cnt = 2; multiple8_cnt--;) {
2882  dst = dst_tmp;
2883  src = src_tmp;
2884 
2885  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2886  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2887  src += (5 * stride);
2888 
2889  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
2890  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
2891  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
2892  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
2893  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
2894 
2895  for (loop_cnt = 4; loop_cnt--;) {
2896  LD_SB4(src, stride, src5, src6, src7, src8);
2897  src += (4 * stride);
2898 
2899  XORI_B4_128_SB(src5, src6, src7, src8);
2900 
2901  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
2902  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
2903  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
2904  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
2905 
2906  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
2907  hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
2908  hz_out43_r);
2909  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
2910  hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
2911  hz_out43_l);
2912  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
2913  hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
2914  hz_out87_r);
2915  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
2916  hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
2917  hz_out87_l);
2918 
2919  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
2920  filt1, filt2);
2921  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
2922  filt1, filt2);
2923  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2924  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
2925  filt1, filt2);
2926  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
2927  filt1, filt2);
2928  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2929  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
2930  filt1, filt2);
2931  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
2932  filt1, filt2);
2933  dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2934  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
2935  filt1, filt2);
2936  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
2937  filt1, filt2);
2938  dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2939 
2940  dst1 = __msa_srari_h(hz_out2, 5);
2941  dst3 = __msa_srari_h(hz_out3, 5);
2942  dst5 = __msa_srari_h(hz_out4, 5);
2943  dst7 = __msa_srari_h(hz_out5, 5);
2944  SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
2945 
2946  dst0 = __msa_aver_s_h(dst0, dst1);
2947  dst1 = __msa_aver_s_h(dst2, dst3);
2948  dst2 = __msa_aver_s_h(dst4, dst5);
2949  dst3 = __msa_aver_s_h(dst6, dst7);
2950 
2951  out0 = PCKEV_XORI128_UB(dst0, dst1);
2952  out1 = PCKEV_XORI128_UB(dst2, dst3);
2953  ST8x4_UB(out0, out1, dst, stride);
2954  dst += (4 * stride);
2955 
2956  hz_out0 = hz_out4;
2957  hz_out1 = hz_out5;
2958  hz_out2 = hz_out6;
2959  hz_out3 = hz_out7;
2960  hz_out4 = hz_out8;
2961  }
2962 
2963  src_tmp += 8;
2964  dst_tmp += 8;
2965  }
2966 }
2967 
2968 void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
2969  ptrdiff_t stride)
2970 {
2971  uint8_t *dst_tmp = dst;
2972  const uint8_t *src_tmp = src - (2 * stride) - 2;
2973  uint32_t multiple8_cnt, loop_cnt;
2974  const int32_t filt_const0 = 0xfffb0001;
2975  const int32_t filt_const1 = 0x140014;
2976  const int32_t filt_const2 = 0x1fffb;
2977  v16u8 out0, out1;
2978  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
2979  v16i8 mask2;
2980  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2981  v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2982  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2983  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
2984  v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
2985  v8i16 hz_out87_l, filt0, filt1, filt2;
2986  v4i32 tmp0, tmp1;
2987 
2988  filt0 = (v8i16) __msa_fill_w(filt_const0);
2989  filt1 = (v8i16) __msa_fill_w(filt_const1);
2990  filt2 = (v8i16) __msa_fill_w(filt_const2);
2991 
2992  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
2993 
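 /* A 16-wide block is handled as two independent 8-wide column passes
  * (multiple8_cnt = 2); src_tmp/dst_tmp advance by 8 pixels between them,
  * and each inner-loop iteration below produces four output rows. */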
2994  for (multiple8_cnt = 2; multiple8_cnt--;) {
2995  dst = dst_tmp;
2996  src = src_tmp;
2997 
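 /* Load the first five rows and flip them into the signed domain
  * (XOR with 128) so signed dot products can be used; the matching
  * PCKEV_XORI128_UB at the store flips the result back to unsigned. */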
2998  LD_SB5(src, stride, src0, src1, src2, src3, src4);
2999  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3000  src += (5 * stride);
3001 
3002  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
3003  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
3004  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
3005  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
3006  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
3007 
3008  for (loop_cnt = 4; loop_cnt--;) {
3009  LD_SB4(src, stride, src5, src6, src7, src8);
3010  src += (4 * stride);
3011 
3012  XORI_B4_128_SB(src5, src6, src7, src8);
3013 
3014  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
3015  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
3016  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
3017  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
3018 
3019  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
3020  hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
3021  hz_out43_r);
3022  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
3023  hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
3024  hz_out43_l);
3025  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
3026  hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
3027  hz_out87_r);
3028  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
3029  hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
3030  hz_out87_l);
3031 
3032  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
3033  filt1, filt2);
3034  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
3035  filt1, filt2);
3036  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3037  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
3038  filt1, filt2);
3039  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
3040  filt1, filt2);
3041  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3042  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
3043  filt1, filt2);
3044  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
3045  filt1, filt2);
3046  dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3047  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
3048  filt1, filt2);
3049  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
3050  filt1, filt2);
3051  dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3052 
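 /* This routine differs from the preceding one only in which horizontally
  * filtered row is rounded and averaged in (hz_out3..hz_out6 here instead
  * of hz_out2..hz_out5), selecting the other vertical quarter-sample
  * offset. */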
3053  dst1 = __msa_srari_h(hz_out3, 5);
3054  dst3 = __msa_srari_h(hz_out4, 5);
3055  dst5 = __msa_srari_h(hz_out5, 5);
3056  dst7 = __msa_srari_h(hz_out6, 5);
3057  SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
3058 
3059  dst0 = __msa_aver_s_h(dst0, dst1);
3060  dst1 = __msa_aver_s_h(dst2, dst3);
3061  dst2 = __msa_aver_s_h(dst4, dst5);
3062  dst3 = __msa_aver_s_h(dst6, dst7);
3063 
3064  out0 = PCKEV_XORI128_UB(dst0, dst1);
3065  out1 = PCKEV_XORI128_UB(dst2, dst3);
3066  ST8x4_UB(out0, out1, dst, stride);
3067  dst += (4 * stride);
3068 
3069  hz_out0 = hz_out4;
3070  hz_out1 = hz_out5;
3071  hz_out2 = hz_out6;
3072  hz_out3 = hz_out7;
3073  hz_out4 = hz_out8;
3074  }
3075 
3076  src_tmp += 8;
3077  dst_tmp += 8;
3078  }
3079 }
3080 
3082  ptrdiff_t stride)
3083 {
3084  const int32_t filt_const0 = 0xfffb0001;
3085  const int32_t filt_const1 = 0x140014;
3086  const int32_t filt_const2 = 0x1fffb;
3087  v16u8 out0, out1;
3088  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3089  v16i8 src11, src12, mask0, mask1, mask2;
3090  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
3091  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
3092  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
3093  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
3094  v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
3095  v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
3096  v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
3097  v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
3098  v4i32 tmp0, tmp1;
3099 
3100  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3101 
3102  filt0 = (v8i16) __msa_fill_w(filt_const0);
3103  filt1 = (v8i16) __msa_fill_w(filt_const1);
3104  filt2 = (v8i16) __msa_fill_w(filt_const2);
3105 
3106  src -= ((2 * stride) + 2);
3107 
3108  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3109  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3110  src += (5 * stride);
3111 
3112  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
3113  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
3114  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
3115  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
3116  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
3117 
3118  LD_SB4(src, stride, src5, src6, src7, src8);
3119  src += (4 * stride);
3120  XORI_B4_128_SB(src5, src6, src7, src8);
3121 
3122  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
3123  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
3124  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
3125  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
3126 
3127  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3128  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
3129  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3130  hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
3131  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3132  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
3133  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3134  hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
3135 
3136  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
3137  filt2);
3138  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
3139  filt2);
3140  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3141  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
3142  filt2);
3143  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
3144  filt2);
3145  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3146  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
3147  filt2);
3148  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
3149  filt2);
3150  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3151  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
3152  filt2);
3153  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
3154  filt2);
3155  dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3156 
3157  SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
3158  SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);
3159 
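 /* Quarter-sample output: rounded average ((a + b + 1) >> 1) of the 2-D
  * filtered value -- its (+512) >> 10 rounding is applied inside the
  * AVC_DOT_SW3_SW macro -- with the (+16) >> 5 rounded horizontal
  * half-sample of the matching row, both saturated in 16-bit form. */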
3160  dst0 = __msa_aver_s_h(dst0, hz_out2);
3161  dst1 = __msa_aver_s_h(dst1, hz_out3);
3162  dst2 = __msa_aver_s_h(dst2, hz_out4);
3163  dst3 = __msa_aver_s_h(dst3, hz_out5);
3164 
3165  out0 = PCKEV_XORI128_UB(dst0, dst1);
3166  out1 = PCKEV_XORI128_UB(dst2, dst3);
3167  ST8x4_UB(out0, out1, dst, stride);
3168  dst += (4 * stride);
3169 
3170  LD_SB4(src, stride, src9, src10, src11, src12);
3171  XORI_B4_128_SB(src9, src10, src11, src12);
3172  hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
3173  hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
3174  hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
3175  hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
3176  ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
3177  hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
3178  hz_out1211_r);
3179  ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
3180  hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
3181  hz_out1211_l);
3182  tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
3183  filt2);
3184  tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
3185  filt2);
3186  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3187  tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
3188  filt2);
3189  tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
3190  filt2);
3191  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3192  tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
3193  filt2);
3194  tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
3195  filt2);
3196  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3197  tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
3198  filt2);
3199  tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
3200  filt2);
3201  dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3202 
3203  SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
3204  SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);
3205 
3206  dst0 = __msa_aver_s_h(dst0, hz_out6);
3207  dst1 = __msa_aver_s_h(dst1, hz_out7);
3208  dst2 = __msa_aver_s_h(dst2, hz_out8);
3209  dst3 = __msa_aver_s_h(dst3, hz_out9);
3210 
3211  out0 = PCKEV_XORI128_UB(dst0, dst1);
3212  out1 = PCKEV_XORI128_UB(dst2, dst3);
3213  ST8x4_UB(out0, out1, dst, stride);
3214 }
3215 
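 /*
  * Illustrative scalar reference (not part of the original file): the MSA
  * routines above vectorize the H.264 6-tap luma interpolation kernel
  * (1, -5, 20, 20, -5, 1).  The hypothetical helpers below show, for a
  * single pixel, the arithmetic they implement: a horizontal pass kept in
  * wider intermediates, a vertical pass over those intermediates with
  * (+512) >> 10 rounding for the centre (half, half) sample, and
  * (+16) >> 5 rounding for the one-dimensional half samples that the
  * quarter-sample positions are averaged with.
  */
 #include <stddef.h>
 #include <stdint.h>

 static inline int avc_6tap_ref(int m2, int m1, int z0, int p1, int p2, int p3)
 {
     /* taps 1, -5, 20, 20, -5, 1 */
     return m2 - 5 * m1 + 20 * z0 + 20 * p1 - 5 * p2 + p3;
 }

 static inline uint8_t clip8_ref(int v)
 {
     return v < 0 ? 0 : (v > 255 ? 255 : v);
 }

 /* (half, half) centre sample: horizontal pass, then vertical pass, >> 10 */
 static uint8_t avc_mid_pixel_ref(const uint8_t *src, ptrdiff_t stride)
 {
     int32_t hz[6];
     int i;

     for (i = 0; i < 6; i++) {
         const uint8_t *row = src + (i - 2) * stride;
         hz[i] = avc_6tap_ref(row[-2], row[-1], row[0], row[1], row[2], row[3]);
     }
     return clip8_ref((avc_6tap_ref(hz[0], hz[1], hz[2], hz[3], hz[4], hz[5])
                       + 512) >> 10);
 }

 /* Horizontal half sample of one row, >> 5; the quarter positions are the
  * rounded average of this value and the centre sample above. */
 static uint8_t avc_horiz_half_pixel_ref(const uint8_t *row)
 {
     return clip8_ref((avc_6tap_ref(row[-2], row[-1], row[0],
                                    row[1], row[2], row[3]) + 16) >> 5);
 }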
3217  ptrdiff_t stride)
3218 {
3219  const int32_t filt_const0 = 0xfffb0001;
3220  const int32_t filt_const1 = 0x140014;
3221  const int32_t filt_const2 = 0x1fffb;
3222  v16u8 out0, out1;
3223  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3224  v16i8 src11, src12, mask0, mask1, mask2;
3225  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
3226  v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
3227  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
3228  v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
3229  v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
3230  v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
3231  v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
3232  v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
3233  v4i32 tmp0, tmp1;
3234 
3235  LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3236 
3237  filt0 = (v8i16) __msa_fill_w(filt_const0);
3238  filt1 = (v8i16) __msa_fill_w(filt_const1);
3239  filt2 = (v8i16) __msa_fill_w(filt_const2);
3240 
3241  src -= ((2 * stride) + 2);
3242 
3243  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3244  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3245  src += (5 * stride);
3246 
3247  hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
3248  hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
3249  hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
3250  hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
3251  hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
3252 
3253  LD_SB4(src, stride, src5, src6, src7, src8);
3254  src += (4 * stride);
3255  XORI_B4_128_SB(src5, src6, src7, src8);
3256 
3257  hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
3258  hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
3259  hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
3260  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
3261 
3262  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3263  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
3264  ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3265  hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
3266  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3267  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
3268  ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3269  hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
3270 
3271  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
3272  filt2);
3273  tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
3274  filt2);
3275  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3276  tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
3277  filt2);
3278  tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
3279  filt2);
3280  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3281  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
3282  filt2);
3283  tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
3284  filt2);
3285  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3286  tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
3287  filt2);
3288  tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
3289  filt2);
3290  dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3291 
3292  SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
3293  SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);
3294 
3295  dst0 = __msa_aver_s_h(dst0, hz_out3);
3296  dst1 = __msa_aver_s_h(dst1, hz_out4);
3297  dst2 = __msa_aver_s_h(dst2, hz_out5);
3298  dst3 = __msa_aver_s_h(dst3, hz_out6);
3299 
3300  out0 = PCKEV_XORI128_UB(dst0, dst1);
3301  out1 = PCKEV_XORI128_UB(dst2, dst3);
3302  ST8x4_UB(out0, out1, dst, stride);
3303  dst += (4 * stride);
3304 
3305  LD_SB4(src, stride, src9, src10, src11, src12);
3306  XORI_B4_128_SB(src9, src10, src11, src12);
3307  hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
3308  hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
3309  hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
3310  hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
3311  ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
3312  hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
3313  hz_out1211_r);
3314  ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
3315  hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
3316  hz_out1211_l);
3317  tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
3318  filt2);
3319  tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
3320  filt2);
3321  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3322  tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
3323  filt2);
3324  tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
3325  filt2);
3326  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3327  tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
3328  filt2);
3329  tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
3330  filt2);
3331  dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3332  tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
3333  filt2);
3334  tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
3335  filt2);
3336  dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3337 
3338  SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
3339  SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);
3340 
3341  dst0 = __msa_aver_s_h(dst0, hz_out7);
3342  dst1 = __msa_aver_s_h(dst1, hz_out8);
3343  dst2 = __msa_aver_s_h(dst2, hz_out9);
3344  dst3 = __msa_aver_s_h(dst3, hz_out10);
3345 
3346  out0 = PCKEV_XORI128_UB(dst0, dst1);
3347  out1 = PCKEV_XORI128_UB(dst2, dst3);
3348  ST8x4_UB(out0, out1, dst, stride);
3349 }
3350 
3352  ptrdiff_t stride)
3353 {
3354  const int32_t filt_const0 = 0xfffb0001;
3355  const int32_t filt_const1 = 0x140014;
3356  const int32_t filt_const2 = 0x1fffb;
3357  v16u8 res;
3358  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3359  v16i8 mask0, mask1, mask2;
3360  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
3361  v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
3362  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
3363  v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
3364  v4i32 tmp0, tmp1;
3365 
3366  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
3367 
3368  filt0 = (v8i16) __msa_fill_w(filt_const0);
3369  filt1 = (v8i16) __msa_fill_w(filt_const1);
3370  filt2 = (v8i16) __msa_fill_w(filt_const2);
3371 
3372  src -= ((2 * stride) + 2);
3373 
3374  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3375  src += (5 * stride);
3376  LD_SB4(src, stride, src5, src6, src7, src8);
3377 
3378  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3379  XORI_B4_128_SB(src5, src6, src7, src8);
3380 
3381  hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
3382  hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
3383  hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
3384  hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
3385  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
3386  PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
3387  PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
3388 
3389  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3390  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
3391  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3392  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
3393 
3394  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
3395  filt2);
3396  tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
3397  filt2);
3398  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3399  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
3400  filt2);
3401  tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
3402  filt2);
3403  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3404 
3405  SRARI_H2_SH(hz_out2, hz_out4, 5);
3406  SAT_SH2_SH(hz_out2, hz_out4, 7);
3407 
3408  dst0 = __msa_aver_s_h(dst0, hz_out2);
3409  dst1 = __msa_aver_s_h(dst1, hz_out4);
3410 
3411  res = PCKEV_XORI128_UB(dst0, dst1);
3412  ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
3413 }
3414 
3416  ptrdiff_t stride)
3417 {
3418  const int32_t filt_const0 = 0xfffb0001;
3419  const int32_t filt_const1 = 0x140014;
3420  const int32_t filt_const2 = 0x1fffb;
3421  v16u8 res;
3422  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3423  v16i8 mask0, mask1, mask2;
3424  v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
3425  v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
3426  v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
3427  v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
3428  v4i32 tmp0, tmp1;
3429 
3430  LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
3431 
3432  filt0 = (v8i16) __msa_fill_w(filt_const0);
3433  filt1 = (v8i16) __msa_fill_w(filt_const1);
3434  filt2 = (v8i16) __msa_fill_w(filt_const2);
3435 
3436  src -= ((2 * stride) + 2);
3437 
3438  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3439  src += (5 * stride);
3440  LD_SB4(src, stride, src5, src6, src7, src8);
3441 
3442  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3443  XORI_B4_128_SB(src5, src6, src7, src8);
3444 
3445  hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
3446  hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
3447  hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
3448  hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
3449  hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
3450  PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
3451  PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
3452 
3453  ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3454  hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
3455  ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3456  hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
3457 
3458  tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
3459  filt2);
3460  tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
3461  filt2);
3462  dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3463  tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
3464  filt2);
3465  tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
3466  filt2);
3467  dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3468 
3469  PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
3470  SRARI_H2_SH(hz_out0, hz_out1, 5);
3471  SAT_SH2_SH(hz_out0, hz_out1, 7);
3472 
3473  dst0 = __msa_aver_s_h(dst0, hz_out0);
3474  dst1 = __msa_aver_s_h(dst1, hz_out1);
3475 
3476  res = PCKEV_XORI128_UB(dst0, dst1);
3477  ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
3478 }
3479 
3481  ptrdiff_t stride)
3482 {
3483  int32_t loop_cnt;
3484  int16_t filt_const0 = 0xfb01;
3485  int16_t filt_const1 = 0x1414;
3486  int16_t filt_const2 = 0x1fb;
3487  v16u8 res0, res1, res2, res3;
3488  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3489  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3490  v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
3491  v16i8 src65_l, src87_l, filt0, filt1, filt2;
3492  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3493 
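 /* For the purely vertical filter the same (1, -5, 20, 20, -5, 1) taps are
  * packed as byte pairs (0xfb01, 0x1414, 0x01fb) and applied directly to
  * the interleaved 8-bit source rows with signed byte dot products. */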
3494  filt0 = (v16i8) __msa_fill_h(filt_const0);
3495  filt1 = (v16i8) __msa_fill_h(filt_const1);
3496  filt2 = (v16i8) __msa_fill_h(filt_const2);
3497  src -= (stride * 2);
3498 
3499  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3500  src += (5 * stride);
3501 
3502  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3503  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3504  src32_r, src43_r);
3505  ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
3506  src32_l, src43_l);
3507 
3508  for (loop_cnt = 4; loop_cnt--;) {
3509  LD_SB4(src, stride, src5, src6, src7, src8);
3510  src += (4 * stride);
3511 
3512  XORI_B4_128_SB(src5, src6, src7, src8);
3513  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
3514  src65_r, src76_r, src87_r);
3515  ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
3516  src65_l, src76_l, src87_l);
3517  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
3518  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
3519  out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
3520  out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
3521  out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
3522  out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
3523  out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
3524  out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
3525  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3526  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3527  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
3528  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
3529  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
3530  out3_r, res0, res1, res2, res3);
3531  XORI_B4_128_UB(res0, res1, res2, res3);
3532  ST_UB4(res0, res1, res2, res3, dst, stride);
3533  dst += (4 * stride);
3534 
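 /* Shift the interleaved row pairs down by four rows for the next pass. */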
3535  src10_r = src54_r;
3536  src32_r = src76_r;
3537  src21_r = src65_r;
3538  src43_r = src87_r;
3539  src10_l = src54_l;
3540  src32_l = src76_l;
3541  src21_l = src65_l;
3542  src43_l = src87_l;
3543  src4 = src8;
3544  }
3545 }
3546 
3548  ptrdiff_t stride)
3549 {
3550  const int16_t filt_const0 = 0xfb01;
3551  const int16_t filt_const1 = 0x1414;
3552  const int16_t filt_const2 = 0x1fb;
3553  v16u8 out0, out1, out2, out3;
3554  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3555  v16i8 src11, src12, src10_r, src21_r, src32_r, src43_r, src76_r, src87_r;
3556  v16i8 src98_r, src109_r, src89_r, src910_r, src1110_r, src1211_r;
3557  v16i8 filt0, filt1, filt2;
3558  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
3559 
3560  filt0 = (v16i8) __msa_fill_h(filt_const0);
3561  filt1 = (v16i8) __msa_fill_h(filt_const1);
3562  filt2 = (v16i8) __msa_fill_h(filt_const2);
3563 
3564  src -= (stride * 2);
3565 
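 /* 8x8 case: all 13 source rows are loaded up front and the eight output
  * rows are produced in a single pass, without a row loop. */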
3566  LD_SB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
3567  src += (8 * stride);
3568  LD_SB5(src, stride, src8, src9, src10, src11, src12);
3569  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3570  src32_r, src43_r);
3571  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src76_r, src87_r,
3572  src98_r, src109_r);
3573  ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src89_r,
3574  src910_r, src1110_r, src1211_r);
3575  XORI_B4_128_SB(src10_r, src21_r, src32_r, src43_r);
3576  XORI_B4_128_SB(src76_r, src87_r, src98_r, src109_r);
3577  XORI_B4_128_SB(src89_r, src910_r, src1110_r, src1211_r);
3578  out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
3579  out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
3580  out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
3581  out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
3582  out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src89_r, filt0, filt1, filt2);
3583  out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src910_r, filt0, filt1, filt2);
3584  out6_r = AVC_DOT_SH3_SH(src98_r, src89_r, src1110_r, filt0, filt1, filt2);
3585  out7_r = AVC_DOT_SH3_SH(src109_r, src910_r, src1211_r, filt0, filt1, filt2);
3586  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3587  SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
3588  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3589  SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
3590  out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3591  out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3592  out2 = PCKEV_XORI128_UB(out4_r, out5_r);
3593  out3 = PCKEV_XORI128_UB(out6_r, out7_r);
3594  ST8x8_UB(out0, out1, out2, out3, dst, stride);
3595 }
3596 
3598  ptrdiff_t stride)
3599 {
3600  const int16_t filt_const0 = 0xfb01;
3601  const int16_t filt_const1 = 0x1414;
3602  const int16_t filt_const2 = 0x1fb;
3603  v16u8 out;
3604  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3605  v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3606  v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
3607  v8i16 out10, out32;
3608 
3609  filt0 = (v16i8) __msa_fill_h(filt_const0);
3610  filt1 = (v16i8) __msa_fill_h(filt_const1);
3611  filt2 = (v16i8) __msa_fill_h(filt_const2);
3612 
3613  src -= (stride * 2);
3614 
3615  LD_SB5(src, stride, src0, src1, src2, src3, src4);
3616  src += (5 * stride);
3617  LD_SB4(src, stride, src5, src6, src7, src8);
3618 
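 /* 4-wide case: ILVR_D4_SB packs two interleaved row pairs per vector so
  * each dot product below yields two output rows at once. */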
3619  ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3620  src32_r, src43_r);
3621  ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3622  src76_r, src87_r);
3623  ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
3624  src76_r, src2110, src4332, src6554, src8776);
3625  XORI_B4_128_SB(src2110, src4332, src6554, src8776);
3626  out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
3627  out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
3628  SRARI_H2_SH(out10, out32, 5);
3629  SAT_SH2_SH(out10, out32, 7);
3630  out = PCKEV_XORI128_UB(out10, out32);
3631  ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
3632 }
3633 
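 /* The entry points below only offset src for the 6-tap filter's support
  * region (2 columns and/or 2 rows before the block) and forward to the
  * static helpers above. */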
3635  ptrdiff_t stride)
3636 {
3637  avc_luma_midh_qrt_16w_msa(src - (2 * stride) - 2,
3638  stride, dst, stride, 16, 0);
3639 }
3640 
3642  ptrdiff_t stride)
3643 {
3644  avc_luma_midh_qrt_16w_msa(src - (2 * stride) - 2,
3645  stride, dst, stride, 16, 1);
3646 }
3647 
3649  ptrdiff_t stride)
3650 {
3651  avc_luma_midh_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 0);
3652 }
3653 
3655  ptrdiff_t stride)
3656 {
3657  avc_luma_midh_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 1);
3658 }
3659 
3661  ptrdiff_t stride)
3662 {
3663  avc_luma_midh_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 0);
3664 }
3665 
3667  ptrdiff_t stride)
3668 {
3669  avc_luma_midh_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 1);
3670 }
3671 
3673  ptrdiff_t stride)
3674 {
3675  avc_luma_mid_16w_msa(src - (2 * stride) - 2, stride, dst, stride, 16);
3676 }
3677 
3679  ptrdiff_t stride)
3680 {
3681  avc_luma_mid_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8);
3682 }
3683 
3685  ptrdiff_t stride)
3686 {
3687  avc_luma_mid_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4);
3688 }
3689 
3691  ptrdiff_t stride)
3692 {
3693  avc_luma_hz_qrt_and_aver_dst_16x16_msa(src - 2, stride, dst, stride, 0);
3694 }
3695 
3697  ptrdiff_t stride)
3698 {
3699  avc_luma_hz_qrt_and_aver_dst_16x16_msa(src - 2, stride, dst, stride, 1);
3700 }
3701 
3703  ptrdiff_t stride)
3704 {
3705  avc_luma_hz_qrt_and_aver_dst_8x8_msa(src - 2, stride, dst, stride, 0);
3706 }
3707 
3709  ptrdiff_t stride)
3710 {
3711  avc_luma_hz_qrt_and_aver_dst_8x8_msa(src - 2, stride, dst, stride, 1);
3712 }
3713 
3715  ptrdiff_t stride)
3716 {
3717  avc_luma_hz_qrt_and_aver_dst_4x4_msa(src - 2, stride, dst, stride, 0);
3718 }
3719 
3721  ptrdiff_t stride)
3722 {
3723  avc_luma_hz_qrt_and_aver_dst_4x4_msa(src - 2, stride, dst, stride, 1);
3724 }
3725 
3727  ptrdiff_t stride)
3728 {
3729  avc_luma_hz_and_aver_dst_16x16_msa(src - 2, stride, dst, stride);
3730 }
3731 
3733  ptrdiff_t stride)
3734 {
3735  avc_luma_hz_and_aver_dst_8x8_msa(src - 2, stride, dst, stride);
3736 }
3737 
3739  ptrdiff_t stride)
3740 {
3741  avc_luma_hz_and_aver_dst_4x4_msa(src - 2, stride, dst, stride);
3742 }
3743 
3745  ptrdiff_t stride)
3746 {
3747  avc_luma_vt_qrt_and_aver_dst_16x16_msa(src - (stride * 2),
3748  stride, dst, stride, 0);
3749 }
3750 
3752  ptrdiff_t stride)
3753 {
3754  avc_luma_vt_qrt_and_aver_dst_16x16_msa(src - (stride * 2),
3755  stride, dst, stride, 1);
3756 }
3757 
3759  ptrdiff_t stride)
3760 {
3761  avc_luma_vt_qrt_and_aver_dst_8x8_msa(src - (stride * 2),
3762  stride, dst, stride, 0);
3763 }
3764 
3766  ptrdiff_t stride)
3767 {
3768  avc_luma_vt_qrt_and_aver_dst_8x8_msa(src - (stride * 2),
3769  stride, dst, stride, 1);
3770 }
3771 
3773  ptrdiff_t stride)
3774 {
3775  avc_luma_vt_qrt_and_aver_dst_4x4_msa(src - (stride * 2),
3776  stride, dst, stride, 0);
3777 }
3778 
3780  ptrdiff_t stride)
3781 {
3782  avc_luma_vt_qrt_and_aver_dst_4x4_msa(src - (stride * 2),
3783  stride, dst, stride, 1);
3784 }
3785 
3787  ptrdiff_t stride)
3788 {
3790  src - (stride * 2),
3791  stride, dst, stride);
3792 }
3793 
3795  ptrdiff_t stride)
3796 {
3798  src - (stride * 2) +
3799  sizeof(uint8_t), stride,
3800  dst, stride);
3801 }
3802 
3804  ptrdiff_t stride)
3805 {
3806  avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
3807  src - (stride * 2),
3808  stride, dst, stride);
3809 }
3810 
3812  ptrdiff_t stride)
3813 {
3814  avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
3815  src - (stride * 2) +
3816  sizeof(uint8_t), stride,
3817  dst, stride);
3818 }
3819 
3821  ptrdiff_t stride)
3822 {
3824  src - (stride * 2),
3825  stride, dst, stride);
3826 }
3827 
3829  ptrdiff_t stride)
3830 {
3832  src - (stride * 2) +
3833  sizeof(uint8_t), stride, dst, stride);
3834 }
3835 
3837  ptrdiff_t stride)
3838 {
3839  avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
3840  src - (stride * 2),
3841  stride, dst, stride);
3842 }
3843 
3845  ptrdiff_t stride)
3846 {
3847  avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
3848  src - (stride * 2) +
3849  sizeof(uint8_t), stride, dst, stride);
3850 }
3851 
3852 
3854  ptrdiff_t stride)
3855 {
3857  src - (stride * 2),
3858  stride, dst, stride);
3859 }
3860 
3862  ptrdiff_t stride)
3863 {
3865  src - (stride * 2) +
3866  sizeof(uint8_t), stride, dst, stride);
3867 }
3868 
3870  ptrdiff_t stride)
3871 {
3872  avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
3873  src - (stride * 2),
3874  stride, dst, stride);
3875 }
3876 
3878  ptrdiff_t stride)
3879 {
3880  avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
3881  src - (stride * 2) +
3882  sizeof(uint8_t), stride, dst, stride);
3883 }
3884 
3886  ptrdiff_t stride)
3887 {
3888  avc_luma_midv_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
3889  stride, dst, stride, 16, 0);
3890 }
3891 
3893  ptrdiff_t stride)
3894 {
3895  avc_luma_midv_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
3896  stride, dst, stride, 16, 1);
3897 }
3898 
3900  ptrdiff_t stride)
3901 {
3902  avc_luma_midv_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
3903  stride, dst, stride, 8, 0);
3904 }
3905 
3907  ptrdiff_t stride)
3908 {
3909  avc_luma_midv_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
3910  stride, dst, stride, 8, 1);
3911 }
3912 
3914  ptrdiff_t stride)
3915 {
3916  avc_luma_midv_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
3917  stride, dst, stride, 4, 0);
3918 }
3919 
3921  ptrdiff_t stride)
3922 {
3923  avc_luma_midv_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
3924  stride, dst, stride, 4, 1);
3925 }
3926 
3928  ptrdiff_t stride)
3929 {
3930  avc_luma_vt_and_aver_dst_16x16_msa(src - (stride * 2), stride, dst, stride);
3931 }
3932 
3934  ptrdiff_t stride)
3935 {
3936  avc_luma_vt_and_aver_dst_8x8_msa(src - (stride * 2), stride, dst, stride);
3937 }
3938 
3940  ptrdiff_t stride)
3941 {
3942  avc_luma_vt_and_aver_dst_4x4_msa(src - (stride * 2), stride, dst, stride);
3943 }
3944 
3946  ptrdiff_t stride)
3947 {
3948  avc_luma_midh_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
3949  stride, dst, stride, 16, 0);
3950 }
3951 
3953  ptrdiff_t stride)
3954 {
3955  avc_luma_midh_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
3956  stride, dst, stride, 16, 1);
3957 }
3958 
3960  ptrdiff_t stride)
3961 {
3962  avc_luma_midh_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
3963  stride, dst, stride, 8, 0);
3964 }
3965 
3967  ptrdiff_t stride)
3968 {
3969  avc_luma_midh_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
3970  stride, dst, stride, 8, 1);
3971 }
3972 
3974  ptrdiff_t stride)
3975 {
3976  avc_luma_midh_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
3977  stride, dst, stride, 4, 0);
3978 }
3979 
3981  ptrdiff_t stride)
3982 {
3983  avc_luma_midh_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
3984  stride, dst, stride, 4, 1);
3985 }
3986 
3988  ptrdiff_t stride)
3989 {
3990  avc_luma_mid_and_aver_dst_16x16_msa(src - (2 * stride) - 2,
3991  stride, dst, stride);
3992 }
3993 
3995  ptrdiff_t stride)
3996 {
3997  avc_luma_mid_and_aver_dst_8w_msa(src - (2 * stride) - 2,
3998  stride, dst, stride, 8);
3999 }
4000 
4002  ptrdiff_t stride)
4003 {
4004  avc_luma_mid_and_aver_dst_4x4_msa(src - (2 * stride) - 2,
4005  stride, dst, stride);
4006 }