h264qpel_lasx.c
1 /*
2  * Loongson LASX optimized h264qpel
3  *
4  * Copyright (c) 2020 Loongson Technology Corporation Limited
5  * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "h264qpel_loongarch.h"
25 #include "libavutil/loongarch/loongson_intrinsics.h"
26 #include "libavutil/attributes.h"
27 
28 static const uint8_t luma_mask_arr[16 * 6] __attribute__((aligned(0x40))) = {
29  /* 8 width cases */
30  0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
31  0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
32  1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
33  1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
34  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
35  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
36 };
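/* Note on the masks above: luma_mask_arr holds xvshuf_b index pairs that
 * gather, for every output pixel of a row, the sample pairs (a,f), (b,e) and
 * (c,d) of the H.264 six-tap luma filter.  Roughly, the macros below evaluate
 *     h = (a + f) - 5 * (b + e) + 20 * (c + d)
 * with one widening pair-add and two byte dot-products; the caller then
 * narrows with a saturating rounding shift, clip8((h + 16) >> 5).
 */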
37 
38 #define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2) \
39 ( { \
40  __m256i out0_m; \
41  __m256i tmp0_m; \
42  \
43  tmp0_m = __lasx_xvshuf_b(in1, in0, mask0); \
44  out0_m = __lasx_xvhaddw_h_b(tmp0_m, tmp0_m); \
45  tmp0_m = __lasx_xvshuf_b(in1, in0, mask1); \
46  out0_m = __lasx_xvdp2add_h_b(out0_m, minus5b, tmp0_m); \
47  tmp0_m = __lasx_xvshuf_b(in1, in0, mask2); \
48  out0_m = __lasx_xvdp2add_h_b(out0_m, plus20b, tmp0_m); \
49  \
50  out0_m; \
51 } )
52 
53 #define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
54 ( { \
55  __m256i out0_m; \
56  \
57  out0_m = __lasx_xvdp2_h_b(in0, coeff0); \
58  DUP2_ARG3(__lasx_xvdp2add_h_b, out0_m, in1, coeff1, out0_m,\
59  in2, coeff2, out0_m, out0_m); \
60  \
61  out0_m; \
62 } )
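/* AVC_DOT_SH3_SH applies the same six taps along columns.  The 16-bit
 * constants replicated into filt0..filt2 in the functions below pack the taps
 * as signed byte pairs (0xfb01 = {+1, -5}, 0x1414 = {+20, +20},
 * 0x01fb = {-5, +1}), so the three dot-products over byte-interleaved rows
 * accumulate, per column,
 *     v = r0 - 5 * r1 + 20 * r2 + 20 * r3 - 5 * r4 + r5
 * for six consecutive rows r0..r5.
 */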
63 
64 static av_always_inline
65 void avc_luma_hv_qrt_and_aver_dst_16x16_lasx(uint8_t *src_x,
66  uint8_t *src_y,
67  uint8_t *dst, ptrdiff_t stride)
68 {
69  const int16_t filt_const0 = 0xfb01;
70  const int16_t filt_const1 = 0x1414;
71  const int16_t filt_const2 = 0x1fb;
72  uint32_t loop_cnt;
73  ptrdiff_t stride_2x = stride << 1;
74  ptrdiff_t stride_3x = stride_2x + stride;
75  ptrdiff_t stride_4x = stride << 2;
76  __m256i tmp0, tmp1;
77  __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
78  __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
79  __m256i src_vt7, src_vt8;
80  __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h;
81  __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2;
82  __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
83  __m256i vt_out3, out0, out1, out2, out3;
84  __m256i minus5b = __lasx_xvldi(0xFB);
85  __m256i plus20b = __lasx_xvldi(20);
86 
87  filt0 = __lasx_xvreplgr2vr_h(filt_const0);
88  filt1 = __lasx_xvreplgr2vr_h(filt_const1);
89  filt2 = __lasx_xvreplgr2vr_h(filt_const2);
90 
91  mask0 = __lasx_xvld(luma_mask_arr, 0);
92  DUP2_ARG2(__lasx_xvld, luma_mask_arr, 32, luma_mask_arr, 64, mask1, mask2);
93  src_vt0 = __lasx_xvld(src_y, 0);
94  DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x,
95  src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4);
96  src_y += stride_4x;
97 
98  src_vt0 = __lasx_xvxori_b(src_vt0, 128);
99  DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128,
100  src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4);
101 
102  for (loop_cnt = 4; loop_cnt--;) {
103  src_hz0 = __lasx_xvld(src_x, 0);
104  DUP2_ARG2(__lasx_xvldx, src_x, stride, src_x, stride_2x,
105  src_hz1, src_hz2);
106  src_hz3 = __lasx_xvldx(src_x, stride_3x);
107  src_x += stride_4x;
108  src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94);
109  src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94);
110  src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94);
111  src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94);
112  DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128,
113  src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3);
114 
115  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
116  hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
117  hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
118  hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
119  hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5);
120  hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5);
121 
122  DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x,
123  src_y, stride_3x, src_y, stride_4x,
124  src_vt5, src_vt6, src_vt7, src_vt8);
125  src_y += stride_4x;
126 
127  DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128,
128  src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8);
129 
130  DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5,
131  0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02,
132  src_vt0, src_vt1, src_vt2, src_vt3);
133  src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02);
134  DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1,
135  src_vt3, src_vt2, src_vt87_h, src_vt3,
136  src_hz0, src_hz1, src_hz2, src_hz3);
137  DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1,
138  src_vt3, src_vt2, src_vt87_h, src_vt3,
139  src_vt0, src_vt1, src_vt2, src_vt3);
140  DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1, src_hz1,
141  0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3, 0x02,
142  src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h);
143  DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1, src_hz1,
144  0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3, 0x13,
145  src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h);
146  vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h, filt0,
147  filt1, filt2);
148  vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h, filt0,
149  filt1, filt2);
150  vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h, filt0,
151  filt1, filt2);
152  vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h, filt0,
153  filt1, filt2);
154  vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5);
155  vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5);
156 
157  DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
158  out0, out2);
159  DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
160  out1, out3);
161  tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1);
162  tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1);
163 
164  DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
165  out0 = __lasx_xvld(dst, 0);
166  DUP2_ARG2(__lasx_xvldx, dst, stride, dst, stride_2x, out1, out2);
167  out3 = __lasx_xvldx(dst, stride_3x);
168  out0 = __lasx_xvpermi_q(out0, out2, 0x02);
169  out1 = __lasx_xvpermi_q(out1, out3, 0x02);
170  out2 = __lasx_xvilvl_d(out1, out0);
171  out3 = __lasx_xvilvh_d(out1, out0);
172  out0 = __lasx_xvpermi_q(out2, out3, 0x02);
173  out1 = __lasx_xvpermi_q(out2, out3, 0x13);
174  tmp0 = __lasx_xvavgr_bu(out0, tmp0);
175  tmp1 = __lasx_xvavgr_bu(out1, tmp1);
176 
177  __lasx_xvstelm_d(tmp0, dst, 0, 0);
178  __lasx_xvstelm_d(tmp0, dst + stride, 0, 1);
179  __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0);
180  __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1);
181 
182  __lasx_xvstelm_d(tmp0, dst, 8, 2);
183  __lasx_xvstelm_d(tmp0, dst + stride, 8, 3);
184  __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2);
185  __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3);
186 
187  dst += stride_4x;
188  src_vt0 = src_vt4;
189  src_vt1 = src_vt5;
190  src_vt2 = src_vt6;
191  src_vt3 = src_vt7;
192  src_vt4 = src_vt8;
193  }
194 }
195 
196 static av_always_inline void
197 avc_luma_hv_qrt_16x16_lasx(uint8_t *src_x, uint8_t *src_y,
198  uint8_t *dst, ptrdiff_t stride)
199 {
200  const int16_t filt_const0 = 0xfb01;
201  const int16_t filt_const1 = 0x1414;
202  const int16_t filt_const2 = 0x1fb;
203  uint32_t loop_cnt;
204  ptrdiff_t stride_2x = stride << 1;
205  ptrdiff_t stride_3x = stride_2x + stride;
206  ptrdiff_t stride_4x = stride << 2;
207  __m256i tmp0, tmp1;
208  __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
209  __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
210  __m256i src_vt7, src_vt8;
211  __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h;
212  __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2;
213  __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
214  __m256i vt_out3, out0, out1, out2, out3;
215  __m256i minus5b = __lasx_xvldi(0xFB);
216  __m256i plus20b = __lasx_xvldi(20);
217 
218  filt0 = __lasx_xvreplgr2vr_h(filt_const0);
219  filt1 = __lasx_xvreplgr2vr_h(filt_const1);
220  filt2 = __lasx_xvreplgr2vr_h(filt_const2);
221 
222  mask0 = __lasx_xvld(luma_mask_arr, 0);
223  DUP2_ARG2(__lasx_xvld, luma_mask_arr, 32, luma_mask_arr, 64, mask1, mask2);
224  src_vt0 = __lasx_xvld(src_y, 0);
225  DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x,
226  src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4);
227  src_y += stride_4x;
228 
229  src_vt0 = __lasx_xvxori_b(src_vt0, 128);
230  DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128,
231  src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4);
232 
233  for (loop_cnt = 4; loop_cnt--;) {
234  src_hz0 = __lasx_xvld(src_x, 0);
235  DUP2_ARG2(__lasx_xvldx, src_x, stride, src_x, stride_2x,
236  src_hz1, src_hz2);
237  src_hz3 = __lasx_xvldx(src_x, stride_3x);
238  src_x += stride_4x;
239  src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94);
240  src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94);
241  src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94);
242  src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94);
243  DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128,
244  src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3);
245 
246  hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
247  hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
248  hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
249  hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
250  hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5);
251  hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5);
252 
253  DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x,
254  src_y, stride_3x, src_y, stride_4x,
255  src_vt5, src_vt6, src_vt7, src_vt8);
256  src_y += stride_4x;
257 
258  DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128,
259  src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8);
260  DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5,
261  0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02,
262  src_vt0, src_vt1, src_vt2, src_vt3);
263  src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02);
264  DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1,
265  src_vt3, src_vt2, src_vt87_h, src_vt3,
266  src_hz0, src_hz1, src_hz2, src_hz3);
267  DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1,
268  src_vt3, src_vt2, src_vt87_h, src_vt3,
269  src_vt0, src_vt1, src_vt2, src_vt3);
270  DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1,
271  src_hz1, 0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3,
272  0x02, src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h);
273  DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1,
274  src_hz1, 0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3,
275  0x13, src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h);
276 
277  vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h,
278  filt0, filt1, filt2);
279  vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h,
280  filt0, filt1, filt2);
281  vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h,
282  filt0, filt1, filt2);
283  vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h,
284  filt0, filt1, filt2);
285  vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5);
286  vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5);
287 
288  DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
289  out0, out2);
290  DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
291  out1, out3);
292  tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1);
293  tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1);
294 
295  DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
296  __lasx_xvstelm_d(tmp0, dst, 0, 0);
297  __lasx_xvstelm_d(tmp0, dst + stride, 0, 1);
298  __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0);
299  __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1);
300 
301  __lasx_xvstelm_d(tmp0, dst, 8, 2);
302  __lasx_xvstelm_d(tmp0, dst + stride, 8, 3);
303  __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2);
304  __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3);
305 
306  dst += stride_4x;
307  src_vt0 = src_vt4;
308  src_vt1 = src_vt5;
309  src_vt2 = src_vt6;
310  src_vt3 = src_vt7;
311  src_vt4 = src_vt8;
312  }
313 }
314 
315 /* put_pixels8_8_inline_asm: dst = src */
316 static av_always_inline void
317 put_pixels8_8_inline_asm(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
318 {
319  uint64_t tmp[8];
320  ptrdiff_t stride_2, stride_3, stride_4;
321  __asm__ volatile (
322  "slli.d %[stride_2], %[stride], 1 \n\t"
323  "add.d %[stride_3], %[stride_2], %[stride] \n\t"
324  "slli.d %[stride_4], %[stride_2], 1 \n\t"
325  "ld.d %[tmp0], %[src], 0x0 \n\t"
326  "ldx.d %[tmp1], %[src], %[stride] \n\t"
327  "ldx.d %[tmp2], %[src], %[stride_2] \n\t"
328  "ldx.d %[tmp3], %[src], %[stride_3] \n\t"
329  "add.d %[src], %[src], %[stride_4] \n\t"
330  "ld.d %[tmp4], %[src], 0x0 \n\t"
331  "ldx.d %[tmp5], %[src], %[stride] \n\t"
332  "ldx.d %[tmp6], %[src], %[stride_2] \n\t"
333  "ldx.d %[tmp7], %[src], %[stride_3] \n\t"
334 
335  "st.d %[tmp0], %[dst], 0x0 \n\t"
336  "stx.d %[tmp1], %[dst], %[stride] \n\t"
337  "stx.d %[tmp2], %[dst], %[stride_2] \n\t"
338  "stx.d %[tmp3], %[dst], %[stride_3] \n\t"
339  "add.d %[dst], %[dst], %[stride_4] \n\t"
340  "st.d %[tmp4], %[dst], 0x0 \n\t"
341  "stx.d %[tmp5], %[dst], %[stride] \n\t"
342  "stx.d %[tmp6], %[dst], %[stride_2] \n\t"
343  "stx.d %[tmp7], %[dst], %[stride_3] \n\t"
344  : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
345  [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]),
346  [tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]),
347  [tmp6]"=&r"(tmp[6]), [tmp7]"=&r"(tmp[7]),
348  [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
349  [stride_4]"=&r"(stride_4),
350  [dst]"+&r"(dst), [src]"+&r"(src)
351  : [stride]"r"(stride)
352  : "memory"
353  );
354 }
355 
356 /* avg_pixels8_8_lsx : dst = avg(src, dst)
357  * put_pixels8_l2_8_lsx: dst = avg(src, half), where the stride of half is 8.
358  * avg_pixels8_l2_8_lsx: dst = avg(avg(src, half), dst), where the stride of half is 8. */
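/* The *_l2_8_lsx helpers are declared in h264qpel_loongarch.h and implemented
 * elsewhere; a plain-C sketch of the intended behaviour (illustration only,
 * not the actual implementation), with w and h being 8 or 16:
 *
 *     for (i = 0; i < h; i++)
 *         for (j = 0; j < w; j++)
 *             dst[i * dstStride + j] =
 *                 (src[i * srcStride + j] + half[i * w + j] + 1) >> 1;
 *
 * The avg_* variants additionally average that result with dst.
 */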
359 static av_always_inline void
360 avg_pixels8_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
361 {
362  uint8_t *tmp = dst;
363  ptrdiff_t stride_2, stride_3, stride_4;
364  __asm__ volatile (
365  /* h0~h7 */
366  "slli.d %[stride_2], %[stride], 1 \n\t"
367  "add.d %[stride_3], %[stride_2], %[stride] \n\t"
368  "slli.d %[stride_4], %[stride_2], 1 \n\t"
369  "vld $vr0, %[src], 0 \n\t"
370  "vldx $vr1, %[src], %[stride] \n\t"
371  "vldx $vr2, %[src], %[stride_2] \n\t"
372  "vldx $vr3, %[src], %[stride_3] \n\t"
373  "add.d %[src], %[src], %[stride_4] \n\t"
374  "vld $vr4, %[src], 0 \n\t"
375  "vldx $vr5, %[src], %[stride] \n\t"
376  "vldx $vr6, %[src], %[stride_2] \n\t"
377  "vldx $vr7, %[src], %[stride_3] \n\t"
378 
379  "vld $vr8, %[tmp], 0 \n\t"
380  "vldx $vr9, %[tmp], %[stride] \n\t"
381  "vldx $vr10, %[tmp], %[stride_2] \n\t"
382  "vldx $vr11, %[tmp], %[stride_3] \n\t"
383  "add.d %[tmp], %[tmp], %[stride_4] \n\t"
384  "vld $vr12, %[tmp], 0 \n\t"
385  "vldx $vr13, %[tmp], %[stride] \n\t"
386  "vldx $vr14, %[tmp], %[stride_2] \n\t"
387  "vldx $vr15, %[tmp], %[stride_3] \n\t"
388 
389  "vavgr.bu $vr0, $vr8, $vr0 \n\t"
390  "vavgr.bu $vr1, $vr9, $vr1 \n\t"
391  "vavgr.bu $vr2, $vr10, $vr2 \n\t"
392  "vavgr.bu $vr3, $vr11, $vr3 \n\t"
393  "vavgr.bu $vr4, $vr12, $vr4 \n\t"
394  "vavgr.bu $vr5, $vr13, $vr5 \n\t"
395  "vavgr.bu $vr6, $vr14, $vr6 \n\t"
396  "vavgr.bu $vr7, $vr15, $vr7 \n\t"
397 
398  "vstelm.d $vr0, %[dst], 0, 0 \n\t"
399  "add.d %[dst], %[dst], %[stride] \n\t"
400  "vstelm.d $vr1, %[dst], 0, 0 \n\t"
401  "add.d %[dst], %[dst], %[stride] \n\t"
402  "vstelm.d $vr2, %[dst], 0, 0 \n\t"
403  "add.d %[dst], %[dst], %[stride] \n\t"
404  "vstelm.d $vr3, %[dst], 0, 0 \n\t"
405  "add.d %[dst], %[dst], %[stride] \n\t"
406  "vstelm.d $vr4, %[dst], 0, 0 \n\t"
407  "add.d %[dst], %[dst], %[stride] \n\t"
408  "vstelm.d $vr5, %[dst], 0, 0 \n\t"
409  "add.d %[dst], %[dst], %[stride] \n\t"
410  "vstelm.d $vr6, %[dst], 0, 0 \n\t"
411  "add.d %[dst], %[dst], %[stride] \n\t"
412  "vstelm.d $vr7, %[dst], 0, 0 \n\t"
413  : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [src]"+&r"(src),
414  [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
415  [stride_4]"=&r"(stride_4)
416  : [stride]"r"(stride)
417  : "memory"
418  );
419 }
420 
421 /* put_pixels16_8_lsx: dst = src */
422 static av_always_inline void
423 put_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
424 {
425  ptrdiff_t stride_2, stride_3, stride_4;
426  __asm__ volatile (
427  "slli.d %[stride_2], %[stride], 1 \n\t"
428  "add.d %[stride_3], %[stride_2], %[stride] \n\t"
429  "slli.d %[stride_4], %[stride_2], 1 \n\t"
430  "vld $vr0, %[src], 0 \n\t"
431  "vldx $vr1, %[src], %[stride] \n\t"
432  "vldx $vr2, %[src], %[stride_2] \n\t"
433  "vldx $vr3, %[src], %[stride_3] \n\t"
434  "add.d %[src], %[src], %[stride_4] \n\t"
435  "vld $vr4, %[src], 0 \n\t"
436  "vldx $vr5, %[src], %[stride] \n\t"
437  "vldx $vr6, %[src], %[stride_2] \n\t"
438  "vldx $vr7, %[src], %[stride_3] \n\t"
439  "add.d %[src], %[src], %[stride_4] \n\t"
440 
441  "vst $vr0, %[dst], 0 \n\t"
442  "vstx $vr1, %[dst], %[stride] \n\t"
443  "vstx $vr2, %[dst], %[stride_2] \n\t"
444  "vstx $vr3, %[dst], %[stride_3] \n\t"
445  "add.d %[dst], %[dst], %[stride_4] \n\t"
446  "vst $vr4, %[dst], 0 \n\t"
447  "vstx $vr5, %[dst], %[stride] \n\t"
448  "vstx $vr6, %[dst], %[stride_2] \n\t"
449  "vstx $vr7, %[dst], %[stride_3] \n\t"
450  "add.d %[dst], %[dst], %[stride_4] \n\t"
451 
452  "vld $vr0, %[src], 0 \n\t"
453  "vldx $vr1, %[src], %[stride] \n\t"
454  "vldx $vr2, %[src], %[stride_2] \n\t"
455  "vldx $vr3, %[src], %[stride_3] \n\t"
456  "add.d %[src], %[src], %[stride_4] \n\t"
457  "vld $vr4, %[src], 0 \n\t"
458  "vldx $vr5, %[src], %[stride] \n\t"
459  "vldx $vr6, %[src], %[stride_2] \n\t"
460  "vldx $vr7, %[src], %[stride_3] \n\t"
461 
462  "vst $vr0, %[dst], 0 \n\t"
463  "vstx $vr1, %[dst], %[stride] \n\t"
464  "vstx $vr2, %[dst], %[stride_2] \n\t"
465  "vstx $vr3, %[dst], %[stride_3] \n\t"
466  "add.d %[dst], %[dst], %[stride_4] \n\t"
467  "vst $vr4, %[dst], 0 \n\t"
468  "vstx $vr5, %[dst], %[stride] \n\t"
469  "vstx $vr6, %[dst], %[stride_2] \n\t"
470  "vstx $vr7, %[dst], %[stride_3] \n\t"
471  : [dst]"+&r"(dst), [src]"+&r"(src),
472  [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
473  [stride_4]"=&r"(stride_4)
474  : [stride]"r"(stride)
475  : "memory"
476  );
477 }
478 
479 /* avg_pixels16_8_lsx : dst = avg(src, dst)
480  * put_pixels16_l2_8_lsx: dst = avg(src, half), where the stride of half is 16.
481  * avg_pixels16_l2_8_lsx: dst = avg(avg(src, half), dst), where the stride of half is 16. */
482 static av_always_inline void
483 avg_pixels16_8_lsx(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
484 {
485  uint8_t *tmp = dst;
486  ptrdiff_t stride_2, stride_3, stride_4;
487  __asm__ volatile (
488  /* h0~h7 */
489  "slli.d %[stride_2], %[stride], 1 \n\t"
490  "add.d %[stride_3], %[stride_2], %[stride] \n\t"
491  "slli.d %[stride_4], %[stride_2], 1 \n\t"
492  "vld $vr0, %[src], 0 \n\t"
493  "vldx $vr1, %[src], %[stride] \n\t"
494  "vldx $vr2, %[src], %[stride_2] \n\t"
495  "vldx $vr3, %[src], %[stride_3] \n\t"
496  "add.d %[src], %[src], %[stride_4] \n\t"
497  "vld $vr4, %[src], 0 \n\t"
498  "vldx $vr5, %[src], %[stride] \n\t"
499  "vldx $vr6, %[src], %[stride_2] \n\t"
500  "vldx $vr7, %[src], %[stride_3] \n\t"
501  "add.d %[src], %[src], %[stride_4] \n\t"
502 
503  "vld $vr8, %[tmp], 0 \n\t"
504  "vldx $vr9, %[tmp], %[stride] \n\t"
505  "vldx $vr10, %[tmp], %[stride_2] \n\t"
506  "vldx $vr11, %[tmp], %[stride_3] \n\t"
507  "add.d %[tmp], %[tmp], %[stride_4] \n\t"
508  "vld $vr12, %[tmp], 0 \n\t"
509  "vldx $vr13, %[tmp], %[stride] \n\t"
510  "vldx $vr14, %[tmp], %[stride_2] \n\t"
511  "vldx $vr15, %[tmp], %[stride_3] \n\t"
512  "add.d %[tmp], %[tmp], %[stride_4] \n\t"
513 
514  "vavgr.bu $vr0, $vr8, $vr0 \n\t"
515  "vavgr.bu $vr1, $vr9, $vr1 \n\t"
516  "vavgr.bu $vr2, $vr10, $vr2 \n\t"
517  "vavgr.bu $vr3, $vr11, $vr3 \n\t"
518  "vavgr.bu $vr4, $vr12, $vr4 \n\t"
519  "vavgr.bu $vr5, $vr13, $vr5 \n\t"
520  "vavgr.bu $vr6, $vr14, $vr6 \n\t"
521  "vavgr.bu $vr7, $vr15, $vr7 \n\t"
522 
523  "vst $vr0, %[dst], 0 \n\t"
524  "vstx $vr1, %[dst], %[stride] \n\t"
525  "vstx $vr2, %[dst], %[stride_2] \n\t"
526  "vstx $vr3, %[dst], %[stride_3] \n\t"
527  "add.d %[dst], %[dst], %[stride_4] \n\t"
528  "vst $vr4, %[dst], 0 \n\t"
529  "vstx $vr5, %[dst], %[stride] \n\t"
530  "vstx $vr6, %[dst], %[stride_2] \n\t"
531  "vstx $vr7, %[dst], %[stride_3] \n\t"
532  "add.d %[dst], %[dst], %[stride_4] \n\t"
533 
534  /* h8~h15 */
535  "vld $vr0, %[src], 0 \n\t"
536  "vldx $vr1, %[src], %[stride] \n\t"
537  "vldx $vr2, %[src], %[stride_2] \n\t"
538  "vldx $vr3, %[src], %[stride_3] \n\t"
539  "add.d %[src], %[src], %[stride_4] \n\t"
540  "vld $vr4, %[src], 0 \n\t"
541  "vldx $vr5, %[src], %[stride] \n\t"
542  "vldx $vr6, %[src], %[stride_2] \n\t"
543  "vldx $vr7, %[src], %[stride_3] \n\t"
544 
545  "vld $vr8, %[tmp], 0 \n\t"
546  "vldx $vr9, %[tmp], %[stride] \n\t"
547  "vldx $vr10, %[tmp], %[stride_2] \n\t"
548  "vldx $vr11, %[tmp], %[stride_3] \n\t"
549  "add.d %[tmp], %[tmp], %[stride_4] \n\t"
550  "vld $vr12, %[tmp], 0 \n\t"
551  "vldx $vr13, %[tmp], %[stride] \n\t"
552  "vldx $vr14, %[tmp], %[stride_2] \n\t"
553  "vldx $vr15, %[tmp], %[stride_3] \n\t"
554 
555  "vavgr.bu $vr0, $vr8, $vr0 \n\t"
556  "vavgr.bu $vr1, $vr9, $vr1 \n\t"
557  "vavgr.bu $vr2, $vr10, $vr2 \n\t"
558  "vavgr.bu $vr3, $vr11, $vr3 \n\t"
559  "vavgr.bu $vr4, $vr12, $vr4 \n\t"
560  "vavgr.bu $vr5, $vr13, $vr5 \n\t"
561  "vavgr.bu $vr6, $vr14, $vr6 \n\t"
562  "vavgr.bu $vr7, $vr15, $vr7 \n\t"
563 
564  "vst $vr0, %[dst], 0 \n\t"
565  "vstx $vr1, %[dst], %[stride] \n\t"
566  "vstx $vr2, %[dst], %[stride_2] \n\t"
567  "vstx $vr3, %[dst], %[stride_3] \n\t"
568  "add.d %[dst], %[dst], %[stride_4] \n\t"
569  "vst $vr4, %[dst], 0 \n\t"
570  "vstx $vr5, %[dst], %[stride] \n\t"
571  "vstx $vr6, %[dst], %[stride_2] \n\t"
572  "vstx $vr7, %[dst], %[stride_3] \n\t"
573  : [dst]"+&r"(dst), [tmp]"+&r"(tmp), [src]"+&r"(src),
574  [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
575  [stride_4]"=&r"(stride_4)
576  : [stride]"r"(stride)
577  : "memory"
578  );
579 }
580 
581 #define QPEL8_H_LOWPASS(out_v) \
582  src00 = __lasx_xvld(src, - 2); \
583  src += srcStride; \
584  src10 = __lasx_xvld(src, - 2); \
585  src += srcStride; \
586  src00 = __lasx_xvpermi_q(src00, src10, 0x02); \
587  src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1); \
588  src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2); \
589  src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3); \
590  src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4); \
591  src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5); \
592  DUP2_ARG2(__lasx_xvaddwl_h_bu, src02, src03, src01, src04, src02, src01);\
593  src00 = __lasx_xvaddwl_h_bu(src00, src05); \
594  src02 = __lasx_xvmul_h(src02, h_20); \
595  src01 = __lasx_xvmul_h(src01, h_5); \
596  src02 = __lasx_xvssub_h(src02, src01); \
597  src02 = __lasx_xvsadd_h(src02, src00); \
598  src02 = __lasx_xvsadd_h(src02, h_16); \
599  out_v = __lasx_xvssrani_bu_h(src02, src02, 5); \
600 
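/* QPEL8_H_LOWPASS filters two rows per expansion, one row in each 128-bit
 * half of an LASX register.  mask1..mask5 in the functions below are xvshuf_b
 * index vectors selecting the bytes at offsets +1..+5 of the row loaded from
 * src - 2, so src00..src05 line up as the six taps and the stored result is
 * clip8((20 * (c + d) - 5 * (b + e) + (a + f) + 16) >> 5).  The V and HV
 * variants further down apply the same tap pattern along columns.
 */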
601 static av_always_inline void
602 put_h264_qpel8_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, int dstStride,
603  int srcStride)
604 {
605  int dstStride_2x = dstStride << 1;
606  __m256i src00, src01, src02, src03, src04, src05, src10;
607  __m256i out0, out1, out2, out3;
608  __m256i h_20 = __lasx_xvldi(0x414);
609  __m256i h_5 = __lasx_xvldi(0x405);
610  __m256i h_16 = __lasx_xvldi(0x410);
611  __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
612  __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
613  __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
614  __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
615  __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
616 
617  QPEL8_H_LOWPASS(out0)
618  QPEL8_H_LOWPASS(out1)
619  QPEL8_H_LOWPASS(out2)
620  QPEL8_H_LOWPASS(out3)
621  __lasx_xvstelm_d(out0, dst, 0, 0);
622  __lasx_xvstelm_d(out0, dst + dstStride, 0, 2);
623  dst += dstStride_2x;
624  __lasx_xvstelm_d(out1, dst, 0, 0);
625  __lasx_xvstelm_d(out1, dst + dstStride, 0, 2);
626  dst += dstStride_2x;
627  __lasx_xvstelm_d(out2, dst, 0, 0);
628  __lasx_xvstelm_d(out2, dst + dstStride, 0, 2);
629  dst += dstStride_2x;
630  __lasx_xvstelm_d(out3, dst, 0, 0);
631  __lasx_xvstelm_d(out3, dst + dstStride, 0, 2);
632 }
633 
634 #define QPEL8_V_LOWPASS(src0, src1, src2, src3, src4, src5, src6, \
635  tmp0, tmp1, tmp2, tmp3, tmp4, tmp5) \
636 { \
637  tmp0 = __lasx_xvpermi_q(src0, src1, 0x02); \
638  tmp1 = __lasx_xvpermi_q(src1, src2, 0x02); \
639  tmp2 = __lasx_xvpermi_q(src2, src3, 0x02); \
640  tmp3 = __lasx_xvpermi_q(src3, src4, 0x02); \
641  tmp4 = __lasx_xvpermi_q(src4, src5, 0x02); \
642  tmp5 = __lasx_xvpermi_q(src5, src6, 0x02); \
643  DUP2_ARG2(__lasx_xvaddwl_h_bu, tmp2, tmp3, tmp1, tmp4, tmp2, tmp1); \
644  tmp0 = __lasx_xvaddwl_h_bu(tmp0, tmp5); \
645  tmp2 = __lasx_xvmul_h(tmp2, h_20); \
646  tmp1 = __lasx_xvmul_h(tmp1, h_5); \
647  tmp2 = __lasx_xvssub_h(tmp2, tmp1); \
648  tmp2 = __lasx_xvsadd_h(tmp2, tmp0); \
649  tmp2 = __lasx_xvsadd_h(tmp2, h_16); \
650  tmp2 = __lasx_xvssrani_bu_h(tmp2, tmp2, 5); \
651 }
652 
653 static av_always_inline void
654 put_h264_qpel8_v_lowpass_lasx(uint8_t *dst, uint8_t *src, int dstStride,
655  int srcStride)
656 {
657  int srcStride_2x = srcStride << 1;
658  int dstStride_2x = dstStride << 1;
659  int srcStride_4x = srcStride << 2;
660  int srcStride_3x = srcStride_2x + srcStride;
661  __m256i src00, src01, src02, src03, src04, src05, src06;
662  __m256i src07, src08, src09, src10, src11, src12;
663  __m256i tmp00, tmp01, tmp02, tmp03, tmp04, tmp05;
664  __m256i h_20 = __lasx_xvldi(0x414);
665  __m256i h_5 = __lasx_xvldi(0x405);
666  __m256i h_16 = __lasx_xvldi(0x410);
667 
668  DUP2_ARG2(__lasx_xvld, src - srcStride_2x, 0, src - srcStride, 0,
669  src00, src01);
670  src02 = __lasx_xvld(src, 0);
671  DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
672  srcStride_3x, src, srcStride_4x, src03, src04, src05, src06);
673  src += srcStride_4x;
674  DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
675  srcStride_3x, src, srcStride_4x, src07, src08, src09, src10);
676  src += srcStride_4x;
677  DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12);
678 
679  QPEL8_V_LOWPASS(src00, src01, src02, src03, src04, src05, src06,
680  tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
681  __lasx_xvstelm_d(tmp02, dst, 0, 0);
682  __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
683  dst += dstStride_2x;
684  QPEL8_V_LOWPASS(src02, src03, src04, src05, src06, src07, src08,
685  tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
686  __lasx_xvstelm_d(tmp02, dst, 0, 0);
687  __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
688  dst += dstStride_2x;
689  QPEL8_V_LOWPASS(src04, src05, src06, src07, src08, src09, src10,
690  tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
691  __lasx_xvstelm_d(tmp02, dst, 0, 0);
692  __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
693  dst += dstStride_2x;
694  QPEL8_V_LOWPASS(src06, src07, src08, src09, src10, src11, src12,
695  tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
696  __lasx_xvstelm_d(tmp02, dst, 0, 0);
697  __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
698 }
699 
700 static av_always_inline void
701 avg_h264_qpel8_v_lowpass_lasx(uint8_t *dst, uint8_t *src, int dstStride,
702  int srcStride)
703 {
704  int srcStride_2x = srcStride << 1;
705  int srcStride_4x = srcStride << 2;
706  int dstStride_2x = dstStride << 1;
707  int dstStride_4x = dstStride << 2;
708  int srcStride_3x = srcStride_2x + srcStride;
709  int dstStride_3x = dstStride_2x + dstStride;
710  __m256i src00, src01, src02, src03, src04, src05, src06;
711  __m256i src07, src08, src09, src10, src11, src12, tmp00;
712  __m256i tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp08, tmp09;
713  __m256i h_20 = __lasx_xvldi(0x414);
714  __m256i h_5 = __lasx_xvldi(0x405);
715  __m256i h_16 = __lasx_xvldi(0x410);
716 
717 
718  DUP2_ARG2(__lasx_xvld, src - srcStride_2x, 0, src - srcStride, 0,
719  src00, src01);
720  src02 = __lasx_xvld(src, 0);
721  DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
722  srcStride_3x, src, srcStride_4x, src03, src04, src05, src06);
723  src += srcStride_4x;
724  DUP4_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src,
725  srcStride_3x, src, srcStride_4x, src07, src08, src09, src10);
726  src += srcStride_4x;
727  DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12);
728 
729  tmp06 = __lasx_xvld(dst, 0);
730  DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x,
731  dst, dstStride_3x, dst, dstStride_4x,
732  tmp07, tmp02, tmp03, tmp04);
733  dst += dstStride_4x;
734  DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x,
735  tmp05, tmp00);
736  tmp01 = __lasx_xvldx(dst, dstStride_3x);
737  dst -= dstStride_4x;
738 
739  tmp06 = __lasx_xvpermi_q(tmp06, tmp07, 0x02);
740  tmp07 = __lasx_xvpermi_q(tmp02, tmp03, 0x02);
741  tmp08 = __lasx_xvpermi_q(tmp04, tmp05, 0x02);
742  tmp09 = __lasx_xvpermi_q(tmp00, tmp01, 0x02);
743 
744  QPEL8_V_LOWPASS(src00, src01, src02, src03, src04, src05, src06,
745  tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
746  tmp06 = __lasx_xvavgr_bu(tmp06, tmp02);
747  __lasx_xvstelm_d(tmp06, dst, 0, 0);
748  __lasx_xvstelm_d(tmp06, dst + dstStride, 0, 2);
749  dst += dstStride_2x;
750  QPEL8_V_LOWPASS(src02, src03, src04, src05, src06, src07, src08,
751  tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
752  tmp07 = __lasx_xvavgr_bu(tmp07, tmp02);
753  __lasx_xvstelm_d(tmp07, dst, 0, 0);
754  __lasx_xvstelm_d(tmp07, dst + dstStride, 0, 2);
755  dst += dstStride_2x;
756  QPEL8_V_LOWPASS(src04, src05, src06, src07, src08, src09, src10,
757  tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
758  tmp08 = __lasx_xvavgr_bu(tmp08, tmp02);
759  __lasx_xvstelm_d(tmp08, dst, 0, 0);
760  __lasx_xvstelm_d(tmp08, dst + dstStride, 0, 2);
761  dst += dstStride_2x;
762  QPEL8_V_LOWPASS(src06, src07, src08, src09, src10, src11, src12,
763  tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
764  tmp09 = __lasx_xvavgr_bu(tmp09, tmp02);
765  __lasx_xvstelm_d(tmp09, dst, 0, 0);
766  __lasx_xvstelm_d(tmp09, dst + dstStride, 0, 2);
767 }
768 
769 #define QPEL8_HV_LOWPASS_H(tmp) \
770 { \
771  src00 = __lasx_xvld(src, -2); \
772  src += srcStride; \
773  src10 = __lasx_xvld(src, -2); \
774  src += srcStride; \
775  src00 = __lasx_xvpermi_q(src00, src10, 0x02); \
776  src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1); \
777  src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2); \
778  src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3); \
779  src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4); \
780  src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5); \
781  DUP2_ARG2(__lasx_xvaddwl_h_bu, src02, src03, src01, src04, src02, src01);\
782  src00 = __lasx_xvaddwl_h_bu(src00, src05); \
783  src02 = __lasx_xvmul_h(src02, h_20); \
784  src01 = __lasx_xvmul_h(src01, h_5); \
785  src02 = __lasx_xvssub_h(src02, src01); \
786  tmp = __lasx_xvsadd_h(src02, src00); \
787 }
788 
789 #define QPEL8_HV_LOWPASS_V(src0, src1, src2, src3, \
790  src4, src5, temp0, temp1, \
791  temp2, temp3, temp4, temp5, \
792  out) \
793 { \
794  DUP2_ARG2(__lasx_xvaddwl_w_h, src2, src3, src1, src4, temp0, temp2); \
795  DUP2_ARG2(__lasx_xvaddwh_w_h, src2, src3, src1, src4, temp1, temp3); \
796  temp4 = __lasx_xvaddwl_w_h(src0, src5); \
797  temp5 = __lasx_xvaddwh_w_h(src0, src5); \
798  temp0 = __lasx_xvmul_w(temp0, w_20); \
799  temp1 = __lasx_xvmul_w(temp1, w_20); \
800  temp2 = __lasx_xvmul_w(temp2, w_5); \
801  temp3 = __lasx_xvmul_w(temp3, w_5); \
802  temp0 = __lasx_xvssub_w(temp0, temp2); \
803  temp1 = __lasx_xvssub_w(temp1, temp3); \
804  temp0 = __lasx_xvsadd_w(temp0, temp4); \
805  temp1 = __lasx_xvsadd_w(temp1, temp5); \
806  temp0 = __lasx_xvsadd_w(temp0, w_512); \
807  temp1 = __lasx_xvsadd_w(temp1, w_512); \
808  temp0 = __lasx_xvssrani_hu_w(temp0, temp0, 10); \
809  temp1 = __lasx_xvssrani_hu_w(temp1, temp1, 10); \
810  temp0 = __lasx_xvpackev_d(temp1, temp0); \
811  out = __lasx_xvssrani_bu_h(temp0, temp0, 0); \
812 }
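/* For the centre (hv) position the horizontal pass above keeps unrounded
 * 16-bit sums (no +16, no shift); QPEL8_HV_LOWPASS_V then refilters those
 * intermediates at 32-bit precision and rounds only once, with
 * (x + 512) >> 10, before saturating back to bytes, matching the reference
 * two-stage h264 qpel lowpass.
 */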
813 
814 static av_always_inline void
815 put_h264_qpel8_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
816  ptrdiff_t dstStride, ptrdiff_t srcStride)
817 {
818  __m256i src00, src01, src02, src03, src04, src05, src10;
819  __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
820  __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
821  __m256i h_20 = __lasx_xvldi(0x414);
822  __m256i h_5 = __lasx_xvldi(0x405);
823  __m256i w_20 = __lasx_xvldi(0x814);
824  __m256i w_5 = __lasx_xvldi(0x805);
825  __m256i w_512 = {512};
826  __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
827  __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
828  __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
829  __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
830  __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
831 
832  w_512 = __lasx_xvreplve0_w(w_512);
833 
834  src -= srcStride << 1;
835  QPEL8_HV_LOWPASS_H(tmp0)
836  QPEL8_HV_LOWPASS_H(tmp2)
837  QPEL8_HV_LOWPASS_H(tmp4)
838  QPEL8_HV_LOWPASS_H(tmp6)
839  QPEL8_HV_LOWPASS_H(tmp8)
840  QPEL8_HV_LOWPASS_H(tmp10)
841  QPEL8_HV_LOWPASS_H(tmp12)
842  tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21);
843  tmp9 = __lasx_xvpermi_q(tmp10, tmp8, 0x21);
844  tmp7 = __lasx_xvpermi_q(tmp8, tmp6, 0x21);
845  tmp5 = __lasx_xvpermi_q(tmp6, tmp4, 0x21);
846  tmp3 = __lasx_xvpermi_q(tmp4, tmp2, 0x21);
847  tmp1 = __lasx_xvpermi_q(tmp2, tmp0, 0x21);
848 
849  QPEL8_HV_LOWPASS_V(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, src00, src01,
850  src02, src03, src04, src05, tmp0)
851  QPEL8_HV_LOWPASS_V(tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src00, src01,
852  src02, src03, src04, src05, tmp2)
853  QPEL8_HV_LOWPASS_V(tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, src00, src01,
854  src02, src03, src04, src05, tmp4)
855  QPEL8_HV_LOWPASS_V(tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, src00, src01,
856  src02, src03, src04, src05, tmp6)
857  __lasx_xvstelm_d(tmp0, dst, 0, 0);
858  dst += dstStride;
859  __lasx_xvstelm_d(tmp0, dst, 0, 2);
860  dst += dstStride;
861  __lasx_xvstelm_d(tmp2, dst, 0, 0);
862  dst += dstStride;
863  __lasx_xvstelm_d(tmp2, dst, 0, 2);
864  dst += dstStride;
865  __lasx_xvstelm_d(tmp4, dst, 0, 0);
866  dst += dstStride;
867  __lasx_xvstelm_d(tmp4, dst, 0, 2);
868  dst += dstStride;
869  __lasx_xvstelm_d(tmp6, dst, 0, 0);
870  dst += dstStride;
871  __lasx_xvstelm_d(tmp6, dst, 0, 2);
872 }
873 
874 static av_always_inline void
875 avg_h264_qpel8_h_lowpass_lasx(uint8_t *dst, const uint8_t *src, int dstStride,
876  int srcStride)
877 {
878  int dstStride_2x = dstStride << 1;
879  int dstStride_4x = dstStride << 2;
880  int dstStride_3x = dstStride_2x + dstStride;
881  __m256i src00, src01, src02, src03, src04, src05, src10;
882  __m256i dst00, dst01, dst0, dst1, dst2, dst3;
883  __m256i out0, out1, out2, out3;
884  __m256i h_20 = __lasx_xvldi(0x414);
885  __m256i h_5 = __lasx_xvldi(0x405);
886  __m256i h_16 = __lasx_xvldi(0x410);
887  __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
888  __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
889  __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
890  __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
891  __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
892 
893  QPEL8_H_LOWPASS(out0)
894  QPEL8_H_LOWPASS(out1)
895  QPEL8_H_LOWPASS(out2)
896  QPEL8_H_LOWPASS(out3)
897  src00 = __lasx_xvld(dst, 0);
898  DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst,
899  dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04);
900  dst += dstStride_4x;
901  DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, dst00);
902  dst01 = __lasx_xvldx(dst, dstStride_3x);
903  dst -= dstStride_4x;
904  dst0 = __lasx_xvpermi_q(src00, src01, 0x02);
905  dst1 = __lasx_xvpermi_q(src02, src03, 0x02);
906  dst2 = __lasx_xvpermi_q(src04, src05, 0x02);
907  dst3 = __lasx_xvpermi_q(dst00, dst01, 0x02);
908  dst0 = __lasx_xvavgr_bu(dst0, out0);
909  dst1 = __lasx_xvavgr_bu(dst1, out1);
910  dst2 = __lasx_xvavgr_bu(dst2, out2);
911  dst3 = __lasx_xvavgr_bu(dst3, out3);
912  __lasx_xvstelm_d(dst0, dst, 0, 0);
913  __lasx_xvstelm_d(dst0, dst + dstStride, 0, 2);
914  __lasx_xvstelm_d(dst1, dst + dstStride_2x, 0, 0);
915  __lasx_xvstelm_d(dst1, dst + dstStride_3x, 0, 2);
916  dst += dstStride_4x;
917  __lasx_xvstelm_d(dst2, dst, 0, 0);
918  __lasx_xvstelm_d(dst2, dst + dstStride, 0, 2);
919  __lasx_xvstelm_d(dst3, dst + dstStride_2x, 0, 0);
920  __lasx_xvstelm_d(dst3, dst + dstStride_3x, 0, 2);
921 }
922 
923 static av_always_inline void
924 avg_h264_qpel8_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
925  ptrdiff_t dstStride, ptrdiff_t srcStride)
926 {
927  __m256i src00, src01, src02, src03, src04, src05, src10;
928  __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
929  __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
930  __m256i h_20 = __lasx_xvldi(0x414);
931  __m256i h_5 = __lasx_xvldi(0x405);
932  __m256i w_20 = __lasx_xvldi(0x814);
933  __m256i w_5 = __lasx_xvldi(0x805);
934  __m256i w_512 = {512};
935  __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
936  __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
937  __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
938  __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
939  __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
940  ptrdiff_t dstStride_2x = dstStride << 1;
941  ptrdiff_t dstStride_4x = dstStride << 2;
942  ptrdiff_t dstStride_3x = dstStride_2x + dstStride;
943 
944  w_512 = __lasx_xvreplve0_w(w_512);
945 
946  src -= srcStride << 1;
947  QPEL8_HV_LOWPASS_H(tmp0)
948  QPEL8_HV_LOWPASS_H(tmp2)
949  QPEL8_HV_LOWPASS_H(tmp4)
950  QPEL8_HV_LOWPASS_H(tmp6)
951  QPEL8_HV_LOWPASS_H(tmp8)
952  QPEL8_HV_LOWPASS_H(tmp10)
953  QPEL8_HV_LOWPASS_H(tmp12)
954  tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21);
955  tmp9 = __lasx_xvpermi_q(tmp10, tmp8, 0x21);
956  tmp7 = __lasx_xvpermi_q(tmp8, tmp6, 0x21);
957  tmp5 = __lasx_xvpermi_q(tmp6, tmp4, 0x21);
958  tmp3 = __lasx_xvpermi_q(tmp4, tmp2, 0x21);
959  tmp1 = __lasx_xvpermi_q(tmp2, tmp0, 0x21);
960 
961  QPEL8_HV_LOWPASS_V(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, src00, src01,
962  src02, src03, src04, src05, tmp0)
963  QPEL8_HV_LOWPASS_V(tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src00, src01,
964  src02, src03, src04, src05, tmp2)
965  QPEL8_HV_LOWPASS_V(tmp4, tmp5, tmp6, tmp7, tmp8, tmp9, src00, src01,
966  src02, src03, src04, src05, tmp4)
967  QPEL8_HV_LOWPASS_V(tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, src00, src01,
968  src02, src03, src04, src05, tmp6)
969 
970  src00 = __lasx_xvld(dst, 0);
971  DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst,
972  dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04);
973  dst += dstStride_4x;
974  DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, tmp8);
975  tmp9 = __lasx_xvldx(dst, dstStride_3x);
976  dst -= dstStride_4x;
977  tmp1 = __lasx_xvpermi_q(src00, src01, 0x02);
978  tmp3 = __lasx_xvpermi_q(src02, src03, 0x02);
979  tmp5 = __lasx_xvpermi_q(src04, src05, 0x02);
980  tmp7 = __lasx_xvpermi_q(tmp8, tmp9, 0x02);
981  tmp0 = __lasx_xvavgr_bu(tmp0, tmp1);
982  tmp2 = __lasx_xvavgr_bu(tmp2, tmp3);
983  tmp4 = __lasx_xvavgr_bu(tmp4, tmp5);
984  tmp6 = __lasx_xvavgr_bu(tmp6, tmp7);
985  __lasx_xvstelm_d(tmp0, dst, 0, 0);
986  dst += dstStride;
987  __lasx_xvstelm_d(tmp0, dst, 0, 2);
988  dst += dstStride;
989  __lasx_xvstelm_d(tmp2, dst, 0, 0);
990  dst += dstStride;
991  __lasx_xvstelm_d(tmp2, dst, 0, 2);
992  dst += dstStride;
993  __lasx_xvstelm_d(tmp4, dst, 0, 0);
994  dst += dstStride;
995  __lasx_xvstelm_d(tmp4, dst, 0, 2);
996  dst += dstStride;
997  __lasx_xvstelm_d(tmp6, dst, 0, 0);
998  dst += dstStride;
999  __lasx_xvstelm_d(tmp6, dst, 0, 2);
1000 }
1001 
1002 static av_always_inline void
1003 put_h264_qpel16_h_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1004  int dstStride, int srcStride)
1005 {
1006  put_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
1007  put_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
1008  src += srcStride << 3;
1009  dst += dstStride << 3;
1010  put_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
1011  put_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
1012 }
1013 
1014 static av_always_inline void
1015 avg_h264_qpel16_h_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1016  int dstStride, int srcStride)
1017 {
1018  avg_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
1019  avg_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
1020  src += srcStride << 3;
1021  dst += dstStride << 3;
1022  avg_h264_qpel8_h_lowpass_lasx(dst, src, dstStride, srcStride);
1023  avg_h264_qpel8_h_lowpass_lasx(dst+8, src+8, dstStride, srcStride);
1024 }
1025 
1026 static void put_h264_qpel16_v_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1027  int dstStride, int srcStride)
1028 {
1029  put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
1030  put_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
1031  src += 8*srcStride;
1032  dst += 8*dstStride;
1033  put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
1034  put_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
1035 }
1036 
1037 static void avg_h264_qpel16_v_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1038  int dstStride, int srcStride)
1039 {
1040  avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
1041  avg_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
1042  src += 8*srcStride;
1043  dst += 8*dstStride;
1044  avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, dstStride, srcStride);
1045  avg_h264_qpel8_v_lowpass_lasx(dst+8, (uint8_t*)src+8, dstStride, srcStride);
1046 }
1047 
1048 static void put_h264_qpel16_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1049  ptrdiff_t dstStride, ptrdiff_t srcStride)
1050 {
1051  put_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
1052  put_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
1053  src += srcStride << 3;
1054  dst += dstStride << 3;
1055  put_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
1056  put_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
1057 }
1058 
1059 static void avg_h264_qpel16_hv_lowpass_lasx(uint8_t *dst, const uint8_t *src,
1060  ptrdiff_t dstStride, ptrdiff_t srcStride)
1061 {
1062  avg_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
1063  avg_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
1064  src += srcStride << 3;
1065  dst += dstStride << 3;
1066  avg_h264_qpel8_hv_lowpass_lasx(dst, src, dstStride, srcStride);
1067  avg_h264_qpel8_hv_lowpass_lasx(dst + 8, src + 8, dstStride, srcStride);
1068 }
1069 
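/* In the ff_*_h264_qpel*_mcXY wrappers below, the XY suffix encodes the
 * quarter-pel offset (X/4, Y/4).  Half-pel positions come straight from one
 * lowpass plane (e.g. mc20, mc02, mc22); quarter-pel positions are, roughly,
 * the rounded average of the two nearest planes (e.g. mc10 = avg(src,
 * h-lowpass), mc21 = avg(h-lowpass, hv-lowpass)), hence the "half" buffers
 * combined with the *_l2_8_lsx helpers.
 */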
1070 void ff_put_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
1071  ptrdiff_t stride)
1072 {
1073  /* The MMI optimization uses ff_put_pixels8_8_mmi, which is implemented
1074  * in hpeldsp_mmi.c. */
1075  put_pixels8_8_inline_asm(dst, src, stride);
1076 }
1077 
1078 void ff_put_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
1079  ptrdiff_t stride)
1080 {
1081  uint8_t half[64];
1082 
1083  put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
1084  /* in qpel8, the stride of half and the height of the block are both 8 */
1085  put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
1086 }
1087 
1088 void ff_put_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
1089  ptrdiff_t stride)
1090 {
1091  put_h264_qpel8_h_lowpass_lasx(dst, src, stride, stride);
1092 }
1093 
1094 void ff_put_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
1095  ptrdiff_t stride)
1096 {
1097  uint8_t half[64];
1098 
1099  put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
1100  put_pixels8_l2_8_lsx(dst, src + 1, half, stride, stride);
1101 }
1102 
1103 void ff_put_h264_qpel8_mc01_lasx(uint8_t *dst, const uint8_t *src,
1104  ptrdiff_t stride)
1105 {
1106  uint8_t half[64];
1107 
1108  put_h264_qpel8_v_lowpass_lasx(half, (uint8_t*)src, 8, stride);
1109  put_pixels8_l2_8_lsx(dst, src, half, stride, stride);
1110 }
1111 
1112 void ff_put_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
1113  ptrdiff_t stride)
1114 {
1115  uint8_t halfH[64];
1116  uint8_t halfV[64];
1117 
1118  put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
1119  put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
1120  put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1121 }
1122 
1123 void ff_put_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
1124  ptrdiff_t stride)
1125 {
1126  uint8_t temp[128];
1127  uint8_t *const halfH = temp;
1128  uint8_t *const halfHV = temp + 64;
1129 
1130  put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
1131  put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1132  put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1133 }
1134 
1135 void ff_put_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
1136  ptrdiff_t stride)
1137 {
1138  uint8_t halfH[64];
1139  uint8_t halfV[64];
1140 
1141  put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
1142  put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
1143  put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1144 }
1145 
1146 void ff_put_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
1147  ptrdiff_t stride)
1148 {
1149  put_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, stride, stride);
1150 }
1151 
1152 void ff_put_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
1153  ptrdiff_t stride)
1154 {
1155  uint8_t temp[128];
1156  uint8_t *const halfHV = temp;
1157  uint8_t *const halfH = temp + 64;
1158 
1159  put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1160  put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src, 8, stride);
1161  put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1162 }
1163 
1164 void ff_put_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
1165  ptrdiff_t stride)
1166 {
1167  put_h264_qpel8_hv_lowpass_lasx(dst, src, stride, stride);
1168 }
1169 
1170 void ff_put_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
1171  ptrdiff_t stride)
1172 {
1173  uint8_t temp[128];
1174  uint8_t *const halfHV = temp;
1175  uint8_t *const halfH = temp + 64;
1176 
1177  put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1178  put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src + 1, 8, stride);
1179  put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1180 }
1181 
1182 void ff_put_h264_qpel8_mc03_lasx(uint8_t *dst, const uint8_t *src,
1183  ptrdiff_t stride)
1184 {
1185  uint8_t half[64];
1186 
1187  put_h264_qpel8_v_lowpass_lasx(half, (uint8_t*)src, 8, stride);
1188  put_pixels8_l2_8_lsx(dst, src + stride, half, stride, stride);
1189 }
1190 
1191 void ff_put_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
1192  ptrdiff_t stride)
1193 {
1194  uint8_t halfH[64];
1195  uint8_t halfV[64];
1196 
1197  put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
1198  put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
1199  put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1200 }
1201 
1202 void ff_put_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
1203  ptrdiff_t stride)
1204 {
1205  uint8_t temp[128];
1206  uint8_t *const halfH = temp;
1207  uint8_t *const halfHV = temp + 64;
1208 
1209  put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
1210  put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1211  put_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1212 }
1213 
1214 void ff_put_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
1215  ptrdiff_t stride)
1216 {
1217  uint8_t halfH[64];
1218  uint8_t halfV[64];
1219 
1220  put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
1221  put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
1222  put_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1223 }
1224 
1225 void ff_avg_h264_qpel8_mc00_lasx(uint8_t *dst, const uint8_t *src,
1226  ptrdiff_t stride)
1227 {
1228  /* The MMI optimization uses ff_avg_pixels8_8_mmi, which is implemented
1229  * in hpeldsp_mmi.c. */
1230  avg_pixels8_8_lsx(dst, src, stride);
1231 }
1232 
1233 void ff_avg_h264_qpel8_mc10_lasx(uint8_t *dst, const uint8_t *src,
1234  ptrdiff_t stride)
1235 {
1236  uint8_t half[64];
1237 
1238  put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
1239  avg_pixels8_l2_8_lsx(dst, src, half, stride, stride);
1240 }
1241 
1242 void ff_avg_h264_qpel8_mc20_lasx(uint8_t *dst, const uint8_t *src,
1243  ptrdiff_t stride)
1244 {
1245  avg_h264_qpel8_h_lowpass_lasx(dst, src, stride, stride);
1246 }
1247 
1248 void ff_avg_h264_qpel8_mc30_lasx(uint8_t *dst, const uint8_t *src,
1249  ptrdiff_t stride)
1250 {
1251  uint8_t half[64];
1252 
1253  put_h264_qpel8_h_lowpass_lasx(half, src, 8, stride);
1254  avg_pixels8_l2_8_lsx(dst, src + 1, half, stride, stride);
1255 }
1256 
1257 void ff_avg_h264_qpel8_mc11_lasx(uint8_t *dst, const uint8_t *src,
1258  ptrdiff_t stride)
1259 {
1260  uint8_t halfH[64];
1261  uint8_t halfV[64];
1262 
1263  put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
1264  put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
1265  avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1266 }
1267 
1268 void ff_avg_h264_qpel8_mc21_lasx(uint8_t *dst, const uint8_t *src,
1269  ptrdiff_t stride)
1270 {
1271  uint8_t temp[128];
1272  uint8_t *const halfH = temp;
1273  uint8_t *const halfHV = temp + 64;
1274 
1275  put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
1276  put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1277  avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1278 }
1279 
1280 void ff_avg_h264_qpel8_mc31_lasx(uint8_t *dst, const uint8_t *src,
1281  ptrdiff_t stride)
1282 {
1283  uint8_t halfH[64];
1284  uint8_t halfV[64];
1285 
1286  put_h264_qpel8_h_lowpass_lasx(halfH, src, 8, stride);
1287  put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
1288  avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1289 }
1290 
1291 void ff_avg_h264_qpel8_mc02_lasx(uint8_t *dst, const uint8_t *src,
1292  ptrdiff_t stride)
1293 {
1294  avg_h264_qpel8_v_lowpass_lasx(dst, (uint8_t*)src, stride, stride);
1295 }
1296 
1297 void ff_avg_h264_qpel8_mc12_lasx(uint8_t *dst, const uint8_t *src,
1298  ptrdiff_t stride)
1299 {
1300  uint8_t temp[128];
1301  uint8_t *const halfHV = temp;
1302  uint8_t *const halfH = temp + 64;
1303 
1304  put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1305  put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src, 8, stride);
1306  avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1307 }
1308 
1309 void ff_avg_h264_qpel8_mc22_lasx(uint8_t *dst, const uint8_t *src,
1310  ptrdiff_t stride)
1311 {
1312  avg_h264_qpel8_hv_lowpass_lasx(dst, src, stride, stride);
1313 }
1314 
1315 void ff_avg_h264_qpel8_mc32_lasx(uint8_t *dst, const uint8_t *src,
1316  ptrdiff_t stride)
1317 {
1318  uint8_t temp[128];
1319  uint8_t *const halfHV = temp;
1320  uint8_t *const halfH = temp + 64;
1321 
1322  put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1323  put_h264_qpel8_v_lowpass_lasx(halfH, (uint8_t*)src + 1, 8, stride);
1324  avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1325 }
1326 
1327 void ff_avg_h264_qpel8_mc13_lasx(uint8_t *dst, const uint8_t *src,
1328  ptrdiff_t stride)
1329 {
1330  uint8_t halfH[64];
1331  uint8_t halfV[64];
1332 
1333  put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
1334  put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src, 8, stride);
1335  avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1336 }
1337 
1338 void ff_avg_h264_qpel8_mc23_lasx(uint8_t *dst, const uint8_t *src,
1339  ptrdiff_t stride)
1340 {
1341  uint8_t temp[128];
1342  uint8_t *const halfH = temp;
1343  uint8_t *const halfHV = temp + 64;
1344 
1345  put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
1346  put_h264_qpel8_hv_lowpass_lasx(halfHV, src, 8, stride);
1347  avg_pixels8_l2_8_lsx(dst, halfH, halfHV, stride, 8);
1348 }
1349 
1350 void ff_avg_h264_qpel8_mc33_lasx(uint8_t *dst, const uint8_t *src,
1351  ptrdiff_t stride)
1352 {
1353  uint8_t halfH[64];
1354  uint8_t halfV[64];
1355 
1356  put_h264_qpel8_h_lowpass_lasx(halfH, src + stride, 8, stride);
1357  put_h264_qpel8_v_lowpass_lasx(halfV, (uint8_t*)src + 1, 8, stride);
1358  avg_pixels8_l2_8_lsx(dst, halfH, halfV, stride, 8);
1359 }
1360 
1361 void ff_put_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
1362  ptrdiff_t stride)
1363 {
1364  /* The MMI optimization uses ff_put_pixels16_8_mmi, which is implemented
1365  * in hpeldsp_mmi.c. */
1366  put_pixels16_8_lsx(dst, src, stride);
1367 }
1368 
1369 void ff_put_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
1370  ptrdiff_t stride)
1371 {
1372  uint8_t half[256];
1373 
1374  put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
1375  put_pixels16_l2_8_lsx(dst, src, half, stride, stride);
1376 }
1377 
1378 void ff_put_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
1379  ptrdiff_t stride)
1380 {
1381  put_h264_qpel16_h_lowpass_lasx(dst, src, stride, stride);
1382 }
1383 
1384 void ff_put_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
1385  ptrdiff_t stride)
1386 {
1387  uint8_t half[256];
1388 
1389  put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
1390  put_pixels16_l2_8_lsx(dst, src + 1, half, stride, stride);
1391 }
1392 
1393 void ff_put_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
1394  ptrdiff_t stride)
1395 {
1396  uint8_t half[256];
1397 
1398  put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
1399  put_pixels16_l2_8_lsx(dst, src, half, stride, stride);
1400 }
1401 
1402 void ff_put_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
1403  ptrdiff_t stride)
1404 {
1405  avc_luma_hv_qrt_16x16_lasx((uint8_t*)src - 2, (uint8_t*)src - (stride * 2),
1406  dst, stride);
1407 }
1408 
1409 void ff_put_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
1410  ptrdiff_t stride)
1411 {
1412  uint8_t temp[512];
1413  uint8_t *const halfH = temp;
1414  uint8_t *const halfHV = temp + 256;
1415 
1416  put_h264_qpel16_h_lowpass_lasx(halfH, src, 16, stride);
1417  put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1418  put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1419 }
1420 
1421 void ff_put_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
1422  ptrdiff_t stride)
1423 {
1424  avc_luma_hv_qrt_16x16_lasx((uint8_t*)src - 2, (uint8_t*)src - (stride * 2) + 1,
1425  dst, stride);
1426 }
1427 
1428 void ff_put_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
1429  ptrdiff_t stride)
1430 {
1431  put_h264_qpel16_v_lowpass_lasx(dst, src, stride, stride);
1432 }
1433 
1434 void ff_put_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
1435  ptrdiff_t stride)
1436 {
1437  uint8_t temp[512];
1438  uint8_t *const halfHV = temp;
1439  uint8_t *const halfH = temp + 256;
1440 
1441  put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1442  put_h264_qpel16_v_lowpass_lasx(halfH, src, 16, stride);
1443  put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1444 }
1445 
1446 void ff_put_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
1447  ptrdiff_t stride)
1448 {
1449  put_h264_qpel16_hv_lowpass_lasx(dst, src, stride, stride);
1450 }
1451 
1452 void ff_put_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
1453  ptrdiff_t stride)
1454 {
1455  uint8_t temp[512];
1456  uint8_t *const halfHV = temp;
1457  uint8_t *const halfH = temp + 256;
1458 
1459  put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1460  put_h264_qpel16_v_lowpass_lasx(halfH, src + 1, 16, stride);
1461  put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1462 }
1463 
1464 void ff_put_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
1465  ptrdiff_t stride)
1466 {
1467  uint8_t half[256];
1468 
1469  put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
1470  put_pixels16_l2_8_lsx(dst, src + stride, half, stride, stride);
1471 }
1472 
1473 void ff_put_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
1474  ptrdiff_t stride)
1475 {
1476  avc_luma_hv_qrt_16x16_lasx((uint8_t*)src + stride - 2, (uint8_t*)src - (stride * 2),
1477  dst, stride);
1478 }
1479 
1480 void ff_put_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
1481  ptrdiff_t stride)
1482 {
1483  uint8_t temp[512];
1484  uint8_t *const halfH = temp;
1485  uint8_t *const halfHV = temp + 256;
1486 
1487  put_h264_qpel16_h_lowpass_lasx(halfH, src + stride, 16, stride);
1488  put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1489  put_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1490 }
1491 
1492 void ff_put_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
1493  ptrdiff_t stride)
1494 {
1495  avc_luma_hv_qrt_16x16_lasx((uint8_t*)src + stride - 2,
1496  (uint8_t*)src - (stride * 2) + 1, dst, stride);
1497 }
1498 
1499 void ff_avg_h264_qpel16_mc00_lasx(uint8_t *dst, const uint8_t *src,
1500  ptrdiff_t stride)
1501 {
1502  /* The MMI optimization uses ff_avg_pixels16_8_mmi, which is implemented
1503  * in hpeldsp_mmi.c. */
1504  avg_pixels16_8_lsx(dst, src, stride);
1505 }
1506 
1507 void ff_avg_h264_qpel16_mc10_lasx(uint8_t *dst, const uint8_t *src,
1508  ptrdiff_t stride)
1509 {
1510  uint8_t half[256];
1511 
1512  put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
1513  avg_pixels16_l2_8_lsx(dst, src, half, stride, stride);
1514 }
1515 
1516 void ff_avg_h264_qpel16_mc20_lasx(uint8_t *dst, const uint8_t *src,
1517  ptrdiff_t stride)
1518 {
1519  avg_h264_qpel16_h_lowpass_lasx(dst, src, stride, stride);
1520 }
1521 
1522 void ff_avg_h264_qpel16_mc30_lasx(uint8_t *dst, const uint8_t *src,
1523  ptrdiff_t stride)
1524 {
1525  uint8_t half[256];
1526 
1527  put_h264_qpel16_h_lowpass_lasx(half, src, 16, stride);
1528  avg_pixels16_l2_8_lsx(dst, src + 1, half, stride, stride);
1529 }
1530 
1531 void ff_avg_h264_qpel16_mc01_lasx(uint8_t *dst, const uint8_t *src,
1532  ptrdiff_t stride)
1533 {
1534  uint8_t half[256];
1535 
1536  put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
1537  avg_pixels16_l2_8_lsx(dst, src, half, stride, stride);
1538 }
1539 
1540 void ff_avg_h264_qpel16_mc11_lasx(uint8_t *dst, const uint8_t *src,
1541  ptrdiff_t stride)
1542 {
1543  avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src - 2,
1544  (uint8_t*)src - (stride * 2),
1545  dst, stride);
1546 }
1547 
1548 void ff_avg_h264_qpel16_mc21_lasx(uint8_t *dst, const uint8_t *src,
1549  ptrdiff_t stride)
1550 {
1551  uint8_t temp[512];
1552  uint8_t *const halfH = temp;
1553  uint8_t *const halfHV = temp + 256;
1554 
1555  put_h264_qpel16_h_lowpass_lasx(halfH, src, 16, stride);
1556  put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1557  avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1558 }
1559 
1560 void ff_avg_h264_qpel16_mc31_lasx(uint8_t *dst, const uint8_t *src,
1561  ptrdiff_t stride)
1562 {
1563  avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src - 2,
1564  (uint8_t*)src - (stride * 2) + 1,
1565  dst, stride);
1566 }
1567 
1568 void ff_avg_h264_qpel16_mc02_lasx(uint8_t *dst, const uint8_t *src,
1569  ptrdiff_t stride)
1570 {
1571  avg_h264_qpel16_v_lowpass_lasx(dst, src, stride, stride);
1572 }
1573 
1574 void ff_avg_h264_qpel16_mc12_lasx(uint8_t *dst, const uint8_t *src,
1575  ptrdiff_t stride)
1576 {
1577  uint8_t temp[512];
1578  uint8_t *const halfHV = temp;
1579  uint8_t *const halfH = temp + 256;
1580 
1581  put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1582  put_h264_qpel16_v_lowpass_lasx(halfH, src, 16, stride);
1583  avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1584 }
1585 
1586 void ff_avg_h264_qpel16_mc22_lasx(uint8_t *dst, const uint8_t *src,
1587  ptrdiff_t stride)
1588 {
1589  avg_h264_qpel16_hv_lowpass_lasx(dst, src, stride, stride);
1590 }
1591 
1592 void ff_avg_h264_qpel16_mc32_lasx(uint8_t *dst, const uint8_t *src,
1593  ptrdiff_t stride)
1594 {
1595  uint8_t temp[512];
1596  uint8_t *const halfHV = temp;
1597  uint8_t *const halfH = temp + 256;
1598 
1599  put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1600  put_h264_qpel16_v_lowpass_lasx(halfH, src + 1, 16, stride);
1601  avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1602 }
1603 
1604 void ff_avg_h264_qpel16_mc03_lasx(uint8_t *dst, const uint8_t *src,
1605  ptrdiff_t stride)
1606 {
1607  uint8_t half[256];
1608 
1609  put_h264_qpel16_v_lowpass_lasx(half, src, 16, stride);
1610  avg_pixels16_l2_8_lsx(dst, src + stride, half, stride, stride);
1611 }
1612 
1613 void ff_avg_h264_qpel16_mc13_lasx(uint8_t *dst, const uint8_t *src,
1614  ptrdiff_t stride)
1615 {
1616  avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src + stride - 2,
1617  (uint8_t*)src - (stride * 2),
1618  dst, stride);
1619 }
1620 
1621 void ff_avg_h264_qpel16_mc23_lasx(uint8_t *dst, const uint8_t *src,
1622  ptrdiff_t stride)
1623 {
1624  uint8_t temp[512];
1625  uint8_t *const halfH = temp;
1626  uint8_t *const halfHV = temp + 256;
1627 
1628  put_h264_qpel16_h_lowpass_lasx(halfH, src + stride, 16, stride);
1629  put_h264_qpel16_hv_lowpass_lasx(halfHV, src, 16, stride);
1630  avg_pixels16_l2_8_lsx(dst, halfH, halfHV, stride, 16);
1631 }
1632 
1633 void ff_avg_h264_qpel16_mc33_lasx(uint8_t *dst, const uint8_t *src,
1634  ptrdiff_t stride)
1635 {
1636  avc_luma_hv_qrt_and_aver_dst_16x16_lasx((uint8_t*)src + stride - 2,
1637  (uint8_t*)src - (stride * 2) + 1,
1638  dst, stride);
1639 }
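
The mcXY entry points above follow FFmpeg's quarter-pel naming: X is the horizontal and Y the vertical quarter-sample offset, so mc20/mc02 are the pure half-sample 6-tap cases, mc22 is the centre half-sample case, and the remaining positions blend two intermediate planes (or the source row/column) with put_pixels16_l2_8_lsx or avg_pixels16_l2_8_lsx. As a rough illustration only, the sketch below shows how a few of the 16x16 put variants could be hooked into the H264QpelContext function table; the [y * 4 + x] index layout is an assumption taken from the generic h264qpel code, the helper name sketch_init_qpel16_put_lasx is hypothetical, and the real registration (including the runtime LASX capability check) lives in h264qpel_init_loongarch.c, not in this file.

/* Sketch only: assumes the [y * 4 + x] table layout used by
 * libavcodec/h264qpel.c; sketch_init_qpel16_put_lasx is a hypothetical
 * helper, not the file's own init code. */
#include "libavcodec/h264qpel.h"
#include "h264qpel_loongarch.h"

static void sketch_init_qpel16_put_lasx(H264QpelContext *c)
{
    /* Table [0] holds the 16x16 functions; index = y * 4 + x for mc(x)(y). */
    c->put_h264_qpel_pixels_tab[0][0]  = ff_put_h264_qpel16_mc00_lasx; /* plain copy     */
    c->put_h264_qpel_pixels_tab[0][2]  = ff_put_h264_qpel16_mc20_lasx; /* horizontal half */
    c->put_h264_qpel_pixels_tab[0][8]  = ff_put_h264_qpel16_mc02_lasx; /* vertical half   */
    c->put_h264_qpel_pixels_tab[0][10] = ff_put_h264_qpel16_mc22_lasx; /* centre half     */
}
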