29 __m256i in0, in1, in2, in3;
30 __m256i temp0, temp1, temp2, temp3, t1, t2, t3, t4, t5, t6, t7, t8;
31 __m256i const_1 = {0x000c000c000c000c, 0x000c000c000c000c,
32 0x000c000c000c000c, 0x000c000c000c000c};
33 __m256i const_2 = {0xfff4000cfff4000c, 0xfff4000cfff4000c,
34 0xfff4000cfff4000c, 0xfff4000cfff4000c};
35 __m256i const_3 = {0x0006001000060010, 0x0006001000060010,
36 0x0006001000060010, 0x0006001000060010};
37 __m256i const_4 = {0xfff00006fff00006, 0xfff00006fff00006,
38 0xfff00006fff00006, 0xfff00006fff00006};
39 __m256i const_5 = {0x000f0010000f0010, 0x000f0010000f0010,
40 0x000f0010000f0010, 0x000f0010000f0010};
41 __m256i const_6 = {0x0004000900040009, 0x0004000900040009,
42 0x0004000900040009, 0x0004000900040009};
43 __m256i const_7 = {0xfffc000ffffc000f, 0xfffc000ffffc000f,
44 0xfffc000ffffc000f, 0xfffc000ffffc000f};
45 __m256i const_8 = {0xfff7fff0fff7fff0, 0xfff7fff0fff7fff0,
46 0xfff7fff0fff7fff0, 0xfff7fff0fff7fff0};
47 __m256i const_9 = {0xfff00009fff00009, 0xfff00009fff00009,
48 0xfff00009fff00009, 0xfff00009fff00009};
49 __m256i const_10 = {0x000f0004000f0004, 0x000f0004000f0004,
50 0x000f0004000f0004, 0x000f0004000f0004};
51 __m256i const_11 = {0xfff70004fff70004, 0xfff70004fff70004,
52 0xfff70004fff70004, 0xfff70004fff70004};
53 __m256i const_12 = {0xfff0000ffff0000f, 0xfff0000ffff0000f,
54 0xfff0000ffff0000f, 0xfff0000ffff0000f};
58 DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
61 DUP2_ARG2(__lasx_xvilvl_h, in2, in0, in3, in1, temp0, temp1);
62 t2 = __lasx_xvreplgr2vr_w(con_4);
63 DUP2_ARG3(__lasx_xvdp2add_w_h, t2, temp0, const_1, t2, temp0,
65 DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_3, temp1, const_4, t3, t4);
67 t5 = __lasx_xvadd_w(t1, t3);
68 t6 = __lasx_xvadd_w(t2, t4);
69 t7 = __lasx_xvsub_w(t2, t4);
70 t8 = __lasx_xvsub_w(t1, t3);
72 DUP2_ARG2(__lasx_xvilvh_h, in1, in0, in3, in2, temp0, temp1);
73 temp2 = __lasx_xvdp2_w_h(const_5, temp0);
74 t1 = __lasx_xvdp2add_w_h(temp2, temp1, const_6);
75 temp2 = __lasx_xvdp2_w_h(const_7, temp0);
76 t2 = __lasx_xvdp2add_w_h(temp2, temp1, const_8);
77 temp2 = __lasx_xvdp2_w_h(const_9, temp0);
78 t3 = __lasx_xvdp2add_w_h(temp2, temp1, const_10);
79 temp2 = __lasx_xvdp2_w_h(const_11, temp0);
80 t4 = __lasx_xvdp2add_w_h(temp2, temp1, const_12);
82 DUP4_ARG2(__lasx_xvadd_w, t1, t5, t6, t2, t7, t3, t8, t4,
83 temp0, temp1, temp2, temp3);
84 DUP4_ARG2(__lasx_xvsub_w, t8, t4, t7, t3, t6, t2, t5, t1,
86 DUP4_ARG2(__lasx_xvsrai_w, temp0, 3, temp1, 3, temp2, 3, temp3, 3,
87 temp0, temp1, temp2, temp3);
88 DUP4_ARG2(__lasx_xvsrai_w, in0, 3, in1, 3, in2, 3, in3, 3,
92 DUP4_ARG2(__lasx_xvpackev_h, temp1, temp0, temp3, temp2, in1, in0,
93 in3, in2, temp0, temp1, temp2, temp3);
94 DUP2_ARG2(__lasx_xvilvl_w, temp1, temp0, temp3, temp2, t1, t3);
95 DUP2_ARG2(__lasx_xvilvh_w, temp1, temp0, temp3, temp2, t2, t4);
96 DUP4_ARG3(__lasx_xvpermi_q, t3, t1, 0x20, t3, t1, 0x31, t4, t2, 0x20,
97 t4, t2, 0x31, in0, in1, in2, in3);
98 DUP2_ARG2(__lasx_xvilvl_h, in1, in0, in3, in2, temp0, temp1);
99 t3 = __lasx_xvreplgr2vr_w(con_64);
100 DUP2_ARG3(__lasx_xvdp2add_w_h, t3, temp0, const_1, t3, temp0,
102 DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_3, temp1, const_4, t3, t4);
104 t5 = __lasx_xvadd_w(t1, t3);
105 t6 = __lasx_xvadd_w(t2, t4);
106 t7 = __lasx_xvsub_w(t2, t4);
107 t8 = __lasx_xvsub_w(t1, t3);
109 DUP2_ARG2(__lasx_xvilvh_h, in2, in0, in3, in1, temp0, temp1);
110 temp2 = __lasx_xvdp2_w_h(const_5, temp0);
111 t1 = __lasx_xvdp2add_w_h(temp2, temp1, const_6);
112 temp2 = __lasx_xvdp2_w_h(const_7, temp0);
113 t2 = __lasx_xvdp2add_w_h(temp2, temp1, const_8);
114 temp2 = __lasx_xvdp2_w_h(const_9, temp0);
115 t3 = __lasx_xvdp2add_w_h(temp2, temp1, const_10);
116 temp2 = __lasx_xvdp2_w_h(const_11, temp0);
117 t4 = __lasx_xvdp2add_w_h(temp2, temp1, const_12);
119 DUP4_ARG2(__lasx_xvadd_w, t5, t1, t6, t2, t7, t3, t8, t4,
120 temp0, temp1, temp2, temp3);
121 DUP4_ARG2(__lasx_xvsub_w, t8, t4, t7, t3, t6, t2, t5, t1,
123 DUP4_ARG2(__lasx_xvaddi_wu, in0, 1, in1, 1, in2, 1, in3, 1,
125 DUP4_ARG3(__lasx_xvsrani_h_w, temp1, temp0, 7, temp3, temp2, 7,
126 in1, in0, 7, in3, in2, 7, t1, t2, t3, t4);
127 DUP4_ARG2(__lasx_xvpermi_d, t1, 0xD8, t2, 0xD8, t3, 0xD8, t4, 0xD8,
129 __lasx_xvst(in0,
block, 0);
130 __lasx_xvst(in1,
block, 32);
131 __lasx_xvst(in2,
block, 64);
132 __lasx_xvst(in3,
block, 96);
139 ptrdiff_t stride2 =
stride << 1;
140 ptrdiff_t stride3 = stride2 +
stride;
141 uint8_t *
dst = dest + (stride2 << 1);
142 __m256i in0, in1, in2, in3, in4, in5, in6, in7;
143 __m256i const_dc, temp0, temp1, temp2, temp3;
144 __m256i reg0, reg1, reg2, reg3;
146 dc = (3 *
dc + 1) >> 1;
147 dc = (3 *
dc + 16) >> 5;
149 const_dc = __lasx_xvreplgr2vr_h(
dc);
150 DUP4_ARG2(__lasx_xvldrepl_d, dest, 0, dest +
stride, 0, dest + stride2,
151 0, dest + stride3, 0, in0, in1, in2, in3);
153 0,
dst + stride3, 0, in4, in5, in6, in7);
155 DUP4_ARG2(__lasx_xvilvl_d, in1, in0, in3, in2, in5, in4, in7, in6,
156 temp0, temp1, temp2, temp3);
157 DUP4_ARG1(__lasx_vext2xv_hu_bu, temp0, temp1, temp2, temp3,
158 temp0, temp1, temp2, temp3);
160 DUP4_ARG2(__lasx_xvadd_h, temp0, const_dc, temp1, const_dc, temp2,
161 const_dc, temp3, const_dc, reg0, reg1, reg2, reg3);
162 DUP2_ARG3(__lasx_xvssrarni_bu_h, reg1, reg0, 0, reg3, reg2, 0,
164 __lasx_xvstelm_d(temp0, dest, 0, 0);
165 __lasx_xvstelm_d(temp0, dest +
stride, 0, 2);
166 __lasx_xvstelm_d(temp0, dest + stride2, 0, 1);
167 __lasx_xvstelm_d(temp0, dest + stride3, 0, 3);
168 __lasx_xvstelm_d(temp1,
dst, 0, 0);
169 __lasx_xvstelm_d(temp1,
dst +
stride, 0, 2);
170 __lasx_xvstelm_d(temp1,
dst + stride2, 0, 1);
171 __lasx_xvstelm_d(temp1,
dst + stride3, 0, 3);
176 ptrdiff_t stride2 =
stride << 1;
177 ptrdiff_t stride3 = stride2 +
stride;
178 __m256i
shift = {0x0000000400000000, 0x0000000500000001,
179 0x0000000600000002, 0x0000000700000003};
180 __m256i const_64 = {0x0000004000000040, 0x0000004000000040,
181 0x0000004000000040, 0x0000004000000040};
182 __m256i const_1 = {0x00060010000C000C, 0x00060010000C000C,
183 0x00060010000C000C, 0x00060010000C000C};
184 __m256i const_2 = {0xFFF00006FFF4000C, 0xFFF00006FFF4000C,
185 0xFFF00006FFF4000C, 0xFFF00006FFF4000C};
186 __m256i const_3 = {0x0004000F00090010, 0x0004000F00090010,
187 0x0004000F00090010, 0x0004000F00090010};
188 __m256i const_4 = {0xFFF7FFFCFFF0000F, 0xFFF7FFFCFFF0000F,
189 0xFFF7FFFCFFF0000F, 0xFFF7FFFCFFF0000F};
190 __m256i const_5 = {0x000FFFF000040009, 0x000FFFF000040009,
191 0x000FFFF000040009, 0x000FFFF000040009};
192 __m256i const_6 = {0xFFF0FFF7000F0004, 0xFFF0FFF7000F0004,
193 0xFFF0FFF7000F0004, 0xFFF0FFF7000F0004};
194 __m256i const_7 = {0x0000000000000004, 0x0000000000000004,
195 0x0000000000000004, 0x0000000000000004};
196 __m256i const_8 = {0x0011001100110011, 0x0011001100110011,
197 0x0011001100110011, 0x0011001100110011};
198 __m256i const_9 = {0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011,
199 0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011};
200 __m256i const_10 = {0x000A0016000A0016, 0x000A0016000A0016,
201 0x000A0016000A0016, 0x000A0016000A0016};
202 __m256i const_11 = {0x0016FFF60016FFF6, 0x0016FFF60016FFF6,
203 0x0016FFF60016FFF6, 0x0016FFF60016FFF6};
205 __m256i temp0, temp1, temp2, temp3, t1, t2, t3, t4;
209 temp0 = __lasx_xvpermi_d(in0, 0xB1);
210 temp1 = __lasx_xvpermi_d(in1, 0xB1);
211 DUP2_ARG2(__lasx_xvilvl_h, temp0, in0, temp1, in1, temp0, temp1);
212 temp2 = __lasx_xvpickev_w(temp1, temp0);
213 temp3 = __lasx_xvpickod_w(temp1, temp0);
215 DUP2_ARG2(__lasx_xvdp2_w_h, temp2, const_1, temp2, const_2, temp0, temp1);
216 t1 = __lasx_xvadd_w(temp0, const_7);
217 t2 = __lasx_xvadd_w(temp1, const_7);
218 temp0 = __lasx_xvpickev_w(t2, t1);
219 temp1 = __lasx_xvpickod_w(t2, t1);
220 t3 = __lasx_xvadd_w(temp0, temp1);
221 t4 = __lasx_xvsub_w(temp0, temp1);
222 t4 = __lasx_xvpermi_d(t4, 0xB1);
224 DUP4_ARG2(__lasx_xvdp4_d_h, temp3, const_3, temp3, const_4, temp3,
225 const_5, temp3, const_6, t1, t2, temp0, temp1);
226 temp2 = __lasx_xvpickev_w(t2, t1);
227 temp3 = __lasx_xvpickev_w(temp1, temp0);
229 t1 = __lasx_xvadd_w(temp2, t3);
230 t2 = __lasx_xvadd_w(temp3, t4);
231 temp0 = __lasx_xvsub_w(t4, temp3);
232 temp1 = __lasx_xvsub_w(t3, temp2);
234 DUP2_ARG3(__lasx_xvsrani_h_w, t2, t1, 3, temp1, temp0, 3, temp2, temp3);
235 temp3 = __lasx_xvshuf4i_h(temp3, 0x4E);
236 temp0 = __lasx_xvpermi_q(temp3, temp2, 0x20);
237 temp1 = __lasx_xvpermi_q(temp3, temp2, 0x31);
238 DUP2_ARG3(__lasx_xvdp2add_w_h, const_64, temp0, const_8, const_64, temp0,
240 DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_10, temp1, const_11, t3, t4);
241 temp0 = __lasx_xvadd_w(t1, t3);
242 temp1 = __lasx_xvsub_w(t2, t4);
243 temp2 = __lasx_xvadd_w(t2, t4);
244 temp3 = __lasx_xvsub_w(t1, t3);
245 DUP4_ARG2(__lasx_xvsrai_w, temp0, 7, temp1, 7, temp2, 7, temp3, 7,
248 temp0 = __lasx_xvldrepl_d(dest, 0);
249 DUP4_ARG2(__lasx_xvldrepl_d, dest, 0, dest +
stride, 0, dest + stride2, 0,
250 dest + stride3, 0, temp0, temp1, temp2, temp3);
251 DUP4_ARG1(__lasx_vext2xv_wu_bu, temp0, temp1, temp2, temp3,
252 temp0, temp1, temp2, temp3);
253 DUP4_ARG2(__lasx_xvadd_w, temp0, t1, temp1, t2, temp2, t3, temp3, t4,
255 DUP4_ARG1(__lasx_xvclip255_w, t1, t2, t3, t4, t1, t2, t3, t4);
256 DUP2_ARG2(__lasx_xvpickev_h, t2, t1, t4, t3, temp0, temp1);
257 temp2 = __lasx_xvpickev_b(temp1, temp0);
258 temp0 = __lasx_xvperm_w(temp2,
shift);
259 __lasx_xvstelm_d(temp0, dest, 0, 0);
260 __lasx_xvstelm_d(temp0, dest +
stride, 0, 1);
261 __lasx_xvstelm_d(temp0, dest + stride2, 0, 2);
262 __lasx_xvstelm_d(temp0, dest + stride3, 0, 3);
269 ptrdiff_t stride2 =
stride << 1;
270 ptrdiff_t stride3 = stride2 +
stride;
271 __m256i in0, in1, in2, in3;
272 __m256i const_dc, temp0, temp1, reg0, reg1;
274 dc = (3 *
dc + 1) >> 1;
275 dc = (17 *
dc + 64) >> 7;
276 const_dc = __lasx_xvreplgr2vr_h(
dc);
278 DUP4_ARG2(__lasx_xvldrepl_d, dest, 0, dest +
stride, 0, dest + stride2,
279 0, dest + stride3, 0, in0, in1, in2, in3);
280 DUP2_ARG2(__lasx_xvilvl_d, in1, in0, in3, in2, temp0, temp1);
281 DUP2_ARG1(__lasx_vext2xv_hu_bu, temp0, temp1, temp0, temp1);
282 DUP2_ARG2(__lasx_xvadd_h, temp0, const_dc, temp1, const_dc, reg0, reg1);
283 temp0 = __lasx_xvssrarni_bu_h(reg1, reg0, 0);
284 __lasx_xvstelm_d(temp0, dest, 0, 0);
285 __lasx_xvstelm_d(temp0, dest +
stride, 0, 2);
286 __lasx_xvstelm_d(temp0, dest + stride2, 0, 1);
287 __lasx_xvstelm_d(temp0, dest + stride3, 0, 3);
294 ptrdiff_t stride2 =
stride << 1;
295 ptrdiff_t stride3 = stride2 +
stride;
296 uint8_t *
dst = dest + (stride2 << 1);
297 __m256i in0, in1, in2, in3, in4, in5, in6, in7;
298 __m256i const_dc, temp0, temp1, temp2, temp3, reg0, reg1;
300 dc = (17 *
dc + 4) >> 3;
301 dc = (12 *
dc + 64) >> 7;
302 const_dc = __lasx_xvreplgr2vr_h(
dc);
304 DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dest +
stride, 0, dest + stride2,
305 0, dest + stride3, 0, in0, in1, in2, in3);
307 0,
dst + stride3, 0, in4, in5, in6, in7);
309 DUP4_ARG2(__lasx_xvilvl_w, in1, in0, in3, in2, in5, in4, in7, in6,
310 temp0, temp1, temp2, temp3);
311 DUP2_ARG2(__lasx_xvilvl_d, temp1, temp0, temp3, temp2, reg0, reg1);
312 DUP2_ARG1(__lasx_vext2xv_hu_bu, reg0, reg1, temp0, temp1);
313 DUP2_ARG2(__lasx_xvadd_h, temp0, const_dc, temp1, const_dc, reg0, reg1);
314 temp0 = __lasx_xvssrarni_bu_h(reg1, reg0, 0);
315 __lasx_xvstelm_w(temp0, dest, 0, 0);
316 __lasx_xvstelm_w(temp0, dest +
stride, 0, 1);
317 __lasx_xvstelm_w(temp0, dest + stride2, 0, 4);
318 __lasx_xvstelm_w(temp0, dest + stride3, 0, 5);
319 __lasx_xvstelm_w(temp0,
dst, 0, 2);
320 __lasx_xvstelm_w(temp0,
dst +
stride, 0, 3);
321 __lasx_xvstelm_w(temp0,
dst + stride2, 0, 6);
322 __lasx_xvstelm_w(temp0,
dst + stride3, 0, 7);
327 ptrdiff_t stride2 =
stride << 1;
328 ptrdiff_t stride3 = stride2 +
stride;
329 uint8_t *
dst = dest + (stride2 << 1);
330 __m256i in0, in1, in2, in3;
331 __m256i temp0, temp1, temp2, temp3, t1, t2, t3, t4;
333 __m256i const_1 = {0x0011001100110011, 0x0011001100110011,
334 0x0011001100110011, 0x0011001100110011};
335 __m256i const_2 = {0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011,
336 0xFFEF0011FFEF0011, 0xFFEF0011FFEF0011};
337 __m256i const_3 = {0x000A0016000A0016, 0x000A0016000A0016,
338 0x000A0016000A0016, 0x000A0016000A0016};
339 __m256i const_4 = {0x0016FFF60016FFF6, 0x0016FFF60016FFF6,
340 0x0016FFF60016FFF6, 0x0016FFF60016FFF6};
341 __m256i const_5 = {0x0000000400000004, 0x0000000400000004,
342 0x0000000400000004, 0x0000000400000004};
343 __m256i const_6 = {0x0000004000000040, 0x0000004000000040,
344 0x0000004000000040, 0x0000004000000040};
345 __m256i const_7 = {0x000C000C000C000C, 0X000C000C000C000C,
346 0xFFF4000CFFF4000C, 0xFFF4000CFFF4000C};
347 __m256i const_8 = {0x0006001000060010, 0x0006001000060010,
348 0xFFF00006FFF00006, 0xFFF00006FFF00006};
349 __m256i const_9 = {0x0009001000090010, 0x0009001000090010,
350 0x0004000F0004000F, 0x0004000F0004000F};
351 __m256i const_10 = {0xFFF0000FFFF0000F, 0xFFF0000FFFF0000F,
352 0xFFF7FFFCFFF7FFFC, 0xFFF7FFFCFFF7FFFC};
353 __m256i const_11 = {0x0004000900040009, 0x0004000900040009,
354 0x000FFFF0000FFFF0, 0x000FFFF0000FFFF0};
355 __m256i const_12 = {0x000F0004000F0004, 0x000F0004000F0004,
356 0xFFF0FFF7FFF0FFF7, 0xFFF0FFF7FFF0FFF7};
357 __m256i
shift = {0x0000000400000000, 0x0000000600000002,
358 0x0000000500000001, 0x0000000700000003};
363 in0 = __lasx_xvilvl_d(in1, in0);
364 in1 = __lasx_xvilvl_d(in3, in2);
365 temp0 = __lasx_xvpickev_h(in1, in0);
366 temp1 = __lasx_xvpickod_h(in1, in0);
367 temp0 = __lasx_xvperm_w(temp0,
shift);
368 temp1 = __lasx_xvperm_w(temp1,
shift);
370 DUP2_ARG3(__lasx_xvdp2add_w_h, const_5, temp0, const_1, const_5, temp0,
372 DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_3, temp1, const_4, t3, t4);
374 temp0 = __lasx_xvadd_w(t1, t3);
375 temp1 = __lasx_xvsub_w(t2, t4);
376 temp2 = __lasx_xvadd_w(t2, t4);
377 temp3 = __lasx_xvsub_w(t1, t3);
378 DUP4_ARG2(__lasx_xvsrai_w, temp0, 3, temp1, 3, temp2, 3, temp3, 3,
379 temp0, temp1, temp2, temp3);
382 t1 = __lasx_xvpickev_w(temp1, temp0);
383 t2 = __lasx_xvpickev_w(temp3, temp2);
384 t1 = __lasx_xvpickev_h(t2, t1);
385 t3 = __lasx_xvpickod_w(temp1, temp0);
386 t4 = __lasx_xvpickod_w(temp3, temp2);
387 temp1 = __lasx_xvpickev_h(t4, t3);
388 temp2 = __lasx_xvpermi_q(t1, t1, 0x00);
389 temp3 = __lasx_xvpermi_q(t1, t1, 0x11);
390 t1 = __lasx_xvdp2add_w_h(const_6, temp2, const_7);
391 t2 = __lasx_xvdp2_w_h(temp3, const_8);
392 t3 = __lasx_xvadd_w(t1, t2);
393 t4 = __lasx_xvsub_w(t1, t2);
394 t4 = __lasx_xvpermi_d(t4, 0x4E);
396 DUP4_ARG2(__lasx_xvdp2_w_h, temp1, const_9, temp1, const_10, temp1,
397 const_11, temp1, const_12, t1, t2, temp2, temp3);
399 temp0 = __lasx_xvpermi_q(t2, t1, 0x20);
400 temp1 = __lasx_xvpermi_q(t2, t1, 0x31);
401 t1 = __lasx_xvadd_w(temp0, temp1);
402 temp0 = __lasx_xvpermi_q(temp3, temp2, 0x20);
403 temp1 = __lasx_xvpermi_q(temp3, temp2, 0x31);
404 t2 = __lasx_xvadd_w(temp1, temp0);
405 temp0 = __lasx_xvadd_w(t1, t3);
406 temp1 = __lasx_xvadd_w(t2, t4);
407 temp2 = __lasx_xvsub_w(t4, t2);
408 temp3 = __lasx_xvsub_w(t3, t1);
409 temp2 = __lasx_xvaddi_wu(temp2, 1);
410 temp3 = __lasx_xvaddi_wu(temp3, 1);
411 DUP4_ARG2(__lasx_xvsrai_w, temp0, 7, temp1, 7, temp2, 7, temp3, 7,
412 temp0, temp1, temp2, temp3);
414 DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dest +
stride, 0, dest + stride2, 0,
415 dest + stride3, 0, const_1, const_2, const_3, const_4);
417 dst + stride3, 0, const_5, const_6, const_7, const_8);
419 DUP4_ARG2(__lasx_xvilvl_w, const_2, const_1, const_4, const_3, const_5,
420 const_6, const_7, const_8, const_1, const_2, const_3, const_4);
421 DUP4_ARG1(__lasx_vext2xv_wu_bu, const_1, const_2, const_3, const_4,
422 const_1, const_2, const_3, const_4);
423 DUP4_ARG2(__lasx_xvadd_w, temp0, const_1, temp1, const_2, temp2, const_3,
424 temp3, const_4, temp0, temp1, temp2, temp3);
425 DUP4_ARG1(__lasx_xvclip255_w, temp0, temp1, temp2, temp3,
426 temp0, temp1, temp2, temp3);
427 DUP2_ARG2(__lasx_xvpickev_h, temp1, temp0, temp3, temp2, temp0, temp1);
428 temp0 = __lasx_xvpickev_b(temp1, temp0);
429 __lasx_xvstelm_w(temp0, dest, 0, 0);
430 __lasx_xvstelm_w(temp0, dest +
stride, 0, 4);
431 __lasx_xvstelm_w(temp0, dest + stride2, 0, 1);
432 __lasx_xvstelm_w(temp0, dest + stride3, 0, 5);
433 __lasx_xvstelm_w(temp0,
dst, 0, 6);
434 __lasx_xvstelm_w(temp0,
dst +
stride, 0, 2);
435 __lasx_xvstelm_w(temp0,
dst + stride2, 0, 7);
436 __lasx_xvstelm_w(temp0,
dst + stride3, 0, 3);
443 uint8_t *dst1 = dest +
stride;
444 uint8_t *dst2 = dst1 +
stride;
445 uint8_t *dst3 = dst2 +
stride;
446 __m256i in0, in1, in2, in3, temp0, temp1, const_dc;
449 dc = (17 *
dc + 4) >> 3;
450 dc = (17 *
dc + 64) >> 7;
451 const_dc = __lasx_xvreplgr2vr_h(
dc);
453 DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dst1, 0, dst2, 0, dst3, 0,
455 DUP2_ARG2(__lasx_xvilvl_w, in1, in0, in3, in2, temp0, temp1);
456 in0 = __lasx_xvpermi_q(temp1, temp0, 0x20);
457 temp0 = __lasx_xvilvl_b(
zero, in0);
458 in0 = __lasx_xvadd_h(temp0, const_dc);
459 temp0 = __lasx_xvssrarni_bu_h(in0, in0, 0);
460 __lasx_xvstelm_w(temp0, dest, 0, 0);
461 __lasx_xvstelm_w(temp0, dst1, 0, 1);
462 __lasx_xvstelm_w(temp0, dst2, 0, 4);
463 __lasx_xvstelm_w(temp0, dst3, 0, 5);
468 uint8_t *dst1 = dest +
stride;
469 uint8_t *dst2 = dst1 +
stride;
470 uint8_t *dst3 = dst2 +
stride;
471 __m256i in0, in1, in2, in3;
472 __m256i temp0, temp1, temp2, temp3, t1, t2;
474 __m256i const_1 = {0x0011001100110011, 0xFFEF0011FFEF0011,
475 0x0011001100110011, 0xFFEF0011FFEF0011};
476 __m256i const_2 = {0x000A0016000A0016, 0x0016FFF60016FFF6,
477 0x000A0016000A0016, 0x0016FFF60016FFF6};
478 __m256i const_64 = {0x0000004000000040, 0x0000004000000040,
479 0x0000004000000040, 0x0000004000000040};
483 temp0 = __lasx_xvilvl_d(in1, in0);
484 temp1 = __lasx_xvpickev_h(temp0, temp0);
485 temp2 = __lasx_xvpickod_h(temp0, temp0);
486 DUP2_ARG2(__lasx_xvdp2_w_h, temp1, const_1, temp2, const_2, t1, t2);
487 t1 = __lasx_xvaddi_wu(t1, 4);
488 in0 = __lasx_xvadd_w(t1, t2);
489 in1 = __lasx_xvsub_w(t1, t2);
490 DUP2_ARG2(__lasx_xvsrai_w, in0, 3, in1, 3, in0, in1);
492 temp0 = __lasx_xvpickev_h(in1, in0);
493 temp1 = __lasx_xvpermi_q(temp0, temp0, 0x00);
494 temp2 = __lasx_xvpermi_q(temp0, temp0, 0x11);
495 const_1 = __lasx_xvpermi_d(const_1, 0xD8);
496 const_2 = __lasx_xvpermi_d(const_2, 0xD8);
497 t1 = __lasx_xvdp2add_w_h(const_64, temp1, const_1);
498 t2 = __lasx_xvdp2_w_h(temp2, const_2);
499 in0 = __lasx_xvadd_w(t1, t2);
500 in1 = __lasx_xvsub_w(t1, t2);
501 DUP2_ARG2(__lasx_xvsrai_w, in0, 7, in1, 7, in0, in1);
502 temp0 = __lasx_xvshuf4i_w(in0, 0x9C);
503 temp1 = __lasx_xvshuf4i_w(in1, 0x9C);
505 DUP4_ARG2(__lasx_xvldrepl_w, dest, 0, dst1, 0, dst2, 0, dst3, 0,
507 temp2 = __lasx_xvilvl_w(in2, in0);
508 temp2 = __lasx_vext2xv_wu_bu(temp2);
509 temp3 = __lasx_xvilvl_w(in1, in3);
510 temp3 = __lasx_vext2xv_wu_bu(temp3);
511 temp0 = __lasx_xvadd_w(temp0, temp2);
512 temp1 = __lasx_xvadd_w(temp1, temp3);
513 DUP2_ARG1(__lasx_xvclip255_w, temp0, temp1, temp0, temp1);
514 temp1 = __lasx_xvpickev_h(temp1, temp0);
515 temp0 = __lasx_xvpickev_b(temp1, temp1);
516 __lasx_xvstelm_w(temp0, dest, 0, 0);
517 __lasx_xvstelm_w(temp0, dst1, 0, 5);
518 __lasx_xvstelm_w(temp0, dst2, 0, 4);
519 __lasx_xvstelm_w(temp0, dst3, 0, 1);
523 ptrdiff_t
stride,
int hmode,
int vmode,
526 __m256i in0, in1, in2, in3;
527 __m256i t0, t1, t2, t3, t4, t5, t6, t7;
528 __m256i temp0, temp1, const_para1_2, const_para0_3;
529 __m256i const_r, const_sh;
530 __m256i sh = {0x0000000400000000, 0x0000000500000001,
531 0x0000000600000002, 0x0000000700000003};
532 static const uint8_t para_value[][4] = {{4, 3, 53, 18},
535 static const int shift_value[] = {0, 5, 1, 5};
536 int shift = (shift_value[hmode] + shift_value[vmode]) >> 1;
538 const uint8_t *para_v = para_value[vmode - 1];
539 ptrdiff_t stride2 =
stride << 1;
540 ptrdiff_t stride4 =
stride << 2;
541 ptrdiff_t stride3 = stride2 +
stride;
543 const_r = __lasx_xvreplgr2vr_h(
r);
544 const_sh = __lasx_xvreplgr2vr_h(
shift);
546 const_para0_3 = __lasx_xvldrepl_h(para_v, 0);
547 const_para1_2 = __lasx_xvldrepl_h(para_v, 2);
549 src + stride3, 0, in0, in1, in2, in3);
550 DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
552 DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, temp0, temp1);
553 t0 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
554 t0 = __lasx_xvdp2sub_h_bu(t0, temp1, const_para0_3);
556 in0 = __lasx_xvld(
src, 0);
557 in0 = __lasx_xvpermi_d(in0, 0xD8);
558 DUP2_ARG2(__lasx_xvilvl_b, in3, in2, in0, in1, temp0, temp1);
559 t1 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
560 t1 = __lasx_xvdp2sub_h_bu(t1, temp1, const_para0_3);
562 in1 = __lasx_xvld(
src, 0);
563 in1 = __lasx_xvpermi_d(in1, 0xD8);
564 DUP2_ARG2(__lasx_xvilvl_b, in0, in3, in1, in2, temp0, temp1);
565 t2 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
566 t2 = __lasx_xvdp2sub_h_bu(t2, temp1, const_para0_3);
568 in2 = __lasx_xvld(
src, 0);
569 in2 = __lasx_xvpermi_d(in2, 0xD8);
570 DUP2_ARG2(__lasx_xvilvl_b, in1, in0, in2, in3, temp0, temp1);
571 t3 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
572 t3 = __lasx_xvdp2sub_h_bu(t3, temp1, const_para0_3);
574 in3 = __lasx_xvld(
src, 0);
575 in3 = __lasx_xvpermi_d(in3, 0xD8);
576 DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, temp0, temp1);
577 t4 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
578 t4 = __lasx_xvdp2sub_h_bu(t4, temp1, const_para0_3);
580 in0 = __lasx_xvld(
src, 0);
581 in0 = __lasx_xvpermi_d(in0, 0xD8);
582 DUP2_ARG2(__lasx_xvilvl_b, in3, in2, in0, in1, temp0, temp1);
583 t5 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
584 t5 = __lasx_xvdp2sub_h_bu(t5, temp1, const_para0_3);
586 in1 = __lasx_xvld(
src, 0);
587 in1 = __lasx_xvpermi_d(in1, 0xD8);
588 DUP2_ARG2(__lasx_xvilvl_b, in0, in3, in1, in2, temp0, temp1);
589 t6 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
590 t6 = __lasx_xvdp2sub_h_bu(t6, temp1, const_para0_3);
592 in2 = __lasx_xvld(
src, 0);
593 in2 = __lasx_xvpermi_d(in2, 0xD8);
594 DUP2_ARG2(__lasx_xvilvl_b, in1, in0, in2, in3, temp0, temp1);
595 t7 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
596 t7 = __lasx_xvdp2sub_h_bu(t7, temp1, const_para0_3);
597 DUP4_ARG2(__lasx_xvadd_h, t0, const_r, t1, const_r, t2, const_r, t3,
598 const_r, t0, t1, t2, t3);
599 DUP4_ARG2(__lasx_xvadd_h, t4, const_r, t5, const_r, t6, const_r, t7,
600 const_r, t4, t5, t6, t7);
601 DUP4_ARG2(__lasx_xvsra_h, t0, const_sh, t1, const_sh, t2, const_sh,
602 t3, const_sh, t0, t1, t2, t3);
603 DUP4_ARG2(__lasx_xvsra_h, t4, const_sh, t5, const_sh, t6, const_sh,
604 t7, const_sh, t4, t5, t6, t7);
605 LASX_TRANSPOSE8x8_H(t0, t1, t2, t3, t4, t5, t6, t7, t0,
606 t1, t2, t3, t4, t5, t6, t7);
607 para_v = para_value[hmode - 1];
608 const_para0_3 = __lasx_xvldrepl_h(para_v, 0);
609 const_para1_2 = __lasx_xvldrepl_h(para_v, 2);
610 const_para0_3 = __lasx_vext2xv_h_b(const_para0_3);
611 const_para1_2 = __lasx_vext2xv_h_b(const_para1_2);
613 const_r = __lasx_xvreplgr2vr_w(
r);
614 DUP4_ARG2(__lasx_xvpermi_d, t0, 0x72, t1, 0x72, t2, 0x72, t0, 0xD8,
616 DUP4_ARG2(__lasx_xvpermi_d, t1, 0xD8, t2, 0xD8, t3, 0xD8, t4, 0xD8,
618 DUP2_ARG2(__lasx_xvpermi_d, t5, 0xD8, t6, 0xD8, t5, t6);
619 t7 = __lasx_xvpermi_d(t7, 0xD8);
620 DUP2_ARG2(__lasx_xvilvl_h, t2, t1, t3, t0, temp0, temp1);
621 t0 = __lasx_xvdp2_w_h(temp0, const_para1_2);
622 t0 = __lasx_xvdp2sub_w_h(t0, temp1, const_para0_3);
623 DUP2_ARG2(__lasx_xvilvl_h, t3, t2, t4, t1, temp0, temp1);
624 t1 = __lasx_xvdp2_w_h(temp0, const_para1_2);
625 t1 = __lasx_xvdp2sub_w_h(t1, temp1, const_para0_3);
626 DUP2_ARG2(__lasx_xvilvl_h, t4, t3, t5, t2, temp0, temp1);
627 t2 = __lasx_xvdp2_w_h(temp0, const_para1_2);
628 t2 = __lasx_xvdp2sub_w_h(t2, temp1, const_para0_3);
629 DUP2_ARG2(__lasx_xvilvl_h, t5, t4, t6, t3, temp0, temp1);
630 t3 = __lasx_xvdp2_w_h(temp0, const_para1_2);
631 t3 = __lasx_xvdp2sub_w_h(t3, temp1, const_para0_3);
632 DUP2_ARG2(__lasx_xvilvl_h, t6, t5, t7, t4, temp0, temp1);
633 t4 = __lasx_xvdp2_w_h(temp0, const_para1_2);
634 t4 = __lasx_xvdp2sub_w_h(t4, temp1, const_para0_3);
635 DUP2_ARG2(__lasx_xvilvl_h, t7, t6, in0, t5, temp0, temp1);
636 t5 = __lasx_xvdp2_w_h(temp0, const_para1_2);
637 t5 = __lasx_xvdp2sub_w_h(t5, temp1, const_para0_3);
638 DUP2_ARG2(__lasx_xvilvl_h, in0, t7, in1, t6, temp0, temp1);
639 t6 = __lasx_xvdp2_w_h(temp0, const_para1_2);
640 t6 = __lasx_xvdp2sub_w_h(t6, temp1, const_para0_3);
641 DUP2_ARG2(__lasx_xvilvl_h, in1, in0, in2, t7, temp0, temp1);
642 t7 = __lasx_xvdp2_w_h(temp0, const_para1_2);
643 t7 = __lasx_xvdp2sub_w_h(t7, temp1, const_para0_3);
644 DUP4_ARG2(__lasx_xvadd_w, t0, const_r, t1, const_r, t2, const_r,
645 t3, const_r, t0, t1, t2, t3);
646 DUP4_ARG2(__lasx_xvadd_w, t4, const_r, t5, const_r, t6, const_r,
647 t7, const_r, t4, t5, t6, t7);
648 DUP4_ARG2(__lasx_xvsrai_w, t0, 7, t1, 7, t2, 7, t3, 7, t0, t1, t2, t3);
649 DUP4_ARG2(__lasx_xvsrai_w, t4, 7, t5, 7, t6, 7, t7, 7, t4, t5, t6, t7);
650 LASX_TRANSPOSE8x8_W(t0, t1, t2, t3, t4, t5, t6, t7,
651 t0, t1, t2, t3, t4, t5, t6, t7);
652 DUP4_ARG1(__lasx_xvclip255_w, t0, t1, t2, t3, t0, t1, t2, t3);
653 DUP4_ARG1(__lasx_xvclip255_w, t4, t5, t6, t7, t4, t5, t6, t7);
654 DUP4_ARG2(__lasx_xvpickev_h, t1, t0, t3, t2, t5, t4, t7, t6,
656 DUP2_ARG2(__lasx_xvpickev_b, t1, t0, t3, t2, t0, t1);
657 t0 = __lasx_xvperm_w(t0, sh);
658 t1 = __lasx_xvperm_w(t1, sh);
659 __lasx_xvstelm_d(t0,
dst, 0, 0);
660 __lasx_xvstelm_d(t0,
dst +
stride, 0, 1);
661 __lasx_xvstelm_d(t0,
dst + stride2, 0, 2);
662 __lasx_xvstelm_d(t0,
dst + stride3, 0, 3);
664 __lasx_xvstelm_d(t1,
dst, 0, 0);
665 __lasx_xvstelm_d(t1,
dst +
stride, 0, 1);
666 __lasx_xvstelm_d(t1,
dst + stride2, 0, 2);
667 __lasx_xvstelm_d(t1,
dst + stride3, 0, 3);
670 #define PUT_VC1_MSPEL_MC_LASX(hmode, vmode) \
671 void ff_put_vc1_mspel_mc ## hmode ## vmode ## _lasx(uint8_t *dst, \
672 const uint8_t *src, \
673 ptrdiff_t stride, int rnd) \
675 put_vc1_mspel_mc_h_v_lasx(dst, src, stride, hmode, vmode, rnd); \
677 void ff_put_vc1_mspel_mc ## hmode ## vmode ## _16_lasx(uint8_t *dst, \
678 const uint8_t *src, \
679 ptrdiff_t stride, int rnd) \
681 put_vc1_mspel_mc_h_v_lasx(dst, src, stride, hmode, vmode, rnd); \
682 put_vc1_mspel_mc_h_v_lasx(dst + 8, src + 8, stride, hmode, vmode, rnd); \
683 dst += 8 * stride, src += 8 * stride; \
684 put_vc1_mspel_mc_h_v_lasx(dst, src, stride, hmode, vmode, rnd); \
685 put_vc1_mspel_mc_h_v_lasx(dst + 8, src + 8, stride, hmode, vmode, rnd); \
/*
 * NOTE(review): the start of this definition (return type, name, and the
 * dst/src parameters) is not visible in this extract.  From the body it is
 * an 8-pixel-wide bilinear chroma MC kernel:
 *   out = (A*p00 + B*p01 + C*p10 + D*p11 + 28) >> 6
 * i.e. the no-rounding variant (bias 28 rather than 32) -- confirm the
 * exact entry-point name against the full file.
 */
702 ptrdiff_t
stride,
int h,
int x,
int y)
/* Bilinear weights from the 1/8-pel fractional offsets; A+B+C+D == 64. */
704 const int intA = (8 - x) * (8 - y);
705 const int intB = (x) * (8 - y);
706 const int intC = (8 - x) * (y);
707 const int intD = (x) * (y);
708 __m256i src00, src01, src10, src11;
712 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
/*
 * Broadcast the scalar weights across all 16-bit lanes.  The vector
 * declarations for A..D are in lines not visible in this extract.
 */
714 A = __lasx_xvreplgr2vr_h(intA);
715 B = __lasx_xvreplgr2vr_h(intB);
716 C = __lasx_xvreplgr2vr_h(intC);
717 D = __lasx_xvreplgr2vr_h(intD);
718 for(
i = 0;
i <
h;
i++){
/*
 * Per output row: widen the four neighbouring sample vectors to u16 and
 * apply the weights.  The loads that fill src00/src01/src10/src11 are in
 * lines not visible in this extract -- presumably row pairs at src and
 * src + stride, offset by one byte for the 01/11 taps; verify.
 */
723 DUP4_ARG1(__lasx_vext2xv_hu_bu, src00, src01, src10, src11,
724 src00, src01, src10, src11);
725 DUP4_ARG2(__lasx_xvmul_h, src00,
A, src01,
B, src10,
C, src11,
D,
726 src00, src01, src10, src11);
727 src00 = __lasx_xvadd_h(src00, src01);
728 src10 = __lasx_xvadd_h(src10, src11);
729 src00 = __lasx_xvadd_h(src00, src10);
/* +28 then >>6: normalise (weights sum to 64) with the no-rounding bias. */
730 src00 = __lasx_xvaddi_hu(src00, 28);
731 src00 = __lasx_xvsrli_h(src00, 6);
/* Narrow back to bytes and store 8 output pixels for this row. */
732 src00 = __lasx_xvpickev_b(src00, src00);
733 __lasx_xvstelm_d(src00,
dst, 0, 0);
/*
 * NOTE(review): the signature of this function is not visible in this
 * extract.  From the body it is the 16-wide vertical mspel filter
 * (put_vc1_mspel_mc_v_lasx): a 4-tap FIR applied down the columns, then
 * rounded, shifted, clipped to [0,255] and stored one 16-byte row per
 * iteration.
 */
741 __m256i in0, in1, in2, in3, temp0, temp1, t0;
742 __m256i const_para0_3, const_para1_2, const_r, const_sh;
/*
 * Per-mode packed FIR coefficient pairs: the two bytes of each u16 are a
 * tap pair (0x0304 -> outer taps 3 and 4, subtracted below; 0x1235 ->
 * centre taps 0x12 and 0x35, accumulated below).  Only the first table
 * row is visible in this extract; the remaining vmode rows are in
 * dropped lines.  Presumably the VC-1 bicubic/shifted mspel coefficient
 * sets -- confirm against the spec.
 */
743 static const uint16_t para_value[][2] = {{0x0304, 0x1235},
746 const uint16_t *para_v = para_value[vmode - 1];
747 static const int shift_value[] = {0, 6, 4, 6};
/*
 * NOTE(review): function-local mutable static that is written on every
 * call just below -- not thread-safe or reentrant.  A plain automatic
 * array would behave identically; flagging rather than changing since
 * this block is incomplete in the extract.
 */
748 static int add_value[3];
749 ptrdiff_t stride_2x =
stride << 1;
/* Rounding bias depends on rnd; [vmode - 1] selects bias and shift. */
751 add_value[2] = add_value[0] = 31 +
rnd, add_value[1] = 7 +
rnd;
753 const_r = __lasx_xvreplgr2vr_h(add_value[vmode - 1]);
754 const_sh = __lasx_xvreplgr2vr_h(shift_value[vmode]);
755 const_para0_3 = __lasx_xvreplgr2vr_h(*para_v);
756 const_para1_2 = __lasx_xvreplgr2vr_h(*(para_v + 1));
/*
 * Prime the 4-row sliding window (the loads of in0..in2 and the loop
 * counter initialisation are in lines not visible in this extract).
 * 0xD8 permute puts each 128-bit half in the order the interleaves need.
 */
760 in0 = __lasx_xvpermi_d(in0, 0xD8);
761 in1 = __lasx_xvpermi_d(in1, 0xD8);
762 in2 = __lasx_xvpermi_d(in2, 0xD8);
763 for (;
i < 16;
i++) {
764 in3 = __lasx_xvld(
src + stride_2x, 0);
765 in3 = __lasx_xvpermi_d(in3, 0xD8);
/*
 * Vertical 4-tap FIR: interleave centre rows (in2,in1) and outer rows
 * (in3,in0), accumulate centre taps with xvdp2 and subtract the outer
 * taps with xvdp2sub, then round, shift, clip and narrow.
 */
766 DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, temp0, temp1);
767 t0 = __lasx_xvdp2_h_bu(temp0, const_para1_2);
768 t0 = __lasx_xvdp2sub_h_bu(t0, temp1, const_para0_3);
769 t0 = __lasx_xvadd_h(t0, const_r);
770 t0 = __lasx_xvsra_h(t0, const_sh);
771 t0 = __lasx_xvclip255_h(t0);
772 t0 = __lasx_xvpickev_b(t0, t0);
/* Store 16 output bytes (elements 0 and 2 hold the two 8-byte halves). */
773 __lasx_xvstelm_d(t0,
dst, 0, 0);
774 __lasx_xvstelm_d(t0,
dst, 8, 2);
/*
 * Emitter for the vertical-only 16x16 mspel entry points
 * (ff_put_vc1_mspel_mc0<vmode>_16_lasx), forwarding to the shared
 * put_vc1_mspel_mc_v_lasx() helper.
 * NOTE(review): the function-body braces are not visible in this
 * extract -- verify against the full file.
 */
783 #define PUT_VC1_MSPEL_MC_V_LASX(vmode) \
784 void ff_put_vc1_mspel_mc0 ## vmode ## _16_lasx(uint8_t *dst, \
785 const uint8_t *src, \
786 ptrdiff_t stride, int rnd) \
788 put_vc1_mspel_mc_v_lasx(dst, src, stride, vmode, rnd); \
/*
 * Filter one (transposed) row with the 4-tap mspel FIR:
 * interleave the centre taps (in2,in1) and the outer taps (in3,in0),
 * accumulate centre*coeffs with xvdp2, subtract outer*coeffs with
 * xvdp2sub, then round, shift, clip to [0,255], narrow to bytes and
 * restore 128-bit lane order with the 0xd8 permute.
 * Relies on tmp0_m, tmp1_m, const_para0_3, const_para1_2, const_r and
 * const_sh being declared in the expanding function's scope.
 */
795 #define ROW_LASX(in0, in1, in2, in3, out0) \
796 DUP2_ARG2(__lasx_xvilvl_b, in2, in1, in3, in0, tmp0_m, tmp1_m); \
797 out0 = __lasx_xvdp2_h_bu(tmp0_m, const_para1_2); \
798 out0 = __lasx_xvdp2sub_h_bu(out0, tmp1_m, const_para0_3); \
799 out0 = __lasx_xvadd_h(out0, const_r); \
800 out0 = __lasx_xvsra_h(out0, const_sh); \
801 out0 = __lasx_xvclip255_h(out0); \
802 out0 = __lasx_xvpickev_b(out0, out0); \
803 out0 = __lasx_xvpermi_d(out0, 0xd8); \
/*
 * NOTE(review): the signature of this function is not visible in this
 * extract.  From the body it is the 16x16 horizontal mspel filter
 * (put_vc1_mspel_mc_h_lasx): transpose the source block so columns become
 * rows, run the 4-tap FIR on each row with ROW_LASX, transpose back, and
 * store 16 rows of 16 bytes.  Confirm against the full file.
 */
808 __m256i in0, in1, in2, in3, in4, in5, in6, in7,
809 in8, in9, in10, in11, in12, in13, in14, in15;
810 __m256i out0, out1, out2, out3, out4, out5, out6, out7, out8, out9,
811 out10, out11, out12, out13, out14, out15, out16, out17, out18;
812 __m256i const_para0_3, const_para1_2, const_r, const_sh;
813 __m256i tmp0_m, tmp1_m, tmp2_m, tmp3_m;
814 __m256i tmp4_m, tmp5_m, tmp6_m, tmp7_m;
815 __m256i t0, t1, t2, t3, t4, t5, t6, t7;
816 ptrdiff_t stride2 =
stride << 1;
817 ptrdiff_t stride4 =
stride << 2;
818 ptrdiff_t stride3 = stride2 +
stride;
/*
 * Per-mode packed FIR coefficient pairs (same layout as the vertical
 * helper: outer-tap pair, centre-tap pair per u16).  Only the first row
 * of the table is visible in this extract.
 */
819 static const uint16_t para_value[][2] = {{0x0304, 0x1235},
822 const uint16_t *para_v = para_value[hmode - 1];
823 static const int shift_value[] = {0, 6, 4, 6};
/*
 * NOTE(review): function-local mutable static written on every call just
 * below -- not thread-safe or reentrant; a plain automatic array would
 * behave identically.  Flagging rather than changing since this block is
 * incomplete in the extract.
 */
824 static int add_value[3];
/*
 * NOTE(review): casts away const on src (back up one byte for the first
 * filter tap).  Only reads through _src are visible in this extract --
 * confirm no writes occur in the dropped lines.
 */
825 uint8_t *
_src = (uint8_t*)
src - 1;
826 add_value[2] = add_value[0] = 32 -
rnd, add_value[1] = 8 -
rnd;
828 const_r = __lasx_xvreplgr2vr_h(add_value[hmode - 1]);
829 const_sh = __lasx_xvreplgr2vr_h(shift_value[hmode]);
830 const_para0_3 = __lasx_xvreplgr2vr_h(*para_v);
831 const_para1_2 = __lasx_xvreplgr2vr_h(*(para_v + 1));
/*
 * Load 16 source rows in four groups of four.  The loads of rows 1/2 of
 * each group and the `_src += stride4`-style pointer advances between
 * groups are in lines not visible in this extract.
 */
833 in0 = __lasx_xvld(
_src, 0);
835 in3 = __lasx_xvldx(
_src, stride3);
837 in4 = __lasx_xvld(
_src, 0);
839 in7 = __lasx_xvldx(
_src, stride3);
841 in8 = __lasx_xvld(
_src, 0);
843 in11 = __lasx_xvldx(
_src, stride3);
845 in12 = __lasx_xvld(
_src, 0);
847 in15 = __lasx_xvldx(
_src, stride3);
/*
 * Byte-level transpose, stage 1 (low halves of the input rows): the
 * classic ilvl/ilvh ladder through byte, word and doubleword widths
 * producing transposed rows out0..out7.  Some continuation lines of the
 * word-interleave calls are not visible in this extract.
 */
848 DUP4_ARG2(__lasx_xvilvl_b, in2, in0, in3, in1, in6, in4, in7, in5,
849 tmp0_m, tmp1_m, tmp2_m, tmp3_m);
850 DUP4_ARG2(__lasx_xvilvl_b, in10, in8, in11, in9, in14, in12, in15, in13,
851 tmp4_m, tmp5_m, tmp6_m, tmp7_m);
852 DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
853 tmp7_m, tmp6_m, t0, t2, t4, t6);
854 DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
855 tmp7_m, tmp6_m, t1, t3, t5, t7);
856 DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
858 DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
860 DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
861 tmp7_m, tmp6_m, out0, out2, out4, out6);
862 DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
863 tmp7_m, tmp6_m, out1, out3, out5, out7);
/* Transpose stage 2 (high halves of the input rows) -> out8..out15. */
865 DUP4_ARG2(__lasx_xvilvh_b, in2, in0, in3, in1, in6, in4, in7, in5,
866 tmp0_m, tmp1_m, tmp2_m, tmp3_m);
867 DUP4_ARG2(__lasx_xvilvh_b, in10, in8, in11, in9, in14, in12, in15, in13,
868 tmp4_m, tmp5_m, tmp6_m, tmp7_m);
869 DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
870 tmp7_m, tmp6_m, t0, t2, t4, t6);
871 DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
872 tmp7_m, tmp6_m, t1, t3, t5, t7);
873 DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
875 DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
877 DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
878 tmp7_m, tmp6_m, out8, out10, out12, out14);
879 DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
880 tmp7_m, tmp6_m, out9, out11, out13, out15);
/*
 * out16..out18: the three extra trailing rows the 4-tap filter needs,
 * taken from the upper 128-bit lanes of out0..out2 (xvpermi_q 0x31).
 */
881 DUP2_ARG3(__lasx_xvpermi_q, out0, out0, 0x31, out1, out1, 0x31, out16, out17);
882 out18 = __lasx_xvpermi_q(out2, out2, 0x31);
/* Reorder 64-bit lanes (0xD8) into the layout ROW_LASX expects. */
884 DUP4_ARG2(__lasx_xvpermi_d, out0, 0xD8, out1, 0xD8, out2, 0xD8, out3, 0xD8,
885 out0, out1, out2, out3);
886 DUP4_ARG2(__lasx_xvpermi_d, out4, 0xD8, out5, 0xD8, out6, 0xD8, out7, 0xD8,
887 out4, out5, out6, out7);
888 DUP4_ARG2(__lasx_xvpermi_d, out8, 0xD8, out9, 0xD8, out10, 0xD8, out11,
889 0xD8, out8, out9, out10, out11);
890 DUP4_ARG2(__lasx_xvpermi_d, out12, 0xD8, out13, 0xD8, out14, 0xD8, out15,
891 0xD8, out12, out13, out14, out15);
892 out16 = __lasx_xvpermi_d(out16, 0xD8);
893 out17 = __lasx_xvpermi_d(out17, 0xD8);
894 out18 = __lasx_xvpermi_d(out18, 0xD8);
/*
 * Apply the 4-tap FIR to each transposed row with a sliding 4-row
 * window; results land back in in0..in15.
 */
896 ROW_LASX(out0, out1, out2, out3, in0);
897 ROW_LASX(out1, out2, out3, out4, in1);
898 ROW_LASX(out2, out3, out4, out5, in2);
899 ROW_LASX(out3, out4, out5, out6, in3);
900 ROW_LASX(out4, out5, out6, out7, in4);
901 ROW_LASX(out5, out6, out7, out8, in5);
902 ROW_LASX(out6, out7, out8, out9, in6);
903 ROW_LASX(out7, out8, out9, out10, in7);
904 ROW_LASX(out8, out9, out10, out11, in8);
905 ROW_LASX(out9, out10, out11, out12, in9);
906 ROW_LASX(out10, out11, out12, out13, in10);
907 ROW_LASX(out11, out12, out13, out14, in11);
908 ROW_LASX(out12, out13, out14, out15, in12);
909 ROW_LASX(out13, out14, out15, out16, in13);
910 ROW_LASX(out14, out15, out16, out17, in14);
911 ROW_LASX(out15, out16, out17, out18, in15);
/* Transpose the filtered rows back to raster order (same ladder again). */
913 DUP4_ARG2(__lasx_xvilvl_b, in2, in0, in3, in1, in6, in4, in7, in5,
914 tmp0_m, tmp1_m, tmp2_m, tmp3_m);
915 DUP4_ARG2(__lasx_xvilvl_b, in10, in8, in11, in9, in14, in12, in15, in13,
916 tmp4_m, tmp5_m, tmp6_m, tmp7_m);
917 DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
918 tmp7_m, tmp6_m, t0, t2, t4, t6);
919 DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
920 tmp7_m, tmp6_m, t1, t3, t5, t7);
921 DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
923 DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
925 DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
926 tmp7_m, tmp6_m, out0, out2, out4, out6);
927 DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
928 tmp7_m, tmp6_m, out1, out3, out5, out7);
930 DUP4_ARG2(__lasx_xvilvh_b, in2, in0, in3, in1, in6, in4, in7, in5,
931 tmp0_m, tmp1_m, tmp2_m, tmp3_m);
932 DUP4_ARG2(__lasx_xvilvh_b, in10, in8, in11, in9, in14, in12, in15, in13,
933 tmp4_m, tmp5_m, tmp6_m, tmp7_m);
934 DUP4_ARG2(__lasx_xvilvl_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
935 tmp7_m, tmp6_m, t0, t2, t4, t6);
936 DUP4_ARG2(__lasx_xvilvh_b, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
937 tmp7_m, tmp6_m, t1, t3, t5, t7);
938 DUP4_ARG2(__lasx_xvilvl_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp0_m, tmp4_m,
940 DUP4_ARG2(__lasx_xvilvh_w, t2, t0, t3, t1, t6, t4, t7, t5, tmp2_m, tmp6_m,
942 DUP4_ARG2(__lasx_xvilvl_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
943 tmp7_m, tmp6_m, out8, out10, out12, out14);
944 DUP4_ARG2(__lasx_xvilvh_d, tmp1_m, tmp0_m, tmp3_m, tmp2_m, tmp5_m, tmp4_m,
945 tmp7_m, tmp6_m, out9, out11, out13, out15);
/*
 * Store 16 output rows of 16 bytes (two 8-byte element stores per row).
 * The `dst += stride` advances between row pairs are in lines not
 * visible in this extract.
 */
946 __lasx_xvstelm_d(out0,
dst, 0, 0);
947 __lasx_xvstelm_d(out0,
dst, 8, 1);
949 __lasx_xvstelm_d(out1,
dst, 0, 0);
950 __lasx_xvstelm_d(out1,
dst, 8, 1);
952 __lasx_xvstelm_d(out2,
dst, 0, 0);
953 __lasx_xvstelm_d(out2,
dst, 8, 1);
955 __lasx_xvstelm_d(out3,
dst, 0, 0);
956 __lasx_xvstelm_d(out3,
dst, 8, 1);
958 __lasx_xvstelm_d(out4,
dst, 0, 0);
959 __lasx_xvstelm_d(out4,
dst, 8, 1);
961 __lasx_xvstelm_d(out5,
dst, 0, 0);
962 __lasx_xvstelm_d(out5,
dst, 8, 1);
964 __lasx_xvstelm_d(out6,
dst, 0, 0);
965 __lasx_xvstelm_d(out6,
dst, 8, 1);
967 __lasx_xvstelm_d(out7,
dst, 0, 0);
968 __lasx_xvstelm_d(out7,
dst, 8, 1);
970 __lasx_xvstelm_d(out8,
dst, 0, 0);
971 __lasx_xvstelm_d(out8,
dst, 8, 1);
973 __lasx_xvstelm_d(out9,
dst, 0, 0);
974 __lasx_xvstelm_d(out9,
dst, 8, 1);
976 __lasx_xvstelm_d(out10,
dst, 0, 0);
977 __lasx_xvstelm_d(out10,
dst, 8, 1);
979 __lasx_xvstelm_d(out11,
dst, 0, 0);
980 __lasx_xvstelm_d(out11,
dst, 8, 1);
982 __lasx_xvstelm_d(out12,
dst, 0, 0);
983 __lasx_xvstelm_d(out12,
dst, 8, 1);
985 __lasx_xvstelm_d(out13,
dst, 0, 0);
986 __lasx_xvstelm_d(out13,
dst, 8, 1);
988 __lasx_xvstelm_d(out14,
dst, 0, 0);
989 __lasx_xvstelm_d(out14,
dst, 8, 1);
991 __lasx_xvstelm_d(out15,
dst, 0, 0);
992 __lasx_xvstelm_d(out15,
dst, 8, 1);
995 #define PUT_VC1_MSPEL_MC_H_LASX(hmode) \
996 void ff_put_vc1_mspel_mc ## hmode ## 0_16_lasx(uint8_t *dst, \
997 const uint8_t *src, \
998 ptrdiff_t stride, int rnd) \
1000 put_vc1_mspel_mc_h_lasx(dst, src, stride, hmode, rnd); \