FFmpeg
loongson_intrinsics.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021 Loongson Technology Corporation Limited
3  * All rights reserved.
4  * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
5  * Xiwei Gu <guxiwei-hf@loongson.cn>
6  * Lu Wang <wanglu@loongson.cn>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  *
24  */
25 
26 #ifndef AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H
27 #define AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H
28 
29 /*
30  * Copyright (c) 2021 Loongson Technology Corporation Limited
31  * All rights reserved.
32  * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
33  * Xiwei Gu <guxiwei-hf@loongson.cn>
34  * Lu Wang <wanglu@loongson.cn>
35  *
36  * This file is a header file for the LoongArch builtin extension.
37  *
38  */
39 
40 #ifndef LOONGSON_INTRINSICS_H
41 #define LOONGSON_INTRINSICS_H
42 
/**
 * Version of this helper header.
 *   MAJOR: bumped when macro usage changes.
 *   MINOR: bumped when functions are added or a bug is fixed.
 *   MICRO: bumped for comment-only or implementation-only changes.
 */
#define LSOM_VERSION_MAJOR 1
#define LSOM_VERSION_MINOR 0
#define LSOM_VERSION_MICRO 3
51 
/*
 * Apply the one-operand operation _OP to two independent inputs:
 *   _DST0 = _OP(_A0);  _DST1 = _OP(_A1);
 * Expands to a braced block, so it can only be used where a statement is valid.
 */
#define DUP2_ARG1(_OP, _A0, _A1, _DST0, _DST1) \
{                                              \
    _DST0 = _OP(_A0);                          \
    _DST1 = _OP(_A1);                          \
}
57 
/*
 * Apply the two-operand operation _OP to two independent operand pairs:
 *   _DST0 = _OP(_A0, _B0);  _DST1 = _OP(_A1, _B1);
 * Expands to a braced block, so it can only be used where a statement is valid.
 */
#define DUP2_ARG2(_OP, _A0, _B0, _A1, _B1, _DST0, _DST1) \
{                                                        \
    _DST0 = _OP(_A0, _B0);                               \
    _DST1 = _OP(_A1, _B1);                               \
}
63 
/*
 * Apply the three-operand operation _OP to two independent operand triples:
 *   _DST0 = _OP(_A0, _B0, _C0);  _DST1 = _OP(_A1, _B1, _C1);
 * Expands to a braced block, so it can only be used where a statement is valid.
 */
#define DUP2_ARG3(_OP, _A0, _B0, _C0, _A1, _B1, _C1, _DST0, _DST1) \
{                                                                  \
    _DST0 = _OP(_A0, _B0, _C0);                                    \
    _DST1 = _OP(_A1, _B1, _C1);                                    \
}
69 
/*
 * Apply the one-operand operation _OP to four independent inputs:
 *   _DSTn = _OP(_An)  for n = 0..3, evaluated in that order.
 * Expands to a braced block, so it can only be used where a statement is valid.
 */
#define DUP4_ARG1(_OP, _A0, _A1, _A2, _A3, _DST0, _DST1, _DST2, _DST3) \
{                                                                      \
    _DST0 = _OP(_A0);                                                  \
    _DST1 = _OP(_A1);                                                  \
    _DST2 = _OP(_A2);                                                  \
    _DST3 = _OP(_A3);                                                  \
}
75 
/*
 * Apply the two-operand operation _OP to four independent operand pairs:
 *   _DSTn = _OP(_An, _Bn)  for n = 0..3, evaluated in that order.
 * Expands to a braced block, so it can only be used where a statement is valid.
 */
#define DUP4_ARG2(_OP, _A0, _B0, _A1, _B1, _A2, _B2, _A3, _B3, \
                  _DST0, _DST1, _DST2, _DST3)                  \
{                                                              \
    _DST0 = _OP(_A0, _B0);                                     \
    _DST1 = _OP(_A1, _B1);                                     \
    _DST2 = _OP(_A2, _B2);                                     \
    _DST3 = _OP(_A3, _B3);                                     \
}
82 
/*
 * Apply the three-operand operation _OP to four independent operand triples:
 *   _DSTn = _OP(_An, _Bn, _Cn)  for n = 0..3, evaluated in that order.
 * Expands to a braced block, so it can only be used where a statement is valid.
 */
#define DUP4_ARG3(_OP, _A0, _B0, _C0, _A1, _B1, _C1, _A2, _B2, _C2, \
                  _A3, _B3, _C3, _DST0, _DST1, _DST2, _DST3)        \
{                                                                   \
    _DST0 = _OP(_A0, _B0, _C0);                                     \
    _DST1 = _OP(_A1, _B1, _C1);                                     \
    _DST2 = _OP(_A2, _B2, _C2);                                     \
    _DST3 = _OP(_A3, _B3, _C3);                                     \
}
89 
90 #ifdef __loongarch_sx
91 #include <lsxintrin.h>
92 /*
93  * =============================================================================
94  * Description : Dot product & addition of byte vector elements
95  * Arguments : Inputs - in_c, in_h, in_l
96  * Outputs - out
97  * Retrun Type - halfword
98  * Details : Signed byte elements from in_h are multiplied by
99  * signed byte elements from in_l, and then added adjacent to
100  * each other to get results with the twice size of input.
101  * Then the results plus to signed half word elements from in_c.
102  * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
103  * in_c : 1,2,3,4, 1,2,3,4
104  * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
105  * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
106  * out : 23,40,41,26, 23,40,41,26
107  * =============================================================================
108  */
109 static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, __m128i in_l)
110 {
111  __m128i out;
112 
113  out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
114  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
115  return out;
116 }
117 
118 /*
119  * =============================================================================
120  * Description : Dot product & addition of byte vector elements
121  * Arguments : Inputs - in_c, in_h, in_l
122  * Outputs - out
123  * Retrun Type - halfword
124  * Details : Unsigned byte elements from in_h are multiplied by
125  * unsigned byte elements from in_l, and then added adjacent to
126  * each other to get results with the twice size of input.
127  * The results plus to signed half word elements from in_c.
128  * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
129  * in_c : 1,2,3,4, 1,2,3,4
130  * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
131  * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
132  * out : 23,40,41,26, 23,40,41,26
133  * =============================================================================
134  */
135 static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, __m128i in_l)
136 {
137  __m128i out;
138 
139  out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
140  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
141  return out;
142 }
143 
144 /*
145  * =============================================================================
146  * Description : Dot product & addition of half word vector elements
147  * Arguments : Inputs - in_c, in_h, in_l
148  * Outputs - out
149  * Retrun Type - __m128i
150  * Details : Signed half word elements from in_h are multiplied by
151  * signed half word elements from in_l, and then added adjacent to
152  * each other to get results with the twice size of input.
153  * Then the results plus to signed word elements from in_c.
154  * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
155  * in_c : 1,2,3,4
156  * in_h : 1,2,3,4, 5,6,7,8
157  * in_l : 8,7,6,5, 4,3,2,1
158  * out : 23,40,41,26
159  * =============================================================================
160  */
161 static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l)
162 {
163  __m128i out;
164 
165  out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
166  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
167  return out;
168 }
169 
170 /*
171  * =============================================================================
172  * Description : Dot product of byte vector elements
173  * Arguments : Inputs - in_h, in_l
174  * Outputs - out
175  * Retrun Type - halfword
176  * Details : Signed byte elements from in_h are multiplied by
177  * signed byte elements from in_l, and then added adjacent to
178  * each other to get results with the twice size of input.
179  * Example : out = __lsx_vdp2_h_b(in_h, in_l)
180  * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
181  * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
182  * out : 22,38,38,22, 22,38,38,22
183  * =============================================================================
184  */
185 static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
186 {
187  __m128i out;
188 
189  out = __lsx_vmulwev_h_b(in_h, in_l);
190  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
191  return out;
192 }
193 
194 /*
195  * =============================================================================
196  * Description : Dot product of byte vector elements
197  * Arguments : Inputs - in_h, in_l
198  * Outputs - out
199  * Retrun Type - halfword
200  * Details : Unsigned byte elements from in_h are multiplied by
201  * unsigned byte elements from in_l, and then added adjacent to
202  * each other to get results with the twice size of input.
203  * Example : out = __lsx_vdp2_h_bu(in_h, in_l)
204  * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
205  * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
206  * out : 22,38,38,22, 22,38,38,22
207  * =============================================================================
208  */
209 static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
210 {
211  __m128i out;
212 
213  out = __lsx_vmulwev_h_bu(in_h, in_l);
214  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
215  return out;
216 }
217 
218 /*
219  * =============================================================================
220  * Description : Dot product of byte vector elements
221  * Arguments : Inputs - in_h, in_l
222  * Outputs - out
223  * Retrun Type - halfword
224  * Details : Unsigned byte elements from in_h are multiplied by
225  * signed byte elements from in_l, and then added adjacent to
226  * each other to get results with the twice size of input.
227  * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l)
228  * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
229  * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
230  * out : 22,38,38,22, 22,38,38,6
231  * =============================================================================
232  */
233 static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
234 {
235  __m128i out;
236 
237  out = __lsx_vmulwev_h_bu_b(in_h, in_l);
238  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
239  return out;
240 }
241 
242 /*
243  * =============================================================================
244  * Description : Dot product of byte vector elements
245  * Arguments : Inputs - in_h, in_l
246  * Outputs - out
247  * Retrun Type - halfword
248  * Details : Signed byte elements from in_h are multiplied by
249  * signed byte elements from in_l, and then added adjacent to
250  * each other to get results with the twice size of input.
251  * Example : out = __lsx_vdp2_w_h(in_h, in_l)
252  * in_h : 1,2,3,4, 5,6,7,8
253  * in_l : 8,7,6,5, 4,3,2,1
254  * out : 22,38,38,22
255  * =============================================================================
256  */
257 static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l)
258 {
259  __m128i out;
260 
261  out = __lsx_vmulwev_w_h(in_h, in_l);
262  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
263  return out;
264 }
265 
266 /*
267  * =============================================================================
268  * Description : Clip all halfword elements of input vector between min & max
269  * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) : (_in))
270  * Arguments : Inputs - _in (input vector)
271  * - min (min threshold)
272  * - max (max threshold)
273  * Outputs - out (output vector with clipped elements)
274  * Return Type - signed halfword
275  * Example : out = __lsx_vclip_h(_in)
276  * _in : -8,2,280,249, -8,255,280,249
277  * min : 1,1,1,1, 1,1,1,1
278  * max : 9,9,9,9, 9,9,9,9
279  * out : 1,2,9,9, 1,9,9,9
280  * =============================================================================
281  */
282 static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max)
283 {
284  __m128i out;
285 
286  out = __lsx_vmax_h(min, _in);
287  out = __lsx_vmin_h(max, out);
288  return out;
289 }
290 
291 /*
292  * =============================================================================
293  * Description : Set each element of vector between 0 and 255
294  * Arguments : Inputs - _in
295  * Outputs - out
296  * Retrun Type - halfword
297  * Details : Signed byte elements from _in are clamped between 0 and 255.
298  * Example : out = __lsx_vclip255_h(_in)
299  * _in : -8,255,280,249, -8,255,280,249
300  * out : 0,255,255,249, 0,255,255,249
301  * =============================================================================
302  */
303 static inline __m128i __lsx_vclip255_h(__m128i _in)
304 {
305  __m128i out;
306 
307  out = __lsx_vmaxi_h(_in, 0);
308  out = __lsx_vsat_hu(out, 7);
309  return out;
310 }
311 
312 /*
313  * =============================================================================
314  * Description : Set each element of vector between 0 and 255
315  * Arguments : Inputs - _in
316  * Outputs - out
317  * Retrun Type - word
318  * Details : Signed byte elements from _in are clamped between 0 and 255.
319  * Example : out = __lsx_vclip255_w(_in)
320  * _in : -8,255,280,249
321  * out : 0,255,255,249
322  * =============================================================================
323  */
324 static inline __m128i __lsx_vclip255_w(__m128i _in)
325 {
326  __m128i out;
327 
328  out = __lsx_vmaxi_w(_in, 0);
329  out = __lsx_vsat_wu(out, 7);
330  return out;
331 }
332 
/*
 * =============================================================================
 * Description : Swap two vector variables
 * Arguments   : Inputs  - _in0, _in1
 *               Outputs - _in0, _in1 (in-place)
 * Details     : The two variables exchange their values through a temporary.
 *               Unlike an XOR swap, this stays correct when both arguments
 *               name the same variable (the value is left unchanged instead
 *               of being zeroed).
 * Example     : LSX_SWAP(_in0, _in1)
 *         _in0 : 1,2,3,4
 *         _in1 : 5,6,7,8
 *   _in0(out) : 5,6,7,8
 *   _in1(out) : 1,2,3,4
 * =============================================================================
 */
#define LSX_SWAP(_in0, _in1)           \
{                                      \
    __m128i _lsx_swap_tmp_m = (_in0);  \
    _in0 = (_in1);                     \
    _in1 = _lsx_swap_tmp_m;            \
}
352 
/*
 * =============================================================================
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Rows are first interleaved pairwise at word granularity, then
 *               the pair results are interleaved at doubleword granularity.
 *               All inputs are read before any output is written.
 * Example     :
 *                1, 2, 3, 4            1, 5, 9,13
 *                5, 6, 7, 8    to      2, 6,10,14
 *                9,10,11,12  =====>    3, 7,11,15
 *               13,14,15,16            4, 8,12,16
 * =============================================================================
 */
#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
{                                                                              \
    __m128i _lo01_m, _hi01_m, _lo23_m, _hi23_m;                                \
                                                                               \
    /* word interleave of rows 0/1 and rows 2/3 (low and high halves) */       \
    _lo01_m = __lsx_vilvl_w(_in1, _in0);                                       \
    _hi01_m = __lsx_vilvh_w(_in1, _in0);                                       \
    _lo23_m = __lsx_vilvl_w(_in3, _in2);                                       \
    _hi23_m = __lsx_vilvh_w(_in3, _in2);                                       \
                                                                               \
    /* doubleword interleave stitches the two row pairs together */            \
    _out0 = __lsx_vilvl_d(_lo23_m, _lo01_m);                                   \
    _out1 = __lsx_vilvh_d(_lo23_m, _lo01_m);                                   \
    _out2 = __lsx_vilvl_d(_hi23_m, _hi01_m);                                   \
    _out3 = __lsx_vilvh_d(_hi23_m, _hi01_m);                                   \
}
379 
/*
 * =============================================================================
 * Description : Transpose 8x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. Only the low 8 bytes of every input are used. The odd
 *               outputs are derived from the even ones with a byte shuffle
 *               against a zero vector. All inputs are read before any output
 *               is written. The macro-local helpers carry an "_m" suffix so
 *               they cannot capture caller variables named "zero" or "shuf8".
 * Example     : LSX_TRANSPOSE8x8_B
 *        _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
 *
 *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
 *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
 *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 *       _out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00
 *       _out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
 *       _out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00
 *       _out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{                                                                                 \
    __m128i _zero_m = {0};                                                        \
    __m128i _shuf8_m = {0x0F0E0D0C0B0A0908, 0x1716151413121110};                  \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                               \
                                                                                  \
    /* byte interleave, two levels deep */                                        \
    _t0 = __lsx_vilvl_b(_in2, _in0);                                              \
    _t1 = __lsx_vilvl_b(_in3, _in1);                                              \
    _t2 = __lsx_vilvl_b(_in6, _in4);                                              \
    _t3 = __lsx_vilvl_b(_in7, _in5);                                              \
    _t4 = __lsx_vilvl_b(_t1, _t0);                                                \
    _t5 = __lsx_vilvh_b(_t1, _t0);                                                \
    _t6 = __lsx_vilvl_b(_t3, _t2);                                                \
    _t7 = __lsx_vilvh_b(_t3, _t2);                                                \
    /* word interleave produces the even output rows */                           \
    _out0 = __lsx_vilvl_w(_t6, _t4);                                              \
    _out2 = __lsx_vilvh_w(_t6, _t4);                                              \
    _out4 = __lsx_vilvl_w(_t7, _t5);                                              \
    _out6 = __lsx_vilvh_w(_t7, _t5);                                              \
    /* odd output rows: shuffle of the matching even row with the zero vector */  \
    _out1 = __lsx_vshuf_b(_zero_m, _out0, _shuf8_m);                              \
    _out3 = __lsx_vshuf_b(_zero_m, _out2, _shuf8_m);                              \
    _out5 = __lsx_vshuf_b(_zero_m, _out4, _shuf8_m);                              \
    _out7 = __lsx_vshuf_b(_zero_m, _out6, _shuf8_m);                              \
}
430 
/*
 * =============================================================================
 * Description : Transpose 8x8 block with half word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
 * Details     : Rows 0-3 and rows 4-7 are interleaved separately at halfword
 *               granularity (low and high halves), then matching quarters are
 *               combined by picking even/odd doublewords. All inputs are read
 *               before any output is written.
 * Example     :
 *              00,01,02,03,04,05,06,07           00,10,20,30,40,50,60,70
 *              10,11,12,13,14,15,16,17           01,11,21,31,41,51,61,71
 *              20,21,22,23,24,25,26,27           02,12,22,32,42,52,62,72
 *              30,31,32,33,34,35,36,37    to     03,13,23,33,43,53,63,73
 *              40,41,42,43,44,45,46,47  ======>  04,14,24,34,44,54,64,74
 *              50,51,52,53,54,55,56,57           05,15,25,35,45,55,65,75
 *              60,61,62,63,64,65,66,67           06,16,26,36,46,56,66,76
 *              70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{                                                                                 \
    __m128i _ev_m, _od_m;                                                         \
    __m128i _top0_m, _top1_m, _top2_m, _top3_m;                                   \
    __m128i _bot0_m, _bot1_m, _bot2_m, _bot3_m;                                   \
                                                                                  \
    /* rows 0-3, low halves */                                                    \
    _ev_m   = __lsx_vilvl_h(_in2, _in0);                                          \
    _od_m   = __lsx_vilvl_h(_in3, _in1);                                          \
    _top0_m = __lsx_vilvl_h(_od_m, _ev_m);                                        \
    _top1_m = __lsx_vilvh_h(_od_m, _ev_m);                                        \
    /* rows 0-3, high halves */                                                   \
    _ev_m   = __lsx_vilvh_h(_in2, _in0);                                          \
    _od_m   = __lsx_vilvh_h(_in3, _in1);                                          \
    _top2_m = __lsx_vilvl_h(_od_m, _ev_m);                                        \
    _top3_m = __lsx_vilvh_h(_od_m, _ev_m);                                        \
    /* rows 4-7, low halves */                                                    \
    _ev_m   = __lsx_vilvl_h(_in6, _in4);                                          \
    _od_m   = __lsx_vilvl_h(_in7, _in5);                                          \
    _bot0_m = __lsx_vilvl_h(_od_m, _ev_m);                                        \
    _bot1_m = __lsx_vilvh_h(_od_m, _ev_m);                                        \
    /* rows 4-7, high halves */                                                   \
    _ev_m   = __lsx_vilvh_h(_in6, _in4);                                          \
    _od_m   = __lsx_vilvh_h(_in7, _in5);                                          \
    _bot2_m = __lsx_vilvl_h(_od_m, _ev_m);                                        \
    _bot3_m = __lsx_vilvh_h(_od_m, _ev_m);                                        \
                                                                                  \
    /* even/odd doublewords of each top/bottom pair form two output rows */       \
    _out0 = __lsx_vpickev_d(_bot0_m, _top0_m);                                    \
    _out1 = __lsx_vpickod_d(_bot0_m, _top0_m);                                    \
    _out2 = __lsx_vpickev_d(_bot1_m, _top1_m);                                    \
    _out3 = __lsx_vpickod_d(_bot1_m, _top1_m);                                    \
    _out4 = __lsx_vpickev_d(_bot2_m, _top2_m);                                    \
    _out5 = __lsx_vpickod_d(_bot2_m, _top2_m);                                    \
    _out6 = __lsx_vpickev_d(_bot3_m, _top3_m);                                    \
    _out7 = __lsx_vpickod_d(_bot3_m, _top3_m);                                    \
}
479 
/*
 * =============================================================================
 * Description : Transpose input 8x4 byte block into 4x8
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (input 8x4 byte block, 4 bytes per row)
 *               Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. All inputs are read before any output is written;
 *               _out1 and _out3 are derived from _out0 and _out2.
 * Example     : LSX_TRANSPOSE8x4_B
 *        _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
 *
 *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
 *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
 *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3)                     \
{                                                                          \
    __m128i _w04_m, _w15_m, _w26_m, _w37_m;                                \
    __m128i _b_lo_m, _b_hi_m, _h_lo_m, _h_hi_m;                            \
                                                                           \
    /* pair up the first word of row n with that of row n + 4 */           \
    _w04_m = __lsx_vpackev_w(_in4, _in0);                                  \
    _w15_m = __lsx_vpackev_w(_in5, _in1);                                  \
    _w26_m = __lsx_vpackev_w(_in6, _in2);                                  \
    _w37_m = __lsx_vpackev_w(_in7, _in3);                                  \
                                                                           \
    /* byte interleave, then halfword interleave */                        \
    _b_lo_m = __lsx_vilvl_b(_w15_m, _w04_m);                               \
    _b_hi_m = __lsx_vilvl_b(_w37_m, _w26_m);                               \
    _h_lo_m = __lsx_vilvl_h(_b_hi_m, _b_lo_m);                             \
    _h_hi_m = __lsx_vilvh_h(_b_hi_m, _b_lo_m);                             \
                                                                           \
    _out0 = __lsx_vilvl_w(_h_hi_m, _h_lo_m);                               \
    _out2 = __lsx_vilvh_w(_h_hi_m, _h_lo_m);                               \
    /* the odd rows sit in the upper doublewords of the even ones */       \
    _out1 = __lsx_vilvh_d(_out2, _out0);                                   \
    _out3 = __lsx_vilvh_d(_out0, _out2);                                   \
}
523 
/*
 * =============================================================================
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _in8,
 *                         _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
 * Details     : Sixteen rows of 8 bytes (the low half of every input) become
 *               eight rows of 16 bytes. The data goes through two byte
 *               interleave stages, one word interleave stage and one
 *               doubleword interleave stage. All inputs are read before any
 *               output is written.
 * Example     :
 *              000,001,002,003,004,005,006,007
 *              008,009,010,011,012,013,014,015
 *              016,017,018,019,020,021,022,023
 *              024,025,026,027,028,029,030,031
 *              032,033,034,035,036,037,038,039
 *              040,041,042,043,044,045,046,047           000,008,...,112,120
 *              048,049,050,051,052,053,054,055           001,009,...,113,121
 *              056,057,058,059,060,061,062,063    to     002,010,...,114,122
 *              064,065,066,067,068,069,070,071  =====>   003,011,...,115,123
 *              072,073,074,075,076,077,078,079           004,012,...,116,124
 *              080,081,082,083,084,085,086,087           005,013,...,117,125
 *              088,089,090,091,092,093,094,095           006,014,...,118,126
 *              096,097,098,099,100,101,102,103           007,015,...,119,127
 *              104,105,106,107,108,109,110,111
 *              112,113,114,115,116,117,118,119
 *              120,121,122,123,124,125,126,127
 * =============================================================================
 */
#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _in8, \
                            _in9, _in10, _in11, _in12, _in13, _in14, _in15, _out0, \
                            _out1, _out2, _out3, _out4, _out5, _out6, _out7)       \
{                                                                                 \
    __m128i _x0_m, _x1_m, _x2_m, _x3_m, _x4_m, _x5_m, _x6_m, _x7_m;               \
    __m128i _y0_m, _y1_m, _y2_m, _y3_m, _y4_m, _y5_m, _y6_m, _y7_m;               \
    __m128i _z0_m, _z1_m, _z2_m, _z3_m, _z4_m, _z5_m, _z6_m, _z7_m;               \
                                                                                  \
    /* stage 1: byte interleave of rows n and n + 2 (low halves) */               \
    _x0_m = __lsx_vilvl_b(_in2, _in0);                                            \
    _x1_m = __lsx_vilvl_b(_in3, _in1);                                            \
    _x2_m = __lsx_vilvl_b(_in6, _in4);                                            \
    _x3_m = __lsx_vilvl_b(_in7, _in5);                                            \
    _x4_m = __lsx_vilvl_b(_in10, _in8);                                           \
    _x5_m = __lsx_vilvl_b(_in11, _in9);                                           \
    _x6_m = __lsx_vilvl_b(_in14, _in12);                                          \
    _x7_m = __lsx_vilvl_b(_in15, _in13);                                          \
                                                                                  \
    /* stage 2: byte interleave of neighbouring stage-1 results */                \
    _y0_m = __lsx_vilvl_b(_x1_m, _x0_m);                                          \
    _y2_m = __lsx_vilvl_b(_x3_m, _x2_m);                                          \
    _y1_m = __lsx_vilvh_b(_x1_m, _x0_m);                                          \
    _y3_m = __lsx_vilvh_b(_x3_m, _x2_m);                                          \
    _y4_m = __lsx_vilvl_b(_x5_m, _x4_m);                                          \
    _y6_m = __lsx_vilvl_b(_x7_m, _x6_m);                                          \
    _y5_m = __lsx_vilvh_b(_x5_m, _x4_m);                                          \
    _y7_m = __lsx_vilvh_b(_x7_m, _x6_m);                                          \
                                                                                  \
    /* stage 3: word interleave */                                                \
    _z0_m = __lsx_vilvl_w(_y2_m, _y0_m);                                          \
    _z4_m = __lsx_vilvl_w(_y3_m, _y1_m);                                          \
    _z2_m = __lsx_vilvh_w(_y2_m, _y0_m);                                          \
    _z6_m = __lsx_vilvh_w(_y3_m, _y1_m);                                          \
    _z1_m = __lsx_vilvl_w(_y6_m, _y4_m);                                          \
    _z5_m = __lsx_vilvl_w(_y7_m, _y5_m);                                          \
    _z3_m = __lsx_vilvh_w(_y6_m, _y4_m);                                          \
    _z7_m = __lsx_vilvh_w(_y7_m, _y5_m);                                          \
                                                                                  \
    /* stage 4: doubleword interleave joins rows 0-7 with rows 8-15 */            \
    _out0 = __lsx_vilvl_d(_z1_m, _z0_m);                                          \
    _out2 = __lsx_vilvl_d(_z3_m, _z2_m);                                          \
    _out1 = __lsx_vilvh_d(_z1_m, _z0_m);                                          \
    _out3 = __lsx_vilvh_d(_z3_m, _z2_m);                                          \
    _out4 = __lsx_vilvl_d(_z5_m, _z4_m);                                          \
    _out6 = __lsx_vilvl_d(_z7_m, _z6_m);                                          \
    _out5 = __lsx_vilvh_d(_z5_m, _z4_m);                                          \
    _out7 = __lsx_vilvh_d(_z7_m, _z6_m);                                          \
}
573 
/*
 * =============================================================================
 * Description : Butterfly of 4 input vectors (byte elements)
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Butterfly operation. All sums and differences are computed
 *               into temporaries before any output is written, so the outputs
 *               may safely name the same variables as the inputs.
 * Example     :
 *               _out0 = _in0 + _in3;
 *               _out1 = _in1 + _in2;
 *               _out2 = _in1 - _in2;
 *               _out3 = _in0 - _in3;
 * =============================================================================
 */
#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
{                                                                             \
    __m128i _bf_sum0_m  = __lsx_vadd_b(_in0, _in3);                           \
    __m128i _bf_sum1_m  = __lsx_vadd_b(_in1, _in2);                           \
    __m128i _bf_diff1_m = __lsx_vsub_b(_in1, _in2);                           \
    __m128i _bf_diff0_m = __lsx_vsub_b(_in0, _in3);                           \
                                                                              \
    _out0 = _bf_sum0_m;                                                       \
    _out1 = _bf_sum1_m;                                                       \
    _out2 = _bf_diff1_m;                                                      \
    _out3 = _bf_diff0_m;                                                      \
}
/*
 * Butterfly of 4 input vectors with halfword elements:
 *   _out0 = _in0 + _in3, _out1 = _in1 + _in2,
 *   _out2 = _in1 - _in2, _out3 = _in0 - _in3.
 * Results are staged in temporaries, so outputs may alias inputs.
 */
#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
{                                                                             \
    __m128i _bf_sum0_m  = __lsx_vadd_h(_in0, _in3);                           \
    __m128i _bf_sum1_m  = __lsx_vadd_h(_in1, _in2);                           \
    __m128i _bf_diff1_m = __lsx_vsub_h(_in1, _in2);                           \
    __m128i _bf_diff0_m = __lsx_vsub_h(_in0, _in3);                           \
                                                                              \
    _out0 = _bf_sum0_m;                                                       \
    _out1 = _bf_sum1_m;                                                       \
    _out2 = _bf_diff1_m;                                                      \
    _out3 = _bf_diff0_m;                                                      \
}
/*
 * Butterfly of 4 input vectors with word elements:
 *   _out0 = _in0 + _in3, _out1 = _in1 + _in2,
 *   _out2 = _in1 - _in2, _out3 = _in0 - _in3.
 * Results are staged in temporaries, so outputs may alias inputs.
 */
#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
{                                                                             \
    __m128i _bf_sum0_m  = __lsx_vadd_w(_in0, _in3);                           \
    __m128i _bf_sum1_m  = __lsx_vadd_w(_in1, _in2);                           \
    __m128i _bf_diff1_m = __lsx_vsub_w(_in1, _in2);                           \
    __m128i _bf_diff0_m = __lsx_vsub_w(_in0, _in3);                           \
                                                                              \
    _out0 = _bf_sum0_m;                                                       \
    _out1 = _bf_sum1_m;                                                       \
    _out2 = _bf_diff1_m;                                                      \
    _out3 = _bf_diff0_m;                                                      \
}
/*
 * Butterfly of 4 input vectors with doubleword elements:
 *   _out0 = _in0 + _in3, _out1 = _in1 + _in2,
 *   _out2 = _in1 - _in2, _out3 = _in0 - _in3.
 * Results are staged in temporaries, so outputs may alias inputs.
 */
#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
{                                                                             \
    __m128i _bf_sum0_m  = __lsx_vadd_d(_in0, _in3);                           \
    __m128i _bf_sum1_m  = __lsx_vadd_d(_in1, _in2);                           \
    __m128i _bf_diff1_m = __lsx_vsub_d(_in1, _in2);                           \
    __m128i _bf_diff0_m = __lsx_vsub_d(_in0, _in3);                           \
                                                                              \
    _out0 = _bf_sum0_m;                                                       \
    _out1 = _bf_sum1_m;                                                       \
    _out2 = _bf_diff1_m;                                                      \
    _out3 = _bf_diff0_m;                                                      \
}
615 
/*
 * =============================================================================
 * Description : Butterfly of 8 input vectors (byte elements)
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
 * Details     : Butterfly operation. All sums and differences are computed
 *               into temporaries before any output is written, so the outputs
 *               may safely name the same variables as the inputs.
 * Example     :
 *               _out0 = _in0 + _in7;
 *               _out1 = _in1 + _in6;
 *               _out2 = _in2 + _in5;
 *               _out3 = _in3 + _in4;
 *               _out4 = _in3 - _in4;
 *               _out5 = _in2 - _in5;
 *               _out6 = _in1 - _in6;
 *               _out7 = _in0 - _in7;
 * =============================================================================
 */
#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{                                                                                \
    __m128i _bf_sum0_m  = __lsx_vadd_b(_in0, _in7);                              \
    __m128i _bf_sum1_m  = __lsx_vadd_b(_in1, _in6);                              \
    __m128i _bf_sum2_m  = __lsx_vadd_b(_in2, _in5);                              \
    __m128i _bf_sum3_m  = __lsx_vadd_b(_in3, _in4);                              \
    __m128i _bf_diff3_m = __lsx_vsub_b(_in3, _in4);                              \
    __m128i _bf_diff2_m = __lsx_vsub_b(_in2, _in5);                              \
    __m128i _bf_diff1_m = __lsx_vsub_b(_in1, _in6);                              \
    __m128i _bf_diff0_m = __lsx_vsub_b(_in0, _in7);                              \
                                                                                 \
    _out0 = _bf_sum0_m;                                                          \
    _out1 = _bf_sum1_m;                                                          \
    _out2 = _bf_sum2_m;                                                          \
    _out3 = _bf_sum3_m;                                                          \
    _out4 = _bf_diff3_m;                                                         \
    _out5 = _bf_diff2_m;                                                         \
    _out6 = _bf_diff1_m;                                                         \
    _out7 = _bf_diff0_m;                                                         \
}
645 
/*
 * Butterfly of 8 input vectors with halfword elements:
 *   _outN = _inN + _in(7-N) for N = 0..3, _outN = _in(7-N) - _inN for N = 4..7.
 * Results are staged in temporaries, so outputs may alias inputs.
 */
#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{                                                                                \
    __m128i _bf_sum0_m  = __lsx_vadd_h(_in0, _in7);                              \
    __m128i _bf_sum1_m  = __lsx_vadd_h(_in1, _in6);                              \
    __m128i _bf_sum2_m  = __lsx_vadd_h(_in2, _in5);                              \
    __m128i _bf_sum3_m  = __lsx_vadd_h(_in3, _in4);                              \
    __m128i _bf_diff3_m = __lsx_vsub_h(_in3, _in4);                              \
    __m128i _bf_diff2_m = __lsx_vsub_h(_in2, _in5);                              \
    __m128i _bf_diff1_m = __lsx_vsub_h(_in1, _in6);                              \
    __m128i _bf_diff0_m = __lsx_vsub_h(_in0, _in7);                              \
                                                                                 \
    _out0 = _bf_sum0_m;                                                          \
    _out1 = _bf_sum1_m;                                                          \
    _out2 = _bf_sum2_m;                                                          \
    _out3 = _bf_sum3_m;                                                          \
    _out4 = _bf_diff3_m;                                                         \
    _out5 = _bf_diff2_m;                                                         \
    _out6 = _bf_diff1_m;                                                         \
    _out7 = _bf_diff0_m;                                                         \
}
658 
/*
 * Butterfly of 8 input vectors with word elements:
 *   _outN = _inN + _in(7-N) for N = 0..3, _outN = _in(7-N) - _inN for N = 4..7.
 * Results are staged in temporaries, so outputs may alias inputs.
 */
#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{                                                                                \
    __m128i _bf_sum0_m  = __lsx_vadd_w(_in0, _in7);                              \
    __m128i _bf_sum1_m  = __lsx_vadd_w(_in1, _in6);                              \
    __m128i _bf_sum2_m  = __lsx_vadd_w(_in2, _in5);                              \
    __m128i _bf_sum3_m  = __lsx_vadd_w(_in3, _in4);                              \
    __m128i _bf_diff3_m = __lsx_vsub_w(_in3, _in4);                              \
    __m128i _bf_diff2_m = __lsx_vsub_w(_in2, _in5);                              \
    __m128i _bf_diff1_m = __lsx_vsub_w(_in1, _in6);                              \
    __m128i _bf_diff0_m = __lsx_vsub_w(_in0, _in7);                              \
                                                                                 \
    _out0 = _bf_sum0_m;                                                          \
    _out1 = _bf_sum1_m;                                                          \
    _out2 = _bf_sum2_m;                                                          \
    _out3 = _bf_sum3_m;                                                          \
    _out4 = _bf_diff3_m;                                                         \
    _out5 = _bf_diff2_m;                                                         \
    _out6 = _bf_diff1_m;                                                         \
    _out7 = _bf_diff0_m;                                                         \
}
671 
/*
 * Butterfly of 8 input vectors with doubleword elements:
 *   _outN = _inN + _in(7-N) for N = 0..3, _outN = _in(7-N) - _inN for N = 4..7.
 * Results are staged in temporaries, so outputs may alias inputs.
 */
#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,        \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{                                                                                \
    __m128i _bf_sum0_m  = __lsx_vadd_d(_in0, _in7);                              \
    __m128i _bf_sum1_m  = __lsx_vadd_d(_in1, _in6);                              \
    __m128i _bf_sum2_m  = __lsx_vadd_d(_in2, _in5);                              \
    __m128i _bf_sum3_m  = __lsx_vadd_d(_in3, _in4);                              \
    __m128i _bf_diff3_m = __lsx_vsub_d(_in3, _in4);                              \
    __m128i _bf_diff2_m = __lsx_vsub_d(_in2, _in5);                              \
    __m128i _bf_diff1_m = __lsx_vsub_d(_in1, _in6);                              \
    __m128i _bf_diff0_m = __lsx_vsub_d(_in0, _in7);                              \
                                                                                 \
    _out0 = _bf_sum0_m;                                                          \
    _out1 = _bf_sum1_m;                                                          \
    _out2 = _bf_sum2_m;                                                          \
    _out3 = _bf_sum3_m;                                                          \
    _out4 = _bf_diff3_m;                                                         \
    _out5 = _bf_diff2_m;                                                         \
    _out6 = _bf_diff1_m;                                                         \
    _out7 = _bf_diff0_m;                                                         \
}
684 
685 #endif //LSX
686 
687 #ifdef __loongarch_asx
688 #include <lasxintrin.h>
689 /*
690  * =============================================================================
691  * Description : Dot product of byte vector elements
692  * Arguments : Inputs - in_h, in_l
693  * Output - out
694  * Return Type - signed halfword
695  * Details : Unsigned byte elements from in_h are multiplied with
696  * unsigned byte elements from in_l producing a result
697  * twice the size of input i.e. signed halfword.
698  * Then this multiplied results of adjacent odd-even elements
699  * are added to the out vector
700  * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
701  * =============================================================================
702  */
703 static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l)
704 {
705  __m256i out;
706 
707  out = __lasx_xvmulwev_h_bu(in_h, in_l);
708  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
709  return out;
710 }
711 
712 /*
713  * =============================================================================
714  * Description : Dot product of byte vector elements
715  * Arguments : Inputs - in_h, in_l
716  * Output - out
717  * Return Type - signed halfword
718  * Details : Signed byte elements from in_h are multiplied with
719  * signed byte elements from in_l producing a result
720  * twice the size of input i.e. signed halfword.
721  * Then this iniplication results of adjacent odd-even elements
722  * are added to the out vector
723  * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
724  * =============================================================================
725  */
726 static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l)
727 {
728  __m256i out;
729 
730  out = __lasx_xvmulwev_h_b(in_h, in_l);
731  out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
732  return out;
733 }
734 
735 /*
736  * =============================================================================
737  * Description : Dot product of halfword vector elements
738  * Arguments : Inputs - in_h, in_l
739  * Output - out
740  * Return Type - signed word
741  * Details : Signed halfword elements from in_h are multiplied with
742  * signed halfword elements from in_l producing a result
743  * twice the size of input i.e. signed word.
744  * Then this multiplied results of adjacent odd-even elements
745  * are added to the out vector.
746  * Example : out = __lasx_xvdp2_w_h(in_h, in_l)
747  * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
748  * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
749  * out : 22,38,38,22, 22,38,38,22
750  * =============================================================================
751  */
752 static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l)
753 {
754  __m256i out;
755 
756  out = __lasx_xvmulwev_w_h(in_h, in_l);
757  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
758  return out;
759 }
760 
761 /*
762  * =============================================================================
763  * Description : Dot product of word vector elements
764  * Arguments : Inputs - in_h, in_l
765  * Output - out
766  * Retrun Type - signed double
767  * Details : Signed word elements from in_h are multiplied with
768  * signed word elements from in_l producing a result
769  * twice the size of input i.e. signed double word.
770  * Then this multiplied results of adjacent odd-even elements
771  * are added to the out vector.
772  * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
773  * =============================================================================
774  */
775 static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l)
776 {
777  __m256i out;
778 
779  out = __lasx_xvmulwev_d_w(in_h, in_l);
780  out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
781  return out;
782 }
783 
784 /*
785  * =============================================================================
786  * Description : Dot product of halfword vector elements
787  * Arguments : Inputs - in_h, in_l
788  * Output - out
789  * Return Type - signed word
790  * Details : Unsigned halfword elements from in_h are multiplied with
791  * signed halfword elements from in_l producing a result
792  * twice the size of input i.e. unsigned word.
793  * Multiplication result of adjacent odd-even elements
794  * are added to the out vector
795  * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
796  * =============================================================================
797  */
798 static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
799 {
800  __m256i out;
801 
802  out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
803  out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
804  return out;
805 }
806 
807 /*
808  * =============================================================================
809  * Description : Dot product & addition of byte vector elements
810  * Arguments : Inputs - in_h, in_l
811  * Output - out
812  * Retrun Type - halfword
813  * Details : Signed byte elements from in_h are multiplied with
814  * signed byte elements from in_l producing a result
815  * twice the size of input i.e. signed halfword.
816  * Then this multiplied results of adjacent odd-even elements
817  * are added to the in_c vector.
818  * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
819  * =============================================================================
820  */
821 static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h, __m256i in_l)
822 {
823  __m256i out;
824 
825  out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
826  out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
827  return out;
828 }
829 
830 /*
831  * =============================================================================
832  * Description : Dot product of halfword vector elements
833  * Arguments : Inputs - in_c, in_h, in_l
834  * Output - out
835  * Return Type - per RTYPE
836  * Details : Signed halfword elements from in_h are multiplied with
837  * signed halfword elements from in_l producing a result
838  * twice the size of input i.e. signed word.
839  * Multiplication result of adjacent odd-even elements
840  * are added to the in_c vector.
841  * Example : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
842  * in_c : 1,2,3,4, 1,2,3,4
843  * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
844  * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
845  * out : 23,40,41,26, 23,40,41,26
846  * =============================================================================
847  */
848 static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
849 {
850  __m256i out;
851 
852  out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
853  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
854  return out;
855 }
856 
857 /*
858  * =============================================================================
859  * Description : Dot product of halfword vector elements
860  * Arguments : Inputs - in_c, in_h, in_l
861  * Output - out
862  * Return Type - signed word
863  * Details : Unsigned halfword elements from in_h are multiplied with
864  * unsigned halfword elements from in_l producing a result
865  * twice the size of input i.e. signed word.
866  * Multiplication result of adjacent odd-even elements
867  * are added to the in_c vector.
868  * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
869  * =============================================================================
870  */
871 static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, __m256i in_l)
872 {
873  __m256i out;
874 
875  out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
876  out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
877  return out;
878 }
879 
880 /*
881  * =============================================================================
882  * Description : Dot product of halfword vector elements
883  * Arguments : Inputs - in_c, in_h, in_l
884  * Output - out
885  * Return Type - signed word
886  * Details : Unsigned halfword elements from in_h are multiplied with
887  * signed halfword elements from in_l producing a result
888  * twice the size of input i.e. signed word.
889  * Multiplication result of adjacent odd-even elements
890  * are added to the in_c vector
891  * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
892  * =============================================================================
893  */
894 static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, __m256i in_l)
895 {
896  __m256i out;
897 
898  out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
899  out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
900  return out;
901 }
902 
903 /*
904  * =============================================================================
905  * Description : Vector Unsigned Dot Product and Subtract
906  * Arguments : Inputs - in_c, in_h, in_l
907  * Output - out
908  * Return Type - signed halfword
909  * Details : Unsigned byte elements from in_h are multiplied with
910  * unsigned byte elements from in_l producing a result
911  * twice the size of input i.e. signed halfword.
912  * Multiplication result of adjacent odd-even elements
913  * are added together and subtracted from double width elements
914  * in_c vector.
915  * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
916  * =============================================================================
917  */
918 static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, __m256i in_l)
919 {
920  __m256i out;
921 
922  out = __lasx_xvmulwev_h_bu(in_h, in_l);
923  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
924  out = __lasx_xvsub_h(in_c, out);
925  return out;
926 }
927 
928 /*
929  * =============================================================================
930  * Description : Vector Signed Dot Product and Subtract
931  * Arguments : Inputs - in_c, in_h, in_l
932  * Output - out
933  * Return Type - signed word
934  * Details : Signed halfword elements from in_h are multiplied with
935  * Signed halfword elements from in_l producing a result
936  * twice the size of input i.e. signed word.
937  * Multiplication result of adjacent odd-even elements
938  * are added together and subtracted from double width elements
939  * in_c vector.
940  * Example : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
941  * in_c : 0,0,0,0, 0,0,0,0
942  * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
943  * in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
944  * out : -7,-3,0,0, 0,-1,0,-1
945  * =============================================================================
946  */
947 static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
948 {
949  __m256i out;
950 
951  out = __lasx_xvmulwev_w_h(in_h, in_l);
952  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
953  out = __lasx_xvsub_w(in_c, out);
954  return out;
955 }
956 
957 /*
958  * =============================================================================
959  * Description : Dot product of halfword vector elements
960  * Arguments : Inputs - in_h, in_l
961  * Output - out
962  * Return Type - signed word
963  * Details : Signed halfword elements from in_h are iniplied with
964  * signed halfword elements from in_l producing a result
965  * four times the size of input i.e. signed doubleword.
966  * Then this iniplication results of four adjacent elements
967  * are added together and stored to the out vector.
968  * Example : out = __lasx_xvdp4_d_h(in_h, in_l)
969  * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
970  * in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
971  * out : -2,0,1,1
972  * =============================================================================
973  */
974 static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l)
975 {
976  __m256i out;
977 
978  out = __lasx_xvmulwev_w_h(in_h, in_l);
979  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
980  out = __lasx_xvhaddw_d_w(out, out);
981  return out;
982 }
983 
984 /*
985  * =============================================================================
986  * Description : The high half of the vector elements are expanded and
987  * added after being doubled.
988  * Arguments : Inputs - in_h, in_l
989  * Output - out
990  * Details : The in_h vector and the in_l vector are added after the
991  * higher half of the two-fold sign extension (signed byte
992  * to signed halfword) and stored to the out vector.
993  * Example : See out = __lasx_xvaddwh_w_h(in_h, in_l)
994  * =============================================================================
995  */
996 static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
997 {
998  __m256i out;
999 
1000  out = __lasx_xvilvh_b(in_h, in_l);
1001  out = __lasx_xvhaddw_h_b(out, out);
1002  return out;
1003 }
1004 
1005 /*
1006  * =============================================================================
1007  * Description : The high half of the vector elements are expanded and
1008  * added after being doubled.
1009  * Arguments : Inputs - in_h, in_l
1010  * Output - out
1011  * Details : The in_h vector and the in_l vector are added after the
1012  * higher half of the two-fold sign extension (signed halfword
1013  * to signed word) and stored to the out vector.
1014  * Example : out = __lasx_xvaddwh_w_h(in_h, in_l)
1015  * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1016  * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1017  * out : 1,0,0,-1, 1,0,0, 2
1018  * =============================================================================
1019  */
1020  static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l)
1021 {
1022  __m256i out;
1023 
1024  out = __lasx_xvilvh_h(in_h, in_l);
1025  out = __lasx_xvhaddw_w_h(out, out);
1026  return out;
1027 }
1028 
1029 /*
1030  * =============================================================================
1031  * Description : The low half of the vector elements are expanded and
1032  * added after being doubled.
1033  * Arguments : Inputs - in_h, in_l
1034  * Output - out
1035  * Details : The in_h vector and the in_l vector are added after the
1036  * lower half of the two-fold sign extension (signed byte
1037  * to signed halfword) and stored to the out vector.
1038  * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1039  * =============================================================================
1040  */
1041 static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l)
1042 {
1043  __m256i out;
1044 
1045  out = __lasx_xvilvl_b(in_h, in_l);
1046  out = __lasx_xvhaddw_h_b(out, out);
1047  return out;
1048 }
1049 
1050 /*
1051  * =============================================================================
1052  * Description : The low half of the vector elements are expanded and
1053  * added after being doubled.
1054  * Arguments : Inputs - in_h, in_l
1055  * Output - out
1056  * Details : The in_h vector and the in_l vector are added after the
1057  * lower half of the two-fold sign extension (signed halfword
1058  * to signed word) and stored to the out vector.
1059  * Example : out = __lasx_xvaddwl_w_h(in_h, in_l)
1060  * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1061  * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1062  * out : 5,-1,4,2, 1,0,2,-1
1063  * =============================================================================
1064  */
1065 static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l)
1066 {
1067  __m256i out;
1068 
1069  out = __lasx_xvilvl_h(in_h, in_l);
1070  out = __lasx_xvhaddw_w_h(out, out);
1071  return out;
1072 }
1073 
1074 /*
1075  * =============================================================================
1076  * Description : The low half of the vector elements are expanded and
1077  * added after being doubled.
1078  * Arguments : Inputs - in_h, in_l
1079  * Output - out
1080  * Details : The out vector and the out vector are added after the
1081  * lower half of the two-fold zero extension (unsigned byte
1082  * to unsigned halfword) and stored to the out vector.
1083  * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1084  * =============================================================================
1085  */
1086 static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l)
1087 {
1088  __m256i out;
1089 
1090  out = __lasx_xvilvl_b(in_h, in_l);
1091  out = __lasx_xvhaddw_hu_bu(out, out);
1092  return out;
1093 }
1094 
1095 /*
1096  * =============================================================================
1097  * Description : The low half of the vector elements are expanded and
1098  * added after being doubled.
1099  * Arguments : Inputs - in_h, in_l
1100  * Output - out
1101  * Details : The in_l vector after double zero extension (unsigned byte to
1102  * signed halfword),added to the in_h vector.
1103  * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
1104  * =============================================================================
1105  */
1106 static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l)
1107 {
1108  __m256i out;
1109 
1110  out = __lasx_xvsllwil_hu_bu(in_l, 0);
1111  out = __lasx_xvadd_h(in_h, out);
1112  return out;
1113 }
1114 
1115 /*
1116  * =============================================================================
1117  * Description : The low half of the vector elements are expanded and
1118  * added after being doubled.
1119  * Arguments : Inputs - in_h, in_l
1120  * Output - out
1121  * Details : The in_l vector after double sign extension (signed halfword to
1122  * signed word), added to the in_h vector.
1123  * Example : out = __lasx_xvaddw_w_w_h(in_h, in_l)
1124  * in_h : 0, 1,0,0, -1,0,0,1,
1125  * in_l : 2,-1,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1,
1126  * out : 2, 0,1,2, -1,0,1,1,
1127  * =============================================================================
1128  */
1129 static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l)
1130 {
1131  __m256i out;
1132 
1133  out = __lasx_xvsllwil_w_h(in_l, 0);
1134  out = __lasx_xvadd_w(in_h, out);
1135  return out;
1136 }
1137 
1138 /*
1139  * =============================================================================
1140  * Description : Multiplication and addition calculation after expansion
1141  * of the lower half of the vector.
1142  * Arguments : Inputs - in_c, in_h, in_l
1143  * Output - out
1144  * Details : The in_h vector and the in_l vector are multiplied after
1145  * the lower half of the two-fold sign extension (signed halfword
1146  * to signed word), and the result is added to the vector in_c,
1147  * then stored to the out vector.
1148  * Example : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1149  * in_c : 1,2,3,4, 5,6,7,8
1150  * in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
1151  * in_l : 200, 300, 400, 500, 2000, 3000, 4000, 5000,
1152  * -200,-300,-400,-500, -2000,-3000,-4000,-5000
1153  * out : 201, 602,1203,2004, -995, -1794,-2793,-3992
1154  * =============================================================================
1155  */
1156 static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
1157 {
1158  __m256i tmp0, tmp1, out;
1159 
1160  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1161  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1162  tmp0 = __lasx_xvmul_w(tmp0, tmp1);
1163  out = __lasx_xvadd_w(tmp0, in_c);
1164  return out;
1165 }
1166 
1167 /*
1168  * =============================================================================
1169  * Description : Multiplication and addition calculation after expansion
1170  * of the higher half of the vector.
1171  * Arguments : Inputs - in_c, in_h, in_l
1172  * Output - out
1173  * Details : The in_h vector and the in_l vector are multiplied after
1174  * the higher half of the two-fold sign extension (signed
1175  * halfword to signed word), and the result is added to
1176  * the vector in_c, then stored to the out vector.
1177  * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1178  * =============================================================================
1179  */
1180 static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
1181 {
1182  __m256i tmp0, tmp1, out;
1183 
1184  tmp0 = __lasx_xvilvh_h(in_h, in_h);
1185  tmp1 = __lasx_xvilvh_h(in_l, in_l);
1186  tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
1187  out = __lasx_xvadd_w(tmp0, in_c);
1188  return out;
1189 }
1190 
1191 /*
1192  * =============================================================================
1193  * Description : Multiplication calculation after expansion of the lower
1194  * half of the vector.
1195  * Arguments : Inputs - in_h, in_l
1196  * Output - out
1197  * Details : The in_h vector and the in_l vector are multiplied after
1198  * the lower half of the two-fold sign extension (signed
1199  * halfword to signed word), then stored to the out vector.
1200  * Example : out = __lasx_xvmulwl_w_h(in_h, in_l)
1201  * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1202  * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1203  * out : 6,1,3,0, 0,0,1,0
1204  * =============================================================================
1205  */
1206 static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l)
1207 {
1208  __m256i tmp0, tmp1, out;
1209 
1210  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1211  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1212  out = __lasx_xvmul_w(tmp0, tmp1);
1213  return out;
1214 }
1215 
1216 /*
1217  * =============================================================================
1218  * Description : Multiplication calculation after expansion of the lower
1219  * half of the vector.
1220  * Arguments : Inputs - in_h, in_l
1221  * Output - out
1222  * Details : The in_h vector and the in_l vector are multiplied after
1223  * the lower half of the two-fold sign extension (signed
1224  * halfword to signed word), then stored to the out vector.
1225  * Example : out = __lasx_xvmulwh_w_h(in_h, in_l)
1226  * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1227  * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1228  * out : 0,0,0,0, 0,0,0,1
1229  * =============================================================================
1230  */
1231 static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l)
1232 {
1233  __m256i tmp0, tmp1, out;
1234 
1235  tmp0 = __lasx_xvilvh_h(in_h, in_h);
1236  tmp1 = __lasx_xvilvh_h(in_l, in_l);
1237  out = __lasx_xvmulwev_w_h(tmp0, tmp1);
1238  return out;
1239 }
1240 
1241 /*
1242  * =============================================================================
1243  * Description : The low half of the vector elements are expanded and
1244  * added saturately after being doubled.
1245  * Arguments : Inputs - in_h, in_l
1246  * Output - out
1247  * Details : The in_h vector adds the in_l vector saturately after the lower
1248  * half of the two-fold zero extension (unsigned byte to unsigned
1249  * halfword) and the results are stored to the out vector.
1250  * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
1251  * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
1252  * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
1253  * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
1254  * =============================================================================
1255  */
1256 static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l)
1257 {
1258  __m256i tmp1, out;
1259  __m256i zero = {0};
1260 
1261  tmp1 = __lasx_xvilvl_b(zero, in_l);
1262  out = __lasx_xvsadd_hu(in_h, tmp1);
1263  return out;
1264 }
1265 
1266 /*
1267  * =============================================================================
1268  * Description : Clip all halfword elements of input vector between min & max
1269  * out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
1270  * Arguments : Inputs - in (input vector)
1271  * - min (min threshold)
1272  * - max (max threshold)
1273  * Outputs - in (output vector with clipped elements)
1274  * Return Type - signed halfword
1275  * Example : out = __lasx_xvclip_h(in, min, max)
1276  * in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
1277  * min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
1278  * max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
1279  * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
1280  * =============================================================================
1281  */
1282 static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max)
1283 {
1284  __m256i out;
1285 
1286  out = __lasx_xvmax_h(min, in);
1287  out = __lasx_xvmin_h(max, out);
1288  return out;
1289 }
1290 
1291 /*
1292  * =============================================================================
1293  * Description : Clip all signed halfword elements of input vector
1294  * between 0 & 255
1295  * Arguments : Inputs - in (input vector)
1296  * Outputs - out (output vector with clipped elements)
1297  * Return Type - signed halfword
1298  * Example : See out = __lasx_xvclip255_w(in)
1299  * =============================================================================
1300  */
1301 static inline __m256i __lasx_xvclip255_h(__m256i in)
1302 {
1303  __m256i out;
1304 
1305  out = __lasx_xvmaxi_h(in, 0);
1306  out = __lasx_xvsat_hu(out, 7);
1307  return out;
1308 }
1309 
1310 /*
1311  * =============================================================================
1312  * Description : Clip all signed word elements of input vector
1313  * between 0 & 255
1314  * Arguments : Inputs - in (input vector)
1315  * Output - out (output vector with clipped elements)
1316  * Return Type - signed word
1317  * Example : out = __lasx_xvclip255_w(in)
1318  * in : -8,255,280,249, -8,255,280,249
1319  * out : 0,255,255,249, 0,255,255,249
1320  * =============================================================================
1321  */
1322 static inline __m256i __lasx_xvclip255_w(__m256i in)
1323 {
1324  __m256i out;
1325 
1326  out = __lasx_xvmaxi_w(in, 0);
1327  out = __lasx_xvsat_wu(out, 7);
1328  return out;
1329 }
1330 
1331 /*
1332  * =============================================================================
1333  * Description : Indexed halfword element values are replicated to all
1334  * elements in output vector. If 'indx < 8' use xvsplati_l_*,
1335  * if 'indx >= 8' use xvsplati_h_*.
1336  * Arguments : Inputs - in, idx
1337  * Output - out
1338  * Details : Idx element value from in vector is replicated to all
1339  * elements in out vector.
1340  * Valid index range for halfword operation is 0-7
1341  * Example : out = __lasx_xvsplati_l_h(in, idx)
1342  * in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
1343  * idx : 0x02
1344  * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
1345  * =============================================================================
1346  */
1347 static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx)
1348 {
1349  __m256i out;
1350 
1351  out = __lasx_xvpermi_q(in, in, 0x02);
1352  out = __lasx_xvreplve_h(out, idx);
1353  return out;
1354 }
1355 
1356 /*
1357  * =============================================================================
1358  * Description : Indexed halfword element values are replicated to all
1359  * elements in output vector. If 'indx < 8' use xvsplati_l_*,
1360  * if 'indx >= 8' use xvsplati_h_*.
1361  * Arguments : Inputs - in, idx
1362  * Output - out
1363  * Details : Idx element value from in vector is replicated to all
1364  * elements in out vector.
1365  * Valid index range for halfword operation is 0-7
1366  * Example : out = __lasx_xvsplati_h_h(in, idx)
1367  * in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
1368  * idx : 0x09
1369  * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
1370  * =============================================================================
1371  */
1372 static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx)
1373 {
1374  __m256i out;
1375 
1376  out = __lasx_xvpermi_q(in, in, 0x13);
1377  out = __lasx_xvreplve_h(out, idx);
1378  return out;
1379 }
1380 
/*
 * =============================================================================
 * Description : Transpose 4x4 block with double word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Rows become columns. All inputs are read before any output
 *               is written. The outputs must be assignable expressions.
 * Example     : LASX_TRANSPOSE4x4_D
 *        _in0 : 1,2,3,4
 *        _in1 : 1,2,3,4
 *        _in2 : 1,2,3,4
 *        _in3 : 1,2,3,4
 *
 *       _out0 : 1,1,1,1
 *       _out1 : 2,2,2,2
 *       _out2 : 3,3,3,3
 *       _out3 : 4,4,4,4
 * =============================================================================
 */
#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3,                           \
                            _out0, _out1, _out2, _out3)                       \
{                                                                             \
    /* Interleave the doublewords of row pairs (0,1) and (2,3). */            \
    __m256i _lo01_m = __lasx_xvilvl_d(_in1, _in0);                            \
    __m256i _hi01_m = __lasx_xvilvh_d(_in1, _in0);                            \
    __m256i _lo23_m = __lasx_xvilvl_d(_in3, _in2);                            \
    __m256i _hi23_m = __lasx_xvilvh_d(_in3, _in2);                            \
                                                                              \
    /* Join matching 128-bit halves of the two pairs into full columns. */    \
    _out0 = __lasx_xvpermi_q(_lo23_m, _lo01_m, 0x20);                         \
    _out2 = __lasx_xvpermi_q(_lo23_m, _lo01_m, 0x31);                         \
    _out1 = __lasx_xvpermi_q(_hi23_m, _hi01_m, 0x20);                         \
    _out3 = __lasx_xvpermi_q(_hi23_m, _hi01_m, 0x31);                         \
}
1410 
/*
 * =============================================================================
 * Description : Transpose 8x8 block with word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : Rows become columns. Rows 0-3 and rows 4-7 are each
 *               interleaved word by word in two rounds; the resulting
 *               128-bit halves are then joined into full columns. All
 *               inputs are read before any output is written. The outputs
 *               must be assignable expressions.
 * Example     : LASX_TRANSPOSE8x8_W
 *        _in0 : 1,2,3,4,5,6,7,8
 *        _in1 : 2,2,3,4,5,6,7,8
 *        _in2 : 3,2,3,4,5,6,7,8
 *        _in3 : 4,2,3,4,5,6,7,8
 *        _in4 : 5,2,3,4,5,6,7,8
 *        _in5 : 6,2,3,4,5,6,7,8
 *        _in6 : 7,2,3,4,5,6,7,8
 *        _in7 : 8,2,3,4,5,6,7,8
 *
 *       _out0 : 1,2,3,4,5,6,7,8
 *       _out1 : 2,2,2,2,2,2,2,2
 *       _out2 : 3,3,3,3,3,3,3,3
 *       _out3 : 4,4,4,4,4,4,4,4
 *       _out4 : 5,5,5,5,5,5,5,5
 *       _out5 : 6,6,6,6,6,6,6,6
 *       _out6 : 7,7,7,7,7,7,7,7
 *       _out7 : 8,8,8,8,8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,   \
                            _out0, _out1, _out2, _out3,                       \
                            _out4, _out5, _out6, _out7)                       \
{                                                                             \
    __m256i _even_m, _odd_m;                                                  \
    __m256i _top0_m, _top1_m, _top2_m, _top3_m;                               \
    __m256i _bot0_m, _bot1_m, _bot2_m, _bot3_m;                               \
                                                                              \
    /* Rows 0-3, low words of each 128-bit half. */                           \
    _even_m = __lasx_xvilvl_w(_in2, _in0);                                    \
    _odd_m  = __lasx_xvilvl_w(_in3, _in1);                                    \
    _top0_m = __lasx_xvilvl_w(_odd_m, _even_m);                               \
    _top1_m = __lasx_xvilvh_w(_odd_m, _even_m);                               \
    /* Rows 0-3, high words of each 128-bit half. */                          \
    _even_m = __lasx_xvilvh_w(_in2, _in0);                                    \
    _odd_m  = __lasx_xvilvh_w(_in3, _in1);                                    \
    _top2_m = __lasx_xvilvl_w(_odd_m, _even_m);                               \
    _top3_m = __lasx_xvilvh_w(_odd_m, _even_m);                               \
    /* Rows 4-7, low words of each 128-bit half. */                           \
    _even_m = __lasx_xvilvl_w(_in6, _in4);                                    \
    _odd_m  = __lasx_xvilvl_w(_in7, _in5);                                    \
    _bot0_m = __lasx_xvilvl_w(_odd_m, _even_m);                               \
    _bot1_m = __lasx_xvilvh_w(_odd_m, _even_m);                               \
    /* Rows 4-7, high words of each 128-bit half. */                          \
    _even_m = __lasx_xvilvh_w(_in6, _in4);                                    \
    _odd_m  = __lasx_xvilvh_w(_in7, _in5);                                    \
    _bot2_m = __lasx_xvilvl_w(_odd_m, _even_m);                               \
    _bot3_m = __lasx_xvilvh_w(_odd_m, _even_m);                               \
    /* Join the 128-bit halves of the two row groups into columns. */         \
    _out0 = __lasx_xvpermi_q(_bot0_m, _top0_m, 0x20);                         \
    _out1 = __lasx_xvpermi_q(_bot1_m, _top1_m, 0x20);                         \
    _out2 = __lasx_xvpermi_q(_bot2_m, _top2_m, 0x20);                         \
    _out3 = __lasx_xvpermi_q(_bot3_m, _top3_m, 0x20);                         \
    _out4 = __lasx_xvpermi_q(_bot0_m, _top0_m, 0x31);                         \
    _out5 = __lasx_xvpermi_q(_bot1_m, _top1_m, 0x31);                         \
    _out6 = __lasx_xvpermi_q(_bot2_m, _top2_m, 0x31);                         \
    _out7 = __lasx_xvpermi_q(_bot3_m, _top3_m, 0x31);                         \
}
1468 
/*
 * =============================================================================
 * Description : Transpose input 16x8 byte block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 byte block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 *                         (output 8x16 byte block)
 * Details     : The rows of the matrix become columns, and the columns
 *               become rows. The inputs only go through low-half byte
 *               interleaves (xvilvl_b). The outputs are used as scratch
 *               storage between stages, so they must be assignable
 *               expressions; every input is read before the first output is
 *               written.
 * Example     : See LASX_TRANSPOSE16x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                             _in8, _in9, _in10, _in11,                        \
                             _in12, _in13, _in14, _in15,                      \
                             _out0, _out1, _out2, _out3,                      \
                             _out4, _out5, _out6, _out7)                      \
{                                                                             \
    __m256i _b0_m, _b1_m, _b2_m, _b3_m, _b4_m, _b5_m, _b6_m, _b7_m;           \
    __m256i _w0_m, _w1_m, _w2_m, _w3_m, _w4_m, _w5_m, _w6_m, _w7_m;           \
                                                                              \
    /* Stage 1: interleave bytes of rows two apart (0/2, 1/3, 4/6, ...). */   \
    _b0_m = __lasx_xvilvl_b(_in2, _in0);                                      \
    _b1_m = __lasx_xvilvl_b(_in3, _in1);                                      \
    _b2_m = __lasx_xvilvl_b(_in6, _in4);                                      \
    _b3_m = __lasx_xvilvl_b(_in7, _in5);                                      \
    _b4_m = __lasx_xvilvl_b(_in10, _in8);                                     \
    _b5_m = __lasx_xvilvl_b(_in11, _in9);                                     \
    _b6_m = __lasx_xvilvl_b(_in14, _in12);                                    \
    _b7_m = __lasx_xvilvl_b(_in15, _in13);                                    \
    /* Stage 2: interleave again, grouping four rows (outputs as scratch). */ \
    _out0 = __lasx_xvilvl_b(_b1_m, _b0_m);                                    \
    _out1 = __lasx_xvilvh_b(_b1_m, _b0_m);                                    \
    _out2 = __lasx_xvilvl_b(_b3_m, _b2_m);                                    \
    _out3 = __lasx_xvilvh_b(_b3_m, _b2_m);                                    \
    _out4 = __lasx_xvilvl_b(_b5_m, _b4_m);                                    \
    _out5 = __lasx_xvilvh_b(_b5_m, _b4_m);                                    \
    _out6 = __lasx_xvilvl_b(_b7_m, _b6_m);                                    \
    _out7 = __lasx_xvilvh_b(_b7_m, _b6_m);                                    \
    /* Stage 3: interleave words, grouping eight rows. */                     \
    _w0_m = __lasx_xvilvl_w(_out2, _out0);                                    \
    _w2_m = __lasx_xvilvh_w(_out2, _out0);                                    \
    _w4_m = __lasx_xvilvl_w(_out3, _out1);                                    \
    _w6_m = __lasx_xvilvh_w(_out3, _out1);                                    \
    _w1_m = __lasx_xvilvl_w(_out6, _out4);                                    \
    _w3_m = __lasx_xvilvh_w(_out6, _out4);                                    \
    _w5_m = __lasx_xvilvl_w(_out7, _out5);                                    \
    _w7_m = __lasx_xvilvh_w(_out7, _out5);                                    \
    /* Stage 4: interleave doublewords, completing the 16-byte columns. */    \
    _out0 = __lasx_xvilvl_d(_w1_m, _w0_m);                                    \
    _out1 = __lasx_xvilvh_d(_w1_m, _w0_m);                                    \
    _out2 = __lasx_xvilvl_d(_w3_m, _w2_m);                                    \
    _out3 = __lasx_xvilvh_d(_w3_m, _w2_m);                                    \
    _out4 = __lasx_xvilvl_d(_w5_m, _w4_m);                                    \
    _out5 = __lasx_xvilvh_d(_w5_m, _w4_m);                                    \
    _out6 = __lasx_xvilvl_d(_w7_m, _w6_m);                                    \
    _out7 = __lasx_xvilvh_d(_w7_m, _w6_m);                                    \
}
1521 
1522 /*
1523  * =============================================================================
1524  * Description : Transpose input 16x8 halfword block
1525  * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
1526  * _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
1527  * (input 16x8 halfword block)
1528  * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
1529  * (output 8x16 halfword block)
1530  * Details : The rows of the matrix become columns, and the columns become rows.
1531  * Example : LASX_TRANSPOSE16x8_H
1532  * _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1533  * _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1534  * _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1535  * _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1536  * _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1537  * _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1538  * _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1539  * _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1540  * _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1541  * _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1542  * _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1543  * _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1544  * _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1545  * _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1546  * _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1547  * _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
1548  *
1549  * _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
1550  * _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
1551  * _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
1552  * _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
1553  * _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
1554  * _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
1555  * _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
1556  * _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
1557  * =============================================================================
1558  */
/*
 * LASX_TRANSPOSE16x8_H
 *   Inputs  - _in0 .. _in15 (sixteen rows; per the example above only the
 *             first eight halfwords of each row take part)
 *   Outputs - _out0 .. _out7 (eight rows of sixteen halfwords)
 *
 * The results for _out0 .. _out3 are staged in local vectors and stored only
 * after the last read of the inputs, so an output argument may name the same
 * variable as an input argument (in-place use). Storing them earlier would
 * overwrite rows that the second half of the transpose still has to read.
 */
#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15, \
                             _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
    __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
    __m256i _res0_m, _res1_m, _res2_m, _res3_m; \
    \
    /* First half: xvilvl_h pass over the rows, results kept in locals. */ \
    _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \
    _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \
    _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \
    _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \
    _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \
    _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \
    _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \
    _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
    _res0_m = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
    _res1_m = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
    _res2_m = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
    _res3_m = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
    \
    /* Second half: xvilvh_h pass; this is the last read of _in0 .. _in15. */ \
    _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \
    _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \
    _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \
    _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \
    _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \
    _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \
    _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \
    _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \
    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
    _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
    _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
    _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
    \
    /* Inputs are no longer needed: publish the staged first-half rows. */ \
    _out0 = _res0_m; \
    _out1 = _res1_m; \
    _out2 = _res2_m; \
    _out3 = _res3_m; \
}
1625 
1626 /*
1627  * =============================================================================
1628  * Description : Transpose 4x4 block with halfword elements in vectors
1629  * Arguments : Inputs - _in0, _in1, _in2, _in3
1630  * Outputs - _out0, _out1, _out2, _out3
1631  * Return Type - signed halfword
1632  * Details : The rows of the matrix become columns, and the columns become rows.
1633  * Example : See LASX_TRANSPOSE8x8_H
1634  * =============================================================================
1635  */
#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
{ \
    /* xvilvl_h pass: rows 0/1 combined, rows 2/3 combined. */ \
    __m256i _rows01_m = __lasx_xvilvl_h(_in1, _in0); \
    __m256i _rows23_m = __lasx_xvilvl_h(_in3, _in2); \
    \
    /* All inputs have been read; each odd output is derived from the */ \
    /* even output stored just before it via xvilvh_d.                */ \
    _out0 = __lasx_xvilvl_w(_rows23_m, _rows01_m); \
    _out1 = __lasx_xvilvh_d(_out0, _out0); \
    _out2 = __lasx_xvilvh_w(_rows23_m, _rows01_m); \
    _out3 = __lasx_xvilvh_d(_out2, _out2); \
}
1647 
1648 /*
1649  * =============================================================================
1650  * Description : Transpose input 8x8 byte block
1651  * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
1652  * (input 8x8 byte block)
1653  * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
1654  * (output 8x8 byte block)
1655  * Example : See LASX_TRANSPOSE8x8_H
1656  * =============================================================================
1657  */
#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _out0, \
                            _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    __m256i _r02_m, _r13_m, _r46_m, _r57_m; \
    __m256i _top_lo_m, _top_hi_m, _bot_lo_m, _bot_hi_m; \
    \
    /* Stage 1: xvilvl_b on rows two apart (last read of the inputs). */ \
    _r02_m = __lasx_xvilvl_b(_in2, _in0); \
    _r13_m = __lasx_xvilvl_b(_in3, _in1); \
    _r46_m = __lasx_xvilvl_b(_in6, _in4); \
    _r57_m = __lasx_xvilvl_b(_in7, _in5); \
    \
    /* Stage 2: combine the stage-1 vectors for rows 0..3 and rows 4..7. */ \
    _top_lo_m = __lasx_xvilvl_b(_r13_m, _r02_m); \
    _top_hi_m = __lasx_xvilvh_b(_r13_m, _r02_m); \
    _bot_lo_m = __lasx_xvilvl_b(_r57_m, _r46_m); \
    _bot_hi_m = __lasx_xvilvh_b(_r57_m, _r46_m); \
    \
    /* Stage 3: word interleave gives the even outputs; each odd output */ \
    /* is the preceding even output shifted right by 8 bytes.           */ \
    _out0 = __lasx_xvilvl_w(_bot_lo_m, _top_lo_m); \
    _out1 = __lasx_xvbsrl_v(_out0, 8); \
    _out2 = __lasx_xvilvh_w(_bot_lo_m, _top_lo_m); \
    _out3 = __lasx_xvbsrl_v(_out2, 8); \
    _out4 = __lasx_xvilvl_w(_bot_hi_m, _top_hi_m); \
    _out5 = __lasx_xvbsrl_v(_out4, 8); \
    _out6 = __lasx_xvilvh_w(_bot_hi_m, _top_hi_m); \
    _out7 = __lasx_xvbsrl_v(_out6, 8); \
}
1680 
1681 /*
1682  * =============================================================================
1683  * Description : Transpose 8x8 block with halfword elements in vectors.
1684  * Arguments : Inputs - _in0, _in1, ~
1685  * Outputs - _out0, _out1, ~
1686  * Details : The rows of the matrix become columns, and the columns become rows.
1687  * Example : LASX_TRANSPOSE8x8_H
1688  * _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
1689  * _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
1690  * _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
1691  * _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
1692  * _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
1693  * _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
1694  * _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
1695  * _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
1696  *
1697  * _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
1698  * _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
1699  * _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
1700  * _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
1701  * _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
1702  * _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
1703  * _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
1704  * _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
1705  * =============================================================================
1706  */
#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, _out0, \
                            _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    __m256i _even_m, _odd_m; \
    __m256i _top0_m, _top1_m, _top2_m, _top3_m; \
    __m256i _bot0_m, _bot1_m, _bot2_m, _bot3_m; \
    \
    /* Rows 0..3: two halfword interleave passes. */ \
    _even_m = __lasx_xvilvl_h(_in2, _in0); \
    _odd_m  = __lasx_xvilvl_h(_in3, _in1); \
    _top0_m = __lasx_xvilvl_h(_odd_m, _even_m); \
    _top1_m = __lasx_xvilvh_h(_odd_m, _even_m); \
    _even_m = __lasx_xvilvh_h(_in2, _in0); \
    _odd_m  = __lasx_xvilvh_h(_in3, _in1); \
    _top2_m = __lasx_xvilvl_h(_odd_m, _even_m); \
    _top3_m = __lasx_xvilvh_h(_odd_m, _even_m); \
    \
    /* Rows 4..7: same two passes. */ \
    _even_m = __lasx_xvilvl_h(_in6, _in4); \
    _odd_m  = __lasx_xvilvl_h(_in7, _in5); \
    _bot0_m = __lasx_xvilvl_h(_odd_m, _even_m); \
    _bot1_m = __lasx_xvilvh_h(_odd_m, _even_m); \
    _even_m = __lasx_xvilvh_h(_in6, _in4); \
    _odd_m  = __lasx_xvilvh_h(_in7, _in5); \
    _bot2_m = __lasx_xvilvl_h(_odd_m, _even_m); \
    _bot3_m = __lasx_xvilvh_h(_odd_m, _even_m); \
    \
    /* Every input has been read by now, so the outputs may alias them. */ \
    /* Even/odd doublewords of each top/bottom pair form two outputs.   */ \
    _out0 = __lasx_xvpickev_d(_bot0_m, _top0_m); \
    _out1 = __lasx_xvpickod_d(_bot0_m, _top0_m); \
    _out2 = __lasx_xvpickev_d(_bot1_m, _top1_m); \
    _out3 = __lasx_xvpickod_d(_bot1_m, _top1_m); \
    _out4 = __lasx_xvpickev_d(_bot2_m, _top2_m); \
    _out5 = __lasx_xvpickod_d(_bot2_m, _top2_m); \
    _out6 = __lasx_xvpickev_d(_bot3_m, _top3_m); \
    _out7 = __lasx_xvpickod_d(_bot3_m, _top3_m); \
}
1741 
1742 /*
1743  * =============================================================================
1744  * Description : Butterfly of 4 input vectors
1745  * Arguments : Inputs - _in0, _in1, _in2, _in3
1746  * Outputs - _out0, _out1, _out2, _out3
1747  * Details : Butterfly operation
1748  * Example : LASX_BUTTERFLY_4
1749  * _out0 = _in0 + _in3;
1750  * _out1 = _in1 + _in2;
1751  * _out2 = _in1 - _in2;
1752  * _out3 = _in0 - _in3;
1753  * =============================================================================
1754  */
#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
{ \
    /* Byte elements (xvadd_b / xvsub_b). The statements run in this    */ \
    /* order and the subtractions re-read the inputs, so an output must */ \
    /* not name an input that a later statement still reads.            */ \
    _out0 = __lasx_xvadd_b(_in0, _in3); /* outer pair, sum        */ \
    _out1 = __lasx_xvadd_b(_in1, _in2); /* inner pair, sum        */ \
    _out2 = __lasx_xvsub_b(_in1, _in2); /* inner pair, difference */ \
    _out3 = __lasx_xvsub_b(_in0, _in3); /* outer pair, difference */ \
}
#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
{ \
    /* Halfword elements (xvadd_h / xvsub_h). The statements run in     */ \
    /* this order and the subtractions re-read the inputs, so an output */ \
    /* must not name an input that a later statement still reads.       */ \
    _out0 = __lasx_xvadd_h(_in0, _in3); /* outer pair, sum        */ \
    _out1 = __lasx_xvadd_h(_in1, _in2); /* inner pair, sum        */ \
    _out2 = __lasx_xvsub_h(_in1, _in2); /* inner pair, difference */ \
    _out3 = __lasx_xvsub_h(_in0, _in3); /* outer pair, difference */ \
}
#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
{ \
    /* Word elements (xvadd_w / xvsub_w). The statements run in this    */ \
    /* order and the subtractions re-read the inputs, so an output must */ \
    /* not name an input that a later statement still reads.            */ \
    _out0 = __lasx_xvadd_w(_in0, _in3); /* outer pair, sum        */ \
    _out1 = __lasx_xvadd_w(_in1, _in2); /* inner pair, sum        */ \
    _out2 = __lasx_xvsub_w(_in1, _in2); /* inner pair, difference */ \
    _out3 = __lasx_xvsub_w(_in0, _in3); /* outer pair, difference */ \
}
#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
{ \
    /* Doubleword elements (xvadd_d / xvsub_d). The statements run in   */ \
    /* this order and the subtractions re-read the inputs, so an output */ \
    /* must not name an input that a later statement still reads.       */ \
    _out0 = __lasx_xvadd_d(_in0, _in3); /* outer pair, sum        */ \
    _out1 = __lasx_xvadd_d(_in1, _in2); /* inner pair, sum        */ \
    _out2 = __lasx_xvsub_d(_in1, _in2); /* inner pair, difference */ \
    _out3 = __lasx_xvsub_d(_in0, _in3); /* outer pair, difference */ \
}
1783 
1784 /*
1785  * =============================================================================
1786  * Description : Butterfly of 8 input vectors
1787  * Arguments : Inputs - _in0, _in1, _in2, _in3, ~
1788  * Outputs - _out0, _out1, _out2, _out3, ~
1789  * Details : Butterfly operation
1790  * Example : LASX_BUTTERFLY_8
1791  * _out0 = _in0 + _in7;
1792  * _out1 = _in1 + _in6;
1793  * _out2 = _in2 + _in5;
1794  * _out3 = _in3 + _in4;
1795  * _out4 = _in3 - _in4;
1796  * _out5 = _in2 - _in5;
1797  * _out6 = _in1 - _in6;
1798  * _out7 = _in0 - _in7;
1799  * =============================================================================
1800  */
#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    /* Byte elements. Input k is paired with input 7-k; the four sums   */ \
    /* are stored first, then the four differences in reverse pairing.  */ \
    /* The differences re-read the inputs, so an output must not name   */ \
    /* an input that a later statement still reads.                     */ \
    _out0 = __lasx_xvadd_b(_in0, _in7); /* pair (0,7), sum        */ \
    _out1 = __lasx_xvadd_b(_in1, _in6); /* pair (1,6), sum        */ \
    _out2 = __lasx_xvadd_b(_in2, _in5); /* pair (2,5), sum        */ \
    _out3 = __lasx_xvadd_b(_in3, _in4); /* pair (3,4), sum        */ \
    _out4 = __lasx_xvsub_b(_in3, _in4); /* pair (3,4), difference */ \
    _out5 = __lasx_xvsub_b(_in2, _in5); /* pair (2,5), difference */ \
    _out6 = __lasx_xvsub_b(_in1, _in6); /* pair (1,6), difference */ \
    _out7 = __lasx_xvsub_b(_in0, _in7); /* pair (0,7), difference */ \
}
1813 
#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    /* Halfword elements. Input k is paired with input 7-k; the four    */ \
    /* sums are stored first, then the four differences in reverse      */ \
    /* pairing. The differences re-read the inputs, so an output must   */ \
    /* not name an input that a later statement still reads.            */ \
    _out0 = __lasx_xvadd_h(_in0, _in7); /* pair (0,7), sum        */ \
    _out1 = __lasx_xvadd_h(_in1, _in6); /* pair (1,6), sum        */ \
    _out2 = __lasx_xvadd_h(_in2, _in5); /* pair (2,5), sum        */ \
    _out3 = __lasx_xvadd_h(_in3, _in4); /* pair (3,4), sum        */ \
    _out4 = __lasx_xvsub_h(_in3, _in4); /* pair (3,4), difference */ \
    _out5 = __lasx_xvsub_h(_in2, _in5); /* pair (2,5), difference */ \
    _out6 = __lasx_xvsub_h(_in1, _in6); /* pair (1,6), difference */ \
    _out7 = __lasx_xvsub_h(_in0, _in7); /* pair (0,7), difference */ \
}
1826 
#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    /* Word elements. Input k is paired with input 7-k; the four sums   */ \
    /* are stored first, then the four differences in reverse pairing.  */ \
    /* The differences re-read the inputs, so an output must not name   */ \
    /* an input that a later statement still reads.                     */ \
    _out0 = __lasx_xvadd_w(_in0, _in7); /* pair (0,7), sum        */ \
    _out1 = __lasx_xvadd_w(_in1, _in6); /* pair (1,6), sum        */ \
    _out2 = __lasx_xvadd_w(_in2, _in5); /* pair (2,5), sum        */ \
    _out3 = __lasx_xvadd_w(_in3, _in4); /* pair (3,4), sum        */ \
    _out4 = __lasx_xvsub_w(_in3, _in4); /* pair (3,4), difference */ \
    _out5 = __lasx_xvsub_w(_in2, _in5); /* pair (2,5), difference */ \
    _out6 = __lasx_xvsub_w(_in1, _in6); /* pair (1,6), difference */ \
    _out7 = __lasx_xvsub_w(_in0, _in7); /* pair (0,7), difference */ \
}
1839 
#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7) \
{ \
    /* Doubleword elements. Input k is paired with input 7-k; the four  */ \
    /* sums are stored first, then the four differences in reverse      */ \
    /* pairing. The differences re-read the inputs, so an output must   */ \
    /* not name an input that a later statement still reads.            */ \
    _out0 = __lasx_xvadd_d(_in0, _in7); /* pair (0,7), sum        */ \
    _out1 = __lasx_xvadd_d(_in1, _in6); /* pair (1,6), sum        */ \
    _out2 = __lasx_xvadd_d(_in2, _in5); /* pair (2,5), sum        */ \
    _out3 = __lasx_xvadd_d(_in3, _in4); /* pair (3,4), sum        */ \
    _out4 = __lasx_xvsub_d(_in3, _in4); /* pair (3,4), difference */ \
    _out5 = __lasx_xvsub_d(_in2, _in5); /* pair (2,5), difference */ \
    _out6 = __lasx_xvsub_d(_in1, _in6); /* pair (1,6), difference */ \
    _out7 = __lasx_xvsub_d(_in0, _in7); /* pair (0,7), difference */ \
}
1852 
1853 #endif //LASX
1854 
/*
 * =============================================================================
 * Description : Print out elements in vector.
 * Arguments   : Inputs  - RTYPE, element_num, in0, enter
 *               Outputs - none (text is written with printf)
 * Details     : Print out the first 'element_num' elements of vector 'in0'
 *               viewed as type 'RTYPE', each formatted with "%d,". If 'enter'
 *               is non-zero, the prefix "\nVP:" is printed first.
 *               The value arguments are parenthesized in the expansion, so
 *               compound expressions such as 'cond ? 4 : 2' can be passed.
 *               'element_num' is re-evaluated on every loop iteration.
 *               printf() must be declared where the macro is expanded.
 *               NOTE(review): "%d" expects an int-compatible element; confirm
 *               before using this with 64-bit or floating-point vector types.
 * Example     : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
 *               VP:1,2,3,4,
 * =============================================================================
 */
#define VECT_PRINT(RTYPE, element_num, in0, enter) \
{ \
    RTYPE _tmp0 = (RTYPE)(in0); \
    int _i = 0; \
    if (enter) { \
        printf("\nVP:"); \
    } \
    for (_i = 0; _i < (element_num); _i++) { \
        printf("%d,", _tmp0[_i]); \
    } \
}
1875 
1876 #endif /* LOONGSON_INTRINSICS_H */
1877 #endif /* AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H */
out
FILE * out
Definition: movenc.c:54
max
#define max(a, b)
Definition: cuda_runtime.h:33
zero
#define zero
Definition: regdef.h:64
min
float min
Definition: vorbis_enc_data.h:429