FFmpeg: generic_macros_msa.h
/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
#define AVUTIL_MIPS_GENERIC_MACROS_MSA_H

#include <stdint.h>
#include <msa.h>

#define ALIGNMENT 16
#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))

#define LD_V(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
#define LD_UW(...) LD_V(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)

#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
#define ST_UH(...) ST_V(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
#define ST_UW(...) ST_V(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)
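
/* Usage sketch (illustrative, not part of the original header): LD_V/ST_V
   expand to plain vector-typed dereferences. A hypothetical copy of one
   16-byte row from 'src' to 'dst' would be:

       uint8_t *src, *dst;       // hypothetical buffers
       v16u8 row = LD_UB(src);   // load 16 unsigned bytes
       ST_UB(row, dst);          // store them back out
*/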

#if (__mips_isa_rev >= 6)
    #define LH(psrc)                                  \
    ( {                                               \
        uint16_t val_lh_m = *(uint16_t *)(psrc);      \
        val_lh_m;                                     \
    } )

    #define LW(psrc)                                  \
    ( {                                               \
        uint32_t val_lw_m = *(uint32_t *)(psrc);      \
        val_lw_m;                                     \
    } )

    #if (__mips == 64)
        #define LD(psrc)                              \
        ( {                                           \
            uint64_t val_ld_m = *(uint64_t *)(psrc);  \
            val_ld_m;                                 \
        } )
    #else  // !(__mips == 64)
        #define LD(psrc)                                                   \
        ( {                                                                \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                       \
            uint32_t val0_ld_m, val1_ld_m;                                 \
            uint64_t val_ld_m = 0;                                         \
                                                                           \
            val0_ld_m = LW(psrc_ld_m);                                     \
            val1_ld_m = LW(psrc_ld_m + 4);                                 \
                                                                           \
            val_ld_m = (uint64_t) (val1_ld_m);                             \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);       \
                                                                           \
            val_ld_m;                                                      \
        } )
    #endif  // (__mips == 64)

    #define SH(val, pdst) *(uint16_t *)(pdst) = (val);
    #define SW(val, pdst) *(uint32_t *)(pdst) = (val);
    #define SD(val, pdst) *(uint64_t *)(pdst) = (val);

#else  // !(__mips_isa_rev >= 6)
    #define LH(psrc)                                 \
    ( {                                              \
        uint8_t *psrc_lh_m = (uint8_t *) (psrc);     \
        uint16_t val_lh_m;                           \
                                                     \
        __asm__ volatile (                           \
            "ulh  %[val_lh_m],  %[psrc_lh_m]  \n\t"  \
                                                     \
            : [val_lh_m] "=r" (val_lh_m)             \
            : [psrc_lh_m] "m" (*psrc_lh_m)           \
        );                                           \
                                                     \
        val_lh_m;                                    \
    } )

    #define LW(psrc)                                 \
    ( {                                              \
        uint8_t *psrc_lw_m = (uint8_t *) (psrc);     \
        uint32_t val_lw_m;                           \
                                                     \
        __asm__ volatile (                           \
            "ulw  %[val_lw_m],  %[psrc_lw_m]  \n\t"  \
                                                     \
            : [val_lw_m] "=r" (val_lw_m)             \
            : [psrc_lw_m] "m" (*psrc_lw_m)           \
        );                                           \
                                                     \
        val_lw_m;                                    \
    } )

    #if (__mips == 64)
        #define LD(psrc)                                 \
        ( {                                              \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);     \
            uint64_t val_ld_m = 0;                       \
                                                         \
            __asm__ volatile (                           \
                "uld  %[val_ld_m],  %[psrc_ld_m]  \n\t"  \
                                                         \
                : [val_ld_m] "=r" (val_ld_m)             \
                : [psrc_ld_m] "m" (*psrc_ld_m)           \
            );                                           \
                                                         \
            val_ld_m;                                    \
        } )
    #else  // !(__mips == 64)
        #define LD(psrc)                                                   \
        ( {                                                                \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                       \
            uint32_t val0_ld_m, val1_ld_m;                                 \
            uint64_t val_ld_m = 0;                                         \
                                                                           \
            val0_ld_m = LW(psrc_ld_m);                                     \
            val1_ld_m = LW(psrc_ld_m + 4);                                 \
                                                                           \
            val_ld_m = (uint64_t) (val1_ld_m);                             \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);       \
                                                                           \
            val_ld_m;                                                      \
        } )
    #endif  // (__mips == 64)

    #define SH(val, pdst)                            \
    {                                                \
        uint8_t *pdst_sh_m = (uint8_t *) (pdst);     \
        uint16_t val_sh_m = (val);                   \
                                                     \
        __asm__ volatile (                           \
            "ush  %[val_sh_m],  %[pdst_sh_m]  \n\t"  \
                                                     \
            : [pdst_sh_m] "=m" (*pdst_sh_m)          \
            : [val_sh_m] "r" (val_sh_m)              \
        );                                           \
    }

    #define SW(val, pdst)                            \
    {                                                \
        uint8_t *pdst_sw_m = (uint8_t *) (pdst);     \
        uint32_t val_sw_m = (val);                   \
                                                     \
        __asm__ volatile (                           \
            "usw  %[val_sw_m],  %[pdst_sw_m]  \n\t"  \
                                                     \
            : [pdst_sw_m] "=m" (*pdst_sw_m)          \
            : [val_sw_m] "r" (val_sw_m)              \
        );                                           \
    }

    #define SD(val, pdst)                                            \
    {                                                                \
        uint8_t *pdst_sd_m = (uint8_t *) (pdst);                     \
        uint32_t val0_sd_m, val1_sd_m;                               \
                                                                     \
        val0_sd_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);         \
        val1_sd_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \
                                                                     \
        SW(val0_sd_m, pdst_sd_m);                                    \
        SW(val1_sd_m, pdst_sd_m + 4);                                \
    }
#endif  // (__mips_isa_rev >= 6)
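
/* Usage sketch (illustrative, not part of the original header): LH/LW/LD and
   SH/SW/SD are unaligned scalar accessors, so they stay safe on byte-aligned
   pointers where a plain typed dereference could trap. A hypothetical 8-byte
   copy at arbitrary offsets:

       uint8_t *src, *dst;         // hypothetical, possibly unaligned
       uint64_t v = LD(src + 3);   // unaligned 64-bit load
       SD(v, dst + 5);             // unaligned 64-bit store
*/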

/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1, out2, out3
   Details     : Loads word in 'out0' from (psrc)
                 Loads word in 'out1' from (psrc + stride)
                 Loads word in 'out2' from (psrc + 2 * stride)
                 Loads word in 'out3' from (psrc + 3 * stride)
*/
#define LW4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    out0 = LW((psrc));                             \
    out1 = LW((psrc) + stride);                    \
    out2 = LW((psrc) + 2 * stride);                \
    out3 = LW((psrc) + 3 * stride);                \
}

#define LW2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LW((psrc));                 \
    out1 = LW((psrc) + stride);        \
}

/* Description : Load double words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1
   Details     : Loads double word in 'out0' from (psrc)
                 Loads double word in 'out1' from (psrc + stride)
*/
#define LD2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LD((psrc));                 \
    out1 = LD((psrc) + stride);        \
}
#define LD4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    LD2((psrc), stride, out0, out1);               \
    LD2((psrc) + 2 * stride, stride, out2, out3);  \
}
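
/* Usage sketch (illustrative, not part of the original header): gathering a
   4x4 block of words from an image; 'src' (uint8_t *) and 'stride' are
   hypothetical:

       uint32_t w0, w1, w2, w3;
       LW4(src, stride, w0, w1, w2, w3);   // w0..w3 = rows 0..3 of the block
*/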

/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores word from 'in0' to (pdst)
                 Stores word from 'in1' to (pdst + stride)
                 Stores word from 'in2' to (pdst + 2 * stride)
                 Stores word from 'in3' to (pdst + 3 * stride)
*/
#define SW4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SW(in0, (pdst));                           \
    SW(in1, (pdst) + stride);                  \
    SW(in2, (pdst) + 2 * stride);              \
    SW(in3, (pdst) + 3 * stride);              \
}

/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores double word from 'in0' to (pdst)
                 Stores double word from 'in1' to (pdst + stride)
                 Stores double word from 'in2' to (pdst + 2 * stride)
                 Stores double word from 'in3' to (pdst + 3 * stride)
*/
#define SD4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SD(in0, (pdst));                           \
    SD(in1, (pdst) + stride);                  \
    SD(in2, (pdst) + 2 * stride);              \
    SD(in3, (pdst) + 3 * stride);              \
}

/* Description : Load vector elements with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Loads elements in 'out0' from (psrc)
                 Loads elements in 'out1' from (psrc + stride)
*/
#define LD_V2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_V(RTYPE, (psrc));                 \
    out1 = LD_V(RTYPE, (psrc) + stride);        \
}
#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
#define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)

#define LD_V3(RTYPE, psrc, stride, out0, out1, out2)  \
{                                                     \
    LD_V2(RTYPE, (psrc), stride, out0, out1);         \
    out2 = LD_V(RTYPE, (psrc) + 2 * stride);          \
}
#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__)

#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{                                                           \
    LD_V2(RTYPE, (psrc), stride, out0, out1);               \
    LD_V2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
}
#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)

#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
{                                                                 \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
    out4 = LD_V(RTYPE, (psrc) + 4 * stride);                      \
}
#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)

#define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_V2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__)
#define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__)

#define LD_V7(RTYPE, psrc, stride,                               \
              out0, out1, out2, out3, out4, out5, out6)          \
{                                                                \
    LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
    LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
}
#define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)

#define LD_V8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7)           \
{                                                                       \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
#define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)

#define LD_V16(RTYPE, psrc, stride,                                   \
               out0, out1, out2, out3, out4, out5, out6, out7,        \
               out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                     \
    LD_V8(RTYPE, (psrc), stride,                                      \
          out0, out1, out2, out3, out4, out5, out6, out7);            \
    LD_V8(RTYPE, (psrc) + 8 * stride, stride,                         \
          out8, out9, out10, out11, out12, out13, out14, out15);      \
}
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
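
/* Usage sketch (illustrative, not part of the original header): loading eight
   16-pixel rows at once; 'src' and 'stride' are hypothetical:

       v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
       LD_UB8(src, stride, r0, r1, r2, r3, r4, r5, r6, r7);
*/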

/* Description : Load as 4x4 block of signed halfword elements from 1D source
                 data into 4 vectors (each vector with 4 signed halfwords)
   Arguments   : Inputs  - psrc
                 Outputs - out0, out1, out2, out3
*/
#define LD4x4_SH(psrc, out0, out1, out2, out3)                \
{                                                             \
    out0 = LD_SH(psrc);                                       \
    out2 = LD_SH(psrc + 8);                                   \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);  \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out2, (v2i64) out2);  \
}

/* Description : Store vectors with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst    (destination pointer to store to)
   Details     : Stores elements from 'in0' to (pdst)
                 Stores elements from 'in1' to (pdst + stride)
*/
#define ST_V2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_V(RTYPE, in0, (pdst));                 \
    ST_V(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__)
#define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)

#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_V2(RTYPE, in0, in1, (pdst), stride);               \
    ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__)
#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
#define ST_SW4(...) ST_V4(v4i32, __VA_ARGS__)

#define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                                 \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride);             \
    ST_V2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);          \
}
#define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__)

#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride);                       \
    ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
}
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)

/* Description : Store as 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
                 Return Type - unsigned byte
   Details     : The halfword element at index 'stidx' of vector 'in' is
                 copied and stored on the first line
                 The halfword element at index 'stidx + 1' is copied and
                 stored on the second line
                 The halfword element at index 'stidx + 2' is copied and
                 stored on the third line
                 The halfword element at index 'stidx + 3' is copied and
                 stored on the fourth line
*/
#define ST2x4_UB(in, stidx, pdst, stride)              \
{                                                      \
    uint16_t out0_m, out1_m, out2_m, out3_m;           \
    uint8_t *pblk_2x4_m = (uint8_t *) (pdst);          \
                                                       \
    out0_m = __msa_copy_u_h((v8i16) in, (stidx));      \
    out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1));  \
    out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2));  \
    out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3));  \
                                                       \
    SH(out0_m, pblk_2x4_m);                            \
    SH(out1_m, pblk_2x4_m + stride);                   \
    SH(out2_m, pblk_2x4_m + 2 * stride);               \
    SH(out3_m, pblk_2x4_m + 3 * stride);               \
}
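
/* Usage sketch (illustrative, not part of the original header): writing a
   2-byte-wide, 4-row block, the narrow partial-width store case; 'px',
   'dst' and 'stride' are hypothetical:

       v16u8 px;                       // assumed already computed
       ST2x4_UB(px, 0, dst, stride);   // halfwords 0..3 of px go to rows 0..3
*/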

/* Description : Store as 4x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
                 Return Type - unsigned byte
   Details     : Word element at index 0 from the input vector is copied and
                 stored on the first line
                 Word element at index 1 is copied and stored on the second line
*/
#define ST4x2_UB(in, pdst, stride)             \
{                                              \
    uint32_t out0_m, out1_m;                   \
    uint8_t *pblk_4x2_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in, 0);    \
    out1_m = __msa_copy_u_w((v4i32) in, 1);    \
                                               \
    SW(out0_m, pblk_4x2_m);                    \
    SW(out1_m, pblk_4x2_m + stride);           \
}

/* Description : Store as 4x4 byte block to destination memory from input vector
   Arguments   : Inputs - in0, in1, pdst, stride
                 Return Type - unsigned byte
   Details     : Word element at index 'idx0' from input vector 'in0' is
                 copied and stored on the first line
                 Word element at index 'idx1' from 'in0' is copied and stored
                 on the second line
                 Word element at index 'idx2' from input vector 'in1' is
                 copied and stored on the third line
                 Word element at index 'idx3' from 'in1' is copied and stored
                 on the fourth line
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                                 \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    uint8_t *pblk_4x4_m = (uint8_t *) (pdst);                     \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32) in0, idx0);                   \
    out1_m = __msa_copy_u_w((v4i32) in0, idx1);                   \
    out2_m = __msa_copy_u_w((v4i32) in1, idx2);                   \
    out3_m = __msa_copy_u_w((v4i32) in1, idx3);                   \
                                                                  \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);      \
}
#define ST4x8_UB(in0, in1, pdst, stride)                            \
{                                                                   \
    uint8_t *pblk_4x8 = (uint8_t *) (pdst);                         \
                                                                    \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
}

/* Description : Store as 6x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
                 Return Type - unsigned byte
   Details     : Word element at index 0 from input vector 'in0' is copied and
                 stored on the first line, followed by the halfword element at
                 index 2
                 Word element at index 2 from 'in0' is copied and stored on
                 the second line, followed by the halfword element at index 6
                 Word element at index 0 from input vector 'in1' is copied and
                 stored on the third line, followed by the halfword element at
                 index 2
                 Word element at index 2 from 'in1' is copied and stored on
                 the fourth line, followed by the halfword element at index 6
*/
#define ST6x4_UB(in0, in1, pdst, stride)       \
{                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;   \
    uint16_t out4_m, out5_m, out6_m, out7_m;   \
    uint8_t *pblk_6x4_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in0, 0);   \
    out1_m = __msa_copy_u_w((v4i32) in0, 2);   \
    out2_m = __msa_copy_u_w((v4i32) in1, 0);   \
    out3_m = __msa_copy_u_w((v4i32) in1, 2);   \
                                               \
    out4_m = __msa_copy_u_h((v8i16) in0, 2);   \
    out5_m = __msa_copy_u_h((v8i16) in0, 6);   \
    out6_m = __msa_copy_u_h((v8i16) in1, 2);   \
    out7_m = __msa_copy_u_h((v8i16) in1, 6);   \
                                               \
    SW(out0_m, pblk_6x4_m);                    \
    SH(out4_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out1_m, pblk_6x4_m);                    \
    SH(out5_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out2_m, pblk_6x4_m);                    \
    SH(out6_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out3_m, pblk_6x4_m);                    \
    SH(out7_m, (pblk_6x4_m + 4));              \
}

/* Description : Store as 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Double word element at index 0 from input vector 'in' is
                 copied and stored to destination memory at (pdst)
*/
#define ST8x1_UB(in, pdst)                   \
{                                            \
    uint64_t out0_m;                         \
    out0_m = __msa_copy_u_d((v2i64) in, 0);  \
    SD(out0_m, pdst);                        \
}

/* Description : Store as 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Double word element at index 0 from input vector 'in' is
                 copied and stored to destination memory at (pdst)
                 Double word element at index 1 is copied and stored at
                 (pdst + stride)
*/
#define ST8x2_UB(in, pdst, stride)             \
{                                              \
    uint64_t out0_m, out1_m;                   \
    uint8_t *pblk_8x2_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_d((v2i64) in, 0);    \
    out1_m = __msa_copy_u_d((v2i64) in, 1);    \
                                               \
    SD(out0_m, pblk_8x2_m);                    \
    SD(out1_m, pblk_8x2_m + stride);           \
}

/* Description : Store as 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Double word element at index 0 from input vector 'in0' is
                 copied and stored to destination memory at (pblk_8x4_m)
                 Double word element at index 1 from 'in0' is copied and
                 stored at (pblk_8x4_m + stride)
                 Double word element at index 0 from input vector 'in1' is
                 copied and stored at (pblk_8x4_m + 2 * stride)
                 Double word element at index 1 from 'in1' is copied and
                 stored at (pblk_8x4_m + 3 * stride)
*/
#define ST8x4_UB(in0, in1, pdst, stride)                      \
{                                                             \
    uint64_t out0_m, out1_m, out2_m, out3_m;                  \
    uint8_t *pblk_8x4_m = (uint8_t *) (pdst);                 \
                                                              \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                  \
    out1_m = __msa_copy_u_d((v2i64) in0, 1);                  \
    out2_m = __msa_copy_u_d((v2i64) in1, 0);                  \
    out3_m = __msa_copy_u_d((v2i64) in1, 1);                  \
                                                              \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
}
#define ST8x8_UB(in0, in1, in2, in3, pdst, stride)        \
{                                                         \
    uint8_t *pblk_8x8_m = (uint8_t *) (pdst);             \
                                                          \
    ST8x4_UB(in0, in1, pblk_8x8_m, stride);               \
    ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride);  \
}
#define ST12x4_UB(in0, in1, in2, pdst, stride)                \
{                                                             \
    uint8_t *pblk_12x4_m = (uint8_t *) (pdst);                \
                                                              \
    /* left 8x4 */                                            \
    ST8x4_UB(in0, in1, pblk_12x4_m, stride);                  \
    /* right 4x4 */                                           \
    ST4x4_UB(in2, in2, 0, 1, 2, 3, pblk_12x4_m + 8, stride);  \
}

/* Description : Store as 12x8 byte block to destination memory from
                 input vectors
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Double word element at index 0 from input vector 'in0' is
                 copied and stored to destination memory at (pblk_12x8_m),
                 followed by the word element at index 2 from the same vector
                 at (pblk_12x8_m + 8)
                 The remaining lines follow the same pattern
*/
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                        \
    uint64_t out0_m, out1_m, out2_m, out3_m;                             \
    uint64_t out4_m, out5_m, out6_m, out7_m;                             \
    uint32_t out8_m, out9_m, out10_m, out11_m;                           \
    uint32_t out12_m, out13_m, out14_m, out15_m;                         \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
                                                                         \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
    out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
    out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
    out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
    out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
                                                                         \
    out8_m = __msa_copy_u_w((v4i32) in0, 2);                             \
    out9_m = __msa_copy_u_w((v4i32) in1, 2);                             \
    out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
    out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
    out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
    out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
    out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
    out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
                                                                         \
    SD(out0_m, pblk_12x8_m);                                             \
    SW(out8_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out1_m, pblk_12x8_m);                                             \
    SW(out9_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out2_m, pblk_12x8_m);                                             \
    SW(out10_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out3_m, pblk_12x8_m);                                             \
    SW(out11_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out4_m, pblk_12x8_m);                                             \
    SW(out12_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out5_m, pblk_12x8_m);                                             \
    SW(out13_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out6_m, pblk_12x8_m);                                             \
    SW(out14_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out7_m, pblk_12x8_m);                                             \
    SW(out15_m, pblk_12x8_m + 8);                                        \
}

/* Description : Average with rounding: (in0 + in1 + 1) / 2
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each byte element from 'in0' is added to the corresponding
                 byte element from 'in1'. The addition of the elements plus 1
                 (for rounding) is done unsigned with full precision,
                 i.e. the result has one extra bit. Unsigned division by 2
                 (or logical shift right by one bit) is performed before
                 writing the result to vector 'out0'
                 Similar for the pair of 'in2' and 'in3'
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1);  \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3);  \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)              \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3)              \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
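
/* Usage sketch (illustrative, not part of the original header): rounded byte
   averaging as used in bi-directional prediction; 'a0', 'a1', 'b0' and 'b1'
   are hypothetical pixel vectors:

       v16u8 avg0, avg1;
       AVER_UB2_UB(a0, b0, a1, b1, avg0, avg1);   // avgN = (aN + bN + 1) >> 1
*/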

/* Description : Immediate number of columns to slide with zero
   Arguments   : Inputs  - in0, in1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from the 'zero_m' vector are slid into 'in0'
                 by the number of elements specified by 'slide_val'
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)                 \
{                                                                         \
    v16i8 zero_m = { 0 };                                                 \
    out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val);  \
}
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
#define SLDI_B2_0_SB(...) SLDI_B2_0(v16i8, __VA_ARGS__)
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)

#define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2, slide_val)      \
{                                                                         \
    v16i8 zero_m = { 0 };                                                 \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);                    \
    out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in2, slide_val);  \
}
#define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__)
#define SLDI_B3_0_SB(...) SLDI_B3_0(v16i8, __VA_ARGS__)

#define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
                  out0, out1, out2, out3, slide_val)    \
{                                                       \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);  \
    SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);  \
}
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
#define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)
#define SLDI_B4_0_SH(...) SLDI_B4_0(v8i16, __VA_ARGS__)
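
/* Worked example (illustrative, not part of the original header): a slide of
   2 shifts every byte down two element positions and zero-fills the top, so
   a vector holding bytes {0, 1, ..., 15} becomes {2, 3, ..., 15, 0, 0}:

       v16u8 s0, s1, d0, d1;              // s0, s1 assumed already loaded
       SLDI_B2_0_UB(s0, s1, d0, d1, 2);
*/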

/* Description : Immediate number of columns to slide
   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from the 'in0_0' vector are slid into 'in1_0'
                 by the number of elements specified by 'slide_val'
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)  \
{                                                                          \
    out0 = (RTYPE) __msa_sldi_b((v16i8) in0_0, (v16i8) in1_0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) in0_1, (v16i8) in1_1, slide_val);  \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)

#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2,           \
                out0, out1, out2, slide_val)                               \
{                                                                          \
    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)      \
    out2 = (RTYPE) __msa_sldi_b((v16i8) in0_2, (v16i8) in1_2, slide_val);  \
}
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)

/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0' & 'in1' are selectively copied to
                 'out0' as per control vector 'mask0'
                 Byte elements from 'in2' & 'in3' are selectively copied to
                 'out1' as per control vector 'mask1'
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,       \
                out0, out1, out2, out3)                            \
{                                                                  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
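
/* Worked example (illustrative, not part of the original header): under the
   MSA VSHF semantics, mask values 0..15 select bytes of 'in0' and 16..31
   select bytes of 'in1', so the mask below would gather the even bytes of
   'a' followed by the even bytes of 'b':

       v16i8 a, b;                        // assumed already loaded
       v16i8 mask = { 0, 2, 4, 6, 8, 10, 12, 14,
                      16, 18, 20, 22, 24, 26, 28, 30 };
       v16i8 r0, r1;
       VSHF_B2_SB(a, b, a, b, mask, mask, r0, r1);
*/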

/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Halfword elements from 'in0' & 'in1' are selectively copied
                 to 'out0' as per control vector 'mask0'
                 Halfword elements from 'in2' & 'in3' are selectively copied
                 to 'out1' as per control vector 'mask1'
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2);  \
}
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_h((v8i16) mask2, (v8i16) in5, (v8i16) in4);  \
}
#define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)

/* Description : Shuffle word vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Word elements from 'in0' & 'in1' are selectively copied to
                 'out0' as per control vector 'mask0'
                 Word elements from 'in2' & 'in3' are selectively copied to
                 'out1' as per control vector 'mask1'
*/
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2);  \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)

/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied by
                 unsigned byte elements from 'cnst0', producing results
                 twice the size of the inputs, i.e. unsigned halfwords.
                 The products of adjacent odd-even element pairs are then
                 added together and stored to the out vector
                 (8 unsigned halfword results per output vector)
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1);  \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,           \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
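
/* Worked example (illustrative, not part of the original header): with
   mult0 = {a0, a1, ..., a15} and cnst0 = {c0, c1, ..., c15}, DOTP_UB2
   produces out0 = {a0*c0 + a1*c1, a2*c2 + a3*c3, ..., a14*c14 + a15*c15},
   one halfword per adjacent byte pair, as used by 2-tap filters:

       v16u8 px0, px1, coef;   // hypothetical operands
       v8u16 acc0, acc1;
       DOTP_UB2_UH(px0, px1, coef, coef, acc0, acc1);
*/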

/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied by
                 signed byte elements from 'cnst0', producing results
                 twice the size of the inputs, i.e. signed halfwords.
                 The products of adjacent odd-even element pairs are then
                 added together and stored to the out vector
                 (8 signed halfword results per output vector)
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);  \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
                 out0, out1, out2)                                 \
{                                                                  \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                     \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)

/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied by
                 signed halfword elements from 'cnst0', producing results
                 twice the size of the inputs, i.e. signed words.
                 The products of adjacent odd-even element pairs are then
                 added together and stored to the out vector
                 (4 signed word results per output vector)
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1);  \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,           \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)

/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied by
                 signed byte elements from 'cnst0', producing results
                 twice the size of the inputs, i.e. signed halfwords.
                 The products of adjacent odd-even element pairs are then
                 added to the out vector
                 (8 signed halfword results per output vector)
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                   \
                                   (v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                   \
                                   (v16i8) mult1, (v16i8) cnst1);  \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
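
/* Usage sketch (illustrative, not part of the original header): because the
   macro body is two sequential statements, passing the same accumulator for
   both outputs folds four filter taps into one running sum, a common 4-tap
   pattern; 'src01', 'src23', 'filt01' and 'filt23' are hypothetical:

       v16i8 src01, src23, filt01, filt23;   // assumed already prepared
       v8i16 acc = { 0 };
       DPADD_SB2_SH(src01, src23, filt01, filt23, acc, acc);
*/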

/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied by
                 unsigned byte elements from 'cnst0', producing results
                 twice the size of the inputs, i.e. unsigned halfwords.
                 The products of adjacent odd-even element pairs are then
                 added to the out vector
                 (8 unsigned halfword results per output vector)
*/
#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_u_h((v8u16) out0,                   \
                                   (v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_u_h((v8u16) out1,                   \
                                   (v16u8) mult1, (v16u8) cnst1);  \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)

/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied by
                 signed halfword elements from 'cnst0', producing results
                 twice the size of the inputs, i.e. signed words.
                 The products of adjacent odd-even element pairs are then
                 added to the out vector
                 (4 signed word results per output vector)
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                   \
                                   (v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                   \
                                   (v8i16) mult1, (v8i16) cnst1);  \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)

/* Description : Minimum values between unsigned elements of
                 either vector are copied to the output vector
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Minimum of unsigned halfword element values from 'in0' and
                 'min_vec' are written to output vector 'in0'
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)               \
{                                                       \
    in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec);  \
    in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec);  \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)  \
{                                                    \
    MIN_UH2(RTYPE, in0, in1, min_vec);               \
    MIN_UH2(RTYPE, in2, in3, min_vec);               \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)

/* Description : Clips all halfword elements of input vector between min & max
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs  - in    (input vector)
                         - min   (min threshold)
                         - max   (max threshold)
                 Outputs - out_m (output vector with clipped elements)
                 Return Type - signed halfword
*/
#define CLIP_SH(in, min, max)                          \
( {                                                    \
    v8i16 out_m;                                       \
                                                       \
    out_m = __msa_max_s_h((v8i16) min, (v8i16) in);    \
    out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m); \
    out_m;                                             \
} )

/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in    (input vector)
                 Outputs - out_m (output vector with clipped elements)
                 Return Type - signed halfword
*/
#define CLIP_SH_0_255(in)                                \
( {                                                      \
    v8i16 max_m = __msa_ldi_h(255);                      \
    v8i16 out_m;                                         \
                                                         \
    out_m = __msa_maxi_s_h((v8i16) in, 0);               \
    out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m); \
    out_m;                                               \
} )
#define CLIP_SH2_0_255(in0, in1)  \
{                                 \
    in0 = CLIP_SH_0_255(in0);     \
    in1 = CLIP_SH_0_255(in1);     \
}
#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SH2_0_255(in0, in1);               \
    CLIP_SH2_0_255(in2, in3);               \
}
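
/* Usage sketch (illustrative, not part of the original header): clamping
   reconstructed pixels to the 8-bit range after adding a residual; 'pred'
   and 'resid' are hypothetical halfword vectors:

       v8i16 pred, resid;         // assumed already computed
       v8i16 px = pred + resid;   // may fall outside [0, 255]
       px = CLIP_SH_0_255(px);    // now every element is in [0, 255]
*/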

#define CLIP_SH_0_255_MAX_SATU(in)                    \
( {                                                   \
    v8i16 out_m;                                      \
                                                      \
    out_m = __msa_maxi_s_h((v8i16) in, 0);            \
    out_m = (v8i16) __msa_sat_u_h((v8u16) out_m, 7);  \
    out_m;                                            \
} )
#define CLIP_SH2_0_255_MAX_SATU(in0, in1)  \
{                                          \
    in0 = CLIP_SH_0_255_MAX_SATU(in0);     \
    in1 = CLIP_SH_0_255_MAX_SATU(in1);     \
}
#define CLIP_SH4_0_255_MAX_SATU(in0, in1, in2, in3)  \
{                                                    \
    CLIP_SH2_0_255_MAX_SATU(in0, in1);               \
    CLIP_SH2_0_255_MAX_SATU(in2, in3);               \
}

/* Description : Clips all signed word elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in    (input vector)
                 Outputs - out_m (output vector with clipped elements)
                 Return Type - signed word
*/
#define CLIP_SW_0_255(in)                                \
( {                                                      \
    v4i32 max_m = __msa_ldi_w(255);                      \
    v4i32 out_m;                                         \
                                                         \
    out_m = __msa_maxi_s_w((v4i32) in, 0);               \
    out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m); \
    out_m;                                               \
} )

/* Description : Addition of 4 signed word elements
                 The 4 signed word elements of the input vector are added
                 together and the resulting integer sum is returned
   Arguments   : Inputs  - in    (signed word vector)
                 Outputs - sum_m (i32 sum)
                 Return Type - signed word
*/
#define HADD_SW_S32(in)                               \
( {                                                   \
    v2i64 res0_m, res1_m;                             \
    int32_t sum_m;                                    \
                                                      \
    res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in);  \
    res1_m = __msa_splati_d(res0_m, 1);               \
    res0_m += res1_m;                                 \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0);        \
    sum_m;                                            \
} )
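
/* Worked example (illustrative, not part of the original header): for
   in = {1, 2, 3, 4}, hadd_s_d pairs the words into {1 + 2, 3 + 4} = {3, 7},
   the splat/add folds the doublewords into 3 + 7 = 10, and copy_s_w
   extracts the scalar:

       v4i32 v = { 1, 2, 3, 4 };
       int32_t s = HADD_SW_S32(v);   // s == 10
*/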

/* Description : Addition of 8 unsigned halfword elements
                 The 8 unsigned halfword elements of the input vector are
                 added together and the resulting integer sum is returned
   Arguments   : Inputs  - in    (unsigned halfword vector)
                 Outputs - sum_m (u32 sum)
                 Return Type - unsigned word
*/
#define HADD_UH_U32(in)                                  \
( {                                                      \
    v4u32 res_m;                                         \
    v2u64 res0_m, res1_m;                                \
    uint32_t sum_m;                                      \
                                                         \
    res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in);      \
    res0_m = __msa_hadd_u_d(res_m, res_m);               \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1);  \
    res0_m += res1_m;                                    \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0);           \
    sum_m;                                               \
} )

/* Description : Horizontal addition of signed byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each odd signed byte element from 'in0' is added to the
                 adjacent even signed byte element (pairwise) and the
                 halfword result is stored in 'out0'
*/
#define HADD_SB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, (v16i8) in0);  \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, (v16i8) in1);  \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)

#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_SB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_SB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)

/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each odd unsigned byte element from 'in0' is added to the
                 adjacent even unsigned byte element (pairwise) and the
                 halfword result is stored in 'out0'
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2)      \
{                                                             \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                    \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2);  \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)

#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)

/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each even unsigned byte element is subtracted from the
                 adjacent odd unsigned byte element of 'in0' (pairwise) and
                 the halfword result is stored in 'out0'
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HSUB_UB2(RTYPE, in0, in1, out0, out1);                           \
    HSUB_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)

/* Description : SAD (Sum of Absolute Difference)
   Arguments   : Inputs  - in0, in1, ref0, ref1 (unsigned byte src & ref)
                 Outputs - sad_m (halfword vector with sad)
                 Return Type - unsigned halfword
   Details     : The absolute differences of all byte elements from 'in0'
                 with 'ref0' are calculated and kept in 'diff0_m'. From the
                 16 unsigned absolute diff values, even-odd pairs are added
                 together to generate 8 halfword results.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1)                        \
( {                                                             \
    v16u8 diff0_m, diff1_m;                                     \
    v8u16 sad_m = { 0 };                                        \
                                                                \
    diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0);        \
    diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1);        \
                                                                \
    sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m);  \
    sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m);  \
                                                                \
    sad_m;                                                      \
} )
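
/* Usage sketch (illustrative, not part of the original header): a motion
   estimation loop would accumulate per-pair SADs and reduce them to a scalar
   with HADD_UH_U32; 'cur0', 'cur1', 'ref0' and 'ref1' are hypothetical rows:

       v16u8 cur0, cur1, ref0, ref1;   // assumed already loaded
       v8u16 sad = SAD_UB2_UH(cur0, cur1, ref0, ref1);
       uint32_t cost = HADD_UH_U32(sad);   // total SAD over both rows
*/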

/* Description : Insert specified word elements from input vectors to 1
                 destination vector
   Arguments   : Inputs  - in0, in1, in2, in3 (4 input vectors)
                 Outputs - out                (output vector)
                 Return Type - as per RTYPE
*/
#define INSERT_W2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)       \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3);  \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SH(...) INSERT_W4(v8i16, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
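
/* Usage sketch (illustrative, not part of the original header): the usual
   pairing of LW4 with INSERT_W4 to gather four strided 32-bit loads into a
   single vector; 'src' and 'stride' are hypothetical:

       uint32_t w0, w1, w2, w3;
       v16u8 row = { 0 };
       LW4(src, stride, w0, w1, w2, w3);
       INSERT_W4_UB(w0, w1, w2, w3, row);   // row now holds the 4x4 byte block
*/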

/* Description : Insert specified double word elements from input vectors to 1
                 destination vector
   Arguments   : Inputs  - in0, in1 (2 input vectors)
                 Outputs - out      (output vector)
                 Return Type - as per RTYPE
*/
#define INSERT_D2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0);  \
    out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1);  \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)

/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and even byte elements of 'in1'
                 are interleaved and copied to 'out0'
                 Even byte elements of 'in2' and even byte elements of 'in3'
                 are interleaved and copied to 'out1'
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2);  \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)

/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and even halfword elements
                 of 'in1' are interleaved and copied to 'out0'
                 Even halfword elements of 'in2' and even halfword elements
                 of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2);  \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)

/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and even word elements of 'in1'
                 are interleaved and copied to 'out0'
                 Even word elements of 'in2' and even word elements of 'in3'
                 are interleaved and copied to 'out1'
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2);  \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)

/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and even double word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even double word elements of 'in2' and even double word
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0);  \
    out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2);  \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)

/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of 'in0' and left half of byte
                 elements of 'in1' are interleaved and copied to 'out0'
                 Left half of byte elements of 'in2' and left half of byte
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)

/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of 'in0' and left half of
                 halfword elements of 'in1' are interleaved and copied to
                 'out0'
                 Left half of halfword elements of 'in2' and left half of
                 halfword elements of 'in3' are interleaved and copied to
                 'out1'
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
#define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)

/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and left half of word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Left half of word elements of 'in2' and left half of word
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)

/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and right half of byte
                 elements of 'in1' are interleaved and copied to 'out0'
                 Right half of byte elements of 'in2' and right half of byte
                 elements of 'in3' are interleaved and copied to 'out1'
                 Similar for other pairs
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5);              \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_SB(...) ILVR_B3(v16i8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)

#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)

#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,    \
                in8, in9, in10, in11, in12, in13, in14, in15,     \
                out0, out1, out2, out3, out4, out5, out6, out7)   \
{                                                                 \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
            out0, out1, out2, out3);                              \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15,  \
            out4, out5, out6, out7);                              \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
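
/* Usage sketch (illustrative, not part of the original header): interleaving
   with a zero vector is the standard way to widen unsigned bytes to
   halfwords before arithmetic; 'px0' and 'px1' are hypothetical byte vectors:

       v16i8 zero = { 0 };
       v8u16 w0, w1;
       ILVR_B2_UH(zero, px0, zero, px1, w0, w1);   // low 8 bytes, zero-extended
*/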

/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of 'in0' and right half of
                 halfword elements of 'in1' are interleaved and copied to
                 'out0'
                 Right half of halfword elements of 'in2' and right half of
                 halfword elements of 'in3' are interleaved and copied to
                 'out1'
                 Similar for other pairs
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5);              \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)

#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)

#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)

/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of 'in0' and right half
                 of double word elements of 'in1' are interleaved and copied
                 to 'out0'
                 Right half of double word elements of 'in2' and right half
                 of double word elements of 'in3' are interleaved and copied
                 to 'out1'
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) in2, (v2i64) in3);  \
}
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) in4, (v2i64) in5);              \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)

/* Description : Interleave left half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of double word elements of 'in0' and left half of
                 double word elements of 'in1' are interleaved and copied to
                 'out0'
                 Left half of double word elements of 'in2' and left half of
                 double word elements of 'in3' are interleaved and copied to
                 'out1'
1570 */
1571 #define ILVL_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1572 { \
1573  out0 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \
1574  out1 = (RTYPE) __msa_ilvl_d((v2i64) in2, (v2i64) in3); \
1575 }
1576 #define ILVL_D2_UB(...) ILVL_D2(v16u8, __VA_ARGS__)
1577 #define ILVL_D2_SB(...) ILVL_D2(v16i8, __VA_ARGS__)
1578 #define ILVL_D2_SH(...) ILVL_D2(v8i16, __VA_ARGS__)
1579 
1580 /* Description : Interleave both left and right half of input vectors
1581  Arguments : Inputs - in0, in1
1582  Outputs - out0, out1
1583  Return Type - as per RTYPE
1584  Details : Right half of byte elements from 'in0' and 'in1' are
1585  interleaved and stored to 'out0'
1586  Left half of byte elements from 'in0' and 'in1' are
1587  interleaved and stored to 'out1'
1588 */
1589 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
1590 { \
1591  out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
1592  out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
1593 }
1594 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
1595 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
1596 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
1597 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
1598 #define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
1599 
1600 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
1601 { \
1602  out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \
1603  out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \
1604 }
1605 #define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
1606 #define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
1607 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
1608 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1609 
1610 #define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
1611 { \
1612  out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
1613  out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
1614 }
1615 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
1616 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
1617 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1618 
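/* Usage sketch (illustrative; 'row', 'lo' and 'hi' are hypothetical names):
   one ILVRL_* call produces both the right- and left-half interleave of a
   vector pair, e.g. zero-extending all 16 bytes of a row in one step:

       v16i8 zero = { 0 };
       v16u8 row;                               // loaded with LD_UB
       v8i16 lo, hi;

       ILVRL_B2_SH(zero, row, lo, hi);          // lo = bytes 0..7, hi = 8..15
*/
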
1619 /* Description : Maximum of signed elements of a vector and a 5-bit signed
1620  immediate value is copied to the output vector
1621  Arguments : Inputs - in0, in1, in2, in3, max_val
1622  Outputs - in0, in1, in2, in3 (in place)
1623  Return Type - as per RTYPE
1624  Details : The element-wise maximum of the signed halfword elements of
1625  'in0' and 'max_val' is written back to 'in0'
1626 */
1627 #define MAXI_SH2(RTYPE, in0, in1, max_val) \
1628 { \
1629  in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, max_val); \
1630  in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, max_val); \
1631 }
1632 #define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
1633 #define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
1634 
1635 #define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val) \
1636 { \
1637  MAXI_SH2(RTYPE, in0, in1, max_val); \
1638  MAXI_SH2(RTYPE, in2, in3, max_val); \
1639 }
1640 #define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
1641 #define MAXI_SH4_SH(...) MAXI_SH4(v8i16, __VA_ARGS__)
1642 
1643 #define MAXI_SH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, max_val) \
1644 { \
1645  MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val); \
1646  MAXI_SH4(RTYPE, in4, in5, in6, in7, max_val); \
1647 }
1648 #define MAXI_SH8_UH(...) MAXI_SH8(v8u16, __VA_ARGS__)
1649 #define MAXI_SH8_SH(...) MAXI_SH8(v8i16, __VA_ARGS__)
1650 
1651 /* Description : Saturate the halfword element values to the maximum
1652  unsigned value representable in (sat_val+1) bits
1653  The element data width remains unchanged
1654  Arguments : Inputs - in0, in1, in2, in3, sat_val
1655  Outputs - in0, in1, in2, in3 (in place)
1656  Return Type - as per RTYPE
1657  Details : Each unsigned halfword element from 'in0' is saturated to the
1658  maximum value representable in (sat_val+1) bits
1659  Results are stored in place in the original vectors
1660 */
1661 #define SAT_UH2(RTYPE, in0, in1, sat_val) \
1662 { \
1663  in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val); \
1664  in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val); \
1665 }
1666 #define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
1667 #define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
1668 
1669 #define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
1670 { \
1671  SAT_UH2(RTYPE, in0, in1, sat_val); \
1672  SAT_UH2(RTYPE, in2, in3, sat_val); \
1673 }
1674 #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1675 #define SAT_UH4_SH(...) SAT_UH4(v8i16, __VA_ARGS__)
1676 
1677 #define SAT_UH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, sat_val) \
1678 { \
1679  SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val); \
1680  SAT_UH4(RTYPE, in4, in5, in6, in7, sat_val); \
1681 }
1682 #define SAT_UH8_UH(...) SAT_UH8(v8u16, __VA_ARGS__)
1683 #define SAT_UH8_SH(...) SAT_UH8(v8i16, __VA_ARGS__)
1684 
1685 /* Description : Saturate the halfword element values to the maximum
1686  signed value representable in (sat_val+1) bits
1687  The element data width remains unchanged
1688  Arguments : Inputs - in0, in1, in2, in3, sat_val
1689  Outputs - in0, in1, in2, in3 (in place)
1690  Return Type - as per RTYPE
1691  Details : Each signed halfword element from 'in0' is saturated to the
1692  signed range representable in (sat_val+1) bits
1693  Results are stored in place in the original vectors
1694 */
1695 #define SAT_SH2(RTYPE, in0, in1, sat_val) \
1696 { \
1697  in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val); \
1698  in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val); \
1699 }
1700 #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
1701 
1702 #define SAT_SH3(RTYPE, in0, in1, in2, sat_val) \
1703 { \
1704  SAT_SH2(RTYPE, in0, in1, sat_val); \
1705  in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val); \
1706 }
1707 #define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)
1708 
1709 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
1710 { \
1711  SAT_SH2(RTYPE, in0, in1, sat_val); \
1712  SAT_SH2(RTYPE, in2, in3, sat_val); \
1713 }
1714 #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1715 
1716 /* Description : Saturate the word element values to the maximum
1717  signed value representable in (sat_val+1) bits
1718  The element data width remains unchanged
1719  Arguments : Inputs - in0, in1, in2, in3, sat_val
1720  Outputs - in0, in1, in2, in3 (in place)
1721  Return Type - as per RTYPE
1722  Details : Each signed word element from 'in0' is saturated to the
1723  signed range representable in (sat_val+1) bits
1724  Results are stored in place in the original vectors
1725 */
1726 #define SAT_SW2(RTYPE, in0, in1, sat_val) \
1727 { \
1728  in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val); \
1729  in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val); \
1730 }
1731 #define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)
1732 
1733 #define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val) \
1734 { \
1735  SAT_SW2(RTYPE, in0, in1, sat_val); \
1736  SAT_SW2(RTYPE, in2, in3, sat_val); \
1737 }
1738 #define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
1739 
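/* Usage sketch (illustrative; 'res0'/'res1' are hypothetical names): MAXI_SH*
   and SAT_UH* combine into an unsigned clamp, e.g. clipping two vectors of
   halfword filter output to the 0..255 pixel range:

       v8i16 res0, res1;                        // signed filter results

       MAXI_SH2_SH(res0, res1, 0);              // clamp negative values to 0
       SAT_UH2_SH(res0, res1, 7);               // saturate to 8 bits: max 255
*/
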
1740 /* Description : Indexed halfword element values are replicated to all
1741  elements in output vector
1742  Arguments : Inputs - in, idx0, idx1
1743  Outputs - out0, out1
1744  Return Type - as per RTYPE
1745  Details : 'idx0' element value from 'in' vector is replicated to all
1746  elements in 'out0' vector
1747  Valid index range for halfword operation is 0-7
1748 */
1749 #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
1750 { \
1751  out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0); \
1752  out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1); \
1753 }
1754 #define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
1755 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
1756 
1757 #define SPLATI_H3(RTYPE, in, idx0, idx1, idx2, \
1758  out0, out1, out2) \
1759 { \
1760  SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
1761  out2 = (RTYPE) __msa_splati_h((v8i16) in, idx2); \
1762 }
1763 #define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
1764 #define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)
1765 
1766 #define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
1767  out0, out1, out2, out3) \
1768 { \
1769  SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
1770  SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
1771 }
1772 #define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
1773 #define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1774 
1775 /* Description : Indexed word element values are replicated to all
1776  elements in output vector
1777  Arguments : Inputs - in, stidx
1778  Outputs - out0, out1
1779  Return Type - as per RTYPE
1780  Details : 'stidx' element value from 'in' vector is replicated to all
1781  elements in 'out0' vector
1782  'stidx + 1' element value from 'in' vector is replicated to all
1783  elements in 'out1' vector
1784  Valid index range for word operation is 0-3
1785 */
1786 #define SPLATI_W2(RTYPE, in, stidx, out0, out1) \
1787 { \
1788  out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \
1789  out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \
1790 }
1791 #define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
1792 #define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)
1793 
1794 #define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \
1795 { \
1796  SPLATI_W2(RTYPE, in, 0, out0, out1); \
1797  SPLATI_W2(RTYPE, in, 2, out2, out3); \
1798 }
1799 #define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
1800 #define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
1801 
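/* Usage sketch (illustrative; 'coeff_ptr' and 'filt*' are hypothetical
   names): splatting packed filter taps so that each tap fills a whole
   vector, ready for element-wise multiply-accumulate:

       const int16_t *coeff_ptr;                // points to packed taps
       v8i16 filt = LD_SH(coeff_ptr);
       v8i16 filt0, filt1, filt2, filt3;

       SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
*/
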
1802 /* Description : Pack even byte elements of vector pairs
1803  Arguments : Inputs - in0, in1, in2, in3
1804  Outputs - out0, out1
1805  Return Type - as per RTYPE
1806  Details : Even byte elements of in0 are copied to the left half of
1807  out0 & even byte elements of in1 are copied to the right
1808  half of out0.
1809  Even byte elements of in2 are copied to the left half of
1810  out1 & even byte elements of in3 are copied to the right
1811  half of out1.
1812 */
1813 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
1814 { \
1815  out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1); \
1816  out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3); \
1817 }
1818 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
1819 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
1820 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
1821 #define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
1822 
1823 #define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
1824 { \
1825  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1826  out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5); \
1827 }
1828 #define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
1829 #define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)
1830 
1831 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1832  out0, out1, out2, out3) \
1833 { \
1834  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1835  PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
1836 }
1837 #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
1838 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
1839 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
1840 #define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
1841 
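/* Usage sketch (illustrative; 'res*'/'out*' are hypothetical names): after
   halfword arithmetic, PCKEV_B* narrows pairs of halfword vectors back to
   bytes by keeping the even (least significant) byte of every halfword:

       v8i16 res0, res1, res2, res3;            // values already in 0..255
       v16u8 out0, out1;

       PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
*/
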
1842 /* Description : Pack even halfword elements of vector pairs
1843  Arguments : Inputs - in0, in1, in2, in3
1844  Outputs - out0, out1
1845  Return Type - as per RTYPE
1846  Details : Even halfword elements of in0 are copied to the left half of
1847  out0 & even halfword elements of in1 are copied to the right
1848  half of out0.
1849  Even halfword elements of in2 are copied to the left half of
1850  out1 & even halfword elements of in3 are copied to the right
1851  half of out1.
1852 */
1853 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
1854 { \
1855  out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1); \
1856  out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3); \
1857 }
1858 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
1859 #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
1860 
1861 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1862  out0, out1, out2, out3) \
1863 { \
1864  PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1865  PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
1866 }
1867 #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
1868 #define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
1869 
1870 /* Description : Pack even double word elements of vector pairs
1871  Arguments : Inputs - in0, in1, in2, in3
1872  Outputs - out0, out1
1873  Return Type - as per RTYPE
1874  Details : Even double word elements of in0 are copied to the left half
1875  of out0 & even double word elements of in1 are copied to the
1876  right half of out0.
1877  Even double word elements of in2 are copied to the left half
1878  of out1 & even double word elements of in3 are copied to the
1879  right half of out1.
1880 */
1881 #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1882 { \
1883  out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
1884  out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \
1885 }
1886 #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
1887 #define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
1888 #define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
1889 
1890 #define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1891  out0, out1, out2, out3) \
1892 { \
1893  PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
1894  PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
1895 }
1896 #define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1897 
1898 /* Description : Pack odd double word elements of vector pairs
1899  Arguments : Inputs - in0, in1, in2, in3
1900  Outputs - out0, out1
1901  Return Type - as per RTYPE
1902  Details : Odd double word element of in0 is copied to the left half of
1903  out0 & odd double word element of in1 is copied to the right
1904  half of out0.
1905  Similar for in2, in3 and out1.
1906 */
1907 #define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1908 { \
1909  out0 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1); \
1910  out1 = (RTYPE) __msa_pckod_d((v2i64) in2, (v2i64) in3); \
1911 }
1912 #define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
1913 #define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
1914 #define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
1915 
1916 /* Description : Each byte element is logically xor'ed with immediate 128
1917  Arguments : Inputs - in0, in1
1918  Outputs - in0, in1 (in-place)
1919  Return Type - as per RTYPE
1920  Details : Each unsigned byte element from input vector 'in0' is
1921  logically xor'ed with 128 and result is in-place stored in
1922  'in0' vector
1923  Each unsigned byte element from input vector 'in1' is
1924  logically xor'ed with 128 and result is in-place stored in
1925  'in1' vector
1926  Similar for other pairs
1927 */
1928 #define XORI_B2_128(RTYPE, in0, in1) \
1929 { \
1930  in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128); \
1931  in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128); \
1932 }
1933 #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
1934 #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
1935 #define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)
1936 
1937 #define XORI_B3_128(RTYPE, in0, in1, in2) \
1938 { \
1939  XORI_B2_128(RTYPE, in0, in1); \
1940  in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128); \
1941 }
1942 #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
1943 
1944 #define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
1945 { \
1946  XORI_B2_128(RTYPE, in0, in1); \
1947  XORI_B2_128(RTYPE, in2, in3); \
1948 }
1949 #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
1950 #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
1951 #define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)
1952 
1953 #define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4) \
1954 { \
1955  XORI_B3_128(RTYPE, in0, in1, in2); \
1956  XORI_B2_128(RTYPE, in3, in4); \
1957 }
1958 #define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)
1959 
1960 #define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5) \
1961 { \
1962  XORI_B4_128(RTYPE, in0, in1, in2, in3); \
1963  XORI_B2_128(RTYPE, in4, in5); \
1964 }
1965 #define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)
1966 
1967 #define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
1968 { \
1969  XORI_B4_128(RTYPE, in0, in1, in2, in3); \
1970  XORI_B3_128(RTYPE, in4, in5, in6); \
1971 }
1972 #define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
1973 
1974 #define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7) \
1975 { \
1976  XORI_B4_128(RTYPE, in0, in1, in2, in3); \
1977  XORI_B4_128(RTYPE, in4, in5, in6, in7); \
1978 }
1979 #define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
1980 #define XORI_B8_128_UB(...) XORI_B8_128(v16u8, __VA_ARGS__)
1981 
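/* Usage sketch (illustrative; 'src0'/'src1' are hypothetical names): the xor
   with 128 re-biases unsigned pixels to signed bytes (and back), so signed
   multiply intrinsics such as __msa_dotp_s_h can be applied to pixel data:

       v16i8 src0, src1;                        // hold unsigned pixel bytes

       XORI_B2_128_SB(src0, src1);              // now signed, range -128..127
*/
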
1982 /* Description : Addition of signed halfword elements and signed saturation
1983  Arguments : Inputs - in0, in1, in2, in3
1984  Outputs - out0, out1
1985  Return Type - as per RTYPE
1986  Details : Signed halfword elements from 'in0' are added to signed
1987  halfword elements of 'in1'. The result is then signed saturated
1988  between -32768 and +32767 (as per the halfword data type)
1989  Similar for other pairs
1990 */
1991 #define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \
1992 { \
1993  out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1); \
1994  out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3); \
1995 }
1996 #define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
1997 
1998 #define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1999  out0, out1, out2, out3) \
2000 { \
2001  ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
2002  ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
2003 }
2004 #define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
2005 #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
2006 
2007 /* Description : Shift left all elements of vector (generic for all data types)
2008  Arguments : Inputs - in0, in1, in2, in3, shift
2009  Outputs - in0, in1, in2, in3 (in place)
2010  Return Type - as per input vector RTYPE
2011  Details : Each element of vector 'in0' is left shifted by 'shift' and
2012  the result is written in place to 'in0'
2013  Similar for other pairs
2014 */
2015 #define SLLI_2V(in0, in1, shift) \
2016 { \
2017  in0 = in0 << shift; \
2018  in1 = in1 << shift; \
2019 }
2020 #define SLLI_4V(in0, in1, in2, in3, shift) \
2021 { \
2022  in0 = in0 << shift; \
2023  in1 = in1 << shift; \
2024  in2 = in2 << shift; \
2025  in3 = in3 << shift; \
2026 }
2027 
2028 /* Description : Arithmetic shift right all elements of vector
2029  (generic for all data types)
2030  Arguments : Inputs - in0, in1, in2, in3, shift
2031  Outputs - in0, in1, in2, in3 (in place)
2032  Return Type - as per input vector RTYPE
2033  Details : Each element of vector 'in0' is right shifted by 'shift' and
2034  the result is written in place to 'in0'
2035  Here, 'shift' is GP variable passed in
2036  Similar for other pairs
2037 */
2038 #define SRA_4V(in0, in1, in2, in3, shift) \
2039 { \
2040  in0 = in0 >> shift; \
2041  in1 = in1 >> shift; \
2042  in2 = in2 >> shift; \
2043  in3 = in3 >> shift; \
2044 }
2045 
2046 /* Description : Shift right logical all halfword elements of vector
2047  Arguments : Inputs - in0, in1, in2, in3, shift
2048  Outputs - in0, in1, in2, in3 (in place)
2049  Return Type - as per RTYPE
2050  Details : Each element of vector 'in0' is logically shifted right by the
2051  number of bits held in the corresponding element of vector
2052  'shift' and the result is written in place to 'in0'
2053  Here, 'shift' is a vector passed in
2054  Similar for other pairs
2055 */
2056 #define SRL_H4(RTYPE, in0, in1, in2, in3, shift) \
2057 { \
2058  in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift); \
2059  in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift); \
2060  in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift); \
2061  in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift); \
2062 }
2063 #define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
2064 
2065 #define SRLR_H4(RTYPE, in0, in1, in2, in3, shift) \
2066 { \
2067  in0 = (RTYPE) __msa_srlr_h((v8i16) in0, (v8i16) shift); \
2068  in1 = (RTYPE) __msa_srlr_h((v8i16) in1, (v8i16) shift); \
2069  in2 = (RTYPE) __msa_srlr_h((v8i16) in2, (v8i16) shift); \
2070  in3 = (RTYPE) __msa_srlr_h((v8i16) in3, (v8i16) shift); \
2071 }
2072 #define SRLR_H4_UH(...) SRLR_H4(v8u16, __VA_ARGS__)
2073 #define SRLR_H4_SH(...) SRLR_H4(v8i16, __VA_ARGS__)
2074 
2075 #define SRLR_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, shift) \
2076 { \
2077  SRLR_H4(RTYPE, in0, in1, in2, in3, shift); \
2078  SRLR_H4(RTYPE, in4, in5, in6, in7, shift); \
2079 }
2080 #define SRLR_H8_UH(...) SRLR_H8(v8u16, __VA_ARGS__)
2081 #define SRLR_H8_SH(...) SRLR_H8(v8i16, __VA_ARGS__)
2082 
2083 /* Description : Shift right arithmetic rounded halfwords
2084  Arguments : Inputs - in0, in1, shift
2085  Outputs - in0, in1, (in place)
2086  Return Type - as per RTYPE
2087  Details : Each element of vector 'in0' is arithmetically shifted right by
2088  the number of bits held in the corresponding element of vector
2089  'shift'. The last discarded bit is added to the shifted value
2090  for rounding and the result is written in place to 'in0'
2091  Here, 'shift' is a vector passed in
2092  Similar for other pairs
2093 */
2094 #define SRAR_H2(RTYPE, in0, in1, shift) \
2095 { \
2096  in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift); \
2097  in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift); \
2098 }
2099 #define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
2100 #define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)
2101 
2102 #define SRAR_H3(RTYPE, in0, in1, in2, shift) \
2103 { \
2104  SRAR_H2(RTYPE, in0, in1, shift); \
2105  in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift); \
2106 }
2107 #define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)
2108 
2109 #define SRAR_H4(RTYPE, in0, in1, in2, in3, shift) \
2110 { \
2111  SRAR_H2(RTYPE, in0, in1, shift); \
2112  SRAR_H2(RTYPE, in2, in3, shift); \
2113 }
2114 #define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
2115 #define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
2116 
2117 /* Description : Shift right arithmetic rounded words
2118  Arguments : Inputs - in0, in1, shift
2119  Outputs - in0, in1, (in place)
2120  Return Type - as per RTYPE
2121  Details : Each element of vector 'in0' is arithmetically shifted right by
2122  the number of bits held in the corresponding element of vector
2123  'shift'. The last discarded bit is added to the shifted value
2124  for rounding and the result is written in place to 'in0'
2125  Here, 'shift' is a vector passed in
2126  Similar for other pairs
2127 */
2128 #define SRAR_W2(RTYPE, in0, in1, shift) \
2129 { \
2130  in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift); \
2131  in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift); \
2132 }
2133 #define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)
2134 
2135 #define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
2136 { \
2137  SRAR_W2(RTYPE, in0, in1, shift); \
2138  SRAR_W2(RTYPE, in2, in3, shift); \
2139 }
2140 #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
2141 
2142 /* Description : Shift right arithmetic rounded (immediate)
2143  Arguments : Inputs - in0, in1, in2, in3, shift
2144  Outputs - in0, in1, in2, in3 (in place)
2145  Return Type - as per RTYPE
2146  Details : Each element of vector 'in0' is arithmetically shifted right by
2147  the immediate value in 'shift'.
2148  The last discarded bit is added to the shifted value for
2149  rounding and the result is written in place to 'in0'
2150  Similar for other pairs
2151 */
2152 #define SRARI_H2(RTYPE, in0, in1, shift) \
2153 { \
2154  in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift); \
2155  in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift); \
2156 }
2157 #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
2158 #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
2159 
2160 #define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
2161 { \
2162  SRARI_H2(RTYPE, in0, in1, shift); \
2163  SRARI_H2(RTYPE, in2, in3, shift); \
2164 }
2165 #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
2166 #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
2167 
2168 /* Description : Shift right arithmetic rounded (immediate)
2169  Arguments : Inputs - in0, in1, shift
2170  Outputs - in0, in1 (in place)
2171  Return Type - as per RTYPE
2172  Details : Each element of vector 'in0' is arithmetically shifted right by
2173  the immediate value in 'shift'.
2174  The last discarded bit is added to the shifted value for
2175  rounding and the result is written in place to 'in0'
2176  Similar for other pairs
2177 */
2178 #define SRARI_W2(RTYPE, in0, in1, shift) \
2179 { \
2180  in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift); \
2181  in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift); \
2182 }
2183 #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
2184 
2185 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
2186 { \
2187  SRARI_W2(RTYPE, in0, in1, shift); \
2188  SRARI_W2(RTYPE, in2, in3, shift); \
2189 }
2190 #define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
2191 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
2192 
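/* Usage sketch (illustrative; 'sum0'/'sum1' are hypothetical names): a
   rounding arithmetic right shift is the usual final scaling step of a
   fixed-point filter, here dividing by 64 with rounding:

       v8i16 sum0, sum1;                        // 6-bit fixed-point results

       SRARI_H2_SH(sum0, sum1, 6);              // (x + 32) >> 6 per element
*/
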
2193 /* Description : Multiplication of pairs of vectors
2194  Arguments : Inputs - in0, in1, in2, in3
2195  Outputs - out0, out1
2196  Details : Each element of 'in0' is multiplied by the corresponding
2197  element of 'in1' and the result is written to 'out0'
2198  Similar for other pairs
2199 */
2200 #define MUL2(in0, in1, in2, in3, out0, out1) \
2201 { \
2202  out0 = in0 * in1; \
2203  out1 = in2 * in3; \
2204 }
2205 #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
2206 { \
2207  MUL2(in0, in1, in2, in3, out0, out1); \
2208  MUL2(in4, in5, in6, in7, out2, out3); \
2209 }
2210 
2211 /* Description : Addition of 2 pairs of vectors
2212  Arguments : Inputs - in0, in1, in2, in3
2213  Outputs - out0, out1
2214  Details : The vectors of each input pair are added element-wise,
2215  producing 2 result vectors
2216 */
2217 #define ADD2(in0, in1, in2, in3, out0, out1) \
2218 { \
2219  out0 = in0 + in1; \
2220  out1 = in2 + in3; \
2221 }
2222 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
2223 { \
2224  ADD2(in0, in1, in2, in3, out0, out1); \
2225  ADD2(in4, in5, in6, in7, out2, out3); \
2226 }
2227 
2228 /* Description : Subtraction of 2 pairs of vectors
2229  Arguments : Inputs - in0, in1, in2, in3
2230  Outputs - out0, out1
2231  Details : The second vector of each input pair is subtracted element-wise
2232  from the first, producing 2 result vectors
2233 */
2234 #define SUB2(in0, in1, in2, in3, out0, out1) \
2235 { \
2236  out0 = in0 - in1; \
2237  out1 = in2 - in3; \
2238 }
2239 #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
2240 { \
2241  out0 = in0 - in1; \
2242  out1 = in2 - in3; \
2243  out2 = in4 - in5; \
2244  out3 = in6 - in7; \
2245 }
2246 
2247 /* Description : Sign extend halfword elements from right half of the vector
2248  Arguments : Inputs - in (input halfword vector)
2249  Outputs - out (sign extended word vector)
2250  Return Type - signed word
2251  Details : Sign bit of halfword elements from input vector 'in' is
2252  extracted and interleaved with the same vector 'in' to generate
2253  4 word elements, keeping the sign intact
2254 */
2255 #define UNPCK_R_SH_SW(in, out) \
2256 { \
2257  v8i16 sign_m; \
2258  \
2259  sign_m = __msa_clti_s_h((v8i16) in, 0); \
2260  out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in); \
2261 }
2262 
2263 /* Description : Sign extend byte elements from input vector and return
2264  halfword results in pair of vectors
2265  Arguments : Inputs - in (1 input byte vector)
2266  Outputs - out0, out1 (sign extended 2 halfword vectors)
2267  Return Type - signed halfword
2268  Details : Sign bit of byte elements from input vector 'in' is
2269  extracted and interleaved right with the same vector 'in' to
2270  generate 8 signed halfword elements in 'out0'
2271  Then interleaved left with the same vector 'in' to
2272  generate 8 signed halfword elements in 'out1'
2273 */
2274 #define UNPCK_SB_SH(in, out0, out1) \
2275 { \
2276  v16i8 tmp_m; \
2277  \
2278  tmp_m = __msa_clti_s_b((v16i8) in, 0); \
2279  ILVRL_B2_SH(tmp_m, in, out0, out1); \
2280 }
2281 
2282 /* Description : Zero extend unsigned byte elements to halfword elements
2283  Arguments : Inputs - in (1 input unsigned byte vector)
2284  Outputs - out0, out1 (unsigned 2 halfword vectors)
2285  Return Type - signed halfword
2286  Details : Zero extended right half of vector is returned in 'out0'
2287  Zero extended left half of vector is returned in 'out1'
2288 */
2289 #define UNPCK_UB_SH(in, out0, out1) \
2290 { \
2291  v16i8 zero_m = { 0 }; \
2292  \
2293  ILVRL_B2_SH(zero_m, in, out0, out1); \
2294 }
2295 
2296 /* Description : Sign extend halfword elements from input vector and return
2297  result in pair of vectors
2298  Arguments : Inputs - in (1 input halfword vector)
2299  Outputs - out0, out1 (sign extended 2 word vectors)
2300  Return Type - signed word
2301  Details : Sign bit of halfword elements from input vector 'in' is
2302  extracted and interleaved right with the same vector 'in' to
2303  generate 4 signed word elements in 'out0'
2304  Then interleaved left with the same vector 'in' to
2305  generate 4 signed word elements in 'out1'
2306 */
2307 #define UNPCK_SH_SW(in, out0, out1) \
2308 { \
2309  v8i16 tmp_m; \
2310  \
2311  tmp_m = __msa_clti_s_h((v8i16) in, 0); \
2312  ILVRL_H2_SW(tmp_m, in, out0, out1); \
2313 }
2314 
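/* Usage sketch (illustrative; 'src', 'h*' and 'w*' are hypothetical names):
   the UNPCK_* macros chain naturally when a full widening pipeline is
   needed, e.g. signed bytes to halfwords, then halfwords to words:

       v16i8 src;                               // signed byte input
       v8i16 h0, h1;
       v4i32 w0, w1;

       UNPCK_SB_SH(src, h0, h1);                // 16 bytes -> 2x8 halfwords
       UNPCK_SH_SW(h0, w0, w1);                 // 8 halfwords -> 2x4 words
*/
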
2315 /* Description : Swap two variables
2316  Arguments : Inputs - in0, in1
2317  Outputs - in0, in1 (in-place)
2318  Details : Swaps the two inputs in place using xor; the arguments must be distinct variables (passing the same variable twice zeroes it)
2319 */
2320 #define SWAP(in0, in1) \
2321 { \
2322  in0 = in0 ^ in1; \
2323  in1 = in0 ^ in1; \
2324  in0 = in0 ^ in1; \
2325 }
2326 
2327 /* Description : Butterfly of 4 input vectors
2328  Arguments : Inputs - in0, in1, in2, in3
2329  Outputs - out0, out1, out2, out3
2330  Details : Butterfly operation
2331 */
2332 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
2333 { \
2334  out0 = in0 + in3; \
2335  out1 = in1 + in2; \
2336  \
2337  out2 = in1 - in2; \
2338  out3 = in0 - in3; \
2339 }
2340 
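/* Usage sketch (illustrative; 'in*'/'t*' are hypothetical names): the
   butterfly is the add/subtract stage at the heart of DCT-style transforms;
   the sums land in the low outputs and the mirrored differences in the high
   ones:

       v8i16 in0, in1, in2, in3;
       v8i16 t0, t1, t2, t3;

       BUTTERFLY_4(in0, in1, in2, in3, t0, t1, t2, t3);
       // t0 = in0 + in3, t1 = in1 + in2, t2 = in1 - in2, t3 = in0 - in3
*/
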
2341 /* Description : Butterfly of 8 input vectors
2342  Arguments : Inputs - in0 ... in7
2343  Outputs - out0 .. out7
2344  Details : Butterfly operation
2345 */
2346 #define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \
2347  out0, out1, out2, out3, out4, out5, out6, out7) \
2348 { \
2349  out0 = in0 + in7; \
2350  out1 = in1 + in6; \
2351  out2 = in2 + in5; \
2352  out3 = in3 + in4; \
2353  \
2354  out4 = in3 - in4; \
2355  out5 = in2 - in5; \
2356  out6 = in1 - in6; \
2357  out7 = in0 - in7; \
2358 }
2359 
2360 /* Description : Butterfly of 16 input vectors
2361  Arguments : Inputs - in0 ... in15
2362  Outputs - out0 .. out15
2363  Details : Butterfly operation
2364 */
2365 #define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \
2366  in8, in9, in10, in11, in12, in13, in14, in15, \
2367  out0, out1, out2, out3, out4, out5, out6, out7, \
2368  out8, out9, out10, out11, out12, out13, out14, out15) \
2369 { \
2370  out0 = in0 + in15; \
2371  out1 = in1 + in14; \
2372  out2 = in2 + in13; \
2373  out3 = in3 + in12; \
2374  out4 = in4 + in11; \
2375  out5 = in5 + in10; \
2376  out6 = in6 + in9; \
2377  out7 = in7 + in8; \
2378  \
2379  out8 = in7 - in8; \
2380  out9 = in6 - in9; \
2381  out10 = in5 - in10; \
2382  out11 = in4 - in11; \
2383  out12 = in3 - in12; \
2384  out13 = in2 - in13; \
2385  out14 = in1 - in14; \
2386  out15 = in0 - in15; \
2387 }
2388 
2389 /* Description : Transposes input 4x4 byte block
2390  Arguments : Inputs - in0, in1, in2, in3 (input 4x4 byte block)
2391  Outputs - out0, out1, out2, out3 (output 4x4 byte block)
2392  Return Type - unsigned byte
2393  Details :
2394 */
2395 #define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3) \
2396 { \
2397  v16i8 zero_m = { 0 }; \
2398  v16i8 s0_m, s1_m, s2_m, s3_m; \
2399  \
2400  ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m); \
2401  ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m); \
2402  \
2403  out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m); \
2404  out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4); \
2405  out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4); \
2406  out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4); \
2407 }
2408 
2409 /* Description : Transposes input 8x4 byte block into 4x8
2410  Arguments : Inputs - in0 ... in7 (input 8x4 byte block)
2411  Outputs - out0, out1, out2, out3 (output 4x8 byte block)
2412  Return Type - as per RTYPE
2413  Details :
2414 */
2415 #define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2416  out0, out1, out2, out3) \
2417 { \
2418  v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2419  \
2420  ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m); \
2421  tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
2422  ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m); \
2423  \
2424  tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
2425  ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m); \
2426  \
2427  ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2); \
2428  out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0); \
2429  out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
2430 }
2431 #define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
2432 #define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)
2433 
2434 /* Description : Transposes input 8x8 byte block
2435  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
2436  (input 8x8 byte block)
2437  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2438  (output 8x8 byte block)
2439  Return Type - as per RTYPE
2440  Details :
2441 */
2442 #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2443  out0, out1, out2, out3, out4, out5, out6, out7) \
2444 { \
2445  v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2446  v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2447  \
2448  ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
2449  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2450  ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
2451  ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
2452  ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
2453  ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
2454  SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8); \
2455  SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8); \
2456 }
2457 #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
2458 #define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
2459 
2460 /* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
2461  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
2462  in8, in9, in10, in11, in12, in13, in14, in15
2463  Outputs - out0, out1, out2, out3
2464  Return Type - unsigned byte
2465  Details :
2466 */
2467 #define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2468  in8, in9, in10, in11, in12, in13, in14, in15, \
2469  out0, out1, out2, out3) \
2470 { \
2471  v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2472  \
2473  ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m); \
2474  out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \
2475  \
2476  ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \
2477  out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \
2478  \
2479  ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m); \
2480  \
2481  tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
2482  ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \
2483  \
2484  tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
2485  ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \
2486  out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2487  out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2488  \
2489  tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1); \
2490  tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m); \
2491  out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2492  out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2493 }
2494 
2495 /* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
2496  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
2497  in8, in9, in10, in11, in12, in13, in14, in15
2498  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2499  Return Type - unsigned byte
2500  Details :
2501 */
2502 #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2503  in8, in9, in10, in11, in12, in13, in14, in15, \
2504  out0, out1, out2, out3, out4, out5, out6, out7) \
2505 { \
2506  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2507  v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2508  \
2509  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
2510  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
2511  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
2512  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
2513  \
2514  tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7); \
2515  tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7); \
2516  tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5); \
2517  tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5); \
2518  out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3); \
2519  tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3); \
2520  out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1); \
2521  tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1); \
2522  \
2523  ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
2524  out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2525  out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2526  \
2527  tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2528  tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5); \
2529  out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2530  out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2531  \
2532  ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
2533  out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2534  out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2535  \
2536  tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m); \
2538  tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m); \
2540  out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2541  out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2542 }
2543 
2544 /* Description : Transposes 4x4 block with half word elements in vectors
2545  Arguments : Inputs - in0, in1, in2, in3
2546  Outputs - out0, out1, out2, out3
2547  Return Type - signed halfword
2548  Details :
2549 */
2550 #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
2551 { \
2552  v8i16 s0_m, s1_m; \
2553  \
2554  ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
2555  ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
2556  out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0); \
2557  out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
2558 }
2559 
2560 /* Description : Transposes 8x8 block with half word elements in vectors
2561  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
2562  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2563  Return Type - as per RTYPE
2564  Details :
2565 */
2566 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2567  out0, out1, out2, out3, out4, out5, out6, out7) \
2568 { \
2569  v8i16 s0_m, s1_m; \
2570  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2571  v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2572  \
2573  ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
2574  ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
2575  ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
2576  ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
2577  ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
2578  ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
2579  ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
2580  ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
2581  PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \
2582  tmp3_m, tmp7_m, out0, out2, out4, out6); \
2583  out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m); \
2584  out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m); \
2585  out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m); \
2586  out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m); \
2587 }
2588 #define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
2589 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
2590 
2591 /* Description : Transposes 4x4 block with word elements in vectors
2592  Arguments : Inputs - in0, in1, in2, in3
2593  Outputs - out0, out1, out2, out3
2594  Return Type - signed word
2595  Details :
2596 */
2597 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
2598 { \
2599  v4i32 s0_m, s1_m, s2_m, s3_m; \
2600  \
2601  ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
2602  ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
2603  \
2604  out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m); \
2605  out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m); \
2606  out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \
2607  out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \
2608 }
2609 
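/* Usage sketch (illustrative; 'r*'/'c*' are hypothetical names): transposing
   a block lets the same horizontal filter code serve the vertical pass,
   e.g. for a 4x4 block of halfword coefficients held in the low halves of
   four vectors:

       v8i16 r0, r1, r2, r3;                    // rows of the 4x4 block
       v8i16 c0, c1, c2, c3;                    // its columns afterwards

       TRANSPOSE4x4_SH_SH(r0, r1, r2, r3, c0, c1, c2, c3);
*/
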
2610 /* Description : Average byte elements from pair of vectors and store 8x4 byte
2611  block in destination memory
2612  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2613  Details : Each byte element of the input vector pairs ('in0','in1'),
2614  ('in2','in3'), ('in4','in5') and ('in6','in7') is averaged
2615  ((a + b) / 2) and stored in 'tmp0_m', 'tmp1_m', 'tmp2_m' and
2616  'tmp3_m' respectively
2621  The half vector results from all 4 vectors are stored in
2622  destination memory as 8x4 byte block
2623 */
2624 #define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2625 { \
2626  uint64_t out0_m, out1_m, out2_m, out3_m; \
2627  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2628  \
2629  tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1); \
2630  tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3); \
2631  tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5); \
2632  tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7); \
2633  \
2634  out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0); \
2635  out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0); \
2636  out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0); \
2637  out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0); \
2638  SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2639 }
2640 
2641 /* Description : Average byte elements from pair of vectors and store 16x4 byte
2642  block in destination memory
2643  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2644  Details : Each byte element of the input vector pairs ('in0','in1'),
2645  ('in2','in3'), ('in4','in5') and ('in6','in7') is averaged
2646  ((a + b) / 2) and stored in 'tmp0_m', 'tmp1_m', 'tmp2_m' and
2647  'tmp3_m' respectively
2652  The results from all 4 vectors are stored in destination
2653  memory as 16x4 byte block
2654 */
2655 #define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2656 { \
2657  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2658  \
2659  tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1); \
2660  tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3); \
2661  tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5); \
2662  tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7); \
2663  \
2664  ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride); \
2665 }
2666 
2667 /* Description : Average rounded byte elements from pair of vectors and store
2668  8x4 byte block in destination memory
2669  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2670  Details : Each byte element of the input vector pairs ('in0','in1'),
2671  ('in2','in3'), ('in4','in5') and ('in6','in7') is averaged
2672  with rounding ((a + b + 1) / 2) and stored in 'tp0_m',
2673  'tp1_m', 'tp2_m' and 'tp3_m' respectively
2678  The half vector results from all 4 vectors are stored in
2679  destination memory as 8x4 byte block
2680 */
2681 #define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2682 { \
2683  uint64_t out0_m, out1_m, out2_m, out3_m; \
2684  v16u8 tp0_m, tp1_m, tp2_m, tp3_m; \
2685  \
2686  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2687  tp0_m, tp1_m, tp2_m, tp3_m); \
2688  \
2689  out0_m = __msa_copy_u_d((v2i64) tp0_m, 0); \
2690  out1_m = __msa_copy_u_d((v2i64) tp1_m, 0); \
2691  out2_m = __msa_copy_u_d((v2i64) tp2_m, 0); \
2692  out3_m = __msa_copy_u_d((v2i64) tp3_m, 0); \
2693  SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2694 }
2695 
2696 /* Description : Average rounded byte elements from pair of vectors and store
2697  16x4 byte block in destination memory
2698  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2699  Details : Each byte element of the input vector pairs ('in0','in1'),
2700  ('in2','in3'), ('in4','in5') and ('in6','in7') is averaged
2701  with rounding ((a + b + 1) / 2) and stored in 't0_m',
2702  't1_m', 't2_m' and 't3_m' respectively
2707  The vector results from all 4 vectors are stored in
2708  destination memory as 16x4 byte block
2709 */
2710 #define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2711 { \
2712  v16u8 t0_m, t1_m, t2_m, t3_m; \
2713  \
2714  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2715  t0_m, t1_m, t2_m, t3_m); \
2716  ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride); \
2717 }
2718 
2719 /* Description : Average rounded byte elements from pair of vectors,
2720  average rounded with destination and store 8x4 byte block
2721  in destination memory
2722  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2723  Details : Each byte element of the input vector pairs ('in0','in1'),
2724  ('in2','in3'), ('in4','in5') and ('in6','in7') is averaged
2725  with rounding ((a + b + 1) / 2) and stored in 'tmp0_m',
2726  'tmp1_m', 'tmp2_m' and 'tmp3_m' respectively
2731  The half vector results from all 4 vectors are stored in
2732  destination memory as 8x4 byte block
2733 */
2734 #define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2735  pdst, stride) \
2736 { \
2737  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2738  v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
2739  \
2740  LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m); \
2741  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2742  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2743  AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m, \
2744  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride); \
2745 }
2746 
2747 /* Description : Average rounded byte elements from pair of vectors,
2748  average rounded with destination and store 16x4 byte block
2749  in destination memory
2750  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2751  Details : Each byte element of the input vector pairs ('in0','in1'),
2752  ('in2','in3'), ('in4','in5') and ('in6','in7') is averaged
2753  with rounding ((a + b + 1) / 2) and stored in 'tmp0_m',
2754  'tmp1_m', 'tmp2_m' and 'tmp3_m' respectively
2759  The vector results from all 4 vectors are stored in
2760  destination memory as 16x4 byte block
2761 */
2762 #define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2763  pdst, stride) \
2764 { \
2765  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2766  v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
2767  \
2768  LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m); \
2769  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2770  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2771  AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m, \
2772  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride); \
2773 }
2774 
2775 /* Description : Add block 4x4
2776  Arguments : Inputs - in0, in1, in2, in3, pdst, stride
2777  Details : The least significant 4 bytes of each input vector are added to
2778  the destination bytes, clipped to the range 0-255 and stored.
2779 */
2780 #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
2781 { \
2782  uint32_t src0_m, src1_m, src2_m, src3_m; \
2783  uint32_t out0_m, out1_m, out2_m, out3_m; \
2784  v8i16 inp0_m, inp1_m, res0_m, res1_m; \
2785  v16i8 dst0_m = { 0 }; \
2786  v16i8 dst1_m = { 0 }; \
2787  v16i8 zero_m = { 0 }; \
2788  \
2789  ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m); \
2790  LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
2791  INSERT_W2_SB(src0_m, src1_m, dst0_m); \
2792  INSERT_W2_SB(src2_m, src3_m, dst1_m); \
2793  ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
2794  ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
2795  CLIP_SH2_0_255(res0_m, res1_m); \
2796  PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
2797  \
2798  out0_m = __msa_copy_u_w((v4i32) dst0_m, 0); \
2799  out1_m = __msa_copy_u_w((v4i32) dst0_m, 1); \
2800  out2_m = __msa_copy_u_w((v4i32) dst1_m, 0); \
2801  out3_m = __msa_copy_u_w((v4i32) dst1_m, 1); \
2802  SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2803 }
2804 
2805 /* Description : Dot product and addition of 3 signed halfword input vectors
2806  Arguments : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
2807  Outputs - out0_m
2808  Return Type - signed halfword
2809  Details : Dot product of 'in0' with 'coeff0'
2810  Dot product of 'in1' with 'coeff1'
2811  Dot product of 'in2' with 'coeff2'
2812  Addition of all the 3 vector results
2813 
2814  out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
2815 */
2816 #define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
2817 ( { \
2818  v8i16 tmp1_m; \
2819  v8i16 out0_m; \
2820  \
2821  out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \
2822  out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \
2823  tmp1_m = __msa_dotp_s_h((v16i8) in2, (v16i8) coeff2); \
2824  out0_m = __msa_adds_s_h(out0_m, tmp1_m); \
2825  \
2826  out0_m; \
2827 } )
2828 
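/* Usage sketch (illustrative; 'vec*', 'cf*' and 'sum' are hypothetical
   names): DPADD_SH3_SH evaluates a 6-tap filter in one expression when each
   'in*' vector holds shuffled source byte pairs and each 'coeff*' holds the
   matching pair of taps:

       v16i8 vec0, vec1, vec2;                  // shuffled source pairs
       v16i8 cf0, cf1, cf2;                     // filter tap pairs
       v8i16 sum;

       sum = DPADD_SH3_SH(vec0, vec1, vec2, cf0, cf1, cf2);
*/
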
2829 /* Description : Pack even elements of input vectors & xor with 128
2830  Arguments : Inputs - in0, in1
2831  Outputs - out_m
2832  Return Type - unsigned byte
2833  Details : Signed byte even elements from 'in0' and 'in1' are packed
2834  together in one vector and the resulting vector is xor'ed with
2835  128 to shift the range from signed to unsigned byte
2836 */
2837 #define PCKEV_XORI128_UB(in0, in1) \
2838 ( { \
2839  v16u8 out_m; \
2840  out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0); \
2841  out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128); \
2842  out_m; \
2843 } )
2844 
2845 /* Description : Packs the inputs to unsigned bytes, averages them with the
2846  destination vectors & stores the result as an 8x4 unsigned byte block
2847  Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride
2848 */
2849 #define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \
2850  dst0, dst1, pdst, stride) \
2851 { \
2852  v16u8 tmp0_m, tmp1_m; \
2853  uint8_t *pdst_m = (uint8_t *) (pdst); \
2854  \
2855  tmp0_m = PCKEV_XORI128_UB(in0, in1); \
2856  tmp1_m = PCKEV_XORI128_UB(in2, in3); \
2857  AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \
2858  ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
2859 }
2860 
2861 /* Description : Pack even byte elements, extract 0 & 2 index words from pair
2862  of results and store 4 words in destination memory as per
2863  stride
2864  Arguments : Inputs - in0, in1, in2, in3, pdst, stride
2865 */
2866 #define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
2867 { \
2868  uint32_t out0_m, out1_m, out2_m, out3_m; \
2869  v16i8 tmp0_m, tmp1_m; \
2870  \
2871  PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m); \
2872  \
2873  out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0); \
2874  out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2); \
2875  out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0); \
2876  out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2); \
2877  \
2878  SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2879 }
2880 
2881 /* Description : Pack even byte elements and store byte vector in destination
2882  memory
2883  Arguments : Inputs - in0, in1, pdst
2884 */
2885 #define PCKEV_ST_SB(in0, in1, pdst) \
2886 { \
2887  v16i8 tmp_m; \
2888  tmp_m = __msa_pckev_b((v16i8) in1, (v16i8) in0); \
2889  ST_SB(tmp_m, (pdst)); \
2890 }
2891 
2892 /* Description : Horizontal 2 tap filter kernel code
2893  Arguments : Inputs - in0, in1, mask, coeff, shift
2894 */
2895 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \
2896 ( { \
2897  v16i8 tmp0_m; \
2898  v8u16 tmp1_m; \
2899  \
2900  tmp0_m = __msa_vshf_b((v16i8) mask, (v16i8) in1, (v16i8) in0); \
2901  tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) coeff); \
2902  tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, shift); \
2903  tmp1_m = __msa_sat_u_h(tmp1_m, shift); \
2904  \
2905  tmp1_m; \
2906 } )
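
/* Usage sketch (illustrative; 'src*', 'mask', 'coeff' and 'res' are
   hypothetical names): the 2-tap kernel takes two source vectors, a vshf.b
   selection mask and a splatted coefficient pair, and returns rounded,
   saturated halfword results:

       v16u8 src0, src1;                        // source rows
       v16i8 mask;                              // byte selection mask
       v16u8 coeff;                             // 2 taps, splatted
       v8u16 res;

       res = HORIZ_2TAP_FILT_UH(src0, src1, mask, coeff, 7);
*/
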
2907 #endif /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */