FFmpeg
output_lasx.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2022 Loongson Technology Corporation Limited
3  * Contributed by Hao Chen(chenhao@loongson.cn)
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include "swscale_loongarch.h"
24 
25 void yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
26  const int16_t **src, uint8_t *dest, int dstW,
27  const uint8_t *dither, int offset)
28 {
29  int i;
30  int len = dstW - 15;
31  __m256i mask = {0x1C0C180814041000, 0x1C1814100C080400,
32  0x1C0C180814041000, 0x1C1814100C080400};
33  __m256i val1, val2, val3;
34  uint8_t dither0 = dither[offset & 7];
35  uint8_t dither1 = dither[(offset + 1) & 7];
36  uint8_t dither2 = dither[(offset + 2) & 7];
37  uint8_t dither3 = dither[(offset + 3) & 7];
38  uint8_t dither4 = dither[(offset + 4) & 7];
39  uint8_t dither5 = dither[(offset + 5) & 7];
40  uint8_t dither6 = dither[(offset + 6) & 7];
41  uint8_t dither7 = dither[(offset + 7) & 7];
42  int val_1[8] = {dither0, dither2, dither4, dither6,
43  dither0, dither2, dither4, dither6};
44  int val_2[8] = {dither1, dither3, dither5, dither7,
45  dither1, dither3, dither5, dither7};
46  int val_3[8] = {dither0, dither1, dither2, dither3,
47  dither4, dither5, dither6, dither7};
48 
49  DUP2_ARG2(__lasx_xvld, val_1, 0, val_2, 0, val1, val2);
50  val3 = __lasx_xvld(val_3, 0);
51 
52  for (i = 0; i < len; i += 16) {
53  int j;
54  __m256i src0, filter0, val;
55  __m256i val_ev, val_od;
56 
57  val_ev = __lasx_xvslli_w(val1, 12);
58  val_od = __lasx_xvslli_w(val2, 12);
59 
60  for (j = 0; j < filterSize; j++) {
61  src0 = __lasx_xvld(src[j]+ i, 0);
62  filter0 = __lasx_xvldrepl_h((filter + j), 0);
63  val_ev = __lasx_xvmaddwev_w_h(val_ev, src0, filter0);
64  val_od = __lasx_xvmaddwod_w_h(val_od, src0, filter0);
65  }
66  val_ev = __lasx_xvsrai_w(val_ev, 19);
67  val_od = __lasx_xvsrai_w(val_od, 19);
68  val_ev = __lasx_xvclip255_w(val_ev);
69  val_od = __lasx_xvclip255_w(val_od);
70  val = __lasx_xvshuf_b(val_od, val_ev, mask);
71  __lasx_xvstelm_d(val, (dest + i), 0, 0);
72  __lasx_xvstelm_d(val, (dest + i), 8, 2);
73  }
74  if (dstW - i >= 8){
75  int j;
76  __m256i src0, filter0, val_h;
77  __m256i val_l;
78 
79  val_l = __lasx_xvslli_w(val3, 12);
80 
81  for (j = 0; j < filterSize; j++) {
82  src0 = __lasx_xvld(src[j] + i, 0);
83  src0 = __lasx_vext2xv_w_h(src0);
84  filter0 = __lasx_xvldrepl_h((filter + j), 0);
85  filter0 = __lasx_vext2xv_w_h(filter0);
86  val_l = __lasx_xvmadd_w(val_l, src0, filter0);
87  }
88  val_l = __lasx_xvsrai_w(val_l, 19);
89  val_l = __lasx_xvclip255_w(val_l);
90  val_h = __lasx_xvpermi_d(val_l, 0x4E);
91  val_l = __lasx_xvshuf_b(val_h, val_l, mask);
92  __lasx_xvstelm_d(val_l, (dest + i), 0, 1);
93  i += 8;
94  }
95  for (; i < dstW; i++) {
96  int val = dither[(i + offset) & 7] << 12;
97  int j;
98  for (j = 0; j< filterSize; j++)
99  val += src[j][i] * filter[j];
100 
101  dest[i] = av_clip_uint8(val >> 19);
102  }
103 }
104 
105 /*Copy from libswscale/output.c*/
106 static av_always_inline void
107 yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
108  unsigned A1, unsigned A2,
109  const void *_r, const void *_g, const void *_b, int y,
110  enum AVPixelFormat target, int hasAlpha)
111 {
112  if (target == AV_PIX_FMT_ARGB || target == AV_PIX_FMT_RGBA ||
113  target == AV_PIX_FMT_ABGR || target == AV_PIX_FMT_BGRA) {
114  uint32_t *dest = (uint32_t *) _dest;
115  const uint32_t *r = (const uint32_t *) _r;
116  const uint32_t *g = (const uint32_t *) _g;
117  const uint32_t *b = (const uint32_t *) _b;
118 
119 #if CONFIG_SMALL
120  dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
121  dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
122 #else
123 #if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
124  int sh = (target == AV_PIX_FMT_RGB32_1 ||
125  target == AV_PIX_FMT_BGR32_1) ? 0 : 24;
126  av_assert2((((r[Y1] + g[Y1] + b[Y1]) >> sh) & 0xFF) == 0xFF);
127 #endif
128  dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
129  dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
130 #endif
131  } else if (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) {
132  uint8_t *dest = (uint8_t *) _dest;
133  const uint8_t *r = (const uint8_t *) _r;
134  const uint8_t *g = (const uint8_t *) _g;
135  const uint8_t *b = (const uint8_t *) _b;
136 
137 #define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
138 #define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)
139 
140  dest[i * 6 + 0] = r_b[Y1];
141  dest[i * 6 + 1] = g[Y1];
142  dest[i * 6 + 2] = b_r[Y1];
143  dest[i * 6 + 3] = r_b[Y2];
144  dest[i * 6 + 4] = g[Y2];
145  dest[i * 6 + 5] = b_r[Y2];
146 #undef r_b
147 #undef b_r
148  } else if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565 ||
149  target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555 ||
150  target == AV_PIX_FMT_RGB444 || target == AV_PIX_FMT_BGR444) {
151  uint16_t *dest = (uint16_t *) _dest;
152  const uint16_t *r = (const uint16_t *) _r;
153  const uint16_t *g = (const uint16_t *) _g;
154  const uint16_t *b = (const uint16_t *) _b;
155  int dr1, dg1, db1, dr2, dg2, db2;
156 
157  if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565) {
158  dr1 = ff_dither_2x2_8[ y & 1 ][0];
159  dg1 = ff_dither_2x2_4[ y & 1 ][0];
160  db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
161  dr2 = ff_dither_2x2_8[ y & 1 ][1];
162  dg2 = ff_dither_2x2_4[ y & 1 ][1];
163  db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
164  } else if (target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555) {
165  dr1 = ff_dither_2x2_8[ y & 1 ][0];
166  dg1 = ff_dither_2x2_8[ y & 1 ][1];
167  db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
168  dr2 = ff_dither_2x2_8[ y & 1 ][1];
169  dg2 = ff_dither_2x2_8[ y & 1 ][0];
170  db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
171  } else {
172  dr1 = ff_dither_4x4_16[ y & 3 ][0];
173  dg1 = ff_dither_4x4_16[ y & 3 ][1];
174  db1 = ff_dither_4x4_16[(y & 3) ^ 3][0];
175  dr2 = ff_dither_4x4_16[ y & 3 ][1];
176  dg2 = ff_dither_4x4_16[ y & 3 ][0];
177  db2 = ff_dither_4x4_16[(y & 3) ^ 3][1];
178  }
179 
180  dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
181  dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
182  } else /* 8/4 bits */ {
183  uint8_t *dest = (uint8_t *) _dest;
184  const uint8_t *r = (const uint8_t *) _r;
185  const uint8_t *g = (const uint8_t *) _g;
186  const uint8_t *b = (const uint8_t *) _b;
187  int dr1, dg1, db1, dr2, dg2, db2;
188 
189  if (target == AV_PIX_FMT_RGB8 || target == AV_PIX_FMT_BGR8) {
190  const uint8_t * const d64 = ff_dither_8x8_73[y & 7];
191  const uint8_t * const d32 = ff_dither_8x8_32[y & 7];
192  dr1 = dg1 = d32[(i * 2 + 0) & 7];
193  db1 = d64[(i * 2 + 0) & 7];
194  dr2 = dg2 = d32[(i * 2 + 1) & 7];
195  db2 = d64[(i * 2 + 1) & 7];
196  } else {
197  const uint8_t * const d64 = ff_dither_8x8_73 [y & 7];
198  const uint8_t * const d128 = ff_dither_8x8_220[y & 7];
199  dr1 = db1 = d128[(i * 2 + 0) & 7];
200  dg1 = d64[(i * 2 + 0) & 7];
201  dr2 = db2 = d128[(i * 2 + 1) & 7];
202  dg2 = d64[(i * 2 + 1) & 7];
203  }
204 
205  if (target == AV_PIX_FMT_RGB4 || target == AV_PIX_FMT_BGR4) {
206  dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
207  ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
208  } else {
209  dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
210  dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
211  }
212  }
213 }
214 
/*
 * Emit one pixel pair from vector registers.
 *
 * Picks 32-bit element t1 of vec_y1 and t2 of vec_y2 as the two luma values,
 * element t3 of vec_u and t4 of vec_v as the chroma table indices, looks up
 * the r/g/b table pointers in c and calls yuv2rgb_write() for pair `count`,
 * then increments count.
 *
 * The expansion site must have c, dest, count, y, target, Y1, Y2, U, V, r, g
 * and b in scope. The U/V elements are used as table indices as-is, so any
 * headroom offset has to be added by the caller beforehand.
 *
 * Wrapped in do { } while (0) so that "WRITE_YUV2RGB(...);" is a single
 * statement and stays valid inside an unbraced if/else.
 */
#define WRITE_YUV2RGB(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4) \
    do {                                                            \
        Y1 = __lasx_xvpickve2gr_w(vec_y1, t1);                      \
        Y2 = __lasx_xvpickve2gr_w(vec_y2, t2);                      \
        U  = __lasx_xvpickve2gr_w(vec_u, t3);                       \
        V  = __lasx_xvpickve2gr_w(vec_v, t4);                       \
        r  = c->table_rV[V];                                        \
        g  = (c->table_gU[U] + c->table_gV[V]);                     \
        b  = c->table_bU[U];                                        \
        yuv2rgb_write(dest, count, Y1, Y2, 0, 0,                    \
                      r, g, b, y, target, 0);                       \
        count++;                                                    \
    } while (0)
228 
/**
 * LASX implementation of the multi-tap vertical filter that writes packed RGB.
 *
 * Luma and chroma are each accumulated as
 *     (1 << 18) + sum_j src[j][x] * filter[j]
 * and shifted right by 19. U and V are then offset by YUVRGB_TABLE_HEADROOM
 * and used to index c->table_rV / table_gU / table_gV / table_bU; the table
 * pointers and the two luma values of a pixel pair go to yuv2rgb_write()
 * (through WRITE_YUV2RGB in the vector paths).
 *
 * The row is consumed in blocks of 64, 32, 16 and 8 output pixels, followed by
 * a scalar loop over the remaining pixel pairs up to (dstW + 1) >> 1.
 *
 * @param c             context providing the table_rV/gU/gV/bU lookup tables
 * @param lumFilter     lumFilterSize luma filter coefficients
 * @param lumSrc        lumFilterSize pointers to luma source rows
 * @param lumFilterSize number of luma taps
 * @param chrFilter     chrFilterSize chroma filter coefficients
 * @param chrUSrc       chrFilterSize pointers to U source rows
 * @param chrVSrc       chrFilterSize pointers to V source rows
 * @param chrFilterSize number of chroma taps
 * @param alpSrc        not used by this function
 * @param dest          destination row
 * @param dstW          output width in pixels
 * @param y             output line number, forwarded to yuv2rgb_write()
 * @param target        destination pixel format, forwarded to yuv2rgb_write()
 * @param hasAlpha      not used; yuv2rgb_write() is always called with 0
 */
229 static void
230 yuv2rgb_X_template_lasx(SwsContext *c, const int16_t *lumFilter,
231  const int16_t **lumSrc, int lumFilterSize,
232  const int16_t *chrFilter, const int16_t **chrUSrc,
233  const int16_t **chrVSrc, int chrFilterSize,
234  const int16_t **alpSrc, uint8_t *dest, int dstW,
235  int y, enum AVPixelFormat target, int hasAlpha)
236 {
237  int i, j;
 /* count is the index of the next pixel pair; WRITE_YUV2RGB increments it */
238  int count = 0;
 /* rounding term that seeds every accumulator before the >> 19 */
239  int t = 1 << 18;
 /* number of full 64-pixel blocks, and the pixels left after them */
240  int len = dstW >> 6;
241  int res = dstW & 63;
 /* total number of pixel pairs to write */
242  int len_count = (dstW + 1) >> 1;
243  const void *r, *g, *b;
244  int head = YUVRGB_TABLE_HEADROOM;
245  __m256i headroom = __lasx_xvreplgr2vr_w(head);
246 
 /* 64 pixels (32 WRITE_YUV2RGB calls) per iteration */
247  for (i = 0; i < len; i++) {
 /* count_lum: index of the first luma sample of this block (two per pair) */
248  int Y1, Y2, U, V, count_lum = count << 1;
249  __m256i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, v_src2;
250  __m256i yl1_ev, yl1_od, yh1_ev, yh1_od, yl2_ev, yl2_od, yh2_ev, yh2_od;
251  __m256i u1_ev, u1_od, v1_ev, v1_od, u2_ev, u2_od, v2_ev, v2_od, temp;
252 
 /* seed all accumulators with the rounding term */
253  yl1_ev = __lasx_xvldrepl_w(&t, 0);
254  yl1_od = yl1_ev;
255  yh1_ev = yl1_ev;
256  yh1_od = yl1_ev;
257  u1_ev = yl1_ev;
258  v1_ev = yl1_ev;
259  u1_od = yl1_ev;
260  v1_od = yl1_ev;
261  yl2_ev = yl1_ev;
262  yl2_od = yl1_ev;
263  yh2_ev = yl1_ev;
264  yh2_od = yl1_ev;
265  u2_ev = yl1_ev;
266  v2_ev = yl1_ev;
267  u2_od = yl1_ev;
268  v2_od = yl1_ev;
 /* luma: four vector loads per tap, even/odd samples accumulated separately */
269  for (j = 0; j < lumFilterSize; j++) {
270  const int16_t *src_lum = lumSrc[j] + count_lum;
271  temp = __lasx_xvldrepl_h((lumFilter + j), 0);
272  DUP4_ARG2(__lasx_xvld, src_lum, 0, src_lum, 32, src_lum, 64,
273  src_lum, 96, l_src1, l_src2, l_src3, l_src4);
274 
275  yl1_ev = __lasx_xvmaddwev_w_h(yl1_ev, temp, l_src1);
276  yl1_od = __lasx_xvmaddwod_w_h(yl1_od, temp, l_src1);
277  yh1_ev = __lasx_xvmaddwev_w_h(yh1_ev, temp, l_src2);
278  yh1_od = __lasx_xvmaddwod_w_h(yh1_od, temp, l_src2);
279  yl2_ev = __lasx_xvmaddwev_w_h(yl2_ev, temp, l_src3);
280  yl2_od = __lasx_xvmaddwod_w_h(yl2_od, temp, l_src3);
281  yh2_ev = __lasx_xvmaddwev_w_h(yh2_ev, temp, l_src4);
282  yh2_od = __lasx_xvmaddwod_w_h(yh2_od, temp, l_src4);
283  }
 /* chroma: two vector loads per plane and tap */
284  for (j = 0; j < chrFilterSize; j++) {
285  DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrUSrc[j] + count, 32,
286  u_src1, u_src2);
287  DUP2_ARG2(__lasx_xvld, chrVSrc[j] + count, 0, chrVSrc[j] + count, 32,
288  v_src1, v_src2);
289  temp = __lasx_xvldrepl_h((chrFilter + j), 0);
290  u1_ev = __lasx_xvmaddwev_w_h(u1_ev, temp, u_src1);
291  u1_od = __lasx_xvmaddwod_w_h(u1_od, temp, u_src1);
292  v1_ev = __lasx_xvmaddwev_w_h(v1_ev, temp, v_src1);
293  v1_od = __lasx_xvmaddwod_w_h(v1_od, temp, v_src1);
294  u2_ev = __lasx_xvmaddwev_w_h(u2_ev, temp, u_src2);
295  u2_od = __lasx_xvmaddwod_w_h(u2_od, temp, u_src2);
296  v2_ev = __lasx_xvmaddwev_w_h(v2_ev, temp, v_src2);
297  v2_od = __lasx_xvmaddwod_w_h(v2_od, temp, v_src2);
298  }
 /* scale all accumulators down by 19 bits */
299  yl1_ev = __lasx_xvsrai_w(yl1_ev, 19);
300  yh1_ev = __lasx_xvsrai_w(yh1_ev, 19);
301  yl1_od = __lasx_xvsrai_w(yl1_od, 19);
302  yh1_od = __lasx_xvsrai_w(yh1_od, 19);
303  u1_ev = __lasx_xvsrai_w(u1_ev, 19);
304  v1_ev = __lasx_xvsrai_w(v1_ev, 19);
305  u1_od = __lasx_xvsrai_w(u1_od, 19);
306  v1_od = __lasx_xvsrai_w(v1_od, 19);
307  yl2_ev = __lasx_xvsrai_w(yl2_ev, 19);
308  yh2_ev = __lasx_xvsrai_w(yh2_ev, 19);
309  yl2_od = __lasx_xvsrai_w(yl2_od, 19);
310  yh2_od = __lasx_xvsrai_w(yh2_od, 19);
311  u2_ev = __lasx_xvsrai_w(u2_ev, 19);
312  v2_ev = __lasx_xvsrai_w(v2_ev, 19);
313  u2_od = __lasx_xvsrai_w(u2_od, 19);
314  v2_od = __lasx_xvsrai_w(v2_od, 19);
 /* turn U/V into table indices */
315  u1_ev = __lasx_xvadd_w(u1_ev, headroom);
316  v1_ev = __lasx_xvadd_w(v1_ev, headroom);
317  u1_od = __lasx_xvadd_w(u1_od, headroom);
318  v1_od = __lasx_xvadd_w(v1_od, headroom);
319  u2_ev = __lasx_xvadd_w(u2_ev, headroom);
320  v2_ev = __lasx_xvadd_w(v2_ev, headroom);
321  u2_od = __lasx_xvadd_w(u2_od, headroom);
322  v2_od = __lasx_xvadd_w(v2_od, headroom);
 /*
  * Each call writes one pixel pair. The element indices follow the even/odd
  * split made by the multiply-accumulates above: chroma alternates between
  * the _ev and _od vectors. Do not reorder these calls.
  */
323  WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 0, 0, 0, 0);
324  WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 1, 1, 0, 0);
325  WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 2, 2, 1, 1);
326  WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 3, 3, 1, 1);
327  WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 4, 4, 2, 2);
328  WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 5, 5, 2, 2);
329  WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 6, 6, 3, 3);
330  WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 7, 7, 3, 3);
331  WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 0, 0, 4, 4);
332  WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 1, 1, 4, 4);
333  WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 2, 2, 5, 5);
334  WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 3, 3, 5, 5);
335  WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 4, 4, 6, 6);
336  WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 5, 5, 6, 6);
337  WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 6, 6, 7, 7);
338  WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 7, 7, 7, 7);
339  WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 0, 0, 0, 0);
340  WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 1, 1, 0, 0);
341  WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 2, 2, 1, 1);
342  WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 3, 3, 1, 1);
343  WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 4, 4, 2, 2);
344  WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 5, 5, 2, 2);
345  WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 6, 6, 3, 3);
346  WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 7, 7, 3, 3);
347  WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 0, 0, 4, 4);
348  WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 1, 1, 4, 4);
349  WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 2, 2, 5, 5);
350  WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 3, 3, 5, 5);
351  WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 4, 4, 6, 6);
352  WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 5, 5, 6, 6);
353  WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 6, 6, 7, 7);
354  WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 7, 7, 7, 7);
355  }
 /* remainder: one block of 32 pixels (16 pairs) */
356  if (res >= 32) {
357  int Y1, Y2, U, V, count_lum = count << 1;
358  __m256i l_src1, l_src2, u_src, v_src;
359  __m256i yl_ev, yl_od, yh_ev, yh_od;
360  __m256i u_ev, u_od, v_ev, v_od, temp;
361 
362  yl_ev = __lasx_xvldrepl_w(&t, 0);
363  yl_od = yl_ev;
364  yh_ev = yl_ev;
365  yh_od = yl_ev;
366  u_ev = yl_ev;
367  v_ev = yl_ev;
368  u_od = yl_ev;
369  v_od = yl_ev;
370  for (j = 0; j < lumFilterSize; j++) {
371  temp = __lasx_xvldrepl_h((lumFilter + j), 0);
372  DUP2_ARG2(__lasx_xvld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
373  32, l_src1, l_src2);
374  yl_ev = __lasx_xvmaddwev_w_h(yl_ev, temp, l_src1);
375  yl_od = __lasx_xvmaddwod_w_h(yl_od, temp, l_src1);
376  yh_ev = __lasx_xvmaddwev_w_h(yh_ev, temp, l_src2);
377  yh_od = __lasx_xvmaddwod_w_h(yh_od, temp, l_src2);
378  }
379  for (j = 0; j < chrFilterSize; j++) {
380  DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
381  u_src, v_src);
382  temp = __lasx_xvldrepl_h((chrFilter + j), 0);
383  u_ev = __lasx_xvmaddwev_w_h(u_ev, temp, u_src);
384  u_od = __lasx_xvmaddwod_w_h(u_od, temp, u_src);
385  v_ev = __lasx_xvmaddwev_w_h(v_ev, temp, v_src);
386  v_od = __lasx_xvmaddwod_w_h(v_od, temp, v_src);
387  }
388  yl_ev = __lasx_xvsrai_w(yl_ev, 19);
389  yh_ev = __lasx_xvsrai_w(yh_ev, 19);
390  yl_od = __lasx_xvsrai_w(yl_od, 19);
391  yh_od = __lasx_xvsrai_w(yh_od, 19);
392  u_ev = __lasx_xvsrai_w(u_ev, 19);
393  v_ev = __lasx_xvsrai_w(v_ev, 19);
394  u_od = __lasx_xvsrai_w(u_od, 19);
395  v_od = __lasx_xvsrai_w(v_od, 19);
396  u_ev = __lasx_xvadd_w(u_ev, headroom);
397  v_ev = __lasx_xvadd_w(v_ev, headroom);
398  u_od = __lasx_xvadd_w(u_od, headroom);
399  v_od = __lasx_xvadd_w(v_od, headroom);
400  WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
401  WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 1, 1, 0, 0);
402  WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 2, 2, 1, 1);
403  WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 3, 3, 1, 1);
404  WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 4, 4, 2, 2);
405  WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 5, 5, 2, 2);
406  WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 6, 6, 3, 3);
407  WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 7, 7, 3, 3);
408  WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 0, 0, 4, 4);
409  WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 1, 1, 4, 4);
410  WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 2, 2, 5, 5);
411  WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 3, 3, 5, 5);
412  WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 4, 4, 6, 6);
413  WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 5, 5, 6, 6);
414  WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 6, 6, 7, 7);
415  WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 7, 7, 7, 7);
416  res -= 32;
417  }
 /* remainder: one block of 16 pixels (8 pairs); chroma widened to 32 bit first */
418  if (res >= 16) {
419  int Y1, Y2, U, V;
420  int count_lum = count << 1;
421  __m256i l_src, u_src, v_src;
422  __m256i y_ev, y_od, u, v, temp;
423 
424  y_ev = __lasx_xvldrepl_w(&t, 0);
425  y_od = y_ev;
426  u = y_ev;
427  v = y_ev;
428  for (j = 0; j < lumFilterSize; j++) {
429  temp = __lasx_xvldrepl_h((lumFilter + j), 0);
430  l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
431  y_ev = __lasx_xvmaddwev_w_h(y_ev, temp, l_src);
432  y_od = __lasx_xvmaddwod_w_h(y_od, temp, l_src);
433  }
434  for (j = 0; j < chrFilterSize; j++) {
435  DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count,
436  0, u_src, v_src);
437  temp = __lasx_xvldrepl_h((chrFilter + j), 0);
438  u_src = __lasx_vext2xv_w_h(u_src);
439  v_src = __lasx_vext2xv_w_h(v_src);
440  u = __lasx_xvmaddwev_w_h(u, temp, u_src);
441  v = __lasx_xvmaddwev_w_h(v, temp, v_src);
442  }
443  y_ev = __lasx_xvsrai_w(y_ev, 19);
444  y_od = __lasx_xvsrai_w(y_od, 19);
445  u = __lasx_xvsrai_w(u, 19);
446  v = __lasx_xvsrai_w(v, 19);
447  u = __lasx_xvadd_w(u, headroom);
448  v = __lasx_xvadd_w(v, headroom);
449  WRITE_YUV2RGB(y_ev, y_od, u, v, 0, 0, 0, 0);
450  WRITE_YUV2RGB(y_ev, y_od, u, v, 1, 1, 1, 1);
451  WRITE_YUV2RGB(y_ev, y_od, u, v, 2, 2, 2, 2);
452  WRITE_YUV2RGB(y_ev, y_od, u, v, 3, 3, 3, 3);
453  WRITE_YUV2RGB(y_ev, y_od, u, v, 4, 4, 4, 4);
454  WRITE_YUV2RGB(y_ev, y_od, u, v, 5, 5, 5, 5);
455  WRITE_YUV2RGB(y_ev, y_od, u, v, 6, 6, 6, 6);
456  WRITE_YUV2RGB(y_ev, y_od, u, v, 7, 7, 7, 7);
457  res -= 16;
458  }
 /* remainder: one block of 8 pixels (4 pairs); U and V share one vector */
459  if (res >= 8) {
460  int Y1, Y2, U, V;
461  int count_lum = count << 1;
462  __m256i l_src, u_src, v_src;
463  __m256i y_ev, uv, temp;
464 
465  y_ev = __lasx_xvldrepl_w(&t, 0);
466  uv = y_ev;
467  for (j = 0; j < lumFilterSize; j++) {
468  temp = __lasx_xvldrepl_h((lumFilter + j), 0);
469  l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
470  l_src = __lasx_vext2xv_w_h(l_src);
471  y_ev = __lasx_xvmaddwev_w_h(y_ev, temp, l_src);
472  }
473  for (j = 0; j < chrFilterSize; j++) {
474  u_src = __lasx_xvldrepl_d((chrUSrc[j] + count), 0);
475  v_src = __lasx_xvldrepl_d((chrVSrc[j] + count), 0);
476  temp = __lasx_xvldrepl_h((chrFilter + j), 0);
477  u_src = __lasx_xvilvl_d(v_src, u_src);
478  u_src = __lasx_vext2xv_w_h(u_src);
479  uv = __lasx_xvmaddwev_w_h(uv, temp, u_src);
480  }
481  y_ev = __lasx_xvsrai_w(y_ev, 19);
482  uv = __lasx_xvsrai_w(uv, 19);
483  uv = __lasx_xvadd_w(uv, headroom);
 /* U is read from elements 0..3 of uv, V from elements 4..7 */
484  WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 0, 1, 0, 4);
485  WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 2, 3, 1, 5);
486  WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 4, 5, 2, 6);
487  WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 6, 7, 3, 7);
488  }
 /* scalar tail: one pixel pair per iteration, same arithmetic as above */
489  for (; count < len_count; count++) {
490  int Y1 = 1 << 18;
491  int Y2 = Y1;
492  int U = Y1;
493  int V = Y1;
494 
495  for (j = 0; j < lumFilterSize; j++) {
496  Y1 += lumSrc[j][count * 2] * lumFilter[j];
497  Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j];
498  }
499  for (j = 0; j < chrFilterSize; j++) {
500  U += chrUSrc[j][count] * chrFilter[j];
501  V += chrVSrc[j][count] * chrFilter[j];
502  }
503  Y1 >>= 19;
504  Y2 >>= 19;
505  U >>= 19;
506  V >>= 19;
507  r = c->table_rV[V + YUVRGB_TABLE_HEADROOM];
508  g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
509  c->table_gV[V + YUVRGB_TABLE_HEADROOM]);
510  b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
511 
512  yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
513  r, g, b, y, target, 0);
514  }
515 }
516 
/**
 * LASX implementation of the two-row blend that writes packed RGB.
 *
 * Every luma sample is computed as
 *     (buf[0][x] * (4096 - yalpha) + buf[1][x] * yalpha) >> 19
 * and every chroma sample the same way from ubuf/vbuf with uvalpha. U and V
 * are then offset by YUVRGB_TABLE_HEADROOM and used to look up the r/g/b
 * table pointers in c, which are passed to yuv2rgb_write() together with the
 * two luma values of the pixel pair.
 *
 * The row is processed 16 pixels at a time, then one optional block of 8
 * pixels, then pair by pair up to (dstW + 1) >> 1 pairs.
 *
 * @param c        context providing the table_rV/gU/gV/bU lookup tables
 * @param buf      the two luma rows to blend
 * @param ubuf     the two U rows to blend
 * @param vbuf     the two V rows to blend
 * @param abuf     not used by this function
 * @param dest     destination row
 * @param dstW     output width in pixels
 * @param yalpha   weight of buf[1]; buf[0] is weighted with 4096 - yalpha
 * @param uvalpha  weight of ubuf[1]/vbuf[1]; row 0 gets 4096 - uvalpha
 * @param y        output line number, forwarded to yuv2rgb_write()
 * @param target   destination pixel format, forwarded to yuv2rgb_write()
 * @param hasAlpha not used; yuv2rgb_write() is always called with 0
 */
517 static void
518 yuv2rgb_2_template_lasx(SwsContext *c, const int16_t *buf[2],
519  const int16_t *ubuf[2], const int16_t *vbuf[2],
520  const int16_t *abuf[2], uint8_t *dest, int dstW,
521  int yalpha, int uvalpha, int y,
522  enum AVPixelFormat target, int hasAlpha)
523 {
524  const int16_t *buf0 = buf[0], *buf1 = buf[1],
525  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
526  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
 /* complementary weights for row 0 */
527  int yalpha1 = 4096 - yalpha;
528  int uvalpha1 = 4096 - uvalpha;
 /* i counts pixels, count counts pixel pairs (advanced by WRITE_YUV2RGB) */
529  int i, count = 0;
530  int len = dstW - 15;
531  int len_count = (dstW + 1) >> 1;
532  const void *r, *g, *b;
533  int head = YUVRGB_TABLE_HEADROOM;
534  __m256i v_yalpha1 = __lasx_xvreplgr2vr_w(yalpha1);
535  __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
536  __m256i v_yalpha = __lasx_xvreplgr2vr_w(yalpha);
537  __m256i v_uvalpha = __lasx_xvreplgr2vr_w(uvalpha);
538  __m256i headroom = __lasx_xvreplgr2vr_w(head);
539 
 /* 16 pixels (8 pairs) per iteration */
540  for (i = 0; i < len; i += 16) {
541  int Y1, Y2, U, V;
 /* element indices doubled, passed as the offset argument of __lasx_xvldx */
542  int i_dex = i << 1;
543  int c_dex = count << 1;
544  __m256i y0_h, y0_l, y0, u0, v0;
545  __m256i y1_h, y1_l, y1, u1, v1;
546  __m256i y_l, y_h, u, v;
547 
548  DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
549  buf1, i_dex, y0, u0, v0, y1);
550  DUP2_ARG2(__lasx_xvldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
 /* widen 16-bit samples to 32 bit before the weighted sum */
551  DUP2_ARG2(__lasx_xvsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
552  DUP2_ARG1(__lasx_xvexth_w_h, y0, y1, y0_h, y1_h);
553  DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
 /* row0 * (4096 - alpha) + row1 * alpha */
554  y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
555  y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
556  u0 = __lasx_xvmul_w(u0, v_uvalpha1);
557  v0 = __lasx_xvmul_w(v0, v_uvalpha1);
558  y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
559  y_h = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
560  u = __lasx_xvmadd_w(u0, v_uvalpha, u1);
561  v = __lasx_xvmadd_w(v0, v_uvalpha, v1);
562  y_l = __lasx_xvsrai_w(y_l, 19);
563  y_h = __lasx_xvsrai_w(y_h, 19);
564  u = __lasx_xvsrai_w(u, 19);
565  v = __lasx_xvsrai_w(v, 19);
566  u = __lasx_xvadd_w(u, headroom);
567  v = __lasx_xvadd_w(v, headroom);
 /*
  * The alternation between y_l and y_h follows the layout produced by the
  * two luma widening steps above. Do not reorder these calls.
  */
568  WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0);
569  WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
570  WRITE_YUV2RGB(y_h, y_h, u, v, 0, 1, 2, 2);
571  WRITE_YUV2RGB(y_h, y_h, u, v, 2, 3, 3, 3);
572  WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 4, 4);
573  WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 5, 5);
574  WRITE_YUV2RGB(y_h, y_h, u, v, 4, 5, 6, 6);
575  WRITE_YUV2RGB(y_h, y_h, u, v, 6, 7, 7, 7);
576  }
 /* one block of 8 pixels (4 pairs) if enough pixels remain */
577  if (dstW - i >= 8) {
578  int Y1, Y2, U, V;
579  int i_dex = i << 1;
580  __m256i y0_l, y0, u0, v0;
581  __m256i y1_l, y1, u1, v1;
582  __m256i y_l, u, v;
583 
584  y0 = __lasx_xvldx(buf0, i_dex);
585  u0 = __lasx_xvldrepl_d((ubuf0 + count), 0);
586  v0 = __lasx_xvldrepl_d((vbuf0 + count), 0);
587  y1 = __lasx_xvldx(buf1, i_dex);
588  u1 = __lasx_xvldrepl_d((ubuf1 + count), 0);
589  v1 = __lasx_xvldrepl_d((vbuf1 + count), 0);
590  DUP2_ARG1(__lasx_vext2xv_w_h, y0, y1, y0_l, y1_l);
591  DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
592  y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
593  u0 = __lasx_xvmul_w(u0, v_uvalpha1);
594  v0 = __lasx_xvmul_w(v0, v_uvalpha1);
595  y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
596  u = __lasx_xvmadd_w(u0, v_uvalpha, u1);
597  v = __lasx_xvmadd_w(v0, v_uvalpha, v1);
598  y_l = __lasx_xvsrai_w(y_l, 19);
599  u = __lasx_xvsrai_w(u, 19);
600  v = __lasx_xvsrai_w(v, 19);
601  u = __lasx_xvadd_w(u, headroom);
602  v = __lasx_xvadd_w(v, headroom);
603  WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0);
604  WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
605  WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 2, 2);
606  WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 3, 3);
607  i += 8;
608  }
 /* scalar tail: one pixel pair per iteration */
609  for (; count < len_count; count++) {
610  int Y1 = (buf0[count * 2] * yalpha1 +
611  buf1[count * 2] * yalpha) >> 19;
612  int Y2 = (buf0[count * 2 + 1] * yalpha1 +
613  buf1[count * 2 + 1] * yalpha) >> 19;
614  int U = (ubuf0[count] * uvalpha1 + ubuf1[count] * uvalpha) >> 19;
615  int V = (vbuf0[count] * uvalpha1 + vbuf1[count] * uvalpha) >> 19;
616 
 /* the three assignments below are chained with the comma operator */
617  r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
618  g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
619  c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
620  b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
621 
622  yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
623  r, g, b, y, target, 0);
624  }
625 }
626 
/**
 * LASX implementation of the single-luma-row path that writes packed RGB.
 *
 * Luma is taken from buf0 alone and scaled as (buf0[x] + 64) >> 7.
 * Chroma depends on uvalpha:
 *  - uvalpha < 2048: only ubuf[0] / vbuf[0] are read, scaled as (x + 64) >> 7;
 *  - otherwise: rows 0 and 1 are summed and scaled as (a + b + 128) >> 8.
 * U and V are then offset by YUVRGB_TABLE_HEADROOM and used to look up the
 * r/g/b table pointers in c, which are passed to yuv2rgb_write() together
 * with the two luma values of the pixel pair.
 *
 * In both branches the row is processed 16 pixels at a time, then one
 * optional block of 8 pixels, then pair by pair up to (dstW + 1) >> 1 pairs.
 *
 * @param c        context providing the table_rV/gU/gV/bU lookup tables
 * @param buf0     luma row
 * @param ubuf     U rows; ubuf[1] is only read when uvalpha >= 2048
 * @param vbuf     V rows; vbuf[1] is only read when uvalpha >= 2048
 * @param abuf0    not used by this function
 * @param dest     destination row
 * @param dstW     output width in pixels
 * @param uvalpha  selects the chroma branch (compared against 2048)
 * @param y        output line number, forwarded to yuv2rgb_write()
 * @param target   destination pixel format, forwarded to yuv2rgb_write()
 * @param hasAlpha not used; yuv2rgb_write() is always called with 0
 */
627 static void
628 yuv2rgb_1_template_lasx(SwsContext *c, const int16_t *buf0,
629  const int16_t *ubuf[2], const int16_t *vbuf[2],
630  const int16_t *abuf0, uint8_t *dest, int dstW,
631  int uvalpha, int y, enum AVPixelFormat target,
632  int hasAlpha)
633 {
634  const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
635  int i;
636  int len = (dstW - 15);
637  int len_count = (dstW + 1) >> 1;
638  const void *r, *g, *b;
639 
 /* branch 1: chroma from row 0 only */
640  if (uvalpha < 2048) {
641  int count = 0;
642  int head = YUVRGB_TABLE_HEADROOM;
 /* 16-bit replicated here: it is added through widening adds below */
643  __m256i headroom = __lasx_xvreplgr2vr_h(head);
644 
 /* 16 pixels (8 pairs) per iteration */
645  for (i = 0; i < len; i += 16) {
646  int Y1, Y2, U, V;
647  int i_dex = i << 1;
648  int c_dex = count << 1;
649  __m256i src_y, src_u, src_v;
650  __m256i u, v, y_l, y_h;
651 
652  DUP2_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, src_y, src_u);
653  src_v = __lasx_xvldx(vbuf0, c_dex);
 /* combine U and V samples into one vector */
654  src_u = __lasx_xvpermi_q(src_u, src_v, 0x02);
 /* shift right by 7; the scalar tail below computes (x + 64) >> 7 */
655  src_y = __lasx_xvsrari_h(src_y, 7);
656  src_u = __lasx_xvsrari_h(src_u, 7);
657  y_l = __lasx_xvsllwil_w_h(src_y, 0);
658  y_h = __lasx_xvexth_w_h(src_y);
 /* widen even / odd chroma samples while adding the table headroom */
659  u = __lasx_xvaddwev_w_h(src_u, headroom);
660  v = __lasx_xvaddwod_w_h(src_u, headroom);
 /*
  * NOTE: despite their names, both `u` and `v` hold U samples (elements
  * picked with indices 0..3) and V samples (indices 4..7); `u` holds the
  * even-numbered samples and `v` the odd-numbered ones.
  */
661  WRITE_YUV2RGB(y_l, y_l, u, u, 0, 1, 0, 4);
662  WRITE_YUV2RGB(y_l, y_l, v, v, 2, 3, 0, 4);
663  WRITE_YUV2RGB(y_h, y_h, u, u, 0, 1, 1, 5);
664  WRITE_YUV2RGB(y_h, y_h, v, v, 2, 3, 1, 5);
665  WRITE_YUV2RGB(y_l, y_l, u, u, 4, 5, 2, 6);
666  WRITE_YUV2RGB(y_l, y_l, v, v, 6, 7, 2, 6);
667  WRITE_YUV2RGB(y_h, y_h, u, u, 4, 5, 3, 7);
668  WRITE_YUV2RGB(y_h, y_h, v, v, 6, 7, 3, 7);
669  }
 /* one block of 8 pixels (4 pairs) if enough pixels remain */
670  if (dstW - i >= 8){
671  int Y1, Y2, U, V;
672  int i_dex = i << 1;
673  __m256i src_y, src_u, src_v;
674  __m256i y_l, uv;
675 
676  src_y = __lasx_xvldx(buf0, i_dex);
677  src_u = __lasx_xvldrepl_d((ubuf0 + count), 0);
678  src_v = __lasx_xvldrepl_d((vbuf0 + count), 0);
679  src_u = __lasx_xvilvl_d(src_v, src_u);
680  y_l = __lasx_xvsrari_h(src_y, 7);
681  uv = __lasx_xvsrari_h(src_u, 7);
682  y_l = __lasx_vext2xv_w_h(y_l);
683  uv = __lasx_vext2xv_w_h(uv);
684  uv = __lasx_xvaddwev_w_h(uv, headroom);
 /* U is read from elements 0..3 of uv, V from elements 4..7 */
685  WRITE_YUV2RGB(y_l, y_l, uv, uv, 0, 1, 0, 4);
686  WRITE_YUV2RGB(y_l, y_l, uv, uv, 2, 3, 1, 5);
687  WRITE_YUV2RGB(y_l, y_l, uv, uv, 4, 5, 2, 6);
688  WRITE_YUV2RGB(y_l, y_l, uv, uv, 6, 7, 3, 7);
689  i += 8;
690  }
 /* scalar tail: one pixel pair per iteration */
691  for (; count < len_count; count++) {
692  int Y1 = (buf0[count * 2 ] + 64) >> 7;
693  int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
694  int U = (ubuf0[count] + 64) >> 7;
695  int V = (vbuf0[count] + 64) >> 7;
696 
697  r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
698  g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
699  c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
700  b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
701 
702  yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
703  r, g, b, y, target, 0);
704  }
 /* branch 2: chroma is the rounded average of rows 0 and 1 */
705  } else {
706  const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
707  int count = 0;
708  int HEADROOM = YUVRGB_TABLE_HEADROOM;
709  __m256i headroom = __lasx_xvreplgr2vr_w(HEADROOM);
710 
 /* 16 pixels (8 pairs) per iteration */
711  for (i = 0; i < len; i += 16) {
712  int Y1, Y2, U, V;
713  int i_dex = i << 1;
714  int c_dex = count << 1;
715  __m256i src_y, src_u0, src_v0, src_u1, src_v1;
716  __m256i y_l, y_h, u, v;
717 
718  DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
719  ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
720  src_v1 = __lasx_xvldx(vbuf1, c_dex);
 /* combine U and V samples of each row into one vector */
721  src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
722  src_u1 = __lasx_xvpermi_q(src_u1, src_v1, 0x02);
723  src_y = __lasx_xvsrari_h(src_y, 7);
 /* widening add of row 0 and row 1, even / odd samples separately */
724  u = __lasx_xvaddwev_w_h(src_u0, src_u1);
725  v = __lasx_xvaddwod_w_h(src_u0, src_u1);
726  y_l = __lasx_xvsllwil_w_h(src_y, 0);
727  y_h = __lasx_xvexth_w_h(src_y);
 /* shift right by 8; the scalar tail below computes (a + b + 128) >> 8 */
728  u = __lasx_xvsrari_w(u, 8);
729  v = __lasx_xvsrari_w(v, 8);
730  u = __lasx_xvadd_w(u, headroom);
731  v = __lasx_xvadd_w(v, headroom);
 /* same layout as in the first branch: `u` = even samples, `v` = odd */
732  WRITE_YUV2RGB(y_l, y_l, u, u, 0, 1, 0, 4);
733  WRITE_YUV2RGB(y_l, y_l, v, v, 2, 3, 0, 4);
734  WRITE_YUV2RGB(y_h, y_h, u, u, 0, 1, 1, 5);
735  WRITE_YUV2RGB(y_h, y_h, v, v, 2, 3, 1, 5);
736  WRITE_YUV2RGB(y_l, y_l, u, u, 4, 5, 2, 6);
737  WRITE_YUV2RGB(y_l, y_l, v, v, 6, 7, 2, 6);
738  WRITE_YUV2RGB(y_h, y_h, u, u, 4, 5, 3, 7);
739  WRITE_YUV2RGB(y_h, y_h, v, v, 6, 7, 3, 7);
740  }
 /* one block of 8 pixels (4 pairs) if enough pixels remain */
741  if (dstW - i >= 8) {
742  int Y1, Y2, U, V;
743  int i_dex = i << 1;
744  __m256i src_y, src_u0, src_v0, src_u1, src_v1;
745  __m256i uv;
746 
747  src_y = __lasx_xvldx(buf0, i_dex);
748  src_u0 = __lasx_xvldrepl_d((ubuf0 + count), 0);
749  src_v0 = __lasx_xvldrepl_d((vbuf0 + count), 0);
750  src_u1 = __lasx_xvldrepl_d((ubuf1 + count), 0);
751  src_v1 = __lasx_xvldrepl_d((vbuf1 + count), 0);
752 
 /* interleave row 0 and row 1 so a horizontal add sums matching samples */
753  src_u0 = __lasx_xvilvl_h(src_u1, src_u0);
754  src_v0 = __lasx_xvilvl_h(src_v1, src_v0);
755  src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
756  src_y = __lasx_xvsrari_h(src_y, 7);
757  uv = __lasx_xvhaddw_w_h(src_u0, src_u0);
758  src_y = __lasx_vext2xv_w_h(src_y);
759  uv = __lasx_xvsrari_w(uv, 8);
760  uv = __lasx_xvadd_w(uv, headroom);
 /* U is read from elements 0..3 of uv, V from elements 4..7 */
761  WRITE_YUV2RGB(src_y, src_y, uv, uv, 0, 1, 0, 4);
762  WRITE_YUV2RGB(src_y, src_y, uv, uv, 2, 3, 1, 5);
763  WRITE_YUV2RGB(src_y, src_y, uv, uv, 4, 5, 2, 6);
764  WRITE_YUV2RGB(src_y, src_y, uv, uv, 6, 7, 3, 7);
765  i += 8;
766  }
 /* scalar tail: one pixel pair per iteration */
767  for (; count < len_count; count++) {
768  int Y1 = (buf0[count * 2 ] + 64) >> 7;
769  int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
770  int U = (ubuf0[count] + ubuf1[count] + 128) >> 8;
771  int V = (vbuf0[count] + vbuf1[count] + 128) >> 8;
772 
773  r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
774  g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
775  c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
776  b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
777 
778  yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
779  r, g, b, y, target, 0);
780  }
781  }
782 }
783 
/*
 * Generate <prefix><suffix>_X_lasx(): the multi-tap vertical-filter entry
 * point for one output format. The generated function forwards all of its
 * arguments to <prefix><tmpl>_X_template_lasx() and appends the pixel format
 * and alpha flag given at expansion time.
 */
#define YUV2RGBWRAPPERX(prefix, tmpl, suffix, pixfmt, alpha)                   \
static void prefix ## suffix ## _X_lasx(SwsContext *c,                         \
                                        const int16_t *lumFilter,              \
                                        const int16_t **lumSrc,                \
                                        int lumFilterSize,                     \
                                        const int16_t *chrFilter,              \
                                        const int16_t **chrUSrc,               \
                                        const int16_t **chrVSrc,               \
                                        int chrFilterSize,                     \
                                        const int16_t **alpSrc,                \
                                        uint8_t *dest, int dstW, int y)        \
{                                                                              \
    prefix ## tmpl ## _X_template_lasx(c, lumFilter, lumSrc, lumFilterSize,    \
                                       chrFilter, chrUSrc, chrVSrc,            \
                                       chrFilterSize, alpSrc, dest, dstW,      \
                                       y, pixfmt, alpha);                      \
}
796 
/*
 * Generate the _X_lasx entry point (through YUV2RGBWRAPPERX) plus
 * <prefix><suffix>_2_lasx(), the two-row blend entry point, which forwards to
 * <prefix><tmpl>_2_template_lasx() with the pixel format and alpha flag
 * given at expansion time.
 */
#define YUV2RGBWRAPPERX2(prefix, tmpl, suffix, pixfmt, alpha)                  \
YUV2RGBWRAPPERX(prefix, tmpl, suffix, pixfmt, alpha)                           \
static void prefix ## suffix ## _2_lasx(SwsContext *c,                         \
                                        const int16_t *buf[2],                 \
                                        const int16_t *ubuf[2],                \
                                        const int16_t *vbuf[2],                \
                                        const int16_t *abuf[2],                \
                                        uint8_t *dest, int dstW,               \
                                        int yalpha, int uvalpha, int y)        \
{                                                                              \
    prefix ## tmpl ## _2_template_lasx(c, buf, ubuf, vbuf, abuf, dest,         \
                                       dstW, yalpha, uvalpha, y,               \
                                       pixfmt, alpha);                         \
}
807 
/*
 * Generate all three entry points for one output format: _X_lasx and _2_lasx
 * (through YUV2RGBWRAPPERX2) plus <prefix><suffix>_1_lasx(), the
 * single-luma-row entry point, which forwards to
 * <prefix><tmpl>_1_template_lasx() with the pixel format and alpha flag
 * given at expansion time.
 */
#define YUV2RGBWRAPPER(prefix, tmpl, suffix, pixfmt, alpha)                    \
YUV2RGBWRAPPERX2(prefix, tmpl, suffix, pixfmt, alpha)                          \
static void prefix ## suffix ## _1_lasx(SwsContext *c,                         \
                                        const int16_t *buf0,                   \
                                        const int16_t *ubuf[2],                \
                                        const int16_t *vbuf[2],                \
                                        const int16_t *abuf0,                  \
                                        uint8_t *dest, int dstW,               \
                                        int uvalpha, int y)                    \
{                                                                              \
    prefix ## tmpl ## _1_template_lasx(c, buf0, ubuf, vbuf, abuf0, dest,       \
                                       dstW, uvalpha, y, pixfmt, alpha);       \
}
818 
819 
/*
 * Instantiate the X/2/1 entry points for packed 24-bit RGB and BGR output,
 * always without alpha (hasAlpha == 0). These expand to calls into the
 * yuv2rgb_{X,2,1}_template_lasx() functions.
 * NOTE(review): the CONFIG_SMALL / CONFIG_SWSCALE_ALPHA conditional below has
 * no visible content in either branch - confirm whether it is intentional
 * scaffolding or whether instantiations are missing.
 */
820 #if CONFIG_SMALL
821 #else
822 #if CONFIG_SWSCALE_ALPHA
823 #endif
826 #endif
827 YUV2RGBWRAPPER(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
828 YUV2RGBWRAPPER(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)
835 
836 // This function is copied from libswscale/output.c
838  uint8_t *dest, int i, int R, int A, int G, int B,
839  int y, enum AVPixelFormat target, int hasAlpha, int err[4])
840 {
841  int isrgb8 = target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8;
842 
843  if ((R | G | B) & 0xC0000000) {
844  R = av_clip_uintp2(R, 30);
845  G = av_clip_uintp2(G, 30);
846  B = av_clip_uintp2(B, 30);
847  }
848 
849  switch(target) {
850  case AV_PIX_FMT_ARGB:
851  dest[0] = hasAlpha ? A : 255;
852  dest[1] = R >> 22;
853  dest[2] = G >> 22;
854  dest[3] = B >> 22;
855  break;
856  case AV_PIX_FMT_RGB24:
857  dest[0] = R >> 22;
858  dest[1] = G >> 22;
859  dest[2] = B >> 22;
860  break;
861  case AV_PIX_FMT_RGBA:
862  dest[0] = R >> 22;
863  dest[1] = G >> 22;
864  dest[2] = B >> 22;
865  dest[3] = hasAlpha ? A : 255;
866  break;
867  case AV_PIX_FMT_ABGR:
868  dest[0] = hasAlpha ? A : 255;
869  dest[1] = B >> 22;
870  dest[2] = G >> 22;
871  dest[3] = R >> 22;
872  break;
873  case AV_PIX_FMT_BGR24:
874  dest[0] = B >> 22;
875  dest[1] = G >> 22;
876  dest[2] = R >> 22;
877  break;
878  case AV_PIX_FMT_BGRA:
879  dest[0] = B >> 22;
880  dest[1] = G >> 22;
881  dest[2] = R >> 22;
882  dest[3] = hasAlpha ? A : 255;
883  break;
886  case AV_PIX_FMT_BGR8:
887  case AV_PIX_FMT_RGB8:
888  {
889  int r,g,b;
890 
891  switch (c->dither) {
892  default:
893  case SWS_DITHER_AUTO:
894  case SWS_DITHER_ED:
895  R >>= 22;
896  G >>= 22;
897  B >>= 22;
898  R += (7*err[0] + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2])>>4;
899  G += (7*err[1] + 1*c->dither_error[1][i] + 5*c->dither_error[1][i+1] + 3*c->dither_error[1][i+2])>>4;
900  B += (7*err[2] + 1*c->dither_error[2][i] + 5*c->dither_error[2][i+1] + 3*c->dither_error[2][i+2])>>4;
901  c->dither_error[0][i] = err[0];
902  c->dither_error[1][i] = err[1];
903  c->dither_error[2][i] = err[2];
904  r = R >> (isrgb8 ? 5 : 7);
905  g = G >> (isrgb8 ? 5 : 6);
906  b = B >> (isrgb8 ? 6 : 7);
907  r = av_clip(r, 0, isrgb8 ? 7 : 1);
908  g = av_clip(g, 0, isrgb8 ? 7 : 3);
909  b = av_clip(b, 0, isrgb8 ? 3 : 1);
910  err[0] = R - r*(isrgb8 ? 36 : 255);
911  err[1] = G - g*(isrgb8 ? 36 : 85);
912  err[2] = B - b*(isrgb8 ? 85 : 255);
913  break;
914  case SWS_DITHER_A_DITHER:
915  if (isrgb8) {
916  /* see http://pippin.gimp.org/a_dither/ for details/origin */
917 #define A_DITHER(u,v) (((((u)+((v)*236))*119)&0xff))
918  r = (((R >> 19) + A_DITHER(i,y) -96)>>8);
919  g = (((G >> 19) + A_DITHER(i + 17,y) - 96)>>8);
920  b = (((B >> 20) + A_DITHER(i + 17*2,y) -96)>>8);
921  r = av_clip_uintp2(r, 3);
922  g = av_clip_uintp2(g, 3);
923  b = av_clip_uintp2(b, 2);
924  } else {
925  r = (((R >> 21) + A_DITHER(i,y)-256)>>8);
926  g = (((G >> 19) + A_DITHER(i + 17,y)-256)>>8);
927  b = (((B >> 21) + A_DITHER(i + 17*2,y)-256)>>8);
928  r = av_clip_uintp2(r, 1);
929  g = av_clip_uintp2(g, 2);
930  b = av_clip_uintp2(b, 1);
931  }
932  break;
933  case SWS_DITHER_X_DITHER:
934  if (isrgb8) {
935  /* see http://pippin.gimp.org/a_dither/ for details/origin */
936 #define X_DITHER(u,v) (((((u)^((v)*237))*181)&0x1ff)/2)
937  r = (((R >> 19) + X_DITHER(i,y) - 96)>>8);
938  g = (((G >> 19) + X_DITHER(i + 17,y) - 96)>>8);
939  b = (((B >> 20) + X_DITHER(i + 17*2,y) - 96)>>8);
940  r = av_clip_uintp2(r, 3);
941  g = av_clip_uintp2(g, 3);
942  b = av_clip_uintp2(b, 2);
943  } else {
944  r = (((R >> 21) + X_DITHER(i,y)-256)>>8);
945  g = (((G >> 19) + X_DITHER(i + 17,y)-256)>>8);
946  b = (((B >> 21) + X_DITHER(i + 17*2,y)-256)>>8);
947  r = av_clip_uintp2(r, 1);
948  g = av_clip_uintp2(g, 2);
949  b = av_clip_uintp2(b, 1);
950  }
951 
952  break;
953  }
954 
955  if(target == AV_PIX_FMT_BGR4_BYTE) {
956  dest[0] = r + 2*g + 8*b;
957  } else if(target == AV_PIX_FMT_RGB4_BYTE) {
958  dest[0] = b + 2*g + 8*r;
959  } else if(target == AV_PIX_FMT_BGR8) {
960  dest[0] = r + 8*g + 64*b;
961  } else if(target == AV_PIX_FMT_RGB8) {
962  dest[0] = b + 4*g + 32*r;
963  } else
964  av_assert2(0);
965  break; }
966  }
967 }
968 
/*
 * Declare the conversion constants shared by the full-chroma templates.
 *
 * Expands to declarations, so it must be placed among the local variable
 * declarations of a function that has a context pointer named `c`.
 * It provides both scalar copies (y_offset, y_coeff, *_coe) for the scalar
 * tail loops and 32-bit-replicated vectors (offset, coeff, v2r, v2g, u2g,
 * u2b) for the YUV2RGB() macro. The expansion already ends with ';'.
 */
#define YUV2RGB_SETUP                                           \
    int y_offset   = c->yuv2rgb_y_offset;                       \
    int y_coeff    = c->yuv2rgb_y_coeff;                        \
    int v2r_coe    = c->yuv2rgb_v2r_coeff;                      \
    int v2g_coe    = c->yuv2rgb_v2g_coeff;                      \
    int u2g_coe    = c->yuv2rgb_u2g_coeff;                      \
    int u2b_coe    = c->yuv2rgb_u2b_coeff;                      \
    __m256i offset = __lasx_xvreplgr2vr_w(y_offset);            \
    __m256i coeff  = __lasx_xvreplgr2vr_w(y_coeff);             \
    __m256i v2r    = __lasx_xvreplgr2vr_w(v2r_coe);             \
    __m256i v2g    = __lasx_xvreplgr2vr_w(v2g_coe);             \
    __m256i u2g    = __lasx_xvreplgr2vr_w(u2g_coe);             \
    __m256i u2b    = __lasx_xvreplgr2vr_w(u2b_coe);

983 
/*
 * Vector YUV -> RGB for eight 32-bit lanes:
 *   y = (y - offset) * coeff + y_temp
 *   R = y + v * v2r
 *   G = y + v * v2g + u * u2g
 *   B = y + u * u2b
 *
 * Side effects: `y` and `v` are overwritten (v is reused as the partial G
 * sum), so callers must not rely on their values afterwards. Every argument
 * must be a plain __m256i variable name.
 */
#define YUV2RGB(y, u, v, R, G, B, offset, coeff,              \
                y_temp, v2r, v2g, u2g, u2b)                   \
do {                                                          \
     y = __lasx_xvsub_w(y, offset);                           \
     y = __lasx_xvmul_w(y, coeff);                            \
     y = __lasx_xvadd_w(y, y_temp);                           \
     R = __lasx_xvmadd_w(y, v, v2r);                          \
     v = __lasx_xvmadd_w(y, v, v2g);                          \
     G = __lasx_xvmadd_w(v, u, u2g);                          \
     B = __lasx_xvmadd_w(y, u, u2b);                          \
} while (0)
995 
/*
 * Write one pixel with alpha: extract 32-bit lane `t1` of the vectors r, g,
 * b and a, clip alpha to 8 bits when bit 8 is set, store the pixel through
 * yuv2rgb_write_full() as pixel index i + s, and advance dest by `step`.
 *
 * Uses these names from the enclosing function: R, G, B, A, c, dest, i, y,
 * target, hasAlpha, err, step. `t1` must be a compile-time constant.
 */
#define WRITE_FULL_A(r, g, b, a, t1, s)                                      \
do {                                                                         \
    R = __lasx_xvpickve2gr_w(r, t1);                                         \
    G = __lasx_xvpickve2gr_w(g, t1);                                         \
    B = __lasx_xvpickve2gr_w(b, t1);                                         \
    A = __lasx_xvpickve2gr_w(a, t1);                                         \
    if (A & 0x100)                                                           \
        A = av_clip_uint8(A);                                                \
    yuv2rgb_write_full(c, dest, i + (s), R, A, G, B, y, target, hasAlpha, err);\
    dest += step;                                                            \
} while (0)
1007 
/*
 * Write one pixel without alpha: extract 32-bit lane `t1` of the vectors r,
 * g and b, store the pixel through yuv2rgb_write_full() as pixel index
 * i + s (alpha argument fixed to 0), and advance dest by `step`.
 *
 * Uses these names from the enclosing function: R, G, B, c, dest, i, y,
 * target, hasAlpha, err, step. `t1` must be a compile-time constant.
 */
#define WRITE_FULL(r, g, b, t1, s)                                            \
do {                                                                          \
    R = __lasx_xvpickve2gr_w(r, t1);                                          \
    G = __lasx_xvpickve2gr_w(g, t1);                                          \
    B = __lasx_xvpickve2gr_w(b, t1);                                          \
    yuv2rgb_write_full(c, dest, i + (s), R, 0, G, B, y, target, hasAlpha, err); \
    dest += step;                                                             \
} while (0)
1016 
1017 static void
1018 yuv2rgb_full_X_template_lasx(SwsContext *c, const int16_t *lumFilter,
1019  const int16_t **lumSrc, int lumFilterSize,
1020  const int16_t *chrFilter, const int16_t **chrUSrc,
1021  const int16_t **chrVSrc, int chrFilterSize,
1022  const int16_t **alpSrc, uint8_t *dest,
1023  int dstW, int y, enum AVPixelFormat target,
1024  int hasAlpha)
1025 {
1026  int i, j, B, G, R, A;
1027  int step = (target == AV_PIX_FMT_RGB24 ||
1028  target == AV_PIX_FMT_BGR24) ? 3 : 4;
1029  int err[4] = {0};
1030  int a_temp = 1 << 18;
1031  int templ = 1 << 9;
1032  int tempc = templ - (128 << 19);
1033  int ytemp = 1 << 21;
1034  int len = dstW - 15;
1035  __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
1037 
1038  if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
1039  || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
1040  step = 1;
1041 
1042  for (i = 0; i < len; i += 16) {
1043  __m256i l_src, u_src, v_src;
1044  __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od, temp;
1045  __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;
1046  int n = i << 1;
1047 
1048  y_ev = y_od = __lasx_xvreplgr2vr_w(templ);
1049  u_ev = u_od = v_ev = v_od = __lasx_xvreplgr2vr_w(tempc);
1050  for (j = 0; j < lumFilterSize; j++) {
1051  temp = __lasx_xvldrepl_h((lumFilter + j), 0);
1052  l_src = __lasx_xvldx(lumSrc[j], n);
1053  y_ev = __lasx_xvmaddwev_w_h(y_ev, l_src, temp);
1054  y_od = __lasx_xvmaddwod_w_h(y_od, l_src, temp);
1055  }
1056  for (j = 0; j < chrFilterSize; j++) {
1057  temp = __lasx_xvldrepl_h((chrFilter + j), 0);
1058  DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n,
1059  u_src, v_src);
1060  DUP2_ARG3(__lasx_xvmaddwev_w_h, u_ev, u_src, temp, v_ev,
1061  v_src, temp, u_ev, v_ev);
1062  DUP2_ARG3(__lasx_xvmaddwod_w_h, u_od, u_src, temp, v_od,
1063  v_src, temp, u_od, v_od);
1064  }
1065  y_ev = __lasx_xvsrai_w(y_ev, 10);
1066  y_od = __lasx_xvsrai_w(y_od, 10);
1067  u_ev = __lasx_xvsrai_w(u_ev, 10);
1068  u_od = __lasx_xvsrai_w(u_od, 10);
1069  v_ev = __lasx_xvsrai_w(v_ev, 10);
1070  v_od = __lasx_xvsrai_w(v_od, 10);
1071  YUV2RGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
1072  y_temp, v2r, v2g, u2g, u2b);
1073  YUV2RGB(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
1074  y_temp, v2r, v2g, u2g, u2b);
1075 
1076  if (hasAlpha) {
1077  __m256i a_src, a_ev, a_od;
1078 
1079  a_ev = a_od = __lasx_xvreplgr2vr_w(a_temp);
1080  for (j = 0; j < lumFilterSize; j++) {
1081  temp = __lasx_xvldrepl_h(lumFilter + j, 0);
1082  a_src = __lasx_xvldx(alpSrc[j], n);
1083  a_ev = __lasx_xvmaddwev_w_h(a_ev, a_src, temp);
1084  a_od = __lasx_xvmaddwod_w_h(a_od, a_src, temp);
1085  }
1086  a_ev = __lasx_xvsrai_w(a_ev, 19);
1087  a_od = __lasx_xvsrai_w(a_od, 19);
1088  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
1089  WRITE_FULL_A(R_od, G_od, B_od, a_od, 0, 1);
1090  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 2);
1091  WRITE_FULL_A(R_od, G_od, B_od, a_od, 1, 3);
1092  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 4);
1093  WRITE_FULL_A(R_od, G_od, B_od, a_od, 2, 5);
1094  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 6);
1095  WRITE_FULL_A(R_od, G_od, B_od, a_od, 3, 7);
1096  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 8);
1097  WRITE_FULL_A(R_od, G_od, B_od, a_od, 4, 9);
1098  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 10);
1099  WRITE_FULL_A(R_od, G_od, B_od, a_od, 5, 11);
1100  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 12);
1101  WRITE_FULL_A(R_od, G_od, B_od, a_od, 6, 13);
1102  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 14);
1103  WRITE_FULL_A(R_od, G_od, B_od, a_od, 7, 15);
1104  } else {
1105  WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
1106  WRITE_FULL(R_od, G_od, B_od, 0, 1);
1107  WRITE_FULL(R_ev, G_ev, B_ev, 1, 2);
1108  WRITE_FULL(R_od, G_od, B_od, 1, 3);
1109  WRITE_FULL(R_ev, G_ev, B_ev, 2, 4);
1110  WRITE_FULL(R_od, G_od, B_od, 2, 5);
1111  WRITE_FULL(R_ev, G_ev, B_ev, 3, 6);
1112  WRITE_FULL(R_od, G_od, B_od, 3, 7);
1113  WRITE_FULL(R_ev, G_ev, B_ev, 4, 8);
1114  WRITE_FULL(R_od, G_od, B_od, 4, 9);
1115  WRITE_FULL(R_ev, G_ev, B_ev, 5, 10);
1116  WRITE_FULL(R_od, G_od, B_od, 5, 11);
1117  WRITE_FULL(R_ev, G_ev, B_ev, 6, 12);
1118  WRITE_FULL(R_od, G_od, B_od, 6, 13);
1119  WRITE_FULL(R_ev, G_ev, B_ev, 7, 14);
1120  WRITE_FULL(R_od, G_od, B_od, 7, 15);
1121  }
1122  }
1123  if (dstW - i >= 8) {
1124  __m256i l_src, u_src, v_src;
1125  __m256i y_ev, u_ev, v_ev, uv, temp;
1126  __m256i R_ev, G_ev, B_ev;
1127  int n = i << 1;
1128 
1129  y_ev = __lasx_xvreplgr2vr_w(templ);
1130  u_ev = v_ev = __lasx_xvreplgr2vr_w(tempc);
1131  for (j = 0; j < lumFilterSize; j++) {
1132  temp = __lasx_xvldrepl_h((lumFilter + j), 0);
1133  l_src = __lasx_xvldx(lumSrc[j], n);
1134  l_src = __lasx_xvpermi_d(l_src, 0xD8);
1135  l_src = __lasx_xvilvl_h(l_src, l_src);
1136  y_ev = __lasx_xvmaddwev_w_h(y_ev, l_src, temp);
1137  }
1138  for (j = 0; j < chrFilterSize; j++) {
1139  temp = __lasx_xvldrepl_h((chrFilter + j), 0);
1140  DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n, u_src, v_src);
1141  u_src = __lasx_xvpermi_d(u_src, 0xD8);
1142  v_src = __lasx_xvpermi_d(v_src, 0xD8);
1143  uv = __lasx_xvilvl_h(v_src, u_src);
1144  u_ev = __lasx_xvmaddwev_w_h(u_ev, uv, temp);
1145  v_ev = __lasx_xvmaddwod_w_h(v_ev, uv, temp);
1146  }
1147  y_ev = __lasx_xvsrai_w(y_ev, 10);
1148  u_ev = __lasx_xvsrai_w(u_ev, 10);
1149  v_ev = __lasx_xvsrai_w(v_ev, 10);
1150  YUV2RGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
1151  y_temp, v2r, v2g, u2g, u2b);
1152 
1153  if (hasAlpha) {
1154  __m256i a_src, a_ev;
1155 
1156  a_ev = __lasx_xvreplgr2vr_w(a_temp);
1157  for (j = 0; j < lumFilterSize; j++) {
1158  temp = __lasx_xvldrepl_h(lumFilter + j, 0);
1159  a_src = __lasx_xvldx(alpSrc[j], n);
1160  a_src = __lasx_xvpermi_d(a_src, 0xD8);
1161  a_src = __lasx_xvilvl_h(a_src, a_src);
1162  a_ev = __lasx_xvmaddwev_w_h(a_ev, a_src, temp);
1163  }
1164  a_ev = __lasx_xvsrai_w(a_ev, 19);
1165  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
1166  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 1);
1167  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 2);
1168  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 3);
1169  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 4);
1170  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 5);
1171  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 6);
1172  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 7);
1173  } else {
1174  WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
1175  WRITE_FULL(R_ev, G_ev, B_ev, 1, 1);
1176  WRITE_FULL(R_ev, G_ev, B_ev, 2, 2);
1177  WRITE_FULL(R_ev, G_ev, B_ev, 3, 3);
1178  WRITE_FULL(R_ev, G_ev, B_ev, 4, 4);
1179  WRITE_FULL(R_ev, G_ev, B_ev, 5, 5);
1180  WRITE_FULL(R_ev, G_ev, B_ev, 6, 6);
1181  WRITE_FULL(R_ev, G_ev, B_ev, 7, 7);
1182  }
1183  i += 8;
1184  }
1185  for (; i < dstW; i++) {
1186  int Y = templ;
1187  int V, U = V = tempc;
1188 
1189  A = 0;
1190  for (j = 0; j < lumFilterSize; j++) {
1191  Y += lumSrc[j][i] * lumFilter[j];
1192  }
1193  for (j = 0; j < chrFilterSize; j++) {
1194  U += chrUSrc[j][i] * chrFilter[j];
1195  V += chrVSrc[j][i] * chrFilter[j];
1196 
1197  }
1198  Y >>= 10;
1199  U >>= 10;
1200  V >>= 10;
1201  if (hasAlpha) {
1202  A = 1 << 18;
1203  for (j = 0; j < lumFilterSize; j++) {
1204  A += alpSrc[j][i] * lumFilter[j];
1205  }
1206  A >>= 19;
1207  if (A & 0x100)
1208  A = av_clip_uint8(A);
1209  }
1210  Y -= y_offset;
1211  Y *= y_coeff;
1212  Y += ytemp;
1213  R = (unsigned)Y + V * v2r_coe;
1214  G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
1215  B = (unsigned)Y + U * u2b_coe;
1216  yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
1217  dest += step;
1218  }
1219  c->dither_error[0][i] = err[0];
1220  c->dither_error[1][i] = err[1];
1221  c->dither_error[2][i] = err[2];
1222 }
1223 
1224 static void
1226  const int16_t *ubuf[2], const int16_t *vbuf[2],
1227  const int16_t *abuf[2], uint8_t *dest, int dstW,
1228  int yalpha, int uvalpha, int y,
1229  enum AVPixelFormat target, int hasAlpha)
1230 {
1231  const int16_t *buf0 = buf[0], *buf1 = buf[1],
1232  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1233  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
1234  *abuf0 = hasAlpha ? abuf[0] : NULL,
1235  *abuf1 = hasAlpha ? abuf[1] : NULL;
1236  int yalpha1 = 4096 - yalpha;
1237  int uvalpha1 = 4096 - uvalpha;
1238  int uvtemp = 128 << 19;
1239  int atemp = 1 << 18;
1240  int err[4] = {0};
1241  int ytemp = 1 << 21;
1242  int len = dstW - 15;
1243  int i, R, G, B, A;
1244  int step = (target == AV_PIX_FMT_RGB24 ||
1245  target == AV_PIX_FMT_BGR24) ? 3 : 4;
1246  __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
1247  __m256i v_yalpha1 = __lasx_xvreplgr2vr_w(yalpha1);
1248  __m256i v_uvalpha = __lasx_xvreplgr2vr_w(uvalpha);
1249  __m256i v_yalpha = __lasx_xvreplgr2vr_w(yalpha);
1250  __m256i uv = __lasx_xvreplgr2vr_w(uvtemp);
1251  __m256i a_bias = __lasx_xvreplgr2vr_w(atemp);
1252  __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
1254 
1255  av_assert2(yalpha <= 4096U);
1256  av_assert2(uvalpha <= 4096U);
1257 
1258  if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
1259  || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
1260  step = 1;
1261 
1262  for (i = 0; i < len; i += 16) {
1263  __m256i b0, b1, ub0, ub1, vb0, vb1;
1264  __m256i y0_l, y0_h, y1_l, y1_h, u0_l, u0_h;
1265  __m256i v0_l, v0_h, u1_l, u1_h, v1_l, v1_h;
1266  __m256i y_l, y_h, v_l, v_h, u_l, u_h;
1267  __m256i R_l, R_h, G_l, G_h, B_l, B_h;
1268  int n = i << 1;
1269 
1270  DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0,
1271  n, ubuf1, n, b0, b1, ub0, ub1);
1272  DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0 , vb1);
1273  DUP2_ARG2(__lasx_xvsllwil_w_h, b0, 0, b1, 0, y0_l, y1_l);
1274  DUP4_ARG2(__lasx_xvsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
1275  u0_l, u1_l, v0_l, v1_l);
1276  DUP2_ARG1(__lasx_xvexth_w_h, b0, b1, y0_h, y1_h);
1277  DUP4_ARG1(__lasx_xvexth_w_h, ub0, ub1, vb0, vb1,
1278  u0_h, u1_h, v0_h, v1_h);
1279  y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
1280  y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
1281  u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
1282  u0_h = __lasx_xvmul_w(u0_h, v_uvalpha1);
1283  v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
1284  v0_h = __lasx_xvmul_w(v0_h, v_uvalpha1);
1285  y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
1286  y_h = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
1287  u_l = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
1288  u_h = __lasx_xvmadd_w(u0_h, v_uvalpha, u1_h);
1289  v_l = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
1290  v_h = __lasx_xvmadd_w(v0_h, v_uvalpha, v1_h);
1291  u_l = __lasx_xvsub_w(u_l, uv);
1292  u_h = __lasx_xvsub_w(u_h, uv);
1293  v_l = __lasx_xvsub_w(v_l, uv);
1294  v_h = __lasx_xvsub_w(v_h, uv);
1295  y_l = __lasx_xvsrai_w(y_l, 10);
1296  y_h = __lasx_xvsrai_w(y_h, 10);
1297  u_l = __lasx_xvsrai_w(u_l, 10);
1298  u_h = __lasx_xvsrai_w(u_h, 10);
1299  v_l = __lasx_xvsrai_w(v_l, 10);
1300  v_h = __lasx_xvsrai_w(v_h, 10);
1301  YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
1302  y_temp, v2r, v2g, u2g, u2b);
1303  YUV2RGB(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
1304  y_temp, v2r, v2g, u2g, u2b);
1305 
1306  if (hasAlpha) {
1307  __m256i a0, a1, a0_l, a0_h;
1308  __m256i a_l, a_h, a1_l, a1_h;
1309 
1310  DUP2_ARG2(__lasx_xvldx, abuf0, n, abuf1, n, a0, a1);
1311  DUP2_ARG2(__lasx_xvsllwil_w_h, a0, 0, a1, 0, a0_l, a1_l);
1312  DUP2_ARG1(__lasx_xvexth_w_h, a0, a1, a0_h, a1_h);
1313  a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1);
1314  a_h = __lasx_xvmadd_w(a_bias, a0_h, v_yalpha1);
1315  a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
1316  a_h = __lasx_xvmadd_w(a_h, v_yalpha, a1_h);
1317  a_l = __lasx_xvsrai_w(a_l, 19);
1318  a_h = __lasx_xvsrai_w(a_h, 19);
1319  WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
1320  WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
1321  WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
1322  WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
1323  WRITE_FULL_A(R_h, G_h, B_h, a_h, 0, 4);
1324  WRITE_FULL_A(R_h, G_h, B_h, a_h, 1, 5);
1325  WRITE_FULL_A(R_h, G_h, B_h, a_h, 2, 6);
1326  WRITE_FULL_A(R_h, G_h, B_h, a_h, 3, 7);
1327  WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 8);
1328  WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 9);
1329  WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 10);
1330  WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 11);
1331  WRITE_FULL_A(R_h, G_h, B_h, a_h, 4, 12);
1332  WRITE_FULL_A(R_h, G_h, B_h, a_h, 5, 13);
1333  WRITE_FULL_A(R_h, G_h, B_h, a_h, 6, 14);
1334  WRITE_FULL_A(R_h, G_h, B_h, a_h, 7, 15);
1335  } else {
1336  WRITE_FULL(R_l, G_l, B_l, 0, 0);
1337  WRITE_FULL(R_l, G_l, B_l, 1, 1);
1338  WRITE_FULL(R_l, G_l, B_l, 2, 2);
1339  WRITE_FULL(R_l, G_l, B_l, 3, 3);
1340  WRITE_FULL(R_h, G_h, B_h, 0, 4);
1341  WRITE_FULL(R_h, G_h, B_h, 1, 5);
1342  WRITE_FULL(R_h, G_h, B_h, 2, 6);
1343  WRITE_FULL(R_h, G_h, B_h, 3, 7);
1344  WRITE_FULL(R_l, G_l, B_l, 4, 8);
1345  WRITE_FULL(R_l, G_l, B_l, 5, 9);
1346  WRITE_FULL(R_l, G_l, B_l, 6, 10);
1347  WRITE_FULL(R_l, G_l, B_l, 7, 11);
1348  WRITE_FULL(R_h, G_h, B_h, 4, 12);
1349  WRITE_FULL(R_h, G_h, B_h, 5, 13);
1350  WRITE_FULL(R_h, G_h, B_h, 6, 14);
1351  WRITE_FULL(R_h, G_h, B_h, 7, 15);
1352  }
1353  }
1354  if (dstW - i >= 8) {
1355  __m256i b0, b1, ub0, ub1, vb0, vb1;
1356  __m256i y0_l, y1_l, u0_l;
1357  __m256i v0_l, u1_l, v1_l;
1358  __m256i y_l, u_l, v_l;
1359  __m256i R_l, G_l, B_l;
1360  int n = i << 1;
1361 
1362  DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0, n,
1363  ubuf1, n, b0, b1, ub0, ub1);
1364  DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0, vb1);
1365  DUP2_ARG1(__lasx_vext2xv_w_h, b0, b1, y0_l, y1_l);
1366  DUP4_ARG1(__lasx_vext2xv_w_h, ub0, ub1, vb0, vb1,
1367  u0_l, u1_l, v0_l, v1_l);
1368  y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
1369  u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
1370  v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
1371  y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
1372  u_l = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
1373  v_l = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
1374  u_l = __lasx_xvsub_w(u_l, uv);
1375  v_l = __lasx_xvsub_w(v_l, uv);
1376  y_l = __lasx_xvsrai_w(y_l, 10);
1377  u_l = __lasx_xvsrai_w(u_l, 10);
1378  v_l = __lasx_xvsrai_w(v_l, 10);
1379  YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
1380  y_temp, v2r, v2g, u2g, u2b);
1381 
1382  if (hasAlpha) {
1383  __m256i a0, a1, a0_l;
1384  __m256i a_l, a1_l;
1385 
1386  DUP2_ARG2(__lasx_xvldx, abuf0, n, abuf1, n, a0, a1);
1387  DUP2_ARG1(__lasx_vext2xv_w_h, a0, a1, a0_l, a1_l);
1388  a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1);
1389  a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
1390  a_l = __lasx_xvsrai_w(a_l, 19);
1391  WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
1392  WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
1393  WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
1394  WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
1395  WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
1396  WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
1397  WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
1398  WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
1399  } else {
1400  WRITE_FULL(R_l, G_l, B_l, 0, 0);
1401  WRITE_FULL(R_l, G_l, B_l, 1, 1);
1402  WRITE_FULL(R_l, G_l, B_l, 2, 2);
1403  WRITE_FULL(R_l, G_l, B_l, 3, 3);
1404  WRITE_FULL(R_l, G_l, B_l, 4, 4);
1405  WRITE_FULL(R_l, G_l, B_l, 5, 5);
1406  WRITE_FULL(R_l, G_l, B_l, 6, 6);
1407  WRITE_FULL(R_l, G_l, B_l, 7, 7);
1408  }
1409  i += 8;
1410  }
1411  for (; i < dstW; i++){
1412  int Y = ( buf0[i] * yalpha1 + buf1[i] * yalpha ) >> 10;
1413  int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha- uvtemp) >> 10;
1414  int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha- uvtemp) >> 10;
1415 
1416  A = 0;
1417  if (hasAlpha){
1418  A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha + atemp) >> 19;
1419  if (A & 0x100)
1420  A = av_clip_uint8(A);
1421  }
1422 
1423  Y -= y_offset;
1424  Y *= y_coeff;
1425  Y += ytemp;
1426  R = (unsigned)Y + V * v2r_coe;
1427  G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
1428  B = (unsigned)Y + U * u2b_coe;
1429  yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
1430  dest += step;
1431  }
1432  c->dither_error[0][i] = err[0];
1433  c->dither_error[1][i] = err[1];
1434  c->dither_error[2][i] = err[2];
1435 }
1436 
1437 static void
1439  const int16_t *ubuf[2], const int16_t *vbuf[2],
1440  const int16_t *abuf0, uint8_t *dest, int dstW,
1441  int uvalpha, int y, enum AVPixelFormat target,
1442  int hasAlpha)
1443 {
1444  const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
1445  int i, B, G, R, A;
1446  int step = (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) ? 3 : 4;
1447  int err[4] = {0};
1448  int ytemp = 1 << 21;
1449  int bias_int = 64;
1450  int len = dstW - 15;
1451  __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
1453 
1454  if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
1455  || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
1456  step = 1;
1457  if (uvalpha < 2048) {
1458  int uvtemp = 128 << 7;
1459  __m256i uv = __lasx_xvreplgr2vr_w(uvtemp);
1460  __m256i bias = __lasx_xvreplgr2vr_w(bias_int);
1461 
1462  for (i = 0; i < len; i += 16) {
1463  __m256i b, ub, vb, ub_l, ub_h, vb_l, vb_h;
1464  __m256i y_l, y_h, u_l, u_h, v_l, v_h;
1465  __m256i R_l, R_h, G_l, G_h, B_l, B_h;
1466  int n = i << 1;
1467 
1468  DUP2_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, b, ub);
1469  vb = __lasx_xvldx(vbuf0, n);
1470  y_l = __lasx_xvsllwil_w_h(b, 2);
1471  y_h = __lasx_xvexth_w_h(b);
1472  DUP2_ARG2(__lasx_xvsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
1473  DUP2_ARG1(__lasx_xvexth_w_h, ub, vb, ub_h, vb_h);
1474  y_h = __lasx_xvslli_w(y_h, 2);
1475  u_l = __lasx_xvsub_w(ub_l, uv);
1476  u_h = __lasx_xvsub_w(ub_h, uv);
1477  v_l = __lasx_xvsub_w(vb_l, uv);
1478  v_h = __lasx_xvsub_w(vb_h, uv);
1479  u_l = __lasx_xvslli_w(u_l, 2);
1480  u_h = __lasx_xvslli_w(u_h, 2);
1481  v_l = __lasx_xvslli_w(v_l, 2);
1482  v_h = __lasx_xvslli_w(v_h, 2);
1483  YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
1484  y_temp, v2r, v2g, u2g, u2b);
1485  YUV2RGB(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
1486  y_temp, v2r, v2g, u2g, u2b);
1487 
1488  if(hasAlpha) {
1489  __m256i a_src;
1490  __m256i a_l, a_h;
1491 
1492  a_src = __lasx_xvld(abuf0 + i, 0);
1493  a_l = __lasx_xvsllwil_w_h(a_src, 0);
1494  a_h = __lasx_xvexth_w_h(a_src);
1495  a_l = __lasx_xvadd_w(a_l, bias);
1496  a_h = __lasx_xvadd_w(a_h, bias);
1497  a_l = __lasx_xvsrai_w(a_l, 7);
1498  a_h = __lasx_xvsrai_w(a_h, 7);
1499  WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
1500  WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
1501  WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
1502  WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
1503  WRITE_FULL_A(R_h, G_h, B_h, a_h, 0, 4);
1504  WRITE_FULL_A(R_h, G_h, B_h, a_h, 1, 5);
1505  WRITE_FULL_A(R_h, G_h, B_h, a_h, 2, 6);
1506  WRITE_FULL_A(R_h, G_h, B_h, a_h, 3, 7);
1507  WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 8);
1508  WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 9);
1509  WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 10);
1510  WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 11);
1511  WRITE_FULL_A(R_h, G_h, B_h, a_h, 4, 12);
1512  WRITE_FULL_A(R_h, G_h, B_h, a_h, 5, 13);
1513  WRITE_FULL_A(R_h, G_h, B_h, a_h, 6, 14);
1514  WRITE_FULL_A(R_h, G_h, B_h, a_h, 7, 15);
1515  } else {
1516  WRITE_FULL(R_l, G_l, B_l, 0, 0);
1517  WRITE_FULL(R_l, G_l, B_l, 1, 1);
1518  WRITE_FULL(R_l, G_l, B_l, 2, 2);
1519  WRITE_FULL(R_l, G_l, B_l, 3, 3);
1520  WRITE_FULL(R_h, G_h, B_h, 0, 4);
1521  WRITE_FULL(R_h, G_h, B_h, 1, 5);
1522  WRITE_FULL(R_h, G_h, B_h, 2, 6);
1523  WRITE_FULL(R_h, G_h, B_h, 3, 7);
1524  WRITE_FULL(R_l, G_l, B_l, 4, 8);
1525  WRITE_FULL(R_l, G_l, B_l, 5, 9);
1526  WRITE_FULL(R_l, G_l, B_l, 6, 10);
1527  WRITE_FULL(R_l, G_l, B_l, 7, 11);
1528  WRITE_FULL(R_h, G_h, B_h, 4, 12);
1529  WRITE_FULL(R_h, G_h, B_h, 5, 13);
1530  WRITE_FULL(R_h, G_h, B_h, 6, 14);
1531  WRITE_FULL(R_h, G_h, B_h, 7, 15);
1532  }
1533  }
1534  if (dstW - i >= 8) {
1535  __m256i b, ub, vb, ub_l, vb_l;
1536  __m256i y_l, u_l, v_l;
1537  __m256i R_l, G_l, B_l;
1538  int n = i << 1;
1539 
1540  DUP2_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, b, ub);
1541  vb = __lasx_xvldx(vbuf0, n);
1542  y_l = __lasx_vext2xv_w_h(b);
1543  DUP2_ARG1(__lasx_vext2xv_w_h, ub, vb, ub_l, vb_l);
1544  y_l = __lasx_xvslli_w(y_l, 2);
1545  u_l = __lasx_xvsub_w(ub_l, uv);
1546  v_l = __lasx_xvsub_w(vb_l, uv);
1547  u_l = __lasx_xvslli_w(u_l, 2);
1548  v_l = __lasx_xvslli_w(v_l, 2);
1549  YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
1550  y_temp, v2r, v2g, u2g, u2b);
1551 
1552  if(hasAlpha) {
1553  __m256i a_src, a_l;
1554 
1555  a_src = __lasx_xvldx(abuf0, n);
1556  a_src = __lasx_vext2xv_w_h(a_src);
1557  a_l = __lasx_xvadd_w(bias, a_src);
1558  a_l = __lasx_xvsrai_w(a_l, 7);
1559  WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
1560  WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
1561  WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
1562  WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
1563  WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
1564  WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
1565  WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
1566  WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
1567  } else {
1568  WRITE_FULL(R_l, G_l, B_l, 0, 0);
1569  WRITE_FULL(R_l, G_l, B_l, 1, 1);
1570  WRITE_FULL(R_l, G_l, B_l, 2, 2);
1571  WRITE_FULL(R_l, G_l, B_l, 3, 3);
1572  WRITE_FULL(R_l, G_l, B_l, 4, 4);
1573  WRITE_FULL(R_l, G_l, B_l, 5, 5);
1574  WRITE_FULL(R_l, G_l, B_l, 6, 6);
1575  WRITE_FULL(R_l, G_l, B_l, 7, 7);
1576  }
1577  i += 8;
1578  }
1579  for (; i < dstW; i++) {
1580  int Y = buf0[i] << 2;
1581  int U = (ubuf0[i] - uvtemp) << 2;
1582  int V = (vbuf0[i] - uvtemp) << 2;
1583 
1584  A = 0;
1585  if(hasAlpha) {
1586  A = (abuf0[i] + 64) >> 7;
1587  if (A & 0x100)
1588  A = av_clip_uint8(A);
1589  }
1590  Y -= y_offset;
1591  Y *= y_coeff;
1592  Y += ytemp;
1593  R = (unsigned)Y + V * v2r_coe;
1594  G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
1595  B = (unsigned)Y + U * u2b_coe;
1596  yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
1597  dest += step;
1598  }
1599  } else {
1600  const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
1601  int uvtemp = 128 << 8;
1602  __m256i uv = __lasx_xvreplgr2vr_w(uvtemp);
1603  __m256i zero = __lasx_xvldi(0);
1604  __m256i bias = __lasx_xvreplgr2vr_h(bias_int);
1605 
1606  for (i = 0; i < len; i += 16) {
1607  __m256i b, ub0, ub1, vb0, vb1;
1608  __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od;
1609  __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;
1610  int n = i << 1;
1611 
1612  DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
1613  ubuf1, n, b, ub0, vb0, ub1);
1614  vb1 = __lasx_xvldx(vbuf, n);
1615  y_ev = __lasx_xvaddwev_w_h(b, zero);
1616  y_od = __lasx_xvaddwod_w_h(b, zero);
1617  DUP2_ARG2(__lasx_xvaddwev_w_h, ub0, vb0, ub1, vb1, u_ev, v_ev);
1618  DUP2_ARG2(__lasx_xvaddwod_w_h, ub0, vb0, ub1, vb1, u_od, v_od);
1619  DUP2_ARG2(__lasx_xvslli_w, y_ev, 2, y_od, 2, y_ev, y_od);
1620  DUP4_ARG2(__lasx_xvsub_w, u_ev, uv, u_od, uv, v_ev, uv, v_od, uv,
1621  u_ev, u_od, v_ev, v_od);
1622  DUP4_ARG2(__lasx_xvslli_w, u_ev, 1, u_od, 1, v_ev, 1, v_od, 1,
1623  u_ev, u_od, v_ev, v_od);
1624  YUV2RGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
1625  y_temp, v2r, v2g, u2g, u2b);
1626  YUV2RGB(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
1627  y_temp, v2r, v2g, u2g, u2b);
1628 
1629  if(hasAlpha) {
1630  __m256i a_src;
1631  __m256i a_ev, a_od;
1632 
1633  a_src = __lasx_xvld(abuf0 + i, 0);
1634  a_ev = __lasx_xvaddwev_w_h(bias, a_src);
1635  a_od = __lasx_xvaddwod_w_h(bias, a_src);
1636  a_ev = __lasx_xvsrai_w(a_ev, 7);
1637  a_od = __lasx_xvsrai_w(a_od, 7);
1638  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
1639  WRITE_FULL_A(R_od, G_od, B_od, a_od, 0, 1);
1640  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 2);
1641  WRITE_FULL_A(R_od, G_od, B_od, a_od, 1, 3);
1642  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 4);
1643  WRITE_FULL_A(R_od, G_od, B_od, a_od, 2, 5);
1644  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 6);
1645  WRITE_FULL_A(R_od, G_od, B_od, a_od, 3, 7);
1646  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 8);
1647  WRITE_FULL_A(R_od, G_od, B_od, a_od, 4, 9);
1648  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 10);
1649  WRITE_FULL_A(R_od, G_od, B_od, a_od, 5, 11);
1650  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 12);
1651  WRITE_FULL_A(R_od, G_od, B_od, a_od, 6, 13);
1652  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 14);
1653  WRITE_FULL_A(R_od, G_od, B_od, a_od, 7, 15);
1654  } else {
1655  WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
1656  WRITE_FULL(R_od, G_od, B_od, 0, 1);
1657  WRITE_FULL(R_ev, G_ev, B_ev, 1, 2);
1658  WRITE_FULL(R_od, G_od, B_od, 1, 3);
1659  WRITE_FULL(R_ev, G_ev, B_ev, 2, 4);
1660  WRITE_FULL(R_od, G_od, B_od, 2, 5);
1661  WRITE_FULL(R_ev, G_ev, B_ev, 3, 6);
1662  WRITE_FULL(R_od, G_od, B_od, 3, 7);
1663  WRITE_FULL(R_ev, G_ev, B_ev, 4, 8);
1664  WRITE_FULL(R_od, G_od, B_od, 4, 9);
1665  WRITE_FULL(R_ev, G_ev, B_ev, 5, 10);
1666  WRITE_FULL(R_od, G_od, B_od, 5, 11);
1667  WRITE_FULL(R_ev, G_ev, B_ev, 6, 12);
1668  WRITE_FULL(R_od, G_od, B_od, 6, 13);
1669  WRITE_FULL(R_ev, G_ev, B_ev, 7, 14);
1670  WRITE_FULL(R_od, G_od, B_od, 7, 15);
1671  }
1672  }
1673  if (dstW - i >= 8) {
1674  __m256i b, ub0, ub1, vb0, vb1;
1675  __m256i y_l, u_l, v_l;
1676  __m256i R_l, G_l, B_l;
1677  int n = i << 1;
1678 
1679  DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
1680  ubuf1, n, b, ub0, vb0, ub1);
1681  vb1 = __lasx_xvldx(vbuf1, n);
1682  y_l = __lasx_vext2xv_w_h(b);
1683  y_l = __lasx_xvslli_w(y_l, 2);
1684  DUP4_ARG1(__lasx_vext2xv_w_h, ub0, vb0, ub1, vb1,
1685  ub0, vb0, ub1, vb1);
1686  DUP2_ARG2(__lasx_xvadd_w, ub0, ub1, vb0, vb1, u_l, v_l);
1687  u_l = __lasx_xvsub_w(u_l, uv);
1688  v_l = __lasx_xvsub_w(v_l, uv);
1689  u_l = __lasx_xvslli_w(u_l, 1);
1690  v_l = __lasx_xvslli_w(v_l, 1);
1691  YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
1692  y_temp, v2r, v2g, u2g, u2b);
1693 
1694  if(hasAlpha) {
1695  __m256i a_src;
1696  __m256i a_l;
1697 
1698  a_src = __lasx_xvld(abuf0 + i, 0);
1699  a_src = __lasx_xvpermi_d(a_src, 0xD8);
1700  a_src = __lasx_xvilvl_h(a_src, a_src);
1701  a_l = __lasx_xvaddwev_w_h(bias, a_src);
1702  a_l = __lasx_xvsrai_w(a_l, 7);
1703  WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
1704  WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
1705  WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
1706  WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
1707  WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
1708  WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
1709  WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
1710  WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
1711  } else {
1712  WRITE_FULL(R_l, G_l, B_l, 0, 0);
1713  WRITE_FULL(R_l, G_l, B_l, 1, 1);
1714  WRITE_FULL(R_l, G_l, B_l, 2, 2);
1715  WRITE_FULL(R_l, G_l, B_l, 3, 3);
1716  WRITE_FULL(R_l, G_l, B_l, 4, 4);
1717  WRITE_FULL(R_l, G_l, B_l, 5, 5);
1718  WRITE_FULL(R_l, G_l, B_l, 6, 6);
1719  WRITE_FULL(R_l, G_l, B_l, 7, 7);
1720  }
1721  i += 8;
1722  }
1723  for (; i < dstW; i++) {
1724  int Y = buf0[i] << 2;
1725  int U = (ubuf0[i] + ubuf1[i] - uvtemp) << 1;
1726  int V = (vbuf0[i] + vbuf1[i] - uvtemp) << 1;
1727 
1728  A = 0;
1729  if(hasAlpha) {
1730  A = (abuf0[i] + 64) >> 7;
1731  if (A & 0x100)
1732  A = av_clip_uint8(A);
1733  }
1734  Y -= y_offset;
1735  Y *= y_coeff;
1736  Y += ytemp;
1737  R = (unsigned)Y + V * v2r_coe;
1738  G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
1739  B = (unsigned)Y + U * u2b_coe;
1740  yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
1741  dest += step;
1742  }
1743  }
1744  c->dither_error[0][i] = err[0];
1745  c->dither_error[1][i] = err[1];
1746  c->dither_error[2][i] = err[2];
1747 }
/*
 * Instantiate the full-chroma ("rgb_full") packed writers, one
 * YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) invocation per target format.
 * NOTE(review): the macro body is defined earlier in the file and is not
 * visible here; judging by the identifiers used in ff_sws_init_output_lasx it
 * presumably emits yuv2<ext>_X_lasx, yuv2<ext>_2_lasx and yuv2<ext>_1_lasx --
 * confirm against the macro definition.
 */
1748 #if CONFIG_SMALL
/* CONFIG_SMALL: a single variant per 32-bit format; the hasAlpha argument is
 * the expression below, which reads c->needAlpha. */
1749 YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA,
1750  CONFIG_SWSCALE_ALPHA && c->needAlpha)
1751 YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR,
1752  CONFIG_SWSCALE_ALPHA && c->needAlpha)
1753 YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA,
1754  CONFIG_SWSCALE_ALPHA && c->needAlpha)
1755 YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB,
1756  CONFIG_SWSCALE_ALPHA && c->needAlpha)
1757 #else
/* !CONFIG_SMALL: separate variants with a constant hasAlpha argument.
 * The alpha-writing variants (hasAlpha = 1) exist only when
 * CONFIG_SWSCALE_ALPHA is enabled. */
1758 #if CONFIG_SWSCALE_ALPHA
1759 YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA, 1)
1760 YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR, 1)
1761 YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA, 1)
1762 YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB, 1)
1763 #endif
/* The "x" variants target the same pixel formats with hasAlpha = 0. */
1764 YUV2RGBWRAPPER(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
1765 YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
1766 YUV2RGBWRAPPER(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
1767 YUV2RGBWRAPPER(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
1768 #endif
/* 24-bit formats: always instantiated with hasAlpha = 0. */
1769 YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
1770 YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
1771 
/* 4-bit-per-byte and 8-bit formats: always instantiated with hasAlpha = 0. */
1772 YUV2RGBWRAPPER(yuv2, rgb_full, bgr4_byte_full, AV_PIX_FMT_BGR4_BYTE, 0)
1773 YUV2RGBWRAPPER(yuv2, rgb_full, rgb4_byte_full, AV_PIX_FMT_RGB4_BYTE, 0)
1774 YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
1775 YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
1776 
1777 
1779  yuv2planar1_fn *yuv2plane1,
1781  yuv2interleavedX_fn *yuv2nv12cX,
1782  yuv2packed1_fn *yuv2packed1,
1783  yuv2packed2_fn *yuv2packed2,
1784  yuv2packedX_fn *yuv2packedX,
1785  yuv2anyX_fn *yuv2anyX)
1786 {
1787  enum AVPixelFormat dstFormat = c->dstFormat;
1788 
1789  /* Add initialization once optimized */
1790  if (isSemiPlanarYUV(dstFormat) && isDataInHighBits(dstFormat)) {
1791  } else if (is16BPS(dstFormat)) {
1792  } else if (isNBPS(dstFormat)) {
1793  } else if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
1794  } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
1795  } else {
1796  *yuv2plane1 = yuv2plane1_8_lasx;
1798  }
1799 
1800  if(c->flags & SWS_FULL_CHR_H_INT) {
1801  switch (c->dstFormat) {
1802  case AV_PIX_FMT_RGBA:
1803 #if CONFIG_SMALL
1804  c->yuv2packedX = yuv2rgba32_full_X_lasx;
1805  c->yuv2packed2 = yuv2rgba32_full_2_lasx;
1806  c->yuv2packed1 = yuv2rgba32_full_1_lasx;
1807 #else
1808 #if CONFIG_SWSCALE_ALPHA
1809  if (c->needAlpha) {
1810  c->yuv2packedX = yuv2rgba32_full_X_lasx;
1811  c->yuv2packed2 = yuv2rgba32_full_2_lasx;
1812  c->yuv2packed1 = yuv2rgba32_full_1_lasx;
1813  } else
1814 #endif /* CONFIG_SWSCALE_ALPHA */
1815  {
1816  c->yuv2packedX = yuv2rgbx32_full_X_lasx;
1817  c->yuv2packed2 = yuv2rgbx32_full_2_lasx;
1818  c->yuv2packed1 = yuv2rgbx32_full_1_lasx;
1819  }
1820 #endif /* !CONFIG_SMALL */
1821  break;
1822  case AV_PIX_FMT_ARGB:
1823 #if CONFIG_SMALL
1824  c->yuv2packedX = yuv2argb32_full_X_lasx;
1825  c->yuv2packed2 = yuv2argb32_full_2_lasx;
1826  c->yuv2packed1 = yuv2argb32_full_1_lasx;
1827 #else
1828 #if CONFIG_SWSCALE_ALPHA
1829  if (c->needAlpha) {
1830  c->yuv2packedX = yuv2argb32_full_X_lasx;
1831  c->yuv2packed2 = yuv2argb32_full_2_lasx;
1832  c->yuv2packed1 = yuv2argb32_full_1_lasx;
1833  } else
1834 #endif /* CONFIG_SWSCALE_ALPHA */
1835  {
1836  c->yuv2packedX = yuv2xrgb32_full_X_lasx;
1837  c->yuv2packed2 = yuv2xrgb32_full_2_lasx;
1838  c->yuv2packed1 = yuv2xrgb32_full_1_lasx;
1839  }
1840 #endif /* !CONFIG_SMALL */
1841  break;
1842  case AV_PIX_FMT_BGRA:
1843 #if CONFIG_SMALL
1844  c->yuv2packedX = yuv2bgra32_full_X_lasx;
1845  c->yuv2packed2 = yuv2bgra32_full_2_lasx;
1846  c->yuv2packed1 = yuv2bgra32_full_1_lasx;
1847 #else
1848 #if CONFIG_SWSCALE_ALPHA
1849  if (c->needAlpha) {
1850  c->yuv2packedX = yuv2bgra32_full_X_lasx;
1851  c->yuv2packed2 = yuv2bgra32_full_2_lasx;
1852  c->yuv2packed1 = yuv2bgra32_full_1_lasx;
1853  } else
1854 #endif /* CONFIG_SWSCALE_ALPHA */
1855  {
1856  c->yuv2packedX = yuv2bgrx32_full_X_lasx;
1857  c->yuv2packed2 = yuv2bgrx32_full_2_lasx;
1858  c->yuv2packed1 = yuv2bgrx32_full_1_lasx;
1859  }
1860 #endif /* !CONFIG_SMALL */
1861  break;
1862  case AV_PIX_FMT_ABGR:
1863 #if CONFIG_SMALL
1864  c->yuv2packedX = yuv2abgr32_full_X_lasx;
1865  c->yuv2packed2 = yuv2abgr32_full_2_lasx;
1866  c->yuv2packed1 = yuv2abgr32_full_1_lasx;
1867 #else
1868 #if CONFIG_SWSCALE_ALPHA
1869  if (c->needAlpha) {
1870  c->yuv2packedX = yuv2abgr32_full_X_lasx;
1871  c->yuv2packed2 = yuv2abgr32_full_2_lasx;
1872  c->yuv2packed1 = yuv2abgr32_full_1_lasx;
1873  } else
1874 #endif /* CONFIG_SWSCALE_ALPHA */
1875  {
1876  c->yuv2packedX = yuv2xbgr32_full_X_lasx;
1877  c->yuv2packed2 = yuv2xbgr32_full_2_lasx;
1878  c->yuv2packed1 = yuv2xbgr32_full_1_lasx;
1879  }
1880 #endif /* !CONFIG_SMALL */
1881  break;
1882  case AV_PIX_FMT_RGB24:
1883  c->yuv2packedX = yuv2rgb24_full_X_lasx;
1884  c->yuv2packed2 = yuv2rgb24_full_2_lasx;
1885  c->yuv2packed1 = yuv2rgb24_full_1_lasx;
1886  break;
1887  case AV_PIX_FMT_BGR24:
1888  c->yuv2packedX = yuv2bgr24_full_X_lasx;
1889  c->yuv2packed2 = yuv2bgr24_full_2_lasx;
1890  c->yuv2packed1 = yuv2bgr24_full_1_lasx;
1891  break;
1892  case AV_PIX_FMT_BGR4_BYTE:
1893  c->yuv2packedX = yuv2bgr4_byte_full_X_lasx;
1894  c->yuv2packed2 = yuv2bgr4_byte_full_2_lasx;
1895  c->yuv2packed1 = yuv2bgr4_byte_full_1_lasx;
1896  break;
1897  case AV_PIX_FMT_RGB4_BYTE:
1898  c->yuv2packedX = yuv2rgb4_byte_full_X_lasx;
1899  c->yuv2packed2 = yuv2rgb4_byte_full_2_lasx;
1900  c->yuv2packed1 = yuv2rgb4_byte_full_1_lasx;
1901  break;
1902  case AV_PIX_FMT_BGR8:
1903  c->yuv2packedX = yuv2bgr8_full_X_lasx;
1904  c->yuv2packed2 = yuv2bgr8_full_2_lasx;
1905  c->yuv2packed1 = yuv2bgr8_full_1_lasx;
1906  break;
1907  case AV_PIX_FMT_RGB8:
1908  c->yuv2packedX = yuv2rgb8_full_X_lasx;
1909  c->yuv2packed2 = yuv2rgb8_full_2_lasx;
1910  c->yuv2packed1 = yuv2rgb8_full_1_lasx;
1911  break;
1912  }
1913  } else {
1914  switch (c->dstFormat) {
1915  case AV_PIX_FMT_RGB32:
1916  case AV_PIX_FMT_BGR32:
1917 #if CONFIG_SMALL
1918 #else
1919 #if CONFIG_SWSCALE_ALPHA
1920  if (c->needAlpha) {
1921  } else
1922 #endif /* CONFIG_SWSCALE_ALPHA */
1923  {
1924  c->yuv2packed1 = yuv2rgbx32_1_lasx;
1925  c->yuv2packed2 = yuv2rgbx32_2_lasx;
1926  c->yuv2packedX = yuv2rgbx32_X_lasx;
1927  }
1928 #endif /* !CONFIG_SMALL */
1929  break;
1930  case AV_PIX_FMT_RGB32_1:
1931  case AV_PIX_FMT_BGR32_1:
1932 #if CONFIG_SMALL
1933 #else
1934 #if CONFIG_SWSCALE_ALPHA
1935  if (c->needAlpha) {
1936  } else
1937 #endif /* CONFIG_SWSCALE_ALPHA */
1938  {
1939  c->yuv2packed1 = yuv2rgbx32_1_1_lasx;
1940  c->yuv2packed2 = yuv2rgbx32_1_2_lasx;
1941  c->yuv2packedX = yuv2rgbx32_1_X_lasx;
1942  }
1943 #endif /* !CONFIG_SMALL */
1944  break;
1945  case AV_PIX_FMT_RGB24:
1946  c->yuv2packed1 = yuv2rgb24_1_lasx;
1947  c->yuv2packed2 = yuv2rgb24_2_lasx;
1948  c->yuv2packedX = yuv2rgb24_X_lasx;
1949  break;
1950  case AV_PIX_FMT_BGR24:
1951  c->yuv2packed1 = yuv2bgr24_1_lasx;
1952  c->yuv2packed2 = yuv2bgr24_2_lasx;
1953  c->yuv2packedX = yuv2bgr24_X_lasx;
1954  break;
1955  case AV_PIX_FMT_RGB565LE:
1956  case AV_PIX_FMT_RGB565BE:
1957  case AV_PIX_FMT_BGR565LE:
1958  case AV_PIX_FMT_BGR565BE:
1959  c->yuv2packed1 = yuv2rgb16_1_lasx;
1960  c->yuv2packed2 = yuv2rgb16_2_lasx;
1961  c->yuv2packedX = yuv2rgb16_X_lasx;
1962  break;
1963  case AV_PIX_FMT_RGB555LE:
1964  case AV_PIX_FMT_RGB555BE:
1965  case AV_PIX_FMT_BGR555LE:
1966  case AV_PIX_FMT_BGR555BE:
1967  c->yuv2packed1 = yuv2rgb15_1_lasx;
1968  c->yuv2packed2 = yuv2rgb15_2_lasx;
1969  c->yuv2packedX = yuv2rgb15_X_lasx;
1970  break;
1971  case AV_PIX_FMT_RGB444LE:
1972  case AV_PIX_FMT_RGB444BE:
1973  case AV_PIX_FMT_BGR444LE:
1974  case AV_PIX_FMT_BGR444BE:
1975  c->yuv2packed1 = yuv2rgb12_1_lasx;
1976  c->yuv2packed2 = yuv2rgb12_2_lasx;
1977  c->yuv2packedX = yuv2rgb12_X_lasx;
1978  break;
1979  case AV_PIX_FMT_RGB8:
1980  case AV_PIX_FMT_BGR8:
1981  c->yuv2packed1 = yuv2rgb8_1_lasx;
1982  c->yuv2packed2 = yuv2rgb8_2_lasx;
1983  c->yuv2packedX = yuv2rgb8_X_lasx;
1984  break;
1985  case AV_PIX_FMT_RGB4:
1986  case AV_PIX_FMT_BGR4:
1987  c->yuv2packed1 = yuv2rgb4_1_lasx;
1988  c->yuv2packed2 = yuv2rgb4_2_lasx;
1989  c->yuv2packedX = yuv2rgb4_X_lasx;
1990  break;
1991  case AV_PIX_FMT_RGB4_BYTE:
1992  case AV_PIX_FMT_BGR4_BYTE:
1993  c->yuv2packed1 = yuv2rgb4b_1_lasx;
1994  c->yuv2packed2 = yuv2rgb4b_2_lasx;
1995  c->yuv2packedX = yuv2rgb4b_X_lasx;
1996  break;
1997  }
1998  }
1999 }
yuv2packed2_fn
void(* yuv2packed2_fn)(struct SwsContext *c, const int16_t *lumSrc[2], const int16_t *chrUSrc[2], const int16_t *chrVSrc[2], const int16_t *alpSrc[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB output by doing bilinear scalin...
Definition: swscale_internal.h:221
A
#define A(x)
Definition: vpx_arith.h:28
yuv2planar1_fn
void(* yuv2planar1_fn)(const int16_t *src, uint8_t *dest, int dstW, const uint8_t *dither, int offset)
Write one line of horizontally scaled data to planar output without any additional vertical scaling (...
Definition: swscale_internal.h:115
yuv2packed1_fn
void(* yuv2packed1_fn)(struct SwsContext *c, const int16_t *lumSrc, const int16_t *chrUSrc[2], const int16_t *chrVSrc[2], const int16_t *alpSrc, uint8_t *dest, int dstW, int uvalpha, int y)
Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB output without any additional v...
Definition: swscale_internal.h:188
AVPixelFormat
AVPixelFormat
Pixel format.
Definition: pixfmt.h:71
av_clip
#define av_clip
Definition: common.h:100
ff_dither_4x4_16
const uint8_t ff_dither_4x4_16[][8]
Definition: output.c:51
r
const char * r
Definition: vf_curves.c:127
AV_PIX_FMT_BGR32
#define AV_PIX_FMT_BGR32
Definition: pixfmt.h:453
AV_PIX_FMT_RGB444LE
@ AV_PIX_FMT_RGB444LE
packed RGB 4:4:4, 16bpp, (msb)4X 4R 4G 4B(lsb), little-endian, X=unused/undefined
Definition: pixfmt.h:136
u
#define u(width, name, range_min, range_max)
Definition: cbs_h2645.c:251
ff_dither_8x8_32
const uint8_t ff_dither_8x8_32[][8]
Definition: output.c:59
av_clip_uintp2
#define av_clip_uintp2
Definition: common.h:124
WRITE_FULL_A
#define WRITE_FULL_A(r, g, b, a, t1, s)
Definition: output_lasx.c:996
yuv2rgb_write
static av_always_inline void yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2, unsigned A1, unsigned A2, const void *_r, const void *_g, const void *_b, int y, enum AVPixelFormat target, int hasAlpha)
Definition: output_lasx.c:107
mask
int mask
Definition: mediacodecdec_common.c:154
SWS_DITHER_A_DITHER
@ SWS_DITHER_A_DITHER
Definition: swscale_internal.h:74
step
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But a word about which is also called distortion Distortion can be quantified by almost any quality measurement one chooses the sum of squared differences is used but more complex methods that consider psychovisual effects can be used as well It makes no difference in this discussion First step
Definition: rate_distortion.txt:58
b
#define b
Definition: input.c:41
yuv2planeX
static void FUNC() yuv2planeX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset)
Definition: swscale_ppc_template.c:84
R
#define R
Definition: huffyuv.h:44
AV_PIX_FMT_RGB32_1
#define AV_PIX_FMT_RGB32_1
Definition: pixfmt.h:452
filter
void(* filter)(uint8_t *src, int stride, int qscale)
Definition: h263dsp.c:29
AV_PIX_FMT_BGR24
@ AV_PIX_FMT_BGR24
packed RGB 8:8:8, 24bpp, BGRBGR...
Definition: pixfmt.h:76
AV_PIX_FMT_BGRA
@ AV_PIX_FMT_BGRA
packed BGRA 8:8:8:8, 32bpp, BGRABGRA...
Definition: pixfmt.h:102
DUP2_ARG2
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1)
Definition: loongson_intrinsics.h:58
ff_sws_init_output_lasx
av_cold void ff_sws_init_output_lasx(SwsContext *c, yuv2planar1_fn *yuv2plane1, yuv2planarX_fn *yuv2planeX, yuv2interleavedX_fn *yuv2nv12cX, yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2, yuv2packedX_fn *yuv2packedX, yuv2anyX_fn *yuv2anyX)
Definition: output_lasx.c:1778
A2
@ A2
Definition: mvs.c:525
AV_PIX_FMT_GRAYF32LE
@ AV_PIX_FMT_GRAYF32LE
IEEE-754 single precision Y, 32bpp, little-endian.
Definition: pixfmt.h:364
AV_PIX_FMT_RGB555BE
@ AV_PIX_FMT_RGB555BE
packed RGB 5:5:5, 16bpp, (msb)1X 5R 5G 5B(lsb), big-endian, X=unused/undefined
Definition: pixfmt.h:114
is16BPS
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:706
rgb
Definition: rpzaenc.c:60
b1
static double b1(void *priv, double x, double y)
Definition: vf_xfade.c:2034
yuv2anyX_fn
void(* yuv2anyX_fn)(struct SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t **dest, int dstW, int y)
Write one line of horizontally scaled Y/U/V/A to YUV/RGB output by doing multi-point vertical scaling...
Definition: swscale_internal.h:287
YUV2RGBWRAPPER
#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)
Definition: output_lasx.c:808
ub
#define ub(width, name)
Definition: cbs_h2645.c:401
swscale_loongarch.h
val
static double val(void *priv, double ch)
Definition: aeval.c:77
isNBPS
static av_always_inline int isNBPS(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:720
WRITE_FULL
#define WRITE_FULL(r, g, b, t1, s)
Definition: output_lasx.c:1008
AV_PIX_FMT_BGR8
@ AV_PIX_FMT_BGR8
packed RGB 3:3:2, 8bpp, (msb)2B 3G 3R(lsb)
Definition: pixfmt.h:90
av_cold
#define av_cold
Definition: attributes.h:90
YUVRGB_TABLE_HEADROOM
#define YUVRGB_TABLE_HEADROOM
Definition: swscale_internal.h:44
DUP4_ARG2
#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, _OUT1, _OUT2, _OUT3)
Definition: loongson_intrinsics.h:76
SWS_DITHER_ED
@ SWS_DITHER_ED
Definition: swscale_internal.h:73
yuv2rgb_X_template_lasx
static void yuv2rgb_X_template_lasx(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int y, enum AVPixelFormat target, int hasAlpha)
Definition: output_lasx.c:230
g
const char * g
Definition: vf_curves.c:128
B
#define B
Definition: huffyuv.h:42
ff_dither_2x2_4
const uint8_t ff_dither_2x2_4[][8]
Definition: output.c:39
ff_dither_8x8_220
const uint8_t ff_dither_8x8_220[][8]
Definition: output.c:84
AV_PIX_FMT_RGB4
@ AV_PIX_FMT_RGB4
packed RGB 1:2:1 bitstream, 4bpp, (msb)1R 2G 1B(lsb), a byte contains two pixels, the first pixel in ...
Definition: pixfmt.h:94
AV_PIX_FMT_BGR32_1
#define AV_PIX_FMT_BGR32_1
Definition: pixfmt.h:454
AV_PIX_FMT_RGBA
@ AV_PIX_FMT_RGBA
packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
Definition: pixfmt.h:100
YUV2RGB_SETUP
#define YUV2RGB_SETUP
Definition: output_lasx.c:969
isSemiPlanarYUV
static av_always_inline int isSemiPlanarYUV(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:752
b_r
#define b_r
A_DITHER
#define A_DITHER(u, v)
AV_PIX_FMT_RGB565LE
@ AV_PIX_FMT_RGB565LE
packed RGB 5:6:5, 16bpp, (msb) 5R 6G 5B(lsb), little-endian
Definition: pixfmt.h:113
yuv2rgb_2_template_lasx
static void yuv2rgb_2_template_lasx(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y, enum AVPixelFormat target, int hasAlpha)
Definition: output_lasx.c:518
NULL
#define NULL
Definition: coverity.c:32
bias
static int bias(int x, int c)
Definition: vqcdec.c:115
V
#define V
Definition: avdct.c:31
AV_PIX_FMT_BGR565LE
@ AV_PIX_FMT_BGR565LE
packed BGR 5:6:5, 16bpp, (msb) 5B 6G 5R(lsb), little-endian
Definition: pixfmt.h:118
AV_PIX_FMT_RGB8
@ AV_PIX_FMT_RGB8
packed RGB 3:3:2, 8bpp, (msb)3R 3G 2B(lsb)
Definition: pixfmt.h:93
AV_PIX_FMT_BGR4
@ AV_PIX_FMT_BGR4
packed RGB 1:2:1 bitstream, 4bpp, (msb)1B 2G 1R(lsb), a byte contains two pixels, the first pixel in ...
Definition: pixfmt.h:91
AV_PIX_FMT_BGR555BE
@ AV_PIX_FMT_BGR555BE
packed BGR 5:5:5, 16bpp, (msb)1X 5B 5G 5R(lsb), big-endian, X=unused/undefined
Definition: pixfmt.h:119
AV_PIX_FMT_ABGR
@ AV_PIX_FMT_ABGR
packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
Definition: pixfmt.h:101
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
AV_PIX_FMT_BGR4_BYTE
@ AV_PIX_FMT_BGR4_BYTE
packed RGB 1:2:1, 8bpp, (msb)1B 2G 1R(lsb)
Definition: pixfmt.h:92
isDataInHighBits
static av_always_inline int isDataInHighBits(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:915
DUP4_ARG1
#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3)
Definition: loongson_intrinsics.h:70
AV_PIX_FMT_RGB24
@ AV_PIX_FMT_RGB24
packed RGB 8:8:8, 24bpp, RGBRGB...
Definition: pixfmt.h:75
DUP2_ARG1
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1)
Definition: loongson_intrinsics.h:52
yuv2rgb_full_2_template_lasx
static void yuv2rgb_full_2_template_lasx(SwsContext *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y, enum AVPixelFormat target, int hasAlpha)
Definition: output_lasx.c:1225
A1
@ A1
Definition: mvs.c:524
AV_PIX_FMT_RGB444BE
@ AV_PIX_FMT_RGB444BE
packed RGB 4:4:4, 16bpp, (msb)4X 4R 4G 4B(lsb), big-endian, X=unused/undefined
Definition: pixfmt.h:137
SWS_FULL_CHR_H_INT
#define SWS_FULL_CHR_H_INT
Perform full chroma upsampling when upscaling to RGB.
Definition: swscale.h:97
AV_PIX_FMT_BGR555
#define AV_PIX_FMT_BGR555
Definition: pixfmt.h:471
DUP2_ARG3
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1)
Definition: loongson_intrinsics.h:64
AV_PIX_FMT_BGR444BE
@ AV_PIX_FMT_BGR444BE
packed BGR 4:4:4, 16bpp, (msb)4X 4B 4G 4R(lsb), big-endian, X=unused/undefined
Definition: pixfmt.h:139
AV_PIX_FMT_RGB32
#define AV_PIX_FMT_RGB32
Definition: pixfmt.h:451
a0
static double a0(void *priv, double x, double y)
Definition: vf_xfade.c:2028
AV_PIX_FMT_BGR565BE
@ AV_PIX_FMT_BGR565BE
packed BGR 5:6:5, 16bpp, (msb) 5B 6G 5R(lsb), big-endian
Definition: pixfmt.h:117
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
YUV2RGB
#define YUV2RGB(y, u, v, R, G, B, offset, coeff, y_temp, v2r, v2g, u2g, u2b)
Definition: output_lasx.c:984
ff_dither_8x8_73
const uint8_t ff_dither_8x8_73[][8]
Definition: output.c:71
zero
static int zero(InterplayACMContext *s, unsigned ind, unsigned col)
Definition: interplayacm.c:121
Y
#define Y
Definition: boxblur.h:37
AV_PIX_FMT_ARGB
@ AV_PIX_FMT_ARGB
packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
Definition: pixfmt.h:99
av_assert2
#define av_assert2(cond)
assert() equivalent that is used in speed-critical code.
Definition: avassert.h:67
AV_PIX_FMT_RGB555LE
@ AV_PIX_FMT_RGB555LE
packed RGB 5:5:5, 16bpp, (msb)1X 5R 5G 5B(lsb), little-endian, X=unused/undefined
Definition: pixfmt.h:115
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
yuv2rgb_full_X_template_lasx
static void yuv2rgb_full_X_template_lasx(SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int y, enum AVPixelFormat target, int hasAlpha)
Definition: output_lasx.c:1018
AV_PIX_FMT_BGR444
#define AV_PIX_FMT_BGR444
Definition: pixfmt.h:472
AV_PIX_FMT_RGB555
#define AV_PIX_FMT_RGB555
Definition: pixfmt.h:466
av_always_inline
#define av_always_inline
Definition: attributes.h:49
yuv2interleavedX_fn
void(* yuv2interleavedX_fn)(enum AVPixelFormat dstFormat, const uint8_t *chrDither, const int16_t *chrFilter, int chrFilterSize, const int16_t **chrUSrc, const int16_t **chrVSrc, uint8_t *dest, int dstW)
Write one line of horizontally scaled chroma to interleaved output with multi-point vertical scaling ...
Definition: swscale_internal.h:151
len
int len
Definition: vorbis_enc_data.h:426
AV_PIX_FMT_BGR565
#define AV_PIX_FMT_BGR565
Definition: pixfmt.h:470
AV_PIX_FMT_RGB4_BYTE
@ AV_PIX_FMT_RGB4_BYTE
packed RGB 1:2:1, 8bpp, (msb)1R 2G 1B(lsb)
Definition: pixfmt.h:95
headroom
static int headroom(int *la)
Definition: nellymoser.c:106
AV_PIX_FMT_RGB565
#define AV_PIX_FMT_RGB565
Definition: pixfmt.h:465
AV_PIX_FMT_GRAYF32BE
@ AV_PIX_FMT_GRAYF32BE
IEEE-754 single precision Y, 32bpp, big-endian.
Definition: pixfmt.h:363
U
#define U(x)
Definition: vpx_arith.h:37
yuv2planarX_fn
void(* yuv2planarX_fn)(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset)
Write one line of horizontally scaled data to planar output with multi-point vertical scaling between...
Definition: swscale_internal.h:131
yuv2packedX_fn
void(* yuv2packedX_fn)(struct SwsContext *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB output by doing multi-point ver...
Definition: swscale_internal.h:253
temp
else temp
Definition: vf_mcdeint.c:263
yuv2rgb_write_full
static av_always_inline void yuv2rgb_write_full(SwsContext *c, uint8_t *dest, int i, int R, int A, int G, int B, int y, enum AVPixelFormat target, int hasAlpha, int err[4])
Definition: output_lasx.c:837
yuv2rgb_1_template_lasx
static void yuv2rgb_1_template_lasx(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y, enum AVPixelFormat target, int hasAlpha)
Definition: output_lasx.c:628
av_clip_uint8
#define av_clip_uint8
Definition: common.h:106
G
#define G
Definition: huffyuv.h:43
AV_PIX_FMT_RGB565BE
@ AV_PIX_FMT_RGB565BE
packed RGB 5:6:5, 16bpp, (msb) 5R 6G 5B(lsb), big-endian
Definition: pixfmt.h:112
src0
const pixel *const src0
Definition: h264pred_template.c:420
filter0
static void filter0(SUINT32 *dst, const int32_t *src, int32_t coeff, ptrdiff_t len)
Definition: dcadsp.c:352
loongson_intrinsics.h
yuv2rgb_full_1_template_lasx
static void yuv2rgb_full_1_template_lasx(SwsContext *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y, enum AVPixelFormat target, int hasAlpha)
Definition: output_lasx.c:1438
AV_PIX_FMT_BGR555LE
@ AV_PIX_FMT_BGR555LE
packed BGR 5:5:5, 16bpp, (msb)1X 5B 5G 5R(lsb), little-endian, X=unused/undefined
Definition: pixfmt.h:120
SWS_DITHER_AUTO
@ SWS_DITHER_AUTO
Definition: swscale_internal.h:71
coeff
static const double coeff[2][5]
Definition: vf_owdenoise.c:80
X_DITHER
#define X_DITHER(u, v)
b0
static double b0(void *priv, double x, double y)
Definition: vf_xfade.c:2033
a1
static double a1(void *priv, double x, double y)
Definition: vf_xfade.c:2029
r_b
#define r_b
d128
const uint8_t * d128
Definition: yuv2rgb.c:458
SWS_DITHER_X_DITHER
@ SWS_DITHER_X_DITHER
Definition: swscale_internal.h:75
SwsContext
Definition: swscale_internal.h:299
AV_PIX_FMT_BGR444LE
@ AV_PIX_FMT_BGR444LE
packed BGR 4:4:4, 16bpp, (msb)4X 4B 4G 4R(lsb), little-endian, X=unused/undefined
Definition: pixfmt.h:138
yuv2rgb
static void yuv2rgb(uint8_t *out, int ridx, int Y, int U, int V)
Definition: g2meet.c:263
src
#define src
Definition: vp8dsp.c:248
ff_dither_2x2_8
const uint8_t ff_dither_2x2_8[][8]
Definition: output.c:45
WRITE_YUV2RGB
#define WRITE_YUV2RGB(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4)
Definition: output_lasx.c:215
yuv2planeX_8_lasx
void yuv2planeX_8_lasx(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset)
Definition: output_lasx.c:25
AV_PIX_FMT_RGB444
#define AV_PIX_FMT_RGB444
Definition: pixfmt.h:467
dither
static const uint8_t dither[8][8]
Definition: vf_fspp.c:62