26 const int16_t **
src, uint8_t *dest,
int dstW,
31 __m256i
mask = {0x1C0C180814041000, 0x1C1814100C080400,
32 0x1C0C180814041000, 0x1C1814100C080400};
33 __m256i val1, val2, val3;
42 int val_1[8] = {dither0, dither2, dither4, dither6,
43 dither0, dither2, dither4, dither6};
44 int val_2[8] = {dither1, dither3, dither5, dither7,
45 dither1, dither3, dither5, dither7};
46 int val_3[8] = {dither0, dither1, dither2, dither3,
47 dither4, dither5, dither6, dither7};
49 DUP2_ARG2(__lasx_xvld, val_1, 0, val_2, 0, val1, val2);
50 val3 = __lasx_xvld(val_3, 0);
52 for (
i = 0;
i <
len;
i += 16) {
55 __m256i val_ev, val_od;
57 val_ev = __lasx_xvslli_w(val1, 12);
58 val_od = __lasx_xvslli_w(val2, 12);
60 for (j = 0; j < filterSize; j++) {
63 val_ev = __lasx_xvmaddwev_w_h(val_ev,
src0,
filter0);
64 val_od = __lasx_xvmaddwod_w_h(val_od,
src0,
filter0);
66 val_ev = __lasx_xvsrai_w(val_ev, 19);
67 val_od = __lasx_xvsrai_w(val_od, 19);
68 val_ev = __lasx_xvclip255_w(val_ev);
69 val_od = __lasx_xvclip255_w(val_od);
70 val = __lasx_xvshuf_b(val_od, val_ev,
mask);
71 __lasx_xvstelm_d(
val, (dest +
i), 0, 0);
72 __lasx_xvstelm_d(
val, (dest +
i), 8, 2);
79 val_l = __lasx_xvslli_w(val3, 12);
81 for (j = 0; j < filterSize; j++) {
88 val_l = __lasx_xvsrai_w(val_l, 19);
89 val_l = __lasx_xvclip255_w(val_l);
90 val_h = __lasx_xvpermi_d(val_l, 0x4E);
91 val_l = __lasx_xvshuf_b(val_h, val_l,
mask);
92 __lasx_xvstelm_d(val_l, (dest +
i), 0, 1);
95 for (;
i < dstW;
i++) {
98 for (j = 0; j< filterSize; j++)
108 unsigned A1,
unsigned A2,
109 const void *_r,
const void *_g,
const void *_b,
int y,
114 uint32_t *dest = (uint32_t *) _dest;
115 const uint32_t *
r = (
const uint32_t *) _r;
116 const uint32_t *
g = (
const uint32_t *) _g;
117 const uint32_t *
b = (
const uint32_t *) _b;
120 dest[
i * 2 + 0] =
r[Y1] +
g[Y1] +
b[Y1];
121 dest[
i * 2 + 1] =
r[Y2] +
g[Y2] +
b[Y2];
123 #if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
126 av_assert2((((
r[Y1] +
g[Y1] +
b[Y1]) >> sh) & 0xFF) == 0xFF);
128 dest[
i * 2 + 0] =
r[Y1] +
g[Y1] +
b[Y1];
129 dest[
i * 2 + 1] =
r[Y2] +
g[Y2] +
b[Y2];
132 uint8_t *dest = (uint8_t *) _dest;
133 const uint8_t *
r = (
const uint8_t *) _r;
134 const uint8_t *
g = (
const uint8_t *) _g;
135 const uint8_t *
b = (
const uint8_t *) _b;
/* Channel-table pickers for the 24-bit packed writer: when target is
 * AV_PIX_FMT_RGB24, r_b expands to r and b_r expands to b; for any other
 * target the two are swapped. Both rely on r, b and target being in scope
 * at the point of use. */
137 #define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
138 #define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)
140 dest[
i * 6 + 0] =
r_b[Y1];
141 dest[
i * 6 + 1] =
g[Y1];
142 dest[
i * 6 + 2] =
b_r[Y1];
143 dest[
i * 6 + 3] =
r_b[Y2];
144 dest[
i * 6 + 4] =
g[Y2];
145 dest[
i * 6 + 5] =
b_r[Y2];
151 uint16_t *dest = (uint16_t *) _dest;
152 const uint16_t *
r = (
const uint16_t *) _r;
153 const uint16_t *
g = (
const uint16_t *) _g;
154 const uint16_t *
b = (
const uint16_t *) _b;
155 int dr1, dg1, db1, dr2, dg2, db2;
180 dest[
i * 2 + 0] =
r[Y1 + dr1] +
g[Y1 + dg1] +
b[Y1 + db1];
181 dest[
i * 2 + 1] =
r[Y2 + dr2] +
g[Y2 + dg2] +
b[Y2 + db2];
183 uint8_t *dest = (uint8_t *) _dest;
184 const uint8_t *
r = (
const uint8_t *) _r;
185 const uint8_t *
g = (
const uint8_t *) _g;
186 const uint8_t *
b = (
const uint8_t *) _b;
187 int dr1, dg1, db1, dr2, dg2, db2;
192 dr1 = dg1 = d32[(
i * 2 + 0) & 7];
193 db1 = d64[(
i * 2 + 0) & 7];
194 dr2 = dg2 = d32[(
i * 2 + 1) & 7];
195 db2 = d64[(
i * 2 + 1) & 7];
199 dr1 = db1 =
d128[(
i * 2 + 0) & 7];
200 dg1 = d64[(
i * 2 + 0) & 7];
201 dr2 = db2 =
d128[(
i * 2 + 1) & 7];
202 dg2 = d64[(
i * 2 + 1) & 7];
206 dest[
i] =
r[Y1 + dr1] +
g[Y1 + dg1] +
b[Y1 + db1] +
207 ((
r[Y2 + dr2] +
g[Y2 + dg2] +
b[Y2 + db2]) << 4);
209 dest[
i * 2 + 0] =
r[Y1 + dr1] +
g[Y1 + dg1] +
b[Y1 + db1];
210 dest[
i * 2 + 1] =
r[Y2 + dr2] +
g[Y2 + dg2] +
b[Y2 + db2];
/* WRITE_YUV2RGB: emit one pixel pair from vector lanes.
 *   vec_y1/t1, vec_y2/t2 : vector and 32-bit lane index supplying Y1 and Y2
 *   vec_u/t3,  vec_v/t4  : vector and 32-bit lane index supplying U and V
 * The extracted U and V index the context lookup tables (r from table_rV[V],
 * g from table_gU[U] + table_gV[V], b from table_bU[U]) and the result is
 * handed to yuv2rgb_write() with 0 for the two arguments after Y2 and 0 for
 * the final argument.
 * Not hygienic: it assigns to Y1, Y2, U, V, r, g, b and reads c, dest, count,
 * y and target, all of which must exist in the expanding scope.
 * NOTE(review): some lines of this macro are elided in this listing. */
215 #define WRITE_YUV2RGB(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4) \
217 Y1 = __lasx_xvpickve2gr_w(vec_y1, t1); \
218 Y2 = __lasx_xvpickve2gr_w(vec_y2, t2); \
219 U = __lasx_xvpickve2gr_w(vec_u, t3); \
220 V = __lasx_xvpickve2gr_w(vec_v, t4); \
221 r = c->table_rV[V]; \
222 g = (c->table_gU[U] + c->table_gV[V]); \
223 b = c->table_bU[U]; \
224 yuv2rgb_write(dest, count, Y1, Y2, 0, 0, \
225 r, g, b, y, target, 0); \
231 const int16_t **lumSrc,
int lumFilterSize,
232 const int16_t *chrFilter,
const int16_t **chrUSrc,
233 const int16_t **chrVSrc,
int chrFilterSize,
234 const int16_t **alpSrc, uint8_t *dest,
int dstW,
242 int len_count = (dstW + 1) >> 1;
243 const void *
r, *
g, *
b;
245 __m256i
headroom = __lasx_xvreplgr2vr_w(head);
247 for (
i = 0;
i <
len;
i++) {
248 int Y1, Y2,
U,
V, count_lum = count << 1;
249 __m256i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, v_src2;
250 __m256i yl1_ev, yl1_od, yh1_ev, yh1_od, yl2_ev, yl2_od, yh2_ev, yh2_od;
251 __m256i u1_ev, u1_od, v1_ev, v1_od, u2_ev, u2_od, v2_ev, v2_od,
temp;
253 yl1_ev = __lasx_xvldrepl_w(&t, 0);
269 for (j = 0; j < lumFilterSize; j++) {
270 const int16_t *src_lum = lumSrc[j] + count_lum;
271 temp = __lasx_xvldrepl_h((lumFilter + j), 0);
272 DUP4_ARG2(__lasx_xvld, src_lum, 0, src_lum, 32, src_lum, 64,
273 src_lum, 96, l_src1, l_src2, l_src3, l_src4);
275 yl1_ev = __lasx_xvmaddwev_w_h(yl1_ev,
temp, l_src1);
276 yl1_od = __lasx_xvmaddwod_w_h(yl1_od,
temp, l_src1);
277 yh1_ev = __lasx_xvmaddwev_w_h(yh1_ev,
temp, l_src2);
278 yh1_od = __lasx_xvmaddwod_w_h(yh1_od,
temp, l_src2);
279 yl2_ev = __lasx_xvmaddwev_w_h(yl2_ev,
temp, l_src3);
280 yl2_od = __lasx_xvmaddwod_w_h(yl2_od,
temp, l_src3);
281 yh2_ev = __lasx_xvmaddwev_w_h(yh2_ev,
temp, l_src4);
282 yh2_od = __lasx_xvmaddwod_w_h(yh2_od,
temp, l_src4);
284 for (j = 0; j < chrFilterSize; j++) {
285 DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrUSrc[j] + count, 32,
287 DUP2_ARG2(__lasx_xvld, chrVSrc[j] + count, 0, chrVSrc[j] + count, 32,
289 temp = __lasx_xvldrepl_h((chrFilter + j), 0);
290 u1_ev = __lasx_xvmaddwev_w_h(u1_ev,
temp, u_src1);
291 u1_od = __lasx_xvmaddwod_w_h(u1_od,
temp, u_src1);
292 v1_ev = __lasx_xvmaddwev_w_h(v1_ev,
temp, v_src1);
293 v1_od = __lasx_xvmaddwod_w_h(v1_od,
temp, v_src1);
294 u2_ev = __lasx_xvmaddwev_w_h(u2_ev,
temp, u_src2);
295 u2_od = __lasx_xvmaddwod_w_h(u2_od,
temp, u_src2);
296 v2_ev = __lasx_xvmaddwev_w_h(v2_ev,
temp, v_src2);
297 v2_od = __lasx_xvmaddwod_w_h(v2_od,
temp, v_src2);
299 yl1_ev = __lasx_xvsrai_w(yl1_ev, 19);
300 yh1_ev = __lasx_xvsrai_w(yh1_ev, 19);
301 yl1_od = __lasx_xvsrai_w(yl1_od, 19);
302 yh1_od = __lasx_xvsrai_w(yh1_od, 19);
303 u1_ev = __lasx_xvsrai_w(u1_ev, 19);
304 v1_ev = __lasx_xvsrai_w(v1_ev, 19);
305 u1_od = __lasx_xvsrai_w(u1_od, 19);
306 v1_od = __lasx_xvsrai_w(v1_od, 19);
307 yl2_ev = __lasx_xvsrai_w(yl2_ev, 19);
308 yh2_ev = __lasx_xvsrai_w(yh2_ev, 19);
309 yl2_od = __lasx_xvsrai_w(yl2_od, 19);
310 yh2_od = __lasx_xvsrai_w(yh2_od, 19);
311 u2_ev = __lasx_xvsrai_w(u2_ev, 19);
312 v2_ev = __lasx_xvsrai_w(v2_ev, 19);
313 u2_od = __lasx_xvsrai_w(u2_od, 19);
314 v2_od = __lasx_xvsrai_w(v2_od, 19);
315 u1_ev = __lasx_xvadd_w(u1_ev,
headroom);
316 v1_ev = __lasx_xvadd_w(v1_ev,
headroom);
317 u1_od = __lasx_xvadd_w(u1_od,
headroom);
318 v1_od = __lasx_xvadd_w(v1_od,
headroom);
319 u2_ev = __lasx_xvadd_w(u2_ev,
headroom);
320 v2_ev = __lasx_xvadd_w(v2_ev,
headroom);
321 u2_od = __lasx_xvadd_w(u2_od,
headroom);
322 v2_od = __lasx_xvadd_w(v2_od,
headroom);
357 int Y1, Y2,
U,
V, count_lum = count << 1;
358 __m256i l_src1, l_src2, u_src, v_src;
359 __m256i yl_ev, yl_od, yh_ev, yh_od;
360 __m256i u_ev, u_od, v_ev, v_od,
temp;
362 yl_ev = __lasx_xvldrepl_w(&t, 0);
370 for (j = 0; j < lumFilterSize; j++) {
371 temp = __lasx_xvldrepl_h((lumFilter + j), 0);
372 DUP2_ARG2(__lasx_xvld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
374 yl_ev = __lasx_xvmaddwev_w_h(yl_ev,
temp, l_src1);
375 yl_od = __lasx_xvmaddwod_w_h(yl_od,
temp, l_src1);
376 yh_ev = __lasx_xvmaddwev_w_h(yh_ev,
temp, l_src2);
377 yh_od = __lasx_xvmaddwod_w_h(yh_od,
temp, l_src2);
379 for (j = 0; j < chrFilterSize; j++) {
380 DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
382 temp = __lasx_xvldrepl_h((chrFilter + j), 0);
383 u_ev = __lasx_xvmaddwev_w_h(u_ev,
temp, u_src);
384 u_od = __lasx_xvmaddwod_w_h(u_od,
temp, u_src);
385 v_ev = __lasx_xvmaddwev_w_h(v_ev,
temp, v_src);
386 v_od = __lasx_xvmaddwod_w_h(v_od,
temp, v_src);
388 yl_ev = __lasx_xvsrai_w(yl_ev, 19);
389 yh_ev = __lasx_xvsrai_w(yh_ev, 19);
390 yl_od = __lasx_xvsrai_w(yl_od, 19);
391 yh_od = __lasx_xvsrai_w(yh_od, 19);
392 u_ev = __lasx_xvsrai_w(u_ev, 19);
393 v_ev = __lasx_xvsrai_w(v_ev, 19);
394 u_od = __lasx_xvsrai_w(u_od, 19);
395 v_od = __lasx_xvsrai_w(v_od, 19);
396 u_ev = __lasx_xvadd_w(u_ev,
headroom);
397 v_ev = __lasx_xvadd_w(v_ev,
headroom);
398 u_od = __lasx_xvadd_w(u_od,
headroom);
399 v_od = __lasx_xvadd_w(v_od,
headroom);
420 int count_lum = count << 1;
421 __m256i l_src, u_src, v_src;
422 __m256i y_ev, y_od,
u, v,
temp;
424 y_ev = __lasx_xvldrepl_w(&t, 0);
428 for (j = 0; j < lumFilterSize; j++) {
429 temp = __lasx_xvldrepl_h((lumFilter + j), 0);
430 l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
431 y_ev = __lasx_xvmaddwev_w_h(y_ev,
temp, l_src);
432 y_od = __lasx_xvmaddwod_w_h(y_od,
temp, l_src);
434 for (j = 0; j < chrFilterSize; j++) {
435 DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count,
437 temp = __lasx_xvldrepl_h((chrFilter + j), 0);
438 u_src = __lasx_vext2xv_w_h(u_src);
439 v_src = __lasx_vext2xv_w_h(v_src);
440 u = __lasx_xvmaddwev_w_h(
u,
temp, u_src);
441 v = __lasx_xvmaddwev_w_h(v,
temp, v_src);
443 y_ev = __lasx_xvsrai_w(y_ev, 19);
444 y_od = __lasx_xvsrai_w(y_od, 19);
445 u = __lasx_xvsrai_w(
u, 19);
446 v = __lasx_xvsrai_w(v, 19);
461 int count_lum = count << 1;
462 __m256i l_src, u_src, v_src;
463 __m256i y_ev, uv,
temp;
465 y_ev = __lasx_xvldrepl_w(&t, 0);
467 for (j = 0; j < lumFilterSize; j++) {
468 temp = __lasx_xvldrepl_h((lumFilter + j), 0);
469 l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
470 l_src = __lasx_vext2xv_w_h(l_src);
471 y_ev = __lasx_xvmaddwev_w_h(y_ev,
temp, l_src);
473 for (j = 0; j < chrFilterSize; j++) {
474 u_src = __lasx_xvldrepl_d((chrUSrc[j] + count), 0);
475 v_src = __lasx_xvldrepl_d((chrVSrc[j] + count), 0);
476 temp = __lasx_xvldrepl_h((chrFilter + j), 0);
477 u_src = __lasx_xvilvl_d(v_src, u_src);
478 u_src = __lasx_vext2xv_w_h(u_src);
479 uv = __lasx_xvmaddwev_w_h(uv,
temp, u_src);
481 y_ev = __lasx_xvsrai_w(y_ev, 19);
482 uv = __lasx_xvsrai_w(uv, 19);
489 for (; count < len_count; count++) {
495 for (j = 0; j < lumFilterSize; j++) {
496 Y1 += lumSrc[j][count * 2] * lumFilter[j];
497 Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j];
499 for (j = 0; j < chrFilterSize; j++) {
500 U += chrUSrc[j][count] * chrFilter[j];
501 V += chrVSrc[j][count] * chrFilter[j];
513 r,
g,
b, y, target, 0);
519 const int16_t *ubuf[2],
const int16_t *vbuf[2],
520 const int16_t *abuf[2], uint8_t *dest,
int dstW,
521 int yalpha,
int uvalpha,
int y,
524 const int16_t *buf0 = buf[0], *buf1 = buf[1],
525 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
526 *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
527 int yalpha1 = 4096 - yalpha;
528 int uvalpha1 = 4096 - uvalpha;
531 int len_count = (dstW + 1) >> 1;
532 const void *
r, *
g, *
b;
534 __m256i v_yalpha1 = __lasx_xvreplgr2vr_w(yalpha1);
535 __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
536 __m256i v_yalpha = __lasx_xvreplgr2vr_w(yalpha);
537 __m256i v_uvalpha = __lasx_xvreplgr2vr_w(uvalpha);
538 __m256i
headroom = __lasx_xvreplgr2vr_w(head);
540 for (
i = 0;
i <
len;
i += 16) {
543 int c_dex = count << 1;
544 __m256i y0_h, y0_l, y0, u0, v0;
545 __m256i y1_h, y1_l, y1, u1, v1;
546 __m256i y_l, y_h,
u, v;
548 DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
549 buf1, i_dex, y0, u0, v0, y1);
550 DUP2_ARG2(__lasx_xvldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
551 DUP2_ARG2(__lasx_xvsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
552 DUP2_ARG1(__lasx_xvexth_w_h, y0, y1, y0_h, y1_h);
553 DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
554 y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
555 y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
556 u0 = __lasx_xvmul_w(u0, v_uvalpha1);
557 v0 = __lasx_xvmul_w(v0, v_uvalpha1);
558 y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
559 y_h = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
560 u = __lasx_xvmadd_w(u0, v_uvalpha, u1);
561 v = __lasx_xvmadd_w(v0, v_uvalpha, v1);
562 y_l = __lasx_xvsrai_w(y_l, 19);
563 y_h = __lasx_xvsrai_w(y_h, 19);
564 u = __lasx_xvsrai_w(
u, 19);
565 v = __lasx_xvsrai_w(v, 19);
580 __m256i y0_l, y0, u0, v0;
581 __m256i y1_l, y1, u1, v1;
584 y0 = __lasx_xvldx(buf0, i_dex);
585 u0 = __lasx_xvldrepl_d((ubuf0 + count), 0);
586 v0 = __lasx_xvldrepl_d((vbuf0 + count), 0);
587 y1 = __lasx_xvldx(buf1, i_dex);
588 u1 = __lasx_xvldrepl_d((ubuf1 + count), 0);
589 v1 = __lasx_xvldrepl_d((vbuf1 + count), 0);
590 DUP2_ARG1(__lasx_vext2xv_w_h, y0, y1, y0_l, y1_l);
591 DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
592 y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
593 u0 = __lasx_xvmul_w(u0, v_uvalpha1);
594 v0 = __lasx_xvmul_w(v0, v_uvalpha1);
595 y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
596 u = __lasx_xvmadd_w(u0, v_uvalpha, u1);
597 v = __lasx_xvmadd_w(v0, v_uvalpha, v1);
598 y_l = __lasx_xvsrai_w(y_l, 19);
599 u = __lasx_xvsrai_w(
u, 19);
600 v = __lasx_xvsrai_w(v, 19);
609 for (; count < len_count; count++) {
610 int Y1 = (buf0[count * 2] * yalpha1 +
611 buf1[count * 2] * yalpha) >> 19;
612 int Y2 = (buf0[count * 2 + 1] * yalpha1 +
613 buf1[count * 2 + 1] * yalpha) >> 19;
614 int U = (ubuf0[count] * uvalpha1 + ubuf1[count] * uvalpha) >> 19;
615 int V = (vbuf0[count] * uvalpha1 + vbuf1[count] * uvalpha) >> 19;
623 r,
g,
b, y, target, 0);
629 const int16_t *ubuf[2],
const int16_t *vbuf[2],
630 const int16_t *abuf0, uint8_t *dest,
int dstW,
634 const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
636 int len = (dstW - 15);
637 int len_count = (dstW + 1) >> 1;
638 const void *
r, *
g, *
b;
640 if (uvalpha < 2048) {
643 __m256i
headroom = __lasx_xvreplgr2vr_h(head);
645 for (
i = 0;
i <
len;
i += 16) {
648 int c_dex = count << 1;
649 __m256i src_y, src_u, src_v;
650 __m256i
u, v, y_l, y_h;
652 DUP2_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, src_y, src_u);
653 src_v = __lasx_xvldx(vbuf0, c_dex);
654 src_u = __lasx_xvpermi_q(src_u, src_v, 0x02);
655 src_y = __lasx_xvsrari_h(src_y, 7);
656 src_u = __lasx_xvsrari_h(src_u, 7);
657 y_l = __lasx_xvsllwil_w_h(src_y, 0);
658 y_h = __lasx_xvexth_w_h(src_y);
659 u = __lasx_xvaddwev_w_h(src_u,
headroom);
660 v = __lasx_xvaddwod_w_h(src_u,
headroom);
673 __m256i src_y, src_u, src_v;
676 src_y = __lasx_xvldx(buf0, i_dex);
677 src_u = __lasx_xvldrepl_d((ubuf0 + count), 0);
678 src_v = __lasx_xvldrepl_d((vbuf0 + count), 0);
679 src_u = __lasx_xvilvl_d(src_v, src_u);
680 y_l = __lasx_xvsrari_h(src_y, 7);
681 uv = __lasx_xvsrari_h(src_u, 7);
682 y_l = __lasx_vext2xv_w_h(y_l);
683 uv = __lasx_vext2xv_w_h(uv);
684 uv = __lasx_xvaddwev_w_h(uv,
headroom);
691 for (; count < len_count; count++) {
692 int Y1 = (buf0[count * 2 ] + 64) >> 7;
693 int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
694 int U = (ubuf0[count] + 64) >> 7;
695 int V = (vbuf0[count] + 64) >> 7;
703 r,
g,
b, y, target, 0);
706 const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
709 __m256i
headroom = __lasx_xvreplgr2vr_w(HEADROOM);
711 for (
i = 0;
i <
len;
i += 16) {
714 int c_dex = count << 1;
715 __m256i src_y, src_u0, src_v0, src_u1, src_v1;
716 __m256i y_l, y_h,
u, v;
718 DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
719 ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
720 src_v1 = __lasx_xvldx(vbuf1, c_dex);
721 src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
722 src_u1 = __lasx_xvpermi_q(src_u1, src_v1, 0x02);
723 src_y = __lasx_xvsrari_h(src_y, 7);
724 u = __lasx_xvaddwev_w_h(src_u0, src_u1);
725 v = __lasx_xvaddwod_w_h(src_u0, src_u1);
726 y_l = __lasx_xvsllwil_w_h(src_y, 0);
727 y_h = __lasx_xvexth_w_h(src_y);
728 u = __lasx_xvsrari_w(
u, 8);
729 v = __lasx_xvsrari_w(v, 8);
744 __m256i src_y, src_u0, src_v0, src_u1, src_v1;
747 src_y = __lasx_xvldx(buf0, i_dex);
748 src_u0 = __lasx_xvldrepl_d((ubuf0 + count), 0);
749 src_v0 = __lasx_xvldrepl_d((vbuf0 + count), 0);
750 src_u1 = __lasx_xvldrepl_d((ubuf1 + count), 0);
751 src_v1 = __lasx_xvldrepl_d((vbuf1 + count), 0);
753 src_u0 = __lasx_xvilvl_h(src_u1, src_u0);
754 src_v0 = __lasx_xvilvl_h(src_v1, src_v0);
755 src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
756 src_y = __lasx_xvsrari_h(src_y, 7);
757 uv = __lasx_xvhaddw_w_h(src_u0, src_u0);
758 src_y = __lasx_vext2xv_w_h(src_y);
759 uv = __lasx_xvsrari_w(uv, 8);
767 for (; count < len_count; count++) {
768 int Y1 = (buf0[count * 2 ] + 64) >> 7;
769 int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
770 int U = (ubuf0[count] + ubuf1[count] + 128) >> 8;
771 int V = (vbuf0[count] + vbuf1[count] + 128) >> 8;
779 r,
g,
b, y, target, 0);
/* YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha)
 * Generates the static function <name><ext>_X_lasx, which forwards all of its
 * arguments to <name><base>_X_template_lasx and appends the fixed pixel
 * format `fmt` and the `hasAlpha` flag as the last two arguments.
 * NOTE(review): the tail of the parameter list and the braces are elided in
 * this listing. */
784 #define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
785 static void name ## ext ## _X_lasx(SwsContext *c, const int16_t *lumFilter, \
786 const int16_t **lumSrc, int lumFilterSize, \
787 const int16_t *chrFilter, const int16_t **chrUSrc, \
788 const int16_t **chrVSrc, int chrFilterSize, \
789 const int16_t **alpSrc, uint8_t *dest, int dstW, \
792 name ## base ## _X_template_lasx(c, lumFilter, lumSrc, lumFilterSize, \
793 chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
794 alpSrc, dest, dstW, y, fmt, hasAlpha); \
/* YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha)
 * Expands YUV2RGBWRAPPERX with the same arguments (producing the _X_lasx
 * wrapper) and additionally generates the static function
 * <name><ext>_2_lasx, which forwards to <name><base>_2_template_lasx with
 * `fmt` and `hasAlpha` appended as the last two arguments. */
797 #define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \
798 YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
799 static void name ## ext ## _2_lasx(SwsContext *c, const int16_t *buf[2], \
800 const int16_t *ubuf[2], const int16_t *vbuf[2], \
801 const int16_t *abuf[2], uint8_t *dest, int dstW, \
802 int yalpha, int uvalpha, int y) \
804 name ## base ## _2_template_lasx(c, buf, ubuf, vbuf, abuf, dest, \
805 dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
/* YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)
 * Expands YUV2RGBWRAPPERX2 with the same arguments (producing the _X_lasx and
 * _2_lasx wrappers) and additionally generates the static function
 * <name><ext>_1_lasx, which forwards to <name><base>_1_template_lasx with
 * `fmt` and `hasAlpha` appended as the last two arguments. */
808 #define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
809 YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \
810 static void name ## ext ## _1_lasx(SwsContext *c, const int16_t *buf0, \
811 const int16_t *ubuf[2], const int16_t *vbuf[2], \
812 const int16_t *abuf0, uint8_t *dest, int dstW, \
813 int uvalpha, int y) \
815 name ## base ## _1_template_lasx(c, buf0, ubuf, vbuf, abuf0, dest, \
816 dstW, uvalpha, y, fmt, hasAlpha); \
822 #if CONFIG_SWSCALE_ALPHA
838 uint8_t *dest,
int i,
int R,
int A,
int G,
int B,
843 if ((
R |
G |
B) & 0xC0000000) {
851 dest[0] = hasAlpha ?
A : 255;
865 dest[3] = hasAlpha ?
A : 255;
868 dest[0] = hasAlpha ?
A : 255;
882 dest[3] = hasAlpha ?
A : 255;
898 R += (7*err[0] + 1*
c->dither_error[0][
i] + 5*
c->dither_error[0][
i+1] + 3*
c->dither_error[0][
i+2])>>4;
899 G += (7*err[1] + 1*
c->dither_error[1][
i] + 5*
c->dither_error[1][
i+1] + 3*
c->dither_error[1][
i+2])>>4;
900 B += (7*err[2] + 1*
c->dither_error[2][
i] + 5*
c->dither_error[2][
i+1] + 3*
c->dither_error[2][
i+2])>>4;
901 c->dither_error[0][
i] = err[0];
902 c->dither_error[1][
i] = err[1];
903 c->dither_error[2][
i] = err[2];
904 r =
R >> (isrgb8 ? 5 : 7);
905 g =
G >> (isrgb8 ? 5 : 6);
906 b =
B >> (isrgb8 ? 6 : 7);
910 err[0] =
R -
r*(isrgb8 ? 36 : 255);
911 err[1] =
G -
g*(isrgb8 ? 36 : 85);
912 err[2] =
B -
b*(isrgb8 ? 85 : 255);
/* A_DITHER(u, v): integer hash of a coordinate pair, ((u + v*236) * 119),
 * reduced to its low 8 bits by the 0xff mask.
 * NOTE(review): presumably the per-pixel threshold for an additive dither
 * mode -- the use site is not visible in this listing, confirm there. */
917 #define A_DITHER(u,v) (((((u)+((v)*236))*119)&0xff))
/* X_DITHER(u, v): integer hash of a coordinate pair, ((u ^ (v*237)) * 181),
 * masked to 9 bits (0x1ff) and then halved, so the result lies in 0..255.
 * NOTE(review): presumably the per-pixel threshold for an XOR-based dither
 * mode -- the use site is not visible in this listing, confirm there. */
936 #define X_DITHER(u,v) (((((u)^((v)*237))*181)&0x1ff)/2)
956 dest[0] =
r + 2*
g + 8*
b;
958 dest[0] =
b + 2*
g + 8*
r;
960 dest[0] =
r + 8*
g + 64*
b;
962 dest[0] =
b + 4*
g + 32*
r;
/* YUV2RGB_SETUP: declaration block for the full-chroma converters.
 * Copies the six conversion constants from the context `c`
 * (yuv2rgb_y_offset, yuv2rgb_y_coeff, yuv2rgb_v2r_coeff, yuv2rgb_v2g_coeff,
 * yuv2rgb_u2g_coeff, yuv2rgb_u2b_coeff) into int locals and creates one
 * __m256i per constant (offset, coeff, v2r, v2g, u2g, u2b) with the value
 * replicated into every 32-bit lane. The int copies serve the scalar tail
 * loops, the vectors serve the YUV2RGB() macro.
 * Declares names in the expanding scope, so use it once per function.
 * NOTE(review): the last visible line still ends in a continuation, so the
 * macro may declare more than is shown in this listing. */
969 #define YUV2RGB_SETUP \
970 int y_offset = c->yuv2rgb_y_offset; \
971 int y_coeff = c->yuv2rgb_y_coeff; \
972 int v2r_coe = c->yuv2rgb_v2r_coeff; \
973 int v2g_coe = c->yuv2rgb_v2g_coeff; \
974 int u2g_coe = c->yuv2rgb_u2g_coeff; \
975 int u2b_coe = c->yuv2rgb_u2b_coeff; \
976 __m256i offset = __lasx_xvreplgr2vr_w(y_offset); \
977 __m256i coeff = __lasx_xvreplgr2vr_w(y_coeff); \
978 __m256i v2r = __lasx_xvreplgr2vr_w(v2r_coe); \
979 __m256i v2g = __lasx_xvreplgr2vr_w(v2g_coe); \
980 __m256i u2g = __lasx_xvreplgr2vr_w(u2g_coe); \
981 __m256i u2b = __lasx_xvreplgr2vr_w(u2b_coe); \
/* YUV2RGB: per-32-bit-lane colour matrix. With __lasx_xvmadd_w(a, b, c)
 * evaluating a + b * c, the visible statements compute
 *     y = (y - offset) * coeff + y_temp
 *     R = y + v * v2r
 *     v = y + v * v2g        (v is reused as scratch for the G term)
 *     G = v + u * u2g
 *     B = y + u * u2b
 * Side effects: the `y` and `v` arguments are overwritten, so both must be
 * modifiable lvalues and `v` no longer holds chroma afterwards. R, G and B
 * are outputs; u is only read.
 * NOTE(review): the enclosing brace lines are elided in this listing. */
984 #define YUV2RGB(y, u, v, R, G, B, offset, coeff, \
985 y_temp, v2r, v2g, u2g, u2b) \
987 y = __lasx_xvsub_w(y, offset); \
988 y = __lasx_xvmul_w(y, coeff); \
989 y = __lasx_xvadd_w(y, y_temp); \
990 R = __lasx_xvmadd_w(y, v, v2r); \
991 v = __lasx_xvmadd_w(y, v, v2g); \
992 G = __lasx_xvmadd_w(v, u, u2g); \
993 B = __lasx_xvmadd_w(y, u, u2b); \
/* WRITE_FULL_A(r, g, b, a, t1, s): write one pixel, with alpha, from lane t1.
 * Extracts 32-bit lane t1 of the r, g, b and a vectors into the scalars
 * R, G, B and A, clips A with av_clip_uint8(), then calls
 * yuv2rgb_write_full() for pixel index i + s.
 * NOTE(review): a line is elided between the A extraction and the clip in
 * this listing, so the clip may be guarded by a condition -- confirm.
 * Relies on R, G, B, A, c, dest, i, y, target, hasAlpha and err being in
 * scope at the expansion site. */
996 #define WRITE_FULL_A(r, g, b, a, t1, s) \
998 R = __lasx_xvpickve2gr_w(r, t1); \
999 G = __lasx_xvpickve2gr_w(g, t1); \
1000 B = __lasx_xvpickve2gr_w(b, t1); \
1001 A = __lasx_xvpickve2gr_w(a, t1); \
1003 A = av_clip_uint8(A); \
1004 yuv2rgb_write_full(c, dest, i + s, R, A, G, B, y, target, hasAlpha, err);\
/* WRITE_FULL(r, g, b, t1, s): write one pixel, without alpha, from lane t1.
 * Extracts 32-bit lane t1 of the r, g and b vectors into the scalars R, G
 * and B and calls yuv2rgb_write_full() for pixel index i + s, passing 0 in
 * the alpha position.
 * Relies on R, G, B, c, dest, i, y, target, hasAlpha and err being in scope
 * at the expansion site. */
1008 #define WRITE_FULL(r, g, b, t1, s) \
1010 R = __lasx_xvpickve2gr_w(r, t1); \
1011 G = __lasx_xvpickve2gr_w(g, t1); \
1012 B = __lasx_xvpickve2gr_w(b, t1); \
1013 yuv2rgb_write_full(c, dest, i + s, R, 0, G, B, y, target, hasAlpha, err); \
1019 const int16_t **lumSrc,
int lumFilterSize,
1020 const int16_t *chrFilter,
const int16_t **chrUSrc,
1021 const int16_t **chrVSrc,
int chrFilterSize,
1022 const int16_t **alpSrc, uint8_t *dest,
1026 int i, j,
B,
G,
R,
A;
1030 int a_temp = 1 << 18;
1032 int tempc = templ - (128 << 19);
1033 int ytemp = 1 << 21;
1034 int len = dstW - 15;
1035 __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
1042 for (
i = 0;
i <
len;
i += 16) {
1043 __m256i l_src, u_src, v_src;
1044 __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od,
temp;
1045 __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;
1048 y_ev = y_od = __lasx_xvreplgr2vr_w(templ);
1049 u_ev = u_od = v_ev = v_od = __lasx_xvreplgr2vr_w(tempc);
1050 for (j = 0; j < lumFilterSize; j++) {
1051 temp = __lasx_xvldrepl_h((lumFilter + j), 0);
1052 l_src = __lasx_xvldx(lumSrc[j], n);
1053 y_ev = __lasx_xvmaddwev_w_h(y_ev, l_src,
temp);
1054 y_od = __lasx_xvmaddwod_w_h(y_od, l_src,
temp);
1056 for (j = 0; j < chrFilterSize; j++) {
1057 temp = __lasx_xvldrepl_h((chrFilter + j), 0);
1058 DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n,
1061 v_src,
temp, u_ev, v_ev);
1063 v_src,
temp, u_od, v_od);
1065 y_ev = __lasx_xvsrai_w(y_ev, 10);
1066 y_od = __lasx_xvsrai_w(y_od, 10);
1067 u_ev = __lasx_xvsrai_w(u_ev, 10);
1068 u_od = __lasx_xvsrai_w(u_od, 10);
1069 v_ev = __lasx_xvsrai_w(v_ev, 10);
1070 v_od = __lasx_xvsrai_w(v_od, 10);
1072 y_temp, v2r, v2g, u2g, u2b);
1074 y_temp, v2r, v2g, u2g, u2b);
1077 __m256i a_src, a_ev, a_od;
1079 a_ev = a_od = __lasx_xvreplgr2vr_w(a_temp);
1080 for (j = 0; j < lumFilterSize; j++) {
1081 temp = __lasx_xvldrepl_h(lumFilter + j, 0);
1082 a_src = __lasx_xvldx(alpSrc[j], n);
1083 a_ev = __lasx_xvmaddwev_w_h(a_ev, a_src,
temp);
1084 a_od = __lasx_xvmaddwod_w_h(a_od, a_src,
temp);
1086 a_ev = __lasx_xvsrai_w(a_ev, 19);
1087 a_od = __lasx_xvsrai_w(a_od, 19);
1123 if (dstW -
i >= 8) {
1124 __m256i l_src, u_src, v_src;
1125 __m256i y_ev, u_ev, v_ev, uv,
temp;
1126 __m256i R_ev, G_ev, B_ev;
1129 y_ev = __lasx_xvreplgr2vr_w(templ);
1130 u_ev = v_ev = __lasx_xvreplgr2vr_w(tempc);
1131 for (j = 0; j < lumFilterSize; j++) {
1132 temp = __lasx_xvldrepl_h((lumFilter + j), 0);
1133 l_src = __lasx_xvldx(lumSrc[j], n);
1134 l_src = __lasx_xvpermi_d(l_src, 0xD8);
1135 l_src = __lasx_xvilvl_h(l_src, l_src);
1136 y_ev = __lasx_xvmaddwev_w_h(y_ev, l_src,
temp);
1138 for (j = 0; j < chrFilterSize; j++) {
1139 temp = __lasx_xvldrepl_h((chrFilter + j), 0);
1140 DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n, u_src, v_src);
1141 u_src = __lasx_xvpermi_d(u_src, 0xD8);
1142 v_src = __lasx_xvpermi_d(v_src, 0xD8);
1143 uv = __lasx_xvilvl_h(v_src, u_src);
1144 u_ev = __lasx_xvmaddwev_w_h(u_ev, uv,
temp);
1145 v_ev = __lasx_xvmaddwod_w_h(v_ev, uv,
temp);
1147 y_ev = __lasx_xvsrai_w(y_ev, 10);
1148 u_ev = __lasx_xvsrai_w(u_ev, 10);
1149 v_ev = __lasx_xvsrai_w(v_ev, 10);
1151 y_temp, v2r, v2g, u2g, u2b);
1154 __m256i a_src, a_ev;
1156 a_ev = __lasx_xvreplgr2vr_w(a_temp);
1157 for (j = 0; j < lumFilterSize; j++) {
1158 temp = __lasx_xvldrepl_h(lumFilter + j, 0);
1159 a_src = __lasx_xvldx(alpSrc[j], n);
1160 a_src = __lasx_xvpermi_d(a_src, 0xD8);
1161 a_src = __lasx_xvilvl_h(a_src, a_src);
1162 a_ev = __lasx_xvmaddwev_w_h(a_ev, a_src,
temp);
1164 a_ev = __lasx_xvsrai_w(a_ev, 19);
1185 for (;
i < dstW;
i++) {
1187 int V,
U =
V = tempc;
1190 for (j = 0; j < lumFilterSize; j++) {
1191 Y += lumSrc[j][
i] * lumFilter[j];
1193 for (j = 0; j < chrFilterSize; j++) {
1194 U += chrUSrc[j][
i] * chrFilter[j];
1195 V += chrVSrc[j][
i] * chrFilter[j];
1203 for (j = 0; j < lumFilterSize; j++) {
1204 A += alpSrc[j][
i] * lumFilter[j];
1213 R = (unsigned)
Y +
V * v2r_coe;
1214 G = (unsigned)
Y +
V * v2g_coe +
U * u2g_coe;
1215 B = (unsigned)
Y +
U * u2b_coe;
1216 yuv2rgb_write_full(
c, dest,
i,
R,
A,
G,
B, y, target, hasAlpha, err);
1219 c->dither_error[0][
i] = err[0];
1220 c->dither_error[1][
i] = err[1];
1221 c->dither_error[2][
i] = err[2];
1226 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1227 const int16_t *abuf[2], uint8_t *dest,
int dstW,
1228 int yalpha,
int uvalpha,
int y,
1231 const int16_t *buf0 = buf[0], *buf1 = buf[1],
1232 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1233 *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
1234 *abuf0 = hasAlpha ? abuf[0] :
NULL,
1235 *abuf1 = hasAlpha ? abuf[1] :
NULL;
1236 int yalpha1 = 4096 - yalpha;
1237 int uvalpha1 = 4096 - uvalpha;
1238 int uvtemp = 128 << 19;
1239 int atemp = 1 << 18;
1241 int ytemp = 1 << 21;
1242 int len = dstW - 15;
1246 __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
1247 __m256i v_yalpha1 = __lasx_xvreplgr2vr_w(yalpha1);
1248 __m256i v_uvalpha = __lasx_xvreplgr2vr_w(uvalpha);
1249 __m256i v_yalpha = __lasx_xvreplgr2vr_w(yalpha);
1250 __m256i uv = __lasx_xvreplgr2vr_w(uvtemp);
1251 __m256i a_bias = __lasx_xvreplgr2vr_w(atemp);
1252 __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
1262 for (
i = 0;
i <
len;
i += 16) {
1263 __m256i
b0,
b1, ub0, ub1, vb0, vb1;
1264 __m256i y0_l, y0_h, y1_l, y1_h, u0_l, u0_h;
1265 __m256i v0_l, v0_h, u1_l, u1_h, v1_l, v1_h;
1266 __m256i y_l, y_h, v_l, v_h, u_l, u_h;
1267 __m256i R_l, R_h, G_l, G_h, B_l, B_h;
1270 DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0,
1271 n, ubuf1, n,
b0,
b1, ub0, ub1);
1272 DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0 , vb1);
1274 DUP4_ARG2(__lasx_xvsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
1275 u0_l, u1_l, v0_l, v1_l);
1277 DUP4_ARG1(__lasx_xvexth_w_h, ub0, ub1, vb0, vb1,
1278 u0_h, u1_h, v0_h, v1_h);
1279 y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
1280 y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
1281 u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
1282 u0_h = __lasx_xvmul_w(u0_h, v_uvalpha1);
1283 v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
1284 v0_h = __lasx_xvmul_w(v0_h, v_uvalpha1);
1285 y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
1286 y_h = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
1287 u_l = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
1288 u_h = __lasx_xvmadd_w(u0_h, v_uvalpha, u1_h);
1289 v_l = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
1290 v_h = __lasx_xvmadd_w(v0_h, v_uvalpha, v1_h);
1291 u_l = __lasx_xvsub_w(u_l, uv);
1292 u_h = __lasx_xvsub_w(u_h, uv);
1293 v_l = __lasx_xvsub_w(v_l, uv);
1294 v_h = __lasx_xvsub_w(v_h, uv);
1295 y_l = __lasx_xvsrai_w(y_l, 10);
1296 y_h = __lasx_xvsrai_w(y_h, 10);
1297 u_l = __lasx_xvsrai_w(u_l, 10);
1298 u_h = __lasx_xvsrai_w(u_h, 10);
1299 v_l = __lasx_xvsrai_w(v_l, 10);
1300 v_h = __lasx_xvsrai_w(v_h, 10);
1302 y_temp, v2r, v2g, u2g, u2b);
1304 y_temp, v2r, v2g, u2g, u2b);
1307 __m256i
a0,
a1, a0_l, a0_h;
1308 __m256i a_l, a_h, a1_l, a1_h;
1313 a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1);
1314 a_h = __lasx_xvmadd_w(a_bias, a0_h, v_yalpha1);
1315 a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
1316 a_h = __lasx_xvmadd_w(a_h, v_yalpha, a1_h);
1317 a_l = __lasx_xvsrai_w(a_l, 19);
1318 a_h = __lasx_xvsrai_w(a_h, 19);
1354 if (dstW -
i >= 8) {
1355 __m256i
b0,
b1, ub0, ub1, vb0, vb1;
1356 __m256i y0_l, y1_l, u0_l;
1357 __m256i v0_l, u1_l, v1_l;
1358 __m256i y_l, u_l, v_l;
1359 __m256i R_l, G_l, B_l;
1362 DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0, n,
1363 ubuf1, n,
b0,
b1, ub0, ub1);
1364 DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0, vb1);
1366 DUP4_ARG1(__lasx_vext2xv_w_h, ub0, ub1, vb0, vb1,
1367 u0_l, u1_l, v0_l, v1_l);
1368 y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
1369 u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
1370 v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
1371 y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
1372 u_l = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
1373 v_l = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
1374 u_l = __lasx_xvsub_w(u_l, uv);
1375 v_l = __lasx_xvsub_w(v_l, uv);
1376 y_l = __lasx_xvsrai_w(y_l, 10);
1377 u_l = __lasx_xvsrai_w(u_l, 10);
1378 v_l = __lasx_xvsrai_w(v_l, 10);
1380 y_temp, v2r, v2g, u2g, u2b);
1383 __m256i
a0,
a1, a0_l;
1388 a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1);
1389 a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
1390 a_l = __lasx_xvsrai_w(a_l, 19);
1411 for (;
i < dstW;
i++){
1412 int Y = ( buf0[
i] * yalpha1 + buf1[
i] * yalpha ) >> 10;
1413 int U = (ubuf0[
i] * uvalpha1 + ubuf1[
i] * uvalpha- uvtemp) >> 10;
1414 int V = (vbuf0[
i] * uvalpha1 + vbuf1[
i] * uvalpha- uvtemp) >> 10;
1418 A = (abuf0[
i] * yalpha1 + abuf1[
i] * yalpha + atemp) >> 19;
1426 R = (unsigned)
Y +
V * v2r_coe;
1427 G = (unsigned)
Y +
V * v2g_coe +
U * u2g_coe;
1428 B = (unsigned)
Y +
U * u2b_coe;
1429 yuv2rgb_write_full(
c, dest,
i,
R,
A,
G,
B, y, target, hasAlpha, err);
1432 c->dither_error[0][
i] = err[0];
1433 c->dither_error[1][
i] = err[1];
1434 c->dither_error[2][
i] = err[2];
1439 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1440 const int16_t *abuf0, uint8_t *dest,
int dstW,
/*
 * Single-luma-row full-chroma YUV -> RGB conversion (LASX).
 *
 * Visible structure of the body:
 *   - a 16-pixel vector loop (runs while i < len, len = dstW - 15),
 *   - one optional 8-pixel vector step (taken when dstW - i >= 8),
 *   - a scalar loop for the remaining pixels,
 * once for the "uvalpha < 2048" case, which reads only the first chroma
 * row, and once for the other case, which sums both chroma rows.
 *
 * NOTE(review): several lines of this function (the first signature line,
 * the definitions of n, bias_int, err and the coefficient vectors, and the
 * pixel store macros) are not visible in this extract; comments below
 * describe only what the visible lines do.
 */
1444 const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
1448 int ytemp = 1 << 21;
/* 16-pixel loops run while i < len, i.e. while at least 16 pixels remain. */
1450 int len = dstW - 15;
1451 __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
/* Case 1: only the first chroma row (ubuf0 / vbuf0) is read. */
1457 if (uvalpha < 2048) {
/* Chroma is centred by subtracting 128 << 7, then scaled by << 2. */
1458 int uvtemp = 128 << 7;
1459 __m256i uv = __lasx_xvreplgr2vr_w(uvtemp);
/* Alpha rounding term as 32-bit lanes; bias_int is defined outside this
 * extract (the scalar path below adds 64 before >> 7 - TODO confirm). */
1460 __m256i
bias = __lasx_xvreplgr2vr_w(bias_int);
/* 16 pixels per iteration, handled as a low (_l) and a high (_h) half. */
1462 for (
i = 0;
i <
len;
i += 16) {
1463 __m256i
b,
ub, vb, ub_l, ub_h, vb_l, vb_h;
1464 __m256i y_l, y_h, u_l, u_h, v_l, v_h;
1465 __m256i R_l, R_h, G_l, G_h, B_l, B_h;
1469 vb = __lasx_xvldx(vbuf0, n);
/* Luma: widen 16 -> 32 bit and shift left by 2 (matches Y = buf0[i] << 2). */
1470 y_l = __lasx_xvsllwil_w_h(
b, 2);
1471 y_h = __lasx_xvexth_w_h(
b);
1472 DUP2_ARG2(__lasx_xvsllwil_w_h,
ub, 0, vb, 0, ub_l, vb_l);
1474 y_h = __lasx_xvslli_w(y_h, 2);
/* Chroma: (sample - uvtemp) << 2, as in the scalar tail loop. */
1475 u_l = __lasx_xvsub_w(ub_l, uv);
1476 u_h = __lasx_xvsub_w(ub_h, uv);
1477 v_l = __lasx_xvsub_w(vb_l, uv);
1478 v_h = __lasx_xvsub_w(vb_h, uv);
1479 u_l = __lasx_xvslli_w(u_l, 2);
1480 u_h = __lasx_xvslli_w(u_h, 2);
1481 v_l = __lasx_xvslli_w(v_l, 2);
1482 v_h = __lasx_xvslli_w(v_h, 2);
1484 y_temp, v2r, v2g, u2g, u2b);
1486 y_temp, v2r, v2g, u2g, u2b);
/* Alpha: widen, add the rounding bias, arithmetic shift right by 7. */
1492 a_src = __lasx_xvld(abuf0 +
i, 0);
1493 a_l = __lasx_xvsllwil_w_h(a_src, 0);
1494 a_h = __lasx_xvexth_w_h(a_src);
1495 a_l = __lasx_xvadd_w(a_l,
bias);
1496 a_h = __lasx_xvadd_w(a_h,
bias);
1497 a_l = __lasx_xvsrai_w(a_l, 7);
1498 a_h = __lasx_xvsrai_w(a_h, 7);
/* One extra vector step when at least 8 pixels are still left. */
1534 if (dstW -
i >= 8) {
1535 __m256i
b,
ub, vb, ub_l, vb_l;
1536 __m256i y_l, u_l, v_l;
1537 __m256i R_l, G_l, B_l;
1541 vb = __lasx_xvldx(vbuf0, n);
1542 y_l = __lasx_vext2xv_w_h(
b);
1543 DUP2_ARG1(__lasx_vext2xv_w_h,
ub, vb, ub_l, vb_l);
1544 y_l = __lasx_xvslli_w(y_l, 2);
1545 u_l = __lasx_xvsub_w(ub_l, uv);
1546 v_l = __lasx_xvsub_w(vb_l, uv);
1547 u_l = __lasx_xvslli_w(u_l, 2);
1548 v_l = __lasx_xvslli_w(v_l, 2);
1550 y_temp, v2r, v2g, u2g, u2b);
1555 a_src = __lasx_xvldx(abuf0, n);
1556 a_src = __lasx_vext2xv_w_h(a_src);
1557 a_l = __lasx_xvadd_w(
bias, a_src);
1558 a_l = __lasx_xvsrai_w(a_l, 7);
/* Scalar tail: whatever the vector steps above did not cover. */
1579 for (;
i < dstW;
i++) {
1580 int Y = buf0[
i] << 2;
1581 int U = (ubuf0[
i] - uvtemp) << 2;
1582 int V = (vbuf0[
i] - uvtemp) << 2;
1586 A = (abuf0[
i] + 64) >> 7;
/* The (unsigned) cast makes the sums wrap instead of overflowing as int. */
1593 R = (unsigned)
Y +
V * v2r_coe;
1594 G = (unsigned)
Y +
V * v2g_coe +
U * u2g_coe;
1595 B = (unsigned)
Y +
U * u2b_coe;
1596 yuv2rgb_write_full(
c, dest,
i,
R,
A,
G,
B, y, target, hasAlpha, err);
/* Case 2: both chroma rows are summed (row0 + row1). */
1600 const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
/* The sum of two rows is centred by 128 << 8 and scaled by << 1. */
1601 int uvtemp = 128 << 8;
1602 __m256i uv = __lasx_xvreplgr2vr_w(uvtemp);
1603 __m256i
zero = __lasx_xvldi(0);
/* Here the bias is replicated as 16-bit lanes because it is consumed by
 * the widening even/odd adds below. */
1604 __m256i
bias = __lasx_xvreplgr2vr_h(bias_int);
/* 16 pixels per iteration, handled as even (_ev) and odd (_od) lanes. */
1606 for (
i = 0;
i <
len;
i += 16) {
1607 __m256i
b, ub0, ub1, vb0, vb1;
1608 __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od;
1609 __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;
1612 DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
1613 ubuf1, n,
b, ub0, vb0, ub1);
/* Second V row must be loaded from vbuf1. vbuf is the two-element array
 * of row pointers, so loading from it would read pointer bytes rather
 * than samples; the 8-pixel step and the scalar loop below use vbuf1. */
1614 vb1 = __lasx_xvldx(vbuf1, n);
/* Adding zero with the widening even/odd add just widens luma to 32 bit. */
1615 y_ev = __lasx_xvaddwev_w_h(
b,
zero);
1616 y_od = __lasx_xvaddwod_w_h(
b,
zero);
/*
 * NOTE(review): with DUP2_ARG2(op, a0, a1, b0, b1, o0, o1) expanding to
 * o0 = op(a0, a1); o1 = op(b0, b1), the next two lines compute
 * u = ub0 + vb0 and v = ub1 + vb1, whereas the 8-pixel step and the scalar
 * loop compute u = ub0 + ub1 and v = vb0 + vb1. The macro definition is not
 * visible here - confirm the argument order before changing it.
 */
1617 DUP2_ARG2(__lasx_xvaddwev_w_h, ub0, vb0, ub1, vb1, u_ev, v_ev);
1618 DUP2_ARG2(__lasx_xvaddwod_w_h, ub0, vb0, ub1, vb1, u_od, v_od);
1619 DUP2_ARG2(__lasx_xvslli_w, y_ev, 2, y_od, 2, y_ev, y_od);
1620 DUP4_ARG2(__lasx_xvsub_w, u_ev, uv, u_od, uv, v_ev, uv, v_od, uv,
1621 u_ev, u_od, v_ev, v_od);
1622 DUP4_ARG2(__lasx_xvslli_w, u_ev, 1, u_od, 1, v_ev, 1, v_od, 1,
1623 u_ev, u_od, v_ev, v_od);
1625 y_temp, v2r, v2g, u2g, u2b);
1627 y_temp, v2r, v2g, u2g, u2b);
/* Alpha: widening add of the bias on even/odd lanes, then >> 7. */
1633 a_src = __lasx_xvld(abuf0 +
i, 0);
1634 a_ev = __lasx_xvaddwev_w_h(
bias, a_src);
1635 a_od = __lasx_xvaddwod_w_h(
bias, a_src);
1636 a_ev = __lasx_xvsrai_w(a_ev, 7);
1637 a_od = __lasx_xvsrai_w(a_od, 7);
/* One extra vector step when at least 8 pixels are still left. */
1673 if (dstW -
i >= 8) {
1674 __m256i
b, ub0, ub1, vb0, vb1;
1675 __m256i y_l, u_l, v_l;
1676 __m256i R_l, G_l, B_l;
1679 DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
1680 ubuf1, n,
b, ub0, vb0, ub1);
1681 vb1 = __lasx_xvldx(vbuf1, n);
1682 y_l = __lasx_vext2xv_w_h(
b);
1683 y_l = __lasx_xvslli_w(y_l, 2);
1684 DUP4_ARG1(__lasx_vext2xv_w_h, ub0, vb0, ub1, vb1,
1685 ub0, vb0, ub1, vb1);
/* u = ub0 + ub1, v = vb0 + vb1, then centre and scale. */
1686 DUP2_ARG2(__lasx_xvadd_w, ub0, ub1, vb0, vb1, u_l, v_l);
1687 u_l = __lasx_xvsub_w(u_l, uv);
1688 v_l = __lasx_xvsub_w(v_l, uv);
1689 u_l = __lasx_xvslli_w(u_l, 1);
1690 v_l = __lasx_xvslli_w(v_l, 1);
1692 y_temp, v2r, v2g, u2g, u2b);
1698 a_src = __lasx_xvld(abuf0 +
i, 0);
1699 a_src = __lasx_xvpermi_d(a_src, 0xD8);
1700 a_src = __lasx_xvilvl_h(a_src, a_src);
1701 a_l = __lasx_xvaddwev_w_h(
bias, a_src);
1702 a_l = __lasx_xvsrai_w(a_l, 7);
/* Scalar tail: whatever the vector steps above did not cover. */
1723 for (;
i < dstW;
i++) {
1724 int Y = buf0[
i] << 2;
1725 int U = (ubuf0[
i] + ubuf1[
i] - uvtemp) << 1;
1726 int V = (vbuf0[
i] + vbuf1[
i] - uvtemp) << 1;
1730 A = (abuf0[
i] + 64) >> 7;
/* The (unsigned) cast makes the sums wrap instead of overflowing as int. */
1737 R = (unsigned)
Y +
V * v2r_coe;
1738 G = (unsigned)
Y +
V * v2g_coe +
U * u2g_coe;
1739 B = (unsigned)
Y +
U * u2b_coe;
1740 yuv2rgb_write_full(
c, dest,
i,
R,
A,
G,
B, y, target, hasAlpha, err);
/* Store the three err[] values into the context's dither_error rows at
 * index i (the value i holds after the loops above). */
1744 c->dither_error[0][
i] = err[0];
1745 c->dither_error[1][
i] = err[1];
1746 c->dither_error[2][
i] = err[2];
1750 CONFIG_SWSCALE_ALPHA &&
c->needAlpha)
1752 CONFIG_SWSCALE_ALPHA &&
c->needAlpha)
1754 CONFIG_SWSCALE_ALPHA &&
c->needAlpha)
1756 CONFIG_SWSCALE_ALPHA &&
c->needAlpha)
1758 #if CONFIG_SWSCALE_ALPHA
1791 }
else if (
is16BPS(dstFormat)) {
1792 }
else if (
isNBPS(dstFormat)) {
1796 *yuv2plane1 = yuv2plane1_8_lasx;
1801 switch (
c->dstFormat) {
1804 c->yuv2packedX = yuv2rgba32_full_X_lasx;
1805 c->yuv2packed2 = yuv2rgba32_full_2_lasx;
1806 c->yuv2packed1 = yuv2rgba32_full_1_lasx;
1808 #if CONFIG_SWSCALE_ALPHA
1810 c->yuv2packedX = yuv2rgba32_full_X_lasx;
1811 c->yuv2packed2 = yuv2rgba32_full_2_lasx;
1812 c->yuv2packed1 = yuv2rgba32_full_1_lasx;
1816 c->yuv2packedX = yuv2rgbx32_full_X_lasx;
1817 c->yuv2packed2 = yuv2rgbx32_full_2_lasx;
1818 c->yuv2packed1 = yuv2rgbx32_full_1_lasx;
1824 c->yuv2packedX = yuv2argb32_full_X_lasx;
1825 c->yuv2packed2 = yuv2argb32_full_2_lasx;
1826 c->yuv2packed1 = yuv2argb32_full_1_lasx;
1828 #if CONFIG_SWSCALE_ALPHA
1830 c->yuv2packedX = yuv2argb32_full_X_lasx;
1831 c->yuv2packed2 = yuv2argb32_full_2_lasx;
1832 c->yuv2packed1 = yuv2argb32_full_1_lasx;
1836 c->yuv2packedX = yuv2xrgb32_full_X_lasx;
1837 c->yuv2packed2 = yuv2xrgb32_full_2_lasx;
1838 c->yuv2packed1 = yuv2xrgb32_full_1_lasx;
1844 c->yuv2packedX = yuv2bgra32_full_X_lasx;
1845 c->yuv2packed2 = yuv2bgra32_full_2_lasx;
1846 c->yuv2packed1 = yuv2bgra32_full_1_lasx;
1848 #if CONFIG_SWSCALE_ALPHA
1850 c->yuv2packedX = yuv2bgra32_full_X_lasx;
1851 c->yuv2packed2 = yuv2bgra32_full_2_lasx;
1852 c->yuv2packed1 = yuv2bgra32_full_1_lasx;
1856 c->yuv2packedX = yuv2bgrx32_full_X_lasx;
1857 c->yuv2packed2 = yuv2bgrx32_full_2_lasx;
1858 c->yuv2packed1 = yuv2bgrx32_full_1_lasx;
1864 c->yuv2packedX = yuv2abgr32_full_X_lasx;
1865 c->yuv2packed2 = yuv2abgr32_full_2_lasx;
1866 c->yuv2packed1 = yuv2abgr32_full_1_lasx;
1868 #if CONFIG_SWSCALE_ALPHA
1870 c->yuv2packedX = yuv2abgr32_full_X_lasx;
1871 c->yuv2packed2 = yuv2abgr32_full_2_lasx;
1872 c->yuv2packed1 = yuv2abgr32_full_1_lasx;
1876 c->yuv2packedX = yuv2xbgr32_full_X_lasx;
1877 c->yuv2packed2 = yuv2xbgr32_full_2_lasx;
1878 c->yuv2packed1 = yuv2xbgr32_full_1_lasx;
1883 c->yuv2packedX = yuv2rgb24_full_X_lasx;
1884 c->yuv2packed2 = yuv2rgb24_full_2_lasx;
1885 c->yuv2packed1 = yuv2rgb24_full_1_lasx;
1888 c->yuv2packedX = yuv2bgr24_full_X_lasx;
1889 c->yuv2packed2 = yuv2bgr24_full_2_lasx;
1890 c->yuv2packed1 = yuv2bgr24_full_1_lasx;
1893 c->yuv2packedX = yuv2bgr4_byte_full_X_lasx;
1894 c->yuv2packed2 = yuv2bgr4_byte_full_2_lasx;
1895 c->yuv2packed1 = yuv2bgr4_byte_full_1_lasx;
1898 c->yuv2packedX = yuv2rgb4_byte_full_X_lasx;
1899 c->yuv2packed2 = yuv2rgb4_byte_full_2_lasx;
1900 c->yuv2packed1 = yuv2rgb4_byte_full_1_lasx;
1903 c->yuv2packedX = yuv2bgr8_full_X_lasx;
1904 c->yuv2packed2 = yuv2bgr8_full_2_lasx;
1905 c->yuv2packed1 = yuv2bgr8_full_1_lasx;
1908 c->yuv2packedX = yuv2rgb8_full_X_lasx;
1909 c->yuv2packed2 = yuv2rgb8_full_2_lasx;
1910 c->yuv2packed1 = yuv2rgb8_full_1_lasx;
1914 switch (
c->dstFormat) {
1919 #if CONFIG_SWSCALE_ALPHA
1924 c->yuv2packed1 = yuv2rgbx32_1_lasx;
1925 c->yuv2packed2 = yuv2rgbx32_2_lasx;
1926 c->yuv2packedX = yuv2rgbx32_X_lasx;
1934 #if CONFIG_SWSCALE_ALPHA
1939 c->yuv2packed1 = yuv2rgbx32_1_1_lasx;
1940 c->yuv2packed2 = yuv2rgbx32_1_2_lasx;
1941 c->yuv2packedX = yuv2rgbx32_1_X_lasx;
1946 c->yuv2packed1 = yuv2rgb24_1_lasx;
1947 c->yuv2packed2 = yuv2rgb24_2_lasx;
1948 c->yuv2packedX = yuv2rgb24_X_lasx;
1951 c->yuv2packed1 = yuv2bgr24_1_lasx;
1952 c->yuv2packed2 = yuv2bgr24_2_lasx;
1953 c->yuv2packedX = yuv2bgr24_X_lasx;
1959 c->yuv2packed1 = yuv2rgb16_1_lasx;
1960 c->yuv2packed2 = yuv2rgb16_2_lasx;
1961 c->yuv2packedX = yuv2rgb16_X_lasx;
1967 c->yuv2packed1 = yuv2rgb15_1_lasx;
1968 c->yuv2packed2 = yuv2rgb15_2_lasx;
1969 c->yuv2packedX = yuv2rgb15_X_lasx;
1975 c->yuv2packed1 = yuv2rgb12_1_lasx;
1976 c->yuv2packed2 = yuv2rgb12_2_lasx;
1977 c->yuv2packedX = yuv2rgb12_X_lasx;
1981 c->yuv2packed1 = yuv2rgb8_1_lasx;
1982 c->yuv2packed2 = yuv2rgb8_2_lasx;
1983 c->yuv2packedX = yuv2rgb8_X_lasx;
1987 c->yuv2packed1 = yuv2rgb4_1_lasx;
1988 c->yuv2packed2 = yuv2rgb4_2_lasx;
1989 c->yuv2packedX = yuv2rgb4_X_lasx;
1993 c->yuv2packed1 = yuv2rgb4b_1_lasx;
1994 c->yuv2packed2 = yuv2rgb4b_2_lasx;
1995 c->yuv2packedX = yuv2rgb4b_X_lasx;