                              const int16_t **src, uint8_t *dest, int dstW,

    __m256i mask = {0x1C0C180814041000, 0x1C1814100C080400,
                    0x1C0C180814041000, 0x1C1814100C080400};
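    /* Byte-shuffle control for __lasx_xvshuf_b below: it gathers the low byte
     * of each 32-bit result lane so the clipped values can be packed back
     * into contiguous 8-bit output pixels. */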
    __m256i val1, val2, val3;
    int val_1[8] = {dither0, dither2, dither4, dither6,
                    dither0, dither2, dither4, dither6};
    int val_2[8] = {dither1, dither3, dither5, dither7,
                    dither1, dither3, dither5, dither7};
    int val_3[8] = {dither0, dither1, dither2, dither3,
                    dither4, dither5, dither6, dither7};

    DUP2_ARG2(__lasx_xvld, val_1, 0, val_2, 0, val1, val2);
    val3 = __lasx_xvld(val_3, 0);
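    /* val1/val2 hold the even- and odd-indexed dither values (repeated for
     * both 128-bit halves) to seed the interleaved even/odd accumulators of
     * the main loop; val3 keeps them in natural order for the narrower tail
     * block. */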
    for (i = 0; i < len; i += 16) {
        __m256i val_ev, val_od;

        val_ev = __lasx_xvslli_w(val1, 12);
        val_od = __lasx_xvslli_w(val2, 12);

        for (j = 0; j < filterSize; j++) {
            val_ev = __lasx_xvmaddwev_w_h(val_ev, src0, filter0);
            val_od = __lasx_xvmaddwod_w_h(val_od, src0, filter0);

        val_ev = __lasx_xvsrai_w(val_ev, 19);
        val_od = __lasx_xvsrai_w(val_od, 19);
        val_ev = __lasx_xvclip255_w(val_ev);
        val_od = __lasx_xvclip255_w(val_od);
        val = __lasx_xvshuf_b(val_od, val_ev, mask);
        __lasx_xvstelm_d(val, (dest + i), 0, 0);
        __lasx_xvstelm_d(val, (dest + i), 8, 2);

        val_l = __lasx_xvslli_w(val3, 12);

        for (j = 0; j < filterSize; j++) {

        val_l = __lasx_xvsrai_w(val_l, 19);
        val_l = __lasx_xvclip255_w(val_l);
        val_h = __lasx_xvpermi_d(val_l, 0x4E);
        val_l = __lasx_xvshuf_b(val_h, val_l, mask);
        __lasx_xvstelm_d(val_l, (dest + i), 0, 1);

    for (; i < dstW; i++) {
        for (j = 0; j < filterSize; j++)
              unsigned A1, unsigned A2,
              const void *_r, const void *_g, const void *_b, int y,

        uint32_t *dest = (uint32_t *) _dest;
        const uint32_t *r = (const uint32_t *) _r;
        const uint32_t *g = (const uint32_t *) _g;
        const uint32_t *b = (const uint32_t *) _b;

        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];

#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
        av_assert2((((r[Y1] + g[Y1] + b[Y1]) >> sh) & 0xFF) == 0xFF);

        dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
        dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];

        uint8_t *dest = (uint8_t *) _dest;
        const uint8_t *r = (const uint8_t *) _r;
        const uint8_t *g = (const uint8_t *) _g;
        const uint8_t *b = (const uint8_t *) _b;
#define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
#define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)
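/* r_b/b_r swap the red and blue lookup tables so the same store sequence
 * below serves both the RGB24 and BGR24 byte orders. */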
        dest[i * 6 + 0] = r_b[Y1];
        dest[i * 6 + 1] = g[Y1];
        dest[i * 6 + 2] = b_r[Y1];
        dest[i * 6 + 3] = r_b[Y2];
        dest[i * 6 + 4] = g[Y2];
        dest[i * 6 + 5] = b_r[Y2];

        uint16_t *dest = (uint16_t *) _dest;
        const uint16_t *r = (const uint16_t *) _r;
        const uint16_t *g = (const uint16_t *) _g;
        const uint16_t *b = (const uint16_t *) _b;
        int dr1, dg1, db1, dr2, dg2, db2;

        dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
        dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];

        uint8_t *dest = (uint8_t *) _dest;
        const uint8_t *r = (const uint8_t *) _r;
        const uint8_t *g = (const uint8_t *) _g;
        const uint8_t *b = (const uint8_t *) _b;
        int dr1, dg1, db1, dr2, dg2, db2;

            dr1 = dg1 = d32[(i * 2 + 0) & 7];
            db1 = d64[(i * 2 + 0) & 7];
            dr2 = dg2 = d32[(i * 2 + 1) & 7];
            db2 = d64[(i * 2 + 1) & 7];

            dr1 = db1 = d128[(i * 2 + 0) & 7];
            dg1 = d64[(i * 2 + 0) & 7];
            dr2 = db2 = d128[(i * 2 + 1) & 7];
            dg2 = d64[(i * 2 + 1) & 7];

            dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
                      ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);

            dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
            dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
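/* WRITE_YUV2RGB pulls one pair of luma samples plus their shared chroma out
 * of the given vector lanes (t1..t4), resolves them through the context's RGB
 * lookup tables and emits the two pixels via yuv2rgb_write(). */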
#define WRITE_YUV2RGB(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4) \
    Y1 = __lasx_xvpickve2gr_w(vec_y1, t1); \
    Y2 = __lasx_xvpickve2gr_w(vec_y2, t2); \
    U  = __lasx_xvpickve2gr_w(vec_u, t3); \
    V  = __lasx_xvpickve2gr_w(vec_v, t4); \
    r  = c->table_rV[V]; \
    g  = (c->table_gU[U] + c->table_gV[V]); \
    b  = c->table_bU[U]; \
    yuv2rgb_write(dest, count, Y1, Y2, 0, 0, \
                  r, g, b, y, target, 0); \
                        const int16_t **lumSrc, int lumFilterSize,
                        const int16_t *chrFilter, const int16_t **chrUSrc,
                        const int16_t **chrVSrc, int chrFilterSize,
                        const int16_t **alpSrc, uint8_t *dest, int dstW,

    int len_count = (dstW + 1) >> 1;
    const void *r, *g, *b;
    __m256i headroom = __lasx_xvreplgr2vr_w(head);

    for (i = 0; i < len; i++) {
        int Y1, Y2, U, V, count_lum = count << 1;
        __m256i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, v_src2;
        __m256i yl1_ev, yl1_od, yh1_ev, yh1_od, yl2_ev, yl2_od, yh2_ev, yh2_od;
        __m256i u1_ev, u1_od, v1_ev, v1_od, u2_ev, u2_od, v2_ev, v2_od, temp;

        yl1_ev = __lasx_xvldrepl_w(&t, 0);

        for (j = 0; j < lumFilterSize; j++) {
            const int16_t *src_lum = lumSrc[j] + count_lum;
            temp = __lasx_xvldrepl_h((lumFilter + j), 0);
            DUP4_ARG2(__lasx_xvld, src_lum, 0, src_lum, 32, src_lum, 64,
                      src_lum, 96, l_src1, l_src2, l_src3, l_src4);

            yl1_ev = __lasx_xvmaddwev_w_h(yl1_ev, temp, l_src1);
            yl1_od = __lasx_xvmaddwod_w_h(yl1_od, temp, l_src1);
            yh1_ev = __lasx_xvmaddwev_w_h(yh1_ev, temp, l_src2);
            yh1_od = __lasx_xvmaddwod_w_h(yh1_od, temp, l_src2);
            yl2_ev = __lasx_xvmaddwev_w_h(yl2_ev, temp, l_src3);
            yl2_od = __lasx_xvmaddwod_w_h(yl2_od, temp, l_src3);
            yh2_ev = __lasx_xvmaddwev_w_h(yh2_ev, temp, l_src4);
            yh2_od = __lasx_xvmaddwod_w_h(yh2_od, temp, l_src4);

        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrUSrc[j] + count, 32,
            DUP2_ARG2(__lasx_xvld, chrVSrc[j] + count, 0, chrVSrc[j] + count, 32,
            temp = __lasx_xvldrepl_h((chrFilter + j), 0);
            u1_ev = __lasx_xvmaddwev_w_h(u1_ev, temp, u_src1);
            u1_od = __lasx_xvmaddwod_w_h(u1_od, temp, u_src1);
            v1_ev = __lasx_xvmaddwev_w_h(v1_ev, temp, v_src1);
            v1_od = __lasx_xvmaddwod_w_h(v1_od, temp, v_src1);
            u2_ev = __lasx_xvmaddwev_w_h(u2_ev, temp, u_src2);
            u2_od = __lasx_xvmaddwod_w_h(u2_od, temp, u_src2);
            v2_ev = __lasx_xvmaddwev_w_h(v2_ev, temp, v_src2);
            v2_od = __lasx_xvmaddwod_w_h(v2_od, temp, v_src2);

        yl1_ev = __lasx_xvsrai_w(yl1_ev, 19);
        yh1_ev = __lasx_xvsrai_w(yh1_ev, 19);
        yl1_od = __lasx_xvsrai_w(yl1_od, 19);
        yh1_od = __lasx_xvsrai_w(yh1_od, 19);
        u1_ev = __lasx_xvsrai_w(u1_ev, 19);
        v1_ev = __lasx_xvsrai_w(v1_ev, 19);
        u1_od = __lasx_xvsrai_w(u1_od, 19);
        v1_od = __lasx_xvsrai_w(v1_od, 19);
        yl2_ev = __lasx_xvsrai_w(yl2_ev, 19);
        yh2_ev = __lasx_xvsrai_w(yh2_ev, 19);
        yl2_od = __lasx_xvsrai_w(yl2_od, 19);
        yh2_od = __lasx_xvsrai_w(yh2_od, 19);
        u2_ev = __lasx_xvsrai_w(u2_ev, 19);
        v2_ev = __lasx_xvsrai_w(v2_ev, 19);
        u2_od = __lasx_xvsrai_w(u2_od, 19);
        v2_od = __lasx_xvsrai_w(v2_od, 19);
        u1_ev = __lasx_xvadd_w(u1_ev, headroom);
        v1_ev = __lasx_xvadd_w(v1_ev, headroom);
        u1_od = __lasx_xvadd_w(u1_od, headroom);
        v1_od = __lasx_xvadd_w(v1_od, headroom);
        u2_ev = __lasx_xvadd_w(u2_ev, headroom);
        v2_ev = __lasx_xvadd_w(v2_ev, headroom);
        u2_od = __lasx_xvadd_w(u2_od, headroom);
        v2_od = __lasx_xvadd_w(v2_od, headroom);
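        /* The >> 19 shifts above scale the fixed-point accumulators back to
         * 8-bit range; the headroom offset re-biases U/V so they can index
         * the RGB lookup tables used by WRITE_YUV2RGB. */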
        int Y1, Y2, U, V, count_lum = count << 1;
        __m256i l_src1, l_src2, u_src, v_src;
        __m256i yl_ev, yl_od, yh_ev, yh_od;
        __m256i u_ev, u_od, v_ev, v_od, temp;

        yl_ev = __lasx_xvldrepl_w(&t, 0);

        for (j = 0; j < lumFilterSize; j++) {
            temp = __lasx_xvldrepl_h((lumFilter + j), 0);
            DUP2_ARG2(__lasx_xvld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
            yl_ev = __lasx_xvmaddwev_w_h(yl_ev, temp, l_src1);
            yl_od = __lasx_xvmaddwod_w_h(yl_od, temp, l_src1);
            yh_ev = __lasx_xvmaddwev_w_h(yh_ev, temp, l_src2);
            yh_od = __lasx_xvmaddwod_w_h(yh_od, temp, l_src2);

        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
            temp = __lasx_xvldrepl_h((chrFilter + j), 0);
            u_ev = __lasx_xvmaddwev_w_h(u_ev, temp, u_src);
            u_od = __lasx_xvmaddwod_w_h(u_od, temp, u_src);
            v_ev = __lasx_xvmaddwev_w_h(v_ev, temp, v_src);
            v_od = __lasx_xvmaddwod_w_h(v_od, temp, v_src);

        yl_ev = __lasx_xvsrai_w(yl_ev, 19);
        yh_ev = __lasx_xvsrai_w(yh_ev, 19);
        yl_od = __lasx_xvsrai_w(yl_od, 19);
        yh_od = __lasx_xvsrai_w(yh_od, 19);
        u_ev = __lasx_xvsrai_w(u_ev, 19);
        v_ev = __lasx_xvsrai_w(v_ev, 19);
        u_od = __lasx_xvsrai_w(u_od, 19);
        v_od = __lasx_xvsrai_w(v_od, 19);
        u_ev = __lasx_xvadd_w(u_ev, headroom);
        v_ev = __lasx_xvadd_w(v_ev, headroom);
        u_od = __lasx_xvadd_w(u_od, headroom);
        v_od = __lasx_xvadd_w(v_od, headroom);
        int count_lum = count << 1;
        __m256i l_src, u_src, v_src;
        __m256i y_ev, y_od, u, v, temp;

        y_ev = __lasx_xvldrepl_w(&t, 0);

        for (j = 0; j < lumFilterSize; j++) {
            temp = __lasx_xvldrepl_h((lumFilter + j), 0);
            l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
            y_ev = __lasx_xvmaddwev_w_h(y_ev, temp, l_src);
            y_od = __lasx_xvmaddwod_w_h(y_od, temp, l_src);

        for (j = 0; j < chrFilterSize; j++) {
            DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count,
            temp = __lasx_xvldrepl_h((chrFilter + j), 0);
            u_src = __lasx_vext2xv_w_h(u_src);
            v_src = __lasx_vext2xv_w_h(v_src);
            u = __lasx_xvmaddwev_w_h(u, temp, u_src);
            v = __lasx_xvmaddwev_w_h(v, temp, v_src);

        y_ev = __lasx_xvsrai_w(y_ev, 19);
        y_od = __lasx_xvsrai_w(y_od, 19);
        u = __lasx_xvsrai_w(u, 19);
        v = __lasx_xvsrai_w(v, 19);

        int count_lum = count << 1;
        __m256i l_src, u_src, v_src;
        __m256i y_ev, uv, temp;

        y_ev = __lasx_xvldrepl_w(&t, 0);

        for (j = 0; j < lumFilterSize; j++) {
            temp = __lasx_xvldrepl_h((lumFilter + j), 0);
            l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
            l_src = __lasx_vext2xv_w_h(l_src);
            y_ev = __lasx_xvmaddwev_w_h(y_ev, temp, l_src);

        for (j = 0; j < chrFilterSize; j++) {
            u_src = __lasx_xvldrepl_d((chrUSrc[j] + count), 0);
            v_src = __lasx_xvldrepl_d((chrVSrc[j] + count), 0);
            temp = __lasx_xvldrepl_h((chrFilter + j), 0);
            u_src = __lasx_xvilvl_d(v_src, u_src);
            u_src = __lasx_vext2xv_w_h(u_src);
            uv = __lasx_xvmaddwev_w_h(uv, temp, u_src);

        y_ev = __lasx_xvsrai_w(y_ev, 19);
        uv = __lasx_xvsrai_w(uv, 19);
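    /* Scalar tail: accumulate whatever pixels remain with plain integer
     * multiply-adds before the table-driven RGB write. */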
    for (; count < len_count; count++) {
        for (j = 0; j < lumFilterSize; j++) {
            Y1 += lumSrc[j][count * 2] * lumFilter[j];
            Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j];

        for (j = 0; j < chrFilterSize; j++) {
            U += chrUSrc[j][count] * chrFilter[j];
            V += chrVSrc[j][count] * chrFilter[j];

                      r, g, b, y, target, 0);
                        const int16_t *ubuf[2], const int16_t *vbuf[2],
                        const int16_t *abuf[2], uint8_t *dest, int dstW,
                        int yalpha, int uvalpha, int y,

    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
    int yalpha1 = 4096 - yalpha;
    int uvalpha1 = 4096 - uvalpha;
    int len_count = (dstW + 1) >> 1;
    const void *r, *g, *b;
    __m256i v_yalpha1 = __lasx_xvreplgr2vr_w(yalpha1);
    __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
    __m256i v_yalpha = __lasx_xvreplgr2vr_w(yalpha);
    __m256i v_uvalpha = __lasx_xvreplgr2vr_w(uvalpha);
    __m256i headroom = __lasx_xvreplgr2vr_w(head);

    for (i = 0; i < len; i += 16) {
        int c_dex = count << 1;
        __m256i y0_h, y0_l, y0, u0, v0;
        __m256i y1_h, y1_l, y1, u1, v1;
        __m256i y_l, y_h, u, v;

        DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
                  buf1, i_dex, y0, u0, v0, y1);
        DUP2_ARG2(__lasx_xvldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
        DUP2_ARG2(__lasx_xvsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
        DUP2_ARG1(__lasx_xvexth_w_h, y0, y1, y0_h, y1_h);
        DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
        y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
        y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
        u0 = __lasx_xvmul_w(u0, v_uvalpha1);
        v0 = __lasx_xvmul_w(v0, v_uvalpha1);
        y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
        y_h = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
        u = __lasx_xvmadd_w(u0, v_uvalpha, u1);
        v = __lasx_xvmadd_w(v0, v_uvalpha, v1);
        y_l = __lasx_xvsrai_w(y_l, 19);
        y_h = __lasx_xvsrai_w(y_h, 19);
        u = __lasx_xvsrai_w(u, 19);
        v = __lasx_xvsrai_w(v, 19);

        __m256i y0_l, y0, u0, v0;
        __m256i y1_l, y1, u1, v1;

        y0 = __lasx_xvldx(buf0, i_dex);
        u0 = __lasx_xvldrepl_d((ubuf0 + count), 0);
        v0 = __lasx_xvldrepl_d((vbuf0 + count), 0);
        y1 = __lasx_xvldx(buf1, i_dex);
        u1 = __lasx_xvldrepl_d((ubuf1 + count), 0);
        v1 = __lasx_xvldrepl_d((vbuf1 + count), 0);
        DUP2_ARG1(__lasx_vext2xv_w_h, y0, y1, y0_l, y1_l);
        DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
        y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
        u0 = __lasx_xvmul_w(u0, v_uvalpha1);
        v0 = __lasx_xvmul_w(v0, v_uvalpha1);
        y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
        u = __lasx_xvmadd_w(u0, v_uvalpha, u1);
        v = __lasx_xvmadd_w(v0, v_uvalpha, v1);
        y_l = __lasx_xvsrai_w(y_l, 19);
        u = __lasx_xvsrai_w(u, 19);
        v = __lasx_xvsrai_w(v, 19);
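    /* Scalar tail: blend the remaining pixel pairs with the same
     * yalpha/uvalpha fixed-point weights used by the vector blocks above. */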
    for (; count < len_count; count++) {
        int Y1 = (buf0[count * 2] * yalpha1 +
                  buf1[count * 2] * yalpha) >> 19;
        int Y2 = (buf0[count * 2 + 1] * yalpha1 +
                  buf1[count * 2 + 1] * yalpha) >> 19;
        int U = (ubuf0[count] * uvalpha1 + ubuf1[count] * uvalpha) >> 19;
        int V = (vbuf0[count] * uvalpha1 + vbuf1[count] * uvalpha) >> 19;

                      r, g, b, y, target, 0);
                        const int16_t *ubuf[2], const int16_t *vbuf[2],
                        const int16_t *abuf0, uint8_t *dest, int dstW,

    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
    int len = (dstW - 15);
    int len_count = (dstW + 1) >> 1;
    const void *r, *g, *b;
    __m256i headroom = __lasx_xvreplgr2vr_h(head);

    for (i = 0; i < len; i += 16) {
        int c_dex = count << 1;
        __m256i src_y, src_u, src_v;
        __m256i u, v, y_l, y_h;

        DUP2_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, src_y, src_u);
        src_v = __lasx_xvldx(vbuf0, c_dex);
        src_u = __lasx_xvpermi_q(src_u, src_v, 0x02);
        src_y = __lasx_xvsrari_h(src_y, 7);
        src_u = __lasx_xvsrari_h(src_u, 7);
        y_l = __lasx_xvsllwil_w_h(src_y, 0);
        y_h = __lasx_xvexth_w_h(src_y);
        u = __lasx_xvaddwev_w_h(src_u, headroom);
        v = __lasx_xvaddwod_w_h(src_u, headroom);

        __m256i src_y, src_u, src_v;

        src_y = __lasx_xvldx(buf0, i_dex);
        src_u = __lasx_xvldrepl_d((ubuf0 + count), 0);
        src_v = __lasx_xvldrepl_d((vbuf0 + count), 0);
        src_u = __lasx_xvilvl_d(src_v, src_u);
        y_l = __lasx_xvsrari_h(src_y, 7);
        uv = __lasx_xvsrari_h(src_u, 7);
        y_l = __lasx_vext2xv_w_h(y_l);
        uv = __lasx_vext2xv_w_h(uv);
        uv = __lasx_xvaddwev_w_h(uv, headroom);

    for (; count < len_count; count++) {
        int Y1 = (buf0[count * 2 ] + 64) >> 7;
        int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
        int U = (ubuf0[count] + 64) >> 7;
        int V = (vbuf0[count] + 64) >> 7;

                      r, g, b, y, target, 0);
    const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
    int uvalpha1 = 4096 - uvalpha;
    __m256i headroom = __lasx_xvreplgr2vr_w(HEADROOM);
    __m256i uvalpha_tmp1 = __lasx_xvreplgr2vr_h(uvalpha1);
    __m256i uvalpha_tmp = __lasx_xvreplgr2vr_h(uvalpha);

    for (i = 0; i < len; i += 16) {
        int c_dex = count << 1;
        __m256i src_y, src_u0, src_v0, src_u1, src_v1;
        __m256i y_l, y_h, u, v, u_ev, v_od;

        DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
                  ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
        src_v1 = __lasx_xvldx(vbuf1, c_dex);
        src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
        src_u1 = __lasx_xvpermi_q(src_u1, src_v1, 0x02);
        src_y = __lasx_xvsrari_h(src_y, 7);
        u_ev = __lasx_xvmulwev_w_h(src_u0, uvalpha_tmp1);
        v_od = __lasx_xvmulwod_w_h(src_u0, uvalpha_tmp1);
        u = __lasx_xvmaddwev_w_h(u_ev, src_u1, uvalpha_tmp);
        v = __lasx_xvmaddwod_w_h(v_od, src_u1, uvalpha_tmp);
        y_l = __lasx_xvsllwil_w_h(src_y, 0);
        y_h = __lasx_xvexth_w_h(src_y);
        u = __lasx_xvsrari_w(u, 19);
        v = __lasx_xvsrari_w(v, 19);

    for (; count < len_count; count++) {
        int Y1 = (buf0[count * 2 ] + 64) >> 7;
        int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
        int U = (ubuf0[count] + ubuf1[count] + 128) >> 8;
        int V = (vbuf0[count] + vbuf1[count] + 128) >> 8;

                      r, g, b, y, target, 0);
#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
static void name ## ext ## _X_lasx(SwsInternal *c, const int16_t *lumFilter, \
                                   const int16_t **lumSrc, int lumFilterSize, \
                                   const int16_t *chrFilter, const int16_t **chrUSrc, \
                                   const int16_t **chrVSrc, int chrFilterSize, \
                                   const int16_t **alpSrc, uint8_t *dest, int dstW, \
    name ## base ## _X_template_lasx(c, lumFilter, lumSrc, lumFilterSize, \
                                     chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
                                     alpSrc, dest, dstW, y, fmt, hasAlpha); \

#define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \
YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
static void name ## ext ## _2_lasx(SwsInternal *c, const int16_t *buf[2], \
                                   const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                   const int16_t *abuf[2], uint8_t *dest, int dstW, \
                                   int yalpha, int uvalpha, int y) \
    name ## base ## _2_template_lasx(c, buf, ubuf, vbuf, abuf, dest, \
                                     dstW, yalpha, uvalpha, y, fmt, hasAlpha); \

#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \
static void name ## ext ## _1_lasx(SwsInternal *c, const int16_t *buf0, \
                                   const int16_t *ubuf[2], const int16_t *vbuf[2], \
                                   const int16_t *abuf0, uint8_t *dest, int dstW, \
                                   int uvalpha, int y) \
    name ## base ## _1_template_lasx(c, buf0, ubuf, vbuf, abuf0, dest, \
                                     dstW, uvalpha, y, fmt, hasAlpha); \
#if CONFIG_SWSCALE_ALPHA

    uint8_t *dest, int i, int R, int A, int G, int B,

    if ((R | G | B) & 0xC0000000) {

        dest[0] = hasAlpha ? A : 255;
        dest[3] = hasAlpha ? A : 255;
        dest[0] = hasAlpha ? A : 255;
        dest[3] = hasAlpha ? A : 255;

    switch (c->opts.dither) {
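    /* Error-diffusion dithering: the quantisation error of each pixel is
     * spread to its neighbours with the classic 7/16, 1/16, 5/16 and 3/16
     * weights before each component is reduced to the target bit depth. */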
        R += (7*err[0] + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2])>>4;
        G += (7*err[1] + 1*c->dither_error[1][i] + 5*c->dither_error[1][i+1] + 3*c->dither_error[1][i+2])>>4;
        B += (7*err[2] + 1*c->dither_error[2][i] + 5*c->dither_error[2][i+1] + 3*c->dither_error[2][i+2])>>4;
        c->dither_error[0][i] = err[0];
        c->dither_error[1][i] = err[1];
        c->dither_error[2][i] = err[2];
        r = R >> (isrgb8 ? 5 : 7);
        g = G >> (isrgb8 ? 5 : 6);
        b = B >> (isrgb8 ? 6 : 7);

        err[0] = R - r*(isrgb8 ? 36 : 255);
        err[1] = G - g*(isrgb8 ? 36 : 85);
        err[2] = B - b*(isrgb8 ? 85 : 255);
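/* A_DITHER/X_DITHER below derive a per-pixel dither offset arithmetically
 * from the pixel coordinates (u, v); they serve the non-error-diffusion
 * dither modes of the low-depth outputs. */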
#define A_DITHER(u,v) (((((u)+((v)*236))*119)&0xff))
#define X_DITHER(u,v) (((((u)^((v)*237))*181)&0x1ff)/2)
        dest[0] = r + 2*g + 8*b;
        dest[0] = b + 2*g + 8*r;
        dest[0] = r + 8*g + 64*b;
        dest[0] = b + 4*g + 32*r;
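/* Helper macros for the full-chroma path: YUV2RGB_SETUP splats the
 * per-context YUV->RGB coefficients into LASX vectors, YUV2RGB converts a
 * vector of Y/U/V into R/G/B with those coefficients, and WRITE_FULL_A /
 * WRITE_FULL extract one pixel from lane t1 and hand it (with or without
 * alpha) to yuv2rgb_write_full() at x position i + s. */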
#define YUV2RGB_SETUP \
    int y_offset = c->yuv2rgb_y_offset; \
    int y_coeff = c->yuv2rgb_y_coeff; \
    int v2r_coe = c->yuv2rgb_v2r_coeff; \
    int v2g_coe = c->yuv2rgb_v2g_coeff; \
    int u2g_coe = c->yuv2rgb_u2g_coeff; \
    int u2b_coe = c->yuv2rgb_u2b_coeff; \
    __m256i offset = __lasx_xvreplgr2vr_w(y_offset); \
    __m256i coeff = __lasx_xvreplgr2vr_w(y_coeff); \
    __m256i v2r = __lasx_xvreplgr2vr_w(v2r_coe); \
    __m256i v2g = __lasx_xvreplgr2vr_w(v2g_coe); \
    __m256i u2g = __lasx_xvreplgr2vr_w(u2g_coe); \
    __m256i u2b = __lasx_xvreplgr2vr_w(u2b_coe); \

#define YUV2RGB(y, u, v, R, G, B, offset, coeff, \
                y_temp, v2r, v2g, u2g, u2b) \
    y = __lasx_xvsub_w(y, offset); \
    y = __lasx_xvmul_w(y, coeff); \
    y = __lasx_xvadd_w(y, y_temp); \
    R = __lasx_xvmadd_w(y, v, v2r); \
    v = __lasx_xvmadd_w(y, v, v2g); \
    G = __lasx_xvmadd_w(v, u, u2g); \
    B = __lasx_xvmadd_w(y, u, u2b); \

#define WRITE_FULL_A(r, g, b, a, t1, s) \
    R = __lasx_xvpickve2gr_w(r, t1); \
    G = __lasx_xvpickve2gr_w(g, t1); \
    B = __lasx_xvpickve2gr_w(b, t1); \
    A = __lasx_xvpickve2gr_w(a, t1); \
    A = av_clip_uint8(A); \
    yuv2rgb_write_full(c, dest, i + s, R, A, G, B, y, target, hasAlpha, err); \

#define WRITE_FULL(r, g, b, t1, s) \
    R = __lasx_xvpickve2gr_w(r, t1); \
    G = __lasx_xvpickve2gr_w(g, t1); \
    B = __lasx_xvpickve2gr_w(b, t1); \
    yuv2rgb_write_full(c, dest, i + s, R, 0, G, B, y, target, hasAlpha, err); \
                             const int16_t **lumSrc, int lumFilterSize,
                             const int16_t *chrFilter, const int16_t **chrUSrc,
                             const int16_t **chrVSrc, int chrFilterSize,
                             const int16_t **alpSrc, uint8_t *dest,

    int i, j, B, G, R, A;
    int a_temp = 1 << 18;
    int tempc = templ - (128 << 19);
    int ytemp = 1 << 21;
    int len = dstW - 15;
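    /* The main loop below produces 16 pixels per iteration, so it stops 15
     * short of dstW; an 8-wide block and a scalar loop pick up the rest. */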
    __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);

    for (i = 0; i < len; i += 16) {
        __m256i l_src, u_src, v_src;
        __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od, temp;
        __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;

        y_ev = y_od = __lasx_xvreplgr2vr_w(templ);
        u_ev = u_od = v_ev = v_od = __lasx_xvreplgr2vr_w(tempc);
        for (j = 0; j < lumFilterSize; j++) {
            temp = __lasx_xvldrepl_h((lumFilter + j), 0);
            l_src = __lasx_xvldx(lumSrc[j], n);
            y_ev = __lasx_xvmaddwev_w_h(y_ev, l_src, temp);
            y_od = __lasx_xvmaddwod_w_h(y_od, l_src, temp);

        for (j = 0; j < chrFilterSize; j++) {
            temp = __lasx_xvldrepl_h((chrFilter + j), 0);
            DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n,
                      v_src, temp, u_ev, v_ev);
                      v_src, temp, u_od, v_od);

        y_ev = __lasx_xvsrai_w(y_ev, 10);
        y_od = __lasx_xvsrai_w(y_od, 10);
        u_ev = __lasx_xvsrai_w(u_ev, 10);
        u_od = __lasx_xvsrai_w(u_od, 10);
        v_ev = __lasx_xvsrai_w(v_ev, 10);
        v_od = __lasx_xvsrai_w(v_od, 10);
                y_temp, v2r, v2g, u2g, u2b);
                y_temp, v2r, v2g, u2g, u2b);

            __m256i a_src, a_ev, a_od;

            a_ev = a_od = __lasx_xvreplgr2vr_w(a_temp);
            for (j = 0; j < lumFilterSize; j++) {
                temp = __lasx_xvldrepl_h(lumFilter + j, 0);
                a_src = __lasx_xvldx(alpSrc[j], n);
                a_ev = __lasx_xvmaddwev_w_h(a_ev, a_src, temp);
                a_od = __lasx_xvmaddwod_w_h(a_od, a_src, temp);

            a_ev = __lasx_xvsrai_w(a_ev, 19);
            a_od = __lasx_xvsrai_w(a_od, 19);
    if (dstW - i >= 8) {
        __m256i l_src, u_src, v_src;
        __m256i y_ev, u_ev, v_ev, uv, temp;
        __m256i R_ev, G_ev, B_ev;

        y_ev = __lasx_xvreplgr2vr_w(templ);
        u_ev = v_ev = __lasx_xvreplgr2vr_w(tempc);
        for (j = 0; j < lumFilterSize; j++) {
            temp = __lasx_xvldrepl_h((lumFilter + j), 0);
            l_src = __lasx_xvldx(lumSrc[j], n);
            l_src = __lasx_xvpermi_d(l_src, 0xD8);
            l_src = __lasx_xvilvl_h(l_src, l_src);
            y_ev = __lasx_xvmaddwev_w_h(y_ev, l_src, temp);

        for (j = 0; j < chrFilterSize; j++) {
            temp = __lasx_xvldrepl_h((chrFilter + j), 0);
            DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n, u_src, v_src);
            u_src = __lasx_xvpermi_d(u_src, 0xD8);
            v_src = __lasx_xvpermi_d(v_src, 0xD8);
            uv = __lasx_xvilvl_h(v_src, u_src);
            u_ev = __lasx_xvmaddwev_w_h(u_ev, uv, temp);
            v_ev = __lasx_xvmaddwod_w_h(v_ev, uv, temp);

        y_ev = __lasx_xvsrai_w(y_ev, 10);
        u_ev = __lasx_xvsrai_w(u_ev, 10);
        v_ev = __lasx_xvsrai_w(v_ev, 10);
                y_temp, v2r, v2g, u2g, u2b);

            __m256i a_src, a_ev;

            a_ev = __lasx_xvreplgr2vr_w(a_temp);
            for (j = 0; j < lumFilterSize; j++) {
                temp = __lasx_xvldrepl_h(lumFilter + j, 0);
                a_src = __lasx_xvldx(alpSrc[j], n);
                a_src = __lasx_xvpermi_d(a_src, 0xD8);
                a_src = __lasx_xvilvl_h(a_src, a_src);
                a_ev = __lasx_xvmaddwev_w_h(a_ev, a_src, temp);

            a_ev = __lasx_xvsrai_w(a_ev, 19);

    for (; i < dstW; i++) {
        int V, U = V = tempc;

        for (j = 0; j < lumFilterSize; j++) {
            Y += lumSrc[j][i] * lumFilter[j];

        for (j = 0; j < chrFilterSize; j++) {
            U += chrUSrc[j][i] * chrFilter[j];
            V += chrVSrc[j][i] * chrFilter[j];

            for (j = 0; j < lumFilterSize; j++) {
                A += alpSrc[j][i] * lumFilter[j];

        R = (unsigned)Y + V * v2r_coe;
        G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
        B = (unsigned)Y + U * u2b_coe;
        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);

    c->dither_error[0][i] = err[0];
    c->dither_error[1][i] = err[1];
    c->dither_error[2][i] = err[2];
                             const int16_t *ubuf[2], const int16_t *vbuf[2],
                             const int16_t *abuf[2], uint8_t *dest, int dstW,
                             int yalpha, int uvalpha, int y,

    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
                  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
                  *abuf0 = hasAlpha ? abuf[0] : NULL,
                  *abuf1 = hasAlpha ? abuf[1] : NULL;
    int yalpha1 = 4096 - yalpha;
    int uvalpha1 = 4096 - uvalpha;
    int uvtemp = 128 << 19;
    int atemp = 1 << 18;
    int ytemp = 1 << 21;
    int len = dstW - 15;
    __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
    __m256i v_yalpha1 = __lasx_xvreplgr2vr_w(yalpha1);
    __m256i v_uvalpha = __lasx_xvreplgr2vr_w(uvalpha);
    __m256i v_yalpha = __lasx_xvreplgr2vr_w(yalpha);
    __m256i uv = __lasx_xvreplgr2vr_w(uvtemp);
    __m256i a_bias = __lasx_xvreplgr2vr_w(atemp);
    __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);

    for (i = 0; i < len; i += 16) {
        __m256i b0, b1, ub0, ub1, vb0, vb1;
        __m256i y0_l, y0_h, y1_l, y1_h, u0_l, u0_h;
        __m256i v0_l, v0_h, u1_l, u1_h, v1_l, v1_h;
        __m256i y_l, y_h, v_l, v_h, u_l, u_h;
        __m256i R_l, R_h, G_l, G_h, B_l, B_h;

        DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0,
                  n, ubuf1, n, b0, b1, ub0, ub1);
        DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0, vb1);
        DUP4_ARG2(__lasx_xvsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
                  u0_l, u1_l, v0_l, v1_l);
        DUP4_ARG1(__lasx_xvexth_w_h, ub0, ub1, vb0, vb1,
                  u0_h, u1_h, v0_h, v1_h);
        y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
        y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
        u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
        u0_h = __lasx_xvmul_w(u0_h, v_uvalpha1);
        v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
        v0_h = __lasx_xvmul_w(v0_h, v_uvalpha1);
        y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
        y_h = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
        u_l = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
        u_h = __lasx_xvmadd_w(u0_h, v_uvalpha, u1_h);
        v_l = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
        v_h = __lasx_xvmadd_w(v0_h, v_uvalpha, v1_h);
        u_l = __lasx_xvsub_w(u_l, uv);
        u_h = __lasx_xvsub_w(u_h, uv);
        v_l = __lasx_xvsub_w(v_l, uv);
        v_h = __lasx_xvsub_w(v_h, uv);
        y_l = __lasx_xvsrai_w(y_l, 10);
        y_h = __lasx_xvsrai_w(y_h, 10);
        u_l = __lasx_xvsrai_w(u_l, 10);
        u_h = __lasx_xvsrai_w(u_h, 10);
        v_l = __lasx_xvsrai_w(v_l, 10);
        v_h = __lasx_xvsrai_w(v_h, 10);
                y_temp, v2r, v2g, u2g, u2b);
                y_temp, v2r, v2g, u2g, u2b);

            __m256i a0, a1, a0_l, a0_h;
            __m256i a_l, a_h, a1_l, a1_h;

            a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1);
            a_h = __lasx_xvmadd_w(a_bias, a0_h, v_yalpha1);
            a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
            a_h = __lasx_xvmadd_w(a_h, v_yalpha, a1_h);
            a_l = __lasx_xvsrai_w(a_l, 19);
            a_h = __lasx_xvsrai_w(a_h, 19);
    if (dstW - i >= 8) {
        __m256i b0, b1, ub0, ub1, vb0, vb1;
        __m256i y0_l, y1_l, u0_l;
        __m256i v0_l, u1_l, v1_l;
        __m256i y_l, u_l, v_l;
        __m256i R_l, G_l, B_l;

        DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0, n,
                  ubuf1, n, b0, b1, ub0, ub1);
        DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0, vb1);
        DUP4_ARG1(__lasx_vext2xv_w_h, ub0, ub1, vb0, vb1,
                  u0_l, u1_l, v0_l, v1_l);
        y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
        u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
        v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
        y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
        u_l = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
        v_l = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
        u_l = __lasx_xvsub_w(u_l, uv);
        v_l = __lasx_xvsub_w(v_l, uv);
        y_l = __lasx_xvsrai_w(y_l, 10);
        u_l = __lasx_xvsrai_w(u_l, 10);
        v_l = __lasx_xvsrai_w(v_l, 10);
                y_temp, v2r, v2g, u2g, u2b);

            __m256i a0, a1, a0_l;

            a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1);
            a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
            a_l = __lasx_xvsrai_w(a_l, 19);

    for (; i < dstW; i++) {
        int Y = ( buf0[i] * yalpha1 + buf1[i] * yalpha ) >> 10;
        int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha - uvtemp) >> 10;
        int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha - uvtemp) >> 10;

            A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha + atemp) >> 19;

        R = (unsigned)Y + V * v2r_coe;
        G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
        B = (unsigned)Y + U * u2b_coe;
        yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);

    c->dither_error[0][i] = err[0];
    c->dither_error[1][i] = err[1];
    c->dither_error[2][i] = err[2];
                             const int16_t *ubuf[2], const int16_t *vbuf[2],
                             const int16_t *abuf0, uint8_t *dest, int dstW,

    const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
    int ytemp = 1 << 21;
    int len = dstW - 15;
    __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);

    if (uvalpha < 2048) {
        int uvtemp = 128 << 7;
        __m256i uv = __lasx_xvreplgr2vr_w(uvtemp);
        __m256i bias = __lasx_xvreplgr2vr_w(bias_int);

        for (i = 0; i < len; i += 16) {
            __m256i b, ub, vb, ub_l, ub_h, vb_l, vb_h;
            __m256i y_l, y_h, u_l, u_h, v_l, v_h;
            __m256i R_l, R_h, G_l, G_h, B_l, B_h;

            vb = __lasx_xvldx(vbuf0, n);
            y_l = __lasx_xvsllwil_w_h(b, 2);
            y_h = __lasx_xvexth_w_h(b);
            DUP2_ARG2(__lasx_xvsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
            y_h = __lasx_xvslli_w(y_h, 2);
            u_l = __lasx_xvsub_w(ub_l, uv);
            u_h = __lasx_xvsub_w(ub_h, uv);
            v_l = __lasx_xvsub_w(vb_l, uv);
            v_h = __lasx_xvsub_w(vb_h, uv);
            u_l = __lasx_xvslli_w(u_l, 2);
            u_h = __lasx_xvslli_w(u_h, 2);
            v_l = __lasx_xvslli_w(v_l, 2);
            v_h = __lasx_xvslli_w(v_h, 2);
                    y_temp, v2r, v2g, u2g, u2b);
                    y_temp, v2r, v2g, u2g, u2b);

                a_src = __lasx_xvld(abuf0 + i, 0);
                a_l = __lasx_xvsllwil_w_h(a_src, 0);
                a_h = __lasx_xvexth_w_h(a_src);
                a_l = __lasx_xvadd_w(a_l, bias);
                a_h = __lasx_xvadd_w(a_h, bias);
                a_l = __lasx_xvsrai_w(a_l, 7);
                a_h = __lasx_xvsrai_w(a_h, 7);
        if (dstW - i >= 8) {
            __m256i b, ub, vb, ub_l, vb_l;
            __m256i y_l, u_l, v_l;
            __m256i R_l, G_l, B_l;

            vb = __lasx_xvldx(vbuf0, n);
            y_l = __lasx_vext2xv_w_h(b);
            DUP2_ARG1(__lasx_vext2xv_w_h, ub, vb, ub_l, vb_l);
            y_l = __lasx_xvslli_w(y_l, 2);
            u_l = __lasx_xvsub_w(ub_l, uv);
            v_l = __lasx_xvsub_w(vb_l, uv);
            u_l = __lasx_xvslli_w(u_l, 2);
            v_l = __lasx_xvslli_w(v_l, 2);
                    y_temp, v2r, v2g, u2g, u2b);

                a_src = __lasx_xvldx(abuf0, n);
                a_src = __lasx_vext2xv_w_h(a_src);
                a_l = __lasx_xvadd_w(bias, a_src);
                a_l = __lasx_xvsrai_w(a_l, 7);

        for (; i < dstW; i++) {
            int Y = buf0[i] << 2;
            int U = (ubuf0[i] - uvtemp) << 2;
            int V = (vbuf0[i] - uvtemp) << 2;

                A = (abuf0[i] + 64) >> 7;

            R = (unsigned)Y + V * v2r_coe;
            G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
            B = (unsigned)Y + U * u2b_coe;
            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
        const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
        int uvtemp = 128 << 8;
        __m256i uv = __lasx_xvreplgr2vr_w(uvtemp);
        __m256i zero = __lasx_xvldi(0);
        __m256i bias = __lasx_xvreplgr2vr_h(bias_int);

        for (i = 0; i < len; i += 16) {
            __m256i b, ub0, ub1, vb0, vb1;
            __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od;
            __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;

            DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
                      ubuf1, n, b, ub0, vb0, ub1);
            vb1 = __lasx_xvldx(vbuf1, n);
            y_ev = __lasx_xvaddwev_w_h(b, zero);
            y_od = __lasx_xvaddwod_w_h(b, zero);
            DUP2_ARG2(__lasx_xvaddwev_w_h, ub0, vb0, ub1, vb1, u_ev, v_ev);
            DUP2_ARG2(__lasx_xvaddwod_w_h, ub0, vb0, ub1, vb1, u_od, v_od);
            DUP2_ARG2(__lasx_xvslli_w, y_ev, 2, y_od, 2, y_ev, y_od);
            DUP4_ARG2(__lasx_xvsub_w, u_ev, uv, u_od, uv, v_ev, uv, v_od, uv,
                      u_ev, u_od, v_ev, v_od);
            DUP4_ARG2(__lasx_xvslli_w, u_ev, 1, u_od, 1, v_ev, 1, v_od, 1,
                      u_ev, u_od, v_ev, v_od);
                    y_temp, v2r, v2g, u2g, u2b);
                    y_temp, v2r, v2g, u2g, u2b);

                a_src = __lasx_xvld(abuf0 + i, 0);
                a_ev = __lasx_xvaddwev_w_h(bias, a_src);
                a_od = __lasx_xvaddwod_w_h(bias, a_src);
                a_ev = __lasx_xvsrai_w(a_ev, 7);
                a_od = __lasx_xvsrai_w(a_od, 7);

        if (dstW - i >= 8) {
            __m256i b, ub0, ub1, vb0, vb1;
            __m256i y_l, u_l, v_l;
            __m256i R_l, G_l, B_l;

            DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
                      ubuf1, n, b, ub0, vb0, ub1);
            vb1 = __lasx_xvldx(vbuf1, n);
            y_l = __lasx_vext2xv_w_h(b);
            y_l = __lasx_xvslli_w(y_l, 2);
            DUP4_ARG1(__lasx_vext2xv_w_h, ub0, vb0, ub1, vb1,
                      ub0, vb0, ub1, vb1);
            DUP2_ARG2(__lasx_xvadd_w, ub0, ub1, vb0, vb1, u_l, v_l);
            u_l = __lasx_xvsub_w(u_l, uv);
            v_l = __lasx_xvsub_w(v_l, uv);
            u_l = __lasx_xvslli_w(u_l, 1);
            v_l = __lasx_xvslli_w(v_l, 1);
                    y_temp, v2r, v2g, u2g, u2b);

                a_src = __lasx_xvld(abuf0 + i, 0);
                a_src = __lasx_xvpermi_d(a_src, 0xD8);
                a_src = __lasx_xvilvl_h(a_src, a_src);
                a_l = __lasx_xvaddwev_w_h(bias, a_src);
                a_l = __lasx_xvsrai_w(a_l, 7);

        for (; i < dstW; i++) {
            int Y = buf0[i] << 2;
            int U = (ubuf0[i] + ubuf1[i] - uvtemp) << 1;
            int V = (vbuf0[i] + vbuf1[i] - uvtemp) << 1;

                A = (abuf0[i] + 64) >> 7;

            R = (unsigned)Y + V * v2r_coe;
            G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
            B = (unsigned)Y + U * u2b_coe;
            yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);

    c->dither_error[0][i] = err[0];
    c->dither_error[1][i] = err[1];
    c->dither_error[2][i] = err[2];
                CONFIG_SWSCALE_ALPHA && c->needAlpha)
                CONFIG_SWSCALE_ALPHA && c->needAlpha)
                CONFIG_SWSCALE_ALPHA && c->needAlpha)
                CONFIG_SWSCALE_ALPHA && c->needAlpha)
#if CONFIG_SWSCALE_ALPHA

    } else if (is16BPS(dstFormat)) {
    } else if (isNBPS(dstFormat)) {

        *yuv2plane1 = yuv2plane1_8_lasx;

        switch (c->opts.dst_format) {
            c->yuv2packedX = yuv2rgba32_full_X_lasx;
            c->yuv2packed2 = yuv2rgba32_full_2_lasx;
            c->yuv2packed1 = yuv2rgba32_full_1_lasx;
#if CONFIG_SWSCALE_ALPHA
            c->yuv2packedX = yuv2rgba32_full_X_lasx;
            c->yuv2packed2 = yuv2rgba32_full_2_lasx;
            c->yuv2packed1 = yuv2rgba32_full_1_lasx;
            c->yuv2packedX = yuv2rgbx32_full_X_lasx;
            c->yuv2packed2 = yuv2rgbx32_full_2_lasx;
            c->yuv2packed1 = yuv2rgbx32_full_1_lasx;
            c->yuv2packedX = yuv2argb32_full_X_lasx;
            c->yuv2packed2 = yuv2argb32_full_2_lasx;
            c->yuv2packed1 = yuv2argb32_full_1_lasx;
#if CONFIG_SWSCALE_ALPHA
            c->yuv2packedX = yuv2argb32_full_X_lasx;
            c->yuv2packed2 = yuv2argb32_full_2_lasx;
            c->yuv2packed1 = yuv2argb32_full_1_lasx;
            c->yuv2packedX = yuv2xrgb32_full_X_lasx;
            c->yuv2packed2 = yuv2xrgb32_full_2_lasx;
            c->yuv2packed1 = yuv2xrgb32_full_1_lasx;
            c->yuv2packedX = yuv2bgra32_full_X_lasx;
            c->yuv2packed2 = yuv2bgra32_full_2_lasx;
            c->yuv2packed1 = yuv2bgra32_full_1_lasx;
#if CONFIG_SWSCALE_ALPHA
            c->yuv2packedX = yuv2bgra32_full_X_lasx;
            c->yuv2packed2 = yuv2bgra32_full_2_lasx;
            c->yuv2packed1 = yuv2bgra32_full_1_lasx;
            c->yuv2packedX = yuv2bgrx32_full_X_lasx;
            c->yuv2packed2 = yuv2bgrx32_full_2_lasx;
            c->yuv2packed1 = yuv2bgrx32_full_1_lasx;
            c->yuv2packedX = yuv2abgr32_full_X_lasx;
            c->yuv2packed2 = yuv2abgr32_full_2_lasx;
            c->yuv2packed1 = yuv2abgr32_full_1_lasx;
#if CONFIG_SWSCALE_ALPHA
            c->yuv2packedX = yuv2abgr32_full_X_lasx;
            c->yuv2packed2 = yuv2abgr32_full_2_lasx;
            c->yuv2packed1 = yuv2abgr32_full_1_lasx;
            c->yuv2packedX = yuv2xbgr32_full_X_lasx;
            c->yuv2packed2 = yuv2xbgr32_full_2_lasx;
            c->yuv2packed1 = yuv2xbgr32_full_1_lasx;
            c->yuv2packedX = yuv2rgb24_full_X_lasx;
            c->yuv2packed2 = yuv2rgb24_full_2_lasx;
            c->yuv2packed1 = yuv2rgb24_full_1_lasx;
            c->yuv2packedX = yuv2bgr24_full_X_lasx;
            c->yuv2packed2 = yuv2bgr24_full_2_lasx;
            c->yuv2packed1 = yuv2bgr24_full_1_lasx;
            c->yuv2packedX = yuv2bgr4_byte_full_X_lasx;
            c->yuv2packed2 = yuv2bgr4_byte_full_2_lasx;
            c->yuv2packed1 = yuv2bgr4_byte_full_1_lasx;
            c->yuv2packedX = yuv2rgb4_byte_full_X_lasx;
            c->yuv2packed2 = yuv2rgb4_byte_full_2_lasx;
            c->yuv2packed1 = yuv2rgb4_byte_full_1_lasx;
            c->yuv2packedX = yuv2bgr8_full_X_lasx;
            c->yuv2packed2 = yuv2bgr8_full_2_lasx;
            c->yuv2packed1 = yuv2bgr8_full_1_lasx;
            c->yuv2packedX = yuv2rgb8_full_X_lasx;
            c->yuv2packed2 = yuv2rgb8_full_2_lasx;
            c->yuv2packed1 = yuv2rgb8_full_1_lasx;

        switch (c->opts.dst_format) {
#if CONFIG_SWSCALE_ALPHA
            c->yuv2packed1 = yuv2rgbx32_1_lasx;
            c->yuv2packed2 = yuv2rgbx32_2_lasx;
            c->yuv2packedX = yuv2rgbx32_X_lasx;
#if CONFIG_SWSCALE_ALPHA
            c->yuv2packed1 = yuv2rgbx32_1_1_lasx;
            c->yuv2packed2 = yuv2rgbx32_1_2_lasx;
            c->yuv2packedX = yuv2rgbx32_1_X_lasx;
            c->yuv2packed1 = yuv2rgb24_1_lasx;
            c->yuv2packed2 = yuv2rgb24_2_lasx;
            c->yuv2packedX = yuv2rgb24_X_lasx;
            c->yuv2packed1 = yuv2bgr24_1_lasx;
            c->yuv2packed2 = yuv2bgr24_2_lasx;
            c->yuv2packedX = yuv2bgr24_X_lasx;
            c->yuv2packed1 = yuv2rgb16_1_lasx;
            c->yuv2packed2 = yuv2rgb16_2_lasx;
            c->yuv2packedX = yuv2rgb16_X_lasx;
            c->yuv2packed1 = yuv2rgb15_1_lasx;
            c->yuv2packed2 = yuv2rgb15_2_lasx;
            c->yuv2packedX = yuv2rgb15_X_lasx;
            c->yuv2packed1 = yuv2rgb12_1_lasx;
            c->yuv2packed2 = yuv2rgb12_2_lasx;
            c->yuv2packedX = yuv2rgb12_X_lasx;
            c->yuv2packed1 = yuv2rgb8_1_lasx;
            c->yuv2packed2 = yuv2rgb8_2_lasx;
            c->yuv2packedX = yuv2rgb8_X_lasx;
            c->yuv2packed1 = yuv2rgb4_1_lasx;
            c->yuv2packed2 = yuv2rgb4_2_lasx;
            c->yuv2packedX = yuv2rgb4_X_lasx;
            c->yuv2packed1 = yuv2rgb4b_1_lasx;
            c->yuv2packed2 = yuv2rgb4b_2_lasx;
            c->yuv2packedX = yuv2rgb4b_X_lasx;