29 uint16_t *dstU = (uint16_t *)_dstU;
30 uint16_t *dstV = (uint16_t *)_dstV;
38 __m256i ru, gu, bu, rv, gv, bv;
39 __m256i
mask = {0x0D0C090805040100, 0x1D1C191815141110,
40 0x0D0C090805040100, 0x1D1C191815141110};
41 __m256i
temp = __lasx_xvreplgr2vr_w(
set);
42 __m256i sra = __lasx_xvreplgr2vr_w(
shift);
44 ru = __lasx_xvreplgr2vr_w(tem_ru);
45 gu = __lasx_xvreplgr2vr_w(tem_gu);
46 bu = __lasx_xvreplgr2vr_w(tem_bu);
47 rv = __lasx_xvreplgr2vr_w(tem_rv);
48 gv = __lasx_xvreplgr2vr_w(tem_gv);
49 bv = __lasx_xvreplgr2vr_w(tem_bv);
50 for (
i = 0;
i <
len;
i += 16) {
52 __m256i g_l, g_h, b_l, b_h, r_l, r_h;
53 __m256i v_l, v_h, u_l, u_h, u_lh, v_lh;
55 _g = __lasx_xvldx(
src0,
i);
56 _b = __lasx_xvldx(
src1,
i);
57 _r = __lasx_xvldx(
src2,
i);
58 g_l = __lasx_vext2xv_wu_bu(_g);
59 b_l = __lasx_vext2xv_wu_bu(_b);
60 r_l = __lasx_vext2xv_wu_bu(_r);
61 _g = __lasx_xvpermi_d(_g, 0x01);
62 _b = __lasx_xvpermi_d(_b, 0x01);
63 _r = __lasx_xvpermi_d(_r, 0x01);
64 g_h = __lasx_vext2xv_wu_bu(_g);
65 b_h = __lasx_vext2xv_wu_bu(_b);
66 r_h = __lasx_vext2xv_wu_bu(_r);
67 u_l = __lasx_xvmadd_w(
temp, ru, r_l);
68 u_h = __lasx_xvmadd_w(
temp, ru, r_h);
69 v_l = __lasx_xvmadd_w(
temp, rv, r_l);
70 v_h = __lasx_xvmadd_w(
temp, rv, r_h);
71 u_l = __lasx_xvmadd_w(u_l, gu, g_l);
72 u_l = __lasx_xvmadd_w(u_l, bu, b_l);
73 u_h = __lasx_xvmadd_w(u_h, gu, g_h);
74 u_h = __lasx_xvmadd_w(u_h, bu, b_h);
75 v_l = __lasx_xvmadd_w(v_l, gv, g_l);
76 v_l = __lasx_xvmadd_w(v_l, bv, b_l);
77 v_h = __lasx_xvmadd_w(v_h, gv, g_h);
78 v_h = __lasx_xvmadd_w(v_h, bv, b_h);
79 u_l = __lasx_xvsra_w(u_l, sra);
80 u_h = __lasx_xvsra_w(u_h, sra);
81 v_l = __lasx_xvsra_w(v_l, sra);
82 v_h = __lasx_xvsra_w(v_h, sra);
83 u_lh = __lasx_xvshuf_b(u_h, u_l,
mask);
84 v_lh = __lasx_xvshuf_b(v_h, v_l,
mask);
85 u_lh = __lasx_xvpermi_d(u_lh, 0xD8);
86 v_lh = __lasx_xvpermi_d(v_lh, 0xD8);
87 __lasx_xvst(u_lh, (dstU +
i), 0);
88 __lasx_xvst(v_lh, (dstV +
i), 0);
92 __m256i g_l, b_l, r_l;
93 __m256i v_l, u_l,
u, v;
95 _g = __lasx_xvldrepl_d((
src0 +
i), 0);
96 _b = __lasx_xvldrepl_d((
src1 +
i), 0);
97 _r = __lasx_xvldrepl_d((
src2 +
i), 0);
98 g_l = __lasx_vext2xv_wu_bu(_g);
99 b_l = __lasx_vext2xv_wu_bu(_b);
100 r_l = __lasx_vext2xv_wu_bu(_r);
101 u_l = __lasx_xvmadd_w(
temp, ru, r_l);
102 v_l = __lasx_xvmadd_w(
temp, rv, r_l);
103 u_l = __lasx_xvmadd_w(u_l, gu, g_l);
104 u_l = __lasx_xvmadd_w(u_l, bu, b_l);
105 v_l = __lasx_xvmadd_w(v_l, gv, g_l);
106 v_l = __lasx_xvmadd_w(v_l, bv, b_l);
107 u_l = __lasx_xvsra_w(u_l, sra);
108 v_l = __lasx_xvsra_w(v_l, sra);
109 u = __lasx_xvshuf_b(u_l, u_l,
mask);
110 v = __lasx_xvshuf_b(v_l, v_l,
mask);
111 __lasx_xvstelm_d(
u, (dstU +
i), 0, 0);
112 __lasx_xvstelm_d(
u, (dstU +
i), 8, 2);
113 __lasx_xvstelm_d(v, (dstV +
i), 0, 0);
114 __lasx_xvstelm_d(v, (dstV +
i), 8, 2);
122 dstU[
i] = (tem_ru *
r + tem_gu *
g + tem_bu *
b +
set) >>
shift;
123 dstV[
i] = (tem_rv *
r + tem_gv *
g + tem_bv *
b +
set) >>
shift;
134 uint16_t *dst = (uint16_t *)_dst;
138 __m256i
mask = {0x0D0C090805040100, 0x1D1C191815141110,
139 0x0D0C090805040100, 0x1D1C191815141110};
140 __m256i
temp = __lasx_xvreplgr2vr_w(
set);
141 __m256i sra = __lasx_xvreplgr2vr_w(
shift);
142 __m256i ry = __lasx_xvreplgr2vr_w(tem_ry);
143 __m256i gy = __lasx_xvreplgr2vr_w(tem_gy);
144 __m256i by = __lasx_xvreplgr2vr_w(tem_by);
146 for (
i = 0;
i <
len;
i += 16) {
148 __m256i g_l, g_h, b_l, b_h, r_l, r_h;
149 __m256i y_l, y_h, y_lh;
151 _g = __lasx_xvldx(
src0,
i);
152 _b = __lasx_xvldx(
src1,
i);
153 _r = __lasx_xvldx(
src2,
i);
154 g_l = __lasx_vext2xv_wu_bu(_g);
155 b_l = __lasx_vext2xv_wu_bu(_b);
156 r_l = __lasx_vext2xv_wu_bu(_r);
157 _g = __lasx_xvpermi_d(_g, 0x01);
158 _b = __lasx_xvpermi_d(_b, 0x01);
159 _r = __lasx_xvpermi_d(_r, 0x01);
160 g_h = __lasx_vext2xv_wu_bu(_g);
161 b_h = __lasx_vext2xv_wu_bu(_b);
162 r_h = __lasx_vext2xv_wu_bu(_r);
163 y_l = __lasx_xvmadd_w(
temp, ry, r_l);
164 y_h = __lasx_xvmadd_w(
temp, ry, r_h);
165 y_l = __lasx_xvmadd_w(y_l, gy, g_l);
166 y_l = __lasx_xvmadd_w(y_l, by, b_l);
167 y_h = __lasx_xvmadd_w(y_h, gy, g_h);
168 y_h = __lasx_xvmadd_w(y_h, by, b_h);
169 y_l = __lasx_xvsra_w(y_l, sra);
170 y_h = __lasx_xvsra_w(y_h, sra);
171 y_lh = __lasx_xvshuf_b(y_h, y_l,
mask);
172 y_lh = __lasx_xvpermi_d(y_lh, 0xD8);
173 __lasx_xvst(y_lh, (dst +
i), 0);
177 __m256i g_l, b_l, r_l;
180 _g = __lasx_xvldrepl_d((
src0 +
i), 0);
181 _b = __lasx_xvldrepl_d((
src1 +
i), 0);
182 _r = __lasx_xvldrepl_d((
src2 +
i), 0);
183 g_l = __lasx_vext2xv_wu_bu(_g);
184 b_l = __lasx_vext2xv_wu_bu(_b);
185 r_l = __lasx_vext2xv_wu_bu(_r);
186 y_l = __lasx_xvmadd_w(
temp, ry, r_l);
187 y_l = __lasx_xvmadd_w(y_l, gy, g_l);
188 y_l = __lasx_xvmadd_w(y_l, by, b_l);
189 y_l = __lasx_xvsra_w(y_l, sra);
190 y = __lasx_xvshuf_b(y_l, y_l,
mask);
191 __lasx_xvstelm_d(y, (dst +
i), 0, 0);
192 __lasx_xvstelm_d(y, (dst +
i), 8, 2);
200 dst[
i] = (tem_ry *
r + tem_gy *
g + tem_by *
b +
set) >>
shift;