/* p1' = p1 + clip3(-tc, tc, (p2 + ((p0 + q0 + 1) >> 1) - 2 * p1) >> 1) */
#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,          \
                         p1_or_q1_org_in, p2_or_q2_org_in,          \
                         neg_tc_in, tc_in, p1_or_q1_out)            \
{                                                                   \
    __m256i clip3, temp;                                            \
                                                                    \
    clip3 = __lasx_xvavgr_hu(p0_or_q0_org_in,                       \
                             q0_or_p0_org_in);                      \
    temp  = __lasx_xvslli_h(p1_or_q1_org_in, 1);                    \
    clip3 = __lasx_xvsub_h(clip3, temp);                            \
    clip3 = __lasx_xvavg_h(p2_or_q2_org_in, clip3);                 \
    clip3 = __lasx_xvclip_h(clip3, neg_tc_in, tc_in);               \
    p1_or_q1_out = __lasx_xvadd_h(p1_or_q1_org_in, clip3);          \
}
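
/* Standard weak-filter delta: d = clip3(-tc, tc, ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3),
 * then p0' = clip255(p0 + d) and q0' = clip255(q0 - d). */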
#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,              \
                     p1_or_q1_org_in, q1_or_p1_org_in,              \
                     neg_threshold_in, threshold_in,                \
                     p0_or_q0_out, q0_or_p0_out)                    \
{                                                                   \
    __m256i q0_sub_p0, p1_sub_q1, delta;                            \
                                                                    \
    q0_sub_p0 = __lasx_xvsub_h(q0_or_p0_org_in,                     \
                               p0_or_q0_org_in);                    \
    p1_sub_q1 = __lasx_xvsub_h(p1_or_q1_org_in,                     \
                               q1_or_p1_org_in);                    \
    q0_sub_p0 = __lasx_xvslli_h(q0_sub_p0, 2);                      \
    p1_sub_q1 = __lasx_xvaddi_hu(p1_sub_q1, 4);                     \
    delta = __lasx_xvadd_h(q0_sub_p0, p1_sub_q1);                   \
    delta = __lasx_xvsrai_h(delta, 3);                              \
    delta = __lasx_xvclip_h(delta, neg_threshold_in,                \
                            threshold_in);                          \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, delta);          \
    q0_or_p0_out = __lasx_xvsub_h(q0_or_p0_org_in, delta);          \
                                                                    \
    p0_or_q0_out = __lasx_xvclip255_h(p0_or_q0_out);                \
    q0_or_p0_out = __lasx_xvclip255_h(q0_or_p0_out);                \
}
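
/* Loop filter for a vertical luma edge (bS < 4): load 16 rows of the 8
 * columns p3..q3 around the edge, transpose so each of p3..q3 becomes a
 * 16-pixel vector, filter, then transpose back and store. */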
void ff_h264_h_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                               int alpha_in, int beta_in, int8_t *tc)
{
    int img_width_2x = img_width << 1;
    int img_width_4x = img_width << 2;
    int img_width_8x = img_width << 3;
    int img_width_3x = img_width_2x + img_width;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
                      0x0101010100000000, 0x0303030302020202};

    /* Broadcast each of the four tc0 values to its 4-pixel group; a
     * negative tc0 marks bS == 0, so bs_vec ends up 1 where tc0 >= 0
     * and 0 elsewhere. */
    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
    bs_vec   = __lasx_xvandi_b(bs_vec, 1);

    if (__lasx_xbnz_v(bs_vec)) {
        uint8_t *src = data - 4; /* first of the 8 columns p3..q3 */
        __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i is_bs_greater_than0;
        __m256i zero = __lasx_xvldi(0);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);

        {
            uint8_t *src_tmp = src + img_width_8x;
            __m256i row0, row1, row2, row3, row4, row5, row6, row7;
            __m256i row8, row9, row10, row11, row12, row13, row14, row15;

            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row0, row1, row2, row3);
            src += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row4, row5, row6, row7);
            src -= img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
                      img_width_2x, src_tmp, img_width_3x,
                      row8, row9, row10, row11);
            src_tmp += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
                      img_width_2x, src_tmp, img_width_3x,
                      row12, row13, row14, row15);
            src_tmp -= img_width_4x;
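
            /* Rearrange the 16x8 rows so that each of p3..q3 holds one
             * 16-pixel column. */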
            LASX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6,
                                 row7, row8, row9, row10, row11,
                                 row12, row13, row14, row15,
                                 p3_org, p2_org, p1_org, p0_org,
                                 q0_org, q1_org, q2_org, q3_org);
        }
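
        /* Filter only where |p0 - q0| < alpha, |p1 - p0| < beta,
         * |q1 - q0| < beta and bS > 0. */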
        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta  = __lasx_xvreplgr2vr_b(beta_in);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than       = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than       = is_less_than_beta & is_less_than;
        is_less_than       = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i neg_tc_h, tc_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
            __m256i p2_asub_p0, q2_asub_q0;

            neg_tc_h = __lasx_xvneg_b(tc_vec);
            neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
            tc_h     = __lasx_vext2xv_hu_bu(tc_vec);
            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);

            p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
            is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;
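
            /* p-side: filter p1 when |p2 - p0| < beta; tc is bumped by one
             * for each pixel whose p1 was filtered. */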
            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i p2_org_h, p1_h;

                p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
                                 neg_tc_h, tc_h, p1_h);
                p1_h   = __lasx_xvpickev_b(p1_h, p1_h);
                p1_h   = __lasx_xvpermi_d(p1_h, 0xd8);
                p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }
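
            /* q-side: same q1 update when |q2 - q0| < beta. */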
            q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
            is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i q2_org_h, q1_h;

                q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
                                 neg_tc_h, tc_h, q1_h);
                q1_h   = __lasx_xvpickev_b(q1_h, q1_h);
                q1_h   = __lasx_xvpermi_d(q1_h, 0xd8);
                q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            {
                __m256i neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
            }
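
            /* Transpose the filtered block back to column order and store
             * the 16 rows of 8 bytes. */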
            {
                __m256i row0, row1, row2, row3, row4, row5, row6, row7;
                __m256i control = {0x0000000400000000, 0x0000000500000001,
                                   0x0000000600000002, 0x0000000700000003};

                DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org,
                          q2_org, 0x02, p2_org, q1_org, 0x02, p3_org,
                          q0_org, 0x02, p0_org, p1_org, p2_org, p3_org);
                DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
                          row0, row2);
                DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
                          row1, row3);
                DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
                DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
                DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
                          control, row7, control, row4, row5, row6, row7);
                __lasx_xvstelm_d(row4, src, 0, 0);
                __lasx_xvstelm_d(row4, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row4, src, 0, 2);
                __lasx_xvstelm_d(row4, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row5, src, 0, 0);
                __lasx_xvstelm_d(row5, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row5, src, 0, 2);
                __lasx_xvstelm_d(row5, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row6, src, 0, 0);
                __lasx_xvstelm_d(row6, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row6, src, 0, 2);
                __lasx_xvstelm_d(row6, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row7, src, 0, 0);
                __lasx_xvstelm_d(row7, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row7, src, 0, 2);
                __lasx_xvstelm_d(row7, src + img_width, 0, 3);
            }
        }
    }
}
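
/* Loop filter for a horizontal luma edge (bS < 4): p2..q2 lie in
 * consecutive rows, so they are loaded and stored directly, no transpose. */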
void ff_h264_v_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                               int alpha_in, int beta_in, int8_t *tc)
{
    int img_width_2x = img_width << 1;
    int img_width_3x = img_width + img_width_2x;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
                      0x0101010100000000, 0x0303030302020202};

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec   = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec   = __lasx_xvslti_b(tc_vec, 0);
    bs_vec   = __lasx_xvxori_b(bs_vec, 255);
    bs_vec   = __lasx_xvandi_b(bs_vec, 1);

    if (__lasx_xbnz_v(bs_vec)) {
        __m256i p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i is_bs_greater_than0;
        __m256i zero = __lasx_xvldi(0);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta  = __lasx_xvreplgr2vr_b(beta_in);

        DUP2_ARG2(__lasx_xvldx, data, -img_width_3x, data, -img_width_2x,
                  p2_org, p1_org);
        p0_org = __lasx_xvldx(data, -img_width);
        DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than       = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than       = is_less_than_beta & is_less_than;
        is_less_than       = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i neg_tc_h, tc_h, p2_asub_p0, q2_asub_q0;

            q2_org = __lasx_xvldx(data, img_width_2x);
            neg_tc_h = __lasx_xvneg_b(tc_vec);
            neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
            tc_h     = __lasx_vext2xv_hu_bu(tc_vec);
            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);

            p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
            is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i p1_h, p2_org_h;

                p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
                                 neg_tc_h, tc_h, p1_h);
                p1_h   = __lasx_xvpickev_b(p1_h, p1_h);
                p1_h   = __lasx_xvpermi_d(p1_h, 0xd8);
                p1_h   = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
                p1_org = __lasx_xvpermi_q(p1_org, p1_h, 0x30);
                __lasx_xvst(p1_org, data - img_width_2x, 0);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
            is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i q1_h, q2_org_h;

                q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
                                 neg_tc_h, tc_h, q1_h);
                q1_h   = __lasx_xvpickev_b(q1_h, q1_h);
                q1_h   = __lasx_xvpermi_d(q1_h, 0xd8);
                q1_h   = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
                q1_org = __lasx_xvpermi_q(q1_org, q1_h, 0x30);
                __lasx_xvst(q1_org, data + img_width, 0);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            {
                __m256i neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h         = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
                p0_org = __lasx_xvpermi_q(p0_org, p0_h, 0x30);
                q0_org = __lasx_xvpermi_q(q0_org, q0_h, 0x30);
                __lasx_xvst(p0_org, data - img_width, 0);
                __lasx_xvst(q0_org, data, 0);
            }
        }
    }
}
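
/* Strong (bS == 4) filter for one side of the edge, on 16-bit lanes:
 *   p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
 *   p1' = (p2 + p1 + p0 + q0 + 2) >> 2
 *   p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
 * (and symmetrically for the q side; xvsrar provides the rounding). */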
#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,         \
                                 q3_or_p3_org_in, p1_or_q1_org_in,         \
                                 p2_or_q2_org_in, q1_or_p1_org_in,         \
                                 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out) \
{                                                                          \
    __m256i threshold;                                                     \
    __m256i const2, const3 = __lasx_xvldi(0);                              \
                                                                           \
    const2 = __lasx_xvaddi_hu(const3, 2);                                  \
    const3 = __lasx_xvaddi_hu(const3, 3);                                  \
    threshold = __lasx_xvadd_h(p0_or_q0_org_in, q3_or_p3_org_in);          \
    threshold = __lasx_xvadd_h(p1_or_q1_org_in, threshold);                \
                                                                           \
    p0_or_q0_out = __lasx_xvslli_h(threshold, 1);                          \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p2_or_q2_org_in);          \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, q1_or_p1_org_in);          \
    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const3);                  \
                                                                           \
    p1_or_q1_out = __lasx_xvadd_h(p2_or_q2_org_in, threshold);             \
    p1_or_q1_out = __lasx_xvsrar_h(p1_or_q1_out, const2);                  \
                                                                           \
    p2_or_q2_out = __lasx_xvmul_h(p2_or_q2_org_in, const3);                \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);          \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);          \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, threshold);                \
    p2_or_q2_out = __lasx_xvsrar_h(p2_or_q2_out, const3);                  \
}
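
/* Weak intra fallback for p0/q0 only: p0' = (2*p1 + p0 + q1 + 2) >> 2. */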
#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,           \
                         p1_or_q1_org_in, p0_or_q0_out)              \
{                                                                    \
    __m256i const2 = __lasx_xvldi(0);                                \
    const2 = __lasx_xvaddi_hu(const2, 2);                            \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, q1_or_p1_org_in); \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);    \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);    \
    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const2);            \
}
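
/* Intra loop filter (bS == 4) for a vertical luma edge: same transpose
 * scheme as the inter filter, with the strong/weak choice made per pixel. */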
void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                     int alpha_in, int beta_in)
{
    int img_width_2x = img_width << 1;
    int img_width_4x = img_width << 2;
    int img_width_3x = img_width_2x + img_width;
    uint8_t *src = data - 4; /* first of the 8 columns p3..q3 */
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
    __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
    __m256i zero = __lasx_xvldi(0);

    {
        __m256i row0, row1, row2, row3, row4, row5, row6, row7;
        __m256i row8, row9, row10, row11, row12, row13, row14, row15;

        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row0, row1, row2, row3);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row4, row5, row6, row7);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row8, row9, row10, row11);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row12, row13, row14, row15);
        src -= img_width_4x * 3; /* rewind to the first row for the stores */

        LASX_TRANSPOSE16x8_B(row0, row1, row2, row3,
                             row4, row5, row6, row7,
                             row8, row9, row10, row11,
                             row12, row13, row14, row15,
                             p3_org, p2_org, p1_org, p0_org,
                             q0_org, q1_org, q2_org, q3_org);
    }

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta  = __lasx_xvreplgr2vr_b(beta_in);
    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than       = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than       = is_less_than_beta & is_less_than;
    /* Only the low 128-bit lane holds the 16 valid pixels; clear the
     * high-lane mask so those bytes stay unchanged. */
    is_less_than       = __lasx_xvpermi_q(zero, is_less_than, 0x30);

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        /* Strong-filter gate: |p0 - q0| < (alpha >> 2) + 2 */
        __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);

        less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
        less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
                                                 less_alpha_shift2_add2);

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
        is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i p2_org_h, p3_org_h, p1_h, p2_h;

            p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
            p3_org_h = __lasx_vext2xv_hu_bu(p3_org);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
                                     p2_org_h, q1_org_h, p0_h, p1_h, p2_h);
            p0_h = __lasx_xvpickev_b(p0_h, p0_h);
            p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
            DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
            p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
            p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
            p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);
        }

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        p0_h = __lasx_xvpickev_b(p0_h, p0_h);
        p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);

        q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
        is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i q2_org_h, q3_org_h, q1_h, q2_h;

            q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
            q3_org_h = __lasx_vext2xv_hu_bu(q3_org);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
                                     q2_org_h, p1_org_h, q0_h, q1_h, q2_h);
            q0_h = __lasx_xvpickev_b(q0_h, q0_h);
            q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
            DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
            q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
            q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
            q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);
        }

        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
        q0_h = __lasx_xvpickev_b(q0_h, q0_h);
        q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);

        {
            __m256i row0, row1, row2, row3, row4, row5, row6, row7;
            __m256i control = {0x0000000400000000, 0x0000000500000001,
                               0x0000000600000002, 0x0000000700000003};

            DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org, q2_org,
                      0x02, p2_org, q1_org, 0x02, p3_org, q0_org, 0x02,
                      p0_org, p1_org, p2_org, p3_org);
            DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
                      row0, row2);
            DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
                      row1, row3);
            DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
            DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
            DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
                      control, row7, control, row4, row5, row6, row7);
            __lasx_xvstelm_d(row4, src, 0, 0);
            __lasx_xvstelm_d(row4, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row4, src, 0, 2);
            __lasx_xvstelm_d(row4, src + img_width, 0, 3);
            src += img_width_2x;
            __lasx_xvstelm_d(row5, src, 0, 0);
            __lasx_xvstelm_d(row5, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row5, src, 0, 2);
            __lasx_xvstelm_d(row5, src + img_width, 0, 3);
            src += img_width_2x;
            __lasx_xvstelm_d(row6, src, 0, 0);
            __lasx_xvstelm_d(row6, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row6, src, 0, 2);
            __lasx_xvstelm_d(row6, src + img_width, 0, 3);
            src += img_width_2x;
            __lasx_xvstelm_d(row7, src, 0, 0);
            __lasx_xvstelm_d(row7, src + img_width, 0, 1);
            src += img_width_2x;
            __lasx_xvstelm_d(row7, src, 0, 2);
            __lasx_xvstelm_d(row7, src + img_width, 0, 3);
        }
    }
}
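
/* Intra loop filter (bS == 4) for a horizontal luma edge: direct row
 * loads and stores, no transpose needed. */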
void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                     int alpha_in, int beta_in)
{
    int img_width_2x = img_width << 1;
    int img_width_3x = img_width_2x + img_width;
    uint8_t *src = data - img_width_2x;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
    __m256i p1_org, p0_org, q0_org, q1_org;
    __m256i zero = __lasx_xvldi(0);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
              src, img_width_3x, p1_org, p0_org, q0_org, q1_org);
    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta  = __lasx_xvreplgr2vr_b(beta_in);
    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta  = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than       = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta  = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than       = is_less_than_beta & is_less_than;
    /* Only the low 128-bit lane holds the 16 valid pixels; clear the
     * high-lane mask so those bytes stay unchanged. */
    is_less_than       = __lasx_xvpermi_q(zero, is_less_than, 0x30);

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i p2_org = __lasx_xvldx(src, -img_width);
        __m256i q2_org = __lasx_xvldx(data, img_width_2x);
        /* Strong-filter gate: |p0 - q0| < (alpha >> 2) + 2 */
        __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);

        less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
        less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
                                                 less_alpha_shift2_add2);

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
        is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i p2_org_h, p3_org_h, p1_h, p2_h;
            __m256i p3_org = __lasx_xvldx(src, -img_width_2x);

            p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
            p3_org_h = __lasx_vext2xv_hu_bu(p3_org);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
                                     p2_org_h, q1_org_h, p0_h, p1_h, p2_h);
            p0_h = __lasx_xvpickev_b(p0_h, p0_h);
            p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
            DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
            p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
            p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
            p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);

            __lasx_xvst(p1_org, src, 0);
            __lasx_xvst(p2_org, src - img_width, 0);
        }

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        p0_h = __lasx_xvpickev_b(p0_h, p0_h);
        p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);
        __lasx_xvst(p0_org, data - img_width, 0);

        q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
        is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i q2_org_h, q3_org_h, q1_h, q2_h;
            __m256i q3_org = __lasx_xvldx(data, img_width_2x + img_width);

            q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
            q3_org_h = __lasx_vext2xv_hu_bu(q3_org);
            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
                                     q2_org_h, p1_org_h, q0_h, q1_h, q2_h);
            q0_h = __lasx_xvpickev_b(q0_h, q0_h);
            q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
            DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
            q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
            q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
            q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);

            __lasx_xvst(q1_org, data + img_width, 0);
            __lasx_xvst(q2_org, data + img_width_2x, 0);
        }

        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
        q0_h = __lasx_xvpickev_b(q0_h, q0_h);
        q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);
        __lasx_xvst(q0_org, data, 0);
    }
}
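
/* Add a 4x4 block of 16-bit residuals to dst, then zero the residual
 * buffer (the "add pixels and clear" idct helper). */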
void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
{
    __m256i src0, dst0, dst1, dst2, dst3, zero;
    __m256i tmp0, tmp1;
    uint8_t *_dst1 = _dst + stride;
    uint8_t *_dst2 = _dst1 + stride;
    uint8_t *_dst3 = _dst2 + stride;

    src0 = __lasx_xvld(_src, 0);
    dst0 = __lasx_xvldrepl_w(_dst, 0);
    dst1 = __lasx_xvldrepl_w(_dst1, 0);
    dst2 = __lasx_xvldrepl_w(_dst2, 0);
    dst3 = __lasx_xvldrepl_w(_dst3, 0);
    tmp0 = __lasx_xvilvl_w(dst1, dst0);
    tmp1 = __lasx_xvilvl_w(dst3, dst2);
    dst0 = __lasx_xvilvl_d(tmp1, tmp0);
    tmp0 = __lasx_vext2xv_hu_bu(dst0);
    zero = __lasx_xvldi(0);
    tmp1 = __lasx_xvadd_h(src0, tmp0);
    dst0 = __lasx_xvpickev_b(tmp1, tmp1);
    __lasx_xvstelm_w(dst0, _dst, 0, 0);
    __lasx_xvstelm_w(dst0, _dst1, 0, 1);
    __lasx_xvstelm_w(dst0, _dst2, 0, 4);
    __lasx_xvstelm_w(dst0, _dst3, 0, 5);
    __lasx_xvst(zero, _src, 0);
}
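
/* Same operation for an 8x8 block: 64 residuals span four 32-byte
 * vectors, and the packed results are stored one 8-byte row at a time. */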
void ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
{
    __m256i src0, src1, src2, src3;
    __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m256i tmp0, tmp1, tmp2, tmp3;
    __m256i zero = __lasx_xvldi(0);
    uint8_t *_dst1 = _dst + stride;
    uint8_t *_dst2 = _dst1 + stride;
    uint8_t *_dst3 = _dst2 + stride;
    uint8_t *_dst4 = _dst3 + stride;
    uint8_t *_dst5 = _dst4 + stride;
    uint8_t *_dst6 = _dst5 + stride;
    uint8_t *_dst7 = _dst6 + stride;

    src0 = __lasx_xvld(_src, 0);
    src1 = __lasx_xvld(_src, 32);
    src2 = __lasx_xvld(_src, 64);
    src3 = __lasx_xvld(_src, 96);
    dst0 = __lasx_xvldrepl_d(_dst, 0);
    dst1 = __lasx_xvldrepl_d(_dst1, 0);
    dst2 = __lasx_xvldrepl_d(_dst2, 0);
    dst3 = __lasx_xvldrepl_d(_dst3, 0);
    dst4 = __lasx_xvldrepl_d(_dst4, 0);
    dst5 = __lasx_xvldrepl_d(_dst5, 0);
    dst6 = __lasx_xvldrepl_d(_dst6, 0);
    dst7 = __lasx_xvldrepl_d(_dst7, 0);
    tmp0 = __lasx_xvilvl_d(dst1, dst0);
    tmp1 = __lasx_xvilvl_d(dst3, dst2);
    tmp2 = __lasx_xvilvl_d(dst5, dst4);
    tmp3 = __lasx_xvilvl_d(dst7, dst6);
    dst0 = __lasx_vext2xv_hu_bu(tmp0);
    dst1 = __lasx_vext2xv_hu_bu(tmp1);
    dst2 = __lasx_vext2xv_hu_bu(tmp2);
    dst3 = __lasx_vext2xv_hu_bu(tmp3);
    tmp0 = __lasx_xvadd_h(src0, dst0);
    tmp1 = __lasx_xvadd_h(src1, dst1);
    tmp2 = __lasx_xvadd_h(src2, dst2);
    tmp3 = __lasx_xvadd_h(src3, dst3);
    dst1 = __lasx_xvpickev_b(tmp1, tmp0);
    dst2 = __lasx_xvpickev_b(tmp3, tmp2);
    __lasx_xvst(zero, _src, 0);
    __lasx_xvst(zero, _src, 32);
    __lasx_xvst(zero, _src, 64);
    __lasx_xvst(zero, _src, 96);
    __lasx_xvstelm_d(dst1, _dst, 0, 0);
    __lasx_xvstelm_d(dst1, _dst1, 0, 2);
    __lasx_xvstelm_d(dst1, _dst2, 0, 1);
    __lasx_xvstelm_d(dst1, _dst3, 0, 3);
    __lasx_xvstelm_d(dst2, _dst4, 0, 0);
    __lasx_xvstelm_d(dst2, _dst5, 0, 2);
    __lasx_xvstelm_d(dst2, _dst6, 0, 1);
    __lasx_xvstelm_d(dst2, _dst7, 0, 3);
}