#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,          \
                         p1_or_q1_org_in, p2_or_q2_org_in,          \
                         neg_tc_in, tc_in, p1_or_q1_out)            \
{                                                                   \
    __m256i clip3, temp;                                            \
                                                                    \
    clip3 = __lasx_xvavgr_hu(p0_or_q0_org_in, q0_or_p0_org_in);     \
    temp = __lasx_xvslli_h(p1_or_q1_org_in, 1);                     \
    clip3 = __lasx_xvsub_h(clip3, temp);                            \
    clip3 = __lasx_xvavg_h(p2_or_q2_org_in, clip3);                 \
    clip3 = __lasx_xvclip_h(clip3, neg_tc_in, tc_in);               \
    p1_or_q1_out = __lasx_xvadd_h(p1_or_q1_org_in, clip3);          \
}
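
/* Normal-strength p1/q1 update of the H.264 deblocking filter:
 *     p1' = p1 + clip3(-tc, tc, (p2 + ((p0 + q0 + 1) >> 1) - 2*p1) >> 1)
 * xvavgr_hu yields the rounded average (p0 + q0 + 1) >> 1, the doubled p1
 * is subtracted, xvavg_h folds in p2 together with the final arithmetic
 * >> 1, and xvclip_h applies the [-tc, tc] clamp before the delta is
 * added back to p1. */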

#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,              \
                     p1_or_q1_org_in, q1_or_p1_org_in,              \
                     neg_threshold_in, threshold_in,                \
                     p0_or_q0_out, q0_or_p0_out)                    \
{                                                                   \
    __m256i q0_sub_p0, p1_sub_q1, delta;                            \
                                                                    \
    q0_sub_p0 = __lasx_xvsub_h(q0_or_p0_org_in, p0_or_q0_org_in);   \
    p1_sub_q1 = __lasx_xvsub_h(p1_or_q1_org_in, q1_or_p1_org_in);   \
    q0_sub_p0 = __lasx_xvslli_h(q0_sub_p0, 2);                      \
    p1_sub_q1 = __lasx_xvaddi_hu(p1_sub_q1, 4);                     \
    delta = __lasx_xvadd_h(q0_sub_p0, p1_sub_q1);                   \
    delta = __lasx_xvsrai_h(delta, 3);                              \
    delta = __lasx_xvclip_h(delta, neg_threshold_in,                \
                            threshold_in);                          \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, delta);          \
    q0_or_p0_out = __lasx_xvsub_h(q0_or_p0_org_in, delta);          \
                                                                    \
    p0_or_q0_out = __lasx_xvclip255_h(p0_or_q0_out);                \
    q0_or_p0_out = __lasx_xvclip255_h(q0_or_p0_out);                \
}
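
/* Normal-strength p0/q0 update:
 *     delta = clip3(-tc, tc, (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
 *     p0'   = clip255(p0 + delta)
 *     q0'   = clip255(q0 - delta)
 * computed on 16-bit lanes so the intermediate sums cannot overflow. */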

void ff_h264_h_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                               int alpha_in, int beta_in, int8_t *tc)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_8x = img_width << 3;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
                      0x0101010100000000, 0x0303030302020202};

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec = __lasx_xvslti_b(tc_vec, 0);
    bs_vec = __lasx_xvxori_b(bs_vec, 255);
    bs_vec = __lasx_xvandi_b(bs_vec, 1);
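
    /* tc holds one tc0 per 4-pixel edge segment, with a negative tc0
     * encoding bs == 0.  The shuffle replicates each tc0 across its
     * segment, and the slti/xori/andi sequence turns "tc0 >= 0" into a
     * per-byte 0/1 flag, so the whole edge can be skipped when every
     * segment has bs == 0. */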

    if (__lasx_xbnz_v(bs_vec)) {
        uint8_t *src = data - 4;
        __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i is_bs_greater_than0;
        __m256i zero = __lasx_xvldi(0);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);

        {
            uint8_t *src_tmp = src + img_width_8x;
            __m256i row0, row1, row2, row3, row4, row5, row6, row7;
            __m256i row8, row9, row10, row11, row12, row13, row14, row15;

            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row0, row1, row2, row3);
            src += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row4, row5, row6, row7);
            src -= img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
                      img_width_2x, src_tmp, img_width_3x,
                      row8, row9, row10, row11);
            src_tmp += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
                      img_width_2x, src_tmp, img_width_3x,
                      row12, row13, row14, row15);
            src_tmp -= img_width_4x;

            LASX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6,
                                 row7, row8, row9, row10, row11,
                                 row12, row13, row14, row15,
                                 p3_org, p2_org, p1_org, p0_org,
                                 q0_org, q1_org, q2_org, q3_org);
        }

        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta = __lasx_xvreplgr2vr_b(beta_in);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;
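
        /* The mask just built is filterSamplesFlag per segment:
         * bS > 0 && |p0 - q0| < alpha && |p1 - p0| < beta &&
         * |q1 - q0| < beta. */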

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i neg_tc_h, tc_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
            __m256i p2_asub_p0, q2_asub_q0;

            neg_tc_h = __lasx_xvneg_b(tc_vec);
            neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
            tc_h = __lasx_vext2xv_hu_bu(tc_vec);
            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);

            p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
            is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i p2_org_h, p1_h;

                p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
                                 neg_tc_h, tc_h, p1_h);
                p1_h = __lasx_xvpickev_b(p1_h, p1_h);
                p1_h = __lasx_xvpermi_d(p1_h, 0xd8);
                p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }
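
            /* Filtering p1 raises the p0/q0 clipping threshold by one
             * (tc = tc0 + 1), implemented above as a masked per-byte
             * increment of tc_vec; the q1 side below does the same. */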

            q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
            is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i q2_org_h, q1_h;

                q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
                                 neg_tc_h, tc_h, q1_h);
                q1_h = __lasx_xvpickev_b(q1_h, q1_h);
                q1_h = __lasx_xvpermi_d(q1_h, 0xd8);
                q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            {
                __m256i neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
            }

            {
                __m256i row0, row1, row2, row3, row4, row5, row6, row7;
                __m256i control = {0x0000000400000000, 0x0000000500000001,
                                   0x0000000600000002, 0x0000000700000003};

                /* transpose back to rows and store 8 bytes per line */
                DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org,
                          q2_org, 0x02, p2_org, q1_org, 0x02, p3_org,
                          q0_org, 0x02, p0_org, p1_org, p2_org, p3_org);
                DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
                          row0, row2);
                DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
                          row1, row3);
                DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
                DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
                DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
                          control, row7, control, row4, row5, row6, row7);
                __lasx_xvstelm_d(row4, src, 0, 0);
                __lasx_xvstelm_d(row4, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row4, src, 0, 2);
                __lasx_xvstelm_d(row4, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row5, src, 0, 0);
                __lasx_xvstelm_d(row5, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row5, src, 0, 2);
                __lasx_xvstelm_d(row5, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row6, src, 0, 0);
                __lasx_xvstelm_d(row6, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row6, src, 0, 2);
                __lasx_xvstelm_d(row6, src + img_width, 0, 3);
                src += img_width_2x;
                __lasx_xvstelm_d(row7, src, 0, 0);
                __lasx_xvstelm_d(row7, src + img_width, 0, 1);
                src += img_width_2x;
                __lasx_xvstelm_d(row7, src, 0, 2);
                __lasx_xvstelm_d(row7, src + img_width, 0, 3);
            }
        }
    }
}

void ff_h264_v_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                               int alpha_in, int beta_in, int8_t *tc)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_3x = img_width + img_width_2x;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
                      0x0101010100000000, 0x0303030302020202};

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec = __lasx_xvslti_b(tc_vec, 0);
    bs_vec = __lasx_xvxori_b(bs_vec, 255);
    bs_vec = __lasx_xvandi_b(bs_vec, 1);

    if (__lasx_xbnz_v(bs_vec)) {
        __m256i p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i is_bs_greater_than0;
        __m256i zero = __lasx_xvldi(0);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta = __lasx_xvreplgr2vr_b(beta_in);

        p1_org = __lasx_xvldx(data, -img_width_2x);
        p0_org = __lasx_xvldx(data, -img_width);
        q0_org = __lasx_xvld(data, 0);
        q1_org = __lasx_xvldx(data, img_width);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i neg_tc_h, tc_h, p2_asub_p0, q2_asub_q0;

            p2_org = __lasx_xvldx(data, -img_width_3x);
            q2_org = __lasx_xvldx(data, img_width_2x);

            neg_tc_h = __lasx_xvneg_b(tc_vec);
            neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
            tc_h = __lasx_vext2xv_hu_bu(tc_vec);
            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);

            p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
            is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i p1_h, p2_org_h;

                p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
                                 neg_tc_h, tc_h, p1_h);
                p1_h = __lasx_xvpickev_b(p1_h, p1_h);
                p1_h = __lasx_xvpermi_d(p1_h, 0xd8);
                p1_h = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
                /* keep the untouched upper 16 bytes so the full 32-byte
                 * store below leaves neighbouring pixels unchanged */
                p1_org = __lasx_xvpermi_q(p1_org, p1_h, 0x30);
                __lasx_xvst(p1_org, data - img_width_2x, 0);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
            is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
            is_less_than_beta = is_less_than_beta & is_less_than;

            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            if (__lasx_xbnz_v(is_less_than_beta)) {
                __m256i q1_h, q2_org_h;

                q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
                AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
                                 neg_tc_h, tc_h, q1_h);
                q1_h = __lasx_xvpickev_b(q1_h, q1_h);
                q1_h = __lasx_xvpermi_d(q1_h, 0xd8);
                q1_h = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
                q1_org = __lasx_xvpermi_q(q1_org, q1_h, 0x30);
                __lasx_xvst(q1_org, data + img_width, 0);

                is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
                tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
            }

            {
                __m256i neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
                p0_org = __lasx_xvpermi_q(p0_org, p0_h, 0x30);
                q0_org = __lasx_xvpermi_q(q0_org, q0_h, 0x30);
                __lasx_xvst(p0_org, data - img_width, 0);
                __lasx_xvst(q0_org, data, 0);
            }
        }
    }
}
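
/* Chroma deblocking below: unlike luma, only the line of pixels on each
 * side of the edge (p0/q0) is ever modified, so the normal-strength path
 * reduces to the AVC_LPF_P0Q0 delta above. */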

void ff_h264_h_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                 int alpha_in, int beta_in, int8_t *tc)
{
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
    __m256i zero = __lasx_xvldi(0);
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_3x = img_width_2x + img_width;

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec = __lasx_xvslti_b(tc_vec, 0);
    bs_vec = __lasx_xvxori_b(bs_vec, 255);
    bs_vec = __lasx_xvandi_b(bs_vec, 1);
    bs_vec = __lasx_xvpermi_q(zero, bs_vec, 0x30);

    if (__lasx_xbnz_v(bs_vec)) {
        uint8_t *src = data - 2;
        __m256i p1_org, p0_org, q0_org, q1_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i is_bs_greater_than0;

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);

        {
            __m256i row0, row1, row2, row3, row4, row5, row6, row7;

            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row0, row1, row2, row3);
            src += img_width_4x;
            DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                      src, img_width_3x, row4, row5, row6, row7);
            src -= img_width_4x;

            /* transpose the 8x4 block so p1/p0/q0/q1 lie in own vectors */
            DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4,
                      row7, row5, p1_org, p0_org, q0_org, q1_org);
            row0 = __lasx_xvilvl_b(p0_org, p1_org);
            row1 = __lasx_xvilvl_b(q1_org, q0_org);
            row3 = __lasx_xvilvh_w(row1, row0);
            row2 = __lasx_xvilvl_w(row1, row0);
            p1_org = __lasx_xvpermi_d(row2, 0x00);
            p0_org = __lasx_xvpermi_d(row2, 0x55);
            q0_org = __lasx_xvpermi_d(row3, 0x00);
            q1_org = __lasx_xvpermi_d(row3, 0x55);
        }

        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta = __lasx_xvreplgr2vr_b(beta_in);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;

            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            {
                __m256i tc_h, neg_thresh_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
            }

            /* store the interleaved (p0, q0) pairs column-wise */
            p0_org = __lasx_xvilvl_b(q0_org, p0_org);
            src = data - 1;
            __lasx_xvstelm_h(p0_org, src, 0, 0);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 1);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 2);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 3);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 4);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 5);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 6);
            src += img_width;
            __lasx_xvstelm_h(p0_org, src, 0, 7);
        }
    }
}

void ff_h264_v_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                 int alpha_in, int beta_in, int8_t *tc)
{
    int img_width_2x = img_width << 1;
    __m256i tmp_vec0, bs_vec;
    __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
    __m256i zero = __lasx_xvldi(0);

    tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
    tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
    bs_vec = __lasx_xvslti_b(tc_vec, 0);
    bs_vec = __lasx_xvxori_b(bs_vec, 255);
    bs_vec = __lasx_xvandi_b(bs_vec, 1);
    bs_vec = __lasx_xvpermi_q(zero, bs_vec, 0x30);

    if (__lasx_xbnz_v(bs_vec)) {
        __m256i p1_org, p0_org, q0_org, q1_org;
        __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
        __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
        __m256i is_bs_greater_than0;

        alpha = __lasx_xvreplgr2vr_b(alpha_in);
        beta = __lasx_xvreplgr2vr_b(beta_in);

        p1_org = __lasx_xvldx(data, -img_width_2x);
        p0_org = __lasx_xvldx(data, -img_width);
        q0_org = __lasx_xvld(data, 0);
        q1_org = __lasx_xvldx(data, img_width);

        is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
        p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
        p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
        q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

        is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
        is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
        is_less_than = is_less_than_alpha & is_less_than_beta;
        is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_less_than & is_bs_greater_than0;

        if (__lasx_xbnz_v(is_less_than)) {
            __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;

            p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
            p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
            q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
            q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

            {
                __m256i neg_thresh_h, tc_h, p0_h, q0_h;

                neg_thresh_h = __lasx_xvneg_b(tc_vec);
                neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
                tc_h = __lasx_vext2xv_hu_bu(tc_vec);

                AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
                             neg_thresh_h, tc_h, p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
                          p0_h, q0_h);
                DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
                          p0_h, q0_h);
                p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
                q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
                __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
                __lasx_xvstelm_d(q0_h, data, 0, 0);
            }
        }
    }
}

#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,  \
                                 q3_or_p3_org_in, p1_or_q1_org_in,  \
                                 p2_or_q2_org_in, q1_or_p1_org_in,  \
                                 p0_or_q0_out, p1_or_q1_out,        \
                                 p2_or_q2_out)                      \
{                                                                   \
    __m256i threshold;                                              \
    __m256i const2, const3 = __lasx_xvldi(0);                       \
                                                                    \
    const2 = __lasx_xvaddi_hu(const3, 2);                           \
    const3 = __lasx_xvaddi_hu(const3, 3);                           \
    threshold = __lasx_xvadd_h(p0_or_q0_org_in, q3_or_p3_org_in);   \
    threshold = __lasx_xvadd_h(p1_or_q1_org_in, threshold);         \
                                                                    \
    p0_or_q0_out = __lasx_xvslli_h(threshold, 1);                   \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p2_or_q2_org_in);   \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, q1_or_p1_org_in);   \
    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const3);           \
                                                                    \
    p1_or_q1_out = __lasx_xvadd_h(p2_or_q2_org_in, threshold);      \
    p1_or_q1_out = __lasx_xvsrar_h(p1_or_q1_out, const2);           \
                                                                    \
    p2_or_q2_out = __lasx_xvmul_h(p2_or_q2_org_in, const3);         \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);   \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);   \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, threshold);         \
    p2_or_q2_out = __lasx_xvsrar_h(p2_or_q2_out, const3);           \
}
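
/* Strong (bS == 4) filter of one side of the edge, matching the spec:
 *     p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3
 *     p1' = (p2 + p1 + p0 + q0 + 2) >> 2
 *     p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3
 * threshold caches the common subexpression p1 + p0 + q0, and xvsrar_h
 * supplies the rounding additions through its rounded right shift. */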

#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,          \
                         p1_or_q1_org_in, p0_or_q0_out)             \
{                                                                   \
    __m256i const2 = __lasx_xvldi(0);                               \
                                                                    \
    const2 = __lasx_xvaddi_hu(const2, 2);                           \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, q1_or_p1_org_in);\
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);   \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);   \
    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const2);           \
}
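
/* Weak intra fallback when the strong-filter condition fails:
 *     p0' = (2*p1 + p0 + q1 + 2) >> 2   (and symmetrically for q0). */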

void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                     int alpha_in, int beta_in)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    uint8_t *src = data - 4;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
    __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
    __m256i zero = __lasx_xvldi(0);

    {
        __m256i row0, row1, row2, row3, row4, row5, row6, row7;
        __m256i row8, row9, row10, row11, row12, row13, row14, row15;

        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row0, row1, row2, row3);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row4, row5, row6, row7);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row8, row9, row10, row11);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row12, row13, row14, row15);
        src = data - 4;

        LASX_TRANSPOSE16x8_B(row0, row1, row2, row3,
                             row4, row5, row6, row7,
                             row8, row9, row10, row11,
                             row12, row13, row14, row15,
                             p3_org, p2_org, p1_org, p0_org,
                             q0_org, q1_org, q2_org, q3_org);
    }

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta = __lasx_xvreplgr2vr_b(beta_in);
    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than = is_less_than_beta & is_less_than;
    is_less_than = __lasx_xvpermi_q(zero, is_less_than, 0x30);

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);

        less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
        less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
                                                 less_alpha_shift2_add2);

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
        is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
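
        /* Strong filtering of p0/p1/p2 additionally requires
         * |p0 - q0| < (alpha >> 2) + 2 and |p2 - p0| < beta; where that
         * fails, negate_is_less_than_beta selects the weak p0-only
         * variant instead. */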

        /* strong filter */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i p2_org_h, p3_org_h, p1_h, p2_h;

            p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
            p3_org_h = __lasx_vext2xv_hu_bu(p3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
                                     p2_org_h, q1_org_h, p0_h, p1_h, p2_h);
            p0_h = __lasx_xvpickev_b(p0_h, p0_h);
            p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
            DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
            p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
            p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
            p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);
        }

        /* weak filter */
        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        p0_h = __lasx_xvpickev_b(p0_h, p0_h);
        p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);

        q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
        is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* strong filter */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i q2_org_h, q3_org_h, q1_h, q2_h;

            q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
            q3_org_h = __lasx_vext2xv_hu_bu(q3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
                                     q2_org_h, p1_org_h, q0_h, q1_h, q2_h);
            q0_h = __lasx_xvpickev_b(q0_h, q0_h);
            q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
            DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
            q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
            q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
            q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);
        }

        /* weak filter */
        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
        q0_h = __lasx_xvpickev_b(q0_h, q0_h);
        q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);
    }

    {
        __m256i row0, row1, row2, row3, row4, row5, row6, row7;
        __m256i control = {0x0000000400000000, 0x0000000500000001,
                           0x0000000600000002, 0x0000000700000003};

        DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org, q2_org,
                  0x02, p2_org, q1_org, 0x02, p3_org, q0_org, 0x02,
                  p0_org, p1_org, p2_org, p3_org);
        DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
                  row0, row2);
        DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
                  row1, row3);
        DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
        DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
        DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
                  control, row7, control, row4, row5, row6, row7);

        __lasx_xvstelm_d(row4, src, 0, 0);
        __lasx_xvstelm_d(row4, src + img_width, 0, 1);
        src += img_width_2x;
        __lasx_xvstelm_d(row4, src, 0, 2);
        __lasx_xvstelm_d(row4, src + img_width, 0, 3);
        src += img_width_2x;
        __lasx_xvstelm_d(row5, src, 0, 0);
        __lasx_xvstelm_d(row5, src + img_width, 0, 1);
        src += img_width_2x;
        __lasx_xvstelm_d(row5, src, 0, 2);
        __lasx_xvstelm_d(row5, src + img_width, 0, 3);
        src += img_width_2x;
        __lasx_xvstelm_d(row6, src, 0, 0);
        __lasx_xvstelm_d(row6, src + img_width, 0, 1);
        src += img_width_2x;
        __lasx_xvstelm_d(row6, src, 0, 2);
        __lasx_xvstelm_d(row6, src + img_width, 0, 3);
        src += img_width_2x;
        __lasx_xvstelm_d(row7, src, 0, 0);
        __lasx_xvstelm_d(row7, src + img_width, 0, 1);
        src += img_width_2x;
        __lasx_xvstelm_d(row7, src, 0, 2);
        __lasx_xvstelm_d(row7, src + img_width, 0, 3);
    }
}

void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                     int alpha_in, int beta_in)
{
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    uint8_t *src = data - img_width_2x;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
    __m256i p1_org, p0_org, q0_org, q1_org;
    __m256i zero = __lasx_xvldi(0);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
              src, img_width_3x, p1_org, p0_org, q0_org, q1_org);

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta = __lasx_xvreplgr2vr_b(beta_in);
    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than = is_less_than_beta & is_less_than_alpha;
    is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than = is_less_than_beta & is_less_than;
    is_less_than = __lasx_xvpermi_q(zero, is_less_than, 0x30);

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
        __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
        __m256i p2_org = __lasx_xvldx(src, -img_width);
        __m256i q2_org = __lasx_xvldx(data, img_width_2x);
        __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);

        less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
        less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
                                                 less_alpha_shift2_add2);

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
        is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* strong filter */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i p2_org_h, p3_org_h, p1_h, p2_h;
            __m256i p3_org = __lasx_xvldx(src, -img_width_2x);

            p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
            p3_org_h = __lasx_vext2xv_hu_bu(p3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
                                     p2_org_h, q1_org_h, p0_h, p1_h, p2_h);
            p0_h = __lasx_xvpickev_b(p0_h, p0_h);
            p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
            DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
            p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
            p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
            p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);

            __lasx_xvst(p1_org, src, 0);
            __lasx_xvst(p2_org, src - img_width, 0);
        }

        /* weak filter */
        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        p0_h = __lasx_xvpickev_b(p0_h, p0_h);
        p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);
        __lasx_xvst(p0_org, data - img_width, 0);

        q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
        is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
        is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
        negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        /* strong filter */
        if (__lasx_xbnz_v(is_less_than_beta)) {
            __m256i q2_org_h, q3_org_h, q1_h, q2_h;
            __m256i q3_org = __lasx_xvldx(data, img_width_2x + img_width);

            q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
            q3_org_h = __lasx_vext2xv_hu_bu(q3_org);

            AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
                                     q2_org_h, p1_org_h, q0_h, q1_h, q2_h);
            q0_h = __lasx_xvpickev_b(q0_h, q0_h);
            q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
            DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
            DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
            q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
            q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
            q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);

            __lasx_xvst(q1_org, data + img_width, 0);
            __lasx_xvst(q2_org, data + img_width_2x, 0);
        }

        /* weak filter */
        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
        q0_h = __lasx_xvpickev_b(q0_h, q0_h);
        q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);

        __lasx_xvst(q0_org, data, 0);
    }
}

void ff_h264_h_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                       int alpha_in, int beta_in)
{
    uint8_t *src = data - 2;
    ptrdiff_t img_width_2x = img_width << 1;
    ptrdiff_t img_width_4x = img_width << 2;
    ptrdiff_t img_width_3x = img_width_2x + img_width;
    __m256i p1_org, p0_org, q0_org, q1_org;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;

    {
        __m256i row0, row1, row2, row3, row4, row5, row6, row7;

        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row0, row1, row2, row3);
        src += img_width_4x;
        DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
                  src, img_width_3x, row4, row5, row6, row7);
        src -= img_width_4x;

        /* transpose the 8x4 block so p1/p0/q0/q1 lie in own vectors */
        DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4,
                  row7, row5, p1_org, p0_org, q0_org, q1_org);
        row0 = __lasx_xvilvl_b(p0_org, p1_org);
        row1 = __lasx_xvilvl_b(q1_org, q0_org);
        row3 = __lasx_xvilvh_w(row1, row0);
        row2 = __lasx_xvilvl_w(row1, row0);
        p1_org = __lasx_xvpermi_d(row2, 0x00);
        p0_org = __lasx_xvpermi_d(row2, 0x55);
        q0_org = __lasx_xvpermi_d(row3, 0x00);
        q1_org = __lasx_xvpermi_d(row3, 0x55);
    }

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta = __lasx_xvreplgr2vr_b(beta_in);

    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than = is_less_than_alpha & is_less_than_beta;
    is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than = is_less_than_beta & is_less_than;

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
        DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
        DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
        p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
        q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);

        p0_org = __lasx_xvilvl_b(q0_org, p0_org);
        src = data - 1;
        __lasx_xvstelm_h(p0_org, src, 0, 0);
        src += img_width;
        __lasx_xvstelm_h(p0_org, src, 0, 1);
        src += img_width;
        __lasx_xvstelm_h(p0_org, src, 0, 2);
        src += img_width;
        __lasx_xvstelm_h(p0_org, src, 0, 3);
        src += img_width;
        __lasx_xvstelm_h(p0_org, src, 0, 4);
        src += img_width;
        __lasx_xvstelm_h(p0_org, src, 0, 5);
        src += img_width;
        __lasx_xvstelm_h(p0_org, src, 0, 6);
        src += img_width;
        __lasx_xvstelm_h(p0_org, src, 0, 7);
    }
}

void ff_h264_v_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
                                       int alpha_in, int beta_in)
{
    ptrdiff_t img_width_2x = img_width << 1;
    __m256i p1_org, p0_org, q0_org, q1_org;
    __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
    __m256i is_less_than, is_less_than_beta, is_less_than_alpha;

    alpha = __lasx_xvreplgr2vr_b(alpha_in);
    beta = __lasx_xvreplgr2vr_b(beta_in);

    p1_org = __lasx_xvldx(data, -img_width_2x);
    p0_org = __lasx_xvldx(data, -img_width);
    q0_org = __lasx_xvld(data, 0);
    q1_org = __lasx_xvldx(data, img_width);

    p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
    p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
    q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);

    is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
    is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
    is_less_than = is_less_than_alpha & is_less_than_beta;
    is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
    is_less_than = is_less_than_beta & is_less_than;

    if (__lasx_xbnz_v(is_less_than)) {
        __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;

        p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
        p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
        q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
        q1_org_h = __lasx_vext2xv_hu_bu(q1_org);

        AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
        AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
        DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
        DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
        p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
        q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
        __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
        __lasx_xvstelm_d(q0_h, data, 0, 0);
    }
}
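
/* Bi-directional weighted prediction.  Per pixel this computes
 *     dst = clip255((src * weight_src + dst * weight_dst + offset)
 *                   >> (log2_denom + 1))
 * with offset = ((offset_in + 1) | 1) << log2_denom.  Both inputs are
 * biased by -128 (xvxori_b with 128) so the signed dot product
 * xvdp2add_h_b can be used; the (weight_src + weight_dst) << 7 term
 * added to the offset undoes that bias. */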

void ff_biweight_h264_pixels16_8_lasx(uint8_t *dst, uint8_t *src,
                                      ptrdiff_t stride, int height,
                                      int log2_denom, int weight_dst,
                                      int weight_src, int offset_in)
{
    __m256i wgt;
    __m256i src0, src1, src2, src3;
    __m256i dst0, dst1, dst2, dst3;
    __m256i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m256i denom, offset;
    int stride_2x = stride << 1;
    int stride_4x = stride << 2;
    int stride_3x = stride_2x + stride;

    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += ((weight_src + weight_dst) << 7);
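
    /* The extra +1 on log2_denom below accounts for the bidirectional
     * average, and wgt packs (weight_src, weight_dst) byte pairs so each
     * xvdp2add_h_b lane computes src*weight_src + dst*weight_dst on the
     * pixel pairs interleaved by xvilvl_b/xvilvh_b. */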
    log2_denom += 1;

    tmp0 = __lasx_xvreplgr2vr_b(weight_src);
    tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
    wgt = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
    src += stride_4x;
    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
              0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
    dst += stride_4x;
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
    dst -= stride_4x;
    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
              0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);

    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
              dst0, dst1, dst2, dst3);
    DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
              dst3, src3, vec0, vec2, vec4, vec6);
    DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
              dst3, src3, vec1, vec3, vec5, vec7);
    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
              offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    tmp2 = __lasx_xvsra_h(tmp2, denom);
    tmp3 = __lasx_xvsra_h(tmp3, denom);
    tmp4 = __lasx_xvsra_h(tmp4, denom);
    tmp5 = __lasx_xvsra_h(tmp5, denom);
    tmp6 = __lasx_xvsra_h(tmp6, denom);
    tmp7 = __lasx_xvsra_h(tmp7, denom);
    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
              tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
              tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7,
              tmp6, dst0, dst1, dst2, dst3);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst0, dst, 0, 2);
    __lasx_xvstelm_d(dst0, dst, 8, 3);
    dst += stride;
    __lasx_xvstelm_d(dst1, dst, 0, 0);
    __lasx_xvstelm_d(dst1, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst1, dst, 0, 2);
    __lasx_xvstelm_d(dst1, dst, 8, 3);
    dst += stride;
    __lasx_xvstelm_d(dst2, dst, 0, 0);
    __lasx_xvstelm_d(dst2, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst2, dst, 0, 2);
    __lasx_xvstelm_d(dst2, dst, 8, 3);
    dst += stride;
    __lasx_xvstelm_d(dst3, dst, 0, 0);
    __lasx_xvstelm_d(dst3, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst3, dst, 0, 2);
    __lasx_xvstelm_d(dst3, dst, 8, 3);
    dst += stride;

    /* second group of 8 rows */
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
              tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
    dst += stride_4x;
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
    dst -= stride_4x;
    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
              tmp4, 0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);

    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
              dst0, dst1, dst2, dst3);
    DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
              dst3, src3, vec0, vec2, vec4, vec6);
    DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
              dst3, src3, vec1, vec3, vec5, vec7);
    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
              offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    tmp2 = __lasx_xvsra_h(tmp2, denom);
    tmp3 = __lasx_xvsra_h(tmp3, denom);
    tmp4 = __lasx_xvsra_h(tmp4, denom);
    tmp5 = __lasx_xvsra_h(tmp5, denom);
    tmp6 = __lasx_xvsra_h(tmp6, denom);
    tmp7 = __lasx_xvsra_h(tmp7, denom);
    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
              tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
              tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7,
              tmp6, dst0, dst1, dst2, dst3);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst0, dst, 0, 2);
    __lasx_xvstelm_d(dst0, dst, 8, 3);
    dst += stride;
    __lasx_xvstelm_d(dst1, dst, 0, 0);
    __lasx_xvstelm_d(dst1, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst1, dst, 0, 2);
    __lasx_xvstelm_d(dst1, dst, 8, 3);
    dst += stride;
    __lasx_xvstelm_d(dst2, dst, 0, 0);
    __lasx_xvstelm_d(dst2, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst2, dst, 0, 2);
    __lasx_xvstelm_d(dst2, dst, 8, 3);
    dst += stride;
    __lasx_xvstelm_d(dst3, dst, 0, 0);
    __lasx_xvstelm_d(dst3, dst, 8, 1);
    dst += stride;
    __lasx_xvstelm_d(dst3, dst, 0, 2);
    __lasx_xvstelm_d(dst3, dst, 8, 3);
}
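
/* Fixed-size helpers for the 8- and 4-pixel-wide biweight cases follow;
 * the exported ff_biweight_h264_pixels8/4 entry points dispatch on
 * height. */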

static void avc_biwgt_8x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int32_t log2_denom, int32_t weight_src,
                               int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0, vec1;
    __m256i src0, dst0;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0 = __lasx_xvreplgr2vr_b(weight_src);
    tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
    wgt = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
    vec0 = __lasx_xvilvl_b(dst0, src0);
    vec1 = __lasx_xvilvh_b(dst0, src0);
    DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              tmp0, tmp1);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
    dst0 = __lasx_xvpickev_b(tmp1, tmp0);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
}

static void avc_biwgt_8x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int32_t log2_denom, int32_t weight_src,
                               int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0, vec1, vec2, vec3;
    __m256i src0, src1, dst0, dst1;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;
    uint8_t* dst_tmp = dst;

    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0 = __lasx_xvreplgr2vr_b(weight_src);
    tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
    wgt = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    tmp0 = __lasx_xvld(dst_tmp, 0);
    DUP2_ARG2(__lasx_xvldx, dst_tmp, stride, dst_tmp, stride_2x, tmp1, tmp2);
    tmp3 = __lasx_xvldx(dst_tmp, stride_3x);
    dst_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);

    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, dst0, 128, dst1, 128,
              src0, src1, dst0, dst1);
    DUP2_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, vec0, vec2);
    DUP2_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, vec1, vec3);
    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    tmp2 = __lasx_xvsra_h(tmp2, denom);
    tmp3 = __lasx_xvsra_h(tmp3, denom);
    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, dst0, dst1);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_d(dst1, dst, 0, 0);
    __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
}

static void avc_biwgt_8x16_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                                int32_t log2_denom, int32_t weight_src,
                                int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    __m256i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;
    uint8_t* dst_tmp = dst;

    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0 = __lasx_xvreplgr2vr_b(weight_src);
    tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
    wgt = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);

    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    dst_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    dst_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    dst_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
              dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);

    DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
              dst0, dst1, dst2, dst3);
    DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
              dst3, src3, vec0, vec2, vec4, vec6);
    DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
              dst3, src3, vec1, vec3, vec5, vec7);
    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
              offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    tmp2 = __lasx_xvsra_h(tmp2, denom);
    tmp3 = __lasx_xvsra_h(tmp3, denom);
    tmp4 = __lasx_xvsra_h(tmp4, denom);
    tmp5 = __lasx_xvsra_h(tmp5, denom);
    tmp6 = __lasx_xvsra_h(tmp6, denom);
    tmp7 = __lasx_xvsra_h(tmp7, denom);
    DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
              tmp0, tmp1, tmp2, tmp3);
    DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
              tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7,
              tmp6, dst0, dst1, dst2, dst3);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_d(dst1, dst, 0, 0);
    __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_d(dst2, dst, 0, 0);
    __lasx_xvstelm_d(dst2, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst2, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst2, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_d(dst3, dst, 0, 0);
    __lasx_xvstelm_d(dst3, dst + stride, 0, 1);
    __lasx_xvstelm_d(dst3, dst + stride_2x, 0, 2);
    __lasx_xvstelm_d(dst3, dst + stride_3x, 0, 3);
}

void ff_biweight_h264_pixels8_8_lasx(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride, int height,
                                     int log2_denom, int weight_dst,
                                     int weight_src, int offset)
{
    if (4 == height) {
        avc_biwgt_8x4_lasx(src, dst, stride, log2_denom, weight_src,
                           weight_dst, offset);
    } else if (8 == height) {
        avc_biwgt_8x8_lasx(src, dst, stride, log2_denom, weight_src,
                           weight_dst, offset);
    } else {
        avc_biwgt_8x16_lasx(src, dst, stride, log2_denom, weight_src,
                            weight_dst, offset);
    }
}

static void avc_biwgt_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int32_t log2_denom, int32_t weight_src,
                               int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0, src0, dst0;
    __m256i tmp0, tmp1, denom, offset;

    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0 = __lasx_xvreplgr2vr_b(weight_src);
    tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
    wgt = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1);
    src0 = __lasx_xvilvl_w(tmp1, tmp0);
    DUP2_ARG2(__lasx_xvldx, dst, 0, dst, stride, tmp0, tmp1);
    dst0 = __lasx_xvilvl_w(tmp1, tmp0);
    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
    vec0 = __lasx_xvilvl_b(dst0, src0);
    tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp0 = __lasx_xvclip255_h(tmp0);
    tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
    __lasx_xvstelm_w(tmp0, dst, 0, 0);
    __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
}

static void avc_biwgt_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int32_t log2_denom, int32_t weight_src,
                               int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0, src0, dst0;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0 = __lasx_xvreplgr2vr_b(weight_src);
    tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
    wgt = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
    src0 = __lasx_xvilvl_w(tmp1, tmp0);
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
    dst0 = __lasx_xvilvl_w(tmp1, tmp0);
    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
    vec0 = __lasx_xvilvl_b(dst0, src0);
    dst0 = __lasx_xvilvh_b(dst0, src0);
    vec0 = __lasx_xvpermi_q(vec0, dst0, 0x02);
    tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp0 = __lasx_xvclip255_h(tmp0);
    tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
    __lasx_xvstelm_w(tmp0, dst, 0, 0);
    __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
    __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 4);
    __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 5);
}

static void avc_biwgt_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
                               int32_t log2_denom, int32_t weight_src,
                               int32_t weight_dst, int32_t offset_in)
{
    __m256i wgt, vec0, vec1;
    __m256i src0, dst0;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
    offset_in += ((weight_src + weight_dst) << 7);
    log2_denom += 1;

    tmp0 = __lasx_xvreplgr2vr_b(weight_src);
    tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
    wgt = __lasx_xvilvh_b(tmp1, tmp0);
    offset = __lasx_xvreplgr2vr_h(offset_in);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
    DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
    dst += stride_4x;
    DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
              dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
    dst -= stride_4x;
    DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
    vec0 = __lasx_xvilvl_b(dst0, src0);
    vec1 = __lasx_xvilvh_b(dst0, src0);
    DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
              tmp0, tmp1);
    tmp0 = __lasx_xvsra_h(tmp0, denom);
    tmp1 = __lasx_xvsra_h(tmp1, denom);
    DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
    tmp0 = __lasx_xvpickev_b(tmp1, tmp0);
    __lasx_xvstelm_w(tmp0, dst, 0, 0);
    __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
    __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 2);
    __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 3);
    dst += stride_4x;
    __lasx_xvstelm_w(tmp0, dst, 0, 4);
    __lasx_xvstelm_w(tmp0, dst + stride, 0, 5);
    __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 6);
    __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 7);
}

void ff_biweight_h264_pixels4_8_lasx(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride, int height,
                                     int log2_denom, int weight_dst,
                                     int weight_src, int offset)
{
    if (2 == height) {
        avc_biwgt_4x2_lasx(src, dst, stride, log2_denom, weight_src,
                           weight_dst, offset);
    } else if (4 == height) {
        avc_biwgt_4x4_lasx(src, dst, stride, log2_denom, weight_src,
                           weight_dst, offset);
    } else {
        avc_biwgt_4x8_lasx(src, dst, stride, log2_denom, weight_src,
                           weight_dst, offset);
    }
}
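
/* Unidirectional weighted prediction:
 *     dst = clip255((src * weight + (offset_in << log2_denom) + round)
 *                   >> log2_denom)
 * implemented with a widening multiply, a saturating add of the offset,
 * a clamp at zero, and xvssrlrn_bu_h, whose rounding shift-right-narrow
 * supplies both the rounding term and the final clip to 8 bits. */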

void ff_weight_h264_pixels16_8_lasx(uint8_t *src, ptrdiff_t stride,
                                    int height, int log2_denom,
                                    int weight_src, int offset_in)
{
    uint32_t offset_val;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;
    __m256i zero = __lasx_xvldi(0);
    __m256i src0, src1, src2, src3;
    __m256i src0_l, src1_l, src2_l, src3_l, src0_h, src1_h, src2_h, src3_h;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m256i wgt, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
    src -= stride_4x;
    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
              0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
              zero, src3, src0_l, src1_l, src2_l, src3_l);
    DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
              zero, src3, src0_h, src1_h, src2_h, src3_h);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src1_l = __lasx_xvmul_h(wgt, src1_l);
    src1_h = __lasx_xvmul_h(wgt, src1_h);
    src2_l = __lasx_xvmul_h(wgt, src2_l);
    src2_h = __lasx_xvmul_h(wgt, src2_h);
    src3_l = __lasx_xvmul_h(wgt, src3_l);
    src3_h = __lasx_xvmul_h(wgt, src3_h);
    DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
              src1_h, offset, src0_l, src0_h, src1_l, src1_h);
    DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
              src3_h, offset, src2_l, src2_h, src3_l, src3_h);
    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src1_l = __lasx_xvmaxi_h(src1_l, 0);
    src1_h = __lasx_xvmaxi_h(src1_h, 0);
    src2_l = __lasx_xvmaxi_h(src2_l, 0);
    src2_h = __lasx_xvmaxi_h(src2_h, 0);
    src3_l = __lasx_xvmaxi_h(src3_l, 0);
    src3_h = __lasx_xvmaxi_h(src3_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
    src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
    src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
    src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
    src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
    src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
    src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
    __lasx_xvstelm_d(src0_l, src, 0, 0);
    __lasx_xvstelm_d(src0_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src0_l, src, 0, 2);
    __lasx_xvstelm_d(src0_h, src, 8, 2);
    src += stride;
    __lasx_xvstelm_d(src1_l, src, 0, 0);
    __lasx_xvstelm_d(src1_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src1_l, src, 0, 2);
    __lasx_xvstelm_d(src1_h, src, 8, 2);
    src += stride;
    __lasx_xvstelm_d(src2_l, src, 0, 0);
    __lasx_xvstelm_d(src2_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src2_l, src, 0, 2);
    __lasx_xvstelm_d(src2_h, src, 8, 2);
    src += stride;
    __lasx_xvstelm_d(src3_l, src, 0, 0);
    __lasx_xvstelm_d(src3_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src3_l, src, 0, 2);
    __lasx_xvstelm_d(src3_h, src, 8, 2);
    src += stride;

    /* second group of 8 rows */
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
    src -= stride_4x;
    DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
              tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
    DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
              zero, src3, src0_l, src1_l, src2_l, src3_l);
    DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
              zero, src3, src0_h, src1_h, src2_h, src3_h);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src1_l = __lasx_xvmul_h(wgt, src1_l);
    src1_h = __lasx_xvmul_h(wgt, src1_h);
    src2_l = __lasx_xvmul_h(wgt, src2_l);
    src2_h = __lasx_xvmul_h(wgt, src2_h);
    src3_l = __lasx_xvmul_h(wgt, src3_l);
    src3_h = __lasx_xvmul_h(wgt, src3_h);
    DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l,
              offset, src1_h, offset, src0_l, src0_h, src1_l, src1_h);
    DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l,
              offset, src3_h, offset, src2_l, src2_h, src3_l, src3_h);
    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src1_l = __lasx_xvmaxi_h(src1_l, 0);
    src1_h = __lasx_xvmaxi_h(src1_h, 0);
    src2_l = __lasx_xvmaxi_h(src2_l, 0);
    src2_h = __lasx_xvmaxi_h(src2_h, 0);
    src3_l = __lasx_xvmaxi_h(src3_l, 0);
    src3_h = __lasx_xvmaxi_h(src3_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
    src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
    src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
    src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
    src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
    src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
    src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
    __lasx_xvstelm_d(src0_l, src, 0, 0);
    __lasx_xvstelm_d(src0_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src0_l, src, 0, 2);
    __lasx_xvstelm_d(src0_h, src, 8, 2);
    src += stride;
    __lasx_xvstelm_d(src1_l, src, 0, 0);
    __lasx_xvstelm_d(src1_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src1_l, src, 0, 2);
    __lasx_xvstelm_d(src1_h, src, 8, 2);
    src += stride;
    __lasx_xvstelm_d(src2_l, src, 0, 0);
    __lasx_xvstelm_d(src2_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src2_l, src, 0, 2);
    __lasx_xvstelm_d(src2_h, src, 8, 2);
    src += stride;
    __lasx_xvstelm_d(src3_l, src, 0, 0);
    __lasx_xvstelm_d(src3_h, src, 8, 0);
    src += stride;
    __lasx_xvstelm_d(src3_l, src, 0, 2);
    __lasx_xvstelm_d(src3_h, src, 8, 2);
}

static void avc_wgt_8x4_lasx(uint8_t *src, ptrdiff_t stride,
                             int32_t log2_denom, int32_t weight_src,
                             int32_t offset_in)
{
    uint32_t offset_val;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;
    __m256i wgt, zero = __lasx_xvldi(0);
    __m256i src0, src0_h, src0_l;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    src0_l = __lasx_xvilvl_b(zero, src0);
    src0_h = __lasx_xvilvh_b(zero, src0);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src0_l = __lasx_xvsadd_h(src0_l, offset);
    src0_h = __lasx_xvsadd_h(src0_h, offset);
    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);

    src0 = __lasx_xvpickev_d(src0_h, src0_l);
    __lasx_xvstelm_d(src0, src, 0, 0);
    __lasx_xvstelm_d(src0, src + stride, 0, 1);
    __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
}
static void avc_wgt_8x8_lasx(uint8_t *src, ptrdiff_t stride,
                             int32_t log2_denom, int32_t src_weight,
                             int32_t offset_in)
{
    __m256i src0, src1, src0_h, src0_l, src1_h, src1_l, zero = __lasx_xvldi(0);
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
    uint32_t offset_val;
    uint8_t* src_tmp = src;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __lasx_xvreplgr2vr_h(src_weight);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);

    DUP2_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, src0_l, src1_l);
    DUP2_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, src0_h, src1_h);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src1_l = __lasx_xvmul_h(wgt, src1_l);
    src1_h = __lasx_xvmul_h(wgt, src1_h);
    DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
              src1_h, offset, src0_l, src0_h, src1_l, src1_h);
    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src1_l = __lasx_xvmaxi_h(src1_l, 0);
    src1_h = __lasx_xvmaxi_h(src1_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
    src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
    src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);

    DUP2_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src0, src1);
    __lasx_xvstelm_d(src0, src, 0, 0);
    __lasx_xvstelm_d(src0, src + stride, 0, 1);
    __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
    src += stride_4x;
    __lasx_xvstelm_d(src1, src, 0, 0);
    __lasx_xvstelm_d(src1, src + stride, 0, 1);
    __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
}
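/* Data layout used by the 8-wide kernels: __lasx_xvilvl_d packs two 8-byte
 * rows into one 16-byte half, and __lasx_xvpermi_q with control 0x20
 * concatenates two such halves, so a single __m256i carries four
 * consecutive rows. After __lasx_xvpickev_d recombines the widened halves,
 * the __lasx_xvstelm_d stores simply select doubleword elements 0..3 for
 * rows 0..3. */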
static void avc_wgt_8x16_lasx(uint8_t *src, ptrdiff_t stride,
                              int32_t log2_denom, int32_t src_weight,
                              int32_t offset_in)
{
    __m256i src0, src1, src2, src3;
    __m256i src0_h, src0_l, src1_h, src1_l, src2_h, src2_l, src3_h, src3_l;
    __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
    __m256i zero = __lasx_xvldi(0);
    uint32_t offset_val;
    uint8_t* src_tmp = src;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __lasx_xvreplgr2vr_h(src_weight);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src_tmp += stride_4x;
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
              src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);

    DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2, zero, src3,
              src0_l, src1_l, src2_l, src3_l);
    DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
              src0_h, src1_h, src2_h, src3_h);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src1_l = __lasx_xvmul_h(wgt, src1_l);
    src1_h = __lasx_xvmul_h(wgt, src1_h);
    src2_l = __lasx_xvmul_h(wgt, src2_l);
    src2_h = __lasx_xvmul_h(wgt, src2_h);
    src3_l = __lasx_xvmul_h(wgt, src3_l);
    src3_h = __lasx_xvmul_h(wgt, src3_h);
    DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
              src1_h, offset, src0_l, src0_h, src1_l, src1_h);
    DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
              src3_h, offset, src2_l, src2_h, src3_l, src3_h);
    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src1_l = __lasx_xvmaxi_h(src1_l, 0);
    src1_h = __lasx_xvmaxi_h(src1_h, 0);
    src2_l = __lasx_xvmaxi_h(src2_l, 0);
    src2_h = __lasx_xvmaxi_h(src2_h, 0);
    src3_l = __lasx_xvmaxi_h(src3_l, 0);
    src3_h = __lasx_xvmaxi_h(src3_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
    src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
    src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
    src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
    src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
    src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
    src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
    DUP4_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src2_h, src2_l,
              src3_h, src3_l, src0, src1, src2, src3);

    __lasx_xvstelm_d(src0, src, 0, 0);
    __lasx_xvstelm_d(src0, src + stride, 0, 1);
    __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
    src += stride_4x;
    __lasx_xvstelm_d(src1, src, 0, 0);
    __lasx_xvstelm_d(src1, src + stride, 0, 1);
    __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
    src += stride_4x;
    __lasx_xvstelm_d(src2, src, 0, 0);
    __lasx_xvstelm_d(src2, src + stride, 0, 1);
    __lasx_xvstelm_d(src2, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src2, src + stride_3x, 0, 3);
    src += stride_4x;
    __lasx_xvstelm_d(src3, src, 0, 0);
    __lasx_xvstelm_d(src3, src + stride, 0, 1);
    __lasx_xvstelm_d(src3, src + stride_2x, 0, 2);
    __lasx_xvstelm_d(src3, src + stride_3x, 0, 3);
}
void ff_weight_h264_pixels8_8_lasx(uint8_t *src, ptrdiff_t stride,
                                   int height, int log2_denom,
                                   int weight_src, int offset)
{
    if (4 == height) {
        avc_wgt_8x4_lasx(src, stride, log2_denom, weight_src, offset);
    } else if (8 == height) {
        avc_wgt_8x8_lasx(src, stride, log2_denom, weight_src, offset);
    } else {
        avc_wgt_8x16_lasx(src, stride, log2_denom, weight_src, offset);
    }
}
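/* The ff_weight_h264_pixels{8,4}_8_lasx entry points dispatch on the block
 * height only; the width is fixed by the function name, so heights 4/8/16
 * here (and 2/4/8 in the 4-wide dispatcher below) cover the 8-wide and
 * 4-wide partition sizes H.264 weighted prediction is applied to. */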
static void avc_wgt_4x2_lasx(uint8_t *src, ptrdiff_t stride,
                             int32_t log2_denom, int32_t weight_src,
                             int32_t offset_in)
{
    uint32_t offset_val;
    __m256i wgt, zero = __lasx_xvldi(0);
    __m256i src0, tmp0, tmp1, denom, offset;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP2_ARG2(__lasx_xvldrepl_w, src, 0, src + stride, 0, tmp0, tmp1);
    src0 = __lasx_xvilvl_w(tmp1, tmp0);
    src0 = __lasx_xvilvl_b(zero, src0);
    src0 = __lasx_xvmul_h(wgt, src0);
    src0 = __lasx_xvsadd_h(src0, offset);
    src0 = __lasx_xvmaxi_h(src0, 0);
    src0 = __lasx_xvssrlrn_bu_h(src0, denom);
    __lasx_xvstelm_w(src0, src, 0, 0);
    __lasx_xvstelm_w(src0, src + stride, 0, 1);
}
static void avc_wgt_4x4_lasx(uint8_t *src, ptrdiff_t stride,
                             int32_t log2_denom, int32_t weight_src,
                             int32_t offset_in)
{
    __m256i wgt;
    __m256i src0, tmp0, tmp1, tmp2, tmp3, denom, offset;
    uint32_t offset_val;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
    src0 = __lasx_xvilvl_w(tmp1, tmp0);
    src0 = __lasx_vext2xv_hu_bu(src0);
    src0 = __lasx_xvmul_h(wgt, src0);
    src0 = __lasx_xvsadd_h(src0, offset);
    src0 = __lasx_xvmaxi_h(src0, 0);
    src0 = __lasx_xvssrlrn_bu_h(src0, denom);
    __lasx_xvstelm_w(src0, src, 0, 0);
    __lasx_xvstelm_w(src0, src + stride, 0, 1);
    __lasx_xvstelm_w(src0, src + stride_2x, 0, 4);
    __lasx_xvstelm_w(src0, src + stride_3x, 0, 5);
}
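/* __lasx_xvssrlrn_bu_h narrows each 128-bit lane independently, so the
 * byte results of the high lane start at word element 4 of the
 * destination. Hence the 4-wide kernels store word elements 0 and 1 (rows
 * held in the low lane) followed by 4 and 5 (rows held in the high lane). */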
static void avc_wgt_4x8_lasx(uint8_t *src, ptrdiff_t stride,
                             int32_t log2_denom, int32_t weight_src,
                             int32_t offset_in)
{
    __m256i src0, src0_h, src0_l;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
    __m256i wgt, zero = __lasx_xvldi(0);
    uint32_t offset_val;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_4x = stride << 2;
    ptrdiff_t stride_3x = stride_2x + stride;

    offset_val = (unsigned) offset_in << log2_denom;

    wgt = __lasx_xvreplgr2vr_h(weight_src);
    offset = __lasx_xvreplgr2vr_h(offset_val);
    denom = __lasx_xvreplgr2vr_h(log2_denom);

    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp0, tmp1, tmp2, tmp3);
    src += stride_4x;
    DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
              src, stride_3x, tmp4, tmp5, tmp6, tmp7);
    src -= stride_4x;
    DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7,
              tmp5, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
    src0_l = __lasx_xvilvl_b(zero, src0);
    src0_h = __lasx_xvilvh_b(zero, src0);
    src0_l = __lasx_xvmul_h(wgt, src0_l);
    src0_h = __lasx_xvmul_h(wgt, src0_h);
    src0_l = __lasx_xvsadd_h(src0_l, offset);
    src0_h = __lasx_xvsadd_h(src0_h, offset);
    src0_l = __lasx_xvmaxi_h(src0_l, 0);
    src0_h = __lasx_xvmaxi_h(src0_h, 0);
    src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
    src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
    __lasx_xvstelm_w(src0_l, src, 0, 0);
    __lasx_xvstelm_w(src0_l, src + stride, 0, 1);
    __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 0);
    __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 1);
    src += stride_4x;
    __lasx_xvstelm_w(src0_l, src, 0, 4);
    __lasx_xvstelm_w(src0_l, src + stride, 0, 5);
    __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 4);
    __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 5);
}
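/* avc_wgt_4x8_lasx packs all eight 4-pixel rows into a single __m256i
 * before widening, then reuses the 4x4 store pattern twice, advancing src
 * by stride_4x between the two groups of four rows. */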
void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride,
                                   int height, int log2_denom,
                                   int weight_src, int offset)
{
    if (2 == height) {
        avc_wgt_4x2_lasx(src, stride, log2_denom, weight_src, offset);
    } else if (4 == height) {
        avc_wgt_4x4_lasx(src, stride, log2_denom, weight_src, offset);
    } else {
        avc_wgt_4x8_lasx(src, stride, log2_denom, weight_src, offset);
    }
}
void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
{
    __m256i src0, dst0, dst1, dst2, dst3, zero;
    __m256i tmp0, tmp1;
    uint8_t* _dst1 = _dst + stride;
    uint8_t* _dst2 = _dst1 + stride;
    uint8_t* _dst3 = _dst2 + stride;

    src0 = __lasx_xvld(_src, 0);
    dst0 = __lasx_xvldrepl_w(_dst, 0);
    dst1 = __lasx_xvldrepl_w(_dst1, 0);
    dst2 = __lasx_xvldrepl_w(_dst2, 0);
    dst3 = __lasx_xvldrepl_w(_dst3, 0);
    tmp0 = __lasx_xvilvl_w(dst1, dst0);
    tmp1 = __lasx_xvilvl_w(dst3, dst2);
    dst0 = __lasx_xvilvl_d(tmp1, tmp0);
    tmp0 = __lasx_vext2xv_hu_bu(dst0);
    zero = __lasx_xvldi(0);
    tmp1 = __lasx_xvadd_h(src0, tmp0);
    dst0 = __lasx_xvpickev_b(tmp1, tmp1);
    __lasx_xvstelm_w(dst0, _dst, 0, 0);
    __lasx_xvstelm_w(dst0, _dst1, 0, 1);
    __lasx_xvstelm_w(dst0, _dst2, 0, 4);
    __lasx_xvstelm_w(dst0, _dst3, 0, 5);
    __lasx_xvst(zero, _src, 0);
}
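/* ff_h264_add_pixels4_8_lasx adds a 4x4 block of 16-bit residuals to the
 * destination pixels, packs the sums back to bytes with
 * __lasx_xvpickev_b, and clears the residual buffer afterwards (the
 * __lasx_xvst of zero). The word-element indices 0, 1, 4, 5 again follow
 * from the per-lane byte packing described above. */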
void ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
{
    __m256i src0, src1, src2, src3;
    __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m256i tmp0, tmp1, tmp2, tmp3;
    __m256i zero = __lasx_xvldi(0);
    uint8_t *_dst1 = _dst + stride;
    uint8_t *_dst2 = _dst1 + stride;
    uint8_t *_dst3 = _dst2 + stride;
    uint8_t *_dst4 = _dst3 + stride;
    uint8_t *_dst5 = _dst4 + stride;
    uint8_t *_dst6 = _dst5 + stride;
    uint8_t *_dst7 = _dst6 + stride;

    src0 = __lasx_xvld(_src, 0);
    src1 = __lasx_xvld(_src, 32);
    src2 = __lasx_xvld(_src, 64);
    src3 = __lasx_xvld(_src, 96);
    dst0 = __lasx_xvldrepl_d(_dst, 0);
    dst1 = __lasx_xvldrepl_d(_dst1, 0);
    dst2 = __lasx_xvldrepl_d(_dst2, 0);
    dst3 = __lasx_xvldrepl_d(_dst3, 0);
    dst4 = __lasx_xvldrepl_d(_dst4, 0);
    dst5 = __lasx_xvldrepl_d(_dst5, 0);
    dst6 = __lasx_xvldrepl_d(_dst6, 0);
    dst7 = __lasx_xvldrepl_d(_dst7, 0);
    tmp0 = __lasx_xvilvl_d(dst1, dst0);
    tmp1 = __lasx_xvilvl_d(dst3, dst2);
    tmp2 = __lasx_xvilvl_d(dst5, dst4);
    tmp3 = __lasx_xvilvl_d(dst7, dst6);
    dst0 = __lasx_vext2xv_hu_bu(tmp0);
    dst1 = __lasx_vext2xv_hu_bu(tmp1);
    dst2 = __lasx_vext2xv_hu_bu(tmp2);
    dst3 = __lasx_vext2xv_hu_bu(tmp3);
    tmp0 = __lasx_xvadd_h(src0, dst0);
    tmp1 = __lasx_xvadd_h(src1, dst1);
    tmp2 = __lasx_xvadd_h(src2, dst2);
    tmp3 = __lasx_xvadd_h(src3, dst3);
    dst1 = __lasx_xvpickev_b(tmp1, tmp0);
    dst2 = __lasx_xvpickev_b(tmp3, tmp2);
    __lasx_xvst(zero, _src, 0);
    __lasx_xvst(zero, _src, 32);
    __lasx_xvst(zero, _src, 64);
    __lasx_xvst(zero, _src, 96);
    __lasx_xvstelm_d(dst1, _dst, 0, 0);
    __lasx_xvstelm_d(dst1, _dst1, 0, 2);
    __lasx_xvstelm_d(dst1, _dst2, 0, 1);
    __lasx_xvstelm_d(dst1, _dst3, 0, 3);
    __lasx_xvstelm_d(dst2, _dst4, 0, 0);
    __lasx_xvstelm_d(dst2, _dst5, 0, 2);
    __lasx_xvstelm_d(dst2, _dst6, 0, 1);
    __lasx_xvstelm_d(dst2, _dst7, 0, 3);
}
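/* The seemingly shuffled store order for dst1/dst2 (elements 0, 2, 1, 3)
 * is a consequence of the layout: __lasx_vext2xv_hu_bu spreads each pair
 * of rows across the two 128-bit lanes, and the per-lane
 * __lasx_xvpickev_b then leaves row 0 in element 0, row 2 in element 1,
 * row 1 in element 2 and row 3 in element 3 of the packed result. */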