28 uint8_t *p_is_pcm, uint8_t *q_is_pcm)
30 ptrdiff_t stride_2x = (stride << 1);
31 ptrdiff_t stride_4x = (stride << 2);
32 ptrdiff_t stride_3x = stride_2x + stride;
33 uint8_t *p3 = src - stride_4x;
34 uint8_t *p2 = src - stride_3x;
35 uint8_t *p1 = src - stride_2x;
39 uint8_t *q2 = src + stride_2x;
40 uint8_t *q3 = src + stride_3x;
42 int32_t dp00, dq00, dp30, dq30, d00, d30, d0030, d0434;
43 int32_t dp04, dq04, dp34, dq34, d04, d34;
44 int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
47 __m128i dst0, dst1, dst2, dst3, dst4, dst5;
48 __m128i cmp0, cmp1, cmp2, cmp3, p_is_pcm_vec, q_is_pcm_vec;
50 __m128i temp2, tc_pos, tc_neg;
51 __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
53 __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
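/* dp00..dq34 below are the per-line second-difference activity measures
 * |p2 - 2*p1 + p0| and |q2 - 2*q1 + q0| used by the HEVC luma deblocking
 * decision; their sums are later compared against beta. */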
55 dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
56 dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
57 dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
58 dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
61 dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
62 dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
63 dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
64 dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
68 p_is_pcm0 = p_is_pcm[0];
69 p_is_pcm4 = p_is_pcm[1];
70 q_is_pcm0 = q_is_pcm[0];
71 q_is_pcm4 = q_is_pcm[1];
73 DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
74 p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
75 p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
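/* d0030/d0434 below record, for each 4-line segment, whether the summed
 * activity d00 + d30 (resp. d04 + d34) reaches beta; segments that reach it
 * are masked out of the filtering via cmp3. */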
76 d0030 = (d00 + d30) >= beta;
77 d0434 = (d04 + d34) >= beta;
78 DUP2_ARG1(__lsx_vreplgr2vr_w, d0030, d0434, cmp0, cmp1);
79 cmp3 = __lsx_vpackev_w(cmp1, cmp0);
80 cmp3 = __lsx_vseqi_w(cmp3, 0);
82 if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
84 DUP4_ARG2(__lsx_vld, p3, 0, p2, 0, p1, 0, p0, 0,
85 p3_src, p2_src, p1_src, p0_src);
86 DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
87 q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
88 q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
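/* tc250/tc254 below are the strong-filter thresholds (5 * tc + 1) >> 1 for
 * the two 4-line segments, used in the strong/weak filter decision. */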
93 tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
95 tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
97 DUP2_ARG1(__lsx_vreplgr2vr_h, tc0, tc4, cmp0, cmp1);
99 p0_src, p3_src, p2_src, p1_src, p0_src);
101 q0_src, q1_src, q2_src, q3_src);
102 flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
103 abs(p0[0] - q0[0]) < tc250;
104 flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
105 abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 &&
106 (d30 << 1) < beta20);
107 tc_pos = __lsx_vpackev_d(cmp1, cmp0);
109 zero, q3_src, q0_src, q1_src, q2_src, q3_src);
111 flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
112 abs(p0[4] - q0[4]) < tc254;
113 flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
114 abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 &&
115 (d34 << 1) < beta20);
116 DUP2_ARG1(__lsx_vreplgr2vr_w, flag0, flag1, cmp0, cmp1);
117 cmp2 = __lsx_vpackev_w(cmp1, cmp0);
118 cmp2 = __lsx_vseqi_w(cmp2, 0);
120 if (flag0 && flag1) {
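/* Both segments take the strong filter: p2/p1/p0 and q0/q1/q2 are each
 * rebuilt from low-pass combinations of their neighbours, the change is
 * clipped to +/- 2*tc, and PCM lanes are restored via the is_pcm masks. */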
122 tc_pos = __lsx_vslli_h(tc_pos, 1);
123 tc_neg = __lsx_vneg_h(tc_pos);
126 DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
128 temp1 = __lsx_vadd_h(p3_src, p2_src);
129 temp1 = __lsx_vslli_h(temp1, 1);
130 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
131 temp1 = __lsx_vsrari_h(temp1, 3);
132 temp2 = __lsx_vsub_h(temp1, p2_src);
133 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
134 dst0 = __lsx_vadd_h(temp2, p2_src);
136 temp1 = __lsx_vadd_h(temp0, p2_src);
137 temp1 = __lsx_vsrari_h(temp1, 2);
138 temp2 = __lsx_vsub_h(temp1, p1_src);
139 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
140 dst1 = __lsx_vadd_h(temp2, p1_src);
142 temp1 = __lsx_vslli_h(temp0, 1);
143 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src,
145 temp1 = __lsx_vsrari_h(temp1, 3);
146 temp2 = __lsx_vsub_h(temp1, p0_src);
147 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
148 dst2 = __lsx_vadd_h(temp2, p0_src);
150 p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
151 DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
152 p1_src, p_is_pcm_vec, dst0, dst1);
153 dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
156 DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
158 temp1 = __lsx_vadd_h(q3_src, q2_src);
159 temp1 = __lsx_vslli_h(temp1, 1);
160 DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
161 temp1 = __lsx_vsrari_h(temp1, 3);
162 temp2 = __lsx_vsub_h(temp1, q2_src);
163 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
164 dst5 = __lsx_vadd_h(temp2, q2_src);
166 temp1 = __lsx_vadd_h(temp0, q2_src);
167 temp1 = __lsx_vsrari_h(temp1, 2);
168 temp2 = __lsx_vsub_h(temp1, q1_src);
169 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
170 dst4 = __lsx_vadd_h(temp2, q1_src);
172 temp0 = __lsx_vslli_h(temp0, 1);
173 DUP2_ARG2(__lsx_vadd_h, temp0, p1_src, temp1, q2_src,
175 temp1 = __lsx_vsrari_h(temp1, 3);
176 temp2 = __lsx_vsub_h(temp1, q0_src);
177 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
178 dst3 = __lsx_vadd_h(temp2, q0_src);
180 q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
181 DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
182 q1_src, q_is_pcm_vec, dst3, dst4);
183 dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
186 DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
187 dst2 = __lsx_vpickev_b(dst5, dst4);
190 DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
192 dst5 = __lsx_vpickev_b(q2_src, q1_src);
194 cmp3 = __lsx_vnor_v(cmp3, cmp3);
195 DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
197 dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);
199 __lsx_vstelm_d(dst0, p2, 0, 0);
200 __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
201 __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
202 __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
203 __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
204 __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
206 } else if (flag0 == flag1) {
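/* Neither segment qualified for the strong filter: apply the normal (weak)
 * filter, delta0 = (9*(q0-p0) - 3*(q1-p1) + 8) >> 4, clipped to +/- tc and
 * applied only where |delta0| < 10*tc. */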
208 tc_neg = __lsx_vneg_h(tc_pos);
209 DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
211 DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
212 __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
213 delta0 = __lsx_vsub_h(diff0, diff1);
214 delta0 = __lsx_vsrari_h(delta0, 4);
215 temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
216 __lsx_vslli_h(tc_pos, 1));
217 abs_delta0 = __lsx_vadda_h(delta0, zero);
218 abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
219 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
221 delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
222 temp2 = __lsx_vadd_h(delta0, p0_src);
223 temp2 = __lsx_vclip255_h(temp2);
224 temp0 = __lsx_vbitsel_v(temp2, p0_src,
225 __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec));
226 temp2 = __lsx_vsub_h(q0_src, delta0);
227 temp2 = __lsx_vclip255_h(temp2);
228 temp2 = __lsx_vbitsel_v(temp2, q0_src, __lsx_vnor_v(q_is_pcm_vec,
230 DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
231 q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
233 tmp = (beta + (beta >> 1)) >> 3;
236 cmp0 = __lsx_vpackev_d(cmp1, cmp0);
237 cmp0 = __lsx_vseqi_d(cmp0, 0);
238 p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, cmp0);
242 cmp0 = __lsx_vpackev_d(cmp1, cmp0);
243 cmp0 = __lsx_vseqi_d(cmp0, 0);
244 q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, cmp0);
245 tc_pos = __lsx_vsrai_h(tc_pos, 1);
246 tc_neg = __lsx_vneg_h(tc_pos);
248 DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
250 DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
252 delta1 = __lsx_vadd_h(delta1, delta0);
253 delta2 = __lsx_vsub_h(delta2, delta0);
254 DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
255 DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2,
256 tc_neg, tc_pos, delta1, delta2);
257 DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
259 DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
260 DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
261 q1_src, q_is_pcm_vec, delta1, delta2);
263 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
264 DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
265 p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
266 q1_src, abs_delta0, dst1, dst2, dst3, dst4);
268 DUP2_ARG2(__lsx_vpickev_b, dst2, dst1, dst4, dst3, dst0, dst1);
270 DUP2_ARG2(__lsx_vpickev_b, p0_src, p1_src, q1_src, q0_src,
272 cmp3 = __lsx_vnor_v(cmp3, cmp3);
273 DUP2_ARG3(__lsx_vbitsel_v, dst0, dst2, cmp3, dst1, dst3, cmp3,
277 __lsx_vstelm_d(dst0, p2, 0, 0);
278 __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
279 __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
280 __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
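/* Mixed case: one 4-line segment wants the strong filter and the other the
 * weak filter; both results are computed and blended per segment with cmp2. */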
284 tc_pos = __lsx_vslli_h(tc_pos, 1);
285 tc_neg = __lsx_vneg_h(tc_pos);
288 DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
290 temp1 = __lsx_vadd_h(p3_src, p2_src);
291 temp1 = __lsx_vslli_h(temp1, 1);
292 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
293 temp1 = __lsx_vsrari_h(temp1, 3);
294 temp2 = __lsx_vsub_h(temp1, p2_src);
295 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
296 dst0 = __lsx_vadd_h(temp2, p2_src);
298 temp1 = __lsx_vadd_h(temp0, p2_src);
299 temp1 = __lsx_vsrari_h(temp1, 2);
300 temp2 = __lsx_vsub_h(temp1, p1_src);
301 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
302 dst1 = __lsx_vadd_h(temp2, p1_src);
304 temp1 = __lsx_vslli_h(temp0, 1);
305 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
306 temp1 = __lsx_vsrari_h(temp1, 3);
307 temp2 = __lsx_vsub_h(temp1, p0_src);
308 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
309 dst2 = __lsx_vadd_h(temp2, p0_src);
311 p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
312 DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
313 p1_src, p_is_pcm_vec, dst0, dst1);
314 dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
317 DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
319 temp1 = __lsx_vadd_h(q3_src, q2_src);
320 temp1 = __lsx_vslli_h(temp1, 1);
321 DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
322 temp1 = __lsx_vsrari_h(temp1, 3);
323 temp2 = __lsx_vsub_h(temp1, q2_src);
324 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
325 dst5 = __lsx_vadd_h(temp2, q2_src);
327 temp1 = __lsx_vadd_h(temp0, q2_src);
328 temp1 = __lsx_vsrari_h(temp1, 2);
329 temp2 = __lsx_vsub_h(temp1, q1_src);
330 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
331 dst4 = __lsx_vadd_h(temp2, q1_src);
333 temp1 = __lsx_vslli_h(temp0, 1);
334 DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
335 temp1 = __lsx_vsrari_h(temp1, 3);
336 temp2 = __lsx_vsub_h(temp1, q0_src);
337 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
338 dst3 = __lsx_vadd_h(temp2, q0_src);
340 q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
341 DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
342 q1_src, q_is_pcm_vec, dst3, dst4);
343 dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
346 DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
347 dst2 = __lsx_vpickev_b(dst5, dst4);
351 tc_pos = __lsx_vsrai_h(tc_pos, 1);
352 tc_neg = __lsx_vneg_h(tc_pos);
354 DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
356 DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
357 __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
358 delta0 = __lsx_vsub_h(diff0, diff1);
359 delta0 = __lsx_vsrari_h(delta0, 4);
360 temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
361 __lsx_vslli_h(tc_pos, 1));
362 abs_delta0 = __lsx_vadda_h(delta0, zero);
363 abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
364 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
366 delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
367 temp2 = __lsx_vadd_h(delta0, p0_src);
368 temp2 = __lsx_vclip255_h(temp2);
369 temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
371 temp2 = __lsx_vsub_h(q0_src, delta0);
372 temp2 = __lsx_vclip255_h(temp2);
373 temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
375 tmp = (beta + (beta >> 1)) >> 3;
378 cmp0 = __lsx_vpackev_d(cmp1, cmp0);
379 p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
382 cmp0 = __lsx_vpackev_d(cmp1, cmp0);
383 q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
385 tc_pos = __lsx_vsrai_h(tc_pos, 1);
386 tc_neg = __lsx_vneg_h(tc_pos);
388 DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
390 DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
392 delta1 = __lsx_vadd_h(delta1, delta0);
393 delta2 = __lsx_vsub_h(delta2, delta0);
394 DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
395 DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
396 tc_pos, delta1, delta2);
397 DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
399 DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
400 DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
401 q1_src, q_is_pcm_vec, delta1, delta2);
402 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
403 DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
404 q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
405 q0_src, abs_delta0, delta1, delta2, temp0, temp2);
409 DUP2_ARG2(__lsx_vpickev_b, delta1, p2_src, temp2, temp0,
411 dst5 = __lsx_vpickev_b(q2_src, delta2);
414 DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp2, dst1, dst4, cmp2,
416 dst2 = __lsx_vbitsel_v(dst2, dst5, cmp2);
419 DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
421 dst5 = __lsx_vpickev_b(q2_src, q1_src);
423 cmp3 = __lsx_vnor_v(cmp3, cmp3);
424 DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
426 dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);
428 __lsx_vstelm_d(dst0, p2, 0, 0);
429 __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
430 __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
431 __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
432 __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
433 __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
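/* The function below performs the same luma deblocking across a vertical
 * edge: eight rows of four pixels on each side of the column edge are loaded
 * and transposed with LSX_TRANSPOSE8x8_B so the row-oriented math above can
 * be reused; results are re-interleaved before the stores. */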
440 uint8_t *p_is_pcm, uint8_t *q_is_pcm)
442 ptrdiff_t stride_2x = (stride << 1);
443 ptrdiff_t stride_4x = (stride << 2);
444 ptrdiff_t stride_3x = stride_2x + stride;
446 uint8_t *p2 = src + stride_3x;
447 uint8_t *p1 = src + stride_4x;
448 uint8_t *p0 = src + stride_4x + stride_3x;
449 uint8_t flag0, flag1;
450 int32_t dp00, dq00, dp30, dq30, d00, d30;
452 int32_t dp04, dq04, dp34, dq34, d04, d34;
453 int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
454 int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
456 __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
457 __m128i cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
459 __m128i temp0, temp1;
461 __m128i tc_pos, tc_neg;
462 __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
464 __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
466 dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
467 dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
468 dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
469 dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);
472 p_is_pcm0 = p_is_pcm[0];
473 q_is_pcm0 = q_is_pcm[0];
475 dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
476 dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
477 dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
478 dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);
481 p_is_pcm4 = p_is_pcm[1];
482 q_is_pcm4 = q_is_pcm[1];
484 DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
485 p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
486 p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
488 d0030 = (d00 + d30) >= beta;
489 d0434 = (d04 + d34) >= beta;
491 DUP2_ARG1(__lsx_vreplgr2vr_d, d0030, d0434, cmp0, cmp1);
492 cmp3 = __lsx_vpackev_d(cmp1, cmp0);
493 cmp3 = __lsx_vseqi_d(cmp3, 0);
495 if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
496 (!d0030 || !d0434)) {
499 src + stride_3x, 0, p3_src, p2_src, p1_src, p0_src);
502 src + stride_3x, 0, q0_src, q1_src, q2_src, q3_src);
505 DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
506 q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
507 q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
512 tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
514 tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
515 DUP2_ARG1( __lsx_vreplgr2vr_h, tc0 << 1, tc4 << 1, cmp0, cmp1);
516 tc_pos = __lsx_vpackev_d(cmp1, cmp0);
517 LSX_TRANSPOSE8x8_B(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
518 q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
519 q0_src, q1_src, q2_src, q3_src);
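/* After the transpose, p3_src..q3_src hold the eight columns of the edge as
 * rows, so the per-line decisions below mirror the horizontal-edge path. */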
521 flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
522 abs(p3[-1] - p3[0]) < tc250;
523 flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
524 abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
525 (d30 << 1) < beta20);
526 cmp0 = __lsx_vreplgr2vr_d(flag0);
528 p0_src, p3_src, p2_src, p1_src, p0_src);
530 flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
531 abs(p1[-1] - p1[0]) < tc254;
532 flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
533 abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
534 (d34 << 1) < beta20);
536 q3_src, q0_src, q1_src, q2_src, q3_src);
538 cmp1 = __lsx_vreplgr2vr_d(flag1);
539 cmp2 = __lsx_vpackev_d(cmp1, cmp0);
540 cmp2 = __lsx_vseqi_d(cmp2, 0);
542 if (flag0 && flag1) {
544 tc_neg = __lsx_vneg_h(tc_pos);
546 DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
548 temp1 = __lsx_vadd_h(p3_src, p2_src);
549 temp1 = __lsx_vslli_h(temp1, 1);
550 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
551 temp1 = __lsx_vsrari_h(temp1, 3);
552 temp2 = __lsx_vsub_h(temp1, p2_src);
553 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
554 dst0 = __lsx_vadd_h(temp2, p2_src);
556 temp1 = __lsx_vadd_h(temp0, p2_src);
557 temp1 = __lsx_vsrari_h(temp1, 2);
558 temp2 = __lsx_vsub_h(temp1, p1_src);
559 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
560 dst1 = __lsx_vadd_h(temp2, p1_src);
562 temp1 = __lsx_vslli_h(temp0, 1);
563 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
564 temp1 = __lsx_vsrari_h(temp1, 3);
565 temp2 = __lsx_vsub_h(temp1, p0_src);
566 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
567 dst2 = __lsx_vadd_h(temp2, p0_src);
569 p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
570 DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
571 p_is_pcm_vec, dst0, dst1);
572 dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
575 DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
577 temp1 = __lsx_vadd_h(q3_src, q2_src);
578 temp1 = __lsx_vslli_h(temp1, 1);
579 DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
580 temp1 = __lsx_vsrari_h(temp1, 3);
581 temp2 = __lsx_vsub_h(temp1, q2_src);
582 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
583 dst5 = __lsx_vadd_h(temp2, q2_src);
585 temp1 = __lsx_vadd_h(temp0, q2_src);
586 temp1 = __lsx_vsrari_h(temp1, 2);
587 temp2 = __lsx_vsub_h(temp1, q1_src);
588 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
589 dst4 = __lsx_vadd_h(temp2, q1_src);
591 temp1 = __lsx_vslli_h(temp0, 1);
592 DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
593 temp1 = __lsx_vsrari_h(temp1, 3);
594 temp2 = __lsx_vsub_h(temp1, q0_src);
595 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
596 dst3 = __lsx_vadd_h(temp2, q0_src);
598 q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
599 DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
600 q_is_pcm_vec, dst3, dst4);
601 dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
603 } else if (flag0 == flag1) {
605 tc_pos = __lsx_vsrai_h(tc_pos, 1);
606 tc_neg = __lsx_vneg_h(tc_pos);
608 DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
610 DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
611 __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
612 delta0 = __lsx_vsub_h(diff0, diff1);
613 delta0 = __lsx_vsrari_h(delta0, 4);
614 temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
615 __lsx_vslli_h(tc_pos, 1));
616 abs_delta0 = __lsx_vadda_h(delta0, zero);
617 abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
618 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
620 delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
621 temp2 = __lsx_vadd_h(delta0, p0_src);
622 temp2 = __lsx_vclip255_h(temp2);
623 p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
624 temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
626 temp2 = __lsx_vsub_h(q0_src, delta0);
627 temp2 = __lsx_vclip255_h(temp2);
628 q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
629 temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
631 tmp = ((beta + (beta >> 1)) >> 3);
632 DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
633 !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
634 p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
635 p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
637 DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
638 (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
639 q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
640 q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
641 tc_pos = __lsx_vsrai_h(tc_pos, 1);
642 tc_neg = __lsx_vneg_h(tc_pos);
644 DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
646 DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
648 delta1 = __lsx_vadd_h(delta1, delta0);
649 delta2 = __lsx_vsub_h(delta2, delta0);
650 DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
651 DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
652 tc_pos, delta1, delta2);
653 DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
655 DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
656 DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
657 q1_src, q_is_pcm_vec, delta1, delta2);
659 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
660 DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
661 p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
662 q1_src, abs_delta0, dst0, dst1, dst2, dst3);
665 cmp3 = __lsx_vnor_v(cmp3, cmp3);
666 DUP4_ARG3(__lsx_vbitsel_v, dst0, p1_src, cmp3, dst1, p0_src,
667 cmp3, dst2, q0_src, cmp3, dst3, q1_src, cmp3,
668 dst0, dst1, dst2, dst3);
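/* Interleave the filtered p1/p0/q0/q1 results below back into per-row order
 * so each row can be written with a single 4-byte store. */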
669 DUP2_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst0, dst1);
672 dst4 = __lsx_vilvl_b(dst1, dst0);
673 dst5 = __lsx_vilvh_b(dst1, dst0);
674 dst0 = __lsx_vilvl_h(dst5, dst4);
675 dst1 = __lsx_vilvh_h(dst5, dst4);
678 __lsx_vstelm_w(dst0, src, 0, 0);
679 __lsx_vstelm_w(dst0, src + stride, 0, 1);
680 __lsx_vstelm_w(dst0, src + stride_2x, 0, 2);
681 __lsx_vstelm_w(dst0, src + stride_3x, 0, 3);
683 __lsx_vstelm_w(dst1, src, 0, 0);
684 __lsx_vstelm_w(dst1, src + stride, 0, 1);
685 __lsx_vstelm_w(dst1, src + stride_2x, 0, 2);
686 __lsx_vstelm_w(dst1, src + stride_3x, 0, 3);
690 tc_neg = __lsx_vneg_h(tc_pos);
693 DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
696 temp1 = __lsx_vadd_h(p3_src, p2_src);
697 temp1 = __lsx_vslli_h(temp1, 1);
698 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
699 temp1 = __lsx_vsrari_h(temp1, 3);
700 temp2 = __lsx_vsub_h(temp1, p2_src);
701 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
702 dst0 = __lsx_vadd_h(temp2, p2_src);
704 temp1 = __lsx_vadd_h(temp0, p2_src);
705 temp1 = __lsx_vsrari_h(temp1, 2);
706 temp2 = __lsx_vsub_h(temp1, p1_src);
707 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
708 dst1 = __lsx_vadd_h(temp2, p1_src);
710 temp1 = __lsx_vslli_h(temp0, 1);
711 DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
712 temp1 = __lsx_vsrari_h(temp1, 3);
713 temp2 = __lsx_vsub_h(temp1, p0_src);
714 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
715 dst2 = __lsx_vadd_h(temp2, p0_src);
717 p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
718 DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
719 p_is_pcm_vec, dst0, dst1);
720 dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
723 DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src, temp0, temp0);
724 temp1 = __lsx_vadd_h(q3_src, q2_src);
725 temp1 = __lsx_vslli_h(temp1, 1);
726 DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
727 temp1 = __lsx_vsrari_h(temp1, 3);
728 temp2 = __lsx_vsub_h(temp1, q2_src);
729 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
730 dst5 = __lsx_vadd_h(temp2, q2_src);
732 temp1 = __lsx_vadd_h(temp0, q2_src);
733 temp1 = __lsx_vsrari_h(temp1, 2);
734 temp2 = __lsx_vsub_h(temp1, q1_src);
735 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
736 dst4 = __lsx_vadd_h(temp2, q1_src);
738 temp1 = __lsx_vslli_h(temp0, 1);
739 DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
740 temp1 = __lsx_vsrari_h(temp1, 3);
741 temp2 = __lsx_vsub_h(temp1, q0_src);
742 temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
743 dst3 = __lsx_vadd_h(temp2, q0_src);
745 q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
746 DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
747 q_is_pcm_vec, dst3, dst4);
748 dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
752 tc_pos = __lsx_vsrai_h(tc_pos, 1);
753 tc_neg = __lsx_vneg_h(tc_pos);
755 DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
757 DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
758 __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
759 delta0 = __lsx_vsub_h(diff0, diff1);
760 delta0 = __lsx_vsrari_h(delta0, 4);
762 temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
763 __lsx_vslli_h(tc_pos, 1));
764 abs_delta0 = __lsx_vadda_h(delta0, zero);
765 abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
766 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
767 delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
768 temp2 = __lsx_vadd_h(delta0, p0_src);
769 temp2 = __lsx_vclip255_h(temp2);
770 temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
771 temp2 = __lsx_vsub_h(q0_src, delta0);
772 temp2 = __lsx_vclip255_h(temp2);
773 temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
775 tmp = (beta + (beta >> 1)) >> 3;
776 DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
777 !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
778 p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
779 p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
781 DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
782 (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
783 q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
784 q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
785 tc_pos = __lsx_vsrai_h(tc_pos, 1);
786 tc_neg = __lsx_vneg_h(tc_pos);
788 DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
790 DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
792 delta1 = __lsx_vadd_h(delta1, delta0);
793 delta2 = __lsx_vsub_h(delta2, delta0);
794 DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
795 DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
796 tc_pos, delta1, delta2);
797 DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
799 DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
800 DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
801 q1_src, q_is_pcm_vec, delta1, delta2);
803 abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
804 DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
805 q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
806 q0_src, abs_delta0, delta1, delta2, temp0, temp2);
810 DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp2, dst1, delta1,
811 cmp2, dst2, temp0, cmp2, dst3, temp2, cmp2,
812 dst0, dst1, dst2, dst3);
813 DUP2_ARG3(__lsx_vbitsel_v, dst4, delta2, cmp2, dst5, q2_src, cmp2,
817 cmp3 = __lsx_vnor_v(cmp3, cmp3);
818 DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp3, dst1, p1_src, cmp3, dst2,
819 p0_src, cmp3, dst3, q0_src, cmp3, dst0, dst1, dst2, dst3);
820 DUP2_ARG3(__lsx_vbitsel_v, dst4, q1_src, cmp3, dst5, q2_src, cmp3,
824 DUP4_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst4, dst4, dst5,
825 dst5, dst0, dst1, dst2, dst3);
828 DUP2_ARG2(__lsx_vilvl_b, dst1, dst0, dst3, dst2, dst4, dst6);
829 DUP2_ARG2(__lsx_vilvh_b, dst1, dst0, dst3, dst2, dst5, dst7);
830 DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst7, dst6, dst0, dst2);
831 DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst7, dst6, dst1, dst3);
834 __lsx_vstelm_w(dst0, src, 0, 0);
835 __lsx_vstelm_h(dst2, src, 4, 0);
837 __lsx_vstelm_w(dst0, src, 0, 1);
838 __lsx_vstelm_h(dst2, src, 4, 2);
841 __lsx_vstelm_w(dst0, src, 0, 2);
842 __lsx_vstelm_h(dst2, src, 4, 4);
844 __lsx_vstelm_w(dst0, src, 0, 3);
845 __lsx_vstelm_h(dst2, src, 4, 6);
848 __lsx_vstelm_w(dst1, src, 0, 0);
849 __lsx_vstelm_h(dst3, src, 4, 0);
851 __lsx_vstelm_w(dst1, src, 0, 1);
852 __lsx_vstelm_h(dst3, src, 4, 2);
855 __lsx_vstelm_w(dst1, src, 0, 2);
856 __lsx_vstelm_h(dst3, src, 4, 4);
858 __lsx_vstelm_w(dst1, src, 0, 3);
859 __lsx_vstelm_h(dst3, src, 4, 6);
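/* Chroma deblocking across a horizontal edge follows: only p1/p0/q0/q1 take
 * part, with delta = (((q0 - p0) << 2) + p1 - q1 + 4) >> 3 clipped to +/- tc
 * per the HEVC chroma filter. */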
869 uint8_t *q0_ptr = src;
871 __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
872 __m128i p1, p0, q0, q1;
873 __m128i tc_pos, tc_neg;
875 __m128i temp0, temp1, delta;
877 if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
879 tc_pos = __lsx_vpackev_d(cmp1, cmp0);
880 tc_neg = __lsx_vneg_h(tc_pos);
881 DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
882 p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
883 p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
885 DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
886 q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
887 q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
889 DUP4_ARG2(__lsx_vld, p1_ptr, 0, p0_ptr, 0, q0_ptr, 0, q1_ptr, 0,
894 temp0 = __lsx_vslli_h(temp0, 2);
895 temp0 = __lsx_vadd_h(temp0, temp1);
896 delta = __lsx_vsrari_h(temp0, 3);
898 temp0 = __lsx_vadd_h(p0, delta);
899 temp0 = __lsx_vclip255_h(temp0);
900 p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
901 temp0 = __lsx_vbitsel_v(temp0, p0, p_is_pcm_vec);
903 temp1 = __lsx_vsub_h(q0, delta);
904 temp1 = __lsx_vclip255_h(temp1);
905 q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
906 temp1 = __lsx_vbitsel_v(temp1, q0, q_is_pcm_vec);
908 tc_pos = __lsx_vslei_d(tc_pos, 0);
909 DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
911 temp0 = __lsx_vpickev_b(temp1, temp0);
912 __lsx_vstelm_d(temp0, p0_ptr, 0, 0);
913 __lsx_vstelm_d(temp0, p0_ptr + stride, 0, 1);
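/* Vertical-edge chroma variant: eight rows around the column edge are
 * loaded, transposed with LSX_TRANSPOSE8x4_B, filtered as above, and the two
 * filtered bytes per row are written back with 2-byte stores. */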
921 ptrdiff_t stride_2x = (stride << 1);
922 ptrdiff_t stride_4x = (stride << 2);
923 ptrdiff_t stride_3x = stride_2x + stride;
924 __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
926 __m128i p1, p0, q0, q1;
927 __m128i tc_pos, tc_neg;
929 __m128i temp0, temp1, delta;
931 if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
933 tc_pos = __lsx_vpackev_d(cmp1, cmp0);
934 tc_neg = __lsx_vneg_h(tc_pos);
936 DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
937 p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
938 p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
939 DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
940 q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
941 q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
948 src + stride_3x, 0, src4, src5, src6, src7);
950 LSX_TRANSPOSE8x4_B(src0, src1, src2, src3, src4, src5, src6, src7,
956 temp0 = __lsx_vslli_h(temp0, 2);
957 temp0 = __lsx_vadd_h(temp0, temp1);
958 delta = __lsx_vsrari_h(temp0, 3);
961 temp0 = __lsx_vadd_h(p0, delta);
962 temp1 = __lsx_vsub_h(q0, delta);
963 DUP2_ARG1(__lsx_vclip255_h, temp0, temp1, temp0, temp1);
964 DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
965 q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
966 DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, p_is_pcm_vec, temp1, q0,
967 q_is_pcm_vec, temp0, temp1);
969 tc_pos = __lsx_vslei_d(tc_pos, 0);
970 DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
972 temp0 = __lsx_vpackev_b(temp1, temp0);
975 __lsx_vstelm_h(temp0, src, 0, 0);
976 __lsx_vstelm_h(temp0, src + stride, 0, 1);
977 __lsx_vstelm_h(temp0, src + stride_2x, 0, 2);
978 __lsx_vstelm_h(temp0, src + stride_3x, 0, 3);
980 __lsx_vstelm_h(temp0, src, 0, 4);
981 __lsx_vstelm_h(temp0, src + stride, 0, 5);
982 __lsx_vstelm_h(temp0, src + stride_2x, 0, 6);
983 __lsx_vstelm_h(temp0, src + stride_3x, 0, 7);
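/* SAO edge-offset filters follow. For each pixel the two neighbours along
 * the chosen direction are compared; the resulting sign pattern indexes
 * edge_idx (values 0..4), which selects one of the five sao_offset values,
 * added to the pixel with saturation. */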
992 int16_t *sao_offset_val,
995 const int32_t src_stride_2x = (src_stride << 1);
996 const int32_t dst_stride_2x = (dst_stride << 1);
997 __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
998 __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
999 __m128i edge_idx = {0x403000201, 0x0};
1000 __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
1001 __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
1002 __m128i src_minus10, src_minus11, src_plus10, offset, src0, dst0;
1003 __m128i const1 = __lsx_vldi(1);
1006 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1010 DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);
1013 src += src_stride_2x;
1014 src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
1015 src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
1016 src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);
1019 cmp_minus10, cmp_minus11);
1020 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1021 cmp_minus11, diff_minus10, diff_minus11);
1023 cmp_minus10, cmp_minus11);
1024 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1025 cmp_minus11, cmp_minus10, cmp_minus11);
1026 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1027 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1029 offset = __lsx_vadd_b(diff_minus10, diff_minus11);
1034 src_minus10, src_minus11);
1039 dst0 = __lsx_vxori_b(dst0, 128);
1041 __lsx_vstelm_w(dst0, dst, 0, 0);
1042 __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1043 dst += dst_stride_2x;
1046 src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
1047 src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
1048 src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);
1052 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1053 diff_minus10, diff_minus11);
1056 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1057 cmp_minus10, cmp_minus11);
1058 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1059 const1, cmp_minus11, diff_minus10, diff_minus11);
1061 offset = __lsx_vadd_b(diff_minus10, diff_minus11);
1063 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, sao_offset,
1067 dst0 = __lsx_vxori_b(dst0, 128);
1069 __lsx_vstelm_w(dst0, dst, 0, 0);
1070 __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1077 int16_t *sao_offset_val,
1080 const int32_t src_stride_2x = (src_stride << 1);
1081 const int32_t dst_stride_2x = (dst_stride << 1);
1082 __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
1083 __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
1084 __m128i edge_idx = {0x403000201, 0x0};
1085 __m128i const1 = __lsx_vldi(1);
1086 __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
1087 __m128i src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
1088 __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
1089 __m128i zeros = {0};
1091 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1095 DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);
1098 src += src_stride_2x;
1099 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros,
1101 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros,
1102 src_minus11, shuf2, src_plus10, src_plus11);
1103 DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
1104 src_plus10, src_minus10, src_plus10);
1108 cmp_minus10, cmp_minus11);
1109 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1110 cmp_minus11, diff_minus10, diff_minus11);
1112 cmp_minus10, cmp_minus11);
1113 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1114 cmp_minus11, cmp_minus10, cmp_minus11);
1115 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1116 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1118 offset = __lsx_vadd_b(diff_minus10, diff_minus11);
1123 src_minus10, src_minus11);
1128 dst0 = __lsx_vxori_b(dst0, 128);
1130 __lsx_vstelm_d(dst0, dst, 0, 0);
1131 __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1132 dst += dst_stride_2x;
1135 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros, src_minus11,
1137 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
1138 shuf2, src_plus10, src_plus11);
1139 DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
1140 src_plus10, src_minus10, src_plus10);
1145 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1146 diff_minus10, diff_minus11);
1149 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1150 cmp_minus10, cmp_minus11);
1151 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1152 const1, cmp_minus11, diff_minus10, diff_minus11);
1154 offset = __lsx_vadd_b(diff_minus10, diff_minus11);
1160 dst0 = __lsx_vxori_b(dst0, 128);
1162 __lsx_vstelm_d(dst0, dst, 0, 0);
1163 __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
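/* Same horizontal-neighbour comparison, but below a full 16-byte vector is
 * processed per iteration for widths that are a multiple of 16; the loop
 * walks the row in 16-pixel columns, four rows at a time. */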
1170 int16_t *sao_offset_val,
1174 uint8_t *dst_ptr, *src_minus1;
1176 const int32_t src_stride_2x = (src_stride << 1);
1177 const int32_t dst_stride_2x = (dst_stride << 1);
1178 const int32_t src_stride_4x = (src_stride << 2);
1179 const int32_t dst_stride_4x = (dst_stride << 2);
1180 const int32_t src_stride_3x = src_stride_2x + src_stride;
1181 const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1183 __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
1184 __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
1185 __m128i edge_idx = {0x403000201, 0x0};
1186 __m128i const1 = __lsx_vldi(1);
1188 __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1189 __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1190 __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1191 __m128i diff_plus13;
1192 __m128i src10, src11, src12, src13, dst0, dst1, dst2, dst3;
1193 __m128i src_minus10, src_minus11, src_minus12, src_minus13;
1194 __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
1195 __m128i src_zero0, src_zero1, src_zero2, src_zero3;
1196 __m128i src_plus10, src_plus11, src_plus12, src_plus13;
1198 sao_offset = __lsx_vld(sao_offset_val, 0);
1199 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1202 src_minus1 = src - 1;
1203 src_minus10 = __lsx_vld(src_minus1, 0);
1204 DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
1205 src_stride_2x, src_minus11, src_minus12);
1206 src_minus13 = __lsx_vldx(src_minus1, src_stride_3x);
1208 for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
1210 dst_ptr = dst + v_cnt;
1211 src10 = __lsx_vld(src_minus1, 0);
1212 DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
1213 src_stride_2x, src11, src12);
1214 src13 = __lsx_vldx(src_minus1, src_stride_3x);
1215 DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf1, src11,
1216 src_minus11, shuf1, src12, src_minus12, shuf1, src13,
1217 src_minus13, shuf1, src_zero0, src_zero1,
1218 src_zero2, src_zero3);
1219 DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf2, src11,
1220 src_minus11, shuf2, src12, src_minus12, shuf2, src13,
1221 src_minus13, shuf2, src_plus10, src_plus11,
1222 src_plus12, src_plus13);
1223 DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
1224 src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
1225 cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
1226 DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
1227 src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
1228 cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
1229 DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1230 cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1231 cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
1233 DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1234 cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1235 cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
1237 DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
1238 src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
1239 cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
1240 DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
1241 src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
1242 cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
1243 DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1244 cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1245 cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
1247 DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1248 cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1249 cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
1251 DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1252 diff_plus10, const1, cmp_plus10, diff_minus11, const1,
1253 cmp_minus11, diff_plus11, const1, cmp_plus11,
1254 diff_minus10, diff_plus10, diff_minus11, diff_plus11);
1255 DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
1256 diff_plus12, const1, cmp_plus12, diff_minus13, const1,
1257 cmp_minus13, diff_plus13, const1, cmp_plus13,
1258 diff_minus12, diff_plus12, diff_minus13, diff_plus13);
1260 DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
1261 diff_plus11, diff_minus12, diff_plus12, diff_minus13,
1262 diff_plus13, offset_mask0, offset_mask1, offset_mask2,
1264 DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
1265 offset_mask2, 2, offset_mask3, 2, offset_mask0,
1266 offset_mask1, offset_mask2, offset_mask3);
1267 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
1268 sao_offset, sao_offset, offset_mask0, offset_mask0,
1270 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
1271 sao_offset, sao_offset, offset_mask1, offset_mask1,
1273 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
1274 sao_offset, sao_offset, offset_mask2, offset_mask2,
1276 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
1277 sao_offset, sao_offset, offset_mask3, offset_mask3,
1280 DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
1281 src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
1282 src_zero2, src_zero3);
1283 DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
1284 offset_mask1, src_zero2, offset_mask2, src_zero3,
1285 offset_mask3, dst0, dst1, dst2, dst3);
1286 DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
1287 128, dst0, dst1, dst2, dst3);
1289 src_minus10 = src10;
1290 src_minus11 = src11;
1291 src_minus12 = src12;
1292 src_minus13 = src13;
1294 __lsx_vst(dst0, dst_ptr, 0);
1295 __lsx_vst(dst1, dst_ptr + dst_stride, 0);
1296 __lsx_vst(dst2, dst_ptr + dst_stride_2x, 0);
1297 __lsx_vst(dst3, dst_ptr + dst_stride_3x, 0);
1299 src += src_stride_4x;
1300 dst += dst_stride_4x;
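/* The 90-degree variants below compare each pixel against the rows directly
 * above and below (src - src_stride / src + src_stride) instead of its
 * left/right neighbours. */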
1308 int16_t *sao_offset_val,
1311 const int32_t src_stride_2x = (src_stride << 1);
1312 const int32_t dst_stride_2x = (dst_stride << 1);
1313 __m128i edge_idx = {0x403000201, 0x0};
1314 __m128i const1 = __lsx_vldi(1);
1316 __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
1317 __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1318 __m128i src_minus10, src_minus11, src10, src11;
1319 __m128i src_zero0, src_zero1;
1321 __m128i offset_mask0, offset_mask1;
1323 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1327 src + src_stride_2x, 0, src_minus10, src_minus11, src10, src11);
1330 src += src_stride_2x;
1331 DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
1332 src11, src_minus11, src10, src10, src_minus10, src_zero0,
1333 src_minus11, src_zero1);
1334 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1335 cmp_minus10, cmp_minus11);
1336 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1337 cmp_minus11, diff_minus10, diff_minus11);
1338 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
1339 src_minus11, cmp_minus10, cmp_minus11);
1340 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1341 cmp_minus11, cmp_minus10, cmp_minus11);
1342 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1343 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1345 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1346 diff_minus11, offset_mask0, offset_mask1);
1347 DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
1348 offset_mask0, offset_mask1);
1349 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1350 src_zero0, offset, dst0);
1354 dst0 = __lsx_vxori_b(dst0, 128);
1355 dst0 = __lsx_vsadd_b(dst0, offset);
1356 dst0 = __lsx_vxori_b(dst0, 128);
1357 src_minus10 = src10;
1358 src_minus11 = src11;
1364 __lsx_vstelm_w(dst0, dst, 0, 0);
1365 __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1366 dst += dst_stride_2x;
1369 DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
1370 src11, src_minus11, src10, src10, src_minus10, src_zero0,
1371 src_minus11, src_zero1);
1372 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1373 cmp_minus10, cmp_minus11);
1374 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1375 diff_minus10, diff_minus11);
1376 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
1377 cmp_minus10, cmp_minus11);
1378 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1379 cmp_minus10, cmp_minus11);
1380 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1381 const1, cmp_minus11, diff_minus10, diff_minus11);
1383 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1384 diff_minus11, offset_mask0, offset_mask1);
1385 DUP2_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
1386 offset_mask0, offset_mask1);
1387 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1388 src_zero0, offset, dst0);
1391 dst0 = __lsx_vxori_b(dst0, 128);
1392 dst0 = __lsx_vsadd_b(dst0, offset);
1393 dst0 = __lsx_vxori_b(dst0, 128);
1395 __lsx_vstelm_w(dst0, dst, 0, 0);
1396 __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1403 int16_t *sao_offset_val,
1406 const int32_t src_stride_2x = (src_stride << 1);
1407 const int32_t dst_stride_2x = (dst_stride << 1);
1408 __m128i edge_idx = {0x403000201, 0x0};
1409 __m128i const1 = __lsx_vldi(1);
1410 __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
1411 __m128i src_zero0, src_zero1, dst0;
1412 __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1413 __m128i src_minus10, src_minus11, src10, src11;
1414 __m128i offset_mask0, offset_mask1;
1416 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1419 DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src_minus10, src_minus11);
1420 DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src10, src11);
1423 src += src_stride_2x;
1424 DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
1425 src11, src_minus11, src10, src10, src_minus10, src_zero0,
1426 src_minus11, src_zero1);
1427 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1428 cmp_minus10, cmp_minus11);
1429 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1430 cmp_minus11, diff_minus10, diff_minus11);
1431 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
1432 src_minus11, cmp_minus10, cmp_minus11);
1433 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1434 cmp_minus11, cmp_minus10, cmp_minus11);
1435 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1436 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1438 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1439 diff_minus11, offset_mask0, offset_mask1);
1440 DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
1441 offset_mask0, offset_mask1);
1442 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1443 src_zero0, offset, dst0);
1447 dst0 = __lsx_vxori_b(dst0, 128);
1448 dst0 = __lsx_vsadd_b(dst0, offset);
1449 dst0 = __lsx_vxori_b(dst0, 128);
1450 src_minus10 = src10;
1451 src_minus11 = src11;
1457 __lsx_vstelm_d(dst0, dst, 0, 0);
1458 __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1459 dst += dst_stride_2x;
1462 DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
1463 src11, src_minus11, src10, src10, src_minus10, src_zero0,
1464 src_minus11, src_zero1);
1465 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1466 cmp_minus10, cmp_minus11);
1467 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1468 diff_minus10, diff_minus11);
1469 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
1470 cmp_minus10, cmp_minus11);
1471 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1472 cmp_minus10, cmp_minus11);
1473 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1474 const1, cmp_minus11, diff_minus10, diff_minus11);
1476 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1477 diff_minus11, offset_mask0, offset_mask1);
1478 DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
1479 offset_mask0, offset_mask1);
1480 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1481 src_zero0, offset, dst0);
1484 dst0 = __lsx_vxori_b(dst0, 128);
1485 dst0 = __lsx_vsadd_b(dst0, offset);
1486 dst0 = __lsx_vxori_b(dst0, 128);
1488 __lsx_vstelm_d(dst0, dst, 0, 0);
1489 __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1501 uint8_t *src_orig = src;
1502 uint8_t *dst_orig = dst;
1504 const int32_t src_stride_2x = (src_stride << 1);
1505 const int32_t dst_stride_2x = (dst_stride << 1);
1506 const int32_t src_stride_4x = (src_stride << 2);
1507 const int32_t dst_stride_4x = (dst_stride << 2);
1508 const int32_t src_stride_3x = src_stride_2x + src_stride;
1509 const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1510 __m128i edge_idx = {0x403000201, 0x0};
1511 __m128i const1 = __lsx_vldi(1);
1512 __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1513 __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1514 __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1515 __m128i diff_plus13;
1516 __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
1517 __m128i src12, dst2, src13, dst3;
1518 __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
1520 sao_offset = __lsx_vld(sao_offset_val, 0);
1521 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1523 for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
1524 src = src_orig + v_cnt;
1525 dst = dst_orig + v_cnt;
1528 src_minus10, src_minus11);
1530 for (h_cnt = (height >> 2); h_cnt--;) {
1532 src, src_stride_3x, src, src_stride_4x,
1533 src10, src11, src12, src13);
1534 DUP4_ARG2(__lsx_vseq_b, src_minus11, src_minus10, src_minus11,
1535 src10, src10, src_minus11, src10, src11, cmp_minus10,
1536 cmp_plus10, cmp_minus11, cmp_plus11);
1537 DUP4_ARG2(__lsx_vseq_b, src11, src10, src11, src12, src12, src11,
1538 src12, src13, cmp_minus12, cmp_plus12,
1539 cmp_minus13, cmp_plus13);
1540 DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1541 cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1542 cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
1544 DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1545 cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1546 cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
1548 DUP4_ARG2(__lsx_vsle_bu, src_minus11, src_minus10, src_minus11,
1549 src10, src10, src_minus11, src10, src11, cmp_minus10,
1550 cmp_plus10, cmp_minus11, cmp_plus11);
1551 DUP4_ARG2(__lsx_vsle_bu, src11, src10, src11, src12, src12, src11,
1552 src12, src13, cmp_minus12, cmp_plus12, cmp_minus13,
1554 DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1555 cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1556 cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
1558 DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1559 cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1560 cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
1562 DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1563 diff_plus10, const1, cmp_plus10, diff_minus11, const1,
1564 cmp_minus11, diff_plus11, const1, cmp_plus11,
1565 diff_minus10, diff_plus10, diff_minus11, diff_plus11);
1566 DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
1567 diff_plus12, const1, cmp_plus12, diff_minus13, const1,
1568 cmp_minus13, diff_plus13, const1, cmp_plus13,
1569 diff_minus12, diff_plus12, diff_minus13, diff_plus13);
1571 DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
1572 diff_plus11, diff_minus12, diff_plus12, diff_minus13,
1573 diff_plus13, offset_mask0, offset_mask1, offset_mask2,
1575 DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
1576 offset_mask2, 2, offset_mask3, 2, offset_mask0,
1577 offset_mask1, offset_mask2, offset_mask3);
1578 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
1579 sao_offset, sao_offset, offset_mask0,
1580 offset_mask0, offset_mask0);
1581 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
1582 sao_offset, sao_offset, offset_mask1, offset_mask1,
1584 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
1585 sao_offset, sao_offset, offset_mask2, offset_mask2,
1587 DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
1588 sao_offset, sao_offset, offset_mask3, offset_mask3,
1591 src_minus10 = src12;
1592 DUP4_ARG2(__lsx_vxori_b, src_minus11, 128, src10, 128, src11, 128,
1593 src12, 128, src_minus11, src10, src11, src12);
1594 DUP4_ARG2(__lsx_vsadd_b, src_minus11, offset_mask0, src10,
1595 offset_mask1, src11, offset_mask2, src12,
1596 offset_mask3, dst0, dst1, dst2, dst3);
1597 DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
1598 128, dst0, dst1, dst2, dst3);
1599 src_minus11 = src13;
1601 __lsx_vst(dst0, dst, 0);
1602 __lsx_vstx(dst1, dst, dst_stride);
1603 __lsx_vstx(dst2, dst, dst_stride_2x);
1604 __lsx_vstx(dst3, dst, dst_stride_3x);
1605 src += src_stride_4x;
1606 dst += dst_stride_4x;
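/* The 45-degree variants below build the diagonal neighbours by shifting the
 * rows above and below with shuf1/shuf2 before running the same
 * sign-comparison and offset-selection logic. */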
1615 int16_t *sao_offset_val,
1619 const int32_t src_stride_2x = (src_stride << 1);
1620 const int32_t dst_stride_2x = (dst_stride << 1);
1621 __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
1622 __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
1623 __m128i edge_idx = {0x403000201, 0x0};
1624 __m128i const1 = __lsx_vldi(1);
1625 __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
1626 __m128i cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
1627 __m128i src_minus11, src10, src11;
1628 __m128i src_plus0, src_zero0, src_plus1, src_zero1, dst0;
1629 __m128i offset_mask0, offset_mask1;
1630 __m128i zeros = {0};
1632 sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1636 DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
1637 src_minus10, src_minus11);
1638 DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1642 src_orig += src_stride_2x;
1644 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
1645 shuf1, src_zero0, src_zero1);
1646 DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
1647 src_plus0, src_plus1);
1649 DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1,
1650 src_minus11, src_minus10, src_minus11);
1651 DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1,
1652 src_zero1, src_zero0, src_zero1);
1653 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1,
1654 src_minus11, cmp_minus10, cmp_minus11);
1655 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1656 cmp_minus11, diff_minus10, diff_minus11);
1657 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
1658 src_minus11, cmp_minus10, cmp_minus11);
1659 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1660 cmp_minus11, cmp_minus10, cmp_minus11);
1661 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1662 diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1664 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1665 diff_minus11, offset_mask0, offset_mask1);
1666 DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
1667 offset_mask0, offset_mask1);
1668 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1669 src_zero0, offset, dst0);
1672 dst0 = __lsx_vxori_b(dst0, 128);
1673 dst0 = __lsx_vsadd_b(dst0, offset);
1674 dst0 = __lsx_vxori_b(dst0, 128);
1676 src_minus10 = src10;
1677 src_minus11 = src11;
1680 DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1683 __lsx_vstelm_w(dst0, dst, 0, 0);
1684 __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1685 dst += dst_stride_2x;
1688 DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
1689 src_zero0, src_zero1);
1690 DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
1691 src_plus0, src_plus1);
1693 DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1, src_minus11,
1694 src_minus10, src_minus11);
1695 DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
1696 src_zero0, src_zero1);
1697 DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1698 cmp_minus10, cmp_minus11);
1699 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1700 diff_minus10, diff_minus11);
1701 DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
1702 cmp_minus10, cmp_minus11);
1703 DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1704 cmp_minus10, cmp_minus11);
1705 DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1706 const1, cmp_minus11, diff_minus10, diff_minus11);
1708 DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1709 diff_minus11, offset_mask0, offset_mask1);
1710 DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
1712 DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1713 src_zero0,
offset, dst0);
1716 dst0 = __lsx_vxori_b(dst0, 128);
1717 dst0 = __lsx_vsadd_b(dst0,
offset);
1718 dst0 = __lsx_vxori_b(dst0, 128);
1720 __lsx_vstelm_w(dst0, dst, 0, 0);
1721 __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
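/* The 8-pixel-wide variant below runs the same classification on eight
 * columns at a time and stores each row with a 64-bit element store. */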
int16_t *sao_offset_val,
const int32_t src_stride_2x = (src_stride << 1);
const int32_t dst_stride_2x = (dst_stride << 1);
__m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
__m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
__m128i edge_idx = {0x403000201, 0x0};
__m128i const1 = __lsx_vldi(1);
__m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
__m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
__m128i src_minus10, src10, src_minus11, src11;
__m128i src_zero0, src_plus10, src_zero1, src_plus11, dst0;
__m128i offset_mask0, offset_mask1;
__m128i zeros = {0};
sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0, src_minus10,
src_minus11);
DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
src10, src11);
src_orig += src_stride_2x;
DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
shuf1, src_zero0, src_zero1);
DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
src_plus10, src_plus11);
DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11,
src_minus11, src_minus10, src_minus11);
DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
src_zero0, src_zero1);
DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
cmp_minus10, cmp_minus11);
DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
cmp_minus11, diff_minus10, diff_minus11);
DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
src_minus11, cmp_minus10, cmp_minus11);
DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
cmp_minus11, cmp_minus10, cmp_minus11);
DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
diff_minus11, offset_mask0, offset_mask1);
DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
offset_mask0, offset_mask1);
DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
src_zero0, offset, dst0);
DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
sao_offset, offset, offset, offset);
dst0 = __lsx_vxori_b(dst0, 128);
dst0 = __lsx_vsadd_b(dst0, offset);
dst0 = __lsx_vxori_b(dst0, 128);
src_minus10 = src10;
src_minus11 = src11;
DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
src10, src11);
__lsx_vstelm_d(dst0, dst, 0, 0);
__lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
dst += dst_stride_2x;
DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
src_zero0, src_zero1);
DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
src_plus10, src_plus11);
DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11, src_minus11,
src_minus10, src_minus11);
DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
src_zero0, src_zero1);
DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
cmp_minus10, cmp_minus11);
DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
diff_minus10, diff_minus11);
DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
cmp_minus10, cmp_minus11);
DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
cmp_minus10, cmp_minus11);
DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
const1, cmp_minus11, diff_minus10, diff_minus11);
DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
diff_minus11, offset_mask0, offset_mask1);
DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
offset_mask1);
DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
src_zero0, offset, dst0);
DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
sao_offset, offset, offset, offset);
dst0 = __lsx_vxori_b(dst0, 128);
dst0 = __lsx_vsadd_b(dst0, offset);
dst0 = __lsx_vxori_b(dst0, 128);
src_minus10 = src10;
src_minus11 = src11;
DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
src10, src11);
__lsx_vstelm_d(dst0, dst, 0, 0);
__lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
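/* 16-pixel-multiple variant: four rows of sixteen samples are classified
 * per iteration and written back with full-vector stores. */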
uint8_t *src_orig = src;
uint8_t *dst_orig = dst;
const int32_t src_stride_2x = (src_stride << 1);
const int32_t dst_stride_2x = (dst_stride << 1);
const int32_t src_stride_4x = (src_stride << 2);
const int32_t dst_stride_4x = (dst_stride << 2);
const int32_t src_stride_3x = src_stride_2x + src_stride;
const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
__m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
__m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
__m128i edge_idx = {0x403000201, 0x0};
__m128i const1 = __lsx_vldi(1);
__m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
__m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
__m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
__m128i diff_plus13, src_minus14, src_plus13;
__m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
__m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
__m128i src12, src_minus12, dst2, src13, src_minus13, dst3;
__m128i src_zero0, src_plus10, src_zero1, src_plus11, src_zero2;
__m128i src_zero3, sao_offset, src_plus12;
sao_offset = __lsx_vld(sao_offset_val, 0);
sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
src_minus11 = __lsx_vld(src_orig, 0);
DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
src_minus12, src_minus13);
src_minus14 = __lsx_vldx(src_orig, src_stride_3x);
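/* Rows of the current 16-byte column are preloaded into src_minus11..14;
 * inside the column loop, vshuf_b combines them with the next column so
 * the diagonal neighbours that straddle the 16-byte boundary are available. */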
for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
src_minus10 = __lsx_vld(src_orig - src_stride, 0);
src10 = __lsx_vld(src_orig, 0);
DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig,
src_stride_2x, src11, src12);
src13 = __lsx_vldx(src_orig, src_stride_3x);
src_plus13 = __lsx_vld(src + v_cnt + src_stride_4x, 1);
DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
src_minus12, shuf1, src12, src_minus13, shuf1,
src13, src_minus14, shuf1, src_zero0, src_zero1,
src_zero2, src_zero3);
DUP2_ARG3(__lsx_vshuf_b, src11, src_minus12, shuf2, src12,
src_minus13, shuf2, src_plus10, src_plus11);
src_plus12 = __lsx_vshuf_b(src13, src_minus14, shuf2);
DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
src_plus10, src_zero1, src_minus11, src_zero1,
src_plus11, cmp_minus10, cmp_plus10,
cmp_minus11, cmp_plus11);
DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
src_plus12, src_zero3, src_minus13, src_zero3,
src_plus13, cmp_minus12, cmp_plus12,
cmp_minus13, cmp_plus13);
DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
diff_plus11);
DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
diff_plus13);
DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
src_plus10, src_zero1, src_minus11, src_zero1,
src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
cmp_plus11);
DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
src_plus12, src_zero3, src_minus13, src_zero3,
src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
cmp_plus13);
DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
cmp_plus11);
DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
cmp_plus13);
DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
diff_plus10, const1, cmp_plus10, diff_minus11, const1,
cmp_minus11, diff_plus11, const1, cmp_plus11,
diff_minus10, diff_plus10, diff_minus11, diff_plus11);
DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
diff_plus12, const1, cmp_plus12, diff_minus13, const1,
cmp_minus13, diff_plus13, const1, cmp_plus13,
diff_minus12, diff_plus12, diff_minus13, diff_plus13);
DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
diff_plus11, diff_minus12, diff_plus12, diff_minus13,
diff_plus13, offset_mask0, offset_mask1, offset_mask2,
offset_mask3);
DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
offset_mask2, 2, offset_mask3, 2, offset_mask0,
offset_mask1, offset_mask2, offset_mask3);
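/* offset_mask0..3 now hold the per-byte edge class (0..4); each is mapped
 * through edge_idx and then through the packed sao_offset table by the
 * vshuf_b pairs below, before the saturating add. */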
DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
sao_offset, sao_offset, offset_mask0, offset_mask0,
offset_mask0);
DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
sao_offset, sao_offset, offset_mask1, offset_mask1,
offset_mask1);
DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
sao_offset, sao_offset, offset_mask2, offset_mask2,
offset_mask2);
DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
sao_offset, sao_offset, offset_mask3, offset_mask3,
offset_mask3);
DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128, src_zero2,
128, src_zero3, 128, src_zero0, src_zero1, src_zero2,
src_zero3);
DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
offset_mask1, src_zero2, offset_mask2, src_zero3,
offset_mask3, dst0, dst1, dst2, dst3);
DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
128, dst0, dst1, dst2, dst3);
src_minus11 = src10;
src_minus12 = src11;
src_minus13 = src12;
src_minus14 = src13;
__lsx_vst(dst0, dst_orig, 0);
__lsx_vstx(dst1, dst_orig, dst_stride);
__lsx_vstx(dst2, dst_orig, dst_stride_2x);
__lsx_vstx(dst3, dst_orig, dst_stride_3x);
src += src_stride_4x;
dst += dst_stride_4x;
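/* The three variants below mirror the ones above for the opposite diagonal:
 * the only substantial difference is which neighbouring rows feed the
 * "minus"/"plus" comparisons built with shuf1/shuf2. */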
int16_t *sao_offset_val,
const int32_t src_stride_2x = (src_stride << 1);
const int32_t dst_stride_2x = (dst_stride << 1);
__m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
__m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
__m128i edge_idx = {0x403000201, 0x0};
__m128i const1 = __lsx_vldi(1);
__m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
__m128i src_zero0, src_zero1, dst0;
__m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
__m128i src_minus10, src10, src_minus11, src11;
__m128i offset_mask0, offset_mask1;
__m128i zeros = {0};
sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
src_minus10, src_minus11);
DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
src10, src11);
src_orig += src_stride_2x;
DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
shuf1, src_zero0, src_zero1);
DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
shuf2, src_minus10, src_minus11);
DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
src_minus10, src_minus11);
DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
src_zero0, src_zero1);
DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
cmp_minus10, cmp_minus11);
DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
cmp_minus11, diff_minus10, diff_minus11);
DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
src_minus11, cmp_minus10, cmp_minus11);
DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
cmp_minus11, cmp_minus10, cmp_minus11);
DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
diff_minus11, offset_mask0, offset_mask1);
DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
offset_mask0, offset_mask1);
DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
src_zero0, offset, dst0);
DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
sao_offset, offset, offset, offset);
dst0 = __lsx_vxori_b(dst0, 128);
dst0 = __lsx_vsadd_b(dst0, offset);
dst0 = __lsx_vxori_b(dst0, 128);
src_minus10 = src10;
src_minus11 = src11;
DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
src10, src11);
__lsx_vstelm_w(dst0, dst, 0, 0);
__lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
dst += dst_stride_2x;
DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
src_zero0, src_zero1);
DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
shuf2, src_minus10, src_minus11);
DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
src_minus10, src_minus11);
DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
src_zero0, src_zero1);
DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
cmp_minus10, cmp_minus11);
DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
cmp_minus11, diff_minus10, diff_minus11);
DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
cmp_minus10, cmp_minus11);
DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
cmp_minus10, cmp_minus11);
DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
const1, cmp_minus11, diff_minus10, diff_minus11);
DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
diff_minus11, offset_mask0, offset_mask1);
DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
offset_mask1);
DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
src_zero0, offset, dst0);
DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
sao_offset, offset, offset, offset);
dst0 = __lsx_vxori_b(dst0, 128);
dst0 = __lsx_vsadd_b(dst0, offset);
dst0 = __lsx_vxori_b(dst0, 128);
__lsx_vstelm_w(dst0, dst, 0, 0);
__lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
dst += dst_stride_2x;
int16_t *sao_offset_val,
const int32_t src_stride_2x = (src_stride << 1);
const int32_t dst_stride_2x = (dst_stride << 1);
__m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
__m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
__m128i edge_idx = {0x403000201, 0x0};
__m128i const1 = __lsx_vldi(1);
__m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
__m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
__m128i src_minus10, src10, src_minus11, src11;
__m128i src_zero0, src_zero1, dst0;
__m128i offset_mask0, offset_mask1;
__m128i zeros = {0};
sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
src_minus10, src_minus11);
DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
src10, src11);
src_orig += src_stride_2x;
DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
shuf1, src_zero0, src_zero1);
DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
shuf2, src_minus10, src_minus11);
DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
src_minus10, src_minus11);
DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
src_zero0, src_zero1);
DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
cmp_minus10, cmp_minus11);
DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
cmp_minus11, diff_minus10, diff_minus11);
DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
src_minus11, cmp_minus10, cmp_minus11);
DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
cmp_minus11, cmp_minus10, cmp_minus11);
DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
diff_minus11, offset_mask0, offset_mask1);
DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
offset_mask0, offset_mask1);
DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
src_zero0, offset, dst0);
DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
sao_offset, offset, offset, offset);
dst0 = __lsx_vxori_b(dst0, 128);
dst0 = __lsx_vsadd_b(dst0, offset);
dst0 = __lsx_vxori_b(dst0, 128);
src_minus10 = src10;
src_minus11 = src11;
DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
src10, src11);
__lsx_vstelm_d(dst0, dst, 0, 0);
__lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
dst += dst_stride_2x;
DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
src_zero0, src_zero1);
DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
shuf2, src_minus10, src_minus11);
DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
src_minus10, src_minus11);
DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
src_zero0, src_zero1);
DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
cmp_minus10, cmp_minus11);
DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
diff_minus10, diff_minus11);
DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
cmp_minus10, cmp_minus11);
DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
cmp_minus10, cmp_minus11);
DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
const1, cmp_minus11, diff_minus10, diff_minus11);
DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
diff_minus11, offset_mask0, offset_mask1);
DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
offset_mask1);
DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
src_zero0, offset, dst0);
DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
sao_offset, offset, offset, offset);
dst0 = __lsx_vxori_b(dst0, 128);
dst0 = __lsx_vsadd_b(dst0, offset);
dst0 = __lsx_vxori_b(dst0, 128);
__lsx_vstelm_d(dst0, dst, 0, 0);
__lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
int16_t *sao_offset_val,
uint8_t *src_orig, *dst_orig;
const int32_t src_stride_2x = (src_stride << 1);
const int32_t dst_stride_2x = (dst_stride << 1);
const int32_t src_stride_4x = (src_stride << 2);
const int32_t dst_stride_4x = (dst_stride << 2);
const int32_t src_stride_3x = src_stride_2x + src_stride;
const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
__m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
__m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
__m128i edge_idx = {0x403000201, 0x0};
__m128i const1 = __lsx_vldi(1);
__m128i dst0, dst1, dst2, dst3;
__m128i cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
__m128i cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
__m128i diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
__m128i diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
__m128i src_plus10, src_plus11, src_plus12, src_plus13;
__m128i src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
__m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
sao_offset = __lsx_vld(sao_offset_val, 0);
sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
src_minus11 = __lsx_vld(src_orig, 0);
DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
src_plus10, src_plus11);
src_plus12 = __lsx_vldx(src_orig, src_stride_3x);
for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
src_minus10 = __lsx_vld(src_orig - src_stride, 2);
src_plus13 = __lsx_vldx(src_orig, src_stride_4x);
src10 = __lsx_vld(src_orig, 0);
DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
src11, src12);
src13 = __lsx_vldx(src_orig, src_stride_3x);
DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
src_plus10, shuf1, src12, src_plus11, shuf1, src13,
src_plus12, shuf1, src_zero0, src_zero1, src_zero2,
src_zero3);
src_minus11 = __lsx_vshuf_b(src10, src_minus11, shuf2);
DUP2_ARG3(__lsx_vshuf_b, src11, src_plus10, shuf2, src12,
src_plus11, shuf2, src_minus12, src_minus13);
DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
src_plus10, src_zero1, src_minus11, src_zero1,
src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
cmp_plus11);
DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
src_plus12, src_zero3, src_minus13, src_zero3,
src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
cmp_plus13);
DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
diff_plus11);
DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
diff_plus13);
DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
cmp_plus11);
DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
cmp_plus13);
DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
diff_plus10, const1, cmp_plus10, diff_minus11, const1,
cmp_minus11, diff_plus11, const1, cmp_plus11,
diff_minus10, diff_plus10, diff_minus11, diff_plus11);
DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
diff_plus12, const1, cmp_plus12, diff_minus13, const1,
cmp_minus13, diff_plus13, const1, cmp_plus13,
diff_minus12, diff_plus12, diff_minus13, diff_plus13);
DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
diff_plus11, diff_minus12, diff_plus12, diff_minus13,
diff_plus13, offset_mask0, offset_mask1, offset_mask2,
offset_mask3);
DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
offset_mask2, 2, offset_mask3, 2, offset_mask0,
offset_mask1, offset_mask2, offset_mask3);
DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
sao_offset, sao_offset, offset_mask0, offset_mask0,
offset_mask0);
DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
sao_offset, sao_offset, offset_mask1, offset_mask1,
offset_mask1);
DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
sao_offset, sao_offset, offset_mask2, offset_mask2,
offset_mask2);
DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
sao_offset, sao_offset, offset_mask3, offset_mask3,
offset_mask3);
DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
src_zero2, src_zero3);
DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
offset_mask1, src_zero2, offset_mask2, src_zero3,
offset_mask3, dst0, dst1, dst2, dst3);
DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
128, dst0, dst1, dst2, dst3);
src_minus11 = src10;
__lsx_vst(dst0, dst_orig, 0);
__lsx_vstx(dst1, dst_orig, dst_stride);
__lsx_vstx(dst2, dst_orig, dst_stride_2x);
__lsx_vstx(dst3, dst_orig, dst_stride_3x);
src += src_stride_4x;
dst += dst_stride_4x;
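/* The remaining fragments appear to come from the width dispatcher: once
 * the 16-pixel-multiple kernel has covered the aligned part, dst is
 * advanced past it with "width & 0xFFFFFFF0" so a narrower kernel can
 * handle any remaining columns. */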
ptrdiff_t stride_dst,
int16_t *sao_offset_val,
dst += width & 0xFFFFFFF0;
dst += width & 0xFFFFFFF0;
dst += width & 0xFFFFFFF0;
dst += width & 0xFFFFFFF0;