FFmpeg
hevc_lpf_sao_lsx.c
1 /*
2  * Copyright (c) 2022 Loongson Technology Corporation Limited
3  * Contributed by Lu Wang <wanglu@loongson.cn>
4  * Hao Chen <chenhao@loongson.cn>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include "libavutil/loongarch/loongson_intrinsics.h"
24 #include "hevcdsp_lsx.h"
25 
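/*
 * HEVC luma deblocking, horizontal edge (vertical filtering direction).
 * One 8-sample edge segment is processed as two independent 4-sample
 * halves: tc[0]/tc[1], p_is_pcm[0..1] and q_is_pcm[0..1] carry the
 * per-half parameters. p0..p3 are the rows above the edge, q0..q3 the
 * rows below it.
 */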
26 void ff_hevc_loop_filter_luma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
27  int32_t beta, int32_t *tc,
28  uint8_t *p_is_pcm, uint8_t *q_is_pcm)
29 {
30  ptrdiff_t stride_2x = (stride << 1);
31  ptrdiff_t stride_4x = (stride << 2);
32  ptrdiff_t stride_3x = stride_2x + stride;
33  uint8_t *p3 = src - stride_4x;
34  uint8_t *p2 = src - stride_3x;
35  uint8_t *p1 = src - stride_2x;
36  uint8_t *p0 = src - stride;
37  uint8_t *q0 = src;
38  uint8_t *q1 = src + stride;
39  uint8_t *q2 = src + stride_2x;
40  uint8_t *q3 = src + stride_3x;
41  uint8_t flag0, flag1;
42  int32_t dp00, dq00, dp30, dq30, d00, d30, d0030, d0434;
43  int32_t dp04, dq04, dp34, dq34, d04, d34;
44  int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
45  int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
46 
47  __m128i dst0, dst1, dst2, dst3, dst4, dst5;
48  __m128i cmp0, cmp1, cmp2, cmp3, p_is_pcm_vec, q_is_pcm_vec;
49  __m128i temp0, temp1;
50  __m128i temp2, tc_pos, tc_neg;
51  __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
52  __m128i zero = {0};
53  __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
54 
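    /*
     * Second-difference activity per column: dp = |p2 - 2*p1 + p0| and
     * dq = |q2 - 2*q1 + q0| at columns 0 and 3 of each 4-sample half.
     * A half is deblocked at all only when its d0 + d3 sum is below beta.
     */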
55  dp00 = abs(p2[0] - (p1[0] << 1) + p0[0]);
56  dq00 = abs(q2[0] - (q1[0] << 1) + q0[0]);
57  dp30 = abs(p2[3] - (p1[3] << 1) + p0[3]);
58  dq30 = abs(q2[3] - (q1[3] << 1) + q0[3]);
59  d00 = dp00 + dq00;
60  d30 = dp30 + dq30;
61  dp04 = abs(p2[4] - (p1[4] << 1) + p0[4]);
62  dq04 = abs(q2[4] - (q1[4] << 1) + q0[4]);
63  dp34 = abs(p2[7] - (p1[7] << 1) + p0[7]);
64  dq34 = abs(q2[7] - (q1[7] << 1) + q0[7]);
65  d04 = dp04 + dq04;
66  d34 = dp34 + dq34;
67 
68  p_is_pcm0 = p_is_pcm[0];
69  p_is_pcm4 = p_is_pcm[1];
70  q_is_pcm0 = q_is_pcm[0];
71  q_is_pcm4 = q_is_pcm[1];
72 
73  DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
74  p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
75  p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
76  d0030 = (d00 + d30) >= beta;
77  d0434 = (d04 + d34) >= beta;
78  DUP2_ARG1(__lsx_vreplgr2vr_w, d0030, d0434, cmp0, cmp1);
79  cmp3 = __lsx_vpackev_w(cmp1, cmp0);
80  cmp3 = __lsx_vseqi_w(cmp3, 0);
81 
82  if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
83  (!d0030 || !d0434)) {
84  DUP4_ARG2(__lsx_vld, p3, 0, p2, 0, p1, 0, p0, 0,
85  p3_src, p2_src, p1_src, p0_src);
86  DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
87  q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
88  q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
89 
90  tc0 = tc[0];
91  beta30 = beta >> 3;
92  beta20 = beta >> 2;
93  tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
94  tc4 = tc[1];
95  tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
96 
97  DUP2_ARG1(__lsx_vreplgr2vr_h, tc0, tc4, cmp0, cmp1);
98  DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
99  p0_src, p3_src, p2_src, p1_src, p0_src);
100  DUP4_ARG2(__lsx_vld, q0, 0, q1, 0, q2, 0, q3, 0,
101  q0_src, q1_src, q2_src, q3_src);
102  flag0 = abs(p3[0] - p0[0]) + abs(q3[0] - q0[0]) < beta30 &&
103  abs(p0[0] - q0[0]) < tc250;
104  flag0 = flag0 && (abs(p3[3] - p0[3]) + abs(q3[3] - q0[3]) < beta30 &&
105  abs(p0[3] - q0[3]) < tc250 && (d00 << 1) < beta20 &&
106  (d30 << 1) < beta20);
107  tc_pos = __lsx_vpackev_d(cmp1, cmp0);
108  DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src,
109  zero, q3_src, q0_src, q1_src, q2_src, q3_src);
110 
111  flag1 = abs(p3[4] - p0[4]) + abs(q3[4] - q0[4]) < beta30 &&
112  abs(p0[4] - q0[4]) < tc254;
113  flag1 = flag1 && (abs(p3[7] - p0[7]) + abs(q3[7] - q0[7]) < beta30 &&
114  abs(p0[7] - q0[7]) < tc254 && (d04 << 1) < beta20 &&
115  (d34 << 1) < beta20);
116  DUP2_ARG1(__lsx_vreplgr2vr_w, flag0, flag1, cmp0, cmp1);
117  cmp2 = __lsx_vpackev_w(cmp1, cmp0);
118  cmp2 = __lsx_vseqi_w(cmp2, 0);
119 
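        /*
         * flag0/flag1 are the per-half strong-filter decisions (dSam in the
         * spec): |p3 - p0| + |q3 - q0| < beta >> 3,
         * |p0 - q0| < (5 * tc + 1) >> 1 and 2 * d < beta >> 2 at both outer
         * columns of the half. The three branches below cover strong-only,
         * normal-only and the mixed case.
         */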
120  if (flag0 && flag1) { /* strong only */
121  /* strong filter */
122  tc_pos = __lsx_vslli_h(tc_pos, 1);
123  tc_neg = __lsx_vneg_h(tc_pos);
124 
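            /*
             * Strong filter: p0' = (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4) >> 3,
             * p1' = (p2 + p1 + p0 + q0 + 2) >> 2,
             * p2' = (2*p3 + 3*p2 + p1 + p0 + q0 + 4) >> 3 (q side mirrored),
             * each change clamped to +-2*tc and suppressed for IPCM halves.
             */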
125  /* p part */
126  DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
127  temp0, temp0);
128  temp1 = __lsx_vadd_h(p3_src, p2_src);
129  temp1 = __lsx_vslli_h(temp1, 1);
130  DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
131  temp1 = __lsx_vsrari_h(temp1, 3);
132  temp2 = __lsx_vsub_h(temp1, p2_src);
133  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
134  dst0 = __lsx_vadd_h(temp2, p2_src);
135 
136  temp1 = __lsx_vadd_h(temp0, p2_src);
137  temp1 = __lsx_vsrari_h(temp1, 2);
138  temp2 = __lsx_vsub_h(temp1, p1_src);
139  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
140  dst1 = __lsx_vadd_h(temp2, p1_src);
141 
142  temp1 = __lsx_vslli_h(temp0, 1);
143  DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src,
144  temp1, temp1);
145  temp1 = __lsx_vsrari_h(temp1, 3);
146  temp2 = __lsx_vsub_h(temp1, p0_src);
147  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
148  dst2 = __lsx_vadd_h(temp2, p0_src);
149 
150  p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
151  DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
152  p1_src, p_is_pcm_vec, dst0, dst1);
153  dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
154 
155  /* q part */
156  DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
157  temp0, temp0);
158  temp1 = __lsx_vadd_h(q3_src, q2_src);
159  temp1 = __lsx_vslli_h(temp1, 1);
160  DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
161  temp1 = __lsx_vsrari_h(temp1, 3);
162  temp2 = __lsx_vsub_h(temp1, q2_src);
163  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
164  dst5 = __lsx_vadd_h(temp2, q2_src);
165 
166  temp1 = __lsx_vadd_h(temp0, q2_src);
167  temp1 = __lsx_vsrari_h(temp1, 2);
168  temp2 = __lsx_vsub_h(temp1, q1_src);
169  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
170  dst4 = __lsx_vadd_h(temp2, q1_src);
171 
172  temp0 = __lsx_vslli_h(temp0, 1);
173  DUP2_ARG2(__lsx_vadd_h, temp0, p1_src, temp1, q2_src,
174  temp1, temp1);
175  temp1 = __lsx_vsrari_h(temp1, 3);
176  temp2 = __lsx_vsub_h(temp1, q0_src);
177  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
178  dst3 = __lsx_vadd_h(temp2, q0_src);
179 
180  q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
181  DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
182  q1_src, q_is_pcm_vec, dst3, dst4);
183  dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
184 
185  /* pack results to 8 bit */
186  DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
187  dst2 = __lsx_vpickev_b(dst5, dst4);
188 
189  /* pack src to 8 bit */
190  DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
191  dst3, dst4);
192  dst5 = __lsx_vpickev_b(q2_src, q1_src);
193 
194  cmp3 = __lsx_vnor_v(cmp3, cmp3);
195  DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
196  dst0, dst1);
197  dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);
198 
199  __lsx_vstelm_d(dst0, p2, 0, 0);
200  __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
201  __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
202  __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
203  __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
204  __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
205  /* strong filter ends */
206  } else if (flag0 == flag1) { /* weak only */
207  /* weak filter */
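            /*
             * Normal (weak) filter: delta0 = (9*(q0 - p0) - 3*(q1 - p1) + 8) >> 4.
             * Samples are modified only where |delta0| < 10*tc; delta0 is then
             * clipped to +-tc, added to p0 and subtracted from q0.
             */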
208  tc_neg = __lsx_vneg_h(tc_pos);
209  DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
210  diff0, diff1);
211  DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
212  __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
213  delta0 = __lsx_vsub_h(diff0, diff1);
214  delta0 = __lsx_vsrari_h(delta0, 4);
215  temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
216  __lsx_vslli_h(tc_pos, 1));
217  abs_delta0 = __lsx_vadda_h(delta0, zero);
218  abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
219  abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
220 
221  delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
222  temp2 = __lsx_vadd_h(delta0, p0_src);
223  temp2 = __lsx_vclip255_h(temp2);
224  temp0 = __lsx_vbitsel_v(temp2, p0_src,
225  __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec));
226  temp2 = __lsx_vsub_h(q0_src, delta0);
227  temp2 = __lsx_vclip255_h(temp2);
228  temp2 = __lsx_vbitsel_v(temp2, q0_src, __lsx_vnor_v(q_is_pcm_vec,
229  q_is_pcm_vec));
230  DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
231  q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
232 
233  tmp = (beta + (beta >> 1)) >> 3;
234  DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp,
235  cmp0, cmp1);
236  cmp0 = __lsx_vpackev_d(cmp1, cmp0);
237  cmp0 = __lsx_vseqi_d(cmp0, 0);
238  p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, cmp0);
239 
240  DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp,
241  cmp0, cmp1);
242  cmp0 = __lsx_vpackev_d(cmp1, cmp0);
243  cmp0 = __lsx_vseqi_d(cmp0, 0);
244  q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, cmp0);
245  tc_pos = __lsx_vsrai_h(tc_pos, 1);
246  tc_neg = __lsx_vneg_h(tc_pos);
247 
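            /*
             * p1/q1 correction: clip3(-(tc >> 1), tc >> 1,
             * (((p2 + p0 + 1) >> 1) - p1 + delta0) >> 1), applied per side
             * only when the dp (resp. dq) sum of the half is below
             * (beta + (beta >> 1)) >> 3.
             */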
248  DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
249  delta1, delta2);
250  DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
251  delta1, delta2);
252  delta1 = __lsx_vadd_h(delta1, delta0);
253  delta2 = __lsx_vsub_h(delta2, delta0);
254  DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
255  DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2,
256  tc_neg, tc_pos, delta1, delta2);
257  DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
258  delta1, delta2);
259  DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
260  DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
261  q1_src, q_is_pcm_vec, delta1, delta2);
262 
263  abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
264  DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
265  p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
266  q1_src, abs_delta0, dst1, dst2, dst3, dst4);
267  /* pack results to 8 bit */
268  DUP2_ARG2(__lsx_vpickev_b, dst2, dst1, dst4, dst3, dst0, dst1);
269  /* pack src to 8 bit */
270  DUP2_ARG2(__lsx_vpickev_b, p0_src, p1_src, q1_src, q0_src,
271  dst2, dst3);
272  cmp3 = __lsx_vnor_v(cmp3, cmp3);
273  DUP2_ARG3(__lsx_vbitsel_v, dst0, dst2, cmp3, dst1, dst3, cmp3,
274  dst0, dst1);
275 
276  p2 += stride;
277  __lsx_vstelm_d(dst0, p2, 0, 0);
278  __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
279  __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
280  __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
281  /* weak filter ends */
282  } else { /* strong + weak */
283  /* strong filter */
284  tc_pos = __lsx_vslli_h(tc_pos, 1);
285  tc_neg = __lsx_vneg_h(tc_pos);
286 
287  /* p part */
288  DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
289  temp0, temp0);
290  temp1 = __lsx_vadd_h(p3_src, p2_src);
291  temp1 = __lsx_vslli_h(temp1, 1);
292  DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
293  temp1 = __lsx_vsrari_h(temp1, 3);
294  temp2 = __lsx_vsub_h(temp1, p2_src);
295  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
296  dst0 = __lsx_vadd_h(temp2, p2_src);
297 
298  temp1 = __lsx_vadd_h(temp0, p2_src);
299  temp1 = __lsx_vsrari_h(temp1, 2);
300  temp2 = __lsx_vsub_h(temp1, p1_src);
301  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
302  dst1 = __lsx_vadd_h(temp2, p1_src);
303 
304  temp1 = __lsx_vslli_h(temp0, 1);
305  DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
306  temp1 = __lsx_vsrari_h(temp1, 3);
307  temp2 = __lsx_vsub_h(temp1, p0_src);
308  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
309  dst2 = __lsx_vadd_h(temp2, p0_src);
310 
311  p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
312  DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1,
313  p1_src, p_is_pcm_vec, dst0, dst1);
314  dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
315 
316  /* q part */
317  DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
318  temp0, temp0);
319  temp1 = __lsx_vadd_h(q3_src, q2_src);
320  temp1 = __lsx_vslli_h(temp1, 1);
321  DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
322  temp1 = __lsx_vsrari_h(temp1, 3);
323  temp2 = __lsx_vsub_h(temp1, q2_src);
324  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
325  dst5 = __lsx_vadd_h(temp2, q2_src);
326 
327  temp1 = __lsx_vadd_h(temp0, q2_src);
328  temp1 = __lsx_vsrari_h(temp1, 2);
329  temp2 = __lsx_vsub_h(temp1, q1_src);
330  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
331  dst4 = __lsx_vadd_h(temp2, q1_src);
332 
333  temp1 = __lsx_vslli_h(temp0, 1);
334  DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
335  temp1 = __lsx_vsrari_h(temp1, 3);
336  temp2 = __lsx_vsub_h(temp1, q0_src);
337  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
338  dst3 = __lsx_vadd_h(temp2, q0_src);
339 
340  q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
341  DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4,
342  q1_src, q_is_pcm_vec, dst3, dst4);
343  dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
344 
345  /* pack strong results to 8 bit */
346  DUP2_ARG2(__lsx_vpickev_b, dst1, dst0, dst3, dst2, dst0, dst1);
347  dst2 = __lsx_vpickev_b(dst5, dst4);
348  /* strong filter ends */
349 
350  /* weak filter */
351  tc_pos = __lsx_vsrai_h(tc_pos, 1);
352  tc_neg = __lsx_vneg_h(tc_pos);
353 
354  DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
355  diff0, diff1);
356  DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
357  __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
358  delta0 = __lsx_vsub_h(diff0, diff1);
359  delta0 = __lsx_vsrari_h(delta0, 4);
360  temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
361  __lsx_vslli_h(tc_pos, 1));
362  abs_delta0 = __lsx_vadda_h(delta0, zero);
363  abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
364  abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
365 
366  delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
367  temp2 = __lsx_vadd_h(delta0, p0_src);
368  temp2 = __lsx_vclip255_h(temp2);
369  temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
370 
371  temp2 = __lsx_vsub_h(q0_src, delta0);
372  temp2 = __lsx_vclip255_h(temp2);
373  temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
374 
375  tmp = (beta + (beta >> 1)) >> 3;
376  DUP2_ARG1(__lsx_vreplgr2vr_d, dp00 + dp30 < tmp, dp04 + dp34 < tmp,
377  cmp0, cmp1);
378  cmp0 = __lsx_vpackev_d(cmp1, cmp0);
379  p_is_pcm_vec = __lsx_vor_v(p_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
380  DUP2_ARG1(__lsx_vreplgr2vr_d, dq00 + dq30 < tmp, dq04 + dq34 < tmp,
381  cmp0, cmp1);
382  cmp0 = __lsx_vpackev_d(cmp1, cmp0);
383  q_is_pcm_vec = __lsx_vor_v(q_is_pcm_vec, __lsx_vseqi_d(cmp0, 0));
384 
385  tc_pos = __lsx_vsrai_h(tc_pos, 1);
386  tc_neg = __lsx_vneg_h(tc_pos);
387 
388  DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
389  delta1, delta2);
390  DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
391  delta1, delta2);
392  delta1 = __lsx_vadd_h(delta1, delta0);
393  delta2 = __lsx_vsub_h(delta2, delta0);
394  DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
395  DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
396  tc_pos, delta1, delta2);
397  DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
398  delta1, delta2);
399  DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
400  DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
401  q1_src, q_is_pcm_vec, delta1, delta2);
402  abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
403  DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
404  q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
405  q0_src, abs_delta0, delta1, delta2, temp0, temp2);
406  /* weak filter ends */
407 
408  /* pack weak results to 8 bit */
409  DUP2_ARG2(__lsx_vpickev_b, delta1, p2_src, temp2, temp0,
410  dst3, dst4);
411  dst5 = __lsx_vpickev_b(q2_src, delta2);
412 
413  /* select between weak or strong */
414  DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp2, dst1, dst4, cmp2,
415  dst0, dst1);
416  dst2 = __lsx_vbitsel_v(dst2, dst5, cmp2);
417 
418  /* pack src to 8 bit */
419  DUP2_ARG2(__lsx_vpickev_b, p1_src, p2_src, q0_src, p0_src,
420  dst3, dst4);
421  dst5 = __lsx_vpickev_b(q2_src, q1_src);
422 
423  cmp3 = __lsx_vnor_v(cmp3, cmp3);
424  DUP2_ARG3(__lsx_vbitsel_v, dst0, dst3, cmp3, dst1, dst4, cmp3,
425  dst0, dst1);
426  dst2 = __lsx_vbitsel_v(dst2, dst5, cmp3);
427 
428  __lsx_vstelm_d(dst0, p2, 0, 0);
429  __lsx_vstelm_d(dst0, p2 + stride, 0, 1);
430  __lsx_vstelm_d(dst1, p2 + stride_2x, 0, 0);
431  __lsx_vstelm_d(dst1, p2 + stride_3x, 0, 1);
432  __lsx_vstelm_d(dst2, p2 + stride_4x, 0, 0);
433  __lsx_vstelm_d(dst2, p2 + stride_4x + stride, 0, 1);
434  }
435  }
436 }
437 
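/*
 * HEVC luma deblocking, vertical edge (horizontal filtering direction).
 * Same per-half decisions and filter equations as the horizontal variant;
 * the 8x8 block straddling the edge is transposed so the column-oriented
 * filter code can be reused, and the results are transposed back before
 * the partial stores.
 */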
438 void ff_hevc_loop_filter_luma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
439  int32_t beta, int32_t *tc,
440  uint8_t *p_is_pcm, uint8_t *q_is_pcm)
441 {
442  ptrdiff_t stride_2x = (stride << 1);
443  ptrdiff_t stride_4x = (stride << 2);
444  ptrdiff_t stride_3x = stride_2x + stride;
445  uint8_t *p3 = src;
446  uint8_t *p2 = src + stride_3x;
447  uint8_t *p1 = src + stride_4x;
448  uint8_t *p0 = src + stride_4x + stride_3x;
449  uint8_t flag0, flag1;
450  int32_t dp00, dq00, dp30, dq30, d00, d30;
451  int32_t d0030, d0434;
452  int32_t dp04, dq04, dp34, dq34, d04, d34;
453  int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
454  int32_t tc4, p_is_pcm4, q_is_pcm4, tc254, tmp;
455 
456  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
457  __m128i cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
458  __m128i cmp3;
459  __m128i temp0, temp1;
460  __m128i temp2;
461  __m128i tc_pos, tc_neg;
462  __m128i diff0, diff1, delta0, delta1, delta2, abs_delta0;
463  __m128i zero = {0};
464  __m128i p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
465 
466  dp00 = abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
467  dq00 = abs(p3[2] - (p3[1] << 1) + p3[0]);
468  dp30 = abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
469  dq30 = abs(p2[2] - (p2[1] << 1) + p2[0]);
470  d00 = dp00 + dq00;
471  d30 = dp30 + dq30;
472  p_is_pcm0 = p_is_pcm[0];
473  q_is_pcm0 = q_is_pcm[0];
474 
475  dp04 = abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
476  dq04 = abs(p1[2] - (p1[1] << 1) + p1[0]);
477  dp34 = abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
478  dq34 = abs(p0[2] - (p0[1] << 1) + p0[0]);
479  d04 = dp04 + dq04;
480  d34 = dp34 + dq34;
481  p_is_pcm4 = p_is_pcm[1];
482  q_is_pcm4 = q_is_pcm[1];
483 
484  DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm0, p_is_pcm4, cmp0, cmp1);
485  p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
486  p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
487 
488  d0030 = (d00 + d30) >= beta;
489  d0434 = (d04 + d34) >= beta;
490 
491  DUP2_ARG1(__lsx_vreplgr2vr_d, d0030, d0434, cmp0, cmp1);
492  cmp3 = __lsx_vpackev_d(cmp1, cmp0);
493  cmp3 = __lsx_vseqi_d(cmp3, 0);
494 
495  if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
496  (!d0030 || !d0434)) {
497  src -= 4;
498  DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
499  src + stride_3x, 0, p3_src, p2_src, p1_src, p0_src);
500  src += stride_4x;
501  DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
502  src + stride_3x, 0, q0_src, q1_src, q2_src, q3_src);
503  src -= stride_4x;
504 
505  DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm0, q_is_pcm4, cmp0, cmp1);
506  q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
507  q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
508 
509  tc0 = tc[0];
510  beta30 = beta >> 3;
511  beta20 = beta >> 2;
512  tc250 = (((tc0 << 2) + tc0 + 1) >> 1);
513  tc4 = tc[1];
514  tc254 = (((tc4 << 2) + tc4 + 1) >> 1);
515  DUP2_ARG1( __lsx_vreplgr2vr_h, tc0 << 1, tc4 << 1, cmp0, cmp1);
516  tc_pos = __lsx_vpackev_d(cmp1, cmp0);
517  LSX_TRANSPOSE8x8_B(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src,
518  q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
519  q0_src, q1_src, q2_src, q3_src);
520 
521  flag0 = abs(p3[-4] - p3[-1]) + abs(p3[3] - p3[0]) < beta30 &&
522  abs(p3[-1] - p3[0]) < tc250;
523  flag0 = flag0 && (abs(p2[-4] - p2[-1]) + abs(p2[3] - p2[0]) < beta30 &&
524  abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
525  (d30 << 1) < beta20);
526  cmp0 = __lsx_vreplgr2vr_d(flag0);
527  DUP4_ARG2(__lsx_vilvl_b, zero, p3_src, zero, p2_src, zero, p1_src, zero,
528  p0_src, p3_src, p2_src, p1_src, p0_src);
529 
530  flag1 = abs(p1[-4] - p1[-1]) + abs(p1[3] - p1[0]) < beta30 &&
531  abs(p1[-1] - p1[0]) < tc254;
532  flag1 = flag1 && (abs(p0[-4] - p0[-1]) + abs(p0[3] - p0[0]) < beta30 &&
533  abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
534  (d34 << 1) < beta20);
535  DUP4_ARG2(__lsx_vilvl_b, zero, q0_src, zero, q1_src, zero, q2_src, zero,
536  q3_src, q0_src, q1_src, q2_src, q3_src);
537 
538  cmp1 = __lsx_vreplgr2vr_d(flag1);
539  cmp2 = __lsx_vpackev_d(cmp1, cmp0);
540  cmp2 = __lsx_vseqi_d(cmp2, 0);
541 
542  if (flag0 && flag1) { /* strong only */
543  /* strong filter */
544  tc_neg = __lsx_vneg_h(tc_pos);
545  /* p part */
546  DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
547  temp0, temp0);
548  temp1 = __lsx_vadd_h(p3_src, p2_src);
549  temp1 = __lsx_vslli_h(temp1, 1);
550  DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
551  temp1 = __lsx_vsrari_h(temp1, 3);
552  temp2 = __lsx_vsub_h(temp1, p2_src);
553  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
554  dst0 = __lsx_vadd_h(temp2, p2_src);
555 
556  temp1 = __lsx_vadd_h(temp0, p2_src);
557  temp1 = __lsx_vsrari_h(temp1, 2);
558  temp2 = __lsx_vsub_h(temp1, p1_src);
559  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
560  dst1 = __lsx_vadd_h(temp2, p1_src);
561 
562  temp1 = __lsx_vslli_h(temp0, 1);
563  DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
564  temp1 = __lsx_vsrari_h(temp1, 3);
565  temp2 = __lsx_vsub_h(temp1, p0_src);
566  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
567  dst2 = __lsx_vadd_h(temp2, p0_src);
568 
569  p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
570  DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
571  p_is_pcm_vec, dst0, dst1);
572  dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
573 
574  /* q part */
575  DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src,
576  temp0, temp0);
577  temp1 = __lsx_vadd_h(q3_src, q2_src);
578  temp1 = __lsx_vslli_h(temp1, 1);
579  DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
580  temp1 = __lsx_vsrari_h(temp1, 3);
581  temp2 = __lsx_vsub_h(temp1, q2_src);
582  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
583  dst5 = __lsx_vadd_h(temp2, q2_src);
584 
585  temp1 = __lsx_vadd_h(temp0, q2_src);
586  temp1 = __lsx_vsrari_h(temp1, 2);
587  temp2 = __lsx_vsub_h(temp1, q1_src);
588  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
589  dst4 = __lsx_vadd_h(temp2, q1_src);
590 
591  temp1 = __lsx_vslli_h(temp0, 1);
592  DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
593  temp1 = __lsx_vsrari_h(temp1, 3);
594  temp2 = __lsx_vsub_h(temp1, q0_src);
595  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
596  dst3 = __lsx_vadd_h(temp2, q0_src);
597 
598  q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
599  DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
600  q_is_pcm_vec, dst3, dst4);
601  dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
602  /* strong filter ends */
603  } else if (flag0 == flag1) { /* weak only */
604  /* weak filter */
605  tc_pos = __lsx_vsrai_h(tc_pos, 1);
606  tc_neg = __lsx_vneg_h(tc_pos);
607 
608  DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
609  diff0, diff1);
610  DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
611  __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
612  delta0 = __lsx_vsub_h(diff0, diff1);
613  delta0 = __lsx_vsrari_h(delta0, 4);
614  temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
615  __lsx_vslli_h(tc_pos, 1));
616  abs_delta0 = __lsx_vadda_h(delta0, zero);
617  abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
618  abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
619 
620  delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
621  temp2 = __lsx_vadd_h(delta0, p0_src);
622  temp2 = __lsx_vclip255_h(temp2);
623  p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
624  temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
625 
626  temp2 = __lsx_vsub_h(q0_src, delta0);
627  temp2 = __lsx_vclip255_h(temp2);
628  q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
629  temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
630 
631  tmp = ((beta + (beta >> 1)) >> 3);
632  DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
633  !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
634  p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
635  p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
636 
637  DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
638  (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
639  q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
640  q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
641  tc_pos = __lsx_vsrai_h(tc_pos, 1);
642  tc_neg = __lsx_vneg_h(tc_pos);
643 
644  DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
645  delta1, delta2);
646  DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
647  delta1, delta2);
648  delta1 = __lsx_vadd_h(delta1, delta0);
649  delta2 = __lsx_vsub_h(delta2, delta0);
650  DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
651  DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
652  tc_pos, delta1, delta2);
653  DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
654  delta1, delta2);
655  DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
656  DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
657  q1_src, q_is_pcm_vec, delta1, delta2);
658 
659  abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
660  DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, temp0,
661  p0_src, abs_delta0, temp2, q0_src, abs_delta0, delta2,
662  q1_src, abs_delta0, dst0, dst1, dst2, dst3);
663  /* weak filter ends */
664 
665  cmp3 = __lsx_vnor_v(cmp3, cmp3);
666  DUP4_ARG3(__lsx_vbitsel_v, dst0, p1_src, cmp3, dst1, p0_src,
667  cmp3, dst2, q0_src, cmp3, dst3, q1_src, cmp3,
668  dst0, dst1, dst2, dst3);
669  DUP2_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst0, dst1);
670 
671  /* transpose */
672  dst4 = __lsx_vilvl_b(dst1, dst0);
673  dst5 = __lsx_vilvh_b(dst1, dst0);
674  dst0 = __lsx_vilvl_h(dst5, dst4);
675  dst1 = __lsx_vilvh_h(dst5, dst4);
676 
677  src += 2;
678  __lsx_vstelm_w(dst0, src, 0, 0);
679  __lsx_vstelm_w(dst0, src + stride, 0, 1);
680  __lsx_vstelm_w(dst0, src + stride_2x, 0, 2);
681  __lsx_vstelm_w(dst0, src + stride_3x, 0, 3);
682  src += stride_4x;
683  __lsx_vstelm_w(dst1, src, 0, 0);
684  __lsx_vstelm_w(dst1, src + stride, 0, 1);
685  __lsx_vstelm_w(dst1, src + stride_2x, 0, 2);
686  __lsx_vstelm_w(dst1, src + stride_3x, 0, 3);
687  return;
688  } else { /* strong + weak */
689  /* strong filter */
690  tc_neg = __lsx_vneg_h(tc_pos);
691 
692  /* p part */
693  DUP2_ARG2(__lsx_vadd_h, p1_src, p0_src, temp0, q0_src,
694  temp0, temp0);
695 
696  temp1 = __lsx_vadd_h(p3_src, p2_src);
697  temp1 = __lsx_vslli_h(temp1, 1);
698  DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, temp0, temp1, temp1);
699  temp1 = __lsx_vsrari_h(temp1, 3);
700  temp2 = __lsx_vsub_h(temp1, p2_src);
701  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
702  dst0 = __lsx_vadd_h(temp2, p2_src);
703 
704  temp1 = __lsx_vadd_h(temp0, p2_src);
705  temp1 = __lsx_vsrari_h(temp1, 2);
706  temp2 = __lsx_vsub_h(temp1, p1_src);
707  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
708  dst1 = __lsx_vadd_h(temp2, p1_src);
709 
710  temp1 = __lsx_vslli_h(temp0, 1);
711  DUP2_ARG2(__lsx_vadd_h, temp1, p2_src, temp1, q1_src, temp1, temp1);
712  temp1 = __lsx_vsrari_h(temp1, 3);
713  temp2 = __lsx_vsub_h(temp1, p0_src);
714  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
715  dst2 = __lsx_vadd_h(temp2, p0_src);
716 
717  p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
718  DUP2_ARG3(__lsx_vbitsel_v, dst0, p2_src, p_is_pcm_vec, dst1, p1_src,
719  p_is_pcm_vec, dst0, dst1);
720  dst2 = __lsx_vbitsel_v(dst2, p0_src, p_is_pcm_vec);
721 
722  /* q part */
723  DUP2_ARG2(__lsx_vadd_h, q1_src, p0_src, temp0, q0_src, temp0, temp0);
724  temp1 = __lsx_vadd_h(q3_src, q2_src);
725  temp1 = __lsx_vslli_h(temp1, 1);
726  DUP2_ARG2(__lsx_vadd_h, temp1, q2_src, temp1, temp0, temp1, temp1);
727  temp1 = __lsx_vsrari_h(temp1, 3);
728  temp2 = __lsx_vsub_h(temp1, q2_src);
729  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
730  dst5 = __lsx_vadd_h(temp2, q2_src);
731 
732  temp1 = __lsx_vadd_h(temp0, q2_src);
733  temp1 = __lsx_vsrari_h(temp1, 2);
734  temp2 = __lsx_vsub_h(temp1, q1_src);
735  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
736  dst4 = __lsx_vadd_h(temp2, q1_src);
737 
738  temp1 = __lsx_vslli_h(temp0, 1);
739  DUP2_ARG2(__lsx_vadd_h, temp1, p1_src, temp1, q2_src, temp1, temp1);
740  temp1 = __lsx_vsrari_h(temp1, 3);
741  temp2 = __lsx_vsub_h(temp1, q0_src);
742  temp2 = __lsx_vclip_h(temp2, tc_neg, tc_pos);
743  dst3 = __lsx_vadd_h(temp2, q0_src);
744 
745  q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
746  DUP2_ARG3(__lsx_vbitsel_v, dst3, q0_src, q_is_pcm_vec, dst4, q1_src,
747  q_is_pcm_vec, dst3, dst4);
748  dst5 = __lsx_vbitsel_v(dst5, q2_src, q_is_pcm_vec);
749  /* strong filter ends */
750 
751  /* weak filter */
752  tc_pos = __lsx_vsrai_h(tc_pos, 1);
753  tc_neg = __lsx_vneg_h(tc_pos);
754 
755  DUP2_ARG2(__lsx_vsub_h, q0_src, p0_src, q1_src, p1_src,
756  diff0, diff1);
757  DUP2_ARG2(__lsx_vadd_h, __lsx_vslli_h(diff0, 3), diff0,
758  __lsx_vslli_h(diff1, 1), diff1, diff0, diff1);
759  delta0 = __lsx_vsub_h(diff0, diff1);
760  delta0 = __lsx_vsrari_h(delta0, 4);
761 
762  temp1 = __lsx_vadd_h(__lsx_vslli_h(tc_pos, 3),
763  __lsx_vslli_h(tc_pos, 1));
764  abs_delta0 = __lsx_vadda_h(delta0, zero);
765  abs_delta0 = __lsx_vsle_hu(temp1, abs_delta0);
766  abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
767  delta0 = __lsx_vclip_h(delta0, tc_neg, tc_pos);
768  temp2 = __lsx_vadd_h(delta0, p0_src);
769  temp2 = __lsx_vclip255_h(temp2);
770  temp0 = __lsx_vbitsel_v(temp2, p0_src, p_is_pcm_vec);
771  temp2 = __lsx_vsub_h(q0_src, delta0);
772  temp2 = __lsx_vclip255_h(temp2);
773  temp2 = __lsx_vbitsel_v(temp2, q0_src, q_is_pcm_vec);
774 
775  tmp = (beta + (beta >> 1)) >> 3;
776  DUP2_ARG1(__lsx_vreplgr2vr_d, !p_is_pcm0 && ((dp00 + dp30) < tmp),
777  !p_is_pcm4 && ((dp04 + dp34) < tmp), cmp0, cmp1);
778  p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
779  p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
780 
781  DUP2_ARG1(__lsx_vreplgr2vr_h, (!q_is_pcm0) && (dq00 + dq30 < tmp),
782  (!q_is_pcm4) && (dq04 + dq34 < tmp), cmp0, cmp1);
783  q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
784  q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
785  tc_pos = __lsx_vsrai_h(tc_pos, 1);
786  tc_neg = __lsx_vneg_h(tc_pos);
787 
788  DUP2_ARG2(__lsx_vavgr_hu, p2_src, p0_src, q0_src, q2_src,
789  delta1, delta2);
790  DUP2_ARG2(__lsx_vsub_h, delta1, p1_src, delta2, q1_src,
791  delta1, delta2);
792  delta1 = __lsx_vadd_h(delta1, delta0);
793  delta2 = __lsx_vsub_h(delta2, delta0);
794  DUP2_ARG2(__lsx_vsrai_h, delta1, 1, delta2, 1, delta1, delta2);
795  DUP2_ARG3(__lsx_vclip_h, delta1, tc_neg, tc_pos, delta2, tc_neg,
796  tc_pos, delta1, delta2);
797  DUP2_ARG2(__lsx_vadd_h, p1_src, delta1, q1_src, delta2,
798  delta1, delta2);
799  DUP2_ARG1(__lsx_vclip255_h, delta1, delta2, delta1, delta2);
800  DUP2_ARG3(__lsx_vbitsel_v, delta1, p1_src, p_is_pcm_vec, delta2,
801  q1_src, q_is_pcm_vec, delta1, delta2);
802 
803  abs_delta0 = __lsx_vnor_v(abs_delta0, abs_delta0);
804  DUP4_ARG3(__lsx_vbitsel_v, delta1, p1_src, abs_delta0, delta2,
805  q1_src, abs_delta0, temp0, p0_src, abs_delta0, temp2,
806  q0_src, abs_delta0, delta1, delta2, temp0, temp2);
 807  /* weak filter ends */
808 
809  /* select between weak or strong */
810  DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp2, dst1, delta1,
811  cmp2, dst2, temp0, cmp2, dst3, temp2, cmp2,
812  dst0, dst1, dst2, dst3);
813  DUP2_ARG3(__lsx_vbitsel_v, dst4, delta2, cmp2, dst5, q2_src, cmp2,
814  dst4, dst5);
815  }
816 
817  cmp3 = __lsx_vnor_v(cmp3, cmp3);
818  DUP4_ARG3(__lsx_vbitsel_v, dst0, p2_src, cmp3, dst1, p1_src, cmp3, dst2,
819  p0_src, cmp3, dst3, q0_src, cmp3, dst0, dst1, dst2, dst3);
820  DUP2_ARG3(__lsx_vbitsel_v, dst4, q1_src, cmp3, dst5, q2_src, cmp3,
821  dst4, dst5);
822 
823  /* pack results to 8 bit */
824  DUP4_ARG2(__lsx_vpickev_b, dst2, dst0, dst3, dst1, dst4, dst4, dst5,
825  dst5, dst0, dst1, dst2, dst3);
826 
827  /* transpose */
828  DUP2_ARG2(__lsx_vilvl_b, dst1, dst0, dst3, dst2, dst4, dst6);
829  DUP2_ARG2(__lsx_vilvh_b, dst1, dst0, dst3, dst2, dst5, dst7);
830  DUP2_ARG2(__lsx_vilvl_h, dst5, dst4, dst7, dst6, dst0, dst2);
831  DUP2_ARG2(__lsx_vilvh_h, dst5, dst4, dst7, dst6, dst1, dst3);
832 
833  src += 1;
834  __lsx_vstelm_w(dst0, src, 0, 0);
835  __lsx_vstelm_h(dst2, src, 4, 0);
836  src += stride;
837  __lsx_vstelm_w(dst0, src, 0, 1);
838  __lsx_vstelm_h(dst2, src, 4, 2);
839  src += stride;
840 
841  __lsx_vstelm_w(dst0, src, 0, 2);
842  __lsx_vstelm_h(dst2, src, 4, 4);
843  src += stride;
844  __lsx_vstelm_w(dst0, src, 0, 3);
845  __lsx_vstelm_h(dst2, src, 4, 6);
846  src += stride;
847 
848  __lsx_vstelm_w(dst1, src, 0, 0);
849  __lsx_vstelm_h(dst3, src, 4, 0);
850  src += stride;
851  __lsx_vstelm_w(dst1, src, 0, 1);
852  __lsx_vstelm_h(dst3, src, 4, 2);
853  src += stride;
854 
855  __lsx_vstelm_w(dst1, src, 0, 2);
856  __lsx_vstelm_h(dst3, src, 4, 4);
857  src += stride;
858  __lsx_vstelm_w(dst1, src, 0, 3);
859  __lsx_vstelm_h(dst3, src, 4, 6);
860  }
861 }
862 
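/*
 * HEVC chroma deblocking, horizontal edge: only p1/p0/q0/q1 are read and
 * only p0/q0 are written, again as two 4-sample halves with their own tc
 * and IPCM flags.
 */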
 863 void ff_hevc_loop_filter_chroma_h_8_lsx(uint8_t *src, ptrdiff_t stride,
 864  int32_t *tc, uint8_t *p_is_pcm,
865  uint8_t *q_is_pcm)
866 {
867  uint8_t *p1_ptr = src - (stride << 1);
868  uint8_t *p0_ptr = src - stride;
869  uint8_t *q0_ptr = src;
870  uint8_t *q1_ptr = src + stride;
871  __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
872  __m128i p1, p0, q0, q1;
873  __m128i tc_pos, tc_neg;
874  __m128i zero = {0};
875  __m128i temp0, temp1, delta;
876 
877  if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
878  DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
879  tc_pos = __lsx_vpackev_d(cmp1, cmp0);
880  tc_neg = __lsx_vneg_h(tc_pos);
881  DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
882  p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
883  p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
884 
885  DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
886  q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
887  q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
888 
889  DUP4_ARG2(__lsx_vld, p1_ptr, 0, p0_ptr, 0, q0_ptr, 0, q1_ptr, 0,
890  p1, p0, q0, q1);
891  DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
892  p1, p0, q0, q1);
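        /*
         * Chroma filter: delta = clip3(-tc, tc,
         * (((q0 - p0) << 2) + p1 - q1 + 4) >> 3); p0 += delta, q0 -= delta.
         * The update is dropped for IPCM halves and for halves with tc <= 0.
         */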
893  DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
894  temp0 = __lsx_vslli_h(temp0, 2);
895  temp0 = __lsx_vadd_h(temp0, temp1);
896  delta = __lsx_vsrari_h(temp0, 3);
897  delta = __lsx_vclip_h(delta, tc_neg, tc_pos);
898  temp0 = __lsx_vadd_h(p0, delta);
899  temp0 = __lsx_vclip255_h(temp0);
900  p_is_pcm_vec = __lsx_vnor_v(p_is_pcm_vec, p_is_pcm_vec);
901  temp0 = __lsx_vbitsel_v(temp0, p0, p_is_pcm_vec);
902 
903  temp1 = __lsx_vsub_h(q0, delta);
904  temp1 = __lsx_vclip255_h(temp1);
905  q_is_pcm_vec = __lsx_vnor_v(q_is_pcm_vec, q_is_pcm_vec);
906  temp1 = __lsx_vbitsel_v(temp1, q0, q_is_pcm_vec);
907 
908  tc_pos = __lsx_vslei_d(tc_pos, 0);
909  DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
910  temp0, temp1);
911  temp0 = __lsx_vpickev_b(temp1, temp0);
912  __lsx_vstelm_d(temp0, p0_ptr, 0, 0);
913  __lsx_vstelm_d(temp0, p0_ptr + stride, 0, 1);
914  }
915 }
916 
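/*
 * HEVC chroma deblocking, vertical edge: the two columns on each side of
 * the edge are gathered with an 8x4 byte transpose, filtered with the same
 * delta formula as the horizontal case, and written back as one 16-bit
 * (p0', q0') pair per row.
 */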
 917 void ff_hevc_loop_filter_chroma_v_8_lsx(uint8_t *src, ptrdiff_t stride,
 918  int32_t *tc, uint8_t *p_is_pcm,
919  uint8_t *q_is_pcm)
920 {
921  ptrdiff_t stride_2x = (stride << 1);
922  ptrdiff_t stride_4x = (stride << 2);
923  ptrdiff_t stride_3x = stride_2x + stride;
924  __m128i cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
925  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
926  __m128i p1, p0, q0, q1;
927  __m128i tc_pos, tc_neg;
928  __m128i zero = {0};
929  __m128i temp0, temp1, delta;
930 
931  if (!(tc[0] <= 0) || !(tc[1] <= 0)) {
932  DUP2_ARG1(__lsx_vreplgr2vr_h, tc[0], tc[1], cmp0, cmp1);
933  tc_pos = __lsx_vpackev_d(cmp1, cmp0);
934  tc_neg = __lsx_vneg_h(tc_pos);
935 
936  DUP2_ARG1(__lsx_vreplgr2vr_d, p_is_pcm[0], p_is_pcm[1], cmp0, cmp1);
937  p_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
938  p_is_pcm_vec = __lsx_vseqi_d(p_is_pcm_vec, 0);
939  DUP2_ARG1(__lsx_vreplgr2vr_d, q_is_pcm[0], q_is_pcm[1], cmp0, cmp1);
940  q_is_pcm_vec = __lsx_vpackev_d(cmp1, cmp0);
941  q_is_pcm_vec = __lsx_vseqi_d(q_is_pcm_vec, 0);
942 
943  src -= 2;
944  DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
945  src + stride_3x, 0, src0, src1, src2, src3);
946  src += stride_4x;
947  DUP4_ARG2(__lsx_vld, src, 0, src + stride, 0, src + stride_2x, 0,
948  src + stride_3x, 0, src4, src5, src6, src7);
949  src -= stride_4x;
950  LSX_TRANSPOSE8x4_B(src0, src1, src2, src3, src4, src5, src6, src7,
951  p1, p0, q0, q1);
952  DUP4_ARG2(__lsx_vilvl_b, zero, p1, zero, p0, zero, q0, zero, q1,
953  p1, p0, q0, q1);
954 
955  DUP2_ARG2(__lsx_vsub_h, q0, p0, p1, q1, temp0, temp1);
956  temp0 = __lsx_vslli_h(temp0, 2);
957  temp0 = __lsx_vadd_h(temp0, temp1);
958  delta = __lsx_vsrari_h(temp0, 3);
959  delta = __lsx_vclip_h(delta, tc_neg, tc_pos);
960 
961  temp0 = __lsx_vadd_h(p0, delta);
962  temp1 = __lsx_vsub_h(q0, delta);
963  DUP2_ARG1(__lsx_vclip255_h, temp0, temp1, temp0, temp1);
964  DUP2_ARG2(__lsx_vnor_v, p_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec,
965  q_is_pcm_vec, p_is_pcm_vec, q_is_pcm_vec);
966  DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, p_is_pcm_vec, temp1, q0,
967  q_is_pcm_vec, temp0, temp1);
968 
969  tc_pos = __lsx_vslei_d(tc_pos, 0);
970  DUP2_ARG3(__lsx_vbitsel_v, temp0, p0, tc_pos, temp1, q0, tc_pos,
971  temp0, temp1);
972  temp0 = __lsx_vpackev_b(temp1, temp0);
973 
974  src += 1;
975  __lsx_vstelm_h(temp0, src, 0, 0);
976  __lsx_vstelm_h(temp0, src + stride, 0, 1);
977  __lsx_vstelm_h(temp0, src + stride_2x, 0, 2);
978  __lsx_vstelm_h(temp0, src + stride_3x, 0, 3);
979  src += stride_4x;
980  __lsx_vstelm_h(temp0, src, 0, 4);
981  __lsx_vstelm_h(temp0, src + stride, 0, 5);
982  __lsx_vstelm_h(temp0, src + stride_2x, 0, 6);
983  __lsx_vstelm_h(temp0, src + stride_3x, 0, 7);
984  src -= stride_4x;
985  }
986 }
987 
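/*
 * SAO edge-offset filters. For every sample c with neighbours a and b along
 * the chosen direction, edgeIdx = 2 + sign(c - a) + sign(c - b) is remapped
 * through {1, 2, 0, 3, 4} and selects one of the signalled offsets, which is
 * added with signed saturation (the XORs with 128 move the unsigned bytes
 * into signed range). This variant handles the 0-degree (horizontal)
 * direction for 4-pixel-wide blocks, two rows per iteration.
 */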
 988 static void hevc_sao_edge_filter_0degree_4width_lsx(uint8_t *dst,
 989  int32_t dst_stride,
990  uint8_t *src,
991  int32_t src_stride,
992  int16_t *sao_offset_val,
993  int32_t height)
994 {
995  const int32_t src_stride_2x = (src_stride << 1);
996  const int32_t dst_stride_2x = (dst_stride << 1);
997  __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
998  __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
999  __m128i edge_idx = {0x403000201, 0x0};
1000  __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
1001  __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
1002  __m128i src_minus10, src_minus11, src_plus10, offset, src0, dst0;
1003  __m128i const1 = __lsx_vldi(1);
1004  __m128i zero = {0};
1005 
1006  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1007  src -= 1;
1008 
1009  /* load in advance */
1010  DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);
1011 
1012  for (height -= 2; height; height -= 2) {
1013  src += src_stride_2x;
1014  src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
1015  src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
1016  src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);
1017 
1018  DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
1019  cmp_minus10, cmp_minus11);
1020  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1021  cmp_minus11, diff_minus10, diff_minus11);
1022  DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
1023  cmp_minus10, cmp_minus11);
1024  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1025  cmp_minus11, cmp_minus10, cmp_minus11);
1026  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1027  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1028 
1029  offset = __lsx_vadd_b(diff_minus10, diff_minus11);
1030  offset = __lsx_vaddi_bu(offset, 2);
1031 
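        /* offset now holds edgeIdx (0..4) per sample; below it is remapped
         * through edge_idx and replaced by the corresponding SAO offset
         * before the saturating add. */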
1032  /* load in advance */
1033  DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
1034  src_minus10, src_minus11);
1035  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset,
1036  sao_offset, sao_offset, offset, offset, offset);
1037  src0 = __lsx_vxori_b(src0, 128);
1038  dst0 = __lsx_vsadd_b(src0, offset);
1039  dst0 = __lsx_vxori_b(dst0, 128);
1040 
1041  __lsx_vstelm_w(dst0, dst, 0, 0);
1042  __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1043  dst += dst_stride_2x;
1044  }
1045 
1046  src_minus10 = __lsx_vpickev_d(src_minus11, src_minus10);
1047  src0 = __lsx_vshuf_b(zero, src_minus10, shuf1);
1048  src_plus10 = __lsx_vshuf_b(zero, src_minus10, shuf2);
1049 
1050  DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10,
1051  cmp_minus11);
1052  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1053  diff_minus10, diff_minus11);
1054  DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10,
1055  cmp_minus11);
1056  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1057  cmp_minus10, cmp_minus11);
1058  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1059  const1, cmp_minus11, diff_minus10, diff_minus11);
1060 
1061  offset = __lsx_vadd_b(diff_minus10, diff_minus11);
1062  offset = __lsx_vaddi_bu(offset, 2);
1063  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset, sao_offset,
1064  offset, offset, offset);
1065  src0 = __lsx_vxori_b(src0, 128);
1066  dst0 = __lsx_vsadd_b(src0, offset);
1067  dst0 = __lsx_vxori_b(dst0, 128);
1068 
1069  __lsx_vstelm_w(dst0, dst, 0, 0);
1070  __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1071 }
1072 
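/*
 * 0-degree SAO edge offset for 8-pixel-wide blocks: same classification as
 * the 4-width variant, with two 8-pixel rows packed into one vector per
 * iteration and stored as doublewords.
 */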
 1073 static void hevc_sao_edge_filter_0degree_8width_lsx(uint8_t *dst,
 1074  int32_t dst_stride,
1075  uint8_t *src,
1076  int32_t src_stride,
1077  int16_t *sao_offset_val,
1078  int32_t height)
1079 {
1080  const int32_t src_stride_2x = (src_stride << 1);
1081  const int32_t dst_stride_2x = (dst_stride << 1);
1082  __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
1083  __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
1084  __m128i edge_idx = {0x403000201, 0x0};
1085  __m128i const1 = __lsx_vldi(1);
1086  __m128i cmp_minus10, cmp_minus11, diff_minus10, diff_minus11;
1087  __m128i src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
1088  __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
1089  __m128i zeros = {0};
1090 
1091  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1092  src -= 1;
1093 
1094  /* load in advance */
1095  DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src_minus10, src_minus11);
1096 
1097  for (height -= 2; height; height -= 2) {
1098  src += src_stride_2x;
1099  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros,
1100  src_minus11, shuf1, src0, src1);
1101  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros,
1102  src_minus11, shuf2, src_plus10, src_plus11);
1103  DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
1104  src_plus10, src_minus10, src_plus10);
1105  src0 = __lsx_vpickev_d(src1, src0);
1106 
1107  DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10,
1108  cmp_minus10, cmp_minus11);
1109  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1110  cmp_minus11, diff_minus10, diff_minus11);
1111  DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10,
1112  cmp_minus10, cmp_minus11);
1113  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1114  cmp_minus11, cmp_minus10, cmp_minus11);
1115  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1116  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1117 
1118  offset = __lsx_vadd_b(diff_minus10, diff_minus11);
1119  offset = __lsx_vaddi_bu(offset, 2);
1120 
1121  /* load in advance */
1122  DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0,
1123  src_minus10, src_minus11);
1124  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1125  sao_offset, offset, offset, offset);
1126  src0 = __lsx_vxori_b(src0, 128);
1127  dst0 = __lsx_vsadd_b(src0, offset);
1128  dst0 = __lsx_vxori_b(dst0, 128);
1129 
1130  __lsx_vstelm_d(dst0, dst, 0, 0);
1131  __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1132  dst += dst_stride_2x;
1133  }
1134 
1135  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf1, zeros, src_minus11,
1136  shuf1, src0, src1);
1137  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
1138  shuf2, src_plus10, src_plus11);
1139  DUP2_ARG2(__lsx_vpickev_d, src_minus11, src_minus10, src_plus11,
1140  src_plus10, src_minus10, src_plus10);
1141  src0 = __lsx_vpickev_d(src1, src0);
1142 
1143  DUP2_ARG2(__lsx_vseq_b, src0, src_minus10, src0, src_plus10, cmp_minus10,
1144  cmp_minus11);
1145  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1146  diff_minus10, diff_minus11);
1147  DUP2_ARG2(__lsx_vsle_bu, src0, src_minus10, src0, src_plus10, cmp_minus10,
1148  cmp_minus11);
1149  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1150  cmp_minus10, cmp_minus11);
1151  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1152  const1, cmp_minus11, diff_minus10, diff_minus11);
1153 
1154  offset = __lsx_vadd_b(diff_minus10, diff_minus11);
1155  offset = __lsx_vaddi_bu(offset, 2);
1156  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1157  sao_offset, offset, offset, offset);
1158  src0 = __lsx_vxori_b(src0, 128);
1159  dst0 = __lsx_vsadd_b(src0, offset);
1160  dst0 = __lsx_vxori_b(dst0, 128);
1161 
1162  __lsx_vstelm_d(dst0, dst, 0, 0);
1163  __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1164 }
1165 
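/*
 * 0-degree SAO edge offset for widths that are a multiple of 16: four rows
 * are processed per outer iteration and 16 samples per inner iteration,
 * with full-vector stores.
 */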
 1166 static void hevc_sao_edge_filter_0degree_16multiple_lsx(uint8_t *dst,
 1167  int32_t dst_stride,
1168  uint8_t *src,
1169  int32_t src_stride,
1170  int16_t *sao_offset_val,
1171  int32_t width,
1172  int32_t height)
1173 {
1174  uint8_t *dst_ptr, *src_minus1;
1175  int32_t v_cnt;
1176  const int32_t src_stride_2x = (src_stride << 1);
1177  const int32_t dst_stride_2x = (dst_stride << 1);
1178  const int32_t src_stride_4x = (src_stride << 2);
1179  const int32_t dst_stride_4x = (dst_stride << 2);
1180  const int32_t src_stride_3x = src_stride_2x + src_stride;
1181  const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1182 
1183  __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
1184  __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
1185  __m128i edge_idx = {0x403000201, 0x0};
1186  __m128i const1 = __lsx_vldi(1);
1187  __m128i sao_offset;
1188  __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1189  __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1190  __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1191  __m128i diff_plus13;
1192  __m128i src10, src11, src12, src13, dst0, dst1, dst2, dst3;
1193  __m128i src_minus10, src_minus11, src_minus12, src_minus13;
1194  __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
1195  __m128i src_zero0, src_zero1, src_zero2, src_zero3;
1196  __m128i src_plus10, src_plus11, src_plus12, src_plus13;
1197 
1198  sao_offset = __lsx_vld(sao_offset_val, 0);
1199  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1200 
1201  for (; height; height -= 4) {
1202  src_minus1 = src - 1;
1203  src_minus10 = __lsx_vld(src_minus1, 0);
1204  DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
1205  src_stride_2x, src_minus11, src_minus12);
1206  src_minus13 = __lsx_vldx(src_minus1, src_stride_3x);
1207 
1208  for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
1209  src_minus1 += 16;
1210  dst_ptr = dst + v_cnt;
1211  src10 = __lsx_vld(src_minus1, 0);
1212  DUP2_ARG2(__lsx_vldx, src_minus1, src_stride, src_minus1,
1213  src_stride_2x, src11, src12);
1214  src13 = __lsx_vldx(src_minus1, src_stride_3x);
1215  DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf1, src11,
1216  src_minus11, shuf1, src12, src_minus12, shuf1, src13,
1217  src_minus13, shuf1, src_zero0, src_zero1,
1218  src_zero2, src_zero3);
1219  DUP4_ARG3(__lsx_vshuf_b, src10, src_minus10, shuf2, src11,
1220  src_minus11, shuf2, src12, src_minus12, shuf2, src13,
1221  src_minus13, shuf2, src_plus10, src_plus11,
1222  src_plus12, src_plus13);
1223  DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
1224  src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
1225  cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
1226  DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
1227  src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
1228  cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
1229  DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1230  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1231  cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
1232  diff_plus11);
1233  DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1234  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1235  cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
1236  diff_plus13);
1237  DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
1238  src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
1239  cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
1240  DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
1241  src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
1242  cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
1243  DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1244  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1245  cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
1246  cmp_plus11);
1247  DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1248  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1249  cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
1250  cmp_plus13);
1251  DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1252  diff_plus10, const1, cmp_plus10, diff_minus11, const1,
1253  cmp_minus11, diff_plus11, const1, cmp_plus11,
1254  diff_minus10, diff_plus10, diff_minus11, diff_plus11);
1255  DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
1256  diff_plus12, const1, cmp_plus12, diff_minus13, const1,
1257  cmp_minus13, diff_plus13, const1, cmp_plus13,
1258  diff_minus12, diff_plus12, diff_minus13, diff_plus13);
1259 
1260  DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
1261  diff_plus11, diff_minus12, diff_plus12, diff_minus13,
1262  diff_plus13, offset_mask0, offset_mask1, offset_mask2,
1263  offset_mask3);
1264  DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
1265  offset_mask2, 2, offset_mask3, 2, offset_mask0,
1266  offset_mask1, offset_mask2, offset_mask3);
1267  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
1268  sao_offset, sao_offset, offset_mask0, offset_mask0,
1269  offset_mask0);
1270  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
1271  sao_offset, sao_offset, offset_mask1, offset_mask1,
1272  offset_mask1);
1273  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
1274  sao_offset, sao_offset, offset_mask2, offset_mask2,
1275  offset_mask2);
1276  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
1277  sao_offset, sao_offset, offset_mask3, offset_mask3,
1278  offset_mask3);
1279 
1280  DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
1281  src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
1282  src_zero2, src_zero3);
1283  DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
1284  offset_mask1, src_zero2, offset_mask2, src_zero3,
1285  offset_mask3, dst0, dst1, dst2, dst3);
1286  DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
1287  128, dst0, dst1, dst2, dst3);
1288 
1289  src_minus10 = src10;
1290  src_minus11 = src11;
1291  src_minus12 = src12;
1292  src_minus13 = src13;
1293 
1294  __lsx_vst(dst0, dst_ptr, 0);
1295  __lsx_vst(dst1, dst_ptr + dst_stride, 0);
1296  __lsx_vst(dst2, dst_ptr + dst_stride_2x, 0);
1297  __lsx_vst(dst3, dst_ptr + dst_stride_3x, 0);
1298  }
1299  src += src_stride_4x;
1300  dst += dst_stride_4x;
1301  }
1302 }
1303 
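/*
 * 90-degree (vertical) SAO edge offset for 4-pixel-wide blocks: the
 * neighbours are the samples directly above and below, so each row is
 * classified against the rows at +-src_stride; two rows per iteration.
 */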
 1304 static void hevc_sao_edge_filter_90degree_4width_lsx(uint8_t *dst,
 1305  int32_t dst_stride,
1306  uint8_t *src,
1307  int32_t src_stride,
1308  int16_t *sao_offset_val,
1309  int32_t height)
1310 {
1311  const int32_t src_stride_2x = (src_stride << 1);
1312  const int32_t dst_stride_2x = (dst_stride << 1);
1313  __m128i edge_idx = {0x403000201, 0x0};
1314  __m128i const1 = __lsx_vldi(1);
1315  __m128i dst0;
1316  __m128i sao_offset = __lsx_vld(sao_offset_val, 0);
1317  __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1318  __m128i src_minus10, src_minus11, src10, src11;
1319  __m128i src_zero0, src_zero1;
1320  __m128i offset;
1321  __m128i offset_mask0, offset_mask1;
1322 
1323  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1324 
1325  /* load in advance */
1326  DUP4_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src + src_stride, 0,
1327  src + src_stride_2x, 0, src_minus10, src_minus11, src10, src11);
1328 
1329  for (height -= 2; height; height -= 2) {
1330  src += src_stride_2x;
1331  DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
1332  src11, src_minus11, src10, src10, src_minus10, src_zero0,
1333  src_minus11, src_zero1);
1334  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1335  cmp_minus10, cmp_minus11);
1336  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1337  cmp_minus11, diff_minus10, diff_minus11);
1338  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
1339  src_minus11, cmp_minus10, cmp_minus11);
1340  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1341  cmp_minus11, cmp_minus10, cmp_minus11);
1342  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1343  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1344 
1345  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1346  diff_minus11, offset_mask0, offset_mask1);
1347  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
1348  offset_mask0, offset_mask1);
1349  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1350  src_zero0, offset, dst0);
1351  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1352  sao_offset, offset, offset, offset);
1353 
1354  dst0 = __lsx_vxori_b(dst0, 128);
1355  dst0 = __lsx_vsadd_b(dst0, offset);
1356  dst0 = __lsx_vxori_b(dst0, 128);
1357  src_minus10 = src10;
1358  src_minus11 = src11;
1359 
1360  /* load in advance */
1361  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
1362  src10, src11);
1363 
1364  __lsx_vstelm_w(dst0, dst, 0, 0);
1365  __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1366  dst += dst_stride_2x;
1367  }
1368 
1369  DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
1370  src11, src_minus11, src10, src10, src_minus10, src_zero0,
1371  src_minus11, src_zero1);
1372  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1373  cmp_minus10, cmp_minus11);
1374  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1375  diff_minus10, diff_minus11);
1376  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
1377  cmp_minus10, cmp_minus11);
1378  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1379  cmp_minus10, cmp_minus11);
1380  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1381  const1, cmp_minus11, diff_minus10, diff_minus11);
1382 
1383  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1384  diff_minus11, offset_mask0, offset_mask1);
1385  DUP2_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
1386  offset_mask0, offset_mask1);
1387  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1388  src_zero0, offset, dst0);
1389  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1390  sao_offset, offset, offset, offset);
1391  dst0 = __lsx_vxori_b(dst0, 128);
1392  dst0 = __lsx_vsadd_b(dst0, offset);
1393  dst0 = __lsx_vxori_b(dst0, 128);
1394 
1395  __lsx_vstelm_w(dst0, dst, 0, 0);
1396  __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1397 }
1398 
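/*
 * 90-degree SAO edge offset for 8-pixel-wide blocks: same row-above /
 * row-below classification as the 4-width variant, with doubleword stores.
 */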
 1399 static void hevc_sao_edge_filter_90degree_8width_lsx(uint8_t *dst,
 1400  int32_t dst_stride,
1401  uint8_t *src,
1402  int32_t src_stride,
1403  int16_t *sao_offset_val,
1404  int32_t height)
1405 {
1406  const int32_t src_stride_2x = (src_stride << 1);
1407  const int32_t dst_stride_2x = (dst_stride << 1);
1408  __m128i edge_idx = {0x403000201, 0x0};
1409  __m128i const1 = __lsx_vldi(1);
1410  __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
1411  __m128i src_zero0, src_zero1, dst0;
1412  __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1413  __m128i src_minus10, src_minus11, src10, src11;
1414  __m128i offset_mask0, offset_mask1;
1415 
1416  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1417 
1418  /* load in advance */
1419  DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0, src_minus10, src_minus11);
1420  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src10, src11);
1421 
1422  for (height -= 2; height; height -= 2) {
1423  src += src_stride_2x;
1424  DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
1425  src11, src_minus11, src10, src10, src_minus10, src_zero0,
1426  src_minus11, src_zero1);
1427  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1428  cmp_minus10, cmp_minus11);
1429  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1430  cmp_minus11, diff_minus10, diff_minus11);
1431  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
1432  src_minus11, cmp_minus10, cmp_minus11);
1433  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1434  cmp_minus11, cmp_minus10, cmp_minus11);
1435  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1436  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1437 
1438  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1439  diff_minus11, offset_mask0, offset_mask1);
1440  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
1441  offset_mask0, offset_mask1);
1442  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1443  src_zero0, offset, dst0);
1444  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1445  sao_offset, offset, offset, offset);
1446 
1447  dst0 = __lsx_vxori_b(dst0, 128);
1448  dst0 = __lsx_vsadd_b(dst0, offset);
1449  dst0 = __lsx_vxori_b(dst0, 128);
1450  src_minus10 = src10;
1451  src_minus11 = src11;
1452 
1453  /* load in advance */
1454  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
1455  src10, src11);
1456 
1457  __lsx_vstelm_d(dst0, dst, 0, 0);
1458  __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1459  dst += dst_stride_2x;
1460  }
1461 
1462  DUP4_ARG2(__lsx_vilvl_b, src10, src_minus10, src_minus11, src_minus11,
1463  src11, src_minus11, src10, src10, src_minus10, src_zero0,
1464  src_minus11, src_zero1);
1465  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1466  cmp_minus10, cmp_minus11);
1467  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1468  diff_minus10, diff_minus11);
1469  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
1470  cmp_minus10, cmp_minus11);
1471  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1472  cmp_minus10, cmp_minus11);
1473  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1474  const1, cmp_minus11, diff_minus10, diff_minus11);
1475 
1476  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1477  diff_minus11, offset_mask0, offset_mask1);
1478  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
1479  offset_mask0, offset_mask1);
1480  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1481  src_zero0, offset, dst0);
1482  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1483  sao_offset, offset, offset, offset);
1484  dst0 = __lsx_vxori_b(dst0, 128);
1485  dst0 = __lsx_vsadd_b(dst0, offset);
1486  dst0 = __lsx_vxori_b(dst0, 128);
1487 
1488  __lsx_vstelm_d(dst0, dst, 0, 0);
1489  __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1490 }
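
The 4- and 8-pixel-wide kernels process two output rows per iteration by interleaving data rather than comparing each neighbour separately: __lsx_vilvl_b packs the above-neighbour and below-neighbour bytes into adjacent lanes while the centre row is duplicated byte-wise, so a single byte-wise comparison pass covers both neighbours and __lsx_vhaddw_hu_bu sums each adjacent pair. A scalar sketch of that layout, with illustrative names, follows.

#include <stdint.h>

/* Sketch of the interleaved layout used by the 4/8-wide kernels:
 * pair[2i] / pair[2i+1] hold the above/below neighbours of pixel i and
 * centre[2i] == centre[2i+1] holds the pixel itself, so one comparison
 * produces both neighbour signs and a pairwise horizontal add
 * (vhaddw_hu_bu) yields sign(cur-above) + sign(cur-below).
 * Both output arrays must hold 2*n bytes. */
static void build_interleaved(const uint8_t *above, const uint8_t *cur,
                              const uint8_t *below, int n,
                              uint8_t *pair, uint8_t *centre)
{
    for (int i = 0; i < n; i++) {
        pair[2 * i]       = above[i];
        pair[2 * i + 1]   = below[i];
        centre[2 * i]     = cur[i];
        centre[2 * i + 1] = cur[i];
    }
}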
1491 
1492 static void hevc_sao_edge_filter_90degree_16multiple_lsx(uint8_t *dst,
1493  int32_t dst_stride,
1494  uint8_t *src,
1495  int32_t src_stride,
1496  int16_t *
1497  sao_offset_val,
1498  int32_t width,
1499  int32_t height)
1500 {
1501  uint8_t *src_orig = src;
1502  uint8_t *dst_orig = dst;
1503  int32_t h_cnt, v_cnt;
1504  const int32_t src_stride_2x = (src_stride << 1);
1505  const int32_t dst_stride_2x = (dst_stride << 1);
1506  const int32_t src_stride_4x = (src_stride << 2);
1507  const int32_t dst_stride_4x = (dst_stride << 2);
1508  const int32_t src_stride_3x = src_stride_2x + src_stride;
1509  const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1510  __m128i edge_idx = {0x403000201, 0x0};
1511  __m128i const1 = __lsx_vldi(1);
1512  __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1513  __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1514  __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1515  __m128i diff_plus13;
1516  __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
1517  __m128i src12, dst2, src13, dst3;
1518  __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
1519 
1520  sao_offset = __lsx_vld(sao_offset_val, 0);
1521  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1522 
1523  for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
1524  src = src_orig + v_cnt;
1525  dst = dst_orig + v_cnt;
1526 
1527  DUP2_ARG2(__lsx_vld, src - src_stride, 0, src, 0,
1528  src_minus10, src_minus11);
1529 
1530  for (h_cnt = (height >> 2); h_cnt--;) {
1531  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,
1532  src, src_stride_3x, src, src_stride_4x,
1533  src10, src11, src12, src13);
1534  DUP4_ARG2(__lsx_vseq_b, src_minus11, src_minus10, src_minus11,
1535  src10, src10, src_minus11, src10, src11, cmp_minus10,
1536  cmp_plus10, cmp_minus11, cmp_plus11);
1537  DUP4_ARG2(__lsx_vseq_b, src11, src10, src11, src12, src12, src11,
1538  src12, src13, cmp_minus12, cmp_plus12,
1539  cmp_minus13, cmp_plus13);
1540  DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1541  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1542  cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
1543  diff_plus11);
1544  DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1545  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1546  cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
1547  diff_plus13);
1548  DUP4_ARG2(__lsx_vsle_bu, src_minus11, src_minus10, src_minus11,
1549  src10, src10, src_minus11, src10, src11, cmp_minus10,
1550  cmp_plus10, cmp_minus11, cmp_plus11);
1551  DUP4_ARG2(__lsx_vsle_bu, src11, src10, src11, src12, src12, src11,
1552  src12, src13, cmp_minus12, cmp_plus12, cmp_minus13,
1553  cmp_plus13);
1554  DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1555  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1556  cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
1557  cmp_plus11);
1558  DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1559  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1560  cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
1561  cmp_plus13);
1562  DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1563  diff_plus10, const1, cmp_plus10, diff_minus11, const1,
1564  cmp_minus11, diff_plus11, const1, cmp_plus11,
1565  diff_minus10, diff_plus10, diff_minus11, diff_plus11);
1566  DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
1567  diff_plus12, const1, cmp_plus12, diff_minus13, const1,
1568  cmp_minus13, diff_plus13, const1, cmp_plus13,
1569  diff_minus12, diff_plus12, diff_minus13, diff_plus13);
1570 
1571  DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
1572  diff_plus11, diff_minus12, diff_plus12, diff_minus13,
1573  diff_plus13, offset_mask0, offset_mask1, offset_mask2,
1574  offset_mask3);
1575  DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
1576  offset_mask2, 2, offset_mask3, 2, offset_mask0,
1577  offset_mask1, offset_mask2, offset_mask3);
1578  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
1579  sao_offset, sao_offset, offset_mask0,
1580  offset_mask0, offset_mask0);
1581  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
1582  sao_offset, sao_offset, offset_mask1, offset_mask1,
1583  offset_mask1);
1584  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
1585  sao_offset, sao_offset, offset_mask2, offset_mask2,
1586  offset_mask2);
1587  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
1588  sao_offset, sao_offset, offset_mask3, offset_mask3,
1589  offset_mask3);
1590 
1591  src_minus10 = src12;
1592  DUP4_ARG2(__lsx_vxori_b, src_minus11, 128, src10, 128, src11, 128,
1593  src12, 128, src_minus11, src10, src11, src12);
1594  DUP4_ARG2(__lsx_vsadd_b, src_minus11, offset_mask0, src10,
1595  offset_mask1, src11, offset_mask2, src12,
1596  offset_mask3, dst0, dst1, dst2, dst3);
1597  DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
1598  128, dst0, dst1, dst2, dst3);
1599  src_minus11 = src13;
1600 
1601  __lsx_vst(dst0, dst, 0);
1602  __lsx_vstx(dst1, dst, dst_stride);
1603  __lsx_vstx(dst2, dst, dst_stride_2x);
1604  __lsx_vstx(dst3, dst, dst_stride_3x);
1605  src += src_stride_4x;
1606  dst += dst_stride_4x;
1607  }
1608  }
1609 }
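
Across all of these kernels, the vseq_b/vnor_v pair marks lanes where the pixel differs from its neighbour, the vsle_bu/vnor_v pair marks lanes where it is strictly greater, and vbitsel_v with const1 merges the two into a per-byte sign of +1, 0 or 0xFF (-1). Adding the two neighbour signs plus a bias of 2 then lands in 0..4 in each byte, because the wrap modulo 256 makes the 0xFF lanes behave as -1; this is also why the halfword sums from vhaddw_hu_bu only need their low byte, which vpickev_b extracts. A scalar sketch with illustrative names:

#include <stdint.h>

/* Per-lane sign encoding produced by the vseq_b/vsle_bu/vnor_v/vbitsel_v
 * sequences (sketch; one byte lane shown). */
static uint8_t lane_sign(uint8_t cur, uint8_t neigh)
{
    uint8_t not_equal = (cur != neigh) ? 0xFF : 0x00;  /* vseq_b  + vnor_v */
    uint8_t greater   = (cur >  neigh) ? 0xFF : 0x00;  /* vsle_bu + vnor_v */
    /* vbitsel_v(not_equal, const1, greater): 1 where greater, else 0/0xFF */
    return greater ? 0x01 : not_equal;
}

static uint8_t lane_category(uint8_t above, uint8_t cur, uint8_t below)
{
    /* 0xFF acts as -1 modulo 256, so the low byte of the sum is
     * sign(cur-above) + sign(cur-below) + 2, i.e. a value in 0..4. */
    return (uint8_t)(lane_sign(cur, above) + lane_sign(cur, below) + 2);
}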
1610 
1611 static void hevc_sao_edge_filter_45degree_4width_lsx(uint8_t *dst,
1612  int32_t dst_stride,
1613  uint8_t *src,
1614  int32_t src_stride,
1615  int16_t *sao_offset_val,
1616  int32_t height)
1617 {
1618  uint8_t *src_orig;
1619  const int32_t src_stride_2x = (src_stride << 1);
1620  const int32_t dst_stride_2x = (dst_stride << 1);
1621  __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
1622  __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
1623  __m128i edge_idx = {0x403000201, 0x0};
1624  __m128i const1 = __lsx_vldi(1);
1625  __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
1626  __m128i cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
1627  __m128i src_minus11, src10, src11;
1628  __m128i src_plus0, src_zero0, src_plus1, src_zero1, dst0;
1629  __m128i offset_mask0, offset_mask1;
1630  __m128i zeros = {0};
1631 
1632  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1633  src_orig = src - 1;
1634 
1635  /* load in advance */
1636  DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
1637  src_minus10, src_minus11);
1638  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1639  src10, src11);
1640 
1641  for (height -= 2; height; height -= 2) {
1642  src_orig += src_stride_2x;
1643 
1644  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
1645  shuf1, src_zero0, src_zero1);
1646  DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
1647  src_plus0, src_plus1);
1648 
1649  DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1,
1650  src_minus11, src_minus10, src_minus11);
1651  DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1,
1652  src_zero1, src_zero0, src_zero1);
1653  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1,
1654  src_minus11, cmp_minus10, cmp_minus11);
1655  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1656  cmp_minus11, diff_minus10, diff_minus11);
1657  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
1658  src_minus11, cmp_minus10, cmp_minus11);
1659  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1660  cmp_minus11, cmp_minus10, cmp_minus11);
1661  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1662  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1663 
1664  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1665  diff_minus11, offset_mask0, offset_mask1);
1666  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
1667  offset_mask0, offset_mask1);
1668  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1669  src_zero0, offset, dst0);
1670  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1671  sao_offset, offset, offset, offset);
1672  dst0 = __lsx_vxori_b(dst0, 128);
1673  dst0 = __lsx_vsadd_b(dst0, offset);
1674  dst0 = __lsx_vxori_b(dst0, 128);
1675 
1676  src_minus10 = src10;
1677  src_minus11 = src11;
1678 
1679  /* load in advance */
1680  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1681  src10, src11);
1682 
1683  __lsx_vstelm_w(dst0, dst, 0, 0);
1684  __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1685  dst += dst_stride_2x;
1686  }
1687 
1688  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
1689  src_zero0, src_zero1);
1690  DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
1691  src_plus0, src_plus1);
1692 
1693  DUP2_ARG2(__lsx_vilvl_b, src_plus0, src_minus10, src_plus1, src_minus11,
1694  src_minus10, src_minus11);
1695  DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
1696  src_zero0, src_zero1);
1697  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1698  cmp_minus10, cmp_minus11);
1699  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1700  diff_minus10, diff_minus11);
1701  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
1702  cmp_minus10, cmp_minus11);
1703  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1704  cmp_minus10, cmp_minus11);
1705  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1706  const1, cmp_minus11, diff_minus10, diff_minus11);
1707 
1708  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1709  diff_minus11, offset_mask0, offset_mask1);
1710  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
1711  offset_mask1);
1712  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1713  src_zero0, offset, dst0);
1714  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1715  sao_offset, offset, offset, offset);
1716  dst0 = __lsx_vxori_b(dst0, 128);
1717  dst0 = __lsx_vsadd_b(dst0, offset);
1718  dst0 = __lsx_vxori_b(dst0, 128);
1719 
1720  __lsx_vstelm_w(dst0, dst, 0, 0);
1721  __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
1722 }
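
The vxori_b(..., 128) / vsadd_b / vxori_b(..., 128) sequence that ends every kernel implements an unsigned clip to [0, 255] with the signed saturating byte add: flipping the sign bit maps u8 to s8, the saturating add clamps at the same points, and a second flip maps back. A scalar sketch with an illustrative name:

#include <stdint.h>

/* Why the kernels XOR with 128 around __lsx_vsadd_b (sketch). */
static uint8_t add_offset_clip(uint8_t pix, int8_t offset)
{
    int8_t biased = (int8_t)(pix ^ 0x80);   /* u8 -> s8                 */
    int    sum    = biased + offset;        /* saturating add, emulated */
    if (sum >  127) sum =  127;
    if (sum < -128) sum = -128;
    return (uint8_t)sum ^ 0x80;             /* s8 -> u8                 */
}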
1723 
1724 static void hevc_sao_edge_filter_45degree_8width_lsx(uint8_t *dst,
1725  int32_t dst_stride,
1726  uint8_t *src,
1727  int32_t src_stride,
1728  int16_t *sao_offset_val,
1729  int32_t height)
1730 {
1731  uint8_t *src_orig;
1732  const int32_t src_stride_2x = (src_stride << 1);
1733  const int32_t dst_stride_2x = (dst_stride << 1);
1734  __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
1735  __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
1736  __m128i edge_idx = {0x403000201, 0x0};
1737  __m128i const1 = __lsx_vldi(1);
1738  __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
1739  __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1740  __m128i src_minus10, src10, src_minus11, src11;
1741  __m128i src_zero0, src_plus10, src_zero1, src_plus11, dst0;
1742  __m128i offset_mask0, offset_mask1;
1743  __m128i zeros = {0};
1744 
1745  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1746  src_orig = src - 1;
1747 
1748  /* load in advance */
1749  DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0, src_minus10,
1750  src_minus11);
1751  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1752  src10, src11);
1753 
1754  for (height -= 2; height; height -= 2) {
1755  src_orig += src_stride_2x;
1756 
1757  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
1758  shuf1, src_zero0, src_zero1);
1759  DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
1760  src_plus10, src_plus11);
1761 
1762  DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11,
1763  src_minus11, src_minus10, src_minus11);
1764  DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
1765  src_zero0, src_zero1);
1766  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1767  cmp_minus10, cmp_minus11);
1768  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1769  cmp_minus11, diff_minus10, diff_minus11);
1770  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
1771  src_minus11, cmp_minus10, cmp_minus11);
1772  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1773  cmp_minus11, cmp_minus10, cmp_minus11);
1774  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1775  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
1776 
1777  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1778  diff_minus11, offset_mask0, offset_mask1);
1779  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
1780  offset_mask0, offset_mask1);
1781  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1782  src_zero0, offset, dst0);
1783  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1784  sao_offset, offset, offset, offset);
1785  dst0 = __lsx_vxori_b(dst0, 128);
1786  dst0 = __lsx_vsadd_b(dst0, offset);
1787  dst0 = __lsx_vxori_b(dst0, 128);
1788 
1789  src_minus10 = src10;
1790  src_minus11 = src11;
1791 
1792  /* load in advance */
1793  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1794  src10, src11);
1795  __lsx_vstelm_d(dst0, dst, 0, 0);
1796  __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1797  dst += dst_stride_2x;
1798  }
1799 
1800  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
1801  src_zero0, src_zero1);
1802  DUP2_ARG3(__lsx_vshuf_b, zeros, src10, shuf2, zeros, src11, shuf2,
1803  src_plus10, src_plus11);
1804  DUP2_ARG2(__lsx_vilvl_b, src_plus10, src_minus10, src_plus11, src_minus11,
1805  src_minus10, src_minus11);
1806  DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
1807  src_zero0, src_zero1);
1808 
1809  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
1810  cmp_minus10, cmp_minus11);
1811  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
1812  cmp_minus11, diff_minus10, diff_minus11);
1813  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
1814  cmp_minus10, cmp_minus11);
1815  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
1816  cmp_minus10, cmp_minus11);
1817  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
1818  const1, cmp_minus11, diff_minus10, diff_minus11);
1819 
1820  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
1821  diff_minus11, offset_mask0, offset_mask1);
1822  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
1823  offset_mask1);
1824  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
1825  src_zero0, offset, dst0);
1826  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
1827  sao_offset, offset, offset, offset);
1828  dst0 = __lsx_vxori_b(dst0, 128);
1829  dst0 = __lsx_vsadd_b(dst0, offset);
1830  dst0 = __lsx_vxori_b(dst0, 128);
1831 
1832  src_minus10 = src10;
1833  src_minus11 = src11;
1834 
1835  /* load in advance */
1836  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1837  src10, src11);
1838 
1839  __lsx_vstelm_d(dst0, dst, 0, 0);
1840  __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
1841 }
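
All of the 4- and 8-wide kernels share the same loop shape: the rows needed by the next iteration are loaded before the current results are stored (the "load in advance" comments), and the final pair of rows is filtered after the loop so the kernel never reads past the block. The sketch below shows only that control flow; the row filter is stubbed out as a copy and the name process_block is illustrative, not part of FFmpeg.

#include <stdint.h>
#include <string.h>

/* Skeleton of the two-rows-per-iteration loops (sketch only): load ahead,
 * then store, then a duplicated tail for the last pair of rows. */
static void process_block(uint8_t *dst, int dst_stride,
                          const uint8_t *src, int src_stride,
                          int width, int height)
{
    for (height -= 2; height; height -= 2) {
        /* filter the current pair of rows (stub: plain copy) */
        memcpy(dst, src, width);
        memcpy(dst + dst_stride, src + src_stride, width);
        src += 2 * src_stride;   /* the LSX code loads the next pair here,
                                    before the stores of the current pair */
        dst += 2 * dst_stride;
    }
    memcpy(dst, src, width);                          /* duplicated tail */
    memcpy(dst + dst_stride, src + src_stride, width);
}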
1842 
1843 static void hevc_sao_edge_filter_45degree_16multiple_lsx(uint8_t *dst,
1844  int32_t dst_stride,
1845  uint8_t *src,
1846  int32_t src_stride,
1847  int16_t *
1848  sao_offset_val,
1849  int32_t width,
1850  int32_t height)
1851 {
1852  uint8_t *src_orig = src;
1853  uint8_t *dst_orig = dst;
1854  int32_t v_cnt;
1855  const int32_t src_stride_2x = (src_stride << 1);
1856  const int32_t dst_stride_2x = (dst_stride << 1);
1857  const int32_t src_stride_4x = (src_stride << 2);
1858  const int32_t dst_stride_4x = (dst_stride << 2);
1859  const int32_t src_stride_3x = src_stride_2x + src_stride;
1860  const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1861 
1862  __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
1863  __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
1864  __m128i edge_idx = {0x403000201, 0x0};
1865  __m128i const1 = __lsx_vldi(1);
1866  __m128i cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1867  __m128i cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1868  __m128i diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1869  __m128i diff_plus13, src_minus14, src_plus13;
1870  __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3;
1871  __m128i src10, src_minus10, dst0, src11, src_minus11, dst1;
1872  __m128i src12, src_minus12, dst2, src13, src_minus13, dst3;
1873  __m128i src_zero0, src_plus10, src_zero1, src_plus11, src_zero2;
1874  __m128i src_zero3, sao_offset, src_plus12;
1875 
1876  sao_offset = __lsx_vld(sao_offset_val, 0);
1877  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
1878 
1879  for (; height; height -= 4) {
1880  src_orig = src - 1;
1881  dst_orig = dst;
1882  src_minus11 = __lsx_vld(src_orig, 0);
1883  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
1884  src_minus12, src_minus13);
1885  src_minus14 = __lsx_vldx(src_orig, src_stride_3x);
1886 
1887  for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
1888  src_minus10 = __lsx_vld(src_orig - src_stride, 0);
1889  src_orig += 16;
1890  src10 = __lsx_vld(src_orig, 0);
1891  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig,
1892  src_stride_2x, src11, src12);
1893  src13 = __lsx_vldx(src_orig, src_stride_3x);
1894  src_plus13 = __lsx_vld(src + v_cnt + src_stride_4x, 1);
1895 
1896  DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
1897  src_minus12, shuf1, src12, src_minus13, shuf1,
1898  src13, src_minus14, shuf1, src_zero0, src_zero1,
1899  src_zero2, src_zero3);
1900  DUP2_ARG3(__lsx_vshuf_b, src11, src_minus12, shuf2, src12,
1901  src_minus13, shuf2, src_plus10, src_plus11);
1902  src_plus12 = __lsx_vshuf_b(src13, src_minus14, shuf2);
1903 
1904  DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
1905  src_plus10, src_zero1, src_minus11, src_zero1,
1906  src_plus11, cmp_minus10, cmp_plus10,
1907  cmp_minus11, cmp_plus11);
1908  DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
1909  src_plus12, src_zero3, src_minus13, src_zero3,
1910  src_plus13, cmp_minus12, cmp_plus12,
1911  cmp_minus13, cmp_plus13);
1912  DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1913  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1914  cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
1915  diff_plus11);
1916  DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1917  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1918  cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
1919  diff_plus13);
1920  DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
1921  src_plus10, src_zero1, src_minus11, src_zero1,
1922  src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
1923  cmp_plus11);
1924  DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
1925  src_plus12, src_zero3, src_minus13, src_zero3,
1926  src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
1927  cmp_plus13);
1928  DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
1929  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
1930  cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
1931  cmp_plus11);
1932  DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
1933  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
1934  cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
1935  cmp_plus13);
1936  DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
1937  diff_plus10, const1, cmp_plus10, diff_minus11, const1,
1938  cmp_minus11, diff_plus11, const1, cmp_plus11,
1939  diff_minus10, diff_plus10, diff_minus11, diff_plus11);
1940  DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
1941  diff_plus12, const1, cmp_plus12, diff_minus13, const1,
1942  cmp_minus13, diff_plus13, const1, cmp_plus13,
1943  diff_minus12, diff_plus12, diff_minus13, diff_plus13);
1944 
1945  DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
1946  diff_plus11, diff_minus12, diff_plus12, diff_minus13,
1947  diff_plus13, offset_mask0, offset_mask1, offset_mask2,
1948  offset_mask3);
1949  DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
1950  offset_mask2, 2, offset_mask3, 2, offset_mask0,
1951  offset_mask1, offset_mask2, offset_mask3);
1952 
1953  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
1954  sao_offset, sao_offset, offset_mask0, offset_mask0,
1955  offset_mask0);
1956  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
1957  sao_offset, sao_offset, offset_mask1, offset_mask1,
1958  offset_mask1);
1959  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
1960  sao_offset, sao_offset, offset_mask2, offset_mask2,
1961  offset_mask2);
1962  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
1963  sao_offset, sao_offset, offset_mask3, offset_mask3,
1964  offset_mask3);
1965 
1966  DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128, src_zero2,
1967  128, src_zero3, 128, src_zero0, src_zero1, src_zero2,
1968  src_zero3);
1969  DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
1970  offset_mask1, src_zero2, offset_mask2, src_zero3,
1971  offset_mask3, dst0, dst1, dst2, dst3);
1972  DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
1973  128, dst0, dst1, dst2, dst3);
1974 
1975  src_minus11 = src10;
1976  src_minus12 = src11;
1977  src_minus13 = src12;
1978  src_minus14 = src13;
1979 
1980  __lsx_vst(dst0, dst_orig, 0);
1981  __lsx_vstx(dst1, dst_orig, dst_stride);
1982  __lsx_vstx(dst2, dst_orig, dst_stride_2x);
1983  __lsx_vstx(dst3, dst_orig, dst_stride_3x);
1984  dst_orig += 16;
1985  }
1986  src += src_stride_4x;
1987  dst += dst_stride_4x;
1988  }
1989 }
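
The diagonal kernels avoid unaligned loads by shuffling: shuf1 holds byte indices 1..16 and shuf2 indices 2..17, and __lsx_vshuf_b selects those bytes from the 32-byte concatenation of its two source registers (with the second operand supplying bytes 0..15, as these kernels appear to assume). With a zero vector as the high half this is a one- or two-byte left shift; in the 16-multiple kernels the high half is the next 16-byte block, so the shift crosses register boundaries. A scalar sketch under that assumption, with an illustrative name:

#include <stdint.h>
#include <string.h>

/* Scalar equivalent of the shuf1/shuf2 usage (sketch): concatenate the
 * two 16-byte sources and copy 16 bytes starting at offset 1 or 2. */
static void shift_concat(uint8_t dst[16], const uint8_t low[16],
                         const uint8_t high[16], int shift /* 1 or 2 */)
{
    uint8_t cat[32];
    memcpy(cat,      low,  16);
    memcpy(cat + 16, high, 16);
    memcpy(dst, cat + shift, 16);
}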
1990 
1991 static void hevc_sao_edge_filter_135degree_4width_lsx(uint8_t *dst,
1992  int32_t dst_stride,
1993  uint8_t *src,
1994  int32_t src_stride,
1995  int16_t *sao_offset_val,
1996  int32_t height)
1997 {
1998  uint8_t *src_orig;
1999  const int32_t src_stride_2x = (src_stride << 1);
2000  const int32_t dst_stride_2x = (dst_stride << 1);
2001 
2002  __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
2003  __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
2004  __m128i edge_idx = {0x403000201, 0x0};
2005  __m128i const1 = __lsx_vldi(1);
2006  __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
2007  __m128i src_zero0, src_zero1, dst0;
2008  __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
2009  __m128i src_minus10, src10, src_minus11, src11;
2010  __m128i offset_mask0, offset_mask1;
2011  __m128i zeros = {0};
2012 
2013  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
2014  src_orig = src - 1;
2015 
2016  /* load in advance */
2017  DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
2018  src_minus10, src_minus11);
2019  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
2020  src10, src11);
2021 
2022  for (height -= 2; height; height -= 2) {
2023  src_orig += src_stride_2x;
2024 
2025  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
2026  shuf1, src_zero0, src_zero1);
2027  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
2028  shuf2, src_minus10, src_minus11);
2029 
2030  DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
2031  src_minus10, src_minus11);
2032  DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
2033  src_zero0, src_zero1);
2034  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
2035  cmp_minus10, cmp_minus11);
2036  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
2037  cmp_minus11, diff_minus10, diff_minus11);
2038  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
2039  src_minus11, cmp_minus10, cmp_minus11);
2040  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
2041  cmp_minus11, cmp_minus10, cmp_minus11);
2042  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
2043  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
2044 
2045  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
2046  diff_minus11, offset_mask0, offset_mask1);
2047  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
2048  offset_mask0, offset_mask1);
2049  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
2050  src_zero0, offset, dst0);
2051  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
2052  sao_offset, offset, offset, offset);
2053  dst0 = __lsx_vxori_b(dst0, 128);
2054  dst0 = __lsx_vsadd_b(dst0, offset);
2055  dst0 = __lsx_vxori_b(dst0, 128);
2056 
2057  src_minus10 = src10;
2058  src_minus11 = src11;
2059 
2060  /* load in advance */
2061  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
2062  src10, src11);
2063 
2064  __lsx_vstelm_w(dst0, dst, 0, 0);
2065  __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
2066  dst += dst_stride_2x;
2067  }
2068 
2069  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
2070  src_zero0, src_zero1);
2071  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
2072  shuf2, src_minus10, src_minus11);
2073 
2074  DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
2075  src_minus10, src_minus11);
2076  DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
2077  src_zero0, src_zero1);
2078  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
2079  cmp_minus10, cmp_minus11);
2080  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
2081  cmp_minus11, diff_minus10, diff_minus11);
2082  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
2083  cmp_minus10, cmp_minus11);
2084  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
2085  cmp_minus10, cmp_minus11);
2086  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
2087  const1, cmp_minus11, diff_minus10, diff_minus11);
2088 
2089  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
2090  diff_minus11, offset_mask0, offset_mask1);
2091  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
2092  offset_mask1);
2093  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
2094  src_zero0, offset, dst0);
2095  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
2096  sao_offset, offset, offset, offset);
2097  dst0 = __lsx_vxori_b(dst0, 128);
2098  dst0 = __lsx_vsadd_b(dst0, offset);
2099  dst0 = __lsx_vxori_b(dst0, 128);
2100 
2101  __lsx_vstelm_w(dst0, dst, 0, 0);
2102  __lsx_vstelm_w(dst0, dst + dst_stride, 0, 2);
2103  dst += dst_stride_2x;
2104 }
2105 
2106 static void hevc_sao_edge_filter_135degree_8width_lsx(uint8_t *dst,
2107  int32_t dst_stride,
2108  uint8_t *src,
2109  int32_t src_stride,
2110  int16_t *sao_offset_val,
2111  int32_t height)
2112 {
2113  uint8_t *src_orig;
2114  const int32_t src_stride_2x = (src_stride << 1);
2115  const int32_t dst_stride_2x = (dst_stride << 1);
2116 
2117  __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
2118  __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
2119  __m128i edge_idx = {0x403000201, 0x0};
2120  __m128i const1 = __lsx_vldi(1);
2121  __m128i offset, sao_offset = __lsx_vld(sao_offset_val, 0);
2122  __m128i cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
2123  __m128i src_minus10, src10, src_minus11, src11;
2124  __m128i src_zero0, src_zero1, dst0;
2125  __m128i offset_mask0, offset_mask1;
2126  __m128i zeros = {0};
2127 
2128  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
2129  src_orig = src - 1;
2130 
2131  /* load in advance */
2132  DUP2_ARG2(__lsx_vld, src_orig - src_stride, 0, src_orig, 0,
2133  src_minus10, src_minus11);
2134  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
2135  src10, src11);
2136 
2137  for (height -= 2; height; height -= 2) {
2138  src_orig += src_stride_2x;
2139 
2140  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10,
2141  shuf1, src_zero0, src_zero1);
2142  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
2143  shuf2, src_minus10, src_minus11);
2144 
2145  DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
2146  src_minus10, src_minus11);
2147  DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
2148  src_zero0, src_zero1);
2149  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
2150  cmp_minus10, cmp_minus11);
2151  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
2152  cmp_minus11, diff_minus10, diff_minus11);
2153  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1,
2154  src_minus11, cmp_minus10, cmp_minus11);
2155  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11,
2156  cmp_minus11, cmp_minus10, cmp_minus11);
2157  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
2158  diff_minus11, const1, cmp_minus11, diff_minus10, diff_minus11);
2159 
2160  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
2161  diff_minus11, offset_mask0, offset_mask1);
2162  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2,
2163  offset_mask0, offset_mask1);
2164  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
2165  src_zero0, offset, dst0);
2166  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
2167  sao_offset, offset, offset, offset);
2168  dst0 = __lsx_vxori_b(dst0, 128);
2169  dst0 = __lsx_vsadd_b(dst0, offset);
2170  dst0 = __lsx_vxori_b(dst0, 128);
2171 
2172  src_minus10 = src10;
2173  src_minus11 = src11;
2174 
2175  /* load in advance */
2176  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
2177  src10, src11);
2178 
2179  __lsx_vstelm_d(dst0, dst, 0, 0);
2180  __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
2181  dst += dst_stride_2x;
2182  }
2183 
2184  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus11, shuf1, zeros, src10, shuf1,
2185  src_zero0, src_zero1);
2186  DUP2_ARG3(__lsx_vshuf_b, zeros, src_minus10, shuf2, zeros, src_minus11,
2187  shuf2, src_minus10, src_minus11);
2188 
2189  DUP2_ARG2(__lsx_vilvl_b, src10, src_minus10, src11, src_minus11,
2190  src_minus10, src_minus11);
2191  DUP2_ARG2(__lsx_vilvl_b, src_zero0, src_zero0, src_zero1, src_zero1,
2192  src_zero0, src_zero1);
2193  DUP2_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero1, src_minus11,
2194  cmp_minus10, cmp_minus11);
2195  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
2196  diff_minus10, diff_minus11);
2197  DUP2_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero1, src_minus11,
2198  cmp_minus10, cmp_minus11);
2199  DUP2_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_minus11, cmp_minus11,
2200  cmp_minus10, cmp_minus11);
2201  DUP2_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10, diff_minus11,
2202  const1, cmp_minus11, diff_minus10, diff_minus11);
2203 
2204  DUP2_ARG2(__lsx_vhaddw_hu_bu, diff_minus10, diff_minus10, diff_minus11,
2205  diff_minus11, offset_mask0, offset_mask1);
2206  DUP2_ARG2(__lsx_vaddi_hu, offset_mask0, 2, offset_mask1, 2, offset_mask0,
2207  offset_mask1);
2208  DUP2_ARG2(__lsx_vpickev_b, offset_mask1, offset_mask0, src_zero1,
2209  src_zero0, offset, dst0);
2210  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset, sao_offset,
2211  sao_offset, offset, offset, offset);
2212  dst0 = __lsx_vxori_b(dst0, 128);
2213  dst0 = __lsx_vsadd_b(dst0, offset);
2214  dst0 = __lsx_vxori_b(dst0, 128);
2215 
2216  __lsx_vstelm_d(dst0, dst, 0, 0);
2217  __lsx_vstelm_d(dst0, dst + dst_stride, 0, 1);
2218 }
2219 
2220 static void hevc_sao_edge_filter_135degree_16multiple_lsx(uint8_t *dst,
2221  int32_t dst_stride,
2222  uint8_t *src,
2223  int32_t src_stride,
2224  int16_t *sao_offset_val,
2225  int32_t width,
2226  int32_t height)
2227 {
2228  uint8_t *src_orig, *dst_orig;
2229  int32_t v_cnt;
2230  const int32_t src_stride_2x = (src_stride << 1);
2231  const int32_t dst_stride_2x = (dst_stride << 1);
2232  const int32_t src_stride_4x = (src_stride << 2);
2233  const int32_t dst_stride_4x = (dst_stride << 2);
2234  const int32_t src_stride_3x = src_stride_2x + src_stride;
2235  const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
2236 
2237  __m128i shuf1 = {0x807060504030201, 0x100F0E0D0C0B0A09};
2238  __m128i shuf2 = {0x908070605040302, 0x11100F0E0D0C0B0A};
2239  __m128i edge_idx = {0x403000201, 0x0};
2240  __m128i const1 = __lsx_vldi(1);
2241  __m128i dst0, dst1, dst2, dst3;
2242  __m128i cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
2243  __m128i cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
2244  __m128i diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
2245  __m128i diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
2246  __m128i src_plus10, src_plus11, src_plus12, src_plus13;
2247  __m128i src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
2248  __m128i offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
2249 
2250  sao_offset = __lsx_vld(sao_offset_val, 0);
2251  sao_offset = __lsx_vpickev_b(sao_offset, sao_offset);
2252 
2253  for (; height; height -= 4) {
2254  src_orig = src - 1;
2255  dst_orig = dst;
2256 
2257  src_minus11 = __lsx_vld(src_orig, 0);
2258  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
2259  src_plus10, src_plus11);
2260  src_plus12 = __lsx_vldx(src_orig, src_stride_3x);
2261 
2262  for (v_cnt = 0; v_cnt < width; v_cnt += 16) {
2263  src_minus10 = __lsx_vld(src_orig - src_stride, 2);
2264  src_plus13 = __lsx_vldx(src_orig, src_stride_4x);
2265  src_orig += 16;
2266  src10 = __lsx_vld(src_orig, 0);
2267  DUP2_ARG2(__lsx_vldx, src_orig, src_stride, src_orig, src_stride_2x,
2268  src11, src12);
2269  src13 = __lsx_vldx(src_orig, src_stride_3x);
2270 
2271  DUP4_ARG3(__lsx_vshuf_b, src10, src_minus11, shuf1, src11,
2272  src_plus10, shuf1, src12, src_plus11, shuf1, src13,
2273  src_plus12, shuf1, src_zero0, src_zero1, src_zero2,
2274  src_zero3);
2275  src_minus11 = __lsx_vshuf_b(src10, src_minus11, shuf2);
2276  DUP2_ARG3(__lsx_vshuf_b, src11, src_plus10, shuf2, src12,
2277  src_plus11, shuf2, src_minus12, src_minus13);
2278 
2279  DUP4_ARG2(__lsx_vseq_b, src_zero0, src_minus10, src_zero0,
2280  src_plus10, src_zero1, src_minus11, src_zero1,
2281  src_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
2282  cmp_plus11);
2283  DUP4_ARG2(__lsx_vseq_b, src_zero2, src_minus12, src_zero2,
2284  src_plus12, src_zero3, src_minus13, src_zero3,
2285  src_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
2286  cmp_plus13);
2287  DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
2288  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
2289  cmp_plus11, diff_minus10, diff_plus10, diff_minus11,
2290  diff_plus11);
2291  DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
2292  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
2293  cmp_plus13, diff_minus12, diff_plus12, diff_minus13,
2294  diff_plus13);
2295  DUP4_ARG2(__lsx_vsle_bu, src_zero0, src_minus10, src_zero0,
2296  src_plus10, src_zero1, src_minus11, src_zero1, src_plus11,
2297  cmp_minus10, cmp_plus10, cmp_minus11, cmp_plus11);
2298  DUP4_ARG2(__lsx_vsle_bu, src_zero2, src_minus12, src_zero2,
2299  src_plus12, src_zero3, src_minus13, src_zero3, src_plus13,
2300  cmp_minus12, cmp_plus12, cmp_minus13, cmp_plus13);
2301  DUP4_ARG2(__lsx_vnor_v, cmp_minus10, cmp_minus10, cmp_plus10,
2302  cmp_plus10, cmp_minus11, cmp_minus11, cmp_plus11,
2303  cmp_plus11, cmp_minus10, cmp_plus10, cmp_minus11,
2304  cmp_plus11);
2305  DUP4_ARG2(__lsx_vnor_v, cmp_minus12, cmp_minus12, cmp_plus12,
2306  cmp_plus12, cmp_minus13, cmp_minus13, cmp_plus13,
2307  cmp_plus13, cmp_minus12, cmp_plus12, cmp_minus13,
2308  cmp_plus13);
2309  DUP4_ARG3(__lsx_vbitsel_v, diff_minus10, const1, cmp_minus10,
2310  diff_plus10, const1, cmp_plus10, diff_minus11, const1,
2311  cmp_minus11, diff_plus11, const1, cmp_plus11,
2312  diff_minus10, diff_plus10, diff_minus11, diff_plus11);
2313  DUP4_ARG3(__lsx_vbitsel_v, diff_minus12, const1, cmp_minus12,
2314  diff_plus12, const1, cmp_plus12, diff_minus13, const1,
2315  cmp_minus13, diff_plus13, const1, cmp_plus13,
2316  diff_minus12, diff_plus12, diff_minus13, diff_plus13);
2317 
2318  DUP4_ARG2(__lsx_vadd_b, diff_minus10, diff_plus10, diff_minus11,
2319  diff_plus11, diff_minus12, diff_plus12, diff_minus13,
2320  diff_plus13, offset_mask0, offset_mask1, offset_mask2,
2321  offset_mask3);
2322  DUP4_ARG2(__lsx_vaddi_bu, offset_mask0, 2, offset_mask1, 2,
2323  offset_mask2, 2, offset_mask3, 2, offset_mask0,
2324  offset_mask1, offset_mask2, offset_mask3);
2325 
2326  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask0,
2327  sao_offset, sao_offset, offset_mask0, offset_mask0,
2328  offset_mask0);
2329  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask1,
2330  sao_offset, sao_offset, offset_mask1, offset_mask1,
2331  offset_mask1);
2332  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask2,
2333  sao_offset, sao_offset, offset_mask2, offset_mask2,
2334  offset_mask2);
2335  DUP2_ARG3(__lsx_vshuf_b, edge_idx, edge_idx, offset_mask3,
2336  sao_offset, sao_offset, offset_mask3, offset_mask3,
2337  offset_mask3);
2338 
2339  DUP4_ARG2(__lsx_vxori_b, src_zero0, 128, src_zero1, 128,
2340  src_zero2, 128, src_zero3, 128, src_zero0, src_zero1,
2341  src_zero2, src_zero3);
2342  DUP4_ARG2(__lsx_vsadd_b, src_zero0, offset_mask0, src_zero1,
2343  offset_mask1, src_zero2, offset_mask2, src_zero3,
2344  offset_mask3, dst0, dst1, dst2, dst3);
2345  DUP4_ARG2(__lsx_vxori_b, dst0, 128, dst1, 128, dst2, 128, dst3,
2346  128, dst0, dst1, dst2, dst3);
2347 
2348  src_minus11 = src10;
2349  src_plus10 = src11;
2350  src_plus11 = src12;
2351  src_plus12 = src13;
2352 
2353  __lsx_vst(dst0, dst_orig, 0);
2354  __lsx_vstx(dst1, dst_orig, dst_stride);
2355  __lsx_vstx(dst2, dst_orig, dst_stride_2x);
2356  __lsx_vstx(dst3, dst_orig, dst_stride_3x);
2357  dst_orig += 16;
2358  }
2359 
2360  src += src_stride_4x;
2361  dst += dst_stride_4x;
2362  }
2363 }
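
Taken together, the four groups of kernels differ only in which neighbour pair they compare, following the HEVC edge-offset classes selected by the dispatcher below (eo 0..3). The table below summarises the (dx, dy) offsets as they appear in the loads above; it is a reading aid with an illustrative name, not code from FFmpeg.

/* Neighbour offsets, relative to the current pixel, compared by each
 * group of kernels (eo is the class passed to the dispatcher below). */
static const struct { int dx0, dy0, dx1, dy1; } eo_neighbours[4] = {
    { -1,  0, +1,  0 },  /* eo 0: "0degree"   kernels, horizontal pair */
    {  0, -1,  0, +1 },  /* eo 1: "90degree"  kernels, vertical pair   */
    { -1, -1, +1, +1 },  /* eo 2: "45degree"  kernels in this file     */
    { +1, -1, -1, +1 },  /* eo 3: "135degree" kernels in this file     */
};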
2364 
2365 void ff_hevc_sao_edge_filter_8_lsx(uint8_t *dst, uint8_t *src,
2366  ptrdiff_t stride_dst,
2367  int16_t *sao_offset_val,
2368  int eo, int width, int height)
2369 {
2370  ptrdiff_t stride_src = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);
2371 
2372  switch (eo) {
2373  case 0:
2374  if (width >> 4) {
2375  hevc_sao_edge_filter_0degree_16multiple_lsx(dst, stride_dst,
2376  src, stride_src,
2377  sao_offset_val,
2378  width - (width & 0x0F),
2379  height);
2380  dst += width & 0xFFFFFFF0;
2381  src += width & 0xFFFFFFF0;
2382  width &= 0x0F;
2383  }
2384 
2385  if (width >> 3) {
2386  hevc_sao_edge_filter_0degree_8width_lsx(dst, stride_dst,
2387  src, stride_src,
2388  sao_offset_val, height);
2389  dst += 8;
2390  src += 8;
2391  width &= 0x07;
2392  }
2393 
2394  if (width) {
2395  hevc_sao_edge_filter_0degree_4width_lsx(dst, stride_dst,
2396  src, stride_src,
2397  sao_offset_val, height);
2398  }
2399  break;
2400 
2401  case 1:
2402  if (width >> 4) {
2403  hevc_sao_edge_filter_90degree_16multiple_lsx(dst, stride_dst,
2404  src, stride_src,
2405  sao_offset_val,
2406  width - (width & 0x0F),
2407  height);
2408  dst += width & 0xFFFFFFF0;
2409  src += width & 0xFFFFFFF0;
2410  width &= 0x0F;
2411  }
2412 
2413  if (width >> 3) {
2414  hevc_sao_edge_filter_90degree_8width_lsx(dst, stride_dst,
2415  src, stride_src,
2416  sao_offset_val, height);
2417  dst += 8;
2418  src += 8;
2419  width &= 0x07;
2420  }
2421 
2422  if (width) {
2423  hevc_sao_edge_filter_90degree_4width_lsx(dst, stride_dst,
2424  src, stride_src,
2425  sao_offset_val, height);
2426  }
2427  break;
2428 
2429  case 2:
2430  if (width >> 4) {
2431  hevc_sao_edge_filter_45degree_16multiple_lsx(dst, stride_dst,
2432  src, stride_src,
2433  sao_offset_val,
2434  width - (width & 0x0F),
2435  height);
2436  dst += width & 0xFFFFFFF0;
2437  src += width & 0xFFFFFFF0;
2438  width &= 0x0F;
2439  }
2440 
2441  if (width >> 3) {
2442  hevc_sao_edge_filter_45degree_8width_lsx(dst, stride_dst,
2443  src, stride_src,
2444  sao_offset_val, height);
2445  dst += 8;
2446  src += 8;
2447  width &= 0x07;
2448  }
2449 
2450  if (width) {
2451  hevc_sao_edge_filter_45degree_4width_lsx(dst, stride_dst,
2452  src, stride_src,
2453  sao_offset_val, height);
2454  }
2455  break;
2456 
2457  case 3:
2458  if (width >> 4) {
2459  hevc_sao_edge_filter_135degree_16multiple_lsx(dst, stride_dst,
2460  src, stride_src,
2461  sao_offset_val,
2462  width - (width & 0x0F),
2463  height);
2464  dst += width & 0xFFFFFFF0;
2465  src += width & 0xFFFFFFF0;
2466  width &= 0x0F;
2467  }
2468 
2469  if (width >> 3) {
2470  hevc_sao_edge_filter_135degree_8width_lsx(dst, stride_dst,
2471  src, stride_src,
2472  sao_offset_val, height);
2473  dst += 8;
2474  src += 8;
2475  width &= 0x07;
2476  }
2477 
2478  if (width) {
2479  hevc_sao_edge_filter_135degree_4width_lsx(dst, stride_dst,
2480  src, stride_src,
2481  sao_offset_val, height);
2482  }
2483  break;
2484  }
2485 }
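
The dispatcher walks each row from left to right in up to three passes: all whole 16-pixel groups first, then one 8-pixel group, then a final 4-pixel group, masking the processed width off after each step. stride_src is the fixed row pitch of the temporary buffer the HEVC decoder copies CTB samples into before SAO, the same constant used by the generic C SAO code. A small sketch of the width split, with an illustrative helper name:

/* Sketch of the width decomposition in ff_hevc_sao_edge_filter_8_lsx:
 * e.g. width = 28 -> one 16-multiple pass, one 8-pixel pass, one 4-pixel
 * pass.  The helper name is illustrative only. */
static void split_width(int width, int *w16, int *w8, int *w4)
{
    *w16   = width & ~0x0F;          /* width - (width & 0x0F)            */
    width &= 0x0F;
    *w8    = (width >> 3) ? 8 : 0;   /* at most one 8-pixel column        */
    *w4    = width & 0x07;           /* remainder handled by 4-width code */
}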