FFmpeg
vp9_lpf_lsx.c
1 /*
2  * Copyright (c) 2021 Loongson Technology Corporation Limited
3  * Contributed by Jin Bo <jinbo@loongson.cn>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include "libavcodec/vp9dsp.h"
23 #include "libavutil/loongarch/loongson_intrinsics.h"
24 #include "libavutil/common.h"
25 #include "vp9dsp_loongarch.h"
26 
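The two helper macros below load (LSX_LD_8) or store (LSX_ST_8) eight 16-byte rows in one go. The caller passes the precomputed stride, 2*stride, 3*stride and 4*stride, and the pointer argument is advanced by four rows after the first half, so it is modified by the macro.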
27 #define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, _in2, \
28  _in3, _in4, _in5, _in6, _in7) \
29 { \
30  _in0 = __lsx_vld(_src, 0); \
31  _in1 = __lsx_vldx(_src, _stride); \
32  _in2 = __lsx_vldx(_src, _stride2); \
33  _in3 = __lsx_vldx(_src, _stride3); \
34  _src += _stride4; \
35  _in4 = __lsx_vld(_src, 0); \
36  _in5 = __lsx_vldx(_src, _stride); \
37  _in6 = __lsx_vldx(_src, _stride2); \
38  _in7 = __lsx_vldx(_src, _stride3); \
39 }
40 
41 #define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7, \
42  _dst, _stride, _stride2, _stride3, _stride4) \
43 { \
44  __lsx_vst(_dst0, _dst, 0); \
45  __lsx_vstx(_dst1, _dst, _stride); \
46  __lsx_vstx(_dst2, _dst, _stride2); \
47  __lsx_vstx(_dst3, _dst, _stride3); \
48  _dst += _stride4; \
49  __lsx_vst(_dst4, _dst, 0); \
50  __lsx_vstx(_dst5, _dst, _stride); \
51  __lsx_vstx(_dst6, _dst, _stride2); \
52  __lsx_vstx(_dst7, _dst, _stride3); \
53 }
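A minimal usage sketch (illustrative only, not part of the original file; the helper name copy_8_rows_lsx is made up): copying an 8x16 block with the two macros, showing that the stride multiples come from the caller and that the pointers are advanced inside the macros.

    static void copy_8_rows_lsx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
    {
        ptrdiff_t stride2 = stride << 1;
        ptrdiff_t stride3 = stride2 + stride;
        ptrdiff_t stride4 = stride2 << 1;
        __m128i row0, row1, row2, row3, row4, row5, row6, row7;

        /* src is advanced by 4 * stride inside LSX_LD_8 */
        LSX_LD_8(src, stride, stride2, stride3, stride4,
                 row0, row1, row2, row3, row4, row5, row6, row7);
        /* dst is advanced the same way inside LSX_ST_8 */
        LSX_ST_8(row0, row1, row2, row3, row4, row5, row6, row7,
                 dst, stride, stride2, stride3, stride4);
    }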
54 
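VP9_LPF_FILTER4_4W is the standard VP9 filter4 step: the four centre pixels are biased to signed range (xor 0x80), a filter value is built from the hev-gated (p1 - q1) term plus three saturating additions of (q0 - p0), masked by the filter mask, and applied as (filt + 4) >> 3 to q0 and (filt + 3) >> 3 to p0; p1 and q1 receive a rounded half-step only where hev is not set.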
55 #define VP9_LPF_FILTER4_4W(p1_src, p0_src, q0_src, q1_src, mask_src, hev_src, \
56  p1_dst, p0_dst, q0_dst, q1_dst) \
57 { \
58  __m128i p1_tmp, p0_tmp, q0_tmp, q1_tmp, q0_sub_p0, filt, filt1, filt2; \
59  const __m128i cnst3b = __lsx_vldi(3); \
60  const __m128i cnst4b = __lsx_vldi(4); \
61  \
62  p1_tmp = __lsx_vxori_b(p1_src, 0x80); \
63  p0_tmp = __lsx_vxori_b(p0_src, 0x80); \
64  q0_tmp = __lsx_vxori_b(q0_src, 0x80); \
65  q1_tmp = __lsx_vxori_b(q1_src, 0x80); \
66  \
67  filt = __lsx_vssub_b(p1_tmp, q1_tmp); \
68  \
69  filt = filt & hev_src; \
70  \
71  q0_sub_p0 = __lsx_vssub_b(q0_tmp, p0_tmp); \
72  filt = __lsx_vsadd_b(filt, q0_sub_p0); \
73  filt = __lsx_vsadd_b(filt, q0_sub_p0); \
74  filt = __lsx_vsadd_b(filt, q0_sub_p0); \
75  filt = filt & mask_src; \
76  \
77  filt1 = __lsx_vsadd_b(filt, cnst4b); \
78  filt1 = __lsx_vsrai_b(filt1, 3); \
79  \
80  filt2 = __lsx_vsadd_b(filt, cnst3b); \
81  filt2 = __lsx_vsrai_b(filt2, 3); \
82  \
83  q0_tmp = __lsx_vssub_b(q0_tmp, filt1); \
84  q0_dst = __lsx_vxori_b(q0_tmp, 0x80); \
85  p0_tmp = __lsx_vsadd_b(p0_tmp, filt2); \
86  p0_dst = __lsx_vxori_b(p0_tmp, 0x80); \
87  \
88  filt = __lsx_vsrari_b(filt1, 1); \
89  hev_src = __lsx_vxori_b(hev_src, 0xff); \
90  filt = filt & hev_src; \
91  \
92  q1_tmp = __lsx_vssub_b(q1_tmp, filt); \
93  q1_dst = __lsx_vxori_b(q1_tmp, 0x80); \
94  p1_tmp = __lsx_vsadd_b(p1_tmp, filt); \
95  p1_dst = __lsx_vxori_b(p1_tmp, 0x80); \
96 }
97 
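VP9_FLAT4 extends the measure carried in through flat_dst (max of |p1 - p0| and |q1 - q0| from LPF_MASK_HEV) with |p2 - p0|, |q2 - q0|, |p3 - p0| and |q3 - q0|; the result is set where that maximum is at most 1, and it is ANDed with the enclosing mask variable, which the macro uses directly rather than taking as a parameter.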
98 #define VP9_FLAT4(p3_src, p2_src, p0_src, q0_src, q2_src, q3_src, flat_dst) \
99 { \
100  __m128i f_tmp = __lsx_vldi(1); \
101  __m128i p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
102  \
103  p2_a_sub_p0 = __lsx_vabsd_bu(p2_src, p0_src); \
104  q2_a_sub_q0 = __lsx_vabsd_bu(q2_src, q0_src); \
105  p3_a_sub_p0 = __lsx_vabsd_bu(p3_src, p0_src); \
106  q3_a_sub_q0 = __lsx_vabsd_bu(q3_src, q0_src); \
107  \
108  p2_a_sub_p0 = __lsx_vmax_bu(p2_a_sub_p0, q2_a_sub_q0); \
109  flat_dst = __lsx_vmax_bu(p2_a_sub_p0, flat_dst); \
110  p3_a_sub_p0 = __lsx_vmax_bu(p3_a_sub_p0, q3_a_sub_q0); \
111  flat_dst = __lsx_vmax_bu(p3_a_sub_p0, flat_dst); \
112  \
113  flat_dst = __lsx_vslt_bu(f_tmp, flat_dst); \
114  flat_dst = __lsx_vxori_b(flat_dst, 0xff); \
115  flat_dst = flat_dst & mask; \
116 }
117 
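VP9_FLAT5 applies the same "at most 1" test to the outer pixels p4..p7 and q4..q7 against p0/q0 and ANDs the result with flat_src, so the wide (16-sample) filter is only enabled where the 8-sample filter already applies.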
118 #define VP9_FLAT5(p7_src, p6_src, p5_src, p4_src, p0_src, q0_src, q4_src, \
119  q5_src, q6_src, q7_src, flat_src, flat2_dst) \
120 { \
121  __m128i f_tmp = __lsx_vldi(1); \
122  __m128i p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
123  __m128i p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
124  \
125  p4_a_sub_p0 = __lsx_vabsd_bu(p4_src, p0_src); \
126  q4_a_sub_q0 = __lsx_vabsd_bu(q4_src, q0_src); \
127  p5_a_sub_p0 = __lsx_vabsd_bu(p5_src, p0_src); \
128  q5_a_sub_q0 = __lsx_vabsd_bu(q5_src, q0_src); \
129  p6_a_sub_p0 = __lsx_vabsd_bu(p6_src, p0_src); \
130  q6_a_sub_q0 = __lsx_vabsd_bu(q6_src, q0_src); \
131  p7_a_sub_p0 = __lsx_vabsd_bu(p7_src, p0_src); \
132  q7_a_sub_q0 = __lsx_vabsd_bu(q7_src, q0_src); \
133  \
134  p4_a_sub_p0 = __lsx_vmax_bu(p4_a_sub_p0, q4_a_sub_q0); \
135  flat2_dst = __lsx_vmax_bu(p5_a_sub_p0, q5_a_sub_q0); \
136  flat2_dst = __lsx_vmax_bu(p4_a_sub_p0, flat2_dst); \
137  p6_a_sub_p0 = __lsx_vmax_bu(p6_a_sub_p0, q6_a_sub_q0); \
138  flat2_dst = __lsx_vmax_bu(p6_a_sub_p0, flat2_dst); \
139  p7_a_sub_p0 = __lsx_vmax_bu(p7_a_sub_p0, q7_a_sub_q0); \
140  flat2_dst = __lsx_vmax_bu(p7_a_sub_p0, flat2_dst); \
141  \
142  flat2_dst = __lsx_vslt_bu(f_tmp, flat2_dst); \
143  flat2_dst = __lsx_vxori_b(flat2_dst, 0xff); \
144  flat2_dst = flat2_dst & flat_src; \
145 }
146 
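VP9_FILTER8 computes the six filter8 replacement pixels p2..q2 as rounded 3-bit shifts of 8-pixel weighted sums over p3..q3; the inputs are expected to be zero-extended to 16-bit lanes so the sums cannot overflow.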
147 #define VP9_FILTER8(p3_src, p2_src, p1_src, p0_src, \
148  q0_src, q1_src, q2_src, q3_src, \
149  p2_filt8_dst, p1_filt8_dst, p0_filt8_dst, \
150  q0_filt8_dst, q1_filt8_dst, q2_filt8_dst) \
151 { \
152  __m128i tmp0, tmp1, tmp2; \
153  \
154  tmp2 = __lsx_vadd_h(p2_src, p1_src); \
155  tmp2 = __lsx_vadd_h(tmp2, p0_src); \
156  tmp0 = __lsx_vslli_h(p3_src, 1); \
157  \
158  tmp0 = __lsx_vadd_h(tmp0, tmp2); \
159  tmp0 = __lsx_vadd_h(tmp0, q0_src); \
160  tmp1 = __lsx_vadd_h(tmp0, p3_src); \
161  tmp1 = __lsx_vadd_h(tmp1, p2_src); \
162  p2_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
163  \
164  tmp1 = __lsx_vadd_h(tmp0, p1_src); \
165  tmp1 = __lsx_vadd_h(tmp1, q1_src); \
166  p1_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
167  \
168  tmp1 = __lsx_vadd_h(q2_src, q1_src); \
169  tmp1 = __lsx_vadd_h(tmp1, q0_src); \
170  tmp2 = __lsx_vadd_h(tmp2, tmp1); \
171  tmp0 = __lsx_vadd_h(tmp2, p0_src); \
172  tmp0 = __lsx_vadd_h(tmp0, p3_src); \
173  p0_filt8_dst = __lsx_vsrari_h(tmp0, 3); \
174  \
175  tmp0 = __lsx_vadd_h(q2_src, q3_src); \
176  tmp0 = __lsx_vadd_h(tmp0, p0_src); \
177  tmp0 = __lsx_vadd_h(tmp0, tmp1); \
178  tmp1 = __lsx_vadd_h(q3_src, q3_src); \
179  tmp1 = __lsx_vadd_h(tmp1, tmp0); \
180  q2_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
181  \
182  tmp0 = __lsx_vadd_h(tmp2, q3_src); \
183  tmp1 = __lsx_vadd_h(tmp0, q0_src); \
184  q0_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
185  \
186  tmp1 = __lsx_vsub_h(tmp0, p2_src); \
187  tmp0 = __lsx_vadd_h(q1_src, q3_src); \
188  tmp1 = __lsx_vadd_h(tmp0, tmp1); \
189  q1_filt8_dst = __lsx_vsrari_h(tmp1, 3); \
190 }
191 
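LPF_MASK_HEV produces the two per-pixel masks used by every function below: hev is set where max(|p1 - p0|, |q1 - q0|) exceeds thresh, and mask is set where 2 * |p0 - q0| + |p1 - q1| / 2 stays within b_limit and every neighbouring-pixel difference stays within limit. flat_dst returns max(|p1 - p0|, |q1 - q0|) for reuse by VP9_FLAT4.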
192 #define LPF_MASK_HEV(p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, \
193  q2_src, q3_src, limit_src, b_limit_src, thresh_src, \
194  hev_dst, mask_dst, flat_dst) \
195 { \
196  __m128i p3_asub_p2_tmp, p2_asub_p1_tmp, p1_asub_p0_tmp, q1_asub_q0_tmp; \
197  __m128i p1_asub_q1_tmp, p0_asub_q0_tmp, q3_asub_q2_tmp, q2_asub_q1_tmp; \
198  \
199  /* absolute subtraction of pixel values */ \
200  p3_asub_p2_tmp = __lsx_vabsd_bu(p3_src, p2_src); \
201  p2_asub_p1_tmp = __lsx_vabsd_bu(p2_src, p1_src); \
202  p1_asub_p0_tmp = __lsx_vabsd_bu(p1_src, p0_src); \
203  q1_asub_q0_tmp = __lsx_vabsd_bu(q1_src, q0_src); \
204  q2_asub_q1_tmp = __lsx_vabsd_bu(q2_src, q1_src); \
205  q3_asub_q2_tmp = __lsx_vabsd_bu(q3_src, q2_src); \
206  p0_asub_q0_tmp = __lsx_vabsd_bu(p0_src, q0_src); \
207  p1_asub_q1_tmp = __lsx_vabsd_bu(p1_src, q1_src); \
208  \
209  /* calculation of hev */ \
210  flat_dst = __lsx_vmax_bu(p1_asub_p0_tmp, q1_asub_q0_tmp); \
211  hev_dst = __lsx_vslt_bu(thresh_src, flat_dst); \
212  \
213  /* calculation of mask */ \
214  p0_asub_q0_tmp = __lsx_vsadd_bu(p0_asub_q0_tmp, p0_asub_q0_tmp); \
215  p1_asub_q1_tmp = __lsx_vsrli_b(p1_asub_q1_tmp, 1); \
216  p0_asub_q0_tmp = __lsx_vsadd_bu(p0_asub_q0_tmp, p1_asub_q1_tmp); \
217  \
218  mask_dst = __lsx_vslt_bu(b_limit_src, p0_asub_q0_tmp); \
219  mask_dst = __lsx_vmax_bu(flat_dst, mask_dst); \
220  p3_asub_p2_tmp = __lsx_vmax_bu(p3_asub_p2_tmp, p2_asub_p1_tmp); \
221  mask_dst = __lsx_vmax_bu(p3_asub_p2_tmp, mask_dst); \
222  q2_asub_q1_tmp = __lsx_vmax_bu(q2_asub_q1_tmp, q3_asub_q2_tmp); \
223  mask_dst = __lsx_vmax_bu(q2_asub_q1_tmp, mask_dst); \
224  \
225  mask_dst = __lsx_vslt_bu(limit_src, mask_dst); \
226  mask_dst = __lsx_vxori_b(mask_dst, 0xff); \
227 }
228 
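Naming of the exported functions: ff_loop_filter_{v,h}_{wd}_{len}_lsx. The _v_ variants read the rows above and below dst, i.e. they filter across a horizontal edge; the _h_ variants read the columns to the left and right of dst, transpose, filter, and transpose back. wd is the filter size (4, 8 or 16); two-digit forms such as 44, 88, 84 and 48 filter two adjacent 8-pixel halves with possibly different parameters, the second half's thresh/limit/b_limit value being packed into bits 8-15 of the corresponding argument. len is the number of pixels along the edge (8 or 16).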
229 void ff_loop_filter_v_4_8_lsx(uint8_t *dst, ptrdiff_t stride,
230  int32_t b_limit_ptr,
231  int32_t limit_ptr,
232  int32_t thresh_ptr)
233 {
234  ptrdiff_t stride2 = stride << 1;
235  ptrdiff_t stride3 = stride2 + stride;
236  ptrdiff_t stride4 = stride2 << 1;
237  __m128i mask, hev, flat, thresh, b_limit, limit;
238  __m128i p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
239 
240  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
241  dst, -stride, p3, p2, p1, p0);
242  q0 = __lsx_vld(dst, 0);
243  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
244  q3 = __lsx_vldx(dst, stride3);
245 
246  thresh = __lsx_vreplgr2vr_b(thresh_ptr);
247  b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
248  limit = __lsx_vreplgr2vr_b(limit_ptr);
249 
250  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
251  hev, mask, flat);
252 
253  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
254  q1_out);
255 
256  __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
257  __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
258  __lsx_vstelm_d(q0_out, dst , 0, 0);
259  __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
260 }
261 
262 void ff_loop_filter_v_44_16_lsx(uint8_t *dst, ptrdiff_t stride,
263  int32_t b_limit_ptr,
264  int32_t limit_ptr,
265  int32_t thresh_ptr)
266 {
267  ptrdiff_t stride2 = stride << 1;
268  ptrdiff_t stride3 = stride2 + stride;
269  ptrdiff_t stride4 = stride2 << 1;
270  __m128i mask, hev, flat, thresh0, b_limit0;
271  __m128i limit0, thresh1, b_limit1, limit1;
272  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
273 
274  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
275  dst, -stride, p3, p2, p1, p0);
276  q0 = __lsx_vld(dst, 0);
277  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
278  q3 = __lsx_vldx(dst, stride3);
279 
280  thresh0 = __lsx_vreplgr2vr_b(thresh_ptr);
281  thresh1 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
282  thresh0 = __lsx_vilvl_d(thresh1, thresh0);
283 
284  b_limit0 = __lsx_vreplgr2vr_b(b_limit_ptr);
285  b_limit1 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
286  b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
287 
288  limit0 = __lsx_vreplgr2vr_b(limit_ptr);
289  limit1 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
290  limit0 = __lsx_vilvl_d(limit1, limit0);
291 
292  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
293  hev, mask, flat);
294  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
295 
296  __lsx_vst(p1, dst - stride2, 0);
297  __lsx_vst(p0, dst - stride, 0);
298  __lsx_vst(q0, dst , 0);
299  __lsx_vst(q1, dst + stride, 0);
300 }
301 
302 void ff_loop_filter_v_8_8_lsx(uint8_t *dst, ptrdiff_t stride,
303  int32_t b_limit_ptr,
304  int32_t limit_ptr,
305  int32_t thresh_ptr)
306 {
307  ptrdiff_t stride2 = stride << 1;
308  ptrdiff_t stride3 = stride2 + stride;
309  ptrdiff_t stride4 = stride2 << 1;
310  __m128i mask, hev, flat, thresh, b_limit, limit;
311  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
312  __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
313  __m128i p2_filter8, p1_filter8, p0_filter8;
314  __m128i q0_filter8, q1_filter8, q2_filter8;
315  __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
316  __m128i zero = __lsx_vldi(0);
317 
318  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
319  dst, -stride, p3, p2, p1, p0);
320  q0 = __lsx_vld(dst, 0);
321  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
322  q3 = __lsx_vldx(dst, stride3);
323 
324  thresh = __lsx_vreplgr2vr_b(thresh_ptr);
325  b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
326  limit = __lsx_vreplgr2vr_b(limit_ptr);
327 
328  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
329  hev, mask, flat);
330  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
331  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
332  q1_out);
333 
334  flat = __lsx_vilvl_d(zero, flat);
335 
336  /* if flat is zero for all pixels, then no need to calculate other filter */
337  if (__lsx_bz_v(flat)) {
338  __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
339  __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
340  __lsx_vstelm_d(q0_out, dst , 0, 0);
341  __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
342  } else {
343  DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
344  p3_l, p2_l, p1_l, p0_l);
345  DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
346  q0_l, q1_l, q2_l, q3_l);
347  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8,
348  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
349 
350  /* convert 16 bit output data into 8 bit */
351  DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8,
352  zero, p0_filter8, zero, q0_filter8, p2_filter8,
353  p1_filter8, p0_filter8, q0_filter8);
354  DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8,
355  q1_filter8, q2_filter8);
356 
357  /* store pixel values */
358  p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat);
359  p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat);
360  p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat);
361  q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat);
362  q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat);
363  q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat);
364 
365  __lsx_vstelm_d(p2_out, dst - stride3, 0, 0);
366  __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
367  __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
368  __lsx_vstelm_d(q0_out, dst, 0, 0);
369  __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
370  __lsx_vstelm_d(q2_out, dst + stride2, 0, 0);
371  }
372 }
373 
374 void ff_loop_filter_v_88_16_lsx(uint8_t *dst, ptrdiff_t stride,
375  int32_t b_limit_ptr,
376  int32_t limit_ptr,
377  int32_t thresh_ptr)
378 {
379  ptrdiff_t stride2 = stride << 1;
380  ptrdiff_t stride3 = stride2 + stride;
381  ptrdiff_t stride4 = stride2 << 1;
382  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
383  __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
384  __m128i flat, mask, hev, tmp, thresh, b_limit, limit;
385  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
386  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
387  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
388  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
389  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
390  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
391  __m128i zero = __lsx_vldi(0);
392 
393  /* load vector elements */
394  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
395  dst, -stride, p3, p2, p1, p0);
396  q0 = __lsx_vld(dst, 0);
397  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
398  q3 = __lsx_vldx(dst, stride3);
399 
400  thresh = __lsx_vreplgr2vr_b(thresh_ptr);
401  tmp = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
402  thresh = __lsx_vilvl_d(tmp, thresh);
403 
404  b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
405  tmp = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
406  b_limit = __lsx_vilvl_d(tmp, b_limit);
407 
408  limit = __lsx_vreplgr2vr_b(limit_ptr);
409  tmp = __lsx_vreplgr2vr_b(limit_ptr >> 8);
410  limit = __lsx_vilvl_d(tmp, limit);
411 
412  /* mask and hev */
413  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
414  hev, mask, flat);
415  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
416  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
417  q1_out);
418 
419  /* if flat is zero for all pixels, then no need to calculate other filter */
420  if (__lsx_bz_v(flat)) {
421  __lsx_vst(p1_out, dst - stride2, 0);
422  __lsx_vst(p0_out, dst - stride, 0);
423  __lsx_vst(q0_out, dst, 0);
424  __lsx_vst(q1_out, dst + stride, 0);
425  } else {
426  DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
427  p3_l, p2_l, p1_l, p0_l);
428  DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
429  q0_l, q1_l, q2_l, q3_l);
430  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
431  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
432 
433  DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
434  p3_h, p2_h, p1_h, p0_h);
435  DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
436  q0_h, q1_h, q2_h, q3_h);
437  VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
438  p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
439 
440  /* convert 16 bit output data into 8 bit */
441  DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
442  p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l,
443  p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
444  DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
445  q2_filt8_l, q1_filt8_l, q2_filt8_l);
446 
447  /* store pixel values */
448  p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
449  p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
450  p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
451  q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
452  q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
453  q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
454 
455 
456  __lsx_vstx(p2_out, dst, -stride3);
457  __lsx_vstx(p1_out, dst, -stride2);
458  __lsx_vstx(p0_out, dst, -stride);
459  __lsx_vst(q0_out, dst, 0);
460  __lsx_vstx(q1_out, dst, stride);
461  __lsx_vstx(q2_out, dst, stride2);
462  }
463 }
464 
465 void ff_loop_filter_v_84_16_lsx(uint8_t *dst, ptrdiff_t stride,
466  int32_t b_limit_ptr,
467  int32_t limit_ptr,
468  int32_t thresh_ptr)
469 {
470  ptrdiff_t stride2 = stride << 1;
471  ptrdiff_t stride3 = stride2 + stride;
472  ptrdiff_t stride4 = stride2 << 1;
473  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
474  __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
475  __m128i flat, mask, hev, tmp, thresh, b_limit, limit;
476  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
477  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
478  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
479  __m128i zero = __lsx_vldi(0);
480 
481  /* load vector elements */
482  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
483  dst, -stride, p3, p2, p1, p0);
484  q0 = __lsx_vld(dst, 0);
485  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
486  q3 = __lsx_vldx(dst, stride3);
487 
488  thresh = __lsx_vreplgr2vr_b(thresh_ptr);
489  tmp = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
490  thresh = __lsx_vilvl_d(tmp, thresh);
491 
492  b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
493  tmp = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
494  b_limit = __lsx_vilvl_d(tmp, b_limit);
495 
496  limit = __lsx_vreplgr2vr_b(limit_ptr);
497  tmp = __lsx_vreplgr2vr_b(limit_ptr >> 8);
498  limit = __lsx_vilvl_d(tmp, limit);
499 
500  /* mask and hev */
501  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
502  hev, mask, flat);
503  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
504  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
505  q1_out);
506 
507  flat = __lsx_vilvl_d(zero, flat);
508 
509  /* if flat is zero for all pixels, then no need to calculate other filter */
510  if (__lsx_bz_v(flat)) {
511  __lsx_vstx(p1_out, dst, -stride2);
512  __lsx_vstx(p0_out, dst, -stride);
513  __lsx_vst(q0_out, dst, 0);
514  __lsx_vstx(q1_out, dst, stride);
515  } else {
516  DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
517  p3_l, p2_l, p1_l, p0_l);
518  DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
519  q0_l, q1_l, q2_l, q3_l);
520  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
521  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
522 
523  /* convert 16 bit output data into 8 bit */
524  DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l,
525  p1_filt8_l, p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
526  p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
527  DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l,
528  q2_filt8_l, q1_filt8_l, q2_filt8_l);
529 
530  /* store pixel values */
531  p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
532  p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
533  p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
534  q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
535  q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
536  q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
537 
538  __lsx_vstx(p2_out, dst, -stride3);
539  __lsx_vstx(p1_out, dst, -stride2);
540  __lsx_vstx(p0_out, dst, -stride);
541  __lsx_vst(q0_out, dst, 0);
542  __lsx_vstx(q1_out, dst, stride);
543  __lsx_vstx(q2_out, dst, stride2);
544  }
545 }
546 
547 void ff_loop_filter_v_48_16_lsx(uint8_t *dst, ptrdiff_t stride,
548  int32_t b_limit_ptr,
549  int32_t limit_ptr,
550  int32_t thresh_ptr)
551 {
552  ptrdiff_t stride2 = stride << 1;
553  ptrdiff_t stride3 = stride2 + stride;
554  ptrdiff_t stride4 = stride2 << 1;
555  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
556  __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
557  __m128i flat, mask, hev, tmp, thresh, b_limit, limit;
558  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
559  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
560  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
561  __m128i zero = { 0 };
562 
563  /* load vector elements */
564  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
565  dst, -stride, p3, p2, p1, p0);
566  q0 = __lsx_vld(dst, 0);
567  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
568  q3 = __lsx_vldx(dst, stride3);
569 
570  thresh = __lsx_vreplgr2vr_b(thresh_ptr);
571  tmp = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
572  thresh = __lsx_vilvl_d(tmp, thresh);
573 
574  b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
575  tmp = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
576  b_limit = __lsx_vilvl_d(tmp, b_limit);
577 
578  limit = __lsx_vreplgr2vr_b(limit_ptr);
579  tmp = __lsx_vreplgr2vr_b(limit_ptr >> 8);
580  limit = __lsx_vilvl_d(tmp, limit);
581 
582  /* mask and hev */
583  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
584  hev, mask, flat);
585  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
586  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
587  q1_out);
588 
589  flat = __lsx_vilvh_d(flat, zero);
590 
591  /* if flat is zero for all pixels, then no need to calculate other filter */
592  if (__lsx_bz_v(flat)) {
593  __lsx_vstx(p1_out, dst, -stride2);
594  __lsx_vstx(p0_out, dst, -stride);
595  __lsx_vst(q0_out, dst, 0);
596  __lsx_vstx(q1_out, dst, stride);
597  } else {
598  DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
599  p3_h, p2_h, p1_h, p0_h);
600  DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
601  q0_h, q1_h, q2_h, q3_h);
602  VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
603  p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
604 
605  /* convert 16 bit output data into 8 bit */
606  DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_h, p1_filt8_h,
607  p1_filt8_h, p0_filt8_h, p0_filt8_h, q0_filt8_h, q0_filt8_h,
608  p2_filt8_h, p1_filt8_h, p0_filt8_h, q0_filt8_h);
609  DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_h, q2_filt8_h,
610  q2_filt8_h, q1_filt8_h, q2_filt8_h);
611 
612  /* store pixel values */
613  p2_out = __lsx_vbitsel_v(p2, p2_filt8_h, flat);
614  p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_h, flat);
615  p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_h, flat);
616  q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_h, flat);
617  q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_h, flat);
618  q2_out = __lsx_vbitsel_v(q2, q2_filt8_h, flat);
619 
620  __lsx_vstx(p2_out, dst, -stride3);
621  __lsx_vstx(p1_out, dst, -stride2);
622  __lsx_vstx(p0_out, dst, -stride);
623  __lsx_vst(q0_out, dst, 0);
624  __lsx_vstx(q1_out, dst, stride);
625  __lsx_vstx(q2_out, dst, stride2);
626  }
627 }
628 
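vp9_hz_lpf_t4_and_t8_16w runs the mask/filter4/filter8 stages for a 16-pixel horizontal edge. It returns 1 when flat is zero everywhere (the filter4 results are already stored and nothing more is needed); otherwise it writes the six blended filter8 rows plus the flat mask into the filter48 scratch buffer (seven 16-byte slots) and returns 0 so the caller can run the wide-filter pass.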
629 static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *dst, ptrdiff_t stride,
630  uint8_t *filter48,
631  int32_t b_limit_ptr,
632  int32_t limit_ptr,
633  int32_t thresh_ptr)
634 {
635  ptrdiff_t stride2 = stride << 1;
636  ptrdiff_t stride3 = stride2 + stride;
637  ptrdiff_t stride4 = stride2 << 1;
638  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
639  __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
640  __m128i flat, mask, hev, thresh, b_limit, limit;
641  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
642  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
643  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
644  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
645  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
646  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
647  __m128i zero = __lsx_vldi(0);
648 
649  /* load vector elements */
650  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
651  dst, -stride, p3, p2, p1, p0);
652  q0 = __lsx_vld(dst, 0);
653  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
654  q3 = __lsx_vldx(dst, stride3);
655 
656  thresh = __lsx_vreplgr2vr_b(thresh_ptr);
657  b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
658  limit = __lsx_vreplgr2vr_b(limit_ptr);
659 
660  /* mask and hev */
661  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
662  hev, mask, flat);
663  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
664  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
665  q1_out);
666 
667  /* if flat is zero for all pixels, then no need to calculate other filter */
668  if (__lsx_bz_v(flat)) {
669  __lsx_vstx(p1_out, dst, -stride2);
670  __lsx_vstx(p0_out, dst, -stride);
671  __lsx_vst(q0_out, dst, 0);
672  __lsx_vstx(q1_out, dst, stride);
673  return 1;
674  } else {
675  DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
676  p3_l, p2_l, p1_l, p0_l);
677  DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
678  q0_l, q1_l, q2_l, q3_l);
679  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
680  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
681 
682  DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
683  p3_h, p2_h, p1_h, p0_h);
684  DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
685  q0_h, q1_h, q2_h, q3_h);
686  VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
687  p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
688 
689  /* convert 16 bit output data into 8 bit */
690  DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
691  p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l,
692  p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
693  DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
694  q2_filt8_l, q1_filt8_l, q2_filt8_l);
695 
696  /* store pixel values */
697  p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
698  p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
699  p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
700  q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
701  q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
702  q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
703 
704  __lsx_vst(p2_out, filter48, 0);
705  __lsx_vst(p1_out, filter48, 16);
706  __lsx_vst(p0_out, filter48, 32);
707  __lsx_vst(q0_out, filter48, 48);
708  __lsx_vst(q1_out, filter48, 64);
709  __lsx_vst(q2_out, filter48, 80);
710  __lsx_vst(flat, filter48, 96);
711 
712  return 0;
713  }
714 }
715 
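vp9_hz_lpf_t16_16w is the second pass of the 16-wide filter: it computes flat2 from p7..q7 and, where it is set, produces the 15-tap outputs with a running 16-bit sum (add the incoming pixel, subtract the outgoing one, round and shift right by 4), falling back to the filter48 rows or the unfiltered pixels elsewhere.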
716 static void vp9_hz_lpf_t16_16w(uint8_t *dst, ptrdiff_t stride,
717  uint8_t *filter48)
718 {
719  ptrdiff_t stride2 = stride << 1;
720  ptrdiff_t stride3 = stride2 + stride;
721  ptrdiff_t stride4 = stride2 << 1;
722  uint8_t *dst_tmp = dst - stride4;
723  uint8_t *dst_tmp1 = dst + stride4;
724  __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
725  __m128i flat, flat2, filter8;
726  __m128i zero = __lsx_vldi(0);
727  __m128i out_h, out_l;
728  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
729  v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
730  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
731  v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
732  v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
733  v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
734  v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
735  v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
736  v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
737 
738  flat = __lsx_vld(filter48, 96);
739 
740  DUP4_ARG2(__lsx_vldx, dst_tmp, -stride4, dst_tmp, -stride3, dst_tmp,
741  -stride2, dst_tmp, -stride, p7, p6, p5, p4);
742  p3 = __lsx_vld(dst_tmp, 0);
743  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1);
744  p0 = __lsx_vldx(dst_tmp, stride3);
745 
746  q0 = __lsx_vld(dst, 0);
747  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
748  q3 = __lsx_vldx(dst, stride3);
749 
750  q4 = __lsx_vld(dst_tmp1, 0);
751  DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6);
752  q7 = __lsx_vldx(dst_tmp1, stride3);
753  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
754 
755  /* if flat2 is zero for all pixels, then no need to calculate other filter */
756  if (__lsx_bz_v(flat2)) {
757  DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48,
758  48, p2, p1, p0, q0);
759  DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
760 
761  __lsx_vstx(p2, dst, -stride3);
762  __lsx_vstx(p1, dst, -stride2);
763  __lsx_vstx(p0, dst, -stride);
764  __lsx_vst(q0, dst, 0);
765  __lsx_vstx(q1, dst, stride);
766  __lsx_vstx(q2, dst, stride2);
767  } else {
768  dst = dst_tmp - stride3;
769 
770  p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7);
771  p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6);
772  p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5);
773  p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4);
774  p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3);
775  p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2);
776  p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1);
777  p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0);
778 
779  q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0);
780 
781  tmp0_l = p7_l_in << 3;
782  tmp0_l -= p7_l_in;
783  tmp0_l += p6_l_in;
784  tmp0_l += q0_l_in;
785  tmp1_l = p6_l_in + p5_l_in;
786  tmp1_l += p4_l_in;
787  tmp1_l += p3_l_in;
788  tmp1_l += p2_l_in;
789  tmp1_l += p1_l_in;
790  tmp1_l += p0_l_in;
791  tmp1_l += tmp0_l;
792 
793  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
794 
795  p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7);
796  p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6);
797  p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5);
798  p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4);
799 
800  p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3);
801  p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2);
802  p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1);
803  p0_h_in = (v8u16)__lsx_vilvh_b(zero, p0);
804  q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0);
805 
806  tmp0_h = p7_h_in << 3;
807  tmp0_h -= p7_h_in;
808  tmp0_h += p6_h_in;
809  tmp0_h += q0_h_in;
810  tmp1_h = p6_h_in + p5_h_in;
811  tmp1_h += p4_h_in;
812  tmp1_h += p3_h_in;
813  tmp1_h += p2_h_in;
814  tmp1_h += p1_h_in;
815  tmp1_h += p0_h_in;
816  tmp1_h += tmp0_h;
817 
818  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
819 
820  out_l = __lsx_vpickev_b(out_h, out_l);
821  p6 = __lsx_vbitsel_v(p6, out_l, flat2);
822  __lsx_vst(p6, dst, 0);
823  dst += stride;
824 
825  /* p5 */
826  q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1);
827  tmp0_l = p5_l_in - p6_l_in;
828  tmp0_l += q1_l_in;
829  tmp0_l -= p7_l_in;
830  tmp1_l += tmp0_l;
831  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
832 
833  q1_h_in = (v8u16)__lsx_vilvh_b(zero, q1);
834  tmp0_h = p5_h_in - p6_h_in;
835  tmp0_h += q1_h_in;
836  tmp0_h -= p7_h_in;
837  tmp1_h += tmp0_h;
838  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
839 
840  out_l = __lsx_vpickev_b(out_h, out_l);
841  p5 = __lsx_vbitsel_v(p5, out_l, flat2);
842  __lsx_vst(p5, dst, 0);
843  dst += stride;
844 
845  /* p4 */
846  q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2);
847  tmp0_l = p4_l_in - p5_l_in;
848  tmp0_l += q2_l_in;
849  tmp0_l -= p7_l_in;
850  tmp1_l += tmp0_l;
851  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
852 
853  q2_h_in = (v8u16)__lsx_vilvh_b(zero, q2);
854  tmp0_h = p4_h_in - p5_h_in;
855  tmp0_h += q2_h_in;
856  tmp0_h -= p7_h_in;
857  tmp1_h += tmp0_h;
858  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
859 
860  out_l = __lsx_vpickev_b(out_h, out_l);
861  p4 = __lsx_vbitsel_v(p4, out_l, flat2);
862  __lsx_vst(p4, dst, 0);
863  dst += stride;
864 
865  /* p3 */
866  q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3);
867  tmp0_l = p3_l_in - p4_l_in;
868  tmp0_l += q3_l_in;
869  tmp0_l -= p7_l_in;
870  tmp1_l += tmp0_l;
871  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
872 
873  q3_h_in = (v8u16)__lsx_vilvh_b(zero, q3);
874  tmp0_h = p3_h_in - p4_h_in;
875  tmp0_h += q3_h_in;
876  tmp0_h -= p7_h_in;
877  tmp1_h += tmp0_h;
878  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
879 
880  out_l = __lsx_vpickev_b(out_h, out_l);
881  p3 = __lsx_vbitsel_v(p3, out_l, flat2);
882  __lsx_vst(p3, dst, 0);
883  dst += stride;
884 
885  /* p2 */
886  q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4);
887  filter8 = __lsx_vld(filter48, 0);
888  tmp0_l = p2_l_in - p3_l_in;
889  tmp0_l += q4_l_in;
890  tmp0_l -= p7_l_in;
891  tmp1_l += tmp0_l;
892  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
893 
894  q4_h_in = (v8u16)__lsx_vilvh_b(zero, q4);
895  tmp0_h = p2_h_in - p3_h_in;
896  tmp0_h += q4_h_in;
897  tmp0_h -= p7_h_in;
898  tmp1_h += tmp0_h;
899  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
900 
901  out_l = __lsx_vpickev_b(out_h, out_l);
902  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
903  __lsx_vst(filter8, dst, 0);
904  dst += stride;
905 
906  /* p1 */
907  q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5);
908  filter8 = __lsx_vld(filter48, 16);
909  tmp0_l = p1_l_in - p2_l_in;
910  tmp0_l += q5_l_in;
911  tmp0_l -= p7_l_in;
912  tmp1_l += tmp0_l;
913  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
914 
915  q5_h_in = (v8u16)__lsx_vilvh_b(zero, q5);
916  tmp0_h = p1_h_in - p2_h_in;
917  tmp0_h += q5_h_in;
918  tmp0_h -= p7_h_in;
919  tmp1_h += tmp0_h;
920  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
921 
922  out_l = __lsx_vpickev_b(out_h, out_l);
923  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
924  __lsx_vst(filter8, dst, 0);
925  dst += stride;
926 
927  /* p0 */
928  q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6);
929  filter8 = __lsx_vld(filter48, 32);
930  tmp0_l = p0_l_in - p1_l_in;
931  tmp0_l += q6_l_in;
932  tmp0_l -= p7_l_in;
933  tmp1_l += tmp0_l;
934  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
935 
936  q6_h_in = (v8u16)__lsx_vilvh_b(zero, q6);
937  tmp0_h = p0_h_in - p1_h_in;
938  tmp0_h += q6_h_in;
939  tmp0_h -= p7_h_in;
940  tmp1_h += tmp0_h;
941  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
942 
943  out_l = __lsx_vpickev_b(out_h, out_l);
944  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
945  __lsx_vst(filter8, dst, 0);
946  dst += stride;
947 
948  /* q0 */
949  q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7);
950  filter8 = __lsx_vld(filter48, 48);
951  tmp0_l = q7_l_in - p0_l_in;
952  tmp0_l += q0_l_in;
953  tmp0_l -= p7_l_in;
954  tmp1_l += tmp0_l;
955  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
956 
957  q7_h_in = (v8u16)__lsx_vilvh_b(zero, q7);
958  tmp0_h = q7_h_in - p0_h_in;
959  tmp0_h += q0_h_in;
960  tmp0_h -= p7_h_in;
961  tmp1_h += tmp0_h;
962  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
963 
964  out_l = __lsx_vpickev_b(out_h, out_l);
965  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
966  __lsx_vst(filter8, dst, 0);
967  dst += stride;
968 
969  /* q1 */
970  filter8 = __lsx_vld(filter48, 64);
971  tmp0_l = q7_l_in - q0_l_in;
972  tmp0_l += q1_l_in;
973  tmp0_l -= p6_l_in;
974  tmp1_l += tmp0_l;
975  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
976 
977  tmp0_h = q7_h_in - q0_h_in;
978  tmp0_h += q1_h_in;
979  tmp0_h -= p6_h_in;
980  tmp1_h += tmp0_h;
981  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
982 
983  out_l = __lsx_vpickev_b(out_h, out_l);
984  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
985  __lsx_vst(filter8, dst, 0);
986  dst += stride;
987 
988  /* q2 */
989  filter8 = __lsx_vld(filter48, 80);
990  tmp0_l = q7_l_in - q1_l_in;
991  tmp0_l += q2_l_in;
992  tmp0_l -= p5_l_in;
993  tmp1_l += tmp0_l;
994  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
995 
996  tmp0_h = q7_h_in - q1_h_in;
997  tmp0_h += q2_h_in;
998  tmp0_h -= p5_h_in;
999  tmp1_h += tmp0_h;
1000  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1001 
1002  out_l = __lsx_vpickev_b(out_h, out_l);
1003  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
1004  __lsx_vst(filter8, dst, 0);
1005  dst += stride;
1006 
1007  /* q3 */
1008  tmp0_l = q7_l_in - q2_l_in;
1009  tmp0_l += q3_l_in;
1010  tmp0_l -= p4_l_in;
1011  tmp1_l += tmp0_l;
1012  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1013 
1014  tmp0_h = q7_h_in - q2_h_in;
1015  tmp0_h += q3_h_in;
1016  tmp0_h -= p4_h_in;
1017  tmp1_h += tmp0_h;
1018  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1019 
1020  out_l = __lsx_vpickev_b(out_h, out_l);
1021  q3 = __lsx_vbitsel_v(q3, out_l, flat2);
1022  __lsx_vst(q3, dst, 0);
1023  dst += stride;
1024 
1025  /* q4 */
1026  tmp0_l = q7_l_in - q3_l_in;
1027  tmp0_l += q4_l_in;
1028  tmp0_l -= p3_l_in;
1029  tmp1_l += tmp0_l;
1030  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1031 
1032  tmp0_h = q7_h_in - q3_h_in;
1033  tmp0_h += q4_h_in;
1034  tmp0_h -= p3_h_in;
1035  tmp1_h += tmp0_h;
1036  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1037 
1038  out_l = __lsx_vpickev_b(out_h, out_l);
1039  q4 = __lsx_vbitsel_v(q4, out_l, flat2);
1040  __lsx_vst(q4, dst, 0);
1041  dst += stride;
1042 
1043  /* q5 */
1044  tmp0_l = q7_l_in - q4_l_in;
1045  tmp0_l += q5_l_in;
1046  tmp0_l -= p2_l_in;
1047  tmp1_l += tmp0_l;
1048  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1049 
1050  tmp0_h = q7_h_in - q4_h_in;
1051  tmp0_h += q5_h_in;
1052  tmp0_h -= p2_h_in;
1053  tmp1_h += tmp0_h;
1054  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1055 
1056  out_l = __lsx_vpickev_b(out_h, out_l);
1057  q5 = __lsx_vbitsel_v(q5, out_l, flat2);
1058  __lsx_vst(q5, dst, 0);
1059  dst += stride;
1060 
1061  /* q6 */
1062  tmp0_l = q7_l_in - q5_l_in;
1063  tmp0_l += q6_l_in;
1064  tmp0_l -= p1_l_in;
1065  tmp1_l += tmp0_l;
1066  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
1067 
1068  tmp0_h = q7_h_in - q5_h_in;
1069  tmp0_h += q6_h_in;
1070  tmp0_h -= p1_h_in;
1071  tmp1_h += tmp0_h;
1072  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
1073 
1074  out_l = __lsx_vpickev_b(out_h, out_l);
1075  q6 = __lsx_vbitsel_v(q6, out_l, flat2);
1076  __lsx_vst(q6, dst, 0);
1077  }
1078 }
1079 
1080 void ff_loop_filter_v_16_16_lsx(uint8_t *dst, ptrdiff_t stride,
1081  int32_t b_limit_ptr,
1082  int32_t limit_ptr,
1083  int32_t thresh_ptr)
1084 {
1085  uint8_t filter48[16 * 8] __attribute__ ((aligned(16)));
1086  uint8_t early_exit = 0;
1087 
1088  early_exit = vp9_hz_lpf_t4_and_t8_16w(dst, stride, &filter48[0],
1089  b_limit_ptr, limit_ptr, thresh_ptr);
1090 
1091  if (0 == early_exit) {
1092  vp9_hz_lpf_t16_16w(dst, stride, filter48);
1093  }
1094 }
1095 
1096 void ff_loop_filter_v_16_8_lsx(uint8_t *dst, ptrdiff_t stride,
1097  int32_t b_limit_ptr,
1098  int32_t limit_ptr,
1099  int32_t thresh_ptr)
1100 {
1101  ptrdiff_t stride2 = stride << 1;
1102  ptrdiff_t stride3 = stride2 + stride;
1103  ptrdiff_t stride4 = stride2 << 1;
1104  uint8_t *dst_tmp = dst - stride4;
1105  uint8_t *dst_tmp1 = dst + stride4;
1106  __m128i zero = __lsx_vldi(0);
1107  __m128i flat2, mask, hev, flat, thresh, b_limit, limit;
1108  __m128i p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
1109  __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1110  __m128i p0_filter16, p1_filter16;
1111  __m128i p2_filter8, p1_filter8, p0_filter8;
1112  __m128i q0_filter8, q1_filter8, q2_filter8;
1113  __m128i p7_l, p6_l, p5_l, p4_l, q7_l, q6_l, q5_l, q4_l;
1114  __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
1115  __m128i tmp0, tmp1, tmp2;
1116 
1117  /* load vector elements */
1118  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2,
1119  dst, -stride, p3, p2, p1, p0);
1120  q0 = __lsx_vld(dst, 0);
1121  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
1122  q3 = __lsx_vldx(dst, stride3);
1123 
1124  thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1125  b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1126  limit = __lsx_vreplgr2vr_b(limit_ptr);
1127 
1128  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1129  hev, mask, flat);
1130  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1131  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1132  q1_out);
1133 
1134  flat = __lsx_vilvl_d(zero, flat);
1135 
1136  /* if flat is zero for all pixels, then no need to calculate other filter */
1137  if (__lsx_bz_v(flat)) {
1138  __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
1139  __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
1140  __lsx_vstelm_d(q0_out, dst , 0, 0);
1141  __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
1142  } else {
1143  /* convert 8 bit input data into 16 bit */
1144  DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
1145  p3_l, p2_l, p1_l, p0_l);
1146  DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
1147  q0_l, q1_l, q2_l, q3_l);
1148  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l,
1149  p2_filter8, p1_filter8, p0_filter8, q0_filter8,
1150  q1_filter8, q2_filter8);
1151 
1152  /* convert 16 bit output data into 8 bit */
1153  DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8,
1154  zero, p0_filter8, zero, q0_filter8, p2_filter8,
1155  p1_filter8, p0_filter8, q0_filter8);
1156  DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8,
1157  q1_filter8, q2_filter8);
1158 
1159  /* store pixel values */
1160  p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat);
1161  p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat);
1162  p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat);
1163  q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat);
1164  q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat);
1165  q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat);
1166 
1167  /* load 16 vector elements */
1168  DUP4_ARG2(__lsx_vld, dst_tmp - stride4, 0, dst_tmp - stride3, 0,
1169  dst_tmp - stride2, 0, dst_tmp - stride, 0, p7, p6, p5, p4);
1170  DUP4_ARG2(__lsx_vld, dst_tmp1, 0, dst_tmp1 + stride, 0,
1171  dst_tmp1 + stride2, 0, dst_tmp1 + stride3, 0, q4, q5, q6, q7);
1172 
1173  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1174 
1175  /* if flat2 is zero for all pixels, then no need to calculate other filter */
1176  if (__lsx_bz_v(flat2)) {
1177  dst -= stride3;
1178  __lsx_vstelm_d(p2_out, dst, 0, 0);
1179  dst += stride;
1180  __lsx_vstelm_d(p1_out, dst, 0, 0);
1181  dst += stride;
1182  __lsx_vstelm_d(p0_out, dst, 0, 0);
1183  dst += stride;
1184  __lsx_vstelm_d(q0_out, dst, 0, 0);
1185  dst += stride;
1186  __lsx_vstelm_d(q1_out, dst, 0, 0);
1187  dst += stride;
1188  __lsx_vstelm_d(q2_out, dst, 0, 0);
1189  } else {
1190  /* LSB(right) 8 pixel operation */
1191  DUP4_ARG2(__lsx_vilvl_b, zero, p7, zero, p6, zero, p5, zero, p4,
1192  p7_l, p6_l, p5_l, p4_l);
1193  DUP4_ARG2(__lsx_vilvl_b, zero, q4, zero, q5, zero, q6, zero, q7,
1194  q4_l, q5_l, q6_l, q7_l);
1195 
1196  tmp0 = __lsx_vslli_h(p7_l, 3);
1197  tmp0 = __lsx_vsub_h(tmp0, p7_l);
1198  tmp0 = __lsx_vadd_h(tmp0, p6_l);
1199  tmp0 = __lsx_vadd_h(tmp0, q0_l);
1200 
1201  dst = dst_tmp - stride3;
1202 
1203  /* calculation of p6 and p5 */
1204  tmp1 = __lsx_vadd_h(p6_l, p5_l);
1205  tmp1 = __lsx_vadd_h(tmp1, p4_l);
1206  tmp1 = __lsx_vadd_h(tmp1, p3_l);
1207  tmp1 = __lsx_vadd_h(tmp1, p2_l);
1208  tmp1 = __lsx_vadd_h(tmp1, p1_l);
1209  tmp1 = __lsx_vadd_h(tmp1, p0_l);
1210  tmp1 = __lsx_vadd_h(tmp1, tmp0);
1211 
1212  p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1213  tmp0 = __lsx_vsub_h(p5_l, p6_l);
1214  tmp0 = __lsx_vadd_h(tmp0, q1_l);
1215  tmp0 = __lsx_vsub_h(tmp0, p7_l);
1216  tmp1 = __lsx_vadd_h(tmp1, tmp0);
1217 
1218  p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1219  DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1220  p1_filter16, p0_filter16, p1_filter16);
1221  p0_filter16 = __lsx_vbitsel_v(p6, p0_filter16, flat2);
1222  p1_filter16 = __lsx_vbitsel_v(p5, p1_filter16, flat2);
1223  __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1224  dst += stride;
1225  __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1226  dst += stride;
1227 
1228  /* calculation of p4 and p3 */
1229  tmp0 = __lsx_vsub_h(p4_l, p5_l);
1230  tmp0 = __lsx_vadd_h(tmp0, q2_l);
1231  tmp0 = __lsx_vsub_h(tmp0, p7_l);
1232  tmp2 = __lsx_vsub_h(p3_l, p4_l);
1233  tmp2 = __lsx_vadd_h(tmp2, q3_l);
1234  tmp2 = __lsx_vsub_h(tmp2, p7_l);
1235  tmp1 = __lsx_vadd_h(tmp1, tmp0);
1236  p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1237  tmp1 = __lsx_vadd_h(tmp1, tmp2);
1238  p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1239  DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1240  p1_filter16, p0_filter16, p1_filter16);
1241  p0_filter16 = __lsx_vbitsel_v(p4, p0_filter16, flat2);
1242  p1_filter16 = __lsx_vbitsel_v(p3, p1_filter16, flat2);
1243  __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1244  dst += stride;
1245  __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1246  dst += stride;
1247 
1248  /* calculation of p2 and p1 */
1249  tmp0 = __lsx_vsub_h(p2_l, p3_l);
1250  tmp0 = __lsx_vadd_h(tmp0, q4_l);
1251  tmp0 = __lsx_vsub_h(tmp0, p7_l);
1252  tmp2 = __lsx_vsub_h(p1_l, p2_l);
1253  tmp2 = __lsx_vadd_h(tmp2, q5_l);
1254  tmp2 = __lsx_vsub_h(tmp2, p7_l);
1255  tmp1 = __lsx_vadd_h(tmp1, tmp0);
1256  p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1257  tmp1 = __lsx_vadd_h(tmp1, tmp2);
1258  p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1259  DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1260  p1_filter16, p0_filter16, p1_filter16);
1261  p0_filter16 = __lsx_vbitsel_v(p2_out, p0_filter16, flat2);
1262  p1_filter16 = __lsx_vbitsel_v(p1_out, p1_filter16, flat2);
1263  __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1264  dst += stride;
1265  __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1266  dst += stride;
1267 
1268  /* calculation of p0 and q0 */
1269  tmp0 = __lsx_vsub_h(p0_l, p1_l);
1270  tmp0 = __lsx_vadd_h(tmp0, q6_l);
1271  tmp0 = __lsx_vsub_h(tmp0, p7_l);
1272  tmp2 = __lsx_vsub_h(q7_l, p0_l);
1273  tmp2 = __lsx_vadd_h(tmp2, q0_l);
1274  tmp2 = __lsx_vsub_h(tmp2, p7_l);
1275  tmp1 = __lsx_vadd_h(tmp1, tmp0);
1276  p0_filter16 = __lsx_vsrari_h((__m128i)tmp1, 4);
1277  tmp1 = __lsx_vadd_h(tmp1, tmp2);
1278  p1_filter16 = __lsx_vsrari_h((__m128i)tmp1, 4);
1279  DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1280  p1_filter16, p0_filter16, p1_filter16);
1281  p0_filter16 = __lsx_vbitsel_v(p0_out, p0_filter16, flat2);
1282  p1_filter16 = __lsx_vbitsel_v(q0_out, p1_filter16, flat2);
1283  __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1284  dst += stride;
1285  __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1286  dst += stride;
1287 
1288  /* calculation of q1 and q2 */
1289  tmp0 = __lsx_vsub_h(q7_l, q0_l);
1290  tmp0 = __lsx_vadd_h(tmp0, q1_l);
1291  tmp0 = __lsx_vsub_h(tmp0, p6_l);
1292  tmp2 = __lsx_vsub_h(q7_l, q1_l);
1293  tmp2 = __lsx_vadd_h(tmp2, q2_l);
1294  tmp2 = __lsx_vsub_h(tmp2, p5_l);
1295  tmp1 = __lsx_vadd_h(tmp1, tmp0);
1296  p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1297  tmp1 = __lsx_vadd_h(tmp1, tmp2);
1298  p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1299  DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1300  p1_filter16, p0_filter16, p1_filter16);
1301  p0_filter16 = __lsx_vbitsel_v(q1_out, p0_filter16, flat2);
1302  p1_filter16 = __lsx_vbitsel_v(q2_out, p1_filter16, flat2);
1303  __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1304  dst += stride;
1305  __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1306  dst += stride;
1307 
1308  /* calculation of q3 and q4 */
1309  tmp0 = __lsx_vsub_h(q7_l, q2_l);
1310  tmp0 = __lsx_vadd_h(tmp0, q3_l);
1311  tmp0 = __lsx_vsub_h(tmp0, p4_l);
1312  tmp2 = __lsx_vsub_h(q7_l, q3_l);
1313  tmp2 = __lsx_vadd_h(tmp2, q4_l);
1314  tmp2 = __lsx_vsub_h(tmp2, p3_l);
1315  tmp1 = __lsx_vadd_h(tmp1, tmp0);
1316  p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1317  tmp1 = __lsx_vadd_h(tmp1, tmp2);
1318  p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1319  DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1320  p1_filter16, p0_filter16, p1_filter16);
1321  p0_filter16 = __lsx_vbitsel_v(q3, p0_filter16, flat2);
1322  p1_filter16 = __lsx_vbitsel_v(q4, p1_filter16, flat2);
1323  __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1324  dst += stride;
1325  __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1326  dst += stride;
1327 
1328  /* calculation of q5 and q6 */
1329  tmp0 = __lsx_vsub_h(q7_l, q4_l);
1330  tmp0 = __lsx_vadd_h(tmp0, q5_l);
1331  tmp0 = __lsx_vsub_h(tmp0, p2_l);
1332  tmp2 = __lsx_vsub_h(q7_l, q5_l);
1333  tmp2 = __lsx_vadd_h(tmp2, q6_l);
1334  tmp2 = __lsx_vsub_h(tmp2, p1_l);
1335  tmp1 = __lsx_vadd_h(tmp1, tmp0);
1336  p0_filter16 = __lsx_vsrari_h(tmp1, 4);
1337  tmp1 = __lsx_vadd_h(tmp1, tmp2);
1338  p1_filter16 = __lsx_vsrari_h(tmp1, 4);
1339  DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero,
1340  p1_filter16, p0_filter16, p1_filter16);
1341  p0_filter16 = __lsx_vbitsel_v(q5, p0_filter16, flat2);
1342  p1_filter16 = __lsx_vbitsel_v(q6, p1_filter16, flat2);
1343  __lsx_vstelm_d(p0_filter16, dst, 0, 0);
1344  dst += stride;
1345  __lsx_vstelm_d(p1_filter16, dst, 0, 0);
1346  }
1347  }
1348 }
1349 
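The _h_ variants below handle vertical edges: they gather the eight columns straddling the edge, transpose them to rows with LSX_TRANSPOSE8x8_B or LSX_TRANSPOSE16x8_B, reuse the same filter macros, then interleave the results back into column order and store them a few bytes per row with __lsx_vstelm.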
1350 void ff_loop_filter_h_4_8_lsx(uint8_t *dst, ptrdiff_t stride,
1351  int32_t b_limit_ptr,
1352  int32_t limit_ptr,
1353  int32_t thresh_ptr)
1354 {
1355  ptrdiff_t stride2 = stride << 1;
1356  ptrdiff_t stride3 = stride2 + stride;
1357  ptrdiff_t stride4 = stride2 << 1;
1358  uint8_t *dst_tmp1 = dst - 4;
1359  uint8_t *dst_tmp2 = dst_tmp1 + stride4;
1360  __m128i mask, hev, flat, limit, thresh, b_limit;
1361  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1362  __m128i vec0, vec1, vec2, vec3;
1363 
1364  p3 = __lsx_vld(dst_tmp1, 0);
1365  DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, p2, p1);
1366  p0 = __lsx_vldx(dst_tmp1, stride3);
1367  q0 = __lsx_vld(dst_tmp2, 0);
1368  DUP2_ARG2(__lsx_vldx, dst_tmp2, stride, dst_tmp2, stride2, q1, q2);
1369  q3 = __lsx_vldx(dst_tmp2, stride3);
1370 
1371  thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1372  b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1373  limit = __lsx_vreplgr2vr_b(limit_ptr);
1374 
1375  LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3,
1376  p3, p2, p1, p0, q0, q1, q2, q3);
1377  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1378  hev, mask, flat);
1379  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1380  DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, vec0, vec1);
1381  vec2 = __lsx_vilvl_h(vec1, vec0);
1382  vec3 = __lsx_vilvh_h(vec1, vec0);
1383 
1384  dst -= 2;
1385  __lsx_vstelm_w(vec2, dst, 0, 0);
1386  __lsx_vstelm_w(vec2, dst + stride, 0, 1);
1387  __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
1388  __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
1389  dst += stride4;
1390  __lsx_vstelm_w(vec3, dst, 0, 0);
1391  __lsx_vstelm_w(vec3, dst + stride, 0, 1);
1392  __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
1393  __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
1394 }
1395 
1396 void ff_loop_filter_h_44_16_lsx(uint8_t *dst, ptrdiff_t stride,
1397  int32_t b_limit_ptr,
1398  int32_t limit_ptr,
1399  int32_t thresh_ptr)
1400 {
1401  ptrdiff_t stride2 = stride << 1;
1402  ptrdiff_t stride3 = stride2 + stride;
1403  ptrdiff_t stride4 = stride2 << 1;
1404  uint8_t *dst_tmp = dst - 4;
1405  __m128i mask, hev, flat;
1406  __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
1407  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1408  __m128i row0, row1, row2, row3, row4, row5, row6, row7;
1409  __m128i row8, row9, row10, row11, row12, row13, row14, row15;
1410  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1411 
1412  row0 = __lsx_vld(dst_tmp, 0);
1413  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row1, row2);
1414  row3 = __lsx_vldx(dst_tmp, stride3);
1415  dst_tmp += stride4;
1416  row4 = __lsx_vld(dst_tmp, 0);
1417  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
1418  row7 = __lsx_vldx(dst_tmp, stride3);
1419  dst_tmp += stride4;
1420  row8 = __lsx_vld(dst_tmp, 0);
1421  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row9, row10);
1422  row11 = __lsx_vldx(dst_tmp, stride3);
1423  dst_tmp += stride4;
1424  row12 = __lsx_vld(dst_tmp, 0);
1425  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
1426  row15 = __lsx_vldx(dst_tmp, stride3);
1427 
1428  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7,
1429  row8, row9, row10, row11, row12, row13, row14, row15,
1430  p3, p2, p1, p0, q0, q1, q2, q3);
1431 
1432  thresh0 = __lsx_vreplgr2vr_b(thresh_ptr);
1433  thresh1 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
1434  thresh0 = __lsx_vilvl_d(thresh1, thresh0);
1435 
1436  b_limit0 = __lsx_vreplgr2vr_b(b_limit_ptr);
1437  b_limit1 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
1438  b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
1439 
1440  limit0 = __lsx_vreplgr2vr_b(limit_ptr);
1441  limit1 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
1442  limit0 = __lsx_vilvl_d(limit1, limit0);
1443 
1444  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
1445  hev, mask, flat);
1446  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1447  DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
1448  tmp2 = __lsx_vilvl_h(tmp1, tmp0);
1449  tmp3 = __lsx_vilvh_h(tmp1, tmp0);
1450  DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1);
1451  tmp4 = __lsx_vilvl_h(tmp1, tmp0);
1452  tmp5 = __lsx_vilvh_h(tmp1, tmp0);
1453 
1454  dst -= 2;
1455  __lsx_vstelm_w(tmp2, dst, 0, 0);
1456  __lsx_vstelm_w(tmp2, dst + stride, 0, 1);
1457  __lsx_vstelm_w(tmp2, dst + stride2, 0, 2);
1458  __lsx_vstelm_w(tmp2, dst + stride3, 0, 3);
1459  dst += stride4;
1460  __lsx_vstelm_w(tmp3, dst, 0, 0);
1461  __lsx_vstelm_w(tmp3, dst + stride, 0, 1);
1462  __lsx_vstelm_w(tmp3, dst + stride2, 0, 2);
1463  __lsx_vstelm_w(tmp3, dst + stride3, 0, 3);
1464  dst += stride4;
1465  __lsx_vstelm_w(tmp4, dst, 0, 0);
1466  __lsx_vstelm_w(tmp4, dst + stride, 0, 1);
1467  __lsx_vstelm_w(tmp4, dst + stride2, 0, 2);
1468  __lsx_vstelm_w(tmp4, dst + stride3, 0, 3);
1469  dst += stride4;
1470  __lsx_vstelm_w(tmp5, dst, 0, 0);
1471  __lsx_vstelm_w(tmp5, dst + stride, 0, 1);
1472  __lsx_vstelm_w(tmp5, dst + stride2, 0, 2);
1473  __lsx_vstelm_w(tmp5, dst + stride3, 0, 3);
1474 }
1475 
1476 void ff_loop_filter_h_8_8_lsx(uint8_t *dst, ptrdiff_t stride,
1477  int32_t b_limit_ptr,
1478  int32_t limit_ptr,
1479  int32_t thresh_ptr)
1480 {
1481  ptrdiff_t stride2 = stride << 1;
1482  ptrdiff_t stride3 = stride2 + stride;
1483  ptrdiff_t stride4 = stride2 << 1;
1484  uint8_t *dst_tmp = dst - 4;
1485  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1486  __m128i p1_out, p0_out, q0_out, q1_out;
1487  __m128i flat, mask, hev, thresh, b_limit, limit;
1488  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1489  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
1490  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
1491  __m128i vec0, vec1, vec2, vec3, vec4;
1492  __m128i zero = __lsx_vldi(0);
1493 
1494  /* load vector elements */
1495  p3 = __lsx_vld(dst_tmp, 0);
1496  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1);
1497  p0 = __lsx_vldx(dst_tmp, stride3);
1498  dst_tmp += stride4;
1499  q0 = __lsx_vld(dst_tmp, 0);
1500  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q1, q2);
1501  q3 = __lsx_vldx(dst_tmp, stride3);
1502 
1503  LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3,
1504  p3, p2, p1, p0, q0, q1, q2, q3);
1505 
1506  thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1507  b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1508  limit = __lsx_vreplgr2vr_b(limit_ptr);
1509 
1510  /* mask and hev */
1511  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1512  hev, mask, flat);
1513  /* flat4 */
1514  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1515  /* filter4 */
1516  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1517  q1_out);
1518 
1519  flat = __lsx_vilvl_d(zero, flat);
1520 
1521  /* if flat is zero for all pixels, then no need to calculate other filter */
1522  if (__lsx_bz_v(flat)) {
1523  /* Store 4 pixels p1 - q1 */
1524  DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1525  vec2 = __lsx_vilvl_h(vec1, vec0);
1526  vec3 = __lsx_vilvh_h(vec1, vec0);
1527 
1528  dst -= 2;
1529  __lsx_vstelm_w(vec2, dst, 0, 0);
1530  __lsx_vstelm_w(vec2, dst + stride, 0, 1);
1531  __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
1532  __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
1533  dst += stride4;
1534  __lsx_vstelm_w(vec3, dst, 0, 0);
1535  __lsx_vstelm_w(vec3, dst + stride, 0, 1);
1536  __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
1537  __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
1538  } else {
1539  DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
1540  p3_l, p2_l, p1_l, p0_l);
1541  DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
1542  q0_l, q1_l, q2_l, q3_l);
1543  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1544  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1545  /* convert 16 bit output data into 8 bit */
1546  DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l,
1547  p1_filt8_l, p0_filt8_l, p0_filt8_l, q0_filt8_l,
1548  q0_filt8_l, p2_filt8_l, p1_filt8_l, p0_filt8_l,
1549  q0_filt8_l);
1550  DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l,
1551  q2_filt8_l, q1_filt8_l, q2_filt8_l);
1552 
1553  /* merge the filter8 outputs with the filter4 outputs according to flat */
1554  p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
1555  p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
1556  p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
1557  q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
1558  q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
1559  q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
1560 
1561  /* Store 6 pixels p2 - q2 */
1562  DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
1563  vec2 = __lsx_vilvl_h(vec1, vec0);
1564  vec3 = __lsx_vilvh_h(vec1, vec0);
1565  vec4 = __lsx_vilvl_b(q2, q1);
1566 
1567  dst -= 3;
1568  __lsx_vstelm_w(vec2, dst, 0, 0);
1569  __lsx_vstelm_h(vec4, dst, 4, 0);
1570  dst += stride;
1571  __lsx_vstelm_w(vec2, dst, 0, 1);
1572  __lsx_vstelm_h(vec4, dst, 4, 1);
1573  dst += stride;
1574  __lsx_vstelm_w(vec2, dst, 0, 2);
1575  __lsx_vstelm_h(vec4, dst, 4, 2);
1576  dst += stride;
1577  __lsx_vstelm_w(vec2, dst, 0, 3);
1578  __lsx_vstelm_h(vec4, dst, 4, 3);
1579  dst += stride;
1580  __lsx_vstelm_w(vec3, dst, 0, 0);
1581  __lsx_vstelm_h(vec4, dst, 4, 4);
1582  dst += stride;
1583  __lsx_vstelm_w(vec3, dst, 0, 1);
1584  __lsx_vstelm_h(vec4, dst, 4, 5);
1585  dst += stride;
1586  __lsx_vstelm_w(vec3, dst, 0, 2);
1587  __lsx_vstelm_h(vec4, dst, 4, 6);
1588  dst += stride;
1589  __lsx_vstelm_w(vec3, dst, 0, 3);
1590  __lsx_vstelm_h(vec4, dst, 4, 7);
1591  }
1592 }
1593 
1594 void ff_loop_filter_h_88_16_lsx(uint8_t *dst, ptrdiff_t stride,
1595  int32_t b_limit_ptr,
1596  int32_t limit_ptr,
1597  int32_t thresh_ptr)
1598 {
1599  ptrdiff_t stride2 = stride << 1;
1600  ptrdiff_t stride3 = stride2 + stride;
1601  ptrdiff_t stride4 = stride2 << 1;
1602  uint8_t *dst_tmp = dst - 4;
1603  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1604  __m128i p1_out, p0_out, q0_out, q1_out;
1605  __m128i flat, mask, hev, thresh, b_limit, limit;
1606  __m128i row4, row5, row6, row7, row12, row13, row14, row15;
1607  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1608  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
1609  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
1610  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
1611  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
1612  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
1613  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1614  __m128i zero = __lsx_vldi(0);
1615 
1616  p0 = __lsx_vld(dst_tmp, 0);
1617  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2);
1618  p3 = __lsx_vldx(dst_tmp, stride3);
1619  dst_tmp += stride4;
1620  row4 = __lsx_vld(dst_tmp, 0);
1621  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
1622  row7 = __lsx_vldx(dst_tmp, stride3);
1623  dst_tmp += stride4;
1624  q3 = __lsx_vld(dst_tmp, 0);
1625  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1);
1626  q0 = __lsx_vldx(dst_tmp, stride3);
1627  dst_tmp += stride4;
1628  row12 = __lsx_vld(dst_tmp, 0);
1629  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
1630  row15 = __lsx_vldx(dst_tmp, stride3);
1631 
1632  /* transpose 16x8 matrix into 8x16 */
1633  LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7,
1634  q3, q2, q1, q0, row12, row13, row14, row15,
1635  p3, p2, p1, p0, q0, q1, q2, q3);
1636 
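      /* each *_ptr packs two 8-bit values: the low byte applies to the first eight rows, the next byte to the second eight */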
1637  thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1638  vec0 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
1639  thresh = __lsx_vilvl_d(vec0, thresh);
1640 
1641  b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1642  vec0 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
1643  b_limit = __lsx_vilvl_d(vec0, b_limit);
1644 
1645  limit = __lsx_vreplgr2vr_b(limit_ptr);
1646  vec0 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
1647  limit = __lsx_vilvl_d(vec0, limit);
1648 
1649  /* mask and hev */
1650  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1651  hev, mask, flat);
1652  /* flat4 */
1653  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1654  /* filter4 */
1655  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1656  q1_out);
1657 
1658  /* if flat is zero for all pixels, then there is no need to apply the other filter */
1659  if (__lsx_bz_v(flat)) {
1660  DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1661  vec2 = __lsx_vilvl_h(vec1, vec0);
1662  vec3 = __lsx_vilvh_h(vec1, vec0);
1663  DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1664  vec4 = __lsx_vilvl_h(vec1, vec0);
1665  vec5 = __lsx_vilvh_h(vec1, vec0);
1666 
1667  dst -= 2;
1668  __lsx_vstelm_w(vec2, dst, 0, 0);
1669  __lsx_vstelm_w(vec2, dst + stride, 0, 1);
1670  __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
1671  __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
1672  dst += stride4;
1673  __lsx_vstelm_w(vec3, dst, 0, 0);
1674  __lsx_vstelm_w(vec3, dst + stride, 0, 1);
1675  __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
1676  __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
1677  dst += stride4;
1678  __lsx_vstelm_w(vec4, dst, 0, 0);
1679  __lsx_vstelm_w(vec4, dst + stride, 0, 1);
1680  __lsx_vstelm_w(vec4, dst + stride2, 0, 2);
1681  __lsx_vstelm_w(vec4, dst + stride3, 0, 3);
1682  dst += stride4;
1683  __lsx_vstelm_w(vec5, dst, 0, 0);
1684  __lsx_vstelm_w(vec5, dst + stride, 0, 1);
1685  __lsx_vstelm_w(vec5, dst + stride2, 0, 2);
1686  __lsx_vstelm_w(vec5, dst + stride3, 0, 3);
1687  } else {
1688  DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
1689  p3_l, p2_l, p1_l, p0_l);
1690  DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
1691  q0_l, q1_l, q2_l, q3_l);
1692  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1693  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1694 
1695  DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
1696  p3_h, p2_h, p1_h, p0_h);
1697  DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
1698  q0_h, q1_h, q2_h, q3_h);
1699 
1700  /* filter8 */
1701  VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
1702  p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
1703 
1704  /* convert 16 bit output data into 8 bit */
1705  DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
1706  p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l,
1707  p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
1708  DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
1709  q2_filt8_l, q1_filt8_l, q2_filt8_l);
1710 
1711  /* merge the filter8 outputs with the filter4 outputs according to flat */
1712  p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
1713  p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
1714  p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
1715  q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
1716  q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
1717  q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
1718 
1719  DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
1720  vec3 = __lsx_vilvl_h(vec1, vec0);
1721  vec4 = __lsx_vilvh_h(vec1, vec0);
1722  DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
1723  vec6 = __lsx_vilvl_h(vec1, vec0);
1724  vec7 = __lsx_vilvh_h(vec1, vec0);
1725  vec2 = __lsx_vilvl_b(q2, q1);
1726  vec5 = __lsx_vilvh_b(q2, q1);
1727 
1728  dst -= 3;
1729  __lsx_vstelm_w(vec3, dst, 0, 0);
1730  __lsx_vstelm_h(vec2, dst, 4, 0);
1731  dst += stride;
1732  __lsx_vstelm_w(vec3, dst, 0, 1);
1733  __lsx_vstelm_h(vec2, dst, 4, 1);
1734  dst += stride;
1735  __lsx_vstelm_w(vec3, dst, 0, 2);
1736  __lsx_vstelm_h(vec2, dst, 4, 2);
1737  dst += stride;
1738  __lsx_vstelm_w(vec3, dst, 0, 3);
1739  __lsx_vstelm_h(vec2, dst, 4, 3);
1740  dst += stride;
1741  __lsx_vstelm_w(vec4, dst, 0, 0);
1742  __lsx_vstelm_h(vec2, dst, 4, 4);
1743  dst += stride;
1744  __lsx_vstelm_w(vec4, dst, 0, 1);
1745  __lsx_vstelm_h(vec2, dst, 4, 5);
1746  dst += stride;
1747  __lsx_vstelm_w(vec4, dst, 0, 2);
1748  __lsx_vstelm_h(vec2, dst, 4, 6);
1749  dst += stride;
1750  __lsx_vstelm_w(vec4, dst, 0, 3);
1751  __lsx_vstelm_h(vec2, dst, 4, 7);
1752  dst += stride;
1753  __lsx_vstelm_w(vec6, dst, 0, 0);
1754  __lsx_vstelm_h(vec5, dst, 4, 0);
1755  dst += stride;
1756  __lsx_vstelm_w(vec6, dst, 0, 1);
1757  __lsx_vstelm_h(vec5, dst, 4, 1);
1758  dst += stride;
1759  __lsx_vstelm_w(vec6, dst, 0, 2);
1760  __lsx_vstelm_h(vec5, dst, 4, 2);
1761  dst += stride;
1762  __lsx_vstelm_w(vec6, dst, 0, 3);
1763  __lsx_vstelm_h(vec5, dst, 4, 3);
1764  dst += stride;
1765  __lsx_vstelm_w(vec7, dst, 0, 0);
1766  __lsx_vstelm_h(vec5, dst, 4, 4);
1767  dst += stride;
1768  __lsx_vstelm_w(vec7, dst, 0, 1);
1769  __lsx_vstelm_h(vec5, dst, 4, 5);
1770  dst += stride;
1771  __lsx_vstelm_w(vec7, dst, 0, 2);
1772  __lsx_vstelm_h(vec5, dst, 4, 6);
1773  dst += stride;
1774  __lsx_vstelm_w(vec7, dst, 0, 3);
1775  __lsx_vstelm_h(vec5, dst, 4, 7);
1776  }
1777 }
1778 
1779 void ff_loop_filter_h_84_16_lsx(uint8_t *dst, ptrdiff_t stride,
1780  int32_t b_limit_ptr,
1781  int32_t limit_ptr,
1782  int32_t thresh_ptr)
1783 {
1784  ptrdiff_t stride2 = stride << 1;
1785  ptrdiff_t stride3 = stride2 + stride;
1786  ptrdiff_t stride4 = stride2 << 1;
1787  uint8_t *dst_tmp = dst - 4;
1788  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1789  __m128i p1_out, p0_out, q0_out, q1_out;
1790  __m128i flat, mask, hev, thresh, b_limit, limit;
1791  __m128i row4, row5, row6, row7, row12, row13, row14, row15;
1792  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1793  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
1794  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
1795  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1796  __m128i zero = __lsx_vldi(0);
1797 
1798  p0 = __lsx_vld(dst_tmp, 0);
1799  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2);
1800  p3 = __lsx_vldx(dst_tmp, stride3);
1801  dst_tmp += stride4;
1802  row4 = __lsx_vld(dst_tmp, 0);
1803  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
1804  row7 = __lsx_vldx(dst_tmp, stride3);
1805  dst_tmp += stride4;
1806  q3 = __lsx_vld(dst_tmp, 0);
1807  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1);
1808  q0 = __lsx_vldx(dst_tmp, stride3);
1809  dst_tmp += stride4;
1810  row12 = __lsx_vld(dst_tmp, 0);
1811  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
1812  row15 = __lsx_vldx(dst_tmp, stride3);
1813 
1814  /* transpose 16x8 matrix into 8x16 */
1815  LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7,
1816  q3, q2, q1, q0, row12, row13, row14, row15,
1817  p3, p2, p1, p0, q0, q1, q2, q3);
1818 
1819  thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1820  vec0 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
1821  thresh = __lsx_vilvl_d(vec0, thresh);
1822 
1823  b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1824  vec0 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
1825  b_limit = __lsx_vilvl_d(vec0, b_limit);
1826 
1827  limit = __lsx_vreplgr2vr_b(limit_ptr);
1828  vec0 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
1829  limit = __lsx_vilvl_d(vec0, limit);
1830 
1831  /* mask and hev */
1832  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1833  hev, mask, flat);
1834  /* flat4 */
1835  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1836  /* filter4 */
1837  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1838  q1_out);
1839 
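      /* the 8-tap path applies only to the first eight rows, so zero the high half of the flat mask */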
1840  flat = __lsx_vilvl_d(zero, flat);
1841 
1842  /* if flat is zero for all pixels, then there is no need to apply the other filter */
1843  if (__lsx_bz_v(flat)) {
1844  DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1845  vec2 = __lsx_vilvl_h(vec1, vec0);
1846  vec3 = __lsx_vilvh_h(vec1, vec0);
1847  DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1848  vec4 = __lsx_vilvl_h(vec1, vec0);
1849  vec5 = __lsx_vilvh_h(vec1, vec0);
1850 
1851  dst -= 2;
1852  __lsx_vstelm_w(vec2, dst, 0, 0);
1853  __lsx_vstelm_w(vec2, dst + stride, 0, 1);
1854  __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
1855  __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
1856  dst += stride4;
1857  __lsx_vstelm_w(vec3, dst, 0, 0);
1858  __lsx_vstelm_w(vec3, dst + stride, 0, 1);
1859  __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
1860  __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
1861  dst += stride4;
1862  __lsx_vstelm_w(vec4, dst, 0, 0);
1863  __lsx_vstelm_w(vec4, dst + stride, 0, 1);
1864  __lsx_vstelm_w(vec4, dst + stride2, 0, 2);
1865  __lsx_vstelm_w(vec4, dst + stride3, 0, 3);
1866  dst += stride4;
1867  __lsx_vstelm_w(vec5, dst, 0, 0);
1868  __lsx_vstelm_w(vec5, dst + stride, 0, 1);
1869  __lsx_vstelm_w(vec5, dst + stride2, 0, 2);
1870  __lsx_vstelm_w(vec5, dst + stride3, 0, 3);
1871  } else {
1872  DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
1873  p3_l, p2_l, p1_l, p0_l);
1874  DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
1875  q0_l, q1_l, q2_l, q3_l);
1876  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1877  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1878 
1879  /* convert 16 bit output data into 8 bit */
1880  DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
1881  p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, p2_filt8_l,
1882  p1_filt8_l, p0_filt8_l, q0_filt8_l);
1883  DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
1884  q1_filt8_l, q2_filt8_l);
1885 
1886  /* merge the filter8 outputs with the filter4 outputs according to flat */
1887  p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
1888  p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
1889  p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
1890  q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
1891  q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
1892  q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
1893 
1894  DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
1895  vec3 = __lsx_vilvl_h(vec1, vec0);
1896  vec4 = __lsx_vilvh_h(vec1, vec0);
1897  DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
1898  vec6 = __lsx_vilvl_h(vec1, vec0);
1899  vec7 = __lsx_vilvh_h(vec1, vec0);
1900  vec2 = __lsx_vilvl_b(q2, q1);
1901  vec5 = __lsx_vilvh_b(q2, q1);
1902 
1903  dst -= 3;
1904  __lsx_vstelm_w(vec3, dst, 0, 0);
1905  __lsx_vstelm_h(vec2, dst, 4, 0);
1906  dst += stride;
1907  __lsx_vstelm_w(vec3, dst, 0, 1);
1908  __lsx_vstelm_h(vec2, dst, 4, 1);
1909  dst += stride;
1910  __lsx_vstelm_w(vec3, dst, 0, 2);
1911  __lsx_vstelm_h(vec2, dst, 4, 2);
1912  dst += stride;
1913  __lsx_vstelm_w(vec3, dst, 0, 3);
1914  __lsx_vstelm_h(vec2, dst, 4, 3);
1915  dst += stride;
1916  __lsx_vstelm_w(vec4, dst, 0, 0);
1917  __lsx_vstelm_h(vec2, dst, 4, 4);
1918  dst += stride;
1919  __lsx_vstelm_w(vec4, dst, 0, 1);
1920  __lsx_vstelm_h(vec2, dst, 4, 5);
1921  dst += stride;
1922  __lsx_vstelm_w(vec4, dst, 0, 2);
1923  __lsx_vstelm_h(vec2, dst, 4, 6);
1924  dst += stride;
1925  __lsx_vstelm_w(vec4, dst, 0, 3);
1926  __lsx_vstelm_h(vec2, dst, 4, 7);
1927  dst += stride;
1928  __lsx_vstelm_w(vec6, dst, 0, 0);
1929  __lsx_vstelm_h(vec5, dst, 4, 0);
1930  dst += stride;
1931  __lsx_vstelm_w(vec6, dst, 0, 1);
1932  __lsx_vstelm_h(vec5, dst, 4, 1);
1933  dst += stride;
1934  __lsx_vstelm_w(vec6, dst, 0, 2);
1935  __lsx_vstelm_h(vec5, dst, 4, 2);
1936  dst += stride;
1937  __lsx_vstelm_w(vec6, dst, 0, 3);
1938  __lsx_vstelm_h(vec5, dst, 4, 3);
1939  dst += stride;
1940  __lsx_vstelm_w(vec7, dst, 0, 0);
1941  __lsx_vstelm_h(vec5, dst, 4, 4);
1942  dst += stride;
1943  __lsx_vstelm_w(vec7, dst, 0, 1);
1944  __lsx_vstelm_h(vec5, dst, 4, 5);
1945  dst += stride;
1946  __lsx_vstelm_w(vec7, dst, 0, 2);
1947  __lsx_vstelm_h(vec5, dst, 4, 6);
1948  dst += stride;
1949  __lsx_vstelm_w(vec7, dst, 0, 3);
1950  __lsx_vstelm_h(vec5, dst, 4, 7);
1951  }
1952 }
1953 
1954 void ff_loop_filter_h_48_16_lsx(uint8_t *dst, ptrdiff_t stride,
1955  int32_t b_limit_ptr,
1956  int32_t limit_ptr,
1957  int32_t thresh_ptr)
1958 {
1959  ptrdiff_t stride2 = stride << 1;
1960  ptrdiff_t stride3 = stride2 + stride;
1961  ptrdiff_t stride4 = stride2 << 1;
1962  uint8_t *dst_tmp = dst - 4;
1963  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
1964  __m128i p1_out, p0_out, q0_out, q1_out;
1965  __m128i flat, mask, hev, thresh, b_limit, limit;
1966  __m128i row4, row5, row6, row7, row12, row13, row14, row15;
1967  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
1968  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
1969  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
1970  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1971  __m128i zero = __lsx_vldi(0);
1972 
1973  p0 = __lsx_vld(dst_tmp, 0);
1974  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2);
1975  p3 = __lsx_vldx(dst_tmp, stride3);
1976  dst_tmp += stride4;
1977  row4 = __lsx_vld(dst_tmp, 0);
1978  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
1979  row7 = __lsx_vldx(dst_tmp, stride3);
1980  dst_tmp += stride4;
1981  q3 = __lsx_vld(dst_tmp, 0);
1982  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1);
1983  q0 = __lsx_vldx(dst_tmp, stride3);
1984  dst_tmp += stride4;
1985  row12 = __lsx_vld(dst_tmp, 0);
1986  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
1987  row15 = __lsx_vldx(dst_tmp, stride3);
1988 
1989  /* transpose 16x8 matrix into 8x16 */
1990  LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7,
1991  q3, q2, q1, q0, row12, row13, row14, row15,
1992  p3, p2, p1, p0, q0, q1, q2, q3);
1993 
1994  thresh = __lsx_vreplgr2vr_b(thresh_ptr);
1995  vec0 = __lsx_vreplgr2vr_b(thresh_ptr >> 8);
1996  thresh = __lsx_vilvl_d(vec0, thresh);
1997 
1998  b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
1999  vec0 = __lsx_vreplgr2vr_b(b_limit_ptr >> 8);
2000  b_limit = __lsx_vilvl_d(vec0, b_limit);
2001 
2002  limit = __lsx_vreplgr2vr_b(limit_ptr);
2003  vec0 = __lsx_vreplgr2vr_b(limit_ptr >> 8);
2004  limit = __lsx_vilvl_d(vec0, limit);
2005 
2006  /* mask and hev */
2007  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2008  hev, mask, flat);
2009  /* flat4 */
2010  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2011  /* filter4 */
2012  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2013  q1_out);
2014 
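      /* the 8-tap path applies only to the second eight rows, so zero the low half of the flat mask */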
2015  flat = __lsx_vilvh_d(flat, zero);
2016 
2017  /* if flat is zero for all pixels, then there is no need to apply the other filter */
2018  if (__lsx_bz_v(flat)) {
2019  DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2020  vec2 = __lsx_vilvl_h(vec1, vec0);
2021  vec3 = __lsx_vilvh_h(vec1, vec0);
2022  DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2023  vec4 = __lsx_vilvl_h(vec1, vec0);
2024  vec5 = __lsx_vilvh_h(vec1, vec0);
2025 
2026  dst -= 2;
2027  __lsx_vstelm_w(vec2, dst, 0, 0);
2028  __lsx_vstelm_w(vec2, dst + stride, 0, 1);
2029  __lsx_vstelm_w(vec2, dst + stride2, 0, 2);
2030  __lsx_vstelm_w(vec2, dst + stride3, 0, 3);
2031  dst += stride4;
2032  __lsx_vstelm_w(vec3, dst, 0, 0);
2033  __lsx_vstelm_w(vec3, dst + stride, 0, 1);
2034  __lsx_vstelm_w(vec3, dst + stride2, 0, 2);
2035  __lsx_vstelm_w(vec3, dst + stride3, 0, 3);
2036  dst += stride4;
2037  __lsx_vstelm_w(vec4, dst, 0, 0);
2038  __lsx_vstelm_w(vec4, dst + stride, 0, 1);
2039  __lsx_vstelm_w(vec4, dst + stride2, 0, 2);
2040  __lsx_vstelm_w(vec4, dst + stride3, 0, 3);
2041  dst += stride4;
2042  __lsx_vstelm_w(vec5, dst, 0, 0);
2043  __lsx_vstelm_w(vec5, dst + stride, 0, 1);
2044  __lsx_vstelm_w(vec5, dst + stride2, 0, 2);
2045  __lsx_vstelm_w(vec5, dst + stride3, 0, 3);
2046  } else {
2047  DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
2048  p3_h, p2_h, p1_h, p0_h);
2049  DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
2050  q0_h, q1_h, q2_h, q3_h);
2051 
2052  VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
2053  p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
2054 
2055  /* convert 16 bit output data into 8 bit */
2056  DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_h, p1_filt8_h,
2057  p1_filt8_h, p0_filt8_h, p0_filt8_h, q0_filt8_h, q0_filt8_h,
2058  p2_filt8_h, p1_filt8_h, p0_filt8_h, q0_filt8_h);
2059  DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_h, q2_filt8_h,
2060  q2_filt8_h, q1_filt8_h, q2_filt8_h);
2061 
2062  /* merge the filter8 outputs with the filter4 outputs according to flat */
2063  p2 = __lsx_vbitsel_v(p2, p2_filt8_h, flat);
2064  p1 = __lsx_vbitsel_v(p1_out, p1_filt8_h, flat);
2065  p0 = __lsx_vbitsel_v(p0_out, p0_filt8_h, flat);
2066  q0 = __lsx_vbitsel_v(q0_out, q0_filt8_h, flat);
2067  q1 = __lsx_vbitsel_v(q1_out, q1_filt8_h, flat);
2068  q2 = __lsx_vbitsel_v(q2, q2_filt8_h, flat);
2069 
2070  DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
2071  vec3 = __lsx_vilvl_h(vec1, vec0);
2072  vec4 = __lsx_vilvh_h(vec1, vec0);
2073  DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
2074  vec6 = __lsx_vilvl_h(vec1, vec0);
2075  vec7 = __lsx_vilvh_h(vec1, vec0);
2076  vec2 = __lsx_vilvl_b(q2, q1);
2077  vec5 = __lsx_vilvh_b(q2, q1);
2078 
2079  dst -= 3;
2080  __lsx_vstelm_w(vec3, dst, 0, 0);
2081  __lsx_vstelm_h(vec2, dst, 4, 0);
2082  dst += stride;
2083  __lsx_vstelm_w(vec3, dst, 0, 1);
2084  __lsx_vstelm_h(vec2, dst, 4, 1);
2085  dst += stride;
2086  __lsx_vstelm_w(vec3, dst, 0, 2);
2087  __lsx_vstelm_h(vec2, dst, 4, 2);
2088  dst += stride;
2089  __lsx_vstelm_w(vec3, dst, 0, 3);
2090  __lsx_vstelm_h(vec2, dst, 4, 3);
2091  dst += stride;
2092  __lsx_vstelm_w(vec4, dst, 0, 0);
2093  __lsx_vstelm_h(vec2, dst, 4, 4);
2094  dst += stride;
2095  __lsx_vstelm_w(vec4, dst, 0, 1);
2096  __lsx_vstelm_h(vec2, dst, 4, 5);
2097  dst += stride;
2098  __lsx_vstelm_w(vec4, dst, 0, 2);
2099  __lsx_vstelm_h(vec2, dst, 4, 6);
2100  dst += stride;
2101  __lsx_vstelm_w(vec4, dst, 0, 3);
2102  __lsx_vstelm_h(vec2, dst, 4, 7);
2103  dst += stride;
2104  __lsx_vstelm_w(vec6, dst, 0, 0);
2105  __lsx_vstelm_h(vec5, dst, 4, 0);
2106  dst += stride;
2107  __lsx_vstelm_w(vec6, dst, 0, 1);
2108  __lsx_vstelm_h(vec5, dst, 4, 1);
2109  dst += stride;
2110  __lsx_vstelm_w(vec6, dst, 0, 2);
2111  __lsx_vstelm_h(vec5, dst, 4, 2);
2112  dst += stride;
2113  __lsx_vstelm_w(vec6, dst, 0, 3);
2114  __lsx_vstelm_h(vec5, dst, 4, 3);
2115  dst += stride;
2116  __lsx_vstelm_w(vec7, dst, 0, 0);
2117  __lsx_vstelm_h(vec5, dst, 4, 4);
2118  dst += stride;
2119  __lsx_vstelm_w(vec7, dst, 0, 1);
2120  __lsx_vstelm_h(vec5, dst, 4, 5);
2121  dst += stride;
2122  __lsx_vstelm_w(vec7, dst, 0, 2);
2123  __lsx_vstelm_h(vec5, dst, 4, 6);
2124  dst += stride;
2125  __lsx_vstelm_w(vec7, dst, 0, 3);
2126  __lsx_vstelm_h(vec5, dst, 4, 7);
2127  }
2128 }
2129 
2130 static void vp9_transpose_16x8_to_8x16(uint8_t *input, ptrdiff_t in_pitch,
2131  uint8_t *output)
2132 {
2133  __m128i p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
2134  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2135  __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2136  ptrdiff_t in_pitch2 = in_pitch << 1;
2137  ptrdiff_t in_pitch3 = in_pitch2 + in_pitch;
2138  ptrdiff_t in_pitch4 = in_pitch2 << 1;
2139 
2140  LSX_LD_8(input, in_pitch, in_pitch2, in_pitch3, in_pitch4,
2141  p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
2142  /* 8x8 transpose */
2143  LSX_TRANSPOSE8x8_B(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
2144  p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
2145  /* 8x8 transpose of the high halves */
2146  DUP4_ARG2(__lsx_vilvh_b, p5_org, p7_org, p4_org, p6_org, p1_org,
2147  p3_org, p0_org, p2_org, tmp0, tmp1, tmp2, tmp3);
2148  DUP2_ARG2(__lsx_vilvl_b, tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
2149  DUP2_ARG2(__lsx_vilvh_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
2150  DUP2_ARG2(__lsx_vilvl_w, tmp6, tmp4, tmp7, tmp5, q0, q4);
2151  DUP2_ARG2(__lsx_vilvh_w, tmp6, tmp4, tmp7, tmp5, q2, q6);
2152  DUP4_ARG2(__lsx_vbsrl_v, q0, 8, q2, 8, q4, 8, q6, 8, q1, q3, q5, q7);
2153 
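      /* write the 16 transposed rows contiguously, 16 bytes apart, into the output buffer */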
2154  __lsx_vst(p7, output, 0);
2155  __lsx_vst(p6, output, 16);
2156  __lsx_vst(p5, output, 32);
2157  __lsx_vst(p4, output, 48);
2158  __lsx_vst(p3, output, 64);
2159  __lsx_vst(p2, output, 80);
2160  __lsx_vst(p1, output, 96);
2161  __lsx_vst(p0, output, 112);
2162  __lsx_vst(q0, output, 128);
2163  __lsx_vst(q1, output, 144);
2164  __lsx_vst(q2, output, 160);
2165  __lsx_vst(q3, output, 176);
2166  __lsx_vst(q4, output, 192);
2167  __lsx_vst(q5, output, 208);
2168  __lsx_vst(q6, output, 224);
2169  __lsx_vst(q7, output, 240);
2170 }
2171 
2172 static void vp9_transpose_8x16_to_16x8(uint8_t *input, uint8_t *output,
2173  ptrdiff_t out_pitch)
2174 {
2175  __m128i p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
2176  __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2177  ptrdiff_t out_pitch2 = out_pitch << 1;
2178  ptrdiff_t out_pitch3 = out_pitch2 + out_pitch;
2179  ptrdiff_t out_pitch4 = out_pitch2 << 1;
2180 
2181  DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48,
2182  p7, p6, p5, p4);
2183  DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112,
2184  p3, p2, p1, p0);
2185  DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176,
2186  q0, q1, q2, q3);
2187  DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240,
2188  q4, q5, q6, q7);
2189  LSX_TRANSPOSE16x8_B(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
2190  q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
2191  LSX_ST_8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o,
2192  output, out_pitch, out_pitch2, out_pitch3, out_pitch4);
2193 }
2194 
2195 static void vp9_transpose_16x16(uint8_t *input, int32_t in_stride,
2196  uint8_t *output, int32_t out_stride)
2197 {
2198  __m128i row0, row1, row2, row3, row4, row5, row6, row7;
2199  __m128i row8, row9, row10, row11, row12, row13, row14, row15;
2200  __m128i tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
2201  __m128i tmp2, tmp3;
2202  __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2203  int32_t in_stride2 = in_stride << 1;
2204  int32_t in_stride3 = in_stride2 + in_stride;
2205  int32_t in_stride4 = in_stride2 << 1;
2206  int32_t out_stride2 = out_stride << 1;
2207  int32_t out_stride3 = out_stride2 + out_stride;
2208  int32_t out_stride4 = out_stride2 << 1;
2209 
2210  LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4,
2211  row0, row1, row2, row3, row4, row5, row6, row7);
2212  input += in_stride4;
2213  LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4,
2214  row8, row9, row10, row11, row12, row13, row14, row15);
2215 
2216  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7,
2217  row8, row9, row10, row11, row12, row13, row14, row15,
2218  p7, p6, p5, p4, p3, p2, p1, p0);
2219 
2220  /* transpose 16x8 matrix into 8x16 */
2221  /* total 8 intermediate registers and 32 instructions */
2222  q7 = __lsx_vpackod_d(row8, row0);
2223  q6 = __lsx_vpackod_d(row9, row1);
2224  q5 = __lsx_vpackod_d(row10, row2);
2225  q4 = __lsx_vpackod_d(row11, row3);
2226  q3 = __lsx_vpackod_d(row12, row4);
2227  q2 = __lsx_vpackod_d(row13, row5);
2228  q1 = __lsx_vpackod_d(row14, row6);
2229  q0 = __lsx_vpackod_d(row15, row7);
2230 
2231  DUP2_ARG2(__lsx_vpackev_b, q6, q7, q4, q5, tmp0, tmp1);
2232  DUP2_ARG2(__lsx_vpackod_b, q6, q7, q4, q5, tmp4, tmp5);
2233 
2234  DUP2_ARG2(__lsx_vpackev_b, q2, q3, q0, q1, q5, q7);
2235  DUP2_ARG2(__lsx_vpackod_b, q2, q3, q0, q1, tmp6, tmp7);
2236 
2237  DUP2_ARG2(__lsx_vpackev_h, tmp1, tmp0, q7, q5, tmp2, tmp3);
2238  q0 = __lsx_vpackev_w(tmp3, tmp2);
2239  q4 = __lsx_vpackod_w(tmp3, tmp2);
2240 
2241  tmp2 = __lsx_vpackod_h(tmp1, tmp0);
2242  tmp3 = __lsx_vpackod_h(q7, q5);
2243  q2 = __lsx_vpackev_w(tmp3, tmp2);
2244  q6 = __lsx_vpackod_w(tmp3, tmp2);
2245 
2246  DUP2_ARG2(__lsx_vpackev_h, tmp5, tmp4, tmp7, tmp6, tmp2, tmp3);
2247  q1 = __lsx_vpackev_w(tmp3, tmp2);
2248  q5 = __lsx_vpackod_w(tmp3, tmp2);
2249 
2250  tmp2 = __lsx_vpackod_h(tmp5, tmp4);
2251  tmp3 = __lsx_vpackod_h(tmp7, tmp6);
2252  q3 = __lsx_vpackev_w(tmp3, tmp2);
2253  q7 = __lsx_vpackod_w(tmp3, tmp2);
2254 
2255  LSX_ST_8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_stride,
2256  out_stride2, out_stride3, out_stride4);
2257  output += out_stride4;
2258  LSX_ST_8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_stride,
2259  out_stride2, out_stride3, out_stride4);
2260 }
2261 
2262 static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
2263  uint8_t *src_org, int32_t pitch_org,
2264  int32_t b_limit_ptr,
2265  int32_t limit_ptr,
2266  int32_t thresh_ptr)
2267 {
2268  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
2269  __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
2270  __m128i flat, mask, hev, thresh, b_limit, limit;
2271  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
2272  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
2273  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
2274  __m128i vec0, vec1, vec2, vec3;
2275  __m128i zero = __lsx_vldi(0);
2276 
2277  /* load vector elements */
2278  DUP4_ARG2(__lsx_vld, src, -64, src, -48, src, -32, src, -16,
2279  p3, p2, p1, p0);
2280  DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, q0, q1, q2, q3);
2281 
2282  thresh = __lsx_vreplgr2vr_b(thresh_ptr);
2283  b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
2284  limit = __lsx_vreplgr2vr_b(limit_ptr);
2285 
2286  /* mask and hev */
2287  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2288  hev, mask, flat);
2289  /* flat4 */
2290  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2291  /* filter4 */
2292  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2293  q1_out);
2294 
2295  flat = __lsx_vilvl_d(zero, flat);
2296 
2297  /* if flat is zero for all pixels, then there is no need to apply the other filter */
2298  if (__lsx_bz_v(flat)) {
2299  DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2300  vec2 = __lsx_vilvl_h(vec1, vec0);
2301  vec3 = __lsx_vilvh_h(vec1, vec0);
2302 
2303  src_org -= 2;
2304  __lsx_vstelm_w(vec2, src_org, 0, 0);
2305  src_org += pitch_org;
2306  __lsx_vstelm_w(vec2, src_org, 0, 1);
2307  src_org += pitch_org;
2308  __lsx_vstelm_w(vec2, src_org, 0, 2);
2309  src_org += pitch_org;
2310  __lsx_vstelm_w(vec2, src_org, 0, 3);
2311  src_org += pitch_org;
2312  __lsx_vstelm_w(vec3, src_org, 0, 0);
2313  src_org += pitch_org;
2314  __lsx_vstelm_w(vec3, src_org, 0, 1);
2315  src_org += pitch_org;
2316  __lsx_vstelm_w(vec3, src_org, 0, 2);
2317  src_org += pitch_org;
2318  __lsx_vstelm_w(vec3, src_org, 0, 3);
2319  return 1;
2320  } else {
2321  DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
2322  p3_l, p2_l, p1_l, p0_l);
2323  DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
2324  q0_l, q1_l, q2_l, q3_l);
2325  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
2326  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
2327 
2328  /* convert 16 bit output data into 8 bit */
2329  p2_l = __lsx_vpickev_b(p2_filt8_l, p2_filt8_l);
2330  p1_l = __lsx_vpickev_b(p1_filt8_l, p1_filt8_l);
2331  p0_l = __lsx_vpickev_b(p0_filt8_l, p0_filt8_l);
2332  q0_l = __lsx_vpickev_b(q0_filt8_l, q0_filt8_l);
2333  q1_l = __lsx_vpickev_b(q1_filt8_l, q1_filt8_l);
2334  q2_l = __lsx_vpickev_b(q2_filt8_l, q2_filt8_l);
2335 
2336  /* merge the filter8 outputs with the filter4 outputs according to flat */
2337  p2_out = __lsx_vbitsel_v(p2, p2_l, flat);
2338  p1_out = __lsx_vbitsel_v(p1_out, p1_l, flat);
2339  p0_out = __lsx_vbitsel_v(p0_out, p0_l, flat);
2340  q0_out = __lsx_vbitsel_v(q0_out, q0_l, flat);
2341  q1_out = __lsx_vbitsel_v(q1_out, q1_l, flat);
2342  q2_out = __lsx_vbitsel_v(q2, q2_l, flat);
2343 
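      /* stash the filter8 results and the flat mask; the wide-filter stage reloads them from this scratch buffer */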
2344  __lsx_vst(p2_out, filter48, 0);
2345  __lsx_vst(p1_out, filter48, 16);
2346  __lsx_vst(p0_out, filter48, 32);
2347  __lsx_vst(q0_out, filter48, 48);
2348  __lsx_vst(q1_out, filter48, 64);
2349  __lsx_vst(q2_out, filter48, 80);
2350  __lsx_vst(flat, filter48, 96);
2351 
2352  return 0;
2353  }
2354 }
2355 
2356 static int32_t vp9_vt_lpf_t16_8w(uint8_t *dst, uint8_t *dst_org,
2357  ptrdiff_t stride,
2358  uint8_t *filter48)
2359 {
2360  __m128i zero = __lsx_vldi(0);
2361  __m128i filter8, flat, flat2;
2362  __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2363  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
2364  v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
2365  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
2366  v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
2367  v8u16 tmp0_l, tmp1_l;
2368  __m128i out_l;
2369  uint8_t *dst_tmp = dst - 128;
2370 
2371  /* load vector elements */
2372  DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32,
2373  dst_tmp, 48, p7, p6, p5, p4);
2374  DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96,
2375  dst_tmp, 112, p3, p2, p1, p0);
2376  DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
2377  DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7);
2378 
2379  flat = __lsx_vld(filter48, 96);
2380 
2382  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
2383 
2384  /* if flat2 is zero for all pixels, then there is no need to apply the wide filter */
2385  if (__lsx_bz_v(flat2)) {
2386  __m128i vec0, vec1, vec2, vec3, vec4;
2387 
2388  DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32,
2389  filter48, 48, p2, p1, p0, q0);
2390  DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
2391 
2392  DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
2393  vec3 = __lsx_vilvl_h(vec1, vec0);
2394  vec4 = __lsx_vilvh_h(vec1, vec0);
2395  vec2 = __lsx_vilvl_b(q2, q1);
2396 
2397  dst_org -= 3;
2398  __lsx_vstelm_w(vec3, dst_org, 0, 0);
2399  __lsx_vstelm_h(vec2, dst_org, 4, 0);
2400  dst_org += stride;
2401  __lsx_vstelm_w(vec3, dst_org, 0, 1);
2402  __lsx_vstelm_h(vec2, dst_org, 4, 1);
2403  dst_org += stride;
2404  __lsx_vstelm_w(vec3, dst_org, 0, 2);
2405  __lsx_vstelm_h(vec2, dst_org, 4, 2);
2406  dst_org += stride;
2407  __lsx_vstelm_w(vec3, dst_org, 0, 3);
2408  __lsx_vstelm_h(vec2, dst_org, 4, 3);
2409  dst_org += stride;
2410  __lsx_vstelm_w(vec4, dst_org, 0, 0);
2411  __lsx_vstelm_h(vec2, dst_org, 4, 4);
2412  dst_org += stride;
2413  __lsx_vstelm_w(vec4, dst_org, 0, 1);
2414  __lsx_vstelm_h(vec2, dst_org, 4, 5);
2415  dst_org += stride;
2416  __lsx_vstelm_w(vec4, dst_org, 0, 2);
2417  __lsx_vstelm_h(vec2, dst_org, 4, 6);
2418  dst_org += stride;
2419  __lsx_vstelm_w(vec4, dst_org, 0, 3);
2420  __lsx_vstelm_h(vec2, dst_org, 4, 7);
2421  return 1;
2422  } else {
2423  dst -= 7 * 16;
2424 
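      /* wide filter: each output is (sum of a 16-element window + 8) >> 4; tmp1_l keeps the
       * running window sum, adding the incoming pixel and subtracting the outgoing one per step */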
2425  p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7);
2426  p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6);
2427  p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5);
2428  p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4);
2429  p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3);
2430  p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2);
2431  p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1);
2432  p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0);
2433  q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0);
2434 
2435  tmp0_l = p7_l_in << 3;
2436  tmp0_l -= p7_l_in;
2437  tmp0_l += p6_l_in;
2438  tmp0_l += q0_l_in;
2439  tmp1_l = p6_l_in + p5_l_in;
2440  tmp1_l += p4_l_in;
2441  tmp1_l += p3_l_in;
2442  tmp1_l += p2_l_in;
2443  tmp1_l += p1_l_in;
2444  tmp1_l += p0_l_in;
2445  tmp1_l += tmp0_l;
2446 
2447  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2448  out_l = __lsx_vpickev_b(out_l, out_l);
2449  p6 = __lsx_vbitsel_v(p6, out_l, flat2);
2450  __lsx_vstelm_d(p6, dst, 0, 0);
2451  dst += 16;
2452 
2453  /* p5 */
2454  q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1);
2455  tmp0_l = p5_l_in - p6_l_in;
2456  tmp0_l += q1_l_in;
2457  tmp0_l -= p7_l_in;
2458  tmp1_l += tmp0_l;
2459  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2460  out_l = __lsx_vpickev_b(out_l, out_l);
2461  p5 = __lsx_vbitsel_v(p5, out_l, flat2);
2462  __lsx_vstelm_d(p5, dst, 0, 0);
2463  dst += 16;
2464 
2465  /* p4 */
2466  q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2);
2467  tmp0_l = p4_l_in - p5_l_in;
2468  tmp0_l += q2_l_in;
2469  tmp0_l -= p7_l_in;
2470  tmp1_l += tmp0_l;
2471  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2472  out_l = __lsx_vpickev_b(out_l, out_l);
2473  p4 = __lsx_vbitsel_v(p4, out_l, flat2);
2474  __lsx_vstelm_d(p4, dst, 0, 0);
2475  dst += 16;
2476 
2477  /* p3 */
2478  q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3);
2479  tmp0_l = p3_l_in - p4_l_in;
2480  tmp0_l += q3_l_in;
2481  tmp0_l -= p7_l_in;
2482  tmp1_l += tmp0_l;
2483  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2484  out_l = __lsx_vpickev_b(out_l, out_l);
2485  p3 = __lsx_vbitsel_v(p3, out_l, flat2);
2486  __lsx_vstelm_d(p3, dst, 0, 0);
2487  dst += 16;
2488 
2489  /* p2 */
2490  q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4);
2491  filter8 = __lsx_vld(filter48, 0);
2492  tmp0_l = p2_l_in - p3_l_in;
2493  tmp0_l += q4_l_in;
2494  tmp0_l -= p7_l_in;
2495  tmp1_l += tmp0_l;
2496  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2497  out_l = __lsx_vpickev_b(out_l, out_l);
2498  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2499  __lsx_vstelm_d(filter8, dst, 0, 0);
2500  dst += 16;
2501 
2502  /* p1 */
2503  q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5);
2504  filter8 = __lsx_vld(filter48, 16);
2505  tmp0_l = p1_l_in - p2_l_in;
2506  tmp0_l += q5_l_in;
2507  tmp0_l -= p7_l_in;
2508  tmp1_l += tmp0_l;
2509  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2510  out_l = __lsx_vpickev_b(out_l, out_l);
2511  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2512  __lsx_vstelm_d(filter8, dst, 0, 0);
2513  dst += 16;
2514 
2515  /* p0 */
2516  q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6);
2517  filter8 = __lsx_vld(filter48, 32);
2518  tmp0_l = p0_l_in - p1_l_in;
2519  tmp0_l += q6_l_in;
2520  tmp0_l -= p7_l_in;
2521  tmp1_l += tmp0_l;
2522  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2523  out_l = __lsx_vpickev_b(out_l, out_l);
2524  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2525  __lsx_vstelm_d(filter8, dst, 0, 0);
2526  dst += 16;
2527 
2528  /* q0 */
2529  q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7);
2530  filter8 = __lsx_vld(filter48, 48);
2531  tmp0_l = q7_l_in - p0_l_in;
2532  tmp0_l += q0_l_in;
2533  tmp0_l -= p7_l_in;
2534  tmp1_l += tmp0_l;
2535  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2536  out_l = __lsx_vpickev_b(out_l, out_l);
2537  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2538  __lsx_vstelm_d(filter8, dst, 0, 0);
2539  dst += 16;
2540 
2541  /* q1 */
2542  filter8 = __lsx_vld(filter48, 64);
2543  tmp0_l = q7_l_in - q0_l_in;
2544  tmp0_l += q1_l_in;
2545  tmp0_l -= p6_l_in;
2546  tmp1_l += tmp0_l;
2547  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2548  out_l = __lsx_vpickev_b(out_l, out_l);
2549  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2550  __lsx_vstelm_d(filter8, dst, 0, 0);
2551  dst += 16;
2552 
2553  /* q2 */
2554  filter8 = __lsx_vld(filter48, 80);
2555  tmp0_l = q7_l_in - q1_l_in;
2556  tmp0_l += q2_l_in;
2557  tmp0_l -= p5_l_in;
2558  tmp1_l += tmp0_l;
2559  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2560  out_l = __lsx_vpickev_b(out_l, out_l);
2561  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2562  __lsx_vstelm_d(filter8, dst, 0, 0);
2563  dst += 16;
2564 
2565  /* q3 */
2566  tmp0_l = q7_l_in - q2_l_in;
2567  tmp0_l += q3_l_in;
2568  tmp0_l -= p4_l_in;
2569  tmp1_l += tmp0_l;
2570  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2571  out_l = __lsx_vpickev_b(out_l, out_l);
2572  q3 = __lsx_vbitsel_v(q3, out_l, flat2);
2573  __lsx_vstelm_d(q3, dst, 0, 0);
2574  dst += 16;
2575 
2576  /* q4 */
2577  tmp0_l = q7_l_in - q3_l_in;
2578  tmp0_l += q4_l_in;
2579  tmp0_l -= p3_l_in;
2580  tmp1_l += tmp0_l;
2581  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2582  out_l = __lsx_vpickev_b(out_l, out_l);
2583  q4 = __lsx_vbitsel_v(q4, out_l, flat2);
2584  __lsx_vstelm_d(q4, dst, 0, 0);
2585  dst += 16;
2586 
2587  /* q5 */
2588  tmp0_l = q7_l_in - q4_l_in;
2589  tmp0_l += q5_l_in;
2590  tmp0_l -= p2_l_in;
2591  tmp1_l += tmp0_l;
2592  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2593  out_l = __lsx_vpickev_b(out_l, out_l);
2594  q5 = __lsx_vbitsel_v(q5, out_l, flat2);
2595  __lsx_vstelm_d(q5, dst, 0, 0);
2596  dst += 16;
2597 
2598  /* q6 */
2599  tmp0_l = q7_l_in - q5_l_in;
2600  tmp0_l += q6_l_in;
2601  tmp0_l -= p1_l_in;
2602  tmp1_l += tmp0_l;
2603  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2604  out_l = __lsx_vpickev_b(out_l, out_l);
2605  q6 = __lsx_vbitsel_v(q6, out_l, flat2);
2606  __lsx_vstelm_d(q6, dst, 0, 0);
2607 
2608  return 0;
2609  }
2610 }
2611 
2612 void ff_loop_filter_h_16_8_lsx(uint8_t *dst, ptrdiff_t stride,
2613  int32_t b_limit_ptr,
2614  int32_t limit_ptr,
2615  int32_t thresh_ptr)
2616 {
2617  uint8_t early_exit = 0;
2618  uint8_t transposed_input[16 * 24] __attribute__ ((aligned(16)));
2619  uint8_t *filter48 = &transposed_input[16 * 16];
2620 
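      /* filter a transposed copy: the vertical edge becomes a horizontal one in the scratch buffer,
       * and the result is transposed back only when the wide filter actually modified pixels */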
2621  vp9_transpose_16x8_to_8x16(dst - 8, stride, transposed_input);
2622 
2623  early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
2624  &filter48[0], dst, stride,
2625  b_limit_ptr, limit_ptr, thresh_ptr);
2626 
2627  if (0 == early_exit) {
2628  early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), dst, stride,
2629  &filter48[0]);
2630 
2631  if (0 == early_exit) {
2632  vp9_transpose_8x16_to_16x8(transposed_input, dst - 8, stride);
2633  }
2634  }
2635 }
2636 
2637 static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48,
2638  uint8_t *dst_org, ptrdiff_t stride,
2639  int32_t b_limit_ptr,
2640  int32_t limit_ptr,
2641  int32_t thresh_ptr)
2642 {
2643  ptrdiff_t stride2 = stride << 1;
2644  ptrdiff_t stride3 = stride2 + stride;
2645  ptrdiff_t stride4 = stride2 << 1;
2646  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
2647  __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
2648  __m128i flat, mask, hev, thresh, b_limit, limit;
2649  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
2650  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
2651  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
2652  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
2653  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
2654  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
2655  __m128i vec0, vec1, vec2, vec3, vec4, vec5;
2656  __m128i zero = __lsx_vldi(0);
2657 
2658  /* load vector elements */
2659  DUP4_ARG2(__lsx_vld, dst, -64, dst, -48, dst, -32, dst, -16,
2660  p3, p2, p1, p0);
2661  DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
2662 
2663  thresh = __lsx_vreplgr2vr_b(thresh_ptr);
2664  b_limit = __lsx_vreplgr2vr_b(b_limit_ptr);
2665  limit = __lsx_vreplgr2vr_b(limit_ptr);
2666 
2667  /* mask and hev */
2668  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2669  hev, mask, flat);
2670  /* flat4 */
2671  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2672  /* filter4 */
2673  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2674  q1_out);
2675 
2676  /* if flat is zero for all pixels, then there is no need to apply the other filter */
2677  if (__lsx_bz_v(flat)) {
2678  DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2679  vec2 = __lsx_vilvl_h(vec1, vec0);
2680  vec3 = __lsx_vilvh_h(vec1, vec0);
2681  DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2682  vec4 = __lsx_vilvl_h(vec1, vec0);
2683  vec5 = __lsx_vilvh_h(vec1, vec0);
2684 
2685  dst_org -= 2;
2686  __lsx_vstelm_w(vec2, dst_org, 0, 0);
2687  __lsx_vstelm_w(vec2, dst_org + stride, 0, 1);
2688  __lsx_vstelm_w(vec2, dst_org + stride2, 0, 2);
2689  __lsx_vstelm_w(vec2, dst_org + stride3, 0, 3);
2690  dst_org += stride4;
2691  __lsx_vstelm_w(vec3, dst_org, 0, 0);
2692  __lsx_vstelm_w(vec3, dst_org + stride, 0, 1);
2693  __lsx_vstelm_w(vec3, dst_org + stride2, 0, 2);
2694  __lsx_vstelm_w(vec3, dst_org + stride3, 0, 3);
2695  dst_org += stride4;
2696  __lsx_vstelm_w(vec4, dst_org, 0, 0);
2697  __lsx_vstelm_w(vec4, dst_org + stride, 0, 1);
2698  __lsx_vstelm_w(vec4, dst_org + stride2, 0, 2);
2699  __lsx_vstelm_w(vec4, dst_org + stride3, 0, 3);
2700  dst_org += stride4;
2701  __lsx_vstelm_w(vec5, dst_org, 0, 0);
2702  __lsx_vstelm_w(vec5, dst_org + stride, 0, 1);
2703  __lsx_vstelm_w(vec5, dst_org + stride2, 0, 2);
2704  __lsx_vstelm_w(vec5, dst_org + stride3, 0, 3);
2705 
2706  return 1;
2707  } else {
2708  DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0,
2709  p3_l, p2_l, p1_l, p0_l);
2710  DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3,
2711  q0_l, q1_l, q2_l, q3_l);
2712  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
2713  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
2714  DUP4_ARG2(__lsx_vilvh_b, zero, p3, zero, p2, zero, p1, zero, p0,
2715  p3_h, p2_h, p1_h, p0_h);
2716  DUP4_ARG2(__lsx_vilvh_b, zero, q0, zero, q1, zero, q2, zero, q3,
2717  q0_h, q1_h, q2_h, q3_h);
2718  VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
2719  p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
2720 
2721  /* convert 16 bit output data into 8 bit */
2722  DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h,
2723  p1_filt8_l, p0_filt8_h, p0_filt8_l, q0_filt8_h,
2724  q0_filt8_l, p2_filt8_l, p1_filt8_l, p0_filt8_l,
2725  q0_filt8_l);
2726  DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h,
2727  q2_filt8_l, q1_filt8_l, q2_filt8_l);
2728 
2729  /* merge the filter8 outputs with the filter4 outputs according to flat */
2730  p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
2731  p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
2732  p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
2733  q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
2734  q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
2735  q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
2736 
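      /* save both 8-pixel halves of the filter8 results plus the flat mask for the wide-filter stage */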
2737  __lsx_vst(p2_out, filter48, 0);
2738  __lsx_vst(p1_out, filter48, 16);
2739  __lsx_vst(p0_out, filter48, 32);
2740  __lsx_vst(q0_out, filter48, 48);
2741  __lsx_vst(q1_out, filter48, 64);
2742  __lsx_vst(q2_out, filter48, 80);
2743  __lsx_vst(flat, filter48, 96);
2744 
2745  return 0;
2746  }
2747 }
2748 
2749 static int32_t vp9_vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org,
2750  ptrdiff_t stride,
2751  uint8_t *filter48)
2752 {
2753  __m128i zero = __lsx_vldi(0);
2754  __m128i flat, flat2, filter8;
2755  __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2756  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
2757  v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
2758  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
2759  v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
2760  v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
2761  v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
2762  v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
2763  v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
2764  v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
2765  __m128i out_l, out_h;
2766  uint8_t *dst_tmp = dst - 128;
2767 
2768  flat = __lsx_vld(filter48, 96);
2769 
2770  DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32,
2771  dst_tmp, 48, p7, p6, p5, p4);
2772  DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96,
2773  dst_tmp, 112, p3, p2, p1, p0);
2774  DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
2775  DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7);
2776 
2777  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
2778 
2779  /* if flat2 is zero for all pixels, then there is no need to apply the wide filter */
2780  if (__lsx_bz_v(flat2)) {
2781  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2782 
2783  DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32,
2784  filter48, 48, p2, p1, p0, q0);
2785  DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
2786 
2787  DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
2788  vec3 = __lsx_vilvl_h(vec1, vec0);
2789  vec4 = __lsx_vilvh_h(vec1, vec0);
2790  DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
2791  vec6 = __lsx_vilvl_h(vec1, vec0);
2792  vec7 = __lsx_vilvh_h(vec1, vec0);
2793  vec2 = __lsx_vilvl_b(q2, q1);
2794  vec5 = __lsx_vilvh_b(q2, q1);
2795 
2796  dst_org -= 3;
2797  __lsx_vstelm_w(vec3, dst_org, 0, 0);
2798  __lsx_vstelm_h(vec2, dst_org, 4, 0);
2799  dst_org += stride;
2800  __lsx_vstelm_w(vec3, dst_org, 0, 1);
2801  __lsx_vstelm_h(vec2, dst_org, 4, 1);
2802  dst_org += stride;
2803  __lsx_vstelm_w(vec3, dst_org, 0, 2);
2804  __lsx_vstelm_h(vec2, dst_org, 4, 2);
2805  dst_org += stride;
2806  __lsx_vstelm_w(vec3, dst_org, 0, 3);
2807  __lsx_vstelm_h(vec2, dst_org, 4, 3);
2808  dst_org += stride;
2809  __lsx_vstelm_w(vec4, dst_org, 0, 0);
2810  __lsx_vstelm_h(vec2, dst_org, 4, 4);
2811  dst_org += stride;
2812  __lsx_vstelm_w(vec4, dst_org, 0, 1);
2813  __lsx_vstelm_h(vec2, dst_org, 4, 5);
2814  dst_org += stride;
2815  __lsx_vstelm_w(vec4, dst_org, 0, 2);
2816  __lsx_vstelm_h(vec2, dst_org, 4, 6);
2817  dst_org += stride;
2818  __lsx_vstelm_w(vec4, dst_org, 0, 3);
2819  __lsx_vstelm_h(vec2, dst_org, 4, 7);
2820  dst_org += stride;
2821  __lsx_vstelm_w(vec6, dst_org, 0, 0);
2822  __lsx_vstelm_h(vec5, dst_org, 4, 0);
2823  dst_org += stride;
2824  __lsx_vstelm_w(vec6, dst_org, 0, 1);
2825  __lsx_vstelm_h(vec5, dst_org, 4, 1);
2826  dst_org += stride;
2827  __lsx_vstelm_w(vec6, dst_org, 0, 2);
2828  __lsx_vstelm_h(vec5, dst_org, 4, 2);
2829  dst_org += stride;
2830  __lsx_vstelm_w(vec6, dst_org, 0, 3);
2831  __lsx_vstelm_h(vec5, dst_org, 4, 3);
2832  dst_org += stride;
2833  __lsx_vstelm_w(vec7, dst_org, 0, 0);
2834  __lsx_vstelm_h(vec5, dst_org, 4, 4);
2835  dst_org += stride;
2836  __lsx_vstelm_w(vec7, dst_org, 0, 1);
2837  __lsx_vstelm_h(vec5, dst_org, 4, 5);
2838  dst_org += stride;
2839  __lsx_vstelm_w(vec7, dst_org, 0, 2);
2840  __lsx_vstelm_h(vec5, dst_org, 4, 6);
2841  dst_org += stride;
2842  __lsx_vstelm_w(vec7, dst_org, 0, 3);
2843  __lsx_vstelm_h(vec5, dst_org, 4, 7);
2844 
2845  return 1;
2846  } else {
2847  dst -= 7 * 16;
2848 
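      /* wide filter over all 16 rows: the low and high 8-pixel halves are widened to 16 bits and
       * processed in parallel with running window sums tmp1_l and tmp1_h */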
2849  p7_l_in = (v8u16)__lsx_vilvl_b(zero, p7);
2850  p6_l_in = (v8u16)__lsx_vilvl_b(zero, p6);
2851  p5_l_in = (v8u16)__lsx_vilvl_b(zero, p5);
2852  p4_l_in = (v8u16)__lsx_vilvl_b(zero, p4);
2853  p3_l_in = (v8u16)__lsx_vilvl_b(zero, p3);
2854  p2_l_in = (v8u16)__lsx_vilvl_b(zero, p2);
2855  p1_l_in = (v8u16)__lsx_vilvl_b(zero, p1);
2856  p0_l_in = (v8u16)__lsx_vilvl_b(zero, p0);
2857  q0_l_in = (v8u16)__lsx_vilvl_b(zero, q0);
2858 
2859  tmp0_l = p7_l_in << 3;
2860  tmp0_l -= p7_l_in;
2861  tmp0_l += p6_l_in;
2862  tmp0_l += q0_l_in;
2863  tmp1_l = p6_l_in + p5_l_in;
2864  tmp1_l += p4_l_in;
2865  tmp1_l += p3_l_in;
2866  tmp1_l += p2_l_in;
2867  tmp1_l += p1_l_in;
2868  tmp1_l += p0_l_in;
2869  tmp1_l += tmp0_l;
2870  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2871 
2872  p7_h_in = (v8u16)__lsx_vilvh_b(zero, p7);
2873  p6_h_in = (v8u16)__lsx_vilvh_b(zero, p6);
2874  p5_h_in = (v8u16)__lsx_vilvh_b(zero, p5);
2875  p4_h_in = (v8u16)__lsx_vilvh_b(zero, p4);
2876  p3_h_in = (v8u16)__lsx_vilvh_b(zero, p3);
2877  p2_h_in = (v8u16)__lsx_vilvh_b(zero, p2);
2878  p1_h_in = (v8u16)__lsx_vilvh_b(zero, p1);
2879  p0_h_in = (v8u16)__lsx_vilvh_b(zero, p0);
2880  q0_h_in = (v8u16)__lsx_vilvh_b(zero, q0);
2881 
2882  tmp0_h = p7_h_in << 3;
2883  tmp0_h -= p7_h_in;
2884  tmp0_h += p6_h_in;
2885  tmp0_h += q0_h_in;
2886  tmp1_h = p6_h_in + p5_h_in;
2887  tmp1_h += p4_h_in;
2888  tmp1_h += p3_h_in;
2889  tmp1_h += p2_h_in;
2890  tmp1_h += p1_h_in;
2891  tmp1_h += p0_h_in;
2892  tmp1_h += tmp0_h;
2893  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2894 
2895  out_l = __lsx_vpickev_b(out_h, out_l);
2896  p6 = __lsx_vbitsel_v(p6, out_l, flat2);
2897  __lsx_vst(p6, dst, 0);
2898 
2899  /* p5 */
2900  q1_l_in = (v8u16)__lsx_vilvl_b(zero, q1);
2901  tmp0_l = p5_l_in - p6_l_in;
2902  tmp0_l += q1_l_in;
2903  tmp0_l -= p7_l_in;
2904  tmp1_l += tmp0_l;
2905  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2906  q1_h_in = (v8u16)__lsx_vilvh_b(zero, q1);
2907  tmp0_h = p5_h_in - p6_h_in;
2908  tmp0_h += q1_h_in;
2909  tmp0_h -= p7_h_in;
2910  tmp1_h += tmp0_h;
2911  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2912  out_l = __lsx_vpickev_b(out_h, out_l);
2913  p5 = __lsx_vbitsel_v(p5, out_l, flat2);
2914  __lsx_vst(p5, dst, 16);
2915 
2916  /* p4 */
2917  q2_l_in = (v8u16)__lsx_vilvl_b(zero, q2);
2918  tmp0_l = p4_l_in - p5_l_in;
2919  tmp0_l += q2_l_in;
2920  tmp0_l -= p7_l_in;
2921  tmp1_l += tmp0_l;
2922  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2923  q2_h_in = (v8u16)__lsx_vilvh_b(zero, q2);
2924  tmp0_h = p4_h_in - p5_h_in;
2925  tmp0_h += q2_h_in;
2926  tmp0_h -= p7_h_in;
2927  tmp1_h += tmp0_h;
2928  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2929  out_l = __lsx_vpickev_b(out_h, out_l);
2930  p4 = __lsx_vbitsel_v(p4, out_l, flat2);
2931  __lsx_vst(p4, dst, 16*2);
2932 
2933  /* p3 */
2934  q3_l_in = (v8u16)__lsx_vilvl_b(zero, q3);
2935  tmp0_l = p3_l_in - p4_l_in;
2936  tmp0_l += q3_l_in;
2937  tmp0_l -= p7_l_in;
2938  tmp1_l += tmp0_l;
2939  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2940  q3_h_in = (v8u16)__lsx_vilvh_b(zero, q3);
2941  tmp0_h = p3_h_in - p4_h_in;
2942  tmp0_h += q3_h_in;
2943  tmp0_h -= p7_h_in;
2944  tmp1_h += tmp0_h;
2945  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2946  out_l = __lsx_vpickev_b(out_h, out_l);
2947  p3 = __lsx_vbitsel_v(p3, out_l, flat2);
2948  __lsx_vst(p3, dst, 16*3);
2949 
2950  /* p2 */
2951  q4_l_in = (v8u16)__lsx_vilvl_b(zero, q4);
2952  filter8 = __lsx_vld(filter48, 0);
2953  tmp0_l = p2_l_in - p3_l_in;
2954  tmp0_l += q4_l_in;
2955  tmp0_l -= p7_l_in;
2956  tmp1_l += tmp0_l;
2957  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2958  q4_h_in = (v8u16)__lsx_vilvh_b(zero, q4);
2959  tmp0_h = p2_h_in - p3_h_in;
2960  tmp0_h += q4_h_in;
2961  tmp0_h -= p7_h_in;
2962  tmp1_h += tmp0_h;
2963  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2964  out_l = __lsx_vpickev_b(out_h, out_l);
2965  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2966  __lsx_vst(filter8, dst, 16*4);
2967 
2968  /* p1 */
2969  q5_l_in = (v8u16)__lsx_vilvl_b(zero, q5);
2970  filter8 = __lsx_vld(filter48, 16);
2971  tmp0_l = p1_l_in - p2_l_in;
2972  tmp0_l += q5_l_in;
2973  tmp0_l -= p7_l_in;
2974  tmp1_l += tmp0_l;
2975  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2976  q5_h_in = (v8u16)__lsx_vilvh_b(zero, q5);
2977  tmp0_h = p1_h_in - p2_h_in;
2978  tmp0_h += q5_h_in;
2979  tmp0_h -= p7_h_in;
2980  tmp1_h += tmp0_h;
2981  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
2982  out_l = __lsx_vpickev_b(out_h, out_l);
2983  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
2984  __lsx_vst(filter8, dst, 16*5);
2985 
2986  /* p0 */
2987  q6_l_in = (v8u16)__lsx_vilvl_b(zero, q6);
2988  filter8 = __lsx_vld(filter48, 32);
2989  tmp0_l = p0_l_in - p1_l_in;
2990  tmp0_l += q6_l_in;
2991  tmp0_l -= p7_l_in;
2992  tmp1_l += tmp0_l;
2993  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
2994  q6_h_in = (v8u16)__lsx_vilvh_b(zero, q6);
2995  tmp0_h = p0_h_in - p1_h_in;
2996  tmp0_h += q6_h_in;
2997  tmp0_h -= p7_h_in;
2998  tmp1_h += tmp0_h;
2999  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3000  out_l = __lsx_vpickev_b(out_h, out_l);
3001  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
3002  __lsx_vst(filter8, dst, 16*6);
3003 
3004  /* q0 */
3005  q7_l_in = (v8u16)__lsx_vilvl_b(zero, q7);
3006  filter8 = __lsx_vld(filter48, 48);
3007  tmp0_l = q7_l_in - p0_l_in;
3008  tmp0_l += q0_l_in;
3009  tmp0_l -= p7_l_in;
3010  tmp1_l += tmp0_l;
3011  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3012  q7_h_in = (v8u16)__lsx_vilvh_b(zero, q7);
3013  tmp0_h = q7_h_in - p0_h_in;
3014  tmp0_h += q0_h_in;
3015  tmp0_h -= p7_h_in;
3016  tmp1_h += tmp0_h;
3017  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3018  out_l = __lsx_vpickev_b(out_h, out_l);
3019  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
3020  __lsx_vst(filter8, dst, 16*7);
3021 
3022  /* q1 */
3023  filter8 = __lsx_vld(filter48, 64);
3024  tmp0_l = q7_l_in - q0_l_in;
3025  tmp0_l += q1_l_in;
3026  tmp0_l -= p6_l_in;
3027  tmp1_l += tmp0_l;
3028  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3029  tmp0_h = q7_h_in - q0_h_in;
3030  tmp0_h += q1_h_in;
3031  tmp0_h -= p6_h_in;
3032  tmp1_h += tmp0_h;
3033  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3034  out_l = __lsx_vpickev_b(out_h, out_l);
3035  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
3036  __lsx_vst(filter8, dst, 16*8);
3037 
3038  /* q2 */
3039  filter8 = __lsx_vld(filter48, 80);
3040  tmp0_l = q7_l_in - q1_l_in;
3041  tmp0_l += q2_l_in;
3042  tmp0_l -= p5_l_in;
3043  tmp1_l += tmp0_l;
3044  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3045  tmp0_h = q7_h_in - q1_h_in;
3046  tmp0_h += q2_h_in;
3047  tmp0_h -= p5_h_in;
3048  tmp1_h += tmp0_h;
3049  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3050  out_l = __lsx_vpickev_b(out_h, out_l);
3051  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
3052  __lsx_vst(filter8, dst, 16*9);
3053 
3054  /* q3 */
3055  tmp0_l = q7_l_in - q2_l_in;
3056  tmp0_l += q3_l_in;
3057  tmp0_l -= p4_l_in;
3058  tmp1_l += tmp0_l;
3059  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3060  tmp0_h = q7_h_in - q2_h_in;
3061  tmp0_h += q3_h_in;
3062  tmp0_h -= p4_h_in;
3063  tmp1_h += tmp0_h;
3064  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3065  out_l = __lsx_vpickev_b(out_h, out_l);
3066  q3 = __lsx_vbitsel_v(q3, out_l, flat2);
3067  __lsx_vst(q3, dst, 16*10);
3068 
3069  /* q4 */
3070  tmp0_l = q7_l_in - q3_l_in;
3071  tmp0_l += q4_l_in;
3072  tmp0_l -= p3_l_in;
3073  tmp1_l += tmp0_l;
3074  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3075  tmp0_h = q7_h_in - q3_h_in;
3076  tmp0_h += q4_h_in;
3077  tmp0_h -= p3_h_in;
3078  tmp1_h += tmp0_h;
3079  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3080  out_l = __lsx_vpickev_b(out_h, out_l);
3081  q4 = __lsx_vbitsel_v(q4, out_l, flat2);
3082  __lsx_vst(q4, dst, 16*11);
3083 
3084  /* q5 */
3085  tmp0_l = q7_l_in - q4_l_in;
3086  tmp0_l += q5_l_in;
3087  tmp0_l -= p2_l_in;
3088  tmp1_l += tmp0_l;
3089  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3090  tmp0_h = q7_h_in - q4_h_in;
3091  tmp0_h += q5_h_in;
3092  tmp0_h -= p2_h_in;
3093  tmp1_h += tmp0_h;
3094  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3095  out_l = __lsx_vpickev_b(out_h, out_l);
3096  q5 = __lsx_vbitsel_v(q5, out_l, flat2);
3097  __lsx_vst(q5, dst, 16*12);
3098 
3099  /* q6 */
3100  tmp0_l = q7_l_in - q5_l_in;
3101  tmp0_l += q6_l_in;
3102  tmp0_l -= p1_l_in;
3103  tmp1_l += tmp0_l;
3104  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
3105  tmp0_h = q7_h_in - q5_h_in;
3106  tmp0_h += q6_h_in;
3107  tmp0_h -= p1_h_in;
3108  tmp1_h += tmp0_h;
3109  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
3110  out_l = __lsx_vpickev_b(out_h, out_l);
3111  q6 = __lsx_vbitsel_v(q6, out_l, flat2);
3112  __lsx_vst(q6, dst, 16*13);
3113 
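 /* Returning 0 tells the caller that the 16-tap filter ran, so the
  * transposed block still has to be written back to the frame. */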
3114  return 0;
3115  }
3116 }
3117 
3118 void ff_loop_filter_h_16_16_lsx(uint8_t *dst, ptrdiff_t stride,
3119  int32_t b_limit_ptr,
3120  int32_t limit_ptr,
3121  int32_t thresh_ptr)
3122 {
3123  uint8_t early_exit = 0;
3124  uint8_t transposed_input[16 * 24] __attribute__ ((aligned(16)));
3125  uint8_t *filter48 = &transposed_input[16 * 16];
3126 
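 /* Vertical (column) edge: transpose the 16x16 pixels straddling the edge
  * into a 16-byte-pitch scratch buffer, filter them as rows (4/8-tap pass
  * first, then the 16-tap pass), and transpose the block back only when the
  * wide filter was actually applied.  The tail of transposed_input serves
  * as the filter48 staging area shared between the two passes. */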
3127  vp9_transpose_16x16((dst - 8), stride, &transposed_input[0], 16);
3128 
3129  early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
3130  &filter48[0], dst, stride,
3131  b_limit_ptr, limit_ptr, thresh_ptr);
3132 
3133  if (0 == early_exit) {
3134  early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), dst,
3135  stride, &filter48[0]);
3136 
3137  if (0 == early_exit) {
3138  vp9_transpose_16x16(transposed_input, 16, (dst - 8), stride);
3139  }
3140  }
3141 }
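For reference, the per-pixel arithmetic behind the p2 .. q6 blocks in vp9_vt_lpf_t16_16w can be written as a small scalar model. The sketch below is illustrative only: the function name vp9_wide_filter_row_ref and the p[]/q[] layout are invented here, the initial sum assumes the standard VP9 wide-filter weights (p7*7 + p6*2 + p5 + ... + p0 + q0), and the flat2 mask blending is omitted.

#include <stdint.h>

/* Scalar model: p[0..7] = p0..p7 left of the edge, q[0..7] = q0..q7 right of
 * it; out[] receives the filtered p6..p0 followed by q0..q6 (14 values). */
static void vp9_wide_filter_row_ref(const uint8_t p[8], const uint8_t q[8],
                                    uint8_t out[14])
{
    /* initial sum for the p6 output: p7*7 + p6*2 + p5 + ... + p0 + q0 */
    int32_t sum = p[7] * 7 + p[6] * 2 + p[5] + p[4] + p[3] + p[2] +
                  p[1] + p[0] + q[0];
    int n = 0;

    out[n++] = (sum + 8) >> 4;                     /* p6 */
    for (int k = 5; k >= 0; k--) {                 /* p5 .. p0 */
        /* sample in, doubled centre advances, one copy of p7 out */
        sum += p[k] - p[k + 1] + q[6 - k] - p[7];
        out[n++] = (sum + 8) >> 4;
    }
    for (int k = 0; k <= 6; k++) {                 /* q0 .. q6 */
        /* q7 repeats as the incoming sample; trailing p samples drop out */
        sum += q[k] - (k ? q[k - 1] : p[0]) + q[7] - p[7 - k];
        out[n++] = (sum + 8) >> 4;
    }
}

Each vector step above performs the same += update on eight 16-bit lanes at a time, once for the low half (tmp0_l/tmp1_l) and once for the high half (tmp0_h/tmp1_h), before packing and masking.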