FFmpeg
hevc_mc_uni_lsx.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2022 Loongson Technology Corporation Limited
3  * Contributed by Lu Wang <wanglu@loongson.cn>
4  * Hao Chen <chenhao@loongson.cn>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
24 #include "hevcdsp_lsx.h"
25 
/*
 * Byte-shuffle index tables for the horizontal filters in this file.
 *
 * Each 16-byte row lists adjacent byte pairs (i, i + 1), so that a byte
 * shuffle with the row followed by a 2-element dot product applies one pair
 * of filter taps to eight output positions at once.  The code adds 2, 4 and 6
 * to a row to obtain the index vectors for the remaining tap pairs.
 *
 * Row 0 addresses bytes 0..8 of one source vector.  Rows 1 and 2 contain
 * indices >= 16; these presumably address the other source operand of the
 * byte shuffle -- confirm against the LSX vshuf.b definition.
 *
 * The table is 64-byte aligned and is loaded with 128-bit vector loads.
 */
static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
34 
35 static av_always_inline
36 void common_hz_8t_64w_lsx(uint8_t *src, int32_t src_stride,
37  uint8_t *dst, int32_t dst_stride,
38  const int8_t *filter, int32_t height)
39 {
40  int32_t loop_cnt;
41  __m128i mask0, mask1, mask2, mask3, out1, out2;
42  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
43  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
44  __m128i filt0, filt1, filt2, filt3;
45  __m128i res0, res1, res2, res3;
46 
47  mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
48  src -= 3;
49 
50  /* rearranging filter */
51  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
52  filt0, filt1, filt2, filt3);
53 
54  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
55  mask3 = __lsx_vaddi_bu(mask0, 6);
56 
57  for (loop_cnt = height; loop_cnt--;) {
58  DUP4_ARG2(__lsx_vld, src, 0, src, 8, src, 16, src, 24,
59  src0, src1, src2, src3);
60  DUP4_ARG2(__lsx_vld, src, 32, src, 40, src, 48, src, 56,
61  src4, src5, src6, src7);
62  src += src_stride;
63 
64  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,
65  vec0, vec1);
66  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0,
67  vec2, vec3);
68  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
69  vec3, filt0, res0, res1, res2, res3);
70  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,
71  vec0, vec1);
72  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2,
73  vec2, vec3);
74  DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2,
75  res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3);
76  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,
77  vec4, vec5);
78  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1,
79  vec6, vec7);
80  DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1,
81  res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3);
82  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3,
83  vec4, vec5);
84  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask3, src3, src3, mask3,
85  vec6, vec7);
86  DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3,
87  res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3);
88 
89  DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6,
90  out1, out2);
91  __lsx_vst(out1, dst, 0);
92  __lsx_vst(out2, dst, 16);
93 
94  DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src5, src5, mask0,
95  vec0, vec1);
96  DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src7, src7, mask0,
97  vec2, vec3);
98  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec1, filt0, vec2, filt0,
99  vec3, filt0, res0, res1, res2, res3);
100  DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask2, src5, src5, mask2,
101  vec0, vec1);
102  DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask2, src7, src7, mask2,
103  vec2, vec3);
104  DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec0, filt2, res1, vec1, filt2,
105  res2, vec2, filt2, res3, vec3, filt2, res0, res1, res2, res3);
106  DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask1, src5, src5, mask1,
107  vec4, vec5);
108  DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask1, src7, src7, mask1,
109  vec6, vec7);
110  DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt1, res1, vec5, filt1,
111  res2, vec6, filt1, res3, vec7, filt1, res0, res1, res2, res3);
112  DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask3, src5, src5, mask3,
113  vec4, vec5);
114  DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask3, src7, src7, mask3,
115  vec6, vec7);
116  DUP4_ARG3(__lsx_vdp2add_h_bu_b, res0, vec4, filt3, res1, vec5, filt3,
117  res2, vec6, filt3, res3, vec7, filt3, res0, res1, res2, res3);
118 
119  DUP2_ARG3(__lsx_vssrarni_bu_h, res1, res0, 6, res3, res2, 6,
120  out1, out2);
121  __lsx_vst(out1, dst, 32);
122  __lsx_vst(out2, dst, 48);
123  dst += dst_stride;
124  }
125 }
126 
127 static av_always_inline
128 void common_vt_8t_8w_lsx(uint8_t *src, int32_t src_stride,
129  uint8_t *dst, int32_t dst_stride,
130  const int8_t *filter, int32_t height)
131 {
132  uint32_t loop_cnt;
133  int32_t src_stride_2x = (src_stride << 1);
134  int32_t dst_stride_2x = (dst_stride << 1);
135  int32_t src_stride_4x = (src_stride << 2);
136  int32_t dst_stride_4x = (dst_stride << 2);
137  int32_t src_stride_3x = src_stride_2x + src_stride;
138  int32_t dst_stride_3x = dst_stride_2x + dst_stride;
139 
140  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
141  __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
142  __m128i src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
143  __m128i tmp0, tmp1;
144  __m128i out0_r, out1_r, out2_r, out3_r;
145 
146  src -= src_stride_3x;
147  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
148  filt0, filt1, filt2, filt3);
149 
150  src0 = __lsx_vld(src, 0);
151  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
152  src3 = __lsx_vldx(src, src_stride_3x);
153  src += src_stride_4x;
154  src4 = __lsx_vld(src, 0);
155  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
156  src += src_stride_3x;
157  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
158  src10_r, src32_r, src54_r, src21_r);
159  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
160 
161  for (loop_cnt = (height >> 2); loop_cnt--;) {
162  src7 = __lsx_vld(src, 0);
163  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
164  src10 = __lsx_vldx(src, src_stride_3x);
165  src += src_stride_4x;
166 
167  DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
168  src9, src76_r, src87_r, src98_r, src109_r);
169  DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
170  filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r);
171  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r,
172  src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r,
173  filt1, out0_r, out1_r, out2_r, out3_r);
174  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r,
175  src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r,
176  filt2, out0_r, out1_r, out2_r, out3_r);
177  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r,
178  src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r,
179  filt3, out0_r, out1_r, out2_r, out3_r);
180 
181  DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6,
182  tmp0, tmp1)
183  __lsx_vstelm_d(tmp0, dst, 0, 0);
184  __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
185  __lsx_vstelm_d(tmp1, dst + dst_stride_2x, 0, 0);
186  __lsx_vstelm_d(tmp1, dst + dst_stride_3x, 0, 1);
187  dst += dst_stride_4x;
188 
189  src10_r = src54_r;
190  src32_r = src76_r;
191  src54_r = src98_r;
192  src21_r = src65_r;
193  src43_r = src87_r;
194  src65_r = src109_r;
195  src6 = src10;
196  }
197 }
198 
199 static av_always_inline
200 void common_vt_8t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
201  int32_t dst_stride, const int8_t *filter,
203 {
204  uint8_t *src_tmp;
205  uint8_t *dst_tmp;
206  uint32_t loop_cnt, cnt;
207  const int32_t src_stride_2x = (src_stride << 1);
208  const int32_t dst_stride_2x = (dst_stride << 1);
209  const int32_t src_stride_4x = (src_stride << 2);
210  const int32_t dst_stride_4x = (dst_stride << 2);
211  const int32_t src_stride_3x = src_stride_2x + src_stride;
212  const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
213 
214  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
215  __m128i filt0, filt1, filt2, filt3;
216  __m128i src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
217  __m128i src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
218  __m128i src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
219  __m128i tmp0, tmp1, tmp2, tmp3;
220  __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
221 
222  src -= src_stride_3x;
223  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6, filt0,
224  filt1, filt2, filt3);
225 
226  for (cnt = (width >> 4); cnt--;) {
227  src_tmp = src;
228  dst_tmp = dst;
229 
230  src0 = __lsx_vld(src_tmp, 0);
231  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
232  src1, src2);
233  src3 = __lsx_vldx(src_tmp, src_stride_3x);
234  src_tmp += src_stride_4x;
235  src4 = __lsx_vld(src_tmp, 0);
236  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
237  src5, src6);
238  src_tmp += src_stride_3x;
239  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
240  src10_r, src32_r, src54_r, src21_r);
241  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, src43_r, src65_r);
242  DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
243  src10_l, src32_l, src54_l, src21_l);
244  DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, src43_l, src65_l);
245 
246  for (loop_cnt = (height >> 2); loop_cnt--;) {
247  src7 = __lsx_vld(src_tmp, 0);
248  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
249  src8, src9);
250  src10 = __lsx_vldx(src_tmp, src_stride_3x);
251  src_tmp += src_stride_4x;
252  DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
253  src9, src76_r, src87_r, src98_r, src109_r);
254  DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10,
255  src9, src76_l, src87_l, src98_l, src109_l);
256  DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src21_r, filt0, src32_r,
257  filt0, src43_r, filt0, out0_r, out1_r, out2_r, out3_r);
258  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out1_r,
259  src43_r, filt1, out2_r, src54_r, filt1, out3_r, src65_r,
260  filt1, out0_r, out1_r, out2_r, out3_r);
261  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src54_r, filt2, out1_r,
262  src65_r, filt2, out2_r, src76_r, filt2, out3_r, src87_r,
263  filt2, out0_r, out1_r, out2_r, out3_r);
264  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src76_r, filt3, out1_r,
265  src87_r, filt3, out2_r, src98_r, filt3, out3_r, src109_r,
266  filt3, out0_r, out1_r, out2_r, out3_r);
267  DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_l, filt0, src21_l, filt0, src32_l,
268  filt0, src43_l, filt0, out0_l, out1_l, out2_l, out3_l);
269  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src32_l, filt1, out1_l,
270  src43_l, filt1, out2_l, src54_l, filt1, out3_l, src65_l,
271  filt1, out0_l, out1_l, out2_l, out3_l);
272  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src54_l, filt2, out1_l,
273  src65_l, filt2, out2_l, src76_l, filt2, out3_l, src87_l,
274  filt2, out0_l, out1_l, out2_l, out3_l);
275  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_l, src76_l, filt3, out1_l,
276  src87_l, filt3, out2_l, src98_l, filt3, out3_l, src109_l,
277  filt3, out0_l, out1_l, out2_l, out3_l);
278  DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r,
279  6, out2_l, out2_r, 6, out3_l, out3_r, 6,
280  tmp0, tmp1, tmp2, tmp3);
281  __lsx_vst(tmp0, dst_tmp, 0);
282  __lsx_vstx(tmp1, dst_tmp, dst_stride);
283  __lsx_vstx(tmp2, dst_tmp, dst_stride_2x);
284  __lsx_vstx(tmp3, dst_tmp, dst_stride_3x);
285  dst_tmp += dst_stride_4x;
286 
287  src10_r = src54_r;
288  src32_r = src76_r;
289  src54_r = src98_r;
290  src21_r = src65_r;
291  src43_r = src87_r;
292  src65_r = src109_r;
293  src10_l = src54_l;
294  src32_l = src76_l;
295  src54_l = src98_l;
296  src21_l = src65_l;
297  src43_l = src87_l;
298  src65_l = src109_l;
299  src6 = src10;
300  }
301 
302  src += 16;
303  dst += 16;
304  }
305 }
306 
/*
 * Vertical 8-tap filter for a 24-pixel-wide block: columns 0..15 go through
 * the 16-wide helper (one strip), columns 16..23 through the 8-wide helper.
 * All arguments are forwarded unchanged apart from the 16-byte column offset.
 */
static void common_vt_8t_24w_lsx(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 16);
    common_vt_8t_8w_lsx(src + 16, src_stride, dst + 16, dst_stride, filter,
                        height);
}
315 
/*
 * Vertical 8-tap filter for a 32-pixel-wide block; delegates to the 16-wide
 * helper with width 32 (two 16-column strips).
 */
static void common_vt_8t_32w_lsx(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 32);
}
322 
/*
 * Vertical 8-tap filter for a 48-pixel-wide block; delegates to the 16-wide
 * helper with width 48 (three 16-column strips).
 */
static void common_vt_8t_48w_lsx(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 48);
}
329 
/*
 * Vertical 8-tap filter for a 64-pixel-wide block; delegates to the 16-wide
 * helper with width 64 (four 16-column strips).
 */
static void common_vt_8t_64w_lsx(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_lsx(src, src_stride, dst, dst_stride, filter, height, 64);
}
336 
337 static av_always_inline
338 void hevc_hv_8t_8x2_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
339  int32_t dst_stride, const int8_t *filter_x,
340  const int8_t *filter_y, int32_t height, int32_t width)
341 {
342  uint32_t loop_cnt, cnt;
343  uint8_t *src_tmp;
344  uint8_t *dst_tmp;
345  const int32_t src_stride_2x = (src_stride << 1);
346  const int32_t dst_stride_2x = (dst_stride << 1);
347  const int32_t src_stride_4x = (src_stride << 2);
348  const int32_t src_stride_3x = src_stride_2x + src_stride;
349 
350  __m128i out;
351  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
352  __m128i filt0, filt1, filt2, filt3;
353  __m128i filt_h0, filt_h1, filt_h2, filt_h3;
354  __m128i mask1, mask2, mask3;
355  __m128i filter_vec;
356  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
357  __m128i vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
358  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
359  __m128i dst0_r, dst0_l, dst1_r, dst1_l;
360  __m128i dst10_r, dst32_r, dst54_r, dst76_r;
361  __m128i dst10_l, dst32_l, dst54_l, dst76_l;
362  __m128i dst21_r, dst43_r, dst65_r, dst87_r;
363  __m128i dst21_l, dst43_l, dst65_l, dst87_l;
364  __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
365 
366  src -= (src_stride_3x + 3);
367  DUP4_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filter_x, 4,
368  filter_x, 6, filt0, filt1, filt2, filt3);
369 
370  filter_vec = __lsx_vld(filter_y, 0);
371  filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
372  DUP4_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filter_vec, 2,
373  filter_vec, 3, filt_h0, filt_h1, filt_h2, filt_h3);
374 
375  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
376  mask3 = __lsx_vaddi_bu(mask0, 6);
377 
378  for (cnt = width >> 3; cnt--;) {
379  src_tmp = src;
380  dst_tmp = dst;
381 
382  src0 = __lsx_vld(src_tmp, 0);
383  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
384  src1, src2);
385  src3 = __lsx_vldx(src_tmp, src_stride_3x);
386  src_tmp += src_stride_4x;
387  src4 = __lsx_vld(src_tmp, 0);
388  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
389  src5, src6);
390  src_tmp += src_stride_3x;
391 
392  /* row 0 row 1 row 2 row 3 */
393  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src0,
394  src0, mask2, src0, src0, mask3, vec0, vec1, vec2, vec3);
395  DUP4_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, src1,
396  src1, mask2, src1, src1, mask3, vec4, vec5, vec6, vec7);
397  DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src2,
398  src2, mask2, src2, src2, mask3, vec8, vec9, vec10, vec11);
399  DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src3,
400  src3, mask2, src3, src3, mask3, vec12, vec13, vec14, vec15);
401  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, vec8, filt0,
402  vec12, filt0, dst0, dst1, dst2, dst3);
403  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec5, filt1,
404  dst2, vec9, filt1, dst3, vec13, filt1, dst0, dst1, dst2, dst3);
405  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec2, filt2, dst1, vec6, filt2,
406  dst2, vec10, filt2, dst3, vec14, filt2, dst0, dst1, dst2, dst3);
407  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec3, filt3, dst1, vec7, filt3,
408  dst2, vec11, filt3, dst3, vec15, filt3, dst0, dst1, dst2, dst3);
409 
410  DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src4,
411  src4, mask2, src4, src4, mask3, vec0, vec1, vec2, vec3);
412  DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src5,
413  src5, mask2, src5, src5, mask3, vec4, vec5, vec6, vec7);
414  DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src6,
415  src6, mask2, src6, src6, mask3, vec8, vec9, vec10, vec11);
416  DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec4, filt0, dst4, dst5);
417  dst6 = __lsx_vdp2_h_bu_b(vec8, filt0);
418  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec1, filt1, dst5, vec5, filt1,
419  dst6, vec9, filt1, dst4, vec2, filt2, dst4, dst5, dst6, dst4);
420  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst5, vec6, filt2, dst6, vec10, filt2,
421  dst4, vec3, filt3, dst5, vec7, filt3, dst5, dst6, dst4, dst5);
422  dst6 = __lsx_vdp2add_h_bu_b(dst6, vec11, filt3);
423  DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
424  dst1, dst10_r, dst32_r, dst54_r, dst21_r);
425  DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst3, dst2, dst5, dst4, dst2,
426  dst1, dst10_l, dst32_l, dst54_l, dst21_l);
427  DUP2_ARG2(__lsx_vilvl_h, dst4, dst3, dst6, dst5, dst43_r, dst65_r);
428  DUP2_ARG2(__lsx_vilvh_h, dst4, dst3, dst6, dst5, dst43_l, dst65_l);
429 
430  for (loop_cnt = height >> 1; loop_cnt--;) {
431  src7 = __lsx_vld(src_tmp, 0);
432  src8 = __lsx_vldx(src_tmp, src_stride);
433  src_tmp += src_stride_2x;
434 
435  DUP4_ARG3(__lsx_vshuf_b, src7, src7, mask0, src7, src7, mask1, src7,
436  src7, mask2, src7, src7, mask3, vec0, vec1, vec2, vec3);
437  dst7 = __lsx_vdp2_h_bu_b(vec0, filt0);
438  DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst7, vec1, filt1, dst7, vec2,
439  filt2, dst7, dst7);
440  dst7 = __lsx_vdp2add_h_bu_b(dst7, vec3, filt3);
441  dst76_r = __lsx_vilvl_h(dst7, dst6);
442  dst76_l = __lsx_vilvh_h(dst7, dst6);
443  DUP2_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0,
444  dst0_r, dst0_l);
445  DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
446  dst32_l, filt_h1, dst0_r, dst54_r, filt_h2, dst0_l,
447  dst54_l, filt_h2, dst0_r, dst0_l, dst0_r, dst0_l);
448  DUP2_ARG3(__lsx_vdp2add_w_h, dst0_r, dst76_r, filt_h3, dst0_l,
449  dst76_l, filt_h3, dst0_r, dst0_l);
450  DUP2_ARG2(__lsx_vsrai_w, dst0_r, 6, dst0_l, 6, dst0_r, dst0_l);
451 
452  DUP4_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, src8,
453  src8, mask2, src8, src8, mask3, vec0, vec1, vec2, vec3);
454  dst8 = __lsx_vdp2_h_bu_b(vec0, filt0);
455  DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst8, vec1, filt1, dst8, vec2,
456  filt2, dst8, dst8);
457  dst8 = __lsx_vdp2add_h_bu_b(dst8, vec3, filt3);
458 
459  dst87_r = __lsx_vilvl_h(dst8, dst7);
460  dst87_l = __lsx_vilvh_h(dst8, dst7);
461  DUP2_ARG2(__lsx_vdp2_w_h, dst21_r, filt_h0, dst21_l, filt_h0,
462  dst1_r, dst1_l);
463  DUP4_ARG3(__lsx_vdp2add_w_h, dst1_r, dst43_r, filt_h1, dst1_l,
464  dst43_l, filt_h1, dst1_r, dst65_r, filt_h2, dst1_l,
465  dst65_l, filt_h2, dst1_r, dst1_l, dst1_r, dst1_l);
466  DUP2_ARG3(__lsx_vdp2add_w_h, dst1_r, dst87_r, filt_h3, dst1_l,
467  dst87_l, filt_h3, dst1_r, dst1_l);
468  DUP2_ARG2(__lsx_vsrai_w, dst1_r, 6, dst1_l, 6, dst1_r, dst1_l);
469  DUP4_ARG2(__lsx_vsrari_w, dst0_r, 6, dst0_l, 6,dst1_r, 6, dst1_l,
470  6, dst0_r, dst0_l, dst1_r, dst1_l);
471  DUP4_ARG1(__lsx_vclip255_w, dst0_l, dst0_r, dst1_l, dst1_r,
472  dst0_l, dst0_r, dst1_l, dst1_r);
473  DUP2_ARG2(__lsx_vpickev_h, dst0_l, dst0_r, dst1_l, dst1_r,
474  dst0, dst1);
475  out = __lsx_vpickev_b(dst1, dst0);
476  __lsx_vstelm_d(out, dst_tmp, 0, 0);
477  __lsx_vstelm_d(out, dst_tmp + dst_stride, 0, 1);
478  dst_tmp += dst_stride_2x;
479 
480  dst10_r = dst32_r;
481  dst32_r = dst54_r;
482  dst54_r = dst76_r;
483  dst10_l = dst32_l;
484  dst32_l = dst54_l;
485  dst54_l = dst76_l;
486  dst21_r = dst43_r;
487  dst43_r = dst65_r;
488  dst65_r = dst87_r;
489  dst21_l = dst43_l;
490  dst43_l = dst65_l;
491  dst65_l = dst87_l;
492  dst6 = dst8;
493  }
494  src += 8;
495  dst += 8;
496  }
497 }
498 
/* 2-D 8-tap filter for an 8-pixel-wide block (one 8-column strip). */
static void hevc_hv_8t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                              int32_t dst_stride, const int8_t *filter_x,
                              const int8_t *filter_y, int32_t height)
{
    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
                       filter_x, filter_y, height, 8);
}
506 
/* 2-D 8-tap filter for a 16-pixel-wide block (two 8-column strips). */
static void hevc_hv_8t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
                       filter_x, filter_y, height, 16);
}
514 
/* 2-D 8-tap filter for a 24-pixel-wide block (three 8-column strips). */
static void hevc_hv_8t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
                       filter_x, filter_y, height, 24);
}
522 
/* 2-D 8-tap filter for a 32-pixel-wide block (four 8-column strips). */
static void hevc_hv_8t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
                       filter_x, filter_y, height, 32);
}
530 
/* 2-D 8-tap filter for a 48-pixel-wide block (six 8-column strips). */
static void hevc_hv_8t_48w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
                       filter_x, filter_y, height, 48);
}
538 
/* 2-D 8-tap filter for a 64-pixel-wide block (eight 8-column strips). */
static void hevc_hv_8t_64w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    hevc_hv_8t_8x2_lsx(src, src_stride, dst, dst_stride,
                       filter_x, filter_y, height, 64);
}
546 
547 static av_always_inline
548 void common_vt_4t_24w_lsx(uint8_t *src, int32_t src_stride,
549  uint8_t *dst, int32_t dst_stride,
550  const int8_t *filter, int32_t height)
551 {
552  uint32_t loop_cnt;
553  int32_t src_stride_2x = (src_stride << 1);
554  int32_t src_stride_3x = src_stride_2x + src_stride;
555  uint8_t *_src;
556 
557  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
558  __m128i src11, filt0, filt1;
559  __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
560  __m128i src109_r, src10_l, src32_l, src21_l, src43_l;
561  __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
562  __m128i out1, out2, out3, out4;
563 
564  src -= src_stride;
565  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
566  _src = src + 16;
567 
568  /* 16 width */
569  src0 = __lsx_vld(src, 0);
570  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
571  DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
572  DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
573 
574  /* 8 width */
575  src6 = __lsx_vld(_src, 0);
576  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
577  src += src_stride_3x;
578  _src += src_stride_3x;
579  DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
580 
581  for (loop_cnt = 8; loop_cnt--;) {
582  /* 16 width */
583  DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
584  DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
585  DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
586  DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
587 
588  /* 8 width */
589  src += src_stride_2x;
590  _src += src_stride_2x;
591  DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
592 
593  /* 16 width */
594  DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
595  filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l);
596  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l,
597  filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1,
598  out0_r, out0_l, out1_r, out1_l);
599 
600  /* 8 width */
601  DUP2_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src87_r, filt0,
602  out2_r, out3_r);
603  DUP2_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out3_r,
604  src109_r, filt1, out2_r, out3_r);
605 
606  /* 16 + 8 width */
607  DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out2_r, out2_r, 6,
608  out3_r, out3_r, 6, out1_l, out1_r, 6, out1, out2, out3, out4);
609  __lsx_vst(out1, dst, 0);
610  __lsx_vstelm_d(out2, dst, 16, 0);
611  dst += dst_stride;
612  __lsx_vst(out4, dst, 0);
613  __lsx_vstelm_d(out3, dst, 16, 0);
614  dst += dst_stride;
615 
616  /* 16 width */
617  DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src5, src11);
618  DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src2, src8);
619  DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src10_r, src21_r);
620  DUP2_ARG2(__lsx_vilvh_b, src5, src4, src2, src5, src10_l, src21_l);
621 
622  /* 8 width */
623  src += src_stride_2x;
624  _src += src_stride_2x;
625  DUP2_ARG2(__lsx_vilvl_b, src11, src10, src8, src11, src76_r, src87_r);
626 
627  /* 16 width */
628  DUP4_ARG2(__lsx_vdp2_h_bu_b, src32_r, filt0, src32_l, filt0, src43_r,
629  filt0, src43_l, filt0, out0_r, out0_l, out1_r, out1_l);
630  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src10_r, filt1, out0_l, src10_l,
631  filt1, out1_r, src21_r, filt1, out1_l, src21_l, filt1,
632  out0_r, out0_l, out1_r, out1_l);
633 
634  /* 8 width */
635  DUP2_ARG2(__lsx_vdp2_h_bu_b, src98_r, filt0, src109_r, filt0,
636  out2_r, out3_r);
637  DUP2_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src76_r, filt1, out3_r,
638  src87_r, filt1, out2_r, out3_r);
639 
640  /* 16 + 8 width */
641  DUP4_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out2_r, out2_r, 6,
642  out1_l, out1_r, 6, out3_r, out3_r, 6, out1, out2, out3, out4);
643 
644  __lsx_vst(out1, dst, 0);
645  __lsx_vstelm_d(out2, dst, 16, 0);
646  dst += dst_stride;
647  __lsx_vst(out3, dst, 0);
648  __lsx_vstelm_d(out4, dst, 16, 0);
649  dst += dst_stride;
650  }
651 }
652 
653 static av_always_inline
654 void common_vt_4t_32w_lsx(uint8_t *src, int32_t src_stride,
655  uint8_t *dst, int32_t dst_stride,
656  const int8_t *filter, int32_t height)
657 {
658  uint32_t loop_cnt;
659  int32_t src_stride_2x = (src_stride << 1);
660  int32_t dst_stride_2x = (dst_stride << 1);
661  int32_t src_stride_3x = src_stride_2x + src_stride;
662  uint8_t *_src;
663 
664  __m128i src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
665  __m128i src10_r, src32_r, src76_r, src98_r;
666  __m128i src21_r, src43_r, src87_r, src109_r;
667  __m128i out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
668  __m128i src10_l, src32_l, src76_l, src98_l;
669  __m128i src21_l, src43_l, src87_l, src109_l;
670  __m128i filt0, filt1;
671  __m128i out1, out2;
672 
673  src -= src_stride;
674  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
675  _src = src + 16;
676 
677  /* 16 width */
678  src0 = __lsx_vld(src, 0);
679  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
680 
681  DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
682  DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
683 
684  /* next 16 width */
685  src6 = __lsx_vld(_src, 0);
686  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride_2x, src7, src8);
687  src += src_stride_3x;
688  _src += src_stride_3x;
689 
690  DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src76_r, src87_r);
691  DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src76_l, src87_l);
692 
693  for (loop_cnt = (height >> 1); loop_cnt--;) {
694  /* 16 width */
695  DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src3, src9);
696  DUP2_ARG2(__lsx_vldx, src, src_stride, _src, src_stride, src4, src10);
697  DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
698  DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src32_l, src43_l);
699 
700  /* 16 width */
701  DUP4_ARG2(__lsx_vdp2_h_bu_b, src10_r, filt0, src10_l, filt0, src21_r,
702  filt0, src21_l, filt0, out0_r, out0_l, out1_r, out1_l);
703  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out0_r, src32_r, filt1, out0_l, src32_l,
704  filt1, out1_r, src43_r, filt1, out1_l, src43_l, filt1,
705  out0_r, out0_l, out1_r, out1_l);
706 
707  DUP2_ARG3(__lsx_vssrarni_bu_h, out0_l, out0_r, 6, out1_l, out1_r, 6,
708  out1, out2);
709  __lsx_vst(out1, dst, 0);
710  __lsx_vstx(out2, dst, dst_stride);
711 
712  src10_r = src32_r;
713  src21_r = src43_r;
714  src10_l = src32_l;
715  src21_l = src43_l;
716  src2 = src4;
717 
718  /* next 16 width */
719  src += src_stride_2x;
720  _src += src_stride_2x;
721  DUP2_ARG2(__lsx_vilvl_b, src9, src8, src10, src9, src98_r, src109_r);
722  DUP2_ARG2(__lsx_vilvh_b, src9, src8, src10, src9, src98_l, src109_l);
723 
724  /* next 16 width */
725  DUP4_ARG2(__lsx_vdp2_h_bu_b, src76_r, filt0, src76_l, filt0, src87_r,
726  filt0, src87_l, filt0, out2_r, out2_l, out3_r, out3_l);
727  DUP4_ARG3(__lsx_vdp2add_h_bu_b, out2_r, src98_r, filt1, out2_l, src98_l,
728  filt1, out3_r, src109_r, filt1, out3_l, src109_l, filt1,
729  out2_r, out2_l, out3_r, out3_l);
730 
731  /* next 16 width */
732  DUP2_ARG3(__lsx_vssrarni_bu_h, out2_l, out2_r, 6, out3_l, out3_r, 6,
733  out1, out2);
734  __lsx_vst(out1, dst, 16);
735  __lsx_vst(out2, dst + dst_stride, 16);
736 
737  dst += dst_stride_2x;
738 
739  src76_r = src98_r;
740  src87_r = src109_r;
741  src76_l = src98_l;
742  src87_l = src109_l;
743  src8 = src10;
744  }
745 }
746 
/*
 * Separable 4-tap horizontal + 4-tap vertical filter that writes one block
 * of 8 columns x 2 rows of 8-bit pixels.
 *
 * src, src_stride: source pointer and row pitch. The pointer is moved one
 *                  row up and one column left before loading, and five
 *                  consecutive rows are read (16 bytes per row load).
 * dst, dst_stride: destination pointer and row pitch; 8 bytes are stored to
 *                  each of two rows.
 * filter_x:        horizontal taps, read as two 16-bit pairs (offsets 0, 2).
 * filter_y:        vertical taps. NOTE(review): loaded with a full 16-byte
 *                  vector load although only the first four bytes are used;
 *                  presumably the caller's table is large enough - confirm.
 */
747 static av_always_inline
748 void hevc_hv_4t_8x2_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
749  int32_t dst_stride, const int8_t *filter_x,
750  const int8_t *filter_y)
751 {
752  const int32_t src_stride_2x = (src_stride << 1);
753  const int32_t src_stride_4x = (src_stride << 2);
754  const int32_t src_stride_3x = src_stride_2x + src_stride;
755  __m128i out;
756  __m128i src0, src1, src2, src3, src4;
757  __m128i filt0, filt1;
758  __m128i filt_h0, filt_h1, filter_vec;
759  __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
760  __m128i mask1;
761  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
762  __m128i dst0, dst1, dst2, dst3, dst4;
763  __m128i dst0_r, dst0_l, dst1_r, dst1_l;
764  __m128i dst10_r, dst32_r, dst21_r, dst43_r;
765  __m128i dst10_l, dst32_l, dst21_l, dst43_l;
766  __m128i out0_r, out1_r;
767 
    /* Start one row above and one column left of the block. */
768  src -= (src_stride + 1);
    /* Horizontal taps: pair (0,1) in filt0, pair (2,3) in filt1. */
769  DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
770 
    /* Vertical taps: widen bytes to halfwords, then pick word 0 (taps 0,1)
     * and word 1 (taps 2,3) for filt_h0 / filt_h1. */
771  filter_vec = __lsx_vld(filter_y, 0);
772  filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
773  DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
774 
    /* mask0 is the first 16 entries of the table (adjacent byte pairs
     * 0,1 / 1,2 / ...); mask1 is the same pattern shifted by two bytes. */
775  mask1 = __lsx_vaddi_bu(mask0, 2);
    /* Load the five source rows needed for two output rows. */
776  src0 = __lsx_vld(src, 0);
777  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src,
778  src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
779 
    /* Gather the byte pairs for both horizontal tap pairs of every row. */
780  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1,
781  mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
782  DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src3, src3,
783  mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
784  DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, vec8, vec9);
785 
    /* Horizontal pass: dst0..dst4 hold the 16-bit filtered rows. */
786  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
787  filt0, dst0, dst1, dst2, dst3);
788  dst4 = __lsx_vdp2_h_bu_b(vec8, filt0);
789  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
790  vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
791  dst4 = __lsx_vdp2add_h_bu_b(dst4, vec9, filt1);
    /* Interleave vertically adjacent rows (low and high halves) so that one
     * dot product applies two vertical taps at once. */
792  DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
793  dst10_r, dst21_r, dst32_r, dst43_r);
794  DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
795  dst10_l, dst21_l, dst32_l, dst43_l);
    /* Vertical pass: output row 0 uses rows 0..3, output row 1 rows 1..4. */
796  DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
797  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
798  DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
799  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
800  dst0_r, dst0_l, dst1_r, dst1_l);
    /* Narrow 32 -> 16 bit with a shift by 6, then 16 -> 8 bit with a second
     * shift by 6 (the vssrarni form; rounding + unsigned saturation per the
     * intrinsic name). */
801  DUP2_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
802  out0_r, out1_r);
803  out = __lsx_vssrarni_bu_h(out1_r, out0_r, 6);
    /* Low 8 bytes are output row 0, high 8 bytes are output row 1. */
804  __lsx_vstelm_d(out, dst, 0, 0);
805  __lsx_vstelm_d(out, dst + dst_stride, 0, 1);
806 }
807 
/*
 * Separable 4-tap horizontal + 4-tap vertical filter for a block that is
 * 4 rows high and (8 * width8mult) columns wide, processed as independent
 * 8-column strips from left to right.
 *
 * src, src_stride: source pointer and row pitch. The pointer is moved one
 *                  row up and one column left; seven rows are read per strip.
 * dst, dst_stride: destination pointer and row pitch; four rows of 8 bytes
 *                  are stored per strip.
 * filter_x:        horizontal taps, read as two 16-bit pairs.
 * filter_y:        vertical taps (loaded with a full 16-byte vector load;
 *                  only the first four bytes are used).
 * width8mult:      number of 8-column strips to process.
 */
808 static av_always_inline
809 void hevc_hv_4t_8multx4_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
810  int32_t dst_stride, const int8_t *filter_x,
811  const int8_t *filter_y, int32_t width8mult)
812 {
813  uint32_t cnt;
814  const int32_t src_stride_2x = (src_stride << 1);
815  const int32_t dst_stride_2x = (dst_stride << 1);
816  const int32_t src_stride_4x = (src_stride << 2);
817  const int32_t src_stride_3x = src_stride_2x + src_stride;
818  const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
819 
820  __m128i out0, out1;
821  __m128i src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
822  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
823  __m128i filt0, filt1, filt_h0, filt_h1, filter_vec;
824  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
825  __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
826  __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
827  __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
828 
    /* Start one row above and one column left of the block. */
829  src -= (src_stride + 1);
    /* Horizontal taps: pair (0,1) in filt0, pair (2,3) in filt1. */
830  DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
831 
    /* Vertical taps widened to 16 bit; word 0 = taps 0,1, word 1 = taps 2,3. */
832  filter_vec = __lsx_vld(filter_y, 0);
833  filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
834  DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
835 
    /* Shuffle masks selecting adjacent byte pairs; mask1 is offset by two. */
836  mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
837  mask1 = __lsx_vaddi_bu(mask0, 2);
838 
    /* One iteration per 8-column strip. */
839  for (cnt = width8mult; cnt--;) {
    /* Load seven rows: four, advance by four rows, then three more. */
840  src0 = __lsx_vld(src, 0);
841  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
842  src3 = __lsx_vldx(src, src_stride_3x);
843  src += src_stride_4x;
844  src4 = __lsx_vld(src, 0);
845  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src5, src6);
    /* Undo the four-row advance and step 8 columns right for the next strip. */
846  src += (8 - src_stride_4x);
    /* Horizontal pass on rows 0..2. */
847  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
848  vec0, vec1);
849  DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
850  vec2, vec3);
851  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
852  vec4, vec5);
853 
854  DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
855  dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
856  DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
857  dst0, dst1);
858  dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
859 
    /* Interleave adjacent filtered rows for the vertical dot products. */
860  DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
861  DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
862 
    /* Horizontal pass on rows 3..6. */
863  DUP2_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1,
864  vec0, vec1);
865  DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1,
866  vec2, vec3);
867  DUP2_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1,
868  vec4, vec5);
869  DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1,
870  vec6, vec7);
871 
872  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
873  vec6, filt0, dst3, dst4, dst5, dst6);
874  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3, filt1,
875  dst5, vec5, filt1, dst6, vec7, filt1, dst3, dst4, dst5, dst6);
876 
877  DUP4_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst5, dst4, dst6,
878  dst5, dst32_r, dst43_r, dst54_r, dst65_r);
879  DUP4_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst5, dst4, dst6,
880  dst5, dst32_l, dst43_l, dst54_l, dst65_l);
881 
    /* Vertical pass: output row k combines filtered rows k..k+3. */
882  DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
883  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
884  DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
885  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
886  DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
887  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
888  dst0_r, dst0_l, dst1_r, dst1_l);
889  DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
890  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
891  dst2_r, dst2_l, dst3_r, dst3_l);
892 
    /* Shift by 6 while narrowing to 16 bit, then a second shift by 6 while
     * narrowing to unsigned 8 bit. */
893  DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
894  dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3);
895  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
    /* out0 holds output rows 0 and 1, out1 holds rows 2 and 3. */
896  __lsx_vstelm_d(out0, dst, 0, 0);
897  __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
898  __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
899  __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
900  dst += 8;
901  }
902 }
903 
/*
 * Separable 4-tap horizontal + 4-tap vertical filter that writes one block
 * of 8 columns x 6 rows of 8-bit pixels.
 *
 * src, src_stride: source pointer and row pitch. The pointer is moved one
 *                  row up and one column left; nine rows are read.
 * dst, dst_stride: destination pointer and row pitch; six rows of 8 bytes
 *                  are stored.
 * filter_x:        horizontal taps, read as two 16-bit pairs.
 * filter_y:        vertical taps (loaded with a full 16-byte vector load;
 *                  only the first four bytes are used).
 */
904 static av_always_inline
905 void hevc_hv_4t_8x6_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
906  int32_t dst_stride, const int8_t *filter_x,
907  const int8_t *filter_y)
908 {
909  const int32_t src_stride_2x = (src_stride << 1);
910  const int32_t dst_stride_2x = (dst_stride << 1);
911  const int32_t src_stride_4x = (src_stride << 2);
912  const int32_t dst_stride_4x = (dst_stride << 2);
913  const int32_t src_stride_3x = src_stride_2x + src_stride;
914  const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
915  __m128i out0, out1, out2;
916  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
917  __m128i filt0, filt1;
918  __m128i filt_h0, filt_h1, filter_vec;
919  __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
920  __m128i mask1;
921  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
922  __m128i vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
923  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
924  __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
925  __m128i dst4_r, dst4_l, dst5_r, dst5_l;
926  __m128i dst10_r, dst32_r, dst10_l, dst32_l;
927  __m128i dst21_r, dst43_r, dst21_l, dst43_l;
928  __m128i dst54_r, dst54_l, dst65_r, dst65_l;
929  __m128i dst76_r, dst76_l, dst87_r, dst87_l;
930  __m128i out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
931 
    /* Start one row above and one column left of the block. */
932  src -= (src_stride + 1);
    /* Horizontal taps: pair (0,1) in filt0, pair (2,3) in filt1. */
933  DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
934 
    /* Vertical taps widened to 16 bit; word 0 = taps 0,1, word 1 = taps 2,3. */
935  filter_vec = __lsx_vld(filter_y, 0);
936  filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
937  DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
938 
939  mask1 = __lsx_vaddi_bu(mask0, 2);
940 
    /* Load nine rows: rows 0..4, advance four rows, then rows 5..8. */
941  src0 = __lsx_vld(src, 0);
942  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,src,
943  src_stride_3x, src, src_stride_4x, src1, src2, src3, src4);
944  src += src_stride_4x;
945  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x,src,
946  src_stride_3x, src, src_stride_4x, src5, src6, src7, src8);
947 
    /* Gather the byte pairs for both horizontal tap pairs of every row. */
948  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, src1, src1,
949  mask0, src1, src1, mask1, vec0, vec1, vec2, vec3);
950  DUP4_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, src3, src3,
951  mask0, src3, src3, mask1, vec4, vec5, vec6, vec7);
952  DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask0, src4, src4, mask1, src5, src5,
953  mask0, src5, src5, mask1, vec8, vec9, vec10, vec11);
954  DUP4_ARG3(__lsx_vshuf_b, src6, src6, mask0, src6, src6, mask1, src7, src7,
955  mask0, src7, src7, mask1, vec12, vec13, vec14, vec15);
956  DUP2_ARG3(__lsx_vshuf_b, src8, src8, mask0, src8, src8, mask1, vec16, vec17);
957 
    /* Horizontal pass: dst0..dst8 hold the 16-bit filtered rows. */
958  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0, vec6,
959  filt0, dst0, dst1, dst2, dst3);
960  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec8, filt0, vec10, filt0, vec12, filt0, vec14,
961  filt0, dst4, dst5, dst6, dst7);
962  dst8 = __lsx_vdp2_h_bu_b(vec16, filt0);
963  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1, dst2,
964  vec5, filt1, dst3, vec7, filt1, dst0, dst1, dst2, dst3);
965  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst4, vec9, filt1, dst5, vec11, filt1, dst6,
966  vec13, filt1, dst7, vec15, filt1, dst4, dst5, dst6, dst7);
967  dst8 = __lsx_vdp2add_h_bu_b(dst8, vec17, filt1);
968 
    /* Interleave each pair of vertically adjacent filtered rows. */
969  DUP4_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
970  dst10_r, dst21_r, dst32_r, dst43_r);
971  DUP4_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst3, dst2, dst4, dst3,
972  dst10_l, dst21_l, dst32_l, dst43_l);
973  DUP4_ARG2(__lsx_vilvl_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
974  dst54_r, dst65_r, dst76_r, dst87_r);
975  DUP4_ARG2(__lsx_vilvh_h, dst5, dst4, dst6, dst5, dst7, dst6, dst8, dst7,
976  dst54_l, dst65_l, dst76_l, dst87_l);
977 
    /* Vertical pass: output row k combines filtered rows k..k+3. */
978  DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
979  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
980  DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
981  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
982  DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst54_l, filt_h0, dst65_r,
983  filt_h0, dst65_l, filt_h0, dst4_r, dst4_l, dst5_r, dst5_l);
984  DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
985  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
986  dst0_r, dst0_l, dst1_r, dst1_l);
987  DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
988  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
989  dst2_r, dst2_l, dst3_r, dst3_l);
990  DUP4_ARG3(__lsx_vdp2add_w_h, dst4_r, dst76_r, filt_h1, dst4_l, dst76_l,
991  filt_h1, dst5_r, dst87_r, filt_h1, dst5_l, dst87_l, filt_h1,
992  dst4_r, dst4_l, dst5_r, dst5_l);
993 
    /* Shift by 6 while narrowing to 16 bit, then a second shift by 6 while
     * narrowing to unsigned 8 bit. */
994  DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l,
995  dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r, out2_r, out3_r);
996  DUP2_ARG3(__lsx_vsrani_h_w, dst4_l, dst4_r, 6, dst5_l, dst5_r, 6,
997  out4_r, out5_r);
998  DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r, 6,
999  out0, out1);
1000  out2 = __lsx_vssrarni_bu_h(out5_r, out4_r, 6);
1001 
    /* Each output vector carries two rows (low and high 8 bytes). */
1002  __lsx_vstelm_d(out0, dst, 0, 0);
1003  __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
1004  __lsx_vstelm_d(out1, dst + dst_stride_2x, 0, 0);
1005  __lsx_vstelm_d(out1, dst + dst_stride_3x, 0, 1);
1006  dst += dst_stride_4x;
1007  __lsx_vstelm_d(out2, dst, 0, 0);
1008  __lsx_vstelm_d(out2, dst + dst_stride, 0, 1);
1009 }
1010 
/*
 * Separable 4-tap horizontal + 4-tap vertical filter for a block that is
 * (8 * width8mult) columns wide, processed as 8-column strips. Within each
 * strip the rows are produced four at a time, (height >> 2) times, so any
 * remainder of height modulo 4 is not written.
 *
 * src, src_stride: source pointer and row pitch. The pointer is moved one
 *                  row up and one column left before loading.
 * dst, dst_stride: destination pointer and row pitch.
 * filter_x:        horizontal taps, read as two 16-bit pairs.
 * filter_y:        vertical taps (loaded with a full 16-byte vector load;
 *                  only the first four bytes are used).
 * height:          number of output rows.
 * width8mult:      number of 8-column strips to process.
 */
1011 static av_always_inline
1012 void hevc_hv_4t_8multx4mult_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
1013  int32_t dst_stride, const int8_t *filter_x,
1014  const int8_t *filter_y, int32_t height,
1015  int32_t width8mult)
1016 {
1017  uint32_t loop_cnt, cnt;
1018  uint8_t *src_tmp;
1019  uint8_t *dst_tmp;
1020  const int32_t src_stride_2x = (src_stride << 1);
1021  const int32_t dst_stride_2x = (dst_stride << 1);
1022  const int32_t src_stride_4x = (src_stride << 2);
1023  const int32_t dst_stride_4x = (dst_stride << 2);
1024  const int32_t src_stride_3x = src_stride_2x + src_stride;
1025  const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1026 
1027  __m128i out0, out1;
1028  __m128i src0, src1, src2, src3, src4, src5, src6;
1029  __m128i filt0, filt1;
1030  __m128i filt_h0, filt_h1, filter_vec;
1031  __m128i mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1032  __m128i mask1;
1033  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1034  __m128i dst0, dst1, dst2, dst3, dst4, dst5;
1035  __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
1036  __m128i dst10_r, dst32_r, dst21_r, dst43_r;
1037  __m128i dst10_l, dst32_l, dst21_l, dst43_l;
1038  __m128i dst54_r, dst54_l, dst65_r, dst65_l, dst6;
1039  __m128i out0_r, out1_r, out2_r, out3_r;
1040 
    /* Start one row above and one column left of the block. */
1041  src -= (src_stride + 1);
    /* Horizontal taps: pair (0,1) in filt0, pair (2,3) in filt1. */
1042  DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
1043 
    /* Vertical taps widened to 16 bit; word 0 = taps 0,1, word 1 = taps 2,3. */
1044  filter_vec = __lsx_vld(filter_y, 0);
1045  filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1046  DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
1047  mask1 = __lsx_vaddi_bu(mask0, 2);
1048 
    /* Outer loop: one iteration per 8-column strip. */
1049  for (cnt = width8mult; cnt--;) {
1050  src_tmp = src;
1051  dst_tmp = dst;
1052 
    /* Prime the vertical filter with the first three rows of the strip. */
1053  src0 = __lsx_vld(src_tmp, 0);
1054  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1055  src1, src2);
1056  src_tmp += src_stride_3x;
1057 
1058  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1,
1059  vec0, vec1);
1060  DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1,
1061  vec2, vec3);
1062  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1,
1063  vec4, vec5);
1064 
1065  DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst0, dst1);
1066  dst2 = __lsx_vdp2_h_bu_b(vec4, filt0);
1067  DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst0, vec1, filt1, dst1, vec3, filt1,
1068  dst0, dst1);
1069  dst2 = __lsx_vdp2add_h_bu_b(dst2, vec5, filt1);
1070 
1071  DUP2_ARG2(__lsx_vilvl_h, dst1, dst0, dst2, dst1, dst10_r, dst21_r);
1072  DUP2_ARG2(__lsx_vilvh_h, dst1, dst0, dst2, dst1, dst10_l, dst21_l);
1073 
    /* Inner loop: four new input rows in, four output rows out. */
1074  for (loop_cnt = (height >> 2); loop_cnt--;) {
1075  src3 = __lsx_vld(src_tmp, 0);
1076  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1077  src4, src5);
1078  src6 = __lsx_vldx(src_tmp, src_stride_3x);
1079  src_tmp += src_stride_4x;
1080 
    /* Horizontal pass on the four new rows. */
1081  DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
1082  src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
1083  DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
1084  src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
1085 
1086  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
1087  vec6, filt0, dst3, dst4, dst5, dst6);
1088  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst3, vec1, filt1, dst4, vec3,
1089  filt1, dst5, vec5, filt1, dst6, vec7, filt1,
1090  dst3, dst4, dst5, dst6);
1091 
1092  DUP4_ARG2(__lsx_vilvl_h, dst3, dst2, dst4, dst3, dst5, dst4,
1093  dst6, dst5, dst32_r, dst43_r, dst54_r, dst65_r);
1094  DUP4_ARG2(__lsx_vilvh_h, dst3, dst2, dst4, dst3, dst5, dst4,
1095  dst6, dst5, dst32_l, dst43_l, dst54_l, dst65_l);
1096 
    /* Vertical pass: output row k combines filtered rows k..k+3. */
1097  DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
1098  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
1099  DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
1100  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
1101  DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l,
1102  dst32_l, filt_h1, dst1_r, dst43_r, filt_h1, dst1_l,
1103  dst43_l, filt_h1, dst0_r, dst0_l, dst1_r, dst1_l);
1104  DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l,
1105  dst54_l, filt_h1, dst3_r, dst65_r, filt_h1, dst3_l,
1106  dst65_l, filt_h1, dst2_r, dst2_l, dst3_r, dst3_l);
1107 
    /* Shift by 6 while narrowing to 16 bit, then a second shift by 6
     * while narrowing to unsigned 8 bit. */
1108  DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6,
1109  dst2_l, dst2_r, 6, dst3_l, dst3_r, 6, out0_r, out1_r,
1110  out2_r, out3_r);
1111  DUP2_ARG3(__lsx_vssrarni_bu_h, out1_r, out0_r, 6, out3_r, out2_r,
1112  6, out0, out1);
1113  __lsx_vstelm_d(out0, dst_tmp, 0, 0);
1114  __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
1115  __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
1116  __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
1117  dst_tmp += dst_stride_4x;
1118 
    /* Carry the last three filtered rows over as the history for the
     * next group of four output rows. */
1119  dst10_r = dst54_r;
1120  dst10_l = dst54_l;
1121  dst21_r = dst65_r;
1122  dst21_l = dst65_l;
1123  dst2 = dst6;
1124  }
    /* Next 8-column strip. */
1125  src += 8;
1126  dst += 8;
1127  }
1128 }
1129 
/*
 * 8-column 4-tap HV filter: pick the specialised kernel for the requested
 * height. Heights 2, 4 and 6 have dedicated kernels; every other multiple
 * of four goes through the generic 4-rows-per-iteration kernel. Any other
 * height is ignored and nothing is written.
 */
static void hevc_hv_4t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                              int32_t dst_stride, const int8_t *filter_x,
                              const int8_t *filter_y, int32_t height)
{
    switch (height) {
    case 2:
        hevc_hv_4t_8x2_lsx(src, src_stride, dst, dst_stride,
                           filter_x, filter_y);
        break;
    case 4:
        /* a single 8-column strip, exactly four rows */
        hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, 1);
        break;
    case 6:
        hevc_hv_4t_8x6_lsx(src, src_stride, dst, dst_stride,
                           filter_x, filter_y);
        break;
    default:
        if (!(height & 0x03)) {
            /* a single 8-column strip, height a multiple of four */
            hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
                                       filter_x, filter_y, height, 1);
        }
        break;
    }
}
1147 
1148 static av_always_inline
1149 void hevc_hv_4t_12w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
1150  int32_t dst_stride, const int8_t *filter_x,
1151  const int8_t *filter_y, int32_t height)
1152 {
1153  uint32_t loop_cnt;
1154  uint8_t *src_tmp, *dst_tmp;
1155  const int32_t src_stride_2x = (src_stride << 1);
1156  const int32_t dst_stride_2x = (dst_stride << 1);
1157  const int32_t src_stride_4x = (src_stride << 2);
1158  const int32_t dst_stride_4x = (dst_stride << 2);
1159  const int32_t src_stride_3x = src_stride_2x + src_stride;
1160  const int32_t dst_stride_3x = dst_stride_2x + dst_stride;
1161  __m128i out0, out1;
1162  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1163  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1164  __m128i mask0, mask1, mask2, mask3;
1165  __m128i filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
1166  __m128i dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
1167  __m128i dst10, dst21, dst22, dst73, dst84, dst95, dst106;
1168  __m128i dst76_r, dst98_r, dst87_r, dst109_r;
1169  __m128i dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
1170  __m128i dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
1171  __m128i dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
1172  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1173 
1174  src -= (src_stride + 1);
1175  DUP2_ARG2(__lsx_vldrepl_h, filter_x, 0, filter_x, 2, filt0, filt1);
1176 
1177  filter_vec = __lsx_vld(filter_y, 0);
1178  filter_vec = __lsx_vsllwil_h_b(filter_vec, 0);
1179  DUP2_ARG2(__lsx_vreplvei_w, filter_vec, 0, filter_vec, 1, filt_h0, filt_h1);
1180 
1181  mask0 = __lsx_vld(ff_hevc_mask_arr, 0);
1182  mask1 = __lsx_vaddi_bu(mask0, 2);
1183 
1184  src_tmp = src;
1185  dst_tmp = dst;
1186 
1187  src0 = __lsx_vld(src_tmp, 0);
1188  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1189  src1, src2);
1190  src_tmp += src_stride_3x;
1191 
1192  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src0, src0, mask1, vec0, vec1);
1193  DUP2_ARG3(__lsx_vshuf_b, src1, src1, mask0, src1, src1, mask1, vec2, vec3);
1194  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src2, src2, mask1, vec4, vec5);
1195 
1196  DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dsth0, dsth1);
1197  dsth2 = __lsx_vdp2_h_bu_b(vec4, filt0);
1198  DUP2_ARG3(__lsx_vdp2add_h_bu_b, dsth0, vec1, filt1, dsth1, vec3, filt1,
1199  dsth0, dsth1);
1200  dsth2 = __lsx_vdp2add_h_bu_b(dsth2, vec5, filt1);
1201 
1202  DUP2_ARG2(__lsx_vilvl_h, dsth1, dsth0, dsth2, dsth1, dst10_r, dst21_r);
1203  DUP2_ARG2(__lsx_vilvh_h, dsth1, dsth0, dsth2, dsth1, dst10_l, dst21_l);
1204 
1205  for (loop_cnt = 4; loop_cnt--;) {
1206  src3 = __lsx_vld(src_tmp, 0);
1207  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride_2x,
1208  src4, src5);
1209  src6 = __lsx_vldx(src_tmp, src_stride_3x);
1210  src_tmp += src_stride_4x;
1211 
1212  DUP4_ARG3(__lsx_vshuf_b, src3, src3, mask0, src3, src3, mask1, src4,
1213  src4, mask0, src4, src4, mask1, vec0, vec1, vec2, vec3);
1214  DUP4_ARG3(__lsx_vshuf_b, src5, src5, mask0, src5, src5, mask1, src6,
1215  src6, mask0, src6, src6, mask1, vec4, vec5, vec6, vec7);
1216 
1217  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
1218  vec6, filt0, dsth3, dsth4, dsth5, dsth6);
1219  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dsth3, vec1, filt1, dsth4,
1220  vec3, filt1, dsth5, vec5, filt1, dsth6, vec7, filt1,
1221  dsth3, dsth4, dsth5, dsth6);
1222 
1223  DUP4_ARG2(__lsx_vilvl_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
1224  dsth6, dsth5, dst32_r, dst43_r, dst54_r, dst65_r);
1225  DUP4_ARG2(__lsx_vilvh_h, dsth3, dsth2, dsth4, dsth3, dsth5, dsth4,
1226  dsth6, dsth5, dst32_l, dst43_l, dst54_l, dst65_l);
1227 
1228  DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst10_l, filt_h0, dst21_r,
1229  filt_h0, dst21_l, filt_h0, dst0_r, dst0_l, dst1_r, dst1_l);
1230  DUP4_ARG2(__lsx_vdp2_w_h, dst32_r, filt_h0, dst32_l, filt_h0, dst43_r,
1231  filt_h0, dst43_l, filt_h0, dst2_r, dst2_l, dst3_r, dst3_l);
1232  DUP4_ARG3(__lsx_vdp2add_w_h, dst0_r, dst32_r, filt_h1, dst0_l, dst32_l,
1233  filt_h1, dst1_r, dst43_r, filt_h1, dst1_l, dst43_l, filt_h1,
1234  dst0_r, dst0_l, dst1_r, dst1_l);
1235  DUP4_ARG3(__lsx_vdp2add_w_h, dst2_r, dst54_r, filt_h1, dst2_l, dst54_l,
1236  filt_h1, dst3_r, dst65_r, filt_h1, dst3_l, dst65_l, filt_h1,
1237  dst2_r, dst2_l, dst3_r, dst3_l);
1238 
1239  DUP4_ARG3(__lsx_vsrani_h_w, dst0_l, dst0_r, 6, dst1_l, dst1_r, 6, dst2_l,
1240  dst2_r, 6, dst3_l, dst3_r, 6, tmp0, tmp1, tmp2, tmp3);
1241  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
1242 
1243  __lsx_vstelm_d(out0, dst_tmp, 0, 0);
1244  __lsx_vstelm_d(out0, dst_tmp + dst_stride, 0, 1);
1245  __lsx_vstelm_d(out1, dst_tmp + dst_stride_2x, 0, 0);
1246  __lsx_vstelm_d(out1, dst_tmp + dst_stride_3x, 0, 1);
1247  dst_tmp += dst_stride_4x;
1248 
1249  dst10_r = dst54_r;
1250  dst10_l = dst54_l;
1251  dst21_r = dst65_r;
1252  dst21_l = dst65_l;
1253  dsth2 = dsth6;
1254  }
1255 
1256  src += 8;
1257  dst += 8;
1258 
1259  mask2 = __lsx_vld(ff_hevc_mask_arr, 16);
1260  mask3 = __lsx_vaddi_bu(mask2, 2);
1261 
1262  src0 = __lsx_vld(src, 0);
1263  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src1, src2);
1264  src += src_stride_3x;
1265  DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src1, src0, mask3, vec0, vec1);
1266  DUP2_ARG3(__lsx_vshuf_b, src2, src1, mask2, src2, src1, mask3, vec2, vec3);
1267 
1268  DUP2_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, dst10, dst21);
1269  DUP2_ARG3(__lsx_vdp2add_h_bu_b, dst10, vec1, filt1, dst21, vec3, filt1,
1270  dst10, dst21);
1271 
1272  dst10_r = __lsx_vilvl_h(dst21, dst10);
1273  dst21_r = __lsx_vilvh_h(dst21, dst10);
1274  dst22 = __lsx_vreplvei_d(dst21, 1);
1275 
1276  for (loop_cnt = 2; loop_cnt--;) {
1277  src3 = __lsx_vld(src, 0);
1278  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src4, src5);
1279  src6 = __lsx_vldx(src, src_stride_3x);
1280  src += src_stride_4x;
1281  src7 = __lsx_vld(src, 0);
1282  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_2x, src8, src9);
1283  src10 = __lsx_vldx(src, src_stride_3x);
1284  src += src_stride_4x;
1285  DUP4_ARG3(__lsx_vshuf_b, src7, src3, mask2, src7, src3, mask3, src8,
1286  src4, mask2, src8, src4, mask3, vec0, vec1, vec2, vec3);
1287  DUP4_ARG3(__lsx_vshuf_b, src9, src5, mask2, src9, src5, mask3, src10,
1288  src6, mask2, src10, src6, mask3, vec4, vec5, vec6, vec7);
1289 
1290  DUP4_ARG2(__lsx_vdp2_h_bu_b, vec0, filt0, vec2, filt0, vec4, filt0,
1291  vec6, filt0, dst73, dst84, dst95, dst106);
1292  DUP4_ARG3(__lsx_vdp2add_h_bu_b, dst73, vec1, filt1, dst84, vec3,
1293  filt1, dst95, vec5, filt1, dst106, vec7, filt1,
1294  dst73, dst84, dst95, dst106);
1295 
1296  dst32_r = __lsx_vilvl_h(dst73, dst22);
1297  DUP2_ARG2(__lsx_vilvl_h, dst84, dst73, dst95, dst84, dst43_r, dst54_r);
1298  DUP2_ARG2(__lsx_vilvh_h, dst84, dst73, dst95, dst84, dst87_r, dst98_r);
1299  dst65_r = __lsx_vilvl_h(dst106, dst95);
1300  dst109_r = __lsx_vilvh_h(dst106, dst95);
1301  dst22 = __lsx_vreplvei_d(dst73, 1);
1302  dst76_r = __lsx_vilvl_h(dst22, dst106);
1303 
1304  DUP4_ARG2(__lsx_vdp2_w_h, dst10_r, filt_h0, dst21_r, filt_h0, dst32_r,
1305  filt_h0, dst43_r, filt_h0, dst0, dst1, dst2, dst3);
1306  DUP4_ARG2(__lsx_vdp2_w_h, dst54_r, filt_h0, dst65_r, filt_h0, dst76_r,
1307  filt_h0, dst87_r, filt_h0, dst4, dst5, dst6, dst7);
1308  DUP4_ARG3(__lsx_vdp2add_w_h, dst0, dst32_r, filt_h1, dst1, dst43_r,
1309  filt_h1, dst2, dst54_r, filt_h1, dst3, dst65_r, filt_h1,
1310  dst0, dst1, dst2, dst3);
1311  DUP4_ARG3(__lsx_vdp2add_w_h, dst4, dst76_r, filt_h1, dst5, dst87_r,
1312  filt_h1, dst6, dst98_r, filt_h1, dst7, dst109_r, filt_h1,
1313  dst4, dst5, dst6, dst7);
1314 
1315  DUP4_ARG3(__lsx_vsrani_h_w, dst1, dst0, 6, dst3, dst2, 6, dst5, dst4,
1316  6, dst7, dst6, 6, tmp0, tmp1, tmp2, tmp3);
1317  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, 6, tmp3, tmp2, 6, out0, out1);
1318 
1319  __lsx_vstelm_w(out0, dst, 0, 0);
1320  __lsx_vstelm_w(out0, dst + dst_stride, 0, 1);
1321  __lsx_vstelm_w(out0, dst + dst_stride_2x, 0, 2);
1322  __lsx_vstelm_w(out0, dst + dst_stride_3x, 0, 3);
1323  dst += dst_stride_4x;
1324  __lsx_vstelm_w(out1, dst, 0, 0);
1325  __lsx_vstelm_w(out1, dst + dst_stride, 0, 1);
1326  __lsx_vstelm_w(out1, dst + dst_stride_2x, 0, 2);
1327  __lsx_vstelm_w(out1, dst + dst_stride_3x, 0, 3);
1328  dst += dst_stride_4x;
1329 
1330  dst10_r = dst98_r;
1331  dst21_r = dst109_r;
1332  dst22 = __lsx_vreplvei_d(dst106, 1);
1333  }
1334 }
1335 
/*
 * 16-column 4-tap HV filter, handled as two 8-column strips. A height of
 * exactly four uses the fixed 4-row kernel; every other height goes to the
 * generic kernel that emits four rows per iteration.
 */
static void hevc_hv_4t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    if (height != 4) {
        hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 2);
        return;
    }

    hevc_hv_4t_8multx4_lsx(src, src_stride, dst, dst_stride,
                           filter_x, filter_y, 2);
}
1348 
/*
 * 24-column 4-tap HV filter: forwards to the generic kernel with three
 * 8-column strips.
 */
static void hevc_hv_4t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    const int32_t strips = 3; /* 24 columns / 8 columns per strip */

    hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, strips);
}
1356 
/*
 * 32-column 4-tap HV filter: forwards to the generic kernel with four
 * 8-column strips.
 */
static void hevc_hv_4t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    const int32_t strips = 4; /* 32 columns / 8 columns per strip */

    hevc_hv_4t_8multx4mult_lsx(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, strips);
}
1364 
/*
 * Generates the public one-dimensional entry point
 *     ff_hevc_put_hevc_uni_<PEL>_<DIR><WIDTH>_8_lsx()
 * which looks up the filter row ff_hevc_<PEL>_filters[FILT_DIR - 1] and
 * forwards to common_<DIR1>_<TAP>t_<WIDTH>w_lsx().
 *
 * PEL:      filter table family used in the names (qpel or epel below).
 * DIR:      direction letter used in the exported name (h or v below).
 * WIDTH:    block width, part of both the exported and the kernel name.
 * TAP:      tap count, part of the kernel name.
 * DIR1:     direction token of the kernel name (hz or vt below).
 * FILT_DIR: which argument indexes the filter table (mx or my below).
 *
 * In the generated function the width argument and whichever of mx/my is
 * not named by FILT_DIR are unused. Note that the argument order of the
 * kernel (src first) differs from the public signature (dst first).
 */
1365 #define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
1366 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_lsx(uint8_t *dst, \
1367  ptrdiff_t dst_stride, \
1368  uint8_t *src, \
1369  ptrdiff_t src_stride, \
1370  int height, \
1371  intptr_t mx, \
1372  intptr_t my, \
1373  int width) \
1374 { \
1375  const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
1376  \
1377  common_##DIR1##_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride, \
1378  filter, height); \
1379 }
1380 
/* 8-tap horizontal, 64 wide. */
1381 UNI_MC(qpel, h, 64, 8, hz, mx);
1382 
/* 8-tap vertical, widths 24..64. */
1383 UNI_MC(qpel, v, 24, 8, vt, my);
1384 UNI_MC(qpel, v, 32, 8, vt, my);
1385 UNI_MC(qpel, v, 48, 8, vt, my);
1386 UNI_MC(qpel, v, 64, 8, vt, my);
1387 
/* 4-tap vertical, widths 24 and 32. */
1388 UNI_MC(epel, v, 24, 4, vt, my);
1389 UNI_MC(epel, v, 32, 4, vt, my);
1390 
1391 #undef UNI_MC
1392 
/*
 * Generates the public two-dimensional entry point
 *     ff_hevc_put_hevc_uni_<PEL>_hv<WIDTH>_8_lsx()
 * which looks up the horizontal filter ff_hevc_<PEL>_filters[mx - 1] and the
 * vertical filter ff_hevc_<PEL>_filters[my - 1], then forwards to
 * hevc_hv_<TAP>t_<WIDTH>w_lsx().
 *
 * PEL:   filter table family used in the names (qpel or epel below).
 * WIDTH: block width, part of both the exported and the kernel name.
 * TAP:   tap count, part of the kernel name.
 *
 * The width argument of the generated function is unused. Both mx and my
 * are used as (value - 1) table indices without a range check here.
 */
1393 #define UNI_MC_HV(PEL, WIDTH, TAP) \
1394 void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_lsx(uint8_t *dst, \
1395  ptrdiff_t dst_stride, \
1396  uint8_t *src, \
1397  ptrdiff_t src_stride, \
1398  int height, \
1399  intptr_t mx, \
1400  intptr_t my, \
1401  int width) \
1402 { \
1403  const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
1404  const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
1405  \
1406  hevc_hv_##TAP##t_##WIDTH##w_lsx(src, src_stride, dst, dst_stride, \
1407  filter_x, filter_y, height); \
1408 }
1409 
/* 8-tap HV entry points. */
1410 UNI_MC_HV(qpel, 8, 8);
1411 UNI_MC_HV(qpel, 16, 8);
1412 UNI_MC_HV(qpel, 24, 8);
1413 UNI_MC_HV(qpel, 32, 8);
1414 UNI_MC_HV(qpel, 48, 8);
1415 UNI_MC_HV(qpel, 64, 8);
1416 
/* 4-tap HV entry points. */
1417 UNI_MC_HV(epel, 8, 4);
1418 UNI_MC_HV(epel, 12, 4);
1419 UNI_MC_HV(epel, 16, 4);
1420 UNI_MC_HV(epel, 24, 4);
1421 UNI_MC_HV(epel, 32, 4);
1422 
1423 #undef UNI_MC_HV
common_vt_8t_32w_lsx
static void common_vt_8t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_lsx.c:316
out
FILE * out
Definition: movenc.c:54
src1
const pixel * src1
Definition: h264pred_template.c:421
hevc_hv_4t_12w_lsx
static av_always_inline void hevc_hv_4t_12w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_lsx.c:1149
common_vt_8t_64w_lsx
static void common_vt_8t_64w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_lsx.c:330
filter
filter_frame: For filters that do not use the activate() callback, this method is called when a frame is pushed to the filter's input. It can be called at any time except in a reentrant way. If the input frame is enough to produce output, then the filter should push the output frames on the output link immediately. As an exception to the previous rule, if the input frame is enough to produce several output frames, then the filter needs to output only at least one per link. The additional frames can be left buffered in the filter.
Definition: filter_design.txt:228
DUP2_ARG2
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1)
Definition: loongson_intrinsics.h:58
hevc_hv_8t_48w_lsx
static void hevc_hv_8t_48w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_lsx.c:531
hevc_hv_4t_24w_lsx
static void hevc_hv_4t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_lsx.c:1349
UNI_MC
#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)
Definition: hevc_mc_uni_lsx.c:1365
hevc_hv_4t_8x2_lsx
static av_always_inline void hevc_hv_4t_8x2_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
Definition: hevc_mc_uni_lsx.c:748
aligned
static int aligned(int val)
Definition: dashdec.c:168
common_vt_4t_24w_lsx
static av_always_inline void common_vt_4t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_lsx.c:548
DUP4_ARG2
#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, _OUT1, _OUT2, _OUT3)
Definition: loongson_intrinsics.h:76
hevc_hv_4t_8multx4_lsx
static av_always_inline void hevc_hv_4t_8multx4_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t width8mult)
Definition: hevc_mc_uni_lsx.c:809
hevc_hv_4t_16w_lsx
static void hevc_hv_4t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_lsx.c:1336
width
#define width
hevc_hv_8t_8w_lsx
static void hevc_hv_8t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_lsx.c:499
common_vt_8t_16w_lsx
static av_always_inline void common_vt_8t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t width)
Definition: hevc_mc_uni_lsx.c:200
hevc_hv_4t_32w_lsx
static void hevc_hv_4t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_lsx.c:1357
hevc_hv_4t_8w_lsx
static void hevc_hv_4t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_lsx.c:1131
common_vt_8t_24w_lsx
static void common_vt_8t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_lsx.c:307
hevc_hv_8t_16w_lsx
static void hevc_hv_8t_16w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_lsx.c:507
DUP4_ARG1
#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3)
Definition: loongson_intrinsics.h:70
common_hz_8t_64w_lsx
static av_always_inline void common_hz_8t_64w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_lsx.c:36
UNI_MC_HV
#define UNI_MC_HV(PEL, WIDTH, TAP)
Definition: hevc_mc_uni_lsx.c:1393
DUP2_ARG3
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1)
Definition: loongson_intrinsics.h:64
height
#define height
common_vt_8t_48w_lsx
static void common_vt_8t_48w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_lsx.c:323
ff_hevc_mask_arr
static const uint8_t ff_hevc_mask_arr[16 *3]
Definition: hevc_mc_uni_lsx.c:26
common_vt_8t_8w_lsx
static av_always_inline void common_vt_8t_8w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_lsx.c:128
src2
const pixel * src2
Definition: h264pred_template.c:422
av_always_inline
#define av_always_inline
Definition: attributes.h:49
hevc_hv_4t_8multx4mult_lsx
static av_always_inline void hevc_hv_4t_8multx4mult_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width8mult)
Definition: hevc_mc_uni_lsx.c:1012
hevc_hv_8t_24w_lsx
static void hevc_hv_8t_24w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_lsx.c:515
hevc_hv_8t_64w_lsx
static void hevc_hv_8t_64w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_lsx.c:539
hevc_hv_8t_32w_lsx
static void hevc_hv_8t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_lsx.c:523
common_vt_4t_32w_lsx
static av_always_inline void common_vt_4t_32w_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_lsx.c:654
hevc_hv_8t_8x2_lsx
static av_always_inline void hevc_hv_8t_8x2_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width)
Definition: hevc_mc_uni_lsx.c:338
src0
const pixel *const src0
Definition: h264pred_template.c:420
loongson_intrinsics.h
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
int32_t
int32_t
Definition: audioconvert.c:56
hevc_hv_4t_8x6_lsx
static av_always_inline void hevc_hv_4t_8x6_lsx(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
Definition: hevc_mc_uni_lsx.c:905
h
h
Definition: vp9dsp_template.c:2038
hevcdsp_lsx.h
DUP4_ARG3
#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3)
Definition: loongson_intrinsics.h:83