FFmpeg
vp8_mc_lsx.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021 Loongson Technology Corporation Limited
3  * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 #include "libavcodec/vp8dsp.h"
22 #include "vp8dsp_loongarch.h"
23 
/*
 * Byte-index patterns used as the selector operand of __lsx_vshuf_b, laid
 * out as three consecutive 16-byte rows.  The functions visible in this part
 * of the file load only the first row (byte offset 0) and derive their other
 * two masks from it by adding 2 and 4 to every byte.
 */
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* 8 width cases */
    0, 1,   1, 2,   2, 3,   3, 4,   4, 5,   5, 6,   6, 7,   7, 8,
    /* 4 width cases */
    0, 1,   1, 2,   2, 3,   3, 4,   16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9,   9, 10,  10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28,
};
32 
/*
 * Sub-pixel filter coefficients.  Callers in this file pick a row with
 * (mx - 1) or (my - 1) and read it as 16-bit coefficient pairs at byte
 * offsets 0, 2 and (for the 6-tap paths) 4 via __lsx_vldrepl_h.
 * Each row is padded with zeros to 8 bytes.
 */
static const int8_t subpel_filters_lsx[7][8] = {
    [0] = { -6, 123,  12,  -1,   0, 0, 0, 0 },
    [1] = {  2, -11, 108,  36,  -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
    [2] = { -9,  93,  50,  -6,   0, 0, 0, 0 },
    [3] = {  3, -16,  77,  77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
    [4] = { -6,  50,  93,  -9,   0, 0, 0, 0 },
    [5] = {  1,  -8,  36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
    [6] = { -1,  12, 123,  -6,   0, 0, 0, 0 },
};
42 
/*
 * Three-operand multiply-accumulate: starts with __lsx_vdp2_h_b on
 * (in0, coeff0), then folds in (in1, coeff1) and (in2, coeff2) with
 * __lsx_vdp2add_h_b.  Evaluates to the accumulated __m128i.
 */
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)               \
( {                                                                        \
    __m128i dpadd3_acc_m = __lsx_vdp2_h_b(in0, coeff0);                    \
                                                                           \
    dpadd3_acc_m = __lsx_vdp2add_h_b(dpadd3_acc_m, in1, coeff1);           \
    dpadd3_acc_m = __lsx_vdp2add_h_b(dpadd3_acc_m, in2, coeff2);           \
                                                                           \
    dpadd3_acc_m;                                                          \
} )
53 
/*
 * Three independent byte shuffles.  For each (inA, inB, mask) triple the
 * result is __lsx_vshuf_b(inB, inA, mask) -- note that the second input of
 * every pair is passed as the first intrinsic operand.
 */
#define VSHF_B3_SB(in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,     \
                   out0, out1, out2)                                       \
{                                                                          \
    out0 = __lsx_vshuf_b(in1, in0, mask0);                                 \
    out1 = __lsx_vshuf_b(in3, in2, mask1);                                 \
    out2 = __lsx_vshuf_b(in5, in4, mask2);                                 \
}
61 
/*
 * Horizontal 6-tap filter of one vector pair.  The inputs are shuffled three
 * times (one mask per coefficient pair), combined with DPADD_SH3_SH, then
 * passed through __lsx_vsrari_h(., 7) followed by __lsx_vsat_h(., 7).
 * Evaluates to the resulting __m128i.
 */
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2,                  \
                        filt_h0, filt_h1, filt_h2)                         \
( {                                                                        \
    __m128i h6_pair0_m, h6_pair1_m, h6_pair2_m;                            \
    __m128i h6_sum_m;                                                      \
                                                                           \
    VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2,    \
               h6_pair0_m, h6_pair1_m, h6_pair2_m);                        \
    h6_sum_m = DPADD_SH3_SH(h6_pair0_m, h6_pair1_m, h6_pair2_m,            \
                            filt_h0, filt_h1, filt_h2);                    \
                                                                           \
    h6_sum_m = __lsx_vsat_h(__lsx_vsrari_h(h6_sum_m, 7), 7);               \
                                                                           \
    h6_sum_m;                                                              \
} )
78 
/*
 * Horizontal 6-tap filter applied to four source vectors at once.
 * Each source is shuffled against itself with mask0/mask1/mask2; the mask0
 * shuffles seed out0..out3 through __lsx_vdp2_h_b with filt0, and the mask1
 * and mask2 shuffles are accumulated with filt1 and filt2 respectively.
 * No rounding or narrowing is done here; out0..out3 are written in place.
 */
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                 \
                                   mask0, mask1, mask2,                     \
                                   filt0, filt1, filt2,                     \
                                   out0, out1, out2, out3)                  \
{                                                                           \
    __m128i sh0_m, sh1_m, sh2_m, sh3_m;                                     \
    __m128i sh4_m, sh5_m, sh6_m, sh7_m;                                     \
                                                                            \
    /* first coefficient pair: shuffle, then start the accumulators */     \
    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0,          \
              src2, src2, mask0, src3, src3, mask0,                         \
              sh0_m, sh1_m, sh2_m, sh3_m);                                  \
    DUP4_ARG2(__lsx_vdp2_h_b, sh0_m, filt0, sh1_m, filt0,                   \
              sh2_m, filt0, sh3_m, filt0,                                   \
              out0, out1, out2, out3);                                      \
    /* shuffles for the second and third coefficient pairs */              \
    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1,          \
              src2, src2, mask1, src3, src3, mask1,                         \
              sh0_m, sh1_m, sh2_m, sh3_m);                                  \
    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2,          \
              src2, src2, mask2, src3, src3, mask2,                         \
              sh4_m, sh5_m, sh6_m, sh7_m);                                  \
    /* accumulate them onto the running sums */                            \
    DUP4_ARG3(__lsx_vdp2add_h_b, out0, sh0_m, filt1, out1, sh1_m, filt1,    \
              out2, sh2_m, filt1, out3, sh3_m, filt1,                       \
              out0, out1, out2, out3);                                      \
    DUP4_ARG3(__lsx_vdp2add_h_b, out0, sh4_m, filt2, out1, sh5_m, filt2,    \
              out2, sh6_m, filt2, out3, sh7_m, filt2,                       \
              out0, out1, out2, out3);                                      \
}
99 
/*
 * Two-operand multiply-accumulate for the 4-tap paths: __lsx_vdp2_h_b on
 * (vec0, filt0), then __lsx_vdp2add_h_b with (vec1, filt1).
 * Evaluates to the accumulated __m128i.
 */
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)                      \
( {                                                                         \
    __m128i dpadd2_acc_m = __lsx_vdp2_h_b(vec0, filt0);                     \
                                                                            \
    dpadd2_acc_m = __lsx_vdp2add_h_b(dpadd2_acc_m, vec1, filt1);            \
                                                                            \
    dpadd2_acc_m;                                                           \
} )
109 
/*
 * Horizontal 4-tap filter of one vector pair.  The inputs are shuffled with
 * mask0 and mask1 (src1 is passed as the first intrinsic operand), combined
 * with FILT_4TAP_DPADD_S_H, then passed through __lsx_vsrari_h(., 7) and
 * __lsx_vsat_h(., 7).  Evaluates to the resulting __m128i.
 */
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)        \
( {                                                                         \
    __m128i h4_pair0_m, h4_pair1_m;                                         \
    __m128i h4_sum_m;                                                       \
                                                                            \
    h4_pair0_m = __lsx_vshuf_b(src1, src0, mask0);                          \
    h4_pair1_m = __lsx_vshuf_b(src1, src0, mask1);                          \
    h4_sum_m   = FILT_4TAP_DPADD_S_H(h4_pair0_m, h4_pair1_m,                \
                                     filt_h0, filt_h1);                     \
                                                                            \
    h4_sum_m = __lsx_vsat_h(__lsx_vsrari_h(h4_sum_m, 7), 7);                \
                                                                            \
    h4_sum_m;                                                               \
} )
123 
124 void ff_put_vp8_epel8_h6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
125  uint8_t *src, ptrdiff_t src_stride,
126  int height, int mx, int my)
127 {
128  uint32_t loop_cnt;
129  const int8_t *filter = subpel_filters_lsx[mx - 1];
130  __m128i src0, src1, src2, src3, filt0, filt1, filt2;
131  __m128i mask0, mask1, mask2;
132  __m128i out0, out1, out2, out3;
133 
134  ptrdiff_t src_stride2 = src_stride << 1;
135  ptrdiff_t src_stride3 = src_stride2 + src_stride;
136  ptrdiff_t src_stride4 = src_stride2 << 1;
137 
138  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
139  src -= 2;
140 
141  /* rearranging filter */
142  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
143  filt2 = __lsx_vldrepl_h(filter, 4);
144 
145  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
146 
147  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
148  src + src_stride3, 0, src0, src1, src2, src3);
149  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
150  src0, src1, src2, src3);
151  src += src_stride4;
152  HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
153  filt0, filt1, filt2, out0, out1, out2, out3);
154 
155  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
156  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
157  __lsx_vstelm_d(out0, dst, 0, 0);
158  dst += dst_stride;
159  __lsx_vstelm_d(out0, dst, 0, 1);
160  dst += dst_stride;
161  __lsx_vstelm_d(out1, dst, 0, 0);
162  dst += dst_stride;
163  __lsx_vstelm_d(out1, dst, 0, 1);
164  dst += dst_stride;
165 
166  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
167  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
168  src + src_stride3, 0, src0, src1, src2, src3);
169  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
170  src0, src1, src2, src3);
171  src += src_stride4;
172  HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
173  filt0, filt1, filt2, out0, out1, out2, out3);
174 
175  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
176  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
177 
178  __lsx_vstelm_d(out0, dst, 0, 0);
179  dst += dst_stride;
180  __lsx_vstelm_d(out0, dst, 0, 1);
181  dst += dst_stride;
182  __lsx_vstelm_d(out1, dst, 0, 0);
183  dst += dst_stride;
184  __lsx_vstelm_d(out1, dst, 0, 1);
185  dst += dst_stride;
186  }
187 }
188 
189 void ff_put_vp8_epel16_h6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
190  uint8_t *src, ptrdiff_t src_stride,
191  int height, int mx, int my)
192 {
193  uint32_t loop_cnt;
194  const int8_t *filter = subpel_filters_lsx[mx - 1];
195  __m128i src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1;
196  __m128i filt2, mask0, mask1, mask2;
197  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
198 
199  ptrdiff_t src_stride2 = src_stride << 1;
200  ptrdiff_t src_stride3 = src_stride2 + src_stride;
201  ptrdiff_t src_stride4 = src_stride2 << 1;
202 
203  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
204  src -= 2;
205  /* rearranging filter */
206  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
207  filt2 = __lsx_vldrepl_h(filter, 4);
208 
209  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
210 
211  for (loop_cnt = (height >> 2); loop_cnt--;) {
212  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2,
213  0, src + src_stride3, 0, src0 ,src2, src4, src6);
214  DUP4_ARG2(__lsx_vld, src, 8, src + src_stride, 8, src + src_stride2,
215  8, src + src_stride3, 8, src1, src3, src5, src7);
216 
217  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
218  src0, src1, src2, src3);
219  DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128,
220  src4, src5, src6, src7);
221  src += src_stride4;
222 
223  HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
224  filt0, filt1, filt2, out0, out1, out2, out3);
225  HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
226  filt0, filt1, filt2, out4, out5, out6, out7);
227  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
228  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
229  __lsx_vst(out0, dst, 0);
230  dst += dst_stride;
231  __lsx_vst(out1, dst, 0);
232  dst += dst_stride;
233 
234  DUP2_ARG3(__lsx_vssrarni_b_h, out5, out4, 7, out7, out6, 7, out4, out5);
235  DUP2_ARG2(__lsx_vxori_b, out4, 128, out5, 128, out4, out5);
236  __lsx_vst(out4, dst, 0);
237  dst += dst_stride;
238  __lsx_vst(out5, dst, 0);
239  dst += dst_stride;
240  }
241 }
242 
243 void ff_put_vp8_epel8_v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
244  uint8_t *src, ptrdiff_t src_stride,
245  int height, int mx, int my)
246 {
247  uint32_t loop_cnt;
248  const int8_t *filter = subpel_filters_lsx[my - 1];
249  __m128i src0, src1, src2, src3, src4, src7, src8, src9, src10;
250  __m128i src10_l, src32_l, src76_l, src98_l, src21_l, src43_l, src87_l;
251  __m128i src109_l, filt0, filt1, filt2;
252  __m128i out0_l, out1_l, out2_l, out3_l;
253 
254  ptrdiff_t src_stride2 = src_stride << 1;
255  ptrdiff_t src_stride3 = src_stride2 + src_stride;
256  ptrdiff_t src_stride4 = src_stride2 << 1;
257 
258  src -= src_stride2;
259  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
260  filt2 = __lsx_vldrepl_h(filter, 4);
261 
262  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
263  src + src_stride3, 0, src0, src1, src2, src3);
264  src += src_stride4;
265  src4 = __lsx_vld(src, 0);
266  src += src_stride;
267 
268  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
269  src0, src1, src2, src3);
270  src4 = __lsx_vxori_b(src4, 128);
271 
272  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src2, src1, src4,
273  src3, src10_l, src32_l, src21_l, src43_l);
274  for (loop_cnt = (height >> 2); loop_cnt--;) {
275  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2,
276  0, src + src_stride3, 0, src7, src8, src9, src10);
277  DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10,
278  128, src7, src8, src9, src10);
279  src += src_stride4;
280 
281  DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10,
282  src9, src76_l, src87_l, src98_l, src109_l);
283 
284  out0_l = DPADD_SH3_SH(src10_l, src32_l, src76_l, filt0, filt1, filt2);
285  out1_l = DPADD_SH3_SH(src21_l, src43_l, src87_l, filt0, filt1, filt2);
286  out2_l = DPADD_SH3_SH(src32_l, src76_l, src98_l, filt0, filt1, filt2);
287  out3_l = DPADD_SH3_SH(src43_l, src87_l, src109_l, filt0, filt1, filt2);
288 
289  DUP2_ARG3(__lsx_vssrarni_b_h, out1_l, out0_l, 7, out3_l, out2_l, 7,
290  out0_l, out1_l);
291  DUP2_ARG2(__lsx_vxori_b, out0_l, 128, out1_l, 128, out0_l, out1_l);
292 
293  __lsx_vstelm_d(out0_l, dst, 0, 0);
294  dst += dst_stride;
295  __lsx_vstelm_d(out0_l, dst, 0, 1);
296  dst += dst_stride;
297  __lsx_vstelm_d(out1_l, dst, 0, 0);
298  dst += dst_stride;
299  __lsx_vstelm_d(out1_l, dst, 0, 1);
300  dst += dst_stride;
301 
302  src10_l = src76_l;
303  src32_l = src98_l;
304  src21_l = src87_l;
305  src43_l = src109_l;
306  src4 = src10;
307  }
308 }
309 
310 void ff_put_vp8_epel16_v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
311  uint8_t *src, ptrdiff_t src_stride,
312  int height, int mx, int my)
313 {
314  uint32_t loop_cnt;
315  const int8_t *filter = subpel_filters_lsx[my - 1];
316  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
317  __m128i src10_l, src32_l, src54_l, src76_l, src21_l, src43_l, src65_l, src87_l;
318  __m128i src10_h, src32_h, src54_h, src76_h, src21_h, src43_h, src65_h, src87_h;
319  __m128i filt0, filt1, filt2;
320  __m128i tmp0, tmp1, tmp2, tmp3;
321 
322  ptrdiff_t src_stride2 = src_stride << 1;
323  ptrdiff_t src_stride3 = src_stride2 + src_stride;
324  ptrdiff_t src_stride4 = src_stride2 << 1;
325 
326  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
327  filt2 = __lsx_vldrepl_h(filter, 4);
328 
329  DUP4_ARG2(__lsx_vld, src - src_stride2, 0, src - src_stride, 0, src, 0,
330  src + src_stride, 0, src0, src1, src2, src3);
331  src4 = __lsx_vld(src + src_stride2, 0);
332  src += src_stride3;
333 
334  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
335  src1, src2, src3);
336  src4 = __lsx_vxori_b(src4, 128);
337 
338  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src4, src3, src2, src1,
339  src10_l, src32_l, src43_l, src21_l);
340  DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src4, src3, src2, src1,
341  src10_h, src32_h, src43_h, src21_h);
342 
343  for (loop_cnt = (height >> 2); loop_cnt--;) {
344  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
345  src + src_stride3, 0, src5, src6, src7, src8);
346  src += src_stride4;
347  DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128,
348  src5, src6, src7, src8);
349 
350  DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
351  src54_l, src65_l, src76_l, src87_l);
352  DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, src7,
353  src54_h, src65_h, src76_h, src87_h);
354 
355  tmp0 = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
356  tmp1 = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
357  tmp2 = DPADD_SH3_SH(src10_h, src32_h, src54_h, filt0, filt1, filt2);
358  tmp3 = DPADD_SH3_SH(src21_h, src43_h, src65_h, filt0, filt1, filt2);
359 
360  DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
361  DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
362  __lsx_vst(tmp0, dst, 0);
363  dst += dst_stride;
364  __lsx_vst(tmp1, dst, 0);
365  dst += dst_stride;
366 
367  tmp0 = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
368  tmp1 = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
369  tmp2 = DPADD_SH3_SH(src32_h, src54_h, src76_h, filt0, filt1, filt2);
370  tmp3 = DPADD_SH3_SH(src43_h, src65_h, src87_h, filt0, filt1, filt2);
371 
372  DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
373  DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
374  __lsx_vst(tmp0, dst, 0);
375  dst += dst_stride;
376  __lsx_vst(tmp1, dst, 0);
377  dst += dst_stride;
378 
379  src10_l = src54_l;
380  src32_l = src76_l;
381  src21_l = src65_l;
382  src43_l = src87_l;
383  src10_h = src54_h;
384  src32_h = src76_h;
385  src21_h = src65_h;
386  src43_h = src87_h;
387  src4 = src8;
388  }
389 }
390 
391 void ff_put_vp8_epel8_h6v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
392  uint8_t *src, ptrdiff_t src_stride,
393  int height, int mx, int my)
394 {
395  uint32_t loop_cnt;
396  const int8_t *filter_horiz = subpel_filters_lsx[mx - 1];
397  const int8_t *filter_vert = subpel_filters_lsx[my - 1];
398  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
399  __m128i filt_hz0, filt_hz1, filt_hz2;
400  __m128i mask0, mask1, mask2, filt_vt0, filt_vt1, filt_vt2;
401  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
402  __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
403  __m128i tmp0, tmp1, tmp2, tmp3;
404 
405  ptrdiff_t src_stride2 = src_stride << 1;
406  ptrdiff_t src_stride3 = src_stride2 + src_stride;
407  ptrdiff_t src_stride4 = src_stride2 << 1;
408 
409  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
410  src -= (2 + src_stride2);
411 
412  /* rearranging filter */
413  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, filt_hz1);
414  filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);
415 
416  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
417 
418  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
419  src + src_stride3, 0, src0, src1, src2, src3);
420  src += src_stride4;
421  src4 = __lsx_vld(src, 0);
422  src += src_stride;
423 
424  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
425  src0 ,src1, src2, src3);
426  src4 = __lsx_vxori_b(src4, 128);
427 
428  hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
429  filt_hz1, filt_hz2);
430  hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
431  filt_hz1, filt_hz2);
432  hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
433  filt_hz1, filt_hz2);
434  hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
435  filt_hz1, filt_hz2);
436  hz_out4 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
437  filt_hz1, filt_hz2);
438 
439  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, filt_vt1);
440  filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);
441 
442  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
443  DUP2_ARG2(__lsx_vpackev_b, hz_out2, hz_out1, hz_out4, hz_out3, out3, out4);
444  for (loop_cnt = (height >> 2); loop_cnt--;) {
445  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
446  src + src_stride3, 0, src5, src6, src7, src8);
447  src += src_stride4;
448 
449  DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128,
450  src5, src6, src7, src8);
451 
452  hz_out5 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
453  filt_hz1, filt_hz2);
454  out2 = __lsx_vpackev_b(hz_out5, hz_out4);
455  tmp0 = DPADD_SH3_SH(out0, out1, out2,filt_vt0, filt_vt1, filt_vt2);
456 
457  hz_out6 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
458  filt_hz1, filt_hz2);
459  out5 = __lsx_vpackev_b(hz_out6, hz_out5);
460  tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
461 
462  hz_out7 = HORIZ_6TAP_FILT(src7, src7, mask0, mask1, mask2, filt_hz0,
463  filt_hz1, filt_hz2);
464 
465  out7 = __lsx_vpackev_b(hz_out7, hz_out6);
466  tmp2 = DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
467 
468  hz_out8 = HORIZ_6TAP_FILT(src8, src8, mask0, mask1, mask2, filt_hz0,
469  filt_hz1, filt_hz2);
470  out6 = __lsx_vpackev_b(hz_out8, hz_out7);
471  tmp3 = DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
472 
473  DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
474  DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
475  __lsx_vstelm_d(tmp0, dst, 0, 0);
476 
477  dst += dst_stride;
478  __lsx_vstelm_d(tmp0, dst, 0, 1);
479  dst += dst_stride;
480  __lsx_vstelm_d(tmp1, dst, 0, 0);
481  dst += dst_stride;
482  __lsx_vstelm_d(tmp1, dst, 0, 1);
483  dst += dst_stride;
484 
485  hz_out4 = hz_out8;
486  out0 = out2;
487  out1 = out7;
488  out3 = out5;
489  out4 = out6;
490  }
491 }
492 
/*
 * 16-pixel-wide h6v6 filter, implemented as two calls to the 8-wide routine:
 * first on the columns starting at src/dst, then on those starting 8 bytes
 * further right.  All other arguments are forwarded unchanged.
 */
void ff_put_vp8_epel16_h6v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    ff_put_vp8_epel8_h6v6_lsx(dst, dst_stride, src, src_stride,
                              height, mx, my);
    ff_put_vp8_epel8_h6v6_lsx(dst + 8, dst_stride, src + 8, src_stride,
                              height, mx, my);
}
505 
506 void ff_put_vp8_epel8_v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
507  uint8_t *src, ptrdiff_t src_stride,
508  int height, int mx, int my)
509 {
510  uint32_t loop_cnt;
511  const int8_t *filter = subpel_filters_lsx[my - 1];
512  __m128i src0, src1, src2, src7, src8, src9, src10;
513  __m128i src10_l, src72_l, src98_l, src21_l, src87_l, src109_l, filt0, filt1;
514  __m128i out0, out1, out2, out3;
515 
516  ptrdiff_t src_stride2 = src_stride << 1;
517  ptrdiff_t src_stride3 = src_stride2 + src_stride;
518  ptrdiff_t src_stride4 = src_stride2 << 1;
519 
520  src -= src_stride;
521 
522  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
523  DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src0, src1);
524  src2 = __lsx_vld(src + src_stride2, 0);
525  src += src_stride3;
526 
527  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
528  src2 = __lsx_vxori_b(src2, 128);
529  DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_l, src21_l);
530 
531  for (loop_cnt = (height >> 2); loop_cnt--;) {
532  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
533  src + src_stride3, 0, src7, src8, src9, src10);
534  src += src_stride4;
535 
536  DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
537  src7, src8, src9, src10);
538  DUP4_ARG2(__lsx_vilvl_b, src7, src2, src8, src7, src9, src8, src10, src9,
539  src72_l, src87_l, src98_l, src109_l);
540 
541  out0 = FILT_4TAP_DPADD_S_H(src10_l, src72_l, filt0, filt1);
542  out1 = FILT_4TAP_DPADD_S_H(src21_l, src87_l, filt0, filt1);
543  out2 = FILT_4TAP_DPADD_S_H(src72_l, src98_l, filt0, filt1);
544  out3 = FILT_4TAP_DPADD_S_H(src87_l, src109_l, filt0, filt1);
545  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
546  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
547 
548  __lsx_vstelm_d(out0, dst, 0, 0);
549  dst += dst_stride;
550  __lsx_vstelm_d(out0, dst, 0, 1);
551  dst += dst_stride;
552  __lsx_vstelm_d(out1, dst, 0, 0);
553  dst += dst_stride;
554  __lsx_vstelm_d(out1, dst, 0, 1);
555  dst += dst_stride;
556 
557  src10_l = src98_l;
558  src21_l = src109_l;
559  src2 = src10;
560  }
561 }
562 
563 void ff_put_vp8_epel16_v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
564  uint8_t *src, ptrdiff_t src_stride,
565  int height, int mx, int my)
566 {
567  uint32_t loop_cnt;
568  const int8_t *filter = subpel_filters_lsx[my - 1];
569  __m128i src0, src1, src2, src3, src4, src5, src6;
570  __m128i src10_l, src32_l, src54_l, src21_l, src43_l, src65_l, src10_h;
571  __m128i src32_h, src54_h, src21_h, src43_h, src65_h, filt0, filt1;
572  __m128i tmp0, tmp1, tmp2, tmp3;
573 
574  ptrdiff_t src_stride2 = src_stride << 1;
575  ptrdiff_t src_stride3 = src_stride2 + src_stride;
576  ptrdiff_t src_stride4 = src_stride2 << 1;
577 
578  src -= src_stride;
579  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
580  DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src0, src1);
581  src2 = __lsx_vld(src + src_stride2, 0);
582  src += src_stride3;
583 
584  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
585  src2 = __lsx_vxori_b(src2, 128);
586  DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_l, src21_l);
587  DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_h, src21_h);
588 
589  for (loop_cnt = (height >> 2); loop_cnt--;) {
590  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2,
591  0, src + src_stride3, 0, src3, src4, src5, src6);
592  src += src_stride4;
593 
594  DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128,
595  src3, src4, src5, src6);
596  DUP4_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src5, src4, src6,
597  src5, src32_l, src43_l, src54_l, src65_l);
598  DUP4_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src5, src4, src6,
599  src5, src32_h, src43_h, src54_h, src65_h);
600 
601  tmp0 = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
602  tmp1 = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
603  tmp2 = FILT_4TAP_DPADD_S_H(src10_h, src32_h, filt0, filt1);
604  tmp3 = FILT_4TAP_DPADD_S_H(src21_h, src43_h, filt0, filt1);
605  DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
606  DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
607 
608  __lsx_vst(tmp0, dst, 0);
609  dst += dst_stride;
610  __lsx_vst(tmp1, dst, 0);
611  dst += dst_stride;
612 
613  tmp0 = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
614  tmp1 = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
615  tmp2 = FILT_4TAP_DPADD_S_H(src32_h, src54_h, filt0, filt1);
616  tmp3 = FILT_4TAP_DPADD_S_H(src43_h, src65_h, filt0, filt1);
617  DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
618  DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
619 
620  __lsx_vst(tmp0, dst, 0);
621  dst += dst_stride;
622  __lsx_vst(tmp1, dst, 0);
623  dst += dst_stride;
624 
625  src10_l = src54_l;
626  src21_l = src65_l;
627  src10_h = src54_h;
628  src21_h = src65_h;
629  src2 = src6;
630  }
631 }
632 
633 void ff_put_vp8_epel8_h6v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
634  uint8_t *src, ptrdiff_t src_stride,
635  int height, int mx, int my)
636 {
637  uint32_t loop_cnt;
638  const int8_t *filter_horiz = subpel_filters_lsx[mx - 1];
639  const int8_t *filter_vert = subpel_filters_lsx[my - 1];
640  __m128i src0, src1, src2, src3, src4, src5, src6;
641  __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
642  __m128i filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
643  __m128i tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
644 
645  ptrdiff_t src_stride2 = src_stride << 1;
646  ptrdiff_t src_stride3 = src_stride2 + src_stride;
647  ptrdiff_t src_stride4 = src_stride2 << 1;
648 
649  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
650  src -= (2 + src_stride);
651 
652  /* rearranging filter */
653  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, filt_hz1);
654  filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);
655 
656  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
657 
658  DUP2_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src0, src1);
659  src2 = __lsx_vld(src + src_stride2, 0);
660  src += src_stride3;
661 
662  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
663  src2 = __lsx_vxori_b(src2, 128);
664  hz_out0 = HORIZ_6TAP_FILT(src0, src0, mask0, mask1, mask2, filt_hz0,
665  filt_hz1, filt_hz2);
666  hz_out1 = HORIZ_6TAP_FILT(src1, src1, mask0, mask1, mask2, filt_hz0,
667  filt_hz1, filt_hz2);
668  hz_out2 = HORIZ_6TAP_FILT(src2, src2, mask0, mask1, mask2, filt_hz0,
669  filt_hz1, filt_hz2);
670  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2);
671 
672  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, filt_vt1);
673 
674  for (loop_cnt = (height >> 2); loop_cnt--;) {
675  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
676  src + src_stride3, 0, src3, src4, src5, src6);
677  src += src_stride4;
678 
679  DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128,
680  src3, src4, src5, src6);
681 
682  hz_out3 = HORIZ_6TAP_FILT(src3, src3, mask0, mask1, mask2, filt_hz0,
683  filt_hz1, filt_hz2);
684  vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
685  tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt_vt0, filt_vt1);
686 
687  hz_out0 = HORIZ_6TAP_FILT(src4, src4, mask0, mask1, mask2, filt_hz0,
688  filt_hz1, filt_hz2);
689  vec3 = __lsx_vpackev_b(hz_out0, hz_out3);
690  tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt_vt0, filt_vt1);
691 
692  hz_out1 = HORIZ_6TAP_FILT(src5, src5, mask0, mask1, mask2, filt_hz0,
693  filt_hz1, filt_hz2);
694  vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
695  tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec0, filt_vt0, filt_vt1);
696 
697  hz_out2 = HORIZ_6TAP_FILT(src6, src6, mask0, mask1, mask2, filt_hz0,
698  filt_hz1, filt_hz2);
699  DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec1, vec2);
700  tmp3 = FILT_4TAP_DPADD_S_H(vec1, vec2, filt_vt0, filt_vt1);
701 
702  DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
703  DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
704 
705  __lsx_vstelm_d(tmp0, dst, 0, 0);
706  dst += dst_stride;
707  __lsx_vstelm_d(tmp0, dst, 0, 1);
708  dst += dst_stride;
709  __lsx_vstelm_d(tmp1, dst, 0, 0);
710  dst += dst_stride;
711  __lsx_vstelm_d(tmp1, dst, 0, 1);
712  dst += dst_stride;
713  }
714 }
715 
/*
 * 16-pixel-wide h6v4 filter, implemented as two calls to the 8-wide routine:
 * first on the columns starting at src/dst, then on those starting 8 bytes
 * further right.  All other arguments are forwarded unchanged.
 */
void ff_put_vp8_epel16_h6v4_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    ff_put_vp8_epel8_h6v4_lsx(dst, dst_stride, src, src_stride,
                              height, mx, my);
    ff_put_vp8_epel8_h6v4_lsx(dst + 8, dst_stride, src + 8, src_stride,
                              height, mx, my);
}
729 
730 void ff_put_vp8_epel8_h4v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
731  uint8_t *src, ptrdiff_t src_stride,
732  int height, int mx, int my)
733 {
734  uint32_t loop_cnt;
735  const int8_t *filter_horiz = subpel_filters_lsx[mx - 1];
736  const int8_t *filter_vert = subpel_filters_lsx[my - 1];
737  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
738  __m128i filt_hz0, filt_hz1, mask0, mask1;
739  __m128i filt_vt0, filt_vt1, filt_vt2;
740  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
741  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
742 
743  ptrdiff_t src_stride2 = src_stride << 1;
744  ptrdiff_t src_stride3 = src_stride2 + src_stride;
745  ptrdiff_t src_stride4 = src_stride2 << 1;
746 
747  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
748  src -= (1 + src_stride2);
749 
750  /* rearranging filter */
751  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0, filt_hz1);
752  mask1 = __lsx_vaddi_bu(mask0, 2);
753 
754  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
755  src + src_stride3, 0, src0, src1, src2, src3);
756  src += src_stride4;
757  src4 = __lsx_vld(src, 0);
758  src += src_stride;
759 
760  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
761  src0, src1, src2, src3);
762  src4 = __lsx_vxori_b(src4, 128);
763 
764  tmp0 = HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
765  tmp1 = HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
766  tmp2 = HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
767  tmp3 = HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
768  tmp4 = HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
769 
770  DUP4_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp2, tmp1,
771  tmp4, tmp3, out0, out1, out3, out4);
772 
773  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0, filt_vt1);
774  filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);
775 
776  for (loop_cnt = (height >> 2); loop_cnt--;) {
777  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
778  src + src_stride3, 0, src5, src6, src7, src8);
779  src += src_stride4;
780 
781  DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128,
782  src5, src6, src7, src8);
783 
784  tmp5 = HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
785  out2 = __lsx_vpackev_b(tmp5, tmp4);
786  tmp0 = DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
787 
788  tmp6 = HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
789  out5 = __lsx_vpackev_b(tmp6, tmp5);
790  tmp1 = DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
791 
792  tmp7 = HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
793  out6 = __lsx_vpackev_b(tmp7, tmp6);
794  tmp2 = DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
795 
796  tmp8 = HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
797  out7 = __lsx_vpackev_b(tmp8, tmp7);
798  tmp3 = DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
799 
800  DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
801  DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
802 
803  __lsx_vstelm_d(tmp0, dst, 0, 0);
804  dst += dst_stride;
805  __lsx_vstelm_d(tmp0, dst, 0, 1);
806  dst += dst_stride;
807  __lsx_vstelm_d(tmp1, dst, 0, 0);
808  dst += dst_stride;
809  __lsx_vstelm_d(tmp1, dst, 0, 1);
810  dst += dst_stride;
811 
812  tmp4 = tmp8;
813  out0 = out2;
814  out1 = out6;
815  out3 = out5;
816  out4 = out7;
817  }
818 }
819 
/*
 * 16-pixel-wide h4v6 filter, implemented as two calls to the 8-wide routine:
 * first on the columns starting at src/dst, then on those starting 8 bytes
 * further right.  All other arguments are forwarded unchanged.
 */
void ff_put_vp8_epel16_h4v6_lsx(uint8_t *dst, ptrdiff_t dst_stride,
                                uint8_t *src, ptrdiff_t src_stride,
                                int height, int mx, int my)
{
    ff_put_vp8_epel8_h4v6_lsx(dst, dst_stride, src, src_stride,
                              height, mx, my);
    ff_put_vp8_epel8_h4v6_lsx(dst + 8, dst_stride, src + 8, src_stride,
                              height, mx, my);
}
833 
834 void ff_put_vp8_pixels8_lsx(uint8_t *dst, ptrdiff_t dst_stride,
835  uint8_t *src, ptrdiff_t src_stride,
836  int height, int mx, int my)
837 {
838  int32_t cnt;
839  __m128i src0, src1, src2, src3;
840 
841  ptrdiff_t src_stride2 = src_stride << 1;
842  ptrdiff_t src_stride3 = src_stride2 + src_stride;
843  ptrdiff_t src_stride4 = src_stride2 << 1;
844 
845  if (0 == height % 8) {
846  for (cnt = height >> 3; cnt--;) {
847  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
848  src + src_stride3, 0, src0, src1, src2, src3);
849  src += src_stride4;
850 
851  __lsx_vstelm_d(src0, dst, 0, 0);
852  dst += dst_stride;
853  __lsx_vstelm_d(src1, dst, 0, 0);
854  dst += dst_stride;
855  __lsx_vstelm_d(src2, dst, 0, 0);
856  dst += dst_stride;
857  __lsx_vstelm_d(src3, dst, 0, 0);
858  dst += dst_stride;
859 
860  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
861  src + src_stride3, 0, src0, src1, src2, src3);
862  src += src_stride4;
863 
864  __lsx_vstelm_d(src0, dst, 0, 0);
865  dst += dst_stride;
866  __lsx_vstelm_d(src1, dst, 0, 0);
867  dst += dst_stride;
868  __lsx_vstelm_d(src2, dst, 0, 0);
869  dst += dst_stride;
870  __lsx_vstelm_d(src3, dst, 0, 0);
871  dst += dst_stride;
872  }
873  } else if( 0 == height % 4) {
874  for (cnt = (height >> 2); cnt--;) {
875  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
876  src + src_stride3, 0, src0, src1, src2, src3);
877  src += src_stride4;
878 
879  __lsx_vstelm_d(src0, dst, 0, 0);
880  dst += dst_stride;
881  __lsx_vstelm_d(src1, dst, 0, 0);
882  dst += dst_stride;
883  __lsx_vstelm_d(src2, dst, 0, 0);
884  dst += dst_stride;
885  __lsx_vstelm_d(src3, dst, 0, 0);
886  dst += dst_stride;
887  }
888  }
889 }
890 
891 void ff_put_vp8_pixels16_lsx(uint8_t *dst, ptrdiff_t dst_stride,
892  uint8_t *src, ptrdiff_t src_stride,
893  int height, int mx, int my)
894 {
895  int32_t width = 16;
896  int32_t cnt, loop_cnt;
897  uint8_t *src_tmp, *dst_tmp;
898  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
899 
900  ptrdiff_t src_stride2 = src_stride << 1;
901  ptrdiff_t src_stride3 = src_stride2 + src_stride;
902  ptrdiff_t src_stride4 = src_stride2 << 1;
903 
904  ptrdiff_t dst_stride2 = dst_stride << 1;
905  ptrdiff_t dst_stride3 = dst_stride2 + dst_stride;
906  ptrdiff_t dst_stride4 = dst_stride2 << 1;
907 
908  if (0 == height % 8) {
909  for (cnt = (width >> 4); cnt--;) {
910  src_tmp = src;
911  dst_tmp = dst;
912  for (loop_cnt = (height >> 3); loop_cnt--;) {
913  DUP4_ARG2(__lsx_vld, src_tmp, 0, src_tmp + src_stride, 0,
914  src_tmp + src_stride2, 0, src_tmp + src_stride3, 0,
915  src4, src5, src6, src7);
916  src_tmp += src_stride4;
917 
918  __lsx_vst(src4, dst_tmp, 0);
919  __lsx_vst(src5, dst_tmp + dst_stride, 0);
920  __lsx_vst(src6, dst_tmp + dst_stride2, 0);
921  __lsx_vst(src7, dst_tmp + dst_stride3, 0);
922  dst_tmp += dst_stride4;
923 
924  DUP4_ARG2(__lsx_vld, src_tmp, 0, src_tmp + src_stride, 0,
925  src_tmp + src_stride2, 0, src_tmp + src_stride3, 0,
926  src4, src5, src6, src7);
927  src_tmp += src_stride4;
928 
929  __lsx_vst(src4, dst_tmp, 0);
930  __lsx_vst(src5, dst_tmp + dst_stride, 0);
931  __lsx_vst(src6, dst_tmp + dst_stride2, 0);
932  __lsx_vst(src7, dst_tmp + dst_stride3, 0);
933  dst_tmp += dst_stride4;
934  }
935  src += 16;
936  dst += 16;
937  }
938  } else if (0 == height % 4) {
939  for (cnt = (height >> 2); cnt--;) {
940  DUP4_ARG2(__lsx_vld, src, 0, src + src_stride, 0, src + src_stride2, 0,
941  src + src_stride3, 0, src0, src1, src2, src3);
942  src += 4 * src_stride4;
943 
944  __lsx_vst(src0, dst, 0);
945  __lsx_vst(src1, dst + dst_stride, 0);
946  __lsx_vst(src2, dst + dst_stride2, 0);
947  __lsx_vst(src3, dst + dst_stride3, 0);
948  dst += dst_stride4;
949  }
950  }
951 }
ff_put_vp8_epel16_h6v4_lsx
void ff_put_vp8_epel16_h6v4_lsx(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:716
vp8dsp_loongarch.h
ff_put_vp8_epel8_h6v6_lsx
void ff_put_vp8_epel8_h6v6_lsx(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:391
src1
const pixel * src1
Definition: h264pred_template.c:421
HORIZ_6TAP_FILT
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, filt_h0, filt_h1, filt_h2)
Definition: vp8_mc_lsx.c:62
ff_put_vp8_pixels8_lsx
void ff_put_vp8_pixels8_lsx(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:834
subpel_filters_lsx
static const int8_t subpel_filters_lsx[7][8]
Definition: vp8_mc_lsx.c:33
filter
filter_frame: For filters that do not use the activate() callback, this method is called when a frame is pushed to the filter's input. It can be called at any time except in a reentrant way. If the input frame is enough to produce output, then the filter should push the output frames on the output link immediately. As an exception to the previous rule, if the input frame is enough to produce several output frames, then the filter needs output only at least one per link. The additional frames can be left buffered in the filter.
Definition: filter_design.txt:228
DUP2_ARG2
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1)
Definition: loongson_intrinsics.h:58
ff_put_vp8_pixels16_lsx
void ff_put_vp8_pixels16_lsx(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:891
ff_put_vp8_epel16_h4v6_lsx
void ff_put_vp8_epel16_h4v6_lsx(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:820
HORIZ_4TAP_FILT
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)
Definition: vp8_mc_lsx.c:110
ff_put_vp8_epel16_h6_lsx
void ff_put_vp8_epel16_h6_lsx(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:189
ff_put_vp8_epel8_v6_lsx
void ff_put_vp8_epel8_v6_lsx(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:243
vp8dsp.h
DUP4_ARG2
#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, _OUT1, _OUT2, _OUT3)
Definition: loongson_intrinsics.h:76
width
#define width
FILT_4TAP_DPADD_S_H
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)
Definition: vp8_mc_lsx.c:100
mc_filt_mask_arr
static const uint8_t mc_filt_mask_arr[16 *3]
Definition: vp8_mc_lsx.c:24
HORIZ_6TAP_8WID_4VECS_FILT
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0, filt1, filt2, out0, out1, out2, out3)
Definition: vp8_mc_lsx.c:79
ff_put_vp8_epel8_h6_lsx
void ff_put_vp8_epel8_h6_lsx(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:124
DUP2_ARG3
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1)
Definition: loongson_intrinsics.h:64
height
#define height
ff_put_vp8_epel16_v6_lsx
void ff_put_vp8_epel16_v6_lsx(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:310
ff_put_vp8_epel16_v4_lsx
void ff_put_vp8_epel16_v4_lsx(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:563
src2
const pixel * src2
Definition: h264pred_template.c:422
ff_put_vp8_epel8_h6v4_lsx
void ff_put_vp8_epel8_h6v4_lsx(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:633
DPADD_SH3_SH
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)
Definition: vp8_mc_lsx.c:43
src0
const pixel *const src0
Definition: h264pred_template.c:420
loongson_intrinsics.h
ff_put_vp8_epel8_h4v6_lsx
void ff_put_vp8_epel8_h4v6_lsx(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:730
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
int32_t
int32_t
Definition: audioconvert.c:56
ff_put_vp8_epel8_v4_lsx
void ff_put_vp8_epel8_v4_lsx(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:506
ff_put_vp8_epel16_h6v6_lsx
void ff_put_vp8_epel16_h6v6_lsx(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
Definition: vp8_mc_lsx.c:493