    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
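/* Shuffle-index table consumed by __lsx_vshuf_b when gathering the sliding
 * 8-tap windows: the first row walks one 16-byte source vector (8-width
 * cases), while the next two rows serve the 4-width cases, where indices
 * 16..31 select bytes from the other source operand of the shuffle. */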
#define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3,               \
                                   _mask0, _mask1, _mask2, _mask3,           \
                                   _filter0, _filter1, _filter2, _filter3,   \
                                   _out0, _out1)                             \
{                                                                            \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;          \
    __m128i _reg0, _reg1, _reg2, _reg3;                                      \
                                                                             \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src3, _src2, _mask0,     \
              _tmp0, _tmp1);                                                 \
    DUP2_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _reg0, _reg1);\
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask1, _src3, _src2, _mask1,     \
              _tmp2, _tmp3);                                                 \
    DUP2_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp2, _filter1, _reg1, _tmp3,       \
              _filter1, _reg0, _reg1);                                       \
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask2, _src3, _src2, _mask2,     \
              _tmp4, _tmp5);                                                 \
    DUP2_ARG2(__lsx_vdp2_h_b, _tmp4, _filter2, _tmp5, _filter2, _reg2, _reg3);\
    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask3, _src3, _src2, _mask3,     \
              _tmp6, _tmp7);                                                 \
    DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7,       \
              _filter3, _reg2, _reg3);                                       \
    DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1);      \
}
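/* The same dot-product scheme as HORIZ_8TAP_4WID_4VECS_FILT, applied to
 * four independent 8-wide rows: taps 0/2 start fresh accumulators with
 * __lsx_vdp2_h_b, taps 1/3 are folded in with __lsx_vdp2add_h_b, and the
 * two partial sums are merged with a saturating add so intermediate
 * overflow clamps instead of wrapping. */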
#define HORIZ_8TAP_8WID_4VECS_FILT(_src0, _src1, _src2, _src3,               \
                                   _mask0, _mask1, _mask2, _mask3,           \
                                   _filter0, _filter1, _filter2, _filter3,   \
                                   _out0, _out1, _out2, _out3)               \
{                                                                            \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;          \
    __m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7;          \
                                                                             \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask0, _src1, _src1, _mask0, _src2,\
              _src2, _mask0, _src3, _src3, _mask0, _tmp0, _tmp1, _tmp2, _tmp3);\
    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _tmp2,       \
              _filter0, _tmp3, _filter0, _reg0, _reg1, _reg2, _reg3);        \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask2, _src1, _src1, _mask2, _src2,\
              _src2, _mask2, _src3, _src3, _mask2, _tmp0, _tmp1, _tmp2, _tmp3);\
    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter2, _tmp1, _filter2, _tmp2,       \
              _filter2, _tmp3, _filter2, _reg4, _reg5, _reg6, _reg7);        \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask1, _src1, _src1, _mask1, _src2,\
              _src2, _mask1, _src3, _src3, _mask1, _tmp4, _tmp5, _tmp6, _tmp7);\
    DUP4_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp4, _filter1, _reg1, _tmp5,       \
              _filter1, _reg2, _tmp6, _filter1, _reg3, _tmp7, _filter1, _reg0,\
              _reg1, _reg2, _reg3);                                          \
    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask3, _src1, _src1, _mask3, _src2,\
              _src2, _mask3, _src3, _src3, _mask3, _tmp4, _tmp5, _tmp6, _tmp7);\
    DUP4_ARG3(__lsx_vdp2add_h_b, _reg4, _tmp4, _filter3, _reg5, _tmp5,       \
              _filter3, _reg6, _tmp6, _filter3, _reg7, _tmp7, _filter3, _reg4,\
              _reg5, _reg6, _reg7);                                          \
    DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6, _reg3,\
              _reg7, _out0, _out1, _out2, _out3);                            \
}
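/* Folds one 8-tap vertical filter step: each _regN holds interleaved byte
 * pairs for taps (2n, 2n+1), so two dot-product accumulators cover taps
 * 0..3 and 4..7 before a saturating add merges them.  Written as a GNU
 * statement expression that yields the 16-bit result vector. */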
#define FILT_8TAP_DPADD_S_H(_reg0, _reg1, _reg2, _reg3,                      \
                            _filter0, _filter1, _filter2, _filter3)          \
( {                                                                          \
    __m128i _vec0, _vec1;                                                    \
                                                                             \
    _vec0 = __lsx_vdp2_h_b(_reg0, _filter0);                                 \
    _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1);                       \
    _vec1 = __lsx_vdp2_h_b(_reg2, _filter2);                                 \
    _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3);                       \
    _vec0 = __lsx_vsadd_h(_vec0, _vec1);                                     \
                                                                             \
    _vec0;                                                                   \
} )
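/* Full horizontal 8-tap pass for one output row: gather the four tap-pair
 * windows via the shuffle masks, filter, then round with
 * __lsx_vsrari_h(_out, 7) (the coefficient sets sum to 128) and clamp to
 * 8-bit range with __lsx_vsat_h(_out, 7). */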
#define HORIZ_8TAP_FILT(_src0, _src1, _mask0, _mask1, _mask2, _mask3,        \
                        _filt_h0, _filt_h1, _filt_h2, _filt_h3)              \
( {                                                                          \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3;                                      \
    __m128i _out;                                                            \
                                                                             \
    DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, _src1,\
              _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, _tmp3);\
    _out = FILT_8TAP_DPADD_S_H(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1,\
                               _filt_h2, _filt_h3);                          \
    _out = __lsx_vsrari_h(_out, 7);                                          \
    _out = __lsx_vsat_h(_out, 7);                                            \
                                                                             \
    _out;                                                                    \
} )
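/* Loads four rows starting at _src, bumping the pointer by _stride after
 * each load; on exit _src points one stride past the last row read. */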
#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3)                  \
{                                                                            \
    _src0 = __lsx_vld(_src, 0);                                              \
    _src += _stride;                                                         \
    _src1 = __lsx_vld(_src, 0);                                              \
    _src += _stride;                                                         \
    _src2 = __lsx_vld(_src, 0);                                              \
    _src += _stride;                                                         \
    _src3 = __lsx_vld(_src, 0);                                              \
    _src += _stride;                                                         \
}
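/* Horizontal 8-tap "put" helpers.  Shared pattern: point _src at src - 3 so
 * the window covers the three left taps, XOR the unsigned pixels with 128
 * to move them into the signed domain expected by the dot products, filter,
 * then narrow/round with __lsx_vssrarni_b_h(..., 7) and XOR with 128 again
 * to return to unsigned pixels. */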
                                 uint8_t *dst, int32_t dst_stride,
    __m128i mask0, mask1, mask2, mask3;
    __m128i out, out0, out1;

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    out = __lsx_vssrarni_b_h(out1, out0, 7);
    out = __lsx_vxori_b(out, 128);
    __lsx_vstelm_w(out, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(out, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(out, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(out, dst, 0, 3);
                                 uint8_t *dst, int32_t dst_stride,
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    uint8_t *_src = (uint8_t*)src - 3;

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
    __lsx_vstelm_w(out0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(out0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(out0, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(out0, dst, 0, 3);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(out1, dst, 0, 3);
                                uint8_t *dst, int32_t dst_stride,
                                 uint8_t *dst, int32_t dst_stride,
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

                               mask3, filter0, filter1, filter2, filter3,
                               out0, out1, out2, out3);
    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
    __lsx_vstelm_d(out0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(out0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_d(out1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(out1, dst, 0, 1);
                                    uint8_t *dst, int32_t dst_stride,
    uint32_t loop_cnt = height >> 2;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    uint8_t *_src = (uint8_t*)src - 3;

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    for (;loop_cnt--;) {
        src0 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
        src3 = __lsx_vldx(_src, src_stride3);
                                   mask3, filter0, filter1, filter2, filter3,
                                   out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;
    }
                                uint8_t *dst, int32_t dst_stride,
                                 uint8_t *dst, int32_t dst_stride,
    uint32_t loop_cnt = height >> 1;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    for (;loop_cnt--;) {
        const uint8_t *_src = src + src_stride;
                                   mask3, filter0, filter1, filter2, filter3,
                                   out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        dst += dst_stride;
        __lsx_vst(out1, dst, 0);
        dst += dst_stride;
    }
                                 uint8_t *dst, int32_t dst_stride,
    uint32_t loop_cnt = height >> 1;
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    for (;loop_cnt--;) {
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
                                   mask3, filter0, filter1, filter2, filter3,
                                   out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        __lsx_vst(out1, dst, 16);
        src += src_stride;
        dst += dst_stride;
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
                                   mask3, filter0, filter1, filter2, filter3,
                                   out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        __lsx_vst(out1, dst, 16);
        src += src_stride;
        dst += dst_stride;
    }
                                 uint8_t *dst, int32_t dst_stride,
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3;
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    for (;loop_cnt--;) {
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
                                   mask3, filter0, filter1, filter2, filter3,
                                   out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 0);
        __lsx_vst(out1, dst, 16);
        src3 = __lsx_vld(src, 56);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
                                   mask3, filter0, filter1, filter2, filter3,
                                   out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vst(out0, dst, 32);
        __lsx_vst(out1, dst, 48);
        src += src_stride;
        dst += dst_stride;
    }
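/* Vertical 8-tap "put" helpers.  Adjacent rows are interleaved bytewise
 * with __lsx_vilvl_b/__lsx_vilvh_b so each vector holds vertical pixel
 * pairs ready for the dot-product macros; seven rows of history are primed
 * before the loop and four new rows are consumed per iteration. */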
                                 uint8_t *dst, int32_t dst_stride,
    uint32_t loop_cnt = height >> 2;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i reg0, reg1, reg2, reg3, reg4;
    uint8_t *_src = (uint8_t*)src - src_stride3;

    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
    DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
    reg2 = __lsx_vilvl_d(tmp5, tmp2);
    DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
    reg2 = __lsx_vxori_b(reg2, 128);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
        DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;
    }
                                uint8_t *dst, int32_t dst_stride,
    uint32_t loop_cnt = height >> 2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i out0, out1, out2, out3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    uint8_t *_src = (uint8_t*)src - src_stride3;

    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              reg0, reg1, reg2, reg3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;
    }
                                 uint8_t *dst, int32_t dst_stride,
    uint32_t loop_cnt = height >> 2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i reg6, reg7, reg8, reg9, reg10, reg11;
    __m128i tmp0, tmp1, tmp2, tmp3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    uint8_t *_src = (uint8_t*)src - src_stride3;

    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              reg0, reg1, reg2, reg3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
    DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
              reg6, reg7, reg8, reg9);
    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
                  src0, src1, src2, src3);
        DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
                  src4, src5, src7, src8);
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vst(tmp0, dst, 0);
        dst += dst_stride;
        __lsx_vst(tmp1, dst, 0);
        dst += dst_stride;
    }
                                      uint8_t *dst, int32_t dst_stride,
    uint32_t cnt = width >> 4;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i reg6, reg7, reg8, reg9, reg10, reg11;
    __m128i tmp0, tmp1, tmp2, tmp3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;
    uint8_t *_src = (uint8_t*)src - src_stride3;

    uint32_t loop_cnt = height >> 2;

    src0 = __lsx_vld(src_tmp, 0);
    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
              src1, src2);
    src3 = __lsx_vldx(src_tmp, src_stride3);
    src_tmp += src_stride4;
    src4 = __lsx_vld(src_tmp, 0);
    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
              src5, src6);
    src_tmp += src_stride3;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              reg0, reg1, reg2, reg3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
    DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
              reg6, reg7, reg8, reg9);
    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                  src8, src9);
        src10 = __lsx_vldx(src_tmp, src_stride3);
        src_tmp += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10,
                  128, src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8,
                  src10, src9, src0, src1, src2, src3);
        DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8,
                  src10, src9, src4, src5, src7, src8);
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                  tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vst(tmp0, dst_tmp, 0);
        __lsx_vstx(tmp1, dst_tmp, dst_stride);
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                  tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lsx_vstx(tmp0, dst_tmp, dst_stride2);
        __lsx_vstx(tmp1, dst_tmp, dst_stride3);
        dst_tmp += dst_stride4;
    }
                                uint8_t *dst, int32_t dst_stride,

                                uint8_t *dst, int32_t dst_stride,
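/* Horizontal-then-vertical (hv) 8-tap helpers: each source row first goes
 * through HORIZ_8TAP_FILT, the 16-bit intermediates are packed back to
 * bytes with __lsx_vpackev_b / __lsx_vshuf_b, and FILT_8TAP_DPADD_S_H then
 * applies the vertical taps to that column history. */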
                                    uint8_t *dst, int32_t dst_stride,
                                    const int8_t *filter_horiz,
                                    const int8_t *filter_vert,
    uint32_t loop_cnt = height >> 2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    uint8_t *_src = (uint8_t*)src - src_stride3 - 3;

    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
              filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    tmp2 = __lsx_vpackev_b(tmp5, tmp4);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
        tmp4 = __lsx_vpackev_b(tmp3, tmp4);
        src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vshuf_b(src1, tmp3, shuff);
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;
    }
                                    uint8_t *dst, int32_t dst_stride,
                                    const int8_t *filter_horiz,
                                    const int8_t *filter_vert,
    uint32_t loop_cnt = height >> 2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    uint8_t *_src = (uint8_t*)src - src_stride3 - 3;

    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz,
              4, filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);

    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4,
              src2, src1, tmp0, tmp1, tmp2, tmp4);
    DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp3 = __lsx_vpackev_b(src7, src6);
        src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vpackev_b(src8, src7);
        src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src1 = __lsx_vpackev_b(src9, src8);
        src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
                                filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        src2 = __lsx_vpackev_b(src10, src9);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;
    }
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
    for (multiple8_cnt = 2; multiple8_cnt--;) {

                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
    for (multiple8_cnt = 4; multiple8_cnt--;) {

                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter_horiz,
                                     const int8_t *filter_vert,
    for (multiple8_cnt = 8; multiple8_cnt--;) {
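/* Unfiltered copy helpers for the full-pel "put" cases: straight vector
 * loads and stores, several rows per iteration using the precomputed
 * stride multiples. */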
                            uint8_t *dst, int32_t dst_stride,
    src0 = __lsx_vldrepl_d(src, 0);
    src += src_stride;
    src1 = __lsx_vldrepl_d(src, 0);
    src += src_stride;
    src2 = __lsx_vldrepl_d(src, 0);
    src += src_stride;
    src3 = __lsx_vldrepl_d(src, 0);
    src += src_stride;
    __lsx_vstelm_d(src0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(src1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(src2, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(src3, dst, 0, 0);
                             uint8_t *dst, int32_t dst_stride,
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;
    uint8_t *_src = (uint8_t*)src;

    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    __lsx_vst(src0, dst, 0);
    __lsx_vstx(src1, dst, dst_stride);
    __lsx_vstx(src2, dst, dst_stride2);
    __lsx_vstx(src3, dst, dst_stride3);
    dst += dst_stride4;
                             uint8_t *dst, int32_t dst_stride,
    uint8_t *src_tmp1 = (uint8_t*)src;
    uint8_t *dst_tmp1 = dst;
    uint8_t *src_tmp2 = src_tmp1 + 16;
    uint8_t *dst_tmp2 = dst_tmp1 + 16;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;

    src0 = __lsx_vld(src_tmp1, 0);
    DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2,
              src1, src2);
    src3 = __lsx_vldx(src_tmp1, src_stride3);
    src_tmp1 += src_stride4;
    src4 = __lsx_vld(src_tmp2, 0);
    DUP2_ARG2(__lsx_vldx, src_tmp2, src_stride, src_tmp2, src_stride2,
              src5, src6);
    src7 = __lsx_vldx(src_tmp2, src_stride3);
    src_tmp2 += src_stride4;
    __lsx_vst(src0, dst_tmp1, 0);
    __lsx_vstx(src1, dst_tmp1, dst_stride);
    __lsx_vstx(src2, dst_tmp1, dst_stride2);
    __lsx_vstx(src3, dst_tmp1, dst_stride3);
    dst_tmp1 += dst_stride4;
    __lsx_vst(src4, dst_tmp2, 0);
    __lsx_vstx(src5, dst_tmp2, dst_stride);
    __lsx_vstx(src6, dst_tmp2, dst_stride2);
    __lsx_vstx(src7, dst_tmp2, dst_stride3);
    dst_tmp2 += dst_stride4;
                             uint8_t *dst, int32_t dst_stride,
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
    __m128i src8, src9, src10, src11, src12, src13, src14, src15;

    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
              src0, src1, src2, src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
              src4, src5, src6, src7);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
              src8, src9, src10, src11);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
              src12, src13, src14, src15);
    src += src_stride;
    __lsx_vst(src0, dst, 0);
    __lsx_vst(src1, dst, 16);
    __lsx_vst(src2, dst, 32);
    __lsx_vst(src3, dst, 48);
    dst += dst_stride;
    __lsx_vst(src4, dst, 0);
    __lsx_vst(src5, dst, 16);
    __lsx_vst(src6, dst, 32);
    __lsx_vst(src7, dst, 48);
    dst += dst_stride;
    __lsx_vst(src8, dst, 0);
    __lsx_vst(src9, dst, 16);
    __lsx_vst(src10, dst, 32);
    __lsx_vst(src11, dst, 48);
    dst += dst_stride;
    __lsx_vst(src12, dst, 0);
    __lsx_vst(src13, dst, 16);
    __lsx_vst(src14, dst, 32);
    __lsx_vst(src15, dst, 48);
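/* Horizontal filter-and-average helpers: identical filtering to the put
 * versions, but the narrowed result is combined with the existing
 * destination pixels through __lsx_vavgr_bu (rounding unsigned average)
 * before the store. */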
                                               uint8_t *dst, int32_t dst_stride,
    uint8_t *dst_tmp = dst;
    __m128i mask0, mask1, mask2, mask3;
    __m128i dst0, dst1, dst2, dst3;

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    dst0 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    dst1 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    dst2 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    dst3 = __lsx_vldrepl_w(dst_tmp, 0);
    dst0 = __lsx_vilvl_w(dst1, dst0);
    dst1 = __lsx_vilvl_w(dst3, dst2);
    dst0 = __lsx_vilvl_d(dst1, dst0);
    tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
    tmp0 = __lsx_vxori_b(tmp0, 128);
    dst0 = __lsx_vavgr_bu(tmp0, dst0);
    __lsx_vstelm_w(dst0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 3);
                                               uint8_t *dst, int32_t dst_stride,
    uint8_t *dst_tmp = dst;
    __m128i mask0, mask1, mask2, mask3, tmp0, tmp1, tmp2, tmp3;

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp0 = __lsx_vilvl_w(tmp1, tmp0);
    tmp1 = __lsx_vilvl_w(tmp3, tmp2);
    dst0 = __lsx_vilvl_d(tmp1, tmp0);
    tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
    tmp0 = __lsx_vilvl_w(tmp1, tmp0);
    tmp1 = __lsx_vilvl_w(tmp3, tmp2);
    dst1 = __lsx_vilvl_d(tmp1, tmp0);
    DUP4_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp2, tmp2, 7,
              tmp3, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
    DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
    __lsx_vstelm_w(dst0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(dst0, dst, 0, 3);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 2);
    dst += dst_stride;
    __lsx_vstelm_w(dst1, dst, 0, 3);
                                              uint8_t *dst, int32_t dst_stride,
    } else if (height == 8) {
                                              uint8_t *dst, int32_t dst_stride,
    uint8_t *dst_tmp = dst;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i dst0, dst1, dst2, dst3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride2 + src_stride;
    int32_t src_stride4 = src_stride2 << 1;
    uint8_t *_src = (uint8_t*)src - 3;

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    for (;loop_cnt--;) {
        src0 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
        src3 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
                                   mask3, filter0, filter1, filter2, filter3,
                                   tmp0, tmp1, tmp2, tmp3);
        dst0 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        dst1 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        dst2 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        dst3 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
        __lsx_vstelm_d(dst0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(dst0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(dst1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(dst1, dst, 0, 1);
        dst += dst_stride;
    }
                                               uint8_t *dst, int32_t dst_stride,
    int32_t dst_stride2 = dst_stride << 1;
    uint8_t *dst_tmp = dst;
    __m128i mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    for (;loop_cnt--;) {
        dst0 = __lsx_vld(dst_tmp, 0);
        dst1 = __lsx_vldx(dst_tmp, dst_stride);
        dst_tmp += dst_stride2;
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2,
                  mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2,
                  mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2,
                  mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2,
                  mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15);
        DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3,
                  filter0, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2, tmp11,
                  filter2, tmp8, tmp9, tmp10, tmp11);
        DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2,
                  tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3, tmp10,
                  tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, tmp7);
        DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
                  tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst2, dst3);
        DUP2_ARG2(__lsx_vxori_b, dst2, 128, dst3, 128, dst2, dst3);
        DUP2_ARG2(__lsx_vavgr_bu, dst0, dst2, dst1, dst3, dst0, dst1);
        __lsx_vst(dst0, dst, 0);
        __lsx_vstx(dst1, dst, dst_stride);
        dst += dst_stride2;
    }
                                               uint8_t *dst, int32_t dst_stride,
    uint32_t loop_cnt = height;
    uint8_t *dst_tmp = dst;
    __m128i mask0, mask1, mask2, mask3, dst0, dst1;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    for (;loop_cnt--;) {
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        src += src_stride;
        DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst, 16, dst0, dst1);
        dst_tmp += dst_stride;
        DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
                  src0, src1, src2, src3);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2,
                  src2, mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2,
                  src2, mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2,
                  src2, mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11);
        DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2,
                  src2, mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15);
        DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0,
                  tmp3, filter0, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2,
                  tmp11, filter2, tmp8, tmp9, tmp10, tmp11);
        DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1,
                  tmp2, tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3);
        DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3,
                  tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6, tmp7);
        DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
                  tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        DUP2_ARG2(__lsx_vavgr_bu, dst0, tmp0, dst1, tmp1, dst0, dst1);
        __lsx_vst(dst0, dst, 0);
        __lsx_vst(dst1, dst, 16);
        dst += dst_stride;
    }
                                               uint8_t *dst, int32_t dst_stride,
    __m128i mask0, mask1, mask2, mask3;
    __m128i out0, out1, out2, out3, dst0, dst1;
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};

    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    for (;loop_cnt--;) {
        src3 = __lsx_vld(src, 24);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
                                   mask3, filter0, filter1, filter2, filter3,
                                   out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1);
        __lsx_vst(out0, dst, 0);
        __lsx_vst(out1, dst, 16);
        src3 = __lsx_vld(src, 56);
        src1 = __lsx_vshuf_b(src2, src0, shuff);
        DUP2_ARG2(__lsx_vld, dst, 32, dst, 48, dst0, dst1);
                                   mask3, filter0, filter1, filter2, filter3,
                                   out0, out1, out2, out3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1);
        __lsx_vst(out0, dst, 32);
        __lsx_vst(out1, dst, 48);
        src += src_stride;
        dst += dst_stride;
    }
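/* Vertical filter-and-average helpers: same interleaved-row scheme as the
 * plain vertical filters, with destination rows loaded (and, for narrow
 * blocks, packed together) so they can be averaged into the result. */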
                                               uint8_t *dst, int32_t dst_stride,
    uint32_t loop_cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i reg0, reg1, reg2, reg3, reg4;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    uint8_t *_src = (uint8_t*)src - src_stride3;

    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              tmp0, tmp1, tmp2, tmp3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
    DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
    reg2 = __lsx_vilvl_d(tmp5, tmp2);
    DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
    reg2 = __lsx_vxori_b(reg2, 128);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        src0 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src1 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src2 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src3 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
        DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        out0 = __lsx_vavgr_bu(out0, src0);
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;
    }
                                               uint8_t *dst, int32_t dst_stride,
    uint32_t loop_cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i tmp0, tmp1, tmp2, tmp3;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i out0, out1, out2, out3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    uint8_t *_src = (uint8_t*)src - src_stride3;

    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2,
              src1, reg0, reg1, reg2, reg3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        src0 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src1 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src2 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src3 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_d, src1, src0, src3, src2, src0, src1);
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10,
                  src9, tmp0, tmp1, tmp2, tmp3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        DUP2_ARG2(__lsx_vavgr_bu, out0, src0, out1, src1, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;
    }
    uint32_t cnt = width >> 4;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i reg0, reg1, reg2, reg3, reg4, reg5;
    __m128i reg6, reg7, reg8, reg9, reg10, reg11;
    __m128i tmp0, tmp1, tmp2, tmp3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;
    uint8_t *_src = (uint8_t*)src - src_stride3;

    uint32_t loop_cnt = height >> 2;
    uint8_t *dst_reg = dst;

    src0 = __lsx_vld(src_tmp, 0);
    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
              src1, src2);
    src3 = __lsx_vldx(src_tmp, src_stride3);
    src_tmp += src_stride4;
    src4 = __lsx_vld(src_tmp, 0);
    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
              src5, src6);
    src_tmp += src_stride3;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
              reg0, reg1, reg2, reg3);
    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
    DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
              reg6, reg7, reg8, reg9);
    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(src_tmp, 0);
        DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2,
                  src8, src9);
        src10 = __lsx_vldx(src_tmp, src_stride3);
        src_tmp += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10,
                  128, src7, src8, src9, src10);
        DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8,
                  src10, src9, src0, src1, src2, src3);
        DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8,
                  src10, src9, src4, src5, src7, src8);
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                  tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        tmp2 = __lsx_vld(dst_reg, 0);
        tmp3 = __lsx_vldx(dst_reg, dst_stride);
        DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
        __lsx_vst(tmp0, dst_reg, 0);
        __lsx_vstx(tmp1, dst_reg, dst_stride);
        DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7,
                  tmp0, tmp1);
        DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        tmp2 = __lsx_vldx(dst_reg, dst_stride2);
        tmp3 = __lsx_vldx(dst_reg, dst_stride3);
        DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
        __lsx_vstx(tmp0, dst_reg, dst_stride2);
        __lsx_vstx(tmp1, dst_reg, dst_stride3);
        dst_reg += dst_stride4;
    }
                                              uint8_t *dst, int32_t dst_stride,

                                              uint8_t *dst, int32_t dst_stride,

                                              uint8_t *dst, int32_t dst_stride,
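/* Combined horizontal+vertical filter with destination averaging: each row
 * first passes through HORIZ_8TAP_FILT, the intermediates are repacked with
 * __lsx_vpackev_b/__lsx_vshuf_b into a vertical history, and
 * FILT_8TAP_DPADD_S_H applies the vertical taps before the dst average. */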
                                                  const int8_t *filter_horiz,
                                                  const int8_t *filter_vert,
    uint32_t loop_cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    __m128i shuff = {0x0F0E0D0C0B0A0908, 0x1716151413121110};
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    uint8_t *_src = (uint8_t*)src - 3 - src_stride3;

    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz,
              4, filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    tmp0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    tmp5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
    tmp2 = __lsx_vpackev_b(tmp5, tmp4);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        src2 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src3 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src4 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        src5 = __lsx_vldrepl_w(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_w, src3, src2, src5, src4, src2, src3);
        src2 = __lsx_vilvl_d(src3, src2);
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        tmp3 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
        tmp4 = __lsx_vpackev_b(tmp3, tmp4);
        out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src1 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                               filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vshuf_b(src1, tmp3, shuff);
        src0 = __lsx_vpackev_b(src1, src0);
        out1 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        out0 = __lsx_vssrarni_b_h(out1, out0, 7);
        out0 = __lsx_vxori_b(out0, 128);
        out0 = __lsx_vavgr_bu(out0, src2);
        __lsx_vstelm_w(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 2);
        dst += dst_stride;
        __lsx_vstelm_w(out0, dst, 0, 3);
        dst += dst_stride;
    }
                                                  const int8_t *filter_horiz,
                                                  const int8_t *filter_vert,
    uint32_t loop_cnt = height >> 2;
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    __m128i mask0, mask1, mask2, mask3;
    __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    uint8_t *_src = (uint8_t*)src - 3 - src_stride3;

    DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz,
              4, filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
    mask3 = __lsx_vaddi_bu(mask0, 6);

    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    src4 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
    _src += src_stride3;
    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
              src0, src1, src2, src3);
    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
    src6 = __lsx_vxori_b(src6, 128);
    src0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);
    src6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                           filt_hz1, filt_hz2, filt_hz3);

    DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
              filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
    DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4,
              src2, src1, tmp0, tmp1, tmp2, tmp4);
    DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);

    for (;loop_cnt--;) {
        src7 = __lsx_vld(_src, 0);
        DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
        src10 = __lsx_vldx(_src, src_stride3);
        _src += src_stride4;
        DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
                  src7, src8, src9, src10);
        src7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        tmp3 = __lsx_vpackev_b(src7, src6);
        out0 = FILT_8TAP_DPADD_S_H(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src0 = __lsx_vpackev_b(src8, src7);
        out1 = FILT_8TAP_DPADD_S_H(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
                               filt_hz1, filt_hz2, filt_hz3);
        src1 = __lsx_vpackev_b(src9, src8);
        src3 = FILT_8TAP_DPADD_S_H(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        src10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
                                filt_hz1, filt_hz2, filt_hz3);
        src2 = __lsx_vpackev_b(src10, src9);
        src4 = FILT_8TAP_DPADD_S_H(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1);
        DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
        src5 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src7 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src8 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        src9 = __lsx_vldrepl_d(dst_tmp, 0);
        dst_tmp += dst_stride;
        DUP2_ARG2(__lsx_vilvl_d, src7, src5, src9, src8, src5, src7);
        DUP2_ARG2(__lsx_vavgr_bu, out0, src5, out1, src7, out0, out1);
        __lsx_vstelm_d(out0, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out0, dst, 0, 1);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 0);
        dst += dst_stride;
        __lsx_vstelm_d(out1, dst, 0, 1);
        dst += dst_stride;
    }
                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
    for (multiple8_cnt = 2; multiple8_cnt--;) {
        common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
                                              filter_horiz, filter_vert,
                                              height);

                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
    for (multiple8_cnt = 4; multiple8_cnt--;) {
        common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
                                              filter_horiz, filter_vert,
                                              height);

                                                   const int8_t *filter_horiz,
                                                   const int8_t *filter_vert,
    for (multiple8_cnt = 8; multiple8_cnt--;) {
        common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
                                              filter_horiz, filter_vert,
                                              height);
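/* Unfiltered averaging helpers for the full-pel "avg" cases: load source
 * and destination rows, combine them with __lsx_vavgr_bu and store back. */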
                          uint8_t *dst, int32_t dst_stride,
    uint8_t *dst_tmp = dst;
    __m128i tmp0, tmp1, tmp2, tmp3;

    tmp0 = __lsx_vldrepl_d(src, 0);
    src += src_stride;
    tmp1 = __lsx_vldrepl_d(src, 0);
    src += src_stride;
    tmp2 = __lsx_vldrepl_d(src, 0);
    src += src_stride;
    tmp3 = __lsx_vldrepl_d(src, 0);
    src += src_stride;
    DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, src0, src1);
    tmp0 = __lsx_vldrepl_d(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp1 = __lsx_vldrepl_d(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp2 = __lsx_vldrepl_d(dst_tmp, 0);
    dst_tmp += dst_stride;
    tmp3 = __lsx_vldrepl_d(dst_tmp, 0);
    dst_tmp += dst_stride;
    DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, dst0, dst1);
    DUP2_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, dst0, dst1);
    __lsx_vstelm_d(dst0, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(dst0, dst, 0, 1);
    dst += dst_stride;
    __lsx_vstelm_d(dst1, dst, 0, 0);
    dst += dst_stride;
    __lsx_vstelm_d(dst1, dst, 0, 1);
                           uint8_t *dst, int32_t dst_stride,
    __m128i dst0, dst1, dst2, dst3;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;
    uint8_t *_src = (uint8_t*)src;

    src0 = __lsx_vld(_src, 0);
    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
    src3 = __lsx_vldx(_src, src_stride3);
    _src += src_stride4;
    dst0 = __lsx_vld(dst, 0);
    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
    dst3 = __lsx_vldx(dst, dst_stride3);
    DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1,
              src2, dst2, src3, dst3, dst0, dst1, dst2, dst3);
    __lsx_vst(dst0, dst, 0);
    __lsx_vstx(dst1, dst, dst_stride);
    __lsx_vstx(dst2, dst, dst_stride2);
    __lsx_vstx(dst3, dst, dst_stride3);
    dst += dst_stride4;
                            uint8_t *dst, int32_t dst_stride,
    uint8_t *src_tmp1 = (uint8_t*)src;
    uint8_t *src_tmp2 = src_tmp1 + 16;
    uint8_t *dst_tmp1, *dst_tmp2;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    int32_t src_stride2 = src_stride << 1;
    int32_t src_stride3 = src_stride + src_stride2;
    int32_t src_stride4 = src_stride2 << 1;
    int32_t dst_stride2 = dst_stride << 1;
    int32_t dst_stride3 = dst_stride2 + dst_stride;
    int32_t dst_stride4 = dst_stride2 << 1;

    dst_tmp2 = dst + 16;
    src0 = __lsx_vld(src_tmp1, 0);
    DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2,
              src2, src4);
    src6 = __lsx_vldx(src_tmp1, src_stride3);
    src_tmp1 += src_stride4;
    src1 = __lsx_vld(src_tmp2, 0);
    DUP2_ARG2(__lsx_vldx, src_tmp2, src_stride, src_tmp2, src_stride2,
              src3, src5);
    src7 = __lsx_vldx(src_tmp2, src_stride3);
    src_tmp2 += src_stride4;
    dst0 = __lsx_vld(dst_tmp1, 0);
    DUP2_ARG2(__lsx_vldx, dst_tmp1, dst_stride, dst_tmp1, dst_stride2,
              dst2, dst4);
    dst6 = __lsx_vldx(dst_tmp1, dst_stride3);
    dst1 = __lsx_vld(dst_tmp2, 0);
    DUP2_ARG2(__lsx_vldx, dst_tmp2, dst_stride, dst_tmp2, dst_stride2,
              dst3, dst5);
    dst7 = __lsx_vldx(dst_tmp2, dst_stride3);
    DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1,
              src2, dst2, src3, dst3, dst0, dst1, dst2, dst3);
    DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5,
              src6, dst6, src7, dst7, dst4, dst5, dst6, dst7);
    __lsx_vst(dst0, dst_tmp1, 0);
    __lsx_vstx(dst2, dst_tmp1, dst_stride);
    __lsx_vstx(dst4, dst_tmp1, dst_stride2);
    __lsx_vstx(dst6, dst_tmp1, dst_stride3);
    dst_tmp1 += dst_stride4;
    __lsx_vst(dst1, dst_tmp2, 0);
    __lsx_vstx(dst3, dst_tmp2, dst_stride);
    __lsx_vstx(dst5, dst_tmp2, dst_stride2);
    __lsx_vstx(dst7, dst_tmp2, dst_stride3);
    dst_tmp2 += dst_stride4;
                            uint8_t *dst, int32_t dst_stride,
    uint8_t *dst_tmp = dst;
    __m128i src0, src1, src2, src3, src4, src5, src6, src7;
    __m128i src8, src9, src10, src11, src12, src13, src14, src15;
    __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;

    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
              src0, src1, src2, src3);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
              src4, src5, src6, src7);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
              src8, src9, src10, src11);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
              src12, src13, src14, src15);
    src += src_stride;
    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
              dst0, dst1, dst2, dst3);
    dst_tmp += dst_stride;
    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
              dst4, dst5, dst6, dst7);
    dst_tmp += dst_stride;
    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
              dst8, dst9, dst10, dst11);
    dst_tmp += dst_stride;
    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
              dst12, dst13, dst14, dst15);
    dst_tmp += dst_stride;
    DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1,
              src2, dst2, src3, dst3, dst0, dst1, dst2, dst3);
    DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5,
              src6, dst6, src7, dst7, dst4, dst5, dst6, dst7);
    DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10,
              dst10, src11, dst11, dst8, dst9, dst10, dst11);
    DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14,
              dst14, src15, dst15, dst12, dst13, dst14, dst15);
    __lsx_vst(dst0, dst, 0);
    __lsx_vst(dst1, dst, 16);
    __lsx_vst(dst2, dst, 32);
    __lsx_vst(dst3, dst, 48);
    dst += dst_stride;
    __lsx_vst(dst4, dst, 0);
    __lsx_vst(dst5, dst, 16);
    __lsx_vst(dst6, dst, 32);
    __lsx_vst(dst7, dst, 48);
    dst += dst_stride;
    __lsx_vst(dst8, dst, 0);
    __lsx_vst(dst9, dst, 16);
    __lsx_vst(dst10, dst, 32);
    __lsx_vst(dst11, dst, 48);
    dst += dst_stride;
    __lsx_vst(dst12, dst, 0);
    __lsx_vst(dst13, dst, 16);
    __lsx_vst(dst14, dst, 32);
    __lsx_vst(dst15, dst, 48);
    dst += dst_stride;
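/* VP9 sub-pixel filter coefficients, one 8-tap set per fractional position
 * (position 0 is full-pel and filter-free, hence 15 rows per bank).  The
 * three banks correspond to the codec's regular, sharp and smooth filter
 * types and are indexed as vp9_subpel_filters_lsx[type_idx][mx - 1] by the
 * wrapper macros below. */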
    {0, 1, -5, 126, 8, -3, 1, 0},
    {-1, 3, -10, 122, 18, -6, 2, 0},
    {-1, 4, -13, 118, 27, -9, 3, -1},
    {-1, 4, -16, 112, 37, -11, 4, -1},
    {-1, 5, -18, 105, 48, -14, 4, -1},
    {-1, 5, -19, 97, 58, -16, 5, -1},
    {-1, 6, -19, 88, 68, -18, 5, -1},
    {-1, 6, -19, 78, 78, -19, 6, -1},
    {-1, 5, -18, 68, 88, -19, 6, -1},
    {-1, 5, -16, 58, 97, -19, 5, -1},
    {-1, 4, -14, 48, 105, -18, 5, -1},
    {-1, 4, -11, 37, 112, -16, 4, -1},
    {-1, 3, -9, 27, 118, -13, 4, -1},
    {0, 2, -6, 18, 122, -10, 3, -1},
    {0, 1, -3, 8, 126, -5, 1, 0},

    {-1, 3, -7, 127, 8, -3, 1, 0},
    {-2, 5, -13, 125, 17, -6, 3, -1},
    {-3, 7, -17, 121, 27, -10, 5, -2},
    {-4, 9, -20, 115, 37, -13, 6, -2},
    {-4, 10, -23, 108, 48, -16, 8, -3},
    {-4, 10, -24, 100, 59, -19, 9, -3},
    {-4, 11, -24, 90, 70, -21, 10, -4},
    {-4, 11, -23, 80, 80, -23, 11, -4},
    {-4, 10, -21, 70, 90, -24, 11, -4},
    {-3, 9, -19, 59, 100, -24, 10, -4},
    {-3, 8, -16, 48, 108, -23, 10, -4},
    {-2, 6, -13, 37, 115, -20, 9, -4},
    {-2, 5, -10, 27, 121, -17, 7, -3},
    {-1, 3, -6, 17, 125, -13, 5, -2},
    {0, 1, -3, 8, 127, -7, 3, -1},

    {-3, -1, 32, 64, 38, 1, -3, 0},
    {-2, -2, 29, 63, 41, 2, -3, 0},
    {-2, -2, 26, 63, 43, 4, -4, 0},
    {-2, -3, 24, 62, 46, 5, -4, 0},
    {-2, -3, 21, 60, 49, 7, -4, 0},
    {-1, -4, 18, 59, 51, 9, -4, 0},
    {-1, -4, 16, 57, 53, 12, -4, -1},
    {-1, -4, 14, 55, 55, 14, -4, -1},
    {-1, -4, 12, 53, 57, 16, -4, -1},
    {0, -4, 9, 51, 59, 18, -4, -1},
    {0, -4, 7, 49, 60, 21, -3, -2},
    {0, -4, 5, 46, 62, 24, -3, -2},
    {0, -4, 4, 43, 63, 26, -2, -2},
    {0, -3, 2, 41, 63, 29, -2, -2},
    {0, -3, 1, 38, 64, 32, -1, -3},
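/* Generator macros for the dsp entry points: they adapt the ptrdiff_t
 * strides and fractional mx/my arguments of the ff_{put,avg}_8tap_* and
 * ff_{copy,avg}* signatures to the int32_t-stride helpers above, picking
 * the filter row from vp9_subpel_filters_lsx.  One instantiation is
 * expected per block size. */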
#define VP9_8TAP_LOONGARCH_LSX_FUNC(SIZE, type, type_idx)                    \
void ff_put_8tap_##type##_##SIZE##h_lsx(uint8_t *dst, ptrdiff_t dststride,   \
                                        const uint8_t *src,                  \
                                        ptrdiff_t srcstride,                 \
                                        int h, int mx, int my)               \
{                                                                            \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][mx-1];           \
                                                                             \
    common_hz_8t_##SIZE##w_lsx(src, srcstride, dst, dststride, filter, h);   \
}                                                                            \
                                                                             \
void ff_put_8tap_##type##_##SIZE##v_lsx(uint8_t *dst, ptrdiff_t dststride,   \
                                        const uint8_t *src,                  \
                                        ptrdiff_t srcstride,                 \
                                        int h, int mx, int my)               \
{                                                                            \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][my-1];           \
                                                                             \
    common_vt_8t_##SIZE##w_lsx(src, srcstride, dst, dststride, filter, h);   \
}                                                                            \
                                                                             \
void ff_put_8tap_##type##_##SIZE##hv_lsx(uint8_t *dst, ptrdiff_t dststride,  \
                                         const uint8_t *src,                 \
                                         ptrdiff_t srcstride,                \
                                         int h, int mx, int my)              \
{                                                                            \
    const int8_t *hfilter = vp9_subpel_filters_lsx[type_idx][mx-1];          \
    const int8_t *vfilter = vp9_subpel_filters_lsx[type_idx][my-1];          \
                                                                             \
    common_hv_8ht_8vt_##SIZE##w_lsx(src, srcstride, dst, dststride, hfilter, \
                                    vfilter, h);                             \
}                                                                            \
                                                                             \
void ff_avg_8tap_##type##_##SIZE##h_lsx(uint8_t *dst, ptrdiff_t dststride,   \
                                        const uint8_t *src,                  \
                                        ptrdiff_t srcstride,                 \
                                        int h, int mx, int my)               \
{                                                                            \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][mx-1];           \
                                                                             \
    common_hz_8t_and_aver_dst_##SIZE##w_lsx(src, srcstride, dst,             \
                                            dststride, filter, h);           \
}                                                                            \
                                                                             \
void ff_avg_8tap_##type##_##SIZE##v_lsx(uint8_t *dst, ptrdiff_t dststride,   \
                                        const uint8_t *src,                  \
                                        ptrdiff_t srcstride,                 \
                                        int h, int mx, int my)               \
{                                                                            \
    const int8_t *filter = vp9_subpel_filters_lsx[type_idx][my-1];           \
                                                                             \
    common_vt_8t_and_aver_dst_##SIZE##w_lsx(src, srcstride, dst, dststride,  \
                                            filter, h);                      \
}                                                                            \
                                                                             \
void ff_avg_8tap_##type##_##SIZE##hv_lsx(uint8_t *dst, ptrdiff_t dststride,  \
                                         const uint8_t *src,                 \
                                         ptrdiff_t srcstride,                \
                                         int h, int mx, int my)              \
{                                                                            \
    const int8_t *hfilter = vp9_subpel_filters_lsx[type_idx][mx-1];          \
    const int8_t *vfilter = vp9_subpel_filters_lsx[type_idx][my-1];          \
                                                                             \
    common_hv_8ht_8vt_and_aver_dst_##SIZE##w_lsx(src, srcstride, dst,        \
                                                 dststride, hfilter,         \
                                                 vfilter, h);                \
}

#define VP9_COPY_LOONGARCH_LSX_FUNC(SIZE)                                    \
void ff_copy##SIZE##_lsx(uint8_t *dst, ptrdiff_t dststride,                  \
                         const uint8_t *src, ptrdiff_t srcstride,            \
                         int h, int mx, int my)                              \
{                                                                            \
    copy_width##SIZE##_lsx(src, srcstride, dst, dststride, h);               \
}                                                                            \
                                                                             \
void ff_avg##SIZE##_lsx(uint8_t *dst, ptrdiff_t dststride,                   \
                        const uint8_t *src, ptrdiff_t srcstride,             \
                        int h, int mx, int my)                               \
{                                                                            \
    avg_width##SIZE##_lsx(src, srcstride, dst, dststride, h);                \
}
#undef VP9_8TAP_LOONGARCH_LSX_FUNC
#undef VP9_COPY_LOONGARCH_LSX_FUNC