/*
 * Bi-prediction rounding helper for two vector pairs. Invokes, in order,
 * ADDS_SH2_SH on (vec0, in0) and (vec1, in1) into out0/out1, SRARI_H2_SH
 * on out0/out1 with rnd_val, and CLIP_SH2_0_255 on out0/out1.
 * NOTE(review): the helper macros are defined outside this listing;
 * presumably saturating add, rounding arithmetic right shift and clip to
 * [0, 255] -- confirm against their definitions.
 */
25 #define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1) \
27 ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \
28 SRARI_H2_SH(out0, out1, rnd_val); \
29 CLIP_SH2_0_255(out0, out1); \
/*
 * Four-vector variant: applies HEVC_BI_RND_CLIP2 twice with the same
 * rnd_val, first to (in0, in1, vec0, vec1) -> (out0, out1) and then to
 * (in2, in3, vec2, vec3) -> (out2, out3).
 */
32 #define HEVC_BI_RND_CLIP4(in0, in1, in2, in3, \
33 vec0, vec1, vec2, vec3, rnd_val, \
34 out0, out1, out2, out3) \
36 HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1); \
37 HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3); \
/*
 * Body fragment of a bi-prediction copy routine; the function header and
 * several statements are absent from this listing (the embedded source
 * numbers jump). Presumably the 4-pixel-wide variant, since the visible
 * stores are ST4x4_UB / ST4x8_UB -- confirm against the full file.
 * Three branches on height are visible: a first one, 4 == height, and
 * 0 == height % 8.
 */
54 LD_SB2(src0_ptr, src_stride, src0, src1);
55 LD_SH2(src1_ptr, src2_stride, in0, in1);
/* Merge the low words of the two source rows and the low doublewords of
 * the two 16-bit rows so both rows share one vector. */
57 src0 = (v16i8) __msa_ilvr_w((v4i32)
src1, (v4i32) src0);
58 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
/* Interleaving with zero widens the source bytes to halfwords. */
60 dst0 = (v8i16) __msa_ilvr_b(zero, src0);
/* Rounding arithmetic right shift by 7, then keep the even bytes. */
63 dst0 = __msa_srari_h(dst0, 7);
66 dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
68 }
else if (4 == height) {
71 v8i16 in0, in1, in2, in3;
73 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
74 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
75 ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
77 ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
83 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
84 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
85 }
else if (0 == height % 8) {
87 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
88 v8i16 dst0, dst1, dst2, dst3;
89 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
/* One iteration per group of 8 rows; all three pointers advance by
 * 8 strides per iteration. */
91 for (loop_cnt = (height >> 3); loop_cnt--;) {
92 LD_SB8(src0_ptr, src_stride,
93 src0, src1, src2, src3, src4, src5, src6, src7);
94 src0_ptr += (8 * src_stride);
96 LD_SH8(src1_ptr, src2_stride,
97 in0, in1, in2, in3, in4, in5, in6, in7);
98 src1_ptr += (8 * src2_stride);
103 ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
104 src0, src1, src2, src3);
105 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
106 dst0, dst1, dst2, dst3);
108 SLLI_4V(dst0, dst1, dst2, dst3, 6);
/* The two lines below are the argument tail of a call whose first line
 * is missing from this listing. */
110 dst0, dst1, dst2, dst3, 7,
111 dst0, dst1, dst2, dst3);
114 ST4x8_UB(dst0, dst1, dst, dst_stride);
115 dst += (8 * dst_stride);
/*
 * Body fragment of a bi-prediction copy routine; header and some
 * statements are absent from this listing. Presumably the 6-pixel-wide
 * variant (stores are ST6x4_UB) -- confirm against the full file.
 * Each loop iteration loads 8 rows and stores them as two groups of 4.
 */
130 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
131 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
132 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
134 for (loop_cnt = (height >> 3); loop_cnt--;) {
135 LD_SB8(src0_ptr, src_stride,
136 src0, src1, src2, src3, src4, src5, src6, src7);
137 src0_ptr += (8 * src_stride);
138 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
139 src1_ptr += (8 * src2_stride);
/* Interleave each source row with zero into halfword vectors. */
140 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
141 dst0, dst1, dst2, dst3);
142 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
143 dst4, dst5, dst6, dst7);
145 SLLI_4V(dst0, dst1, dst2, dst3, 6);
146 SLLI_4V(dst4, dst5, dst6, dst7, 6);
/* Argument tail of a call whose first line is missing (rows 0-3). */
148 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
151 ST6x4_UB(dst0, dst1, dst, dst_stride);
152 dst += (4 * dst_stride);
/* Argument tail of a call whose first line is missing (rows 4-7). */
155 dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
158 ST6x4_UB(dst4, dst5, dst, dst_stride);
159 dst += (4 * dst_stride);
/*
 * Body fragment of a bi-prediction copy routine; header and some
 * statements are absent from this listing. Presumably the 8-pixel-wide
 * variant (stores are ST8x4_UB) -- confirm against the full file.
 * Visible branches: a first one loading 2 rows, 4 == height,
 * 6 == height, and 0 == height % 8.
 */
178 LD_SB2(src0_ptr, src_stride, src0, src1);
179 LD_SH2(src1_ptr, src2_stride, in0, in1);
180 ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
186 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
188 }
else if (4 == height) {
190 v8i16 in0, in1, in2, in3;
191 v8i16 dst0, dst1, dst2, dst3;
193 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
194 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
195 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
196 dst0, dst1, dst2, dst3);
198 SLLI_4V(dst0, dst1, dst2, dst3, 6);
200 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
203 ST8x4_UB(dst0, dst1, dst, dst_stride);
204 }
else if (6 == height) {
205 v16i8
src0,
src1, src2, src3, src4, src5;
206 v8i16 in0, in1, in2, in3, in4, in5;
207 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
209 LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
210 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
211 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
212 dst0, dst1, dst2, dst3);
213 ILVR_B2_SH(zero, src4, zero, src5, dst4, dst5);
215 SLLI_4V(dst0, dst1, dst2, dst3, 6);
219 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
/* Rows 4 and 5 are packed into dst2; the first four rows are stored
 * from dst0/dst1 before dst advances by 4 rows. */
223 dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
224 ST8x4_UB(dst0, dst1, dst, dst_stride);
225 dst += (4 * dst_stride);
227 }
else if (0 == height % 8) {
229 v8i16 in0, in1, in2, in3;
230 v8i16 dst0, dst1, dst2, dst3;
/* The loop runs height >> 3 times; each iteration performs two
 * identical 4-row passes (load, widen, shift, store). */
233 for (loop_cnt = (height >> 3); loop_cnt--;) {
234 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
235 src0_ptr += (4 * src_stride);
236 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
237 src1_ptr += (4 * src2_stride);
238 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
239 dst0, dst1, dst2, dst3);
241 SLLI_4V(dst0, dst1, dst2, dst3, 6);
243 dst0, dst1, dst2, dst3, 7,
244 dst0, dst1, dst2, dst3);
247 ST8x4_UB(dst0, dst1, dst, dst_stride);
248 dst += (4 * dst_stride);
/* Second 4-row pass of the same iteration. */
250 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
251 src0_ptr += (4 * src_stride);
252 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
253 src1_ptr += (4 * src2_stride);
254 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
255 dst0, dst1, dst2, dst3);
257 SLLI_4V(dst0, dst1, dst2, dst3, 6);
259 dst0, dst1, dst2, dst3, 7,
260 dst0, dst1, dst2, dst3);
263 ST8x4_UB(dst0, dst1, dst, dst_stride);
264 dst += (4 * dst_stride);
/*
 * Body fragment of a bi-prediction copy routine; header and some
 * statements are absent from this listing. Presumably the 12-pixel-wide
 * variant (store is ST12x4_UB) -- confirm against the full file.
 */
279 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
280 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
/* The trip count is the constant 16 >> 2 (four iterations of 4 rows);
 * no height value is consulted in the visible lines. */
283 for (loop_cnt = (16 >> 2); loop_cnt--;) {
284 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
285 src0_ptr += (4 * src_stride);
/* in0..in3 come from src1_ptr, in4..in7 from 8 int16 elements further. */
287 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
288 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
289 src1_ptr += (4 * src2_stride);
291 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
292 dst0, dst1, dst2, dst3);
294 SLLI_4V(dst0, dst1, dst2, dst3, 6);
/* The left (upper) words of row pairs are combined and then widened
 * into dst4/dst5. */
295 ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
296 ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
300 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
304 dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
305 ST12x4_UB(dst0, dst1, dst2, dst, dst_stride);
306 dst += (4 * dst_stride);
/*
 * Body fragment of a bi-prediction copy routine that walks the block in
 * 16-column strips (outer loop, width >> 4) and 4-row groups (inner loop,
 * height >> 2). Header and some statements, including the per-strip
 * pointer advances and the dst_tmp setup, are absent from this listing.
 */
322 int16_t *src1_ptr_tmp;
326 for (cnt = (width >> 4); cnt--;) {
327 src0_ptr_tmp = src0_ptr;
328 src1_ptr_tmp = src1_ptr;
331 for (loop_cnt = (height >> 2); loop_cnt--;) {
333 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
334 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
335 v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
337 LD_SB4(src0_ptr_tmp, src_stride, src0, src1, src2, src3);
338 src0_ptr_tmp += (4 * src_stride);
/* Two loads per row of 16-bit data: elements 0-7 and elements 8-15. */
339 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
340 LD_SH4(src1_ptr_tmp + 8, src2_stride, in4, in5, in6, in7);
341 src1_ptr_tmp += (4 * src2_stride);
/* Right and left halves of each 16-byte row are widened separately. */
343 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
344 dst0_r, dst1_r, dst2_r, dst3_r);
345 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
346 dst0_l, dst1_l, dst2_l, dst3_l);
348 SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
349 SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
/* Argument tail of a call whose first line is missing (rows 0-1). */
351 dst0_r, dst1_r, dst0_l, dst1_l, 7,
352 dst0_r, dst1_r, dst0_l, dst1_l);
354 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
355 ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
356 dst_tmp += (2 * dst_stride);
/* Argument tail of a call whose first line is missing (rows 2-3). */
359 dst2_r, dst3_r, dst2_l, dst3_l, 7,
360 dst2_r, dst3_r, dst2_l, dst3_l);
362 PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
363 ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
364 dst_tmp += (2 * dst_stride);
382 dst, dst_stride, height, 16);
394 dst, dst_stride, height, 16);
397 dst + 16, dst_stride, height);
409 dst, dst_stride, height, 32);
421 dst, dst_stride, height, 48);
433 dst, dst_stride, height, 64);
/*
 * Body fragment of a filtering routine using four filter vectors
 * (filt0..filt3) and four shuffle masks; header and some statements
 * (including the setup of mask1..mask3 and the initialisation of
 * dst0..dst3) are absent from this listing. Presumably the horizontal
 * 8-tap, 4-pixel-wide bi-prediction variant -- confirm against the file.
 */
446 v8i16 filt0, filt1, filt2, filt3;
447 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
448 v16i8 mask1, mask2, mask3;
449 v16i8 vec0, vec1, vec2, vec3;
450 v8i16 dst0, dst1, dst2, dst3;
451 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
452 v8i16 filter_vec, const_vec;
/* Indices 16 and above appear in the upper half of this mask;
 * presumably they select bytes from the second shuffle operand so two
 * rows are filtered per vector -- TODO confirm. */
453 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
458 filter_vec =
LD_SH(filter);
459 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* Every halfword of const_vec is set to the immediate 128. */
465 const_vec = __msa_ldi_h(128);
/* 8 rows per iteration; rows are filtered in pairs (src0/src1, ...). */
468 for (loop_cnt = (height >> 3); loop_cnt--;) {
469 LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3,
470 src4, src5, src6, src7);
471 src0_ptr += (8 * src_stride);
472 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
473 src1_ptr += (8 * src2_stride);
479 VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
480 vec0, vec1, vec2, vec3);
482 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
483 dst0, dst0, dst0, dst0);
484 VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
485 vec0, vec1, vec2, vec3);
487 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
488 dst1, dst1, dst1, dst1);
489 VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
490 vec0, vec1, vec2, vec3);
492 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
493 dst2, dst2, dst2, dst2);
494 VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
495 vec0, vec1, vec2, vec3);
497 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
498 dst3, dst3, dst3, dst3);
/* Argument tail of a call whose first line is missing. */
501 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
504 ST4x8_UB(dst0, dst1, dst, dst_stride);
505 dst += (8 * dst_stride);
/*
 * Body fragment of a filtering routine; header and some statements
 * (mask1..mask3 setup, accumulator initialisation, source declarations)
 * are absent from this listing. Presumably the horizontal 8-tap,
 * 8-pixel-wide bi-prediction variant (store is ST8x4_UB) -- confirm.
 * Each loop iteration filters 4 rows, one row per accumulator.
 */
519 v8i16 filt0, filt1, filt2, filt3;
521 v16i8 mask1, mask2, mask3;
522 v16i8 vec0, vec1, vec2, vec3;
523 v8i16 dst0, dst1, dst2, dst3;
524 v8i16 in0, in1, in2, in3;
525 v8i16 filter_vec, const_vec;
526 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
/* Every halfword of const_vec is set to the immediate 128. */
530 const_vec = __msa_ldi_h(128);
533 filter_vec =
LD_SH(filter);
534 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
540 for (loop_cnt = (height >> 2); loop_cnt--;) {
541 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
542 src0_ptr += (4 * src_stride);
543 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
544 src1_ptr += (4 * src2_stride);
/* Both shuffle operands are the same row, so every mask index refers
 * to that single row. */
547 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
548 vec0, vec1, vec2, vec3);
550 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
551 dst0, dst0, dst0, dst0);
552 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
553 vec0, vec1, vec2, vec3);
555 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
556 dst1, dst1, dst1, dst1);
557 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
558 vec0, vec1, vec2, vec3);
560 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
561 dst2, dst2, dst2, dst2);
562 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
563 vec0, vec1, vec2, vec3);
565 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
566 dst3, dst3, dst3, dst3);
/* Argument tail of a call whose first line is missing. */
569 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
572 ST8x4_UB(dst0, dst1, dst, dst_stride);
573 dst += (4 * dst_stride);
587 dst, dst_stride, filter, height);
589 dst + 8, dst_stride, filter, height);
/*
 * Body fragment of a filtering routine; header and some statements are
 * absent from this listing. Presumably the horizontal 8-tap,
 * 16-pixel-wide bi-prediction variant -- confirm against the full file.
 * Each loop iteration handles 2 rows; every row is read as two vectors
 * 8 bytes apart (and two 16-bit vectors 8 elements apart).
 */
603 v8i16 filt0, filt1, filt2, filt3;
604 v16i8 mask1, mask2, mask3;
605 v16i8 vec0, vec1, vec2, vec3;
606 v8i16 dst0, dst1, dst2, dst3;
607 v8i16 in0, in1, in2, in3;
608 v8i16 filter_vec, const_vec;
609 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
612 const_vec = __msa_ldi_h(128);
615 filter_vec =
LD_SH(filter);
616 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
622 for (loop_cnt = (height >> 1); loop_cnt--;) {
/* The second argument here is a fixed 8 rather than the row stride:
 * src0/src1 (and src2/src3) belong to the same row. */
623 LD_SB2(src0_ptr, 8, src0, src1);
624 src0_ptr += src_stride;
625 LD_SB2(src0_ptr, 8, src2, src3);
626 src0_ptr += src_stride;
627 LD_SH2(src1_ptr, 8, in0, in1);
628 src1_ptr += src2_stride;
629 LD_SH2(src1_ptr, 8, in2, in3);
630 src1_ptr += src2_stride;
633 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
634 vec0, vec1, vec2, vec3);
636 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
637 dst0, dst0, dst0, dst0);
638 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
639 vec0, vec1, vec2, vec3);
641 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
642 dst1, dst1, dst1, dst1);
643 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
644 vec0, vec1, vec2, vec3);
646 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
647 dst2, dst2, dst2, dst2);
648 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
649 vec0, vec1, vec2, vec3);
651 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
652 dst3, dst3, dst3, dst3);
/* Argument tail of a call whose first line is missing. */
655 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
658 ST_SH2(dst0, dst1, dst, dst_stride);
659 dst += (2 * dst_stride);
/*
 * Body fragment of a filtering routine; header and some statements
 * (mask1..mask7 setup, accumulator initialisation, several declarations)
 * are absent from this listing. Presumably the horizontal 8-tap,
 * 24-pixel-wide bi-prediction variant -- confirm against the full file.
 * One row is processed per loop iteration.
 */
675 v8i16 filt0, filt1, filt2, filt3;
676 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
677 v16i8 vec0, vec1, vec2, vec3;
678 v8i16 dst0, dst1, dst2;
680 v8i16 filter_vec, const_vec;
681 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
/* Start reading 3 bytes before the given source position. */
683 src0_ptr = src0_ptr - 3;
684 const_vec = __msa_ldi_h(128);
687 filter_vec =
LD_SH(filter);
688 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
698 for (loop_cnt = height; loop_cnt--;) {
/* Source vectors at byte offsets 0 and 16; 16-bit vectors at element
 * offsets 0, 8 and 16 of the same row. */
699 LD_SB2(src0_ptr, 16, src0, src1);
700 src0_ptr += src_stride;
701 LD_SH2(src1_ptr, 8, in0, in1);
702 in2 =
LD_SH(src1_ptr + 16);
703 src1_ptr += src2_stride;
706 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
707 vec0, vec1, vec2, vec3);
709 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
710 dst0, dst0, dst0, dst0);
/* The middle group shuffles across both source vectors using
 * mask4..mask7. */
711 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
712 vec0, vec1, vec2, vec3);
714 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
715 dst1, dst1, dst1, dst1);
716 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
717 vec0, vec1, vec2, vec3);
719 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
720 dst2, dst2, dst2, dst2);
/* Third group: saturating add of in2, then rounding shift right by 7. */
723 dst2 = __msa_adds_s_h(dst2, in2);
724 dst2 = __msa_srari_h(dst2, 7);
/* Doubleword 0 of tmp1 is written at dst + 16. */
728 dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
730 SD(dst_val0, dst + 16);
/*
 * Body fragment of a filtering routine; header and some statements are
 * absent from this listing. Presumably the horizontal 8-tap,
 * 32-pixel-wide bi-prediction variant -- confirm against the full file.
 * One row per loop iteration, filtered as four groups (dst0..dst3).
 */
746 v8i16 filt0, filt1, filt2, filt3;
747 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
748 v16i8 vec0, vec1, vec2, vec3;
749 v8i16 dst0, dst1, dst2, dst3;
750 v8i16 in0, in1, in2, in3;
751 v8i16 filter_vec, const_vec;
752 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
755 const_vec = __msa_ldi_h(128);
758 filter_vec =
LD_SH(filter);
759 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
769 for (loop_cnt = height; loop_cnt--;) {
/* Source vectors at byte offsets 0, 16 and 24 of the current row. */
770 LD_SB2(src0_ptr, 16, src0, src1);
771 src2 =
LD_SB(src0_ptr + 24);
772 src0_ptr += src_stride;
773 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
774 src1_ptr += src2_stride;
777 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
778 vec0, vec1, vec2, vec3);
780 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
781 dst0, dst0, dst0, dst0);
/* The second group spans src0 and src1 via mask4..mask7. */
782 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
783 vec0, vec1, vec2, vec3);
785 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
786 dst1, dst1, dst1, dst1);
787 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
788 vec0, vec1, vec2, vec3);
790 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
791 dst2, dst2, dst2, dst2);
792 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
793 vec0, vec1, vec2, vec3);
795 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
796 dst3, dst3, dst3, dst3);
/* Argument tail of a call whose first line is missing. */
799 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
/* Two result vectors are stored 16 bytes apart on the same row. */
802 ST_SB2(tmp0, tmp1, dst, 16);
/*
 * Body fragment of a filtering routine; header and some statements are
 * absent from this listing. Presumably the horizontal 8-tap,
 * 48-pixel-wide bi-prediction variant -- confirm against the full file.
 * One row per loop iteration, produced as three 16-byte results
 * (tmp0, tmp1, tmp2); the stores of tmp1/tmp2 go to dst + 16 and
 * dst + 32.
 */
818 v16i8 tmp0, tmp1, tmp2;
819 v8i16 filt0, filt1, filt2, filt3;
820 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
821 v16i8 vec0, vec1, vec2, vec3;
822 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
823 v8i16 in0, in1, in2, in3, in4, in5;
824 v8i16 filter_vec, const_vec;
825 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
829 const_vec = __msa_ldi_h(128);
832 filter_vec =
LD_SH(filter);
833 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
843 for (loop_cnt = height; loop_cnt--;) {
/* First third of the row: source bytes from offsets 0 and 16. */
844 LD_SB2(src0_ptr, 16, src0, src1);
846 LD_SH2(src1_ptr, 8, in0, in1);
848 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
849 vec0, vec1, vec2, vec3);
851 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
852 dst0, dst0, dst0, dst0);
853 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
854 vec0, vec1, vec2, vec3);
856 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
857 dst1, dst1, dst1, dst1);
861 tmp0 = __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
/* Second third: src2/src3 are read from byte offsets 32 and 40 before
 * the source pointer moves to the next row. */
864 LD_SB2(src0_ptr + 32, 8, src2, src3);
866 src0_ptr += src_stride;
868 LD_SH2(src1_ptr + 16, 8, in2, in3);
870 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
871 vec0, vec1, vec2, vec3);
873 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
874 dst2, dst2, dst2, dst2);
875 VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
876 vec0, vec1, vec2, vec3);
878 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
879 dst3, dst3, dst3, dst3);
883 tmp1 = __msa_pckev_b((v16i8) dst3, (v16i8) dst2);
884 ST_SB(tmp1, dst + 16);
/* Last third: 16-bit inputs from element offsets 32 and 40. */
886 LD_SH2(src1_ptr + 32, 8, in4, in5);
887 src1_ptr += src2_stride;
889 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
890 vec0, vec1, vec2, vec3);
892 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
893 dst4, dst4, dst4, dst4);
894 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
895 vec0, vec1, vec2, vec3);
897 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
898 dst5, dst5, dst5, dst5);
902 tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
903 ST_SB(tmp2, dst + 32);
/*
 * Body fragment of a filtering routine; header and some statements
 * (including the advances of the *_tmp pointers inside the inner loop)
 * are absent from this listing. Presumably the horizontal 8-tap,
 * 64-pixel-wide bi-prediction variant -- confirm against the full file.
 * The outer loop runs once per row; the inner loop runs twice per row.
 */
919 int16_t *src1_ptr_tmp;
923 v8i16 filt0, filt1, filt2, filt3;
924 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
925 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
926 v16i8 vec0, vec1, vec2, vec3;
927 v8i16 dst0, dst1, dst2, dst3;
928 v8i16 in0, in1, in2, in3;
929 v8i16 filter_vec, const_vec;
933 const_vec = __msa_ldi_h(128);
936 filter_vec =
LD_SH(filter);
937 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
947 for (loop_cnt = height; loop_cnt--;) {
/* Work on per-row copies so the row pointers can advance by a full
 * stride at the end of the iteration. */
948 src0_ptr_tmp = src0_ptr;
950 src1_ptr_tmp = src1_ptr;
952 for (cnt = 2; cnt--;) {
/* Source vectors at byte offsets 0, 16 and 24 from the temporary. */
953 LD_SB2(src0_ptr_tmp, 16, src0, src1);
954 src2 =
LD_SB(src0_ptr_tmp + 24);
956 LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
960 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
961 vec0, vec1, vec2, vec3);
963 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
964 dst0, dst0, dst0, dst0);
965 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
966 vec0, vec1, vec2, vec3);
968 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
969 dst1, dst1, dst1, dst1);
970 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
971 vec0, vec1, vec2, vec3);
973 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
974 dst2, dst2, dst2, dst2);
975 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
976 vec0, vec1, vec2, vec3);
978 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
979 dst3, dst3, dst3, dst3);
/* Argument tail of a call whose first line is missing. */
982 dst0, dst1, dst2, dst3, 7,
983 dst0, dst1, dst2, dst3);
986 ST_SB2(tmp0, tmp1, dst_tmp, 16);
/* Row pointers move to the next row. */
990 src1_ptr += src2_stride;
991 src0_ptr += src_stride;
/*
 * Body fragment of a filtering routine that combines consecutive rows;
 * header and some statements are absent from this listing. Presumably
 * the vertical 8-tap, 4-pixel-wide bi-prediction variant (store is
 * ST4x8_UB) -- confirm against the full file.
 * Seven rows are loaded before the loop; each iteration loads 8 more.
 */
1006 v16i8
src0,
src1, src2, src3, src4, src5;
1007 v16i8 src6, src7, src8, src9, src10;
1008 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1009 v16i8 src11, src12, src13, src14;
1010 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1011 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1012 v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1013 v16i8 src2110, src4332, src6554, src8776, src10998;
1014 v16i8 src12111110, src14131312;
1015 v8i16 dst10, dst32, dst54, dst76;
1016 v8i16 filt0, filt1, filt2, filt3;
1017 v8i16 filter_vec, const_vec;
/* Begin 3 rows above the given source position. */
1019 src0_ptr -= (3 * src_stride);
1021 const_vec = __msa_ldi_h(128);
1024 filter_vec =
LD_SH(filter);
1025 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1027 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1028 src0_ptr += (7 * src_stride);
/* Adjacent rows are byte-interleaved pairwise, then pairs of those
 * results are joined by doubleword into srcNNMM vectors. */
1029 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1030 src10_r, src32_r, src54_r, src21_r);
1031 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1032 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1033 src2110, src4332, src6554);
1036 for (loop_cnt = (height >> 3); loop_cnt--;) {
1037 LD_SB8(src0_ptr, src_stride,
1038 src7, src8, src9, src10, src11, src12, src13, src14);
1039 src0_ptr += (8 * src_stride);
1040 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
1041 src1_ptr += (8 * src2_stride);
1045 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1046 src76_r, src87_r, src98_r, src109_r);
1047 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1048 src1110_r, src1211_r, src1312_r, src1413_r);
1049 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1050 src1413_r, src1312_r,
1051 src8776, src10998, src12111110, src14131312);
/* The next three lines are argument tails of calls whose first lines
 * are missing from this listing. */
1056 filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
1059 filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
1062 filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
1064 DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
1065 filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
1068 dst10, dst32, dst54, dst76, 7,
1069 dst10, dst32, dst54, dst76);
1071 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
1072 ST4x8_UB(dst10, dst54, dst, dst_stride);
1073 dst += (8 * dst_stride);
/* Carry the newest interleaved rows into the history slots for the
 * next iteration. */
1076 src4332 = src12111110;
1077 src6554 = src14131312;
/*
 * Body fragment of a filtering routine that combines consecutive rows;
 * header and some statements are absent from this listing. Presumably
 * the vertical 8-tap, 8-pixel-wide bi-prediction variant (store is
 * ST8x4_UB) -- confirm against the full file.
 * Seven rows are loaded before the loop; each iteration loads 4 more.
 */
1092 v16i8
src0,
src1, src2, src3, src4, src5;
1093 v16i8 src6, src7, src8, src9, src10;
1094 v8i16 in0, in1, in2, in3;
1095 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1096 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1097 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1098 v8i16 filt0, filt1, filt2, filt3;
1099 v8i16 filter_vec, const_vec;
/* Begin 3 rows above the given source position. */
1101 src0_ptr -= (3 * src_stride);
1102 const_vec = __msa_ldi_h(128);
1105 filter_vec =
LD_SH(filter);
1106 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1108 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1109 src0_ptr += (7 * src_stride);
/* Byte-interleave each pair of adjacent rows (right halves). */
1111 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1112 src10_r, src32_r, src54_r, src21_r);
1113 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1115 for (loop_cnt = (height >> 2); loop_cnt--;) {
1116 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1117 src0_ptr += (4 * src_stride);
1118 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1119 src1_ptr += (4 * src2_stride);
1121 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1122 src76_r, src87_r, src98_r, src109_r);
/* The line pairs below are argument tails of calls whose first lines
 * are missing from this listing; one accumulator per output row. */
1126 filt0, filt1, filt2, filt3,
1127 dst0_r, dst0_r, dst0_r, dst0_r);
1130 filt0, filt1, filt2, filt3,
1131 dst1_r, dst1_r, dst1_r, dst1_r);
1134 filt0, filt1, filt2, filt3,
1135 dst2_r, dst2_r, dst2_r, dst2_r);
1138 filt0, filt1, filt2, filt3,
1139 dst3_r, dst3_r, dst3_r, dst3_r);
1142 dst0_r, dst1_r, dst2_r, dst3_r, 7,
1143 dst0_r, dst1_r, dst2_r, dst3_r);
1145 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1146 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
1147 dst += (4 * dst_stride);
/*
 * Body fragment of a filtering routine that combines consecutive rows;
 * header and some statements are absent from this listing. Presumably
 * the vertical 8-tap, 12-pixel-wide bi-prediction variant (store is
 * ST12x4_UB) -- confirm against the full file.
 * Right-half interleaves feed dst0_r..dst3_r; left-half interleaves are
 * joined by doubleword and feed dst0_l/dst1_l.
 */
1170 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1171 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1172 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1173 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1174 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1175 v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1176 v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1177 v16i8 src2110, src4332, src6554, src8776, src10998;
1178 v8i16 dst0_l, dst1_l;
1179 v8i16 filt0, filt1, filt2, filt3;
1180 v8i16 filter_vec, const_vec;
/* Begin 3 rows above the given source position. */
1182 src0_ptr -= (3 * src_stride);
1183 const_vec = __msa_ldi_h(128);
1186 filter_vec =
LD_SH(filter);
1187 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1189 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1190 src0_ptr += (7 * src_stride);
1193 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1194 src10_r, src32_r, src54_r, src21_r);
1195 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1196 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1197 src10_l, src32_l, src54_l, src21_l);
1198 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1199 ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1200 src2110, src4332, src6554);
1202 for (loop_cnt = (height >> 2); loop_cnt--;) {
1203 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1204 src0_ptr += (4 * src_stride);
/* in0..in3 from the row start, in4..in7 from 8 elements further. */
1205 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1206 LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
1207 src1_ptr += (4 * src2_stride);
1211 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1212 src76_r, src87_r, src98_r, src109_r);
1213 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1214 src76_l, src87_l, src98_l, src109_l);
1215 ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
/* The line pairs below are argument tails of calls whose first lines
 * are missing from this listing. */
1219 filt0, filt1, filt2, filt3,
1220 dst0_r, dst0_r, dst0_r, dst0_r);
1223 filt0, filt1, filt2, filt3,
1224 dst1_r, dst1_r, dst1_r, dst1_r);
1227 filt0, filt1, filt2, filt3,
1228 dst2_r, dst2_r, dst2_r, dst2_r);
1231 filt0, filt1, filt2, filt3,
1232 dst3_r, dst3_r, dst3_r, dst3_r);
1235 filt0, filt1, filt2, filt3,
1236 dst0_l, dst0_l, dst0_l, dst0_l);
1239 filt0, filt1, filt2, filt3,
1240 dst1_l, dst1_l, dst1_l, dst1_l);
1243 dst0_r, dst1_r, dst2_r, dst3_r, 7,
1244 dst0_r, dst1_r, dst2_r, dst3_r);
1248 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1249 dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
1250 ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
1251 dst += (4 * dst_stride);
/*
 * Body fragment of a filtering routine that combines consecutive rows
 * and walks the block in 16-column strips (outer loop, width >> 4) with
 * 2 rows per inner iteration (height >> 1). Header and some statements,
 * including the per-strip pointer advances and history rotation, are
 * absent from this listing. Presumably a vertical 8-tap bi-prediction
 * variant -- confirm against the full file.
 */
1276 int16_t *src1_ptr_tmp;
1280 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1281 v8i16 in0, in1, in2, in3;
1282 v16i8 src10_r, src32_r, src54_r, src76_r;
1283 v16i8 src21_r, src43_r, src65_r, src87_r;
1284 v8i16 dst0_r, dst1_r;
1285 v16i8 src10_l, src32_l, src54_l, src76_l;
1286 v16i8 src21_l, src43_l, src65_l, src87_l;
1287 v8i16 dst0_l, dst1_l;
1288 v8i16 filt0, filt1, filt2, filt3;
1289 v8i16 filter_vec, const_vec;
/* Begin 3 rows above the given source position. */
1291 src0_ptr -= (3 * src_stride);
1292 const_vec = __msa_ldi_h(128);
1295 filter_vec =
LD_SH(filter);
1296 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1298 for (cnt = (width >> 4); cnt--;) {
1299 src0_ptr_tmp = src0_ptr;
1300 src1_ptr_tmp = src1_ptr;
/* Prime the strip with 7 rows and interleave adjacent rows, keeping
 * right and left halves separately. */
1303 LD_SB7(src0_ptr_tmp, src_stride,
1304 src0, src1, src2, src3, src4, src5, src6);
1305 src0_ptr_tmp += (7 * src_stride);
1308 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1309 src10_r, src32_r, src54_r, src21_r);
1310 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1311 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1312 src10_l, src32_l, src54_l, src21_l);
1313 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1315 for (loop_cnt = (height >> 1); loop_cnt--;) {
1316 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1317 src0_ptr_tmp += (2 * src_stride);
1318 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1319 LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
1320 src1_ptr_tmp += (2 * src2_stride);
1323 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1324 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
/* The line pairs below are argument tails of calls whose first lines
 * are missing from this listing. */
1328 filt0, filt1, filt2, filt3,
1329 dst0_r, dst0_r, dst0_r, dst0_r);
1332 filt0, filt1, filt2, filt3,
1333 dst1_r, dst1_r, dst1_r, dst1_r);
1336 filt0, filt1, filt2, filt3,
1337 dst0_l, dst0_l, dst0_l, dst0_l);
1340 filt0, filt1, filt2, filt3,
1341 dst1_l, dst1_l, dst1_l, dst1_l);
1344 dst0_r, dst1_r, dst0_l, dst1_l, 7,
1345 dst0_r, dst1_r, dst0_l, dst1_l);
1347 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
1348 ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
1349 dst_tmp += (2 * dst_stride);
1382 dst, dst_stride, filter, height, 16);
1395 dst, dst_stride, filter, height, 16);
1397 dst + 16, dst_stride, filter, height);
1410 dst, dst_stride, filter, height, 32);
1423 dst, dst_stride, filter, height, 48);
1436 dst, dst_stride, filter, height, 64);
/*
 * Fragment of a routine taking separate filter_x and filter_y
 * coefficient pointers; the function name, remaining parameters and
 * several statements are absent from this listing. Presumably the 2-D
 * (horizontal then vertical) 8-tap, 4-pixel-wide bi-prediction variant
 * -- confirm against the full file.
 */
1445 const int8_t *filter_x,
1446 const int8_t *filter_y,
1450 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1452 v8i16 filt0, filt1, filt2, filt3;
1453 v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1454 v16i8 mask1, mask2, mask3;
1455 v8i16 filter_vec, const_vec;
1456 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1457 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1458 v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
1459 v4i32 dst0_r, dst1_r, in0_r, in0_l;
1460 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1461 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1462 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1463 v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Begin 3 rows above and 3 bytes left of the given source position. */
1465 src0_ptr -= ((3 * src_stride) + 3);
1466 filter_vec =
LD_SH(filter_x);
1467 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* Sign-extend the filter_y bytes to halfwords: the compare yields an
 * all-ones byte for each negative coefficient, which is interleaved in
 * as the high byte. */
1469 filter_vec =
LD_SH(filter_y);
1470 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1471 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1473 SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1479 const_vec = __msa_ldi_h(128);
1482 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1483 src0_ptr += (7 * src_stride);
/* Rows are shuffled in pairs three apart (0 with 3, 1 with 4, 2 with 5,
 * 3 with 6), matching the dst30/dst41/dst52/dst63 accumulator names. */
1487 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1488 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1489 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1490 vec8, vec9, vec10, vec11);
1491 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1492 vec12, vec13, vec14, vec15);
1495 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1496 dst30, dst30, dst30, dst30);
1498 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1499 dst41, dst41, dst41, dst41);
1501 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1502 dst52, dst52, dst52, dst52);
1504 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1505 dst63, dst63, dst63, dst63);
/* Halfword-interleave the accumulators: right halves give the 10/21/32
 * pairs, left halves give the 43/54/65 pairs. */
1507 ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
1508 dst10_r, dst21_r, dst32_r);
1509 dst43_r = __msa_ilvl_h(dst41, dst30);
1510 dst54_r = __msa_ilvl_h(dst52, dst41);
1511 dst65_r = __msa_ilvl_h(dst63, dst52);
/* Replicate doubleword 1 of dst63 into dst66. */
1512 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1514 for (loop_cnt = height >> 1; loop_cnt--;) {
1515 LD_SB2(src0_ptr, src_stride, src7, src8);
1516 src0_ptr += (2 * src_stride);
1517 LD_SH2(src1_ptr, src2_stride, in0, in1);
1518 src1_ptr += (2 * src2_stride);
1520 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
1523 VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
1524 vec0, vec1, vec2, vec3);
1526 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1527 dst87, dst87, dst87, dst87);
1528 dst76_r = __msa_ilvr_h(dst87, dst66);
/* Argument tail of a call whose first line is missing. */
1530 filt_h0, filt_h1, filt_h2, filt_h3);
1531 dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
/* Argument tail of a call whose first line is missing. */
1533 filt_h0, filt_h1, filt_h2, filt_h3);
/* Saturating 32-bit adds of the widened second-source terms. */
1538 dst0_r = __msa_adds_s_w(dst0_r, in0_r);
1539 dst1_r = __msa_adds_s_w(dst1_r, in0_l);
1546 dst += (2 * dst_stride);
/* Refresh dst66 from the newest row pair for the next iteration. */
1554 dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
/* NOTE(review): only part of this routine is visible in this listing; the
 * signature, braces and several statements are missing.  The visible code
 * takes separate filter_x / filter_y coefficient pointers, iterates
 * (width >> 3) times in the outer loop and (height >> 1) times in the inner
 * loop, and stores through ST8x2_UB two rows at a time. */
1564 const int8_t *filter_x,
1565 const int8_t *filter_y,
1571 int16_t *src1_ptr_tmp;
1573 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1575 v4i32 in0_r, in0_l, in1_r, in1_l;
1576 v8i16 filt0, filt1, filt2, filt3;
1577 v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
/* Shuffle control pairing each byte with its right-hand neighbour. */
1578 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1579 v16i8 mask1, mask2, mask3;
1580 v8i16 filter_vec, const_vec;
1581 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1582 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1583 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1584 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1585 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1586 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1587 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1588 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
/* Rewind the source pointer by three rows and three bytes before loading. */
1590 src0_ptr -= ((3 * src_stride) + 3);
/* 128 replicated into every halfword lane. */
1591 const_vec = __msa_ldi_h(128);
/* Coefficients read from filter_x; presumably SPLATI_H4_SH replicates
 * halfword elements 0..3 into filt0..filt3 -- confirm against the macro. */
1594 filter_vec =
LD_SH(filter_x);
1595 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* Coefficients read from filter_y.  vec0 becomes an all-ones byte wherever
 * the coefficient byte is negative; interleaving it above each coefficient
 * byte yields sign-extended 16-bit values. */
1597 filter_vec =
LD_SH(filter_y);
1598 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1599 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1600 SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
/* Outer loop: width >> 3 passes, each starting from working copies of the
 * input pointers. */
1606 for (cnt = width >> 3; cnt--;) {
1607 src0_ptr_tmp = src0_ptr;
1609 src1_ptr_tmp = src1_ptr;
/* Load seven source rows up front and step past them. */
1611 LD_SB7(src0_ptr_tmp, src_stride,
1612 src0, src1, src2, src3, src4, src5, src6);
1613 src0_ptr_tmp += (7 * src_stride);
/* Rows 0..3: shuffle with the four masks, then accumulate against
 * filt0..filt3 into dst0..dst3. */
1617 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1618 vec0, vec1, vec2, vec3);
1619 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1620 vec4, vec5, vec6, vec7);
1621 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1622 vec8, vec9, vec10, vec11);
1623 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1624 vec12, vec13, vec14, vec15);
1626 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1627 dst0, dst0, dst0, dst0);
1629 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1630 dst1, dst1, dst1, dst1);
1632 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1633 dst2, dst2, dst2, dst2);
1635 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1636 dst3, dst3, dst3, dst3);
/* Rows 4..6: same treatment, results in dst4..dst6. */
1638 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1639 vec0, vec1, vec2, vec3);
1640 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1641 vec4, vec5, vec6, vec7);
1642 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1643 vec8, vec9, vec10, vec11);
1645 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1646 dst4, dst4, dst4, dst4);
1648 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1649 dst5, dst5, dst5, dst5);
1651 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1652 dst6, dst6, dst6, dst6);
/* Interleave halfwords of consecutive row results (right and left halves);
 * the dstNM names record which two rows each vector combines. */
1654 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1655 dst10_r, dst32_r, dst54_r, dst21_r);
1656 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1657 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1658 dst10_l, dst32_l, dst54_l, dst21_l);
1659 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
/* Inner loop: two new source rows and two rows of src1_ptr data per pass. */
1661 for (loop_cnt = height >> 1; loop_cnt--;) {
1663 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1665 src0_ptr_tmp += 2 * src_stride;
1667 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1668 src1_ptr_tmp += (2 * src2_stride);
1670 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1671 vec0, vec1, vec2, vec3);
1673 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1674 dst7, dst7, dst7, dst7);
/* NOTE(review): the next two lines are tails of calls whose opening lines
 * are missing here; they pass filt_h0..filt_h3. */
1678 filt_h0, filt_h1, filt_h2, filt_h3);
1680 filt_h0, filt_h1, filt_h2, filt_h3);
1684 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1685 vec0, vec1, vec2, vec3);
1687 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1688 dst8, dst8, dst8, dst8);
1692 filt_h0, filt_h1, filt_h2, filt_h3);
1694 filt_h0, filt_h1, filt_h2, filt_h3);
/* Saturating signed 32-bit adds of the filtered values into inN_r/inN_l. */
1700 in0_r = __msa_adds_s_w(in0_r, dst0_r);
1701 in0_l = __msa_adds_s_w(in0_l, dst0_l);
1702 in1_r = __msa_adds_s_w(in1_r, dst1_r);
1703 in1_l = __msa_adds_s_w(in1_l, dst1_l);
/* Store through ST8x2_UB, then advance the output pointer by two rows. */
1711 ST8x2_UB(dst0_r, dst_tmp, dst_stride);
1712 dst_tmp += (2 * dst_stride);
/* NOTE(review): residue of several short functions.  For each, only the
 * filter_x / filter_y parameter lines and the tail of a forwarded call
 * survive; function names and callee names are not visible here. */
1741 const int8_t *filter_x,
1742 const int8_t *filter_y,
1746 dst, dst_stride, filter_x, filter_y,
1756 const int8_t *filter_x,
1757 const int8_t *filter_y,
1761 dst, dst_stride, filter_x, filter_y,
/* This function forwards a second call whose output starts at dst + 8. */
1765 dst + 8, dst_stride, filter_x, filter_y, height);
1774 const int8_t *filter_x,
1775 const int8_t *filter_y,
1779 dst, dst_stride, filter_x, filter_y,
1789 const int8_t *filter_x,
1790 const int8_t *filter_y,
1794 dst, dst_stride, filter_x, filter_y,
1804 const int8_t *filter_x,
1805 const int8_t *filter_y,
1809 dst, dst_stride, filter_x, filter_y,
1819 const int8_t *filter_x,
1820 const int8_t *filter_y,
1824 dst, dst_stride, filter_x, filter_y,
1834 const int8_t *filter_x,
1835 const int8_t *filter_y,
1839 dst, dst_stride, filter_x, filter_y,
/* NOTE(review): partial listing of a routine that handles two rows in one
 * shot (no loop is visible); signature and several statements are missing. */
1853 v16i8
src0,
src1, dst0, vec0, vec1;
/* Shuffle control pairing adjacent bytes; indices of 16 and above select
 * from the other vector passed to the shuffle, so one result carries pairs
 * from both rows. */
1855 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1858 v8i16 filter_vec, const_vec;
/* 128 replicated into every halfword lane. */
1862 const_vec = __msa_ldi_h(128);
1865 filter_vec =
LD_SH(filter);
/* Two rows from each input. */
1870 LD_SB2(src0_ptr, src_stride, src0, src1);
1871 LD_SH2(src1_ptr, src2_stride, in0, in1);
/* Combine the low 64 bits of in0 and in1 into one vector. */
1872 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
1874 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
/* Saturating add of in0, then rounding arithmetic shift right by 7. */
1878 tmp0 = __msa_adds_s_h(tmp0, in0);
1879 tmp0 = __msa_srari_h(tmp0, 7);
/* Keep the even-indexed byte of every halfword. */
1881 dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
/* NOTE(review): partial listing of a routine that handles four rows in one
 * shot (no loop is visible); signature and several statements are missing. */
1896 v16i8
src0,
src1, src2, src3, dst0, vec0, vec1;
1897 v8i16 in0, in1, in2, in3;
/* Shuffle control pairing adjacent bytes; indices of 16 and above select
 * from the other vector passed to the shuffle. */
1898 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1901 v8i16 filter_vec, const_vec;
/* 128 replicated into every halfword lane. */
1905 const_vec = __msa_ldi_h(128);
1908 filter_vec =
LD_SH(filter);
/* Four rows from each input. */
1913 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
1914 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
/* Rows are shuffled in pairs: (src0, src1), then (src2, src3). */
1919 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1922 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
/* Keep the even-indexed bytes of tmp0 and tmp1 in a single vector. */
1926 dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
/* Store through ST4x4_UB, passing element indices 0..3 of dst0. */
1928 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
/* NOTE(review): partial listing of a routine that processes eight rows per
 * loop iteration (height >> 3 iterations); signature and several statements
 * are missing. */
1942 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
1944 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
/* Shuffle control pairing adjacent bytes; indices of 16 and above select
 * from the other vector passed to the shuffle. */
1945 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1946 v16i8 mask1, vec0, vec1;
1947 v8i16 tmp0, tmp1, tmp2, tmp3;
1948 v8i16 filter_vec, const_vec;
/* 128 replicated into every halfword lane. */
1952 const_vec = __msa_ldi_h(128);
1955 filter_vec =
LD_SH(filter);
1960 for (loop_cnt = (height >> 3); loop_cnt--;) {
/* Eight rows from src0_ptr; eight rows from src1_ptr in two groups of 4. */
1961 LD_SB8(src0_ptr, src_stride,
1962 src0, src1, src2, src3, src4, src5, src6, src7);
1963 src0_ptr += (8 * src_stride);
1964 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1965 src1_ptr += (4 * src2_stride);
1966 LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
1967 src1_ptr += (4 * src2_stride);
/* Rows are shuffled in consecutive pairs. */
1972 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1975 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
1978 VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
1981 VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
/* NOTE(review): tail of a call whose opening line is missing; the argument
 * layout (four values, 7, four outputs) matches HEVC_BI_RND_CLIP4 with a
 * rounding shift of 7 -- confirm. */
1986 tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
1989 ST4x8_UB(dst0, dst1, dst, dst_stride);
1990 dst += (8 * dst_stride);
/* NOTE(review): height-based dispatch.  The visible branches cover height 4
 * and heights 8 or 16; the first condition and all callee names are missing
 * from this listing.  Each branch forwards dst, dst_stride, filter and
 * height. */
2005 dst, dst_stride, filter, height);
2006 }
else if (4 == height) {
2008 dst, dst_stride, filter, height);
2009 }
else if (8 == height || 16 == height) {
2011 src1_ptr, src2_stride,
2012 dst, dst_stride, filter, height);
/* NOTE(review): partial listing of a routine that processes four rows per
 * loop iteration and stores through ST6x4_UB; signature and several
 * statements are missing. */
2028 v8i16 in0, in1, in2, in3;
/* Shuffle control pairing each byte with its right-hand neighbour. */
2029 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2032 v8i16 dst0, dst1, dst2, dst3;
2033 v8i16 filter_vec, const_vec;
/* 128 replicated into every halfword lane. */
2037 const_vec = __msa_ldi_h(128);
2040 filter_vec =
LD_SH(filter);
2045 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* Four rows from each input, then advance both pointers. */
2046 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2047 src0_ptr += (4 * src_stride);
2048 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2049 src1_ptr += (4 * src2_stride);
/* Each row is shuffled against itself. */
2052 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2055 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2058 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2061 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
/* NOTE(review): tail of a call whose opening line is missing; the argument
 * layout matches HEVC_BI_RND_CLIP4 with a rounding shift of 7 -- confirm. */
2066 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2069 ST6x4_UB(dst0, dst1, dst, dst_stride);
2070 dst += (4 * dst_stride);
/* NOTE(review): partial listing of a routine that handles two rows in one
 * shot (no loop is visible); signature and several statements are missing. */
/* Shuffle control pairing each byte with its right-hand neighbour. */
2086 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2087 v16i8 mask1, vec0, vec1;
2089 v8i16 filter_vec, const_vec;
/* 128 replicated into every halfword lane. */
2093 const_vec = __msa_ldi_h(128);
2096 filter_vec =
LD_SH(filter);
/* Two rows from each input. */
2101 LD_SB2(src0_ptr, src_stride, src0, src1);
2102 LD_SH2(src1_ptr, src2_stride, in0, in1);
/* Each row is shuffled against itself. */
2105 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2108 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
/* Keep the even-indexed bytes of dst0 and dst1 in a single vector. */
2113 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
/* NOTE(review): partial listing of a routine that handles six rows in one
 * shot (no loop is visible); signature and several statements are missing. */
2127 v16i8
src0,
src1, src2, src3, src4, src5;
2128 v8i16 in0, in1, in2, in3, in4, in5;
/* Shuffle control pairing each byte with its right-hand neighbour. */
2129 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2132 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2133 v8i16 filter_vec, const_vec;
/* 128 replicated into every halfword lane. */
2137 const_vec = __msa_ldi_h(128);
2140 filter_vec =
LD_SH(filter);
/* Six rows from src0_ptr; six rows from src1_ptr loaded as 4 + 2. */
2145 LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
2146 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2147 src1_ptr += (4 * src2_stride);
2148 LD_SH2(src1_ptr, src2_stride, in4, in5);
/* Each row is shuffled against itself. */
2151 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2154 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2157 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2160 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2163 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2166 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
/* NOTE(review): tail of a call whose opening line is missing; the argument
 * layout matches HEVC_BI_RND_CLIP4 with a rounding shift of 7 -- confirm. */
2171 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
/* Rows 4 and 5 are packed to bytes separately into dst2. */
2175 dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
/* The first four rows are stored here; the store that consumes dst2 is not
 * visible in this listing. */
2176 ST8x4_UB(dst0, dst1, dst, dst_stride);
2177 dst += (4 * dst_stride);
/* NOTE(review): partial listing of a routine that processes four rows per
 * loop iteration and stores through ST8x4_UB; signature and several
 * statements are missing. */
2193 v8i16 in0, in1, in2, in3;
/* Shuffle control pairing each byte with its right-hand neighbour. */
2194 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2197 v8i16 dst0, dst1, dst2, dst3;
2198 v8i16 filter_vec, const_vec;
/* 128 replicated into every halfword lane. */
2202 const_vec = __msa_ldi_h(128);
2205 filter_vec =
LD_SH(filter);
2210 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* Four rows from each input, then advance both pointers. */
2211 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2212 src0_ptr += (4 * src_stride);
2213 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2214 src1_ptr += (4 * src2_stride);
/* Each row is shuffled against itself. */
2217 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2220 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2223 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2226 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
/* NOTE(review): tail of a call whose opening line is missing; the argument
 * layout matches HEVC_BI_RND_CLIP4 with a rounding shift of 7 -- confirm. */
2231 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2234 ST8x4_UB(dst0, dst1, dst, dst_stride);
2235 dst += (4 * dst_stride);
/* NOTE(review): height-based dispatch.  The visible branches cover height 6
 * and heights that are a multiple of 4; the first condition and all callee
 * names are missing from this listing. */
2250 dst, dst_stride, filter, height);
2251 }
else if (6 == height) {
2253 dst, dst_stride, filter, height);
2254 }
else if (0 == (height % 4)) {
2256 src1_ptr, src2_stride,
2257 dst, dst_stride, filter, height);
/* NOTE(review): partial listing of a routine that processes four rows per
 * loop iteration and stores through ST12x4_UB; signature and several
 * statements are missing. */
2273 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
/* Shuffle control pairing each byte with its right-hand neighbour. */
2274 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
/* NOTE(review): initializer of a second shuffle control whose declaration
 * line is missing; it pairs bytes 8..12 of one shuffle operand with bytes
 * 8..12 (indices 24..28) of the other. */
2276 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
2280 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2281 v8i16 filter_vec, const_vec;
/* 128 replicated into every halfword lane. */
2285 const_vec = __msa_ldi_h(128);
2288 filter_vec =
LD_SH(filter);
2294 for (loop_cnt = (height >> 2); loop_cnt--;) {
2295 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2296 src0_ptr += (4 * src_stride);
/* Four rows of src1_ptr data at offset 0 and four more at offset 8. */
2297 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2298 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
2299 src1_ptr += (4 * src2_stride);
/* mask0/mask1 shuffles take each row against itself. */
2304 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2307 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2310 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2313 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
/* mask2/mask3 shuffles take rows in pairs. */
2316 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2319 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
/* NOTE(review): tail of a call whose opening line is missing; the argument
 * layout matches HEVC_BI_RND_CLIP4 with a rounding shift of 7 -- confirm. */
2324 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2328 dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
2329 ST12x4_UB(dst0, dst1, dst2, dst, dst_stride);
2330 dst += (4 * dst_stride);
/* NOTE(review): partial listing of a routine that processes four rows per
 * loop iteration and stores one full vector per row; signature and several
 * statements are missing. */
2344 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2345 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
/* Shuffle control pairing each byte with its right-hand neighbour. */
2347 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2349 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2351 v8i16 filter_vec, const_vec;
/* 128 replicated into every halfword lane. */
2355 const_vec = __msa_ldi_h(128);
2358 filter_vec =
LD_SH(filter);
2363 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* Even-numbered registers hold the data at offset 0 of each row,
 * odd-numbered ones the data at offset 8. */
2364 LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
2365 LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
2366 src0_ptr += (4 * src_stride);
2367 LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
2368 LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
2369 src1_ptr += (4 * src2_stride);
/* Each loaded vector is shuffled against itself. */
2372 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2375 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2378 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2381 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2384 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2387 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2390 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
2393 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
/* NOTE(review): the next three lines are tails of calls whose opening lines
 * are missing.  The first two match the HEVC_BI_RND_CLIP4 argument layout
 * with a rounding shift of 7; the third combines the halves pairwise into
 * dst0..dst3 -- confirm. */
2398 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2400 dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
2403 dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
/* One vector per row for four rows. */
2404 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
2405 dst += (4 * dst_stride);
/* NOTE(review): partial listing of a routine that processes four rows per
 * loop iteration in two parts: a full-vector store through dst, then an
 * ST8x4_UB store through dst_tmp.  Signature and several statements are
 * missing. */
2418 int16_t *src1_ptr_tmp;
2421 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2422 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
/* Shuffle control pairing each byte with its right-hand neighbour. */
2424 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2425 v16i8 mask1, mask2, mask3;
2427 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2428 v8i16 filter_vec, const_vec;
/* 128 replicated into every halfword lane. */
2432 const_vec = __msa_ldi_h(128);
2435 filter_vec =
LD_SH(filter);
/* Second read cursor into the src1_ptr data, 16 elements further along. */
2443 src1_ptr_tmp = src1_ptr + 16;
2445 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* Even-numbered registers: offset 0 of each row; odd-numbered: offset 16. */
2446 LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
2447 LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7);
2448 src0_ptr += (4 * src_stride);
2449 LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
2450 LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
2451 src1_ptr += (4 * src2_stride);
/* Per row: one self-shuffle with mask0/mask1 and one shuffle with
 * mask2/mask3 that also draws on the vector loaded at offset 16. */
2454 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2457 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2460 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2463 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
2466 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2469 VSHF_B2_SB(src4, src5, src4, src5, mask2, mask3, vec0, vec1);
2472 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
2475 VSHF_B2_SB(src6, src7, src6, src7, mask2, mask3, vec0, vec1);
/* NOTE(review): tails of calls whose opening lines are missing; the first
 * two match the HEVC_BI_RND_CLIP4 argument layout with a rounding shift of
 * 7 -- confirm. */
2480 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2482 dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
2485 dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
2486 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
2487 dst += (4 * dst_stride);
/* Second part: data from the offset-16 loads, combined with src1 data read
 * through src1_ptr_tmp. */
2489 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
2490 src1_ptr_tmp += (4 * src2_stride);
2492 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2495 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2498 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2501 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
2506 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2509 ST8x4_UB(dst0, dst1, dst_tmp, dst_stride);
2510 dst_tmp += (4 * dst_stride);
/* NOTE(review): partial listing of a routine whose loop runs height >> 1
 * times and handles two rows per iteration, one row at a time; signature
 * and several statements are missing. */
2525 v8i16 in0, in1, in2, in3;
/* Shuffle control pairing each byte with its right-hand neighbour. */
2527 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2528 v16i8 mask1, mask2, mask3;
2529 v8i16 dst0, dst1, dst2, dst3;
2531 v8i16 filter_vec, const_vec;
/* 128 replicated into every halfword lane. */
2535 const_vec = __msa_ldi_h(128);
2538 filter_vec =
LD_SH(filter);
2545 for (loop_cnt = (height >> 1); loop_cnt--;) {
/* First row: vectors at byte offsets 0, 16 and 24 of the source row, and
 * four vectors of src1_ptr data spaced 8 elements apart. */
2546 LD_SB2(src0_ptr, 16, src0, src1);
2547 src2 =
LD_SB(src0_ptr + 24);
2548 src0_ptr += src_stride;
2549 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
2550 src1_ptr += src2_stride;
/* The mask2/mask3 shuffle draws on both src0 and src1. */
2553 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2556 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2559 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2562 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
/* NOTE(review): tail of a call whose opening line is missing; the argument
 * layout matches HEVC_BI_RND_CLIP4 with a rounding shift of 7 -- confirm. */
2567 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
/* Two vectors, 16 bytes apart, written to the current output row. */
2570 ST_SH2(dst0, dst1, dst, 16);
/* Second row: identical sequence.  The dst advance between the two stores
 * is not visible in this listing. */
2573 LD_SB2(src0_ptr, 16, src0, src1);
2574 src2 =
LD_SB(src0_ptr + 24);
2575 src0_ptr += src_stride;
2576 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
2577 src1_ptr += src2_stride;
2580 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2583 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2586 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2589 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2594 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2597 ST_SH2(dst0, dst1, dst, 16);
/* NOTE(review): partial listing of a routine that combines five consecutive
 * source rows into two output rows (no loop is visible); signature and
 * several statements are missing. */
2611 v16i8
src0,
src1, src2, src3, src4;
2613 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
2616 v8i16 filter_vec, const_vec;
/* Start one row above the incoming source pointer. */
2618 src0_ptr -= src_stride;
/* 128 replicated into every halfword lane. */
2620 const_vec = __msa_ldi_h(128);
2623 filter_vec =
LD_SH(filter);
/* First three rows. */
2626 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2627 src0_ptr += (3 * src_stride);
/* Interleave bytes of vertically adjacent rows (1 with 0, 2 with 1), join
 * the two low halves into one vector, and flip the top bit of every byte
 * (XOR with 128). */
2629 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2630 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2631 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
/* Next two rows, plus two rows of src1_ptr data merged into in0. */
2633 LD_SB2(src0_ptr, src_stride, src3, src4);
2634 LD_SH2(src1_ptr, src2_stride, in0, in1);
2635 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2636 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2637 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2638 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
/* Accumulate both row-pair vectors against filt0/filt1 into dst10, then
 * saturating add of in0 and rounding arithmetic shift right by 7. */
2641 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2642 dst10 = __msa_adds_s_h(dst10, in0);
2643 dst10 = __msa_srari_h(dst10, 7);
/* Keep the even-indexed byte of every halfword. */
2646 dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
/* NOTE(review): partial listing of a routine that combines seven consecutive
 * source rows into four output rows (no loop is visible); signature and
 * several statements are missing. */
2659 v16i8
src0,
src1, src2, src3, src4, src5, src6;
2660 v8i16 in0, in1, in2, in3;
2661 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2662 v16i8 src2110, src4332, src6554;
2665 v8i16 filter_vec, const_vec;
/* Start one row above the incoming source pointer. */
2667 src0_ptr -= src_stride;
/* 128 replicated into every halfword lane. */
2669 const_vec = __msa_ldi_h(128);
2672 filter_vec =
LD_SH(filter);
/* First three rows: interleave vertically adjacent rows, join the low
 * halves and flip the top bit of every byte. */
2675 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2676 src0_ptr += (3 * src_stride);
2677 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2678 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2679 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
/* Four more source rows and four rows of src1_ptr data. */
2681 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
2682 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2684 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2685 src32_r, src43_r, src54_r, src65_r);
2686 ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
/* src4332 is shared by both accumulations. */
2690 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2692 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
/* Keep the even-indexed bytes of dst10 and dst32 in a single vector. */
2695 dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
2696 ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
/* NOTE(review): partial listing of a routine that produces eight output rows
 * per loop iteration (height >> 3 iterations); signature and several
 * statements are missing. */
2709 v16i8
src0,
src1, src2, src3, src4, src5;
2710 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2711 v16i8 src6, src7, src8, src9;
2712 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
2713 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
2714 v16i8 src2110, src4332, src6554, src8776;
2715 v8i16 dst10, dst32, dst54, dst76;
2717 v8i16 filter_vec, const_vec;
/* Start one row above the incoming source pointer. */
2719 src0_ptr -= src_stride;
/* 128 replicated into every halfword lane. */
2721 const_vec = __msa_ldi_h(128);
2724 filter_vec =
LD_SH(filter);
/* Prologue: three rows, interleaved pairwise, joined and top-bit flipped. */
2727 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2728 src0_ptr += (3 * src_stride);
2729 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2730 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2731 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2733 for (loop_cnt = (height >> 3); loop_cnt--;) {
/* Six new source rows now (two more follow below) and eight rows of
 * src1_ptr data. */
2734 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
2735 src0_ptr += (6 * src_stride);
2736 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
2737 src1_ptr += (8 * src2_stride);
2740 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2741 src32_r, src43_r, src54_r, src65_r);
2742 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2743 ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
2744 src4332, src6554, src8776);
/* Each accumulation uses two neighbouring row-pair vectors. */
2748 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2750 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
2752 DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
/* The last row read lands in src2 and the rebuilt src2110 is what the next
 * iteration starts from. */
2754 LD_SB2(src0_ptr, src_stride, src9, src2);
2755 src0_ptr += (2 * src_stride);
2756 ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
2757 src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
2758 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2760 DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
/* NOTE(review): tail of a call whose opening line is missing; the argument
 * layout matches HEVC_BI_RND_CLIP4 with a rounding shift of 7 -- confirm. */
2763 dst10, dst32, dst54, dst76, 7,
2764 dst10, dst32, dst54, dst76);
2766 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
2767 ST4x8_UB(dst10, dst54, dst, dst_stride);
2768 dst += (8 * dst_stride);
/* NOTE(review): height-based dispatch.  Only the height == 4 condition is
 * visible; the other conditions and all callee names are missing from this
 * listing. */
2783 dst, dst_stride, filter, height);
2784 }
else if (4 == height) {
2786 dst, dst_stride, filter, height);
2789 src1_ptr, src2_stride,
2790 dst, dst_stride, filter, height);
/* NOTE(review): partial listing of a routine that produces four output rows
 * per loop iteration and stores through ST6x4_UB; signature and several
 * statements are missing. */
2804 v16i8
src0,
src1, src2, src3, src4, src5;
2805 v8i16 in0, in1, in2, in3;
2806 v16i8 src10_r, src32_r, src21_r, src43_r;
2807 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
2809 v8i16 filter_vec, const_vec;
/* Start one row above the incoming source pointer. */
2811 src0_ptr -= src_stride;
/* 128 replicated into every halfword lane. */
2813 const_vec = __msa_ldi_h(128);
2816 filter_vec =
LD_SH(filter);
/* Prologue: three rows, interleaved pairwise (1 with 0, 2 with 1). */
2819 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2820 src0_ptr += (3 * src_stride);
2822 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2824 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* First half: two new source rows; all four src1_ptr rows are loaded here. */
2825 LD_SB2(src0_ptr, src_stride, src3, src4);
2826 src0_ptr += (2 * src_stride);
2827 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2828 src1_ptr += (4 * src2_stride);
2830 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2833 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2835 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
/* Second half: the last row read lands in src2 and src10_r/src21_r are
 * rebuilt, so the next iteration continues from them. */
2837 LD_SB2(src0_ptr, src_stride, src5, src2);
2838 src0_ptr += (2 * src_stride);
2840 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2843 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
2845 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
/* NOTE(review): tail of a call whose opening line is missing; the argument
 * layout matches HEVC_BI_RND_CLIP4 with a rounding shift of 7 -- confirm. */
2848 dst0_r, dst1_r, dst2_r, dst3_r, 7,
2849 dst0_r, dst1_r, dst2_r, dst3_r);
2851 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
2852 ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
2853 dst += (4 * dst_stride);
/* NOTE(review): partial listing of a routine that combines five consecutive
 * source rows into two output rows (no loop is visible); signature and
 * several statements are missing. */
2866 v16i8
src0,
src1, src2, src3, src4;
2867 v8i16 in0, in1, dst0_r, dst1_r;
2868 v16i8 src10_r, src32_r, src21_r, src43_r;
2870 v8i16 filter_vec, const_vec;
/* Start one row above the incoming source pointer. */
2872 src0_ptr -= src_stride;
/* 128 replicated into every halfword lane. */
2874 const_vec = __msa_ldi_h(128);
2877 filter_vec =
LD_SH(filter);
/* First three rows, interleaved pairwise (1 with 0, 2 with 1). */
2880 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2881 src0_ptr += (3 * src_stride);
2883 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
/* Two more source rows and two rows of src1_ptr data. */
2885 LD_SB2(src0_ptr, src_stride, src3, src4);
2886 LD_SH2(src1_ptr, src2_stride, in0, in1);
2888 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
/* dst0_r uses row pairs (1,0) and (3,2); dst1_r uses (2,1) and (4,3). */
2891 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2893 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
/* Keep the even-indexed bytes of dst0_r and dst1_r in a single vector. */
2896 dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
/* NOTE(review): partial listing of a routine that combines nine consecutive
 * source rows into six output rows (no loop is visible); signature and
 * several statements are missing. */
2910 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
2911 v8i16 in0, in1, in2, in3, in4, in5;
2912 v16i8 src10_r, src32_r, src54_r, src76_r;
2913 v16i8 src21_r, src43_r, src65_r, src87_r;
2914 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
2916 v8i16 filter_vec, const_vec;
/* Start one row above the incoming source pointer. */
2918 src0_ptr -= src_stride;
/* 128 replicated into every halfword lane. */
2920 const_vec = __msa_ldi_h(128);
2923 filter_vec =
LD_SH(filter);
/* First three rows, interleaved pairwise. */
2926 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2927 src0_ptr += (3 * src_stride);
2929 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
/* Six more source rows and six rows of src1_ptr data. */
2931 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
2932 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
2934 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2935 src32_r, src43_r, src54_r, src65_r);
2936 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
/* Each output accumulates two row-pair vectors that are two rows apart. */
2939 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2941 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2943 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
2945 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
2947 DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r);
2949 DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r);
/* NOTE(review): tail of a call whose opening line is missing; the argument
 * layout matches HEVC_BI_RND_CLIP4 with a rounding shift of 7 -- confirm. */
2951 dst0_r, dst1_r, dst2_r, dst3_r, 7,
2952 dst0_r, dst1_r, dst2_r, dst3_r);
2955 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
/* Rows 4 and 5 are packed separately; the store that consumes dst2_r is not
 * visible in this listing. */
2956 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
2957 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
2958 dst += (4 * dst_stride);
/* NOTE(review): partial listing of a routine that produces four output rows
 * per loop iteration and stores through ST8x4_UB; signature and several
 * statements are missing. */
2972 v16i8
src0,
src1, src2, src3, src4, src5;
2973 v8i16 in0, in1, in2, in3;
2974 v16i8 src10_r, src32_r, src21_r, src43_r;
2975 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
2977 v8i16 filter_vec, const_vec;
/* Start one row above the incoming source pointer. */
2979 src0_ptr -= src_stride;
/* 128 replicated into every halfword lane. */
2981 const_vec = __msa_ldi_h(128);
2984 filter_vec =
LD_SH(filter);
/* Prologue: three rows, interleaved pairwise (1 with 0, 2 with 1). */
2987 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2988 src0_ptr += (3 * src_stride);
2990 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2992 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* First half: two new source rows; all four src1_ptr rows are loaded here. */
2993 LD_SB2(src0_ptr, src_stride, src3, src4);
2994 src0_ptr += (2 * src_stride);
2995 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2996 src1_ptr += (4 * src2_stride);
2998 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3001 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3003 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
/* Second half: the last row read lands in src2 and src10_r/src21_r are
 * rebuilt, so the next iteration continues from them. */
3005 LD_SB2(src0_ptr, src_stride, src5, src2);
3006 src0_ptr += (2 * src_stride);
3008 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3011 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
3013 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
/* NOTE(review): tail of a call whose opening line is missing; the argument
 * layout matches HEVC_BI_RND_CLIP4 with a rounding shift of 7 -- confirm. */
3015 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3016 dst0_r, dst1_r, dst2_r, dst3_r);
3018 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3019 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
3020 dst += (4 * dst_stride);
/* NOTE(review): height-based dispatch.  Only the height == 6 condition is
 * visible; the other conditions and all callee names are missing from this
 * listing. */
3035 dst, dst_stride, filter, height);
3036 }
else if (6 == height) {
3038 dst, dst_stride, filter, height);
3041 src1_ptr, src2_stride,
3042 dst, dst_stride, filter, height);
/* NOTE(review): partial listing of a routine that produces four output rows
 * per loop iteration and stores through ST12x4_UB; signature and several
 * statements are missing. */
3056 v16i8
src0,
src1, src2, src3, src4, src5;
3057 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3058 v16i8 src10_r, src32_r, src21_r, src43_r;
3059 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3060 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
3061 v16i8 src2110, src4332;
3062 v8i16 dst0_l, dst1_l, filt0, filt1;
3063 v8i16 filter_vec, const_vec;
/* Start one row above the incoming source pointer. */
3065 src0_ptr -= (1 * src_stride);
/* 128 replicated into every halfword lane. */
3067 const_vec = __msa_ldi_h(128);
3070 filter_vec =
LD_SH(filter);
/* Prologue: three rows.  The right-half interleaves are kept per row pair;
 * the left-half interleaves of two row pairs are joined into src2110. */
3073 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3074 src0_ptr += (3 * src_stride);
3076 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3077 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3078 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
3080 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* Two new source rows; four rows of src1_ptr data at offset 0 and four
 * more at offset 8. */
3081 LD_SB2(src0_ptr, src_stride, src3, src4);
3082 src0_ptr += (2 * src_stride);
3083 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3084 LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
3085 src1_ptr += (4 * src2_stride);
3089 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3090 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3091 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
3094 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3096 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3098 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
/* Second half: the last row read lands in src2, and src10_r/src21_r and
 * src2110 are rebuilt for the next iteration. */
3100 LD_SB2(src0_ptr, src_stride, src5, src2);
3101 src0_ptr += (2 * src_stride);
3104 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3105 ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
3106 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
3109 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
3111 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
3113 DPADD_SB2_SH(src4332, src2110, filt0, filt1, dst1_l, dst1_l);
/* NOTE(review): tail of a call whose opening line is missing; the argument
 * layout matches HEVC_BI_RND_CLIP4 with a rounding shift of 7 -- confirm. */
3115 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3116 dst0_r, dst1_r, dst2_r, dst3_r);
3119 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3120 dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
3121 ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
3122 dst += (4 * dst_stride);
/* NOTE(review): partial listing of a routine whose loop runs height >> 2
 * times; each iteration produces four output rows as two groups of two.
 * Signature and several statements are missing. */
3136 v16i8
src0,
src1, src2, src3, src4, src5;
3137 v8i16 in0, in1, in2, in3;
3138 v16i8 src10_r, src32_r, src21_r, src43_r;
3139 v16i8 src10_l, src32_l, src21_l, src43_l;
3140 v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
3142 v8i16 filter_vec, const_vec;
/* Start one row above the incoming source pointer. */
3144 src0_ptr -= src_stride;
/* 128 replicated into every halfword lane. */
3146 const_vec = __msa_ldi_h(128);
3149 filter_vec =
LD_SH(filter);
/* Prologue: three rows, interleaved pairwise for both vector halves. */
3152 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3153 src0_ptr += (3 * src_stride);
3155 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3156 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3158 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* First group: two new source rows, src1_ptr data at offsets 0 and 8. */
3159 LD_SB2(src0_ptr, src_stride, src3, src4);
3160 src0_ptr += (2 * src_stride);
3161 LD_SH2(src1_ptr, src2_stride, in0, in1);
3162 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3163 src1_ptr += (2 * src2_stride);
3165 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3166 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3169 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3171 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3173 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3175 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
/* NOTE(review): tail of a call whose opening line is missing; the argument
 * layout matches HEVC_BI_RND_CLIP4 with a rounding shift of 7 -- confirm. */
3177 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3178 dst0_r, dst1_r, dst0_l, dst1_l);
/* Right and left halves of each row are packed into one vector per row. */
3180 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3181 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3182 dst += (2 * dst_stride);
/* Second group: the last row read lands in src2 and the src10/src21
 * interleaves are rebuilt for the next iteration. */
3184 LD_SB2(src0_ptr, src_stride, src5, src2);
3185 src0_ptr += (2 * src_stride);
3186 LD_SH2(src1_ptr, src2_stride, in0, in1);
3187 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3188 src1_ptr += (2 * src2_stride);
3190 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3191 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3194 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3196 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3198 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3200 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3202 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3203 dst0_r, dst1_r, dst0_l, dst1_l);
3205 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3206 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3207 dst += (2 * dst_stride);
/* NOTE(review): partial listing of a routine whose loop runs height >> 2
 * times; each iteration produces four output rows as two groups of two.
 * Every row is written as a full vector at dst plus an ST8x2_UB store at
 * dst + 16.  Signature and several statements are missing. */
3221 v16i8
src0,
src1, src2, src3, src4, src5;
3222 v16i8 src6, src7, src8, src9, src10, src11;
3223 v8i16 in0, in1, in2, in3, in4, in5;
3224 v16i8 src10_r, src32_r, src76_r, src98_r;
3225 v16i8 src21_r, src43_r, src87_r, src109_r;
3226 v16i8 src10_l, src32_l, src21_l, src43_l;
3227 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3228 v8i16 dst0_l, dst1_l;
3230 v8i16 filter_vec, const_vec;
/* Start one row above the incoming source pointer. */
3232 src0_ptr -= src_stride;
/* 128 replicated into every halfword lane. */
3234 const_vec = __msa_ldi_h(128);
3237 filter_vec =
LD_SH(filter);
/* Prologue, offset 0: three rows interleaved for both vector halves. */
3241 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3243 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3244 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
/* Prologue, offset 16: three rows, right-half interleave only. */
3246 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
3247 src0_ptr += (3 * src_stride);
3249 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3251 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* First group: two new rows at offset 0, src1_ptr data at 0, 8 and 16. */
3253 LD_SB2(src0_ptr, src_stride, src3, src4);
3254 LD_SH2(src1_ptr, src2_stride, in0, in1);
3255 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3256 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3257 src1_ptr += (2 * src2_stride);
3259 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3260 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
/* Matching two rows at offset 16. */
3262 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
3263 src0_ptr += (2 * src_stride);
3265 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3268 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3270 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3272 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3274 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3277 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3279 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
/* NOTE(review): tail of a call whose opening line is missing; the argument
 * layout matches HEVC_BI_RND_CLIP4 with a rounding shift of 7 -- confirm. */
3282 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3283 dst0_r, dst1_r, dst0_l, dst1_l);
3287 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3288 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
3289 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3290 ST8x2_UB(dst2_r, dst + 16, dst_stride);
3291 dst += (2 * dst_stride);
/* Second group: the last rows read land in src2 (offset 0) and src8
 * (offset 16), and the src10/src21/src76/src87 interleaves are rebuilt for
 * the next iteration. */
3294 LD_SB2(src0_ptr, src_stride, src5, src2);
3295 LD_SH2(src1_ptr, src2_stride, in0, in1);
3296 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3297 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3298 src1_ptr += (2 * src2_stride);
3300 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3301 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3303 LD_SB2(src0_ptr + 16, src_stride, src11, src8);
3304 src0_ptr += (2 * src_stride);
3306 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3309 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3311 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3313 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3315 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3318 DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
3320 DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
3323 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3324 dst0_r, dst1_r, dst0_l, dst1_l);
3327 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3328 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
3329 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3330 ST8x2_UB(dst2_r, dst + 16, dst_stride);
3331 dst += (2 * dst_stride);
/*
 * Excerpt from the body of a vertical bi-prediction kernel that uses two
 * filter vectors (filt0/filt1) and processes two 16-column halves per row:
 * the left half is stored through dst, the half at src0_ptr + 16 through
 * dst_tmp (whose declaration is not visible here). Presumably
 * hevc_vt_bi_4t_32w_msa -- TODO confirm; the signature is outside this excerpt.
 */
3346 v16i8
src0,
src1, src2, src3, src4, src6, src7, src8, src9, src10;
3347 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3348 v16i8 src10_r, src32_r, src76_r, src98_r;
3349 v16i8 src21_r, src43_r, src87_r, src109_r;
3350 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3351 v16i8 src10_l, src32_l, src76_l, src98_l;
3352 v16i8 src21_l, src43_l, src87_l, src109_l;
3353 v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
3355 v8i16 filter_vec, const_vec;
/* Back up one row: the first LD_SB3 below therefore starts one row above. */
3357 src0_ptr -= src_stride;
3359 const_vec = __msa_ldi_h(128);
3362 filter_vec =
LD_SH(filter);
/* Prime the left half: rows 0..2 interleaved as row pairs (1,0) and (2,1). */
3366 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3368 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3369 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
/* Prime the half at offset 16 in the same way (both _r and _l interleaves). */
3372 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
3373 src0_ptr += (3 * src_stride);
3375 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3376 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
/* Each iteration reads two new source rows and advances both outputs by two rows. */
3378 for (loop_cnt = (height >> 1); loop_cnt--;) {
/* Left half. */
3380 LD_SB2(src0_ptr, src_stride, src3, src4);
3381 LD_SH2(src1_ptr, src2_stride, in0, in1);
3382 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3383 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3384 LD_SH2((src1_ptr + 24), src2_stride, in6, in7);
3385 src1_ptr += (2 * src2_stride);
3387 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3388 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3391 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3393 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3395 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3397 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3400 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3401 dst0_r, dst1_r, dst0_l, dst1_l);
3409 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3410 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3411 dst += (2 * dst_stride);
/* Half at offset 16: same steps, result written through dst_tmp. */
3414 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
3415 src0_ptr += (2 * src_stride);
3417 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3418 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3421 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3423 DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
3425 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
3427 DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
3430 dst2_r, dst3_r, dst2_l, dst3_l, 7,
3431 dst2_r, dst3_r, dst2_l, dst3_l);
3433 PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
3434 ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
3435 dst_tmp += (2 * dst_stride);
/*
 * Excerpt of a kernel taking separate filter_x and filter_y tap pointers;
 * it loads five source rows in total and two rows of 16-bit input, so it is
 * presumably hevc_hv_bi_4t_4x2_msa -- TODO confirm. The signature, the
 * declarations of in0/in1 and mask1, the filter splats, the dot products and
 * the final store are not visible in this excerpt.
 */
3451 const int8_t *filter_x,
3452 const int8_t *filter_y,
3456 v16i8
src0,
src1, src2, src3, src4;
3458 v4i32 filt_h0, filt_h1;
/* Shuffle pattern that pairs each byte with its right-hand neighbour. */
3459 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3461 v8i16 filter_vec, const_vec;
3462 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3463 v8i16 dst0, dst1, dst2, dst3, dst4;
3464 v4i32 dst0_r, dst1_r;
3465 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
/* Back up one row and one column before the first load. */
3467 src0_ptr -= (src_stride + 1);
3469 filter_vec =
LD_SH(filter_x);
/*
 * Flag the negative filter_y bytes (clti) and interleave the flags with the
 * tap bytes, which widens the signed 8-bit taps to 16 bits.
 */
3472 filter_vec =
LD_SH(filter_y);
3473 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3474 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3480 const_vec = __msa_ldi_h(128);
/* First three rows, each shuffled against itself with mask0/mask1. */
3483 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3484 src0_ptr += (3 * src_stride);
3487 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3488 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3489 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
/* Interleave consecutive row results for the second filtering pass. */
3496 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
3498 LD_SB2(src0_ptr, src_stride, src3, src4);
3499 LD_SH2(src1_ptr, src2_stride, in0, in1);
/* Combine the low 64 bits of in0 and in1 into one vector. */
3500 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
3503 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3506 dst32_r = __msa_ilvr_h(dst3, dst2);
3510 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3513 dst43_r = __msa_ilvr_h(dst4, dst3);
/*
 * Narrow the 32-bit results to 16 bits, add the 16-bit input with signed
 * saturation, round-shift right by 7, then narrow to bytes (statements
 * between the shift and the byte pack are not visible here).
 */
3516 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
3517 dst0_r = (v4i32) __msa_adds_s_h((v8i16) dst0_r, in0);
3518 dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 7);
3521 dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
/*
 * Excerpt of a kernel taking separate filter_x and filter_y tap pointers;
 * it loads 3 + 4 source rows and writes a 4x4 block (ST4x4_UB), so it is
 * presumably hevc_hv_bi_4t_4x4_msa -- TODO confirm. The signature, mask1,
 * the filter splats and the dot-product statements are not visible here.
 */
3531 const int8_t *filter_x,
3532 const int8_t *filter_y,
3535 v8i16 in0, in1, in2, in3;
3536 v16i8
src0,
src1, src2, src3, src4, src5, src6;
3538 v4i32 filt_h0, filt_h1;
/* Shuffle pattern that pairs each byte with its right-hand neighbour. */
3539 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3541 v8i16 filter_vec, const_vec;
3542 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3543 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3544 v8i16 dst0_r, dst1_r;
3545 v4i32 tmp0, tmp1, tmp2, tmp3;
3546 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
/* Back up one row and one column before the first load. */
3548 src0_ptr -= (src_stride + 1);
3550 filter_vec =
LD_SH(filter_x);
/* Widen the signed 8-bit filter_y taps to 16 bits (negative flag + interleave). */
3553 filter_vec =
LD_SH(filter_y);
3554 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3555 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3561 const_vec = __msa_ldi_h(128);
/* First three rows, each shuffled against itself with mask0/mask1. */
3564 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3565 src0_ptr += (3 * src_stride);
3568 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3569 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3570 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3577 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
/* Four further source rows and the four matching rows of 16-bit input. */
3579 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3580 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3584 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3587 dst32_r = __msa_ilvr_h(dst3, dst2);
3591 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3594 dst43_r = __msa_ilvr_h(dst4, dst3);
3598 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
/* dst10_r / dst21_r are reused for the later row pairs (5,4) and (2,5). */
3601 dst10_r = __msa_ilvr_h(dst5, dst4);
3605 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3608 dst21_r = __msa_ilvr_h(dst2, dst5);
/* Narrow the four 32-bit results to 16 bits, then to bytes, and store 4x4. */
3611 PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
3614 dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
3615 ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
3616 dst += (4 * dst_stride);
/*
 * Excerpt of a kernel taking separate filter_x and filter_y tap pointers
 * that handles eight rows per loop iteration and stores them with ST4x8_UB;
 * presumably hevc_hv_bi_4t_4multx8mult_msa -- TODO confirm. The signature,
 * mask1, the filter splats and the dot-product statements are not visible.
 */
3625 const int8_t *filter_x,
3626 const int8_t *filter_y,
3630 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3631 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3633 v4i32 filt_h0, filt_h1;
/* Shuffle pattern that pairs each byte with its right-hand neighbour. */
3634 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3636 v8i16 filter_vec, const_vec;
3637 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3638 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
3639 v8i16 tmp0, tmp1, tmp2, tmp3;
3640 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3641 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3642 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
/* Back up one row and one column before the first load. */
3644 src0_ptr -= (src_stride + 1);
3646 filter_vec =
LD_SH(filter_x);
/* Widen the signed 8-bit filter_y taps to 16 bits (negative flag + interleave). */
3649 filter_vec =
LD_SH(filter_y);
3650 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3651 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3657 const_vec = __msa_ldi_h(128);
/* Prime with the first three rows before entering the loop. */
3660 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3661 src0_ptr += (3 * src_stride);
3664 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3665 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3666 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3673 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
/* Each iteration reads eight source rows / eight input rows and writes eight rows. */
3675 for (loop_cnt = height >> 3; loop_cnt--;) {
3676 LD_SB8(src0_ptr, src_stride,
3677 src3, src4, src5, src6, src7, src8, src9, src10);
3678 src0_ptr += (8 * src_stride);
3679 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3680 src1_ptr += (8 * src2_stride);
/* Per row: shuffle, then interleave the new row result with the previous one. */
3685 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3688 dst32_r = __msa_ilvr_h(dst3, dst2);
3692 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3695 dst43_r = __msa_ilvr_h(dst4, dst3);
3699 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3702 dst54_r = __msa_ilvr_h(dst5, dst4);
3706 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3709 dst65_r = __msa_ilvr_h(dst6, dst5);
3713 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3716 dst76_r = __msa_ilvr_h(dst7, dst6);
3720 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
3723 dst87_r = __msa_ilvr_h(dst8, dst7);
3727 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
/*
 * The last two interleaves are written back into dst10_r / dst21_r,
 * the names the loop-priming code above uses (presumably to carry row
 * history into the next iteration -- the consuming statements are not shown).
 */
3730 dst10_r = __msa_ilvr_h(dst9, dst8);
3734 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
3737 dst21_r = __msa_ilvr_h(dst2, dst9);
3741 dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
3743 tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
3746 ST4x8_UB(tmp0, tmp1, dst, dst_stride);
3747 dst += (8 * dst_stride);
/*
 * Excerpt of a dispatcher that picks a helper by block height: one branch
 * precedes the visible ones (its condition is not shown), then height == 4,
 * then any height that is a multiple of 8. Presumably hevc_hv_bi_4t_4w_msa
 * -- TODO confirm; the callee names are on lines not visible here.
 */
3757 const int8_t *filter_x,
3758 const int8_t *filter_y,
3763 dst, dst_stride, filter_x, filter_y, height);
3764 }
else if (4 == height) {
3766 dst, dst_stride, filter_x, filter_y, height);
3767 }
else if (0 == (height % 8)) {
3769 src1_ptr, src2_stride,
3771 filter_x, filter_y, height);
/*
 * Excerpt of a kernel taking separate filter_x and filter_y tap pointers
 * that processes four rows per loop iteration and stores them with
 * ST6x4_UB; presumably hevc_hv_bi_4t_6w_msa -- TODO confirm. The signature,
 * mask1, the filter splats and the dot-product statements are not visible.
 */
3781 const int8_t *filter_x,
3782 const int8_t *filter_y,
3786 v16i8
src0,
src1, src2, src3, src4, src5, src6;
3787 v8i16 in0, in1, in2, in3;
3789 v4i32 filt_h0, filt_h1;
/* Shuffle pattern that pairs each byte with its right-hand neighbour. */
3790 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3792 v8i16 filter_vec, const_vec;
3793 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3794 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3795 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3796 v8i16 tmp0, tmp1, tmp2, tmp3;
3797 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3798 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
/* Back up one row and one column before the first load. */
3800 src0_ptr -= (src_stride + 1);
3802 filter_vec =
LD_SH(filter_x);
/* Widen the signed 8-bit filter_y taps to 16 bits (negative flag + interleave). */
3805 filter_vec =
LD_SH(filter_y);
3806 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3807 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3813 const_vec = __msa_ldi_h(128);
/* Prime with the first three rows before entering the loop. */
3816 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3817 src0_ptr += (3 * src_stride);
3820 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3821 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3822 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
/* Each iteration reads four source rows / four input rows and writes four rows. */
3833 for (loop_cnt = height >> 2; loop_cnt--;) {
3834 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3835 src0_ptr += (4 * src_stride);
3836 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3837 src1_ptr += (4 * src2_stride);
3840 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3850 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3860 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3870 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3880 dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
3882 tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
/* Narrow to bytes and store four rows via the 6-wide store macro. */
3884 PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
3885 ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
3886 dst += (4 * dst_stride);
/*
 * Excerpt of a kernel taking separate filter_x and filter_y tap pointers
 * that loads 3 + 2 source rows and two rows of 16-bit input; presumably
 * hevc_hv_bi_4t_8x2_msa -- TODO confirm. The signature, the declarations of
 * in0/in1, tmp0/tmp1 and mask1, the dot products and the final store are
 * not visible in this excerpt.
 */
3896 const int8_t *filter_x,
3897 const int8_t *filter_y,
3900 v16i8
src0,
src1, src2, src3, src4;
3902 v4i32 filt_h0, filt_h1;
/* Shuffle pattern that pairs each byte with its right-hand neighbour. */
3903 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3905 v8i16 filter_vec, const_vec;
3906 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3907 v8i16 dst0, dst1, dst2, dst3, dst4;
3908 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3909 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3910 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
/* Back up one row and one column before the first load. */
3914 src0_ptr -= (src_stride + 1);
3916 filter_vec =
LD_SH(filter_x);
/* Widen the signed 8-bit filter_y taps to 16 bits (negative flag + interleave). */
3919 filter_vec =
LD_SH(filter_y);
3920 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3921 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3927 const_vec = __msa_ldi_h(128);
/* First three rows, each shuffled against itself with mask0/mask1. */
3930 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3931 src0_ptr += (3 * src_stride);
3934 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3935 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3936 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
/* Two further source rows and the two matching rows of 16-bit input. */
3947 LD_SB2(src0_ptr, src_stride, src3, src4);
3948 LD_SH2(src1_ptr, src2_stride, in0, in1);
3951 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3961 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
/* Narrow the 32-bit left/right halves to 16 bits, then both rows to bytes. */
3971 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
3974 dst0_r = (v4i32) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
/*
 * Excerpt of a kernel taking separate filter_x and filter_y tap pointers
 * that loads 3 + 6 source rows (in three pairs) and six rows of 16-bit
 * input; presumably hevc_hv_bi_4t_8x6_msa -- TODO confirm. The signature,
 * mask1, the dot products and the store of the last two rows (dst2_r) are
 * not visible in this excerpt.
 */
3984 const int8_t *filter_x,
3985 const int8_t *filter_y,
3988 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
3989 v8i16 in0, in1, in2, in3, in4, in5;
3991 v4i32 filt_h0, filt_h1;
/* Shuffle pattern that pairs each byte with its right-hand neighbour. */
3992 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3994 v8i16 filter_vec, const_vec;
3995 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3996 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3997 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3998 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3999 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
4000 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
4001 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
4002 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
4003 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
/* Back up one row and one column before the first load. */
4005 src0_ptr -= (src_stride + 1);
4007 filter_vec =
LD_SH(filter_x);
/* Widen the signed 8-bit filter_y taps to 16 bits (negative flag + interleave). */
4010 filter_vec =
LD_SH(filter_y);
4011 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4012 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4018 const_vec = __msa_ldi_h(128);
/* First three rows, each shuffled against itself with mask0/mask1. */
4021 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4022 src0_ptr += (3 * src_stride);
4024 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4025 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4026 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
/* Rows 3 and 4; all six rows of 16-bit input are loaded up front. */
4037 LD_SB2(src0_ptr, src_stride, src3, src4);
4038 src0_ptr += (2 * src_stride);
4040 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
4041 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
/* Each output row: 32-bit left/right halves narrowed into one 16-bit vector. */
4050 tmp0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
4052 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4061 tmp1 = __msa_pckev_h((v8i16) dst1_l, (v8i16) dst1_r);
/* Rows 5 and 6. */
4063 LD_SB2(src0_ptr, src_stride, src5, src6);
4064 src0_ptr += (2 * src_stride);
4067 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4076 tmp2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
4079 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4088 tmp3 = __msa_pckev_h((v8i16) dst3_l, (v8i16) dst3_r);
/* Rows 7 and 8. */
4090 LD_SB2(src0_ptr, src_stride, src7, src8);
4093 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4103 tmp4 = __msa_pckev_h((v8i16) dst4_l, (v8i16) dst4_r);
4105 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
4114 tmp5 = __msa_pckev_h((v8i16) dst5_l, (v8i16) dst5_r);
4117 tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
/* Narrow to bytes; the first four rows are stored here, dst2_r holds rows 5-6. */
4120 PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
4121 dst2_r = (v4i32) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4122 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
4123 dst += (4 * dst_stride);
/*
 * Excerpt of a kernel taking separate filter_x and filter_y tap pointers
 * with two nested loops: the outer one runs width >> 3 times, the inner one
 * height >> 2 times, storing four rows with ST8x4_UB each time. Presumably
 * hevc_hv_bi_4t_8multx4mult_msa -- TODO confirm. The signature, the
 * declarations of src0_ptr_tmp/dst_tmp/mask1, the dot products and the
 * per-column pointer advances are not visible in this excerpt.
 */
4133 const int8_t *filter_x,
4134 const int8_t *filter_y,
4138 uint32_t loop_cnt, cnt;
4140 int16_t *src1_ptr_tmp;
4142 v16i8
src0,
src1, src2, src3, src4, src5, src6;
4143 v8i16 in0, in1, in2, in3;
4145 v4i32 filt_h0, filt_h1;
/* Shuffle pattern that pairs each byte with its right-hand neighbour. */
4146 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4148 v8i16 filter_vec, const_vec;
4149 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4150 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4151 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4152 v8i16 tmp0, tmp1, tmp2, tmp3;
4153 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4154 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
/* Back up one row and one column before the first load. */
4156 src0_ptr -= (src_stride + 1);
4158 filter_vec =
LD_SH(filter_x);
/* Widen the signed 8-bit filter_y taps to 16 bits (negative flag + interleave). */
4161 filter_vec =
LD_SH(filter_y);
4162 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4163 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4169 const_vec = __msa_ldi_h(128);
/* Outer loop: one pass per group of 8 columns, working on temporary pointers. */
4172 for (cnt = width >> 3; cnt--;) {
4173 src0_ptr_tmp = src0_ptr;
4175 src1_ptr_tmp = src1_ptr;
/* Prime this column group with its first three rows. */
4177 LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
4178 src0_ptr_tmp += (3 * src_stride);
4181 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4182 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4183 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
/* Inner loop: four source rows / four input rows in, four rows out. */
4194 for (loop_cnt = height >> 2; loop_cnt--;) {
4195 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
4196 src0_ptr_tmp += (4 * src_stride);
4197 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
4198 src1_ptr_tmp += (4 * src2_stride);
4201 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4211 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4221 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4231 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4242 dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
4244 tmp0, tmp1, tmp2, tmp3, 7,
4245 tmp0, tmp1, tmp2, tmp3);
/* Narrow to bytes and store four 8-byte rows through dst_tmp. */
4247 PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
4248 ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride);
4249 dst_tmp += (4 * dst_stride);
/*
 * Excerpt of a dispatcher that picks a helper by block height: one branch
 * precedes the visible one (its condition is not shown), then height == 6,
 * and a final call that additionally passes the constant 8 after height.
 * Presumably hevc_hv_bi_4t_8w_msa -- TODO confirm; the callee names are on
 * lines not visible in this excerpt.
 */
4264 const int8_t *filter_x,
4265 const int8_t *filter_y,
4270 dst, dst_stride, filter_x, filter_y, height);
4271 }
else if (6 == height) {
4273 dst, dst_stride, filter_x, filter_y, height);
4276 src1_ptr, src2_stride,
4278 filter_x, filter_y, height, 8);
/*
 * Excerpt of a wrapper that forwards filter_x/filter_y to two helper calls;
 * the second call writes at dst + 8. Presumably hevc_hv_bi_4t_12w_msa
 * -- TODO confirm; the signature and the callee names are not visible here.
 */
4288 const int8_t *filter_x,
4289 const int8_t *filter_y,
4293 dst, dst_stride, filter_x, filter_y,
4296 dst + 8, dst_stride, filter_x, filter_y, height);
/*
 * Excerpt of a thin wrapper that forwards dst, dst_stride, filter_x and
 * filter_y to a helper call. Presumably hevc_hv_bi_4t_16w_msa -- TODO
 * confirm; the signature and the callee name are not visible here.
 */
4305 const int8_t *filter_x,
4306 const int8_t *filter_y,
4310 dst, dst_stride, filter_x, filter_y,
/*
 * Excerpt of a thin wrapper that forwards dst, dst_stride, filter_x and
 * filter_y to a helper call. Presumably hevc_hv_bi_4t_24w_msa -- TODO
 * confirm; the signature and the callee name are not visible here.
 */
4320 const int8_t *filter_x,
4321 const int8_t *filter_y,
4325 dst, dst_stride, filter_x, filter_y,
/*
 * Excerpt of a thin wrapper that forwards dst, dst_stride, filter_x and
 * filter_y to a helper call. Presumably hevc_hv_bi_4t_32w_msa -- TODO
 * confirm; the signature and the callee name are not visible here.
 * Both filter pointers are read-only; each carries a single const qualifier.
 */
4335 const int8_t *filter_x,
4336 const int8_t *filter_y,
4340 dst, dst_stride, filter_x, filter_y,
/*
 * Generates the public entry point ff_hevc_put_hevc_bi_pel_pixels<WIDTH>_8_msa,
 * which calls hevc_bi_copy_<WIDTH>w_msa with MAX_PB_SIZE as the stride of the
 * 16-bit input (src_16bit). Some parameter lines of the generated signature
 * are not visible in this excerpt.
 */
4344 #define BI_MC_COPY(WIDTH) \
4345 void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
4346 ptrdiff_t dst_stride, \
4348 ptrdiff_t src_stride, \
4349 int16_t *src_16bit, \
4355 hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \
4356 dst, dst_stride, height); \
/*
 * Generates the public entry point ff_hevc_put_hevc_bi_<PEL>_<DIR><WIDTH>_8_msa
 * for a one-directional filter. The tap table row is selected as
 * ff_hevc_<PEL>_filters[FILT_DIR - 1] (FILT_DIR is instantiated below as mx
 * or my), and the work is done by hevc_<DIR1>_bi_<TAP>t_<WIDTH>w_msa with
 * MAX_PB_SIZE as the stride of the 16-bit input. Some lines of the macro
 * body are not visible in this excerpt.
 */
4371 #define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
4372 void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
4373 ptrdiff_t dst_stride, \
4375 ptrdiff_t src_stride, \
4376 int16_t *src_16bit, \
4382 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
4384 hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
4385 MAX_PB_SIZE, dst, dst_stride, \
/* qpel, horizontal: TAP = 8, hz kernels, filter row chosen by mx. */
4389 BI_MC(qpel,
h, 4, 8, hz, mx);
4390 BI_MC(qpel,
h, 8, 8, hz, mx);
4391 BI_MC(qpel,
h, 12, 8, hz, mx);
4392 BI_MC(qpel,
h, 16, 8, hz, mx);
4393 BI_MC(qpel,
h, 24, 8, hz, mx);
4394 BI_MC(qpel,
h, 32, 8, hz, mx);
4395 BI_MC(qpel,
h, 48, 8, hz, mx);
4396 BI_MC(qpel,
h, 64, 8, hz, mx);
/* qpel, vertical: TAP = 8, vt kernels, filter row chosen by my. */
4398 BI_MC(qpel, v, 4, 8, vt, my);
4399 BI_MC(qpel, v, 8, 8, vt, my);
4400 BI_MC(qpel, v, 12, 8, vt, my);
4401 BI_MC(qpel, v, 16, 8, vt, my);
4402 BI_MC(qpel, v, 24, 8, vt, my);
4403 BI_MC(qpel, v, 32, 8, vt, my);
4404 BI_MC(qpel, v, 48, 8, vt, my);
4405 BI_MC(qpel, v, 64, 8, vt, my);
/* epel, horizontal: TAP = 4, hz kernels, filter row chosen by mx. */
4407 BI_MC(epel,
h, 4, 4, hz, mx);
4408 BI_MC(epel,
h, 8, 4, hz, mx);
4409 BI_MC(epel,
h, 6, 4, hz, mx);
4410 BI_MC(epel,
h, 12, 4, hz, mx);
4411 BI_MC(epel,
h, 16, 4, hz, mx);
4412 BI_MC(epel,
h, 24, 4, hz, mx);
4413 BI_MC(epel,
h, 32, 4, hz, mx);
/* epel, vertical: TAP = 4, vt kernels, filter row chosen by my. */
4415 BI_MC(epel, v, 4, 4, vt, my);
4416 BI_MC(epel, v, 8, 4, vt, my);
4417 BI_MC(epel, v, 6, 4, vt, my);
4418 BI_MC(epel, v, 12, 4, vt, my);
4419 BI_MC(epel, v, 16, 4, vt, my);
4420 BI_MC(epel, v, 24, 4, vt, my);
4421 BI_MC(epel, v, 32, 4, vt, my);
/*
 * Generates the public entry point ff_hevc_put_hevc_bi_<PEL>_<DIR><WIDTH>_8_msa
 * for a two-directional filter. filter_x is taken from
 * ff_hevc_<PEL>_filters[mx - 1] and filter_y from ff_hevc_<PEL>_filters[my - 1];
 * both are passed to hevc_<DIR1>_bi_<TAP>t_<WIDTH>w_msa together with
 * MAX_PB_SIZE as the stride of the 16-bit input. The end of the macro body
 * is not visible in this excerpt.
 */
4425 #define BI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \
4426 void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
4427 ptrdiff_t dst_stride, \
4429 ptrdiff_t src_stride, \
4430 int16_t *src_16bit, \
4436 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
4437 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
4439 hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
4440 MAX_PB_SIZE, dst, dst_stride, \
4441 filter_x, filter_y, \
#define HEVC_PCK_SW_SB2(in0, in1, out)
static void hevc_vt_bi_8t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1)
#define XORI_B8_128_SB(...)
static void hevc_hz_bi_4t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_bi_copy_16multx4mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t width)
static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void hevc_hz_bi_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hv_bi_8t_8multx2mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width)
static void hevc_hv_bi_4t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_bi_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define XORI_B2_128_SB(...)
static void hevc_hz_bi_8t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_vt_bi_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hz_bi_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define XORI_B3_128_SB(...)
static void hevc_vt_bi_8t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_bi_copy_6w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void hevc_bi_copy_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_bi_8t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define DPADD_SB4_SH(...)
#define SPLATI_H2_SH(...)
static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hz_bi_4t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
static void hevc_vt_bi_8t_16multx2mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t width)
#define HEVC_PCK_SW_SB4(in0, in1, in2, in3, out)
static void hevc_hz_bi_8t_64w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_vt_bi_8t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_vt_bi_4t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define CLIP_SH_0_255(in)
static void filter(int16_t *output, ptrdiff_t out_stride, int16_t *low, ptrdiff_t low_stride, int16_t *high, ptrdiff_t high_stride, int len, uint8_t clip)
#define SPLATI_H4_SH(...)
static void hevc_bi_copy_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
#define BI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1)
#define CLIP_SW_0_255(in)
static void hevc_hv_bi_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_bi_copy_64w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3,vec0, vec1, vec2, vec3, rnd_val,out0, out1, out2, out3)
#define ST8x2_UB(in, pdst, stride)
static void hevc_vt_bi_8t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define XORI_B7_128_SB(...)
static void hevc_hz_bi_4t_8x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hv_bi_8t_64w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define XORI_B4_128_SB(...)
static void hevc_bi_copy_48w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void hevc_hv_bi_8t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_vt_bi_8t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_vt_bi_8t_64w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define DPADD_SB2_SH(...)
static void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_vt_bi_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define SPLATI_W4_SW(...)
#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)
static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
#define HEVC_FILT_8TAP(in0, in1, in2, in3,filt0, filt1, filt2, filt3)
static void hevc_hv_bi_8t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hz_bi_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define UNPCK_SH_SW(in, out0, out1)
static void hevc_hv_bi_4t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_vt_bi_4t_8x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define BI_MC_COPY(WIDTH)
static void hevc_vt_bi_8t_48w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hz_bi_8t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_bi_copy_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hz_bi_8t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hz_bi_4t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define SLLI_4V(in0, in1, in2, in3, shift)
#define ST4x8_UB(in0, in1, pdst, stride)
static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define ST6x4_UB(in0, in1, pdst, stride)
static void hevc_hz_bi_8t_48w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define HEVC_FILT_4TAP(in0, in1, filt0, filt1)
static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_vt_bi_8t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define ST8x4_UB(in0, in1, pdst, stride)
static void hevc_vt_bi_4t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define XORI_B6_128_SB(...)
static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hv_bi_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_bi_8t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define ST12x4_UB(in0, in1, in2, pdst, stride)
static void hevc_hz_bi_8t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define SPLATI_W2_SW(...)
static void hevc_hv_bi_8t_48w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define ST4x2_UB(in, pdst, stride)
static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width)
static void hevc_bi_copy_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void hevc_vt_bi_4t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)