/* Pack even bytes of in0/in1, average with dst (rounding), store at pdst. */
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                      \
{                                                                 \
    v16u8 tmp_m;                                                  \
    tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);      \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);                   \
    ST_UB(tmp_m, (pdst));                                         \
}
/* Pack even bytes of four vector pairs and store four 16-byte rows. */
#define PCKEV_ST_SB4(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
    uint8_t *pdst_m = (uint8_t *) (pdst);                                   \
    PCKEV_B4_SB(in0, in1, in2, in3, in4, in5, in6, in7,                     \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                            \
    ST_SB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst_m, stride);                 \
}
/* Pack four 8-byte results, average them with four packed dst rows
   (rounding), and store them as four 8-byte rows at pdst. */
#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
                           pdst, stride)                                \
{                                                                       \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
    uint8_t *pdst_m = (uint8_t *) (pdst);                               \
    PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);                    \
    PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                \
    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);        \
    ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride);                  \
}
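/* For orientation, the kernels below vectorize scalar loops of the
   following shape; this reference sketch is illustrative only (it is not
   part of the original file, and the function name is hypothetical).
   The no_rnd variants would use (a + b) >> 1 in place of the rounding
   average. */
#if 0
static void hz_bil_scalar_ref(const uint8_t *src, int32_t src_stride,
                              uint8_t *dst, int32_t dst_stride,
                              int32_t width, int32_t height)
{
    int32_t x, y;

    for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
            dst[x] = (src[x] + src[x + 1] + 1) >> 1;  /* rounding average */
        src += src_stride;
        dst += dst_stride;
    }
}
#endif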
                        uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src += (2 * src_stride);
        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
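/* Horizontal bilinear: src*_sld1 is the source row shifted left by one
   byte (SLDI), so averaging src with src*_sld1 averages every pixel with
   its right-hand neighbour. */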
                        uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);
                   src0_sld1, src1_sld1, src2_sld1, src3_sld1);
                      src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
        dst += (4 * dst_stride);
                        uint8_t *dst, int32_t dst_stride,
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
               src8, src9, src10, src11, src12, src13, src14, src15);
        src += (8 * src_stride);
        dst += (4 * dst_stride);
        AVER_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
        dst += (4 * dst_stride);
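/* no_rnd variants follow: the AVE_* stores use the truncating average
   (a + b) >> 1, where the rounding AVER_* path uses (a + b + 1) >> 1. */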
                        uint8_t *dst, int32_t dst_stride)
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16i8 src4_sld1, src5_sld1, src6_sld1, src7_sld1;

    src += (8 * src_stride);
               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
    SLDI_B4_SB(zeros, src4, zeros, src5, zeros, src6, zeros, src7, 1,
               src4_sld1, src5_sld1, src6_sld1, src7_sld1);
                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
    dst += (4 * dst_stride);
                 src6, src6_sld1, src7, src7_sld1, dst, dst_stride);
                        uint8_t *dst, int32_t dst_stride)
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;

               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
                        uint8_t *dst, int32_t dst_stride)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

           src8, src9, src10, src11, src12, src13, src14, src15);
    src += (8 * src_stride);
    dst += (4 * dst_stride);
    LD_UB4((src + 1), src_stride, src8, src9, src10, src11);
    src += (4 * src_stride);
    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
    dst += (4 * dst_stride);
    LD_UB4(src, src_stride, src4, src5, src6, src7);
    LD_UB4((src + 1), src_stride, src12, src13, src14, src15);
    src += (4 * src_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
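/* For 16 output pixels per row, the shifted operand is loaded directly
   from src + 1 instead of being built with SLDI. */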
                        uint8_t *dst, int32_t dst_stride)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

           src8, src9, src10, src11, src12, src13, src14, src15);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
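/* *_and_aver_dst variants: the interpolated result is averaged once more
   with the bytes already at dst (read-modify-write averaging). */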
                        uint8_t *dst, int32_t dst_stride,
    uint32_t dst0, dst1, out0, out1;
    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
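    /* 4-pixel rows cross between scalar and vector registers: two 32-bit
       dst words come in via LW + __msa_insert_w and the two results leave
       via __msa_copy_u_w. */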
    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src += (2 * src_stride);
        dst1 = LW(dst + dst_stride);
        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);
        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
                        uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);
                   src0_sld1, src1_sld1, src2_sld1, src3_sld1);
                   src3, src3_sld1, dst, dst_stride);
        dst += (4 * dst_stride);
                        uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
               src8, src9, src10, src11, src12, src13, src14, src15);
        src += (8 * src_stride);
        dst += (4 * dst_stride);
        dst += (4 * dst_stride);
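/* Vertical bilinear: each output row is the average of two vertically
   adjacent input rows (src row n and row n + 1). */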
                        uint8_t *dst, int32_t dst_stride,
    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src += (2 * src_stride);
        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
                        uint8_t *dst, int32_t dst_stride,
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);
        dst += (4 * dst_stride);
                        uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        src += (8 * src_stride);
        dst += (4 * dst_stride);
        dst += (4 * dst_stride);
                        uint8_t *dst, int32_t dst_stride)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;

    src += (8 * src_stride);
    dst += (4 * dst_stride);
    AVE_ST8x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                        uint8_t *dst, int32_t dst_stride)
                        uint8_t *dst, int32_t dst_stride)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15, src16;

    src += (8 * src_stride);
           src8, src9, src10, src11, src12, src13, src14, src15);
    src += (8 * src_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src8, src9, src9, src10, src10, src11, src11, src12,
    dst += (4 * dst_stride);
                  src14, src15, src15, src16, dst, dst_stride);
                        uint8_t *dst, int32_t dst_stride)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;

    src += (8 * src_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                        uint8_t *dst, int32_t dst_stride,
    uint32_t out0, out1, dst0, dst1;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src += (2 * src_stride);
        dst1 = LW(dst + dst_stride);
        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);
        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
                        uint8_t *dst, int32_t dst_stride,
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);
        dst += (4 * dst_stride);
                        uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 res0, res1, res2, res3, res4, res5, res6, res7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        src += (8 * src_stride);
                    res0, res1, res2, res3);
        AVER_UB4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                    res4, res5, res6, res7);
        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        AVER_UB4_UB(dst0, res0, dst1, res1, dst2, res2, dst3, res3,
                    res0, res1, res2, res3);
        AVER_UB4_UB(dst4, res4, dst5, res5, dst6, res6, dst7, res7,
                    res4, res5, res6, res7);
        ST_UB8(res0, res1, res2, res3, res4, res5, res6, res7, dst, dst_stride);
        dst += (8 * dst_stride);
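/* hv (2x2) bilinear: horizontal pair sums of two consecutive rows are
   added, and a 2-bit right shift (with the appropriate rounding constant)
   produces the output pixels. */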
                        uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
    v16u8 src0_r, src1_r, src2_r, res;
    v8u16 add0, add1, add2, sum0, sum1;
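    /* Interleaving each row with its 1-byte-shifted copy (ILVR_B*) and
       horizontally adding the byte pairs (HADD_UB*) yields per-pixel
       a + b sums in 16-bit lanes; adding two rows' sums gives the full
       2x2 tap total. */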
    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src += (2 * src_stride);
                  src1_sld1, src2_sld1);
                  src0_r, src1_r, src2_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        ADD2(add0, add1, add1, add2, sum0, sum1);
        res = (v16u8) __msa_pckev_b((v16i8) sum1, (v16i8) sum0);
        res0 = __msa_copy_u_w((v4i32) res, 0);
        res1 = __msa_copy_u_w((v4i32) res, 2);
                        uint8_t *dst, int32_t dst_stride,
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
    v8u16 add0, add1, add2, add3, add4;
    v8u16 sum0, sum1, sum2, sum3;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);
                  src1_sld1, src2_sld1);
        SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
        ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
             sum0, sum1, sum2, sum3);
        dst += (4 * dst_stride);
                        uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v8u16 src7_l, src8_l;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
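    /* At 16 pixels per row the 16-bit sums no longer fit one vector, so
       right (_r) and left (_l) halves are computed separately and
       re-packed on store with PCKEV_ST_SB4. */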
    for (loop_cnt = (height >> 3); loop_cnt--;) {
               src9, src10, src11, src12, src13, src14, src15, src16);
        src += (8 * src_stride);
        HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
        HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
        HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
        HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
        HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
        HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
        ADD4(src0_r, src1_r, src1_r, src2_r, src2_r, src3_r, src3_r, src4_r,
             sum0_r, sum1_r, sum2_r, sum3_r);
        ADD4(src4_r, src5_r, src5_r, src6_r, src6_r, src7_r, src7_r, src8_r,
             sum4_r, sum5_r, sum6_r, sum7_r);
        ADD4(src0_l, src1_l, src1_l, src2_l, src2_l, src3_l, src3_l, src4_l,
             sum0_l, sum1_l, sum2_l, sum3_l);
        ADD4(src4_l, src5_l, src5_l, src6_l, src6_l, src7_l, src7_l, src8_l,
             sum4_l, sum5_l, sum6_l, sum7_l);
        PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, sum2_l, sum2_r,
                     sum3_l, sum3_r, dst, dst_stride);
        dst += (4 * dst_stride);
        PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r, sum6_l, sum6_r,
                     sum7_l, sum7_r, dst, dst_stride);
        dst += (4 * dst_stride);
                        uint8_t *dst, int32_t dst_stride)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16u8 src4_sld1, src5_sld1, src6_sld1, src7_sld1, src8_sld1;
    v8u16 src0_r, src1_r, src2_r, src3_r;
    v8u16 src4_r, src5_r, src6_r, src7_r, src8_r;
    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
    v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;

    src += (8 * src_stride);
               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
    SLDI_B3_UB(zeros, src4, zeros, src5, zeros, src6, 1, src4_sld1,
               src5_sld1, src6_sld1);
    SLDI_B2_UB(zeros, src7, zeros, src8, 1, src7_sld1, src8_sld1);
               src3, src0_r, src1_r, src2_r, src3_r);
    ILVR_B3_UH(src4_sld1, src4, src5_sld1, src5, src6_sld1, src6, src4_r,
    ILVR_B2_UH(src7_sld1, src7, src8_sld1, src8, src7_r, src8_r);
    HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
    HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
    HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
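    /* no_rnd rounding: add 1 (rather than 2) before the >> 2 shift, i.e.
       (a + b + c + d + 1) >> 2 per output pixel. */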
    sum0 = add0 + add1 + 1;
    sum1 = add1 + add2 + 1;
    sum2 = add2 + add3 + 1;
    sum3 = add3 + add4 + 1;
    sum4 = add4 + add5 + 1;
    sum5 = add5 + add6 + 1;
    sum6 = add6 + add7 + 1;
    sum7 = add7 + add8 + 1;
    SRA_4V(sum0, sum1, sum2, sum3, 2);
    SRA_4V(sum4, sum5, sum6, sum7, 2);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
                        uint8_t *dst, int32_t dst_stride)
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r;
    v8u16 add0, add1, add2, add3, add4;
    v8u16 sum0, sum1, sum2, sum3;

    src += (4 * src_stride);
              src1_sld1, src2_sld1);
    SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
    ILVR_B2_UH(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
    HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
    sum0 = add0 + add1 + 1;
    sum1 = add1 + add2 + 1;
    sum2 = add2 + add3 + 1;
    sum3 = add3 + add4 + 1;
    SRA_4V(sum0, sum1, sum2, sum3, 2);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
                        uint8_t *dst, int32_t dst_stride)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v8u16 src7_l, src8_l;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;

           src9, src10, src11, src12, src13, src14, src15, src16);
    src += (8 * src_stride);
    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
    sum0_r = src0_r + src1_r + 1;
    sum1_r = src1_r + src2_r + 1;
    sum2_r = src2_r + src3_r + 1;
    sum3_r = src3_r + src4_r + 1;
    sum4_r = src4_r + src5_r + 1;
    sum5_r = src5_r + src6_r + 1;
    sum6_r = src6_r + src7_r + 1;
    sum7_r = src7_r + src8_r + 1;
    sum0_l = src0_l + src1_l + 1;
    sum1_l = src1_l + src2_l + 1;
    sum2_l = src2_l + src3_l + 1;
    sum3_l = src3_l + src4_l + 1;
    sum4_l = src4_l + src5_l + 1;
    sum5_l = src5_l + src6_l + 1;
    sum6_l = src6_l + src7_l + 1;
    sum7_l = src7_l + src8_l + 1;
    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
    dst += (4 * dst_stride);
           src9, src10, src11, src12, src13, src14, src15, src16);
    src += (8 * src_stride);
                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
    dst += (4 * dst_stride);
    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
    sum0_r = src0_r + src1_r + 1;
    sum1_r = src1_r + src2_r + 1;
    sum2_r = src2_r + src3_r + 1;
    sum3_r = src3_r + src4_r + 1;
    sum4_r = src4_r + src5_r + 1;
    sum5_r = src5_r + src6_r + 1;
    sum6_r = src6_r + src7_r + 1;
    sum7_r = src7_r + src8_r + 1;
    sum0_l = src0_l + src1_l + 1;
    sum1_l = src1_l + src2_l + 1;
    sum2_l = src2_l + src3_l + 1;
    sum3_l = src3_l + src4_l + 1;
    sum4_l = src4_l + src5_l + 1;
    sum5_l = src5_l + src6_l + 1;
    sum6_l = src6_l + src7_l + 1;
    sum7_l = src7_l + src8_l + 1;
    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
    dst += (4 * dst_stride);
                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
                        uint8_t *dst, int32_t dst_stride)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v8u16 src7_l, src8_l;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;

           src9, src10, src11, src12, src13, src14, src15, src16);
    src += (8 * src_stride);
    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
    sum0_r = src0_r + src1_r + 1;
    sum1_r = src1_r + src2_r + 1;
    sum2_r = src2_r + src3_r + 1;
    sum3_r = src3_r + src4_r + 1;
    sum4_r = src4_r + src5_r + 1;
    sum5_r = src5_r + src6_r + 1;
    sum6_r = src6_r + src7_r + 1;
    sum7_r = src7_r + src8_r + 1;
    sum0_l = src0_l + src1_l + 1;
    sum1_l = src1_l + src2_l + 1;
    sum2_l = src2_l + src3_l + 1;
    sum3_l = src3_l + src4_l + 1;
    sum4_l = src4_l + src5_l + 1;
    sum5_l = src5_l + src6_l + 1;
    sum6_l = src6_l + src7_l + 1;
    sum7_l = src7_l + src8_l + 1;
    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
    dst += (4 * dst_stride);
                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
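/* hv *_and_aver_dst variants: the 2x2 result is additionally averaged
   with the current dst rows (loaded with LD_UB*) before the store. */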
                        uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
    v16u8 src0_r, src1_r, src2_r;
    v8u16 add0, add1, add2, sum0, sum1;
    v16u8 dst0, dst1, res0, res1;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src += (2 * src_stride);
        LD_UB2(dst, dst_stride, dst0, dst1);
                  src1_sld1, src2_sld1);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        ADD2(add0, add1, add1, add2, sum0, sum1);
        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
                        uint8_t *dst, int32_t dst_stride,
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
    v8u16 add0, add1, add2, add3, add4;
    v8u16 sum0, sum1, sum2, sum3;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
                  src1_sld1, src2_sld1);
        SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
        ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
             sum0, sum1, sum2, sum3);
                           sum2, dst2, sum3, dst3, dst, dst_stride);
        dst += (4 * dst_stride);
                        uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11, src12, src13, src14, src15, src16, src17;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v16u8 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v16u8 src7_l, src8_l;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
               src9, src10, src11, src12, src13, src14, src15, src16);
        src += (8 * src_stride);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
        HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_r, sum1_r,
        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_r, sum5_r,
        HADD_UB3_UH(src0_l, src1_l, src2_l, add0, add1, add2);
        HADD_UB3_UH(src3_l, src4_l, src5_l, add3, add4, add5);
        HADD_UB3_UH(src6_l, src7_l, src8_l, add6, add7, add8);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_l, sum1_l,
        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_l, sum5_l,
        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
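/* Plain width-specific copy and byte-average helpers used by the exported
   put/avg wrappers below. */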
                        uint8_t *dst, int32_t dst_stride,
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
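    /* height is consumed in the largest chunks it divides into evenly:
       12 rows, then multiples of 8, 4 and finally 2 rows per iteration. */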
        for (cnt = (height / 12); cnt--;) {
            src += (8 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
            src += (4 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
    } else if (0 == height % 8) {
        for (cnt = height >> 3; cnt--;) {
            src += (8 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
    } else if (0 == height % 4) {
        for (cnt = (height / 4); cnt--;) {
            src += (4 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
    } else if (0 == height % 2) {
        for (cnt = (height / 2); cnt--;) {
            src += (2 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
                        uint8_t *dst, int32_t dst_stride,
    const uint8_t *src_tmp;

    for (cnt = (width >> 4); cnt--;) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_UB8(src_tmp, src_stride,
            src_tmp += (8 * src_stride);
                   dst_tmp, dst_stride);
            dst_tmp += (8 * dst_stride);
                        uint8_t *dst, int32_t dst_stride,
        for (cnt = (height / 12); cnt--;) {
            src += (8 * src_stride);
            dst += (8 * dst_stride);
            src += (4 * src_stride);
            dst += (4 * dst_stride);
    } else if (0 == height % 8) {
    } else if (0 == height % 4) {
        for (cnt = (height >> 2); cnt--;) {
            src += (4 * src_stride);
            dst += (4 * dst_stride);
                        uint8_t *dst, int32_t dst_stride,
    uint32_t out0, out1, out2, out3;
    v16u8 dst0, dst1, dst2, dst3;

        for (cnt = (height / 4); cnt--;) {
            src += (4 * src_stride);
            LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
                        dst0, dst1, dst2, dst3);
            out0 = __msa_copy_u_w((v4i32) dst0, 0);
            out1 = __msa_copy_u_w((v4i32) dst1, 0);
            out2 = __msa_copy_u_w((v4i32) dst2, 0);
            out3 = __msa_copy_u_w((v4i32) dst3, 0);
            SW4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
    } else if (0 == (height % 2)) {
        for (cnt = (height / 2); cnt--;) {
            src += (2 * src_stride);
            LD_UB2(dst, dst_stride, dst0, dst1);
            out0 = __msa_copy_u_w((v4i32) dst0, 0);
            out1 = __msa_copy_u_w((v4i32) dst1, 0);
                        uint8_t *dst, int32_t dst_stride,
    uint64_t out0, out1, out2, out3;
    v16u8 dst0, dst1, dst2, dst3;

    for (cnt = (height / 4); cnt--;) {
        src += (4 * src_stride);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
                    dst0, dst1, dst2, dst3);
        out0 = __msa_copy_u_d((v2i64) dst0, 0);
        out1 = __msa_copy_u_d((v2i64) dst1, 0);
        out2 = __msa_copy_u_d((v2i64) dst2, 0);
        out3 = __msa_copy_u_d((v2i64) dst3, 0);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
                        uint8_t *dst, int32_t dst_stride,
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for (cnt = (height / 8); cnt--;) {
        src += (8 * src_stride);
        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
                    dst0, dst1, dst2, dst3);
        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
                    dst4, dst5, dst6, dst7);
        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
        dst += (8 * dst_stride);
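/* Exported wrappers: thin entry points taking (pixels, line_size, h)-style
   arguments; the visible h == 8 / h == 4 branches dispatch to the
   fixed-size helpers above. */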
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
    } else if (h == 8) {
                            ptrdiff_t line_size, int h)
    } else if (h == 8) {
                            const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
    } else if (h == 8) {
                            ptrdiff_t line_size, int h)
    } else if (h == 4) {
                            ptrdiff_t line_size, int h)
    } else if (h == 4) {
                            ptrdiff_t line_size, int h)
    } else if (h == 4) {
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)
                            ptrdiff_t line_size, int h)