/* Apply the H.264 6-tap half-pel filter (1, -5, 20, 20, -5, 1) to six
 * rows of 16-bit intermediates, with a rounding shift of 10 and
 * saturation to the signed 8-bit range. */
#define AVC_CALC_DPADD_H_6PIX_2COEFF_SH(in0, in1, in2, in3, in4, in5) \
    v4i32 tmp0_m, tmp1_m; \
    v8i16 out0_m, out1_m, out2_m, out3_m; \
    v8i16 minus5h_m = __msa_ldi_h(-5); \
    v8i16 plus20h_m = __msa_ldi_h(20); \
    ILVRL_H2_SW(in5, in0, tmp0_m, tmp1_m); \
    tmp0_m = __msa_hadd_s_w((v8i16) tmp0_m, (v8i16) tmp0_m); \
    tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m); \
    ILVRL_H2_SH(in1, in4, out0_m, out1_m); \
    DPADD_SH2_SW(out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m); \
    ILVRL_H2_SH(in2, in3, out2_m, out3_m); \
    DPADD_SH2_SW(out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m); \
    SRARI_W2_SW(tmp0_m, tmp1_m, 10); \
    SAT_SW2_SW(tmp0_m, tmp1_m, 7); \
    out0_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
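
/* Byte-shuffle masks for the horizontal 6-tap filter: the first three
 * rows appear to serve the 8-pixel-wide cases, the next three the
 * 4-pixel-wide cases (indices 16 and up select from a second source
 * vector), and the last two rows pick byte windows that straddle both
 * input vectors. */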
    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,

    0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
    1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
    2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,

    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25,
    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26
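
/* Byte-domain variant of the 6-tap filter: widens six rows of bytes
 * into two 16-bit accumulators (out1, out2) with the coefficient
 * pairs (+1, -5, +20). */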
#define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5, \
                                        out1, out2) \
    v16i8 tmp0_m, tmp1_m; \
    v16i8 minus5b_m = __msa_ldi_b(-5); \
    v16i8 plus20b_m = __msa_ldi_b(20); \
    ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m); \
    HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2); \
    ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m); \
    DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2); \
    ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m); \
    DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2); \
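
/* Right-half-only byte variant: filters the low halves of six byte
 * rows into a single 16-bit accumulator. */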
#define AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5) \
    v8i16 tmp1_m; \
    v16i8 tmp0_m, tmp2_m; \
    v16i8 minus5b_m = __msa_ldi_b(-5); \
    v16i8 plus20b_m = __msa_ldi_b(20); \
    tmp1_m = (v8i16) __msa_ilvr_b((v16i8) vec5, (v16i8) vec0); \
    tmp1_m = __msa_hadd_s_h((v16i8) tmp1_m, (v16i8) tmp1_m); \
    ILVR_B2_SB(vec4, vec1, vec3, vec2, tmp0_m, tmp2_m); \
    DPADD_SB2_SH(tmp0_m, tmp2_m, minus5b_m, plus20b_m, tmp1_m, tmp1_m); \
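
/* Right-half-only 16-bit variant: one v4i32 accumulator, rounded
 * (shift 10), saturated, and packed back to 16 bits. */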
#define AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5) \
    v4i32 tmp1_m; \
    v8i16 tmp2_m, tmp3_m; \
    v8i16 minus5h_m = __msa_ldi_h(-5); \
    v8i16 plus20h_m = __msa_ldi_h(20); \
    tmp1_m = (v4i32) __msa_ilvr_h((v8i16) vec5, (v8i16) vec0); \
    tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m); \
    ILVR_H2_SH(vec1, vec4, vec2, vec3, tmp2_m, tmp3_m); \
    DPADD_SH2_SW(tmp2_m, tmp3_m, minus5h_m, plus20h_m, tmp1_m, tmp1_m); \
    tmp1_m = __msa_srari_w(tmp1_m, 10); \
    tmp1_m = __msa_sat_s_w(tmp1_m, 7); \
    tmp2_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp1_m); \
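
/* Horizontal 6-tap filter on xor-biased source bytes: three shuffle
 * masks gather the tap pixels, which are then widened and accumulated
 * with the (+1, -5, +20) coefficient pairs. */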
#define AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1, \
                                                    mask0, mask1, mask2) \
    v8i16 hz_out_m; \
    v16i8 vec0_m, vec1_m, vec2_m; \
    v16i8 minus5b_m = __msa_ldi_b(-5); \
    v16i8 plus20b_m = __msa_ldi_b(20); \
    vec0_m = __msa_vshf_b((v16i8) mask0, (v16i8) src1, (v16i8) src0); \
    hz_out_m = __msa_hadd_s_h(vec0_m, vec0_m); \
    VSHF_B2_SB(src0, src1, src0, src1, mask1, mask2, vec1_m, vec2_m); \
    DPADD_SB2_SH(vec1_m, vec2_m, minus5b_m, plus20b_m, hz_out_m, hz_out_m); \
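
/* Same horizontal filter expressed as three shuffle + dot-product
 * steps producing a single v8i16 result. */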
#define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2) \
    v8i16 out0_m; \
    v16i8 tmp0_m; \
    v16i8 minus5b = __msa_ldi_b(-5); \
    v16i8 plus20b = __msa_ldi_b(20); \
    tmp0_m = __msa_vshf_b((v16i8) mask0, in1, in0); \
    out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m); \
    tmp0_m = __msa_vshf_b((v16i8) mask1, in1, in0); \
    out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m); \
    tmp0_m = __msa_vshf_b((v16i8) mask2, in1, in0); \
    out0_m = __msa_dpadd_s_h(out0_m, plus20b, tmp0_m); \
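
/* Three-term 16-bit dot product:
 * out = in0 * coeff0 + in1 * coeff1 + in2 * coeff2. */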
#define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
    v8i16 out0_m; \
    out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2); \
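
/* 32-bit variant of the three-term dot product with the final
 * rounding shift (10) and saturation folded in. */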
#define AVC_DOT_SW3_SW(in0, in1, in2, coeff0, coeff1, coeff2) \
    v4i32 out0_m; \
    out0_m = __msa_dotp_s_w((v8i16) in0, (v8i16) coeff0); \
    out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in1, (v8i16) coeff1); \
    out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in2, (v8i16) coeff2); \
    out0_m = __msa_srari_w(out0_m, 10); \
    out0_m = __msa_sat_s_w(out0_m, 7); \
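
/* 4-pixel-wide center (half-pel in both directions) case: five setup
 * rows are filtered horizontally, then each iteration filters four
 * more rows and runs the vertical 6-tap pass over the 16-bit
 * intermediates. */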
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
    v8i16 dst0, dst1, dst2, dst3;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

                                                          mask0, mask1, mask2);
                                                          mask0, mask1, mask2);

    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);

                                               hz_out3, hz_out4, hz_out5);
                                               hz_out4, hz_out5, hz_out6);
                                               hz_out5, hz_out6, hz_out7);
                                               hz_out6, hz_out7, hz_out8);

        ST4x4_UB(src0, src1, 0, 2, 0, 2, dst, dst_stride);
        dst += (4 * dst_stride);
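
/* 8-pixel-wide center case: same structure, storing 8-byte rows. */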
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
    v8i16 dst0, dst1, dst2, dst3;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

                                               hz_out3, hz_out4, hz_out5);
                                               hz_out4, hz_out5, hz_out6);
                                               hz_out5, hz_out6, hz_out7);
                                               hz_out6, hz_out7, hz_out8);

        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);

    uint32_t multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
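
/* 4-wide "midh" quarter-pel case: vertical byte filter first, then a
 * horizontal pass over the 16-bit intermediates via halfword shuffles;
 * the quarter-pel value averages in the rounded center column. */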
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3;
    v4i32 hz_res0, hz_res1;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    for (row = (height >> 1); row--;) {
        LD_SB2(src, src_stride, src5, src6);
        src += (2 * src_stride);

        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1,
                   mask0, mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3,
                   mask0, mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);

        dst0 = __msa_srari_h(shf_vec2, 5);
        dst1 = __msa_srari_h(shf_vec5, 5);

        dst0 = __msa_ilvod_h(zeros, dst0);
        dst1 = __msa_ilvod_h(zeros, dst1);

        hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
        hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
        dst0 = __msa_pckev_h((v8i16) hz_res1, (v8i16) hz_res0);

        dst += (2 * dst_stride);

    uint32_t multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {

    uint32_t multiple8_cnt;

    for (multiple8_cnt = 4; multiple8_cnt--;) {
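
/* 4-wide horizontal+vertical quarter-pel case: the horizontal and
 * vertical half-pel results (from src_x and src_y) are computed
 * independently and averaged with a rounding shift of 1. */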
    v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
    v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, vert_out0, vert_out1;

    LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += (5 * src_stride);

    src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
    src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
    src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
    src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
        src_x += (4 * src_stride);

        LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
        src_y += (4 * src_stride);

        src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
        src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
        src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
        src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);

        out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
        out1 = __msa_srari_h((hz_out1 + vert_out1), 1);

        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

    v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
    v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 vert_out0, vert_out1, vert_out2, vert_out3;
    v8i16 out0, out1, out2, out3;

    LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += (5 * src_stride);

    src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1);
    src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2);
    src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3);
    src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
        src_x += (4 * src_stride);

        SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
        SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);

        LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
        src_y += (4 * src_stride);

        src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5);
        src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6);
        src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7);
        src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8);

                                            src_vt4, src_vt5, vert_out0, vert_out1);
                                            src_vt6, src_vt7, vert_out2, vert_out3);
        SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5);
        SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7);

        out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
        out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
        out2 = __msa_srari_h((hz_out2 + vert_out2), 1);
        out3 = __msa_srari_h((hz_out3 + vert_out3), 1);

        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);

    uint32_t multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
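
/* The paths below additionally average the filter output with the
 * existing destination pixels (put vs. avg motion compensation). */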
    v16u8 dst0, dst1, dst2, dst3, res;
    v16i8 mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);

    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);

    dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
    res = __msa_aver_u_b(res, dst0);

    ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);

    v16u8 dst0, dst1, dst2, dst3;
    v8i16 res0, res1, res2, res3;
    v16i8 mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                     res0, res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);

        ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);

        dst += (4 * dst_stride);

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 mask0, mask1, mask2;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 8, src0, src1);
        LD_SB2(src, 8, src2, src3);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        LD_SB2(src, 8, src4, src5);
        LD_SB2(src, 8, src6, src7);

        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);

        PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
                    vec0, vec1, vec2, vec3);

        AVER_UB4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                    dst0, dst1, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);

    v16u8 dst0, dst1, dst2, dst3;
    v16i8 mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);

    src0 = __msa_sld_b(src0, src0, slide);
    src1 = __msa_sld_b(src1, src1, slide);
    src2 = __msa_sld_b(src2, src2, slide);
    src3 = __msa_sld_b(src3, src3, slide);
    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src0);
    res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src1);

    dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
    dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);

    ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);

    v16i8 mask0, mask1, mask2;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
    v8i16 out0, out1, out2, out3;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);
    v16i8 res0, res1, res2, res3;

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        HADD_SB4_SH(vec0, vec1, vec2, vec3, out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
                     plus20b, out0, out1, out2, out3);

        src0 = __msa_sld_b(src0, src0, slide);
        src1 = __msa_sld_b(src1, src1, slide);
        src2 = __msa_sld_b(src2, src2, slide);
        src3 = __msa_sld_b(src3, src3, slide);

        PCKEV_B4_SB(out0, out0, out1, out1, out2, out2, out3, out3,
                    res0, res1, res2, res3);

        res0 = __msa_aver_s_b(res0, src0);
        res1 = __msa_aver_s_b(res1, src1);
        res2 = __msa_aver_s_b(res2, src2);
        res3 = __msa_aver_s_b(res3, src3);

        AVER_ST8x4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,

        dst += (4 * dst_stride);

    v16i8 mask0, mask1, mask2, vshf;
    v8i16 res0, res1, res2, res3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB2(src, 8, src0, src1);
        LD_SB2(src, 8, src2, src3);

        LD_UB2(dst, dst_stride, dst0, dst1);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src0, src1, src2, src3, vshf, vshf, src0, src2);

        out0 = __msa_aver_s_b(out0, src0);
        out1 = __msa_aver_s_b(out1, src2);

        ST_UB2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);
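
/* Vertical filter paths: each 16-bit fill constant packs two signed
 * 8-bit taps for the paired dot products, i.e. 0xfb01 = {1, -5},
 * 0x1414 = {20, 20}, 0x1fb = {-5, 1}, together applying the 6-tap
 * (1, -5, 20, 20, -5, 1) filter down the columns. */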
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776;
    v16i8 filt0, filt1, filt2;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);

    LD_SB4(src, src_stride, src5, src6, src7, src8);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
               src54_r, src65_r, src76_r, src87_r);
    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);

    out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);

    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);

    dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
    dst0 = __msa_aver_u_b(res, dst0);

    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);

    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 out0, out1, out2, out3;
    v16i8 filt0, filt1, filt2;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        out0 = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
        out1 = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
        out2 = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
        out3 = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);

        dst += (4 * dst_stride);

    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    v16i8 filt0, filt1, filt2;
    v16u8 res0, res1, res2, res3;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_l, src21_l, src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src5, src6, src7, src8);
        src += (4 * src_stride);

        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
                   src54_r, src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
                   src54_l, src65_l, src76_l, src87_l);
        out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);

        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);

        AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
                    res0, res1, res2, res3);
        ST_UB4(res0, res1, res2, res3, dst, dst_stride);
        dst += (4 * dst_stride);

    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776;
    v16i8 filt0, filt1, filt2;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);

    LD_SB4(src, src_stride, src5, src6, src7, src8);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
               src54_r, src65_r, src76_r, src87_r);
    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);

    out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);

    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

    src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
    src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);

    src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);

    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
    res = __msa_aver_u_b(res, (v16u8) src32_r);

    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);

    dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
    dst0 = __msa_aver_u_b(res, dst0);

    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);

    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 out0_r, out1_r, out2_r, out3_r;
    v16i8 filt0, filt1, filt2;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
        out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
        out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
        out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);

        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, res0, res1);

        PCKEV_D2_SB(src4, src3, src8, src7, src10_r, src32_r);

        PCKEV_D2_SB(src3, src2, src7, src4, src10_r, src32_r);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);

        vec0 = (v16u8) __msa_aver_s_b(res0, src10_r);
        vec1 = (v16u8) __msa_aver_s_b(res1, src32_r);

        ST8x4_UB(vec0, vec1, dst, dst_stride);
        dst += (4 * dst_stride);

    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    v16i8 out0, out1, out2, out3;
    v16i8 filt0, filt1, filt2;
    v16u8 res0, res1, res2, res3;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_l, src21_l, src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src5, src6, src7, src8);
        src += (4 * src_stride);

        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
                   src54_r, src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
                   src54_l, src65_l, src76_l, src87_l);
        out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);

        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_SB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, out0, out1, out2, out3);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        res0 = (v16u8) __msa_aver_s_b(out0, src3);
        res1 = (v16u8) __msa_aver_s_b(out1, src4);
        res2 = (v16u8) __msa_aver_s_b(out2, src5);
        res3 = (v16u8) __msa_aver_s_b(out3, src6);

        res0 = (v16u8) __msa_aver_s_b(out0, src2);
        res1 = (v16u8) __msa_aver_s_b(out1, src3);
        res2 = (v16u8) __msa_aver_s_b(out2, src4);
        res3 = (v16u8) __msa_aver_s_b(out3, src5);

        AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
                    dst0, dst1, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);

    v16i8 src0, src1, src2, src3, src4;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
    v8i16 res0, res1, res2, res3;
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 tmp0, tmp1, tmp2, tmp3;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

                                                          mask0, mask1, mask2);
                                                          mask0, mask1, mask2);

    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);

    LD_SB4(src, src_stride, src0, src1, src2, src3);

                                                          mask0, mask1, mask2);
                                                          mask0, mask1, mask2);

    PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);

                                           hz_out3, hz_out4, hz_out5);
                                           hz_out4, hz_out5, hz_out6);
                                           hz_out5, hz_out6, hz_out7);
                                           hz_out6, hz_out7, hz_out8);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

    ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);

    v16i8 src0, src1, src2, src3, src4;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
    v16u8 dst0, dst1, dst2, dst3;
    v8i16 res0, res1, res2, res3;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

                                               hz_out3, hz_out4, hz_out5);
                                               hz_out4, hz_out5, hz_out6);
                                               hz_out5, hz_out6, hz_out7);
                                               hz_out6, hz_out7, hz_out8);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);

        dst += (4 * dst_stride);

    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16u8 dst0, dst1, res;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3;
    v4i32 hz_res0, hz_res1;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);
    v8i16 zeros = { 0 };

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    for (row = (height >> 1); row--;) {
        LD_SB2(src, src_stride, src5, src6);
        src += (2 * src_stride);

        LD_UB2(dst, dst_stride, dst0, dst1);

        dst0 = (v16u8) __msa_ilvr_w((v4i32) dst1, (v4i32) dst0);

        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1,
                   mask0, mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3,
                   mask0, mask1, mask2, shf_vec3, shf_vec4, shf_vec5);

        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);

        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);

        res0 = __msa_srari_h(shf_vec2, 5);
        res1 = __msa_srari_h(shf_vec5, 5);

        res0 = __msa_ilvod_h(zeros, res0);
        res1 = __msa_ilvod_h(zeros, res1);

        ILVEV_H2_SH(res0, zeros, res1, zeros, res0, res1);

        hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) res0);
        hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) res1);
        res0 = __msa_pckev_h((v8i16) hz_res1, (v8i16) hz_res0);

        dst0 = __msa_aver_u_b(res, dst0);

        dst += (2 * dst_stride);

    uint32_t multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
                                               height, horiz_offset);

    uint32_t multiple8_cnt;

    for (multiple8_cnt = 4; multiple8_cnt--;) {
                                               height, horiz_offset);

    v16i8 src0, src1, src2, src3, src4;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 hz_out4, hz_out5, hz_out6;
    v8i16 res0, res1, res2, res3;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

                                                          mask0, mask1, mask2);
                                                          mask0, mask1, mask2);

    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        LD_UB2(dst, dst_stride, dst0, dst1);

        hz_out6 = (v8i16) __msa_pckod_d((v2i64) hz_out5, (v2i64) hz_out5);

                                               hz_out3, hz_out4, hz_out5);
                                               hz_out4, hz_out5, hz_out6);

        res1 = __msa_srari_h(hz_out3, 5);
        res3 = __msa_srari_h(hz_out4, 5);

        res1 = __msa_srari_h(hz_out2, 5);
        res3 = __msa_srari_h(hz_out3, 5);

        res0 = __msa_aver_s_h(res0, res1);
        res1 = __msa_aver_s_h(res2, res3);

        out0 = __msa_copy_u_w((v4i32) dst0, 0);
        out1 = __msa_copy_u_w((v4i32) dst1, 0);

    v16i8 src0, src1, src2, src3, src4;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
    v8i16 res0, res1, res2, res3;
    v8i16 res4, res5, res6, res7;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

                                               hz_out3, hz_out4, hz_out5);
                                               hz_out4, hz_out5, hz_out6);
                                               hz_out5, hz_out6, hz_out7);
                                               hz_out6, hz_out7, hz_out8);

        res1 = __msa_srari_h(hz_out3, 5);
        res3 = __msa_srari_h(hz_out4, 5);
        res5 = __msa_srari_h(hz_out5, 5);
        res7 = __msa_srari_h(hz_out6, 5);

        res1 = __msa_srari_h(hz_out2, 5);
        res3 = __msa_srari_h(hz_out3, 5);
        res5 = __msa_srari_h(hz_out4, 5);
        res7 = __msa_srari_h(hz_out5, 5);

        res0 = __msa_aver_s_h(res0, res1);
        res1 = __msa_aver_s_h(res2, res3);
        res2 = __msa_aver_s_h(res4, res5);
        res3 = __msa_aver_s_h(res6, res7);
        ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);

        dst += (4 * dst_stride);

    for (multiple8_cnt = 2; multiple8_cnt--;) {
                                               height, vert_offset);

    v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
    v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, vert_out0, vert_out1;

    LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += (5 * src_stride);

    src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
    src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
    src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
    src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);

    LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

                                                          mask0, mask1, mask2);
                                                          mask0, mask1, mask2);

    LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);

    src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
    src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
    src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
    src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);

                                                src_vt3, src_vt4, src_vt5);
                                                src_vt5, src_vt6, src_vt7);

    res1 = __msa_srari_h((hz_out1 + vert_out1), 1);
    res0 = __msa_srari_h((hz_out0 + vert_out0), 1);

    dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
    dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
    dst0 = __msa_aver_u_b(res, dst0);

    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);

    v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src_vt0, src_vt1, src_vt2, src_vt3;
    v16i8 src_vt4, src_vt5, src_vt6, src_vt7, src_vt8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 vert_out0, vert_out1, vert_out2, vert_out3;
    v8i16 out0, out1, out2, out3;

    LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += (5 * src_stride);

    src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1);
    src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2);
    src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3);
    src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
        src_x += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
        SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
        LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
        src_y += (4 * src_stride);

        src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5);
        src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6);
        src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7);
        src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8);

                                            src_vt4, src_vt5, vert_out0, vert_out1);
                                            src_vt6, src_vt7, vert_out2, vert_out3);
        SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5);
        SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7);

        out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
        out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
        out2 = __msa_srari_h((hz_out2 + vert_out2), 1);
        out3 = __msa_srari_h((hz_out3 + vert_out3), 1);

        ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);

        dst += (4 * dst_stride);

    uint32_t multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {

    src_x += (8 * src_stride) - 16;
    src_y += (8 * src_stride) - 16;
    dst += (8 * dst_stride) - 16;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
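
/* Plain copy and destination-averaging block helpers used by the
 * full-pel entry points. */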
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8(src, stride, src8, src9, src10, src11, src12, src13, src14, src15);

    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, stride);
    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, stride);

    uint64_t src0, src1, src2, src3, src4, src5, src6, src7;

    LD4(src, stride, src0, src1, src2, src3);
    LD4(src, stride, src4, src5, src6, src7);

    SD4(src0, src1, src2, src3, dst, stride);
    SD4(src4, src5, src6, src7, dst, stride);

    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);

    uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };

    LD4(src, stride, tp0, tp1, tp2, tp3);
    LD4(src, stride, tp4, tp5, tp6, tp7);

    LD4(dst, stride, tp0, tp1, tp2, tp3);
    LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);

    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,

    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);

    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0 = { 0 }, dst0 = { 0 };

    LW4(src, stride, tp0, tp1, tp2, tp3);
    LW4(dst, stride, tp0, tp1, tp2, tp3);

    dst0 = __msa_aver_u_b(src0, dst0);

    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
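
/* Horizontal quarter-pel cases: compute the half-pel filter result,
 * then average it with the source column selected by the quarter-pel
 * offset (byte slide of 2 or 3). */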
    v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        LD_SB2(src, 16, src2, src3);
        LD_SB2(src, 16, src4, src5);
        LD_SB2(src, 16, src6, src7);

        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);
        SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2);
        SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2);

        dst0 = __msa_aver_s_b(dst0, src0);
        dst1 = __msa_aver_s_b(dst1, src2);
        dst2 = __msa_aver_s_b(dst2, src4);
        dst3 = __msa_aver_s_b(dst3, src6);

        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);

    v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        LD_SB2(src, 16, src2, src3);
        LD_SB2(src, 16, src4, src5);
        LD_SB2(src, 16, src6, src7);

        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);
        SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 3);
        SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 3);

        dst0 = __msa_aver_s_b(dst0, src0);
        dst1 = __msa_aver_s_b(dst1, src2);
        dst2 = __msa_aver_s_b(dst2, src4);
        dst3 = __msa_aver_s_b(dst3, src6);

        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);

    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);
    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
    SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 2);
    SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 2);

    tmp0 = __msa_aver_s_b(tmp0, src0);
    tmp1 = __msa_aver_s_b(tmp1, src1);
    tmp2 = __msa_aver_s_b(tmp2, src4);
    tmp3 = __msa_aver_s_b(tmp3, src5);

    ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);

    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);
    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
    SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 3);
    SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 3);

    tmp0 = __msa_aver_s_b(tmp0, src0);
    tmp1 = __msa_aver_s_b(tmp1, src1);
    tmp2 = __msa_aver_s_b(tmp2, src4);
    tmp3 = __msa_aver_s_b(tmp3, src5);

    ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);

    v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB4(src - 2, stride, src0, src1, src2, src3);

    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);

    res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
    res = __msa_aver_s_b(res, src0);
    res = (v16i8) __msa_xori_b((v16u8) res, 128);
    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);

    v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB4(src - 2, stride, src0, src1, src2, src3);

    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);

    res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
    res = __msa_aver_s_b(res, src0);
    res = (v16i8) __msa_xori_b((v16u8) res, 128);
    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 8, src0, src1);
        LD_SB2(src, 8, src2, src3);
        LD_SB2(src, 8, src4, src5);
        LD_SB2(src, 8, src6, src7);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);

        PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,

        ST_SB4(vec0, vec1, vec2, vec3, dst, stride);

    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
                 plus20b, res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
                 plus20b, res4, res5, res6, res7);
    /* assumed: round, saturate and pack back to unsigned bytes */
    SRARI_H4_SH(res0, res1, res2, res3, 5);
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7);
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    out0 = PCKEV_XORI128_UB(res0, res1);
    out1 = PCKEV_XORI128_UB(res2, res3);
    out2 = PCKEV_XORI128_UB(res4, res5);
    out3 = PCKEV_XORI128_UB(res6, res7);
    ST8x8_UB(out0, out1, out2, out3, dst, stride);
}
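
/* 4x4 variant: the two-source shuffle masks pack two rows per vector. */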
    v16u8 out;
    v16i8 src0, src1, src2, src3, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 res0, res1;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
    /* assumed: round, saturate and pack as in the wider paths */
    SRARI_H2_SH(res0, res1, 5);
    SAT_SH2_SH(res0, res1, 7);
    out = PCKEV_XORI128_UB(res0, res1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
}
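
/* 16x16 vertical quarter-pel filter: the 6-tap half-pel result over rows
 * n-2..n+3 is averaged with the nearer integer row (src2..src5 here). */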
    uint32_t loop_cnt;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res0, res1, res2, res3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l, filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, stride, src5, src6, src7, src8);
        src += (4 * stride);
        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
                   src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
                   src65_l, src76_l, src87_l);
        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);
        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
        XORI_B4_128_UB(res0, res1, res2, res3);
        ST_UB4(res0, res1, res2, res3, dst, stride);
        dst += (4 * stride);

        /* assumed: slide the interleaved row window for the next 4 rows */
        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src2 = src6;
        src3 = src7;
        src4 = src8;
    }
}
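
/* Same loop for the other vertical quarter position: the average is taken
 * with rows src3..src6 instead. */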
    uint32_t loop_cnt;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res0, res1, res2, res3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l, filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, stride, src5, src6, src7, src8);
        src += (4 * stride);
        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
                   src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
                   src65_l, src76_l, src87_l);
        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);
        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
        XORI_B4_128_UB(res0, res1, res2, res3);
        ST_UB4(res0, res1, res2, res3, dst, stride);
        dst += (4 * stride);

        /* assumed: slide the interleaved row window for the next 4 rows */
        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src3 = src7;
        src4 = src8;
    }
}
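
/* 8x8 vertical quarter-pel filter: only the right (low) interleaves are
 * needed, and 13 input rows yield all 8 output rows in one pass. */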
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
    v16i8 tmp0, tmp1, tmp2, tmp3, filt0, filt1, filt2, out0, out1, out2, out3;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
    XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
               src109_r, src1110_r, src1211_r);
    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
    out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
    out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
    out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
    out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
    out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
    out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
    /* assumed: pack the integer rows (two 8-byte rows per vector) that the
     * quarter-pel result is averaged with, then round and saturate */
    PCKEV_D2_SB(src3, src2, src5, src4, tmp0, tmp1);
    PCKEV_D2_SB(src7, src6, src9, src8, tmp2, tmp3);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
    out0 = __msa_aver_s_b(out0, tmp0);
    out1 = __msa_aver_s_b(out1, tmp1);
    out2 = __msa_aver_s_b(out2, tmp2);
    out3 = __msa_aver_s_b(out3, tmp3);
    XORI_B4_128_SB(out0, out1, out2, out3);
    ST8x8_UB(out0, out1, out2, out3, dst, stride);
}
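
/* 8x8 vertical quarter-pel filter for the other quarter position: the
 * average is taken one source row lower. */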
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
    v16i8 filt0, filt1, filt2, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
    XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
               src109_r, src1110_r, src1211_r);
    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
    out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
    out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
    out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
    out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
    out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
    out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
    /* assumed: pack the integer rows (one row lower than above) that the
     * quarter-pel result is averaged with, then round and saturate */
    PCKEV_D2_SB(src4, src3, src6, src5, tmp0, tmp1);
    PCKEV_D2_SB(src8, src7, src10, src9, tmp2, tmp3);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
    out0 = __msa_aver_s_b(out0, tmp0);
    out1 = __msa_aver_s_b(out1, tmp1);
    out2 = __msa_aver_s_b(out2, tmp2);
    out3 = __msa_aver_s_b(out3, tmp3);
    XORI_B4_128_SB(out0, out1, out2, out3);
    ST8x8_UB(out0, out1, out2, out3, dst, stride);
}
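
/* 4x4 vertical quarter-pel filter: four 4-pixel rows are packed into single
 * vectors (src2110, src4332, ...) before the dot products. */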
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
    v8i16 out10, out32;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    XORI_B2_128_SB(src2110, src4332);
    LD_SB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
    XORI_B2_128_SB(src6554, src8776);
    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    /* assumed: round, saturate and pack back to the unsigned domain */
    SRARI_H2_SH(out10, out32, 5);
    SAT_SH2_SH(out10, out32, 7);
    out = PCKEV_XORI128_UB(out10, out32);
    src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
    out = __msa_aver_u_b(out, (v16u8) src32_r);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
}
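
/* 4x4 vertical quarter-pel filter, other quarter position (averages with
 * rows src3..src6). */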
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
    v8i16 out10, out32;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    XORI_B2_128_SB(src2110, src4332);
    LD_SB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
    XORI_B2_128_SB(src6554, src8776);
    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    /* assumed: round, saturate and pack back to the unsigned domain */
    SRARI_H2_SH(out10, out32, 5);
    SAT_SH2_SH(out10, out32, 7);
    out = PCKEV_XORI128_UB(out10, out32);
    src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
    src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
    out = __msa_aver_u_b(out, (v16u8) src32_r);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
}
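
/* The argument tails below belong to the thin ff_{put,avg}_h264_qpel*
 * wrappers: each forwards src adjusted by -2 rows (plus one column for the
 * "right" quarter positions) together with the strides and block width. */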
            src - (stride * 2), stride, dst, stride, 16);
            src - (stride * 2) + sizeof(uint8_t), stride, dst, stride, 16);
            src - (stride * 2), stride, dst, stride, 16);
            src - (stride * 2) + sizeof(uint8_t), stride, dst, stride, 16);
            src - (stride * 2) + sizeof(uint8_t), stride, dst, stride, 8);
            src - (stride * 2), stride, dst, stride, 8);
            src - (stride * 2) + sizeof(uint8_t), stride, dst, stride, 8);
            src - (stride * 2) + sizeof(uint8_t), stride, dst, stride, 4);
            src - (stride * 2), stride, dst, stride, 4);
            src - (stride * 2) + sizeof(uint8_t), stride, dst, stride, 4);
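
/* 16-wide centre ("mid") filter with vertical quarter-pel averaging,
 * processed as two 8-column strips: rows are first filtered horizontally
 * into 16-bit hz_out planes, a second 6-tap pass runs vertically with
 * 32-bit accumulation (presumably rounded by (v + 512) >> 10 inside
 * AVC_DOT_SW3_SW), and the result is averaged with the rounded horizontal
 * half-pel rows hz_out2..hz_out5. */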
    uint32_t multiple8_cnt, loop_cnt;
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
    v8i16 hz_out87_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        LD_SB5(src, stride, src0, src1, src2, src3, src4);

        for (loop_cnt = 4; loop_cnt--;) {
            LD_SB4(src, stride, src5, src6, src7, src8);

            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
                       hz_out43_r);
            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
                       hz_out43_l);
            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
                       hz_out87_r);
            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
                       hz_out87_l);
            /* assumed vertical dot products, mirroring the complete 8-wide
             * variant below */
            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
                                  filt1, filt2);
            dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
                                  filt1, filt2);
            dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
                                  filt1, filt2);
            dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
                                  filt1, filt2);
            dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

            dst1 = __msa_srari_h(hz_out2, 5);
            dst3 = __msa_srari_h(hz_out3, 5);
            dst5 = __msa_srari_h(hz_out4, 5);
            dst7 = __msa_srari_h(hz_out5, 5);
            SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);

            dst0 = __msa_aver_s_h(dst0, dst1);
            dst1 = __msa_aver_s_h(dst2, dst3);
            dst2 = __msa_aver_s_h(dst4, dst5);
            dst3 = __msa_aver_s_h(dst6, dst7);
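
/* Same 16-wide mid filter for the other vertical quarter position: the
 * average uses hz_out3..hz_out6. */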
    uint32_t multiple8_cnt, loop_cnt;
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
    v8i16 hz_out87_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        LD_SB5(src, stride, src0, src1, src2, src3, src4);

        for (loop_cnt = 4; loop_cnt--;) {
            LD_SB4(src, stride, src5, src6, src7, src8);

            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
                       hz_out43_r);
            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
                       hz_out43_l);
            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
                       hz_out87_r);
            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
                       hz_out87_l);
            /* assumed vertical dot products, mirroring the complete 8-wide
             * variant below */
            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
                                  filt1, filt2);
            dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
                                  filt1, filt2);
            dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
                                  filt1, filt2);
            dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
                                  filt1, filt2);
            dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

            dst1 = __msa_srari_h(hz_out3, 5);
            dst3 = __msa_srari_h(hz_out4, 5);
            dst5 = __msa_srari_h(hz_out5, 5);
            dst7 = __msa_srari_h(hz_out6, 5);
            SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);

            dst0 = __msa_aver_s_h(dst0, dst1);
            dst1 = __msa_aver_s_h(dst2, dst3);
            dst2 = __msa_aver_s_h(dst4, dst5);
            dst3 = __msa_aver_s_h(dst6, dst7);
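
/* 8-wide mid filter with vertical quarter-pel averaging: 13 rows are
 * filtered horizontally, then two batches of four output rows are produced
 * and averaged with hz_out2..hz_out5 and hz_out6..hz_out9. */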
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
    v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    /* assumed: single-register shuffle masks for the 8-wide horizontal pass */
    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
    LD_SB4(src, stride, src5, src6, src7, src8);
    src += (4 * stride);
    XORI_B4_128_SB(src5, src6, src7, src8);
    hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
    hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
    SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out2);
    dst1 = __msa_aver_s_h(dst1, hz_out3);
    dst2 = __msa_aver_s_h(dst2, hz_out4);
    dst3 = __msa_aver_s_h(dst3, hz_out5);

    /* assumed: pack and store the first four rows */
    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST8x4_UB(out0, out1, dst, stride);
    dst += (4 * stride);

    LD_SB4(src, stride, src9, src10, src11, src12);
    XORI_B4_128_SB(src9, src10, src11, src12);
    hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
    hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
    hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
    hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
               hz_out1211_r);
    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
               hz_out1211_l);
    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
    SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out6);
    dst1 = __msa_aver_s_h(dst1, hz_out7);
    dst2 = __msa_aver_s_h(dst2, hz_out8);
    dst3 = __msa_aver_s_h(dst3, hz_out9);

    /* assumed: pack and store the last four rows */
    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST8x4_UB(out0, out1, dst, stride);
}
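
/* 8-wide mid filter, other vertical quarter position (averages with
 * hz_out3..hz_out6 and hz_out7..hz_out10). */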
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
    v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    /* assumed: single-register shuffle masks for the 8-wide horizontal pass */
    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
    LD_SB4(src, stride, src5, src6, src7, src8);
    src += (4 * stride);
    XORI_B4_128_SB(src5, src6, src7, src8);
    hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
    hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
    SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out3);
    dst1 = __msa_aver_s_h(dst1, hz_out4);
    dst2 = __msa_aver_s_h(dst2, hz_out5);
    dst3 = __msa_aver_s_h(dst3, hz_out6);

    /* assumed: pack and store the first four rows */
    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST8x4_UB(out0, out1, dst, stride);
    dst += (4 * stride);

    LD_SB4(src, stride, src9, src10, src11, src12);
    XORI_B4_128_SB(src9, src10, src11, src12);
    hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
    hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
    hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
    hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
               hz_out1211_r);
    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
               hz_out1211_l);
    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
    SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out7);
    dst1 = __msa_aver_s_h(dst1, hz_out8);
    dst2 = __msa_aver_s_h(dst2, hz_out9);
    dst3 = __msa_aver_s_h(dst3, hz_out10);

    /* assumed: pack and store the last four rows */
    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST8x4_UB(out0, out1, dst, stride);
}
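
/* 4x4 mid filter with quarter-pel averaging: the horizontal stage filters
 * two rows per vector, and PCKOD re-aligns the odd rows for the vertical
 * interleaves. */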
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 res;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
    v4i32 tmp0, tmp1;

    /* assumed: two-register shuffle masks for the 4-wide horizontal pass */
    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB4(src, stride, src5, src6, src7, src8);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    /* assumed: each call filters two 4-pixel rows at once */
    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);

    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    /* assumed: round and saturate the averaging rows */
    SRARI_H2_SH(hz_out2, hz_out4, 5);
    SAT_SH2_SH(hz_out2, hz_out4, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out2);
    dst1 = __msa_aver_s_h(dst1, hz_out4);

    res = PCKEV_XORI128_UB(dst0, dst1);
    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
}
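
/* 4x4 mid filter, other quarter position: PCKEV_D2_SH re-packs the
 * averaging rows one row lower (rows 3..6). */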
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 res;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
    v4i32 tmp0, tmp1;

    /* assumed: two-register shuffle masks for the 4-wide horizontal pass */
    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB4(src, stride, src5, src6, src7, src8);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    /* assumed: each call filters two 4-pixel rows at once */
    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);

    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
    /* assumed: round and saturate the averaging rows */
    SRARI_H2_SH(hz_out0, hz_out1, 5);
    SAT_SH2_SH(hz_out0, hz_out1, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out0);
    dst1 = __msa_aver_s_h(dst1, hz_out1);

    res = PCKEV_XORI128_UB(dst0, dst1);
    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
}
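
/* Plain 16x16 vertical half-pel filter (no averaging). */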
    uint32_t loop_cnt;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res0, res1, res2, res3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l, filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);
    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, stride, src5, src6, src7, src8);
        src += (4 * stride);
        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
                   src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
                   src65_l, src76_l, src87_l);
        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);
        XORI_B4_128_UB(res0, res1, res2, res3);
        ST_UB4(res0, res1, res2, res3, dst, stride);
        dst += (4 * stride);

        /* assumed: slide the interleaved row window for the next 4 rows */
        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src4 = src8;
    }
}
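
/* Plain 8x8 vertical half-pel filter: all 13 rows are loaded up front. */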
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src10_r, src21_r, src32_r, src43_r, src76_r, src87_r;
    v16i8 src98_r, src109_r, src89_r, src910_r, src1110_r, src1211_r;
    v16i8 filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * stride);
    LD_SB5(src, stride, src8, src9, src10, src11, src12);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src76_r, src87_r,
               src98_r, src109_r);
    ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src89_r,
               src910_r, src1110_r, src1211_r);
    /* assumed: bias the interleaved rows into the signed domain */
    XORI_B4_128_SB(src10_r, src21_r, src32_r, src43_r);
    XORI_B4_128_SB(src76_r, src87_r, src98_r, src109_r);
    XORI_B4_128_SB(src89_r, src910_r, src1110_r, src1211_r);
    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
    out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
    out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
    out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src89_r, filt0, filt1, filt2);
    out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src910_r, filt0, filt1, filt2);
    out6_r = AVC_DOT_SH3_SH(src98_r, src89_r, src1110_r, filt0, filt1, filt2);
    out7_r = AVC_DOT_SH3_SH(src109_r, src910_r, src1211_r, filt0, filt1, filt2);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
    /* assumed: pack pairs of rows and undo the bias before storing */
    out0 = PCKEV_XORI128_UB(out0_r, out1_r);
    out1 = PCKEV_XORI128_UB(out2_r, out3_r);
    out2 = PCKEV_XORI128_UB(out4_r, out5_r);
    out3 = PCKEV_XORI128_UB(out6_r, out7_r);
    ST8x8_UB(out0, out1, out2, out3, dst, stride);
}
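
/* Plain 4x4 vertical half-pel filter. */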
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
    v8i16 out10, out32;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB4(src, stride, src5, src6, src7, src8);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
               src76_r, src2110, src4332, src6554, src8776);
    XORI_B4_128_SB(src2110, src4332, src6554, src8776);
    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    /* assumed: round, saturate and pack back to the unsigned domain */
    SRARI_H2_SH(out10, out32, 5);
    SAT_SH2_SH(out10, out32, 7);
    out = PCKEV_XORI128_UB(out10, out32);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
}
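
/* Remaining wrapper tails: forwarded (src, stride, dst, stride[, width,
 * offset]) argument lists of the ff_*_h264_qpel* entry points. */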
            stride, dst, stride, 16, 0);
            stride, dst, stride, 16, 1);
            stride, dst, stride, 0);
            stride, dst, stride, 1);
            stride, dst, stride, 0);
            stride, dst, stride, 1);
            stride, dst, stride, 0);
            stride, dst, stride, 1);
            stride, dst, stride);
            src - (stride * 2) + sizeof(uint8_t), stride, dst, stride);
            stride, dst, stride);
            src - (stride * 2) + sizeof(uint8_t), stride, dst, stride);
            stride, dst, stride);
            src - (stride * 2) + sizeof(uint8_t), stride, dst, stride);
            stride, dst, stride);
            src - (stride * 2) + sizeof(uint8_t), stride, dst, stride);
            stride, dst, stride);
            src - (stride * 2) + sizeof(uint8_t), stride, dst, stride);
            stride, dst, stride);
            src - (stride * 2) + sizeof(uint8_t), stride, dst, stride);
            stride, dst, stride, 16, 0);
            stride, dst, stride, 16, 1);
            stride, dst, stride, 8, 0);
            stride, dst, stride, 8, 1);
            stride, dst, stride, 4, 0);
            stride, dst, stride, 4, 1);
            stride, dst, stride, 16, 0);
            stride, dst, stride, 16, 1);
            stride, dst, stride, 8, 0);
            stride, dst, stride, 8, 1);
            stride, dst, stride, 4, 0);
            stride, dst, stride, 4, 1);
            stride, dst, stride);
            stride, dst, stride, 8);
            stride, dst, stride);