    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
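
/* The macros below vectorize HEVC uni-directional weighted prediction.
 * Scalar model (assumed from the operations used): for each sample,
 *     dst = clip_0_255(((src_val * weight) >> rnd) + offset)
 * where src_val is the intermediate (6-bit-shifted or filtered)
 * prediction sample. */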
#define HEVC_HV_UNIW_RND_CLIP4(in0, in1, in2, in3, wgt, offset, rnd,          \
                               out0, out1, out2, out3)                        \
{                                                                             \
    MUL4(in0, wgt, in1, wgt, in2, wgt, in3, wgt, out0, out1, out2, out3);     \
    SRAR_W4_SW(out0, out1, out2, out3, rnd);                                  \
    ADD4(out0, offset, out1, offset, out2, offset, out3, offset,              \
         out0, out1, out2, out3);                                             \
    out0 = CLIP_SW_0_255(out0);                                               \
    out1 = CLIP_SW_0_255(out1);                                               \
    out2 = CLIP_SW_0_255(out2);                                               \
    out3 = CLIP_SW_0_255(out3);                                               \
}
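
/* Same weight/round/offset/clip sequence for two halfword inputs: each
 * input is widened to right/left word halves via interleave, then
 * multiplied by the weight vector with a halfword dot-product. */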
#define HEVC_UNIW_RND_CLIP2(in0, in1, wgt, offset, rnd,                       \
                            out0_r, out1_r, out0_l, out1_l)                   \
{                                                                             \
    ILVR_H2_SW(in0, in0, in1, in1, out0_r, out1_r);                           \
    ILVL_H2_SW(in0, in0, in1, in1, out0_l, out1_l);                           \
    DOTP_SH4_SW(out0_r, out1_r, out0_l, out1_l, wgt, wgt, wgt, wgt,           \
                out0_r, out1_r, out0_l, out1_l);                              \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                          \
    ADD4(out0_r, offset, out1_r, offset,                                      \
         out0_l, offset, out1_l, offset,                                      \
         out0_r, out1_r, out0_l, out1_l);                                     \
    out0_r = CLIP_SW_0_255(out0_r);                                           \
    out1_r = CLIP_SW_0_255(out1_r);                                           \
    out0_l = CLIP_SW_0_255(out0_l);                                           \
    out1_l = CLIP_SW_0_255(out1_l);                                           \
}
#define HEVC_UNIW_RND_CLIP4(in0, in1, in2, in3, wgt, offset, rnd,             \
                            out0_r, out1_r, out2_r, out3_r,                   \
                            out0_l, out1_l, out2_l, out3_l)                   \
{                                                                             \
    HEVC_UNIW_RND_CLIP2(in0, in1, wgt, offset, rnd,                           \
                        out0_r, out1_r, out0_l, out1_l);                      \
    HEVC_UNIW_RND_CLIP2(in2, in3, wgt, offset, rnd,                           \
                        out2_r, out3_r, out2_l, out3_l);                      \
}
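
/* _MAX_SATU variants: the rounded products are packed back to halfwords
 * first, then the offset is added with a saturating halfword add and the
 * result is clipped with the saturating CLIP_SH*_0_255_MAX_SATU helpers. */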
#define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,  \
                                       out0_h, out1_h)                        \
{                                                                             \
    v4i32 in0_r_m, in0_l_m, in1_r_m, in1_l_m;                                 \
                                                                              \
    ILVRL_H2_SW(in0_h, in0_h, in0_r_m, in0_l_m);                              \
    ILVRL_H2_SW(in1_h, in1_h, in1_r_m, in1_l_m);                              \
    DOTP_SH4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, wgt_w, wgt_w, wgt_w,      \
                wgt_w, in0_r_m, in1_r_m, in0_l_m, in1_l_m);                   \
    SRAR_W4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, rnd_w);                    \
    PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h);          \
    ADDS_SH2_SH(out0_h, offset_h, out1_h, offset_h, out0_h, out1_h);          \
    CLIP_SH2_0_255_MAX_SATU(out0_h, out1_h);                                  \
}
#define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w,     \
                                       offset_h, rnd_w, out0_h, out1_h,       \
                                       out2_h, out3_h)                        \
{                                                                             \
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,      \
                                   out0_h, out1_h);                           \
    HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in2_h, in3_h, wgt_w, offset_h, rnd_w,      \
                                   out2_h, out3_h);                           \
}
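
/* Weighted-copy paths: source pixels are zero-extended and shifted left
 * by 6 (SLLI_4V(..., 6)) so they match the 14-bit precision of the
 * interpolation-filter outputs before the weight/round/offset/clip step. */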
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    v8i16 dst0, dst1, dst2, dst3, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    if (2 == height) {
        v4i32 dst0_r, dst0_l;

        LW2(src, src_stride, tp0, tp1);
        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
        DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
        dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
    } else if (4 == height) {
        LW4(src, src_stride, tp0, tp1, tp2, tp3);
                                       rnd_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == (height % 8)) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            LW4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
                                           offset_vec, rnd_vec, dst0, dst1,
            ST4x8_UB(out0, out1, dst, dst_stride);
            dst += 8 * dst_stride;
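
/* 6-wide weighted copy: rows are loaded as 64-bit words and six bytes are
 * written per row via ST6x4_UB. */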
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += (4 * src_stride);
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += (4 * src_stride);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
        ST6x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
        ST6x4_UB(out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
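
/* 8-wide weighted copy, with dedicated paths for heights of 2, 4 and 6
 * and a loop for multiples of 8. */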
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v16u8 out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    if (2 == height) {
        LD2(src, src_stride, tp0, tp1);
                                       rnd_vec, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    } else if (4 == height) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
        ST8x4_UB(out0, out1, dst, dst_stride);
    } else if (6 == height) {
        LD4(src, src_stride, tp0, tp1, tp2, tp3);
        src += 4 * src_stride;
        LD2(src, src_stride, tp0, tp1);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       rnd_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            LD4(src, src_stride, tp0, tp1, tp2, tp3);
            src += 4 * src_stride;
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            SLLI_4V(dst4, dst5, dst6, dst7, 6);
                                           offset_vec, rnd_vec, dst0, dst1,
                                           offset_vec, rnd_vec, dst4, dst5,
            ST8x4_UB(out0, out1, dst, dst_stride);
            dst += (4 * dst_stride);
            ST8x4_UB(out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
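
/* 12-wide weighted copy: the left 8 columns and the right 4 columns are
 * processed together, and ST12x4_UB writes 12 bytes per row. */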
    v16u8 out0, out1, out2;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       rnd_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST12x4_UB(out0, out1, out2, dst, dst_stride);
        dst += (4 * dst_stride);
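
/* 16-wide weighted copy: four full vectors per iteration, stored with
 * ST_UB4. */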
    v16u8 out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
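
/* 24-wide weighted copy: a 16-byte store plus an 8-byte store per row. */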
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v8i16 dst8, dst9, dst10, dst11;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src4, src5);
        LD_SB4(src + 16, src_stride, src2, src3, src6, src7);
        src += (4 * src_stride);
        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST8x4_UB(out2, out5, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    v16u8 out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src1);
        LD_SB2(src + 16, src_stride, src2, src3);
        src += (2 * src_stride);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
        ST_UB2(out0, out1, dst, dst_stride);
        ST_UB2(out2, out3, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, offset_vec;
    v8i16 dst6, dst7, dst8, dst9, dst10, dst11;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        LD_SB3(src, 16, src3, src4, src5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        ST_UB2(out3, out4, dst, 16);
        ST_UB(out5, dst + 32);
    v16u8 out0, out1, out2, out3, out4, out5, out6, out7;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
    v8i16 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_h(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB4(src, 16, src0, src1, src2, src3);
        LD_SB4(src, 16, src4, src5, src6, src7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        SLLI_4V(dst12, dst13, dst14, dst15, 6);
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
                                       offset_vec, rnd_vec, dst8, dst9, dst10,
                                       offset_vec, rnd_vec, dst12, dst13, dst14,
        PCKEV_B2_UB(dst13, dst12, dst15, dst14, out6, out7);
        ST_UB4(out0, out1, out2, out3, dst, 16);
        ST_UB4(out4, out5, out6, out7, dst, 16);
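
/* Horizontal 8-tap weighted filters. The source bytes are flipped into
 * signed range before the signed dot-products (XORI-with-128 conversions,
 * elided here); the bias this introduces appears to be pre-compensated by
 * folding a rounded weight term into offset_vec via the
 * __msa_srar_h / __msa_adds_s_h setup below. */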
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 mask0, mask1, mask2, mask3, vec11, vec12, vec13, vec14, vec15;
    v8i16 filter_vec, dst01, dst23, dst45, dst67;
    v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
        ST4x8_UB(out0, out1, dst, dst_stride);
        dst += (8 * dst_stride);
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
    v16u8 out0, out1, out2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst01, dst23, dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        VSHF_B4_SB(src4, src5, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src6, src7, mask4, mask5, mask6, mask7,
                   vec4, vec5, vec6, vec7);
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       rnd_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST8x4_UB(out0, out1, dst, dst_stride);
        ST4x4_UB(out2, out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
        ST_UB2(out0, out1, dst, dst_stride);
        dst += (2 * dst_stride);
    v16u8 out0, out1, out2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        LD_SB2(src, 16, src2, src3);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       rnd_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst4, dst3, dst5, dst2, out0, out1, out2);
        ST_UB2(out0, out1, dst, dst_stride);
        ST8x2_UB(out2, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height >> 1; loop_cnt--;) {
        LD_SB4(src, 8, src0, src1, src2, src3);
        LD_SB4(src, 8, src4, src5, src6, src7);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       offset_vec, rnd_vec, dst4, dst5, dst6,
        ST_UB2(out0, out1, dst, 16);
        ST_UB2(out2, out3, dst, 16);
    v16u8 out0, out1, out2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src3 = LD_SB(src + 40);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
                   vec12, vec13, vec14, vec15);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
                                       offset_vec, rnd_vec, dst0, dst1, dst2,
                                       rnd_vec, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
    uint32_t loop_cnt, cnt;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
    v4i32 weight_vec, rnd_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);

    weight_vec_h = __msa_fill_h(weight);
    offset_vec = __msa_fill_h(offset);
    denom_vec = __msa_fill_h(rnd_val);

    weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
    offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height; loop_cnt--;) {
        for (cnt = 2; cnt--;) {
            LD_SB2(src_tmp, 16, src0, src1);
            src2 = LD_SB(src_tmp + 24);

            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                       vec4, vec5, vec6, vec7);
            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                       vec8, vec9, vec10, vec11);
            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                       vec12, vec13, vec14, vec15);
                                           offset_vec, rnd_vec, dst0, dst1,
            ST_UB2(out0, out1, dst_tmp, 16);
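
/* Vertical 8-tap weighted filters: seven rows are pre-loaded and
 * interleaved pairwise so that each output row is a dot-product over four
 * packed two-row vectors. */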
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
               src32_r, src65_r, src54_r, src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
               src7, src8, src9, src10, src11, src12, src13, src14);
        src += (8 * src_stride);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        DPADD_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt1,
                     filt2, filt3, dst10, dst10, dst10, dst10);
                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                    dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (8 * dst_stride);
        src4332 = src12111110;
        src6554 = src14131312;
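        /* Slide the interleave window: the newest packed row-pairs of this
         * iteration seed the filter taps of the next one. */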
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
                     filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
                     filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
                     filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
                     filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                    dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
                     filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
                     filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
                     filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
                     filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
                     filt0, filt1, filt2, filt3, tmp4, tmp4, tmp4, tmp4);
                     filt0, filt1, filt2, filt3, tmp5, tmp5, tmp5, tmp5);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                            dst4_r, dst5_r, dst4_l, dst5_l);
                    dst2_l, dst2_r, dst3_l, dst3_r,
                    dst4_l, dst4_r, dst5_l, dst5_r,
                    dst0_r, dst1_r, dst2_r);
        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
        dst += (4 * dst_stride);
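
/* Generic vertical 8-tap path for widths that are multiples of 16: the
 * outer loop walks 16-column strips, the inner loop produces two rows per
 * iteration from both the right and left interleaved halves. */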
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);

        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 1); loop_cnt--;) {
            LD_SB2(src_tmp, src_stride, src7, src8);
            src_tmp += (2 * src_stride);

            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
                         filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
                         filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
                         filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
                         filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
                                weight_vec, offset_vec, rnd_vec,
                                dst0_r, dst1_r, dst2_r, dst3_r,
                                dst0_l, dst1_l, dst2_l, dst3_l);
                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);
                                      filter, height, weight,
                                      offset, rnd_val, 16);
                                      filter, height, weight,
                                      offset, rnd_val, 16);
                               filter, height, weight, offset, rnd_val);
                                      filter, height, weight,
                                      offset, rnd_val, 32);
                                      filter, height, weight,
                                      offset, rnd_val, 48);
                                      filter, height, weight,
                                      offset, rnd_val, 64);
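
/* 2-D (horizontal + vertical) 8-tap weighted filters: each row is first
 * filtered horizontally into 16-bit intermediates, then an 8-tap vertical
 * filter runs over those intermediates before the weight/offset/clip. */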
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
    v4i32 dst0_r, dst1_r, weight_vec, offset_vec, rnd_vec;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
    v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                 dst30, dst30, dst30, dst30);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                 dst41, dst41, dst41, dst41);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                 dst52, dst52, dst52, dst52);
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                 dst63, dst63, dst63, dst63);

    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
               dst10_r, dst21_r, dst32_r);

    dst43_r = __msa_ilvl_h(dst41, dst30);
    dst54_r = __msa_ilvl_h(dst52, dst41);
    dst65_r = __msa_ilvl_h(dst63, dst52);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 1; loop_cnt--;) {
        LD_SB2(src, src_stride, src7, src8);
        src += (2 * src_stride);

        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst87, dst87, dst87, dst87);
        dst76_r = __msa_ilvr_h(dst87, dst66);
                    filt_h0, filt_h1, filt_h2, filt_h3);
        dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
                    filt_h0, filt_h1, filt_h2, filt_h3);
        MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
        ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
        dst += (2 * dst_stride);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
                                            const int8_t *filter_x,
                                            const int8_t *filter_y,
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src -= ((3 * src_stride) + 3);
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    for (cnt = width >> 3; cnt--;) {
        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst6, dst6, dst6, dst6);

        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_r, dst32_r, dst54_r, dst21_r);
        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_l, dst32_l, dst54_l, dst21_l);
        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);

        for (loop_cnt = height >> 1; loop_cnt--;) {
            LD_SB2(src_tmp, src_stride, src7, src8);
            src_tmp += 2 * src_stride;

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst7, dst7, dst7, dst7);
                        filt_h0, filt_h1, filt_h2, filt_h3);
                        filt_h0, filt_h1, filt_h2, filt_h3);
            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst8, dst8, dst8, dst8);
                        filt_h0, filt_h1, filt_h2, filt_h3);
                        filt_h0, filt_h1, filt_h2, filt_h3);
                                weight_vec, offset_vec, rnd_vec,
                                dst0_r, dst1_r, dst0_l, dst1_l);
            ST8x2_UB(dst0_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                        filter_x, filter_y, height, weight,
                                        offset, rnd_val, 8);
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                        filter_x, filter_y, height, weight,
                                        offset, rnd_val, 8);
                                   filter_x, filter_y, height, weight, offset,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                        filter_x, filter_y, height, weight,
                                        offset, rnd_val, 16);
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                        filter_x, filter_y, height, weight,
                                        offset, rnd_val, 24);
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                        filter_x, filter_y, height, weight,
                                        offset, rnd_val, 32);
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                        filter_x, filter_y, height, weight,
                                        offset, rnd_val, 48);
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                        filter_x, filter_y, height, weight,
                                        offset, rnd_val, 64);
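
/* 4-tap (chroma) horizontal weighted filters. These use the word-precision
 * HEVC_UNIW_RND_CLIP* path: weight, offset and rnd_val are splatted into
 * word vectors and applied after the 4-tap dot-product. */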
    v4i32 dst0_r, dst0_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;

    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    LD_SB2(src, src_stride, src0, src1);
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
    ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
    dst += (4 * dst_stride);
    v16i8 mask1, vec0, vec1;
    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;

    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
                        dst0_r, dst1_r, dst0_l, dst1_l);
    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask1, vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                    dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (8 * dst_stride);
                                  filter, height, weight, offset, rnd_val);
    } else if (4 == height) {
                                  filter, height, weight, offset, rnd_val);
    } else if (8 == height || 16 == height) {
                                      filter, height, weight,
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                    dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    v8i16 filt0, filt1, dst0, dst1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    LD_SB2(src, src_stride, src0, src1);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
                        dst0_r, dst1_r, dst0_l, dst1_l);
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
    LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
                        weight_vec, offset_vec, rnd_vec,
                        dst0_r, dst1_r, dst2_r, dst3_r,
                        dst0_l, dst1_l, dst2_l, dst3_l);
                        dst4_r, dst5_r, dst4_l, dst5_l);
                    dst2_l, dst2_r, dst3_l, dst3_r,
                    dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
    dst += (4 * dst_stride);
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                    dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
                                  filter, height, weight, offset, rnd_val);
    } else if (6 == height) {
                                  filter, height, weight, offset, rnd_val);
                                       filter, height, weight, offset,
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask2 = {
        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
    };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                            dst4_r, dst5_r, dst4_l, dst5_l);
                    dst2_l, dst2_r, dst3_l, dst3_r,
                    dst4_l, dst4_r, dst5_l, dst5_r,
                    dst0_r, dst1_r, dst2_r);
        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
        dst += (4 * dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                    dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                    dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);
    v8i16 dst0, dst1, dst2, dst3;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 16, src_stride, src1, src3);
        src += (2 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                    dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);

        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
                            dst0_r, dst1_r, dst0_l, dst1_l);
        ST8x2_UB(dst0_r, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, mask2, mask3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    weight = weight & 0x0000FFFF;
    const_vec = __msa_ldi_h(128);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src2 = LD_SB(src + 24);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                    dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, 16);

        LD_SB2(src, 16, src0, src1);
        src2 = LD_SB(src + 24);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
                    dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, 16);
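
/* 4-tap (chroma) vertical weighted filters: rows are interleaved pairwise
 * and flipped to signed range with __msa_xori_b(..., 128) before the
 * two-tap-pair dot-products (DPADD_SB2_SH with filt0/filt1). */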
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src2110, src4332;
    v4i32 dst0_r, dst0_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    const_vec = __msa_ldi_h(128);

    weight = weight & 0x0000FFFF;
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);

    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
    LD_SB2(src, src_stride, src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
    ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
3131 v16i8
src0,
src1, src2, src3, src4, src5, src6;
3132 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3133 v16i8 src2110, src4332, src6554;
3135 v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
3137 v8i16 filter_vec, const_vec;
3138 v4i32 weight_vec, offset_vec, rnd_vec;
3142 const_vec = __msa_ldi_h(128);
3144 weight = weight & 0x0000FFFF;
3146 weight_vec = __msa_fill_w(weight);
3147 offset_vec = __msa_fill_w(offset);
3148 rnd_vec = __msa_fill_w(rnd_val);
3150 filter_vec =
LD_SH(filter);
3153 LD_SB3(src, src_stride, src0, src1, src2);
3154 src += (3 * src_stride);
3155 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3156 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3157 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3159 LD_SB4(src, src_stride, src3, src4, src5, src6);
3160 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3161 src32_r, src43_r, src54_r, src65_r);
3162 ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
3166 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3168 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
3170 dst0_r, dst1_r, dst0_l, dst1_l);
3173 ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
3174 dst += (4 * dst_stride);
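/* The multiple-of-8 variant below seems to carry the latest interleaved
 * row pair in src2110 across loop iterations, so each pass loads only
 * the eight new rows it needs. */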
3188 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
3189 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3190 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3191 v16i8 src2110, src4332, src6554, src8776;
3192 v8i16 dst10, dst32, dst54, dst76;
3193 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3195 v8i16 filter_vec, const_vec;
3196 v4i32 weight_vec, offset_vec, rnd_vec;
3200 const_vec = __msa_ldi_h(128);
3202 weight = weight & 0x0000FFFF;
3204 weight_vec = __msa_fill_w(weight);
3205 offset_vec = __msa_fill_w(offset);
3206 rnd_vec = __msa_fill_w(rnd_val);
3208 filter_vec = LD_SH(filter);
3211 LD_SB3(src, src_stride, src0, src1, src2);
3212 src += (3 * src_stride);
3213 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3214 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3215 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3217 for (loop_cnt = (height >> 3); loop_cnt--;) {
3218 LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
3219 src += (6 * src_stride);
3220 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3221 src32_r, src43_r, src54_r, src65_r);
3222 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3223 ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3224 src4332, src6554, src8776);
3228 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3230 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
3232 DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
3234 LD_SB2(src, src_stride, src9, src2);
3235 src += (2 * src_stride);
3236 ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3237 src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3238 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3241 DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
3243 weight_vec, offset_vec, rnd_vec,
3244 dst0_r, dst1_r, dst2_r, dst3_r,
3245 dst0_l, dst1_l, dst2_l, dst3_l);
3248 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3249 ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
3250 dst += (8 * dst_stride);
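/* Below: the 4-wide vertical entry point appears to dispatch on height
 * to the 2-row, 4-row and multiple-of-8 specializations above. */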
3266 filter, height, weight, offset, rnd_val);
3267 } else if (4 == height) {
3269 filter, height, weight, offset, rnd_val);
3270 } else if (0 == (height % 8)) {
3272 filter, height, weight, offset,
3288 v16i8 src0, src1, src2, src3, src4;
3289 v16i8 src10_r, src32_r, src21_r, src43_r;
3290 v8i16 tmp0, tmp1, tmp2, tmp3;
3292 v8i16 filter_vec, const_vec;
3293 v4i32 weight_vec, offset_vec, rnd_vec;
3294 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3298 const_vec = __msa_ldi_h(128);
3300 weight = weight & 0x0000FFFF;
3302 weight_vec = __msa_fill_w(weight);
3303 offset_vec = __msa_fill_w(offset);
3304 rnd_vec = __msa_fill_w(rnd_val);
3306 filter_vec = LD_SH(filter);
3309 LD_SB3(src, src_stride, src0, src1, src2);
3310 src += (3 * src_stride);
3312 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3314 for (loop_cnt = (height >> 2); loop_cnt--;) {
3315 LD_SB2(src, src_stride, src3, src4);
3316 src += (2 * src_stride);
3318 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3321 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3323 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3325 LD_SB2(src, src_stride, src1, src2);
3326 src += (2 * src_stride);
3328 ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3331 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
3333 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
3335 weight_vec, offset_vec, rnd_vec,
3336 dst0_r, dst1_r, dst2_r, dst3_r,
3337 dst0_l, dst1_l, dst2_l, dst3_l);
3340 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3342 ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
3343 dst += (4 * dst_stride);
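/* The 6-wide kernel above computes with full 8-wide arithmetic;
 * ST6x4_UB then stores only the leftmost six bytes of each row. */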
3357 v16i8 src0, src1, src2, src3, src4;
3358 v16i8 src10_r, src32_r, src21_r, src43_r;
3361 v8i16 filter_vec, const_vec;
3362 v4i32 weight_vec, offset_vec, rnd_vec;
3363 v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
3367 const_vec = __msa_ldi_h(128);
3369 weight = weight & 0x0000FFFF;
3371 weight_vec = __msa_fill_w(weight);
3372 offset_vec = __msa_fill_w(offset);
3373 rnd_vec = __msa_fill_w(rnd_val);
3375 filter_vec = LD_SH(filter);
3378 LD_SB3(src, src_stride, src0, src1, src2);
3379 src += (3 * src_stride);
3381 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3382 LD_SB2(src, src_stride, src3, src4);
3384 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3387 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3389 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3391 dst0_r, dst1_r, dst0_l, dst1_l);
3407 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3408 v16i8 src10_r, src32_r, src54_r, src76_r;
3409 v16i8 src21_r, src43_r, src65_r, src87_r;
3410 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3412 v8i16 filter_vec, const_vec;
3413 v4i32 weight_vec, offset_vec, rnd_vec;
3414 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
3415 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
3419 const_vec = __msa_ldi_h(128);
3421 weight = weight & 0x0000FFFF;
3423 weight_vec = __msa_fill_w(weight);
3424 offset_vec = __msa_fill_w(offset);
3425 rnd_vec = __msa_fill_w(rnd_val);
3427 filter_vec = LD_SH(filter);
3430 LD_SB3(src, src_stride, src0, src1, src2);
3431 src += (3 * src_stride);
3433 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3435 LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
3437 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3438 src32_r, src43_r, src54_r, src65_r);
3439 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3442 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3444 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3446 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, tmp2, tmp2);
3448 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, tmp3, tmp3);
3450 DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, tmp4, tmp4);
3452 DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, tmp5, tmp5);
3454 weight_vec, offset_vec, rnd_vec,
3455 dst0_r, dst1_r, dst2_r, dst3_r,
3456 dst0_l, dst1_l, dst2_l, dst3_l);
3458 dst4_r, dst5_r, dst4_l, dst5_l);
3461 dst2_l, dst2_r, dst3_l, dst3_r,
3462 dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
3463 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
3464 dst += (4 * dst_stride);
3479 v16i8 src0, src1, src2, src3, src4;
3480 v16i8 src10_r, src32_r, src21_r, src43_r;
3481 v8i16 tmp0, tmp1, tmp2, tmp3;
3483 v8i16 filter_vec, const_vec;
3484 v4i32 weight_vec, offset_vec, rnd_vec;
3485 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3489 const_vec = __msa_ldi_h(128);
3491 weight = weight & 0x0000FFFF;
3493 weight_vec = __msa_fill_w(weight);
3494 offset_vec = __msa_fill_w(offset);
3495 rnd_vec = __msa_fill_w(rnd_val);
3497 filter_vec = LD_SH(filter);
3500 LD_SB3(src, src_stride, src0, src1, src2);
3501 src += (3 * src_stride);
3503 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3505 for (loop_cnt = (height >> 2); loop_cnt--;) {
3506 LD_SB2(src, src_stride, src3, src4);
3507 src += (2 * src_stride);
3509 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3512 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3514 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3516 LD_SB2(src, src_stride, src1, src2);
3517 src += (2 * src_stride);
3519 ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3522 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
3524 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
3526 weight_vec, offset_vec, rnd_vec,
3527 dst0_r, dst1_r, dst2_r, dst3_r,
3528 dst0_l, dst1_l, dst2_l, dst3_l);
3531 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3532 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
3533 dst += (4 * dst_stride);
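/* In the 8-wide loop above the newest interleaves are written back into
 * src10_r/src21_r, so the next iteration's prologue is already in place
 * when the loop repeats. */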
3549 filter, height, weight, offset, rnd_val);
3550 } else if (6 == height) {
3552 filter, height, weight, offset, rnd_val);
3555 filter, height, weight, offset,
3571 v16i8 src0, src1, src2, src3, src4, src5;
3572 v16i8 src10_r, src32_r, src21_r, src43_r;
3573 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3574 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
3575 v16i8 src2110, src4332;
3577 v8i16 filter_vec, const_vec;
3578 v4i32 weight_vec, offset_vec, rnd_vec;
3579 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
3580 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
3582 src -= (1 * src_stride);
3584 const_vec = __msa_ldi_h(128);
3586 weight = weight & 0x0000FFFF;
3588 weight_vec = __msa_fill_w(weight);
3589 offset_vec = __msa_fill_w(offset);
3590 rnd_vec = __msa_fill_w(rnd_val);
3592 filter_vec = LD_SH(filter);
3595 LD_SB3(src, src_stride, src0, src1, src2);
3596 src += (3 * src_stride);
3598 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3599 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3600 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
3602 for (loop_cnt = (height >> 2); loop_cnt--;) {
3603 LD_SB2(src, src_stride, src3, src4);
3604 src += (2 * src_stride);
3606 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3607 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3608 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
3611 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3613 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3615 DPADD_SB2_SH(src2110, src4332, filt0, filt1, tmp4, tmp4);
3617 LD_SB2(src, src_stride, src5, src2);
3618 src += (2 * src_stride);
3620 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3621 ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
3622 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
3625 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
3627 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
3629 DPADD_SB2_SH(src4332, src2110, filt0, filt1, tmp5, tmp5);
3631 weight_vec, offset_vec, rnd_vec,
3632 dst0_r, dst1_r, dst2_r, dst3_r,
3633 dst0_l, dst1_l, dst2_l, dst3_l);
3635 dst4_r, dst5_r, dst4_l, dst5_l);
3638 dst2_l, dst2_r, dst3_l, dst3_r,
3639 dst4_l, dst4_r, dst5_l, dst5_r,
3640 dst0_r, dst1_r, dst2_r);
3641 ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
3642 dst += (4 * dst_stride);
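/* 12-wide handling above: columns 0..7 come from the right interleaves,
 * while the left interleaves of two rows are packed into one vector
 * (src2110/src4332) so the remaining four columns of both rows share a
 * single dot product; ST12x4_UB splits the stores to match. */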
3657 v16i8 src0, src1, src2, src3, src4, src5;
3658 v16i8 src10_r, src32_r, src21_r, src43_r;
3659 v16i8 src10_l, src32_l, src21_l, src43_l;
3660 v8i16 tmp0, tmp1, tmp2, tmp3;
3662 v8i16 filter_vec, const_vec;
3663 v4i32 weight_vec, offset_vec, rnd_vec;
3664 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3668 const_vec = __msa_ldi_h(128);
3670 weight = weight & 0x0000FFFF;
3672 weight_vec = __msa_fill_w(weight);
3673 offset_vec = __msa_fill_w(offset);
3674 rnd_vec = __msa_fill_w(rnd_val);
3676 filter_vec = LD_SH(filter);
3679 LD_SB3(src, src_stride, src0, src1, src2);
3680 src += (3 * src_stride);
3682 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3683 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3685 for (loop_cnt = (height >> 2); loop_cnt--;) {
3686 LD_SB2(src, src_stride, src3, src4);
3687 src += (2 * src_stride);
3689 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3690 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3693 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3695 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3697 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp2, tmp2);
3699 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp3, tmp3);
3701 weight_vec, offset_vec, rnd_vec,
3702 dst0_r, dst1_r, dst2_r, dst3_r,
3703 dst0_l, dst1_l, dst2_l, dst3_l);
3706 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
3707 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3708 dst += (2 * dst_stride);
3710 LD_SB2(src, src_stride, src5, src2);
3711 src += (2 * src_stride);
3713 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3714 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3717 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
3719 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
3721 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp2, tmp2);
3723 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp3, tmp3);
3725 weight_vec, offset_vec, rnd_vec,
3726 dst0_r, dst1_r, dst2_r, dst3_r,
3727 dst0_l, dst1_l, dst2_l, dst3_l);
3730 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
3731 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3732 dst += (2 * dst_stride);
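/* The 16-wide kernel above filters the right and left interleaves of
 * each row pair separately, storing two full rows per ST_SW2 and four
 * rows per loop iteration. */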
3747 v16i8 src0, src1, src2, src3, src4, src5;
3748 v16i8 src6, src7, src8, src9, src10, src11;
3749 v16i8 src10_r, src32_r, src76_r, src98_r;
3750 v16i8 src21_r, src43_r, src87_r, src109_r;
3751 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3752 v16i8 src10_l, src32_l, src21_l, src43_l;
3754 v8i16 filter_vec, const_vec;
3755 v4i32 weight_vec, offset_vec, rnd_vec;
3756 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
3757 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
3761 const_vec = __msa_ldi_h(128);
3763 weight = weight & 0x0000FFFF;
3765 weight_vec = __msa_fill_w(weight);
3766 offset_vec = __msa_fill_w(offset);
3767 rnd_vec = __msa_fill_w(rnd_val);
3769 filter_vec = LD_SH(filter);
3772 LD_SB3(src, src_stride, src0, src1, src2);
3774 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3775 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3777 LD_SB3(src + 16, src_stride, src6, src7, src8);
3778 src += (3 * src_stride);
3780 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3782 for (loop_cnt = (height >> 2); loop_cnt--;) {
3783 LD_SB2(src, src_stride, src3, src4);
3785 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3786 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3787 LD_SB2(src + 16, src_stride, src9, src10);
3788 src += (2 * src_stride);
3790 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3793 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3795 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
3797 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3799 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
3801 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
3803 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
3806 weight_vec, offset_vec, rnd_vec,
3807 dst0_r, dst1_r, dst2_r, dst3_r,
3808 dst0_l, dst1_l, dst2_l, dst3_l);
3810 dst4_r, dst5_r, dst4_l, dst5_l);
3813 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
3815 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3816 ST8x2_UB(dst4_r, dst + 16, dst_stride);
3817 dst += (2 * dst_stride);
3819 LD_SB2(src, src_stride, src5, src2);
3821 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3822 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3823 LD_SB2(src + 16, src_stride, src11, src8);
3824 src += (2 * src_stride);
3826 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3829 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
3831 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp4, tmp4);
3833 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
3835 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp5, tmp5);
3837 DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, tmp2, tmp2);
3839 DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, tmp3, tmp3);
3842 weight_vec, offset_vec, rnd_vec,
3843 dst0_r, dst1_r, dst2_r, dst3_r,
3844 dst0_l, dst1_l, dst2_l, dst3_l);
3846 dst4_r, dst5_r, dst4_l, dst5_l);
3849 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
3851 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3852 ST8x2_UB(dst4_r, dst + 16, dst_stride);
3853 dst += (2 * dst_stride);
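/* 24-wide above: the left 16 columns use both right and left
 * interleaves, while the extra 8 columns are loaded from src + 16,
 * filtered with right interleaves only, and stored via ST8x2_UB at
 * dst + 16. */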
3869 v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3870 v16i8 src10_r, src32_r, src76_r, src98_r;
3871 v16i8 src21_r, src43_r, src87_r, src109_r;
3872 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3873 v16i8 src10_l, src32_l, src76_l, src98_l;
3874 v16i8 src21_l, src43_l, src87_l, src109_l;
3876 v8i16 filter_vec, const_vec;
3877 v4i32 weight_vec, offset_vec, rnd_vec;
3878 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3879 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l, dst6_l, dst7_l;
3883 const_vec = __msa_ldi_h(128);
3885 weight = weight & 0x0000FFFF;
3887 weight_vec = __msa_fill_w(weight);
3888 offset_vec = __msa_fill_w(offset);
3889 rnd_vec = __msa_fill_w(rnd_val);
3891 filter_vec = LD_SH(filter);
3894 LD_SB3(src, src_stride, src0, src1, src2);
3896 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3897 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3899 LD_SB3(src + 16, src_stride, src6, src7, src8);
3900 src += (3 * src_stride);
3902 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3903 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3905 for (loop_cnt = (height >> 1); loop_cnt--;) {
3906 LD_SB2(src, src_stride, src3, src4);
3908 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3909 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3912 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3914 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
3916 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3918 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
3921 weight_vec, offset_vec, rnd_vec,
3922 dst0_r, dst1_r, dst2_r, dst3_r,
3923 dst0_l, dst1_l, dst2_l, dst3_l);
3925 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
3926 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3927 dst += (2 * dst_stride);
3935 LD_SB2(src + 16, src_stride, src9, src10);
3936 src += (2 * src_stride);
3938 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3939 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3942 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
3944 DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, tmp6, tmp6);
3946 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
3948 DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, tmp7, tmp7);
3951 weight_vec, offset_vec, rnd_vec,
3952 dst4_r, dst5_r, dst6_r, dst7_r,
3953 dst4_l, dst5_l, dst6_l, dst7_l);
3956 dst5_l, dst5_r, dst7_l, dst7_r, dst4_r, dst5_r);
3957 ST_SW2(dst4_r, dst5_r, dst_tmp, dst_stride);
3958 dst_tmp += (2 * dst_stride);
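/* The 32-wide kernel above covers both 16-column halves in one pass,
 * two rows per iteration, writing the second half through dst_tmp. */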
3972 const int8_t *filter_x,
3973 const int8_t *filter_y,
3979 v16i8 src0, src1, src2, src3, src4;
3981 v4i32 filt_h0, filt_h1;
3982 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3984 v8i16 filter_vec, const_vec;
3985 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3986 v8i16 dst0, dst1, dst2, dst3, dst4;
3987 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3988 v4i32 dst0_r, dst1_r;
3989 v4i32 weight_vec, offset_vec, rnd_vec;
3991 src -= (src_stride + 1);
3993 filter_vec = LD_SH(filter_x);
3996 filter_vec = LD_SH(filter_y);
3997 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3998 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4004 const_vec = __msa_ldi_h(128);
4007 weight_vec = __msa_fill_w(weight);
4008 offset_vec = __msa_fill_w(offset);
4009 rnd_vec = __msa_fill_w(rnd_val);
4011 LD_SB3(src, src_stride, src0, src1, src2);
4012 src += (3 * src_stride);
4015 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4016 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4017 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4025 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
4026 LD_SB2(src, src_stride, src3, src4);
4030 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4034 dst32_r = __msa_ilvr_h(dst3, dst2);
4038 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4042 dst43_r = __msa_ilvr_h(dst4, dst3);
4046 MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
4048 ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
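/* First hv kernel above: the horizontal 4-tap pass yields 16-bit rows
 * dst0..dst4, consecutive rows are interleaved with ILVR_H, and the
 * vertical taps are applied at 32-bit precision before weighting. The
 * __msa_clti_s_b/__msa_ilvr_b pair sign-extends the 8-bit vertical
 * filter taps to 16 bits. */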
4060 const int8_t *filter_x,
4061 const int8_t *filter_y,
4067 v16i8 src0, src1, src2, src3, src4, src5, src6;
4069 v4i32 filt_h0, filt_h1;
4070 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4072 v8i16 filter_vec, const_vec;
4073 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4074 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4075 v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
4076 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4077 v4i32 weight_vec, offset_vec, rnd_vec;
4079 src -= (src_stride + 1);
4081 filter_vec = LD_SH(filter_x);
4084 filter_vec = LD_SH(filter_y);
4085 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4086 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4092 const_vec = __msa_ldi_h(128);
4095 weight_vec = __msa_fill_w(weight);
4096 offset_vec = __msa_fill_w(offset);
4097 rnd_vec = __msa_fill_w(rnd_val);
4099 LD_SB3(src, src_stride, src0, src1, src2);
4100 src += (3 * src_stride);
4103 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4104 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4105 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4113 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
4115 LD_SB4(src, src_stride, src3, src4, src5, src6);
4119 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4122 dst32_r = __msa_ilvr_h(dst3, dst2);
4127 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4130 dst43_r = __msa_ilvr_h(dst4, dst3);
4135 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4138 dst10_r = __msa_ilvr_h(dst5, dst4);
4143 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4146 dst21_r = __msa_ilvr_h(dst2, dst5);
4151 weight_vec, offset_vec, rnd_vec,
4152 dst0_r, dst1_r, dst2_r, dst3_r);
4154 ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
4161 const int8_t *filter_x,
4162 const int8_t *filter_y,
4169 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4171 v4i32 filt_h0, filt_h1;
4172 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4174 v8i16 filter_vec, const_vec;
4175 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4176 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
4177 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
4178 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4179 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4180 v4i32 weight_vec, offset_vec, rnd_vec;
4182 src -= (src_stride + 1);
4184 filter_vec = LD_SH(filter_x);
4187 filter_vec = LD_SH(filter_y);
4188 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4189 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4195 const_vec = __msa_ldi_h(128);
4198 weight_vec = __msa_fill_w(weight);
4199 offset_vec = __msa_fill_w(offset);
4200 rnd_vec = __msa_fill_w(rnd_val);
4202 LD_SB3(src, src_stride, src0, src1, src2);
4203 src += (3 * src_stride);
4206 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4207 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4208 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4215 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
4217 for (loop_cnt = height >> 3; loop_cnt--;) {
4219 src3, src4, src5, src6, src7, src8, src9, src10);
4220 src += (8 * src_stride);
4223 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4226 dst32_r = __msa_ilvr_h(dst3, dst2);
4230 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4233 dst43_r = __msa_ilvr_h(dst4, dst3);
4237 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4240 dst54_r = __msa_ilvr_h(dst5, dst4);
4244 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4247 dst65_r = __msa_ilvr_h(dst6, dst5);
4251 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4254 dst76_r = __msa_ilvr_h(dst7, dst6);
4258 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
4261 dst87_r = __msa_ilvr_h(dst8, dst7);
4265 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
4268 dst10_r = __msa_ilvr_h(dst9, dst8);
4272 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
4275 dst21_r = __msa_ilvr_h(dst2, dst9);
4280 weight_vec, offset_vec, rnd_vec,
4281 dst0_r, dst1_r, dst2_r, dst3_r);
4283 ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
4284 dst += (4 * dst_stride);
4287 weight_vec, offset_vec, rnd_vec,
4288 dst4_r, dst5_r, dst6_r, dst7_r);
4290 ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
4291 dst += (4 * dst_stride);
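/* In the eight-row hv loop above the last two interleaves land in
 * dst10_r/dst21_r (built from dst9/dst8 and dst2/dst9), seeding the
 * next iteration without recomputing the prologue rows. */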
4299 const int8_t *filter_x,
4300 const int8_t *filter_y,
4308 filter_x, filter_y, height, weight,
4310 } else if (4 == height) {
4312 filter_x, filter_y, height, weight,
4314 } else if (0 == (height % 8)) {
4316 filter_x, filter_y, height, weight,
4325 const int8_t *filter_x,
4326 const int8_t *filter_y,
4333 v16i8 src0, src1, src2, src3, src4, src5, src6;
4335 v4i32 filt_h0, filt_h1;
4336 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4338 v8i16 filter_vec, const_vec;
4339 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4340 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4341 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4342 v4i32 weight_vec, offset_vec, rnd_vec;
4343 v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
4344 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4345 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4347 src -= (src_stride + 1);
4349 filter_vec = LD_SH(filter_x);
4352 filter_vec = LD_SH(filter_y);
4353 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4354 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4360 const_vec = __msa_ldi_h(128);
4363 weight_vec = __msa_fill_w(weight);
4364 offset_vec = __msa_fill_w(offset);
4365 rnd_vec = __msa_fill_w(rnd_val);
4367 LD_SB3(src, src_stride, src0, src1, src2);
4368 src += (3 * src_stride);
4371 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4372 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4373 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4384 for (loop_cnt = height >> 2; loop_cnt--;) {
4385 LD_SB4(src, src_stride, src3, src4, src5, src6);
4386 src += (4 * src_stride);
4390 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4400 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4410 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4420 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4430 weight_vec, offset_vec, rnd_vec,
4431 dst0_r, dst1_r, dst0_l, dst1_l);
4433 weight_vec, offset_vec, rnd_vec,
4434 dst2_r, dst3_r, dst2_l, dst3_l);
4436 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
4437 ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
4438 dst += (4 * dst_stride);
4446 const int8_t *filter_x,
4447 const int8_t *filter_y,
4453 v16i8 src0, src1, src2, src3, src4;
4455 v4i32 filt_h0, filt_h1;
4456 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4458 v8i16 filter_vec, const_vec;
4459 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4460 v8i16 dst0, dst1, dst2, dst3, dst4;
4461 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4462 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4463 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4464 v4i32 weight_vec, offset_vec, rnd_vec;
4466 src -= (src_stride + 1);
4468 filter_vec = LD_SH(filter_x);
4471 filter_vec = LD_SH(filter_y);
4472 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4473 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4479 const_vec = __msa_ldi_h(128);
4482 weight_vec = __msa_fill_w(weight);
4483 offset_vec = __msa_fill_w(offset);
4484 rnd_vec = __msa_fill_w(rnd_val);
4486 LD_SB3(src, src_stride, src0, src1, src2);
4487 src += (3 * src_stride);
4490 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4491 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4492 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4503 LD_SB2(src, src_stride, src3, src4);
4504 src += (2 * src_stride);
4507 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4516 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4526 weight_vec, offset_vec, rnd_vec,
4527 dst0_r, dst1_r, dst0_l, dst1_l);
4530 dst += (2 * dst_stride);
4537 const int8_t *filter_x,
4538 const int8_t *filter_y,
4544 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4546 v4i32 filt_h0, filt_h1;
4547 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4549 v8i16 filter_vec, const_vec;
4550 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4551 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
4552 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4553 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
4554 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
4555 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
4556 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
4557 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
4558 v4i32 weight_vec, offset_vec, rnd_vec;
4560 src -= (src_stride + 1);
4562 filter_vec = LD_SH(filter_x);
4565 filter_vec = LD_SH(filter_y);
4566 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4567 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4573 const_vec = __msa_ldi_h(128);
4576 weight_vec = __msa_fill_w(weight);
4577 offset_vec = __msa_fill_w(offset);
4578 rnd_vec = __msa_fill_w(rnd_val);
4580 LD_SB3(src, src_stride, src0, src1, src2);
4581 src += (3 * src_stride);
4585 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4586 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4587 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4598 LD_SB2(src, src_stride, src3, src4);
4599 src += (2 * src_stride);
4603 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4613 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4622 LD_SB2(src, src_stride, src5, src6);
4623 src += (2 * src_stride);
4627 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4637 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4646 LD_SB2(src, src_stride, src7, src8);
4647 src += (2 * src_stride);
4651 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4662 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
4672 weight_vec, offset_vec, rnd_vec,
4673 dst0_r, dst1_r, dst0_l, dst1_l);
4675 weight_vec, offset_vec, rnd_vec,
4676 dst2_r, dst3_r, dst2_l, dst3_l);
4678 weight_vec, offset_vec, rnd_vec,
4679 dst4_r, dst5_r, dst4_l, dst5_l);
4681 dst2_l, dst2_r, dst3_l, dst3_r,
4682 dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
4683 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
4684 dst += (4 * dst_stride);
4692 const int8_t *filter_x,
4693 const int8_t *filter_y,
4700 uint32_t loop_cnt, cnt;
4703 v16i8 src0, src1, src2, src3, src4, src5, src6;
4705 v4i32 filt_h0, filt_h1;
4706 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4708 v8i16 filter_vec, const_vec;
4709 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4710 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4711 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4712 v4i32 weight_vec, offset_vec, rnd_vec;
4713 v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
4714 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4715 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4717 src -= (src_stride + 1);
4719 filter_vec = LD_SH(filter_x);
4722 filter_vec = LD_SH(filter_y);
4723 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4724 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4730 const_vec = __msa_ldi_h(128);
4733 weight_vec = __msa_fill_w(weight);
4734 offset_vec = __msa_fill_w(offset);
4735 rnd_vec = __msa_fill_w(rnd_val);
4737 for (cnt = width >> 3; cnt--;) {
4741 LD_SB3(src_tmp, src_stride, src0, src1, src2);
4742 src_tmp += (3 * src_stride);
4745 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4746 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4747 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4758 for (loop_cnt = height >> 2; loop_cnt--;) {
4759 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
4760 src_tmp += (4 * src_stride);
4763 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4772 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4781 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4790 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4800 weight_vec, offset_vec, rnd_vec,
4801 dst0_r, dst1_r, dst0_l, dst1_l);
4803 weight_vec, offset_vec, rnd_vec,
4804 dst2_r, dst3_r, dst2_l, dst3_l);
4806 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
4807 ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride);
4808 dst_tmp += (4 * dst_stride);
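/* Above: the generic hv kernel walks width >> 3 column strips through
 * src_tmp/dst_tmp, four rows per inner iteration; the wider entry
 * points below appear to forward to it with width 8, 16, 24 or 32. */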
4820 const int8_t *filter_x,
4821 const int8_t *filter_y,
4830 filter_x, filter_y, height, weight,
4832 } else if (6 == height) {
4834 filter_x, filter_y, height, weight,
4836 } else if (0 == (height % 4)) {
4838 filter_x, filter_y, height, weight,
4839 offset, rnd_val, 8);
4847 const int8_t *filter_x,
4848 const int8_t *filter_y,
4855 filter_x, filter_y, height, weight,
4856 offset, rnd_val, 8);
4858 filter_x, filter_y, height, weight,
4866 const int8_t *filter_x,
4867 const int8_t *filter_y,
4874 filter_x, filter_y, height, weight,
4875 offset, rnd_val, 16);
4882 const int8_t *filter_x,
4883 const int8_t *filter_y,
4890 filter_x, filter_y, height, weight,
4891 offset, rnd_val, 24);
4898 const int8_t *filter_x,
4899 const int8_t *filter_y,
4906 filter_x, filter_y, height, weight,
4907 offset, rnd_val, 32);
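/* The macros below stamp out the public ff_hevc_put_hevc_uni_w_* entry
 * points; shift = denom + 14 - 8 turns the weighted-prediction
 * denominator into the rounding shift for 8-bit output (14-bit
 * intermediates minus the 8-bit depth). */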
4910 #define UNIWGT_MC_COPY(WIDTH) \
4911 void ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
4912 ptrdiff_t dst_stride, \
4914 ptrdiff_t src_stride, \
4923 int shift = denom + 14 - 8; \
4924 hevc_uniwgt_copy_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
4925 height, weight, offset, shift); \
4938 #undef UNIWGT_MC_COPY
4940 #define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
4941 void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
4955 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
4956 int shift = denom + 14 - 8; \
4958 hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \
4959 dst_stride, filter, height, \
4960 weight, offset, shift); \
4999 #define UNI_W_MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \
5000 void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
5014 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
5015 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
5016 int shift = denom + 14 - 8; \
5018 hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \
5019 dst_stride, filter_x, \
5020 filter_y, height, weight, \