#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,  \
                           out0_r, out1_r, out0_l, out1_l)          \
{                                                                   \
    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);               \
    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);               \
                                                                    \
    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);  \
    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);  \
    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);  \
    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);  \
                                                                    \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                \
                                                                    \
    out0_r = CLIP_SW_0_255(out0_r);                                 \
    out1_r = CLIP_SW_0_255(out1_r);                                 \
    out0_l = CLIP_SW_0_255(out0_l);                                 \
    out1_l = CLIP_SW_0_255(out1_l);                                 \
}
#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3,  \
                           wgt, rnd, offset,                            \
                           out0_r, out1_r, out2_r, out3_r,              \
                           out0_l, out1_l, out2_l, out3_l)              \
{                                                                       \
    HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,          \
                       out0_r, out1_r, out0_l, out1_l)                  \
    HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset,          \
                       out2_r, out3_r, out2_l, out3_l)                  \
}
#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1)  \
{                                                                     \
    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                    \
    SRARI_H2_SH(out0, out1, rnd_val);                                 \
    CLIP_SH2_0_255(out0, out1);                                       \
}
#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3,                      \
                          vec0, vec1, vec2, vec3, rnd_val,         \
                          out0, out1, out2, out3)                  \
{                                                                  \
    HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1);  \
    HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3);  \
}
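
/* The unweighted bi-prediction variants above follow the same shape but only
 * need a saturating add of the two 16-bit intermediates, a rounding shift
 * (SRARI) and a clip to 0..255 - no per-reference weights or offset word.
 */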
    v4i32 weight_vec, offset_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
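
    /* A scalar sketch of the setup above (my reading of the code, not a
     * drop-in replacement): the two 16-bit weights are packed into one
     * 32-bit word so a single dot-product-accumulate applies both, and the
     * spec's "+1" rounding term is left to the rounding shift, whose amount
     * is rnd_val + 1 because two weighted predictions are summed:
     *
     *     int32_t offset = (offset0 + offset1) << rnd_val;
     *     int32_t weight = (weight0 & 0xFFFF) | (weight1 << 16);
     *     // per pair of co-located intermediates (p0, p1):
     *     // out = clip255((offset + p0 * w0 + p1 * w1 + (1 << rnd_val))
     *     //               >> (rnd_val + 1));
     */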
        LD_SB2(src0_ptr, src_stride, src0, src1);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
        src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
        dst0 = (v8i16) __msa_ilvr_b(zero, src0);

        dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r,
                                 (v8i16) weight_vec);
        dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l,
                                 (v8i16) weight_vec);
    } else if (4 == height) {
        v8i16 in0, in1, in2, in3;
        v4i32 dst0_r, dst1_r, dst0_l, dst1_l;

        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
                           weight_vec, rnd_vec, offset_vec,
                           dst0_r, dst1_r, dst0_l, dst1_l);
        ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == height % 8) {
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
        v8i16 dst0, dst1, dst2, dst3;
        v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
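
        /* Four-wide copy path: four 4-pixel rows get packed into each
         * vector (ILVR_W packs two rows per register, the zero interleave
         * widens bytes to halfwords), and the SLLI_4V(..., 6) in the loop
         * lifts the copied pels into the 14-bit HEVC intermediate domain
         * (8-bit pel << (14 - 8)) that the weighting macro expects.
         */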
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_SB8(src0_ptr, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src0_ptr += (8 * src_stride);
            LD_SH8(src1_ptr, src2_stride,
                   in0, in1, in2, in3, in4, in5, in6, in7);
            src1_ptr += (8 * src2_stride);

            ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
                       src0, src1, src2, src3);
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       dst0, dst1, dst2, dst3);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
                               weight_vec, rnd_vec, offset_vec,
                               dst0_r, dst1_r, dst2_r, dst3_r,
                               dst0_l, dst1_l, dst2_l, dst3_l);
                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
            ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
            dst += (8 * dst_stride);
    v8i16 in0, in1, in2, in3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
                           weight_vec, rnd_vec, offset_vec,
                           dst0_r, dst1_r, dst2_r, dst3_r,
                           dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
        v8i16 in0, in1, dst0, dst1;
        v4i32 dst0_r, dst1_r, dst0_l, dst1_l;

        LD_SB2(src0_ptr, src_stride, src0, src1);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
                           weight_vec, rnd_vec, offset_vec,
                           dst0_r, dst1_r, dst0_l, dst1_l);
    } else if (6 == height) {
        v16i8 src0, src1, src2, src3, src4, src5;
        v8i16 in0, in1, in2, in3, in4, in5;
        v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
        v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
        v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
        LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);
        ILVR_B2_SH(zero, src4, zero, src5, dst4, dst5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
                           weight_vec, rnd_vec, offset_vec,
                           dst0_r, dst1_r, dst2_r, dst3_r,
                           dst0_l, dst1_l, dst2_l, dst3_l);
                           weight_vec, rnd_vec, offset_vec,
                           dst4_r, dst5_r, dst4_l, dst5_l);
                         dst2_l, dst2_r, dst3_l, dst3_r,
                         dst4_l, dst4_r, dst5_l, dst5_r,
                         dst0_r, dst1_r, dst2_r);
        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    } else if (0 == height % 4) {
        v8i16 in0, in1, in2, in3;
        v8i16 dst0, dst1, dst2, dst3;
        v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
            src0_ptr += (4 * src_stride);
            LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
            src1_ptr += (4 * src2_stride);
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       dst0, dst1, dst2, dst3);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
                               weight_vec, rnd_vec, offset_vec,
                               dst0_r, dst1_r, dst2_r, dst3_r,
                               dst0_l, dst1_l, dst2_l, dst3_l);
                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
            ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
            dst += (4 * dst_stride);
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    for (loop_cnt = (16 >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   dst0, dst1, dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
                           weight_vec, rnd_vec, offset_vec,
                           dst0_r, dst1_r, dst2_r, dst3_r,
                           dst0_l, dst1_l, dst2_l, dst3_l);
                           weight_vec, rnd_vec, offset_vec,
                           dst4_r, dst5_r, dst4_l, dst5_l);
                         dst2_l, dst2_r, dst3_l, dst3_r,
                         dst4_l, dst4_r, dst5_l, dst5_r,
                         dst0_r, dst1_r, dst2_r);
        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
        dst += (4 * dst_stride);
    uint32_t loop_cnt, cnt;
    int16_t *src1_ptr_tmp;
    v4i32 offset_vec, weight_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
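
    /* Wide copy: the outer loop below walks the row in 16-column tiles
     * (width >> 4) while the inner loop retires 4 rows per iteration; each
     * tile is widened to 16 bits, shifted into the HEVC intermediate
     * precision, then weighted/rounded/clipped by the shared macros.
     */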
    for (cnt = (width >> 4); cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
            v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
            v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
            v4i32 dst0_l, dst1_l, dst2_l, dst3_l;

            LD_SB4(src0_ptr_tmp, src_stride, src0, src1, src2, src3);
            src0_ptr_tmp += (4 * src_stride);
            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
            LD_SH4(src1_ptr_tmp + 8, src2_stride, in4, in5, in6, in7);
            src1_ptr_tmp += (4 * src2_stride);

            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       tmp0, tmp1, tmp2, tmp3);
            ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       tmp4, tmp5, tmp6, tmp7);
            SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
            SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
                               weight_vec, rnd_vec, offset_vec,
                               dst0_r, dst1_r, dst2_r, dst3_r,
                               dst0_l, dst1_l, dst2_l, dst3_l);
                            dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);
                               weight_vec, rnd_vec, offset_vec,
                               dst0_r, dst1_r, dst2_r, dst3_r,
                               dst0_l, dst1_l, dst2_l, dst3_l);
                            dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);
                                  src1_ptr, src2_stride,
                                  dst, dst_stride, height, weight0,
                                  weight1, offset0, offset1, rnd_val, 16);

                                  src1_ptr, src2_stride,
                                  dst, dst_stride, height, weight0,
                                  weight1, offset0, offset1, rnd_val, 16);
                          src1_ptr + 16, src2_stride,
                          dst + 16, dst_stride, height, weight0,
                          weight1, offset0, offset1, rnd_val);

                                  src1_ptr, src2_stride,
                                  dst, dst_stride, height, weight0,
                                  weight1, offset0, offset1, rnd_val, 32);

                                  src1_ptr, src2_stride,
                                  dst, dst_stride, height, weight0,
                                  weight1, offset0, offset1, rnd_val, 48);

                                  src1_ptr, src2_stride,
                                  dst, dst_stride, height, weight0,
                                  weight1, offset0, offset1, rnd_val, 64);
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 in0, in1, in2, in3;
    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
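
    /* The eight 8-bit taps are loaded as four halfwords and splatted with
     * SPLATI_H4_SH, so each filtN vector holds one adjacent tap *pair* per
     * halfword; DPADD_SB then multiplies the byte pairs gathered by the
     * VSHF masks against those tap pairs, accumulating the 8-tap FIR in
     * four dot-product steps.
     */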
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
                           weight_vec, rnd_vec, offset_vec,
                           dst0_r, dst1_r, dst0_l, dst1_l);
        ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v4i32 dst0_r, dst1_r, dst0_l, dst1_l, dst2_r, dst3_r, dst2_l, dst3_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
                           weight_vec, rnd_vec, offset_vec,
                           dst0_r, dst1_r, dst2_r, dst3_r,
                           dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
                            dst, dst_stride, filter, height,
                            weight0, weight1, offset0, offset1, rnd_val);
                            dst + 8, dst_stride, filter, height,
                            weight0, weight1, offset0, offset1, rnd_val);
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
                           weight_vec, rnd_vec, offset_vec,
                           dst0_r, dst1_r, dst2_r, dst3_r,
                           dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2;
    v4i32 dst0_r, dst1_r, dst2_r, dst0_l, dst1_l, dst2_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src0_ptr = src0_ptr - 3;
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        in2 = LD_SH(src1_ptr + 16);
        src1_ptr += src2_stride;

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
                           weight_vec, rnd_vec, offset_vec,
                           dst0_r, dst1_r, dst0_l, dst1_l);
        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
                                 (v8i16) weight_vec);
        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
                                 (v8i16) weight_vec);
        dst_val0 = __msa_copy_u_d((v2i64) dst2_r, 0);
        SD(dst_val0, dst + 16);
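
        /* The last 8 of the 24 output pixels per row do not fill a whole
         * vector store: the packed tail is pulled out of the vector with
         * __msa_copy_u_d and written with SD at dst + 16.
         */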
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
                           weight_vec, rnd_vec, offset_vec,
                           dst0_r, dst1_r, dst2_r, dst3_r,
                           dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, 16);
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    for (loop_cnt = height; loop_cnt--;) {
        LD_SB3(src0_ptr, 16, src0, src1, src2);
        src3 = LD_SB(src0_ptr + 40);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        in2 = LD_SH(src1_ptr + 16);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
                           weight_vec, rnd_vec, offset_vec,
                           dst0_r, dst1_r, dst0_l, dst1_l);
        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
                                 (v8i16) weight_vec);
        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
                                 (v8i16) weight_vec);
        dst_val0 = __msa_copy_u_d((v2i64) dst2_r, 0);
        SD(dst_val0, dst + 16);

        LD_SH2(src1_ptr + 24, 8, in3, in4);
        in5 = LD_SH(src1_ptr + 40);
        src1_ptr += src2_stride;
                           weight_vec, rnd_vec, offset_vec,
                           dst3_r, dst4_r, dst3_l, dst4_l);
        dst5_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst5_r,
                                 (v8i16) weight_vec);
        dst5_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst5_l,
                                 (v8i16) weight_vec);
        dst_val0 = __msa_copy_u_d((v2i64) dst3_r, 0);
        SD(dst_val0, dst + 24);
        ST_SW(dst4_r, dst + 32);
    int16_t *src1_ptr_tmp;
    uint32_t loop_cnt, cnt;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    for (loop_cnt = height; loop_cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;

        for (cnt = 2; cnt--;) {
            LD_SB2(src0_ptr_tmp, 16, src0, src1);
            src2 = LD_SB(src0_ptr_tmp + 24);
            LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);

            VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst0, dst0, dst0, dst0);
            VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                       vec0, vec1, vec2, vec3);
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst1, dst1, dst1, dst1);
            VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst2, dst2, dst2, dst2);
            VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst3, dst3, dst3, dst3);
                               weight_vec, rnd_vec, offset_vec,
                               dst0_r, dst1_r, dst2_r, dst3_r,
                               dst0_l, dst1_l, dst2_l, dst3_l);
                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
            ST_SW2(dst0_r, dst1_r, dst_tmp, 16);

        src0_ptr += src_stride;
        src1_ptr += src2_stride;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v4i32 dst10_r, dst32_r, dst54_r, dst76_r;
    v4i32 dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
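
    /* For 4-wide columns the vertical filter keeps two rows per register:
     * the byte interleaves above build rowN/rowN+1 pairs and ILVR_D3_SB
     * doubles them up, so each 8-tap dot product in the loop below produces
     * two output rows at once and the loop can retire 8 rows per iteration.
     */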
    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);

        DPADD_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt1,
                     filt2, filt3, dst10, dst10, dst10, dst10);
                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
                           weight_vec, rnd_vec, offset_vec,
                           dst10_r, dst32_r, dst54_r, dst76_r,
                           dst10_l, dst32_l, dst54_l, dst76_l);
                        dst54_l, dst54_r, dst76_l, dst76_r, dst10_r, dst54_r);
        ST4x8_UB(dst10_r, dst54_r, dst, dst_stride);
        dst += (8 * dst_stride);

        src4332 = src12111110;
        src6554 = src14131312;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
                     filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
                     filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
                     filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
                     filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
                           weight_vec, rnd_vec, offset_vec,
                           dst0_r, dst1_r, dst2_r, dst3_r,
                           dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 tmp0, tmp1, tmp2;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v16i8 src2110, src4332, src6554, src8776;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst0_l, dst1_l, dst2_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= (3 * src_stride);
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src7, src8);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
        in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);

        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
        src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
                     filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
                     filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
                     filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
                           weight_vec, rnd_vec, offset_vec,
                           dst0_r, dst1_r, dst0_l, dst1_l);
        dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
                                 (v8i16) weight_vec);
        dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
                                 (v8i16) weight_vec);
        ST4x2_UB(dst2_r, dst + 8, dst_stride);
        dst += (2 * dst_stride);
    int16_t *src1_ptr_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    src0_ptr -= (3 * src_stride);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    for (cnt = (width >> 4); cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;

        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);

        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 1); loop_cnt--;) {
            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
            src0_ptr_tmp += (2 * src_stride);
            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
            src1_ptr_tmp += (2 * src2_stride);

            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
                         filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
                         filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
                         filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
                         filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
                               weight_vec, rnd_vec, offset_vec,
                               dst0_r, dst1_r, dst2_r, dst3_r,
                               dst0_l, dst1_l, dst2_l, dst3_l);
                            dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);
                                   src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height,
                                   weight0, weight1, offset0, offset1,

                                   src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height,
                                   weight0, weight1, offset0, offset1,
                            src1_ptr + 16, src2_stride,
                            dst + 16, dst_stride, filter, height,
                            weight0, weight1, offset0, offset1, rnd_val);

                                   src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height,
                                   weight0, weight1, offset0, offset1,

                                   src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height,
                                   weight0, weight1, offset0, offset1,

                                   src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height,
                                   weight0, weight1, offset0, offset1,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
    v4i32 dst0_r, dst1_r;
    v4i32 weight_vec0, weight_vec1, offset_vec, rnd_vec;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
    v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };

    src0_ptr -= ((3 * src_stride) + 3);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
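
    /* Separable 8-tap filtering: the horizontal pass runs in the byte
     * domain with DPADD_SB, but the vertical pass operates on 16-bit
     * intermediates, so the y taps are sign-extended to halfwords here
     * (clti_s_b builds the sign mask, ilvr_b merges it in) and splatted as
     * 32-bit tap pairs for 16 x 16 -> 32 bit multiply-accumulates.
     */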
    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec0 = __msa_fill_w(weight0);
    weight_vec1 = __msa_fill_w(weight1);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                 dst30, dst30, dst30, dst30);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                 dst41, dst41, dst41, dst41);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                 dst52, dst52, dst52, dst52);
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                 dst63, dst63, dst63, dst63);

    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
               dst10_r, dst21_r, dst32_r);
    dst43_r = __msa_ilvl_h(dst41, dst30);
    dst54_r = __msa_ilvl_h(dst52, dst41);
    dst65_r = __msa_ilvl_h(dst63, dst52);
    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
    for (loop_cnt = height >> 1; loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src7, src8);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        src1_ptr += (2 * src2_stride);
        in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);

        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst87, dst87, dst87, dst87);
        dst76_r = __msa_ilvr_h(dst87, dst66);
                              filt_h0, filt_h1, filt_h2, filt_h3);
        dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
                              filt_h0, filt_h1, filt_h2, filt_h3);

        tmp1 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp1, (v8i16) weight_vec0);
        tmp2 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp2, (v8i16) weight_vec0);
        tmp1 += dst0_r * weight_vec1;
        tmp2 += dst1_r * weight_vec1;
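
        /* Weighting in the hv path: the vertical pass already produced
         * 32-bit values, so the packed-weight dpadd trick used elsewhere no
         * longer covers both terms in one step.  One prediction term is
         * weighted through dpadd against weight_vec0 (the weight sits in
         * the low halfword of each word lane) and the other is folded in
         * with a full 32-bit multiply-accumulate against weight_vec1.
         */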
        dst += (2 * dst_stride);

        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
                                       const int8_t *filter_x,
                                       const int8_t *filter_y,
    uint32_t loop_cnt, cnt;
    int16_t *src1_ptr_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v4i32 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
    v4i32 weight_vec0, weight_vec1, offset_vec, rnd_vec;

    src0_ptr -= ((3 * src_stride) + 3);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec0 = __msa_fill_w(weight0);
    weight_vec1 = __msa_fill_w(weight1);
    rnd_vec = __msa_fill_w(rnd_val + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
    for (cnt = width >> 3; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;

        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst6, dst6, dst6, dst6);

        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_r, dst32_r, dst54_r, dst21_r);
        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_l, dst32_l, dst54_l, dst21_l);
        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
        for (loop_cnt = height >> 1; loop_cnt--;) {
            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
            src0_ptr_tmp += 2 * src_stride;
            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
            src1_ptr_tmp += (2 * src2_stride);

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst7, dst7, dst7, dst7);
                                  filt_h0, filt_h1, filt_h2, filt_h3);
                                  filt_h0, filt_h1, filt_h2, filt_h3);

            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst8, dst8, dst8, dst8);
                                  filt_h0, filt_h1, filt_h2, filt_h3);
                                  filt_h0, filt_h1, filt_h2, filt_h3);

            tmp0 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp0,
                                   (v8i16) weight_vec0);
            tmp1 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp1,
                                   (v8i16) weight_vec0);
            tmp2 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp2,
                                   (v8i16) weight_vec0);
            tmp3 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp3,
                                   (v8i16) weight_vec0);
            tmp0 += (dst0_r * weight_vec1);
            tmp1 += (dst0_l * weight_vec1);
            tmp2 += (dst1_r * weight_vec1);
            tmp3 += (dst1_l * weight_vec1);

            ST8x2_UB(dst0_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                        src1_ptr, src2_stride,
                                        dst, dst_stride, filter_x, filter_y,
                                        height, weight0, weight1, offset0,
                                        offset1, rnd_val, 8);

                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                        src1_ptr, src2_stride,
                                        dst, dst_stride, filter_x, filter_y,
                                        height, weight0, weight1, offset0,
                                        offset1, rnd_val, 8);
                                   src1_ptr + 8, src2_stride,
                                   dst + 8, dst_stride, filter_x, filter_y,
                                   height, weight0, weight1, offset0, offset1,

                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                        src1_ptr, src2_stride,
                                        dst, dst_stride, filter_x, filter_y,
                                        height, weight0, weight1, offset0,
                                        offset1, rnd_val, 16);

                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                        src1_ptr, src2_stride,
                                        dst, dst_stride, filter_x, filter_y,
                                        height, weight0, weight1, offset0,
                                        offset1, rnd_val, 24);

                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                        src1_ptr, src2_stride,
                                        dst, dst_stride, filter_x, filter_y,
                                        height, weight0, weight1, offset0,
                                        offset1, rnd_val, 32);

                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                        src1_ptr, src2_stride,
                                        dst, dst_stride, filter_x, filter_y,
                                        height, weight0, weight1, offset0,
                                        offset1, rnd_val, 48);

                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                        src1_ptr, src2_stride,
                                        dst, dst_stride, filter_x, filter_y,
                                        height, weight0, weight1, offset0,
                                        offset1, rnd_val, 64);
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
    v16i8 mask1, vec0, vec1;
    v4i32 dst0_r, dst0_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    LD_SB2(src0_ptr, src_stride, src0, src1);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);

    dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
    dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
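
    /* The 4-tap horizontal kernels reuse the same weighting epilogue; for
     * 4-wide blocks mask0 gathers pixel pairs from two source registers at
     * once (shuffle indices 16..20 address the second register), so a
     * single VSHF covers two rows.
     */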
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
    v8i16 in0, in1, in2, in3;
    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
                       weight_vec, rnd_vec, offset_vec,
                       dst0_r, dst1_r, dst0_l, dst1_l);
    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src0, src1, src2, src3, src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
                           weight_vec, rnd_vec, offset_vec,
                           dst0_r, dst1_r, dst2_r, dst3_r,
                           dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (8 * dst_stride);
                                 dst, dst_stride, filter, height,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (4 == height) {
                                 dst, dst_stride, filter, height,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (0 == (height % 8)) {
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height,
                                      weight0, weight1, offset0, offset1,
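
    /* Height dispatch for the 4-wide horizontal kernel: dedicated 4x2 and
     * 4x4 routines above handle the short blocks, and the multiple-of-8
     * loop version covers everything else, mirroring the copy kernels.
     */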
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 in0, in1, in2, in3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
                           weight_vec, rnd_vec, offset_vec,
                           dst0_r, dst1_r, dst2_r, dst3_r,
                           dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, vec0, vec1;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    LD_SB2(src0_ptr, src_stride, src0, src1);
    LD_SH2(src1_ptr, src2_stride, in0, in1);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
                       weight_vec, rnd_vec, offset_vec,
                       dst0_r, dst1_r, dst0_l, dst1_l);
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);
    LD_SH2(src1_ptr, src2_stride, in4, in5);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
                       weight_vec, rnd_vec, offset_vec,
                       dst0_r, dst1_r, dst2_r, dst3_r,
                       dst0_l, dst1_l, dst2_l, dst3_l);
                       weight_vec, rnd_vec, offset_vec,
                       dst4_r, dst5_r, dst4_l, dst5_l);
                     dst2_l, dst2_r, dst3_l, dst3_r,
                     dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
    dst += (4 * dst_stride);
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 in0, in1, in2, in3;
    v8i16 dst0, dst1, dst2, dst3;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
                           weight_vec, rnd_vec, offset_vec,
                           dst0_r, dst1_r, dst2_r, dst3_r,
                           dst0_l, dst1_l, dst2_l, dst3_l);
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
                                 dst, dst_stride, filter, height,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (6 == height) {
                                 dst, dst_stride, filter, height,
                                 weight0, weight1, offset0, offset1, rnd_val);
    } else if (0 == (height % 4)) {
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height,
                                      weight0, weight1, offset0, offset1,
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask2 = {
        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
    };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
    v4i32 weight_vec, offset_vec, rnd_vec;

    filter_vec = LD_SH(filter);

    offset = (offset0 + offset1) << rnd_val;
    weight0 = weight0 & 0x0000FFFF;
    weight = weight0 | (weight1 << 16);

    const_vec = __msa_ldi_h(128);
    offset_vec = __msa_fill_w(offset);
    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val + 1);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
                           weight_vec, rnd_vec, offset_vec,
                           dst0_r, dst1_r, dst2_r, dst3_r,
                           dst0_l, dst1_l, dst2_l, dst3_l);
                           weight_vec, rnd_vec, offset_vec,
                           dst4_r, dst5_r, dst4_l, dst5_l);
                         dst2_l, dst2_r, dst3_l, dst3_r,
                         dst4_l, dst4_r, dst5_l, dst5_r,
                         dst0_r, dst1_r, dst2_r);
        ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
        dst += (4 * dst_stride);
3000 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
3001 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3003 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3005 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3007 v8i16 filter_vec, const_vec;
3008 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3009 v4i32 weight_vec, offset_vec, rnd_vec;
3013 filter_vec = LD_SH(filter);
3016 offset = (offset0 + offset1) << rnd_val;
3017 weight0 = weight0 & 0x0000FFFF;
3018 weight = weight0 | (weight1 << 16);
3020 const_vec = __msa_ldi_h(128);
3022 offset_vec = __msa_fill_w(offset);
3023 weight_vec = __msa_fill_w(weight);
3024 rnd_vec = __msa_fill_w(rnd_val + 1);
3028 for (loop_cnt = (height >> 2); loop_cnt--;) {
3029 LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
3030 LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
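/* Each 16-wide row is fetched as two overlapping vectors (offsets 0 and
 * +8) so both 8-pixel halves carry the right-hand context the 4-tap
 * window needs. */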
3031 src0_ptr += (4 * src_stride);
3032 LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
3033 LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
3034 src1_ptr += (4 * src2_stride);
3037 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3040 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3043 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3046 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3049 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3052 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3055 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3058 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3063 weight_vec, rnd_vec, offset_vec,
3064 dst0_r, dst1_r, dst2_r, dst3_r,
3065 dst0_l, dst1_l, dst2_l, dst3_l);
3068 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3069 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3070 dst += (2 * dst_stride);
3074 weight_vec, rnd_vec, offset_vec,
3075 dst0_r, dst1_r, dst2_r, dst3_r,
3076 dst0_l, dst1_l, dst2_l, dst3_l);
3079 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3080 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3081 dst += (2 * dst_stride);
3104 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3105 v16i8 mask1, mask2, mask3;
3107 v8i16 dst0, dst1, dst2, dst3;
3108 v8i16 in0, in1, in2, in3, in4, in5;
3109 v8i16 filter_vec, const_vec;
3110 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3111 v4i32 weight_vec, offset_vec, rnd_vec;
3115 filter_vec = LD_SH(filter);
3118 offset = (offset0 + offset1) << rnd_val;
3119 weight0 = weight0 & 0x0000FFFF;
3120 weight = weight0 | (weight1 << 16);
3122 const_vec = __msa_ldi_h(128);
3124 offset_vec = __msa_fill_w(offset);
3125 weight_vec = __msa_fill_w(weight);
3126 rnd_vec = __msa_fill_w(rnd_val + 1);
3132 for (loop_cnt = (height >> 1); loop_cnt--;) {
3133 LD_SB2(src0_ptr, src_stride, src0, src2);
3134 LD_SB2(src0_ptr + 16, src_stride, src1, src3);
3135 src0_ptr += (2 * src_stride);
3136 LD_SH2(src1_ptr, src2_stride, in0, in2);
3137 LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
3138 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
3139 src1_ptr += (2 * src2_stride);
3142 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3145 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3148 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3151 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3156 weight_vec, rnd_vec, offset_vec,
3157 dst0_r, dst1_r, dst2_r, dst3_r,
3158 dst0_l, dst1_l, dst2_l, dst3_l);
3161 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3162 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3163 dst += (2 * dst_stride);
3165 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3168 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3172 weight_vec, rnd_vec, offset_vec,
3173 dst0_r, dst1_r, dst0_l, dst1_l);
3176 ST8x2_UB(dst0_r, dst_tmp, dst_stride);
3177 dst_tmp += (2 * dst_stride);
3199 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3200 v16i8 mask1, mask2, mask3;
3201 v8i16 dst0, dst1, dst2, dst3;
3203 v8i16 in0, in1, in2, in3;
3204 v8i16 filter_vec, const_vec;
3205 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3206 v4i32 weight_vec, offset_vec, rnd_vec;
3210 filter_vec = LD_SH(filter);
3213 offset = (offset0 + offset1) << rnd_val;
3214 weight0 = weight0 & 0x0000FFFF;
3215 weight = weight0 | (weight1 << 16);
3217 const_vec = __msa_ldi_h(128);
3219 offset_vec = __msa_fill_w(offset);
3220 weight_vec = __msa_fill_w(weight);
3221 rnd_vec = __msa_fill_w(rnd_val + 1);
3227 for (loop_cnt = height; loop_cnt--;) {
3228 LD_SB2(src0_ptr, 16, src0, src1);
3229 src2 = LD_SB(src0_ptr + 24);
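/* One 32-pixel row per iteration: src0/src1 cover bytes 0-31, and the
 * overlapping load at +24 supplies the tail bytes that the 4-tap window
 * of the last eight outputs reaches into. */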
3230 src0_ptr += src_stride;
3231 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
3232 src1_ptr += src2_stride;
3235 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3238 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3241 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3244 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3249 weight_vec, rnd_vec, offset_vec,
3250 dst0_r, dst1_r, dst2_r, dst3_r,
3251 dst0_l, dst1_l, dst2_l, dst3_l);
3254 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3255 ST_SW2(dst0_r, dst1_r, dst, 16);
3275 v16i8 src0, src1, src2, src3, src4;
3276 v8i16 in0, in1, dst10;
3277 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
3278 v4i32 dst10_r, dst10_l;
3280 v8i16 filter_vec, const_vec;
3281 v4i32 weight_vec, offset_vec, rnd_vec;
3283 src0_ptr -= src_stride;
3285 offset = (offset0 + offset1) << rnd_val;
3286 weight0 = weight0 & 0x0000FFFF;
3287 weight = weight0 | (weight1 << 16);
3289 const_vec = __msa_ldi_h(128);
3291 offset_vec = __msa_fill_w(offset);
3292 weight_vec = __msa_fill_w(weight);
3293 rnd_vec = __msa_fill_w(rnd_val + 1);
3295 filter_vec = LD_SH(filter);
3298 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3299 src0_ptr += (3 * src_stride);
3300 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3301 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3302 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
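/* xori with 128 re-biases the unsigned pixels into signed range so the
 * signed dot-product intrinsics apply; const_vec (initialized from 128)
 * carries the compensating bias back into the accumulators. */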
3303 LD_SB2(src0_ptr, src_stride, src3, src4);
3304 src0_ptr += (2 * src_stride);
3305 LD_SH2(src1_ptr, src2_stride, in0, in1);
3306 src1_ptr += (2 * src2_stride);
3308 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
3309 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3310 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
3311 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
3314 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3317 dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
3318 dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
3324 ST4x2_UB(dst10_r, dst, dst_stride);
3342 v16i8 src0, src1, src2, src3, src4, src5, src6;
3343 v8i16 in0, in1, in2, in3;
3344 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3345 v16i8 src2110, src4332, src6554;
3347 v4i32 dst10_r, dst32_r, dst10_l, dst32_l;
3349 v8i16 filter_vec, const_vec;
3350 v4i32 weight_vec, offset_vec, rnd_vec;
3352 src0_ptr -= src_stride;
3354 offset = (offset0 + offset1) << rnd_val;
3355 weight0 = weight0 & 0x0000FFFF;
3356 weight = weight0 | (weight1 << 16);
3358 const_vec = __msa_ldi_h(128);
3360 offset_vec = __msa_fill_w(offset);
3361 weight_vec = __msa_fill_w(weight);
3362 rnd_vec = __msa_fill_w(rnd_val + 1);
3364 filter_vec = LD_SH(filter);
3367 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3368 src0_ptr += (3 * src_stride);
3369 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3370 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3371 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3373 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3374 src0_ptr += (4 * src_stride);
3375 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3376 src1_ptr += (4 * src2_stride);
3378 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3379 src32_r, src43_r, src54_r, src65_r);
3380 ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
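/* For 4-wide columns, two interleaved row pairs are packed per register
 * (ilvr_d), so each DPADD_SB2_SH filters two output rows at once. */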
3384 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3386 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
3389 weight_vec, rnd_vec, offset_vec,
3390 dst10_r, dst32_r, dst10_l, dst32_l);
3393 ST4x4_UB(dst10_r, dst10_r, 0, 1, 2, 3, dst, dst_stride);
3394 dst += (4 * dst_stride);
3413 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
3414 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3415 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3416 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3417 v16i8 src2110, src4332, src6554, src8776;
3418 v8i16 dst10, dst32, dst54, dst76;
3419 v4i32 dst10_r, dst32_r, dst54_r, dst76_r;
3420 v4i32 dst10_l, dst32_l, dst54_l, dst76_l;
3422 v8i16 filter_vec, const_vec;
3423 v4i32 weight_vec, offset_vec, rnd_vec;
3425 src0_ptr -= src_stride;
3427 offset = (offset0 + offset1) << rnd_val;
3428 weight0 = weight0 & 0x0000FFFF;
3429 weight = weight0 | (weight1 << 16);
3431 const_vec = __msa_ldi_h(128);
3433 offset_vec = __msa_fill_w(offset);
3434 weight_vec = __msa_fill_w(weight);
3435 rnd_vec = __msa_fill_w(rnd_val + 1);
3437 filter_vec = LD_SH(filter);
3440 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3441 src0_ptr += (3 * src_stride);
3442 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3443 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3444 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3446 for (loop_cnt = (height >> 3); loop_cnt--;) {
3447 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3448 src0_ptr += (6 * src_stride);
3449 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3450 src1_ptr += (8 * src2_stride);
3455 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3456 src32_r, src43_r, src54_r, src65_r);
3457 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3458 ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3459 src4332, src6554, src8776);
3463 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3465 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
3467 DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
3469 LD_SB2(src0_ptr, src_stride, src9, src2);
3470 src0_ptr += (2 * src_stride);
3471 ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3472 src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3473 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
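/* src2 now holds the newest row and src2110 the freshly interleaved
 * bottom pair, so the next iteration's vertical history is in place. */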
3476 DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
3479 weight_vec, rnd_vec, offset_vec,
3480 dst10_r, dst32_r, dst54_r, dst76_r,
3481 dst10_l, dst32_l, dst54_l, dst76_l);
3484 dst54_l, dst54_r, dst76_l, dst76_r, dst10_r, dst54_r);
3485 ST4x8_UB(dst10_r, dst54_r, dst, dst_stride);
3486 dst += (8 * dst_stride);
3506 dst, dst_stride, filter, height,
3507 weight0, weight1, offset0, offset1, rnd_val);
3508 } else if (4 == height) {
3510 dst, dst_stride, filter, height,
3511 weight0, weight1, offset0, offset1, rnd_val);
3512 } else if (0 == (height % 8)) {
3514 src1_ptr, src2_stride,
3515 dst, dst_stride, filter, height,
3516 weight0, weight1, offset0, offset1,
3537 v16i8 src0, src1, src2, src3, src4;
3538 v8i16 in0, in1, in2, in3;
3539 v16i8 src10_r, src32_r, src21_r, src43_r;
3540 v8i16 tmp0, tmp1, tmp2, tmp3;
3542 v8i16 filter_vec, const_vec;
3543 v4i32 weight_vec, offset_vec, rnd_vec;
3544 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3546 src0_ptr -= src_stride;
3548 offset = (offset0 + offset1) << rnd_val;
3549 weight0 = weight0 & 0x0000FFFF;
3550 weight = weight0 | (weight1 << 16);
3552 const_vec = __msa_ldi_h(128);
3554 offset_vec = __msa_fill_w(offset);
3555 weight_vec = __msa_fill_w(weight);
3556 rnd_vec = __msa_fill_w(rnd_val + 1);
3558 filter_vec = LD_SH(filter);
3561 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3562 src0_ptr += (3 * src_stride);
3564 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3566 for (loop_cnt = (height >> 2); loop_cnt--;) {
3567 LD_SB2(src0_ptr, src_stride, src3, src4);
3568 src0_ptr += (2 * src_stride);
3569 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3570 src1_ptr += (4 * src2_stride);
3572 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3575 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3577 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3579 LD_SB2(src0_ptr, src_stride, src1, src2);
3580 src0_ptr += (2 * src_stride);
3582 ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3585 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
3587 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
3590 weight_vec, rnd_vec, offset_vec,
3591 dst0_r, dst1_r, dst2_r, dst3_r,
3592 dst0_l, dst1_l, dst2_l, dst3_l);
3595 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3596 ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
3597 dst += (4 * dst_stride);
3616 v16i8 src0, src1, src2, src3, src4;
3617 v8i16 in0, in1, tmp0, tmp1;
3618 v16i8 src10_r, src32_r, src21_r, src43_r;
3620 v8i16 filter_vec, const_vec;
3621 v4i32 weight_vec, offset_vec, rnd_vec;
3622 v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
3624 src0_ptr -= src_stride;
3626 offset = (offset0 + offset1) << rnd_val;
3627 weight0 = weight0 & 0x0000FFFF;
3628 weight = weight0 | (weight1 << 16);
3630 const_vec = __msa_ldi_h(128);
3632 offset_vec = __msa_fill_w(offset);
3633 weight_vec = __msa_fill_w(weight);
3634 rnd_vec = __msa_fill_w(rnd_val + 1);
3636 filter_vec = LD_SH(filter);
3639 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3640 src0_ptr += (3 * src_stride);
3642 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3644 LD_SB2(src0_ptr, src_stride, src3, src4);
3645 LD_SH2(src1_ptr, src2_stride, in0, in1);
3647 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3650 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3652 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3654 weight_vec, rnd_vec, offset_vec,
3655 dst0_r, dst1_r, dst0_l, dst1_l);
3676 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3677 v8i16 in0, in1, in2, in3, in4, in5;
3678 v16i8 src10_r, src32_r, src54_r, src76_r;
3679 v16i8 src21_r, src43_r, src65_r, src87_r;
3680 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3682 v8i16 filter_vec, const_vec;
3683 v4i32 weight_vec, offset_vec, rnd_vec;
3684 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
3685 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
3687 src0_ptr -= src_stride;
3689 offset = (offset0 + offset1) << rnd_val;
3690 weight0 = weight0 & 0x0000FFFF;
3691 weight = weight0 | (weight1 << 16);
3693 const_vec = __msa_ldi_h(128);
3695 offset_vec = __msa_fill_w(offset);
3696 weight_vec = __msa_fill_w(weight);
3697 rnd_vec = __msa_fill_w(rnd_val + 1);
3699 filter_vec = LD_SH(filter);
3702 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3703 src0_ptr += (3 * src_stride);
3705 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3707 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3708 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
3710 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3711 src32_r, src43_r, src54_r, src65_r);
3712 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3715 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3717 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3719 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, tmp2, tmp2);
3721 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, tmp3, tmp3);
3723 DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, tmp4, tmp4);
3725 DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, tmp5, tmp5);
3728 weight_vec, rnd_vec, offset_vec,
3729 dst0_r, dst1_r, dst2_r, dst3_r,
3730 dst0_l, dst1_l, dst2_l, dst3_l);
3732 weight_vec, rnd_vec, offset_vec,
3733 dst4_r, dst5_r, dst4_l, dst5_l);
3736 dst2_l, dst2_r, dst3_l, dst3_r,
3737 dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
3738 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
3739 dst += (4 * dst_stride);
3759 v16i8 src0, src1, src2, src3, src4;
3760 v8i16 in0, in1, in2, in3;
3761 v16i8 src10_r, src32_r, src21_r, src43_r;
3762 v8i16 tmp0, tmp1, tmp2, tmp3;
3764 v8i16 filter_vec, const_vec;
3765 v4i32 weight_vec, offset_vec, rnd_vec;
3766 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3768 src0_ptr -= src_stride;
3770 offset = (offset0 + offset1) << rnd_val;
3771 weight0 = weight0 & 0x0000FFFF;
3772 weight = weight0 | (weight1 << 16);
3774 const_vec = __msa_ldi_h(128);
3776 offset_vec = __msa_fill_w(offset);
3777 weight_vec = __msa_fill_w(weight);
3778 rnd_vec = __msa_fill_w(rnd_val + 1);
3780 filter_vec = LD_SH(filter);
3783 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3784 src0_ptr += (3 * src_stride);
3786 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3788 for (loop_cnt = (height >> 2); loop_cnt--;) {
3789 LD_SB2(src0_ptr, src_stride, src3, src4);
3790 src0_ptr += (2 * src_stride);
3791 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3792 src1_ptr += (4 * src2_stride);
3794 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3797 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3799 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3801 LD_SB2(src0_ptr, src_stride, src1, src2);
3802 src0_ptr += (2 * src_stride);
3804 ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3807 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
3809 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
3812 weight_vec, rnd_vec, offset_vec,
3813 dst0_r, dst1_r, dst2_r, dst3_r,
3814 dst0_l, dst1_l, dst2_l, dst3_l);
3817 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3818 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
3819 dst += (4 * dst_stride);
3839 dst, dst_stride, filter, height,
3840 weight0, weight1, offset0, offset1, rnd_val);
3841 } else if (6 == height) {
3843 dst, dst_stride, filter, height,
3844 weight0, weight1, offset0, offset1, rnd_val);
3847 src1_ptr, src2_stride,
3848 dst, dst_stride, filter, height,
3849 weight0, weight1, offset0, offset1,
3870 v16i8 src0, src1, src2, src3, src4, src5;
3871 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3872 v16i8 src10_r, src32_r, src21_r, src43_r;
3873 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3874 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
3875 v16i8 src2110, src4332;
3877 v8i16 filter_vec, const_vec;
3878 v4i32 weight_vec, offset_vec, rnd_vec;
3879 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
3880 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
3882 src0_ptr -= src_stride;
3884 offset = (offset0 + offset1) << rnd_val;
3885 weight0 = weight0 & 0x0000FFFF;
3886 weight = weight0 | (weight1 << 16);
3888 const_vec = __msa_ldi_h(128);
3890 offset_vec = __msa_fill_w(offset);
3891 weight_vec = __msa_fill_w(weight);
3892 rnd_vec = __msa_fill_w(rnd_val + 1);
3894 filter_vec = LD_SH(filter);
3897 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3898 src0_ptr += (3 * src_stride);
3900 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3901 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3902 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
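/* The left interleaves cover columns 8-15; ilvr_d keeps only their low
 * halves, i.e. columns 8-11 of both row pairs, the 4-column tail of this
 * 12-wide block. */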
3904 for (loop_cnt = (height >> 2); loop_cnt--;) {
3905 LD_SB2(src0_ptr, src_stride, src3, src4);
3906 src0_ptr += (2 * src_stride);
3907 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3908 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
3909 src1_ptr += (4 * src2_stride);
3913 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3914 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3915 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
3918 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3920 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3922 DPADD_SB2_SH(src2110, src4332, filt0, filt1, tmp4, tmp4);
3924 LD_SB2(src0_ptr, src_stride, src5, src2);
3925 src0_ptr += (2 * src_stride);
3927 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3928 ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
3929 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
3932 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
3934 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
3936 DPADD_SB2_SH(src4332, src2110, filt0, filt1, tmp5, tmp5);
3939 weight_vec, rnd_vec, offset_vec,
3940 dst0_r, dst1_r, dst2_r, dst3_r,
3941 dst0_l, dst1_l, dst2_l, dst3_l);
3943 weight_vec, rnd_vec, offset_vec,
3944 dst4_r, dst5_r, dst4_l, dst5_l);
3947 dst2_l, dst2_r, dst3_l, dst3_r,
3948 dst4_l, dst4_r, dst5_l, dst5_r,
3949 dst0_r, dst1_r, dst2_r);
3950 ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
3951 dst += (4 * dst_stride);
3971 v16i8 src0, src1, src2, src3, src4, src5;
3972 v8i16 in0, in1, in2, in3;
3973 v16i8 src10_r, src32_r, src21_r, src43_r;
3974 v16i8 src10_l, src32_l, src21_l, src43_l;
3975 v8i16 tmp0, tmp1, tmp2, tmp3;
3977 v8i16 filter_vec, const_vec;
3978 v4i32 weight_vec, offset_vec, rnd_vec;
3979 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3981 src0_ptr -= src_stride;
3983 offset = (offset0 + offset1) << rnd_val;
3984 weight0 = weight0 & 0x0000FFFF;
3985 weight = weight0 | (weight1 << 16);
3987 const_vec = __msa_ldi_h(128);
3989 offset_vec = __msa_fill_w(offset);
3990 weight_vec = __msa_fill_w(weight);
3991 rnd_vec = __msa_fill_w(rnd_val + 1);
3993 filter_vec = LD_SH(filter);
3996 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3997 src0_ptr += (3 * src_stride);
3999 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4000 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4002 for (loop_cnt = (height >> 2); loop_cnt--;) {
4003 LD_SB2(src0_ptr, src_stride, src3, src4);
4004 src0_ptr += (2 * src_stride);
4005 LD_SH2(src1_ptr, src2_stride, in0, in1);
4006 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4007 src1_ptr += (2 * src2_stride);
4009 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4010 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4013 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
4015 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
4017 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp2, tmp2);
4019 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp3, tmp3);
4023 weight_vec, rnd_vec, offset_vec,
4024 dst0_r, dst1_r, dst2_r, dst3_r,
4025 dst0_l, dst1_l, dst2_l, dst3_l);
4027 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
4028 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
4029 dst += (2 * dst_stride);
4030 LD_SB2(src0_ptr, src_stride, src5, src2);
4031 src0_ptr += (2 * src_stride);
4033 LD_SH2(src1_ptr, src2_stride, in0, in1);
4034 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4035 src1_ptr += (2 * src2_stride);
4037 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4038 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4041 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
4043 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
4045 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp2, tmp2);
4047 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp3, tmp3);
4050 weight_vec, rnd_vec, offset_vec,
4051 dst0_r, dst1_r, dst2_r, dst3_r,
4052 dst0_l, dst1_l, dst2_l, dst3_l);
4055 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
4056 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
4057 dst += (2 * dst_stride);
4077 v16i8 src0, src1, src2, src3, src4, src5;
4078 v16i8 src6, src7, src8, src9, src10, src11;
4079 v8i16 in0, in1, in2, in3, in4, in5;
4080 v16i8 src10_r, src32_r, src76_r, src98_r;
4081 v16i8 src10_l, src32_l, src21_l, src43_l;
4082 v16i8 src21_r, src43_r, src87_r, src109_r;
4083 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4085 v8i16 filter_vec, const_vec;
4086 v4i32 weight_vec, offset_vec, rnd_vec;
4087 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
4088 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
4090 src0_ptr -= src_stride;
4092 offset = (offset0 + offset1) << rnd_val;
4093 weight0 = weight0 & 0x0000FFFF;
4094 weight = weight0 | (weight1 << 16);
4096 const_vec = __msa_ldi_h(128);
4098 offset_vec = __msa_fill_w(offset);
4099 weight_vec = __msa_fill_w(weight);
4100 rnd_vec = __msa_fill_w(rnd_val + 1);
4102 filter_vec = LD_SH(filter);
4106 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4108 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4109 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4111 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4112 src0_ptr += (3 * src_stride);
4114 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
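/* Columns 0-15 keep both right and left interleaves; the +16 loads only
 * need the right interleave, since just 8 columns (16-23) remain. */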
4116 for (loop_cnt = (height >> 2); loop_cnt--;) {
4118 LD_SB2(src0_ptr, src_stride, src3, src4);
4119 LD_SH2(src1_ptr, src2_stride, in0, in1);
4120 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4122 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4123 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4126 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4127 src0_ptr += (2 * src_stride);
4128 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4129 src1_ptr += (2 * src2_stride);
4131 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4134 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
4136 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
4138 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
4140 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
4143 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
4145 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
4149 weight_vec, rnd_vec, offset_vec,
4150 dst0_r, dst1_r, dst2_r, dst3_r,
4151 dst0_l, dst1_l, dst2_l, dst3_l);
4154 weight_vec, rnd_vec, offset_vec,
4155 dst4_r, dst5_r, dst4_l, dst5_l);
4158 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
4161 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
4162 ST8x2_UB(dst4_r, dst + 16, dst_stride);
4163 dst += (2 * dst_stride);
4166 LD_SB2(src0_ptr, src_stride, src5, src2);
4167 LD_SH2(src1_ptr, src2_stride, in0, in1);
4168 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4170 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4171 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4173 LD_SB2(src0_ptr + 16, src_stride, src11, src8);
4174 src0_ptr += (2 * src_stride);
4175 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4176 src1_ptr += (2 * src2_stride);
4178 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
4181 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
4183 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp4, tmp4);
4185 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
4187 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp5, tmp5);
4190 DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, tmp2, tmp2);
4192 DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, tmp3, tmp3);
4196 weight_vec, rnd_vec, offset_vec,
4197 dst0_r, dst1_r, dst2_r, dst3_r,
4198 dst0_l, dst1_l, dst2_l, dst3_l);
4201 weight_vec, rnd_vec, offset_vec,
4202 dst4_r, dst5_r, dst4_l, dst5_l);
4205 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
4209 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
4210 ST8x2_UB(dst4_r, dst + 16, dst_stride);
4211 dst += (2 * dst_stride);
4232 v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
4233 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4234 v16i8 src10_r, src32_r, src76_r, src98_r;
4235 v16i8 src21_r, src43_r, src87_r, src109_r;
4236 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4237 v16i8 src10_l, src32_l, src76_l, src98_l;
4238 v16i8 src21_l, src43_l, src87_l, src109_l;
4240 v8i16 filter_vec, const_vec;
4241 v4i32 weight_vec, offset_vec, rnd_vec;
4242 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
4243 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l, dst6_l, dst7_l;
4245 src0_ptr -= src_stride;
4247 offset = (offset0 + offset1) << rnd_val;
4248 weight0 = weight0 & 0x0000FFFF;
4249 weight = weight0 | (weight1 << 16);
4251 const_vec = __msa_ldi_h(128);
4253 offset_vec = __msa_fill_w(offset);
4254 weight_vec = __msa_fill_w(weight);
4255 rnd_vec = __msa_fill_w(rnd_val + 1);
4257 filter_vec = LD_SH(filter);
4261 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4263 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4264 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4266 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4267 src0_ptr += (3 * src_stride);
4269 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4270 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
4272 for (loop_cnt = (height >> 1); loop_cnt--;) {
4274 LD_SB2(src0_ptr, src_stride, src3, src4);
4275 LD_SH2(src1_ptr, src2_stride, in0, in1);
4276 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4278 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4279 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4283 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
4285 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
4287 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
4289 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
4293 weight_vec, rnd_vec, offset_vec,
4294 dst0_r, dst1_r, dst2_r, dst3_r,
4295 dst0_l, dst1_l, dst2_l, dst3_l);
4298 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
4299 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
4300 dst += (2 * dst_stride);
4309 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4310 src0_ptr += (2 * src_stride);
4311 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4312 LD_SH2(src1_ptr + 24, src2_stride, in6, in7);
4313 src1_ptr += (2 * src2_stride);
4315 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4316 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
4319 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
4321 DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, tmp6, tmp6);
4323 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
4325 DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, tmp7, tmp7);
4329 weight_vec, rnd_vec, offset_vec,
4330 dst4_r, dst5_r, dst6_r, dst7_r,
4331 dst4_l, dst5_l, dst6_l, dst7_l);
4335 dst5_l, dst5_r, dst7_l, dst7_r, dst4_r, dst5_r);
4336 ST_SW2(dst4_r, dst5_r, dst_tmp, dst_stride);
4337 dst_tmp += (2 * dst_stride);
4353 const int8_t *filter_x,
4354 const int8_t *filter_y,
4364 v16i8 src0, src1, src2, src3, src4;
4366 v4i32 filt_h0, filt_h1;
4367 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4369 v8i16 filter_vec, const_vec;
4370 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4371 v8i16 dst0, dst1, dst2, dst3, dst4;
4372 v4i32 dst0_r, dst1_r, dst0_l;
4373 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4374 v4i32 weight_vec, offset_vec, rnd_vec;
4376 src0_ptr -= (src_stride + 1);
4378 filter_vec = LD_SH(filter_x);
4381 filter_vec = LD_SH(filter_y);
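/* filter_y holds 8-bit taps: clti_s_b builds a sign mask (0xFF where a
 * tap is negative) and ilvr_b interleaves it in, sign-extending each tap
 * to 16 bits before the vertical taps are split out. */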
4382 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4383 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4389 offset = (offset0 + offset1) << rnd_val;
4390 weight0 = weight0 & 0x0000FFFF;
4391 weight = weight0 | (weight1 << 16);
4393 const_vec = __msa_ldi_h(128);
4395 offset_vec = __msa_fill_w(offset);
4396 weight_vec = __msa_fill_w(weight);
4397 rnd_vec = __msa_fill_w(rnd_val + 1);
4399 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4400 src0_ptr += (3 * src_stride);
4403 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4404 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4405 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
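/* Interleave the horizontal-stage outputs of rows 0-2 pairwise; the
 * vertical taps then run as 16-bit dot products over these pairs. */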
4412 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
4414 LD_SB2(src0_ptr, src_stride, src3, src4);
4415 LD_SH2(src1_ptr, src2_stride, in0, in1);
4416 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
4419 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4422 dst32_r = __msa_ilvr_h(dst3, dst2);
4426 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4429 dst43_r = __msa_ilvr_h(dst4, dst3);
4432 dst1_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
4435 dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
4436 dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
4451 const int8_t *filter_x,
4452 const int8_t *filter_y,
4461 v8i16 in0, in1, in2, in3;
4462 v16i8 src0, src1, src2, src3, src4, src5, src6;
4464 v4i32 filt_h0, filt_h1;
4465 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4467 v8i16 filter_vec, const_vec;
4468 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4469 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4471 v4i32 dst0_l, dst1_l;
4472 v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
4473 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4474 v4i32 weight_vec, offset_vec, rnd_vec;
4476 src0_ptr -= (src_stride + 1);
4478 filter_vec = LD_SH(filter_x);
4481 filter_vec = LD_SH(filter_y);
4482 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4483 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4489 offset = (offset0 + offset1) << rnd_val;
4490 weight0 = weight0 & 0x0000FFFF;
4491 weight = weight0 | (weight1 << 16);
4493 const_vec = __msa_ldi_h(128);
4495 offset_vec = __msa_fill_w(offset);
4496 weight_vec = __msa_fill_w(weight);
4497 rnd_vec = __msa_fill_w(rnd_val + 1);
4499 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4500 src0_ptr += (3 * src_stride);
4503 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4504 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4505 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4512 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
4514 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
4515 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4519 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4522 dst32_r = __msa_ilvr_h(dst3, dst2);
4526 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4529 dst43_r = __msa_ilvr_h(dst4, dst3);
4533 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4536 dst10_r = __msa_ilvr_h(dst5, dst4);
4540 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4543 dst21_r = __msa_ilvr_h(dst2, dst5);
4546 PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
4548 weight_vec, rnd_vec, offset_vec,
4549 dst0_r, dst1_r, dst0_l, dst1_l);
4552 ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
4561 const int8_t *filter_x,
4562 const int8_t *filter_y,
4572 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4573 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4575 v4i32 filt_h0, filt_h1;
4576 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4578 v8i16 filter_vec, const_vec;
4579 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4580 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
4581 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
4582 v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
4583 v8i16 tmp0, tmp1, tmp2, tmp3;
4584 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4585 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4586 v4i32 weight_vec, offset_vec, rnd_vec;
4588 src0_ptr -= (src_stride + 1);
4590 filter_vec = LD_SH(filter_x);
4593 filter_vec = LD_SH(filter_y);
4594 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4595 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4601 offset = (offset0 + offset1) << rnd_val;
4602 weight0 = weight0 & 0x0000FFFF;
4603 weight = weight0 | (weight1 << 16);
4605 const_vec = __msa_ldi_h(128);
4607 offset_vec = __msa_fill_w(offset);
4608 weight_vec = __msa_fill_w(weight);
4609 rnd_vec = __msa_fill_w(rnd_val + 1);
4611 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4612 src0_ptr += (3 * src_stride);
4615 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4616 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4617 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4624 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
4626 for (loop_cnt = height >> 3; loop_cnt--;) {
4627 LD_SB8(src0_ptr, src_stride,
4628 src3, src4, src5, src6, src7, src8, src9, src10);
4629 src0_ptr += (8 * src_stride);
4630 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
4631 src1_ptr += (8 * src2_stride);
4636 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4639 dst32_r = __msa_ilvr_h(dst3, dst2);
4643 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4646 dst43_r = __msa_ilvr_h(dst4, dst3);
4650 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4653 dst54_r = __msa_ilvr_h(dst5, dst4);
4657 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4660 dst65_r = __msa_ilvr_h(dst6, dst5);
4663 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4666 dst76_r = __msa_ilvr_h(dst7, dst6);
4669 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
4672 dst87_r = __msa_ilvr_h(dst8, dst7);
4675 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
4678 dst10_r = __msa_ilvr_h(dst9, dst8);
4681 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
4684 dst21_r = __msa_ilvr_h(dst2, dst9);
4688 dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
4691 weight_vec, rnd_vec, offset_vec,
4692 dst0_r, dst1_r, dst2_r, dst3_r,
4693 dst0_l, dst1_l, dst2_l, dst3_l);
4696 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
4697 ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
4698 dst += (8 * dst_stride);
4708 const int8_t *filter_x,
4709 const int8_t *filter_y,
4719 dst, dst_stride, filter_x, filter_y,
4720 height, weight0, weight1, offset0, offset1,
4722 } else if (4 == height) {
4724 dst, dst_stride, filter_x, filter_y,
4725 height, weight0, weight1, offset0, offset1,
4727 } else if (0 == (height % 8)) {
4729 src1_ptr, src2_stride,
4730 dst, dst_stride, filter_x, filter_y,
4731 height, weight0, weight1,
4732 offset0, offset1, rnd_val);
4742 const int8_t *filter_x,
4743 const int8_t *filter_y,
4753 v16i8 src0, src1, src2, src3, src4, src5, src6;
4754 v8i16 in0, in1, in2, in3;
4756 v4i32 filt_h0, filt_h1;
4757 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4759 v8i16 filter_vec, const_vec;
4760 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4761 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4762 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4763 v8i16 tmp0, tmp1, tmp2, tmp3;
4764 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4765 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4766 v4i32 weight_vec, offset_vec, rnd_vec;
4768 src0_ptr -= (src_stride + 1);
4770 filter_vec = LD_SH(filter_x);
4773 filter_vec = LD_SH(filter_y);
4774 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4775 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4781 offset = (offset0 + offset1) << rnd_val;
4782 weight0 = weight0 & 0x0000FFFF;
4783 weight = weight0 | (weight1 << 16);
4785 const_vec = __msa_ldi_h(128);
4787 offset_vec = __msa_fill_w(offset);
4788 weight_vec = __msa_fill_w(weight);
4789 rnd_vec = __msa_fill_w(rnd_val + 1);
4791 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4792 src0_ptr += (3 * src_stride);
4795 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4796 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4797 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4808 for (loop_cnt = height >> 2; loop_cnt--;) {
4809 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
4810 src0_ptr += (4 * src_stride);
4811 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4812 src1_ptr += (4 * src2_stride);
4815 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4825 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4835 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4844 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4854 dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
4857 weight_vec, rnd_vec, offset_vec,
4858 dst0_r, dst1_r, dst2_r, dst3_r,
4859 dst0_l, dst1_l, dst2_l, dst3_l);
4862 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
4863 ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
4864 dst += (4 * dst_stride);
4874 const int8_t *filter_x,
4875 const int8_t *filter_y,
4884 v16i8 src0, src1, src2, src3, src4;
4886 v4i32 filt_h0, filt_h1;
4887 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4889 v8i16 filter_vec, const_vec;
4890 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4891 v8i16 dst0, dst1, dst2, dst3, dst4;
4893 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4894 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4895 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4897 v4i32 weight_vec, offset_vec, rnd_vec;
4899 src0_ptr -= (src_stride + 1);
4901 filter_vec = LD_SH(filter_x);
4904 filter_vec = LD_SH(filter_y);
4905 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4906 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4912 offset = (offset0 + offset1) << rnd_val;
4913 weight0 = weight0 & 0x0000FFFF;
4914 weight = weight0 | (weight1 << 16);
4916 const_vec = __msa_ldi_h(128);
4918 offset_vec = __msa_fill_w(offset);
4919 weight_vec = __msa_fill_w(weight);
4920 rnd_vec = __msa_fill_w(rnd_val + 1);
4922 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4923 src0_ptr += (3 * src_stride);
4926 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4927 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4928 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4939 LD_SB2(src0_ptr, src_stride, src3, src4);
4941 LD_SH2(src1_ptr, src2_stride, in0, in1);
4944 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4953 tmp0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
4955 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4964 tmp1 = __msa_pckev_h((v8i16) dst1_l, (v8i16) dst1_r);
4967 weight_vec, rnd_vec, offset_vec,
4968 dst0_r, dst1_r, dst0_l, dst1_l);
4979 const int8_t *filter_x,
4980 const int8_t *filter_y,
4989 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4991 v4i32 filt_h0, filt_h1;
4992 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4994 v8i16 filter_vec, const_vec;
4995 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4996 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
4997 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4998 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
4999 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
5000 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
5001 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
5002 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
5003 v8i16 in0, in1, in2, in3, in4, in5;
5004 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
5005 v4i32 weight_vec, offset_vec, rnd_vec;
5007 src0_ptr -= (src_stride + 1);
5009 filter_vec = LD_SH(filter_x);
5012 filter_vec = LD_SH(filter_y);
5013 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
5014 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
5020 offset = (offset0 + offset1) << rnd_val;
5021 weight0 = weight0 & 0x0000FFFF;
5022 weight = weight0 | (weight1 << 16);
5024 const_vec = __msa_ldi_h(128);
5026 offset_vec = __msa_fill_w(offset);
5027 weight_vec = __msa_fill_w(weight);
5028 rnd_vec = __msa_fill_w(rnd_val + 1);
5030 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
5031 src0_ptr += (3 * src_stride);
5034 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5035 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5036 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5047 LD_SB2(src0_ptr, src_stride, src3, src4);
5048 src0_ptr += (2 * src_stride);
5050 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
5051 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5060 tmp0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
5062 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
5071 tmp1 = __msa_pckev_h((v8i16) dst1_l, (v8i16) dst1_r);
5073 LD_SB2(src0_ptr, src_stride, src5, src6);
5074 src0_ptr += (2 * src_stride);
5077 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
5086 tmp2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
5088 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
5097 tmp3 = __msa_pckev_h((v8i16) dst3_l, (v8i16) dst3_r);
5101 weight_vec, rnd_vec, offset_vec,
5102 dst0_r, dst1_r, dst2_r, dst3_r,
5103 dst0_l, dst1_l, dst2_l, dst3_l);
5106 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
5107 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
5108 dst += (4 * dst_stride);
5110 LD_SB2(src0_ptr, src_stride, src7, src8);
5113 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
5122 tmp4 = __msa_pckev_h((v8i16) dst4_l, (v8i16) dst4_r);
5124 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
5133 tmp5 = __msa_pckev_h((v8i16) dst5_l, (v8i16) dst5_r);
5136 weight_vec, rnd_vec, offset_vec,
5137 dst4_r, dst5_r, dst4_l, dst5_l);
5149 const int8_t *filter_x,
5150 const int8_t *filter_y,
5163 int16_t *src1_ptr_tmp;
5165 v16i8 src0, src1, src2, src3, src4, src5, src6;
5166 v8i16 in0, in1, in2, in3;
5168 v4i32 filt_h0, filt_h1;
5169 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5171 v8i16 filter_vec, const_vec;
5172 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
5173 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
5174 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5175 v8i16 tmp0, tmp1, tmp2, tmp3;
5176 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
5177 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
5178 v4i32 weight_vec, offset_vec, rnd_vec;
5180 src0_ptr -= (src_stride + 1);
5182 filter_vec = LD_SH(filter_x);
5185 filter_vec = LD_SH(filter_y);
5186 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
5187 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
5193 offset = (offset0 + offset1) << rnd_val;
5194 weight0 = weight0 & 0x0000FFFF;
5195 weight = weight0 | (weight1 << 16);
5197 const_vec = __msa_ldi_h(128);
5199 offset_vec = __msa_fill_w(offset);
5200 weight_vec = __msa_fill_w(weight);
5201 rnd_vec = __msa_fill_w(rnd_val + 1);
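/* The block is processed in 8-column stripes; each stripe walks the full
 * height through its own src/dst temp pointers. */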
5203 for (cnt = width >> 3; cnt--;) {
5204 src0_ptr_tmp = src0_ptr;
5205 src1_ptr_tmp = src1_ptr;
5208 LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
5209 src0_ptr_tmp += (3 * src_stride);
5212 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5213 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5214 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5225 for (loop_cnt = height >> 2; loop_cnt--;) {
5226 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5227 src0_ptr_tmp += (4 * src_stride);
5228 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5229 src1_ptr_tmp += (4 * src2_stride);
5232 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5242 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
5252 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
5262 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
5273 dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
5276 weight_vec, rnd_vec, offset_vec,
5277 dst0_r, dst1_r, dst2_r, dst3_r,
5278 dst0_l, dst1_l, dst2_l, dst3_l);
5281 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
5282 ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride);
5283 dst_tmp += (4 * dst_stride);
5298 const int8_t *filter_x,
5299 const int8_t *filter_y,
5309 dst, dst_stride, filter_x, filter_y,
5310 height, weight0, weight1, offset0, offset1,
5312 } else if (6 == height) {
5314 dst, dst_stride, filter_x, filter_y,
5315 height, weight0, weight1, offset0, offset1,
5317 } else if (0 == (height % 4)) {
5319 src1_ptr, src2_stride,
5320 dst, dst_stride, filter_x, filter_y,
5322 weight1, offset0, offset1, rnd_val, 8);
5332 const int8_t *filter_x,
5333 const int8_t *filter_y,
5342 src1_ptr, src2_stride,
5344 filter_x, filter_y, height, weight0,
5345 weight1, offset0, offset1, rnd_val, 8);
5348 dst + 8, dst_stride, filter_x, filter_y,
5349 height, weight0, weight1, offset0,
5359 const int8_t *filter_x,
5360 const int8_t *filter_y,
5369 src1_ptr, src2_stride,
5371 filter_x, filter_y, height, weight0,
5372 weight1, offset0, offset1, rnd_val, 16);
5381 const int8_t *filter_x,
5382 const int8_t *filter_y,
5391 src1_ptr, src2_stride,
5393 filter_x, filter_y, height, weight0,
5394 weight1, offset0, offset1, rnd_val, 24);
5403 const int8_t *filter_x,
5404 const int8_t *filter_y,
5413 src1_ptr, src2_stride,
5415 filter_x, filter_y, height, weight0,
5416 weight1, offset0, offset1, rnd_val, 32);
5419 #define BI_W_MC_COPY(WIDTH) \
5420 void ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
5421 ptrdiff_t dst_stride, \
5423 ptrdiff_t src_stride, \
5424 int16_t *src_16bit, \
5435 int shift = 14 + 1 - 8; \
5436 int log2Wd = denom + shift - 1; \
5438 hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \
5439 dst, dst_stride, height, \
5440 weight0, weight1, offset0, \
5456 #define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
5457 void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
5463 int16_t *src_16bit, \
5474 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
5476 int shift = 14 + 1 - 8; \
5477 int log2Wd = denom + shift - 1; \
5479 hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, \
5480 src_16bit, MAX_PB_SIZE, \
5483 weight0, weight1, offset0, \
5496 BI_W_MC(qpel, v, 4, 8, vt, my);
5497 BI_W_MC(qpel, v, 8, 8, vt, my);
5498 BI_W_MC(qpel, v, 12, 8, vt, my);
5499 BI_W_MC(qpel, v, 16, 8, vt, my);
5500 BI_W_MC(qpel, v, 24, 8, vt, my);
5501 BI_W_MC(qpel, v, 32, 8, vt, my);
5502 BI_W_MC(qpel, v, 48, 8, vt, my);
5503 BI_W_MC(qpel, v, 64, 8, vt, my);
5513 BI_W_MC(epel, v, 4, 4, vt, my);
5514 BI_W_MC(epel, v, 8, 4, vt, my);
5515 BI_W_MC(epel, v, 6, 4, vt, my);
5516 BI_W_MC(epel, v, 12, 4, vt, my);
5517 BI_W_MC(epel, v, 16, 4, vt, my);
5518 BI_W_MC(epel, v, 24, 4, vt, my);
5519 BI_W_MC(epel, v, 32, 4, vt, my);
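/* Each BI_W_MC expansion emits one ff_hevc_put_hevc_bi_w_* entry point;
 * FILT_DIR (my here, for the vertical kernels) indexes the filter table
 * row passed down to the matching hevc_*_biwgt_*w_msa implementation. */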
5523 #define BI_W_MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \
5524 void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
5530 int16_t *src_16bit, \
5541 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
5542 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
5544 int shift = 14 + 1 - 8; \
5545 int log2Wd = denom + shift - 1; \
5547 hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, \
5548 src_16bit, MAX_PB_SIZE, \
5550 filter_x, filter_y, \
5551 height, weight0, weight1, \
5552 offset0, offset1, log2Wd); \
#define HEVC_PCK_SW_SB2(in0, in1, out)
static void hevc_hz_biwgt_8t_64w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_biwgt_copy_64w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_8t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define XORI_B8_128_SB(...)
static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_8t_48w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_4t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define XORI_B2_128_SB(...)
#define XORI_B3_128_SB(...)
static void hevc_hv_biwgt_8t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_8t_48w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_4t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define DPADD_SB4_SH(...)
#define SPLATI_H2_SH(...)
static void hevc_vt_biwgt_4t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define HEVC_PCK_SW_SB4(in0, in1, in2, in3, out)
#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3,wgt, rnd, offset,out0_r, out1_r, out2_r, out3_r,out0_l, out1_l, out2_l, out3_l)
static void hevc_hv_biwgt_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void filter(int16_t *output, ptrdiff_t out_stride, int16_t *low, ptrdiff_t low_stride, int16_t *high, ptrdiff_t high_stride, int len, uint8_t clip)
#define SPLATI_H4_SH(...)
static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define HEVC_PCK_SW_SB12(in0, in1, in2, in3, in4, in5, in6, in7,in8, in9, in10, in11, out0, out1, out2)
#define CLIP_SW_0_255(in)
static void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_8t_48w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width)
static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define ST8x2_UB(in, pdst, stride)
static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define XORI_B7_128_SB(...)
static void hevc_biwgt_copy_16multx4mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width)
#define XORI_B4_128_SB(...)
static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define DPADD_SB2_SH(...)
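/* DPADD_SB2_SH presumably performs two signed-byte dot-product
 * accumulations into halfword accumulators in one step; a plausible
 * expansion: */
static inline void dpadd_sb2_sh_sketch(v16i8 mult0, v16i8 mult1,
                                       v16i8 cnst0, v16i8 cnst1,
                                       v8i16 *out0, v8i16 *out1)
{
    /* each halfword lane accumulates the products of one adjacent byte pair */
    *out0 = __msa_dpadd_s_h(*out0, mult0, cnst0);
    *out1 = __msa_dpadd_s_h(*out1, mult1, cnst1);
}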
static void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width)
static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val, int32_t width)
#define SPLATI_W4_SW(...)
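/* SPLATI_W4_SW presumably broadcasts the four word lanes of one source
 * vector into four destination vectors; sketch assuming lane indices 0..3: */
static inline void splati_w4_sw_sketch(v4i32 in, v4i32 *out0, v4i32 *out1,
                                       v4i32 *out2, v4i32 *out3)
{
    *out0 = __msa_splati_w(in, 0);
    *out1 = __msa_splati_w(in, 1);
    *out2 = __msa_splati_w(in, 2);
    *out3 = __msa_splati_w(in, 3);
}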
static void hevc_hv_biwgt_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_4t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define HEVC_FILT_8TAP(in0, in1, in2, in3, filt0, filt1, filt2, filt3)
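/* HEVC_FILT_8TAP likely evaluates an 8-tap FIR as four signed-byte dot
 * products chained into one halfword accumulator; a hedged sketch: */
static inline v8i16 hevc_filt_8tap_sketch(v16i8 in0, v16i8 in1, v16i8 in2,
                                          v16i8 in3, v16i8 filt0, v16i8 filt1,
                                          v16i8 filt2, v16i8 filt3)
{
    v8i16 out;

    out = __msa_dotp_s_h(in0, filt0);        /* taps 0-1 */
    out = __msa_dpadd_s_h(out, in1, filt1);  /* + taps 2-3 */
    out = __msa_dpadd_s_h(out, in2, filt2);  /* + taps 4-5 */
    out = __msa_dpadd_s_h(out, in3, filt3);  /* + taps 6-7 */
    return out;
}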
static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define ST_SW2(in0, in1, pdst, stride)
#define BI_W_MC_HV(PEL, DIR, WIDTH, TAP, DIR1)
#define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)
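/* BI_W_MC plausibly stamps out one public entry point per (PEL, DIR, WIDTH)
 * combination and forwards to the matching width-specific helper above; the
 * entry-point name and parameter list below are illustrative, and FILT_DIR
 * (which would select the horizontal or vertical coefficient table) is left
 * out of this sketch: */
#define BI_W_MC_SKETCH(PEL, DIR, WIDTH, TAP, DIR1)                            \
static void put_hevc_bi_w_##PEL##_##DIR##WIDTH##_msa_sketch(                  \
        uint8_t *src, int32_t src_stride, int16_t *src_16bit,                 \
        int32_t src2_stride, uint8_t *dst, int32_t dst_stride,                \
        const int8_t *filter, int32_t height, int32_t w0, int32_t w1,         \
        int32_t o0, int32_t o1, int32_t rnd)                                  \
{                                                                             \
    hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,   \
                                                src2_stride, dst, dst_stride, \
                                                filter, height, w0, w1,       \
                                                o0, o1, rnd);                 \
}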
static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define HEVC_PCK_SW_SB8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1)
static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_8t_64w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_8t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_8t_64w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define SLLI_4V(in0, in1, in2, in3, shift)
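/* SLLI_4V presumably left-shifts four vectors in place by the same amount
 * (e.g. the << 6 that scales 8-bit pixels to the 14-bit intermediate range);
 * a sketch for v8i16 operands using GCC generic vector shifts: */
static inline void slli_4v_sketch(v8i16 *in0, v8i16 *in1,
                                  v8i16 *in2, v8i16 *in3, int32_t shift)
{
    *in0 = *in0 << shift;
    *in1 = *in1 << shift;
    *in2 = *in2 << shift;
    *in3 = *in3 << shift;
}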
static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define ST4x8_UB(in0, in1, pdst, stride)
#define ST6x4_UB(in0, in1, pdst, stride)
#define HEVC_FILT_4TAP(in0, in1, filt0, filt1)
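/* HEVC_FILT_4TAP likely evaluates a 4-tap FIR as two signed-byte dot
 * products accumulated into halfword lanes; a hedged sketch mirroring the
 * 8-tap structure above: */
static inline v8i16 hevc_filt_4tap_sketch(v16i8 in0, v16i8 in1,
                                          v16i8 filt0, v16i8 filt1)
{
    v8i16 out;

    out = __msa_dotp_s_h(in0, filt0);        /* taps 0-1 */
    out = __msa_dpadd_s_h(out, in1, filt1);  /* + taps 2-3 */
    return out;
}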
static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define ST8x4_UB(in0, in1, pdst, stride)
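/* ST8x4_UB plausibly stores four 8-byte rows taken from the double-word
 * lanes of two vectors; sketch assuming a 64-bit MIPS target (where
 * __msa_copy_u_d is available) and memcpy from <string.h> for the
 * unaligned stores: */
static inline void st8x4_ub_sketch(v16u8 in0, v16u8 in1,
                                   uint8_t *pdst, int32_t stride)
{
    uint64_t out0 = __msa_copy_u_d((v2i64) in0, 0);
    uint64_t out1 = __msa_copy_u_d((v2i64) in0, 1);
    uint64_t out2 = __msa_copy_u_d((v2i64) in1, 0);
    uint64_t out3 = __msa_copy_u_d((v2i64) in1, 1);

    memcpy(pdst + 0 * stride, &out0, 8);
    memcpy(pdst + 1 * stride, &out1, 8);
    memcpy(pdst + 2 * stride, &out2, 8);
    memcpy(pdst + 3 * stride, &out3, 8);
}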
static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0_r, out1_r, out0_l, out1_l)
#define XORI_B6_128_SB(...)
static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_vt_biwgt_8t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define ST12x4_UB(in0, in1, in2, pdst, stride)
#define BI_W_MC_COPY(WIDTH)
#define SPLATI_W2_SW(...)
static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_8t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_8t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
#define ST4x2_UB(in, pdst, stride)
static void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_8t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)
static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight0, int32_t weight1, int32_t offset0, int32_t offset1, int32_t rnd_val)