    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
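/*
 * The two rows above appear to be VSHF_B byte-selection patterns from
 * the (elided) shuffle-mask table: the first gathers the overlapping
 * byte pairs an 8-wide horizontal filter needs, the second packs two
 * 4-wide rows into one vector (indices >= 16 select bytes from the
 * second source operand of the shuffle).
 *
 * The HEVC_BI_RND_CLIP* macros below implement the bi-prediction output
 * stage shared by all functions in this file: saturating add of the two
 * 16-bit operands, rounding right shift, then a clip to the 8-bit range.
 */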
#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1)  \
{                                                                     \
    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                    \
    SRARI_H2_SH(out0, out1, rnd_val);                                 \
    CLIP_SH2_0_255(out0, out1);                                       \
}

#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3,                        \
                          vec0, vec1, vec2, vec3, rnd_val,           \
                          out0, out1, out2, out3)                    \
{                                                                    \
    HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1);    \
    HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3);    \
}

#define HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val,    \
                                   out0, out1)                       \
{                                                                    \
    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                   \
    SRARI_H2_SH(out0, out1, rnd_val);                                \
    CLIP_SH2_0_255_MAX_SATU(out0, out1);                             \
}

#define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,   \
                                   vec3, rnd_val, out0, out1, out2, out3)  \
{                                                                          \
    HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, out0, out1); \
    HEVC_BI_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, rnd_val, out2, out3); \
}
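/*
 * Scalar sketch of what one lane of the macros above computes, assuming
 * the rnd_val of 7 used throughout this file (8-bit bi-prediction):
 *
 *     int sum = av_clip_int16(vec + in);            // saturating add
 *     uint8_t out = av_clip_uint8((sum + 64) >> 7); // round, shift, clip
 *
 * In the bi-prediction "copy" functions that follow, vec is the 8-bit
 * reference block widened to 16 bits and shifted left by 6 (SLLI_4V) so
 * it matches the precision of the 16-bit operand loaded from src1_ptr.
 */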
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    uint64_t tpd0, tpd1, tpd2, tpd3;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst0, dst1, dst2, dst3;

        LW2(src0_ptr, src_stride, tp0, tp1);
        LD2(src1_ptr, src2_stride, tpd0, tpd1);
        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
        dst0 = __msa_srari_h(dst0, 7);
        dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
        ST_W2(dst0, 0, 1, dst, dst_stride);

        LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
                       dst3, 7, dst0, dst1, dst2, dst3);
            ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
            dst += (8 * dst_stride);

    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2, out3;
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
                   7, dst0, dst1, dst2, dst3);
                   7, dst4, dst5, dst6, dst7);
        ST_W2(out0, 0, 2, dst, dst_stride);
        ST_H2(out0, 2, 6, dst + 4, dst_stride);
        ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
        ST_W2(out2, 0, 2, dst, dst_stride);
        ST_H2(out2, 2, 6, dst + 4, dst_stride);
        ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
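/*
 * 6-wide rows have no native store, so each row goes out as a 4-byte
 * word (ST_W2) plus a 2-byte halfword (ST_H2) at column offset 4; the
 * element indices (0/2 for words, 2/6 for halfwords) select the lanes
 * of the packed result that correspond to each output row.
 */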
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2, out3;
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

        LD2(src0_ptr, src_stride, tp0, tp1);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_D2(out0, 0, 1, dst, dst_stride);

        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
                   7, dst0, dst1, dst2, dst3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += 4 * src_stride;
        LD2(src0_ptr, src_stride, tp0, tp1);
        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
                   7, dst0, dst1, dst2, dst3);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6,
                   in7);
            src1_ptr += (8 * src2_stride);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            SLLI_4V(dst4, dst5, dst6, dst7, 6);
                       dst3, 7, dst0, dst1, dst2, dst3);
                       dst7, 7, dst4, dst5, dst6, dst7);
            ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
            dst += (8 * dst_stride);

    v16u8 out0, out1, out2;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;

    for (loop_cnt = 4; loop_cnt--;) {
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, dst0, dst1,
                   dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
                   7, dst0, dst1, dst2, dst3);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

    v16u8 out0, out1, out2, out3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
                   dst1_l, 7, dst0_r, dst1_r, dst0_l, dst1_l);
                   dst3_l, 7, dst2_r, dst3_r, dst2_l, dst3_l);
        PCKEV_B2_UB(dst0_l, dst0_r, dst1_l, dst1_r, out0, out1);
        PCKEV_B2_UB(dst2_l, dst2_r, dst3_l, dst3_r, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);

    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
        src1_ptr += (4 * src2_stride);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
                   7, dst0, dst1, dst2, dst3);
                   7, dst4, dst5, dst6, dst7);
                   dst11, 7, dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
        dst += (4 * dst_stride);

    v16u8 out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 16, src2, src3);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
        src1_ptr += src2_stride;
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
                   7, dst0, dst1, dst2, dst3);
                   7, dst4, dst5, dst6, dst7);
        ST_UB2(out0, out1, dst, 16);
        ST_UB2(out2, out3, dst, 16);

    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0_ptr += src_stride;
        LD_SB3(src0_ptr, 16, src3, src4, src5);
        src0_ptr += src_stride;
        LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
        src1_ptr += src2_stride;
        LD_SH6(src1_ptr, 8, in6, in7, in8, in9, in10, in11);
        src1_ptr += src2_stride;
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
                   7, dst0, dst1, dst2, dst3);
                   7, dst4, dst5, dst6, dst7);
                   dst11, 7, dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        ST_UB2(out3, out4, dst, 16);
        ST_UB(out5, dst + 32);

    v16u8 out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (loop_cnt = height; loop_cnt--;) {
        src0_ptr += src_stride;
        LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += src2_stride;
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
                   7, dst0, dst1, dst2, dst3);
                   7, dst4, dst5, dst6, dst7);
        ST_UB4(out0, out1, out2, out3, dst, 16);
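/*
 * The 8-tap horizontal (qpel) functions below work on signed bytes: the
 * unsigned source is XORed with 128 (XORI_B macros, largely elided in
 * this excerpt) so that the DPADD_SB* signed dot products apply.  HEVC
 * 8-tap filters sum to 64, so the bias introduced is 128 * 64 = 8192,
 * which is exactly what const_vec = __msa_ldi_h(128) followed by the
 * (elided) const_vec <<= 6 re-adds into every accumulator.  Per lane:
 *
 *     sum = 8192;                                   // const_vec
 *     for (k = 0; k < 8; k++)
 *         sum += (src[x + k - 3] - 128) * filt[k];  // == conv(src)[x]
 */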
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filter_vec, const_vec;

    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
               src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src4, src5, src6, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src4, src5, src6, src7, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
                   dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
                   dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

    v16i8 vec0, vec1, vec2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v8i16 dst0, dst1, dst2;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = 8; loop_cnt--;) {
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt0);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt2);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt3);
        in1 = (v8i16) __msa_pckev_d((v2i64) in3, (v2i64) in1);
        dst2 = __msa_adds_s_h(in2, dst2);
        dst2 = __msa_srari_h(dst2, 7);
        tmp2 = __msa_copy_s_d((v2i64) dst0, 0);
        tmp0 = __msa_copy_s_w((v4i32) dst0, 2);
        tmp3 = __msa_copy_s_d((v2i64) dst1, 0);
        tmp1 = __msa_copy_s_w((v4i32) dst0, 3);

    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
                   dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);

    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2;
    v8i16 filter_vec, const_vec;

    src0_ptr = src0_ptr - 3;
    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height; loop_cnt--;) {
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        in2 = LD_SH(src1_ptr + 16);
        src1_ptr += src2_stride;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt1, dst0,
                     dst1, dst2, dst0);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt2, filt2, dst1,
                     dst2, dst0, dst1);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt3, filt3, filt3, dst2,
                     dst0, dst1, dst2);
        dst2 = __msa_adds_s_h(dst2, in2);
        dst2 = __msa_srari_h(dst2, 7);
        dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
        SD(dst_val0, dst + 16);

    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height; loop_cnt--;) {
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
                   dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        ST_SB2(tmp0, tmp1, dst, 16);

    v16i8 tmp0, tmp1, tmp2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = 64; loop_cnt--;) {
        src3 = LD_SB(src0_ptr + 40);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        ST_SB(tmp1, dst + 16);

        LD_SH2(src1_ptr + 32, 8, in4, in5);
        src1_ptr += src2_stride;
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4,
                     dst5, dst4, dst5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt3, filt3, dst4,
                     dst5, dst4, dst5);
        tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        ST_SB(tmp2, dst + 32);

    v16i8 src0, src1, src2, src3, src4, src5, tmp0, tmp1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height; loop_cnt--;) {
        src2 = LD_SB(src0_ptr + 24);
        LD_SB2(src0_ptr + 32, 16, src3, src4);
        src5 = LD_SB(src0_ptr + 56);
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
                   dst0, dst1, dst2, dst3, 7,
                   dst0, dst1, dst2, dst3);
        ST_SB2(tmp0, tmp1, dst, 16);

        LD_SH4(src1_ptr + 32, 8, in0, in1, in2, in3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
                   dst0, dst1, dst2, dst3, 7,
                   dst0, dst1, dst2, dst3);
        ST_SB2(tmp0, tmp1, dst + 32, 16);
        src1_ptr += src2_stride;
        src0_ptr += src_stride;
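/*
 * Vertical 8-tap section: pairs of adjacent rows are interleaved with
 * ILVR_B* (plus ILVL_B* for the high half of wide blocks) so that a
 * single DPADD dot product accumulates two filter taps at once; 4-wide
 * blocks additionally pack two row pairs per vector with ILVR_D*.
 */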
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
                   dst10, dst32, dst54, dst76, 7,
                   dst10, dst32, dst54, dst76);
        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
        ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        src4332 = src12111110;
        src6554 = src14131312;
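/*
 * Sliding-window reuse: the last interleaved row pairs of this block are
 * rotated into the head of the window (src4332/src6554 here, the rest
 * elided), so the next iteration only loads eight new rows for eight new
 * output rows instead of refilling the whole 8-tap history.
 */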
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);
                   dst0_r, dst1_r, dst2_r, dst3_r, 7,
                   dst0_r, dst1_r, dst2_r, dst3_r);
        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);
                     filt0, filt1, filt2, filt3,
                     dst0_l, dst0_l, dst0_l, dst0_l);
                     filt0, filt1, filt2, filt3,
                     dst1_l, dst1_l, dst1_l, dst1_l);
                   dst0_r, dst1_r, dst2_r, dst3_r, 7,
                   dst0_r, dst1_r, dst2_r, dst3_r);
        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

    int16_t *src1_ptr_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0_r, dst1_r;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;

        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);
                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 1); loop_cnt--;) {
            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
            src0_ptr_tmp += (2 * src_stride);
            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
            src1_ptr_tmp += (2 * src2_stride);
            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
                         filt0, filt1, filt2, filt3,
                         dst0_r, dst0_r, dst0_r, dst0_r);
                         filt0, filt1, filt2, filt3,
                         dst1_r, dst1_r, dst1_r, dst1_r);
                         filt0, filt1, filt2, filt3,
                         dst0_l, dst0_l, dst0_l, dst0_l);
                         filt0, filt1, filt2, filt3,
                         dst1_l, dst1_l, dst1_l, dst1_l);
                       dst0_r, dst1_r, dst0_l, dst1_l, 7,
                       dst0_r, dst1_r, dst0_l, dst1_l);
            PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
            ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);
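/*
 * 2-D (hv) 8-tap section: the horizontal filter runs first and the
 * 16-bit intermediates of the last seven rows stay in registers; the
 * vertical filter is then applied with HEVC_FILT_8TAP on interleaved
 * 16-bit pairs, giving 32-bit sums that are shifted right by 6 before
 * the usual bi-prediction add, srari-by-7 round and clip.
 */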
                              const int8_t *filter_x,
                              const int8_t *filter_y,

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0 = { 0 }, in1 = { 0 };
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
    v4i32 dst0, dst1, dst2, dst3;

    src0_ptr -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    filter_vec = LD_SH(filter_y);
    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
    const_vec = __msa_ldi_h(128);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);
    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += (2 * src2_stride);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += (2 * src2_stride);
        VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        dst76 = __msa_ilvr_h(dst97, dst66);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98 = __msa_ilvr_h(dst66, dst108);
        dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
        out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
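/*
 * dst66 appears to carry the newest horizontal result across iterations:
 * the high doubleword of dst108 is splatted into both halves so the next
 * group of rows can rebuild its interleaved tap pairs via __msa_ilvr_h()
 * without refiltering data that is already in registers.
 */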
                                         const int8_t *filter_x,
                                         const int8_t *filter_y,

    int16_t *src1_ptr_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst0_r, dst0_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;

    src0_ptr -= ((3 * src_stride) + 3);
    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    filter_vec = LD_SH(filter_y);
    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    for (cnt = width >> 3; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;

        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);
                   vec0, vec1, vec2, vec3);
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);

        for (loop_cnt = height; loop_cnt--;) {
            src7 = LD_SB(src0_ptr_tmp);
            src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
            src0_ptr_tmp += src_stride;
            in0 = LD_SH(src1_ptr_tmp);
            src1_ptr_tmp += src2_stride;
            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
                             filt_h0, filt_h1, filt_h2, filt_h3);
                             filt_h0, filt_h1, filt_h2, filt_h3);
            tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
            tmp = __msa_srari_h(tmp, 7);
            out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
            dst_tmp += dst_stride;

                           const int8_t *filter_x,
                           const int8_t *filter_y,
                                  dst, dst_stride, filter_x, filter_y,

                            const int8_t *filter_x,
                            const int8_t *filter_y,

    uint8_t *src0_ptr_tmp, *dst_tmp;
    int16_t *src1_ptr_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 in0, in1 = { 0 }, out0, out1, tmp, filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v4i32 dst0_r, dst0_l, tmp0, tmp1, tmp2, tmp3;

    src0_ptr -= ((3 * src_stride) + 3);
    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    filter_vec = LD_SH(filter_y);
    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    src0_ptr_tmp = src0_ptr;
    src1_ptr_tmp = src1_ptr;

    LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5,
           src6);
    src0_ptr_tmp += (7 * src_stride);
    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
               vec15);
    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
               vec3);
    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
               vec7);
    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);

    for (loop_cnt = 16; loop_cnt--;) {
        src7 = LD_SB(src0_ptr_tmp);
        src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
        src0_ptr_tmp += src_stride;
        in0 = LD_SH(src1_ptr_tmp);
        src1_ptr_tmp += src2_stride;
        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        tmp = __msa_srari_h(tmp, 7);
        out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
        dst_tmp += dst_stride;

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7,
               vec12, vec13, vec14, vec15);
    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += (2 * src2_stride);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += (2 * src2_stride);
        VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
                   vec7);
        dst76 = __msa_ilvr_h(dst97, dst66);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98 = __msa_ilvr_h(dst66, dst108);
        tmp0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        tmp1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        tmp2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        tmp3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        SRA_4V(tmp0, tmp1, tmp2, tmp3, 6);
        ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
        out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);

                            const int8_t *filter_x,
                            const int8_t *filter_y,
                                   dst, dst_stride, filter_x, filter_y,

                            const int8_t *filter_x,
                            const int8_t *filter_y,
                                   dst, dst_stride, filter_x, filter_y,

                            const int8_t *filter_x,
                            const int8_t *filter_y,
                                   dst, dst_stride, filter_x, filter_y,

                            const int8_t *filter_x,
                            const int8_t *filter_y,
                                   dst, dst_stride, filter_x, filter_y,

                            const int8_t *filter_x,
                            const int8_t *filter_y,
                                   dst, dst_stride, filter_x, filter_y,
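/*
 * From here on the 4-tap (epel/chroma) variants follow the same scheme
 * as the 8-tap code above, but with only two filter taps per direction
 * (filt0/filt1), smaller shuffle masks, and the same xori-128/const_vec
 * bias compensation and 7-bit round/clip output stage.
 */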
    v16i8 src0, src1, dst0, vec0, vec1;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    tmp0 = __msa_adds_s_h(tmp0, in0);
    tmp0 = __msa_srari_h(tmp0, 7);
    dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
    ST_W2(dst0, 0, 1, dst, dst_stride);

    v16i8 src0, src1, src2, src3, dst0, vec0, vec1;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, tmp0, tmp1,
                 tmp0, tmp1);
    dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 mask1, vec0, vec1, vec2, vec3;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src0, src1, src2, src3, src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0,
                     tmp1, tmp2, tmp3);
        VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, tmp0,
                     tmp1, tmp2, tmp3);
                   tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

    } else if (4 == height) {
                               src1_ptr, src2_stride,

    v8i16 in0, in1, in2, in3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
                   dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        ST_W2(dst0, 0, 2, dst, dst_stride);
        ST_H2(dst0, 2, 6, dst + 4, dst_stride);
        ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);

    v16i8 mask1, vec0, vec1, vec2, vec3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst0, dst1,
                 dst0, dst1);
    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    ST_D2(dst0, 0, 1, dst, dst_stride);

    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);
    LD_SH2(src1_ptr, src2_stride, in4, in5);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, dst1,
                 dst2, dst3);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, dst1,
                 dst2, dst3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4, dst5,
                 dst4, dst5);
               dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
    dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(dst2, 0, 1, dst + 4 * dst_stride, dst_stride);

    v8i16 in0, in1, in2, in3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
                   dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

    } else if (6 == height) {
    } else if (0 == (height % 4)) {
                               src1_ptr, src2_stride,

    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
                   dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(dst2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
    v8i16 in0, in1, in2, in3, dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr + 8, src_stride, src1, src3);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in2);
        LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
        src1_ptr += (2 * src2_stride);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
                   dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);

    int16_t *src1_ptr_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    src1_ptr_tmp = src1_ptr + 16;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
        LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
        src1_ptr += (4 * src2_stride);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask2, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst4,
                     dst5, dst6, dst7);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask3, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst4,
                     dst5, dst6, dst7);
                   dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
                   dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
                   dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);

        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
        src1_ptr_tmp += (4 * src2_stride);
        VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
                   dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);

    v8i16 in0, in1, in2, in3;
    v16i8 mask1, mask2, mask3;
    v8i16 dst0, dst1, dst2, dst3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = height; loop_cnt--;) {
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
                   dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        ST_SH2(dst0, dst1, dst, 16);
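/*
 * Vertical 4-tap: only two interleaved row pairs are live at any time
 * (e.g. src10_r/src32_r), so each output row needs a single
 * DPADD_SB2_SH with filt0/filt1; 4-wide blocks again pack two row pairs
 * per vector with __msa_ilvr_d() and undo the xori-128 signedness bias
 * through const_vec.
 */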
2913 v16i8
src0,
src1, src2, src3, src4;
2915 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
2918 v8i16 filter_vec, const_vec;
2920 src0_ptr -= src_stride;
2922 const_vec = __msa_ldi_h(128);
2929 src0_ptr += (3 * src_stride);
2932 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2933 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2935 LD_SB2(src0_ptr, src_stride, src3, src4);
2936 LD_SH2(src1_ptr, src2_stride, in0, in1);
2937 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2938 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2939 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2940 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2943 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2944 dst10 = __msa_adds_s_h(dst10, in0);
2945 dst10 = __msa_srari_h(dst10, 7);
2948 dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
2949 ST_W2(dst10, 0, 1, dst, dst_stride);
2961 v16i8
src0,
src1, src2, src3, src4, src5, src6;
2962 v8i16 in0, in1, in2, in3;
2963 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2964 v16i8 src2110, src4332, src6554;
2967 v8i16 filter_vec, const_vec;
2969 src0_ptr -= src_stride;
2971 const_vec = __msa_ldi_h(128);
2978 src0_ptr += (3 * src_stride);
2980 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2981 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2983 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
2984 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2986 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2987 src32_r, src43_r, src54_r, src65_r);
2988 ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
2992 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2994 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
2997 dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
2998 ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
3011 v16i8
src0,
src1, src2, src3, src4, src5;
3012 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3013 v16i8 src6, src7, src8, src9;
3014 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3015 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3016 v16i8 src2110, src4332, src6554, src8776;
3017 v8i16 dst10, dst32, dst54, dst76;
3019 v8i16 filter_vec, const_vec;
3021 src0_ptr -= src_stride;
3023 const_vec = __msa_ldi_h(128);
3030 src0_ptr += (3 * src_stride);
3032 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3033 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3035 for (loop_cnt = (
height >> 3); loop_cnt--;) {
3036 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3037 src0_ptr += (6 * src_stride);
3038 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3039 src1_ptr += (8 * src2_stride);
3042 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3043 src32_r, src43_r, src54_r, src65_r);
3044 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3045 ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3046 src4332, src6554, src8776);
3050 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3052 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
3054 DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
3056 LD_SB2(src0_ptr, src_stride, src9, src2);
3057 src0_ptr += (2 * src_stride);
3058 ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3059 src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3060 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3062 DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
3065 dst10, dst32, dst54, dst76, 7,
3066 dst10, dst32, dst54, dst76);
3068 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
3069 ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3070 dst += (8 * dst_stride);
3086 }
else if (4 ==
height) {
3091 src1_ptr, src2_stride,
3105 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3106 v8i16 in0, in1, in2, in3;
3107 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
3108 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3110 v8i16 filter_vec, const_vec;
3112 src0_ptr -= src_stride;
3114 const_vec = __msa_ldi_h(128);
3121 src0_ptr += (3 * src_stride);
3122 LD_SB2(src0_ptr, src_stride, src3, src4);
3123 src0_ptr += (2 * src_stride);
3124 LD_SB2(src0_ptr, src_stride, src5, src6);
3125 src0_ptr += (2 * src_stride);
3126 LD_SB2(src0_ptr, src_stride, src7, src8);
3127 src0_ptr += (2 * src_stride);
3128 LD_SB2(src0_ptr, src_stride, src9, src10);
3129 src0_ptr += (2 * src_stride);
3131 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3132 src1_ptr += (4 * src2_stride);
3141 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3144 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3146 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3148 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3151 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3153 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3156 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3157 dst0_r, dst1_r, dst2_r, dst3_r);
3159 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3160 ST_W2(dst0_r, 0, 2, dst, dst_stride);
3161 ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
3162 ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
3163 ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3164 dst += (4 * dst_stride);
3166 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3167 src1_ptr += (4 * src2_stride);
3168 ILVR_B2_SB(src7, src6, src8, src7, src32_r, src43_r);
3171 DPADD_SB2_SH(src54_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3173 DPADD_SB2_SH(src65_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3175 ILVR_B2_SB(src9, src8, src10, src9, src54_r, src65_r);
3178 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3180 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3183 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3184 dst0_r, dst1_r, dst2_r, dst3_r);
3186 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3187 ST_W2(dst0_r, 0, 2, dst, dst_stride);
3188 ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
3189 ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
3190 ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3191 dst += (4 * dst_stride);
3203 v16i8
src0,
src1, src2, src3, src4;
3204 v8i16 in0, in1, dst0_r, dst1_r;
3205 v16i8 src10_r, src32_r, src21_r, src43_r;
3207 v8i16 filter_vec, const_vec;
3209 src0_ptr -= src_stride;
3211 const_vec = __msa_ldi_h(128);
3218 src0_ptr += (3 * src_stride);
3222 LD_SB2(src0_ptr, src_stride, src3, src4);
3223 LD_SH2(src1_ptr, src2_stride, in0, in1);
3225 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3228 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3230 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3233 dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
3235 ST_D2(dst0_r, 0, 1, dst, dst_stride);
3247 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
3248 v8i16 in0, in1, in2, in3, in4, in5;
3249 v16i8 src10_r, src32_r, src54_r, src76_r;
3250 v16i8 src21_r, src43_r, src65_r, src87_r;
3251 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
3253 v8i16 filter_vec, const_vec;
3255 src0_ptr -= src_stride;
3257 const_vec = __msa_ldi_h(128);
3264 src0_ptr += (3 * src_stride);
3268 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3269 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
3271 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3272 src32_r, src43_r, src54_r, src65_r);
3273 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3276 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3278 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3280 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3282 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3284 DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r);
3286 DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r);
3288 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3289 dst0_r, dst1_r, dst2_r, dst3_r);
3292 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3293 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
3294 ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
3295 ST_D2(dst2_r, 0, 1, dst + 4 * dst_stride, dst_stride);
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);

        dst2_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
    if (2 == height) {
        hevc_vt_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter);
    } else if (6 == height) {
        hevc_vt_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter);
    } else {
        hevc_vt_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height);
    }
}
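/* Width 12: the left 8 columns use the right-interleaved vectors; the
 * remaining 4 columns of two consecutive rows are packed into one vector
 * (src2110/src4332/src6554) so a single dot product covers both rows. */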
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332, src6554;
    v8i16 dst0_l, dst1_l, filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (1 * src_stride);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SB2(src0_ptr, src_stride, src5, src6);
        src0_ptr += (2 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        XORI_B4_128_SB(src3, src4, src5, src6);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
        ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);
        HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

        src2 = src6;
        src10_r = src54_r;
        src21_r = src65_r;
        src2110 = src6554;
    }
}
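/* Width 16: right (ILVR) and left (ILVL) halves of each row pair are
 * filtered separately and recombined with PCKEV before the full-width
 * stores. */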
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);

        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
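/* Width 24: a 16-wide column handled as right/left halves plus an 8-wide
 * column at offset +16 that only needs the right-interleave path. */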
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);
        HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
        dst += (2 * dst_stride);

        LD_SB2(src0_ptr, src_stride, src5, src2);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        LD_SB2(src0_ptr + 16, src_stride, src11, src8);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src11, src8);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);

        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);
        HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
        dst += (2 * dst_stride);
    }
}
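/* Width 32: two independent 16-wide columns per row pair; the second column
 * keeps its own row history (src6..src10, dst2_*, dst3_*). */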
    uint32_t loop_cnt;
    uint8_t *dst_tmp = dst + 16;
    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
        LD_SH2((src1_ptr + 24), src2_stride, in6, in7);
        src1_ptr += (2 * src2_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        src10_r = src32_r;
        src21_r = src43_r;
        src10_l = src32_l;
        src21_l = src43_l;
        src2 = src4;

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);

        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);

        dst2_r = const_vec;
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
        dst2_l = const_vec;
        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
        dst3_r = const_vec;
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
        dst3_l = const_vec;
        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);

        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
                          dst2_r, dst3_r, dst2_l, dst3_l, 7,
                          dst2_r, dst3_r, dst2_l, dst3_l);

        PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
        ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);

        src76_r = src98_r;
        src87_r = src109_r;
        src76_l = src98_l;
        src87_l = src109_l;
        src8 = src10;
    }
}
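/* Horizontal+vertical (hv) 4-tap bi-prediction, 4x2: VSHF_B gathers the
 * horizontal taps, the vertical pass runs on the 16-bit intermediates, and
 * the first prediction (offset by const_vec) is added before the final
 * round/clip to 8 bit. */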
static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr, int32_t src_stride,
                                  int16_t *src1_ptr, int32_t src2_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y)
{
    uint64_t tp0, tp1;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp;
    v8i16 in0 = { 0 };
    v4i32 dst0, dst1;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    LD2(src1_ptr, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in0);
    in0 = __msa_adds_s_h(in0, const_vec);

    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);

    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
    tmp = __msa_adds_s_h(tmp, in0);
    tmp = __msa_srari_h(tmp, 7);
    tmp = CLIP_SH_0_255(tmp);
    out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
    ST_W2(out, 0, 1, dst, dst_stride);
}
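/* hv 4x4: rows three apart are packed into one vector (src0 with src3,
 * src1 with src4, ...) so one VSHF/dot product filters two rows at once. */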
static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr, int32_t src_stride,
                                  int16_t *src1_ptr, int32_t src2_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y)
{
    uint64_t tp0, tp1;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 tmp0, tmp1;
    v8i16 in0 = { 0 }, in1 = { 0 };
    v8i16 dst30, dst41, dst52, dst63;
    v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
    v4i32 dst0, dst1, dst2, dst3;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD2(src1_ptr, src2_stride, tp0, tp1);
    src1_ptr += 2 * src2_stride;
    INSERT_D2_SH(tp0, tp1, in0);
    LD2(src1_ptr, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in1);

    ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);

    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);

    SRA_4V(dst0, dst1, dst2, dst3, 6);
    PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
    ADDS_SH2_SH(tmp0, in0, tmp1, in1, tmp0, tmp1);
    SRARI_H2_SH(tmp0, tmp1, 7);
    CLIP_SH2_0_255(tmp0, tmp1);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
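/* hv 4-wide for any multiple of 8 rows; the 16-bit prediction is gathered
 * two rows at a time with LD2/INSERT_D2_SH. */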
static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst, int32_t dst_stride,
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,
                                          int32_t height)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst98_r, dst109_r;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src0_ptr += (8 * src_stride);

        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);

        dst32_r = __msa_ilvr_h(dst73, dst22);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in0);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in1);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in2);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in3);

        ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
                    const_vec, in0, in1, in2, in3);

        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
                    dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
        ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1,
                    tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
static void hevc_hv_bi_4t_4w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                 int16_t *src1_ptr, int32_t src2_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    if (2 == height) {
        hevc_hv_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y);
    } else if (0 == (height % 8)) {
        hevc_hv_bi_4t_4multx8mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride,
                                      filter_x, filter_y, height);
    }
}
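/* hv width 6: columns 0..3 follow the 8-wide path and are stored with ST_W8;
 * columns 4..5 are gathered from the 16-bit prediction with
 * LW4/INSERT_W4_SH and stored as halfwords with ST_H8. */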
static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                 int16_t *src1_ptr, int32_t src2_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    uint32_t tpw0, tpw1, tpw2, tpw3;
    uint64_t tp0, tp1;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
    v8i16 dsth10, tmp4, tmp5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
    v8i16 dst54_l, dst76_l, dst98_l, dst65_l, dst87_l, dst109_l;
    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 in4 = { 0 }, in5 = { 0 };

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    LD_SB8(src0_ptr, src_stride,
           src3, src4, src5, src6, src7, src8, src9, src10);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);

    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);

    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);

    LD2(src1_ptr, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in0);
    LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in1);
    LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in2);
    LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
    INSERT_D2_SH(tp0, tp1, in3);

    ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
                in0, in1, in2, in3);
    ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1, tmp2,
                tmp3);
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);

    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
    src1_ptr += (4 * src2_stride);
    INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4);
    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
    INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5);
    ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);
    ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5);
    SRARI_H2_SH(tmp4, tmp5, 7);
    CLIP_SH2_0_255(tmp4, tmp5);
    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
}
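/* hv 8x2: both output rows are computed as right/left 4-element halves
 * (dst*_r/dst*_l) and packed back together before the 8-byte stores. */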
static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr, int32_t src_stride,
                                  int16_t *src1_ptr, int32_t src2_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 tmp0, tmp1;
    v8i16 in0, in1;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    LD_SH2(src1_ptr, src2_stride, in0, in1);
    ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);

    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
    ADDS_SH2_SH(tmp0, in0, tmp1, in1, tmp0, tmp1);
    SRARI_H2_SH(tmp0, tmp1, 7);
    CLIP_SH2_0_255(tmp0, tmp1);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_D2(out, 0, 1, dst, dst_stride);
}
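/* hv 8-wide, exactly 4 rows, repeated width8mult times across the block;
 * also reused by the 16-wide dispatcher with width8mult = 2. */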
static void hevc_hv_bi_4t_8multx4_msa(uint8_t *src0_ptr, int32_t src_stride,
                                      int16_t *src1_ptr, int32_t src2_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t width8mult)
{
    uint32_t cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
    v8i16 in0, in1, in2, in3;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (cnt = width8mult; cnt--;) {
        LD_SB7(src0_ptr, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);

        ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
                    const_vec, in0, in1, in2, in3);

        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

        src0_ptr += 8;
        dst += 8;
        src1_ptr += 8;
    }
}
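/* hv 8x6: nine input rows feed six output rows; all horizontal filtering
 * is done up front (vec0..vec17) before the vertical passes. */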
static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr, int32_t src_stride,
                                  int16_t *src1_ptr, int32_t src2_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y)
{
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
    src0_ptr += (5 * src_stride);
    LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
    ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
                in0, in1, in2, in3);
    ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);

    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
                tmp0, tmp1, tmp2, tmp3);
    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
    ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                tmp0, tmp1, tmp2, tmp3);
    ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5);
    SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
    SRARI_H2_SH(tmp4, tmp5, 7);
    CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
    CLIP_SH2_0_255(tmp4, tmp5);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
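/* Generic hv tile loop: width/8 column strips, four rows per iteration;
 * the fallback for 8/16/24/32-wide blocks. */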
static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst, int32_t dst_stride,
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,
                                          int32_t height, int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src0_ptr_tmp;
    int16_t *src1_ptr_tmp;
    uint8_t *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 in0, in1, in2, in3;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (cnt = width >> 3; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
        src0_ptr_tmp += (3 * src_stride);
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
            src0_ptr_tmp += (4 * src_stride);
            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
            src1_ptr_tmp += (4 * src2_stride);
            XORI_B4_128_SB(src3, src4, src5, src6);

            ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
                        const_vec, in0, in1, in2, in3);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r,
                        dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
            ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                        tmp0, tmp1, tmp2, tmp3);
            SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
            CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
            PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            dst10_r = dst54_r;
            dst10_l = dst54_l;
            dst21_r = dst65_r;
            dst21_l = dst65_l;
            dst2 = dst6;
        }

        src0_ptr += 8;
        dst += 8;
        src1_ptr += 8;
    }
}
static void hevc_hv_bi_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                 int16_t *src1_ptr, int32_t src2_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    if (2 == height) {
        hevc_hv_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y, 1);
    } else if (6 == height) {
        hevc_hv_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y);
    } else {
        hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter_x, filter_y,
                                      height, 8);
    }
}
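/* hv width 12: the left 8 columns run through the 8-wide loop four rows at
 * a time, then the remaining 4 columns reuse the paired-row scheme of the
 * 4-wide path (mask2/mask3, dst22 carry). */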
static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                  int16_t *src1_ptr, int32_t src2_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1;
    uint8_t *src0_ptr_tmp, *dst_tmp;
    int16_t *src1_ptr_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, const_vec;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    src0_ptr_tmp = src0_ptr;
    dst_tmp = dst;
    src1_ptr_tmp = src1_ptr;

    LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
    src0_ptr_tmp += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
        src0_ptr_tmp += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);

        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
        src1_ptr_tmp += (4 * src2_stride);
        ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
                    const_vec, in0, in1, in2, in3);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);

        dst10_r = dst54_r;
        dst10_l = dst54_l;
        dst21_r = dst65_r;
        dst21_l = dst65_l;
        dsth2 = dsth6;
    }

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);

    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src0_ptr += (8 * src_stride);

        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);

        dst32_r = __msa_ilvr_h(dst73, dst22);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in0);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in1);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in2);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        INSERT_D2_SH(tp0, tp1, in3);

        ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
                    const_vec, in0, in1, in2, in3);

        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
        PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
static void hevc_hv_bi_4t_16w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                  int16_t *src1_ptr, int32_t src2_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    if (4 == height) {
        hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y, 2);
    } else {
        hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
                                      src2_stride, dst, dst_stride, filter_x,
                                      filter_y, height, 16);
    }
}
static void hevc_hv_bi_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                  int16_t *src1_ptr, int32_t src2_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 24);
}
static void hevc_hv_bi_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                  int16_t *src1_ptr, int32_t src2_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 32);
}
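/* Public bi-prediction wrappers. In scalar terms each kernel above computes,
 * per output pixel (a sketch, assuming 8-bit depth):
 *
 *     dst[x] = clip_uint8((filt(src, x) + src_16bit[x] + 64) >> 7);
 *
 * where filt() is the sub-pel (or copy) filter at 14-bit precision and
 * src_16bit holds the first, unidirectional prediction. */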
#define BI_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                   ptrdiff_t dst_stride,  \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int16_t *src_16bit,    \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,  \
                                dst, dst_stride, height);                 \
}
#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                        \
void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,        \
                                                      ptrdiff_t dst_stride, \
                                                      uint8_t *src,        \
                                                      ptrdiff_t src_stride, \
                                                      int16_t *src_16bit,  \
                                                      int height,          \
                                                      intptr_t mx,         \
                                                      intptr_t my,         \
                                                      int width)           \
{                                                                          \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];          \
                                                                           \
    hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,   \
                                             MAX_PB_SIZE, dst, dst_stride, \
                                             filter, height);              \
}
BI_MC(qpel, h, 4, 8, hz, mx);
BI_MC(qpel, h, 8, 8, hz, mx);
BI_MC(qpel, h, 12, 8, hz, mx);
BI_MC(qpel, h, 16, 8, hz, mx);
BI_MC(qpel, h, 24, 8, hz, mx);
BI_MC(qpel, h, 32, 8, hz, mx);
BI_MC(qpel, h, 48, 8, hz, mx);
BI_MC(qpel, h, 64, 8, hz, mx);
BI_MC(qpel, v, 4, 8, vt, my);
BI_MC(qpel, v, 8, 8, vt, my);
BI_MC(qpel, v, 12, 8, vt, my);
BI_MC(qpel, v, 16, 8, vt, my);
BI_MC(qpel, v, 24, 8, vt, my);
BI_MC(qpel, v, 32, 8, vt, my);
BI_MC(qpel, v, 48, 8, vt, my);
BI_MC(qpel, v, 64, 8, vt, my);
BI_MC(epel, h, 4, 4, hz, mx);
BI_MC(epel, h, 8, 4, hz, mx);
BI_MC(epel, h, 6, 4, hz, mx);
BI_MC(epel, h, 12, 4, hz, mx);
BI_MC(epel, h, 16, 4, hz, mx);
BI_MC(epel, h, 24, 4, hz, mx);
BI_MC(epel, h, 32, 4, hz, mx);
BI_MC(epel, v, 4, 4, vt, my);
BI_MC(epel, v, 8, 4, vt, my);
BI_MC(epel, v, 6, 4, vt, my);
BI_MC(epel, v, 12, 4, vt, my);
BI_MC(epel, v, 16, 4, vt, my);
BI_MC(epel, v, 24, 4, vt, my);
BI_MC(epel, v, 32, 4, vt, my);
#define BI_MC_HV(PEL, WIDTH, TAP)                                         \
void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
                                                   ptrdiff_t dst_stride,  \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int16_t *src_16bit,    \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \
                                                                          \
    hevc_hv_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,        \
                                       MAX_PB_SIZE, dst, dst_stride,      \
                                       filter_x, filter_y, height);       \
}