30 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
33 LD2(src, src_stride, out0, out1);
37 }
else if (6 == height) {
38 LD4(src, src_stride, out0, out1, out2, out3);
39 src += (4 * src_stride);
40 SD4(out0, out1, out2, out3, dst, dst_stride);
41 dst += (4 * dst_stride);
42 LD2(src, src_stride, out0, out1);
46 }
else if (0 == (height % 8)) {
47 for (cnt = (height >> 3); cnt--;) {
48 LD4(src, src_stride, out0, out1, out2, out3);
49 src += (4 * src_stride);
50 LD4(src, src_stride, out4, out5, out6, out7);
51 src += (4 * src_stride);
52 SD4(out0, out1, out2, out3, dst, dst_stride);
53 dst += (4 * dst_stride);
54 SD4(out4, out5, out6, out7, dst, dst_stride);
55 dst += (4 * dst_stride);
57 }
else if (0 == (height % 4)) {
58 for (cnt = (height >> 2); cnt--;) {
59 LD4(src, src_stride, out0, out1, out2, out3);
60 src += (4 * src_stride);
61 SD4(out0, out1, out2, out3, dst, dst_stride);
62 dst += (4 * dst_stride);
71 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
73 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
74 src += (8 * src_stride);
75 ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
76 dst += (8 * dst_stride);
77 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
78 ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
86 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
89 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
90 src += (8 * src_stride);
91 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
92 dst += (8 * dst_stride);
93 LD_UB4(src, src_stride, src0, src1, src2, src3);
94 src += (4 * src_stride);
95 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
96 dst += (4 * dst_stride);
97 }
else if (0 == (height % 8)) {
98 for (cnt = (height >> 3); cnt--;) {
99 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
101 src += (8 * src_stride);
102 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
104 dst += (8 * dst_stride);
106 }
else if (0 == (height % 4)) {
107 for (cnt = (height >> 2); cnt--;) {
108 LD_UB4(src, src_stride, src0, src1, src2, src3);
109 src += (4 * src_stride);
111 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
112 dst += (4 * dst_stride);
122 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
123 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
125 for (cnt = 4; cnt--;) {
126 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
127 LD4(src + 16, src_stride, out0, out1, out2, out3);
128 src += (4 * src_stride);
129 LD4(src + 16, src_stride, out4, out5, out6, out7);
130 src += (4 * src_stride);
132 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
133 SD4(out0, out1, out2, out3, dst + 16, dst_stride);
134 dst += (4 * dst_stride);
135 SD4(out4, out5, out6, out7, dst + 16, dst_stride);
136 dst += (4 * dst_stride);
145 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
147 for (cnt = (height >> 2); cnt--;) {
148 LD_UB4(src, src_stride, src0, src1, src2, src3);
149 LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
150 src += (4 * src_stride);
151 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
152 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
153 dst += (4 * dst_stride);
162 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
165 for (cnt = (height >> 2); cnt--;) {
166 LD_UB4(src, src_stride, src0, src1, src2, src3);
167 LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
168 LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
169 src += (4 * src_stride);
171 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
172 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
173 ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
174 dst += (4 * dst_stride);
183 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
184 v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
186 for (cnt = (height >> 2); cnt--;) {
187 LD_UB4(src, 16, src0, src1, src2, src3);
189 LD_UB4(src, 16, src4, src5, src6, src7);
191 LD_UB4(src, 16, src8, src9, src10, src11);
193 LD_UB4(src, 16, src12, src13, src14, src15);
196 ST_UB4(src0, src1, src2, src3, dst, 16);
198 ST_UB4(src4, src5, src6, src7, dst, 16);
200 ST_UB4(src8, src9, src10, src11, dst, 16);
202 ST_UB4(src12, src13, src14, src15, dst, 16);
209 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
211 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
213 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
/*
 * FILT_8TAP_DPADD_S_H: combine four input vectors with four filter vectors.
 *
 * As visible here: tmp0 receives the __msa_dotp_s_h product of vec0/filt0 and
 * then accumulates vec1/filt1 through __msa_dpadd_s_h; tmp1 is built the same
 * way from vec2/filt2 and vec3/filt3; the two partial sums are finally merged
 * into tmp0 with __msa_adds_s_h. Every operand is cast to v16i8 before use.
 *
 * NOTE(review): the declarations of tmp0/tmp1 and the expression the macro
 * ultimately yields are not part of this excerpt - presumably tmp0 is the
 * result; confirm against the full file.
 */
216 #define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \
217 filt0, filt1, filt2, filt3) \
221 tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \
222 tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \
223 tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2); \
224 tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3); \
225 tmp0 = __msa_adds_s_h(tmp0, tmp1); \
/*
 * HORIZ_8TAP_4WID_4VECS_FILT: filter four source vectors with four masks and
 * four filter vectors, producing two results.
 *
 * Visible steps:
 *   - For each mask k, VSHF_B2_SB is applied to the pairs (src0, src1) and
 *     (src2, src3), i.e. two source vectors feed each shuffled vector.
 *   - mask0/filt0 start res0_m, res1_m via DOTP_SB2_SH and mask1/filt1 are
 *     accumulated on top via DPADD_SB2_SH.
 *   - mask2/filt2 start res2_m, res3_m and mask3/filt3 are accumulated on top.
 *   - ADDS_SH2_SH adds res0_m + res2_m and res1_m + res3_m into out0, out1.
 *
 * NOTE(review): out0/out1 do not appear in the visible part of the parameter
 * list (a line of the header is missing from this excerpt) - presumably they
 * are the trailing macro parameters; confirm against the full file.
 * The helper macros (VSHF_B2_SB, DOTP_SB2_SH, DPADD_SB2_SH, ADDS_SH2_SH) are
 * defined elsewhere.
 */
230 #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
231 mask0, mask1, mask2, mask3, \
232 filt0, filt1, filt2, filt3, \
235 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
236 v8i16 res0_m, res1_m, res2_m, res3_m; \
238 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
239 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \
240 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
241 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \
242 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
243 DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \
244 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
245 DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \
246 ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \
/*
 * HORIZ_8TAP_8WID_4VECS_FILT: filter four source vectors with four masks and
 * four filter vectors, producing four results (out0..out3), one per source.
 *
 * Visible steps:
 *   - Unlike the 4WID variant, each source vector is shuffled with itself
 *     (VSHF_B2_SB(srcN, srcN, ...)), so every output depends on one source.
 *   - mask0/filt0 start res0_m..res3_m and mask2/filt2 start res4_m..res7_m,
 *     both through DOTP_SB4_SH (vec0_m..vec3_m are reused for the two passes).
 *   - mask1/filt1 are accumulated into res0_m..res3_m and mask3/filt3 into
 *     res4_m..res7_m through DPADD_SB4_SH (vec4_m..vec7_m are reused).
 *   - ADDS_SH4_SH adds resN_m + res(N+4)_m into outN for N = 0..3.
 *
 * NOTE(review): the enclosing braces/terminator of the macro body are not part
 * of this excerpt; the helper macros are defined elsewhere.
 */
249 #define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
250 mask0, mask1, mask2, mask3, \
251 filt0, filt1, filt2, filt3, \
252 out0, out1, out2, out3) \
254 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
255 v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
257 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
258 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
259 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
260 res0_m, res1_m, res2_m, res3_m); \
261 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
262 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
263 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
264 res4_m, res5_m, res6_m, res7_m); \
265 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
266 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
267 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
268 res0_m, res1_m, res2_m, res3_m); \
269 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
270 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
271 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
272 res4_m, res5_m, res6_m, res7_m); \
273 ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
274 res7_m, out0, out1, out2, out3); \
/*
 * FILT_4TAP_DPADD_S_H: combine two input vectors with two filter vectors.
 *
 * As visible here: tmp0 receives the __msa_dotp_s_h product of vec0/filt0 and
 * then accumulates vec1/filt1 through __msa_dpadd_s_h. All operands are cast
 * to v16i8 before use.
 *
 * NOTE(review): the declaration of tmp0 and the expression the macro finally
 * yields are not part of this excerpt - presumably tmp0 is the result;
 * confirm against the full file.
 */
277 #define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \
281 tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \
282 tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \
/*
 * HORIZ_4TAP_4WID_4VECS_FILT: filter four source vectors with two masks and
 * two filter vectors, producing two results.
 *
 * Visible steps:
 *   - VSHF_B2_SB with mask0 on the pairs (src0, src1) and (src2, src3), then
 *     DOTP_SB2_SH with filt0 writes straight into out0, out1.
 *   - VSHF_B2_SB with mask1 on the same pairs, then DPADD_SB2_SH with filt1
 *     accumulates into out0, out1.
 * No separate result temporaries or final add are used, unlike the 8-tap
 * variant of this macro.
 *
 * NOTE(review): out0/out1 do not appear in the visible part of the parameter
 * list (a line of the header is missing from this excerpt) - presumably they
 * are the trailing macro parameters; confirm against the full file.
 */
287 #define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
288 mask0, mask1, filt0, filt1, \
291 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
293 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
294 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \
295 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
296 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \
/*
 * HORIZ_4TAP_8WID_4VECS_FILT: filter four source vectors with two masks and
 * two filter vectors, producing four results (out0..out3), one per source.
 *
 * Visible steps:
 *   - Each source vector is shuffled with itself using mask0 (VSHF_B2_SB),
 *     and DOTP_SB4_SH with filt0 writes straight into out0..out3.
 *   - The same sources are shuffled again with mask1 (reusing
 *     vec0_m..vec3_m), and DPADD_SB4_SH with filt1 accumulates into
 *     out0..out3.
 *
 * NOTE(review): the enclosing braces/terminator of the macro body are not part
 * of this excerpt; the helper macros are defined elsewhere.
 */
299 #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
300 mask0, mask1, filt0, filt1, \
301 out0, out1, out2, out3) \
303 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
305 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
306 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
307 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
308 out0, out1, out2, out3); \
309 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \
310 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \
311 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
312 out0, out1, out2, out3); \
319 v16u8 mask0, mask1, mask2, mask3,
out;
320 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
321 v8i16
filt, out0, out1;
327 filt =
LD_SH(filter);
328 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
334 LD_SB4(src, src_stride, src0, src1, src2, src3);
337 mask3, filt0, filt1, filt2, filt3, out0, out1);
341 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
348 v16i8 filt0, filt1, filt2, filt3;
350 v16u8 mask0, mask1, mask2, mask3,
out;
351 v8i16
filt, out0, out1, out2, out3;
357 filt =
LD_SH(filter);
358 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
364 LD_SB4(src, src_stride, src0, src1, src2, src3);
366 src += (4 * src_stride);
368 mask3, filt0, filt1, filt2, filt3, out0, out1);
369 LD_SB4(src, src_stride, src0, src1, src2, src3);
372 mask3, filt0, filt1, filt2, filt3, out2, out3);
376 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
377 dst += (4 * dst_stride);
379 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
386 v16u8 mask0, mask1, mask2, mask3,
out;
387 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
388 v8i16
filt, out0, out1, out2, out3;
394 filt =
LD_SH(filter);
395 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
401 LD_SB4(src, src_stride, src0, src1, src2, src3);
403 src += (4 * src_stride);
405 mask3, filt0, filt1, filt2, filt3, out0, out1);
406 LD_SB4(src, src_stride, src0, src1, src2, src3);
408 src += (4 * src_stride);
410 mask3, filt0, filt1, filt2, filt3, out2, out3);
414 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
415 dst += (4 * dst_stride);
417 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
418 dst += (4 * dst_stride);
420 LD_SB4(src, src_stride, src0, src1, src2, src3);
422 src += (4 * src_stride);
424 mask3, filt0, filt1, filt2, filt3, out0, out1);
425 LD_SB4(src, src_stride, src0, src1, src2, src3);
427 src += (4 * src_stride);
429 mask3, filt0, filt1, filt2, filt3, out2, out3);
434 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
435 dst += (4 * dst_stride);
437 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
446 }
else if (8 == height) {
448 }
else if (16 == height) {
457 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
458 v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
459 v8i16
filt, out0, out1, out2, out3;
465 filt =
LD_SH(filter);
466 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
472 LD_SB4(src, src_stride, src0, src1, src2, src3);
475 mask3, filt0, filt1, filt2, filt3, out0, out1,
481 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
489 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
490 v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
491 v8i16
filt, out0, out1, out2, out3;
497 filt =
LD_SH(filter);
498 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
504 for (loop_cnt = (height >> 2); loop_cnt--;) {
505 LD_SB4(src, src_stride, src0, src1, src2, src3);
507 src += (4 * src_stride);
509 mask3, filt0, filt1, filt2, filt3, out0,
515 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
516 dst += (4 * dst_stride);
538 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
539 v8i16
filt, out0, out1, out2, out3;
540 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00, tmp0, tmp1;
552 filt =
LD_SH(filter);
553 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
562 for (loop_cnt = (height >> 2); loop_cnt--;) {
564 LD_SB4(src1_ptr, src_stride, src0, src1, src2, src3);
566 src1_ptr += (4 * src_stride);
568 mask3, filt0, filt1, filt2, filt3, out0,
574 ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
575 dst1 += (4 * dst_stride);
578 LD_SB4(src, src_stride, src0, src1, src2, src3);
580 src += (4 * src_stride);
582 mask6, filt0, filt1, filt2, filt3, out0,
587 ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst, dst_stride);
588 dst += (4 * dst_stride);
597 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
598 v16u8 mask0, mask1, mask2, mask3,
out;
599 v8i16
filt, out0, out1, out2, out3;
605 filt =
LD_SH(filter);
606 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
612 for (loop_cnt = (height >> 1); loop_cnt--;) {
613 LD_SB2(src, src_stride, src0, src2);
614 LD_SB2(src + 8, src_stride, src1, src3);
616 src += (2 * src_stride);
618 mask3, filt0, filt1, filt2, filt3, out0,
636 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
637 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7,
out;
638 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
640 v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10;
647 filt =
LD_SH(filter);
648 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
658 for (loop_cnt = (height >> 1); loop_cnt--;) {
659 LD_SB2(src, src_stride, src0, src2);
660 LD_SB2(src + 16, src_stride, src1, src3);
662 src += (2 * src_stride);
663 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
664 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
665 VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
666 DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
669 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
670 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
671 VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
672 DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2, out4,
675 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
676 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
677 VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
678 DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
679 out0, out8, out2, out9);
681 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
682 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
683 VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
684 DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
685 out4, out10, out6, out11);
687 ADDS_SH4_SH(out0, out4, out8, out10, out2, out6, out9, out11, out0,
695 ST8x2_UB(out, dst + 16, dst_stride);
710 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
711 v16u8 mask0, mask1, mask2, mask3,
out;
712 v8i16
filt, out0, out1, out2, out3;
718 filt =
LD_SH(filter);
719 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
725 for (loop_cnt = (height >> 1); loop_cnt--;) {
727 src2 =
LD_SB(src + 16);
728 src3 =
LD_SB(src + 24);
729 src1 = __msa_sldi_b(src2, src0, 8);
733 mask3, filt0, filt1, filt2, filt3, out0,
739 src2 =
LD_SB(src + 16);
740 src3 =
LD_SB(src + 24);
741 src1 = __msa_sldi_b(src2, src0, 8);
747 ST_UB(out, dst + 16);
752 mask3, filt0, filt1, filt2, filt3, out0,
759 ST_UB(out, dst + 16);
769 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
770 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7,
out;
771 v8i16
filt, out0, out1, out2, out3, out4, out5, out6;
777 filt =
LD_SH(filter);
778 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
788 for (loop_cnt = height; loop_cnt--;) {
789 LD_SB3(src, 16, src0, src2, src3);
790 src1 = __msa_sldi_b(src2, src0, 8);
793 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
795 DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
796 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
799 out2 = __msa_dpadd_s_h(out2, vec2, filt1);
800 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
802 DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
803 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
806 out5 = __msa_dpadd_s_h(out5, vec2, filt3);
808 out2 = __msa_adds_s_h(out2, out5);
810 out6 = __msa_srari_h(out2, 6);
815 src1 =
LD_SB(src + 40);
817 src1 = (v16i8) __msa_xori_b((v16u8)
src1, 128);
819 VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask4, mask0, mask0,
821 DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
822 VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask5, mask1, mask1,
825 out2 = __msa_dpadd_s_h(out2, vec2, filt1);
826 VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask6, mask2, mask2,
828 DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
829 VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask7, mask3, mask3,
832 out5 = __msa_dpadd_s_h(out5, vec2, filt3);
834 out5 = __msa_adds_s_h(out2, out5);
836 out5 = __msa_srari_h(out5, 6);
839 ST_UB(out, dst + 16);
841 ST_UB(out, dst + 32);
851 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2, filt3;
852 v16u8 mask0, mask1, mask2, mask3,
out;
853 v8i16
filt, out0, out1, out2, out3;
859 filt =
LD_SH(filter);
860 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
866 for (loop_cnt = height; loop_cnt--;) {
868 src2 =
LD_SB(src + 16);
869 src3 =
LD_SB(src + 24);
870 src1 = __msa_sldi_b(src2, src0, 8);
874 mask2, mask3, filt0, filt1, filt2, filt3,
875 out0, out1, out2, out3);
881 ST_UB(out, dst + 16);
883 src0 =
LD_SB(src + 32);
884 src2 =
LD_SB(src + 48);
885 src3 =
LD_SB(src + 56);
886 src1 = __msa_sldi_b(src2, src0, 8);
891 mask2, mask3, filt0, filt1, filt2, filt3,
892 out0, out1, out2, out3);
896 ST_UB(out, dst + 32);
898 ST_UB(out, dst + 48);
908 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
909 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
910 v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
911 v16i8 src10998, filt0, filt1, filt2, filt3;
913 v8i16
filt, out10, out32;
915 src -= (3 * src_stride);
917 filt =
LD_SH(filter);
918 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
920 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
921 src += (7 * src_stride);
923 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
925 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
926 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
930 for (loop_cnt = (height >> 2); loop_cnt--;) {
931 LD_SB4(src, src_stride, src7, src8, src9, src10);
932 src += (4 * src_stride);
934 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
935 src87_r, src98_r, src109_r);
936 ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
939 filt1, filt2, filt3);
941 filt1, filt2, filt3);
945 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
946 dst += (4 * dst_stride);
960 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
961 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
962 v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
964 v8i16
filt, out0_r, out1_r, out2_r, out3_r;
966 src -= (3 * src_stride);
968 filt =
LD_SH(filter);
969 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
971 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
973 src += (7 * src_stride);
974 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
976 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
978 for (loop_cnt = (height >> 2); loop_cnt--;) {
979 LD_SB4(src, src_stride, src7, src8, src9, src10);
981 src += (4 * src_stride);
983 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
984 src87_r, src98_r, src109_r);
986 filt1, filt2, filt3);
988 filt1, filt2, filt3);
990 filt1, filt2, filt3);
992 filt1, filt2, filt3);
994 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
997 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
998 dst += (4 * dst_stride);
1015 uint32_t out2, out3;
1016 uint64_t out0, out1;
1017 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
1018 v16i8 res2, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1019 v8i16 vec01, vec23, vec45, vec67, tmp0, tmp1, tmp2;
1020 v8i16
filt, filt0, filt1, filt2, filt3;
1021 v4i32
mask = { 2, 6, 2, 6 };
1023 src -= (3 * src_stride);
1026 filt =
LD_SH(filter);
1027 SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1029 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1030 src += (7 * src_stride);
1035 VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);
1036 VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
1037 VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);
1039 for (loop_cnt = (height >> 1); loop_cnt--;) {
1040 LD_SB2(src, src_stride, src7, src8);
1042 src += (2 * src_stride);
1044 ILVR_B4_SH(src1, src0, src3, src2, src5, src4, src7, src6,
1045 vec01, vec23, vec45, vec67);
1048 ILVR_B4_SH(src2, src1, src4, src3, src6, src5, src8, src7, vec01, vec23,
1054 VSHF_W2_SB(src6, src7, src7, src8, mask, mask, vec6, vec7);
1055 ILVR_B4_SH(vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec01, vec23,
1060 tmp2 = __msa_srari_h(tmp2, 6);
1062 PCKEV_B3_SB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, res0, res1, res2);
1065 out0 = __msa_copy_u_d((v2i64) res0, 0);
1066 out1 = __msa_copy_u_d((v2i64) res1, 0);
1067 out2 = __msa_copy_u_w((v4i32) res2, 0);
1068 out3 = __msa_copy_u_w((v4i32) res2, 1);
1070 SW(out2, (dst + 8));
1073 SW(out3, (dst + 8));
1097 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1098 v16i8 filt0, filt1, filt2, filt3;
1099 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1100 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1101 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1102 v16u8 tmp0, tmp1, tmp2, tmp3;
1103 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1105 src -= (3 * src_stride);
1107 filt =
LD_SH(filter);
1108 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1110 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1112 src += (7 * src_stride);
1113 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1115 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1116 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
1118 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1120 for (loop_cnt = (height >> 2); loop_cnt--;) {
1121 LD_SB4(src, src_stride, src7, src8, src9, src10);
1123 src += (4 * src_stride);
1125 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1126 src87_r, src98_r, src109_r);
1127 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1128 src87_l, src98_l, src109_l);
1130 filt1, filt2, filt3);
1132 filt1, filt2, filt3);
1134 filt1, filt2, filt3);
1136 filt1, filt2, filt3);
1138 filt1, filt2, filt3);
1140 filt1, filt2, filt3);
1142 filt1, filt2, filt3);
1144 filt1, filt2, filt3);
1147 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1148 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1149 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1150 out3_r, tmp0, tmp1, tmp2, tmp3);
1152 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
1153 dst += (4 * dst_stride);
1178 uint32_t loop_cnt, cnt;
1179 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1180 v16i8 filt0, filt1, filt2, filt3;
1181 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1182 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1183 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1184 v16u8 tmp0, tmp1, tmp2, tmp3;
1185 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1187 src -= (3 * src_stride);
1189 filt =
LD_SH(filter);
1190 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1192 for (cnt = (width >> 4); cnt--;) {
1196 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1198 src_tmp += (7 * src_stride);
1199 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
1200 src32_r, src54_r, src21_r);
1201 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1202 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
1203 src32_l, src54_l, src21_l);
1204 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1206 for (loop_cnt = (height >> 2); loop_cnt--;) {
1207 LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1209 src_tmp += (4 * src_stride);
1210 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1211 src87_r, src98_r, src109_r);
1212 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1213 src87_l, src98_l, src109_l);
1215 filt0, filt1, filt2, filt3);
1217 filt0, filt1, filt2, filt3);
1219 filt0, filt1, filt2, filt3);
1221 filt0, filt1, filt2, filt3);
1223 filt0, filt1, filt2, filt3);
1225 filt0, filt1, filt2, filt3);
1227 filt0, filt1, filt2, filt3);
1229 filt0, filt1, filt2, filt3);
1232 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1233 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1234 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1235 out3_r, tmp0, tmp1, tmp2, tmp3);
1237 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
1238 dst_tmp += (4 * dst_stride);
1299 const int8_t *filter_x,
1300 const int8_t *filter_y,
1304 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1305 v8i16 filt0, filt1, filt2, filt3;
1306 v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1307 v16i8 mask1, mask2, mask3;
1308 v8i16 filter_vec, const_vec;
1309 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1310 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1311 v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
1312 v4i32 dst0_r, dst1_r;
1313 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1314 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1315 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1316 v8i16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
1318 src -= ((3 * src_stride) + 3);
1319 filter_vec =
LD_SH(filter_x);
1320 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1322 filter_vec =
LD_SH(filter_y);
1323 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1324 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1326 SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1332 const_vec = __msa_ldi_h(128);
1335 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1336 src += (7 * src_stride);
1339 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1340 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1341 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1342 vec8, vec9, vec10, vec11);
1343 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1344 vec12, vec13, vec14, vec15);
1347 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1348 dst30, dst30, dst30, dst30);
1350 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1351 dst41, dst41, dst41, dst41);
1353 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1354 dst52, dst52, dst52, dst52);
1356 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1357 dst63, dst63, dst63, dst63);
1359 ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
1360 dst10_r, dst21_r, dst32_r);
1361 dst43_r = __msa_ilvl_h(dst41, dst30);
1362 dst54_r = __msa_ilvl_h(dst52, dst41);
1363 dst65_r = __msa_ilvl_h(dst63, dst52);
1364 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1366 for (loop_cnt = height >> 1; loop_cnt--;) {
1367 LD_SB2(src, src_stride, src7, src8);
1368 src += 2 * src_stride;
1371 VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
1372 vec0, vec1, vec2, vec3);
1374 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1375 dst87, dst87, dst87, dst87);
1377 dst76_r = __msa_ilvr_h(dst87, dst66);
1379 filt_h0, filt_h1, filt_h2, filt_h3);
1380 dst87_r = __msa_vshf_h(mask4, dst87, dst87);
1382 filt_h0, filt_h1, filt_h2, filt_h3);
1392 dst += (2 * dst_stride);
1400 dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
1408 const int8_t *filter_x,
1409 const int8_t *filter_y,
1412 uint32_t loop_cnt, cnt;
1415 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1416 v8i16 filt0, filt1, filt2, filt3;
1417 v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1418 v16i8 mask1, mask2, mask3;
1419 v8i16 filter_vec, const_vec;
1420 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1421 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1422 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1423 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1424 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1425 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1426 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1427 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1428 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1430 src -= ((3 * src_stride) + 3);
1431 const_vec = __msa_ldi_h(128);
1434 filter_vec =
LD_SH(filter_x);
1435 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1437 filter_vec =
LD_SH(filter_y);
1438 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1439 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1441 SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1447 for (cnt = width >> 3; cnt--;) {
1451 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1452 src_tmp += (7 * src_stride);
1456 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1457 vec0, vec1, vec2, vec3);
1458 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1459 vec4, vec5, vec6, vec7);
1460 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1461 vec8, vec9, vec10, vec11);
1462 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1463 vec12, vec13, vec14, vec15);
1465 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1466 dst0, dst0, dst0, dst0);
1468 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1469 dst1, dst1, dst1, dst1);
1471 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1472 dst2, dst2, dst2, dst2);
1474 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1475 dst3, dst3, dst3, dst3);
1477 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1478 vec0, vec1, vec2, vec3);
1479 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1480 vec4, vec5, vec6, vec7);
1481 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1482 vec8, vec9, vec10, vec11);
1484 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1485 dst4, dst4, dst4, dst4);
1487 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1488 dst5, dst5, dst5, dst5);
1490 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1491 dst6, dst6, dst6, dst6);
1493 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1494 dst10_r, dst32_r, dst54_r, dst21_r);
1495 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1496 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1497 dst10_l, dst32_l, dst54_l, dst21_l);
1498 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1500 for (loop_cnt = height >> 1; loop_cnt--;) {
1501 LD_SB2(src_tmp, src_stride, src7, src8);
1503 src_tmp += 2 * src_stride;
1505 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1506 vec0, vec1, vec2, vec3);
1508 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1509 dst7, dst7, dst7, dst7);
1513 filt_h0, filt_h1, filt_h2, filt_h3);
1515 filt_h0, filt_h1, filt_h2, filt_h3);
1519 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1520 vec0, vec1, vec2, vec3);
1522 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1523 dst8, dst8, dst8, dst8);
1527 filt_h0, filt_h1, filt_h2, filt_h3);
1529 filt_h0, filt_h1, filt_h2, filt_h3);
1539 ST8x2_UB(dst0_r, dst_tmp, dst_stride);
1540 dst_tmp += (2 * dst_stride);
1566 const int8_t *filter_x,
1567 const int8_t *filter_y,
1571 filter_x, filter_y, height, 8);
1578 const int8_t *filter_x,
1579 const int8_t *filter_y,
1583 filter_x, filter_y, height, 8);
1586 filter_x, filter_y, height);
1593 const int8_t *filter_x,
1594 const int8_t *filter_y,
1598 filter_x, filter_y, height, 16);
1605 const int8_t *filter_x,
1606 const int8_t *filter_y,
1610 filter_x, filter_y, height, 24);
1617 const int8_t *filter_x,
1618 const int8_t *filter_y,
1622 filter_x, filter_y, height, 32);
1629 const int8_t *filter_x,
1630 const int8_t *filter_y,
1634 filter_x, filter_y, height, 48);
1641 const int8_t *filter_x,
1642 const int8_t *filter_y,
1646 filter_x, filter_y, height, 64);
1653 v16i8 filt0, filt1,
src0,
src1, mask0, mask1, vec0, vec1;
1661 filt =
LD_SH(filter);
1666 LD_SB2(src, src_stride, src0, src1);
1668 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1670 res0 = __msa_srari_h(res0, 6);
1671 res0 = __msa_sat_s_h(res0, 7);
1680 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
1681 v8i16
filt, out0, out1;
1688 filt =
LD_SH(filter);
1693 LD_SB4(src, src_stride, src0, src1, src2, src3);
1696 filt0, filt1, out0, out1);
1700 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1707 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
1709 v8i16
filt, out0, out1, out2, out3;
1715 filt =
LD_SH(filter);
1720 LD_SB4(src, src_stride, src0, src1, src2, src3);
1721 src += (4 * src_stride);
1725 filt0, filt1, out0, out1);
1726 LD_SB4(src, src_stride, src0, src1, src2, src3);
1729 filt0, filt1, out2, out3);
1733 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1734 dst += (4 * dst_stride);
1736 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1743 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
1744 v16i8 filt0, filt1, mask0, mask1;
1746 v8i16
filt, out0, out1, out2, out3;
1752 filt =
LD_SH(filter);
1757 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1758 src += (8 * src_stride);
1761 filt0, filt1, out0, out1);
1763 filt0, filt1, out2, out3);
1767 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1768 dst += (4 * dst_stride);
1770 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1771 dst += (4 * dst_stride);
1773 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1774 src += (8 * src_stride);
1777 filt0, filt1, out0, out1);
1779 filt0, filt1, out2, out3);
1783 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1784 dst += (4 * dst_stride);
1786 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1795 }
else if (4 == height) {
1797 }
else if (8 == height) {
1799 }
else if (16 == height) {
1809 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
1811 v8i16
filt, out0, out1, out2, out3;
1817 filt =
LD_SH(filter);
1822 for (loop_cnt = (height >> 2); loop_cnt--;) {
1823 LD_SB4(src, src_stride, src0, src1, src2, src3);
1824 src += (4 * src_stride);
1828 filt1, out0, out1, out2, out3);
1834 ST6x4_UB(out4, out5, dst, dst_stride);
1835 dst += (4 * dst_stride);
1844 v16i8
src0,
src1, filt0, filt1, mask0, mask1;
1846 v8i16
filt, vec0, vec1, vec2, vec3;
1851 filt =
LD_SH(filter);
1856 for (loop_cnt = (height >> 1); loop_cnt--;) {
1857 LD_SB2(src, src_stride, src0, src1);
1858 src += (2 * src_stride);
1861 VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1862 DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
1863 VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
1869 dst += (2 * dst_stride);
1878 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
1880 v8i16
filt, out0, out1, out2, out3;
1886 filt =
LD_SH(filter);
1891 for (loop_cnt = (height >> 2); loop_cnt--;) {
1892 LD_SB4(src, src_stride, src0, src1, src2, src3);
1893 src += (4 * src_stride);
1897 filt1, out0, out1, out2, out3);
1902 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
1903 dst += (4 * dst_stride);
1911 if ((2 == height) || (6 == height)) {
1925 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
1926 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
1929 v8i16
filt, out0, out1, out2, out3, out4, out5;
1937 filt =
LD_SH(filter);
1943 for (loop_cnt = (height >> 2); loop_cnt--;) {
1944 LD_SB4(src, src_stride, src0, src1, src2, src3);
1945 src += (4 * src_stride);
1948 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
1949 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
1950 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
1951 DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1952 out2, out3, out4, out5);
1953 DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
1954 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
1955 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
1956 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
1957 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
1958 out2, out3, out4, out5);
1966 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
1968 ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
1969 dst += (4 * dst_stride);
1978 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
1979 v16i8 filt0, filt1, mask0, mask1;
1980 v8i16
filt, out0, out1, out2, out3, out4, out5, out6, out7;
1987 filt =
LD_SH(filter);
1992 for (loop_cnt = (height >> 2); loop_cnt--;) {
1993 LD_SB4(src, src_stride, src0, src2, src4, src6);
1994 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1995 src += (4 * src_stride);
1999 filt1, out0, out1, out2, out3);
2001 filt1, out4, out5, out6, out7);
2027 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2028 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2029 v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2030 v8i16
filt, out0, out1, out2, out3;
2037 filt =
LD_SH(filter);
2042 mask11 = mask0 + 10;
2044 for (loop_cnt = (height >> 2); loop_cnt--;) {
2045 LD_SB4(src, src_stride, src0, src2, src4, src6);
2046 LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2047 src += (4 * src_stride);
2050 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
2051 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
2052 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
2053 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
2054 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2055 out0, out1, out2, out3);
2056 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2057 out0, out1, out2, out3);
2067 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2068 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2069 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2070 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2071 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2072 out0, out1, out2, out3);
2073 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2074 out0, out1, out2, out3);
2085 VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2086 VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2087 VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
2088 VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2090 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2091 out0, out1, out2, out3);
2092 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2093 out0, out1, out2, out3);
2099 ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
2100 dst1 += (4 * dst_stride);
2109 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2110 v16i8 filt0, filt1, mask0, mask1;
2112 v8i16
filt, out0, out1, out2, out3, out4, out5, out6, out7;
2118 filt =
LD_SH(filter);
2123 for (loop_cnt = (height >> 1); loop_cnt--;) {
2125 src2 =
LD_SB(src + 16);
2126 src3 =
LD_SB(src + 24);
2129 src6 =
LD_SB(src + 16);
2130 src7 =
LD_SB(src + 24);
2131 SLDI_B2_SB(src2, src6, src0, src4, src1, src5, 8);
2136 filt0, filt1, out0, out1, out2, out3);
2138 filt0, filt1, out4, out5, out6, out7);
2146 ST_UB(out, dst + 16);
2151 ST_UB(out, dst + 16);
2160 v16i8
src0,
src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
2161 v16i8 src2110, src4332, filt0, filt1;
2167 filt =
LD_SH(filter);
2170 LD_SB3(src, src_stride, src0, src1, src2);
2171 src += (3 * src_stride);
2173 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2174 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2175 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2176 LD_SB2(src, src_stride, src3, src4);
2177 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2178 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2179 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2181 out10 = __msa_srari_h(out10, 6);
2182 out10 = __msa_sat_s_h(out10, 7);
2192 v16i8
src0,
src1, src2, src3, src4, src5;
2193 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2194 v16i8 src2110, src4332, filt0, filt1;
2195 v8i16
filt, out10, out32;
2200 filt =
LD_SH(filter);
2203 LD_SB3(src, src_stride, src0, src1, src2);
2204 src += (3 * src_stride);
2206 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2208 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2209 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2211 for (loop_cnt = (height >> 2); loop_cnt--;) {
2212 LD_SB3(src, src_stride, src3, src4, src5);
2213 src += (3 * src_stride);
2214 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2215 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2216 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2220 src += (src_stride);
2221 ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
2222 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2223 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2228 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2229 dst += (4 * dst_stride);
2250 v16u8
src0,
src1, src2, src3, vec0, vec1, vec2, vec3, out0, out1;
2251 v8i16 vec01, vec12, vec23, vec30, tmp0, tmp1, tmp2, tmp3;
2252 v8i16
filt, filt0, filt1;
2257 filt =
LD_SH(filter);
2260 LD_UB3(src, src_stride, src0, src1, src2);
2261 src += (3 * src_stride);
2263 vec0 = (v16u8) __msa_xori_b((v16u8)
src0, 128);
2264 vec1 = (v16u8) __msa_xori_b((v16u8)
src1, 128);
2265 vec2 = (v16u8) __msa_xori_b((v16u8) src2, 128);
2267 for (loop_cnt = (height >> 2); loop_cnt--;) {
2268 LD_UB4(src, src_stride, src3, src0, src1, src2);
2269 src += (4 * src_stride);
2271 vec3 = (v16u8) __msa_xori_b((v16u8) src3, 128);
2272 ILVR_B2_SH(vec1, vec0, vec3, vec2, vec01, vec23);
2275 vec0 = __msa_xori_b((v16u8) src0, 128);
2276 ILVR_B2_SH(vec2, vec1, vec0, vec3, vec12, vec30);
2279 vec1 = __msa_xori_b((v16u8) src1, 128);
2280 vec01 = (v8i16) __msa_ilvr_b((v16i8) vec1, (v16i8) vec0);
2283 vec2 = __msa_xori_b((v16u8) src2, 128);
2284 vec12 = (v8i16) __msa_ilvr_b((v16i8) vec2, (v16i8) vec1);
2291 ST6x4_UB(out0, out1, dst, dst_stride);
2292 dst += (4 * dst_stride);
2300 v16i8
src0,
src1, src2, src3, src4;
2301 v8i16 src01, src12, src23, src34, tmp0, tmp1,
filt, filt0, filt1;
2307 filt =
LD_SH(filter);
2310 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2312 ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
2314 ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
2327 uint64_t out0, out1, out2;
2328 v16i8
src0,
src1, src2, src3, src4, src5;
2329 v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2330 v8i16
filt, filt0, filt1;
2335 filt =
LD_SH(filter);
2338 LD_SB3(src, src_stride, src0, src1, src2);
2339 src += (3 * src_stride);
2342 ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
2344 for (loop_cnt = 2; loop_cnt--;) {
2345 LD_SB3(src, src_stride, src3, src4, src5);
2346 src += (3 * src_stride);
2349 ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
2354 tmp2 = __msa_srari_h(tmp2, 6);
2359 out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2360 out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2361 out2 = __msa_copy_u_d((v2i64) tmp2, 0);
2380 v16i8
src0,
src1, src2, src7, src8, src9, src10;
2381 v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2383 v8i16
filt, out0_r, out1_r, out2_r, out3_r;
2387 filt =
LD_SH(filter);
2390 LD_SB3(src, src_stride, src0, src1, src2);
2391 src += (3 * src_stride);
2394 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2396 for (loop_cnt = (height >> 2); loop_cnt--;) {
2397 LD_SB4(src, src_stride, src7, src8, src9, src10);
2398 src += (4 * src_stride);
2401 ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
2402 src72_r, src87_r, src98_r, src109_r);
2408 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2411 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2412 dst += (4 * dst_stride);
2426 }
else if (6 == height) {
2439 v16i8
src0,
src1, src2, src3, src4, src5, src6;
2440 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2442 v8i16 src10, src21, src32, src43, src54, src65, src87, src109, src1211;
2443 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5,
filt, filt0, filt1;
2444 v4u32
mask = { 2, 6, 2, 6 };
2447 filt =
LD_SH(filter);
2452 LD_SB3(src, src_stride, src0, src1, src2);
2453 src += (3 * src_stride);
2456 VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);
2458 for (loop_cnt = (height >> 2); loop_cnt--;) {
2459 LD_SB4(src, src_stride, src3, src4, src5, src6);
2460 src += (4 * src_stride);
2463 ILVR_B2_SH(src1, src0, src3, src2, src10, src32);
2464 VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
2465 VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);
2467 ILVR_B4_SH(src2, src1, src4, src3, src5, src4, src6, src5,
2468 src21, src43, src54, src65);
2472 ILVR_B3_SH(vec1, vec0, vec3, vec2, vec5, vec4, src87, src109, src1211);
2481 ST8x4_UB(out0, out1, dst, dst_stride);
2483 ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
2484 dst += (4 * dst_stride);
2500 v16i8
src0,
src1, src2, src3, src4, src5, src6;
2501 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
2502 v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
2503 v16u8 tmp0, tmp1, tmp2, tmp3;
2504 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2508 filt =
LD_SH(filter);
2511 LD_SB3(src, src_stride, src0, src1, src2);
2512 src += (3 * src_stride);
2515 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2516 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2518 for (loop_cnt = (height >> 2); loop_cnt--;) {
2519 LD_SB4(src, src_stride, src3, src4, src5, src6);
2520 src += (4 * src_stride);
2523 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2524 src32_r, src43_r, src54_r, src65_r);
2525 ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2526 src32_l, src43_l, src54_l, src65_l);
2537 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2538 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2539 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2540 out3_r, tmp0, tmp1, tmp2, tmp3);
2542 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
2543 dst += (4 * dst_stride);
2558 uint64_t out0, out1;
2559 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2560 v16i8 src11, filt0, filt1;
2561 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
2562 v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
2564 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
2568 filt =
LD_SH(filter);
2572 LD_SB3(src, src_stride, src0, src1, src2);
2574 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2575 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2578 LD_SB3(src + 16, src_stride, src6, src7, src8);
2579 src += (3 * src_stride);
2581 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2583 for (loop_cnt = (height >> 2); loop_cnt--;) {
2585 LD_SB2(src, src_stride, src3, src4);
2587 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2588 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2591 LD_SB2(src + 16, src_stride, src9, src10);
2592 src += (2 * src_stride);
2594 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2609 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2613 PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
2615 out0 = __msa_copy_u_d((v2i64) out2_r, 0);
2616 out1 = __msa_copy_u_d((v2i64) out3_r, 0);
2625 LD_SB2(src, src_stride, src5, src2);
2627 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2628 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
2631 LD_SB2(src + 16, src_stride, src11, src8);
2632 src += (2 * src_stride);
2634 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
2649 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2669 uint32_t loop_cnt, cnt;
2671 v16i8
src0,
src1, src2, src3, src4, src6, src7, src8, src9, src10;
2672 v16i8 src10_r, src32_r, src76_r, src98_r;
2673 v16i8 src21_r, src43_r, src87_r, src109_r;
2674 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2675 v16i8 src10_l, src32_l, src76_l, src98_l;
2676 v16i8 src21_l, src43_l, src87_l, src109_l;
2683 filt =
LD_SH(filter);
2686 for (cnt = (width >> 5); cnt--;) {
2691 LD_SB3(src_tmp, src_stride, src0, src1, src2);
2694 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2695 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2698 LD_SB3(src_tmp + 16, src_stride, src6, src7, src8);
2699 src_tmp += (3 * src_stride);
2702 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2703 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
2705 for (loop_cnt = (height >> 1); loop_cnt--;) {
2707 LD_SB2(src_tmp, src_stride, src3, src4);
2709 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2710 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2720 SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
2722 ST_UB(out, dst_tmp);
2724 ST_UB(out, dst_tmp + dst_stride);
2733 LD_SB2(src_tmp + 16, src_stride, src9, src10);
2734 src_tmp += (2 * src_stride);
2736 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2737 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
2747 SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
2749 ST_UB(out, dst_tmp + 16);
2751 ST_UB(out, dst_tmp + 16 + dst_stride);
2753 dst_tmp += 2 * dst_stride;
2772 filter, height, 32);
2779 const int8_t *filter_x,
2780 const int8_t *filter_y,
2783 v16i8
src0,
src1, src2, src3, src4;
2785 v4i32 filt_h0, filt_h1;
2786 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2788 v8i16 filter_vec, const_vec;
2789 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2790 v8i16 dst0, dst1, dst2, dst3, dst4;
2791 v4i32 dst0_r, dst1_r;
2792 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
2794 src -= (src_stride + 1);
2796 filter_vec =
LD_SH(filter_x);
2799 filter_vec =
LD_SH(filter_y);
2800 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
2801 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
2807 const_vec = __msa_ldi_h(128);
2810 LD_SB3(src, src_stride, src0, src1, src2);
2811 src += (3 * src_stride);
2815 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2816 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2817 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2826 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
2827 LD_SB2(src, src_stride, src3, src4);
2831 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2835 dst32_r = __msa_ilvr_h(dst3, dst2);
2840 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2844 dst43_r = __msa_ilvr_h(dst4, dst3);
2848 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
2849 dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 6);
2851 dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
2860 const int8_t *filter_x,
2861 const int8_t *filter_y,
2864 v16i8
src0,
src1, src2, src3, src4, src5, src6;
2866 v4i32 filt_h0, filt_h1;
2867 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2869 v8i16 filter_vec, const_vec;
2870 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2871 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2872 v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
2873 v8i16 out0_r, out1_r;
2874 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
2876 src -= (src_stride + 1);
2878 filter_vec =
LD_SH(filter_x);
2881 filter_vec =
LD_SH(filter_y);
2882 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
2883 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
2889 const_vec = __msa_ldi_h(128);
2892 LD_SB3(src, src_stride, src0, src1, src2);
2893 src += (3 * src_stride);
2897 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2898 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2899 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2908 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
2909 LD_SB4(src, src_stride, src3, src4, src5, src6);
2913 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2917 dst32_r = __msa_ilvr_h(dst3, dst2);
2922 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2926 dst43_r = __msa_ilvr_h(dst4, dst3);
2931 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2935 dst10_r = __msa_ilvr_h(dst5, dst4);
2940 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
2944 dst21_r = __msa_ilvr_h(dst2, dst5);
2948 PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, out0_r, out1_r);
2951 out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);
2953 ST4x4_UB(out0_r, out0_r, 0, 1, 2, 3, dst, dst_stride);
2960 const int8_t *filter_x,
2961 const int8_t *filter_y,
2965 v16i8
src0,
src1, src2, src3, src4, src5;
2966 v16i8 src6, src7, src8, src9, src10;
2968 v4i32 filt_h0, filt_h1;
2969 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2971 v8i16 filter_vec, const_vec;
2972 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2973 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
2974 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
2975 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
2976 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
2977 v8i16 out0_r, out1_r, out2_r, out3_r;
2979 src -= (src_stride + 1);
2981 filter_vec =
LD_SH(filter_x);
2984 filter_vec =
LD_SH(filter_y);
2985 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
2986 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
2992 const_vec = __msa_ldi_h(128);
2995 LD_SB3(src, src_stride, src0, src1, src2);
2996 src += (3 * src_stride);
3000 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3001 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3002 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3011 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
3013 for (loop_cnt = height >> 3; loop_cnt--;) {
3015 src3, src4, src5, src6, src7, src8, src9, src10);
3016 src += (8 * src_stride);
3021 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3025 dst32_r = __msa_ilvr_h(dst3, dst2);
3030 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3034 dst43_r = __msa_ilvr_h(dst4, dst3);
3039 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3043 dst54_r = __msa_ilvr_h(dst5, dst4);
3048 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3052 dst65_r = __msa_ilvr_h(dst6, dst5);
3057 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3061 dst76_r = __msa_ilvr_h(dst7, dst6);
3066 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
3070 dst87_r = __msa_ilvr_h(dst8, dst7);
3075 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
3079 dst10_r = __msa_ilvr_h(dst9, dst8);
3084 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
3088 dst21_r = __msa_ilvr_h(dst2, dst9);
3093 dst5_r, dst4_r, dst7_r, dst6_r,
3094 out0_r, out1_r, out2_r, out3_r);
3099 PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3100 ST4x8_UB(out0_r, out1_r, dst, dst_stride);
3101 dst += (8 * dst_stride);
3109 const int8_t *filter_x,
3110 const int8_t *filter_y,
3115 filter_x, filter_y, height);
3116 }
else if (4 == height) {
3118 filter_x, filter_y, height);
3119 }
else if (0 == (height % 8)) {
3121 filter_x, filter_y, height);
3129 const int8_t *filter_x,
3130 const int8_t *filter_y,
3134 v16i8
src0,
src1, src2, src3, src4, src5, src6;
3136 v4i32 filt_h0, filt_h1;
3137 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3139 v8i16 filter_vec, const_vec;
3140 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3141 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3142 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3143 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3144 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3145 v8i16 out0_r, out1_r, out2_r, out3_r;
3147 src -= (src_stride + 1);
3149 filter_vec =
LD_SH(filter_x);
3152 filter_vec =
LD_SH(filter_y);
3153 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3154 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3160 const_vec = __msa_ldi_h(128);
3163 LD_SB3(src, src_stride, src0, src1, src2);
3164 src += (3 * src_stride);
3168 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3169 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3170 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3182 for (loop_cnt = height >> 2; loop_cnt--;) {
3183 LD_SB4(src, src_stride, src3, src4, src5, src6);
3184 src += (4 * src_stride);
3189 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3200 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3211 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3223 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3235 dst2_l, dst2_r, dst3_l, dst3_r,
3236 out0_r, out1_r, out2_r, out3_r);
3241 PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3242 ST6x4_UB(out0_r, out1_r, dst, dst_stride);
3243 dst += (4 * dst_stride);
3251 const int8_t *filter_x,
3252 const int8_t *filter_y,
3255 v16i8
src0,
src1, src2, src3, src4;
3257 v4i32 filt_h0, filt_h1;
3258 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3260 v8i16 filter_vec, const_vec;
3261 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3262 v8i16 dst0, dst1, dst2, dst3, dst4;
3263 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3264 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3265 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3266 v8i16 out0_r, out1_r;
3268 src -= (src_stride + 1);
3270 filter_vec =
LD_SH(filter_x);
3273 filter_vec =
LD_SH(filter_y);
3274 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3275 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3281 const_vec = __msa_ldi_h(128);
3284 LD_SB3(src, src_stride, src0, src1, src2);
3285 src += (3 * src_stride);
3289 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3290 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3291 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3303 LD_SB2(src, src_stride, src3, src4);
3307 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3318 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3328 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
3331 out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);
3340 const int8_t *filter_x,
3341 const int8_t *filter_y,
3344 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
3346 v4i32 filt_h0, filt_h1;
3347 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3349 v8i16 filter_vec, const_vec;
3350 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3351 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3352 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3353 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3354 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3355 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3356 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3357 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3358 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
3360 src -= (src_stride + 1);
3362 filter_vec =
LD_SH(filter_x);
3365 filter_vec =
LD_SH(filter_y);
3366 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3367 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3373 const_vec = __msa_ldi_h(128);
3376 LD_SB3(src, src_stride, src0, src1, src2);
3377 src += (3 * src_stride);
3381 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3382 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3383 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3395 LD_SB2(src, src_stride, src3, src4);
3396 src += (2 * src_stride);
3401 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3413 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3423 LD_SB2(src, src_stride, src5, src6);
3424 src += (2 * src_stride);
3429 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3440 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3450 LD_SB2(src, src_stride, src7, src8);
3451 src += (2 * src_stride);
3456 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3468 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
3479 dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
3480 PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
3486 PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3487 out2_r = (v8i16) __msa_pckev_b((v16i8) out5_r, (v16i8) out4_r);
3489 ST8x4_UB(out0_r, out1_r, dst, dst_stride);
3490 dst += (4 * dst_stride);
3498 const int8_t *filter_x,
3499 const int8_t *filter_y,
3503 uint32_t loop_cnt, cnt;
3506 v16i8
src0,
src1, src2, src3, src4, src5, src6;
3508 v4i32 filt_h0, filt_h1;
3509 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3511 v8i16 filter_vec, const_vec;
3512 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3513 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3514 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3515 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3516 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3517 v8i16 out0_r, out1_r, out2_r, out3_r;
3519 src -= (src_stride + 1);
3521 filter_vec =
LD_SH(filter_x);
3524 filter_vec =
LD_SH(filter_y);
3525 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3526 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3532 const_vec = __msa_ldi_h(128);
3535 for (cnt = width >> 3; cnt--;) {
3539 LD_SB3(src_tmp, src_stride, src0, src1, src2);
3540 src_tmp += (3 * src_stride);
3544 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3545 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3546 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3558 for (loop_cnt = height >> 2; loop_cnt--;) {
3559 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3560 src_tmp += (4 * src_stride);
3565 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3577 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3588 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3600 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3612 dst2_l, dst2_r, dst3_l, dst3_r,
3613 out0_r, out1_r, out2_r, out3_r);
3618 PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3619 ST8x4_UB(out0_r, out1_r, dst_tmp, dst_stride);
3620 dst_tmp += (4 * dst_stride);
3632 const int8_t *filter_x,
3633 const int8_t *filter_y,
3638 filter_x, filter_y, height);
3639 }
else if (6 == height) {
3641 filter_x, filter_y, height);
3642 }
else if (0 == (height % 4)) {
3644 filter_x, filter_y, height, 8);
3652 const int8_t *filter_x,
3653 const int8_t *filter_y,
3657 filter_x, filter_y, height, 8);
3660 filter_x, filter_y, height);
3667 const int8_t *filter_x,
3668 const int8_t *filter_y,
3672 filter_x, filter_y, height, 16);
3679 const int8_t *filter_x,
3680 const int8_t *filter_y,
3684 filter_x, filter_y, height, 24);
3691 const int8_t *filter_x,
3692 const int8_t *filter_y,
3696 filter_x, filter_y, height, 32);
3699 #define UNI_MC_COPY(WIDTH) \
3700 void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
3701 ptrdiff_t dst_stride, \
3703 ptrdiff_t src_stride, \
3709 copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height); \
3722 #define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
3723 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
3734 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
3736 common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
/*
 * Instantiations of the UNI_MC wrapper generator.
 *
 * Going by the visible parts of the UNI_MC definition,
 * UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) emits a function named
 * ff_hevc_put_hevc_uni_<PEL>_<DIR><WIDTH>_8_msa() that picks
 * ff_hevc_<PEL>_filters[FILT_DIR - 1] as its filter and forwards to
 * common_<DIR1>_<TAP>t_<WIDTH>w_msa().
 * NOTE(review): the macro body is only partly visible here; confirm the
 * remaining arguments against the full definition.
 */

/* qpel, DIR = h: TAP = 8, kernels common_hz_8t_<W>w_msa, filter index mx. */
3740 UNI_MC(qpel,
h, 4, 8, hz, mx);
3741 UNI_MC(qpel,
h, 8, 8, hz, mx);
3742 UNI_MC(qpel,
h, 12, 8, hz, mx);
3743 UNI_MC(qpel,
h, 16, 8, hz, mx);
3744 UNI_MC(qpel,
h, 24, 8, hz, mx);
3745 UNI_MC(qpel,
h, 32, 8, hz, mx);
3746 UNI_MC(qpel,
h, 48, 8, hz, mx);
3747 UNI_MC(qpel,
h, 64, 8, hz, mx);
/* qpel, DIR = v: TAP = 8, kernels common_vt_8t_<W>w_msa, filter index my. */
3749 UNI_MC(qpel, v, 4, 8, vt, my);
3750 UNI_MC(qpel, v, 8, 8, vt, my);
3751 UNI_MC(qpel, v, 12, 8, vt, my);
3752 UNI_MC(qpel, v, 16, 8, vt, my);
3753 UNI_MC(qpel, v, 24, 8, vt, my);
3754 UNI_MC(qpel, v, 32, 8, vt, my);
3755 UNI_MC(qpel, v, 48, 8, vt, my);
3756 UNI_MC(qpel, v, 64, 8, vt, my);
/*
 * epel, DIR = h: TAP = 4, kernels common_hz_4t_<W>w_msa, filter index mx.
 * The epel width list differs from qpel: it includes 6 and stops at 32.
 */
3758 UNI_MC(epel,
h, 4, 4, hz, mx);
3759 UNI_MC(epel,
h, 6, 4, hz, mx);
3760 UNI_MC(epel,
h, 8, 4, hz, mx);
3761 UNI_MC(epel,
h, 12, 4, hz, mx);
3762 UNI_MC(epel,
h, 16, 4, hz, mx);
3763 UNI_MC(epel,
h, 24, 4, hz, mx);
3764 UNI_MC(epel,
h, 32, 4, hz, mx);
/* epel, DIR = v: TAP = 4, kernels common_vt_4t_<W>w_msa, filter index my. */
3766 UNI_MC(epel, v, 4, 4, vt, my);
3767 UNI_MC(epel, v, 6, 4, vt, my);
3768 UNI_MC(epel, v, 8, 4, vt, my);
3769 UNI_MC(epel, v, 12, 4, vt, my);
3770 UNI_MC(epel, v, 16, 4, vt, my);
3771 UNI_MC(epel, v, 24, 4, vt, my);
3772 UNI_MC(epel, v, 32, 4, vt, my);
3776 #define UNI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \
3777 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
3788 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
3789 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
3791 hevc_##DIR1##_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \
3792 dst_stride, filter_x, \
3793 filter_y, height); \
static void common_vt_4t_32w_mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t width)
#define HEVC_PCK_SW_SB2(in0, in1, out)
#define XORI_B5_128_SB(...)
#define XORI_B8_128_SB(...)
static void hevc_hv_uni_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hv_uni_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static const uint8_t mc_filt_mask_arr[16 *3]
static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
static void hevc_hv_uni_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void copy_width24_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
#define XORI_B2_128_SB(...)
#define PCKEV_XORI128_UB(in0, in1)
static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width)
#define XORI_B3_128_SB(...)
static void hevc_hv_uni_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hv_uni_4t_8w_mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width)
#define DPADD_SB4_SH(...)
#define SPLATI_H2_SH(...)
static void hevc_hv_uni_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void copy_width48_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define XORI_B2_128_SH(...)
#define XORI_B4_128_UB(...)
#define HEVC_PCK_SW_SB4(in0, in1, in2, in3, out)
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, filt0, filt1,out0, out1, out2, out3)
#define LD4(psrc, stride, out0, out1, out2, out3)
#define CLIP_SH_0_255(in)
static void filter(int16_t *output, ptrdiff_t out_stride, int16_t *low, ptrdiff_t low_stride, int16_t *high, ptrdiff_t high_stride, int len, uint8_t clip)
#define SPLATI_H4_SH(...)
static void hevc_hv_uni_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define CLIP_SW_0_255(in)
static void hevc_hv_uni_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define CLIP_SH2_0_255(in0, in1)
static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, mask2, mask3,filt0, filt1, filt2, filt3,out0, out1)
#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)
static void hevc_hv_uni_4t_8x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hv_uni_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define ST8x2_UB(in, pdst, stride)
static const uint16_t mask[17]
#define SPLATI_H2_SB(...)
static void hevc_hv_uni_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_uni_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define XORI_B7_128_SB(...)
static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_hz_8t_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
#define XORI_B4_128_SB(...)
static void hevc_hv_uni_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define CLIP_SH4_0_255(in0, in1, in2, in3)
#define DPADD_SB2_SH(...)
static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define SPLATI_W4_SW(...)
static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void copy_width8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void copy_width12_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define SPLATI_H4_SB(...)
#define HEVC_FILT_8TAP(in0, in1, in2, in3,filt0, filt1, filt2, filt3)
static void hevc_hv_uni_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_uni_4t_8x6_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, mask2, mask3,filt0, filt1, filt2, filt3,out0, out1, out2, out3)
#define UNI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1)
static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
static void copy_width64_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void hevc_hv_uni_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_uni_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_uni_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, filt0, filt1,out0, out1)
static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void copy_width16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
#define SD4(in0, in1, in2, in3, pdst, stride)
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)
static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static const int8_t filt[NUMTAPS]
#define ST4x8_UB(in0, in1, pdst, stride)
#define ST6x4_UB(in0, in1, pdst, stride)
static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define HEVC_FILT_4TAP(in0, in1, filt0, filt1)
static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
#define ST8x4_UB(in0, in1, pdst, stride)
#define UNI_MC_COPY(WIDTH)
static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
static void hevc_hv_uni_4t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,filt0, filt1, filt2, filt3)
static void hevc_hv_uni_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void common_hz_8t_8x8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define SPLATI_W2_SW(...)
#define ST8x1_UB(in, pdst)
static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)
static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define ST4x2_UB(in, pdst, stride)
#define LD2(psrc, stride, out0, out1)
static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t width)
static void copy_width32_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)