static const uint8_t chroma_mask_arr[16 * 5] = {
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
};
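/*
 * All kernels in this file evaluate the standard H.264 chroma interpolation
 *
 *     pred = ((8 - x) * (8 - y) * A + x * (8 - y) * B +
 *             (8 - x) * y * C + x * y * D + 32) >> 6
 *
 * where A..D are the four neighbouring chroma samples and x, y are the
 * sub-pel offsets (0..7).  The rows of chroma_mask_arr above are VSHF.B
 * shuffle patterns that gather horizontally adjacent sample pairs
 * (control values >= 16 select bytes from the second source vector).
 *
 * A scalar reference sketch of the same computation; illustrative only,
 * chroma_mc_scalar_ref is not part of the original file:
 */
static void chroma_mc_scalar_ref(uint8_t *dst, ptrdiff_t dst_stride,
                                 const uint8_t *src, ptrdiff_t src_stride,
                                 int w, int h, int x, int y)
{
    const int a = (8 - x) * (8 - y), b = x * (8 - y);
    const int c = (8 - x) * y,       d = x * y;
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j++)
            dst[j] = (a * src[j] + b * src[j + 1] +
                      c * src[j + src_stride] + d * src[j + src_stride + 1] +
                      32) >> 6;
        src += src_stride;
        dst += dst_stride;
    }
}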
uint32_t coeff0, uint32_t coeff1)
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
LD_SB2(src, src_stride, src0, src1);
src0 = __msa_vshf_b(mask, src1, src0);
res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
res_r = __msa_sat_u_h(res_r, 7);
res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
out0 = __msa_copy_u_h(res, 0);
out1 = __msa_copy_u_h(res, 2);

uint32_t coeff0, uint32_t coeff1)
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
LD_UB4(src, src_stride, src0, src1, src2, src3);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
res_r = __msa_dotp_u_h(src0, coeff_vec);
res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
res_r = __msa_sat_u_h(res_r, 7);
res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

uint32_t coeff0, uint32_t coeff1)
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
ILVR_D2_UB(src2, src0, src6, src4, src0, src4);
res_r = __msa_dotp_u_h(src0, coeff_vec);
res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
res_r = __msa_sat_u_h(res_r, 7);
res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
dst += (4 * dst_stride);
res_r = __msa_dotp_u_h(src4, coeff_vec);
res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
res_r = __msa_sat_u_h(res_r, 7);
res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
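/* The width-2 horizontal filter is dispatched on block height: the 2x2,
 * 2x4 and 2x8 kernels above are selected by the height checks below. */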
uint32_t coeff0, uint32_t coeff1,
} else if (4 == height) {
} else if (8 == height) {
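/* 4- and 8-pixel-wide horizontal kernels follow the same pattern on wider
 * vectors: VSHF.B gathers the (left, right) sample pairs, DOTP.UB applies
 * the interleaved (coeff0, coeff1) byte weights, SRARI/SAT round and
 * saturate, and PCKEV packs the results back to bytes. */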
uint32_t coeff0, uint32_t coeff1)
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
LD_SB2(src, src_stride, src0, src1);
src0 = __msa_vshf_b(mask, src1, src0);
res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
res_r = __msa_sat_u_h(res_r, 7);
res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

uint32_t coeff0, uint32_t coeff1,
v8u16 res0_r, res1_r;
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
for (row = (height >> 2); row--;) {
LD_UB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
PCKEV_B2_SW(res0_r, res0_r, res1_r, res1_r, res0, res1);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);

uint32_t coeff0, uint32_t coeff1,

uint32_t coeff0, uint32_t coeff1,
v16u8 src0, src1, src2, src3, out0, out1;
v8u16 res0, res1, res2, res3;
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
for (row = height >> 2; row--;) {
LD_UB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
coeff_vec, res0, res1, res2, res3);
SLLI_4V(res0, res1, res2, res3, 3);
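/* With only a horizontal (or only a vertical) filter the two weights sum to
 * 8, so the exact result is (w0 * a + w1 * b + 4) >> 3.  The code instead
 * shifts the dot product left by 3 and reuses the common "+32 >> 6"
 * rounding (SRARI by 6), which is arithmetically identical. */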
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
if (0 != (height % 4)) {
for (row = (height % 4); row--;) {
src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
res0 = __msa_dotp_u_h(src0, coeff_vec);
res0 = (v8u16) __msa_srari_h((v8i16) res0, 6);
res0 = __msa_sat_u_h(res0, 7);
res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
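/* Vertical-only kernels: instead of shuffling within a row, ILVR.B
 * interleaves each row with the row below it, so the same byte dot product
 * now blends vertically adjacent samples with (coeff0, coeff1). */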
uint32_t coeff0, uint32_t coeff1)
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
LD_SB3(src, src_stride, src0, src1, src2);
ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
res_r = __msa_dotp_u_h(tmp0, coeff_vec);
res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
res_r = __msa_sat_u_h(res_r, 7);
res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
out0 = __msa_copy_u_h(res, 0);
out1 = __msa_copy_u_h(res, 2);

uint32_t coeff0, uint32_t coeff1)
v16u8 tmp0, tmp1, tmp2, tmp3;
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
tmp0, tmp1, tmp2, tmp3);
ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
res_r = __msa_dotp_u_h(tmp0, coeff_vec);
res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
res_r = __msa_sat_u_h(res_r, 7);
res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

uint32_t coeff0, uint32_t coeff1)
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
v16u8 tmp0, tmp1, tmp2, tmp3;
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
src += (5 * src_stride);
LD_UB4(src, src_stride, src5, src6, src7, src8);
ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
tmp0, tmp1, tmp2, tmp3);
ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
res_r = __msa_dotp_u_h(tmp0, coeff_vec);
res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
res_r = __msa_sat_u_h(res_r, 7);
res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
dst += (4 * dst_stride);
ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
tmp0, tmp1, tmp2, tmp3);
ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
res_r = __msa_dotp_u_h(tmp0, coeff_vec);
res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
res_r = __msa_sat_u_h(res_r, 7);
res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
dst += (4 * dst_stride);

uint32_t coeff0, uint32_t coeff1,
} else if (4 == height) {
} else if (8 == height) {
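/* 4- and 8-pixel-wide vertical kernels below; their loops load four new
 * rows (src1..src4) per iteration and reuse the previous bottom row as
 * src0, so each group of four output rows needs only four extra loads. */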
uint32_t coeff0, uint32_t coeff1)
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
LD_UB3(src, src_stride, src0, src1, src2);
ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
res_r = __msa_dotp_u_h(tmp0, coeff_vec);
res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
res_r = __msa_sat_u_h(res_r, 7);
res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);

uint32_t coeff0, uint32_t coeff1,
v16u8 tmp0, tmp1, tmp2, tmp3;
v8u16 res0_r, res1_r;
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
for (row = (height >> 2); row--;) {
LD_UB4(src, src_stride, src1, src2, src3, src4);
src += (4 * src_stride);
ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
tmp0, tmp1, tmp2, tmp3);
ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
PCKEV_B2_SW(res0_r, res0_r, res1_r, res1_r, res0, res1);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);

uint32_t coeff0, uint32_t coeff1,

uint32_t coeff0, uint32_t coeff1,
v16u8 src0, src1, src2, src3, src4, out0, out1;
v8u16 res0, res1, res2, res3;
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
for (row = height >> 2; row--;) {
LD_UB4(src, src_stride, src1, src2, src3, src4);
src += (4 * src_stride);
ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
src0, src1, src2, src3);
DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
coeff_vec, res0, res1, res2, res3);
SLLI_4V(res0, res1, res2, res3, 3);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
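/* The hv kernels apply both filters: a horizontal pass with byte weights
 * (DOTP.UB), then a vertical pass that blends the 16-bit horizontal results
 * of two neighbouring rows with halfword weights (coeff_vt_vec0/1 are built
 * with __msa_fill_h, not __msa_fill_b).  Scalar sketch of one output sample,
 * illustrative only and not part of the original file:
 *
 *     hz_top = (8 - x) * top[0] + x * top[1];
 *     hz_bot = (8 - x) * bot[0] + x * bot[1];
 *     out    = ((8 - y) * hz_top + y * hz_bot + 32) >> 6;
 */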
uint32_t coef_hor0, uint32_t coef_hor1,
uint32_t coef_ver0, uint32_t coef_ver1)
v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
LD_UB3(src, src_stride, src0, src1, src2);
VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
res_vt0 = __msa_sat_u_h(res_vt0, 7);
res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
out0 = __msa_copy_u_h(res_vert, 0);
out1 = __msa_copy_u_h(res_vert, 1);

uint32_t coef_hor0, uint32_t coef_hor1,
uint32_t coef_ver0, uint32_t coef_ver1)
v16u8 tmp0, tmp1, tmp2, tmp3;
v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
res_vt0 = __msa_sat_u_h(res_vt0, 7);
res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

uint32_t coef_hor0, uint32_t coef_hor1,
uint32_t coef_ver0, uint32_t coef_ver1)
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
v16u8 tmp0, tmp1, tmp2, tmp3;
v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
src += (5 * src_stride);
LD_UB4(src, src_stride, src5, src6, src7, src8);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1);
VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3);
ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5);
DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
res_vt0 = __msa_sat_u_h(res_vt0, 7);
res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
dst += (4 * dst_stride);
DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
res_vt0 = __msa_sat_u_h(res_vt0, 7);
res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

uint32_t coef_hor0, uint32_t coef_hor1,
uint32_t coef_ver0, uint32_t coef_ver1,
coef_hor1, coef_ver0, coef_ver1);
} else if (4 == height) {
coef_hor1, coef_ver0, coef_ver1);
} else if (8 == height) {
coef_hor1, coef_ver0, coef_ver1);
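/* Wider hv kernels: the 4-wide loop processes four rows per iteration and
 * the 8-wide loop additionally carries the previous row's horizontal dot
 * product (res_hz0), so only four new horizontal passes are needed per
 * group of four output rows. */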
uint32_t coef_hor0, uint32_t coef_hor1,
uint32_t coef_ver0, uint32_t coef_ver1)
v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
LD_UB3(src, src_stride, src0, src1, src2);
VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
res_vt0 = __msa_sat_u_h(res_vt0, 7);
res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);

v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
for (row = (height >> 2); row--;) {
LD_UB4(src, src_stride, src1, src2, src3, src4);
src += (4 * src_stride);
VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);

uint32_t coef_hor0, uint32_t coef_hor1,
uint32_t coef_ver0, uint32_t coef_ver1,
coef_hor1, coef_ver0, coef_ver1);
coef_hor0, coef_hor1, coef_ver0,

uint32_t coef_hor0, uint32_t coef_hor1,
uint32_t coef_ver0, uint32_t coef_ver1,
v16u8 src0, src1, src2, src3, src4, out0, out1;
v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
for (row = (height >> 2); row--;) {
LD_UB4(src, src_stride, src1, src2, src3, src4);
src += (4 * src_stride);
VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
res_vt0 += (res_hz0 * coeff_vt_vec1);
res_vt1 += (res_hz1 * coeff_vt_vec1);
res_vt2 += (res_hz2 * coeff_vt_vec1);
res_vt3 += (res_hz3 * coeff_vt_vec1);
SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
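/* "_and_aver_dst_" variants: identical filtering, but the predicted bytes
 * are averaged with the bytes already in dst (__msa_aver_u_b) before being
 * stored; these back the ff_avg_h264_chroma_mc* entry points.  The narrow
 * versions first gather the 2- and 4-byte dst rows into one vector with
 * LW plus insert/insve before averaging. */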
uint32_t coeff0, uint32_t coeff1)
uint32_t load0, load1;
v16u8 dst_data = { 0 };
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
LD_SB2(src, src_stride, src0, src1);
load1 = LW(dst + dst_stride);
src0 = __msa_vshf_b(mask, src1, src0);
res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
res_r = __msa_sat_u_h(res_r, 7);
res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
dst_data = __msa_aver_u_b(res, dst_data);
out0 = __msa_copy_u_h((v8i16) dst_data, 0);
out1 = __msa_copy_u_h((v8i16) dst_data, 2);

uint32_t coeff0, uint32_t coeff1)
v16u8 dst0, dst1, dst2, dst3;
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
LD_UB4(src, src_stride, src0, src1, src2, src3);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
res_r = __msa_dotp_u_h(src0, coeff_vec);
res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
res_r = __msa_sat_u_h(res_r, 7);
res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
dst0 = __msa_aver_u_b((v16u8) res, dst0);

uint32_t coeff0, uint32_t coeff1)
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
v8u16 res0_r, res1_r;
v16u8 res0, res1, mask;
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5);
dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6);
dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
ILVR_D2_UB(src2, src0, src6, src4, src0, src4);
DOTP_UB2_UH(src0, src4, coeff_vec, coeff_vec, res0_r, res1_r);
PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
dst += (4 * dst_stride);
uint32_t coeff0, uint32_t coeff1,
} else if (4 == height) {
} else if (8 == height) {

uint32_t coeff0, uint32_t coeff1)
uint32_t load0, load1;
v16u8 dst_data = { 0 };
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
LD_SB2(src, src_stride, src0, src1);
load1 = LW(dst + dst_stride);
src0 = __msa_vshf_b(mask, src1, src0);
res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
res_r = __msa_sat_u_h(res_r, 7);
res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
dst_data = __msa_aver_u_b((v16u8) res, dst_data);
ST4x2_UB(dst_data, dst, dst_stride);

uint32_t load0, load1;
v8u16 res0_r, res1_r;
v16u8 res0, res1, mask;
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
for (row = (height >> 2); row--;) {
LD_UB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
load1 = LW(dst + dst_stride);
load0 = LW(dst + 2 * dst_stride);
load1 = LW(dst + 3 * dst_stride);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);

uint32_t coeff0, uint32_t coeff1,
coeff0, coeff1, height);

uint32_t coeff0, uint32_t coeff1,
v16u8 src0, src1, src2, src3, out0, out1;
v8u16 res0, res1, res2, res3;
v16u8 dst0, dst1, dst2, dst3;
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
for (row = height >> 2; row--;) {
LD_UB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
coeff_vec, res0, res1, res2, res3);
SLLI_4V(res0, res1, res2, res3, 3);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
uint32_t coeff0, uint32_t coeff1)
uint16_t out0, out1;
uint32_t load0, load1;
v16i8 src0, src1, src2, tmp0, tmp1, res;
v16u8 dst_data = { 0 };
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
LD_SB3(src, src_stride, src0, src1, src2);
load1 = LW(dst + dst_stride);
ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
res_r = __msa_sat_u_h(res_r, 7);
res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
dst_data = __msa_aver_u_b((v16u8) res, dst_data);
out0 = __msa_copy_u_h((v8i16) dst_data, 0);
out1 = __msa_copy_u_h((v8i16) dst_data, 2);

uint32_t coeff0, uint32_t coeff1)
uint32_t load0, load1;
v16i8 src0, src1, src2, src3, src4;
v16u8 tmp0, tmp1, tmp2, tmp3;
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
v16u8 dst_data = { 0 };
LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
load1 = LW(dst + dst_stride);
dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, load0);
dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, load1);
load0 = LW(dst + 2 * dst_stride);
load1 = LW(dst + 3 * dst_stride);
dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, load0);
dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, load1);
ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
tmp0, tmp1, tmp2, tmp3);
ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
res_r = __msa_dotp_u_h(tmp0, coeff_vec);
res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
res_r = __msa_sat_u_h(res_r, 7);
res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
dst += (4 * dst_stride);
uint32_t coeff0, uint32_t coeff1)
uint32_t load0, load1, load2, load3;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
v16u8 tmp0, tmp1, tmp2, tmp3;
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
v16u8 dst_data0 = { 0 };
v16u8 dst_data1 = { 0 };
LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
src += (5 * src_stride);
LD_SB4(src, src_stride, src5, src6, src7, src8);
LW4(dst, dst_stride, load0, load1, load2, load3);
dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 0, load0);
dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 1, load1);
dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 2, load2);
dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 3, load3);
LW4(dst + 4 * dst_stride, dst_stride, load0, load1, load2, load3);
dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 0, load0);
dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 1, load1);
dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 2, load2);
dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 3, load3);
ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
tmp0, tmp1, tmp2, tmp3);
ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
res_r = __msa_dotp_u_h(tmp0, coeff_vec);
res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
res_r = __msa_sat_u_h(res_r, 7);
res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data0);
dst += (4 * dst_stride);
ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
tmp0, tmp1, tmp2, tmp3);
ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
res_r = __msa_dotp_u_h(tmp0, coeff_vec);
res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
res_r = __msa_sat_u_h(res_r, 7);
res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data1);

uint32_t coeff0, uint32_t coeff1,
} else if (4 == height) {
} else if (8 == height) {

uint32_t coeff0, uint32_t coeff1)
uint32_t load0, load1;
v16i8 src0, src1, src2, tmp0, tmp1;
v16u8 dst_data = { 0 };
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
LD_SB3(src, src_stride, src0, src1, src2);
load1 = LW(dst + dst_stride);
ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
res_r = __msa_sat_u_h(res_r, 7);
res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
res = __msa_aver_u_b(res, dst_data);
uint32_t load0, load1, row;
v16i8 src0, src1, src2, src3, src4;
v16u8 tmp0, tmp1, tmp2, tmp3;
v8u16 res0_r, res1_r;
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
for (row = (height >> 2); row--;) {
LD_SB4(src, src_stride, src1, src2, src3, src4);
src += (4 * src_stride);
load1 = LW(dst + dst_stride);
load0 = LW(dst + 2 * dst_stride);
load1 = LW(dst + 3 * dst_stride);
ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
tmp0, tmp1, tmp2, tmp3);
ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);

uint32_t coeff0, uint32_t coeff1,
coeff0, coeff1, height);

uint32_t coeff0, uint32_t coeff1,
v16u8 src0, src1, src2, src3, src4;
v8u16 res0, res1, res2, res3;
v16u8 dst0, dst1, dst2, dst3;
v16i8 coeff_vec0 = __msa_fill_b(coeff0);
v16i8 coeff_vec1 = __msa_fill_b(coeff1);
v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
for (row = height >> 2; row--;) {
LD_UB4(src, src_stride, src1, src2, src3, src4);
src += (4 * src_stride);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
src0, src1, src2, src3);
DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
coeff_vec, res0, res1, res2, res3);
SLLI_4V(res0, res1, res2, res3, 3);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
uint16_t out0, out1;
v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
LD_UB3(src, src_stride, src0, src1, src2);
LD_UB2(dst, dst_stride, dst0, dst1);
VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
res_vt0 = __msa_sat_u_h(res_vt0, 7);
res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
dst0 = __msa_aver_u_b((v16u8) res, dst0);
out0 = __msa_copy_u_h((v8i16) dst0, 0);
out1 = __msa_copy_u_h((v8i16) dst0, 1);

v16u8 src0, src1, src2, src3, src4;
v16u8 tmp0, tmp1, tmp2, tmp3;
v16u8 dst0, dst1, dst2, dst3;
v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
res_vt0 = __msa_sat_u_h(res_vt0, 7);
res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
dst0 = __msa_aver_u_b((v16u8) res, dst0);
ST2x4_UB(dst0, 0, dst, dst_stride);

v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
v16u8 tmp0, tmp1, tmp2, tmp3;
v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
src += (5 * src_stride);
LD_UB4(src, src_stride, src5, src6, src7, src8);
LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5);
dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6);
dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7);
VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1);
VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3);
ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5);
DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
res_vt0 = __msa_sat_u_h(res_vt0, 7);
res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
dst0 = __msa_aver_u_b((v16u8) res, dst0);
ST2x4_UB(dst0, 0, dst, dst_stride);
dst += (4 * dst_stride);
DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
res_vt0 = __msa_sat_u_h(res_vt0, 7);
res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
dst4 = __msa_aver_u_b((v16u8) res, dst4);
ST2x4_UB(dst4, 0, dst, dst_stride);
coef_hor0, coef_hor1,
coef_ver0, coef_ver1);
} else if (4 == height) {
coef_hor0, coef_hor1,
coef_ver0, coef_ver1);
} else if (8 == height) {
coef_hor0, coef_hor1,
coef_ver0, coef_ver1);

v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
LD_UB3(src, src_stride, src0, src1, src2);
LD_UB2(dst, dst_stride, dst0, dst1);
VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
res_vt0 = __msa_sat_u_h(res_vt0, 7);
res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
dst0 = __msa_aver_u_b((v16u8) res, dst0);

v16u8 src0, src1, src2, src3, src4;
v16u8 dst0, dst1, dst2, dst3;
v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
for (row = (height >> 2); row--;) {
LD_UB4(src, src_stride, src1, src2, src3, src4);
src += (4 * src_stride);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
PCKEV_B2_UB(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);

coef_hor0, coef_hor1,
coef_ver0, coef_ver1);
coef_hor0, coef_hor1,
coef_ver0, coef_ver1, height);
v16u8 src0, src1, src2, src3, src4, out0, out1;
v8u16 res_hz0, res_hz1, res_hz2;
v8u16 res_hz3, res_hz4;
v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
v16u8 dst0, dst1, dst2, dst3;
v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
for (row = (height >> 2); row--;) {
LD_UB4(src, src_stride, src1, src2, src3, src4);
src += (4 * src_stride);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
res_vt0 += (res_hz0 * coeff_vt_vec1);
res_vt1 += (res_hz1 * coeff_vt_vec1);
res_vt2 += (res_hz2 * coeff_vt_vec1);
res_vt3 += (res_hz3 * coeff_vt_vec1);
SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
ST8x4_UB(out0, out1, dst, dst_stride);
dst += (4 * dst_stride);
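/* When x == 0 && y == 0 no filtering is needed: copy_width8_msa copies the
 * block (put) and avg_width4/avg_width8_msa average source with dst (avg),
 * with the copy loop unrolled by 12, 8, 4 or 2 rows depending on height. */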
uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
if (0 == height % 12) {
for (cnt = (height / 12); cnt--;) {
src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride);
out0 = __msa_copy_u_d((v2i64) src0, 0);
out1 = __msa_copy_u_d((v2i64) src1, 0);
out2 = __msa_copy_u_d((v2i64) src2, 0);
out3 = __msa_copy_u_d((v2i64) src3, 0);
out4 = __msa_copy_u_d((v2i64) src4, 0);
out5 = __msa_copy_u_d((v2i64) src5, 0);
out6 = __msa_copy_u_d((v2i64) src6, 0);
out7 = __msa_copy_u_d((v2i64) src7, 0);
SD4(out0, out1, out2, out3, dst, dst_stride);
dst += (4 * dst_stride);
SD4(out4, out5, out6, out7, dst, dst_stride);
dst += (4 * dst_stride);
LD_UB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
out0 = __msa_copy_u_d((v2i64) src0, 0);
out1 = __msa_copy_u_d((v2i64) src1, 0);
out2 = __msa_copy_u_d((v2i64) src2, 0);
out3 = __msa_copy_u_d((v2i64) src3, 0);
SD4(out0, out1, out2, out3, dst, dst_stride);
dst += (4 * dst_stride);
} else if (0 == height % 8) {
for (cnt = height >> 3; cnt--;) {
src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride);
out0 = __msa_copy_u_d((v2i64) src0, 0);
out1 = __msa_copy_u_d((v2i64) src1, 0);
out2 = __msa_copy_u_d((v2i64) src2, 0);
out3 = __msa_copy_u_d((v2i64) src3, 0);
out4 = __msa_copy_u_d((v2i64) src4, 0);
out5 = __msa_copy_u_d((v2i64) src5, 0);
out6 = __msa_copy_u_d((v2i64) src6, 0);
out7 = __msa_copy_u_d((v2i64) src7, 0);
SD4(out0, out1, out2, out3, dst, dst_stride);
dst += (4 * dst_stride);
SD4(out4, out5, out6, out7, dst, dst_stride);
dst += (4 * dst_stride);
} else if (0 == height % 4) {
for (cnt = (height / 4); cnt--;) {
LD_UB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
out0 = __msa_copy_u_d((v2i64) src0, 0);
out1 = __msa_copy_u_d((v2i64) src1, 0);
out2 = __msa_copy_u_d((v2i64) src2, 0);
out3 = __msa_copy_u_d((v2i64) src3, 0);
SD4(out0, out1, out2, out3, dst, dst_stride);
dst += (4 * dst_stride);
} else if (0 == height % 2) {
for (cnt = (height / 2); cnt--;) {
LD_UB2(src, src_stride, src0, src1);
src += (2 * src_stride);
out0 = __msa_copy_u_d((v2i64) src0, 0);
out1 = __msa_copy_u_d((v2i64) src1, 0);
uint32_t out0, out1, out2, out3;
v16u8 dst0, dst1, dst2, dst3;
if (0 == (height % 4)) {
for (cnt = (height / 4); cnt--;) {
LD_UB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
dst0, dst1, dst2, dst3);
out0 = __msa_copy_u_w((v4i32) dst0, 0);
out1 = __msa_copy_u_w((v4i32) dst1, 0);
out2 = __msa_copy_u_w((v4i32) dst2, 0);
out3 = __msa_copy_u_w((v4i32) dst3, 0);
SW4(out0, out1, out2, out3, dst, dst_stride);
dst += (4 * dst_stride);
} else if (0 == (height % 2)) {
for (cnt = (height / 2); cnt--;) {
LD_UB2(src, src_stride, src0, src1);
src += (2 * src_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
out0 = __msa_copy_u_w((v4i32) dst0, 0);
out1 = __msa_copy_u_w((v4i32) dst1, 0);

uint64_t out0, out1, out2, out3;
v16u8 dst0, dst1, dst2, dst3;
for (cnt = (height / 4); cnt--;) {
LD_UB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
dst0, dst1, dst2, dst3);
out0 = __msa_copy_u_d((v2i64) dst0, 0);
out1 = __msa_copy_u_d((v2i64) dst1, 0);
out2 = __msa_copy_u_d((v2i64) dst2, 0);
out3 = __msa_copy_u_d((v2i64) dst3, 0);
SD4(out0, out1, out2, out3, dst, dst_stride);
dst += (4 * dst_stride);
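/* Public entry points.  x and y are the chroma sub-pel offsets (asserted to
 * be 0..7); each path receives the weight pairs (x, 8 - x) and/or
 * (y, 8 - y).  A plausible shape of the width-8 put dispatcher,
 * reconstructed from the calls that survive below (illustrative only):
 *
 *     if (x && y)
 *         avc_chroma_hv_8w_msa(src, stride, dst, stride,
 *                              x, (8 - x), y, (8 - y), height);
 *     else if (x)
 *         avc_chroma_hz_8w_msa(src, stride, dst, stride, x, (8 - x), height);
 *     else if (y)
 *         avc_chroma_vt_8w_msa(src, stride, dst, stride, y, (8 - y), height);
 *     else
 *         copy_width8_msa(src, stride, dst, stride, height);
 */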
av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
stride, x, (8 - x), y, (8 - y), height);

av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
stride, x, (8 - x), y, (8 - y), height);
for (cnt = height; cnt--;) {
*((uint32_t *) dst) = *((uint32_t *) src);

av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
stride, x, (8 - x), y, (8 - y), height);
for (cnt = height; cnt--;) {
*((uint16_t *) dst) = *((uint16_t *) src);

av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
stride, x, (8 - x), y,
stride, x, (8 - x), height);
stride, y, (8 - y), height);

av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
stride, x, (8 - x), y,
stride, x, (8 - x), height);
stride, y, (8 - y), height);

av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
stride, x, (8 - x), y,
stride, x, (8 - x), height);
stride, y, (8 - y), height);
for (cnt = height; cnt--;) {
dst[0] = (dst[0] + src[0] + 1) >> 1;
dst[1] = (dst[1] + src[1] + 1) >> 1;
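/* Prototypes of the kernels, public entry points and generic MSA helper
 * macros referenced above: */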
static void avc_chroma_vt_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
static void avc_chroma_hz_2x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
static void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
static void avc_chroma_hz_4x4multiple_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
static void avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1)
static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1)
static void avc_chroma_hz_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
#define MUL2(in0, in1, in2, in3, out0, out1)
static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1, int32_t height)
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
#define av_assert2(cond)
static void avc_chroma_vt_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1, int32_t height)
static void avc_chroma_hv_and_aver_dst_4x4mul_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1, int32_t height)
static void avc_chroma_vt_2x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int x, int y)
void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int x, int y)
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
static void avc_chroma_hv_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1, int32_t height)
#define INSERT_W2_UB(...)
static void avc_chroma_hz_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1, int32_t height)
static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
#define SW4(in0, in1, in2, in3, pdst, stride)
void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int x, int y)
void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int x, int y)
static void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
static void avc_chroma_hz_2x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
static void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
static void avc_chroma_hz_and_aver_dst_4x4multiple_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
#define ST2x4_UB(in, stidx, pdst, stride)
static void avc_chroma_vt_2x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
static const uint8_t chroma_mask_arr[16 *5]
static void avc_chroma_hz_2x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
static void copy_width8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void avc_chroma_hz_2w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int x, int y)
static void avc_chroma_hv_2x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1)
static void avc_chroma_vt_4x4multiple_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
static void avc_chroma_vt_2w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
static void avc_chroma_hz_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
static void avc_chroma_hz_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
#define ADD2(in0, in1, in2, in3, out0, out1)
static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
static void avc_chroma_hv_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1)
#define SD4(in0, in1, in2, in3, pdst, stride)
static void avc_chroma_hv_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1, int32_t height)
#define SLLI_4V(in0, in1, in2, in3, shift)
static void avc_chroma_vt_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
#define LW4(psrc, stride, out0, out1, out2, out3)
#define ST8x4_UB(in0, in1, pdst, stride)
static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1)
static void avc_chroma_hv_4x4multiple_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1, int32_t height)
static void avc_chroma_hv_2x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1)
static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
static void avc_chroma_hv_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1)
static void avc_chroma_vt_2x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1)
static void avg_width8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void avg_width4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void avc_chroma_hv_2w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1, int32_t height)
static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
static void avc_chroma_hv_2x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coef_hor0, uint32_t coef_hor1, uint32_t coef_ver0, uint32_t coef_ver1)
#define ST8x1_UB(in, pdst)
#define ST4x2_UB(in, pdst, stride)
static void avc_chroma_vt_and_aver_dst_4x4mul_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
static void avc_chroma_vt_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint32_t coeff0, uint32_t coeff1, int32_t height)
void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int x, int y)