/* Six-input weighted sum over signed halfword vectors, accumulated in
 * 32-bit lanes and narrowed back to halfwords.
 *
 * What the visible statements do:
 *   - in5/in0 are interleaved and every adjacent halfword pair is summed
 *     into a word (__msa_hadd_s_w with both operands equal), so in0 and in5
 *     each contribute with weight 1;
 *   - the in1/in4 interleave is accumulated against a splat of -5;
 *   - the in2/in3 interleave is accumulated against a splat of 20;
 *   - both word vectors go through SRARI_W2_SW(..., 10) and
 *     SAT_SW2_SW(..., 7), then their even halfwords are packed into out0_m
 *     (tmp0_m supplies the low half, tmp1_m the high half).
 *
 * NOTE(review): ILVRL_H2_SW, ILVRL_H2_SH, DPADD_SH2_SW, SRARI_W2_SW and
 * SAT_SW2_SW are helper macros defined outside this section; the description
 * assumes they wrap the like-named MSA operations (interleave right/left,
 * dot-product-accumulate, rounding arithmetic shift, signed saturate) --
 * confirm against their definitions.
 * NOTE(review): the expression the macro evaluates to is not visible here;
 * presumably out0_m -- confirm.
 */
24 #define AVC_CALC_DPADD_H_6PIX_2COEFF_SH(in0, in1, in2, in3, in4, in5) \
26 v4i32 tmp0_m, tmp1_m; \
27 v8i16 out0_m, out1_m, out2_m, out3_m; \
28 v8i16 minus5h_m = __msa_ldi_h(-5); \
29 v8i16 plus20h_m = __msa_ldi_h(20); \
31 ILVRL_H2_SW(in5, in0, tmp0_m, tmp1_m); \
33 tmp0_m = __msa_hadd_s_w((v8i16) tmp0_m, (v8i16) tmp0_m); \
34 tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m); \
36 ILVRL_H2_SH(in1, in4, out0_m, out1_m); \
37 DPADD_SH2_SW(out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m); \
38 ILVRL_H2_SH(in2, in3, out2_m, out3_m); \
39 DPADD_SH2_SW(out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m); \
41 SRARI_W2_SW(tmp0_m, tmp1_m, 10); \
42 SAT_SW2_SW(tmp0_m, tmp1_m, 7); \
43 out0_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
/* Weighted sum of shuffled byte pairs taken from a single vector `in`.
 *
 * Three byte shuffles of `in` with itself are performed, one per mask:
 *   - mask0: adjacent signed bytes of the shuffled vector are summed into
 *     halfwords (__msa_hadd_s_h with both operands equal), weight 1;
 *   - mask1: the shuffled bytes are dot-product-accumulated against a splat
 *     of -5 into the same halfword accumulator;
 *   - mask2: the shuffled bytes are dot-product-accumulated against a splat
 *     of 20, giving out1_m.
 * No rounding, shifting or saturation is applied in the visible statements.
 *
 * `in` is used without a cast, so it must already be a v16i8-compatible
 * operand; mask0..mask2 are cast to v16i8 at each use.
 *
 * NOTE(review): the expression the macro evaluates to is not visible here;
 * presumably out1_m -- confirm.
 */
48 #define AVC_HORZ_FILTER_SH(in, mask0, mask1, mask2) \
50 v8i16 out0_m, out1_m; \
51 v16i8 tmp0_m, tmp1_m; \
52 v16i8 minus5b = __msa_ldi_b(-5); \
53 v16i8 plus20b = __msa_ldi_b(20); \
55 tmp0_m = __msa_vshf_b((v16i8) mask0, in, in); \
56 out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m); \
58 tmp0_m = __msa_vshf_b((v16i8) mask1, in, in); \
59 out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m); \
61 tmp1_m = __msa_vshf_b((v16i8) (mask2), in, in); \
62 out1_m = __msa_dpadd_s_h(out0_m, plus20b, tmp1_m); \
69 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
70 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
71 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
74 0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
75 1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
76 2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
78 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25,
79 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26
/* Six-input weighted sum over byte vectors, producing two halfword
 * accumulators (out1 and out2).
 *
 * What the visible statements do:
 *   - vec5/vec0 are interleaved into tmp0_m/tmp1_m and HADD_SB2_SH writes
 *     their pairwise sums to out1/out2 (weight 1 for vec0 and vec5);
 *   - the vec4/vec1 interleave is accumulated into out1/out2 against a
 *     splat of -5;
 *   - the vec3/vec2 interleave is accumulated into out1/out2 against a
 *     splat of 20.
 * No rounding, shifting or saturation is applied in the visible statements.
 *
 * NOTE(review): the parameter list continues past the first line; out1 and
 * out2 are presumably the trailing output parameters -- confirm.
 * NOTE(review): ILVRL_B2_SB, HADD_SB2_SH and DPADD_SB2_SH are helper macros
 * defined outside this section; the description assumes they wrap the
 * like-named MSA operations, with the two ILVRL outputs covering the right
 * and left halves of the inputs -- confirm.
 */
82 #define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5, \
85 v16i8 tmp0_m, tmp1_m; \
86 v16i8 minus5b_m = __msa_ldi_b(-5); \
87 v16i8 plus20b_m = __msa_ldi_b(20); \
89 ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m); \
90 HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2); \
91 ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m); \
92 DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2); \
93 ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m); \
94 DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2); \
/* Six-input weighted sum over byte vectors using only the right-interleaved
 * half of the inputs, accumulated into a single halfword vector tmp1_m.
 *
 * What the visible statements do:
 *   - __msa_ilvr_b interleaves vec5 with vec0 and __msa_hadd_s_h sums each
 *     adjacent signed byte pair into a halfword (weight 1 for vec0, vec5);
 *   - ILVR_B2_SB builds the vec4/vec1 and vec3/vec2 interleaves;
 *   - a single DPADD_SB2_SH accumulates both into tmp1_m, the first against
 *     a splat of -5 and the second against a splat of 20.
 * No rounding, shifting or saturation is applied in the visible statements.
 *
 * NOTE(review): the declaration of tmp1_m and the expression the macro
 * evaluates to are not visible here; presumably a v8i16 local that is also
 * the result -- confirm.
 * NOTE(review): ILVR_B2_SB and DPADD_SB2_SH are helper macros defined
 * outside this section -- confirm their operand order.
 */
97 #define AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5) \
100 v16i8 tmp0_m, tmp2_m; \
101 v16i8 minus5b_m = __msa_ldi_b(-5); \
102 v16i8 plus20b_m = __msa_ldi_b(20); \
104 tmp1_m = (v8i16) __msa_ilvr_b((v16i8) vec5, (v16i8) vec0); \
105 tmp1_m = __msa_hadd_s_h((v16i8) tmp1_m, (v16i8) tmp1_m); \
107 ILVR_B2_SB(vec4, vec1, vec3, vec2, tmp0_m, tmp2_m); \
108 DPADD_SB2_SH(tmp0_m, tmp2_m, minus5b_m, plus20b_m, tmp1_m, tmp1_m); \
/* Six-input weighted sum over signed halfword vectors using only the
 * right-interleaved half of the inputs, accumulated in 32-bit lanes and
 * narrowed back to halfwords.
 *
 * What the visible statements do:
 *   - __msa_ilvr_h interleaves vec5 with vec0 and __msa_hadd_s_w sums each
 *     adjacent halfword pair into a word (weight 1 for vec0, vec5);
 *   - ILVR_H2_SH builds the vec1/vec4 and vec2/vec3 interleaves;
 *   - a single DPADD_SH2_SW accumulates both into tmp1_m, the first against
 *     a splat of -5 and the second against a splat of 20;
 *   - the words are shifted right by 10 with rounding (__msa_srari_w),
 *     saturated with __msa_sat_s_w(..., 7), and their even halfwords are
 *     packed into tmp2_m (the same four values fill both halves, since
 *     both pack operands are tmp1_m).
 *
 * NOTE(review): the declaration of tmp1_m and the expression the macro
 * evaluates to are not visible here; presumably a v4i32 local and tmp2_m
 * respectively -- confirm.
 * NOTE(review): ILVR_H2_SH and DPADD_SH2_SW are helper macros defined
 * outside this section -- confirm their operand order.
 */
113 #define AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5) \
116 v8i16 tmp2_m, tmp3_m; \
117 v8i16 minus5h_m = __msa_ldi_h(-5); \
118 v8i16 plus20h_m = __msa_ldi_h(20); \
120 tmp1_m = (v4i32) __msa_ilvr_h((v8i16) vec5, (v8i16) vec0); \
121 tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m); \
123 ILVR_H2_SH(vec1, vec4, vec2, vec3, tmp2_m, tmp3_m); \
124 DPADD_SH2_SW(tmp2_m, tmp3_m, minus5h_m, plus20h_m, tmp1_m, tmp1_m); \
126 tmp1_m = __msa_srari_w(tmp1_m, 10); \
127 tmp1_m = __msa_sat_s_w(tmp1_m, 7); \
129 tmp2_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp1_m); \
/* Weighted sum of shuffled byte pairs drawn from two source vectors,
 * accumulated into the halfword vector hz_out_m.
 *
 * What the visible statements do:
 *   - src0/src1 are byte-shuffled under mask0 and adjacent signed bytes of
 *     the result are summed into halfwords (__msa_hadd_s_h with both
 *     operands equal), weight 1;
 *   - VSHF_B2_SB shuffles src0/src1 under mask1 and under mask2;
 *   - a single DPADD_SB2_SH accumulates both into hz_out_m, the mask1
 *     shuffle against a splat of -5 and the mask2 shuffle against a splat
 *     of 20.
 * No rounding, shifting or saturation is applied in the visible statements.
 *
 * NOTE(review): the macro name mentions XOR, but no XOR appears in the
 * visible statements; it may sit on a line not shown or be left to the
 * caller -- confirm.
 * NOTE(review): the declaration of hz_out_m and the expression the macro
 * evaluates to are not visible here; presumably a v8i16 local that is also
 * the result -- confirm.
 * NOTE(review): VSHF_B2_SB and DPADD_SB2_SH are helper macros defined
 * outside this section -- confirm their operand order.
 */
134 #define AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1, \
135 mask0, mask1, mask2) \
138 v16i8 vec0_m, vec1_m, vec2_m; \
139 v16i8 minus5b_m = __msa_ldi_b(-5); \
140 v16i8 plus20b_m = __msa_ldi_b(20); \
142 vec0_m = __msa_vshf_b((v16i8) mask0, (v16i8) src1, (v16i8) src0); \
143 hz_out_m = __msa_hadd_s_h(vec0_m, vec0_m); \
145 VSHF_B2_SB(src0, src1, src0, src1, mask1, mask2, vec1_m, vec2_m); \
146 DPADD_SB2_SH(vec1_m, vec2_m, minus5b_m, plus20b_m, hz_out_m, hz_out_m); \
159 v16i8 mask0, mask1, mask2;
160 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
161 v16i8 minus5b = __msa_ldi_b(-5);
162 v16i8 plus20b = __msa_ldi_b(20);
165 for (loop_cnt = (height >> 2); loop_cnt--;) {
166 LD_SB4(src, src_stride, src0, src1, src2, src3);
167 src += (4 * src_stride);
170 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
172 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
174 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
179 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
180 dst += (4 * dst_stride);
190 v8i16 res0, res1, res2, res3;
191 v16i8 mask0, mask1, mask2;
192 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
193 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
194 v16i8 minus5b = __msa_ldi_b(-5);
195 v16i8 plus20b = __msa_ldi_b(20);
200 for (loop_cnt = (height >> 2); loop_cnt--;) {
201 LD_SB4(src, src_stride, src0, src1, src2, src3);
202 src += (4 * src_stride);
205 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
206 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
207 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
208 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
209 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
210 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
211 res0, res1, res2, res3);
212 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
213 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
214 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
215 plus20b, res0, res1, res2, res3);
220 ST8x4_UB(out0, out1, dst, dst_stride);
221 dst += (4 * dst_stride);
230 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
231 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
232 v16i8 mask0, mask1, mask2;
233 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
234 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
235 v16i8 minus5b = __msa_ldi_b(-5);
236 v16i8 plus20b = __msa_ldi_b(20);
240 for (loop_cnt = (height >> 2); loop_cnt--;) {
241 LD_SB2(src, 8, src0, src1);
243 LD_SB2(src, 8, src2, src3);
247 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
248 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
249 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
250 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
251 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
252 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
253 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
254 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
255 minus5b, res0, res1, res2, res3);
256 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
257 plus20b, res0, res1, res2, res3);
259 LD_SB2(src, 8, src4, src5);
261 LD_SB2(src, 8, src6, src7);
265 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
266 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
267 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
268 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
269 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
270 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
271 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
272 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
273 minus5b, res4, res5, res6, res7);
274 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
275 plus20b, res4, res5, res6, res7);
280 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
281 vec0, vec1, vec2, vec3);
284 ST_SB4(vec0, vec1, vec2, vec3, dst, dst_stride);
285 dst += (4 * dst_stride);
297 v16i8 res, mask0, mask1, mask2;
298 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
299 v16i8 minus5b = __msa_ldi_b(-5);
300 v16i8 plus20b = __msa_ldi_b(20);
303 slide = 2 + hor_offset;
305 for (loop_cnt = (height >> 2); loop_cnt--;) {
306 LD_SB4(src, src_stride, src0, src1, src2, src3);
307 src += (4 * src_stride);
310 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
312 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
314 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
319 res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
320 src0 = __msa_sld_b(src0, src0, slide);
321 src1 = __msa_sld_b(src1, src1, slide);
322 src2 = __msa_sld_b(src2, src2, slide);
323 src3 = __msa_sld_b(src3, src3, slide);
324 src0 = (v16i8) __msa_insve_w((v4i32)
src0, 1, (v4i32) src1);
325 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
326 src0 = (v16i8) __msa_insve_d((v2i64)
src0, 1, (v2i64) src1);
327 res = __msa_aver_s_b(res, src0);
328 res = (v16i8) __msa_xori_b((v16u8) res, 128);
330 ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
331 dst += (4 * dst_stride);
343 v8i16 res0, res1, res2, res3;
344 v16i8 mask0, mask1, mask2;
345 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
346 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
347 v16i8 minus5b = __msa_ldi_b(-5);
348 v16i8 plus20b = __msa_ldi_b(20);
351 slide = 2 + hor_offset;
353 for (loop_cnt = height >> 2; loop_cnt--;) {
354 LD_SB4(src, src_stride, src0, src1, src2, src3);
355 src += (4 * src_stride);
358 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
359 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
360 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
361 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
362 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
363 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
364 res0, res1, res2, res3);
365 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
366 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
367 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
368 plus20b, res0, res1, res2, res3);
370 src0 = __msa_sld_b(src0, src0, slide);
371 src1 = __msa_sld_b(src1, src1, slide);
372 src2 = __msa_sld_b(src2, src2, slide);
373 src3 = __msa_sld_b(src3, src3, slide);
380 tmp0 = __msa_aver_s_b(tmp0, src0);
381 tmp1 = __msa_aver_s_b(tmp1, src1);
384 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
386 dst += (4 * dst_stride);
397 v16i8 mask0, mask1, mask2, vshf;
398 v8i16 res0, res1, res2, res3;
399 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
400 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
401 v16i8 minus5b = __msa_ldi_b(-5);
402 v16i8 plus20b = __msa_ldi_b(20);
412 for (loop_cnt = height >> 1; loop_cnt--;) {
413 LD_SB2(src, 8, src0, src1);
415 LD_SB2(src, 8, src2, src3);
419 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
420 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
421 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
422 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
423 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
424 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
425 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
426 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
427 minus5b, res0, res1, res2, res3);
428 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
429 plus20b, res0, res1, res2, res3);
430 VSHF_B2_SB(src0, src1, src2, src3, vshf, vshf, src0, src2);
435 dst0 = __msa_aver_s_b(dst0, src0);
436 dst1 = __msa_aver_s_b(dst1, src2);
440 ST_SB2(dst0, dst1, dst, dst_stride);
441 dst += (2 * dst_stride);
450 int16_t filt_const0 = 0xfb01;
451 int16_t filt_const1 = 0x1414;
452 int16_t filt_const2 = 0x1fb;
453 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
454 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
455 v16i8 src87_r, src2110, src4332, src6554, src8776;
456 v16i8 filt0, filt1, filt2;
460 filt0 = (v16i8) __msa_fill_h(filt_const0);
461 filt1 = (v16i8) __msa_fill_h(filt_const1);
462 filt2 = (v16i8) __msa_fill_h(filt_const2);
464 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
465 src += (5 * src_stride);
467 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
468 src10_r, src21_r, src32_r, src43_r);
469 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
472 for (loop_cnt = (height >> 2); loop_cnt--;) {
473 LD_SB4(src, src_stride, src5, src6, src7, src8);
474 src += (4 * src_stride);
476 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
477 src54_r, src65_r, src76_r, src87_r);
478 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
480 out10 =
DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
481 out32 =
DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
485 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
487 dst += (4 * dst_stride);
499 int16_t filt_const0 = 0xfb01;
500 int16_t filt_const1 = 0x1414;
501 int16_t filt_const2 = 0x1fb;
502 v16i8
src0,
src1, src2, src3, src4, src7, src8, src9, src10;
503 v16i8 src10_r, src32_r, src76_r, src98_r;
504 v16i8 src21_r, src43_r, src87_r, src109_r;
505 v8i16 out0_r, out1_r, out2_r, out3_r;
506 v16i8 filt0, filt1, filt2;
509 filt0 = (v16i8) __msa_fill_h(filt_const0);
510 filt1 = (v16i8) __msa_fill_h(filt_const1);
511 filt2 = (v16i8) __msa_fill_h(filt_const2);
513 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
514 src += (5 * src_stride);
517 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
518 src10_r, src21_r, src32_r, src43_r);
520 for (loop_cnt = (height >> 2); loop_cnt--;) {
521 LD_SB4(src, src_stride, src7, src8, src9, src10);
522 src += (4 * src_stride);
525 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
526 src76_r, src87_r, src98_r, src109_r);
527 out0_r =
DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
528 out1_r =
DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
529 out2_r =
DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
530 out3_r =
DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
532 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
535 ST8x4_UB(out0, out1, dst, dst_stride);
536 dst += (4 * dst_stride);
551 int16_t filt_const0 = 0xfb01;
552 int16_t filt_const1 = 0x1414;
553 int16_t filt_const2 = 0x1fb;
554 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
555 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
556 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
557 v16i8 src65_l, src87_l;
558 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
559 v16u8 res0, res1, res2, res3;
560 v16i8 filt0, filt1, filt2;
562 filt0 = (v16i8) __msa_fill_h(filt_const0);
563 filt1 = (v16i8) __msa_fill_h(filt_const1);
564 filt2 = (v16i8) __msa_fill_h(filt_const2);
566 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
567 src += (5 * src_stride);
570 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
571 src10_r, src21_r, src32_r, src43_r);
572 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
573 src10_l, src21_l, src32_l, src43_l);
575 for (loop_cnt = (height >> 2); loop_cnt--;) {
576 LD_SB4(src, src_stride, src5, src6, src7, src8);
577 src += (4 * src_stride);
580 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
581 src54_r, src65_r, src76_r, src87_r);
582 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
583 src54_l, src65_l, src76_l, src87_l);
584 out0_r =
DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
585 out1_r =
DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
586 out2_r =
DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
587 out3_r =
DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
588 out0_l =
DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
589 out1_l =
DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
590 out2_l =
DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
591 out3_l =
DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
593 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
595 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
596 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
597 out3_r, res0, res1, res2, res3);
600 ST_UB4(res0, res1, res2, res3, dst, dst_stride);
601 dst += (4 * dst_stride);
620 int16_t filt_const0 = 0xfb01;
621 int16_t filt_const1 = 0x1414;
622 int16_t filt_const2 = 0x1fb;
623 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
624 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
625 v16i8 src87_r, src2110, src4332, src6554, src8776;
627 v16i8 filt0, filt1, filt2;
630 filt0 = (v16i8) __msa_fill_h(filt_const0);
631 filt1 = (v16i8) __msa_fill_h(filt_const1);
632 filt2 = (v16i8) __msa_fill_h(filt_const2);
634 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
635 src += (5 * src_stride);
637 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
638 src10_r, src21_r, src32_r, src43_r);
639 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
642 for (loop_cnt = (height >> 2); loop_cnt--;) {
643 LD_SB4(src, src_stride, src5, src6, src7, src8);
644 src += (4 * src_stride);
646 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
647 src54_r, src65_r, src76_r, src87_r);
648 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
650 out10 =
DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
651 out32 =
DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
658 src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
659 src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
661 src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
662 src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
665 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
666 out = __msa_aver_u_b(out, (v16u8) src32_r);
668 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
669 dst += (4 * dst_stride);
683 int16_t filt_const0 = 0xfb01;
684 int16_t filt_const1 = 0x1414;
685 int16_t filt_const2 = 0x1fb;
686 v16i8
src0,
src1, src2, src3, src4, src7, src8, src9, src10;
687 v16i8 src10_r, src32_r, src76_r, src98_r;
688 v16i8 src21_r, src43_r, src87_r, src109_r;
689 v8i16 out0_r, out1_r, out2_r, out3_r;
691 v16i8 filt0, filt1, filt2;
693 filt0 = (v16i8) __msa_fill_h(filt_const0);
694 filt1 = (v16i8) __msa_fill_h(filt_const1);
695 filt2 = (v16i8) __msa_fill_h(filt_const2);
697 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
698 src += (5 * src_stride);
701 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
702 src10_r, src21_r, src32_r, src43_r);
704 for (loop_cnt = (height >> 2); loop_cnt--;) {
705 LD_SB4(src, src_stride, src7, src8, src9, src10);
706 src += (4 * src_stride);
709 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
710 src76_r, src87_r, src98_r, src109_r);
711 out0_r =
DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
712 out1_r =
DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
713 out2_r =
DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
714 out3_r =
DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
716 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
717 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, res0, res1);
720 PCKEV_D2_SB(src4, src3, src8, src7, src10_r, src32_r);
722 PCKEV_D2_SB(src3, src2, src7, src4, src10_r, src32_r);
725 res0 = __msa_aver_s_b(res0, (v16i8) src10_r);
726 res1 = __msa_aver_s_b(res1, (v16i8) src32_r);
729 ST8x4_UB(res0, res1, dst, dst_stride);
731 dst += (4 * dst_stride);
747 int16_t filt_const0 = 0xfb01;
748 int16_t filt_const1 = 0x1414;
749 int16_t filt_const2 = 0x1fb;
750 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
751 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
752 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
753 v16i8 src65_l, src87_l;
754 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
755 v16u8 res0, res1, res2, res3;
756 v16i8 filt0, filt1, filt2;
758 filt0 = (v16i8) __msa_fill_h(filt_const0);
759 filt1 = (v16i8) __msa_fill_h(filt_const1);
760 filt2 = (v16i8) __msa_fill_h(filt_const2);
762 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
763 src += (5 * src_stride);
766 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
767 src10_r, src21_r, src32_r, src43_r);
768 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
769 src10_l, src21_l, src32_l, src43_l);
771 for (loop_cnt = (height >> 2); loop_cnt--;) {
772 LD_SB4(src, src_stride, src5, src6, src7, src8);
773 src += (4 * src_stride);
776 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
777 src54_r, src65_r, src76_r, src87_r);
778 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
779 src54_l, src65_l, src76_l, src87_l);
780 out0_r =
DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
781 out1_r =
DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
782 out2_r =
DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
783 out3_r =
DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
784 out0_l =
DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
785 out1_l =
DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
786 out2_l =
DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
787 out3_l =
DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
789 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
791 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
792 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
793 out3_r, res0, res1, res2, res3);
796 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
797 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
798 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
799 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
801 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
802 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
803 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
804 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
808 ST_UB4(res0, res1, res2, res3, dst, dst_stride);
810 dst += (4 * dst_stride);
832 v16i8 mask0, mask1, mask2;
833 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
834 v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
835 v8i16 dst0, dst1, dst2, dst3;
838 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
839 src += (5 * src_stride);
844 mask0, mask1, mask2);
846 mask0, mask1, mask2);
848 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
852 for (loop_cnt = (height >> 2); loop_cnt--;) {
853 LD_SB4(src, src_stride, src0, src1, src2, src3);
854 src += (4 * src_stride);
865 PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
868 hz_out3, hz_out4, hz_out5);
870 hz_out4, hz_out5, hz_out6);
872 hz_out5, hz_out6, hz_out7);
874 hz_out6, hz_out7, hz_out8);
879 ST4x4_UB(src0, src1, 0, 2, 0, 2, dst, dst_stride);
881 dst += (4 * dst_stride);
897 v16i8 mask0, mask1, mask2;
898 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
899 v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
900 v8i16 dst0, dst1, dst2, dst3;
905 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
907 src += (5 * src_stride);
915 for (loop_cnt = (height >> 2); loop_cnt--;) {
916 LD_SB4(src, src_stride, src0, src1, src2, src3);
918 src += (4 * src_stride);
925 hz_out3, hz_out4, hz_out5);
927 hz_out4, hz_out5, hz_out6);
929 hz_out5, hz_out6, hz_out7);
931 hz_out6, hz_out7, hz_out8);
934 ST8x4_UB(out0, out1, dst, dst_stride);
936 dst += (4 * dst_stride);
950 uint32_t multiple8_cnt;
952 for (multiple8_cnt = 2; multiple8_cnt--;) {
964 v16i8
src0,
src1, src2, src3, src4, src5, src6;
965 v8i16 vt_res0, vt_res1, vt_res2, vt_res3;
966 v4i32 hz_res0, hz_res1;
968 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5;
969 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
970 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
971 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
972 v8i16 minus5h = __msa_ldi_h(-5);
973 v8i16 plus20h = __msa_ldi_h(20);
977 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
978 src += (5 * src_stride);
981 for (row = (height >> 1); row--;) {
982 LD_SB2(src, src_stride, src5, src6);
983 src += (2 * src_stride);
990 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1,
991 mask0, mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
992 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3,
993 mask0, mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
994 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
995 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
996 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
997 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
1002 dst0 = __msa_srari_h(shf_vec2, 5);
1003 dst1 = __msa_srari_h(shf_vec5, 5);
1008 dst0 = __msa_ilvod_h(zeros, dst0);
1009 dst1 = __msa_ilvod_h(zeros, dst1);
1011 ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
1014 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
1015 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
1016 dst0 = __msa_pckev_h((v8i16) hz_res1, (v8i16) hz_res0);
1021 dst += (2 * dst_stride);
1035 uint32_t multiple8_cnt;
1037 for (multiple8_cnt = 2; multiple8_cnt--;) {
1050 uint32_t multiple8_cnt;
1052 for (multiple8_cnt = 4; multiple8_cnt--;) {
1066 v16i8
src0,
src1, src2, src3, src4;
1067 v16i8 mask0, mask1, mask2;
1068 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1069 v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
1070 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1073 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1074 src += (5 * src_stride);
1079 mask0, mask1, mask2);
1081 mask0, mask1, mask2);
1083 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
1087 for (loop_cnt = (height >> 2); loop_cnt--;) {
1088 LD_SB4(src, src_stride, src0, src1, src2, src3);
1089 src += (4 * src_stride);
1099 PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
1102 hz_out3, hz_out4, hz_out5);
1104 hz_out4, hz_out5, hz_out6);
1106 hz_out5, hz_out6, hz_out7);
1108 hz_out6, hz_out7, hz_out8);
1111 dst1 = __msa_srari_h(hz_out3, 5);
1112 dst3 = __msa_srari_h(hz_out4, 5);
1113 dst5 = __msa_srari_h(hz_out5, 5);
1114 dst7 = __msa_srari_h(hz_out6, 5);
1116 dst1 = __msa_srari_h(hz_out2, 5);
1117 dst3 = __msa_srari_h(hz_out3, 5);
1118 dst5 = __msa_srari_h(hz_out4, 5);
1119 dst7 = __msa_srari_h(hz_out5, 5);
1124 dst0 = __msa_aver_s_h(dst0, dst1);
1125 dst1 = __msa_aver_s_h(dst2, dst3);
1126 dst2 = __msa_aver_s_h(dst4, dst5);
1127 dst3 = __msa_aver_s_h(dst6, dst7);
1132 ST4x4_UB(src0, src1, 0, 2, 0, 2, dst, dst_stride);
1134 dst += (4 * dst_stride);
1148 v16i8
src0,
src1, src2, src3, src4;
1149 v16i8 mask0, mask1, mask2;
1150 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1151 v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
1152 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1157 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1159 src += (5 * src_stride);
1167 for (loop_cnt = (height >> 2); loop_cnt--;) {
1168 LD_SB4(src, src_stride, src0, src1, src2, src3);
1170 src += (4 * src_stride);
1178 hz_out3, hz_out4, hz_out5);
1180 hz_out4, hz_out5, hz_out6);
1182 hz_out5, hz_out6, hz_out7);
1184 hz_out6, hz_out7, hz_out8);
1187 dst1 = __msa_srari_h(hz_out3, 5);
1188 dst3 = __msa_srari_h(hz_out4, 5);
1189 dst5 = __msa_srari_h(hz_out5, 5);
1190 dst7 = __msa_srari_h(hz_out6, 5);
1192 dst1 = __msa_srari_h(hz_out2, 5);
1193 dst3 = __msa_srari_h(hz_out3, 5);
1194 dst5 = __msa_srari_h(hz_out4, 5);
1195 dst7 = __msa_srari_h(hz_out5, 5);
1200 dst0 = __msa_aver_s_h(dst0, dst1);
1201 dst1 = __msa_aver_s_h(dst2, dst3);
1202 dst2 = __msa_aver_s_h(dst4, dst5);
1203 dst3 = __msa_aver_s_h(dst6, dst7);
1230 uint32_t multiple8_cnt;
1232 for (multiple8_cnt = 2; multiple8_cnt--;) {
1246 v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
1247 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
1248 v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
1249 v16i8 mask0, mask1, mask2;
1250 v8i16 hz_out0, hz_out1, vert_out0, vert_out1;
1256 LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
1257 src_y += (5 * src_stride);
1259 src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
1260 src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
1261 src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
1262 src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
1266 for (loop_cnt = (height >> 2); loop_cnt--;) {
1267 LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
1268 src_x += (4 * src_stride);
1282 LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
1283 src_y += (4 * src_stride);
1285 src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
1286 src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
1287 src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
1288 src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
1303 out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
1304 out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
1308 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1309 dst += (4 * dst_stride);
1324 v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
1325 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
1326 v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
1327 v16i8 mask0, mask1, mask2;
1328 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1329 v8i16 vert_out0, vert_out1, vert_out2, vert_out3;
1330 v8i16 out0, out1, out2, out3;
1334 LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
1335 src_y += (5 * src_stride);
1337 src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1);
1338 src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2);
1339 src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3);
1340 src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4);
1344 for (loop_cnt = (height >> 2); loop_cnt--;) {
1345 LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
1347 src_x += (4 * src_stride);
1354 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
1355 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
1357 LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
1358 src_y += (4 * src_stride);
1360 src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5);
1361 src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6);
1362 src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7);
1363 src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8);
1369 src_vt4, src_vt5, vert_out0, vert_out1);
1371 src_vt6, src_vt7, vert_out2, vert_out3);
1373 SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5);
1374 SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7);
1376 out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
1377 out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
1378 out2 = __msa_srari_h((hz_out2 + vert_out2), 1);
1379 out3 = __msa_srari_h((hz_out3 + vert_out3), 1);
1384 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
1386 dst += (4 * dst_stride);
1400 uint32_t multiple8_cnt;
1402 for (multiple8_cnt = 2; multiple8_cnt--;) {
1417 v16u8 dst0, dst1, dst2, dst3, res;
1419 v16i8 mask0, mask1, mask2;
1420 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1421 v16i8 minus5b = __msa_ldi_b(-5);
1422 v16i8 plus20b = __msa_ldi_b(20);
1425 LD_SB4(src, src_stride, src0, src1, src2, src3);
1427 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1429 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
1431 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1432 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1433 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1434 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
1438 ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1440 dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
1441 res = __msa_aver_u_b(res, dst0);
1443 ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
/* Fragment: 8-wide horizontal filter averaged into dst, four rows per loop
 * iteration and two iterations in total.
 * NOTE(review): the function header, the mask loads and the lines that
 * round, pack, average and store the result are not present in this
 * listing. Macro behaviour is described as names/usage suggest. */
1452 v16u8 dst0, dst1, dst2, dst3;
1453 v8i16 res0, res1, res2, res3;
1454 v16i8 mask0, mask1, mask2;
1455 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1456 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
/* Filter weights -5 and +20, replicated into every byte lane. */
1457 v16i8 minus5b = __msa_ldi_b(-5);
1458 v16i8 plus20b = __msa_ldi_b(20);
1462 for (loop_cnt = 2; loop_cnt--;) {
1463 LD_SB4(src, src_stride, src0, src1, src2, src3);
1464 src += (4 * src_stride);
1466 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* Each row is shuffled against itself, giving one result vector per row.
 * The mask0 selection seeds res0..res3 through a horizontal add. */
1469 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1470 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
1471 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
/* The mask1 selection is accumulated with weight -5. */
1472 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
1473 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
1474 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1475 res0, res1, res2, res3);
/* The mask2 selection is accumulated with weight +20. */
1476 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
1477 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
1478 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1479 plus20b, res0, res1, res2, res3);
/* dst advances by the same four rows as src. */
1485 dst += (4 * dst_stride);
/* Fragment: 16-wide horizontal filter averaged into dst; the loop runs four
 * times and stores four dst rows per iteration.
 * NOTE(review): the function header, the mask loads, the pointer advances
 * between the LD_SB2 calls and the round/saturate steps are not present in
 * this listing. Macro behaviour is described as names/usage suggest. */
1494 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
1495 v16u8 dst0, dst1, dst2, dst3;
1496 v16i8 mask0, mask1, mask2;
1497 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
1498 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1499 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
/* Filter weights -5 and +20, replicated into every byte lane. */
1500 v16i8 minus5b = __msa_ldi_b(-5);
1501 v16i8 plus20b = __msa_ldi_b(20);
1505 for (loop_cnt = 4; loop_cnt--;) {
/* Each LD_SB2 fetches two vectors 8 bytes apart, i.e. overlapping windows
 * over one 16-pixel row. */
1506 LD_SB2(src, 8, src0, src1);
1508 LD_SB2(src, 8, src2, src3);
1511 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* First half: src0..src3 feed res0..res3 (mask0 -> horizontal add,
 * mask1 -> weight -5, mask2 -> weight +20). */
1514 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
1515 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
1516 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
1517 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
1518 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
1519 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
1520 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
1521 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1522 minus5b, res0, res1, res2, res3);
1523 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1524 plus20b, res0, res1, res2, res3);
/* Second half: the same sequence on src4..src7, producing res4..res7 and
 * reusing vec0..vec11 as scratch. */
1525 LD_SB2(src, 8, src4, src5);
1527 LD_SB2(src, 8, src6, src7);
1530 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
1531 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
1532 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
1533 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
1534 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
1535 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
1536 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
1537 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1538 minus5b, res4, res5, res6, res7);
1539 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1540 plus20b, res4, res5, res6, res7);
/* Pack each pair of halfword results back to one 16-byte row. */
1545 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
1546 vec0, vec1, vec2, vec3);
/* Average the four filtered rows with dst and store them. */
1548 AVER_UB4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
1549 dst0, dst1, dst2, dst3);
1550 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
1551 dst += (4 * dst_stride);
/* Fragment: 4x4 horizontal filter whose output is first averaged with
 * shifted source pixels and then written back through dst.
 * NOTE(review): the function header, the definition of `slide`, the mask
 * loads, the initial horizontal add into out0/out1, the packing into
 * res0/res1 and the final average with dst are not present in this listing.
 * Macro behaviour is described as names/usage suggest. */
1563 v16u8 dst0, dst1, dst2, dst3;
1564 v16i8 mask0, mask1, mask2;
1565 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
/* Filter weights -5 and +20, replicated into every byte lane. */
1567 v16i8 minus5b = __msa_ldi_b(-5);
1568 v16i8 plus20b = __msa_ldi_b(20);
1579 LD_SB4(src, src_stride, src0, src1, src2, src3);
1580 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* Shuffles combine row pairs, so each output vector covers two rows. */
1583 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
/* The mask1 selection is accumulated with -5, the mask2 selection with +20. */
1585 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1586 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
1587 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1588 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
/* Slide each source row by `slide` bytes so the wanted pixels sit at the
 * start of the vector. */
1594 src0 = __msa_sld_b(src0, src0, slide);
1595 src1 = __msa_sld_b(src1, src1, slide);
1596 src2 = __msa_sld_b(src2, src2, slide);
1597 src3 = __msa_sld_b(src3, src3, slide);
/* Pair rows: word 0 of the second row is inserted as word 1 of the first. */
1598 src0 = (v16i8) __msa_insve_w((v4i32)
src0, 1, (v4i32) src1);
1599 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
/* Signed rounded average of the filter output and the slid source pixels. */
1600 res0 = (v16u8) __msa_aver_s_b((v16i8) res0,
src0);
1601 res1 = (v16u8) __msa_aver_s_b((v16i8) res1,
src1);
/* Pair the dst rows in the same layout (rows 0/1 and rows 2/3). */
1605 dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
1606 dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
/* Store words 0 and 1 of dst0, then words 0 and 1 of dst1, as four rows. */
1610 ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
/* Fragment: 8-wide horizontal filter, averaged with shifted source pixels
 * and then with dst; two loop iterations of four rows each.
 * NOTE(review): the function header, the definition of `slide`, the mask
 * loads, the round/saturate step and the tail of the AVER_ST8x4_UB call are
 * not present in this listing. Macro behaviour is described as names/usage
 * suggest. */
1622 v16i8 mask0, mask1, mask2;
1623 v16u8 dst0, dst1, dst2, dst3;
1624 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1625 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
1626 v8i16 out0, out1, out2, out3;
/* Filter weights -5 and +20, replicated into every byte lane. */
1627 v16i8 minus5b = __msa_ldi_b(-5);
1628 v16i8 plus20b = __msa_ldi_b(20);
1629 v16i8 res0, res1, res2, res3;
1639 for (loop_cnt = 2; loop_cnt--;) {
1640 LD_SB4(src, src_stride, src0, src1, src2, src3);
1641 src += (4 * src_stride);
1643 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* One result vector per row: mask0 -> horizontal add, mask1 -> weight -5,
 * mask2 -> weight +20, all accumulated in out0..out3. */
1646 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1647 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
1648 HADD_SB4_SH(vec0, vec1, vec2, vec3, out0, out1, out2, out3);
1649 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
1650 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
1651 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1652 out0, out1, out2, out3);
1653 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
1654 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
1655 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1656 plus20b, out0, out1, out2, out3);
/* Slide each source row by `slide` bytes so the wanted pixels sit at the
 * start of the vector. */
1658 src0 = __msa_sld_b(src0, src0, slide);
1659 src1 = __msa_sld_b(src1, src1, slide);
1660 src2 = __msa_sld_b(src2, src2, slide);
1661 src3 = __msa_sld_b(src3, src3, slide);
/* Narrow each halfword result vector to bytes (vector packed with itself). */
1666 PCKEV_B4_SB(out0, out0, out1, out1, out2, out2, out3, out3,
1667 res0, res1, res2, res3);
/* Signed rounded average of filter output and slid source pixels. */
1669 res0 = __msa_aver_s_b(res0, src0);
1670 res1 = __msa_aver_s_b(res1, src1);
1671 res2 = __msa_aver_s_b(res2, src2);
1672 res3 = __msa_aver_s_b(res3, src3);
1675 AVER_ST8x4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
1678 dst += (4 * dst_stride);
/* Fragment: 16-wide horizontal filter averaged with shifted source pixels;
 * eight loop iterations, two dst rows stored per iteration.
 * NOTE(review): the function header, the loads of mask0..mask2 and `vshf`,
 * the pointer advances between the LD_SB2 calls, the round/saturate/pack
 * step producing out0/out1 and the average with dst are not present in this
 * listing. Macro behaviour is described as names/usage suggest. */
1691 v16i8 mask0, mask1, mask2, vshf;
1693 v8i16 res0, res1, res2, res3;
1694 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1695 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
/* Filter weights -5 and +20, replicated into every byte lane. */
1696 v16i8 minus5b = __msa_ldi_b(-5);
1697 v16i8 plus20b = __msa_ldi_b(20);
1707 for (loop_cnt = 8; loop_cnt--;) {
/* Each LD_SB2 fetches two vectors 8 bytes apart from the current src. */
1708 LD_SB2(src, 8, src0, src1);
1710 LD_SB2(src, 8, src2, src3);
1713 LD_UB2(dst, dst_stride, dst0, dst1);
/* mask0 -> horizontal add, mask1 -> weight -5, mask2 -> weight +20,
 * accumulated in res0..res3. */
1716 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
1717 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
1718 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
1719 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
1720 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
1721 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
1722 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
1723 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1724 minus5b, res0, res1, res2, res3);
1725 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1726 plus20b, res0, res1, res2, res3);
/* Use the `vshf` pattern to rebuild one source vector per row from its two
 * 8-byte-apart halves; src0 and src2 are overwritten with the result. */
1727 VSHF_B2_SB(src0, src1, src2, src3, vshf, vshf, src0, src2);
/* Signed rounded average of filter output and the reassembled source rows. */
1732 out0 = __msa_aver_s_b(out0, src0);
1733 out1 = __msa_aver_s_b(out1, src2);
1737 ST_UB2(dst0, dst1, dst, dst_stride);
1738 dst += (2 * dst_stride);
/* Fragment: 4x4 vertical 6-tap filter averaged into dst.
 * NOTE(review): the function header, the declarations of out10/out32/res
 * and the round/saturate/pack step that yields `res` are not present in this
 * listing. Macro behaviour is described as names/usage suggest. */
/* Each 16-bit constant packs two signed byte taps (low byte, high byte):
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x1fb -> (-5, 1).
 * NOTE(review): 0xfb01 is larger than INT16_MAX, so this initialiser relies
 * on implementation-defined conversion to int16_t. */
1746 int16_t filt_const0 = 0xfb01;
1747 int16_t filt_const1 = 0x1414;
1748 int16_t filt_const2 = 0x1fb;
1749 v16u8 dst0, dst1, dst2, dst3;
1750 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1751 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1752 v16i8 src87_r, src2110, src4332, src6554, src8776;
1754 v16i8 filt0, filt1, filt2;
/* Replicate each tap pair into every halfword lane. */
1757 filt0 = (v16i8) __msa_fill_h(filt_const0);
1758 filt1 = (v16i8) __msa_fill_h(filt_const1);
1759 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Nine rows in total (5 here, 4 below) feed the four output rows. */
1761 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1762 src += (5 * src_stride);
/* Interleave vertically adjacent rows byte-wise, then join two such pairs
 * per vector (e.g. src2110 = rows 1|0 and rows 2|1). */
1764 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1765 src10_r, src21_r, src32_r, src43_r);
1766 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1768 LD_SB4(src, src_stride, src5, src6, src7, src8);
1769 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
1770 src54_r, src65_r, src76_r, src87_r);
1771 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
/* Three dot-products per output vector apply the six taps; out10 covers
 * output rows 0-1 and out32 rows 2-3, as the operand names indicate. */
1773 out10 =
DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
1774 out32 =
DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
1777 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* Collect the first 32-bit word of each dst row into one vector. */
1780 ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1782 dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
/* Rounded unsigned byte average of the filtered pixels and dst. */
1783 dst0 = __msa_aver_u_b(res, dst0);
/* Write words 0..3 as four rows of four bytes. */
1785 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
/* Fragment: 8-wide vertical 6-tap filter averaged into dst; two loop
 * iterations of four rows each.
 * NOTE(review): the function header, the round/saturate/average/store steps
 * and the end-of-loop rotation of the row history are not present in this
 * listing. Macro behaviour is described as names/usage suggest. */
/* Each 16-bit constant packs two signed byte taps (low byte, high byte):
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x1fb -> (-5, 1).
 * NOTE(review): 0xfb01 is larger than INT16_MAX, so this initialiser relies
 * on implementation-defined conversion to int16_t. */
1793 int16_t filt_const0 = 0xfb01;
1794 int16_t filt_const1 = 0x1414;
1795 int16_t filt_const2 = 0x1fb;
1796 v16u8 dst0, dst1, dst2, dst3;
1797 v16i8
src0,
src1, src2, src3, src4, src7, src8, src9, src10;
1798 v16i8 src10_r, src32_r, src76_r, src98_r;
1799 v16i8 src21_r, src43_r, src87_r, src109_r;
1800 v8i16 out0, out1, out2, out3;
1801 v16i8 filt0, filt1, filt2;
/* Replicate each tap pair into every halfword lane. */
1803 filt0 = (v16i8) __msa_fill_h(filt_const0);
1804 filt1 = (v16i8) __msa_fill_h(filt_const1);
1805 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Prime the filter with the first five rows. */
1807 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1808 src += (5 * src_stride);
/* Byte-interleave vertically adjacent rows (row1|row0, row2|row1, ...). */
1811 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1812 src10_r, src21_r, src32_r, src43_r);
1814 for (loop_cnt = 2; loop_cnt--;) {
/* Four new rows per iteration; src7 follows src4 in row order here. */
1815 LD_SB4(src, src_stride, src7, src8, src9, src10);
1816 src += (4 * src_stride);
1819 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
1820 src76_r, src87_r, src98_r, src109_r);
/* One output row per call: three row-pair vectors times three tap pairs. */
1821 out0 =
DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
1822 out1 =
DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
1823 out2 =
DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
1824 out3 =
DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
1827 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1831 dst += (4 * dst_stride);
/* Fragment: 16-wide vertical 6-tap filter averaged into dst; four loop
 * iterations of four rows each.
 * NOTE(review): the function header, the rounding shift before saturation
 * and the end-of-loop rotation of the row history are not present in this
 * listing. Macro behaviour is described as names/usage suggest. */
/* Each 16-bit constant packs two signed byte taps (low byte, high byte):
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x1fb -> (-5, 1).
 * NOTE(review): 0xfb01 is larger than INT16_MAX, so this initialiser relies
 * on implementation-defined conversion to int16_t. */
1846 int16_t filt_const0 = 0xfb01;
1847 int16_t filt_const1 = 0x1414;
1848 int16_t filt_const2 = 0x1fb;
1849 v16u8 dst0, dst1, dst2, dst3;
1850 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1851 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1852 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
1853 v16i8 src65_l, src87_l;
1854 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1855 v16i8 filt0, filt1, filt2;
1856 v16u8 res0, res1, res2, res3;
/* Replicate each tap pair into every halfword lane. */
1858 filt0 = (v16i8) __msa_fill_h(filt_const0);
1859 filt1 = (v16i8) __msa_fill_h(filt_const1);
1860 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Prime the filter with the first five rows. */
1862 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1863 src += (5 * src_stride);
/* Byte-interleave vertically adjacent rows; the _r and _l sets cover the
 * two halves of each 16-byte row. */
1866 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1867 src10_r, src21_r, src32_r, src43_r);
1868 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1869 src10_l, src21_l, src32_l, src43_l);
1871 for (loop_cnt = 4; loop_cnt--;) {
1872 LD_SB4(src, src_stride, src5, src6, src7, src8);
1873 src += (4 * src_stride);
1876 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
1877 src54_r, src65_r, src76_r, src87_r);
1878 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
1879 src54_l, src65_l, src76_l, src87_l);
/* Four output rows, each computed separately for the _r and _l halves. */
1880 out0_r =
DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1881 out1_r =
DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1882 out2_r =
DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1883 out3_r =
DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1884 out0_l =
DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
1885 out1_l =
DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
1886 out2_l =
DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
1887 out3_l =
DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
/* Saturate the halfword results (argument 7) before narrowing to bytes. */
1890 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1891 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1892 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* Rejoin the two halves of every row into one 16-byte vector. */
1893 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1894 out3_r, res0, res1, res2, res3);
/* Average with dst and store four rows. */
1896 AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
1897 res0, res1, res2, res3);
1898 ST_UB4(res0, res1, res2, res3, dst, dst_stride);
1899 dst += (4 * dst_stride);
/* Fragment: 4x4 vertical 6-tap filter, averaged with neighbouring source
 * rows and then with dst.
 * NOTE(review): the function header, the declarations of out10/out32/res,
 * the round/saturate/pack step that yields `res` and the condition that
 * selects between the two source-row pairings are not present in this
 * listing. Macro behaviour is described as names/usage suggest. */
/* Each 16-bit constant packs two signed byte taps (low byte, high byte):
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x1fb -> (-5, 1).
 * NOTE(review): 0xfb01 is larger than INT16_MAX, so this initialiser relies
 * on implementation-defined conversion to int16_t. */
1919 int16_t filt_const0 = 0xfb01;
1920 int16_t filt_const1 = 0x1414;
1921 int16_t filt_const2 = 0x1fb;
1922 v16u8 dst0, dst1, dst2, dst3;
1923 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1924 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1925 v16i8 src87_r, src2110, src4332, src6554, src8776;
1927 v16i8 filt0, filt1, filt2;
/* Replicate each tap pair into every halfword lane. */
1930 filt0 = (v16i8) __msa_fill_h(filt_const0);
1931 filt1 = (v16i8) __msa_fill_h(filt_const1);
1932 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Nine rows in total (5 here, 4 below) feed the four output rows. */
1934 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1935 src += (5 * src_stride);
/* Interleave vertically adjacent rows byte-wise, then join two such pairs
 * per vector. */
1937 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1938 src10_r, src21_r, src32_r, src43_r);
1939 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1941 LD_SB4(src, src_stride, src5, src6, src7, src8);
1942 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
1943 src54_r, src65_r, src76_r, src87_r);
1944 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
1946 out10 =
DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
1947 out32 =
DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
1950 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* src32_r/src54_r are reused as scratch for the source rows to blend with.
 * NOTE(review): the next two pairs are presumably the arms of a conditional
 * (rows 3..6 versus rows 2..5) whose if/else lines are missing here; as
 * listed, the second pair simply overwrites the first. */
1954 src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
1955 src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
1957 src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1958 src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
/* Merge the two row pairs into one vector of four 4-byte rows and average
 * it with the filter result. */
1961 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
1962 res = __msa_aver_u_b(res, (v16u8) src32_r);
/* Collect the first 32-bit word of each dst row and average once more. */
1964 ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1966 dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
1967 dst0 = __msa_aver_u_b(res, dst0);
/* Write words 0..3 as four rows of four bytes. */
1969 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
/* Fragment: 8-wide vertical 6-tap filter, averaged with neighbouring source
 * rows and then with dst; two loop iterations of four rows each.
 * NOTE(review): the function header, the declarations of res0/res1 and
 * vec0/vec1, the rounding shift, the condition selecting the source rows,
 * the average with dst and the end-of-loop rotation of the row history are
 * not present in this listing. Macro behaviour is described as names/usage
 * suggest. */
/* Each 16-bit constant packs two signed byte taps (low byte, high byte):
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x1fb -> (-5, 1).
 * NOTE(review): 0xfb01 is larger than INT16_MAX, so this initialiser relies
 * on implementation-defined conversion to int16_t. */
1979 int16_t filt_const0 = 0xfb01;
1980 int16_t filt_const1 = 0x1414;
1981 int16_t filt_const2 = 0x1fb;
1982 v16u8 dst0, dst1, dst2, dst3;
1983 v16i8
src0,
src1, src2, src3, src4, src7, src8, src9, src10;
1984 v16i8 src10_r, src32_r, src76_r, src98_r;
1985 v16i8 src21_r, src43_r, src87_r, src109_r;
1986 v8i16 out0_r, out1_r, out2_r, out3_r;
1989 v16i8 filt0, filt1, filt2;
/* Replicate each tap pair into every halfword lane. */
1991 filt0 = (v16i8) __msa_fill_h(filt_const0);
1992 filt1 = (v16i8) __msa_fill_h(filt_const1);
1993 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Prime the filter with the first five rows. */
1995 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1996 src += (5 * src_stride);
1999 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2000 src10_r, src21_r, src32_r, src43_r);
2002 for (loop_cnt = 2; loop_cnt--;) {
/* Four new rows per iteration; src7 follows src4 in row order here. */
2003 LD_SB4(src, src_stride, src7, src8, src9, src10);
2004 src += (4 * src_stride);
2007 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
2008 src76_r, src87_r, src98_r, src109_r);
2009 out0_r =
DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
2010 out1_r =
DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
2011 out2_r =
DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
2012 out3_r =
DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
/* Saturate, then pack two 8-pixel rows into each of res0 and res1. */
2014 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2015 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, res0, res1);
/* src10_r/src32_r are reused as scratch holding two source rows each.
 * NOTE(review): the next two calls are presumably the arms of a conditional
 * (rows 3,4,7,8 versus rows 2,3,4,7) whose if/else lines are missing here;
 * as listed, the second call overwrites the first. */
2018 PCKEV_D2_SB(src4, src3, src8, src7, src10_r, src32_r);
2020 PCKEV_D2_SB(src3, src2, src7, src4, src10_r, src32_r);
/* Pair up the dst rows two per vector. */
2023 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2024 ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
/* Signed rounded average of the filter output and the chosen source rows. */
2026 vec0 = (v16u8) __msa_aver_s_b(res0, src10_r);
2027 vec1 = (v16u8) __msa_aver_s_b(res1, src32_r);
2031 ST8x4_UB(vec0, vec1, dst, dst_stride);
2032 dst += (4 * dst_stride);
/* Fragment: 16-wide vertical 6-tap filter, averaged with neighbouring
 * source rows and then with dst; four loop iterations of four rows each.
 * NOTE(review): the function header, the rounding shift, the condition
 * selecting the source rows and the end-of-loop rotation of the row history
 * are not present in this listing. Macro behaviour is described as
 * names/usage suggest. */
/* Each 16-bit constant packs two signed byte taps (low byte, high byte):
 * 0xfb01 -> (1, -5), 0x1414 -> (20, 20), 0x1fb -> (-5, 1).
 * NOTE(review): 0xfb01 is larger than INT16_MAX, so this initialiser relies
 * on implementation-defined conversion to int16_t. */
2051 int16_t filt_const0 = 0xfb01;
2052 int16_t filt_const1 = 0x1414;
2053 int16_t filt_const2 = 0x1fb;
2054 v16u8 dst0, dst1, dst2, dst3;
2055 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
2056 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2057 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
2058 v16i8 src65_l, src87_l;
2059 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2060 v16i8 out0, out1, out2, out3;
2061 v16i8 filt0, filt1, filt2;
2062 v16u8 res0, res1, res2, res3;
/* Replicate each tap pair into every halfword lane. */
2064 filt0 = (v16i8) __msa_fill_h(filt_const0);
2065 filt1 = (v16i8) __msa_fill_h(filt_const1);
2066 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Prime the filter with the first five rows. */
2068 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2069 src += (5 * src_stride);
/* Byte-interleave vertically adjacent rows; the _r and _l sets cover the
 * two halves of each 16-byte row. */
2072 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2073 src10_r, src21_r, src32_r, src43_r);
2074 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2075 src10_l, src21_l, src32_l, src43_l);
2077 for (loop_cnt = 4; loop_cnt--;) {
2078 LD_SB4(src, src_stride, src5, src6, src7, src8);
2079 src += (4 * src_stride);
2082 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
2083 src54_r, src65_r, src76_r, src87_r);
2084 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
2085 src54_l, src65_l, src76_l, src87_l);
/* Four output rows, each computed separately for the _r and _l halves. */
2086 out0_r =
DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2087 out1_r =
DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2088 out2_r =
DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2089 out3_r =
DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2090 out0_l =
DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2091 out1_l =
DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
2092 out2_l =
DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2093 out3_l =
DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
/* Saturate, then rejoin the two halves of every row into 16 bytes. */
2096 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2097 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2098 PCKEV_B4_SB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2099 out3_r, out0, out1, out2, out3);
2100 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* Signed rounded average of the filter output with source rows.
 * NOTE(review): the two groups below are presumably the arms of a
 * conditional (rows 3..6 versus rows 2..5) whose if/else lines are missing
 * here; as listed, the second group overwrites the first. */
2103 res0 = (v16u8) __msa_aver_s_b(out0, src3);
2104 res1 = (v16u8) __msa_aver_s_b(out1, src4);
2105 res2 = (v16u8) __msa_aver_s_b(out2, src5);
2106 res3 = (v16u8) __msa_aver_s_b(out3, src6);
2108 res0 = (v16u8) __msa_aver_s_b(out0, src2);
2109 res1 = (v16u8) __msa_aver_s_b(out1, src3);
2110 res2 = (v16u8) __msa_aver_s_b(out2, src4);
2111 res3 = (v16u8) __msa_aver_s_b(out3, src5);
/* Average with dst and store four rows. */
2115 AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
2116 dst0, dst1, dst2, dst3);
2117 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
2118 dst += (4 * dst_stride);
/* Fragment: 4x4 filter that builds horizontal intermediates hz_out0..8 from
 * nine source rows and averages the final result into dst.
 * NOTE(review): the function header, the mask loads and the first line of
 * every filter macro call are missing from this listing; only the argument
 * tails survive, so the macro names and the averaging step cannot be
 * confirmed from here. */
2138 v16i8
src0,
src1, src2, src3, src4;
2139 v16i8 mask0, mask1, mask2;
2140 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
2141 v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
2142 v8i16 res0, res1, res2, res3;
2143 v16u8 dst0, dst1, dst2, dst3;
2144 v16u8 tmp0, tmp1, tmp2, tmp3;
/* First five source rows. */
2147 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2148 src += (5 * src_stride);
2153 mask0, mask1, mask2);
2155 mask0, mask1, mask2);
/* Split the upper doubleword of hz_out0/hz_out2 out into hz_out1/hz_out3,
 * i.e. each source vector carried two rows of intermediates. */
2157 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
/* Next four source rows; src0..src3 are reused. */
2161 LD_SB4(src, src_stride, src0, src1, src2, src3);
2165 mask0, mask1, mask2);
2167 mask0, mask1, mask2);
2169 PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
/* Tails of four calls, each ending on a window one intermediate row later
 * than the previous one. */
2172 hz_out3, hz_out4, hz_out5);
2174 hz_out4, hz_out5, hz_out6);
2176 hz_out5, hz_out6, hz_out7);
2178 hz_out6, hz_out7, hz_out8);
2179 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* Store words 0 and 2 of tmp0, then words 0 and 2 of tmp1, as four rows. */
2185 ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
/* Fragment: 8-wide variant that processes `height` rows four at a time
 * (loop count is height >> 2, so any remainder rows are not handled here).
 * NOTE(review): the function header, the mask loads, the horizontal filter
 * calls, the first line of each vertical macro call, the averaging/store
 * step and the end-of-loop rotation of hz_out* are missing from this
 * listing. */
2194 v16i8
src0,
src1, src2, src3, src4;
2195 v16i8 mask0, mask1, mask2;
2196 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
2197 v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
2198 v16u8 dst0, dst1, dst2, dst3;
2199 v8i16 res0, res1, res2, res3;
/* First five source rows prime the filter history. */
2203 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2205 src += (5 * src_stride);
2213 for (loop_cnt = (height >> 2); loop_cnt--;) {
2214 LD_SB4(src, src_stride, src0, src1, src2, src3);
2216 src += (4 * src_stride);
/* Tails of four calls, each ending on a window one intermediate row later
 * than the previous one. */
2224 hz_out3, hz_out4, hz_out5);
2226 hz_out4, hz_out5, hz_out6);
2228 hz_out5, hz_out6, hz_out7);
2230 hz_out6, hz_out7, hz_out8);
2231 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2235 dst += (4 * dst_stride);
/* Fragment: 4-wide filter that applies a horizontal 6-tap pass to vertical
 * intermediates (vt_res*), blends with a rounded neighbouring intermediate,
 * and averages into dst; two rows per loop iteration.
 * NOTE(review): the function header, the declarations of res0/res1, the
 * vertical pass producing vt_res0..3, the condition choosing between the
 * odd/even selections, the final round/saturate/pack into `res`, the store
 * and the end-of-loop row rotation are missing from this listing. Macro
 * behaviour is described as names/usage suggest. */
2263 v16i8
src0,
src1, src2, src3, src4, src5, src6;
2264 v16u8 dst0, dst1, res;
2265 v8i16 vt_res0, vt_res1, vt_res2, vt_res3;
2266 v4i32 hz_res0, hz_res1;
2268 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5;
/* Halfword shuffle indices pairing taps (0,5), (1,4) and (2,3) for four
 * consecutive output positions. */
2269 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2270 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2271 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
/* Filter weights -5 and +20 in every halfword lane. */
2272 v8i16 minus5h = __msa_ldi_h(-5);
2273 v8i16 plus20h = __msa_ldi_h(20);
2274 v8i16 zeros = { 0 };
/* First five source rows prime the vertical history. */
2276 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2277 src += (5 * src_stride);
2281 for (row = (height >> 1); row--;) {
2282 LD_SB2(src, src_stride, src5, src6);
2283 src += (2 * src_stride);
/* Two dst rows; their first words are interleaved into dst0. */
2286 LD_UB2(dst, dst_stride, dst0, dst1);
2288 dst0 = (v16u8) __msa_ilvr_w((v4i32) dst1, (v4i32) dst0);
/* Arrange the vertical intermediates into the three tap-pair layouts. */
2294 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1,
2295 mask0, mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2296 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3,
2297 mask0, mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
/* Outer pair summed with weight 1, then -5 and +20 pairs accumulated, all
 * in 32-bit lanes. */
2299 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2300 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2302 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2303 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
/* Rounding shift right by 5 of the centre-pair intermediates. */
2308 res0 = __msa_srari_h(shf_vec2, 5);
2309 res1 = __msa_srari_h(shf_vec5, 5);
/* Keep one halfword of each centre pair, widened with zeros to 32 bits.
 * NOTE(review): the odd (ilvod) and even (ILVEV) selections below are
 * presumably alternative arms of a conditional that is missing here. */
2314 res0 = __msa_ilvod_h(zeros, res0);
2315 res1 = __msa_ilvod_h(zeros, res1);
2317 ILVEV_H2_SH(res0, zeros, res1, zeros, res0, res1);
/* Signed rounded average of the two-pass result and the selected
 * intermediate, then narrow back to halfwords. */
2319 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) res0);
2320 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) res1);
2321 res0 = __msa_pckev_h((v8i16) hz_res1, (v8i16) hz_res0);
/* Rounded unsigned byte average with dst. */
2325 dst0 = __msa_aver_u_b(res, dst0);
2328 dst += (2 * dst_stride);
2345 uint32_t multiple8_cnt;
2347 for (multiple8_cnt = 2; multiple8_cnt--;) {
2349 height, horiz_offset);
2363 uint32_t multiple8_cnt;
2365 for (multiple8_cnt = 4; multiple8_cnt--;) {
2367 height, horiz_offset);
/* Fragment: 4-wide filter that combines the filtered result with a rounded
 * horizontal intermediate and averages into dst; two rows per iteration
 * (loop count is height >> 1).
 * NOTE(review): the function header, the declarations of dst0/dst1 and
 * out0/out1, the mask loads, the first line of every filter macro call, the
 * condition choosing which intermediates are rounded, the saturate/pack/
 * average-with-dst step, the stores and the end-of-loop rotation are
 * missing from this listing. */
2383 v16i8
src0,
src1, src2, src3, src4;
2385 v16i8 mask0, mask1, mask2;
2386 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
2387 v8i16 hz_out4, hz_out5, hz_out6;
2388 v8i16 res0, res1, res2, res3;
/* First five source rows prime the filter history. */
2392 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2393 src += (5 * src_stride);
2398 mask0, mask1, mask2);
2400 mask0, mask1, mask2);
/* Split the upper doubleword of hz_out0/hz_out2 out into hz_out1/hz_out3. */
2402 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
2406 for (loop_cnt = (height >> 1); loop_cnt--;) {
2407 LD_SB2(src, src_stride, src0, src1);
2408 src += (2 * src_stride);
2411 LD_UB2(dst, dst_stride, dst0, dst1);
/* hz_out6 takes the upper doubleword of hz_out5. */
2415 hz_out6 = (v8i16) __msa_pckod_d((v2i64) hz_out5, (v2i64) hz_out5);
2417 hz_out3, hz_out4, hz_out5);
2419 hz_out4, hz_out5, hz_out6);
/* Rounding shift right by 5 of two neighbouring intermediates.
 * NOTE(review): the (hz_out3, hz_out4) and (hz_out2, hz_out3) pairs are
 * presumably alternative arms of a conditional that is missing here; as
 * listed, the second pair overwrites the first. */
2422 res1 = __msa_srari_h(hz_out3, 5);
2423 res3 = __msa_srari_h(hz_out4, 5);
2425 res1 = __msa_srari_h(hz_out2, 5);
2426 res3 = __msa_srari_h(hz_out3, 5);
/* Signed rounded halfword average of filter result and intermediate. */
2431 res0 = __msa_aver_s_h(res0, res1);
2432 res1 = __msa_aver_s_h(res2, res3);
/* Extract the first 32-bit word of each output row for storing. */
2439 out0 = __msa_copy_u_w((v4i32) dst0, 0);
2440 out1 = __msa_copy_u_w((v4i32) dst1, 0);
/* Fragment: 8-wide counterpart that handles four rows per iteration (loop
 * count is height >> 2).
 * NOTE(review): the function header, the mask loads, the horizontal filter
 * calls, the first line of each vertical macro call, the condition choosing
 * which intermediates are rounded, the saturate/average/store step and the
 * end-of-loop rotation are missing from this listing. */
2462 v16i8
src0,
src1, src2, src3, src4;
2463 v16u8 dst0, dst1, dst2, dst3;
2464 v16i8 mask0, mask1, mask2;
2465 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
2466 v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
2467 v8i16 res0, res1, res2, res3;
2468 v8i16 res4, res5, res6, res7;
/* First five source rows prime the filter history. */
2472 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2474 src += (5 * src_stride);
2482 for (loop_cnt = (height >> 2); loop_cnt--;) {
2483 LD_SB4(src, src_stride, src0, src1, src2, src3);
2485 src += (4 * src_stride);
2487 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* Tails of four calls, each ending on a window one intermediate row later
 * than the previous one. */
2495 hz_out3, hz_out4, hz_out5);
2497 hz_out4, hz_out5, hz_out6);
2499 hz_out5, hz_out6, hz_out7);
2501 hz_out6, hz_out7, hz_out8);
/* Rounding shift right by 5 of four neighbouring intermediates.
 * NOTE(review): the hz_out3..6 and hz_out2..5 groups are presumably
 * alternative arms of a conditional that is missing here; as listed, the
 * second group overwrites the first. */
2504 res1 = __msa_srari_h(hz_out3, 5);
2505 res3 = __msa_srari_h(hz_out4, 5);
2506 res5 = __msa_srari_h(hz_out5, 5);
2507 res7 = __msa_srari_h(hz_out6, 5);
2509 res1 = __msa_srari_h(hz_out2, 5);
2510 res3 = __msa_srari_h(hz_out3, 5);
2511 res5 = __msa_srari_h(hz_out4, 5);
2512 res7 = __msa_srari_h(hz_out5, 5);
/* Signed rounded halfword average: even-numbered res* with odd-numbered
 * res*, compacted into res0..res3. */
2517 res0 = __msa_aver_s_h(res0, res1);
2518 res1 = __msa_aver_s_h(res2, res3);
2519 res2 = __msa_aver_s_h(res4, res5);
2520 res3 = __msa_aver_s_h(res6, res7);
2524 dst += (4 * dst_stride);
2543 for (multiple8_cnt = 2; multiple8_cnt--;) {
2545 height, vert_offset);
/* Fragment: 4x4 block built from a horizontal result (rows read at src_x)
 * and a vertical result (rows read at src_y), averaged together and then
 * averaged into dst.
 * NOTE(review): the function header, the declarations of res0/res1/res, the
 * mask loads, the first line of each filter macro call, the round/saturate
 * steps and the pack into `res` are missing from this listing. */
2558 v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
2559 v16u8 dst0, dst1, dst2, dst3;
2560 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
2561 v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
2562 v16i8 mask0, mask1, mask2;
2563 v8i16 hz_out0, hz_out1, vert_out0, vert_out1;
/* First five rows for the vertical pass. */
2568 LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
2569 src_y += (5 * src_stride);
/* Put word 0 of the following row into word 1 of each row vector, so every
 * vector holds two consecutive 4-pixel rows. */
2571 src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
2572 src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
2573 src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
2574 src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
/* Four rows for the horizontal pass plus the dst rows to blend with. */
2577 LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
2578 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2581 mask0, mask1, mask2);
2583 mask0, mask1, mask2);
/* Remaining four rows for the vertical pass, paired the same way. */
2586 LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
2588 src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
2589 src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
2590 src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
2591 src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
2597 src_vt3, src_vt4, src_vt5);
2599 src_vt5, src_vt6, src_vt7);
/* (horizontal + vertical + 1) >> 1 per halfword lane. */
2603 res1 = __msa_srari_h((hz_out1 + vert_out1), 1);
2604 res0 = __msa_srari_h((hz_out0 + vert_out0), 1);
/* Gather word 0 of the four dst rows into one vector (rows 0,1 | 2,3). */
2609 dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
2610 dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
2611 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
/* Rounded unsigned byte average with dst, then store four 4-byte rows. */
2612 dst0 = __msa_aver_u_b(res, dst0);
2614 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
/* Fragment: 8x8 block built from a horizontal result (rows read at src_x)
 * and a vertical result (rows read at src_y), averaged together; two loop
 * iterations of four rows each.
 * NOTE(review): the function header, the mask loads, the horizontal filter
 * calls producing hz_out0..3, the first line of each vertical macro call,
 * the saturate/average-with-dst/store step and the end-of-loop rotation of
 * the src_vt history are missing from this listing. */
2624 v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
2625 v16u8 dst0, dst1, dst2, dst3;
2626 v16i8 src_vt0, src_vt1, src_vt2, src_vt3;
2627 v16i8 src_vt4, src_vt5, src_vt6, src_vt7, src_vt8;
2628 v16i8 mask0, mask1, mask2;
2629 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
2630 v8i16 vert_out0, vert_out1, vert_out2, vert_out3;
2631 v8i16 out0, out1, out2, out3;
/* First five rows for the vertical pass. */
2635 LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
2636 src_y += (5 * src_stride);
/* Put doubleword 0 of the following row into doubleword 1 of each row
 * vector, so every vector holds two consecutive 8-pixel rows. */
2638 src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1);
2639 src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2);
2640 src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3);
2641 src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4);
2645 for (loop_cnt = 2; loop_cnt--;) {
2646 LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
2648 src_x += (4 * src_stride);
2650 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* Round (shift right by 5) and saturate the horizontal results. */
2655 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
2656 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
2657 LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
2658 src_y += (4 * src_stride);
2660 src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5);
2661 src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6);
2662 src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7);
2663 src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8);
2667 src_vt4, src_vt5, vert_out0, vert_out1);
2669 src_vt6, src_vt7, vert_out2, vert_out3);
/* Same rounding and saturation for the vertical results. */
2670 SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5);
2671 SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7);
/* (horizontal + vertical + 1) >> 1 per halfword lane. */
2673 out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
2674 out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
2675 out2 = __msa_srari_h((hz_out2 + vert_out2), 1);
2676 out3 = __msa_srari_h((hz_out3 + vert_out3), 1);
2681 dst += (4 * dst_stride);
/* Fragment: driver made of two passes of two iterations each.
 * NOTE(review): the function header and both loop bodies are missing from
 * this listing, so what each iteration calls cannot be confirmed here. */
2697 uint32_t multiple8_cnt;
2699 for (multiple8_cnt = 2; multiple8_cnt--;) {
/* Between the passes: move all three pointers down eight rows and back 16
 * bytes, presumably undoing two 8-byte column advances made inside the
 * first loop (not visible here) so the second pass starts at the left edge
 * of the lower half. */
2708 src_x += (8 * src_stride) - 16;
2709 src_y += (8 * src_stride) - 16;
2710 dst += (8 * dst_stride) - 16;
2712 for (multiple8_cnt = 2; multiple8_cnt--;) {
/* Fragment: copies 8-byte-wide rows from src to dst. The row-batch size is
 * chosen from the first test that `height` satisfies, in this order:
 * multiple of 12, of 8, of 4, of 2. Heights divisible by both 12 and 8
 * (e.g. 24) therefore take the 12-row path.
 * NOTE(review): the function header, the declaration of `cnt`, the first
 * line of each LD_UB8 call, the stores of the 2-row path and any handling
 * of odd heights are missing from this listing. */
2727 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
2728 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
2730 if (0 == height % 12) {
/* 12 rows per iteration: a batch of 8 followed by a batch of 4. */
2731 for (cnt = (height / 12); cnt--;) {
2733 src0, src1, src2, src3, src4, src5, src6, src7);
2734 src += (8 * src_stride);
/* Only the low 64 bits (element 0) of each loaded row are kept. */
2736 out0 = __msa_copy_u_d((v2i64) src0, 0);
2737 out1 = __msa_copy_u_d((v2i64) src1, 0);
2738 out2 = __msa_copy_u_d((v2i64) src2, 0);
2739 out3 = __msa_copy_u_d((v2i64) src3, 0);
2740 out4 = __msa_copy_u_d((v2i64) src4, 0);
2741 out5 = __msa_copy_u_d((v2i64) src5, 0);
2742 out6 = __msa_copy_u_d((v2i64) src6, 0);
2743 out7 = __msa_copy_u_d((v2i64) src7, 0);
2745 SD4(out0, out1, out2, out3, dst, dst_stride);
2746 dst += (4 * dst_stride);
2747 SD4(out4, out5, out6, out7, dst, dst_stride);
2748 dst += (4 * dst_stride);
/* Remaining 4 rows of the 12-row batch. */
2750 LD_UB4(src, src_stride, src0, src1, src2, src3);
2751 src += (4 * src_stride);
2753 out0 = __msa_copy_u_d((v2i64) src0, 0);
2754 out1 = __msa_copy_u_d((v2i64) src1, 0);
2755 out2 = __msa_copy_u_d((v2i64) src2, 0);
2756 out3 = __msa_copy_u_d((v2i64) src3, 0);
2758 SD4(out0, out1, out2, out3, dst, dst_stride);
2759 dst += (4 * dst_stride);
2761 }
else if (0 == height % 8) {
/* 8 rows per iteration. */
2762 for (cnt = height >> 3; cnt--;) {
2764 src0, src1, src2, src3, src4, src5, src6, src7);
2765 src += (8 * src_stride);
2767 out0 = __msa_copy_u_d((v2i64) src0, 0);
2768 out1 = __msa_copy_u_d((v2i64) src1, 0);
2769 out2 = __msa_copy_u_d((v2i64) src2, 0);
2770 out3 = __msa_copy_u_d((v2i64) src3, 0);
2771 out4 = __msa_copy_u_d((v2i64) src4, 0);
2772 out5 = __msa_copy_u_d((v2i64) src5, 0);
2773 out6 = __msa_copy_u_d((v2i64) src6, 0);
2774 out7 = __msa_copy_u_d((v2i64) src7, 0);
2776 SD4(out0, out1, out2, out3, dst, dst_stride);
2777 dst += (4 * dst_stride);
2778 SD4(out4, out5, out6, out7, dst, dst_stride);
2779 dst += (4 * dst_stride);
2781 }
else if (0 == height % 4) {
/* 4 rows per iteration. */
2782 for (cnt = (height / 4); cnt--;) {
2783 LD_UB4(src, src_stride, src0, src1, src2, src3);
2784 src += (4 * src_stride);
2785 out0 = __msa_copy_u_d((v2i64) src0, 0);
2786 out1 = __msa_copy_u_d((v2i64) src1, 0);
2787 out2 = __msa_copy_u_d((v2i64) src2, 0);
2788 out3 = __msa_copy_u_d((v2i64) src3, 0);
2790 SD4(out0, out1, out2, out3, dst, dst_stride);
2791 dst += (4 * dst_stride);
2793 }
else if (0 == height % 2) {
/* 2 rows per iteration. */
2794 for (cnt = (height / 2); cnt--;) {
2795 LD_UB2(src, src_stride, src0, src1);
2796 src += (2 * src_stride);
2797 out0 = __msa_copy_u_d((v2i64) src0, 0);
2798 out1 = __msa_copy_u_d((v2i64) src1, 0);
/* Fragment: copies a block in 16-byte-wide column strips, eight rows at a
 * time. The shifts (width >> 4, height >> 3) mean any width remainder below
 * 16 and height remainder below 8 is not copied by these loops.
 * NOTE(review): the function header, the declarations of cnt/loop_cnt and
 * src_tmp/dst_tmp, and the per-strip pointer set-up and advance are missing
 * from this listing. */
2815 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
/* Outer loop: one iteration per 16-byte column strip. */
2817 for (cnt = (width >> 4); cnt--;) {
/* Inner loop: eight full-vector rows loaded and stored unchanged. */
2821 for (loop_cnt = (height >> 3); loop_cnt--;) {
2822 LD_UB8(src_tmp, src_stride,
2823 src0, src1, src2, src3, src4, src5, src6, src7);
2824 src_tmp += (8 * src_stride);
2826 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
2827 dst_tmp, dst_stride);
2828 dst_tmp += (8 * dst_stride);
/* Fragment: copies 16-byte-wide rows from src to dst. The row-batch size is
 * chosen from the first test that `height` satisfies, in this order:
 * multiple of 12, of 8, of 4.
 * NOTE(review): the function header, the declaration of `cnt`, the first
 * line of the LD_UB8 call, the tail of the ST_UB8 call and the whole body
 * of the height % 8 branch are missing from this listing. */
2841 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
2843 if (0 == height % 12) {
/* 12 rows per iteration: a batch of 8 followed by a batch of 4. */
2844 for (cnt = (height / 12); cnt--;) {
2846 src0, src1, src2, src3, src4, src5, src6, src7);
2847 src += (8 * src_stride);
2848 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
2850 dst += (8 * dst_stride);
2852 LD_UB4(src, src_stride, src0, src1, src2, src3);
2853 src += (4 * src_stride);
2854 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
2855 dst += (4 * dst_stride);
2857 }
else if (0 == height % 8) {
2859 }
else if (0 == height % 4) {
/* 4 rows per iteration. */
2860 for (cnt = (height >> 2); cnt--;) {
2861 LD_UB4(src, src_stride, src0, src1, src2, src3);
2862 src += (4 * src_stride);
2864 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
2865 dst += (4 * dst_stride);
/* Fragment: averages 4-byte-wide rows of src into dst. Heights that are a
 * multiple of 4 are handled four rows at a time, otherwise heights that are
 * a multiple of 2 are handled two rows at a time.
 * NOTE(review): the function header, the declarations of `cnt` and
 * src0..src3, the averaging and stores of the 2-row path and any handling
 * of odd heights are missing from this listing. */
2875 uint32_t out0, out1, out2, out3;
2877 v16u8 dst0, dst1, dst2, dst3;
2879 if (0 == (height % 4)) {
2880 for (cnt = (height / 4); cnt--;) {
2881 LD_UB4(src, src_stride, src0, src1, src2, src3);
2882 src += (4 * src_stride);
2884 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* Row-wise average of src and dst, written back into dst0..dst3. */
2886 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
2887 dst0, dst1, dst2, dst3);
/* Only the first 32-bit word (4 pixels) of each row is stored. */
2889 out0 = __msa_copy_u_w((v4i32) dst0, 0);
2890 out1 = __msa_copy_u_w((v4i32) dst1, 0);
2891 out2 = __msa_copy_u_w((v4i32) dst2, 0);
2892 out3 = __msa_copy_u_w((v4i32) dst3, 0);
2893 SW4(out0, out1, out2, out3, dst, dst_stride);
2894 dst += (4 * dst_stride);
2896 }
else if (0 == (height % 2)) {
2897 for (cnt = (height / 2); cnt--;) {
2898 LD_UB2(src, src_stride, src0, src1);
2899 src += (2 * src_stride);
2901 LD_UB2(dst, dst_stride, dst0, dst1);
2905 out0 = __msa_copy_u_w((v4i32) dst0, 0);
2906 out1 = __msa_copy_u_w((v4i32) dst1, 0);
2920 uint64_t out0, out1, out2, out3;
2922 v16u8 dst0, dst1, dst2, dst3;
2924 for (cnt = (height / 4); cnt--;) {
2925 LD_UB4(src, src_stride, src0, src1, src2, src3);
2926 src += (4 * src_stride);
2927 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2929 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
2930 dst0, dst1, dst2, dst3);
2932 out0 = __msa_copy_u_d((v2i64) dst0, 0);
2933 out1 = __msa_copy_u_d((v2i64) dst1, 0);
2934 out2 = __msa_copy_u_d((v2i64) dst2, 0);
2935 out3 = __msa_copy_u_d((v2i64) dst3, 0);
2936 SD4(out0, out1, out2, out3, dst, dst_stride);
2937 dst += (4 * dst_stride);
2946 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
2947 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2949 for (cnt = (height / 8); cnt--;) {
2950 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2951 src += (8 * src_stride);
2952 LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2954 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
2955 dst0, dst1, dst2, dst3);
2956 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
2957 dst4, dst5, dst6, dst7);
2958 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
2959 dst += (8 * dst_stride);
3087 src - (stride * 2), stride, dst, stride, 16);
3094 src - (stride * 2) +
3095 sizeof(
uint8_t), stride, dst, stride, 16);
3102 src - (stride * 2), stride, dst, stride, 16);
3109 src - (stride * 2) +
3110 sizeof(
uint8_t), stride, dst, stride, 16);
3123 src - (stride * 2) +
3124 sizeof(
uint8_t), stride, dst, stride, 8);
3131 src - (stride * 2), stride, dst, stride, 8);
3138 src - (stride * 2) +
3139 sizeof(
uint8_t), stride, dst, stride, 8);
3153 src - (stride * 2) +
3154 sizeof(
uint8_t), stride, dst, stride, 4);
3161 src - (stride * 2), stride, dst, stride, 4);
3168 src - (stride * 2) +
3169 sizeof(
uint8_t), stride, dst, stride, 4);
3176 stride, dst, stride, 16, 0);
3183 stride, dst, stride, 16, 1);
3232 stride, dst, stride, 16, 0);
3239 stride, dst, stride, 16, 1);
3342 stride, dst, stride, 0);
3349 stride, dst, stride, 1);
3356 stride, dst, stride, 0);
3363 stride, dst, stride, 1);
3370 stride, dst, stride, 0);
3377 stride, dst, stride, 1);
3385 stride, dst, stride);
3392 src - (stride * 2) +
3402 stride, dst, stride);
3409 src - (stride * 2) +
3419 stride, dst, stride);
3426 src - (stride * 2) +
3427 sizeof(
uint8_t), stride, dst, stride);
3435 stride, dst, stride);
3442 src - (stride * 2) +
3443 sizeof(
uint8_t), stride, dst, stride);
3452 stride, dst, stride);
3459 src - (stride * 2) +
3460 sizeof(
uint8_t), stride, dst, stride);
3468 stride, dst, stride);
3475 src - (stride * 2) +
3476 sizeof(
uint8_t), stride, dst, stride);
3483 stride, dst, stride, 16, 0);
3490 stride, dst, stride, 16, 1);
3497 stride, dst, stride, 8, 0);
3504 stride, dst, stride, 8, 1);
3511 stride, dst, stride, 4, 0);
3518 stride, dst, stride, 4, 1);
3543 stride, dst, stride, 16, 0);
3550 stride, dst, stride, 16, 1);
3557 stride, dst, stride, 8, 0);
3564 stride, dst, stride, 8, 1);
3571 stride, dst, stride, 4, 0);
3578 stride, dst, stride, 4, 1);
3585 stride, dst, stride);
3592 stride, dst, stride, 8);
3599 stride, dst, stride);
void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_hz_qrt_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, uint8_t hor_offset)
void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define XORI_B5_128_SB(...)
static void avc_luma_vt_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_mid_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_midh_qrt_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, uint8_t horiz_offset)
static void avc_luma_hz_qrt_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t hor_offset)
static void avc_luma_midv_qrt_and_aver_dst_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, uint8_t vert_offset)
void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_midv_qrt_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, uint8_t ver_offset)
#define XORI_B2_128_SB(...)
#define PCKEV_XORI128_UB(in0, in1)
void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avg_width16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
void ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_hz_qrt_and_aver_dst_8x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t hor_offset)
static void avc_luma_hv_qrt_8w_msa(const uint8_t *src_x, const uint8_t *src_y, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
#define DPADD_SB4_SH(...)
void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_hv_qrt_4w_msa(const uint8_t *src_x, const uint8_t *src_y, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_midh_qrt_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, uint8_t horiz_offset)
void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_vt_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)
static void avc_luma_midv_qrt_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, uint8_t ver_offset)
#define XORI_B4_128_UB(...)
void ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_hz_and_aver_dst_16x16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_vt_qrt_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, uint8_t ver_offset)
void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_hz_qrt_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, uint8_t hor_offset)
#define XORI_B2_128_UB(...)
static void avc_luma_hz_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void avc_luma_mid_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_midv_qrt_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, uint8_t vert_offset)
#define AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5)
static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x, const uint8_t *src_y, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_midh_qrt_and_aver_dst_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, uint8_t horiz_offset)
void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)
static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_midh_qrt_and_aver_dst_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, uint8_t horiz_offset)
static void copy_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
#define SW4(in0, in1, in2, in3, pdst, stride)
static void avc_luma_mid_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
#define XORI_B4_128_SB(...)
void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t width)
static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
#define AVC_CALC_DPADD_H_6PIX_2COEFF_SH(in0, in1, in2, in3, in4, in5)
static void avc_luma_midh_qrt_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, uint8_t horiz_offset)
void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define DPADD_SB2_SH(...)
static void avc_luma_vt_qrt_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, uint8_t ver_offset)
void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static const uint8_t luma_mask_arr[16 *8]
static void avc_luma_hz_qrt_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, uint8_t hor_offset)
#define AVC_HORZ_FILTER_SH(in, mask0, mask1, mask2)
static void avc_luma_hv_qrt_16w_msa(const uint8_t *src_x, const uint8_t *src_y, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_hz_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,dst0, dst1, dst2, dst3, pdst, stride)
void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_vt_and_aver_dst_16x16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_vt_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_vt_qrt_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, uint8_t ver_offset)
void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
BYTE int const BYTE int int int height
void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_midv_qrt_and_aver_dst_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, uint8_t vert_offset)
void ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_midh_qrt_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, uint8_t horiz_offset)
void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define SD4(in0, in1, in2, in3, pdst, stride)
static void avc_luma_hz_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
#define DPADD_SH2_SW(...)
static void avc_luma_hz_qrt_and_aver_dst_16x16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t hor_offset)
void ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_hz_and_aver_dst_8x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_vt_qrt_and_aver_dst_16x16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t ver_offset)
#define AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,mask0, mask1, mask2)
GLint GLenum GLboolean GLsizei stride
static void avc_luma_vt_qrt_and_aver_dst_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t ver_offset)
static void avc_luma_midv_qrt_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, uint8_t ver_offset)
static void avc_luma_mid_and_aver_dst_16x16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
#define ST8x4_UB(in0, in1, pdst, stride)
static void avc_luma_mid_and_aver_dst_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_vt_qrt_and_aver_dst_8x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t ver_offset)
void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5)
void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_vt_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_hv_qrt_and_aver_dst_4x4_msa(const uint8_t *src_x, const uint8_t *src_y, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
void ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x, const uint8_t *src_y, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
void ff_avg_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define ST8x1_UB(in, pdst)
static void copy_width16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
#define ST4x2_UB(in, pdst, stride)
static void avc_luma_hz_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void avc_luma_mid_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
void ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
#define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5,out1, out2)
void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)
static void avc_luma_vt_and_aver_dst_8x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src, ptrdiff_t stride)