int16_t *dst,
int32_t dst_stride,
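/* 4-wide copy path: each pair of rows is widened from u8 to s16 and
 * pre-scaled by << 6, the intermediate precision HEVC inter prediction
 * expects from its copy/MC stages. */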
        LD_SB2(src, src_stride, src0, src1);

        src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
        in0 = (v8i16) __msa_ilvr_b(zero, src0);
    } else if (4 == height) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
        ST8x4_UB(in0, in1, dst, 2 * dst_stride);
    } else if (0 == height % 8) {
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v8i16 in0, in1, in2, in3;

        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_SB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
                       src0, src1, src2, src3);
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       in0, in1, in2, in3);
            ST8x8_UB(in0, in1, in2, in3, dst, 2 * dst_stride);
            dst += (8 * dst_stride);
int16_t *dst,
int32_t dst_stride,
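/* 6-wide copy: rows are widened the same way; ST12x8_UB writes only
 * 12 bytes (six s16 samples) per row, so the unused upper lanes never
 * reach dst. */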
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0, in1, in2, in3);
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in4, in5, in6, in7);
        ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, dst, 2 * dst_stride);
        dst += (8 * dst_stride);
int16_t *dst,
int32_t dst_stride,
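/* 8-wide copy: one v8i16 vector covers a full output row, so every
 * height branch is just load, zero-interleave, shift by 6, store. */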
        LD_SB2(src, src_stride, src0, src1);
        ST_SH2(in0, in1, dst, dst_stride);
    } else if (4 == height) {
        v8i16 in0, in1, in2, in3;

        LD_SB4(src, src_stride, src0, src1, src2, src3);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0, in1, in2, in3);
        SLLI_4V(in0, in1, in2, in3, 6);
        ST_SH4(in0, in1, in2, in3, dst, dst_stride);
    } else if (6 == height) {
        v16i8 src0, src1, src2, src3, src4, src5;
        v8i16 in0, in1, in2, in3, in4, in5;

        LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0, in1, in2, in3);
        SLLI_4V(in0, in1, in2, in3, 6);
        ST_SH6(in0, in1, in2, in3, in4, in5, dst, dst_stride);
    } else if (0 == height % 8) {
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_SB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                       in0, in1, in2, in3);
            ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                       in4, in5, in6, in7);
            SLLI_4V(in0, in1, in2, in3, 6);
            SLLI_4V(in4, in5, in6, in7, 6);
            ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, dst, dst_stride);
            dst += (8 * dst_stride);
int16_t *dst,
int32_t dst_stride,
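/* 12-wide copy: the low 8 samples of each row go out as full v8i16
 * stores; the remaining 4 samples of two rows are packed together with
 * ILVL_W2_SB and stored as 8-byte halves at dst + 8. */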
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in0_r, in1_r, in2_r, in3_r;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
        dst += (4 * dst_stride);

        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_r, in1_r, in2_r, in3_r);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        ILVL_W2_SB(src5, src4, src7, src6, src0, src1);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
        dst += (4 * dst_stride);
int16_t *dst,
int32_t dst_stride,
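/* 16-wide copy: ILVR_B4_SH/ILVL_B4_SH split each 16-byte row into its
 * low and high 8-sample halves, both shifted by 6 and stored side by
 * side. */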
        v8i16 in0_r, in1_r, in2_r, in3_r;
        v8i16 in0_l, in1_l, in2_l, in3_l;

        LD_SB4(src, src_stride, src0, src1, src2, src3);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
    } else if (12 == height) {
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v16i8 src8, src9, src10, src11;
        v8i16 in0_r, in1_r, in2_r, in3_r;
        v8i16 in0_l, in1_l, in2_l, in3_l;

        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        LD_SB4(src, src_stride, src8, src9, src10, src11);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
        dst += (4 * dst_stride);

        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
        dst += (4 * dst_stride);

        ILVR_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
    } else if (0 == (height % 8)) {
        v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
        v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
                   src7);
            src += (8 * src_stride);
            ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r,
                       in1_r, in2_r, in3_r);
            ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l,
                       in1_l, in2_l, in3_l);
            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
            dst += (4 * dst_stride);

            ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r,
                       in1_r, in2_r, in3_r);
            ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l,
                       in1_l, in2_l, in3_l);
            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
            dst += (4 * dst_stride);
int16_t *dst,
int32_t dst_stride,

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        LD_SB4((src + 16), src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,
                   in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,
                   in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r,
                   in2_r, in3_r);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, (dst + 16), dst_stride);
        dst += (4 * dst_stride);
int16_t *dst,
int32_t dst_stride,

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4((src + 16), src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,
                   in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,
                   in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
        dst += dst_stride;
        ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8);
        dst += dst_stride;

        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r,
                   in2_r, in3_r);
        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l, in1_l,
                   in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
        dst += dst_stride;
        ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8);
        dst += dst_stride;
int16_t *dst,
int32_t dst_stride,

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 src8, src9, src10, src11;
    v8i16 in0_r, in1_r, in2_r, in3_r, in4_r, in5_r;
    v8i16 in0_l, in1_l, in2_l, in3_l, in4_l, in5_l;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src += src_stride;
        LD_SB3(src, 16, src3, src4, src5);
        src += src_stride;
        LD_SB3(src, 16, src6, src7, src8);
        src += src_stride;
        LD_SB3(src, 16, src9, src10, src11);
        src += src_stride;

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_l, in1_l, in2_l, in3_l);
        ILVR_B2_SH(zero, src4, zero, src5, in4_r, in5_r);
        ILVL_B2_SH(zero, src4, zero, src5, in4_l, in5_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6);
        ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8);
        dst += dst_stride;
        ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8);
        dst += dst_stride;

        ILVR_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9,
                   in0_l, in1_l, in2_l, in3_l);
        ILVR_B2_SH(zero, src10, zero, src11, in4_r, in5_r);
        ILVL_B2_SH(zero, src10, zero, src11, in4_l, in5_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6);
        ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8);
        dst += dst_stride;
        ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8);
        dst += dst_stride;
int16_t *dst,
int32_t dst_stride,

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_SB4(src, 16, src4, src5, src6, src7);
        src += src_stride;

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
        ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8);
        dst += dst_stride;

        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_r, in1_r, in2_r, in3_r);
        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
        ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8);
        dst += dst_stride;
int16_t *dst,
int32_t dst_stride,
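/* Horizontal 8-tap (luma) filter, 4-wide: mask0 gathers the sliding
 * 4-sample windows of two rows into one vector (mask1..mask3 are
 * presumably mask0 + 2/4/6 in the full source), so each output vector
 * carries two filtered rows. */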
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
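/*
 * Scalar sketch of what each VSHF_B4_SB + DPADD_SB4_SH pair below
 * computes (illustrative only):
 *
 *     dst[x] = bias + sum_{k = 0..7} filter[k] * src[x + k - 3];
 *
 * Each filtN halfword packs two adjacent 8-bit taps; the dot-product
 * intrinsics multiply byte pairs and accumulate into 16-bit lanes, so
 * four chained calls cover all eight taps. The bias is const_vec,
 * which cancels the +128 unsigned-to-signed adjustment of the pixels.
 */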
    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
        dst += (8 * dst_stride);
int16_t *dst,
int32_t dst_stride,

    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);
int16_t *dst,
int32_t dst_stride,

int16_t *dst,
int32_t dst_stride,

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst4 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst5 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst6 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst6, dst6, dst6, dst6);
        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst7 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst7, dst7, dst7, dst7);

        ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride);
        ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride);
        dst += (4 * dst_stride);
int16_t *dst,
int32_t dst_stride,

    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    const_vec = __msa_ldi_h(128);
    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += src_stride;
        LD_SB2(src, 16, src2, src3);
        src += src_stride;

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst4 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst5 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);

        ST_SH2(dst0, dst1, dst, 8);
        ST_SH(dst2, dst + 16);
        dst += dst_stride;
        ST_SH2(dst3, dst4, dst, 8);
        ST_SH(dst5, dst + 16);
        dst += dst_stride;
int16_t *dst,
int32_t dst_stride,

    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    const_vec = __msa_ldi_h(128);
    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src2 = LD_SB(src + 24);
        src += src_stride;

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
        dst += dst_stride;
int16_t *dst,
int32_t dst_stride,

    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    const_vec = __msa_ldi_h(128);
    for (loop_cnt = height; loop_cnt--;) {
        LD_SB3(src, 16, src0, src1, src2);
        src3 = LD_SB(src + 40);
        src += src_stride;

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst4 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst5 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);

        ST_SH6(dst0, dst1, dst2, dst3, dst4, dst5, dst, 8);
        dst += dst_stride;
int16_t *dst,
int32_t dst_stride,

    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    const_vec = __msa_ldi_h(128);
    for (loop_cnt = height; loop_cnt--;) {
        LD_SB4(src, 16, src0, src1, src2, src3);
        src4 = LD_SB(src + 56);
        src += src_stride;

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        ST_SH(dst0, dst);

        VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst1 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        ST_SH(dst1, dst + 8);

        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst2 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        ST_SH(dst2, dst + 16);

        VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst3 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
        ST_SH(dst3, dst + 24);

        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst4 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        ST_SH(dst4, dst + 32);

        VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
                   vec0, vec1, vec2, vec3);
        dst5 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
        ST_SH(dst5, dst + 40);

        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst6 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst6, dst6, dst6, dst6);
        ST_SH(dst6, dst + 48);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst7 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst7, dst7, dst7, dst7);
        ST_SH(dst7, dst + 56);
        dst += dst_stride;
int16_t *dst,
int32_t dst_stride,
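/* Vertical 8-tap filter, 4-wide: ILVR_B interleaves vertically
 * adjacent rows so that byte pairs from rows (n, n + 1) sit next to
 * each other, letting the same dot-product scheme as the horizontal
 * path run down columns; the srcNNMM names track which row pairs a
 * vector holds. */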
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src -= (3 * src_stride);

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src += (8 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r,
                   src1211_r, src1110_r, src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);

        dst10 = const_vec;
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
        dst32 = const_vec;
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
        dst54 = const_vec;
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
        dst76 = const_vec;
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);

        ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
int16_t *dst,
int32_t dst_stride,

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);

        dst0_r = const_vec;
        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        dst2_r = const_vec;
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);

        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        dst += (4 * dst_stride);
int16_t *dst,
int32_t dst_stride,

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 dst0_l, dst1_l;
    v8i16 filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);

        dst0_r = const_vec;
        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        dst2_r = const_vec;
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);
        dst0_l = const_vec;
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3,
                     dst0_l, dst0_l, dst0_l, dst0_l);
        dst1_l = const_vec;
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3,
                     dst1_l, dst1_l, dst1_l, dst1_l);

        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        ST8x4_UB(dst0_l, dst1_l, dst + 8, 2 * dst_stride);
        dst += (4 * dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = width >> 4; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);

        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            src_tmp += (4 * src_stride);

            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_r, src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_l, src87_l, src98_l, src109_l);

            dst0_r = const_vec;
            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                         filt0, filt1, filt2, filt3,
                         dst0_r, dst0_r, dst0_r, dst0_r);
            dst1_r = const_vec;
            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                         filt0, filt1, filt2, filt3,
                         dst1_r, dst1_r, dst1_r, dst1_r);
            dst2_r = const_vec;
            DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                         filt0, filt1, filt2, filt3,
                         dst2_r, dst2_r, dst2_r, dst2_r);
            dst3_r = const_vec;
            DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                         filt0, filt1, filt2, filt3,
                         dst3_r, dst3_r, dst3_r, dst3_r);
            dst0_l = const_vec;
            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
                         filt0, filt1, filt2, filt3,
                         dst0_l, dst0_l, dst0_l, dst0_l);
            dst1_l = const_vec;
            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
                         filt0, filt1, filt2, filt3,
                         dst1_l, dst1_l, dst1_l, dst1_l);
            dst2_l = const_vec;
            DPADD_SB4_SH(src32_l, src54_l, src76_l, src98_l,
                         filt0, filt1, filt2, filt3,
                         dst2_l, dst2_l, dst2_l, dst2_l);
            dst3_l = const_vec;
            DPADD_SB4_SH(src43_l, src65_l, src87_l, src109_l,
                         filt0, filt1, filt2, filt3,
                         dst3_l, dst3_l, dst3_l, dst3_l);

            ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
            ST_SH4(dst0_l, dst1_l, dst2_l, dst3_l, dst_tmp + 8, dst_stride);
            dst_tmp += (4 * dst_stride);
int16_t *dst,
int32_t dst_stride,
                        filter, height, 16);

int16_t *dst,
int32_t dst_stride,
                        filter, height, 16);

int16_t *dst,
int32_t dst_stride,
                        filter, height, 32);

int16_t *dst,
int32_t dst_stride,
                        filter, height, 48);

int16_t *dst,
int32_t dst_stride,
                        filter, height, 64);
int16_t *dst,
int32_t dst_stride,
const int8_t *filter_x,
const int8_t *filter_y,
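/* 2-D (hv) 8-tap filter, 4-wide: a horizontal 8-tap pass produces
 * 16-bit intermediates (dst30..dst63 each hold two interleaved rows),
 * then a vertical 8-tap pass runs on those with 32-bit accumulation
 * before the result is packed back to 16 bits. */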
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
    v4i32 dst0_r, dst1_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
    v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
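/* The vertical taps are sign-extended from int8 to int16 by
 * interleaving them with their own sign mask (__msa_clti_s_b), then
 * splatted as 32-bit words, since the second-stage accumulation is
 * done in v4i32. */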
    const_vec = __msa_ldi_h(128);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);
    dst30 = const_vec;
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                 dst30, dst30, dst30, dst30);
    dst41 = const_vec;
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                 dst41, dst41, dst41, dst41);
    dst52 = const_vec;
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                 dst52, dst52, dst52, dst52);
    dst63 = const_vec;
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                 dst63, dst63, dst63, dst63);

    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
               dst10_r, dst21_r, dst32_r);
    dst43_r = __msa_ilvl_h(dst41, dst30);
    dst54_r = __msa_ilvl_h(dst52, dst41);
    dst65_r = __msa_ilvl_h(dst63, dst52);
    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
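/* dst66 duplicates the newest intermediate row so the loop below can
 * interleave it with each incoming row; after every pair of output
 * rows the dstNN_r history slides down by two. */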
    for (loop_cnt = height >> 1; loop_cnt--;) {
        LD_SB2(src, src_stride, src7, src8);
        src += (2 * src_stride);

        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst87 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst87, dst87, dst87, dst87);
        dst76_r = __msa_ilvr_h(dst87, dst66);
        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        dst0_r >>= 6;
        dst1_r >>= 6;

        dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
        ST8x2_UB(dst0_r, dst, (2 * dst_stride));
        dst += (2 * dst_stride);

        dst10_r = dst32_r;
        dst32_r = dst54_r;
        dst54_r = dst76_r;
        dst21_r = dst43_r;
        dst43_r = dst65_r;
        dst65_r = dst87_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
const int8_t *filter_x,
const int8_t *filter_y,

    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    const_vec = __msa_ldi_h(128);
    for (cnt = width >> 3; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        dst1 = const_vec;
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        dst2 = const_vec;
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        dst3 = const_vec;
        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        dst4 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        dst5 = const_vec;
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
        dst6 = const_vec;
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst6, dst6, dst6, dst6);

        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_r, dst32_r, dst54_r, dst21_r);
        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_l, dst32_l, dst54_l, dst21_l);
        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
        for (loop_cnt = height >> 1; loop_cnt--;) {
            LD_SB2(src_tmp, src_stride, src7, src8);
            src_tmp += 2 * src_stride;

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst7 = const_vec;
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst7, dst7, dst7, dst7);

            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_r >>= 6;
            dst0_l >>= 6;

            dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
            ST_SW(dst0_r, dst_tmp);
            dst_tmp += dst_stride;

            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst8 = const_vec;
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst8, dst8, dst8, dst8);

            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
            dst0_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_r >>= 6;
            dst0_l >>= 6;

            dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
            ST_SW(dst0_r, dst_tmp);
            dst_tmp += dst_stride;
int16_t *dst,
int32_t dst_stride,
const int8_t *filter_x,
const int8_t *filter_y,
                        filter_x, filter_y, height, 8);

int16_t *dst,
int32_t dst_stride,
const int8_t *filter_x,
const int8_t *filter_y,
                        filter_x, filter_y, height, 8);
                        filter_x, filter_y, height);

int16_t *dst,
int32_t dst_stride,
const int8_t *filter_x,
const int8_t *filter_y,
                        filter_x, filter_y, height, 16);

int16_t *dst,
int32_t dst_stride,
const int8_t *filter_x,
const int8_t *filter_y,
                        filter_x, filter_y, height, 24);

int16_t *dst,
int32_t dst_stride,
const int8_t *filter_x,
const int8_t *filter_y,
                        filter_x, filter_y, height, 32);

int16_t *dst,
int32_t dst_stride,
const int8_t *filter_x,
const int8_t *filter_y,
                        filter_x, filter_y, height, 48);

int16_t *dst,
int32_t dst_stride,
const int8_t *filter_x,
const int8_t *filter_y,
                        filter_x, filter_y, height, 64);
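/* 4-tap (epel/chroma) horizontal filters follow. They mirror the 8-tap
 * path but need only two packed-tap vectors (filt0, filt1) and a
 * single VSHF_B2/DPADD_SB2 pair per output vector. */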
    v16i8 mask1, vec0, vec1;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };

    filter_vec = LD_SH(filter);

    const_vec = __msa_ldi_h(128);

    LD_SB2(src, src_stride, src0, src1);
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    ST8x2_UB(dst0, dst, 2 * dst_stride);

    v16i8 mask1, vec0, vec1;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };

    filter_vec = LD_SH(filter);

    const_vec = __msa_ldi_h(128);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
    dst1 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
    ST8x4_UB(dst0, dst1, dst, 2 * dst_stride);

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask1, vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };

    filter_vec = LD_SH(filter);

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
        dst += (8 * dst_stride);
    } else if (4 == height) {
    } else if (0 == height % 8) {
    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
    v8i16 filt0, filt1, dst0, dst1, dst2, dst3;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 filter_vec, const_vec;

    filter_vec = LD_SH(filter);

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
        dst_val2 = __msa_copy_u_d((v2i64) dst2, 0);
        dst_val3 = __msa_copy_u_d((v2i64) dst3, 0);

        dst_val_int0 = __msa_copy_u_w((v4i32) dst0, 2);
        dst_val_int1 = __msa_copy_u_w((v4i32) dst1, 2);
        dst_val_int2 = __msa_copy_u_w((v4i32) dst2, 2);
        dst_val_int3 = __msa_copy_u_w((v4i32) dst3, 2);

        SD(dst_val0, dst);
        SW(dst_val_int0, dst + 4);
        dst += dst_stride;
        SD(dst_val1, dst);
        SW(dst_val_int1, dst + 4);
        dst += dst_stride;
        SD(dst_val2, dst);
        SW(dst_val_int2, dst + 4);
        dst += dst_stride;
        SD(dst_val3, dst);
        SW(dst_val_int3, dst + 4);
        dst += dst_stride;
    v8i16 filt0, filt1, dst0, dst1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 filter_vec, const_vec;

    filter_vec = LD_SH(filter);

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);

        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);

        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);

    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    filter_vec = LD_SH(filter);

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);
    if (2 == height || 6 == height) {

    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask2 = {
        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
    };

    filter_vec = LD_SH(filter);

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
        VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        ST8x4_UB(dst4, dst5, dst + 8, 2 * dst_stride);
        dst += (4 * dst_stride);
    v16i8 src4, src5, src6, src7;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;

    filter_vec = LD_SH(filter);

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst6 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        dst7 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);

        ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride);
        ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    int16_t *dst_tmp = dst + 16;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, mask00, mask11;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    filter_vec = LD_SH(filter);

    mask11 = mask0 + 10;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src0, src1, src0, src1, mask00, mask11, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src2, src3, src2, src3, mask00, mask11, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        ST_SH2(dst0, dst1, dst, 8);
        dst += dst_stride;
        ST_SH2(dst2, dst3, dst, 8);
        dst += dst_stride;

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src4, src5, src4, src5, mask00, mask11, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src6, src7, src6, src7, mask00, mask11, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        ST_SH2(dst0, dst1, dst, 8);
        dst += dst_stride;
        ST_SH2(dst2, dst3, dst, 8);
        dst += dst_stride;

        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        ST_SH4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1, mask2, mask3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    filter_vec = LD_SH(filter);

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src2 = LD_SB(src + 24);
        src += src_stride;

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
        dst += dst_stride;

        LD_SB2(src, 16, src0, src1);
        src2 = LD_SB(src + 24);
        src += src_stride;

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
        dst1 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
        dst += dst_stride;
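/* 4-tap vertical filters: as in the 8-tap vertical path, rows are
 * interleaved with ILVR_B/ILVL_B and filtered down columns with
 * DPADD_SB2_SH, carrying a two-row history between iterations. */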
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src2110, src4332;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    dst10 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    ST8x2_UB(dst10, dst, 2 * dst_stride);

    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, src6554;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);
    dst10 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    dst32 = const_vec;
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
    ST8x4_UB(dst10, dst32, dst, 2 * dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src2110, src4332, src6554, src8776;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
        src += (6 * src_stride);

        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
                   src4332, src6554, src8776);
        XORI_B3_128_SB(src4332, src6554, src8776);

        dst10 = const_vec;
        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
        dst32 = const_vec;
        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
        dst54 = const_vec;
        DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);

        LD_SB2(src, src_stride, src9, src2);
        src += (2 * src_stride);
        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
        dst76 = const_vec;
        DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);

        ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
        dst += (8 * dst_stride);

    } else if (4 == height) {
    } else if (0 == (height % 8)) {
    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        src += (2 * src_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

        LD_SB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
        dst2_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);

        dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0);
        dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0);
        dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0);

        dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2);
        dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2);
        dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2);
        dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2);

        SD(dst_val0, dst);
        SW(dst_val_int0, dst + 4);
        dst += dst_stride;
        SD(dst_val1, dst);
        SW(dst_val_int1, dst + 4);
        dst += dst_stride;
        SD(dst_val2, dst);
        SW(dst_val_int2, dst + 4);
        dst += dst_stride;
        SD(dst_val3, dst);
        SW(dst_val_int3, dst + 4);
        dst += dst_stride;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    LD_SB2(src, src_stride, src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    dst0_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    ST_SH2(dst0_r, dst1_r, dst, dst_stride);

    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    LD_SB2(src, src_stride, src3, src4);
    src += (2 * src_stride);

    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    dst0_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
    dst += (2 * dst_stride);

    LD_SB2(src, src_stride, src1, src2);
    src += (2 * src_stride);

    ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
    dst0_r = const_vec;
    DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);

    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
    dst += (2 * dst_stride);

    LD_SB2(src, src_stride, src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    dst0_r = const_vec;
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    dst1_r = const_vec;
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        src += (2 * src_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);

        LD_SB2(src, src_stride, src5, src2);
        src += (2 * src_stride);

        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);

        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);

    } else if (6 == height) {
static void hevc_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src -= (1 * src_stride);
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        src += (2 * src_stride);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);

        LD_SB2(src, src_stride, src5, src2);
        src += (2 * src_stride);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

        dst2_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src4332, src2110, filt0, filt1, dst1_l, dst1_l);

        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        ST8x4_UB(dst0_l, dst1_l, dst + 8, (2 * dst_stride));
        dst += (4 * dst_stride);
    }
}
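/* hevc_vt_4t_16w_msa: right and left byte interleaves each feed their own
 * accumulator, yielding two 8-column result vectors per row. */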
static void hevc_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src -= src_stride;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        src += (2 * src_stride);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);

        ST_SH2(dst0_r, dst0_l, dst, 8);
        dst += dst_stride;
        ST_SH2(dst1_r, dst1_l, dst, 8);
        dst += dst_stride;

        LD_SB2(src, src_stride, src5, src2);
        src += (2 * src_stride);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);

        ST_SH2(dst0_r, dst0_l, dst, 8);
        dst += dst_stride;
        ST_SH2(dst1_r, dst1_l, dst, 8);
        dst += dst_stride;
    }
}
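/* hevc_vt_4t_24w_msa: a full 16-wide pipeline plus an 8-wide tail loaded
 * from src + 16. */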
static void hevc_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src -= src_stride;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* Columns 0-15. */
    LD_SB3(src, src_stride, src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    /* Columns 16-23. */
    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);

        ST_SH2(dst0_r, dst0_l, dst, 8);
        ST_SH(dst2_r, dst + 16);
        dst += dst_stride;
        ST_SH2(dst1_r, dst1_l, dst, 8);
        ST_SH(dst3_r, dst + 16);
        dst += dst_stride;

        LD_SB2(src, src_stride, src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        LD_SB2(src + 16, src_stride, src11, src8);
        src += (2 * src_stride);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);

        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
        dst3_r = const_vec;
        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);

        ST_SH2(dst0_r, dst0_l, dst, 8);
        ST_SH(dst2_r, dst + 16);
        dst += dst_stride;
        ST_SH2(dst1_r, dst1_l, dst, 8);
        ST_SH(dst3_r, dst + 16);
        dst += dst_stride;
    }
}
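/* hevc_vt_4t_32w_msa: two 16-wide pipelines side by side, the second one
 * working on the bytes at offset 16. */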
static void hevc_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src -= src_stride;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
        dst2_l = const_vec;
        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
        dst3_r = const_vec;
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
        dst3_l = const_vec;
        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);

        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
        dst += dst_stride;
        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
        dst += dst_stride;

        LD_SB2(src, src_stride, src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        LD_SB2(src + 16, src_stride, src11, src8);
        src += (2 * src_stride);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
        ILVL_B2_SB(src11, src10, src8, src11, src76_l, src87_l);

        dst0_r = const_vec;
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        dst0_l = const_vec;
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        dst1_r = const_vec;
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        dst1_l = const_vec;
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        dst2_r = const_vec;
        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
        dst2_l = const_vec;
        DPADD_SB2_SH(src98_l, src76_l, filt0, filt1, dst2_l, dst2_l);
        dst3_r = const_vec;
        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
        dst3_l = const_vec;
        DPADD_SB2_SH(src109_l, src87_l, filt0, filt1, dst3_l, dst3_l);

        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
        dst += dst_stride;
        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
        dst += dst_stride;
    }
}
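/* The hv_4t kernels below run two passes: a horizontal 4-tap pass (VSHF to
 * gather taps, DPADD_SB2_SH to accumulate, seeded with const_vec = 128 << 6)
 * and a vertical 4-tap pass over the interleaved intermediate rows
 * (HEVC_FILT_4TAP followed by >> 6). hevc_hv_4t_4x2_msa is the smallest
 * case: a 4x2 output block. */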
static void hevc_hv_4t_4x2_msa(uint8_t *src, int32_t src_stride, int16_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst1_r;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* The y taps are sign-extended to halfwords before being splatted as
     * word pairs for the vertical pass. */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);

    LD_SB2(src, src_stride, src3, src4);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
    dst32_r = __msa_ilvr_h(dst3, dst2);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_r >>= 6;

    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
    dst43_r = __msa_ilvr_h(dst4, dst3);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_r >>= 6;

    dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
    ST8x2_UB(dst0_r, dst, 2 * dst_stride);
}
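/* hevc_hv_4t_4x4_msa: as above, with four filtered rows before the pack. */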
static void hevc_hv_4t_4x4_msa(uint8_t *src, int32_t src_stride, int16_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);

    LD_SB4(src, src_stride, src3, src4, src5, src6);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
    dst32_r = __msa_ilvr_h(dst3, dst2);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_r >>= 6;

    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
    dst43_r = __msa_ilvr_h(dst4, dst3);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_r >>= 6;

    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    dst5 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
    dst10_r = __msa_ilvr_h(dst5, dst4);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
    dst2_r >>= 6;

    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
    dst21_r = __msa_ilvr_h(dst2, dst5);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
    dst3_r >>= 6;

    PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    ST8x4_UB(dst0_r, dst1_r, dst, 2 * dst_stride);
}
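/* hevc_hv_4t_4multx8mult_msa: eight rows per loop iteration; the
 * intermediate-row history (dst10_r/dst21_r and dst2) carries across
 * iterations. */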
static void hevc_hv_4t_4multx8mult_msa(uint8_t *src, int32_t src_stride,
                                       int16_t *dst, int32_t dst_stride,
                                       const int8_t *filter_x,
                                       const int8_t *filter_y, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        dst32_r = __msa_ilvr_h(dst3, dst2);
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_r >>= 6;

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
        dst43_r = __msa_ilvr_h(dst4, dst3);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_r >>= 6;

        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
        dst54_r = __msa_ilvr_h(dst5, dst4);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_r >>= 6;

        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst6 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
        dst65_r = __msa_ilvr_h(dst6, dst5);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_r >>= 6;

        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        dst7 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
        dst76_r = __msa_ilvr_h(dst7, dst6);
        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst4_r >>= 6;

        VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
        dst8 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
        dst87_r = __msa_ilvr_h(dst8, dst7);
        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst5_r >>= 6;

        VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
        dst9 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
        dst10_r = __msa_ilvr_h(dst9, dst8);
        dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
        dst6_r >>= 6;

        VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        dst21_r = __msa_ilvr_h(dst2, dst9);
        dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
        dst7_r >>= 6;

        PCKEV_H4_SW(dst1_r, dst0_r, dst3_r, dst2_r,
                    dst5_r, dst4_r, dst7_r, dst6_r,
                    dst0_r, dst1_r, dst2_r, dst3_r);
        ST8x8_UB(dst0_r, dst1_r, dst2_r, dst3_r, dst, 2 * dst_stride);
        dst += (8 * dst_stride);
    }
}
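/* hevc_hv_4t_4w_msa: select the 4-wide specialization by height. */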
static void hevc_hv_4t_4w_msa(uint8_t *src, int32_t src_stride, int16_t *dst,
                              int32_t dst_stride, const int8_t *filter_x,
                              const int8_t *filter_y, int32_t height)
{
    if (2 == height) {
        hevc_hv_4t_4x2_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_4t_4x4_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y);
    } else if (0 == (height % 8)) {
        hevc_hv_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height);
    }
}
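/* hevc_hv_4t_6w_msa: computes 8 columns per row, then stores each row as
 * one doubleword (columns 0-3) plus one word (columns 4-5). */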
static void hevc_hv_4t_6w_msa(uint8_t *src, int32_t src_stride, int16_t *dst,
                              int32_t dst_stride, const int8_t *filter_x,
                              const int8_t *filter_y, int32_t height)
{
    uint32_t loop_cnt;
    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst0_r >>= 6;
        dst0_l >>= 6;

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst1_r >>= 6;
        dst1_l >>= 6;

        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
        ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
        dst2_r >>= 6;
        dst2_l >>= 6;

        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
        ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
        dst3_r >>= 6;
        dst3_l >>= 6;

        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
                    dst2_l, dst2_r, dst3_l, dst3_r,
                    dst0_r, dst1_r, dst2_r, dst3_r);

        dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0);
        dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0);
        dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0);

        dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2);
        dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2);
        dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2);
        dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2);

        SD(dst_val0, dst);
        SW(dst_val_int0, dst + 4);
        dst += dst_stride;
        SD(dst_val1, dst);
        SW(dst_val_int1, dst + 4);
        dst += dst_stride;
        SD(dst_val2, dst);
        SW(dst_val_int2, dst + 4);
        dst += dst_stride;
        SD(dst_val3, dst);
        SW(dst_val_int3, dst + 4);
        dst += dst_stride;
    }
}
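/* hevc_hv_4t_8x2_msa: 8-wide two-row case; right and left halves of each
 * intermediate row are filtered separately and packed before the store. */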
static void hevc_hv_4t_8x2_msa(uint8_t *src, int32_t src_stride, int16_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

    LD_SB2(src, src_stride, src3, src4);
    src += (2 * src_stride);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst0_r >>= 6;
    dst0_l >>= 6;

    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst1_r >>= 6;
    dst1_l >>= 6;

    PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
}
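/* hevc_hv_4t_8x6_msa: fully unrolled six-row version of the 8-wide
 * pipeline. */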
static void hevc_hv_4t_8x6_msa(uint8_t *src, int32_t src_stride, int16_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

    LD_SB2(src, src_stride, src3, src4);
    src += (2 * src_stride);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst0_r >>= 6;
    dst0_l >>= 6;

    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst1_r >>= 6;
    dst1_l >>= 6;

    LD_SB2(src, src_stride, src5, src6);
    src += (2 * src_stride);

    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    dst5 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst2_r >>= 6;
    dst2_l >>= 6;

    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
    dst6 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst3_r >>= 6;
    dst3_l >>= 6;

    LD_SB2(src, src_stride, src7, src8);

    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    dst7 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst4_r >>= 6;
    dst4_l >>= 6;

    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
    dst8 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
    dst5_r >>= 6;
    dst5_l >>= 6;

    PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
                dst2_l, dst2_r, dst3_l, dst3_r,
                dst0_r, dst1_r, dst2_r, dst3_r);
    PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r);

    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
    dst += (2 * dst_stride);
    ST_SW2(dst2_r, dst3_r, dst, dst_stride);
    dst += (2 * dst_stride);
    ST_SW2(dst4_r, dst5_r, dst, dst_stride);
}
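/* hevc_hv_4t_8multx4mult_msa: generic core for widths that are multiples
 * of 8; the outer loop steps 8 columns at a time, the inner loop emits
 * 4 rows per iteration. */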
static void hevc_hv_4t_8multx4mult_msa(uint8_t *src, int32_t src_stride,
                                       int16_t *dst, int32_t dst_stride,
                                       const int8_t *filter_x,
                                       const int8_t *filter_y,
                                       int32_t height, int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    int16_t *dst_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (cnt = width >> 3; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        src_tmp += (3 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        dst1 = const_vec;
        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            dst3 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst0_r >>= 6;
            dst0_l >>= 6;

            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
            dst4 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst1_r >>= 6;
            dst1_l >>= 6;

            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
            dst5 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
            ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
            dst2_r >>= 6;
            dst2_l >>= 6;

            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
            dst2 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
            ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
            dst3_r >>= 6;
            dst3_l >>= 6;

            PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r,
                        dst0_r, dst1_r, dst2_r, dst3_r);

            ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);
            ST_SW2(dst2_r, dst3_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);
        }

        src += 8;
        dst += 8;
    }
}
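/* hevc_hv_4t_8w_msa: select the 8-wide specialization by height. */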
static void hevc_hv_4t_8w_msa(uint8_t *src, int32_t src_stride, int16_t *dst,
                              int32_t dst_stride, const int8_t *filter_x,
                              const int8_t *filter_y, int32_t height)
{
    if (2 == height) {
        hevc_hv_4t_8x2_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y, height);
    } else if (6 == height) {
        hevc_hv_4t_8x6_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y, height);
    } else if (0 == (height % 4)) {
        hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 8);
    }
}
/* 12-wide: an 8-column pass plus a 4-column pass on the remaining columns. */
static void hevc_hv_4t_12w_msa(uint8_t *src, int32_t src_stride, int16_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 8);
    hevc_hv_4t_4multx8mult_msa(src + 8, src_stride, dst + 8, dst_stride,
                               filter_x, filter_y, height);
}
static void hevc_hv_4t_16w_msa(uint8_t *src, int32_t src_stride, int16_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 16);
}

static void hevc_hv_4t_24w_msa(uint8_t *src, int32_t src_stride, int16_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 24);
}

static void hevc_hv_4t_32w_msa(uint8_t *src, int32_t src_stride, int16_t *dst,
                               int32_t dst_stride, const int8_t *filter_x,
                               const int8_t *filter_y, int32_t height)
{
    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 32);
}
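/* Glue macros: each expansion defines one ff_hevc_put_hevc_* entry point
 * with the standard HEVC DSP prototype and forwards to the matching static
 * kernel above, with MAX_PB_SIZE as the stride of the 16-bit intermediate
 * buffer. */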
#define MC_COPY(WIDTH)                                                    \
void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_msa(int16_t *dst,             \
                                                uint8_t *src,             \
                                                ptrdiff_t src_stride,     \
                                                int height,               \
                                                intptr_t mx, intptr_t my, \
                                                int width)                \
{                                                                         \
    hevc_copy_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, height);  \
}
#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst,          \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];         \
                                                                          \
    hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,           \
                                          MAX_PB_SIZE, filter, height);   \
}
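/* For example, MC(qpel, h, 4, 8, hz, mx) defines
 * ff_hevc_put_hevc_qpel_h4_8_msa(), which picks the 8-tap filter
 * ff_hevc_qpel_filters[mx - 1] and forwards to hevc_hz_8t_4w_msa();
 * FILT_DIR selects whether mx or my indexes the filter table. */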
MC(qpel, h, 4, 8, hz, mx);
MC(qpel, h, 8, 8, hz, mx);
MC(qpel, h, 12, 8, hz, mx);
MC(qpel, h, 16, 8, hz, mx);
MC(qpel, h, 24, 8, hz, mx);
MC(qpel, h, 32, 8, hz, mx);
MC(qpel, h, 48, 8, hz, mx);
MC(qpel, h, 64, 8, hz, mx);

MC(qpel, v, 4, 8, vt, my);
MC(qpel, v, 8, 8, vt, my);
MC(qpel, v, 12, 8, vt, my);
MC(qpel, v, 16, 8, vt, my);
MC(qpel, v, 24, 8, vt, my);
MC(qpel, v, 32, 8, vt, my);
MC(qpel, v, 48, 8, vt, my);
MC(qpel, v, 64, 8, vt, my);

MC(epel, h, 4, 4, hz, mx);
MC(epel, h, 6, 4, hz, mx);
MC(epel, h, 8, 4, hz, mx);
MC(epel, h, 12, 4, hz, mx);
MC(epel, h, 16, 4, hz, mx);
MC(epel, h, 24, 4, hz, mx);
MC(epel, h, 32, 4, hz, mx);

MC(epel, v, 4, 4, vt, my);
MC(epel, v, 6, 4, vt, my);
MC(epel, v, 8, 4, vt, my);
MC(epel, v, 12, 4, vt, my);
MC(epel, v, 16, 4, vt, my);
MC(epel, v, 24, 4, vt, my);
MC(epel, v, 32, 4, vt, my);
#define MC_HV(PEL, DIR, WIDTH, TAP, DIR1)                                     \
void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst,              \
                                                   uint8_t *src,              \
                                                   ptrdiff_t src_stride,      \
                                                   int height,                \
                                                   intptr_t mx,               \
                                                   intptr_t my,               \
                                                   int width)                 \
{                                                                             \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                 \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                 \
                                                                              \
    hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE,  \
                                          filter_x, filter_y, height);        \
}
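/* For example, MC_HV(epel, hv, 8, 4, hv) defines
 * ff_hevc_put_hevc_epel_hv8_8_msa(), which looks up both the horizontal
 * (mx) and vertical (my) 4-tap filters and forwards to
 * hevc_hv_4t_8w_msa(). */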
MC_HV(qpel, hv, 4, 8, hv);
MC_HV(qpel, hv, 8, 8, hv);
MC_HV(qpel, hv, 12, 8, hv);
MC_HV(qpel, hv, 16, 8, hv);
MC_HV(qpel, hv, 24, 8, hv);
MC_HV(qpel, hv, 32, 8, hv);
MC_HV(qpel, hv, 48, 8, hv);
MC_HV(qpel, hv, 64, 8, hv);

MC_HV(epel, hv, 4, 4, hv);
MC_HV(epel, hv, 6, 4, hv);
MC_HV(epel, hv, 8, 4, hv);
MC_HV(epel, hv, 12, 4, hv);
MC_HV(epel, hv, 16, 4, hv);
MC_HV(epel, hv, 24, 4, hv);
MC_HV(epel, hv, 32, 4, hv);