    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
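
/* The rows above are xvshuf_b byte-selection masks: for each output pixel
 * they pair the outer taps (n, n + 5), the middle taps (n + 1, n + 4) and
 * the centre taps (n + 2, n + 3) of the H.264 6-tap luma filter, so the
 * filter can be evaluated with byte dot products against the packed
 * coefficients used below.  As an illustrative scalar sketch only (it is not
 * used by these kernels, and uint8_t is assumed to come from the existing
 * includes), one half-pel output value is: */
static inline uint8_t h264_luma_lowpass_6tap_ref(const uint8_t *p)
{
    /* taps 1, -5, 20, 20, -5, 1, rounded and clipped to 8 bits */
    int sum = p[0] - 5 * p[1] + 20 * p[2] + 20 * p[3] - 5 * p[4] + p[5];

    sum = (sum + 16) >> 5;
    return sum < 0 ? 0 : (sum > 255 ? 255 : (uint8_t) sum);
}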
#define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2) \
( { \
    __m256i out0_m, tmp0_m; \
    tmp0_m = __lasx_xvshuf_b(in1, in0, mask0); \
    out0_m = __lasx_xvhaddw_h_b(tmp0_m, tmp0_m); \
    tmp0_m = __lasx_xvshuf_b(in1, in0, mask1); \
    out0_m = __lasx_xvdp2add_h_b(out0_m, minus5b, tmp0_m); \
    tmp0_m = __lasx_xvshuf_b(in1, in0, mask2); \
    out0_m = __lasx_xvdp2add_h_b(out0_m, plus20b, tmp0_m); \
    out0_m; \
} )
#define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
( { \
    __m256i out0_m; \
    out0_m = __lasx_xvdp2_h_b(in0, coeff0); \
    DUP2_ARG3(__lasx_xvdp2add_h_b, out0_m, in1, coeff1, out0_m, \
              in2, coeff2, out0_m, out0_m); \
    out0_m; \
} )
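
/* The functions below evaluate the 6-tap filter with byte dot products;
 * filt_const0/1/2 pack the coefficients as byte pairs (low byte first):
 * 0xfb01 = {1, -5}, 0x1414 = {20, 20}, 0x01fb = {-5, 1}.  Each loop
 * iteration averages a horizontally filtered and a vertically filtered
 * half-pel result with rounding ((hz + vt + 1) >> 1) to form the
 * quarter-pel output. */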
                                        uint8_t *dst, ptrdiff_t stride)
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;
    ptrdiff_t stride_4x = stride << 2;
    __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
    __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
    __m256i src_vt7, src_vt8;
    __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h;
    __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2;
    __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
    __m256i vt_out3, out0, out1, out2, out3;
    __m256i minus5b = __lasx_xvldi(0xFB);
    __m256i plus20b = __lasx_xvldi(20);

    filt0 = __lasx_xvreplgr2vr_h(filt_const0);
    filt1 = __lasx_xvreplgr2vr_h(filt_const1);
    filt2 = __lasx_xvreplgr2vr_h(filt_const2);

    src_vt0 = __lasx_xvld(src_y, 0);
    DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x,
              src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4);

    src_vt0 = __lasx_xvxori_b(src_vt0, 128);
    DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128,
              src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4);
    for (loop_cnt = 4; loop_cnt--;) {
        src_hz0 = __lasx_xvld(src_x, 0);
        src_hz3 = __lasx_xvldx(src_x, stride_3x);
        src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94);
        src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94);
        src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94);
        src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94);
        DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128,
                  src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3);

        hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5);
        hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5);

                  src_y, stride_3x, src_y, stride_4x,
                  src_vt5, src_vt6, src_vt7, src_vt8);

        DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128,
                  src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8);

        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5,
                  0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02,
                  src_vt0, src_vt1, src_vt2, src_vt3);
        src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02);
        DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1,
                  src_vt3, src_vt2, src_vt87_h, src_vt3,
                  src_hz0, src_hz1, src_hz2, src_hz3);
        DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1,
                  src_vt3, src_vt2, src_vt87_h, src_vt3,
                  src_vt0, src_vt1, src_vt2, src_vt3);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1, src_hz1,
                  0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3, 0x02,
                  src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1, src_hz1,
                  0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3, 0x13,
                  src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h);
        vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h, filt0,
                                 filt1, filt2);
        vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h, filt0,
                                 filt1, filt2);
        vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h, filt0,
                                 filt1, filt2);
        vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h, filt0,
                                 filt1, filt2);
        vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5);
        vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5);

        DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
                  out0, out2);
        DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
                  out1, out3);
        tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1);
        tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1);
        DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        out0 = __lasx_xvld(dst, 0);
        out3 = __lasx_xvldx(dst, stride_3x);
        out0 = __lasx_xvpermi_q(out0, out2, 0x02);
        out1 = __lasx_xvpermi_q(out1, out3, 0x02);
        out2 = __lasx_xvilvl_d(out1, out0);
        out3 = __lasx_xvilvh_d(out1, out0);
        out0 = __lasx_xvpermi_q(out2, out3, 0x02);
        out1 = __lasx_xvpermi_q(out2, out3, 0x13);
        tmp0 = __lasx_xvavgr_bu(out0, tmp0);
        tmp1 = __lasx_xvavgr_bu(out1, tmp1);

        __lasx_xvstelm_d(tmp0, dst, 0, 0);
        __lasx_xvstelm_d(tmp0, dst + stride, 0, 1);
        __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0);
        __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1);

        __lasx_xvstelm_d(tmp0, dst, 8, 2);
        __lasx_xvstelm_d(tmp0, dst + stride, 8, 3);
        __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2);
        __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3);
                                        uint8_t *dst, ptrdiff_t stride)
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    ptrdiff_t stride_2x = stride << 1;
    ptrdiff_t stride_3x = stride_2x + stride;
    ptrdiff_t stride_4x = stride << 2;
    __m256i src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
    __m256i src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
    __m256i src_vt7, src_vt8;
    __m256i src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h, src_vt54_h;
    __m256i src_vt65_h, src_vt76_h, src_vt87_h, filt0, filt1, filt2;
    __m256i hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
    __m256i vt_out3, out0, out1, out2, out3;
    __m256i minus5b = __lasx_xvldi(0xFB);
    __m256i plus20b = __lasx_xvldi(20);

    filt0 = __lasx_xvreplgr2vr_h(filt_const0);
    filt1 = __lasx_xvreplgr2vr_h(filt_const1);
    filt2 = __lasx_xvreplgr2vr_h(filt_const2);

    src_vt0 = __lasx_xvld(src_y, 0);
    DUP4_ARG2(__lasx_xvldx, src_y, stride, src_y, stride_2x, src_y, stride_3x,
              src_y, stride_4x, src_vt1, src_vt2, src_vt3, src_vt4);

    src_vt0 = __lasx_xvxori_b(src_vt0, 128);
    DUP4_ARG2(__lasx_xvxori_b, src_vt1, 128, src_vt2, 128, src_vt3, 128,
              src_vt4, 128, src_vt1, src_vt2, src_vt3, src_vt4);
    for (loop_cnt = 4; loop_cnt--;) {
        src_hz0 = __lasx_xvld(src_x, 0);
        src_hz3 = __lasx_xvldx(src_x, stride_3x);
        src_hz0 = __lasx_xvpermi_d(src_hz0, 0x94);
        src_hz1 = __lasx_xvpermi_d(src_hz1, 0x94);
        src_hz2 = __lasx_xvpermi_d(src_hz2, 0x94);
        src_hz3 = __lasx_xvpermi_d(src_hz3, 0x94);
        DUP4_ARG2(__lasx_xvxori_b, src_hz0, 128, src_hz1, 128, src_hz2, 128,
                  src_hz3, 128, src_hz0, src_hz1, src_hz2, src_hz3);

        hz_out0 = __lasx_xvssrarni_b_h(hz_out1, hz_out0, 5);
        hz_out2 = __lasx_xvssrarni_b_h(hz_out3, hz_out2, 5);

                  src_y, stride_3x, src_y, stride_4x,
                  src_vt5, src_vt6, src_vt7, src_vt8);

        DUP4_ARG2(__lasx_xvxori_b, src_vt5, 128, src_vt6, 128, src_vt7, 128,
                  src_vt8, 128, src_vt5, src_vt6, src_vt7, src_vt8);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_vt4, 0x02, src_vt1, src_vt5,
                  0x02, src_vt2, src_vt6, 0x02, src_vt3, src_vt7, 0x02,
                  src_vt0, src_vt1, src_vt2, src_vt3);
        src_vt87_h = __lasx_xvpermi_q(src_vt4, src_vt8, 0x02);
        DUP4_ARG2(__lasx_xvilvh_b, src_vt1, src_vt0, src_vt2, src_vt1,
                  src_vt3, src_vt2, src_vt87_h, src_vt3,
                  src_hz0, src_hz1, src_hz2, src_hz3);
        DUP4_ARG2(__lasx_xvilvl_b, src_vt1, src_vt0, src_vt2, src_vt1,
                  src_vt3, src_vt2, src_vt87_h, src_vt3,
                  src_vt0, src_vt1, src_vt2, src_vt3);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x02, src_vt1,
                  src_hz1, 0x02, src_vt2, src_hz2, 0x02, src_vt3, src_hz3,
                  0x02, src_vt10_h, src_vt21_h, src_vt32_h, src_vt43_h);
        DUP4_ARG3(__lasx_xvpermi_q, src_vt0, src_hz0, 0x13, src_vt1,
                  src_hz1, 0x13, src_vt2, src_hz2, 0x13, src_vt3, src_hz3,
                  0x13, src_vt54_h, src_vt65_h, src_vt76_h, src_vt87_h);
        vt_out0 = AVC_DOT_SH3_SH(src_vt10_h, src_vt32_h, src_vt54_h,
                                 filt0, filt1, filt2);
        vt_out1 = AVC_DOT_SH3_SH(src_vt21_h, src_vt43_h, src_vt65_h,
                                 filt0, filt1, filt2);
        vt_out2 = AVC_DOT_SH3_SH(src_vt32_h, src_vt54_h, src_vt76_h,
                                 filt0, filt1, filt2);
        vt_out3 = AVC_DOT_SH3_SH(src_vt43_h, src_vt65_h, src_vt87_h,
                                 filt0, filt1, filt2);
        vt_out0 = __lasx_xvssrarni_b_h(vt_out1, vt_out0, 5);
        vt_out2 = __lasx_xvssrarni_b_h(vt_out3, vt_out2, 5);

        DUP2_ARG2(__lasx_xvaddwl_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
                  out0, out2);
        DUP2_ARG2(__lasx_xvaddwh_h_b, hz_out0, vt_out0, hz_out2, vt_out2,
                  out1, out3);
        tmp0 = __lasx_xvssrarni_b_h(out1, out0, 1);
        tmp1 = __lasx_xvssrarni_b_h(out3, out2, 1);

        DUP2_ARG2(__lasx_xvxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
        __lasx_xvstelm_d(tmp0, dst, 0, 0);
        __lasx_xvstelm_d(tmp0, dst + stride, 0, 1);
        __lasx_xvstelm_d(tmp1, dst + stride_2x, 0, 0);
        __lasx_xvstelm_d(tmp1, dst + stride_3x, 0, 1);

        __lasx_xvstelm_d(tmp0, dst, 8, 2);
        __lasx_xvstelm_d(tmp0, dst + stride, 8, 3);
        __lasx_xvstelm_d(tmp1, dst + stride_2x, 8, 2);
        __lasx_xvstelm_d(tmp1, dst + stride_3x, 8, 3);
    ptrdiff_t stride_2, stride_3, stride_4;

        "slli.d %[stride_2], %[stride], 1 \n\t"
        "add.d %[stride_3], %[stride_2], %[stride] \n\t"
        "slli.d %[stride_4], %[stride_2], 1 \n\t"
        "ld.d %[tmp0], %[src], 0x0 \n\t"
        "ldx.d %[tmp1], %[src], %[stride] \n\t"
        "ldx.d %[tmp2], %[src], %[stride_2] \n\t"
        "ldx.d %[tmp3], %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"
        "ld.d %[tmp4], %[src], 0x0 \n\t"
        "ldx.d %[tmp5], %[src], %[stride] \n\t"
        "ldx.d %[tmp6], %[src], %[stride_2] \n\t"
        "ldx.d %[tmp7], %[src], %[stride_3] \n\t"

        "st.d %[tmp0], %[dst], 0x0 \n\t"
        "stx.d %[tmp1], %[dst], %[stride] \n\t"
        "stx.d %[tmp2], %[dst], %[stride_2] \n\t"
        "stx.d %[tmp3], %[dst], %[stride_3] \n\t"
        "add.d %[dst], %[dst], %[stride_4] \n\t"
        "st.d %[tmp4], %[dst], 0x0 \n\t"
        "stx.d %[tmp5], %[dst], %[stride] \n\t"
        "stx.d %[tmp6], %[dst], %[stride_2] \n\t"
        "stx.d %[tmp7], %[dst], %[stride_3] \n\t"
        : [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
          [tmp2]"=&r"(tmp[2]), [tmp3]"=&r"(tmp[3]),
          [tmp4]"=&r"(tmp[4]), [tmp5]"=&r"(tmp[5]),
          [tmp6]"=&r"(tmp[6]), [tmp7]"=&r"(tmp[7]),
          [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
          [stride_4]"=&r"(stride_4),
          [dst]"+&r"(dst), [src]"+&r"(src)
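
/* The following blocks are plain copy/average helpers: 8- or 16-byte rows
 * are loaded with ld.d/vld (vldx for strided rows), optionally averaged
 * against a second block with vavgr.bu (rounding unsigned byte average),
 * and written back with st.d/vst/vstelm.d. */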
    ptrdiff_t stride_2, stride_3, stride_4;

        "slli.d %[stride_2], %[stride], 1 \n\t"
        "add.d %[stride_3], %[stride_2], %[stride] \n\t"
        "slli.d %[stride_4], %[stride_2], 1 \n\t"
        "vld $vr0, %[src], 0 \n\t"
        "vldx $vr1, %[src], %[stride] \n\t"
        "vldx $vr2, %[src], %[stride_2] \n\t"
        "vldx $vr3, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"
        "vld $vr4, %[src], 0 \n\t"
        "vldx $vr5, %[src], %[stride] \n\t"
        "vldx $vr6, %[src], %[stride_2] \n\t"
        "vldx $vr7, %[src], %[stride_3] \n\t"

        "vld $vr8, %[tmp], 0 \n\t"
        "vldx $vr9, %[tmp], %[stride] \n\t"
        "vldx $vr10, %[tmp], %[stride_2] \n\t"
        "vldx $vr11, %[tmp], %[stride_3] \n\t"
        "add.d %[tmp], %[tmp], %[stride_4] \n\t"
        "vld $vr12, %[tmp], 0 \n\t"
        "vldx $vr13, %[tmp], %[stride] \n\t"
        "vldx $vr14, %[tmp], %[stride_2] \n\t"
        "vldx $vr15, %[tmp], %[stride_3] \n\t"

        "vavgr.bu $vr0, $vr8, $vr0 \n\t"
        "vavgr.bu $vr1, $vr9, $vr1 \n\t"
        "vavgr.bu $vr2, $vr10, $vr2 \n\t"
        "vavgr.bu $vr3, $vr11, $vr3 \n\t"
        "vavgr.bu $vr4, $vr12, $vr4 \n\t"
        "vavgr.bu $vr5, $vr13, $vr5 \n\t"
        "vavgr.bu $vr6, $vr14, $vr6 \n\t"
        "vavgr.bu $vr7, $vr15, $vr7 \n\t"

        "vstelm.d $vr0, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[stride] \n\t"
        "vstelm.d $vr1, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[stride] \n\t"
        "vstelm.d $vr2, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[stride] \n\t"
        "vstelm.d $vr3, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[stride] \n\t"
        "vstelm.d $vr4, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[stride] \n\t"
        "vstelm.d $vr5, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[stride] \n\t"
        "vstelm.d $vr6, %[dst], 0, 0 \n\t"
        "add.d %[dst], %[dst], %[stride] \n\t"
        "vstelm.d $vr7, %[dst], 0, 0 \n\t"

          [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
          [stride_4]"=&r"(stride_4)
    ptrdiff_t stride_2, stride_3, stride_4;

        "slli.d %[stride_2], %[stride], 1 \n\t"
        "add.d %[stride_3], %[stride_2], %[stride] \n\t"
        "slli.d %[stride_4], %[stride_2], 1 \n\t"
        "vld $vr0, %[src], 0 \n\t"
        "vldx $vr1, %[src], %[stride] \n\t"
        "vldx $vr2, %[src], %[stride_2] \n\t"
        "vldx $vr3, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"
        "vld $vr4, %[src], 0 \n\t"
        "vldx $vr5, %[src], %[stride] \n\t"
        "vldx $vr6, %[src], %[stride_2] \n\t"
        "vldx $vr7, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"

        "vst $vr0, %[dst], 0 \n\t"
        "vstx $vr1, %[dst], %[stride] \n\t"
        "vstx $vr2, %[dst], %[stride_2] \n\t"
        "vstx $vr3, %[dst], %[stride_3] \n\t"
        "add.d %[dst], %[dst], %[stride_4] \n\t"
        "vst $vr4, %[dst], 0 \n\t"
        "vstx $vr5, %[dst], %[stride] \n\t"
        "vstx $vr6, %[dst], %[stride_2] \n\t"
        "vstx $vr7, %[dst], %[stride_3] \n\t"
        "add.d %[dst], %[dst], %[stride_4] \n\t"

        "vld $vr0, %[src], 0 \n\t"
        "vldx $vr1, %[src], %[stride] \n\t"
        "vldx $vr2, %[src], %[stride_2] \n\t"
        "vldx $vr3, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"
        "vld $vr4, %[src], 0 \n\t"
        "vldx $vr5, %[src], %[stride] \n\t"
        "vldx $vr6, %[src], %[stride_2] \n\t"
        "vldx $vr7, %[src], %[stride_3] \n\t"

        "vst $vr0, %[dst], 0 \n\t"
        "vstx $vr1, %[dst], %[stride] \n\t"
        "vstx $vr2, %[dst], %[stride_2] \n\t"
        "vstx $vr3, %[dst], %[stride_3] \n\t"
        "add.d %[dst], %[dst], %[stride_4] \n\t"
        "vst $vr4, %[dst], 0 \n\t"
        "vstx $vr5, %[dst], %[stride] \n\t"
        "vstx $vr6, %[dst], %[stride_2] \n\t"
        "vstx $vr7, %[dst], %[stride_3] \n\t"
        : [dst]"+&r"(dst), [src]"+&r"(src),
          [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
          [stride_4]"=&r"(stride_4)
    ptrdiff_t stride_2, stride_3, stride_4;

        "slli.d %[stride_2], %[stride], 1 \n\t"
        "add.d %[stride_3], %[stride_2], %[stride] \n\t"
        "slli.d %[stride_4], %[stride_2], 1 \n\t"
        "vld $vr0, %[src], 0 \n\t"
        "vldx $vr1, %[src], %[stride] \n\t"
        "vldx $vr2, %[src], %[stride_2] \n\t"
        "vldx $vr3, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"
        "vld $vr4, %[src], 0 \n\t"
        "vldx $vr5, %[src], %[stride] \n\t"
        "vldx $vr6, %[src], %[stride_2] \n\t"
        "vldx $vr7, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"

        "vld $vr8, %[tmp], 0 \n\t"
        "vldx $vr9, %[tmp], %[stride] \n\t"
        "vldx $vr10, %[tmp], %[stride_2] \n\t"
        "vldx $vr11, %[tmp], %[stride_3] \n\t"
        "add.d %[tmp], %[tmp], %[stride_4] \n\t"
        "vld $vr12, %[tmp], 0 \n\t"
        "vldx $vr13, %[tmp], %[stride] \n\t"
        "vldx $vr14, %[tmp], %[stride_2] \n\t"
        "vldx $vr15, %[tmp], %[stride_3] \n\t"
        "add.d %[tmp], %[tmp], %[stride_4] \n\t"

        "vavgr.bu $vr0, $vr8, $vr0 \n\t"
        "vavgr.bu $vr1, $vr9, $vr1 \n\t"
        "vavgr.bu $vr2, $vr10, $vr2 \n\t"
        "vavgr.bu $vr3, $vr11, $vr3 \n\t"
        "vavgr.bu $vr4, $vr12, $vr4 \n\t"
        "vavgr.bu $vr5, $vr13, $vr5 \n\t"
        "vavgr.bu $vr6, $vr14, $vr6 \n\t"
        "vavgr.bu $vr7, $vr15, $vr7 \n\t"

        "vst $vr0, %[dst], 0 \n\t"
        "vstx $vr1, %[dst], %[stride] \n\t"
        "vstx $vr2, %[dst], %[stride_2] \n\t"
        "vstx $vr3, %[dst], %[stride_3] \n\t"
        "add.d %[dst], %[dst], %[stride_4] \n\t"
        "vst $vr4, %[dst], 0 \n\t"
        "vstx $vr5, %[dst], %[stride] \n\t"
        "vstx $vr6, %[dst], %[stride_2] \n\t"
        "vstx $vr7, %[dst], %[stride_3] \n\t"
        "add.d %[dst], %[dst], %[stride_4] \n\t"

        "vld $vr0, %[src], 0 \n\t"
        "vldx $vr1, %[src], %[stride] \n\t"
        "vldx $vr2, %[src], %[stride_2] \n\t"
        "vldx $vr3, %[src], %[stride_3] \n\t"
        "add.d %[src], %[src], %[stride_4] \n\t"
        "vld $vr4, %[src], 0 \n\t"
        "vldx $vr5, %[src], %[stride] \n\t"
        "vldx $vr6, %[src], %[stride_2] \n\t"
        "vldx $vr7, %[src], %[stride_3] \n\t"

        "vld $vr8, %[tmp], 0 \n\t"
        "vldx $vr9, %[tmp], %[stride] \n\t"
        "vldx $vr10, %[tmp], %[stride_2] \n\t"
        "vldx $vr11, %[tmp], %[stride_3] \n\t"
        "add.d %[tmp], %[tmp], %[stride_4] \n\t"
        "vld $vr12, %[tmp], 0 \n\t"
        "vldx $vr13, %[tmp], %[stride] \n\t"
        "vldx $vr14, %[tmp], %[stride_2] \n\t"
        "vldx $vr15, %[tmp], %[stride_3] \n\t"

        "vavgr.bu $vr0, $vr8, $vr0 \n\t"
        "vavgr.bu $vr1, $vr9, $vr1 \n\t"
        "vavgr.bu $vr2, $vr10, $vr2 \n\t"
        "vavgr.bu $vr3, $vr11, $vr3 \n\t"
        "vavgr.bu $vr4, $vr12, $vr4 \n\t"
        "vavgr.bu $vr5, $vr13, $vr5 \n\t"
        "vavgr.bu $vr6, $vr14, $vr6 \n\t"
        "vavgr.bu $vr7, $vr15, $vr7 \n\t"

        "vst $vr0, %[dst], 0 \n\t"
        "vstx $vr1, %[dst], %[stride] \n\t"
        "vstx $vr2, %[dst], %[stride_2] \n\t"
        "vstx $vr3, %[dst], %[stride_3] \n\t"
        "add.d %[dst], %[dst], %[stride_4] \n\t"
        "vst $vr4, %[dst], 0 \n\t"
        "vstx $vr5, %[dst], %[stride] \n\t"
        "vstx $vr6, %[dst], %[stride_2] \n\t"
        "vstx $vr7, %[dst], %[stride_3] \n\t"

          [stride_2]"=&r"(stride_2), [stride_3]"=&r"(stride_3),
          [stride_4]"=&r"(stride_4)
/* QPEL8_H_LOWPASS computes the horizontal 6-tap half-pel value
 * ((p0 + p5) - 5 * (p1 + p4) + 20 * (p2 + p3) + 16) >> 5 for two rows at a
 * time: two rows are packed into one 256-bit register and the shifted taps
 * are produced with xvshuf_b byte shuffles (mask1..mask5). */
#define QPEL8_H_LOWPASS(out_v) \
    src00 = __lasx_xvld(src, -2); \
    src += srcStride; \
    src10 = __lasx_xvld(src, -2); \
    src += srcStride; \
    src00 = __lasx_xvpermi_q(src00, src10, 0x02); \
    src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1); \
    src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2); \
    src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3); \
    src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4); \
    src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5); \
    DUP2_ARG2(__lasx_xvaddwl_h_bu, src02, src03, src01, src04, src02, src01); \
    src00 = __lasx_xvaddwl_h_bu(src00, src05); \
    src02 = __lasx_xvmul_h(src02, h_20); \
    src01 = __lasx_xvmul_h(src01, h_5); \
    src02 = __lasx_xvssub_h(src02, src01); \
    src02 = __lasx_xvsadd_h(src02, src00); \
    src02 = __lasx_xvsadd_h(src02, h_16); \
    out_v = __lasx_xvssrani_bu_h(src02, src02, 5);
    int dstStride_2x = dstStride << 1;
    __m256i src00, src01, src02, src03, src04, src05, src10;
    __m256i out0, out1, out2, out3;
    __m256i h_20 = __lasx_xvldi(0x414);
    __m256i h_5 = __lasx_xvldi(0x405);
    __m256i h_16 = __lasx_xvldi(0x410);
    __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
    __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
    __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
    __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
    __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
    __lasx_xvstelm_d(out0, dst, 0, 0);
    __lasx_xvstelm_d(out0, dst + dstStride, 0, 2);
    dst += dstStride_2x;
    __lasx_xvstelm_d(out1, dst, 0, 0);
    __lasx_xvstelm_d(out1, dst + dstStride, 0, 2);
    dst += dstStride_2x;
    __lasx_xvstelm_d(out2, dst, 0, 0);
    __lasx_xvstelm_d(out2, dst + dstStride, 0, 2);
    dst += dstStride_2x;
    __lasx_xvstelm_d(out3, dst, 0, 0);
    __lasx_xvstelm_d(out3, dst + dstStride, 0, 2);
#define QPEL8_V_LOWPASS(src0, src1, src2, src3, src4, src5, src6, \
                        tmp0, tmp1, tmp2, tmp3, tmp4, tmp5) \
{ \
    tmp0 = __lasx_xvpermi_q(src0, src1, 0x02); \
    tmp1 = __lasx_xvpermi_q(src1, src2, 0x02); \
    tmp2 = __lasx_xvpermi_q(src2, src3, 0x02); \
    tmp3 = __lasx_xvpermi_q(src3, src4, 0x02); \
    tmp4 = __lasx_xvpermi_q(src4, src5, 0x02); \
    tmp5 = __lasx_xvpermi_q(src5, src6, 0x02); \
    DUP2_ARG2(__lasx_xvaddwl_h_bu, tmp2, tmp3, tmp1, tmp4, tmp2, tmp1); \
    tmp0 = __lasx_xvaddwl_h_bu(tmp0, tmp5); \
    tmp2 = __lasx_xvmul_h(tmp2, h_20); \
    tmp1 = __lasx_xvmul_h(tmp1, h_5); \
    tmp2 = __lasx_xvssub_h(tmp2, tmp1); \
    tmp2 = __lasx_xvsadd_h(tmp2, tmp0); \
    tmp2 = __lasx_xvsadd_h(tmp2, h_16); \
    tmp2 = __lasx_xvssrani_bu_h(tmp2, tmp2, 5); \
}
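
/* QPEL8_V_LOWPASS packs two consecutive rows into each 256-bit register with
 * xvpermi_q, so one invocation filters two output rows of the vertical 6-tap
 * (20 * (c + d) - 5 * (b + e) + (a + f) + 16) >> 5, saturated to unsigned
 * bytes. */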
    int srcStride_2x = srcStride << 1;
    int dstStride_2x = dstStride << 1;
    int srcStride_4x = srcStride << 2;
    int srcStride_3x = srcStride_2x + srcStride;
    __m256i src00, src01, src02, src03, src04, src05, src06;
    __m256i src07, src08, src09, src10, src11, src12;
    __m256i tmp00, tmp01, tmp02, tmp03, tmp04, tmp05;
    __m256i h_20 = __lasx_xvldi(0x414);
    __m256i h_5 = __lasx_xvldi(0x405);
    __m256i h_16 = __lasx_xvldi(0x410);

    src02 = __lasx_xvld(src, 0);
              srcStride_3x, src, srcStride_4x, src03, src04, src05, src06);
              srcStride_3x, src, srcStride_4x, src07, src08, src09, src10);
    DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12);

                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    __lasx_xvstelm_d(tmp02, dst, 0, 0);
    __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);

                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    __lasx_xvstelm_d(tmp02, dst, 0, 0);
    __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);

                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    __lasx_xvstelm_d(tmp02, dst, 0, 0);
    __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);

                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    __lasx_xvstelm_d(tmp02, dst, 0, 0);
    __lasx_xvstelm_d(tmp02, dst + dstStride, 0, 2);
    int srcStride_2x = srcStride << 1;
    int srcStride_4x = srcStride << 2;
    int dstStride_2x = dstStride << 1;
    int dstStride_4x = dstStride << 2;
    int srcStride_3x = srcStride_2x + srcStride;
    int dstStride_3x = dstStride_2x + dstStride;
    __m256i src00, src01, src02, src03, src04, src05, src06;
    __m256i src07, src08, src09, src10, src11, src12, tmp00;
    __m256i tmp01, tmp02, tmp03, tmp04, tmp05, tmp06, tmp07, tmp08, tmp09;
    __m256i h_20 = __lasx_xvldi(0x414);
    __m256i h_5 = __lasx_xvldi(0x405);
    __m256i h_16 = __lasx_xvldi(0x410);

    src02 = __lasx_xvld(src, 0);
              srcStride_3x, src, srcStride_4x, src03, src04, src05, src06);
              srcStride_3x, src, srcStride_4x, src07, src08, src09, src10);
    DUP2_ARG2(__lasx_xvldx, src, srcStride, src, srcStride_2x, src11, src12);

    tmp06 = __lasx_xvld(dst, 0);
    DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x,
              dst, dstStride_3x, dst, dstStride_4x,
              tmp07, tmp02, tmp03, tmp04);
    DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x,
              tmp05, tmp00);
    tmp01 = __lasx_xvldx(dst, dstStride_3x);
    tmp06 = __lasx_xvpermi_q(tmp06, tmp07, 0x02);
    tmp07 = __lasx_xvpermi_q(tmp02, tmp03, 0x02);
    tmp08 = __lasx_xvpermi_q(tmp04, tmp05, 0x02);
    tmp09 = __lasx_xvpermi_q(tmp00, tmp01, 0x02);

                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    tmp06 = __lasx_xvavgr_bu(tmp06, tmp02);
    __lasx_xvstelm_d(tmp06, dst, 0, 0);
    __lasx_xvstelm_d(tmp06, dst + dstStride, 0, 2);

                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    tmp07 = __lasx_xvavgr_bu(tmp07, tmp02);
    __lasx_xvstelm_d(tmp07, dst, 0, 0);
    __lasx_xvstelm_d(tmp07, dst + dstStride, 0, 2);

                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    tmp08 = __lasx_xvavgr_bu(tmp08, tmp02);
    __lasx_xvstelm_d(tmp08, dst, 0, 0);
    __lasx_xvstelm_d(tmp08, dst + dstStride, 0, 2);

                    tmp00, tmp01, tmp02, tmp03, tmp04, tmp05);
    tmp09 = __lasx_xvavgr_bu(tmp09, tmp02);
    __lasx_xvstelm_d(tmp09, dst, 0, 0);
    __lasx_xvstelm_d(tmp09, dst + dstStride, 0, 2);
#define QPEL8_HV_LOWPASS_H(tmp) \
{ \
    src00 = __lasx_xvld(src, -2); \
    src += srcStride; \
    src10 = __lasx_xvld(src, -2); \
    src += srcStride; \
    src00 = __lasx_xvpermi_q(src00, src10, 0x02); \
    src01 = __lasx_xvshuf_b(src00, src00, (__m256i)mask1); \
    src02 = __lasx_xvshuf_b(src00, src00, (__m256i)mask2); \
    src03 = __lasx_xvshuf_b(src00, src00, (__m256i)mask3); \
    src04 = __lasx_xvshuf_b(src00, src00, (__m256i)mask4); \
    src05 = __lasx_xvshuf_b(src00, src00, (__m256i)mask5); \
    DUP2_ARG2(__lasx_xvaddwl_h_bu, src02, src03, src01, src04, src02, src01); \
    src00 = __lasx_xvaddwl_h_bu(src00, src05); \
    src02 = __lasx_xvmul_h(src02, h_20); \
    src01 = __lasx_xvmul_h(src01, h_5); \
    src02 = __lasx_xvssub_h(src02, src01); \
    tmp = __lasx_xvsadd_h(src02, src00); \
}
#define QPEL8_HV_LOWPASS_V(src0, src1, src2, src3, \
                           src4, src5, temp0, temp1, \
                           temp2, temp3, temp4, temp5, \
                           out) \
{ \
    DUP2_ARG2(__lasx_xvaddwl_w_h, src2, src3, src1, src4, temp0, temp2); \
    DUP2_ARG2(__lasx_xvaddwh_w_h, src2, src3, src1, src4, temp1, temp3); \
    temp4 = __lasx_xvaddwl_w_h(src0, src5); \
    temp5 = __lasx_xvaddwh_w_h(src0, src5); \
    temp0 = __lasx_xvmul_w(temp0, w_20); \
    temp1 = __lasx_xvmul_w(temp1, w_20); \
    temp2 = __lasx_xvmul_w(temp2, w_5); \
    temp3 = __lasx_xvmul_w(temp3, w_5); \
    temp0 = __lasx_xvssub_w(temp0, temp2); \
    temp1 = __lasx_xvssub_w(temp1, temp3); \
    temp0 = __lasx_xvsadd_w(temp0, temp4); \
    temp1 = __lasx_xvsadd_w(temp1, temp5); \
    temp0 = __lasx_xvsadd_w(temp0, w_512); \
    temp1 = __lasx_xvsadd_w(temp1, w_512); \
    temp0 = __lasx_xvssrani_hu_w(temp0, temp0, 10); \
    temp1 = __lasx_xvssrani_hu_w(temp1, temp1, 10); \
    temp0 = __lasx_xvpackev_d(temp1, temp0); \
    out = __lasx_xvssrani_bu_h(temp0, temp0, 0); \
}
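
/* For the centre (hv) positions the horizontal pass above keeps 16-bit
 * intermediates without rounding, the vertical pass widens to 32 bits, and
 * the final value is rounded as (sum + 512) >> 10 before saturating back to
 * unsigned bytes, matching the two cascaded 6-tap filters of the H.264
 * quarter-pel scheme. */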
                                        ptrdiff_t dstStride, ptrdiff_t srcStride)
    __m256i src00, src01, src02, src03, src04, src05, src10;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
    __m256i h_20 = __lasx_xvldi(0x414);
    __m256i h_5 = __lasx_xvldi(0x405);
    __m256i w_20 = __lasx_xvldi(0x814);
    __m256i w_5 = __lasx_xvldi(0x805);
    __m256i w_512 = {512};
    __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
    __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
    __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
    __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
    __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};

    w_512 = __lasx_xvreplve0_w(w_512);

    src -= srcStride << 1;

    tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21);
    tmp9 = __lasx_xvpermi_q(tmp10, tmp8, 0x21);
    tmp7 = __lasx_xvpermi_q(tmp8, tmp6, 0x21);
    tmp5 = __lasx_xvpermi_q(tmp6, tmp4, 0x21);
    tmp3 = __lasx_xvpermi_q(tmp4, tmp2, 0x21);
    tmp1 = __lasx_xvpermi_q(tmp2, tmp0, 0x21);

                       src02, src03, src04, src05, tmp0)
                       src02, src03, src04, src05, tmp2)
                       src02, src03, src04, src05, tmp4)
                       src02, src03, src04, src05, tmp6)
    __lasx_xvstelm_d(tmp0, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp0, dst, 0, 2);
    dst += dstStride;
    __lasx_xvstelm_d(tmp2, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp2, dst, 0, 2);
    dst += dstStride;
    __lasx_xvstelm_d(tmp4, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp4, dst, 0, 2);
    dst += dstStride;
    __lasx_xvstelm_d(tmp6, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp6, dst, 0, 2);
    int dstStride_2x = dstStride << 1;
    int dstStride_4x = dstStride << 2;
    int dstStride_3x = dstStride_2x + dstStride;
    __m256i src00, src01, src02, src03, src04, src05, src10;
    __m256i dst00, dst01, dst0, dst1, dst2, dst3;
    __m256i out0, out1, out2, out3;
    __m256i h_20 = __lasx_xvldi(0x414);
    __m256i h_5 = __lasx_xvldi(0x405);
    __m256i h_16 = __lasx_xvldi(0x410);
    __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
    __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
    __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
    __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
    __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
    src00 = __lasx_xvld(dst, 0);
    DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst,
              dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04);

    DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, dst00);
    dst01 = __lasx_xvldx(dst, dstStride_3x);

    dst0 = __lasx_xvpermi_q(src00, src01, 0x02);
    dst1 = __lasx_xvpermi_q(src02, src03, 0x02);
    dst2 = __lasx_xvpermi_q(src04, src05, 0x02);
    dst3 = __lasx_xvpermi_q(dst00, dst01, 0x02);
    dst0 = __lasx_xvavgr_bu(dst0, out0);
    dst1 = __lasx_xvavgr_bu(dst1, out1);
    dst2 = __lasx_xvavgr_bu(dst2, out2);
    dst3 = __lasx_xvavgr_bu(dst3, out3);
    __lasx_xvstelm_d(dst0, dst, 0, 0);
    __lasx_xvstelm_d(dst0, dst + dstStride, 0, 2);
    __lasx_xvstelm_d(dst1, dst + dstStride_2x, 0, 0);
    __lasx_xvstelm_d(dst1, dst + dstStride_3x, 0, 2);
    dst += dstStride_4x;
    __lasx_xvstelm_d(dst2, dst, 0, 0);
    __lasx_xvstelm_d(dst2, dst + dstStride, 0, 2);
    __lasx_xvstelm_d(dst3, dst + dstStride_2x, 0, 0);
    __lasx_xvstelm_d(dst3, dst + dstStride_3x, 0, 2);
                                        ptrdiff_t dstStride, ptrdiff_t srcStride)
    __m256i src00, src01, src02, src03, src04, src05, src10;
    __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
    __m256i tmp7, tmp8, tmp9, tmp10, tmp11, tmp12;
    __m256i h_20 = __lasx_xvldi(0x414);
    __m256i h_5 = __lasx_xvldi(0x405);
    __m256i w_20 = __lasx_xvldi(0x814);
    __m256i w_5 = __lasx_xvldi(0x805);
    __m256i w_512 = {512};
    __m256i mask1 = {0x0807060504030201, 0x0, 0x0807060504030201, 0x0};
    __m256i mask2 = {0x0908070605040302, 0x0, 0x0908070605040302, 0x0};
    __m256i mask3 = {0x0a09080706050403, 0x0, 0x0a09080706050403, 0x0};
    __m256i mask4 = {0x0b0a090807060504, 0x0, 0x0b0a090807060504, 0x0};
    __m256i mask5 = {0x0c0b0a0908070605, 0x0, 0x0c0b0a0908070605, 0x0};
    ptrdiff_t dstStride_2x = dstStride << 1;
    ptrdiff_t dstStride_4x = dstStride << 2;
    ptrdiff_t dstStride_3x = dstStride_2x + dstStride;

    w_512 = __lasx_xvreplve0_w(w_512);

    src -= srcStride << 1;

    tmp11 = __lasx_xvpermi_q(tmp12, tmp10, 0x21);
    tmp9 = __lasx_xvpermi_q(tmp10, tmp8, 0x21);
    tmp7 = __lasx_xvpermi_q(tmp8, tmp6, 0x21);
    tmp5 = __lasx_xvpermi_q(tmp6, tmp4, 0x21);
    tmp3 = __lasx_xvpermi_q(tmp4, tmp2, 0x21);
    tmp1 = __lasx_xvpermi_q(tmp2, tmp0, 0x21);

                       src02, src03, src04, src05, tmp0)
                       src02, src03, src04, src05, tmp2)
                       src02, src03, src04, src05, tmp4)
                       src02, src03, src04, src05, tmp6)
    src00 = __lasx_xvld(dst, 0);
    DUP4_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, dst,
              dstStride_3x, dst, dstStride_4x, src01, src02, src03, src04);

    DUP2_ARG2(__lasx_xvldx, dst, dstStride, dst, dstStride_2x, src05, tmp8);
    tmp9 = __lasx_xvldx(dst, dstStride_3x);

    tmp1 = __lasx_xvpermi_q(src00, src01, 0x02);
    tmp3 = __lasx_xvpermi_q(src02, src03, 0x02);
    tmp5 = __lasx_xvpermi_q(src04, src05, 0x02);
    tmp7 = __lasx_xvpermi_q(tmp8, tmp9, 0x02);
    tmp0 = __lasx_xvavgr_bu(tmp0, tmp1);
    tmp2 = __lasx_xvavgr_bu(tmp2, tmp3);
    tmp4 = __lasx_xvavgr_bu(tmp4, tmp5);
    tmp6 = __lasx_xvavgr_bu(tmp6, tmp7);
    __lasx_xvstelm_d(tmp0, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp0, dst, 0, 2);
    dst += dstStride;
    __lasx_xvstelm_d(tmp2, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp2, dst, 0, 2);
    dst += dstStride;
    __lasx_xvstelm_d(tmp4, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp4, dst, 0, 2);
    dst += dstStride;
    __lasx_xvstelm_d(tmp6, dst, 0, 0);
    dst += dstStride;
    __lasx_xvstelm_d(tmp6, dst, 0, 2);
                                        int dstStride, int srcStride)
    src += srcStride << 3;
    dst += dstStride << 3;

                                        int dstStride, int srcStride)
    src += srcStride << 3;
    dst += dstStride << 3;

                                        int dstStride, int srcStride)

                                        int dstStride, int srcStride)

                                        ptrdiff_t dstStride, ptrdiff_t srcStride)
    src += srcStride << 3;
    dst += dstStride << 3;

                                        ptrdiff_t dstStride, ptrdiff_t srcStride)
    src += srcStride << 3;
    dst += dstStride << 3;
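
/* The qpel "mc" wrappers below build their quarter-pel outputs from two
 * intermediate half-pel planes kept in a stack buffer (halfH and halfHV),
 * spaced 64 bytes apart for the 8x8 cases and 256 bytes apart for the 16x16
 * cases, which are then averaged (and, in the avg_ variants, averaged once
 * more with the destination block). */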
    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 64;

    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 64;

    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 64;

    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 64;

    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 64;

    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 64;

    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 64;

    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 64;

    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 256;

    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 256;

    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 256;

    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 256;

    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 256;

    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 256;

    uint8_t *const halfHV = temp;
    uint8_t *const halfH = temp + 256;

    uint8_t *const halfH = temp;
    uint8_t *const halfHV = temp + 256;