FFmpeg
h264dsp_lasx.c
Go to the documentation of this file.
1 /*
2  * Loongson LASX optimized h264dsp
3  *
4  * Copyright (c) 2021 Loongson Technology Corporation Limited
5  * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
6  * Xiwei Gu <guxiwei-hf@loongson.cn>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24 
26 #include "h264dsp_lasx.h"
27 
/*
 * Compute the filtered p1 (or, by symmetry, q1) sample of the tc-clipped
 * luma filter. All arguments are __m256i vectors; the arithmetic is done on
 * 16-bit lanes. Reading the intrinsic sequence, each lane evaluates
 *
 *   out = p1 + clip((p2 + ((p0 + q0 + 1) >> 1) - 2 * p1) >> 1, -tc, tc)
 *
 * neg_tc_in / tc_in are the lower / upper clipping bounds and p1_or_q1_out
 * receives the result.
 *
 * The body is wrapped in do { } while (0) so that "MACRO(...);" expands to
 * exactly one statement and stays safe inside an unbraced if/else.
 */
#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in,          \
                         p1_or_q1_org_in, p2_or_q2_org_in,          \
                         neg_tc_in, tc_in, p1_or_q1_out)            \
do {                                                                \
    __m256i clip3, temp;                                            \
                                                                    \
    clip3 = __lasx_xvavgr_hu(p0_or_q0_org_in,                       \
                             q0_or_p0_org_in);                      \
    temp  = __lasx_xvslli_h(p1_or_q1_org_in, 1);                    \
    clip3 = __lasx_xvsub_h(clip3, temp);                            \
    clip3 = __lasx_xvavg_h(p2_or_q2_org_in, clip3);                 \
    clip3 = __lasx_xvclip_h(clip3, neg_tc_in, tc_in);               \
    p1_or_q1_out = __lasx_xvadd_h(p1_or_q1_org_in, clip3);          \
} while (0)
42 
/*
 * Compute the filtered pair of samples adjacent to the edge. All arguments
 * are __m256i vectors treated as 16-bit lanes. With A = q0_or_p0_org_in and
 * B = p0_or_q0_org_in, each lane evaluates
 *
 *   delta        = clip((((A - B) << 2) + (p1 - q1) + 4) >> 3,
 *                       neg_threshold_in, threshold_in)
 *   p0_or_q0_out = clip255(B + delta)
 *   q0_or_p0_out = clip255(A - delta)
 *
 * where p1 = p1_or_q1_org_in and q1 = q1_or_p1_org_in.
 *
 * The body is wrapped in do { } while (0) so that "MACRO(...);" expands to
 * exactly one statement and stays safe inside an unbraced if/else.
 */
#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in,              \
                     p1_or_q1_org_in, q1_or_p1_org_in,              \
                     neg_threshold_in, threshold_in,                \
                     p0_or_q0_out, q0_or_p0_out)                    \
do {                                                                \
    __m256i q0_sub_p0, p1_sub_q1, delta;                            \
                                                                    \
    q0_sub_p0 = __lasx_xvsub_h(q0_or_p0_org_in,                     \
                               p0_or_q0_org_in);                    \
    p1_sub_q1 = __lasx_xvsub_h(p1_or_q1_org_in,                     \
                               q1_or_p1_org_in);                    \
    q0_sub_p0 = __lasx_xvslli_h(q0_sub_p0, 2);                      \
    p1_sub_q1 = __lasx_xvaddi_hu(p1_sub_q1, 4);                     \
    delta     = __lasx_xvadd_h(q0_sub_p0, p1_sub_q1);               \
    delta     = __lasx_xvsrai_h(delta, 3);                          \
    delta     = __lasx_xvclip_h(delta, neg_threshold_in,            \
                                threshold_in);                      \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, delta);          \
    q0_or_p0_out = __lasx_xvsub_h(q0_or_p0_org_in, delta);          \
                                                                    \
    p0_or_q0_out = __lasx_xvclip255_h(p0_or_q0_out);                \
    q0_or_p0_out = __lasx_xvclip255_h(q0_or_p0_out);                \
} while (0)
66 
67 void ff_h264_h_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
68  int alpha_in, int beta_in, int8_t *tc)
69 {
70  ptrdiff_t img_width_2x = img_width << 1;
71  ptrdiff_t img_width_4x = img_width << 2;
72  ptrdiff_t img_width_8x = img_width << 3;
73  ptrdiff_t img_width_3x = img_width_2x + img_width;
74  __m256i tmp_vec0, bs_vec;
75  __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
76  0x0101010100000000, 0x0303030302020202};
77 
78  tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
79  tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
80  bs_vec = __lasx_xvslti_b(tc_vec, 0);
81  bs_vec = __lasx_xvxori_b(bs_vec, 255);
82  bs_vec = __lasx_xvandi_b(bs_vec, 1);
83 
84  if (__lasx_xbnz_v(bs_vec)) {
85  uint8_t *src = data - 4;
86  __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
87  __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
88  __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
89  __m256i is_bs_greater_than0;
90  __m256i zero = __lasx_xvldi(0);
91 
92  is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
93 
94  {
95  uint8_t *src_tmp = src + img_width_8x;
96  __m256i row0, row1, row2, row3, row4, row5, row6, row7;
97  __m256i row8, row9, row10, row11, row12, row13, row14, row15;
98 
99  DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
100  src, img_width_3x, row0, row1, row2, row3);
101  src += img_width_4x;
102  DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
103  src, img_width_3x, row4, row5, row6, row7);
104  src -= img_width_4x;
105  DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
106  img_width_2x, src_tmp, img_width_3x,
107  row8, row9, row10, row11);
108  src_tmp += img_width_4x;
109  DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, img_width, src_tmp,
110  img_width_2x, src_tmp, img_width_3x,
111  row12, row13, row14, row15);
112  src_tmp -= img_width_4x;
113 
114  LASX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6,
115  row7, row8, row9, row10, row11,
116  row12, row13, row14, row15,
117  p3_org, p2_org, p1_org, p0_org,
118  q0_org, q1_org, q2_org, q3_org);
119  }
120 
121  p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
122  p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
123  q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
124 
125  alpha = __lasx_xvreplgr2vr_b(alpha_in);
126  beta = __lasx_xvreplgr2vr_b(beta_in);
127 
128  is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
129  is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
130  is_less_than = is_less_than_alpha & is_less_than_beta;
131  is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
132  is_less_than = is_less_than_beta & is_less_than;
133  is_less_than = is_less_than & is_bs_greater_than0;
134 
135  if (__lasx_xbnz_v(is_less_than)) {
136  __m256i neg_tc_h, tc_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
137  __m256i p2_asub_p0, q2_asub_q0;
138 
139  neg_tc_h = __lasx_xvneg_b(tc_vec);
140  neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
141  tc_h = __lasx_vext2xv_hu_bu(tc_vec);
142  p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
143  p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
144  q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
145 
146  p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
147  is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
148  is_less_than_beta = is_less_than_beta & is_less_than;
149 
150  if (__lasx_xbnz_v(is_less_than_beta)) {
151  __m256i p2_org_h, p1_h;
152 
153  p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
154  AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
155  neg_tc_h, tc_h, p1_h);
156  p1_h = __lasx_xvpickev_b(p1_h, p1_h);
157  p1_h = __lasx_xvpermi_d(p1_h, 0xd8);
158  p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
159  is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
160  tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
161  }
162 
163  q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
164  is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
165  is_less_than_beta = is_less_than_beta & is_less_than;
166 
167  q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
168 
169  if (__lasx_xbnz_v(is_less_than_beta)) {
170  __m256i q2_org_h, q1_h;
171 
172  q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
173  AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
174  neg_tc_h, tc_h, q1_h);
175  q1_h = __lasx_xvpickev_b(q1_h, q1_h);
176  q1_h = __lasx_xvpermi_d(q1_h, 0xd8);
177  q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
178 
179  is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
180  tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
181  }
182 
183  {
184  __m256i neg_thresh_h, p0_h, q0_h;
185 
186  neg_thresh_h = __lasx_xvneg_b(tc_vec);
187  neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
188  tc_h = __lasx_vext2xv_hu_bu(tc_vec);
189 
190  AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
191  neg_thresh_h, tc_h, p0_h, q0_h);
192  DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
193  p0_h, q0_h);
194  DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
195  p0_h, q0_h);
196  p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
197  q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
198  }
199 
200  {
201  __m256i row0, row1, row2, row3, row4, row5, row6, row7;
202  __m256i control = {0x0000000400000000, 0x0000000500000001,
203  0x0000000600000002, 0x0000000700000003};
204 
205  DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org,
206  q2_org, 0x02, p2_org, q1_org, 0x02, p3_org,
207  q0_org, 0x02, p0_org, p1_org, p2_org, p3_org);
208  DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
209  row0, row2);
210  DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
211  row1, row3);
212  DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
213  DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
214  DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
215  control, row7, control, row4, row5, row6, row7);
216  __lasx_xvstelm_d(row4, src, 0, 0);
217  __lasx_xvstelm_d(row4, src + img_width, 0, 1);
218  src += img_width_2x;
219  __lasx_xvstelm_d(row4, src, 0, 2);
220  __lasx_xvstelm_d(row4, src + img_width, 0, 3);
221  src += img_width_2x;
222  __lasx_xvstelm_d(row5, src, 0, 0);
223  __lasx_xvstelm_d(row5, src + img_width, 0, 1);
224  src += img_width_2x;
225  __lasx_xvstelm_d(row5, src, 0, 2);
226  __lasx_xvstelm_d(row5, src + img_width, 0, 3);
227  src += img_width_2x;
228  __lasx_xvstelm_d(row6, src, 0, 0);
229  __lasx_xvstelm_d(row6, src + img_width, 0, 1);
230  src += img_width_2x;
231  __lasx_xvstelm_d(row6, src, 0, 2);
232  __lasx_xvstelm_d(row6, src + img_width, 0, 3);
233  src += img_width_2x;
234  __lasx_xvstelm_d(row7, src, 0, 0);
235  __lasx_xvstelm_d(row7, src + img_width, 0, 1);
236  src += img_width_2x;
237  __lasx_xvstelm_d(row7, src, 0, 2);
238  __lasx_xvstelm_d(row7, src + img_width, 0, 3);
239  }
240  }
241  }
242 }
243 
244 void ff_h264_v_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width,
245  int alpha_in, int beta_in, int8_t *tc)
246 {
247  ptrdiff_t img_width_2x = img_width << 1;
248  ptrdiff_t img_width_3x = img_width + img_width_2x;
249  __m256i tmp_vec0, bs_vec;
250  __m256i tc_vec = {0x0101010100000000, 0x0303030302020202,
251  0x0101010100000000, 0x0303030302020202};
252 
253  tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
254  tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
255  bs_vec = __lasx_xvslti_b(tc_vec, 0);
256  bs_vec = __lasx_xvxori_b(bs_vec, 255);
257  bs_vec = __lasx_xvandi_b(bs_vec, 1);
258 
259  if (__lasx_xbnz_v(bs_vec)) {
260  __m256i p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
261  __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
262  __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
263  __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
264  __m256i is_bs_greater_than0;
265  __m256i zero = __lasx_xvldi(0);
266 
267  alpha = __lasx_xvreplgr2vr_b(alpha_in);
268  beta = __lasx_xvreplgr2vr_b(beta_in);
269 
270  DUP2_ARG2(__lasx_xvldx, data, -img_width_3x, data, -img_width_2x,
271  p2_org, p1_org);
272  p0_org = __lasx_xvldx(data, -img_width);
273  DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);
274 
275  is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
276  p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
277  p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
278  q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
279 
280  is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
281  is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
282  is_less_than = is_less_than_alpha & is_less_than_beta;
283  is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
284  is_less_than = is_less_than_beta & is_less_than;
285  is_less_than = is_less_than & is_bs_greater_than0;
286 
287  if (__lasx_xbnz_v(is_less_than)) {
288  __m256i neg_tc_h, tc_h, p2_asub_p0, q2_asub_q0;
289 
290  q2_org = __lasx_xvldx(data, img_width_2x);
291 
292  neg_tc_h = __lasx_xvneg_b(tc_vec);
293  neg_tc_h = __lasx_vext2xv_h_b(neg_tc_h);
294  tc_h = __lasx_vext2xv_hu_bu(tc_vec);
295  p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
296  p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
297  q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
298 
299  p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
300  is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
301  is_less_than_beta = is_less_than_beta & is_less_than;
302 
303  if (__lasx_xbnz_v(is_less_than_beta)) {
304  __m256i p1_h, p2_org_h;
305 
306  p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
307  AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, p1_org_h, p2_org_h,
308  neg_tc_h, tc_h, p1_h);
309  p1_h = __lasx_xvpickev_b(p1_h, p1_h);
310  p1_h = __lasx_xvpermi_d(p1_h, 0xd8);
311  p1_h = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
312  p1_org = __lasx_xvpermi_q(p1_org, p1_h, 0x30);
313  __lasx_xvst(p1_org, data - img_width_2x, 0);
314 
315  is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
316  tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
317  }
318 
319  q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
320  is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
321  is_less_than_beta = is_less_than_beta & is_less_than;
322 
323  q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
324 
325  if (__lasx_xbnz_v(is_less_than_beta)) {
326  __m256i q1_h, q2_org_h;
327 
328  q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
329  AVC_LPF_P1_OR_Q1(p0_org_h, q0_org_h, q1_org_h, q2_org_h,
330  neg_tc_h, tc_h, q1_h);
331  q1_h = __lasx_xvpickev_b(q1_h, q1_h);
332  q1_h = __lasx_xvpermi_d(q1_h, 0xd8);
333  q1_h = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
334  q1_org = __lasx_xvpermi_q(q1_org, q1_h, 0x30);
335  __lasx_xvst(q1_org, data + img_width, 0);
336 
337  is_less_than_beta = __lasx_xvandi_b(is_less_than_beta, 1);
338  tc_vec = __lasx_xvadd_b(tc_vec, is_less_than_beta);
339 
340  }
341 
342  {
343  __m256i neg_thresh_h, p0_h, q0_h;
344 
345  neg_thresh_h = __lasx_xvneg_b(tc_vec);
346  neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
347  tc_h = __lasx_vext2xv_hu_bu(tc_vec);
348 
349  AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
350  neg_thresh_h, tc_h, p0_h, q0_h);
351  DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
352  p0_h, q0_h);
353  DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0Xd8, q0_h, 0xd8,
354  p0_h, q0_h);
355  p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
356  q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
357  p0_org = __lasx_xvpermi_q(p0_org, p0_h, 0x30);
358  q0_org = __lasx_xvpermi_q(q0_org, q0_h, 0x30);
359  __lasx_xvst(p0_org, data - img_width, 0);
360  __lasx_xvst(q0_org, data, 0);
361  }
362  }
363  }
364 }
365 
366 void ff_h264_h_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
367  int alpha_in, int beta_in, int8_t *tc)
368 {
369  __m256i tmp_vec0, bs_vec;
370  __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
371  __m256i zero = __lasx_xvldi(0);
372  ptrdiff_t img_width_2x = img_width << 1;
373  ptrdiff_t img_width_4x = img_width << 2;
374  ptrdiff_t img_width_3x = img_width_2x + img_width;
375 
376  tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
377  tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
378  bs_vec = __lasx_xvslti_b(tc_vec, 0);
379  bs_vec = __lasx_xvxori_b(bs_vec, 255);
380  bs_vec = __lasx_xvandi_b(bs_vec, 1);
381  bs_vec = __lasx_xvpermi_q(zero, bs_vec, 0x30);
382 
383  if (__lasx_xbnz_v(bs_vec)) {
384  uint8_t *src = data - 2;
385  __m256i p1_org, p0_org, q0_org, q1_org;
386  __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
387  __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
388  __m256i is_bs_greater_than0;
389 
390  is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
391 
392  {
393  __m256i row0, row1, row2, row3, row4, row5, row6, row7;
394 
395  DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
396  src, img_width_3x, row0, row1, row2, row3);
397  src += img_width_4x;
398  DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
399  src, img_width_3x, row4, row5, row6, row7);
400  src -= img_width_4x;
401  /* LASX_TRANSPOSE8x4_B */
402  DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4,
403  row7, row5, p1_org, p0_org, q0_org, q1_org);
404  row0 = __lasx_xvilvl_b(p0_org, p1_org);
405  row1 = __lasx_xvilvl_b(q1_org, q0_org);
406  row3 = __lasx_xvilvh_w(row1, row0);
407  row2 = __lasx_xvilvl_w(row1, row0);
408  p1_org = __lasx_xvpermi_d(row2, 0x00);
409  p0_org = __lasx_xvpermi_d(row2, 0x55);
410  q0_org = __lasx_xvpermi_d(row3, 0x00);
411  q1_org = __lasx_xvpermi_d(row3, 0x55);
412  }
413 
414  p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
415  p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
416  q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
417 
418  alpha = __lasx_xvreplgr2vr_b(alpha_in);
419  beta = __lasx_xvreplgr2vr_b(beta_in);
420 
421  is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
422  is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
423  is_less_than = is_less_than_alpha & is_less_than_beta;
424  is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
425  is_less_than = is_less_than_beta & is_less_than;
426  is_less_than = is_less_than & is_bs_greater_than0;
427 
428  if (__lasx_xbnz_v(is_less_than)) {
429  __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
430 
431  p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
432  p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
433  q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
434  q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
435 
436  {
437  __m256i tc_h, neg_thresh_h, p0_h, q0_h;
438 
439  neg_thresh_h = __lasx_xvneg_b(tc_vec);
440  neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
441  tc_h = __lasx_vext2xv_hu_bu(tc_vec);
442 
443  AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
444  neg_thresh_h, tc_h, p0_h, q0_h);
445  DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
446  p0_h, q0_h);
447  DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
448  p0_h, q0_h);
449  p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
450  q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
451  }
452 
453  p0_org = __lasx_xvilvl_b(q0_org, p0_org);
454  src = data - 1;
455  __lasx_xvstelm_h(p0_org, src, 0, 0);
456  src += img_width;
457  __lasx_xvstelm_h(p0_org, src, 0, 1);
458  src += img_width;
459  __lasx_xvstelm_h(p0_org, src, 0, 2);
460  src += img_width;
461  __lasx_xvstelm_h(p0_org, src, 0, 3);
462  src += img_width;
463  __lasx_xvstelm_h(p0_org, src, 0, 4);
464  src += img_width;
465  __lasx_xvstelm_h(p0_org, src, 0, 5);
466  src += img_width;
467  __lasx_xvstelm_h(p0_org, src, 0, 6);
468  src += img_width;
469  __lasx_xvstelm_h(p0_org, src, 0, 7);
470  }
471  }
472 }
473 
474 void ff_h264_v_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width,
475  int alpha_in, int beta_in, int8_t *tc)
476 {
477  int img_width_2x = img_width << 1;
478  __m256i tmp_vec0, bs_vec;
479  __m256i tc_vec = {0x0303020201010000, 0x0303020201010000, 0x0, 0x0};
480  __m256i zero = __lasx_xvldi(0);
481 
482  tmp_vec0 = __lasx_xvldrepl_w((uint32_t*)tc, 0);
483  tc_vec = __lasx_xvshuf_b(tmp_vec0, tmp_vec0, tc_vec);
484  bs_vec = __lasx_xvslti_b(tc_vec, 0);
485  bs_vec = __lasx_xvxori_b(bs_vec, 255);
486  bs_vec = __lasx_xvandi_b(bs_vec, 1);
487  bs_vec = __lasx_xvpermi_q(zero, bs_vec, 0x30);
488 
489  if (__lasx_xbnz_v(bs_vec)) {
490  __m256i p1_org, p0_org, q0_org, q1_org;
491  __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
492  __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
493  __m256i is_bs_greater_than0;
494 
495  alpha = __lasx_xvreplgr2vr_b(alpha_in);
496  beta = __lasx_xvreplgr2vr_b(beta_in);
497 
498  DUP2_ARG2(__lasx_xvldx, data, -img_width_2x, data, -img_width,
499  p1_org, p0_org);
500  DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);
501 
502  is_bs_greater_than0 = __lasx_xvslt_bu(zero, bs_vec);
503  p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
504  p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
505  q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
506 
507  is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
508  is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
509  is_less_than = is_less_than_alpha & is_less_than_beta;
510  is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
511  is_less_than = is_less_than_beta & is_less_than;
512  is_less_than = is_less_than & is_bs_greater_than0;
513 
514  if (__lasx_xbnz_v(is_less_than)) {
515  __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
516 
517  p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
518  p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
519  q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
520  q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
521 
522  {
523  __m256i neg_thresh_h, tc_h, p0_h, q0_h;
524 
525  neg_thresh_h = __lasx_xvneg_b(tc_vec);
526  neg_thresh_h = __lasx_vext2xv_h_b(neg_thresh_h);
527  tc_h = __lasx_vext2xv_hu_bu(tc_vec);
528 
529  AVC_LPF_P0Q0(q0_org_h, p0_org_h, p1_org_h, q1_org_h,
530  neg_thresh_h, tc_h, p0_h, q0_h);
531  DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h,
532  p0_h, q0_h);
533  DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8,
534  p0_h, q0_h);
535  p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
536  q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
537  __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
538  __lasx_xvstelm_d(q0_h, data, 0, 0);
539  }
540  }
541  }
542 }
543 
/*
 * Strong (intra) luma filter for one side of the edge. All arguments are
 * __m256i vectors treated as 16-bit lanes. With
 *
 *   threshold = p0_or_q0_org_in + q3_or_p3_org_in + p1_or_q1_org_in
 *
 * the intrinsic sequence evaluates, using rounding right shifts,
 *
 *   p0_or_q0_out = (2 * threshold + p2_or_q2_org_in + q1_or_p1_org_in + 4) >> 3
 *   p1_or_q1_out = (p2_or_q2_org_in + threshold + 2) >> 2
 *   p2_or_q2_out = (3 * p2_or_q2_org_in + 2 * p3_or_q3_org_in
 *                   + threshold + 4) >> 3
 *
 * The outputs are assigned while inputs are still being read, so an output
 * argument must not name the same variable as an input argument.
 *
 * The body is wrapped in do { } while (0) so that "MACRO(...);" expands to
 * exactly one statement and stays safe inside an unbraced if/else.
 */
#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in,          \
                                 q3_or_p3_org_in, p1_or_q1_org_in,          \
                                 p2_or_q2_org_in, q1_or_p1_org_in,          \
                                 p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)  \
do {                                                                        \
    __m256i threshold;                                                      \
    __m256i const2, const3 = __lasx_xvldi(0);                               \
                                                                            \
    const2 = __lasx_xvaddi_hu(const3, 2);                                   \
    const3 = __lasx_xvaddi_hu(const3, 3);                                   \
    threshold = __lasx_xvadd_h(p0_or_q0_org_in, q3_or_p3_org_in);           \
    threshold = __lasx_xvadd_h(p1_or_q1_org_in, threshold);                 \
                                                                            \
    p0_or_q0_out = __lasx_xvslli_h(threshold, 1);                           \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p2_or_q2_org_in);           \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, q1_or_p1_org_in);           \
    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const3);                   \
                                                                            \
    p1_or_q1_out = __lasx_xvadd_h(p2_or_q2_org_in, threshold);              \
    p1_or_q1_out = __lasx_xvsrar_h(p1_or_q1_out, const2);                   \
                                                                            \
    p2_or_q2_out = __lasx_xvmul_h(p2_or_q2_org_in, const3);                 \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);           \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, p3_or_q3_org_in);           \
    p2_or_q2_out = __lasx_xvadd_h(p2_or_q2_out, threshold);                 \
    p2_or_q2_out = __lasx_xvsrar_h(p2_or_q2_out, const3);                   \
} while (0)
571 
/*
 * Weak intra filter for the sample next to the edge, on 16-bit lanes:
 *
 *   data[-u32_img_width] = (uint8_t)((2 * p1 + p0 + q1 + 2) >> 2);
 *
 * i.e. p0_or_q0_out = (p0_or_q0_org_in + q1_or_p1_org_in
 *                      + 2 * p1_or_q1_org_in + 2) >> 2.
 *
 * The body is wrapped in do { } while (0) so that "MACRO(...);" expands to
 * exactly one statement and stays safe inside an unbraced if/else.
 */
#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in,                  \
                         p1_or_q1_org_in, p0_or_q0_out)                     \
do {                                                                        \
    __m256i const2 = __lasx_xvldi(0);                                       \
    const2 = __lasx_xvaddi_hu(const2, 2);                                   \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_org_in, q1_or_p1_org_in);        \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);           \
    p0_or_q0_out = __lasx_xvadd_h(p0_or_q0_out, p1_or_q1_org_in);           \
    p0_or_q0_out = __lasx_xvsrar_h(p0_or_q0_out, const2);                   \
} while (0)
583 
584 void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
585  int alpha_in, int beta_in)
586 {
587  ptrdiff_t img_width_2x = img_width << 1;
588  ptrdiff_t img_width_4x = img_width << 2;
589  ptrdiff_t img_width_3x = img_width_2x + img_width;
590  uint8_t *src = data - 4;
591  __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
592  __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
593  __m256i p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
594  __m256i zero = __lasx_xvldi(0);
595 
596  {
597  __m256i row0, row1, row2, row3, row4, row5, row6, row7;
598  __m256i row8, row9, row10, row11, row12, row13, row14, row15;
599 
600  DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
601  src, img_width_3x, row0, row1, row2, row3);
602  src += img_width_4x;
603  DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
604  src, img_width_3x, row4, row5, row6, row7);
605  src += img_width_4x;
606  DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
607  src, img_width_3x, row8, row9, row10, row11);
608  src += img_width_4x;
609  DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
610  src, img_width_3x, row12, row13, row14, row15);
611  src += img_width_4x;
612 
613  LASX_TRANSPOSE16x8_B(row0, row1, row2, row3,
614  row4, row5, row6, row7,
615  row8, row9, row10, row11,
616  row12, row13, row14, row15,
617  p3_org, p2_org, p1_org, p0_org,
618  q0_org, q1_org, q2_org, q3_org);
619  }
620 
621  alpha = __lasx_xvreplgr2vr_b(alpha_in);
622  beta = __lasx_xvreplgr2vr_b(beta_in);
623  p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
624  p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
625  q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
626 
627  is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
628  is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
629  is_less_than = is_less_than_beta & is_less_than_alpha;
630  is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
631  is_less_than = is_less_than_beta & is_less_than;
632  is_less_than = __lasx_xvpermi_q(zero, is_less_than, 0x30);
633 
634  if (__lasx_xbnz_v(is_less_than)) {
635  __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
636  __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
637  __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);
638 
639  less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
640  less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
641  less_alpha_shift2_add2);
642 
643  p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
644  p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
645  q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
646  q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
647 
648  p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
649  is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
650  is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
651  negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
652  is_less_than_beta = is_less_than_beta & is_less_than;
653  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
654 
655  /* combine and store */
656  if (__lasx_xbnz_v(is_less_than_beta)) {
657  __m256i p2_org_h, p3_org_h, p1_h, p2_h;
658 
659  p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
660  p3_org_h = __lasx_vext2xv_hu_bu(p3_org);
661 
662  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
663  p2_org_h, q1_org_h, p0_h, p1_h, p2_h);
664 
665  p0_h = __lasx_xvpickev_b(p0_h, p0_h);
666  p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
667  DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
668  DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
669  p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
670  p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
671  p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);
672  }
673 
674  AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
675  /* combine */
676  p0_h = __lasx_xvpickev_b(p0_h, p0_h);
677  p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
678  p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);
679 
680  /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
681  q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
682  is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
683  is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
684  negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
685  is_less_than_beta = is_less_than_beta & is_less_than;
686  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
687 
688  /* combine and store */
689  if (__lasx_xbnz_v(is_less_than_beta)) {
690  __m256i q2_org_h, q3_org_h, q1_h, q2_h;
691 
692  q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
693  q3_org_h = __lasx_vext2xv_hu_bu(q3_org);
694 
695  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
696  q2_org_h, p1_org_h, q0_h, q1_h, q2_h);
697 
698  q0_h = __lasx_xvpickev_b(q0_h, q0_h);
699  q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
700  DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
701  DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
702  q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
703  q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
704  q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);
705 
706  }
707 
708  AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
709 
710  /* combine */
711  q0_h = __lasx_xvpickev_b(q0_h, q0_h);
712  q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
713  q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);
714 
715  /* transpose and store */
716  {
717  __m256i row0, row1, row2, row3, row4, row5, row6, row7;
718  __m256i control = {0x0000000400000000, 0x0000000500000001,
719  0x0000000600000002, 0x0000000700000003};
720 
721  DUP4_ARG3(__lasx_xvpermi_q, p0_org, q3_org, 0x02, p1_org, q2_org,
722  0x02, p2_org, q1_org, 0x02, p3_org, q0_org, 0x02,
723  p0_org, p1_org, p2_org, p3_org);
724  DUP2_ARG2(__lasx_xvilvl_b, p1_org, p3_org, p0_org, p2_org,
725  row0, row2);
726  DUP2_ARG2(__lasx_xvilvh_b, p1_org, p3_org, p0_org, p2_org,
727  row1, row3);
728  DUP2_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row4, row6);
729  DUP2_ARG2(__lasx_xvilvh_b, row2, row0, row3, row1, row5, row7);
730  DUP4_ARG2(__lasx_xvperm_w, row4, control, row5, control, row6,
731  control, row7, control, row4, row5, row6, row7);
732  src = data - 4;
733  __lasx_xvstelm_d(row4, src, 0, 0);
734  __lasx_xvstelm_d(row4, src + img_width, 0, 1);
735  src += img_width_2x;
736  __lasx_xvstelm_d(row4, src, 0, 2);
737  __lasx_xvstelm_d(row4, src + img_width, 0, 3);
738  src += img_width_2x;
739  __lasx_xvstelm_d(row5, src, 0, 0);
740  __lasx_xvstelm_d(row5, src + img_width, 0, 1);
741  src += img_width_2x;
742  __lasx_xvstelm_d(row5, src, 0, 2);
743  __lasx_xvstelm_d(row5, src + img_width, 0, 3);
744  src += img_width_2x;
745  __lasx_xvstelm_d(row6, src, 0, 0);
746  __lasx_xvstelm_d(row6, src + img_width, 0, 1);
747  src += img_width_2x;
748  __lasx_xvstelm_d(row6, src, 0, 2);
749  __lasx_xvstelm_d(row6, src + img_width, 0, 3);
750  src += img_width_2x;
751  __lasx_xvstelm_d(row7, src, 0, 0);
752  __lasx_xvstelm_d(row7, src + img_width, 0, 1);
753  src += img_width_2x;
754  __lasx_xvstelm_d(row7, src, 0, 2);
755  __lasx_xvstelm_d(row7, src + img_width, 0, 3);
756  }
757  }
758 }
759 
760 void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
761  int alpha_in, int beta_in)
762 {
763  ptrdiff_t img_width_2x = img_width << 1;
764  ptrdiff_t img_width_3x = img_width_2x + img_width;
765  uint8_t *src = data - img_width_2x;
766  __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
767  __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
768  __m256i p1_org, p0_org, q0_org, q1_org;
769  __m256i zero = __lasx_xvldi(0);
770 
771  DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x,
772  src, img_width_3x, p1_org, p0_org, q0_org, q1_org);
773  alpha = __lasx_xvreplgr2vr_b(alpha_in);
774  beta = __lasx_xvreplgr2vr_b(beta_in);
775  p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
776  p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
777  q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
778 
779  is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
780  is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
781  is_less_than = is_less_than_beta & is_less_than_alpha;
782  is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
783  is_less_than = is_less_than_beta & is_less_than;
784  is_less_than = __lasx_xvpermi_q(zero, is_less_than, 0x30);
785 
786  if (__lasx_xbnz_v(is_less_than)) {
787  __m256i p2_asub_p0, q2_asub_q0, p0_h, q0_h, negate_is_less_than_beta;
788  __m256i p1_org_h, p0_org_h, q0_org_h, q1_org_h;
789  __m256i p2_org = __lasx_xvldx(src, -img_width);
790  __m256i q2_org = __lasx_xvldx(data, img_width_2x);
791  __m256i less_alpha_shift2_add2 = __lasx_xvsrli_b(alpha, 2);
792  less_alpha_shift2_add2 = __lasx_xvaddi_bu(less_alpha_shift2_add2, 2);
793  less_alpha_shift2_add2 = __lasx_xvslt_bu(p0_asub_q0,
794  less_alpha_shift2_add2);
795 
796  p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
797  p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
798  q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
799  q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
800 
801  p2_asub_p0 = __lasx_xvabsd_bu(p2_org, p0_org);
802  is_less_than_beta = __lasx_xvslt_bu(p2_asub_p0, beta);
803  is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
804  negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
805  is_less_than_beta = is_less_than_beta & is_less_than;
806  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
807 
808  /* combine and store */
809  if (__lasx_xbnz_v(is_less_than_beta)) {
810  __m256i p2_org_h, p3_org_h, p1_h, p2_h;
811  __m256i p3_org = __lasx_xvldx(src, -img_width_2x);
812 
813  p2_org_h = __lasx_vext2xv_hu_bu(p2_org);
814  p3_org_h = __lasx_vext2xv_hu_bu(p3_org);
815 
816  AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_org_h, p0_org_h, q0_org_h, p1_org_h,
817  p2_org_h, q1_org_h, p0_h, p1_h, p2_h);
818 
819  p0_h = __lasx_xvpickev_b(p0_h, p0_h);
820  p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
821  DUP2_ARG2(__lasx_xvpickev_b, p1_h, p1_h, p2_h, p2_h, p1_h, p2_h);
822  DUP2_ARG2(__lasx_xvpermi_d, p1_h, 0xd8, p2_h, 0xd8, p1_h, p2_h);
823  p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than_beta);
824  p1_org = __lasx_xvbitsel_v(p1_org, p1_h, is_less_than_beta);
825  p2_org = __lasx_xvbitsel_v(p2_org, p2_h, is_less_than_beta);
826 
827  __lasx_xvst(p1_org, src, 0);
828  __lasx_xvst(p2_org, src - img_width, 0);
829  }
830 
831  AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
832  /* combine */
833  p0_h = __lasx_xvpickev_b(p0_h, p0_h);
834  p0_h = __lasx_xvpermi_d(p0_h, 0xd8);
835  p0_org = __lasx_xvbitsel_v(p0_org, p0_h, negate_is_less_than_beta);
836  __lasx_xvst(p0_org, data - img_width, 0);
837 
838  /* if (tmpFlag && (unsigned)ABS(q2-q0) < thresholds->beta_in) */
839  q2_asub_q0 = __lasx_xvabsd_bu(q2_org, q0_org);
840  is_less_than_beta = __lasx_xvslt_bu(q2_asub_q0, beta);
841  is_less_than_beta = is_less_than_beta & less_alpha_shift2_add2;
842  negate_is_less_than_beta = __lasx_xvxori_b(is_less_than_beta, 0xff);
843  is_less_than_beta = is_less_than_beta & is_less_than;
844  negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
845 
846  /* combine and store */
847  if (__lasx_xbnz_v(is_less_than_beta)) {
848  __m256i q2_org_h, q3_org_h, q1_h, q2_h;
849  __m256i q3_org = __lasx_xvldx(data, img_width_2x + img_width);
850 
851  q2_org_h = __lasx_vext2xv_hu_bu(q2_org);
852  q3_org_h = __lasx_vext2xv_hu_bu(q3_org);
853 
854  AVC_LPF_P0P1P2_OR_Q0Q1Q2(q3_org_h, q0_org_h, p0_org_h, q1_org_h,
855  q2_org_h, p1_org_h, q0_h, q1_h, q2_h);
856 
857  q0_h = __lasx_xvpickev_b(q0_h, q0_h);
858  q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
859  DUP2_ARG2(__lasx_xvpickev_b, q1_h, q1_h, q2_h, q2_h, q1_h, q2_h);
860  DUP2_ARG2(__lasx_xvpermi_d, q1_h, 0xd8, q2_h, 0xd8, q1_h, q2_h);
861  q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than_beta);
862  q1_org = __lasx_xvbitsel_v(q1_org, q1_h, is_less_than_beta);
863  q2_org = __lasx_xvbitsel_v(q2_org, q2_h, is_less_than_beta);
864 
865  __lasx_xvst(q1_org, data + img_width, 0);
866  __lasx_xvst(q2_org, data + img_width_2x, 0);
867  }
868 
869  AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
870 
871  /* combine */
872  q0_h = __lasx_xvpickev_b(q0_h, q0_h);
873  q0_h = __lasx_xvpermi_d(q0_h, 0xd8);
874  q0_org = __lasx_xvbitsel_v(q0_org, q0_h, negate_is_less_than_beta);
875 
876  __lasx_xvst(q0_org, data, 0);
877  }
878 }
879 
880 void ff_h264_h_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
881  int alpha_in, int beta_in)
882 {
883  uint8_t *src = data - 2;
884  ptrdiff_t img_width_2x = img_width << 1;
885  ptrdiff_t img_width_4x = img_width << 2;
886  ptrdiff_t img_width_3x = img_width_2x + img_width;
887  __m256i p1_org, p0_org, q0_org, q1_org;
888  __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
889  __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
890 
891  {
892  __m256i row0, row1, row2, row3, row4, row5, row6, row7;
893 
894  DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
895  img_width_3x, row0, row1, row2, row3);
896  src += img_width_4x;
897  DUP4_ARG2(__lasx_xvldx, src, 0, src, img_width, src, img_width_2x, src,
898  img_width_3x, row4, row5, row6, row7);
899 
900  /* LASX_TRANSPOSE8x4_B */
901  DUP4_ARG2(__lasx_xvilvl_b, row2, row0, row3, row1, row6, row4, row7, row5,
902  p1_org, p0_org, q0_org, q1_org);
903  row0 = __lasx_xvilvl_b(p0_org, p1_org);
904  row1 = __lasx_xvilvl_b(q1_org, q0_org);
905  row3 = __lasx_xvilvh_w(row1, row0);
906  row2 = __lasx_xvilvl_w(row1, row0);
907  p1_org = __lasx_xvpermi_d(row2, 0x00);
908  p0_org = __lasx_xvpermi_d(row2, 0x55);
909  q0_org = __lasx_xvpermi_d(row3, 0x00);
910  q1_org = __lasx_xvpermi_d(row3, 0x55);
911  }
912 
913  alpha = __lasx_xvreplgr2vr_b(alpha_in);
914  beta = __lasx_xvreplgr2vr_b(beta_in);
915 
916  p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
917  p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
918  q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
919 
920  is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
921  is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
922  is_less_than = is_less_than_alpha & is_less_than_beta;
923  is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
924  is_less_than = is_less_than_beta & is_less_than;
925 
926  if (__lasx_xbnz_v(is_less_than)) {
927  __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
928 
929  p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
930  p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
931  q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
932  q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
933 
934  AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
935  AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
936  DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
937  DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
938  p0_org = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
939  q0_org = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
940  }
941  p0_org = __lasx_xvilvl_b(q0_org, p0_org);
942  src = data - 1;
943  __lasx_xvstelm_h(p0_org, src, 0, 0);
944  src += img_width;
945  __lasx_xvstelm_h(p0_org, src, 0, 1);
946  src += img_width;
947  __lasx_xvstelm_h(p0_org, src, 0, 2);
948  src += img_width;
949  __lasx_xvstelm_h(p0_org, src, 0, 3);
950  src += img_width;
951  __lasx_xvstelm_h(p0_org, src, 0, 4);
952  src += img_width;
953  __lasx_xvstelm_h(p0_org, src, 0, 5);
954  src += img_width;
955  __lasx_xvstelm_h(p0_org, src, 0, 6);
956  src += img_width;
957  __lasx_xvstelm_h(p0_org, src, 0, 7);
958 }
959 
960 void ff_h264_v_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width,
961  int alpha_in, int beta_in)
962 {
963  ptrdiff_t img_width_2x = img_width << 1;
964  __m256i p1_org, p0_org, q0_org, q1_org;
965  __m256i p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha, beta;
966  __m256i is_less_than, is_less_than_beta, is_less_than_alpha;
967 
968  alpha = __lasx_xvreplgr2vr_b(alpha_in);
969  beta = __lasx_xvreplgr2vr_b(beta_in);
970 
971  p1_org = __lasx_xvldx(data, -img_width_2x);
972  p0_org = __lasx_xvldx(data, -img_width);
973  DUP2_ARG2(__lasx_xvldx, data, 0, data, img_width, q0_org, q1_org);
974 
975  p0_asub_q0 = __lasx_xvabsd_bu(p0_org, q0_org);
976  p1_asub_p0 = __lasx_xvabsd_bu(p1_org, p0_org);
977  q1_asub_q0 = __lasx_xvabsd_bu(q1_org, q0_org);
978 
979  is_less_than_alpha = __lasx_xvslt_bu(p0_asub_q0, alpha);
980  is_less_than_beta = __lasx_xvslt_bu(p1_asub_p0, beta);
981  is_less_than = is_less_than_alpha & is_less_than_beta;
982  is_less_than_beta = __lasx_xvslt_bu(q1_asub_q0, beta);
983  is_less_than = is_less_than_beta & is_less_than;
984 
985  if (__lasx_xbnz_v(is_less_than)) {
986  __m256i p0_h, q0_h, p1_org_h, p0_org_h, q0_org_h, q1_org_h;
987 
988  p1_org_h = __lasx_vext2xv_hu_bu(p1_org);
989  p0_org_h = __lasx_vext2xv_hu_bu(p0_org);
990  q0_org_h = __lasx_vext2xv_hu_bu(q0_org);
991  q1_org_h = __lasx_vext2xv_hu_bu(q1_org);
992 
993  AVC_LPF_P0_OR_Q0(p0_org_h, q1_org_h, p1_org_h, p0_h);
994  AVC_LPF_P0_OR_Q0(q0_org_h, p1_org_h, q1_org_h, q0_h);
995  DUP2_ARG2(__lasx_xvpickev_b, p0_h, p0_h, q0_h, q0_h, p0_h, q0_h);
996  DUP2_ARG2(__lasx_xvpermi_d, p0_h, 0xd8, q0_h, 0xd8, p0_h, q0_h);
997  p0_h = __lasx_xvbitsel_v(p0_org, p0_h, is_less_than);
998  q0_h = __lasx_xvbitsel_v(q0_org, q0_h, is_less_than);
999  __lasx_xvstelm_d(p0_h, data - img_width, 0, 0);
1000  __lasx_xvstelm_d(q0_h, data, 0, 0);
1001  }
1002 }
1003 
1004 void ff_biweight_h264_pixels16_8_lasx(uint8_t *dst, uint8_t *src,
1005  ptrdiff_t stride, int height,
1006  int log2_denom, int weight_dst,
1007  int weight_src, int offset_in)
1008 {
1009  __m256i wgt;
1010  __m256i src0, src1, src2, src3;
1011  __m256i dst0, dst1, dst2, dst3;
1012  __m256i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1013  __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1014  __m256i denom, offset;
1015  int stride_2x = stride << 1;
1016  int stride_4x = stride << 2;
1017  int stride_3x = stride_2x + stride;
1018 
1019  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1020  offset_in += ((weight_src + weight_dst) << 7);
1021  log2_denom += 1;
1022 
1023  tmp0 = __lasx_xvreplgr2vr_b(weight_src);
1024  tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
1025  wgt = __lasx_xvilvh_b(tmp1, tmp0);
1026  offset = __lasx_xvreplgr2vr_h(offset_in);
1027  denom = __lasx_xvreplgr2vr_h(log2_denom);
1028 
1029  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1030  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1031  src += stride_4x;
1032  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1033  src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1034  src += stride_4x;
1035  DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
1036  0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
1037  DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1038  dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
1039  dst += stride_4x;
1040  DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1041  dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
1042  dst -= stride_4x;
1043  DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
1044  0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);
1045 
1046  DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
1047  src0, src1, src2, src3);
1048  DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
1049  dst0, dst1, dst2, dst3);
1050  DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
1051  dst3, src3, vec0, vec2, vec4, vec6);
1052  DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
1053  dst3, src3, vec1, vec3, vec5, vec7);
1054 
1055  DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1056  offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
1057  DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
1058  offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
1059 
1060  tmp0 = __lasx_xvsra_h(tmp0, denom);
1061  tmp1 = __lasx_xvsra_h(tmp1, denom);
1062  tmp2 = __lasx_xvsra_h(tmp2, denom);
1063  tmp3 = __lasx_xvsra_h(tmp3, denom);
1064  tmp4 = __lasx_xvsra_h(tmp4, denom);
1065  tmp5 = __lasx_xvsra_h(tmp5, denom);
1066  tmp6 = __lasx_xvsra_h(tmp6, denom);
1067  tmp7 = __lasx_xvsra_h(tmp7, denom);
1068 
1069  DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
1070  tmp0, tmp1, tmp2, tmp3);
1071  DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
1072  tmp4, tmp5, tmp6, tmp7);
1073  DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
1074  dst0, dst1, dst2, dst3);
1075  __lasx_xvstelm_d(dst0, dst, 0, 0);
1076  __lasx_xvstelm_d(dst0, dst, 8, 1);
1077  dst += stride;
1078  __lasx_xvstelm_d(dst0, dst, 0, 2);
1079  __lasx_xvstelm_d(dst0, dst, 8, 3);
1080  dst += stride;
1081  __lasx_xvstelm_d(dst1, dst, 0, 0);
1082  __lasx_xvstelm_d(dst1, dst, 8, 1);
1083  dst += stride;
1084  __lasx_xvstelm_d(dst1, dst, 0, 2);
1085  __lasx_xvstelm_d(dst1, dst, 8, 3);
1086  dst += stride;
1087  __lasx_xvstelm_d(dst2, dst, 0, 0);
1088  __lasx_xvstelm_d(dst2, dst, 8, 1);
1089  dst += stride;
1090  __lasx_xvstelm_d(dst2, dst, 0, 2);
1091  __lasx_xvstelm_d(dst2, dst, 8, 3);
1092  dst += stride;
1093  __lasx_xvstelm_d(dst3, dst, 0, 0);
1094  __lasx_xvstelm_d(dst3, dst, 8, 1);
1095  dst += stride;
1096  __lasx_xvstelm_d(dst3, dst, 0, 2);
1097  __lasx_xvstelm_d(dst3, dst, 8, 3);
1098  dst += stride;
1099 
1100  if (16 == height) {
1101  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1102  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1103  src += stride_4x;
1104  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1105  src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1106  src += stride_4x;
1107  DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
1108  tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
1109  DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1110  dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
1111  dst += stride_4x;
1112  DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1113  dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
1114  dst -= stride_4x;
1115  DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
1116  tmp4, 0x20, tmp7, tmp6, 0x20, dst0, dst1, dst2, dst3);
1117 
1118  DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
1119  src0, src1, src2, src3);
1120  DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
1121  dst0, dst1, dst2, dst3);
1122  DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
1123  dst3, src3, vec0, vec2, vec4, vec6);
1124  DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
1125  dst3, src3, vec1, vec3, vec5, vec7);
1126 
1127  DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1128  offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
1129  DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec4, offset, wgt, vec5,
1130  offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
1131 
1132  tmp0 = __lasx_xvsra_h(tmp0, denom);
1133  tmp1 = __lasx_xvsra_h(tmp1, denom);
1134  tmp2 = __lasx_xvsra_h(tmp2, denom);
1135  tmp3 = __lasx_xvsra_h(tmp3, denom);
1136  tmp4 = __lasx_xvsra_h(tmp4, denom);
1137  tmp5 = __lasx_xvsra_h(tmp5, denom);
1138  tmp6 = __lasx_xvsra_h(tmp6, denom);
1139  tmp7 = __lasx_xvsra_h(tmp7, denom);
1140 
1141  DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
1142  tmp0, tmp1, tmp2, tmp3);
1143  DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
1144  tmp4, tmp5, tmp6, tmp7);
1145  DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7,
1146  tmp6, dst0, dst1, dst2, dst3);
1147  __lasx_xvstelm_d(dst0, dst, 0, 0);
1148  __lasx_xvstelm_d(dst0, dst, 8, 1);
1149  dst += stride;
1150  __lasx_xvstelm_d(dst0, dst, 0, 2);
1151  __lasx_xvstelm_d(dst0, dst, 8, 3);
1152  dst += stride;
1153  __lasx_xvstelm_d(dst1, dst, 0, 0);
1154  __lasx_xvstelm_d(dst1, dst, 8, 1);
1155  dst += stride;
1156  __lasx_xvstelm_d(dst1, dst, 0, 2);
1157  __lasx_xvstelm_d(dst1, dst, 8, 3);
1158  dst += stride;
1159  __lasx_xvstelm_d(dst2, dst, 0, 0);
1160  __lasx_xvstelm_d(dst2, dst, 8, 1);
1161  dst += stride;
1162  __lasx_xvstelm_d(dst2, dst, 0, 2);
1163  __lasx_xvstelm_d(dst2, dst, 8, 3);
1164  dst += stride;
1165  __lasx_xvstelm_d(dst3, dst, 0, 0);
1166  __lasx_xvstelm_d(dst3, dst, 8, 1);
1167  dst += stride;
1168  __lasx_xvstelm_d(dst3, dst, 0, 2);
1169  __lasx_xvstelm_d(dst3, dst, 8, 3);
1170  }
1171 }
1172 
1173 static void avc_biwgt_8x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1174  int32_t log2_denom, int32_t weight_src,
1175  int32_t weight_dst, int32_t offset_in)
1176 {
1177  __m256i wgt, vec0, vec1;
1178  __m256i src0, dst0;
1179  __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
1180  ptrdiff_t stride_2x = stride << 1;
1181  ptrdiff_t stride_3x = stride_2x + stride;
1182 
1183  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1184  offset_in += ((weight_src + weight_dst) << 7);
1185  log2_denom += 1;
1186 
1187  tmp0 = __lasx_xvreplgr2vr_b(weight_src);
1188  tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
1189  wgt = __lasx_xvilvh_b(tmp1, tmp0);
1190  offset = __lasx_xvreplgr2vr_h(offset_in);
1191  denom = __lasx_xvreplgr2vr_h(log2_denom);
1192 
1193  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1194  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1195  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1196  src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1197  DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1198  dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
1199  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1200  dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1201  DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
1202  vec0 = __lasx_xvilvl_b(dst0, src0);
1203  vec1 = __lasx_xvilvh_b(dst0, src0);
1204  DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1205  tmp0, tmp1);
1206  tmp0 = __lasx_xvsra_h(tmp0, denom);
1207  tmp1 = __lasx_xvsra_h(tmp1, denom);
1208  DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
1209  dst0 = __lasx_xvpickev_b(tmp1, tmp0);
1210  __lasx_xvstelm_d(dst0, dst, 0, 0);
1211  __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
1212  __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
1213  __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
1214 }
1215 
1216 static void avc_biwgt_8x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1217  int32_t log2_denom, int32_t weight_src,
1218  int32_t weight_dst, int32_t offset_in)
1219 {
1220  __m256i wgt, vec0, vec1, vec2, vec3;
1221  __m256i src0, src1, dst0, dst1;
1222  __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
1223  ptrdiff_t stride_2x = stride << 1;
1224  ptrdiff_t stride_4x = stride << 2;
1225  ptrdiff_t stride_3x = stride_2x + stride;
1226  uint8_t* dst_tmp = dst;
1227 
1228  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1229  offset_in += ((weight_src + weight_dst) << 7);
1230  log2_denom += 1;
1231 
1232  tmp0 = __lasx_xvreplgr2vr_b(weight_src);
1233  tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
1234  wgt = __lasx_xvilvh_b(tmp1, tmp0);
1235  offset = __lasx_xvreplgr2vr_h(offset_in);
1236  denom = __lasx_xvreplgr2vr_h(log2_denom);
1237 
1238  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1239  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1240  src += stride_4x;
1241  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1242  src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1243  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1244  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1245  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1246  src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1247  tmp0 = __lasx_xvld(dst_tmp, 0);
1248  DUP2_ARG2(__lasx_xvldx, dst_tmp, stride, dst_tmp, stride_2x, tmp1, tmp2);
1249  tmp3 = __lasx_xvldx(dst_tmp, stride_3x);
1250  dst_tmp += stride_4x;
1251  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1252  dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1253  DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
1254  dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1255  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1256  dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1257 
1258  DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, dst0, 128, dst1, 128,
1259  src0, src1, dst0, dst1);
1260  DUP2_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, vec0, vec2);
1261  DUP2_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, vec1, vec3);
1262  DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1263  offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
1264  tmp0 = __lasx_xvsra_h(tmp0, denom);
1265  tmp1 = __lasx_xvsra_h(tmp1, denom);
1266  tmp2 = __lasx_xvsra_h(tmp2, denom);
1267  tmp3 = __lasx_xvsra_h(tmp3, denom);
1268  DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
1269  tmp0, tmp1, tmp2, tmp3);
1270  DUP2_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, dst0, dst1);
1271  __lasx_xvstelm_d(dst0, dst, 0, 0);
1272  __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
1273  __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
1274  __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
1275  dst += stride_4x;
1276  __lasx_xvstelm_d(dst1, dst, 0, 0);
1277  __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
1278  __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
1279  __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
1280 }
1281 
1282 static void avc_biwgt_8x16_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1283  int32_t log2_denom, int32_t weight_src,
1284  int32_t weight_dst, int32_t offset_in)
1285 {
1286  __m256i wgt, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1287  __m256i src0, src1, src2, src3, dst0, dst1, dst2, dst3;
1288  __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
1289  ptrdiff_t stride_2x = stride << 1;
1290  ptrdiff_t stride_4x = stride << 2;
1291  ptrdiff_t stride_3x = stride_2x + stride;
1292  uint8_t* dst_tmp = dst;
1293 
1294  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1295  offset_in += ((weight_src + weight_dst) << 7);
1296  log2_denom += 1;
1297 
1298  tmp0 = __lasx_xvreplgr2vr_b(weight_src);
1299  tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
1300  wgt = __lasx_xvilvh_b(tmp1, tmp0);
1301  offset = __lasx_xvreplgr2vr_h(offset_in);
1302  denom = __lasx_xvreplgr2vr_h(log2_denom);
1303 
1304  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1305  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1306  src += stride_4x;
1307  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1308  src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1309  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1310  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1311  src += stride_4x;
1312  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1313  src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1314  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1315  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1316  src += stride_4x;
1317  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1318  src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1319  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1320  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1321  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1322  src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1323 
1324  DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
1325  dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1326  dst_tmp += stride_4x;
1327  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1328  dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1329  DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
1330  dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1331  dst_tmp += stride_4x;
1332  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1333  dst1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1334  DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
1335  dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1336  dst_tmp += stride_4x;
1337  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1338  dst2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1339  DUP4_ARG2(__lasx_xvldx, dst_tmp, 0, dst_tmp, stride, dst_tmp, stride_2x,
1340  dst_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1341  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1342  dst3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1343 
1344  DUP4_ARG2(__lasx_xvxori_b, src0, 128, src1, 128, src2, 128, src3, 128,
1345  src0, src1, src2, src3);
1346  DUP4_ARG2(__lasx_xvxori_b, dst0, 128, dst1, 128, dst2, 128, dst3, 128,
1347  dst0, dst1, dst2, dst3);
1348  DUP4_ARG2(__lasx_xvilvl_b, dst0, src0, dst1, src1, dst2, src2,
1349  dst3, src3, vec0, vec2, vec4, vec6);
1350  DUP4_ARG2(__lasx_xvilvh_b, dst0, src0, dst1, src1, dst2, src2,
1351  dst3, src3, vec1, vec3, vec5, vec7);
1352  DUP4_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1353  offset, wgt, vec2, offset, wgt, vec3, tmp0, tmp1, tmp2, tmp3);
1354  DUP4_ARG3(__lasx_xvdp2add_h_b,offset, wgt, vec4, offset, wgt, vec5,
1355  offset, wgt, vec6, offset, wgt, vec7, tmp4, tmp5, tmp6, tmp7);
1356  tmp0 = __lasx_xvsra_h(tmp0, denom);
1357  tmp1 = __lasx_xvsra_h(tmp1, denom);
1358  tmp2 = __lasx_xvsra_h(tmp2, denom);
1359  tmp3 = __lasx_xvsra_h(tmp3, denom);
1360  tmp4 = __lasx_xvsra_h(tmp4, denom);
1361  tmp5 = __lasx_xvsra_h(tmp5, denom);
1362  tmp6 = __lasx_xvsra_h(tmp6, denom);
1363  tmp7 = __lasx_xvsra_h(tmp7, denom);
1364  DUP4_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp2, tmp3,
1365  tmp0, tmp1, tmp2, tmp3);
1366  DUP4_ARG1(__lasx_xvclip255_h, tmp4, tmp5, tmp6, tmp7,
1367  tmp4, tmp5, tmp6, tmp7);
1368  DUP4_ARG2(__lasx_xvpickev_b, tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
1369  dst0, dst1, dst2, dst3)
1370  __lasx_xvstelm_d(dst0, dst, 0, 0);
1371  __lasx_xvstelm_d(dst0, dst + stride, 0, 1);
1372  __lasx_xvstelm_d(dst0, dst + stride_2x, 0, 2);
1373  __lasx_xvstelm_d(dst0, dst + stride_3x, 0, 3);
1374  dst += stride_4x;
1375  __lasx_xvstelm_d(dst1, dst, 0, 0);
1376  __lasx_xvstelm_d(dst1, dst + stride, 0, 1);
1377  __lasx_xvstelm_d(dst1, dst + stride_2x, 0, 2);
1378  __lasx_xvstelm_d(dst1, dst + stride_3x, 0, 3);
1379  dst += stride_4x;
1380  __lasx_xvstelm_d(dst2, dst, 0, 0);
1381  __lasx_xvstelm_d(dst2, dst + stride, 0, 1);
1382  __lasx_xvstelm_d(dst2, dst + stride_2x, 0, 2);
1383  __lasx_xvstelm_d(dst2, dst + stride_3x, 0, 3);
1384  dst += stride_4x;
1385  __lasx_xvstelm_d(dst3, dst, 0, 0);
1386  __lasx_xvstelm_d(dst3, dst + stride, 0, 1);
1387  __lasx_xvstelm_d(dst3, dst + stride_2x, 0, 2);
1388  __lasx_xvstelm_d(dst3, dst + stride_3x, 0, 3);
1389 }
1390 
/**
 * Bi-weighted prediction for 8-pixel-wide blocks of 8-bit samples.
 *
 * Dispatches on height: 4 and 8 go to the dedicated 8x4 / 8x8 routines,
 * every other value is handled by the 8x16 routine.
 */
void ff_biweight_h264_pixels8_8_lasx(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride, int height,
                                     int log2_denom, int weight_dst,
                                     int weight_src, int offset)
{
    switch (height) {
    case 4:
        avc_biwgt_8x4_lasx(src, dst, stride, log2_denom, weight_src,
                           weight_dst, offset);
        break;
    case 8:
        avc_biwgt_8x8_lasx(src, dst, stride, log2_denom, weight_src,
                           weight_dst, offset);
        break;
    default:
        avc_biwgt_8x16_lasx(src, dst, stride, log2_denom, weight_src,
                            weight_dst, offset);
        break;
    }
}
1407 
1408 static void avc_biwgt_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1409  int32_t log2_denom, int32_t weight_src,
1410  int32_t weight_dst, int32_t offset_in)
1411 {
1412  __m256i wgt, vec0;
1413  __m256i src0, dst0;
1414  __m256i tmp0, tmp1, denom, offset;
1415 
1416  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1417  offset_in += ((weight_src + weight_dst) << 7);
1418  log2_denom += 1;
1419 
1420  tmp0 = __lasx_xvreplgr2vr_b(weight_src);
1421  tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
1422  wgt = __lasx_xvilvh_b(tmp1, tmp0);
1423  offset = __lasx_xvreplgr2vr_h(offset_in);
1424  denom = __lasx_xvreplgr2vr_h(log2_denom);
1425 
1426  DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1);
1427  src0 = __lasx_xvilvl_w(tmp1, tmp0);
1428  DUP2_ARG2(__lasx_xvldx, dst, 0, dst, stride, tmp0, tmp1);
1429  dst0 = __lasx_xvilvl_w(tmp1, tmp0);
1430  DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
1431  vec0 = __lasx_xvilvl_b(dst0, src0);
1432  tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
1433  tmp0 = __lasx_xvsra_h(tmp0, denom);
1434  tmp0 = __lasx_xvclip255_h(tmp0);
1435  tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
1436  __lasx_xvstelm_w(tmp0, dst, 0, 0);
1437  __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
1438 }
1439 
1440 static void avc_biwgt_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1441  int32_t log2_denom, int32_t weight_src,
1442  int32_t weight_dst, int32_t offset_in)
1443 {
1444  __m256i wgt, vec0;
1445  __m256i src0, dst0;
1446  __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
1447  ptrdiff_t stride_2x = stride << 1;
1448  ptrdiff_t stride_3x = stride_2x + stride;
1449 
1450  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1451  offset_in += ((weight_src + weight_dst) << 7);
1452  log2_denom += 1;
1453 
1454  tmp0 = __lasx_xvreplgr2vr_b(weight_src);
1455  tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
1456  wgt = __lasx_xvilvh_b(tmp1, tmp0);
1457  offset = __lasx_xvreplgr2vr_h(offset_in);
1458  denom = __lasx_xvreplgr2vr_h(log2_denom);
1459 
1460  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1461  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1462  DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
1463  src0 = __lasx_xvilvl_w(tmp1, tmp0);
1464  DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1465  dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
1466  DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
1467  dst0 = __lasx_xvilvl_w(tmp1, tmp0);
1468  DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
1469  vec0 = __lasx_xvilvl_b(dst0, src0);
1470  dst0 = __lasx_xvilvh_b(dst0, src0);
1471  vec0 = __lasx_xvpermi_q(vec0, dst0, 0x02);
1472  tmp0 = __lasx_xvdp2add_h_b(offset, wgt, vec0);
1473  tmp0 = __lasx_xvsra_h(tmp0, denom);
1474  tmp0 = __lasx_xvclip255_h(tmp0);
1475  tmp0 = __lasx_xvpickev_b(tmp0, tmp0);
1476  __lasx_xvstelm_w(tmp0, dst, 0, 0);
1477  __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
1478  __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 4);
1479  __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 5);
1480 }
1481 
1482 static void avc_biwgt_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
1483  int32_t log2_denom, int32_t weight_src,
1484  int32_t weight_dst, int32_t offset_in)
1485 {
1486  __m256i wgt, vec0, vec1;
1487  __m256i src0, dst0;
1488  __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
1489  ptrdiff_t stride_2x = stride << 1;
1490  ptrdiff_t stride_4x = stride << 2;
1491  ptrdiff_t stride_3x = stride_2x + stride;
1492 
1493  offset_in = (unsigned) ((offset_in + 1) | 1) << log2_denom;
1494  offset_in += ((weight_src + weight_dst) << 7);
1495  log2_denom += 1;
1496 
1497  tmp0 = __lasx_xvreplgr2vr_b(weight_src);
1498  tmp1 = __lasx_xvreplgr2vr_b(weight_dst);
1499  wgt = __lasx_xvilvh_b(tmp1, tmp0);
1500  offset = __lasx_xvreplgr2vr_h(offset_in);
1501  denom = __lasx_xvreplgr2vr_h(log2_denom);
1502 
1503  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1504  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1505  src += stride_4x;
1506  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1507  src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1508  DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
1509  tmp0, tmp1, tmp2, tmp3);
1510  DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1511  src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1512  DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1513  dst, stride_3x, tmp0, tmp1, tmp2, tmp3);
1514  dst += stride_4x;
1515  DUP4_ARG2(__lasx_xvldx, dst, 0, dst, stride, dst, stride_2x,
1516  dst, stride_3x, tmp4, tmp5, tmp6, tmp7);
1517  dst -= stride_4x;
1518  DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5,
1519  tmp0, tmp1, tmp2, tmp3);
1520  DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1521  dst0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1522  DUP2_ARG2(__lasx_xvxori_b, src0, 128, dst0, 128, src0, dst0);
1523  vec0 = __lasx_xvilvl_b(dst0, src0);
1524  vec1 = __lasx_xvilvh_b(dst0, src0);
1525  DUP2_ARG3(__lasx_xvdp2add_h_b, offset, wgt, vec0, offset, wgt, vec1,
1526  tmp0, tmp1);
1527  tmp0 = __lasx_xvsra_h(tmp0, denom);
1528  tmp1 = __lasx_xvsra_h(tmp1, denom);
1529  DUP2_ARG1(__lasx_xvclip255_h, tmp0, tmp1, tmp0, tmp1);
1530  tmp0 = __lasx_xvpickev_b(tmp1, tmp0);
1531  __lasx_xvstelm_w(tmp0, dst, 0, 0);
1532  __lasx_xvstelm_w(tmp0, dst + stride, 0, 1);
1533  __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 2);
1534  __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 3);
1535  dst += stride_4x;
1536  __lasx_xvstelm_w(tmp0, dst, 0, 4);
1537  __lasx_xvstelm_w(tmp0, dst + stride, 0, 5);
1538  __lasx_xvstelm_w(tmp0, dst + stride_2x, 0, 6);
1539  __lasx_xvstelm_w(tmp0, dst + stride_3x, 0, 7);
1540 }
1541 
/**
 * Bi-weighted prediction for 4-pixel-wide blocks of 8-bit samples.
 *
 * Dispatches on height: 2 and 4 go to the dedicated 4x2 / 4x4 routines,
 * every other value is handled by the 4x8 routine.
 */
void ff_biweight_h264_pixels4_8_lasx(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride, int height,
                                     int log2_denom, int weight_dst,
                                     int weight_src, int offset)
{
    switch (height) {
    case 2:
        avc_biwgt_4x2_lasx(src, dst, stride, log2_denom, weight_src,
                           weight_dst, offset);
        break;
    case 4:
        avc_biwgt_4x4_lasx(src, dst, stride, log2_denom, weight_src,
                           weight_dst, offset);
        break;
    default:
        avc_biwgt_4x8_lasx(src, dst, stride, log2_denom, weight_src,
                           weight_dst, offset);
        break;
    }
}
1558 
1559 void ff_weight_h264_pixels16_8_lasx(uint8_t *src, ptrdiff_t stride,
1560  int height, int log2_denom,
1561  int weight_src, int offset_in)
1562 {
1563  uint32_t offset_val;
1564  ptrdiff_t stride_2x = stride << 1;
1565  ptrdiff_t stride_4x = stride << 2;
1566  ptrdiff_t stride_3x = stride_2x + stride;
1567  __m256i zero = __lasx_xvldi(0);
1568  __m256i src0, src1, src2, src3;
1569  __m256i src0_l, src1_l, src2_l, src3_l, src0_h, src1_h, src2_h, src3_h;
1570  __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1571  __m256i wgt, denom, offset;
1572 
1573  offset_val = (unsigned) offset_in << log2_denom;
1574 
1575  wgt = __lasx_xvreplgr2vr_h(weight_src);
1576  offset = __lasx_xvreplgr2vr_h(offset_val);
1577  denom = __lasx_xvreplgr2vr_h(log2_denom);
1578 
1579  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1580  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1581  src += stride_4x;
1582  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1583  src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1584  src -= stride_4x;
1585  DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5, tmp4,
1586  0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
1587  DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
1588  zero, src3, src0_l, src1_l, src2_l, src3_l);
1589  DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
1590  zero, src3, src0_h, src1_h, src2_h, src3_h);
1591  src0_l = __lasx_xvmul_h(wgt, src0_l);
1592  src0_h = __lasx_xvmul_h(wgt, src0_h);
1593  src1_l = __lasx_xvmul_h(wgt, src1_l);
1594  src1_h = __lasx_xvmul_h(wgt, src1_h);
1595  src2_l = __lasx_xvmul_h(wgt, src2_l);
1596  src2_h = __lasx_xvmul_h(wgt, src2_h);
1597  src3_l = __lasx_xvmul_h(wgt, src3_l);
1598  src3_h = __lasx_xvmul_h(wgt, src3_h);
1599  DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
1600  src1_h, offset, src0_l, src0_h, src1_l, src1_h);
1601  DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
1602  src3_h, offset, src2_l, src2_h, src3_l, src3_h);
1603  src0_l = __lasx_xvmaxi_h(src0_l, 0);
1604  src0_h = __lasx_xvmaxi_h(src0_h, 0);
1605  src1_l = __lasx_xvmaxi_h(src1_l, 0);
1606  src1_h = __lasx_xvmaxi_h(src1_h, 0);
1607  src2_l = __lasx_xvmaxi_h(src2_l, 0);
1608  src2_h = __lasx_xvmaxi_h(src2_h, 0);
1609  src3_l = __lasx_xvmaxi_h(src3_l, 0);
1610  src3_h = __lasx_xvmaxi_h(src3_h, 0);
1611  src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1612  src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1613  src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
1614  src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
1615  src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
1616  src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
1617  src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
1618  src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
1619  __lasx_xvstelm_d(src0_l, src, 0, 0);
1620  __lasx_xvstelm_d(src0_h, src, 8, 0);
1621  src += stride;
1622  __lasx_xvstelm_d(src0_l, src, 0, 2);
1623  __lasx_xvstelm_d(src0_h, src, 8, 2);
1624  src += stride;
1625  __lasx_xvstelm_d(src1_l, src, 0, 0);
1626  __lasx_xvstelm_d(src1_h, src, 8, 0);
1627  src += stride;
1628  __lasx_xvstelm_d(src1_l, src, 0, 2);
1629  __lasx_xvstelm_d(src1_h, src, 8, 2);
1630  src += stride;
1631  __lasx_xvstelm_d(src2_l, src, 0, 0);
1632  __lasx_xvstelm_d(src2_h, src, 8, 0);
1633  src += stride;
1634  __lasx_xvstelm_d(src2_l, src, 0, 2);
1635  __lasx_xvstelm_d(src2_h, src, 8, 2);
1636  src += stride;
1637  __lasx_xvstelm_d(src3_l, src, 0, 0);
1638  __lasx_xvstelm_d(src3_h, src, 8, 0);
1639  src += stride;
1640  __lasx_xvstelm_d(src3_l, src, 0, 2);
1641  __lasx_xvstelm_d(src3_h, src, 8, 2);
1642  src += stride;
1643 
1644  if (16 == height) {
1645  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1646  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1647  src += stride_4x;
1648  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1649  src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1650  src -= stride_4x;
1651  DUP4_ARG3(__lasx_xvpermi_q, tmp1, tmp0, 0x20, tmp3, tmp2, 0x20, tmp5,
1652  tmp4, 0x20, tmp7, tmp6, 0x20, src0, src1, src2, src3);
1653  DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2,
1654  zero, src3, src0_l, src1_l, src2_l, src3_l);
1655  DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2,
1656  zero, src3, src0_h, src1_h, src2_h, src3_h);
1657  src0_l = __lasx_xvmul_h(wgt, src0_l);
1658  src0_h = __lasx_xvmul_h(wgt, src0_h);
1659  src1_l = __lasx_xvmul_h(wgt, src1_l);
1660  src1_h = __lasx_xvmul_h(wgt, src1_h);
1661  src2_l = __lasx_xvmul_h(wgt, src2_l);
1662  src2_h = __lasx_xvmul_h(wgt, src2_h);
1663  src3_l = __lasx_xvmul_h(wgt, src3_l);
1664  src3_h = __lasx_xvmul_h(wgt, src3_h);
1665  DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l,
1666  offset, src1_h, offset, src0_l, src0_h, src1_l, src1_h);
1667  DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l,
1668  offset, src3_h, offset, src2_l, src2_h, src3_l, src3_h);
1669  src0_l = __lasx_xvmaxi_h(src0_l, 0);
1670  src0_h = __lasx_xvmaxi_h(src0_h, 0);
1671  src1_l = __lasx_xvmaxi_h(src1_l, 0);
1672  src1_h = __lasx_xvmaxi_h(src1_h, 0);
1673  src2_l = __lasx_xvmaxi_h(src2_l, 0);
1674  src2_h = __lasx_xvmaxi_h(src2_h, 0);
1675  src3_l = __lasx_xvmaxi_h(src3_l, 0);
1676  src3_h = __lasx_xvmaxi_h(src3_h, 0);
1677  src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1678  src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1679  src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
1680  src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
1681  src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
1682  src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
1683  src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
1684  src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
1685  __lasx_xvstelm_d(src0_l, src, 0, 0);
1686  __lasx_xvstelm_d(src0_h, src, 8, 0);
1687  src += stride;
1688  __lasx_xvstelm_d(src0_l, src, 0, 2);
1689  __lasx_xvstelm_d(src0_h, src, 8, 2);
1690  src += stride;
1691  __lasx_xvstelm_d(src1_l, src, 0, 0);
1692  __lasx_xvstelm_d(src1_h, src, 8, 0);
1693  src += stride;
1694  __lasx_xvstelm_d(src1_l, src, 0, 2);
1695  __lasx_xvstelm_d(src1_h, src, 8, 2);
1696  src += stride;
1697  __lasx_xvstelm_d(src2_l, src, 0, 0);
1698  __lasx_xvstelm_d(src2_h, src, 8, 0);
1699  src += stride;
1700  __lasx_xvstelm_d(src2_l, src, 0, 2);
1701  __lasx_xvstelm_d(src2_h, src, 8, 2);
1702  src += stride;
1703  __lasx_xvstelm_d(src3_l, src, 0, 0);
1704  __lasx_xvstelm_d(src3_h, src, 8, 0);
1705  src += stride;
1706  __lasx_xvstelm_d(src3_l, src, 0, 2);
1707  __lasx_xvstelm_d(src3_h, src, 8, 2);
1708  }
1709 }
1710 
1711 static void avc_wgt_8x4_lasx(uint8_t *src, ptrdiff_t stride,
1712  int32_t log2_denom, int32_t weight_src,
1713  int32_t offset_in)
1714 {
1715  uint32_t offset_val;
1716  ptrdiff_t stride_2x = stride << 1;
1717  ptrdiff_t stride_3x = stride_2x + stride;
1718  __m256i wgt, zero = __lasx_xvldi(0);
1719  __m256i src0, src0_h, src0_l;
1720  __m256i tmp0, tmp1, tmp2, tmp3, denom, offset;
1721 
1722  offset_val = (unsigned) offset_in << log2_denom;
1723 
1724  wgt = __lasx_xvreplgr2vr_h(weight_src);
1725  offset = __lasx_xvreplgr2vr_h(offset_val);
1726  denom = __lasx_xvreplgr2vr_h(log2_denom);
1727 
1728  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1729  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1730  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1731  src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1732  src0_l = __lasx_xvilvl_b(zero, src0);
1733  src0_h = __lasx_xvilvh_b(zero, src0);
1734  src0_l = __lasx_xvmul_h(wgt, src0_l);
1735  src0_h = __lasx_xvmul_h(wgt, src0_h);
1736  src0_l = __lasx_xvsadd_h(src0_l, offset);
1737  src0_h = __lasx_xvsadd_h(src0_h, offset);
1738  src0_l = __lasx_xvmaxi_h(src0_l, 0);
1739  src0_h = __lasx_xvmaxi_h(src0_h, 0);
1740  src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1741  src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1742 
1743  src0 = __lasx_xvpickev_d(src0_h, src0_l);
1744  __lasx_xvstelm_d(src0, src, 0, 0);
1745  __lasx_xvstelm_d(src0, src + stride, 0, 1);
1746  __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
1747  __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
1748 }
1749 
1750 static void avc_wgt_8x8_lasx(uint8_t *src, ptrdiff_t stride, int32_t log2_denom,
1751  int32_t src_weight, int32_t offset_in)
1752 {
1753  __m256i src0, src1, src0_h, src0_l, src1_h, src1_l, zero = __lasx_xvldi(0);
1754  __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
1755  uint32_t offset_val;
1756  uint8_t* src_tmp = src;
1757  ptrdiff_t stride_2x = stride << 1;
1758  ptrdiff_t stride_4x = stride << 2;
1759  ptrdiff_t stride_3x = stride_2x + stride;
1760 
1761  offset_val = (unsigned) offset_in << log2_denom;
1762 
1763  wgt = __lasx_xvreplgr2vr_h(src_weight);
1764  offset = __lasx_xvreplgr2vr_h(offset_val);
1765  denom = __lasx_xvreplgr2vr_h(log2_denom);
1766 
1767  DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1768  src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1769  src_tmp += stride_4x;
1770  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1771  src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1772  DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1773  src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1774  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1775  src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1776  DUP2_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, src0_l, src1_l);
1777  DUP2_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, src0_h, src1_h);
1778  src0_l = __lasx_xvmul_h(wgt, src0_l);
1779  src0_h = __lasx_xvmul_h(wgt, src0_h);
1780  src1_l = __lasx_xvmul_h(wgt, src1_l);
1781  src1_h = __lasx_xvmul_h(wgt, src1_h);
1782  DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
1783  src1_h, offset, src0_l, src0_h, src1_l, src1_h);
1784  src0_l = __lasx_xvmaxi_h(src0_l, 0);
1785  src0_h = __lasx_xvmaxi_h(src0_h, 0);
1786  src1_l = __lasx_xvmaxi_h(src1_l, 0);
1787  src1_h = __lasx_xvmaxi_h(src1_h, 0);
1788  src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1789  src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1790  src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
1791  src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
1792 
1793  DUP2_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src0, src1);
1794  __lasx_xvstelm_d(src0, src, 0, 0);
1795  __lasx_xvstelm_d(src0, src + stride, 0, 1);
1796  __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
1797  __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
1798  src += stride_4x;
1799  __lasx_xvstelm_d(src1, src, 0, 0);
1800  __lasx_xvstelm_d(src1, src + stride, 0, 1);
1801  __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
1802  __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
1803 }
1804 
1805 static void avc_wgt_8x16_lasx(uint8_t *src, ptrdiff_t stride,
1806  int32_t log2_denom, int32_t src_weight,
1807  int32_t offset_in)
1808 {
1809  __m256i src0, src1, src2, src3;
1810  __m256i src0_h, src0_l, src1_h, src1_l, src2_h, src2_l, src3_h, src3_l;
1811  __m256i tmp0, tmp1, tmp2, tmp3, denom, offset, wgt;
1812  __m256i zero = __lasx_xvldi(0);
1813  uint32_t offset_val;
1814  uint8_t* src_tmp = src;
1815  ptrdiff_t stride_2x = stride << 1;
1816  ptrdiff_t stride_4x = stride << 2;
1817  ptrdiff_t stride_3x = stride_2x + stride;
1818 
1819  offset_val = (unsigned) offset_in << log2_denom;
1820 
1821  wgt = __lasx_xvreplgr2vr_h(src_weight);
1822  offset = __lasx_xvreplgr2vr_h(offset_val);
1823  denom = __lasx_xvreplgr2vr_h(log2_denom);
1824 
1825  DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1826  src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1827  src_tmp += stride_4x;
1828  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1829  src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1830  DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1831  src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1832  src_tmp += stride_4x;
1833  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1834  src1 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1835  DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1836  src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1837  src_tmp += stride_4x;
1838  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1839  src2 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1840  DUP4_ARG2(__lasx_xvldx, src_tmp, 0, src_tmp, stride, src_tmp, stride_2x,
1841  src_tmp, stride_3x, tmp0, tmp1, tmp2, tmp3);
1842  DUP2_ARG2(__lasx_xvilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1843  src3 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
1844 
1845  DUP4_ARG2(__lasx_xvilvl_b, zero, src0, zero, src1, zero, src2, zero, src3,
1846  src0_l, src1_l, src2_l, src3_l);
1847  DUP4_ARG2(__lasx_xvilvh_b, zero, src0, zero, src1, zero, src2, zero, src3,
1848  src0_h, src1_h, src2_h, src3_h);
1849  src0_l = __lasx_xvmul_h(wgt, src0_l);
1850  src0_h = __lasx_xvmul_h(wgt, src0_h);
1851  src1_l = __lasx_xvmul_h(wgt, src1_l);
1852  src1_h = __lasx_xvmul_h(wgt, src1_h);
1853  src2_l = __lasx_xvmul_h(wgt, src2_l);
1854  src2_h = __lasx_xvmul_h(wgt, src2_h);
1855  src3_l = __lasx_xvmul_h(wgt, src3_l);
1856  src3_h = __lasx_xvmul_h(wgt, src3_h);
1857 
1858  DUP4_ARG2(__lasx_xvsadd_h, src0_l, offset, src0_h, offset, src1_l, offset,
1859  src1_h, offset, src0_l, src0_h, src1_l, src1_h);
1860  DUP4_ARG2(__lasx_xvsadd_h, src2_l, offset, src2_h, offset, src3_l, offset,
1861  src3_h, offset, src2_l, src2_h, src3_l, src3_h);
1862 
1863  src0_l = __lasx_xvmaxi_h(src0_l, 0);
1864  src0_h = __lasx_xvmaxi_h(src0_h, 0);
1865  src1_l = __lasx_xvmaxi_h(src1_l, 0);
1866  src1_h = __lasx_xvmaxi_h(src1_h, 0);
1867  src2_l = __lasx_xvmaxi_h(src2_l, 0);
1868  src2_h = __lasx_xvmaxi_h(src2_h, 0);
1869  src3_l = __lasx_xvmaxi_h(src3_l, 0);
1870  src3_h = __lasx_xvmaxi_h(src3_h, 0);
1871  src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
1872  src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
1873  src1_l = __lasx_xvssrlrn_bu_h(src1_l, denom);
1874  src1_h = __lasx_xvssrlrn_bu_h(src1_h, denom);
1875  src2_l = __lasx_xvssrlrn_bu_h(src2_l, denom);
1876  src2_h = __lasx_xvssrlrn_bu_h(src2_h, denom);
1877  src3_l = __lasx_xvssrlrn_bu_h(src3_l, denom);
1878  src3_h = __lasx_xvssrlrn_bu_h(src3_h, denom);
1879  DUP4_ARG2(__lasx_xvpickev_d, src0_h, src0_l, src1_h, src1_l, src2_h, src2_l,
1880  src3_h, src3_l, src0, src1, src2, src3);
1881 
1882  __lasx_xvstelm_d(src0, src, 0, 0);
1883  __lasx_xvstelm_d(src0, src + stride, 0, 1);
1884  __lasx_xvstelm_d(src0, src + stride_2x, 0, 2);
1885  __lasx_xvstelm_d(src0, src + stride_3x, 0, 3);
1886  src += stride_4x;
1887  __lasx_xvstelm_d(src1, src, 0, 0);
1888  __lasx_xvstelm_d(src1, src + stride, 0, 1);
1889  __lasx_xvstelm_d(src1, src + stride_2x, 0, 2);
1890  __lasx_xvstelm_d(src1, src + stride_3x, 0, 3);
1891  src += stride_4x;
1892  __lasx_xvstelm_d(src2, src, 0, 0);
1893  __lasx_xvstelm_d(src2, src + stride, 0, 1);
1894  __lasx_xvstelm_d(src2, src + stride_2x, 0, 2);
1895  __lasx_xvstelm_d(src2, src + stride_3x, 0, 3);
1896  src += stride_4x;
1897  __lasx_xvstelm_d(src3, src, 0, 0);
1898  __lasx_xvstelm_d(src3, src + stride, 0, 1);
1899  __lasx_xvstelm_d(src3, src + stride_2x, 0, 2);
1900  __lasx_xvstelm_d(src3, src + stride_3x, 0, 3);
1901 }
1902 
/*
 * Weighted prediction entry point for 8-pixel-wide blocks.
 *
 * Heights 4 and 8 use dedicated kernels; every other height is handled
 * by the 8x16 kernel.
 */
void ff_weight_h264_pixels8_8_lasx(uint8_t *src, ptrdiff_t stride,
                                   int height, int log2_denom,
                                   int weight_src, int offset)
{
    switch (height) {
    case 4:
        avc_wgt_8x4_lasx(src, stride, log2_denom, weight_src, offset);
        break;
    case 8:
        avc_wgt_8x8_lasx(src, stride, log2_denom, weight_src, offset);
        break;
    default:
        avc_wgt_8x16_lasx(src, stride, log2_denom, weight_src, offset);
        break;
    }
}
1915 
1916 static void avc_wgt_4x2_lasx(uint8_t *src, ptrdiff_t stride,
1917  int32_t log2_denom, int32_t weight_src,
1918  int32_t offset_in)
1919 {
1920  uint32_t offset_val;
1921  __m256i wgt, zero = __lasx_xvldi(0);
1922  __m256i src0, tmp0, tmp1, denom, offset;
1923 
1924  offset_val = (unsigned) offset_in << log2_denom;
1925 
1926  wgt = __lasx_xvreplgr2vr_h(weight_src);
1927  offset = __lasx_xvreplgr2vr_h(offset_val);
1928  denom = __lasx_xvreplgr2vr_h(log2_denom);
1929 
1930  DUP2_ARG2(__lasx_xvldx, src, 0, src, stride, tmp0, tmp1);
1931  src0 = __lasx_xvilvl_w(tmp1, tmp0);
1932  src0 = __lasx_xvilvl_b(zero, src0);
1933  src0 = __lasx_xvmul_h(wgt, src0);
1934  src0 = __lasx_xvsadd_h(src0, offset);
1935  src0 = __lasx_xvmaxi_h(src0, 0);
1936  src0 = __lasx_xvssrlrn_bu_h(src0, denom);
1937  __lasx_xvstelm_w(src0, src, 0, 0);
1938  __lasx_xvstelm_w(src0, src + stride, 0, 1);
1939 }
1940 
1941 static void avc_wgt_4x4_lasx(uint8_t *src, ptrdiff_t stride,
1942  int32_t log2_denom, int32_t weight_src,
1943  int32_t offset_in)
1944 {
1945  __m256i wgt;
1946  __m256i src0, tmp0, tmp1, tmp2, tmp3, denom, offset;
1947  uint32_t offset_val;
1948  ptrdiff_t stride_2x = stride << 1;
1949  ptrdiff_t stride_3x = stride_2x + stride;
1950 
1951  offset_val = (unsigned) offset_in << log2_denom;
1952 
1953  wgt = __lasx_xvreplgr2vr_h(weight_src);
1954  offset = __lasx_xvreplgr2vr_h(offset_val);
1955  denom = __lasx_xvreplgr2vr_h(log2_denom);
1956 
1957  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1958  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1959  DUP2_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
1960  src0 = __lasx_xvilvl_w(tmp1, tmp0);
1961  src0 = __lasx_vext2xv_hu_bu(src0);
1962  src0 = __lasx_xvmul_h(wgt, src0);
1963  src0 = __lasx_xvsadd_h(src0, offset);
1964  src0 = __lasx_xvmaxi_h(src0, 0);
1965  src0 = __lasx_xvssrlrn_bu_h(src0, denom);
1966  __lasx_xvstelm_w(src0, src, 0, 0);
1967  __lasx_xvstelm_w(src0, src + stride, 0, 1);
1968  __lasx_xvstelm_w(src0, src + stride_2x, 0, 4);
1969  __lasx_xvstelm_w(src0, src + stride_3x, 0, 5);
1970 }
1971 
1972 static void avc_wgt_4x8_lasx(uint8_t *src, ptrdiff_t stride,
1973  int32_t log2_denom, int32_t weight_src,
1974  int32_t offset_in)
1975 {
1976  __m256i src0, src0_h, src0_l;
1977  __m256i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, denom, offset;
1978  __m256i wgt, zero = __lasx_xvldi(0);
1979  uint32_t offset_val;
1980  ptrdiff_t stride_2x = stride << 1;
1981  ptrdiff_t stride_4x = stride << 2;
1982  ptrdiff_t stride_3x = stride_2x + stride;
1983 
1984  offset_val = (unsigned) offset_in << log2_denom;
1985 
1986  wgt = __lasx_xvreplgr2vr_h(weight_src);
1987  offset = __lasx_xvreplgr2vr_h(offset_val);
1988  denom = __lasx_xvreplgr2vr_h(log2_denom);
1989 
1990  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1991  src, stride_3x, tmp0, tmp1, tmp2, tmp3);
1992  src += stride_4x;
1993  DUP4_ARG2(__lasx_xvldx, src, 0, src, stride, src, stride_2x,
1994  src, stride_3x, tmp4, tmp5, tmp6, tmp7);
1995  src -= stride_4x;
1996  DUP4_ARG2(__lasx_xvilvl_w, tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7,
1997  tmp5, tmp0, tmp1, tmp2, tmp3);
1998  DUP2_ARG2(__lasx_xvilvl_w, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
1999  src0 = __lasx_xvpermi_q(tmp1, tmp0, 0x20);
2000  src0_l = __lasx_xvilvl_b(zero, src0);
2001  src0_h = __lasx_xvilvh_b(zero, src0);
2002  src0_l = __lasx_xvmul_h(wgt, src0_l);
2003  src0_h = __lasx_xvmul_h(wgt, src0_h);
2004  src0_l = __lasx_xvsadd_h(src0_l, offset);
2005  src0_h = __lasx_xvsadd_h(src0_h, offset);
2006  src0_l = __lasx_xvmaxi_h(src0_l, 0);
2007  src0_h = __lasx_xvmaxi_h(src0_h, 0);
2008  src0_l = __lasx_xvssrlrn_bu_h(src0_l, denom);
2009  src0_h = __lasx_xvssrlrn_bu_h(src0_h, denom);
2010  __lasx_xvstelm_w(src0_l, src, 0, 0);
2011  __lasx_xvstelm_w(src0_l, src + stride, 0, 1);
2012  __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 0);
2013  __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 1);
2014  src += stride_4x;
2015  __lasx_xvstelm_w(src0_l, src, 0, 4);
2016  __lasx_xvstelm_w(src0_l, src + stride, 0, 5);
2017  __lasx_xvstelm_w(src0_h, src + stride_2x, 0, 4);
2018  __lasx_xvstelm_w(src0_h, src + stride_3x, 0, 5);
2019 }
2020 
/*
 * Weighted prediction entry point for 4-pixel-wide blocks.
 *
 * Heights 2 and 4 use dedicated kernels; every other height is handled
 * by the 4x8 kernel.
 */
void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride,
                                   int height, int log2_denom,
                                   int weight_src, int offset)
{
    switch (height) {
    case 2:
        avc_wgt_4x2_lasx(src, stride, log2_denom, weight_src, offset);
        break;
    case 4:
        avc_wgt_4x4_lasx(src, stride, log2_denom, weight_src, offset);
        break;
    default:
        avc_wgt_4x8_lasx(src, stride, log2_denom, weight_src, offset);
        break;
    }
}
2033 
2034 void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
2035 {
2036  __m256i src0, dst0, dst1, dst2, dst3, zero;
2037  __m256i tmp0, tmp1;
2038  uint8_t* _dst1 = _dst + stride;
2039  uint8_t* _dst2 = _dst1 + stride;
2040  uint8_t* _dst3 = _dst2 + stride;
2041 
2042  src0 = __lasx_xvld(_src, 0);
2043  dst0 = __lasx_xvldrepl_w(_dst, 0);
2044  dst1 = __lasx_xvldrepl_w(_dst1, 0);
2045  dst2 = __lasx_xvldrepl_w(_dst2, 0);
2046  dst3 = __lasx_xvldrepl_w(_dst3, 0);
2047  tmp0 = __lasx_xvilvl_w(dst1, dst0);
2048  tmp1 = __lasx_xvilvl_w(dst3, dst2);
2049  dst0 = __lasx_xvilvl_d(tmp1, tmp0);
2050  tmp0 = __lasx_vext2xv_hu_bu(dst0);
2051  zero = __lasx_xvldi(0);
2052  tmp1 = __lasx_xvadd_h(src0, tmp0);
2053  dst0 = __lasx_xvpickev_b(tmp1, tmp1);
2054  __lasx_xvstelm_w(dst0, _dst, 0, 0);
2055  __lasx_xvstelm_w(dst0, _dst1, 0, 1);
2056  __lasx_xvstelm_w(dst0, _dst2, 0, 4);
2057  __lasx_xvstelm_w(dst0, _dst3, 0, 5);
2058  __lasx_xvst(zero, _src, 0);
2059 }
2060 
2061 void ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
2062 {
2063  __m256i src0, src1, src2, src3;
2064  __m256i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2065  __m256i tmp0, tmp1, tmp2, tmp3;
2066  __m256i zero = __lasx_xvldi(0);
2067  uint8_t *_dst1 = _dst + stride;
2068  uint8_t *_dst2 = _dst1 + stride;
2069  uint8_t *_dst3 = _dst2 + stride;
2070  uint8_t *_dst4 = _dst3 + stride;
2071  uint8_t *_dst5 = _dst4 + stride;
2072  uint8_t *_dst6 = _dst5 + stride;
2073  uint8_t *_dst7 = _dst6 + stride;
2074 
2075  src0 = __lasx_xvld(_src, 0);
2076  src1 = __lasx_xvld(_src, 32);
2077  src2 = __lasx_xvld(_src, 64);
2078  src3 = __lasx_xvld(_src, 96);
2079  dst0 = __lasx_xvldrepl_d(_dst, 0);
2080  dst1 = __lasx_xvldrepl_d(_dst1, 0);
2081  dst2 = __lasx_xvldrepl_d(_dst2, 0);
2082  dst3 = __lasx_xvldrepl_d(_dst3, 0);
2083  dst4 = __lasx_xvldrepl_d(_dst4, 0);
2084  dst5 = __lasx_xvldrepl_d(_dst5, 0);
2085  dst6 = __lasx_xvldrepl_d(_dst6, 0);
2086  dst7 = __lasx_xvldrepl_d(_dst7, 0);
2087  tmp0 = __lasx_xvilvl_d(dst1, dst0);
2088  tmp1 = __lasx_xvilvl_d(dst3, dst2);
2089  tmp2 = __lasx_xvilvl_d(dst5, dst4);
2090  tmp3 = __lasx_xvilvl_d(dst7, dst6);
2091  dst0 = __lasx_vext2xv_hu_bu(tmp0);
2092  dst1 = __lasx_vext2xv_hu_bu(tmp1);
2093  dst1 = __lasx_vext2xv_hu_bu(tmp1);
2094  dst2 = __lasx_vext2xv_hu_bu(tmp2);
2095  dst3 = __lasx_vext2xv_hu_bu(tmp3);
2096  tmp0 = __lasx_xvadd_h(src0, dst0);
2097  tmp1 = __lasx_xvadd_h(src1, dst1);
2098  tmp2 = __lasx_xvadd_h(src2, dst2);
2099  tmp3 = __lasx_xvadd_h(src3, dst3);
2100  dst1 = __lasx_xvpickev_b(tmp1, tmp0);
2101  dst2 = __lasx_xvpickev_b(tmp3, tmp2);
2102  __lasx_xvst(zero, _src, 0);
2103  __lasx_xvst(zero, _src, 32);
2104  __lasx_xvst(zero, _src, 64);
2105  __lasx_xvst(zero, _src, 96);
2106  __lasx_xvstelm_d(dst1, _dst, 0, 0);
2107  __lasx_xvstelm_d(dst1, _dst1, 0, 2);
2108  __lasx_xvstelm_d(dst1, _dst2, 0, 1);
2109  __lasx_xvstelm_d(dst1, _dst3, 0, 3);
2110  __lasx_xvstelm_d(dst2, _dst4, 0, 0);
2111  __lasx_xvstelm_d(dst2, _dst5, 0, 2);
2112  __lasx_xvstelm_d(dst2, _dst6, 0, 1);
2113  __lasx_xvstelm_d(dst2, _dst7, 0, 3);
2114 }
avc_biwgt_8x16_lasx
static void avc_biwgt_8x16_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, int32_t log2_denom, int32_t weight_src, int32_t weight_dst, int32_t offset_in)
Definition: h264dsp_lasx.c:1282
AVC_LPF_P1_OR_Q1
#define AVC_LPF_P1_OR_Q1(p0_or_q0_org_in, q0_or_p0_org_in, p1_or_q1_org_in, p2_or_q2_org_in, neg_tc_in, tc_in, p1_or_q1_out)
Definition: h264dsp_lasx.c:28
src1
const pixel * src1
Definition: h264pred_template.c:421
ff_biweight_h264_pixels4_8_lasx
void ff_biweight_h264_pixels4_8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weight_dst, int weight_src, int offset)
Definition: h264dsp_lasx.c:1542
data
const char data[16]
Definition: mxf.c:146
DUP2_ARG2
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1)
Definition: loongson_intrinsics.h:58
ff_h264_v_lpf_chroma_8_lasx
void ff_h264_v_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width, int alpha_in, int beta_in, int8_t *tc)
Definition: h264dsp_lasx.c:474
avc_biwgt_8x4_lasx
static void avc_biwgt_8x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, int32_t log2_denom, int32_t weight_src, int32_t weight_dst, int32_t offset_in)
Definition: h264dsp_lasx.c:1173
ff_weight_h264_pixels16_8_lasx
void ff_weight_h264_pixels16_8_lasx(uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weight_src, int offset_in)
Definition: h264dsp_lasx.c:1559
avc_wgt_8x16_lasx
static void avc_wgt_8x16_lasx(uint8_t *src, ptrdiff_t stride, int32_t log2_denom, int32_t src_weight, int32_t offset_in)
Definition: h264dsp_lasx.c:1805
avc_biwgt_4x4_lasx
static void avc_biwgt_4x4_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, int32_t log2_denom, int32_t weight_src, int32_t weight_dst, int32_t offset_in)
Definition: h264dsp_lasx.c:1440
AVC_LPF_P0_OR_Q0
#define AVC_LPF_P0_OR_Q0(p0_or_q0_org_in, q1_or_p1_org_in, p1_or_q1_org_in, p0_or_q0_out)
Definition: h264dsp_lasx.c:573
avc_biwgt_8x8_lasx
static void avc_biwgt_8x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, int32_t log2_denom, int32_t weight_src, int32_t weight_dst, int32_t offset_in)
Definition: h264dsp_lasx.c:1216
avc_wgt_4x4_lasx
static void avc_wgt_4x4_lasx(uint8_t *src, ptrdiff_t stride, int32_t log2_denom, int32_t weight_src, int32_t offset_in)
Definition: h264dsp_lasx.c:1941
ff_weight_h264_pixels8_8_lasx
void ff_weight_h264_pixels8_8_lasx(uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weight_src, int offset)
Definition: h264dsp_lasx.c:1903
DUP4_ARG2
#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, _OUT1, _OUT2, _OUT3)
Definition: loongson_intrinsics.h:76
avc_wgt_4x2_lasx
static void avc_wgt_4x2_lasx(uint8_t *src, ptrdiff_t stride, int32_t log2_denom, int32_t weight_src, int32_t offset_in)
Definition: h264dsp_lasx.c:1916
avc_biwgt_4x2_lasx
static void avc_biwgt_4x2_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, int32_t log2_denom, int32_t weight_src, int32_t weight_dst, int32_t offset_in)
Definition: h264dsp_lasx.c:1408
ff_weight_h264_pixels4_8_lasx
void ff_weight_h264_pixels4_8_lasx(uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weight_src, int offset)
Definition: h264dsp_lasx.c:2021
ff_h264_v_lpf_chroma_intra_8_lasx
void ff_h264_v_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width, int alpha_in, int beta_in)
Definition: h264dsp_lasx.c:960
ff_h264_h_lpf_luma_intra_8_lasx
void ff_h264_h_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width, int alpha_in, int beta_in)
Definition: h264dsp_lasx.c:584
ff_h264_h_lpf_chroma_8_lasx
void ff_h264_h_lpf_chroma_8_lasx(uint8_t *data, ptrdiff_t img_width, int alpha_in, int beta_in, int8_t *tc)
Definition: h264dsp_lasx.c:366
DUP4_ARG1
#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3)
Definition: loongson_intrinsics.h:70
DUP2_ARG1
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1)
Definition: loongson_intrinsics.h:52
DUP2_ARG3
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1)
Definition: loongson_intrinsics.h:64
ff_h264_v_lpf_luma_intra_8_lasx
void ff_h264_v_lpf_luma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width, int alpha_in, int beta_in)
Definition: h264dsp_lasx.c:760
height
#define height
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
avc_wgt_8x4_lasx
static void avc_wgt_8x4_lasx(uint8_t *src, ptrdiff_t stride, int32_t log2_denom, int32_t weight_src, int32_t offset_in)
Definition: h264dsp_lasx.c:1711
ff_h264_add_pixels8_8_lasx
void ff_h264_add_pixels8_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
Definition: h264dsp_lasx.c:2061
AVC_LPF_P0P1P2_OR_Q0Q1Q2
#define AVC_LPF_P0P1P2_OR_Q0Q1Q2(p3_or_q3_org_in, p0_or_q0_org_in, q3_or_p3_org_in, p1_or_q1_org_in, p2_or_q2_org_in, q1_or_p1_org_in, p0_or_q0_out, p1_or_q1_out, p2_or_q2_out)
Definition: h264dsp_lasx.c:544
avc_wgt_4x8_lasx
static void avc_wgt_4x8_lasx(uint8_t *src, ptrdiff_t stride, int32_t log2_denom, int32_t weight_src, int32_t offset_in)
Definition: h264dsp_lasx.c:1972
avc_wgt_8x8_lasx
static void avc_wgt_8x8_lasx(uint8_t *src, ptrdiff_t stride, int32_t log2_denom, int32_t src_weight, int32_t offset_in)
Definition: h264dsp_lasx.c:1750
src2
const pixel * src2
Definition: h264pred_template.c:422
stride
#define stride
Definition: h264pred_template.c:537
ff_biweight_h264_pixels16_8_lasx
void ff_biweight_h264_pixels16_8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weight_dst, int weight_src, int offset_in)
Definition: h264dsp_lasx.c:1004
ff_biweight_h264_pixels8_8_lasx
void ff_biweight_h264_pixels8_8_lasx(uint8_t *dst, uint8_t *src, ptrdiff_t stride, int height, int log2_denom, int weight_dst, int weight_src, int offset)
Definition: h264dsp_lasx.c:1391
avc_biwgt_4x8_lasx
static void avc_biwgt_4x8_lasx(uint8_t *src, uint8_t *dst, ptrdiff_t stride, int32_t log2_denom, int32_t weight_src, int32_t weight_dst, int32_t offset_in)
Definition: h264dsp_lasx.c:1482
ff_h264_v_lpf_luma_8_lasx
void ff_h264_v_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width, int alpha_in, int beta_in, int8_t *tc)
Definition: h264dsp_lasx.c:244
AVC_LPF_P0Q0
#define AVC_LPF_P0Q0(q0_or_p0_org_in, p0_or_q0_org_in, p1_or_q1_org_in, q1_or_p1_org_in, neg_threshold_in, threshold_in, p0_or_q0_out, q0_or_p0_out)
Definition: h264dsp_lasx.c:43
ff_h264_h_lpf_chroma_intra_8_lasx
void ff_h264_h_lpf_chroma_intra_8_lasx(uint8_t *data, ptrdiff_t img_width, int alpha_in, int beta_in)
Definition: h264dsp_lasx.c:880
src0
const pixel *const src0
Definition: h264pred_template.c:420
tc
#define tc
Definition: regdef.h:69
zero
#define zero
Definition: regdef.h:64
ff_h264_add_pixels4_8_lasx
void ff_h264_add_pixels4_8_lasx(uint8_t *_dst, int16_t *_src, int stride)
Definition: h264dsp_lasx.c:2034
loongson_intrinsics.h
h264dsp_lasx.h
alpha
static const int16_t alpha[]
Definition: ilbcdata.h:55
ff_h264_h_lpf_luma_8_lasx
void ff_h264_h_lpf_luma_8_lasx(uint8_t *data, ptrdiff_t img_width, int alpha_in, int beta_in, int8_t *tc)
Definition: h264dsp_lasx.c:67
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
int32_t
int32_t
Definition: audioconvert.c:56
DUP4_ARG3
#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3)
Definition: loongson_intrinsics.h:83