FFmpeg
output_lasx.c
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2022 Loongson Technology Corporation Limited
3  * Contributed by Hao Chen(chenhao@loongson.cn)
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include "swscale_loongarch.h"
24 
25 void yuv2planeX_8_lasx(const int16_t *filter, int filterSize,
26  const int16_t **src, uint8_t *dest, int dstW,
27  const uint8_t *dither, int offset)
28 {
29  int i;
30  int len = dstW - 15;
31  __m256i mask = {0x1C0C180814041000, 0x1C1814100C080400,
32  0x1C0C180814041000, 0x1C1814100C080400};
33  __m256i val1, val2, val3;
34  uint8_t dither0 = dither[offset & 7];
35  uint8_t dither1 = dither[(offset + 1) & 7];
36  uint8_t dither2 = dither[(offset + 2) & 7];
37  uint8_t dither3 = dither[(offset + 3) & 7];
38  uint8_t dither4 = dither[(offset + 4) & 7];
39  uint8_t dither5 = dither[(offset + 5) & 7];
40  uint8_t dither6 = dither[(offset + 6) & 7];
41  uint8_t dither7 = dither[(offset + 7) & 7];
42  int val_1[8] = {dither0, dither2, dither4, dither6,
43  dither0, dither2, dither4, dither6};
44  int val_2[8] = {dither1, dither3, dither5, dither7,
45  dither1, dither3, dither5, dither7};
46  int val_3[8] = {dither0, dither1, dither2, dither3,
47  dither4, dither5, dither6, dither7};
48 
49  DUP2_ARG2(__lasx_xvld, val_1, 0, val_2, 0, val1, val2);
50  val3 = __lasx_xvld(val_3, 0);
51 
52  for (i = 0; i < len; i += 16) {
53  int j;
54  __m256i src0, filter0, val;
55  __m256i val_ev, val_od;
56 
57  val_ev = __lasx_xvslli_w(val1, 12);
58  val_od = __lasx_xvslli_w(val2, 12);
59 
60  for (j = 0; j < filterSize; j++) {
61  src0 = __lasx_xvld(src[j]+ i, 0);
62  filter0 = __lasx_xvldrepl_h((filter + j), 0);
63  val_ev = __lasx_xvmaddwev_w_h(val_ev, src0, filter0);
64  val_od = __lasx_xvmaddwod_w_h(val_od, src0, filter0);
65  }
66  val_ev = __lasx_xvsrai_w(val_ev, 19);
67  val_od = __lasx_xvsrai_w(val_od, 19);
68  val_ev = __lasx_xvclip255_w(val_ev);
69  val_od = __lasx_xvclip255_w(val_od);
70  val = __lasx_xvshuf_b(val_od, val_ev, mask);
71  __lasx_xvstelm_d(val, (dest + i), 0, 0);
72  __lasx_xvstelm_d(val, (dest + i), 8, 2);
73  }
74  if (dstW - i >= 8){
75  int j;
76  __m256i src0, filter0, val_h;
77  __m256i val_l;
78 
79  val_l = __lasx_xvslli_w(val3, 12);
80 
81  for (j = 0; j < filterSize; j++) {
82  src0 = __lasx_xvld(src[j] + i, 0);
83  src0 = __lasx_vext2xv_w_h(src0);
84  filter0 = __lasx_xvldrepl_h((filter + j), 0);
85  filter0 = __lasx_vext2xv_w_h(filter0);
86  val_l = __lasx_xvmadd_w(val_l, src0, filter0);
87  }
88  val_l = __lasx_xvsrai_w(val_l, 19);
89  val_l = __lasx_xvclip255_w(val_l);
90  val_h = __lasx_xvpermi_d(val_l, 0x4E);
91  val_l = __lasx_xvshuf_b(val_h, val_l, mask);
92  __lasx_xvstelm_d(val_l, (dest + i), 0, 1);
93  i += 8;
94  }
95  for (; i < dstW; i++) {
96  int val = dither[(i + offset) & 7] << 12;
97  int j;
98  for (j = 0; j< filterSize; j++)
99  val += src[j][i] * filter[j];
100 
101  dest[i] = av_clip_uint8(val >> 19);
102  }
103 }
104 
105 /*Copy from libswscale/output.c*/
106 static av_always_inline void
107 yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
108  unsigned A1, unsigned A2,
109  const void *_r, const void *_g, const void *_b, int y,
110  enum AVPixelFormat target, int hasAlpha)
111 {
112  if (target == AV_PIX_FMT_ARGB || target == AV_PIX_FMT_RGBA ||
113  target == AV_PIX_FMT_ABGR || target == AV_PIX_FMT_BGRA) {
114  uint32_t *dest = (uint32_t *) _dest;
115  const uint32_t *r = (const uint32_t *) _r;
116  const uint32_t *g = (const uint32_t *) _g;
117  const uint32_t *b = (const uint32_t *) _b;
118 
119 #if CONFIG_SMALL
120  dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
121  dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
122 #else
123 #if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
124  int sh = (target == AV_PIX_FMT_RGB32_1 ||
125  target == AV_PIX_FMT_BGR32_1) ? 0 : 24;
126  av_assert2((((r[Y1] + g[Y1] + b[Y1]) >> sh) & 0xFF) == 0xFF);
127 #endif
128  dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
129  dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
130 #endif
131  } else if (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) {
132  uint8_t *dest = (uint8_t *) _dest;
133  const uint8_t *r = (const uint8_t *) _r;
134  const uint8_t *g = (const uint8_t *) _g;
135  const uint8_t *b = (const uint8_t *) _b;
136 
137 #define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
138 #define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)
139 
140  dest[i * 6 + 0] = r_b[Y1];
141  dest[i * 6 + 1] = g[Y1];
142  dest[i * 6 + 2] = b_r[Y1];
143  dest[i * 6 + 3] = r_b[Y2];
144  dest[i * 6 + 4] = g[Y2];
145  dest[i * 6 + 5] = b_r[Y2];
146 #undef r_b
147 #undef b_r
148  } else if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565 ||
149  target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555 ||
150  target == AV_PIX_FMT_RGB444 || target == AV_PIX_FMT_BGR444) {
151  uint16_t *dest = (uint16_t *) _dest;
152  const uint16_t *r = (const uint16_t *) _r;
153  const uint16_t *g = (const uint16_t *) _g;
154  const uint16_t *b = (const uint16_t *) _b;
155  int dr1, dg1, db1, dr2, dg2, db2;
156 
157  if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565) {
158  dr1 = ff_dither_2x2_8[ y & 1 ][0];
159  dg1 = ff_dither_2x2_4[ y & 1 ][0];
160  db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
161  dr2 = ff_dither_2x2_8[ y & 1 ][1];
162  dg2 = ff_dither_2x2_4[ y & 1 ][1];
163  db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
164  } else if (target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555) {
165  dr1 = ff_dither_2x2_8[ y & 1 ][0];
166  dg1 = ff_dither_2x2_8[ y & 1 ][1];
167  db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
168  dr2 = ff_dither_2x2_8[ y & 1 ][1];
169  dg2 = ff_dither_2x2_8[ y & 1 ][0];
170  db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
171  } else {
172  dr1 = ff_dither_4x4_16[ y & 3 ][0];
173  dg1 = ff_dither_4x4_16[ y & 3 ][1];
174  db1 = ff_dither_4x4_16[(y & 3) ^ 3][0];
175  dr2 = ff_dither_4x4_16[ y & 3 ][1];
176  dg2 = ff_dither_4x4_16[ y & 3 ][0];
177  db2 = ff_dither_4x4_16[(y & 3) ^ 3][1];
178  }
179 
180  dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
181  dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
182  } else /* 8/4 bits */ {
183  uint8_t *dest = (uint8_t *) _dest;
184  const uint8_t *r = (const uint8_t *) _r;
185  const uint8_t *g = (const uint8_t *) _g;
186  const uint8_t *b = (const uint8_t *) _b;
187  int dr1, dg1, db1, dr2, dg2, db2;
188 
189  if (target == AV_PIX_FMT_RGB8 || target == AV_PIX_FMT_BGR8) {
190  const uint8_t * const d64 = ff_dither_8x8_73[y & 7];
191  const uint8_t * const d32 = ff_dither_8x8_32[y & 7];
192  dr1 = dg1 = d32[(i * 2 + 0) & 7];
193  db1 = d64[(i * 2 + 0) & 7];
194  dr2 = dg2 = d32[(i * 2 + 1) & 7];
195  db2 = d64[(i * 2 + 1) & 7];
196  } else {
197  const uint8_t * const d64 = ff_dither_8x8_73 [y & 7];
198  const uint8_t * const d128 = ff_dither_8x8_220[y & 7];
199  dr1 = db1 = d128[(i * 2 + 0) & 7];
200  dg1 = d64[(i * 2 + 0) & 7];
201  dr2 = db2 = d128[(i * 2 + 1) & 7];
202  dg2 = d64[(i * 2 + 1) & 7];
203  }
204 
205  if (target == AV_PIX_FMT_RGB4 || target == AV_PIX_FMT_BGR4) {
206  dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
207  ((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
208  } else {
209  dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
210  dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
211  }
212  }
213 }
214 
/*
 * Extract one pixel pair from vector registers and store it.
 *
 * vec_y1, vec_y2 - vectors of 32-bit luma values; element t1 of vec_y1 and
 *                  element t2 of vec_y2 become Y1 and Y2
 * vec_u, vec_v   - vectors of 32-bit chroma values; element t3 of vec_u and
 *                  element t4 of vec_v become U and V.  They are used
 *                  directly as table indices, so the caller must already
 *                  have added YUVRGB_TABLE_HEADROOM.
 * t1..t4         - element indices passed to __lasx_xvpickve2gr_w
 *
 * The macro reads c, dest, y and target from the enclosing scope, assigns
 * the enclosing Y1, Y2, U, V, r, g and b, and increments count after the
 * pair has been written.
 *
 * It is wrapped in do { } while (0) so that "WRITE_YUV2RGB(...);" is a
 * single statement and stays safe inside unbraced if/else bodies.
 */
#define WRITE_YUV2RGB(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4) \
do {                                                                \
    Y1 = __lasx_xvpickve2gr_w(vec_y1, t1);                          \
    Y2 = __lasx_xvpickve2gr_w(vec_y2, t2);                          \
    U  = __lasx_xvpickve2gr_w(vec_u, t3);                           \
    V  = __lasx_xvpickve2gr_w(vec_v, t4);                           \
    r  = c->table_rV[V];                                            \
    g  = (c->table_gU[U] + c->table_gV[V]);                         \
    b  = c->table_bU[U];                                            \
    yuv2rgb_write(dest, count, Y1, Y2, 0, 0,                        \
                  r, g, b, y, target, 0);                           \
    count++;                                                        \
} while (0)
228 
229 static void
230 yuv2rgb_X_template_lasx(SwsInternal *c, const int16_t *lumFilter,
231  const int16_t **lumSrc, int lumFilterSize,
232  const int16_t *chrFilter, const int16_t **chrUSrc,
233  const int16_t **chrVSrc, int chrFilterSize,
234  const int16_t **alpSrc, uint8_t *dest, int dstW,
235  int y, enum AVPixelFormat target, int hasAlpha)
236 {
237  int i, j;
238  int count = 0;
239  int t = 1 << 18;
240  int len = dstW >> 6;
241  int res = dstW & 63;
242  int len_count = (dstW + 1) >> 1;
243  const void *r, *g, *b;
244  int head = YUVRGB_TABLE_HEADROOM;
245  __m256i headroom = __lasx_xvreplgr2vr_w(head);
246 
247  for (i = 0; i < len; i++) {
248  int Y1, Y2, U, V, count_lum = count << 1;
249  __m256i l_src1, l_src2, l_src3, l_src4, u_src1, u_src2, v_src1, v_src2;
250  __m256i yl1_ev, yl1_od, yh1_ev, yh1_od, yl2_ev, yl2_od, yh2_ev, yh2_od;
251  __m256i u1_ev, u1_od, v1_ev, v1_od, u2_ev, u2_od, v2_ev, v2_od, temp;
252 
253  yl1_ev = __lasx_xvldrepl_w(&t, 0);
254  yl1_od = yl1_ev;
255  yh1_ev = yl1_ev;
256  yh1_od = yl1_ev;
257  u1_ev = yl1_ev;
258  v1_ev = yl1_ev;
259  u1_od = yl1_ev;
260  v1_od = yl1_ev;
261  yl2_ev = yl1_ev;
262  yl2_od = yl1_ev;
263  yh2_ev = yl1_ev;
264  yh2_od = yl1_ev;
265  u2_ev = yl1_ev;
266  v2_ev = yl1_ev;
267  u2_od = yl1_ev;
268  v2_od = yl1_ev;
269  for (j = 0; j < lumFilterSize; j++) {
270  const int16_t *src_lum = lumSrc[j] + count_lum;
271  temp = __lasx_xvldrepl_h((lumFilter + j), 0);
272  DUP4_ARG2(__lasx_xvld, src_lum, 0, src_lum, 32, src_lum, 64,
273  src_lum, 96, l_src1, l_src2, l_src3, l_src4);
274 
275  yl1_ev = __lasx_xvmaddwev_w_h(yl1_ev, temp, l_src1);
276  yl1_od = __lasx_xvmaddwod_w_h(yl1_od, temp, l_src1);
277  yh1_ev = __lasx_xvmaddwev_w_h(yh1_ev, temp, l_src2);
278  yh1_od = __lasx_xvmaddwod_w_h(yh1_od, temp, l_src2);
279  yl2_ev = __lasx_xvmaddwev_w_h(yl2_ev, temp, l_src3);
280  yl2_od = __lasx_xvmaddwod_w_h(yl2_od, temp, l_src3);
281  yh2_ev = __lasx_xvmaddwev_w_h(yh2_ev, temp, l_src4);
282  yh2_od = __lasx_xvmaddwod_w_h(yh2_od, temp, l_src4);
283  }
284  for (j = 0; j < chrFilterSize; j++) {
285  DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrUSrc[j] + count, 32,
286  u_src1, u_src2);
287  DUP2_ARG2(__lasx_xvld, chrVSrc[j] + count, 0, chrVSrc[j] + count, 32,
288  v_src1, v_src2);
289  temp = __lasx_xvldrepl_h((chrFilter + j), 0);
290  u1_ev = __lasx_xvmaddwev_w_h(u1_ev, temp, u_src1);
291  u1_od = __lasx_xvmaddwod_w_h(u1_od, temp, u_src1);
292  v1_ev = __lasx_xvmaddwev_w_h(v1_ev, temp, v_src1);
293  v1_od = __lasx_xvmaddwod_w_h(v1_od, temp, v_src1);
294  u2_ev = __lasx_xvmaddwev_w_h(u2_ev, temp, u_src2);
295  u2_od = __lasx_xvmaddwod_w_h(u2_od, temp, u_src2);
296  v2_ev = __lasx_xvmaddwev_w_h(v2_ev, temp, v_src2);
297  v2_od = __lasx_xvmaddwod_w_h(v2_od, temp, v_src2);
298  }
299  yl1_ev = __lasx_xvsrai_w(yl1_ev, 19);
300  yh1_ev = __lasx_xvsrai_w(yh1_ev, 19);
301  yl1_od = __lasx_xvsrai_w(yl1_od, 19);
302  yh1_od = __lasx_xvsrai_w(yh1_od, 19);
303  u1_ev = __lasx_xvsrai_w(u1_ev, 19);
304  v1_ev = __lasx_xvsrai_w(v1_ev, 19);
305  u1_od = __lasx_xvsrai_w(u1_od, 19);
306  v1_od = __lasx_xvsrai_w(v1_od, 19);
307  yl2_ev = __lasx_xvsrai_w(yl2_ev, 19);
308  yh2_ev = __lasx_xvsrai_w(yh2_ev, 19);
309  yl2_od = __lasx_xvsrai_w(yl2_od, 19);
310  yh2_od = __lasx_xvsrai_w(yh2_od, 19);
311  u2_ev = __lasx_xvsrai_w(u2_ev, 19);
312  v2_ev = __lasx_xvsrai_w(v2_ev, 19);
313  u2_od = __lasx_xvsrai_w(u2_od, 19);
314  v2_od = __lasx_xvsrai_w(v2_od, 19);
315  u1_ev = __lasx_xvadd_w(u1_ev, headroom);
316  v1_ev = __lasx_xvadd_w(v1_ev, headroom);
317  u1_od = __lasx_xvadd_w(u1_od, headroom);
318  v1_od = __lasx_xvadd_w(v1_od, headroom);
319  u2_ev = __lasx_xvadd_w(u2_ev, headroom);
320  v2_ev = __lasx_xvadd_w(v2_ev, headroom);
321  u2_od = __lasx_xvadd_w(u2_od, headroom);
322  v2_od = __lasx_xvadd_w(v2_od, headroom);
323  WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 0, 0, 0, 0);
324  WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 1, 1, 0, 0);
325  WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 2, 2, 1, 1);
326  WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 3, 3, 1, 1);
327  WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 4, 4, 2, 2);
328  WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 5, 5, 2, 2);
329  WRITE_YUV2RGB(yl1_ev, yl1_od, u1_ev, v1_ev, 6, 6, 3, 3);
330  WRITE_YUV2RGB(yl1_ev, yl1_od, u1_od, v1_od, 7, 7, 3, 3);
331  WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 0, 0, 4, 4);
332  WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 1, 1, 4, 4);
333  WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 2, 2, 5, 5);
334  WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 3, 3, 5, 5);
335  WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 4, 4, 6, 6);
336  WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 5, 5, 6, 6);
337  WRITE_YUV2RGB(yh1_ev, yh1_od, u1_ev, v1_ev, 6, 6, 7, 7);
338  WRITE_YUV2RGB(yh1_ev, yh1_od, u1_od, v1_od, 7, 7, 7, 7);
339  WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 0, 0, 0, 0);
340  WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 1, 1, 0, 0);
341  WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 2, 2, 1, 1);
342  WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 3, 3, 1, 1);
343  WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 4, 4, 2, 2);
344  WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 5, 5, 2, 2);
345  WRITE_YUV2RGB(yl2_ev, yl2_od, u2_ev, v2_ev, 6, 6, 3, 3);
346  WRITE_YUV2RGB(yl2_ev, yl2_od, u2_od, v2_od, 7, 7, 3, 3);
347  WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 0, 0, 4, 4);
348  WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 1, 1, 4, 4);
349  WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 2, 2, 5, 5);
350  WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 3, 3, 5, 5);
351  WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 4, 4, 6, 6);
352  WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 5, 5, 6, 6);
353  WRITE_YUV2RGB(yh2_ev, yh2_od, u2_ev, v2_ev, 6, 6, 7, 7);
354  WRITE_YUV2RGB(yh2_ev, yh2_od, u2_od, v2_od, 7, 7, 7, 7);
355  }
356  if (res >= 32) {
357  int Y1, Y2, U, V, count_lum = count << 1;
358  __m256i l_src1, l_src2, u_src, v_src;
359  __m256i yl_ev, yl_od, yh_ev, yh_od;
360  __m256i u_ev, u_od, v_ev, v_od, temp;
361 
362  yl_ev = __lasx_xvldrepl_w(&t, 0);
363  yl_od = yl_ev;
364  yh_ev = yl_ev;
365  yh_od = yl_ev;
366  u_ev = yl_ev;
367  v_ev = yl_ev;
368  u_od = yl_ev;
369  v_od = yl_ev;
370  for (j = 0; j < lumFilterSize; j++) {
371  temp = __lasx_xvldrepl_h((lumFilter + j), 0);
372  DUP2_ARG2(__lasx_xvld, lumSrc[j] + count_lum, 0, lumSrc[j] + count_lum,
373  32, l_src1, l_src2);
374  yl_ev = __lasx_xvmaddwev_w_h(yl_ev, temp, l_src1);
375  yl_od = __lasx_xvmaddwod_w_h(yl_od, temp, l_src1);
376  yh_ev = __lasx_xvmaddwev_w_h(yh_ev, temp, l_src2);
377  yh_od = __lasx_xvmaddwod_w_h(yh_od, temp, l_src2);
378  }
379  for (j = 0; j < chrFilterSize; j++) {
380  DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count, 0,
381  u_src, v_src);
382  temp = __lasx_xvldrepl_h((chrFilter + j), 0);
383  u_ev = __lasx_xvmaddwev_w_h(u_ev, temp, u_src);
384  u_od = __lasx_xvmaddwod_w_h(u_od, temp, u_src);
385  v_ev = __lasx_xvmaddwev_w_h(v_ev, temp, v_src);
386  v_od = __lasx_xvmaddwod_w_h(v_od, temp, v_src);
387  }
388  yl_ev = __lasx_xvsrai_w(yl_ev, 19);
389  yh_ev = __lasx_xvsrai_w(yh_ev, 19);
390  yl_od = __lasx_xvsrai_w(yl_od, 19);
391  yh_od = __lasx_xvsrai_w(yh_od, 19);
392  u_ev = __lasx_xvsrai_w(u_ev, 19);
393  v_ev = __lasx_xvsrai_w(v_ev, 19);
394  u_od = __lasx_xvsrai_w(u_od, 19);
395  v_od = __lasx_xvsrai_w(v_od, 19);
396  u_ev = __lasx_xvadd_w(u_ev, headroom);
397  v_ev = __lasx_xvadd_w(v_ev, headroom);
398  u_od = __lasx_xvadd_w(u_od, headroom);
399  v_od = __lasx_xvadd_w(v_od, headroom);
400  WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 0, 0, 0, 0);
401  WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 1, 1, 0, 0);
402  WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 2, 2, 1, 1);
403  WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 3, 3, 1, 1);
404  WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 4, 4, 2, 2);
405  WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 5, 5, 2, 2);
406  WRITE_YUV2RGB(yl_ev, yl_od, u_ev, v_ev, 6, 6, 3, 3);
407  WRITE_YUV2RGB(yl_ev, yl_od, u_od, v_od, 7, 7, 3, 3);
408  WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 0, 0, 4, 4);
409  WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 1, 1, 4, 4);
410  WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 2, 2, 5, 5);
411  WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 3, 3, 5, 5);
412  WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 4, 4, 6, 6);
413  WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 5, 5, 6, 6);
414  WRITE_YUV2RGB(yh_ev, yh_od, u_ev, v_ev, 6, 6, 7, 7);
415  WRITE_YUV2RGB(yh_ev, yh_od, u_od, v_od, 7, 7, 7, 7);
416  res -= 32;
417  }
418  if (res >= 16) {
419  int Y1, Y2, U, V;
420  int count_lum = count << 1;
421  __m256i l_src, u_src, v_src;
422  __m256i y_ev, y_od, u, v, temp;
423 
424  y_ev = __lasx_xvldrepl_w(&t, 0);
425  y_od = y_ev;
426  u = y_ev;
427  v = y_ev;
428  for (j = 0; j < lumFilterSize; j++) {
429  temp = __lasx_xvldrepl_h((lumFilter + j), 0);
430  l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
431  y_ev = __lasx_xvmaddwev_w_h(y_ev, temp, l_src);
432  y_od = __lasx_xvmaddwod_w_h(y_od, temp, l_src);
433  }
434  for (j = 0; j < chrFilterSize; j++) {
435  DUP2_ARG2(__lasx_xvld, chrUSrc[j] + count, 0, chrVSrc[j] + count,
436  0, u_src, v_src);
437  temp = __lasx_xvldrepl_h((chrFilter + j), 0);
438  u_src = __lasx_vext2xv_w_h(u_src);
439  v_src = __lasx_vext2xv_w_h(v_src);
440  u = __lasx_xvmaddwev_w_h(u, temp, u_src);
441  v = __lasx_xvmaddwev_w_h(v, temp, v_src);
442  }
443  y_ev = __lasx_xvsrai_w(y_ev, 19);
444  y_od = __lasx_xvsrai_w(y_od, 19);
445  u = __lasx_xvsrai_w(u, 19);
446  v = __lasx_xvsrai_w(v, 19);
447  u = __lasx_xvadd_w(u, headroom);
448  v = __lasx_xvadd_w(v, headroom);
449  WRITE_YUV2RGB(y_ev, y_od, u, v, 0, 0, 0, 0);
450  WRITE_YUV2RGB(y_ev, y_od, u, v, 1, 1, 1, 1);
451  WRITE_YUV2RGB(y_ev, y_od, u, v, 2, 2, 2, 2);
452  WRITE_YUV2RGB(y_ev, y_od, u, v, 3, 3, 3, 3);
453  WRITE_YUV2RGB(y_ev, y_od, u, v, 4, 4, 4, 4);
454  WRITE_YUV2RGB(y_ev, y_od, u, v, 5, 5, 5, 5);
455  WRITE_YUV2RGB(y_ev, y_od, u, v, 6, 6, 6, 6);
456  WRITE_YUV2RGB(y_ev, y_od, u, v, 7, 7, 7, 7);
457  res -= 16;
458  }
459  if (res >= 8) {
460  int Y1, Y2, U, V;
461  int count_lum = count << 1;
462  __m256i l_src, u_src, v_src;
463  __m256i y_ev, uv, temp;
464 
465  y_ev = __lasx_xvldrepl_w(&t, 0);
466  uv = y_ev;
467  for (j = 0; j < lumFilterSize; j++) {
468  temp = __lasx_xvldrepl_h((lumFilter + j), 0);
469  l_src = __lasx_xvld(lumSrc[j] + count_lum, 0);
470  l_src = __lasx_vext2xv_w_h(l_src);
471  y_ev = __lasx_xvmaddwev_w_h(y_ev, temp, l_src);
472  }
473  for (j = 0; j < chrFilterSize; j++) {
474  u_src = __lasx_xvldrepl_d((chrUSrc[j] + count), 0);
475  v_src = __lasx_xvldrepl_d((chrVSrc[j] + count), 0);
476  temp = __lasx_xvldrepl_h((chrFilter + j), 0);
477  u_src = __lasx_xvilvl_d(v_src, u_src);
478  u_src = __lasx_vext2xv_w_h(u_src);
479  uv = __lasx_xvmaddwev_w_h(uv, temp, u_src);
480  }
481  y_ev = __lasx_xvsrai_w(y_ev, 19);
482  uv = __lasx_xvsrai_w(uv, 19);
483  uv = __lasx_xvadd_w(uv, headroom);
484  WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 0, 1, 0, 4);
485  WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 2, 3, 1, 5);
486  WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 4, 5, 2, 6);
487  WRITE_YUV2RGB(y_ev, y_ev, uv, uv, 6, 7, 3, 7);
488  }
489  for (; count < len_count; count++) {
490  int Y1 = 1 << 18;
491  int Y2 = Y1;
492  int U = Y1;
493  int V = Y1;
494 
495  for (j = 0; j < lumFilterSize; j++) {
496  Y1 += lumSrc[j][count * 2] * lumFilter[j];
497  Y2 += lumSrc[j][count * 2 + 1] * lumFilter[j];
498  }
499  for (j = 0; j < chrFilterSize; j++) {
500  U += chrUSrc[j][count] * chrFilter[j];
501  V += chrVSrc[j][count] * chrFilter[j];
502  }
503  Y1 >>= 19;
504  Y2 >>= 19;
505  U >>= 19;
506  V >>= 19;
507  r = c->table_rV[V + YUVRGB_TABLE_HEADROOM];
508  g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
509  c->table_gV[V + YUVRGB_TABLE_HEADROOM]);
510  b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
511 
512  yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
513  r, g, b, y, target, 0);
514  }
515 }
516 
517 static void
518 yuv2rgb_2_template_lasx(SwsInternal *c, const int16_t *buf[2],
519  const int16_t *ubuf[2], const int16_t *vbuf[2],
520  const int16_t *abuf[2], uint8_t *dest, int dstW,
521  int yalpha, int uvalpha, int y,
522  enum AVPixelFormat target, int hasAlpha)
523 {
524  const int16_t *buf0 = buf[0], *buf1 = buf[1],
525  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
526  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
527  int yalpha1 = 4096 - yalpha;
528  int uvalpha1 = 4096 - uvalpha;
529  int i, count = 0;
530  int len = dstW - 15;
531  int len_count = (dstW + 1) >> 1;
532  const void *r, *g, *b;
533  int head = YUVRGB_TABLE_HEADROOM;
534  __m256i v_yalpha1 = __lasx_xvreplgr2vr_w(yalpha1);
535  __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
536  __m256i v_yalpha = __lasx_xvreplgr2vr_w(yalpha);
537  __m256i v_uvalpha = __lasx_xvreplgr2vr_w(uvalpha);
538  __m256i headroom = __lasx_xvreplgr2vr_w(head);
539 
540  for (i = 0; i < len; i += 16) {
541  int Y1, Y2, U, V;
542  int i_dex = i << 1;
543  int c_dex = count << 1;
544  __m256i y0_h, y0_l, y0, u0, v0;
545  __m256i y1_h, y1_l, y1, u1, v1;
546  __m256i y_l, y_h, u, v;
547 
548  DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
549  buf1, i_dex, y0, u0, v0, y1);
550  DUP2_ARG2(__lasx_xvldx, ubuf1, c_dex, vbuf1, c_dex, u1, v1);
551  DUP2_ARG2(__lasx_xvsllwil_w_h, y0, 0, y1, 0, y0_l, y1_l);
552  DUP2_ARG1(__lasx_xvexth_w_h, y0, y1, y0_h, y1_h);
553  DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
554  y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
555  y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
556  u0 = __lasx_xvmul_w(u0, v_uvalpha1);
557  v0 = __lasx_xvmul_w(v0, v_uvalpha1);
558  y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
559  y_h = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
560  u = __lasx_xvmadd_w(u0, v_uvalpha, u1);
561  v = __lasx_xvmadd_w(v0, v_uvalpha, v1);
562  y_l = __lasx_xvsrai_w(y_l, 19);
563  y_h = __lasx_xvsrai_w(y_h, 19);
564  u = __lasx_xvsrai_w(u, 19);
565  v = __lasx_xvsrai_w(v, 19);
566  u = __lasx_xvadd_w(u, headroom);
567  v = __lasx_xvadd_w(v, headroom);
568  WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0);
569  WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
570  WRITE_YUV2RGB(y_h, y_h, u, v, 0, 1, 2, 2);
571  WRITE_YUV2RGB(y_h, y_h, u, v, 2, 3, 3, 3);
572  WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 4, 4);
573  WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 5, 5);
574  WRITE_YUV2RGB(y_h, y_h, u, v, 4, 5, 6, 6);
575  WRITE_YUV2RGB(y_h, y_h, u, v, 6, 7, 7, 7);
576  }
577  if (dstW - i >= 8) {
578  int Y1, Y2, U, V;
579  int i_dex = i << 1;
580  __m256i y0_l, y0, u0, v0;
581  __m256i y1_l, y1, u1, v1;
582  __m256i y_l, u, v;
583 
584  y0 = __lasx_xvldx(buf0, i_dex);
585  u0 = __lasx_xvldrepl_d((ubuf0 + count), 0);
586  v0 = __lasx_xvldrepl_d((vbuf0 + count), 0);
587  y1 = __lasx_xvldx(buf1, i_dex);
588  u1 = __lasx_xvldrepl_d((ubuf1 + count), 0);
589  v1 = __lasx_xvldrepl_d((vbuf1 + count), 0);
590  DUP2_ARG1(__lasx_vext2xv_w_h, y0, y1, y0_l, y1_l);
591  DUP4_ARG1(__lasx_vext2xv_w_h, u0, u1, v0, v1, u0, u1, v0, v1);
592  y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
593  u0 = __lasx_xvmul_w(u0, v_uvalpha1);
594  v0 = __lasx_xvmul_w(v0, v_uvalpha1);
595  y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
596  u = __lasx_xvmadd_w(u0, v_uvalpha, u1);
597  v = __lasx_xvmadd_w(v0, v_uvalpha, v1);
598  y_l = __lasx_xvsrai_w(y_l, 19);
599  u = __lasx_xvsrai_w(u, 19);
600  v = __lasx_xvsrai_w(v, 19);
601  u = __lasx_xvadd_w(u, headroom);
602  v = __lasx_xvadd_w(v, headroom);
603  WRITE_YUV2RGB(y_l, y_l, u, v, 0, 1, 0, 0);
604  WRITE_YUV2RGB(y_l, y_l, u, v, 2, 3, 1, 1);
605  WRITE_YUV2RGB(y_l, y_l, u, v, 4, 5, 2, 2);
606  WRITE_YUV2RGB(y_l, y_l, u, v, 6, 7, 3, 3);
607  i += 8;
608  }
609  for (; count < len_count; count++) {
610  int Y1 = (buf0[count * 2] * yalpha1 +
611  buf1[count * 2] * yalpha) >> 19;
612  int Y2 = (buf0[count * 2 + 1] * yalpha1 +
613  buf1[count * 2 + 1] * yalpha) >> 19;
614  int U = (ubuf0[count] * uvalpha1 + ubuf1[count] * uvalpha) >> 19;
615  int V = (vbuf0[count] * uvalpha1 + vbuf1[count] * uvalpha) >> 19;
616 
617  r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
618  g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
619  c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
620  b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
621 
622  yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
623  r, g, b, y, target, 0);
624  }
625 }
626 
627 static void
628 yuv2rgb_1_template_lasx(SwsInternal *c, const int16_t *buf0,
629  const int16_t *ubuf[2], const int16_t *vbuf[2],
630  const int16_t *abuf0, uint8_t *dest, int dstW,
631  int uvalpha, int y, enum AVPixelFormat target,
632  int hasAlpha)
633 {
634  const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
635  int i;
636  int len = (dstW - 15);
637  int len_count = (dstW + 1) >> 1;
638  const void *r, *g, *b;
639 
640  if (uvalpha == 0) {
641  int count = 0;
642  int head = YUVRGB_TABLE_HEADROOM;
643  __m256i headroom = __lasx_xvreplgr2vr_h(head);
644 
645  for (i = 0; i < len; i += 16) {
646  int Y1, Y2, U, V;
647  int i_dex = i << 1;
648  int c_dex = count << 1;
649  __m256i src_y, src_u, src_v;
650  __m256i u, v, y_l, y_h;
651 
652  DUP2_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, src_y, src_u);
653  src_v = __lasx_xvldx(vbuf0, c_dex);
654  src_u = __lasx_xvpermi_q(src_u, src_v, 0x02);
655  src_y = __lasx_xvsrari_h(src_y, 7);
656  src_u = __lasx_xvsrari_h(src_u, 7);
657  y_l = __lasx_xvsllwil_w_h(src_y, 0);
658  y_h = __lasx_xvexth_w_h(src_y);
659  u = __lasx_xvaddwev_w_h(src_u, headroom);
660  v = __lasx_xvaddwod_w_h(src_u, headroom);
661  WRITE_YUV2RGB(y_l, y_l, u, u, 0, 1, 0, 4);
662  WRITE_YUV2RGB(y_l, y_l, v, v, 2, 3, 0, 4);
663  WRITE_YUV2RGB(y_h, y_h, u, u, 0, 1, 1, 5);
664  WRITE_YUV2RGB(y_h, y_h, v, v, 2, 3, 1, 5);
665  WRITE_YUV2RGB(y_l, y_l, u, u, 4, 5, 2, 6);
666  WRITE_YUV2RGB(y_l, y_l, v, v, 6, 7, 2, 6);
667  WRITE_YUV2RGB(y_h, y_h, u, u, 4, 5, 3, 7);
668  WRITE_YUV2RGB(y_h, y_h, v, v, 6, 7, 3, 7);
669  }
670  if (dstW - i >= 8){
671  int Y1, Y2, U, V;
672  int i_dex = i << 1;
673  __m256i src_y, src_u, src_v;
674  __m256i y_l, uv;
675 
676  src_y = __lasx_xvldx(buf0, i_dex);
677  src_u = __lasx_xvldrepl_d((ubuf0 + count), 0);
678  src_v = __lasx_xvldrepl_d((vbuf0 + count), 0);
679  src_u = __lasx_xvilvl_d(src_v, src_u);
680  y_l = __lasx_xvsrari_h(src_y, 7);
681  uv = __lasx_xvsrari_h(src_u, 7);
682  y_l = __lasx_vext2xv_w_h(y_l);
683  uv = __lasx_vext2xv_w_h(uv);
684  uv = __lasx_xvaddwev_w_h(uv, headroom);
685  WRITE_YUV2RGB(y_l, y_l, uv, uv, 0, 1, 0, 4);
686  WRITE_YUV2RGB(y_l, y_l, uv, uv, 2, 3, 1, 5);
687  WRITE_YUV2RGB(y_l, y_l, uv, uv, 4, 5, 2, 6);
688  WRITE_YUV2RGB(y_l, y_l, uv, uv, 6, 7, 3, 7);
689  i += 8;
690  }
691  for (; count < len_count; count++) {
692  int Y1 = (buf0[count * 2 ] + 64) >> 7;
693  int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
694  int U = (ubuf0[count] + 64) >> 7;
695  int V = (vbuf0[count] + 64) >> 7;
696 
697  r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
698  g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
699  c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
700  b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
701 
702  yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
703  r, g, b, y, target, 0);
704  }
705  } else {
706  const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
707  int count = 0;
708  int HEADROOM = YUVRGB_TABLE_HEADROOM;
709  int uvalpha1 = 4096 - uvalpha;
710  __m256i headroom = __lasx_xvreplgr2vr_w(HEADROOM);
711  __m256i uvalpha_tmp1 = __lasx_xvreplgr2vr_h(uvalpha1);
712  __m256i uvalpha_tmp = __lasx_xvreplgr2vr_h(uvalpha);
713 
714  for (i = 0; i < len; i += 16) {
715  int Y1, Y2, U, V;
716  int i_dex = i << 1;
717  int c_dex = count << 1;
718  __m256i src_y, src_u0, src_v0, src_u1, src_v1;
719  __m256i y_l, y_h, u, v, u_ev, v_od;
720 
721  DUP4_ARG2(__lasx_xvldx, buf0, i_dex, ubuf0, c_dex, vbuf0, c_dex,
722  ubuf1, c_dex, src_y, src_u0, src_v0, src_u1);
723  src_v1 = __lasx_xvldx(vbuf1, c_dex);
724  src_u0 = __lasx_xvpermi_q(src_u0, src_v0, 0x02);
725  src_u1 = __lasx_xvpermi_q(src_u1, src_v1, 0x02);
726  src_y = __lasx_xvsrari_h(src_y, 7);
727  u_ev = __lasx_xvmulwev_w_h(src_u0, uvalpha_tmp1);
728  v_od = __lasx_xvmulwod_w_h(src_u0, uvalpha_tmp1);
729  u = __lasx_xvmaddwev_w_h(u_ev, src_u1, uvalpha_tmp);
730  v = __lasx_xvmaddwod_w_h(v_od, src_u1, uvalpha_tmp);
731  y_l = __lasx_xvsllwil_w_h(src_y, 0);
732  y_h = __lasx_xvexth_w_h(src_y);
733  u = __lasx_xvsrari_w(u, 19);
734  v = __lasx_xvsrari_w(v, 19);
735  u = __lasx_xvadd_w(u, headroom);
736  v = __lasx_xvadd_w(v, headroom);
737  WRITE_YUV2RGB(y_l, y_l, u, u, 0, 1, 0, 4);
738  WRITE_YUV2RGB(y_l, y_l, v, v, 2, 3, 0, 4);
739  WRITE_YUV2RGB(y_h, y_h, u, u, 0, 1, 1, 5);
740  WRITE_YUV2RGB(y_h, y_h, v, v, 2, 3, 1, 5);
741  WRITE_YUV2RGB(y_l, y_l, u, u, 4, 5, 2, 6);
742  WRITE_YUV2RGB(y_l, y_l, v, v, 6, 7, 2, 6);
743  WRITE_YUV2RGB(y_h, y_h, u, u, 4, 5, 3, 7);
744  WRITE_YUV2RGB(y_h, y_h, v, v, 6, 7, 3, 7);
745  }
746  for (; count < len_count; count++) {
747  int Y1 = (buf0[count * 2 ] + 64) >> 7;
748  int Y2 = (buf0[count * 2 + 1] + 64) >> 7;
749  int U = (ubuf0[count] + ubuf1[count] + 128) >> 8;
750  int V = (vbuf0[count] + vbuf1[count] + 128) >> 8;
751 
752  r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
753  g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] +
754  c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
755  b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
756 
757  yuv2rgb_write(dest, count, Y1, Y2, 0, 0,
758  r, g, b, y, target, 0);
759  }
760  }
761 }
762 
763 #define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
764 static void name ## ext ## _X_lasx(SwsInternal *c, const int16_t *lumFilter, \
765  const int16_t **lumSrc, int lumFilterSize, \
766  const int16_t *chrFilter, const int16_t **chrUSrc, \
767  const int16_t **chrVSrc, int chrFilterSize, \
768  const int16_t **alpSrc, uint8_t *dest, int dstW, \
769  int y) \
770 { \
771  name ## base ## _X_template_lasx(c, lumFilter, lumSrc, lumFilterSize, \
772  chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
773  alpSrc, dest, dstW, y, fmt, hasAlpha); \
774 }
775 
776 #define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \
777 YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
778 static void name ## ext ## _2_lasx(SwsInternal *c, const int16_t *buf[2], \
779  const int16_t *ubuf[2], const int16_t *vbuf[2], \
780  const int16_t *abuf[2], uint8_t *dest, int dstW, \
781  int yalpha, int uvalpha, int y) \
782 { \
783  name ## base ## _2_template_lasx(c, buf, ubuf, vbuf, abuf, dest, \
784  dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
785 }
786 
787 #define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
788 YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \
789 static void name ## ext ## _1_lasx(SwsInternal *c, const int16_t *buf0, \
790  const int16_t *ubuf[2], const int16_t *vbuf[2], \
791  const int16_t *abuf0, uint8_t *dest, int dstW, \
792  int uvalpha, int y) \
793 { \
794  name ## base ## _1_template_lasx(c, buf0, ubuf, vbuf, abuf0, dest, \
795  dstW, uvalpha, y, fmt, hasAlpha); \
796 }
797 
798 
799 #if CONFIG_SMALL
800 #else
801 #if CONFIG_SWSCALE_ALPHA
802 #endif
805 #endif
806 YUV2RGBWRAPPER(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
807 YUV2RGBWRAPPER(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)
814 
815 // This function is copied from libswscale/output.c
817  uint8_t *dest, int i, int R, int A, int G, int B,
818  int y, enum AVPixelFormat target, int hasAlpha, int err[4])
819 {
820  int isrgb8 = target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8;
821 
822  if ((R | G | B) & 0xC0000000) {
823  R = av_clip_uintp2(R, 30);
824  G = av_clip_uintp2(G, 30);
825  B = av_clip_uintp2(B, 30);
826  }
827 
828  switch(target) {
829  case AV_PIX_FMT_ARGB:
830  dest[0] = hasAlpha ? A : 255;
831  dest[1] = R >> 22;
832  dest[2] = G >> 22;
833  dest[3] = B >> 22;
834  break;
835  case AV_PIX_FMT_RGB24:
836  dest[0] = R >> 22;
837  dest[1] = G >> 22;
838  dest[2] = B >> 22;
839  break;
840  case AV_PIX_FMT_RGBA:
841  dest[0] = R >> 22;
842  dest[1] = G >> 22;
843  dest[2] = B >> 22;
844  dest[3] = hasAlpha ? A : 255;
845  break;
846  case AV_PIX_FMT_ABGR:
847  dest[0] = hasAlpha ? A : 255;
848  dest[1] = B >> 22;
849  dest[2] = G >> 22;
850  dest[3] = R >> 22;
851  break;
852  case AV_PIX_FMT_BGR24:
853  dest[0] = B >> 22;
854  dest[1] = G >> 22;
855  dest[2] = R >> 22;
856  break;
857  case AV_PIX_FMT_BGRA:
858  dest[0] = B >> 22;
859  dest[1] = G >> 22;
860  dest[2] = R >> 22;
861  dest[3] = hasAlpha ? A : 255;
862  break;
865  case AV_PIX_FMT_BGR8:
866  case AV_PIX_FMT_RGB8:
867  {
868  int r,g,b;
869 
870  switch (c->opts.dither) {
871  default:
872  case SWS_DITHER_AUTO:
873  case SWS_DITHER_ED:
874  R >>= 22;
875  G >>= 22;
876  B >>= 22;
877  R += (7*err[0] + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2])>>4;
878  G += (7*err[1] + 1*c->dither_error[1][i] + 5*c->dither_error[1][i+1] + 3*c->dither_error[1][i+2])>>4;
879  B += (7*err[2] + 1*c->dither_error[2][i] + 5*c->dither_error[2][i+1] + 3*c->dither_error[2][i+2])>>4;
880  c->dither_error[0][i] = err[0];
881  c->dither_error[1][i] = err[1];
882  c->dither_error[2][i] = err[2];
883  r = R >> (isrgb8 ? 5 : 7);
884  g = G >> (isrgb8 ? 5 : 6);
885  b = B >> (isrgb8 ? 6 : 7);
886  r = av_clip(r, 0, isrgb8 ? 7 : 1);
887  g = av_clip(g, 0, isrgb8 ? 7 : 3);
888  b = av_clip(b, 0, isrgb8 ? 3 : 1);
889  err[0] = R - r*(isrgb8 ? 36 : 255);
890  err[1] = G - g*(isrgb8 ? 36 : 85);
891  err[2] = B - b*(isrgb8 ? 85 : 255);
892  break;
893  case SWS_DITHER_A_DITHER:
894  if (isrgb8) {
895  /* see http://pippin.gimp.org/a_dither/ for details/origin */
896 #define A_DITHER(u,v) (((((u)+((v)*236))*119)&0xff))
897  r = (((R >> 19) + A_DITHER(i,y) -96)>>8);
898  g = (((G >> 19) + A_DITHER(i + 17,y) - 96)>>8);
899  b = (((B >> 20) + A_DITHER(i + 17*2,y) -96)>>8);
900  r = av_clip_uintp2(r, 3);
901  g = av_clip_uintp2(g, 3);
902  b = av_clip_uintp2(b, 2);
903  } else {
904  r = (((R >> 21) + A_DITHER(i,y)-256)>>8);
905  g = (((G >> 19) + A_DITHER(i + 17,y)-256)>>8);
906  b = (((B >> 21) + A_DITHER(i + 17*2,y)-256)>>8);
907  r = av_clip_uintp2(r, 1);
908  g = av_clip_uintp2(g, 2);
909  b = av_clip_uintp2(b, 1);
910  }
911  break;
912  case SWS_DITHER_X_DITHER:
913  if (isrgb8) {
914  /* see http://pippin.gimp.org/a_dither/ for details/origin */
915 #define X_DITHER(u,v) (((((u)^((v)*237))*181)&0x1ff)/2)
916  r = (((R >> 19) + X_DITHER(i,y) - 96)>>8);
917  g = (((G >> 19) + X_DITHER(i + 17,y) - 96)>>8);
918  b = (((B >> 20) + X_DITHER(i + 17*2,y) - 96)>>8);
919  r = av_clip_uintp2(r, 3);
920  g = av_clip_uintp2(g, 3);
921  b = av_clip_uintp2(b, 2);
922  } else {
923  r = (((R >> 21) + X_DITHER(i,y)-256)>>8);
924  g = (((G >> 19) + X_DITHER(i + 17,y)-256)>>8);
925  b = (((B >> 21) + X_DITHER(i + 17*2,y)-256)>>8);
926  r = av_clip_uintp2(r, 1);
927  g = av_clip_uintp2(g, 2);
928  b = av_clip_uintp2(b, 1);
929  }
930 
931  break;
932  }
933 
934  if(target == AV_PIX_FMT_BGR4_BYTE) {
935  dest[0] = r + 2*g + 8*b;
936  } else if(target == AV_PIX_FMT_RGB4_BYTE) {
937  dest[0] = b + 2*g + 8*r;
938  } else if(target == AV_PIX_FMT_BGR8) {
939  dest[0] = r + 8*g + 64*b;
940  } else if(target == AV_PIX_FMT_RGB8) {
941  dest[0] = b + 4*g + 32*r;
942  } else
943  av_assert2(0);
944  break; }
945  }
946 }
947 
/*
 * YUV2RGB_SETUP - declare the conversion coefficients used by the full-chroma
 * templates.  Requires a context pointer named `c` in scope.
 *
 * Declares the scalar copies (y_offset, y_coeff, v2r_coe, v2g_coe, u2g_coe,
 * u2b_coe) read from c->yuv2rgb_*, and the vectors (offset, coeff, v2r, v2g,
 * u2g, u2b) holding the same values replicated with __lasx_xvreplgr2vr_w.
 * The expansion is a sequence of declarations, so it is used without a
 * trailing semicolon inside a declaration list.
 */
#define YUV2RGB_SETUP \
    int y_offset   = c->yuv2rgb_y_offset; \
    int y_coeff    = c->yuv2rgb_y_coeff; \
    int v2r_coe    = c->yuv2rgb_v2r_coeff; \
    int v2g_coe    = c->yuv2rgb_v2g_coeff; \
    int u2g_coe    = c->yuv2rgb_u2g_coeff; \
    int u2b_coe    = c->yuv2rgb_u2b_coeff; \
    __m256i offset = __lasx_xvreplgr2vr_w(y_offset); \
    __m256i coeff  = __lasx_xvreplgr2vr_w(y_coeff); \
    __m256i v2r    = __lasx_xvreplgr2vr_w(v2r_coe); \
    __m256i v2g    = __lasx_xvreplgr2vr_w(v2g_coe); \
    __m256i u2g    = __lasx_xvreplgr2vr_w(u2g_coe); \
    __m256i u2b    = __lasx_xvreplgr2vr_w(u2b_coe);
962 
/*
 * YUV2RGB - convert vectors of 32-bit Y/U/V values to R/G/B.
 *
 * Mirrors the scalar tails of the templates:
 *     y = (y - offset) * coeff + y_temp
 *     R = y + v * v2r
 *     G = y + v * v2g + u * u2g
 *     B = y + u * u2b
 *
 * Side effects: `y` and `v` are overwritten (v ends up holding y + v * v2g),
 * so both must be modifiable lvalues and must not be reused as inputs
 * afterwards.  Use as a statement, followed by a semicolon.
 */
#define YUV2RGB(y, u, v, R, G, B, offset, coeff, \
                y_temp, v2r, v2g, u2g, u2b) \
do { \
    y = __lasx_xvsub_w(y, offset); \
    y = __lasx_xvmul_w(y, coeff); \
    y = __lasx_xvadd_w(y, y_temp); \
    R = __lasx_xvmadd_w(y, v, v2r); \
    v = __lasx_xvmadd_w(y, v, v2g); \
    G = __lasx_xvmadd_w(v, u, u2g); \
    B = __lasx_xvmadd_w(y, u, u2b); \
} while (0)
974 
/*
 * WRITE_FULL_A - write one pixel with alpha from vector element t1.
 *
 * Picks element t1 of r, g, b and a into the enclosing function's scalars
 * R, G, B and A, clips A with av_clip_uint8() when bit 8 is set, calls
 * yuv2rgb_write_full() for column i + s and advances dest by step.
 *
 * Relies on these names being in scope at the point of use: c, dest, i, y,
 * target, hasAlpha, err, step, R, G, B, A.  t1 is passed straight to
 * __lasx_xvpickve2gr_w.  Use as a statement, followed by a semicolon.
 */
#define WRITE_FULL_A(r, g, b, a, t1, s) \
do { \
    R = __lasx_xvpickve2gr_w(r, t1); \
    G = __lasx_xvpickve2gr_w(g, t1); \
    B = __lasx_xvpickve2gr_w(b, t1); \
    A = __lasx_xvpickve2gr_w(a, t1); \
    if (A & 0x100) \
        A = av_clip_uint8(A); \
    yuv2rgb_write_full(c, dest, i + (s), R, A, G, B, y, target, hasAlpha, err); \
    dest += step; \
} while (0)
986 
/*
 * WRITE_FULL - write one pixel without alpha from vector element t1.
 *
 * Picks element t1 of r, g and b into the enclosing function's scalars R, G
 * and B, calls yuv2rgb_write_full() for column i + s with an alpha argument
 * of 0, and advances dest by step.
 *
 * Relies on these names being in scope at the point of use: c, dest, i, y,
 * target, hasAlpha, err, step, R, G, B.  Use as a statement, followed by a
 * semicolon.
 */
#define WRITE_FULL(r, g, b, t1, s) \
do { \
    R = __lasx_xvpickve2gr_w(r, t1); \
    G = __lasx_xvpickve2gr_w(g, t1); \
    B = __lasx_xvpickve2gr_w(b, t1); \
    yuv2rgb_write_full(c, dest, i + (s), R, 0, G, B, y, target, hasAlpha, err); \
    dest += step; \
} while (0)
995 
996 static void
997 yuv2rgb_full_X_template_lasx(SwsInternal *c, const int16_t *lumFilter,
998  const int16_t **lumSrc, int lumFilterSize,
999  const int16_t *chrFilter, const int16_t **chrUSrc,
1000  const int16_t **chrVSrc, int chrFilterSize,
1001  const int16_t **alpSrc, uint8_t *dest,
1002  int dstW, int y, enum AVPixelFormat target,
1003  int hasAlpha)
1004 {
1005  int i, j, B, G, R, A;
1006  int step = (target == AV_PIX_FMT_RGB24 ||
1007  target == AV_PIX_FMT_BGR24) ? 3 : 4;
1008  int err[4] = {0};
1009  int a_temp = 1 << 18;
1010  int templ = 1 << 9;
1011  int tempc = templ - (128 << 19);
1012  int ytemp = 1 << 21;
1013  int len = dstW - 15;
1014  __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
1016 
1017  if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
1018  || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
1019  step = 1;
1020 
1021  for (i = 0; i < len; i += 16) {
1022  __m256i l_src, u_src, v_src;
1023  __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od, temp;
1024  __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;
1025  int n = i << 1;
1026 
1027  y_ev = y_od = __lasx_xvreplgr2vr_w(templ);
1028  u_ev = u_od = v_ev = v_od = __lasx_xvreplgr2vr_w(tempc);
1029  for (j = 0; j < lumFilterSize; j++) {
1030  temp = __lasx_xvldrepl_h((lumFilter + j), 0);
1031  l_src = __lasx_xvldx(lumSrc[j], n);
1032  y_ev = __lasx_xvmaddwev_w_h(y_ev, l_src, temp);
1033  y_od = __lasx_xvmaddwod_w_h(y_od, l_src, temp);
1034  }
1035  for (j = 0; j < chrFilterSize; j++) {
1036  temp = __lasx_xvldrepl_h((chrFilter + j), 0);
1037  DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n,
1038  u_src, v_src);
1039  DUP2_ARG3(__lasx_xvmaddwev_w_h, u_ev, u_src, temp, v_ev,
1040  v_src, temp, u_ev, v_ev);
1041  DUP2_ARG3(__lasx_xvmaddwod_w_h, u_od, u_src, temp, v_od,
1042  v_src, temp, u_od, v_od);
1043  }
1044  y_ev = __lasx_xvsrai_w(y_ev, 10);
1045  y_od = __lasx_xvsrai_w(y_od, 10);
1046  u_ev = __lasx_xvsrai_w(u_ev, 10);
1047  u_od = __lasx_xvsrai_w(u_od, 10);
1048  v_ev = __lasx_xvsrai_w(v_ev, 10);
1049  v_od = __lasx_xvsrai_w(v_od, 10);
1050  YUV2RGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
1051  y_temp, v2r, v2g, u2g, u2b);
1052  YUV2RGB(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
1053  y_temp, v2r, v2g, u2g, u2b);
1054 
1055  if (hasAlpha) {
1056  __m256i a_src, a_ev, a_od;
1057 
1058  a_ev = a_od = __lasx_xvreplgr2vr_w(a_temp);
1059  for (j = 0; j < lumFilterSize; j++) {
1060  temp = __lasx_xvldrepl_h(lumFilter + j, 0);
1061  a_src = __lasx_xvldx(alpSrc[j], n);
1062  a_ev = __lasx_xvmaddwev_w_h(a_ev, a_src, temp);
1063  a_od = __lasx_xvmaddwod_w_h(a_od, a_src, temp);
1064  }
1065  a_ev = __lasx_xvsrai_w(a_ev, 19);
1066  a_od = __lasx_xvsrai_w(a_od, 19);
1067  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
1068  WRITE_FULL_A(R_od, G_od, B_od, a_od, 0, 1);
1069  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 2);
1070  WRITE_FULL_A(R_od, G_od, B_od, a_od, 1, 3);
1071  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 4);
1072  WRITE_FULL_A(R_od, G_od, B_od, a_od, 2, 5);
1073  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 6);
1074  WRITE_FULL_A(R_od, G_od, B_od, a_od, 3, 7);
1075  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 8);
1076  WRITE_FULL_A(R_od, G_od, B_od, a_od, 4, 9);
1077  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 10);
1078  WRITE_FULL_A(R_od, G_od, B_od, a_od, 5, 11);
1079  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 12);
1080  WRITE_FULL_A(R_od, G_od, B_od, a_od, 6, 13);
1081  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 14);
1082  WRITE_FULL_A(R_od, G_od, B_od, a_od, 7, 15);
1083  } else {
1084  WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
1085  WRITE_FULL(R_od, G_od, B_od, 0, 1);
1086  WRITE_FULL(R_ev, G_ev, B_ev, 1, 2);
1087  WRITE_FULL(R_od, G_od, B_od, 1, 3);
1088  WRITE_FULL(R_ev, G_ev, B_ev, 2, 4);
1089  WRITE_FULL(R_od, G_od, B_od, 2, 5);
1090  WRITE_FULL(R_ev, G_ev, B_ev, 3, 6);
1091  WRITE_FULL(R_od, G_od, B_od, 3, 7);
1092  WRITE_FULL(R_ev, G_ev, B_ev, 4, 8);
1093  WRITE_FULL(R_od, G_od, B_od, 4, 9);
1094  WRITE_FULL(R_ev, G_ev, B_ev, 5, 10);
1095  WRITE_FULL(R_od, G_od, B_od, 5, 11);
1096  WRITE_FULL(R_ev, G_ev, B_ev, 6, 12);
1097  WRITE_FULL(R_od, G_od, B_od, 6, 13);
1098  WRITE_FULL(R_ev, G_ev, B_ev, 7, 14);
1099  WRITE_FULL(R_od, G_od, B_od, 7, 15);
1100  }
1101  }
1102  if (dstW - i >= 8) {
1103  __m256i l_src, u_src, v_src;
1104  __m256i y_ev, u_ev, v_ev, uv, temp;
1105  __m256i R_ev, G_ev, B_ev;
1106  int n = i << 1;
1107 
1108  y_ev = __lasx_xvreplgr2vr_w(templ);
1109  u_ev = v_ev = __lasx_xvreplgr2vr_w(tempc);
1110  for (j = 0; j < lumFilterSize; j++) {
1111  temp = __lasx_xvldrepl_h((lumFilter + j), 0);
1112  l_src = __lasx_xvldx(lumSrc[j], n);
1113  l_src = __lasx_xvpermi_d(l_src, 0xD8);
1114  l_src = __lasx_xvilvl_h(l_src, l_src);
1115  y_ev = __lasx_xvmaddwev_w_h(y_ev, l_src, temp);
1116  }
1117  for (j = 0; j < chrFilterSize; j++) {
1118  temp = __lasx_xvldrepl_h((chrFilter + j), 0);
1119  DUP2_ARG2(__lasx_xvldx, chrUSrc[j], n, chrVSrc[j], n, u_src, v_src);
1120  u_src = __lasx_xvpermi_d(u_src, 0xD8);
1121  v_src = __lasx_xvpermi_d(v_src, 0xD8);
1122  uv = __lasx_xvilvl_h(v_src, u_src);
1123  u_ev = __lasx_xvmaddwev_w_h(u_ev, uv, temp);
1124  v_ev = __lasx_xvmaddwod_w_h(v_ev, uv, temp);
1125  }
1126  y_ev = __lasx_xvsrai_w(y_ev, 10);
1127  u_ev = __lasx_xvsrai_w(u_ev, 10);
1128  v_ev = __lasx_xvsrai_w(v_ev, 10);
1129  YUV2RGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
1130  y_temp, v2r, v2g, u2g, u2b);
1131 
1132  if (hasAlpha) {
1133  __m256i a_src, a_ev;
1134 
1135  a_ev = __lasx_xvreplgr2vr_w(a_temp);
1136  for (j = 0; j < lumFilterSize; j++) {
1137  temp = __lasx_xvldrepl_h(lumFilter + j, 0);
1138  a_src = __lasx_xvldx(alpSrc[j], n);
1139  a_src = __lasx_xvpermi_d(a_src, 0xD8);
1140  a_src = __lasx_xvilvl_h(a_src, a_src);
1141  a_ev = __lasx_xvmaddwev_w_h(a_ev, a_src, temp);
1142  }
1143  a_ev = __lasx_xvsrai_w(a_ev, 19);
1144  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
1145  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 1);
1146  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 2);
1147  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 3);
1148  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 4);
1149  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 5);
1150  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 6);
1151  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 7);
1152  } else {
1153  WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
1154  WRITE_FULL(R_ev, G_ev, B_ev, 1, 1);
1155  WRITE_FULL(R_ev, G_ev, B_ev, 2, 2);
1156  WRITE_FULL(R_ev, G_ev, B_ev, 3, 3);
1157  WRITE_FULL(R_ev, G_ev, B_ev, 4, 4);
1158  WRITE_FULL(R_ev, G_ev, B_ev, 5, 5);
1159  WRITE_FULL(R_ev, G_ev, B_ev, 6, 6);
1160  WRITE_FULL(R_ev, G_ev, B_ev, 7, 7);
1161  }
1162  i += 8;
1163  }
1164  for (; i < dstW; i++) {
1165  int Y = templ;
1166  int V, U = V = tempc;
1167 
1168  A = 0;
1169  for (j = 0; j < lumFilterSize; j++) {
1170  Y += lumSrc[j][i] * lumFilter[j];
1171  }
1172  for (j = 0; j < chrFilterSize; j++) {
1173  U += chrUSrc[j][i] * chrFilter[j];
1174  V += chrVSrc[j][i] * chrFilter[j];
1175 
1176  }
1177  Y >>= 10;
1178  U >>= 10;
1179  V >>= 10;
1180  if (hasAlpha) {
1181  A = 1 << 18;
1182  for (j = 0; j < lumFilterSize; j++) {
1183  A += alpSrc[j][i] * lumFilter[j];
1184  }
1185  A >>= 19;
1186  if (A & 0x100)
1187  A = av_clip_uint8(A);
1188  }
1189  Y -= y_offset;
1190  Y *= y_coeff;
1191  Y += ytemp;
1192  R = (unsigned)Y + V * v2r_coe;
1193  G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
1194  B = (unsigned)Y + U * u2b_coe;
1195  yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
1196  dest += step;
1197  }
1198  c->dither_error[0][i] = err[0];
1199  c->dither_error[1][i] = err[1];
1200  c->dither_error[2][i] = err[2];
1201 }
1202 
1203 static void
1205  const int16_t *ubuf[2], const int16_t *vbuf[2],
1206  const int16_t *abuf[2], uint8_t *dest, int dstW,
1207  int yalpha, int uvalpha, int y,
1208  enum AVPixelFormat target, int hasAlpha)
1209 {
1210  const int16_t *buf0 = buf[0], *buf1 = buf[1],
1211  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
1212  *vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
1213  *abuf0 = hasAlpha ? abuf[0] : NULL,
1214  *abuf1 = hasAlpha ? abuf[1] : NULL;
1215  int yalpha1 = 4096 - yalpha;
1216  int uvalpha1 = 4096 - uvalpha;
1217  int uvtemp = 128 << 19;
1218  int atemp = 1 << 18;
1219  int err[4] = {0};
1220  int ytemp = 1 << 21;
1221  int len = dstW - 15;
1222  int i, R, G, B, A;
1223  int step = (target == AV_PIX_FMT_RGB24 ||
1224  target == AV_PIX_FMT_BGR24) ? 3 : 4;
1225  __m256i v_uvalpha1 = __lasx_xvreplgr2vr_w(uvalpha1);
1226  __m256i v_yalpha1 = __lasx_xvreplgr2vr_w(yalpha1);
1227  __m256i v_uvalpha = __lasx_xvreplgr2vr_w(uvalpha);
1228  __m256i v_yalpha = __lasx_xvreplgr2vr_w(yalpha);
1229  __m256i uv = __lasx_xvreplgr2vr_w(uvtemp);
1230  __m256i a_bias = __lasx_xvreplgr2vr_w(atemp);
1231  __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
1233 
1234  av_assert2(yalpha <= 4096U);
1235  av_assert2(uvalpha <= 4096U);
1236 
1237  if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
1238  || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
1239  step = 1;
1240 
1241  for (i = 0; i < len; i += 16) {
1242  __m256i b0, b1, ub0, ub1, vb0, vb1;
1243  __m256i y0_l, y0_h, y1_l, y1_h, u0_l, u0_h;
1244  __m256i v0_l, v0_h, u1_l, u1_h, v1_l, v1_h;
1245  __m256i y_l, y_h, v_l, v_h, u_l, u_h;
1246  __m256i R_l, R_h, G_l, G_h, B_l, B_h;
1247  int n = i << 1;
1248 
1249  DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0,
1250  n, ubuf1, n, b0, b1, ub0, ub1);
1251  DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0 , vb1);
1252  DUP2_ARG2(__lasx_xvsllwil_w_h, b0, 0, b1, 0, y0_l, y1_l);
1253  DUP4_ARG2(__lasx_xvsllwil_w_h, ub0, 0, ub1, 0, vb0, 0, vb1, 0,
1254  u0_l, u1_l, v0_l, v1_l);
1255  DUP2_ARG1(__lasx_xvexth_w_h, b0, b1, y0_h, y1_h);
1256  DUP4_ARG1(__lasx_xvexth_w_h, ub0, ub1, vb0, vb1,
1257  u0_h, u1_h, v0_h, v1_h);
1258  y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
1259  y0_h = __lasx_xvmul_w(y0_h, v_yalpha1);
1260  u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
1261  u0_h = __lasx_xvmul_w(u0_h, v_uvalpha1);
1262  v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
1263  v0_h = __lasx_xvmul_w(v0_h, v_uvalpha1);
1264  y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
1265  y_h = __lasx_xvmadd_w(y0_h, v_yalpha, y1_h);
1266  u_l = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
1267  u_h = __lasx_xvmadd_w(u0_h, v_uvalpha, u1_h);
1268  v_l = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
1269  v_h = __lasx_xvmadd_w(v0_h, v_uvalpha, v1_h);
1270  u_l = __lasx_xvsub_w(u_l, uv);
1271  u_h = __lasx_xvsub_w(u_h, uv);
1272  v_l = __lasx_xvsub_w(v_l, uv);
1273  v_h = __lasx_xvsub_w(v_h, uv);
1274  y_l = __lasx_xvsrai_w(y_l, 10);
1275  y_h = __lasx_xvsrai_w(y_h, 10);
1276  u_l = __lasx_xvsrai_w(u_l, 10);
1277  u_h = __lasx_xvsrai_w(u_h, 10);
1278  v_l = __lasx_xvsrai_w(v_l, 10);
1279  v_h = __lasx_xvsrai_w(v_h, 10);
1280  YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
1281  y_temp, v2r, v2g, u2g, u2b);
1282  YUV2RGB(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
1283  y_temp, v2r, v2g, u2g, u2b);
1284 
1285  if (hasAlpha) {
1286  __m256i a0, a1, a0_l, a0_h;
1287  __m256i a_l, a_h, a1_l, a1_h;
1288 
1289  DUP2_ARG2(__lasx_xvldx, abuf0, n, abuf1, n, a0, a1);
1290  DUP2_ARG2(__lasx_xvsllwil_w_h, a0, 0, a1, 0, a0_l, a1_l);
1291  DUP2_ARG1(__lasx_xvexth_w_h, a0, a1, a0_h, a1_h);
1292  a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1);
1293  a_h = __lasx_xvmadd_w(a_bias, a0_h, v_yalpha1);
1294  a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
1295  a_h = __lasx_xvmadd_w(a_h, v_yalpha, a1_h);
1296  a_l = __lasx_xvsrai_w(a_l, 19);
1297  a_h = __lasx_xvsrai_w(a_h, 19);
1298  WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
1299  WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
1300  WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
1301  WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
1302  WRITE_FULL_A(R_h, G_h, B_h, a_h, 0, 4);
1303  WRITE_FULL_A(R_h, G_h, B_h, a_h, 1, 5);
1304  WRITE_FULL_A(R_h, G_h, B_h, a_h, 2, 6);
1305  WRITE_FULL_A(R_h, G_h, B_h, a_h, 3, 7);
1306  WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 8);
1307  WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 9);
1308  WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 10);
1309  WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 11);
1310  WRITE_FULL_A(R_h, G_h, B_h, a_h, 4, 12);
1311  WRITE_FULL_A(R_h, G_h, B_h, a_h, 5, 13);
1312  WRITE_FULL_A(R_h, G_h, B_h, a_h, 6, 14);
1313  WRITE_FULL_A(R_h, G_h, B_h, a_h, 7, 15);
1314  } else {
1315  WRITE_FULL(R_l, G_l, B_l, 0, 0);
1316  WRITE_FULL(R_l, G_l, B_l, 1, 1);
1317  WRITE_FULL(R_l, G_l, B_l, 2, 2);
1318  WRITE_FULL(R_l, G_l, B_l, 3, 3);
1319  WRITE_FULL(R_h, G_h, B_h, 0, 4);
1320  WRITE_FULL(R_h, G_h, B_h, 1, 5);
1321  WRITE_FULL(R_h, G_h, B_h, 2, 6);
1322  WRITE_FULL(R_h, G_h, B_h, 3, 7);
1323  WRITE_FULL(R_l, G_l, B_l, 4, 8);
1324  WRITE_FULL(R_l, G_l, B_l, 5, 9);
1325  WRITE_FULL(R_l, G_l, B_l, 6, 10);
1326  WRITE_FULL(R_l, G_l, B_l, 7, 11);
1327  WRITE_FULL(R_h, G_h, B_h, 4, 12);
1328  WRITE_FULL(R_h, G_h, B_h, 5, 13);
1329  WRITE_FULL(R_h, G_h, B_h, 6, 14);
1330  WRITE_FULL(R_h, G_h, B_h, 7, 15);
1331  }
1332  }
1333  if (dstW - i >= 8) {
1334  __m256i b0, b1, ub0, ub1, vb0, vb1;
1335  __m256i y0_l, y1_l, u0_l;
1336  __m256i v0_l, u1_l, v1_l;
1337  __m256i y_l, u_l, v_l;
1338  __m256i R_l, G_l, B_l;
1339  int n = i << 1;
1340 
1341  DUP4_ARG2(__lasx_xvldx, buf0, n, buf1, n, ubuf0, n,
1342  ubuf1, n, b0, b1, ub0, ub1);
1343  DUP2_ARG2(__lasx_xvldx, vbuf0, n, vbuf1, n, vb0, vb1);
1344  DUP2_ARG1(__lasx_vext2xv_w_h, b0, b1, y0_l, y1_l);
1345  DUP4_ARG1(__lasx_vext2xv_w_h, ub0, ub1, vb0, vb1,
1346  u0_l, u1_l, v0_l, v1_l);
1347  y0_l = __lasx_xvmul_w(y0_l, v_yalpha1);
1348  u0_l = __lasx_xvmul_w(u0_l, v_uvalpha1);
1349  v0_l = __lasx_xvmul_w(v0_l, v_uvalpha1);
1350  y_l = __lasx_xvmadd_w(y0_l, v_yalpha, y1_l);
1351  u_l = __lasx_xvmadd_w(u0_l, v_uvalpha, u1_l);
1352  v_l = __lasx_xvmadd_w(v0_l, v_uvalpha, v1_l);
1353  u_l = __lasx_xvsub_w(u_l, uv);
1354  v_l = __lasx_xvsub_w(v_l, uv);
1355  y_l = __lasx_xvsrai_w(y_l, 10);
1356  u_l = __lasx_xvsrai_w(u_l, 10);
1357  v_l = __lasx_xvsrai_w(v_l, 10);
1358  YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
1359  y_temp, v2r, v2g, u2g, u2b);
1360 
1361  if (hasAlpha) {
1362  __m256i a0, a1, a0_l;
1363  __m256i a_l, a1_l;
1364 
1365  DUP2_ARG2(__lasx_xvldx, abuf0, n, abuf1, n, a0, a1);
1366  DUP2_ARG1(__lasx_vext2xv_w_h, a0, a1, a0_l, a1_l);
1367  a_l = __lasx_xvmadd_w(a_bias, a0_l, v_yalpha1);
1368  a_l = __lasx_xvmadd_w(a_l, v_yalpha, a1_l);
1369  a_l = __lasx_xvsrai_w(a_l, 19);
1370  WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
1371  WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
1372  WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
1373  WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
1374  WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
1375  WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
1376  WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
1377  WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
1378  } else {
1379  WRITE_FULL(R_l, G_l, B_l, 0, 0);
1380  WRITE_FULL(R_l, G_l, B_l, 1, 1);
1381  WRITE_FULL(R_l, G_l, B_l, 2, 2);
1382  WRITE_FULL(R_l, G_l, B_l, 3, 3);
1383  WRITE_FULL(R_l, G_l, B_l, 4, 4);
1384  WRITE_FULL(R_l, G_l, B_l, 5, 5);
1385  WRITE_FULL(R_l, G_l, B_l, 6, 6);
1386  WRITE_FULL(R_l, G_l, B_l, 7, 7);
1387  }
1388  i += 8;
1389  }
1390  for (; i < dstW; i++){
1391  int Y = ( buf0[i] * yalpha1 + buf1[i] * yalpha ) >> 10;
1392  int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha- uvtemp) >> 10;
1393  int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha- uvtemp) >> 10;
1394 
1395  A = 0;
1396  if (hasAlpha){
1397  A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha + atemp) >> 19;
1398  if (A & 0x100)
1399  A = av_clip_uint8(A);
1400  }
1401 
1402  Y -= y_offset;
1403  Y *= y_coeff;
1404  Y += ytemp;
1405  R = (unsigned)Y + V * v2r_coe;
1406  G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
1407  B = (unsigned)Y + U * u2b_coe;
1408  yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
1409  dest += step;
1410  }
1411  c->dither_error[0][i] = err[0];
1412  c->dither_error[1][i] = err[1];
1413  c->dither_error[2][i] = err[2];
1414 }
1415 
1416 static void
1418  const int16_t *ubuf[2], const int16_t *vbuf[2],
1419  const int16_t *abuf0, uint8_t *dest, int dstW,
1420  int uvalpha, int y, enum AVPixelFormat target,
1421  int hasAlpha)
1422 {
1423  const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
1424  int i, B, G, R, A;
1425  int step = (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) ? 3 : 4;
1426  int err[4] = {0};
1427  int ytemp = 1 << 21;
1428  int bias_int = 64;
1429  int len = dstW - 15;
1430  __m256i y_temp = __lasx_xvreplgr2vr_w(ytemp);
1432 
1433  if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
1434  || target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
1435  step = 1;
1436  if (uvalpha < 2048) {
1437  int uvtemp = 128 << 7;
1438  __m256i uv = __lasx_xvreplgr2vr_w(uvtemp);
1439  __m256i bias = __lasx_xvreplgr2vr_w(bias_int);
1440 
1441  for (i = 0; i < len; i += 16) {
1442  __m256i b, ub, vb, ub_l, ub_h, vb_l, vb_h;
1443  __m256i y_l, y_h, u_l, u_h, v_l, v_h;
1444  __m256i R_l, R_h, G_l, G_h, B_l, B_h;
1445  int n = i << 1;
1446 
1447  DUP2_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, b, ub);
1448  vb = __lasx_xvldx(vbuf0, n);
1449  y_l = __lasx_xvsllwil_w_h(b, 2);
1450  y_h = __lasx_xvexth_w_h(b);
1451  DUP2_ARG2(__lasx_xvsllwil_w_h, ub, 0, vb, 0, ub_l, vb_l);
1452  DUP2_ARG1(__lasx_xvexth_w_h, ub, vb, ub_h, vb_h);
1453  y_h = __lasx_xvslli_w(y_h, 2);
1454  u_l = __lasx_xvsub_w(ub_l, uv);
1455  u_h = __lasx_xvsub_w(ub_h, uv);
1456  v_l = __lasx_xvsub_w(vb_l, uv);
1457  v_h = __lasx_xvsub_w(vb_h, uv);
1458  u_l = __lasx_xvslli_w(u_l, 2);
1459  u_h = __lasx_xvslli_w(u_h, 2);
1460  v_l = __lasx_xvslli_w(v_l, 2);
1461  v_h = __lasx_xvslli_w(v_h, 2);
1462  YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
1463  y_temp, v2r, v2g, u2g, u2b);
1464  YUV2RGB(y_h, u_h, v_h, R_h, G_h, B_h, offset, coeff,
1465  y_temp, v2r, v2g, u2g, u2b);
1466 
1467  if(hasAlpha) {
1468  __m256i a_src;
1469  __m256i a_l, a_h;
1470 
1471  a_src = __lasx_xvld(abuf0 + i, 0);
1472  a_l = __lasx_xvsllwil_w_h(a_src, 0);
1473  a_h = __lasx_xvexth_w_h(a_src);
1474  a_l = __lasx_xvadd_w(a_l, bias);
1475  a_h = __lasx_xvadd_w(a_h, bias);
1476  a_l = __lasx_xvsrai_w(a_l, 7);
1477  a_h = __lasx_xvsrai_w(a_h, 7);
1478  WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
1479  WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
1480  WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
1481  WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
1482  WRITE_FULL_A(R_h, G_h, B_h, a_h, 0, 4);
1483  WRITE_FULL_A(R_h, G_h, B_h, a_h, 1, 5);
1484  WRITE_FULL_A(R_h, G_h, B_h, a_h, 2, 6);
1485  WRITE_FULL_A(R_h, G_h, B_h, a_h, 3, 7);
1486  WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 8);
1487  WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 9);
1488  WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 10);
1489  WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 11);
1490  WRITE_FULL_A(R_h, G_h, B_h, a_h, 4, 12);
1491  WRITE_FULL_A(R_h, G_h, B_h, a_h, 5, 13);
1492  WRITE_FULL_A(R_h, G_h, B_h, a_h, 6, 14);
1493  WRITE_FULL_A(R_h, G_h, B_h, a_h, 7, 15);
1494  } else {
1495  WRITE_FULL(R_l, G_l, B_l, 0, 0);
1496  WRITE_FULL(R_l, G_l, B_l, 1, 1);
1497  WRITE_FULL(R_l, G_l, B_l, 2, 2);
1498  WRITE_FULL(R_l, G_l, B_l, 3, 3);
1499  WRITE_FULL(R_h, G_h, B_h, 0, 4);
1500  WRITE_FULL(R_h, G_h, B_h, 1, 5);
1501  WRITE_FULL(R_h, G_h, B_h, 2, 6);
1502  WRITE_FULL(R_h, G_h, B_h, 3, 7);
1503  WRITE_FULL(R_l, G_l, B_l, 4, 8);
1504  WRITE_FULL(R_l, G_l, B_l, 5, 9);
1505  WRITE_FULL(R_l, G_l, B_l, 6, 10);
1506  WRITE_FULL(R_l, G_l, B_l, 7, 11);
1507  WRITE_FULL(R_h, G_h, B_h, 4, 12);
1508  WRITE_FULL(R_h, G_h, B_h, 5, 13);
1509  WRITE_FULL(R_h, G_h, B_h, 6, 14);
1510  WRITE_FULL(R_h, G_h, B_h, 7, 15);
1511  }
1512  }
1513  if (dstW - i >= 8) {
1514  __m256i b, ub, vb, ub_l, vb_l;
1515  __m256i y_l, u_l, v_l;
1516  __m256i R_l, G_l, B_l;
1517  int n = i << 1;
1518 
1519  DUP2_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, b, ub);
1520  vb = __lasx_xvldx(vbuf0, n);
1521  y_l = __lasx_vext2xv_w_h(b);
1522  DUP2_ARG1(__lasx_vext2xv_w_h, ub, vb, ub_l, vb_l);
1523  y_l = __lasx_xvslli_w(y_l, 2);
1524  u_l = __lasx_xvsub_w(ub_l, uv);
1525  v_l = __lasx_xvsub_w(vb_l, uv);
1526  u_l = __lasx_xvslli_w(u_l, 2);
1527  v_l = __lasx_xvslli_w(v_l, 2);
1528  YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
1529  y_temp, v2r, v2g, u2g, u2b);
1530 
1531  if(hasAlpha) {
1532  __m256i a_src, a_l;
1533 
1534  a_src = __lasx_xvldx(abuf0, n);
1535  a_src = __lasx_vext2xv_w_h(a_src);
1536  a_l = __lasx_xvadd_w(bias, a_src);
1537  a_l = __lasx_xvsrai_w(a_l, 7);
1538  WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
1539  WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
1540  WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
1541  WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
1542  WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
1543  WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
1544  WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
1545  WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
1546  } else {
1547  WRITE_FULL(R_l, G_l, B_l, 0, 0);
1548  WRITE_FULL(R_l, G_l, B_l, 1, 1);
1549  WRITE_FULL(R_l, G_l, B_l, 2, 2);
1550  WRITE_FULL(R_l, G_l, B_l, 3, 3);
1551  WRITE_FULL(R_l, G_l, B_l, 4, 4);
1552  WRITE_FULL(R_l, G_l, B_l, 5, 5);
1553  WRITE_FULL(R_l, G_l, B_l, 6, 6);
1554  WRITE_FULL(R_l, G_l, B_l, 7, 7);
1555  }
1556  i += 8;
1557  }
1558  for (; i < dstW; i++) {
1559  int Y = buf0[i] << 2;
1560  int U = (ubuf0[i] - uvtemp) << 2;
1561  int V = (vbuf0[i] - uvtemp) << 2;
1562 
1563  A = 0;
1564  if(hasAlpha) {
1565  A = (abuf0[i] + 64) >> 7;
1566  if (A & 0x100)
1567  A = av_clip_uint8(A);
1568  }
1569  Y -= y_offset;
1570  Y *= y_coeff;
1571  Y += ytemp;
1572  R = (unsigned)Y + V * v2r_coe;
1573  G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
1574  B = (unsigned)Y + U * u2b_coe;
1575  yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
1576  dest += step;
1577  }
1578  } else {
1579  const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
1580  int uvtemp = 128 << 8;
1581  __m256i uv = __lasx_xvreplgr2vr_w(uvtemp);
1582  __m256i zero = __lasx_xvldi(0);
1583  __m256i bias = __lasx_xvreplgr2vr_h(bias_int);
1584 
1585  for (i = 0; i < len; i += 16) {
1586  __m256i b, ub0, ub1, vb0, vb1;
1587  __m256i y_ev, y_od, u_ev, u_od, v_ev, v_od;
1588  __m256i R_ev, R_od, G_ev, G_od, B_ev, B_od;
1589  int n = i << 1;
1590 
1591  DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
1592  ubuf1, n, b, ub0, vb0, ub1);
1593  vb1 = __lasx_xvldx(vbuf, n);
1594  y_ev = __lasx_xvaddwev_w_h(b, zero);
1595  y_od = __lasx_xvaddwod_w_h(b, zero);
1596  DUP2_ARG2(__lasx_xvaddwev_w_h, ub0, vb0, ub1, vb1, u_ev, v_ev);
1597  DUP2_ARG2(__lasx_xvaddwod_w_h, ub0, vb0, ub1, vb1, u_od, v_od);
1598  DUP2_ARG2(__lasx_xvslli_w, y_ev, 2, y_od, 2, y_ev, y_od);
1599  DUP4_ARG2(__lasx_xvsub_w, u_ev, uv, u_od, uv, v_ev, uv, v_od, uv,
1600  u_ev, u_od, v_ev, v_od);
1601  DUP4_ARG2(__lasx_xvslli_w, u_ev, 1, u_od, 1, v_ev, 1, v_od, 1,
1602  u_ev, u_od, v_ev, v_od);
1603  YUV2RGB(y_ev, u_ev, v_ev, R_ev, G_ev, B_ev, offset, coeff,
1604  y_temp, v2r, v2g, u2g, u2b);
1605  YUV2RGB(y_od, u_od, v_od, R_od, G_od, B_od, offset, coeff,
1606  y_temp, v2r, v2g, u2g, u2b);
1607 
1608  if(hasAlpha) {
1609  __m256i a_src;
1610  __m256i a_ev, a_od;
1611 
1612  a_src = __lasx_xvld(abuf0 + i, 0);
1613  a_ev = __lasx_xvaddwev_w_h(bias, a_src);
1614  a_od = __lasx_xvaddwod_w_h(bias, a_src);
1615  a_ev = __lasx_xvsrai_w(a_ev, 7);
1616  a_od = __lasx_xvsrai_w(a_od, 7);
1617  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 0, 0);
1618  WRITE_FULL_A(R_od, G_od, B_od, a_od, 0, 1);
1619  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 1, 2);
1620  WRITE_FULL_A(R_od, G_od, B_od, a_od, 1, 3);
1621  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 2, 4);
1622  WRITE_FULL_A(R_od, G_od, B_od, a_od, 2, 5);
1623  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 3, 6);
1624  WRITE_FULL_A(R_od, G_od, B_od, a_od, 3, 7);
1625  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 4, 8);
1626  WRITE_FULL_A(R_od, G_od, B_od, a_od, 4, 9);
1627  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 5, 10);
1628  WRITE_FULL_A(R_od, G_od, B_od, a_od, 5, 11);
1629  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 6, 12);
1630  WRITE_FULL_A(R_od, G_od, B_od, a_od, 6, 13);
1631  WRITE_FULL_A(R_ev, G_ev, B_ev, a_ev, 7, 14);
1632  WRITE_FULL_A(R_od, G_od, B_od, a_od, 7, 15);
1633  } else {
1634  WRITE_FULL(R_ev, G_ev, B_ev, 0, 0);
1635  WRITE_FULL(R_od, G_od, B_od, 0, 1);
1636  WRITE_FULL(R_ev, G_ev, B_ev, 1, 2);
1637  WRITE_FULL(R_od, G_od, B_od, 1, 3);
1638  WRITE_FULL(R_ev, G_ev, B_ev, 2, 4);
1639  WRITE_FULL(R_od, G_od, B_od, 2, 5);
1640  WRITE_FULL(R_ev, G_ev, B_ev, 3, 6);
1641  WRITE_FULL(R_od, G_od, B_od, 3, 7);
1642  WRITE_FULL(R_ev, G_ev, B_ev, 4, 8);
1643  WRITE_FULL(R_od, G_od, B_od, 4, 9);
1644  WRITE_FULL(R_ev, G_ev, B_ev, 5, 10);
1645  WRITE_FULL(R_od, G_od, B_od, 5, 11);
1646  WRITE_FULL(R_ev, G_ev, B_ev, 6, 12);
1647  WRITE_FULL(R_od, G_od, B_od, 6, 13);
1648  WRITE_FULL(R_ev, G_ev, B_ev, 7, 14);
1649  WRITE_FULL(R_od, G_od, B_od, 7, 15);
1650  }
1651  }
1652  if (dstW - i >= 8) {
1653  __m256i b, ub0, ub1, vb0, vb1;
1654  __m256i y_l, u_l, v_l;
1655  __m256i R_l, G_l, B_l;
1656  int n = i << 1;
1657 
1658  DUP4_ARG2(__lasx_xvldx, buf0, n, ubuf0, n, vbuf0, n,
1659  ubuf1, n, b, ub0, vb0, ub1);
1660  vb1 = __lasx_xvldx(vbuf1, n);
1661  y_l = __lasx_vext2xv_w_h(b);
1662  y_l = __lasx_xvslli_w(y_l, 2);
1663  DUP4_ARG1(__lasx_vext2xv_w_h, ub0, vb0, ub1, vb1,
1664  ub0, vb0, ub1, vb1);
1665  DUP2_ARG2(__lasx_xvadd_w, ub0, ub1, vb0, vb1, u_l, v_l);
1666  u_l = __lasx_xvsub_w(u_l, uv);
1667  v_l = __lasx_xvsub_w(v_l, uv);
1668  u_l = __lasx_xvslli_w(u_l, 1);
1669  v_l = __lasx_xvslli_w(v_l, 1);
1670  YUV2RGB(y_l, u_l, v_l, R_l, G_l, B_l, offset, coeff,
1671  y_temp, v2r, v2g, u2g, u2b);
1672 
1673  if(hasAlpha) {
1674  __m256i a_src;
1675  __m256i a_l;
1676 
1677  a_src = __lasx_xvld(abuf0 + i, 0);
1678  a_src = __lasx_xvpermi_d(a_src, 0xD8);
1679  a_src = __lasx_xvilvl_h(a_src, a_src);
1680  a_l = __lasx_xvaddwev_w_h(bias, a_src);
1681  a_l = __lasx_xvsrai_w(a_l, 7);
1682  WRITE_FULL_A(R_l, G_l, B_l, a_l, 0, 0);
1683  WRITE_FULL_A(R_l, G_l, B_l, a_l, 1, 1);
1684  WRITE_FULL_A(R_l, G_l, B_l, a_l, 2, 2);
1685  WRITE_FULL_A(R_l, G_l, B_l, a_l, 3, 3);
1686  WRITE_FULL_A(R_l, G_l, B_l, a_l, 4, 4);
1687  WRITE_FULL_A(R_l, G_l, B_l, a_l, 5, 5);
1688  WRITE_FULL_A(R_l, G_l, B_l, a_l, 6, 6);
1689  WRITE_FULL_A(R_l, G_l, B_l, a_l, 7, 7);
1690  } else {
1691  WRITE_FULL(R_l, G_l, B_l, 0, 0);
1692  WRITE_FULL(R_l, G_l, B_l, 1, 1);
1693  WRITE_FULL(R_l, G_l, B_l, 2, 2);
1694  WRITE_FULL(R_l, G_l, B_l, 3, 3);
1695  WRITE_FULL(R_l, G_l, B_l, 4, 4);
1696  WRITE_FULL(R_l, G_l, B_l, 5, 5);
1697  WRITE_FULL(R_l, G_l, B_l, 6, 6);
1698  WRITE_FULL(R_l, G_l, B_l, 7, 7);
1699  }
1700  i += 8;
1701  }
1702  for (; i < dstW; i++) {
1703  int Y = buf0[i] << 2;
1704  int U = (ubuf0[i] + ubuf1[i] - uvtemp) << 1;
1705  int V = (vbuf0[i] + vbuf1[i] - uvtemp) << 1;
1706 
1707  A = 0;
1708  if(hasAlpha) {
1709  A = (abuf0[i] + 64) >> 7;
1710  if (A & 0x100)
1711  A = av_clip_uint8(A);
1712  }
1713  Y -= y_offset;
1714  Y *= y_coeff;
1715  Y += ytemp;
1716  R = (unsigned)Y + V * v2r_coe;
1717  G = (unsigned)Y + V * v2g_coe + U * u2g_coe;
1718  B = (unsigned)Y + U * u2b_coe;
1719  yuv2rgb_write_full(c, dest, i, R, A, G, B, y, target, hasAlpha, err);
1720  dest += step;
1721  }
1722  }
1723  c->dither_error[0][i] = err[0];
1724  c->dither_error[1][i] = err[1];
1725  c->dither_error[2][i] = err[2];
1726 }
/*
 * Instantiate the full-chroma ("_full") packed RGB output entry points with
 * YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha), all on the rgb_full base.
 * The macro is defined earlier in this file; presumably each use yields the
 * yuv2<ext>_X_lasx / _2_lasx / _1_lasx functions that
 * ff_sws_init_output_lasx() installs -- confirm against the macro body.
 */
1727 #if CONFIG_SMALL
/*
 * Small build: a single variant per 32-bit format. hasAlpha is passed as an
 * expression on c->needAlpha instead of a constant, so the same function
 * serves both the alpha and the no-alpha case.
 */
1728 YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA,
1729  CONFIG_SWSCALE_ALPHA && c->needAlpha)
1730 YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR,
1731  CONFIG_SWSCALE_ALPHA && c->needAlpha)
1732 YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA,
1733  CONFIG_SWSCALE_ALPHA && c->needAlpha)
1734 YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB,
1735  CONFIG_SWSCALE_ALPHA && c->needAlpha)
1736 #else
/*
 * Regular build: separate variants with a constant hasAlpha. The variants
 * with hasAlpha = 1 are only generated when CONFIG_SWSCALE_ALPHA is enabled;
 * the "x" variants (hasAlpha = 0) are always generated.
 */
1737 #if CONFIG_SWSCALE_ALPHA
1738 YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA, 1)
1739 YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR, 1)
1740 YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA, 1)
1741 YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB, 1)
1742 #endif
1743 YUV2RGBWRAPPER(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
1744 YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
1745 YUV2RGBWRAPPER(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
1746 YUV2RGBWRAPPER(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
1747 #endif
/* 24-bit packed formats: always instantiated with hasAlpha = 0. */
1748 YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
1749 YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
1750 
/* Low-depth 8bpp packed formats: always instantiated with hasAlpha = 0. */
1751 YUV2RGBWRAPPER(yuv2, rgb_full, bgr4_byte_full, AV_PIX_FMT_BGR4_BYTE, 0)
1752 YUV2RGBWRAPPER(yuv2, rgb_full, rgb4_byte_full, AV_PIX_FMT_RGB4_BYTE, 0)
1753 YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
1754 YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
1755 
1756 
1758  yuv2planar1_fn *yuv2plane1,
1760  yuv2interleavedX_fn *yuv2nv12cX,
1761  yuv2packed1_fn *yuv2packed1,
1762  yuv2packed2_fn *yuv2packed2,
1763  yuv2packedX_fn *yuv2packedX,
1764  yuv2anyX_fn *yuv2anyX)
1765 {
1766  enum AVPixelFormat dstFormat = c->opts.dst_format;
1767 
1768  /* Add initialization once optimized */
1769  if (isSemiPlanarYUV(dstFormat) && isDataInHighBits(dstFormat)) {
1770  } else if (is16BPS(dstFormat)) {
1771  } else if (isNBPS(dstFormat)) {
1772  } else if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
1773  } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
1774  } else {
1775  *yuv2plane1 = yuv2plane1_8_lasx;
1777  }
1778 
1779  if(c->opts.flags & SWS_FULL_CHR_H_INT) {
1780  switch (c->opts.dst_format) {
1781  case AV_PIX_FMT_RGBA:
1782 #if CONFIG_SMALL
1783  c->yuv2packedX = yuv2rgba32_full_X_lasx;
1784  c->yuv2packed2 = yuv2rgba32_full_2_lasx;
1785  c->yuv2packed1 = yuv2rgba32_full_1_lasx;
1786 #else
1787 #if CONFIG_SWSCALE_ALPHA
1788  if (c->needAlpha) {
1789  c->yuv2packedX = yuv2rgba32_full_X_lasx;
1790  c->yuv2packed2 = yuv2rgba32_full_2_lasx;
1791  c->yuv2packed1 = yuv2rgba32_full_1_lasx;
1792  } else
1793 #endif /* CONFIG_SWSCALE_ALPHA */
1794  {
1795  c->yuv2packedX = yuv2rgbx32_full_X_lasx;
1796  c->yuv2packed2 = yuv2rgbx32_full_2_lasx;
1797  c->yuv2packed1 = yuv2rgbx32_full_1_lasx;
1798  }
1799 #endif /* !CONFIG_SMALL */
1800  break;
1801  case AV_PIX_FMT_ARGB:
1802 #if CONFIG_SMALL
1803  c->yuv2packedX = yuv2argb32_full_X_lasx;
1804  c->yuv2packed2 = yuv2argb32_full_2_lasx;
1805  c->yuv2packed1 = yuv2argb32_full_1_lasx;
1806 #else
1807 #if CONFIG_SWSCALE_ALPHA
1808  if (c->needAlpha) {
1809  c->yuv2packedX = yuv2argb32_full_X_lasx;
1810  c->yuv2packed2 = yuv2argb32_full_2_lasx;
1811  c->yuv2packed1 = yuv2argb32_full_1_lasx;
1812  } else
1813 #endif /* CONFIG_SWSCALE_ALPHA */
1814  {
1815  c->yuv2packedX = yuv2xrgb32_full_X_lasx;
1816  c->yuv2packed2 = yuv2xrgb32_full_2_lasx;
1817  c->yuv2packed1 = yuv2xrgb32_full_1_lasx;
1818  }
1819 #endif /* !CONFIG_SMALL */
1820  break;
1821  case AV_PIX_FMT_BGRA:
1822 #if CONFIG_SMALL
1823  c->yuv2packedX = yuv2bgra32_full_X_lasx;
1824  c->yuv2packed2 = yuv2bgra32_full_2_lasx;
1825  c->yuv2packed1 = yuv2bgra32_full_1_lasx;
1826 #else
1827 #if CONFIG_SWSCALE_ALPHA
1828  if (c->needAlpha) {
1829  c->yuv2packedX = yuv2bgra32_full_X_lasx;
1830  c->yuv2packed2 = yuv2bgra32_full_2_lasx;
1831  c->yuv2packed1 = yuv2bgra32_full_1_lasx;
1832  } else
1833 #endif /* CONFIG_SWSCALE_ALPHA */
1834  {
1835  c->yuv2packedX = yuv2bgrx32_full_X_lasx;
1836  c->yuv2packed2 = yuv2bgrx32_full_2_lasx;
1837  c->yuv2packed1 = yuv2bgrx32_full_1_lasx;
1838  }
1839 #endif /* !CONFIG_SMALL */
1840  break;
1841  case AV_PIX_FMT_ABGR:
1842 #if CONFIG_SMALL
1843  c->yuv2packedX = yuv2abgr32_full_X_lasx;
1844  c->yuv2packed2 = yuv2abgr32_full_2_lasx;
1845  c->yuv2packed1 = yuv2abgr32_full_1_lasx;
1846 #else
1847 #if CONFIG_SWSCALE_ALPHA
1848  if (c->needAlpha) {
1849  c->yuv2packedX = yuv2abgr32_full_X_lasx;
1850  c->yuv2packed2 = yuv2abgr32_full_2_lasx;
1851  c->yuv2packed1 = yuv2abgr32_full_1_lasx;
1852  } else
1853 #endif /* CONFIG_SWSCALE_ALPHA */
1854  {
1855  c->yuv2packedX = yuv2xbgr32_full_X_lasx;
1856  c->yuv2packed2 = yuv2xbgr32_full_2_lasx;
1857  c->yuv2packed1 = yuv2xbgr32_full_1_lasx;
1858  }
1859 #endif /* !CONFIG_SMALL */
1860  break;
1861  case AV_PIX_FMT_RGB24:
1862  c->yuv2packedX = yuv2rgb24_full_X_lasx;
1863  c->yuv2packed2 = yuv2rgb24_full_2_lasx;
1864  c->yuv2packed1 = yuv2rgb24_full_1_lasx;
1865  break;
1866  case AV_PIX_FMT_BGR24:
1867  c->yuv2packedX = yuv2bgr24_full_X_lasx;
1868  c->yuv2packed2 = yuv2bgr24_full_2_lasx;
1869  c->yuv2packed1 = yuv2bgr24_full_1_lasx;
1870  break;
1871  case AV_PIX_FMT_BGR4_BYTE:
1872  c->yuv2packedX = yuv2bgr4_byte_full_X_lasx;
1873  c->yuv2packed2 = yuv2bgr4_byte_full_2_lasx;
1874  c->yuv2packed1 = yuv2bgr4_byte_full_1_lasx;
1875  break;
1876  case AV_PIX_FMT_RGB4_BYTE:
1877  c->yuv2packedX = yuv2rgb4_byte_full_X_lasx;
1878  c->yuv2packed2 = yuv2rgb4_byte_full_2_lasx;
1879  c->yuv2packed1 = yuv2rgb4_byte_full_1_lasx;
1880  break;
1881  case AV_PIX_FMT_BGR8:
1882  c->yuv2packedX = yuv2bgr8_full_X_lasx;
1883  c->yuv2packed2 = yuv2bgr8_full_2_lasx;
1884  c->yuv2packed1 = yuv2bgr8_full_1_lasx;
1885  break;
1886  case AV_PIX_FMT_RGB8:
1887  c->yuv2packedX = yuv2rgb8_full_X_lasx;
1888  c->yuv2packed2 = yuv2rgb8_full_2_lasx;
1889  c->yuv2packed1 = yuv2rgb8_full_1_lasx;
1890  break;
1891  }
1892  } else {
1893  switch (c->opts.dst_format) {
1894  case AV_PIX_FMT_RGB32:
1895  case AV_PIX_FMT_BGR32:
1896 #if CONFIG_SMALL
1897 #else
1898 #if CONFIG_SWSCALE_ALPHA
1899  if (c->needAlpha) {
1900  } else
1901 #endif /* CONFIG_SWSCALE_ALPHA */
1902  {
1903  c->yuv2packed1 = yuv2rgbx32_1_lasx;
1904  c->yuv2packed2 = yuv2rgbx32_2_lasx;
1905  c->yuv2packedX = yuv2rgbx32_X_lasx;
1906  }
1907 #endif /* !CONFIG_SMALL */
1908  break;
1909  case AV_PIX_FMT_RGB32_1:
1910  case AV_PIX_FMT_BGR32_1:
1911 #if CONFIG_SMALL
1912 #else
1913 #if CONFIG_SWSCALE_ALPHA
1914  if (c->needAlpha) {
1915  } else
1916 #endif /* CONFIG_SWSCALE_ALPHA */
1917  {
1918  c->yuv2packed1 = yuv2rgbx32_1_1_lasx;
1919  c->yuv2packed2 = yuv2rgbx32_1_2_lasx;
1920  c->yuv2packedX = yuv2rgbx32_1_X_lasx;
1921  }
1922 #endif /* !CONFIG_SMALL */
1923  break;
1924  case AV_PIX_FMT_RGB24:
1925  c->yuv2packed1 = yuv2rgb24_1_lasx;
1926  c->yuv2packed2 = yuv2rgb24_2_lasx;
1927  c->yuv2packedX = yuv2rgb24_X_lasx;
1928  break;
1929  case AV_PIX_FMT_BGR24:
1930  c->yuv2packed1 = yuv2bgr24_1_lasx;
1931  c->yuv2packed2 = yuv2bgr24_2_lasx;
1932  c->yuv2packedX = yuv2bgr24_X_lasx;
1933  break;
1934  case AV_PIX_FMT_RGB565LE:
1935  case AV_PIX_FMT_RGB565BE:
1936  case AV_PIX_FMT_BGR565LE:
1937  case AV_PIX_FMT_BGR565BE:
1938  c->yuv2packed1 = yuv2rgb16_1_lasx;
1939  c->yuv2packed2 = yuv2rgb16_2_lasx;
1940  c->yuv2packedX = yuv2rgb16_X_lasx;
1941  break;
1942  case AV_PIX_FMT_RGB555LE:
1943  case AV_PIX_FMT_RGB555BE:
1944  case AV_PIX_FMT_BGR555LE:
1945  case AV_PIX_FMT_BGR555BE:
1946  c->yuv2packed1 = yuv2rgb15_1_lasx;
1947  c->yuv2packed2 = yuv2rgb15_2_lasx;
1948  c->yuv2packedX = yuv2rgb15_X_lasx;
1949  break;
1950  case AV_PIX_FMT_RGB444LE:
1951  case AV_PIX_FMT_RGB444BE:
1952  case AV_PIX_FMT_BGR444LE:
1953  case AV_PIX_FMT_BGR444BE:
1954  c->yuv2packed1 = yuv2rgb12_1_lasx;
1955  c->yuv2packed2 = yuv2rgb12_2_lasx;
1956  c->yuv2packedX = yuv2rgb12_X_lasx;
1957  break;
1958  case AV_PIX_FMT_RGB8:
1959  case AV_PIX_FMT_BGR8:
1960  c->yuv2packed1 = yuv2rgb8_1_lasx;
1961  c->yuv2packed2 = yuv2rgb8_2_lasx;
1962  c->yuv2packedX = yuv2rgb8_X_lasx;
1963  break;
1964  case AV_PIX_FMT_RGB4:
1965  case AV_PIX_FMT_BGR4:
1966  c->yuv2packed1 = yuv2rgb4_1_lasx;
1967  c->yuv2packed2 = yuv2rgb4_2_lasx;
1968  c->yuv2packedX = yuv2rgb4_X_lasx;
1969  break;
1970  case AV_PIX_FMT_RGB4_BYTE:
1971  case AV_PIX_FMT_BGR4_BYTE:
1972  c->yuv2packed1 = yuv2rgb4b_1_lasx;
1973  c->yuv2packed2 = yuv2rgb4b_2_lasx;
1974  c->yuv2packedX = yuv2rgb4b_X_lasx;
1975  break;
1976  }
1977  }
1978 }
A
#define A(x)
Definition: vpx_arith.h:28
yuv2planar1_fn
void(* yuv2planar1_fn)(const int16_t *src, uint8_t *dest, int dstW, const uint8_t *dither, int offset)
Write one line of horizontally scaled data to planar output without any additional vertical scaling (...
Definition: swscale_internal.h:108
AVPixelFormat
AVPixelFormat
Pixel format.
Definition: pixfmt.h:71
SWS_DITHER_AUTO
@ SWS_DITHER_AUTO
Definition: swscale.h:81
av_clip
#define av_clip
Definition: common.h:100
ff_dither_4x4_16
const uint8_t ff_dither_4x4_16[][8]
Definition: output.c:51
yuv2rgb_write_full
static av_always_inline void yuv2rgb_write_full(SwsInternal *c, uint8_t *dest, int i, int R, int A, int G, int B, int y, enum AVPixelFormat target, int hasAlpha, int err[4])
Definition: output_lasx.c:816
r
const char * r
Definition: vf_curves.c:127
AV_PIX_FMT_BGR32
#define AV_PIX_FMT_BGR32
Definition: pixfmt.h:485
AV_PIX_FMT_RGB444LE
@ AV_PIX_FMT_RGB444LE
packed RGB 4:4:4, 16bpp, (msb)4X 4R 4G 4B(lsb), little-endian, X=unused/undefined
Definition: pixfmt.h:136
u
#define u(width, name, range_min, range_max)
Definition: cbs_h2645.c:251
ff_dither_8x8_32
const uint8_t ff_dither_8x8_32[][8]
Definition: output.c:59
av_clip_uintp2
#define av_clip_uintp2
Definition: common.h:124
WRITE_FULL_A
#define WRITE_FULL_A(r, g, b, a, t1, s)
Definition: output_lasx.c:975
yuv2rgb_write
static av_always_inline void yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2, unsigned A1, unsigned A2, const void *_r, const void *_g, const void *_b, int y, enum AVPixelFormat target, int hasAlpha)
Definition: output_lasx.c:107
mask
int mask
Definition: mediacodecdec_common.c:154
step
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But a word about which is also called distortion Distortion can be quantified by almost any quality measurement one chooses the sum of squared differences is used but more complex methods that consider psychovisual effects can be used as well It makes no difference in this discussion First step
Definition: rate_distortion.txt:58
b
#define b
Definition: input.c:41
yuv2planeX
static void FUNC() yuv2planeX(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset)
Definition: swscale_ppc_template.c:84
R
#define R
Definition: huffyuv.h:44
AV_PIX_FMT_RGB32_1
#define AV_PIX_FMT_RGB32_1
Definition: pixfmt.h:484
filter
void(* filter)(uint8_t *src, int stride, int qscale)
Definition: h263dsp.c:29
AV_PIX_FMT_BGR24
@ AV_PIX_FMT_BGR24
packed RGB 8:8:8, 24bpp, BGRBGR...
Definition: pixfmt.h:76
AV_PIX_FMT_BGRA
@ AV_PIX_FMT_BGRA
packed BGRA 8:8:8:8, 32bpp, BGRABGRA...
Definition: pixfmt.h:102
DUP2_ARG2
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1)
Definition: loongson_intrinsics.h:58
A2
@ A2
Definition: mvs.c:525
AV_PIX_FMT_GRAYF32LE
@ AV_PIX_FMT_GRAYF32LE
IEEE-754 single precision Y, 32bpp, little-endian.
Definition: pixfmt.h:364
AV_PIX_FMT_RGB555BE
@ AV_PIX_FMT_RGB555BE
packed RGB 5:5:5, 16bpp, (msb)1X 5R 5G 5B(lsb), big-endian , X=unused/undefined
Definition: pixfmt.h:114
yuv2rgb_1_template_lasx
static void yuv2rgb_1_template_lasx(SwsInternal *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y, enum AVPixelFormat target, int hasAlpha)
Definition: output_lasx.c:628
is16BPS
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:727
rgb
Definition: rpzaenc.c:60
b1
static double b1(void *priv, double x, double y)
Definition: vf_xfade.c:2034
YUV2RGBWRAPPER
#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha)
Definition: output_lasx.c:787
ub
#define ub(width, name)
Definition: cbs_h2645.c:401
swscale_loongarch.h
val
static double val(void *priv, double ch)
Definition: aeval.c:77
isNBPS
static av_always_inline int isNBPS(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:741
SWS_DITHER_X_DITHER
@ SWS_DITHER_X_DITHER
Definition: swscale.h:85
WRITE_FULL
#define WRITE_FULL(r, g, b, t1, s)
Definition: output_lasx.c:987
AV_PIX_FMT_BGR8
@ AV_PIX_FMT_BGR8
packed RGB 3:3:2, 8bpp, (msb)2B 3G 3R(lsb)
Definition: pixfmt.h:90
yuv2rgb_full_1_template_lasx
static void yuv2rgb_full_1_template_lasx(SwsInternal *c, const int16_t *buf0, const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf0, uint8_t *dest, int dstW, int uvalpha, int y, enum AVPixelFormat target, int hasAlpha)
Definition: output_lasx.c:1417
av_cold
#define av_cold
Definition: attributes.h:90
YUVRGB_TABLE_HEADROOM
#define YUVRGB_TABLE_HEADROOM
Definition: swscale_internal.h:47
DUP4_ARG2
#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, _OUT1, _OUT2, _OUT3)
Definition: loongson_intrinsics.h:76
yuv2packed2_fn
void(* yuv2packed2_fn)(SwsInternal *c, const int16_t *lumSrc[2], const int16_t *chrUSrc[2], const int16_t *chrVSrc[2], const int16_t *alpSrc[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y)
Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB output by doing bilinear scalin...
Definition: swscale_internal.h:214
g
const char * g
Definition: vf_curves.c:128
B
#define B
Definition: huffyuv.h:42
ff_dither_2x2_4
const uint8_t ff_dither_2x2_4[][8]
Definition: output.c:39
ff_dither_8x8_220
const uint8_t ff_dither_8x8_220[][8]
Definition: output.c:84
AV_PIX_FMT_RGB4
@ AV_PIX_FMT_RGB4
packed RGB 1:2:1 bitstream, 4bpp, (msb)1R 2G 1B(lsb), a byte contains two pixels, the first pixel in ...
Definition: pixfmt.h:94
AV_PIX_FMT_BGR32_1
#define AV_PIX_FMT_BGR32_1
Definition: pixfmt.h:486
AV_PIX_FMT_RGBA
@ AV_PIX_FMT_RGBA
packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
Definition: pixfmt.h:100
YUV2RGB_SETUP
#define YUV2RGB_SETUP
Definition: output_lasx.c:948
isSemiPlanarYUV
static av_always_inline int isSemiPlanarYUV(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:773
b_r
#define b_r
A_DITHER
#define A_DITHER(u, v)
AV_PIX_FMT_RGB565LE
@ AV_PIX_FMT_RGB565LE
packed RGB 5:6:5, 16bpp, (msb) 5R 6G 5B(lsb), little-endian
Definition: pixfmt.h:113
NULL
#define NULL
Definition: coverity.c:32
bias
static int bias(int x, int c)
Definition: vqcdec.c:115
V
#define V
Definition: avdct.c:31
AV_PIX_FMT_BGR565LE
@ AV_PIX_FMT_BGR565LE
packed BGR 5:6:5, 16bpp, (msb) 5B 6G 5R(lsb), little-endian
Definition: pixfmt.h:118
AV_PIX_FMT_RGB8
@ AV_PIX_FMT_RGB8
packed RGB 3:3:2, 8bpp, (msb)3R 3G 2B(lsb)
Definition: pixfmt.h:93
AV_PIX_FMT_BGR4
@ AV_PIX_FMT_BGR4
packed RGB 1:2:1 bitstream, 4bpp, (msb)1B 2G 1R(lsb), a byte contains two pixels, the first pixel in ...
Definition: pixfmt.h:91
AV_PIX_FMT_BGR555BE
@ AV_PIX_FMT_BGR555BE
packed BGR 5:5:5, 16bpp, (msb)1X 5B 5G 5R(lsb), big-endian , X=unused/undefined
Definition: pixfmt.h:119
AV_PIX_FMT_ABGR
@ AV_PIX_FMT_ABGR
packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
Definition: pixfmt.h:101
yuv2rgb_2_template_lasx
static void yuv2rgb_2_template_lasx(SwsInternal *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y, enum AVPixelFormat target, int hasAlpha)
Definition: output_lasx.c:518
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
AV_PIX_FMT_BGR4_BYTE
@ AV_PIX_FMT_BGR4_BYTE
packed RGB 1:2:1, 8bpp, (msb)1B 2G 1R(lsb)
Definition: pixfmt.h:92
isDataInHighBits
static av_always_inline int isDataInHighBits(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:936
yuv2packedX_fn
void(* yuv2packedX_fn)(SwsInternal *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB output by doing multi-point ver...
Definition: swscale_internal.h:246
DUP4_ARG1
#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3)
Definition: loongson_intrinsics.h:70
AV_PIX_FMT_RGB24
@ AV_PIX_FMT_RGB24
packed RGB 8:8:8, 24bpp, RGBRGB...
Definition: pixfmt.h:75
DUP2_ARG1
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1)
Definition: loongson_intrinsics.h:52
A1
@ A1
Definition: mvs.c:524
AV_PIX_FMT_RGB444BE
@ AV_PIX_FMT_RGB444BE
packed RGB 4:4:4, 16bpp, (msb)4X 4R 4G 4B(lsb), big-endian, X=unused/undefined
Definition: pixfmt.h:137
AV_PIX_FMT_BGR555
#define AV_PIX_FMT_BGR555
Definition: pixfmt.h:503
DUP2_ARG3
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1)
Definition: loongson_intrinsics.h:64
AV_PIX_FMT_BGR444BE
@ AV_PIX_FMT_BGR444BE
packed BGR 4:4:4, 16bpp, (msb)4X 4B 4G 4R(lsb), big-endian, X=unused/undefined
Definition: pixfmt.h:139
yuv2rgb_full_X_template_lasx
static void yuv2rgb_full_X_template_lasx(SwsInternal *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int y, enum AVPixelFormat target, int hasAlpha)
Definition: output_lasx.c:997
AV_PIX_FMT_RGB32
#define AV_PIX_FMT_RGB32
Definition: pixfmt.h:483
a0
static double a0(void *priv, double x, double y)
Definition: vf_xfade.c:2028
AV_PIX_FMT_BGR565BE
@ AV_PIX_FMT_BGR565BE
packed BGR 5:6:5, 16bpp, (msb) 5B 6G 5R(lsb), big-endian
Definition: pixfmt.h:117
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
yuv2rgb_full_2_template_lasx
static void yuv2rgb_full_2_template_lasx(SwsInternal *c, const int16_t *buf[2], const int16_t *ubuf[2], const int16_t *vbuf[2], const int16_t *abuf[2], uint8_t *dest, int dstW, int yalpha, int uvalpha, int y, enum AVPixelFormat target, int hasAlpha)
Definition: output_lasx.c:1204
YUV2RGB
#define YUV2RGB(y, u, v, R, G, B, offset, coeff, y_temp, v2r, v2g, u2g, u2b)
Definition: output_lasx.c:963
ff_dither_8x8_73
const uint8_t ff_dither_8x8_73[][8]
Definition: output.c:71
zero
static int zero(InterplayACMContext *s, unsigned ind, unsigned col)
Definition: interplayacm.c:121
Y
#define Y
Definition: boxblur.h:37
yuv2anyX_fn
void(* yuv2anyX_fn)(SwsInternal *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t **dest, int dstW, int y)
Write one line of horizontally scaled Y/U/V/A to YUV/RGB output by doing multi-point vertical scaling...
Definition: swscale_internal.h:280
AV_PIX_FMT_ARGB
@ AV_PIX_FMT_ARGB
packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
Definition: pixfmt.h:99
av_assert2
#define av_assert2(cond)
assert() equivalent, that does lie in speed critical code.
Definition: avassert.h:67
AV_PIX_FMT_RGB555LE
@ AV_PIX_FMT_RGB555LE
packed RGB 5:5:5, 16bpp, (msb)1X 5R 5G 5B(lsb), little-endian, X=unused/undefined
Definition: pixfmt.h:115
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
AV_PIX_FMT_BGR444
#define AV_PIX_FMT_BGR444
Definition: pixfmt.h:504
AV_PIX_FMT_RGB555
#define AV_PIX_FMT_RGB555
Definition: pixfmt.h:498
av_always_inline
#define av_always_inline
Definition: attributes.h:49
ff_sws_init_output_lasx
av_cold void ff_sws_init_output_lasx(SwsInternal *c, yuv2planar1_fn *yuv2plane1, yuv2planarX_fn *yuv2planeX, yuv2interleavedX_fn *yuv2nv12cX, yuv2packed1_fn *yuv2packed1, yuv2packed2_fn *yuv2packed2, yuv2packedX_fn *yuv2packedX, yuv2anyX_fn *yuv2anyX)
Definition: output_lasx.c:1757
yuv2interleavedX_fn
void(* yuv2interleavedX_fn)(enum AVPixelFormat dstFormat, const uint8_t *chrDither, const int16_t *chrFilter, int chrFilterSize, const int16_t **chrUSrc, const int16_t **chrVSrc, uint8_t *dest, int dstW)
Write one line of horizontally scaled chroma to interleaved output with multi-point vertical scaling ...
Definition: swscale_internal.h:144
len
int len
Definition: vorbis_enc_data.h:426
AV_PIX_FMT_BGR565
#define AV_PIX_FMT_BGR565
Definition: pixfmt.h:502
AV_PIX_FMT_RGB4_BYTE
@ AV_PIX_FMT_RGB4_BYTE
packed RGB 1:2:1, 8bpp, (msb)1R 2G 1B(lsb)
Definition: pixfmt.h:95
headroom
static int headroom(int *la)
Definition: nellymoser.c:106
AV_PIX_FMT_RGB565
#define AV_PIX_FMT_RGB565
Definition: pixfmt.h:497
SWS_DITHER_ED
@ SWS_DITHER_ED
Definition: swscale.h:83
yuv2packed1_fn
void(* yuv2packed1_fn)(SwsInternal *c, const int16_t *lumSrc, const int16_t *chrUSrc[2], const int16_t *chrVSrc[2], const int16_t *alpSrc, uint8_t *dest, int dstW, int uvalpha, int y)
Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB output without any additional v...
Definition: swscale_internal.h:181
SwsInternal
Definition: swscale_internal.h:317
AV_PIX_FMT_GRAYF32BE
@ AV_PIX_FMT_GRAYF32BE
IEEE-754 single precision Y, 32bpp, big-endian.
Definition: pixfmt.h:363
SWS_FULL_CHR_H_INT
@ SWS_FULL_CHR_H_INT
Perform full chroma upsampling when upscaling to RGB.
Definition: swscale.h:132
U
#define U(x)
Definition: vpx_arith.h:37
yuv2planarX_fn
void(* yuv2planarX_fn)(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset)
Write one line of horizontally scaled data to planar output with multi-point vertical scaling between...
Definition: swscale_internal.h:124
SWS_DITHER_A_DITHER
@ SWS_DITHER_A_DITHER
Definition: swscale.h:84
temp
else temp
Definition: vf_mcdeint.c:263
av_clip_uint8
#define av_clip_uint8
Definition: common.h:106
G
#define G
Definition: huffyuv.h:43
AV_PIX_FMT_RGB565BE
@ AV_PIX_FMT_RGB565BE
packed RGB 5:6:5, 16bpp, (msb) 5R 6G 5B(lsb), big-endian
Definition: pixfmt.h:112
src0
const pixel *const src0
Definition: h264pred_template.c:419
filter0
static void filter0(SUINT32 *dst, const int32_t *src, int32_t coeff, ptrdiff_t len)
Definition: dcadsp.c:352
yuv2rgb_X_template_lasx
static void yuv2rgb_X_template_lasx(SwsInternal *c, const int16_t *lumFilter, const int16_t **lumSrc, int lumFilterSize, const int16_t *chrFilter, const int16_t **chrUSrc, const int16_t **chrVSrc, int chrFilterSize, const int16_t **alpSrc, uint8_t *dest, int dstW, int y, enum AVPixelFormat target, int hasAlpha)
Definition: output_lasx.c:230
loongson_intrinsics.h
AV_PIX_FMT_BGR555LE
@ AV_PIX_FMT_BGR555LE
packed BGR 5:5:5, 16bpp, (msb)1X 5B 5G 5R(lsb), little-endian, X=unused/undefined
Definition: pixfmt.h:120
coeff
static const double coeff[2][5]
Definition: vf_owdenoise.c:80
X_DITHER
#define X_DITHER(u, v)
b0
static double b0(void *priv, double x, double y)
Definition: vf_xfade.c:2033
a1
static double a1(void *priv, double x, double y)
Definition: vf_xfade.c:2029
r_b
#define r_b
d128
const uint8_t * d128
Definition: yuv2rgb.c:458
AV_PIX_FMT_BGR444LE
@ AV_PIX_FMT_BGR444LE
packed BGR 4:4:4, 16bpp, (msb)4X 4B 4G 4R(lsb), little-endian, X=unused/undefined
Definition: pixfmt.h:138
yuv2rgb
static void yuv2rgb(uint8_t *out, int ridx, int Y, int U, int V)
Definition: g2meet.c:263
src
#define src
Definition: vp8dsp.c:248
ff_dither_2x2_8
const uint8_t ff_dither_2x2_8[][8]
Definition: output.c:45
WRITE_YUV2RGB
#define WRITE_YUV2RGB(vec_y1, vec_y2, vec_u, vec_v, t1, t2, t3, t4)
Definition: output_lasx.c:215
yuv2planeX_8_lasx
void yuv2planeX_8_lasx(const int16_t *filter, int filterSize, const int16_t **src, uint8_t *dest, int dstW, const uint8_t *dither, int offset)
Definition: output_lasx.c:25
AV_PIX_FMT_RGB444
#define AV_PIX_FMT_RGB444
Definition: pixfmt.h:499
dither
static const uint8_t dither[8][8]
Definition: vf_fspp.c:62