FFmpeg
idct.c
1 /*
2  * Copyright (c) 2024 Zhao Zhili
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "idct.h"
22 
23 #include <wasm_simd128.h>
24 
25 #include "libavutil/mem_internal.h"
26 
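/* HEVC inverse-transform constants: the first four entries (64, 83, 64, 36) are used
 * by the 4-point transform, the next four (89, 75, 50, 18) by the 8-point odd part,
 * the following eight by the 16-point odd part and the last sixteen by the 32-point
 * odd part. */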
27 static const int8_t transform[] = {
28  64, 83, 64, 36, 89, 75, 50, 18,
29  90, 87, 80, 70, 57, 43, 25, 9,
30  90, 90, 88, 85, 82, 78, 73, 67,
31  61, 54, 46, 38, 31, 22, 13, 4,
32 };
33 
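/* Transpose the two 4x4 16-bit sub-blocks (lanes 0-3 and 4-7) of four rows in place. */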
34 static inline void transpose_4x8h(v128_t *src)
35 {
36  v128_t t0 = wasm_i16x8_shuffle(src[0], src[1], 0, 8, 2, 10, 4, 12, 6, 14);
37  v128_t t1 = wasm_i16x8_shuffle(src[0], src[1], 1, 9, 3, 11, 5, 13, 7, 15);
38  v128_t t2 = wasm_i16x8_shuffle(src[2], src[3], 0, 8, 2, 10, 4, 12, 6, 14);
39  v128_t t3 = wasm_i16x8_shuffle(src[2], src[3], 1, 9, 3, 11, 5, 13, 7, 15);
40 
41  src[0] = wasm_i32x4_shuffle(t0, t2, 0, 4, 2, 6);
42  src[2] = wasm_i32x4_shuffle(t0, t2, 1, 5, 3, 7);
43  src[1] = wasm_i32x4_shuffle(t1, t3, 0, 4, 2, 6);
44  src[3] = wasm_i32x4_shuffle(t1, t3, 1, 5, 3, 7);
45 }
46 
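/* Transpose all four 4x4 sub-blocks of an 8x8 16-bit block in place; the remaining
 * cross-block reordering is completed by the low/high operand selection in the
 * second tr_8x4 pass. */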
47 static inline void transpose_8x8h(v128_t *src)
48 {
49  transpose_4x8h(&src[0]);
50  transpose_4x8h(&src[4]);
51 }
52 
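/* One 1-D 4-point inverse transform pass over the four columns held in the low
 * halves of src[0..3]:
 *   e0 = 64*s0 + 64*s2   o0 = 83*s1 + 36*s3
 *   e1 = 64*s0 - 64*s2   o1 = 36*s1 - 83*s3
 * Each output is rounded by 1 << (shift - 1), shifted and narrowed back to 16 bits. */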
53 static inline void tr_4x4(v128_t *src, v128_t *trans, int shift)
54 {
55  v128_t tmp[4];
56  v128_t add = wasm_i32x4_splat(1 << (shift - 1));
57  v128_t e0 = wasm_i32x4_extmul_low_i16x8(src[0], trans[0]);
58  v128_t e1 = wasm_i32x4_extmul_low_i16x8(src[0], trans[0]);
59  v128_t o0 = wasm_i32x4_extmul_low_i16x8(src[1], trans[1]);
60  v128_t o1 = wasm_i32x4_extmul_low_i16x8(src[1], trans[3]);
61 
62  tmp[0] = wasm_i32x4_extmul_low_i16x8(src[2], trans[0]);
63  tmp[1] = wasm_i32x4_extmul_low_i16x8(src[2], trans[0]);
64  tmp[2] = wasm_i32x4_extmul_low_i16x8(src[3], trans[3]);
65  tmp[3] = wasm_i32x4_extmul_low_i16x8(src[3], trans[1]);
66  e0 = wasm_i32x4_add(e0, tmp[0]);
67  e1 = wasm_i32x4_sub(e1, tmp[1]);
68  o0 = wasm_i32x4_add(o0, tmp[2]);
69  o1 = wasm_i32x4_sub(o1, tmp[3]);
70 
71  tmp[0] = wasm_i32x4_add(e0, o0);
72  tmp[1] = wasm_i32x4_sub(e0, o0);
73  tmp[2] = wasm_i32x4_add(e1, o1);
74  tmp[3] = wasm_i32x4_sub(e1, o1);
75 
76  tmp[0] = wasm_i32x4_add(tmp[0], add);
77  tmp[1] = wasm_i32x4_add(tmp[1], add);
78  tmp[2] = wasm_i32x4_add(tmp[2], add);
79  tmp[3] = wasm_i32x4_add(tmp[3], add);
80  tmp[0] = wasm_i32x4_shr(tmp[0], shift);
81  tmp[1] = wasm_i32x4_shr(tmp[1], shift);
82  tmp[2] = wasm_i32x4_shr(tmp[2], shift);
83  tmp[3] = wasm_i32x4_shr(tmp[3], shift);
84 
85  src[0] = wasm_i16x8_narrow_i32x4(tmp[0], tmp[0]);
86  src[3] = wasm_i16x8_narrow_i32x4(tmp[1], tmp[1]);
87  src[1] = wasm_i16x8_narrow_i32x4(tmp[2], tmp[2]);
88  src[2] = wasm_i16x8_narrow_i32x4(tmp[3], tmp[3]);
89 }
90 
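/* 4x4 IDCT: two 1-D passes with a transpose after each, using shift 7 for the first
 * pass and 20 - bit_depth for the second, then repack the rows for the store. */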
91 static void idct_4x4(int16_t *coeffs, int bit_depth)
92 {
93  v128_t src[4];
94  v128_t trans[4];
95 
96  src[0] = wasm_v128_load64_zero(&coeffs[0]);
97  src[1] = wasm_v128_load64_zero(&coeffs[4]);
98  src[2] = wasm_v128_load64_zero(&coeffs[8]);
99  src[3] = wasm_v128_load64_zero(&coeffs[12]);
100 
101  trans[0] = wasm_i16x8_const_splat(transform[0]);
102  trans[1] = wasm_i16x8_const_splat(transform[1]);
103  trans[2] = wasm_i16x8_const_splat(transform[2]);
104  trans[3] = wasm_i16x8_const_splat(transform[3]);
105 
106  tr_4x4(src, trans, 7);
107  transpose_4x8h(src);
108 
109  tr_4x4(src, trans, 20 - bit_depth);
110  transpose_4x8h(src);
111 
112  src[0] = wasm_i64x2_shuffle(src[0], src[1], 0, 2);
113  src[2] = wasm_i64x2_shuffle(src[2], src[3], 0, 2);
114  wasm_v128_store(&coeffs[0], src[0]);
115  wasm_v128_store(&coeffs[8], src[2]);
116 }
117 
118 void ff_hevc_idct_4x4_8_simd128(int16_t *coeffs, int col_limit)
119 {
120  idct_4x4(coeffs, 8);
121 }
122 
123 void ff_hevc_idct_4x4_10_simd128(int16_t *coeffs, int col_limit)
124 {
125  idct_4x4(coeffs, 10);
126 }
127 
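/* Round, shift and narrow a 32-bit vector to 16 bits, replacing only the low
 * (or, below, the high) 64 bits of *dst. */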
128 static inline void shift_narrow_low(v128_t src, v128_t *dst, v128_t add, int shift)
129 {
130  src = wasm_i32x4_add(src, add);
131  src = wasm_i32x4_shr(src, shift);
132  *dst = wasm_i64x2_shuffle(wasm_i16x8_narrow_i32x4(src, src), *dst, 0, 3);
133 }
134 
135 static inline void shift_narrow_high(v128_t src, v128_t *dst, v128_t add, int shift)
136 {
137  src = wasm_i32x4_add(src, add);
138  src = wasm_i32x4_shr(src, shift);
139  *dst = wasm_i64x2_shuffle(wasm_i16x8_narrow_i32x4(src, src), *dst, 2, 0);
140 }
141 
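/* 4-point even/odd core shared by the 8-, 16- and 32-point transforms; half0 and
 * half1 select the extmul_low/extmul_high halves of the input registers. */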
142 #define tr_4x4_8(in0, in1, in2, in3, dst0, dst1, dst2, dst3, trans, half0, half1) \
143  do { \
144  v128_t e0, e1, o0, o1; \
145  v128_t tmp[4]; \
146  \
147  e0 = wasm_i32x4_extmul_ ## half0 ## _i16x8(in0, trans[0]); \
148  e1 = e0; \
149  o0 = wasm_i32x4_extmul_ ## half0 ## _i16x8(in1, trans[1]); \
150  o1 = wasm_i32x4_extmul_ ## half0 ## _i16x8(in1, trans[3]); \
151  \
152  tmp[0] = wasm_i32x4_extmul_ ## half1 ## _i16x8(in2, trans[0]); \
153  tmp[1] = wasm_i32x4_extmul_ ## half1 ## _i16x8(in2, trans[0]); \
154  tmp[2] = wasm_i32x4_extmul_ ## half1 ## _i16x8(in3, trans[3]); \
155  tmp[3] = wasm_i32x4_extmul_ ## half1 ## _i16x8(in3, trans[1]); \
156  e0 = wasm_i32x4_add(e0, tmp[0]); \
157  e1 = wasm_i32x4_sub(e1, tmp[1]); \
158  o0 = wasm_i32x4_add(o0, tmp[2]); \
159  o1 = wasm_i32x4_sub(o1, tmp[3]); \
160  dst0 = wasm_i32x4_add(e0, o0); \
161  dst1 = wasm_i32x4_add(e1, o1); \
162  dst2 = wasm_i32x4_sub(e1, o1); \
163  dst3 = wasm_i32x4_sub(e0, o0); \
164  } while (0)
165 
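/* 1-D 8-point transform of four columns: 4-point even part via tr_4x4_8 plus the
 * 89/75/50/18 odd part; results are rounded, narrowed and written back into the
 * selected halves of src0[] and src1[]. */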
166 #define tr_8x4(src0, src1, half0, half1, trans, shift) \
167  do { \
168  v128_t v24, v25, v26, v27, v28, v29, v30, v31; \
169  v128_t add = wasm_i32x4_splat(1 << (shift - 1)); \
170  \
171  tr_4x4_8(src0[0], src0[2], src1[0], src1[2], v24, v25, v26, v27, trans, half0, half1); \
172  \
173  v30 = wasm_i32x4_extmul_ ## half0 ## _i16x8(src0[1], trans[6]); \
174  v28 = wasm_i32x4_extmul_ ## half0 ## _i16x8(src0[1], trans[4]); \
175  v29 = wasm_i32x4_extmul_ ## half0 ## _i16x8(src0[1], trans[5]); \
176  v30 = wasm_i32x4_sub(v30, wasm_i32x4_extmul_ ## half0 ## _i16x8(src0[3], trans[4])); \
177  v28 = wasm_i32x4_add(v28, wasm_i32x4_extmul_ ## half0 ## _i16x8(src0[3], trans[5])); \
178  v29 = wasm_i32x4_sub(v29, wasm_i32x4_extmul_ ## half0 ## _i16x8(src0[3], trans[7])); \
179  \
180  v30 = wasm_i32x4_add(v30, wasm_i32x4_extmul_ ## half1 ## _i16x8(src1[1], trans[7])); \
181  v28 = wasm_i32x4_add(v28, wasm_i32x4_extmul_ ## half1 ## _i16x8(src1[1], trans[6])); \
182  v29 = wasm_i32x4_sub(v29, wasm_i32x4_extmul_ ## half1 ## _i16x8(src1[1], trans[4])); \
183  \
184  v30 = wasm_i32x4_add(v30, wasm_i32x4_extmul_ ## half1 ## _i16x8(src1[3], trans[5])); \
185  v28 = wasm_i32x4_add(v28, wasm_i32x4_extmul_ ## half1 ## _i16x8(src1[3], trans[7])); \
186  v29 = wasm_i32x4_sub(v29, wasm_i32x4_extmul_ ## half1 ## _i16x8(src1[3], trans[6])); \
187  \
188  v31 = wasm_i32x4_add(v26, v30); \
189  v26 = wasm_i32x4_sub(v26, v30); \
190  shift_narrow_ ## half0 (v31, &src0[2], add, shift); \
191  v31 = wasm_i32x4_extmul_ ## half0 ## _i16x8(src0[1], trans[7]); \
192  v31 = wasm_i32x4_sub(v31, wasm_i32x4_extmul_ ## half0 ## _i16x8(src0[3], trans[6])); \
193  v31 = wasm_i32x4_add(v31, wasm_i32x4_extmul_ ## half1 ## _i16x8(src1[1], trans[5])); \
194  v31 = wasm_i32x4_sub(v31, wasm_i32x4_extmul_ ## half1 ## _i16x8(src1[3], trans[4])); \
195  shift_narrow_ ## half1 (v26, &src1[1], add, shift); \
196  v26 = wasm_i32x4_add(v24, v28); \
197  v24 = wasm_i32x4_sub(v24, v28); \
198  v28 = wasm_i32x4_add(v25, v29); \
199  v25 = wasm_i32x4_sub(v25, v29); \
200  v30 = wasm_i32x4_add(v27, v31); \
201  v27 = wasm_i32x4_sub(v27, v31); \
202  shift_narrow_ ## half0 (v26, &src0[0], add, shift); \
203  shift_narrow_ ## half1 (v24, &src1[3], add, shift); \
204  shift_narrow_ ## half0 (v28, &src0[1], add, shift); \
205  shift_narrow_ ## half1 (v25, &src1[2], add, shift); \
206  shift_narrow_ ## half0 (v30, &src0[3], add, shift); \
207  shift_narrow_ ## half1 (v27, &src1[0], add, shift); \
208  } while (0)
209 
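/* 8x8 IDCT: two vertical passes (low then high column halves) with shift 7,
 * sub-block transpose, then two horizontal passes with the bit-depth dependent
 * shift, and a final transpose back to row order. */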
210 static void idct_8x8(int16_t *coeffs, int bit_depth)
211 {
212  v128_t src[8];
213  v128_t trans[8];
214  v128_t *src1;
215  int shift1 = 7;
216  int shift2 = 20 - bit_depth;
217 
218  src[0] = wasm_v128_load(coeffs + 0 * 8);
219  src[1] = wasm_v128_load(coeffs + 1 * 8);
220  src[2] = wasm_v128_load(coeffs + 2 * 8);
221  src[3] = wasm_v128_load(coeffs + 3 * 8);
222  src[4] = wasm_v128_load(coeffs + 4 * 8);
223  src[5] = wasm_v128_load(coeffs + 5 * 8);
224  src[6] = wasm_v128_load(coeffs + 6 * 8);
225  src[7] = wasm_v128_load(coeffs + 7 * 8);
226 
227  trans[0] = wasm_i16x8_const_splat(transform[0]);
228  trans[1] = wasm_i16x8_const_splat(transform[1]);
229  trans[2] = wasm_i16x8_const_splat(transform[2]);
230  trans[3] = wasm_i16x8_const_splat(transform[3]);
231  trans[4] = wasm_i16x8_const_splat(transform[4]);
232  trans[5] = wasm_i16x8_const_splat(transform[5]);
233  trans[6] = wasm_i16x8_const_splat(transform[6]);
234  trans[7] = wasm_i16x8_const_splat(transform[7]);
235 
236  src1 = &src[4];
237  tr_8x4(src, src1, low, low, trans, shift1);
238  tr_8x4(src, src1, high, high, trans, shift1);
239  transpose_8x8h(src);
240  tr_8x4(src, src, low, high, trans, shift2);
241  tr_8x4(src1, src1, low, high, trans, shift2);
242  transpose_8x8h(src);
243 
244  wasm_v128_store(&coeffs[0 * 8], src[0]);
245  wasm_v128_store(&coeffs[1 * 8], src[1]);
246  wasm_v128_store(&coeffs[2 * 8], src[2]);
247  wasm_v128_store(&coeffs[3 * 8], src[3]);
248  wasm_v128_store(&coeffs[4 * 8], src[4]);
249  wasm_v128_store(&coeffs[5 * 8], src[5]);
250  wasm_v128_store(&coeffs[6 * 8], src[6]);
251  wasm_v128_store(&coeffs[7 * 8], src[7]);
252 }
253 
254 void ff_hevc_idct_8x8_8_simd128(int16_t *coeffs, int col_limit)
255 {
256  idct_8x8(coeffs, 8);
257 }
258 
259 void ff_hevc_idct_8x8_10_simd128(int16_t *coeffs, int col_limit)
260 {
261  idct_8x8(coeffs, 10);
262 }
263 
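/* Load four rows of four coefficients from two interleaved pointers into the low
 * (x1) and high (x3) halves of in0..in3, advancing both pointers by x2 bytes per row. */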
264 #define load16(x1, x3, x2, in0, in1, in2, in3) \
265  in0 = wasm_v128_load64_zero(x1); \
266  in0 = wasm_v128_load64_lane(x3, in0, 1); \
267  x1 += x2; \
268  x3 += x2; \
269  in1 = wasm_v128_load64_zero(x1); \
270  in1 = wasm_v128_load64_lane(x3, in1, 1); \
271  x1 += x2; \
272  x3 += x2; \
273  in2 = wasm_v128_load64_zero(x1); \
274  in2 = wasm_v128_load64_lane(x3, in2, 1); \
275  x1 += x2; \
276  x3 += x2; \
277  in3 = wasm_v128_load64_zero(x1); \
278  in3 = wasm_v128_load64_lane(x3, in3, 1); \
279  x1 += x2; \
280  x3 += x2; \
281 
282 #define bufferfly(e, o, p, m) \
283  p = wasm_i32x4_add(e, o); \
284  m = wasm_i32x4_sub(e, o); \
285 
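/* Even (8-point) part of the 16-point transform for four columns; the eight 32-bit
 * partial rows are spilled to the scratch buffer at sp + offset. */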
286 static void tr16_8x4(v128_t in0, v128_t in1, v128_t in2, v128_t in3,
287  const v128_t *trans, char *sp, int offset)
288 {
289  v128_t v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31;
290 
291  tr_4x4_8(in0, in1, in2, in3, v24, v25, v26, v27, trans, low, low);
292 
293  v28 = wasm_i32x4_extmul_high_i16x8(in0, trans[4]);
294  v29 = wasm_i32x4_extmul_high_i16x8(in0, trans[5]);
295  v30 = wasm_i32x4_extmul_high_i16x8(in0, trans[6]);
296  v31 = wasm_i32x4_extmul_high_i16x8(in0, trans[7]);
297  v28 = wasm_i32x4_add(v28, wasm_i32x4_extmul_high_i16x8(in1, trans[5]));
298  v29 = wasm_i32x4_sub(v29, wasm_i32x4_extmul_high_i16x8(in1, trans[7]));
299  v30 = wasm_i32x4_sub(v30, wasm_i32x4_extmul_high_i16x8(in1, trans[4]));
300  v31 = wasm_i32x4_sub(v31, wasm_i32x4_extmul_high_i16x8(in1, trans[6]));
301 
302  v28 = wasm_i32x4_add(v28, wasm_i32x4_extmul_high_i16x8(in2, trans[6]));
303  v29 = wasm_i32x4_sub(v29, wasm_i32x4_extmul_high_i16x8(in2, trans[4]));
304  v30 = wasm_i32x4_add(v30, wasm_i32x4_extmul_high_i16x8(in2, trans[7]));
305  v31 = wasm_i32x4_add(v31, wasm_i32x4_extmul_high_i16x8(in2, trans[5]));
306 
307  v28 = wasm_i32x4_add(v28, wasm_i32x4_extmul_high_i16x8(in3, trans[7]));
308  v29 = wasm_i32x4_sub(v29, wasm_i32x4_extmul_high_i16x8(in3, trans[6]));
309  v30 = wasm_i32x4_add(v30, wasm_i32x4_extmul_high_i16x8(in3, trans[5]));
310  v31 = wasm_i32x4_sub(v31, wasm_i32x4_extmul_high_i16x8(in3, trans[4]));
311 
312  bufferfly(v24, v28, v16, v23);
313  bufferfly(v25, v29, v17, v22);
314  bufferfly(v26, v30, v18, v21);
315  bufferfly(v27, v31, v19, v20);
316 
317  sp += offset;
318  wasm_v128_store(sp, v16); sp += 16;
319  wasm_v128_store(sp, v17); sp += 16;
320  wasm_v128_store(sp, v18); sp += 16;
321  wasm_v128_store(sp, v19); sp += 16;
322  wasm_v128_store(sp, v20); sp += 16;
323  wasm_v128_store(sp, v21); sp += 16;
324  wasm_v128_store(sp, v22); sp += 16;
325  wasm_v128_store(sp, v23);
326 }
327 
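/* Round and shift eight 32-bit vectors, then narrow them pairwise into four
 * 16-bit vectors. */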
328 static void scale(v128_t *out0, v128_t *out1, v128_t *out2, v128_t *out3,
329  v128_t in0, v128_t in1, v128_t in2, v128_t in3,
330  v128_t in4, v128_t in5, v128_t in6, v128_t in7,
331  int shift)
332 {
333  v128_t add = wasm_i32x4_splat(1 << (shift - 1));
334 
335  in0 = wasm_i32x4_add(in0, add);
336  in1 = wasm_i32x4_add(in1, add);
337  in2 = wasm_i32x4_add(in2, add);
338  in3 = wasm_i32x4_add(in3, add);
339  in4 = wasm_i32x4_add(in4, add);
340  in5 = wasm_i32x4_add(in5, add);
341  in6 = wasm_i32x4_add(in6, add);
342  in7 = wasm_i32x4_add(in7, add);
343 
344  in0 = wasm_i32x4_shr(in0, shift);
345  in1 = wasm_i32x4_shr(in1, shift);
346  in2 = wasm_i32x4_shr(in2, shift);
347  in3 = wasm_i32x4_shr(in3, shift);
348  in4 = wasm_i32x4_shr(in4, shift);
349  in5 = wasm_i32x4_shr(in5, shift);
350  in6 = wasm_i32x4_shr(in6, shift);
351  in7 = wasm_i32x4_shr(in7, shift);
352 
353  *out0 = wasm_i16x8_narrow_i32x4(in0, in1);
354  *out1 = wasm_i16x8_narrow_i32x4(in2, in3);
355  *out2 = wasm_i16x8_narrow_i32x4(in4, in5);
356  *out3 = wasm_i16x8_narrow_i32x4(in6, in7);
357 }
358 
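/* Transpose the low 4x4 halves of r0..r3 in place, and the high 4x4 halves in
 * reversed row order, matching the mirrored store pattern of the larger transforms. */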
359 static void transpose16_4x4_2(v128_t *r0, v128_t *r1, v128_t *r2, v128_t *r3)
360 {
361  v128_t t0, t1, t2, t3, t4, t5;
362 
363  t0 = wasm_i16x8_shuffle(*r0, *r1, 0, 8, 2, 10, 4, 12, 6, 14);
364  t1 = wasm_i16x8_shuffle(*r0, *r1, 1, 9, 3, 11, 5, 13, 7, 15);
365  t2 = wasm_i16x8_shuffle(*r2, *r3, 0, 8, 2, 10, 4, 12, 6, 14);
366  t3 = wasm_i16x8_shuffle(*r2, *r3, 1, 9, 3, 11, 5, 13, 7, 15);
367  t4 = wasm_i32x4_shuffle(t0, t2, 0, 4, 2, 6);
368  t5 = wasm_i32x4_shuffle(t0, t2, 1, 5, 3, 7);
369  t0 = wasm_i32x4_shuffle(t1, t3, 0, 4, 2, 6);
370  t2 = wasm_i32x4_shuffle(t1, t3, 1, 5, 3, 7);
371  *r0 = wasm_i64x2_shuffle(t4, *r0, 0, 3);
372  *r2 = wasm_i64x2_shuffle(t5, *r2, 0, 3);
373  *r1 = wasm_i64x2_shuffle(t0, *r1, 0, 3);
374  *r3 = wasm_i64x2_shuffle(t2, *r3, 0, 3);
375 
376  t0 = wasm_i16x8_shuffle(*r3, *r2, 0, 8, 2, 10, 4, 12, 6, 14);
377  t1 = wasm_i16x8_shuffle(*r3, *r2, 1, 9, 3, 11, 5, 13, 7, 15);
378  t2 = wasm_i16x8_shuffle(*r1, *r0, 0, 8, 2, 10, 4, 12, 6, 14);
379  t3 = wasm_i16x8_shuffle(*r1, *r0, 1, 9, 3, 11, 5, 13, 7, 15);
380  t4 = wasm_i32x4_shuffle(t0, t2, 0, 4, 2, 6);
381  t5 = wasm_i32x4_shuffle(t0, t2, 1, 5, 3, 7);
382  t0 = wasm_i32x4_shuffle(t1, t3, 0, 4, 2, 6);
383  t2 = wasm_i32x4_shuffle(t1, t3, 1, 5, 3, 7);
384  *r3 = wasm_i64x2_shuffle(*r3, t4, 0, 3);
385  *r1 = wasm_i64x2_shuffle(*r1, t5, 0, 3);
386  *r2 = wasm_i64x2_shuffle(*r2, t0, 0, 3);
387  *r0 = wasm_i64x2_shuffle(*r0, t2, 0, 3);
388 }
389 
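/* Store the two 64-bit halves of each row to two destinations; the callers pass a
 * negative x3_step so the second destination walks backwards for the mirrored half. */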
390 static void store16(v128_t in0, v128_t in1, v128_t in2, v128_t in3,
391  char *x1, char *x3, int x1_step, int x3_step)
392 {
393  wasm_v128_store64_lane(x1, in0, 0);
394  wasm_v128_store64_lane(x3, in0, 1);
395  x1 += x1_step;
396  x3 += x3_step;
397 
398  wasm_v128_store64_lane(x1, in1, 0);
399  wasm_v128_store64_lane(x3, in1, 1);
400  x1 += x1_step;
401  x3 += x3_step;
402 
403  wasm_v128_store64_lane(x1, in2, 0);
404  wasm_v128_store64_lane(x3, in2, 1);
405  x1 += x1_step;
406  x3 += x3_step;
407 
408  wasm_v128_store64_lane(x1, in3, 0);
409  wasm_v128_store64_lane(x3, in3, 1);
410 }
411 
412 
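/* Spill eight 32-bit row vectors: in0/in2/in4/in6 ascending from sp + off1,
 * in1/in3/in5/in7 descending from sp + off2. */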
413 static void store_to_stack(char *sp, int off1, int off2,
414  v128_t in0, v128_t in2, v128_t in4, v128_t in6,
415  v128_t in7, v128_t in5, v128_t in3, v128_t in1)
416 {
417  char *x1 = sp + off1;
418  char *x3 = sp + off2;
419 
420  wasm_v128_store(x1, in0);
421  wasm_v128_store(x3, in1);
422  x1 += 16;
423  x3 -= 16;
424  wasm_v128_store(x1, in2);
425  wasm_v128_store(x3, in3);
426  x1 += 16;
427  x3 -= 16;
428  wasm_v128_store(x1, in4);
429  wasm_v128_store(x3, in5);
430  x1 += 16;
431  x3 -= 16;
432  wasm_v128_store(x1, in6);
433  wasm_v128_store(x3, in7);
434 }
435 
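/* out +=/-= the widening 16x16 -> 32-bit product of the selected half of in0 and in1. */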
436 #define sum_sub(out, in0, in1, operation, half) \
437  out = wasm_i32x4_ ## operation (out, wasm_i32x4_extmul_ ## half ## _i16x8(in0, in1));
438 
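/* Accumulate one odd-row contribution into the eight 16-point accumulators v21..v28,
 * with a per-accumulator add/sub sign. */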
439 #define add_member(in, t0, t1, t2, t3, t4, t5, t6, t7, op0, op1, op2, op3, op4, op5, op6, op7, half) \
440  do { \
441  sum_sub(v21, in, t0, op0, half) \
442  sum_sub(v22, in, t1, op1, half) \
443  sum_sub(v23, in, t2, op2, half) \
444  sum_sub(v24, in, t3, op3, half) \
445  sum_sub(v25, in, t4, op4, half) \
446  sum_sub(v26, in, t5, op5, half) \
447  sum_sub(v27, in, t6, op6, half) \
448  sum_sub(v28, in, t7, op7, half) \
449  } while (0)
450 
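/* Add/sub butterflies of four even/odd pairs; v20 receives the first sum and the
 * remaining results are rotated through in0..in6. */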
451 #define butterfly16(in0, in1, in2, in3, in4, in5, in6, in7) \
452  do { \
453  v20 = wasm_i32x4_add(in0, in1); \
454  in0 = wasm_i32x4_sub(in0, in1); \
455  in1 = wasm_i32x4_add(in2, in3); \
456  in2 = wasm_i32x4_sub(in2, in3); \
457  in3 = wasm_i32x4_add(in4, in5); \
458  in4 = wasm_i32x4_sub(in4, in5); \
459  in5 = wasm_i32x4_add(in6, in7); \
460  in6 = wasm_i32x4_sub(in6, in7); \
461  } while (0)
462 
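/* 1-D 16-point transform of four columns: even part via tr16_8x4 (spilled to the
 * scratch buffer), odd part accumulated with add_member, then either scale,
 * transpose and store (shift > 0) or spill the 32-bit results for reuse by the
 * 32-point transform (shift == 0). */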
463 static void tr_16x4(char *src, char *buf, char *sp,
464  int shift, int offset, int step)
465 {
466  char *x1, *x3, *x4;
467  int x2;
468  v128_t trans[8];
469  v128_t v16, v17, v18, v19, v20, v21, v22, v23,
470  v24, v25, v26, v27, v28, v29, v30, v31;
471 
472  trans[0] = wasm_i16x8_const_splat(transform[0]);
473  trans[1] = wasm_i16x8_const_splat(transform[1]);
474  trans[2] = wasm_i16x8_const_splat(transform[2]);
475  trans[3] = wasm_i16x8_const_splat(transform[3]);
476  trans[4] = wasm_i16x8_const_splat(transform[4]);
477  trans[5] = wasm_i16x8_const_splat(transform[5]);
478  trans[6] = wasm_i16x8_const_splat(transform[6]);
479  trans[7] = wasm_i16x8_const_splat(transform[7]);
480 
481  x1 = src;
482  x3 = src + step * 64;
483  x2 = step * 128;
484  load16(x1, x3, x2, v16, v17, v18, v19);
485  tr16_8x4(v16, v17, v18, v19, trans, sp, offset);
486 
487  x1 = src + step * 32;
488  x3 = src + step * 3 * 32;
489  x2 = step * 128;
490  load16(x1, x3, x2, v20, v17, v18, v19);
491 
492  trans[0] = wasm_i16x8_const_splat(transform[0 + 8]);
493  trans[1] = wasm_i16x8_const_splat(transform[1 + 8]);
494  trans[2] = wasm_i16x8_const_splat(transform[2 + 8]);
495  trans[3] = wasm_i16x8_const_splat(transform[3 + 8]);
496  trans[4] = wasm_i16x8_const_splat(transform[4 + 8]);
497  trans[5] = wasm_i16x8_const_splat(transform[5 + 8]);
498  trans[6] = wasm_i16x8_const_splat(transform[6 + 8]);
499  trans[7] = wasm_i16x8_const_splat(transform[7 + 8]);
500 
501  v21 = wasm_i32x4_extmul_low_i16x8(v20, trans[0]);
502  v22 = wasm_i32x4_extmul_low_i16x8(v20, trans[1]);
503  v23 = wasm_i32x4_extmul_low_i16x8(v20, trans[2]);
504  v24 = wasm_i32x4_extmul_low_i16x8(v20, trans[3]);
505  v25 = wasm_i32x4_extmul_low_i16x8(v20, trans[4]);
506  v26 = wasm_i32x4_extmul_low_i16x8(v20, trans[5]);
507  v27 = wasm_i32x4_extmul_low_i16x8(v20, trans[6]);
508  v28 = wasm_i32x4_extmul_low_i16x8(v20, trans[7]);
509 
510  add_member(v20, trans[1], trans[4], trans[7], trans[5],
511  trans[2], trans[0], trans[3], trans[6],
512  add, add, add, sub, sub, sub, sub, sub, high);
513  add_member(v17, trans[2], trans[7], trans[3], trans[1],
514  trans[6], trans[4], trans[0], trans[5],
515  add, add, sub, sub, sub, add, add, add, low);
516  add_member(v17, trans[3], trans[5], trans[1], trans[7],
517  trans[0], trans[6], trans[2], trans[4],
518  add, sub, sub, add, add, add, sub, sub, high);
519  add_member(v18, trans[4], trans[2], trans[6], trans[0],
520  trans[7], trans[1], trans[5], trans[3],
521  add, sub, sub, add, sub, sub, add, add, low);
522  add_member(v18, trans[5], trans[0], trans[4], trans[6],
523  trans[1], trans[3], trans[7], trans[2],
524  add, sub, add, add, sub, add, add, sub, high);
525  add_member(v19, trans[6], trans[3], trans[0], trans[2],
526  trans[5], trans[7], trans[4], trans[1],
527  add, sub, add, sub, add, add, sub, add, low);
528  add_member(v19, trans[7], trans[6], trans[5], trans[4],
529  trans[3], trans[2], trans[1], trans[0],
530  add, sub, add, sub, add, sub, add, sub, high);
531 
532  x4 = &sp[offset];
533  v16 = wasm_v128_load(x4);
534  x4 += 16;
535  v17 = wasm_v128_load(x4);
536  x4 += 16;
537  v18 = wasm_v128_load(x4);
538  x4 += 16;
539  v19 = wasm_v128_load(x4);
540  butterfly16(v16, v21, v17, v22, v18, v23, v19, v24);
541 
542  if (shift > 0) {
543  scale(&v29, &v30, &v31, &v24,
544  v20, v16, v21, v17, v22, v18, v23, v19,
545  shift);
546  transpose16_4x4_2(&v29, &v30, &v31, &v24);
547  x1 = buf;
548  x3 = &buf[24 + 3 * 32];
549  store16(v29, v30, v31, v24, x1, x3, 32, -32);
550  } else {
551  store_to_stack(sp, offset, offset + 240,
552  v20, v21, v22, v23, v19, v18, v17, v16);
553  }
554 
555  x4 = &sp[offset + 64];
556  v16 = wasm_v128_load(x4);
557  x4 += 16;
558  v17 = wasm_v128_load(x4);
559  x4 += 16;
560  v18 = wasm_v128_load(x4);
561  x4 += 16;
562  v19 = wasm_v128_load(x4);
563  butterfly16(v16, v25, v17, v26, v18, v27, v19, v28);
564 
565  if (shift > 0) {
566  scale(&v29, &v30, &v31, &v20,
567  v20, v16, v25, v17, v26, v18, v27, v19,
568  shift);
569  transpose16_4x4_2(&v29, &v30, &v31, &v20);
570  x1 = &buf[8];
571  x3 = &buf[16 + 3 * 32];
572  store16(v29, v30, v31, v20, x1, x3, 32, -32);
573  } else {
574  store_to_stack(sp, offset + 64, offset + 176,
575  v20, v25, v26, v27, v19, v18, v17, v16);
576  }
577 }
578 
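/* 16x16 IDCT: a column pass into the scratch buffer followed by a row pass back
 * into coeffs, processed as four 4-column strips per pass. */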
579 static void idct_16x16(char *coeffs, int bit_depth)
580 {
581  DECLARE_ALIGNED(16, char, sp)[640];
582 
583  for (int i = 0; i < 4; i++) {
584  char *x5 = &coeffs[8 * i];
585  char *x6 = &sp[8 * i * 16];
586  tr_16x4(x5, x6, sp, 7, 512, 1);
587  }
588 
589  for (int i = 0; i < 4; i++) {
590  char *x5 = &sp[8 * i];
591  char *x6 = &coeffs[8 * i * 16];
592  tr_16x4(x5, x6, sp, 20 - bit_depth, 512, 1);
593  }
594 }
595 
596 void ff_hevc_idct_16x16_8_simd128(int16_t *coeffs, int col_limit)
597 {
598  idct_16x16((char *)coeffs, 8);
599 }
600 
601 void ff_hevc_idct_16x16_10_simd128(int16_t *coeffs, int col_limit)
602 {
603  idct_16x16((char *)coeffs, 10);
604 }
605 
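/* Accumulate one odd-row contribution into the four 32-point accumulators v24..v27. */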
606 #define add_member32(in, t0, t1, t2, t3, op0, op1, op2, op3, half) \
607  do { \
608  sum_sub(v24, in, t0, op0, half) \
609  sum_sub(v25, in, t1, op1, half) \
610  sum_sub(v26, in, t2, op2, half) \
611  sum_sub(v27, in, t3, op3, half) \
612  } while (0)
613 
614 #define butterfly32(in0, in1, in2, in3, out) \
615  do { \
616  out = wasm_i32x4_add(in0, in1); \
617  in0 = wasm_i32x4_sub(in0, in1); \
618  in1 = wasm_i32x4_add(in2, in3); \
619  in2 = wasm_i32x4_sub(in2, in3); \
620  } while (0)
621 
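/* 1-D 32-point transform of four columns: the even 16-point half is computed by
 * tr_16x4 with shift 0 and spilled at sp + 2048; the odd half is accumulated in
 * four blocks of four outputs, each combined with the spilled even part, scaled,
 * transposed and stored with mirrored addressing. */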
622 static void tr_32x4(char *x5, char *x11, char *sp, int shift)
623 {
624  char *x1, *x3, *x4;
625  // transform coefficients in v0 - v3
626  v128_t v0[4];
627  v128_t v1[4];
628  v128_t v2[4];
629  v128_t v3[4];
630  v128_t v4, v5, v6, v7, v16, v17, v18, v19,
631  v20, v21, v22, v23, v24, v25, v26, v27,
632  v28, v29, v30, v31, v32, v33;
633 
634  tr_16x4(x5, x11, sp, 0, 2048, 4);
635 
636  // load32
637  x1 = &x5[64];
638  x3 = &x1[128];
639  v4 = wasm_v128_load64_zero(x1);
640  v4 = wasm_v128_load64_lane(x3, v4, 1);
641  x1 += 256;
642  x3 += 256;
643  v5 = wasm_v128_load64_zero(x1);
644  v5 = wasm_v128_load64_lane(x3, v5, 1);
645  x1 += 256;
646  x3 += 256;
647  v6 = wasm_v128_load64_zero(x1);
648  v6 = wasm_v128_load64_lane(x3, v6, 1);
649  x1 += 256;
650  x3 += 256;
651  v7 = wasm_v128_load64_zero(x1);
652  v7 = wasm_v128_load64_lane(x3, v7, 1);
653  x1 += 256;
654  x3 += 256;
655  v16 = wasm_v128_load64_zero(x1);
656  v16 = wasm_v128_load64_lane(x3, v16, 1);
657  x1 += 256;
658  x3 += 256;
659  v17 = wasm_v128_load64_zero(x1);
660  v17 = wasm_v128_load64_lane(x3, v17, 1);
661  x1 += 256;
662  x3 += 256;
663  v18 = wasm_v128_load64_zero(x1);
664  v18 = wasm_v128_load64_lane(x3, v18, 1);
665  x1 += 256;
666  x3 += 256;
667  v19 = wasm_v128_load64_zero(x1);
668  v19 = wasm_v128_load64_lane(x3, v19, 1);
669 
670  // load transform
671  v0[0] = wasm_i16x8_const_splat(transform[16 + 0]);
672  v0[1] = wasm_i16x8_const_splat(transform[16 + 1]);
673  v0[2] = wasm_i16x8_const_splat(transform[16 + 2]);
674  v0[3] = wasm_i16x8_const_splat(transform[16 + 3]);
675  v1[0] = wasm_i16x8_const_splat(transform[16 + 4]);
676  v1[1] = wasm_i16x8_const_splat(transform[16 + 5]);
677  v1[2] = wasm_i16x8_const_splat(transform[16 + 6]);
678  v1[3] = wasm_i16x8_const_splat(transform[16 + 7]);
679  v2[0] = wasm_i16x8_const_splat(transform[16 + 8]);
680  v2[1] = wasm_i16x8_const_splat(transform[16 + 9]);
681  v2[2] = wasm_i16x8_const_splat(transform[16 + 10]);
682  v2[3] = wasm_i16x8_const_splat(transform[16 + 11]);
683  v3[0] = wasm_i16x8_const_splat(transform[16 + 12]);
684  v3[1] = wasm_i16x8_const_splat(transform[16 + 13]);
685  v3[2] = wasm_i16x8_const_splat(transform[16 + 14]);
686  v3[3] = wasm_i16x8_const_splat(transform[16 + 15]);
687 
688  // tr_block1
689  v24 = wasm_i32x4_extmul_low_i16x8(v4, v0[0]);
690  v25 = wasm_i32x4_extmul_low_i16x8(v4, v0[1]);
691  v26 = wasm_i32x4_extmul_low_i16x8(v4, v0[2]);
692  v27 = wasm_i32x4_extmul_low_i16x8(v4, v0[3]);
693 
694  add_member32(v4, v0[1], v1[0], v1[3], v2[2], add, add, add, add, high);
695  add_member32(v5, v0[2], v1[3], v3[0], v3[2], add, add, add, sub, low);
696  add_member32(v5, v0[3], v2[2], v3[2], v1[3], add, add, sub, sub, high);
697  add_member32(v6, v1[0], v3[1], v2[1], v0[0], add, add, sub, sub, low);
698  add_member32(v6, v1[1], v3[3], v1[0], v1[2], add, sub, sub, sub, high);
699  add_member32(v7, v1[2], v3[0], v0[0], v3[1], add, sub, sub, sub, low);
700  add_member32(v7, v1[3], v2[1], v1[1], v2[3], add, sub, sub, add, high);
701  add_member32(v16, v2[0], v1[2], v2[2], v1[0], add, sub, sub, add, low);
702  add_member32(v16, v2[1], v0[3], v3[3], v0[2], add, sub, sub, add, high);
703  add_member32(v17, v2[2], v0[1], v2[3], v2[1], add, sub, add, add, low);
704  add_member32(v17, v2[3], v0[2], v1[2], v3[3], add, sub, add, sub, high);
705  add_member32(v18, v3[0], v1[1], v0[1], v2[0], add, sub, add, sub, low);
706  add_member32(v18, v3[1], v2[0], v0[3], v0[1], add, sub, add, sub, high);
707  add_member32(v19, v3[2], v2[3], v2[0], v1[1], add, sub, add, sub, low);
708  add_member32(v19, v3[3], v3[2], v3[1], v3[0], add, sub, add, sub, high);
709 
710  x4 = &sp[2048];
711  // scale_store
712  v28 = wasm_v128_load(x4);
713  x4 += 16;
714  v29 = wasm_v128_load(x4);
715  x4 += 16;
716  v30 = wasm_v128_load(x4);
717  x4 += 16;
718  v31 = wasm_v128_load(x4);
719  x4 += 16;
720  butterfly32(v28, v24, v29, v25, v32);
721  butterfly32(v30, v26, v31, v27, v33);
722  scale(&v20, &v21, &v22, &v23, v32, v28, v24, v29, v33, v30, v26, v31, shift);
723  transpose16_4x4_2(&v20, &v21, &v22, &v23);
724  x1 = x11;
725  x3 = &x11[56 + 3 * 64];
726  store16(v20, v21, v22, v23, x1, x3, 64, -64);
727 
728  // tr_block2
729  v24 = wasm_i32x4_extmul_low_i16x8(v4, v1[0]);
730  v25 = wasm_i32x4_extmul_low_i16x8(v4, v1[1]);
731  v26 = wasm_i32x4_extmul_low_i16x8(v4, v1[2]);
732  v27 = wasm_i32x4_extmul_low_i16x8(v4, v1[3]);
733 
734  add_member32(v4, v3[1], v3[3], v3[0], v2[1], add, sub, sub, sub, high);
735  add_member32(v5, v2[1], v1[0], v0[0], v1[1], sub, sub, sub, sub, low);
736  add_member32(v5, v0[0], v1[2], v3[1], v2[3], sub, sub, sub, add, high);
737  add_member32(v6, v2[0], v3[2], v1[1], v0[3], sub, add, add, add, low);
738  add_member32(v6, v3[2], v0[3], v1[3], v3[1], add, add, add, sub, high);
739  add_member32(v7, v1[1], v1[3], v2[3], v0[0], add, add, sub, sub, low);
740  add_member32(v7, v0[3], v3[1], v0[1], v3[3], add, sub, sub, add, high);
741  add_member32(v16, v3[0], v0[2], v3[2], v0[1], add, sub, sub, add, low);
742  add_member32(v16, v2[2], v2[0], v1[0], v3[2], sub, sub, add, add, high);
743  add_member32(v17, v0[1], v3[0], v2[0], v0[2], sub, add, add, sub, low);
744  add_member32(v17, v1[3], v0[1], v2[2], v3[0], sub, add, sub, sub, high);
745  add_member32(v18, v3[3], v2[1], v0[2], v1[0], add, add, sub, add, low);
746  add_member32(v18, v1[2], v2[3], v3[3], v2[2], add, sub, sub, add, high);
747  add_member32(v19, v0[2], v0[1], v0[3], v1[2], add, sub, add, sub, low);
748  add_member32(v19, v2[3], v2[2], v2[1], v2[0], add, sub, add, sub, high);
749 
750  // scale_store
751  v28 = wasm_v128_load(x4);
752  x4 += 16;
753  v29 = wasm_v128_load(x4);
754  x4 += 16;
755  v30 = wasm_v128_load(x4);
756  x4 += 16;
757  v31 = wasm_v128_load(x4);
758  x4 += 16;
759  butterfly32(v28, v24, v29, v25, v32);
760  butterfly32(v30, v26, v31, v27, v33);
761  scale(&v20, &v21, &v22, &v23, v32, v28, v24, v29, v33, v30, v26, v31, shift);
762  transpose16_4x4_2(&v20, &v21, &v22, &v23);
763  x1 = &x11[8];
764  x3 = &x11[48 + 3 * 64];
765  store16(v20, v21, v22, v23, x1, x3, 64, -64);
766 
767  // tr_block3
768  v24 = wasm_i32x4_extmul_low_i16x8(v4, v2[0]);
769  v25 = wasm_i32x4_extmul_low_i16x8(v4, v2[1]);
770  v26 = wasm_i32x4_extmul_low_i16x8(v4, v2[2]);
771  v27 = wasm_i32x4_extmul_low_i16x8(v4, v2[3]);
772  add_member32(v4, v1[2], v0[3], v0[0], v0[2], sub, sub, sub, sub, high);
773  add_member32(v5, v2[2], v3[3], v2[3], v1[2], sub, sub, add, add, low);
774  add_member32(v5, v1[0], v0[2], v2[1], v3[3], add, add, add, sub, high);
775  add_member32(v6, v3[0], v2[2], v0[1], v1[3], add, sub, sub, sub, low);
776  add_member32(v6, v0[2], v2[0], v3[0], v0[0], sub, sub, add, add, high);
777  add_member32(v7, v3[2], v1[0], v2[0], v2[2], sub, add, add, sub, low);
778  add_member32(v7, v0[0], v3[2], v0[2], v3[0], add, add, sub, sub, high);
779  add_member32(v16, v3[3], v0[1], v3[1], v0[3], sub, sub, add, add, low);
780  add_member32(v16, v0[1], v2[3], v1[3], v1[1], sub, add, add, sub, high);
781  add_member32(v17, v3[1], v1[3], v0[3], v3[2], add, add, sub, add, low);
782  add_member32(v17, v0[3], v1[1], v3[2], v2[0], add, sub, add, add, high);
783  add_member32(v18, v2[3], v3[1], v1[2], v0[1], sub, sub, add, sub, low);
784  add_member32(v18, v1[1], v0[0], v1[0], v2[1], sub, add, sub, add, high);
785  add_member32(v19, v2[1], v3[0], v3[3], v3[1], add, sub, add, add, low);
786  add_member32(v19, v1[3], v1[2], v1[1], v1[0], add, sub, add, sub, high);
787 
788  // scale_store
789  v28 = wasm_v128_load(x4);
790  x4 += 16;
791  v29 = wasm_v128_load(x4);
792  x4 += 16;
793  v30 = wasm_v128_load(x4);
794  x4 += 16;
795  v31 = wasm_v128_load(x4);
796  x4 += 16;
797  butterfly32(v28, v24, v29, v25, v32);
798  butterfly32(v30, v26, v31, v27, v33);
799  scale(&v20, &v21, &v22, &v23, v32, v28, v24, v29, v33, v30, v26, v31, shift);
800  transpose16_4x4_2(&v20, &v21, &v22, &v23);
801  x1 = &x11[16];
802  x3 = &x11[40 + 3 * 64];
803  store16(v20, v21, v22, v23, x1, x3, 64, -64);
804 
805  // tr_block4
806  v24 = wasm_i32x4_extmul_low_i16x8(v4, v3[0]);
807  v25 = wasm_i32x4_extmul_low_i16x8(v4, v3[1]);
808  v26 = wasm_i32x4_extmul_low_i16x8(v4, v3[2]);
809  v27 = wasm_i32x4_extmul_low_i16x8(v4, v3[3]);
810  add_member32(v4, v1[1], v2[0], v2[3], v3[2], sub, sub, sub, sub, high);
811  add_member32(v5, v0[0], v0[3], v2[0], v3[1], add, add, add, add, low);
812  add_member32(v5, v2[0], v0[0], v1[1], v3[0], sub, sub, sub, sub, high);
813  add_member32(v6, v3[3], v1[2], v0[2], v2[3], add, add, add, add, low);
814  add_member32(v6, v2[1], v2[3], v0[0], v2[2], add, sub, sub, sub, high);
815  add_member32(v7, v0[2], v3[3], v0[3], v2[1], sub, sub, add, add, low);
816  add_member32(v7, v1[0], v2[2], v1[2], v2[0], add, add, sub, sub, high);
817  add_member32(v16, v2[3], v1[1], v2[1], v1[3], sub, sub, add, add, low);
818  add_member32(v16, v3[1], v0[1], v3[0], v1[2], sub, add, sub, sub, high);
819  add_member32(v17, v1[2], v1[0], v3[3], v1[1], add, sub, add, add, low);
820  add_member32(v17, v0[1], v2[1], v3[1], v1[0], sub, add, add, sub, high);
821  add_member32(v18, v1[3], v3[2], v2[2], v0[3], add, sub, sub, add, low);
822  add_member32(v18, v3[2], v3[0], v1[3], v0[2], sub, sub, add, sub, high);
823  add_member32(v19, v2[2], v1[3], v1[0], v0[1], sub, add, sub, add, low);
824  add_member32(v19, v0[3], v0[2], v0[1], v0[0], add, sub, add, sub, high);
825 
826  // scale_store
827  v28 = wasm_v128_load(x4);
828  x4 += 16;
829  v29 = wasm_v128_load(x4);
830  x4 += 16;
831  v30 = wasm_v128_load(x4);
832  x4 += 16;
833  v31 = wasm_v128_load(x4);
834  butterfly32(v28, v24, v29, v25, v32);
835  butterfly32(v30, v26, v31, v27, v33);
836  scale(&v20, &v21, &v22, &v23, v32, v28, v24, v29, v33, v30, v26, v31, shift);
837  transpose16_4x4_2(&v20, &v21, &v22, &v23);
838  x1 = &x11[24];
839  x3 = &x11[32 + 3 * 64];
840  store16(v20, v21, v22, v23, x1, x3, 64, -64);
841 }
842 
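/* 32x32 IDCT: eight 4-column strips per pass, with a 2432-byte aligned scratch
 * buffer shared by both passes. */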
843 static void idct_32x32(char *coeffs, int bit_depth)
844 {
845  DECLARE_ALIGNED(16, char, sp)[2432];
846  char *x5, *x11;
847 
848  for (int i = 0; i < 8; i++) {
849  x5 = &coeffs[8 * i];
850  x11 = &sp[8 * i * 32];
851  tr_32x4(x5, x11, sp, 7);
852  }
853 
854  for (int i = 0; i < 8; i++) {
855  x5 = &sp[8 * i];
856  x11 = &coeffs[8 * i * 32];
857  tr_32x4(x5, x11, sp, 20 - bit_depth);
858  }
859 }
860 
861 void ff_hevc_idct_32x32_8_simd128(int16_t *coeffs, int col_limit)
862 {
863  idct_32x32((char *)coeffs, 8);
864 }
865 
866 void ff_hevc_idct_32x32_10_simd128(int16_t *coeffs, int col_limit)
867 {
868  idct_32x32((char *)coeffs, 10);
869 }