FFmpeg
vvcdsp_init.c
Go to the documentation of this file.
/*
 * VVC DSP init for x86
 *
 * Copyright (C) 2022-2024 Nuo Mi
 * Copyright (c) 2023-2024 Wu Jianhua
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
23 
#include "config.h"

#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vvc/vvcdec.h"
#include "libavcodec/vvc/vvc_ctu.h"
#include "libavcodec/vvc/vvcdsp.h"
#include "libavcodec/x86/h2656dsp.h"
33 
34 #if ARCH_X86_64
/*
 * Emit a static forwarding wrapper named ff_vvc_put_<fn>_<bd>_<opt>.
 *
 * The wrapper takes (dst, src, srcstride, height, hf, vf, width) and calls
 * ff_h2656_put_<fn>_<bd>_<opt> with the same values, inserting
 * 2 * MAX_PB_SIZE as the second argument (presumably the destination stride
 * in bytes -- confirm against the ff_h2656_put_* prototypes).
 */
#define FW_PUT(fn, bd, opt)                                                     \
static void ff_vvc_put_ ## fn ## _ ## bd ## _ ## opt(int16_t *dst,              \
                                                     const uint8_t *src,        \
                                                     ptrdiff_t srcstride,       \
                                                     int height,                \
                                                     const int8_t *hf,          \
                                                     const int8_t *vf,          \
                                                     int width)                 \
{                                                                               \
    ff_h2656_put_ ## fn ## _ ## bd ## _ ## opt(dst, 2 * MAX_PB_SIZE,            \
                                               src, srcstride,                  \
                                               height, hf, vf, width);          \
}
41 
42 #define FW_PUT_TAP(fname, bitd, opt ) \
43  FW_PUT(fname##4, bitd, opt ) \
44  FW_PUT(fname##8, bitd, opt ) \
45  FW_PUT(fname##16, bitd, opt ) \
46  FW_PUT(fname##32, bitd, opt ) \
47  FW_PUT(fname##64, bitd, opt ) \
48  FW_PUT(fname##128, bitd, opt ) \
49 
50 #define FW_PUT_4TAP(fname, bitd, opt) \
51  FW_PUT(fname ## 2, bitd, opt) \
52  FW_PUT_TAP(fname, bitd, opt)
53 
54 #define FW_PUT_4TAP_SSE4(bitd) \
55  FW_PUT_4TAP(pixels, bitd, sse4) \
56  FW_PUT_4TAP(4tap_h, bitd, sse4) \
57  FW_PUT_4TAP(4tap_v, bitd, sse4) \
58  FW_PUT_4TAP(4tap_hv, bitd, sse4)
59 
60 #define FW_PUT_8TAP_SSE4(bitd) \
61  FW_PUT_TAP(8tap_h, bitd, sse4) \
62  FW_PUT_TAP(8tap_v, bitd, sse4) \
63  FW_PUT_TAP(8tap_hv, bitd, sse4)
64 
65 #define FW_PUT_SSE4(bitd) \
66  FW_PUT_4TAP_SSE4(bitd) \
67  FW_PUT_8TAP_SSE4(bitd)
68 
69 FW_PUT_SSE4( 8)
70 FW_PUT_SSE4(10)
71 FW_PUT_SSE4(12)
72 
73 #define FW_PUT_TAP_AVX2(n, bitd) \
74  FW_PUT(n ## tap_h32, bitd, avx2) \
75  FW_PUT(n ## tap_h64, bitd, avx2) \
76  FW_PUT(n ## tap_h128, bitd, avx2) \
77  FW_PUT(n ## tap_v32, bitd, avx2) \
78  FW_PUT(n ## tap_v64, bitd, avx2) \
79  FW_PUT(n ## tap_v128, bitd, avx2)
80 
81 #define FW_PUT_AVX2(bitd) \
82  FW_PUT(pixels32, bitd, avx2) \
83  FW_PUT(pixels64, bitd, avx2) \
84  FW_PUT(pixels128, bitd, avx2) \
85  FW_PUT_TAP_AVX2(4, bitd) \
86  FW_PUT_TAP_AVX2(8, bitd) \
87 
88 FW_PUT_AVX2( 8)
89 FW_PUT_AVX2(10)
90 FW_PUT_AVX2(12)
91 
92 #define FW_PUT_TAP_16BPC_AVX2(n, bitd) \
93  FW_PUT(n ## tap_h16, bitd, avx2) \
94  FW_PUT(n ## tap_v16, bitd, avx2) \
95  FW_PUT(n ## tap_hv16, bitd, avx2) \
96  FW_PUT(n ## tap_hv32, bitd, avx2) \
97  FW_PUT(n ## tap_hv64, bitd, avx2) \
98  FW_PUT(n ## tap_hv128, bitd, avx2)
99 
100 #define FW_PUT_16BPC_AVX2(bitd) \
101  FW_PUT(pixels16, bitd, avx2) \
102  FW_PUT_TAP_16BPC_AVX2(4, bitd) \
103  FW_PUT_TAP_16BPC_AVX2(8, bitd)
104 
105 FW_PUT_16BPC_AVX2(10)
106 FW_PUT_16BPC_AVX2(12)
107 
/*
 * Store one pair of function pointers.
 *
 * tbl[comp][wi][vi][hi] receives the ff_vvc_put_<fn>_<bd>_<isa> wrapper and
 * the matching slot of tbl##_uni receives ff_h2656_put_uni_<fn>_<bd>_<isa>.
 * The expansion consists of two complete statements, each ending in ';'.
 */
#define PEL_LINK(tbl, comp, wi, vi, hi, fn, bd, isa)                                    \
    tbl[comp][wi][vi][hi]          = ff_vvc_put_ ## fn ## _ ## bd ## _ ## isa;          \
    tbl ## _uni[comp][wi][vi][hi]  = ff_h2656_put_uni_ ## fn ## _ ## bd ## _ ## isa;

/* Link width suffixes 4..128 of one filter family to width indices 1..6. */
#define MC_TAP_LINKS(tbl, comp, vi, hi, prefix, bd, isa)        \
    PEL_LINK(tbl, comp, 1, vi, hi, prefix ## 4,   bd, isa);     \
    PEL_LINK(tbl, comp, 2, vi, hi, prefix ## 8,   bd, isa);     \
    PEL_LINK(tbl, comp, 3, vi, hi, prefix ## 16,  bd, isa);     \
    PEL_LINK(tbl, comp, 4, vi, hi, prefix ## 32,  bd, isa);     \
    PEL_LINK(tbl, comp, 5, vi, hi, prefix ## 64,  bd, isa);     \
    PEL_LINK(tbl, comp, 6, vi, hi, prefix ## 128, bd, isa);

/* 8-tap families are linked into the LUMA tables. */
#define MC_8TAP_LINKS(tbl, vi, hi, prefix, bd, isa)             \
    MC_TAP_LINKS(tbl, LUMA, vi, hi, prefix, bd, isa)

/* Expects a variable named `c` (the DSP context pointer) at the use site. */
#define MC_8TAP_LINKS_SSE4(bd)                                  \
    MC_8TAP_LINKS(c->inter.put, 0, 0, pixels,  bd, sse4);       \
    MC_8TAP_LINKS(c->inter.put, 0, 1, 8tap_h,  bd, sse4);       \
    MC_8TAP_LINKS(c->inter.put, 1, 0, 8tap_v,  bd, sse4);       \
    MC_8TAP_LINKS(c->inter.put, 1, 1, 8tap_hv, bd, sse4)

/* 4-tap families go to the CHROMA tables and also fill width index 0 (suffix 2). */
#define MC_4TAP_LINKS(tbl, vi, hi, prefix, bd, isa)             \
    PEL_LINK(tbl, CHROMA, 0, vi, hi, prefix ## 2, bd, isa);     \
    MC_TAP_LINKS(tbl, CHROMA, vi, hi, prefix, bd, isa)

/* Expects a variable named `c` (the DSP context pointer) at the use site. */
#define MC_4TAP_LINKS_SSE4(bd)                                  \
    MC_4TAP_LINKS(c->inter.put, 0, 0, pixels,  bd, sse4);       \
    MC_4TAP_LINKS(c->inter.put, 0, 1, 4tap_h,  bd, sse4);       \
    MC_4TAP_LINKS(c->inter.put, 1, 0, 4tap_v,  bd, sse4);       \
    MC_4TAP_LINKS(c->inter.put, 1, 1, 4tap_hv, bd, sse4)

/*
 * Link every SSE4 routine for one bit depth. This expands to a sequence of
 * statements that is not wrapped in do { } while (0), so it must be used
 * inside a braced block.
 */
#define MC_LINK_SSE4(bd)                                        \
    MC_4TAP_LINKS_SSE4(bd)                                      \
    MC_8TAP_LINKS_SSE4(bd)
142 
/*
 * Link the AVX2 routines that exist for every bit depth into component
 * `comp`: "pixels", <taps>tap_h and <taps>tap_v for width indices 4, 5 and 6
 * (suffixes 32, 64 and 128). Expects a variable named `c` at the use site.
 * No trailing ';' is needed between the PEL_LINK calls because PEL_LINK
 * already expands to complete statements.
 */
#define MC_TAP_LINKS_AVX2(comp, taps, bd) do {                              \
        PEL_LINK(c->inter.put, comp, 4, 0, 0, pixels32,         bd, avx2)   \
        PEL_LINK(c->inter.put, comp, 5, 0, 0, pixels64,         bd, avx2)   \
        PEL_LINK(c->inter.put, comp, 6, 0, 0, pixels128,        bd, avx2)   \
        PEL_LINK(c->inter.put, comp, 4, 0, 1, taps ## tap_h32,  bd, avx2)   \
        PEL_LINK(c->inter.put, comp, 5, 0, 1, taps ## tap_h64,  bd, avx2)   \
        PEL_LINK(c->inter.put, comp, 6, 0, 1, taps ## tap_h128, bd, avx2)   \
        PEL_LINK(c->inter.put, comp, 4, 1, 0, taps ## tap_v32,  bd, avx2)   \
        PEL_LINK(c->inter.put, comp, 5, 1, 0, taps ## tap_v64,  bd, avx2)   \
        PEL_LINK(c->inter.put, comp, 6, 1, 0, taps ## tap_v128, bd, avx2)   \
    } while (0)

/* LUMA uses the 8-tap set, CHROMA the 4-tap set. */
#define MC_LINKS_AVX2(bd)                   \
    MC_TAP_LINKS_AVX2(LUMA,   8, bd);       \
    MC_TAP_LINKS_AVX2(CHROMA, 4, bd);
158 
/*
 * Link the AVX2 routines that are only generated for bit depths 10 and 12
 * into component `comp`: pixels16, <taps>tap_h16, <taps>tap_v16 at width
 * index 3, and the hv filters at width indices 3..6 (suffixes 16..128).
 * Expects a variable named `c` at the use site.
 */
#define MC_TAP_LINKS_16BPC_AVX2(comp, taps, bd) do {                        \
        PEL_LINK(c->inter.put, comp, 3, 0, 0, pixels16,          bd, avx2)  \
        PEL_LINK(c->inter.put, comp, 3, 0, 1, taps ## tap_h16,   bd, avx2)  \
        PEL_LINK(c->inter.put, comp, 3, 1, 0, taps ## tap_v16,   bd, avx2)  \
        PEL_LINK(c->inter.put, comp, 3, 1, 1, taps ## tap_hv16,  bd, avx2)  \
        PEL_LINK(c->inter.put, comp, 4, 1, 1, taps ## tap_hv32,  bd, avx2)  \
        PEL_LINK(c->inter.put, comp, 5, 1, 1, taps ## tap_hv64,  bd, avx2)  \
        PEL_LINK(c->inter.put, comp, 6, 1, 1, taps ## tap_hv128, bd, avx2)  \
    } while (0)

/* LUMA uses the 8-tap set, CHROMA the 4-tap set. */
#define MC_LINKS_16BPC_AVX2(bd)                 \
    MC_TAP_LINKS_16BPC_AVX2(LUMA,   8, bd);     \
    MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bd);
172 
/* bf(fn, 10, avx2)  -> fn_10_avx2     (name keyed by bit depth)         */
#define bf(fn, bd, opt)     fn ## _ ## bd ## _ ## opt
/* BF(fn, 16, avx2)  -> fn_16bpc_avx2  (name keyed by bits per component) */
#define BF(fn, depth, opt)  fn ## _ ## depth ## bpc_ ## opt

/*
 * Declare the external ff_vvc_avg_<bpc>bpc_<opt> and ff_vvc_w_avg_<bpc>bpc_<opt>
 * routines. Their integer arguments are intptr_t and pixel_max is passed
 * explicitly as the last argument.
 */
#define AVG_BPC_FUNC(bpc, opt)                                                      \
void BF(ff_vvc_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,                   \
                              const int16_t *src0, const int16_t *src1,             \
                              intptr_t width, intptr_t height,                      \
                              intptr_t pixel_max);                                  \
void BF(ff_vvc_w_avg, bpc, opt)(uint8_t *dst, ptrdiff_t dst_stride,                 \
                                const int16_t *src0, const int16_t *src1,           \
                                intptr_t width, intptr_t height,                    \
                                intptr_t denom, intptr_t w0, intptr_t w1,           \
                                intptr_t o0, intptr_t o1, intptr_t pixel_max);

/*
 * Define the static adapters avg_<bd>_<opt> and w_avg_<bd>_<opt>. Each takes
 * plain int arguments and forwards them to the <bpc>bpc routine declared by
 * AVG_BPC_FUNC, appending (1 << bd) - 1 as pixel_max.
 */
#define AVG_FUNCS(bpc, bd, opt)                                                     \
static void bf(avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,                    \
                             const int16_t *src0, const int16_t *src1,              \
                             int width, int height)                                 \
{                                                                                   \
    BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height,            \
                             (1 << bd) - 1);                                        \
}                                                                                   \
static void bf(w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride,                  \
                               const int16_t *src0, const int16_t *src1,            \
                               int width, int height,                               \
                               int denom, int w0, int w1, int o0, int o1)           \
{                                                                                   \
    BF(ff_vvc_w_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height,          \
                               denom, w0, w1, o0, o1, (1 << bd) - 1);               \
}

/* External routines: one 8bpc and one 16bpc flavour. */
AVG_BPC_FUNC(8,  avx2)
AVG_BPC_FUNC(16, avx2)

/* Bit depth 8 maps to the 8bpc routine; 10 and 12 share the 16bpc routine. */
AVG_FUNCS(8,  8,  avx2)
AVG_FUNCS(16, 10, avx2)
AVG_FUNCS(16, 12, avx2)

/* Point c->inter.avg / c->inter.w_avg at the adapters; expects `c` in scope. */
#define AVG_INIT(bd, opt) do {                      \
        c->inter.avg   = bf(avg,   bd, opt);        \
        c->inter.w_avg = bf(w_avg, bd, opt);        \
    } while (0)
208 #endif
209 
210 void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
211 {
212 #if ARCH_X86_64
213  const int cpu_flags = av_get_cpu_flags();
214 
215  if (bd == 8) {
216  if (EXTERNAL_SSE4(cpu_flags)) {
217  MC_LINK_SSE4(8);
218  }
220  MC_LINKS_AVX2(8);
221  }
222  } else if (bd == 10) {
223  if (EXTERNAL_SSE4(cpu_flags)) {
224  MC_LINK_SSE4(10);
225  }
227  MC_LINKS_AVX2(10);
228  MC_LINKS_16BPC_AVX2(10);
229  }
230  } else if (bd == 12) {
231  if (EXTERNAL_SSE4(cpu_flags)) {
232  MC_LINK_SSE4(12);
233  }
235  MC_LINKS_AVX2(12);
236  MC_LINKS_16BPC_AVX2(12);
237  }
238  }
239 
240  if (EXTERNAL_AVX2(cpu_flags)) {
241  switch (bd) {
242  case 8:
243  AVG_INIT(8, avx2);
244  break;
245  case 10:
246  AVG_INIT(10, avx2);
247  break;
248  case 12:
249  AVG_INIT(12, avx2);
250  break;
251  default:
252  break;
253  }
254  }
255 #endif
256 }
ff_vvc_dsp_init_x86
void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
Definition: vvcdsp_init.c:210
cpu.h
vvcdsp.h
EXTERNAL_AVX2_FAST
#define EXTERNAL_AVX2_FAST(flags)
Definition: cpu.h:79
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:103
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:52
EXTERNAL_AVX2
#define EXTERNAL_AVX2(flags)
Definition: cpu.h:78
vvcdec.h
c
Undefined Behavior: In the C language, some operations are undefined, like signed integer overflow, dereferencing freed pointers, accessing outside allocated space, ... Undefined Behavior must not occur in a C program; it is not safe even if the output of undefined operations is unused. The unsafety may seem nit picking, but optimizing compilers have in fact optimized code on the assumption that no undefined behavior occurs. Optimizing code based on wrong assumptions can and has in some cases led to effects beyond the output of computations. The signed integer overflow problem in speed critical code: Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
h2656dsp.h
cpu.h
asm.h
vvc_ctu.h
EXTERNAL_SSE4
#define EXTERNAL_SSE4(flags)
Definition: cpu.h:68
VVCDSPContext
Definition: vvcdsp.h:158