FFmpeg
vvcdsp_init.c
Go to the documentation of this file.
1 /*
2  * VVC DSP init for x86
3  *
4  * Copyright (C) 2022-2024 Nuo Mi
5  * Copyright (c) 2023-2024 Wu Jianhua
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
#include "config.h"

#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vvc/dec.h"
#include "libavcodec/vvc/ctu.h"
#include "libavcodec/vvc/dsp.h"
#include "libavcodec/x86/h26x/h2656dsp.h"
32 
/*
 * Prototypes for the "put" entry points.
 *
 * Every symbol is named ff_vvc_put_<filter><width>_<bitdepth>_<isa>
 * (for example ff_vvc_put_8tap_hv16_10_avx2) and all of them share one
 * signature.
 */
#define PUT_PROTOTYPE(fn, bd, isa)                                      \
void ff_vvc_put_ ## fn ## _ ## bd ## _ ## isa(int16_t *dst,             \
        const uint8_t *src, ptrdiff_t srcstride, int height,            \
        const int8_t *hf, const int8_t *vf, int width);

/* One prototype per block width: 2, 4, 8, 12, 16, 24, 32, 48, 64, 128. */
#define PUT_PROTOTYPES(filter, bd, isa)     \
    PUT_PROTOTYPE(filter ## 2,   bd, isa)   \
    PUT_PROTOTYPE(filter ## 4,   bd, isa)   \
    PUT_PROTOTYPE(filter ## 8,   bd, isa)   \
    PUT_PROTOTYPE(filter ## 12,  bd, isa)   \
    PUT_PROTOTYPE(filter ## 16,  bd, isa)   \
    PUT_PROTOTYPE(filter ## 24,  bd, isa)   \
    PUT_PROTOTYPE(filter ## 32,  bd, isa)   \
    PUT_PROTOTYPE(filter ## 48,  bd, isa)   \
    PUT_PROTOTYPE(filter ## 64,  bd, isa)   \
    PUT_PROTOTYPE(filter ## 128, bd, isa)

/* All widths for each of the bit depths 8, 10 and 12. */
#define PUT_BPC_PROTOTYPES(filter, isa)     \
    PUT_PROTOTYPES(filter, 8,  isa)         \
    PUT_PROTOTYPES(filter, 10, isa)         \
    PUT_PROTOTYPES(filter, 12, isa)

/* The h, v and hv variants of an <taps>-tap filter. */
#define PUT_TAP_PROTOTYPES(taps, isa)           \
    PUT_BPC_PROTOTYPES(taps ## tap_h,  isa)     \
    PUT_BPC_PROTOTYPES(taps ## tap_v,  isa)     \
    PUT_BPC_PROTOTYPES(taps ## tap_hv, isa)

/* Plain pixel variants. */
PUT_BPC_PROTOTYPES(pixels, sse4)
PUT_BPC_PROTOTYPES(pixels, avx2)

/* 4-tap and 8-tap filter variants. */
PUT_TAP_PROTOTYPES(4, sse4)
PUT_TAP_PROTOTYPES(8, sse4)
PUT_TAP_PROTOTYPES(4, avx2)
PUT_TAP_PROTOTYPES(8, avx2)
65 
/* bf(): builds <fn>_<bd>_<opt>, e.g. ff_vvc_avg_10_avx2. */
#define bf(fn, bd, opt) fn ## _ ## bd ## _ ## opt
/* BF(): builds <fn>_<bits>bpc_<opt>, e.g. ff_vvc_avg_16bpc_avx2. */
#define BF(fn, bits, opt) fn ## _ ## bits ## bpc_ ## opt

/*
 * Prototypes of the <bits>bpc avg / w_avg routines. Compared with the
 * per-bit-depth entry points below, their integer arguments are intptr_t
 * and they take a trailing pixel_max argument.
 */
#define AVG_BPC_PROTOTYPES(bits, opt)                                       \
void BF(ff_vvc_avg, bits, opt)(uint8_t *dst, ptrdiff_t dst_stride,          \
                               const int16_t *src0, const int16_t *src1,    \
                               intptr_t width, intptr_t height,             \
                               intptr_t pixel_max);                         \
void BF(ff_vvc_w_avg, bits, opt)(uint8_t *dst, ptrdiff_t dst_stride,        \
                                 const int16_t *src0, const int16_t *src1,  \
                                 intptr_t width, intptr_t height,           \
                                 intptr_t denom, intptr_t w0, intptr_t w1,  \
                                 intptr_t o0, intptr_t o1,                  \
                                 intptr_t pixel_max);

/* Prototypes of the per-bit-depth avg / w_avg entry points. */
#define AVG_PROTOTYPES(depth, opt)                                          \
void bf(ff_vvc_avg, depth, opt)(uint8_t *dst, ptrdiff_t dst_stride,         \
                                const int16_t *src0, const int16_t *src1,   \
                                int width, int height);                     \
void bf(ff_vvc_w_avg, depth, opt)(uint8_t *dst, ptrdiff_t dst_stride,       \
                                  const int16_t *src0, const int16_t *src1, \
                                  int width, int height,                    \
                                  int denom, int w0, int w1,                \
                                  int o0, int o1);

AVG_BPC_PROTOTYPES( 8, avx2)
AVG_BPC_PROTOTYPES(16, avx2)

AVG_PROTOTYPES( 8, avx2)
AVG_PROTOTYPES(10, avx2)
AVG_PROTOTYPES(12, avx2)
89 
90 #if ARCH_X86_64
91 #if HAVE_SSE4_EXTERNAL
92 #define FW_PUT(name, depth, opt) \
93 void ff_vvc_put_ ## name ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, \
94  int height, const int8_t *hf, const int8_t *vf, int width) \
95 { \
96  ff_h2656_put_## name ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, srcstride, height, hf, vf, width); \
97 }
98 
99 #define FW_PUT_TAP(fname, bitd, opt ) \
100  FW_PUT(fname##4, bitd, opt ) \
101  FW_PUT(fname##8, bitd, opt ) \
102  FW_PUT(fname##16, bitd, opt ) \
103  FW_PUT(fname##32, bitd, opt ) \
104  FW_PUT(fname##64, bitd, opt ) \
105  FW_PUT(fname##128, bitd, opt ) \
106 
107 #define FW_PUT_4TAP(fname, bitd, opt) \
108  FW_PUT(fname ## 2, bitd, opt) \
109  FW_PUT_TAP(fname, bitd, opt)
110 
111 #define FW_PUT_4TAP_SSE4(bitd) \
112  FW_PUT_4TAP(pixels, bitd, sse4) \
113  FW_PUT_4TAP(4tap_h, bitd, sse4) \
114  FW_PUT_4TAP(4tap_v, bitd, sse4) \
115  FW_PUT_4TAP(4tap_hv, bitd, sse4)
116 
117 #define FW_PUT_8TAP_SSE4(bitd) \
118  FW_PUT_TAP(8tap_h, bitd, sse4) \
119  FW_PUT_TAP(8tap_v, bitd, sse4) \
120  FW_PUT_TAP(8tap_hv, bitd, sse4)
121 
122 #define FW_PUT_SSE4(bitd) \
123  FW_PUT_4TAP_SSE4(bitd) \
124  FW_PUT_8TAP_SSE4(bitd)
125 
126 FW_PUT_SSE4( 8)
127 FW_PUT_SSE4(10)
128 FW_PUT_SSE4(12)
129 #endif
130 
131 #if HAVE_AVX2_EXTERNAL
132 #define FW_PUT_TAP_AVX2(n, bitd) \
133  FW_PUT(n ## tap_h32, bitd, avx2) \
134  FW_PUT(n ## tap_h64, bitd, avx2) \
135  FW_PUT(n ## tap_h128, bitd, avx2) \
136  FW_PUT(n ## tap_v32, bitd, avx2) \
137  FW_PUT(n ## tap_v64, bitd, avx2) \
138  FW_PUT(n ## tap_v128, bitd, avx2)
139 
140 #define FW_PUT_AVX2(bitd) \
141  FW_PUT(pixels32, bitd, avx2) \
142  FW_PUT(pixels64, bitd, avx2) \
143  FW_PUT(pixels128, bitd, avx2) \
144  FW_PUT_TAP_AVX2(4, bitd) \
145  FW_PUT_TAP_AVX2(8, bitd) \
146 
147 FW_PUT_AVX2( 8)
148 FW_PUT_AVX2(10)
149 FW_PUT_AVX2(12)
150 
151 #define FW_PUT_TAP_16BPC_AVX2(n, bitd) \
152  FW_PUT(n ## tap_h16, bitd, avx2) \
153  FW_PUT(n ## tap_v16, bitd, avx2) \
154  FW_PUT(n ## tap_hv16, bitd, avx2) \
155  FW_PUT(n ## tap_hv32, bitd, avx2) \
156  FW_PUT(n ## tap_hv64, bitd, avx2) \
157  FW_PUT(n ## tap_hv128, bitd, avx2)
158 
159 #define FW_PUT_16BPC_AVX2(bitd) \
160  FW_PUT(pixels16, bitd, avx2) \
161  FW_PUT_TAP_16BPC_AVX2(4, bitd) \
162  FW_PUT_TAP_16BPC_AVX2(8, bitd)
163 
164 FW_PUT_16BPC_AVX2(10)
165 FW_PUT_16BPC_AVX2(12)
166 
167 #define AVG_FUNCS(bpc, bd, opt) \
168 void bf(ff_vvc_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
169  const int16_t *src0, const int16_t *src1, int width, int height) \
170 { \
171  BF(ff_vvc_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, (1 << bd) - 1); \
172 } \
173 void bf(ff_vvc_w_avg, bd, opt)(uint8_t *dst, ptrdiff_t dst_stride, \
174  const int16_t *src0, const int16_t *src1, int width, int height, \
175  int denom, int w0, int w1, int o0, int o1) \
176 { \
177  BF(ff_vvc_w_avg, bpc, opt)(dst, dst_stride, src0, src1, width, height, \
178  denom, w0, w1, o0, o1, (1 << bd) - 1); \
179 }
180 
181 AVG_FUNCS(8, 8, avx2)
182 AVG_FUNCS(16, 10, avx2)
183 AVG_FUNCS(16, 12, avx2)
184 #endif
185 
/*
 * PEL_LINK stores ff_vvc_put_<name>_<D>_<opt> in dst[C][W][idx1][idx2] and
 * ff_h2656_put_uni_<name>_<D>_<opt> in the same slot of the sibling table
 * whose name is dst with "_uni" appended.
 *
 * The expansion is self-terminating: both assignments end in ';'. The
 * MC_TAP_LINKS*_AVX2 macros invoke PEL_LINK without a trailing semicolon
 * and rely on this, so do not drop the final ';'.
 */
#define PEL_LINK(dst, C, W, idx1, idx2, name, D, opt)                               \
    dst[C][W][idx1][idx2] = ff_vvc_put_ ## name ## _ ## D ## _ ## opt;              \
    dst ## _uni[C][W][idx1][idx2] = ff_h2656_put_uni_ ## name ## _ ## D ## _ ## opt;

/* Link widths 4, 8, 16, 32, 64 and 128 into W slots 1..6. */
#define MC_TAP_LINKS(pointer, C, my, mx, fname, bitd, opt)          \
    PEL_LINK(pointer, C, 1, my, mx, fname ## 4,   bitd, opt)        \
    PEL_LINK(pointer, C, 2, my, mx, fname ## 8,   bitd, opt)        \
    PEL_LINK(pointer, C, 3, my, mx, fname ## 16,  bitd, opt)        \
    PEL_LINK(pointer, C, 4, my, mx, fname ## 32,  bitd, opt)        \
    PEL_LINK(pointer, C, 5, my, mx, fname ## 64,  bitd, opt)        \
    PEL_LINK(pointer, C, 6, my, mx, fname ## 128, bitd, opt)

/* The 8-tap functions go into the LUMA tables. */
#define MC_8TAP_LINKS(pointer, my, mx, fname, bitd, opt)            \
    MC_TAP_LINKS(pointer, LUMA, my, mx, fname, bitd, opt)

#define MC_8TAP_LINKS_SSE4(bd)                                      \
    MC_8TAP_LINKS(c->inter.put, 0, 0, pixels,  bd, sse4)            \
    MC_8TAP_LINKS(c->inter.put, 0, 1, 8tap_h,  bd, sse4)            \
    MC_8TAP_LINKS(c->inter.put, 1, 0, 8tap_v,  bd, sse4)            \
    MC_8TAP_LINKS(c->inter.put, 1, 1, 8tap_hv, bd, sse4)

/* The 4-tap functions go into the CHROMA tables; width 2 uses W slot 0. */
#define MC_4TAP_LINKS(pointer, my, mx, fname, bitd, opt)            \
    PEL_LINK(pointer, CHROMA, 0, my, mx, fname ## 2, bitd, opt)     \
    MC_TAP_LINKS(pointer, CHROMA, my, mx, fname, bitd, opt)

#define MC_4TAP_LINKS_SSE4(bd)                                      \
    MC_4TAP_LINKS(c->inter.put, 0, 0, pixels,  bd, sse4)            \
    MC_4TAP_LINKS(c->inter.put, 0, 1, 4tap_h,  bd, sse4)            \
    MC_4TAP_LINKS(c->inter.put, 1, 0, 4tap_v,  bd, sse4)            \
    MC_4TAP_LINKS(c->inter.put, 1, 1, 4tap_hv, bd, sse4)

/*
 * Link every SSE4 put function for one bit depth. Expects a variable named
 * c (used as c->inter.put) in the calling scope. Wrapped in
 * do { } while (0) so that "MC_LINK_SSE4(bd);" is a single statement.
 */
#define MC_LINK_SSE4(bd) do {                                       \
    MC_4TAP_LINKS_SSE4(bd)                                          \
    MC_8TAP_LINKS_SSE4(bd)                                          \
} while (0)
220 
/*
 * AVX2 link macros. All of them expect a variable named c in the calling
 * scope and rely on PEL_LINK terminating its own statements, which is why
 * the PEL_LINK invocations below carry no trailing semicolon.
 */

/*
 * For table C, link the AVX2 pixels, <n>tap_h and <n>tap_v functions for
 * widths 32, 64 and 128 (W slots 4..6).
 */
#define MC_TAP_LINKS_AVX2(C, n, bd) do {                            \
    PEL_LINK(c->inter.put, C, 4, 0, 0, pixels32,       bd, avx2)    \
    PEL_LINK(c->inter.put, C, 5, 0, 0, pixels64,       bd, avx2)    \
    PEL_LINK(c->inter.put, C, 6, 0, 0, pixels128,      bd, avx2)    \
    PEL_LINK(c->inter.put, C, 4, 0, 1, n ## tap_h32,   bd, avx2)    \
    PEL_LINK(c->inter.put, C, 5, 0, 1, n ## tap_h64,   bd, avx2)    \
    PEL_LINK(c->inter.put, C, 6, 0, 1, n ## tap_h128,  bd, avx2)    \
    PEL_LINK(c->inter.put, C, 4, 1, 0, n ## tap_v32,   bd, avx2)    \
    PEL_LINK(c->inter.put, C, 5, 1, 0, n ## tap_v64,   bd, avx2)    \
    PEL_LINK(c->inter.put, C, 6, 1, 0, n ## tap_v128,  bd, avx2)    \
} while (0)

/* LUMA uses the 8-tap functions, CHROMA the 4-tap ones. */
#define MC_LINKS_AVX2(bd) do {                                      \
    MC_TAP_LINKS_AVX2(LUMA,   8, bd);                               \
    MC_TAP_LINKS_AVX2(CHROMA, 4, bd);                               \
} while (0)

/*
 * For table C, link the width-16 pixels/h/v/hv functions (W slot 3) and
 * the hv functions for widths 32, 64 and 128 (W slots 4..6).
 */
#define MC_TAP_LINKS_16BPC_AVX2(C, n, bd) do {                      \
    PEL_LINK(c->inter.put, C, 3, 0, 0, pixels16,       bd, avx2)    \
    PEL_LINK(c->inter.put, C, 3, 0, 1, n ## tap_h16,   bd, avx2)    \
    PEL_LINK(c->inter.put, C, 3, 1, 0, n ## tap_v16,   bd, avx2)    \
    PEL_LINK(c->inter.put, C, 3, 1, 1, n ## tap_hv16,  bd, avx2)    \
    PEL_LINK(c->inter.put, C, 4, 1, 1, n ## tap_hv32,  bd, avx2)    \
    PEL_LINK(c->inter.put, C, 5, 1, 1, n ## tap_hv64,  bd, avx2)    \
    PEL_LINK(c->inter.put, C, 6, 1, 1, n ## tap_hv128, bd, avx2)    \
} while (0)

#define MC_LINKS_16BPC_AVX2(bd) do {                                \
    MC_TAP_LINKS_16BPC_AVX2(LUMA,   8, bd);                         \
    MC_TAP_LINKS_16BPC_AVX2(CHROMA, 4, bd);                         \
} while (0)

/* Point c->inter.avg and c->inter.w_avg at the <bd>-bit <opt> functions. */
#define AVG_INIT(bd, opt) do {                                      \
    c->inter.avg   = bf(ff_vvc_avg, bd, opt);                       \
    c->inter.w_avg = bf(ff_vvc_w_avg, bd, opt);                     \
} while (0)
255 #endif
256 
257 void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
258 {
259 #if ARCH_X86_64
260  const int cpu_flags = av_get_cpu_flags();
261 
262  if (bd == 8) {
263  if (EXTERNAL_SSE4(cpu_flags)) {
264  MC_LINK_SSE4(8);
265  }
267  MC_LINKS_AVX2(8);
268  }
269  } else if (bd == 10) {
270  if (EXTERNAL_SSE4(cpu_flags)) {
271  MC_LINK_SSE4(10);
272  }
274  MC_LINKS_AVX2(10);
275  MC_LINKS_16BPC_AVX2(10);
276  }
277  } else if (bd == 12) {
278  if (EXTERNAL_SSE4(cpu_flags)) {
279  MC_LINK_SSE4(12);
280  }
282  MC_LINKS_AVX2(12);
283  MC_LINKS_16BPC_AVX2(12);
284  }
285  }
286 
287  if (EXTERNAL_AVX2(cpu_flags)) {
288  switch (bd) {
289  case 8:
290  AVG_INIT(8, avx2);
291  break;
292  case 10:
293  AVG_INIT(10, avx2);
294  break;
295  case 12:
296  AVG_INIT(12, avx2);
297  break;
298  default:
299  break;
300  }
301  }
302 #endif
303 }
ff_vvc_dsp_init_x86
void ff_vvc_dsp_init_x86(VVCDSPContext *const c, const int bd)
Definition: vvcdsp_init.c:257
cpu.h
dsp.h
EXTERNAL_AVX2_FAST
#define EXTERNAL_AVX2_FAST(flags)
Definition: cpu.h:79
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:103
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:52
EXTERNAL_AVX2
#define EXTERNAL_AVX2(flags)
Definition: cpu.h:78
AVG_PROTOTYPES
#define AVG_PROTOTYPES(bd, opt)
Definition: vvcdsp_init.c:76
c
Undefined Behavior: In the C language, some operations are undefined, like signed integer overflow, dereferencing freed pointers, accessing outside allocated space, ... Undefined Behavior must not occur in a C program; it is not safe even if the output of undefined operations is unused. The unsafety may seem nit picking, but optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs. Optimizing code based on wrong assumptions can and has in some cases led to effects beyond the output of computations. The signed integer overflow problem in speed critical code: Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c... (excerpt truncated)
Definition: undefined.txt:32
h2656dsp.h
cpu.h
PUT_BPC_PROTOTYPES
#define PUT_BPC_PROTOTYPES(name, opt)
Definition: vvcdsp_init.c:48
EXTERNAL_SSE4
#define EXTERNAL_SSE4(flags)
Definition: cpu.h:68
AVG_BPC_PROTOTYPES
#define AVG_BPC_PROTOTYPES(bpc, opt)
Definition: vvcdsp_init.c:69
ctu.h
PUT_TAP_PROTOTYPES
#define PUT_TAP_PROTOTYPES(n, opt)
Definition: vvcdsp_init.c:53
dec.h
VVCDSPContext
Definition: dsp.h:158