FFmpeg
dsp_init.c
Go to the documentation of this file.
1 /*
2  * VVC filters DSP
3  *
4  * Copyright (C) 2024 Zhao Zhili
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include "libavutil/cpu.h"
24 #include "libavutil/aarch64/cpu.h"
26 #include "libavcodec/vvc/dsp.h"
27 #include "libavcodec/vvc/dec.h"
28 #include "libavcodec/vvc/ctu.h"
29 
/* BDOF block-size constants. Neither macro is referenced in the visible part
 * of this file. NOTE(review): check whether the included alf_template.c or
 * anything else still needs them before removing. */
30 #define BDOF_BLOCK_SIZE 16
31 #define BDOF_MIN_BLOCK_SIZE 4
32 
/*
 * Prototypes of the 10- and 12-bit "put" routines that the init function at
 * the bottom of this file stores in c->inter.put. They all share one
 * signature: (dst, _src, _src_stride, height, hf, vf, width).
 *
 * The names follow ff_vvc_put_<component>_<dir><size>_<depth>_neon:
 *   component  luma or chroma
 *   dir        h, v or hv
 *   size       4, 8, 16, or _x16 (installed below for the 32/64/128 width
 *              slots; presumably "any multiple of 16" - TODO confirm)
 *   depth      10 or 12
 * No definitions appear in this file; presumably they live in assembly
 * sources - confirm against the build files.
 */
33 void ff_vvc_put_luma_h8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
34  const int height, const int8_t *hf, const int8_t *vf, const int width);
35 void ff_vvc_put_luma_h16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
36  const int height, const int8_t *hf, const int8_t *vf, const int width);
37 void ff_vvc_put_luma_h_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
38  const int height, const int8_t *hf, const int8_t *vf, const int width);
39 void ff_vvc_put_luma_h8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
40  const int height, const int8_t *hf, const int8_t *vf, const int width);
41 void ff_vvc_put_luma_h16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
42  const int height, const int8_t *hf, const int8_t *vf, const int width);
43 void ff_vvc_put_luma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
44  const int height, const int8_t *hf, const int8_t *vf, const int width);
45 
46 void ff_vvc_put_chroma_h8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
47  const int height, const int8_t *hf, const int8_t *vf, const int width);
48 void ff_vvc_put_chroma_h16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
49  const int height, const int8_t *hf, const int8_t *vf, const int width);
50 void ff_vvc_put_chroma_h_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
51  const int height, const int8_t *hf, const int8_t *vf, const int width);
52 void ff_vvc_put_chroma_h8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
53  const int height, const int8_t *hf, const int8_t *vf, const int width);
54 void ff_vvc_put_chroma_h16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
55  const int height, const int8_t *hf, const int8_t *vf, const int width);
56 void ff_vvc_put_chroma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
57  const int height, const int8_t *hf, const int8_t *vf, const int width);
58 
59 void ff_vvc_put_chroma_v4_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
60  const int height, const int8_t *hf, const int8_t *vf, const int width);
61 void ff_vvc_put_chroma_v8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
62  const int height, const int8_t *hf, const int8_t *vf, const int width);
63 void ff_vvc_put_chroma_v16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
64  const int height, const int8_t *hf, const int8_t *vf, const int width);
65 void ff_vvc_put_chroma_v_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
66  const int height, const int8_t *hf, const int8_t *vf, const int width);
67 void ff_vvc_put_chroma_v4_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
68  const int height, const int8_t *hf, const int8_t *vf, const int width);
69 void ff_vvc_put_chroma_v8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
70  const int height, const int8_t *hf, const int8_t *vf, const int width);
71 void ff_vvc_put_chroma_v16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
72  const int height, const int8_t *hf, const int8_t *vf, const int width);
73 void ff_vvc_put_chroma_v_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
74  const int height, const int8_t *hf, const int8_t *vf, const int width);
75 
76 void ff_vvc_put_luma_v4_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
77  const int height, const int8_t *hf, const int8_t *vf, const int width);
78 void ff_vvc_put_luma_v8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
79  const int height, const int8_t *hf, const int8_t *vf, const int width);
80 void ff_vvc_put_luma_v16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
81  const int height, const int8_t *hf, const int8_t *vf, const int width);
82 void ff_vvc_put_luma_v_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
83  const int height, const int8_t *hf, const int8_t *vf, const int width);
84 void ff_vvc_put_luma_v4_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
85  const int height, const int8_t *hf, const int8_t *vf, const int width);
86 void ff_vvc_put_luma_v8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
87  const int height, const int8_t *hf, const int8_t *vf, const int width);
88 void ff_vvc_put_luma_v16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
89  const int height, const int8_t *hf, const int8_t *vf, const int width);
90 void ff_vvc_put_luma_v_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
91  const int height, const int8_t *hf, const int8_t *vf, const int width);
92 
93 void ff_vvc_put_luma_hv8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
94  const int height, const int8_t *hf, const int8_t *vf, const int width);
95 void ff_vvc_put_luma_hv16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
96  const int height, const int8_t *hf, const int8_t *vf, const int width);
97 void ff_vvc_put_luma_hv_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
98  const int height, const int8_t *hf, const int8_t *vf, const int width);
99 void ff_vvc_put_luma_hv8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
100  const int height, const int8_t *hf, const int8_t *vf, const int width);
101 void ff_vvc_put_luma_hv16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
102  const int height, const int8_t *hf, const int8_t *vf, const int width);
103 void ff_vvc_put_luma_hv_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
104  const int height, const int8_t *hf, const int8_t *vf, const int width);
105 
106 void ff_vvc_put_chroma_hv8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
107  const int height, const int8_t *hf, const int8_t *vf, const int width);
108 void ff_vvc_put_chroma_hv16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
109  const int height, const int8_t *hf, const int8_t *vf, const int width);
110 void ff_vvc_put_chroma_hv_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
111  const int height, const int8_t *hf, const int8_t *vf, const int width);
112 void ff_vvc_put_chroma_hv8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
113  const int height, const int8_t *hf, const int8_t *vf, const int width);
114 void ff_vvc_put_chroma_hv16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
115  const int height, const int8_t *hf, const int8_t *vf, const int width);
116 void ff_vvc_put_chroma_hv_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride,
117  const int height, const int8_t *hf, const int8_t *vf, const int width);
118 
/* Declared ahead of the template inclusions below and not called anywhere
 * else in the visible part of this file; presumably alf_template.c is its
 * only user - TODO confirm. */
119 void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshift, uint32_t steps);
120 
/* alf_template.c is included three times with BIT_DEPTH set to 8, 10 and 12.
 * The init function in this file references alf_filter_luma_<depth>_neon,
 * alf_filter_chroma_<depth>_neon and alf_classify_<depth>_neon without any
 * other visible declaration, so presumably each inclusion defines that trio
 * for its bit depth - confirm in alf_template.c. */
121 #define BIT_DEPTH 8
122 #include "alf_template.c"
123 #undef BIT_DEPTH
124 
125 #define BIT_DEPTH 10
126 #include "alf_template.c"
127 #undef BIT_DEPTH
128 
129 #define BIT_DEPTH 12
130 #include "alf_template.c"
131 #undef BIT_DEPTH
132 
/* SAD routine; the init function in this file installs it as c->inter.sad
 * for every bit depth it handles. */
133 int ff_vvc_sad_neon(const int16_t *src0, const int16_t *src1, int dx, int dy,
134  const int block_w, const int block_h);
135 
/* Averaging routines, one per bit depth (8, 10, 12); installed as
 * c->inter.avg by the init function in this file. */
136 void ff_vvc_avg_8_neon(uint8_t *dst, ptrdiff_t dst_stride,
137  const int16_t *src0, const int16_t *src1, int width,
138  int height);
139 void ff_vvc_avg_10_neon(uint8_t *dst, ptrdiff_t dst_stride,
140  const int16_t *src0, const int16_t *src1, int width,
141  int height);
142 void ff_vvc_avg_12_neon(uint8_t *dst, ptrdiff_t dst_stride,
143  const int16_t *src0, const int16_t *src1, int width,
144  int height);
145 
/* Weighted-average routines, one per bit depth. Unlike the avg routines
 * above they are not installed directly: the weights and the offset/shift
 * pair arrive packed into the two trailing uintptr_t arguments, which the
 * vvc_w_avg_<depth>() wrappers in this file build. */
146 void ff_vvc_w_avg_8_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
147  const int16_t *src0, const int16_t *src1,
148  int width, int height,
149  uintptr_t w0_w1, uintptr_t offset_shift);
150 void ff_vvc_w_avg_10_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
151  const int16_t *src0, const int16_t *src1,
152  int width, int height,
153  uintptr_t w0_w1, uintptr_t offset_shift);
154 void ff_vvc_w_avg_12_neon(uint8_t *_dst, ptrdiff_t _dst_stride,
155  const int16_t *src0, const int16_t *src1,
156  int width, int height,
157  uintptr_t w0_w1, uintptr_t offset_shift);
158 /* When passing arguments to functions, Apple platforms diverge from the ARM64
159  * standard ABI for functions that require passing arguments on the stack. To
160  * simplify portability in the assembly function interface, use a different
161  * function signature that doesn't require passing arguments on the stack.
162  */
/*
 * W_AVG_FUN(bit_depth) defines the static wrapper vvc_w_avg_<bit_depth>().
 * The wrapper turns (denom, w0, w1, o) into the two packed arguments taken
 * by ff_vvc_w_avg_<bit_depth>_neon():
 *   shift        = denom + FFMAX(3, 15 - bit_depth)
 *   offset       = (o * 2^(bit_depth - 8) + 1) * 2^(shift - 1)
 *   w0_w1        = w0 in the bits above bit 31, w1 in bits 31..0
 *   offset_shift = offset in the bits above bit 31, shift in bits 31..0
 * The packing shifts a uintptr_t left by 32, so it relies on uintptr_t being
 * wider than 32 bits. dst, dst_stride, src0, src1, width and height are
 * forwarded unchanged.
 */
163 #define W_AVG_FUN(bit_depth) \
164 static void vvc_w_avg_ ## bit_depth(uint8_t *dst, ptrdiff_t dst_stride, \
165  const int16_t *src0, const int16_t *src1, int width, int height, \
166  int denom, int w0, int w1, int o) \
167 { \
168  int shift = denom + FFMAX(3, 15 - bit_depth); \
169  int offset = (o * (1 << (bit_depth - 8)) + 1) * (1 << (shift - 1)); \
170  uintptr_t w0_w1 = ((uintptr_t)w0 << 32) | (uint32_t)w1; \
171  uintptr_t offset_shift = ((uintptr_t)offset << 32) | (uint32_t)shift; \
172  ff_vvc_w_avg_ ## bit_depth ## _neon(dst, dst_stride, src0, src1, width, height, w0_w1, offset_shift); \
173 }
174 
/* One wrapper per supported bit depth. */
175 W_AVG_FUN(8)
176 W_AVG_FUN(10)
177 W_AVG_FUN(12)
178 
/*
 * DMVR_FUN(fn, bd) declares ff_vvc_dmvr_<fn><bd>_neon(), where fn is empty
 * or one of h_, v_, hv_. Only the combinations listed below are declared:
 * 10 bit has no plain and no v_ variant, and 12 bit has no v_ variant.
 */
179 #define DMVR_FUN(fn, bd) \
180  void ff_vvc_dmvr_ ## fn ## bd ## _neon(int16_t *dst, \
181  const uint8_t *_src, ptrdiff_t _src_stride, int height, \
182  intptr_t mx, intptr_t my, int width);
183 
184 DMVR_FUN(, 8)
185 DMVR_FUN(, 12)
186 DMVR_FUN(h_, 8)
187 DMVR_FUN(h_, 10)
188 DMVR_FUN(h_, 12)
189 DMVR_FUN(v_, 8)
190 DMVR_FUN(hv_, 8)
191 DMVR_FUN(hv_, 10)
192 DMVR_FUN(hv_, 12)
193 
/*
 * APPLY_BDOF_FUNC(bd) declares ff_vvc_apply_bdof_<bd>_neon().
 * NOTE(review): only the 10- and 12-bit instantiations are visible here,
 * yet the init function in this file also assigns ff_vvc_apply_bdof_8_neon.
 * The listing numbering skips 199, so presumably APPLY_BDOF_FUNC(8) was
 * lost from this copy - confirm against the repository.
 */
194 #define APPLY_BDOF_FUNC(bd) \
195  void ff_vvc_apply_bdof_ ## bd ## _neon(uint8_t *_dst, ptrdiff_t _dst_stride, \
196  const int16_t *_src0, const int16_t *_src1, \
197  int block_w, int block_h);
198 
200 APPLY_BDOF_FUNC(10)
201 APPLY_BDOF_FUNC(12)
202 
/**
 * Install the AArch64-specific routines into a VVC DSP context.
 *
 * Returns immediately, leaving *c untouched, when have_neon() rejects the
 * flags reported by av_get_cpu_flags(). Otherwise the branch matching bd
 * (8, 10 or 12) overwrites a subset of the function pointers in c, and the
 * SAD routine is installed last, after the bit-depth branches.
 *
 * Judging by the names of the functions assigned below, inter.put (and
 * put_uni / put_uni_w) is indexed as [type][width][v][h]:
 *   type   0 = luma (qpel), 1 = chroma (epel)
 *   width  1..6 = 4, 8, 16, 32, 64, 128 pixels
 *   v, h   1 when a vertical / horizontal filter is applied
 * NOTE(review): confirm against the VVCDSPContext definition.
 *
 * @param c  DSP context whose function pointers are overwritten
 * @param bd bit depth selecting which set of routines is installed
 */
203 void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
204 {
205  int cpu_flags = av_get_cpu_flags();
206  if (!have_neon(cpu_flags))
207  return;
208 
209  if (bd == 8) {
    /* Luma, no filtering: one copy routine per width. */
210  c->inter.put[0][1][0][0] = ff_vvc_put_pel_pixels4_8_neon;
211  c->inter.put[0][2][0][0] = ff_vvc_put_pel_pixels8_8_neon;
212  c->inter.put[0][3][0][0] = ff_vvc_put_pel_pixels16_8_neon;
213  c->inter.put[0][4][0][0] = ff_vvc_put_pel_pixels32_8_neon;
214  c->inter.put[0][5][0][0] = ff_vvc_put_pel_pixels64_8_neon;
215  c->inter.put[0][6][0][0] = ff_vvc_put_pel_pixels128_8_neon;
216 
    /* Luma, horizontal: widths 32, 64 and 128 share the h32 routine. */
217  c->inter.put[0][1][0][1] = ff_vvc_put_qpel_h4_8_neon;
218  c->inter.put[0][2][0][1] = ff_vvc_put_qpel_h8_8_neon;
219  c->inter.put[0][3][0][1] = ff_vvc_put_qpel_h16_8_neon;
220  c->inter.put[0][4][0][1] =
221  c->inter.put[0][5][0][1] =
222  c->inter.put[0][6][0][1] = ff_vvc_put_qpel_h32_8_neon;
223 
    /* Luma, vertical: every width from 8 upwards shares the v8 routine. */
224  c->inter.put[0][1][1][0] = ff_vvc_put_qpel_v4_8_neon;
225  c->inter.put[0][2][1][0] =
226  c->inter.put[0][3][1][0] =
227  c->inter.put[0][4][1][0] =
228  c->inter.put[0][5][1][0] =
229  c->inter.put[0][6][1][0] = ff_vvc_put_qpel_v8_8_neon;
230 
    /* Luma, horizontal + vertical. */
231  c->inter.put[0][1][1][1] = ff_vvc_put_qpel_hv4_8_neon;
232  c->inter.put[0][2][1][1] = ff_vvc_put_qpel_hv8_8_neon;
233  c->inter.put[0][3][1][1] = ff_vvc_put_qpel_hv16_8_neon;
234  c->inter.put[0][4][1][1] = ff_vvc_put_qpel_hv32_8_neon;
235  c->inter.put[0][5][1][1] = ff_vvc_put_qpel_hv64_8_neon;
236  c->inter.put[0][6][1][1] = ff_vvc_put_qpel_hv128_8_neon;
237 
    /* Chroma, no filtering: reuses the same copy routines as luma. */
238  c->inter.put[1][1][0][0] = ff_vvc_put_pel_pixels4_8_neon;
239  c->inter.put[1][2][0][0] = ff_vvc_put_pel_pixels8_8_neon;
240  c->inter.put[1][3][0][0] = ff_vvc_put_pel_pixels16_8_neon;
241  c->inter.put[1][4][0][0] = ff_vvc_put_pel_pixels32_8_neon;
242  c->inter.put[1][5][0][0] = ff_vvc_put_pel_pixels64_8_neon;
243  c->inter.put[1][6][0][0] = ff_vvc_put_pel_pixels128_8_neon;
244 
    /* Chroma, horizontal: widths 32, 64 and 128 share the h32 routine.
     * No chroma vertical-only ([1][w][1][0]) entry is set for 8 bit. */
245  c->inter.put[1][1][0][1] = ff_vvc_put_epel_h4_8_neon;
246  c->inter.put[1][2][0][1] = ff_vvc_put_epel_h8_8_neon;
247  c->inter.put[1][3][0][1] = ff_vvc_put_epel_h16_8_neon;
248  c->inter.put[1][4][0][1] =
249  c->inter.put[1][5][0][1] =
250  c->inter.put[1][6][0][1] = ff_vvc_put_epel_h32_8_neon;
251 
    /* Chroma, horizontal + vertical. */
252  c->inter.put[1][1][1][1] = ff_vvc_put_epel_hv4_8_neon;
253  c->inter.put[1][2][1][1] = ff_vvc_put_epel_hv8_8_neon;
254  c->inter.put[1][3][1][1] = ff_vvc_put_epel_hv16_8_neon;
255  c->inter.put[1][4][1][1] = ff_vvc_put_epel_hv32_8_neon;
256  c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon;
257  c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon;
258 
    /* put_uni: only the luma unfiltered and horizontal entries are set. */
259  c->inter.put_uni[0][1][0][0] = ff_vvc_put_pel_uni_pixels4_8_neon;
260  c->inter.put_uni[0][2][0][0] = ff_vvc_put_pel_uni_pixels8_8_neon;
261  c->inter.put_uni[0][3][0][0] = ff_vvc_put_pel_uni_pixels16_8_neon;
262  c->inter.put_uni[0][4][0][0] = ff_vvc_put_pel_uni_pixels32_8_neon;
263  c->inter.put_uni[0][5][0][0] = ff_vvc_put_pel_uni_pixels64_8_neon;
264  c->inter.put_uni[0][6][0][0] = ff_vvc_put_pel_uni_pixels128_8_neon;
265 
266  c->inter.put_uni[0][1][0][1] = ff_vvc_put_qpel_uni_h4_8_neon;
267  c->inter.put_uni[0][2][0][1] = ff_vvc_put_qpel_uni_h8_8_neon;
268  c->inter.put_uni[0][3][0][1] = ff_vvc_put_qpel_uni_h16_8_neon;
269  c->inter.put_uni[0][4][0][1] =
270  c->inter.put_uni[0][5][0][1] =
271  c->inter.put_uni[0][6][0][1] = ff_vvc_put_qpel_uni_h32_8_neon;
272 
    /* put_uni_w: only the luma unfiltered entries are set. */
273  c->inter.put_uni_w[0][1][0][0] = ff_vvc_put_pel_uni_w_pixels4_8_neon;
274  c->inter.put_uni_w[0][2][0][0] = ff_vvc_put_pel_uni_w_pixels8_8_neon;
275  c->inter.put_uni_w[0][3][0][0] = ff_vvc_put_pel_uni_w_pixels16_8_neon;
276  c->inter.put_uni_w[0][4][0][0] = ff_vvc_put_pel_uni_w_pixels32_8_neon;
277  c->inter.put_uni_w[0][5][0][0] = ff_vvc_put_pel_uni_w_pixels64_8_neon;
278  c->inter.put_uni_w[0][6][0][0] = ff_vvc_put_pel_uni_w_pixels128_8_neon;
279 
    /* w_avg goes through the static vvc_w_avg_8() wrapper rather than the
     * _neon routine directly; all four dmvr slots are filled for 8 bit. */
280  c->inter.avg = ff_vvc_avg_8_neon;
281  c->inter.w_avg = vvc_w_avg_8;
282  c->inter.dmvr[0][0] = ff_vvc_dmvr_8_neon;
283  c->inter.dmvr[0][1] = ff_vvc_dmvr_h_8_neon;
284  c->inter.dmvr[1][0] = ff_vvc_dmvr_v_8_neon;
285  c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_8_neon;
286  c->inter.apply_bdof = ff_vvc_apply_bdof_8_neon;
287 
    /* SAO: slot 0 gets the 8x8 routine, every remaining slot the 16x16 one. */
288  c->sao.band_filter[0] = ff_h26x_sao_band_filter_8x8_8_neon;
289  for (int i = 1; i < FF_ARRAY_ELEMS(c->sao.band_filter); i++)
290  c->sao.band_filter[i] = ff_h26x_sao_band_filter_16x16_8_neon;
291  c->sao.edge_filter[0] = ff_vvc_sao_edge_filter_8x8_8_neon;
292  for (int i = 1; i < FF_ARRAY_ELEMS(c->sao.edge_filter); i++)
293  c->sao.edge_filter[i] = ff_vvc_sao_edge_filter_16x16_8_neon;
294  c->alf.filter[LUMA] = alf_filter_luma_8_neon;
295  c->alf.filter[CHROMA] = alf_filter_chroma_8_neon;
296  c->alf.classify = alf_classify_8_neon;
297 
    /* With i8mm available, replace the horizontal and hv routines installed
     * above; the i8mm set has a distinct routine for every width. */
298  if (have_i8mm(cpu_flags)) {
299  c->inter.put[0][1][0][1] = ff_vvc_put_qpel_h4_8_neon_i8mm;
300  c->inter.put[0][2][0][1] = ff_vvc_put_qpel_h8_8_neon_i8mm;
301  c->inter.put[0][3][0][1] = ff_vvc_put_qpel_h16_8_neon_i8mm;
302  c->inter.put[0][4][0][1] = ff_vvc_put_qpel_h32_8_neon_i8mm;
303  c->inter.put[0][5][0][1] = ff_vvc_put_qpel_h64_8_neon_i8mm;
304  c->inter.put[0][6][0][1] = ff_vvc_put_qpel_h128_8_neon_i8mm;
305 
306  c->inter.put[0][1][1][1] = ff_vvc_put_qpel_hv4_8_neon_i8mm;
307  c->inter.put[0][2][1][1] = ff_vvc_put_qpel_hv8_8_neon_i8mm;
308  c->inter.put[0][3][1][1] = ff_vvc_put_qpel_hv16_8_neon_i8mm;
309  c->inter.put[0][4][1][1] = ff_vvc_put_qpel_hv32_8_neon_i8mm;
310  c->inter.put[0][5][1][1] = ff_vvc_put_qpel_hv64_8_neon_i8mm;
311  c->inter.put[0][6][1][1] = ff_vvc_put_qpel_hv128_8_neon_i8mm;
312 
313  c->inter.put[1][1][0][1] = ff_vvc_put_epel_h4_8_neon_i8mm;
314  c->inter.put[1][2][0][1] = ff_vvc_put_epel_h8_8_neon_i8mm;
315  c->inter.put[1][3][0][1] = ff_vvc_put_epel_h16_8_neon_i8mm;
316  c->inter.put[1][4][0][1] = ff_vvc_put_epel_h32_8_neon_i8mm;
317  c->inter.put[1][5][0][1] = ff_vvc_put_epel_h64_8_neon_i8mm;
318  c->inter.put[1][6][0][1] = ff_vvc_put_epel_h128_8_neon_i8mm;
319 
320  c->inter.put[1][1][1][1] = ff_vvc_put_epel_hv4_8_neon_i8mm;
321  c->inter.put[1][2][1][1] = ff_vvc_put_epel_hv8_8_neon_i8mm;
322  c->inter.put[1][3][1][1] = ff_vvc_put_epel_hv16_8_neon_i8mm;
323  c->inter.put[1][4][1][1] = ff_vvc_put_epel_hv32_8_neon_i8mm;
324  c->inter.put[1][5][1][1] = ff_vvc_put_epel_hv64_8_neon_i8mm;
325  c->inter.put[1][6][1][1] = ff_vvc_put_epel_hv128_8_neon_i8mm;
326  }
    /* NOTE(review): the listing numbering jumps from 326 to 328 here, and
     * the closing brace after the next statement has no visible opener, so
     * the condition guarding this SME2 override is not shown. The page's
     * cross-reference list names have_sme2() and have_sme_i16i64();
     * presumably they form that condition - confirm against the repository. */
328  c->alf.filter[LUMA] = alf_filter_luma_8_sme2;
329  }
330  } else if (bd == 10) {
    /* 10 bit: only the h and hv dmvr slots are set. */
331  c->inter.avg = ff_vvc_avg_10_neon;
332  c->inter.w_avg = vvc_w_avg_10;
333  c->inter.dmvr[0][1] = ff_vvc_dmvr_h_10_neon;
334  c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_10_neon;
335  c->inter.apply_bdof = ff_vvc_apply_bdof_10_neon;
336 
    /* In every group below, widths 32, 64 and 128 share the _x16 routine.
     * The h and hv groups start at width 8; only the v groups also set the
     * width-4 slot. */
337  c->inter.put[1][2][0][1] = ff_vvc_put_chroma_h8_10_neon;
338  c->inter.put[1][3][0][1] = ff_vvc_put_chroma_h16_10_neon;
339  c->inter.put[1][4][0][1] =
340  c->inter.put[1][5][0][1] =
341  c->inter.put[1][6][0][1] = ff_vvc_put_chroma_h_x16_10_neon;
342 
343  c->inter.put[0][2][0][1] = ff_vvc_put_luma_h8_10_neon;
344  c->inter.put[0][3][0][1] = ff_vvc_put_luma_h16_10_neon;
345  c->inter.put[0][4][0][1] =
346  c->inter.put[0][5][0][1] =
347  c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_x16_10_neon;
348 
349  c->inter.put[1][1][1][0] = ff_vvc_put_chroma_v4_10_neon;
350  c->inter.put[1][2][1][0] = ff_vvc_put_chroma_v8_10_neon;
351  c->inter.put[1][3][1][0] = ff_vvc_put_chroma_v16_10_neon;
352  c->inter.put[1][4][1][0] =
353  c->inter.put[1][5][1][0] =
354  c->inter.put[1][6][1][0] = ff_vvc_put_chroma_v_x16_10_neon;
355 
356  c->inter.put[0][1][1][0] = ff_vvc_put_luma_v4_10_neon;
357  c->inter.put[0][2][1][0] = ff_vvc_put_luma_v8_10_neon;
358  c->inter.put[0][3][1][0] = ff_vvc_put_luma_v16_10_neon;
359  c->inter.put[0][4][1][0] =
360  c->inter.put[0][5][1][0] =
361  c->inter.put[0][6][1][0] = ff_vvc_put_luma_v_x16_10_neon;
362 
363  c->inter.put[0][2][1][1] = ff_vvc_put_luma_hv8_10_neon;
364  c->inter.put[0][3][1][1] = ff_vvc_put_luma_hv16_10_neon;
365  c->inter.put[0][4][1][1] =
366  c->inter.put[0][5][1][1] =
367  c->inter.put[0][6][1][1] = ff_vvc_put_luma_hv_x16_10_neon;
368 
369  c->inter.put[1][2][1][1] = ff_vvc_put_chroma_hv8_10_neon;
370  c->inter.put[1][3][1][1] = ff_vvc_put_chroma_hv16_10_neon;
371  c->inter.put[1][4][1][1] =
372  c->inter.put[1][5][1][1] =
373  c->inter.put[1][6][1][1] = ff_vvc_put_chroma_hv_x16_10_neon;
374 
375  c->alf.filter[LUMA] = alf_filter_luma_10_neon;
376  c->alf.filter[CHROMA] = alf_filter_chroma_10_neon;
377  c->alf.classify = alf_classify_10_neon;
    /* NOTE(review): listing number 378 is absent and the brace after the
     * next statement has no visible opener; the guard for this SME2
     * override is not shown - confirm against the repository. */
379  c->alf.filter[LUMA] = alf_filter_luma_10_sme2;
380  }
381  } else if (bd == 12) {
    /* 12 bit: the plain, h and hv dmvr slots are set; dmvr[1][0] is not. */
382  c->inter.avg = ff_vvc_avg_12_neon;
383  c->inter.w_avg = vvc_w_avg_12;
384  c->inter.dmvr[0][0] = ff_vvc_dmvr_12_neon;
385  c->inter.dmvr[0][1] = ff_vvc_dmvr_h_12_neon;
386  c->inter.dmvr[1][1] = ff_vvc_dmvr_hv_12_neon;
387  c->inter.apply_bdof = ff_vvc_apply_bdof_12_neon;
388 
    /* Same layout as the 10-bit branch: widths 32, 64 and 128 share the
     * _x16 routine, and only the v groups set the width-4 slot. */
389  c->inter.put[1][2][0][1] = ff_vvc_put_chroma_h8_12_neon;
390  c->inter.put[1][3][0][1] = ff_vvc_put_chroma_h16_12_neon;
391  c->inter.put[1][4][0][1] =
392  c->inter.put[1][5][0][1] =
393  c->inter.put[1][6][0][1] = ff_vvc_put_chroma_h_x16_12_neon;
394 
395  c->inter.put[0][2][0][1] = ff_vvc_put_luma_h8_12_neon;
396  c->inter.put[0][3][0][1] = ff_vvc_put_luma_h16_12_neon;
397  c->inter.put[0][4][0][1] =
398  c->inter.put[0][5][0][1] =
399  c->inter.put[0][6][0][1] = ff_vvc_put_luma_h_x16_12_neon;
400 
401  c->inter.put[0][2][1][1] = ff_vvc_put_luma_hv8_12_neon;
402  c->inter.put[0][3][1][1] = ff_vvc_put_luma_hv16_12_neon;
403  c->inter.put[0][4][1][1] =
404  c->inter.put[0][5][1][1] =
405  c->inter.put[0][6][1][1] = ff_vvc_put_luma_hv_x16_12_neon;
406 
407  c->inter.put[0][1][1][0] = ff_vvc_put_luma_v4_12_neon;
408  c->inter.put[0][2][1][0] = ff_vvc_put_luma_v8_12_neon;
409  c->inter.put[0][3][1][0] = ff_vvc_put_luma_v16_12_neon;
410  c->inter.put[0][4][1][0] =
411  c->inter.put[0][5][1][0] =
412  c->inter.put[0][6][1][0] = ff_vvc_put_luma_v_x16_12_neon;
413 
414  c->inter.put[1][1][1][0] = ff_vvc_put_chroma_v4_12_neon;
415  c->inter.put[1][2][1][0] = ff_vvc_put_chroma_v8_12_neon;
416  c->inter.put[1][3][1][0] = ff_vvc_put_chroma_v16_12_neon;
417  c->inter.put[1][4][1][0] =
418  c->inter.put[1][5][1][0] =
419  c->inter.put[1][6][1][0] = ff_vvc_put_chroma_v_x16_12_neon;
420 
421  c->inter.put[1][2][1][1] = ff_vvc_put_chroma_hv8_12_neon;
422  c->inter.put[1][3][1][1] = ff_vvc_put_chroma_hv16_12_neon;
423  c->inter.put[1][4][1][1] =
424  c->inter.put[1][5][1][1] =
425  c->inter.put[1][6][1][1] = ff_vvc_put_chroma_hv_x16_12_neon;
426 
427  c->alf.filter[LUMA] = alf_filter_luma_12_neon;
428  c->alf.filter[CHROMA] = alf_filter_chroma_12_neon;
429  c->alf.classify = alf_classify_12_neon;
    /* NOTE(review): listing number 430 is absent and the brace after the
     * next statement has no visible opener; the guard for this SME2
     * override is not shown - confirm against the repository. */
431  c->alf.filter[LUMA] = alf_filter_luma_12_sme2;
432  }
433  }
434 
    /* The SAD routine takes int16_t samples and is assigned outside the
     * bit-depth branches. */
435  c->inter.sad = ff_vvc_sad_neon;
436 }
_dst
uint8_t * _dst
Definition: dsp.h:56
ff_vvc_put_chroma_v4_10_neon
void ff_vvc_put_chroma_v4_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
LUMA
#define LUMA
Definition: filter.c:31
ff_vvc_put_chroma_h16_12_neon
void ff_vvc_put_chroma_h16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_put_chroma_hv16_10_neon
void ff_vvc_put_chroma_hv16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
src1
const pixel * src1
Definition: h264pred_template.c:420
ff_vvc_put_qpel_v8_8_neon
void ff_vvc_put_qpel_v8_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width)
ff_vvc_put_luma_h_x16_12_neon
void ff_vvc_put_luma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_put_luma_h16_10_neon
void ff_vvc_put_luma_h16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_put_chroma_h_x16_12_neon
void ff_vvc_put_chroma_h_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_put_chroma_h8_10_neon
void ff_vvc_put_chroma_h8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_put_luma_v8_10_neon
void ff_vvc_put_luma_v8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_put_chroma_hv16_12_neon
void ff_vvc_put_chroma_hv16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_put_luma_hv16_12_neon
void ff_vvc_put_luma_hv16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
ff_vvc_dsp_init_aarch64
void ff_vvc_dsp_init_aarch64(VVCDSPContext *const c, const int bd)
Definition: dsp_init.c:203
have_sme2
#define have_sme2(flags)
Definition: cpu.h:37
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
_src
uint8_t ptrdiff_t const uint8_t * _src
Definition: dsp.h:56
ff_vvc_put_chroma_hv_x16_12_neon
void ff_vvc_put_chroma_hv_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_put_luma_h8_10_neon
void ff_vvc_put_luma_h8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_put_luma_h16_12_neon
void ff_vvc_put_luma_h16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
dsp.h
W_AVG_FUN
#define W_AVG_FUN(bit_depth)
Definition: dsp_init.c:163
ff_vvc_put_chroma_h8_12_neon
void ff_vvc_put_chroma_h8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_w_avg_10_neon
void ff_vvc_w_avg_10_neon(uint8_t *_dst, ptrdiff_t _dst_stride, const int16_t *src0, const int16_t *src1, int width, int height, uintptr_t w0_w1, uintptr_t offset_shift)
ff_vvc_put_qpel_v4_8_neon
void ff_vvc_put_qpel_v4_8_neon(int16_t *dst, const uint8_t *_src, ptrdiff_t _srcstride, int height, const int8_t *hf, const int8_t *vf, int width)
ff_vvc_put_luma_hv_x16_12_neon
void ff_vvc_put_luma_hv_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
FF_ARRAY_ELEMS
#define FF_ARRAY_ELEMS(a)
Definition: sinewin_tablegen.c:29
dsp.h
ff_vvc_put_luma_hv16_10_neon
void ff_vvc_put_luma_hv16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_put_luma_hv8_10_neon
void ff_vvc_put_luma_hv8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_put_luma_hv_x16_10_neon
void ff_vvc_put_luma_hv_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_put_chroma_v_x16_10_neon
void ff_vvc_put_chroma_v_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
DMVR_FUN
#define DMVR_FUN(fn, bd)
Definition: dsp_init.c:179
ff_h26x_sao_band_filter_16x16_8_neon
void ff_h26x_sao_band_filter_16x16_8_neon(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, const int16_t *sao_offset_val, int sao_left_class, int width, int height)
ff_vvc_put_chroma_v16_10_neon
void ff_vvc_put_chroma_v16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
if
if(ret)
Definition: filter_design.txt:179
ff_vvc_put_chroma_v16_12_neon
void ff_vvc_put_chroma_v16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
grad
static double grad(int hash, double x, double y, double z)
Definition: perlin.c:42
ff_vvc_put_luma_hv8_12_neon
void ff_vvc_put_luma_hv8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_sao_edge_filter_8x8_8_neon
void ff_vvc_sao_edge_filter_8x8_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst, const int16_t *sao_offset_val, int eo, int width, int height)
ff_vvc_put_chroma_v8_10_neon
void ff_vvc_put_chroma_v8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
c
Undefined Behavior: In the C language, some operations are undefined, like signed integer overflow, dereferencing freed pointers, accessing outside allocated space, ... Undefined Behavior must not occur in a C program; it is not safe even if the output of undefined operations is unused. The unsafety may seem nit-picking, but optimizing compilers have in fact optimized code on the assumption that no undefined behavior occurs. Optimizing code based on wrong assumptions can lead, and in some cases has led, to effects beyond the output of computations. The signed integer overflow problem in speed-critical code: code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c[…]
Definition: undefined.txt:32
have_i8mm
#define have_i8mm(flags)
Definition: cpu.h:32
ff_vvc_put_chroma_v_x16_12_neon
void ff_vvc_put_chroma_v_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
height
#define height
Definition: dsp.h:89
ff_vvc_put_luma_v_x16_10_neon
void ff_vvc_put_luma_v_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:87
ff_vvc_put_chroma_h16_10_neon
void ff_vvc_put_chroma_h16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_put_luma_h_x16_10_neon
void ff_vvc_put_luma_h_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
cpu.h
i
#define i(width, name, range_min, range_max)
Definition: cbs_h264.c:63
for
for(k=2;k<=8;++k)
Definition: h264pred_template.c:424
ff_vvc_put_chroma_v8_12_neon
void ff_vvc_put_chroma_v8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_put_chroma_hv_x16_10_neon
void ff_vvc_put_chroma_hv_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_avg_8_neon
void ff_vvc_avg_8_neon(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1, int width, int height)
have_neon
#define have_neon(flags)
Definition: cpu.h:26
ff_h26x_sao_band_filter_8x8_8_neon
void ff_h26x_sao_band_filter_8x8_8_neon(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, ptrdiff_t stride_src, const int16_t *sao_offset_val, int sao_left_class, int width, int height)
ff_vvc_put_luma_v16_10_neon
void ff_vvc_put_luma_v16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_avg_10_neon
void ff_vvc_avg_10_neon(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1, int width, int height)
ff_alf_classify_sum_neon
void ff_alf_classify_sum_neon(int *sum0, int *sum1, int16_t *grad, uint32_t gshift, uint32_t steps)
ff_vvc_put_chroma_v4_12_neon
void ff_vvc_put_chroma_v4_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_put_chroma_hv8_12_neon
void ff_vvc_put_chroma_hv8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
hf
uint8_t ptrdiff_t const uint8_t ptrdiff_t int const int8_t * hf
Definition: dsp.h:262
ff_vvc_put_luma_v4_10_neon
void ff_vvc_put_luma_v4_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_put_luma_v8_12_neon
void ff_vvc_put_luma_v8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
CHROMA
@ CHROMA
Definition: vf_waveform.c:49
ff_vvc_put_chroma_hv8_10_neon
void ff_vvc_put_chroma_hv8_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ff_vvc_put_luma_v4_12_neon
void ff_vvc_put_luma_v4_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
steps
static const int16_t steps[16]
Definition: misc4.c:30
ff_vvc_w_avg_8_neon
void ff_vvc_w_avg_8_neon(uint8_t *_dst, ptrdiff_t _dst_stride, const int16_t *src0, const int16_t *src1, int width, int height, uintptr_t w0_w1, uintptr_t offset_shift)
ff_vvc_w_avg_12_neon
void ff_vvc_w_avg_12_neon(uint8_t *_dst, ptrdiff_t _dst_stride, const int16_t *src0, const int16_t *src1, int width, int height, uintptr_t w0_w1, uintptr_t offset_shift)
src0
const pixel *const src0
Definition: h264pred_template.c:419
ff_vvc_avg_12_neon
void ff_vvc_avg_12_neon(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src0, const int16_t *src1, int width, int height)
ff_vvc_sao_edge_filter_16x16_8_neon
void ff_vvc_sao_edge_filter_16x16_8_neon(uint8_t *dst, const uint8_t *src, ptrdiff_t stride_dst, const int16_t *sao_offset_val, int eo, int width, int height)
alf_template.c
ff_vvc_put_chroma_h_x16_10_neon
void ff_vvc_put_chroma_h_x16_10_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
have_sme_i16i64
#define have_sme_i16i64(flags)
Definition: cpu.h:36
ff_vvc_sad_neon
int ff_vvc_sad_neon(const int16_t *src0, const int16_t *src1, int dx, int dy, const int block_w, const int block_h)
APPLY_BDOF_FUNC
#define APPLY_BDOF_FUNC(bd)
Definition: dsp_init.c:194
ff_vvc_put_luma_v16_12_neon
void ff_vvc_put_luma_v16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
ctu.h
ff_vvc_put_luma_h8_12_neon
void ff_vvc_put_luma_h8_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
width
#define width
Definition: dsp.h:89
cpu.h
vf
uint8_t ptrdiff_t const uint8_t ptrdiff_t int const int8_t const int8_t * vf
Definition: dsp.h:262
ff_vvc_put_luma_v_x16_12_neon
void ff_vvc_put_luma_v_x16_12_neon(int16_t *dst, const uint8_t *_src, const ptrdiff_t _src_stride, const int height, const int8_t *hf, const int8_t *vf, const int width)
dec.h
VVCDSPContext
Definition: dsp.h:170