dsp_init.c
1 /*
2  * Copyright (c) 2013 Seppo Tomperi
3  * Copyright (c) 2013-2014 Pierre-Edouard Lepere
4  * Copyright (c) 2023-2024 Wu Jianhua
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include "config.h"
24 
25 #include "libavutil/cpu.h"
26 #include "libavutil/mem_internal.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavutil/x86/cpu.h"
29 #include "libavcodec/hevc/dsp.h"
32 
33 void ff_hevc_dequant_8_ssse3(int16_t *coeffs, int16_t log2_size);
34 
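/* Prototypes for the external assembly deblocking filters: LFC = chroma,
 * LFL = luma, DIR is h or v, one set per bit depth and instruction set. */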
35 #define LFC_FUNC(DIR, DEPTH, OPT) \
36 void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, const int *tc, const uint8_t *no_p, const uint8_t *no_q);
37 
38 #define LFL_FUNC(DIR, DEPTH, OPT) \
39 void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(uint8_t *pix, ptrdiff_t stride, int beta, const int *tc, const uint8_t *no_p, const uint8_t *no_q);
40 
41 #define LFC_FUNCS(type, depth, opt) \
42  LFC_FUNC(h, depth, opt) \
43  LFC_FUNC(v, depth, opt)
44 
45 #define LFL_FUNCS(type, depth, opt) \
46  LFL_FUNC(h, depth, opt) \
47  LFL_FUNC(v, depth, opt)
48 
49 LFC_FUNCS(uint8_t, 8, sse2)
50 LFC_FUNCS(uint8_t, 10, sse2)
51 LFC_FUNCS(uint8_t, 12, sse2)
52 LFC_FUNCS(uint8_t, 8, avx)
53 LFC_FUNCS(uint8_t, 10, avx)
54 LFC_FUNCS(uint8_t, 12, avx)
55 LFL_FUNCS(uint8_t, 8, sse2)
56 LFL_FUNCS(uint8_t, 10, sse2)
57 LFL_FUNCS(uint8_t, 12, sse2)
58 LFL_FUNCS(uint8_t, 8, ssse3)
59 LFL_FUNCS(uint8_t, 10, ssse3)
60 LFL_FUNCS(uint8_t, 12, ssse3)
61 LFL_FUNCS(uint8_t, 8, avx)
62 LFL_FUNCS(uint8_t, 10, avx)
63 LFL_FUNCS(uint8_t, 12, avx)
64 
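/* Prototypes for the assembly inverse transforms: IDCT_DC_FUNCS declares the
 * DC-only shortcuts per transform size, IDCT_FUNCS the full 4x4..32x32 IDCTs. */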
65 #define IDCT_DC_FUNCS(W, opt) \
66 void ff_hevc_idct_ ## W ## _dc_8_ ## opt(int16_t *coeffs); \
67 void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs); \
68 void ff_hevc_idct_ ## W ## _dc_12_ ## opt(int16_t *coeffs)
69 
70 IDCT_DC_FUNCS(4x4, sse2);
71 IDCT_DC_FUNCS(8x8, sse2);
72 IDCT_DC_FUNCS(16x16, sse2);
73 IDCT_DC_FUNCS(32x32, sse2);
74 IDCT_DC_FUNCS(16x16, avx2);
75 IDCT_DC_FUNCS(32x32, avx2);
76 
77 #define IDCT_FUNCS(opt) \
78 void ff_hevc_idct_4x4_8_ ## opt(int16_t *coeffs, int col_limit); \
79 void ff_hevc_idct_4x4_10_ ## opt(int16_t *coeffs, int col_limit); \
80 void ff_hevc_idct_8x8_8_ ## opt(int16_t *coeffs, int col_limit); \
81 void ff_hevc_idct_8x8_10_ ## opt(int16_t *coeffs, int col_limit); \
82 void ff_hevc_idct_16x16_8_ ## opt(int16_t *coeffs, int col_limit); \
83 void ff_hevc_idct_16x16_10_ ## opt(int16_t *coeffs, int col_limit); \
84 void ff_hevc_idct_32x32_8_ ## opt(int16_t *coeffs, int col_limit); \
85 void ff_hevc_idct_32x32_10_ ## opt(int16_t *coeffs, int col_limit);
86 
87 IDCT_FUNCS(sse2)
88 IDCT_FUNCS(avx)
89 
90 
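/* FW_PUT/FW_PUT_UNI generate thin forwarding wrappers: they look up the
 * qpel/epel filter coefficients for the given mx/my fraction and pass them
 * to the shared ff_h2656_put_* SIMD primitives. */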
91 #define ff_hevc_pel_filters ff_hevc_qpel_filters
92 #define DECL_HV_FILTER(f) \
93  const uint8_t *hf = ff_hevc_ ## f ## _filters[mx]; \
94  const uint8_t *vf = ff_hevc_ ## f ## _filters[my];
95 
96 #define FW_PUT(p, a, b, depth, opt) \
97 static void hevc_put_ ## a ## _ ## depth ## _##opt(int16_t *dst, const uint8_t *src, ptrdiff_t srcstride, \
98  int height, intptr_t mx, intptr_t my, int width) \
99 { \
100  DECL_HV_FILTER(p) \
101  ff_h2656_put_ ## b ## _ ## depth ## _##opt(dst, 2 * MAX_PB_SIZE, src, srcstride, height, hf, vf, width); \
102 }
103 
104 #define FW_PUT_UNI(p, a, b, depth, opt) \
105 static void hevc_put_uni_ ## a ## _ ## depth ## _##opt(uint8_t *dst, ptrdiff_t dststride, \
106  const uint8_t *src, ptrdiff_t srcstride, \
107  int height, intptr_t mx, intptr_t my, int width) \
108 { \
109  DECL_HV_FILTER(p) \
110  ff_h2656_put_uni_ ## b ## _ ## depth ## _##opt(dst, dststride, src, srcstride, height, hf, vf, width); \
111 }
112 
113 #if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
114 
115 #define FW_PUT_FUNCS(p, a, b, depth, opt) \
116  FW_PUT(p, a, b, depth, opt) \
117  FW_PUT_UNI(p, a, b, depth, opt)
118 
119 #define FW_PEL(w, depth, opt) FW_PUT_FUNCS(pel, pel_pixels##w, pixels##w, depth, opt)
120 
121 #define FW_DIR(npel, n, w, depth, opt) \
122  FW_PUT_FUNCS(npel, npel ## _h##w, n ## tap_h##w, depth, opt) \
123  FW_PUT_FUNCS(npel, npel ## _v##w, n ## tap_v##w, depth, opt)
124 
125 #define FW_DIR_HV(npel, n, w, depth, opt) \
126  FW_PUT_FUNCS(npel, npel ## _hv##w, n ## tap_hv##w, depth, opt)
127 
128 FW_PEL(4, 8, sse4)
129 FW_PEL(6, 8, sse4)
130 FW_PEL(8, 8, sse4)
131 FW_PEL(12, 8, sse4)
132 FW_PEL(16, 8, sse4)
133 FW_PEL(4, 10, sse4)
134 FW_PEL(6, 10, sse4)
135 FW_PEL(8, 10, sse4)
136 FW_PEL(4, 12, sse4)
137 FW_PEL(6, 12, sse4)
138 FW_PEL(8, 12, sse4)
139 
140 #define FW_EPEL(w, depth, opt) FW_DIR(epel, 4, w, depth, opt)
141 #define FW_EPEL_HV(w, depth, opt) FW_DIR_HV(epel, 4, w, depth, opt)
142 #define FW_EPEL_FUNCS(w, depth, opt) \
143  FW_EPEL(w, depth, opt) \
144  FW_EPEL_HV(w, depth, opt)
145 
146 FW_EPEL(12, 8, sse4)
147 
148 FW_EPEL_FUNCS(4, 8, sse4)
149 FW_EPEL_FUNCS(6, 8, sse4)
150 FW_EPEL_FUNCS(8, 8, sse4)
151 FW_EPEL_FUNCS(16, 8, sse4)
152 FW_EPEL_FUNCS(4, 10, sse4)
153 FW_EPEL_FUNCS(6, 10, sse4)
154 FW_EPEL_FUNCS(8, 10, sse4)
155 FW_EPEL_FUNCS(4, 12, sse4)
156 FW_EPEL_FUNCS(6, 12, sse4)
157 FW_EPEL_FUNCS(8, 12, sse4)
158 
159 #define FW_QPEL(w, depth, opt) FW_DIR(qpel, 8, w, depth, opt)
160 #define FW_QPEL_HV(w, depth, opt) FW_DIR_HV(qpel, 8, w, depth, opt)
161 #define FW_QPEL_FUNCS(w, depth, opt) \
162  FW_QPEL(w, depth, opt) \
163  FW_QPEL_HV(w, depth, opt)
164 
165 FW_QPEL(12, 8, sse4)
166 FW_QPEL(16, 8, sse4)
167 
168 FW_QPEL_FUNCS(4, 8, sse4)
169 FW_QPEL_FUNCS(8, 8, sse4)
170 FW_QPEL_FUNCS(4, 10, sse4)
171 FW_QPEL_FUNCS(8, 10, sse4)
172 FW_QPEL_FUNCS(4, 12, sse4)
173 FW_QPEL_FUNCS(8, 12, sse4)
174 
175 #if HAVE_AVX2_EXTERNAL
176 
177 FW_PEL(32, 8, avx2)
178 FW_PUT(pel, pel_pixels16, pixels16, 10, avx2)
179 
180 FW_EPEL(32, 8, avx2)
181 FW_EPEL(16, 10, avx2)
182 
183 FW_EPEL_HV(32, 8, avx2)
184 FW_EPEL_HV(16, 10, avx2)
185 
186 FW_QPEL(32, 8, avx2)
187 FW_QPEL(16, 10, avx2)
188 
189 FW_QPEL_HV(16, 10, avx2)
190 
191 #endif
192 #endif
193 
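/* The mc_rep_* macros synthesize wide block sizes from narrower assembly
 * kernels by calling the "step"-wide kernel at successive column offsets,
 * for the plain, uni and bi prediction variants.  For example,
 * mc_rep_funcs(pel_pixels, 8, 16, 64, sse4) builds the 64-wide 8-bit
 * functions out of four calls to the 16-wide sse4 kernels. */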
194 #define mc_rep_func(name, bitd, step, W, opt) \
195 static void hevc_put_##name##W##_##bitd##_##opt(int16_t *_dst, \
196  const uint8_t *_src, ptrdiff_t _srcstride, int height, \
197  intptr_t mx, intptr_t my, int width) \
198 { \
199  int i; \
200  int16_t *dst; \
201  for (i = 0; i < W; i += step) { \
202  const uint8_t *src = _src + (i * ((bitd + 7) / 8)); \
203  dst = _dst + i; \
204  hevc_put_##name##step##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width); \
205  } \
206 }
207 #define mc_rep_uni_func(name, bitd, step, W, opt) \
208 static void hevc_put_uni_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \
209  const uint8_t *_src, ptrdiff_t _srcstride, int height, \
210  intptr_t mx, intptr_t my, int width) \
211 { \
212  int i; \
213  uint8_t *dst; \
214  for (i = 0; i < W; i += step) { \
215  const uint8_t *src = _src + (i * ((bitd + 7) / 8)); \
216  dst = _dst + (i * ((bitd + 7) / 8)); \
217  hevc_put_uni_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, \
218  height, mx, my, width); \
219  } \
220 }
221 #define mc_rep_bi_func(name, bitd, step, W, opt) \
222 static void ff_hevc_put_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, const uint8_t *_src, \
223  ptrdiff_t _srcstride, const int16_t *_src2, \
224  int height, intptr_t mx, intptr_t my, int width) \
225 { \
226  int i; \
227  uint8_t *dst; \
228  for (i = 0; i < W ; i += step) { \
229  const uint8_t *src = _src + (i * ((bitd + 7) / 8)); \
230  const int16_t *src2 = _src2 + i; \
231  dst = _dst + (i * ((bitd + 7) / 8)); \
232  ff_hevc_put_bi_##name##step##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, \
233  height, mx, my, width); \
234  } \
235 }
236 
237 #define mc_rep_funcs(name, bitd, step, W, opt) \
238  mc_rep_func(name, bitd, step, W, opt) \
239  mc_rep_uni_func(name, bitd, step, W, opt) \
240  mc_rep_bi_func(name, bitd, step, W, opt)
241 
242 #define mc_rep_func2(name, bitd, step1, step2, W, opt) \
243 static void hevc_put_##name##W##_##bitd##_##opt(int16_t *dst, \
244  const uint8_t *src, ptrdiff_t _srcstride, int height, \
245  intptr_t mx, intptr_t my, int width) \
246 { \
247  hevc_put_##name##step1##_##bitd##_##opt(dst, src, _srcstride, height, mx, my, width); \
248  hevc_put_##name##step2##_##bitd##_##opt(dst + step1, src + (step1 * ((bitd + 7) / 8)), \
249  _srcstride, height, mx, my, width); \
250 }
251 #define mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
252 static void hevc_put_uni_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, \
253  const uint8_t *src, ptrdiff_t _srcstride, int height, \
254  intptr_t mx, intptr_t my, int width) \
255 { \
256  hevc_put_uni_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, height, mx, my, width); \
257  hevc_put_uni_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride, \
258  src + (step1 * ((bitd + 7) / 8)), _srcstride, \
259  height, mx, my, width); \
260 }
261 #define mc_rep_bi_func2(name, bitd, step1, step2, W, opt) \
262 static void ff_hevc_put_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \
263  ptrdiff_t _srcstride, const int16_t *src2, \
264  int height, intptr_t mx, intptr_t my, int width) \
265 { \
266  ff_hevc_put_bi_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, height, mx, my, width);\
267  ff_hevc_put_bi_##name##step2##_##bitd##_##opt(dst + (step1 * ((bitd + 7) / 8)), dststride, \
268  src + (step1 * ((bitd + 7) / 8)), _srcstride, \
269  src2 + step1, height, mx, my, width); \
270 }
271 
272 #define mc_rep_funcs2(name, bitd, step1, step2, W, opt) \
273  mc_rep_func2(name, bitd, step1, step2, W, opt) \
274  mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
275  mc_rep_bi_func2(name, bitd, step1, step2, W, opt)
276 
277 #if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
278 
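/* mc_rep_mix_* build the "odd" widths (24, 48) by pairing two kernels of
 * different widths, possibly from different instruction sets (e.g. an avx2
 * kernel for the first columns plus an sse4 kernel for the rest).  For the
 * 10-bit variants, width4 is the byte offset of the second part
 * (2 bytes per sample). */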
279 #define mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
280 static void hevc_put_##name##width1##_10_##opt1(int16_t *dst, const uint8_t *src, ptrdiff_t _srcstride, \
281  int height, intptr_t mx, intptr_t my, int width) \
282  \
283 { \
284  hevc_put_##name##width2##_10_##opt1(dst, src, _srcstride, height, mx, my, width); \
285  hevc_put_##name##width3##_10_##opt2(dst+ width2, src+ width4, _srcstride, height, mx, my, width); \
286 }
287 
288 #define mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
289 static void ff_hevc_put_bi_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \
290  ptrdiff_t _srcstride, const int16_t *src2, \
291  int height, intptr_t mx, intptr_t my, int width) \
292 { \
293  ff_hevc_put_bi_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, src2, \
294  height, mx, my, width); \
295  ff_hevc_put_bi_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, src2+width2, \
296  height, mx, my, width); \
297 }
298 
299 #define mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
300 static void hevc_put_uni_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, \
301  const uint8_t *src, ptrdiff_t _srcstride, int height, \
302  intptr_t mx, intptr_t my, int width) \
303 { \
304  hevc_put_uni_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, \
305  height, mx, my, width); \
306  hevc_put_uni_##name##width3##_10_##opt2(dst+width4, dststride, src+width4, _srcstride, \
307  height, mx, my, width); \
308 }
309 
310 #define mc_rep_mixs_10(name, width1, width2, width3, opt1, opt2, width4) \
311 mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
312 mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
313 mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)
314 
315 #define mc_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
316 static void hevc_put_##name##width1##_8_##opt1(int16_t *dst, const uint8_t *src, ptrdiff_t _srcstride, \
317  int height, intptr_t mx, intptr_t my, int width) \
318  \
319 { \
320  hevc_put_##name##width2##_8_##opt1(dst, src, _srcstride, height, mx, my, width); \
321  hevc_put_##name##width3##_8_##opt2(dst+ width2, src+ width2, _srcstride, height, mx, my, width); \
322 }
323 
324 #define mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
325 static void ff_hevc_put_bi_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, const uint8_t *src, \
326  ptrdiff_t _srcstride, const int16_t *src2, \
327  int height, intptr_t mx, intptr_t my, int width) \
328 { \
329  ff_hevc_put_bi_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
330  src2, height, mx, my, width); \
331  ff_hevc_put_bi_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, \
332  src2+width2, height, mx, my, width); \
333 }
334 
335 #define mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
336 static void hevc_put_uni_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, \
337  const uint8_t *src, ptrdiff_t _srcstride, int height, \
338  intptr_t mx, intptr_t my, int width) \
339 { \
340  hevc_put_uni_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
341  height, mx, my, width); \
342  hevc_put_uni_##name##width3##_8_##opt2(dst+width2, dststride, src+width2, _srcstride, \
343  height, mx, my, width); \
344 }
345 
346 #define mc_rep_mixs_8(name, width1, width2, width3, opt1, opt2) \
347 mc_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
348 mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
349 mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)
350 
351 #if HAVE_AVX2_EXTERNAL
352 
353 mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4)
354 mc_rep_mixs_8(epel_hv, 48, 32, 16, avx2, sse4)
355 mc_rep_mixs_8(epel_h , 48, 32, 16, avx2, sse4)
356 mc_rep_mixs_8(epel_v , 48, 32, 16, avx2, sse4)
357 
358 mc_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32)
359 mc_bi_rep_mix_10(pel_pixels,24, 16, 8, avx2, sse4, 32)
360 mc_rep_mixs_10(epel_hv, 24, 16, 8, avx2, sse4, 32)
361 mc_rep_mixs_10(epel_h , 24, 16, 8, avx2, sse4, 32)
362 mc_rep_mixs_10(epel_v , 24, 16, 8, avx2, sse4, 32)
363 
364 
365 mc_rep_mixs_10(qpel_h , 24, 16, 8, avx2, sse4, 32)
366 mc_rep_mixs_10(qpel_v , 24, 16, 8, avx2, sse4, 32)
367 mc_rep_mixs_10(qpel_hv, 24, 16, 8, avx2, sse4, 32)
368 
369 
370 mc_rep_funcs(pel_pixels, 8, 32, 64, avx2)
371 
372 mc_rep_uni_func(pel_pixels, 8, 64, 128, avx2)//used for 10bit
373 mc_rep_uni_func(pel_pixels, 8, 32, 96, avx2) //used for 10bit
374 
375 mc_rep_func(pel_pixels, 10, 16, 32, avx2)
376 mc_rep_func(pel_pixels, 10, 16, 48, avx2)
377 mc_rep_func(pel_pixels, 10, 32, 64, avx2)
378 
379 mc_rep_bi_func(pel_pixels, 10, 16, 32, avx2)
380 mc_rep_bi_func(pel_pixels, 10, 16, 48, avx2)
381 mc_rep_bi_func(pel_pixels, 10, 32, 64, avx2)
382 
383 mc_rep_funcs(epel_h, 8, 32, 64, avx2)
384 
385 mc_rep_funcs(epel_v, 8, 32, 64, avx2)
386 
387 mc_rep_funcs(epel_h, 10, 16, 32, avx2)
388 mc_rep_funcs(epel_h, 10, 16, 48, avx2)
389 mc_rep_funcs(epel_h, 10, 32, 64, avx2)
390 
391 mc_rep_funcs(epel_v, 10, 16, 32, avx2)
392 mc_rep_funcs(epel_v, 10, 16, 48, avx2)
393 mc_rep_funcs(epel_v, 10, 32, 64, avx2)
394 
395 
396 mc_rep_funcs(epel_hv, 8, 32, 64, avx2)
397 
398 mc_rep_funcs(epel_hv, 10, 16, 32, avx2)
399 mc_rep_funcs(epel_hv, 10, 16, 48, avx2)
400 mc_rep_funcs(epel_hv, 10, 32, 64, avx2)
401 
402 mc_rep_funcs(qpel_h, 8, 32, 64, avx2)
403 mc_rep_mixs_8(qpel_h , 48, 32, 16, avx2, sse4)
404 
405 mc_rep_funcs(qpel_v, 8, 32, 64, avx2)
406 mc_rep_mixs_8(qpel_v, 48, 32, 16, avx2, sse4)
407 
408 mc_rep_funcs(qpel_h, 10, 16, 32, avx2)
409 mc_rep_funcs(qpel_h, 10, 16, 48, avx2)
410 mc_rep_funcs(qpel_h, 10, 32, 64, avx2)
411 
412 mc_rep_funcs(qpel_v, 10, 16, 32, avx2)
413 mc_rep_funcs(qpel_v, 10, 16, 48, avx2)
414 mc_rep_funcs(qpel_v, 10, 32, 64, avx2)
415 
416 mc_rep_funcs(qpel_hv, 10, 16, 32, avx2)
417 mc_rep_funcs(qpel_hv, 10, 16, 48, avx2)
418 mc_rep_funcs(qpel_hv, 10, 32, 64, avx2)
419 
420 #endif //AVX2
421 
422 mc_rep_funcs(pel_pixels, 8, 16, 64, sse4)
423 mc_rep_funcs(pel_pixels, 8, 16, 48, sse4)
424 mc_rep_funcs(pel_pixels, 8, 16, 32, sse4)
425 mc_rep_funcs(pel_pixels, 8, 8, 24, sse4)
426 mc_rep_funcs(pel_pixels,10, 8, 64, sse4)
427 mc_rep_funcs(pel_pixels,10, 8, 48, sse4)
428 mc_rep_funcs(pel_pixels,10, 8, 32, sse4)
429 mc_rep_funcs(pel_pixels,10, 8, 24, sse4)
430 mc_rep_funcs(pel_pixels,10, 8, 16, sse4)
431 mc_rep_funcs(pel_pixels,10, 4, 12, sse4)
432 mc_rep_funcs(pel_pixels,12, 8, 64, sse4)
433 mc_rep_funcs(pel_pixels,12, 8, 48, sse4)
434 mc_rep_funcs(pel_pixels,12, 8, 32, sse4)
435 mc_rep_funcs(pel_pixels,12, 8, 24, sse4)
436 mc_rep_funcs(pel_pixels,12, 8, 16, sse4)
437 mc_rep_funcs(pel_pixels,12, 4, 12, sse4)
438 
439 mc_rep_funcs(epel_h, 8, 16, 64, sse4)
440 mc_rep_funcs(epel_h, 8, 16, 48, sse4)
441 mc_rep_funcs(epel_h, 8, 16, 32, sse4)
442 mc_rep_funcs(epel_h, 8, 8, 24, sse4)
443 mc_rep_funcs(epel_h,10, 8, 64, sse4)
444 mc_rep_funcs(epel_h,10, 8, 48, sse4)
445 mc_rep_funcs(epel_h,10, 8, 32, sse4)
446 mc_rep_funcs(epel_h,10, 8, 24, sse4)
447 mc_rep_funcs(epel_h,10, 8, 16, sse4)
448 mc_rep_funcs(epel_h,10, 4, 12, sse4)
449 mc_rep_funcs(epel_h,12, 8, 64, sse4)
450 mc_rep_funcs(epel_h,12, 8, 48, sse4)
451 mc_rep_funcs(epel_h,12, 8, 32, sse4)
452 mc_rep_funcs(epel_h,12, 8, 24, sse4)
453 mc_rep_funcs(epel_h,12, 8, 16, sse4)
454 mc_rep_funcs(epel_h,12, 4, 12, sse4)
455 mc_rep_funcs(epel_v, 8, 16, 64, sse4)
456 mc_rep_funcs(epel_v, 8, 16, 48, sse4)
457 mc_rep_funcs(epel_v, 8, 16, 32, sse4)
458 mc_rep_funcs(epel_v, 8, 8, 24, sse4)
459 mc_rep_funcs(epel_v,10, 8, 64, sse4)
460 mc_rep_funcs(epel_v,10, 8, 48, sse4)
461 mc_rep_funcs(epel_v,10, 8, 32, sse4)
462 mc_rep_funcs(epel_v,10, 8, 24, sse4)
463 mc_rep_funcs(epel_v,10, 8, 16, sse4)
464 mc_rep_funcs(epel_v,10, 4, 12, sse4)
465 mc_rep_funcs(epel_v,12, 8, 64, sse4)
466 mc_rep_funcs(epel_v,12, 8, 48, sse4)
467 mc_rep_funcs(epel_v,12, 8, 32, sse4)
468 mc_rep_funcs(epel_v,12, 8, 24, sse4)
469 mc_rep_funcs(epel_v,12, 8, 16, sse4)
470 mc_rep_funcs(epel_v,12, 4, 12, sse4)
471 mc_rep_funcs(epel_hv, 8, 16, 64, sse4)
472 mc_rep_funcs(epel_hv, 8, 16, 48, sse4)
473 mc_rep_funcs(epel_hv, 8, 16, 32, sse4)
474 mc_rep_funcs(epel_hv, 8, 8, 24, sse4)
475 mc_rep_funcs2(epel_hv,8, 8, 4, 12, sse4)
476 mc_rep_funcs(epel_hv,10, 8, 64, sse4)
477 mc_rep_funcs(epel_hv,10, 8, 48, sse4)
478 mc_rep_funcs(epel_hv,10, 8, 32, sse4)
479 mc_rep_funcs(epel_hv,10, 8, 24, sse4)
480 mc_rep_funcs(epel_hv,10, 8, 16, sse4)
481 mc_rep_funcs(epel_hv,10, 4, 12, sse4)
482 mc_rep_funcs(epel_hv,12, 8, 64, sse4)
483 mc_rep_funcs(epel_hv,12, 8, 48, sse4)
484 mc_rep_funcs(epel_hv,12, 8, 32, sse4)
485 mc_rep_funcs(epel_hv,12, 8, 24, sse4)
486 mc_rep_funcs(epel_hv,12, 8, 16, sse4)
487 mc_rep_funcs(epel_hv,12, 4, 12, sse4)
488 
489 mc_rep_funcs(qpel_h, 8, 16, 64, sse4)
490 mc_rep_funcs(qpel_h, 8, 16, 48, sse4)
491 mc_rep_funcs(qpel_h, 8, 16, 32, sse4)
492 mc_rep_funcs(qpel_h, 8, 8, 24, sse4)
493 mc_rep_funcs(qpel_h,10, 8, 64, sse4)
494 mc_rep_funcs(qpel_h,10, 8, 48, sse4)
495 mc_rep_funcs(qpel_h,10, 8, 32, sse4)
496 mc_rep_funcs(qpel_h,10, 8, 24, sse4)
497 mc_rep_funcs(qpel_h,10, 8, 16, sse4)
498 mc_rep_funcs(qpel_h,10, 4, 12, sse4)
499 mc_rep_funcs(qpel_h,12, 8, 64, sse4)
500 mc_rep_funcs(qpel_h,12, 8, 48, sse4)
501 mc_rep_funcs(qpel_h,12, 8, 32, sse4)
502 mc_rep_funcs(qpel_h,12, 8, 24, sse4)
503 mc_rep_funcs(qpel_h,12, 8, 16, sse4)
504 mc_rep_funcs(qpel_h,12, 4, 12, sse4)
505 mc_rep_funcs(qpel_v, 8, 16, 64, sse4)
506 mc_rep_funcs(qpel_v, 8, 16, 48, sse4)
507 mc_rep_funcs(qpel_v, 8, 16, 32, sse4)
508 mc_rep_funcs(qpel_v, 8, 8, 24, sse4)
509 mc_rep_funcs(qpel_v,10, 8, 64, sse4)
510 mc_rep_funcs(qpel_v,10, 8, 48, sse4)
511 mc_rep_funcs(qpel_v,10, 8, 32, sse4)
512 mc_rep_funcs(qpel_v,10, 8, 24, sse4)
513 mc_rep_funcs(qpel_v,10, 8, 16, sse4)
514 mc_rep_funcs(qpel_v,10, 4, 12, sse4)
515 mc_rep_funcs(qpel_v,12, 8, 64, sse4)
516 mc_rep_funcs(qpel_v,12, 8, 48, sse4)
517 mc_rep_funcs(qpel_v,12, 8, 32, sse4)
518 mc_rep_funcs(qpel_v,12, 8, 24, sse4)
519 mc_rep_funcs(qpel_v,12, 8, 16, sse4)
520 mc_rep_funcs(qpel_v,12, 4, 12, sse4)
521 mc_rep_funcs(qpel_hv, 8, 8, 64, sse4)
522 mc_rep_funcs(qpel_hv, 8, 8, 48, sse4)
523 mc_rep_funcs(qpel_hv, 8, 8, 32, sse4)
524 mc_rep_funcs(qpel_hv, 8, 8, 24, sse4)
525 mc_rep_funcs(qpel_hv, 8, 8, 16, sse4)
526 mc_rep_funcs2(qpel_hv,8, 8, 4, 12, sse4)
527 mc_rep_funcs(qpel_hv,10, 8, 64, sse4)
528 mc_rep_funcs(qpel_hv,10, 8, 48, sse4)
529 mc_rep_funcs(qpel_hv,10, 8, 32, sse4)
530 mc_rep_funcs(qpel_hv,10, 8, 24, sse4)
531 mc_rep_funcs(qpel_hv,10, 8, 16, sse4)
532 mc_rep_funcs(qpel_hv,10, 4, 12, sse4)
533 mc_rep_funcs(qpel_hv,12, 8, 64, sse4)
534 mc_rep_funcs(qpel_hv,12, 8, 48, sse4)
535 mc_rep_funcs(qpel_hv,12, 8, 32, sse4)
536 mc_rep_funcs(qpel_hv,12, 8, 24, sse4)
537 mc_rep_funcs(qpel_hv,12, 8, 16, sse4)
538 mc_rep_funcs(qpel_hv,12, 4, 12, sse4)
539 
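/* The same column splitting for the weighted-prediction output stages: the
 * W-wide uni/bi weighted stores are assembled from repeated "step"-wide
 * assembly calls reading from the int16_t intermediate buffer. */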
540 #define mc_rep_uni_w(bitd, step, W, opt) \
541 void ff_hevc_put_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, const int16_t *_src, \
542  int height, int denom, int _wx, int _ox) \
543 { \
544  int i; \
545  uint8_t *dst; \
546  for (i = 0; i < W; i += step) { \
547  const int16_t *src = _src + i; \
548  dst= _dst + (i * ((bitd + 7) / 8)); \
549  ff_hevc_put_uni_w##step##_##bitd##_##opt(dst, dststride, src, \
550  height, denom, _wx, _ox); \
551  } \
552 }
553 
554 mc_rep_uni_w(8, 6, 12, sse4)
555 mc_rep_uni_w(8, 8, 16, sse4)
556 mc_rep_uni_w(8, 8, 24, sse4)
557 mc_rep_uni_w(8, 8, 32, sse4)
558 mc_rep_uni_w(8, 8, 48, sse4)
559 mc_rep_uni_w(8, 8, 64, sse4)
560 
561 mc_rep_uni_w(10, 6, 12, sse4)
562 mc_rep_uni_w(10, 8, 16, sse4)
563 mc_rep_uni_w(10, 8, 24, sse4)
564 mc_rep_uni_w(10, 8, 32, sse4)
565 mc_rep_uni_w(10, 8, 48, sse4)
566 mc_rep_uni_w(10, 8, 64, sse4)
567 
568 mc_rep_uni_w(12, 6, 12, sse4)
569 mc_rep_uni_w(12, 8, 16, sse4)
570 mc_rep_uni_w(12, 8, 24, sse4)
571 mc_rep_uni_w(12, 8, 32, sse4)
572 mc_rep_uni_w(12, 8, 48, sse4)
573 mc_rep_uni_w(12, 8, 64, sse4)
574 
575 #define mc_rep_bi_w(bitd, step, W, opt) \
576 void ff_hevc_put_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, const int16_t *_src, \
577  const int16_t *_src2, int height, \
578  int denom, int _wx0, int _wx1, int _ox0, int _ox1) \
579 { \
580  int i; \
581  uint8_t *dst; \
582  for (i = 0; i < W; i += step) { \
583  const int16_t *src = _src + i; \
584  const int16_t *src2 = _src2 + i; \
585  dst = _dst + (i * ((bitd + 7) / 8)); \
586  ff_hevc_put_bi_w##step##_##bitd##_##opt(dst, dststride, src, src2, \
587  height, denom, _wx0, _wx1, _ox0, _ox1); \
588  } \
589 }
590 
591 mc_rep_bi_w(8, 6, 12, sse4)
592 mc_rep_bi_w(8, 8, 16, sse4)
593 mc_rep_bi_w(8, 8, 24, sse4)
594 mc_rep_bi_w(8, 8, 32, sse4)
595 mc_rep_bi_w(8, 8, 48, sse4)
596 mc_rep_bi_w(8, 8, 64, sse4)
597 
598 mc_rep_bi_w(10, 6, 12, sse4)
599 mc_rep_bi_w(10, 8, 16, sse4)
600 mc_rep_bi_w(10, 8, 24, sse4)
601 mc_rep_bi_w(10, 8, 32, sse4)
602 mc_rep_bi_w(10, 8, 48, sse4)
603 mc_rep_bi_w(10, 8, 64, sse4)
604 
605 mc_rep_bi_w(12, 6, 12, sse4)
606 mc_rep_bi_w(12, 8, 16, sse4)
607 mc_rep_bi_w(12, 8, 24, sse4)
608 mc_rep_bi_w(12, 8, 32, sse4)
609 mc_rep_bi_w(12, 8, 48, sse4)
610 mc_rep_bi_w(12, 8, 64, sse4)
611 
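/* mc_uni_w_func/mc_bi_w_func chain a full prediction: interpolate into a
 * stack-local int16_t buffer, then run the weighted uni/bi output stage. */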
612 #define mc_uni_w_func(name, bitd, W, opt) \
613 static void hevc_put_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \
614  const uint8_t *_src, ptrdiff_t _srcstride, \
615  int height, int denom, \
616  int _wx, int _ox, \
617  intptr_t mx, intptr_t my, int width) \
618 { \
619  LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); \
620  hevc_put_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width); \
621  ff_hevc_put_uni_w##W##_##bitd##_##opt(_dst, _dststride, temp, height, denom, _wx, _ox); \
622 }
623 
624 #define mc_uni_w_funcs(name, bitd, opt) \
625  mc_uni_w_func(name, bitd, 4, opt) \
626  mc_uni_w_func(name, bitd, 8, opt) \
627  mc_uni_w_func(name, bitd, 12, opt) \
628  mc_uni_w_func(name, bitd, 16, opt) \
629  mc_uni_w_func(name, bitd, 24, opt) \
630  mc_uni_w_func(name, bitd, 32, opt) \
631  mc_uni_w_func(name, bitd, 48, opt) \
632  mc_uni_w_func(name, bitd, 64, opt)
633 
634 mc_uni_w_funcs(pel_pixels, 8, sse4)
635 mc_uni_w_func(pel_pixels, 8, 6, sse4)
636 mc_uni_w_funcs(epel_h, 8, sse4)
637 mc_uni_w_func(epel_h, 8, 6, sse4)
638 mc_uni_w_funcs(epel_v, 8, sse4)
639 mc_uni_w_func(epel_v, 8, 6, sse4)
640 mc_uni_w_funcs(epel_hv, 8, sse4)
641 mc_uni_w_func(epel_hv, 8, 6, sse4)
642 mc_uni_w_funcs(qpel_h, 8, sse4)
643 mc_uni_w_funcs(qpel_v, 8, sse4)
644 mc_uni_w_funcs(qpel_hv, 8, sse4)
645 
646 mc_uni_w_funcs(pel_pixels, 10, sse4)
647 mc_uni_w_func(pel_pixels, 10, 6, sse4)
648 mc_uni_w_funcs(epel_h, 10, sse4)
649 mc_uni_w_func(epel_h, 10, 6, sse4)
650 mc_uni_w_funcs(epel_v, 10, sse4)
651 mc_uni_w_func(epel_v, 10, 6, sse4)
652 mc_uni_w_funcs(epel_hv, 10, sse4)
653 mc_uni_w_func(epel_hv, 10, 6, sse4)
654 mc_uni_w_funcs(qpel_h, 10, sse4)
655 mc_uni_w_funcs(qpel_v, 10, sse4)
656 mc_uni_w_funcs(qpel_hv, 10, sse4)
657 
658 mc_uni_w_funcs(pel_pixels, 12, sse4)
659 mc_uni_w_func(pel_pixels, 12, 6, sse4)
660 mc_uni_w_funcs(epel_h, 12, sse4)
661 mc_uni_w_func(epel_h, 12, 6, sse4)
662 mc_uni_w_funcs(epel_v, 12, sse4)
663 mc_uni_w_func(epel_v, 12, 6, sse4)
664 mc_uni_w_funcs(epel_hv, 12, sse4)
665 mc_uni_w_func(epel_hv, 12, 6, sse4)
666 mc_uni_w_funcs(qpel_h, 12, sse4)
667 mc_uni_w_funcs(qpel_v, 12, sse4)
668 mc_uni_w_funcs(qpel_hv, 12, sse4)
669 
670 #define mc_bi_w_func(name, bitd, W, opt) \
671 static void hevc_put_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \
672  const uint8_t *_src, ptrdiff_t _srcstride, \
673  const int16_t *_src2, \
674  int height, int denom, \
675  int _wx0, int _wx1, int _ox0, int _ox1, \
676  intptr_t mx, intptr_t my, int width) \
677 { \
678  LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); \
679  hevc_put_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width); \
680  ff_hevc_put_bi_w##W##_##bitd##_##opt(_dst, _dststride, temp, _src2, \
681  height, denom, _wx0, _wx1, _ox0, _ox1); \
682 }
683 
684 #define mc_bi_w_funcs(name, bitd, opt) \
685  mc_bi_w_func(name, bitd, 4, opt) \
686  mc_bi_w_func(name, bitd, 8, opt) \
687  mc_bi_w_func(name, bitd, 12, opt) \
688  mc_bi_w_func(name, bitd, 16, opt) \
689  mc_bi_w_func(name, bitd, 24, opt) \
690  mc_bi_w_func(name, bitd, 32, opt) \
691  mc_bi_w_func(name, bitd, 48, opt) \
692  mc_bi_w_func(name, bitd, 64, opt)
693 
694 mc_bi_w_funcs(pel_pixels, 8, sse4)
695 mc_bi_w_func(pel_pixels, 8, 6, sse4)
696 mc_bi_w_funcs(epel_h, 8, sse4)
697 mc_bi_w_func(epel_h, 8, 6, sse4)
698 mc_bi_w_funcs(epel_v, 8, sse4)
699 mc_bi_w_func(epel_v, 8, 6, sse4)
700 mc_bi_w_funcs(epel_hv, 8, sse4)
701 mc_bi_w_func(epel_hv, 8, 6, sse4)
702 mc_bi_w_funcs(qpel_h, 8, sse4)
703 mc_bi_w_funcs(qpel_v, 8, sse4)
704 mc_bi_w_funcs(qpel_hv, 8, sse4)
705 
706 mc_bi_w_funcs(pel_pixels, 10, sse4)
707 mc_bi_w_func(pel_pixels, 10, 6, sse4)
708 mc_bi_w_funcs(epel_h, 10, sse4)
709 mc_bi_w_func(epel_h, 10, 6, sse4)
710 mc_bi_w_funcs(epel_v, 10, sse4)
711 mc_bi_w_func(epel_v, 10, 6, sse4)
712 mc_bi_w_funcs(epel_hv, 10, sse4)
713 mc_bi_w_func(epel_hv, 10, 6, sse4)
714 mc_bi_w_funcs(qpel_h, 10, sse4)
715 mc_bi_w_funcs(qpel_v, 10, sse4)
716 mc_bi_w_funcs(qpel_hv, 10, sse4)
717 
718 mc_bi_w_funcs(pel_pixels, 12, sse4)
719 mc_bi_w_func(pel_pixels, 12, 6, sse4)
720 mc_bi_w_funcs(epel_h, 12, sse4)
721 mc_bi_w_func(epel_h, 12, 6, sse4)
722 mc_bi_w_funcs(epel_v, 12, sse4)
723 mc_bi_w_func(epel_v, 12, 6, sse4)
724 mc_bi_w_funcs(epel_hv, 12, sse4)
725 mc_bi_w_func(epel_hv, 12, 6, sse4)
726 mc_bi_w_funcs(qpel_h, 12, sse4)
727 mc_bi_w_funcs(qpel_v, 12, sse4)
728 mc_bi_w_funcs(qpel_hv, 12, sse4)
729 #endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL
730 
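/* SAO: prototypes for the band/edge filter kernels and the init helpers that
 * fill the c->sao_band_filter[] / c->sao_edge_filter[] tables, indexed by
 * block width (8, 16, 32, 48, 64). */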
731 #define SAO_BAND_FILTER_FUNCS(bitd, opt) \
732 void ff_hevc_sao_band_filter_8_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
733  const int16_t *sao_offset_val, int sao_left_class, int width, int height); \
734 void ff_hevc_sao_band_filter_16_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
735  const int16_t *sao_offset_val, int sao_left_class, int width, int height); \
736 void ff_hevc_sao_band_filter_32_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
737  const int16_t *sao_offset_val, int sao_left_class, int width, int height); \
738 void ff_hevc_sao_band_filter_48_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
739  const int16_t *sao_offset_val, int sao_left_class, int width, int height); \
740 void ff_hevc_sao_band_filter_64_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
741  const int16_t *sao_offset_val, int sao_left_class, int width, int height);
742 
743 SAO_BAND_FILTER_FUNCS(8, sse2)
744 SAO_BAND_FILTER_FUNCS(10, sse2)
745 SAO_BAND_FILTER_FUNCS(12, sse2)
746 SAO_BAND_FILTER_FUNCS(8, avx)
747 SAO_BAND_FILTER_FUNCS(10, avx)
748 SAO_BAND_FILTER_FUNCS(12, avx)
749 SAO_BAND_FILTER_FUNCS(8, avx2)
750 SAO_BAND_FILTER_FUNCS(10, avx2)
751 SAO_BAND_FILTER_FUNCS(12, avx2)
752 
753 #define SAO_BAND_INIT(bitd, opt) do { \
754  c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_##bitd##_##opt; \
755  c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_##bitd##_##opt; \
756  c->sao_band_filter[2] = ff_hevc_sao_band_filter_32_##bitd##_##opt; \
757  c->sao_band_filter[3] = ff_hevc_sao_band_filter_48_##bitd##_##opt; \
758  c->sao_band_filter[4] = ff_hevc_sao_band_filter_64_##bitd##_##opt; \
759 } while (0)
760 
761 #define SAO_EDGE_FILTER_FUNCS(bitd, opt) \
762 void ff_hevc_sao_edge_filter_8_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, \
763  const int16_t *sao_offset_val, int eo, int width, int height); \
764 void ff_hevc_sao_edge_filter_16_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, \
765  const int16_t *sao_offset_val, int eo, int width, int height); \
766 void ff_hevc_sao_edge_filter_32_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, \
767  const int16_t *sao_offset_val, int eo, int width, int height); \
768 void ff_hevc_sao_edge_filter_48_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, \
769  const int16_t *sao_offset_val, int eo, int width, int height); \
770 void ff_hevc_sao_edge_filter_64_##bitd##_##opt(uint8_t *_dst, const uint8_t *_src, ptrdiff_t stride_dst, \
771  const int16_t *sao_offset_val, int eo, int width, int height); \
772 
773 SAO_EDGE_FILTER_FUNCS(8, ssse3)
774 SAO_EDGE_FILTER_FUNCS(8, avx2)
775 SAO_EDGE_FILTER_FUNCS(10, sse2)
776 SAO_EDGE_FILTER_FUNCS(10, avx2)
777 SAO_EDGE_FILTER_FUNCS(12, sse2)
778 SAO_EDGE_FILTER_FUNCS(12, avx2)
779 
780 #define SAO_EDGE_INIT(bitd, opt) do { \
781  c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_8_##bitd##_##opt; \
782  c->sao_edge_filter[1] = ff_hevc_sao_edge_filter_16_##bitd##_##opt; \
783  c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_##bitd##_##opt; \
784  c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_##bitd##_##opt; \
785  c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_##bitd##_##opt; \
786 } while (0)
787 
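/* PEL_LINK wires one block size into the MC function tables:
 * dst[idx1][idx2][idx3], where idx1 is the block-width index and idx2/idx3
 * select vertical/horizontal fractional interpolation; the _bi, _uni,
 * _uni_w and _bi_w tables are filled alongside. */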
788 #define PEL_LINK(dst, idx1, idx2, idx3, name, D, opt) \
789 dst [idx1][idx2][idx3] = hevc_put_ ## name ## _ ## D ## _##opt; \
790 dst ## _bi [idx1][idx2][idx3] = ff_hevc_put_bi_ ## name ## _ ## D ## _##opt; \
791 dst ## _uni [idx1][idx2][idx3] = hevc_put_uni_ ## name ## _ ## D ## _##opt; \
792 dst ## _uni_w[idx1][idx2][idx3] = hevc_put_uni_w_ ## name ## _ ## D ## _##opt; \
793 dst ## _bi_w [idx1][idx2][idx3] = hevc_put_bi_w_ ## name ## _ ## D ## _##opt
794 
795 #define EPEL_LINKS(pointer, my, mx, fname, bitd, opt ) \
796  PEL_LINK(pointer, 1, my , mx , fname##4 , bitd, opt ); \
797  PEL_LINK(pointer, 2, my , mx , fname##6 , bitd, opt ); \
798  PEL_LINK(pointer, 3, my , mx , fname##8 , bitd, opt ); \
799  PEL_LINK(pointer, 4, my , mx , fname##12, bitd, opt ); \
800  PEL_LINK(pointer, 5, my , mx , fname##16, bitd, opt ); \
801  PEL_LINK(pointer, 6, my , mx , fname##24, bitd, opt ); \
802  PEL_LINK(pointer, 7, my , mx , fname##32, bitd, opt ); \
803  PEL_LINK(pointer, 8, my , mx , fname##48, bitd, opt ); \
804  PEL_LINK(pointer, 9, my , mx , fname##64, bitd, opt )
805 #define QPEL_LINKS(pointer, my, mx, fname, bitd, opt) \
806  PEL_LINK(pointer, 1, my , mx , fname##4 , bitd, opt ); \
807  PEL_LINK(pointer, 3, my , mx , fname##8 , bitd, opt ); \
808  PEL_LINK(pointer, 4, my , mx , fname##12, bitd, opt ); \
809  PEL_LINK(pointer, 5, my , mx , fname##16, bitd, opt ); \
810  PEL_LINK(pointer, 6, my , mx , fname##24, bitd, opt ); \
811  PEL_LINK(pointer, 7, my , mx , fname##32, bitd, opt ); \
812  PEL_LINK(pointer, 8, my , mx , fname##48, bitd, opt ); \
813  PEL_LINK(pointer, 9, my , mx , fname##64, bitd, opt )
814 
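/* Runtime dispatch: query the CPU flags once and install the fastest
 * available implementations for the requested bit depth (8, 10 or 12). */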
815 void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
816 {
817  int cpu_flags = av_get_cpu_flags();
818 
819  if (bit_depth == 8) {
820  if (EXTERNAL_MMXEXT(cpu_flags)) {
821  c->add_residual[0] = ff_hevc_add_residual_4_8_mmxext;
822  }
823  if (EXTERNAL_SSE2(cpu_flags)) {
824  c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
825  c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
826 #if ARCH_X86_64
827  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
828  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
829 
830  c->idct[2] = ff_hevc_idct_16x16_8_sse2;
831  c->idct[3] = ff_hevc_idct_32x32_8_sse2;
832 #endif
833  SAO_BAND_INIT(8, sse2);
834 
835  c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_sse2;
836  c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
837  c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
838  c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;
839 
840  c->idct[0] = ff_hevc_idct_4x4_8_sse2;
841  c->idct[1] = ff_hevc_idct_8x8_8_sse2;
842 
843  c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
844  c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
845  c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
846  }
847  if (EXTERNAL_SSSE3(cpu_flags)) {
848 #if ARCH_X86_64
849  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
850  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
851 #endif
852  c->dequant = ff_hevc_dequant_8_ssse3;
853  SAO_EDGE_INIT(8, ssse3);
854  }
855 #if HAVE_SSE4_EXTERNAL && ARCH_X86_64
856  if (EXTERNAL_SSE4(cpu_flags)) {
857 
858  EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 8, sse4);
859  EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 8, sse4);
860  EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 8, sse4);
861  EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 8, sse4);
862 
863  QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
864  QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 8, sse4);
865  QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4);
866  QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4);
867  }
868 #endif
869  if (EXTERNAL_AVX(cpu_flags)) {
870  c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
871  c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
872 #if ARCH_X86_64
873  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
874  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
875 
876  c->idct[2] = ff_hevc_idct_16x16_8_avx;
877  c->idct[3] = ff_hevc_idct_32x32_8_avx;
878 #endif
879  SAO_BAND_INIT(8, avx);
880 
881  c->idct[0] = ff_hevc_idct_4x4_8_avx;
882  c->idct[1] = ff_hevc_idct_8x8_8_avx;
883  }
884  if (EXTERNAL_AVX2(cpu_flags)) {
885  c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
886  c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
887  }
888 #if HAVE_AVX2_EXTERNAL
889  if (EXTERNAL_AVX2_FAST(cpu_flags)) {
890  c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
891  c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
892 
893 #if ARCH_X86_64
894  c->put_hevc_epel[7][0][0] = hevc_put_pel_pixels32_8_avx2;
895  c->put_hevc_epel[8][0][0] = hevc_put_pel_pixels48_8_avx2;
896  c->put_hevc_epel[9][0][0] = hevc_put_pel_pixels64_8_avx2;
897 
898  c->put_hevc_qpel[7][0][0] = hevc_put_pel_pixels32_8_avx2;
899  c->put_hevc_qpel[8][0][0] = hevc_put_pel_pixels48_8_avx2;
900  c->put_hevc_qpel[9][0][0] = hevc_put_pel_pixels64_8_avx2;
901 
902  c->put_hevc_epel_uni[7][0][0] = hevc_put_uni_pel_pixels32_8_avx2;
903  c->put_hevc_epel_uni[8][0][0] = hevc_put_uni_pel_pixels48_8_avx2;
904  c->put_hevc_epel_uni[9][0][0] = hevc_put_uni_pel_pixels64_8_avx2;
905 
906  c->put_hevc_qpel_uni[7][0][0] = hevc_put_uni_pel_pixels32_8_avx2;
907  c->put_hevc_qpel_uni[8][0][0] = hevc_put_uni_pel_pixels48_8_avx2;
908  c->put_hevc_qpel_uni[9][0][0] = hevc_put_uni_pel_pixels64_8_avx2;
909 
910  c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_bi_pel_pixels32_8_avx2;
911  c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_bi_pel_pixels48_8_avx2;
912  c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_bi_pel_pixels64_8_avx2;
913 
914  c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_bi_pel_pixels32_8_avx2;
915  c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_bi_pel_pixels48_8_avx2;
916  c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_bi_pel_pixels64_8_avx2;
917 
918  c->put_hevc_epel[7][0][1] = hevc_put_epel_h32_8_avx2;
919  c->put_hevc_epel[8][0][1] = hevc_put_epel_h48_8_avx2;
920  c->put_hevc_epel[9][0][1] = hevc_put_epel_h64_8_avx2;
921 
922  c->put_hevc_epel_uni[7][0][1] = hevc_put_uni_epel_h32_8_avx2;
923  c->put_hevc_epel_uni[8][0][1] = hevc_put_uni_epel_h48_8_avx2;
924  c->put_hevc_epel_uni[9][0][1] = hevc_put_uni_epel_h64_8_avx2;
925 
926  c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_bi_epel_h32_8_avx2;
927  c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_bi_epel_h48_8_avx2;
928  c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_bi_epel_h64_8_avx2;
929 
930  c->put_hevc_epel[7][1][0] = hevc_put_epel_v32_8_avx2;
931  c->put_hevc_epel[8][1][0] = hevc_put_epel_v48_8_avx2;
932  c->put_hevc_epel[9][1][0] = hevc_put_epel_v64_8_avx2;
933 
934  c->put_hevc_epel_uni[7][1][0] = hevc_put_uni_epel_v32_8_avx2;
935  c->put_hevc_epel_uni[8][1][0] = hevc_put_uni_epel_v48_8_avx2;
936  c->put_hevc_epel_uni[9][1][0] = hevc_put_uni_epel_v64_8_avx2;
937 
938  c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_bi_epel_v32_8_avx2;
939  c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_bi_epel_v48_8_avx2;
940  c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_bi_epel_v64_8_avx2;
941 
942  c->put_hevc_epel[7][1][1] = hevc_put_epel_hv32_8_avx2;
943  c->put_hevc_epel[8][1][1] = hevc_put_epel_hv48_8_avx2;
944  c->put_hevc_epel[9][1][1] = hevc_put_epel_hv64_8_avx2;
945 
946  c->put_hevc_epel_uni[7][1][1] = hevc_put_uni_epel_hv32_8_avx2;
947  c->put_hevc_epel_uni[8][1][1] = hevc_put_uni_epel_hv48_8_avx2;
948  c->put_hevc_epel_uni[9][1][1] = hevc_put_uni_epel_hv64_8_avx2;
949 
950  c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_bi_epel_hv32_8_avx2;
951  c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_bi_epel_hv48_8_avx2;
952  c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_bi_epel_hv64_8_avx2;
953 
954  c->put_hevc_qpel[7][0][1] = hevc_put_qpel_h32_8_avx2;
955  c->put_hevc_qpel[8][0][1] = hevc_put_qpel_h48_8_avx2;
956  c->put_hevc_qpel[9][0][1] = hevc_put_qpel_h64_8_avx2;
957 
958  c->put_hevc_qpel[7][1][0] = hevc_put_qpel_v32_8_avx2;
959  c->put_hevc_qpel[8][1][0] = hevc_put_qpel_v48_8_avx2;
960  c->put_hevc_qpel[9][1][0] = hevc_put_qpel_v64_8_avx2;
961 
962  c->put_hevc_qpel_uni[7][0][1] = hevc_put_uni_qpel_h32_8_avx2;
963  c->put_hevc_qpel_uni[8][0][1] = hevc_put_uni_qpel_h48_8_avx2;
964  c->put_hevc_qpel_uni[9][0][1] = hevc_put_uni_qpel_h64_8_avx2;
965 
966  c->put_hevc_qpel_uni[7][1][0] = hevc_put_uni_qpel_v32_8_avx2;
967  c->put_hevc_qpel_uni[8][1][0] = hevc_put_uni_qpel_v48_8_avx2;
968  c->put_hevc_qpel_uni[9][1][0] = hevc_put_uni_qpel_v64_8_avx2;
969 
970  c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_bi_qpel_h32_8_avx2;
971  c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_bi_qpel_h48_8_avx2;
972  c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_bi_qpel_h64_8_avx2;
973 
974  c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_bi_qpel_v32_8_avx2;
975  c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_bi_qpel_v48_8_avx2;
976  c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_bi_qpel_v64_8_avx2;
977 #endif /* ARCH_X86_64 */
978 
979  SAO_BAND_INIT(8, avx2);
980 
981  c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
982  c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
983  c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
984 
985  c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
986  }
987 #endif /* HAVE_AVX2_EXTERNAL */
988 #if ARCH_X86_64
989  if (EXTERNAL_AVX512ICL(cpu_flags)) {
990  c->put_hevc_qpel[1][0][1] = ff_hevc_put_qpel_h4_8_avx512icl;
991  c->put_hevc_qpel[3][0][1] = ff_hevc_put_qpel_h8_8_avx512icl;
992  c->put_hevc_qpel[5][0][1] = ff_hevc_put_qpel_h16_8_avx512icl;
993  c->put_hevc_qpel[7][0][1] = ff_hevc_put_qpel_h32_8_avx512icl;
994  c->put_hevc_qpel[9][0][1] = ff_hevc_put_qpel_h64_8_avx512icl;
995  c->put_hevc_qpel[3][1][1] = ff_hevc_put_qpel_hv8_8_avx512icl;
996  }
997 #endif
998  } else if (bit_depth == 10) {
999  if (EXTERNAL_MMXEXT(cpu_flags)) {
1000  c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
1001  }
1002  if (EXTERNAL_SSE2(cpu_flags)) {
1003  c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
1004  c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
1005 #if ARCH_X86_64
1006  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
1007  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
1008 
1009  c->idct[2] = ff_hevc_idct_16x16_10_sse2;
1010  c->idct[3] = ff_hevc_idct_32x32_10_sse2;
1011 #endif
1012  SAO_BAND_INIT(10, sse2);
1013  SAO_EDGE_INIT(10, sse2);
1014 
1015  c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_sse2;
1016  c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_sse2;
1017  c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2;
1018  c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_sse2;
1019 
1020  c->idct[0] = ff_hevc_idct_4x4_10_sse2;
1021  c->idct[1] = ff_hevc_idct_8x8_10_sse2;
1022 
1023  c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
1024  c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
1025  c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
1026  }
1027 #if ARCH_X86_64
1028  if (EXTERNAL_SSSE3(cpu_flags)) {
1029  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
1030  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
1031  }
1032 #endif
1033 #if HAVE_SSE4_EXTERNAL && ARCH_X86_64
1034  if (EXTERNAL_SSE4(cpu_flags)) {
1035  EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
1036  EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 10, sse4);
1037  EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 10, sse4);
1038  EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 10, sse4);
1039 
1040  QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
1041  QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 10, sse4);
1042  QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 10, sse4);
1043  QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 10, sse4);
1044  }
1045 #endif
1046  if (EXTERNAL_AVX(cpu_flags)) {
1047  c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
1048  c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
1049 #if ARCH_X86_64
1050  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
1051  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
1052 
1053  c->idct[2] = ff_hevc_idct_16x16_10_avx;
1054  c->idct[3] = ff_hevc_idct_32x32_10_avx;
1055 #endif
1056 
1057  c->idct[0] = ff_hevc_idct_4x4_10_avx;
1058  c->idct[1] = ff_hevc_idct_8x8_10_avx;
1059 
1060  SAO_BAND_INIT(10, avx);
1061  }
1062  if (EXTERNAL_AVX2(cpu_flags)) {
1063  c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
1064  }
1065 #if HAVE_AVX2_EXTERNAL
1066  if (EXTERNAL_AVX2_FAST(cpu_flags)) {
1067  c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
1068  c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
1069 
1070 #if ARCH_X86_64
1071  c->put_hevc_epel[5][0][0] = hevc_put_pel_pixels16_10_avx2;
1072  c->put_hevc_epel[6][0][0] = hevc_put_pel_pixels24_10_avx2;
1073  c->put_hevc_epel[7][0][0] = hevc_put_pel_pixels32_10_avx2;
1074  c->put_hevc_epel[8][0][0] = hevc_put_pel_pixels48_10_avx2;
1075  c->put_hevc_epel[9][0][0] = hevc_put_pel_pixels64_10_avx2;
1076 
1077  c->put_hevc_qpel[5][0][0] = hevc_put_pel_pixels16_10_avx2;
1078  c->put_hevc_qpel[6][0][0] = hevc_put_pel_pixels24_10_avx2;
1079  c->put_hevc_qpel[7][0][0] = hevc_put_pel_pixels32_10_avx2;
1080  c->put_hevc_qpel[8][0][0] = hevc_put_pel_pixels48_10_avx2;
1081  c->put_hevc_qpel[9][0][0] = hevc_put_pel_pixels64_10_avx2;
1082 
1083  c->put_hevc_epel_uni[5][0][0] = hevc_put_uni_pel_pixels32_8_avx2;
1084  c->put_hevc_epel_uni[6][0][0] = hevc_put_uni_pel_pixels48_8_avx2;
1085  c->put_hevc_epel_uni[7][0][0] = hevc_put_uni_pel_pixels64_8_avx2;
1086  c->put_hevc_epel_uni[8][0][0] = hevc_put_uni_pel_pixels96_8_avx2;
1087  c->put_hevc_epel_uni[9][0][0] = hevc_put_uni_pel_pixels128_8_avx2;
1088 
1089  c->put_hevc_qpel_uni[5][0][0] = hevc_put_uni_pel_pixels32_8_avx2;
1090  c->put_hevc_qpel_uni[6][0][0] = hevc_put_uni_pel_pixels48_8_avx2;
1091  c->put_hevc_qpel_uni[7][0][0] = hevc_put_uni_pel_pixels64_8_avx2;
1092  c->put_hevc_qpel_uni[8][0][0] = hevc_put_uni_pel_pixels96_8_avx2;
1093  c->put_hevc_qpel_uni[9][0][0] = hevc_put_uni_pel_pixels128_8_avx2;
1094 
1095  c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_bi_pel_pixels16_10_avx2;
1096  c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_bi_pel_pixels24_10_avx2;
1097  c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_bi_pel_pixels32_10_avx2;
1098  c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_bi_pel_pixels48_10_avx2;
1099  c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_bi_pel_pixels64_10_avx2;
1100  c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_bi_pel_pixels16_10_avx2;
1101  c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_bi_pel_pixels24_10_avx2;
1102  c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_bi_pel_pixels32_10_avx2;
1103  c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_bi_pel_pixels48_10_avx2;
1104  c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_bi_pel_pixels64_10_avx2;
1105 
1106  c->put_hevc_epel[5][0][1] = hevc_put_epel_h16_10_avx2;
1107  c->put_hevc_epel[6][0][1] = hevc_put_epel_h24_10_avx2;
1108  c->put_hevc_epel[7][0][1] = hevc_put_epel_h32_10_avx2;
1109  c->put_hevc_epel[8][0][1] = hevc_put_epel_h48_10_avx2;
1110  c->put_hevc_epel[9][0][1] = hevc_put_epel_h64_10_avx2;
1111 
1112  c->put_hevc_epel_uni[5][0][1] = hevc_put_uni_epel_h16_10_avx2;
1113  c->put_hevc_epel_uni[6][0][1] = hevc_put_uni_epel_h24_10_avx2;
1114  c->put_hevc_epel_uni[7][0][1] = hevc_put_uni_epel_h32_10_avx2;
1115  c->put_hevc_epel_uni[8][0][1] = hevc_put_uni_epel_h48_10_avx2;
1116  c->put_hevc_epel_uni[9][0][1] = hevc_put_uni_epel_h64_10_avx2;
1117 
1118  c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_bi_epel_h16_10_avx2;
1119  c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_bi_epel_h24_10_avx2;
1120  c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_bi_epel_h32_10_avx2;
1121  c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_bi_epel_h48_10_avx2;
1122  c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_bi_epel_h64_10_avx2;
1123 
1124  c->put_hevc_epel[5][1][0] = hevc_put_epel_v16_10_avx2;
1125  c->put_hevc_epel[6][1][0] = hevc_put_epel_v24_10_avx2;
1126  c->put_hevc_epel[7][1][0] = hevc_put_epel_v32_10_avx2;
1127  c->put_hevc_epel[8][1][0] = hevc_put_epel_v48_10_avx2;
1128  c->put_hevc_epel[9][1][0] = hevc_put_epel_v64_10_avx2;
1129 
1130  c->put_hevc_epel_uni[5][1][0] = hevc_put_uni_epel_v16_10_avx2;
1131  c->put_hevc_epel_uni[6][1][0] = hevc_put_uni_epel_v24_10_avx2;
1132  c->put_hevc_epel_uni[7][1][0] = hevc_put_uni_epel_v32_10_avx2;
1133  c->put_hevc_epel_uni[8][1][0] = hevc_put_uni_epel_v48_10_avx2;
1134  c->put_hevc_epel_uni[9][1][0] = hevc_put_uni_epel_v64_10_avx2;
1135 
1136  c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_bi_epel_v16_10_avx2;
1137  c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_bi_epel_v24_10_avx2;
1138  c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_bi_epel_v32_10_avx2;
1139  c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_bi_epel_v48_10_avx2;
1140  c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_bi_epel_v64_10_avx2;
1141 
1142  c->put_hevc_epel[5][1][1] = hevc_put_epel_hv16_10_avx2;
1143  c->put_hevc_epel[6][1][1] = hevc_put_epel_hv24_10_avx2;
1144  c->put_hevc_epel[7][1][1] = hevc_put_epel_hv32_10_avx2;
1145  c->put_hevc_epel[8][1][1] = hevc_put_epel_hv48_10_avx2;
1146  c->put_hevc_epel[9][1][1] = hevc_put_epel_hv64_10_avx2;
1147 
1148  c->put_hevc_epel_uni[5][1][1] = hevc_put_uni_epel_hv16_10_avx2;
1149  c->put_hevc_epel_uni[6][1][1] = hevc_put_uni_epel_hv24_10_avx2;
1150  c->put_hevc_epel_uni[7][1][1] = hevc_put_uni_epel_hv32_10_avx2;
1151  c->put_hevc_epel_uni[8][1][1] = hevc_put_uni_epel_hv48_10_avx2;
1152  c->put_hevc_epel_uni[9][1][1] = hevc_put_uni_epel_hv64_10_avx2;
1153 
1154  c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_bi_epel_hv16_10_avx2;
1155  c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_bi_epel_hv24_10_avx2;
1156  c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_bi_epel_hv32_10_avx2;
1157  c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_bi_epel_hv48_10_avx2;
1158  c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_bi_epel_hv64_10_avx2;
1159 
1160  c->put_hevc_qpel[5][0][1] = hevc_put_qpel_h16_10_avx2;
1161  c->put_hevc_qpel[6][0][1] = hevc_put_qpel_h24_10_avx2;
1162  c->put_hevc_qpel[7][0][1] = hevc_put_qpel_h32_10_avx2;
1163  c->put_hevc_qpel[8][0][1] = hevc_put_qpel_h48_10_avx2;
1164  c->put_hevc_qpel[9][0][1] = hevc_put_qpel_h64_10_avx2;
1165 
1166  c->put_hevc_qpel_uni[5][0][1] = hevc_put_uni_qpel_h16_10_avx2;
1167  c->put_hevc_qpel_uni[6][0][1] = hevc_put_uni_qpel_h24_10_avx2;
1168  c->put_hevc_qpel_uni[7][0][1] = hevc_put_uni_qpel_h32_10_avx2;
1169  c->put_hevc_qpel_uni[8][0][1] = hevc_put_uni_qpel_h48_10_avx2;
1170  c->put_hevc_qpel_uni[9][0][1] = hevc_put_uni_qpel_h64_10_avx2;
1171 
1172  c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_bi_qpel_h16_10_avx2;
1173  c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_bi_qpel_h24_10_avx2;
1174  c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_bi_qpel_h32_10_avx2;
1175  c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_bi_qpel_h48_10_avx2;
1176  c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_bi_qpel_h64_10_avx2;
1177 
1178  c->put_hevc_qpel[5][1][0] = hevc_put_qpel_v16_10_avx2;
1179  c->put_hevc_qpel[6][1][0] = hevc_put_qpel_v24_10_avx2;
1180  c->put_hevc_qpel[7][1][0] = hevc_put_qpel_v32_10_avx2;
1181  c->put_hevc_qpel[8][1][0] = hevc_put_qpel_v48_10_avx2;
1182  c->put_hevc_qpel[9][1][0] = hevc_put_qpel_v64_10_avx2;
1183 
1184  c->put_hevc_qpel_uni[5][1][0] = hevc_put_uni_qpel_v16_10_avx2;
1185  c->put_hevc_qpel_uni[6][1][0] = hevc_put_uni_qpel_v24_10_avx2;
1186  c->put_hevc_qpel_uni[7][1][0] = hevc_put_uni_qpel_v32_10_avx2;
1187  c->put_hevc_qpel_uni[8][1][0] = hevc_put_uni_qpel_v48_10_avx2;
1188  c->put_hevc_qpel_uni[9][1][0] = hevc_put_uni_qpel_v64_10_avx2;
1189 
1190  c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_bi_qpel_v16_10_avx2;
1191  c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_bi_qpel_v24_10_avx2;
1192  c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_bi_qpel_v32_10_avx2;
1193  c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_bi_qpel_v48_10_avx2;
1194  c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_bi_qpel_v64_10_avx2;
1195 
1196  c->put_hevc_qpel[5][1][1] = hevc_put_qpel_hv16_10_avx2;
1197  c->put_hevc_qpel[6][1][1] = hevc_put_qpel_hv24_10_avx2;
1198  c->put_hevc_qpel[7][1][1] = hevc_put_qpel_hv32_10_avx2;
1199  c->put_hevc_qpel[8][1][1] = hevc_put_qpel_hv48_10_avx2;
1200  c->put_hevc_qpel[9][1][1] = hevc_put_qpel_hv64_10_avx2;
1201 
1202  c->put_hevc_qpel_uni[5][1][1] = hevc_put_uni_qpel_hv16_10_avx2;
1203  c->put_hevc_qpel_uni[6][1][1] = hevc_put_uni_qpel_hv24_10_avx2;
1204  c->put_hevc_qpel_uni[7][1][1] = hevc_put_uni_qpel_hv32_10_avx2;
1205  c->put_hevc_qpel_uni[8][1][1] = hevc_put_uni_qpel_hv48_10_avx2;
1206  c->put_hevc_qpel_uni[9][1][1] = hevc_put_uni_qpel_hv64_10_avx2;
1207 
1208  c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_bi_qpel_hv16_10_avx2;
1209  c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_bi_qpel_hv24_10_avx2;
1210  c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_bi_qpel_hv32_10_avx2;
1211  c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_bi_qpel_hv48_10_avx2;
1212  c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_bi_qpel_hv64_10_avx2;
1213 #endif /* ARCH_X86_64 */
1214 
1215  SAO_BAND_INIT(10, avx2);
1216  SAO_EDGE_INIT(10, avx2);
1217 
1218  c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
1219  c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
1220  }
1221 #endif /* HAVE_AVX2_EXTERNAL */
1222  } else if (bit_depth == 12) {
1223  if (EXTERNAL_SSE2(cpu_flags)) {
1224  c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
1225  c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
1226 #if ARCH_X86_64
1227  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
1228  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
1229 #endif
1230  SAO_BAND_INIT(12, sse2);
1231  SAO_EDGE_INIT(12, sse2);
1232 
1233  c->idct_dc[0] = ff_hevc_idct_4x4_dc_12_sse2;
1234  c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2;
1235  c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
1236  c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
1237  }
1238 #if ARCH_X86_64
1239  if (EXTERNAL_SSSE3(cpu_flags)) {
1240  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
1241  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
1242  }
1243 #endif
1244 #if HAVE_SSE4_EXTERNAL && ARCH_X86_64
1245  if (EXTERNAL_SSE4(cpu_flags)) {
1246  EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
1247  EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 12, sse4);
1248  EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 12, sse4);
1249  EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 12, sse4);
1250 
1251  QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
1252  QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 12, sse4);
1253  QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 12, sse4);
1254  QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 12, sse4);
1255  }
1256 #endif
1257  if (EXTERNAL_AVX(cpu_flags)) {
1258  c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
1259  c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
1260 #if ARCH_X86_64
1261  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
1262  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
1263 #endif
1264  SAO_BAND_INIT(12, avx);
1265  }
1266  if (EXTERNAL_AVX2(cpu_flags)) {
1267  c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
1268  }
1269  if (EXTERNAL_AVX2_FAST(cpu_flags)) {
1270  c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_avx2;
1271  c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_avx2;
1272 
1273  SAO_BAND_INIT(12, avx2);
1274  SAO_EDGE_INIT(12, avx2);
1275  }
1276  }
1277 }