FFmpeg
hevcdsp_init.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2013 Seppo Tomperi
3  * Copyright (c) 2013 - 2014 Pierre-Edouard Lepere
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 #include "config.h"
23 
24 #include "libavutil/cpu.h"
25 #include "libavutil/mem_internal.h"
26 #include "libavutil/x86/asm.h"
27 #include "libavutil/x86/cpu.h"
28 #include "libavcodec/hevcdsp.h"
29 #include "libavcodec/x86/hevcdsp.h"
30 
/*
 * Deblocking (loop) filter prototypes. Only declarations are generated here;
 * the function bodies are not part of this section.
 *
 * LFC_FUNC declares one chroma filter and LFL_FUNC one luma filter for a
 * direction (h or v), a bit depth and an instruction-set suffix. The luma
 * prototype carries an additional `beta` argument. Both macros already end
 * in a semicolon, so they are invoked without one.
 */
#define LFC_FUNC(DIR, DEPTH, OPT) \
void ff_hevc_ ## DIR ## _loop_filter_chroma_ ## DEPTH ## _ ## OPT(uint8_t *pix, \
                                                                 ptrdiff_t stride, \
                                                                 int *tc, \
                                                                 uint8_t *no_p, \
                                                                 uint8_t *no_q);

#define LFL_FUNC(DIR, DEPTH, OPT) \
void ff_hevc_ ## DIR ## _loop_filter_luma_ ## DEPTH ## _ ## OPT(uint8_t *pix, \
                                                               ptrdiff_t stride, \
                                                               int beta, \
                                                               int *tc, \
                                                               uint8_t *no_p, \
                                                               uint8_t *no_q);

/*
 * Declare the horizontal and the vertical filter in one go. The `type`
 * argument is accepted but does not appear in the expansion.
 */
#define LFC_FUNCS(type, depth, opt) \
    LFC_FUNC(h, depth, opt) \
    LFC_FUNC(v, depth, opt)

#define LFL_FUNCS(type, depth, opt) \
    LFL_FUNC(h, depth, opt) \
    LFL_FUNC(v, depth, opt)

/* Chroma filters: SSE2 and AVX, each at 8, 10 and 12 bit. */
LFC_FUNCS(uint8_t,  8, sse2)
LFC_FUNCS(uint8_t, 10, sse2)
LFC_FUNCS(uint8_t, 12, sse2)
LFC_FUNCS(uint8_t,  8, avx)
LFC_FUNCS(uint8_t, 10, avx)
LFC_FUNCS(uint8_t, 12, avx)

/* Luma filters: SSE2, SSSE3 and AVX, each at 8, 10 and 12 bit. */
LFL_FUNCS(uint8_t,  8, sse2)
LFL_FUNCS(uint8_t, 10, sse2)
LFL_FUNCS(uint8_t, 12, sse2)
LFL_FUNCS(uint8_t,  8, ssse3)
LFL_FUNCS(uint8_t, 10, ssse3)
LFL_FUNCS(uint8_t, 12, ssse3)
LFL_FUNCS(uint8_t,  8, avx)
LFL_FUNCS(uint8_t, 10, avx)
LFL_FUNCS(uint8_t, 12, avx)
60 
/*
 * Declares ff_hevc_idct_<W>_dc_{8,10,12}_<opt>(), each taking the
 * coefficient buffer as its only argument. The last declaration is left
 * without a semicolon, so every invocation must be followed by one.
 */
#define IDCT_DC_FUNCS(W, opt) \
void ff_hevc_idct_ ## W ## _dc_8_  ## opt(int16_t *coeffs); \
void ff_hevc_idct_ ## W ## _dc_10_ ## opt(int16_t *coeffs); \
void ff_hevc_idct_ ## W ## _dc_12_ ## opt(int16_t *coeffs)

IDCT_DC_FUNCS(4x4,   mmxext);
IDCT_DC_FUNCS(8x8,   sse2);
IDCT_DC_FUNCS(16x16, sse2);
IDCT_DC_FUNCS(32x32, sse2);
IDCT_DC_FUNCS(16x16, avx2);
IDCT_DC_FUNCS(32x32, avx2);
72 
/*
 * Declares ff_hevc_idct_<N>x<N>_{8,10}_<opt>() for N = 4, 8, 16 and 32.
 * Every prototype takes the coefficient buffer and a col_limit argument.
 * The macro already ends in a semicolon and is invoked without one.
 */
#define IDCT_FUNCS(opt) \
void ff_hevc_idct_4x4_8_##opt(int16_t *coeffs, int col_limit);    \
void ff_hevc_idct_8x8_8_##opt(int16_t *coeffs, int col_limit);    \
void ff_hevc_idct_16x16_8_##opt(int16_t *coeffs, int col_limit);  \
void ff_hevc_idct_32x32_8_##opt(int16_t *coeffs, int col_limit);  \
void ff_hevc_idct_4x4_10_##opt(int16_t *coeffs, int col_limit);   \
void ff_hevc_idct_8x8_10_##opt(int16_t *coeffs, int col_limit);   \
void ff_hevc_idct_16x16_10_##opt(int16_t *coeffs, int col_limit); \
void ff_hevc_idct_32x32_10_##opt(int16_t *coeffs, int col_limit);

IDCT_FUNCS(sse2)
IDCT_FUNCS(avx)
85 
/*
 * Generates ff_hevc_put_hevc_<name><W>_<bitd>_<opt>() by calling the
 * <step>-wide routine of the same name repeatedly across a W-wide block.
 *
 * On each pass the int16_t destination advances by `step` elements and the
 * byte source by step * ((bitd + 7) / 8) bytes; all remaining arguments are
 * forwarded unchanged.
 */
#define mc_rep_func(name, bitd, step, W, opt) \
void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *_dst, \
                                                uint8_t *_src, ptrdiff_t _srcstride, \
                                                int height, intptr_t mx, intptr_t my, \
                                                int width) \
{ \
    int x; \
    for (x = 0; x < W; x += step) \
        ff_hevc_put_hevc_##name##step##_##bitd##_##opt(_dst + x, \
                                                       _src + (x * ((bitd + 7) / 8)), \
                                                       _srcstride, height, mx, my, width); \
}
/*
 * Generates ff_hevc_put_hevc_uni_<name><W>_<bitd>_<opt>() by calling the
 * <step>-wide uni routine repeatedly across a W-wide block.
 *
 * Source and destination are both byte pointers; on each pass both advance
 * by step * ((bitd + 7) / 8) bytes. All other arguments are forwarded
 * unchanged.
 */
#define mc_rep_uni_func(name, bitd, step, W, opt) \
void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \
                                                    uint8_t *_src, ptrdiff_t _srcstride, \
                                                    int height, intptr_t mx, intptr_t my, \
                                                    int width) \
{ \
    int x; \
    for (x = 0; x < W; x += step) \
        ff_hevc_put_hevc_uni_##name##step##_##bitd##_##opt(_dst + (x * ((bitd + 7) / 8)), \
                                                           dststride, \
                                                           _src + (x * ((bitd + 7) / 8)), \
                                                           _srcstride, \
                                                           height, mx, my, width); \
}
/*
 * Generates ff_hevc_put_hevc_bi_<name><W>_<bitd>_<opt>() by calling the
 * <step>-wide bi routine repeatedly across a W-wide block.
 *
 * On each pass the byte pointers (_dst, _src) advance by
 * step * ((bitd + 7) / 8) bytes and the int16_t pointer _src2 advances by
 * `step` elements. All other arguments are forwarded unchanged.
 */
#define mc_rep_bi_func(name, bitd, step, W, opt) \
void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \
                                                   uint8_t *_src, ptrdiff_t _srcstride, \
                                                   int16_t* _src2, \
                                                   int height, intptr_t mx, intptr_t my, \
                                                   int width) \
{ \
    int x; \
    for (x = 0; x < W; x += step) \
        ff_hevc_put_hevc_bi_##name##step##_##bitd##_##opt(_dst + (x * ((bitd + 7) / 8)), \
                                                          dststride, \
                                                          _src + (x * ((bitd + 7) / 8)), \
                                                          _srcstride, \
                                                          _src2 + x, \
                                                          height, mx, my, width); \
}
132 
/*
 * Expands to the plain, uni and bi repeating wrappers for one
 * (name, bitd, step, W, opt) configuration.
 */
#define mc_rep_funcs(name, bitd, step, W, opt) \
    mc_rep_func(name, bitd, step, W, opt)      \
    mc_rep_uni_func(name, bitd, step, W, opt)  \
    mc_rep_bi_func(name, bitd, step, W, opt)
137 
/*
 * Generates ff_hevc_put_hevc_<name><W>_<bitd>_<opt>() from two calls of
 * different widths: the <step1>-wide routine on the start of the row, then
 * the <step2>-wide routine on what follows.
 *
 * For the second call the int16_t destination is moved by `step1` elements
 * and the byte source by step1 * ((bitd + 7) / 8) bytes. W only contributes
 * to the generated function name.
 */
#define mc_rep_func2(name, bitd, step1, step2, W, opt) \
void ff_hevc_put_hevc_##name##W##_##bitd##_##opt(int16_t *dst, \
                                                uint8_t *src, ptrdiff_t _srcstride, \
                                                int height, intptr_t mx, intptr_t my, \
                                                int width) \
{ \
    int16_t *dst_rest = dst + step1; \
    uint8_t *src_rest = src + (step1 * ((bitd + 7) / 8)); \
    ff_hevc_put_hevc_##name##step1##_##bitd##_##opt(dst, src, _srcstride, \
                                                    height, mx, my, width); \
    ff_hevc_put_hevc_##name##step2##_##bitd##_##opt(dst_rest, src_rest, _srcstride, \
                                                    height, mx, my, width); \
}
/*
 * Generates ff_hevc_put_hevc_uni_<name><W>_<bitd>_<opt>() from two calls:
 * the <step1>-wide uni routine first, then the <step2>-wide one.
 *
 * For the second call both byte pointers (dst, src) are moved by
 * step1 * ((bitd + 7) / 8) bytes. W only contributes to the generated
 * function name.
 */
#define mc_rep_uni_func2(name, bitd, step1, step2, W, opt) \
void ff_hevc_put_hevc_uni_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, \
                                                    uint8_t *src, ptrdiff_t _srcstride, \
                                                    int height, intptr_t mx, intptr_t my, \
                                                    int width) \
{ \
    uint8_t *dst_rest = dst + (step1 * ((bitd + 7) / 8)); \
    uint8_t *src_rest = src + (step1 * ((bitd + 7) / 8)); \
    ff_hevc_put_hevc_uni_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, \
                                                        height, mx, my, width); \
    ff_hevc_put_hevc_uni_##name##step2##_##bitd##_##opt(dst_rest, dststride, \
                                                        src_rest, _srcstride, \
                                                        height, mx, my, width); \
}
/*
 * Generates ff_hevc_put_hevc_bi_<name><W>_<bitd>_<opt>() from two calls:
 * the <step1>-wide bi routine first, then the <step2>-wide one.
 *
 * For the second call the byte pointers (dst, src) are moved by
 * step1 * ((bitd + 7) / 8) bytes and the int16_t pointer src2 by `step1`
 * elements. W only contributes to the generated function name.
 */
#define mc_rep_bi_func2(name, bitd, step1, step2, W, opt) \
void ff_hevc_put_hevc_bi_##name##W##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, \
                                                   uint8_t *src, ptrdiff_t _srcstride, \
                                                   int16_t* src2, \
                                                   int height, intptr_t mx, intptr_t my, \
                                                   int width) \
{ \
    uint8_t *dst_rest  = dst + (step1 * ((bitd + 7) / 8)); \
    uint8_t *src_rest  = src + (step1 * ((bitd + 7) / 8)); \
    int16_t *src2_rest = src2 + step1; \
    ff_hevc_put_hevc_bi_##name##step1##_##bitd##_##opt(dst, dststride, src, _srcstride, \
                                                       src2, height, mx, my, width); \
    ff_hevc_put_hevc_bi_##name##step2##_##bitd##_##opt(dst_rest, dststride, \
                                                       src_rest, _srcstride, \
                                                       src2_rest, height, mx, my, width); \
}
167 
/*
 * Expands to the plain, uni and bi two-call wrappers for one
 * (name, bitd, step1, step2, W, opt) configuration.
 */
#define mc_rep_funcs2(name, bitd, step1, step2, W, opt) \
    mc_rep_func2(name, bitd, step1, step2, W, opt)      \
    mc_rep_uni_func2(name, bitd, step1, step2, W, opt)  \
    mc_rep_bi_func2(name, bitd, step1, step2, W, opt)
172 
173 #if ARCH_X86_64 && HAVE_SSE4_EXTERNAL
174 
/*
 * Generates the 10-bit ff_hevc_put_hevc_<name><width1>_10_<opt1>() from two
 * routines built for different instruction sets: the <width2>-wide <opt1>
 * routine, then the <width3>-wide <opt2> routine.
 *
 * For the second call the int16_t destination is moved by `width2` elements
 * while the byte source is moved by `width4`, i.e. width4 is a byte offset
 * supplied by the caller.
 */
#define mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
void ff_hevc_put_hevc_##name##width1##_10_##opt1(int16_t *dst, uint8_t *src, \
                                                ptrdiff_t _srcstride, int height, \
                                                intptr_t mx, intptr_t my, int width) \
{ \
    int16_t *dst_rest = dst + width2; \
    uint8_t *src_rest = src + width4; \
    ff_hevc_put_hevc_##name##width2##_10_##opt1(dst, src, _srcstride, \
                                                height, mx, my, width); \
    ff_hevc_put_hevc_##name##width3##_10_##opt2(dst_rest, src_rest, _srcstride, \
                                                height, mx, my, width); \
}
183 
/*
 * Generates the 10-bit ff_hevc_put_hevc_bi_<name><width1>_10_<opt1>() from
 * the <width2>-wide <opt1> routine followed by the <width3>-wide <opt2>
 * routine.
 *
 * For the second call the byte pointers (dst, src) are moved by `width4`
 * and the int16_t pointer src2 by `width2` elements.
 */
#define mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
void ff_hevc_put_hevc_bi_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, \
                                                   uint8_t *src, ptrdiff_t _srcstride, \
                                                   int16_t *src2, \
                                                   int height, intptr_t mx, intptr_t my, \
                                                   int width) \
{ \
    uint8_t *dst_rest  = dst + width4; \
    uint8_t *src_rest  = src + width4; \
    int16_t *src2_rest = src2 + width2; \
    ff_hevc_put_hevc_bi_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, src2, \
                                                   height, mx, my, width); \
    ff_hevc_put_hevc_bi_##name##width3##_10_##opt2(dst_rest, dststride, src_rest, \
                                                   _srcstride, src2_rest, \
                                                   height, mx, my, width); \
}
194 
/*
 * Generates the 10-bit ff_hevc_put_hevc_uni_<name><width1>_10_<opt1>() from
 * the <width2>-wide <opt1> routine followed by the <width3>-wide <opt2>
 * routine.
 *
 * For the second call both byte pointers (dst, src) are moved by `width4`.
 */
#define mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4) \
void ff_hevc_put_hevc_uni_##name##width1##_10_##opt1(uint8_t *dst, ptrdiff_t dststride, \
                                                    uint8_t *src, ptrdiff_t _srcstride, \
                                                    int height, intptr_t mx, intptr_t my, \
                                                    int width) \
{ \
    uint8_t *dst_rest = dst + width4; \
    uint8_t *src_rest = src + width4; \
    ff_hevc_put_hevc_uni_##name##width2##_10_##opt1(dst, dststride, src, _srcstride, \
                                                    height, mx, my, width); \
    ff_hevc_put_hevc_uni_##name##width3##_10_##opt2(dst_rest, dststride, \
                                                    src_rest, _srcstride, \
                                                    height, mx, my, width); \
}
205 
/* Expands to the plain, bi and uni 10-bit mixed-ISA wrappers for one configuration. */
#define mc_rep_mixs_10(name, width1, width2, width3, opt1, opt2, width4)  \
    mc_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)       \
    mc_bi_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)    \
    mc_uni_rep_mix_10(name, width1, width2, width3, opt1, opt2, width4)
210 
/*
 * Generates the 8-bit ff_hevc_put_hevc_<name><width1>_8_<opt1>() from the
 * <width2>-wide <opt1> routine followed by the <width3>-wide <opt2> routine.
 *
 * For the second call the int16_t destination is moved by `width2` elements
 * and the byte source by `width2` bytes.
 */
#define mc_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
void ff_hevc_put_hevc_##name##width1##_8_##opt1(int16_t *dst, uint8_t *src, \
                                               ptrdiff_t _srcstride, int height, \
                                               intptr_t mx, intptr_t my, int width) \
{ \
    int16_t *dst_rest = dst + width2; \
    uint8_t *src_rest = src + width2; \
    ff_hevc_put_hevc_##name##width2##_8_##opt1(dst, src, _srcstride, \
                                               height, mx, my, width); \
    ff_hevc_put_hevc_##name##width3##_8_##opt2(dst_rest, src_rest, _srcstride, \
                                               height, mx, my, width); \
}
219 
/*
 * Generates the 8-bit ff_hevc_put_hevc_bi_<name><width1>_8_<opt1>() from
 * the <width2>-wide <opt1> routine followed by the <width3>-wide <opt2>
 * routine.
 *
 * For the second call dst, src and src2 are each moved by `width2`
 * (bytes for the byte pointers, elements for the int16_t pointer).
 */
#define mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
void ff_hevc_put_hevc_bi_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, \
                                                  uint8_t *src, ptrdiff_t _srcstride, \
                                                  int16_t* src2, \
                                                  int height, intptr_t mx, intptr_t my, \
                                                  int width) \
{ \
    uint8_t *dst_rest  = dst + width2; \
    uint8_t *src_rest  = src + width2; \
    int16_t *src2_rest = src2 + width2; \
    ff_hevc_put_hevc_bi_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
                                                  src2, height, mx, my, width); \
    ff_hevc_put_hevc_bi_##name##width3##_8_##opt2(dst_rest, dststride, src_rest, \
                                                  _srcstride, src2_rest, \
                                                  height, mx, my, width); \
}
230 
/*
 * Generates the 8-bit ff_hevc_put_hevc_uni_<name><width1>_8_<opt1>() from
 * the <width2>-wide <opt1> routine followed by the <width3>-wide <opt2>
 * routine.
 *
 * For the second call both byte pointers (dst, src) are moved by `width2`.
 */
#define mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2) \
void ff_hevc_put_hevc_uni_##name##width1##_8_##opt1(uint8_t *dst, ptrdiff_t dststride, \
                                                   uint8_t *src, ptrdiff_t _srcstride, \
                                                   int height, intptr_t mx, intptr_t my, \
                                                   int width) \
{ \
    uint8_t *dst_rest = dst + width2; \
    uint8_t *src_rest = src + width2; \
    ff_hevc_put_hevc_uni_##name##width2##_8_##opt1(dst, dststride, src, _srcstride, \
                                                   height, mx, my, width); \
    ff_hevc_put_hevc_uni_##name##width3##_8_##opt2(dst_rest, dststride, \
                                                   src_rest, _srcstride, \
                                                   height, mx, my, width); \
}
241 
/* Expands to the plain, bi and uni 8-bit mixed-ISA wrappers for one configuration. */
#define mc_rep_mixs_8(name, width1, width2, width3, opt1, opt2)  \
    mc_rep_mix_8(name, width1, width2, width3, opt1, opt2)       \
    mc_bi_rep_mix_8(name, width1, width2, width3, opt1, opt2)    \
    mc_uni_rep_mix_8(name, width1, width2, width3, opt1, opt2)
246 
/*
 * AVX2 wrapper instantiations, compiled only when HAVE_AVX2_EXTERNAL is set.
 * Every line below expands one of the generator macros into wrapper
 * function definitions; nothing here is executed at file scope.
 */
247 #if HAVE_AVX2_EXTERNAL
248 
/* 8-bit, width 48: a 32-wide avx2 call followed by a 16-wide sse4 call. */
249 mc_rep_mixs_8(pel_pixels, 48, 32, 16, avx2, sse4)
250 mc_rep_mixs_8(epel_hv, 48, 32, 16, avx2, sse4)
251 mc_rep_mixs_8(epel_h , 48, 32, 16, avx2, sse4)
252 mc_rep_mixs_8(epel_v , 48, 32, 16, avx2, sse4)
253 
/*
 * 10-bit, width 24: a 16-wide avx2 call followed by an 8-wide sse4 call; the
 * trailing 32 is the byte offset applied to the byte pointers for the second
 * call. pel_pixels only gets the plain and bi variants here.
 */
254 mc_rep_mix_10(pel_pixels, 24, 16, 8, avx2, sse4, 32)
255 mc_bi_rep_mix_10(pel_pixels,24, 16, 8, avx2, sse4, 32)
256 mc_rep_mixs_10(epel_hv, 24, 16, 8, avx2, sse4, 32)
257 mc_rep_mixs_10(epel_h , 24, 16, 8, avx2, sse4, 32)
258 mc_rep_mixs_10(epel_v , 24, 16, 8, avx2, sse4, 32)
259 
260 
261 mc_rep_mixs_10(qpel_h , 24, 16, 8, avx2, sse4, 32)
262 mc_rep_mixs_10(qpel_v , 24, 16, 8, avx2, sse4, 32)
263 mc_rep_mixs_10(qpel_hv, 24, 16, 8, avx2, sse4, 32)
264 
265 
/* 8-bit uni pel_pixels wrappers of width 128 and 96, built from 64- and 32-wide calls. */
266 mc_rep_uni_func(pel_pixels, 8, 64, 128, avx2)//used for 10bit
267 mc_rep_uni_func(pel_pixels, 8, 32, 96, avx2) //used for 10bit
268 
/* From here on: same-ISA repetition of a narrower avx2 routine (32-wide at 8 bit, 16- or 32-wide at 10 bit). */
269 mc_rep_funcs(pel_pixels, 8, 32, 64, avx2)
270 
271 mc_rep_func(pel_pixels, 10, 16, 32, avx2)
272 mc_rep_func(pel_pixels, 10, 16, 48, avx2)
273 mc_rep_func(pel_pixels, 10, 32, 64, avx2)
274 
275 mc_rep_bi_func(pel_pixels, 10, 16, 32, avx2)
276 mc_rep_bi_func(pel_pixels, 10, 16, 48, avx2)
277 mc_rep_bi_func(pel_pixels, 10, 32, 64, avx2)
278 
279 mc_rep_funcs(epel_h, 8, 32, 64, avx2)
280 
281 mc_rep_funcs(epel_v, 8, 32, 64, avx2)
282 
283 mc_rep_funcs(epel_h, 10, 16, 32, avx2)
284 mc_rep_funcs(epel_h, 10, 16, 48, avx2)
285 mc_rep_funcs(epel_h, 10, 32, 64, avx2)
286 
287 mc_rep_funcs(epel_v, 10, 16, 32, avx2)
288 mc_rep_funcs(epel_v, 10, 16, 48, avx2)
289 mc_rep_funcs(epel_v, 10, 32, 64, avx2)
290 
291 
292 mc_rep_funcs(epel_hv, 8, 32, 64, avx2)
293 
294 mc_rep_funcs(epel_hv, 10, 16, 32, avx2)
295 mc_rep_funcs(epel_hv, 10, 16, 48, avx2)
296 mc_rep_funcs(epel_hv, 10, 32, 64, avx2)
297 
298 mc_rep_funcs(qpel_h, 8, 32, 64, avx2)
299 mc_rep_mixs_8(qpel_h , 48, 32, 16, avx2, sse4)
300 
301 mc_rep_funcs(qpel_v, 8, 32, 64, avx2)
302 mc_rep_mixs_8(qpel_v, 48, 32, 16, avx2, sse4)
303 
304 mc_rep_funcs(qpel_h, 10, 16, 32, avx2)
305 mc_rep_funcs(qpel_h, 10, 16, 48, avx2)
306 mc_rep_funcs(qpel_h, 10, 32, 64, avx2)
307 
308 mc_rep_funcs(qpel_v, 10, 16, 32, avx2)
309 mc_rep_funcs(qpel_v, 10, 16, 48, avx2)
310 mc_rep_funcs(qpel_v, 10, 32, 64, avx2)
311 
312 mc_rep_funcs(qpel_hv, 10, 16, 32, avx2)
313 mc_rep_funcs(qpel_hv, 10, 16, 48, avx2)
314 mc_rep_funcs(qpel_hv, 10, 32, 64, avx2)
315 
316 #endif //AVX2
317 
/*
 * SSE4 wrapper instantiations. Each line expands into the plain, uni and bi
 * wrapper for one (filter, bit depth, repeated width, produced width)
 * combination.
 *
 * In this table the 8-bit rows repeat a 16-wide routine (8-wide for width
 * 24), except qpel_hv, which repeats an 8-wide routine for every width. The
 * 10- and 12-bit rows repeat an 8-wide routine (4-wide for width 12). The
 * 8-bit epel_hv and qpel_hv width-12 wrappers are the only ones built with
 * mc_rep_funcs2, as an 8-wide call followed by a 4-wide call.
 */
/* pel_pixels */
318 mc_rep_funcs(pel_pixels, 8, 16, 64, sse4)
319 mc_rep_funcs(pel_pixels, 8, 16, 48, sse4)
320 mc_rep_funcs(pel_pixels, 8, 16, 32, sse4)
321 mc_rep_funcs(pel_pixels, 8, 8, 24, sse4)
322 mc_rep_funcs(pel_pixels,10, 8, 64, sse4)
323 mc_rep_funcs(pel_pixels,10, 8, 48, sse4)
324 mc_rep_funcs(pel_pixels,10, 8, 32, sse4)
325 mc_rep_funcs(pel_pixels,10, 8, 24, sse4)
326 mc_rep_funcs(pel_pixels,10, 8, 16, sse4)
327 mc_rep_funcs(pel_pixels,10, 4, 12, sse4)
328 mc_rep_funcs(pel_pixels,12, 8, 64, sse4)
329 mc_rep_funcs(pel_pixels,12, 8, 48, sse4)
330 mc_rep_funcs(pel_pixels,12, 8, 32, sse4)
331 mc_rep_funcs(pel_pixels,12, 8, 24, sse4)
332 mc_rep_funcs(pel_pixels,12, 8, 16, sse4)
333 mc_rep_funcs(pel_pixels,12, 4, 12, sse4)
334 
/* epel_h, epel_v, epel_hv */
335 mc_rep_funcs(epel_h, 8, 16, 64, sse4)
336 mc_rep_funcs(epel_h, 8, 16, 48, sse4)
337 mc_rep_funcs(epel_h, 8, 16, 32, sse4)
338 mc_rep_funcs(epel_h, 8, 8, 24, sse4)
339 mc_rep_funcs(epel_h,10, 8, 64, sse4)
340 mc_rep_funcs(epel_h,10, 8, 48, sse4)
341 mc_rep_funcs(epel_h,10, 8, 32, sse4)
342 mc_rep_funcs(epel_h,10, 8, 24, sse4)
343 mc_rep_funcs(epel_h,10, 8, 16, sse4)
344 mc_rep_funcs(epel_h,10, 4, 12, sse4)
345 mc_rep_funcs(epel_h,12, 8, 64, sse4)
346 mc_rep_funcs(epel_h,12, 8, 48, sse4)
347 mc_rep_funcs(epel_h,12, 8, 32, sse4)
348 mc_rep_funcs(epel_h,12, 8, 24, sse4)
349 mc_rep_funcs(epel_h,12, 8, 16, sse4)
350 mc_rep_funcs(epel_h,12, 4, 12, sse4)
351 mc_rep_funcs(epel_v, 8, 16, 64, sse4)
352 mc_rep_funcs(epel_v, 8, 16, 48, sse4)
353 mc_rep_funcs(epel_v, 8, 16, 32, sse4)
354 mc_rep_funcs(epel_v, 8, 8, 24, sse4)
355 mc_rep_funcs(epel_v,10, 8, 64, sse4)
356 mc_rep_funcs(epel_v,10, 8, 48, sse4)
357 mc_rep_funcs(epel_v,10, 8, 32, sse4)
358 mc_rep_funcs(epel_v,10, 8, 24, sse4)
359 mc_rep_funcs(epel_v,10, 8, 16, sse4)
360 mc_rep_funcs(epel_v,10, 4, 12, sse4)
361 mc_rep_funcs(epel_v,12, 8, 64, sse4)
362 mc_rep_funcs(epel_v,12, 8, 48, sse4)
363 mc_rep_funcs(epel_v,12, 8, 32, sse4)
364 mc_rep_funcs(epel_v,12, 8, 24, sse4)
365 mc_rep_funcs(epel_v,12, 8, 16, sse4)
366 mc_rep_funcs(epel_v,12, 4, 12, sse4)
367 mc_rep_funcs(epel_hv, 8, 16, 64, sse4)
368 mc_rep_funcs(epel_hv, 8, 16, 48, sse4)
369 mc_rep_funcs(epel_hv, 8, 16, 32, sse4)
370 mc_rep_funcs(epel_hv, 8, 8, 24, sse4)
371 mc_rep_funcs2(epel_hv,8, 8, 4, 12, sse4)
372 mc_rep_funcs(epel_hv,10, 8, 64, sse4)
373 mc_rep_funcs(epel_hv,10, 8, 48, sse4)
374 mc_rep_funcs(epel_hv,10, 8, 32, sse4)
375 mc_rep_funcs(epel_hv,10, 8, 24, sse4)
376 mc_rep_funcs(epel_hv,10, 8, 16, sse4)
377 mc_rep_funcs(epel_hv,10, 4, 12, sse4)
378 mc_rep_funcs(epel_hv,12, 8, 64, sse4)
379 mc_rep_funcs(epel_hv,12, 8, 48, sse4)
380 mc_rep_funcs(epel_hv,12, 8, 32, sse4)
381 mc_rep_funcs(epel_hv,12, 8, 24, sse4)
382 mc_rep_funcs(epel_hv,12, 8, 16, sse4)
383 mc_rep_funcs(epel_hv,12, 4, 12, sse4)
384 
/* qpel_h, qpel_v, qpel_hv */
385 mc_rep_funcs(qpel_h, 8, 16, 64, sse4)
386 mc_rep_funcs(qpel_h, 8, 16, 48, sse4)
387 mc_rep_funcs(qpel_h, 8, 16, 32, sse4)
388 mc_rep_funcs(qpel_h, 8, 8, 24, sse4)
389 mc_rep_funcs(qpel_h,10, 8, 64, sse4)
390 mc_rep_funcs(qpel_h,10, 8, 48, sse4)
391 mc_rep_funcs(qpel_h,10, 8, 32, sse4)
392 mc_rep_funcs(qpel_h,10, 8, 24, sse4)
393 mc_rep_funcs(qpel_h,10, 8, 16, sse4)
394 mc_rep_funcs(qpel_h,10, 4, 12, sse4)
395 mc_rep_funcs(qpel_h,12, 8, 64, sse4)
396 mc_rep_funcs(qpel_h,12, 8, 48, sse4)
397 mc_rep_funcs(qpel_h,12, 8, 32, sse4)
398 mc_rep_funcs(qpel_h,12, 8, 24, sse4)
399 mc_rep_funcs(qpel_h,12, 8, 16, sse4)
400 mc_rep_funcs(qpel_h,12, 4, 12, sse4)
401 mc_rep_funcs(qpel_v, 8, 16, 64, sse4)
402 mc_rep_funcs(qpel_v, 8, 16, 48, sse4)
403 mc_rep_funcs(qpel_v, 8, 16, 32, sse4)
404 mc_rep_funcs(qpel_v, 8, 8, 24, sse4)
405 mc_rep_funcs(qpel_v,10, 8, 64, sse4)
406 mc_rep_funcs(qpel_v,10, 8, 48, sse4)
407 mc_rep_funcs(qpel_v,10, 8, 32, sse4)
408 mc_rep_funcs(qpel_v,10, 8, 24, sse4)
409 mc_rep_funcs(qpel_v,10, 8, 16, sse4)
410 mc_rep_funcs(qpel_v,10, 4, 12, sse4)
411 mc_rep_funcs(qpel_v,12, 8, 64, sse4)
412 mc_rep_funcs(qpel_v,12, 8, 48, sse4)
413 mc_rep_funcs(qpel_v,12, 8, 32, sse4)
414 mc_rep_funcs(qpel_v,12, 8, 24, sse4)
415 mc_rep_funcs(qpel_v,12, 8, 16, sse4)
416 mc_rep_funcs(qpel_v,12, 4, 12, sse4)
417 mc_rep_funcs(qpel_hv, 8, 8, 64, sse4)
418 mc_rep_funcs(qpel_hv, 8, 8, 48, sse4)
419 mc_rep_funcs(qpel_hv, 8, 8, 32, sse4)
420 mc_rep_funcs(qpel_hv, 8, 8, 24, sse4)
421 mc_rep_funcs(qpel_hv, 8, 8, 16, sse4)
422 mc_rep_funcs2(qpel_hv,8, 8, 4, 12, sse4)
423 mc_rep_funcs(qpel_hv,10, 8, 64, sse4)
424 mc_rep_funcs(qpel_hv,10, 8, 48, sse4)
425 mc_rep_funcs(qpel_hv,10, 8, 32, sse4)
426 mc_rep_funcs(qpel_hv,10, 8, 24, sse4)
427 mc_rep_funcs(qpel_hv,10, 8, 16, sse4)
428 mc_rep_funcs(qpel_hv,10, 4, 12, sse4)
429 mc_rep_funcs(qpel_hv,12, 8, 64, sse4)
430 mc_rep_funcs(qpel_hv,12, 8, 48, sse4)
431 mc_rep_funcs(qpel_hv,12, 8, 32, sse4)
432 mc_rep_funcs(qpel_hv,12, 8, 24, sse4)
433 mc_rep_funcs(qpel_hv,12, 8, 16, sse4)
434 mc_rep_funcs(qpel_hv,12, 4, 12, sse4)
435 
/*
 * Generates ff_hevc_put_hevc_uni_w<W>_<bitd>_<opt>() by calling the
 * <step>-wide uni_w routine repeatedly across a W-wide block.
 *
 * On each pass the int16_t source advances by `step` elements and the byte
 * destination by step * ((bitd + 7) / 8) bytes; dststride, height, denom,
 * _wx and _ox are forwarded unchanged.
 */
#define mc_rep_uni_w(bitd, step, W, opt) \
void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \
                                               int16_t *_src, int height, int denom, \
                                               int _wx, int _ox) \
{ \
    int x; \
    for (x = 0; x < W; x += step) \
        ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(_dst + (x * ((bitd + 7) / 8)), \
                                                      dststride, _src + x, \
                                                      height, denom, _wx, _ox); \
}
450 
/*
 * uni_w wrappers for SSE4 at 8, 10 and 12 bit. Width 12 is assembled from
 * two 6-wide calls; widths 16 to 64 are assembled from 8-wide calls.
 */
451 mc_rep_uni_w(8, 6, 12, sse4)
452 mc_rep_uni_w(8, 8, 16, sse4)
453 mc_rep_uni_w(8, 8, 24, sse4)
454 mc_rep_uni_w(8, 8, 32, sse4)
455 mc_rep_uni_w(8, 8, 48, sse4)
456 mc_rep_uni_w(8, 8, 64, sse4)
457 
458 mc_rep_uni_w(10, 6, 12, sse4)
459 mc_rep_uni_w(10, 8, 16, sse4)
460 mc_rep_uni_w(10, 8, 24, sse4)
461 mc_rep_uni_w(10, 8, 32, sse4)
462 mc_rep_uni_w(10, 8, 48, sse4)
463 mc_rep_uni_w(10, 8, 64, sse4)
464 
465 mc_rep_uni_w(12, 6, 12, sse4)
466 mc_rep_uni_w(12, 8, 16, sse4)
467 mc_rep_uni_w(12, 8, 24, sse4)
468 mc_rep_uni_w(12, 8, 32, sse4)
469 mc_rep_uni_w(12, 8, 48, sse4)
470 mc_rep_uni_w(12, 8, 64, sse4)
471 
/*
 * Generates ff_hevc_put_hevc_bi_w<W>_<bitd>_<opt>() by calling the
 * <step>-wide bi_w routine repeatedly across a W-wide block.
 *
 * On each pass both int16_t sources advance by `step` elements and the byte
 * destination by step * ((bitd + 7) / 8) bytes; dststride, height, denom
 * and the two weight/offset pairs are forwarded unchanged.
 */
#define mc_rep_bi_w(bitd, step, W, opt) \
void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, \
                                              int16_t *_src, int16_t *_src2, \
                                              int height, int denom, \
                                              int _wx0, int _wx1, int _ox0, int _ox1) \
{ \
    int x; \
    for (x = 0; x < W; x += step) \
        ff_hevc_put_hevc_bi_w##step##_##bitd##_##opt(_dst + (x * ((bitd + 7) / 8)), \
                                                     dststride, _src + x, _src2 + x, \
                                                     height, denom, \
                                                     _wx0, _wx1, _ox0, _ox1); \
}
489 
/*
 * bi_w wrappers for SSE4 at 8, 10 and 12 bit. Width 12 is assembled from
 * two 6-wide calls; widths 16 to 64 are assembled from 8-wide calls.
 */
490 mc_rep_bi_w(8, 6, 12, sse4)
491 mc_rep_bi_w(8, 8, 16, sse4)
492 mc_rep_bi_w(8, 8, 24, sse4)
493 mc_rep_bi_w(8, 8, 32, sse4)
494 mc_rep_bi_w(8, 8, 48, sse4)
495 mc_rep_bi_w(8, 8, 64, sse4)
496 
497 mc_rep_bi_w(10, 6, 12, sse4)
498 mc_rep_bi_w(10, 8, 16, sse4)
499 mc_rep_bi_w(10, 8, 24, sse4)
500 mc_rep_bi_w(10, 8, 32, sse4)
501 mc_rep_bi_w(10, 8, 48, sse4)
502 mc_rep_bi_w(10, 8, 64, sse4)
503 
504 mc_rep_bi_w(12, 6, 12, sse4)
505 mc_rep_bi_w(12, 8, 16, sse4)
506 mc_rep_bi_w(12, 8, 24, sse4)
507 mc_rep_bi_w(12, 8, 32, sse4)
508 mc_rep_bi_w(12, 8, 48, sse4)
509 mc_rep_bi_w(12, 8, 64, sse4)
510 
/*
 * Generates ff_hevc_put_hevc_uni_w_<name><W>_<bitd>_<opt>() as a two-stage
 * pipeline:
 *   1. ff_hevc_put_hevc_<name><W>_<bitd>_<opt>() writes its int16_t output
 *      into a 16-byte-aligned local buffer of 71 * MAX_PB_SIZE elements;
 *   2. ff_hevc_put_hevc_uni_w<W>_<bitd>_<opt>() reads that buffer and writes
 *      to _dst, receiving denom, _wx and _ox.
 */
#define mc_uni_w_func(name, bitd, W, opt) \
void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \
                                                      uint8_t *_src, ptrdiff_t _srcstride, \
                                                      int height, int denom, \
                                                      int _wx, int _ox, \
                                                      intptr_t mx, intptr_t my, int width) \
{ \
    LOCAL_ALIGNED_16(int16_t, interm, [71 * MAX_PB_SIZE]); \
    ff_hevc_put_hevc_##name##W##_##bitd##_##opt(interm, _src, _srcstride, \
                                                height, mx, my, width); \
    ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, interm, \
                                               height, denom, _wx, _ox); \
}
522 
/*
 * Expands mc_uni_w_func for widths 4, 8, 12, 16, 24, 32, 48 and 64.
 * Width 6 is not part of this set.
 */
#define mc_uni_w_funcs(name, bitd, opt)  \
    mc_uni_w_func(name, bitd,  4, opt)   \
    mc_uni_w_func(name, bitd,  8, opt)   \
    mc_uni_w_func(name, bitd, 12, opt)   \
    mc_uni_w_func(name, bitd, 16, opt)   \
    mc_uni_w_func(name, bitd, 24, opt)   \
    mc_uni_w_func(name, bitd, 32, opt)   \
    mc_uni_w_func(name, bitd, 48, opt)   \
    mc_uni_w_func(name, bitd, 64, opt)
532 
/*
 * uni_w_<filter> wrappers for SSE4 at 8, 10 and 12 bit. mc_uni_w_funcs
 * covers widths 4 to 64 without 6; the separate mc_uni_w_func(..., 6, ...)
 * lines add width 6 for pel_pixels and the epel filters only. The qpel
 * filters get no width-6 wrapper.
 */
533 mc_uni_w_funcs(pel_pixels, 8, sse4)
534 mc_uni_w_func(pel_pixels, 8, 6, sse4)
535 mc_uni_w_funcs(epel_h, 8, sse4)
536 mc_uni_w_func(epel_h, 8, 6, sse4)
537 mc_uni_w_funcs(epel_v, 8, sse4)
538 mc_uni_w_func(epel_v, 8, 6, sse4)
539 mc_uni_w_funcs(epel_hv, 8, sse4)
540 mc_uni_w_func(epel_hv, 8, 6, sse4)
541 mc_uni_w_funcs(qpel_h, 8, sse4)
542 mc_uni_w_funcs(qpel_v, 8, sse4)
543 mc_uni_w_funcs(qpel_hv, 8, sse4)
544 
545 mc_uni_w_funcs(pel_pixels, 10, sse4)
546 mc_uni_w_func(pel_pixels, 10, 6, sse4)
547 mc_uni_w_funcs(epel_h, 10, sse4)
548 mc_uni_w_func(epel_h, 10, 6, sse4)
549 mc_uni_w_funcs(epel_v, 10, sse4)
550 mc_uni_w_func(epel_v, 10, 6, sse4)
551 mc_uni_w_funcs(epel_hv, 10, sse4)
552 mc_uni_w_func(epel_hv, 10, 6, sse4)
553 mc_uni_w_funcs(qpel_h, 10, sse4)
554 mc_uni_w_funcs(qpel_v, 10, sse4)
555 mc_uni_w_funcs(qpel_hv, 10, sse4)
556 
557 mc_uni_w_funcs(pel_pixels, 12, sse4)
558 mc_uni_w_func(pel_pixels, 12, 6, sse4)
559 mc_uni_w_funcs(epel_h, 12, sse4)
560 mc_uni_w_func(epel_h, 12, 6, sse4)
561 mc_uni_w_funcs(epel_v, 12, sse4)
562 mc_uni_w_func(epel_v, 12, 6, sse4)
563 mc_uni_w_funcs(epel_hv, 12, sse4)
564 mc_uni_w_func(epel_hv, 12, 6, sse4)
565 mc_uni_w_funcs(qpel_h, 12, sse4)
566 mc_uni_w_funcs(qpel_v, 12, sse4)
567 mc_uni_w_funcs(qpel_hv, 12, sse4)
568 
/*
 * Generates ff_hevc_put_hevc_bi_w_<name><W>_<bitd>_<opt>() as a two-stage
 * pipeline:
 *   1. ff_hevc_put_hevc_<name><W>_<bitd>_<opt>() writes its int16_t output
 *      into a 16-byte-aligned local buffer of 71 * MAX_PB_SIZE elements;
 *   2. ff_hevc_put_hevc_bi_w<W>_<bitd>_<opt>() reads that buffer together
 *      with _src2 and writes to _dst, receiving denom and both
 *      weight/offset pairs.
 */
#define mc_bi_w_func(name, bitd, W, opt) \
void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _dststride, \
                                                     uint8_t *_src, ptrdiff_t _srcstride, \
                                                     int16_t *_src2, \
                                                     int height, int denom, \
                                                     int _wx0, int _wx1, int _ox0, int _ox1, \
                                                     intptr_t mx, intptr_t my, int width) \
{ \
    LOCAL_ALIGNED_16(int16_t, interm, [71 * MAX_PB_SIZE]); \
    ff_hevc_put_hevc_##name##W##_##bitd##_##opt(interm, _src, _srcstride, \
                                                height, mx, my, width); \
    ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(_dst, _dststride, interm, _src2, \
                                              height, denom, \
                                              _wx0, _wx1, _ox0, _ox1); \
}
582 
/*
 * Expands mc_bi_w_func for widths 4, 8, 12, 16, 24, 32, 48 and 64.
 * Width 6 is not part of this set.
 */
#define mc_bi_w_funcs(name, bitd, opt)  \
    mc_bi_w_func(name, bitd,  4, opt)   \
    mc_bi_w_func(name, bitd,  8, opt)   \
    mc_bi_w_func(name, bitd, 12, opt)   \
    mc_bi_w_func(name, bitd, 16, opt)   \
    mc_bi_w_func(name, bitd, 24, opt)   \
    mc_bi_w_func(name, bitd, 32, opt)   \
    mc_bi_w_func(name, bitd, 48, opt)   \
    mc_bi_w_func(name, bitd, 64, opt)
592 
/*
 * bi_w_<filter> wrappers for SSE4 at 8, 10 and 12 bit. mc_bi_w_funcs covers
 * widths 4 to 64 without 6; the separate mc_bi_w_func(..., 6, ...) lines add
 * width 6 for pel_pixels and the epel filters only. The qpel filters get no
 * width-6 wrapper.
 */
593 mc_bi_w_funcs(pel_pixels, 8, sse4)
594 mc_bi_w_func(pel_pixels, 8, 6, sse4)
595 mc_bi_w_funcs(epel_h, 8, sse4)
596 mc_bi_w_func(epel_h, 8, 6, sse4)
597 mc_bi_w_funcs(epel_v, 8, sse4)
598 mc_bi_w_func(epel_v, 8, 6, sse4)
599 mc_bi_w_funcs(epel_hv, 8, sse4)
600 mc_bi_w_func(epel_hv, 8, 6, sse4)
601 mc_bi_w_funcs(qpel_h, 8, sse4)
602 mc_bi_w_funcs(qpel_v, 8, sse4)
603 mc_bi_w_funcs(qpel_hv, 8, sse4)
604 
605 mc_bi_w_funcs(pel_pixels, 10, sse4)
606 mc_bi_w_func(pel_pixels, 10, 6, sse4)
607 mc_bi_w_funcs(epel_h, 10, sse4)
608 mc_bi_w_func(epel_h, 10, 6, sse4)
609 mc_bi_w_funcs(epel_v, 10, sse4)
610 mc_bi_w_func(epel_v, 10, 6, sse4)
611 mc_bi_w_funcs(epel_hv, 10, sse4)
612 mc_bi_w_func(epel_hv, 10, 6, sse4)
613 mc_bi_w_funcs(qpel_h, 10, sse4)
614 mc_bi_w_funcs(qpel_v, 10, sse4)
615 mc_bi_w_funcs(qpel_hv, 10, sse4)
616 
617 mc_bi_w_funcs(pel_pixels, 12, sse4)
618 mc_bi_w_func(pel_pixels, 12, 6, sse4)
619 mc_bi_w_funcs(epel_h, 12, sse4)
620 mc_bi_w_func(epel_h, 12, 6, sse4)
621 mc_bi_w_funcs(epel_v, 12, sse4)
622 mc_bi_w_func(epel_v, 12, 6, sse4)
623 mc_bi_w_funcs(epel_hv, 12, sse4)
624 mc_bi_w_func(epel_hv, 12, 6, sse4)
625 mc_bi_w_funcs(qpel_h, 12, sse4)
626 mc_bi_w_funcs(qpel_v, 12, sse4)
627 mc_bi_w_funcs(qpel_hv, 12, sse4)
628 #endif //ARCH_X86_64 && HAVE_SSE4_EXTERNAL
629 
/*
 * Declares ff_hevc_sao_band_filter_<W>_<bitd>_<opt>() for W = 8, 16, 32, 48
 * and 64. All five prototypes share one parameter list: destination and
 * source pointers, their two strides, the offset table, the left class and
 * the block width and height. The macro already ends in a semicolon and is
 * invoked without one.
 */
#define SAO_BAND_FILTER_FUNCS(bitd, opt) \
void ff_hevc_sao_band_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, \
                                              ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
                                              int16_t *sao_offset_val, int sao_left_class, \
                                              int width, int height); \
void ff_hevc_sao_band_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, \
                                               ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
                                               int16_t *sao_offset_val, int sao_left_class, \
                                               int width, int height); \
void ff_hevc_sao_band_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, \
                                               ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
                                               int16_t *sao_offset_val, int sao_left_class, \
                                               int width, int height); \
void ff_hevc_sao_band_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, \
                                               ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
                                               int16_t *sao_offset_val, int sao_left_class, \
                                               int width, int height); \
void ff_hevc_sao_band_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, \
                                               ptrdiff_t _stride_dst, ptrdiff_t _stride_src, \
                                               int16_t *sao_offset_val, int sao_left_class, \
                                               int width, int height);

SAO_BAND_FILTER_FUNCS(8,  sse2)
SAO_BAND_FILTER_FUNCS(10, sse2)
SAO_BAND_FILTER_FUNCS(12, sse2)
SAO_BAND_FILTER_FUNCS(8,   avx)
SAO_BAND_FILTER_FUNCS(10,  avx)
SAO_BAND_FILTER_FUNCS(12,  avx)
SAO_BAND_FILTER_FUNCS(8,  avx2)
SAO_BAND_FILTER_FUNCS(10, avx2)
SAO_BAND_FILTER_FUNCS(12, avx2)
651 
/*
 * Stores the five band-filter variants for one (bitd, opt) pair into
 * c->sao_band_filter[0..4], in the width order 8, 16, 32, 48, 64.
 * Expects a pointer named `c` to be in scope at the point of use; wrapped
 * in do { } while (0) so it behaves as a single statement.
 */
#define SAO_BAND_INIT(bitd, opt) do {                                   \
    c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_##bitd##_##opt;   \
    c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_##bitd##_##opt;  \
    c->sao_band_filter[2] = ff_hevc_sao_band_filter_32_##bitd##_##opt;  \
    c->sao_band_filter[3] = ff_hevc_sao_band_filter_48_##bitd##_##opt;  \
    c->sao_band_filter[4] = ff_hevc_sao_band_filter_64_##bitd##_##opt;  \
} while (0)
659 
/*
 * Declares ff_hevc_sao_edge_filter_<W>_<bitd>_<opt>() for W = 8, 16, 32, 48
 * and 64. All five prototypes share one parameter list: destination and
 * source pointers, the destination stride, the offset table, the `eo`
 * selector and the block width and height. The macro already ends in a
 * semicolon and is invoked without one.
 *
 * The definition deliberately stops on the last prototype: a trailing
 * line-continuation backslash there would make whatever text sits on the
 * following line part of the macro.
 */
#define SAO_EDGE_FILTER_FUNCS(bitd, opt) \
void ff_hevc_sao_edge_filter_8_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, \
                                              ptrdiff_t stride_dst, \
                                              int16_t *sao_offset_val, \
                                              int eo, int width, int height); \
void ff_hevc_sao_edge_filter_16_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, \
                                               ptrdiff_t stride_dst, \
                                               int16_t *sao_offset_val, \
                                               int eo, int width, int height); \
void ff_hevc_sao_edge_filter_32_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, \
                                               ptrdiff_t stride_dst, \
                                               int16_t *sao_offset_val, \
                                               int eo, int width, int height); \
void ff_hevc_sao_edge_filter_48_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, \
                                               ptrdiff_t stride_dst, \
                                               int16_t *sao_offset_val, \
                                               int eo, int width, int height); \
void ff_hevc_sao_edge_filter_64_##bitd##_##opt(uint8_t *_dst, uint8_t *_src, \
                                               ptrdiff_t stride_dst, \
                                               int16_t *sao_offset_val, \
                                               int eo, int width, int height);

/* 8 bit uses the ssse3 suffix; 10 and 12 bit use sse2. All depths also have avx2. */
SAO_EDGE_FILTER_FUNCS(8, ssse3)
SAO_EDGE_FILTER_FUNCS(8, avx2)
SAO_EDGE_FILTER_FUNCS(10, sse2)
SAO_EDGE_FILTER_FUNCS(10, avx2)
SAO_EDGE_FILTER_FUNCS(12, sse2)
SAO_EDGE_FILTER_FUNCS(12, avx2)
678 
/*
 * Stores the five edge-filter variants for one (bitd, opt) pair into
 * c->sao_edge_filter[0..4], in the width order 8, 16, 32, 48, 64.
 * Expects a pointer named `c` to be in scope at the point of use; wrapped
 * in do { } while (0) so it behaves as a single statement.
 */
#define SAO_EDGE_INIT(bitd, opt) do {                                   \
    c->sao_edge_filter[0] = ff_hevc_sao_edge_filter_8_##bitd##_##opt;   \
    c->sao_edge_filter[1] = ff_hevc_sao_edge_filter_16_##bitd##_##opt;  \
    c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_##bitd##_##opt;  \
    c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_##bitd##_##opt;  \
    c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_##bitd##_##opt;  \
} while (0)
686 
/*
 * Invokes PEL_LINK (defined outside this section) once per table index,
 * pairing index and width suffix as follows:
 *   1 -> 4, 2 -> 6, 3 -> 8, 4 -> 12, 5 -> 16, 6 -> 24, 7 -> 32, 8 -> 48, 9 -> 64.
 * `my` and `mx` are forwarded in that order. The final invocation carries
 * no semicolon, so the caller supplies it.
 */
#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt)       \
    PEL_LINK(pointer, 1, my, mx, fname##4,  bitd, opt);     \
    PEL_LINK(pointer, 2, my, mx, fname##6,  bitd, opt);     \
    PEL_LINK(pointer, 3, my, mx, fname##8,  bitd, opt);     \
    PEL_LINK(pointer, 4, my, mx, fname##12, bitd, opt);     \
    PEL_LINK(pointer, 5, my, mx, fname##16, bitd, opt);     \
    PEL_LINK(pointer, 6, my, mx, fname##24, bitd, opt);     \
    PEL_LINK(pointer, 7, my, mx, fname##32, bitd, opt);     \
    PEL_LINK(pointer, 8, my, mx, fname##48, bitd, opt);     \
    PEL_LINK(pointer, 9, my, mx, fname##64, bitd, opt)
/*
 * Same as EPEL_LINKS but without the width-6 entry: PEL_LINK (defined
 * outside this section) is invoked for indices 1 and 3..9, i.e. widths
 * 4, 8, 12, 16, 24, 32, 48 and 64. Index 2 is left untouched. The final
 * invocation carries no semicolon, so the caller supplies it.
 */
#define QPEL_LINKS(pointer, my, mx, fname, bitd, opt)       \
    PEL_LINK(pointer, 1, my, mx, fname##4,  bitd, opt);     \
    PEL_LINK(pointer, 3, my, mx, fname##8,  bitd, opt);     \
    PEL_LINK(pointer, 4, my, mx, fname##12, bitd, opt);     \
    PEL_LINK(pointer, 5, my, mx, fname##16, bitd, opt);     \
    PEL_LINK(pointer, 6, my, mx, fname##24, bitd, opt);     \
    PEL_LINK(pointer, 7, my, mx, fname##32, bitd, opt);     \
    PEL_LINK(pointer, 8, my, mx, fname##48, bitd, opt);     \
    PEL_LINK(pointer, 9, my, mx, fname##64, bitd, opt)
706 
708 {
709  int cpu_flags = av_get_cpu_flags();
710 
711  if (bit_depth == 8) {
712  if (EXTERNAL_MMXEXT(cpu_flags)) {
713  c->idct_dc[0] = ff_hevc_idct_4x4_dc_8_mmxext;
714 
715  c->add_residual[0] = ff_hevc_add_residual_4_8_mmxext;
716  }
717  if (EXTERNAL_SSE2(cpu_flags)) {
718  c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_sse2;
719  c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_sse2;
720  if (ARCH_X86_64) {
721  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_sse2;
722  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_sse2;
723 
724  c->idct[2] = ff_hevc_idct_16x16_8_sse2;
725  c->idct[3] = ff_hevc_idct_32x32_8_sse2;
726  }
727  SAO_BAND_INIT(8, sse2);
728 
729  c->idct_dc[1] = ff_hevc_idct_8x8_dc_8_sse2;
730  c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_sse2;
731  c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_sse2;
732 
733  c->idct[0] = ff_hevc_idct_4x4_8_sse2;
734  c->idct[1] = ff_hevc_idct_8x8_8_sse2;
735 
736  c->add_residual[1] = ff_hevc_add_residual_8_8_sse2;
737  c->add_residual[2] = ff_hevc_add_residual_16_8_sse2;
738  c->add_residual[3] = ff_hevc_add_residual_32_8_sse2;
739  }
740  if (EXTERNAL_SSSE3(cpu_flags)) {
741  if(ARCH_X86_64) {
742  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_ssse3;
743  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_ssse3;
744  }
745  SAO_EDGE_INIT(8, ssse3);
746  }
747  if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
748 
749  EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 8, sse4);
750  EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 8, sse4);
751  EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 8, sse4);
752  EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 8, sse4);
753 
754  QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 8, sse4);
755  QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 8, sse4);
756  QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 8, sse4);
757  QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 8, sse4);
758  }
759  if (EXTERNAL_AVX(cpu_flags)) {
760  c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_8_avx;
761  c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_8_avx;
762  if (ARCH_X86_64) {
763  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_8_avx;
764  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_8_avx;
765 
766  c->idct[2] = ff_hevc_idct_16x16_8_avx;
767  c->idct[3] = ff_hevc_idct_32x32_8_avx;
768  }
769  SAO_BAND_INIT(8, avx);
770 
771  c->idct[0] = ff_hevc_idct_4x4_8_avx;
772  c->idct[1] = ff_hevc_idct_8x8_8_avx;
773 
774  c->add_residual[1] = ff_hevc_add_residual_8_8_avx;
775  c->add_residual[2] = ff_hevc_add_residual_16_8_avx;
776  c->add_residual[3] = ff_hevc_add_residual_32_8_avx;
777  }
778  if (EXTERNAL_AVX2(cpu_flags)) {
779  c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_8_avx2;
780  c->sao_band_filter[1] = ff_hevc_sao_band_filter_16_8_avx2;
781  }
783  c->idct_dc[2] = ff_hevc_idct_16x16_dc_8_avx2;
784  c->idct_dc[3] = ff_hevc_idct_32x32_dc_8_avx2;
785  if (ARCH_X86_64) {
786  c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
787  c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
788  c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
789 
790  c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_8_avx2;
791  c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_8_avx2;
792  c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_8_avx2;
793 
794  c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
795  c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
796  c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
797 
798  c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
799  c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
800  c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
801 
802  c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
803  c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
804  c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
805 
806  c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_8_avx2;
807  c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_8_avx2;
808  c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_8_avx2;
809 
810  c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_8_avx2;
811  c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_8_avx2;
812  c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_8_avx2;
813 
814  c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_8_avx2;
815  c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_8_avx2;
816  c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_8_avx2;
817 
818  c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_8_avx2;
819  c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_8_avx2;
820  c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_8_avx2;
821 
822  c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_8_avx2;
823  c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_8_avx2;
824  c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_8_avx2;
825 
826  c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_8_avx2;
827  c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_8_avx2;
828  c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_8_avx2;
829 
830  c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_8_avx2;
831  c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_8_avx2;
832  c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_8_avx2;
833 
834  c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_8_avx2;
835  c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_8_avx2;
836  c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_8_avx2;
837 
838  c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_8_avx2;
839  c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_8_avx2;
840  c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_8_avx2;
841 
842  c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_8_avx2;
843  c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_8_avx2;
844  c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_8_avx2;
845 
846  c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx2;
847  c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_8_avx2;
848  c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx2;
849 
850  c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_8_avx2;
851  c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_8_avx2;
852  c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_8_avx2;
853 
854  c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_8_avx2;
855  c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_8_avx2;
856  c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_8_avx2;
857 
858  c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_8_avx2;
859  c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_8_avx2;
860  c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_8_avx2;
861 
862  c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_8_avx2;
863  c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_8_avx2;
864  c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_8_avx2;
865 
866  c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_8_avx2;
867  c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_8_avx2;
868  c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_8_avx2;
869  }
870  SAO_BAND_INIT(8, avx2);
871 
872  c->sao_edge_filter[2] = ff_hevc_sao_edge_filter_32_8_avx2;
873  c->sao_edge_filter[3] = ff_hevc_sao_edge_filter_48_8_avx2;
874  c->sao_edge_filter[4] = ff_hevc_sao_edge_filter_64_8_avx2;
875 
876  c->add_residual[3] = ff_hevc_add_residual_32_8_avx2;
877  }
878  if (EXTERNAL_AVX512ICL(cpu_flags) && ARCH_X86_64) {
879  c->put_hevc_qpel[1][0][1] = ff_hevc_put_hevc_qpel_h4_8_avx512icl;
880  c->put_hevc_qpel[3][0][1] = ff_hevc_put_hevc_qpel_h8_8_avx512icl;
881  c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_8_avx512icl;
882  c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_8_avx512icl;
883  c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_8_avx512icl;
884  c->put_hevc_qpel[3][1][1] = ff_hevc_put_hevc_qpel_hv8_8_avx512icl;
885  }
886  } else if (bit_depth == 10) {
887  if (EXTERNAL_MMXEXT(cpu_flags)) {
888  c->add_residual[0] = ff_hevc_add_residual_4_10_mmxext;
889  c->idct_dc[0] = ff_hevc_idct_4x4_dc_10_mmxext;
890  }
891  if (EXTERNAL_SSE2(cpu_flags)) {
892  c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_sse2;
893  c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_sse2;
894  if (ARCH_X86_64) {
895  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_sse2;
896  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_sse2;
897 
898  c->idct[2] = ff_hevc_idct_16x16_10_sse2;
899  c->idct[3] = ff_hevc_idct_32x32_10_sse2;
900  }
901  SAO_BAND_INIT(10, sse2);
902  SAO_EDGE_INIT(10, sse2);
903 
904  c->idct_dc[1] = ff_hevc_idct_8x8_dc_10_sse2;
905  c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_sse2;
906  c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_sse2;
907 
908  c->idct[0] = ff_hevc_idct_4x4_10_sse2;
909  c->idct[1] = ff_hevc_idct_8x8_10_sse2;
910 
911  c->add_residual[1] = ff_hevc_add_residual_8_10_sse2;
912  c->add_residual[2] = ff_hevc_add_residual_16_10_sse2;
913  c->add_residual[3] = ff_hevc_add_residual_32_10_sse2;
914  }
915  if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
916  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_ssse3;
917  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_ssse3;
918  }
919  if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
920  EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 10, sse4);
921  EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 10, sse4);
922  EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 10, sse4);
923  EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 10, sse4);
924 
925  QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 10, sse4);
926  QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 10, sse4);
927  QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 10, sse4);
928  QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 10, sse4);
929  }
930  if (EXTERNAL_AVX(cpu_flags)) {
931  c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_10_avx;
932  c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_10_avx;
933  if (ARCH_X86_64) {
934  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_10_avx;
935  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_10_avx;
936 
937  c->idct[2] = ff_hevc_idct_16x16_10_avx;
938  c->idct[3] = ff_hevc_idct_32x32_10_avx;
939  }
940 
941  c->idct[0] = ff_hevc_idct_4x4_10_avx;
942  c->idct[1] = ff_hevc_idct_8x8_10_avx;
943 
944  SAO_BAND_INIT(10, avx);
945  }
946  if (EXTERNAL_AVX2(cpu_flags)) {
947  c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_10_avx2;
948  }
950  c->idct_dc[2] = ff_hevc_idct_16x16_dc_10_avx2;
951  c->idct_dc[3] = ff_hevc_idct_32x32_dc_10_avx2;
952  if (ARCH_X86_64) {
953  c->put_hevc_epel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
954  c->put_hevc_epel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
955  c->put_hevc_epel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
956  c->put_hevc_epel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
957  c->put_hevc_epel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
958 
959  c->put_hevc_qpel[5][0][0] = ff_hevc_put_hevc_pel_pixels16_10_avx2;
960  c->put_hevc_qpel[6][0][0] = ff_hevc_put_hevc_pel_pixels24_10_avx2;
961  c->put_hevc_qpel[7][0][0] = ff_hevc_put_hevc_pel_pixels32_10_avx2;
962  c->put_hevc_qpel[8][0][0] = ff_hevc_put_hevc_pel_pixels48_10_avx2;
963  c->put_hevc_qpel[9][0][0] = ff_hevc_put_hevc_pel_pixels64_10_avx2;
964 
965  c->put_hevc_epel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
966  c->put_hevc_epel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
967  c->put_hevc_epel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
968  c->put_hevc_epel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
969  c->put_hevc_epel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
970 
971  c->put_hevc_qpel_uni[5][0][0] = ff_hevc_put_hevc_uni_pel_pixels32_8_avx2;
972  c->put_hevc_qpel_uni[6][0][0] = ff_hevc_put_hevc_uni_pel_pixels48_8_avx2;
973  c->put_hevc_qpel_uni[7][0][0] = ff_hevc_put_hevc_uni_pel_pixels64_8_avx2;
974  c->put_hevc_qpel_uni[8][0][0] = ff_hevc_put_hevc_uni_pel_pixels96_8_avx2;
975  c->put_hevc_qpel_uni[9][0][0] = ff_hevc_put_hevc_uni_pel_pixels128_8_avx2;
976 
977  c->put_hevc_epel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
978  c->put_hevc_epel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
979  c->put_hevc_epel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
980  c->put_hevc_epel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
981  c->put_hevc_epel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
982  c->put_hevc_qpel_bi[5][0][0] = ff_hevc_put_hevc_bi_pel_pixels16_10_avx2;
983  c->put_hevc_qpel_bi[6][0][0] = ff_hevc_put_hevc_bi_pel_pixels24_10_avx2;
984  c->put_hevc_qpel_bi[7][0][0] = ff_hevc_put_hevc_bi_pel_pixels32_10_avx2;
985  c->put_hevc_qpel_bi[8][0][0] = ff_hevc_put_hevc_bi_pel_pixels48_10_avx2;
986  c->put_hevc_qpel_bi[9][0][0] = ff_hevc_put_hevc_bi_pel_pixels64_10_avx2;
987 
988  c->put_hevc_epel[5][0][1] = ff_hevc_put_hevc_epel_h16_10_avx2;
989  c->put_hevc_epel[6][0][1] = ff_hevc_put_hevc_epel_h24_10_avx2;
990  c->put_hevc_epel[7][0][1] = ff_hevc_put_hevc_epel_h32_10_avx2;
991  c->put_hevc_epel[8][0][1] = ff_hevc_put_hevc_epel_h48_10_avx2;
992  c->put_hevc_epel[9][0][1] = ff_hevc_put_hevc_epel_h64_10_avx2;
993 
994  c->put_hevc_epel_uni[5][0][1] = ff_hevc_put_hevc_uni_epel_h16_10_avx2;
995  c->put_hevc_epel_uni[6][0][1] = ff_hevc_put_hevc_uni_epel_h24_10_avx2;
996  c->put_hevc_epel_uni[7][0][1] = ff_hevc_put_hevc_uni_epel_h32_10_avx2;
997  c->put_hevc_epel_uni[8][0][1] = ff_hevc_put_hevc_uni_epel_h48_10_avx2;
998  c->put_hevc_epel_uni[9][0][1] = ff_hevc_put_hevc_uni_epel_h64_10_avx2;
999 
1000  c->put_hevc_epel_bi[5][0][1] = ff_hevc_put_hevc_bi_epel_h16_10_avx2;
1001  c->put_hevc_epel_bi[6][0][1] = ff_hevc_put_hevc_bi_epel_h24_10_avx2;
1002  c->put_hevc_epel_bi[7][0][1] = ff_hevc_put_hevc_bi_epel_h32_10_avx2;
1003  c->put_hevc_epel_bi[8][0][1] = ff_hevc_put_hevc_bi_epel_h48_10_avx2;
1004  c->put_hevc_epel_bi[9][0][1] = ff_hevc_put_hevc_bi_epel_h64_10_avx2;
1005 
1006  c->put_hevc_epel[5][1][0] = ff_hevc_put_hevc_epel_v16_10_avx2;
1007  c->put_hevc_epel[6][1][0] = ff_hevc_put_hevc_epel_v24_10_avx2;
1008  c->put_hevc_epel[7][1][0] = ff_hevc_put_hevc_epel_v32_10_avx2;
1009  c->put_hevc_epel[8][1][0] = ff_hevc_put_hevc_epel_v48_10_avx2;
1010  c->put_hevc_epel[9][1][0] = ff_hevc_put_hevc_epel_v64_10_avx2;
1011 
1012  c->put_hevc_epel_uni[5][1][0] = ff_hevc_put_hevc_uni_epel_v16_10_avx2;
1013  c->put_hevc_epel_uni[6][1][0] = ff_hevc_put_hevc_uni_epel_v24_10_avx2;
1014  c->put_hevc_epel_uni[7][1][0] = ff_hevc_put_hevc_uni_epel_v32_10_avx2;
1015  c->put_hevc_epel_uni[8][1][0] = ff_hevc_put_hevc_uni_epel_v48_10_avx2;
1016  c->put_hevc_epel_uni[9][1][0] = ff_hevc_put_hevc_uni_epel_v64_10_avx2;
1017 
1018  c->put_hevc_epel_bi[5][1][0] = ff_hevc_put_hevc_bi_epel_v16_10_avx2;
1019  c->put_hevc_epel_bi[6][1][0] = ff_hevc_put_hevc_bi_epel_v24_10_avx2;
1020  c->put_hevc_epel_bi[7][1][0] = ff_hevc_put_hevc_bi_epel_v32_10_avx2;
1021  c->put_hevc_epel_bi[8][1][0] = ff_hevc_put_hevc_bi_epel_v48_10_avx2;
1022  c->put_hevc_epel_bi[9][1][0] = ff_hevc_put_hevc_bi_epel_v64_10_avx2;
1023 
1024  c->put_hevc_epel[5][1][1] = ff_hevc_put_hevc_epel_hv16_10_avx2;
1025  c->put_hevc_epel[6][1][1] = ff_hevc_put_hevc_epel_hv24_10_avx2;
1026  c->put_hevc_epel[7][1][1] = ff_hevc_put_hevc_epel_hv32_10_avx2;
1027  c->put_hevc_epel[8][1][1] = ff_hevc_put_hevc_epel_hv48_10_avx2;
1028  c->put_hevc_epel[9][1][1] = ff_hevc_put_hevc_epel_hv64_10_avx2;
1029 
1030  c->put_hevc_epel_uni[5][1][1] = ff_hevc_put_hevc_uni_epel_hv16_10_avx2;
1031  c->put_hevc_epel_uni[6][1][1] = ff_hevc_put_hevc_uni_epel_hv24_10_avx2;
1032  c->put_hevc_epel_uni[7][1][1] = ff_hevc_put_hevc_uni_epel_hv32_10_avx2;
1033  c->put_hevc_epel_uni[8][1][1] = ff_hevc_put_hevc_uni_epel_hv48_10_avx2;
1034  c->put_hevc_epel_uni[9][1][1] = ff_hevc_put_hevc_uni_epel_hv64_10_avx2;
1035 
1036  c->put_hevc_epel_bi[5][1][1] = ff_hevc_put_hevc_bi_epel_hv16_10_avx2;
1037  c->put_hevc_epel_bi[6][1][1] = ff_hevc_put_hevc_bi_epel_hv24_10_avx2;
1038  c->put_hevc_epel_bi[7][1][1] = ff_hevc_put_hevc_bi_epel_hv32_10_avx2;
1039  c->put_hevc_epel_bi[8][1][1] = ff_hevc_put_hevc_bi_epel_hv48_10_avx2;
1040  c->put_hevc_epel_bi[9][1][1] = ff_hevc_put_hevc_bi_epel_hv64_10_avx2;
1041 
1042  c->put_hevc_qpel[5][0][1] = ff_hevc_put_hevc_qpel_h16_10_avx2;
1043  c->put_hevc_qpel[6][0][1] = ff_hevc_put_hevc_qpel_h24_10_avx2;
1044  c->put_hevc_qpel[7][0][1] = ff_hevc_put_hevc_qpel_h32_10_avx2;
1045  c->put_hevc_qpel[8][0][1] = ff_hevc_put_hevc_qpel_h48_10_avx2;
1046  c->put_hevc_qpel[9][0][1] = ff_hevc_put_hevc_qpel_h64_10_avx2;
1047 
1048  c->put_hevc_qpel_uni[5][0][1] = ff_hevc_put_hevc_uni_qpel_h16_10_avx2;
1049  c->put_hevc_qpel_uni[6][0][1] = ff_hevc_put_hevc_uni_qpel_h24_10_avx2;
1050  c->put_hevc_qpel_uni[7][0][1] = ff_hevc_put_hevc_uni_qpel_h32_10_avx2;
1051  c->put_hevc_qpel_uni[8][0][1] = ff_hevc_put_hevc_uni_qpel_h48_10_avx2;
1052  c->put_hevc_qpel_uni[9][0][1] = ff_hevc_put_hevc_uni_qpel_h64_10_avx2;
1053 
1054  c->put_hevc_qpel_bi[5][0][1] = ff_hevc_put_hevc_bi_qpel_h16_10_avx2;
1055  c->put_hevc_qpel_bi[6][0][1] = ff_hevc_put_hevc_bi_qpel_h24_10_avx2;
1056  c->put_hevc_qpel_bi[7][0][1] = ff_hevc_put_hevc_bi_qpel_h32_10_avx2;
1057  c->put_hevc_qpel_bi[8][0][1] = ff_hevc_put_hevc_bi_qpel_h48_10_avx2;
1058  c->put_hevc_qpel_bi[9][0][1] = ff_hevc_put_hevc_bi_qpel_h64_10_avx2;
1059 
1060  c->put_hevc_qpel[5][1][0] = ff_hevc_put_hevc_qpel_v16_10_avx2;
1061  c->put_hevc_qpel[6][1][0] = ff_hevc_put_hevc_qpel_v24_10_avx2;
1062  c->put_hevc_qpel[7][1][0] = ff_hevc_put_hevc_qpel_v32_10_avx2;
1063  c->put_hevc_qpel[8][1][0] = ff_hevc_put_hevc_qpel_v48_10_avx2;
1064  c->put_hevc_qpel[9][1][0] = ff_hevc_put_hevc_qpel_v64_10_avx2;
1065 
1066  c->put_hevc_qpel_uni[5][1][0] = ff_hevc_put_hevc_uni_qpel_v16_10_avx2;
1067  c->put_hevc_qpel_uni[6][1][0] = ff_hevc_put_hevc_uni_qpel_v24_10_avx2;
1068  c->put_hevc_qpel_uni[7][1][0] = ff_hevc_put_hevc_uni_qpel_v32_10_avx2;
1069  c->put_hevc_qpel_uni[8][1][0] = ff_hevc_put_hevc_uni_qpel_v48_10_avx2;
1070  c->put_hevc_qpel_uni[9][1][0] = ff_hevc_put_hevc_uni_qpel_v64_10_avx2;
1071 
1072  c->put_hevc_qpel_bi[5][1][0] = ff_hevc_put_hevc_bi_qpel_v16_10_avx2;
1073  c->put_hevc_qpel_bi[6][1][0] = ff_hevc_put_hevc_bi_qpel_v24_10_avx2;
1074  c->put_hevc_qpel_bi[7][1][0] = ff_hevc_put_hevc_bi_qpel_v32_10_avx2;
1075  c->put_hevc_qpel_bi[8][1][0] = ff_hevc_put_hevc_bi_qpel_v48_10_avx2;
1076  c->put_hevc_qpel_bi[9][1][0] = ff_hevc_put_hevc_bi_qpel_v64_10_avx2;
1077 
1078  c->put_hevc_qpel[5][1][1] = ff_hevc_put_hevc_qpel_hv16_10_avx2;
1079  c->put_hevc_qpel[6][1][1] = ff_hevc_put_hevc_qpel_hv24_10_avx2;
1080  c->put_hevc_qpel[7][1][1] = ff_hevc_put_hevc_qpel_hv32_10_avx2;
1081  c->put_hevc_qpel[8][1][1] = ff_hevc_put_hevc_qpel_hv48_10_avx2;
1082  c->put_hevc_qpel[9][1][1] = ff_hevc_put_hevc_qpel_hv64_10_avx2;
1083 
1084  c->put_hevc_qpel_uni[5][1][1] = ff_hevc_put_hevc_uni_qpel_hv16_10_avx2;
1085  c->put_hevc_qpel_uni[6][1][1] = ff_hevc_put_hevc_uni_qpel_hv24_10_avx2;
1086  c->put_hevc_qpel_uni[7][1][1] = ff_hevc_put_hevc_uni_qpel_hv32_10_avx2;
1087  c->put_hevc_qpel_uni[8][1][1] = ff_hevc_put_hevc_uni_qpel_hv48_10_avx2;
1088  c->put_hevc_qpel_uni[9][1][1] = ff_hevc_put_hevc_uni_qpel_hv64_10_avx2;
1089 
1090  c->put_hevc_qpel_bi[5][1][1] = ff_hevc_put_hevc_bi_qpel_hv16_10_avx2;
1091  c->put_hevc_qpel_bi[6][1][1] = ff_hevc_put_hevc_bi_qpel_hv24_10_avx2;
1092  c->put_hevc_qpel_bi[7][1][1] = ff_hevc_put_hevc_bi_qpel_hv32_10_avx2;
1093  c->put_hevc_qpel_bi[8][1][1] = ff_hevc_put_hevc_bi_qpel_hv48_10_avx2;
1094  c->put_hevc_qpel_bi[9][1][1] = ff_hevc_put_hevc_bi_qpel_hv64_10_avx2;
1095  }
1096  SAO_BAND_INIT(10, avx2);
1097  SAO_EDGE_INIT(10, avx2);
1098 
1099  c->add_residual[2] = ff_hevc_add_residual_16_10_avx2;
1100  c->add_residual[3] = ff_hevc_add_residual_32_10_avx2;
1101  }
1102  } else if (bit_depth == 12) {
1103  if (EXTERNAL_MMXEXT(cpu_flags)) {
1104  c->idct_dc[0] = ff_hevc_idct_4x4_dc_12_mmxext;
1105  }
1106  if (EXTERNAL_SSE2(cpu_flags)) {
1107  c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_sse2;
1108  c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_sse2;
1109  if (ARCH_X86_64) {
1110  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_sse2;
1111  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_sse2;
1112  }
1113  SAO_BAND_INIT(12, sse2);
1114  SAO_EDGE_INIT(12, sse2);
1115 
1116  c->idct_dc[1] = ff_hevc_idct_8x8_dc_12_sse2;
1117  c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_sse2;
1118  c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_sse2;
1119  }
1120  if (EXTERNAL_SSSE3(cpu_flags) && ARCH_X86_64) {
1121  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_ssse3;
1122  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_ssse3;
1123  }
1124  if (EXTERNAL_SSE4(cpu_flags) && ARCH_X86_64) {
1125  EPEL_LINKS(c->put_hevc_epel, 0, 0, pel_pixels, 12, sse4);
1126  EPEL_LINKS(c->put_hevc_epel, 0, 1, epel_h, 12, sse4);
1127  EPEL_LINKS(c->put_hevc_epel, 1, 0, epel_v, 12, sse4);
1128  EPEL_LINKS(c->put_hevc_epel, 1, 1, epel_hv, 12, sse4);
1129 
1130  QPEL_LINKS(c->put_hevc_qpel, 0, 0, pel_pixels, 12, sse4);
1131  QPEL_LINKS(c->put_hevc_qpel, 0, 1, qpel_h, 12, sse4);
1132  QPEL_LINKS(c->put_hevc_qpel, 1, 0, qpel_v, 12, sse4);
1133  QPEL_LINKS(c->put_hevc_qpel, 1, 1, qpel_hv, 12, sse4);
1134  }
1135  if (EXTERNAL_AVX(cpu_flags)) {
1136  c->hevc_v_loop_filter_chroma = ff_hevc_v_loop_filter_chroma_12_avx;
1137  c->hevc_h_loop_filter_chroma = ff_hevc_h_loop_filter_chroma_12_avx;
1138  if (ARCH_X86_64) {
1139  c->hevc_v_loop_filter_luma = ff_hevc_v_loop_filter_luma_12_avx;
1140  c->hevc_h_loop_filter_luma = ff_hevc_h_loop_filter_luma_12_avx;
1141  }
1142  SAO_BAND_INIT(12, avx);
1143  }
1144  if (EXTERNAL_AVX2(cpu_flags)) {
1145  c->sao_band_filter[0] = ff_hevc_sao_band_filter_8_12_avx2;
1146  }
1148  c->idct_dc[2] = ff_hevc_idct_16x16_dc_12_avx2;
1149  c->idct_dc[3] = ff_hevc_idct_32x32_dc_12_avx2;
1150 
1151  SAO_BAND_INIT(12, avx2);
1152  SAO_EDGE_INIT(12, avx2);
1153  }
1154  }
1155 }
bit_depth
static void bit_depth(AudioStatsContext *s, uint64_t mask, uint64_t imask, AVRational *depth)
Definition: af_astats.c:226
cpu.h
LFL_FUNCS
#define LFL_FUNCS(type, depth, opt)
Definition: hevcdsp_init.c:41
mem_internal.h
ff_hevc_add_residual_16_8_sse2
void ff_hevc_add_residual_16_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
ff_hevc_put_hevc_uni_pel_pixels48_8_avx2
void ff_hevc_put_hevc_uni_pel_pixels48_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_add_residual_32_8_sse2
void ff_hevc_add_residual_32_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
ff_hevc_put_hevc_bi_pel_pixels16_10_avx2
void ff_hevc_put_hevc_bi_pel_pixels16_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_add_residual_32_8_avx
void ff_hevc_add_residual_32_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride)
ff_hevc_add_residual_16_10_avx2
void ff_hevc_add_residual_16_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
mc_rep_funcs2
#define mc_rep_funcs2(name, bitd, step1, step2, W, opt)
Definition: hevcdsp_init.c:168
EXTERNAL_AVX2_FAST
#define EXTERNAL_AVX2_FAST(flags)
Definition: cpu.h:79
mc_rep_func
#define mc_rep_func(name, bitd, step, W, opt)
Definition: hevcdsp_init.c:86
ff_hevc_add_residual_32_10_sse2
void ff_hevc_add_residual_32_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
ff_hevc_add_residual_8_8_sse2
void ff_hevc_add_residual_8_8_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
ff_hevc_put_hevc_qpel_hv8_8_avx512icl
void ff_hevc_put_hevc_qpel_hv8_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_uni_pel_pixels96_8_avx2
void ff_hevc_put_hevc_uni_pel_pixels96_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:101
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:52
ff_hevc_put_hevc_bi_pel_pixels64_10_avx2
void ff_hevc_put_hevc_bi_pel_pixels64_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width)
QPEL_LINKS
#define QPEL_LINKS(pointer, my, mx, fname, bitd, opt)
Definition: hevcdsp_init.c:697
IDCT_FUNCS
#define IDCT_FUNCS(opt)
Definition: hevcdsp_init.c:73
ff_hevc_add_residual_4_8_mmxext
void ff_hevc_add_residual_4_8_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
ff_hevc_add_residual_32_10_avx2
void ff_hevc_add_residual_32_10_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
ff_hevc_put_hevc_bi_pel_pixels64_8_avx2
void ff_hevc_put_hevc_bi_pel_pixels64_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_add_residual_32_8_avx2
void ff_hevc_add_residual_32_8_avx2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
EPEL_LINKS
#define EPEL_LINKS(pointer, my, mx, fname, bitd, opt)
Definition: hevcdsp_init.c:687
EXTERNAL_AVX2
#define EXTERNAL_AVX2(flags)
Definition: cpu.h:78
ff_hevc_add_residual_16_10_sse2
void ff_hevc_add_residual_16_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
LFC_FUNCS
#define LFC_FUNCS(type, depth, opt)
Definition: hevcdsp_init.c:37
ff_hevc_put_hevc_bi_pel_pixels48_10_avx2
void ff_hevc_put_hevc_bi_pel_pixels48_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width)
hevcdsp.h
ff_hevc_put_hevc_uni_pel_pixels64_8_avx2
void ff_hevc_put_hevc_uni_pel_pixels64_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_dsp_init_x86
void ff_hevc_dsp_init_x86(HEVCDSPContext *c, const int bit_depth)
Definition: hevcdsp_init.c:707
hevcdsp.h
ff_hevc_put_hevc_qpel_h8_8_avx512icl
void ff_hevc_put_hevc_qpel_h8_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_qpel_h32_8_avx512icl
void ff_hevc_put_hevc_qpel_h32_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_pel_pixels16_10_avx2
void ff_hevc_put_hevc_pel_pixels16_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
c
Undefined Behavior: In the C language, some operations are undefined, like signed integer overflow, dereferencing freed pointers, accessing outside allocated space, ... Undefined Behavior must not occur in a C program; it is not safe even if the output of undefined operations is unused. The unsafety may seem nit picking, but optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs. Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations. The signed integer overflow problem in speed critical code: Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c... [tooltip truncated]
Definition: undefined.txt:32
ff_hevc_add_residual_8_10_sse2
void ff_hevc_add_residual_8_10_sse2(uint8_t *dst, int16_t *res, ptrdiff_t stride)
ff_hevc_put_hevc_qpel_h16_8_avx512icl
void ff_hevc_put_hevc_qpel_h16_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_uni_pel_pixels128_8_avx2
void ff_hevc_put_hevc_uni_pel_pixels128_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
cpu.h
ff_hevc_put_hevc_pel_pixels48_10_avx2
void ff_hevc_put_hevc_pel_pixels48_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
asm.h
HEVCDSPContext
Definition: hevcdsp.h:47
SAO_BAND_FILTER_FUNCS
#define SAO_BAND_FILTER_FUNCS(bitd, opt)
Definition: hevcdsp_init.c:630
EXTERNAL_SSE2
#define EXTERNAL_SSE2(flags)
Definition: cpu.h:59
SAO_EDGE_FILTER_FUNCS
#define SAO_EDGE_FILTER_FUNCS(bitd, opt)
Definition: hevcdsp_init.c:660
SAO_EDGE_INIT
#define SAO_EDGE_INIT(bitd, opt)
Definition: hevcdsp_init.c:679
ff_hevc_put_hevc_bi_pel_pixels24_10_avx2
void ff_hevc_put_hevc_bi_pel_pixels24_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_pel_pixels64_10_avx2
void ff_hevc_put_hevc_pel_pixels64_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_pel_pixels48_8_avx2
void ff_hevc_put_hevc_pel_pixels48_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
IDCT_DC_FUNCS
#define IDCT_DC_FUNCS(W, opt)
Definition: hevcdsp_init.c:61
mc_rep_funcs
#define mc_rep_funcs(name, bitd, step, W, opt)
Definition: hevcdsp_init.c:133
ff_hevc_put_hevc_qpel_h64_8_avx512icl
void ff_hevc_put_hevc_qpel_h64_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_bi_pel_pixels48_8_avx2
void ff_hevc_put_hevc_bi_pel_pixels48_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_uni_pel_pixels32_8_avx2
void ff_hevc_put_hevc_uni_pel_pixels32_8_avx2(uint8_t *dst, ptrdiff_t dststride, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_pel_pixels64_8_avx2
void ff_hevc_put_hevc_pel_pixels64_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
mc_rep_bi_func
#define mc_rep_bi_func(name, bitd, step, W, opt)
Definition: hevcdsp_init.c:115
EXTERNAL_AVX
#define EXTERNAL_AVX(flags)
Definition: cpu.h:70
EXTERNAL_SSE4
#define EXTERNAL_SSE4(flags)
Definition: cpu.h:68
ff_hevc_add_residual_8_8_avx
void ff_hevc_add_residual_8_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride)
mc_rep_uni_func
#define mc_rep_uni_func(name, bitd, step, W, opt)
Definition: hevcdsp_init.c:100
EXTERNAL_AVX512ICL
#define EXTERNAL_AVX512ICL(flags)
Definition: cpu.h:83
SAO_BAND_INIT
#define SAO_BAND_INIT(bitd, opt)
Definition: hevcdsp_init.c:652
ff_hevc_add_residual_16_8_avx
void ff_hevc_add_residual_16_8_avx(uint8_t *dst, int16_t *res, ptrdiff_t stride)
ff_hevc_add_residual_4_10_mmxext
void ff_hevc_add_residual_4_10_mmxext(uint8_t *dst, int16_t *res, ptrdiff_t stride)
ff_hevc_put_hevc_qpel_h4_8_avx512icl
void ff_hevc_put_hevc_qpel_h4_8_avx512icl(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_bi_pel_pixels32_8_avx2
void ff_hevc_put_hevc_bi_pel_pixels32_8_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width)
EXTERNAL_SSSE3
#define EXTERNAL_SSSE3(flags)
Definition: cpu.h:65
ff_hevc_put_hevc_pel_pixels32_10_avx2
void ff_hevc_put_hevc_pel_pixels32_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_pel_pixels32_8_avx2
void ff_hevc_put_hevc_pel_pixels32_8_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)
EXTERNAL_MMXEXT
#define EXTERNAL_MMXEXT(flags)
Definition: cpu.h:57
ff_hevc_put_hevc_bi_pel_pixels32_10_avx2
void ff_hevc_put_hevc_bi_pel_pixels32_10_avx2(uint8_t *_dst, ptrdiff_t _dststride, uint8_t *_src, ptrdiff_t _srcstride, int16_t *src2, int height, intptr_t mx, intptr_t my, int width)
ff_hevc_put_hevc_pel_pixels24_10_avx2
void ff_hevc_put_hevc_pel_pixels24_10_avx2(int16_t *dst, uint8_t *_src, ptrdiff_t _srcstride, int height, intptr_t mx, intptr_t my, int width)