FFmpeg
ops.c
Go to the documentation of this file.
1 /**
2  * Copyright (C) 2025 Niklas Haas
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include <float.h>
22 
23 #include "libavutil/avassert.h"
24 #include "libavutil/mem.h"
25 
26 #include "../ops_chain.h"
27 
/* Declare a constant SwsOpEntry named op_NAME with the given pixel type;
 * remaining designated initializers are passed through verbatim. */
#define DECL_ENTRY(TYPE, NAME, ...)                                            \
    static const SwsOpEntry op_##NAME = {                                      \
        .type = SWS_PIXEL_##TYPE,                                              \
        __VA_ARGS__                                                            \
    }

/* Declare an entry backed by an assembly implementation ff_NAME(); the
 * prototype is forward-declared here so the table can reference it. */
#define DECL_ASM(TYPE, NAME, ...)                                              \
    void ff_##NAME(void);                                                      \
    DECL_ENTRY(TYPE, NAME,                                                     \
        .func = ff_##NAME,                                                     \
        __VA_ARGS__)

/* Declare an ASM entry specialized for the component pattern XYZW; a zero
 * component marks the corresponding channel as unused. */
#define DECL_PATTERN(TYPE, NAME, X, Y, Z, W, ...)                              \
    DECL_ASM(TYPE, p##X##Y##Z##W##_##NAME,                                     \
        .unused = { !X, !Y, !Z, !W },                                          \
        __VA_ARGS__                                                            \
    )

/* Reference a pattern-specialized entry declared via DECL_PATTERN. */
#define REF_PATTERN(NAME, X, Y, Z, W)                                          \
    &op_p##X##Y##Z##W##_##NAME

/* Declare the four component patterns that occur in practice:
 * gray, gray+alpha, 3 components, 4 components. */
#define DECL_COMMON_PATTERNS(TYPE, NAME, ...)                                  \
    DECL_PATTERN(TYPE, NAME, 1, 0, 0, 0, __VA_ARGS__);                         \
    DECL_PATTERN(TYPE, NAME, 1, 0, 0, 1, __VA_ARGS__);                         \
    DECL_PATTERN(TYPE, NAME, 1, 1, 1, 0, __VA_ARGS__);                         \
    DECL_PATTERN(TYPE, NAME, 1, 1, 1, 1, __VA_ARGS__)                          \

/* Reference all four entries declared by DECL_COMMON_PATTERNS. */
#define REF_COMMON_PATTERNS(NAME)                                              \
    REF_PATTERN(NAME, 1, 0, 0, 0),                                             \
    REF_PATTERN(NAME, 1, 0, 0, 1),                                             \
    REF_PATTERN(NAME, 1, 1, 1, 0),                                             \
    REF_PATTERN(NAME, 1, 1, 1, 1)
60 
/* Declare a read/write entry. ELEMS is the number of components, PACKED
 * selects interleaved (vs planar) layout. FRAC appears to be the log2 of the
 * number of elements packed per byte (1 for nibbles, 3 for bits; 0 for whole
 * bytes) — TODO confirm against the SwsReadWriteOp definition in ops.h. */
#define DECL_RW(EXT, TYPE, NAME, OP, ELEMS, PACKED, FRAC)                      \
    DECL_ASM(TYPE, NAME##ELEMS##EXT,                                           \
        .op = SWS_OP_##OP,                                                     \
        .rw = { .elems = ELEMS, .packed = PACKED, .frac = FRAC },              \
    );

/* Declare packed (interleaved) reads/writes for 2..4 components at DEPTH. */
#define DECL_PACKED_RW(EXT, DEPTH)                                             \
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed,  READ,  2, true, 0)           \
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed,  READ,  3, true, 0)           \
    DECL_RW(EXT, U##DEPTH, read##DEPTH##_packed,  READ,  4, true, 0)           \
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 2, true, 0)           \
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 3, true, 0)           \
    DECL_RW(EXT, U##DEPTH, write##DEPTH##_packed, WRITE, 4, true, 0)           \

/* Declare a matching pack/unpack pair for the bit pattern X:Y:Z:W
 * (e.g. 5,6,5,0 for RGB565). */
#define DECL_PACK_UNPACK(EXT, TYPE, X, Y, Z, W)                                \
    DECL_ASM(TYPE, pack_##X##Y##Z##W##EXT,                                     \
        .op = SWS_OP_PACK,                                                     \
        .pack.pattern = {X, Y, Z, W},                                          \
    );                                                                         \
                                                                               \
    DECL_ASM(TYPE, unpack_##X##Y##Z##W##EXT,                                   \
        .op = SWS_OP_UNPACK,                                                   \
        .pack.pattern = {X, Y, Z, W},                                          \
    );                                                                         \

86 static int setup_swap_bytes(const SwsOp *op, SwsOpPriv *out)
87 {
88  const int mask = ff_sws_pixel_type_size(op->type) - 1;
89  for (int i = 0; i < 16; i++)
90  out->u8[i] = (i & ~mask) | (mask - (i & mask));
91  return 0;
92 }
93 
/* Byte-swap entry: reuses the generic pattern shuffle kernel together with a
 * mask generated at setup time by setup_swap_bytes(). */
#define DECL_SWAP_BYTES(EXT, TYPE, X, Y, Z, W)                                 \
    DECL_ENTRY(TYPE, p##X##Y##Z##W##_swap_bytes_##TYPE##EXT,                   \
        .op = SWS_OP_SWAP_BYTES,                                               \
        .unused = { !X, !Y, !Z, !W },                                          \
        .func = ff_p##X##Y##Z##W##_shuffle##EXT,                               \
        .setup = setup_swap_bytes,                                             \
    );

/* Clear component IDX to an all-ones value (alpha = fully opaque). */
#define DECL_CLEAR_ALPHA(EXT, IDX)                                             \
    DECL_ASM(U8, clear_alpha##IDX##EXT,                                        \
        .op = SWS_OP_CLEAR,                                                    \
        .clear_value = -1,                                                     \
        .unused[IDX] = true,                                                   \
    );                                                                         \

/* Clear component IDX to zero. */
#define DECL_CLEAR_ZERO(EXT, IDX)                                              \
    DECL_ASM(U8, clear_zero##IDX##EXT,                                         \
        .op = SWS_OP_CLEAR,                                                    \
        .clear_value = 0,                                                      \
        .unused[IDX] = true,                                                   \
    );
115 
116 static int setup_clear(const SwsOp *op, SwsOpPriv *out)
117 {
118  for (int i = 0; i < 4; i++)
119  out->u32[i] = (uint32_t) op->c.q4[i].num;
120  return 0;
121 }
122 
/* Generic clear for a given used/unused pattern; the actual clear values are
 * loaded at setup time, so one kernel serves arbitrary constants. */
#define DECL_CLEAR(EXT, X, Y, Z, W)                                            \
    DECL_PATTERN(U8, clear##EXT, X, Y, Z, W,                                   \
        .op = SWS_OP_CLEAR,                                                    \
        .setup = setup_clear,                                                  \
        .flexible = true,                                                      \
    );

/* Fixed component permutation; .swizzle.in gives the source index for each
 * output component. */
#define DECL_SWIZZLE(EXT, X, Y, Z, W)                                          \
    DECL_ASM(U8, swizzle_##X##Y##Z##W##EXT,                                    \
        .op = SWS_OP_SWIZZLE,                                                  \
        .swizzle.in = {X, Y, Z, W},                                            \
    );

/* Pixel type conversion FROM -> TO for all common component patterns. */
#define DECL_CONVERT(EXT, FROM, TO)                                            \
    DECL_COMMON_PATTERNS(FROM, convert_##FROM##_##TO##EXT,                     \
        .op = SWS_OP_CONVERT,                                                  \
        .convert.to = SWS_PIXEL_##TO,                                          \
    );

/* Like DECL_CONVERT, but with bit replication (.convert.expand) rather than
 * plain widening. */
#define DECL_EXPAND(EXT, FROM, TO)                                             \
    DECL_COMMON_PATTERNS(FROM, expand_##FROM##_##TO##EXT,                      \
        .op = SWS_OP_CONVERT,                                                  \
        .convert.to = SWS_PIXEL_##TO,                                          \
        .convert.expand = true,                                                \
    );
148 
/* Store the shift amount (from the op's integer constant) as the first u16
 * of the private data, where the shift kernels expect it. */
static int setup_shift(const SwsOp *op, SwsOpPriv *out)
{
    out->u16[0] = op->c.u;
    return 0;
}
154 
/* Left/right shift on 16-bit values; the shift amount is loaded at setup
 * time, so one kernel covers all amounts (.flexible). */
#define DECL_SHIFT16(EXT)                                                      \
    DECL_COMMON_PATTERNS(U16, lshift16##EXT,                                   \
        .op = SWS_OP_LSHIFT,                                                   \
        .setup = setup_shift,                                                  \
        .flexible = true,                                                      \
    );                                                                         \
                                                                               \
    DECL_COMMON_PATTERNS(U16, rshift16##EXT,                                   \
        .op = SWS_OP_RSHIFT,                                                   \
        .setup = setup_shift,                                                  \
        .flexible = true,                                                      \
    );

/* Per-component float min/max against constants loaded by ff_sws_setup_q4. */
#define DECL_MIN_MAX(EXT)                                                      \
    DECL_COMMON_PATTERNS(F32, min##EXT,                                        \
        .op = SWS_OP_MIN,                                                      \
        .setup = ff_sws_setup_q4,                                              \
        .flexible = true,                                                      \
    );                                                                         \
                                                                               \
    DECL_COMMON_PATTERNS(F32, max##EXT,                                        \
        .op = SWS_OP_MAX,                                                      \
        .setup = ff_sws_setup_q4,                                              \
        .flexible = true,                                                      \
    );

/* Multiply by a scalar constant loaded by ff_sws_setup_q. */
#define DECL_SCALE(EXT)                                                        \
    DECL_COMMON_PATTERNS(F32, scale##EXT,                                      \
        .op = SWS_OP_SCALE,                                                    \
        .setup = ff_sws_setup_q,                                               \
        .flexible = true,                                                      \
    );

/* Fixed scale by (2^BITS - 1), used when expanding normalized bit depths. */
#define DECL_EXPAND_BITS(EXT, BITS)                                            \
    DECL_ASM(U##BITS, expand_bits##BITS##EXT,                                  \
        .op = SWS_OP_SCALE,                                                    \
        .scale = { .num = ((1 << (BITS)) - 1), .den = 1 },                     \
    );
193 
/* Convert the op's rational dither matrix into the float layout expected by
 * the dither kernels. For a 1x1 matrix the single constant is stored inline;
 * otherwise a heap-allocated matrix (owned by the chain, freed via av_free)
 * is built, with extra duplicated rows appended so that per-plane row offsets
 * can over-read past the end, plus per-plane byte offsets in out->i16[4..7].
 * Returns 0 on success or AVERROR(ENOMEM). */
static int setup_dither(const SwsOp *op, SwsOpPriv *out)
{
    /* 1x1 matrix / single constant */
    if (!op->dither.size_log2) {
        const AVRational k = op->dither.matrix[0];
        out->f32[0] = (float) k.num / k.den;
        return 0;
    }

    const int size = 1 << op->dither.size_log2;
    const int8_t *off = op->dither.y_offset;
    int max_offset = 0;
    /* Largest row offset actually used by any of the four planes; negative
     * offsets mean "no dither for this plane" and are skipped. */
    for (int i = 0; i < 4; i++) {
        if (off[i] >= 0)
            max_offset = FFMAX(max_offset, off[i] & (size - 1));
    }

    /* Allocate extra rows to allow over-reading for row offsets. Note that
     * max_offset is currently never larger than 5, so the extra space needed
     * for this over-allocation is bounded by 5 * size * sizeof(float),
     * typically 320 bytes for a 16x16 dither matrix. */
    const int stride = size * sizeof(float);
    const int num_rows = size + max_offset;
    float *matrix = out->ptr = av_mallocz(num_rows * stride);
    if (!matrix)
        return AVERROR(ENOMEM);

    for (int i = 0; i < size * size; i++)
        matrix[i] = (float) op->dither.matrix[i].num / op->dither.matrix[i].den;

    /* Duplicate the first max_offset rows after the matrix proper. */
    memcpy(&matrix[size * size], matrix, max_offset * stride);

    /* Store relative pointer offset to each row inside extra space */
    static_assert(sizeof(out->ptr) <= sizeof(int16_t[4]), ">8 byte pointers not supported");
    assert(max_offset * stride <= INT16_MAX);
    int16_t *off_out = &out->i16[4];
    for (int i = 0; i < 4; i++)
        off_out[i] = off[i] >= 0 ? (off[i] & (size - 1)) * stride : -1;

    return 0;
}
235 
/* Dither op entry; SIZE is the log2 of the dither matrix dimension, with 0
 * meaning a single constant (no allocation, hence no .free callback). The
 * declaring macro is parameterized so size 0 can use the common patterns
 * while sized variants are plain ASM entries. */
#define DECL_DITHER(DECL_MACRO, EXT, SIZE)                                     \
    DECL_MACRO(F32, dither##SIZE##EXT,                                         \
        .op = SWS_OP_DITHER,                                                   \
        .setup = setup_dither,                                                 \
        .free = (SIZE) ? av_free : NULL,                                       \
        .dither_size = SIZE,                                                   \
    );
243 
244 static int setup_linear(const SwsOp *op, SwsOpPriv *out)
245 {
246  float *matrix = out->ptr = av_mallocz(sizeof(float[4][5]));
247  if (!matrix)
248  return AVERROR(ENOMEM);
249 
250  for (int y = 0; y < 4; y++) {
251  for (int x = 0; x < 5; x++)
252  matrix[y * 5 + x] = (float) op->lin.m[y][x].num / op->lin.m[y][x].den;
253  }
254 
255  return 0;
256 }
257 
/* Linear transform entry specialized for a particular coefficient mask
 * (which matrix cells are non-trivial); coefficients are converted to floats
 * at setup time and owned by the chain. */
#define DECL_LINEAR(EXT, NAME, MASK)                                           \
    DECL_ASM(F32, NAME##EXT,                                                   \
        .op = SWS_OP_LINEAR,                                                   \
        .setup = setup_linear,                                                 \
        .free = av_free,                                                       \
        .linear_mask = (MASK),                                                 \
    );
265 
/* Declare all 8-bit entries for a given block SIZE / name suffix EXT / CPU
 * FLAG, plus the op table (ops8##EXT) listing them. The ff_pXYZW_shuffle
 * prototypes are forward-declared here because the byte-swap entries in the
 * 16/32-bit tables reuse those generic shuffle kernels. */
#define DECL_FUNCS_8(SIZE, EXT, FLAG)                                          \
    DECL_RW(EXT, U8, read_planar,  READ,  1, false, 0)                         \
    DECL_RW(EXT, U8, read_planar,  READ,  2, false, 0)                         \
    DECL_RW(EXT, U8, read_planar,  READ,  3, false, 0)                         \
    DECL_RW(EXT, U8, read_planar,  READ,  4, false, 0)                         \
    DECL_RW(EXT, U8, write_planar, WRITE, 1, false, 0)                         \
    DECL_RW(EXT, U8, write_planar, WRITE, 2, false, 0)                         \
    DECL_RW(EXT, U8, write_planar, WRITE, 3, false, 0)                         \
    DECL_RW(EXT, U8, write_planar, WRITE, 4, false, 0)                         \
    DECL_RW(EXT, U8, read_nibbles, READ,  1, false, 1)                         \
    DECL_RW(EXT, U8, read_bits,    READ,  1, false, 3)                         \
    DECL_RW(EXT, U8, write_bits,   WRITE, 1, false, 3)                         \
    DECL_EXPAND_BITS(EXT, 8)                                                   \
    DECL_PACKED_RW(EXT, 8)                                                     \
    DECL_PACK_UNPACK(EXT, U8, 1, 2, 1, 0)                                      \
    DECL_PACK_UNPACK(EXT, U8, 3, 3, 2, 0)                                      \
    DECL_PACK_UNPACK(EXT, U8, 2, 3, 3, 0)                                      \
    void ff_p1000_shuffle##EXT(void);                                          \
    void ff_p1001_shuffle##EXT(void);                                          \
    void ff_p1110_shuffle##EXT(void);                                          \
    void ff_p1111_shuffle##EXT(void);                                          \
    DECL_SWIZZLE(EXT, 3, 0, 1, 2)                                              \
    DECL_SWIZZLE(EXT, 3, 0, 2, 1)                                              \
    DECL_SWIZZLE(EXT, 2, 1, 0, 3)                                              \
    DECL_SWIZZLE(EXT, 3, 2, 1, 0)                                              \
    DECL_SWIZZLE(EXT, 3, 1, 0, 2)                                              \
    DECL_SWIZZLE(EXT, 3, 2, 0, 1)                                              \
    DECL_SWIZZLE(EXT, 1, 2, 0, 3)                                              \
    DECL_SWIZZLE(EXT, 1, 0, 2, 3)                                              \
    DECL_SWIZZLE(EXT, 2, 0, 1, 3)                                              \
    DECL_SWIZZLE(EXT, 2, 3, 1, 0)                                              \
    DECL_SWIZZLE(EXT, 2, 1, 3, 0)                                              \
    DECL_SWIZZLE(EXT, 1, 2, 3, 0)                                              \
    DECL_SWIZZLE(EXT, 1, 3, 2, 0)                                              \
    DECL_SWIZZLE(EXT, 0, 2, 1, 3)                                              \
    DECL_SWIZZLE(EXT, 0, 2, 3, 1)                                              \
    DECL_SWIZZLE(EXT, 0, 3, 1, 2)                                              \
    DECL_SWIZZLE(EXT, 3, 1, 2, 0)                                              \
    DECL_SWIZZLE(EXT, 0, 3, 2, 1)                                              \
    DECL_SWIZZLE(EXT, 0, 0, 0, 3)                                              \
    DECL_SWIZZLE(EXT, 3, 0, 0, 0)                                              \
    DECL_SWIZZLE(EXT, 0, 0, 0, 1)                                              \
    DECL_SWIZZLE(EXT, 1, 0, 0, 0)                                              \
    DECL_CLEAR_ALPHA(EXT, 0)                                                   \
    DECL_CLEAR_ALPHA(EXT, 1)                                                   \
    DECL_CLEAR_ALPHA(EXT, 3)                                                   \
    DECL_CLEAR_ZERO(EXT, 0)                                                    \
    DECL_CLEAR_ZERO(EXT, 1)                                                    \
    DECL_CLEAR_ZERO(EXT, 3)                                                    \
    DECL_CLEAR(EXT, 1, 1, 1, 0)                                                \
    DECL_CLEAR(EXT, 0, 1, 1, 1)                                                \
    DECL_CLEAR(EXT, 0, 0, 1, 1)                                                \
    DECL_CLEAR(EXT, 1, 0, 0, 1)                                                \
    DECL_CLEAR(EXT, 1, 1, 0, 0)                                                \
    DECL_CLEAR(EXT, 0, 1, 0, 1)                                                \
    DECL_CLEAR(EXT, 1, 0, 1, 0)                                                \
    DECL_CLEAR(EXT, 1, 0, 0, 0)                                                \
    DECL_CLEAR(EXT, 0, 1, 0, 0)                                                \
    DECL_CLEAR(EXT, 0, 0, 1, 0)                                                \
                                                                               \
static const SwsOpTable ops8##EXT = {                                          \
    .cpu_flags  = AV_CPU_FLAG_##FLAG,                                          \
    .block_size = SIZE,                                                        \
    .entries = {                                                               \
        &op_read_planar1##EXT,                                                 \
        &op_read_planar2##EXT,                                                 \
        &op_read_planar3##EXT,                                                 \
        &op_read_planar4##EXT,                                                 \
        &op_write_planar1##EXT,                                                \
        &op_write_planar2##EXT,                                                \
        &op_write_planar3##EXT,                                                \
        &op_write_planar4##EXT,                                                \
        &op_read8_packed2##EXT,                                                \
        &op_read8_packed3##EXT,                                                \
        &op_read8_packed4##EXT,                                                \
        &op_write8_packed2##EXT,                                               \
        &op_write8_packed3##EXT,                                               \
        &op_write8_packed4##EXT,                                               \
        &op_read_nibbles1##EXT,                                                \
        &op_read_bits1##EXT,                                                   \
        &op_write_bits1##EXT,                                                  \
        &op_expand_bits8##EXT,                                                 \
        &op_pack_1210##EXT,                                                    \
        &op_pack_3320##EXT,                                                    \
        &op_pack_2330##EXT,                                                    \
        &op_unpack_1210##EXT,                                                  \
        &op_unpack_3320##EXT,                                                  \
        &op_unpack_2330##EXT,                                                  \
        &op_swizzle_3012##EXT,                                                 \
        &op_swizzle_3021##EXT,                                                 \
        &op_swizzle_2103##EXT,                                                 \
        &op_swizzle_3210##EXT,                                                 \
        &op_swizzle_3102##EXT,                                                 \
        &op_swizzle_3201##EXT,                                                 \
        &op_swizzle_1203##EXT,                                                 \
        &op_swizzle_1023##EXT,                                                 \
        &op_swizzle_2013##EXT,                                                 \
        &op_swizzle_2310##EXT,                                                 \
        &op_swizzle_2130##EXT,                                                 \
        &op_swizzle_1230##EXT,                                                 \
        &op_swizzle_1320##EXT,                                                 \
        &op_swizzle_0213##EXT,                                                 \
        &op_swizzle_0231##EXT,                                                 \
        &op_swizzle_0312##EXT,                                                 \
        &op_swizzle_3120##EXT,                                                 \
        &op_swizzle_0321##EXT,                                                 \
        &op_swizzle_0003##EXT,                                                 \
        &op_swizzle_0001##EXT,                                                 \
        &op_swizzle_3000##EXT,                                                 \
        &op_swizzle_1000##EXT,                                                 \
        &op_clear_alpha0##EXT,                                                 \
        &op_clear_alpha1##EXT,                                                 \
        &op_clear_alpha3##EXT,                                                 \
        &op_clear_zero0##EXT,                                                  \
        &op_clear_zero1##EXT,                                                  \
        &op_clear_zero3##EXT,                                                  \
        REF_PATTERN(clear##EXT, 1, 1, 1, 0),                                   \
        REF_PATTERN(clear##EXT, 0, 1, 1, 1),                                   \
        REF_PATTERN(clear##EXT, 0, 0, 1, 1),                                   \
        REF_PATTERN(clear##EXT, 1, 0, 0, 1),                                   \
        REF_PATTERN(clear##EXT, 1, 1, 0, 0),                                   \
        REF_PATTERN(clear##EXT, 0, 1, 0, 1),                                   \
        REF_PATTERN(clear##EXT, 1, 0, 1, 0),                                   \
        REF_PATTERN(clear##EXT, 1, 0, 0, 0),                                   \
        REF_PATTERN(clear##EXT, 0, 1, 0, 0),                                   \
        REF_PATTERN(clear##EXT, 0, 0, 1, 0),                                   \
        NULL                                                                   \
    },                                                                         \
};
395 
/* Declare all 16-bit entries for a given block SIZE / suffix EXT / CPU FLAG,
 * plus the op table (ops16##EXT) listing them. */
#define DECL_FUNCS_16(SIZE, EXT, FLAG)                                         \
    DECL_PACKED_RW(EXT, 16)                                                    \
    DECL_EXPAND_BITS(EXT, 16)                                                  \
    DECL_PACK_UNPACK(EXT, U16, 4, 4, 4, 0)                                     \
    DECL_PACK_UNPACK(EXT, U16, 5, 5, 5, 0)                                     \
    DECL_PACK_UNPACK(EXT, U16, 5, 6, 5, 0)                                     \
    DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 0)                                      \
    DECL_SWAP_BYTES(EXT, U16, 1, 0, 0, 1)                                      \
    DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 0)                                      \
    DECL_SWAP_BYTES(EXT, U16, 1, 1, 1, 1)                                      \
    DECL_SHIFT16(EXT)                                                          \
    DECL_CONVERT(EXT, U8, U16)                                                 \
    DECL_CONVERT(EXT, U16, U8)                                                 \
    DECL_EXPAND(EXT, U8, U16)                                                  \
                                                                               \
static const SwsOpTable ops16##EXT = {                                         \
    .cpu_flags  = AV_CPU_FLAG_##FLAG,                                          \
    .block_size = SIZE,                                                        \
    .entries = {                                                               \
        &op_read16_packed2##EXT,                                               \
        &op_read16_packed3##EXT,                                               \
        &op_read16_packed4##EXT,                                               \
        &op_write16_packed2##EXT,                                              \
        &op_write16_packed3##EXT,                                              \
        &op_write16_packed4##EXT,                                              \
        &op_pack_4440##EXT,                                                    \
        &op_pack_5550##EXT,                                                    \
        &op_pack_5650##EXT,                                                    \
        &op_unpack_4440##EXT,                                                  \
        &op_unpack_5550##EXT,                                                  \
        &op_unpack_5650##EXT,                                                  \
        &op_expand_bits16##EXT,                                                \
        REF_COMMON_PATTERNS(swap_bytes_U16##EXT),                              \
        REF_COMMON_PATTERNS(convert_U8_U16##EXT),                              \
        REF_COMMON_PATTERNS(convert_U16_U8##EXT),                              \
        REF_COMMON_PATTERNS(expand_U8_U16##EXT),                               \
        REF_COMMON_PATTERNS(lshift16##EXT),                                    \
        REF_COMMON_PATTERNS(rshift16##EXT),                                    \
        NULL                                                                   \
    },                                                                         \
};
437 
/* Declare all 32-bit / float entries for a given block SIZE / suffix EXT /
 * CPU FLAG, plus the op table (ops32##EXT) listing them. Note that the raw
 * 32-bit memory ops are declared with an _m2 suffix prefixed to EXT,
 * matching the names of their assembly implementations. */
#define DECL_FUNCS_32(SIZE, EXT, FLAG)                                         \
    DECL_PACKED_RW(_m2##EXT, 32)                                               \
    DECL_PACK_UNPACK(_m2##EXT, U32, 10, 10, 10, 2)                             \
    DECL_PACK_UNPACK(_m2##EXT, U32, 2, 10, 10, 10)                             \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 0)                                 \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 0, 0, 1)                                 \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 0)                                 \
    DECL_SWAP_BYTES(_m2##EXT, U32, 1, 1, 1, 1)                                 \
    DECL_CONVERT(EXT, U8, U32)                                                 \
    DECL_CONVERT(EXT, U32, U8)                                                 \
    DECL_CONVERT(EXT, U16, U32)                                                \
    DECL_CONVERT(EXT, U32, U16)                                                \
    DECL_CONVERT(EXT, U8, F32)                                                 \
    DECL_CONVERT(EXT, F32, U8)                                                 \
    DECL_CONVERT(EXT, U16, F32)                                                \
    DECL_CONVERT(EXT, F32, U16)                                                \
    DECL_EXPAND(EXT, U8, U32)                                                  \
    DECL_MIN_MAX(EXT)                                                          \
    DECL_SCALE(EXT)                                                            \
    DECL_DITHER(DECL_COMMON_PATTERNS, EXT, 0)                                  \
    DECL_DITHER(DECL_ASM, EXT, 1)                                              \
    DECL_DITHER(DECL_ASM, EXT, 2)                                              \
    DECL_DITHER(DECL_ASM, EXT, 3)                                              \
    DECL_DITHER(DECL_ASM, EXT, 4)                                              \
    DECL_DITHER(DECL_ASM, EXT, 5)                                              \
    DECL_DITHER(DECL_ASM, EXT, 6)                                              \
    DECL_DITHER(DECL_ASM, EXT, 7)                                              \
    DECL_DITHER(DECL_ASM, EXT, 8)                                              \
    DECL_LINEAR(EXT, luma, SWS_MASK_LUMA)                                      \
    DECL_LINEAR(EXT, alpha, SWS_MASK_ALPHA)                                    \
    DECL_LINEAR(EXT, lumalpha, SWS_MASK_LUMA | SWS_MASK_ALPHA)                 \
    DECL_LINEAR(EXT, dot3, 0x7)                                                \
    DECL_LINEAR(EXT, row0, SWS_MASK_ROW(0))                                    \
    DECL_LINEAR(EXT, row0a, SWS_MASK_ROW(0) | SWS_MASK_ALPHA)                  \
    DECL_LINEAR(EXT, diag3, SWS_MASK_DIAG3)                                    \
    DECL_LINEAR(EXT, diag4, SWS_MASK_DIAG4)                                    \
    DECL_LINEAR(EXT, diagoff3, SWS_MASK_DIAG3 | SWS_MASK_OFF3)                 \
    DECL_LINEAR(EXT, matrix3, SWS_MASK_MAT3)                                   \
    DECL_LINEAR(EXT, affine3, SWS_MASK_MAT3 | SWS_MASK_OFF3)                   \
    DECL_LINEAR(EXT, affine3a, SWS_MASK_MAT3 | SWS_MASK_OFF3 | SWS_MASK_ALPHA) \
    DECL_LINEAR(EXT, matrix4, SWS_MASK_MAT4)                                   \
    DECL_LINEAR(EXT, affine4, SWS_MASK_MAT4 | SWS_MASK_OFF4)                   \
                                                                               \
static const SwsOpTable ops32##EXT = {                                         \
    .cpu_flags  = AV_CPU_FLAG_##FLAG,                                          \
    .block_size = SIZE,                                                        \
    .entries = {                                                               \
        &op_read32_packed2_m2##EXT,                                            \
        &op_read32_packed3_m2##EXT,                                            \
        &op_read32_packed4_m2##EXT,                                            \
        &op_write32_packed2_m2##EXT,                                           \
        &op_write32_packed3_m2##EXT,                                           \
        &op_write32_packed4_m2##EXT,                                           \
        &op_pack_1010102_m2##EXT,                                              \
        &op_pack_2101010_m2##EXT,                                              \
        &op_unpack_1010102_m2##EXT,                                            \
        &op_unpack_2101010_m2##EXT,                                            \
        REF_COMMON_PATTERNS(swap_bytes_U32_m2##EXT),                           \
        REF_COMMON_PATTERNS(convert_U8_U32##EXT),                              \
        REF_COMMON_PATTERNS(convert_U32_U8##EXT),                              \
        REF_COMMON_PATTERNS(convert_U16_U32##EXT),                             \
        REF_COMMON_PATTERNS(convert_U32_U16##EXT),                             \
        REF_COMMON_PATTERNS(convert_U8_F32##EXT),                              \
        REF_COMMON_PATTERNS(convert_F32_U8##EXT),                              \
        REF_COMMON_PATTERNS(convert_U16_F32##EXT),                             \
        REF_COMMON_PATTERNS(convert_F32_U16##EXT),                             \
        REF_COMMON_PATTERNS(expand_U8_U32##EXT),                               \
        REF_COMMON_PATTERNS(min##EXT),                                         \
        REF_COMMON_PATTERNS(max##EXT),                                         \
        REF_COMMON_PATTERNS(scale##EXT),                                       \
        REF_COMMON_PATTERNS(dither0##EXT),                                     \
        &op_dither1##EXT,                                                      \
        &op_dither2##EXT,                                                      \
        &op_dither3##EXT,                                                      \
        &op_dither4##EXT,                                                      \
        &op_dither5##EXT,                                                      \
        &op_dither6##EXT,                                                      \
        &op_dither7##EXT,                                                      \
        &op_dither8##EXT,                                                      \
        &op_luma##EXT,                                                         \
        &op_alpha##EXT,                                                        \
        &op_lumalpha##EXT,                                                     \
        &op_dot3##EXT,                                                         \
        &op_row0##EXT,                                                         \
        &op_row0a##EXT,                                                        \
        &op_diag3##EXT,                                                        \
        &op_diag4##EXT,                                                        \
        &op_diagoff3##EXT,                                                     \
        &op_matrix3##EXT,                                                      \
        &op_affine3##EXT,                                                      \
        &op_affine3a##EXT,                                                     \
        &op_matrix4##EXT,                                                      \
        &op_affine4##EXT,                                                      \
        NULL                                                                   \
    },                                                                         \
};
534 
/* Instantiate the entry declarations and op tables for each supported
 * (block size, name suffix, CPU flag) combination. */
DECL_FUNCS_8(16, _m1_sse4, SSE4)
DECL_FUNCS_8(32, _m1_avx2, AVX2)
DECL_FUNCS_8(32, _m2_sse4, SSE4)
DECL_FUNCS_8(64, _m2_avx2, AVX2)

DECL_FUNCS_16(16, _m1_avx2, AVX2)
DECL_FUNCS_16(32, _m2_avx2, AVX2)

DECL_FUNCS_32(16, _avx2, AVX2)
544 
/* Master list of op tables searched during compilation. */
static const SwsOpTable *const tables[] = {
    &ops8_m1_sse4,
    &ops8_m1_avx2,
    &ops8_m2_sse4,
    &ops8_m2_avx2,
    &ops16_m1_avx2,
    &ops16_m2_avx2,
    &ops32_avx2,
};
554 
555 static av_const int get_mmsize(const int cpu_flags)
556 {
558  return 64;
559  else if (cpu_flags & AV_CPU_FLAG_AVX2)
560  return 32;
561  else if (cpu_flags & AV_CPU_FLAG_SSE4)
562  return 16;
563  else
564  return AVERROR(ENOTSUP);
565 }
566 
567 /**
568  * Returns true if the operation's implementation only depends on the block
569  * size, and not the underlying pixel type
570  */
571 static bool op_is_type_invariant(const SwsOp *op)
572 {
573  switch (op->op) {
574  case SWS_OP_READ:
575  case SWS_OP_WRITE:
576  return !(op->rw.elems > 1 && op->rw.packed) && !op->rw.frac;
577  case SWS_OP_SWIZZLE:
578  case SWS_OP_CLEAR:
579  return true;
580  }
581 
582  return false;
583 }
584 
585 static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
586 {
587  uint8_t shuffle[16];
588  int read_bytes, write_bytes;
589  int pixels;
590 
591  /* Solve the shuffle mask for one 128-bit lane only */
592  pixels = ff_sws_solve_shuffle(ops, shuffle, 16, 0x80, &read_bytes, &write_bytes);
593  if (pixels < 0)
594  return pixels;
595 
596  /* We can't shuffle acress lanes, so restrict the vector size to XMM
597  * whenever the read/write size would be a subset of the full vector */
598  if (read_bytes < 16 || write_bytes < 16)
599  mmsize = 16;
600 
601  const int num_lanes = mmsize / 16;
602  const int in_total = num_lanes * read_bytes;
603  const int out_total = num_lanes * write_bytes;
604  const int read_size = in_total <= 4 ? 4 : /* movd */
605  in_total <= 8 ? 8 : /* movq */
606  mmsize; /* movu */
607 
608  *out = (SwsCompiledOp) {
609  .priv = av_memdup(shuffle, sizeof(shuffle)),
610  .free = av_free,
611  .slice_align = 1,
612  .block_size = pixels * num_lanes,
613  .over_read = read_size - in_total,
614  .over_write = mmsize - out_total,
615  .cpu_flags = mmsize > 32 ? AV_CPU_FLAG_AVX512 :
616  mmsize > 16 ? AV_CPU_FLAG_AVX2 :
618  };
619 
620  if (!out->priv)
621  return AVERROR(ENOMEM);
622 
623 #define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT) \
624 do { \
625  SWS_DECL_FUNC(ff_packed_shuffle##IN##_##OUT##_##EXT); \
626  if (in_total == IN && out_total == OUT) \
627  out->func = ff_packed_shuffle##IN##_##OUT##_##EXT; \
628 } while (0)
629 
630  ASSIGN_SHUFFLE_FUNC( 5, 15, sse4);
631  ASSIGN_SHUFFLE_FUNC( 4, 16, sse4);
632  ASSIGN_SHUFFLE_FUNC( 2, 12, sse4);
633  ASSIGN_SHUFFLE_FUNC(16, 8, sse4);
634  ASSIGN_SHUFFLE_FUNC(10, 15, sse4);
635  ASSIGN_SHUFFLE_FUNC( 8, 16, sse4);
636  ASSIGN_SHUFFLE_FUNC( 4, 12, sse4);
637  ASSIGN_SHUFFLE_FUNC(15, 15, sse4);
638  ASSIGN_SHUFFLE_FUNC(12, 16, sse4);
639  ASSIGN_SHUFFLE_FUNC( 6, 12, sse4);
640  ASSIGN_SHUFFLE_FUNC(16, 12, sse4);
641  ASSIGN_SHUFFLE_FUNC(16, 16, sse4);
642  ASSIGN_SHUFFLE_FUNC( 8, 12, sse4);
643  ASSIGN_SHUFFLE_FUNC(12, 12, sse4);
644  ASSIGN_SHUFFLE_FUNC(32, 32, avx2);
645  ASSIGN_SHUFFLE_FUNC(64, 64, avx512);
646  av_assert1(out->func);
647  return 0;
648 }
649 
/* Normalize clear values into 32-bit integer constants */
static void normalize_clear(SwsOp *op)
{
    static_assert(sizeof(uint32_t) == sizeof(int), "int size mismatch");
    SwsOpPriv priv;
    /* Union to reinterpret the replicated bit pattern as a signed int
     * without violating strict aliasing. */
    union {
        uint32_t u32;
        int i;
    } c;

    /* Resolve the rational constants to raw pixel values first. */
    ff_sws_setup_q4(op, &priv);
    for (int i = 0; i < 4; i++) {
        /* den == 0 marks a component without a clear value; skip it. */
        if (!op->c.q4[i].den)
            continue;
        /* Replicate the element value across all lanes of a 32-bit word. */
        switch (ff_sws_pixel_type_size(op->type)) {
        case 1: c.u32 = 0x1010101U * priv.u8[i]; break;
        case 2: c.u32 = (uint32_t)priv.u16[i] << 16 | priv.u16[i]; break;
        case 4: c.u32 = priv.u32[i]; break;
        }

        /* Rewrite the constant as an exact integer (denominator 1). */
        op->c.q4[i].num = c.i;
        op->c.q4[i].den = 1;
    }
}
674 
676 {
677  const int cpu_flags = av_get_cpu_flags();
678  const int mmsize = get_mmsize(cpu_flags);
679  if (mmsize < 0)
680  return mmsize;
681 
682  const SwsOp *read = ff_sws_op_list_input(ops);
683  const SwsOp *write = ff_sws_op_list_output(ops);
684  av_assert1(write);
685  int ret;
686 
687  /* Special fast path for in-place packed shuffle */
688  ret = solve_shuffle(ops, mmsize, out);
689  if (ret != AVERROR(ENOTSUP))
690  return ret;
691 
693  if (!chain)
694  return AVERROR(ENOMEM);
695 
696  *out = (SwsCompiledOp) {
697  .priv = chain,
698  .slice_align = 1,
700 
701  /* Use at most two full YMM regs during the widest precision section */
702  .block_size = 2 * FFMIN(mmsize, 32) / ff_sws_op_list_max_size(ops),
703  };
704 
705  /* 3-component reads/writes process one extra garbage word */
706  if (read && read->rw.packed && read->rw.elems == 3)
707  out->over_read = sizeof(uint32_t);
708  if (write->rw.packed && write->rw.elems == 3)
709  out->over_write = sizeof(uint32_t);
710 
711 
712  /* Make on-stack copy of `ops` to iterate over */
713  SwsOpList rest = *ops;
714  do {
715  int op_block_size = out->block_size;
716  SwsOp *op = &rest.ops[0];
717 
718  if (op_is_type_invariant(op)) {
719  if (op->op == SWS_OP_CLEAR)
721  op_block_size *= ff_sws_pixel_type_size(op->type);
722  op->type = SWS_PIXEL_U8;
723  }
724 
726  op_block_size, chain);
727  } while (ret == AVERROR(EAGAIN));
728 
729  if (ret < 0) {
730  ff_sws_op_chain_free(chain);
731  if (rest.num_ops < ops->num_ops) {
732  av_log(ctx, AV_LOG_TRACE, "Uncompiled remainder:\n");
734  }
735  return ret;
736  }
737 
738 #define ASSIGN_PROCESS_FUNC(NAME) \
739  do { \
740  SWS_DECL_FUNC(NAME); \
741  void NAME##_return(void); \
742  ret = ff_sws_op_chain_append(chain, NAME##_return, \
743  NULL, &(SwsOpPriv) {0}); \
744  out->func = NAME; \
745  } while (0)
746 
747  const int read_planes = read ? (read->rw.packed ? 1 : read->rw.elems) : 0;
748  const int write_planes = write->rw.packed ? 1 : write->rw.elems;
749  switch (FFMAX(read_planes, write_planes)) {
750  case 1: ASSIGN_PROCESS_FUNC(ff_sws_process1_x86); break;
751  case 2: ASSIGN_PROCESS_FUNC(ff_sws_process2_x86); break;
752  case 3: ASSIGN_PROCESS_FUNC(ff_sws_process3_x86); break;
753  case 4: ASSIGN_PROCESS_FUNC(ff_sws_process4_x86); break;
754  }
755 
756  if (ret < 0) {
757  ff_sws_op_chain_free(chain);
758  return ret;
759  }
760 
761  out->cpu_flags = chain->cpu_flags;
762  return 0;
763 }
764 
766  .name = "x86",
767  .compile = compile,
768  .hw_format = AV_PIX_FMT_NONE,
769 };
SWS_OP_READ
@ SWS_OP_READ
Definition: ops.h:47
SwsOpTable
Definition: ops_chain.h:125
SWS_OP_SWIZZLE
@ SWS_OP_SWIZZLE
Definition: ops.h:50
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
ASSIGN_PROCESS_FUNC
#define ASSIGN_PROCESS_FUNC(NAME)
get_mmsize
static av_const int get_mmsize(const int cpu_flags)
Definition: ops.c:555
out
static FILE * out
Definition: movenc.c:55
setup_linear
static int setup_linear(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:244
ff_sws_op_list_input
const SwsOp * ff_sws_op_list_input(const SwsOpList *ops)
Returns the input operation for a given op list, or NULL if there is none (e.g.
Definition: ops.c:544
SWS_OP_CLEAR
@ SWS_OP_CLEAR
Definition: ops.h:59
ff_sws_op_list_max_size
int ff_sws_op_list_max_size(const SwsOpList *ops)
Returns the size of the largest pixel type used in ops.
Definition: ops.c:620
backend_x86
const SwsOpBackend backend_x86
Definition: ops.c:765
matrix
Definition: vc1dsp.c:43
mask
int mask
Definition: mediacodecdec_common.c:154
SwsOp::rw
SwsReadWriteOp rw
Definition: ops.h:191
normalize_clear
static void normalize_clear(SwsOp *op)
Definition: ops.c:651
av_const
#define av_const
Definition: attributes.h:100
read_bytes
static void read_bytes(const uint8_t *src, float *dst, int src_stride, int dst_stride, int width, int height, float scale)
Definition: vf_nnedi.c:442
float.h
DECL_FUNCS_32
#define DECL_FUNCS_32(SIZE, EXT, FLAG)
Definition: ops.c:438
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
SwsOpBackend::name
const char * name
Definition: ops_internal.h:56
ff_sws_pixel_type_size
int ff_sws_pixel_type_size(SwsPixelType type)
Definition: ops.c:63
SwsOpChain::cpu_flags
int cpu_flags
Definition: ops_chain.h:87
av_memdup
void * av_memdup(const void *p, size_t size)
Duplicate a buffer with av_malloc().
Definition: mem.c:304
SwsOpPriv::u32
uint32_t u32[4]
Definition: ops_chain.h:51
ff_sws_op_list_print
void ff_sws_op_list_print(void *log, int lev, int lev_extra, const SwsOpList *ops)
Print out the contents of an operation list.
Definition: ops.c:728
SwsOpList::num_ops
int num_ops
Definition: ops.h:224
SWS_PIXEL_U8
@ SWS_PIXEL_U8
Definition: ops.h:32
AVRational::num
int num
Numerator.
Definition: rational.h:59
AV_CPU_FLAG_AVX512
#define AV_CPU_FLAG_AVX512
AVX-512 functions: requires OS support even if YMM/ZMM registers aren't used.
Definition: cpu.h:60
avassert.h
AV_LOG_TRACE
#define AV_LOG_TRACE
Extremely verbose debugging, useful for libav* development.
Definition: log.h:236
setup_dither
static int setup_dither(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:194
FF_ARRAY_ELEMS
#define FF_ARRAY_ELEMS(a)
Definition: sinewin_tablegen.c:29
float
float
Definition: af_crystalizer.c:122
ff_sws_op_chain_alloc
SwsOpChain * ff_sws_op_chain_alloc(void)
Definition: ops_chain.c:29
op
static int op(uint8_t **dst, const uint8_t *dst_end, GetByteContext *gb, int pixel, int count, int *x, int width, int linesize)
Perform decode operation.
Definition: anm.c:76
ctx
static AVFormatContext * ctx
Definition: movenc.c:49
AV_CPU_FLAG_SSE4
#define AV_CPU_FLAG_SSE4
Penryn SSE4.1 functions.
Definition: cpu.h:47
ff_sws_op_list_output
const SwsOp * ff_sws_op_list_output(const SwsOpList *ops)
Returns the output operation for a given op list, or NULL if there is none.
Definition: ops.c:553
av_mallocz
#define av_mallocz(s)
Definition: tableprint_vlc.h:31
SwsOpBackend
Definition: ops_internal.h:55
SwsOpChain
Compiled "chain" of operations, which can be dispatched efficiently.
Definition: ops_chain.h:82
ff_sws_op_compile_tables
int ff_sws_op_compile_tables(const SwsOpTable *const tables[], int num_tables, SwsOpList *ops, const int block_size, SwsOpChain *chain)
"Compile" a single op by looking it up in a list of fixed size op tables.
Definition: ops_chain.c:196
AVRational
Rational number (pair of numerator and denominator).
Definition: rational.h:58
tables
static const SwsOpTable *const tables[]
Definition: ops.c:545
solve_shuffle
static int solve_shuffle(const SwsOpList *ops, int mmsize, SwsCompiledOp *out)
Definition: ops.c:585
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
AV_CPU_FLAG_AVX2
#define AV_CPU_FLAG_AVX2
AVX2 functions: requires OS support even if YMM registers aren't used.
Definition: cpu.h:56
i
#define i(width, name, range_min, range_max)
Definition: cbs_h264.c:63
SwsOpPriv::u8
uint8_t u8[16]
Definition: ops_chain.h:48
size
int size
Definition: twinvq_data.h:10344
SWS_OP_WRITE
@ SWS_OP_WRITE
Definition: ops.h:48
SwsOpPriv::u16
uint16_t u16[8]
Definition: ops_chain.h:49
ff_sws_op_chain_free_cb
void ff_sws_op_chain_free_cb(void *ptr)
Definition: ops_chain.c:34
compile
static int compile(SwsContext *ctx, SwsOpList *ops, SwsCompiledOp *out)
Definition: ops.c:675
SwsOpChain::free
void(* free[SWS_MAX_OPS+1])(void *)
Definition: ops_chain.h:85
setup_swap_bytes
static int setup_swap_bytes(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:86
ff_sws_op_chain_free
static void ff_sws_op_chain_free(SwsOpChain *chain)
Definition: ops_chain.h:92
SwsOpList::ops
SwsOp * ops
Definition: ops.h:223
op_is_type_invariant
static bool op_is_type_invariant(const SwsOp *op)
Returns true if the operation's implementation only depends on the block size, and not the underlying...
Definition: ops.c:571
ff_sws_setup_q4
int ff_sws_setup_q4(const SwsOp *op, SwsOpPriv *out)
Definition: ops_chain.c:279
av_assert1
#define av_assert1(cond)
assert() equivalent, that does not lie in speed critical code.
Definition: avassert.h:58
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
SwsOp
Definition: ops.h:186
write_bytes
static void write_bytes(const float *src, uint8_t *dst, int src_stride, int dst_stride, int width, int height, int depth, float scale)
Definition: vf_nnedi.c:484
ret
ret
Definition: filter_design.txt:187
SwsCompiledOp
Definition: ops_dispatch.h:80
U
#define U(x)
Definition: vpx_arith.h:37
ASSIGN_SHUFFLE_FUNC
#define ASSIGN_SHUFFLE_FUNC(IN, OUT, EXT)
AVRational::den
int den
Denominator.
Definition: rational.h:60
SwsReadWriteOp::packed
bool packed
Definition: ops.h:101
AV_PIX_FMT_NONE
@ AV_PIX_FMT_NONE
Definition: pixfmt.h:72
ff_sws_solve_shuffle
int ff_sws_solve_shuffle(const SwsOpList *ops, uint8_t shuffle[], int size, uint8_t clear_val, int *read_bytes, int *write_bytes)
"Solve" an op list into a fixed shuffle mask, with an optional ability to also directly clear the out...
Definition: ops_optimizer.c:686
setup_shift
static int setup_shift(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:149
SwsReadWriteOp::elems
uint8_t elems
Definition: ops.h:99
mem.h
setup_clear
static int setup_clear(const SwsOp *op, SwsOpPriv *out)
Definition: ops.c:116
av_free
#define av_free(p)
Definition: tableprint_vlc.h:34
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
DECL_FUNCS_16
#define DECL_FUNCS_16(SIZE, EXT, FLAG)
Definition: ops.c:396
stride
#define stride
Definition: h264pred_template.c:536
SwsOpList
Helper struct for representing a list of operations.
Definition: ops.h:222
DECL_FUNCS_8
#define DECL_FUNCS_8(SIZE, EXT, FLAG)
Definition: ops.c:266
SwsContext
Main external API structure.
Definition: swscale.h:191
SwsOpPriv
Copyright (C) 2025 Niklas Haas.
Definition: ops_chain.h:42
shuffle
static uint64_t shuffle(uint64_t in, const uint8_t *shuffle, int shuffle_len)
Definition: des.c:179
read
static uint32_t BS_FUNC() read(BSCTX *bc, unsigned int n)
Return n bits from the buffer, n has to be in the 0-32 range.
Definition: bitstream_template.h:239