FFmpeg
mpeg4videodsp.c
Go to the documentation of this file.
1 /*
2  * This file is part of FFmpeg.
3  *
4  * FFmpeg is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU Lesser General Public
6  * License as published by the Free Software Foundation; either
7  * version 2.1 of the License, or (at your option) any later version.
8  *
9  * FFmpeg is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12  * Lesser General Public License for more details.
13  *
14  * You should have received a copy of the GNU Lesser General Public
15  * License along with FFmpeg; if not, write to the Free Software
16  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17  */
18 
#include "config.h"

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"

#include "mpeg4videodsp.h"
#include "videodsp.h"
27 
28 #if HAVE_SSSE3_INLINE
29 
/* Broadcast the low 16-bit word of an XMM register to all eight word
 * lanes: pshuflw $0 replicates it across the low quadword, then
 * punpcklqdq duplicates that quadword into the high half. */
#define SPLATW(reg) "pshuflw $0, %%" #reg ", %%" #reg "\n\t" \
                    "punpcklqdq %%" #reg ", %%" #reg "\n\t"
32 
/* 16-byte aligned container holding one XMM register's worth of
 * uint16_t lanes (8 words); used for asm constants and spill slots. */
typedef struct {
    DECLARE_ALIGNED_16(uint16_t, u16)[8];
} xmm_u16;
36 
37 DECLARE_ASM_CONST(16, xmm_u16, pw_0to7) = { { 0, 1, 2, 3, 4, 5, 6, 7 } };
38 
/**
 * Global motion compensation of an 8-pixel-wide, h-row block, SSSE3
 * inline-asm version.
 *
 * Samples src with the affine motion field
 *   (ox, oy) + x*(dxx, dxy) + y*(dyx, dyy)
 * using bilinear interpolation with rounding constant r and a final
 * right shift of 2*shift; ff_gmc_c is the reference implementation
 * (see the fallback call below).
 *
 * Falls back to ff_gmc_c when the fullpel offset varies over the block
 * or the motion vectors need more than 16 subpel bits; routes src
 * through ff_emulated_edge_mc_sse2 when the block reads outside the
 * width x height picture.
 */
static void gmc_ssse3(uint8_t *dst, const uint8_t *src,
                      int stride, int h, int ox, int oy,
                      int dxx, int dxy, int dyx, int dyy,
                      int shift, int r, int width, int height)
{
    enum {
        W               = 8,
        EDGE_EMU_STRIDE = 16, ///< anything >= W+1 will do
        MAX_H           = 16,
    };
    const int w  = 8;
    // Fullpel start position of the block.
    const int ix = ox >> (16 + shift);
    const int iy = oy >> (16 + shift);
    // Fractional remainder; note '-' binds tighter than '&', so this is
    // ox & ((1 << (16 + shift)) - 1), i.e. a mask of the subpel bits.
    const int ox2  = ox & (1 << (16 + shift)) - 1;
    const int oy2  = oy & (1 << (16 + shift)) - 1;
    // >> 4: reduce to the 12-bit subpel precision the asm works in
    // (the loop extracts dx/dy with psrlw $12).
    const int oxs  = ox2 >> 4;
    const int oys  = oy2 >> 4;
    // Remove the implicit identity step so the per-row/per-column
    // increments are pure deltas.
    const int dxx2 = dxx - (1 << (16 + shift));
    const int dyy2 = dyy - (1 << (16 + shift));
    const int dxxs = dxx2 >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy2 >> 4;
    uint8_t edge_buf[(MAX_H + 1) * EDGE_EMU_STRIDE];

    // Total subpel drift across the block in each direction, used to
    // check that the fullpel offset stays constant over all w*h pixels.
    const int dxw = dxx2 * (w - 1);
    const int dyh = dyy2 * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    // Unsigned compare also catches negative ix/iy in one test.
    int need_emu = (unsigned) ix >= width - w || width < w ||
                   (unsigned) iy >= height - h || height < h;

    if ( // non-constant fullpel offset (3% of blocks)
        ((ox2 + dxw) | (ox2 + dxh) | (ox2 + dxw + dxh) |
         (oy2 + dyw) | (oy2 + dyh) | (oy2 + dyw + dyh)) >> (16 + shift) ||
        // uses more than 16 bits of subpel mv (only at huge resolution)
        (dxx | dxy | dyx | dyy) & 15) {
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
        return;
    }

    src += ix + iy * stride;
    const ptrdiff_t dst_stride = stride;
    ptrdiff_t src_stride = stride;
    if (need_emu) {
        // Block touches pixels outside the picture: copy w+1 x h+1
        // (bilinear reads one extra row/column) into a padded buffer.
        ff_emulated_edge_mc_sse2(edge_buf, src, EDGE_EMU_STRIDE, src_stride,
                                 w + 1, h + 1, ix, iy, width, height);
        src = edge_buf;
        src_stride = EDGE_EMU_STRIDE;
    }

#if ARCH_X86_32
    // Only 8 XMM registers on x86-32: spill the loop-invariant vectors
    // (row increments, rounding constant, shift amount) to memory.
    xmm_u16 dxy8, dyy8, r8;
    DECLARE_ALIGNED_16(uint64_t, shift2) = 2 * shift;
#endif

    // Setup: xmm1 = per-column horizontal subpel positions,
    // xmm7 = per-column vertical subpel positions, xmm6 = s = 1<<shift,
    // xmm5 = interleaved src[0,y]/src[1,y] bytes for the current row.
    __asm__ volatile (
        "movd %[dxxs], %%xmm2 \n\t"
        "movd %[dyxs], %%xmm3 \n\t"
        "movd %[oxs], %%xmm1 \n\t"
        SPLATW(xmm2)
        "movd %[oys], %%xmm7 \n\t"
        SPLATW(xmm3)
        "pmullw "MANGLE(pw_0to7)", %%xmm2 \n\t"
        SPLATW(xmm1)
        "movd %[s], %%xmm6 \n\t"
        "pmullw "MANGLE(pw_0to7)", %%xmm3 \n\t"
        "movq (%[src]), %%xmm5 \n\t"
        SPLATW(xmm7)
#if ARCH_X86_32
        "movd %[dxys], %%xmm0 \n\t"
#else
        "movd %[dxys], %%xmm11 \n\t"
#endif
        "paddw %%xmm2, %%xmm1 \n\t"
        "movq 1(%[src]), %%xmm2 \n\t"
        SPLATW(xmm6)
#if ARCH_X86_32
        "movd %[dyys], %%xmm4 \n\t"
#else
        "movd %[dyys], %%xmm9 \n\t"
#endif
        "paddw %%xmm3, %%xmm7 \n\t"
        "punpcklbw %%xmm2, %%xmm5 \n\t"
#if ARCH_X86_32
        SPLATW(xmm0)
        "movd %[r], %%xmm2 \n\t"
        SPLATW(xmm4)
        "movdqa %%xmm0, %[dxy8] \n\t"
        SPLATW(xmm2)
        "movdqa %%xmm4, %[dyy8] \n\t"
        "movdqa %%xmm2, %[r8] \n\t"
#else
        SPLATW(xmm11)
        "movd %[r], %%xmm8 \n\t"
        SPLATW(xmm9)
        SPLATW(xmm8)
        "movd %[shift2], %%xmm12 \n\t"
#endif

        // Per-row loop: horizontal lerp of two source rows with
        // pmaddubsw, then vertical lerp with pmullw, round and pack.
        "1: \n\t"
        "add %[src_stride], %[src] \n\t"
        "movq (%[src]), %%xmm3 \n\t"
        "movq 1(%[src]), %%xmm0 \n\t"
        "movdqa %%xmm1, %%xmm4 \n\t"
        "psrlw $12, %%xmm4 \n\t" // dx
        "movdqa %%xmm6, %%xmm2 \n\t"
        "psubw %%xmm4, %%xmm2 \n\t" // (s-dx)
        "psllw $8, %%xmm4 \n\t"
        "por %%xmm4, %%xmm2 \n\t" // s-dx,dx,s-dx,dx (bytes)
        "pmaddubsw %%xmm2, %%xmm5 \n\t" // src[0, 0] * (s - dx) + src[1,0] * dx
        "punpcklbw %%xmm0, %%xmm3 \n\t"
        "movdqa %%xmm3, %%xmm0 \n\t"
        "pmaddubsw %%xmm2, %%xmm3 \n\t" // src[0, 1] * (s - dx) + src[1,1] * dx
#if ARCH_X86_32
        "paddw %[dxy8], %%xmm1 \n\t"
#else
        "paddw %%xmm11, %%xmm1 \n\t"
#endif
        "movdqa %%xmm7, %%xmm4 \n\t"
        "movdqa %%xmm6, %%xmm2 \n\t"
        "psrlw $12, %%xmm4 \n\t" // dy
        "psubw %%xmm4, %%xmm2 \n\t" // (s-dy)
        "pmullw %%xmm5, %%xmm2 \n\t" // (src[0, 0] * (s - dx) + src[1,0] * dx) * (s - dy)
#if ARCH_X86_32
        "paddw %[dyy8], %%xmm7 \n\t"
#else
        "paddw %%xmm9, %%xmm7 \n\t"
#endif
        "pmullw %%xmm3, %%xmm4 \n\t" // (src[0, 1] * (s - dx) + src[1,1] * dx) * dy

#if ARCH_X86_32
        "paddw %[r8], %%xmm2 \n\t"
#else
        "paddw %%xmm8, %%xmm2 \n\t"
#endif
        "paddw %%xmm2, %%xmm4 \n\t"

#if ARCH_X86_32
        "psrlw %[shift2], %%xmm4 \n\t"
#else
        "psrlw %%xmm12, %%xmm4 \n\t"
#endif
        "packuswb %%xmm4, %%xmm4 \n\t"
        "movq %%xmm4, (%[dst]) \n\t"
        // Bottom row becomes next iteration's top row (xmm0 was saved
        // before pmaddubsw destroyed xmm3).
        "movdqa %%xmm0, %%xmm5 \n\t"
        "add %[dst_stride], %[dst] \n\t"

        "decl %[h] \n\t"
        "jnz 1b \n\t"
        : [dst]"+r"(dst), [src]"+r"(src),
#if HAVE_6REGS || HAVE_INLINE_ASM_DIRECT_SYMBOL_REFS
          [h]"+r"(h)
#else
          [h]"+m"(h)
#endif
#if ARCH_X86_32
        , [dxy8]"=m" (dxy8), [dyy8]"=m" (dyy8), [r8]"=m" (r8)
#endif
        : [dst_stride]"r"(dst_stride), [src_stride]"r"(src_stride),
          [s]"g" (1 << shift),
#if ARCH_X86_32
          [shift2]"m" (shift2),
#else
          [shift2]"g" (2*shift),
#endif
          [oxs]"g"(oxs), [oys]"g"(oys), [dxxs]"g"(dxxs), [dyxs]"g"(dyxs),
          [dxys]"g"(dxys), [dyys]"g"(dyys), [r]"g"(r) NAMED_CONSTRAINTS_ADD(pw_0to7)
        : XMM_CLOBBERS("xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",)
#if ARCH_X86_64
          XMM_CLOBBERS("xmm8", "xmm9", "xmm10", "xmm11", "xmm12",)
#endif
          "memory");
}
215 
216 #endif /* HAVE_SSSE3_INLINE */
217 
219 {
220 #if HAVE_SSSE3_INLINE
221  int cpu_flags = av_get_cpu_flags();
222 
223  if (INLINE_SSSE3(cpu_flags))
224  c->gmc = gmc_ssse3;
225 #endif /* HAVE_SSSE3_INLINE */
226 }
cpu.h
r
const char * r
Definition: vf_curves.c:127
ff_mpeg4videodsp_init_x86
av_cold void ff_mpeg4videodsp_init_x86(Mpeg4VideoDSPContext *c)
Definition: mpeg4videodsp.c:218
mem_internal.h
videodsp.h
Mpeg4VideoDSPContext
Definition: mpeg4videodsp.h:28
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:109
cpu_flags
static atomic_int cpu_flags
Definition: cpu.c:56
ff_emulated_edge_mc_sse2
void ff_emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src, ptrdiff_t buf_stride, ptrdiff_t src_stride, int block_w, int block_h, int src_x, int src_y, int w, int h)
Definition: videodsp_init.c:191
MANGLE
#define MANGLE(a)
Definition: asm.h:126
av_cold
#define av_cold
Definition: attributes.h:111
NAMED_CONSTRAINTS_ADD
#define NAMED_CONSTRAINTS_ADD(...)
Definition: asm.h:144
s
#define s(width, name)
Definition: cbs_vp9.c:198
W
@ W
Definition: mpeg4videodsp.c:32
asm.h
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
height
#define height
Definition: dsp.h:89
shift
static int shift(int a, int b)
Definition: bonk.c:261
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:87
cpu.h
attributes.h
DECLARE_ASM_CONST
DECLARE_ASM_CONST(16, double, pd_1)[2]
shift2
static const uint8_t shift2[6]
Definition: dxa.c:49
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
XMM_CLOBBERS
#define XMM_CLOBBERS(...)
Definition: asm.h:97
INLINE_SSSE3
#define INLINE_SSSE3(flags)
Definition: cpu.h:89
mpeg4videodsp.h
w
uint8_t w
Definition: llvidencdsp.c:39
ff_gmc_c
void ff_gmc_c(uint8_t *dst, const uint8_t *src, int stride, int h, int ox, int oy, int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
Definition: mpeg4videodsp.c:47
DECLARE_ALIGNED_16
#define DECLARE_ALIGNED_16(t, v)
Definition: mem_internal.h:112
h
h
Definition: vp9dsp_template.c:2070
stride
#define stride
Definition: h264pred_template.c:536
width
#define width
Definition: dsp.h:89
src
#define src
Definition: vp8dsp.c:248