dsputil_mmx.c
1 /*
2  * MMX optimized DSP utils
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  *
22  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
23  */
24 
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavcodec/dsputil.h"
29 #include "libavcodec/h264dsp.h"
30 #include "libavcodec/mpegvideo.h"
31 #include "libavcodec/simple_idct.h"
32 #include "libavcodec/videodsp.h"
33 #include "dsputil_mmx.h"
34 #include "idct_xvid.h"
35 #include "diracdsp_mmx.h"
36 
37 //#undef NDEBUG
38 //#include <assert.h>
39 
40 /* pixel operations */
41 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
42 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
43 
44 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
45 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
51 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
52 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
53 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
55 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
56 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27) = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
57 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28) = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
58 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
59 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
60 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
61 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63) = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
62 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
63 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
64 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
65 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
66 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
67 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
68 
69 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
70 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
71 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
72 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4) = { 0x0404040404040404ULL, 0x0404040404040404ULL };
73 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7) = 0x0707070707070707ULL;
74 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F) = 0x1F1F1F1F1F1F1F1FULL;
75 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
76 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
77 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81) = 0x8181818181818181ULL;
78 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1) = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
79 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8) = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
80 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
81 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
82 
83 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
84 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
85 
86 
87 #if HAVE_YASM
88 void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
89  ptrdiff_t line_size, int h);
90 void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
91  ptrdiff_t line_size, int h);
92 void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
93  int dstStride, int src1Stride, int h);
94 void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
95  uint8_t *src2, int dstStride,
96  int src1Stride, int h);
97 void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
98  int dstStride, int src1Stride, int h);
99 void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
100  ptrdiff_t line_size, int h);
101 void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
102  ptrdiff_t line_size, int h);
103 void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
104  int dstStride, int src1Stride, int h);
105 void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
106  int dstStride, int src1Stride, int h);
107 void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
108  int dstStride, int src1Stride, int h);
109 void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
110  ptrdiff_t line_size, int h);
111 void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
112  ptrdiff_t line_size, int h);
113 void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
114  const uint8_t *pixels,
115  ptrdiff_t line_size, int h);
116 void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
117  const uint8_t *pixels,
118  ptrdiff_t line_size, int h);
119 void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
120  ptrdiff_t line_size, int h);
121 void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
122  ptrdiff_t line_size, int h);
123 void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
124  ptrdiff_t line_size, int h);
125 void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
126  ptrdiff_t line_size, int h);
127 void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
128  const uint8_t *pixels,
129  ptrdiff_t line_size, int h);
130 void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
131  const uint8_t *pixels,
132  ptrdiff_t line_size, int h);
133 void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
134  ptrdiff_t line_size, int h);
135 void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
136  ptrdiff_t line_size, int h);
137 void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
138  ptrdiff_t line_size, int h);
139 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
140  ptrdiff_t line_size, int h);
141 void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
142  ptrdiff_t line_size, int h);
143 void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
144  ptrdiff_t line_size, int h);
145 void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
146  ptrdiff_t line_size, int h);
147 
148 void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
149 static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
150  ptrdiff_t line_size, int h)
151 {
152  ff_put_pixels8_mmxext(block, pixels, line_size, h);
153  ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
154 }
155 
156 void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
157  int dstStride, int srcStride, int h);
158 void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
159  int dstStride, int srcStride, int h);
160 void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
161  int dstStride, int srcStride,
162  int h);
163 void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
164  int dstStride, int srcStride, int h);
165 void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
166  int dstStride, int srcStride, int h);
167 void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
168  int dstStride, int srcStride,
169  int h);
170 void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
171  int dstStride, int srcStride);
172 void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
173  int dstStride, int srcStride);
174 void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
175  int dstStride, int srcStride);
176 void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
177  int dstStride, int srcStride);
178 void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
179  int dstStride, int srcStride);
180 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
181  int dstStride, int srcStride);
182 #define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
183 #define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
184 #endif /* HAVE_YASM */
185 
186 
187 #if HAVE_INLINE_ASM
188 
189 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
190 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
191 
192 #define MOVQ_BFE(regd) \
193  __asm__ volatile ( \
194  "pcmpeqd %%"#regd", %%"#regd" \n\t" \
195  "paddb %%"#regd", %%"#regd" \n\t" ::)
196 
197 #ifndef PIC
198 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
199 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
200 #else
201 // For a shared library (PIC) it is better to generate the constants in registers than to reference them through memory.
202 // pcmpeqd -> -1
203 #define MOVQ_BONE(regd) \
204  __asm__ volatile ( \
205  "pcmpeqd %%"#regd", %%"#regd" \n\t" \
206  "psrlw $15, %%"#regd" \n\t" \
207  "packuswb %%"#regd", %%"#regd" \n\t" ::)
208 
209 #define MOVQ_WTWO(regd) \
210  __asm__ volatile ( \
211  "pcmpeqd %%"#regd", %%"#regd" \n\t" \
212  "psrlw $15, %%"#regd" \n\t" \
213  "psllw $1, %%"#regd" \n\t"::)
214 
215 #endif
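/*
 * Explanatory note (not in the original file): the PIC variants above build
 * the constants entirely in registers instead of referencing ff_bone/ff_wtwo
 * through memory.  pcmpeqd sets every bit of the register, psrlw $15 then
 * leaves 0x0001 in each 16-bit word; packuswb turns that into 0x01 in every
 * byte (MOVQ_BONE), while psllw $1 instead yields 0x0002 per word (MOVQ_WTWO).
 */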
216 
217 // using regr as temporary and for the output result
218  // the first argument is unmodified and the second is trashed
219 // regfe is supposed to contain 0xfefefefefefefefe
220 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
221  "movq "#rega", "#regr" \n\t" \
222  "pand "#regb", "#regr" \n\t" \
223  "pxor "#rega", "#regb" \n\t" \
224  "pand "#regfe", "#regb" \n\t" \
225  "psrlq $1, "#regb" \n\t" \
226  "paddb "#regb", "#regr" \n\t"
227 
228 #define PAVGB_MMX(rega, regb, regr, regfe) \
229  "movq "#rega", "#regr" \n\t" \
230  "por "#regb", "#regr" \n\t" \
231  "pxor "#rega", "#regb" \n\t" \
232  "pand "#regfe", "#regb" \n\t" \
233  "psrlq $1, "#regb" \n\t" \
234  "psubb "#regb", "#regr" \n\t"
235 
236 // mm6 is supposed to contain 0xfefefefefefefefe
237 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
238  "movq "#rega", "#regr" \n\t" \
239  "movq "#regc", "#regp" \n\t" \
240  "pand "#regb", "#regr" \n\t" \
241  "pand "#regd", "#regp" \n\t" \
242  "pxor "#rega", "#regb" \n\t" \
243  "pxor "#regc", "#regd" \n\t" \
244  "pand %%mm6, "#regb" \n\t" \
245  "pand %%mm6, "#regd" \n\t" \
246  "psrlq $1, "#regb" \n\t" \
247  "psrlq $1, "#regd" \n\t" \
248  "paddb "#regb", "#regr" \n\t" \
249  "paddb "#regd", "#regp" \n\t"
250 
251 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
252  "movq "#rega", "#regr" \n\t" \
253  "movq "#regc", "#regp" \n\t" \
254  "por "#regb", "#regr" \n\t" \
255  "por "#regd", "#regp" \n\t" \
256  "pxor "#rega", "#regb" \n\t" \
257  "pxor "#regc", "#regd" \n\t" \
258  "pand %%mm6, "#regb" \n\t" \
259  "pand %%mm6, "#regd" \n\t" \
260  "psrlq $1, "#regd" \n\t" \
261  "psrlq $1, "#regb" \n\t" \
262  "psubb "#regb", "#regr" \n\t" \
263  "psubb "#regd", "#regp" \n\t"
264 
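/*
 * Reference (an explanatory addition, not from the original source): per byte,
 * the two averaging macros compute
 *
 *     avg_no_rnd(a, b) = (a & b) + (((a ^ b) & 0xFE) >> 1)   // truncating
 *     avg_rnd(a, b)    = (a | b) - (((a ^ b) & 0xFE) >> 1)   // rounding up
 *
 * which follows from a + b = (a ^ b) + 2 * (a & b) = 2 * (a | b) - (a ^ b).
 * The 0xFE mask (held in regfe / mm6) keeps bits shifted out of one byte from
 * leaking into its neighbour, since the psrlq shift operates on all 8 bytes
 * at once.
 */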
265 /***********************************/
266 /* MMX no rounding */
267 #define NO_RND 1
268 #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
269 #define SET_RND MOVQ_WONE
270 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
271 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
272 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
273 
274 #include "dsputil_rnd_template.c"
275 
276 #undef DEF
277 #undef SET_RND
278 #undef PAVGBP
279 #undef PAVGB
280 #undef NO_RND
281 /***********************************/
282 /* MMX rounding */
283 
284 #define DEF(x, y) x ## _ ## y ## _mmx
285 #define SET_RND MOVQ_WTWO
286 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
287 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
288 
289 #include "dsputil_rnd_template.c"
290 
291 #undef DEF
292 #undef SET_RND
293 #undef PAVGBP
294 #undef PAVGB
295 #undef OP_AVG
296 
297 #endif /* HAVE_INLINE_ASM */
298 
299 
300 #if HAVE_YASM
301 
302 /***********************************/
303 /* 3Dnow specific */
304 
305 #define DEF(x) x ## _3dnow
306 
307 #include "dsputil_avg_template.c"
308 
309 #undef DEF
310 
311 /***********************************/
312 /* MMXEXT specific */
313 
314 #define DEF(x) x ## _mmxext
315 
316 #include "dsputil_avg_template.c"
317 
318 #undef DEF
319 
320 #endif /* HAVE_YASM */
321 
322 
323 #if HAVE_INLINE_ASM
324 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
325 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
326 
327 /***********************************/
328 /* standard MMX */
329 
330 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
331  int line_size)
332 {
333  const int16_t *p;
334  uint8_t *pix;
335 
336  /* read the pixels */
337  p = block;
338  pix = pixels;
339  /* unrolled loop */
340  __asm__ volatile (
341  "movq (%3), %%mm0 \n\t"
342  "movq 8(%3), %%mm1 \n\t"
343  "movq 16(%3), %%mm2 \n\t"
344  "movq 24(%3), %%mm3 \n\t"
345  "movq 32(%3), %%mm4 \n\t"
346  "movq 40(%3), %%mm5 \n\t"
347  "movq 48(%3), %%mm6 \n\t"
348  "movq 56(%3), %%mm7 \n\t"
349  "packuswb %%mm1, %%mm0 \n\t"
350  "packuswb %%mm3, %%mm2 \n\t"
351  "packuswb %%mm5, %%mm4 \n\t"
352  "packuswb %%mm7, %%mm6 \n\t"
353  "movq %%mm0, (%0) \n\t"
354  "movq %%mm2, (%0, %1) \n\t"
355  "movq %%mm4, (%0, %1, 2) \n\t"
356  "movq %%mm6, (%0, %2) \n\t"
357  :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
358  "r"(p)
359  : "memory");
360  pix += line_size * 4;
361  p += 32;
362 
363  // If this were an exact copy of the code above, the compiler
364  // would generate some very strange code, hence the use of the
365  // "r" constraints on the operands.
366  __asm__ volatile (
367  "movq (%3), %%mm0 \n\t"
368  "movq 8(%3), %%mm1 \n\t"
369  "movq 16(%3), %%mm2 \n\t"
370  "movq 24(%3), %%mm3 \n\t"
371  "movq 32(%3), %%mm4 \n\t"
372  "movq 40(%3), %%mm5 \n\t"
373  "movq 48(%3), %%mm6 \n\t"
374  "movq 56(%3), %%mm7 \n\t"
375  "packuswb %%mm1, %%mm0 \n\t"
376  "packuswb %%mm3, %%mm2 \n\t"
377  "packuswb %%mm5, %%mm4 \n\t"
378  "packuswb %%mm7, %%mm6 \n\t"
379  "movq %%mm0, (%0) \n\t"
380  "movq %%mm2, (%0, %1) \n\t"
381  "movq %%mm4, (%0, %1, 2) \n\t"
382  "movq %%mm6, (%0, %2) \n\t"
383  :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
384  : "memory");
385 }
386 
387 #define put_signed_pixels_clamped_mmx_half(off) \
388  "movq "#off"(%2), %%mm1 \n\t" \
389  "movq 16 + "#off"(%2), %%mm2 \n\t" \
390  "movq 32 + "#off"(%2), %%mm3 \n\t" \
391  "movq 48 + "#off"(%2), %%mm4 \n\t" \
392  "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
393  "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
394  "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
395  "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
396  "paddb %%mm0, %%mm1 \n\t" \
397  "paddb %%mm0, %%mm2 \n\t" \
398  "paddb %%mm0, %%mm3 \n\t" \
399  "paddb %%mm0, %%mm4 \n\t" \
400  "movq %%mm1, (%0) \n\t" \
401  "movq %%mm2, (%0, %3) \n\t" \
402  "movq %%mm3, (%0, %3, 2) \n\t" \
403  "movq %%mm4, (%0, %1) \n\t"
404 
405 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
406  int line_size)
407 {
408  x86_reg line_skip = line_size;
409  x86_reg line_skip3;
410 
411  __asm__ volatile (
412  "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
413  "lea (%3, %3, 2), %1 \n\t"
414  put_signed_pixels_clamped_mmx_half(0)
415  "lea (%0, %3, 4), %0 \n\t"
416  put_signed_pixels_clamped_mmx_half(64)
417  : "+&r"(pixels), "=&r"(line_skip3)
418  : "r"(block), "r"(line_skip)
419  : "memory");
420 }
421 
422 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
423  int line_size)
424 {
425  const int16_t *p;
426  uint8_t *pix;
427  int i;
428 
429  /* read the pixels */
430  p = block;
431  pix = pixels;
432  MOVQ_ZERO(mm7);
433  i = 4;
434  do {
435  __asm__ volatile (
436  "movq (%2), %%mm0 \n\t"
437  "movq 8(%2), %%mm1 \n\t"
438  "movq 16(%2), %%mm2 \n\t"
439  "movq 24(%2), %%mm3 \n\t"
440  "movq %0, %%mm4 \n\t"
441  "movq %1, %%mm6 \n\t"
442  "movq %%mm4, %%mm5 \n\t"
443  "punpcklbw %%mm7, %%mm4 \n\t"
444  "punpckhbw %%mm7, %%mm5 \n\t"
445  "paddsw %%mm4, %%mm0 \n\t"
446  "paddsw %%mm5, %%mm1 \n\t"
447  "movq %%mm6, %%mm5 \n\t"
448  "punpcklbw %%mm7, %%mm6 \n\t"
449  "punpckhbw %%mm7, %%mm5 \n\t"
450  "paddsw %%mm6, %%mm2 \n\t"
451  "paddsw %%mm5, %%mm3 \n\t"
452  "packuswb %%mm1, %%mm0 \n\t"
453  "packuswb %%mm3, %%mm2 \n\t"
454  "movq %%mm0, %0 \n\t"
455  "movq %%mm2, %1 \n\t"
456  : "+m"(*pix), "+m"(*(pix + line_size))
457  : "r"(p)
458  : "memory");
459  pix += line_size * 2;
460  p += 16;
461  } while (--i);
462 }
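/*
 * Scalar sketch of the routine above (assumed to match the generic C version;
 * not part of the original file): paddsw + packuswb amount to a saturating
 * add followed by a clip to [0, 255], i.e.
 *
 *     for (i = 0; i < 8; i++)
 *         for (j = 0; j < 8; j++)
 *             pixels[i * line_size + j] =
 *                 av_clip_uint8(pixels[i * line_size + j] + block[i * 8 + j]);
 */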
463 
464 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
465  ptrdiff_t line_size, int h)
466 {
467  __asm__ volatile (
468  "lea (%3, %3), %%"REG_a" \n\t"
469  ".p2align 3 \n\t"
470  "1: \n\t"
471  "movq (%1 ), %%mm0 \n\t"
472  "movq (%1, %3), %%mm1 \n\t"
473  "movq %%mm0, (%2) \n\t"
474  "movq %%mm1, (%2, %3) \n\t"
475  "add %%"REG_a", %1 \n\t"
476  "add %%"REG_a", %2 \n\t"
477  "movq (%1 ), %%mm0 \n\t"
478  "movq (%1, %3), %%mm1 \n\t"
479  "movq %%mm0, (%2) \n\t"
480  "movq %%mm1, (%2, %3) \n\t"
481  "add %%"REG_a", %1 \n\t"
482  "add %%"REG_a", %2 \n\t"
483  "subl $4, %0 \n\t"
484  "jnz 1b \n\t"
485  : "+g"(h), "+r"(pixels), "+r"(block)
486  : "r"((x86_reg)line_size)
487  : "%"REG_a, "memory"
488  );
489 }
490 
491 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
492  ptrdiff_t line_size, int h)
493 {
494  __asm__ volatile (
495  "lea (%3, %3), %%"REG_a" \n\t"
496  ".p2align 3 \n\t"
497  "1: \n\t"
498  "movq (%1 ), %%mm0 \n\t"
499  "movq 8(%1 ), %%mm4 \n\t"
500  "movq (%1, %3), %%mm1 \n\t"
501  "movq 8(%1, %3), %%mm5 \n\t"
502  "movq %%mm0, (%2) \n\t"
503  "movq %%mm4, 8(%2) \n\t"
504  "movq %%mm1, (%2, %3) \n\t"
505  "movq %%mm5, 8(%2, %3) \n\t"
506  "add %%"REG_a", %1 \n\t"
507  "add %%"REG_a", %2 \n\t"
508  "movq (%1 ), %%mm0 \n\t"
509  "movq 8(%1 ), %%mm4 \n\t"
510  "movq (%1, %3), %%mm1 \n\t"
511  "movq 8(%1, %3), %%mm5 \n\t"
512  "movq %%mm0, (%2) \n\t"
513  "movq %%mm4, 8(%2) \n\t"
514  "movq %%mm1, (%2, %3) \n\t"
515  "movq %%mm5, 8(%2, %3) \n\t"
516  "add %%"REG_a", %1 \n\t"
517  "add %%"REG_a", %2 \n\t"
518  "subl $4, %0 \n\t"
519  "jnz 1b \n\t"
520  : "+g"(h), "+r"(pixels), "+r"(block)
521  : "r"((x86_reg)line_size)
522  : "%"REG_a, "memory"
523  );
524 }
525 
526 #define CLEAR_BLOCKS(name, n) \
527 static void name(int16_t *blocks) \
528 { \
529  __asm__ volatile ( \
530  "pxor %%mm7, %%mm7 \n\t" \
531  "mov %1, %%"REG_a" \n\t" \
532  "1: \n\t" \
533  "movq %%mm7, (%0, %%"REG_a") \n\t" \
534  "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
535  "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
536  "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
537  "add $32, %%"REG_a" \n\t" \
538  "js 1b \n\t" \
539  :: "r"(((uint8_t *)blocks) + 128 * n), \
540  "i"(-128 * n) \
541  : "%"REG_a \
542  ); \
543 }
544 CLEAR_BLOCKS(clear_blocks_mmx, 6)
545 CLEAR_BLOCKS(clear_block_mmx, 1)
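/*
 * In plain C the two functions generated above amount to (a sketch, not from
 * the original file):
 *
 *     memset(blocks, 0, n * 64 * sizeof(int16_t));
 *
 * The asm clears 32 bytes per iteration, indexing from a negative offset
 * upwards so that a single "js 1b" both continues the loop and terminates it
 * when the index reaches zero.
 */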
546 
547 static void clear_block_sse(int16_t *block)
548 {
549  __asm__ volatile (
550  "xorps %%xmm0, %%xmm0 \n"
551  "movaps %%xmm0, (%0) \n"
552  "movaps %%xmm0, 16(%0) \n"
553  "movaps %%xmm0, 32(%0) \n"
554  "movaps %%xmm0, 48(%0) \n"
555  "movaps %%xmm0, 64(%0) \n"
556  "movaps %%xmm0, 80(%0) \n"
557  "movaps %%xmm0, 96(%0) \n"
558  "movaps %%xmm0, 112(%0) \n"
559  :: "r"(block)
560  : "memory"
561  );
562 }
563 
564 static void clear_blocks_sse(int16_t *blocks)
565 {
566  __asm__ volatile (
567  "xorps %%xmm0, %%xmm0 \n"
568  "mov %1, %%"REG_a" \n"
569  "1: \n"
570  "movaps %%xmm0, (%0, %%"REG_a") \n"
571  "movaps %%xmm0, 16(%0, %%"REG_a") \n"
572  "movaps %%xmm0, 32(%0, %%"REG_a") \n"
573  "movaps %%xmm0, 48(%0, %%"REG_a") \n"
574  "movaps %%xmm0, 64(%0, %%"REG_a") \n"
575  "movaps %%xmm0, 80(%0, %%"REG_a") \n"
576  "movaps %%xmm0, 96(%0, %%"REG_a") \n"
577  "movaps %%xmm0, 112(%0, %%"REG_a") \n"
578  "add $128, %%"REG_a" \n"
579  "js 1b \n"
580  :: "r"(((uint8_t *)blocks) + 128 * 6),
581  "i"(-128 * 6)
582  : "%"REG_a
583  );
584 }
585 
586 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
587 {
588  x86_reg i = 0;
589  __asm__ volatile (
590  "jmp 2f \n\t"
591  "1: \n\t"
592  "movq (%1, %0), %%mm0 \n\t"
593  "movq (%2, %0), %%mm1 \n\t"
594  "paddb %%mm0, %%mm1 \n\t"
595  "movq %%mm1, (%2, %0) \n\t"
596  "movq 8(%1, %0), %%mm0 \n\t"
597  "movq 8(%2, %0), %%mm1 \n\t"
598  "paddb %%mm0, %%mm1 \n\t"
599  "movq %%mm1, 8(%2, %0) \n\t"
600  "add $16, %0 \n\t"
601  "2: \n\t"
602  "cmp %3, %0 \n\t"
603  "js 1b \n\t"
604  : "+r"(i)
605  : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
606  );
607  for ( ; i < w; i++)
608  dst[i + 0] += src[i + 0];
609 }
610 
611 #if HAVE_7REGS
612 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
613  const uint8_t *diff, int w,
614  int *left, int *left_top)
615 {
616  x86_reg w2 = -w;
617  x86_reg x;
618  int l = *left & 0xff;
619  int tl = *left_top & 0xff;
620  int t;
621  __asm__ volatile (
622  "mov %7, %3 \n"
623  "1: \n"
624  "movzbl (%3, %4), %2 \n"
625  "mov %2, %k3 \n"
626  "sub %b1, %b3 \n"
627  "add %b0, %b3 \n"
628  "mov %2, %1 \n"
629  "cmp %0, %2 \n"
630  "cmovg %0, %2 \n"
631  "cmovg %1, %0 \n"
632  "cmp %k3, %0 \n"
633  "cmovg %k3, %0 \n"
634  "mov %7, %3 \n"
635  "cmp %2, %0 \n"
636  "cmovl %2, %0 \n"
637  "add (%6, %4), %b0 \n"
638  "mov %b0, (%5, %4) \n"
639  "inc %4 \n"
640  "jl 1b \n"
641  : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
642  : "r"(dst + w), "r"(diff + w), "rm"(top + w)
643  );
644  *left = l;
645  *left_top = tl;
646 }
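/*
 * Scalar reference for the cmov loop above (a sketch following the generic
 * median predictor; not part of the original file):
 *
 *     for (i = 0; i < w; i++) {
 *         l      = mid_pred(l, top[i], (l + top[i] - tl) & 0xFF) + diff[i];
 *         tl     = top[i];
 *         dst[i] = l;
 *     }
 *
 * where mid_pred() returns the median of its three arguments.
 */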
647 #endif
648 #endif /* HAVE_INLINE_ASM */
649 
650 void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
651 void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
652 
653 #if HAVE_INLINE_ASM
654 /* Draw an edge border of width 'w' around an image of size width x height.
655  * This MMX version can only handle w == 4, w == 8 or w == 16. */
656 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
657  int w, int h, int sides)
658 {
659  uint8_t *ptr, *last_line;
660  int i;
661 
662  last_line = buf + (height - 1) * wrap;
663  /* left and right */
664  ptr = buf;
665  if (w == 8) {
666  __asm__ volatile (
667  "1: \n\t"
668  "movd (%0), %%mm0 \n\t"
669  "punpcklbw %%mm0, %%mm0 \n\t"
670  "punpcklwd %%mm0, %%mm0 \n\t"
671  "punpckldq %%mm0, %%mm0 \n\t"
672  "movq %%mm0, -8(%0) \n\t"
673  "movq -8(%0, %2), %%mm1 \n\t"
674  "punpckhbw %%mm1, %%mm1 \n\t"
675  "punpckhwd %%mm1, %%mm1 \n\t"
676  "punpckhdq %%mm1, %%mm1 \n\t"
677  "movq %%mm1, (%0, %2) \n\t"
678  "add %1, %0 \n\t"
679  "cmp %3, %0 \n\t"
680  "jb 1b \n\t"
681  : "+r"(ptr)
682  : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
683  );
684  } else if(w==16){
685  __asm__ volatile (
686  "1: \n\t"
687  "movd (%0), %%mm0 \n\t"
688  "punpcklbw %%mm0, %%mm0 \n\t"
689  "punpcklwd %%mm0, %%mm0 \n\t"
690  "punpckldq %%mm0, %%mm0 \n\t"
691  "movq %%mm0, -8(%0) \n\t"
692  "movq %%mm0, -16(%0) \n\t"
693  "movq -8(%0, %2), %%mm1 \n\t"
694  "punpckhbw %%mm1, %%mm1 \n\t"
695  "punpckhwd %%mm1, %%mm1 \n\t"
696  "punpckhdq %%mm1, %%mm1 \n\t"
697  "movq %%mm1, (%0, %2) \n\t"
698  "movq %%mm1, 8(%0, %2) \n\t"
699  "add %1, %0 \n\t"
700  "cmp %3, %0 \n\t"
701  "jb 1b \n\t"
702  : "+r"(ptr)
703  : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
704  );
705  } else {
706  av_assert1(w == 4);
707  __asm__ volatile (
708  "1: \n\t"
709  "movd (%0), %%mm0 \n\t"
710  "punpcklbw %%mm0, %%mm0 \n\t"
711  "punpcklwd %%mm0, %%mm0 \n\t"
712  "movd %%mm0, -4(%0) \n\t"
713  "movd -4(%0, %2), %%mm1 \n\t"
714  "punpcklbw %%mm1, %%mm1 \n\t"
715  "punpckhwd %%mm1, %%mm1 \n\t"
716  "punpckhdq %%mm1, %%mm1 \n\t"
717  "movd %%mm1, (%0, %2) \n\t"
718  "add %1, %0 \n\t"
719  "cmp %3, %0 \n\t"
720  "jb 1b \n\t"
721  : "+r"(ptr)
722  : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
723  );
724  }
725 
726  /* top and bottom (and hopefully also the corners) */
727  if (sides & EDGE_TOP) {
728  for (i = 0; i < h; i += 4) {
729  ptr = buf - (i + 1) * wrap - w;
730  __asm__ volatile (
731  "1: \n\t"
732  "movq (%1, %0), %%mm0 \n\t"
733  "movq %%mm0, (%0) \n\t"
734  "movq %%mm0, (%0, %2) \n\t"
735  "movq %%mm0, (%0, %2, 2) \n\t"
736  "movq %%mm0, (%0, %3) \n\t"
737  "add $8, %0 \n\t"
738  "cmp %4, %0 \n\t"
739  "jb 1b \n\t"
740  : "+r"(ptr)
741  : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
742  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
743  );
744  }
745  }
746 
747  if (sides & EDGE_BOTTOM) {
748  for (i = 0; i < h; i += 4) {
749  ptr = last_line + (i + 1) * wrap - w;
750  __asm__ volatile (
751  "1: \n\t"
752  "movq (%1, %0), %%mm0 \n\t"
753  "movq %%mm0, (%0) \n\t"
754  "movq %%mm0, (%0, %2) \n\t"
755  "movq %%mm0, (%0, %2, 2) \n\t"
756  "movq %%mm0, (%0, %3) \n\t"
757  "add $8, %0 \n\t"
758  "cmp %4, %0 \n\t"
759  "jb 1b \n\t"
760  : "+r"(ptr)
761  : "r"((x86_reg)last_line - (x86_reg)ptr - w),
762  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
763  "r"(ptr + width + 2 * w)
764  );
765  }
766  }
767 }
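/*
 * Conceptual sketch (an explanatory addition): the left/right pass replicates
 * the first pixel of every row into the w bytes to its left and the last
 * pixel into the w bytes to its right; the EDGE_TOP / EDGE_BOTTOM passes then
 * copy the first and last padded rows (width + 2 * w bytes) outwards h times,
 * which also fills the corners.
 */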
768 #endif /* HAVE_INLINE_ASM */
769 
770 
771 #if HAVE_YASM
772 #define QPEL_OP(OPNAME, ROUNDER, RND, MMX) \
773 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
774  int stride) \
775 { \
776  ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
777 } \
778  \
779 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
780  int stride) \
781 { \
782  uint64_t temp[8]; \
783  uint8_t * const half = (uint8_t*)temp; \
784  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
785  stride, 8); \
786  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
787  stride, stride, 8); \
788 } \
789  \
790 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
791  int stride) \
792 { \
793  ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
794  stride, 8); \
795 } \
796  \
797 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
798  int stride) \
799 { \
800  uint64_t temp[8]; \
801  uint8_t * const half = (uint8_t*)temp; \
802  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
803  stride, 8); \
804  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
805  stride, 8); \
806 } \
807  \
808 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
809  int stride) \
810 { \
811  uint64_t temp[8]; \
812  uint8_t * const half = (uint8_t*)temp; \
813  ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
814  8, stride); \
815  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
816  stride, stride, 8); \
817 } \
818  \
819 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
820  int stride) \
821 { \
822  ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
823  stride, stride); \
824 } \
825  \
826 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
827  int stride) \
828 { \
829  uint64_t temp[8]; \
830  uint8_t * const half = (uint8_t*)temp; \
831  ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
832  8, stride); \
833  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
834  stride, 8); \
835 } \
836  \
837 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
838  int stride) \
839 { \
840  uint64_t half[8 + 9]; \
841  uint8_t * const halfH = ((uint8_t*)half) + 64; \
842  uint8_t * const halfHV = ((uint8_t*)half); \
843  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
844  stride, 9); \
845  ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
846  stride, 9); \
847  ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
848  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
849  stride, 8, 8); \
850 } \
851  \
852 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
853  int stride) \
854 { \
855  uint64_t half[8 + 9]; \
856  uint8_t * const halfH = ((uint8_t*)half) + 64; \
857  uint8_t * const halfHV = ((uint8_t*)half); \
858  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
859  stride, 9); \
860  ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
861  stride, 9); \
862  ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
863  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
864  stride, 8, 8); \
865 } \
866  \
867 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
868  int stride) \
869 { \
870  uint64_t half[8 + 9]; \
871  uint8_t * const halfH = ((uint8_t*)half) + 64; \
872  uint8_t * const halfHV = ((uint8_t*)half); \
873  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
874  stride, 9); \
875  ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
876  stride, 9); \
877  ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
878  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
879  stride, 8, 8); \
880 } \
881  \
882 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
883  int stride) \
884 { \
885  uint64_t half[8 + 9]; \
886  uint8_t * const halfH = ((uint8_t*)half) + 64; \
887  uint8_t * const halfHV = ((uint8_t*)half); \
888  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
889  stride, 9); \
890  ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
891  stride, 9); \
892  ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
893  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
894  stride, 8, 8); \
895 } \
896  \
897 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
898  int stride) \
899 { \
900  uint64_t half[8 + 9]; \
901  uint8_t * const halfH = ((uint8_t*)half) + 64; \
902  uint8_t * const halfHV = ((uint8_t*)half); \
903  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
904  stride, 9); \
905  ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
906  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
907  stride, 8, 8); \
908 } \
909  \
910 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
911  int stride) \
912 { \
913  uint64_t half[8 + 9]; \
914  uint8_t * const halfH = ((uint8_t*)half) + 64; \
915  uint8_t * const halfHV = ((uint8_t*)half); \
916  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
917  stride, 9); \
918  ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
919  ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
920  stride, 8, 8); \
921 } \
922  \
923 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
924  int stride) \
925 { \
926  uint64_t half[8 + 9]; \
927  uint8_t * const halfH = ((uint8_t*)half); \
928  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
929  stride, 9); \
930  ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
931  8, stride, 9); \
932  ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
933  stride, 8); \
934 } \
935  \
936 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
937  int stride) \
938 { \
939  uint64_t half[8 + 9]; \
940  uint8_t * const halfH = ((uint8_t*)half); \
941  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
942  stride, 9); \
943  ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
944  stride, 9); \
945  ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
946  stride, 8); \
947 } \
948  \
949 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
950  int stride) \
951 { \
952  uint64_t half[9]; \
953  uint8_t * const halfH = ((uint8_t*)half); \
954  ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
955  stride, 9); \
956  ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
957  stride, 8); \
958 } \
959  \
960 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
961  int stride) \
962 { \
963  ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
964 } \
965  \
966 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
967  int stride) \
968 { \
969  uint64_t temp[32]; \
970  uint8_t * const half = (uint8_t*)temp; \
971  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
972  stride, 16); \
973  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
974  stride, 16); \
975 } \
976  \
977 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
978  int stride) \
979 { \
980  ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
981  stride, stride, 16);\
982 } \
983  \
984 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
985  int stride) \
986 { \
987  uint64_t temp[32]; \
988  uint8_t * const half = (uint8_t*)temp; \
989  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
990  stride, 16); \
991  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
992  stride, stride, 16); \
993 } \
994  \
995 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
996  int stride) \
997 { \
998  uint64_t temp[32]; \
999  uint8_t * const half = (uint8_t*)temp; \
1000  ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1001  stride); \
1002  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
1003  stride, 16); \
1004 } \
1005  \
1006 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
1007  int stride) \
1008 { \
1009  ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
1010  stride, stride); \
1011 } \
1012  \
1013 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
1014  int stride) \
1015 { \
1016  uint64_t temp[32]; \
1017  uint8_t * const half = (uint8_t*)temp; \
1018  ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1019  stride); \
1020  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
1021  stride, stride, 16); \
1022 } \
1023  \
1024 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
1025  int stride) \
1026 { \
1027  uint64_t half[16 * 2 + 17 * 2]; \
1028  uint8_t * const halfH = ((uint8_t*)half) + 256; \
1029  uint8_t * const halfHV = ((uint8_t*)half); \
1030  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1031  stride, 17); \
1032  ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1033  stride, 17); \
1034  ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1035  16, 16); \
1036  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1037  stride, 16, 16); \
1038 } \
1039  \
1040 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1041  int stride) \
1042 { \
1043  uint64_t half[16 * 2 + 17 * 2]; \
1044  uint8_t * const halfH = ((uint8_t*)half) + 256; \
1045  uint8_t * const halfHV = ((uint8_t*)half); \
1046  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1047  stride, 17); \
1048  ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1049  stride, 17); \
1050  ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1051  16, 16); \
1052  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1053  stride, 16, 16); \
1054 } \
1055  \
1056 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1057  int stride) \
1058 { \
1059  uint64_t half[16 * 2 + 17 * 2]; \
1060  uint8_t * const halfH = ((uint8_t*)half) + 256; \
1061  uint8_t * const halfHV = ((uint8_t*)half); \
1062  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1063  stride, 17); \
1064  ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1065  stride, 17); \
1066  ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1067  16, 16); \
1068  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1069  stride, 16, 16); \
1070 } \
1071  \
1072 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1073  int stride) \
1074 { \
1075  uint64_t half[16 * 2 + 17 * 2]; \
1076  uint8_t * const halfH = ((uint8_t*)half) + 256; \
1077  uint8_t * const halfHV = ((uint8_t*)half); \
1078  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1079  stride, 17); \
1080  ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1081  stride, 17); \
1082  ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1083  16, 16); \
1084  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1085  stride, 16, 16); \
1086 } \
1087  \
1088 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1089  int stride) \
1090 { \
1091  uint64_t half[16 * 2 + 17 * 2]; \
1092  uint8_t * const halfH = ((uint8_t*)half) + 256; \
1093  uint8_t * const halfHV = ((uint8_t*)half); \
1094  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1095  stride, 17); \
1096  ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1097  16, 16); \
1098  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1099  stride, 16, 16); \
1100 } \
1101  \
1102 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1103  int stride) \
1104 { \
1105  uint64_t half[16 * 2 + 17 * 2]; \
1106  uint8_t * const halfH = ((uint8_t*)half) + 256; \
1107  uint8_t * const halfHV = ((uint8_t*)half); \
1108  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1109  stride, 17); \
1110  ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1111  16, 16); \
1112  ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1113  stride, 16, 16); \
1114 } \
1115  \
1116 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1117  int stride) \
1118 { \
1119  uint64_t half[17 * 2]; \
1120  uint8_t * const halfH = ((uint8_t*)half); \
1121  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1122  stride, 17); \
1123  ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1124  stride, 17); \
1125  ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1126  stride, 16); \
1127 } \
1128  \
1129 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1130  int stride) \
1131 { \
1132  uint64_t half[17 * 2]; \
1133  uint8_t * const halfH = ((uint8_t*)half); \
1134  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1135  stride, 17); \
1136  ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1137  stride, 17); \
1138  ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1139  stride, 16); \
1140 } \
1141  \
1142 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1143  int stride) \
1144 { \
1145  uint64_t half[17 * 2]; \
1146  uint8_t * const halfH = ((uint8_t*)half); \
1147  ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1148  stride, 17); \
1149  ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1150  stride, 16); \
1151 }
1152 
1153 QPEL_OP(put_, ff_pw_16, _, mmxext)
1154 QPEL_OP(avg_, ff_pw_16, _, mmxext)
1155 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, mmxext)
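/*
 * Naming note (an explanatory addition): the mcXY suffix encodes the
 * quarter-pel position, X horizontal and Y vertical in quarters of a pixel,
 * so mc00 is the fullpel copy, mc20 the horizontal half-pel, mc02 the
 * vertical half-pel and mc22 the centre position; SET_QPEL_FUNCS() below
 * maps them to pixels_tab[idx][X + 4 * Y].
 */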
1156 #endif /* HAVE_YASM */
1157 
1158 
1159 #if HAVE_INLINE_ASM
1160 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1161 {
1162  put_pixels8_xy2_mmx(dst, src, stride, 8);
1163 }
1164 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1165 {
1166  put_pixels16_xy2_mmx(dst, src, stride, 16);
1167 }
1168 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1169 {
1170  avg_pixels8_xy2_mmx(dst, src, stride, 8);
1171 }
1172 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1173 {
1174  avg_pixels16_xy2_mmx(dst, src, stride, 16);
1175 }
1176 
1177 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
1178  ptrdiff_t linesize, int block_w, int block_h,
1179  int src_x, int src_y, int w, int h);
1180 
1181 static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
1182  int stride, int h, int ox, int oy,
1183  int dxx, int dxy, int dyx, int dyy,
1184  int shift, int r, int width, int height,
1185  emulated_edge_mc_func *emu_edge_fn)
1186 {
1187  const int w = 8;
1188  const int ix = ox >> (16 + shift);
1189  const int iy = oy >> (16 + shift);
1190  const int oxs = ox >> 4;
1191  const int oys = oy >> 4;
1192  const int dxxs = dxx >> 4;
1193  const int dxys = dxy >> 4;
1194  const int dyxs = dyx >> 4;
1195  const int dyys = dyy >> 4;
1196  const uint16_t r4[4] = { r, r, r, r };
1197  const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1198  const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1199  const uint64_t shift2 = 2 * shift;
1200 #define MAX_STRIDE 4096U
1201 #define MAX_H 8U
1202  uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
1203  int x, y;
1204 
1205  const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1206  const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1207  const int dxh = dxy * (h - 1);
1208  const int dyw = dyx * (w - 1);
1209  int need_emu = (unsigned)ix >= width - w ||
1210  (unsigned)iy >= height - h;
1211 
1212  if ( // non-constant fullpel offset (3% of blocks)
1213  ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1214  (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1215  // uses more than 16 bits of subpel mv (only at huge resolution)
1216  || (dxx | dxy | dyx | dyy) & 15
1217  || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
1218  // FIXME could still use mmx for some of the rows
1219  ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1220  shift, r, width, height);
1221  return;
1222  }
1223 
1224  src += ix + iy * stride;
1225  if (need_emu) {
1226  emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
1227  src = edge_buf;
1228  }
1229 
1230  __asm__ volatile (
1231  "movd %0, %%mm6 \n\t"
1232  "pxor %%mm7, %%mm7 \n\t"
1233  "punpcklwd %%mm6, %%mm6 \n\t"
1234  "punpcklwd %%mm6, %%mm6 \n\t"
1235  :: "r"(1<<shift)
1236  );
1237 
1238  for (x = 0; x < w; x += 4) {
1239  uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1240  oxs - dxys + dxxs * (x + 1),
1241  oxs - dxys + dxxs * (x + 2),
1242  oxs - dxys + dxxs * (x + 3) };
1243  uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1244  oys - dyys + dyxs * (x + 1),
1245  oys - dyys + dyxs * (x + 2),
1246  oys - dyys + dyxs * (x + 3) };
1247 
1248  for (y = 0; y < h; y++) {
1249  __asm__ volatile (
1250  "movq %0, %%mm4 \n\t"
1251  "movq %1, %%mm5 \n\t"
1252  "paddw %2, %%mm4 \n\t"
1253  "paddw %3, %%mm5 \n\t"
1254  "movq %%mm4, %0 \n\t"
1255  "movq %%mm5, %1 \n\t"
1256  "psrlw $12, %%mm4 \n\t"
1257  "psrlw $12, %%mm5 \n\t"
1258  : "+m"(*dx4), "+m"(*dy4)
1259  : "m"(*dxy4), "m"(*dyy4)
1260  );
1261 
1262  __asm__ volatile (
1263  "movq %%mm6, %%mm2 \n\t"
1264  "movq %%mm6, %%mm1 \n\t"
1265  "psubw %%mm4, %%mm2 \n\t"
1266  "psubw %%mm5, %%mm1 \n\t"
1267  "movq %%mm2, %%mm0 \n\t"
1268  "movq %%mm4, %%mm3 \n\t"
1269  "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1270  "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1271  "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1272  "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
1273 
1274  "movd %4, %%mm5 \n\t"
1275  "movd %3, %%mm4 \n\t"
1276  "punpcklbw %%mm7, %%mm5 \n\t"
1277  "punpcklbw %%mm7, %%mm4 \n\t"
1278  "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1279  "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1280 
1281  "movd %2, %%mm5 \n\t"
1282  "movd %1, %%mm4 \n\t"
1283  "punpcklbw %%mm7, %%mm5 \n\t"
1284  "punpcklbw %%mm7, %%mm4 \n\t"
1285  "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1286  "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
1287  "paddw %5, %%mm1 \n\t"
1288  "paddw %%mm3, %%mm2 \n\t"
1289  "paddw %%mm1, %%mm0 \n\t"
1290  "paddw %%mm2, %%mm0 \n\t"
1291 
1292  "psrlw %6, %%mm0 \n\t"
1293  "packuswb %%mm0, %%mm0 \n\t"
1294  "movd %%mm0, %0 \n\t"
1295 
1296  : "=m"(dst[x + y * stride])
1297  : "m"(src[0]), "m"(src[1]),
1298  "m"(src[stride]), "m"(src[stride + 1]),
1299  "m"(*r4), "m"(shift2)
1300  );
1301  src += stride;
1302  }
1303  src += 4 - h * stride;
1304  }
1305 }
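/*
 * Per-pixel reference for the inner loop above (a sketch; s = 1 << shift and
 * dx, dy are the fractional offsets kept in mm4/mm5):
 *
 *     dst[x] = (  src[0]          * (s - dx) * (s - dy)
 *               + src[1]          *      dx  * (s - dy)
 *               + src[stride]     * (s - dx) *      dy
 *               + src[stride + 1] *      dx  *      dy
 *               + r) >> (2 * shift);
 *
 * i.e. a bilinear blend of the four neighbouring pixels, rounded by r, as in
 * the generic ff_gmc_c() fallback used above.
 */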
1306 
1307 #if CONFIG_VIDEODSP
1308 #if HAVE_YASM
1309 #if ARCH_X86_32
1310 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1311  int stride, int h, int ox, int oy,
1312  int dxx, int dxy, int dyx, int dyy,
1313  int shift, int r, int width, int height)
1314 {
1315  gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1316  width, height, &ff_emulated_edge_mc_8);
1317 }
1318 #endif
1319 static void gmc_sse(uint8_t *dst, uint8_t *src,
1320  int stride, int h, int ox, int oy,
1321  int dxx, int dxy, int dyx, int dyy,
1322  int shift, int r, int width, int height)
1323 {
1324  gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1325  width, height, &ff_emulated_edge_mc_8);
1326 }
1327 #else
1328 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1329  int stride, int h, int ox, int oy,
1330  int dxx, int dxy, int dyx, int dyy,
1331  int shift, int r, int width, int height)
1332 {
1333  gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1334  width, height, &ff_emulated_edge_mc_8);
1335 }
1336 #endif
1337 #endif
1338 
1339 #endif /* HAVE_INLINE_ASM */
1340 
1341 void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1342  ptrdiff_t line_size, int h);
1343 void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1344  ptrdiff_t line_size, int h);
1345 
1346 #if HAVE_INLINE_ASM
1347 
1348 /* CAVS-specific */
1349 void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1350 {
1351  put_pixels8_mmx(dst, src, stride, 8);
1352 }
1353 
1354 void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1355 {
1356  avg_pixels8_mmx(dst, src, stride, 8);
1357 }
1358 
1359 void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1360 {
1361  put_pixels16_mmx(dst, src, stride, 16);
1362 }
1363 
1364 void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1365 {
1366  avg_pixels16_mmx(dst, src, stride, 16);
1367 }
1368 
1369 /* VC-1-specific */
1370 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
1371  int stride, int rnd)
1372 {
1373  put_pixels8_mmx(dst, src, stride, 8);
1374 }
1375 
1376 #if CONFIG_DIRAC_DECODER
1377 #define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
1378 void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1379 {\
1380  if (h&3)\
1381  ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
1382  else\
1383  OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
1384 }\
1385 void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1386 {\
1387  if (h&3)\
1388  ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
1389  else\
1390  OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
1391 }\
1392 void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1393 {\
1394  if (h&3) {\
1395  ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
1396  } else {\
1397  OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
1398  OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
1399  }\
1400 }
1401 
1402 #if HAVE_MMX_INLINE
1403 DIRAC_PIXOP(put, put, mmx)
1404 DIRAC_PIXOP(avg, avg, mmx)
1405 #endif
1406 
1407 #if HAVE_YASM
1408 DIRAC_PIXOP(avg, ff_avg, mmxext)
1409 
1410 void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1411 {
1412  if (h&3)
1413  ff_put_dirac_pixels16_c(dst, src, stride, h);
1414  else
1415  ff_put_pixels16_sse2(dst, src[0], stride, h);
1416 }
1417 void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1418 {
1419  if (h&3)
1420  ff_avg_dirac_pixels16_c(dst, src, stride, h);
1421  else
1422  ff_avg_pixels16_sse2(dst, src[0], stride, h);
1423 }
1424 void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1425 {
1426  if (h&3) {
1427  ff_put_dirac_pixels32_c(dst, src, stride, h);
1428  } else {
1429  ff_put_pixels16_sse2(dst , src[0] , stride, h);
1430  ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
1431  }
1432 }
1433 void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1434 {
1435  if (h&3) {
1436  ff_avg_dirac_pixels32_c(dst, src, stride, h);
1437  } else {
1438  ff_avg_pixels16_sse2(dst , src[0] , stride, h);
1439  ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
1440  }
1441 }
1442 #endif
1443 #endif
1444 
1445 /* XXX: These functions should be removed as soon as all IDCTs have been
1446  * converted. */
1447 #if CONFIG_GPL
1448 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
1449  int16_t *block)
1450 {
1451  ff_mmx_idct(block);
1452  ff_put_pixels_clamped_mmx(block, dest, line_size);
1453 }
1454 
1455 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
1456  int16_t *block)
1457 {
1458  ff_mmx_idct(block);
1459  ff_add_pixels_clamped_mmx(block, dest, line_size);
1460 }
1461 
1462 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
1463  int16_t *block)
1464 {
1465  ff_mmxext_idct(block);
1466  ff_put_pixels_clamped_mmx(block, dest, line_size);
1467 }
1468 
1469 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
1470  int16_t *block)
1471 {
1472  ff_mmxext_idct(block);
1473  ff_add_pixels_clamped_mmx(block, dest, line_size);
1474 }
1475 #endif
1476 
1477 static void vector_clipf_sse(float *dst, const float *src,
1478  float min, float max, int len)
1479 {
1480  x86_reg i = (len - 16) * 4;
1481  __asm__ volatile (
1482  "movss %3, %%xmm4 \n\t"
1483  "movss %4, %%xmm5 \n\t"
1484  "shufps $0, %%xmm4, %%xmm4 \n\t"
1485  "shufps $0, %%xmm5, %%xmm5 \n\t"
1486  "1: \n\t"
1487  "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
1488  "movaps 16(%2, %0), %%xmm1 \n\t"
1489  "movaps 32(%2, %0), %%xmm2 \n\t"
1490  "movaps 48(%2, %0), %%xmm3 \n\t"
1491  "maxps %%xmm4, %%xmm0 \n\t"
1492  "maxps %%xmm4, %%xmm1 \n\t"
1493  "maxps %%xmm4, %%xmm2 \n\t"
1494  "maxps %%xmm4, %%xmm3 \n\t"
1495  "minps %%xmm5, %%xmm0 \n\t"
1496  "minps %%xmm5, %%xmm1 \n\t"
1497  "minps %%xmm5, %%xmm2 \n\t"
1498  "minps %%xmm5, %%xmm3 \n\t"
1499  "movaps %%xmm0, (%1, %0) \n\t"
1500  "movaps %%xmm1, 16(%1, %0) \n\t"
1501  "movaps %%xmm2, 32(%1, %0) \n\t"
1502  "movaps %%xmm3, 48(%1, %0) \n\t"
1503  "sub $64, %0 \n\t"
1504  "jge 1b \n\t"
1505  : "+&r"(i)
1506  : "r"(dst), "r"(src), "m"(min), "m"(max)
1507  : "memory"
1508  );
1509 }
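/*
 * Scalar equivalent of the SSE loop above (a sketch; the 64-byte unrolling
 * assumes len is a multiple of 16):
 *
 *     for (i = 0; i < len; i++)
 *         dst[i] = FFMIN(FFMAX(src[i], min), max);
 */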
1510 
1511 #endif /* HAVE_INLINE_ASM */
1512 
1513 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
1514  int order);
1515 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
1516  int order);
1517 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
1518  const int16_t *v3,
1519  int order, int mul);
1520 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
1521  const int16_t *v3,
1522  int order, int mul);
1523 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
1524  const int16_t *v3,
1525  int order, int mul);
1526 
1527 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
1528  const int16_t *window, unsigned int len);
1529 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
1530  const int16_t *window, unsigned int len);
1531 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
1532  const int16_t *window, unsigned int len);
1533 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
1534  const int16_t *window, unsigned int len);
1535 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
1536  const int16_t *window, unsigned int len);
1537 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
1538  const int16_t *window, unsigned int len);
1539 
1540 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
1541 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
1542 
1542 
1543 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
1544  const uint8_t *diff, int w,
1545  int *left, int *left_top);
1546 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
1547  int w, int left);
1548 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
1549  int w, int left);
1550 
1551 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
1552  int32_t min, int32_t max, unsigned int len);
1553 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
1554  int32_t min, int32_t max, unsigned int len);
1555 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
1556  int32_t min, int32_t max, unsigned int len);
1557 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
1558  int32_t min, int32_t max, unsigned int len);
1559 
1560 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
1561  do { \
1562  c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
1563  c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
1564  c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
1565  c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
1566  c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
1567  c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
1568  c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
1569  c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
1570  c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
1571  c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
1572  c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
1573  c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
1574  c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
1575  c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
1576  c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
1577  c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
1578  } while (0)
1579 
1580 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
1581  do { \
1582  c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
1583  c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
1584  c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
1585  c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
1586  } while (0)
1587 
1588 static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
1589  int mm_flags)
1590 {
1591  const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1592 
1593 #if HAVE_INLINE_ASM
1594  c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
1595  c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
1596  c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
1597 
1598  if (!high_bit_depth) {
1599  c->clear_block = clear_block_mmx;
1600  c->clear_blocks = clear_blocks_mmx;
1601  c->draw_edges = draw_edges_mmx;
1602 
1603  SET_HPEL_FUNCS(put, [0], 16, mmx);
1604  SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
1605  SET_HPEL_FUNCS(avg, [0], 16, mmx);
1606  SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
1607  SET_HPEL_FUNCS(put, [1], 8, mmx);
1608  SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
1609  SET_HPEL_FUNCS(avg, [1], 8, mmx);
1610  }
1611 
1612 #if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
1613  c->gmc = gmc_mmx;
1614 #endif
1615 
1616  c->add_bytes = add_bytes_mmx;
1617 #endif /* HAVE_INLINE_ASM */
1618 
1619 #if HAVE_YASM
1620  if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1621  c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
1622  c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
1623  }
1624 
1625  c->vector_clip_int32 = ff_vector_clip_int32_mmx;
1626 #endif
1627 
1628 }
1629 
1630 static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
1631  int mm_flags)
1632 {
1633  const int bit_depth = avctx->bits_per_raw_sample;
1634  const int high_bit_depth = bit_depth > 8;
1635 
1636 #if HAVE_YASM
1637  SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
1638  SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
1639 
1640  SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
1641  SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
1642  SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
1643  SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
1644 
1645  if (!high_bit_depth) {
1646  c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
1647  c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
1648 
1649  c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
1650  c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
1651  c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
1652 
1653  c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
1654  c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
1655 
1656  c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
1657  c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
1658  c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
1659  }
1660 
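 /* The no-rounding and xy2 averaging kernels below round differently from
  * the bit-exact C reference, so they are only installed when
  * CODEC_FLAG_BITEXACT is not requested. */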
1661  if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
1662  if (!high_bit_depth) {
1663  c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
1664  c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
1665  c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
1666  c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
1667 
1668  c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
1669  c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
1670  }
1671  }
1672 #endif /* HAVE_YASM */
1673 
1674 #if HAVE_MMXEXT_EXTERNAL
1675  if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
1676  avctx->codec_id == AV_CODEC_ID_THEORA)) {
1677  c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
1678  c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
1679  }
1680 
1681  /* slower than cmov version on AMD */
1682  if (!(mm_flags & AV_CPU_FLAG_3DNOW))
1683  c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
1684 
1685  c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
1686  c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
1687 
1688  if (avctx->flags & CODEC_FLAG_BITEXACT) {
1689  c->apply_window_int16 = ff_apply_window_int16_mmxext;
1690  } else {
1691  c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
1692  }
1693 #endif /* HAVE_MMXEXT_EXTERNAL */
1694 }
1695 
1696 static av_cold void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
1697  int mm_flags)
1698 {
1699  const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1700 
1701 #if HAVE_YASM
1702  if (!high_bit_depth) {
1703  c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
1704  c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
1705 
1706  c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
1707  c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
1708  c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;
1709 
1710  c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
1711  c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
1712 
1713  c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
1714  c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
1715  c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
1716 
1717  if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
1718  c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
1719  c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
1720  c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
1721  c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
1722 
1723  c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
1724  c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
1725  }
1726  }
1727 
1728  if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
1729  avctx->codec_id == AV_CODEC_ID_THEORA)) {
1730  c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
1731  c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
1732  }
1733 #endif /* HAVE_YASM */
1734 }
1735 
1736 static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
1737  int mm_flags)
1738 {
1739  const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1740 
1741 #if HAVE_INLINE_ASM
1742  if (!high_bit_depth) {
1743  if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
1744  /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
1745  c->clear_block = clear_block_sse;
1746  c->clear_blocks = clear_blocks_sse;
1747  }
1748  }
1749 
1750  c->vector_clipf = vector_clipf_sse;
1751 #endif /* HAVE_INLINE_ASM */
1752 
1753 #if HAVE_YASM
1754 #if HAVE_INLINE_ASM && CONFIG_VIDEODSP
1755  c->gmc = gmc_sse;
1756 #endif
1757 #endif /* HAVE_YASM */
1758 }
1759 
1760 static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
1761  int mm_flags)
1762 {
1763  const int bit_depth = avctx->bits_per_raw_sample;
1764  const int high_bit_depth = bit_depth > 8;
1765 
1766 #if HAVE_SSE2_INLINE
1767  if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
1768  c->idct_put = ff_idct_xvid_sse2_put;
1769  c->idct_add = ff_idct_xvid_sse2_add;
1770  c->idct = ff_idct_xvid_sse2;
1771  c->idct_permutation_type = FF_SSE2_IDCT_PERM;
1772  }
1773 #endif /* HAVE_SSE2_INLINE */
1774 
1775 #if HAVE_SSE2_EXTERNAL
1776  if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1777  // these functions are slower than mmx on AMD, but faster on Intel
1778  if (!high_bit_depth) {
1779  c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
1780  c->put_no_rnd_pixels_tab[0][0] = ff_put_no_rnd_pixels16_sse2;
1781  c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
1782  }
1783  }
1784 
1785  c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
1786  c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
1787  if (mm_flags & AV_CPU_FLAG_ATOM) {
1788  c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
1789  } else {
1790  c->vector_clip_int32 = ff_vector_clip_int32_sse2;
1791  }
1792  if (avctx->flags & CODEC_FLAG_BITEXACT) {
1793  c->apply_window_int16 = ff_apply_window_int16_sse2;
1794  } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1795  c->apply_window_int16 = ff_apply_window_int16_round_sse2;
1796  }
1797  c->bswap_buf = ff_bswap32_buf_sse2;
1798 #endif /* HAVE_SSE2_EXTERNAL */
1799 }
1800 
1801 static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
1802  int mm_flags)
1803 {
1804 #if HAVE_SSSE3_EXTERNAL
1805  c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
1806  if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
1807  c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
1808 
1809  if (mm_flags & AV_CPU_FLAG_ATOM)
1810  c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
1811  else
1812  c->apply_window_int16 = ff_apply_window_int16_ssse3;
1813  if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
1814  c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
1815  c->bswap_buf = ff_bswap32_buf_ssse3;
1816 #endif /* HAVE_SSSE3_EXTERNAL */
1817 }
1818 
1819 static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
1820  int mm_flags)
1821 {
1822 #if HAVE_SSE4_EXTERNAL
1823  c->vector_clip_int32 = ff_vector_clip_int32_sse4;
1824 #endif /* HAVE_SSE4_EXTERNAL */
1825 }
1826 
1827 av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
1828 {
1829  int mm_flags = av_get_cpu_flags();
1830 
1831 #if HAVE_7REGS && HAVE_INLINE_ASM
1832  if (mm_flags & AV_CPU_FLAG_CMOV)
1833  c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
1834 #endif
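/* The cmov-based median predictor needs CMOV plus seven free general-purpose
 * registers (HAVE_7REGS); on CPUs without 3DNow it may later be replaced by
 * the MMXEXT version in dsputil_init_mmxext() (see the "slower than cmov
 * version on AMD" note there). */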
1835 
1836  if (mm_flags & AV_CPU_FLAG_MMX) {
1837 #if HAVE_INLINE_ASM
1838  const int idct_algo = avctx->idct_algo;
1839 
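 /* IDCT selection: each implementation is installed together with the
  * coefficient permutation it expects (c->idct_permutation_type), so callers
  * can pre-permute quantization matrices to match the chosen IDCT. */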
1840  if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
1841  if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
1842  c->idct_put = ff_simple_idct_put_mmx;
1843  c->idct_add = ff_simple_idct_add_mmx;
1844  c->idct = ff_simple_idct_mmx;
1845  c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
1846 #if CONFIG_GPL
1847  } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
1848  if (mm_flags & AV_CPU_FLAG_MMX2) {
1849  c->idct_put = ff_libmpeg2mmx2_idct_put;
1850  c->idct_add = ff_libmpeg2mmx2_idct_add;
1851  c->idct = ff_mmxext_idct;
1852  } else {
1853  c->idct_put = ff_libmpeg2mmx_idct_put;
1854  c->idct_add = ff_libmpeg2mmx_idct_add;
1855  c->idct = ff_mmx_idct;
1856  }
1857  c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
1858 #endif
1859  } else if (idct_algo == FF_IDCT_XVIDMMX) {
1860  if (mm_flags & AV_CPU_FLAG_SSE2) {
1861  c->idct_put = ff_idct_xvid_sse2_put;
1862  c->idct_add = ff_idct_xvid_sse2_add;
1863  c->idct = ff_idct_xvid_sse2;
1864  c->idct_permutation_type = FF_SSE2_IDCT_PERM;
1865  } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
1866  c->idct_put = ff_idct_xvid_mmxext_put;
1867  c->idct_add = ff_idct_xvid_mmxext_add;
1868  c->idct = ff_idct_xvid_mmxext;
1869  } else {
1870  c->idct_put = ff_idct_xvid_mmx_put;
1871  c->idct_add = ff_idct_xvid_mmx_add;
1872  c->idct = ff_idct_xvid_mmx;
1873  }
1874  }
1875  }
1876 #endif /* HAVE_INLINE_ASM */
1877 
1878  dsputil_init_mmx(c, avctx, mm_flags);
1879  }
1880 
1881  if (mm_flags & AV_CPU_FLAG_MMXEXT)
1882  dsputil_init_mmxext(c, avctx, mm_flags);
1883 
1884  if (mm_flags & AV_CPU_FLAG_3DNOW)
1885  dsputil_init_3dnow(c, avctx, mm_flags);
1886 
1887  if (mm_flags & AV_CPU_FLAG_SSE)
1888  dsputil_init_sse(c, avctx, mm_flags);
1889 
1890  if (mm_flags & AV_CPU_FLAG_SSE2)
1891  dsputil_init_sse2(c, avctx, mm_flags);
1892 
1893  if (mm_flags & AV_CPU_FLAG_SSSE3)
1894  dsputil_init_ssse3(c, avctx, mm_flags);
1895 
1896  if (mm_flags & AV_CPU_FLAG_SSE4)
1897  dsputil_init_sse4(c, avctx, mm_flags);
1898 
1899  if (CONFIG_ENCODERS)
1900  ff_dsputilenc_init_mmx(c, avctx);
1901 }
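/* Usage sketch (the real call site is the generic initializer in dsputil.c):
 *
 *     DSPContext dsp;
 *     ff_dsputil_init(&dsp, avctx);  // sets the C defaults, then on x86
 *                                    // calls ff_dsputil_init_mmx(&dsp, avctx),
 *                                    // replacing pointers with the SIMD
 *                                    // versions selected above.
 */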