dsputil_mmx.c
1 /*
2  * MMX optimized DSP utils
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  *
22  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
23  */
24 
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86/asm.h"
27 #include "libavcodec/dsputil.h"
28 #include "libavcodec/h264dsp.h"
29 #include "libavcodec/mpegvideo.h"
30 #include "libavcodec/simple_idct.h"
31 #include "dsputil_mmx.h"
32 #include "idct_xvid.h"
33 #include "diracdsp_mmx.h"
34 
35 //#undef NDEBUG
36 //#include <assert.h>
37 
38 /* pixel operations */
39 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
40 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
41 
42 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
43  { 0x8000000080000000ULL, 0x8000000080000000ULL };
44 
45 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
51 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
52 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
53 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
55 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
56 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
57 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27) = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
58 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28) = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
59 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
60 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
61 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
62 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63) = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
63 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
64 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
65 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
66 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
67 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
68 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
69 
70 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
71 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
72 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
73 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4) = { 0x0404040404040404ULL, 0x0404040404040404ULL };
74 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7) = 0x0707070707070707ULL;
75 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F) = 0x1F1F1F1F1F1F1F1FULL;
76 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
77 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
78 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81) = 0x8181818181818181ULL;
79 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1) = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
80 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8) = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
81 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
82 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
83 
84 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
85 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
86 
87 #if HAVE_INLINE_ASM
88 
89 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
90 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
91 
92 #define MOVQ_BFE(regd) \
93  __asm__ volatile ( \
94  "pcmpeqd %%"#regd", %%"#regd" \n\t" \
95  "paddb %%"#regd", %%"#regd" \n\t" ::)
96 
97 #ifndef PIC
98 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
99 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
100 #else
101 // For shared libraries it is better to access constants this way:
102 // pcmpeqd -> -1
103 #define MOVQ_BONE(regd) \
104  __asm__ volatile ( \
105  "pcmpeqd %%"#regd", %%"#regd" \n\t" \
106  "psrlw $15, %%"#regd" \n\t" \
107  "packuswb %%"#regd", %%"#regd" \n\t" ::)
108 
109 #define MOVQ_WTWO(regd) \
110  __asm__ volatile ( \
111  "pcmpeqd %%"#regd", %%"#regd" \n\t" \
112  "psrlw $15, %%"#regd" \n\t" \
113  "psllw $1, %%"#regd" \n\t"::)
114 
115 #endif
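/* Hedged aside (not part of the original file): a plain-C sketch of the value
 * MOVQ_BONE synthesizes.  Under PIC the address of ff_bone may not be usable
 * directly, so the constant is built in-register: pcmpeqd yields all ones,
 * psrlw $15 leaves 0x0001 in each 16-bit lane, and packuswb narrows that to
 * 0x01 in every byte.  The helper name is illustrative only. */
static inline uint64_t movq_bone_value_sketch(void)
{
    uint64_t v = ~(uint64_t)0;              /* pcmpeqd: every lane = 0xFFFF           */
    v = (v >> 15) & 0x0001000100010001ULL;  /* psrlw $15, modelled per 16-bit lane    */
    v |= v << 8;                            /* packuswb on 0x0001 words -> 0x01 bytes */
    return v;                               /* 0x0101010101010101 == ff_bone          */
}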
116 
117 // regr is used as a temporary and holds the output result;
118 // the first argument is unmodified and the second is trashed
119 // regfe is supposed to contain 0xfefefefefefefefe
120 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
121  "movq "#rega", "#regr" \n\t" \
122  "pand "#regb", "#regr" \n\t" \
123  "pxor "#rega", "#regb" \n\t" \
124  "pand "#regfe", "#regb" \n\t" \
125  "psrlq $1, "#regb" \n\t" \
126  "paddb "#regb", "#regr" \n\t"
127 
128 #define PAVGB_MMX(rega, regb, regr, regfe) \
129  "movq "#rega", "#regr" \n\t" \
130  "por "#regb", "#regr" \n\t" \
131  "pxor "#rega", "#regb" \n\t" \
132  "pand "#regfe", "#regb" \n\t" \
133  "psrlq $1, "#regb" \n\t" \
134  "psubb "#regb", "#regr" \n\t"
135 
136 // mm6 is supposed to contain 0xfefefefefefefefe
137 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
138  "movq "#rega", "#regr" \n\t" \
139  "movq "#regc", "#regp" \n\t" \
140  "pand "#regb", "#regr" \n\t" \
141  "pand "#regd", "#regp" \n\t" \
142  "pxor "#rega", "#regb" \n\t" \
143  "pxor "#regc", "#regd" \n\t" \
144  "pand %%mm6, "#regb" \n\t" \
145  "pand %%mm6, "#regd" \n\t" \
146  "psrlq $1, "#regb" \n\t" \
147  "psrlq $1, "#regd" \n\t" \
148  "paddb "#regb", "#regr" \n\t" \
149  "paddb "#regd", "#regp" \n\t"
150 
151 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
152  "movq "#rega", "#regr" \n\t" \
153  "movq "#regc", "#regp" \n\t" \
154  "por "#regb", "#regr" \n\t" \
155  "por "#regd", "#regp" \n\t" \
156  "pxor "#rega", "#regb" \n\t" \
157  "pxor "#regc", "#regd" \n\t" \
158  "pand %%mm6, "#regb" \n\t" \
159  "pand %%mm6, "#regd" \n\t" \
160  "psrlq $1, "#regd" \n\t" \
161  "psrlq $1, "#regb" \n\t" \
162  "psubb "#regb", "#regr" \n\t" \
163  "psubb "#regd", "#regp" \n\t"
164 
165 /***********************************/
166 /* MMX no rounding */
167 #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
168 #define SET_RND MOVQ_WONE
169 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
170 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
171 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
172 
173 #include "dsputil_rnd_template.c"
174 
175 #undef DEF
176 #undef SET_RND
177 #undef PAVGBP
178 #undef PAVGB
179 /***********************************/
180 /* MMX rounding */
181 
182 #define DEF(x, y) x ## _ ## y ## _mmx
183 #define SET_RND MOVQ_WTWO
184 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
185 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
186 
187 #include "dsputil_rnd_template.c"
188 
189 #undef DEF
190 #undef SET_RND
191 #undef PAVGBP
192 #undef PAVGB
193 #undef OP_AVG
194 
195 /***********************************/
196 /* 3Dnow specific */
197 
198 #define DEF(x) x ## _3dnow
199 #define PAVGB "pavgusb"
200 #define OP_AVG PAVGB
201 #define SKIP_FOR_3DNOW
202 
203 #include "dsputil_avg_template.c"
204 
205 #undef DEF
206 #undef PAVGB
207 #undef OP_AVG
208 #undef SKIP_FOR_3DNOW
209 
210 /***********************************/
211 /* MMXEXT specific */
212 
213 #define DEF(x) x ## _mmxext
214 
215 /* pavgb was introduced with the MMXEXT instruction set */
216 #define PAVGB "pavgb"
217 #define OP_AVG PAVGB
218 
219 #include "dsputil_avg_template.c"
220 
221 #undef DEF
222 #undef PAVGB
223 #undef OP_AVG
224 
225 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
226 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
227 #define put_pixels16_mmxext put_pixels16_mmx
228 #define put_pixels8_mmxext put_pixels8_mmx
229 #define put_pixels4_mmxext put_pixels4_mmx
230 #define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
231 #define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx
232 
233 /***********************************/
234 /* standard MMX */
235 
236 void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
237  int line_size)
238 {
239  const DCTELEM *p;
240  uint8_t *pix;
241 
242  /* read the pixels */
243  p = block;
244  pix = pixels;
245  /* unrolled loop */
246  __asm__ volatile (
247  "movq (%3), %%mm0 \n\t"
248  "movq 8(%3), %%mm1 \n\t"
249  "movq 16(%3), %%mm2 \n\t"
250  "movq 24(%3), %%mm3 \n\t"
251  "movq 32(%3), %%mm4 \n\t"
252  "movq 40(%3), %%mm5 \n\t"
253  "movq 48(%3), %%mm6 \n\t"
254  "movq 56(%3), %%mm7 \n\t"
255  "packuswb %%mm1, %%mm0 \n\t"
256  "packuswb %%mm3, %%mm2 \n\t"
257  "packuswb %%mm5, %%mm4 \n\t"
258  "packuswb %%mm7, %%mm6 \n\t"
259  "movq %%mm0, (%0) \n\t"
260  "movq %%mm2, (%0, %1) \n\t"
261  "movq %%mm4, (%0, %1, 2) \n\t"
262  "movq %%mm6, (%0, %2) \n\t"
263  :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
264  "r"(p)
265  : "memory");
266  pix += line_size * 4;
267  p += 32;
268 
269 // If this were an exact copy of the code above, the compiler
270 // would generate some very strange code, thus the "r" constraint
271 // is used here as well.
272  __asm__ volatile (
273  "movq (%3), %%mm0 \n\t"
274  "movq 8(%3), %%mm1 \n\t"
275  "movq 16(%3), %%mm2 \n\t"
276  "movq 24(%3), %%mm3 \n\t"
277  "movq 32(%3), %%mm4 \n\t"
278  "movq 40(%3), %%mm5 \n\t"
279  "movq 48(%3), %%mm6 \n\t"
280  "movq 56(%3), %%mm7 \n\t"
281  "packuswb %%mm1, %%mm0 \n\t"
282  "packuswb %%mm3, %%mm2 \n\t"
283  "packuswb %%mm5, %%mm4 \n\t"
284  "packuswb %%mm7, %%mm6 \n\t"
285  "movq %%mm0, (%0) \n\t"
286  "movq %%mm2, (%0, %1) \n\t"
287  "movq %%mm4, (%0, %1, 2) \n\t"
288  "movq %%mm6, (%0, %2) \n\t"
289  :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
290  : "memory");
291 }
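/* Hedged aside (not part of the original file): a rough scalar sketch of what
 * the two asm blocks above compute.  packuswb saturates each signed 16-bit
 * DCT coefficient to the unsigned 8-bit range, so every output pixel is the
 * coefficient clipped to [0, 255].  The helper name is illustrative only. */
static inline void put_pixels_clamped_c_sketch(const DCTELEM *block,
                                               uint8_t *pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++) {
            int v = block[i * 8 + j];
            pixels[i * line_size + j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
}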
292 
293 #define put_signed_pixels_clamped_mmx_half(off) \
294  "movq "#off"(%2), %%mm1 \n\t" \
295  "movq 16 + "#off"(%2), %%mm2 \n\t" \
296  "movq 32 + "#off"(%2), %%mm3 \n\t" \
297  "movq 48 + "#off"(%2), %%mm4 \n\t" \
298  "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
299  "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
300  "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
301  "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
302  "paddb %%mm0, %%mm1 \n\t" \
303  "paddb %%mm0, %%mm2 \n\t" \
304  "paddb %%mm0, %%mm3 \n\t" \
305  "paddb %%mm0, %%mm4 \n\t" \
306  "movq %%mm1, (%0) \n\t" \
307  "movq %%mm2, (%0, %3) \n\t" \
308  "movq %%mm3, (%0, %3, 2) \n\t" \
309  "movq %%mm4, (%0, %1) \n\t"
310 
311 void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
312  int line_size)
313 {
314  x86_reg line_skip = line_size;
315  x86_reg line_skip3;
316 
317  __asm__ volatile (
318  "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
319  "lea (%3, %3, 2), %1 \n\t"
320  put_signed_pixels_clamped_mmx_half(0)
321  "lea (%0, %3, 4), %0 \n\t"
322  put_signed_pixels_clamped_mmx_half(64)
323  : "+&r"(pixels), "=&r"(line_skip3)
324  : "r"(block), "r"(line_skip)
325  : "memory");
326 }
327 
328 void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
329  int line_size)
330 {
331  const DCTELEM *p;
332  uint8_t *pix;
333  int i;
334 
335  /* read the pixels */
336  p = block;
337  pix = pixels;
338  MOVQ_ZERO(mm7);
339  i = 4;
340  do {
341  __asm__ volatile (
342  "movq (%2), %%mm0 \n\t"
343  "movq 8(%2), %%mm1 \n\t"
344  "movq 16(%2), %%mm2 \n\t"
345  "movq 24(%2), %%mm3 \n\t"
346  "movq %0, %%mm4 \n\t"
347  "movq %1, %%mm6 \n\t"
348  "movq %%mm4, %%mm5 \n\t"
349  "punpcklbw %%mm7, %%mm4 \n\t"
350  "punpckhbw %%mm7, %%mm5 \n\t"
351  "paddsw %%mm4, %%mm0 \n\t"
352  "paddsw %%mm5, %%mm1 \n\t"
353  "movq %%mm6, %%mm5 \n\t"
354  "punpcklbw %%mm7, %%mm6 \n\t"
355  "punpckhbw %%mm7, %%mm5 \n\t"
356  "paddsw %%mm6, %%mm2 \n\t"
357  "paddsw %%mm5, %%mm3 \n\t"
358  "packuswb %%mm1, %%mm0 \n\t"
359  "packuswb %%mm3, %%mm2 \n\t"
360  "movq %%mm0, %0 \n\t"
361  "movq %%mm2, %1 \n\t"
362  : "+m"(*pix), "+m"(*(pix + line_size))
363  : "r"(p)
364  : "memory");
365  pix += line_size * 2;
366  p += 16;
367  } while (--i);
368 }
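/* Hedged aside (not part of the original file): a rough scalar sketch of the
 * loop above.  Each prediction byte is zero-extended (punpcklbw/punpckhbw
 * against mm7), the signed coefficient is added with paddsw, and packuswb
 * clamps the sum back to [0, 255].  The helper name is illustrative only. */
static inline void add_pixels_clamped_c_sketch(const DCTELEM *block,
                                               uint8_t *pixels, int line_size)
{
    int i, j;
    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++) {
            int v = pixels[i * line_size + j] + block[i * 8 + j];
            pixels[i * line_size + j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
}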
369 
370 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
371  int line_size, int h)
372 {
373  __asm__ volatile (
374  "lea (%3, %3), %%"REG_a" \n\t"
375  ".p2align 3 \n\t"
376  "1: \n\t"
377  "movq (%1 ), %%mm0 \n\t"
378  "movq (%1, %3), %%mm1 \n\t"
379  "movq %%mm0, (%2) \n\t"
380  "movq %%mm1, (%2, %3) \n\t"
381  "add %%"REG_a", %1 \n\t"
382  "add %%"REG_a", %2 \n\t"
383  "movq (%1 ), %%mm0 \n\t"
384  "movq (%1, %3), %%mm1 \n\t"
385  "movq %%mm0, (%2) \n\t"
386  "movq %%mm1, (%2, %3) \n\t"
387  "add %%"REG_a", %1 \n\t"
388  "add %%"REG_a", %2 \n\t"
389  "subl $4, %0 \n\t"
390  "jnz 1b \n\t"
391  : "+g"(h), "+r"(pixels), "+r"(block)
392  : "r"((x86_reg)line_size)
393  : "%"REG_a, "memory"
394  );
395 }
396 
397 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
398  int line_size, int h)
399 {
400  __asm__ volatile (
401  "lea (%3, %3), %%"REG_a" \n\t"
402  ".p2align 3 \n\t"
403  "1: \n\t"
404  "movq (%1 ), %%mm0 \n\t"
405  "movq 8(%1 ), %%mm4 \n\t"
406  "movq (%1, %3), %%mm1 \n\t"
407  "movq 8(%1, %3), %%mm5 \n\t"
408  "movq %%mm0, (%2) \n\t"
409  "movq %%mm4, 8(%2) \n\t"
410  "movq %%mm1, (%2, %3) \n\t"
411  "movq %%mm5, 8(%2, %3) \n\t"
412  "add %%"REG_a", %1 \n\t"
413  "add %%"REG_a", %2 \n\t"
414  "movq (%1 ), %%mm0 \n\t"
415  "movq 8(%1 ), %%mm4 \n\t"
416  "movq (%1, %3), %%mm1 \n\t"
417  "movq 8(%1, %3), %%mm5 \n\t"
418  "movq %%mm0, (%2) \n\t"
419  "movq %%mm4, 8(%2) \n\t"
420  "movq %%mm1, (%2, %3) \n\t"
421  "movq %%mm5, 8(%2, %3) \n\t"
422  "add %%"REG_a", %1 \n\t"
423  "add %%"REG_a", %2 \n\t"
424  "subl $4, %0 \n\t"
425  "jnz 1b \n\t"
426  : "+g"(h), "+r"(pixels), "+r"(block)
427  : "r"((x86_reg)line_size)
428  : "%"REG_a, "memory"
429  );
430 }
431 
432 #define CLEAR_BLOCKS(name, n) \
433 static void name(DCTELEM *blocks) \
434 { \
435  __asm__ volatile ( \
436  "pxor %%mm7, %%mm7 \n\t" \
437  "mov %1, %%"REG_a" \n\t" \
438  "1: \n\t" \
439  "movq %%mm7, (%0, %%"REG_a") \n\t" \
440  "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
441  "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
442  "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
443  "add $32, %%"REG_a" \n\t" \
444  "js 1b \n\t" \
445  :: "r"(((uint8_t *)blocks) + 128 * n), \
446  "i"(-128 * n) \
447  : "%"REG_a \
448  ); \
449 }
450 CLEAR_BLOCKS(clear_blocks_mmx, 6)
451 CLEAR_BLOCKS(clear_block_mmx, 1)
452 
453 static void clear_block_sse(DCTELEM *block)
454 {
455  __asm__ volatile (
456  "xorps %%xmm0, %%xmm0 \n"
457  "movaps %%xmm0, (%0) \n"
458  "movaps %%xmm0, 16(%0) \n"
459  "movaps %%xmm0, 32(%0) \n"
460  "movaps %%xmm0, 48(%0) \n"
461  "movaps %%xmm0, 64(%0) \n"
462  "movaps %%xmm0, 80(%0) \n"
463  "movaps %%xmm0, 96(%0) \n"
464  "movaps %%xmm0, 112(%0) \n"
465  :: "r"(block)
466  : "memory"
467  );
468 }
469 
470 static void clear_blocks_sse(DCTELEM *blocks)
471 {
472  __asm__ volatile (
473  "xorps %%xmm0, %%xmm0 \n"
474  "mov %1, %%"REG_a" \n"
475  "1: \n"
476  "movaps %%xmm0, (%0, %%"REG_a") \n"
477  "movaps %%xmm0, 16(%0, %%"REG_a") \n"
478  "movaps %%xmm0, 32(%0, %%"REG_a") \n"
479  "movaps %%xmm0, 48(%0, %%"REG_a") \n"
480  "movaps %%xmm0, 64(%0, %%"REG_a") \n"
481  "movaps %%xmm0, 80(%0, %%"REG_a") \n"
482  "movaps %%xmm0, 96(%0, %%"REG_a") \n"
483  "movaps %%xmm0, 112(%0, %%"REG_a") \n"
484  "add $128, %%"REG_a" \n"
485  "js 1b \n"
486  :: "r"(((uint8_t *)blocks) + 128 * 6),
487  "i"(-128 * 6)
488  : "%"REG_a
489  );
490 }
491 
492 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
493 {
494  x86_reg i = 0;
495  __asm__ volatile (
496  "jmp 2f \n\t"
497  "1: \n\t"
498  "movq (%1, %0), %%mm0 \n\t"
499  "movq (%2, %0), %%mm1 \n\t"
500  "paddb %%mm0, %%mm1 \n\t"
501  "movq %%mm1, (%2, %0) \n\t"
502  "movq 8(%1, %0), %%mm0 \n\t"
503  "movq 8(%2, %0), %%mm1 \n\t"
504  "paddb %%mm0, %%mm1 \n\t"
505  "movq %%mm1, 8(%2, %0) \n\t"
506  "add $16, %0 \n\t"
507  "2: \n\t"
508  "cmp %3, %0 \n\t"
509  "js 1b \n\t"
510  : "+r"(i)
511  : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
512  );
513  for ( ; i < w; i++)
514  dst[i + 0] += src[i + 0];
515 }
516 
517 #if HAVE_7REGS
518 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
519  const uint8_t *diff, int w,
520  int *left, int *left_top)
521 {
522  x86_reg w2 = -w;
523  x86_reg x;
524  int l = *left & 0xff;
525  int tl = *left_top & 0xff;
526  int t;
527  __asm__ volatile (
528  "mov %7, %3 \n"
529  "1: \n"
530  "movzbl (%3, %4), %2 \n"
531  "mov %2, %k3 \n"
532  "sub %b1, %b3 \n"
533  "add %b0, %b3 \n"
534  "mov %2, %1 \n"
535  "cmp %0, %2 \n"
536  "cmovg %0, %2 \n"
537  "cmovg %1, %0 \n"
538  "cmp %k3, %0 \n"
539  "cmovg %k3, %0 \n"
540  "mov %7, %3 \n"
541  "cmp %2, %0 \n"
542  "cmovl %2, %0 \n"
543  "add (%6, %4), %b0 \n"
544  "mov %b0, (%5, %4) \n"
545  "inc %4 \n"
546  "jl 1b \n"
547  : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
548  : "r"(dst + w), "r"(diff + w), "rm"(top + w)
549  );
550  *left = l;
551  *left_top = tl;
552 }
553 #endif
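/* Hedged aside (not part of the original file): a rough C sketch of the
 * HuffYUV-style median prediction the cmov routine above implements.  Each
 * output byte is diff[i] plus the median of the left neighbour, the top
 * neighbour and their gradient prediction (left + top - top_left).  Helper
 * names are illustrative only. */
static inline int mid_pred_sketch(int a, int b, int c)
{
    if (a > b) { int t = a; a = b; b = t; }    /* sort so that a <= b  */
    return b < c ? b : (a > c ? a : c);        /* clamp c into [a, b]  */
}

static inline void add_hfyu_median_prediction_c_sketch(uint8_t *dst,
                                                       const uint8_t *top,
                                                       const uint8_t *diff,
                                                       int w, int *left,
                                                       int *left_top)
{
    int i;
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;

    for (i = 0; i < w; i++) {
        l      = (mid_pred_sketch(l, top[i], (l + top[i] - tl) & 0xff) + diff[i]) & 0xff;
        tl     = top[i];
        dst[i] = l;
    }
    *left     = l;
    *left_top = tl;
}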
554 
555 static inline void transpose4x4(uint8_t *dst, uint8_t *src, x86_reg dst_stride, x86_reg src_stride){
556  __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
557  "movd (%1), %%mm0 \n\t"
558  "add %3, %1 \n\t"
559  "movd (%1), %%mm1 \n\t"
560  "movd (%1,%3,1), %%mm2 \n\t"
561  "movd (%1,%3,2), %%mm3 \n\t"
562  "punpcklbw %%mm1, %%mm0 \n\t"
563  "punpcklbw %%mm3, %%mm2 \n\t"
564  "movq %%mm0, %%mm1 \n\t"
565  "punpcklwd %%mm2, %%mm0 \n\t"
566  "punpckhwd %%mm2, %%mm1 \n\t"
567  "movd %%mm0, (%0) \n\t"
568  "add %2, %0 \n\t"
569  "punpckhdq %%mm0, %%mm0 \n\t"
570  "movd %%mm0, (%0) \n\t"
571  "movd %%mm1, (%0,%2,1) \n\t"
572  "punpckhdq %%mm1, %%mm1 \n\t"
573  "movd %%mm1, (%0,%2,2) \n\t"
574 
575  : "+&r" (dst),
576  "+&r" (src)
577  : "r" (dst_stride),
578  "r" (src_stride)
579  : "memory"
580  );
581 }
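/* Hedged aside (not part of the original file): the plain C equivalent of the
 * MMX 4x4 byte transpose above, for reference.  The helper name is
 * illustrative only. */
static inline void transpose4x4_c_sketch(uint8_t *dst, const uint8_t *src,
                                         int dst_stride, int src_stride)
{
    int x, y;
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++)
            dst[x * dst_stride + y] = src[y * src_stride + x];
}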
582 
583 #define H263_LOOP_FILTER \
584  "pxor %%mm7, %%mm7 \n\t" \
585  "movq %0, %%mm0 \n\t" \
586  "movq %0, %%mm1 \n\t" \
587  "movq %3, %%mm2 \n\t" \
588  "movq %3, %%mm3 \n\t" \
589  "punpcklbw %%mm7, %%mm0 \n\t" \
590  "punpckhbw %%mm7, %%mm1 \n\t" \
591  "punpcklbw %%mm7, %%mm2 \n\t" \
592  "punpckhbw %%mm7, %%mm3 \n\t" \
593  "psubw %%mm2, %%mm0 \n\t" \
594  "psubw %%mm3, %%mm1 \n\t" \
595  "movq %1, %%mm2 \n\t" \
596  "movq %1, %%mm3 \n\t" \
597  "movq %2, %%mm4 \n\t" \
598  "movq %2, %%mm5 \n\t" \
599  "punpcklbw %%mm7, %%mm2 \n\t" \
600  "punpckhbw %%mm7, %%mm3 \n\t" \
601  "punpcklbw %%mm7, %%mm4 \n\t" \
602  "punpckhbw %%mm7, %%mm5 \n\t" \
603  "psubw %%mm2, %%mm4 \n\t" \
604  "psubw %%mm3, %%mm5 \n\t" \
605  "psllw $2, %%mm4 \n\t" \
606  "psllw $2, %%mm5 \n\t" \
607  "paddw %%mm0, %%mm4 \n\t" \
608  "paddw %%mm1, %%mm5 \n\t" \
609  "pxor %%mm6, %%mm6 \n\t" \
610  "pcmpgtw %%mm4, %%mm6 \n\t" \
611  "pcmpgtw %%mm5, %%mm7 \n\t" \
612  "pxor %%mm6, %%mm4 \n\t" \
613  "pxor %%mm7, %%mm5 \n\t" \
614  "psubw %%mm6, %%mm4 \n\t" \
615  "psubw %%mm7, %%mm5 \n\t" \
616  "psrlw $3, %%mm4 \n\t" \
617  "psrlw $3, %%mm5 \n\t" \
618  "packuswb %%mm5, %%mm4 \n\t" \
619  "packsswb %%mm7, %%mm6 \n\t" \
620  "pxor %%mm7, %%mm7 \n\t" \
621  "movd %4, %%mm2 \n\t" \
622  "punpcklbw %%mm2, %%mm2 \n\t" \
623  "punpcklbw %%mm2, %%mm2 \n\t" \
624  "punpcklbw %%mm2, %%mm2 \n\t" \
625  "psubusb %%mm4, %%mm2 \n\t" \
626  "movq %%mm2, %%mm3 \n\t" \
627  "psubusb %%mm4, %%mm3 \n\t" \
628  "psubb %%mm3, %%mm2 \n\t" \
629  "movq %1, %%mm3 \n\t" \
630  "movq %2, %%mm4 \n\t" \
631  "pxor %%mm6, %%mm3 \n\t" \
632  "pxor %%mm6, %%mm4 \n\t" \
633  "paddusb %%mm2, %%mm3 \n\t" \
634  "psubusb %%mm2, %%mm4 \n\t" \
635  "pxor %%mm6, %%mm3 \n\t" \
636  "pxor %%mm6, %%mm4 \n\t" \
637  "paddusb %%mm2, %%mm2 \n\t" \
638  "packsswb %%mm1, %%mm0 \n\t" \
639  "pcmpgtb %%mm0, %%mm7 \n\t" \
640  "pxor %%mm7, %%mm0 \n\t" \
641  "psubb %%mm7, %%mm0 \n\t" \
642  "movq %%mm0, %%mm1 \n\t" \
643  "psubusb %%mm2, %%mm0 \n\t" \
644  "psubb %%mm0, %%mm1 \n\t" \
645  "pand %5, %%mm1 \n\t" \
646  "psrlw $2, %%mm1 \n\t" \
647  "pxor %%mm7, %%mm1 \n\t" \
648  "psubb %%mm7, %%mm1 \n\t" \
649  "movq %0, %%mm5 \n\t" \
650  "movq %3, %%mm6 \n\t" \
651  "psubb %%mm1, %%mm5 \n\t" \
652  "paddb %%mm1, %%mm6 \n\t"
653 
654 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
655 {
656  if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
657  const int strength = ff_h263_loop_filter_strength[qscale];
658 
659  __asm__ volatile (
660  H263_LOOP_FILTER
661 
662  "movq %%mm3, %1 \n\t"
663  "movq %%mm4, %2 \n\t"
664  "movq %%mm5, %0 \n\t"
665  "movq %%mm6, %3 \n\t"
666  : "+m"(*(uint64_t*)(src - 2 * stride)),
667  "+m"(*(uint64_t*)(src - 1 * stride)),
668  "+m"(*(uint64_t*)(src + 0 * stride)),
669  "+m"(*(uint64_t*)(src + 1 * stride))
670  : "g"(2 * strength), "m"(ff_pb_FC)
671  );
672  }
673 }
674 
675 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
676 {
677  if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
678  const int strength = ff_h263_loop_filter_strength[qscale];
679  DECLARE_ALIGNED(8, uint64_t, temp)[4];
680  uint8_t *btemp = (uint8_t*)temp;
681 
682  src -= 2;
683 
684  transpose4x4(btemp, src, 8, stride);
685  transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
686  __asm__ volatile (
687  H263_LOOP_FILTER // 5 3 4 6
688 
689  : "+m"(temp[0]),
690  "+m"(temp[1]),
691  "+m"(temp[2]),
692  "+m"(temp[3])
693  : "g"(2 * strength), "m"(ff_pb_FC)
694  );
695 
696  __asm__ volatile (
697  "movq %%mm5, %%mm1 \n\t"
698  "movq %%mm4, %%mm0 \n\t"
699  "punpcklbw %%mm3, %%mm5 \n\t"
700  "punpcklbw %%mm6, %%mm4 \n\t"
701  "punpckhbw %%mm3, %%mm1 \n\t"
702  "punpckhbw %%mm6, %%mm0 \n\t"
703  "movq %%mm5, %%mm3 \n\t"
704  "movq %%mm1, %%mm6 \n\t"
705  "punpcklwd %%mm4, %%mm5 \n\t"
706  "punpcklwd %%mm0, %%mm1 \n\t"
707  "punpckhwd %%mm4, %%mm3 \n\t"
708  "punpckhwd %%mm0, %%mm6 \n\t"
709  "movd %%mm5, (%0) \n\t"
710  "punpckhdq %%mm5, %%mm5 \n\t"
711  "movd %%mm5, (%0, %2) \n\t"
712  "movd %%mm3, (%0, %2, 2) \n\t"
713  "punpckhdq %%mm3, %%mm3 \n\t"
714  "movd %%mm3, (%0, %3) \n\t"
715  "movd %%mm1, (%1) \n\t"
716  "punpckhdq %%mm1, %%mm1 \n\t"
717  "movd %%mm1, (%1, %2) \n\t"
718  "movd %%mm6, (%1, %2, 2) \n\t"
719  "punpckhdq %%mm6, %%mm6 \n\t"
720  "movd %%mm6, (%1, %3) \n\t"
721  :: "r"(src),
722  "r"(src + 4 * stride),
723  "r"((x86_reg)stride),
724  "r"((x86_reg)(3 * stride))
725  );
726  }
727 }
728 
729 /* Draw the edges of width 'w' of an image of size width x height;
730  * this MMX version handles w == 4, w == 8 and w == 16. */
731 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
732  int w, int h, int sides)
733 {
734  uint8_t *ptr, *last_line;
735  int i;
736 
737  last_line = buf + (height - 1) * wrap;
738  /* left and right */
739  ptr = buf;
740  if (w == 8) {
741  __asm__ volatile (
742  "1: \n\t"
743  "movd (%0), %%mm0 \n\t"
744  "punpcklbw %%mm0, %%mm0 \n\t"
745  "punpcklwd %%mm0, %%mm0 \n\t"
746  "punpckldq %%mm0, %%mm0 \n\t"
747  "movq %%mm0, -8(%0) \n\t"
748  "movq -8(%0, %2), %%mm1 \n\t"
749  "punpckhbw %%mm1, %%mm1 \n\t"
750  "punpckhwd %%mm1, %%mm1 \n\t"
751  "punpckhdq %%mm1, %%mm1 \n\t"
752  "movq %%mm1, (%0, %2) \n\t"
753  "add %1, %0 \n\t"
754  "cmp %3, %0 \n\t"
755  "jb 1b \n\t"
756  : "+r"(ptr)
757  : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
758  );
759  } else if(w==16){
760  __asm__ volatile (
761  "1: \n\t"
762  "movd (%0), %%mm0 \n\t"
763  "punpcklbw %%mm0, %%mm0 \n\t"
764  "punpcklwd %%mm0, %%mm0 \n\t"
765  "punpckldq %%mm0, %%mm0 \n\t"
766  "movq %%mm0, -8(%0) \n\t"
767  "movq %%mm0, -16(%0) \n\t"
768  "movq -8(%0, %2), %%mm1 \n\t"
769  "punpckhbw %%mm1, %%mm1 \n\t"
770  "punpckhwd %%mm1, %%mm1 \n\t"
771  "punpckhdq %%mm1, %%mm1 \n\t"
772  "movq %%mm1, (%0, %2) \n\t"
773  "movq %%mm1, 8(%0, %2) \n\t"
774  "add %1, %0 \n\t"
775  "cmp %3, %0 \n\t"
776  "jb 1b \n\t"
777  : "+r"(ptr)
778  : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
779  );
780  } else {
781  av_assert1(w == 4);
782  __asm__ volatile (
783  "1: \n\t"
784  "movd (%0), %%mm0 \n\t"
785  "punpcklbw %%mm0, %%mm0 \n\t"
786  "punpcklwd %%mm0, %%mm0 \n\t"
787  "movd %%mm0, -4(%0) \n\t"
788  "movd -4(%0, %2), %%mm1 \n\t"
789  "punpcklbw %%mm1, %%mm1 \n\t"
790  "punpckhwd %%mm1, %%mm1 \n\t"
791  "punpckhdq %%mm1, %%mm1 \n\t"
792  "movd %%mm1, (%0, %2) \n\t"
793  "add %1, %0 \n\t"
794  "cmp %3, %0 \n\t"
795  "jb 1b \n\t"
796  : "+r"(ptr)
797  : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
798  );
799  }
800 
801  /* top and bottom (and hopefully also the corners) */
802  if (sides & EDGE_TOP) {
803  for (i = 0; i < h; i += 4) {
804  ptr = buf - (i + 1) * wrap - w;
805  __asm__ volatile (
806  "1: \n\t"
807  "movq (%1, %0), %%mm0 \n\t"
808  "movq %%mm0, (%0) \n\t"
809  "movq %%mm0, (%0, %2) \n\t"
810  "movq %%mm0, (%0, %2, 2) \n\t"
811  "movq %%mm0, (%0, %3) \n\t"
812  "add $8, %0 \n\t"
813  "cmp %4, %0 \n\t"
814  "jb 1b \n\t"
815  : "+r"(ptr)
816  : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
817  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
818  );
819  }
820  }
821 
822  if (sides & EDGE_BOTTOM) {
823  for (i = 0; i < h; i += 4) {
824  ptr = last_line + (i + 1) * wrap - w;
825  __asm__ volatile (
826  "1: \n\t"
827  "movq (%1, %0), %%mm0 \n\t"
828  "movq %%mm0, (%0) \n\t"
829  "movq %%mm0, (%0, %2) \n\t"
830  "movq %%mm0, (%0, %2, 2) \n\t"
831  "movq %%mm0, (%0, %3) \n\t"
832  "add $8, %0 \n\t"
833  "cmp %4, %0 \n\t"
834  "jb 1b \n\t"
835  : "+r"(ptr)
836  : "r"((x86_reg)last_line - (x86_reg)ptr - w),
837  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
838  "r"(ptr + width + 2 * w)
839  );
840  }
841  }
842 }
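/* Hedged aside (not part of the original file): a scalar sketch of the
 * left/right edge extension performed above.  The first and last pixel of
 * every line are replicated 'w' positions outwards; the MMX code does the
 * same with splatted movq/movd stores.  The helper name is illustrative
 * only. */
static inline void draw_horiz_edges_c_sketch(uint8_t *buf, int wrap,
                                             int width, int height, int w)
{
    int x, y;
    for (y = 0; y < height; y++) {
        uint8_t *line = buf + y * wrap;
        for (x = 1; x <= w; x++) {
            line[-x]            = line[0];          /* extend the left edge  */
            line[width - 1 + x] = line[width - 1];  /* extend the right edge */
        }
    }
}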
843 
844 #define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd, \
845  in0, in1, in2, in7, out, OP) \
846  "paddw "#m4", "#m3" \n\t" /* x1 */ \
847  "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */ \
848  "pmullw "#m3", %%mm4 \n\t" /* 20x1 */ \
849  "movq "#in7", "#m3" \n\t" /* d */ \
850  "movq "#in0", %%mm5 \n\t" /* D */ \
851  "paddw "#m3", %%mm5 \n\t" /* x4 */ \
852  "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */ \
853  "movq "#in1", %%mm5 \n\t" /* C */ \
854  "movq "#in2", %%mm6 \n\t" /* B */ \
855  "paddw "#m6", %%mm5 \n\t" /* x3 */ \
856  "paddw "#m5", %%mm6 \n\t" /* x2 */ \
857  "paddw %%mm6, %%mm6 \n\t" /* 2x2 */ \
858  "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */ \
859  "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */ \
860  "paddw "#rnd", %%mm4 \n\t" /* x2 */ \
861  "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
862  "psraw $5, %%mm5 \n\t" \
863  "packuswb %%mm5, %%mm5 \n\t" \
864  OP(%%mm5, out, %%mm7, d)
865 
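/* Hedged aside (not part of the original file): the vertical lowpass step
 * QPEL_V_LOW evaluates per output sample, as scalar C.  It is the MPEG-4
 * quarter-pel 8-tap filter (-1, 3, -6, 20, 20, -6, 3, -1) applied to the
 * symmetric pair sums, with the rounder added before the arithmetic shift
 * and a final clamp to [0, 255] (packuswb).  The helper name and the int16_t
 * sample layout are illustrative only. */
static inline uint8_t qpel_v_low_c_sketch(const int16_t *s, int stride, int rnd)
{
    /* s points at the first of eight vertically adjacent 16-bit samples */
    int a = s[3 * stride] + s[4 * stride];   /* centre pair    (x1) */
    int b = s[2 * stride] + s[5 * stride];   /* next pair      (x2) */
    int c = s[1 * stride] + s[6 * stride];   /* next pair      (x3) */
    int d = s[0 * stride] + s[7 * stride];   /* outermost pair (x4) */
    int v = (20 * a - 6 * b + 3 * c - d + rnd) >> 5;

    return v < 0 ? 0 : v > 255 ? 255 : v;
}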
866 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMXEXT) \
867 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, \
868  uint8_t *src, \
869  int dstStride, \
870  int srcStride, \
871  int h) \
872 { \
873  uint64_t temp; \
874  \
875  __asm__ volatile ( \
876  "pxor %%mm7, %%mm7 \n\t" \
877  "1: \n\t" \
878  "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \
879  "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \
880  "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \
881  "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \
882  "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \
883  "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \
884  "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \
885  "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \
886  "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \
887  "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \
888  "psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \
889  "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \
890  "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \
891  "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \
892  "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \
893  "paddw %%mm3, %%mm5 \n\t" /* b */ \
894  "paddw %%mm2, %%mm6 \n\t" /* c */ \
895  "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
896  "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \
897  "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \
898  "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \
899  "paddw %%mm4, %%mm0 \n\t" /* a */ \
900  "paddw %%mm1, %%mm5 \n\t" /* d */ \
901  "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \
902  "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \
903  "paddw %6, %%mm6 \n\t" \
904  "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
905  "psraw $5, %%mm0 \n\t" \
906  "movq %%mm0, %5 \n\t" \
907  /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \
908  \
909  "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */ \
910  "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */ \
911  "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */ \
912  "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */ \
913  "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */ \
914  "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */ \
915  "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */ \
916  "paddw %%mm0, %%mm2 \n\t" /* b */ \
917  "paddw %%mm5, %%mm3 \n\t" /* c */ \
918  "paddw %%mm2, %%mm2 \n\t" /* 2b */ \
919  "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \
920  "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */ \
921  "psrlq $24, %%mm6 \n\t" /* IJKLM000 */ \
922  "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */ \
923  "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */ \
924  "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \
925  "paddw %%mm2, %%mm1 \n\t" /* a */ \
926  "paddw %%mm6, %%mm4 \n\t" /* d */ \
927  "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \
928  "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */ \
929  "paddw %6, %%mm1 \n\t" \
930  "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */ \
931  "psraw $5, %%mm3 \n\t" \
932  "movq %5, %%mm1 \n\t" \
933  "packuswb %%mm3, %%mm1 \n\t" \
934  OP_MMXEXT(%%mm1, (%1), %%mm4, q) \
935  /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */ \
936  \
937  "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */ \
938  "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */ \
939  "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */ \
940  "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */ \
941  "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */ \
942  "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */ \
943  "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */ \
944  "paddw %%mm1, %%mm5 \n\t" /* b */ \
945  "paddw %%mm4, %%mm0 \n\t" /* c */ \
946  "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
947  "psubw %%mm5, %%mm0 \n\t" /* c - 2b */ \
948  "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */ \
949  "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */ \
950  "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */ \
951  "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */ \
952  "paddw %%mm3, %%mm2 \n\t" /* d */ \
953  "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */ \
954  "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */ \
955  "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */ \
956  "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */ \
957  "paddw %%mm2, %%mm6 \n\t" /* a */ \
958  "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */ \
959  "paddw %6, %%mm0 \n\t" \
960  "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
961  "psraw $5, %%mm0 \n\t" \
962  /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */ \
963  /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */ \
964  \
965  "paddw %%mm5, %%mm3 \n\t" /* a */ \
966  "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */ \
967  "paddw %%mm4, %%mm6 \n\t" /* b */ \
968  "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */ \
969  "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */ \
970  "paddw %%mm1, %%mm4 \n\t" /* c */ \
971  "paddw %%mm2, %%mm5 \n\t" /* d */ \
972  "paddw %%mm6, %%mm6 \n\t" /* 2b */ \
973  "psubw %%mm6, %%mm4 \n\t" /* c - 2b */ \
974  "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */ \
975  "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */ \
976  "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */ \
977  "paddw %6, %%mm4 \n\t" \
978  "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */ \
979  "psraw $5, %%mm4 \n\t" \
980  "packuswb %%mm4, %%mm0 \n\t" \
981  OP_MMXEXT(%%mm0, 8(%1), %%mm4, q) \
982  \
983  "add %3, %0 \n\t" \
984  "add %4, %1 \n\t" \
985  "decl %2 \n\t" \
986  "jnz 1b \n\t" \
987  : "+a"(src), "+c"(dst), "+D"(h) \
988  : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), \
989  /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER) \
990  : "memory" \
991  ); \
992 } \
993  \
994 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, \
995  uint8_t *src, \
996  int dstStride, \
997  int srcStride, \
998  int h) \
999 { \
1000  __asm__ volatile ( \
1001  "pxor %%mm7, %%mm7 \n\t" \
1002  "1: \n\t" \
1003  "movq (%0), %%mm0 \n\t" /* ABCDEFGH */ \
1004  "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */ \
1005  "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */ \
1006  "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */ \
1007  "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */ \
1008  "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */ \
1009  "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */ \
1010  "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */ \
1011  "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */ \
1012  "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */ \
1013  "psllq $16, %%mm3 \n\t" /* 00ABCDEF */ \
1014  "psllq $24, %%mm4 \n\t" /* 000ABCDE */ \
1015  "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */ \
1016  "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */ \
1017  "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */ \
1018  "paddw %%mm3, %%mm5 \n\t" /* b */ \
1019  "paddw %%mm2, %%mm6 \n\t" /* c */ \
1020  "paddw %%mm5, %%mm5 \n\t" /* 2b */ \
1021  "psubw %%mm5, %%mm6 \n\t" /* c - 2b */ \
1022  "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */ \
1023  "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */ \
1024  "paddw %%mm4, %%mm0 \n\t" /* a */ \
1025  "paddw %%mm1, %%mm5 \n\t" /* d */ \
1026  "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */ \
1027  "psubw %%mm5, %%mm0 \n\t" /* 20a - d */ \
1028  "paddw %5, %%mm6 \n\t" \
1029  "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */ \
1030  "psraw $5, %%mm0 \n\t" \
1031  /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */ \
1032  \
1033  "movd 5(%0), %%mm5 \n\t" /* FGHI */ \
1034  "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */ \
1035  "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */ \
1036  "paddw %%mm5, %%mm1 \n\t" /* a */ \
1037  "paddw %%mm6, %%mm2 \n\t" /* b */ \
1038  "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */ \
1039  "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */ \
1040  "paddw %%mm6, %%mm3 \n\t" /* c */ \
1041  "paddw %%mm5, %%mm4 \n\t" /* d */ \
1042  "paddw %%mm2, %%mm2 \n\t" /* 2b */ \
1043  "psubw %%mm2, %%mm3 \n\t" /* c - 2b */ \
1044  "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */ \
1045  "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */ \
1046  "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */ \
1047  "paddw %5, %%mm1 \n\t" \
1048  "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */ \
1049  "psraw $5, %%mm3 \n\t" \
1050  "packuswb %%mm3, %%mm0 \n\t" \
1051  OP_MMXEXT(%%mm0, (%1), %%mm4, q) \
1052  \
1053  "add %3, %0 \n\t" \
1054  "add %4, %1 \n\t" \
1055  "decl %2 \n\t" \
1056  "jnz 1b \n\t" \
1057  : "+a"(src), "+c"(dst), "+d"(h) \
1058  : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), \
1059  /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER) \
1060  : "memory" \
1061  ); \
1062 }
1063 
1064 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) \
1065 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, \
1066  uint8_t *src, \
1067  int dstStride, \
1068  int srcStride) \
1069 { \
1070  uint64_t temp[17 * 4]; \
1071  uint64_t *temp_ptr = temp; \
1072  int count = 17; \
1073  \
1074  /* FIXME unroll */ \
1075  __asm__ volatile ( \
1076  "pxor %%mm7, %%mm7 \n\t" \
1077  "1: \n\t" \
1078  "movq (%0), %%mm0 \n\t" \
1079  "movq (%0), %%mm1 \n\t" \
1080  "movq 8(%0), %%mm2 \n\t" \
1081  "movq 8(%0), %%mm3 \n\t" \
1082  "punpcklbw %%mm7, %%mm0 \n\t" \
1083  "punpckhbw %%mm7, %%mm1 \n\t" \
1084  "punpcklbw %%mm7, %%mm2 \n\t" \
1085  "punpckhbw %%mm7, %%mm3 \n\t" \
1086  "movq %%mm0, (%1) \n\t" \
1087  "movq %%mm1, 17 * 8(%1) \n\t" \
1088  "movq %%mm2, 2 * 17 * 8(%1) \n\t" \
1089  "movq %%mm3, 3 * 17 * 8(%1) \n\t" \
1090  "add $8, %1 \n\t" \
1091  "add %3, %0 \n\t" \
1092  "decl %2 \n\t" \
1093  "jnz 1b \n\t" \
1094  : "+r"(src), "+r"(temp_ptr), "+r"(count) \
1095  : "r"((x86_reg)srcStride) \
1096  : "memory" \
1097  ); \
1098  \
1099  temp_ptr = temp; \
1100  count = 4; \
1101  \
1102  /* FIXME reorder for speed */ \
1103  __asm__ volatile ( \
1104  /* "pxor %%mm7, %%mm7 \n\t" */ \
1105  "1: \n\t" \
1106  "movq (%0), %%mm0 \n\t" \
1107  "movq 8(%0), %%mm1 \n\t" \
1108  "movq 16(%0), %%mm2 \n\t" \
1109  "movq 24(%0), %%mm3 \n\t" \
1110  QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
1111  QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
1112  "add %4, %1 \n\t" \
1113  QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
1114  \
1115  QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
1116  "add %4, %1 \n\t" \
1117  QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
1118  QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP) \
1119  "add %4, %1 \n\t" \
1120  QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP) \
1121  QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP) \
1122  "add %4, %1 \n\t" \
1123  QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP) \
1124  QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
1125  "add %4, %1 \n\t" \
1126  QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1), OP) \
1127  QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
1128  "add %4, %1 \n\t" \
1129  QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1), OP) \
1130  \
1131  QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
1132  "add %4, %1 \n\t" \
1133  QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1), OP) \
1134  QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
1135  \
1136  "add $136, %0 \n\t" \
1137  "add %6, %1 \n\t" \
1138  "decl %2 \n\t" \
1139  "jnz 1b \n\t" \
1140  \
1141  : "+r"(temp_ptr), "+r"(dst), "+g"(count) \
1142  : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
1143  /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), \
1144  "g"(4 - 14 * (x86_reg)dstStride) \
1145  : "memory" \
1146  ); \
1147 } \
1148  \
1149 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, \
1150  uint8_t *src, \
1151  int dstStride, \
1152  int srcStride) \
1153 { \
1154  uint64_t temp[9 * 2]; \
1155  uint64_t *temp_ptr = temp; \
1156  int count = 9; \
1157  \
1158  /* FIXME unroll */ \
1159  __asm__ volatile ( \
1160  "pxor %%mm7, %%mm7 \n\t" \
1161  "1: \n\t" \
1162  "movq (%0), %%mm0 \n\t" \
1163  "movq (%0), %%mm1 \n\t" \
1164  "punpcklbw %%mm7, %%mm0 \n\t" \
1165  "punpckhbw %%mm7, %%mm1 \n\t" \
1166  "movq %%mm0, (%1) \n\t" \
1167  "movq %%mm1, 9*8(%1) \n\t" \
1168  "add $8, %1 \n\t" \
1169  "add %3, %0 \n\t" \
1170  "decl %2 \n\t" \
1171  "jnz 1b \n\t" \
1172  : "+r"(src), "+r"(temp_ptr), "+r"(count) \
1173  : "r"((x86_reg)srcStride) \
1174  : "memory" \
1175  ); \
1176  \
1177  temp_ptr = temp; \
1178  count = 2; \
1179  \
1180  /* FIXME reorder for speed */ \
1181  __asm__ volatile ( \
1182  /* "pxor %%mm7, %%mm7 \n\t" */ \
1183  "1: \n\t" \
1184  "movq (%0), %%mm0 \n\t" \
1185  "movq 8(%0), %%mm1 \n\t" \
1186  "movq 16(%0), %%mm2 \n\t" \
1187  "movq 24(%0), %%mm3 \n\t" \
1188  QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
1189  QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
1190  "add %4, %1 \n\t" \
1191  QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
1192  \
1193  QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
1194  "add %4, %1 \n\t" \
1195  QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
1196  \
1197  QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
1198  "add %4, %1 \n\t" \
1199  QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP) \
1200  QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
1201  \
1202  "add $72, %0 \n\t" \
1203  "add %6, %1 \n\t" \
1204  "decl %2 \n\t" \
1205  "jnz 1b \n\t" \
1206  \
1207  : "+r"(temp_ptr), "+r"(dst), "+g"(count) \
1208  : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
1209  /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER), \
1210  "g"(4 - 6 * (x86_reg)dstStride) \
1211  : "memory" \
1212  ); \
1213 } \
1214  \
1215 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
1216  int stride) \
1217 { \
1218  OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
1219 } \
1220  \
1221 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
1222  int stride) \
1223 { \
1224  uint64_t temp[8]; \
1225  uint8_t * const half = (uint8_t*)temp; \
1226  put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
1227  stride, 8); \
1228  OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
1229 } \
1230  \
1231 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
1232  int stride) \
1233 { \
1234  OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
1235  stride, 8); \
1236 } \
1237  \
1238 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
1239  int stride) \
1240 { \
1241  uint64_t temp[8]; \
1242  uint8_t * const half = (uint8_t*)temp; \
1243  put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
1244  stride, 8); \
1245  OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
1246  stride, 8); \
1247 } \
1248  \
1249 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
1250  int stride) \
1251 { \
1252  uint64_t temp[8]; \
1253  uint8_t * const half = (uint8_t*)temp; \
1254  put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
1255  OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
1256 } \
1257  \
1258 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
1259  int stride) \
1260 { \
1261  OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride); \
1262 } \
1263  \
1264 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
1265  int stride) \
1266 { \
1267  uint64_t temp[8]; \
1268  uint8_t * const half = (uint8_t*)temp; \
1269  put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
1270  OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride, \
1271  stride, 8); \
1272 } \
1273  \
1274 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
1275  int stride) \
1276 { \
1277  uint64_t half[8 + 9]; \
1278  uint8_t * const halfH = ((uint8_t*)half) + 64; \
1279  uint8_t * const halfHV = ((uint8_t*)half); \
1280  put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1281  stride, 9); \
1282  put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
1283  put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1284  OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
1285 } \
1286  \
1287 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1288  int stride) \
1289 { \
1290  uint64_t half[8 + 9]; \
1291  uint8_t * const halfH = ((uint8_t*)half) + 64; \
1292  uint8_t * const halfHV = ((uint8_t*)half); \
1293  put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1294  stride, 9); \
1295  put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1296  stride, 9); \
1297  put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1298  OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
1299 } \
1300  \
1301 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1302  int stride) \
1303 { \
1304  uint64_t half[8 + 9]; \
1305  uint8_t * const halfH = ((uint8_t*)half) + 64; \
1306  uint8_t * const halfHV = ((uint8_t*)half); \
1307  put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1308  stride, 9); \
1309  put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
1310  put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1311  OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1312 } \
1313  \
1314 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1315  int stride) \
1316 { \
1317  uint64_t half[8 + 9]; \
1318  uint8_t * const halfH = ((uint8_t*)half) + 64; \
1319  uint8_t * const halfHV = ((uint8_t*)half); \
1320  put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1321  stride, 9); \
1322  put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1323  stride, 9); \
1324  put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1325  OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1326 } \
1327  \
1328 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1329  int stride) \
1330 { \
1331  uint64_t half[8 + 9]; \
1332  uint8_t * const halfH = ((uint8_t*)half) + 64; \
1333  uint8_t * const halfHV = ((uint8_t*)half); \
1334  put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1335  stride, 9); \
1336  put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1337  OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
1338 } \
1339  \
1340 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1341  int stride) \
1342 { \
1343  uint64_t half[8 + 9]; \
1344  uint8_t * const halfH = ((uint8_t*)half) + 64; \
1345  uint8_t * const halfHV = ((uint8_t*)half); \
1346  put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1347  stride, 9); \
1348  put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
1349  OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
1350 } \
1351  \
1352 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1353  int stride) \
1354 { \
1355  uint64_t half[8 + 9]; \
1356  uint8_t * const halfH = ((uint8_t*)half); \
1357  put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1358  stride, 9); \
1359  put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
1360  OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
1361 } \
1362  \
1363 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1364  int stride) \
1365 { \
1366  uint64_t half[8 + 9]; \
1367  uint8_t * const halfH = ((uint8_t*)half); \
1368  put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1369  stride, 9); \
1370  put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1371  stride, 9); \
1372  OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
1373 } \
1374  \
1375 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1376  int stride) \
1377 { \
1378  uint64_t half[9]; \
1379  uint8_t * const halfH = ((uint8_t*)half); \
1380  put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1381  stride, 9); \
1382  OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
1383 } \
1384  \
1385 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
1386  int stride) \
1387 { \
1388  OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
1389 } \
1390  \
1391 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
1392  int stride) \
1393 { \
1394  uint64_t temp[32]; \
1395  uint8_t * const half = (uint8_t*)temp; \
1396  put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
1397  stride, 16); \
1398  OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
1399 } \
1400  \
1401 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
1402  int stride) \
1403 { \
1404  OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
1405  stride, stride, 16); \
1406 } \
1407  \
1408 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
1409  int stride) \
1410 { \
1411  uint64_t temp[32]; \
1412  uint8_t * const half = (uint8_t*)temp; \
1413  put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
1414  stride, 16); \
1415  OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
1416  stride, stride, 16); \
1417 } \
1418  \
1419 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
1420  int stride) \
1421 { \
1422  uint64_t temp[32]; \
1423  uint8_t * const half = (uint8_t*)temp; \
1424  put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1425  stride); \
1426  OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
1427 } \
1428  \
1429 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
1430  int stride) \
1431 { \
1432  OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
1433 } \
1434  \
1435 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
1436  int stride) \
1437 { \
1438  uint64_t temp[32]; \
1439  uint8_t * const half = (uint8_t*)temp; \
1440  put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1441  stride); \
1442  OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
1443  stride, stride, 16); \
1444 } \
1445  \
1446 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
1447  int stride) \
1448 { \
1449  uint64_t half[16 * 2 + 17 * 2]; \
1450  uint8_t * const halfH = ((uint8_t*)half) + 256; \
1451  uint8_t * const halfHV = ((uint8_t*)half); \
1452  put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1453  stride, 17); \
1454  put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1455  stride, 17); \
1456  put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1457  16, 16); \
1458  OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
1459 } \
1460  \
1461 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1462  int stride) \
1463 { \
1464  uint64_t half[16 * 2 + 17 * 2]; \
1465  uint8_t * const halfH = ((uint8_t*)half) + 256; \
1466  uint8_t * const halfHV = ((uint8_t*)half); \
1467  put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1468  stride, 17); \
1469  put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1470  stride, 17); \
1471  put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1472  16, 16); \
1473  OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
1474 } \
1475  \
1476 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1477  int stride) \
1478 { \
1479  uint64_t half[16 * 2 + 17 * 2]; \
1480  uint8_t * const halfH = ((uint8_t*)half) + 256; \
1481  uint8_t * const halfHV = ((uint8_t*)half); \
1482  put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1483  stride, 17); \
1484  put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1485  stride, 17); \
1486  put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1487  16, 16); \
1488  OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
1489  16, 16); \
1490 } \
1491  \
1492 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1493  int stride) \
1494 { \
1495  uint64_t half[16 * 2 + 17 * 2]; \
1496  uint8_t * const halfH = ((uint8_t*)half) + 256; \
1497  uint8_t * const halfHV = ((uint8_t*)half); \
1498  put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1499  stride, 17); \
1500  put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1501  stride, 17); \
1502  put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1503  16, 16); \
1504  OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
1505  16, 16); \
1506 } \
1507  \
1508 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1509  int stride) \
1510 { \
1511  uint64_t half[16 * 2 + 17 * 2]; \
1512  uint8_t * const halfH = ((uint8_t*)half) + 256; \
1513  uint8_t * const halfHV = ((uint8_t*)half); \
1514  put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1515  stride, 17); \
1516  put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1517  16, 16); \
1518  OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
1519 } \
1520  \
1521 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1522  int stride) \
1523 { \
1524  uint64_t half[16 * 2 + 17 * 2]; \
1525  uint8_t * const halfH = ((uint8_t*)half) + 256; \
1526  uint8_t * const halfHV = ((uint8_t*)half); \
1527  put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1528  stride, 17); \
1529  put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1530  16, 16); \
1531  OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
1532  16, 16); \
1533 } \
1534  \
1535 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1536  int stride) \
1537 { \
1538  uint64_t half[17 * 2]; \
1539  uint8_t * const halfH = ((uint8_t*)half); \
1540  put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1541  stride, 17); \
1542  put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1543  stride, 17); \
1544  OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
1545 } \
1546  \
1547 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1548  int stride) \
1549 { \
1550  uint64_t half[17 * 2]; \
1551  uint8_t * const halfH = ((uint8_t*)half); \
1552  put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1553  stride, 17); \
1554  put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1555  stride, 17); \
1556  OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
1557 } \
1558  \
1559 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1560  int stride) \
1561 { \
1562  uint64_t half[17 * 2]; \
1563  uint8_t * const halfH = ((uint8_t*)half); \
1564  put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1565  stride, 17); \
1566  OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
1567 }
1568 
1569 #define PUT_OP(a, b, temp, size) \
1570  "mov"#size" "#a", "#b" \n\t"
1571 
1572 #define AVG_MMXEXT_OP(a, b, temp, size) \
1573  "mov"#size" "#b", "#temp" \n\t" \
1574  "pavgb "#temp", "#a" \n\t" \
1575  "mov"#size" "#a", "#b" \n\t"
1576 
1577 QPEL_BASE(put_, ff_pw_16, _, PUT_OP)
1578 QPEL_BASE(avg_, ff_pw_16, _, AVG_MMXEXT_OP)
1579 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP)
1580 QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmxext)
1581 QPEL_OP(avg_, ff_pw_16, _, AVG_MMXEXT_OP, mmxext)
1582 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmxext)
1583 
1584 /***********************************/
1585 /* bilinear qpel: not compliant with any spec, only for -lavdopts fast */
1586 
1587 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL) \
1588 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
1589  uint8_t *src, \
1590  int stride) \
1591 { \
1592  OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE); \
1593 }
1594 
1595 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2) \
1596 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
1597  uint8_t *src, \
1598  int stride) \
1599 { \
1600  OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE, \
1601  S1, S2); \
1602 }
1603 
1604 #define QPEL_2TAP(OPNAME, SIZE, MMX) \
1605 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX) \
1606 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX) \
1607 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx) \
1608 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX = \
1609  OPNAME ## qpel ## SIZE ## _mc00_ ## MMX; \
1610 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX = \
1611  OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX; \
1612 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX = \
1613  OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX; \
1614 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, \
1615  uint8_t *src, \
1616  int stride) \
1617 { \
1618  OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE); \
1619 } \
1620 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, \
1621  uint8_t *src, \
1622  int stride) \
1623 { \
1624  OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride, \
1625  stride, SIZE); \
1626 } \
1627 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0) \
1628 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0) \
1629 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0) \
1630 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0) \
1631 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1) \
1632 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1) \
1633 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1) \
1634 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1) \
1635 
1636 QPEL_2TAP(put_, 16, mmxext)
1637 QPEL_2TAP(avg_, 16, mmxext)
1638 QPEL_2TAP(put_, 8, mmxext)
1639 QPEL_2TAP(avg_, 8, mmxext)
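/* A minimal scalar sketch (function name hypothetical) of the rounded
 * horizontal half-pel average that the mc20/mc21-style wrappers above fall
 * back to via put_pixels ## SIZE ## _x2_ ## MMX; the _y2_ variants do the
 * same thing with src[x + stride] instead of src[x + 1]. */
static void put_pixels_x2_c_sketch(uint8_t *dst, const uint8_t *src,
                                   int stride, int size)
{
    for (int y = 0; y < size; y++) {
        for (int x = 0; x < size; x++)
            dst[x] = (src[x] + src[x + 1] + 1) >> 1;  /* round-up average */
        dst += stride;
        src += stride;
    }
}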
1640 
1641 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1642 {
1643  put_pixels8_xy2_mmx(dst, src, stride, 8);
1644 }
1645 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1646 {
1647  put_pixels16_xy2_mmx(dst, src, stride, 16);
1648 }
1649 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1650 {
1651  avg_pixels8_xy2_mmx(dst, src, stride, 8);
1652 }
1653 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1654 {
1655  avg_pixels16_xy2_mmx(dst, src, stride, 16);
1656 }
1657 
1658 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
1659  ptrdiff_t linesize, int block_w, int block_h,
1660  int src_x, int src_y, int w, int h);
1661 
1662 static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
1663  int stride, int h, int ox, int oy,
1664  int dxx, int dxy, int dyx, int dyy,
1665  int shift, int r, int width, int height,
1666  emulated_edge_mc_func *emu_edge_fn)
1667 {
1668  const int w = 8;
1669  const int ix = ox >> (16 + shift);
1670  const int iy = oy >> (16 + shift);
1671  const int oxs = ox >> 4;
1672  const int oys = oy >> 4;
1673  const int dxxs = dxx >> 4;
1674  const int dxys = dxy >> 4;
1675  const int dyxs = dyx >> 4;
1676  const int dyys = dyy >> 4;
1677  const uint16_t r4[4] = { r, r, r, r };
1678  const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1679  const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1680  const uint64_t shift2 = 2 * shift;
1681 #define MAX_STRIDE 4096U
1682 #define MAX_H 8U
1683  uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
1684  int x, y;
1685 
1686  const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1687  const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1688  const int dxh = dxy * (h - 1);
1689  const int dyw = dyx * (w - 1);
1690  int need_emu = (unsigned)ix >= width - w ||
1691  (unsigned)iy >= height - h;
1692 
1693  if ( // non-constant fullpel offset (3% of blocks)
1694  ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1695  (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1696  // uses more than 16 bits of subpel mv (only at huge resolution)
1697  || (dxx | dxy | dyx | dyy) & 15
1698  || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
1699  // FIXME could still use mmx for some of the rows
1700  ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1701  shift, r, width, height);
1702  return;
1703  }
1704 
1705  src += ix + iy * stride;
1706  if (need_emu) {
1707  emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
1708  src = edge_buf;
1709  }
1710 
1711  __asm__ volatile (
1712  "movd %0, %%mm6 \n\t"
1713  "pxor %%mm7, %%mm7 \n\t"
1714  "punpcklwd %%mm6, %%mm6 \n\t"
1715  "punpcklwd %%mm6, %%mm6 \n\t"
1716  :: "r"(1<<shift)
1717  );
1718 
1719  for (x = 0; x < w; x += 4) {
1720  uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1721  oxs - dxys + dxxs * (x + 1),
1722  oxs - dxys + dxxs * (x + 2),
1723  oxs - dxys + dxxs * (x + 3) };
1724  uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1725  oys - dyys + dyxs * (x + 1),
1726  oys - dyys + dyxs * (x + 2),
1727  oys - dyys + dyxs * (x + 3) };
1728 
1729  for (y = 0; y < h; y++) {
1730  __asm__ volatile (
1731  "movq %0, %%mm4 \n\t"
1732  "movq %1, %%mm5 \n\t"
1733  "paddw %2, %%mm4 \n\t"
1734  "paddw %3, %%mm5 \n\t"
1735  "movq %%mm4, %0 \n\t"
1736  "movq %%mm5, %1 \n\t"
1737  "psrlw $12, %%mm4 \n\t"
1738  "psrlw $12, %%mm5 \n\t"
1739  : "+m"(*dx4), "+m"(*dy4)
1740  : "m"(*dxy4), "m"(*dyy4)
1741  );
1742 
1743  __asm__ volatile (
1744  "movq %%mm6, %%mm2 \n\t"
1745  "movq %%mm6, %%mm1 \n\t"
1746  "psubw %%mm4, %%mm2 \n\t"
1747  "psubw %%mm5, %%mm1 \n\t"
1748  "movq %%mm2, %%mm0 \n\t"
1749  "movq %%mm4, %%mm3 \n\t"
1750  "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1751  "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1752  "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1753  "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
1754 
1755  "movd %4, %%mm5 \n\t"
1756  "movd %3, %%mm4 \n\t"
1757  "punpcklbw %%mm7, %%mm5 \n\t"
1758  "punpcklbw %%mm7, %%mm4 \n\t"
1759  "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1760  "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1761 
1762  "movd %2, %%mm5 \n\t"
1763  "movd %1, %%mm4 \n\t"
1764  "punpcklbw %%mm7, %%mm5 \n\t"
1765  "punpcklbw %%mm7, %%mm4 \n\t"
1766  "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1767  "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
1768  "paddw %5, %%mm1 \n\t"
1769  "paddw %%mm3, %%mm2 \n\t"
1770  "paddw %%mm1, %%mm0 \n\t"
1771  "paddw %%mm2, %%mm0 \n\t"
1772 
1773  "psrlw %6, %%mm0 \n\t"
1774  "packuswb %%mm0, %%mm0 \n\t"
1775  "movd %%mm0, %0 \n\t"
1776 
1777  : "=m"(dst[x + y * stride])
1778  : "m"(src[0]), "m"(src[1]),
1779  "m"(src[stride]), "m"(src[stride + 1]),
1780  "m"(*r4), "m"(shift2)
1781  );
1782  src += stride;
1783  }
1784  src += 4 - h * stride;
1785  }
1786 }
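/* A scalar sketch of the per-pixel work done by the MMX loop above, under the
 * same fixed-point convention: s = 1 << shift, dx/dy are the per-pixel
 * fractional weights (the psrlw $12 results in the inner loop), r is the
 * rounding constant broadcast in r4, and the sum is scaled down by 2 * shift.
 * The helper name is illustrative only. */
static int gmc_bilinear_pixel_sketch(const uint8_t *src, int stride,
                                     int dx, int dy, int shift, int r)
{
    const int s = 1 << shift;
    int v = (s - dx) * (s - dy) * src[0]          /* top-left     */
          + dx       * (s - dy) * src[1]          /* top-right    */
          + (s - dx) * dy       * src[stride]     /* bottom-left  */
          + dx       * dy       * src[stride + 1] /* bottom-right */
          + r;
    v >>= 2 * shift;
    return v < 0 ? 0 : v > 255 ? 255 : v;         /* packuswb-style clamp */
}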
1787 
1788 #if CONFIG_VIDEODSP
1789 #if HAVE_YASM
1790 #if ARCH_X86_32
1791 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1792  int stride, int h, int ox, int oy,
1793  int dxx, int dxy, int dyx, int dyy,
1794  int shift, int r, int width, int height)
1795 {
1796  gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1797  width, height, &ff_emulated_edge_mc_8);
1798 }
1799 #endif
1800 static void gmc_sse(uint8_t *dst, uint8_t *src,
1801  int stride, int h, int ox, int oy,
1802  int dxx, int dxy, int dyx, int dyy,
1803  int shift, int r, int width, int height)
1804 {
1805  gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1806  width, height, &ff_emulated_edge_mc_8);
1807 }
1808 #else
1809 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1810  int stride, int h, int ox, int oy,
1811  int dxx, int dxy, int dyx, int dyy,
1812  int shift, int r, int width, int height)
1813 {
1814  gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1815  width, height, &ff_emulated_edge_mc_8);
1816 }
1817 #endif
1818 #endif
1819 
1820 #endif /* HAVE_INLINE_ASM */
1821 
1822 #include "h264_qpel.c"
1823 
1825  int stride, int h, int x, int y);
1827  int stride, int h, int x, int y);
1829  int stride, int h, int x, int y);
1830 
1831 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1832  int stride, int h, int x, int y);
1834  int stride, int h, int x, int y);
1836  int stride, int h, int x, int y);
1837 
1839  int stride, int h, int x, int y);
1841  int stride, int h, int x, int y);
1842 
1844  int stride, int h, int x, int y);
1846  int stride, int h, int x, int y);
1847 
1849  int stride, int h, int x, int y);
1851  int stride, int h, int x, int y);
1852 
1853 #define CHROMA_MC(OP, NUM, DEPTH, OPT) \
1854 void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
1855  (uint8_t *dst, uint8_t *src, \
1856  int stride, int h, int x, int y);
1857 
1858 CHROMA_MC(put, 2, 10, mmxext)
1859 CHROMA_MC(avg, 2, 10, mmxext)
1860 CHROMA_MC(put, 4, 10, mmxext)
1861 CHROMA_MC(avg, 4, 10, mmxext)
1862 CHROMA_MC(put, 8, 10, sse2)
1863 CHROMA_MC(avg, 8, 10, sse2)
1864 CHROMA_MC(put, 8, 10, avx)
1865 CHROMA_MC(avg, 8, 10, avx)
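/* As a concrete example of the token pasting above, CHROMA_MC(put, 4, 10, mmxext)
 * expands to the following prototype (spelled out here for clarity): */
void ff_put_h264_chroma_mc4_10_mmxext(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);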
1866 
1867 #if HAVE_INLINE_ASM
1868 
1869 /* CAVS-specific */
1870 void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1871 {
1872  put_pixels8_mmx(dst, src, stride, 8);
1873 }
1874 
1875 void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1876 {
1877  avg_pixels8_mmx(dst, src, stride, 8);
1878 }
1879 
1880 void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1881 {
1882  put_pixels16_mmx(dst, src, stride, 16);
1883 }
1884 
1885 void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1886 {
1887  avg_pixels16_mmx(dst, src, stride, 16);
1888 }
1889 
1890 /* VC-1-specific */
1891 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
1892  int stride, int rnd)
1893 {
1894  put_pixels8_mmx(dst, src, stride, 8);
1895 }
1896 
1897 void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
1898  int stride, int rnd)
1899 {
1900  avg_pixels8_mmxext(dst, src, stride, 8);
1901 }
1902 
1903 /* only used in VP3/5/6 */
1904 static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
1905 {
1906 // START_TIMER
1907  MOVQ_BFE(mm6);
1908  __asm__ volatile(
1909  "1: \n\t"
1910  "movq (%1), %%mm0 \n\t"
1911  "movq (%2), %%mm1 \n\t"
1912  "movq (%1,%4), %%mm2 \n\t"
1913  "movq (%2,%4), %%mm3 \n\t"
1914  PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
1915  "movq %%mm4, (%3) \n\t"
1916  "movq %%mm5, (%3,%4) \n\t"
1917 
1918  "movq (%1,%4,2), %%mm0 \n\t"
1919  "movq (%2,%4,2), %%mm1 \n\t"
1920  "movq (%1,%5), %%mm2 \n\t"
1921  "movq (%2,%5), %%mm3 \n\t"
1922  "lea (%1,%4,4), %1 \n\t"
1923  "lea (%2,%4,4), %2 \n\t"
1924  PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
1925  "movq %%mm4, (%3,%4,2) \n\t"
1926  "movq %%mm5, (%3,%5) \n\t"
1927  "lea (%3,%4,4), %3 \n\t"
1928  "subl $4, %0 \n\t"
1929  "jnz 1b \n\t"
1930  :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
1931  :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
1932  :"memory");
1933 // STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
1934 }
1935 static void put_vp_no_rnd_pixels16_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
1936 {
1937  put_vp_no_rnd_pixels8_l2_mmx(dst, a, b, stride, h);
1938  put_vp_no_rnd_pixels8_l2_mmx(dst+8, a+8, b+8, stride, h);
1939 }
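/* Scalar sketch (name illustrative) of what PAVGBP_MMX_NO_RND computes above:
 * a truncating byte average of the two sources, i.e. (a + b) >> 1 rather than
 * the rounded (a + b + 1) >> 1 used by the regular put/avg kernels. */
static void put_no_rnd_pixels_l2_c_sketch(uint8_t *dst, const uint8_t *a,
                                          const uint8_t *b, int stride,
                                          int w, int h)
{
    for (int y = 0; y < h; y++) {
        for (int x = 0; x < w; x++)
            dst[x] = (a[x] + b[x]) >> 1;
        dst += stride;
        a   += stride;
        b   += stride;
    }
}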
1940 
1941 #if CONFIG_DIRAC_DECODER
1942 #define DIRAC_PIXOP(OPNAME, EXT)\
1943 void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1944 {\
1945  OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
1946 }\
1947 void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1948 {\
1949  OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
1950 }\
1951 void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1952 {\
1953  OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
1954  OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
1955 }
1956 
1957 DIRAC_PIXOP(put, mmx)
1958 DIRAC_PIXOP(avg, mmx)
1959 DIRAC_PIXOP(avg, mmxext)
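/* For clarity, DIRAC_PIXOP(put, mmx) above expands (modulo whitespace) to
 * wrappers of this shape, forwarding only src[0] of the 5-pointer set:
 *
 *     void ff_put_dirac_pixels8_mmx(uint8_t *dst, const uint8_t *src[5],
 *                                   int stride, int h)
 *     {
 *         put_pixels8_mmx(dst, src[0], stride, h);
 *     }
 *
 * and likewise for the pixels16/pixels32 variants. */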
1960 
1961 #if HAVE_YASM
1962 void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1963 {
1964  ff_put_pixels16_sse2(dst, src[0], stride, h);
1965 }
1966 void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1967 {
1968  ff_avg_pixels16_sse2(dst, src[0], stride, h);
1969 }
1970 void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1971 {
1972  ff_put_pixels16_sse2(dst , src[0] , stride, h);
1973  ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
1974 }
1975 void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1976 {
1977  ff_avg_pixels16_sse2(dst , src[0] , stride, h);
1978  ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
1979 }
1980 #endif
1981 #endif
1982 
1983 /* XXX: These functions should be removed as soon as all IDCTs are
1984  * converted. */
1985 #if CONFIG_GPL
1986 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
1987  DCTELEM *block)
1988 {
1989  ff_mmx_idct(block);
1990  ff_put_pixels_clamped_mmx(block, dest, line_size);
1991 }
1992 
1993 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
1994  DCTELEM *block)
1995 {
1996  ff_mmx_idct(block);
1997  ff_add_pixels_clamped_mmx(block, dest, line_size);
1998 }
1999 
2000 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
2001  DCTELEM *block)
2002 {
2003  ff_mmxext_idct(block);
2004  ff_put_pixels_clamped_mmx(block, dest, line_size);
2005 }
2006 
2007 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
2008  DCTELEM *block)
2009 {
2010  ff_mmxext_idct(block);
2011  ff_add_pixels_clamped_mmx(block, dest, line_size);
2012 }
2013 #endif
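/* Scalar sketch of the "add pixels clamped" step used by the wrappers above
 * (the put variant is the same without the addition); the helper name is
 * illustrative and DCTELEM is assumed to be a 16-bit coefficient type. */
static void add_pixels_clamped_c_sketch(const DCTELEM *block, uint8_t *pixels,
                                        int line_size)
{
    for (int i = 0; i < 8; i++) {
        for (int j = 0; j < 8; j++) {
            int v = pixels[j] + block[j];
            pixels[j] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        block  += 8;
        pixels += line_size;
    }
}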
2014 
2015 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
2016 {
2017  int i;
2018  __asm__ volatile ("pxor %%mm7, %%mm7":);
2019  for (i = 0; i < blocksize; i += 2) {
2020  __asm__ volatile (
2021  "movq %0, %%mm0 \n\t"
2022  "movq %1, %%mm1 \n\t"
2023  "movq %%mm0, %%mm2 \n\t"
2024  "movq %%mm1, %%mm3 \n\t"
2025  "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
2026  "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
2027  "pslld $31, %%mm2 \n\t" // keep only the sign bit
2028  "pxor %%mm2, %%mm1 \n\t"
2029  "movq %%mm3, %%mm4 \n\t"
2030  "pand %%mm1, %%mm3 \n\t"
2031  "pandn %%mm1, %%mm4 \n\t"
2032  "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
2033  "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
2034  "movq %%mm3, %1 \n\t"
2035  "movq %%mm0, %0 \n\t"
2036  : "+m"(mag[i]), "+m"(ang[i])
2037  :: "memory"
2038  );
2039  }
2040  __asm__ volatile ("femms");
2041 }
2042 
2043 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2044 {
2045  int i;
2046 
2047  __asm__ volatile (
2048  "movaps %0, %%xmm5 \n\t"
2049  :: "m"(ff_pdw_80000000[0])
2050  );
2051  for (i = 0; i < blocksize; i += 4) {
2052  __asm__ volatile (
2053  "movaps %0, %%xmm0 \n\t"
2054  "movaps %1, %%xmm1 \n\t"
2055  "xorps %%xmm2, %%xmm2 \n\t"
2056  "xorps %%xmm3, %%xmm3 \n\t"
2057  "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
2058  "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
2059  "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
2060  "xorps %%xmm2, %%xmm1 \n\t"
2061  "movaps %%xmm3, %%xmm4 \n\t"
2062  "andps %%xmm1, %%xmm3 \n\t"
2063  "andnps %%xmm1, %%xmm4 \n\t"
2064  "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a < 0) & (a ^ sign(m)))
2065  "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a > 0) & (a ^ sign(m)))
2066  "movaps %%xmm3, %1 \n\t"
2067  "movaps %%xmm0, %0 \n\t"
2068  : "+m"(mag[i]), "+m"(ang[i])
2069  :: "memory"
2070  );
2071  }
2072 }
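/* Both kernels above vectorize the Vorbis square-polar decoupling; a branchy
 * scalar reference of the same transform (matching the generic C version, as
 * far as the asm comments indicate) is given here for readability: */
static void vorbis_inverse_coupling_c_sketch(float *mag, float *ang,
                                             int blocksize)
{
    for (int i = 0; i < blocksize; i++) {
        if (mag[i] > 0) {
            if (ang[i] > 0) {
                ang[i] = mag[i] - ang[i];
            } else {
                float t = ang[i];
                ang[i]  = mag[i];
                mag[i] += t;
            }
        } else {
            if (ang[i] > 0) {
                ang[i] += mag[i];
            } else {
                float t = ang[i];
                ang[i]  = mag[i];
                mag[i] -= t;
            }
        }
    }
}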
2073 
2074 #if HAVE_6REGS
2075 static void vector_fmul_window_3dnowext(float *dst, const float *src0,
2076  const float *src1, const float *win,
2077  int len)
2078 {
2079  x86_reg i = -len * 4;
2080  x86_reg j = len * 4 - 8;
2081  __asm__ volatile (
2082  "1: \n"
2083  "pswapd (%5, %1), %%mm1 \n"
2084  "movq (%5, %0), %%mm0 \n"
2085  "pswapd (%4, %1), %%mm5 \n"
2086  "movq (%3, %0), %%mm4 \n"
2087  "movq %%mm0, %%mm2 \n"
2088  "movq %%mm1, %%mm3 \n"
2089  "pfmul %%mm4, %%mm2 \n" // src0[len + i] * win[len + i]
2090  "pfmul %%mm5, %%mm3 \n" // src1[j] * win[len + j]
2091  "pfmul %%mm4, %%mm1 \n" // src0[len + i] * win[len + j]
2092  "pfmul %%mm5, %%mm0 \n" // src1[j] * win[len + i]
2093  "pfadd %%mm3, %%mm2 \n"
2094  "pfsub %%mm0, %%mm1 \n"
2095  "pswapd %%mm2, %%mm2 \n"
2096  "movq %%mm1, (%2, %0) \n"
2097  "movq %%mm2, (%2, %1) \n"
2098  "sub $8, %1 \n"
2099  "add $8, %0 \n"
2100  "jl 1b \n"
2101  "femms \n"
2102  : "+r"(i), "+r"(j)
2103  : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
2104  );
2105 }
2106 
2107 static void vector_fmul_window_sse(float *dst, const float *src0,
2108  const float *src1, const float *win, int len)
2109 {
2110  x86_reg i = -len * 4;
2111  x86_reg j = len * 4 - 16;
2112  __asm__ volatile (
2113  "1: \n"
2114  "movaps (%5, %1), %%xmm1 \n"
2115  "movaps (%5, %0), %%xmm0 \n"
2116  "movaps (%4, %1), %%xmm5 \n"
2117  "movaps (%3, %0), %%xmm4 \n"
2118  "shufps $0x1b, %%xmm1, %%xmm1 \n"
2119  "shufps $0x1b, %%xmm5, %%xmm5 \n"
2120  "movaps %%xmm0, %%xmm2 \n"
2121  "movaps %%xmm1, %%xmm3 \n"
2122  "mulps %%xmm4, %%xmm2 \n" // src0[len + i] * win[len + i]
2123  "mulps %%xmm5, %%xmm3 \n" // src1[j] * win[len + j]
2124  "mulps %%xmm4, %%xmm1 \n" // src0[len + i] * win[len + j]
2125  "mulps %%xmm5, %%xmm0 \n" // src1[j] * win[len + i]
2126  "addps %%xmm3, %%xmm2 \n"
2127  "subps %%xmm0, %%xmm1 \n"
2128  "shufps $0x1b, %%xmm2, %%xmm2 \n"
2129  "movaps %%xmm1, (%2, %0) \n"
2130  "movaps %%xmm2, (%2, %1) \n"
2131  "sub $16, %1 \n"
2132  "add $16, %0 \n"
2133  "jl 1b \n"
2134  : "+r"(i), "+r"(j)
2135  : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
2136  );
2137 }
2138 #endif /* HAVE_6REGS */
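/* Scalar reference for the windowed overlap-add computed by both versions
 * above (mirroring the generic C fallback); the SIMD code walks i upward and
 * j downward so each iteration produces one low-half and one high-half output: */
static void vector_fmul_window_c_sketch(float *dst, const float *src0,
                                        const float *src1, const float *win,
                                        int len)
{
    dst  += len;
    win  += len;
    src0 += len;
    for (int i = -len, j = len - 1; i < 0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0 * wj - s1 * wi;
        dst[j] = s0 * wi + s1 * wj;
    }
}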
2139 
2140 static void vector_clipf_sse(float *dst, const float *src,
2141  float min, float max, int len)
2142 {
2143  x86_reg i = (len - 16) * 4;
2144  __asm__ volatile (
2145  "movss %3, %%xmm4 \n\t"
2146  "movss %4, %%xmm5 \n\t"
2147  "shufps $0, %%xmm4, %%xmm4 \n\t"
2148  "shufps $0, %%xmm5, %%xmm5 \n\t"
2149  "1: \n\t"
2150  "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
2151  "movaps 16(%2, %0), %%xmm1 \n\t"
2152  "movaps 32(%2, %0), %%xmm2 \n\t"
2153  "movaps 48(%2, %0), %%xmm3 \n\t"
2154  "maxps %%xmm4, %%xmm0 \n\t"
2155  "maxps %%xmm4, %%xmm1 \n\t"
2156  "maxps %%xmm4, %%xmm2 \n\t"
2157  "maxps %%xmm4, %%xmm3 \n\t"
2158  "minps %%xmm5, %%xmm0 \n\t"
2159  "minps %%xmm5, %%xmm1 \n\t"
2160  "minps %%xmm5, %%xmm2 \n\t"
2161  "minps %%xmm5, %%xmm3 \n\t"
2162  "movaps %%xmm0, (%1, %0) \n\t"
2163  "movaps %%xmm1, 16(%1, %0) \n\t"
2164  "movaps %%xmm2, 32(%1, %0) \n\t"
2165  "movaps %%xmm3, 48(%1, %0) \n\t"
2166  "sub $64, %0 \n\t"
2167  "jge 1b \n\t"
2168  : "+&r"(i)
2169  : "r"(dst), "r"(src), "m"(min), "m"(max)
2170  : "memory"
2171  );
2172 }
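/* Scalar equivalent of vector_clipf_sse above (name illustrative); note the
 * SSE loop handles 16 floats per iteration, so it assumes len is a multiple
 * of 16 and the buffers are 16-byte aligned. */
static void vector_clipf_c_sketch(float *dst, const float *src,
                                  float min, float max, int len)
{
    for (int i = 0; i < len; i++) {
        float v = src[i];
        dst[i] = v < min ? min : v > max ? max : v;
    }
}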
2173 
2174 #endif /* HAVE_INLINE_ASM */
2175 
2176 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
2177  int order);
2178 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
2179  int order);
2180 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
2181  const int16_t *v3,
2182  int order, int mul);
2183 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
2184  const int16_t *v3,
2185  int order, int mul);
2186 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
2187  const int16_t *v3,
2188  int order, int mul);
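/* The prototypes above compute, per the generic C versions, a plain int16
 * dot product and a fused "dot product plus multiply-accumulate back into
 * v1"; a scalar sketch of both follows (names illustrative): */
static int32_t scalarproduct_int16_c_sketch(const int16_t *v1,
                                            const int16_t *v2, int order)
{
    int32_t res = 0;
    while (order--)
        res += *v1++ * *v2++;
    return res;
}

static int32_t scalarproduct_and_madd_int16_c_sketch(int16_t *v1,
                                                     const int16_t *v2,
                                                     const int16_t *v3,
                                                     int order, int mul)
{
    int32_t res = 0;
    while (order--) {
        res   += *v1 * *v2++;   /* accumulate the dot product */
        *v1++ += mul * *v3++;   /* and update v1 in place     */
    }
    return res;
}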
2189 
2190 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
2191  const int16_t *window, unsigned int len);
2192 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
2193  const int16_t *window, unsigned int len);
2194 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
2195  const int16_t *window, unsigned int len);
2196 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
2197  const int16_t *window, unsigned int len);
2198 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
2199  const int16_t *window, unsigned int len);
2200 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
2201  const int16_t *window, unsigned int len);
2202 
2203 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
2204 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
2205 
2205 
2206 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
2207  const uint8_t *diff, int w,
2208  int *left, int *left_top);
2209 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
2210  int w, int left);
2211 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
2212  int w, int left);
2213 
2214 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
2215 
2216 void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
2217  const float *src1, int len);
2218 void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
2219  const float *src1, int len);
2220 
2221 void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2222  const float *src2, int len);
2223 void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
2224  const float *src2, int len);
2225 
2226 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
2227  int32_t min, int32_t max, unsigned int len);
2228 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
2229  int32_t min, int32_t max, unsigned int len);
2230 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
2231  int32_t min, int32_t max, unsigned int len);
2232 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
2233  int32_t min, int32_t max, unsigned int len);
2234 
2235 extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
2236  const float *src1, int len);
2237 extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
2238  const float *src1, int len);
2239 
2240 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
2241  do { \
2242  c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
2243  c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
2244  c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
2245  c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
2246  c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
2247  c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
2248  c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
2249  c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
2250  c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
2251  c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
2252  c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
2253  c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
2254  c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
2255  c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
2256  c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
2257  c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
2258  } while (0)
2259 
2260 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2261  do { \
2262  c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
2263  c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
2264  c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
2265  c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
2266  } while (0)
2267 
2268 #define H264_QPEL_FUNCS(x, y, CPU) \
2269  do { \
2270  c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
2271  c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
2272  c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
2273  c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
2274  } while (0)
2275 
2276 #define H264_QPEL_FUNCS_10(x, y, CPU) \
2277  do { \
2278  c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
2279  c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
2280  c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
2281  c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
2282  } while (0)
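/* As a concrete example of the assignment macros above,
 * SET_HPEL_FUNCS(put, 0, 16, mmx) expands to:
 *
 *     c->put_pixels_tab[0][0] = put_pixels16_mmx;
 *     c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
 *     c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
 *     c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
 *
 * and H264_QPEL_FUNCS(2, 3, sse2) fills slot [2 + 3 * 4] = 14 of the put/avg
 * H.264 qpel tables with {put,avg}_h264_qpel{16,8}_mc23_sse2. */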
2283 
2284 static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2285 {
2286  const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2287 
2288 #if HAVE_INLINE_ASM
2292 
2293  if (!high_bit_depth) {
2294  c->clear_block = clear_block_mmx;
2295  c->clear_blocks = clear_blocks_mmx;
2296  c->draw_edges = draw_edges_mmx;
2297 
2298  SET_HPEL_FUNCS(put, 0, 16, mmx);
2299  SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2300  SET_HPEL_FUNCS(avg, 0, 16, mmx);
2301  SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2302  SET_HPEL_FUNCS(put, 1, 8, mmx);
2303  SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
2304  SET_HPEL_FUNCS(avg, 1, 8, mmx);
2305  SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2306  }
2307 
2308 #if ARCH_X86_32 || !HAVE_YASM
2309  c->gmc = gmc_mmx;
2310 #endif
2311 
2312  c->add_bytes = add_bytes_mmx;
2313 
2314  c->put_no_rnd_pixels_l2[0]= put_vp_no_rnd_pixels16_l2_mmx;
2315  c->put_no_rnd_pixels_l2[1]= put_vp_no_rnd_pixels8_l2_mmx;
2316 
2317  if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2318  c->h263_v_loop_filter = h263_v_loop_filter_mmx;
2319  c->h263_h_loop_filter = h263_h_loop_filter_mmx;
2320  }
2321 #endif /* HAVE_INLINE_ASM */
2322 
2323 #if HAVE_YASM
2324  if (!high_bit_depth && CONFIG_H264CHROMA) {
2327  }
2328 
2330 #endif
2331 
2332 }
2333 
2334 static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
2335  int mm_flags)
2336 {
2337  const int bit_depth = avctx->bits_per_raw_sample;
2338  const int high_bit_depth = bit_depth > 8;
2339 
2340 #if HAVE_INLINE_ASM
2341  SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
2342  SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
2343  SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmxext, );
2344  SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmxext, );
2345 
2346  SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
2347  SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
2348  SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmxext, );
2349  SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmxext, );
2350  SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
2351  SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
2352 
2353  if (!high_bit_depth) {
2354  c->put_pixels_tab[0][1] = put_pixels16_x2_mmxext;
2355  c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
2356 
2357  c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
2358  c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
2359  c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
2360 
2361  c->put_pixels_tab[1][1] = put_pixels8_x2_mmxext;
2362  c->put_pixels_tab[1][2] = put_pixels8_y2_mmxext;
2363 
2364  c->avg_pixels_tab[1][0] = avg_pixels8_mmxext;
2365  c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmxext;
2366  c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmxext;
2367  }
2368 
2369  if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
2370  if (!high_bit_depth) {
2371  c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
2372  c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
2373  c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmxext;
2374  c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmxext;
2375 
2376  c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
2377  c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmxext;
2378  }
2379  }
2380 
2381  if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
2382  avctx->codec_id == AV_CODEC_ID_THEORA)) {
2383  c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmxext;
2384  c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmxext;
2385  }
2386 #endif /* HAVE_INLINE_ASM */
2387 
2388 #if HAVE_MMXEXT_EXTERNAL
2389  if (CONFIG_H264QPEL) {
2390  if (!high_bit_depth) {
2391  SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
2392  SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, );
2393  SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, );
2394  SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
2395  SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, );
2396  SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
2397  } else if (bit_depth == 10) {
2398 #if !ARCH_X86_64
2399  SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
2400  SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
2401  SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
2402  SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
2403 #endif
2404  SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
2405  SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
2406  }
2407  }
2408 
2409  if (!high_bit_depth && CONFIG_H264CHROMA) {
2414  }
2415  if (bit_depth == 10 && CONFIG_H264CHROMA) {
2416  c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
2417  c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
2418  c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
2419  c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
2420  }
2421 
2422  /* slower than cmov version on AMD */
2423  if (!(mm_flags & AV_CPU_FLAG_3DNOW))
2425 
2428 
2429  if (avctx->flags & CODEC_FLAG_BITEXACT) {
2431  } else {
2433  }
2434 #endif /* HAVE_MMXEXT_EXTERNAL */
2435 }
2436 
2437 static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
2438  int mm_flags)
2439 {
2440  const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2441 
2442 #if HAVE_INLINE_ASM
2443  if (!high_bit_depth) {
2444  c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2445  c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2446 
2447  c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2448  c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2449  c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2450 
2451  c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2452  c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2453 
2454  c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2455  c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2456  c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
2457 
2458  if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
2459  c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2460  c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2461  c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2462  c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2463 
2464  c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2465  c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
2466  }
2467  }
2468 
2469  if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
2470  avctx->codec_id == AV_CODEC_ID_THEORA)) {
2471  c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2472  c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2473  }
2474 
2475  c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2476 #endif /* HAVE_INLINE_ASM */
2477 
2478 #if HAVE_YASM
2479  if (!high_bit_depth && CONFIG_H264CHROMA) {
2482  }
2483 #endif /* HAVE_YASM */
2484 }
2485 
2486 static void dsputil_init_3dnowext(DSPContext *c, AVCodecContext *avctx,
2487  int mm_flags)
2488 {
2489 #if HAVE_AMD3DNOWEXT_INLINE && HAVE_6REGS
2490  c->vector_fmul_window = vector_fmul_window_3dnowext;
2491 #endif
2492 }
2493 
2494 static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2495 {
2496  const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2497 
2498 #if HAVE_INLINE_ASM
2499  if (!high_bit_depth) {
2500  if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
2501  /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2502  c->clear_block = clear_block_sse;
2503  c->clear_blocks = clear_blocks_sse;
2504  }
2505  }
2506 
2507  c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2508 
2509 #if HAVE_6REGS
2510  c->vector_fmul_window = vector_fmul_window_sse;
2511 #endif
2512 
2513  c->vector_clipf = vector_clipf_sse;
2514 #endif /* HAVE_INLINE_ASM */
2515 
2516 #if HAVE_YASM
2519 
2522 
2523 #if HAVE_INLINE_ASM && CONFIG_VIDEODSP
2524  c->gmc = gmc_sse;
2525 #endif
2526 #endif /* HAVE_YASM */
2527 }
2528 
2529 static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
2530  int mm_flags)
2531 {
2532  const int bit_depth = avctx->bits_per_raw_sample;
2533  const int high_bit_depth = bit_depth > 8;
2534 
2535 #if HAVE_SSE2_INLINE
2536  if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
2539  c->idct = ff_idct_xvid_sse2;
2541  }
2542 #endif /* HAVE_SSE2_INLINE */
2543 
2544 #if HAVE_SSE2_EXTERNAL
2545  if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2546  // these functions are slower than mmx on AMD, but faster on Intel
2547  if (!high_bit_depth) {
2548  c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
2549  c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
2550  c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
2551  if (CONFIG_H264QPEL)
2552  H264_QPEL_FUNCS(0, 0, sse2);
2553  }
2554  }
2555 
2556  if (!high_bit_depth && CONFIG_H264QPEL) {
2557  H264_QPEL_FUNCS(0, 1, sse2);
2558  H264_QPEL_FUNCS(0, 2, sse2);
2559  H264_QPEL_FUNCS(0, 3, sse2);
2560  H264_QPEL_FUNCS(1, 1, sse2);
2561  H264_QPEL_FUNCS(1, 2, sse2);
2562  H264_QPEL_FUNCS(1, 3, sse2);
2563  H264_QPEL_FUNCS(2, 1, sse2);
2564  H264_QPEL_FUNCS(2, 2, sse2);
2565  H264_QPEL_FUNCS(2, 3, sse2);
2566  H264_QPEL_FUNCS(3, 1, sse2);
2567  H264_QPEL_FUNCS(3, 2, sse2);
2568  H264_QPEL_FUNCS(3, 3, sse2);
2569  }
2570 
2571  if (bit_depth == 10) {
2572  if (CONFIG_H264QPEL) {
2573  SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
2574  SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
2575  SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
2576  SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
2577  H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
2578  H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
2579  H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
2580  }
2581  if (CONFIG_H264CHROMA) {
2582  c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
2583  c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
2584  }
2585  }
2586 
2589  if (mm_flags & AV_CPU_FLAG_ATOM) {
2591  } else {
2593  }
2594  if (avctx->flags & CODEC_FLAG_BITEXACT) {
2596  } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2598  }
2600 #endif /* HAVE_SSE2_EXTERNAL */
2601 }
2602 
2603 static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
2604  int mm_flags)
2605 {
2606 #if HAVE_SSSE3_EXTERNAL
2607  const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2608  const int bit_depth = avctx->bits_per_raw_sample;
2609 
2610  if (!high_bit_depth && CONFIG_H264QPEL) {
2611  H264_QPEL_FUNCS(1, 0, ssse3);
2612  H264_QPEL_FUNCS(1, 1, ssse3);
2613  H264_QPEL_FUNCS(1, 2, ssse3);
2614  H264_QPEL_FUNCS(1, 3, ssse3);
2615  H264_QPEL_FUNCS(2, 0, ssse3);
2616  H264_QPEL_FUNCS(2, 1, ssse3);
2617  H264_QPEL_FUNCS(2, 2, ssse3);
2618  H264_QPEL_FUNCS(2, 3, ssse3);
2619  H264_QPEL_FUNCS(3, 0, ssse3);
2620  H264_QPEL_FUNCS(3, 1, ssse3);
2621  H264_QPEL_FUNCS(3, 2, ssse3);
2622  H264_QPEL_FUNCS(3, 3, ssse3);
2623  }
2624  if (bit_depth == 10 && CONFIG_H264QPEL) {
2625  H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
2626  H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
2627  H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
2628  }
2629  if (!high_bit_depth && CONFIG_H264CHROMA) {
2634  }
2636  if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
2638 
2639  if (mm_flags & AV_CPU_FLAG_ATOM)
2641  else
2643  if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
2646 #endif /* HAVE_SSSE3_EXTERNAL */
2647 }
2648 
2649 static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
2650  int mm_flags)
2651 {
2652 #if HAVE_SSE4_EXTERNAL
2654 #endif /* HAVE_SSE4_EXTERNAL */
2655 }
2656 
2657 static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2658 {
2659 #if HAVE_AVX_EXTERNAL
2660  const int bit_depth = avctx->bits_per_raw_sample;
2661 
2662  if (bit_depth == 10) {
2663  // AVX implies !cache64.
2664  // TODO: Port cache(32|64) detection from x264.
2665  if (CONFIG_H264QPEL) {
2666  H264_QPEL_FUNCS_10(1, 0, sse2);
2667  H264_QPEL_FUNCS_10(2, 0, sse2);
2668  H264_QPEL_FUNCS_10(3, 0, sse2);
2669  }
2670 
2671  if (CONFIG_H264CHROMA) {
2672  c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
2673  c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
2674  }
2675  }
2679 #endif /* HAVE_AVX_EXTERNAL */
2680 }
2681 
2682 void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
2683 {
2684  int mm_flags = av_get_cpu_flags();
2685 
2686 #if HAVE_7REGS && HAVE_INLINE_ASM
2687  if (mm_flags & AV_CPU_FLAG_CMOV)
2688  c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
2689 #endif
2690 
2691  if (mm_flags & AV_CPU_FLAG_MMX) {
2692 #if HAVE_INLINE_ASM
2693  const int idct_algo = avctx->idct_algo;
2694 
2695  if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
2696  if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
2699  c->idct = ff_simple_idct_mmx;
2701 #if CONFIG_GPL
2702  } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
2703  if (mm_flags & AV_CPU_FLAG_MMX2) {
2704  c->idct_put = ff_libmpeg2mmx2_idct_put;
2705  c->idct_add = ff_libmpeg2mmx2_idct_add;
2706  c->idct = ff_mmxext_idct;
2707  } else {
2708  c->idct_put = ff_libmpeg2mmx_idct_put;
2709  c->idct_add = ff_libmpeg2mmx_idct_add;
2710  c->idct = ff_mmx_idct;
2711  }
2713 #endif
2714  } else if (idct_algo == FF_IDCT_XVIDMMX) {
2715  if (mm_flags & AV_CPU_FLAG_SSE2) {
2718  c->idct = ff_idct_xvid_sse2;
2720  } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
2724  } else {
2727  c->idct = ff_idct_xvid_mmx;
2728  }
2729  }
2730  }
2731 #endif /* HAVE_INLINE_ASM */
2732 
2733  dsputil_init_mmx(c, avctx, mm_flags);
2734  }
2735 
2736  if (mm_flags & AV_CPU_FLAG_MMXEXT)
2737  dsputil_init_mmxext(c, avctx, mm_flags);
2738 
2739  if (mm_flags & AV_CPU_FLAG_3DNOW)
2740  dsputil_init_3dnow(c, avctx, mm_flags);
2741 
2742  if (mm_flags & AV_CPU_FLAG_3DNOWEXT)
2743  dsputil_init_3dnowext(c, avctx, mm_flags);
2744 
2745  if (mm_flags & AV_CPU_FLAG_SSE)
2746  dsputil_init_sse(c, avctx, mm_flags);
2747 
2748  if (mm_flags & AV_CPU_FLAG_SSE2)
2749  dsputil_init_sse2(c, avctx, mm_flags);
2750 
2751  if (mm_flags & AV_CPU_FLAG_SSSE3)
2752  dsputil_init_ssse3(c, avctx, mm_flags);
2753 
2754  if (mm_flags & AV_CPU_FLAG_SSE4)
2755  dsputil_init_sse4(c, avctx, mm_flags);
2756 
2757  if (mm_flags & AV_CPU_FLAG_AVX)
2758  dsputil_init_avx(c, avctx, mm_flags);
2759 
2760  if (CONFIG_ENCODERS)
2761  ff_dsputilenc_init_mmx(c, avctx);
2762 }