FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
me_cmp_init.c
Go to the documentation of this file.
1 /*
2  * SIMD-optimized motion estimation
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24 
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavutil/x86/cpu.h"
29 #include "libavcodec/me_cmp.h"
30 #include "libavcodec/mpegvideo.h"
31 
/* Prototypes for the ff_* comparison routines that this file references.
 * None of them is defined in this listing. Every block comparator shares the
 * signature (context, pix1, pix2, stride, h). */
32 int ff_sum_abs_dctelem_mmx(int16_t *block);
33 int ff_sum_abs_dctelem_mmxext(int16_t *block);
34 int ff_sum_abs_dctelem_sse2(int16_t *block);
35 int ff_sum_abs_dctelem_ssse3(int16_t *block);
36 int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
37  ptrdiff_t stride, int h);
38 int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
39  ptrdiff_t stride, int h);
40 int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
41  ptrdiff_t stride, int h);
42 int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
43 int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
44 int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
45  ptrdiff_t stride, int h);
46 int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
47  ptrdiff_t stride, int h);
48 int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
49  ptrdiff_t stride, int h);
50 int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
51  ptrdiff_t stride, int h);
52 int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
53  ptrdiff_t stride, int h);
54 int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
55  ptrdiff_t stride, int h);
56 int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
57  ptrdiff_t stride, int h);
58 int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
59  ptrdiff_t stride, int h);
60 int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
61  ptrdiff_t stride, int h);
62 int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
63  ptrdiff_t stride, int h);
64 int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
65  ptrdiff_t stride, int h);
66 int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
67  ptrdiff_t stride, int h);
68 int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
69  ptrdiff_t stride, int h);
70 int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
71  ptrdiff_t stride, int h);
72 int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
73  ptrdiff_t stride, int h);
74 int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
75  ptrdiff_t stride, int h);
76 int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
77  ptrdiff_t stride, int h);
78 int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
79  ptrdiff_t stride, int h);
80 
/* Declares the pair ff_hadamard8_diff_<cpu> / ff_hadamard8_diff16_<cpu> for one
 * instruction-set suffix. The init function below installs the mmx, mmxext,
 * sse2 and ssse3 variants, so all four suffixes are instantiated here. */
81 #define hadamard_func(cpu) \
82  int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
83  uint8_t *src2, ptrdiff_t stride, int h); \
84  int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
85  uint8_t *src2, ptrdiff_t stride, int h);
86 
87 hadamard_func(mmx)
88 hadamard_func(mmxext)
89 hadamard_func(sse2)
90 hadamard_func(ssse3)
91 
92 #if HAVE_YASM
/**
 * 16-wide "nsse" comparator: a base score plus a weighted penalty for the
 * difference in ff_hf_noise*() measure between the two blocks.
 *
 * @param c      encoder context; may be NULL, in which case ff_sse16_mmx()
 *               supplies the base score and a fixed weight of 8 is used
 * @param pix1   first block
 * @param pix2   second block
 * @param stride passed through unchanged to the helpers
 * @param h      passed through unchanged to the helpers
 * @return score1 + |score2| * weight
 */
93 static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
94  ptrdiff_t stride, int h)
95 {
96  int score1, score2;
97 
/* With a context, use whatever comparator is installed in mecc.sse[0]. */
98  if (c)
99  score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
100  else
101  score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
/* Noise measure of pix1 minus that of pix2; each side adds the 16-wide call at
 * the block start to the 8-wide call at byte offset 8. */
102  score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
103  - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);
104 
105  if (c)
106  return score1 + FFABS(score2) * c->avctx->nsse_weight;
107  else
108  return score1 + FFABS(score2) * 8;
109 }
110 
/**
 * 8-wide "nsse" comparator: ff_sse8_mmx() score plus a weighted penalty for
 * the difference in ff_hf_noise8_mmx() measure between the two blocks.
 *
 * Unlike nsse16_mmx(), the base score always comes from ff_sse8_mmx() directly,
 * whether or not a context is supplied.
 *
 * @param c may be NULL; then a fixed weight of 8 replaces
 *          c->avctx->nsse_weight
 * @return score1 + |score2| * weight
 */
111 static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
112  ptrdiff_t stride, int h)
113 {
114  int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
115  int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
116  ff_hf_noise8_mmx(pix2, stride, h);
117 
118  if (c)
119  return score1 + FFABS(score2) * c->avctx->nsse_weight;
120  else
121  return score1 + FFABS(score2) * 8;
122 }
123 
124 #endif /* HAVE_YASM */
125 
126 #if HAVE_INLINE_ASM
127 
/**
 * Sum of absolute differences between vertically adjacent rows of a single
 * 16-byte-wide block (MMX inline asm).
 *
 * Row 0 is loaded before the loop; each SUM() step loads the next row, takes
 * the per-byte |current - previous| and adds it, widened to 16-bit lanes, into
 * mm6. The counter starts at h, is decremented by 2 and tested for zero, so h
 * is expected to be even and non-zero.
 *
 * @param v      unused
 * @param pix    block start; asserted to be 8-byte aligned
 * @param dummy  unused
 * @param stride row pitch in bytes; asserted to be a multiple of 8
 * @param h      number of rows
 * @return low 16 bits of the accumulated sum
 */
128 static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
129  ptrdiff_t stride, int h)
130 {
131  int tmp;
132 
/* Test alignment through uintptr_t: casting a pointer to int is a narrowing,
 * implementation-defined conversion on 64-bit targets. */
133  av_assert2((((uintptr_t) pix) & 7) == 0);
134  av_assert2((stride & 7) == 0);
135 
/* SUM(in0, in1, out0, out1): in0/in1 hold the previous row, out0/out1 receive
 * the row just loaded. mm2/mm3 are scratch, mm7 is zero, mm6 accumulates. */
136 #define SUM(in0, in1, out0, out1) \
137  "movq (%0), %%mm2\n" \
138  "movq 8(%0), %%mm3\n" \
139  "add %2,%0\n" \
140  "movq %%mm2, " #out0 "\n" \
141  "movq %%mm3, " #out1 "\n" \
142  "psubusb " #in0 ", %%mm2\n" \
143  "psubusb " #in1 ", %%mm3\n" \
144  "psubusb " #out0 ", " #in0 "\n" \
145  "psubusb " #out1 ", " #in1 "\n" \
146  "por %%mm2, " #in0 "\n" \
147  "por %%mm3, " #in1 "\n" \
148  "movq " #in0 ", %%mm2\n" \
149  "movq " #in1 ", %%mm3\n" \
150  "punpcklbw %%mm7, " #in0 "\n" \
151  "punpcklbw %%mm7, " #in1 "\n" \
152  "punpckhbw %%mm7, %%mm2\n" \
153  "punpckhbw %%mm7, %%mm3\n" \
154  "paddw " #in1 ", " #in0 "\n" \
155  "paddw %%mm3, %%mm2\n" \
156  "paddw %%mm2, " #in0 "\n" \
157  "paddw " #in0 ", %%mm6\n"
158 
159 
160  __asm__ volatile (
161  "movl %3, %%ecx\n"
162  "pxor %%mm6, %%mm6\n"
163  "pxor %%mm7, %%mm7\n"
164  "movq (%0), %%mm0\n"
165  "movq 8(%0), %%mm1\n"
166  "add %2, %0\n"
167  "jmp 2f\n"
168  "1:\n"
169 
170  SUM(%%mm4, %%mm5, %%mm0, %%mm1)
171  "2:\n"
172  SUM(%%mm0, %%mm1, %%mm4, %%mm5)
173 
174  "subl $2, %%ecx\n"
175  "jnz 1b\n"
176 
177  "movq %%mm6, %%mm0\n"
178  "psrlq $32, %%mm6\n"
179  "paddw %%mm6, %%mm0\n"
180  "movq %%mm0, %%mm6\n"
181  "psrlq $16, %%mm0\n"
182  "paddw %%mm6, %%mm0\n"
183  "movd %%mm0, %1\n"
184  : "+r" (pix), "=r" (tmp)
185  : "r" (stride), "m" (h)
/* "memory": the asm reads pixel data through pix, which is not otherwise
 * visible to the compiler as a memory operand. */
186  : "%ecx", "memory");
187 
188  return tmp & 0xFFFF;
189 }
190 #undef SUM
191 
/**
 * Vertical SAD of the difference signal (pix1 - pix2) over a 16-byte-wide
 * block (MMX inline asm).
 *
 * Each row of pix1 - pix2 is formed with a wrapping byte subtract and XORed
 * with mm7, which holds 0x80 in every byte. Consecutive biased rows are then
 * compared exactly as in vsad_intra16_mmx(). mm7 is also the unpack operand,
 * so the top bit of the accumulated 16-bit lanes carries no information and
 * is masked off in the result. The init code installs this function only when
 * AV_CODEC_FLAG_BITEXACT is not set.
 *
 * The counter starts at h, is decremented by 2 and tested for zero, so h is
 * expected to be even and non-zero.
 *
 * @param v      unused
 * @param pix1   first block; asserted to be 8-byte aligned
 * @param pix2   second block; asserted to be 8-byte aligned
 * @param stride row pitch in bytes; asserted to be a multiple of 8
 * @param h      number of rows
 * @return low 15 bits of the accumulated sum
 */
192 static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
193  ptrdiff_t stride, int h)
194 {
195  int tmp;
196 
/* Test alignment through uintptr_t: casting a pointer to int is a narrowing,
 * implementation-defined conversion on 64-bit targets. */
197  av_assert2((((uintptr_t) pix1) & 7) == 0);
198  av_assert2((((uintptr_t) pix2) & 7) == 0);
199  av_assert2((stride & 7) == 0);
200 
/* SUM(in0, in1, out0, out1): in0/in1 hold the previous biased difference row,
 * out0/out1 receive the new one. mm2/mm3 are scratch, mm6 accumulates. */
201 #define SUM(in0, in1, out0, out1) \
202  "movq (%0), %%mm2\n" \
203  "movq (%1), " #out0 "\n" \
204  "movq 8(%0), %%mm3\n" \
205  "movq 8(%1), " #out1 "\n" \
206  "add %3, %0\n" \
207  "add %3, %1\n" \
208  "psubb " #out0 ", %%mm2\n" \
209  "psubb " #out1 ", %%mm3\n" \
210  "pxor %%mm7, %%mm2\n" \
211  "pxor %%mm7, %%mm3\n" \
212  "movq %%mm2, " #out0 "\n" \
213  "movq %%mm3, " #out1 "\n" \
214  "psubusb " #in0 ", %%mm2\n" \
215  "psubusb " #in1 ", %%mm3\n" \
216  "psubusb " #out0 ", " #in0 "\n" \
217  "psubusb " #out1 ", " #in1 "\n" \
218  "por %%mm2, " #in0 "\n" \
219  "por %%mm3, " #in1 "\n" \
220  "movq " #in0 ", %%mm2\n" \
221  "movq " #in1 ", %%mm3\n" \
222  "punpcklbw %%mm7, " #in0 "\n" \
223  "punpcklbw %%mm7, " #in1 "\n" \
224  "punpckhbw %%mm7, %%mm2\n" \
225  "punpckhbw %%mm7, %%mm3\n" \
226  "paddw " #in1 ", " #in0 "\n" \
227  "paddw %%mm3, %%mm2\n" \
228  "paddw %%mm2, " #in0 "\n" \
229  "paddw " #in0 ", %%mm6\n"
230 
231 
232  __asm__ volatile (
233  "movl %4, %%ecx\n"
234  "pxor %%mm6, %%mm6\n"
235  "pcmpeqw %%mm7, %%mm7\n"
236  "psllw $15, %%mm7\n"
237  "packsswb %%mm7, %%mm7\n"
238  "movq (%0), %%mm0\n"
239  "movq (%1), %%mm2\n"
240  "movq 8(%0), %%mm1\n"
241  "movq 8(%1), %%mm3\n"
242  "add %3, %0\n"
243  "add %3, %1\n"
244  "psubb %%mm2, %%mm0\n"
245  "psubb %%mm3, %%mm1\n"
246  "pxor %%mm7, %%mm0\n"
247  "pxor %%mm7, %%mm1\n"
248  "jmp 2f\n"
249  "1:\n"
250 
251  SUM(%%mm4, %%mm5, %%mm0, %%mm1)
252  "2:\n"
253  SUM(%%mm0, %%mm1, %%mm4, %%mm5)
254 
255  "subl $2, %%ecx\n"
256  "jnz 1b\n"
257 
258  "movq %%mm6, %%mm0\n"
259  "psrlq $32, %%mm6\n"
260  "paddw %%mm6, %%mm0\n"
261  "movq %%mm0, %%mm6\n"
262  "psrlq $16, %%mm0\n"
263  "paddw %%mm6, %%mm0\n"
264  "movd %%mm0, %2\n"
265  : "+r" (pix1), "+r" (pix2), "=r" (tmp)
266  : "r" (stride), "m" (h)
/* "memory": the asm reads pixel data through pix1 and pix2, which are not
 * otherwise visible to the compiler as memory operands. */
267  : "%ecx", "memory");
268 
269  return tmp & 0x7FFF;
270 }
271 #undef SUM
272 
/* Rounding constants for the averaging SAD helpers: entry i holds the value i
 * in each of its four 16-bit lanes. Entry 1 is loaded into mm5 by the x2/y2
 * wrappers generated by PIX_SAD; entry 2 is a memory operand of sad8_4_mmx(). */
273 DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
274  0x0000000000000000ULL,
275  0x0001000100010001ULL,
276  0x0002000200020002ULL,
277 };
278 
/**
 * Adds the per-byte |blk1 - blk2| of an 8-byte-wide block to the running
 * total in mm6.
 *
 * The asm neither initialises nor returns anything in C terms: it expects
 * mm7 == 0 and a running total in mm6 on entry (the PIX_SAD wrappers set both)
 * and leaves the updated total in mm6 for sum_mmx().
 *
 * The index register starts at -stride * h and both base pointers are biased
 * by +stride * h, so the loop counts up towards zero. Two rows are handled per
 * iteration (the index is advanced by stride twice), and the loop exits once
 * the index is no longer negative.
 *
 * NOTE(review): the asm reads memory through its pointer operands without a
 * "memory" clobber and depends on MMX register state set by an earlier asm
 * statement; confirm this is acceptable for the supported compilers.
 */
279 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
280  ptrdiff_t stride, int h)
281 {
282  x86_reg len = -stride * h;
283  __asm__ volatile (
284  ".p2align 4 \n\t"
285  "1: \n\t"
286  "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
287  "movq (%2, %%"FF_REG_a"), %%mm2 \n\t"
288  "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
289  "add %3, %%"FF_REG_a" \n\t"
290  "psubusb %%mm0, %%mm2 \n\t"
291  "psubusb %%mm4, %%mm0 \n\t"
292  "movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
293  "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
294  "movq (%2, %%"FF_REG_a"), %%mm5 \n\t"
295  "psubusb %%mm1, %%mm3 \n\t"
296  "psubusb %%mm5, %%mm1 \n\t"
297  "por %%mm2, %%mm0 \n\t"
298  "por %%mm1, %%mm3 \n\t"
299  "movq %%mm0, %%mm1 \n\t"
300  "movq %%mm3, %%mm2 \n\t"
301  "punpcklbw %%mm7, %%mm0 \n\t"
302  "punpckhbw %%mm7, %%mm1 \n\t"
303  "punpcklbw %%mm7, %%mm3 \n\t"
304  "punpckhbw %%mm7, %%mm2 \n\t"
305  "paddw %%mm1, %%mm0 \n\t"
306  "paddw %%mm3, %%mm2 \n\t"
307  "paddw %%mm2, %%mm0 \n\t"
308  "paddw %%mm0, %%mm6 \n\t"
309  "add %3, %%"FF_REG_a" \n\t"
310  " js 1b \n\t"
311  : "+a" (len)
312  : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
313 }
314 
/**
 * Adds to mm6 the per-byte |avg(blk1a, blk1b) - blk2| of an 8-byte-wide
 * block, where avg(a, b) = (a + b + mm5) >> 1 computed in 16-bit lanes.
 *
 * Expects on entry: mm7 == 0, mm6 = running total, mm5 = rounding term per
 * 16-bit lane (the PIX_SAD wrappers load round_tab[1]). The result stays in
 * mm6 for sum_mmx().
 *
 * The index register starts at -stride * h with all three base pointers
 * biased by +stride * h. One row is handled per iteration, and the loop exits
 * once the index is no longer negative.
 *
 * NOTE(review): no "memory" clobber, and it relies on MMX register state from
 * an earlier asm statement; confirm for the supported compilers.
 */
315 static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
316  ptrdiff_t stride, int h)
317 {
318  x86_reg len = -stride * h;
319  __asm__ volatile (
320  ".p2align 4 \n\t"
321  "1: \n\t"
322  "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
323  "movq (%2, %%"FF_REG_a"), %%mm1 \n\t"
324  "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
325  "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
326  "punpcklbw %%mm7, %%mm0 \n\t"
327  "punpcklbw %%mm7, %%mm1 \n\t"
328  "punpckhbw %%mm7, %%mm2 \n\t"
329  "punpckhbw %%mm7, %%mm3 \n\t"
330  "paddw %%mm0, %%mm1 \n\t"
331  "paddw %%mm2, %%mm3 \n\t"
332  "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
333  "movq (%3, %%"FF_REG_a"), %%mm2 \n\t"
334  "paddw %%mm5, %%mm1 \n\t"
335  "paddw %%mm5, %%mm3 \n\t"
336  "psrlw $1, %%mm1 \n\t"
337  "psrlw $1, %%mm3 \n\t"
338  "packuswb %%mm3, %%mm1 \n\t"
339  "psubusb %%mm1, %%mm4 \n\t"
340  "psubusb %%mm2, %%mm1 \n\t"
341  "por %%mm4, %%mm1 \n\t"
342  "movq %%mm1, %%mm0 \n\t"
343  "punpcklbw %%mm7, %%mm0 \n\t"
344  "punpckhbw %%mm7, %%mm1 \n\t"
345  "paddw %%mm1, %%mm0 \n\t"
346  "paddw %%mm0, %%mm6 \n\t"
347  "add %4, %%"FF_REG_a" \n\t"
348  " js 1b \n\t"
349  : "+a" (len)
350  : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
351  "r" (stride));
352 }
353 
/**
 * Adds to mm6 the per-byte |avg4(blk1) - blk2| of an 8-byte-wide block.
 *
 * avg4 is built from four blk1 samples (x, x+1 on the current row and x, x+1
 * on the next row) as (sum + round_tab[2]) >> 2 in 16-bit lanes.
 *
 * Before the loop, the horizontal pair sum of the first row is placed in
 * mm0/mm1. Each iteration forms the pair sum of the following row in mm2/mm3
 * (read through the pointer biased by one extra stride), combines the two,
 * compares against blk2, and then moves mm2/mm3 into mm0/mm1 so the next
 * iteration reuses it as its "current row".
 *
 * Expects mm7 == 0 and a running total in mm6 on entry; mm5 is overwritten
 * with round_tab[2] inside the loop. The index register starts at
 * -stride * h, advances by stride per iteration, and the loop exits once it
 * is no longer negative.
 *
 * NOTE(review): no "memory" clobber, and it relies on MMX register state from
 * an earlier asm statement; confirm for the supported compilers.
 */
354 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
355  ptrdiff_t stride, int h)
356 {
357  x86_reg len = -stride * h;
358  __asm__ volatile (
359  "movq (%1, %%"FF_REG_a"), %%mm0\n\t"
360  "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
361  "movq %%mm0, %%mm1 \n\t"
362  "movq %%mm2, %%mm3 \n\t"
363  "punpcklbw %%mm7, %%mm0 \n\t"
364  "punpckhbw %%mm7, %%mm1 \n\t"
365  "punpcklbw %%mm7, %%mm2 \n\t"
366  "punpckhbw %%mm7, %%mm3 \n\t"
367  "paddw %%mm2, %%mm0 \n\t"
368  "paddw %%mm3, %%mm1 \n\t"
369  ".p2align 4 \n\t"
370  "1: \n\t"
371  "movq (%2, %%"FF_REG_a"), %%mm2\n\t"
372  "movq 1(%2, %%"FF_REG_a"), %%mm4\n\t"
373  "movq %%mm2, %%mm3 \n\t"
374  "movq %%mm4, %%mm5 \n\t"
375  "punpcklbw %%mm7, %%mm2 \n\t"
376  "punpckhbw %%mm7, %%mm3 \n\t"
377  "punpcklbw %%mm7, %%mm4 \n\t"
378  "punpckhbw %%mm7, %%mm5 \n\t"
379  "paddw %%mm4, %%mm2 \n\t"
380  "paddw %%mm5, %%mm3 \n\t"
381  "movq %5, %%mm5 \n\t"
382  "paddw %%mm2, %%mm0 \n\t"
383  "paddw %%mm3, %%mm1 \n\t"
384  "paddw %%mm5, %%mm0 \n\t"
385  "paddw %%mm5, %%mm1 \n\t"
386  "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
387  "movq (%3, %%"FF_REG_a"), %%mm5 \n\t"
388  "psrlw $2, %%mm0 \n\t"
389  "psrlw $2, %%mm1 \n\t"
390  "packuswb %%mm1, %%mm0 \n\t"
391  "psubusb %%mm0, %%mm4 \n\t"
392  "psubusb %%mm5, %%mm0 \n\t"
393  "por %%mm4, %%mm0 \n\t"
394  "movq %%mm0, %%mm4 \n\t"
395  "punpcklbw %%mm7, %%mm0 \n\t"
396  "punpckhbw %%mm7, %%mm4 \n\t"
397  "paddw %%mm0, %%mm6 \n\t"
398  "paddw %%mm4, %%mm6 \n\t"
399  "movq %%mm2, %%mm0 \n\t"
400  "movq %%mm3, %%mm1 \n\t"
401  "add %4, %%"FF_REG_a" \n\t"
402  " js 1b \n\t"
403  : "+a" (len)
404  : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
405  "r" (stride), "m" (round_tab[2]));
406 }
407 
/**
 * Folds the four 16-bit lanes of the mm6 accumulator into one value.
 *
 * Takes no C-level input: it reads whatever the preceding sad8_*_mmx() asm
 * left in mm6, and overwrites mm0 and mm6 while reducing.
 *
 * @return low 16 bits of the lane sum
 */
408 static inline int sum_mmx(void)
409 {
410  int ret;
411  __asm__ volatile (
412  "movq %%mm6, %%mm0 \n\t"
413  "psrlq $32, %%mm6 \n\t"
414  "paddw %%mm0, %%mm6 \n\t"
415  "movq %%mm6, %%mm0 \n\t"
416  "psrlq $16, %%mm6 \n\t"
417  "paddw %%mm0, %%mm6 \n\t"
418  "movd %%mm6, %0 \n\t"
419  : "=r" (ret));
/* Only the lowest lane holds the full sum; discard the upper half of the
 * 32-bit value moved out of mm6. */
420  return ret & 0xFFFF;
421 }
422 
/* Horizontal two-sample variant: runs sad8_2_mmx() with blk1 and blk1 + 1 as
 * the two sources to be averaged before comparison with blk2. */
423 static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
424  ptrdiff_t stride, int h)
425 {
426  sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
427 }
428 
/* Vertical two-sample variant: runs sad8_2_mmx() with blk1 and blk1 + stride
 * (the same column one row down) as the two sources to be averaged. */
429 static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
430  ptrdiff_t stride, int h)
431 {
432  sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
433 }
434 
/*
 * PIX_SAD(suf) generates eight comparators built on the helpers above:
 *   sad8_suf,  sad8_x2_suf,  sad8_y2_suf,  sad8_xy2_suf
 *   sad16_suf, sad16_x2_suf, sad16_y2_suf, sad16_xy2_suf
 *
 * Each one:
 *   1. zeroes mm7 (unpack operand) and mm6 (accumulator); the x2/y2 variants
 *      also load round_tab[1] into mm5 as the averaging rounding term;
 *   2. calls the matching sad8_*_suf helper, which adds into mm6;
 *   3. returns sum_suf(), the folded accumulator.
 *
 * Note the parameter order: the generated functions take (v, blk2, blk1, ...)
 * and forward the pointers to the helpers as (blk1, blk2, ...).
 *
 * The 8-wide variants assert h == 8 and always pass a literal 8 to the helper.
 * The 16-wide variants pass h through and run the 8-wide helper twice, at byte
 * offsets 0 and 8, before folding the shared accumulator once.
 *
 * The v argument is unused by every generated function.
 */
435 #define PIX_SAD(suf) \
436 static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
437  uint8_t *blk1, ptrdiff_t stride, int h) \
438 { \
439  av_assert2(h == 8); \
440  __asm__ volatile ( \
441  "pxor %%mm7, %%mm7 \n\t" \
442  "pxor %%mm6, %%mm6 \n\t" \
443  :); \
444  \
445  sad8_1_ ## suf(blk1, blk2, stride, 8); \
446  \
447  return sum_ ## suf(); \
448 } \
449  \
450 static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
451  uint8_t *blk1, ptrdiff_t stride, int h) \
452 { \
453  av_assert2(h == 8); \
454  __asm__ volatile ( \
455  "pxor %%mm7, %%mm7 \n\t" \
456  "pxor %%mm6, %%mm6 \n\t" \
457  "movq %0, %%mm5 \n\t" \
458  :: "m" (round_tab[1])); \
459  \
460  sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
461  \
462  return sum_ ## suf(); \
463 } \
464  \
465 static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
466  uint8_t *blk1, ptrdiff_t stride, int h) \
467 { \
468  av_assert2(h == 8); \
469  __asm__ volatile ( \
470  "pxor %%mm7, %%mm7 \n\t" \
471  "pxor %%mm6, %%mm6 \n\t" \
472  "movq %0, %%mm5 \n\t" \
473  :: "m" (round_tab[1])); \
474  \
475  sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
476  \
477  return sum_ ## suf(); \
478 } \
479  \
480 static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
481  uint8_t *blk1, ptrdiff_t stride, int h) \
482 { \
483  av_assert2(h == 8); \
484  __asm__ volatile ( \
485  "pxor %%mm7, %%mm7 \n\t" \
486  "pxor %%mm6, %%mm6 \n\t" \
487  ::); \
488  \
489  sad8_4_ ## suf(blk1, blk2, stride, 8); \
490  \
491  return sum_ ## suf(); \
492 } \
493  \
494 static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \
495  uint8_t *blk1, ptrdiff_t stride, int h) \
496 { \
497  __asm__ volatile ( \
498  "pxor %%mm7, %%mm7 \n\t" \
499  "pxor %%mm6, %%mm6 \n\t" \
500  :); \
501  \
502  sad8_1_ ## suf(blk1, blk2, stride, h); \
503  sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
504  \
505  return sum_ ## suf(); \
506 } \
507  \
508 static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
509  uint8_t *blk1, ptrdiff_t stride, int h) \
510 { \
511  __asm__ volatile ( \
512  "pxor %%mm7, %%mm7 \n\t" \
513  "pxor %%mm6, %%mm6 \n\t" \
514  "movq %0, %%mm5 \n\t" \
515  :: "m" (round_tab[1])); \
516  \
517  sad8_x2a_ ## suf(blk1, blk2, stride, h); \
518  sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
519  \
520  return sum_ ## suf(); \
521 } \
522  \
523 static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
524  uint8_t *blk1, ptrdiff_t stride, int h) \
525 { \
526  __asm__ volatile ( \
527  "pxor %%mm7, %%mm7 \n\t" \
528  "pxor %%mm6, %%mm6 \n\t" \
529  "movq %0, %%mm5 \n\t" \
530  :: "m" (round_tab[1])); \
531  \
532  sad8_y2a_ ## suf(blk1, blk2, stride, h); \
533  sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
534  \
535  return sum_ ## suf(); \
536 } \
537  \
538 static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
539  uint8_t *blk1, ptrdiff_t stride, int h) \
540 { \
541  __asm__ volatile ( \
542  "pxor %%mm7, %%mm7 \n\t" \
543  "pxor %%mm6, %%mm6 \n\t" \
544  ::); \
545  \
546  sad8_4_ ## suf(blk1, blk2, stride, h); \
547  sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
548  \
549  return sum_ ## suf(); \
550 } \
551 
/* Instantiate the eight *_mmx comparators installed by the init function. */
552 PIX_SAD(mmx)
553 
554 #endif /* HAVE_INLINE_ASM */
555 
/**
 * Install the x86 comparison kernels supported by the running CPU into c.
 *
 * The capability blocks run in the order inline MMX, external MMX, MMXEXT,
 * SSE2, SSSE3, and each assigns unconditionally, so a later block overrides
 * any slot an earlier one filled.
 *
 * Kernels that are not bit-exact (vsad16_mmx and the *_approx_* routines) are
 * installed only when AV_CODEC_FLAG_BITEXACT is not set in avctx->flags.
 *
 * @param c     comparison context whose function pointers are filled in
 * @param avctx codec context; its flags and codec_id are read
 */
556 av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
557 {
558  int cpu_flags = av_get_cpu_flags();
559 
560 #if HAVE_INLINE_ASM
561  if (INLINE_MMX(cpu_flags)) {
562  c->pix_abs[0][0] = sad16_mmx;
563  c->pix_abs[0][1] = sad16_x2_mmx;
564  c->pix_abs[0][2] = sad16_y2_mmx;
565  c->pix_abs[0][3] = sad16_xy2_mmx;
566  c->pix_abs[1][0] = sad8_mmx;
567  c->pix_abs[1][1] = sad8_x2_mmx;
568  c->pix_abs[1][2] = sad8_y2_mmx;
569  c->pix_abs[1][3] = sad8_xy2_mmx;
570 
571  c->sad[0] = sad16_mmx;
572  c->sad[1] = sad8_mmx;
573 
574  c->vsad[4] = vsad_intra16_mmx;
575 
576  if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
577  c->vsad[0] = vsad16_mmx;
578  }
579  }
580 
581 #endif /* HAVE_INLINE_ASM */
582 
583  if (EXTERNAL_MMX(cpu_flags)) {
584  c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
585  c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
586  c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx;
587  c->sse[0] = ff_sse16_mmx;
588  c->sse[1] = ff_sse8_mmx;
/* The nsse wrappers are compiled only under HAVE_YASM. */
589 #if HAVE_YASM
590  c->nsse[0] = nsse16_mmx;
591  c->nsse[1] = nsse8_mmx;
592 #endif
593  }
594 
595  if (EXTERNAL_MMXEXT(cpu_flags)) {
596  c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
597  c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
598  c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext;
599 
600  c->sad[0] = ff_sad16_mmxext;
601  c->sad[1] = ff_sad8_mmxext;
602 
603  c->pix_abs[0][0] = ff_sad16_mmxext;
604  c->pix_abs[0][1] = ff_sad16_x2_mmxext;
605  c->pix_abs[0][2] = ff_sad16_y2_mmxext;
606  c->pix_abs[1][0] = ff_sad8_mmxext;
607  c->pix_abs[1][1] = ff_sad8_x2_mmxext;
608  c->pix_abs[1][2] = ff_sad8_y2_mmxext;
609 
610  c->vsad[4] = ff_vsad_intra16_mmxext;
611  c->vsad[5] = ff_vsad_intra8_mmxext;
612 
613  if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
614  c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
615  c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;
616 
617  c->vsad[0] = ff_vsad16_approx_mmxext;
618  c->vsad[1] = ff_vsad8_approx_mmxext;
619  }
620  }
621 
622  if (EXTERNAL_SSE2(cpu_flags)) {
623  c->sse[0] = ff_sse16_sse2;
624  c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
625 
626 #if HAVE_ALIGNED_STACK
627  c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
628  c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
629 #endif
/* The SSE2 SAD kernels are skipped on CPUs flagged SSE2SLOW and for the Snow
 * codec. */
630  if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
631  c->sad[0] = ff_sad16_sse2;
632  c->pix_abs[0][0] = ff_sad16_sse2;
633  c->pix_abs[0][1] = ff_sad16_x2_sse2;
634  c->pix_abs[0][2] = ff_sad16_y2_sse2;
635 
636  c->vsad[4] = ff_vsad_intra16_sse2;
637  if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
638  c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
639  c->vsad[0] = ff_vsad16_approx_sse2;
640  }
641  }
642  }
643 
644  if (EXTERNAL_SSSE3(cpu_flags)) {
645  c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
646 #if HAVE_ALIGNED_STACK
647  c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
648  c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
649 #endif
650  }
651 }
#define EXTERNAL_MMX(flags)
Definition: cpu.h:54
int ff_sum_abs_dctelem_mmx(int16_t *block)
int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
mpegvideo header.
int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
Macro definitions for various function/variable attributes.
me_cmp_func hadamard8_diff[6]
Definition: me_cmp.h:58
static int16_t block[64]
Definition: dct.c:113
int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
uint8_t
#define av_cold
Definition: attributes.h:82
#define av_assert2(cond)
assert() equivalent, that does lie in speed critical code.
Definition: avassert.h:64
#define hadamard_func(cpu)
Definition: me_cmp_init.c:81
int ff_sum_abs_dctelem_sse2(int16_t *block)
me_cmp_func nsse[6]
Definition: me_cmp.h:65
#define AV_CPU_FLAG_SSE2SLOW
SSE2 supported, but usually not faster.
Definition: cpu.h:35
#define EXTERNAL_SSE2(flags)
Definition: cpu.h:57
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sum_abs_dctelem_ssse3(int16_t *block)
int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
#define INLINE_MMX(flags)
Definition: cpu.h:81
int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int flags
AV_CODEC_FLAG_*.
Definition: avcodec.h:1771
me_cmp_func pix_abs[2][4]
Definition: me_cmp.h:78
int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
#define DECLARE_ASM_CONST(n, t, v)
Declare a static constant aligned variable appropriate for use in inline assembly code...
Definition: mem.h:102
int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
#define AV_CODEC_FLAG_BITEXACT
Use only bitexact stuff (except (I)DCT).
Definition: avcodec.h:886
int ff_sum_abs_dctelem_mmxext(int16_t *block)
me_cmp_func vsad[6]
Definition: me_cmp.h:63
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
Definition: common.h:72
int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int dummy
Definition: motion.c:64
enum AVCodecID codec_id
Definition: avcodec.h:1693
main external API structure.
Definition: avcodec.h:1676
#define EXTERNAL_SSSE3(flags)
Definition: cpu.h:63
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:76
#define EXTERNAL_MMXEXT(flags)
Definition: cpu.h:55
me_cmp_func sad[6]
Definition: me_cmp.h:56
me_cmp_func sse[6]
Definition: me_cmp.h:57
MpegEncContext.
Definition: mpegvideo.h:78
struct AVCodecContext * avctx
Definition: mpegvideo.h:95
int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
GLint GLenum GLboolean GLsizei stride
Definition: opengl_enc.c:105
MECmpContext mecc
Definition: mpegvideo.h:228
int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int nsse_weight
noise vs. sse weight for the nsse comparison function
Definition: avcodec.h:3174
static double c[64]
int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int x86_reg
Definition: asm.h:72
int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int len
int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
static uint8_t tmp[8]
Definition: des.c:38
int(* sum_abs_dctelem)(int16_t *block)
Definition: me_cmp.h:54
#define stride
void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)