FFmpeg
me_cmp_init.c
1 /*
2  * SIMD-optimized motion estimation
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24 
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/mem_internal.h"
28 #include "libavutil/x86/asm.h"
29 #include "libavutil/x86/cpu.h"
30 #include "libavcodec/me_cmp.h"
31 #include "libavcodec/mpegvideo.h"
32 
33 int ff_sum_abs_dctelem_mmx(int16_t *block);
34 int ff_sum_abs_dctelem_mmxext(int16_t *block);
35 int ff_sum_abs_dctelem_sse2(int16_t *block);
36 int ff_sum_abs_dctelem_ssse3(int16_t *block);
37 int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
38  ptrdiff_t stride, int h);
39 int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
40  ptrdiff_t stride, int h);
41 int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
42  ptrdiff_t stride, int h);
43 int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
44 int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h);
45 int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
46  ptrdiff_t stride, int h);
47 int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
48  ptrdiff_t stride, int h);
49 int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
50  ptrdiff_t stride, int h);
51 int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
52  ptrdiff_t stride, int h);
53 int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
54  ptrdiff_t stride, int h);
55 int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
56  ptrdiff_t stride, int h);
57 int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
58  ptrdiff_t stride, int h);
59 int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
60  ptrdiff_t stride, int h);
61 int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
62  ptrdiff_t stride, int h);
63 int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
64  ptrdiff_t stride, int h);
65 int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
66  ptrdiff_t stride, int h);
67 int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
68  ptrdiff_t stride, int h);
69 int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
70  ptrdiff_t stride, int h);
71 int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
72  ptrdiff_t stride, int h);
73 int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
74  ptrdiff_t stride, int h);
75 int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
76  ptrdiff_t stride, int h);
77 int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
78  ptrdiff_t stride, int h);
79 int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
80  ptrdiff_t stride, int h);
81 
82 #define hadamard_func(cpu) \
83  int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
84  uint8_t *src2, ptrdiff_t stride, int h); \
85  int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
86  uint8_t *src2, ptrdiff_t stride, int h);
87 
88 hadamard_func(mmx)
89 hadamard_func(mmxext)
90 hadamard_func(sse2)
91 hadamard_func(ssse3)
92 
93 #if HAVE_X86ASM
94 static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
95  ptrdiff_t stride, int h)
96 {
97  int score1, score2;
98 
99  if (c)
100  score1 = c->mecc.sse[0](c, pix1, pix2, stride, h);
101  else
102  score1 = ff_sse16_mmx(c, pix1, pix2, stride, h);
103  score2 = ff_hf_noise16_mmx(pix1, stride, h) + ff_hf_noise8_mmx(pix1+8, stride, h)
104  - ff_hf_noise16_mmx(pix2, stride, h) - ff_hf_noise8_mmx(pix2+8, stride, h);
105 
106  if (c)
107  return score1 + FFABS(score2) * c->avctx->nsse_weight;
108  else
109  return score1 + FFABS(score2) * 8;
110 }
111 
112 static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
113  ptrdiff_t stride, int h)
114 {
115  int score1 = ff_sse8_mmx(c, pix1, pix2, stride, h);
116  int score2 = ff_hf_noise8_mmx(pix1, stride, h) -
117  ff_hf_noise8_mmx(pix2, stride, h);
118 
119  if (c)
120  return score1 + FFABS(score2) * c->avctx->nsse_weight;
121  else
122  return score1 + FFABS(score2) * 8;
123 }
124 
125 #endif /* HAVE_X86ASM */
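/*
 * Note (illustrative, not part of me_cmp_init.c): the nsse wrappers above add
 * a weighted "high-frequency noise" term to a plain SSE score, so that blocks
 * which lose or gain texture are penalised.  A rough scalar sketch of the
 * 8-pixel-wide case, assuming 8-bit samples and a caller-supplied weight:
 *
 *     static int nsse8_scalar(const uint8_t *p1, const uint8_t *p2,
 *                             ptrdiff_t stride, int h, int nsse_weight)
 *     {
 *         int sse = 0, noise = 0;
 *         for (int y = 0; y < h; y++) {
 *             for (int x = 0; x < 8; x++)
 *                 sse += (p1[x] - p2[x]) * (p1[x] - p2[x]);
 *             if (y + 1 < h)   // 2x2 second-order differences of each block
 *                 for (int x = 0; x < 7; x++)
 *                     noise += FFABS(p1[x] - p1[x + 1] - p1[x + stride] + p1[x + stride + 1]) -
 *                              FFABS(p2[x] - p2[x + 1] - p2[x + stride] + p2[x + stride + 1]);
 *             p1 += stride;
 *             p2 += stride;
 *         }
 *         return sse + FFABS(noise) * nsse_weight;
 *     }
 */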
126 
127 #if HAVE_INLINE_ASM
128 
129 static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
130  ptrdiff_t stride, int h)
131 {
132  int tmp;
133 
134  av_assert2((((int) pix) & 7) == 0);
135  av_assert2((stride & 7) == 0);
136 
137 #define SUM(in0, in1, out0, out1) \
138  "movq (%0), %%mm2\n" \
139  "movq 8(%0), %%mm3\n" \
140  "add %2,%0\n" \
141  "movq %%mm2, " #out0 "\n" \
142  "movq %%mm3, " #out1 "\n" \
143  "psubusb " #in0 ", %%mm2\n" \
144  "psubusb " #in1 ", %%mm3\n" \
145  "psubusb " #out0 ", " #in0 "\n" \
146  "psubusb " #out1 ", " #in1 "\n" \
147  "por %%mm2, " #in0 "\n" \
148  "por %%mm3, " #in1 "\n" \
149  "movq " #in0 ", %%mm2\n" \
150  "movq " #in1 ", %%mm3\n" \
151  "punpcklbw %%mm7, " #in0 "\n" \
152  "punpcklbw %%mm7, " #in1 "\n" \
153  "punpckhbw %%mm7, %%mm2\n" \
154  "punpckhbw %%mm7, %%mm3\n" \
155  "paddw " #in1 ", " #in0 "\n" \
156  "paddw %%mm3, %%mm2\n" \
157  "paddw %%mm2, " #in0 "\n" \
158  "paddw " #in0 ", %%mm6\n"
159 
160 
161  __asm__ volatile (
162  "movl %3, %%ecx\n"
163  "pxor %%mm6, %%mm6\n"
164  "pxor %%mm7, %%mm7\n"
165  "movq (%0), %%mm0\n"
166  "movq 8(%0), %%mm1\n"
167  "add %2, %0\n"
168  "jmp 2f\n"
169  "1:\n"
170 
171  SUM(%%mm4, %%mm5, %%mm0, %%mm1)
172  "2:\n"
173  SUM(%%mm0, %%mm1, %%mm4, %%mm5)
174 
175  "subl $2, %%ecx\n"
176  "jnz 1b\n"
177 
178  "movq %%mm6, %%mm0\n"
179  "psrlq $32, %%mm6\n"
180  "paddw %%mm6, %%mm0\n"
181  "movq %%mm0, %%mm6\n"
182  "psrlq $16, %%mm0\n"
183  "paddw %%mm6, %%mm0\n"
184  "movd %%mm0, %1\n"
185  : "+r" (pix), "=r" (tmp)
186  : "r" (stride), "m" (h)
187  : "%ecx");
188 
189  return tmp & 0xFFFF;
190 }
191 #undef SUM
192 
193 static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
194  ptrdiff_t stride, int h)
195 {
196  int tmp;
197 
198  av_assert2((((int) pix1) & 7) == 0);
199  av_assert2((((int) pix2) & 7) == 0);
200  av_assert2((stride & 7) == 0);
201 
202 #define SUM(in0, in1, out0, out1) \
203  "movq (%0), %%mm2\n" \
204  "movq (%1), " #out0 "\n" \
205  "movq 8(%0), %%mm3\n" \
206  "movq 8(%1), " #out1 "\n" \
207  "add %3, %0\n" \
208  "add %3, %1\n" \
209  "psubb " #out0 ", %%mm2\n" \
210  "psubb " #out1 ", %%mm3\n" \
211  "pxor %%mm7, %%mm2\n" \
212  "pxor %%mm7, %%mm3\n" \
213  "movq %%mm2, " #out0 "\n" \
214  "movq %%mm3, " #out1 "\n" \
215  "psubusb " #in0 ", %%mm2\n" \
216  "psubusb " #in1 ", %%mm3\n" \
217  "psubusb " #out0 ", " #in0 "\n" \
218  "psubusb " #out1 ", " #in1 "\n" \
219  "por %%mm2, " #in0 "\n" \
220  "por %%mm3, " #in1 "\n" \
221  "movq " #in0 ", %%mm2\n" \
222  "movq " #in1 ", %%mm3\n" \
223  "punpcklbw %%mm7, " #in0 "\n" \
224  "punpcklbw %%mm7, " #in1 "\n" \
225  "punpckhbw %%mm7, %%mm2\n" \
226  "punpckhbw %%mm7, %%mm3\n" \
227  "paddw " #in1 ", " #in0 "\n" \
228  "paddw %%mm3, %%mm2\n" \
229  "paddw %%mm2, " #in0 "\n" \
230  "paddw " #in0 ", %%mm6\n"
231 
232 
233  __asm__ volatile (
234  "movl %4, %%ecx\n"
235  "pxor %%mm6, %%mm6\n"
236  "pcmpeqw %%mm7, %%mm7\n"
237  "psllw $15, %%mm7\n"
238  "packsswb %%mm7, %%mm7\n"
239  "movq (%0), %%mm0\n"
240  "movq (%1), %%mm2\n"
241  "movq 8(%0), %%mm1\n"
242  "movq 8(%1), %%mm3\n"
243  "add %3, %0\n"
244  "add %3, %1\n"
245  "psubb %%mm2, %%mm0\n"
246  "psubb %%mm3, %%mm1\n"
247  "pxor %%mm7, %%mm0\n"
248  "pxor %%mm7, %%mm1\n"
249  "jmp 2f\n"
250  "1:\n"
251 
252  SUM(%%mm4, %%mm5, %%mm0, %%mm1)
253  "2:\n"
254  SUM(%%mm0, %%mm1, %%mm4, %%mm5)
255 
256  "subl $2, %%ecx\n"
257  "jnz 1b\n"
258 
259  "movq %%mm6, %%mm0\n"
260  "psrlq $32, %%mm6\n"
261  "paddw %%mm6, %%mm0\n"
262  "movq %%mm0, %%mm6\n"
263  "psrlq $16, %%mm0\n"
264  "paddw %%mm6, %%mm0\n"
265  "movd %%mm0, %2\n"
266  : "+r" (pix1), "+r" (pix2), "=r" (tmp)
267  : "r" (stride), "m" (h)
268  : "%ecx");
269 
270  return tmp & 0x7FFF;
271 }
272 #undef SUM
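/*
 * Note (descriptive, not from the original file): both vsad kernels above
 * measure vertical activity.  vsad_intra16_mmx sums |pix[x] - pix[x + stride]|
 * between adjacent rows of a single block; vsad16_mmx does the same on the
 * per-pixel difference pix1 - pix2, where the pxor with 0x80 bytes re-biases
 * the wrapped signed difference so the unsigned psubusb/por absolute-value
 * trick still applies.
 */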
273 
274 DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
275  0x0000000000000000ULL,
276  0x0001000100010001ULL,
277  0x0002000200020002ULL,
278 };
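/*
 * round_tab supplies the rounding constants for the half-pel averaging below:
 * round_tab[1] packs 1 into each 16-bit lane and is added before the ">> 1"
 * in the x2/y2 cases, giving (a + b + 1) >> 1; round_tab[2] packs 2 per lane
 * and is added before the ">> 2" in the xy2 case, giving (a + b + c + d + 2) >> 2.
 */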
279 
280 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2,
281  ptrdiff_t stride, int h)
282 {
283  x86_reg len = -stride * h;
284  __asm__ volatile (
285  ".p2align 4 \n\t"
286  "1: \n\t"
287  "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
288  "movq (%2, %%"FF_REG_a"), %%mm2 \n\t"
289  "movq (%2, %%"FF_REG_a"), %%mm4 \n\t"
290  "add %3, %%"FF_REG_a" \n\t"
291  "psubusb %%mm0, %%mm2 \n\t"
292  "psubusb %%mm4, %%mm0 \n\t"
293  "movq (%1, %%"FF_REG_a"), %%mm1 \n\t"
294  "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
295  "movq (%2, %%"FF_REG_a"), %%mm5 \n\t"
296  "psubusb %%mm1, %%mm3 \n\t"
297  "psubusb %%mm5, %%mm1 \n\t"
298  "por %%mm2, %%mm0 \n\t"
299  "por %%mm1, %%mm3 \n\t"
300  "movq %%mm0, %%mm1 \n\t"
301  "movq %%mm3, %%mm2 \n\t"
302  "punpcklbw %%mm7, %%mm0 \n\t"
303  "punpckhbw %%mm7, %%mm1 \n\t"
304  "punpcklbw %%mm7, %%mm3 \n\t"
305  "punpckhbw %%mm7, %%mm2 \n\t"
306  "paddw %%mm1, %%mm0 \n\t"
307  "paddw %%mm3, %%mm2 \n\t"
308  "paddw %%mm2, %%mm0 \n\t"
309  "paddw %%mm0, %%mm6 \n\t"
310  "add %3, %%"FF_REG_a" \n\t"
311  " js 1b \n\t"
312  : "+a" (len)
313  : "r" (blk1 - len), "r" (blk2 - len), "r" (stride));
314 }
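/*
 * sad8_1_mmx is the core 8-wide SAD row loop.  MMX has no absolute-difference
 * instruction, so |a - b| is built as psubusb(a, b) | psubusb(b, a): each
 * saturating subtraction yields zero in one direction and the true difference
 * in the other, and the por merges them.  The result is widened to 16-bit
 * words and accumulated in %mm6, which sum_mmx() later reduces to a scalar.
 */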
315 
316 static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
317  ptrdiff_t stride, int h)
318 {
319  x86_reg len = -stride * h;
320  __asm__ volatile (
321  ".p2align 4 \n\t"
322  "1: \n\t"
323  "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
324  "movq (%2, %%"FF_REG_a"), %%mm1 \n\t"
325  "movq (%1, %%"FF_REG_a"), %%mm2 \n\t"
326  "movq (%2, %%"FF_REG_a"), %%mm3 \n\t"
327  "punpcklbw %%mm7, %%mm0 \n\t"
328  "punpcklbw %%mm7, %%mm1 \n\t"
329  "punpckhbw %%mm7, %%mm2 \n\t"
330  "punpckhbw %%mm7, %%mm3 \n\t"
331  "paddw %%mm0, %%mm1 \n\t"
332  "paddw %%mm2, %%mm3 \n\t"
333  "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
334  "movq (%3, %%"FF_REG_a"), %%mm2 \n\t"
335  "paddw %%mm5, %%mm1 \n\t"
336  "paddw %%mm5, %%mm3 \n\t"
337  "psrlw $1, %%mm1 \n\t"
338  "psrlw $1, %%mm3 \n\t"
339  "packuswb %%mm3, %%mm1 \n\t"
340  "psubusb %%mm1, %%mm4 \n\t"
341  "psubusb %%mm2, %%mm1 \n\t"
342  "por %%mm4, %%mm1 \n\t"
343  "movq %%mm1, %%mm0 \n\t"
344  "punpcklbw %%mm7, %%mm0 \n\t"
345  "punpckhbw %%mm7, %%mm1 \n\t"
346  "paddw %%mm1, %%mm0 \n\t"
347  "paddw %%mm0, %%mm6 \n\t"
348  "add %4, %%"FF_REG_a" \n\t"
349  " js 1b \n\t"
350  : "+a" (len)
351  : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
352  "r" (stride));
353 }
354 
355 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2,
356  ptrdiff_t stride, int h)
357 {
358  x86_reg len = -stride * h;
359  __asm__ volatile (
360  "movq (%1, %%"FF_REG_a"), %%mm0\n\t"
361  "movq 1(%1, %%"FF_REG_a"), %%mm2\n\t"
362  "movq %%mm0, %%mm1 \n\t"
363  "movq %%mm2, %%mm3 \n\t"
364  "punpcklbw %%mm7, %%mm0 \n\t"
365  "punpckhbw %%mm7, %%mm1 \n\t"
366  "punpcklbw %%mm7, %%mm2 \n\t"
367  "punpckhbw %%mm7, %%mm3 \n\t"
368  "paddw %%mm2, %%mm0 \n\t"
369  "paddw %%mm3, %%mm1 \n\t"
370  ".p2align 4 \n\t"
371  "1: \n\t"
372  "movq (%2, %%"FF_REG_a"), %%mm2\n\t"
373  "movq 1(%2, %%"FF_REG_a"), %%mm4\n\t"
374  "movq %%mm2, %%mm3 \n\t"
375  "movq %%mm4, %%mm5 \n\t"
376  "punpcklbw %%mm7, %%mm2 \n\t"
377  "punpckhbw %%mm7, %%mm3 \n\t"
378  "punpcklbw %%mm7, %%mm4 \n\t"
379  "punpckhbw %%mm7, %%mm5 \n\t"
380  "paddw %%mm4, %%mm2 \n\t"
381  "paddw %%mm5, %%mm3 \n\t"
382  "movq %5, %%mm5 \n\t"
383  "paddw %%mm2, %%mm0 \n\t"
384  "paddw %%mm3, %%mm1 \n\t"
385  "paddw %%mm5, %%mm0 \n\t"
386  "paddw %%mm5, %%mm1 \n\t"
387  "movq (%3, %%"FF_REG_a"), %%mm4 \n\t"
388  "movq (%3, %%"FF_REG_a"), %%mm5 \n\t"
389  "psrlw $2, %%mm0 \n\t"
390  "psrlw $2, %%mm1 \n\t"
391  "packuswb %%mm1, %%mm0 \n\t"
392  "psubusb %%mm0, %%mm4 \n\t"
393  "psubusb %%mm5, %%mm0 \n\t"
394  "por %%mm4, %%mm0 \n\t"
395  "movq %%mm0, %%mm4 \n\t"
396  "punpcklbw %%mm7, %%mm0 \n\t"
397  "punpckhbw %%mm7, %%mm4 \n\t"
398  "paddw %%mm0, %%mm6 \n\t"
399  "paddw %%mm4, %%mm6 \n\t"
400  "movq %%mm2, %%mm0 \n\t"
401  "movq %%mm3, %%mm1 \n\t"
402  "add %4, %%"FF_REG_a" \n\t"
403  " js 1b \n\t"
404  : "+a" (len)
405  : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
406  "r" (stride), "m" (round_tab[2]));
407 }
408 
409 static inline int sum_mmx(void)
410 {
411  int ret;
412  __asm__ volatile (
413  "movq %%mm6, %%mm0 \n\t"
414  "psrlq $32, %%mm6 \n\t"
415  "paddw %%mm0, %%mm6 \n\t"
416  "movq %%mm6, %%mm0 \n\t"
417  "psrlq $16, %%mm6 \n\t"
418  "paddw %%mm0, %%mm6 \n\t"
419  "movd %%mm6, %0 \n\t"
420  : "=r" (ret));
421  return ret & 0xFFFF;
422 }
423 
424 static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2,
425  ptrdiff_t stride, int h)
426 {
427  sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
428 }
429 
430 static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2,
431  ptrdiff_t stride, int h)
432 {
433  sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
434 }
435 
436 #define PIX_SAD(suf) \
437 static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \
438  uint8_t *blk1, ptrdiff_t stride, int h) \
439 { \
440  av_assert2(h == 8); \
441  __asm__ volatile ( \
442  "pxor %%mm7, %%mm7 \n\t" \
443  "pxor %%mm6, %%mm6 \n\t" \
444  :); \
445  \
446  sad8_1_ ## suf(blk1, blk2, stride, 8); \
447  \
448  return sum_ ## suf(); \
449 } \
450  \
451 static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
452  uint8_t *blk1, ptrdiff_t stride, int h) \
453 { \
454  av_assert2(h == 8); \
455  __asm__ volatile ( \
456  "pxor %%mm7, %%mm7 \n\t" \
457  "pxor %%mm6, %%mm6 \n\t" \
458  "movq %0, %%mm5 \n\t" \
459  :: "m" (round_tab[1])); \
460  \
461  sad8_x2a_ ## suf(blk1, blk2, stride, 8); \
462  \
463  return sum_ ## suf(); \
464 } \
465  \
466 static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
467  uint8_t *blk1, ptrdiff_t stride, int h) \
468 { \
469  av_assert2(h == 8); \
470  __asm__ volatile ( \
471  "pxor %%mm7, %%mm7 \n\t" \
472  "pxor %%mm6, %%mm6 \n\t" \
473  "movq %0, %%mm5 \n\t" \
474  :: "m" (round_tab[1])); \
475  \
476  sad8_y2a_ ## suf(blk1, blk2, stride, 8); \
477  \
478  return sum_ ## suf(); \
479 } \
480  \
481 static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
482  uint8_t *blk1, ptrdiff_t stride, int h) \
483 { \
484  av_assert2(h == 8); \
485  __asm__ volatile ( \
486  "pxor %%mm7, %%mm7 \n\t" \
487  "pxor %%mm6, %%mm6 \n\t" \
488  ::); \
489  \
490  sad8_4_ ## suf(blk1, blk2, stride, 8); \
491  \
492  return sum_ ## suf(); \
493 } \
494  \
495 static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \
496  uint8_t *blk1, ptrdiff_t stride, int h) \
497 { \
498  __asm__ volatile ( \
499  "pxor %%mm7, %%mm7 \n\t" \
500  "pxor %%mm6, %%mm6 \n\t" \
501  :); \
502  \
503  sad8_1_ ## suf(blk1, blk2, stride, h); \
504  sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
505  \
506  return sum_ ## suf(); \
507 } \
508  \
509 static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
510  uint8_t *blk1, ptrdiff_t stride, int h) \
511 { \
512  __asm__ volatile ( \
513  "pxor %%mm7, %%mm7 \n\t" \
514  "pxor %%mm6, %%mm6 \n\t" \
515  "movq %0, %%mm5 \n\t" \
516  :: "m" (round_tab[1])); \
517  \
518  sad8_x2a_ ## suf(blk1, blk2, stride, h); \
519  sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
520  \
521  return sum_ ## suf(); \
522 } \
523  \
524 static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
525  uint8_t *blk1, ptrdiff_t stride, int h) \
526 { \
527  __asm__ volatile ( \
528  "pxor %%mm7, %%mm7 \n\t" \
529  "pxor %%mm6, %%mm6 \n\t" \
530  "movq %0, %%mm5 \n\t" \
531  :: "m" (round_tab[1])); \
532  \
533  sad8_y2a_ ## suf(blk1, blk2, stride, h); \
534  sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
535  \
536  return sum_ ## suf(); \
537 } \
538  \
539 static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \
540  uint8_t *blk1, ptrdiff_t stride, int h) \
541 { \
542  __asm__ volatile ( \
543  "pxor %%mm7, %%mm7 \n\t" \
544  "pxor %%mm6, %%mm6 \n\t" \
545  ::); \
546  \
547  sad8_4_ ## suf(blk1, blk2, stride, h); \
548  sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \
549  \
550  return sum_ ## suf(); \
551 } \
552 
553 PIX_SAD(mmx)
554 
555 #endif /* HAVE_INLINE_ASM */
556 
557 av_cold void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)
558 {
559  int cpu_flags = av_get_cpu_flags();
560 
561 #if HAVE_INLINE_ASM
562  if (INLINE_MMX(cpu_flags)) {
563  c->pix_abs[0][0] = sad16_mmx;
564  c->pix_abs[0][1] = sad16_x2_mmx;
565  c->pix_abs[0][2] = sad16_y2_mmx;
566  c->pix_abs[0][3] = sad16_xy2_mmx;
567  c->pix_abs[1][0] = sad8_mmx;
568  c->pix_abs[1][1] = sad8_x2_mmx;
569  c->pix_abs[1][2] = sad8_y2_mmx;
570  c->pix_abs[1][3] = sad8_xy2_mmx;
571 
572  c->sad[0] = sad16_mmx;
573  c->sad[1] = sad8_mmx;
574 
575  c->vsad[4] = vsad_intra16_mmx;
576 
577  if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
578  c->vsad[0] = vsad16_mmx;
579  }
580  }
581 
582 #endif /* HAVE_INLINE_ASM */
583 
584  if (EXTERNAL_MMX(cpu_flags)) {
585  c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
586  c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
587  c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx;
588  c->sse[0] = ff_sse16_mmx;
589  c->sse[1] = ff_sse8_mmx;
590 #if HAVE_X86ASM
591  c->nsse[0] = nsse16_mmx;
592  c->nsse[1] = nsse8_mmx;
593 #endif
594  }
595 
596  if (EXTERNAL_MMXEXT(cpu_flags)) {
597  c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
598  c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
599  c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext;
600 
601  c->sad[0] = ff_sad16_mmxext;
602  c->sad[1] = ff_sad8_mmxext;
603 
604  c->pix_abs[0][0] = ff_sad16_mmxext;
605  c->pix_abs[0][1] = ff_sad16_x2_mmxext;
606  c->pix_abs[0][2] = ff_sad16_y2_mmxext;
607  c->pix_abs[1][0] = ff_sad8_mmxext;
608  c->pix_abs[1][1] = ff_sad8_x2_mmxext;
609  c->pix_abs[1][2] = ff_sad8_y2_mmxext;
610 
611  c->vsad[4] = ff_vsad_intra16_mmxext;
612  c->vsad[5] = ff_vsad_intra8_mmxext;
613 
614  if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
615  c->pix_abs[0][3] = ff_sad16_approx_xy2_mmxext;
616  c->pix_abs[1][3] = ff_sad8_approx_xy2_mmxext;
617 
618  c->vsad[0] = ff_vsad16_approx_mmxext;
619  c->vsad[1] = ff_vsad8_approx_mmxext;
620  }
621  }
622 
623  if (EXTERNAL_SSE2(cpu_flags)) {
624  c->sse[0] = ff_sse16_sse2;
625  c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
626 
627 #if HAVE_ALIGNED_STACK
628  c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
629  c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
630 #endif
631  if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
632  c->sad[0] = ff_sad16_sse2;
633  c->pix_abs[0][0] = ff_sad16_sse2;
634  c->pix_abs[0][1] = ff_sad16_x2_sse2;
635  c->pix_abs[0][2] = ff_sad16_y2_sse2;
636 
637  c->vsad[4] = ff_vsad_intra16_sse2;
638  if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {
639  c->pix_abs[0][3] = ff_sad16_approx_xy2_sse2;
640  c->vsad[0] = ff_vsad16_approx_sse2;
641  }
642  }
643  }
644 
645  if (EXTERNAL_SSSE3(cpu_flags)) {
646  c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
647 #if HAVE_ALIGNED_STACK
648  c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
649  c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
650 #endif
651  }
652 }
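For reference, the plain SAD kernels installed above compute the same quantity as the
generic C code in libavcodec/me_cmp.c: the sum of absolute differences over a block.
A minimal scalar sketch of the 16-pixel-wide case (hypothetical helper name; assumes
8-bit samples and <stdint.h>, <stddef.h>, <stdlib.h> for the types and abs()):

    static int sad16_scalar(const uint8_t *pix1, const uint8_t *pix2,
                            ptrdiff_t stride, int h)
    {
        int sum = 0;
        for (int y = 0; y < h; y++) {
            for (int x = 0; x < 16; x++)
                sum += abs(pix1[x] - pix2[x]);  /* per-pixel absolute difference */
            pix1 += stride;                     /* step both blocks to the next row */
            pix2 += stride;
        }
        return sum;
    }

The half-pel x2/y2/xy2 variants average neighbouring pixels with rounding before taking
the difference, and the "_approx_" xy2 and vsad functions are deliberate approximations,
which is why ff_me_cmp_init_x86() only installs them when AV_CODEC_FLAG_BITEXACT is unset.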