82 #define hadamard_func(cpu) \ 83 int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \ 84 uint8_t *src2, ptrdiff_t stride, int h); \ 85 int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \ 86 uint8_t *src2, ptrdiff_t stride, int h); 109 return score1 +
FFABS(score2) * 8;
115 int score1 =
ff_sse8_mmx(c, pix1, pix2, stride, h);
122 return score1 +
FFABS(score2) * 8;
130 ptrdiff_t stride,
int h)
137 #define SUM(in0, in1, out0, out1) \ 138 "movq (%0), %%mm2\n" \ 139 "movq 8(%0), %%mm3\n" \ 141 "movq %%mm2, " #out0 "\n" \ 142 "movq %%mm3, " #out1 "\n" \ 143 "psubusb " #in0 ", %%mm2\n" \ 144 "psubusb " #in1 ", %%mm3\n" \ 145 "psubusb " #out0 ", " #in0 "\n" \ 146 "psubusb " #out1 ", " #in1 "\n" \ 147 "por %%mm2, " #in0 "\n" \ 148 "por %%mm3, " #in1 "\n" \ 149 "movq " #in0 ", %%mm2\n" \ 150 "movq " #in1 ", %%mm3\n" \ 151 "punpcklbw %%mm7, " #in0 "\n" \ 152 "punpcklbw %%mm7, " #in1 "\n" \ 153 "punpckhbw %%mm7, %%mm2\n" \ 154 "punpckhbw %%mm7, %%mm3\n" \ 155 "paddw " #in1 ", " #in0 "\n" \ 156 "paddw %%mm3, %%mm2\n" \ 157 "paddw %%mm2, " #in0 "\n" \ 158 "paddw " #in0 ", %%mm6\n" 163 "pxor %%mm6, %%mm6\n" 164 "pxor %%mm7, %%mm7\n" 166 "movq 8(%0), %%mm1\n" 171 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
173 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
178 "movq %%mm6, %%mm0\n" 180 "paddw %%mm6, %%mm0\n" 181 "movq %%mm0, %%mm6\n" 183 "paddw %%mm6, %%mm0\n" 185 :
"+r" (pix),
"=r" (tmp)
194 ptrdiff_t stride,
int h)
202 #define SUM(in0, in1, out0, out1) \ 203 "movq (%0), %%mm2\n" \ 204 "movq (%1), " #out0 "\n" \ 205 "movq 8(%0), %%mm3\n" \ 206 "movq 8(%1), " #out1 "\n" \ 209 "psubb " #out0 ", %%mm2\n" \ 210 "psubb " #out1 ", %%mm3\n" \ 211 "pxor %%mm7, %%mm2\n" \ 212 "pxor %%mm7, %%mm3\n" \ 213 "movq %%mm2, " #out0 "\n" \ 214 "movq %%mm3, " #out1 "\n" \ 215 "psubusb " #in0 ", %%mm2\n" \ 216 "psubusb " #in1 ", %%mm3\n" \ 217 "psubusb " #out0 ", " #in0 "\n" \ 218 "psubusb " #out1 ", " #in1 "\n" \ 219 "por %%mm2, " #in0 "\n" \ 220 "por %%mm3, " #in1 "\n" \ 221 "movq " #in0 ", %%mm2\n" \ 222 "movq " #in1 ", %%mm3\n" \ 223 "punpcklbw %%mm7, " #in0 "\n" \ 224 "punpcklbw %%mm7, " #in1 "\n" \ 225 "punpckhbw %%mm7, %%mm2\n" \ 226 "punpckhbw %%mm7, %%mm3\n" \ 227 "paddw " #in1 ", " #in0 "\n" \ 228 "paddw %%mm3, %%mm2\n" \ 229 "paddw %%mm2, " #in0 "\n" \ 230 "paddw " #in0 ", %%mm6\n" 235 "pxor %%mm6, %%mm6\n" 236 "pcmpeqw %%mm7, %%mm7\n" 238 "packsswb %%mm7, %%mm7\n" 241 "movq 8(%0), %%mm1\n" 242 "movq 8(%1), %%mm3\n" 245 "psubb %%mm2, %%mm0\n" 246 "psubb %%mm3, %%mm1\n" 247 "pxor %%mm7, %%mm0\n" 248 "pxor %%mm7, %%mm1\n" 252 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
254 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
259 "movq %%mm6, %%mm0\n" 261 "paddw %%mm6, %%mm0\n" 262 "movq %%mm0, %%mm6\n" 264 "paddw %%mm6, %%mm0\n" 266 :
"+r" (pix1),
"+r" (pix2),
"=r" (
tmp)
267 :
"r" (stride),
"m" (
h)
275 0x0000000000000000ULL,
276 0x0001000100010001ULL,
277 0x0002000200020002ULL,
281 ptrdiff_t stride,
int h)
287 "movq (%1, %%"FF_REG_a
"), %%mm0 \n\t" 288 "movq (%2, %%"FF_REG_a
"), %%mm2 \n\t" 289 "movq (%2, %%"FF_REG_a
"), %%mm4 \n\t" 290 "add %3, %%"FF_REG_a
" \n\t" 291 "psubusb %%mm0, %%mm2 \n\t" 292 "psubusb %%mm4, %%mm0 \n\t" 293 "movq (%1, %%"FF_REG_a
"), %%mm1 \n\t" 294 "movq (%2, %%"FF_REG_a
"), %%mm3 \n\t" 295 "movq (%2, %%"FF_REG_a
"), %%mm5 \n\t" 296 "psubusb %%mm1, %%mm3 \n\t" 297 "psubusb %%mm5, %%mm1 \n\t" 298 "por %%mm2, %%mm0 \n\t" 299 "por %%mm1, %%mm3 \n\t" 300 "movq %%mm0, %%mm1 \n\t" 301 "movq %%mm3, %%mm2 \n\t" 302 "punpcklbw %%mm7, %%mm0 \n\t" 303 "punpckhbw %%mm7, %%mm1 \n\t" 304 "punpcklbw %%mm7, %%mm3 \n\t" 305 "punpckhbw %%mm7, %%mm2 \n\t" 306 "paddw %%mm1, %%mm0 \n\t" 307 "paddw %%mm3, %%mm2 \n\t" 308 "paddw %%mm2, %%mm0 \n\t" 309 "paddw %%mm0, %%mm6 \n\t" 310 "add %3, %%"FF_REG_a
" \n\t" 313 :
"r" (blk1 - len),
"r" (blk2 -
len),
"r" (stride));
317 ptrdiff_t stride,
int h)
323 "movq (%1, %%"FF_REG_a
"), %%mm0 \n\t" 324 "movq (%2, %%"FF_REG_a
"), %%mm1 \n\t" 325 "movq (%1, %%"FF_REG_a
"), %%mm2 \n\t" 326 "movq (%2, %%"FF_REG_a
"), %%mm3 \n\t" 327 "punpcklbw %%mm7, %%mm0 \n\t" 328 "punpcklbw %%mm7, %%mm1 \n\t" 329 "punpckhbw %%mm7, %%mm2 \n\t" 330 "punpckhbw %%mm7, %%mm3 \n\t" 331 "paddw %%mm0, %%mm1 \n\t" 332 "paddw %%mm2, %%mm3 \n\t" 333 "movq (%3, %%"FF_REG_a
"), %%mm4 \n\t" 334 "movq (%3, %%"FF_REG_a
"), %%mm2 \n\t" 335 "paddw %%mm5, %%mm1 \n\t" 336 "paddw %%mm5, %%mm3 \n\t" 337 "psrlw $1, %%mm1 \n\t" 338 "psrlw $1, %%mm3 \n\t" 339 "packuswb %%mm3, %%mm1 \n\t" 340 "psubusb %%mm1, %%mm4 \n\t" 341 "psubusb %%mm2, %%mm1 \n\t" 342 "por %%mm4, %%mm1 \n\t" 343 "movq %%mm1, %%mm0 \n\t" 344 "punpcklbw %%mm7, %%mm0 \n\t" 345 "punpckhbw %%mm7, %%mm1 \n\t" 346 "paddw %%mm1, %%mm0 \n\t" 347 "paddw %%mm0, %%mm6 \n\t" 348 "add %4, %%"FF_REG_a
" \n\t" 351 :
"r" (blk1a - len),
"r" (blk1b -
len),
"r" (blk2 - len),
356 ptrdiff_t stride,
int h)
360 "movq (%1, %%"FF_REG_a
"), %%mm0\n\t" 361 "movq 1(%1, %%"FF_REG_a
"), %%mm2\n\t" 362 "movq %%mm0, %%mm1 \n\t" 363 "movq %%mm2, %%mm3 \n\t" 364 "punpcklbw %%mm7, %%mm0 \n\t" 365 "punpckhbw %%mm7, %%mm1 \n\t" 366 "punpcklbw %%mm7, %%mm2 \n\t" 367 "punpckhbw %%mm7, %%mm3 \n\t" 368 "paddw %%mm2, %%mm0 \n\t" 369 "paddw %%mm3, %%mm1 \n\t" 372 "movq (%2, %%"FF_REG_a
"), %%mm2\n\t" 373 "movq 1(%2, %%"FF_REG_a
"), %%mm4\n\t" 374 "movq %%mm2, %%mm3 \n\t" 375 "movq %%mm4, %%mm5 \n\t" 376 "punpcklbw %%mm7, %%mm2 \n\t" 377 "punpckhbw %%mm7, %%mm3 \n\t" 378 "punpcklbw %%mm7, %%mm4 \n\t" 379 "punpckhbw %%mm7, %%mm5 \n\t" 380 "paddw %%mm4, %%mm2 \n\t" 381 "paddw %%mm5, %%mm3 \n\t" 382 "movq %5, %%mm5 \n\t" 383 "paddw %%mm2, %%mm0 \n\t" 384 "paddw %%mm3, %%mm1 \n\t" 385 "paddw %%mm5, %%mm0 \n\t" 386 "paddw %%mm5, %%mm1 \n\t" 387 "movq (%3, %%"FF_REG_a
"), %%mm4 \n\t" 388 "movq (%3, %%"FF_REG_a
"), %%mm5 \n\t" 389 "psrlw $2, %%mm0 \n\t" 390 "psrlw $2, %%mm1 \n\t" 391 "packuswb %%mm1, %%mm0 \n\t" 392 "psubusb %%mm0, %%mm4 \n\t" 393 "psubusb %%mm5, %%mm0 \n\t" 394 "por %%mm4, %%mm0 \n\t" 395 "movq %%mm0, %%mm4 \n\t" 396 "punpcklbw %%mm7, %%mm0 \n\t" 397 "punpckhbw %%mm7, %%mm4 \n\t" 398 "paddw %%mm0, %%mm6 \n\t" 399 "paddw %%mm4, %%mm6 \n\t" 400 "movq %%mm2, %%mm0 \n\t" 401 "movq %%mm3, %%mm1 \n\t" 402 "add %4, %%"FF_REG_a
" \n\t" 405 :
"r" (blk1 - len),
"r" (blk1 - len +
stride),
"r" (blk2 - len),
406 "r" (
stride),
"m" (round_tab[2]));
409 static inline int sum_mmx(
void)
413 "movq %%mm6, %%mm0 \n\t" 414 "psrlq $32, %%mm6 \n\t" 415 "paddw %%mm0, %%mm6 \n\t" 416 "movq %%mm6, %%mm0 \n\t" 417 "psrlq $16, %%mm6 \n\t" 418 "paddw %%mm0, %%mm6 \n\t" 419 "movd %%mm6, %0 \n\t" 425 ptrdiff_t stride,
int h)
427 sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
431 ptrdiff_t stride,
int h)
433 sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
436 #define PIX_SAD(suf) \ 437 static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 438 uint8_t *blk1, ptrdiff_t stride, int h) \ 440 av_assert2(h == 8); \ 442 "pxor %%mm7, %%mm7 \n\t" \ 443 "pxor %%mm6, %%mm6 \n\t" \ 446 sad8_1_ ## suf(blk1, blk2, stride, 8); \ 448 return sum_ ## suf(); \ 451 static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 452 uint8_t *blk1, ptrdiff_t stride, int h) \ 454 av_assert2(h == 8); \ 456 "pxor %%mm7, %%mm7 \n\t" \ 457 "pxor %%mm6, %%mm6 \n\t" \ 458 "movq %0, %%mm5 \n\t" \ 459 :: "m" (round_tab[1])); \ 461 sad8_x2a_ ## suf(blk1, blk2, stride, 8); \ 463 return sum_ ## suf(); \ 466 static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 467 uint8_t *blk1, ptrdiff_t stride, int h) \ 469 av_assert2(h == 8); \ 471 "pxor %%mm7, %%mm7 \n\t" \ 472 "pxor %%mm6, %%mm6 \n\t" \ 473 "movq %0, %%mm5 \n\t" \ 474 :: "m" (round_tab[1])); \ 476 sad8_y2a_ ## suf(blk1, blk2, stride, 8); \ 478 return sum_ ## suf(); \ 481 static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 482 uint8_t *blk1, ptrdiff_t stride, int h) \ 484 av_assert2(h == 8); \ 486 "pxor %%mm7, %%mm7 \n\t" \ 487 "pxor %%mm6, %%mm6 \n\t" \ 490 sad8_4_ ## suf(blk1, blk2, stride, 8); \ 492 return sum_ ## suf(); \ 495 static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 496 uint8_t *blk1, ptrdiff_t stride, int h) \ 499 "pxor %%mm7, %%mm7 \n\t" \ 500 "pxor %%mm6, %%mm6 \n\t" \ 503 sad8_1_ ## suf(blk1, blk2, stride, h); \ 504 sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ 506 return sum_ ## suf(); \ 509 static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 510 uint8_t *blk1, ptrdiff_t stride, int h) \ 513 "pxor %%mm7, %%mm7 \n\t" \ 514 "pxor %%mm6, %%mm6 \n\t" \ 515 "movq %0, %%mm5 \n\t" \ 516 :: "m" (round_tab[1])); \ 518 sad8_x2a_ ## suf(blk1, blk2, stride, h); \ 519 sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ 521 return sum_ ## suf(); \ 524 static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 525 uint8_t *blk1, 
ptrdiff_t stride, int h) \ 528 "pxor %%mm7, %%mm7 \n\t" \ 529 "pxor %%mm6, %%mm6 \n\t" \ 530 "movq %0, %%mm5 \n\t" \ 531 :: "m" (round_tab[1])); \ 533 sad8_y2a_ ## suf(blk1, blk2, stride, h); \ 534 sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ 536 return sum_ ## suf(); \ 539 static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2, \ 540 uint8_t *blk1, ptrdiff_t stride, int h) \ 543 "pxor %%mm7, %%mm7 \n\t" \ 544 "pxor %%mm6, %%mm6 \n\t" \ 547 sad8_4_ ## suf(blk1, blk2, stride, h); \ 548 sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h); \ 550 return sum_ ## suf(); \ 564 c->
pix_abs[0][1] = sad16_x2_mmx;
565 c->
pix_abs[0][2] = sad16_y2_mmx;
566 c->
pix_abs[0][3] = sad16_xy2_mmx;
568 c->
pix_abs[1][1] = sad8_x2_mmx;
569 c->
pix_abs[1][2] = sad8_y2_mmx;
570 c->
pix_abs[1][3] = sad8_xy2_mmx;
572 c->
sad[0] = sad16_mmx;
573 c->
sad[1] = sad8_mmx;
575 c->
vsad[4] = vsad_intra16_mmx;
578 c->
vsad[0] = vsad16_mmx;
591 c->
nsse[0] = nsse16_mmx;
592 c->
nsse[1] = nsse8_mmx;
627 #if HAVE_ALIGNED_STACK 647 #if HAVE_ALIGNED_STACK #define EXTERNAL_MMX(flags)
int ff_sum_abs_dctelem_mmx(int16_t *block)
int(* sum_abs_dctelem)(int16_t *block)
int ff_hf_noise8_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
int ff_vsad16_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
static atomic_int cpu_flags
int ff_hf_noise16_mmx(uint8_t *pix1, ptrdiff_t stride, int h)
int ff_vsad_intra8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
Macro definitions for various function/variable attributes.
me_cmp_func hadamard8_diff[6]
int ff_vsad16_approx_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
The exact code depends on how similar the blocks are and how related they are to the block
#define av_assert2(cond)
assert() equivalent, that does lie in speed critical code.
#define hadamard_func(cpu)
Undefined Behavior: In the C language, some operations are undefined, like signed integer overflow, dereferencing freed pointers, or accessing memory outside the allocated space. Undefined behavior must not occur in a C program; it is not safe even if the output of the undefined operations is unused. The unsafety may seem like nit-picking, but optimizing compilers have in fact optimized code on the assumption that no undefined behavior occurs. Optimizing code based on wrong assumptions can, and has in some cases, led to effects beyond the output of computations. The signed integer overflow problem in speed-critical code: code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
int ff_sum_abs_dctelem_sse2(int16_t *block)
#define AV_CPU_FLAG_SSE2SLOW
SSE2 supported, but usually not faster.
#define EXTERNAL_SSE2(flags)
int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sum_abs_dctelem_ssse3(int16_t *block)
int ff_vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
#define INLINE_MMX(flags)
int ff_sad8_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int flags
AV_CODEC_FLAG_*.
me_cmp_func pix_abs[2][4]
int ff_sad16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_y2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
#define DECLARE_ASM_CONST(n, t, v)
Declare a static constant aligned variable appropriate for use in inline assembly code...
int ff_sad8_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
#define AV_CODEC_FLAG_BITEXACT
Use only bitexact stuff (except (I)DCT).
int ff_sum_abs_dctelem_mmxext(int16_t *block)
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
int ff_sad16_approx_xy2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
main external API structure.
#define EXTERNAL_SSSE3(flags)
int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
#define EXTERNAL_MMXEXT(flags)
struct AVCodecContext * avctx
int ff_sad8_x2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
GLint GLenum GLboolean GLsizei stride
int ff_sad16_y2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_vsad_intra16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_vsad8_approx_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
__asm__(".macro parse_r var r\n\t""\\var = -1\n\t"_IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31)".iflt \\var\n\t"".error \"Unable to parse register name \\r\"\n\t"".endif\n\t"".endm")
int ff_sad16_approx_xy2_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad8_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
int ff_sad16_x2_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, ptrdiff_t stride, int h)
void ff_me_cmp_init_x86(MECmpContext *c, AVCodecContext *avctx)