47 "movq (%3), %%mm0 \n\t"
48 "movq 8(%3), %%mm1 \n\t"
49 "movq 16(%3), %%mm2 \n\t"
50 "movq 24(%3), %%mm3 \n\t"
51 "movq 32(%3), %%mm4 \n\t"
52 "movq 40(%3), %%mm5 \n\t"
53 "movq 48(%3), %%mm6 \n\t"
54 "movq 56(%3), %%mm7 \n\t"
55 "packuswb %%mm1, %%mm0 \n\t"
56 "packuswb %%mm3, %%mm2 \n\t"
57 "packuswb %%mm5, %%mm4 \n\t"
58 "packuswb %%mm7, %%mm6 \n\t"
59 "movq %%mm0, (%0) \n\t"
60 "movq %%mm2, (%0, %1) \n\t"
61 "movq %%mm4, (%0, %1, 2) \n\t"
62 "movq %%mm6, (%0, %2) \n\t"
73 "movq (%3), %%mm0 \n\t"
74 "movq 8(%3), %%mm1 \n\t"
75 "movq 16(%3), %%mm2 \n\t"
76 "movq 24(%3), %%mm3 \n\t"
77 "movq 32(%3), %%mm4 \n\t"
78 "movq 40(%3), %%mm5 \n\t"
79 "movq 48(%3), %%mm6 \n\t"
80 "movq 56(%3), %%mm7 \n\t"
81 "packuswb %%mm1, %%mm0 \n\t"
82 "packuswb %%mm3, %%mm2 \n\t"
83 "packuswb %%mm5, %%mm4 \n\t"
84 "packuswb %%mm7, %%mm6 \n\t"
85 "movq %%mm0, (%0) \n\t"
86 "movq %%mm2, (%0, %1) \n\t"
87 "movq %%mm4, (%0, %1, 2) \n\t"
88 "movq %%mm6, (%0, %2) \n\t"
89 ::
"r"(pix),
"r"((
x86_reg)line_size),
"r"((
x86_reg)line_size * 3),
"r"(p)
/*
 * Emits the asm for four 8-byte output rows: reads 32 signed 16-bit words
 * starting at byte offset `off` from operand %2, narrows them to bytes with
 * signed saturation, adds the byte vector held in %%mm0 and stores the rows.
 *
 * Operand roles inside the enclosing asm statement:
 *   %0  address of the first of the four destination rows
 *   %1  byte offset of the fourth row; the expansion site computes it as
 *       3 * %3 with "lea (%3, %3, 2), %1" before using the macro
 *   %2  base address of the 16-bit input words
 *   %3  distance in bytes between consecutive destination rows
 *
 * %%mm0 is only read here and must be loaded before the macro is expanded.
 * NOTE(review): presumably a per-byte bias that maps the signed result to
 * unsigned pixel values -- confirm where %%mm0 is initialised.
 * Clobbers %%mm1-%%mm4. Each expansion consumes 64 bytes of input, which is
 * why the second expansion in the caller uses off = 64.
 */
93 #define put_signed_pixels_clamped_mmx_half(off) \
94 "movq "#off"(%2), %%mm1 \n\t" /* row 0: words 0..3 */ \
95 "movq 16 + "#off"(%2), %%mm2 \n\t" /* row 1: words 0..3 */ \
96 "movq 32 + "#off"(%2), %%mm3 \n\t" /* row 2: words 0..3 */ \
97 "movq 48 + "#off"(%2), %%mm4 \n\t" /* row 3: words 0..3 */ \
98 "packsswb 8 + "#off"(%2), %%mm1 \n\t" /* row 0: append words 4..7, saturate all 8 to int8 */ \
99 "packsswb 24 + "#off"(%2), %%mm2 \n\t" /* row 1: same */ \
100 "packsswb 40 + "#off"(%2), %%mm3 \n\t" /* row 2: same */ \
101 "packsswb 56 + "#off"(%2), %%mm4 \n\t" /* row 3: same */ \
102 "paddb %%mm0, %%mm1 \n\t" /* add the mm0 bytes (wrapping add, no saturation) */ \
103 "paddb %%mm0, %%mm2 \n\t" \
104 "paddb %%mm0, %%mm3 \n\t" \
105 "paddb %%mm0, %%mm4 \n\t" \
106 "movq %%mm1, (%0) \n\t" /* store row 0 */ \
107 "movq %%mm2, (%0, %3) \n\t" /* store row 1 at +stride */ \
108 "movq %%mm3, (%0, %3, 2) \n\t" /* store row 2 at +2*stride */ \
109 "movq %%mm4, (%0, %1) \n\t" /* store row 3 at +3*stride (precomputed in %1) */
119 "lea (%3, %3, 2), %1 \n\t"
120 put_signed_pixels_clamped_mmx_half(0)
121 "lea (%0, %3, 4), %0 \n\t"
122 put_signed_pixels_clamped_mmx_half(64)
123 :
"+&r"(pixels),
"=&r"(line_skip3)
124 :
"r"(block),
"r"(line_skip)
142 "movq (%2), %%mm0 \n\t"
143 "movq 8(%2), %%mm1 \n\t"
144 "movq 16(%2), %%mm2 \n\t"
145 "movq 24(%2), %%mm3 \n\t"
146 "movq %0, %%mm4 \n\t"
147 "movq %1, %%mm6 \n\t"
148 "movq %%mm4, %%mm5 \n\t"
149 "punpcklbw %%mm7, %%mm4 \n\t"
150 "punpckhbw %%mm7, %%mm5 \n\t"
151 "paddsw %%mm4, %%mm0 \n\t"
152 "paddsw %%mm5, %%mm1 \n\t"
153 "movq %%mm6, %%mm5 \n\t"
154 "punpcklbw %%mm7, %%mm6 \n\t"
155 "punpckhbw %%mm7, %%mm5 \n\t"
156 "paddsw %%mm6, %%mm2 \n\t"
157 "paddsw %%mm5, %%mm3 \n\t"
158 "packuswb %%mm1, %%mm0 \n\t"
159 "packuswb %%mm3, %%mm2 \n\t"
160 "movq %%mm0, %0 \n\t"
161 "movq %%mm2, %1 \n\t"
162 :
"+m"(*pix),
"+m"(*(pix + line_size))
165 pix += line_size * 2;
170 #define CLEAR_BLOCKS(name, n) \
171 void name(int16_t *blocks) \
174 "pxor %%mm7, %%mm7 \n\t" \
175 "mov %1, %%"REG_a" \n\t" \
177 "movq %%mm7, (%0, %%"REG_a") \n\t" \
178 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
179 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
180 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
181 "add $32, %%"REG_a" \n\t" \
183 :: "r"(((uint8_t *)blocks) + 128 * n), \
194 "xorps %%xmm0, %%xmm0 \n"
195 "movaps %%xmm0, (%0) \n"
196 "movaps %%xmm0, 16(%0) \n"
197 "movaps %%xmm0, 32(%0) \n"
198 "movaps %%xmm0, 48(%0) \n"
199 "movaps %%xmm0, 64(%0) \n"
200 "movaps %%xmm0, 80(%0) \n"
201 "movaps %%xmm0, 96(%0) \n"
202 "movaps %%xmm0, 112(%0) \n"
211 "xorps %%xmm0, %%xmm0 \n"
212 "mov %1, %%"REG_a
" \n"
214 "movaps %%xmm0, (%0, %%"REG_a
") \n"
215 "movaps %%xmm0, 16(%0, %%"REG_a
") \n"
216 "movaps %%xmm0, 32(%0, %%"REG_a
") \n"
217 "movaps %%xmm0, 48(%0, %%"REG_a
") \n"
218 "movaps %%xmm0, 64(%0, %%"REG_a
") \n"
219 "movaps %%xmm0, 80(%0, %%"REG_a
") \n"
220 "movaps %%xmm0, 96(%0, %%"REG_a
") \n"
221 "movaps %%xmm0, 112(%0, %%"REG_a
") \n"
222 "add $128, %%"REG_a
" \n"
224 ::
"r"(((
uint8_t *)blocks) + 128 * 6),
236 "movq (%1, %0), %%mm0 \n\t"
237 "movq (%2, %0), %%mm1 \n\t"
238 "paddb %%mm0, %%mm1 \n\t"
239 "movq %%mm1, (%2, %0) \n\t"
240 "movq 8(%1, %0), %%mm0 \n\t"
241 "movq 8(%2, %0), %%mm1 \n\t"
242 "paddb %%mm0, %%mm1 \n\t"
243 "movq %%mm1, 8(%2, %0) \n\t"
249 :
"r"(src),
"r"(dst),
"r"((
x86_reg)w - 15)
252 dst[i + 0] += src[i + 0];
258 int w,
int h,
int sides)
263 last_line = buf + (height - 1) * wrap;
269 "movd (%0), %%mm0 \n\t"
270 "punpcklbw %%mm0, %%mm0 \n\t"
271 "punpcklwd %%mm0, %%mm0 \n\t"
272 "punpckldq %%mm0, %%mm0 \n\t"
273 "movq %%mm0, -8(%0) \n\t"
274 "movq -8(%0, %2), %%mm1 \n\t"
275 "punpckhbw %%mm1, %%mm1 \n\t"
276 "punpckhwd %%mm1, %%mm1 \n\t"
277 "punpckhdq %%mm1, %%mm1 \n\t"
278 "movq %%mm1, (%0, %2) \n\t"
288 "movd (%0), %%mm0 \n\t"
289 "punpcklbw %%mm0, %%mm0 \n\t"
290 "punpcklwd %%mm0, %%mm0 \n\t"
291 "punpckldq %%mm0, %%mm0 \n\t"
292 "movq %%mm0, -8(%0) \n\t"
293 "movq %%mm0, -16(%0) \n\t"
294 "movq -8(%0, %2), %%mm1 \n\t"
295 "punpckhbw %%mm1, %%mm1 \n\t"
296 "punpckhwd %%mm1, %%mm1 \n\t"
297 "punpckhdq %%mm1, %%mm1 \n\t"
298 "movq %%mm1, (%0, %2) \n\t"
299 "movq %%mm1, 8(%0, %2) \n\t"
310 "movd (%0), %%mm0 \n\t"
311 "punpcklbw %%mm0, %%mm0 \n\t"
312 "punpcklwd %%mm0, %%mm0 \n\t"
313 "movd %%mm0, -4(%0) \n\t"
314 "movd -4(%0, %2), %%mm1 \n\t"
315 "punpcklbw %%mm1, %%mm1 \n\t"
316 "punpckhwd %%mm1, %%mm1 \n\t"
317 "punpckhdq %%mm1, %%mm1 \n\t"
318 "movd %%mm1, (%0, %2) \n\t"
329 for (i = 0; i < h; i += 4) {
330 ptr = buf - (i + 1) * wrap - w;
333 "movq (%1, %0), %%mm0 \n\t"
334 "movq %%mm0, (%0) \n\t"
335 "movq %%mm0, (%0, %2) \n\t"
336 "movq %%mm0, (%0, %2, 2) \n\t"
337 "movq %%mm0, (%0, %3) \n\t"
343 "r"((
x86_reg) -wrap * 3),
"r"(ptr + width + 2 * w)
349 for (i = 0; i < h; i += 4) {
350 ptr = last_line + (i + 1) * wrap - w;
353 "movq (%1, %0), %%mm0 \n\t"
354 "movq %%mm0, (%0) \n\t"
355 "movq %%mm0, (%0, %2) \n\t"
356 "movq %%mm0, (%0, %2, 2) \n\t"
357 "movq %%mm0, (%0, %3) \n\t"
364 "r"(ptr + width + 2 * w)
/*
 * Signature of the edge-emulation routine that gmc() receives through its
 * emu_edge_fn parameter. gmc() invokes it as
 *     emu_edge_fn(edge_buf, stride, src, stride, w + 1, h + 1,
 *                 ix, iy, width, height)
 * so dst is gmc()'s local scratch buffer, both stride arguments are the same
 * picture stride, block_w/block_h are w + 1 and h + 1, (src_x, src_y) is the
 * integer source position ix/iy, and w/h are the width/height given to gmc().
 * NOTE(review): the routine is expected to produce in dst a readable copy of
 * the block even where it lies outside the w x h area; that behaviour is
 * implemented elsewhere -- confirm against the function actually passed.
 */
370 typedef void emulated_edge_mc_func(
uint8_t *dst, ptrdiff_t dst_stride,
371 const uint8_t *src, ptrdiff_t src_linesize,
372 int block_w,
int block_h,
373 int src_x,
int src_y,
int w,
int h);
376 int stride,
int h,
int ox,
int oy,
377 int dxx,
int dxy,
int dyx,
int dyy,
378 int shift,
int r,
int width,
int height,
379 emulated_edge_mc_func *emu_edge_fn)
382 const int ix = ox >> (16 +
shift);
383 const int iy = oy >> (16 +
shift);
384 const int oxs = ox >> 4;
385 const int oys = oy >> 4;
386 const int dxxs = dxx >> 4;
387 const int dxys = dxy >> 4;
388 const int dyxs = dyx >> 4;
389 const int dyys = dyy >> 4;
390 const uint16_t r4[4] = {
r,
r,
r, r };
391 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
392 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
/* Upper bound on stride for the edge-emulation path: edge_buf is sized for
 * (MAX_H + 1) rows of MAX_STRIDE bytes, and gmc() hands the block to
 * ff_gmc_c() instead when emulation is needed and stride exceeds this. */
394 #define MAX_STRIDE 4096U
396 uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
399 const int dxw = (dxx - (1 << (16 +
shift))) * (w - 1);
400 const int dyh = (dyy - (1 << (16 +
shift))) * (h - 1);
401 const int dxh = dxy * (h - 1);
402 const int dyw = dyx * (w - 1);
403 int need_emu = (unsigned)ix >= width - w ||
404 (
unsigned)iy >= height - h;
407 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
408 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 +
shift)
410 || (dxx | dxy | dyx | dyy) & 15
411 || (need_emu && (h > MAX_H ||
stride > MAX_STRIDE))) {
413 ff_gmc_c(dst, src,
stride, h, ox, oy, dxx, dxy, dyx, dyy,
420 emu_edge_fn(edge_buf, stride, src, stride, w + 1, h + 1, ix, iy, width, height);
425 "movd %0, %%mm6 \n\t"
426 "pxor %%mm7, %%mm7 \n\t"
427 "punpcklwd %%mm6, %%mm6 \n\t"
428 "punpcklwd %%mm6, %%mm6 \n\t"
432 for (x = 0; x < w; x += 4) {
433 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
434 oxs - dxys + dxxs * (x + 1),
435 oxs - dxys + dxxs * (x + 2),
436 oxs - dxys + dxxs * (x + 3) };
437 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
438 oys - dyys + dyxs * (x + 1),
439 oys - dyys + dyxs * (x + 2),
440 oys - dyys + dyxs * (x + 3) };
442 for (
y = 0;
y < h;
y++) {
444 "movq %0, %%mm4 \n\t"
445 "movq %1, %%mm5 \n\t"
446 "paddw %2, %%mm4 \n\t"
447 "paddw %3, %%mm5 \n\t"
448 "movq %%mm4, %0 \n\t"
449 "movq %%mm5, %1 \n\t"
450 "psrlw $12, %%mm4 \n\t"
451 "psrlw $12, %%mm5 \n\t"
452 :
"+m"(*dx4),
"+m"(*dy4)
453 :
"m"(*dxy4),
"m"(*dyy4)
457 "movq %%mm6, %%mm2 \n\t"
458 "movq %%mm6, %%mm1 \n\t"
459 "psubw %%mm4, %%mm2 \n\t"
460 "psubw %%mm5, %%mm1 \n\t"
461 "movq %%mm2, %%mm0 \n\t"
462 "movq %%mm4, %%mm3 \n\t"
463 "pmullw %%mm1, %%mm0 \n\t"
464 "pmullw %%mm5, %%mm3 \n\t"
465 "pmullw %%mm5, %%mm2 \n\t"
466 "pmullw %%mm4, %%mm1 \n\t"
468 "movd %4, %%mm5 \n\t"
469 "movd %3, %%mm4 \n\t"
470 "punpcklbw %%mm7, %%mm5 \n\t"
471 "punpcklbw %%mm7, %%mm4 \n\t"
472 "pmullw %%mm5, %%mm3 \n\t"
473 "pmullw %%mm4, %%mm2 \n\t"
475 "movd %2, %%mm5 \n\t"
476 "movd %1, %%mm4 \n\t"
477 "punpcklbw %%mm7, %%mm5 \n\t"
478 "punpcklbw %%mm7, %%mm4 \n\t"
479 "pmullw %%mm5, %%mm1 \n\t"
480 "pmullw %%mm4, %%mm0 \n\t"
481 "paddw %5, %%mm1 \n\t"
482 "paddw %%mm3, %%mm2 \n\t"
483 "paddw %%mm1, %%mm0 \n\t"
484 "paddw %%mm2, %%mm0 \n\t"
486 "psrlw %6, %%mm0 \n\t"
487 "packuswb %%mm0, %%mm0 \n\t"
488 "movd %%mm0, %0 \n\t"
491 :
"m"(src[0]),
"m"(src[1]),
492 "m"(src[stride]),
"m"(src[stride + 1]),
505 int stride,
int h,
int ox,
int oy,
506 int dxx,
int dxy,
int dyx,
int dyy,
507 int shift,
int r,
int width,
int height)
509 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
510 width, height, &ff_emulated_edge_mc_8);
514 int stride,
int h,
int ox,
int oy,
515 int dxx,
int dxy,
int dyx,
int dyy,
516 int shift,
int r,
int width,
int height)
518 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
519 width, height, &ff_emulated_edge_mc_8);
523 int stride,
int h,
int ox,
int oy,
524 int dxx,
int dxy,
int dyx,
int dyy,
525 int shift,
int r,
int width,
int height)
527 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
528 width, height, &ff_emulated_edge_mc_8);
533 #if CONFIG_DIRAC_DECODER
534 #define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
535 void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
538 ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
540 OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
542 void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
545 ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
547 OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
549 void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
552 ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
554 OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
555 OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
560 PIXELS16(
static, ff_avg, , , _mmxext)
561 DIRAC_PIXOP(put, ff_put, mmx)
562 DIRAC_PIXOP(avg, ff_avg, mmx)
566 DIRAC_PIXOP(avg, ff_avg, mmxext)
571 ff_put_dirac_pixels16_c(dst, src, stride, h);
578 ff_avg_dirac_pixels16_c(dst, src, stride, h);
585 ff_put_dirac_pixels32_c(dst, src, stride, h);
594 ff_avg_dirac_pixels32_c(dst, src, stride, h);
604 float min,
float max,
int len)
608 "movss %3, %%xmm4 \n\t"
609 "movss %4, %%xmm5 \n\t"
610 "shufps $0, %%xmm4, %%xmm4 \n\t"
611 "shufps $0, %%xmm5, %%xmm5 \n\t"
613 "movaps (%2, %0), %%xmm0 \n\t"
614 "movaps 16(%2, %0), %%xmm1 \n\t"
615 "movaps 32(%2, %0), %%xmm2 \n\t"
616 "movaps 48(%2, %0), %%xmm3 \n\t"
617 "maxps %%xmm4, %%xmm0 \n\t"
618 "maxps %%xmm4, %%xmm1 \n\t"
619 "maxps %%xmm4, %%xmm2 \n\t"
620 "maxps %%xmm4, %%xmm3 \n\t"
621 "minps %%xmm5, %%xmm0 \n\t"
622 "minps %%xmm5, %%xmm1 \n\t"
623 "minps %%xmm5, %%xmm2 \n\t"
624 "minps %%xmm5, %%xmm3 \n\t"
625 "movaps %%xmm0, (%1, %0) \n\t"
626 "movaps %%xmm1, 16(%1, %0) \n\t"
627 "movaps %%xmm2, 32(%1, %0) \n\t"
628 "movaps %%xmm3, 48(%1, %0) \n\t"
632 :
"r"(dst),
"r"(
src),
"m"(min),
"m"(max)