#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#define MOVNTQ2 "movntq "
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#define MOVNTQ2 "movq "
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)

#define YSCALEYUV2PACKEDX_UV \
    "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
    "movq %%mm3, %%mm4 \n\t"\
    "movq 8(%%"FF_REG_d"), %%mm0 \n\t" \
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" \
    "add %6, %%"FF_REG_S" \n\t" \
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm5 \n\t" \
    "add $16, %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t"\
    "pmulhw %%mm0, %%mm5 \n\t"\
    "paddw %%mm2, %%mm3 \n\t"\
    "paddw %%mm5, %%mm4 \n\t"\
    "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\

#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    "movq 8(%%"FF_REG_d"), "#coeff" \n\t" \
    "movq (%%"FF_REG_S", %%"FF_REG_a", 2), "#src1" \n\t" \
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), "#src2" \n\t" \
    "add $16, %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\

#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW_reg), "m"(uv_off) \
       NAMED_CONSTRAINTS_ADD(bF8,bFC) \
    : "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S \

#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    "xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\
    "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "pxor %%mm4, %%mm4 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm0 \n\t" \
    "add %6, %%"FF_REG_S" \n\t" \
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" \
    "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm1 \n\t" \
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm1, %%mm0 \n\t"\
    "punpckhwd %%mm1, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"FF_REG_d"),%%mm1 \n\t" \
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm3 \n\t"\
    "paddd %%mm0, %%mm4 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "add %6, %%"FF_REG_S" \n\t" \
    "movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm3 \n\t" \
    "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\
    "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm1, %%mm2 \n\t"\
    "pmaddwd %%mm1, %%mm0 \n\t"\
    "paddd %%mm2, %%mm6 \n\t"\
    "paddd %%mm0, %%mm7 \n\t"\
    "psrad $16, %%mm4 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm4 \n\t"\
    "packssdw %%mm7, %%mm6 \n\t"\
    "paddw %%mm0, %%mm4 \n\t"\
    "paddw %%mm0, %%mm6 \n\t"\
    "movq %%mm4, "U_TEMP"(%0) \n\t"\
    "movq %%mm6, "V_TEMP"(%0) \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"FF_REG_d" \n\t"\
    "mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0 \n\t" \
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2 \n\t" \
    "mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4 \n\t" \
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm4 \n\t" \
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm3 \n\t" \
    "mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\
    "test %%"FF_REG_S", %%"FF_REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)

#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" \
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" \
    "movq %%mm3, %%mm2 \n\t" \
    "movq %%mm4, %%mm5 \n\t" \
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" \
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" \
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" \
    "movq "#r", "#t" \n\t" \
    "punpcklbw "#g", "#b" \n\t" \
    "punpcklbw "#a", "#r" \n\t" \
    "punpckhbw "#g", "#q2" \n\t" \
    "punpckhbw "#a", "#t" \n\t" \
    "movq "#b", "#q0" \n\t" \
    "movq "#q2", "#q3" \n\t" \
    "punpcklwd "#r", "#q0" \n\t" \
    "punpckhwd "#r", "#b" \n\t" \
    "punpcklwd "#t", "#q2" \n\t" \
    "punpckhwd "#t", "#q3" \n\t" \
    MOVNTQ( q0,   (dst, index, 4))\
    MOVNTQ(  b,  8(dst, index, 4))\
    MOVNTQ( q2, 16(dst, index, 4))\
    MOVNTQ( q3, 24(dst, index, 4))\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\

#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)

                      const int16_t **lumSrc,
                      int lumFilterSize,
                      const int16_t *chrFilter, const int16_t **chrUSrc,
                      const int16_t **chrVSrc,
                      int chrFilterSize, const int16_t **alpSrc,
                      uint8_t *dest, int dstW, int dstY)

    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
            "movq %%mm2, "U_TEMP"(%0)  \n\t"
            "movq %%mm4, "V_TEMP"(%0)  \n\t"
            "movq %%mm5, "Y_TEMP"(%0)  \n\t"
            "movq "Y_TEMP"(%0), %%mm5  \n\t"
            "psraw $3, %%mm1           \n\t"
            "psraw $3, %%mm7           \n\t"
            "packuswb %%mm7, %%mm1     \n\t"
            WRITEBGR32(%4, "%5", %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)

            "pcmpeqd %%mm7, %%mm7      \n\t"
            WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                      const int16_t **lumSrc, int lumFilterSize,
                      const int16_t *chrFilter, const int16_t **chrUSrc,
                      const int16_t **chrVSrc,
                      int chrFilterSize, const int16_t **alpSrc,
                      uint8_t *dest, int dstW, int dstY)

    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
            "psraw $3, %%mm1           \n\t"
            "psraw $3, %%mm7           \n\t"
            "packuswb %%mm7, %%mm1     \n\t"
            WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)

            "pcmpeqd %%mm7, %%mm7      \n\t"
            WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                      const int16_t **lumSrc, int lumFilterSize,
                      const int16_t *chrFilter, const int16_t **chrUSrc,
                      const int16_t **chrVSrc,
                      int chrFilterSize, const int16_t **alpSrc,
                      uint8_t *dest, int dstW, int dstY)

    if (CONFIG_SWSCALE_ALPHA && c->needAlpha) {
            "psraw $3, %%mm1           \n\t"
            "psraw $3, %%mm7           \n\t"
            "packuswb %%mm7, %%mm1     \n\t"
            WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)

            "pcmpeqd %%mm7, %%mm7      \n\t"
            WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" \
    "pand "MANGLE(bFC)", %%mm4 \n\t" \
    "pand "MANGLE(bF8)", %%mm5 \n\t" \
    "psrlq $3, %%mm2 \n\t"\
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\

#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)

                      const int16_t **lumSrc, int lumFilterSize,
                      const int16_t *chrFilter, const int16_t **chrUSrc,
                      const int16_t **chrVSrc,
                      int chrFilterSize, const int16_t **alpSrc,
                      uint8_t *dest, int dstW, int dstY)

        "pxor %%mm7, %%mm7 \n\t"

                      const int16_t **lumSrc, int lumFilterSize,
                      const int16_t *chrFilter, const int16_t **chrUSrc,
                      const int16_t **chrVSrc,
                      int chrFilterSize, const int16_t **alpSrc,
                      uint8_t *dest, int dstW, int dstY)
414 "pxor %%mm7, %%mm7 \n\t" 425 #define REAL_WRITERGB15(dst, dstw, index) \ 426 "pand "MANGLE(bF8)", %%mm2 \n\t" \ 427 "pand "MANGLE(bF8)", %%mm4 \n\t" \ 428 "pand "MANGLE(bF8)", %%mm5 \n\t" \ 429 "psrlq $3, %%mm2 \n\t"\ 430 "psrlq $1, %%mm5 \n\t"\ 432 "movq %%mm2, %%mm1 \n\t"\ 433 "movq %%mm4, %%mm3 \n\t"\ 435 "punpcklbw %%mm7, %%mm3 \n\t"\ 436 "punpcklbw %%mm5, %%mm2 \n\t"\ 437 "punpckhbw %%mm7, %%mm4 \n\t"\ 438 "punpckhbw %%mm5, %%mm1 \n\t"\ 440 "psllq $2, %%mm3 \n\t"\ 441 "psllq $2, %%mm4 \n\t"\ 443 "por %%mm3, %%mm2 \n\t"\ 444 "por %%mm4, %%mm1 \n\t"\ 446 MOVNTQ(%%mm2, (dst, index, 2))\ 447 MOVNTQ(%%mm1, 8(dst, index, 2))\ 449 "add $8, "#index" \n\t"\ 450 "cmp "dstw", "#index" \n\t"\ 452 #define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index) 455 const int16_t **lumSrc,
int lumFilterSize,
456 const int16_t *chrFilter,
const int16_t **chrUSrc,
457 const int16_t **chrVSrc,
458 int chrFilterSize,
const int16_t **alpSrc,
459 uint8_t *dest,
int dstW,
int dstY)
467 "pxor %%mm7, %%mm7 \n\t" 479 const int16_t **lumSrc,
int lumFilterSize,
480 const int16_t *chrFilter,
const int16_t **chrUSrc,
481 const int16_t **chrVSrc,
482 int chrFilterSize,
const int16_t **alpSrc,
483 uint8_t *dest,
int dstW,
int dstY)
491 "pxor %%mm7, %%mm7 \n\t" 502 #define WRITEBGR24MMX(dst, dstw, index) \ 504 "movq %%mm2, %%mm1 \n\t" \ 505 "movq %%mm5, %%mm6 \n\t" \ 506 "punpcklbw %%mm4, %%mm2 \n\t" \ 507 "punpcklbw %%mm7, %%mm5 \n\t" \ 508 "punpckhbw %%mm4, %%mm1 \n\t" \ 509 "punpckhbw %%mm7, %%mm6 \n\t" \ 510 "movq %%mm2, %%mm0 \n\t" \ 511 "movq %%mm1, %%mm3 \n\t" \ 512 "punpcklwd %%mm5, %%mm0 \n\t" \ 513 "punpckhwd %%mm5, %%mm2 \n\t" \ 514 "punpcklwd %%mm6, %%mm1 \n\t" \ 515 "punpckhwd %%mm6, %%mm3 \n\t" \ 517 "movq %%mm0, %%mm4 \n\t" \ 518 "movq %%mm2, %%mm6 \n\t" \ 519 "movq %%mm1, %%mm5 \n\t" \ 520 "movq %%mm3, %%mm7 \n\t" \ 522 "psllq $40, %%mm0 \n\t" \ 523 "psllq $40, %%mm2 \n\t" \ 524 "psllq $40, %%mm1 \n\t" \ 525 "psllq $40, %%mm3 \n\t" \ 527 "punpckhdq %%mm4, %%mm0 \n\t" \ 528 "punpckhdq %%mm6, %%mm2 \n\t" \ 529 "punpckhdq %%mm5, %%mm1 \n\t" \ 530 "punpckhdq %%mm7, %%mm3 \n\t" \ 532 "psrlq $8, %%mm0 \n\t" \ 533 "movq %%mm2, %%mm6 \n\t" \ 534 "psllq $40, %%mm2 \n\t" \ 535 "por %%mm2, %%mm0 \n\t" \ 536 MOVNTQ(%%mm0, (dst))\ 538 "psrlq $24, %%mm6 \n\t" \ 539 "movq %%mm1, %%mm5 \n\t" \ 540 "psllq $24, %%mm1 \n\t" \ 541 "por %%mm1, %%mm6 \n\t" \ 542 MOVNTQ(%%mm6, 8(dst))\ 544 "psrlq $40, %%mm5 \n\t" \ 545 "psllq $8, %%mm3 \n\t" \ 546 "por %%mm3, %%mm5 \n\t" \ 547 MOVNTQ(%%mm5, 16(dst))\ 549 "add $24, "#dst" \n\t"\ 551 "add $8, "#index" \n\t"\ 552 "cmp "dstw", "#index" \n\t"\ 555 #define WRITEBGR24MMXEXT(dst, dstw, index) \ 557 "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\ 558 "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\ 559 "pshufw $0x50, %%mm2, %%mm1 \n\t" \ 560 "pshufw $0x50, %%mm4, %%mm3 \n\t" \ 561 "pshufw $0x00, %%mm5, %%mm6 \n\t" \ 563 "pand %%mm0, %%mm1 \n\t" \ 564 "pand %%mm0, %%mm3 \n\t" \ 565 "pand %%mm7, %%mm6 \n\t" \ 567 "psllq $8, %%mm3 \n\t" \ 568 "por %%mm1, %%mm6 \n\t"\ 569 "por %%mm3, %%mm6 \n\t"\ 570 MOVNTQ(%%mm6, (dst))\ 572 "psrlq $8, %%mm4 \n\t" \ 573 "pshufw $0xA5, %%mm2, %%mm1 \n\t" \ 574 "pshufw $0x55, %%mm4, %%mm3 \n\t" \ 575 "pshufw $0xA5, %%mm5, %%mm6 \n\t" \ 577 "pand "MANGLE(ff_M24B)", %%mm1 \n\t" \ 578 "pand %%mm7, %%mm3 \n\t" \ 579 "pand %%mm0, %%mm6 \n\t" \ 581 "por %%mm1, %%mm3 \n\t" \ 582 "por %%mm3, %%mm6 \n\t"\ 583 MOVNTQ(%%mm6, 8(dst))\ 585 "pshufw $0xFF, %%mm2, %%mm1 \n\t" \ 586 "pshufw $0xFA, %%mm4, %%mm3 \n\t" \ 587 "pshufw $0xFA, %%mm5, %%mm6 \n\t" \ 589 "pand %%mm7, %%mm1 \n\t" \ 590 "pand %%mm0, %%mm3 \n\t" \ 591 "pand "MANGLE(ff_M24B)", %%mm6 \n\t" \ 593 "por %%mm1, %%mm3 \n\t"\ 594 "por %%mm3, %%mm6 \n\t"\ 595 MOVNTQ(%%mm6, 16(dst))\ 597 "add $24, "#dst" \n\t"\ 599 "add $8, "#index" \n\t"\ 600 "cmp "dstw", "#index" \n\t"\ 603 #if COMPILE_TEMPLATE_MMXEXT 605 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index) 608 #define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index) 613 const int16_t **lumSrc,
int lumFilterSize,
614 const int16_t *chrFilter,
const int16_t **chrUSrc,
615 const int16_t **chrVSrc,
616 int chrFilterSize,
const int16_t **alpSrc,
617 uint8_t *dest,
int dstW,
int dstY)
625 "pxor %%mm7, %%mm7 \n\t" 626 "lea (%%"FF_REG_a
", %%"FF_REG_a
", 2), %%"FF_REG_c
"\n\t" 627 "add %4, %%"FF_REG_c
" \n\t" 629 ::
"r" (&
c->redDither),
630 "m" (dummy),
"m" (
dummy),
"m" (dummy),
631 "r" (dest),
"m" (dstW_reg),
"m"(uv_off)
633 :
"%"FF_REG_a,
"%"FF_REG_c,
"%"FF_REG_d,
"%"FF_REG_S
                      const int16_t **lumSrc, int lumFilterSize,
                      const int16_t *chrFilter, const int16_t **chrUSrc,
                      const int16_t **chrVSrc,
                      int chrFilterSize, const int16_t **alpSrc,
                      uint8_t *dest, int dstW, int dstY)

        "pxor %%mm7, %%mm7 \n\t"
        "lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c" \n\t"
        "add %4, %%"FF_REG_c" \n\t"
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
    : "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
    MOVNTQ(%%mm1,  (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
    "add $8, "#index" \n\t"\
    "cmp "dstw", "#index" \n\t"\

#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)

                      const int16_t **lumSrc, int lumFilterSize,
                      const int16_t *chrFilter, const int16_t **chrUSrc,
                      const int16_t **chrVSrc,
                      int chrFilterSize, const int16_t **alpSrc,
                      uint8_t *dest, int dstW, int dstY)

        "psraw $3, %%mm3 \n\t"
        "psraw $3, %%mm4 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"

                      const int16_t **lumSrc, int lumFilterSize,
                      const int16_t *chrFilter, const int16_t **chrUSrc,
                      const int16_t **chrVSrc,
                      int chrFilterSize, const int16_t **alpSrc,
                      uint8_t *dest, int dstW, int dstY)
714 "psraw $3, %%mm3 \n\t" 715 "psraw $3, %%mm4 \n\t" 716 "psraw $3, %%mm1 \n\t" 717 "psraw $3, %%mm7 \n\t" 722 #define REAL_YSCALEYUV2RGB_UV(index, c) \ 723 "xor "#index", "#index" \n\t"\ 726 "movq (%2, "#index"), %%mm2 \n\t" \ 727 "movq (%3, "#index"), %%mm3 \n\t" \ 728 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 729 "movq (%2, "#index"), %%mm5 \n\t" \ 730 "movq (%3, "#index"), %%mm4 \n\t" \ 731 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 732 "psubw %%mm3, %%mm2 \n\t" \ 733 "psubw %%mm4, %%mm5 \n\t" \ 734 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 735 "pmulhw %%mm0, %%mm2 \n\t" \ 736 "pmulhw %%mm0, %%mm5 \n\t" \ 737 "psraw $4, %%mm3 \n\t" \ 738 "psraw $4, %%mm4 \n\t" \ 739 "paddw %%mm2, %%mm3 \n\t" \ 740 "paddw %%mm5, %%mm4 \n\t" \ 741 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \ 742 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \ 743 "movq %%mm3, %%mm2 \n\t" \ 744 "movq %%mm4, %%mm5 \n\t" \ 745 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 746 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 749 #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \ 750 "movq ("#b1", "#index", 2), %%mm0 \n\t" \ 751 "movq ("#b2", "#index", 2), %%mm1 \n\t" \ 752 "movq 8("#b1", "#index", 2), %%mm6 \n\t" \ 753 "movq 8("#b2", "#index", 2), %%mm7 \n\t" \ 754 "psubw %%mm1, %%mm0 \n\t" \ 755 "psubw %%mm7, %%mm6 \n\t" \ 756 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \ 757 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \ 758 "psraw $4, %%mm1 \n\t" \ 759 "psraw $4, %%mm7 \n\t" \ 760 "paddw %%mm0, %%mm1 \n\t" \ 761 "paddw %%mm6, %%mm7 \n\t" \ 763 #define REAL_YSCALEYUV2RGB_COEFF(c) \ 764 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 765 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 766 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \ 767 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \ 768 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 769 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 771 "paddw %%mm3, %%mm4 \n\t"\ 772 "movq %%mm2, %%mm0 \n\t"\ 773 "movq %%mm5, %%mm6 \n\t"\ 774 "movq %%mm4, %%mm3 \n\t"\ 775 "punpcklwd %%mm2, %%mm2 \n\t"\ 776 "punpcklwd %%mm5, %%mm5 \n\t"\ 777 "punpcklwd %%mm4, %%mm4 \n\t"\ 778 "paddw %%mm1, %%mm2 \n\t"\ 779 "paddw %%mm1, %%mm5 \n\t"\ 780 "paddw %%mm1, %%mm4 \n\t"\ 781 "punpckhwd %%mm0, %%mm0 \n\t"\ 782 "punpckhwd %%mm6, %%mm6 \n\t"\ 783 "punpckhwd %%mm3, %%mm3 \n\t"\ 784 "paddw %%mm7, %%mm0 \n\t"\ 785 "paddw %%mm7, %%mm6 \n\t"\ 786 "paddw %%mm7, %%mm3 \n\t"\ 788 "packuswb %%mm0, %%mm2 \n\t"\ 789 "packuswb %%mm6, %%mm5 \n\t"\ 790 "packuswb %%mm3, %%mm4 \n\t"\ 792 #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) 794 #define YSCALEYUV2RGB(index, c) \ 795 REAL_YSCALEYUV2RGB_UV(index, c) \ 796 REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \ 797 REAL_YSCALEYUV2RGB_COEFF(c) 803 const int16_t *ubuf[2],
const int16_t *vbuf[2],
804 const int16_t *abuf[2],
uint8_t *dest,
805 int dstW,
int yalpha,
int uvalpha,
int y)
807 const int16_t *buf0 = buf[0], *buf1 = buf[1],
808 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
810 if (CONFIG_SWSCALE_ALPHA &&
c->needAlpha) {
811 const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
816 "psraw $3, %%mm1 \n\t" 817 "psraw $3, %%mm7 \n\t" 818 "packuswb %%mm7, %%mm1 \n\t" 819 WRITEBGR32(%4,
DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
820 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"r" (dest),
822 "r" (abuf0),
"r" (abuf1)
826 c->u_temp=(intptr_t)abuf0;
827 c->v_temp=(intptr_t)abuf1;
830 "mov %4, %%"FF_REG_b
" \n\t" 831 "push %%"FF_REG_BP
" \n\t" 835 "mov "U_TEMP"(%5), %0 \n\t" 836 "mov "V_TEMP"(%5), %1 \n\t" 838 "psraw $3, %%mm1 \n\t" 839 "psraw $3, %%mm7 \n\t" 840 "packuswb %%mm7, %%mm1 \n\t" 843 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
844 "pop %%"FF_REG_BP
" \n\t" 846 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
853 "mov %4, %%"FF_REG_b
" \n\t" 854 "push %%"FF_REG_BP
" \n\t" 856 "pcmpeqd %%mm7, %%mm7 \n\t" 857 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
858 "pop %%"FF_REG_BP
" \n\t" 860 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
867 const int16_t *ubuf[2],
const int16_t *vbuf[2],
868 const int16_t *abuf[2],
uint8_t *dest,
869 int dstW,
int yalpha,
int uvalpha,
int y)
871 const int16_t *buf0 = buf[0], *buf1 = buf[1],
872 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
876 "mov %4, %%"FF_REG_b
" \n\t" 877 "push %%"FF_REG_BP
" \n\t" 879 "pxor %%mm7, %%mm7 \n\t" 881 "pop %%"FF_REG_BP
" \n\t" 883 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
890 const int16_t *ubuf[2],
const int16_t *vbuf[2],
891 const int16_t *abuf[2],
uint8_t *dest,
892 int dstW,
int yalpha,
int uvalpha,
int y)
894 const int16_t *buf0 = buf[0], *buf1 = buf[1],
895 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
899 "mov %4, %%"FF_REG_b
" \n\t" 900 "push %%"FF_REG_BP
" \n\t" 902 "pxor %%mm7, %%mm7 \n\t" 910 "pop %%"FF_REG_BP
" \n\t" 912 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
919 const int16_t *ubuf[2],
const int16_t *vbuf[2],
920 const int16_t *abuf[2],
uint8_t *dest,
921 int dstW,
int yalpha,
int uvalpha,
int y)
923 const int16_t *buf0 = buf[0], *buf1 = buf[1],
924 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
928 "mov %4, %%"FF_REG_b
" \n\t" 929 "push %%"FF_REG_BP
" \n\t" 931 "pxor %%mm7, %%mm7 \n\t" 939 "pop %%"FF_REG_BP
" \n\t" 941 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
947 #define REAL_YSCALEYUV2PACKED(index, c) \ 948 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 949 "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\ 950 "psraw $3, %%mm0 \n\t"\ 951 "psraw $3, %%mm1 \n\t"\ 952 "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 953 "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\ 954 "xor "#index", "#index" \n\t"\ 957 "movq (%2, "#index"), %%mm2 \n\t" \ 958 "movq (%3, "#index"), %%mm3 \n\t" \ 959 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 960 "movq (%2, "#index"), %%mm5 \n\t" \ 961 "movq (%3, "#index"), %%mm4 \n\t" \ 962 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 963 "psubw %%mm3, %%mm2 \n\t" \ 964 "psubw %%mm4, %%mm5 \n\t" \ 965 "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\ 966 "pmulhw %%mm0, %%mm2 \n\t" \ 967 "pmulhw %%mm0, %%mm5 \n\t" \ 968 "psraw $7, %%mm3 \n\t" \ 969 "psraw $7, %%mm4 \n\t" \ 970 "paddw %%mm2, %%mm3 \n\t" \ 971 "paddw %%mm5, %%mm4 \n\t" \ 972 "movq (%0, "#index", 2), %%mm0 \n\t" \ 973 "movq (%1, "#index", 2), %%mm1 \n\t" \ 974 "movq 8(%0, "#index", 2), %%mm6 \n\t" \ 975 "movq 8(%1, "#index", 2), %%mm7 \n\t" \ 976 "psubw %%mm1, %%mm0 \n\t" \ 977 "psubw %%mm7, %%mm6 \n\t" \ 978 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" \ 979 "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" \ 980 "psraw $7, %%mm1 \n\t" \ 981 "psraw $7, %%mm7 \n\t" \ 982 "paddw %%mm0, %%mm1 \n\t" \ 983 "paddw %%mm6, %%mm7 \n\t" \ 985 #define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c) 988 const int16_t *ubuf[2],
const int16_t *vbuf[2],
989 const int16_t *abuf[2],
uint8_t *dest,
990 int dstW,
int yalpha,
int uvalpha,
int y)
992 const int16_t *buf0 = buf[0], *buf1 = buf[1],
993 *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
997 "mov %4, %%"FF_REG_b
" \n\t" 998 "push %%"FF_REG_BP
" \n\t" 1001 "pop %%"FF_REG_BP
" \n\t" 1003 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1008 #define REAL_YSCALEYUV2RGB1(index, c) \ 1009 "xor "#index", "#index" \n\t"\ 1012 "movq (%2, "#index"), %%mm3 \n\t" \ 1013 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1014 "movq (%2, "#index"), %%mm4 \n\t" \ 1015 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1016 "psraw $4, %%mm3 \n\t" \ 1017 "psraw $4, %%mm4 \n\t" \ 1018 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \ 1019 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \ 1020 "movq %%mm3, %%mm2 \n\t" \ 1021 "movq %%mm4, %%mm5 \n\t" \ 1022 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 1023 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 1025 "movq (%0, "#index", 2), %%mm1 \n\t" \ 1026 "movq 8(%0, "#index", 2), %%mm7 \n\t" \ 1027 "psraw $4, %%mm1 \n\t" \ 1028 "psraw $4, %%mm7 \n\t" \ 1029 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 1030 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 1031 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \ 1032 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \ 1033 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 1034 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 1036 "paddw %%mm3, %%mm4 \n\t"\ 1037 "movq %%mm2, %%mm0 \n\t"\ 1038 "movq %%mm5, %%mm6 \n\t"\ 1039 "movq %%mm4, %%mm3 \n\t"\ 1040 "punpcklwd %%mm2, %%mm2 \n\t"\ 1041 "punpcklwd %%mm5, %%mm5 \n\t"\ 1042 "punpcklwd %%mm4, %%mm4 \n\t"\ 1043 "paddw %%mm1, %%mm2 \n\t"\ 1044 "paddw %%mm1, %%mm5 \n\t"\ 1045 "paddw %%mm1, %%mm4 \n\t"\ 1046 "punpckhwd %%mm0, %%mm0 \n\t"\ 1047 "punpckhwd %%mm6, %%mm6 \n\t"\ 1048 "punpckhwd %%mm3, %%mm3 \n\t"\ 1049 "paddw %%mm7, %%mm0 \n\t"\ 1050 "paddw %%mm7, %%mm6 \n\t"\ 1051 "paddw %%mm7, %%mm3 \n\t"\ 1053 "packuswb %%mm0, %%mm2 \n\t"\ 1054 "packuswb %%mm6, %%mm5 \n\t"\ 1055 "packuswb %%mm3, %%mm4 \n\t"\ 1057 #define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c) 1060 #define REAL_YSCALEYUV2RGB1b(index, c) \ 1061 "xor "#index", "#index" \n\t"\ 1064 "movq (%2, "#index"), %%mm2 \n\t" \ 1065 "movq (%3, "#index"), %%mm3 \n\t" \ 1066 "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1067 "movq (%2, "#index"), %%mm5 \n\t" \ 1068 "movq (%3, "#index"), %%mm4 \n\t" \ 1069 "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \ 1070 "paddw %%mm2, %%mm3 \n\t" \ 1071 "paddw %%mm5, %%mm4 \n\t" \ 1072 "psrlw $5, %%mm3 \n\t" \ 1073 "psrlw $5, %%mm4 \n\t" \ 1074 "psubw "U_OFFSET"("#c"), %%mm3 \n\t" \ 1075 "psubw "V_OFFSET"("#c"), %%mm4 \n\t" \ 1076 "movq %%mm3, %%mm2 \n\t" \ 1077 "movq %%mm4, %%mm5 \n\t" \ 1078 "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\ 1079 "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\ 1081 "movq (%0, "#index", 2), %%mm1 \n\t" \ 1082 "movq 8(%0, "#index", 2), %%mm7 \n\t" \ 1083 "psraw $4, %%mm1 \n\t" \ 1084 "psraw $4, %%mm7 \n\t" \ 1085 "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ 1086 "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ 1087 "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" \ 1088 "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" \ 1089 "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ 1090 "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ 1092 "paddw %%mm3, %%mm4 \n\t"\ 1093 "movq %%mm2, %%mm0 \n\t"\ 1094 "movq %%mm5, %%mm6 \n\t"\ 1095 "movq %%mm4, %%mm3 \n\t"\ 1096 "punpcklwd %%mm2, %%mm2 \n\t"\ 1097 "punpcklwd %%mm5, %%mm5 \n\t"\ 1098 "punpcklwd %%mm4, %%mm4 \n\t"\ 1099 "paddw %%mm1, %%mm2 \n\t"\ 1100 "paddw %%mm1, %%mm5 \n\t"\ 1101 "paddw %%mm1, %%mm4 \n\t"\ 1102 "punpckhwd %%mm0, %%mm0 \n\t"\ 1103 "punpckhwd %%mm6, %%mm6 \n\t"\ 1104 "punpckhwd %%mm3, %%mm3 \n\t"\ 1105 "paddw %%mm7, %%mm0 \n\t"\ 1106 "paddw %%mm7, %%mm6 \n\t"\ 1107 "paddw %%mm7, %%mm3 \n\t"\ 1109 "packuswb %%mm0, %%mm2 \n\t"\ 1110 "packuswb %%mm6, %%mm5 \n\t"\ 1111 "packuswb %%mm3, %%mm4 \n\t"\ 1113 #define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c) 1115 #define REAL_YSCALEYUV2RGB1_ALPHA(index) \ 
1116 "movq (%1, "#index", 2), %%mm7 \n\t" \ 1117 "movq 8(%1, "#index", 2), %%mm1 \n\t" \ 1118 "psraw $7, %%mm7 \n\t" \ 1119 "psraw $7, %%mm1 \n\t" \ 1120 "packuswb %%mm1, %%mm7 \n\t" 1121 #define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index) 1127 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1128 const int16_t *abuf0,
uint8_t *dest,
1129 int dstW,
int uvalpha,
int y)
1131 const int16_t *ubuf0 = ubuf[0];
1132 const int16_t *buf1= buf0;
1134 if (uvalpha < 2048) {
1135 const int16_t *ubuf1 = ubuf[0];
1136 if (CONFIG_SWSCALE_ALPHA &&
c->needAlpha) {
1139 "mov %4, %%"FF_REG_b
" \n\t" 1140 "push %%"FF_REG_BP
" \n\t" 1143 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1144 "pop %%"FF_REG_BP
" \n\t" 1146 ::
"c" (buf0),
"d" (abuf0),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1152 "mov %4, %%"FF_REG_b
" \n\t" 1153 "push %%"FF_REG_BP
" \n\t" 1155 "pcmpeqd %%mm7, %%mm7 \n\t" 1156 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1157 "pop %%"FF_REG_BP
" \n\t" 1159 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1164 const int16_t *ubuf1 = ubuf[1];
1165 if (CONFIG_SWSCALE_ALPHA &&
c->needAlpha) {
1168 "mov %4, %%"FF_REG_b
" \n\t" 1169 "push %%"FF_REG_BP
" \n\t" 1172 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1173 "pop %%"FF_REG_BP
" \n\t" 1175 ::
"c" (buf0),
"d" (abuf0),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1181 "mov %4, %%"FF_REG_b
" \n\t" 1182 "push %%"FF_REG_BP
" \n\t" 1184 "pcmpeqd %%mm7, %%mm7 \n\t" 1185 WRITEBGR32(%%FF_REGb,
DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1186 "pop %%"FF_REG_BP
" \n\t" 1188 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1196 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1197 const int16_t *abuf0,
uint8_t *dest,
1198 int dstW,
int uvalpha,
int y)
1200 const int16_t *ubuf0 = ubuf[0];
1201 const int16_t *buf1= buf0;
1203 if (uvalpha < 2048) {
1204 const int16_t *ubuf1 = ubuf[0];
1207 "mov %4, %%"FF_REG_b
" \n\t" 1208 "push %%"FF_REG_BP
" \n\t" 1210 "pxor %%mm7, %%mm7 \n\t" 1212 "pop %%"FF_REG_BP
" \n\t" 1214 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1219 const int16_t *ubuf1 = ubuf[1];
1222 "mov %4, %%"FF_REG_b
" \n\t" 1223 "push %%"FF_REG_BP
" \n\t" 1225 "pxor %%mm7, %%mm7 \n\t" 1227 "pop %%"FF_REG_BP
" \n\t" 1229 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1237 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1238 const int16_t *abuf0,
uint8_t *dest,
1239 int dstW,
int uvalpha,
int y)
1241 const int16_t *ubuf0 = ubuf[0];
1242 const int16_t *buf1= buf0;
1244 if (uvalpha < 2048) {
1245 const int16_t *ubuf1 = ubuf[0];
1248 "mov %4, %%"FF_REG_b
" \n\t" 1249 "push %%"FF_REG_BP
" \n\t" 1251 "pxor %%mm7, %%mm7 \n\t" 1259 "pop %%"FF_REG_BP
" \n\t" 1261 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1266 const int16_t *ubuf1 = ubuf[1];
1269 "mov %4, %%"FF_REG_b
" \n\t" 1270 "push %%"FF_REG_BP
" \n\t" 1272 "pxor %%mm7, %%mm7 \n\t" 1280 "pop %%"FF_REG_BP
" \n\t" 1282 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1290 const int16_t *ubuf[2],
const int16_t *vbuf[2],
1291 const int16_t *abuf0,
uint8_t *dest,
1292 int dstW,
int uvalpha,
int y)
1294 const int16_t *ubuf0 = ubuf[0];
1295 const int16_t *buf1= buf0;
1297 if (uvalpha < 2048) {
1298 const int16_t *ubuf1 = ubuf[0];
1301 "mov %4, %%"FF_REG_b
" \n\t" 1302 "push %%"FF_REG_BP
" \n\t" 1304 "pxor %%mm7, %%mm7 \n\t" 1312 "pop %%"FF_REG_BP
" \n\t" 1314 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1319 const int16_t *ubuf1 = ubuf[1];
1322 "mov %4, %%"FF_REG_b
" \n\t" 1323 "push %%"FF_REG_BP
" \n\t" 1325 "pxor %%mm7, %%mm7 \n\t" 1333 "pop %%"FF_REG_BP
" \n\t" 1335 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" \
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" \
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" \
    "movq 8(%0, "#index", 2), %%mm7 \n\t" \
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t" \

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)

#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" \
    "movq (%3, "#index"), %%mm3 \n\t" \
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" \
    "movq (%3, "#index"), %%mm4 \n\t" \
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" \
    "paddw %%mm5, %%mm4 \n\t" \
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" \
    "movq 8(%0, "#index", 2), %%mm7 \n\t" \
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)

                      const int16_t *ubuf[2],
const int16_t *vbuf[2],
1381 const int16_t *abuf0,
uint8_t *dest,
1382 int dstW,
int uvalpha,
int y)
1384 const int16_t *ubuf0 = ubuf[0];
1385 const int16_t *buf1= buf0;
1387 if (uvalpha < 2048) {
1388 const int16_t *ubuf1 = ubuf[0];
1391 "mov %4, %%"FF_REG_b
" \n\t" 1392 "push %%"FF_REG_BP
" \n\t" 1395 "pop %%"FF_REG_BP
" \n\t" 1397 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
1401 const int16_t *ubuf1 = ubuf[1];
1404 "mov %4, %%"FF_REG_b
" \n\t" 1405 "push %%"FF_REG_BP
" \n\t" 1408 "pop %%"FF_REG_BP
" \n\t" 1410 ::
"c" (buf0),
"d" (buf1),
"S" (ubuf0),
"D" (ubuf1),
"m" (dest),
    c->use_mmx_vfilter= 0;

    switch (c->dstFormat) {

    c->use_mmx_vfilter= 1;

    switch (c->dstFormat) {

    switch (c->dstFormat) {
        c->yuv2packed1 = RENAME(yuv2rgb32_1);
        c->yuv2packed2 = RENAME(yuv2rgb32_2);
        c->yuv2packed1 = RENAME(yuv2bgr24_1);
        c->yuv2packed2 = RENAME(yuv2bgr24_2);
        c->yuv2packed1 = RENAME(yuv2rgb555_1);
        c->yuv2packed2 = RENAME(yuv2rgb555_2);
        c->yuv2packed1 = RENAME(yuv2rgb565_1);
        c->yuv2packed2 = RENAME(yuv2rgb565_2);
        c->yuv2packed1 = RENAME(yuv2yuyv422_1);
        c->yuv2packed2 = RENAME(yuv2yuyv422_2);

    if (c->srcBpc == 8 && c->dstBpc <= 14) {

#if COMPILE_TEMPLATE_MMXEXT
        c->hyscale_fast = NULL;
        c->hcscale_fast = NULL;
#if COMPILE_TEMPLATE_MMXEXT