00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "libavutil/cpu.h"
00026 #include "libavutil/x86/asm.h"
00027 #include "libavcodec/dsputil.h"
00028 #include "libavcodec/h264dsp.h"
00029 #include "libavcodec/mpegvideo.h"
00030 #include "libavcodec/simple_idct.h"
00031 #include "dsputil_mmx.h"
00032 #include "idct_xvid.h"
00033 #include "diracdsp_mmx.h"
00034
00035
00036
00037
00038
/*
 * Packed constants used as memory operands by the SIMD code in this file.
 *
 * Naming, as borne out by the initializers below:
 *   ff_pw_N  - every 16-bit word holds the decimal value N
 *   ff_pb_XX - every byte holds the hexadecimal value 0xXX
 *   ff_pd_N  - every double holds the value N
 *
 * Entries declared as uint64_t carry one 64-bit pattern (8-byte aligned);
 * entries declared as xmm_reg carry the same pattern in two 64-bit halves
 * (16-byte aligned).
 */

/* Every byte == 1, and every 16-bit word == 2. */
DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

/* Every 32-bit dword == 0x80000000. */
DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
    { 0x8000000080000000ULL, 0x8000000080000000ULL };

/* Packed 16-bit words. */
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27) = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28) = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63) = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

/* Packed bytes. */
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4) = { 0x0404040404040404ULL, 0x0404040404040404ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pb_7) = 0x0707070707070707ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F) = 0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pb_81) = 0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1) = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8) = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };

/* Packed doubles. */
DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
00086
00087 #if HAVE_INLINE_ASM
00088
/* Emit a ".p2align 3" (8-byte alignment) directive as its own asm statement. */
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
/* Clear register regd by XORing it with itself. */
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)

/* Set every byte of regd to 0xFE without touching memory:
 * pcmpeqd of a register with itself gives all ones, and 0xFF + 0xFF wraps
 * to 0xFE in each byte. */
#define MOVQ_BFE(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%"#regd", %%"#regd" \n\t" \
        "paddb %%"#regd", %%"#regd" \n\t" ::)

#ifndef PIC
/* Non-PIC build: load the constants straight from ff_bone / ff_wtwo. */
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
/* PIC build: synthesize the same values in registers instead of referencing
 * the global constants. */

/* All ones -> each word shifted right by 15 gives 1 -> packed to bytes of 1. */
#define MOVQ_BONE(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%"#regd", %%"#regd" \n\t" \
        "psrlw $15, %%"#regd" \n\t" \
        "packuswb %%"#regd", %%"#regd" \n\t" ::)

/* All ones -> each word shifted right by 15 gives 1 -> shifted left by 1 gives 2. */
#define MOVQ_WTWO(regd) \
    __asm__ volatile ( \
        "pcmpeqd %%"#regd", %%"#regd" \n\t" \
        "psrlw $15, %%"#regd" \n\t" \
        "psllw $1, %%"#regd" \n\t"::)

#endif
00116
00117
00118
00119
/*
 * Byte-wise average of rega and regb written to regr, truncating:
 *     regr = (rega & regb) + (((rega ^ regb) & regfe) >> 1)
 * regb is overwritten with the intermediate term. For the 64-bit shift to
 * behave as a per-byte shift, regfe must clear bit 0 of every byte
 * (0xFE in each byte, the value MOVQ_BFE produces).
 */
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq "#rega", "#regr" \n\t" \
    "pand "#regb", "#regr" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pand "#regfe", "#regb" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "paddb "#regb", "#regr" \n\t"

/*
 * Byte-wise average of rega and regb written to regr, rounding up:
 *     regr = (rega | regb) - (((rega ^ regb) & regfe) >> 1)
 * Same register usage and regfe requirement as PAVGB_MMX_NO_RND.
 */
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq "#rega", "#regr" \n\t" \
    "por "#regb", "#regr" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pand "#regfe", "#regb" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "psubb "#regb", "#regr" \n\t"

/*
 * Two interleaved truncating averages: regr = avg(rega, regb) and
 * regp = avg(regc, regd). The mask is taken from mm6, so mm6 must already
 * hold the 0xFE-per-byte mask; regb and regd are overwritten.
 */
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq "#rega", "#regr" \n\t" \
    "movq "#regc", "#regp" \n\t" \
    "pand "#regb", "#regr" \n\t" \
    "pand "#regd", "#regp" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pxor "#regc", "#regd" \n\t" \
    "pand %%mm6, "#regb" \n\t" \
    "pand %%mm6, "#regd" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "psrlq $1, "#regd" \n\t" \
    "paddb "#regb", "#regr" \n\t" \
    "paddb "#regd", "#regp" \n\t"

/*
 * Two interleaved rounding-up averages: regr = avg(rega, regb) and
 * regp = avg(regc, regd). Uses mm6 as the mask like PAVGBP_MMX_NO_RND;
 * regb and regd are overwritten.
 */
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq "#rega", "#regr" \n\t" \
    "movq "#regc", "#regp" \n\t" \
    "por "#regb", "#regr" \n\t" \
    "por "#regd", "#regp" \n\t" \
    "pxor "#rega", "#regb" \n\t" \
    "pxor "#regc", "#regd" \n\t" \
    "pand %%mm6, "#regb" \n\t" \
    "pand %%mm6, "#regd" \n\t" \
    "psrlq $1, "#regd" \n\t" \
    "psrlq $1, "#regb" \n\t" \
    "psubb "#regb", "#regr" \n\t" \
    "psubb "#regd", "#regp" \n\t"
00164
00165
00166
/*
 * First inclusion of dsputil_rnd_template.c: names are built as
 * <x>_no_rnd_<y>_mmx and the averaging hooks map to the truncating macros.
 * OP_AVG always maps to the rounding PAVGB_MMX and deliberately stays
 * defined across both inclusions; it is only removed after the second one.
 */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB

/*
 * Second inclusion of the same template: names are built as <x>_<y>_mmx and
 * the averaging hooks map to the rounding macros.
 */
#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG
00194
00195
00196
00197
/*
 * 3DNow! inclusion of dsputil_avg_template.c: names get a _3dnow suffix and
 * the averaging instruction is "pavgusb". SKIP_FOR_3DNOW is defined only for
 * this inclusion (presumably to leave out parts of the template -- confirm
 * in dsputil_avg_template.c).
 */
#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"
#define OP_AVG PAVGB
#define SKIP_FOR_3DNOW

#include "dsputil_avg_template.c"

#undef DEF
#undef PAVGB
#undef OP_AVG
#undef SKIP_FOR_3DNOW

/*
 * MMXEXT inclusion of the same template: names get a _mmxext suffix and the
 * averaging instruction is "pavgb".
 */
#define DEF(x) x ## _mmxext

#define PAVGB "pavgb"
#define OP_AVG PAVGB

#include "dsputil_avg_template.c"

#undef DEF
#undef PAVGB
#undef OP_AVG
00224
/*
 * Name aliases: the no_rnd and mmxext variants of the plain put_pixels
 * functions resolve to the MMX copies (presumably because a straight copy
 * involves no averaging, so neither the rounding mode nor the averaging
 * instruction matters -- confirm against the users of these names).
 */
#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
#define put_pixels16_mmxext put_pixels16_mmx
#define put_pixels8_mmxext put_pixels8_mmx
#define put_pixels4_mmxext put_pixels4_mmx
#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx
00232
00233
00234
00235
00236 void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
00237 int line_size)
00238 {
00239 const DCTELEM *p;
00240 uint8_t *pix;
00241
00242
00243 p = block;
00244 pix = pixels;
00245
00246 __asm__ volatile (
00247 "movq (%3), %%mm0 \n\t"
00248 "movq 8(%3), %%mm1 \n\t"
00249 "movq 16(%3), %%mm2 \n\t"
00250 "movq 24(%3), %%mm3 \n\t"
00251 "movq 32(%3), %%mm4 \n\t"
00252 "movq 40(%3), %%mm5 \n\t"
00253 "movq 48(%3), %%mm6 \n\t"
00254 "movq 56(%3), %%mm7 \n\t"
00255 "packuswb %%mm1, %%mm0 \n\t"
00256 "packuswb %%mm3, %%mm2 \n\t"
00257 "packuswb %%mm5, %%mm4 \n\t"
00258 "packuswb %%mm7, %%mm6 \n\t"
00259 "movq %%mm0, (%0) \n\t"
00260 "movq %%mm2, (%0, %1) \n\t"
00261 "movq %%mm4, (%0, %1, 2) \n\t"
00262 "movq %%mm6, (%0, %2) \n\t"
00263 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
00264 "r"(p)
00265 : "memory");
00266 pix += line_size * 4;
00267 p += 32;
00268
00269
00270
00271
00272 __asm__ volatile (
00273 "movq (%3), %%mm0 \n\t"
00274 "movq 8(%3), %%mm1 \n\t"
00275 "movq 16(%3), %%mm2 \n\t"
00276 "movq 24(%3), %%mm3 \n\t"
00277 "movq 32(%3), %%mm4 \n\t"
00278 "movq 40(%3), %%mm5 \n\t"
00279 "movq 48(%3), %%mm6 \n\t"
00280 "movq 56(%3), %%mm7 \n\t"
00281 "packuswb %%mm1, %%mm0 \n\t"
00282 "packuswb %%mm3, %%mm2 \n\t"
00283 "packuswb %%mm5, %%mm4 \n\t"
00284 "packuswb %%mm7, %%mm6 \n\t"
00285 "movq %%mm0, (%0) \n\t"
00286 "movq %%mm2, (%0, %1) \n\t"
00287 "movq %%mm4, (%0, %1, 2) \n\t"
00288 "movq %%mm6, (%0, %2) \n\t"
00289 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
00290 : "memory");
00291 }
00292
/*
 * Asm fragment that converts four rows of coefficients, starting off bytes
 * into the block, and stores them as four 8-byte rows.
 *
 * Operands expected by the fragment:
 *   %0 destination row pointer, %1 three times the row stride,
 *   %2 coefficient block pointer, %3 row stride.
 * mm0 must hold the per-byte bias that is added after the signed saturating
 * pack (packsswb); mm1-mm4 are used as scratch.
 */
#define put_signed_pixels_clamped_mmx_half(off) \
    "movq "#off"(%2), %%mm1 \n\t" \
    "movq 16 + "#off"(%2), %%mm2 \n\t" \
    "movq 32 + "#off"(%2), %%mm3 \n\t" \
    "movq 48 + "#off"(%2), %%mm4 \n\t" \
    "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
    "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
    "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
    "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
    "paddb %%mm0, %%mm1 \n\t" \
    "paddb %%mm0, %%mm2 \n\t" \
    "paddb %%mm0, %%mm3 \n\t" \
    "paddb %%mm0, %%mm4 \n\t" \
    "movq %%mm1, (%0) \n\t" \
    "movq %%mm2, (%0, %3) \n\t" \
    "movq %%mm3, (%0, %3, 2) \n\t" \
    "movq %%mm4, (%0, %1) \n\t"
00310
00311 void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
00312 int line_size)
00313 {
00314 x86_reg line_skip = line_size;
00315 x86_reg line_skip3;
00316
00317 __asm__ volatile (
00318 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
00319 "lea (%3, %3, 2), %1 \n\t"
00320 put_signed_pixels_clamped_mmx_half(0)
00321 "lea (%0, %3, 4), %0 \n\t"
00322 put_signed_pixels_clamped_mmx_half(64)
00323 : "+&r"(pixels), "=&r"(line_skip3)
00324 : "r"(block), "r"(line_skip)
00325 : "memory");
00326 }
00327
00328 void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels,
00329 int line_size)
00330 {
00331 const DCTELEM *p;
00332 uint8_t *pix;
00333 int i;
00334
00335
00336 p = block;
00337 pix = pixels;
00338 MOVQ_ZERO(mm7);
00339 i = 4;
00340 do {
00341 __asm__ volatile (
00342 "movq (%2), %%mm0 \n\t"
00343 "movq 8(%2), %%mm1 \n\t"
00344 "movq 16(%2), %%mm2 \n\t"
00345 "movq 24(%2), %%mm3 \n\t"
00346 "movq %0, %%mm4 \n\t"
00347 "movq %1, %%mm6 \n\t"
00348 "movq %%mm4, %%mm5 \n\t"
00349 "punpcklbw %%mm7, %%mm4 \n\t"
00350 "punpckhbw %%mm7, %%mm5 \n\t"
00351 "paddsw %%mm4, %%mm0 \n\t"
00352 "paddsw %%mm5, %%mm1 \n\t"
00353 "movq %%mm6, %%mm5 \n\t"
00354 "punpcklbw %%mm7, %%mm6 \n\t"
00355 "punpckhbw %%mm7, %%mm5 \n\t"
00356 "paddsw %%mm6, %%mm2 \n\t"
00357 "paddsw %%mm5, %%mm3 \n\t"
00358 "packuswb %%mm1, %%mm0 \n\t"
00359 "packuswb %%mm3, %%mm2 \n\t"
00360 "movq %%mm0, %0 \n\t"
00361 "movq %%mm2, %1 \n\t"
00362 : "+m"(*pix), "+m"(*(pix + line_size))
00363 : "r"(p)
00364 : "memory");
00365 pix += line_size * 2;
00366 p += 16;
00367 } while (--i);
00368 }
00369
/**
 * Copy h rows of 8 bytes from pixels to block.
 *
 * Source and destination use the same row stride. Four rows are copied per
 * loop iteration and the loop ends when h reaches exactly zero after
 * subtracting 4, so h must be a positive multiple of 4.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows to copy
 */
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
                            int line_size, int h)
{
    __asm__ volatile (
        /* REG_a = 2 * line_size: the pointers advance two rows at a time */
        "lea (%3, %3), %%"REG_a" \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1 ), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1 ), %%mm0 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
00396
/**
 * Copy h rows of 16 bytes from pixels to block.
 *
 * Source and destination use the same row stride. Four rows are copied per
 * loop iteration and the loop ends when h reaches exactly zero after
 * subtracting 4, so h must be a positive multiple of 4.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size row stride in bytes, used for both buffers
 * @param h         number of rows to copy
 */
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
                             int line_size, int h)
{
    __asm__ volatile (
        /* REG_a = 2 * line_size: the pointers advance two rows at a time */
        "lea (%3, %3), %%"REG_a" \n\t"
        ".p2align 3 \n\t"
        "1: \n\t"
        "movq (%1 ), %%mm0 \n\t"
        "movq 8(%1 ), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "movq (%1 ), %%mm0 \n\t"
        "movq 8(%1 ), %%mm4 \n\t"
        "movq (%1, %3), %%mm1 \n\t"
        "movq 8(%1, %3), %%mm5 \n\t"
        "movq %%mm0, (%2) \n\t"
        "movq %%mm4, 8(%2) \n\t"
        "movq %%mm1, (%2, %3) \n\t"
        "movq %%mm5, 8(%2, %3) \n\t"
        "add %%"REG_a", %1 \n\t"
        "add %%"REG_a", %2 \n\t"
        "subl $4, %0 \n\t"
        "jnz 1b \n\t"
        : "+g"(h), "+r"(pixels), "+r"(block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
00431
00432 #define CLEAR_BLOCKS(name, n) \
00433 static void name(DCTELEM *blocks) \
00434 { \
00435 __asm__ volatile ( \
00436 "pxor %%mm7, %%mm7 \n\t" \
00437 "mov %1, %%"REG_a" \n\t" \
00438 "1: \n\t" \
00439 "movq %%mm7, (%0, %%"REG_a") \n\t" \
00440 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
00441 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
00442 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
00443 "add $32, %%"REG_a" \n\t" \
00444 "js 1b \n\t" \
00445 :: "r"(((uint8_t *)blocks) + 128 * n), \
00446 "i"(-128 * n) \
00447 : "%"REG_a \
00448 ); \
00449 }
00450 CLEAR_BLOCKS(clear_blocks_mmx, 6)
00451 CLEAR_BLOCKS(clear_block_mmx, 1)
00452
/**
 * Zero-fill 128 bytes starting at block using eight 16-byte SSE stores.
 *
 * movaps is an aligned store, so block must be 16-byte aligned.
 *
 * @param block start of the 128-byte region to clear
 */
static void clear_block_sse(DCTELEM *block)
{
    __asm__ volatile (
        "xorps %%xmm0, %%xmm0 \n"
        "movaps %%xmm0, (%0) \n"
        "movaps %%xmm0, 16(%0) \n"
        "movaps %%xmm0, 32(%0) \n"
        "movaps %%xmm0, 48(%0) \n"
        "movaps %%xmm0, 64(%0) \n"
        "movaps %%xmm0, 80(%0) \n"
        "movaps %%xmm0, 96(%0) \n"
        "movaps %%xmm0, 112(%0) \n"
        :: "r"(block)
        : "memory"
        );
}
00469
00470 static void clear_blocks_sse(DCTELEM *blocks)
00471 {
00472 __asm__ volatile (
00473 "xorps %%xmm0, %%xmm0 \n"
00474 "mov %1, %%"REG_a" \n"
00475 "1: \n"
00476 "movaps %%xmm0, (%0, %%"REG_a") \n"
00477 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
00478 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
00479 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
00480 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
00481 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
00482 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
00483 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
00484 "add $128, %%"REG_a" \n"
00485 "js 1b \n"
00486 :: "r"(((uint8_t *)blocks) + 128 * 6),
00487 "i"(-128 * 6)
00488 : "%"REG_a
00489 );
00490 }
00491
00492 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
00493 {
00494 x86_reg i = 0;
00495 __asm__ volatile (
00496 "jmp 2f \n\t"
00497 "1: \n\t"
00498 "movq (%1, %0), %%mm0 \n\t"
00499 "movq (%2, %0), %%mm1 \n\t"
00500 "paddb %%mm0, %%mm1 \n\t"
00501 "movq %%mm1, (%2, %0) \n\t"
00502 "movq 8(%1, %0), %%mm0 \n\t"
00503 "movq 8(%2, %0), %%mm1 \n\t"
00504 "paddb %%mm0, %%mm1 \n\t"
00505 "movq %%mm1, 8(%2, %0) \n\t"
00506 "add $16, %0 \n\t"
00507 "2: \n\t"
00508 "cmp %3, %0 \n\t"
00509 "js 1b \n\t"
00510 : "+r"(i)
00511 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
00512 );
00513 for ( ; i < w; i++)
00514 dst[i + 0] += src[i + 0];
00515 }
00516
#if HAVE_7REGS
/**
 * Reconstruct w bytes from residuals using a median predictor.
 *
 * For each position, with l the previous output byte, t the byte from top
 * and tl the previous byte from top, the asm computes
 *     pred   = median(l, t, (l + t - tl) & 0xff)
 *     dst[k] = (pred + diff[k]) & 0xff
 * and then carries l = dst[k], tl = t into the next position.
 *
 * The buffers are addressed from their ends with a negative index that
 * counts up to zero. At least one position is always processed, so w is
 * expected to be positive.
 *
 * The asm reads top/diff and writes dst through input-only operands, so it
 * declares a "memory" clobber to make those accesses visible to the
 * compiler.
 *
 * @param dst      output bytes
 * @param top      bytes of the row above
 * @param diff     residual bytes
 * @param w        number of bytes to process
 * @param left     in: previous output byte; out: last output byte
 * @param left_top in: previous top byte;    out: last top byte
 */
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
                                            const uint8_t *diff, int w,
                                            int *left, int *left_top)
{
    x86_reg w2 = -w;
    x86_reg x;
    int l  = *left     & 0xff;
    int tl = *left_top & 0xff;
    int t;

    __asm__ volatile (
        "mov %7, %3 \n"
        "1: \n"
        "movzbl (%3, %4), %2 \n"
        "mov %2, %k3 \n"
        "sub %b1, %b3 \n"
        "add %b0, %b3 \n"
        "mov %2, %1 \n"
        "cmp %0, %2 \n"
        "cmovg %0, %2 \n"
        "cmovg %1, %0 \n"
        "cmp %k3, %0 \n"
        "cmovg %k3, %0 \n"
        "mov %7, %3 \n"
        "cmp %2, %0 \n"
        "cmovl %2, %0 \n"
        "add (%6, %4), %b0 \n"
        "mov %b0, (%5, %4) \n"
        "inc %4 \n"
        "jl 1b \n"
        : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        : "r"(dst + w), "r"(diff + w), "rm"(top + w)
        : "memory"
        );
    *left     = l;
    *left_top = tl;
}
#endif
00554
/**
 * Transpose a 4x4 block of bytes.
 *
 * Four rows of four bytes are read from src and written to dst so that
 * output row k holds input column k.
 *
 * @param dst        destination of the transposed block
 * @param src        first row of the source block
 * @param dst_stride distance in bytes between destination rows
 * @param src_stride distance in bytes between source rows
 */
static inline void transpose4x4(uint8_t *dst, uint8_t *src, x86_reg dst_stride, x86_reg src_stride){
    __asm__ volatile(
        /* load rows 0..3 into mm0..mm3 (src is advanced by one row first) */
        "movd (%1), %%mm0 \n\t"
        "add %3, %1 \n\t"
        "movd (%1), %%mm1 \n\t"
        "movd (%1,%3,1), %%mm2 \n\t"
        "movd (%1,%3,2), %%mm3 \n\t"
        /* interleave bytes, then words: mm0 = columns 0,1; mm1 = columns 2,3 */
        "punpcklbw %%mm1, %%mm0 \n\t"
        "punpcklbw %%mm3, %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "punpcklwd %%mm2, %%mm0 \n\t"
        "punpckhwd %%mm2, %%mm1 \n\t"
        /* store the four columns as rows 0..3 of dst */
        "movd %%mm0, (%0) \n\t"
        "add %2, %0 \n\t"
        "punpckhdq %%mm0, %%mm0 \n\t"
        "movd %%mm0, (%0) \n\t"
        "movd %%mm1, (%0,%2,1) \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movd %%mm1, (%0,%2,2) \n\t"

        : "+&r" (dst),
          "+&r" (src)
        : "r" (dst_stride),
          "r" (src_stride)
        : "memory"
        );
}
00582
/*
 * Asm fragment shared by the H.263 loop filter functions below.
 *
 * Operands expected by the fragment:
 *   %0, %1, %2, %3  four consecutive 8-byte lines across the edge
 *                   (memory operands)
 *   %4              twice the filter strength
 *   %5              byte mask applied before the "psrlw $2" step
 *                   (the callers pass ff_pb_FC)
 *
 * Results are left in registers, not written back:
 *   mm5 = new %0, mm3 = new %1, mm4 = new %2, mm6 = new %3
 * All of mm0-mm7 are overwritten.
 */
#define H263_LOOP_FILTER \
    "pxor %%mm7, %%mm7 \n\t" \
    "movq %0, %%mm0 \n\t" \
    "movq %0, %%mm1 \n\t" \
    "movq %3, %%mm2 \n\t" \
    "movq %3, %%mm3 \n\t" \
    "punpcklbw %%mm7, %%mm0 \n\t" \
    "punpckhbw %%mm7, %%mm1 \n\t" \
    "punpcklbw %%mm7, %%mm2 \n\t" \
    "punpckhbw %%mm7, %%mm3 \n\t" \
    "psubw %%mm2, %%mm0 \n\t" \
    "psubw %%mm3, %%mm1 \n\t" \
    "movq %1, %%mm2 \n\t" \
    "movq %1, %%mm3 \n\t" \
    "movq %2, %%mm4 \n\t" \
    "movq %2, %%mm5 \n\t" \
    "punpcklbw %%mm7, %%mm2 \n\t" \
    "punpckhbw %%mm7, %%mm3 \n\t" \
    "punpcklbw %%mm7, %%mm4 \n\t" \
    "punpckhbw %%mm7, %%mm5 \n\t" \
    "psubw %%mm2, %%mm4 \n\t" \
    "psubw %%mm3, %%mm5 \n\t" \
    "psllw $2, %%mm4 \n\t" \
    "psllw $2, %%mm5 \n\t" \
    "paddw %%mm0, %%mm4 \n\t" \
    "paddw %%mm1, %%mm5 \n\t" \
    "pxor %%mm6, %%mm6 \n\t" \
    "pcmpgtw %%mm4, %%mm6 \n\t" \
    "pcmpgtw %%mm5, %%mm7 \n\t" \
    "pxor %%mm6, %%mm4 \n\t" \
    "pxor %%mm7, %%mm5 \n\t" \
    "psubw %%mm6, %%mm4 \n\t" \
    "psubw %%mm7, %%mm5 \n\t" \
    "psrlw $3, %%mm4 \n\t" \
    "psrlw $3, %%mm5 \n\t" \
    "packuswb %%mm5, %%mm4 \n\t" \
    "packsswb %%mm7, %%mm6 \n\t" \
    "pxor %%mm7, %%mm7 \n\t" \
    "movd %4, %%mm2 \n\t" \
    "punpcklbw %%mm2, %%mm2 \n\t" \
    "punpcklbw %%mm2, %%mm2 \n\t" \
    "punpcklbw %%mm2, %%mm2 \n\t" \
    "psubusb %%mm4, %%mm2 \n\t" \
    "movq %%mm2, %%mm3 \n\t" \
    "psubusb %%mm4, %%mm3 \n\t" \
    "psubb %%mm3, %%mm2 \n\t" \
    "movq %1, %%mm3 \n\t" \
    "movq %2, %%mm4 \n\t" \
    "pxor %%mm6, %%mm3 \n\t" \
    "pxor %%mm6, %%mm4 \n\t" \
    "paddusb %%mm2, %%mm3 \n\t" \
    "psubusb %%mm2, %%mm4 \n\t" \
    "pxor %%mm6, %%mm3 \n\t" \
    "pxor %%mm6, %%mm4 \n\t" \
    "paddusb %%mm2, %%mm2 \n\t" \
    "packsswb %%mm1, %%mm0 \n\t" \
    "pcmpgtb %%mm0, %%mm7 \n\t" \
    "pxor %%mm7, %%mm0 \n\t" \
    "psubb %%mm7, %%mm0 \n\t" \
    "movq %%mm0, %%mm1 \n\t" \
    "psubusb %%mm2, %%mm0 \n\t" \
    "psubb %%mm0, %%mm1 \n\t" \
    "pand %5, %%mm1 \n\t" \
    "psrlw $2, %%mm1 \n\t" \
    "pxor %%mm7, %%mm1 \n\t" \
    "psubb %%mm7, %%mm1 \n\t" \
    "movq %0, %%mm5 \n\t" \
    "movq %3, %%mm6 \n\t" \
    "psubb %%mm1, %%mm5 \n\t" \
    "paddb %%mm1, %%mm6 \n\t"
00653
00654 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
00655 {
00656 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
00657 const int strength = ff_h263_loop_filter_strength[qscale];
00658
00659 __asm__ volatile (
00660 H263_LOOP_FILTER
00661
00662 "movq %%mm3, %1 \n\t"
00663 "movq %%mm4, %2 \n\t"
00664 "movq %%mm5, %0 \n\t"
00665 "movq %%mm6, %3 \n\t"
00666 : "+m"(*(uint64_t*)(src - 2 * stride)),
00667 "+m"(*(uint64_t*)(src - 1 * stride)),
00668 "+m"(*(uint64_t*)(src + 0 * stride)),
00669 "+m"(*(uint64_t*)(src + 1 * stride))
00670 : "g"(2 * strength), "m"(ff_pb_FC)
00671 );
00672 }
00673 }
00674
00675 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
00676 {
00677 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
00678 const int strength = ff_h263_loop_filter_strength[qscale];
00679 DECLARE_ALIGNED(8, uint64_t, temp)[4];
00680 uint8_t *btemp = (uint8_t*)temp;
00681
00682 src -= 2;
00683
00684 transpose4x4(btemp, src, 8, stride);
00685 transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
00686 __asm__ volatile (
00687 H263_LOOP_FILTER
00688
00689 : "+m"(temp[0]),
00690 "+m"(temp[1]),
00691 "+m"(temp[2]),
00692 "+m"(temp[3])
00693 : "g"(2 * strength), "m"(ff_pb_FC)
00694 );
00695
00696 __asm__ volatile (
00697 "movq %%mm5, %%mm1 \n\t"
00698 "movq %%mm4, %%mm0 \n\t"
00699 "punpcklbw %%mm3, %%mm5 \n\t"
00700 "punpcklbw %%mm6, %%mm4 \n\t"
00701 "punpckhbw %%mm3, %%mm1 \n\t"
00702 "punpckhbw %%mm6, %%mm0 \n\t"
00703 "movq %%mm5, %%mm3 \n\t"
00704 "movq %%mm1, %%mm6 \n\t"
00705 "punpcklwd %%mm4, %%mm5 \n\t"
00706 "punpcklwd %%mm0, %%mm1 \n\t"
00707 "punpckhwd %%mm4, %%mm3 \n\t"
00708 "punpckhwd %%mm0, %%mm6 \n\t"
00709 "movd %%mm5, (%0) \n\t"
00710 "punpckhdq %%mm5, %%mm5 \n\t"
00711 "movd %%mm5, (%0, %2) \n\t"
00712 "movd %%mm3, (%0, %2, 2) \n\t"
00713 "punpckhdq %%mm3, %%mm3 \n\t"
00714 "movd %%mm3, (%0, %3) \n\t"
00715 "movd %%mm1, (%1) \n\t"
00716 "punpckhdq %%mm1, %%mm1 \n\t"
00717 "movd %%mm1, (%1, %2) \n\t"
00718 "movd %%mm6, (%1, %2, 2) \n\t"
00719 "punpckhdq %%mm6, %%mm6 \n\t"
00720 "movd %%mm6, (%1, %3) \n\t"
00721 :: "r"(src),
00722 "r"(src + 4 * stride),
00723 "r"((x86_reg)stride),
00724 "r"((x86_reg)(3 * stride))
00725 );
00726 }
00727 }
00728
00729
00730
00731 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
00732 int w, int h, int sides)
00733 {
00734 uint8_t *ptr, *last_line;
00735 int i;
00736
00737 last_line = buf + (height - 1) * wrap;
00738
00739 ptr = buf;
00740 if (w == 8) {
00741 __asm__ volatile (
00742 "1: \n\t"
00743 "movd (%0), %%mm0 \n\t"
00744 "punpcklbw %%mm0, %%mm0 \n\t"
00745 "punpcklwd %%mm0, %%mm0 \n\t"
00746 "punpckldq %%mm0, %%mm0 \n\t"
00747 "movq %%mm0, -8(%0) \n\t"
00748 "movq -8(%0, %2), %%mm1 \n\t"
00749 "punpckhbw %%mm1, %%mm1 \n\t"
00750 "punpckhwd %%mm1, %%mm1 \n\t"
00751 "punpckhdq %%mm1, %%mm1 \n\t"
00752 "movq %%mm1, (%0, %2) \n\t"
00753 "add %1, %0 \n\t"
00754 "cmp %3, %0 \n\t"
00755 "jb 1b \n\t"
00756 : "+r"(ptr)
00757 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
00758 );
00759 } else if(w==16){
00760 __asm__ volatile (
00761 "1: \n\t"
00762 "movd (%0), %%mm0 \n\t"
00763 "punpcklbw %%mm0, %%mm0 \n\t"
00764 "punpcklwd %%mm0, %%mm0 \n\t"
00765 "punpckldq %%mm0, %%mm0 \n\t"
00766 "movq %%mm0, -8(%0) \n\t"
00767 "movq %%mm0, -16(%0) \n\t"
00768 "movq -8(%0, %2), %%mm1 \n\t"
00769 "punpckhbw %%mm1, %%mm1 \n\t"
00770 "punpckhwd %%mm1, %%mm1 \n\t"
00771 "punpckhdq %%mm1, %%mm1 \n\t"
00772 "movq %%mm1, (%0, %2) \n\t"
00773 "movq %%mm1, 8(%0, %2) \n\t"
00774 "add %1, %0 \n\t"
00775 "cmp %3, %0 \n\t"
00776 "jb 1b \n\t"
00777 : "+r"(ptr)
00778 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
00779 );
00780 } else {
00781 av_assert1(w == 4);
00782 __asm__ volatile (
00783 "1: \n\t"
00784 "movd (%0), %%mm0 \n\t"
00785 "punpcklbw %%mm0, %%mm0 \n\t"
00786 "punpcklwd %%mm0, %%mm0 \n\t"
00787 "movd %%mm0, -4(%0) \n\t"
00788 "movd -4(%0, %2), %%mm1 \n\t"
00789 "punpcklbw %%mm1, %%mm1 \n\t"
00790 "punpckhwd %%mm1, %%mm1 \n\t"
00791 "punpckhdq %%mm1, %%mm1 \n\t"
00792 "movd %%mm1, (%0, %2) \n\t"
00793 "add %1, %0 \n\t"
00794 "cmp %3, %0 \n\t"
00795 "jb 1b \n\t"
00796 : "+r"(ptr)
00797 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
00798 );
00799 }
00800
00801
00802 if (sides & EDGE_TOP) {
00803 for (i = 0; i < h; i += 4) {
00804 ptr = buf - (i + 1) * wrap - w;
00805 __asm__ volatile (
00806 "1: \n\t"
00807 "movq (%1, %0), %%mm0 \n\t"
00808 "movq %%mm0, (%0) \n\t"
00809 "movq %%mm0, (%0, %2) \n\t"
00810 "movq %%mm0, (%0, %2, 2) \n\t"
00811 "movq %%mm0, (%0, %3) \n\t"
00812 "add $8, %0 \n\t"
00813 "cmp %4, %0 \n\t"
00814 "jb 1b \n\t"
00815 : "+r"(ptr)
00816 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
00817 "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
00818 );
00819 }
00820 }
00821
00822 if (sides & EDGE_BOTTOM) {
00823 for (i = 0; i < h; i += 4) {
00824 ptr = last_line + (i + 1) * wrap - w;
00825 __asm__ volatile (
00826 "1: \n\t"
00827 "movq (%1, %0), %%mm0 \n\t"
00828 "movq %%mm0, (%0) \n\t"
00829 "movq %%mm0, (%0, %2) \n\t"
00830 "movq %%mm0, (%0, %2, 2) \n\t"
00831 "movq %%mm0, (%0, %3) \n\t"
00832 "add $8, %0 \n\t"
00833 "cmp %4, %0 \n\t"
00834 "jb 1b \n\t"
00835 : "+r"(ptr)
00836 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
00837 "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
00838 "r"(ptr + width + 2 * w)
00839 );
00840 }
00841 }
00842 }
00843
/*
 * Asm fragment producing one output line of a vertical low-pass filter.
 *
 * m3..m6 name registers holding four of the input lines, in0/in1/in2/in7
 * name memory operands for further lines, rnd is the rounding term added
 * before the arithmetic shift right by 5, and OP(reg, out, tmp, size) emits
 * the final store to out.
 *
 * The multipliers are always taken from ff_pw_20 and ff_pw_3; the pw_20 and
 * pw_3 parameters are not referenced in the expansion. mm4, mm5 and mm6 are
 * used as scratch, m3 is overwritten with in7, and mm7 is passed to OP as
 * its temporary.
 */
#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd, \
                   in0, in1, in2, in7, out, OP) \
    "paddw "#m4", "#m3" \n\t" \
    "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" \
    "pmullw "#m3", %%mm4 \n\t" \
    "movq "#in7", "#m3" \n\t" \
    "movq "#in0", %%mm5 \n\t" \
    "paddw "#m3", %%mm5 \n\t" \
    "psubw %%mm5, %%mm4 \n\t" \
    "movq "#in1", %%mm5 \n\t" \
    "movq "#in2", %%mm6 \n\t" \
    "paddw "#m6", %%mm5 \n\t" \
    "paddw "#m5", %%mm6 \n\t" \
    "paddw %%mm6, %%mm6 \n\t" \
    "psubw %%mm6, %%mm5 \n\t" \
    "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" \
    "paddw "#rnd", %%mm4 \n\t" \
    "paddw %%mm4, %%mm5 \n\t" \
    "psraw $5, %%mm5 \n\t" \
    "packuswb %%mm5, %%mm5 \n\t" \
    OP(%%mm5, out, %%mm7, d)
00865
/*
 * Define the horizontal low-pass filter functions
 *     OPNAME ## mpeg4_qpel16_h_lowpass_mmxext  (16 output bytes per row)
 *     OPNAME ## mpeg4_qpel8_h_lowpass_mmxext   ( 8 output bytes per row)
 *
 * Both process h rows, advancing src by srcStride and dst by dstStride
 * after each row. Sums are built from multiples of ff_pw_20 and ff_pw_3,
 * ROUNDER is added, and the result is shifted right arithmetically by 5 and
 * packed to bytes with unsigned saturation. OP_MMXEXT(reg, mem, tmp, size)
 * emits the final store to dst. The RND parameter is not referenced in this
 * macro.
 *
 * Per row the 16-wide version reads src[0..16] (loads at offsets 0, 5 and 9)
 * and the 8-wide version reads src[0..8] (loads at offsets 0 and 5).
 *
 * NOTE(review): in the 16-wide version `temp` is written by the asm
 * ("movq %%mm0, %5") although it is listed as an input operand; the
 * "memory" clobber is what covers that store. Confirm before touching the
 * constraints, since moving it would renumber the operands.
 */
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMXEXT) \
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, \
                                                    uint8_t *src, \
                                                    int dstStride, \
                                                    int srcStride, \
                                                    int h) \
{ \
    uint64_t temp; \
    \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "1: \n\t" \
        "movq (%0), %%mm0 \n\t" \
        "movq %%mm0, %%mm1 \n\t" \
        "movq %%mm0, %%mm2 \n\t" \
        "punpcklbw %%mm7, %%mm0 \n\t" \
        "punpckhbw %%mm7, %%mm1 \n\t" \
        "pshufw $0x90, %%mm0, %%mm5 \n\t" \
        "pshufw $0x41, %%mm0, %%mm6 \n\t" \
        "movq %%mm2, %%mm3 \n\t" \
        "movq %%mm2, %%mm4 \n\t" \
        "psllq $8, %%mm2 \n\t" \
        "psllq $16, %%mm3 \n\t" \
        "psllq $24, %%mm4 \n\t" \
        "punpckhbw %%mm7, %%mm2 \n\t" \
        "punpckhbw %%mm7, %%mm3 \n\t" \
        "punpckhbw %%mm7, %%mm4 \n\t" \
        "paddw %%mm3, %%mm5 \n\t" \
        "paddw %%mm2, %%mm6 \n\t" \
        "paddw %%mm5, %%mm5 \n\t" \
        "psubw %%mm5, %%mm6 \n\t" \
        "pshufw $0x06, %%mm0, %%mm5 \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
        "paddw %%mm4, %%mm0 \n\t" \
        "paddw %%mm1, %%mm5 \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
        "psubw %%mm5, %%mm0 \n\t" \
        "paddw %6, %%mm6 \n\t" \
        "paddw %%mm6, %%mm0 \n\t" \
        "psraw $5, %%mm0 \n\t" \
        /* park the first four results in temp until the next four exist */ \
        "movq %%mm0, %5 \n\t" \
        \
        \
        "movq 5(%0), %%mm0 \n\t" \
        "movq %%mm0, %%mm5 \n\t" \
        "movq %%mm0, %%mm6 \n\t" \
        "psrlq $8, %%mm0 \n\t" \
        "psrlq $16, %%mm5 \n\t" \
        "punpcklbw %%mm7, %%mm0 \n\t" \
        "punpcklbw %%mm7, %%mm5 \n\t" \
        "paddw %%mm0, %%mm2 \n\t" \
        "paddw %%mm5, %%mm3 \n\t" \
        "paddw %%mm2, %%mm2 \n\t" \
        "psubw %%mm2, %%mm3 \n\t" \
        "movq %%mm6, %%mm2 \n\t" \
        "psrlq $24, %%mm6 \n\t" \
        "punpcklbw %%mm7, %%mm2 \n\t" \
        "punpcklbw %%mm7, %%mm6 \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
        "paddw %%mm2, %%mm1 \n\t" \
        "paddw %%mm6, %%mm4 \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
        "psubw %%mm4, %%mm3 \n\t" \
        "paddw %6, %%mm1 \n\t" \
        "paddw %%mm1, %%mm3 \n\t" \
        "psraw $5, %%mm3 \n\t" \
        "movq %5, %%mm1 \n\t" \
        "packuswb %%mm3, %%mm1 \n\t" \
        /* output bytes 0..7 of this row */ \
        OP_MMXEXT(%%mm1, (%1), %%mm4, q) \
        \
        \
        "movq 9(%0), %%mm1 \n\t" \
        "movq %%mm1, %%mm4 \n\t" \
        "movq %%mm1, %%mm3 \n\t" \
        "psrlq $8, %%mm1 \n\t" \
        "psrlq $16, %%mm4 \n\t" \
        "punpcklbw %%mm7, %%mm1 \n\t" \
        "punpcklbw %%mm7, %%mm4 \n\t" \
        "paddw %%mm1, %%mm5 \n\t" \
        "paddw %%mm4, %%mm0 \n\t" \
        "paddw %%mm5, %%mm5 \n\t" \
        "psubw %%mm5, %%mm0 \n\t" \
        "movq %%mm3, %%mm5 \n\t" \
        "psrlq $24, %%mm3 \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" \
        "punpcklbw %%mm7, %%mm3 \n\t" \
        "paddw %%mm3, %%mm2 \n\t" \
        "psubw %%mm2, %%mm0 \n\t" \
        "movq %%mm5, %%mm2 \n\t" \
        "punpcklbw %%mm7, %%mm2 \n\t" \
        "punpckhbw %%mm7, %%mm5 \n\t" \
        "paddw %%mm2, %%mm6 \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" \
        "paddw %6, %%mm0 \n\t" \
        "paddw %%mm6, %%mm0 \n\t" \
        "psraw $5, %%mm0 \n\t" \
        \
        \
        \
        "paddw %%mm5, %%mm3 \n\t" \
        "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
        "paddw %%mm4, %%mm6 \n\t" \
        "pshufw $0xBE, %%mm5, %%mm4 \n\t" \
        "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
        "paddw %%mm1, %%mm4 \n\t" \
        "paddw %%mm2, %%mm5 \n\t" \
        "paddw %%mm6, %%mm6 \n\t" \
        "psubw %%mm6, %%mm4 \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" \
        "psubw %%mm5, %%mm3 \n\t" \
        "paddw %6, %%mm4 \n\t" \
        "paddw %%mm3, %%mm4 \n\t" \
        "psraw $5, %%mm4 \n\t" \
        "packuswb %%mm4, %%mm0 \n\t" \
        /* output bytes 8..15 of this row */ \
        OP_MMXEXT(%%mm0, 8(%1), %%mm4, q) \
        \
        "add %3, %0 \n\t" \
        "add %4, %1 \n\t" \
        "decl %2 \n\t" \
        "jnz 1b \n\t" \
        : "+a"(src), "+c"(dst), "+D"(h) \
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), \
          "m"(temp), "m"(ROUNDER) \
        : "memory" \
        ); \
} \
\
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, \
                                                   uint8_t *src, \
                                                   int dstStride, \
                                                   int srcStride, \
                                                   int h) \
{ \
    __asm__ volatile ( \
        "pxor %%mm7, %%mm7 \n\t" \
        "1: \n\t" \
        "movq (%0), %%mm0 \n\t" \
        "movq %%mm0, %%mm1 \n\t" \
        "movq %%mm0, %%mm2 \n\t" \
        "punpcklbw %%mm7, %%mm0 \n\t" \
        "punpckhbw %%mm7, %%mm1 \n\t" \
        "pshufw $0x90, %%mm0, %%mm5 \n\t" \
        "pshufw $0x41, %%mm0, %%mm6 \n\t" \
        "movq %%mm2, %%mm3 \n\t" \
        "movq %%mm2, %%mm4 \n\t" \
        "psllq $8, %%mm2 \n\t" \
        "psllq $16, %%mm3 \n\t" \
        "psllq $24, %%mm4 \n\t" \
        "punpckhbw %%mm7, %%mm2 \n\t" \
        "punpckhbw %%mm7, %%mm3 \n\t" \
        "punpckhbw %%mm7, %%mm4 \n\t" \
        "paddw %%mm3, %%mm5 \n\t" \
        "paddw %%mm2, %%mm6 \n\t" \
        "paddw %%mm5, %%mm5 \n\t" \
        "psubw %%mm5, %%mm6 \n\t" \
        "pshufw $0x06, %%mm0, %%mm5 \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
        "paddw %%mm4, %%mm0 \n\t" \
        "paddw %%mm1, %%mm5 \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
        "psubw %%mm5, %%mm0 \n\t" \
        "paddw %5, %%mm6 \n\t" \
        "paddw %%mm6, %%mm0 \n\t" \
        "psraw $5, %%mm0 \n\t" \
        \
        \
        "movd 5(%0), %%mm5 \n\t" \
        "punpcklbw %%mm7, %%mm5 \n\t" \
        "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
        "paddw %%mm5, %%mm1 \n\t" \
        "paddw %%mm6, %%mm2 \n\t" \
        "pshufw $0xBE, %%mm5, %%mm6 \n\t" \
        "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
        "paddw %%mm6, %%mm3 \n\t" \
        "paddw %%mm5, %%mm4 \n\t" \
        "paddw %%mm2, %%mm2 \n\t" \
        "psubw %%mm2, %%mm3 \n\t" \
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
        "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
        "psubw %%mm4, %%mm3 \n\t" \
        "paddw %5, %%mm1 \n\t" \
        "paddw %%mm1, %%mm3 \n\t" \
        "psraw $5, %%mm3 \n\t" \
        "packuswb %%mm3, %%mm0 \n\t" \
        /* output bytes 0..7 of this row */ \
        OP_MMXEXT(%%mm0, (%1), %%mm4, q) \
        \
        "add %3, %0 \n\t" \
        "add %4, %1 \n\t" \
        "decl %2 \n\t" \
        "jnz 1b \n\t" \
        : "+a"(src), "+c"(dst), "+d"(h) \
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), \
          "m"(ROUNDER) \
        : "memory" \
        ); \
}
01063
/*
 * QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)
 *
 * Expands to a family of static functions whose names are built by token
 * pasting:
 *   - OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX and
 *     OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX: vertical lowpass filters
 *     written in inline MMX assembly;
 *   - OPNAME ## qpel8_mcXY_ ## MMX and OPNAME ## qpel16_mcXY_ ## MMX:
 *     sixteen entry points (mc00 .. mc33) per block size that combine the
 *     lowpass filters with the pixels*_ helpers.
 *
 * OPNAME   prefix of the generated functions and of the helper that performs
 *          the final store into dst.
 * ROUNDER  object handed to the vertical filters as an "m" asm operand (%5)
 *          and forwarded to QPEL_V_LOW.
 * RND      infix of the put ## RND ## ... helpers used for intermediates.
 * OP       store macro forwarded to QPEL_V_LOW.
 * MMX      suffix of the generated functions and of the helpers they call.
 *
 * NOTE(review): QPEL_V_LOW, the *_h_lowpass_* helpers and the pixels*_
 * helpers are defined outside this macro; the comments below only describe
 * how they are invoked here.
 */
01064 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) \
/* 16-pixel-wide vertical lowpass: 17 source rows in, 16 rows written. */ \
01065 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, \
01066 uint8_t *src, \
01067 int dstStride, \
01068 int srcStride) \
01069 { \
01070 uint64_t temp[17 * 4]; \
01071 uint64_t *temp_ptr = temp; \
01072 int count = 17; \
01073 \
01074 \
/* Pass 1: widen 17 rows of 16 source bytes to 16-bit words (mm7 is zero). */ \
/* Each group of 4 pixels is stored in its own plane of 17 qwords in temp. */ \
01075 __asm__ volatile ( \
01076 "pxor %%mm7, %%mm7 \n\t" \
01077 "1: \n\t" \
01078 "movq (%0), %%mm0 \n\t" \
01079 "movq (%0), %%mm1 \n\t" \
01080 "movq 8(%0), %%mm2 \n\t" \
01081 "movq 8(%0), %%mm3 \n\t" \
01082 "punpcklbw %%mm7, %%mm0 \n\t" \
01083 "punpckhbw %%mm7, %%mm1 \n\t" \
01084 "punpcklbw %%mm7, %%mm2 \n\t" \
01085 "punpckhbw %%mm7, %%mm3 \n\t" \
01086 "movq %%mm0, (%1) \n\t" \
01087 "movq %%mm1, 17 * 8(%1) \n\t" \
01088 "movq %%mm2, 2 * 17 * 8(%1) \n\t" \
01089 "movq %%mm3, 3 * 17 * 8(%1) \n\t" \
01090 "add $8, %1 \n\t" \
01091 "add %3, %0 \n\t" \
01092 "decl %2 \n\t" \
01093 "jnz 1b \n\t" \
01094 : "+r"(src), "+r"(temp_ptr), "+r"(count) \
01095 : "r"((x86_reg)srcStride) \
01096 : "memory" \
01097 ); \
01098 \
01099 temp_ptr = temp; \
01100 count = 4; \
01101 \
01102 \
/* Pass 2: one iteration per plane (count = 4), 16 QPEL_V_LOW steps each. */ \
/* %3 = dstStride, %4 = 2 * dstStride, %5 = ROUNDER; %6 = 4 - 14 * dstStride */ \
/* undoes the seven "add %4" row advances and moves dst 4 bytes to the right. */ \
01103 __asm__ volatile ( \
01104 \
01105 "1: \n\t" \
01106 "movq (%0), %%mm0 \n\t" \
01107 "movq 8(%0), %%mm1 \n\t" \
01108 "movq 16(%0), %%mm2 \n\t" \
01109 "movq 24(%0), %%mm3 \n\t" \
01110 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
01111 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
01112 "add %4, %1 \n\t" \
01113 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
01114 \
01115 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
01116 "add %4, %1 \n\t" \
01117 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
01118 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP) \
01119 "add %4, %1 \n\t" \
01120 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP) \
01121 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP) \
01122 "add %4, %1 \n\t" \
01123 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP) \
01124 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0), 104(%0), (%1, %3), OP) \
01125 "add %4, %1 \n\t" \
01126 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0), 112(%0), (%1), OP) \
01127 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0), 120(%0), (%1, %3), OP) \
01128 "add %4, %1 \n\t" \
01129 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0), 128(%0), (%1), OP) \
01130 \
01131 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0), 128(%0), (%1, %3), OP) \
01132 "add %4, %1 \n\t" \
01133 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0), 104(%0), 120(%0), (%1), OP) \
01134 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
01135 \
01136 "add $136, %0 \n\t" \
01137 "add %6, %1 \n\t" \
01138 "decl %2 \n\t" \
01139 "jnz 1b \n\t" \
01140 \
01141 : "+r"(temp_ptr), "+r"(dst), "+g"(count) \
01142 : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
01143 "m"(ROUNDER), \
01144 "g"(4 - 14 * (x86_reg)dstStride) \
01145 : "memory" \
01146 ); \
01147 } \
01148 \
/* 8-pixel-wide vertical lowpass: 9 source rows in, 8 rows written. */ \
01149 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, \
01150 uint8_t *src, \
01151 int dstStride, \
01152 int srcStride) \
01153 { \
01154 uint64_t temp[9 * 2]; \
01155 uint64_t *temp_ptr = temp; \
01156 int count = 9; \
01157 \
01158 \
/* Pass 1: widen 9 rows of 8 source bytes into two planes of 9 qwords. */ \
01159 __asm__ volatile ( \
01160 "pxor %%mm7, %%mm7 \n\t" \
01161 "1: \n\t" \
01162 "movq (%0), %%mm0 \n\t" \
01163 "movq (%0), %%mm1 \n\t" \
01164 "punpcklbw %%mm7, %%mm0 \n\t" \
01165 "punpckhbw %%mm7, %%mm1 \n\t" \
01166 "movq %%mm0, (%1) \n\t" \
01167 "movq %%mm1, 9*8(%1) \n\t" \
01168 "add $8, %1 \n\t" \
01169 "add %3, %0 \n\t" \
01170 "decl %2 \n\t" \
01171 "jnz 1b \n\t" \
01172 : "+r"(src), "+r"(temp_ptr), "+r"(count) \
01173 : "r"((x86_reg)srcStride) \
01174 : "memory" \
01175 ); \
01176 \
01177 temp_ptr = temp; \
01178 count = 2; \
01179 \
01180 \
/* Pass 2: one iteration per plane (count = 2), 8 QPEL_V_LOW steps each. */ \
/* %6 = 4 - 6 * dstStride undoes the three "add %4" row advances and moves */ \
/* dst 4 bytes to the right. */ \
01181 __asm__ volatile ( \
01182 \
01183 "1: \n\t" \
01184 "movq (%0), %%mm0 \n\t" \
01185 "movq 8(%0), %%mm1 \n\t" \
01186 "movq 16(%0), %%mm2 \n\t" \
01187 "movq 24(%0), %%mm3 \n\t" \
01188 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP) \
01189 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP) \
01190 "add %4, %1 \n\t" \
01191 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP) \
01192 \
01193 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP) \
01194 "add %4, %1 \n\t" \
01195 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP) \
01196 \
01197 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
01198 "add %4, %1 \n\t" \
01199 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP) \
01200 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
01201 \
01202 "add $72, %0 \n\t" \
01203 "add %6, %1 \n\t" \
01204 "decl %2 \n\t" \
01205 "jnz 1b \n\t" \
01206 \
01207 : "+r"(temp_ptr), "+r"(dst), "+g"(count) \
01208 : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride), \
01209 "m"(ROUNDER), \
01210 "g"(4 - 6 * (x86_reg)dstStride) \
01211 : "memory" \
01212 ); \
01213 } \
01214 \
/* 8x8 entry points. NOTE(review): the two digits of mcXY presumably encode */ \
/* the horizontal and vertical quarter-pel phase -- confirm against callers. */ \
01215 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
01216 int stride) \
01217 { \
01218 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
01219 } \
01220 \
01221 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
01222 int stride) \
01223 { \
01224 uint64_t temp[8]; \
01225 uint8_t * const half = (uint8_t*)temp; \
01226 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
01227 stride, 8); \
01228 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
01229 } \
01230 \
01231 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
01232 int stride) \
01233 { \
01234 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
01235 stride, 8); \
01236 } \
01237 \
01238 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
01239 int stride) \
01240 { \
01241 uint64_t temp[8]; \
01242 uint8_t * const half = (uint8_t*)temp; \
01243 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
01244 stride, 8); \
01245 OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
01246 stride, 8); \
01247 } \
01248 \
01249 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
01250 int stride) \
01251 { \
01252 uint64_t temp[8]; \
01253 uint8_t * const half = (uint8_t*)temp; \
01254 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
01255 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8); \
01256 } \
01257 \
01258 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
01259 int stride) \
01260 { \
01261 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride); \
01262 } \
01263 \
01264 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
01265 int stride) \
01266 { \
01267 uint64_t temp[8]; \
01268 uint8_t * const half = (uint8_t*)temp; \
01269 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride); \
01270 OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride, \
01271 stride, 8); \
01272 } \
01273 \
/* The variants below keep halfHV in the first 64 bytes of half[] and the */ \
/* 9-row, 8-byte-stride halfH directly behind it. */ \
01274 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
01275 int stride) \
01276 { \
01277 uint64_t half[8 + 9]; \
01278 uint8_t * const halfH = ((uint8_t*)half) + 64; \
01279 uint8_t * const halfHV = ((uint8_t*)half); \
01280 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01281 stride, 9); \
01282 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
01283 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
01284 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
01285 } \
01286 \
01287 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
01288 int stride) \
01289 { \
01290 uint64_t half[8 + 9]; \
01291 uint8_t * const halfH = ((uint8_t*)half) + 64; \
01292 uint8_t * const halfHV = ((uint8_t*)half); \
01293 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01294 stride, 9); \
01295 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
01296 stride, 9); \
01297 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
01298 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
01299 } \
01300 \
01301 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
01302 int stride) \
01303 { \
01304 uint64_t half[8 + 9]; \
01305 uint8_t * const halfH = ((uint8_t*)half) + 64; \
01306 uint8_t * const halfHV = ((uint8_t*)half); \
01307 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01308 stride, 9); \
01309 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
01310 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
01311 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
01312 } \
01313 \
01314 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
01315 int stride) \
01316 { \
01317 uint64_t half[8 + 9]; \
01318 uint8_t * const halfH = ((uint8_t*)half) + 64; \
01319 uint8_t * const halfHV = ((uint8_t*)half); \
01320 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01321 stride, 9); \
01322 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
01323 stride, 9); \
01324 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
01325 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
01326 } \
01327 \
01328 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
01329 int stride) \
01330 { \
01331 uint64_t half[8 + 9]; \
01332 uint8_t * const halfH = ((uint8_t*)half) + 64; \
01333 uint8_t * const halfHV = ((uint8_t*)half); \
01334 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01335 stride, 9); \
01336 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
01337 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8); \
01338 } \
01339 \
01340 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
01341 int stride) \
01342 { \
01343 uint64_t half[8 + 9]; \
01344 uint8_t * const halfH = ((uint8_t*)half) + 64; \
01345 uint8_t * const halfHV = ((uint8_t*)half); \
01346 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01347 stride, 9); \
01348 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8); \
01349 OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
01350 } \
01351 \
01352 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
01353 int stride) \
01354 { \
01355 uint64_t half[8 + 9]; \
01356 uint8_t * const halfH = ((uint8_t*)half); \
01357 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01358 stride, 9); \
01359 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9); \
01360 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
01361 } \
01362 \
01363 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
01364 int stride) \
01365 { \
01366 uint64_t half[8 + 9]; \
01367 uint8_t * const halfH = ((uint8_t*)half); \
01368 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01369 stride, 9); \
01370 put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
01371 stride, 9); \
01372 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
01373 } \
01374 \
01375 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
01376 int stride) \
01377 { \
01378 uint64_t half[9]; \
01379 uint8_t * const halfH = ((uint8_t*)half); \
01380 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
01381 stride, 9); \
01382 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8); \
01383 } \
01384 \
/* 16x16 entry points: same call structure as the 8x8 ones, with 16-byte */ \
/* intermediate rows. */ \
01385 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
01386 int stride) \
01387 { \
01388 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
01389 } \
01390 \
01391 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
01392 int stride) \
01393 { \
01394 uint64_t temp[32]; \
01395 uint8_t * const half = (uint8_t*)temp; \
01396 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
01397 stride, 16); \
01398 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
01399 } \
01400 \
01401 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
01402 int stride) \
01403 { \
01404 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
01405 stride, stride, 16); \
01406 } \
01407 \
01408 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
01409 int stride) \
01410 { \
01411 uint64_t temp[32]; \
01412 uint8_t * const half = (uint8_t*)temp; \
01413 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
01414 stride, 16); \
01415 OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
01416 stride, stride, 16); \
01417 } \
01418 \
01419 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
01420 int stride) \
01421 { \
01422 uint64_t temp[32]; \
01423 uint8_t * const half = (uint8_t*)temp; \
01424 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
01425 stride); \
01426 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16); \
01427 } \
01428 \
01429 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
01430 int stride) \
01431 { \
01432 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
01433 } \
01434 \
01435 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
01436 int stride) \
01437 { \
01438 uint64_t temp[32]; \
01439 uint8_t * const half = (uint8_t*)temp; \
01440 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
01441 stride); \
01442 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
01443 stride, stride, 16); \
01444 } \
01445 \
/* The variants below keep halfHV in the first 256 bytes of half[] and the */ \
/* 17-row, 16-byte-stride halfH directly behind it. */ \
01446 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
01447 int stride) \
01448 { \
01449 uint64_t half[16 * 2 + 17 * 2]; \
01450 uint8_t * const halfH = ((uint8_t*)half) + 256; \
01451 uint8_t * const halfHV = ((uint8_t*)half); \
01452 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01453 stride, 17); \
01454 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
01455 stride, 17); \
01456 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
01457 16, 16); \
01458 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
01459 } \
01460 \
01461 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
01462 int stride) \
01463 { \
01464 uint64_t half[16 * 2 + 17 * 2]; \
01465 uint8_t * const halfH = ((uint8_t*)half) + 256; \
01466 uint8_t * const halfHV = ((uint8_t*)half); \
01467 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01468 stride, 17); \
01469 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
01470 stride, 17); \
01471 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
01472 16, 16); \
01473 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
01474 } \
01475 \
01476 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
01477 int stride) \
01478 { \
01479 uint64_t half[16 * 2 + 17 * 2]; \
01480 uint8_t * const halfH = ((uint8_t*)half) + 256; \
01481 uint8_t * const halfHV = ((uint8_t*)half); \
01482 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01483 stride, 17); \
01484 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
01485 stride, 17); \
01486 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
01487 16, 16); \
01488 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
01489 16, 16); \
01490 } \
01491 \
01492 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
01493 int stride) \
01494 { \
01495 uint64_t half[16 * 2 + 17 * 2]; \
01496 uint8_t * const halfH = ((uint8_t*)half) + 256; \
01497 uint8_t * const halfHV = ((uint8_t*)half); \
01498 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01499 stride, 17); \
01500 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
01501 stride, 17); \
01502 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
01503 16, 16); \
01504 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
01505 16, 16); \
01506 } \
01507 \
01508 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
01509 int stride) \
01510 { \
01511 uint64_t half[16 * 2 + 17 * 2]; \
01512 uint8_t * const halfH = ((uint8_t*)half) + 256; \
01513 uint8_t * const halfHV = ((uint8_t*)half); \
01514 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01515 stride, 17); \
01516 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
01517 16, 16); \
01518 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16); \
01519 } \
01520 \
01521 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
01522 int stride) \
01523 { \
01524 uint64_t half[16 * 2 + 17 * 2]; \
01525 uint8_t * const halfH = ((uint8_t*)half) + 256; \
01526 uint8_t * const halfHV = ((uint8_t*)half); \
01527 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01528 stride, 17); \
01529 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
01530 16, 16); \
01531 OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride, \
01532 16, 16); \
01533 } \
01534 \
01535 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
01536 int stride) \
01537 { \
01538 uint64_t half[17 * 2]; \
01539 uint8_t * const halfH = ((uint8_t*)half); \
01540 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01541 stride, 17); \
01542 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
01543 stride, 17); \
01544 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
01545 } \
01546 \
01547 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
01548 int stride) \
01549 { \
01550 uint64_t half[17 * 2]; \
01551 uint8_t * const halfH = ((uint8_t*)half); \
01552 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01553 stride, 17); \
01554 put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
01555 stride, 17); \
01556 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
01557 } \
01558 \
01559 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
01560 int stride) \
01561 { \
01562 uint64_t half[17 * 2]; \
01563 uint8_t * const halfH = ((uint8_t*)half); \
01564 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
01565 stride, 17); \
01566 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16); \
01567 }
01568
/*
 * PUT_OP(a, b, temp, size): asm string fragment that stores a to b with a
 * single "mov<size>" instruction. temp is accepted but not used.
 */
01569 #define PUT_OP(a, b, temp, size) \
01570 "mov"#size" "#a", "#b" \n\t"
01571
/*
 * AVG_MMXEXT_OP(a, b, temp, size): asm string fragment that loads b into
 * temp with "mov<size>", combines it into a with pavgb, and stores a back
 * to b. temp is overwritten.
 */
01572 #define AVG_MMXEXT_OP(a, b, temp, size) \
01573 "mov"#size" "#b", "#temp" \n\t" \
01574 "pavgb "#temp", "#a" \n\t" \
01575 "mov"#size" "#a", "#b" \n\t"
01576
/*
 * Instantiate the qpel function families for three prefixes: put_ and avg_
 * use ff_pw_16 as rounder, put_no_rnd_ uses ff_pw_15. The avg_ family stores
 * through AVG_MMXEXT_OP, the other two through PUT_OP.
 */
01577 QPEL_BASE(put_, ff_pw_16, _, PUT_OP)
01578 QPEL_BASE(avg_, ff_pw_16, _, AVG_MMXEXT_OP)
01579 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP)
01580 QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmxext)
01581 QPEL_OP(avg_, ff_pw_16, _, AVG_MMXEXT_OP, mmxext)
01582 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmxext)
01583
01584
01585
01586
/*
 * QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL): define
 * OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(dst, src, stride) as a
 * direct call to OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE).
 */
01587 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL) \
01588 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
01589 uint8_t *src, \
01590 int stride) \
01591 { \
01592 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE); \
01593 }
01594
/*
 * QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2): define
 * OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(dst, src, stride) as a
 * call to OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX with the source advanced
 * by S0 and with S1 and S2 passed as the last two arguments. S0..S2 are
 * expanded inside the generated function, so they may refer to stride.
 */
01595 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2) \
01596 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, \
01597 uint8_t *src, \
01598 int stride) \
01599 { \
01600 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src + S0, stride, SIZE, \
01601 S1, S2); \
01602 }
01603
/*
 * QPEL_2TAP(OPNAME, SIZE, MMX): generate the sixteen
 * OPNAME ## 2tap_qpel ## SIZE ## _mcXY_ ## MMX entries for one prefix and
 * block size:
 *   - mc20, mc02 and mc22 call the _x2_, _y2_ and _xy2_ pixel helpers (the
 *     xy2 one is always the _mmx variant, whatever MMX is);
 *   - mc00, mc21 and mc12 are function-pointer constants aliasing
 *     qpelSIZE_mc00, mc20 and mc02 respectively;
 *   - mc32 and mc23 call the _y2_ / _x2_ helpers on src + 1 / src + stride;
 *   - the remaining eight go through QPEL_2TAP_L3 with per-position offsets.
 *
 * NOTE(review): the last line of the macro ends with a continuation
 * backslash, so the blank line that follows it belongs to the definition.
 */
01604 #define QPEL_2TAP(OPNAME, SIZE, MMX) \
01605 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX) \
01606 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX) \
01607 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx) \
01608 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX = \
01609 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX; \
01610 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX = \
01611 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX; \
01612 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX = \
01613 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX; \
01614 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, \
01615 uint8_t *src, \
01616 int stride) \
01617 { \
01618 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src + 1, stride, SIZE); \
01619 } \
01620 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, \
01621 uint8_t *src, \
01622 int stride) \
01623 { \
01624 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src + stride, \
01625 stride, SIZE); \
01626 } \
01627 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0) \
01628 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0) \
01629 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0) \
01630 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0) \
01631 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1) \
01632 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1) \
01633 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1) \
01634 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride + 1, -stride, -1) \
01635
/* Instantiate the 2-tap qpel tables for put_/avg_ at sizes 16 and 8 (mmxext). */
01636 QPEL_2TAP(put_, 16, mmxext)
01637 QPEL_2TAP(avg_, 16, mmxext)
01638 QPEL_2TAP(put_, 8, mmxext)
01639 QPEL_2TAP(avg_, 8, mmxext)
01640
/**
 * RV40 8-pixel "mc33" put entry point.
 *
 * Thin forwarder to put_pixels8_xy2_mmx(); the block size 8 is passed as the
 * last argument.
 */
void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    enum { BLOCK_SIZE = 8 };

    put_pixels8_xy2_mmx(dst, src, stride, BLOCK_SIZE);
}
/**
 * RV40 16-pixel "mc33" put entry point.
 *
 * Thin forwarder to put_pixels16_xy2_mmx(); the block size 16 is passed as
 * the last argument.
 */
void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    enum { BLOCK_SIZE = 16 };

    put_pixels16_xy2_mmx(dst, src, stride, BLOCK_SIZE);
}
/**
 * RV40 8-pixel "mc33" avg entry point.
 *
 * Thin forwarder to avg_pixels8_xy2_mmx(); the block size 8 is passed as the
 * last argument.
 */
void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    enum { BLOCK_SIZE = 8 };

    avg_pixels8_xy2_mmx(dst, src, stride, BLOCK_SIZE);
}
/**
 * RV40 16-pixel "mc33" avg entry point.
 *
 * Thin forwarder to avg_pixels16_xy2_mmx(); the block size 16 is passed as
 * the last argument.
 */
void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
{
    enum { BLOCK_SIZE = 16 };

    avg_pixels16_xy2_mmx(dst, src, stride, BLOCK_SIZE);
}
01657
01658 #endif
01659
01660 #if HAVE_YASM
/*
 * Signature of the external edge-emulation cores. emulated_edge_mc() calls
 * them with the destination already advanced by start_x, the source pointing
 * at the first row/column to copy, and the [start, end) ranges plus the full
 * block dimensions widened to x86_reg.
 */
01661 typedef void emu_edge_core_func(uint8_t *buf, const uint8_t *src,
01662 x86_reg linesize, x86_reg start_y,
01663 x86_reg end_y, x86_reg block_h,
01664 x86_reg start_x, x86_reg end_x,
01665 x86_reg block_w);
/* The two cores are defined outside this file (declared extern here). */
01666 extern emu_edge_core_func ff_emu_edge_core_mmx;
01667 extern emu_edge_core_func ff_emu_edge_core_sse;
01668
01669 static av_always_inline void emulated_edge_mc(uint8_t *buf, const uint8_t *src,
01670 int linesize,
01671 int block_w, int block_h,
01672 int src_x, int src_y,
01673 int w, int h,
01674 emu_edge_core_func *core_fn)
01675 {
01676 int start_y, start_x, end_y, end_x, src_y_add = 0;
01677
01678 if(!w || !h)
01679 return;
01680
01681 if (src_y >= h) {
01682 src -= src_y*linesize;
01683 src_y_add = h - 1;
01684 src_y = h - 1;
01685 } else if (src_y <= -block_h) {
01686 src -= src_y*linesize;
01687 src_y_add = 1 - block_h;
01688 src_y = 1 - block_h;
01689 }
01690 if (src_x >= w) {
01691 src += w - 1 - src_x;
01692 src_x = w - 1;
01693 } else if (src_x <= -block_w) {
01694 src += 1 - block_w - src_x;
01695 src_x = 1 - block_w;
01696 }
01697
01698 start_y = FFMAX(0, -src_y);
01699 start_x = FFMAX(0, -src_x);
01700 end_y = FFMIN(block_h, h-src_y);
01701 end_x = FFMIN(block_w, w-src_x);
01702 av_assert2(start_x < end_x && block_w > 0);
01703 av_assert2(start_y < end_y && block_h > 0);
01704
01705
01706 src += (src_y_add + start_y) * linesize + start_x;
01707 buf += start_x;
01708 core_fn(buf, src, linesize, start_y, end_y,
01709 block_h, start_x, end_x, block_w);
01710 }
01711
01712 #if ARCH_X86_32
01713 static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src,
01714 int linesize,
01715 int block_w, int block_h,
01716 int src_x, int src_y, int w, int h)
01717 {
01718 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
01719 w, h, &ff_emu_edge_core_mmx);
01720 }
01721 #endif
01722
01723 static av_noinline void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src,
01724 int linesize,
01725 int block_w, int block_h,
01726 int src_x, int src_y, int w, int h)
01727 {
01728 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
01729 w, h, &ff_emu_edge_core_sse);
01730 }
01731 #endif
01732
01733 #if HAVE_INLINE_ASM
01734
/*
 * Signature of the edge-emulation routine that gmc() receives as its last
 * argument and calls when the block touches the picture border.
 */
01735 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
01736 int linesize, int block_w, int block_h,
01737 int src_x, int src_y, int w, int h);
01738
01739 static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
01740 int stride, int h, int ox, int oy,
01741 int dxx, int dxy, int dyx, int dyy,
01742 int shift, int r, int width, int height,
01743 emulated_edge_mc_func *emu_edge_fn)
01744 {
01745 const int w = 8;
01746 const int ix = ox >> (16 + shift);
01747 const int iy = oy >> (16 + shift);
01748 const int oxs = ox >> 4;
01749 const int oys = oy >> 4;
01750 const int dxxs = dxx >> 4;
01751 const int dxys = dxy >> 4;
01752 const int dyxs = dyx >> 4;
01753 const int dyys = dyy >> 4;
01754 const uint16_t r4[4] = { r, r, r, r };
01755 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
01756 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
01757 const uint64_t shift2 = 2 * shift;
01758 #define MAX_STRIDE 4096U
01759 #define MAX_H 8U
01760 uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
01761 int x, y;
01762
01763 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
01764 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
01765 const int dxh = dxy * (h - 1);
01766 const int dyw = dyx * (w - 1);
01767 int need_emu = (unsigned)ix >= width - w ||
01768 (unsigned)iy >= height - h;
01769
01770 if (
01771 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
01772 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
01773
01774 || (dxx | dxy | dyx | dyy) & 15
01775 || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
01776
01777 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
01778 shift, r, width, height);
01779 return;
01780 }
01781
01782 src += ix + iy * stride;
01783 if (need_emu) {
01784 emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
01785 src = edge_buf;
01786 }
01787
01788 __asm__ volatile (
01789 "movd %0, %%mm6 \n\t"
01790 "pxor %%mm7, %%mm7 \n\t"
01791 "punpcklwd %%mm6, %%mm6 \n\t"
01792 "punpcklwd %%mm6, %%mm6 \n\t"
01793 :: "r"(1<<shift)
01794 );
01795
01796 for (x = 0; x < w; x += 4) {
01797 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
01798 oxs - dxys + dxxs * (x + 1),
01799 oxs - dxys + dxxs * (x + 2),
01800 oxs - dxys + dxxs * (x + 3) };
01801 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
01802 oys - dyys + dyxs * (x + 1),
01803 oys - dyys + dyxs * (x + 2),
01804 oys - dyys + dyxs * (x + 3) };
01805
01806 for (y = 0; y < h; y++) {
01807 __asm__ volatile (
01808 "movq %0, %%mm4 \n\t"
01809 "movq %1, %%mm5 \n\t"
01810 "paddw %2, %%mm4 \n\t"
01811 "paddw %3, %%mm5 \n\t"
01812 "movq %%mm4, %0 \n\t"
01813 "movq %%mm5, %1 \n\t"
01814 "psrlw $12, %%mm4 \n\t"
01815 "psrlw $12, %%mm5 \n\t"
01816 : "+m"(*dx4), "+m"(*dy4)
01817 : "m"(*dxy4), "m"(*dyy4)
01818 );
01819
01820 __asm__ volatile (
01821 "movq %%mm6, %%mm2 \n\t"
01822 "movq %%mm6, %%mm1 \n\t"
01823 "psubw %%mm4, %%mm2 \n\t"
01824 "psubw %%mm5, %%mm1 \n\t"
01825 "movq %%mm2, %%mm0 \n\t"
01826 "movq %%mm4, %%mm3 \n\t"
01827 "pmullw %%mm1, %%mm0 \n\t"
01828 "pmullw %%mm5, %%mm3 \n\t"
01829 "pmullw %%mm5, %%mm2 \n\t"
01830 "pmullw %%mm4, %%mm1 \n\t"
01831
01832 "movd %4, %%mm5 \n\t"
01833 "movd %3, %%mm4 \n\t"
01834 "punpcklbw %%mm7, %%mm5 \n\t"
01835 "punpcklbw %%mm7, %%mm4 \n\t"
01836 "pmullw %%mm5, %%mm3 \n\t"
01837 "pmullw %%mm4, %%mm2 \n\t"
01838
01839 "movd %2, %%mm5 \n\t"
01840 "movd %1, %%mm4 \n\t"
01841 "punpcklbw %%mm7, %%mm5 \n\t"
01842 "punpcklbw %%mm7, %%mm4 \n\t"
01843 "pmullw %%mm5, %%mm1 \n\t"
01844 "pmullw %%mm4, %%mm0 \n\t"
01845 "paddw %5, %%mm1 \n\t"
01846 "paddw %%mm3, %%mm2 \n\t"
01847 "paddw %%mm1, %%mm0 \n\t"
01848 "paddw %%mm2, %%mm0 \n\t"
01849
01850 "psrlw %6, %%mm0 \n\t"
01851 "packuswb %%mm0, %%mm0 \n\t"
01852 "movd %%mm0, %0 \n\t"
01853
01854 : "=m"(dst[x + y * stride])
01855 : "m"(src[0]), "m"(src[1]),
01856 "m"(src[stride]), "m"(src[stride + 1]),
01857 "m"(*r4), "m"(shift2)
01858 );
01859 src += stride;
01860 }
01861 src += 4 - h * stride;
01862 }
01863 }
01864
01865 #if HAVE_YASM
01866 #if ARCH_X86_32
01867 static void gmc_mmx(uint8_t *dst, uint8_t *src,
01868 int stride, int h, int ox, int oy,
01869 int dxx, int dxy, int dyx, int dyy,
01870 int shift, int r, int width, int height)
01871 {
01872 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
01873 width, height, &emulated_edge_mc_mmx);
01874 }
01875 #endif
01876 static void gmc_sse(uint8_t *dst, uint8_t *src,
01877 int stride, int h, int ox, int oy,
01878 int dxx, int dxy, int dyx, int dyy,
01879 int shift, int r, int width, int height)
01880 {
01881 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
01882 width, height, &emulated_edge_mc_sse);
01883 }
01884 #else
01885 static void gmc_mmx(uint8_t *dst, uint8_t *src,
01886 int stride, int h, int ox, int oy,
01887 int dxx, int dxy, int dyx, int dyy,
01888 int shift, int r, int width, int height)
01889 {
01890 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
01891 width, height, &ff_emulated_edge_mc_8);
01892 }
01893 #endif
01894
/*
 * PREFETCH(name, op): define static void name(void *mem, int stride, int h),
 * which issues the instruction op on h addresses, starting at mem and
 * advancing by stride bytes after each one.
 *
 * The loop is counted, so h <= 0 issues nothing. The previous do/while form
 * only stopped when --h reached zero, which for h <= 0 meant walking through
 * the whole negative int range.
 */
#define PREFETCH(name, op)                                  \
static void name(void *mem, int stride, int h)              \
{                                                           \
    const uint8_t *p = mem;                                 \
    for (; h > 0; h--) {                                    \
        __asm__ volatile (#op" %0" :: "m"(*p));             \
        p += stride;                                        \
    }                                                       \
}
01904
01905 PREFETCH(prefetch_mmxext, prefetcht0)
01906 PREFETCH(prefetch_3dnow, prefetch)
01907 #undef PREFETCH
01908
01909 #endif
01910
01911 #include "h264_qpel.c"
01912
/*
 * Prototypes of the H.264 chroma MC routines whose names carry no bit-depth
 * suffix. Only declarations appear here; the definitions live outside this
 * part of the file. All of them share one signature:
 *
 *     void f(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y);
 */

/* "mc8_rnd" entries */
void ff_put_h264_chroma_mc8_rnd_mmx(uint8_t *dst, uint8_t *src,
                                    int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);

/* "mc4" entries */
void ff_put_h264_chroma_mc4_mmx(uint8_t *dst, uint8_t *src,
                                int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmxext(uint8_t *dst, uint8_t *src,
                                   int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
                                  int stride, int h, int x, int y);

/* "mc2" entries */
void ff_put_h264_chroma_mc2_mmxext(uint8_t *dst, uint8_t *src,
                                   int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmxext(uint8_t *dst, uint8_t *src,
                                   int stride, int h, int x, int y);

/* SSSE3 "put" entries */
void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3(uint8_t *dst, uint8_t *src,
                                  int stride, int h, int x, int y);

/* SSSE3 "avg" entries */
void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3(uint8_t *dst, uint8_t *src,
                                  int stride, int h, int x, int y);
01941
/*
 * CHROMA_MC(OP, NUM, DEPTH, OPT) declares the prototype
 *
 *     void ff_<OP>_h264_chroma_mc<NUM>_<DEPTH>_<OPT>(uint8_t *dst, uint8_t *src,
 *                                                    int stride, int h,
 *                                                    int x, int y);
 *
 * The expansion already ends in ';', so uses are written without one.
 */
#define CHROMA_MC(OP, NUM, DEPTH, OPT)                                  \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT      \
                                      (uint8_t *dst, uint8_t *src,      \
                                       int stride, int h, int x, int y);

/* "put" declarations, DEPTH = 10 */
CHROMA_MC(put, 2, 10, mmxext)
CHROMA_MC(put, 4, 10, mmxext)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)

/* "avg" declarations, DEPTH = 10 */
CHROMA_MC(avg, 2, 10, mmxext)
CHROMA_MC(avg, 4, 10, mmxext)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, avx)
01955
01956 #if HAVE_INLINE_ASM
01957
01958
/**
 * CAVS qpel8 "mc00" put entry: delegates to put_pixels8_mmx() with a fixed
 * height of 8 rows.
 */
void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 8 };

    put_pixels8_mmx(dst, src, stride, ROWS);
}
01963
/**
 * CAVS qpel8 "mc00" avg entry: delegates to avg_pixels8_mmx() with a fixed
 * height of 8 rows.
 */
void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 8 };

    avg_pixels8_mmx(dst, src, stride, ROWS);
}
01968
/**
 * CAVS qpel16 "mc00" put entry: delegates to put_pixels16_mmx() with a fixed
 * height of 16 rows.
 */
void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 16 };

    put_pixels16_mmx(dst, src, stride, ROWS);
}
01973
/**
 * CAVS qpel16 "mc00" avg entry: delegates to avg_pixels16_mmx() with a fixed
 * height of 16 rows.
 */
void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
{
    enum { ROWS = 16 };

    avg_pixels16_mmx(dst, src, stride, ROWS);
}
01978
01979
/**
 * VC-1 mspel "mc00" put entry: delegates to put_pixels8_mmx() with a fixed
 * height of 8 rows.
 *
 * @param rnd accepted for signature compatibility; not used by this function
 */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                               int stride, int rnd)
{
    enum { ROWS = 8 };

    put_pixels8_mmx(dst, src, stride, ROWS);
}
01985
/**
 * VC-1 mspel "mc00" avg entry: delegates to avg_pixels8_mmxext() with a fixed
 * height of 8 rows.
 *
 * @param rnd accepted for signature compatibility; not used by this function
 */
void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
                                  int stride, int rnd)
{
    enum { ROWS = 8 };

    avg_pixels8_mmxext(dst, src, stride, ROWS);
}
01991
01992
01993 static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
01994 {
01995
01996 MOVQ_BFE(mm6);
01997 __asm__ volatile(
01998 "1: \n\t"
01999 "movq (%1), %%mm0 \n\t"
02000 "movq (%2), %%mm1 \n\t"
02001 "movq (%1,%4), %%mm2 \n\t"
02002 "movq (%2,%4), %%mm3 \n\t"
02003 PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
02004 "movq %%mm4, (%3) \n\t"
02005 "movq %%mm5, (%3,%4) \n\t"
02006
02007 "movq (%1,%4,2), %%mm0 \n\t"
02008 "movq (%2,%4,2), %%mm1 \n\t"
02009 "movq (%1,%5), %%mm2 \n\t"
02010 "movq (%2,%5), %%mm3 \n\t"
02011 "lea (%1,%4,4), %1 \n\t"
02012 "lea (%2,%4,4), %2 \n\t"
02013 PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
02014 "movq %%mm4, (%3,%4,2) \n\t"
02015 "movq %%mm5, (%3,%5) \n\t"
02016 "lea (%3,%4,4), %3 \n\t"
02017 "subl $4, %0 \n\t"
02018 "jnz 1b \n\t"
02019 :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
02020 :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
02021 :"memory");
02022
02023 }
/**
 * 16-byte-wide variant of put_vp_no_rnd_pixels8_l2_mmx(): the 8-wide routine
 * is run once on the left half (byte offset 0) and once on the right half
 * (byte offset 8) of dst, a and b, with the same stride and height.
 */
static void put_vp_no_rnd_pixels16_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    int half;

    for (half = 0; half < 16; half += 8)
        put_vp_no_rnd_pixels8_l2_mmx(dst + half, a + half, b + half, stride, h);
}
02029
02030 #if CONFIG_DIRAC_DECODER
/*
 * DIRAC_PIXOP(OPNAME, EXT) defines three public functions
 *
 *     ff_<OPNAME>_dirac_pixels8_<EXT>
 *     ff_<OPNAME>_dirac_pixels16_<EXT>
 *     ff_<OPNAME>_dirac_pixels32_<EXT>
 *
 * each taking (uint8_t *dst, const uint8_t *src[5], int stride, int h).
 * Only src[0] is read. The 8- and 16-wide versions forward to
 * <OPNAME>_pixels8_<EXT> / <OPNAME>_pixels16_<EXT>; the 32-wide version calls
 * the 16-wide primitive twice, at byte offsets 0 and 16.
 */
#define DIRAC_PIXOP(OPNAME, EXT)                                            \
void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst,                  \
                                             const uint8_t *src[5],         \
                                             int stride, int h)             \
{                                                                           \
    OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);                     \
}                                                                           \
void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst,                 \
                                              const uint8_t *src[5],        \
                                              int stride, int h)            \
{                                                                           \
    OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);                    \
}                                                                           \
void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst,                 \
                                              const uint8_t *src[5],        \
                                              int stride, int h)            \
{                                                                           \
    OPNAME ## _pixels16_ ## EXT(dst     , src[0]     , stride, h);          \
    OPNAME ## _pixels16_ ## EXT(dst + 16, src[0] + 16, stride, h);          \
}

DIRAC_PIXOP(put, mmx)
DIRAC_PIXOP(avg, mmx)
DIRAC_PIXOP(avg, mmxext)
02049
02050 #if HAVE_YASM
/**
 * Dirac 16-wide put: forwards src[0] (the only entry of src that is read) to
 * ff_put_pixels16_sse2() together with dst, stride and h.
 */
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    const uint8_t *plane = src[0];

    ff_put_pixels16_sse2(dst, plane, stride, h);
}
/**
 * Dirac 16-wide avg: forwards src[0] (the only entry of src that is read) to
 * ff_avg_pixels16_sse2() together with dst, stride and h.
 */
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    const uint8_t *plane = src[0];

    ff_avg_pixels16_sse2(dst, plane, stride, h);
}
/**
 * Dirac 32-wide put: runs ff_put_pixels16_sse2() on the left 16 bytes and
 * then on the right 16 bytes of each row. Only src[0] is read.
 */
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    int off;

    for (off = 0; off < 32; off += 16)
        ff_put_pixels16_sse2(dst + off, src[0] + off, stride, h);
}
/**
 * Dirac 32-wide avg: runs ff_avg_pixels16_sse2() on the left 16 bytes and
 * then on the right 16 bytes of each row. Only src[0] is read.
 */
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    int off;

    for (off = 0; off < 32; off += 16)
        ff_avg_pixels16_sse2(dst + off, src[0] + off, stride, h);
}
02069 #endif
02070 #endif
02071
02072
02073
02074 #if CONFIG_GPL
02075 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
02076 DCTELEM *block)
02077 {
02078 ff_mmx_idct(block);
02079 ff_put_pixels_clamped_mmx(block, dest, line_size);
02080 }
02081
02082 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
02083 DCTELEM *block)
02084 {
02085 ff_mmx_idct(block);
02086 ff_add_pixels_clamped_mmx(block, dest, line_size);
02087 }
02088
02089 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
02090 DCTELEM *block)
02091 {
02092 ff_mmxext_idct(block);
02093 ff_put_pixels_clamped_mmx(block, dest, line_size);
02094 }
02095
02096 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
02097 DCTELEM *block)
02098 {
02099 ff_mmxext_idct(block);
02100 ff_add_pixels_clamped_mmx(block, dest, line_size);
02101 }
02102 #endif
02103
/**
 * In-place update of the mag/ang float arrays, two elements per iteration,
 * using 3DNow! instructions.
 *
 * Per element, with t = (mag >= 0) ? -ang : ang (sign bit flipped):
 *     new ang = mag + ((ang >= 0) ? t : 0)
 *     new mag = mag - ((ang >= 0) ? 0 : t)
 *
 * The zero constant in mm7 is now created inside the same asm statement that
 * uses it. It used to be set by a separate asm statement before the loop,
 * which relied on the compiler leaving mm7 untouched between statements -
 * something the compiler is never told about.
 *
 * @param mag       first array, read and written
 * @param ang       second array, read and written
 * @param blocksize element count; two elements are processed per pass, so an
 *                  odd count touches one element past blocksize
 */
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;

    for (i = 0; i < blocksize; i += 2) {
        __asm__ volatile (
            "pxor %%mm7, %%mm7 \n\t"     /* mm7 = 0.0 in both lanes       */
            "movq %0, %%mm0 \n\t"        /* mm0 = mag pair                */
            "movq %1, %%mm1 \n\t"        /* mm1 = ang pair                */
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t"  /* mm2 = mask(mag >= 0)          */
            "pfcmpge %%mm7, %%mm3 \n\t"  /* mm3 = mask(ang >= 0)          */
            "pslld $31, %%mm2 \n\t"      /* keep only the sign-bit slot   */
            "pxor %%mm2, %%mm1 \n\t"     /* mm1 = t                       */
            "movq %%mm3, %%mm4 \n\t"
            "pand %%mm1, %%mm3 \n\t"     /* mm3 = t where ang >= 0        */
            "pandn %%mm1, %%mm4 \n\t"    /* mm4 = t where ang <  0        */
            "pfadd %%mm0, %%mm3 \n\t"    /* new ang                       */
            "pfsub %%mm4, %%mm0 \n\t"    /* new mag                       */
            "movq %%mm3, %1 \n\t"
            "movq %%mm0, %0 \n\t"
            : "+m"(mag[i]), "+m"(ang[i])
            :: "memory"
        );
    }
    /* Leave MMX/3DNow! state so the FPU can be used again. */
    __asm__ volatile ("femms");
}
02131
02132 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
02133 {
02134 int i;
02135
02136 __asm__ volatile (
02137 "movaps %0, %%xmm5 \n\t"
02138 :: "m"(ff_pdw_80000000[0])
02139 );
02140 for (i = 0; i < blocksize; i += 4) {
02141 __asm__ volatile (
02142 "movaps %0, %%xmm0 \n\t"
02143 "movaps %1, %%xmm1 \n\t"
02144 "xorps %%xmm2, %%xmm2 \n\t"
02145 "xorps %%xmm3, %%xmm3 \n\t"
02146 "cmpleps %%xmm0, %%xmm2 \n\t"
02147 "cmpleps %%xmm1, %%xmm3 \n\t"
02148 "andps %%xmm5, %%xmm2 \n\t"
02149 "xorps %%xmm2, %%xmm1 \n\t"
02150 "movaps %%xmm3, %%xmm4 \n\t"
02151 "andps %%xmm1, %%xmm3 \n\t"
02152 "andnps %%xmm1, %%xmm4 \n\t"
02153 "addps %%xmm0, %%xmm3 \n\t"
02154 "subps %%xmm4, %%xmm0 \n\t"
02155 "movaps %%xmm3, %1 \n\t"
02156 "movaps %%xmm0, %0 \n\t"
02157 : "+m"(mag[i]), "+m"(ang[i])
02158 :: "memory"
02159 );
02160 }
02161 }
02162
02163 #if HAVE_6REGS
02164 static void vector_fmul_window_3dnowext(float *dst, const float *src0,
02165 const float *src1, const float *win,
02166 int len)
02167 {
02168 x86_reg i = -len * 4;
02169 x86_reg j = len * 4 - 8;
02170 __asm__ volatile (
02171 "1: \n"
02172 "pswapd (%5, %1), %%mm1 \n"
02173 "movq (%5, %0), %%mm0 \n"
02174 "pswapd (%4, %1), %%mm5 \n"
02175 "movq (%3, %0), %%mm4 \n"
02176 "movq %%mm0, %%mm2 \n"
02177 "movq %%mm1, %%mm3 \n"
02178 "pfmul %%mm4, %%mm2 \n"
02179 "pfmul %%mm5, %%mm3 \n"
02180 "pfmul %%mm4, %%mm1 \n"
02181 "pfmul %%mm5, %%mm0 \n"
02182 "pfadd %%mm3, %%mm2 \n"
02183 "pfsub %%mm0, %%mm1 \n"
02184 "pswapd %%mm2, %%mm2 \n"
02185 "movq %%mm1, (%2, %0) \n"
02186 "movq %%mm2, (%2, %1) \n"
02187 "sub $8, %1 \n"
02188 "add $8, %0 \n"
02189 "jl 1b \n"
02190 "femms \n"
02191 : "+r"(i), "+r"(j)
02192 : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
02193 );
02194 }
02195
02196 static void vector_fmul_window_sse(float *dst, const float *src0,
02197 const float *src1, const float *win, int len)
02198 {
02199 x86_reg i = -len * 4;
02200 x86_reg j = len * 4 - 16;
02201 __asm__ volatile (
02202 "1: \n"
02203 "movaps (%5, %1), %%xmm1 \n"
02204 "movaps (%5, %0), %%xmm0 \n"
02205 "movaps (%4, %1), %%xmm5 \n"
02206 "movaps (%3, %0), %%xmm4 \n"
02207 "shufps $0x1b, %%xmm1, %%xmm1 \n"
02208 "shufps $0x1b, %%xmm5, %%xmm5 \n"
02209 "movaps %%xmm0, %%xmm2 \n"
02210 "movaps %%xmm1, %%xmm3 \n"
02211 "mulps %%xmm4, %%xmm2 \n"
02212 "mulps %%xmm5, %%xmm3 \n"
02213 "mulps %%xmm4, %%xmm1 \n"
02214 "mulps %%xmm5, %%xmm0 \n"
02215 "addps %%xmm3, %%xmm2 \n"
02216 "subps %%xmm0, %%xmm1 \n"
02217 "shufps $0x1b, %%xmm2, %%xmm2 \n"
02218 "movaps %%xmm1, (%2, %0) \n"
02219 "movaps %%xmm2, (%2, %1) \n"
02220 "sub $16, %1 \n"
02221 "add $16, %0 \n"
02222 "jl 1b \n"
02223 : "+r"(i), "+r"(j)
02224 : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
02225 );
02226 }
02227 #endif
02228
02229 static void vector_clipf_sse(float *dst, const float *src,
02230 float min, float max, int len)
02231 {
02232 x86_reg i = (len - 16) * 4;
02233 __asm__ volatile (
02234 "movss %3, %%xmm4 \n\t"
02235 "movss %4, %%xmm5 \n\t"
02236 "shufps $0, %%xmm4, %%xmm4 \n\t"
02237 "shufps $0, %%xmm5, %%xmm5 \n\t"
02238 "1: \n\t"
02239 "movaps (%2, %0), %%xmm0 \n\t"
02240 "movaps 16(%2, %0), %%xmm1 \n\t"
02241 "movaps 32(%2, %0), %%xmm2 \n\t"
02242 "movaps 48(%2, %0), %%xmm3 \n\t"
02243 "maxps %%xmm4, %%xmm0 \n\t"
02244 "maxps %%xmm4, %%xmm1 \n\t"
02245 "maxps %%xmm4, %%xmm2 \n\t"
02246 "maxps %%xmm4, %%xmm3 \n\t"
02247 "minps %%xmm5, %%xmm0 \n\t"
02248 "minps %%xmm5, %%xmm1 \n\t"
02249 "minps %%xmm5, %%xmm2 \n\t"
02250 "minps %%xmm5, %%xmm3 \n\t"
02251 "movaps %%xmm0, (%1, %0) \n\t"
02252 "movaps %%xmm1, 16(%1, %0) \n\t"
02253 "movaps %%xmm2, 32(%1, %0) \n\t"
02254 "movaps %%xmm3, 48(%1, %0) \n\t"
02255 "sub $64, %0 \n\t"
02256 "jge 1b \n\t"
02257 : "+&r"(i)
02258 : "r"(dst), "r"(src), "m"(min), "m"(max)
02259 : "memory"
02260 );
02261 }
02262
02263 #endif
02264
/*
 * Prototypes of routines that are only declared in this part of the file;
 * their definitions live elsewhere. They are referenced by the
 * dsputil_init_*() functions further down.
 */

/* int16 scalar products */
int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                      int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);

int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);

/* int16 windowing */
void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
                                        const int16_t *window,
                                        unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
                                      const int16_t *window,
                                      unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
                                  const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
                                const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window,
                                      unsigned int len);

/* 32-bit byte swapping */
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);

/* "hfyu" prediction helpers */
void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
                                          const uint8_t *diff, int w,
                                          int *left, int *left_top);
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
                                      int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
                                     int w, int left);

/* float vector helpers */
float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);

void ff_vector_fmul_reverse_sse(float *dst, const float *src0,
                                const float *src1, int len);
void ff_vector_fmul_reverse_avx(float *dst, const float *src0,
                                const float *src1, int len);

void ff_vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);
void ff_vector_fmul_add_avx(float *dst, const float *src0, const float *src1,
                            const float *src2, int len);

/* int32 clipping */
void ff_vector_clip_int32_mmx(int32_t *dst, const int32_t *src,
                              int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2(int32_t *dst, const int32_t *src,
                               int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
                               int32_t min, int32_t max, unsigned int len);

/* float butterflies */
extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
                                                const float *src1, int len);
extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
                                                const float *src1, int len);
02328
/*
 * SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) fills the 16 slots of
 * c->PFX_pixels_tab[IDX] with the symbols
 *
 *     PREFIX PFX SIZE _mc<x><y>_ CPU        (x, y in 0..3)
 *
 * Slot number x + 4 * y receives the "mc<x><y>" symbol. PREFIX may be left
 * empty. A variable named `c` must be in scope where the macro is used.
 */
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
    do {                                                                     \
        /* y = 0 */                                                          \
        c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
        /* y = 1 */                                                          \
        c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
        /* y = 2 */                                                          \
        c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
        /* y = 3 */                                                          \
        c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)
02348
/*
 * SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) fills the four slots of
 * c->PFX_pixels_tab[IDX]:
 *
 *     [0] PFX_pixels<SIZE>_<CPU>
 *     [1] PFX_pixels<SIZE>_x2_<CPU>
 *     [2] PFX_pixels<SIZE>_y2_<CPU>
 *     [3] PFX_pixels<SIZE>_xy2_<CPU>
 *
 * A variable named `c` must be in scope where the macro is used.
 */
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU)                                    \
    do {                                                                       \
        c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _     ## CPU; \
        c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
        c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
        c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
    } while (0)
02356
/*
 * H264_QPEL_FUNCS(x, y, CPU) installs the "mc<x><y>" H.264 qpel symbols for
 * one CPU flavour into slot x + y * 4 of four tables:
 *
 *     c->put_h264_qpel_pixels_tab[0] <- put_h264_qpel16_mc<x><y>_<CPU>
 *     c->put_h264_qpel_pixels_tab[1] <- put_h264_qpel8_mc<x><y>_<CPU>
 *     c->avg_h264_qpel_pixels_tab[0] <- avg_h264_qpel16_mc<x><y>_<CPU>
 *     c->avg_h264_qpel_pixels_tab[1] <- avg_h264_qpel8_mc<x><y>_<CPU>
 *
 * x and y are pasted into the symbol names, so they must be single digits.
 * A variable named `c` must be in scope where the macro is used.
 */
#define H264_QPEL_FUNCS(x, y, CPU)                                                            \
    do {                                                                                      \
        /* 16-wide, table 0 */                                                                \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
        /* 8-wide, table 1 */                                                                 \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU;  \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU;  \
    } while (0)
02364
/*
 * H264_QPEL_FUNCS_10(x, y, CPU) is the counterpart of H264_QPEL_FUNCS for the
 * symbols that carry an "ff_" prefix and a "_10_" infix. Slot x + y * 4 of
 * four tables is filled:
 *
 *     c->put_h264_qpel_pixels_tab[0] <- ff_put_h264_qpel16_mc<x><y>_10_<CPU>
 *     c->put_h264_qpel_pixels_tab[1] <- ff_put_h264_qpel8_mc<x><y>_10_<CPU>
 *     c->avg_h264_qpel_pixels_tab[0] <- ff_avg_h264_qpel16_mc<x><y>_10_<CPU>
 *     c->avg_h264_qpel_pixels_tab[1] <- ff_avg_h264_qpel8_mc<x><y>_10_<CPU>
 *
 * x and y are pasted into the symbol names, so they must be single digits.
 * A variable named `c` must be in scope where the macro is used.
 */
#define H264_QPEL_FUNCS_10(x, y, CPU)                                                                 \
    do {                                                                                              \
        /* 16-wide, table 0 */                                                                        \
        c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU;   \
        c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU;   \
        /* 8-wide, table 1 */                                                                         \
        c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU;    \
        c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU;    \
    } while (0)
02372
02373 static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
02374 {
02375 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
02376
02377 #if HAVE_INLINE_ASM
02378 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
02379 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
02380 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
02381
02382 if (!high_bit_depth) {
02383 c->clear_block = clear_block_mmx;
02384 c->clear_blocks = clear_blocks_mmx;
02385 c->draw_edges = draw_edges_mmx;
02386
02387 SET_HPEL_FUNCS(put, 0, 16, mmx);
02388 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
02389 SET_HPEL_FUNCS(avg, 0, 16, mmx);
02390 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
02391 SET_HPEL_FUNCS(put, 1, 8, mmx);
02392 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
02393 SET_HPEL_FUNCS(avg, 1, 8, mmx);
02394 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
02395 }
02396
02397 #if ARCH_X86_32 || !HAVE_YASM
02398 c->gmc = gmc_mmx;
02399 #endif
02400
02401 c->add_bytes = add_bytes_mmx;
02402
02403 c->put_no_rnd_pixels_l2[0]= put_vp_no_rnd_pixels16_l2_mmx;
02404 c->put_no_rnd_pixels_l2[1]= put_vp_no_rnd_pixels8_l2_mmx;
02405
02406 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
02407 c->h263_v_loop_filter = h263_v_loop_filter_mmx;
02408 c->h263_h_loop_filter = h263_h_loop_filter_mmx;
02409 }
02410 #endif
02411
02412 #if HAVE_YASM
02413 #if ARCH_X86_32
02414 if (!high_bit_depth)
02415 c->emulated_edge_mc = emulated_edge_mc_mmx;
02416 #endif
02417
02418 if (!high_bit_depth && CONFIG_H264CHROMA) {
02419 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
02420 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
02421 }
02422
02423 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
02424 #endif
02425
02426 }
02427
02428 static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
02429 int mm_flags)
02430 {
02431 const int bit_depth = avctx->bits_per_raw_sample;
02432 const int high_bit_depth = bit_depth > 8;
02433
02434 #if HAVE_INLINE_ASM
02435 c->prefetch = prefetch_mmxext;
02436
02437 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
02438 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
02439 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmxext, );
02440 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmxext, );
02441
02442 SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
02443 SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
02444 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmxext, );
02445 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmxext, );
02446 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
02447 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
02448
02449 if (!high_bit_depth) {
02450 c->put_pixels_tab[0][1] = put_pixels16_x2_mmxext;
02451 c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
02452
02453 c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
02454 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
02455 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
02456
02457 c->put_pixels_tab[1][1] = put_pixels8_x2_mmxext;
02458 c->put_pixels_tab[1][2] = put_pixels8_y2_mmxext;
02459
02460 c->avg_pixels_tab[1][0] = avg_pixels8_mmxext;
02461 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmxext;
02462 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmxext;
02463 }
02464
02465 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
02466 if (!high_bit_depth) {
02467 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
02468 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
02469 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmxext;
02470 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmxext;
02471
02472 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
02473 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmxext;
02474 }
02475 }
02476
02477 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
02478 avctx->codec_id == AV_CODEC_ID_THEORA)) {
02479 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmxext;
02480 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmxext;
02481 }
02482 #endif
02483
02484 #if HAVE_MMXEXT_EXTERNAL
02485 if (CONFIG_H264QPEL) {
02486 if (!high_bit_depth) {
02487 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
02488 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, );
02489 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, );
02490 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
02491 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, );
02492 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
02493 } else if (bit_depth == 10) {
02494 #if !ARCH_X86_64
02495 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
02496 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
02497 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
02498 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
02499 #endif
02500 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
02501 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
02502 }
02503 }
02504
02505 if (!high_bit_depth && CONFIG_H264CHROMA) {
02506 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
02507 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
02508 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
02509 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
02510 }
02511 if (bit_depth == 10 && CONFIG_H264CHROMA) {
02512 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
02513 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
02514 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
02515 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
02516 }
02517
02518
02519 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
02520 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
02521
02522 c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
02523 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
02524
02525 if (avctx->flags & CODEC_FLAG_BITEXACT) {
02526 c->apply_window_int16 = ff_apply_window_int16_mmxext;
02527 } else {
02528 c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
02529 }
02530 #endif
02531 }
02532
02533 static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
02534 int mm_flags)
02535 {
02536 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
02537
02538 #if HAVE_INLINE_ASM
02539 c->prefetch = prefetch_3dnow;
02540
02541 if (!high_bit_depth) {
02542 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
02543 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
02544
02545 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
02546 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
02547 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
02548
02549 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
02550 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
02551
02552 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
02553 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
02554 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
02555
02556 if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
02557 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
02558 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
02559 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
02560 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
02561
02562 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
02563 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
02564 }
02565 }
02566
02567 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
02568 avctx->codec_id == AV_CODEC_ID_THEORA)) {
02569 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
02570 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
02571 }
02572
02573 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
02574 #endif
02575
02576 #if HAVE_YASM
02577 if (!high_bit_depth && CONFIG_H264CHROMA) {
02578 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
02579 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
02580 }
02581 #endif
02582 }
02583
02584 static void dsputil_init_3dnowext(DSPContext *c, AVCodecContext *avctx,
02585 int mm_flags)
02586 {
02587 #if HAVE_AMD3DNOWEXT_INLINE && HAVE_6REGS
02588 c->vector_fmul_window = vector_fmul_window_3dnowext;
02589 #endif
02590 }
02591
02592 static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
02593 {
02594 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
02595
02596 #if HAVE_INLINE_ASM
02597 if (!high_bit_depth) {
02598 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
02599
02600 c->clear_block = clear_block_sse;
02601 c->clear_blocks = clear_blocks_sse;
02602 }
02603 }
02604
02605 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
02606
02607 #if HAVE_6REGS
02608 c->vector_fmul_window = vector_fmul_window_sse;
02609 #endif
02610
02611 c->vector_clipf = vector_clipf_sse;
02612 #endif
02613
02614 #if HAVE_YASM
02615 c->vector_fmul_reverse = ff_vector_fmul_reverse_sse;
02616 c->vector_fmul_add = ff_vector_fmul_add_sse;
02617
02618 c->scalarproduct_float = ff_scalarproduct_float_sse;
02619 c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
02620
02621 if (!high_bit_depth)
02622 c->emulated_edge_mc = emulated_edge_mc_sse;
02623 #if HAVE_INLINE_ASM
02624 c->gmc = gmc_sse;
02625 #endif
02626 #endif
02627 }
02628
02629 static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
02630 int mm_flags)
02631 {
02632 const int bit_depth = avctx->bits_per_raw_sample;
02633 const int high_bit_depth = bit_depth > 8;
02634
02635 #if HAVE_SSE2_INLINE
02636 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
02637 c->idct_put = ff_idct_xvid_sse2_put;
02638 c->idct_add = ff_idct_xvid_sse2_add;
02639 c->idct = ff_idct_xvid_sse2;
02640 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
02641 }
02642 #endif
02643
02644 #if HAVE_SSE2_EXTERNAL
02645 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
02646
02647 if (!high_bit_depth) {
02648 c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
02649 c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
02650 c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
02651 if (CONFIG_H264QPEL)
02652 H264_QPEL_FUNCS(0, 0, sse2);
02653 }
02654 }
02655
02656 if (!high_bit_depth && CONFIG_H264QPEL) {
02657 H264_QPEL_FUNCS(0, 1, sse2);
02658 H264_QPEL_FUNCS(0, 2, sse2);
02659 H264_QPEL_FUNCS(0, 3, sse2);
02660 H264_QPEL_FUNCS(1, 1, sse2);
02661 H264_QPEL_FUNCS(1, 2, sse2);
02662 H264_QPEL_FUNCS(1, 3, sse2);
02663 H264_QPEL_FUNCS(2, 1, sse2);
02664 H264_QPEL_FUNCS(2, 2, sse2);
02665 H264_QPEL_FUNCS(2, 3, sse2);
02666 H264_QPEL_FUNCS(3, 1, sse2);
02667 H264_QPEL_FUNCS(3, 2, sse2);
02668 H264_QPEL_FUNCS(3, 3, sse2);
02669 }
02670
02671 if (bit_depth == 10) {
02672 if (CONFIG_H264QPEL) {
02673 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
02674 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
02675 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
02676 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
02677 H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
02678 H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
02679 H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
02680 }
02681 if (CONFIG_H264CHROMA) {
02682 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
02683 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
02684 }
02685 }
02686
02687 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
02688 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
02689 if (mm_flags & AV_CPU_FLAG_ATOM) {
02690 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
02691 } else {
02692 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
02693 }
02694 if (avctx->flags & CODEC_FLAG_BITEXACT) {
02695 c->apply_window_int16 = ff_apply_window_int16_sse2;
02696 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
02697 c->apply_window_int16 = ff_apply_window_int16_round_sse2;
02698 }
02699 c->bswap_buf = ff_bswap32_buf_sse2;
02700 #endif
02701 }
02702
02703 static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
02704 int mm_flags)
02705 {
02706 #if HAVE_SSSE3_EXTERNAL
02707 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
02708 const int bit_depth = avctx->bits_per_raw_sample;
02709
02710 if (!high_bit_depth && CONFIG_H264QPEL) {
02711 H264_QPEL_FUNCS(1, 0, ssse3);
02712 H264_QPEL_FUNCS(1, 1, ssse3);
02713 H264_QPEL_FUNCS(1, 2, ssse3);
02714 H264_QPEL_FUNCS(1, 3, ssse3);
02715 H264_QPEL_FUNCS(2, 0, ssse3);
02716 H264_QPEL_FUNCS(2, 1, ssse3);
02717 H264_QPEL_FUNCS(2, 2, ssse3);
02718 H264_QPEL_FUNCS(2, 3, ssse3);
02719 H264_QPEL_FUNCS(3, 0, ssse3);
02720 H264_QPEL_FUNCS(3, 1, ssse3);
02721 H264_QPEL_FUNCS(3, 2, ssse3);
02722 H264_QPEL_FUNCS(3, 3, ssse3);
02723 }
02724 if (bit_depth == 10 && CONFIG_H264QPEL) {
02725 H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
02726 H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
02727 H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
02728 }
02729 if (!high_bit_depth && CONFIG_H264CHROMA) {
02730 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
02731 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
02732 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
02733 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
02734 }
02735 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
02736 if (mm_flags & AV_CPU_FLAG_SSE4)
02737 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
02738
02739 if (mm_flags & AV_CPU_FLAG_ATOM)
02740 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
02741 else
02742 c->apply_window_int16 = ff_apply_window_int16_ssse3;
02743 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW)))
02744 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
02745 c->bswap_buf = ff_bswap32_buf_ssse3;
02746 #endif
02747 }
02748
02749 static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
02750 int mm_flags)
02751 {
02752 #if HAVE_SSE4_EXTERNAL
02753 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
02754 #endif
02755 }
02756
02757 static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
02758 {
02759 #if HAVE_AVX_EXTERNAL
02760 const int bit_depth = avctx->bits_per_raw_sample;
02761
02762 if (bit_depth == 10) {
02763
02764
02765 if (CONFIG_H264QPEL) {
02766 H264_QPEL_FUNCS_10(1, 0, sse2);
02767 H264_QPEL_FUNCS_10(2, 0, sse2);
02768 H264_QPEL_FUNCS_10(3, 0, sse2);
02769 }
02770
02771 if (CONFIG_H264CHROMA) {
02772 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
02773 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
02774 }
02775 }
02776 c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
02777 c->vector_fmul_reverse = ff_vector_fmul_reverse_avx;
02778 c->vector_fmul_add = ff_vector_fmul_add_avx;
02779 #endif
02780 }
02781
02782 void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
02783 {
02784 int mm_flags = av_get_cpu_flags();
02785
02786 #if HAVE_7REGS && HAVE_INLINE_ASM
02787 if (mm_flags & AV_CPU_FLAG_CMOV)
02788 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
02789 #endif
02790
02791 if (mm_flags & AV_CPU_FLAG_MMX) {
02792 #if HAVE_INLINE_ASM
02793 const int idct_algo = avctx->idct_algo;
02794
02795 if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
02796 if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
02797 c->idct_put = ff_simple_idct_put_mmx;
02798 c->idct_add = ff_simple_idct_add_mmx;
02799 c->idct = ff_simple_idct_mmx;
02800 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
02801 #if CONFIG_GPL
02802 } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
02803 if (mm_flags & AV_CPU_FLAG_MMX2) {
02804 c->idct_put = ff_libmpeg2mmx2_idct_put;
02805 c->idct_add = ff_libmpeg2mmx2_idct_add;
02806 c->idct = ff_mmxext_idct;
02807 } else {
02808 c->idct_put = ff_libmpeg2mmx_idct_put;
02809 c->idct_add = ff_libmpeg2mmx_idct_add;
02810 c->idct = ff_mmx_idct;
02811 }
02812 c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
02813 #endif
02814 } else if (idct_algo == FF_IDCT_XVIDMMX) {
02815 if (mm_flags & AV_CPU_FLAG_SSE2) {
02816 c->idct_put = ff_idct_xvid_sse2_put;
02817 c->idct_add = ff_idct_xvid_sse2_add;
02818 c->idct = ff_idct_xvid_sse2;
02819 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
02820 } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
02821 c->idct_put = ff_idct_xvid_mmxext_put;
02822 c->idct_add = ff_idct_xvid_mmxext_add;
02823 c->idct = ff_idct_xvid_mmxext;
02824 } else {
02825 c->idct_put = ff_idct_xvid_mmx_put;
02826 c->idct_add = ff_idct_xvid_mmx_add;
02827 c->idct = ff_idct_xvid_mmx;
02828 }
02829 }
02830 }
02831 #endif
02832
02833 dsputil_init_mmx(c, avctx, mm_flags);
02834 }
02835
02836 if (mm_flags & AV_CPU_FLAG_MMXEXT)
02837 dsputil_init_mmxext(c, avctx, mm_flags);
02838
02839 if (mm_flags & AV_CPU_FLAG_3DNOW)
02840 dsputil_init_3dnow(c, avctx, mm_flags);
02841
02842 if (mm_flags & AV_CPU_FLAG_3DNOWEXT)
02843 dsputil_init_3dnowext(c, avctx, mm_flags);
02844
02845 if (mm_flags & AV_CPU_FLAG_SSE)
02846 dsputil_init_sse(c, avctx, mm_flags);
02847
02848 if (mm_flags & AV_CPU_FLAG_SSE2)
02849 dsputil_init_sse2(c, avctx, mm_flags);
02850
02851 if (mm_flags & AV_CPU_FLAG_SSSE3)
02852 dsputil_init_ssse3(c, avctx, mm_flags);
02853
02854 if (mm_flags & AV_CPU_FLAG_SSE4)
02855 dsputil_init_sse4(c, avctx, mm_flags);
02856
02857 if (mm_flags & AV_CPU_FLAG_AVX)
02858 dsputil_init_avx(c, avctx, mm_flags);
02859
02860 if (CONFIG_ENCODERS)
02861 ff_dsputilenc_init_mmx(c, avctx);
02862 }