00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021 #include "dsputil_mmx.h"
00022
/* Packed-byte constant pairs {3,1,...} and {7,3,...}; used in place of ff_pb_3 /
 * ff_pb_7 by the deblock-strength code when filtering field (MBAFF) macroblocks. */
DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_3_1 ) = 0x0103010301030103ULL;
DECLARE_ALIGNED_8 (static const uint64_t, ff_pb_7_3 ) = 0x0307030703070307ULL;
00025
00026
00027
00028
/* Two interleaved packed-word butterflies:
 * (a,b) -> (a+b, b-a) and (c,d) -> (c+d, d-c). */
#define SUMSUB_BADC( a, b, c, d ) \
    "paddw "#b", "#a" \n\t"\
    "paddw "#d", "#c" \n\t"\
    "paddw "#b", "#b" \n\t"\
    "paddw "#d", "#d" \n\t"\
    "psubw "#a", "#b" \n\t"\
    "psubw "#c", "#d" \n\t"
00036
/* Odd-part helper of the 4-point IDCT (packed words, t is scratch):
 * b_out = a + (b>>1);  a_out = (a>>1) - b_in. */
#define SUMSUBD2_AB( a, b, t ) \
    "movq "#b", "#t" \n\t"\
    "psraw $1 , "#b" \n\t"\
    "paddw "#a", "#b" \n\t"\
    "psraw $1 , "#a" \n\t"\
    "psubw "#t", "#a" \n\t"
00043
/* One 1-D pass of the H.264 4-point inverse transform on packed words.
 * s02/d02 hold the even inputs, s13/d13 the odd inputs, t is scratch.
 * SUMSUB_BA comes from dsputil_mmx.h. */
#define IDCT4_1D( s02, s13, d02, d13, t ) \
    SUMSUB_BA ( s02, d02 )\
    SUMSUBD2_AB( s13, d13, t )\
    SUMSUB_BADC( d13, s02, s13, d02 )
00048
/* Scale residual row p down by 6 bits, add it to the 4 pixels at (%0)
 * (widened through zero register z, scratch t) and store back with
 * unsigned saturation. */
#define STORE_DIFF_4P( p, t, z ) \
    "psraw $6, "#p" \n\t"\
    "movd (%0), "#t" \n\t"\
    "punpcklbw "#z", "#t" \n\t"\
    "paddsw "#t", "#p" \n\t"\
    "packuswb "#z", "#p" \n\t"\
    "movd "#p", (%0) \n\t"
00056
/* 4x4 H.264 inverse transform (MMX): row pass, transpose, column pass,
 * then the >>6-scaled residual is added to dst with saturation. */
static void ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
{
    /* load the four coefficient rows */
    __asm__ volatile(
        "movq (%0), %%mm0 \n\t"
        "movq 8(%0), %%mm1 \n\t"
        "movq 16(%0), %%mm2 \n\t"
        "movq 24(%0), %%mm3 \n\t"
        :: "r"(block) );

    __asm__ volatile(
        /* first (row) pass */
        IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )

        "movq %0, %%mm6 \n\t"
        /* mm6 = ff_pw_32, the rounding bias for the final >>6 */
        TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
        /* bias the DC row so the final shift rounds to nearest */
        "paddw %%mm6, %%mm3 \n\t"

        /* second (column) pass */
        IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )

        "pxor %%mm7, %%mm7 \n\t"
        :: "m"(ff_pw_32));

    /* add the result rows (mm0,mm2,mm3,mm4) to dst, one line per step */
    __asm__ volatile(
        STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
        "add %1, %0 \n\t"
        STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
        "add %1, %0 \n\t"
        STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
        "add %1, %0 \n\t"
        STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
        : "+r"(dst)
        : "r" ((x86_reg)stride)
    );
}
00095
/* One 1-D pass of the 8-point H.264 IDCT over a 4-column half of an 8x8
 * block (rows are 16 bytes apart, so offsets are multiples of 16).
 * Leaves all eight results in MMX registers; judging by how the callers
 * store them, rows 0..7 end up in mm7,mm5,mm3,mm1,mm0,mm2,mm4,mm6. */
static inline void h264_idct8_1d(int16_t *block)
{
    __asm__ volatile(
        /* odd part: rows 1,3,5,7 */
        "movq 112(%0), %%mm7 \n\t"
        "movq 80(%0), %%mm0 \n\t"
        "movq 48(%0), %%mm3 \n\t"
        "movq 16(%0), %%mm5 \n\t"

        "movq %%mm0, %%mm4 \n\t"
        "movq %%mm5, %%mm1 \n\t"
        "psraw $1, %%mm4 \n\t"
        "psraw $1, %%mm1 \n\t"
        "paddw %%mm0, %%mm4 \n\t"
        "paddw %%mm5, %%mm1 \n\t"
        "paddw %%mm7, %%mm4 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "psubw %%mm5, %%mm4 \n\t"
        "paddw %%mm3, %%mm1 \n\t"

        "psubw %%mm3, %%mm5 \n\t"
        "psubw %%mm3, %%mm0 \n\t"
        "paddw %%mm7, %%mm5 \n\t"
        "psubw %%mm7, %%mm0 \n\t"
        "psraw $1, %%mm3 \n\t"
        "psraw $1, %%mm7 \n\t"
        "psubw %%mm3, %%mm5 \n\t"
        "psubw %%mm7, %%mm0 \n\t"

        "movq %%mm4, %%mm3 \n\t"
        "movq %%mm1, %%mm7 \n\t"
        "psraw $2, %%mm1 \n\t"
        "psraw $2, %%mm3 \n\t"
        "paddw %%mm5, %%mm3 \n\t"
        "psraw $2, %%mm5 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "psraw $2, %%mm0 \n\t"
        "psubw %%mm4, %%mm5 \n\t"
        "psubw %%mm0, %%mm7 \n\t"

        /* even part: rows 2 and 6 */
        "movq 32(%0), %%mm2 \n\t"
        "movq 96(%0), %%mm6 \n\t"
        "movq %%mm2, %%mm4 \n\t"
        "movq %%mm6, %%mm0 \n\t"
        "psraw $1, %%mm4 \n\t"
        "psraw $1, %%mm6 \n\t"
        "psubw %%mm0, %%mm4 \n\t"
        "paddw %%mm2, %%mm6 \n\t"

        /* rows 0 and 4, then combine even and odd halves */
        "movq (%0), %%mm2 \n\t"
        "movq 64(%0), %%mm0 \n\t"
        SUMSUB_BA( %%mm0, %%mm2 )
        SUMSUB_BA( %%mm6, %%mm0 )
        SUMSUB_BA( %%mm4, %%mm2 )
        SUMSUB_BA( %%mm7, %%mm6 )
        SUMSUB_BA( %%mm5, %%mm4 )
        SUMSUB_BA( %%mm3, %%mm2 )
        SUMSUB_BA( %%mm1, %%mm0 )
        :: "r"(block)
    );
}
00156
/* 8x8 H.264 inverse transform + add (MMX): the 8-point 1-D transform is run
 * on 4-column halves, transposed through b2, run again, scaled down by 6
 * bits and finally added to dst with clamping. */
static void ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
{
    int i;
    int16_t __attribute__ ((aligned(8))) b2[64];

    /* fold the +32 rounding bias of the final >>6 into the DC coefficient */
    block[0] += 32;

    for(i=0; i<2; i++){
        DECLARE_ALIGNED_8(uint64_t, tmp);

        /* first 1-D pass on columns 4*i..4*i+3; results land in mm0..mm7 */
        h264_idct8_1d(block+4*i);

        __asm__ volatile(
            /* transpose the 8x4 half into b2; mm7 is spilled through tmp
             * because TRANSPOSE4 needs a free scratch register */
            "movq %%mm7, %0 \n\t"
            TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
            "movq %%mm0, 8(%1) \n\t"
            "movq %%mm6, 24(%1) \n\t"
            "movq %%mm7, 40(%1) \n\t"
            "movq %%mm4, 56(%1) \n\t"
            "movq %0, %%mm7 \n\t"
            TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
            "movq %%mm7, (%1) \n\t"
            "movq %%mm1, 16(%1) \n\t"
            "movq %%mm0, 32(%1) \n\t"
            "movq %%mm3, 48(%1) \n\t"
            : "=m"(tmp)
            : "r"(b2+32*i)
            : "memory"
        );
    }

    for(i=0; i<2; i++){
        /* second 1-D pass on the transposed data */
        h264_idct8_1d(b2+4*i);

        __asm__ volatile(
            /* scale down and store rows 0..7 (held in mm7,mm5,mm3,mm1,
             * mm0,mm2,mm4,mm6 respectively) back into b2 */
            "psraw $6, %%mm7 \n\t"
            "psraw $6, %%mm6 \n\t"
            "psraw $6, %%mm5 \n\t"
            "psraw $6, %%mm4 \n\t"
            "psraw $6, %%mm3 \n\t"
            "psraw $6, %%mm2 \n\t"
            "psraw $6, %%mm1 \n\t"
            "psraw $6, %%mm0 \n\t"

            "movq %%mm7, (%0) \n\t"
            "movq %%mm5, 16(%0) \n\t"
            "movq %%mm3, 32(%0) \n\t"
            "movq %%mm1, 48(%0) \n\t"
            "movq %%mm0, 64(%0) \n\t"
            "movq %%mm2, 80(%0) \n\t"
            "movq %%mm4, 96(%0) \n\t"
            "movq %%mm6, 112(%0) \n\t"
            :: "r"(b2+4*i)
            : "memory"
        );
    }

    /* clamp-add the reconstructed residual to the destination */
    add_pixels_clamped_mmx(b2, dst, stride);
}
00216
/* SSE2 variant of STORE_DIFF: scale residual p by >>6, add to the 8 pixels
 * at memory operand d (widened through zero register z, scratch t), pack
 * with unsigned saturation and store back. */
#define STORE_DIFF_8P( p, d, t, z )\
    "movq "#d", "#t" \n"\
    "psraw $6, "#p" \n"\
    "punpcklbw "#z", "#t" \n"\
    "paddsw "#t", "#p" \n"\
    "packuswb "#p", "#p" \n"\
    "movq "#p", "#d" \n"
00224
/* One full 1-D pass of the 8-point H.264 IDCT on XMM registers.
 * b,c,d,f,g,h hold rows 1,2,3,5,6,7 on entry; rows 0 and 4 are loaded
 * from 0x00(%1)/0x40(%1) at the end, so a and e are scratch until then.
 * The even/odd halves are combined with the SUMSUB_BA chain. */
#define H264_IDCT8_1D_SSE2(a,b,c,d,e,f,g,h)\
    "movdqa "#c", "#a" \n"\
    "movdqa "#g", "#e" \n"\
    "psraw $1, "#c" \n"\
    "psraw $1, "#g" \n"\
    "psubw "#e", "#c" \n"\
    "paddw "#a", "#g" \n"\
    "movdqa "#b", "#e" \n"\
    "psraw $1, "#e" \n"\
    "paddw "#b", "#e" \n"\
    "paddw "#d", "#e" \n"\
    "paddw "#f", "#e" \n"\
    "movdqa "#f", "#a" \n"\
    "psraw $1, "#a" \n"\
    "paddw "#f", "#a" \n"\
    "paddw "#h", "#a" \n"\
    "psubw "#b", "#a" \n"\
    "psubw "#d", "#b" \n"\
    "psubw "#d", "#f" \n"\
    "paddw "#h", "#b" \n"\
    "psubw "#h", "#f" \n"\
    "psraw $1, "#d" \n"\
    "psraw $1, "#h" \n"\
    "psubw "#d", "#b" \n"\
    "psubw "#h", "#f" \n"\
    "movdqa "#e", "#d" \n"\
    "movdqa "#a", "#h" \n"\
    "psraw $2, "#d" \n"\
    "psraw $2, "#h" \n"\
    "paddw "#f", "#d" \n"\
    "paddw "#b", "#h" \n"\
    "psraw $2, "#f" \n"\
    "psraw $2, "#b" \n"\
    "psubw "#f", "#e" \n"\
    "psubw "#a", "#b" \n"\
    "movdqa 0x00(%1), "#a" \n"\
    "movdqa 0x40(%1), "#f" \n"\
    SUMSUB_BA(f, a)\
    SUMSUB_BA(g, f)\
    SUMSUB_BA(c, a)\
    SUMSUB_BA(e, g)\
    SUMSUB_BA(b, c)\
    SUMSUB_BA(h, a)\
    SUMSUB_BA(d, f)
00269
/* 8x8 H.264 inverse transform + add, single-pass SSE2 version: one 1-D
 * pass, in-register 8x8 transpose, second 1-D pass, then clamp-add to dst.
 * block is used as scratch for the transpose and two spilled rows. */
static void ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
{
    __asm__ volatile(
        "movdqa 0x10(%1), %%xmm1 \n"
        "movdqa 0x20(%1), %%xmm2 \n"
        "movdqa 0x30(%1), %%xmm3 \n"
        "movdqa 0x50(%1), %%xmm5 \n"
        "movdqa 0x60(%1), %%xmm6 \n"
        "movdqa 0x70(%1), %%xmm7 \n"
        /* row pass (rows 0/4 are loaded inside the macro) */
        H264_IDCT8_1D_SSE2(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
        TRANSPOSE8(%%xmm4, %%xmm1, %%xmm7, %%xmm3, %%xmm5, %%xmm0, %%xmm2, %%xmm6, (%1))
        /* bias row 0 with 32 so the final >>6 rounds to nearest */
        "paddw %4, %%xmm4 \n"
        "movdqa %%xmm4, 0x00(%1) \n"
        "movdqa %%xmm2, 0x40(%1) \n"
        /* column pass */
        H264_IDCT8_1D_SSE2(%%xmm4, %%xmm0, %%xmm6, %%xmm3, %%xmm2, %%xmm5, %%xmm7, %%xmm1)
        /* spill two result rows; xmm6/xmm7 are needed as scratch/zero below */
        "movdqa %%xmm6, 0x60(%1) \n"
        "movdqa %%xmm7, 0x70(%1) \n"
        "pxor %%xmm7, %%xmm7 \n"
        STORE_DIFF_8P(%%xmm2, (%0), %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm0, (%0,%2), %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm1, (%0,%2,2), %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm3, (%0,%3), %%xmm6, %%xmm7)
        "lea (%0,%2,4), %0 \n"
        STORE_DIFF_8P(%%xmm5, (%0), %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm4, (%0,%2), %%xmm6, %%xmm7)
        "movdqa 0x60(%1), %%xmm0 \n"
        "movdqa 0x70(%1), %%xmm1 \n"
        STORE_DIFF_8P(%%xmm0, (%0,%2,2), %%xmm6, %%xmm7)
        STORE_DIFF_8P(%%xmm1, (%0,%3), %%xmm6, %%xmm7)
        :"+r"(dst)
        :"r"(block), "r"((x86_reg)stride), "r"((x86_reg)3L*stride), "m"(ff_pw_32)
    );
}
00303
/* DC-only 4x4 IDCT + add: every residual sample equals (block[0]+32)>>6.
 * mm0 holds the positive part of dc and mm1 the negative part (each packed
 * to bytes), so paddusb/psubusb implement a saturated signed add. */
static void ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
{
    int dc = (block[0] + 32) >> 6;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        /* packuswb clamps to [0,255]: exactly one of mm0/mm1 is nonzero */
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    __asm__ volatile(
        "movd %0, %%mm2 \n\t"
        "movd %1, %%mm3 \n\t"
        "movd %2, %%mm4 \n\t"
        "movd %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movd %%mm2, %0 \n\t"
        "movd %%mm3, %1 \n\t"
        "movd %%mm4, %2 \n\t"
        "movd %%mm5, %3 \n\t"
        :"+m"(*(uint32_t*)(dst+0*stride)),
         "+m"(*(uint32_t*)(dst+1*stride)),
         "+m"(*(uint32_t*)(dst+2*stride)),
         "+m"(*(uint32_t*)(dst+3*stride))
    );
}
00339
/* DC-only 8x8 IDCT + add: same positive/negative split-dc trick as
 * ff_h264_idct_dc_add_mmx2, applied to an 8x8 area in two 4-row passes. */
static void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
{
    int dc = (block[0] + 32) >> 6;
    int y;
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "pshufw $0, %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm0, %%mm0 \n\t"
        "packuswb %%mm1, %%mm1 \n\t"
        ::"r"(dc)
    );
    /* two iterations cover the 8 rows, 8 pixels per row */
    for(y=2; y--; dst += 4*stride){
        __asm__ volatile(
            "movq %0, %%mm2 \n\t"
            "movq %1, %%mm3 \n\t"
            "movq %2, %%mm4 \n\t"
            "movq %3, %%mm5 \n\t"
            "paddusb %%mm0, %%mm2 \n\t"
            "paddusb %%mm0, %%mm3 \n\t"
            "paddusb %%mm0, %%mm4 \n\t"
            "paddusb %%mm0, %%mm5 \n\t"
            "psubusb %%mm1, %%mm2 \n\t"
            "psubusb %%mm1, %%mm3 \n\t"
            "psubusb %%mm1, %%mm4 \n\t"
            "psubusb %%mm1, %%mm5 \n\t"
            "movq %%mm2, %0 \n\t"
            "movq %%mm3, %1 \n\t"
            "movq %%mm4, %2 \n\t"
            "movq %%mm5, %3 \n\t"
            :"+m"(*(uint64_t*)(dst+0*stride)),
             "+m"(*(uint64_t*)(dst+1*stride)),
             "+m"(*(uint64_t*)(dst+2*stride)),
             "+m"(*(uint64_t*)(dst+3*stride))
        );
    }
}
00378
00379
/* Maps a block index (0..15 = luma 4x4 blocks, 16..23 = chroma U then V)
 * to its position in the 8-entries-per-row non-zero-count cache (nnzc). */
static const uint8_t scan8[16 + 2*4]={
 4+1*8, 5+1*8, 4+2*8, 5+2*8,
 6+1*8, 7+1*8, 6+2*8, 7+2*8,
 4+3*8, 5+3*8, 4+4*8, 5+4*8,
 6+3*8, 7+3*8, 6+4*8, 7+4*8,
 1+1*8, 2+1*8,
 1+2*8, 2+2*8,
 1+4*8, 2+4*8,
 1+5*8, 2+5*8,
};
00390
00391 static void ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00392 int i;
00393 for(i=0; i<16; i++){
00394 if(nnzc[ scan8[i] ])
00395 ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
00396 }
00397 }
00398
00399 static void ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00400 int i;
00401 for(i=0; i<16; i+=4){
00402 if(nnzc[ scan8[i] ])
00403 ff_h264_idct8_add_mmx(dst + block_offset[i], block + i*16, stride);
00404 }
00405 }
00406
00407
00408 static void ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00409 int i;
00410 for(i=0; i<16; i++){
00411 int nnz = nnzc[ scan8[i] ];
00412 if(nnz){
00413 if(nnz==1 && block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
00414 else ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride);
00415 }
00416 }
00417 }
00418
00419 static void ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00420 int i;
00421 for(i=0; i<16; i++){
00422 if(nnzc[ scan8[i] ] || block[i*16])
00423 ff_h264_idct_add_mmx(dst + block_offset[i], block + i*16, stride);
00424 }
00425 }
00426
00427 static void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00428 int i;
00429 for(i=0; i<16; i++){
00430 if(nnzc[ scan8[i] ]) ff_h264_idct_add_mmx (dst + block_offset[i], block + i*16, stride);
00431 else if(block[i*16]) ff_h264_idct_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
00432 }
00433 }
00434
00435 static void ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00436 int i;
00437 for(i=0; i<16; i+=4){
00438 int nnz = nnzc[ scan8[i] ];
00439 if(nnz){
00440 if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
00441 else ff_h264_idct8_add_mmx (dst + block_offset[i], block + i*16, stride);
00442 }
00443 }
00444 }
00445
00446 static void ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00447 int i;
00448 for(i=0; i<16; i+=4){
00449 int nnz = nnzc[ scan8[i] ];
00450 if(nnz){
00451 if(nnz==1 && block[i*16]) ff_h264_idct8_dc_add_mmx2(dst + block_offset[i], block + i*16, stride);
00452 else ff_h264_idct8_add_sse2 (dst + block_offset[i], block + i*16, stride);
00453 }
00454 }
00455 }
00456
00457 static void ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00458 int i;
00459 for(i=16; i<16+8; i++){
00460 if(nnzc[ scan8[i] ] || block[i*16])
00461 ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
00462 }
00463 }
00464
00465 static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00466 int i;
00467 for(i=16; i<16+8; i++){
00468 if(nnzc[ scan8[i] ])
00469 ff_h264_idct_add_mmx (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
00470 else if(block[i*16])
00471 ff_h264_idct_dc_add_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
00472 }
00473 }
00474
00475 #if CONFIG_GPL && HAVE_YASM
/* Add the DC residuals of two horizontally adjacent 4x4 blocks
 * (block[0] and block[16]) to an 8x4 pixel area in one pass.
 * mm0 ends up with the positive parts of both dcs broadcast per half,
 * mm1 with the negative parts, for the usual paddusb/psubusb clamp-add. */
static void ff_h264_idct_dc_add8_mmx2(uint8_t *dst, int16_t *block, int stride)
{
    __asm__ volatile(
        "movd %0, %%mm0 \n\t"
        "punpcklwd %1, %%mm0 \n\t"
        /* both dcs: (coef + 32) >> 6 in parallel */
        "paddsw %2, %%mm0 \n\t"
        "psraw $6, %%mm0 \n\t"
        "punpcklwd %%mm0, %%mm0 \n\t"
        "pxor %%mm1, %%mm1 \n\t"
        "psubw %%mm0, %%mm1 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        /* split packed {+dc0,+dc1,-dc0,-dc1} into per-half broadcasts */
        "pshufw $0xFA, %%mm0, %%mm1 \n\t"
        "punpcklwd %%mm0, %%mm0 \n\t"
        ::"m"(block[ 0]),
          "m"(block[16]),
          "m"(ff_pw_32)
    );
    __asm__ volatile(
        "movq %0, %%mm2 \n\t"
        "movq %1, %%mm3 \n\t"
        "movq %2, %%mm4 \n\t"
        "movq %3, %%mm5 \n\t"
        "paddusb %%mm0, %%mm2 \n\t"
        "paddusb %%mm0, %%mm3 \n\t"
        "paddusb %%mm0, %%mm4 \n\t"
        "paddusb %%mm0, %%mm5 \n\t"
        "psubusb %%mm1, %%mm2 \n\t"
        "psubusb %%mm1, %%mm3 \n\t"
        "psubusb %%mm1, %%mm4 \n\t"
        "psubusb %%mm1, %%mm5 \n\t"
        "movq %%mm2, %0 \n\t"
        "movq %%mm3, %1 \n\t"
        "movq %%mm4, %2 \n\t"
        "movq %%mm5, %3 \n\t"
        :"+m"(*(uint64_t*)(dst+0*stride)),
         "+m"(*(uint64_t*)(dst+1*stride)),
         "+m"(*(uint64_t*)(dst+2*stride)),
         "+m"(*(uint64_t*)(dst+3*stride))
    );
}
00516
00517 extern void ff_x264_add8x4_idct_sse2(uint8_t *dst, int16_t *block, int stride);
00518
00519 static void ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00520 int i;
00521 for(i=0; i<16; i+=2)
00522 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
00523 ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride);
00524 }
00525
00526 static void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00527 int i;
00528 for(i=0; i<16; i+=2){
00529 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
00530 ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride);
00531 else if(block[i*16]|block[i*16+16])
00532 ff_h264_idct_dc_add8_mmx2(dst + block_offset[i], block + i*16, stride);
00533 }
00534 }
00535
00536 static void ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){
00537 int i;
00538 for(i=16; i<16+8; i+=2){
00539 if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ])
00540 ff_x264_add8x4_idct_sse2 (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
00541 else if(block[i*16]|block[i*16+16])
00542 ff_h264_idct_dc_add8_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
00543 }
00544 }
00545 #endif
00546
00547
00548
00549
00550
00551
/* Per byte: o = saturate(|x - y| - a), i.e. o is nonzero exactly where
 * |x-y| > a.  t is scratch; the two psubusb keep whichever difference
 * is positive (the other saturates to 0). */
#define DIFF_GT_MMX(x,y,a,o,t)\
    "movq "#y", "#t" \n\t"\
    "movq "#x", "#o" \n\t"\
    "psubusb "#x", "#t" \n\t"\
    "psubusb "#y", "#o" \n\t"\
    "por "#t", "#o" \n\t"\
    "psubusb "#a", "#o" \n\t"
00559
00560
00561
/* Per byte: o = 0xff where |x - y| <= a, else 0x00.  Exactly one of the
 * two saturated differences is nonzero; after subtracting a, they compare
 * equal (both 0) iff the absolute difference did not exceed a. */
#define DIFF_GT2_MMX(x,y,a,o,t)\
    "movq "#y", "#t" \n\t"\
    "movq "#x", "#o" \n\t"\
    "psubusb "#x", "#t" \n\t"\
    "psubusb "#y", "#o" \n\t"\
    "psubusb "#a", "#t" \n\t"\
    "psubusb "#a", "#o" \n\t"\
    "pcmpeqb "#t", "#o" \n\t"\
00570
00571
00572
00573
/* Build the deblock filter mask in mm7 from mm0=p1, mm1=p0, mm2=q0, mm3=q1:
 * mm7 = 0xff per byte where |p0-q0|<=alpha1 && |p1-p0|<=beta1 && |q1-q0|<=beta1.
 * Leaves the broadcast beta1 threshold in mm5 for later DIFF_GT2 uses;
 * mm4 and mm6 are clobbered. */
#define H264_DEBLOCK_MASK(alpha1, beta1) \
    "pshufw $0, "#alpha1", %%mm4 \n\t"\
    "pshufw $0, "#beta1 ", %%mm5 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm5, %%mm5 \n\t"\
    DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) \
    DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) \
    "por %%mm4, %%mm7 \n\t"\
    DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) \
    "por %%mm4, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    "pcmpeqb %%mm6, %%mm7 \n\t"
00586
00587
00588
00589
/* Compute the deblocked p0/q0 (mm1/mm2) from mm0=p1, mm1=p0, mm2=q0,
 * mm3=q1, with the per-byte delta limit in mm7 (tc mask) — the classic
 * pavgb formulation of delta = clip((q0-p0)*4 + (p1-q1) + 4) >> 3.
 * The delta magnitude is built in mm3/mm6 (biased around ff_pb_A1) and
 * applied via saturated add/sub.  mm4..mm6 are clobbered. */
#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
    "movq %%mm1 , %%mm5 \n\t"\
    "pxor %%mm2 , %%mm5 \n\t" \
    "pand "#pb_01" , %%mm5 \n\t" \
    "pcmpeqb %%mm4 , %%mm4 \n\t"\
    "pxor %%mm4 , %%mm3 \n\t"\
    "pavgb %%mm0 , %%mm3 \n\t" \
    "pavgb "MANGLE(ff_pb_3)" , %%mm3 \n\t" \
    "pxor %%mm1 , %%mm4 \n\t"\
    "pavgb %%mm2 , %%mm4 \n\t" \
    "pavgb %%mm5 , %%mm3 \n\t"\
    "paddusb %%mm4 , %%mm3 \n\t" \
    "movq "MANGLE(ff_pb_A1)" , %%mm6 \n\t"\
    "psubusb %%mm3 , %%mm6 \n\t"\
    "psubusb "MANGLE(ff_pb_A1)" , %%mm3 \n\t"\
    /* clamp both delta directions to the tc limit in mm7 */\
    "pminub %%mm7 , %%mm6 \n\t"\
    "pminub %%mm7 , %%mm3 \n\t"\
    "psubusb %%mm6 , %%mm1 \n\t"\
    "psubusb %%mm3 , %%mm2 \n\t"\
    "paddusb %%mm3 , %%mm1 \n\t"\
    "paddusb %%mm6 , %%mm2 \n\t"
00611
00612
00613
00614
/* Luma p1/q1 update: with mm1=p0 and mm2=q0 live, compute
 * clip(avg-based candidate, p1 - tc0, p1 + tc0) and store it to q1addr.
 * q2 enters holding the outer sample (reloaded from q2addr for the
 * rounding correction); tc0 and tmp are clobbered. */
#define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
    "movq %%mm1, "#tmp" \n\t"\
    "pavgb %%mm2, "#tmp" \n\t"\
    "pavgb "#tmp", "#q2" \n\t" \
    "pxor "q2addr", "#tmp" \n\t"\
    "pand %8, "#tmp" \n\t" \
    "psubusb "#tmp", "#q2" \n\t" \
    "movq "#p1", "#tmp" \n\t"\
    "psubusb "#tc0", "#tmp" \n\t"\
    "paddusb "#p1", "#tc0" \n\t"\
    "pmaxub "#tmp", "#q2" \n\t"\
    "pminub "#tc0", "#q2" \n\t"\
    "movq "#q2", "q1addr" \n\t"
00628
/* Normal (bs<4) luma deblock of one 8-pixel vertical edge.
 * pix points at the q0 row; register roles: mm0=p1, mm1=p0, mm2=q0, mm3=q1.
 * tmp0[0] caches the filter mask, tmp0[1] the broadcast tc0 bytes. */
static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
{
    DECLARE_ALIGNED_8(uint64_t, tmp0[2]);

    __asm__ volatile(
        "movq (%1,%3), %%mm0 \n\t"
        "movq (%1,%3,2), %%mm1 \n\t"
        "movq (%2), %%mm2 \n\t"
        "movq (%2,%3), %%mm3 \n\t"
        H264_DEBLOCK_MASK(%6, %7)

        /* broadcast the four tc0 bytes; mask out edges with tc0 < 0 */
        "movd %5, %%mm4 \n\t"
        "punpcklbw %%mm4, %%mm4 \n\t"
        "punpcklwd %%mm4, %%mm4 \n\t"
        "pcmpeqb %%mm3, %%mm3 \n\t"
        "movq %%mm4, %%mm6 \n\t"
        "pcmpgtb %%mm3, %%mm4 \n\t"
        "movq %%mm6, 8+%0 \n\t"
        "pand %%mm4, %%mm7 \n\t"
        "movq %%mm7, %0 \n\t"

        /* p1 update: condition |p2-p0| <= beta1, and bump tc by 1 where taken */
        "movq (%1), %%mm3 \n\t"
        DIFF_GT2_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4)
        "pand %%mm7, %%mm6 \n\t"
        "pand 8+%0, %%mm7 \n\t"
        "movq %%mm7, %%mm4 \n\t"
        "psubb %%mm6, %%mm7 \n\t"
        "pand %%mm4, %%mm6 \n\t"
        H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4)

        /* q1 update: condition |q2-q0| <= beta1, likewise bump tc */
        "movq (%2,%3,2), %%mm4 \n\t"
        DIFF_GT2_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3)
        "pand %0, %%mm6 \n\t"
        "movq 8+%0, %%mm5 \n\t"
        "pand %%mm6, %%mm5 \n\t"
        "psubb %%mm6, %%mm7 \n\t"
        "movq (%2,%3), %%mm3 \n\t"
        H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6)

        /* finally p0/q0, then store both */
        H264_DEBLOCK_P0_Q0(%8, unused)
        "movq %%mm1, (%1,%3,2) \n\t"
        "movq %%mm2, (%2) \n\t"

        : "=m"(*tmp0)
        : "r"(pix-3*stride), "r"(pix), "r"((x86_reg)stride),
          "m"(*tmp0), "m"(*(uint32_t*)tc0), "m"(alpha1), "m"(beta1),
          "m"(ff_bone)
    );
}
00681
00682 static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
00683 {
00684 if((tc0[0] & tc0[1]) >= 0)
00685 h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
00686 if((tc0[2] & tc0[3]) >= 0)
00687 h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
00688 }
00689 static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
00690 {
00691
00692
00693 DECLARE_ALIGNED_8(uint8_t, trans[8*8]);
00694 int i;
00695 for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
00696 if((tc0[0] & tc0[1]) < 0)
00697 continue;
00698 transpose4x4(trans, pix-4, 8, stride);
00699 transpose4x4(trans +4*8, pix, 8, stride);
00700 transpose4x4(trans+4, pix-4+4*stride, 8, stride);
00701 transpose4x4(trans+4+4*8, pix +4*stride, 8, stride);
00702 h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
00703 transpose4x4(pix-2, trans +2*8, stride, 8);
00704 transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
00705 }
00706 }
00707
/* Normal (bs<4) chroma deblock of one 8-pixel vertical edge.
 * pix points at the q0 row; mm0=p1, mm1=p0, mm2=q0, mm3=q1.
 * Only p0 and q0 are modified for chroma. */
static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
{
    __asm__ volatile(
        "movq (%0), %%mm0 \n\t"
        "movq (%0,%2), %%mm1 \n\t"
        "movq (%1), %%mm2 \n\t"
        "movq (%1,%2), %%mm3 \n\t"
        H264_DEBLOCK_MASK(%4, %5)
        /* broadcast tc0 per pixel pair and AND it into the mask */
        "movd %3, %%mm6 \n\t"
        "punpcklbw %%mm6, %%mm6 \n\t"
        "pand %%mm6, %%mm7 \n\t"
        H264_DEBLOCK_P0_Q0(%6, %7)
        "movq %%mm1, (%0,%2) \n\t"
        "movq %%mm2, (%1) \n\t"

        :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
           "r"(*(uint32_t*)tc0),
           "m"(alpha1), "m"(beta1), "m"(ff_bone), "m"(ff_pb_3F)
    );
}
00728
/* Vertical chroma deblock: thin wrapper converting alpha/beta to the
 * alpha-1/beta-1 thresholds the core filter expects. */
static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
}
00733
00734 static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
00735 {
00736
00737 DECLARE_ALIGNED_8(uint8_t, trans[8*4]);
00738 transpose4x4(trans, pix-2, 8, stride);
00739 transpose4x4(trans+4, pix-2+4*stride, 8, stride);
00740 h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
00741 transpose4x4(pix-2, trans, stride, 8);
00742 transpose4x4(pix-2+4*stride, trans+4, stride, 8);
00743 }
00744
00745
/* Intra (bs==4) chroma p0 update: p0 = avg(p1, avg(p0, q1) with rounding
 * correction), i.e. the (2*p1 + p0 + q1 + 2) >> 2 formula built from pavgb.
 * mm4 is scratch; `one` is a per-byte 0x01 constant. */
#define H264_FILTER_CHROMA4(p0, p1, q1, one) \
    "movq "#p0", %%mm4 \n\t"\
    "pxor "#q1", %%mm4 \n\t"\
    "pand "#one", %%mm4 \n\t" \
    "pavgb "#q1", "#p0" \n\t"\
    "psubusb %%mm4, "#p0" \n\t"\
    "pavgb "#p1", "#p0" \n\t" \
00753
/* Intra (bs==4) chroma deblock of one 8-pixel vertical edge.
 * mm0=p1, mm1=p0, mm2=q0, mm3=q1; new p0/q0 are computed unconditionally
 * and then blended with the originals (mm5/mm6) under the mask in mm7. */
static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
{
    __asm__ volatile(
        "movq (%0), %%mm0 \n\t"
        "movq (%0,%2), %%mm1 \n\t"
        "movq (%1), %%mm2 \n\t"
        "movq (%1,%2), %%mm3 \n\t"
        H264_DEBLOCK_MASK(%3, %4)
        /* keep originals for the masked blend */
        "movq %%mm1, %%mm5 \n\t"
        "movq %%mm2, %%mm6 \n\t"
        H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5)
        H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5)
        /* select filtered value only where the mask is set */
        "psubb %%mm5, %%mm1 \n\t"
        "psubb %%mm6, %%mm2 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "paddb %%mm5, %%mm1 \n\t"
        "paddb %%mm6, %%mm2 \n\t"
        "movq %%mm1, (%0,%2) \n\t"
        "movq %%mm2, (%1) \n\t"
        :: "r"(pix-2*stride), "r"(pix), "r"((x86_reg)stride),
           "m"(alpha1), "m"(beta1), "m"(ff_bone)
    );
}
00778
/* Vertical intra chroma deblock: wrapper converting alpha/beta to the
 * alpha-1/beta-1 thresholds the core filter expects. */
static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
}
00783
00784 static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
00785 {
00786
00787 DECLARE_ALIGNED_8(uint8_t, trans[8*4]);
00788 transpose4x4(trans, pix-2, 8, stride);
00789 transpose4x4(trans+4, pix-2+4*stride, 8, stride);
00790 h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
00791 transpose4x4(pix-2, trans, stride, 8);
00792 transpose4x4(pix-2+4*stride, trans+4, stride, 8);
00793 }
00794
/* Compute deblocking boundary strengths bS for all internal edges of a
 * macroblock from the nnz cache, reference indices and motion vectors.
 * Constant registers held across the loops: mm7=0, mm6=0x01 bytes,
 * mm5/mm4 = MV-difference limit constants (switched to the 3_1/7_3 pairs
 * for field macroblocks).  dir=1 handles vertical, dir=0 horizontal edges;
 * d_idx is the cache offset to the neighbouring block across the edge. */
static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
                                            int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
    int dir;
    __asm__ volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "movq %0, %%mm6 \n\t"
        "movq %1, %%mm5 \n\t"
        "movq %2, %%mm4 \n\t"
        ::"m"(ff_pb_1), "m"(ff_pb_3), "m"(ff_pb_7)
    );
    if(field)
        /* field MBs use different vertical-MV thresholds */
        __asm__ volatile(
            "movq %0, %%mm5 \n\t"
            "movq %1, %%mm4 \n\t"
            ::"m"(ff_pb_3_1), "m"(ff_pb_7_3)
        );

    for( dir=1; dir>=0; dir-- ) {
        const int d_idx = dir ? -8 : -1;
        const int mask_mv = dir ? mask_mv1 : mask_mv0;
        DECLARE_ALIGNED_8(const uint64_t, mask_dir) = dir ? 0 : 0xffffffffffffffffULL;
        int b_idx, edge, l;
        for( b_idx=12, edge=0; edge<edges; edge+=step, b_idx+=8*step ) {
            /* for horizontal edges, clear the MV/ref result carried in mm0 */
            __asm__ volatile(
                "pand %0, %%mm0 \n\t"
                ::"m"(mask_dir)
            );
            if(!(mask_mv & edge)) {
                __asm__ volatile("pxor %%mm0, %%mm0 \n\t":);
                for( l = bidir; l >= 0; l-- ) {
                    __asm__ volatile(
                        /* ref mismatch across the edge -> candidate bS=1 */
                        "movd %0, %%mm1 \n\t"
                        "punpckldq %1, %%mm1 \n\t"
                        "movq %%mm1, %%mm2 \n\t"
                        "psrlw $7, %%mm2 \n\t"
                        "pand %%mm6, %%mm2 \n\t"
                        "por %%mm2, %%mm1 \n\t"
                        "punpckldq %%mm1, %%mm2 \n\t"
                        "pcmpeqb %%mm2, %%mm1 \n\t"
                        "paddb %%mm6, %%mm1 \n\t"
                        "punpckhbw %%mm7, %%mm1 \n\t"
                        "por %%mm1, %%mm0 \n\t"

                        /* |mv difference| >= 4 (or field limits) -> bS=1 */
                        "movq %2, %%mm1 \n\t"
                        "movq %3, %%mm2 \n\t"
                        "psubw %4, %%mm1 \n\t"
                        "psubw %5, %%mm2 \n\t"
                        "packsswb %%mm2, %%mm1 \n\t"
                        "paddb %%mm5, %%mm1 \n\t"
                        "pminub %%mm4, %%mm1 \n\t"
                        "pcmpeqb %%mm4, %%mm1 \n\t"
                        "por %%mm1, %%mm0 \n\t"
                        ::"m"(ref[l][b_idx]),
                          "m"(ref[l][b_idx+d_idx]),
                          "m"(mv[l][b_idx][0]),
                          "m"(mv[l][b_idx+2][0]),
                          "m"(mv[l][b_idx+d_idx][0]),
                          "m"(mv[l][b_idx+d_idx+2][0])
                    );
                }
            }
            /* nonzero coefficients on either side -> bS=2 (mm1) */
            __asm__ volatile(
                "movd %0, %%mm1 \n\t"
                "por %1, %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "pcmpgtw %%mm7, %%mm1 \n\t"
                ::"m"(nnz[b_idx]),
                  "m"(nnz[b_idx+d_idx])
            );
            /* combine: bS = nnz ? 2 : (mv/ref flag ? 1 : 0).  The doubled
             * pcmpeqw is intentional: compare-with-zero twice turns the
             * accumulated nonzero bytes into an all-ones "true" mask. */
            __asm__ volatile(
                "pcmpeqw %%mm7, %%mm0 \n\t"
                "pcmpeqw %%mm7, %%mm0 \n\t"
                "psrlw $15, %%mm0 \n\t"
                "psrlw $14, %%mm1 \n\t"
                "movq %%mm0, %%mm2 \n\t"
                "por %%mm1, %%mm2 \n\t"
                "psrlw $1, %%mm1 \n\t"
                "pandn %%mm2, %%mm1 \n\t"
                "movq %%mm1, %0 \n\t"
                :"=m"(*bS[dir][edge])
                ::"memory"
            );
        }
        edges = 4;
        step = 1;
    }
    /* the horizontal-direction results were produced column-wise;
     * transpose bS[0] into row order */
    __asm__ volatile(
        "movq (%0), %%mm0 \n\t"
        "movq 8(%0), %%mm1 \n\t"
        "movq 16(%0), %%mm2 \n\t"
        "movq 24(%0), %%mm3 \n\t"
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
        "movq %%mm0, (%0) \n\t"
        "movq %%mm3, 8(%0) \n\t"
        "movq %%mm4, 16(%0) \n\t"
        "movq %%mm2, 24(%0) \n\t"
        ::"r"(bS[0])
        :"memory"
    );
}
00897
00898
00899
00900
/* One output row of the 6-tap vertical qpel filter:
 * T = (20*(C+D) - 5*(B+E) + A + F + 16) >> 5, packed and written to (%1)
 * via OP.  A..E are the previous source rows (packed words), F is loaded
 * fresh from (%0) and widened through zero register Z.  %4/%5 are expected
 * to be the pw_5/pw_16 constants; advances src (%0) and dst (%1) pointers.
 * T is the scratch register, d/q select movd/movq vs movq/movdqa widths. */
#define QPEL_H264V_MM(A,B,C,D,E,F,OP,T,Z,d,q)\
    "mov"#q" "#C", "#T" \n\t"\
    "mov"#d" (%0), "#F" \n\t"\
    "paddw "#D", "#T" \n\t"\
    "psllw $2, "#T" \n\t"\
    "psubw "#B", "#T" \n\t"\
    "psubw "#E", "#T" \n\t"\
    "punpcklbw "#Z", "#F" \n\t"\
    "pmullw %4, "#T" \n\t"\
    "paddw %5, "#A" \n\t"\
    "add %2, %0 \n\t"\
    "paddw "#F", "#A" \n\t"\
    "paddw "#A", "#T" \n\t"\
    "psraw $5, "#T" \n\t"\
    "packuswb "#T", "#T" \n\t"\
    OP(T, (%1), A, d)\
    "add %3, %1 \n\t"
00918
/* Same 6-tap vertical filter as QPEL_H264V_MM but for the HV (2-D) case:
 * the unrounded 16-bit intermediate T = 20*(C+D) - 5*(B+E) + A + F (+ the
 * %4 bias) is stored at offset OF in the tmp buffer (%1) for the later
 * horizontal pass; no shift/pack here.  %3 is presumably the pw_5 constant
 * and %4 the rounding bias — confirm against the (off-screen) callers. */
#define QPEL_H264HV_MM(A,B,C,D,E,F,OF,T,Z,d,q)\
    "mov"#q" "#C", "#T" \n\t"\
    "mov"#d" (%0), "#F" \n\t"\
    "paddw "#D", "#T" \n\t"\
    "psllw $2, "#T" \n\t"\
    "paddw %4, "#A" \n\t"\
    "psubw "#B", "#T" \n\t"\
    "psubw "#E", "#T" \n\t"\
    "punpcklbw "#Z", "#F" \n\t"\
    "pmullw %3, "#T" \n\t"\
    "paddw "#F", "#A" \n\t"\
    "add %2, %0 \n\t"\
    "paddw "#A", "#T" \n\t"\
    "mov"#q" "#T", "#OF"(%1) \n\t"
00933
/* Width-specific instantiations of the vertical/HV qpel rows:
 * MMX versions use mm6 as scratch, mm7 as zero, movd/movq;
 * XMM versions use xmm6/xmm7 and movq/movdqa. */
#define QPEL_H264V(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%mm6,%%mm7,d,q)
#define QPEL_H264HV(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%mm6,%%mm7,d,q)
#define QPEL_H264V_XMM(A,B,C,D,E,F,OP) QPEL_H264V_MM(A,B,C,D,E,F,OP,%%xmm6,%%xmm7,q,dqa)
#define QPEL_H264HV_XMM(A,B,C,D,E,F,OF) QPEL_H264HV_MM(A,B,C,D,E,F,OF,%%xmm6,%%xmm7,q,dqa)
00938
00939
00940 #define QPEL_H264(OPNAME, OP, MMX)\
00941 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
00942 int h=4;\
00943 \
00944 __asm__ volatile(\
00945 "pxor %%mm7, %%mm7 \n\t"\
00946 "movq %5, %%mm4 \n\t"\
00947 "movq %6, %%mm5 \n\t"\
00948 "1: \n\t"\
00949 "movd -1(%0), %%mm1 \n\t"\
00950 "movd (%0), %%mm2 \n\t"\
00951 "movd 1(%0), %%mm3 \n\t"\
00952 "movd 2(%0), %%mm0 \n\t"\
00953 "punpcklbw %%mm7, %%mm1 \n\t"\
00954 "punpcklbw %%mm7, %%mm2 \n\t"\
00955 "punpcklbw %%mm7, %%mm3 \n\t"\
00956 "punpcklbw %%mm7, %%mm0 \n\t"\
00957 "paddw %%mm0, %%mm1 \n\t"\
00958 "paddw %%mm3, %%mm2 \n\t"\
00959 "movd -2(%0), %%mm0 \n\t"\
00960 "movd 3(%0), %%mm3 \n\t"\
00961 "punpcklbw %%mm7, %%mm0 \n\t"\
00962 "punpcklbw %%mm7, %%mm3 \n\t"\
00963 "paddw %%mm3, %%mm0 \n\t"\
00964 "psllw $2, %%mm2 \n\t"\
00965 "psubw %%mm1, %%mm2 \n\t"\
00966 "pmullw %%mm4, %%mm2 \n\t"\
00967 "paddw %%mm5, %%mm0 \n\t"\
00968 "paddw %%mm2, %%mm0 \n\t"\
00969 "psraw $5, %%mm0 \n\t"\
00970 "packuswb %%mm0, %%mm0 \n\t"\
00971 OP(%%mm0, (%1),%%mm6, d)\
00972 "add %3, %0 \n\t"\
00973 "add %4, %1 \n\t"\
00974 "decl %2 \n\t"\
00975 " jnz 1b \n\t"\
00976 : "+a"(src), "+c"(dst), "+g"(h)\
00977 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
00978 : "memory"\
00979 );\
00980 }\
00981 static av_noinline void OPNAME ## h264_qpel4_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
00982 int h=4;\
00983 __asm__ volatile(\
00984 "pxor %%mm7, %%mm7 \n\t"\
00985 "movq %0, %%mm4 \n\t"\
00986 "movq %1, %%mm5 \n\t"\
00987 :: "m"(ff_pw_5), "m"(ff_pw_16)\
00988 );\
00989 do{\
00990 __asm__ volatile(\
00991 "movd -1(%0), %%mm1 \n\t"\
00992 "movd (%0), %%mm2 \n\t"\
00993 "movd 1(%0), %%mm3 \n\t"\
00994 "movd 2(%0), %%mm0 \n\t"\
00995 "punpcklbw %%mm7, %%mm1 \n\t"\
00996 "punpcklbw %%mm7, %%mm2 \n\t"\
00997 "punpcklbw %%mm7, %%mm3 \n\t"\
00998 "punpcklbw %%mm7, %%mm0 \n\t"\
00999 "paddw %%mm0, %%mm1 \n\t"\
01000 "paddw %%mm3, %%mm2 \n\t"\
01001 "movd -2(%0), %%mm0 \n\t"\
01002 "movd 3(%0), %%mm3 \n\t"\
01003 "punpcklbw %%mm7, %%mm0 \n\t"\
01004 "punpcklbw %%mm7, %%mm3 \n\t"\
01005 "paddw %%mm3, %%mm0 \n\t"\
01006 "psllw $2, %%mm2 \n\t"\
01007 "psubw %%mm1, %%mm2 \n\t"\
01008 "pmullw %%mm4, %%mm2 \n\t"\
01009 "paddw %%mm5, %%mm0 \n\t"\
01010 "paddw %%mm2, %%mm0 \n\t"\
01011 "movd (%2), %%mm3 \n\t"\
01012 "psraw $5, %%mm0 \n\t"\
01013 "packuswb %%mm0, %%mm0 \n\t"\
01014 PAVGB" %%mm3, %%mm0 \n\t"\
01015 OP(%%mm0, (%1),%%mm6, d)\
01016 "add %4, %0 \n\t"\
01017 "add %4, %1 \n\t"\
01018 "add %3, %2 \n\t"\
01019 : "+a"(src), "+c"(dst), "+d"(src2)\
01020 : "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride)\
01021 : "memory"\
01022 );\
01023 }while(--h);\
01024 }\
/* 4x4 vertical 6-tap qpel lowpass. Loads the 5 rows above/at the start */\
/* position into mm0..mm4, then each QPEL_H264V step shifts a new row in */\
/* and emits one filtered output row via OP. Register state rotates through */\
/* the six-register window across the four invocations. */\
static av_noinline void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
src -= 2*srcStride;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
/* 4x4 2D (horizontal+vertical) qpel lowpass. Pass 1 filters vertically in */\
/* 4-column strips (3 strips covering the 4+5 extra columns the horizontal */\
/* tap needs) into the int16_t tmp buffer; rows in tmp are 24 bytes apart */\
/* (the 1*8*3 offsets and the final "add $24"). Pass 2 filters tmp */\
/* horizontally and stores with a final >>6. */\
/* NOTE(review): the tmpStride parameter is never referenced; the tmp row */\
/* pitch is hardcoded to 24 bytes here. */\
static av_noinline void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
int h=4;\
int w=3;\
src -= 2*srcStride+2;\
while(w--){\
/* vertical pass for one 4-wide strip */\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
\
: "+a"(src)\
: "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
tmp += 4;\
src += 4 - 9*srcStride;\
}\
tmp -= 3*4;\
/* horizontal pass over tmp: 6-tap combine of word pairs (t0+t5, t1+t4, */\
/* t2+t3) using shift/sub instead of multiplies, final round is >>6 */\
__asm__ volatile(\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"paddw 10(%0), %%mm0 \n\t"\
"movq 2(%0), %%mm1 \n\t"\
"paddw 8(%0), %%mm1 \n\t"\
"movq 4(%0), %%mm2 \n\t"\
"paddw 6(%0), %%mm2 \n\t"\
"psubw %%mm1, %%mm0 \n\t"\
"psraw $2, %%mm0 \n\t"\
"psubw %%mm1, %%mm0 \n\t"\
"paddsw %%mm2, %%mm0 \n\t"\
"psraw $2, %%mm0 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"psraw $6, %%mm0 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm7, d)\
"add $24, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: "memory"\
);\
}\
01115 \
/* 8xH horizontal 6-tap qpel lowpass (h=8 rows). Processes a full 8-pixel */\
/* row per loop iteration as two 4-word halves (low in mm0, high in mm1): */\
/* (20*(s0+s1) - 5*(s-1+s2) + (s-2+s3) + 16) >> 5, clipped by packuswb. */\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
int h=8;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq %5, %%mm6 \n\t"\
"1: \n\t"\
/* center taps: s[0]+s[1], scaled by 4 (the *5 below makes it *20 total) */\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"psllw $2, %%mm0 \n\t"\
"psllw $2, %%mm1 \n\t"\
/* inner negative taps: s[-1]+s[2] */\
"movq -1(%0), %%mm2 \n\t"\
"movq 2(%0), %%mm4 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm4, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm7, %%mm5 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
"pmullw %%mm6, %%mm0 \n\t"\
"pmullw %%mm6, %%mm1 \n\t"\
/* outer taps s[-2], s[7..] plus the +16 rounding constant (ff_pw_16) */\
"movd -2(%0), %%mm2 \n\t"\
"movd 7(%0), %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm5 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"movq %6, %%mm5 \n\t"\
"paddw %%mm5, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm4, %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm5, q)\
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+g"(h)\
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
01172 \
/* 8x8 horizontal 6-tap qpel lowpass averaged with a second prediction */\
/* (src2). Same filter as the plain _h_lowpass variant, with a pavgb */\
/* against src2 inserted before the OP store. src and dst both advance */\
/* by dstStride (%4); src2 by src2Stride (%3). */\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=8;\
__asm__ volatile(\
/* mm6 = ff_pw_5 loaded once; mm7 stays zero across iterations */\
"pxor %%mm7, %%mm7 \n\t"\
"movq %0, %%mm6 \n\t"\
:: "m"(ff_pw_5)\
);\
do{\
__asm__ volatile(\
/* 20*(s0+s1): low half in mm0, high half in mm1 */\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"psllw $2, %%mm0 \n\t"\
"psllw $2, %%mm1 \n\t"\
/* -5*(s-1+s2) */\
"movq -1(%0), %%mm2 \n\t"\
"movq 2(%0), %%mm4 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm4, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm7, %%mm5 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
"pmullw %%mm6, %%mm0 \n\t"\
"pmullw %%mm6, %%mm1 \n\t"\
/* + (s-2+s3) + 16, then >>5, clip, average with src2, store */\
"movd -2(%0), %%mm2 \n\t"\
"movd 7(%0), %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm5 \n\t"\
"paddw %%mm3, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"movq %5, %%mm5 \n\t"\
"paddw %%mm5, %%mm2 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm4, %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"movq (%2), %%mm4 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
PAVGB" %%mm4, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm5, q)\
"add %4, %0 \n\t"\
"add %4, %1 \n\t"\
"add %3, %2 \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2)\
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
"m"(ff_pw_16)\
: "memory"\
);\
}while(--h);\
}\
01235 \
/* 8-wide vertical 6-tap qpel lowpass for h==8 or h==16 rows, done as two */\
/* 4-wide column strips (w==2). The five seed rows live in mm0..mm4; each */\
/* QPEL_H264V rotates a new row through the register window and emits one */\
/* output row. The second asm block continues from the register state the */\
/* first one left behind, so the two statements must stay adjacent. */\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
int w= 2;\
src -= 2*srcStride;\
\
while(w--){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
if(h==16){\
/* eight more rows, continuing the rotated-register sequence */\
__asm__ volatile(\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
/* step to the next 4-wide strip */\
src += 4-(h+5)*srcStride;\
dst += 4-h*dstStride;\
}\
}\
/* First (vertical) pass of the 2D qpel lowpass for 8x8 or 16x16: filters */\
/* size+5 source columns in 4-wide strips into the int16_t tmp buffer. */\
/* tmp rows are 48 bytes (24 words) apart — see the n*48 offsets. */\
/* The second asm block continues the rotated mm-register state of the */\
/* first, so the two statements must stay adjacent. */\
/* NOTE(review): tmpStride is not referenced; tmp pitch is hardcoded to 48. */\
static av_always_inline void OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
int w = (size+8)>>2;\
src -= 2*srcStride+2;\
while(w--){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*48)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*48)\
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*48)\
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*48)\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*48)\
: "+a"(src)\
: "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
if(size==16){\
__asm__ volatile(\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1,  8*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2,  9*48)\
QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 10*48)\
QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 11*48)\
QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 12*48)\
QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 13*48)\
QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 14*48)\
QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 15*48)\
: "+a"(src)\
: "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
tmp += 4;\
src += 4 - (size+5)*srcStride;\
}\
}\
/* Second (horizontal) pass of the 2D qpel lowpass: filters the int16_t */\
/* tmp rows (48-byte pitch, hence "add $48") horizontally in 8-pixel */\
/* columns (w = size>>4 extra column passes for 16-wide). The 6-tap */\
/* combine uses the shift/sub identity instead of multiplies; final */\
/* rounding is >>6, clipped by packuswb, stored via OP. */\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
int w = size>>4;\
do{\
int h = size;\
__asm__ volatile(\
"1: \n\t"\
/* mm0/mm3: t0+t5 for low/high 4 words; mm1/mm4: t1+t4; mm2/mm5: t2+t3 */\
"movq (%0), %%mm0 \n\t"\
"movq 8(%0), %%mm3 \n\t"\
"movq 2(%0), %%mm1 \n\t"\
"movq 10(%0), %%mm4 \n\t"\
"paddw %%mm4, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"paddw 18(%0), %%mm3 \n\t"\
"paddw 16(%0), %%mm4 \n\t"\
"movq 4(%0), %%mm2 \n\t"\
"movq 12(%0), %%mm5 \n\t"\
"paddw 6(%0), %%mm2 \n\t"\
"paddw 14(%0), %%mm5 \n\t"\
"psubw %%mm1, %%mm0 \n\t"\
"psubw %%mm4, %%mm3 \n\t"\
"psraw $2, %%mm0 \n\t"\
"psraw $2, %%mm3 \n\t"\
"psubw %%mm1, %%mm0 \n\t"\
"psubw %%mm4, %%mm3 \n\t"\
"paddsw %%mm2, %%mm0 \n\t"\
"paddsw %%mm5, %%mm3 \n\t"\
"psraw $2, %%mm0 \n\t"\
"psraw $2, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm5, %%mm3 \n\t"\
"psraw $6, %%mm0 \n\t"\
"psraw $6, %%mm3 \n\t"\
"packuswb %%mm3, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm7, q)\
"add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: "memory"\
);\
tmp += 8 - size*24;\
dst += 8 - size*dstStride;\
}while(w--);\
}\
01389 \
01390 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01391 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
01392 }\
01393 static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01394 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
01395 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
01396 }\
01397 \
01398 static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01399 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
01400 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
01401 src += 8*srcStride;\
01402 dst += 8*dstStride;\
01403 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
01404 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
01405 }\
01406 \
01407 static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
01408 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
01409 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
01410 src += 8*dstStride;\
01411 dst += 8*dstStride;\
01412 src2 += 8*src2Stride;\
01413 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
01414 OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
01415 }\
01416 \
/* Full 2D qpel lowpass: vertical pass into tmp (always the put_ hv1 — the */\
/* intermediate is plain stores), then horizontal pass tmp -> dst with OP. */\
/* The hv1 call must precede hv2 (hv2 consumes tmp). */\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
/* 8x8 convenience wrapper. */\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
}\
\
/* 16x16 convenience wrapper. */\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
}\
01428 \
/* Average 4-wide rows of >>5-scaled int16 data (src16, 48-byte row pitch: */\
/* offsets 0/24/48/72 are four consecutive rows) with uint8 rows from src8, */\
/* storing via OP. Four rows are fully unrolled; h is unused here. */\
/* Used to combine a horizontal and a vertical half-pel prediction. */\
static av_noinline void OPNAME ## pixels4_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
__asm__ volatile(\
/* rows 0 and 1 */\
"movq (%1), %%mm0 \n\t"\
"movq 24(%1), %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
"packuswb %%mm1, %%mm1 \n\t"\
PAVGB" (%0), %%mm0 \n\t"\
PAVGB" (%0,%3), %%mm1 \n\t"\
OP(%%mm0, (%2), %%mm4, d)\
OP(%%mm1, (%2,%4), %%mm5, d)\
"lea (%0,%3,2), %0 \n\t"\
"lea (%2,%4,2), %2 \n\t"\
/* rows 2 and 3 */\
"movq 48(%1), %%mm0 \n\t"\
"movq 72(%1), %%mm1 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"packuswb %%mm0, %%mm0 \n\t"\
"packuswb %%mm1, %%mm1 \n\t"\
PAVGB" (%0), %%mm0 \n\t"\
PAVGB" (%0,%3), %%mm1 \n\t"\
OP(%%mm0, (%2), %%mm4, d)\
OP(%%mm1, (%2,%4), %%mm5, d)\
:"+a"(src8), "+c"(src16), "+d"(dst)\
:"S"((x86_reg)src8Stride), "D"((x86_reg)dstStride)\
:"memory");\
}\
/* Average 8-wide rows of >>5-scaled int16 data (src16, 48-byte / 24-word */\
/* row pitch) with uint8 rows from src8, storing via OP. Processes two */\
/* rows per iteration; h must be even. */\
static av_noinline void OPNAME ## pixels8_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
do{\
__asm__ volatile(\
/* row n in mm0:mm1, row n+1 (48 bytes on) in mm2:mm3 */\
"movq (%1), %%mm0 \n\t"\
"movq 8(%1), %%mm1 \n\t"\
"movq 48(%1), %%mm2 \n\t"\
"movq 8+48(%1), %%mm3 \n\t"\
"psraw $5, %%mm0 \n\t"\
"psraw $5, %%mm1 \n\t"\
"psraw $5, %%mm2 \n\t"\
"psraw $5, %%mm3 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
"packuswb %%mm3, %%mm2 \n\t"\
PAVGB" (%0), %%mm0 \n\t"\
PAVGB" (%0,%3), %%mm2 \n\t"\
OP(%%mm0, (%2), %%mm5, q)\
OP(%%mm2, (%2,%4), %%mm5, q)\
::"a"(src8), "c"(src16), "d"(dst),\
"r"((x86_reg)src8Stride), "r"((x86_reg)dstStride)\
:"memory");\
src8 += 2L*src8Stride;\
src16 += 48;\
dst += 2L*dstStride;\
}while(h-=2);\
}\
01484 static void OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
01485 {\
01486 OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
01487 OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
01488 }\
01489
01490
01491 #if ARCH_X86_64
/* x86_64-only full-width 16-pixel horizontal qpel+l2 filter: uses the 16 */\
/* SSE registers (xmm8-xmm15 need x86_64) to process a whole 16-pixel row */\
/* per loop iteration, building the shifted tap windows with SSSE3 palignr */\
/* instead of unaligned reloads, then averaging with src2 via pavgb. */\
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=16;\
__asm__ volatile(\
/* xmm15 = zero, xmm14 = ff_pw_5, xmm13 = ff_pw_16, loop-invariant */\
"pxor %%xmm15, %%xmm15 \n\t"\
"movdqa %6, %%xmm14 \n\t"\
"movdqa %7, %%xmm13 \n\t"\
"1: \n\t"\
/* load src[3..18] and src[-5..10]; unpack to words in xmm0/xmm1/xmm7 */\
"lddqu 3(%0), %%xmm1 \n\t"\
"lddqu -5(%0), %%xmm7 \n\t"\
"movdqa %%xmm1, %%xmm0 \n\t"\
"punpckhbw %%xmm15, %%xmm1 \n\t"\
"punpcklbw %%xmm15, %%xmm0 \n\t"\
"punpcklbw %%xmm15, %%xmm7 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm0, %%xmm6 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm0, %%xmm8 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm0, %%xmm9 \n\t"\
"movdqa %%xmm1, %%xmm5 \n\t"\
"movdqa %%xmm0, %%xmm10 \n\t"\
/* palignr $2k builds the window shifted by k words for each tap pair */\
"palignr $6, %%xmm0, %%xmm5 \n\t"\
"palignr $6, %%xmm7, %%xmm10\n\t"\
"palignr $8, %%xmm0, %%xmm4 \n\t"\
"palignr $8, %%xmm7, %%xmm9 \n\t"\
"palignr $10,%%xmm0, %%xmm3 \n\t"\
"palignr $10,%%xmm7, %%xmm8 \n\t"\
"paddw %%xmm1, %%xmm5 \n\t"\
"paddw %%xmm0, %%xmm10 \n\t"\
"palignr $12,%%xmm0, %%xmm2 \n\t"\
"palignr $12,%%xmm7, %%xmm6 \n\t"\
"palignr $14,%%xmm0, %%xmm1 \n\t"\
"palignr $14,%%xmm7, %%xmm0 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"paddw %%xmm8, %%xmm6 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"paddw %%xmm9, %%xmm0 \n\t"\
/* 20*center - 5*inner + outer + 16, >>5, clip, average with src2 */\
"psllw $2, %%xmm2 \n\t"\
"psllw $2, %%xmm6 \n\t"\
"psubw %%xmm1, %%xmm2 \n\t"\
"psubw %%xmm0, %%xmm6 \n\t"\
"paddw %%xmm13,%%xmm5 \n\t"\
"paddw %%xmm13,%%xmm10 \n\t"\
"pmullw %%xmm14,%%xmm2 \n\t"\
"pmullw %%xmm14,%%xmm6 \n\t"\
"lddqu (%2), %%xmm3 \n\t"\
"paddw %%xmm5, %%xmm2 \n\t"\
"paddw %%xmm10,%%xmm6 \n\t"\
"psraw $5, %%xmm2 \n\t"\
"psraw $5, %%xmm6 \n\t"\
"packuswb %%xmm2,%%xmm6 \n\t"\
"pavgb %%xmm3, %%xmm6 \n\t"\
OP(%%xmm6, (%1), %%xmm4, dqa)\
"add %5, %0 \n\t"\
"add %5, %1 \n\t"\
"add %4, %2 \n\t"\
"decl %3 \n\t"\
"jg 1b \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2), "+g"(h)\
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
"m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}
01557 #else // ARCH_X86_64
/* 32-bit fallback: not enough xmm registers for the full-width kernel, so
 * split the 16x16 block into four independent 8x8 quadrant calls with
 * explicit offsets. NOTE: src rows advance by dstStride (src and dst share
 * a stride in the l2 variants), src2 by src2Stride. */
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
    uint8_t *dst_lo  = dst  + 8*dstStride;\
    uint8_t *src_lo  = src  + 8*dstStride;\
    uint8_t *src2_lo = src2 + 8*src2Stride;\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8,    src+8,    src2+8,    dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst,      src,      src2,      dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst_lo+8, src_lo+8, src2_lo+8, dstStride, src2Stride);\
    OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst_lo,   src_lo,   src2_lo,   dstStride, src2Stride);\
}
01568 #endif // ARCH_X86_64
01569
/* SSE/SSSE3 horizontal qpel filters. qpel8_h_lowpass_l2: one 8-pixel row */\
/* per iteration; a single unaligned 16-byte load at src-5 provides all six */\
/* taps, with the shifted windows built via SSSE3 palignr. Result is */\
/* averaged with src2 (pavgb) before the OP store. src and dst advance by */\
/* dstStride, src2 by src2Stride. */\
#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
int h=8;\
__asm__ volatile(\
/* xmm7 = zero, xmm6 = ff_pw_5, loop-invariant */\
"pxor %%xmm7, %%xmm7 \n\t"\
"movdqa %0, %%xmm6 \n\t"\
:: "m"(ff_pw_5)\
);\
do{\
__asm__ volatile(\
"lddqu -5(%0), %%xmm1 \n\t"\
"movdqa %%xmm1, %%xmm0 \n\t"\
"punpckhbw %%xmm7, %%xmm1 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm1, %%xmm5 \n\t"\
/* shifted word windows for the tap pairs */\
"palignr $6, %%xmm0, %%xmm5 \n\t"\
"palignr $8, %%xmm0, %%xmm4 \n\t"\
"palignr $10,%%xmm0, %%xmm3 \n\t"\
"paddw %%xmm1, %%xmm5 \n\t"\
"palignr $12,%%xmm0, %%xmm2 \n\t"\
"palignr $14,%%xmm0, %%xmm1 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"psllw $2, %%xmm2 \n\t"\
"movq (%2), %%xmm3 \n\t"\
"psubw %%xmm1, %%xmm2 \n\t"\
"paddw %5, %%xmm5 \n\t"\
"pmullw %%xmm6, %%xmm2 \n\t"\
"paddw %%xmm5, %%xmm2 \n\t"\
"psraw $5, %%xmm2 \n\t"\
"packuswb %%xmm2, %%xmm2 \n\t"\
"pavgb %%xmm3, %%xmm2 \n\t"\
OP(%%xmm2, (%1), %%xmm4, q)\
"add %4, %0 \n\t"\
"add %4, %1 \n\t"\
"add %3, %2 \n\t"\
: "+a"(src), "+c"(dst), "+d"(src2)\
: "D"((x86_reg)src2Stride), "S"((x86_reg)dstStride),\
"m"(ff_pw_16)\
: "memory"\
);\
}while(--h);\
}\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
\
/* 8x8 horizontal 6-tap qpel lowpass (SSE register width, SSSE3 palignr): */\
/* one unaligned load per row supplies all taps; same arithmetic as the */\
/* MMX version, (20*(s0+s1) - 5*(s-1+s2) + (s-2+s3) + 16) >> 5. */\
static av_noinline void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
int h=8;\
__asm__ volatile(\
"pxor %%xmm7, %%xmm7 \n\t"\
"movdqa %5, %%xmm6 \n\t"\
"1: \n\t"\
"lddqu -5(%0), %%xmm1 \n\t"\
"movdqa %%xmm1, %%xmm0 \n\t"\
"punpckhbw %%xmm7, %%xmm1 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm1, %%xmm5 \n\t"\
"palignr $6, %%xmm0, %%xmm5 \n\t"\
"palignr $8, %%xmm0, %%xmm4 \n\t"\
"palignr $10,%%xmm0, %%xmm3 \n\t"\
"paddw %%xmm1, %%xmm5 \n\t"\
"palignr $12,%%xmm0, %%xmm2 \n\t"\
"palignr $14,%%xmm0, %%xmm1 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"psllw $2, %%xmm2 \n\t"\
"psubw %%xmm1, %%xmm2 \n\t"\
"paddw %6, %%xmm5 \n\t"\
"pmullw %%xmm6, %%xmm2 \n\t"\
"paddw %%xmm5, %%xmm2 \n\t"\
"psraw $5, %%xmm2 \n\t"\
"packuswb %%xmm2, %%xmm2 \n\t"\
OP(%%xmm2, (%1), %%xmm4, q)\
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+g"(h)\
: "D"((x86_reg)srcStride), "S"((x86_reg)dstStride),\
"m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
01658 static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01659 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
01660 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
01661 src += 8*srcStride;\
01662 dst += 8*dstStride;\
01663 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
01664 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
01665 }\
01666
/* 8-wide vertical 6-tap qpel lowpass (SSE registers), h==8 or h==16. */\
/* Five seed rows load into xmm0..xmm4; each QPEL_H264V_XMM rotates a new */\
/* row through the window and emits one output row. The second asm block */\
/* continues the register state of the first, so they must stay adjacent. */\
#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
src -= 2*srcStride;\
\
__asm__ volatile(\
"pxor %%xmm7, %%xmm7 \n\t"\
"movq (%0), %%xmm0 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm1 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm2 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm3 \n\t"\
"add %2, %0 \n\t"\
"movq (%0), %%xmm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"punpcklbw %%xmm7, %%xmm1 \n\t"\
"punpcklbw %%xmm7, %%xmm2 \n\t"\
"punpcklbw %%xmm7, %%xmm3 \n\t"\
"punpcklbw %%xmm7, %%xmm4 \n\t"\
QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
if(h==16){\
/* eight more rows, continuing the rotated-register sequence */\
__asm__ volatile(\
QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
QPEL_H264V_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, OP)\
QPEL_H264V_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, OP)\
QPEL_H264V_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, OP)\
QPEL_H264V_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, OP)\
QPEL_H264V_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, OP)\
QPEL_H264V_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, OP)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
: "memory"\
);\
}\
}\
01718 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01719 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
01720 }\
01721 static av_noinline void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01722 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
01723 OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
01724 }
01725
/* SSE2 first (vertical) pass of the 2D qpel lowpass: filters size+5 source
 * columns in 8-wide strips into the int16_t tmp buffer. tmp rows are 48
 * bytes (24 words) apart — see the n*48 offsets. The second asm block
 * continues the rotated xmm-register state of the first, so the two
 * statements must stay adjacent.
 * NOTE(review): tmpStride is not referenced; tmp pitch is hardcoded to 48. */
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){
    int w = (size+8)>>3;
    src -= 2*srcStride+2;
    while(w--){
        __asm__ volatile(
            "pxor %%xmm7, %%xmm7 \n\t"
            "movq (%0), %%xmm0 \n\t"
            "add %2, %0 \n\t"
            "movq (%0), %%xmm1 \n\t"
            "add %2, %0 \n\t"
            "movq (%0), %%xmm2 \n\t"
            "add %2, %0 \n\t"
            "movq (%0), %%xmm3 \n\t"
            "add %2, %0 \n\t"
            "movq (%0), %%xmm4 \n\t"
            "add %2, %0 \n\t"
            "punpcklbw %%xmm7, %%xmm0 \n\t"
            "punpcklbw %%xmm7, %%xmm1 \n\t"
            "punpcklbw %%xmm7, %%xmm2 \n\t"
            "punpcklbw %%xmm7, %%xmm3 \n\t"
            "punpcklbw %%xmm7, %%xmm4 \n\t"
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 0*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 1*48)
            QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 2*48)
            QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 3*48)
            QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 4*48)
            QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 5*48)
            QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 6*48)
            QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 7*48)
            : "+a"(src)
            : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
            : "memory"
        );
        if(size==16){
            /* eight more rows, continuing the rotated-register sequence */
            __asm__ volatile(
                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1,  8*48)
                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2,  9*48)
                QPEL_H264HV_XMM(%%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, 10*48)
                QPEL_H264HV_XMM(%%xmm5, %%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, 11*48)
                QPEL_H264HV_XMM(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, 12*48)
                QPEL_H264HV_XMM(%%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, 13*48)
                QPEL_H264HV_XMM(%%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, 14*48)
                QPEL_H264HV_XMM(%%xmm3, %%xmm4, %%xmm5, %%xmm0, %%xmm1, %%xmm2, 15*48)
                : "+a"(src)
                : "c"(tmp), "S"((x86_reg)srcStride), "m"(ff_pw_5), "m"(ff_pw_16)
                : "memory"
            );
        }
        /* next 8-wide strip */
        tmp += 8;
        src += 8 - (size+5)*srcStride;
    }
}
01778
/* Second (horizontal) pass of the SSE2 2D qpel lowpass. Reads int16_t tmp */\
/* rows (48-byte pitch, hence "add $48"), applies the 6-tap combine via the */\
/* shift/sub identity (no multiplies), final rounding >>6, clips and stores */\
/* via OP. The size==16 branch builds the shifted tap windows with SSSE3 */\
/* palignr from three 16-byte loads per row; the 8-wide branch needs two. */\
#define QPEL_H264_HV2_XMM(OPNAME, OP, MMX)\
static av_always_inline void OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
int h = size;\
if(size == 16){\
__asm__ volatile(\
"1: \n\t"\
/* row words 0..23 across xmm7(lo)/xmm5(mid)/xmm4(hi) */\
"movdqa 32(%0), %%xmm4 \n\t"\
"movdqa 16(%0), %%xmm5 \n\t"\
"movdqa (%0), %%xmm7 \n\t"\
"movdqa %%xmm4, %%xmm3 \n\t"\
"movdqa %%xmm4, %%xmm2 \n\t"\
"movdqa %%xmm4, %%xmm1 \n\t"\
"movdqa %%xmm4, %%xmm0 \n\t"\
/* tap windows for the upper 8 output pixels */\
"palignr $10, %%xmm5, %%xmm0 \n\t"\
"palignr $8, %%xmm5, %%xmm1 \n\t"\
"palignr $6, %%xmm5, %%xmm2 \n\t"\
"palignr $4, %%xmm5, %%xmm3 \n\t"\
"palignr $2, %%xmm5, %%xmm4 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
/* tap windows for the lower 8 output pixels */\
"movdqa %%xmm5, %%xmm6 \n\t"\
"movdqa %%xmm5, %%xmm4 \n\t"\
"movdqa %%xmm5, %%xmm3 \n\t"\
"palignr $8, %%xmm7, %%xmm4 \n\t"\
"palignr $2, %%xmm7, %%xmm6 \n\t"\
"palignr $10, %%xmm7, %%xmm3 \n\t"\
"paddw %%xmm6, %%xmm4 \n\t"\
"movdqa %%xmm5, %%xmm6 \n\t"\
"palignr $6, %%xmm7, %%xmm5 \n\t"\
"palignr $4, %%xmm7, %%xmm6 \n\t"\
"paddw %%xmm7, %%xmm3 \n\t"\
"paddw %%xmm6, %%xmm5 \n\t"\
\
/* 6-tap combine without multiplies, then >>6, clip, store */\
"psubw %%xmm1, %%xmm0 \n\t"\
"psubw %%xmm4, %%xmm3 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"psraw $2, %%xmm3 \n\t"\
"psubw %%xmm1, %%xmm0 \n\t"\
"psubw %%xmm4, %%xmm3 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"paddw %%xmm5, %%xmm3 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"psraw $2, %%xmm3 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"paddw %%xmm5, %%xmm3 \n\t"\
"psraw $6, %%xmm0 \n\t"\
"psraw $6, %%xmm3 \n\t"\
"packuswb %%xmm0, %%xmm3 \n\t"\
OP(%%xmm3, (%1), %%xmm7, dqa)\
"add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: "memory"\
);\
}else{\
__asm__ volatile(\
"1: \n\t"\
"movdqa 16(%0), %%xmm1 \n\t"\
"movdqa (%0), %%xmm0 \n\t"\
"movdqa %%xmm1, %%xmm2 \n\t"\
"movdqa %%xmm1, %%xmm3 \n\t"\
"movdqa %%xmm1, %%xmm4 \n\t"\
"movdqa %%xmm1, %%xmm5 \n\t"\
"palignr $10, %%xmm0, %%xmm5 \n\t"\
"palignr $8, %%xmm0, %%xmm4 \n\t"\
"palignr $6, %%xmm0, %%xmm3 \n\t"\
"palignr $4, %%xmm0, %%xmm2 \n\t"\
"palignr $2, %%xmm0, %%xmm1 \n\t"\
"paddw %%xmm5, %%xmm0 \n\t"\
"paddw %%xmm4, %%xmm1 \n\t"\
"paddw %%xmm3, %%xmm2 \n\t"\
"psubw %%xmm1, %%xmm0 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"psubw %%xmm1, %%xmm0 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"psraw $2, %%xmm0 \n\t"\
"paddw %%xmm2, %%xmm0 \n\t"\
"psraw $6, %%xmm0 \n\t"\
"packuswb %%xmm0, %%xmm0 \n\t"\
OP(%%xmm0, (%1), %%xmm7, q)\
"add $48, %0 \n\t"\
"add %3, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(tmp), "+c"(dst), "+g"(h)\
: "S"((x86_reg)dstStride)\
: "memory"\
);\
}\
}
01873
/* Combined horizontal+vertical (centre, "hv") lowpass filter, XMM flavours.
 * The shared hv1 pass (SSE2) writes a 16-bit intermediate into tmp; the
 * per-flavour hv2 pass then filters that intermediate into dst.  The 8- and
 * 16-wide entry points below just fix the size argument of the 8-or-16 core. */
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_noinline void OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
    put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
    OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\
01885
/* The SSE2/SSSE3 code paths have no dedicated implementations for these
 * helpers; alias the names expected by the H264_MC_* generators onto the
 * existing MMX2 (or SSE2) versions. */

/* l2 averaging helpers (average two sources into dst) */
#define put_pixels8_l2_sse2 put_pixels8_l2_mmx2
#define avg_pixels8_l2_sse2 avg_pixels8_l2_mmx2
#define put_pixels16_l2_sse2 put_pixels16_l2_mmx2
#define avg_pixels16_l2_sse2 avg_pixels16_l2_mmx2
#define put_pixels8_l2_ssse3 put_pixels8_l2_mmx2
#define avg_pixels8_l2_ssse3 avg_pixels8_l2_mmx2
#define put_pixels16_l2_ssse3 put_pixels16_l2_mmx2
#define avg_pixels16_l2_ssse3 avg_pixels16_l2_mmx2

/* l2 averaging with a >>5 on the 16-bit first source (used by mc12/mc32) */
#define put_pixels8_l2_shift5_sse2 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_sse2 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_sse2 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_sse2 avg_pixels16_l2_shift5_mmx2
#define put_pixels8_l2_shift5_ssse3 put_pixels8_l2_shift5_mmx2
#define avg_pixels8_l2_shift5_ssse3 avg_pixels8_l2_shift5_mmx2
#define put_pixels16_l2_shift5_ssse3 put_pixels16_l2_shift5_mmx2
#define avg_pixels16_l2_shift5_ssse3 avg_pixels16_l2_shift5_mmx2

/* horizontal lowpass + l2 average */
#define put_h264_qpel8_h_lowpass_l2_sse2 put_h264_qpel8_h_lowpass_l2_mmx2
#define avg_h264_qpel8_h_lowpass_l2_sse2 avg_h264_qpel8_h_lowpass_l2_mmx2
#define put_h264_qpel16_h_lowpass_l2_sse2 put_h264_qpel16_h_lowpass_l2_mmx2
#define avg_h264_qpel16_h_lowpass_l2_sse2 avg_h264_qpel16_h_lowpass_l2_mmx2

/* vertical lowpass: SSSE3 reuses the SSE2 version unchanged */
#define put_h264_qpel8_v_lowpass_ssse3 put_h264_qpel8_v_lowpass_sse2
#define avg_h264_qpel8_v_lowpass_ssse3 avg_h264_qpel8_v_lowpass_sse2
#define put_h264_qpel16_v_lowpass_ssse3 put_h264_qpel16_v_lowpass_sse2
#define avg_h264_qpel16_v_lowpass_ssse3 avg_h264_qpel16_v_lowpass_sse2

/* second hv pass: SSE2 reuses the MMX2 version */
#define put_h264_qpel8or16_hv2_lowpass_sse2 put_h264_qpel8or16_hv2_lowpass_mmx2
#define avg_h264_qpel8or16_hv2_lowpass_sse2 avg_h264_qpel8or16_hv2_lowpass_mmx2
01916
/* Expand all four groups of quarter-pel motion-compensation functions
 * (full-pel copy, vertical, horizontal, and mixed h+v positions) for one
 * OPNAME (put_/avg_), block SIZE, CPU flavour MMX, and buffer ALIGNment. */
#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
01922
/* mc00 = integer-pel position: plain 16x16 copy (put) or average (avg). */
static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_sse2(dst, src, stride, 16);
}
/* 8x8 full-pel copy is not worth an SSE2 version; reuse MMX2. */
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmx2
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmx2
01931
/* Full-pel (mc00) position: a straight copy/average of the block. */
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\
01936
/* Purely horizontal sub-pel positions:
 *   mc20: horizontal half-pel, 6-tap lowpass only.
 *   mc10/mc30: quarter-pel, lowpass averaged (l2) with the left/right
 *   full-pel neighbour (src / src+1). */
#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\
01949
/* Purely vertical sub-pel positions:
 *   mc02: vertical half-pel, 6-tap lowpass only.
 *   mc01/mc03: quarter-pel — the vertical lowpass result is written to an
 *   on-stack temp, then averaged with the upper/lower full-pel row
 *   (src / src+stride). */
#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\
01966
/* Mixed horizontal+vertical sub-pel positions.
 *   mc11/mc31/mc13/mc33: vertical lowpass into a temp, then horizontal
 *     lowpass averaged (l2) with it; the src+1 / src+stride offsets select
 *     the quadrant.
 *   mc22: centre position, full h+v (hv) lowpass.
 *   mc21/mc23/mc12/mc32: share one on-stack buffer — the first SIZE*SIZE
 *     bytes hold the 8-bit hv result (halfHV), the rest holds the 16-bit
 *     intermediate of the vertical pass (halfV).  mc12/mc32 average halfHV
 *     with a column of halfV (offset +2 / +3) via the l2_shift5 helpers.
 * Fix vs. previous revision: the alignment asserts cast the pointer through
 * intptr_t instead of int — casting a 64-bit pointer to int is
 * implementation-defined and truncates (CERT INT36-C). */
#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint16_t, temp[SIZE*(SIZE<8?12:24)]);\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((intptr_t)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((intptr_t)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((intptr_t)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED(ALIGN, uint8_t, temp[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE]);\
    uint8_t * const halfHV= temp;\
    int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
    assert(((intptr_t)temp & 7) == 0);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_shift5_ ## MMX(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\
02032
/* Instantiate put_/avg_ MC sets for 4x4, 8x8 and 16x16 blocks with 8-byte
 * buffer alignment (MMX-register-width code paths). */
#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)\
02040
/* Instantiate one MC group (QPEL is one of the H264_MC_* generators) for
 * 8x8 and 16x16 blocks with 16-byte alignment (XMM code paths). */
#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)\
02046
02047
/* "avg" store hooks for the qpel templates: load the existing destination
 * pixels into temp, byte-average them into the computed result a, and store
 * back to b.  pavgusb is the 3DNow! averaging instruction, pavgb the
 * MMX2/SSE one; "size" selects the mov width (q/dqa/...). */
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
02056
/* Instantiate the qpel lowpass templates.  PAVGB selects the byte-average
 * instruction the templates emit: pavgusb for 3DNow!, pavgb for MMX2+. */
#define PAVGB "pavgusb"
QPEL_H264(put_, PUT_OP, 3dnow)
QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
#undef PAVGB
#define PAVGB "pavgb"
QPEL_H264(put_, PUT_OP, mmx2)
QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
QPEL_H264_V_XMM(put_, PUT_OP, sse2)
QPEL_H264_V_XMM(avg_, AVG_MMX2_OP, sse2)
QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, sse2)
#if HAVE_SSSE3
QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_, AVG_MMX2_OP, ssse3)
QPEL_H264_HV2_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV2_XMM(avg_, AVG_MMX2_OP, ssse3)
QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_, AVG_MMX2_OP, ssse3)
#endif
#undef PAVGB

/* Generate the complete put_/avg_ MC function sets per CPU flavour. */
H264_MC_4816(3dnow)
H264_MC_4816(mmx2)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
#if HAVE_SSSE3
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
#endif
02086
02087
/* Rounding constants (4 packed 16-bit words each) handed to the generic
 * chroma MC templates.  The *_rnd wrappers below pass h264_rnd_reg (first
 * pair: 32, 4); the *_nornd wrappers pass h264_rnd_reg+2 (28, 3). */
DECLARE_ALIGNED_8(static const uint64_t, h264_rnd_reg[4]) = {
    0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL
};
02091
/* Instantiate the MMX chroma MC template for "put": the OP hooks expand to
 * nothing (plain store, no averaging with the destination). */
#define H264_CHROMA_OP(S,D)
#define H264_CHROMA_OP4(S,D,T)
#define H264_CHROMA_MC8_TMPL put_h264_chroma_generic_mc8_mmx
#define H264_CHROMA_MC4_TMPL put_h264_chroma_generic_mc4_mmx
#define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
#include "dsputil_h264_template_mmx.c"

/* Thin wrappers binding the rounding register set (see h264_rnd_reg). */
static void put_h264_chroma_mc8_mmx_rnd(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
    put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg);
}
static void put_h264_chroma_mc8_mmx_nornd(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
    put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg+2);
}
static void put_h264_chroma_mc4_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
    put_h264_chroma_generic_mc4_mmx(dst, src, stride, h, x, y, h264_rnd_reg);
}

#undef H264_CHROMA_OP
#undef H264_CHROMA_OP4
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC2_TMPL
#undef H264_CHROMA_MC8_MV0
02119
/* Instantiate the MMX chroma MC template for "avg" (MMX2): the OP hooks
 * average the computed result with the existing destination via pavgb. */
#define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
                               "pavgb " #T ", " #D " \n\t"
#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_mmx2
#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_mmx2
#define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
#include "dsputil_h264_template_mmx.c"
/* Wrappers binding the default rounding register set. */
static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
    avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, h264_rnd_reg);
}
static void avg_h264_chroma_mc4_mmx2(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
    avg_h264_chroma_generic_mc4_mmx2(dst, src, stride, h, x, y, h264_rnd_reg);
}
#undef H264_CHROMA_OP
#undef H264_CHROMA_OP4
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC2_TMPL
#undef H264_CHROMA_MC8_MV0
02142
/* Instantiate the MMX chroma MC template for "avg" (3DNow!): same as the
 * MMX2 version but averaging with pavgusb. */
#define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t"
#define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
                               "pavgusb " #T ", " #D " \n\t"
#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_3dnow
#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_3dnow
#define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow
#include "dsputil_h264_template_mmx.c"
/* Wrappers binding the default rounding register set. */
static void avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
    avg_h264_chroma_generic_mc8_3dnow(dst, src, stride, h, x, y, h264_rnd_reg);
}
static void avg_h264_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
    avg_h264_chroma_generic_mc4_3dnow(dst, src, stride, h, x, y, h264_rnd_reg);
}
#undef H264_CHROMA_OP
#undef H264_CHROMA_OP4
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC8_MV0
02163
#if HAVE_SSSE3
/* SSSE3 chroma template: AVG_OP(X) toggles the destination-averaging code
 * (empty for "put", pass-through for "avg").  The SSSE3 template takes the
 * rounding mode as a trailing int argument (1 = rnd, 0 = no-rnd) instead of
 * a register table. */
#define AVG_OP(X)
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_ssse3
#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_ssse3
#define H264_CHROMA_MC8_MV0 put_pixels8_mmx
#include "dsputil_h264_template_ssse3.c"
static void put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
    put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
}
static void put_h264_chroma_mc8_ssse3_nornd(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
    put_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 0);
}

#undef AVG_OP
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC8_MV0
/* Second instantiation: "avg" flavour with the averaging code enabled. */
#define AVG_OP(X) X
#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_ssse3
#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_ssse3
#define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
#include "dsputil_h264_template_ssse3.c"
static void avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y)
{
    avg_h264_chroma_mc8_ssse3(dst, src, stride, h, x, y, 1);
}
#undef AVG_OP
#undef H264_CHROMA_MC8_TMPL
#undef H264_CHROMA_MC4_TMPL
#undef H264_CHROMA_MC8_MV0
#endif
02199
02200
02201
02202
/* Unidirectional H.264 weighted prediction, in place on dst (MMX2):
 * dst[i] = clip(((dst[i]*weight) >> log2_denom) + offset), rounded.
 * The rounding term is folded into the offset up front so the inner loop is
 * a single multiply / saturating-add / arithmetic-shift per group of four
 * pixels; two rows are handled per iteration. */
static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
{
    int x, y;
    /* (p*w + (offset<<d) + (1<<d)/2) >> d  ==  ((p*w) >> d) + offset,
     * with round-to-nearest. */
    offset <<= log2_denom;
    offset += (1 << log2_denom) >> 1;
    /* Broadcast the per-block constants once:
     * mm4 = weight replicated to all 4 words (pshufw $0),
     * mm5 = rounding offset replicated, mm6 = shift count, mm7 = 0.
     * NOTE(review): the loop below relies on these MMX registers surviving
     * across separate __asm__ statements — the established convention in
     * this file, but not something the compiler guarantees. */
    __asm__ volatile(
        "movd    %0, %%mm4        \n\t"
        "movd    %1, %%mm5        \n\t"
        "movd    %2, %%mm6        \n\t"
        "pshufw  $0, %%mm4, %%mm4 \n\t"
        "pshufw  $0, %%mm5, %%mm5 \n\t"
        "pxor    %%mm7, %%mm7     \n\t"
        :: "g"(weight), "g"(offset), "g"(log2_denom)
    );
    for(y=0; y<h; y+=2){
        for(x=0; x<w; x+=4){
            /* Load 4 pixels from each of two rows, widen bytes to words,
             * scale, add the pre-rounded offset (saturating), shift back
             * down and pack with unsigned saturation (the clip). */
            __asm__ volatile(
                "movd      %0,    %%mm0 \n\t"
                "movd      %1,    %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "pmullw    %%mm4, %%mm0 \n\t"
                "pmullw    %%mm4, %%mm1 \n\t"
                "paddsw    %%mm5, %%mm0 \n\t"
                "paddsw    %%mm5, %%mm1 \n\t"
                "psraw     %%mm6, %%mm0 \n\t"
                "psraw     %%mm6, %%mm1 \n\t"
                "packuswb  %%mm7, %%mm0 \n\t"
                "packuswb  %%mm7, %%mm1 \n\t"
                "movd      %%mm0, %0    \n\t"
                "movd      %%mm1, %1    \n\t"
                : "+m"(*(uint32_t*)(dst+x)),
                  "+m"(*(uint32_t*)(dst+x+stride))
            );
        }
        dst += 2*stride;
    }
}
02241
/* Bidirectional (bipredictive) H.264 weighted prediction (MMX2):
 * dst[i] = clip((dst[i]*weightd + src[i]*weights + offset') >> (log2_denom+1))
 * where offset' = ((offset+1)|1) << log2_denom — the |1 produces the odd
 * rounding constant the biprediction formula requires.  Processes four
 * pixels of one row per inner iteration. */
static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset, int w, int h)
{
    int x, y;
    offset = ((offset + 1) | 1) << log2_denom;
    /* Broadcast constants: mm3 = weightd, mm4 = weights, mm5 = offset'
     * (each replicated to 4 words), mm6 = shift count log2_denom+1,
     * mm7 = 0.  NOTE(review): as in ff_h264_weight_WxH_mmx2, these
     * registers are assumed to persist across the following __asm__
     * statements. */
    __asm__ volatile(
        "movd    %0, %%mm3        \n\t"
        "movd    %1, %%mm4        \n\t"
        "movd    %2, %%mm5        \n\t"
        "movd    %3, %%mm6        \n\t"
        "pshufw  $0, %%mm3, %%mm3 \n\t"
        "pshufw  $0, %%mm4, %%mm4 \n\t"
        "pshufw  $0, %%mm5, %%mm5 \n\t"
        "pxor    %%mm7, %%mm7     \n\t"
        :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
    );
    for(y=0; y<h; y++){
        for(x=0; x<w; x+=4){
            /* Widen 4 dst and 4 src bytes to words, apply the two weights,
             * sum with the rounding offset (saturating adds), shift down
             * and pack back to bytes with unsigned saturation. */
            __asm__ volatile(
                "movd      %0,    %%mm0 \n\t"
                "movd      %1,    %%mm1 \n\t"
                "punpcklbw %%mm7, %%mm0 \n\t"
                "punpcklbw %%mm7, %%mm1 \n\t"
                "pmullw    %%mm3, %%mm0 \n\t"
                "pmullw    %%mm4, %%mm1 \n\t"
                "paddsw    %%mm1, %%mm0 \n\t"
                "paddsw    %%mm5, %%mm0 \n\t"
                "psraw     %%mm6, %%mm0 \n\t"
                "packuswb  %%mm0, %%mm0 \n\t"
                "movd      %%mm0, %0    \n\t"
                : "+m"(*(uint32_t*)(dst+x))
                : "m"(*(uint32_t*)(src+x))
            );
        }
        src += stride;
        dst += stride;
    }
}
02279
/* Generate fixed-size entry points for every H.264 partition shape by
 * binding W and H of the generic WxH weight/biweight kernels above. */
#define H264_WEIGHT(W,H) \
static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offset, W, H); \
} \
static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
    ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16, 8)
H264_WEIGHT( 8,16)
H264_WEIGHT( 8, 8)
H264_WEIGHT( 8, 4)
H264_WEIGHT( 4, 8)
H264_WEIGHT( 4, 4)
H264_WEIGHT( 4, 2)
02296