doxygen/trunk/x86_2mpegvideoencdsp__init_8c_source.html

/*

 * This file is part of FFmpeg.

 *

 * FFmpeg is free software; you can redistribute it and/or

 * modify it under the terms of the GNU Lesser General Public

 * License as published by the Free Software Foundation; either

 * version 2.1 of the License, or (at your option) any later version.

 *

 * FFmpeg is distributed in the hope that it will be useful,

 * but WITHOUT ANY WARRANTY; without even the implied warranty of

 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU

 * Lesser General Public License for more details.

 *

 * You should have received a copy of the GNU Lesser General Public

 * License along with FFmpeg; if not, write to the Free Software

 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA

 */


#include <stdint.h>


#include "libavutil/attributes.h"

#include "libavutil/avassert.h"

#include "libavutil/common.h"

#include "libavutil/cpu.h"

#include "libavutil/x86/asm.h"

#include "libavutil/x86/cpu.h"

#include "libavcodec/avcodec.h"

#include "libavcodec/mpegvideoencdsp.h"


void ff_mpv_denoise_dct_sse2(int16_t block[64], int dct_error_sum[64],

                             const uint16_t dct_offset[64]);

int ff_pix_sum16_sse2(const uint8_t *pix, ptrdiff_t line_size);

int ff_pix_sum16_xop(const uint8_t *pix, ptrdiff_t line_size);

int ff_pix_norm1_sse2(const uint8_t *pix, ptrdiff_t line_size);

void ff_add_8x8basis_ssse3(int16_t rem[64], const int16_t basis[64], int scale);


#if HAVE_INLINE_ASM

#if HAVE_SSSE3_INLINE

#define SCALE_OFFSET -1


#define MAX_ABS 512


static int try_8x8basis_ssse3(const int16_t rem[64], const int16_t weight[64], const int16_t basis[64], int scale)

{

    x86_reg i=0;


    av_assert2(FFABS(scale) < MAX_ABS);

    scale *= 1 << (16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT);


    __asm__ volatile(

        "pxor            %%xmm2, %%xmm2     \n\t"

        "movd                %4, %%xmm3     \n\t"

        "punpcklwd       %%xmm3, %%xmm3     \n\t"

        "pshufd      $0, %%xmm3, %%xmm3     \n\t"

        ".p2align 4                         \n\t"

        "1:                                 \n\t"

        "movdqa        (%1, %0), %%xmm0     \n\t"

        "movdqa      16(%1, %0), %%xmm1     \n\t"

        "pmulhrsw        %%xmm3, %%xmm0     \n\t"

        "pmulhrsw        %%xmm3, %%xmm1     \n\t"

        "paddw         (%2, %0), %%xmm0     \n\t"

        "paddw       16(%2, %0), %%xmm1     \n\t"

        "psraw               $6, %%xmm0     \n\t"

        "psraw               $6, %%xmm1     \n\t"

        "pmullw        (%3, %0), %%xmm0     \n\t"

        "pmullw      16(%3, %0), %%xmm1     \n\t"

        "pmaddwd         %%xmm0, %%xmm0     \n\t"

        "pmaddwd         %%xmm1, %%xmm1     \n\t"

        "paddd           %%xmm1, %%xmm0     \n\t"

        "psrld               $4, %%xmm0     \n\t"

        "paddd           %%xmm0, %%xmm2     \n\t"

        "add                $32, %0         \n\t"

        "cmp               $128, %0         \n\t" //FIXME optimize & bench

        " jb                 1b             \n\t"

        "pshufd   $0x0E, %%xmm2, %%xmm0     \n\t"

        "paddd           %%xmm0, %%xmm2     \n\t"

        "pshufd   $0x01, %%xmm2, %%xmm0     \n\t"

        "paddd           %%xmm0, %%xmm2     \n\t"

        "psrld               $2, %%xmm2     \n\t"

        "movd            %%xmm2, %0         \n\t"

        : "+r" (i)

        : "r"(basis), "r"(rem), "r"(weight), "g"(scale)

        XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3")

    );

    return i;

}

#endif /* HAVE_SSSE3_INLINE */


/* Draw the edges of width 'w' of an image of size width, height */

static void draw_edges_mmx(uint8_t *buf, ptrdiff_t wrap, int width, int height,

                           int w, int h, int sides)

{

    uint8_t *ptr, *last_line;

    int i;


    /* left and right */

    ptr = buf;

    if (w == 8) {

        __asm__ volatile (

            "1:                             \n\t"

            "movd            (%0), %%mm0    \n\t"

            "punpcklbw      %%mm0, %%mm0    \n\t"

            "punpcklwd      %%mm0, %%mm0    \n\t"

            "punpckldq      %%mm0, %%mm0    \n\t"

            "movq           %%mm0, -8(%0)   \n\t"

            "movq      -8(%0, %2), %%mm1    \n\t"

            "punpckhbw      %%mm1, %%mm1    \n\t"

            "punpckhwd      %%mm1, %%mm1    \n\t"

            "punpckhdq      %%mm1, %%mm1    \n\t"

            "movq           %%mm1, (%0, %2) \n\t"

            "add               %1, %0       \n\t"

            "cmp               %3, %0       \n\t"

            "jnz               1b           \n\t"

            : "+r" (ptr)

            : "r" ((x86_reg) wrap), "r" ((x86_reg) width),

              "r" (ptr + wrap * height));

    } else if (w == 16) {

        __asm__ volatile (

            "1:                                 \n\t"

            "movd            (%0), %%mm0        \n\t"

            "punpcklbw      %%mm0, %%mm0        \n\t"

            "punpcklwd      %%mm0, %%mm0        \n\t"

            "punpckldq      %%mm0, %%mm0        \n\t"

            "movq           %%mm0, -8(%0)       \n\t"

            "movq           %%mm0, -16(%0)      \n\t"

            "movq      -8(%0, %2), %%mm1        \n\t"

            "punpckhbw      %%mm1, %%mm1        \n\t"

            "punpckhwd      %%mm1, %%mm1        \n\t"

            "punpckhdq      %%mm1, %%mm1        \n\t"

            "movq           %%mm1,  (%0, %2)    \n\t"

            "movq           %%mm1, 8(%0, %2)    \n\t"

            "add               %1, %0           \n\t"

            "cmp               %3, %0           \n\t"

            "jnz               1b               \n\t"

            : "+r"(ptr)

            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)

            );

    } else {

        av_assert1(w == 4);

        __asm__ volatile (

            "1:                             \n\t"

            "movd            (%0), %%mm0    \n\t"

            "punpcklbw      %%mm0, %%mm0    \n\t"

            "punpcklwd      %%mm0, %%mm0    \n\t"

            "movd           %%mm0, -4(%0)   \n\t"

            "movd      -4(%0, %2), %%mm1    \n\t"

            "punpcklbw      %%mm1, %%mm1    \n\t"

            "punpckhwd      %%mm1, %%mm1    \n\t"

            "punpckhdq      %%mm1, %%mm1    \n\t"

            "movd           %%mm1, (%0, %2) \n\t"

            "add               %1, %0       \n\t"

            "cmp               %3, %0       \n\t"

            "jnz               1b           \n\t"

            : "+r" (ptr)

            : "r" ((x86_reg) wrap), "r" ((x86_reg) width),

              "r" (ptr + wrap * height));

    }


    /* top and bottom + corners */

    buf -= w;

    last_line = buf + (height - 1) * wrap;

    if (sides & EDGE_TOP)

        for (i = 0; i < h; i++)

            // top

            memcpy(buf - (i + 1) * wrap, buf, width + w + w);

    if (sides & EDGE_BOTTOM)

        for (i = 0; i < h; i++)

            // bottom

            memcpy(last_line + (i + 1) * wrap, last_line, width + w + w);

}


#endif /* HAVE_INLINE_ASM */


av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c,

                                         AVCodecContext *avctx)

{

    int cpu_flags = av_get_cpu_flags();


    if (EXTERNAL_SSE2(cpu_flags)) {

        c->denoise_dct = ff_mpv_denoise_dct_sse2;

        c->pix_sum     = ff_pix_sum16_sse2;

        c->pix_norm1   = ff_pix_norm1_sse2;

    }


    if (EXTERNAL_XOP(cpu_flags)) {

        c->pix_sum     = ff_pix_sum16_xop;

    }


#if HAVE_INLINE_ASM


    if (INLINE_MMX(cpu_flags)) {

        if (avctx->bits_per_raw_sample <= 8) {

            c->draw_edges = draw_edges_mmx;

        }

    }

#endif /* HAVE_INLINE_ASM */


    if (X86_SSSE3(cpu_flags)) {

#if HAVE_SSSE3_INLINE

        if (!(avctx->flags & AV_CODEC_FLAG_BITEXACT)) {

            c->try_8x8basis = try_8x8basis_ssse3;

        }

#endif /* HAVE_SSSE3_INLINE */

#if HAVE_SSSE3_EXTERNAL

        c->add_8x8basis = ff_add_8x8basis_ssse3;

#endif

    }


}