FFmpeg: libavcodec/x86/fmtconvert_mmx.c

00001 /*
00002  * Format Conversion Utils
00003  * Copyright (c) 2000, 2001 Fabrice Bellard
00004  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
00005  *
00006  * This file is part of FFmpeg.
00007  *
00008  * FFmpeg is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU Lesser General Public
00010  * License as published by the Free Software Foundation; either
00011  * version 2.1 of the License, or (at your option) any later version.
00012  *
00013  * FFmpeg is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  * Lesser General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU Lesser General Public
00019  * License along with FFmpeg; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00021  *
00022  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
00023  */
00024 
00025 #include "libavutil/cpu.h"
00026 #include "libavutil/x86_cpu.h"
00027 #include "libavcodec/fmtconvert.h"
00028 
00029 static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
00030 {
00031     x86_reg i = -4*len;
00032     __asm__ volatile(
00033         "movss  %3, %%xmm4 \n"
00034         "shufps $0, %%xmm4, %%xmm4 \n"
00035         "1: \n"
00036         "cvtpi2ps   (%2,%0), %%xmm0 \n"
00037         "cvtpi2ps  8(%2,%0), %%xmm1 \n"
00038         "cvtpi2ps 16(%2,%0), %%xmm2 \n"
00039         "cvtpi2ps 24(%2,%0), %%xmm3 \n"
00040         "movlhps  %%xmm1,    %%xmm0 \n"
00041         "movlhps  %%xmm3,    %%xmm2 \n"
00042         "mulps    %%xmm4,    %%xmm0 \n"
00043         "mulps    %%xmm4,    %%xmm2 \n"
00044         "movaps   %%xmm0,   (%1,%0) \n"
00045         "movaps   %%xmm2, 16(%1,%0) \n"
00046         "add $32, %0 \n"
00047         "jl 1b \n"
00048         :"+r"(i)
00049         :"r"(dst+len), "r"(src+len), "m"(mul)
00050     );
00051 }
00052 
00053 static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
00054 {
00055     x86_reg i = -4*len;
00056     __asm__ volatile(
00057         "movss  %3, %%xmm4 \n"
00058         "shufps $0, %%xmm4, %%xmm4 \n"
00059         "1: \n"
00060         "cvtdq2ps   (%2,%0), %%xmm0 \n"
00061         "cvtdq2ps 16(%2,%0), %%xmm1 \n"
00062         "mulps    %%xmm4,    %%xmm0 \n"
00063         "mulps    %%xmm4,    %%xmm1 \n"
00064         "movaps   %%xmm0,   (%1,%0) \n"
00065         "movaps   %%xmm1, 16(%1,%0) \n"
00066         "add $32, %0 \n"
00067         "jl 1b \n"
00068         :"+r"(i)
00069         :"r"(dst+len), "r"(src+len), "m"(mul)
00070     );
00071 }
00072 
00073 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
00074     x86_reg reglen = len;
00075     // not bit-exact: pf2id uses different rounding than C and SSE
00076     __asm__ volatile(
00077         "add        %0          , %0        \n\t"
00078         "lea         (%2,%0,2)  , %2        \n\t"
00079         "add        %0          , %1        \n\t"
00080         "neg        %0                      \n\t"
00081         "1:                                 \n\t"
00082         "pf2id       (%2,%0,2)  , %%mm0     \n\t"
00083         "pf2id      8(%2,%0,2)  , %%mm1     \n\t"
00084         "pf2id     16(%2,%0,2)  , %%mm2     \n\t"
00085         "pf2id     24(%2,%0,2)  , %%mm3     \n\t"
00086         "packssdw   %%mm1       , %%mm0     \n\t"
00087         "packssdw   %%mm3       , %%mm2     \n\t"
00088         "movq       %%mm0       ,  (%1,%0)  \n\t"
00089         "movq       %%mm2       , 8(%1,%0)  \n\t"
00090         "add        $16         , %0        \n\t"
00091         " js 1b                             \n\t"
00092         "femms                              \n\t"
00093         :"+r"(reglen), "+r"(dst), "+r"(src)
00094     );
00095 }
00096 
00097 static void float_to_int16_sse(int16_t *dst, const float *src, long len){
00098     x86_reg reglen = len;
00099     __asm__ volatile(
00100         "add        %0          , %0        \n\t"
00101         "lea         (%2,%0,2)  , %2        \n\t"
00102         "add        %0          , %1        \n\t"
00103         "neg        %0                      \n\t"
00104         "1:                                 \n\t"
00105         "cvtps2pi    (%2,%0,2)  , %%mm0     \n\t"
00106         "cvtps2pi   8(%2,%0,2)  , %%mm1     \n\t"
00107         "cvtps2pi  16(%2,%0,2)  , %%mm2     \n\t"
00108         "cvtps2pi  24(%2,%0,2)  , %%mm3     \n\t"
00109         "packssdw   %%mm1       , %%mm0     \n\t"
00110         "packssdw   %%mm3       , %%mm2     \n\t"
00111         "movq       %%mm0       ,  (%1,%0)  \n\t"
00112         "movq       %%mm2       , 8(%1,%0)  \n\t"
00113         "add        $16         , %0        \n\t"
00114         " js 1b                             \n\t"
00115         "emms                               \n\t"
00116         :"+r"(reglen), "+r"(dst), "+r"(src)
00117     );
00118 }
00119 
00120 static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
00121     x86_reg reglen = len;
00122     __asm__ volatile(
00123         "add        %0          , %0        \n\t"
00124         "lea         (%2,%0,2)  , %2        \n\t"
00125         "add        %0          , %1        \n\t"
00126         "neg        %0                      \n\t"
00127         "1:                                 \n\t"
00128         "cvtps2dq    (%2,%0,2)  , %%xmm0    \n\t"
00129         "cvtps2dq  16(%2,%0,2)  , %%xmm1    \n\t"
00130         "packssdw   %%xmm1      , %%xmm0    \n\t"
00131         "movdqa     %%xmm0      ,  (%1,%0)  \n\t"
00132         "add        $16         , %0        \n\t"
00133         " js 1b                             \n\t"
00134         :"+r"(reglen), "+r"(dst), "+r"(src)
00135     );
00136 }
00137 
/* Six-channel float -> int16 interleaving routines. They are only declared
 * here; their definitions are not part of this file. */
00138 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
00139 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
00140 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
00141 
/* Without yasm, redirect the six-channel entry points to the generic
 * float_to_int16_interleave_misc_<cpu>() helpers with channels = 6.
 * The 3dn2 name falls back to the plain 3dnow helper. */
00142 #if !HAVE_YASM
00143 #define ff_float_to_int16_interleave6_sse(a,b,c)   float_to_int16_interleave_misc_sse(a,b,c,6)
00144 #define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
00145 #define ff_float_to_int16_interleave6_3dn2(a,b,c)  float_to_int16_interleave_misc_3dnow(a,b,c,6)
00146 #endif
/* There is no separate SSE2 six-channel routine: the sse2 name is an alias
 * for the SSE one (or for its fallback macro above). */
00147 #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
00148 
/**
 * Generate the float -> int16 interleaving functions for one CPU flavour.
 *
 * Expands to two functions:
 *  - float_to_int16_interleave_misc_<cpu>(): generic path for any channel
 *    count. Each channel is converted into a temporary buffer with
 *    float_to_int16_<cpu>() and then scattered into dst with a stride of
 *    `channels`. The temporary is a variable-length array of len elements,
 *    so stack usage grows with len.
 *  - float_to_int16_interleave_<cpu>(): dispatcher. One channel is a plain
 *    conversion, two channels use the inline asm kernel passed in `body`,
 *    six channels call ff_float_to_int16_interleave6_<cpu>(), anything else
 *    goes through the generic path.
 *
 * @param cpu  suffix selecting the float_to_int16_<cpu> and
 *             ff_float_to_int16_interleave6_<cpu> routines to use
 * @param body asm string for the two-channel loop. It sees %0 as a negative
 *             byte index counting up to zero, %1 as the end of dst and
 *             %2/%3 as the ends of the two source channels.
 */
#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
    int i,j,c;\
    for(c=0; c<channels; c++){\
        float_to_int16_##cpu(tmp, src[c], len);\
        for(i=0, j=c; i<len; i++, j+=channels)\
            dst[j] = tmp[i];\
    }\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
    if(channels==1)\
        float_to_int16_##cpu(dst, src[0], len);\
    else if(channels==2){\
        x86_reg reglen = len; \
        const float *src0 = src[0];\
        const float *src1 = src[1];\
        __asm__ volatile(\
            /* reglen = 4*len bytes; move all three pointers to their ends */\
            /* and negate the index so the loop counts up towards zero.    */\
            "shl $2, %0 \n"\
            "add %0, %1 \n"\
            "add %0, %2 \n"\
            "add %0, %3 \n"\
            "neg %0 \n"\
            body\
            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
            :\
            /* the kernel reads src0/src1 and writes dst through register */\
            /* operands, so the compiler must be told that memory changes */\
            :"memory"\
        );\
    }else if(channels==6){\
        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
    }else\
        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
}
00182 
/* Instantiate the interleaving functions for 3DNow!.
 * The string is the two-channel loop: per iteration it converts 4 floats
 * from each source channel with pf2id (%2 = channel 0, %3 = channel 1),
 * packs each channel to int16 with signed saturation, interleaves the two
 * channels word by word (punpcklwd/punpckhwd) and stores 16 bytes to dst
 * (%1). %0 is the negative byte index set up by the macro; the loop repeats
 * while it is still negative after adding 16. femms runs once at the end. */
00183 FLOAT_TO_INT16_INTERLEAVE(3dnow,
00184     "1:                         \n"
00185     "pf2id     (%2,%0), %%mm0   \n"
00186     "pf2id    8(%2,%0), %%mm1   \n"
00187     "pf2id     (%3,%0), %%mm2   \n"
00188     "pf2id    8(%3,%0), %%mm3   \n"
00189     "packssdw    %%mm1, %%mm0   \n"
00190     "packssdw    %%mm3, %%mm2   \n"
00191     "movq        %%mm0, %%mm1   \n"
00192     "punpcklwd   %%mm2, %%mm0   \n"
00193     "punpckhwd   %%mm2, %%mm1   \n"
00194     "movq        %%mm0,  (%1,%0)\n"
00195     "movq        %%mm1, 8(%1,%0)\n"
00196     "add $16, %0                \n"
00197     "js 1b                      \n"
00198     "femms                      \n"
00199 )
00200 
/* Instantiate the interleaving functions for SSE.
 * The string is the two-channel loop: per iteration it converts 4 floats
 * from each source channel with cvtps2pi (%2 = channel 0, %3 = channel 1),
 * packs each channel to int16 with signed saturation, interleaves the two
 * channels word by word (punpcklwd/punpckhwd) and stores 16 bytes to dst
 * (%1). %0 is the negative byte index set up by the macro; the loop repeats
 * while it is still negative after adding 16. emms runs once at the end. */
00201 FLOAT_TO_INT16_INTERLEAVE(sse,
00202     "1:                         \n"
00203     "cvtps2pi  (%2,%0), %%mm0   \n"
00204     "cvtps2pi 8(%2,%0), %%mm1   \n"
00205     "cvtps2pi  (%3,%0), %%mm2   \n"
00206     "cvtps2pi 8(%3,%0), %%mm3   \n"
00207     "packssdw    %%mm1, %%mm0   \n"
00208     "packssdw    %%mm3, %%mm2   \n"
00209     "movq        %%mm0, %%mm1   \n"
00210     "punpcklwd   %%mm2, %%mm0   \n"
00211     "punpckhwd   %%mm2, %%mm1   \n"
00212     "movq        %%mm0,  (%1,%0)\n"
00213     "movq        %%mm1, 8(%1,%0)\n"
00214     "add $16, %0                \n"
00215     "js 1b                      \n"
00216     "emms                       \n"
00217 )
00218 
/* Instantiate the interleaving functions for SSE2.
 * The string is the two-channel loop: per iteration it converts 4 floats
 * from each source channel with cvtps2dq (%2 = channel 0, %3 = channel 1),
 * packs both into one register of eight saturated int16 values, copies the
 * channel-1 half down with movhlps, interleaves the two halves word by word
 * with punpcklwd and writes the 16-byte result to dst (%1) with movdqa.
 * %0 is the negative byte index set up by the macro; the loop repeats while
 * it is still negative after adding 16. Only xmm registers are used, so
 * there is no emms/femms at the end. */
00219 FLOAT_TO_INT16_INTERLEAVE(sse2,
00220     "1:                         \n"
00221     "cvtps2dq  (%2,%0), %%xmm0  \n"
00222     "cvtps2dq  (%3,%0), %%xmm1  \n"
00223     "packssdw   %%xmm1, %%xmm0  \n"
00224     "movhlps    %%xmm0, %%xmm1  \n"
00225     "punpcklwd  %%xmm1, %%xmm0  \n"
00226     "movdqa     %%xmm0, (%1,%0) \n"
00227     "add $16, %0                \n"
00228     "js 1b                      \n"
00229 )
00230 
/**
 * Float -> int16 interleave dispatcher for extended 3DNow!.
 *
 * Only the six-channel case has a dedicated 3dn2 routine; every other
 * channel count is forwarded to the plain 3DNow! dispatcher.
 */
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
    if(channels != 6){
        float_to_int16_interleave_3dnow(dst, src, len, channels);
        return;
    }
    ff_float_to_int16_interleave6_3dn2(dst, src, len);
}
00237 
00238 #if HAVE_YASM
/* Float interleaving routines for exactly two channels (MMX and SSE
 * variants). Only declared here; presumably provided by the yasm-assembled
 * sources, since these declarations sit under HAVE_YASM. */
00239 void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
00240 void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);
00241 
/* Same, for exactly six channels. */
00242 void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len);
00243 void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len);
00244 
/**
 * Interleave planar float channels into dst (MMX dispatcher).
 *
 * Two and six channels are handled by the dedicated MMX routines; any other
 * channel count falls back to ff_float_interleave_c().
 */
static void float_interleave_mmx(float *dst, const float **src,
                                 unsigned int len, int channels)
{
    switch (channels) {
    case 2:
        ff_float_interleave2_mmx(dst, src, len);
        break;
    case 6:
        ff_float_interleave6_mmx(dst, src, len);
        break;
    default:
        ff_float_interleave_c(dst, src, len, channels);
        break;
    }
}
00255 
/**
 * Interleave planar float channels into dst (SSE dispatcher).
 *
 * Two and six channels are handled by the dedicated SSE routines; any other
 * channel count falls back to ff_float_interleave_c().
 */
static void float_interleave_sse(float *dst, const float **src,
                                 unsigned int len, int channels)
{
    switch (channels) {
    case 2:
        ff_float_interleave2_sse(dst, src, len);
        break;
    case 6:
        ff_float_interleave6_sse(dst, src, len);
        break;
    default:
        ff_float_interleave_c(dst, src, len, channels);
        break;
    }
}
00266 #endif
00267 
00268 void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
00269 {
00270     int mm_flags = av_get_cpu_flags();
00271 
00272     if (mm_flags & AV_CPU_FLAG_MMX) {
00273 #if HAVE_YASM
00274         c->float_interleave = float_interleave_mmx;
00275 #endif
00276 
00277         if(mm_flags & AV_CPU_FLAG_3DNOW){
00278             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
00279                 c->float_to_int16 = float_to_int16_3dnow;
00280                 c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
00281             }
00282         }
00283         if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
00284             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
00285                 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
00286             }
00287         }
00288         if(mm_flags & AV_CPU_FLAG_SSE){
00289             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
00290             c->float_to_int16 = float_to_int16_sse;
00291             c->float_to_int16_interleave = float_to_int16_interleave_sse;
00292 #if HAVE_YASM
00293             c->float_interleave = float_interleave_sse;
00294 #endif
00295         }
00296         if(mm_flags & AV_CPU_FLAG_SSE2){
00297             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
00298             c->float_to_int16 = float_to_int16_sse2;
00299             c->float_to_int16_interleave = float_to_int16_interleave_sse2;
00300         }
00301     }
00302 }
libavcodec/x86/fmtconvert_mmx.c