00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025 #include "libavutil/cpu.h"
00026 #include "libavutil/x86_cpu.h"
00027 #include "libavcodec/fmtconvert.h"
00028
00029 static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
00030 {
00031 x86_reg i = -4*len;
00032 __asm__ volatile(
00033 "movss %3, %%xmm4 \n"
00034 "shufps $0, %%xmm4, %%xmm4 \n"
00035 "1: \n"
00036 "cvtpi2ps (%2,%0), %%xmm0 \n"
00037 "cvtpi2ps 8(%2,%0), %%xmm1 \n"
00038 "cvtpi2ps 16(%2,%0), %%xmm2 \n"
00039 "cvtpi2ps 24(%2,%0), %%xmm3 \n"
00040 "movlhps %%xmm1, %%xmm0 \n"
00041 "movlhps %%xmm3, %%xmm2 \n"
00042 "mulps %%xmm4, %%xmm0 \n"
00043 "mulps %%xmm4, %%xmm2 \n"
00044 "movaps %%xmm0, (%1,%0) \n"
00045 "movaps %%xmm2, 16(%1,%0) \n"
00046 "add $32, %0 \n"
00047 "jl 1b \n"
00048 :"+r"(i)
00049 :"r"(dst+len), "r"(src+len), "m"(mul)
00050 );
00051 }
00052
00053 static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
00054 {
00055 x86_reg i = -4*len;
00056 __asm__ volatile(
00057 "movss %3, %%xmm4 \n"
00058 "shufps $0, %%xmm4, %%xmm4 \n"
00059 "1: \n"
00060 "cvtdq2ps (%2,%0), %%xmm0 \n"
00061 "cvtdq2ps 16(%2,%0), %%xmm1 \n"
00062 "mulps %%xmm4, %%xmm0 \n"
00063 "mulps %%xmm4, %%xmm1 \n"
00064 "movaps %%xmm0, (%1,%0) \n"
00065 "movaps %%xmm1, 16(%1,%0) \n"
00066 "add $32, %0 \n"
00067 "jl 1b \n"
00068 :"+r"(i)
00069 :"r"(dst+len), "r"(src+len), "m"(mul)
00070 );
00071 }
00072
00073 static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
00074 x86_reg reglen = len;
00075
00076 __asm__ volatile(
00077 "add %0 , %0 \n\t"
00078 "lea (%2,%0,2) , %2 \n\t"
00079 "add %0 , %1 \n\t"
00080 "neg %0 \n\t"
00081 "1: \n\t"
00082 "pf2id (%2,%0,2) , %%mm0 \n\t"
00083 "pf2id 8(%2,%0,2) , %%mm1 \n\t"
00084 "pf2id 16(%2,%0,2) , %%mm2 \n\t"
00085 "pf2id 24(%2,%0,2) , %%mm3 \n\t"
00086 "packssdw %%mm1 , %%mm0 \n\t"
00087 "packssdw %%mm3 , %%mm2 \n\t"
00088 "movq %%mm0 , (%1,%0) \n\t"
00089 "movq %%mm2 , 8(%1,%0) \n\t"
00090 "add $16 , %0 \n\t"
00091 " js 1b \n\t"
00092 "femms \n\t"
00093 :"+r"(reglen), "+r"(dst), "+r"(src)
00094 );
00095 }
00096
00097 static void float_to_int16_sse(int16_t *dst, const float *src, long len){
00098 x86_reg reglen = len;
00099 __asm__ volatile(
00100 "add %0 , %0 \n\t"
00101 "lea (%2,%0,2) , %2 \n\t"
00102 "add %0 , %1 \n\t"
00103 "neg %0 \n\t"
00104 "1: \n\t"
00105 "cvtps2pi (%2,%0,2) , %%mm0 \n\t"
00106 "cvtps2pi 8(%2,%0,2) , %%mm1 \n\t"
00107 "cvtps2pi 16(%2,%0,2) , %%mm2 \n\t"
00108 "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t"
00109 "packssdw %%mm1 , %%mm0 \n\t"
00110 "packssdw %%mm3 , %%mm2 \n\t"
00111 "movq %%mm0 , (%1,%0) \n\t"
00112 "movq %%mm2 , 8(%1,%0) \n\t"
00113 "add $16 , %0 \n\t"
00114 " js 1b \n\t"
00115 "emms \n\t"
00116 :"+r"(reglen), "+r"(dst), "+r"(src)
00117 );
00118 }
00119
00120 static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
00121 x86_reg reglen = len;
00122 __asm__ volatile(
00123 "add %0 , %0 \n\t"
00124 "lea (%2,%0,2) , %2 \n\t"
00125 "add %0 , %1 \n\t"
00126 "neg %0 \n\t"
00127 "1: \n\t"
00128 "cvtps2dq (%2,%0,2) , %%xmm0 \n\t"
00129 "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t"
00130 "packssdw %%xmm1 , %%xmm0 \n\t"
00131 "movdqa %%xmm0 , (%1,%0) \n\t"
00132 "add $16 , %0 \n\t"
00133 " js 1b \n\t"
00134 :"+r"(reglen), "+r"(dst), "+r"(src)
00135 );
00136 }
00137
/*
 * Six-channel float -> int16 interleaving kernels.
 *
 * With HAVE_YASM the three functions are only declared here; their
 * definitions live outside this file. Without it, each name becomes a macro
 * that forwards to the generic per-channel fallback with channels fixed
 * to 6. The 3dn2 name falls back to the plain 3dnow helper.
 */
#if HAVE_YASM
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
#else
#define ff_float_to_int16_interleave6_sse(a,b,c)   float_to_int16_interleave_misc_sse(a,b,c,6)
#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#define ff_float_to_int16_interleave6_3dn2(a,b,c)  float_to_int16_interleave_misc_3dnow(a,b,c,6)
#endif
/* There is no dedicated SSE2 six-channel kernel: the SSE2 name always
 * resolves to whatever the SSE name resolves to. */
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
00148
00149 #define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
00150 \
00151 static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
00152 DECLARE_ALIGNED(16, int16_t, tmp)[len];\
00153 int i,j,c;\
00154 for(c=0; c<channels; c++){\
00155 float_to_int16_##cpu(tmp, src[c], len);\
00156 for(i=0, j=c; i<len; i++, j+=channels)\
00157 dst[j] = tmp[i];\
00158 }\
00159 }\
00160 \
00161 static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
00162 if(channels==1)\
00163 float_to_int16_##cpu(dst, src[0], len);\
00164 else if(channels==2){\
00165 x86_reg reglen = len; \
00166 const float *src0 = src[0];\
00167 const float *src1 = src[1];\
00168 __asm__ volatile(\
00169 "shl $2, %0 \n"\
00170 "add %0, %1 \n"\
00171 "add %0, %2 \n"\
00172 "add %0, %3 \n"\
00173 "neg %0 \n"\
00174 body\
00175 :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
00176 );\
00177 }else if(channels==6){\
00178 ff_float_to_int16_interleave6_##cpu(dst, src, len);\
00179 }else\
00180 float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
00181 }
00182
00183 FLOAT_TO_INT16_INTERLEAVE(3dnow,
00184 "1: \n"
00185 "pf2id (%2,%0), %%mm0 \n"
00186 "pf2id 8(%2,%0), %%mm1 \n"
00187 "pf2id (%3,%0), %%mm2 \n"
00188 "pf2id 8(%3,%0), %%mm3 \n"
00189 "packssdw %%mm1, %%mm0 \n"
00190 "packssdw %%mm3, %%mm2 \n"
00191 "movq %%mm0, %%mm1 \n"
00192 "punpcklwd %%mm2, %%mm0 \n"
00193 "punpckhwd %%mm2, %%mm1 \n"
00194 "movq %%mm0, (%1,%0)\n"
00195 "movq %%mm1, 8(%1,%0)\n"
00196 "add $16, %0 \n"
00197 "js 1b \n"
00198 "femms \n"
00199 )
00200
00201 FLOAT_TO_INT16_INTERLEAVE(sse,
00202 "1: \n"
00203 "cvtps2pi (%2,%0), %%mm0 \n"
00204 "cvtps2pi 8(%2,%0), %%mm1 \n"
00205 "cvtps2pi (%3,%0), %%mm2 \n"
00206 "cvtps2pi 8(%3,%0), %%mm3 \n"
00207 "packssdw %%mm1, %%mm0 \n"
00208 "packssdw %%mm3, %%mm2 \n"
00209 "movq %%mm0, %%mm1 \n"
00210 "punpcklwd %%mm2, %%mm0 \n"
00211 "punpckhwd %%mm2, %%mm1 \n"
00212 "movq %%mm0, (%1,%0)\n"
00213 "movq %%mm1, 8(%1,%0)\n"
00214 "add $16, %0 \n"
00215 "js 1b \n"
00216 "emms \n"
00217 )
00218
00219 FLOAT_TO_INT16_INTERLEAVE(sse2,
00220 "1: \n"
00221 "cvtps2dq (%2,%0), %%xmm0 \n"
00222 "cvtps2dq (%3,%0), %%xmm1 \n"
00223 "packssdw %%xmm1, %%xmm0 \n"
00224 "movhlps %%xmm0, %%xmm1 \n"
00225 "punpcklwd %%xmm1, %%xmm0 \n"
00226 "movdqa %%xmm0, (%1,%0) \n"
00227 "add $16, %0 \n"
00228 "js 1b \n"
00229 )
00230
/**
 * Extended-3DNow! interleaving converter.
 *
 * Only the six-channel layout has a dedicated 3dn2 routine; every other
 * channel count is handed to the plain 3DNow! implementation.
 */
static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
    if (channels != 6) {
        float_to_int16_interleave_3dnow(dst, src, len, channels);
        return;
    }
    ff_float_to_int16_interleave6_3dn2(dst, src, len);
}
00237
#if HAVE_YASM
/* Two- and six-channel float interleaving kernels; declared here and
 * defined outside this file. */
void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);

void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len);

/**
 * Interleave planar float channels using the MMX kernels.
 *
 * Channel counts other than 2 and 6 are handled by ff_float_interleave_c().
 */
static void float_interleave_mmx(float *dst, const float **src,
                                 unsigned int len, int channels)
{
    switch (channels) {
    case 2:
        ff_float_interleave2_mmx(dst, src, len);
        break;
    case 6:
        ff_float_interleave6_mmx(dst, src, len);
        break;
    default:
        ff_float_interleave_c(dst, src, len, channels);
        break;
    }
}

/**
 * Interleave planar float channels using the SSE kernels.
 *
 * Channel counts other than 2 and 6 are handled by ff_float_interleave_c().
 */
static void float_interleave_sse(float *dst, const float **src,
                                 unsigned int len, int channels)
{
    switch (channels) {
    case 2:
        ff_float_interleave2_sse(dst, src, len);
        break;
    case 6:
        ff_float_interleave6_sse(dst, src, len);
        break;
    default:
        ff_float_interleave_c(dst, src, len, channels);
        break;
    }
}
#endif
00267
00268 void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
00269 {
00270 int mm_flags = av_get_cpu_flags();
00271
00272 if (mm_flags & AV_CPU_FLAG_MMX) {
00273 #if HAVE_YASM
00274 c->float_interleave = float_interleave_mmx;
00275 #endif
00276
00277 if(mm_flags & AV_CPU_FLAG_3DNOW){
00278 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
00279 c->float_to_int16 = float_to_int16_3dnow;
00280 c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
00281 }
00282 }
00283 if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
00284 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
00285 c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
00286 }
00287 }
00288 if(mm_flags & AV_CPU_FLAG_SSE){
00289 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
00290 c->float_to_int16 = float_to_int16_sse;
00291 c->float_to_int16_interleave = float_to_int16_interleave_sse;
00292 #if HAVE_YASM
00293 c->float_interleave = float_interleave_sse;
00294 #endif
00295 }
00296 if(mm_flags & AV_CPU_FLAG_SSE2){
00297 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
00298 c->float_to_int16 = float_to_int16_sse2;
00299 c->float_to_int16_interleave = float_to_int16_interleave_sse2;
00300 }
00301 }
00302 }