00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "libavutil/cpu.h"
00023 #include "libavutil/x86_cpu.h"
00024 #include "libavcodec/dsputil.h"
00025 #include "libavcodec/mpegaudiodsp.h"
00026
/*
 * 36-point float IMDCT kernels, one per instruction-set level.
 * They are defined outside this file (their use below is guarded by
 * HAVE_YASM, so presumably in external assembly) and are selected at
 * run time by ff_mpadsp_init_mmx().
 */
void ff_imdct36_float_sse  (float *out, float *buf, float *in, float *win);
void ff_imdct36_float_sse2 (float *out, float *buf, float *in, float *win);
void ff_imdct36_float_sse3 (float *out, float *buf, float *in, float *win);
void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
void ff_imdct36_float_avx  (float *out, float *buf, float *in, float *win);
00032
/* Multiply-accumulate: rt += ra * rb. */
#define MACS(rt, ra, rb) rt += (ra) * (rb)
/* Multiply-subtract: rt -= ra * rb. */
#define MLSS(rt, ra, rb) rt -= (ra) * (rb)

/*
 * Apply op (MACS or MLSS) to sum for eight element pairs of w and p,
 * taken 64 elements apart: indices 0, 64, 128, ..., 448 of both arrays.
 * The eight operations are issued in ascending index order.
 */
#define SUM8(op, sum, w, p)           \
{                                     \
    op(sum, (w)[  0], (p)[  0]);      \
    op(sum, (w)[ 64], (p)[ 64]);      \
    op(sum, (w)[128], (p)[128]);      \
    op(sum, (w)[192], (p)[192]);      \
    op(sum, (w)[256], (p)[256]);      \
    op(sum, (w)[320], (p)[320]);      \
    op(sum, (w)[384], (p)[384]);      \
    op(sum, (w)[448], (p)[448]);      \
}
00047
00048 static void apply_window(const float *buf, const float *win1,
00049 const float *win2, float *sum1, float *sum2, int len)
00050 {
00051 x86_reg count = - 4*len;
00052 const float *win1a = win1+len;
00053 const float *win2a = win2+len;
00054 const float *bufa = buf+len;
00055 float *sum1a = sum1+len;
00056 float *sum2a = sum2+len;
00057
00058
00059 #define MULT(a, b) \
00060 "movaps " #a "(%1,%0), %%xmm1 \n\t" \
00061 "movaps " #a "(%3,%0), %%xmm2 \n\t" \
00062 "mulps %%xmm2, %%xmm1 \n\t" \
00063 "subps %%xmm1, %%xmm0 \n\t" \
00064 "mulps " #b "(%2,%0), %%xmm2 \n\t" \
00065 "subps %%xmm2, %%xmm4 \n\t" \
00066
00067 __asm__ volatile(
00068 "1: \n\t"
00069 "xorps %%xmm0, %%xmm0 \n\t"
00070 "xorps %%xmm4, %%xmm4 \n\t"
00071
00072 MULT( 0, 0)
00073 MULT( 256, 64)
00074 MULT( 512, 128)
00075 MULT( 768, 192)
00076 MULT(1024, 256)
00077 MULT(1280, 320)
00078 MULT(1536, 384)
00079 MULT(1792, 448)
00080
00081 "movaps %%xmm0, (%4,%0) \n\t"
00082 "movaps %%xmm4, (%5,%0) \n\t"
00083 "add $16, %0 \n\t"
00084 "jl 1b \n\t"
00085 :"+&r"(count)
00086 :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
00087 );
00088
00089 #undef MULT
00090 }
00091
00092 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
00093 int incr)
00094 {
00095 LOCAL_ALIGNED_16(float, suma, [17]);
00096 LOCAL_ALIGNED_16(float, sumb, [17]);
00097 LOCAL_ALIGNED_16(float, sumc, [17]);
00098 LOCAL_ALIGNED_16(float, sumd, [17]);
00099
00100 float sum;
00101
00102
00103 memcpy(in + 512, in, 32 * sizeof(*in));
00104
00105 apply_window(in + 16, win , win + 512, suma, sumc, 16);
00106 apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
00107
00108 SUM8(MACS, suma[0], win + 32, in + 48);
00109
00110 sumc[ 0] = 0;
00111 sumb[16] = 0;
00112 sumd[16] = 0;
00113
00114 #define SUMS(suma, sumb, sumc, sumd, out1, out2) \
00115 "movups " #sumd "(%4), %%xmm0 \n\t" \
00116 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
00117 "subps " #suma "(%1), %%xmm0 \n\t" \
00118 "movaps %%xmm0," #out1 "(%0) \n\t" \
00119 \
00120 "movups " #sumc "(%3), %%xmm0 \n\t" \
00121 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
00122 "addps " #sumb "(%2), %%xmm0 \n\t" \
00123 "movaps %%xmm0," #out2 "(%0) \n\t"
00124
00125 if (incr == 1) {
00126 __asm__ volatile(
00127 SUMS( 0, 48, 4, 52, 0, 112)
00128 SUMS(16, 32, 20, 36, 16, 96)
00129 SUMS(32, 16, 36, 20, 32, 80)
00130 SUMS(48, 0, 52, 4, 48, 64)
00131
00132 :"+&r"(out)
00133 :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
00134 :"memory"
00135 );
00136 out += 16*incr;
00137 } else {
00138 int j;
00139 float *out2 = out + 32 * incr;
00140 out[0 ] = -suma[ 0];
00141 out += incr;
00142 out2 -= incr;
00143 for(j=1;j<16;j++) {
00144 *out = -suma[ j] + sumd[16-j];
00145 *out2 = sumb[16-j] + sumc[ j];
00146 out += incr;
00147 out2 -= incr;
00148 }
00149 }
00150
00151 sum = 0;
00152 SUM8(MLSS, sum, win + 16 + 32, in + 32);
00153 *out = sum;
00154 }
00155
00156 void ff_mpadsp_init_mmx(MPADSPContext *s)
00157 {
00158 int mm_flags = av_get_cpu_flags();
00159
00160 if (mm_flags & AV_CPU_FLAG_SSE2) {
00161 s->apply_window_float = apply_window_mp3;
00162 }
00163 if (HAVE_YASM && mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
00164 s->imdct36_float = ff_imdct36_float_avx;
00165 }
00166 else if (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSSE3 && HAVE_SSE) {
00167 s->imdct36_float = ff_imdct36_float_ssse3;
00168 }
00169 else if (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE3 && HAVE_SSE) {
00170 s->imdct36_float = ff_imdct36_float_sse3;
00171 }
00172 else if (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
00173 s->imdct36_float = ff_imdct36_float_sse2;
00174 }
00175 else if (HAVE_YASM && mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
00176 s->imdct36_float = ff_imdct36_float_sse;
00177 }
00178 }