00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "libavutil/cpu.h"
00023 #include "libavutil/x86_cpu.h"
00024 #include "libavcodec/dsputil.h"
00025 #include "libavcodec/mpegaudiodsp.h"
00026
/*
 * Prototypes of the IMDCT36 kernels used by the imdct36_blocks_* wrappers
 * generated by DECL_IMDCT_BLOCKS below. No bodies are visible in this file;
 * presumably they live in a separate assembly source, since every caller
 * here is only compiled under HAVE_YASM (TODO confirm).
 *
 * Single-block variants: called as (out, buf, in, win), where win is one row
 * of ff_mdct_win_float.
 */
00027 void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win);
00028 void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win);
00029 void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win);
00030 void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
00031 void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win);
/*
 * Four-block variants: the callers in this file pass a row of mdct_win_sse
 * as win and a 16-byte aligned scratch array of 1024 floats as tmpbuf, and
 * afterwards advance in/buf by 4*18 floats and out by 4 floats.
 */
00032 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
00033 float *tmpbuf);
00034 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
00035 float *tmpbuf);
00036
/*
 * 16-byte aligned window table handed to the four-block IMDCT36 kernels.
 * Indexed as [switch_point && j < 4][block_type][4*i + lane].
 * It is filled by ff_mpadsp_init_mmx() from ff_mdct_win_float so that each
 * group of four consecutive floats holds coefficient i of four windows:
 *   [0][t]: rows t, t+4, t, t+4
 *   [1][t]: rows 0, 4,   t, t+4
 */
00037 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
00038
/* Multiply-accumulate step: acc += x * y. */
#define MACS(acc, x, y) acc+=(x)*(y)
/* Multiply-subtract step: acc -= x * y. */
#define MLSS(acc, x, y) acc-=(x)*(y)

/*
 * Apply `op` (MACS or MLSS) to eight coefficient/sample pairs taken at a
 * stride of 64 elements from `win` and `src`, accumulating into `acc`.
 * Expands to a brace-enclosed statement block.
 */
#define SUM8(op, acc, win, src)                 \
{                                               \
    op(acc, (win)[0 * 64], (src)[0 * 64]);      \
    op(acc, (win)[1 * 64], (src)[1 * 64]);      \
    op(acc, (win)[2 * 64], (src)[2 * 64]);      \
    op(acc, (win)[3 * 64], (src)[3 * 64]);      \
    op(acc, (win)[4 * 64], (src)[4 * 64]);      \
    op(acc, (win)[5 * 64], (src)[5 * 64]);      \
    op(acc, (win)[6 * 64], (src)[6 * 64]);      \
    op(acc, (win)[7 * 64], (src)[7 * 64]);      \
}
00053
00054 static void apply_window(const float *buf, const float *win1,
00055 const float *win2, float *sum1, float *sum2, int len)
00056 {
00057 x86_reg count = - 4*len;
00058 const float *win1a = win1+len;
00059 const float *win2a = win2+len;
00060 const float *bufa = buf+len;
00061 float *sum1a = sum1+len;
00062 float *sum2a = sum2+len;
00063
00064
00065 #define MULT(a, b) \
00066 "movaps " #a "(%1,%0), %%xmm1 \n\t" \
00067 "movaps " #a "(%3,%0), %%xmm2 \n\t" \
00068 "mulps %%xmm2, %%xmm1 \n\t" \
00069 "subps %%xmm1, %%xmm0 \n\t" \
00070 "mulps " #b "(%2,%0), %%xmm2 \n\t" \
00071 "subps %%xmm2, %%xmm4 \n\t" \
00072
00073 __asm__ volatile(
00074 "1: \n\t"
00075 "xorps %%xmm0, %%xmm0 \n\t"
00076 "xorps %%xmm4, %%xmm4 \n\t"
00077
00078 MULT( 0, 0)
00079 MULT( 256, 64)
00080 MULT( 512, 128)
00081 MULT( 768, 192)
00082 MULT(1024, 256)
00083 MULT(1280, 320)
00084 MULT(1536, 384)
00085 MULT(1792, 448)
00086
00087 "movaps %%xmm0, (%4,%0) \n\t"
00088 "movaps %%xmm4, (%5,%0) \n\t"
00089 "add $16, %0 \n\t"
00090 "jl 1b \n\t"
00091 :"+&r"(count)
00092 :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
00093 );
00094
00095 #undef MULT
00096 }
00097
00098 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
00099 int incr)
00100 {
00101 LOCAL_ALIGNED_16(float, suma, [17]);
00102 LOCAL_ALIGNED_16(float, sumb, [17]);
00103 LOCAL_ALIGNED_16(float, sumc, [17]);
00104 LOCAL_ALIGNED_16(float, sumd, [17]);
00105
00106 float sum;
00107
00108
00109 __asm__ volatile(
00110 "movaps 0(%0), %%xmm0 \n\t" \
00111 "movaps 16(%0), %%xmm1 \n\t" \
00112 "movaps 32(%0), %%xmm2 \n\t" \
00113 "movaps 48(%0), %%xmm3 \n\t" \
00114 "movaps %%xmm0, 0(%1) \n\t" \
00115 "movaps %%xmm1, 16(%1) \n\t" \
00116 "movaps %%xmm2, 32(%1) \n\t" \
00117 "movaps %%xmm3, 48(%1) \n\t" \
00118 "movaps 64(%0), %%xmm0 \n\t" \
00119 "movaps 80(%0), %%xmm1 \n\t" \
00120 "movaps 96(%0), %%xmm2 \n\t" \
00121 "movaps 112(%0), %%xmm3 \n\t" \
00122 "movaps %%xmm0, 64(%1) \n\t" \
00123 "movaps %%xmm1, 80(%1) \n\t" \
00124 "movaps %%xmm2, 96(%1) \n\t" \
00125 "movaps %%xmm3, 112(%1) \n\t"
00126 ::"r"(in), "r"(in+512)
00127 :"memory"
00128 );
00129
00130 apply_window(in + 16, win , win + 512, suma, sumc, 16);
00131 apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
00132
00133 SUM8(MACS, suma[0], win + 32, in + 48);
00134
00135 sumc[ 0] = 0;
00136 sumb[16] = 0;
00137 sumd[16] = 0;
00138
00139 #define SUMS(suma, sumb, sumc, sumd, out1, out2) \
00140 "movups " #sumd "(%4), %%xmm0 \n\t" \
00141 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
00142 "subps " #suma "(%1), %%xmm0 \n\t" \
00143 "movaps %%xmm0," #out1 "(%0) \n\t" \
00144 \
00145 "movups " #sumc "(%3), %%xmm0 \n\t" \
00146 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
00147 "addps " #sumb "(%2), %%xmm0 \n\t" \
00148 "movaps %%xmm0," #out2 "(%0) \n\t"
00149
00150 if (incr == 1) {
00151 __asm__ volatile(
00152 SUMS( 0, 48, 4, 52, 0, 112)
00153 SUMS(16, 32, 20, 36, 16, 96)
00154 SUMS(32, 16, 36, 20, 32, 80)
00155 SUMS(48, 0, 52, 4, 48, 64)
00156
00157 :"+&r"(out)
00158 :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
00159 :"memory"
00160 );
00161 out += 16*incr;
00162 } else {
00163 int j;
00164 float *out2 = out + 32 * incr;
00165 out[0 ] = -suma[ 0];
00166 out += incr;
00167 out2 -= incr;
00168 for(j=1;j<16;j++) {
00169 *out = -suma[ j] + sumd[16-j];
00170 *out2 = sumb[16-j] + sumc[ j];
00171 out += incr;
00172 out2 -= incr;
00173 }
00174 }
00175
00176 sum = 0;
00177 SUM8(MLSS, sum, win + 16 + 32, in + 32);
00178 *out = sum;
00179 }
00180
00181
/*
 * Generate imdct36_blocks_<CPU1>(): run IMDCT36 over `count` consecutive
 * blocks. Blocks are handled four at a time by ff_four_imdct36_float_<CPU2>
 * while at least four remain; the remaining (at most three) blocks go
 * through ff_imdct36_float_<CPU1> one by one.
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                        \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,       \
                                    int count, int switch_point,             \
                                    int block_type)                          \
{                                                                            \
    /* count rounded down to a multiple of four */                           \
    const int quad_end = count & ~3;                                         \
    int blk = 0;                                                             \
                                                                             \
    while (blk < quad_end) {                                                 \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                             \
        /* table 1 is used for the first group when switch_point is set */   \
        float *win = mdct_win_sse[switch_point && blk < 4][block_type];      \
                                                                             \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);           \
                                                                             \
        in  += 4*18;                                                         \
        buf += 4*18;                                                         \
        out += 4;                                                            \
        blk += 4;                                                            \
    }                                                                        \
                                                                             \
    while (blk < count) {                                                    \
        /* window 0 for the first two blocks when switch_point is set */     \
        int win_idx = block_type;                                            \
        float *win;                                                          \
                                                                             \
        if (switch_point && blk < 2)                                         \
            win_idx = 0;                                                     \
        /* odd-numbered blocks use the rows offset by four */                \
        if (blk & 1)                                                         \
            win_idx += 4;                                                    \
        win = ff_mdct_win_float[win_idx];                                    \
                                                                             \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                        \
                                                                             \
        in += 18;                                                            \
        buf++;                                                               \
        out++;                                                               \
        blk++;                                                               \
    }                                                                        \
}

/* Every SSE-family wrapper shares the SSE four-block kernel; only the
 * single-block kernel differs per instruction set. */
#if HAVE_YASM
#if HAVE_SSE
DECL_IMDCT_BLOCKS(sse,sse)
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif
00225
00226 void ff_mpadsp_init_mmx(MPADSPContext *s)
00227 {
00228 int mm_flags = av_get_cpu_flags();
00229
00230 int i, j;
00231 for (j = 0; j < 4; j++) {
00232 for (i = 0; i < 40; i ++) {
00233 mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i];
00234 mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
00235 mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i];
00236 mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
00237 mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i];
00238 mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i];
00239 mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i];
00240 mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
00241 }
00242 }
00243
00244 if (mm_flags & AV_CPU_FLAG_SSE2) {
00245 s->apply_window_float = apply_window_mp3;
00246 }
00247 #if HAVE_YASM
00248 if (0) {
00249 #if HAVE_AVX
00250 } else if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
00251 s->imdct36_blocks_float = imdct36_blocks_avx;
00252 #endif
00253 #if HAVE_SSE
00254 } else if (mm_flags & AV_CPU_FLAG_SSSE3) {
00255 s->imdct36_blocks_float = imdct36_blocks_ssse3;
00256 } else if (mm_flags & AV_CPU_FLAG_SSE3) {
00257 s->imdct36_blocks_float = imdct36_blocks_sse3;
00258 } else if (mm_flags & AV_CPU_FLAG_SSE2) {
00259 s->imdct36_blocks_float = imdct36_blocks_sse2;
00260 } else if (mm_flags & AV_CPU_FLAG_SSE) {
00261 s->imdct36_blocks_float = imdct36_blocks_sse;
00262 #endif
00263 }
00264 #endif
00265 }