00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "libavutil/cpu.h"
00023 #include "libavutil/x86_cpu.h"
00024 #include "libavcodec/dsputil.h"
00025 #include "libavcodec/mpegaudiodsp.h"
00026
00027 void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win);
00028 void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win);
00029 void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win);
00030 void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
00031 void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win);
00032 void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
00033 float *tmpbuf);
00034 void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
00035 float *tmpbuf);
00036
00037 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
00038
/* Multiply-accumulate: rt += ra * rb. Usable as a single expression. */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
/* Multiply-subtract: rt -= ra * rb. Usable as a single expression. */
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))

/*
 * Apply `op` (MACS or MLSS) to `sum` for eight taps taken 64 elements apart
 * from `w` and `p`:  sum (+/-)= w[64*k] * p[64*k]  for k = 0..7.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves like a
 * single statement and can be followed by ';' anywhere, including between
 * an `if` and its `else`.
 */
#define SUM8(op, sum, w, p)                 \
do {                                        \
    op(sum, (w)[0 * 64], (p)[0 * 64]);      \
    op(sum, (w)[1 * 64], (p)[1 * 64]);      \
    op(sum, (w)[2 * 64], (p)[2 * 64]);      \
    op(sum, (w)[3 * 64], (p)[3 * 64]);      \
    op(sum, (w)[4 * 64], (p)[4 * 64]);      \
    op(sum, (w)[5 * 64], (p)[5 * 64]);      \
    op(sum, (w)[6 * 64], (p)[6 * 64]);      \
    op(sum, (w)[7 * 64], (p)[7 * 64]);      \
} while (0)
00053
00054 static void apply_window(const float *buf, const float *win1,
00055 const float *win2, float *sum1, float *sum2, int len)
00056 {
00057 x86_reg count = - 4*len;
00058 const float *win1a = win1+len;
00059 const float *win2a = win2+len;
00060 const float *bufa = buf+len;
00061 float *sum1a = sum1+len;
00062 float *sum2a = sum2+len;
00063
00064
00065 #define MULT(a, b) \
00066 "movaps " #a "(%1,%0), %%xmm1 \n\t" \
00067 "movaps " #a "(%3,%0), %%xmm2 \n\t" \
00068 "mulps %%xmm2, %%xmm1 \n\t" \
00069 "subps %%xmm1, %%xmm0 \n\t" \
00070 "mulps " #b "(%2,%0), %%xmm2 \n\t" \
00071 "subps %%xmm2, %%xmm4 \n\t" \
00072
00073 __asm__ volatile(
00074 "1: \n\t"
00075 "xorps %%xmm0, %%xmm0 \n\t"
00076 "xorps %%xmm4, %%xmm4 \n\t"
00077
00078 MULT( 0, 0)
00079 MULT( 256, 64)
00080 MULT( 512, 128)
00081 MULT( 768, 192)
00082 MULT(1024, 256)
00083 MULT(1280, 320)
00084 MULT(1536, 384)
00085 MULT(1792, 448)
00086
00087 "movaps %%xmm0, (%4,%0) \n\t"
00088 "movaps %%xmm4, (%5,%0) \n\t"
00089 "add $16, %0 \n\t"
00090 "jl 1b \n\t"
00091 :"+&r"(count)
00092 :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
00093 );
00094
00095 #undef MULT
00096 }
00097
00098 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
00099 int incr)
00100 {
00101 LOCAL_ALIGNED_16(float, suma, [17]);
00102 LOCAL_ALIGNED_16(float, sumb, [17]);
00103 LOCAL_ALIGNED_16(float, sumc, [17]);
00104 LOCAL_ALIGNED_16(float, sumd, [17]);
00105
00106 float sum;
00107
00108
00109 memcpy(in + 512, in, 32 * sizeof(*in));
00110
00111 apply_window(in + 16, win , win + 512, suma, sumc, 16);
00112 apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
00113
00114 SUM8(MACS, suma[0], win + 32, in + 48);
00115
00116 sumc[ 0] = 0;
00117 sumb[16] = 0;
00118 sumd[16] = 0;
00119
00120 #define SUMS(suma, sumb, sumc, sumd, out1, out2) \
00121 "movups " #sumd "(%4), %%xmm0 \n\t" \
00122 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
00123 "subps " #suma "(%1), %%xmm0 \n\t" \
00124 "movaps %%xmm0," #out1 "(%0) \n\t" \
00125 \
00126 "movups " #sumc "(%3), %%xmm0 \n\t" \
00127 "shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
00128 "addps " #sumb "(%2), %%xmm0 \n\t" \
00129 "movaps %%xmm0," #out2 "(%0) \n\t"
00130
00131 if (incr == 1) {
00132 __asm__ volatile(
00133 SUMS( 0, 48, 4, 52, 0, 112)
00134 SUMS(16, 32, 20, 36, 16, 96)
00135 SUMS(32, 16, 36, 20, 32, 80)
00136 SUMS(48, 0, 52, 4, 48, 64)
00137
00138 :"+&r"(out)
00139 :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
00140 :"memory"
00141 );
00142 out += 16*incr;
00143 } else {
00144 int j;
00145 float *out2 = out + 32 * incr;
00146 out[0 ] = -suma[ 0];
00147 out += incr;
00148 out2 -= incr;
00149 for(j=1;j<16;j++) {
00150 *out = -suma[ j] + sumd[16-j];
00151 *out2 = sumb[16-j] + sumc[ j];
00152 out += incr;
00153 out2 -= incr;
00154 }
00155 }
00156
00157 sum = 0;
00158 SUM8(MLSS, sum, win + 16 + 32, in + 32);
00159 *out = sum;
00160 }
00161
00162
/*
 * DECL_IMDCT_BLOCKS(CPU1, CPU2) defines
 *
 *     static void imdct36_blocks_<CPU1>(float *out, float *buf, float *in,
 *                                       int count, int switch_point,
 *                                       int block_type);
 *
 * which runs `count` IMDCT blocks. Whole groups of four blocks are handed to
 * ff_four_imdct36_float_<CPU2>() using the interleaved mdct_win_sse table;
 * the remaining 0..3 blocks are processed one by one with
 * ff_imdct36_float_<CPU1>() and the plain ff_mdct_win_float windows.
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
                               int count, int switch_point, int block_type) \
{                                                                           \
    /* number of blocks covered by complete groups of four */               \
    const int quad_end = count & ~3;                                        \
    int blk = 0;                                                            \
                                                                            \
    while (blk < quad_end) {                                                \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                            \
        /* only the very first group can use the switch-point layout */     \
        const int use_switch = switch_point && blk < 4;                     \
        float *win = mdct_win_sse[use_switch][block_type];                  \
                                                                            \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);          \
                                                                            \
        out += 4;                                                           \
        buf += 4*18;                                                        \
        in  += 4*18;                                                        \
        blk += 4;                                                           \
    }                                                                       \
                                                                            \
    while (blk < count) {                                                   \
        float *win;                                                         \
        int win_idx = block_type;                                           \
                                                                            \
        /* blocks 0 and 1 use window 0 when switch_point is set */          \
        if (switch_point && blk < 2)                                        \
            win_idx = 0;                                                    \
        /* odd blocks use the second half of the window table */            \
        if (blk & 1)                                                        \
            win_idx += 4;                                                   \
        win = ff_mdct_win_float[win_idx];                                   \
                                                                            \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                       \
                                                                            \
        out++;                                                              \
        buf++;                                                              \
        in += 18;                                                           \
        blk++;                                                              \
    }                                                                       \
}

/* The external kernels are only referenced when HAVE_YASM is set. */
#if HAVE_YASM
#if HAVE_SSE
/* All SSE-family variants share the SSE four-block kernel. */
DECL_IMDCT_BLOCKS(sse,   sse)
DECL_IMDCT_BLOCKS(sse2,  sse)
DECL_IMDCT_BLOCKS(sse3,  sse)
DECL_IMDCT_BLOCKS(ssse3, sse)
#endif
#if HAVE_AVX
DECL_IMDCT_BLOCKS(avx, avx)
#endif
#endif /* HAVE_YASM */
00206
00207 void ff_mpadsp_init_mmx(MPADSPContext *s)
00208 {
00209 int mm_flags = av_get_cpu_flags();
00210
00211 int i, j;
00212 for (j = 0; j < 4; j++) {
00213 for (i = 0; i < 40; i ++) {
00214 mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i];
00215 mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
00216 mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i];
00217 mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
00218 mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i];
00219 mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i];
00220 mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i];
00221 mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
00222 }
00223 }
00224
00225 if (mm_flags & AV_CPU_FLAG_SSE2) {
00226 s->apply_window_float = apply_window_mp3;
00227 }
00228 #if HAVE_YASM
00229 if (0) {
00230 #if HAVE_AVX
00231 } else if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
00232 s->imdct36_blocks_float = imdct36_blocks_avx;
00233 #endif
00234 #if HAVE_SSE
00235 } else if (mm_flags & AV_CPU_FLAG_SSSE3) {
00236 s->imdct36_blocks_float = imdct36_blocks_ssse3;
00237 } else if (mm_flags & AV_CPU_FLAG_SSE3) {
00238 s->imdct36_blocks_float = imdct36_blocks_sse3;
00239 } else if (mm_flags & AV_CPU_FLAG_SSE2) {
00240 s->imdct36_blocks_float = imdct36_blocks_sse2;
00241 } else if (mm_flags & AV_CPU_FLAG_SSE) {
00242 s->imdct36_blocks_float = imdct36_blocks_sse;
00243 #endif
00244 }
00245 #endif
00246 }