FFmpeg: libavcodec/x86/mpegaudiodec_mmx.c

00001 /*
00002  * MMX optimized MP3 decoding functions
00003  * Copyright (c) 2010 Vitor Sessak
00004  *
00005  * This file is part of FFmpeg.
00006  *
00007  * FFmpeg is free software; you can redistribute it and/or
00008  * modify it under the terms of the GNU Lesser General Public
00009  * License as published by the Free Software Foundation; either
00010  * version 2.1 of the License, or (at your option) any later version.
00011  *
00012  * FFmpeg is distributed in the hope that it will be useful,
00013  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00014  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00015  * Lesser General Public License for more details.
00016  *
00017  * You should have received a copy of the GNU Lesser General Public
00018  * License along with FFmpeg; if not, write to the Free Software
00019  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00020  */
00021 
00022 #include "libavutil/cpu.h"
00023 #include "libavutil/x86_cpu.h"
00024 #include "libavcodec/dsputil.h"
00025 #include "libavcodec/mpegaudiodsp.h"
00026 
/*
 * Externally defined imdct36 kernels. Their definitions are not in this
 * file; every use below is guarded by HAVE_YASM, so they are presumably
 * provided by the yasm sources (confirm against the build files).
 */

/* Single-block variants, one symbol per instruction-set level. */
void ff_imdct36_float_sse  (float *out, float *buf, float *in, float *win);
void ff_imdct36_float_sse2 (float *out, float *buf, float *in, float *win);
void ff_imdct36_float_sse3 (float *out, float *buf, float *in, float *win);
void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
void ff_imdct36_float_avx  (float *out, float *buf, float *in, float *win);

/* Four-blocks-per-call variants; they take an extra scratch buffer. */
void ff_four_imdct36_float_sse(float *out, float *buf,
                               float *in, float *win, float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf,
                               float *in, float *win, float *tmpbuf);
00036 
00037 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
00038 
/*
 * Multiply-accumulate / multiply-subtract helpers: rt += ra * rb and
 * rt -= ra * rb. The whole expansion and every argument are parenthesized
 * so the macros stay correct when used inside a larger expression.
 */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))
00041 
/*
 * Apply op(sum, w[k * 64], p[k * 64]) for k = 0..7.
 *
 * op is a three-argument macro such as MACS or MLSS. The w and p
 * arguments are expanded eight times each, so they must be free of side
 * effects. The body is wrapped in do { } while (0) so that the usual
 * "SUM8(...);" spelling is a single statement, also inside an unbraced
 * if/else.
 */
#define SUM8(op, sum, w, p)               \
do {                                      \
    op(sum, (w)[0 * 64], (p)[0 * 64]);    \
    op(sum, (w)[1 * 64], (p)[1 * 64]);    \
    op(sum, (w)[2 * 64], (p)[2 * 64]);    \
    op(sum, (w)[3 * 64], (p)[3 * 64]);    \
    op(sum, (w)[4 * 64], (p)[4 * 64]);    \
    op(sum, (w)[5 * 64], (p)[5 * 64]);    \
    op(sum, (w)[6 * 64], (p)[6 * 64]);    \
    op(sum, (w)[7 * 64], (p)[7 * 64]);    \
} while (0)
00053 
00054 static void apply_window(const float *buf, const float *win1,
00055                          const float *win2, float *sum1, float *sum2, int len)
00056 {
00057     x86_reg count = - 4*len;
00058     const float *win1a = win1+len;
00059     const float *win2a = win2+len;
00060     const float *bufa  = buf+len;
00061     float *sum1a = sum1+len;
00062     float *sum2a = sum2+len;
00063 
00064 
00065 #define MULT(a, b)                                 \
00066     "movaps " #a "(%1,%0), %%xmm1           \n\t"  \
00067     "movaps " #a "(%3,%0), %%xmm2           \n\t"  \
00068     "mulps         %%xmm2, %%xmm1           \n\t"  \
00069     "subps         %%xmm1, %%xmm0           \n\t"  \
00070     "mulps  " #b "(%2,%0), %%xmm2           \n\t"  \
00071     "subps         %%xmm2, %%xmm4           \n\t"  \
00072 
00073     __asm__ volatile(
00074             "1:                                   \n\t"
00075             "xorps       %%xmm0, %%xmm0           \n\t"
00076             "xorps       %%xmm4, %%xmm4           \n\t"
00077 
00078             MULT(   0,   0)
00079             MULT( 256,  64)
00080             MULT( 512, 128)
00081             MULT( 768, 192)
00082             MULT(1024, 256)
00083             MULT(1280, 320)
00084             MULT(1536, 384)
00085             MULT(1792, 448)
00086 
00087             "movaps      %%xmm0, (%4,%0)          \n\t"
00088             "movaps      %%xmm4, (%5,%0)          \n\t"
00089             "add            $16,  %0              \n\t"
00090             "jl              1b                   \n\t"
00091             :"+&r"(count)
00092             :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
00093             );
00094 
00095 #undef MULT
00096 }
00097 
00098 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
00099                              int incr)
00100 {
00101     LOCAL_ALIGNED_16(float, suma, [17]);
00102     LOCAL_ALIGNED_16(float, sumb, [17]);
00103     LOCAL_ALIGNED_16(float, sumc, [17]);
00104     LOCAL_ALIGNED_16(float, sumd, [17]);
00105 
00106     float sum;
00107 
00108     /* copy to avoid wrap */
00109     __asm__ volatile(
00110             "movaps    0(%0), %%xmm0   \n\t" \
00111             "movaps   16(%0), %%xmm1   \n\t" \
00112             "movaps   32(%0), %%xmm2   \n\t" \
00113             "movaps   48(%0), %%xmm3   \n\t" \
00114             "movaps   %%xmm0,   0(%1) \n\t" \
00115             "movaps   %%xmm1,  16(%1) \n\t" \
00116             "movaps   %%xmm2,  32(%1) \n\t" \
00117             "movaps   %%xmm3,  48(%1) \n\t" \
00118             "movaps   64(%0), %%xmm0   \n\t" \
00119             "movaps   80(%0), %%xmm1   \n\t" \
00120             "movaps   96(%0), %%xmm2   \n\t" \
00121             "movaps  112(%0), %%xmm3   \n\t" \
00122             "movaps   %%xmm0,  64(%1) \n\t" \
00123             "movaps   %%xmm1,  80(%1) \n\t" \
00124             "movaps   %%xmm2,  96(%1) \n\t" \
00125             "movaps   %%xmm3, 112(%1) \n\t"
00126             ::"r"(in), "r"(in+512)
00127             :"memory"
00128             );
00129 
00130     apply_window(in + 16, win     , win + 512, suma, sumc, 16);
00131     apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
00132 
00133     SUM8(MACS, suma[0], win + 32, in + 48);
00134 
00135     sumc[ 0] = 0;
00136     sumb[16] = 0;
00137     sumd[16] = 0;
00138 
00139 #define SUMS(suma, sumb, sumc, sumd, out1, out2)               \
00140             "movups " #sumd "(%4),       %%xmm0          \n\t" \
00141             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
00142             "subps  " #suma "(%1),       %%xmm0          \n\t" \
00143             "movaps        %%xmm0," #out1 "(%0)          \n\t" \
00144 \
00145             "movups " #sumc "(%3),       %%xmm0          \n\t" \
00146             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
00147             "addps  " #sumb "(%2),       %%xmm0          \n\t" \
00148             "movaps        %%xmm0," #out2 "(%0)          \n\t"
00149 
00150     if (incr == 1) {
00151         __asm__ volatile(
00152             SUMS( 0, 48,  4, 52,  0, 112)
00153             SUMS(16, 32, 20, 36, 16,  96)
00154             SUMS(32, 16, 36, 20, 32,  80)
00155             SUMS(48,  0, 52,  4, 48,  64)
00156 
00157             :"+&r"(out)
00158             :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
00159             :"memory"
00160             );
00161         out += 16*incr;
00162     } else {
00163         int j;
00164         float *out2 = out + 32 * incr;
00165         out[0  ]  = -suma[   0];
00166         out += incr;
00167         out2 -= incr;
00168         for(j=1;j<16;j++) {
00169             *out  = -suma[   j] + sumd[16-j];
00170             *out2 =  sumb[16-j] + sumc[   j];
00171             out  += incr;
00172             out2 -= incr;
00173         }
00174     }
00175 
00176     sum = 0;
00177     SUM8(MLSS, sum, win + 16 + 32, in + 32);
00178     *out = sum;
00179 }
00180 
00181 
/*
 * Define imdct36_blocks_<CPU1>(): run the imdct36 over count consecutive
 * blocks. Groups of four blocks go through ff_four_imdct36_float_<CPU2>()
 * with an interleaved window from mdct_win_sse; the remaining 0..3 blocks
 * go through ff_imdct36_float_<CPU1>() one at a time.
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
                               int count, int switch_point, int block_type) \
{                                                                           \
    const int quad_end = count & ~3; /* count rounded down to 4 */          \
    int j;                                                                  \
                                                                            \
    for (j = 0; j < quad_end; j += 4, in += 4*18, buf += 4*18, out += 4) {  \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                            \
        /* only the first group can use the switch-point table */           \
        float *win = mdct_win_sse[switch_point && j < 4][block_type];       \
                                                                            \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);          \
    }                                                                       \
                                                                            \
    for (; j < count; j++, in += 18, buf++, out++) {                        \
        /* the first two blocks use window 0 when switch_point is set */    \
        int win_idx = (switch_point && j < 2) ? 0 : block_type;             \
        float *win;                                                         \
                                                                            \
        /* odd blocks use the window four rows further down */              \
        if (j & 1)                                                          \
            win_idx += 4;                                                   \
        win = ff_mdct_win_float[win_idx];                                   \
                                                                            \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                       \
    }                                                                       \
}
00213 
/*
 * Instantiate one imdct36_blocks_<cpu>() per supported instruction set.
 * All SSE-family variants share the SSE four-block kernel; only AVX has
 * its own.
 */
#if HAVE_YASM && HAVE_SSE
DECL_IMDCT_BLOCKS(sse,   sse)
DECL_IMDCT_BLOCKS(sse2,  sse)
DECL_IMDCT_BLOCKS(sse3,  sse)
DECL_IMDCT_BLOCKS(ssse3, sse)
#endif /* HAVE_YASM && HAVE_SSE */

#if HAVE_YASM && HAVE_AVX
DECL_IMDCT_BLOCKS(avx, avx)
#endif /* HAVE_YASM && HAVE_AVX */
00225 
00226 void ff_mpadsp_init_mmx(MPADSPContext *s)
00227 {
00228     int mm_flags = av_get_cpu_flags();
00229 
00230     int i, j;
00231     for (j = 0; j < 4; j++) {
00232         for (i = 0; i < 40; i ++) {
00233             mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
00234             mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
00235             mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
00236             mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
00237             mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
00238             mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
00239             mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
00240             mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
00241         }
00242     }
00243 
00244     if (mm_flags & AV_CPU_FLAG_SSE2) {
00245         s->apply_window_float = apply_window_mp3;
00246     }
00247 #if HAVE_YASM
00248     if (0) {
00249 #if HAVE_AVX
00250     } else if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
00251         s->imdct36_blocks_float = imdct36_blocks_avx;
00252 #endif
00253 #if HAVE_SSE
00254     } else if (mm_flags & AV_CPU_FLAG_SSSE3) {
00255         s->imdct36_blocks_float = imdct36_blocks_ssse3;
00256     } else if (mm_flags & AV_CPU_FLAG_SSE3) {
00257         s->imdct36_blocks_float = imdct36_blocks_sse3;
00258     } else if (mm_flags & AV_CPU_FLAG_SSE2) {
00259         s->imdct36_blocks_float = imdct36_blocks_sse2;
00260     } else if (mm_flags & AV_CPU_FLAG_SSE) {
00261         s->imdct36_blocks_float = imdct36_blocks_sse;
00262 #endif /* HAVE_SSE */
00263     }
00264 #endif /* HAVE_YASM */
00265 }
libavcodec/x86/mpegaudiodec_mmx.c