FFmpeg: libavcodec/x86/mpegaudiodec

/*
 * MMX optimized MP3 decoding functions
 * Copyright (c) 2010 Vitor Sessak
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
00021 
00022 #include "libavutil/cpu.h"
00023 #include "libavutil/x86_cpu.h"
00024 #include "libavcodec/dsputil.h"
00025 #include "libavcodec/mpegaudiodsp.h"
00026 
/*
 * imdct36 kernels handling one block per call, one entry point per
 * instruction-set level.  They are only declared here; the definitions are
 * outside this file (the wrappers that call them are built only when
 * HAVE_YASM is set, so presumably they come from the yasm sources --
 * confirm against the build system).
 */
void ff_imdct36_float_sse(float *out, float *buf, float *in, float *win);
void ff_imdct36_float_sse2(float *out, float *buf, float *in, float *win);
void ff_imdct36_float_sse3(float *out, float *buf, float *in, float *win);
void ff_imdct36_float_ssse3(float *out, float *buf, float *in, float *win);
void ff_imdct36_float_avx(float *out, float *buf, float *in, float *win);

/*
 * Kernels that take an additional scratch buffer.  The wrappers generated
 * by DECL_IMDCT_BLOCKS call them once per group of four blocks and pass a
 * 16-byte aligned array of 1024 floats as tmpbuf.
 */
void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
                               float *tmpbuf);
00036 
00037 DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
00038 
/* Multiply-accumulate: rt += ra * rb. */
#define MACS(rt, ra, rb) ((rt) += (ra) * (rb))
/* Multiply-subtract: rt -= ra * rb. */
#define MLSS(rt, ra, rb) ((rt) -= (ra) * (rb))

/*
 * Apply op (MACS or MLSS) to sum for eight taps read from w and p at a
 * stride of 64 elements, i.e. indices 0, 64, ..., 448.
 *
 * The body is wrapped in do { } while (0) so that "SUM8(...);" is exactly
 * one statement and stays correct inside an unbraced if/else.
 */
#define SUM8(op, sum, w, p)                   \
    do {                                      \
        op(sum, (w)[0 * 64], (p)[0 * 64]);    \
        op(sum, (w)[1 * 64], (p)[1 * 64]);    \
        op(sum, (w)[2 * 64], (p)[2 * 64]);    \
        op(sum, (w)[3 * 64], (p)[3 * 64]);    \
        op(sum, (w)[4 * 64], (p)[4 * 64]);    \
        op(sum, (w)[5 * 64], (p)[5 * 64]);    \
        op(sum, (w)[6 * 64], (p)[6 * 64]);    \
        op(sum, (w)[7 * 64], (p)[7 * 64]);    \
    } while (0)
00053 
00054 static void apply_window(const float *buf, const float *win1,
00055                          const float *win2, float *sum1, float *sum2, int len)
00056 {
00057     x86_reg count = - 4*len;
00058     const float *win1a = win1+len;
00059     const float *win2a = win2+len;
00060     const float *bufa  = buf+len;
00061     float *sum1a = sum1+len;
00062     float *sum2a = sum2+len;
00063 
00064 
00065 #define MULT(a, b)                                 \
00066     "movaps " #a "(%1,%0), %%xmm1           \n\t"  \
00067     "movaps " #a "(%3,%0), %%xmm2           \n\t"  \
00068     "mulps         %%xmm2, %%xmm1           \n\t"  \
00069     "subps         %%xmm1, %%xmm0           \n\t"  \
00070     "mulps  " #b "(%2,%0), %%xmm2           \n\t"  \
00071     "subps         %%xmm2, %%xmm4           \n\t"  \
00072 
00073     __asm__ volatile(
00074             "1:                                   \n\t"
00075             "xorps       %%xmm0, %%xmm0           \n\t"
00076             "xorps       %%xmm4, %%xmm4           \n\t"
00077 
00078             MULT(   0,   0)
00079             MULT( 256,  64)
00080             MULT( 512, 128)
00081             MULT( 768, 192)
00082             MULT(1024, 256)
00083             MULT(1280, 320)
00084             MULT(1536, 384)
00085             MULT(1792, 448)
00086 
00087             "movaps      %%xmm0, (%4,%0)          \n\t"
00088             "movaps      %%xmm4, (%5,%0)          \n\t"
00089             "add            $16,  %0              \n\t"
00090             "jl              1b                   \n\t"
00091             :"+&r"(count)
00092             :"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
00093             );
00094 
00095 #undef MULT
00096 }
00097 
00098 static void apply_window_mp3(float *in, float *win, int *unused, float *out,
00099                              int incr)
00100 {
00101     LOCAL_ALIGNED_16(float, suma, [17]);
00102     LOCAL_ALIGNED_16(float, sumb, [17]);
00103     LOCAL_ALIGNED_16(float, sumc, [17]);
00104     LOCAL_ALIGNED_16(float, sumd, [17]);
00105 
00106     float sum;
00107 
00108     /* copy to avoid wrap */
00109     memcpy(in + 512, in, 32 * sizeof(*in));
00110 
00111     apply_window(in + 16, win     , win + 512, suma, sumc, 16);
00112     apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
00113 
00114     SUM8(MACS, suma[0], win + 32, in + 48);
00115 
00116     sumc[ 0] = 0;
00117     sumb[16] = 0;
00118     sumd[16] = 0;
00119 
00120 #define SUMS(suma, sumb, sumc, sumd, out1, out2)               \
00121             "movups " #sumd "(%4),       %%xmm0          \n\t" \
00122             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
00123             "subps  " #suma "(%1),       %%xmm0          \n\t" \
00124             "movaps        %%xmm0," #out1 "(%0)          \n\t" \
00125 \
00126             "movups " #sumc "(%3),       %%xmm0          \n\t" \
00127             "shufps         $0x1b,       %%xmm0, %%xmm0  \n\t" \
00128             "addps  " #sumb "(%2),       %%xmm0          \n\t" \
00129             "movaps        %%xmm0," #out2 "(%0)          \n\t"
00130 
00131     if (incr == 1) {
00132         __asm__ volatile(
00133             SUMS( 0, 48,  4, 52,  0, 112)
00134             SUMS(16, 32, 20, 36, 16,  96)
00135             SUMS(32, 16, 36, 20, 32,  80)
00136             SUMS(48,  0, 52,  4, 48,  64)
00137 
00138             :"+&r"(out)
00139             :"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
00140             :"memory"
00141             );
00142         out += 16*incr;
00143     } else {
00144         int j;
00145         float *out2 = out + 32 * incr;
00146         out[0  ]  = -suma[   0];
00147         out += incr;
00148         out2 -= incr;
00149         for(j=1;j<16;j++) {
00150             *out  = -suma[   j] + sumd[16-j];
00151             *out2 =  sumb[16-j] + sumc[   j];
00152             out  += incr;
00153             out2 -= incr;
00154         }
00155     }
00156 
00157     sum = 0;
00158     SUM8(MLSS, sum, win + 16 + 32, in + 32);
00159     *out = sum;
00160 }
00161 
00162 
/*
 * Define imdct36_blocks_<CPU1>(), which runs the imdct36 kernels over
 * `count` blocks:
 *  - complete groups of four blocks go through ff_four_imdct36_float_<CPU2>
 *    with a window taken from mdct_win_sse;
 *  - the remaining (count & 3) blocks go through ff_imdct36_float_<CPU1>
 *    one at a time with a window taken from ff_mdct_win_float.
 */
#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
                               int count, int switch_point, int block_type) \
{                                                                           \
    int align_end = count - (count & 3);                                    \
    int j;                                                                  \
                                                                            \
    for (j = 0; j < align_end; j+= 4) {                                     \
        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                            \
        /* select window: set 1 (window 0 in its first two slots) is */     \
        /* used only for the first group when switch_point is set    */     \
        float *win = mdct_win_sse[switch_point && j < 4][block_type];       \
                                                                            \
        /* apply window & overlap with previous buffer */                   \
        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);          \
        in      += 4*18;                                                    \
        buf     += 4*18;                                                    \
        out     += 4;                                                       \
    }                                                                       \
    for (; j < count; j++) {                                                \
        /* select window: window 0 for the first two blocks when     */     \
        /* switch_point is set; odd blocks use the rows at offset 4  */     \
        int win_idx = (switch_point && j < 2) ? 0 : block_type;             \
        float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))];           \
                                                                            \
        /* apply window & overlap with previous buffer */                   \
        ff_imdct36_float_ ## CPU1(out, buf, in, win);                       \
                                                                            \
        in  += 18;                                                          \
        buf++;                                                              \
        out++;                                                              \
    }                                                                       \
}
00194 
/*
 * Instantiate the block wrappers.  Every SSE-family wrapper uses its own
 * single-block kernel but shares ff_four_imdct36_float_sse for the groups
 * of four; the AVX wrapper uses the AVX kernel for both.
 */
#if HAVE_YASM
#if HAVE_SSE
DECL_IMDCT_BLOCKS(sse,sse)
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif
00206 
00207 void ff_mpadsp_init_mmx(MPADSPContext *s)
00208 {
00209     int mm_flags = av_get_cpu_flags();
00210 
00211     int i, j;
00212     for (j = 0; j < 4; j++) {
00213         for (i = 0; i < 40; i ++) {
00214             mdct_win_sse[0][j][4*i    ] = ff_mdct_win_float[j    ][i];
00215             mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
00216             mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j    ][i];
00217             mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
00218             mdct_win_sse[1][j][4*i    ] = ff_mdct_win_float[0    ][i];
00219             mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4    ][i];
00220             mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j    ][i];
00221             mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
00222         }
00223     }
00224 
00225     if (mm_flags & AV_CPU_FLAG_SSE2) {
00226         s->apply_window_float = apply_window_mp3;
00227     }
00228 #if HAVE_YASM
00229     if (0) {
00230 #if HAVE_AVX
00231     } else if (mm_flags & AV_CPU_FLAG_AVX && HAVE_AVX) {
00232         s->imdct36_blocks_float = imdct36_blocks_avx;
00233 #endif
00234 #if HAVE_SSE
00235     } else if (mm_flags & AV_CPU_FLAG_SSSE3) {
00236         s->imdct36_blocks_float = imdct36_blocks_ssse3;
00237     } else if (mm_flags & AV_CPU_FLAG_SSE3) {
00238         s->imdct36_blocks_float = imdct36_blocks_sse3;
00239     } else if (mm_flags & AV_CPU_FLAG_SSE2) {
00240         s->imdct36_blocks_float = imdct36_blocks_sse2;
00241     } else if (mm_flags & AV_CPU_FLAG_SSE) {
00242         s->imdct36_blocks_float = imdct36_blocks_sse;
00243 #endif /* HAVE_SSE */
00244     }
00245 #endif /* HAVE_YASM */
00246 }
libavcodec/x86/mpegaudiodec_mmx.c