FFmpeg: libavcodec/ppc/fft_altivec.c Source File

00001 /*
00002  * FFT/IFFT transforms
00003  * AltiVec-enabled
00004  * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
00005  * Based on code Copyright (c) 2002 Fabrice Bellard
00006  *
00007  * This file is part of FFmpeg.
00008  *
00009  * FFmpeg is free software; you can redistribute it and/or
00010  * modify it under the terms of the GNU Lesser General Public
00011  * License as published by the Free Software Foundation; either
00012  * version 2.1 of the License, or (at your option) any later version.
00013  *
00014  * FFmpeg is distributed in the hope that it will be useful,
00015  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00017  * Lesser General Public License for more details.
00018  *
00019  * You should have received a copy of the GNU Lesser General Public
00020  * License along with FFmpeg; if not, write to the Free Software
00021  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00022  */
00023 #include "libavcodec/fft.h"
00024 #include "dsputil_ppc.h"
00025 #include "util_altivec.h"
00026 #include "dsputil_altivec.h"
00027 
00039 static void ff_fft_calc_altivec(FFTContext *s, FFTComplex *z)
00040 {
00041 POWERPC_PERF_DECLARE(altivec_fft_num, s->nbits >= 6);
00042     register const vector float vczero = (const vector float)vec_splat_u32(0.);
00043 
00044     int ln = s->nbits;
00045     int j, np, np2;
00046     int nblocks, nloops;
00047     register FFTComplex *p, *q;
00048     FFTComplex *cptr, *cptr1;
00049     int k;
00050 
00051 POWERPC_PERF_START_COUNT(altivec_fft_num, s->nbits >= 6);
00052 
00053     np = 1 << ln;
00054 
00055     {
00056         vector float *r, a, b, a1, c1, c2;
00057 
00058         r = (vector float *)&z[0];
00059 
00060         c1 = vcii(p,p,n,n);
00061 
00062         if (s->inverse) {
00063             c2 = vcii(p,p,n,p);
00064         } else {
00065             c2 = vcii(p,p,p,n);
00066         }
00067 
00068         j = (np >> 2);
00069         do {
00070             a = vec_ld(0, r);
00071             a1 = vec_ld(sizeof(vector float), r);
00072 
00073             b = vec_perm(a,a,vcprmle(1,0,3,2));
00074             a = vec_madd(a,c1,b);
00075             /* do the pass 0 butterfly */
00076 
00077             b = vec_perm(a1,a1,vcprmle(1,0,3,2));
00078             b = vec_madd(a1,c1,b);
00079             /* do the pass 0 butterfly */
00080 
00081             /* multiply third by -i */
00082             b = vec_perm(b,b,vcprmle(2,3,1,0));
00083 
00084             /* do the pass 1 butterfly */
00085             vec_st(vec_madd(b,c2,a), 0, r);
00086             vec_st(vec_nmsub(b,c2,a), sizeof(vector float), r);
00087 
00088             r += 2;
00089         } while (--j != 0);
00090     }
00091     /* pass 2 .. ln-1 */
00092 
00093     nblocks = np >> 3;
00094     nloops = 1 << 2;
00095     np2 = np >> 1;
00096 
00097     cptr1 = s->exptab1;
00098     do {
00099         p = z;
00100         q = z + nloops;
00101         j = nblocks;
00102         do {
00103             cptr = cptr1;
00104             k = nloops >> 1;
00105             do {
00106                 vector float a,b,c,t1;
00107 
00108                 a = vec_ld(0, (float*)p);
00109                 b = vec_ld(0, (float*)q);
00110 
00111                 /* complex mul */
00112                 c = vec_ld(0, (float*)cptr);
00113                 /*  cre*re cim*re */
00114                 t1 = vec_madd(c, vec_perm(b,b,vcprmle(2,2,0,0)),vczero);
00115                 c = vec_ld(sizeof(vector float), (float*)cptr);
00116                 /*  -cim*im cre*im */
00117                 b = vec_madd(c, vec_perm(b,b,vcprmle(3,3,1,1)),t1);
00118 
00119                 /* butterfly */
00120                 vec_st(vec_add(a,b), 0, (float*)p);
00121                 vec_st(vec_sub(a,b), 0, (float*)q);
00122 
00123                 p += 2;
00124                 q += 2;
00125                 cptr += 4;
00126             } while (--k);
00127 
00128             p += nloops;
00129             q += nloops;
00130         } while (--j);
00131         cptr1 += nloops * 2;
00132         nblocks = nblocks >> 1;
00133         nloops = nloops << 1;
00134     } while (nblocks != 0);
00135 
00136 POWERPC_PERF_STOP_COUNT(altivec_fft_num, s->nbits >= 6);
00137 }
00138 
00139 av_cold void ff_fft_init_altivec(FFTContext *s)
00140 {
00141     s->fft_calc = ff_fft_calc_altivec;
00142     s->split_radix = 0;
00143 }