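/*
 * AltiVec-accelerated inverse discrete cosine transform.
 *
 * The 8x8 IDCT is computed as two passes of an 8-point 1-D transform
 * (IDCT_HALF) with a transpose in between, entirely in saturated
 * 16-bit fixed-point arithmetic.  idct_put_altivec() writes the result
 * out as pixels; idct_add_altivec() adds it to an existing prediction.
 */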
#include <stdlib.h>
#include <string.h>
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavcodec/dsputil.h"
#include "types_altivec.h"
#include "dsputil_altivec.h"
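/*
 * One 8-point 1-D IDCT pass over eight coefficient vectors, organized
 * as a four-stage butterfly network.  vec_mradds(a, b, c) computes
 * saturate(((a * b + 0x4000) >> 15) + c), so the multipliers c4, a0,
 * a1, a2 (and their negations mc4, ma2) are Q15 fixed-point values.
 */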
#define IDCT_HALF                                       \
    /* 1st stage */                                     \
    t1 = vec_mradds (a1, vx7, vx1);                     \
    t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7));    \
    t7 = vec_mradds (a2, vx5, vx3);                     \
    t3 = vec_mradds (ma2, vx3, vx5);                    \
                                                        \
    /* 2nd stage */                                     \
    t5 = vec_adds (vx0, vx4);                           \
    t0 = vec_subs (vx0, vx4);                           \
    t2 = vec_mradds (a0, vx6, vx2);                     \
    t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6));    \
    t6 = vec_adds (t8, t3);                             \
    t3 = vec_subs (t8, t3);                             \
    t8 = vec_subs (t1, t7);                             \
    t1 = vec_adds (t1, t7);                             \
                                                        \
    /* 3rd stage */                                     \
    t7 = vec_adds (t5, t2);                             \
    t2 = vec_subs (t5, t2);                             \
    t5 = vec_adds (t0, t4);                             \
    t0 = vec_subs (t0, t4);                             \
    t4 = vec_subs (t8, t3);                             \
    t3 = vec_adds (t8, t3);                             \
                                                        \
    /* 4th stage */                                     \
    vy0 = vec_adds (t7, t1);                            \
    vy7 = vec_subs (t7, t1);                            \
    vy1 = vec_mradds (c4, t3, t5);                      \
    vy6 = vec_mradds (mc4, t3, t5);                     \
    vy2 = vec_mradds (c4, t4, t0);                      \
    vy5 = vec_mradds (mc4, t4, t0);                     \
    vy3 = vec_adds (t2, t6);                            \
    vy4 = vec_subs (t2, t6);
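/*
 * Full 2-D IDCT of the eight vectors in block[0..7]: prescale, a 1-D
 * pass over the rows, a transpose, a 1-D pass over the columns, and a
 * final >> 6 descale.  The results are left in vx0..vx7.
 */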
#define IDCT                                                            \
    vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;                     \
    vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;                     \
    vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias;                       \
    vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8;                         \
    vec_u16 shift;                                                      \
                                                                        \
    /* load the Q15 multipliers and the rounding bias */                \
    c4 = vec_splat (constants[0], 0);                                   \
    a0 = vec_splat (constants[0], 1);                                   \
    a1 = vec_splat (constants[0], 2);                                   \
    a2 = vec_splat (constants[0], 3);                                   \
    mc4 = vec_splat (constants[0], 4);                                  \
    ma2 = vec_splat (constants[0], 5);                                  \
    bias = (vec_s16)vec_splat ((vec_s32)constants[0], 3);               \
                                                                        \
    zero = vec_splat_s16 (0);                                           \
    shift = vec_splat_u16 (4);                                          \
                                                                        \
    /* shift the input up by 4 bits and prescale each row */            \
    vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero);    \
    vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero);    \
    vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero);    \
    vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero);    \
    vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero);    \
    vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero);    \
    vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero);    \
    vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero);    \
                                                                        \
    /* first 1-D pass */                                                \
    IDCT_HALF                                                           \
                                                                        \
    /* transpose the 8x8 block in three merge stages */                 \
    vx0 = vec_mergeh (vy0, vy4);                                        \
    vx1 = vec_mergel (vy0, vy4);                                        \
    vx2 = vec_mergeh (vy1, vy5);                                        \
    vx3 = vec_mergel (vy1, vy5);                                        \
    vx4 = vec_mergeh (vy2, vy6);                                        \
    vx5 = vec_mergel (vy2, vy6);                                        \
    vx6 = vec_mergeh (vy3, vy7);                                        \
    vx7 = vec_mergel (vy3, vy7);                                        \
                                                                        \
    vy0 = vec_mergeh (vx0, vx4);                                        \
    vy1 = vec_mergel (vx0, vx4);                                        \
    vy2 = vec_mergeh (vx1, vx5);                                        \
    vy3 = vec_mergel (vx1, vx5);                                        \
    vy4 = vec_mergeh (vx2, vx6);                                        \
    vy5 = vec_mergel (vx2, vx6);                                        \
    vy6 = vec_mergeh (vx3, vx7);                                        \
    vy7 = vec_mergel (vx3, vx7);                                        \
                                                                        \
    /* last merge stage; fold the rounding bias into the first row */   \
    vx0 = vec_adds (vec_mergeh (vy0, vy4), bias);                       \
    vx1 = vec_mergel (vy0, vy4);                                        \
    vx2 = vec_mergeh (vy1, vy5);                                        \
    vx3 = vec_mergel (vy1, vy5);                                        \
    vx4 = vec_mergeh (vy2, vy6);                                        \
    vx5 = vec_mergel (vy2, vy6);                                        \
    vx6 = vec_mergeh (vy3, vy7);                                        \
    vx7 = vec_mergel (vy3, vy7);                                        \
                                                                        \
    /* second 1-D pass */                                               \
    IDCT_HALF                                                           \
                                                                        \
    /* descale the final result */                                      \
    shift = vec_splat_u16 (6);                                          \
    vx0 = vec_sra (vy0, shift);                                         \
    vx1 = vec_sra (vy1, shift);                                         \
    vx2 = vec_sra (vy2, shift);                                         \
    vx3 = vec_sra (vy3, shift);                                         \
    vx4 = vec_sra (vy4, shift);                                         \
    vx5 = vec_sra (vy5, shift);                                         \
    vx6 = vec_sra (vy6, shift);                                         \
    vx7 = vec_sra (vy7, shift);
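/*
 * constants[0] packs the butterfly multipliers; in Q15 they correspond
 * to cos(pi/4) = 23170, tan(pi/8) = 13573, tan(pi/16) = 6518 and
 * tan(3*pi/16) = 21895, followed by the negated values and the two
 * halfwords {32, 31} that, splatted as a 32-bit element, form the
 * rounding bias.  constants[1..4] hold the per-row prescale factors
 * applied to the input coefficients.
 */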
static const vec_s16 constants[5] = {
    {23170, 13573,  6518, 21895, -23170, -21895,    32,    31},
    {16384, 22725, 21407, 19266,  16384,  19266, 21407, 22725},
    {22725, 31521, 29692, 26722,  22725,  26722, 29692, 31521},
    {21407, 29692, 27969, 25172,  21407,  25172, 27969, 29692},
    {19266, 26722, 25172, 22654,  19266,  22654, 25172, 26722}
};
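/*
 * Compute the IDCT of blk and store the result as 8-bit pixels at
 * dest; vec_packsu clamps each 16-bit value to the range [0, 255].
 */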
void idct_put_altivec(uint8_t* dest, int stride, int16_t *blk)
{
    vec_s16 *block = (vec_s16*)blk;
    vec_u8 tmp;

    IDCT

/* pack one row of results to unsigned bytes with saturation and
 * store the 8 bytes as two 32-bit elements */
#define COPY(dest,src)                                  \
    tmp = vec_packsu (src, src);                        \
    vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest);    \
    vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);

    COPY (dest, vx0)    dest += stride;
    COPY (dest, vx1)    dest += stride;
    COPY (dest, vx2)    dest += stride;
    COPY (dest, vx3)    dest += stride;
    COPY (dest, vx4)    dest += stride;
    COPY (dest, vx5)    dest += stride;
    COPY (dest, vx6)    dest += stride;
    COPY (dest, vx7)
}
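/*
 * Compute the IDCT of blk and add the result to the pixels at dest.
 * The permute vectors built below zero-extend the (possibly unaligned)
 * destination bytes to 16-bit values: merging the 0xFF bytes of p with
 * a vec_lvsl() shift pattern makes every other permute index select a
 * zero byte from the second vec_perm operand.
 */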
void idct_add_altivec(uint8_t* dest, int stride, int16_t *blk)
{
    vec_s16 *block = (vec_s16*)blk;
    vec_u8 tmp;
    vec_s16 tmp2, tmp3;
    vec_u8 perm0;
    vec_u8 perm1;
    vec_u8 p0, p1, p;

    IDCT

    p0 = vec_lvsl (0, dest);
    p1 = vec_lvsl (stride, dest);
    p = vec_splat_u8 (-1);
    perm0 = vec_mergeh (p, p0);
    perm1 = vec_mergeh (p, p1);

/* load one destination row, widen it to 16 bits, add the IDCT row,
 * then pack with saturation and store the 8 bytes back */
#define ADD(dest,src,perm)                              \
    tmp = vec_ld (0, dest);                             \
    tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm); \
    tmp3 = vec_adds (tmp2, src);                        \
    tmp = vec_packsu (tmp3, tmp3);                      \
    vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest);    \
    vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);

    ADD (dest, vx0, perm0)    dest += stride;
    ADD (dest, vx1, perm1)    dest += stride;
    ADD (dest, vx2, perm0)    dest += stride;
    ADD (dest, vx3, perm1)    dest += stride;
    ADD (dest, vx4, perm0)    dest += stride;
    ADD (dest, vx5, perm1)    dest += stride;
    ADD (dest, vx6, perm0)    dest += stride;
    ADD (dest, vx7, perm1)
}
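/*
 * Usage sketch (illustrative only; the surrounding buffer and frame
 * names are hypothetical).  blk must point to 64 dequantized
 * coefficients in a 16-byte-aligned buffer:
 *
 *     DECLARE_ALIGNED(16, int16_t, coeffs)[64];
 *     uint8_t *dst = frame->data[0] + y * stride + x;
 *
 *     idct_put_altivec(dst, stride, coeffs);  // intra block: overwrite
 *     idct_add_altivec(dst, stride, coeffs);  // inter block: add residual
 */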