00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00026 #include "libavcodec/dsputil.h"
00027 #include "dsputil_mmx.h"
00028
00029 DECLARE_ALIGNED_16(const uint16_t, ff_vp3_idct_data[7 * 8]) =
00030 {
00031 64277,64277,64277,64277,64277,64277,64277,64277,
00032 60547,60547,60547,60547,60547,60547,60547,60547,
00033 54491,54491,54491,54491,54491,54491,54491,54491,
00034 46341,46341,46341,46341,46341,46341,46341,46341,
00035 36410,36410,36410,36410,36410,36410,36410,36410,
00036 25080,25080,25080,25080,25080,25080,25080,25080,
00037 12785,12785,12785,12785,12785,12785,12785,12785
00038 };
00039
00040
/*
 * VP3_1D_IDCT_SSE2(ADD, SHIFT): one 1-D inverse-DCT pass over eight vectors
 * of eight 16-bit values, emitted as a run of adjacent string literals meant
 * to be pasted into an asm statement.  Every instruction is a packed 16-bit
 * operation, so the eight lanes are transformed independently.
 *
 * Requirements at the expansion site:
 *   - I(x) must expand to a string literal naming the memory operand of
 *     input vector x (x = 0..7); C(x) likewise for constant vector x
 *     (x = 1..7).
 *   - ADD(reg) and SHIFT(reg) are macros taking a register name and
 *     expanding to instruction strings (or to nothing).  ADD is applied to
 *     xmm2, xmm4, xmm6 and xmm7 just before each is combined with its
 *     partner register, so the partner (xmm1, xmm3, xmm5, xmm0) receives the
 *     same addend through the following paddsw.  SHIFT is applied once to
 *     each of xmm0..xmm7 at the end.
 *
 * Memory and register effects:
 *   - Reads I(0)..I(7) and C(1)..C(7) with movdqa.
 *   - Overwrites I(1) and I(2) with intermediate values that are reloaded
 *     further down; no final result is stored by this macro.
 *   - Overwrites all of xmm0..xmm7; the results are left in those registers.
 *     ff_vp3_idct_sse2() stores them as rows 0..7 in register order after
 *     its second pass.
 *
 * Multiplication idiom: pmulhw keeps the high 16 bits of a signed 16x16
 * product.  Each pmulhw by C(1)..C(5) is followed by a paddw of the
 * unmultiplied value, while products with C(6) and C(7) are used as they
 * are.  This matches the constant table, where the first five entries exceed
 * 32767 and therefore act as (c - 65536) in a signed multiply.
 *
 * The instruction order is hand-scheduled; do not reorder lines.
 */
00041 #define VP3_1D_IDCT_SSE2(ADD, SHIFT) \
00042 "movdqa "I(3)", %%xmm2 \n\t" \
00043 "movdqa "C(3)", %%xmm6 \n\t" \
00044 "movdqa %%xmm2, %%xmm4 \n\t" \
00045 "movdqa "I(5)", %%xmm7 \n\t" \
00046 "pmulhw %%xmm6, %%xmm4 \n\t" \
00047 "movdqa "C(5)", %%xmm1 \n\t" \
00048 "pmulhw %%xmm7, %%xmm6 \n\t" \
00049 "movdqa %%xmm1, %%xmm5 \n\t" \
00050 "pmulhw %%xmm2, %%xmm1 \n\t" \
00051 "movdqa "I(1)", %%xmm3 \n\t" \
00052 "pmulhw %%xmm7, %%xmm5 \n\t" \
00053 "movdqa "C(1)", %%xmm0 \n\t" \
00054 "paddw %%xmm2, %%xmm4 \n\t" \
00055 "paddw %%xmm7, %%xmm6 \n\t" \
00056 "paddw %%xmm1, %%xmm2 \n\t" \
00057 "movdqa "I(7)", %%xmm1 \n\t" \
00058 "paddw %%xmm5, %%xmm7 \n\t" \
00059 "movdqa %%xmm0, %%xmm5 \n\t" \
00060 "pmulhw %%xmm3, %%xmm0 \n\t" \
00061 "paddsw %%xmm7, %%xmm4 \n\t" \
00062 "pmulhw %%xmm1, %%xmm5 \n\t" \
00063 "movdqa "C(7)", %%xmm7 \n\t" \
00064 "psubsw %%xmm2, %%xmm6 \n\t" \
00065 "paddw %%xmm3, %%xmm0 \n\t" \
00066 "pmulhw %%xmm7, %%xmm3 \n\t" \
00067 "movdqa "I(2)", %%xmm2 \n\t" \
00068 "pmulhw %%xmm1, %%xmm7 \n\t" \
00069 "paddw %%xmm1, %%xmm5 \n\t" \
00070 "movdqa %%xmm2, %%xmm1 \n\t" \
00071 "pmulhw "C(2)", %%xmm2 \n\t" \
00072 "psubsw %%xmm5, %%xmm3 \n\t" \
00073 "movdqa "I(6)", %%xmm5 \n\t" \
00074 "paddsw %%xmm7, %%xmm0 \n\t" \
00075 "movdqa %%xmm5, %%xmm7 \n\t" \
00076 "psubsw %%xmm4, %%xmm0 \n\t" \
00077 "pmulhw "C(2)", %%xmm5 \n\t" \
00078 "paddw %%xmm1, %%xmm2 \n\t" \
00079 "pmulhw "C(6)", %%xmm1 \n\t" \
00080 "paddsw %%xmm4, %%xmm4 \n\t" \
00081 "paddsw %%xmm0, %%xmm4 \n\t" \
00082 "psubsw %%xmm6, %%xmm3 \n\t" \
00083 "paddw %%xmm7, %%xmm5 \n\t" \
00084 "paddsw %%xmm6, %%xmm6 \n\t" \
00085 "pmulhw "C(6)", %%xmm7 \n\t" \
00086 "paddsw %%xmm3, %%xmm6 \n\t" \
00087 "movdqa %%xmm4, "I(1)" \n\t" \
00088 "psubsw %%xmm5, %%xmm1 \n\t" \
00089 "movdqa "C(4)", %%xmm4 \n\t" \
00090 "movdqa %%xmm3, %%xmm5 \n\t" \
00091 "pmulhw %%xmm4, %%xmm3 \n\t" \
00092 "paddsw %%xmm2, %%xmm7 \n\t" \
00093 "movdqa %%xmm6, "I(2)" \n\t" \
00094 "movdqa %%xmm0, %%xmm2 \n\t" \
00095 "movdqa "I(0)", %%xmm6 \n\t" \
00096 "pmulhw %%xmm4, %%xmm0 \n\t" \
00097 "paddw %%xmm3, %%xmm5 \n\t" \
00098 "movdqa "I(4)", %%xmm3 \n\t" \
00099 "psubsw %%xmm1, %%xmm5 \n\t" \
00100 "paddw %%xmm0, %%xmm2 \n\t" \
00101 "psubsw %%xmm3, %%xmm6 \n\t" \
00102 "movdqa %%xmm6, %%xmm0 \n\t" \
00103 "pmulhw %%xmm4, %%xmm6 \n\t" \
00104 "paddsw %%xmm3, %%xmm3 \n\t" \
00105 "paddsw %%xmm1, %%xmm1 \n\t" \
00106 "paddsw %%xmm0, %%xmm3 \n\t" \
00107 "paddsw %%xmm5, %%xmm1 \n\t" \
00108 "pmulhw %%xmm3, %%xmm4 \n\t" \
00109 "paddw %%xmm0, %%xmm6 \n\t" \
00110 "psubsw %%xmm2, %%xmm6 \n\t" \
00111 "paddsw %%xmm2, %%xmm2 \n\t" \
00112 "movdqa "I(1)", %%xmm0 \n\t" \
00113 "paddsw %%xmm6, %%xmm2 \n\t" \
00114 "paddw %%xmm3, %%xmm4 \n\t" \
00115 "psubsw %%xmm1, %%xmm2 \n\t" \
00116 ADD(%%xmm2) \
00117 "paddsw %%xmm1, %%xmm1 \n\t" \
00118 "paddsw %%xmm2, %%xmm1 \n\t" \
00119 SHIFT(%%xmm2) \
00120 "psubsw %%xmm7, %%xmm4 \n\t" \
00121 SHIFT(%%xmm1) \
00122 "movdqa "I(2)", %%xmm3 \n\t" \
00123 "paddsw %%xmm7, %%xmm7 \n\t" \
00124 "paddsw %%xmm4, %%xmm7 \n\t" \
00125 "psubsw %%xmm3, %%xmm4 \n\t" \
00126 ADD(%%xmm4) \
00127 "paddsw %%xmm3, %%xmm3 \n\t" \
00128 "paddsw %%xmm4, %%xmm3 \n\t" \
00129 SHIFT(%%xmm4) \
00130 "psubsw %%xmm5, %%xmm6 \n\t" \
00131 SHIFT(%%xmm3) \
00132 ADD(%%xmm6) \
00133 "paddsw %%xmm5, %%xmm5 \n\t" \
00134 "paddsw %%xmm6, %%xmm5 \n\t" \
00135 SHIFT(%%xmm6) \
00136 SHIFT(%%xmm5) \
00137 "psubsw %%xmm0, %%xmm7 \n\t" \
00138 ADD(%%xmm7) \
00139 "paddsw %%xmm0, %%xmm0 \n\t" \
00140 "paddsw %%xmm7, %%xmm0 \n\t" \
00141 SHIFT(%%xmm7) \
00142 SHIFT(%%xmm0)
00143
/* One aligned 128-bit store of register `reg` to output row `row`.
 * O(row) must expand to a string literal at the point of use. */
#define PUT_ROW(reg, row) "movdqa " #reg ", " O(row) "\n\t"

/*
 * Store eight registers to output rows 0..7: the n-th argument goes to
 * O(n).  Expands to adjacent string literals for use inside an asm
 * statement; the caller chooses the register order, so a permuted argument
 * list writes the rows in permuted register order.
 */
#define PUT_BLOCK(r0, r1, r2, r3, r4, r5, r6, r7) \
    PUT_ROW(r0, 0)                                \
    PUT_ROW(r1, 1)                                \
    PUT_ROW(r2, 2)                                \
    PUT_ROW(r3, 3)                                \
    PUT_ROW(r4, 4)                                \
    PUT_ROW(r5, 5)                                \
    PUT_ROW(r6, 6)                                \
    PUT_ROW(r7, 7)
00153
/* Per-register hooks passed as the ADD / SHIFT arguments of
 * VP3_1D_IDCT_SSE2().  Each takes a register name and expands to an
 * instruction string (or to nothing). */

/* Expands to nothing: the hook point emits no instruction. */
#define NOP(reg)
/* Arithmetic right shift of every 16-bit lane of `reg` by 4 bits. */
#define SHIFT4(reg) "psraw $4, "#reg"\n\t"
/* Saturating 16-bit add of asm operand %2 to `reg`; the enclosing asm
 * statement must therefore provide a third operand. */
#define ADD8(reg) "paddsw %2, "#reg"\n\t"
00157
00158 void ff_vp3_idct_sse2(int16_t *input_data)
00159 {
00160 #define I(x) AV_STRINGIFY(16*x)"(%0)"
00161 #define O(x) I(x)
00162 #define C(x) AV_STRINGIFY(16*(x-1))"(%1)"
00163
00164 __asm__ volatile (
00165 VP3_1D_IDCT_SSE2(NOP, NOP)
00166
00167 TRANSPOSE8(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7, (%0))
00168 PUT_BLOCK(%%xmm0, %%xmm5, %%xmm7, %%xmm3, %%xmm6, %%xmm4, %%xmm2, %%xmm1)
00169
00170 VP3_1D_IDCT_SSE2(ADD8, SHIFT4)
00171 PUT_BLOCK(%%xmm0, %%xmm1, %%xmm2, %%xmm3, %%xmm4, %%xmm5, %%xmm6, %%xmm7)
00172 :: "r"(input_data), "r"(ff_vp3_idct_data), "m"(ff_pw_8)
00173 );
00174 }
00175
00176 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block)
00177 {
00178 ff_vp3_idct_sse2(block);
00179 put_signed_pixels_clamped_mmx(block, dest, line_size);
00180 }
00181
00182 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block)
00183 {
00184 ff_vp3_idct_sse2(block);
00185 add_pixels_clamped_mmx(block, dest, line_size);
00186 }