00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041 #include "libavcodec/dsputil.h"
00042 #include "libavutil/mem.h"
00043 #include "libavutil/x86/asm.h"
00044 #include "idct_xvid.h"
00045 #include "dsputil_mmx.h"
00046
00047 #if HAVE_INLINE_ASM
00048
/* X8(x) expands to eight comma-separated copies of x; it is used below to
 * fill the eight-element constant vectors. */
00054 #define X8(x) x,x,x,x,x,x,x,x
00055
/* Shift counts of the two passes.
 * NOTE(review): neither macro is referenced in the visible code; iMTX_MULT
 * hard-codes "$11" and the iLLM passes hard-code "$6". Keep the literals and
 * these definitions in sync if either is ever changed. */
00056 #define ROW_SHIFT 11
00057 #define COL_SHIFT 6
00058
/*
 * Packed constants referenced from the inline assembly through MANGLE().
 *   tan1, tan3 - loaded into TAN1/TAN3 by iLLM_HEAD.
 *   tan2, sqrt2 - loaded with movdqa inside iLLM_PASS / iLLM_PASS_SPARSE.
 *   m127       - eight bytes of 127, loaded into %mm0 with movq and used by
 *                the TEST_* macros to detect non-zero rows.
 * DECLARE_ASM_CONST is defined in an included header; its first argument is
 * presumably the alignment in bytes (movdqa needs 16-byte aligned memory).
 * The values are presumably tan(k*pi/16) scaled by 2^16 and sqrt(2)/2 scaled
 * by 2^15 -- TODO confirm against the reference implementation.
 */
00059 DECLARE_ASM_CONST(16, int16_t, tan1)[] = {X8(13036)};
00060 DECLARE_ASM_CONST(16, int16_t, tan2)[] = {X8(27146)};
00061 DECLARE_ASM_CONST(16, int16_t, tan3)[] = {X8(43790)}; /* exceeds INT16_MAX; the passes add the multiplicand back after pmulhw */
00062 DECLARE_ASM_CONST(16, int16_t, sqrt2)[]= {X8(23170)}; /* the passes double the pmulhw result (paddsw x, x) */
00063 DECLARE_ASM_CONST(8, uint8_t, m127)[] = {X8(127)};
00064
/*
 * Row-transform coefficient tables. Each table holds 32 int16_t values
 * (64 bytes) and is read by iMTX_MULT as four 16-byte pmaddwd operands at
 * byte offsets 0, 16, 32 and 48.
 *
 * Table-to-row assignment as used in ff_idct_xvid_sse2():
 *   iTab1 - rows 0 and 4
 *   iTab2 - rows 1 and 7
 *   iTab3 - rows 2 and 6
 *   iTab4 - rows 3 and 5
 *
 * Entries written as 0x8000 and above do not fit in a positive int16_t; they
 * are presumably meant as negative two's-complement coefficients.
 * The element order is consumed directly by the assembly; do not reorder.
 */
00065 DECLARE_ASM_CONST(16, int16_t, iTab1)[] = {
00066 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
00067 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
00068 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
00069 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
00070 };
00071
00072 DECLARE_ASM_CONST(16, int16_t, iTab2)[] = {
00073 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
00074 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
00075 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
00076 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
00077 };
00078
00079 DECLARE_ASM_CONST(16, int16_t, iTab3)[] = {
00080 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
00081 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
00082 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
00083 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
00084 };
00085
00086 DECLARE_ASM_CONST(16, int16_t, iTab4)[] = {
00087 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
00088 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
00089 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
00090 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
00091 };
00092
/*
 * Per-row rounding constants for iMTX_MULT, one 16-byte group (four equal
 * int32_t values) per entry. They are added with paddd via ROUND(), which
 * addresses group k as walkenIdctRounders+k*16.
 *
 * Group usage in ff_idct_xvid_sse2():
 *   group 0 - row 0      group 3 - row 3
 *   group 1 - row 1      group 4 - row 5
 *   group 2 - row 2      group 5 - rows 6 and 7
 * Row 4 is transformed without a rounder (it is passed "#" instead).
 */
00093 DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
00094 65536, 65536, 65536, 65536,
00095 3597, 3597, 3597, 3597,
00096 2260, 2260, 2260, 2260,
00097 1203, 1203, 1203, 1203,
00098 120, 120, 120, 120,
00099 512, 512, 512, 512
00100 };
00101
00102
/* Registers that hold the transformed odd rows between the row pass and the
 * iLLM pass. The iLLM macros refer to these registers by their literal names
 * (%%xmm4..%%xmm7), so the mapping must not be changed in isolation. */
00103 #define ROW1 "%%xmm6"
00104 #define ROW3 "%%xmm4"
00105 #define ROW5 "%%xmm5"
00106 #define ROW7 "%%xmm7"
00107
/* CLEAR_ODD(r): zero register r (pxor r, r); used for rows whose transform
 * may be skipped. */
00108 #define CLEAR_ODD(r) "pxor "r","r" \n\t"
/* PUT_ODD(dst): copy the iMTX_MULT result from %%xmm2 to dst while reversing
 * the order of its four high words (pshufhw with immediate 0x1B). */
00109 #define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"
00110
/*
 * Architecture-dependent placement of the even rows and of scratch registers.
 *
 * x86-64: sixteen XMM registers are available, so the even rows stay in
 * %%xmm8..%%xmm11, and the scratch register (XMMS) and the tangent constants
 * (TAN3/TAN1) get registers of their own. MOV_32_ONLY expands to "#", which
 * turns the rest of that assembler line into a comment, so the load/spill
 * lines built with it emit no instruction.
 *
 * x86-32: only %%xmm0..%%xmm7 exist, so the even rows live in the block
 * itself (ROWn are memory operands relative to %0) and are loaded into REGn
 * on demand by the "movdqa " that MOV_32_ONLY expands to. Several names share
 * a register here (REG0/REG2 -> %%xmm4, REG4/REG6 -> %%xmm6, XMMS/TAN1 ->
 * %%xmm2), which is why the iLLM passes spill and reload TAN1 through (%0).
 */
00111 #if ARCH_X86_64
00112
00113 # define ROW0 "%%xmm8"
00114 # define REG0 ROW0
00115 # define ROW2 "%%xmm9"
00116 # define REG2 ROW2
00117 # define ROW4 "%%xmm10"
00118 # define REG4 ROW4
00119 # define ROW6 "%%xmm11"
00120 # define REG6 ROW6
00121 # define CLEAR_EVEN(r) CLEAR_ODD(r)
00122 # define PUT_EVEN(dst) PUT_ODD(dst)
00123 # define XMMS "%%xmm12"
00124 # define MOV_32_ONLY "#"
00125 # define SREG2 REG2
00126 # define TAN3 "%%xmm13"
00127 # define TAN1 "%%xmm14"
00128
00129 #else
00130
00131 # define ROW0 "(%0)"
00132 # define REG0 "%%xmm4"
00133 # define ROW2 "2*16(%0)"
00134 # define REG2 "%%xmm4"
00135 # define ROW4 "4*16(%0)"
00136 # define REG4 "%%xmm6"
00137 # define ROW6 "6*16(%0)"
00138 # define REG6 "%%xmm6"
/* Nothing to clear: a skipped even row keeps the contents it has in memory. */
00139 # define CLEAR_EVEN(r)
/* Reverse the four high words of %%xmm2 in place, then store it to dst. */
00140 # define PUT_EVEN(dst) \
00141 "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
00142 "movdqa %%xmm2, "dst" \n\t"
00143 # define XMMS "%%xmm2"
00144 # define MOV_32_ONLY "movdqa "
/* Row 2 register for iLLM_PASS_SPARSE; distinct from REG0 because both rows
 * are live at the same time in that pass. */
00145 # define SREG2 "%%xmm7"
00146 # define TAN3 "%%xmm0"
00147 # define TAN1 "%%xmm2"
00148
00149 #endif
00150
/* ROUND(x): beginning of a "paddd <x>" instruction with x as memory operand;
 * iMTX_MULT appends the destination register. MANGLE comes from an included
 * header and presumably produces the assembler-level symbol reference. */
00151 #define ROUND(x) "paddd "MANGLE(x)
00152
/* JZ(reg, to): jump to label `to` when the 32-bit register `reg` is zero. */
00153 #define JZ(reg, to) \
00154 "testl "reg","reg" \n\t" \
00155 "jz "to" \n\t"
00156
/* JNZ(reg, to): jump to label `to` when the 32-bit register `reg` is non-zero. */
00157 #define JNZ(reg, to) \
00158 "testl "reg","reg" \n\t" \
00159 "jnz "to" \n\t"
00160
/*
 * TEST_ONE_ROW(src, reg, clear): set `reg` to a non-zero value iff the
 * 16-byte row at `src` contains any non-zero byte.
 *
 * Both 8-byte halves of the row are OR-ed together in %%mm1, 127 is added to
 * every byte with unsigned saturation (%%mm0 must hold m127), so exactly the
 * non-zero bytes end up with their top bit set, and pmovmskb collects those
 * top bits into `reg`. `clear` is an instruction string emitted first
 * (CLEAR_ODD/CLEAR_EVEN of the row's destination). Clobbers %%mm1.
 */
00161 #define TEST_ONE_ROW(src, reg, clear) \
00162 clear \
00163 "movq "src", %%mm1 \n\t" \
00164 "por 8+"src", %%mm1 \n\t" \
00165 "paddusb %%mm0, %%mm1 \n\t" \
00166 "pmovmskb %%mm1, "reg" \n\t"
00167
/*
 * TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2): the same test as
 * TEST_ONE_ROW applied to two rows with the instructions interleaved;
 * reg1 reports on row1 and reg2 on row2. Clobbers %%mm1 and %%mm2.
 */
00168 #define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
00169 clear1 \
00170 clear2 \
00171 "movq "row1", %%mm1 \n\t" \
00172 "por 8+"row1", %%mm1 \n\t" \
00173 "movq "row2", %%mm2 \n\t" \
00174 "por 8+"row2", %%mm2 \n\t" \
00175 "paddusb %%mm0, %%mm1 \n\t" \
00176 "paddusb %%mm0, %%mm2 \n\t" \
00177 "pmovmskb %%mm1, "reg1" \n\t" \
00178 "pmovmskb %%mm2, "reg2" \n\t"
00179
/*
 * iMTX_MULT(src, table, rounder, put): transform one row of eight int16_t
 * coefficients.
 *
 *   src     - memory operand of the 16-byte input row (movdqa: must be
 *             16-byte aligned).
 *   table   - coefficient table; read at byte offsets 0, 16, 32 and 48.
 *   rounder - instruction prefix to which ", %%xmm0" is appended, normally
 *             ROUND(...). Passing "#" turns the line into an assembler
 *             comment, i.e. no rounding constant is added.
 *   put     - instruction string that moves the result out of %%xmm2
 *             (PUT_ODD/PUT_EVEN).
 *
 * The two 32-bit partial sums are combined as sum and difference, shifted
 * right arithmetically by 11 and packed with signed saturation into %%xmm2.
 * Clobbers %%xmm0..%%xmm3. The macro ends with the local label "1:", which
 * serves as the target of a preceding JZ(..., "1f") that skips the row.
 */
00181 #define iMTX_MULT(src, table, rounder, put) \
00182 "movdqa "src", %%xmm3 \n\t" /* load the input row */ \
00183 "movdqa %%xmm3, %%xmm0 \n\t" \
00184 "pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* dwords 1,0,1,0 of the row */ \
00185 "punpcklqdq %%xmm0, %%xmm0 \n\t" /* low quadword duplicated */ \
00186 "pmaddwd "table", %%xmm0 \n\t" \
00187 "pmaddwd 16+"table", %%xmm1 \n\t" \
00188 "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* dwords 3,2,3,2 of the row */ \
00189 "punpckhqdq %%xmm3, %%xmm3 \n\t" /* high quadword duplicated */ \
00190 "pmaddwd 32+"table", %%xmm2 \n\t" \
00191 "pmaddwd 48+"table", %%xmm3 \n\t" \
00192 "paddd %%xmm1, %%xmm0 \n\t" \
00193 "paddd %%xmm3, %%xmm2 \n\t" \
00194 rounder", %%xmm0 \n\t" /* add the rounding constant (or nothing for "#") */ \
00195 "movdqa %%xmm2, %%xmm3 \n\t" \
00196 "paddd %%xmm0, %%xmm2 \n\t" /* xmm2 = sum */ \
00197 "psubd %%xmm3, %%xmm0 \n\t" /* xmm0 = difference */ \
00198 "psrad $11, %%xmm2 \n\t" \
00199 "psrad $11, %%xmm0 \n\t" \
00200 "packssdw %%xmm0, %%xmm2 \n\t" /* low half from xmm2, high half from xmm0 */ \
00201 put \
00202 "1: \n\t"
00203
/*
 * iLLM_HEAD: load the tan3 and tan1 constant vectors into the TAN3 and TAN1
 * registers, as expected on entry by iLLM_PASS and iLLM_PASS_SPARSE.
 *
 * The last line deliberately carries no line continuation: a trailing
 * backslash there would splice whatever follows the macro into its
 * definition as soon as the separating blank line disappeared.
 */
00204 #define iLLM_HEAD \
00205 "movdqa "MANGLE(tan3)", "TAN3" \n\t" \
00206 "movdqa "MANGLE(tan1)", "TAN1" \n\t"
00207
/*
 * iLLM_PASS(dct): second pass over all eight rows. Every instruction works
 * on whole row vectors, so the eight rows are combined lane by lane.
 *
 * On entry:
 *   %%xmm6, %%xmm4, %%xmm5, %%xmm7 - rows 1, 3, 5, 7 (ROW1/ROW3/ROW5/ROW7)
 *   ROW0/ROW2/ROW4/ROW6            - even rows (registers on x86-64, memory
 *                                    operands relative to %0 on x86-32)
 *   TAN3, TAN1                     - constants loaded by iLLM_HEAD
 * On exit: eight result rows, each shifted right arithmetically by 6, are
 * stored at k*16(dct) for k = 0..7.
 *
 * `dct` is the name of the base register operand (the caller passes "%0").
 * All arithmetic uses saturating 16-bit adds/subtracts and pmulhw.
 */
00209 #define iLLM_PASS(dct) \
00210 "movdqa "TAN3", %%xmm1 \n\t" /* second copy of tan3 */ \
00211 "movdqa "TAN1", %%xmm3 \n\t" /* second copy of tan1 */ \
00212 "pmulhw %%xmm4, "TAN3" \n\t" /* high word of row3 * tan3 */ \
00213 "pmulhw %%xmm5, %%xmm1 \n\t" /* high word of row5 * tan3 */ \
00214 "paddsw %%xmm4, "TAN3" \n\t" /* add row3 back: the tan3 constant exceeds INT16_MAX */ \
00215 "paddsw %%xmm5, %%xmm1 \n\t" /* add row5 back, same reason */ \
00216 "psubsw %%xmm5, "TAN3" \n\t" \
00217 "paddsw %%xmm4, %%xmm1 \n\t" \
00218 "pmulhw %%xmm7, %%xmm3 \n\t" /* high word of row7 * tan1 */ \
00219 "pmulhw %%xmm6, "TAN1" \n\t" /* high word of row1 * tan1 */ \
00220 "paddsw %%xmm6, %%xmm3 \n\t" \
00221 "psubsw %%xmm7, "TAN1" \n\t" \
00222 "movdqa %%xmm3, %%xmm7 \n\t" \
00223 "movdqa "TAN1", %%xmm6 \n\t" \
00224 "psubsw %%xmm1, %%xmm3 \n\t" \
00225 "psubsw "TAN3", "TAN1" \n\t" \
00226 "paddsw %%xmm7, %%xmm1 \n\t" \
00227 "paddsw %%xmm6, "TAN3" \n\t" \
00228 "movdqa %%xmm3, %%xmm6 \n\t" \
00229 "psubsw "TAN3", %%xmm3 \n\t" \
00230 "paddsw %%xmm6, "TAN3" \n\t" \
00231 "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" /* row 3 is no longer needed in xmm4 */ \
00232 "pmulhw %%xmm4, %%xmm3 \n\t" \
00233 "pmulhw %%xmm4, "TAN3" \n\t" \
00234 "paddsw "TAN3", "TAN3" \n\t" /* double the product */ \
00235 "paddsw %%xmm3, %%xmm3 \n\t" /* double the product */ \
00236 "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
00237 MOV_32_ONLY ROW2", "REG2" \n\t" /* x86-32 only: load row 2 */ \
00238 MOV_32_ONLY ROW6", "REG6" \n\t" /* x86-32 only: load row 6 */ \
00239 "movdqa %%xmm7, %%xmm5 \n\t" \
00240 "pmulhw "REG6", %%xmm7 \n\t" \
00241 "pmulhw "REG2", %%xmm5 \n\t" \
00242 "paddsw "REG2", %%xmm7 \n\t" \
00243 "psubsw "REG6", %%xmm5 \n\t" \
00244 MOV_32_ONLY ROW0", "REG0" \n\t" /* x86-32 only: load row 0 (reuses REG2's register) */ \
00245 MOV_32_ONLY ROW4", "REG4" \n\t" /* x86-32 only: load row 4 (reuses REG6's register) */ \
00246 MOV_32_ONLY" "TAN1", (%0) \n\t" /* x86-32 only: spill TAN1, XMMS shares its register */ \
00247 "movdqa "REG0", "XMMS" \n\t" \
00248 "psubsw "REG4", "REG0" \n\t" \
00249 "paddsw "XMMS", "REG4" \n\t" \
00250 "movdqa "REG4", "XMMS" \n\t" \
00251 "psubsw %%xmm7, "REG4" \n\t" \
00252 "paddsw "XMMS", %%xmm7 \n\t" \
00253 "movdqa "REG0", "XMMS" \n\t" \
00254 "psubsw %%xmm5, "REG0" \n\t" \
00255 "paddsw "XMMS", %%xmm5 \n\t" \
00256 "movdqa %%xmm5, "XMMS" \n\t" \
00257 "psubsw "TAN3", %%xmm5 \n\t" \
00258 "paddsw "XMMS", "TAN3" \n\t" \
00259 "movdqa "REG0", "XMMS" \n\t" \
00260 "psubsw %%xmm3, "REG0" \n\t" \
00261 "paddsw "XMMS", %%xmm3 \n\t" \
00262 MOV_32_ONLY" (%0), "TAN1" \n\t" /* x86-32 only: reload the spilled TAN1 */ \
00263 "psraw $6, %%xmm5 \n\t" \
00264 "psraw $6, "REG0" \n\t" \
00265 "psraw $6, "TAN3" \n\t" \
00266 "psraw $6, %%xmm3 \n\t" \
00267 "movdqa "TAN3", 1*16("dct") \n\t" /* store output row 1 */ \
00268 "movdqa %%xmm3, 2*16("dct") \n\t" /* store output row 2 */ \
00269 "movdqa "REG0", 5*16("dct") \n\t" /* store output row 5 */ \
00270 "movdqa %%xmm5, 6*16("dct") \n\t" /* store output row 6 */ \
00271 "movdqa %%xmm7, %%xmm0 \n\t" \
00272 "movdqa "REG4", %%xmm4 \n\t" \
00273 "psubsw %%xmm1, %%xmm7 \n\t" \
00274 "psubsw "TAN1", "REG4" \n\t" \
00275 "paddsw %%xmm0, %%xmm1 \n\t" \
00276 "paddsw %%xmm4, "TAN1" \n\t" \
00277 "psraw $6, %%xmm1 \n\t" \
00278 "psraw $6, %%xmm7 \n\t" \
00279 "psraw $6, "TAN1" \n\t" \
00280 "psraw $6, "REG4" \n\t" \
00281 "movdqa %%xmm1, ("dct") \n\t" /* store output row 0 */ \
00282 "movdqa "TAN1", 3*16("dct") \n\t" /* store output row 3 */ \
00283 "movdqa "REG4", 4*16("dct") \n\t" /* store output row 4 */ \
00284 "movdqa %%xmm7, 7*16("dct") \n\t" /* store output row 7 */
00285
/*
 * iLLM_PASS_SPARSE(dct): reduced variant of iLLM_PASS that reads only rows
 * 0..3. ff_idct_xvid_sse2() takes this path when the tests of rows 4..7 all
 * report zero.
 *
 * On entry:
 *   %%xmm6, %%xmm4 - rows 1 and 3 (ROW1/ROW3)
 *   ROW0, ROW2     - rows 0 and 2 (registers on x86-64, memory on x86-32)
 *   TAN3, TAN1     - constants loaded by iLLM_HEAD
 * On exit: eight result rows, each shifted right arithmetically by 6, are
 * stored at k*16(dct) for k = 0..7.
 *
 * Row 2 is held in SREG2 rather than REG2 because it stays live until the
 * final stores, while REG0 is in use at the same time.
 */
00287 #define iLLM_PASS_SPARSE(dct) \
00288 "pmulhw %%xmm4, "TAN3" \n\t" /* high word of row3 * tan3 */ \
00289 "paddsw %%xmm4, "TAN3" \n\t" /* add row3 back: the tan3 constant exceeds INT16_MAX */ \
00290 "movdqa %%xmm6, %%xmm3 \n\t" \
00291 "pmulhw %%xmm6, "TAN1" \n\t" /* high word of row1 * tan1 */ \
00292 "movdqa %%xmm4, %%xmm1 \n\t" \
00293 "psubsw %%xmm1, %%xmm3 \n\t" \
00294 "paddsw %%xmm6, %%xmm1 \n\t" \
00295 "movdqa "TAN1", %%xmm6 \n\t" \
00296 "psubsw "TAN3", "TAN1" \n\t" \
00297 "paddsw %%xmm6, "TAN3" \n\t" \
00298 "movdqa %%xmm3, %%xmm6 \n\t" \
00299 "psubsw "TAN3", %%xmm3 \n\t" \
00300 "paddsw %%xmm6, "TAN3" \n\t" \
00301 "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" /* row 3 is no longer needed in xmm4 */ \
00302 "pmulhw %%xmm4, %%xmm3 \n\t" \
00303 "pmulhw %%xmm4, "TAN3" \n\t" \
00304 "paddsw "TAN3", "TAN3" \n\t" /* double the product */ \
00305 "paddsw %%xmm3, %%xmm3 \n\t" /* double the product */ \
00306 "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
00307 MOV_32_ONLY ROW2", "SREG2" \n\t" /* x86-32 only: load row 2 */ \
00308 "pmulhw "SREG2", %%xmm5 \n\t" \
00309 MOV_32_ONLY ROW0", "REG0" \n\t" /* x86-32 only: load row 0 */ \
00310 "movdqa "REG0", %%xmm6 \n\t" \
00311 "psubsw "SREG2", %%xmm6 \n\t" \
00312 "paddsw "REG0", "SREG2" \n\t" \
00313 MOV_32_ONLY" "TAN1", (%0) \n\t" /* x86-32 only: spill TAN1, XMMS shares its register */ \
00314 "movdqa "REG0", "XMMS" \n\t" \
00315 "psubsw %%xmm5, "REG0" \n\t" \
00316 "paddsw "XMMS", %%xmm5 \n\t" \
00317 "movdqa %%xmm5, "XMMS" \n\t" \
00318 "psubsw "TAN3", %%xmm5 \n\t" \
00319 "paddsw "XMMS", "TAN3" \n\t" \
00320 "movdqa "REG0", "XMMS" \n\t" \
00321 "psubsw %%xmm3, "REG0" \n\t" \
00322 "paddsw "XMMS", %%xmm3 \n\t" \
00323 MOV_32_ONLY" (%0), "TAN1" \n\t" /* x86-32 only: reload the spilled TAN1 */ \
00324 "psraw $6, %%xmm5 \n\t" \
00325 "psraw $6, "REG0" \n\t" \
00326 "psraw $6, "TAN3" \n\t" \
00327 "psraw $6, %%xmm3 \n\t" \
00328 "movdqa "TAN3", 1*16("dct") \n\t" /* store output row 1 */ \
00329 "movdqa %%xmm3, 2*16("dct") \n\t" /* store output row 2 */ \
00330 "movdqa "REG0", 5*16("dct") \n\t" /* store output row 5 */ \
00331 "movdqa %%xmm5, 6*16("dct") \n\t" /* store output row 6 */ \
00332 "movdqa "SREG2", %%xmm0 \n\t" \
00333 "movdqa %%xmm6, %%xmm4 \n\t" \
00334 "psubsw %%xmm1, "SREG2" \n\t" \
00335 "psubsw "TAN1", %%xmm6 \n\t" \
00336 "paddsw %%xmm0, %%xmm1 \n\t" \
00337 "paddsw %%xmm4, "TAN1" \n\t" \
00338 "psraw $6, %%xmm1 \n\t" \
00339 "psraw $6, "SREG2" \n\t" \
00340 "psraw $6, "TAN1" \n\t" \
00341 "psraw $6, %%xmm6 \n\t" \
00342 "movdqa %%xmm1, ("dct") \n\t" /* store output row 0 */ \
00343 "movdqa "TAN1", 3*16("dct") \n\t" /* store output row 3 */ \
00344 "movdqa %%xmm6, 4*16("dct") \n\t" /* store output row 4 */ \
00345 "movdqa "SREG2", 7*16("dct") \n\t" /* store output row 7 */
00346
/*
 * In-place 8x8 inverse DCT using SSE2 (plus MMX for the zero-row tests).
 *
 * @param block  eight rows of eight int16_t coefficients (row k at byte
 *               offset k*16); overwritten with the result. The rows are
 *               accessed with movdqa, so block must be 16-byte aligned.
 *
 * Structure:
 *   1. Rows 0..2 are always transformed with iMTX_MULT.
 *   2. Rows 3..7 are tested for being all zero; the transform of a zero row
 *      is skipped (odd rows are pre-cleared in their registers).
 *   3. If rows 4..7 are all zero, iLLM_PASS_SPARSE finishes the job;
 *      otherwise the remaining rows are transformed and iLLM_PASS runs.
 *
 * NOTE(review): %mm0..%mm2 are written here but are not listed as clobbers,
 * and no emms is executed in this function; presumably callers restore the
 * x87/MMX state -- confirm.
 */
00347 inline void ff_idct_xvid_sse2(short *block)
00348 {
00349 __asm__ volatile(
00350 "movq "MANGLE(m127)", %%mm0 \n\t" /* byte constant used by the TEST_* macros */
00351 iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
00352 iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
00353 iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
00354
00355 TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4)) /* eax: row 3, ecx: row 4 */
00356 JZ("%%eax", "1f") /* row 3 all zero: skip to the "1:" that ends the next iMTX_MULT */
00357 iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
00358
00359 TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6)) /* eax: row 5, edx: row 6 */
00360 TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7)) /* esi: row 7 */
00361 iLLM_HEAD
00362 ".p2align 4 \n\t"
/* Enter the chain below at the first non-zero row among 4..7. */
00363 JNZ("%%ecx", "2f")
00364 JNZ("%%eax", "3f")
00365 JNZ("%%edx", "4f")
00366 JNZ("%%esi", "5f")
/* Rows 4..7 are all zero: take the reduced second pass and finish. */
00367 iLLM_PASS_SPARSE("%0")
00368 "jmp 6f \n\t"
00369 "2: \n\t"
00370 iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4)) /* "#": row 4 gets no rounding constant */
00371 "3: \n\t"
00372 iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
00373 JZ("%%edx", "1f") /* row 6 all zero: skip its transform */
00374 "4: \n\t"
00375 iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
00376 JZ("%%esi", "1f") /* row 7 all zero: skip its transform */
00377 "5: \n\t"
00378 iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7)) /* rows 6 and 7 share rounder group 5 */
00379 #if !ARCH_X86_64
00380 iLLM_HEAD /* TAN3/TAN1 alias xmm0/xmm2 on x86-32 and were overwritten by iMTX_MULT */
00381 #endif
00382 iLLM_PASS("%0")
00383 "6: \n\t"
00384 : "+r"(block)
00385 :
00386 : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" ,
00387 "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" ,)
00388 #if ARCH_X86_64
00389 XMM_CLOBBERS("%xmm8" , "%xmm9" , "%xmm10", "%xmm11",
00390 "%xmm12", "%xmm13", "%xmm14",)
00391 #endif
00392 "%eax", "%ecx", "%edx", "%esi", "memory"
00393 );
00394 }
00395
/*
 * Run the in-place IDCT on block, then hand the result to
 * ff_put_pixels_clamped_mmx().
 *
 * @param dest       destination passed through to ff_put_pixels_clamped_mmx
 * @param line_size  passed through unchanged; presumably the stride of dest
 *                   in bytes -- confirm against the helper's declaration
 * @param block      coefficient block; overwritten by the IDCT
 */
00396 void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
00397 {
00398 ff_idct_xvid_sse2(block);
00399 ff_put_pixels_clamped_mmx(block, dest, line_size);
00400 }
00401
/*
 * Run the in-place IDCT on block, then hand the result to
 * ff_add_pixels_clamped_mmx().
 *
 * @param dest       destination passed through to ff_add_pixels_clamped_mmx
 * @param line_size  passed through unchanged; presumably the stride of dest
 *                   in bytes -- confirm against the helper's declaration
 * @param block      coefficient block; overwritten by the IDCT
 */
00402 void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
00403 {
00404 ff_idct_xvid_sse2(block);
00405 ff_add_pixels_clamped_mmx(block, dest, line_size);
00406 }
00407
00408 #endif