00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041 #include "libavcodec/dsputil.h"
00042 #include "libavutil/x86_cpu.h"
00043 #include "idct_xvid.h"
00044 #include "dsputil_mmx.h"
00045
/* X8(x): expands to eight comma-separated copies of x. It is used below to
 * fill every lane of an 8-element constant with the same value. */
00051 #define X8(x) x,x,x,x,x,x,x,x
00052
/* Right-shift amounts of the two passes. NOTE(review): neither macro is
 * referenced in the visible source; the asm spells the shifts as the literals
 * "$11" (psrad in iMTX_MULT) and "$6" (psraw in iLLM_PASS and
 * iLLM_PASS_SPARSE), so changing these macros alone has no effect. */
00053 #define ROW_SHIFT 11
00054 #define COL_SHIFT 6
00055
/*
 * Column-pass multipliers, each replicated in all eight 16-bit lanes. The asm
 * applies them with pmulhw, which keeps the high 16 bits of the signed 16x16
 * product, i.e. it multiplies by value / 65536:
 *   tan1   13036 / 65536 ~= 0.1989  (numerically tan(pi/16))
 *   tan2   27146 / 65536 ~= 0.4142  (numerically tan(2*pi/16))
 *   tan3   43790 does not fit in int16_t; assuming the usual wrap to -21746
 *          this is ~= -0.3318 (tan(3*pi/16) - 1). The asm adds the multiplied
 *          operand back after the pmulhw, which restores the missing "+1".
 *   sqrt2  23170 / 65536 ~= 0.3536  (0.5 / sqrt(2)); the asm doubles the
 *          product afterwards with paddsw.
 * m127 holds 127 in each of its eight bytes; it is loaded into %mm0 and used
 * by TEST_ONE_ROW / TEST_TWO_ROWS to detect all-zero rows.
 *
 * DECLARE_ASM_CONST is defined outside this file; its first argument is
 * presumably the alignment in bytes -- TODO confirm. The 16-bit tables are
 * read with movdqa, which requires 16-byte aligned memory operands.
 */
00056 DECLARE_ASM_CONST(16, int16_t, tan1)[] = {X8(13036)};
00057 DECLARE_ASM_CONST(16, int16_t, tan2)[] = {X8(27146)};
00058 DECLARE_ASM_CONST(16, int16_t, tan3)[] = {X8(43790)};
00059 DECLARE_ASM_CONST(16, int16_t, sqrt2)[]= {X8(23170)};
00060 DECLARE_ASM_CONST(8, uint8_t, m127)[] = {X8(127)};
00061
/*
 * Row-pass coefficient tables for iMTX_MULT. Each table holds 32 16-bit
 * values (64 bytes), consumed as four 16-byte pmaddwd operands at byte
 * offsets 0, 16, 32 and 48.
 *
 * Table-to-row assignment in ff_idct_xvid_sse2():
 *   iTab1 -> rows 0 and 4
 *   iTab2 -> rows 1 and 7
 *   iTab3 -> rows 2 and 6
 *   iTab4 -> rows 3 and 5
 *
 * Entries above 0x7fff (e.g. 0xc000, 0xac61) do not fit in int16_t and
 * presumably rely on wrapping to the corresponding negative value.
 */
00062 DECLARE_ASM_CONST(16, int16_t, iTab1)[] = {
00063 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
00064 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
00065 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
00066 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
00067 };
00068
/* Coefficients for rows 1 and 7 (same layout as iTab1). */
00069 DECLARE_ASM_CONST(16, int16_t, iTab2)[] = {
00070 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
00071 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
00072 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
00073 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
00074 };
00075
/* Coefficients for rows 2 and 6 (same layout as iTab1). */
00076 DECLARE_ASM_CONST(16, int16_t, iTab3)[] = {
00077 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
00078 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
00079 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
00080 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
00081 };
00082
/* Coefficients for rows 3 and 5 (same layout as iTab1). */
00083 DECLARE_ASM_CONST(16, int16_t, iTab4)[] = {
00084 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
00085 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
00086 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
00087 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
00088 };
00089
/*
 * Per-row rounding terms for the row pass: six groups of four identical
 * 32-bit values (16 bytes per group). A group is added with paddd (see
 * ROUND) to the even-part sums in iMTX_MULT before the ">> 11".
 *
 * Group selection in ff_idct_xvid_sse2(), as byte offsets into this table:
 *   row 0 -> +0      (65536)      row 5 -> +4*16 (120)
 *   row 1 -> +1*16   (3597)       row 6 -> +5*16 (512)
 *   row 2 -> +2*16   (2260)       row 7 -> +5*16 (512)
 *   row 3 -> +3*16   (1203)       row 4 -> none (rounder argument is "#")
 */
00090 DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
00091 65536, 65536, 65536, 65536,
00092 3597, 3597, 3597, 3597,
00093 2260, 2260, 2260, 2260,
00094 1203, 1203, 1203, 1203,
00095 120, 120, 120, 120,
00096 512, 512, 512, 512
00097 };
00098
00099
/* Registers that carry the odd rows from the row pass (iMTX_MULT) to the
 * column pass (iLLM_PASS / iLLM_PASS_SPARSE). The column-pass macros name
 * these registers literally, so the two must be kept in sync. */
00100 #define ROW1 "%%xmm6"
00101 #define ROW3 "%%xmm4"
00102 #define ROW5 "%%xmm5"
00103 #define ROW7 "%%xmm7"
00104
/* CLEAR_ODD(r): zero register r by xor-ing it with itself.
 * PUT_ODD(dst): write the iMTX_MULT result held in xmm2 to dst; pshufhw with
 * immediate 0x1B copies the low four words unchanged and reverses the order
 * of the high four words. */
00105 #define CLEAR_ODD(r) "pxor "r","r" \n\t"
00106 #define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"
00107
/* x86-64: sixteen XMM registers are available, so the even rows, the scratch
 * register XMMS and the TAN3/TAN1 constants each get a register of their own
 * (xmm8..xmm14). */
00108 #if ARCH_X86_64
00109
00110 # define ROW0 "%%xmm8"
00111 # define REG0 ROW0
00112 # define ROW2 "%%xmm9"
00113 # define REG2 ROW2
00114 # define ROW4 "%%xmm10"
00115 # define REG4 ROW4
00116 # define ROW6 "%%xmm11"
00117 # define REG6 ROW6
00118 # define CLEAR_EVEN(r) CLEAR_ODD(r)
00119 # define PUT_EVEN(dst) PUT_ODD(dst)
00120 # define XMMS "%%xmm12"
/* "#" starts an assembler comment in GNU as x86 syntax, so every instruction
 * line prefixed with MOV_32_ONLY is disabled on x86-64. */
00121 # define MOV_32_ONLY "#"
00122 # define SREG2 REG2
00123 # define TAN3 "%%xmm13"
00124 # define TAN1 "%%xmm14"
00125
/* 32-bit x86: only xmm0..xmm7 are used. The even rows live in the block's own
 * memory (ROWn is an address relative to operand %0) and are loaded into the
 * REGn registers by the MOV_32_ONLY lines of the column pass. REG0/REG2 share
 * xmm4 and REG4/REG6 share xmm6; TAN1 and XMMS share xmm2. */
00126 #else
00127
00128 # define ROW0 "(%0)"
00129 # define REG0 "%%xmm4"
00130 # define ROW2 "2*16(%0)"
00131 # define REG2 "%%xmm4"
00132 # define ROW4 "4*16(%0)"
00133 # define REG4 "%%xmm6"
00134 # define ROW6 "6*16(%0)"
00135 # define REG6 "%%xmm6"
/* Nothing to clear: an even row that tested as all-zero is already zero in
 * the memory slot that ROWn refers to. */
00136 # define CLEAR_EVEN(r)
/* Reverse the high four words of xmm2 in place, then store it to memory. */
00137 # define PUT_EVEN(dst) \
00138 "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
00139 "movdqa %%xmm2, "dst" \n\t"
00140 # define XMMS "%%xmm2"
00141 # define MOV_32_ONLY "movdqa "
00142 # define SREG2 "%%xmm7"
00143 # define TAN3 "%%xmm0"
00144 # define TAN1 "%%xmm2"
00145
00146 #endif
00147
/* ROUND(x): the opening part of a "paddd <x>" instruction whose source is the
 * memory symbol x (wrapped by the project macro MANGLE). iMTX_MULT appends
 * the destination operand ", %%xmm0" and the line terminator. */
00148 #define ROUND(x) "paddd "MANGLE(x)
00149
/* JZ(reg, to): jump to label `to` when the 32-bit register reg is zero. */
00150 #define JZ(reg, to) \
00151 "testl "reg","reg" \n\t" \
00152 "jz "to" \n\t"
00153
/* JNZ(reg, to): jump to label `to` when the 32-bit register reg is non-zero. */
00154 #define JNZ(reg, to) \
00155 "testl "reg","reg" \n\t" \
00156 "jnz "to" \n\t"
00157
/*
 * TEST_ONE_ROW(src, reg, clear): emit `clear`, then set reg to a non-zero
 * value iff the 16-byte row at src contains any non-zero byte.
 *
 * The two 8-byte halves of the row are OR-ed together in %mm1. Adding %mm0
 * with unsigned byte saturation sets the top bit of every non-zero byte, and
 * pmovmskb gathers those top bits into reg. This only works when %mm0 holds
 * 127 in every byte; ff_idct_xvid_sse2() loads m127 into it first.
 * Overwrites %mm1 and reg.
 */
00158 #define TEST_ONE_ROW(src, reg, clear) \
00159 clear \
00160 "movq "src", %%mm1 \n\t" \
00161 "por 8+"src", %%mm1 \n\t" \
00162 "paddusb %%mm0, %%mm1 \n\t" \
00163 "pmovmskb %%mm1, "reg" \n\t"
00164
/*
 * TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2): the same zero test
 * as TEST_ONE_ROW applied to two rows with the instructions interleaved;
 * reg1 reports row1 and reg2 reports row2. Both clear snippets are emitted
 * before the tests. Overwrites %mm1, %mm2, reg1 and reg2; requires %mm0 to
 * hold 127 in every byte.
 */
00165 #define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
00166 clear1 \
00167 clear2 \
00168 "movq "row1", %%mm1 \n\t" \
00169 "por 8+"row1", %%mm1 \n\t" \
00170 "movq "row2", %%mm2 \n\t" \
00171 "por 8+"row2", %%mm2 \n\t" \
00172 "paddusb %%mm0, %%mm1 \n\t" \
00173 "paddusb %%mm0, %%mm2 \n\t" \
00174 "pmovmskb %%mm1, "reg1" \n\t" \
00175 "pmovmskb %%mm2, "reg2" \n\t"
00176
/*
 * iMTX_MULT(src, table, rounder, put): row pass for one row of eight 16-bit
 * coefficients.
 *
 *   src      memory operand of the 16-byte input row (read with movdqa)
 *   table    64-byte coefficient table, used as four pmaddwd operands at
 *            offsets 0, 16, 32 and 48
 *   rounder  opening part of the instruction that adds the rounding term to
 *            xmm0 (see ROUND); passing "#" turns that line into an assembler
 *            comment, i.e. no rounding term is added
 *   put      snippet that moves the result out of xmm2 (PUT_ODD / PUT_EVEN)
 *
 * Four 32-bit partial sums are accumulated in xmm0 (+ rounder) and four in
 * xmm2. Their sum and difference are shifted right arithmetically by 11 (the
 * value of ROW_SHIFT, written here as a literal) and packed with signed
 * saturation into xmm2: the low four words come from the sums, the high four
 * from the differences.
 *
 * Overwrites xmm0, xmm1, xmm2 and xmm3. The expansion ends with the local
 * label "1:", so a preceding JZ(reg, "1f") skips the whole row.
 */
00178 #define iMTX_MULT(src, table, rounder, put) \
00179 "movdqa "src", %%xmm3 \n\t" \
00180 "movdqa %%xmm3, %%xmm0 \n\t" \
00181 "pshufd $0x11, %%xmm3, %%xmm1 \n\t" \
00182 "punpcklqdq %%xmm0, %%xmm0 \n\t" \
00183 "pmaddwd "table", %%xmm0 \n\t" \
00184 "pmaddwd 16+"table", %%xmm1 \n\t" \
00185 "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" \
00186 "punpckhqdq %%xmm3, %%xmm3 \n\t" \
00187 "pmaddwd 32+"table", %%xmm2 \n\t" \
00188 "pmaddwd 48+"table", %%xmm3 \n\t" \
00189 "paddd %%xmm1, %%xmm0 \n\t" \
00190 "paddd %%xmm3, %%xmm2 \n\t" \
00191 rounder", %%xmm0 \n\t" \
00192 "movdqa %%xmm2, %%xmm3 \n\t" \
00193 "paddd %%xmm0, %%xmm2 \n\t" \
00194 "psubd %%xmm3, %%xmm0 \n\t" \
00195 "psrad $11, %%xmm2 \n\t" \
00196 "psrad $11, %%xmm0 \n\t" \
00197 "packssdw %%xmm0, %%xmm2 \n\t" \
00198 put \
00199 "1: \n\t"
00200
/* iLLM_HEAD: load the tan3 and tan1 constant vectors into the TAN3 and TAN1
 * registers, as expected on entry by iLLM_PASS and iLLM_PASS_SPARSE. On
 * 32-bit builds TAN3/TAN1 are xmm0/xmm2, which iMTX_MULT overwrites, so the
 * load has to come after the last iMTX_MULT. The trailing backslash continues
 * the macro onto the following empty line. */
00201 #define iLLM_HEAD \
00202 "movdqa "MANGLE(tan3)", "TAN3" \n\t" \
00203 "movdqa "MANGLE(tan1)", "TAN1" \n\t" \
00204
/*
 * iLLM_PASS(dct): full column pass over all eight rows, processing the eight
 * columns in parallel as 16-bit lanes with saturating adds/subtracts.
 *
 * Expected on entry:
 *   xmm6, xmm4, xmm5, xmm7   rows 1, 3, 5, 7 (ROW1, ROW3, ROW5, ROW7)
 *   ROW0, ROW2, ROW4, ROW6   even rows: registers on x86-64, memory relative
 *                            to %0 on 32-bit, where the MOV_32_ONLY lines
 *                            load them into REGn
 *   TAN3, TAN1               tan3 / tan1 constants (see iLLM_HEAD); both are
 *                            overwritten by this pass
 *
 * tan2 and sqrt2 are loaded from memory inside the pass. The sqrt2 products
 * are doubled afterwards with paddsw.
 *
 * On 32-bit TAN1 shares xmm2 with the scratch register XMMS, so it is spilled
 * to (%0) after row 0 has been loaded from that slot and reloaded before use.
 * Note that the spill address is written as "(%0)", not via the dct argument.
 *
 * Every result is shifted right arithmetically by 6 (the value of COL_SHIFT,
 * written as a literal) and stored with movdqa to rows 0..7 of dct at byte
 * offsets 0, 16, ..., 7*16. Also overwrites xmm0, xmm1 and xmm3.
 */
00206 #define iLLM_PASS(dct) \
00207 "movdqa "TAN3", %%xmm1 \n\t" \
00208 "movdqa "TAN1", %%xmm3 \n\t" \
00209 "pmulhw %%xmm4, "TAN3" \n\t" \
00210 "pmulhw %%xmm5, %%xmm1 \n\t" \
00211 "paddsw %%xmm4, "TAN3" \n\t" \
00212 "paddsw %%xmm5, %%xmm1 \n\t" \
00213 "psubsw %%xmm5, "TAN3" \n\t" \
00214 "paddsw %%xmm4, %%xmm1 \n\t" \
00215 "pmulhw %%xmm7, %%xmm3 \n\t" \
00216 "pmulhw %%xmm6, "TAN1" \n\t" \
00217 "paddsw %%xmm6, %%xmm3 \n\t" \
00218 "psubsw %%xmm7, "TAN1" \n\t" \
00219 "movdqa %%xmm3, %%xmm7 \n\t" \
00220 "movdqa "TAN1", %%xmm6 \n\t" \
00221 "psubsw %%xmm1, %%xmm3 \n\t" \
00222 "psubsw "TAN3", "TAN1" \n\t" \
00223 "paddsw %%xmm7, %%xmm1 \n\t" \
00224 "paddsw %%xmm6, "TAN3" \n\t" \
00225 "movdqa %%xmm3, %%xmm6 \n\t" \
00226 "psubsw "TAN3", %%xmm3 \n\t" \
00227 "paddsw %%xmm6, "TAN3" \n\t" \
00228 "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
00229 "pmulhw %%xmm4, %%xmm3 \n\t" \
00230 "pmulhw %%xmm4, "TAN3" \n\t" \
00231 "paddsw "TAN3", "TAN3" \n\t" \
00232 "paddsw %%xmm3, %%xmm3 \n\t" \
00233 "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
00234 MOV_32_ONLY ROW2", "REG2" \n\t" \
00235 MOV_32_ONLY ROW6", "REG6" \n\t" \
00236 "movdqa %%xmm7, %%xmm5 \n\t" \
00237 "pmulhw "REG6", %%xmm7 \n\t" \
00238 "pmulhw "REG2", %%xmm5 \n\t" \
00239 "paddsw "REG2", %%xmm7 \n\t" \
00240 "psubsw "REG6", %%xmm5 \n\t" \
00241 MOV_32_ONLY ROW0", "REG0" \n\t" \
00242 MOV_32_ONLY ROW4", "REG4" \n\t" \
00243 MOV_32_ONLY" "TAN1", (%0) \n\t" \
00244 "movdqa "REG0", "XMMS" \n\t" \
00245 "psubsw "REG4", "REG0" \n\t" \
00246 "paddsw "XMMS", "REG4" \n\t" \
00247 "movdqa "REG4", "XMMS" \n\t" \
00248 "psubsw %%xmm7, "REG4" \n\t" \
00249 "paddsw "XMMS", %%xmm7 \n\t" \
00250 "movdqa "REG0", "XMMS" \n\t" \
00251 "psubsw %%xmm5, "REG0" \n\t" \
00252 "paddsw "XMMS", %%xmm5 \n\t" \
00253 "movdqa %%xmm5, "XMMS" \n\t" \
00254 "psubsw "TAN3", %%xmm5 \n\t" \
00255 "paddsw "XMMS", "TAN3" \n\t" \
00256 "movdqa "REG0", "XMMS" \n\t" \
00257 "psubsw %%xmm3, "REG0" \n\t" \
00258 "paddsw "XMMS", %%xmm3 \n\t" \
00259 MOV_32_ONLY" (%0), "TAN1" \n\t" \
00260 "psraw $6, %%xmm5 \n\t" \
00261 "psraw $6, "REG0" \n\t" \
00262 "psraw $6, "TAN3" \n\t" \
00263 "psraw $6, %%xmm3 \n\t" \
00264 "movdqa "TAN3", 1*16("dct") \n\t" \
00265 "movdqa %%xmm3, 2*16("dct") \n\t" \
00266 "movdqa "REG0", 5*16("dct") \n\t" \
00267 "movdqa %%xmm5, 6*16("dct") \n\t" \
00268 "movdqa %%xmm7, %%xmm0 \n\t" \
00269 "movdqa "REG4", %%xmm4 \n\t" \
00270 "psubsw %%xmm1, %%xmm7 \n\t" \
00271 "psubsw "TAN1", "REG4" \n\t" \
00272 "paddsw %%xmm0, %%xmm1 \n\t" \
00273 "paddsw %%xmm4, "TAN1" \n\t" \
00274 "psraw $6, %%xmm1 \n\t" \
00275 "psraw $6, %%xmm7 \n\t" \
00276 "psraw $6, "TAN1" \n\t" \
00277 "psraw $6, "REG4" \n\t" \
00278 "movdqa %%xmm1, ("dct") \n\t" \
00279 "movdqa "TAN1", 3*16("dct") \n\t" \
00280 "movdqa "REG4", 4*16("dct") \n\t" \
00281 "movdqa %%xmm7, 7*16("dct") \n\t"
00282
/*
 * iLLM_PASS_SPARSE(dct): reduced column pass that reads only rows 0..3.
 * ff_idct_xvid_sse2() uses it when the zero tests report rows 4..7 as all
 * zero, which lets the terms involving those rows be dropped.
 *
 * Expected on entry:
 *   xmm6, xmm4    rows 1 and 3 (ROW1, ROW3)
 *   ROW0, ROW2    rows 0 and 2: registers on x86-64, memory relative to %0 on
 *                 32-bit, loaded into REG0 / SREG2 by the MOV_32_ONLY lines
 *   TAN3, TAN1    tan3 / tan1 constants (see iLLM_HEAD); both are overwritten
 *
 * As in iLLM_PASS, TAN1 is spilled to "(%0)" and reloaded on 32-bit builds
 * because it shares xmm2 with XMMS, every result is shifted right
 * arithmetically by 6, and all eight output rows are stored with movdqa at
 * byte offsets 0, 16, ..., 7*16 of dct. Also overwrites xmm0, xmm1, xmm3 and
 * xmm5.
 */
00284 #define iLLM_PASS_SPARSE(dct) \
00285 "pmulhw %%xmm4, "TAN3" \n\t" \
00286 "paddsw %%xmm4, "TAN3" \n\t" \
00287 "movdqa %%xmm6, %%xmm3 \n\t" \
00288 "pmulhw %%xmm6, "TAN1" \n\t" \
00289 "movdqa %%xmm4, %%xmm1 \n\t" \
00290 "psubsw %%xmm1, %%xmm3 \n\t" \
00291 "paddsw %%xmm6, %%xmm1 \n\t" \
00292 "movdqa "TAN1", %%xmm6 \n\t" \
00293 "psubsw "TAN3", "TAN1" \n\t" \
00294 "paddsw %%xmm6, "TAN3" \n\t" \
00295 "movdqa %%xmm3, %%xmm6 \n\t" \
00296 "psubsw "TAN3", %%xmm3 \n\t" \
00297 "paddsw %%xmm6, "TAN3" \n\t" \
00298 "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
00299 "pmulhw %%xmm4, %%xmm3 \n\t" \
00300 "pmulhw %%xmm4, "TAN3" \n\t" \
00301 "paddsw "TAN3", "TAN3" \n\t" \
00302 "paddsw %%xmm3, %%xmm3 \n\t" \
00303 "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
00304 MOV_32_ONLY ROW2", "SREG2" \n\t" \
00305 "pmulhw "SREG2", %%xmm5 \n\t" \
00306 MOV_32_ONLY ROW0", "REG0" \n\t" \
00307 "movdqa "REG0", %%xmm6 \n\t" \
00308 "psubsw "SREG2", %%xmm6 \n\t" \
00309 "paddsw "REG0", "SREG2" \n\t" \
00310 MOV_32_ONLY" "TAN1", (%0) \n\t" \
00311 "movdqa "REG0", "XMMS" \n\t" \
00312 "psubsw %%xmm5, "REG0" \n\t" \
00313 "paddsw "XMMS", %%xmm5 \n\t" \
00314 "movdqa %%xmm5, "XMMS" \n\t" \
00315 "psubsw "TAN3", %%xmm5 \n\t" \
00316 "paddsw "XMMS", "TAN3" \n\t" \
00317 "movdqa "REG0", "XMMS" \n\t" \
00318 "psubsw %%xmm3, "REG0" \n\t" \
00319 "paddsw "XMMS", %%xmm3 \n\t" \
00320 MOV_32_ONLY" (%0), "TAN1" \n\t" \
00321 "psraw $6, %%xmm5 \n\t" \
00322 "psraw $6, "REG0" \n\t" \
00323 "psraw $6, "TAN3" \n\t" \
00324 "psraw $6, %%xmm3 \n\t" \
00325 "movdqa "TAN3", 1*16("dct") \n\t" \
00326 "movdqa %%xmm3, 2*16("dct") \n\t" \
00327 "movdqa "REG0", 5*16("dct") \n\t" \
00328 "movdqa %%xmm5, 6*16("dct") \n\t" \
00329 "movdqa "SREG2", %%xmm0 \n\t" \
00330 "movdqa %%xmm6, %%xmm4 \n\t" \
00331 "psubsw %%xmm1, "SREG2" \n\t" \
00332 "psubsw "TAN1", %%xmm6 \n\t" \
00333 "paddsw %%xmm0, %%xmm1 \n\t" \
00334 "paddsw %%xmm4, "TAN1" \n\t" \
00335 "psraw $6, %%xmm1 \n\t" \
00336 "psraw $6, "SREG2" \n\t" \
00337 "psraw $6, "TAN1" \n\t" \
00338 "psraw $6, %%xmm6 \n\t" \
00339 "movdqa %%xmm1, ("dct") \n\t" \
00340 "movdqa "TAN1", 3*16("dct") \n\t" \
00341 "movdqa %%xmm6, 4*16("dct") \n\t" \
00342 "movdqa "SREG2", 7*16("dct") \n\t"
00343
/*
 * In-place 8x8 inverse DCT (per the function name) on 16-bit coefficients,
 * implemented as one SSE2/MMX inline-asm statement.
 *
 * block: 8 rows of 8 int16 values, row r at byte offset r*16. It is accessed
 *        with movdqa and therefore has to be 16-byte aligned. The result
 *        overwrites the input.
 *
 * Structure: a row pass (iMTX_MULT per row, skipped for rows that test as
 * all-zero) followed by one column pass (iLLM_PASS, or iLLM_PASS_SPARSE when
 * rows 4..7 are all zero).
 *
 * The asm declares xmm0..xmm7 (plus xmm8..xmm14 on x86-64), eax, ecx, edx,
 * esi and memory as clobbered. XMM_CLOBBERS is a project macro defined
 * elsewhere.
 * NOTE(review): %mm0..%mm2 are written by the zero tests but are not in the
 * clobber list, and no emms is issued in this function -- confirm that
 * callers account for the MMX state.
 */
00344 inline void ff_idct_xvid_sse2(short *block)
00345 {
00346 __asm__ volatile(
/* %mm0 = 127 in every byte, required by TEST_ONE_ROW / TEST_TWO_ROWS. */
00347 "movq "MANGLE(m127)", %%mm0 \n\t"
/* Rows 0..2 are always transformed. */
00348 iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
00349 iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
00350 iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
00351
/* eax = row 3 non-zero, ecx = row 4 non-zero. ROW3/ROW4 are cleared first so
 * that a skipped row reads as zero in the column pass. */
00352 TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
/* Skip row 3 when it is all zero ("1f" is the label ending iMTX_MULT). */
00353 JZ("%%eax", "1f")
00354 iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
00355
/* eax is reused: eax = row 5, edx = row 6, esi = row 7 non-zero. */
00356 TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
00357 TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
/* Load TAN3/TAN1 now; the sparse path below runs no further iMTX_MULT. */
00358 iLLM_HEAD
/* Align the dispatch code to a 16-byte boundary. */
00359 ".p2align 4 \n\t"
/* Enter the row 4..7 chain at the first row that is non-zero. */
00360 JNZ("%%ecx", "2f")
00361 JNZ("%%eax", "3f")
00362 JNZ("%%edx", "4f")
00363 JNZ("%%esi", "5f")
/* Rows 4..7 are all zero: reduced column pass, then done. */
00364 iLLM_PASS_SPARSE("%0")
00365 "jmp 6f \n\t"
/* 2: row 4 (no rounding term). Execution falls through into row 5. */
00366 "2: \n\t"
00367 iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
/* 3: row 5; it is also transformed unconditionally when entered from 2. */
00368 "3: \n\t"
00369 iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
/* Skip row 6 when it is all zero. */
00370 JZ("%%edx", "1f")
00371 "4: \n\t"
00372 iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
/* Skip row 7 when it is all zero. */
00373 JZ("%%esi", "1f")
00374 "5: \n\t"
/* Row 7 uses the same rounder group (+5*16) as row 6. */
00375 iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
/* On 32-bit TAN3/TAN1 are xmm0/xmm2, which the iMTX_MULT expansions above
 * overwrote, so the constants are reloaded before the full column pass. */
00376 #if !ARCH_X86_64
00377 iLLM_HEAD
00378 #endif
00379 iLLM_PASS("%0")
/* 6: common exit, the jump target of the sparse path. */
00380 "6: \n\t"
/* block is an in/out register operand: it is only used as the base address. */
00381 : "+r"(block)
00382 :
00383 : XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" ,
00384 "%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" ,)
00385 #if ARCH_X86_64
00386 XMM_CLOBBERS("%xmm8" , "%xmm9" , "%xmm10", "%xmm11",
00387 "%xmm12", "%xmm13", "%xmm14",)
00388 #endif
00389 "%eax", "%ecx", "%edx", "%esi", "memory"
00390 );
00391 }
00392
/*
 * Transform block in place with ff_idct_xvid_sse2(), then hand the result to
 * ff_put_pixels_clamped_mmx() together with dest and line_size.
 *
 * block is overwritten with the transformed values.
 * NOTE(review): ff_put_pixels_clamped_mmx is defined elsewhere; presumably it
 * clamps the samples to 8 bits and stores them to dest using line_size as the
 * row stride -- confirm against its definition.
 */
00393 void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
00394 {
00395 ff_idct_xvid_sse2(block);
00396 ff_put_pixels_clamped_mmx(block, dest, line_size);
00397 }
00398
/*
 * Transform block in place with ff_idct_xvid_sse2(), then hand the result to
 * ff_add_pixels_clamped_mmx() together with dest and line_size.
 *
 * block is overwritten with the transformed values.
 * NOTE(review): ff_add_pixels_clamped_mmx is defined elsewhere; presumably it
 * adds the samples to the pixels already at dest, clamps to 8 bits and uses
 * line_size as the row stride -- confirm against its definition.
 */
00399 void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
00400 {
00401 ff_idct_xvid_sse2(block);
00402 ff_add_pixels_clamped_mmx(block, dest, line_size);
00403 }