00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041 #include "libavcodec/dsputil.h"
00042 #include "idct_xvid.h"
00043 #include "dsputil_mmx.h"
00044
/* X8(x) expands to eight comma-separated copies of x, i.e. one value
 * replicated across an 8-element initializer. */
00050 #define X8(x) x,x,x,x,x,x,x,x
00051
/* NOTE(review): neither macro is referenced in the code visible here; the
 * row pass hard-codes "psrad $11" and the column passes hard-code
 * "psraw $6", which happen to match these values. */
00052 #define ROW_SHIFT 11
00053 #define COL_SHIFT 6
00054
/* Multipliers used with pmulhw in iLLM_PASS / iLLM_PASS_SPARSE. pmulhw keeps
 * the high 16 bits of the signed 16x16 product, so each constant acts as
 * value/65536:
 *   tan1  13036/65536 ~= 0.1989
 *   tan2  27146/65536 ~= 0.4142
 *   tan3  43790 does not fit int16_t; NOTE(review): presumably it wraps to
 *         43790-65536 = -21746 (~= -0.3318). The passes follow the tan3
 *         pmulhw with a paddsw of the multiplicand, which adds 1.0*x back.
 *   sqrt2 23170/65536 ~= 0.3536; the passes double the product afterwards
 *         (paddsw r,r), giving ~= 0.7071.
 * The first DECLARE_ASM_CONST argument is presumably the alignment; movdqa
 * needs 16-byte aligned memory operands -- TODO confirm against the macro. */
00055 DECLARE_ASM_CONST(16, int16_t, tan1)[] = {X8(13036)};
00056 DECLARE_ASM_CONST(16, int16_t, tan2)[] = {X8(27146)};
00057 DECLARE_ASM_CONST(16, int16_t, tan3)[] = {X8(43790)};
00058 DECLARE_ASM_CONST(16, int16_t, sqrt2)[]= {X8(23170)};
/* Eight bytes of 127. Loaded into %mm0 at the start of ff_idct_xvid_sse2 and
 * used by the TEST_* macros: a saturating unsigned add of 127 sets a byte's
 * top bit exactly when that byte was non-zero. */
00059 DECLARE_ASM_CONST(8, uint8_t, m127)[] = {X8(127)};
00060
/* Row-transform coefficient tables for iMTX_MULT.
 *
 * Each table holds 32 16-bit words (64 bytes); iMTX_MULT reads it as four
 * 16-byte pmaddwd operands at byte offsets 0, 16, 32 and 48.
 * ff_idct_xvid_sse2 pairs the tables with rows as follows:
 *   iTab1 -> rows 0 and 4
 *   iTab2 -> rows 1 and 7
 *   iTab3 -> rows 2 and 6
 *   iTab4 -> rows 3 and 5
 * Entries written as 0x8000..0xffff do not fit int16_t as spelled;
 * NOTE(review): they presumably rely on conversion to the corresponding
 * negative value -- confirm for the compilers in use. */
00061 DECLARE_ASM_CONST(16, int16_t, iTab1)[] = {
00062 0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
00063 0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
00064 0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
00065 0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
00066 };
00067
/* Coefficients for rows 1 and 7. */
00068 DECLARE_ASM_CONST(16, int16_t, iTab2)[] = {
00069 0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
00070 0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
00071 0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
00072 0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
00073 };
00074
/* Coefficients for rows 2 and 6. */
00075 DECLARE_ASM_CONST(16, int16_t, iTab3)[] = {
00076 0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
00077 0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
00078 0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
00079 0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
00080 };
00081
/* Coefficients for rows 3 and 5. */
00082 DECLARE_ASM_CONST(16, int16_t, iTab4)[] = {
00083 0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
00084 0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
00085 0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
00086 0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
00087 };
00088
/* Rounding terms added (paddd) to the 32-bit row sums before the
 * "psrad $11" in iMTX_MULT. Six groups of four identical dwords, 16 bytes
 * each, addressed as walkenIdctRounders + k*16:
 *   k = 0..3 -> rows 0..3
 *   k = 4    -> row 5
 *   k = 5    -> rows 6 and 7
 * Row 4 has no entry: ff_idct_xvid_sse2 passes "#" as its rounder, so no
 * paddd is emitted for that row. */
00089 DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
00090 65536, 65536, 65536, 65536,
00091 3597, 3597, 3597, 3597,
00092 2260, 2260, 2260, 2260,
00093 1203, 1203, 1203, 1203,
00094 120, 120, 120, 120,
00095 512, 512, 512, 512
00096 };
00097
00098
/* Where each transformed row lives between the row pass and the column pass.
 * The odd rows are kept in the same XMM registers on both architectures. */
00099 #define ROW1 "%%xmm6"
00100 #define ROW3 "%%xmm4"
00101 #define ROW5 "%%xmm5"
00102 #define ROW7 "%%xmm7"
00103
/* CLEAR_ODD zeroes register r. PUT_ODD copies the iMTX_MULT result (%xmm2)
 * into dst with its four high words in reversed order (pshufhw $0x1B). */
00104 #define CLEAR_ODD(r) "pxor "r","r" \n\t"
00105 #define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"
00106
00107 #if ARCH_X86_64
00108
/* x86-64: the even rows also stay in registers (xmm8..xmm11), so REGn is
 * simply ROWn and even rows are cleared/stored like odd ones. */
00109 # define ROW0 "%%xmm8"
00110 # define REG0 ROW0
00111 # define ROW2 "%%xmm9"
00112 # define REG2 ROW2
00113 # define ROW4 "%%xmm10"
00114 # define REG4 ROW4
00115 # define ROW6 "%%xmm11"
00116 # define REG6 ROW6
00117 # define CLEAR_EVEN(r) CLEAR_ODD(r)
00118 # define PUT_EVEN(dst) PUT_ODD(dst)
/* Scratch register used for the add/sub pairs in the column passes. */
00119 # define XMMS "%%xmm12"
/* "#" starts an assembler comment, so every MOV_32_ONLY line is a no-op
 * on this architecture. */
00120 # define MOV_32_ONLY "#"
00121 # define SREG2 REG2
00122 # define TAN3 "%%xmm13"
00123 # define TAN1 "%%xmm14"
00124
00125 #else
00126
/* 32-bit: only xmm0..xmm7 are used, so the even rows are kept in the block
 * itself (offsets from %0). REGn names the register a column pass loads
 * that row into; REG0/REG2 share xmm4 and REG4/REG6 share xmm6. */
00127 # define ROW0 "(%0)"
00128 # define REG0 "%%xmm4"
00129 # define ROW2 "2*16(%0)"
00130 # define REG2 "%%xmm4"
00131 # define ROW4 "4*16(%0)"
00132 # define REG4 "%%xmm6"
00133 # define ROW6 "6*16(%0)"
00134 # define REG6 "%%xmm6"
/* Nothing to clear: a skipped even row is left untouched in memory. */
00135 # define CLEAR_EVEN(r)
/* Reverse the high words of %xmm2 in place, then store it to dst. */
00136 # define PUT_EVEN(dst) \
00137 "pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
00138 "movdqa %%xmm2, "dst" \n\t"
/* XMMS and TAN1 are both xmm2 here, which is why the column passes spill
 * TAN1 to (%0) and reload it through MOV_32_ONLY moves. */
00139 # define XMMS "%%xmm2"
00140 # define MOV_32_ONLY "movdqa "
00141 # define SREG2 "%%xmm7"
/* TAN3/TAN1 overlap registers that iMTX_MULT overwrites (xmm0, xmm2). */
00142 # define TAN3 "%%xmm0"
00143 # define TAN1 "%%xmm2"
00144
00145 #endif
00146
/* ROUND(x) yields the start of a "paddd <mangled x>" instruction; iMTX_MULT
 * appends ", %%xmm0" to complete it. */
00147 #define ROUND(x) "paddd "MANGLE(x)
00148
/* JZ: jump to label `to` when the 32-bit register `reg` is zero. */
00149 #define JZ(reg, to) \
00150 "testl "reg","reg" \n\t" \
00151 "jz "to" \n\t"
00152
/* JNZ: jump to label `to` when the 32-bit register `reg` is non-zero. */
00153 #define JNZ(reg, to) \
00154 "testl "reg","reg" \n\t" \
00155 "jnz "to" \n\t"
00156
/* TEST_ONE_ROW(src, reg, clear): emit `clear`, then leave a non-zero value
 * in `reg` iff any byte of the 16-byte row at `src` is non-zero.
 * The two 8-byte halves are ORed into %mm1, 127 (expected in %mm0) is added
 * with unsigned saturation so only non-zero bytes get their top bit set,
 * and pmovmskb gathers those top bits into `reg`. Clobbers %mm1. */
00157 #define TEST_ONE_ROW(src, reg, clear) \
00158 clear \
00159 "movq "src", %%mm1 \n\t" \
00160 "por 8+"src", %%mm1 \n\t" \
00161 "paddusb %%mm0, %%mm1 \n\t" \
00162 "pmovmskb %%mm1, "reg" \n\t"
00163
/* TEST_TWO_ROWS: same test as TEST_ONE_ROW applied to two rows at once;
 * row1's result goes to reg1 and row2's to reg2. Emits clear1 and clear2
 * first. Expects 127 in every byte of %mm0; clobbers %mm1 and %mm2. */
00164 #define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
00165 clear1 \
00166 clear2 \
00167 "movq "row1", %%mm1 \n\t" \
00168 "por 8+"row1", %%mm1 \n\t" \
00169 "movq "row2", %%mm2 \n\t" \
00170 "por 8+"row2", %%mm2 \n\t" \
00171 "paddusb %%mm0, %%mm1 \n\t" \
00172 "paddusb %%mm0, %%mm2 \n\t" \
00173 "pmovmskb %%mm1, "reg1" \n\t" \
00174 "pmovmskb %%mm2, "reg2" \n\t"
00175
/* iMTX_MULT(src, table, rounder, put): row transform of one 8-coefficient
 * row.
 *
 *   src     memory operand of the 16-byte row (read with movdqa)
 *   table   64-byte coefficient table, used as four pmaddwd operands at
 *           offsets 0, 16, 32 and 48
 *   rounder text placed in front of ", %%xmm0"; normally ROUND(...) so that
 *           a paddd is formed, or "#" to turn the line into an assembler
 *           comment (no rounding term)
 *   put     instructions that move the result out of %xmm2
 *
 * Four dword rearrangements of the row are multiplied against the table and
 * summed into two 4x32-bit vectors, a (in %xmm0, rounder added) and b (in
 * %xmm2). a+b and a-b are shifted right arithmetically by 11 and packed with
 * signed saturation so that %xmm2 holds a+b in its low four words and a-b in
 * its high four words. Clobbers %xmm0-%xmm3. Ends by defining local label
 * "1", which callers use as the target for skipping this row. */
00177 #define iMTX_MULT(src, table, rounder, put) \
00178 "movdqa "src", %%xmm3 \n\t" \
00179 "movdqa %%xmm3, %%xmm0 \n\t" \
00180 "pshufd $0x11, %%xmm3, %%xmm1 \n\t" \
00181 "punpcklqdq %%xmm0, %%xmm0 \n\t" \
00182 "pmaddwd "table", %%xmm0 \n\t" \
00183 "pmaddwd 16+"table", %%xmm1 \n\t" \
00184 "pshufd $0xBB, %%xmm3, %%xmm2 \n\t" \
00185 "punpckhqdq %%xmm3, %%xmm3 \n\t" \
00186 "pmaddwd 32+"table", %%xmm2 \n\t" \
00187 "pmaddwd 48+"table", %%xmm3 \n\t" \
00188 "paddd %%xmm1, %%xmm0 \n\t" \
00189 "paddd %%xmm3, %%xmm2 \n\t" \
00190 rounder", %%xmm0 \n\t" \
00191 "movdqa %%xmm2, %%xmm3 \n\t" \
00192 "paddd %%xmm0, %%xmm2 \n\t" \
00193 "psubd %%xmm3, %%xmm0 \n\t" \
00194 "psrad $11, %%xmm2 \n\t" \
00195 "psrad $11, %%xmm0 \n\t" \
00196 "packssdw %%xmm0, %%xmm2 \n\t" \
00197 put \
00198 "1: \n\t"
00199
/* iLLM_HEAD: load the tan3 and tan1 constant vectors into the TAN3 and TAN1
 * registers, as both column passes expect on entry. On 32-bit builds those
 * registers are xmm0/xmm2, which iMTX_MULT overwrites, so the load has to be
 * repeated after any later row transform.
 * Note the trailing backslash: the definition continues onto the next
 * (otherwise empty) line. */
00200 #define iLLM_HEAD \
00201 "movdqa "MANGLE(tan3)", "TAN3" \n\t" \
00202 "movdqa "MANGLE(tan1)", "TAN1" \n\t" \
00203
/* iLLM_PASS(dct): full column pass over all eight rows; results are written
 * to the eight 16-byte slots 0*16(dct) .. 7*16(dct).
 *
 * Expected on entry:
 *   %xmm6 / %xmm4 / %xmm5 / %xmm7  rows 1 / 3 / 5 / 7 (ROW1/3/5/7)
 *   ROW0, ROW2, ROW4, ROW6         even rows (registers on x86-64, memory at
 *                                  offsets from %0 on 32-bit)
 *   TAN3, TAN1                     constants loaded by iLLM_HEAD
 *
 * Outline, in instruction order:
 *   1. Odd rows: products with tan3 (rows 3 and 5) and tan1 (rows 1 and 7)
 *      are combined by saturating add/sub; two of the resulting terms are
 *      scaled with sqrt2 via pmulhw and then doubled.
 *   2. Rows 2 and 6 are combined using tan2 (loaded into %xmm7 / %xmm5).
 *   3. Rows 0 and 4 are summed/differenced and merged with step 2, using
 *      XMMS as scratch for each add/sub pair.
 *   4. Even and odd parts are added/subtracted, every output is shifted
 *      right arithmetically by 6 and stored.
 *
 * The MOV_32_ONLY lines only do something on 32-bit builds: they load the
 * even rows from memory and park TAN1 in (%0) while its register doubles as
 * XMMS, reloading it before step 4. All arithmetic is 16-bit saturating
 * (paddsw/psubsw). Clobbers %xmm0-%xmm7 plus, on x86-64, the ROWn/XMMS/TANn
 * registers. */
00205 #define iLLM_PASS(dct) \
00206 "movdqa "TAN3", %%xmm1 \n\t" \
00207 "movdqa "TAN1", %%xmm3 \n\t" \
00208 "pmulhw %%xmm4, "TAN3" \n\t" \
00209 "pmulhw %%xmm5, %%xmm1 \n\t" \
00210 "paddsw %%xmm4, "TAN3" \n\t" \
00211 "paddsw %%xmm5, %%xmm1 \n\t" \
00212 "psubsw %%xmm5, "TAN3" \n\t" \
00213 "paddsw %%xmm4, %%xmm1 \n\t" \
00214 "pmulhw %%xmm7, %%xmm3 \n\t" \
00215 "pmulhw %%xmm6, "TAN1" \n\t" \
00216 "paddsw %%xmm6, %%xmm3 \n\t" \
00217 "psubsw %%xmm7, "TAN1" \n\t" \
00218 "movdqa %%xmm3, %%xmm7 \n\t" \
00219 "movdqa "TAN1", %%xmm6 \n\t" \
00220 "psubsw %%xmm1, %%xmm3 \n\t" \
00221 "psubsw "TAN3", "TAN1" \n\t" \
00222 "paddsw %%xmm7, %%xmm1 \n\t" \
00223 "paddsw %%xmm6, "TAN3" \n\t" \
00224 "movdqa %%xmm3, %%xmm6 \n\t" \
00225 "psubsw "TAN3", %%xmm3 \n\t" \
00226 "paddsw %%xmm6, "TAN3" \n\t" \
00227 "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
00228 "pmulhw %%xmm4, %%xmm3 \n\t" \
00229 "pmulhw %%xmm4, "TAN3" \n\t" \
00230 "paddsw "TAN3", "TAN3" \n\t" \
00231 "paddsw %%xmm3, %%xmm3 \n\t" \
00232 "movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
00233 MOV_32_ONLY ROW2", "REG2" \n\t" \
00234 MOV_32_ONLY ROW6", "REG6" \n\t" \
00235 "movdqa %%xmm7, %%xmm5 \n\t" \
00236 "pmulhw "REG6", %%xmm7 \n\t" \
00237 "pmulhw "REG2", %%xmm5 \n\t" \
00238 "paddsw "REG2", %%xmm7 \n\t" \
00239 "psubsw "REG6", %%xmm5 \n\t" \
00240 MOV_32_ONLY ROW0", "REG0" \n\t" \
00241 MOV_32_ONLY ROW4", "REG4" \n\t" \
00242 MOV_32_ONLY" "TAN1", (%0) \n\t" \
00243 "movdqa "REG0", "XMMS" \n\t" \
00244 "psubsw "REG4", "REG0" \n\t" \
00245 "paddsw "XMMS", "REG4" \n\t" \
00246 "movdqa "REG4", "XMMS" \n\t" \
00247 "psubsw %%xmm7, "REG4" \n\t" \
00248 "paddsw "XMMS", %%xmm7 \n\t" \
00249 "movdqa "REG0", "XMMS" \n\t" \
00250 "psubsw %%xmm5, "REG0" \n\t" \
00251 "paddsw "XMMS", %%xmm5 \n\t" \
00252 "movdqa %%xmm5, "XMMS" \n\t" \
00253 "psubsw "TAN3", %%xmm5 \n\t" \
00254 "paddsw "XMMS", "TAN3" \n\t" \
00255 "movdqa "REG0", "XMMS" \n\t" \
00256 "psubsw %%xmm3, "REG0" \n\t" \
00257 "paddsw "XMMS", %%xmm3 \n\t" \
00258 MOV_32_ONLY" (%0), "TAN1" \n\t" \
00259 "psraw $6, %%xmm5 \n\t" \
00260 "psraw $6, "REG0" \n\t" \
00261 "psraw $6, "TAN3" \n\t" \
00262 "psraw $6, %%xmm3 \n\t" \
00263 "movdqa "TAN3", 1*16("dct") \n\t" \
00264 "movdqa %%xmm3, 2*16("dct") \n\t" \
00265 "movdqa "REG0", 5*16("dct") \n\t" \
00266 "movdqa %%xmm5, 6*16("dct") \n\t" \
00267 "movdqa %%xmm7, %%xmm0 \n\t" \
00268 "movdqa "REG4", %%xmm4 \n\t" \
00269 "psubsw %%xmm1, %%xmm7 \n\t" \
00270 "psubsw "TAN1", "REG4" \n\t" \
00271 "paddsw %%xmm0, %%xmm1 \n\t" \
00272 "paddsw %%xmm4, "TAN1" \n\t" \
00273 "psraw $6, %%xmm1 \n\t" \
00274 "psraw $6, %%xmm7 \n\t" \
00275 "psraw $6, "TAN1" \n\t" \
00276 "psraw $6, "REG4" \n\t" \
00277 "movdqa %%xmm1, ("dct") \n\t" \
00278 "movdqa "TAN1", 3*16("dct") \n\t" \
00279 "movdqa "REG4", 4*16("dct") \n\t" \
00280 "movdqa %%xmm7, 7*16("dct") \n\t"
00281
/* iLLM_PASS_SPARSE(dct): column pass that reads only rows 0-3.
 * ff_idct_xvid_sse2 selects it when the zero tests of rows 4, 5, 6 and 7
 * all came back zero. It still writes all eight output slots
 * 0*16(dct) .. 7*16(dct).
 *
 * Expected on entry:
 *   %xmm6 / %xmm4   rows 1 / 3 (ROW1 / ROW3)
 *   ROW0, ROW2      rows 0 and 2 (registers on x86-64, memory on 32-bit,
 *                   where they are loaded into REG0 and SREG2)
 *   TAN3, TAN1      constants loaded by iLLM_HEAD
 *
 * Compared with iLLM_PASS, the terms involving rows 4-7 are simply absent:
 * row 3 is scaled by tan3 and row 1 by tan1 without partner rows, row 2 is
 * scaled by tan2 alone, and row 0 is used directly instead of being
 * summed/differenced with row 4. The sqrt2 scaling, the XMMS scratch usage,
 * the 32-bit-only TAN1 spill to (%0) and the final "psraw $6" are the same
 * as in iLLM_PASS. */
00283 #define iLLM_PASS_SPARSE(dct) \
00284 "pmulhw %%xmm4, "TAN3" \n\t" \
00285 "paddsw %%xmm4, "TAN3" \n\t" \
00286 "movdqa %%xmm6, %%xmm3 \n\t" \
00287 "pmulhw %%xmm6, "TAN1" \n\t" \
00288 "movdqa %%xmm4, %%xmm1 \n\t" \
00289 "psubsw %%xmm1, %%xmm3 \n\t" \
00290 "paddsw %%xmm6, %%xmm1 \n\t" \
00291 "movdqa "TAN1", %%xmm6 \n\t" \
00292 "psubsw "TAN3", "TAN1" \n\t" \
00293 "paddsw %%xmm6, "TAN3" \n\t" \
00294 "movdqa %%xmm3, %%xmm6 \n\t" \
00295 "psubsw "TAN3", %%xmm3 \n\t" \
00296 "paddsw %%xmm6, "TAN3" \n\t" \
00297 "movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
00298 "pmulhw %%xmm4, %%xmm3 \n\t" \
00299 "pmulhw %%xmm4, "TAN3" \n\t" \
00300 "paddsw "TAN3", "TAN3" \n\t" \
00301 "paddsw %%xmm3, %%xmm3 \n\t" \
00302 "movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
00303 MOV_32_ONLY ROW2", "SREG2" \n\t" \
00304 "pmulhw "SREG2", %%xmm5 \n\t" \
00305 MOV_32_ONLY ROW0", "REG0" \n\t" \
00306 "movdqa "REG0", %%xmm6 \n\t" \
00307 "psubsw "SREG2", %%xmm6 \n\t" \
00308 "paddsw "REG0", "SREG2" \n\t" \
00309 MOV_32_ONLY" "TAN1", (%0) \n\t" \
00310 "movdqa "REG0", "XMMS" \n\t" \
00311 "psubsw %%xmm5, "REG0" \n\t" \
00312 "paddsw "XMMS", %%xmm5 \n\t" \
00313 "movdqa %%xmm5, "XMMS" \n\t" \
00314 "psubsw "TAN3", %%xmm5 \n\t" \
00315 "paddsw "XMMS", "TAN3" \n\t" \
00316 "movdqa "REG0", "XMMS" \n\t" \
00317 "psubsw %%xmm3, "REG0" \n\t" \
00318 "paddsw "XMMS", %%xmm3 \n\t" \
00319 MOV_32_ONLY" (%0), "TAN1" \n\t" \
00320 "psraw $6, %%xmm5 \n\t" \
00321 "psraw $6, "REG0" \n\t" \
00322 "psraw $6, "TAN3" \n\t" \
00323 "psraw $6, %%xmm3 \n\t" \
00324 "movdqa "TAN3", 1*16("dct") \n\t" \
00325 "movdqa %%xmm3, 2*16("dct") \n\t" \
00326 "movdqa "REG0", 5*16("dct") \n\t" \
00327 "movdqa %%xmm5, 6*16("dct") \n\t" \
00328 "movdqa "SREG2", %%xmm0 \n\t" \
00329 "movdqa %%xmm6, %%xmm4 \n\t" \
00330 "psubsw %%xmm1, "SREG2" \n\t" \
00331 "psubsw "TAN1", %%xmm6 \n\t" \
00332 "paddsw %%xmm0, %%xmm1 \n\t" \
00333 "paddsw %%xmm4, "TAN1" \n\t" \
00334 "psraw $6, %%xmm1 \n\t" \
00335 "psraw $6, "SREG2" \n\t" \
00336 "psraw $6, "TAN1" \n\t" \
00337 "psraw $6, %%xmm6 \n\t" \
00338 "movdqa %%xmm1, ("dct") \n\t" \
00339 "movdqa "TAN1", 3*16("dct") \n\t" \
00340 "movdqa %%xmm6, 4*16("dct") \n\t" \
00341 "movdqa "SREG2", 7*16("dct") \n\t"
00342
00343 inline void ff_idct_xvid_sse2(short *block)
00344 {
00345 __asm__ volatile(
00346 "movq "MANGLE(m127)", %%mm0 \n\t"
00347 iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
00348 iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
00349 iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
00350
00351 TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
00352 JZ("%%eax", "1f")
00353 iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
00354
00355 TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
00356 TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
00357 iLLM_HEAD
00358 ASMALIGN(4)
00359 JNZ("%%ecx", "2f")
00360 JNZ("%%eax", "3f")
00361 JNZ("%%edx", "4f")
00362 JNZ("%%esi", "5f")
00363 iLLM_PASS_SPARSE("%0")
00364 "jmp 6f \n\t"
00365 "2: \n\t"
00366 iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
00367 "3: \n\t"
00368 iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
00369 JZ("%%edx", "1f")
00370 "4: \n\t"
00371 iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
00372 JZ("%%esi", "1f")
00373 "5: \n\t"
00374 iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
00375 #if !ARCH_X86_64
00376 iLLM_HEAD
00377 #endif
00378 iLLM_PASS("%0")
00379 "6: \n\t"
00380 : "+r"(block)
00381 :
00382 : "%eax", "%ecx", "%edx", "%esi", "memory");
00383 }
00384
/**
 * Transform `block` in place with ff_idct_xvid_sse2(), then hand the result
 * to put_pixels_clamped_mmx() together with `dest` and `line_size`.
 *
 * @param dest      destination passed through to put_pixels_clamped_mmx()
 * @param line_size passed through unchanged; presumably the byte stride
 *                  between destination rows -- TODO confirm
 * @param block     64 coefficients; overwritten by the transform
 *
 * NOTE(review): put_pixels_clamped_mmx() is defined elsewhere; going by its
 * name it stores the block to dest clamped to the 8-bit pixel range --
 * confirm against its definition.
 */
00385 void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
00386 {
00387 ff_idct_xvid_sse2(block);
00388 put_pixels_clamped_mmx(block, dest, line_size);
00389 }
00390
/**
 * Transform `block` in place with ff_idct_xvid_sse2(), then hand the result
 * to add_pixels_clamped_mmx() together with `dest` and `line_size`.
 *
 * @param dest      destination passed through to add_pixels_clamped_mmx()
 * @param line_size passed through unchanged; presumably the byte stride
 *                  between destination rows -- TODO confirm
 * @param block     64 coefficients; overwritten by the transform
 *
 * NOTE(review): add_pixels_clamped_mmx() is defined elsewhere; going by its
 * name it adds the block to the existing contents of dest with clamping --
 * confirm against its definition.
 */
00391 void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
00392 {
00393 ff_idct_xvid_sse2(block);
00394 add_pixels_clamped_mmx(block, dest, line_size);
00395 }