00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
/*
 * Instruction-set selection for this template.
 *
 * Every macro below expands to a string fragment that is pasted into the
 * inline-asm templates further down:
 *   MM     register name prefix ("%%xmm" or "%%mm"); a digit is appended
 *   MOV    load/store of one STEP-sized group of pixels
 *   MOVQ   full-register move (the SSE form is the aligned "movdqa")
 *   MOVQU  full-register load that tolerates unaligned addresses in the
 *          SSE build; in the MMX build it is the same "movq" as MOVQ
 *   STEP   how far the pixel pointers advance per loop iteration
 *   LOAD   load from mem into dst, then interleave the low bytes of dst
 *          with register 7
 *   PSRL1  shift a register right by one byte
 *   PSRL2  shift a register right by two bytes
 *   PSHUF  see the note on each variant below
 */
00021 #ifdef COMPILE_TEMPLATE_SSE
00022 #define MM "%%xmm"
00023 #define MOV "movq"
00024 #define MOVQ "movdqa"
00025 #define MOVQU "movdqu"
00026 #define STEP 8
/* The punpcklbw against register 7 only zero-extends bytes to words while
 * register 7 holds zero; the caller is responsible for that. */
00027 #define LOAD(mem,dst) \
00028 MOV" "mem", "dst" \n\t"\
00029 "punpcklbw "MM"7, "dst" \n\t"
00030 #define PSRL1(reg) "psrldq $1, "reg" \n\t"
00031 #define PSRL2(reg) "psrldq $2, "reg" \n\t"
/* NOTE(review): despite the parameter names, the FIRST argument is the one
 * written: the second argument is copied into it and the copy is then
 * shifted right by two bytes. The second argument is left unmodified. */
00032 #define PSHUF(src,dst) "movdqa "dst", "src" \n\t"\
00033 "psrldq $2, "src" \n\t"
00034 #else
00035 #define MM "%%mm"
00036 #define MOV "movd"
00037 #define MOVQ "movq"
00038 #define MOVQU "movq"
00039 #define STEP 4
/* Same contract as the SSE variant of LOAD above. */
00040 #define LOAD(mem,dst) \
00041 MOV" "mem", "dst" \n\t"\
00042 "punpcklbw "MM"7, "dst" \n\t"
00043 #define PSRL1(reg) "psrlq $8, "reg" \n\t"
00044 #define PSRL2(reg) "psrlq $16, "reg" \n\t"
/* NOTE(review): as in the SSE variant, the FIRST argument is the one
 * written. pshufw with selector 9 places words 1, 2, 0, 0 of the second
 * argument into words 0..3 of the first, so the two low words match a
 * two-byte right shift. */
00045 #define PSHUF(src,dst) "pshufw $9, "dst", "src" \n\t"
00046 #endif
00047
/*
 * PABS(tmp, dst): replace each signed 16-bit lane of dst with its absolute
 * value.
 *
 * With COMPILE_TEMPLATE_SSSE3 this is a single pabsw and tmp is not used.
 * Otherwise tmp is overwritten: it is cleared, set to (0 - dst), and dst
 * becomes the signed per-lane maximum of dst and tmp.
 */
00048 #ifdef COMPILE_TEMPLATE_SSSE3
00049 #define PABS(tmp,dst) \
00050 "pabsw "dst", "dst" \n\t"
00051 #else
00052 #define PABS(tmp,dst) \
00053 "pxor "tmp", "tmp" \n\t"\
00054 "psubw "dst", "tmp" \n\t"\
00055 "pmaxsw "tmp", "dst" \n\t"
00056 #endif
00057
/*
 * CHECK(pj, mj): evaluate one candidate direction.
 *
 * pj and mj are stringized and used as byte displacements on the two loads:
 * register 2 is loaded from pj(cur + mrefs), register 3 from mj(cur + prefs).
 *
 * Results (registers are MM"n"):
 *   register 5: per-byte average of the two loads, shifted right by one
 *               byte and then interleaved with register 7. The average is
 *               pavgb minus ((a ^ b) & pb_1); presumably pb_1 is 0x01 in
 *               every byte, which would make this the rounded-down average
 *               (the constant is defined outside this chunk; confirm).
 *   register 2: per-word sum of three neighbouring byte-wise absolute
 *               differences of the two loads (byte offsets +0, +1 and +2),
 *               each interleaved with register 7 before the word additions.
 *
 * Registers 3 and 4 are used as scratch. Register 7 must hold zero for the
 * interleaves to act as byte-to-word zero extension.
 */
00058 #define CHECK(pj,mj) \
00059 MOVQU" "#pj"(%[cur],%[mrefs]), "MM"2 \n\t" \
00060 MOVQU" "#mj"(%[cur],%[prefs]), "MM"3 \n\t" \
00061 MOVQ" "MM"2, "MM"4 \n\t"\
00062 MOVQ" "MM"2, "MM"5 \n\t"\
00063 "pxor "MM"3, "MM"4 \n\t"\
00064 "pavgb "MM"3, "MM"5 \n\t"\
00065 "pand "MANGLE(pb_1)", "MM"4 \n\t"\
00066 "psubusb "MM"4, "MM"5 \n\t"\
00067 PSRL1(MM"5") \
00068 "punpcklbw "MM"7, "MM"5 \n\t" \
00069 MOVQ" "MM"2, "MM"4 \n\t"\
00070 "psubusb "MM"3, "MM"2 \n\t"\
00071 "psubusb "MM"4, "MM"3 \n\t"\
00072 "pmaxub "MM"3, "MM"2 \n\t"\
00073 MOVQ" "MM"2, "MM"3 \n\t"\
00074 MOVQ" "MM"2, "MM"4 \n\t" \
00075 PSRL1(MM"3") \
00076 PSRL2(MM"4") \
00077 "punpcklbw "MM"7, "MM"2 \n\t"\
00078 "punpcklbw "MM"7, "MM"3 \n\t"\
00079 "punpcklbw "MM"7, "MM"4 \n\t"\
00080 "paddw "MM"3, "MM"2 \n\t"\
00081 "paddw "MM"4, "MM"2 \n\t"
00082
/*
 * CHECK1: fold the candidate produced by CHECK into the running best.
 *
 * On entry register 0 holds the best score so far, register 1 the value
 * selected so far, register 2 the new score and register 5 the new value.
 * A per-word mask is built that is all-ones where register 0 > register 2
 * (signed compare). Register 0 becomes the per-word minimum of the two
 * scores, and register 1 takes register 5 in the masked lanes while keeping
 * its old content elsewhere.
 *
 * The mask is also left in register 6. Registers 3 and 5 are clobbered.
 */
00083 #define CHECK1 \
00084 MOVQ" "MM"0, "MM"3 \n\t"\
00085 "pcmpgtw "MM"2, "MM"3 \n\t" \
00086 "pminsw "MM"2, "MM"0 \n\t" \
00087 MOVQ" "MM"3, "MM"6 \n\t"\
00088 "pand "MM"3, "MM"5 \n\t"\
00089 "pandn "MM"1, "MM"3 \n\t"\
00090 "por "MM"5, "MM"3 \n\t"\
00091 MOVQ" "MM"3, "MM"1 \n\t"
00092
/*
 * CHECK2: fold a second-step candidate into the running best, penalising
 * lanes in which the preceding CHECK1 did not accept its candidate.
 *
 * On entry register 6 holds the mask left by CHECK1, register 2 the new
 * score and register 5 the new value (both from CHECK). pw_1 is added to
 * the mask and the result is shifted left by 14 bits, then added to the
 * score with signed saturation. Presumably pw_1 is 1 in every word (it is
 * defined outside this chunk; confirm), in which case accepted lanes get a
 * penalty of 0 and rejected lanes a penalty of 0x4000.
 *
 * The penalised score then goes through the same compare/min/select as
 * CHECK1: register 0 becomes the per-word minimum and register 1 takes
 * register 5 where register 0 > register 2. Unlike CHECK1 the new mask is
 * not saved. Registers 2, 3, 5 and 6 are clobbered.
 *
 * The line continuation directly after the macro name is required: without
 * it the macro is empty and the instruction strings below are left outside
 * any macro.
 */
00093 #define CHECK2 \
00094 \
00095 "paddw "MANGLE(pw_1)", "MM"6 \n\t"\
00096 "psllw $14, "MM"6 \n\t"\
00097 "paddsw "MM"6, "MM"2 \n\t"\
00098 MOVQ" "MM"0, "MM"3 \n\t"\
00099 "pcmpgtw "MM"2, "MM"3 \n\t"\
00100 "pminsw "MM"2, "MM"0 \n\t"\
00101 "pand "MM"3, "MM"5 \n\t"\
00102 "pandn "MM"1, "MM"3 \n\t"\
00103 "por "MM"5, "MM"3 \n\t"\
00104 MOVQ" "MM"3, "MM"1 \n\t"
00105
/**
 * Filter one line of pixels, STEP pixels per loop iteration, using the
 * inline-asm fragments defined above. RENAME() is defined outside this
 * chunk; presumably it adds a per-instruction-set suffix to the name.
 *
 * @param dst    output pointer; one MOV-sized store is made per iteration
 * @param prev   input pointer, read at offsets mrefs and prefs (and, when
 *               parity is non-zero, at 0 and at twice those offsets)
 * @param cur    input pointer, read at offsets mrefs and prefs with byte
 *               displacements from -3 to +1
 * @param next   input pointer, read at offsets mrefs and prefs (and, when
 *               parity is zero, at 0 and at twice those offsets)
 * @param w      loop bound: iterations run while x < w, with x stepping by
 *               STEP, so the last iteration can run past w if w is not a
 *               multiple of STEP
 * @param prefs  byte offset added to the input pointers
 *               (presumably the distance to the following line; confirm)
 * @param mrefs  byte offset added to the input pointers
 *               (presumably the distance to the preceding line; confirm)
 * @param parity non-zero: prev2/next2 are prev/cur; zero: they are cur/next
 * @param mode   when mode >= 2 the extra limiting step that reads at
 *               2*mrefs and 2*prefs is skipped
 */
00106 void RENAME(ff_yadif_filter_line)(uint8_t *dst,
00107 uint8_t *prev, uint8_t *cur, uint8_t *next,
00108 int w, int prefs, int mrefs, int parity, int mode)
00109 {
/* Scratch area for spilling registers. tmpA is tmp rounded up to the next
 * 16-byte boundary; the asm uses the four slots at offsets 0, 16, 32, 48. */
00110 uint8_t tmp[5*16];
00111 uint8_t *tmpA= (uint8_t*)(((uint64_t)(tmp+15)) & ~15);
00112 int x;
00113
/*
 * FILTER: the per-line loop. It is a macro so that it can be expanded twice
 * below with different string values for prev2 and next2, which are pasted
 * into the asm operand names.
 *
 * Sequence of one iteration ("rN" means register MM"N"):
 *  1. r7 is cleared so that LOAD zero-extends bytes to words.
 *  2. r0 = cur[mrefs], r1 = cur[prefs], r2 = prev2[0], r3 = next2[0].
 *     r3 becomes (prev2 + next2) >> 1. r0, r3 and r1 are spilled to tmpA
 *     offsets 0, 16 and 32.
 *  3. r2 becomes the maximum of |prev2 - next2| >> 1,
 *     (|prev[mrefs] - r0| + |prev[prefs] - r1|) >> 1 and
 *     (|next[mrefs] - r0| + |next[prefs] - r1|) >> 1,
 *     and is spilled to tmpA offset 48.
 *  4. r1 becomes (cur[mrefs] + cur[prefs]) >> 1, the initial selected
 *     value. r0 becomes the initial score: |cur[mrefs] - cur[prefs]| plus
 *     the byte-wise absolute differences of the unaligned loads at
 *     displacement -1 taken at byte offsets 0 and 2, minus pw_1.
 *  5. Four candidate directions are tried in pairs, CHECK + CHECK1 and then
 *     CHECK + CHECK2, with displacements (-2,0), (-3,1), (0,-2), (1,-3).
 *     r0/r1 track the best score and its value.
 *  6. r6 is reloaded from tmpA offset 48. If mode >= 2 execution jumps to
 *     label 1. Otherwise prev2/next2 are read at 2*mrefs and 2*prefs and
 *     averaged, the differences against the spilled values are formed, and
 *     r6 is raised to at least the minimum of those differences and at
 *     least the negated maximum of them. This step overwrites r7.
 *  7. Label 1: r1 is clamped to [spill16 - r6, spill16 + r6], where spill16
 *     is the value at tmpA offset 16, then packed to bytes with unsigned
 *     saturation.
 *  8. A second asm statement stores r1 to *dst with MOV, and the four pixel
 *     pointers advance by STEP.
 *
 * NOTE(review): the first asm statement lists no output operands and no
 * clobbers although it stores to the tmpA buffer and modifies vector
 * registers, and the result is passed to the second asm statement in r1
 * without any operand linking the two. Confirm this is acceptable for the
 * compilers in use.
 */
00114 #define FILTER\
00115 for(x=0; x<w; x+=STEP){\
00116 __asm__ volatile(\
00117 "pxor "MM"7, "MM"7 \n\t"\
00118 LOAD("(%[cur],%[mrefs])", MM"0") \
00119 LOAD("(%[cur],%[prefs])", MM"1") \
00120 LOAD("(%["prev2"])", MM"2") \
00121 LOAD("(%["next2"])", MM"3") \
00122 MOVQ" "MM"3, "MM"4 \n\t"\
00123 "paddw "MM"2, "MM"3 \n\t"\
00124 "psraw $1, "MM"3 \n\t" \
00125 MOVQ" "MM"0, (%[tmpA]) \n\t" \
00126 MOVQ" "MM"3, 16(%[tmpA]) \n\t" \
00127 MOVQ" "MM"1, 32(%[tmpA]) \n\t" \
00128 "psubw "MM"4, "MM"2 \n\t"\
00129 PABS( MM"4", MM"2") \
00130 LOAD("(%[prev],%[mrefs])", MM"3") \
00131 LOAD("(%[prev],%[prefs])", MM"4") \
00132 "psubw "MM"0, "MM"3 \n\t"\
00133 "psubw "MM"1, "MM"4 \n\t"\
00134 PABS( MM"5", MM"3")\
00135 PABS( MM"5", MM"4")\
00136 "paddw "MM"4, "MM"3 \n\t" \
00137 "psrlw $1, "MM"2 \n\t"\
00138 "psrlw $1, "MM"3 \n\t"\
00139 "pmaxsw "MM"3, "MM"2 \n\t"\
00140 LOAD("(%[next],%[mrefs])", MM"3") \
00141 LOAD("(%[next],%[prefs])", MM"4") \
00142 "psubw "MM"0, "MM"3 \n\t"\
00143 "psubw "MM"1, "MM"4 \n\t"\
00144 PABS( MM"5", MM"3")\
00145 PABS( MM"5", MM"4")\
00146 "paddw "MM"4, "MM"3 \n\t" \
00147 "psrlw $1, "MM"3 \n\t"\
00148 "pmaxsw "MM"3, "MM"2 \n\t"\
00149 MOVQ" "MM"2, 48(%[tmpA]) \n\t" \
00150 \
00151 "paddw "MM"0, "MM"1 \n\t"\
00152 "paddw "MM"0, "MM"0 \n\t"\
00153 "psubw "MM"1, "MM"0 \n\t"\
00154 "psrlw $1, "MM"1 \n\t" \
00155 PABS( MM"2", MM"0") \
00156 \
00157 MOVQU" -1(%[cur],%[mrefs]), "MM"2 \n\t" \
00158 MOVQU" -1(%[cur],%[prefs]), "MM"3 \n\t" \
00159 MOVQ" "MM"2, "MM"4 \n\t"\
00160 "psubusb "MM"3, "MM"2 \n\t"\
00161 "psubusb "MM"4, "MM"3 \n\t"\
00162 "pmaxub "MM"3, "MM"2 \n\t"\
00163 PSHUF(MM"3", MM"2") \
00164 "punpcklbw "MM"7, "MM"2 \n\t" \
00165 "punpcklbw "MM"7, "MM"3 \n\t" \
00166 "paddw "MM"2, "MM"0 \n\t"\
00167 "paddw "MM"3, "MM"0 \n\t"\
00168 "psubw "MANGLE(pw_1)", "MM"0 \n\t" \
00169 \
00170 CHECK(-2,0)\
00171 CHECK1\
00172 CHECK(-3,1)\
00173 CHECK2\
00174 CHECK(0,-2)\
00175 CHECK1\
00176 CHECK(1,-3)\
00177 CHECK2\
00178 \
00179 \
00180 MOVQ" 48(%[tmpA]), "MM"6 \n\t" \
00181 "cmpl $2, %[mode] \n\t"\
00182 "jge 1f \n\t"\
00183 LOAD("(%["prev2"],%[mrefs],2)", MM"2") \
00184 LOAD("(%["next2"],%[mrefs],2)", MM"4") \
00185 LOAD("(%["prev2"],%[prefs],2)", MM"3") \
00186 LOAD("(%["next2"],%[prefs],2)", MM"5") \
00187 "paddw "MM"4, "MM"2 \n\t"\
00188 "paddw "MM"5, "MM"3 \n\t"\
00189 "psrlw $1, "MM"2 \n\t" \
00190 "psrlw $1, "MM"3 \n\t" \
00191 MOVQ" (%[tmpA]), "MM"4 \n\t" \
00192 MOVQ" 16(%[tmpA]), "MM"5 \n\t" \
00193 MOVQ" 32(%[tmpA]), "MM"7 \n\t" \
00194 "psubw "MM"4, "MM"2 \n\t" \
00195 "psubw "MM"7, "MM"3 \n\t" \
00196 MOVQ" "MM"5, "MM"0 \n\t"\
00197 "psubw "MM"4, "MM"5 \n\t" \
00198 "psubw "MM"7, "MM"0 \n\t" \
00199 MOVQ" "MM"2, "MM"4 \n\t"\
00200 "pminsw "MM"3, "MM"2 \n\t"\
00201 "pmaxsw "MM"4, "MM"3 \n\t"\
00202 "pmaxsw "MM"5, "MM"2 \n\t"\
00203 "pminsw "MM"5, "MM"3 \n\t"\
00204 "pmaxsw "MM"0, "MM"2 \n\t" \
00205 "pminsw "MM"0, "MM"3 \n\t" \
00206 "pxor "MM"4, "MM"4 \n\t"\
00207 "pmaxsw "MM"3, "MM"6 \n\t"\
00208 "psubw "MM"2, "MM"4 \n\t" \
00209 "pmaxsw "MM"4, "MM"6 \n\t" \
00210 "1: \n\t"\
00211 \
00212 MOVQ" 16(%[tmpA]), "MM"2 \n\t" \
00213 MOVQ" "MM"2, "MM"3 \n\t"\
00214 "psubw "MM"6, "MM"2 \n\t" \
00215 "paddw "MM"6, "MM"3 \n\t" \
00216 "pmaxsw "MM"2, "MM"1 \n\t"\
00217 "pminsw "MM"3, "MM"1 \n\t" \
00218 "packuswb "MM"1, "MM"1 \n\t"\
00219 \
00220 :\
00221 :[tmpA] "r"(tmpA),\
00222 [prev] "r"(prev),\
00223 [cur] "r"(cur),\
00224 [next] "r"(next),\
00225 [prefs]"r"((x86_reg)prefs),\
00226 [mrefs]"r"((x86_reg)mrefs),\
00227 [mode] "g"(mode)\
00228 );\
00229 __asm__ volatile(MOV" "MM"1, %0" :"=m"(*dst));\
00230 dst += STEP;\
00231 prev+= STEP;\
00232 cur += STEP;\
00233 next+= STEP;\
00234 }
00235
/* Expand the loop once per parity. prev2 and next2 are string literals
 * that select which named asm operand the "prev2"/"next2" loads use. */
00236 if (parity) {
00237 #define prev2 "prev"
00238 #define next2 "cur"
00239 FILTER
00240 #undef prev2
00241 #undef next2
00242 } else {
00243 #define prev2 "cur"
00244 #define next2 "next"
00245 FILTER
00246 #undef prev2
00247 #undef next2
00248 }
00249 }
/* Remove every helper macro defined in this chunk so the names are free
 * for redefinition (presumably because this template is included more than
 * once with different COMPILE_TEMPLATE_* settings; confirm against the
 * including file). */
00250 #undef STEP
00251 #undef MM
00252 #undef MOV
00253 #undef MOVQ
00254 #undef MOVQU
00255 #undef PSHUF
00256 #undef PSRL1
00257 #undef PSRL2
00258 #undef LOAD
00259 #undef PABS
00260 #undef CHECK
00261 #undef CHECK1
00262 #undef CHECK2
00263 #undef FILTER
00264