00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
/*
 * Horizontal two-sample "put" for an 8-byte-wide block: every output row is
 * PAVGB(pixels[x], pixels[x + 1]) stored to block.
 *
 * block     - destination pointer, stepped by line_size per row
 * pixels    - source pointer, stepped by line_size per row, read at +0 and +1
 * line_size - byte stride used for both pointers
 * h         - row counter; reduced by 4 per loop pass, the loop only exits
 *             when it reaches exactly zero
 *
 * NOTE(review): DEF, PAVGB and REG_a are macros defined outside this chunk.
 * PAVGB is presumably a packed unsigned byte average (pavgb / pavgusb) -
 * confirm. REG_a names the scratch register listed in the clobbers.
 */
00030 static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00031 {
00032 __asm__ volatile(
/* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
/* REG_a = 2 * line_size, the step for one pair of rows */
00033 "lea (%3, %3), %%"REG_a" \n\t"
00034 "1: \n\t"
/* rows 0 and 1 of this pass */
00035 "movq (%1), %%mm0 \n\t"
00036 "movq (%1, %3), %%mm1 \n\t"
00037 PAVGB" 1(%1), %%mm0 \n\t"
00038 PAVGB" 1(%1, %3), %%mm1 \n\t"
00039 "movq %%mm0, (%2) \n\t"
00040 "movq %%mm1, (%2, %3) \n\t"
00041 "add %%"REG_a", %1 \n\t"
00042 "add %%"REG_a", %2 \n\t"
/* rows 2 and 3 of this pass */
00043 "movq (%1), %%mm0 \n\t"
00044 "movq (%1, %3), %%mm1 \n\t"
00045 PAVGB" 1(%1), %%mm0 \n\t"
00046 PAVGB" 1(%1, %3), %%mm1 \n\t"
00047 "add %%"REG_a", %1 \n\t"
00048 "movq %%mm0, (%2) \n\t"
00049 "movq %%mm1, (%2, %3) \n\t"
00050 "add %%"REG_a", %2 \n\t"
/* four rows done; jnz tests the flags left by subl */
00051 "subl $4, %0 \n\t"
00052 "jnz 1b \n\t"
00053 :"+g"(h), "+S"(pixels), "+D"(block)
00054 :"r" ((x86_reg)line_size)
00055 :"%"REG_a, "memory");
00056 }
00057
/*
 * Writes PAVGB(src1 row, src2 row) to dst for a 4-byte-wide block.
 *
 * dst        - destination, stepped by dstStride per row
 * src1       - first source, stepped by src1Stride per row
 * src2       - second source, read as consecutive 4-byte rows (advanced by 4
 *              per row, i.e. 16 per loop pass)
 * dstStride  - destination stride in bytes
 * src1Stride - stride of src1 in bytes
 * h          - row counter; if its low bit is set one row is handled before
 *              the loop, then the loop handles 4 rows per pass and exits only
 *              when the counter reaches exactly zero
 *
 * NOTE(review): DEF and PAVGB are macros defined outside this chunk; PAVGB is
 * presumably a packed unsigned byte average - confirm.
 */
00058 static void DEF(put_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00059 {
00060 __asm__ volatile(
/* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride, %5 = dstStride */
/* odd h: handle a single leading row, otherwise jump straight to the loop */
00061 "testl $1, %0 \n\t"
00062 " jz 1f \n\t"
00063 "movd (%1), %%mm0 \n\t"
00064 "movd (%2), %%mm1 \n\t"
00065 "add %4, %1 \n\t"
00066 "add $4, %2 \n\t"
00067 PAVGB" %%mm1, %%mm0 \n\t"
00068 "movd %%mm0, (%3) \n\t"
00069 "add %5, %3 \n\t"
00070 "decl %0 \n\t"
00071 "1: \n\t"
/* rows 0 and 1 of this pass: src2 offsets 0 and 4 */
00072 "movd (%1), %%mm0 \n\t"
00073 "add %4, %1 \n\t"
00074 "movd (%1), %%mm1 \n\t"
00075 "movd (%2), %%mm2 \n\t"
00076 "movd 4(%2), %%mm3 \n\t"
00077 "add %4, %1 \n\t"
00078 PAVGB" %%mm2, %%mm0 \n\t"
00079 PAVGB" %%mm3, %%mm1 \n\t"
00080 "movd %%mm0, (%3) \n\t"
00081 "add %5, %3 \n\t"
00082 "movd %%mm1, (%3) \n\t"
00083 "add %5, %3 \n\t"
/* rows 2 and 3 of this pass: src2 offsets 8 and 12 */
00084 "movd (%1), %%mm0 \n\t"
00085 "add %4, %1 \n\t"
00086 "movd (%1), %%mm1 \n\t"
00087 "movd 8(%2), %%mm2 \n\t"
00088 "movd 12(%2), %%mm3 \n\t"
00089 "add %4, %1 \n\t"
00090 PAVGB" %%mm2, %%mm0 \n\t"
00091 PAVGB" %%mm3, %%mm1 \n\t"
00092 "movd %%mm0, (%3) \n\t"
00093 "add %5, %3 \n\t"
00094 "movd %%mm1, (%3) \n\t"
00095 "add %5, %3 \n\t"
/* src2 consumed four 4-byte rows */
00096 "add $16, %2 \n\t"
00097 "subl $4, %0 \n\t"
00098 "jnz 1b \n\t"
/* h lives in memory when ebx is unavailable, otherwise in ebx */
00099 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00100 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00101 #else
00102 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00103 #endif
00104 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00105 :"memory");
00106 }
00107
00108
/*
 * Writes PAVGB(src1 row, src2 row) to dst for an 8-byte-wide block.
 *
 * dst        - destination, stepped by dstStride per row
 * src1       - first source, stepped by src1Stride per row
 * src2       - second source, read as consecutive 8-byte rows (advanced by 8
 *              per row, i.e. 32 per loop pass)
 * dstStride  - destination stride in bytes
 * src1Stride - stride of src1 in bytes
 * h          - row counter; if its low bit is set one row is handled before
 *              the loop, then the loop handles 4 rows per pass and exits only
 *              when the counter reaches exactly zero
 *
 * NOTE(review): DEF and PAVGB are macros defined outside this chunk; PAVGB is
 * presumably a packed unsigned byte average - confirm.
 */
00109 static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00110 {
00111 __asm__ volatile(
/* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride, %5 = dstStride */
/* odd h: handle a single leading row, otherwise jump straight to the loop */
00112 "testl $1, %0 \n\t"
00113 " jz 1f \n\t"
00114 "movq (%1), %%mm0 \n\t"
00115 "movq (%2), %%mm1 \n\t"
00116 "add %4, %1 \n\t"
00117 "add $8, %2 \n\t"
00118 PAVGB" %%mm1, %%mm0 \n\t"
00119 "movq %%mm0, (%3) \n\t"
00120 "add %5, %3 \n\t"
00121 "decl %0 \n\t"
00122 "1: \n\t"
/* rows 0 and 1 of this pass: src2 offsets 0 and 8 */
00123 "movq (%1), %%mm0 \n\t"
00124 "add %4, %1 \n\t"
00125 "movq (%1), %%mm1 \n\t"
00126 "add %4, %1 \n\t"
00127 PAVGB" (%2), %%mm0 \n\t"
00128 PAVGB" 8(%2), %%mm1 \n\t"
00129 "movq %%mm0, (%3) \n\t"
00130 "add %5, %3 \n\t"
00131 "movq %%mm1, (%3) \n\t"
00132 "add %5, %3 \n\t"
/* rows 2 and 3 of this pass: src2 offsets 16 and 24 */
00133 "movq (%1), %%mm0 \n\t"
00134 "add %4, %1 \n\t"
00135 "movq (%1), %%mm1 \n\t"
00136 "add %4, %1 \n\t"
00137 PAVGB" 16(%2), %%mm0 \n\t"
00138 PAVGB" 24(%2), %%mm1 \n\t"
00139 "movq %%mm0, (%3) \n\t"
00140 "add %5, %3 \n\t"
00141 "movq %%mm1, (%3) \n\t"
00142 "add %5, %3 \n\t"
/* src2 consumed four 8-byte rows */
00143 "add $32, %2 \n\t"
00144 "subl $4, %0 \n\t"
00145 "jnz 1b \n\t"
/* h lives in memory when ebx is unavailable, otherwise in ebx */
00146 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00147 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00148 #else
00149 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00150 #endif
00151 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00152 :"memory");
00153
00154
00155
00156
00157 }
00158
/*
 * Like the 8-wide two-source put, but every input is XORed with an all-ones
 * mask (mm6) before PAVGB and the result is XORed with the mask again before
 * it is stored to dst.
 *
 * dst        - destination, stepped by dstStride per row
 * src1       - first source, stepped by src1Stride per row
 * src2       - second source, read as consecutive 8-byte rows
 * dstStride  - destination stride in bytes
 * src1Stride - stride of src1 in bytes
 * h          - row counter; an odd count gets one leading row, then the loop
 *              handles 4 rows per pass and exits only at exactly zero
 *
 * NOTE(review): the complement / average / complement sequence presumably
 * flips PAVGB's rounding direction, matching the "no_rnd" name - confirm.
 * DEF and PAVGB are macros defined outside this chunk.
 */
00159 static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00160 {
00161 __asm__ volatile(
/* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride, %5 = dstStride */
/* comparing mm6 with itself sets every bit: mm6 is the complement mask */
00162 "pcmpeqb %%mm6, %%mm6 \n\t"
/* odd h: handle a single leading row, otherwise jump straight to the loop */
00163 "testl $1, %0 \n\t"
00164 " jz 1f \n\t"
00165 "movq (%1), %%mm0 \n\t"
00166 "movq (%2), %%mm1 \n\t"
00167 "add %4, %1 \n\t"
00168 "add $8, %2 \n\t"
00169 "pxor %%mm6, %%mm0 \n\t"
00170 "pxor %%mm6, %%mm1 \n\t"
00171 PAVGB" %%mm1, %%mm0 \n\t"
00172 "pxor %%mm6, %%mm0 \n\t"
00173 "movq %%mm0, (%3) \n\t"
00174 "add %5, %3 \n\t"
00175 "decl %0 \n\t"
00176 "1: \n\t"
/* rows 0 and 1 of this pass: src2 offsets 0 and 8 */
00177 "movq (%1), %%mm0 \n\t"
00178 "add %4, %1 \n\t"
00179 "movq (%1), %%mm1 \n\t"
00180 "add %4, %1 \n\t"
00181 "movq (%2), %%mm2 \n\t"
00182 "movq 8(%2), %%mm3 \n\t"
00183 "pxor %%mm6, %%mm0 \n\t"
00184 "pxor %%mm6, %%mm1 \n\t"
00185 "pxor %%mm6, %%mm2 \n\t"
00186 "pxor %%mm6, %%mm3 \n\t"
00187 PAVGB" %%mm2, %%mm0 \n\t"
00188 PAVGB" %%mm3, %%mm1 \n\t"
00189 "pxor %%mm6, %%mm0 \n\t"
00190 "pxor %%mm6, %%mm1 \n\t"
00191 "movq %%mm0, (%3) \n\t"
00192 "add %5, %3 \n\t"
00193 "movq %%mm1, (%3) \n\t"
00194 "add %5, %3 \n\t"
/* rows 2 and 3 of this pass: src2 offsets 16 and 24 */
00195 "movq (%1), %%mm0 \n\t"
00196 "add %4, %1 \n\t"
00197 "movq (%1), %%mm1 \n\t"
00198 "add %4, %1 \n\t"
00199 "movq 16(%2), %%mm2 \n\t"
00200 "movq 24(%2), %%mm3 \n\t"
00201 "pxor %%mm6, %%mm0 \n\t"
00202 "pxor %%mm6, %%mm1 \n\t"
00203 "pxor %%mm6, %%mm2 \n\t"
00204 "pxor %%mm6, %%mm3 \n\t"
00205 PAVGB" %%mm2, %%mm0 \n\t"
00206 PAVGB" %%mm3, %%mm1 \n\t"
00207 "pxor %%mm6, %%mm0 \n\t"
00208 "pxor %%mm6, %%mm1 \n\t"
00209 "movq %%mm0, (%3) \n\t"
00210 "add %5, %3 \n\t"
00211 "movq %%mm1, (%3) \n\t"
00212 "add %5, %3 \n\t"
/* src2 consumed four 8-byte rows */
00213 "add $32, %2 \n\t"
00214 "subl $4, %0 \n\t"
00215 "jnz 1b \n\t"
/* h lives in memory when ebx is unavailable, otherwise in ebx */
00216 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00217 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00218 #else
00219 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00220 #endif
00221 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00222 :"memory");
00223
00224
00225
00226
00227 }
00228
/*
 * For a 4-byte-wide block: dst = PAVGB(PAVGB(src1 row, src2 row), dst), i.e.
 * the two-source average is blended with what dst already holds.
 *
 * dst        - destination (read and written), stepped by dstStride per row
 * src1       - first source, stepped by src1Stride per row
 * src2       - second source, read as consecutive 4-byte rows
 * dstStride  - destination stride in bytes
 * src1Stride - stride of src1 in bytes
 * h          - row counter; an odd count gets one leading row, then the loop
 *              handles 4 rows per pass and exits only at exactly zero
 *
 * NOTE(review): PAVGB is applied directly to memory operands here although
 * the rows are loaded/stored with movd; the access width of those memory
 * operands depends on what PAVGB expands to (defined outside this chunk) -
 * confirm.
 */
00229 static void DEF(avg_pixels4_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00230 {
00231 __asm__ volatile(
/* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride, %5 = dstStride */
/* odd h: handle a single leading row, otherwise jump straight to the loop */
00232 "testl $1, %0 \n\t"
00233 " jz 1f \n\t"
00234 "movd (%1), %%mm0 \n\t"
00235 "movd (%2), %%mm1 \n\t"
00236 "add %4, %1 \n\t"
00237 "add $4, %2 \n\t"
00238 PAVGB" %%mm1, %%mm0 \n\t"
00239 PAVGB" (%3), %%mm0 \n\t"
00240 "movd %%mm0, (%3) \n\t"
00241 "add %5, %3 \n\t"
00242 "decl %0 \n\t"
00243 "1: \n\t"
/* rows 0 and 1 of this pass: src2 offsets 0 and 4 */
00244 "movd (%1), %%mm0 \n\t"
00245 "add %4, %1 \n\t"
00246 "movd (%1), %%mm1 \n\t"
00247 "add %4, %1 \n\t"
00248 PAVGB" (%2), %%mm0 \n\t"
00249 PAVGB" 4(%2), %%mm1 \n\t"
/* blend with the current destination row before storing */
00250 PAVGB" (%3), %%mm0 \n\t"
00251 "movd %%mm0, (%3) \n\t"
00252 "add %5, %3 \n\t"
00253 PAVGB" (%3), %%mm1 \n\t"
00254 "movd %%mm1, (%3) \n\t"
00255 "add %5, %3 \n\t"
/* rows 2 and 3 of this pass: src2 offsets 8 and 12 */
00256 "movd (%1), %%mm0 \n\t"
00257 "add %4, %1 \n\t"
00258 "movd (%1), %%mm1 \n\t"
00259 "add %4, %1 \n\t"
00260 PAVGB" 8(%2), %%mm0 \n\t"
00261 PAVGB" 12(%2), %%mm1 \n\t"
00262 PAVGB" (%3), %%mm0 \n\t"
00263 "movd %%mm0, (%3) \n\t"
00264 "add %5, %3 \n\t"
00265 PAVGB" (%3), %%mm1 \n\t"
00266 "movd %%mm1, (%3) \n\t"
00267 "add %5, %3 \n\t"
/* src2 consumed four 4-byte rows */
00268 "add $16, %2 \n\t"
00269 "subl $4, %0 \n\t"
00270 "jnz 1b \n\t"
/* h lives in memory when ebx is unavailable, otherwise in ebx */
00271 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00272 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00273 #else
00274 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00275 #endif
00276 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00277 :"memory");
00278 }
00279
00280
/*
 * For an 8-byte-wide block: dst = PAVGB(PAVGB(src1 row, src2 row), dst), i.e.
 * the two-source average is blended with what dst already holds.
 *
 * dst        - destination (read and written), stepped by dstStride per row
 * src1       - first source, stepped by src1Stride per row
 * src2       - second source, read as consecutive 8-byte rows
 * dstStride  - destination stride in bytes
 * src1Stride - stride of src1 in bytes
 * h          - row counter; an odd count gets one leading row, then the loop
 *              handles 4 rows per pass and exits only at exactly zero
 *
 * NOTE(review): DEF and PAVGB are macros defined outside this chunk; PAVGB is
 * presumably a packed unsigned byte average - confirm.
 */
00281 static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00282 {
00283 __asm__ volatile(
/* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride, %5 = dstStride */
/* odd h: handle a single leading row, otherwise jump straight to the loop */
00284 "testl $1, %0 \n\t"
00285 " jz 1f \n\t"
00286 "movq (%1), %%mm0 \n\t"
00287 "movq (%2), %%mm1 \n\t"
00288 "add %4, %1 \n\t"
00289 "add $8, %2 \n\t"
00290 PAVGB" %%mm1, %%mm0 \n\t"
00291 PAVGB" (%3), %%mm0 \n\t"
00292 "movq %%mm0, (%3) \n\t"
00293 "add %5, %3 \n\t"
00294 "decl %0 \n\t"
00295 "1: \n\t"
/* rows 0 and 1 of this pass: src2 offsets 0 and 8 */
00296 "movq (%1), %%mm0 \n\t"
00297 "add %4, %1 \n\t"
00298 "movq (%1), %%mm1 \n\t"
00299 "add %4, %1 \n\t"
00300 PAVGB" (%2), %%mm0 \n\t"
00301 PAVGB" 8(%2), %%mm1 \n\t"
/* blend with the current destination row before storing */
00302 PAVGB" (%3), %%mm0 \n\t"
00303 "movq %%mm0, (%3) \n\t"
00304 "add %5, %3 \n\t"
00305 PAVGB" (%3), %%mm1 \n\t"
00306 "movq %%mm1, (%3) \n\t"
00307 "add %5, %3 \n\t"
/* rows 2 and 3 of this pass: src2 offsets 16 and 24 */
00308 "movq (%1), %%mm0 \n\t"
00309 "add %4, %1 \n\t"
00310 "movq (%1), %%mm1 \n\t"
00311 "add %4, %1 \n\t"
00312 PAVGB" 16(%2), %%mm0 \n\t"
00313 PAVGB" 24(%2), %%mm1 \n\t"
00314 PAVGB" (%3), %%mm0 \n\t"
00315 "movq %%mm0, (%3) \n\t"
00316 "add %5, %3 \n\t"
00317 PAVGB" (%3), %%mm1 \n\t"
00318 "movq %%mm1, (%3) \n\t"
00319 "add %5, %3 \n\t"
/* src2 consumed four 8-byte rows */
00320 "add $32, %2 \n\t"
00321 "subl $4, %0 \n\t"
00322 "jnz 1b \n\t"
/* h lives in memory when ebx is unavailable, otherwise in ebx */
00323 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00324 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00325 #else
00326 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00327 #endif
00328 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00329 :"memory");
00330
00331
00332
00333
00334 }
00335
/*
 * Horizontal two-sample "put" for a 16-byte-wide block: every output row is
 * PAVGB(pixels[x], pixels[x + 1]) stored to block, done as two 8-byte halves
 * (offsets 0 and 8, paired with source offsets 1 and 9).
 *
 * block     - destination pointer, stepped by line_size per row
 * pixels    - source pointer, stepped by line_size per row
 * line_size - byte stride used for both pointers
 * h         - row counter; reduced by 4 per loop pass, the loop only exits
 *             when it reaches exactly zero
 *
 * NOTE(review): DEF, PAVGB and REG_a are macros defined outside this chunk;
 * PAVGB is presumably a packed unsigned byte average - confirm.
 */
00336 static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00337 {
00338 __asm__ volatile(
/* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
/* REG_a = 2 * line_size, the step for one pair of rows */
00339 "lea (%3, %3), %%"REG_a" \n\t"
00340 "1: \n\t"
/* rows 0 and 1: mm0/mm1 hold the left halves, mm2/mm3 the right halves */
00341 "movq (%1), %%mm0 \n\t"
00342 "movq (%1, %3), %%mm1 \n\t"
00343 "movq 8(%1), %%mm2 \n\t"
00344 "movq 8(%1, %3), %%mm3 \n\t"
00345 PAVGB" 1(%1), %%mm0 \n\t"
00346 PAVGB" 1(%1, %3), %%mm1 \n\t"
00347 PAVGB" 9(%1), %%mm2 \n\t"
00348 PAVGB" 9(%1, %3), %%mm3 \n\t"
00349 "movq %%mm0, (%2) \n\t"
00350 "movq %%mm1, (%2, %3) \n\t"
00351 "movq %%mm2, 8(%2) \n\t"
00352 "movq %%mm3, 8(%2, %3) \n\t"
00353 "add %%"REG_a", %1 \n\t"
00354 "add %%"REG_a", %2 \n\t"
/* rows 2 and 3, same layout */
00355 "movq (%1), %%mm0 \n\t"
00356 "movq (%1, %3), %%mm1 \n\t"
00357 "movq 8(%1), %%mm2 \n\t"
00358 "movq 8(%1, %3), %%mm3 \n\t"
00359 PAVGB" 1(%1), %%mm0 \n\t"
00360 PAVGB" 1(%1, %3), %%mm1 \n\t"
00361 PAVGB" 9(%1), %%mm2 \n\t"
00362 PAVGB" 9(%1, %3), %%mm3 \n\t"
00363 "add %%"REG_a", %1 \n\t"
00364 "movq %%mm0, (%2) \n\t"
00365 "movq %%mm1, (%2, %3) \n\t"
00366 "movq %%mm2, 8(%2) \n\t"
00367 "movq %%mm3, 8(%2, %3) \n\t"
00368 "add %%"REG_a", %2 \n\t"
/* four rows done; jnz tests the flags left by subl */
00369 "subl $4, %0 \n\t"
00370 "jnz 1b \n\t"
00371 :"+g"(h), "+S"(pixels), "+D"(block)
00372 :"r" ((x86_reg)line_size)
00373 :"%"REG_a, "memory");
00374 }
00375
/*
 * Writes PAVGB(src1 row, src2 row) to dst for a 16-byte-wide block, each row
 * handled as two 8-byte halves.
 *
 * dst        - destination, stepped by dstStride per row
 * src1       - first source, stepped by src1Stride per row
 * src2       - second source, read as consecutive 16-byte rows (advanced by
 *              16 per row, i.e. 32 per loop pass)
 * dstStride  - destination stride in bytes
 * src1Stride - stride of src1 in bytes
 * h          - row counter; an odd count gets one leading row, then the loop
 *              handles 2 rows per pass and exits only at exactly zero
 *
 * NOTE(review): DEF and PAVGB are macros defined outside this chunk; PAVGB is
 * presumably a packed unsigned byte average - confirm.
 */
00376 static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00377 {
00378 __asm__ volatile(
/* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride, %5 = dstStride */
/* odd h: handle a single leading row, otherwise jump straight to the loop */
00379 "testl $1, %0 \n\t"
00380 " jz 1f \n\t"
00381 "movq (%1), %%mm0 \n\t"
00382 "movq 8(%1), %%mm1 \n\t"
00383 PAVGB" (%2), %%mm0 \n\t"
00384 PAVGB" 8(%2), %%mm1 \n\t"
00385 "add %4, %1 \n\t"
00386 "add $16, %2 \n\t"
00387 "movq %%mm0, (%3) \n\t"
00388 "movq %%mm1, 8(%3) \n\t"
00389 "add %5, %3 \n\t"
00390 "decl %0 \n\t"
00391 "1: \n\t"
/* row 0 of this pass: src2 offsets 0 and 8 */
00392 "movq (%1), %%mm0 \n\t"
00393 "movq 8(%1), %%mm1 \n\t"
00394 "add %4, %1 \n\t"
00395 PAVGB" (%2), %%mm0 \n\t"
00396 PAVGB" 8(%2), %%mm1 \n\t"
00397 "movq %%mm0, (%3) \n\t"
00398 "movq %%mm1, 8(%3) \n\t"
00399 "add %5, %3 \n\t"
/* row 1 of this pass: src2 offsets 16 and 24 */
00400 "movq (%1), %%mm0 \n\t"
00401 "movq 8(%1), %%mm1 \n\t"
00402 "add %4, %1 \n\t"
00403 PAVGB" 16(%2), %%mm0 \n\t"
00404 PAVGB" 24(%2), %%mm1 \n\t"
00405 "movq %%mm0, (%3) \n\t"
00406 "movq %%mm1, 8(%3) \n\t"
00407 "add %5, %3 \n\t"
/* src2 consumed two 16-byte rows */
00408 "add $32, %2 \n\t"
00409 "subl $2, %0 \n\t"
00410 "jnz 1b \n\t"
/* h lives in memory when ebx is unavailable, otherwise in ebx */
00411 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00412 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00413 #else
00414 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00415 #endif
00416 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00417 :"memory");
00418
00419
00420
00421
00422 }
00423
/*
 * For a 16-byte-wide block: dst = PAVGB(PAVGB(src1 row, src2 row), dst), each
 * row handled as two 8-byte halves.
 *
 * dst        - destination (read and written), stepped by dstStride per row
 * src1       - first source, stepped by src1Stride per row
 * src2       - second source, read as consecutive 16-byte rows
 * dstStride  - destination stride in bytes
 * src1Stride - stride of src1 in bytes
 * h          - row counter; an odd count gets one leading row, then the loop
 *              handles 2 rows per pass and exits only at exactly zero
 *
 * NOTE(review): DEF and PAVGB are macros defined outside this chunk; PAVGB is
 * presumably a packed unsigned byte average - confirm.
 */
00424 static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00425 {
00426 __asm__ volatile(
/* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride, %5 = dstStride */
/* odd h: handle a single leading row, otherwise jump straight to the loop */
00427 "testl $1, %0 \n\t"
00428 " jz 1f \n\t"
00429 "movq (%1), %%mm0 \n\t"
00430 "movq 8(%1), %%mm1 \n\t"
00431 PAVGB" (%2), %%mm0 \n\t"
00432 PAVGB" 8(%2), %%mm1 \n\t"
00433 "add %4, %1 \n\t"
00434 "add $16, %2 \n\t"
00435 PAVGB" (%3), %%mm0 \n\t"
00436 PAVGB" 8(%3), %%mm1 \n\t"
00437 "movq %%mm0, (%3) \n\t"
00438 "movq %%mm1, 8(%3) \n\t"
00439 "add %5, %3 \n\t"
00440 "decl %0 \n\t"
00441 "1: \n\t"
/* row 0 of this pass: src2 offsets 0 and 8, then blend with dst */
00442 "movq (%1), %%mm0 \n\t"
00443 "movq 8(%1), %%mm1 \n\t"
00444 "add %4, %1 \n\t"
00445 PAVGB" (%2), %%mm0 \n\t"
00446 PAVGB" 8(%2), %%mm1 \n\t"
00447 PAVGB" (%3), %%mm0 \n\t"
00448 PAVGB" 8(%3), %%mm1 \n\t"
00449 "movq %%mm0, (%3) \n\t"
00450 "movq %%mm1, 8(%3) \n\t"
00451 "add %5, %3 \n\t"
/* row 1 of this pass: src2 offsets 16 and 24, then blend with dst */
00452 "movq (%1), %%mm0 \n\t"
00453 "movq 8(%1), %%mm1 \n\t"
00454 "add %4, %1 \n\t"
00455 PAVGB" 16(%2), %%mm0 \n\t"
00456 PAVGB" 24(%2), %%mm1 \n\t"
00457 PAVGB" (%3), %%mm0 \n\t"
00458 PAVGB" 8(%3), %%mm1 \n\t"
00459 "movq %%mm0, (%3) \n\t"
00460 "movq %%mm1, 8(%3) \n\t"
00461 "add %5, %3 \n\t"
/* src2 consumed two 16-byte rows */
00462 "add $32, %2 \n\t"
00463 "subl $2, %0 \n\t"
00464 "jnz 1b \n\t"
/* h lives in memory when ebx is unavailable, otherwise in ebx */
00465 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00466 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00467 #else
00468 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00469 #endif
00470 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00471 :"memory");
00472
00473
00474
00475
00476 }
00477
/*
 * 16-byte-wide two-source put in which every input is XORed with an all-ones
 * mask (mm6) before PAVGB and the result is XORed with the mask again before
 * it is stored to dst. Each row is handled as two 8-byte halves.
 *
 * dst        - destination, stepped by dstStride per row
 * src1       - first source, stepped by src1Stride per row
 * src2       - second source, read as consecutive 16-byte rows
 * dstStride  - destination stride in bytes
 * src1Stride - stride of src1 in bytes
 * h          - row counter; an odd count gets one leading row, then the loop
 *              handles 2 rows per pass and exits only at exactly zero
 *
 * NOTE(review): the complement / average / complement sequence presumably
 * flips PAVGB's rounding direction, matching the "no_rnd" name - confirm.
 * DEF and PAVGB are macros defined outside this chunk.
 */
00478 static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
00479 {
00480 __asm__ volatile(
/* operands: %0 = h, %1 = src1, %2 = src2, %3 = dst, %4 = src1Stride, %5 = dstStride */
/* comparing mm6 with itself sets every bit: mm6 is the complement mask */
00481 "pcmpeqb %%mm6, %%mm6 \n\t"
/* odd h: handle a single leading row, otherwise jump straight to the loop */
00482 "testl $1, %0 \n\t"
00483 " jz 1f \n\t"
00484 "movq (%1), %%mm0 \n\t"
00485 "movq 8(%1), %%mm1 \n\t"
00486 "movq (%2), %%mm2 \n\t"
00487 "movq 8(%2), %%mm3 \n\t"
00488 "pxor %%mm6, %%mm0 \n\t"
00489 "pxor %%mm6, %%mm1 \n\t"
00490 "pxor %%mm6, %%mm2 \n\t"
00491 "pxor %%mm6, %%mm3 \n\t"
00492 PAVGB" %%mm2, %%mm0 \n\t"
00493 PAVGB" %%mm3, %%mm1 \n\t"
00494 "pxor %%mm6, %%mm0 \n\t"
00495 "pxor %%mm6, %%mm1 \n\t"
00496 "add %4, %1 \n\t"
00497 "add $16, %2 \n\t"
00498 "movq %%mm0, (%3) \n\t"
00499 "movq %%mm1, 8(%3) \n\t"
00500 "add %5, %3 \n\t"
00501 "decl %0 \n\t"
00502 "1: \n\t"
/* row 0 of this pass: src2 offsets 0 and 8 */
00503 "movq (%1), %%mm0 \n\t"
00504 "movq 8(%1), %%mm1 \n\t"
00505 "add %4, %1 \n\t"
00506 "movq (%2), %%mm2 \n\t"
00507 "movq 8(%2), %%mm3 \n\t"
00508 "pxor %%mm6, %%mm0 \n\t"
00509 "pxor %%mm6, %%mm1 \n\t"
00510 "pxor %%mm6, %%mm2 \n\t"
00511 "pxor %%mm6, %%mm3 \n\t"
00512 PAVGB" %%mm2, %%mm0 \n\t"
00513 PAVGB" %%mm3, %%mm1 \n\t"
00514 "pxor %%mm6, %%mm0 \n\t"
00515 "pxor %%mm6, %%mm1 \n\t"
00516 "movq %%mm0, (%3) \n\t"
00517 "movq %%mm1, 8(%3) \n\t"
00518 "add %5, %3 \n\t"
/* row 1 of this pass: src2 offsets 16 and 24 */
00519 "movq (%1), %%mm0 \n\t"
00520 "movq 8(%1), %%mm1 \n\t"
00521 "add %4, %1 \n\t"
00522 "movq 16(%2), %%mm2 \n\t"
00523 "movq 24(%2), %%mm3 \n\t"
00524 "pxor %%mm6, %%mm0 \n\t"
00525 "pxor %%mm6, %%mm1 \n\t"
00526 "pxor %%mm6, %%mm2 \n\t"
00527 "pxor %%mm6, %%mm3 \n\t"
00528 PAVGB" %%mm2, %%mm0 \n\t"
00529 PAVGB" %%mm3, %%mm1 \n\t"
00530 "pxor %%mm6, %%mm0 \n\t"
00531 "pxor %%mm6, %%mm1 \n\t"
00532 "movq %%mm0, (%3) \n\t"
00533 "movq %%mm1, 8(%3) \n\t"
00534 "add %5, %3 \n\t"
/* src2 consumed two 16-byte rows */
00535 "add $32, %2 \n\t"
00536 "subl $2, %0 \n\t"
00537 "jnz 1b \n\t"
/* h lives in memory when ebx is unavailable, otherwise in ebx */
00538 #if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
00539 :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00540 #else
00541 :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
00542 #endif
00543 :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
00544 :"memory");
00545
00546
00547
00548
00549 }
00550
00551
/*
 * Horizontal two-sample "put" for an 8-byte-wide block with a rounding
 * adjustment: the sample at pixels[x] has mm6 subtracted with unsigned
 * saturation (psubusb) before it is PAVGB-averaged with pixels[x + 1].
 *
 * block     - destination pointer, stepped by line_size per row
 * pixels    - source pointer, stepped by line_size per row, read at +0 and +1
 * line_size - byte stride used for both pointers
 * h         - row counter; reduced by 4 per loop pass, the loop only exits
 *             when it reaches exactly zero
 *
 * NOTE(review): MOVQ_BONE is defined outside this chunk; presumably it loads
 * the byte value 1 into every lane of mm6 - confirm. Because the subtraction
 * saturates, a source byte of 0 is left unchanged by it.
 */
00552 static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00553 {
00554 MOVQ_BONE(mm6);
00555 __asm__ volatile(
/* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
/* REG_a = 2 * line_size, the step for one pair of rows */
00556 "lea (%3, %3), %%"REG_a" \n\t"
00557 "1: \n\t"
/* rows 0 and 1: mm0/mm2 = samples at x, mm1/mm3 = samples at x + 1 */
00558 "movq (%1), %%mm0 \n\t"
00559 "movq (%1, %3), %%mm2 \n\t"
00560 "movq 1(%1), %%mm1 \n\t"
00561 "movq 1(%1, %3), %%mm3 \n\t"
00562 "add %%"REG_a", %1 \n\t"
00563 "psubusb %%mm6, %%mm0 \n\t"
00564 "psubusb %%mm6, %%mm2 \n\t"
00565 PAVGB" %%mm1, %%mm0 \n\t"
00566 PAVGB" %%mm3, %%mm2 \n\t"
00567 "movq %%mm0, (%2) \n\t"
00568 "movq %%mm2, (%2, %3) \n\t"
/* rows 2 and 3, same register roles */
00569 "movq (%1), %%mm0 \n\t"
00570 "movq 1(%1), %%mm1 \n\t"
00571 "movq (%1, %3), %%mm2 \n\t"
00572 "movq 1(%1, %3), %%mm3 \n\t"
00573 "add %%"REG_a", %2 \n\t"
00574 "add %%"REG_a", %1 \n\t"
00575 "psubusb %%mm6, %%mm0 \n\t"
00576 "psubusb %%mm6, %%mm2 \n\t"
00577 PAVGB" %%mm1, %%mm0 \n\t"
00578 PAVGB" %%mm3, %%mm2 \n\t"
00579 "movq %%mm0, (%2) \n\t"
00580 "movq %%mm2, (%2, %3) \n\t"
00581 "add %%"REG_a", %2 \n\t"
/* four rows done; jnz tests the flags left by subl */
00582 "subl $4, %0 \n\t"
00583 "jnz 1b \n\t"
00584 :"+g"(h), "+S"(pixels), "+D"(block)
00585 :"r" ((x86_reg)line_size)
00586 :"%"REG_a, "memory");
00587 }
00588
/*
 * Vertical two-sample "put" for an 8-byte-wide block: output row r is
 * PAVGB(source row r, source row r + 1). One source row beyond the rows
 * written is therefore read.
 *
 * block     - destination pointer; moved back by line_size up front so the
 *             stores can use the (block, line_size) / (block, 2*line_size)
 *             addressing forms
 * pixels    - source pointer, stepped by 2 * line_size twice per loop pass
 * line_size - byte stride used for both pointers
 * h         - row counter; reduced by 4 per loop pass, the loop only exits
 *             when it reaches exactly zero
 *
 * NOTE(review): DEF, PAVGB and REG_a are macros defined outside this chunk;
 * PAVGB is presumably a packed unsigned byte average - confirm.
 */
00589 static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00590 {
00591 __asm__ volatile(
/* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
/* REG_a = 2 * line_size */
00592 "lea (%3, %3), %%"REG_a" \n\t"
/* mm0 = first source row; it is reloaded at the end of each pass */
00593 "movq (%1), %%mm0 \n\t"
00594 "sub %3, %2 \n\t"
00595 "1: \n\t"
/* mm1 = next row, mm2 = the row after; produce two output rows */
00596 "movq (%1, %3), %%mm1 \n\t"
00597 "movq (%1, %%"REG_a"), %%mm2 \n\t"
00598 "add %%"REG_a", %1 \n\t"
00599 PAVGB" %%mm1, %%mm0 \n\t"
00600 PAVGB" %%mm2, %%mm1 \n\t"
00601 "movq %%mm0, (%2, %3) \n\t"
00602 "movq %%mm1, (%2, %%"REG_a") \n\t"
/* second half: mm2 is the carried row, mm0 receives the row carried into the next pass */
00603 "movq (%1, %3), %%mm1 \n\t"
00604 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00605 "add %%"REG_a", %2 \n\t"
00606 "add %%"REG_a", %1 \n\t"
00607 PAVGB" %%mm1, %%mm2 \n\t"
00608 PAVGB" %%mm0, %%mm1 \n\t"
00609 "movq %%mm2, (%2, %3) \n\t"
00610 "movq %%mm1, (%2, %%"REG_a") \n\t"
00611 "add %%"REG_a", %2 \n\t"
/* four rows done; jnz tests the flags left by subl */
00612 "subl $4, %0 \n\t"
00613 "jnz 1b \n\t"
00614 :"+g"(h), "+S"(pixels), "+D" (block)
00615 :"r" ((x86_reg)line_size)
00616 :"%"REG_a, "memory");
00617 }
00618
00619
/*
 * Vertical two-sample "put" for an 8-byte-wide block with a rounding
 * adjustment: in each half of the loop the middle row (mm1) has mm6
 * subtracted with unsigned saturation before it takes part in both PAVGB
 * operations of that half; the rows held in mm0 and mm2 are used unmodified.
 *
 * block     - destination pointer; moved back by line_size up front so the
 *             stores can use the (block, line_size) / (block, 2*line_size)
 *             addressing forms
 * pixels    - source pointer, stepped by 2 * line_size twice per loop pass
 * line_size - byte stride used for both pointers
 * h         - row counter; reduced by 4 per loop pass, the loop only exits
 *             when it reaches exactly zero
 *
 * NOTE(review): MOVQ_BONE is defined outside this chunk; presumably it loads
 * the byte value 1 into every lane of mm6 - confirm.
 */
00620 static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00621 {
00622 MOVQ_BONE(mm6);
00623 __asm__ volatile(
/* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
/* REG_a = 2 * line_size */
00624 "lea (%3, %3), %%"REG_a" \n\t"
/* mm0 = first source row; it is reloaded at the end of each pass */
00625 "movq (%1), %%mm0 \n\t"
00626 "sub %3, %2 \n\t"
00627 "1: \n\t"
00628 "movq (%1, %3), %%mm1 \n\t"
00629 "movq (%1, %%"REG_a"), %%mm2 \n\t"
00630 "add %%"REG_a", %1 \n\t"
/* adjust the shared middle row once, then use it for both outputs */
00631 "psubusb %%mm6, %%mm1 \n\t"
00632 PAVGB" %%mm1, %%mm0 \n\t"
00633 PAVGB" %%mm2, %%mm1 \n\t"
00634 "movq %%mm0, (%2, %3) \n\t"
00635 "movq %%mm1, (%2, %%"REG_a") \n\t"
/* second half: mm2 is the carried row, mm0 receives the row carried into the next pass */
00636 "movq (%1, %3), %%mm1 \n\t"
00637 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00638 "add %%"REG_a", %2 \n\t"
00639 "add %%"REG_a", %1 \n\t"
00640 "psubusb %%mm6, %%mm1 \n\t"
00641 PAVGB" %%mm1, %%mm2 \n\t"
00642 PAVGB" %%mm0, %%mm1 \n\t"
00643 "movq %%mm2, (%2, %3) \n\t"
00644 "movq %%mm1, (%2, %%"REG_a") \n\t"
00645 "add %%"REG_a", %2 \n\t"
/* four rows done; jnz tests the flags left by subl */
00646 "subl $4, %0 \n\t"
00647 "jnz 1b \n\t"
00648 :"+g"(h), "+S"(pixels), "+D" (block)
00649 :"r" ((x86_reg)line_size)
00650 :"%"REG_a, "memory");
00651 }
00652
/*
 * For an 8-byte-wide block: block = PAVGB(block, pixels), row by row.
 *
 * block     - destination (read and written), stepped by line_size per row
 * pixels    - source pointer, stepped by line_size per row
 * line_size - byte stride used for both pointers
 * h         - row counter; reduced by 4 per loop pass, the loop only exits
 *             when it reaches exactly zero
 *
 * NOTE(review): DEF, PAVGB and REG_a are macros defined outside this chunk;
 * PAVGB is presumably a packed unsigned byte average - confirm.
 */
00653 static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00654 {
00655 __asm__ volatile(
/* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
/* REG_a = 2 * line_size, the step for one pair of rows */
00656 "lea (%3, %3), %%"REG_a" \n\t"
00657 "1: \n\t"
/* rows 0 and 1: load destination rows, average in the source rows */
00658 "movq (%2), %%mm0 \n\t"
00659 "movq (%2, %3), %%mm1 \n\t"
00660 PAVGB" (%1), %%mm0 \n\t"
00661 PAVGB" (%1, %3), %%mm1 \n\t"
00662 "movq %%mm0, (%2) \n\t"
00663 "movq %%mm1, (%2, %3) \n\t"
00664 "add %%"REG_a", %1 \n\t"
00665 "add %%"REG_a", %2 \n\t"
/* rows 2 and 3 */
00666 "movq (%2), %%mm0 \n\t"
00667 "movq (%2, %3), %%mm1 \n\t"
00668 PAVGB" (%1), %%mm0 \n\t"
00669 PAVGB" (%1, %3), %%mm1 \n\t"
00670 "add %%"REG_a", %1 \n\t"
00671 "movq %%mm0, (%2) \n\t"
00672 "movq %%mm1, (%2, %3) \n\t"
00673 "add %%"REG_a", %2 \n\t"
/* four rows done; jnz tests the flags left by subl */
00674 "subl $4, %0 \n\t"
00675 "jnz 1b \n\t"
00676 :"+g"(h), "+S"(pixels), "+D"(block)
00677 :"r" ((x86_reg)line_size)
00678 :"%"REG_a, "memory");
00679 }
00680
/*
 * For an 8-byte-wide block:
 * block = PAVGB(PAVGB(pixels[x], pixels[x + 1]), block), row by row.
 *
 * block     - destination (read and written), stepped by line_size per row
 * pixels    - source pointer, stepped by line_size per row, read at +0 and +1
 * line_size - byte stride used for both pointers
 * h         - row counter; reduced by 4 per loop pass, the loop only exits
 *             when it reaches exactly zero
 *
 * NOTE(review): DEF, PAVGB and REG_a are macros defined outside this chunk;
 * PAVGB is presumably a packed unsigned byte average - confirm.
 */
00681 static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00682 {
00683 __asm__ volatile(
/* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
/* REG_a = 2 * line_size, the step for one pair of rows */
00684 "lea (%3, %3), %%"REG_a" \n\t"
00685 "1: \n\t"
/* rows 0 and 1: horizontal average, then blend with the destination */
00686 "movq (%1), %%mm0 \n\t"
00687 "movq (%1, %3), %%mm2 \n\t"
00688 PAVGB" 1(%1), %%mm0 \n\t"
00689 PAVGB" 1(%1, %3), %%mm2 \n\t"
00690 PAVGB" (%2), %%mm0 \n\t"
00691 PAVGB" (%2, %3), %%mm2 \n\t"
00692 "add %%"REG_a", %1 \n\t"
00693 "movq %%mm0, (%2) \n\t"
00694 "movq %%mm2, (%2, %3) \n\t"
/* rows 2 and 3: block is advanced before it is read for the blend */
00695 "movq (%1), %%mm0 \n\t"
00696 "movq (%1, %3), %%mm2 \n\t"
00697 PAVGB" 1(%1), %%mm0 \n\t"
00698 PAVGB" 1(%1, %3), %%mm2 \n\t"
00699 "add %%"REG_a", %2 \n\t"
00700 "add %%"REG_a", %1 \n\t"
00701 PAVGB" (%2), %%mm0 \n\t"
00702 PAVGB" (%2, %3), %%mm2 \n\t"
00703 "movq %%mm0, (%2) \n\t"
00704 "movq %%mm2, (%2, %3) \n\t"
00705 "add %%"REG_a", %2 \n\t"
/* four rows done; jnz tests the flags left by subl */
00706 "subl $4, %0 \n\t"
00707 "jnz 1b \n\t"
00708 :"+g"(h), "+S"(pixels), "+D"(block)
00709 :"r" ((x86_reg)line_size)
00710 :"%"REG_a, "memory");
00711 }
00712
/*
 * For an 8-byte-wide block:
 * block row r = PAVGB(PAVGB(source row r, source row r + 1), block row r).
 * One source row beyond the rows written is therefore read.
 *
 * block     - destination (read and written); moved back by line_size up
 *             front so rows are addressed as (block, line_size) and
 *             (block, 2*line_size)
 * pixels    - source pointer, stepped by 2 * line_size twice per loop pass
 * line_size - byte stride used for both pointers
 * h         - row counter; reduced by 4 per loop pass, the loop only exits
 *             when it reaches exactly zero
 *
 * NOTE(review): DEF, PAVGB and REG_a are macros defined outside this chunk;
 * PAVGB is presumably a packed unsigned byte average - confirm.
 */
00713 static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00714 {
00715 __asm__ volatile(
/* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
/* REG_a = 2 * line_size */
00716 "lea (%3, %3), %%"REG_a" \n\t"
/* mm0 = first source row; it is reloaded at the end of each pass */
00717 "movq (%1), %%mm0 \n\t"
00718 "sub %3, %2 \n\t"
00719 "1: \n\t"
00720 "movq (%1, %3), %%mm1 \n\t"
00721 "movq (%1, %%"REG_a"), %%mm2 \n\t"
00722 "add %%"REG_a", %1 \n\t"
00723 PAVGB" %%mm1, %%mm0 \n\t"
00724 PAVGB" %%mm2, %%mm1 \n\t"
/* mm3/mm4 = current destination rows, blended into the vertical averages */
00725 "movq (%2, %3), %%mm3 \n\t"
00726 "movq (%2, %%"REG_a"), %%mm4 \n\t"
00727 PAVGB" %%mm3, %%mm0 \n\t"
00728 PAVGB" %%mm4, %%mm1 \n\t"
00729 "movq %%mm0, (%2, %3) \n\t"
00730 "movq %%mm1, (%2, %%"REG_a") \n\t"
/* second half: mm2 is the carried row, mm0 receives the row carried into the next pass */
00731 "movq (%1, %3), %%mm1 \n\t"
00732 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00733 PAVGB" %%mm1, %%mm2 \n\t"
00734 PAVGB" %%mm0, %%mm1 \n\t"
00735 "add %%"REG_a", %2 \n\t"
00736 "add %%"REG_a", %1 \n\t"
00737 "movq (%2, %3), %%mm3 \n\t"
00738 "movq (%2, %%"REG_a"), %%mm4 \n\t"
00739 PAVGB" %%mm3, %%mm2 \n\t"
00740 PAVGB" %%mm4, %%mm1 \n\t"
00741 "movq %%mm2, (%2, %3) \n\t"
00742 "movq %%mm1, (%2, %%"REG_a") \n\t"
00743 "add %%"REG_a", %2 \n\t"
/* four rows done; jnz tests the flags left by subl */
00744 "subl $4, %0 \n\t"
00745 "jnz 1b \n\t"
00746 :"+g"(h), "+S"(pixels), "+D"(block)
00747 :"r" ((x86_reg)line_size)
00748 :"%"REG_a, "memory");
00749 }
00750
00751
00752
/*
 * 2x2-neighbourhood variant for an 8-byte-wide block, built from nested
 * pairwise PAVGB operations: each source row is first averaged horizontally
 * (x with x + 1), adjacent horizontally averaged rows are then averaged
 * vertically, and the result is averaged with the existing block row before
 * being stored.
 *
 * block     - destination (read and written), stepped by 2 * line_size twice
 *             per loop pass
 * pixels    - source pointer, stepped by 2 * line_size twice per loop pass;
 *             one row beyond the rows written and one byte beyond the block
 *             width are read
 * line_size - byte stride used for both pointers
 * h         - row counter; reduced by 4 per loop pass, the loop only exits
 *             when it reaches exactly zero
 *
 * NOTE(review): nesting pairwise averages is not in general identical to a
 * single rounded four-sample mean. Only the row loaded into mm2 in the first
 * half of each pass gets the saturating mm6 subtraction - presumably a
 * rounding compensation; confirm the intended accuracy. MOVQ_BONE and
 * ASMALIGN are defined outside this chunk (presumably "load byte-wise 1"
 * and "align the loop label").
 */
00753 static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00754 {
00755 MOVQ_BONE(mm6);
00756 __asm__ volatile(
/* operands: %0 = h, %1 = pixels, %2 = block, %3 = line_size */
/* REG_a = 2 * line_size */
00757 "lea (%3, %3), %%"REG_a" \n\t"
/* mm0 = horizontal average of the first source row, carried across passes */
00758 "movq (%1), %%mm0 \n\t"
00759 PAVGB" 1(%1), %%mm0 \n\t"
00760 ASMALIGN(3)
00761 "1: \n\t"
/* mm1 = next row, mm2 = the row after (mm6 subtracted), both averaged horizontally */
00762 "movq (%1, %%"REG_a"), %%mm2 \n\t"
00763 "movq (%1, %3), %%mm1 \n\t"
00764 "psubusb %%mm6, %%mm2 \n\t"
00765 PAVGB" 1(%1, %3), %%mm1 \n\t"
00766 PAVGB" 1(%1, %%"REG_a"), %%mm2 \n\t"
00767 "add %%"REG_a", %1 \n\t"
/* vertical step, then blend with the destination rows */
00768 PAVGB" %%mm1, %%mm0 \n\t"
00769 PAVGB" %%mm2, %%mm1 \n\t"
00770 PAVGB" (%2), %%mm0 \n\t"
00771 PAVGB" (%2, %3), %%mm1 \n\t"
00772 "movq %%mm0, (%2) \n\t"
00773 "movq %%mm1, (%2, %3) \n\t"
/* second half: mm2 is the carried row, mm0 receives the row carried into the next pass */
00774 "movq (%1, %3), %%mm1 \n\t"
00775 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00776 PAVGB" 1(%1, %3), %%mm1 \n\t"
00777 PAVGB" 1(%1, %%"REG_a"), %%mm0 \n\t"
00778 "add %%"REG_a", %2 \n\t"
00779 "add %%"REG_a", %1 \n\t"
00780 PAVGB" %%mm1, %%mm2 \n\t"
00781 PAVGB" %%mm0, %%mm1 \n\t"
00782 PAVGB" (%2), %%mm2 \n\t"
00783 PAVGB" (%2, %3), %%mm1 \n\t"
00784 "movq %%mm2, (%2) \n\t"
00785 "movq %%mm1, (%2, %3) \n\t"
00786 "add %%"REG_a", %2 \n\t"
/* four rows done; jnz tests the flags left by subl */
00787 "subl $4, %0 \n\t"
00788 "jnz 1b \n\t"
00789 :"+g"(h), "+S"(pixels), "+D"(block)
00790 :"r" ((x86_reg)line_size)
00791 :"%"REG_a, "memory");
00792 }
00793
/*
 * For a 4-byte-wide block: block = PAVGB(block, pixels), four rows per pass
 * of a C-level do/while loop.
 *
 * block     - destination (read and written); advanced by 4 * line_size per
 *             pass in C, not inside the asm
 * pixels    - source pointer; advanced by 4 * line_size per pass
 * line_size - byte stride used for both pointers
 * h         - row count; reduced by 4 per pass, looping while it stays above
 *             zero. Being a do/while, four rows are always processed at least
 *             once.
 *
 * NOTE(review): PAVGB is applied directly to memory operands here although
 * rows are loaded/stored with movd; the access width of those operands
 * depends on what PAVGB expands to (defined outside this chunk) - confirm.
 */
00794 static void DEF(avg_pixels4)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00795 {
00796 do {
00797 __asm__ volatile(
/* operands: %0 = pixels, %1 = block, %2 = line_size, %3 = 3 * line_size */
/* load four destination rows */
00798 "movd (%1), %%mm0 \n\t"
00799 "movd (%1, %2), %%mm1 \n\t"
00800 "movd (%1, %2, 2), %%mm2 \n\t"
00801 "movd (%1, %3), %%mm3 \n\t"
/* average in the four matching source rows */
00802 PAVGB" (%0), %%mm0 \n\t"
00803 PAVGB" (%0, %2), %%mm1 \n\t"
00804 PAVGB" (%0, %2, 2), %%mm2 \n\t"
00805 PAVGB" (%0, %3), %%mm3 \n\t"
/* write the results back to the destination */
00806 "movd %%mm0, (%1) \n\t"
00807 "movd %%mm1, (%1, %2) \n\t"
00808 "movd %%mm2, (%1, %2, 2) \n\t"
00809 "movd %%mm3, (%1, %3) \n\t"
/* inputs only; the "memory" clobber covers the stores through block */
00810 ::"S"(pixels), "D"(block),
00811 "r" ((x86_reg)line_size), "r"((x86_reg)3L*line_size)
00812 :"memory");
00813 block += 4*line_size;
00814 pixels += 4*line_size;
00815 h -= 4;
00816 } while(h > 0);
00817 }
00818
00819
/*
 * 16-byte-wide variant: runs the 8-wide put_no_rnd_pixels8_x2 routine twice,
 * once at byte offset 0 and once at byte offset 8 of both block and pixels,
 * with the same line_size and h.
 */
00820 static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00821 DEF(put_no_rnd_pixels8_x2)(block , pixels , line_size, h);
00822 DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
00823 }
/*
 * 16-byte-wide variant: runs the 8-wide put_pixels8_y2 routine twice, once at
 * byte offset 0 and once at byte offset 8 of both block and pixels, with the
 * same line_size and h.
 */
00824 static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00825 DEF(put_pixels8_y2)(block , pixels , line_size, h);
00826 DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
00827 }
/*
 * 16-byte-wide variant: runs the 8-wide put_no_rnd_pixels8_y2 routine twice,
 * once at byte offset 0 and once at byte offset 8 of both block and pixels,
 * with the same line_size and h.
 */
00828 static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00829 DEF(put_no_rnd_pixels8_y2)(block , pixels , line_size, h);
00830 DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
00831 }
/*
 * 16-byte-wide variant: runs the 8-wide avg_pixels8 routine twice, once at
 * byte offset 0 and once at byte offset 8 of both block and pixels, with the
 * same line_size and h.
 */
00832 static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00833 DEF(avg_pixels8)(block , pixels , line_size, h);
00834 DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
00835 }
/*
 * 16-byte-wide variant: runs the 8-wide avg_pixels8_x2 routine twice, once at
 * byte offset 0 and once at byte offset 8 of both block and pixels, with the
 * same line_size and h.
 */
00836 static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00837 DEF(avg_pixels8_x2)(block , pixels , line_size, h);
00838 DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
00839 }
/*
 * 16-byte-wide variant: runs the 8-wide avg_pixels8_y2 routine twice, once at
 * byte offset 0 and once at byte offset 8 of both block and pixels, with the
 * same line_size and h.
 */
00840 static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00841 DEF(avg_pixels8_y2)(block , pixels , line_size, h);
00842 DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
00843 }
/*
 * 16-byte-wide variant: runs the 8-wide avg_pixels8_xy2 routine twice, once
 * at byte offset 0 and once at byte offset 8 of both block and pixels, with
 * the same line_size and h.
 */
00844 static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
00845 DEF(avg_pixels8_xy2)(block , pixels , line_size, h);
00846 DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
00847 }
00848
/*
 * QPEL_2TAP_L3(OPNAME) defines two functions, OPNAME##2tap_qpel16_l3 and
 * OPNAME##2tap_qpel8_l3 (16- and 8-byte-wide rows respectively).
 *
 * Per row each function computes
 *     PAVGB(src[x], PAVGB(src[x + off2], src[x + off1]))
 * then expands STORE_OP(destination, register) - which must be defined by
 * the user of this macro before it is expanded - and finally stores the
 * register to the destination row.
 *
 * asm operands: %0 = h, %1 = src, %2 = off1, %3 = off2, %4 = dst - src,
 * %5 = stride. The destination is addressed as (%1,%4), i.e. src plus the
 * constant difference dst - src, so advancing src by stride moves both the
 * source and destination positions with one register.
 *
 * h is decremented once per row and the loop exits only when it reaches
 * exactly zero. The 16-wide version processes each row as two 8-byte halves
 * (offsets 0 and 8).
 *
 * NOTE(review): dst - src is computed on pointers the code does not show to
 * belong to the same object; the asm only relies on the byte difference.
 */
00849 #define QPEL_2TAP_L3(OPNAME) \
00850 static void DEF(OPNAME ## 2tap_qpel16_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
00851 __asm__ volatile(\
00852 "1: \n\t"\
00853 "movq (%1,%2), %%mm0 \n\t"\
00854 "movq 8(%1,%2), %%mm1 \n\t"\
00855 PAVGB" (%1,%3), %%mm0 \n\t"\
00856 PAVGB" 8(%1,%3), %%mm1 \n\t"\
00857 PAVGB" (%1), %%mm0 \n\t"\
00858 PAVGB" 8(%1), %%mm1 \n\t"\
00859 STORE_OP( (%1,%4),%%mm0)\
00860 STORE_OP(8(%1,%4),%%mm1)\
00861 "movq %%mm0, (%1,%4) \n\t"\
00862 "movq %%mm1, 8(%1,%4) \n\t"\
00863 "add %5, %1 \n\t"\
00864 "decl %0 \n\t"\
00865 "jnz 1b \n\t"\
00866 :"+g"(h), "+r"(src)\
00867 :"r"((x86_reg)off1), "r"((x86_reg)off2),\
00868 "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
00869 :"memory"\
00870 );\
00871 }\
00872 static void DEF(OPNAME ## 2tap_qpel8_l3)(uint8_t *dst, uint8_t *src, int stride, int h, int off1, int off2){\
00873 __asm__ volatile(\
00874 "1: \n\t"\
00875 "movq (%1,%2), %%mm0 \n\t"\
00876 PAVGB" (%1,%3), %%mm0 \n\t"\
00877 PAVGB" (%1), %%mm0 \n\t"\
00878 STORE_OP((%1,%4),%%mm0)\
00879 "movq %%mm0, (%1,%4) \n\t"\
00880 "add %5, %1 \n\t"\
00881 "decl %0 \n\t"\
00882 "jnz 1b \n\t"\
00883 :"+g"(h), "+r"(src)\
00884 :"r"((x86_reg)off1), "r"((x86_reg)off2),\
00885 "r"((x86_reg)(dst-src)), "r"((x86_reg)stride)\
00886 :"memory"\
00887 );\
00888 }
00889
/*
 * Instantiate the 2-tap helpers twice with different STORE_OP definitions.
 * For the avg_ flavour STORE_OP(a,b) stringifies its arguments into a
 * "PAVGB a,b" instruction, so the computed row is averaged with the row
 * already at the destination before the store.
 */
00890 #define STORE_OP(a,b) PAVGB" "#a","#b" \n\t"
00891 QPEL_2TAP_L3(avg_)
00892 #undef STORE_OP
/* For the put_ flavour STORE_OP expands to nothing: the row is stored as computed. */
00893 #define STORE_OP(a,b)
00894 QPEL_2TAP_L3(put_)
00895 #undef STORE_OP
/* The generator macro is not needed past this point. */
00896 #undef QPEL_2TAP_L3