00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030 #include <stdio.h>
00031 #include <stdlib.h>
00032 #include <string.h>
00033 #include <inttypes.h>
00034
00035 #include "config.h"
00036 #include "cpudetect.h"
00037 #include "img_format.h"
00038 #include "mp_image.h"
00039 #include "vf.h"
00040 #include "libvo/fastmemcpy.h"
00041 #include "libavutil/avutil.h"
00042 #include "libavutil/x86_cpu.h"
00043
/* Signature shared by every per-row debanding kernel (C, MMX2, SSSE3). */
typedef void (*gradfun_filter_line_fn)(uint8_t *dst, uint8_t *src, uint16_t *dc,
                                       int width, int thresh,
                                       const uint16_t *dithers);

/* Signature shared by every per-row blur kernel (C, SSE2). */
typedef void (*gradfun_blur_line_fn)(uint16_t *dc, uint16_t *buf, uint16_t *buf1,
                                     uint8_t *src, int sstride, int width);

/* Per-instance state of the gradfun filter. */
struct vf_priv_s {
    int thresh;                          /* fixed-point strength derived from the user threshold in vf_open() */
    int radius;                          /* blur radius; vf_open() forces it even and within [4,32] */
    uint16_t *buf;                       /* scratch buffer (re)allocated in config() */
    gradfun_filter_line_fn filter_line;  /* row kernel selected in vf_open() */
    gradfun_blur_line_fn   blur_line;    /* blur kernel selected in vf_open() */
};
00053
/* Eight 16-bit lanes of 127; loaded as a 16-byte vector by the SIMD row kernels. */
static const uint16_t __attribute__((aligned(16))) pw_7f[8] = {
    0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F, 0x7F
};

/* Eight 16-bit lanes of 255; used as a low-byte mask by the SSE2 blur kernel. */
static const uint16_t __attribute__((aligned(16))) pw_ff[8] = {
    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
};

/*
 * 8x8 dither matrix. filter() picks the row with (y & 7) and the row kernels
 * add the entries to the pixel value while it is still scaled by 128, so all
 * entries stay below 128. Each row is 16 bytes, which keeps every row 16-byte
 * aligned given the alignment of the array itself.
 */
static const uint16_t __attribute__((aligned(16))) dither[8][8] = {
    {  0,  96,  24, 120,   6, 102,  30, 126 },
    { 64,  32,  88,  56,  70,  38,  94,  62 },
    { 16, 112,   8, 104,  22, 118,  14, 110 },
    { 80,  48,  72,  40,  86,  54,  78,  46 },
    {  4, 100,  28, 124,   2,  98,  26, 122 },
    { 68,  36,  92,  60,  66,  34,  90,  58 },
    { 20, 116,  12, 108,  18, 114,  10, 106 },
    { 84,  52,  76,  44,  82,  50,  74,  42 },
};
00066
/**
 * Portable row kernel: pull each pixel towards the local average and dither.
 *
 * Pixel x is compared against dc[(x + 1) / 2]. Everything is computed with
 * the pixel scaled by 128 (<< 7); the final value is shifted back and clipped
 * to 8 bits.
 *
 * @param dst     output row (width bytes are written)
 * @param src     input row (width bytes are read)
 * @param dc      local averages, scaled like (pixel << 7)
 * @param width   number of pixels to process
 * @param thresh  fixed-point strength; larger values make the weight fall to
 *                zero for smaller differences
 * @param dithers eight dither offsets, indexed by (x & 7)
 */
static void filter_line_c(uint8_t *dst, uint8_t *src, uint16_t *dc,
                          int width, int thresh, const uint16_t *dithers)
{
    int i = 0;

    while (i < width) {
        int scaled = src[i] << 7;
        int diff   = dc[(i + 1) >> 1] - scaled;
        /* Weight is 127 for identical values and drops linearly to 0. */
        int weight = 127 - ((abs(diff) * thresh) >> 16);
        int out;

        weight = FFMAX(0, weight);
        /* Squared weight (at most 127*127 out of 1<<14) scales the correction. */
        out = scaled + ((weight * weight * diff) >> 14) + dithers[i & 7];
        dst[i] = av_clip_uint8(out >> 7);
        i++;
    }
}
00081
/**
 * Portable blur kernel: accumulate 2x2 pixel sums into a running column sum.
 *
 * For every output column the four pixels of a 2x2 block (two from the row at
 * src, two from the row at src + sstride) are added to buf1[x]. The result
 * replaces buf[x], and the difference between the new and the previous
 * content of buf[x] is stored in dc[x] (truncated to 16 bits).
 *
 * @param dc      receives new-minus-old for each column
 * @param buf     row slot that is read and then overwritten
 * @param buf1    row slot holding the previous running sums
 * @param src     top source row; 2*width bytes are read from it and from the
 *                row sstride bytes further on
 * @param sstride distance in bytes between the two source rows
 * @param width   number of output columns
 */
static void blur_line_c(uint16_t *dc, uint16_t *buf, uint16_t *buf1,
                        uint8_t *src, int sstride, int width)
{
    int col;

    for (col = 0; col < width; col++) {
        const uint8_t *top    = src + 2 * col;
        const uint8_t *bottom = top + sstride;
        int sum  = buf1[col] + top[0] + top[1] + bottom[0] + bottom[1];
        int prev = buf[col];

        buf[col] = sum;
        dc[col]  = sum - prev;
    }
}
00093
#if HAVE_MMX2
/**
 * MMX2 row kernel, four pixels per loop iteration.
 *
 * Same parameters as filter_line_c(). Differences visible in the code:
 *  - pixels x and x+1 of a pair both use dc[x / 2];
 *  - only dithers[0..3] are loaded (once, before the loop) and reused for
 *    every group of four pixels.
 *
 * Up to three trailing pixels are delegated to filter_line_c(). The assembly
 * loop tests its counter at the bottom, so it must not be entered unless at
 * least one full group of four pixels remains; otherwise it would read and
 * write four bytes of a row that is shorter than that.
 */
static void filter_line_mmx2(uint8_t *dst, uint8_t *src, uint16_t *dc,
                             int width, int thresh, const uint16_t *dithers)
{
    intptr_t x;
    if (width&3) {
        /* Finish the ragged tail in C and shrink width to a multiple of 4. */
        x = width&~3;
        filter_line_c(dst+x, src+x, dc+x/2, width-x, thresh, dithers);
        width = x;
    }
    /* Nothing left for the vector loop (it would still run once). */
    if (width <= 0)
        return;
    /* x counts up from -width to 0; all pointers are biased by +width. */
    x = -width;
    __asm__ volatile(
        "movd %4, %%mm5 \n"             /* mm5 = thresh                         */
        "pxor %%mm7, %%mm7 \n"          /* mm7 = 0                              */
        "pshufw $0, %%mm5, %%mm5 \n"    /* broadcast low word of thresh         */
        "movq %6, %%mm6 \n"             /* mm6 = 127 in each word               */
        "movq %5, %%mm4 \n"             /* mm4 = dithers[0..3]                  */
        "1: \n"
        "movd (%2,%0), %%mm0 \n"        /* four source pixels                   */
        "movd (%3,%0), %%mm1 \n"        /* two dc words                         */
        "punpcklbw %%mm7, %%mm0 \n"     /* pixels -> 16-bit words               */
        "punpcklwd %%mm1, %%mm1 \n"     /* duplicate each dc word               */
        "psllw $7, %%mm0 \n"            /* pix <<= 7                            */
        "pxor %%mm2, %%mm2 \n"
        "psubw %%mm0, %%mm1 \n"         /* delta = dc - pix                     */
        "psubw %%mm1, %%mm2 \n"         /* -delta                               */
        "pmaxsw %%mm1, %%mm2 \n"        /* |delta|                              */
        "pmulhuw %%mm5, %%mm2 \n"       /* m = |delta| * thresh >> 16           */
        "psubw %%mm6, %%mm2 \n"         /* m - 127                              */
        "pminsw %%mm7, %%mm2 \n"        /* min(m - 127, 0) = -max(0, 127 - m)   */
        "pmullw %%mm2, %%mm2 \n"        /* squared weight (sign drops out)      */
        "paddw %%mm4, %%mm0 \n"         /* pix += dither                        */
        "pmulhw %%mm2, %%mm1 \n"        /* delta * weight^2 >> 16               */
        "psllw $2, %%mm1 \n"            /* ... << 2, i.e. >> 14 overall         */
        "paddw %%mm1, %%mm0 \n"         /* pix += correction                    */
        "psraw $7, %%mm0 \n"            /* back to 8-bit scale                  */
        "packuswb %%mm0, %%mm0 \n"      /* saturate to bytes                    */
        "movd %%mm0, (%1,%0) \n"        /* store four pixels                    */
        "add $4, %0 \n"
        "jl 1b \n"
        "emms \n"
        /* Early-clobber: x is rewritten while the inputs are still in use;
         * this matches the constraint used by the other SIMD kernels here. */
        :"+&r"(x)
        :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
         "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
        :"memory"
    );
}
#endif
00142
#if HAVE_SSSE3
/**
 * SSSE3 row kernel, eight pixels per loop iteration.
 *
 * Same parameters as filter_line_c(). Pixels x and x+1 of a pair both use
 * dc[x / 2]. All eight dither words are loaded with an aligned 16-byte load,
 * so dithers has to be 16-byte aligned.
 *
 * Up to seven trailing pixels are delegated to filter_line_c(). The assembly
 * loop tests its counter at the bottom, so it must not be entered unless at
 * least one full group of eight pixels remains; otherwise it would read and
 * write eight bytes of a row that is shorter than that.
 */
static void filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc,
                              int width, int thresh, const uint16_t *dithers)
{
    intptr_t x;
    if (width&7) {
        /* Finish the ragged tail in C and shrink width to a multiple of 8. */
        x = width&~7;
        filter_line_c(dst+x, src+x, dc+x/2, width-x, thresh, dithers);
        width = x;
    }
    /* Nothing left for the vector loop (it would still run once). */
    if (width <= 0)
        return;
    /* x counts up from -width to 0; all pointers are biased by +width. */
    x = -width;
    __asm__ volatile(
        "movd %4, %%xmm5 \n"                /* xmm5 = thresh                      */
        "pxor %%xmm7, %%xmm7 \n"            /* xmm7 = 0                           */
        "pshuflw $0,%%xmm5, %%xmm5 \n"      /* broadcast to the low four words    */
        "movdqa %6, %%xmm6 \n"              /* xmm6 = 127 in each word            */
        "punpcklqdq %%xmm5, %%xmm5 \n"      /* ... and to the high four words     */
        "movdqa %5, %%xmm4 \n"              /* xmm4 = dithers[0..7]               */
        "1: \n"
        "movq (%2,%0), %%xmm0 \n"           /* eight source pixels                */
        "movq (%3,%0), %%xmm1 \n"           /* four dc words                      */
        "punpcklbw %%xmm7, %%xmm0 \n"       /* pixels -> 16-bit words             */
        "punpcklwd %%xmm1, %%xmm1 \n"       /* duplicate each dc word             */
        "psllw $7, %%xmm0 \n"               /* pix <<= 7                          */
        "psubw %%xmm0, %%xmm1 \n"           /* delta = dc - pix                   */
        "pabsw %%xmm1, %%xmm2 \n"           /* |delta|                            */
        "pmulhuw %%xmm5, %%xmm2 \n"         /* m = |delta| * thresh >> 16         */
        "psubw %%xmm6, %%xmm2 \n"           /* m - 127                            */
        "pminsw %%xmm7, %%xmm2 \n"          /* min(m - 127, 0)                    */
        "pmullw %%xmm2, %%xmm2 \n"          /* squared weight (sign drops out)    */
        "psllw $1, %%xmm2 \n"               /* pre-scale for the >> 15 below      */
        "paddw %%xmm4, %%xmm0 \n"           /* pix += dither                      */
        "pmulhrsw %%xmm2, %%xmm1 \n"        /* rounded delta * weight^2 >> 14     */
        "paddw %%xmm1, %%xmm0 \n"           /* pix += correction                  */
        "psraw $7, %%xmm0 \n"               /* back to 8-bit scale                */
        "packuswb %%xmm0, %%xmm0 \n"        /* saturate to bytes                  */
        "movq %%xmm0, (%1,%0) \n"           /* store eight pixels                 */
        "add $8, %0 \n"
        "jl 1b \n"
        :"+&r"(x)
        :"r"(dst+width), "r"(src+width), "r"(dc+width/2),
         "rm"(thresh), "m"(*dithers), "m"(*pw_7f)
        :"memory"
    );
}
#endif // HAVE_SSSE3
00190
#if HAVE_SSE && HAVE_6REGS
/*
 * Body of the SSE2 blur kernel; `load` is the instruction used to fetch the
 * two source rows ("movdqa" when they are 16-byte aligned, "movdqu" if not).
 *
 * Each iteration consumes 16 bytes from both source rows and produces eight
 * 16-bit results:
 *   - the 0x00FF mask and the 8-bit right shift split each 16-bit lane into
 *     its even and odd byte, and the four parts are added up (2x2 sum);
 *   - buf1 is added, the old content of buf is fetched, buf is overwritten
 *     with the new sum, and new-minus-old is stored to dc.
 *
 * buf, buf1 and dc are accessed with aligned 16-byte operations. The loop
 * counter runs from -2*width up to 0 in steps of 16 and is tested at the
 * bottom, so the loop always executes at least once and handles columns in
 * groups of eight.
 */
#define BLURV(load)\
    do {\
        intptr_t x = -2*width;\
        __asm__ volatile(\
            "movdqa %6, %%xmm7 \n"\
            "1: \n"\
            load" (%4,%0), %%xmm0 \n"\
            load" (%5,%0), %%xmm1 \n"\
            "movdqa %%xmm0, %%xmm2 \n"\
            "movdqa %%xmm1, %%xmm3 \n"\
            "psrlw $8, %%xmm0 \n"\
            "psrlw $8, %%xmm1 \n"\
            "pand %%xmm7, %%xmm2 \n"\
            "pand %%xmm7, %%xmm3 \n"\
            "paddw %%xmm1, %%xmm0 \n"\
            "paddw %%xmm3, %%xmm2 \n"\
            "paddw %%xmm2, %%xmm0 \n"\
            "paddw (%2,%0), %%xmm0 \n"\
            "movdqa (%1,%0), %%xmm1 \n"\
            "movdqa %%xmm0, (%1,%0) \n"\
            "psubw %%xmm1, %%xmm0 \n"\
            "movdqa %%xmm0, (%3,%0) \n"\
            "add $16, %0 \n"\
            "jl 1b \n"\
            :"+&r"(x)\
            :"r"(buf+width),\
             "r"(buf1+width),\
             "r"(dc+width),\
             "r"(src+width*2),\
             "r"(src+width*2+sstride),\
             "m"(*pw_ff)\
            :"memory"\
        );\
    } while (0)

/**
 * SSE2 counterpart of blur_line_c() with the same parameters.
 *
 * Chooses aligned source loads only when both the row address and the stride
 * are multiples of 16, which makes the second row aligned as well.
 */
static void blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1,
                           uint8_t *src, int sstride, int width)
{
    const int src_aligned = !(((intptr_t)src|sstride)&15);

    if (src_aligned) {
        BLURV("movdqa");
    } else {
        BLURV("movdqu");
    }
}
#endif // HAVE_6REGS && HAVE_SSE
00235
00236 static void filter(struct vf_priv_s *ctx, uint8_t *dst, uint8_t *src,
00237 int width, int height, int dstride, int sstride, int r)
00238 {
00239 int bstride = ((width+15)&~15)/2;
00240 int y;
00241 uint32_t dc_factor = (1<<21)/(r*r);
00242 uint16_t *dc = ctx->buf+16;
00243 uint16_t *buf = ctx->buf+bstride+32;
00244 int thresh = ctx->thresh;
00245
00246 memset(dc, 0, (bstride+16)*sizeof(*buf));
00247 for (y=0; y<r; y++)
00248 ctx->blur_line(dc, buf+y*bstride, buf+(y-1)*bstride, src+2*y*sstride, sstride, width/2);
00249 for (;;) {
00250 if (y < height-r) {
00251 int mod = ((y+r)/2)%r;
00252 uint16_t *buf0 = buf+mod*bstride;
00253 uint16_t *buf1 = buf+(mod?mod-1:r-1)*bstride;
00254 int x, v;
00255 ctx->blur_line(dc, buf0, buf1, src+(y+r)*sstride, sstride, width/2);
00256 for (x=v=0; x<r; x++)
00257 v += dc[x];
00258 for (; x<width/2; x++) {
00259 v += dc[x] - dc[x-r];
00260 dc[x-r] = v * dc_factor >> 16;
00261 }
00262 for (; x<(width+r+1)/2; x++)
00263 dc[x-r] = v * dc_factor >> 16;
00264 for (x=-r/2; x<0; x++)
00265 dc[x] = dc[0];
00266 }
00267 if (y == r) {
00268 for (y=0; y<r; y++)
00269 ctx->filter_line(dst+y*dstride, src+y*sstride, dc-r/2, width, thresh, dither[y&7]);
00270 }
00271 ctx->filter_line(dst+y*dstride, src+y*sstride, dc-r/2, width, thresh, dither[y&7]);
00272 if (++y >= height) break;
00273 ctx->filter_line(dst+y*dstride, src+y*sstride, dc-r/2, width, thresh, dither[y&7]);
00274 if (++y >= height) break;
00275 }
00276 }
00277
00278 static void get_image(struct vf_instance *vf, mp_image_t *mpi)
00279 {
00280 if (mpi->flags&MP_IMGFLAG_PRESERVE) return;
00281
00282 vf->dmpi = vf_get_image(vf->next, mpi->imgfmt,
00283 mpi->type, mpi->flags, mpi->width, mpi->height);
00284 mpi->planes[0] = vf->dmpi->planes[0];
00285 mpi->stride[0] = vf->dmpi->stride[0];
00286 mpi->width = vf->dmpi->width;
00287 if (mpi->flags&MP_IMGFLAG_PLANAR){
00288 mpi->planes[1] = vf->dmpi->planes[1];
00289 mpi->planes[2] = vf->dmpi->planes[2];
00290 mpi->stride[1] = vf->dmpi->stride[1];
00291 mpi->stride[2] = vf->dmpi->stride[2];
00292 }
00293 mpi->flags |= MP_IMGFLAG_DIRECT;
00294 }
00295
00296 static int put_image(struct vf_instance *vf, mp_image_t *mpi, double pts)
00297 {
00298 mp_image_t *dmpi = vf->dmpi;
00299 int p;
00300
00301 if (!(mpi->flags&MP_IMGFLAG_DIRECT)) {
00302
00303 dmpi = vf_get_image(vf->next,mpi->imgfmt, MP_IMGTYPE_TEMP,
00304 MP_IMGFLAG_ACCEPT_STRIDE|MP_IMGFLAG_PREFER_ALIGNED_STRIDE,
00305 mpi->w, mpi->h);
00306 }
00307 vf_clone_mpi_attributes(dmpi, mpi);
00308
00309 for (p=0; p<mpi->num_planes; p++) {
00310 int w = mpi->w;
00311 int h = mpi->h;
00312 int r = vf->priv->radius;
00313 if (p) {
00314 w >>= mpi->chroma_x_shift;
00315 h >>= mpi->chroma_y_shift;
00316 r = ((r>>mpi->chroma_x_shift) + (r>>mpi->chroma_y_shift)) / 2;
00317 r = av_clip((r+1)&~1,4,32);
00318 }
00319 if (FFMIN(w,h) > 2*r)
00320 filter(vf->priv, dmpi->planes[p], mpi->planes[p], w, h,
00321 dmpi->stride[p], mpi->stride[p], r);
00322 else if (dmpi->planes[p] != mpi->planes[p])
00323 memcpy_pic(dmpi->planes[p], mpi->planes[p], w, h,
00324 dmpi->stride[p], mpi->stride[p]);
00325 }
00326
00327 return vf_next_put_image(vf, dmpi, pts);
00328 }
00329
00330 static int query_format(struct vf_instance *vf, unsigned int fmt)
00331 {
00332 switch (fmt){
00333 case IMGFMT_YVU9:
00334 case IMGFMT_IF09:
00335 case IMGFMT_YV12:
00336 case IMGFMT_I420:
00337 case IMGFMT_IYUV:
00338 case IMGFMT_CLPL:
00339 case IMGFMT_Y800:
00340 case IMGFMT_Y8:
00341 case IMGFMT_NV12:
00342 case IMGFMT_NV21:
00343 case IMGFMT_444P:
00344 case IMGFMT_422P:
00345 case IMGFMT_411P:
00346 case IMGFMT_HM12:
00347 return vf_next_query_format(vf,fmt);
00348 }
00349 return 0;
00350 }
00351
00352 static int config(struct vf_instance *vf,
00353 int width, int height, int d_width, int d_height,
00354 unsigned int flags, unsigned int outfmt)
00355 {
00356 free(vf->priv->buf);
00357 vf->priv->buf = av_mallocz((((width+15)&~15)*(vf->priv->radius+1)/2+32)*sizeof(uint16_t));
00358 return vf_next_config(vf,width,height,d_width,d_height,flags,outfmt);
00359 }
00360
00361 static void uninit(struct vf_instance *vf)
00362 {
00363 if (!vf->priv) return;
00364 av_free(vf->priv->buf);
00365 free(vf->priv);
00366 vf->priv = NULL;
00367 }
00368
00369 static int vf_open(vf_instance_t *vf, char *args)
00370 {
00371 float thresh = 1.2;
00372 int radius = 16;
00373
00374 vf->get_image=get_image;
00375 vf->put_image=put_image;
00376 vf->query_format=query_format;
00377 vf->config=config;
00378 vf->uninit=uninit;
00379 vf->priv=malloc(sizeof(struct vf_priv_s));
00380 memset(vf->priv, 0, sizeof(struct vf_priv_s));
00381
00382 if (args) sscanf(args, "%f:%d", &thresh, &radius);
00383 vf->priv->thresh = (1<<15)/av_clipf(thresh,0.51,255);
00384 vf->priv->radius = av_clip((radius+1)&~1,4,32);
00385
00386 vf->priv->blur_line = blur_line_c;
00387 vf->priv->filter_line = filter_line_c;
00388 #if HAVE_SSE && HAVE_6REGS
00389 if (gCpuCaps.hasSSE2)
00390 vf->priv->blur_line = blur_line_sse2;
00391 #endif
00392 #if HAVE_MMX2
00393 if (gCpuCaps.hasMMX2)
00394 vf->priv->filter_line = filter_line_mmx2;
00395 #endif
00396 #if HAVE_SSSE3
00397 if (gCpuCaps.hasSSSE3)
00398 vf->priv->filter_line = filter_line_ssse3;
00399 #endif
00400
00401 return 1;
00402 }
00403
00404 const vf_info_t vf_info_gradfun = {
00405 "gradient deband",
00406 "gradfun",
00407 "Loren Merritt",
00408 "",
00409 vf_open,
00410 NULL
00411 };