FFmpeg: libavcodec/x86/vp8dsp

00001 /*
00002  * VP8 DSP functions x86-optimized
00003  * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
00004  * Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
00005  *
00006  * This file is part of FFmpeg.
00007  *
00008  * FFmpeg is free software; you can redistribute it and/or
00009  * modify it under the terms of the GNU Lesser General Public
00010  * License as published by the Free Software Foundation; either
00011  * version 2.1 of the License, or (at your option) any later version.
00012  *
00013  * FFmpeg is distributed in the hope that it will be useful,
00014  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
00016  * Lesser General Public License for more details.
00017  *
00018  * You should have received a copy of the GNU Lesser General Public
00019  * License along with FFmpeg; if not, write to the Free Software
00020  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
00021  */
00022 
00023 #include "libavutil/cpu.h"
00024 #include "libavutil/mem.h"
00025 #include "libavutil/x86/asm.h"
00026 #include "libavcodec/vp8dsp.h"
00027 
00028 #if HAVE_YASM
00029 
00030 /*
00031  * MC functions
00032  */
/*
 * Prototypes of the motion-compensation "put" kernels referenced below.
 * They are only declared here (extern); no definition appears in this file.
 * Presumably they are implemented in the yasm assembly sources, since this
 * whole section is compiled only when HAVE_YASM is set -- confirm.
 *
 * Every kernel has the same signature:
 *   dst, dststride - destination pointer and the stride between its rows
 *   src, srcstride - source pointer and the stride between its rows
 *   height         - number of rows to produce
 *   mx, my         - forwarded unchanged by every wrapper in this file;
 *                    presumably the horizontal / vertical sub-pixel
 *                    position (TODO confirm against the assembly)
 *
 * Names follow ff_put_vp8_<filter><width>_<tap type>_<instruction set>,
 * e.g. ff_put_vp8_epel8_h6_sse2: "epel" filter, 8 pixels wide, horizontal
 * with 6 taps, sse2 version.
 */
00033 extern void ff_put_vp8_epel4_h4_mmx2  (uint8_t *dst, ptrdiff_t dststride,
00034                                        uint8_t *src, ptrdiff_t srcstride,
00035                                        int height, int mx, int my);
00036 extern void ff_put_vp8_epel4_h6_mmx2  (uint8_t *dst, ptrdiff_t dststride,
00037                                        uint8_t *src, ptrdiff_t srcstride,
00038                                        int height, int mx, int my);
00039 extern void ff_put_vp8_epel4_v4_mmx2  (uint8_t *dst, ptrdiff_t dststride,
00040                                        uint8_t *src, ptrdiff_t srcstride,
00041                                        int height, int mx, int my);
00042 extern void ff_put_vp8_epel4_v6_mmx2  (uint8_t *dst, ptrdiff_t dststride,
00043                                        uint8_t *src, ptrdiff_t srcstride,
00044                                        int height, int mx, int my);
00045 
00046 extern void ff_put_vp8_epel8_h4_sse2  (uint8_t *dst, ptrdiff_t dststride,
00047                                        uint8_t *src, ptrdiff_t srcstride,
00048                                        int height, int mx, int my);
00049 extern void ff_put_vp8_epel8_h6_sse2  (uint8_t *dst, ptrdiff_t dststride,
00050                                        uint8_t *src, ptrdiff_t srcstride,
00051                                        int height, int mx, int my);
00052 extern void ff_put_vp8_epel8_v4_sse2  (uint8_t *dst, ptrdiff_t dststride,
00053                                        uint8_t *src, ptrdiff_t srcstride,
00054                                        int height, int mx, int my);
00055 extern void ff_put_vp8_epel8_v6_sse2  (uint8_t *dst, ptrdiff_t dststride,
00056                                        uint8_t *src, ptrdiff_t srcstride,
00057                                        int height, int mx, int my);
00058 
00059 extern void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
00060                                        uint8_t *src, ptrdiff_t srcstride,
00061                                        int height, int mx, int my);
00062 extern void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
00063                                        uint8_t *src, ptrdiff_t srcstride,
00064                                        int height, int mx, int my);
00065 extern void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
00066                                        uint8_t *src, ptrdiff_t srcstride,
00067                                        int height, int mx, int my);
00068 extern void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
00069                                        uint8_t *src, ptrdiff_t srcstride,
00070                                        int height, int mx, int my);
00071 extern void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
00072                                        uint8_t *src, ptrdiff_t srcstride,
00073                                        int height, int mx, int my);
00074 extern void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
00075                                        uint8_t *src, ptrdiff_t srcstride,
00076                                        int height, int mx, int my);
00077 extern void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
00078                                        uint8_t *src, ptrdiff_t srcstride,
00079                                        int height, int mx, int my);
00080 extern void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
00081                                        uint8_t *src, ptrdiff_t srcstride,
00082                                        int height, int mx, int my);
00083 
00084 extern void ff_put_vp8_bilinear4_h_mmx2  (uint8_t *dst, ptrdiff_t dststride,
00085                                           uint8_t *src, ptrdiff_t srcstride,
00086                                           int height, int mx, int my);
00087 extern void ff_put_vp8_bilinear8_h_sse2  (uint8_t *dst, ptrdiff_t dststride,
00088                                           uint8_t *src, ptrdiff_t srcstride,
00089                                           int height, int mx, int my);
00090 extern void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
00091                                           uint8_t *src, ptrdiff_t srcstride,
00092                                           int height, int mx, int my);
00093 extern void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
00094                                           uint8_t *src, ptrdiff_t srcstride,
00095                                           int height, int mx, int my);
00096 
00097 extern void ff_put_vp8_bilinear4_v_mmx2  (uint8_t *dst, ptrdiff_t dststride,
00098                                           uint8_t *src, ptrdiff_t srcstride,
00099                                           int height, int mx, int my);
00100 extern void ff_put_vp8_bilinear8_v_sse2  (uint8_t *dst, ptrdiff_t dststride,
00101                                           uint8_t *src, ptrdiff_t srcstride,
00102                                           int height, int mx, int my);
00103 extern void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
00104                                           uint8_t *src, ptrdiff_t srcstride,
00105                                           int height, int mx, int my);
00106 extern void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
00107                                           uint8_t *src, ptrdiff_t srcstride,
00108                                           int height, int mx, int my);
00109 
00110 
/*
 * Kernels installed in the [0][0] slots of both MC tables by
 * ff_vp8dsp_init_x86(); presumably plain block copies -- confirm.
 */
00111 extern void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride,
00112                                     uint8_t *src, ptrdiff_t srcstride,
00113                                     int height, int mx, int my);
00114 extern void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride,
00115                                     uint8_t *src, ptrdiff_t srcstride,
00116                                     int height, int mx, int my);
00117 extern void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride,
00118                                     uint8_t *src, ptrdiff_t srcstride,
00119                                     int height, int mx, int my);
00120 
/*
 * TAP_W16(OPT, FILTERTYPE, TAPTYPE)
 *
 * Defines the static function ff_put_vp8_<FILTERTYPE>16_<TAPTYPE>_<OPT>,
 * a 16-pixel-wide kernel built from the 8-pixel-wide kernel of the same
 * filter, tap type and instruction set.  The 8-wide kernel is called twice
 * with identical strides, height, mx and my: once on dst/src and once on
 * dst + 8 / src + 8.
 *
 * The matching ff_put_vp8_<FILTERTYPE>8_<TAPTYPE>_<OPT> must already be
 * declared or defined at the point of expansion.
 */
00121 #define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
00122 static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
00123     uint8_t *dst,  ptrdiff_t dststride, uint8_t *src, \
00124     ptrdiff_t srcstride, int height, int mx, int my) \
00125 { \
00126     ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
00127         dst,     dststride, src,     srcstride, height, mx, my); \
00128     ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
00129         dst + 8, dststride, src + 8, srcstride, height, mx, my); \
00130 }
/*
 * TAP_W8(OPT, FILTERTYPE, TAPTYPE)
 *
 * Defines the static function ff_put_vp8_<FILTERTYPE>8_<TAPTYPE>_<OPT>,
 * an 8-pixel-wide kernel built from the 4-pixel-wide kernel of the same
 * filter, tap type and instruction set.  The 4-wide kernel is called twice
 * with identical strides, height, mx and my: once on dst/src and once on
 * dst + 4 / src + 4.
 *
 * The matching ff_put_vp8_<FILTERTYPE>4_<TAPTYPE>_<OPT> must already be
 * declared or defined at the point of expansion.
 */
00131 #define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
00132 static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
00133     uint8_t *dst,  ptrdiff_t dststride, uint8_t *src, \
00134     ptrdiff_t srcstride, int height, int mx, int my) \
00135 { \
00136     ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
00137         dst,     dststride, src,     srcstride, height, mx, my); \
00138     ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
00139         dst + 4, dststride, src + 4, srcstride, height, mx, my); \
00140 }
00141 
/*
 * mmx2 wrappers, 32-bit builds only.  Only 4-wide mmx2 kernels are declared
 * above, so the 8-wide versions are generated first and the 16-wide ones are
 * then stacked on top of them; each TAP_W16 line must therefore stay after
 * the TAP_W8 line for the same filter.  No 16-wide wrapper is generated for
 * the 4-tap (h4 / v4) filters.
 */
00142 #if ARCH_X86_32
00143 TAP_W8 (mmx2,  epel, h4)
00144 TAP_W8 (mmx2,  epel, h6)
00145 TAP_W16(mmx2,  epel, h6)
00146 TAP_W8 (mmx2,  epel, v4)
00147 TAP_W8 (mmx2,  epel, v6)
00148 TAP_W16(mmx2,  epel, v6)
00149 TAP_W8 (mmx2,  bilinear, h)
00150 TAP_W16(mmx2,  bilinear, h)
00151 TAP_W8 (mmx2,  bilinear, v)
00152 TAP_W16(mmx2,  bilinear, v)
00153 #endif
00154 
/*
 * sse2 / ssse3: the 8-wide kernels are declared extern above, so only the
 * 16-wide wrappers are generated here.
 */
00155 TAP_W16(sse2,  epel, h6)
00156 TAP_W16(sse2,  epel, v6)
00157 TAP_W16(sse2,  bilinear, h)
00158 TAP_W16(sse2,  bilinear, v)
00159 
00160 TAP_W16(ssse3, epel, h6)
00161 TAP_W16(ssse3, epel, v6)
00162 TAP_W16(ssse3, bilinear, h)
00163 TAP_W16(ssse3, bilinear, v)
00164 
/*
 * HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT)
 *
 * Defines the static two-pass kernel
 * ff_put_vp8_epel<SIZE>_h<TAPNUMX>v<TAPNUMY>_<OPT>, which chains the
 * horizontal TAPNUMX-tap kernel and the vertical TAPNUMY-tap kernel of the
 * same width and instruction set through a scratch buffer on the stack.
 *
 *   OPT       - instruction-set suffix of the kernels to chain
 *   ALIGN     - alignment requested for the scratch buffer
 *   TAPNUMX   - tap count of the horizontal kernel
 *   TAPNUMY   - tap count of the vertical kernel
 *   SIZE      - block width; also used as the stride of the scratch buffer
 *   MAXHEIGHT - largest height the scratch buffer is sized for
 *
 * Steps performed by the generated function:
 *   1. src is moved up by (TAPNUMY / 2 - 1) rows.
 *   2. The horizontal kernel writes height + TAPNUMY - 1 rows into tmp,
 *      starting at its first byte, with a stride of SIZE.
 *   3. The vertical kernel reads from tmpptr, i.e. (TAPNUMY / 2 - 1) rows
 *      into tmp, and writes height rows to dst.
 *
 * tmp holds SIZE * (MAXHEIGHT + TAPNUMY - 1) bytes.  height is not checked
 * against MAXHEIGHT here, so callers must not pass a larger value.
 */
00165 #define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
00166 static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
00167     uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
00168     ptrdiff_t srcstride, int height, int mx, int my) \
00169 { \
00170     DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
00171     uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
00172     src -= srcstride * (TAPNUMY / 2 - 1); \
00173     ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
00174         tmp, SIZE,      src,    srcstride, height + TAPNUMY - 1, mx, my); \
00175     ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
00176         dst, dststride, tmpptr, SIZE,      height,               mx, my); \
00177 }
00178 
/*
 * HVTAPMMX(x, y): mmx2 two-pass kernels with x horizontal and y vertical
 * taps.  On 32-bit builds it generates the 4-wide (up to 8 rows) and the
 * 8-wide (up to 16 rows) versions, and a single 16-wide 6x6 kernel is
 * generated in addition; on other builds only the 4-wide version exists.
 * The scratch buffers are declared with 8-byte alignment.
 */
00179 #if ARCH_X86_32
00180 #define HVTAPMMX(x, y) \
00181 HVTAP(mmx2, 8, x, y,  4,  8) \
00182 HVTAP(mmx2, 8, x, y,  8, 16)
00183 
00184 HVTAP(mmx2, 8, 6, 6, 16, 16)
00185 #else
00186 #define HVTAPMMX(x, y) \
00187 HVTAP(mmx2, 8, x, y,  4,  8)
00188 #endif
00189 
00190 HVTAPMMX(4, 4)
00191 HVTAPMMX(4, 6)
00192 HVTAPMMX(6, 4)
00193 HVTAPMMX(6, 6)
00194 
/*
 * HVTAPSSE2(x, y, w): generates both the sse2 and the ssse3 two-pass kernel
 * of width w, with a 16-byte-aligned scratch buffer sized for up to 16 rows.
 * Width 16 is only instantiated for the 6x6 combination.
 */
00195 #define HVTAPSSE2(x, y, w) \
00196 HVTAP(sse2,  16, x, y, w, 16) \
00197 HVTAP(ssse3, 16, x, y, w, 16)
00198 
00199 HVTAPSSE2(4, 4, 8)
00200 HVTAPSSE2(4, 6, 8)
00201 HVTAPSSE2(6, 4, 8)
00202 HVTAPSSE2(6, 6, 8)
00203 HVTAPSSE2(6, 6, 16)
00204 
/* 4-wide two-pass kernels exist for ssse3 only (no sse2 4-wide variant). */
00205 HVTAP(ssse3, 16, 4, 4, 4, 8)
00206 HVTAP(ssse3, 16, 4, 6, 4, 8)
00207 HVTAP(ssse3, 16, 6, 4, 4, 8)
00208 HVTAP(ssse3, 16, 6, 6, 4, 8)
00209 
/*
 * HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT)
 *
 * Defines the static two-pass kernel ff_put_vp8_bilinear<SIZE>_hv_<OPT>.
 * The horizontal bilinear kernel writes height + 1 rows into a scratch
 * buffer on the stack (stride SIZE); the vertical bilinear kernel then reads
 * that buffer from its first row and writes height rows to dst.  Unlike
 * HVTAP, src is not moved before the first pass.
 *
 *   OPT       - instruction-set suffix of the kernels to chain
 *   ALIGN     - alignment requested for the scratch buffer
 *   SIZE      - block width; also used as the stride of the scratch buffer
 *   MAXHEIGHT - largest height the scratch buffer is sized for
 *
 * tmp holds SIZE * (MAXHEIGHT + 2) bytes.  height is not checked against
 * MAXHEIGHT here, so callers must not pass a larger value.
 */
00210 #define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
00211 static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
00212     uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
00213     ptrdiff_t srcstride, int height, int mx, int my) \
00214 { \
00215     DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \
00216     ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
00217         tmp, SIZE,      src, srcstride, height + 1, mx, my); \
00218     ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
00219         dst, dststride, tmp, SIZE,      height,     mx, my); \
00220 }
00221 
/*
 * Two-pass bilinear kernels.  The 8- and 16-wide mmx2 versions are generated
 * on 32-bit builds only, matching the TAP_W8 / TAP_W16 mmx2 wrappers they
 * call.  There is no 4-wide sse2 version.  All scratch buffers here are
 * declared with 8-byte alignment, including the sse2 / ssse3 ones.
 */
00222 HVBILIN(mmx2,  8,  4,  8)
00223 #if ARCH_X86_32
00224 HVBILIN(mmx2,  8,  8, 16)
00225 HVBILIN(mmx2,  8, 16, 16)
00226 #endif
00227 HVBILIN(sse2,  8,  8, 16)
00228 HVBILIN(sse2,  8, 16, 16)
00229 HVBILIN(ssse3, 8,  4,  8)
00230 HVBILIN(ssse3, 8,  8, 16)
00231 HVBILIN(ssse3, 8, 16, 16)
00232 
/*
 * Prototypes of the inverse-transform kernels installed by
 * ff_vp8dsp_init_x86().  They are only declared here; presumably they are
 * implemented in the yasm assembly sources -- confirm.
 *
 * The array bounds in the parameter lists document the expected amount of
 * coefficient data: one 16-coefficient block for idct_add / idct_dc_add,
 * four such blocks for idct_dc_add4y, two for idct_dc_add4uv, and a
 * [4][4][16] block array plus 16 DC values for luma_dc_wht.
 */
00233 extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16],
00234                                    ptrdiff_t stride);
00235 extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16],
00236                                     ptrdiff_t stride);
00237 extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, DCTELEM block[4][16],
00238                                       ptrdiff_t stride);
00239 extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16],
00240                                       ptrdiff_t stride);
00241 extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[2][16],
00242                                       ptrdiff_t stride);
00243 extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
00244 extern void ff_vp8_luma_dc_wht_sse(DCTELEM block[4][4][16], DCTELEM dc[16]);
00245 extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16],
00246                                 ptrdiff_t stride);
00247 extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16],
00248                                 ptrdiff_t stride);
00249 
/*
 * DECLARE_LOOP_FILTER(NAME)
 *
 * Declares the ten extern loop-filter prototypes for the instruction-set
 * suffix NAME:
 *   - simple filter, v and h:  (dst, stride, flim)
 *   - 16y inner / mbedge, v and h:  (dst, stride, e, i, hvt)
 *   - 8uv inner / mbedge, v and h:  (dstU, dstV, s, e, i, hvt)
 *
 * The 8uv variants take two destination pointers sharing one stride (s).
 * The meaning of flim, e, i and hvt is not visible in this file; presumably
 * they are filter limits / thresholds -- confirm against vp8dsp.h.
 */
00250 #define DECLARE_LOOP_FILTER(NAME)\
00251 extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \
00252                                                  ptrdiff_t stride, \
00253                                                  int flim);\
00254 extern void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \
00255                                                  ptrdiff_t stride, \
00256                                                  int flim);\
00257 extern void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
00258                                                      ptrdiff_t stride,\
00259                                                     int e, int i, int hvt);\
00260 extern void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
00261                                                     ptrdiff_t stride,\
00262                                                     int e, int i, int hvt);\
00263 extern void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
00264                                                     uint8_t *dstV,\
00265                                                     ptrdiff_t s, \
00266                                                     int e, int i, int hvt);\
00267 extern void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
00268                                                     uint8_t *dstV,\
00269                                                     ptrdiff_t s, \
00270                                                     int e, int i, int hvt);\
00271 extern void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
00272                                                     ptrdiff_t stride,\
00273                                                     int e, int i, int hvt);\
00274 extern void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
00275                                                     ptrdiff_t stride,\
00276                                                     int e, int i, int hvt);\
00277 extern void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
00278                                                     uint8_t *dstV,\
00279                                                     ptrdiff_t s, \
00280                                                     int e, int i, int hvt);\
00281 extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
00282                                                     uint8_t *dstV,\
00283                                                     ptrdiff_t s, \
00284                                                     int e, int i, int hvt);
00285 
/*
 * Declare the full prototype set for every instruction-set suffix used by
 * ff_vp8dsp_init_x86().  Not every declared prototype is referenced there:
 * for sse4, for example, only the h simple and h mbedge filters are
 * installed.  Whether each declared symbol actually has a definition cannot
 * be told from this file.
 */
00286 DECLARE_LOOP_FILTER(mmx)
00287 DECLARE_LOOP_FILTER(mmx2)
00288 DECLARE_LOOP_FILTER(sse2)
00289 DECLARE_LOOP_FILTER(ssse3)
00290 DECLARE_LOOP_FILTER(sse4)
00291 
00292 #endif /* HAVE_YASM */
00293 
/*
 * VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)
 *
 * Installs the three six-tap kernels of width SIZE and instruction set OPT
 * into row IDX of c->put_vp8_epel_pixels_tab:
 *   [IDX][0][2] = h6,  [IDX][2][0] = v6,  [IDX][2][2] = h6v6
 *
 * Requires a variable named c in scope.  The macro expands to several
 * statements with no terminating semicolon on the last one and is not
 * wrapped in do { } while (0), so it must be used as a full statement inside
 * a braced block, never as the unbraced body of an if / else / loop.
 */
00294 #define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
00295     c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
00296     c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
00297     c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT
00298 
/*
 * VP8_MC_FUNC(IDX, SIZE, OPT)
 *
 * Installs every four-tap and mixed four/six-tap kernel of width SIZE and
 * instruction set OPT into row IDX of c->put_vp8_epel_pixels_tab, then
 * expands VP8_LUMA_MC_FUNC for the six-tap entries.  As the assignments
 * show, the second table index selects the vertical filter and the third
 * the horizontal one (1 = four-tap, 2 = six-tap, 0 = no filter in that
 * direction).  The [IDX][0][0] entry is not written by this macro.
 *
 * Requires a variable named c in scope and, like VP8_LUMA_MC_FUNC, expands
 * to several statements: use it only as a full statement in a braced block.
 */
00299 #define VP8_MC_FUNC(IDX, SIZE, OPT) \
00300     c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
00301     c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
00302     c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
00303     c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
00304     c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
00305     VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)
00306 
/*
 * VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT)
 *
 * Fills row IDX of c->put_vp8_bilinear_pixels_tab with the bilinear kernels
 * of width SIZE and instruction set OPT.  Table indices 1 and 2 are treated
 * alike: entries with only the third index non-zero get the h kernel,
 * entries with only the second index non-zero get the v kernel, and entries
 * with both non-zero get the hv kernel.  The [IDX][0][0] entry is not
 * written by this macro.
 *
 * Requires a variable named c in scope.  The macro expands to several
 * statements and is not wrapped in do { } while (0): use it only as a full
 * statement inside a braced block.
 */
00307 #define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
00308     c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
00309     c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
00310     c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
00311     c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
00312     c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
00313     c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
00314     c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
00315     c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT
00316 
00317 
/*
 * Install the x86 SIMD implementations into a VP8DSPContext.
 *
 * The CPU flags are read once with av_get_cpu_flags() and then tested in
 * order from MMX up to SSE4.  Each matching block plainly overwrites the
 * function pointers written by an earlier block, so for any given entry the
 * last matching block wins; the order of the blocks must be kept.  Entries
 * that no matching block writes keep the value they had on entry.
 *
 * The whole body is compiled only when HAVE_YASM is set; otherwise the
 * function does nothing.
 *
 * In both MC tables the first index used here is 0 for the 16-wide, 1 for
 * the 8-wide and 2 for the 4-wide kernels.
 *
 *   c - context whose function pointers are filled in
 */
00318 av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
00319 {
00320 #if HAVE_YASM
00321     int mm_flags = av_get_cpu_flags();
00322 
    /* MMX baseline.  Entries inside ARCH_X86_32 are installed on 32-bit
     * builds only. */
00323     if (mm_flags & AV_CPU_FLAG_MMX) {
00324         c->vp8_idct_dc_add    = ff_vp8_idct_dc_add_mmx;
00325         c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
00326 #if ARCH_X86_32
00327         c->vp8_idct_dc_add4y  = ff_vp8_idct_dc_add4y_mmx;
00328         c->vp8_idct_add       = ff_vp8_idct_add_mmx;
00329         c->vp8_luma_dc_wht    = ff_vp8_luma_dc_wht_mmx;
00330         c->put_vp8_epel_pixels_tab[0][0][0]     =
00331         c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
00332 #endif
00333         c->put_vp8_epel_pixels_tab[1][0][0]     =
00334         c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
00335 
00336 #if ARCH_X86_32
00337         c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
00338         c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
00339 
00340         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
00341         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
00342         c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
00343         c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;
00344 
00345         c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_mmx;
00346         c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_mmx;
00347         c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_mmx;
00348         c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_mmx;
00349 #endif
00350     }
00351 
00352     /* note that 4-tap width=16 functions are missing because w=16
00353      * is only used for luma, and luma is always a copy or sixtap. */
    /* MMXEXT: the 4-wide MC kernels are installed on every build; the 8- and
     * 16-wide MC kernels and the loop filters on 32-bit builds only. */
00354     if (mm_flags & AV_CPU_FLAG_MMXEXT) {
00355         VP8_MC_FUNC(2, 4, mmx2);
00356         VP8_BILINEAR_MC_FUNC(2, 4, mmx2);
00357 #if ARCH_X86_32
00358         VP8_LUMA_MC_FUNC(0, 16, mmx2);
00359         VP8_MC_FUNC(1, 8, mmx2);
00360         VP8_BILINEAR_MC_FUNC(0, 16, mmx2);
00361         VP8_BILINEAR_MC_FUNC(1, 8, mmx2);
00362 
00363         c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx2;
00364         c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx2;
00365 
00366         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx2;
00367         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx2;
00368         c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx2;
00369         c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx2;
00370 
00371         c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_mmx2;
00372         c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_mmx2;
00373         c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_mmx2;
00374         c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_mmx2;
00375 #endif
00376     }
00377 
    /* SSE: replaces the IDCT, the luma DC WHT and the 16-wide [0][0] MC
     * entries on all builds. */
00378     if (mm_flags & AV_CPU_FLAG_SSE) {
00379         c->vp8_idct_add                         = ff_vp8_idct_add_sse;
00380         c->vp8_luma_dc_wht                      = ff_vp8_luma_dc_wht_sse;
00381         c->put_vp8_epel_pixels_tab[0][0][0]     =
00382         c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
00383     }
00384 
    /* Installed when either SSE2 or SSE2SLOW is reported: the 8/16-wide MC
     * kernels and the vertical loop filters.
     * NOTE(review): the non-simple loop filters here and in the blocks below
     * are guarded by ARCH_X86_64 || HAVE_ALIGNED_STACK; presumably they need
     * an aligned stack -- confirm against the assembly. */
00385     if (mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) {
00386         VP8_LUMA_MC_FUNC(0, 16, sse2);
00387         VP8_MC_FUNC(1, 8, sse2);
00388         VP8_BILINEAR_MC_FUNC(0, 16, sse2);
00389         VP8_BILINEAR_MC_FUNC(1, 8, sse2);
00390 
00391         c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
00392 
00393 #if ARCH_X86_64 || HAVE_ALIGNED_STACK
00394         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
00395         c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
00396 
00397         c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_sse2;
00398         c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_sse2;
00399 #endif
00400     }
00401 
    /* Installed only when AV_CPU_FLAG_SSE2 itself is set (not for SSE2SLOW
     * alone): idct_dc_add4y and the horizontal loop filters.
     * NOTE(review): presumably these are not a gain on SSE2SLOW CPUs --
     * confirm. */
00402     if (mm_flags & AV_CPU_FLAG_SSE2) {
00403         c->vp8_idct_dc_add4y          = ff_vp8_idct_dc_add4y_sse2;
00404 
00405         c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
00406 
00407 #if ARCH_X86_64 || HAVE_ALIGNED_STACK
00408         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
00409         c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
00410 
00411         c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_sse2;
00412         c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_sse2;
00413 #endif
00414     }
00415 
    /* SSSE3: replaces the MC kernels of all three widths and every loop
     * filter. */
00416     if (mm_flags & AV_CPU_FLAG_SSSE3) {
00417         VP8_LUMA_MC_FUNC(0, 16, ssse3);
00418         VP8_MC_FUNC(1, 8, ssse3);
00419         VP8_MC_FUNC(2, 4, ssse3);
00420         VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
00421         VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
00422         VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
00423 
00424         c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
00425         c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
00426 
00427 #if ARCH_X86_64 || HAVE_ALIGNED_STACK
00428         c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
00429         c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
00430         c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
00431         c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
00432 
00433         c->vp8_v_loop_filter16y       = ff_vp8_v_loop_filter16y_mbedge_ssse3;
00434         c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_ssse3;
00435         c->vp8_v_loop_filter8uv       = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
00436         c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
00437 #endif
00438     }
00439 
    /* SSE4: only idct_dc_add, the horizontal simple filter and the
     * horizontal mbedge filters are replaced; all other entries keep the
     * values set by the blocks above. */
00440     if (mm_flags & AV_CPU_FLAG_SSE4) {
00441         c->vp8_idct_dc_add                  = ff_vp8_idct_dc_add_sse4;
00442 
00443         c->vp8_h_loop_filter_simple   = ff_vp8_h_loop_filter_simple_sse4;
00444 #if ARCH_X86_64 || HAVE_ALIGNED_STACK
00445         c->vp8_h_loop_filter16y       = ff_vp8_h_loop_filter16y_mbedge_sse4;
00446         c->vp8_h_loop_filter8uv       = ff_vp8_h_loop_filter8uv_mbedge_sse4;
00447 #endif
00448     }
00449 #endif /* HAVE_YASM */
00450 }
libavcodec/x86/vp8dsp_init.c