FFmpeg
swscale_altivec.c
Go to the documentation of this file.
1 /*
2  * AltiVec-enhanced yuv2yuvX
3  *
4  * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
5  * based on the equivalent C code in swscale.c
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
#include <inttypes.h>

#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "yuv2rgb_altivec.h"
#include "libavutil/ppc/util_altivec.h"
33 
34 #if HAVE_ALTIVEC
35 #if HAVE_BIGENDIAN
36 #define vzero vec_splat_s32(0)
37 
/*
 * Produce the next source vector for an unaligned read.
 *
 * Loads a vector from s at byte offset ((b) << 1) + 16, permutes it together
 * with the carried-over vector a under the control vector c, and stores the
 * result in `ls`, which must already be declared where the macro is expanded.
 * The freshly loaded vector is then kept in a for the following expansion.
 */
#define GET_LS(a,b,c,s) {\
    vector signed short next_ld = vec_ld(((b) << 1) + 16, s);\
    ls = vec_perm(a, next_ld, c);\
    a  = next_ld;\
}
43 
/*
 * Accumulate one filter tap into the two accumulators d1 and d2.
 *
 * GET_LS() fetches the current source vector into the local `ls`; it is
 * multiplied with filter once through vec_mule() (even elements) and once
 * through vec_mulo() (odd elements). The two product vectors are merged
 * high/low and added to d1 and d2 respectively.
 */
#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
    vector signed short ls;\
    vector signed int prod_even, prod_odd;\
    GET_LS(l1, x, perm, src);\
    prod_even = vec_mule(filter, ls);\
    prod_odd  = vec_mulo(filter, ls);\
    d1 = vec_add(d1, vec_mergeh(prod_even, prod_odd));\
    d2 = vec_add(d2, vec_mergel(prod_even, prod_odd));\
} while (0)
55 
/*
 * Load a filter vector from f at byte offset `joffset` (taken from the
 * expansion scope) into vf, then permute vf with itself using the control
 * vector that vec_lvsl() returns for the same address.
 */
#define LOAD_FILTER(vf,f) {\
    vector unsigned char align_perm = vec_lvsl(joffset, f);\
    vf = vec_ld(joffset, f);\
    vf = vec_perm(vf, vf, align_perm);\
}
/*
 * Prime an unaligned read of src_ptr at byte offset `xoffset` (taken from the
 * expansion scope): perm_out receives the vec_lvsl() control vector and
 * first_vec the vector loaded by vec_ld() from the same address.
 */
#define LOAD_L1(first_vec,src_ptr,perm_out){\
    perm_out  = vec_lvsl(xoffset, src_ptr);\
    first_vec = vec_ld(xoffset, src_ptr);\
}
65 
// The shift by 3 in GET_VF4 below is 2 (filterSize == 4) + 1 (sizeof(short) == 2).
67 
68 // The neat trick: We only care for half the elements,
69 // high or low depending on (i<<3)%16 (it's 0 or 8 here),
70 // and we're going to use vec_mule, so we choose
71 // carefully how to "unpack" the elements into the even slots.
/*
 * Load the filter vector for index a from f and spread the wanted half of
 * it by merging with zero.
 *
 * The byte offset is (a) << 3. When that offset is not a multiple of 16 the
 * low half of the loaded vector is merged with zero (vec_mergel), otherwise
 * the high half (vec_mergeh). `vzero` must be defined where the macro is
 * expanded.
 *
 * The argument a is parenthesized so that expressions with operators of
 * lower precedence than << (for example `i & 1`) yield the intended offset.
 * Note that a is evaluated twice.
 */
#define GET_VF4(a, vf, f) {\
    vf = vec_ld((a) << 3, f);\
    if (((a) << 3) % 16)\
        vf = vec_mergel(vf, (vector signed short)vzero);\
    else\
        vf = vec_mergeh(vf, (vector signed short)vzero);\
}
/*
 * Start an unaligned read: vec_out receives the vector loaded from base at
 * byte offset byte_off, and perm_out the vec_lvsl() control vector for the
 * same address.
 */
#define FIRST_LOAD(vec_out, byte_off, base, perm_out) {\
    vec_out  = vec_ld(byte_off, base);\
    perm_out = vec_lvsl(byte_off, base);\
}
/*
 * Copy two values in sequence: to0 = from0 first, then to1 = from1.
 * The order matters when an argument appears in both pairs.
 */
#define UPDATE_PTR(from0, to0, from1, to1) {\
    to0 = from0;\
    to1 = from1;\
}
/*
 * Finish an unaligned source read.
 *
 * Loads the vector following the current position into v1 (byte offset
 * (pos) + (a) + 16 from s) and combines it with the previously loaded v0
 * under the control vector per; the result goes to vf.
 *
 * pos and a are parenthesized so that arguments such as `i << 1` are not
 * regrouped by the surrounding additions.
 */
#define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
    v1 = vec_ld((pos) + (a) + 16, s);\
    vf = vec_perm(v0, v1, per);\
}
/*
 * Finish an unaligned read of 8 source bytes.
 *
 * The following vector is only loaded into v1 when the start address
 * (s + pos) lies more than 8 bytes into its 16-byte line, i.e. when the
 * 8 bytes straddle two vectors. v0 and v1 are then permuted with per into vf.
 * When no load happens, v1 is passed to vec_perm() with whatever value it
 * already holds.
 *
 * The permute uses the v1 parameter itself rather than a variable name
 * assumed to exist at the expansion site, and all arithmetic arguments are
 * parenthesized.
 */
#define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) {\
    if ((((uintptr_t)(s) + (pos)) % 16) > 8) {\
        v1 = vec_ld((pos) + (a) + 16, s);\
    }\
    vf = vec_perm(v0, v1, per);\
}
/*
 * Finish an unaligned filter read.
 *
 * Loads into vf1 the vector at byte offset
 *     (a) * 2 * filterSize + (b) * 2 + 16 + (off)
 * from f, where `filterSize` comes from the expansion scope, and permutes
 * the previously loaded vf0 with vf1 under per into vf.
 *
 * a, b and off are parenthesized so that compound arguments such as `i + 1`
 * are multiplied as a whole.
 */
#define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
    vf1 = vec_ld(((a) * 2 * filterSize) + ((b) * 2) + 16 + (off), f);\
    vf  = vec_perm(vf0, vf1, per);\
}
101 
102 #define FUNC(name) name ## _altivec
103 #include "swscale_ppc_template.c"
104 #undef FUNC
105 
106 #undef vzero
107 
108 #endif /* HAVE_BIGENDIAN */
109 
/*
 * Store one clipped 16-bit sample at pos.
 *
 * The value written is (bias) + av_clip_<signedness>16((val) >> shift); it is
 * stored with AV_WB16() when `big_endian` is non-zero and with AV_WL16()
 * otherwise. Both `big_endian` and `shift` are taken from the expansion
 * scope.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a single
 * statement (safe inside an unbraced if/else), and val/bias are
 * parenthesized so compound expressions are shifted/added as a whole.
 */
#define output_pixel(pos, val, bias, signedness) \
    do { \
        if (big_endian) { \
            AV_WB16(pos, (bias) + av_clip_ ## signedness ## 16((val) >> shift)); \
        } else { \
            AV_WL16(pos, (bias) + av_clip_ ## signedness ## 16((val) >> shift)); \
        } \
    } while (0)
116 
117 static void
118 yuv2plane1_float_u(const int32_t *src, float *dest, int dstW, int start)
119 {
120  static const int big_endian = HAVE_BIGENDIAN;
121  static const int shift = 3;
122  static const float float_mult = 1.0f / 65535.0f;
123  int i, val;
124  uint16_t val_uint;
125 
126  for (i = start; i < dstW; ++i){
127  val = src[i] + (1 << (shift - 1));
128  output_pixel(&val_uint, val, 0, uint);
129  dest[i] = float_mult * (float)val_uint;
130  }
131 }
132 
133 static void
134 yuv2plane1_float_bswap_u(const int32_t *src, uint32_t *dest, int dstW, int start)
135 {
136  static const int big_endian = HAVE_BIGENDIAN;
137  static const int shift = 3;
138  static const float float_mult = 1.0f / 65535.0f;
139  int i, val;
140  uint16_t val_uint;
141 
142  for (i = start; i < dstW; ++i){
143  val = src[i] + (1 << (shift - 1));
144  output_pixel(&val_uint, val, 0, uint);
145  dest[i] = av_bswap32(av_float2int(float_mult * (float)val_uint));
146  }
147 }
148 
149 static void yuv2plane1_float_altivec(const int32_t *src, float *dest, int dstW)
150 {
151  const int dst_u = -(uintptr_t)dest & 3;
152  const int shift = 3;
153  const int add = (1 << (shift - 1));
154  const int clip = (1 << 16) - 1;
155  const float fmult = 1.0f / 65535.0f;
156  const vec_u32 vadd = (vec_u32) {add, add, add, add};
157  const vec_u32 vshift = (vec_u32) vec_splat_u32(shift);
158  const vec_u32 vlargest = (vec_u32) {clip, clip, clip, clip};
159  const vec_f vmul = (vec_f) {fmult, fmult, fmult, fmult};
160  const vec_f vzero = (vec_f) {0, 0, 0, 0};
161  vec_u32 v;
162  vec_f vd;
163  int i;
164 
165  yuv2plane1_float_u(src, dest, dst_u, 0);
166 
167  for (i = dst_u; i < dstW - 3; i += 4) {
168  v = vec_ld(0, (const uint32_t *) &src[i]);
169  v = vec_add(v, vadd);
170  v = vec_sr(v, vshift);
171  v = vec_min(v, vlargest);
172 
173  vd = vec_ctf(v, 0);
174  vd = vec_madd(vd, vmul, vzero);
175 
176  vec_st(vd, 0, &dest[i]);
177  }
178 
179  yuv2plane1_float_u(src, dest, dstW, i);
180 }
181 
182 static void yuv2plane1_float_bswap_altivec(const int32_t *src, uint32_t *dest, int dstW)
183 {
184  const int dst_u = -(uintptr_t)dest & 3;
185  const int shift = 3;
186  const int add = (1 << (shift - 1));
187  const int clip = (1 << 16) - 1;
188  const float fmult = 1.0f / 65535.0f;
189  const vec_u32 vadd = (vec_u32) {add, add, add, add};
190  const vec_u32 vshift = (vec_u32) vec_splat_u32(shift);
191  const vec_u32 vlargest = (vec_u32) {clip, clip, clip, clip};
192  const vec_f vmul = (vec_f) {fmult, fmult, fmult, fmult};
193  const vec_f vzero = (vec_f) {0, 0, 0, 0};
194  const vec_u32 vswapbig = (vec_u32) {16, 16, 16, 16};
195  const vec_u16 vswapsmall = vec_splat_u16(8);
196  vec_u32 v;
197  vec_f vd;
198  int i;
199 
200  yuv2plane1_float_bswap_u(src, dest, dst_u, 0);
201 
202  for (i = dst_u; i < dstW - 3; i += 4) {
203  v = vec_ld(0, (const uint32_t *) &src[i]);
204  v = vec_add(v, vadd);
205  v = vec_sr(v, vshift);
206  v = vec_min(v, vlargest);
207 
208  vd = vec_ctf(v, 0);
209  vd = vec_madd(vd, vmul, vzero);
210 
211  vd = (vec_f) vec_rl((vec_u32) vd, vswapbig);
212  vd = (vec_f) vec_rl((vec_u16) vd, vswapsmall);
213 
214  vec_st(vd, 0, (float *) &dest[i]);
215  }
216 
217  yuv2plane1_float_bswap_u(src, dest, dstW, i);
218 }
219 
/*
 * Generate the wrapper yuv2plane1_float<ENDIAN>_altivec().
 *
 * The wrapper takes (src, dest, dstW, dither, offset), casts src to
 * const int32_t * and dest to out_type *, and forwards them together with
 * dstW to kernel. The dither and offset arguments are not used.
 */
#define yuv2plane1_float(kernel, out_type, ENDIAN) \
static void yuv2plane1_float ## ENDIAN ## _altivec(const int16_t *src, uint8_t *dest, \
                                                   int dstW, \
                                                   const uint8_t *dither, int offset) \
{ \
    kernel((const int32_t *)src, (out_type *)dest, dstW); \
}

/* The direct float kernel serves the build's own endianness; the
 * byte-swapping kernel serves the opposite one. */
#if HAVE_BIGENDIAN
yuv2plane1_float(yuv2plane1_float_altivec,       float,    BE)
yuv2plane1_float(yuv2plane1_float_bswap_altivec, uint32_t, LE)
#else
yuv2plane1_float(yuv2plane1_float_altivec,       float,    LE)
yuv2plane1_float(yuv2plane1_float_bswap_altivec, uint32_t, BE)
#endif
235 
236 #endif /* HAVE_ALTIVEC */
237 
239 {
240 #if HAVE_ALTIVEC
241  enum AVPixelFormat dstFormat = c->dstFormat;
242 
244  return;
245 
246 #if HAVE_BIGENDIAN
247  if (c->srcBpc == 8 && c->dstBpc <= 14) {
248  c->hyScale = c->hcScale = hScale_real_altivec;
249  }
250  if (!is16BPS(dstFormat) && !isNBPS(dstFormat) && !isSemiPlanarYUV(dstFormat) &&
251  dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE &&
252  !c->needAlpha) {
253  c->yuv2planeX = yuv2planeX_altivec;
254  }
255 #endif
256 
257  if (dstFormat == AV_PIX_FMT_GRAYF32BE) {
258  c->yuv2plane1 = yuv2plane1_floatBE_altivec;
259  } else if (dstFormat == AV_PIX_FMT_GRAYF32LE) {
260  c->yuv2plane1 = yuv2plane1_floatLE_altivec;
261  }
262 
263  /* The following list of supported dstFormat values should
264  * match what's found in the body of ff_yuv2packedX_altivec() */
265  if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->needAlpha) {
266  switch (c->dstFormat) {
267  case AV_PIX_FMT_ABGR:
268  c->yuv2packedX = ff_yuv2abgr_X_altivec;
269  break;
270  case AV_PIX_FMT_BGRA:
271  c->yuv2packedX = ff_yuv2bgra_X_altivec;
272  break;
273  case AV_PIX_FMT_ARGB:
274  c->yuv2packedX = ff_yuv2argb_X_altivec;
275  break;
276  case AV_PIX_FMT_RGBA:
277  c->yuv2packedX = ff_yuv2rgba_X_altivec;
278  break;
279  case AV_PIX_FMT_BGR24:
280  c->yuv2packedX = ff_yuv2bgr24_X_altivec;
281  break;
282  case AV_PIX_FMT_RGB24:
283  c->yuv2packedX = ff_yuv2rgb24_X_altivec;
284  break;
285  }
286  }
287 #endif /* HAVE_ALTIVEC */
288 
290 }
AVPixelFormat
AVPixelFormat
Pixel format.
Definition: pixfmt.h:71
AV_PIX_FMT_BGR24
@ AV_PIX_FMT_BGR24
packed RGB 8:8:8, 24bpp, BGRBGR...
Definition: pixfmt.h:76
AV_PIX_FMT_BGRA
@ AV_PIX_FMT_BGRA
packed BGRA 8:8:8:8, 32bpp, BGRABGRA...
Definition: pixfmt.h:102
av_get_cpu_flags
int av_get_cpu_flags(void)
Return the flags which specify extensions supported by the CPU.
Definition: cpu.c:103
av_float2int
static av_always_inline uint32_t av_float2int(float f)
Reinterpret a float as a 32-bit integer.
Definition: intfloat.h:50
AV_PIX_FMT_GRAYF32LE
@ AV_PIX_FMT_GRAYF32LE
IEEE-754 single precision Y, 32bpp, little-endian.
Definition: pixfmt.h:364
is16BPS
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:703
SWS_BITEXACT
#define SWS_BITEXACT
Definition: swscale.h:91
val
static double val(void *priv, double ch)
Definition: aeval.c:78
isNBPS
static av_always_inline int isNBPS(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:717
av_cold
#define av_cold
Definition: attributes.h:90
clip
clip
Definition: af_crystalizer.c:121
float
float
Definition: af_crystalizer.c:121
AV_PIX_FMT_RGBA
@ AV_PIX_FMT_RGBA
packed RGBA 8:8:8:8, 32bpp, RGBARGBA...
Definition: pixfmt.h:100
isSemiPlanarYUV
static av_always_inline int isSemiPlanarYUV(enum AVPixelFormat pix_fmt)
Definition: swscale_internal.h:749
yuv2plane1_float
yuv2plane1_float(yuv2plane1_float_c_template, yuv2plane1_float(float, LE)
Definition: output.c:311
AV_CPU_FLAG_ALTIVEC
#define AV_CPU_FLAG_ALTIVEC
standard
Definition: cpu.h:60
AV_PIX_FMT_ABGR
@ AV_PIX_FMT_ABGR
packed ABGR 8:8:8:8, 32bpp, ABGRABGR...
Definition: pixfmt.h:101
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
AV_PIX_FMT_RGB24
@ AV_PIX_FMT_RGB24
packed RGB 8:8:8, 24bpp, RGBRGB...
Definition: pixfmt.h:75
vec_u32
#define vec_u32
Definition: util_altivec.h:38
shift
static int shift(int a, int b)
Definition: bonk.c:262
cpu.h
av_bswap32
#define av_bswap32
Definition: bswap.h:28
SWS_FULL_CHR_H_INT
#define SWS_FULL_CHR_H_INT
Definition: swscale.h:86
output_pixel
#define output_pixel(pos, val, bias, signedness)
Definition: output.c:2748
yuv2rgb_altivec.h
attributes.h
ff_sws_init_swscale_vsx
av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
Definition: swscale_vsx.c:2019
AV_PIX_FMT_ARGB
@ AV_PIX_FMT_ARGB
packed ARGB 8:8:8:8, 32bpp, ARGBARGB...
Definition: pixfmt.h:99
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:255
swscale_internal.h
AV_PIX_FMT_GRAYF32BE
@ AV_PIX_FMT_GRAYF32BE
IEEE-754 single precision Y, 32bpp, big-endian.
Definition: pixfmt.h:363
ff_sws_init_swscale_ppc
av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
Definition: swscale_altivec.c:238
swscale_ppc_template.c
util_altivec.h
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
int32_t
int32_t
Definition: audioconvert.c:56
SwsContext
Definition: swscale_internal.h:299
vec_f
#define vec_f
Definition: util_altivec.h:40
vec_u16
#define vec_u16
Definition: util_altivec.h:36
swscale.h