105 #undef PROFILE_THE_BEAST
/* Byte-sized shorthands.  ubyte is the element type of the input plane
 * pointers and of vec_xl()'s address argument in this file; sbyte is the
 * signed counterpart. */
108 typedef unsigned char ubyte;
109 typedef signed char sbyte;
146 static const vector
unsigned char
147 perm_rgb_0 = { 0x00, 0x01, 0x10, 0x02, 0x03, 0x11, 0x04, 0x05,
148 0x12, 0x06, 0x07, 0x13, 0x08, 0x09, 0x14, 0x0a },
149 perm_rgb_1 = { 0x0b, 0x15, 0x0c, 0x0d, 0x16, 0x0e, 0x0f, 0x17,
150 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f },
151 perm_rgb_2 = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
152 0x00, 0x01, 0x18, 0x02, 0x03, 0x19, 0x04, 0x05 },
153 perm_rgb_3 = { 0x1a, 0x06, 0x07, 0x1b, 0x08, 0x09, 0x1c, 0x0a,
154 0x0b, 0x1d, 0x0c, 0x0d, 0x1e, 0x0e, 0x0f, 0x1f };
/*
 * vec_merge3: interleave three byte vectors into three output vectors.
 *
 * x0 and x1 are merged pairwise (vec_mergeh for the first half, vec_mergel
 * for the second half); x2 is then woven in through the perm_rgb_0..3 masks.
 * The results are assigned to y0, y1, y2 in that order.
 *
 * Note the parameter order: the macro is declared as (x2, x1, x0, ...), so
 * the FIRST macro argument is the one that is woven in last.
 *
 * NOTE(review): the embedded line numbering skips the lines directly before
 * and after the body, so the enclosing block (presumably do { } while (0))
 * is not visible here -- confirm against the full file.
 */
156 #define vec_merge3(x2, x1, x0, y0, y1, y2) \
158 __typeof__(x0) o0, o2, o3; \
159 o0 = vec_mergeh(x0, x1); \
160 y0 = vec_perm(o0, x2, perm_rgb_0); \
161 o2 = vec_perm(o0, x2, perm_rgb_1); \
162 o3 = vec_mergel(x0, x1); \
163 y1 = vec_perm(o3, o2, perm_rgb_2); \
164 y2 = vec_perm(o3, o2, perm_rgb_3); \
/*
 * vec_mstbgr24: interleave x0, x1, x2 with vec_merge3 (arguments passed in
 * the order given) and store the three resulting vectors with vec_st.
 *
 * ptr is post-incremented after every store, so the caller's pointer is
 * advanced by three vector elements; ptr must therefore be a modifiable
 * lvalue and is evaluated more than once.
 */
167 #define vec_mstbgr24(x0, x1, x2, ptr) \
169 __typeof__(x0) _0, _1, _2; \
170 vec_merge3(x0, x1, x2, _0, _1, _2); \
171 vec_st(_0, 0, ptr++); \
172 vec_st(_1, 0, ptr++); \
173 vec_st(_2, 0, ptr++); \
/*
 * vec_mstrgb24: same store sequence as vec_mstbgr24, but the three inputs
 * are handed to vec_merge3 in reverse order (x2, x1, x0), which swaps the
 * position of the first and last component in the output.
 *
 * ptr is post-incremented after every store (three vector elements in
 * total) and must be a modifiable lvalue.
 */
176 #define vec_mstrgb24(x0, x1, x2, ptr) \
178 __typeof__(x0) _0, _1, _2; \
179 vec_merge3(x2, x1, x0, _0, _1, _2); \
180 vec_st(_0, 0, ptr++); \
181 vec_st(_1, 0, ptr++); \
182 vec_st(_2, 0, ptr++); \
/*
 * vec_mstrgb32: interleave four byte vectors x0..x3 into 4-byte pixels and
 * store them.
 *
 * Bytes of (x0, x1) and of (x2, x3) are merged pairwise; the two results
 * are then merged again as 16-bit units, which yields x0,x1,x2,x3 quadruples.
 * The first-half merges (vec_mergeh) are stored at byte offsets 0 and 16
 * from ptr, the second-half merges (vec_mergel) at offsets 32 and 48.
 * T is the vector type used for the casts and for the store pointer.
 *
 * NOTE(review): the declaration of the temporaries _0.._3 and whatever
 * follows the last store are not visible here (the embedded line numbering
 * skips those lines) -- confirm against the full file before relying on
 * whether ptr is advanced by this macro.
 */
189 #define vec_mstrgb32(T, x0, x1, x2, x3, ptr) \
192 _0 = vec_mergeh(x0, x1); \
193 _1 = vec_mergeh(x2, x3); \
194 _2 = (T) vec_mergeh((vector unsigned short) _0, \
195 (vector unsigned short) _1); \
196 _3 = (T) vec_mergel((vector unsigned short) _0, \
197 (vector unsigned short) _1); \
198 vec_st(_2, 0 * 16, (T *) ptr); \
199 vec_st(_3, 1 * 16, (T *) ptr); \
200 _0 = vec_mergel(x0, x1); \
201 _1 = vec_mergel(x2, x3); \
202 _2 = (T) vec_mergeh((vector unsigned short) _0, \
203 (vector unsigned short) _1); \
204 _3 = (T) vec_mergel((vector unsigned short) _0, \
205 (vector unsigned short) _1); \
206 vec_st(_2, 2 * 16, (T *) ptr); \
207 vec_st(_3, 3 * 16, (T *) ptr); \
226 (vector signed short) \
227 vec_perm(x, (__typeof__(x)) { 0 }, \
228 ((vector unsigned char) { \
229 0x10, 0x00, 0x10, 0x01, 0x10, 0x02, 0x10, 0x03, \
230 0x10, 0x04, 0x10, 0x05, 0x10, 0x06, 0x10, 0x07 }))
233 (vector signed short) \
234 vec_perm(x, (__typeof__(x)) { 0 }, \
235 ((vector unsigned char) { \
236 0x10, 0x08, 0x10, 0x09, 0x10, 0x0A, 0x10, 0x0B, \
237 0x10, 0x0C, 0x10, 0x0D, 0x10, 0x0E, 0x10, 0x0F }))
/*
 * vec_unh / vec_unl: pair each byte of the first half (vec_unh, via
 * vec_mergeh) or second half (vec_unl, via vec_mergel) of x with a zero
 * byte and reinterpret the result as eight signed 16-bit lanes.
 *
 * The whole expansion and the argument are parenthesized so the cast
 * cannot be split from its operand by an operator at the use site.
 */
#define vec_unh(x) \
    ((vector signed short) vec_mergeh((x), (__typeof__(x)) { 0 }))
#define vec_unl(x) \
    ((vector signed short) vec_mergel((x), (__typeof__(x)) { 0 }))
/*
 * vec_clip_s16: clamp every signed 16-bit lane of x to [16, 235].
 * The upper bound is applied first (vec_min), then the lower bound
 * (vec_max).
 */
#define vec_clip_s16(x)                                                     \
    vec_max(vec_min(x,                                                      \
                    ((vector signed short) {                                \
                        235, 235, 235, 235, 235, 235, 235, 235 })),         \
            ((vector signed short) {                                        \
                16, 16, 16, 16, 16, 16, 16, 16 }))
/*
 * vec_packclp: pack two vectors of eight signed 16-bit lanes (x, then y)
 * into one vector of sixteen unsigned bytes.
 *
 * Negative lanes are first raised to zero with vec_max; the lanes are then
 * reinterpreted as unsigned and narrowed with vec_packs.
 *
 * The whole expansion and both arguments are parenthesized so the leading
 * cast cannot be separated from its operand at the use site.
 */
#define vec_packclp(x, y)                                                   \
    ((vector unsigned char)                                                 \
         vec_packs((vector unsigned short)                                  \
                       vec_max((x), ((vector signed short) { 0 })),         \
                   (vector unsigned short)                                  \
                       vec_max((y), ((vector signed short) { 0 }))))
255 static inline void cvtyuvtoRGB(
SwsInternal *
c, vector
signed short Y,
256 vector
signed short U, vector
signed short V,
257 vector
signed short *
R, vector
signed short *
G,
258 vector
signed short *
B)
260 vector
signed short vx, ux, uvx;
262 Y = vec_mradds(
Y,
c->CY,
c->OY);
263 U = vec_sub(
U, (vector
signed short)
264 vec_splat((vector
signed short) { 128 }, 0));
265 V = vec_sub(
V, (vector
signed short)
266 vec_splat((vector
signed short) { 128 }, 0));
269 ux = vec_sl(
U,
c->CSHIFT);
270 *
B = vec_mradds(ux,
c->CBU,
Y);
273 vx = vec_sl(
V,
c->CSHIFT);
274 *
R = vec_mradds(vx,
c->CRV,
Y);
277 uvx = vec_mradds(
U,
c->CGU,
Y);
278 *
G = vec_mradds(
V,
c->CGV, uvx);
288 static inline vector
unsigned char vec_xl(
signed long long offset,
const ubyte *addr)
290 const vector
unsigned char *v_addr = (
const vector
unsigned char *) (addr +
offset);
291 vector
unsigned char align_perm = vec_lvsl(
offset, addr);
293 return (vector
unsigned char) vec_perm(v_addr[0], v_addr[1], align_perm);
/*
 * DEFCSP420_CVT(name, out_pixels): generate the converter
 * `static int altivec_<name>(...)`.
 *
 * The generated function reads three input planes (in[0] luma, in[1] and
 * in[2] chroma) and walks the slice two luma rows at a time (outer loop
 * over h / 2, with y2i starting one luma stride below y1i), 16 pixels per
 * inner iteration.  The chroma contributions (ux, vx, uvx) are computed
 * once per iteration, split into first/second halves with
 * vec_mergeh/vec_mergel, and added to the luma of BOTH rows, i.e. one
 * chroma row is shared by two luma rows.  The sums are packed to bytes with
 * vec_packclp and handed to out_pixels(R, G, B, ptr): oute for the first
 * row of the pair, outo for the second.
 *
 * instrides_scl[] holds, per plane, the amount added to the input pointers
 * after each row pair (for luma: two strides minus the width).
 *
 * NOTE(review): the embedded line numbering skips several lines inside this
 * macro (e.g. the declarations of h, i, j, the assignments of Y0..Y3, the
 * first argument of the u/v subtraction, the per-iteration pointer
 * increments and the return) -- confirm against the full file.
 */
297 #define DEFCSP420_CVT(name, out_pixels) \
298 static int altivec_ ## name(SwsInternal *c, const unsigned char *const *in, \
299 const int *instrides, int srcSliceY, int srcSliceH, \
300 unsigned char *const *oplanes, const int *outstrides) \
302 int w = c->opts.src_w; \
305 int instrides_scl[3]; \
306 vector unsigned char y0, y1; \
308 vector signed char u, v; \
310 vector signed short Y0, Y1, Y2, Y3; \
311 vector signed short U, V; \
312 vector signed short vx, ux, uvx; \
313 vector signed short vx0, ux0, uvx0; \
314 vector signed short vx1, ux1, uvx1; \
315 vector signed short R0, G0, B0; \
316 vector signed short R1, G1, B1; \
317 vector unsigned char R, G, B; \
319 vector signed short lCY = c->CY; \
320 vector signed short lOY = c->OY; \
321 vector signed short lCRV = c->CRV; \
322 vector signed short lCBU = c->CBU; \
323 vector signed short lCGU = c->CGU; \
324 vector signed short lCGV = c->CGV; \
325 vector unsigned short lCSHIFT = c->CSHIFT; \
327 const ubyte *y1i = in[0]; \
328 const ubyte *y2i = in[0] + instrides[0]; \
329 const ubyte *ui = in[1]; \
330 const ubyte *vi = in[2]; \
332 vector unsigned char *oute, *outo; \
335 instrides_scl[0] = instrides[0] * 2 - w; \
337 instrides_scl[1] = instrides[1] - w / 2; \
339 instrides_scl[2] = instrides[2] - w / 2; \
341 for (i = 0; i < h / 2; i++) { \
342 oute = (vector unsigned char *)(oplanes[0] + outstrides[0] * \
343 (srcSliceY + i * 2)); \
344 outo = oute + (outstrides[0] >> 4); \
345 vec_dstst(outo, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 0); \
346 vec_dstst(oute, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 1); \
348 for (j = 0; j < w / 16; j++) { \
349 y0 = vec_xl(0, y1i); \
351 y1 = vec_xl(0, y2i); \
353 u = (vector signed char) vec_xl(0, ui); \
355 v = (vector signed char) vec_xl(0, vi); \
357 u = (vector signed char) \
359 (vector signed char) \
360 vec_splat((vector signed char) { 128 }, 0)); \
361 v = (vector signed char) \
363 (vector signed char) \
364 vec_splat((vector signed char) { 128 }, 0)); \
366 U = vec_unpackh(u); \
367 V = vec_unpackh(v); \
374 Y0 = vec_mradds(Y0, lCY, lOY); \
375 Y1 = vec_mradds(Y1, lCY, lOY); \
376 Y2 = vec_mradds(Y2, lCY, lOY); \
377 Y3 = vec_mradds(Y3, lCY, lOY); \
380 ux = vec_sl(U, lCSHIFT); \
381 ux = vec_mradds(ux, lCBU, (vector signed short) { 0 }); \
382 ux0 = vec_mergeh(ux, ux); \
383 ux1 = vec_mergel(ux, ux); \
386 vx = vec_sl(V, lCSHIFT); \
387 vx = vec_mradds(vx, lCRV, (vector signed short) { 0 }); \
388 vx0 = vec_mergeh(vx, vx); \
389 vx1 = vec_mergel(vx, vx); \
392 uvx = vec_mradds(U, lCGU, (vector signed short) { 0 }); \
393 uvx = vec_mradds(V, lCGV, uvx); \
394 uvx0 = vec_mergeh(uvx, uvx); \
395 uvx1 = vec_mergel(uvx, uvx); \
397 R0 = vec_add(Y0, vx0); \
398 G0 = vec_add(Y0, uvx0); \
399 B0 = vec_add(Y0, ux0); \
400 R1 = vec_add(Y1, vx1); \
401 G1 = vec_add(Y1, uvx1); \
402 B1 = vec_add(Y1, ux1); \
404 R = vec_packclp(R0, R1); \
405 G = vec_packclp(G0, G1); \
406 B = vec_packclp(B0, B1); \
408 out_pixels(R, G, B, oute); \
410 R0 = vec_add(Y2, vx0); \
411 G0 = vec_add(Y2, uvx0); \
412 B0 = vec_add(Y2, ux0); \
413 R1 = vec_add(Y3, vx1); \
414 G1 = vec_add(Y3, uvx1); \
415 B1 = vec_add(Y3, ux1); \
416 R = vec_packclp(R0, R1); \
417 G = vec_packclp(G0, G1); \
418 B = vec_packclp(B0, B1); \
421 out_pixels(R, G, B, outo); \
429 ui += instrides_scl[1]; \
430 vi += instrides_scl[2]; \
431 y1i += instrides_scl[0]; \
432 y2i += instrides_scl[0]; \
/*
 * Pixel-store adaptors.  Each takes three packed component byte vectors
 * (a, b, c -- the call sites pass R, G, B) plus a destination pointer and
 * forwards them to the matching store macro in the component order the
 * adaptor's name spells out.
 *
 * The 32-bit variants add a fourth, constant component: a vector of a's
 * type with every element set to 255.
 */
#define vec_opaque_alpha(ref) \
    ((__typeof__(ref)) vec_splat((__typeof__(ref)){ 255 }, 0))

#define out_abgr(a, b, c, ptr) \
    vec_mstrgb32(__typeof__(a), vec_opaque_alpha(a), c, b, a, ptr)
#define out_bgra(a, b, c, ptr) \
    vec_mstrgb32(__typeof__(a), c, b, a, vec_opaque_alpha(a), ptr)
#define out_rgba(a, b, c, ptr) \
    vec_mstrgb32(__typeof__(a), a, b, c, vec_opaque_alpha(a), ptr)
#define out_argb(a, b, c, ptr) \
    vec_mstrgb32(__typeof__(a), vec_opaque_alpha(a), a, b, c, ptr)

/* 24-bit variants: no alpha, straight pass-through to the 3-byte stores. */
#define out_rgb24(a, b, c, ptr) vec_mstrgb24(a, b, c, ptr)
#define out_bgr24(a, b, c, ptr) vec_mstbgr24(a, b, c, ptr)
/* Instantiate one planar converter per packed output layout.  Each line
 * defines a static function altivec_yuv2_<layout> that stores its pixels
 * through the out_<layout> adaptor named as the second argument. */
448 DEFCSP420_CVT(yuv2_abgr, out_abgr)
449 DEFCSP420_CVT(yuv2_bgra, out_bgra)
450 DEFCSP420_CVT(yuv2_rgba, out_rgba)
451 DEFCSP420_CVT(yuv2_argb, out_argb)
452 DEFCSP420_CVT(yuv2_rgb24, out_rgb24)
453 DEFCSP420_CVT(yuv2_bgr24, out_bgr24)
457 static const vector
unsigned char
458 demux_u = { 0x10, 0x00, 0x10, 0x00,
459 0x10, 0x04, 0x10, 0x04,
460 0x10, 0x08, 0x10, 0x08,
461 0x10, 0x0c, 0x10, 0x0c },
462 demux_v = { 0x10, 0x02, 0x10, 0x02,
463 0x10, 0x06, 0x10, 0x06,
464 0x10, 0x0A, 0x10, 0x0A,
465 0x10, 0x0E, 0x10, 0x0E },
466 demux_y = { 0x10, 0x01, 0x10, 0x03,
467 0x10, 0x05, 0x10, 0x07,
468 0x10, 0x09, 0x10, 0x0B,
469 0x10, 0x0D, 0x10, 0x0F };
/*
 * altivec_uyvy_rgb32: convert packed UYVY input to packed RGB output.
 *
 * Output starts at oplanes[0] + srcSliceY * outstrides[0].  For every row
 * the inner loop handles 16 pixels per iteration: two 16-byte vec_ld loads
 * (offsets 0 and 16 from img) are each split into U, V and Y lanes with the
 * demux_u / demux_v / demux_y masks, converted with cvtyuvtoRGB, and the two
 * halves are packed to bytes with vec_packclp.
 *
 * NOTE(review): the embedded line numbering skips several lines of this
 * function (the declarations of h, i, j and img, the assignment targets of
 * the vec_perm calls, the pixel store and the input pointer advance, the
 * return) -- confirm against the full file.
 */
474 static int altivec_uyvy_rgb32(
SwsInternal *
c,
const unsigned char *
const *in,
475 const int *instrides,
int srcSliceY,
int srcSliceH,
476 unsigned char *
const *oplanes,
const int *outstrides)
478 int w =
c->opts.src_w;
481 vector
unsigned char uyvy;
482 vector
signed short Y,
U,
V;
483 vector
signed short R0, G0,
B0,
R1, G1,
B1;
484 vector
unsigned char R,
G,
B;
485 vector
unsigned char *
out;
489 out = (vector
unsigned char *) (oplanes[0] + srcSliceY * outstrides[0]);
491 for (
i = 0;
i <
h;
i++)
492 for (j = 0; j <
w / 16; j++) {
/* first 16 input bytes of the group -> R0/G0/B0 */
493 uyvy = vec_ld(0,
img);
496 vec_perm(uyvy, (vector
unsigned char) { 0 }, demux_u);
498 vec_perm(uyvy, (vector
unsigned char) { 0 }, demux_v);
500 vec_perm(uyvy, (vector
unsigned char) { 0 }, demux_y);
502 cvtyuvtoRGB(
c,
Y,
U,
V, &
R0, &G0, &
B0);
/* next 16 input bytes of the group -> R1/G1/B1 */
504 uyvy = vec_ld(16,
img);
507 vec_perm(uyvy, (vector
unsigned char) { 0 }, demux_u);
509 vec_perm(uyvy, (vector
unsigned char) { 0 }, demux_v);
511 vec_perm(uyvy, (vector
unsigned char) { 0 }, demux_y);
513 cvtyuvtoRGB(
c,
Y,
U,
V, &
R1, &G1, &
B1);
/* narrow both halves to one byte vector per component */
515 R = vec_packclp(
R0,
R1);
516 G = vec_packclp(G0, G1);
517 B = vec_packclp(
B0,
B1);
548 if ((
c->opts.src_w & 0
xf) != 0)
551 switch (
c->opts.src_format) {
558 if ((
c->opts.src_h & 0x1) != 0)
568 switch (
c->opts.dst_format) {
571 return altivec_yuv2_rgb24;
574 return altivec_yuv2_bgr24;
577 return altivec_yuv2_argb;
580 return altivec_yuv2_abgr;
583 return altivec_yuv2_rgba;
586 return altivec_yuv2_bgra;
587 default:
return NULL;
593 switch (
c->opts.dst_format) {
596 return altivec_uyvy_rgb32;
597 default:
return NULL;
607 const int inv_table[4],
615 vector
signed short vec;
621 buf.tmp[0] = ((0xffffLL) * contrast >> 8) >> 9;
622 buf.tmp[1] = -256 * brightness;
623 buf.tmp[2] = (inv_table[0] >> 3) * (contrast >> 16) * (
saturation >> 16);
624 buf.tmp[3] = (inv_table[1] >> 3) * (contrast >> 16) * (
saturation >> 16);
625 buf.tmp[4] = -((inv_table[2] >> 1) * (contrast >> 16) * (
saturation >> 16));
626 buf.tmp[5] = -((inv_table[3] >> 1) * (contrast >> 16) * (
saturation >> 16));
628 c->CSHIFT = (vector
unsigned short) vec_splat_u16(2);
629 c->CY = vec_splat((vector
signed short) buf.vec, 0);
630 c->OY = vec_splat((vector
signed short) buf.vec, 1);
631 c->CRV = vec_splat((vector
signed short) buf.vec, 2);
632 c->CBU = vec_splat((vector
signed short) buf.vec, 3);
633 c->CGU = vec_splat((vector
signed short) buf.vec, 4);
634 c->CGV = vec_splat((vector
signed short) buf.vec, 5);
/*
 * Fragment: parameter list and body of the vertical-filter + packed-RGB
 * output routine (its own diagnostics call it altivec_yuv2packedX; the
 * line carrying the function name is not part of this listing).
 *
 * For each group of 16 output pixels (i advances by 16) it
 *   - accumulates lumFilterSize luma rows into Y0/Y1 and chrFilterSize
 *     chroma rows into U/V with vec_mradds, using the coefficient vectors
 *     at c->vYCoeffsBank + dstY * lumFilterSize and
 *     c->vCCoeffsBank + dstY * chrFilterSize,
 *   - shifts the luma right by SCL and clamps it with vec_clip_s16,
 *   - duplicates every chroma lane with vec_mergeh/vec_mergel so chroma
 *     matches the luma width,
 *   - converts with cvtyuvtoRGB and packs to bytes with vec_packclp.
 *
 * The same sequence appears a second time further down; that copy stores
 * through the out_* adaptors into a scratch buffer (nout) and then copies
 * from scratch to dest with memcpy.
 *
 * NOTE(review): the embedded line numbering skips many lines here (the
 * size parameters, local declarations, accumulator initialisation, the
 * switch on the output format, closing braces) -- confirm against the full
 * file.
 */
642 const int16_t *lumFilter,
643 const int16_t **lumSrc,
645 const int16_t *chrFilter,
646 const int16_t **chrUSrc,
647 const int16_t **chrVSrc,
649 const int16_t **alpSrc,
655 vector
signed short X, X0, X1, Y0, U0, V0, Y1, U1, V1,
U,
V;
656 vector
signed short R0, G0,
B0,
R1, G1,
B1;
658 vector
unsigned char R,
G,
B;
659 vector
unsigned char *
out, *nout;
661 vector
signed short RND = vec_splat_s16(1 << 3);
662 vector
unsigned short SCL = vec_splat_u16(4);
665 vector
signed short *YCoeffs, *CCoeffs;
667 YCoeffs =
c->vYCoeffsBank + dstY * lumFilterSize;
668 CCoeffs =
c->vCCoeffsBank + dstY * chrFilterSize;
670 out = (vector
unsigned char *) dest;
672 for (
i = 0;
i < dstW;
i += 16) {
676 for (j = 0; j < lumFilterSize; j++) {
677 X0 = vec_ld(0, &lumSrc[j][
i]);
678 X1 = vec_ld(16, &lumSrc[j][
i]);
679 Y0 = vec_mradds(X0, YCoeffs[j], Y0);
680 Y1 = vec_mradds(X1, YCoeffs[j], Y1);
686 for (j = 0; j < chrFilterSize; j++) {
687 X = vec_ld(0, &chrUSrc[j][
i / 2]);
688 U = vec_mradds(
X, CCoeffs[j],
U);
689 X = vec_ld(0, &chrVSrc[j][
i / 2]);
690 V = vec_mradds(
X, CCoeffs[j],
V);
694 Y0 = vec_sra(Y0, SCL);
695 Y1 = vec_sra(Y1, SCL);
699 Y0 = vec_clip_s16(Y0);
700 Y1 = vec_clip_s16(Y1);
713 U0 = vec_mergeh(
U,
U);
714 V0 = vec_mergeh(
V,
V);
716 U1 = vec_mergel(
U,
U);
717 V1 = vec_mergel(
V,
V);
719 cvtyuvtoRGB(
c, Y0, U0, V0, &
R0, &G0, &
B0);
720 cvtyuvtoRGB(
c, Y1, U1, V1, &
R1, &G1, &
B1);
722 R = vec_packclp(
R0,
R1);
723 G = vec_packclp(G0, G1);
724 B = vec_packclp(
B0,
B1);
/* one-shot diagnostic: the static flag keeps the message from repeating */
749 static int printed_error_message;
750 if (!printed_error_message) {
752 "altivec_yuv2packedX doesn't support %s output\n",
754 printed_error_message = 1;
/* second copy of the filter/convert sequence; its result goes to scratch */
767 for (j = 0; j < lumFilterSize; j++) {
768 X0 = vec_ld(0, &lumSrc[j][
i]);
769 X1 = vec_ld(16, &lumSrc[j][
i]);
770 Y0 = vec_mradds(X0, YCoeffs[j], Y0);
771 Y1 = vec_mradds(X1, YCoeffs[j], Y1);
777 for (j = 0; j < chrFilterSize; j++) {
778 X = vec_ld(0, &chrUSrc[j][
i / 2]);
779 U = vec_mradds(
X, CCoeffs[j],
U);
780 X = vec_ld(0, &chrVSrc[j][
i / 2]);
781 V = vec_mradds(
X, CCoeffs[j],
V);
785 Y0 = vec_sra(Y0, SCL);
786 Y1 = vec_sra(Y1, SCL);
790 Y0 = vec_clip_s16(Y0);
791 Y1 = vec_clip_s16(Y1);
804 U0 = vec_mergeh(
U,
U);
805 V0 = vec_mergeh(
V,
V);
807 U1 = vec_mergel(
U,
U);
808 V1 = vec_mergel(
V,
V);
810 cvtyuvtoRGB(
c, Y0, U0, V0, &
R0, &G0, &
B0);
811 cvtyuvtoRGB(
c, Y1, U1, V1, &
R1, &G1, &
B1);
813 R = vec_packclp(
R0,
R1);
814 G = vec_packclp(G0, G1);
815 B = vec_packclp(
B0,
B1);
817 nout = (vector
unsigned char *) scratch;
820 out_abgr(
R,
G,
B, nout);
823 out_bgra(
R,
G,
B, nout);
826 out_rgba(
R,
G,
B, nout);
829 out_argb(
R,
G,
B, nout);
832 out_rgb24(
R,
G,
B, nout);
835 out_bgr24(
R,
G,
B, nout);
840 "altivec_yuv2packedX doesn't support %s output\n",
/* NOTE(review): dest is indexed in uint32_t (4-byte) units here, yet the
 * length handed to memcpy is (dstW - i) / 4 BYTES.  If the intent is to copy
 * the remaining dstW - i four-byte pixels, the byte count would be
 * (dstW - i) * 4; as written only a fraction of one pixel per four remaining
 * pixels is copied.  The 4-byte indexing also looks questionable for the
 * 24-bit outputs handled above.  Confirm against the full function before
 * changing. */
845 memcpy(&((uint32_t *) dest)[
i], scratch, (dstW -
i) / 4);
849 #define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \
850 void ff_yuv2 ## suffix ## _X_altivec(SwsInternal *c, \
851 const int16_t *lumFilter, \
852 const int16_t **lumSrc, \
854 const int16_t *chrFilter, \
855 const int16_t **chrUSrc, \
856 const int16_t **chrVSrc, \
858 const int16_t **alpSrc, \
859 uint8_t *dest, int dstW, int dstY) \
861 yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, \
862 chrFilter, chrUSrc, chrVSrc, \
863 chrFilterSize, alpSrc, \
864 dest, dstW, dstY, pixfmt); \