FFmpeg
rnd_template.c
Go to the documentation of this file.
1 /*
2  * SIMD-optimized halfpel functions are compiled twice for rnd/no_rnd
3  * Copyright (c) 2000, 2001 Fabrice Bellard
4  * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
7  * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
8  * and improved by Zdenek Kabelac <kabi@users.sf.net>
9  *
10  * This file is part of FFmpeg.
11  *
12  * FFmpeg is free software; you can redistribute it and/or
13  * modify it under the terms of the GNU Lesser General Public
14  * License as published by the Free Software Foundation; either
15  * version 2.1 of the License, or (at your option) any later version.
16  *
17  * FFmpeg is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20  * Lesser General Public License for more details.
21  *
22  * You should have received a copy of the GNU Lesser General Public
23  * License along with FFmpeg; if not, write to the Free Software
24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25  */
26 
27 #include <stddef.h>
28 #include <stdint.h>
29 
30 #include "inline_asm.h"
31 
32 // put_pixels
/*
 * "put" variant of the 8-pixel-wide xy2 half-pel interpolation.
 *
 * For every output row, each of the 8 destination bytes is computed from the
 * 2x2 source neighbourhood as
 *     (p[y][x] + p[y][x+1] + p[y+1][x] + p[y+1][x+1] + mm6) >> 2
 * and written to the destination, overwriting what was there.
 *
 * block     - destination; 8 bytes are stored per row at block + y * line_size.
 * pixels    - source; rows 0..h are read (h + 1 rows), 9 bytes from each
 *             (8-byte loads at offsets 0 and 1).
 * line_size - byte distance between consecutive rows; the same value is used
 *             for both source and destination.
 * h         - number of output rows. The loop emits two rows per iteration
 *             and only exits when h reaches exactly 0 after "subl $2", so h
 *             is expected to be a positive even number.
 *
 * Register roles inside the asm: mm7 is the zero operand for byte->word
 * unpacking, mm6 is the per-word rounding constant, and mm4/mm5 resp. mm0/mm1
 * alternately carry the horizontal pair sums (low/high four pixels) of the
 * previous source row so that every source row is loaded only once.
 *
 * NOTE(review): mm6/mm7 are set up by the MOVQ_ZERO/SET_RND macros before the
 * asm statement and no MMX register appears in the clobber list; this relies
 * on the compiler not touching MMX state in between - confirm against
 * inline_asm.h and the including file.
 */
33 av_unused STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
34  ptrdiff_t line_size, int h)
35 {
36  MOVQ_ZERO(mm7); // presumably clears mm7 (macro body lives in inline_asm.h)
37  SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
38  __asm__ volatile(
    // Prologue: horizontal pair sums of source row 0 -> mm4 (low 4), mm5 (high 4).
39  "movq (%1), %%mm0 \n\t"
40  "movq 1(%1), %%mm4 \n\t"
41  "movq %%mm0, %%mm1 \n\t"
42  "movq %%mm4, %%mm5 \n\t"
43  "punpcklbw %%mm7, %%mm0 \n\t"
44  "punpcklbw %%mm7, %%mm4 \n\t"
45  "punpckhbw %%mm7, %%mm1 \n\t"
46  "punpckhbw %%mm7, %%mm5 \n\t"
47  "paddusw %%mm0, %%mm4 \n\t"
48  "paddusw %%mm1, %%mm5 \n\t"
49  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // row byte offset = 0
50  "add %3, %1 \n\t" // pixels now points at source row 1
51  ".p2align 3 \n\t"
52  "1: \n\t"
    // First half: pair sums of the next source row -> mm0/mm1 (kept for the second half).
53  "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
54  "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
55  "movq %%mm0, %%mm1 \n\t"
56  "movq %%mm2, %%mm3 \n\t"
57  "punpcklbw %%mm7, %%mm0 \n\t"
58  "punpcklbw %%mm7, %%mm2 \n\t"
59  "punpckhbw %%mm7, %%mm1 \n\t"
60  "punpckhbw %%mm7, %%mm3 \n\t"
61  "paddusw %%mm2, %%mm0 \n\t"
62  "paddusw %%mm3, %%mm1 \n\t"
    // previous-row sums + rounding + current-row sums, then divide by 4
63  "paddusw %%mm6, %%mm4 \n\t"
64  "paddusw %%mm6, %%mm5 \n\t"
65  "paddusw %%mm0, %%mm4 \n\t"
66  "paddusw %%mm1, %%mm5 \n\t"
67  "psrlw $2, %%mm4 \n\t"
68  "psrlw $2, %%mm5 \n\t"
69  "packuswb %%mm5, %%mm4 \n\t" // back to 8 bytes
70  "movq %%mm4, (%2, %%"FF_REG_a") \n\t" // store output row
71  "add %3, %%"FF_REG_a" \n\t" // advance to the next row
72 
    // Second half: same computation with the register pairs swapped; the new
    // row's pair sums land in mm4/mm5, ready for the next loop iteration.
73  "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
74  "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
75  "movq %%mm2, %%mm3 \n\t"
76  "movq %%mm4, %%mm5 \n\t"
77  "punpcklbw %%mm7, %%mm2 \n\t"
78  "punpcklbw %%mm7, %%mm4 \n\t"
79  "punpckhbw %%mm7, %%mm3 \n\t"
80  "punpckhbw %%mm7, %%mm5 \n\t"
81  "paddusw %%mm2, %%mm4 \n\t"
82  "paddusw %%mm3, %%mm5 \n\t"
83  "paddusw %%mm6, %%mm0 \n\t"
84  "paddusw %%mm6, %%mm1 \n\t"
85  "paddusw %%mm4, %%mm0 \n\t"
86  "paddusw %%mm5, %%mm1 \n\t"
87  "psrlw $2, %%mm0 \n\t"
88  "psrlw $2, %%mm1 \n\t"
89  "packuswb %%mm1, %%mm0 \n\t"
90  "movq %%mm0, (%2, %%"FF_REG_a") \n\t"
91  "add %3, %%"FF_REG_a" \n\t"
92 
93  "subl $2, %0 \n\t" // two rows done per iteration
94  "jnz 1b \n\t"
    // %0 = h (read/write), %1 = pixels (read/write, advanced once in the prologue)
95  :"+g"(h), "+S"(pixels)
    // %2 = block, %3 = line_size widened to register width
96  :"D"(block), "r"((x86_reg)line_size)
97  :FF_REG_a, "memory");
98 }
99 
100 // avg_pixels
101 // this routine is 'slightly' suboptimal but mostly unused
/*
 * "avg" variant of the 8-pixel-wide xy2 half-pel interpolation.
 *
 * The interpolated value for each of the 8 bytes of an output row is computed
 * from the 2x2 source neighbourhood as
 *     (p[y][x] + p[y][x+1] + p[y+1][x] + p[y+1][x+1] + mm6) >> 2
 * and is then combined with the bytes already present in the destination via
 * PAVGB_MMX before being stored back.
 * NOTE(review): PAVGB_MMX(rega, regb, regr, regfe) is defined in inline_asm.h
 * and its body is not visible here; presumably it writes the byte-wise
 * average of rega and regb to regr using regfe as a 0xFE mask - confirm.
 *
 * block     - destination; 8 bytes per row at block + y * line_size are read,
 *             combined with the interpolated row and written back.
 * pixels    - source; rows 0..h are read (h + 1 rows), 9 bytes from each
 *             (8-byte loads at offsets 0 and 1).
 * line_size - byte distance between consecutive rows; the same value is used
 *             for both source and destination.
 * h         - number of output rows. The loop emits two rows per iteration
 *             and only exits when h reaches exactly 0 after "subl $2", so h
 *             is expected to be a positive even number.
 *
 * Register roles inside the asm: mm7 is the zero operand for byte->word
 * unpacking, mm6 is the per-word rounding constant, and mm4/mm5 resp. mm0/mm1
 * alternately carry the horizontal pair sums (low/high four pixels) of the
 * previous source row. mm2 and mm3 are scratch and are reused for the 0xFE
 * constant and the loaded destination row once their unpacked values have
 * been consumed.
 *
 * NOTE(review): mm6/mm7 are set up by the MOVQ_ZERO/SET_RND macros before the
 * asm statement and no MMX register appears in the clobber list; this relies
 * on the compiler not touching MMX state in between - confirm against
 * inline_asm.h and the including file.
 */
102 av_unused STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
103  ptrdiff_t line_size, int h)
104 {
105  MOVQ_ZERO(mm7); // presumably clears mm7 (macro body lives in inline_asm.h)
106  SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
107  __asm__ volatile(
    // Prologue: horizontal pair sums of source row 0 -> mm4 (low 4), mm5 (high 4).
108  "movq (%1), %%mm0 \n\t"
109  "movq 1(%1), %%mm4 \n\t"
110  "movq %%mm0, %%mm1 \n\t"
111  "movq %%mm4, %%mm5 \n\t"
112  "punpcklbw %%mm7, %%mm0 \n\t"
113  "punpcklbw %%mm7, %%mm4 \n\t"
114  "punpckhbw %%mm7, %%mm1 \n\t"
115  "punpckhbw %%mm7, %%mm5 \n\t"
116  "paddusw %%mm0, %%mm4 \n\t"
117  "paddusw %%mm1, %%mm5 \n\t"
118  "xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // row byte offset = 0
119  "add %3, %1 \n\t" // pixels now points at source row 1
120  ".p2align 3 \n\t"
121  "1: \n\t"
    // First half: pair sums of the next source row -> mm0/mm1 (kept for the second half).
122  "movq (%1, %%"FF_REG_a"), %%mm0 \n\t"
123  "movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t"
124  "movq %%mm0, %%mm1 \n\t"
125  "movq %%mm2, %%mm3 \n\t"
126  "punpcklbw %%mm7, %%mm0 \n\t"
127  "punpcklbw %%mm7, %%mm2 \n\t"
128  "punpckhbw %%mm7, %%mm1 \n\t"
129  "punpckhbw %%mm7, %%mm3 \n\t"
130  "paddusw %%mm2, %%mm0 \n\t"
131  "paddusw %%mm3, %%mm1 \n\t"
    // previous-row sums + rounding + current-row sums, then divide by 4
132  "paddusw %%mm6, %%mm4 \n\t"
133  "paddusw %%mm6, %%mm5 \n\t"
134  "paddusw %%mm0, %%mm4 \n\t"
135  "paddusw %%mm1, %%mm5 \n\t"
136  "psrlw $2, %%mm4 \n\t"
137  "psrlw $2, %%mm5 \n\t"
138  "movq (%2, %%"FF_REG_a"), %%mm3 \n\t" // current destination row
139  "packuswb %%mm5, %%mm4 \n\t" // interpolated row as 8 bytes; mm5 is free again
140  "pcmpeqd %%mm2, %%mm2 \n\t" // mm2 = all ones
141  "paddb %%mm2, %%mm2 \n\t" // 0xFF + 0xFF per byte -> every byte is 0xFE
142  PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2)
143  "movq %%mm5, (%2, %%"FF_REG_a") \n\t" // store the macro's result register
144  "add %3, %%"FF_REG_a" \n\t" // advance to the next row
145 
    // Second half: same computation with the register pairs swapped; the new
    // row's pair sums land in mm4/mm5, ready for the next loop iteration.
146  "movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
147  "movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t"
148  "movq %%mm2, %%mm3 \n\t"
149  "movq %%mm4, %%mm5 \n\t"
150  "punpcklbw %%mm7, %%mm2 \n\t"
151  "punpcklbw %%mm7, %%mm4 \n\t"
152  "punpckhbw %%mm7, %%mm3 \n\t"
153  "punpckhbw %%mm7, %%mm5 \n\t"
154  "paddusw %%mm2, %%mm4 \n\t"
155  "paddusw %%mm3, %%mm5 \n\t"
156  "paddusw %%mm6, %%mm0 \n\t"
157  "paddusw %%mm6, %%mm1 \n\t"
158  "paddusw %%mm4, %%mm0 \n\t"
159  "paddusw %%mm5, %%mm1 \n\t"
160  "psrlw $2, %%mm0 \n\t"
161  "psrlw $2, %%mm1 \n\t"
162  "movq (%2, %%"FF_REG_a"), %%mm3 \n\t" // current destination row
163  "packuswb %%mm1, %%mm0 \n\t" // interpolated row as 8 bytes; mm1 is free again
164  "pcmpeqd %%mm2, %%mm2 \n\t" // rebuild the 0xFE constant (mm2 was reused above)
165  "paddb %%mm2, %%mm2 \n\t"
166  PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2)
167  "movq %%mm1, (%2, %%"FF_REG_a") \n\t" // store the macro's result register
168  "add %3, %%"FF_REG_a" \n\t"
169 
170  "subl $2, %0 \n\t" // two rows done per iteration
171  "jnz 1b \n\t"
    // %0 = h (read/write), %1 = pixels (read/write, advanced once in the prologue)
172  :"+g"(h), "+S"(pixels)
    // %2 = block, %3 = line_size widened to register width
173  :"D"(block), "r"((x86_reg)line_size)
174  :FF_REG_a, "memory");
175 }
PAVGB_MMX
#define PAVGB_MMX(rega, regb, regr, regfe)
Definition: inline_asm.h:63
inline_asm.h
av_unused
#define av_unused
Definition: attributes.h:131
MOVQ_ZERO
#define MOVQ_ZERO(regd)
Definition: inline_asm.h:32
pixels8_xy2
av_unused STATIC void put_TMPL pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: rnd_template.c:33
DEF
#define DEF(type, name, bytes, read, write)
Definition: bytestream.h:42
avg
#define avg(a, b, c, d)
Definition: colorspacedsp_template.c:28
STATIC
#define STATIC
Definition: vf_libplacebo.c:515
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
x86_reg
int x86_reg
Definition: asm.h:72
block
The exact code depends on how similar the blocks are and how related they are to the block
Definition: filter_design.txt:207
h
h
Definition: vp9dsp_template.c:2038