FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
simple_idct.c
Go to the documentation of this file.
1 /*
2  * Simple IDCT MMX
3  *
4  * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 #include "libavcodec/simple_idct.h"
23 #include "libavutil/mem.h"
24 #include "libavutil/x86/asm.h"
25 #include "idctdsp.h"
26 
27 #if HAVE_INLINE_ASM
28 
29 /* Unrounded values of cos(i*M_PI/16)*sqrt(2)*(1<<14) for i = 0..7, one per line:
30 23170.475006
31 22725.260826
32 21406.727617
33 19265.545870
34 16384.000000
35 12872.826198
36 8866.956905
37 4520.335430
38 */
39 #define C0 23170 // i=0: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated (round to nearest)
40 #define C1 22725 // i=1: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated (round to nearest)
41 #define C2 21407 // i=2: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated (round to nearest)
42 #define C3 19266 // i=3: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated (round to nearest)
43 #define C4 16383 // i=4: cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5; note the minus: the exact value is 16384, this constant is one below it
44 #define C5 12873 // i=5: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated (round to nearest)
45 #define C6 8867 // i=6: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated (round to nearest)
46 #define C7 4520 // i=7: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated (round to nearest)
47 
48 #define ROW_SHIFT 11 // sets the rounding term 1<<(ROW_SHIFT-1) in coeffs[]; the row-pass asm invocations pass the literal 11 as their shift, so keep both in sync
49 #define COL_SHIFT 20 // 6 (NOTE(review): meaning of this "6" is unclear - confirm); the column-pass IDCT() invocations pass the literal 20 as their shift, so keep both in sync
50 
51 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL; // keeps the upper 16-bit word of each 32-bit half; DC_COND_IDCT ANDs it with the first source quadword so the low words (r0/R0 in the asm comments) are excluded from its all-zero test
52 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL; // 1<<18 in the low 32-bit half only; added between "pslld $16" and "psrad $13" on the DC-only path of DC_COND_IDCT
53 
54 DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= { // rows of four int16_t (8 bytes each); byte offsets are noted per row. NOTE(review): the asm addresses these as N(%2) - confirm %2 is bound to this table
55  1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, // offset 0: row-pass rounder, 1<<(ROW_SHIFT-1) in each 32-bit half (passed as "paddd (%2)")
56 // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
57 // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
58  1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, // offset 8: rounder for the first row (passed as "paddd 8(%2)"); its low 32-bit half carries an extra 1<<16
59  // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :) i.e. (524288/16383)<<11 = 32<<11 = 1<<16, the 1 in the second 16-bit word
60 // 0, 0, 0, 0,
61 // 0, 0, 0, 0,
62 
63  C4, C4, C4, C4, // offset 16
64  C4, -C4, C4, -C4, // offset 24
65 
66  C2, C6, C2, C6, // offset 32
67  C6, -C2, C6, -C2, // offset 40
68 
69  C1, C3, C1, C3, // offset 48
70  C5, C7, C5, C7, // offset 56
71 
72  C3, -C7, C3, -C7, // offset 64
73 -C1, -C5, -C1, -C5, // offset 72
74 
75  C5, -C1, C5, -C1, // offset 80
76  C7, C3, C7, C3, // offset 88
77 
78  C7, -C5, C7, -C5, // offset 96
79  C3, -C1, C3, -C1 // offset 104
80 };
81 
82 static inline void idct(int16_t *block)
83 {
84  LOCAL_ALIGNED_8(int64_t, align_tmp, [16]);
85  int16_t * const temp= (int16_t*)align_tmp;
86 
87  __asm__ volatile(
88 #if 0 //Alternative, simpler variant
89 
90 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
91  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
92  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
93  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
94  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
95  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
96  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
97  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
98  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
99  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
100  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
101  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
102  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
103  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
104  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
105  #rounder ", %%mm4 \n\t"\
106  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
107  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
108  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
109  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
110  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
111  #rounder ", %%mm0 \n\t"\
112  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
113  "paddd %%mm0, %%mm0 \n\t" \
114  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
115  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
116  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
117  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
118  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
119  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
120  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
121  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
122  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
123  "psrad $" #shift ", %%mm7 \n\t"\
124  "psrad $" #shift ", %%mm4 \n\t"\
125  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
126  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
127  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
128  "psrad $" #shift ", %%mm1 \n\t"\
129  "psrad $" #shift ", %%mm2 \n\t"\
130  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
131  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
132  "movq %%mm7, " #dst " \n\t"\
133  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
134  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
135  "movq %%mm2, 24+" #dst " \n\t"\
136  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
137  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
138  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
139  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
140  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
141  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
142  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
143  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
144  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
145  "psrad $" #shift ", %%mm2 \n\t"\
146  "psrad $" #shift ", %%mm0 \n\t"\
147  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
148  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
149  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
150  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
151  "psrad $" #shift ", %%mm6 \n\t"\
152  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
153  "movq %%mm2, 8+" #dst " \n\t"\
154  "psrad $" #shift ", %%mm4 \n\t"\
155  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
156  "movq %%mm4, 16+" #dst " \n\t"\
157 
158 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
159  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
160  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
161  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
162  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
163  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
164  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
165  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
166  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
167  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
168  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
169  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
170  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
171  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
172  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
173  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
174  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
175  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
176  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
177  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
178  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
179  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
180  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
181  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
182  "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
183  "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
184  "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
185  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
186  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
187  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
188  "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
189  "psrad $" #shift ", %%mm7 \n\t"\
190  "psrad $" #shift ", %%mm4 \n\t"\
191  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
192  "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
193  "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
194  "psrad $" #shift ", %%mm0 \n\t"\
195  "psrad $" #shift ", %%mm2 \n\t"\
196  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
197  "movd %%mm7, " #dst " \n\t"\
198  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
199  "movd %%mm0, 16+" #dst " \n\t"\
200  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
201  "movd %%mm2, 96+" #dst " \n\t"\
202  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
203  "movd %%mm4, 112+" #dst " \n\t"\
204  "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
205  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
206  "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
207  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
208  "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
209  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
210  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
211  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
212  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
213  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
214  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
215  "psrad $" #shift ", %%mm2 \n\t"\
216  "psrad $" #shift ", %%mm5 \n\t"\
217  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
218  "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
219  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
220  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
221  "psrad $" #shift ", %%mm6 \n\t"\
222  "psrad $" #shift ", %%mm4 \n\t"\
223  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
224  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
225  "movd %%mm2, 32+" #dst " \n\t"\
226  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
227  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
228  "movd %%mm6, 48+" #dst " \n\t"\
229  "movd %%mm4, 64+" #dst " \n\t"\
230  "movd %%mm5, 80+" #dst " \n\t"\
231 
232 
233 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
234  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
235  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
236  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
237  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
238  "movq "MANGLE(wm1010)", %%mm4 \n\t"\
239  "pand %%mm0, %%mm4 \n\t"\
240  "por %%mm1, %%mm4 \n\t"\
241  "por %%mm2, %%mm4 \n\t"\
242  "por %%mm3, %%mm4 \n\t"\
243  "packssdw %%mm4,%%mm4 \n\t"\
244  "movd %%mm4, %%eax \n\t"\
245  "orl %%eax, %%eax \n\t"\
246  "jz 1f \n\t"\
247  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
248  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
249  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
250  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
251  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
252  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
253  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
254  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
255  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
256  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
257  #rounder ", %%mm4 \n\t"\
258  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
259  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
260  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
261  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
262  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
263  #rounder ", %%mm0 \n\t"\
264  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
265  "paddd %%mm0, %%mm0 \n\t" \
266  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
267  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
268  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
269  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
270  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
271  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
272  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
273  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
274  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
275  "psrad $" #shift ", %%mm7 \n\t"\
276  "psrad $" #shift ", %%mm4 \n\t"\
277  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
278  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
279  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
280  "psrad $" #shift ", %%mm1 \n\t"\
281  "psrad $" #shift ", %%mm2 \n\t"\
282  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
283  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
284  "movq %%mm7, " #dst " \n\t"\
285  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
286  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
287  "movq %%mm2, 24+" #dst " \n\t"\
288  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
289  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
290  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
291  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
292  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
293  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
294  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
295  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
296  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
297  "psrad $" #shift ", %%mm2 \n\t"\
298  "psrad $" #shift ", %%mm0 \n\t"\
299  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
300  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
301  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
302  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
303  "psrad $" #shift ", %%mm6 \n\t"\
304  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
305  "movq %%mm2, 8+" #dst " \n\t"\
306  "psrad $" #shift ", %%mm4 \n\t"\
307  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
308  "movq %%mm4, 16+" #dst " \n\t"\
309  "jmp 2f \n\t"\
310  "1: \n\t"\
311  "pslld $16, %%mm0 \n\t"\
312  "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
313  "psrad $13, %%mm0 \n\t"\
314  "packssdw %%mm0, %%mm0 \n\t"\
315  "movq %%mm0, " #dst " \n\t"\
316  "movq %%mm0, 8+" #dst " \n\t"\
317  "movq %%mm0, 16+" #dst " \n\t"\
318  "movq %%mm0, 24+" #dst " \n\t"\
319  "2: \n\t"
320 
321 
322 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
323 ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
324 /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
325 ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
326 ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
327 
328 DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
329 DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
330 DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
331 
332 
333 //IDCT( src0, src4, src1, src5, dst, shift)
334 COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
335 COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
336 COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
337 COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
338 
339 #else
340 
341 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
342  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
343  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
344  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
345  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
346  "movq "MANGLE(wm1010)", %%mm4 \n\t"\
347  "pand %%mm0, %%mm4 \n\t"\
348  "por %%mm1, %%mm4 \n\t"\
349  "por %%mm2, %%mm4 \n\t"\
350  "por %%mm3, %%mm4 \n\t"\
351  "packssdw %%mm4,%%mm4 \n\t"\
352  "movd %%mm4, %%eax \n\t"\
353  "orl %%eax, %%eax \n\t"\
354  "jz 1f \n\t"\
355  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
356  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
357  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
358  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
359  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
360  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
361  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
362  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
363  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
364  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
365  #rounder ", %%mm4 \n\t"\
366  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
367  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
368  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
369  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
370  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
371  #rounder ", %%mm0 \n\t"\
372  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
373  "paddd %%mm0, %%mm0 \n\t" \
374  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
375  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
376  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
377  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
378  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
379  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
380  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
381  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
382  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
383  "psrad $" #shift ", %%mm7 \n\t"\
384  "psrad $" #shift ", %%mm4 \n\t"\
385  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
386  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
387  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
388  "psrad $" #shift ", %%mm1 \n\t"\
389  "psrad $" #shift ", %%mm2 \n\t"\
390  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
391  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
392  "movq %%mm7, " #dst " \n\t"\
393  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
394  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
395  "movq %%mm2, 24+" #dst " \n\t"\
396  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
397  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
398  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
399  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
400  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
401  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
402  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
403  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
404  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
405  "psrad $" #shift ", %%mm2 \n\t"\
406  "psrad $" #shift ", %%mm0 \n\t"\
407  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
408  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
409  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
410  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
411  "psrad $" #shift ", %%mm6 \n\t"\
412  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
413  "movq %%mm2, 8+" #dst " \n\t"\
414  "psrad $" #shift ", %%mm4 \n\t"\
415  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
416  "movq %%mm4, 16+" #dst " \n\t"\
417  "jmp 2f \n\t"\
418  "1: \n\t"\
419  "pslld $16, %%mm0 \n\t"\
420  "paddd "MANGLE(d40000)", %%mm0 \n\t"\
421  "psrad $13, %%mm0 \n\t"\
422  "packssdw %%mm0, %%mm0 \n\t"\
423  "movq %%mm0, " #dst " \n\t"\
424  "movq %%mm0, 8+" #dst " \n\t"\
425  "movq %%mm0, 16+" #dst " \n\t"\
426  "movq %%mm0, 24+" #dst " \n\t"\
427  "2: \n\t"
428 
429 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
430  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
431  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
432  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
433  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
434  "movq %%mm0, %%mm4 \n\t"\
435  "por %%mm1, %%mm4 \n\t"\
436  "por %%mm2, %%mm4 \n\t"\
437  "por %%mm3, %%mm4 \n\t"\
438  "packssdw %%mm4,%%mm4 \n\t"\
439  "movd %%mm4, %%eax \n\t"\
440  "orl %%eax, %%eax \n\t"\
441  "jz " #bt " \n\t"\
442  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
443  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
444  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
445  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
446  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
447  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
448  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
449  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
450  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
451  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
452  #rounder ", %%mm4 \n\t"\
453  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
454  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
455  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
456  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
457  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
458  #rounder ", %%mm0 \n\t"\
459  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
460  "paddd %%mm0, %%mm0 \n\t" \
461  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
462  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
463  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
464  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
465  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
466  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
467  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
468  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
469  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
470  "psrad $" #shift ", %%mm7 \n\t"\
471  "psrad $" #shift ", %%mm4 \n\t"\
472  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
473  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
474  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
475  "psrad $" #shift ", %%mm1 \n\t"\
476  "psrad $" #shift ", %%mm2 \n\t"\
477  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
478  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
479  "movq %%mm7, " #dst " \n\t"\
480  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
481  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
482  "movq %%mm2, 24+" #dst " \n\t"\
483  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
484  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
485  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
486  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
487  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
488  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
489  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
490  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
491  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
492  "psrad $" #shift ", %%mm2 \n\t"\
493  "psrad $" #shift ", %%mm0 \n\t"\
494  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
495  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
496  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
497  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
498  "psrad $" #shift ", %%mm6 \n\t"\
499  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
500  "movq %%mm2, 8+" #dst " \n\t"\
501  "psrad $" #shift ", %%mm4 \n\t"\
502  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
503  "movq %%mm4, 16+" #dst " \n\t"\
504 
505 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
506  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
507  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
508  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
509  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
510  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
511  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
512  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
513  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
514  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
515  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
516  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
517  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
518  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
519  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
520  #rounder ", %%mm4 \n\t"\
521  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
522  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
523  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
524  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
525  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
526  #rounder ", %%mm0 \n\t"\
527  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
528  "paddd %%mm0, %%mm0 \n\t" \
529  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
530  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
531  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
532  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
533  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
534  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
535  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
536  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
537  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
538  "psrad $" #shift ", %%mm7 \n\t"\
539  "psrad $" #shift ", %%mm4 \n\t"\
540  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
541  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
542  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
543  "psrad $" #shift ", %%mm1 \n\t"\
544  "psrad $" #shift ", %%mm2 \n\t"\
545  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
546  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
547  "movq %%mm7, " #dst " \n\t"\
548  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
549  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
550  "movq %%mm2, 24+" #dst " \n\t"\
551  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
552  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
553  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
554  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
555  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
556  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
557  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
558  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
559  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
560  "psrad $" #shift ", %%mm2 \n\t"\
561  "psrad $" #shift ", %%mm0 \n\t"\
562  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
563  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
564  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
565  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
566  "psrad $" #shift ", %%mm6 \n\t"\
567  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
568  "movq %%mm2, 8+" #dst " \n\t"\
569  "psrad $" #shift ", %%mm4 \n\t"\
570  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
571  "movq %%mm4, 16+" #dst " \n\t"\
572 
573 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
574 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
575 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
576 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
577 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
578 
579 #undef IDCT
580 #define IDCT(src0, src4, src1, src5, dst, shift) \
581  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
582  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
583  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
584  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
585  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
586  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
587  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
588  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
589  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
590  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
591  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
592  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
593  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
594  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
595  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
596  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
597  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
598  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
599  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
600  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
601  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
602  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
603  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
604  "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
605  "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
606  "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
607  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
608  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
609  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
610  "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
611  "psrad $" #shift ", %%mm7 \n\t"\
612  "psrad $" #shift ", %%mm4 \n\t"\
613  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
614  "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
615  "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
616  "psrad $" #shift ", %%mm0 \n\t"\
617  "psrad $" #shift ", %%mm2 \n\t"\
618  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
619  "movd %%mm7, " #dst " \n\t"\
620  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
621  "movd %%mm0, 16+" #dst " \n\t"\
622  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
623  "movd %%mm2, 96+" #dst " \n\t"\
624  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
625  "movd %%mm4, 112+" #dst " \n\t"\
626  "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
627  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
628  "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
629  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
630  "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
631  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
632  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
633  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
634  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
635  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
636  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
637  "psrad $" #shift ", %%mm2 \n\t"\
638  "psrad $" #shift ", %%mm5 \n\t"\
639  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
640  "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
641  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
642  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
643  "psrad $" #shift ", %%mm6 \n\t"\
644  "psrad $" #shift ", %%mm4 \n\t"\
645  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
646  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
647  "movd %%mm2, 32+" #dst " \n\t"\
648  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
649  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
650  "movd %%mm6, 48+" #dst " \n\t"\
651  "movd %%mm4, 64+" #dst " \n\t"\
652  "movd %%mm5, 80+" #dst " \n\t"
653 
654 
655 //IDCT( src0, src4, src1, src5, dst, shift)
656 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
657 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
658 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
659 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
660  "jmp 9f \n\t"
661 
662  "# .p2align 4 \n\t"\
663  "4: \n\t"
664 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
665 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
666 
667 #undef IDCT
668 #define IDCT(src0, src4, src1, src5, dst, shift) \
669  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
670  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
671  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
672  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
673  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
674  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
675  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
676  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
677  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
678  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
679  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
680  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
681  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
682  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
683  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
684  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
685  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
686  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
687  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
688  "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
689  "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
690  "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
691  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
692  "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
693  "psrad $" #shift ", %%mm1 \n\t"\
694  "psrad $" #shift ", %%mm4 \n\t"\
695  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
696  "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
697  "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
698  "psrad $" #shift ", %%mm0 \n\t"\
699  "psrad $" #shift ", %%mm2 \n\t"\
700  "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
701  "movd %%mm1, " #dst " \n\t"\
702  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
703  "movd %%mm0, 16+" #dst " \n\t"\
704  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
705  "movd %%mm2, 96+" #dst " \n\t"\
706  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
707  "movd %%mm4, 112+" #dst " \n\t"\
708  "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
709  "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
710  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
711  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
712  "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
713  "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
714  "psrad $" #shift ", %%mm2 \n\t"\
715  "psrad $" #shift ", %%mm5 \n\t"\
716  "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
717  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
718  "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
719  "psrad $" #shift ", %%mm6 \n\t"\
720  "psrad $" #shift ", %%mm1 \n\t"\
721  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
722  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
723  "movd %%mm2, 32+" #dst " \n\t"\
724  "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
725  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
726  "movd %%mm6, 48+" #dst " \n\t"\
727  "movd %%mm1, 64+" #dst " \n\t"\
728  "movd %%mm5, 80+" #dst " \n\t"
729 
730 //IDCT( src0, src4, src1, src5, dst, shift)
731 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
732 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
733 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
734 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
735  "jmp 9f \n\t"
736 
737  "# .p2align 4 \n\t"\
738  "6: \n\t"
739 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
740 
741 #undef IDCT
742 #define IDCT(src0, src4, src1, src5, dst, shift) \
743  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
744  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
745  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
746  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
747  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
748  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
749  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
750  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
751  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
752  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
753  "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
754  "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
755  "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
756  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
757  "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
758  "psrad $" #shift ", %%mm1 \n\t"\
759  "psrad $" #shift ", %%mm4 \n\t"\
760  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
761  "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
762  "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
763  "psrad $" #shift ", %%mm0 \n\t"\
764  "psrad $" #shift ", %%mm2 \n\t"\
765  "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
766  "movd %%mm1, " #dst " \n\t"\
767  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
768  "movd %%mm0, 16+" #dst " \n\t"\
769  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
770  "movd %%mm2, 96+" #dst " \n\t"\
771  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
772  "movd %%mm4, 112+" #dst " \n\t"\
773  "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
774  "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
775  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
776  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
777  "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
778  "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
779  "psrad $" #shift ", %%mm2 \n\t"\
780  "psrad $" #shift ", %%mm5 \n\t"\
781  "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
782  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
783  "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
784  "psrad $" #shift ", %%mm6 \n\t"\
785  "psrad $" #shift ", %%mm1 \n\t"\
786  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
787  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
788  "movd %%mm2, 32+" #dst " \n\t"\
789  "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
790  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
791  "movd %%mm6, 48+" #dst " \n\t"\
792  "movd %%mm1, 64+" #dst " \n\t"\
793  "movd %%mm5, 80+" #dst " \n\t"
794 
795 
796 //IDCT( src0, src4, src1, src5, dst, shift)
797 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
798 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
799 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
800 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
801  "jmp 9f \n\t"
802 
803  "# .p2align 4 \n\t"\
804  "2: \n\t"
805 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
806 
807 #undef IDCT
808 #define IDCT(src0, src4, src1, src5, dst, shift) \
809  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
810  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
811  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
812  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
813  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
814  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
815  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
816  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
817  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
818  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
819  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
820  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
821  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
822  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
823  "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
824  "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
825  "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
826  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
827  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
828  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
829  "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
830  "psrad $" #shift ", %%mm7 \n\t"\
831  "psrad $" #shift ", %%mm4 \n\t"\
832  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
833  "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
834  "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
835  "psrad $" #shift ", %%mm0 \n\t"\
836  "psrad $" #shift ", %%mm2 \n\t"\
837  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
838  "movd %%mm7, " #dst " \n\t"\
839  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
840  "movd %%mm0, 16+" #dst " \n\t"\
841  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
842  "movd %%mm2, 96+" #dst " \n\t"\
843  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
844  "movd %%mm4, 112+" #dst " \n\t"\
845  "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
846  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
847  "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
848  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
849  "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
850  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
851  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
852  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
853  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
854  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
855  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
856  "psrad $" #shift ", %%mm2 \n\t"\
857  "psrad $" #shift ", %%mm5 \n\t"\
858  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
859  "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
860  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
861  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
862  "psrad $" #shift ", %%mm6 \n\t"\
863  "psrad $" #shift ", %%mm4 \n\t"\
864  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
865  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
866  "movd %%mm2, 32+" #dst " \n\t"\
867  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
868  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
869  "movd %%mm6, 48+" #dst " \n\t"\
870  "movd %%mm4, 64+" #dst " \n\t"\
871  "movd %%mm5, 80+" #dst " \n\t"
872 
873 //IDCT( src0, src4, src1, src5, dst, shift)
874 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
875 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
876 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
877 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
878  "jmp 9f \n\t"
879 
880  "# .p2align 4 \n\t"\
881  "3: \n\t"
882 #undef IDCT
883 #define IDCT(src0, src4, src1, src5, dst, shift) \
884  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
885  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
886  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
887  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
888  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
889  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
890  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
891  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
892  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
893  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
894  "movq 64(%2), %%mm3 \n\t"\
895  "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
896  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
897  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
898  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
899  "psrad $" #shift ", %%mm7 \n\t"\
900  "psrad $" #shift ", %%mm4 \n\t"\
901  "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
902  "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
903  "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
904  "psrad $" #shift ", %%mm0 \n\t"\
905  "psrad $" #shift ", %%mm1 \n\t"\
906  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
907  "movd %%mm7, " #dst " \n\t"\
908  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
909  "movd %%mm0, 16+" #dst " \n\t"\
910  "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
911  "movd %%mm1, 96+" #dst " \n\t"\
912  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
913  "movd %%mm4, 112+" #dst " \n\t"\
914  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
915  "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
916  "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
917  "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
918  "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
919  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
920  "psrad $" #shift ", %%mm1 \n\t"\
921  "psrad $" #shift ", %%mm5 \n\t"\
922  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
923  "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
924  "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
925  "psrad $" #shift ", %%mm6 \n\t"\
926  "psrad $" #shift ", %%mm4 \n\t"\
927  "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
928  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
929  "movd %%mm1, 32+" #dst " \n\t"\
930  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
931  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
932  "movd %%mm6, 48+" #dst " \n\t"\
933  "movd %%mm4, 64+" #dst " \n\t"\
934  "movd %%mm5, 80+" #dst " \n\t"
935 
936 
937 //IDCT( src0, src4, src1, src5, dst, shift)
938 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
939 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
940 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
941 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
942  "jmp 9f \n\t"
943 
944  "# .p2align 4 \n\t"\
945  "5: \n\t"
946 #undef IDCT
947 #define IDCT(src0, src4, src1, src5, dst, shift) \
948  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
949  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
950  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
951  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
952  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
953  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
954  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
955  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
956  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
957  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
958  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
959  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
960  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
961  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
962  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
963  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
964  "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
965  "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
966  "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
967  "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
968  "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
969  "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
970  "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
971  "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
972  "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
973  "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
974  "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
975  "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
976  "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
977  "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
978  "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
979  "psrad $" #shift ", %%mm4 \n\t"\
980  "psrad $" #shift ", %%mm7 \n\t"\
981  "psrad $" #shift ", %%mm3 \n\t"\
982  "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
983  "movq %%mm4, " #dst " \n\t"\
984  "psrad $" #shift ", %%mm0 \n\t"\
985  "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
986  "movq %%mm0, 16+" #dst " \n\t"\
987  "movq %%mm0, 96+" #dst " \n\t"\
988  "movq %%mm4, 112+" #dst " \n\t"\
989  "psrad $" #shift ", %%mm5 \n\t"\
990  "psrad $" #shift ", %%mm6 \n\t"\
991  "psrad $" #shift ", %%mm2 \n\t"\
992  "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
993  "movq %%mm5, 32+" #dst " \n\t"\
994  "psrad $" #shift ", %%mm1 \n\t"\
995  "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
996  "movq %%mm6, 48+" #dst " \n\t"\
997  "movq %%mm6, 64+" #dst " \n\t"\
998  "movq %%mm5, 80+" #dst " \n\t"
999 
1000 
1001 //IDCT( src0, src4, src1, src5, dst, shift)
1002 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1003 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1004 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1005 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1006  "jmp 9f \n\t"
1007 
1008 
1009  "# .p2align 4 \n\t"\
1010  "1: \n\t"
1011 #undef IDCT
1012 #define IDCT(src0, src4, src1, src5, dst, shift) \
1013  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1014  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1015  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1016  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1017  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1018  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1019  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1020  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1021  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1022  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1023  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1024  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1025  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1026  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1027  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1028  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1029  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1030  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1031  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1032  "movq 64(%2), %%mm1 \n\t"\
1033  "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1034  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1035  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1036  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1037  "psrad $" #shift ", %%mm7 \n\t"\
1038  "psrad $" #shift ", %%mm4 \n\t"\
1039  "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
1040  "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1041  "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1042  "psrad $" #shift ", %%mm0 \n\t"\
1043  "psrad $" #shift ", %%mm3 \n\t"\
1044  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1045  "movd %%mm7, " #dst " \n\t"\
1046  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1047  "movd %%mm0, 16+" #dst " \n\t"\
1048  "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1049  "movd %%mm3, 96+" #dst " \n\t"\
1050  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1051  "movd %%mm4, 112+" #dst " \n\t"\
1052  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1053  "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1054  "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1055  "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
1056  "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1057  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
1058  "psrad $" #shift ", %%mm3 \n\t"\
1059  "psrad $" #shift ", %%mm5 \n\t"\
1060  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1061  "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1062  "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
1063  "psrad $" #shift ", %%mm6 \n\t"\
1064  "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1065  "movd %%mm3, 32+" #dst " \n\t"\
1066  "psrad $" #shift ", %%mm4 \n\t"\
1067  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1068  "movd %%mm6, 48+" #dst " \n\t"\
1069  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1070  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1071  "movd %%mm4, 64+" #dst " \n\t"\
1072  "movd %%mm5, 80+" #dst " \n\t"
1073 
1074 
1075 //IDCT( src0, src4, src1, src5, dst, shift)
1076 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1077 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1078 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1079 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1080  "jmp 9f \n\t"
1081 
1082 
1083  "# .p2align 4 \n\t"
1084  "7: \n\t"
1085 #undef IDCT
1086 #define IDCT(src0, src4, src1, src5, dst, shift) \
1087  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1088  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1089  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1090  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1091  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1092  "psrad $" #shift ", %%mm4 \n\t"\
1093  "psrad $" #shift ", %%mm0 \n\t"\
1094  "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
1095  "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1096  "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1097  "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1098  "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1099  "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
1100  "psrad $" #shift ", %%mm1 \n\t"\
1101  "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
1102  "movq %%mm4, " #dst " \n\t"\
1103  "psrad $" #shift ", %%mm2 \n\t"\
1104  "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
1105  "movq %%mm0, 16+" #dst " \n\t"\
1106  "movq %%mm0, 96+" #dst " \n\t"\
1107  "movq %%mm4, 112+" #dst " \n\t"\
1108  "movq %%mm0, 32+" #dst " \n\t"\
1109  "movq %%mm4, 48+" #dst " \n\t"\
1110  "movq %%mm4, 64+" #dst " \n\t"\
1111  "movq %%mm0, 80+" #dst " \n\t"
1112 
1113 //IDCT( src0, src4, src1, src5, dst, shift)
1114 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1115 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1116 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1117 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1118 
1119 
1120 #endif
1121 
1122 /*
1123 Input
1124  00 40 04 44 20 60 24 64
1125  10 30 14 34 50 70 54 74
1126  01 41 03 43 21 61 23 63
1127  11 31 13 33 51 71 53 73
1128  02 42 06 46 22 62 26 66
1129  12 32 16 36 52 72 56 76
1130  05 45 07 47 25 65 27 67
1131  15 35 17 37 55 75 57 77
1132 
1133 Temp
1134  00 04 10 14 20 24 30 34
1135  40 44 50 54 60 64 70 74
1136  01 03 11 13 21 23 31 33
1137  41 43 51 53 61 63 71 73
1138  02 06 12 16 22 26 32 36
1139  42 46 52 56 62 66 72 76
1140  05 07 15 17 25 27 35 37
1141  45 47 55 57 65 67 75 77
1142 */
1143 
1144 "9: \n\t"
1145  :: "r" (block), "r" (temp), "r" (coeffs)
1146  NAMED_CONSTRAINTS_ADD(wm1010,d40000)
1147  : "%eax"
1148  );
1149 }
1150 
/**
 * Run this file's MMX inverse DCT on a coefficient block.
 *
 * Thin public entry point that forwards directly to idct(). The asm in
 * idct() stores its final results through the same pointer it is given,
 * so the transform output replaces the contents of block.
 *
 * @param block coefficients to transform; overwritten with the result.
 *              NOTE(review): presumably an 8x8 block of 64 int16_t values
 *              with the alignment the movq-based asm expects -- confirm
 *              against callers.
 */
1151 void ff_simple_idct_mmx(int16_t *block)
1152 {
1153  idct(block);
1154 }
1155 
1156 //FIXME merge add/put into the idct
1157 
/**
 * Inverse-transform a coefficient block, then hand it to
 * ff_put_pixels_clamped_mmx() together with the destination.
 *
 * @param dest      destination pixel buffer passed to
 *                  ff_put_pixels_clamped_mmx().
 * @param line_size value forwarded unchanged to ff_put_pixels_clamped_mmx();
 *                  NOTE(review): presumably the byte stride between
 *                  destination rows -- confirm against that helper.
 * @param block     coefficients to transform; idct() overwrites them with
 *                  the transform result before they are passed on.
 */
1158 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block)
1159 {
1160  idct(block); /* transform first; the helper below consumes the result */
1161  ff_put_pixels_clamped_mmx(block, dest, line_size); /* presumably clamps to 8 bit and stores -- verify */
1162 }
/**
 * Inverse-transform a coefficient block, then hand it to
 * ff_add_pixels_clamped_mmx() together with the destination.
 *
 * @param dest      destination pixel buffer passed to
 *                  ff_add_pixels_clamped_mmx().
 * @param line_size value forwarded unchanged to ff_add_pixels_clamped_mmx();
 *                  NOTE(review): presumably the byte stride between
 *                  destination rows -- confirm against that helper.
 * @param block     coefficients to transform; idct() overwrites them with
 *                  the transform result before they are passed on.
 */
1163 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block)
1164 {
1165  idct(block); /* transform first; the helper below consumes the result */
1166  ff_add_pixels_clamped_mmx(block, dest, line_size); /* presumably adds to dest with clamping -- verify */
1167 }
1168 
1169 #endif /* HAVE_INLINE_ASM */