FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
simple_idct.c
Go to the documentation of this file.
1 /*
2  * Simple IDCT MMX
3  *
4  * Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 #include "libavcodec/simple_idct.h"
23 #include "libavutil/mem.h"
24 #include "dsputil_x86.h"
25 
26 #if HAVE_INLINE_ASM
27 
28 /* Unrounded values of cos(i*M_PI/16)*sqrt(2)*(1<<14) for i = 0..7:
29 23170.475006
30 22725.260826
31 21406.727617
32 19265.545870
33 16384.000000
34 12872.826198
35 8866.956905
36 4520.335430
37 */
38 #define C0 23170 // i=0: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer
39 #define C1 22725 // i=1: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer
40 #define C2 21407 // i=2: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer
41 #define C3 19266 // i=3: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer
42 #define C4 16383 // i=4: cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5, truncated; the exact value is 16384, so this is deliberately one below it
43 #define C5 12873 // i=5: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer
44 #define C6 8867 // i=6: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer
45 #define C7 4520 // i=7: cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5, truncated to an integer
46 
47 #define ROW_SHIFT 11 // sizes the rounders at the top of coeffs[]; NOTE: the row-pass asm invocations in idct() pass a literal 11, not this macro
48 #define COL_SHIFT 20 // 6 (original note, meaning not evident here); NOTE: the column-pass asm invocations in idct() pass a literal 20, not this macro
49 
50 DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL; // mask keeping 16-bit words 1 and 3 and clearing words 0 and 2; DC_COND_IDCT pands it with the first source quadword before its "everything else is zero" test
51 DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL; // 1<<18 in the low dword only; added by the DC-only shortcut of DC_COND_IDCT between "pslld $16" and "psrad $13"
52 
53 DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= { // rows of four int16_t (8 bytes each); the asm in idct() reads them by fixed byte offset, so row order must not change (assumes asm operand %2 is this table -- operand list not visible here, confirm)
54  1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, // offset 0: row-pass rounder, 1<<(ROW_SHIFT-1) in each dword ("paddd (%2)")
55 // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
56 // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
57  1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, // offset 8: rounder for the first row group ("paddd 8(%2)"); the low dword carries an extra 1<<16
58  // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)  i.e. (524288/16383)<<11 = 32<<11 = 1<<16, stored as a 1 in the high word of the low dword
59 // 0, 0, 0, 0,
60 // 0, 0, 0, 0,
61 
62  C4, C4, C4, C4, // offset 16
63  C4, -C4, C4, -C4, // offset 24
64 
65  C2, C6, C2, C6, // offset 32
66  C6, -C2, C6, -C2, // offset 40
67 
68  C1, C3, C1, C3, // offset 48
69  C5, C7, C5, C7, // offset 56
70 
71  C3, -C7, C3, -C7, // offset 64
72 -C1, -C5, -C1, -C5, // offset 72
73 
74  C5, -C1, C5, -C1, // offset 80
75  C7, C3, C7, C3, // offset 88
76 
77  C7, -C5, C7, -C5, // offset 96
78  C3, -C1, C3, -C1 // offset 104
79 };
80 
81 static inline void idct(int16_t *block)
82 {
83  LOCAL_ALIGNED_8(int64_t, align_tmp, [16]);
84  int16_t * const temp= (int16_t*)align_tmp;
85 
86  __asm__ volatile(
87 #if 0 //Alternative, simpler variant
88 
89 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
90  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
91  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
92  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
93  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
94  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
95  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
96  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
97  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
98  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
99  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
100  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
101  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
102  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
103  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
104  #rounder ", %%mm4 \n\t"\
105  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
106  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
107  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
108  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
109  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
110  #rounder ", %%mm0 \n\t"\
111  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
112  "paddd %%mm0, %%mm0 \n\t" \
113  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
114  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
115  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
116  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
117  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
118  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
119  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
120  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
121  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
122  "psrad $" #shift ", %%mm7 \n\t"\
123  "psrad $" #shift ", %%mm4 \n\t"\
124  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
125  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
126  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
127  "psrad $" #shift ", %%mm1 \n\t"\
128  "psrad $" #shift ", %%mm2 \n\t"\
129  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
130  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
131  "movq %%mm7, " #dst " \n\t"\
132  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
133  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
134  "movq %%mm2, 24+" #dst " \n\t"\
135  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
136  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
137  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
138  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
139  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
140  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
141  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
142  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
143  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
144  "psrad $" #shift ", %%mm2 \n\t"\
145  "psrad $" #shift ", %%mm0 \n\t"\
146  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
147  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
148  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
149  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
150  "psrad $" #shift ", %%mm6 \n\t"\
151  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
152  "movq %%mm2, 8+" #dst " \n\t"\
153  "psrad $" #shift ", %%mm4 \n\t"\
154  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
155  "movq %%mm4, 16+" #dst " \n\t"\
156 
157 #define COL_IDCT(src0, src4, src1, src5, dst, shift) \
158  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
159  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
160  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
161  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
162  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
163  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
164  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
165  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
166  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
167  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
168  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
169  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
170  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
171  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
172  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
173  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
174  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
175  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
176  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
177  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
178  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
179  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
180  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
181  "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
182  "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
183  "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
184  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
185  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
186  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
187  "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
188  "psrad $" #shift ", %%mm7 \n\t"\
189  "psrad $" #shift ", %%mm4 \n\t"\
190  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
191  "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
192  "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
193  "psrad $" #shift ", %%mm0 \n\t"\
194  "psrad $" #shift ", %%mm2 \n\t"\
195  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
196  "movd %%mm7, " #dst " \n\t"\
197  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
198  "movd %%mm0, 16+" #dst " \n\t"\
199  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
200  "movd %%mm2, 96+" #dst " \n\t"\
201  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
202  "movd %%mm4, 112+" #dst " \n\t"\
203  "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
204  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
205  "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
206  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
207  "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
208  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
209  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
210  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
211  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
212  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
213  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
214  "psrad $" #shift ", %%mm2 \n\t"\
215  "psrad $" #shift ", %%mm5 \n\t"\
216  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
217  "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
218  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
219  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
220  "psrad $" #shift ", %%mm6 \n\t"\
221  "psrad $" #shift ", %%mm4 \n\t"\
222  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
223  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
224  "movd %%mm2, 32+" #dst " \n\t"\
225  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
226  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
227  "movd %%mm6, 48+" #dst " \n\t"\
228  "movd %%mm4, 64+" #dst " \n\t"\
229  "movd %%mm5, 80+" #dst " \n\t"\
230 
231 
232 #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
233  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
234  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
235  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
236  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
237  "movq "MANGLE(wm1010)", %%mm4 \n\t"\
238  "pand %%mm0, %%mm4 \n\t"\
239  "por %%mm1, %%mm4 \n\t"\
240  "por %%mm2, %%mm4 \n\t"\
241  "por %%mm3, %%mm4 \n\t"\
242  "packssdw %%mm4,%%mm4 \n\t"\
243  "movd %%mm4, %%eax \n\t"\
244  "orl %%eax, %%eax \n\t"\
245  "jz 1f \n\t"\
246  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
247  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
248  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
249  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
250  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
251  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
252  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
253  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
254  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
255  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
256  #rounder ", %%mm4 \n\t"\
257  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
258  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
259  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
260  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
261  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
262  #rounder ", %%mm0 \n\t"\
263  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
264  "paddd %%mm0, %%mm0 \n\t" \
265  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
266  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
267  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
268  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
269  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
270  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
271  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
272  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
273  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
274  "psrad $" #shift ", %%mm7 \n\t"\
275  "psrad $" #shift ", %%mm4 \n\t"\
276  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
277  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
278  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
279  "psrad $" #shift ", %%mm1 \n\t"\
280  "psrad $" #shift ", %%mm2 \n\t"\
281  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
282  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
283  "movq %%mm7, " #dst " \n\t"\
284  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
285  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
286  "movq %%mm2, 24+" #dst " \n\t"\
287  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
288  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
289  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
290  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
291  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
292  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
293  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
294  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
295  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
296  "psrad $" #shift ", %%mm2 \n\t"\
297  "psrad $" #shift ", %%mm0 \n\t"\
298  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
299  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
300  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
301  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
302  "psrad $" #shift ", %%mm6 \n\t"\
303  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
304  "movq %%mm2, 8+" #dst " \n\t"\
305  "psrad $" #shift ", %%mm4 \n\t"\
306  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
307  "movq %%mm4, 16+" #dst " \n\t"\
308  "jmp 2f \n\t"\
309  "1: \n\t"\
310  "pslld $16, %%mm0 \n\t"\
311  "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
312  "psrad $13, %%mm0 \n\t"\
313  "packssdw %%mm0, %%mm0 \n\t"\
314  "movq %%mm0, " #dst " \n\t"\
315  "movq %%mm0, 8+" #dst " \n\t"\
316  "movq %%mm0, 16+" #dst " \n\t"\
317  "movq %%mm0, 24+" #dst " \n\t"\
318  "2: \n\t"
319 
320 
321 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
322 ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
323 /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
324 ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
325 ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
326 
327 DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
328 DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
329 DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
330 
331 
332 //IDCT( src0, src4, src1, src5, dst, shift)
333 COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
334 COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
335 COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
336 COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
337 
338 #else
339 
340 #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
341  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
342  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
343  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
344  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
345  "movq "MANGLE(wm1010)", %%mm4 \n\t"\
346  "pand %%mm0, %%mm4 \n\t"\
347  "por %%mm1, %%mm4 \n\t"\
348  "por %%mm2, %%mm4 \n\t"\
349  "por %%mm3, %%mm4 \n\t"\
350  "packssdw %%mm4,%%mm4 \n\t"\
351  "movd %%mm4, %%eax \n\t"\
352  "orl %%eax, %%eax \n\t"\
353  "jz 1f \n\t"\
354  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
355  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
356  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
357  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
358  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
359  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
360  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
361  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
362  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
363  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
364  #rounder ", %%mm4 \n\t"\
365  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
366  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
367  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
368  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
369  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
370  #rounder ", %%mm0 \n\t"\
371  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
372  "paddd %%mm0, %%mm0 \n\t" \
373  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
374  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
375  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
376  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
377  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
378  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
379  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
380  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
381  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
382  "psrad $" #shift ", %%mm7 \n\t"\
383  "psrad $" #shift ", %%mm4 \n\t"\
384  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
385  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
386  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
387  "psrad $" #shift ", %%mm1 \n\t"\
388  "psrad $" #shift ", %%mm2 \n\t"\
389  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
390  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
391  "movq %%mm7, " #dst " \n\t"\
392  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
393  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
394  "movq %%mm2, 24+" #dst " \n\t"\
395  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
396  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
397  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
398  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
399  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
400  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
401  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
402  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
403  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
404  "psrad $" #shift ", %%mm2 \n\t"\
405  "psrad $" #shift ", %%mm0 \n\t"\
406  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
407  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
408  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
409  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
410  "psrad $" #shift ", %%mm6 \n\t"\
411  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
412  "movq %%mm2, 8+" #dst " \n\t"\
413  "psrad $" #shift ", %%mm4 \n\t"\
414  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
415  "movq %%mm4, 16+" #dst " \n\t"\
416  "jmp 2f \n\t"\
417  "1: \n\t"\
418  "pslld $16, %%mm0 \n\t"\
419  "paddd "MANGLE(d40000)", %%mm0 \n\t"\
420  "psrad $13, %%mm0 \n\t"\
421  "packssdw %%mm0, %%mm0 \n\t"\
422  "movq %%mm0, " #dst " \n\t"\
423  "movq %%mm0, 8+" #dst " \n\t"\
424  "movq %%mm0, 16+" #dst " \n\t"\
425  "movq %%mm0, 24+" #dst " \n\t"\
426  "2: \n\t"
427 
428 #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
429  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
430  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
431  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
432  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
433  "movq %%mm0, %%mm4 \n\t"\
434  "por %%mm1, %%mm4 \n\t"\
435  "por %%mm2, %%mm4 \n\t"\
436  "por %%mm3, %%mm4 \n\t"\
437  "packssdw %%mm4,%%mm4 \n\t"\
438  "movd %%mm4, %%eax \n\t"\
439  "orl %%eax, %%eax \n\t"\
440  "jz " #bt " \n\t"\
441  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
442  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
443  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
444  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
445  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
446  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
447  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
448  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
449  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
450  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
451  #rounder ", %%mm4 \n\t"\
452  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
453  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
454  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
455  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
456  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
457  #rounder ", %%mm0 \n\t"\
458  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
459  "paddd %%mm0, %%mm0 \n\t" \
460  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
461  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
462  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
463  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
464  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
465  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
466  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
467  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
468  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
469  "psrad $" #shift ", %%mm7 \n\t"\
470  "psrad $" #shift ", %%mm4 \n\t"\
471  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
472  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
473  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
474  "psrad $" #shift ", %%mm1 \n\t"\
475  "psrad $" #shift ", %%mm2 \n\t"\
476  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
477  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
478  "movq %%mm7, " #dst " \n\t"\
479  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
480  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
481  "movq %%mm2, 24+" #dst " \n\t"\
482  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
483  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
484  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
485  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
486  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
487  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
488  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
489  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
490  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
491  "psrad $" #shift ", %%mm2 \n\t"\
492  "psrad $" #shift ", %%mm0 \n\t"\
493  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
494  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
495  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
496  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
497  "psrad $" #shift ", %%mm6 \n\t"\
498  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
499  "movq %%mm2, 8+" #dst " \n\t"\
500  "psrad $" #shift ", %%mm4 \n\t"\
501  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
502  "movq %%mm4, 16+" #dst " \n\t"\
503 
504 #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
505  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
506  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
507  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
508  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
509  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
510  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
511  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
512  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
513  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
514  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
515  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
516  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
517  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
518  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
519  #rounder ", %%mm4 \n\t"\
520  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
521  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
522  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
523  "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
524  "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
525  #rounder ", %%mm0 \n\t"\
526  "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
527  "paddd %%mm0, %%mm0 \n\t" \
528  "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
529  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
530  "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
531  "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
532  "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
533  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
534  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
535  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
536  "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
537  "psrad $" #shift ", %%mm7 \n\t"\
538  "psrad $" #shift ", %%mm4 \n\t"\
539  "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
540  "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
541  "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
542  "psrad $" #shift ", %%mm1 \n\t"\
543  "psrad $" #shift ", %%mm2 \n\t"\
544  "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
545  "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
546  "movq %%mm7, " #dst " \n\t"\
547  "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
548  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
549  "movq %%mm2, 24+" #dst " \n\t"\
550  "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
551  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
552  "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
553  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
554  "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
555  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
556  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
557  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
558  "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
559  "psrad $" #shift ", %%mm2 \n\t"\
560  "psrad $" #shift ", %%mm0 \n\t"\
561  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
562  "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
563  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
564  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
565  "psrad $" #shift ", %%mm6 \n\t"\
566  "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
567  "movq %%mm2, 8+" #dst " \n\t"\
568  "psrad $" #shift ", %%mm4 \n\t"\
569  "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
570  "movq %%mm4, 16+" #dst " \n\t"\
571 
572 //IDCT( src0, src4, src1, src5, dst, rounder, shift)
573 DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
574 Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
575 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
576 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
577 
578 #undef IDCT
579 #define IDCT(src0, src4, src1, src5, dst, shift) \
580  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
581  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
582  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
583  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
584  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
585  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
586  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
587  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
588  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
589  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
590  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
591  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
592  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
593  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
594  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
595  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
596  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
597  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
598  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
599  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
600  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
601  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
602  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
603  "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
604  "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
605  "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
606  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
607  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
608  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
609  "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
610  "psrad $" #shift ", %%mm7 \n\t"\
611  "psrad $" #shift ", %%mm4 \n\t"\
612  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
613  "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
614  "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
615  "psrad $" #shift ", %%mm0 \n\t"\
616  "psrad $" #shift ", %%mm2 \n\t"\
617  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
618  "movd %%mm7, " #dst " \n\t"\
619  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
620  "movd %%mm0, 16+" #dst " \n\t"\
621  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
622  "movd %%mm2, 96+" #dst " \n\t"\
623  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
624  "movd %%mm4, 112+" #dst " \n\t"\
625  "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
626  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
627  "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
628  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
629  "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
630  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
631  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
632  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
633  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
634  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
635  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
636  "psrad $" #shift ", %%mm2 \n\t"\
637  "psrad $" #shift ", %%mm5 \n\t"\
638  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
639  "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
640  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
641  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
642  "psrad $" #shift ", %%mm6 \n\t"\
643  "psrad $" #shift ", %%mm4 \n\t"\
644  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
645  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
646  "movd %%mm2, 32+" #dst " \n\t"\
647  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
648  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
649  "movd %%mm6, 48+" #dst " \n\t"\
650  "movd %%mm4, 64+" #dst " \n\t"\
651  "movd %%mm5, 80+" #dst " \n\t"
652 
653 
654 //IDCT( src0, src4, src1, src5, dst, shift)
655 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
656 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
657 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
658 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
659  "jmp 9f \n\t"
660 
661  "# .p2align 4 \n\t"\
662  "4: \n\t"
663 Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
664 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
665 
666 #undef IDCT
667 #define IDCT(src0, src4, src1, src5, dst, shift) \
668  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
669  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
670  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
671  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
672  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
673  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
674  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
675  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
676  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
677  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
678  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
679  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
680  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
681  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
682  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
683  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
684  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
685  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
686  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
687  "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
688  "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
689  "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
690  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
691  "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
692  "psrad $" #shift ", %%mm1 \n\t"\
693  "psrad $" #shift ", %%mm4 \n\t"\
694  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
695  "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
696  "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
697  "psrad $" #shift ", %%mm0 \n\t"\
698  "psrad $" #shift ", %%mm2 \n\t"\
699  "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
700  "movd %%mm1, " #dst " \n\t"\
701  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
702  "movd %%mm0, 16+" #dst " \n\t"\
703  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
704  "movd %%mm2, 96+" #dst " \n\t"\
705  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
706  "movd %%mm4, 112+" #dst " \n\t"\
707  "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
708  "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
709  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
710  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
711  "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
712  "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
713  "psrad $" #shift ", %%mm2 \n\t"\
714  "psrad $" #shift ", %%mm5 \n\t"\
715  "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
716  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
717  "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
718  "psrad $" #shift ", %%mm6 \n\t"\
719  "psrad $" #shift ", %%mm1 \n\t"\
720  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
721  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
722  "movd %%mm2, 32+" #dst " \n\t"\
723  "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
724  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
725  "movd %%mm6, 48+" #dst " \n\t"\
726  "movd %%mm1, 64+" #dst " \n\t"\
727  "movd %%mm5, 80+" #dst " \n\t"
728 
729 //IDCT( src0, src4, src1, src5, dst, shift)
730 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
731 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
732 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
733 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
734  "jmp 9f \n\t"
735 
736  "# .p2align 4 \n\t"\
737  "6: \n\t"
738 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
739 
740 #undef IDCT
741 #define IDCT(src0, src4, src1, src5, dst, shift) \
742  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
743  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
744  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
745  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
746  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
747  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
748  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
749  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
750  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
751  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
752  "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
753  "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
754  "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
755  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
756  "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
757  "psrad $" #shift ", %%mm1 \n\t"\
758  "psrad $" #shift ", %%mm4 \n\t"\
759  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
760  "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
761  "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
762  "psrad $" #shift ", %%mm0 \n\t"\
763  "psrad $" #shift ", %%mm2 \n\t"\
764  "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
765  "movd %%mm1, " #dst " \n\t"\
766  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
767  "movd %%mm0, 16+" #dst " \n\t"\
768  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
769  "movd %%mm2, 96+" #dst " \n\t"\
770  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
771  "movd %%mm4, 112+" #dst " \n\t"\
772  "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
773  "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
774  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
775  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
776  "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
777  "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
778  "psrad $" #shift ", %%mm2 \n\t"\
779  "psrad $" #shift ", %%mm5 \n\t"\
780  "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
781  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
782  "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
783  "psrad $" #shift ", %%mm6 \n\t"\
784  "psrad $" #shift ", %%mm1 \n\t"\
785  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
786  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
787  "movd %%mm2, 32+" #dst " \n\t"\
788  "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
789  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
790  "movd %%mm6, 48+" #dst " \n\t"\
791  "movd %%mm1, 64+" #dst " \n\t"\
792  "movd %%mm5, 80+" #dst " \n\t"
793 
794 
795 //IDCT( src0, src4, src1, src5, dst, shift)
796 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
797 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
798 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
799 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
800  "jmp 9f \n\t"
801 
802  "# .p2align 4 \n\t"\
803  "2: \n\t"
804 Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
805 
806 #undef IDCT
807 #define IDCT(src0, src4, src1, src5, dst, shift) \
808  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
809  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
810  "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
811  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
812  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
813  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
814  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
815  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
816  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
817  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
818  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
819  "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
820  "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
821  "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
822  "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
823  "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
824  "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
825  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
826  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
827  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
828  "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
829  "psrad $" #shift ", %%mm7 \n\t"\
830  "psrad $" #shift ", %%mm4 \n\t"\
831  "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
832  "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
833  "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
834  "psrad $" #shift ", %%mm0 \n\t"\
835  "psrad $" #shift ", %%mm2 \n\t"\
836  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
837  "movd %%mm7, " #dst " \n\t"\
838  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
839  "movd %%mm0, 16+" #dst " \n\t"\
840  "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
841  "movd %%mm2, 96+" #dst " \n\t"\
842  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
843  "movd %%mm4, 112+" #dst " \n\t"\
844  "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
845  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
846  "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
847  "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
848  "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
849  "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
850  "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
851  "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
852  "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
853  "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
854  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
855  "psrad $" #shift ", %%mm2 \n\t"\
856  "psrad $" #shift ", %%mm5 \n\t"\
857  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
858  "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
859  "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
860  "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
861  "psrad $" #shift ", %%mm6 \n\t"\
862  "psrad $" #shift ", %%mm4 \n\t"\
863  "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
864  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
865  "movd %%mm2, 32+" #dst " \n\t"\
866  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
867  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
868  "movd %%mm6, 48+" #dst " \n\t"\
869  "movd %%mm4, 64+" #dst " \n\t"\
870  "movd %%mm5, 80+" #dst " \n\t"
871 
872 //IDCT( src0, src4, src1, src5, dst, shift)
873 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
874 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
875 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
876 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
877  "jmp 9f \n\t"
878 
879  "# .p2align 4 \n\t"\
880  "3: \n\t"
881 #undef IDCT
882 #define IDCT(src0, src4, src1, src5, dst, shift) \
883  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
884  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
885  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
886  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
887  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
888  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
889  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
890  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
891  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
892  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
893  "movq 64(%2), %%mm3 \n\t"\
894  "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
895  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
896  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
897  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
898  "psrad $" #shift ", %%mm7 \n\t"\
899  "psrad $" #shift ", %%mm4 \n\t"\
900  "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
901  "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
902  "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
903  "psrad $" #shift ", %%mm0 \n\t"\
904  "psrad $" #shift ", %%mm1 \n\t"\
905  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
906  "movd %%mm7, " #dst " \n\t"\
907  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
908  "movd %%mm0, 16+" #dst " \n\t"\
909  "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
910  "movd %%mm1, 96+" #dst " \n\t"\
911  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
912  "movd %%mm4, 112+" #dst " \n\t"\
913  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
914  "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
915  "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
916  "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
917  "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
918  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
919  "psrad $" #shift ", %%mm1 \n\t"\
920  "psrad $" #shift ", %%mm5 \n\t"\
921  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
922  "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
923  "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
924  "psrad $" #shift ", %%mm6 \n\t"\
925  "psrad $" #shift ", %%mm4 \n\t"\
926  "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
927  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
928  "movd %%mm1, 32+" #dst " \n\t"\
929  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
930  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
931  "movd %%mm6, 48+" #dst " \n\t"\
932  "movd %%mm4, 64+" #dst " \n\t"\
933  "movd %%mm5, 80+" #dst " \n\t"
934 
935 
936 //IDCT( src0, src4, src1, src5, dst, shift)
937 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
938 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
939 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
940 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
941  "jmp 9f \n\t"
942 
943  "# .p2align 4 \n\t"\
944  "5: \n\t"
945 #undef IDCT
946 #define IDCT(src0, src4, src1, src5, dst, shift) \
947  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
948  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
949  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
950  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
951  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
952  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
953  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
954  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
955  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
956  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
957  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
958  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
959  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
960  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
961  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
962  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
963  "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
964  "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
965  "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
966  "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
967  "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
968  "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
969  "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
970  "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
971  "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
972  "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
973  "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
974  "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
975  "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
976  "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
977  "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
978  "psrad $" #shift ", %%mm4 \n\t"\
979  "psrad $" #shift ", %%mm7 \n\t"\
980  "psrad $" #shift ", %%mm3 \n\t"\
981  "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
982  "movq %%mm4, " #dst " \n\t"\
983  "psrad $" #shift ", %%mm0 \n\t"\
984  "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
985  "movq %%mm0, 16+" #dst " \n\t"\
986  "movq %%mm0, 96+" #dst " \n\t"\
987  "movq %%mm4, 112+" #dst " \n\t"\
988  "psrad $" #shift ", %%mm5 \n\t"\
989  "psrad $" #shift ", %%mm6 \n\t"\
990  "psrad $" #shift ", %%mm2 \n\t"\
991  "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
992  "movq %%mm5, 32+" #dst " \n\t"\
993  "psrad $" #shift ", %%mm1 \n\t"\
994  "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
995  "movq %%mm6, 48+" #dst " \n\t"\
996  "movq %%mm6, 64+" #dst " \n\t"\
997  "movq %%mm5, 80+" #dst " \n\t"
998 
999 
1000 //IDCT( src0, src4, src1, src5, dst, shift)
1001 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1002 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1003 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1004 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1005  "jmp 9f \n\t"
1006 
1007 
1008  "# .p2align 4 \n\t"\
1009  "1: \n\t"
1010 #undef IDCT
1011 #define IDCT(src0, src4, src1, src5, dst, shift) \
1012  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1013  "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
1014  "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
1015  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1016  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1017  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1018  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1019  "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
1020  "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
1021  "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
1022  "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
1023  "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1024  "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
1025  "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
1026  "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
1027  "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
1028  "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1029  "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
1030  "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
1031  "movq 64(%2), %%mm1 \n\t"\
1032  "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
1033  "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1034  "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
1035  "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1036  "psrad $" #shift ", %%mm7 \n\t"\
1037  "psrad $" #shift ", %%mm4 \n\t"\
1038  "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
1039  "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1040  "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1041  "psrad $" #shift ", %%mm0 \n\t"\
1042  "psrad $" #shift ", %%mm3 \n\t"\
1043  "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
1044  "movd %%mm7, " #dst " \n\t"\
1045  "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
1046  "movd %%mm0, 16+" #dst " \n\t"\
1047  "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
1048  "movd %%mm3, 96+" #dst " \n\t"\
1049  "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
1050  "movd %%mm4, 112+" #dst " \n\t"\
1051  "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
1052  "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
1053  "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
1054  "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
1055  "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1056  "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
1057  "psrad $" #shift ", %%mm3 \n\t"\
1058  "psrad $" #shift ", %%mm5 \n\t"\
1059  "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
1060  "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1061  "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
1062  "psrad $" #shift ", %%mm6 \n\t"\
1063  "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
1064  "movd %%mm3, 32+" #dst " \n\t"\
1065  "psrad $" #shift ", %%mm4 \n\t"\
1066  "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
1067  "movd %%mm6, 48+" #dst " \n\t"\
1068  "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
1069  "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
1070  "movd %%mm4, 64+" #dst " \n\t"\
1071  "movd %%mm5, 80+" #dst " \n\t"
1072 
1073 
1074 //IDCT( src0, src4, src1, src5, dst, shift)
1075 IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1076 IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1077 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1078 IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1079  "jmp 9f \n\t"
1080 
1081 
1082  "# .p2align 4 \n\t"
1083  "7: \n\t"
1084 #undef IDCT
1085 #define IDCT(src0, src4, src1, src5, dst, shift) \
1086  "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
1087  "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
1088  "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1089  "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
1090  "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1091  "psrad $" #shift ", %%mm4 \n\t"\
1092  "psrad $" #shift ", %%mm0 \n\t"\
1093  "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
1094  "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
1095  "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
1096  "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
1097  "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
1098  "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
1099  "psrad $" #shift ", %%mm1 \n\t"\
1100  "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
1101  "movq %%mm4, " #dst " \n\t"\
1102  "psrad $" #shift ", %%mm2 \n\t"\
1103  "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
1104  "movq %%mm0, 16+" #dst " \n\t"\
1105  "movq %%mm0, 96+" #dst " \n\t"\
1106  "movq %%mm4, 112+" #dst " \n\t"\
1107  "movq %%mm0, 32+" #dst " \n\t"\
1108  "movq %%mm4, 48+" #dst " \n\t"\
1109  "movq %%mm4, 64+" #dst " \n\t"\
1110  "movq %%mm0, 80+" #dst " \n\t"
1111 
1112 //IDCT( src0, src4, src1, src5, dst, shift)
1113 IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
1114 //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
1115 IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
1116 //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1117 
1118 
1119 #endif
1120 
1121 /*
1122 Input
1123  00 40 04 44 20 60 24 64
1124  10 30 14 34 50 70 54 74
1125  01 41 03 43 21 61 23 63
1126  11 31 13 33 51 71 53 73
1127  02 42 06 46 22 62 26 66
1128  12 32 16 36 52 72 56 76
1129  05 45 07 47 25 65 27 67
1130  15 35 17 37 55 75 57 77
1131 
1132 Temp
1133  00 04 10 14 20 24 30 34
1134  40 44 50 54 60 64 70 74
1135  01 03 11 13 21 23 31 33
1136  41 43 51 53 61 63 71 73
1137  02 06 12 16 22 26 32 36
1138  42 46 52 56 62 66 72 76
1139  05 07 15 17 25 27 35 37
1140  45 47 55 57 65 67 75 77
1141 */
1142 
1143 "9: \n\t"
1144  :: "r" (block), "r" (temp), "r" (coeffs)
1145  : "%eax"
1146  );
1147 }
1148 
1149 void ff_simple_idct_mmx(int16_t *block) /* Exported wrapper: run the file-local MMX idct() on block. */
1150 { /* block needs room for 64 int16_t: the last pass of idct() stores its output at byte offsets 0..127 of block. */
1151  idct(block);
1152 }
1153 
1154 //FIXME merge add/put into the idct
1155 
1156 void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block) /* Run idct() on block, then pass the result to the "put" pixel helper. */
1157 {
1158  idct(block); /* transform first; the last pass of idct() stores its output through block */
1159  ff_put_pixels_clamped_mmx(block, dest, line_size); /* NOTE(review): presumably stores block, clamped to 8 bits, into dest with row stride line_size - confirm against dsputil_x86.h */
1160 }
1161 void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block) /* Run idct() on block, then pass the result to the "add" pixel helper. */
1162 {
1163  idct(block); /* transform first; the last pass of idct() stores its output through block */
1164  ff_add_pixels_clamped_mmx(block, dest, line_size); /* NOTE(review): presumably adds block to the pixels already in dest, clamped to 8 bits, with row stride line_size - confirm against dsputil_x86.h */
1165 }
1166 
1167 #endif /* HAVE_INLINE_ASM */