FFmpeg
fdct.c
Go to the documentation of this file.
1 /*
2  * SIMD-optimized forward DCT
3  * The gcc porting is Copyright (c) 2001 Fabrice Bellard.
4  * cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
5  * SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
6  *
7  * from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
8  *
9  * Intel Application Note AP-922 - fast, precise implementation of DCT
10  * http://developer.intel.com/vtune/cbts/appnotes.htm
11  *
12  * Also of inspiration:
13  * a page about fdct at http://www.geocities.com/ssavekar/dct.htm
14  * Skal's fdct at http://skal.planet-d.net/coding/dct.html
15  *
16  * This file is part of FFmpeg.
17  *
18  * FFmpeg is free software; you can redistribute it and/or
19  * modify it under the terms of the GNU Lesser General Public
20  * License as published by the Free Software Foundation; either
21  * version 2.1 of the License, or (at your option) any later version.
22  *
23  * FFmpeg is distributed in the hope that it will be useful,
24  * but WITHOUT ANY WARRANTY; without even the implied warranty of
25  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
26  * Lesser General Public License for more details.
27  *
28  * You should have received a copy of the GNU Lesser General Public
29  * License along with FFmpeg; if not, write to the Free Software
30  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
31  */
32 
33 #include "libavutil/common.h"
34 #include "libavutil/mem_internal.h"
35 #include "libavutil/x86/asm.h"
36 #include "fdct.h"
37 
38 #if HAVE_MMX_INLINE
39 
40 //////////////////////////////////////////////////////////////////////
41 //
42 // constants for the forward DCT
43 // -----------------------------
44 //
45 // Be sure to check that your compiler is aligning all constants to QWORD
46 // (8-byte) memory boundaries! Otherwise the unaligned memory access will
47 // severely stall MMX execution.
48 //
49 //////////////////////////////////////////////////////////////////////
50 
/* Extra accuracy bits used by the transform; every shift below derives from it. */
#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
/* Left-shift count (psllw) applied to the 16-bit inputs in the column pass. */
#define SHIFT_FRW_COL BITS_FRW_ACC
/* Right-shift count (psrad) applied to the 32-bit sums in the row pass. */
#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)
/* Rounding term (half of 1 << SHIFT_FRW_ROW) added to the row sums before that shift. */
#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))

/* Expands to eight comma-separated copies of x (one full vector of 16-bit words). */
#define X8(x) x,x,x,x,x,x,x,x
58 
// Concatenated tangent table for the forward DCT column pass: three vectors of
// eight identical words, read by FDCT_COL at byte offsets 0, 16 and 32.
// The values match the expressions below rounded to the nearest integer.
DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = {
 X8(13036), // tan(1*pi/16) * (1<<16)
 X8(27146), // tan(2*pi/16) * (1<<16)
 X8(-21746) // (tan(3*pi/16) - 1) * (1<<16)
};

// Eight copies of one multiplier, used with pmulhw in the column pass.
DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = {
 X8(23170) // cos(pi/4) * (1<<15), rounded to nearest
};

// Eight words of value 1; the column pass ORs (por) this vector into several
// intermediate/output vectors, i.e. it forces their least significant bit.
DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };

// Row-pass rounding term for the MMX/MMXEXT code: two 32-bit lanes of RND_FRW_ROW.
DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };
73 
74 static const struct
75 {
76  DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
77 } fdct_r_row_sse2 =
78 {{
80 }};
81 //DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
82 
83 DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table
84  16384, 16384, 22725, 19266,
85  16384, 16384, 12873, 4520,
86  21407, 8867, 19266, -4520,
87  -8867, -21407, -22725, -12873,
88  16384, -16384, 12873, -22725,
89  -16384, 16384, 4520, 19266,
90  8867, -21407, 4520, -12873,
91  21407, -8867, 19266, -22725,
92 
93  22725, 22725, 31521, 26722,
94  22725, 22725, 17855, 6270,
95  29692, 12299, 26722, -6270,
96  -12299, -29692, -31521, -17855,
97  22725, -22725, 17855, -31521,
98  -22725, 22725, 6270, 26722,
99  12299, -29692, 6270, -17855,
100  29692, -12299, 26722, -31521,
101 
102  21407, 21407, 29692, 25172,
103  21407, 21407, 16819, 5906,
104  27969, 11585, 25172, -5906,
105  -11585, -27969, -29692, -16819,
106  21407, -21407, 16819, -29692,
107  -21407, 21407, 5906, 25172,
108  11585, -27969, 5906, -16819,
109  27969, -11585, 25172, -29692,
110 
111  19266, 19266, 26722, 22654,
112  19266, 19266, 15137, 5315,
113  25172, 10426, 22654, -5315,
114  -10426, -25172, -26722, -15137,
115  19266, -19266, 15137, -26722,
116  -19266, 19266, 5315, 22654,
117  10426, -25172, 5315, -15137,
118  25172, -10426, 22654, -26722,
119 
120  16384, 16384, 22725, 19266,
121  16384, 16384, 12873, 4520,
122  21407, 8867, 19266, -4520,
123  -8867, -21407, -22725, -12873,
124  16384, -16384, 12873, -22725,
125  -16384, 16384, 4520, 19266,
126  8867, -21407, 4520, -12873,
127  21407, -8867, 19266, -22725,
128 
129  19266, 19266, 26722, 22654,
130  19266, 19266, 15137, 5315,
131  25172, 10426, 22654, -5315,
132  -10426, -25172, -26722, -15137,
133  19266, -19266, 15137, -26722,
134  -19266, 19266, 5315, 22654,
135  10426, -25172, 5315, -15137,
136  25172, -10426, 22654, -26722,
137 
138  21407, 21407, 29692, 25172,
139  21407, 21407, 16819, 5906,
140  27969, 11585, 25172, -5906,
141  -11585, -27969, -29692, -16819,
142  21407, -21407, 16819, -29692,
143  -21407, 21407, 5906, 25172,
144  11585, -27969, 5906, -16819,
145  27969, -11585, 25172, -29692,
146 
147  22725, 22725, 31521, 26722,
148  22725, 22725, 17855, 6270,
149  29692, 12299, 26722, -6270,
150  -12299, -29692, -31521, -17855,
151  22725, -22725, 17855, -31521,
152  -22725, 22725, 6270, 26722,
153  12299, -29692, 6270, -17855,
154  29692, -12299, 26722, -31521,
155 };
156 
157 static const struct
158 {
159  DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
160 } tab_frw_01234567_sse2 =
161 {{
162 //DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table
163 #define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
164  C4, C4, C5, C7, C2, C6, C3, -C7, \
165  -C4, C4, C7, C3, C6, -C2, C7, -C5, \
166  C4, -C4, C5, -C1, C2, -C6, C3, -C1,
167 // c1..c7 * cos(pi/4) * 2^15
168 #define C1 22725
169 #define C2 21407
170 #define C3 19266
171 #define C4 16384
172 #define C5 12873
173 #define C6 8867
174 #define C7 4520
175 TABLE_SSE2
176 
177 #undef C1
178 #undef C2
179 #undef C3
180 #undef C4
181 #undef C5
182 #undef C6
183 #undef C7
184 #define C1 31521
185 #define C2 29692
186 #define C3 26722
187 #define C4 22725
188 #define C5 17855
189 #define C6 12299
190 #define C7 6270
191 TABLE_SSE2
192 
193 #undef C1
194 #undef C2
195 #undef C3
196 #undef C4
197 #undef C5
198 #undef C6
199 #undef C7
200 #define C1 29692
201 #define C2 27969
202 #define C3 25172
203 #define C4 21407
204 #define C5 16819
205 #define C6 11585
206 #define C7 5906
207 TABLE_SSE2
208 
209 #undef C1
210 #undef C2
211 #undef C3
212 #undef C4
213 #undef C5
214 #undef C6
215 #undef C7
216 #define C1 26722
217 #define C2 25172
218 #define C3 22654
219 #define C4 19266
220 #define C5 15137
221 #define C6 10426
222 #define C7 5315
223 TABLE_SSE2
224 
225 #undef C1
226 #undef C2
227 #undef C3
228 #undef C4
229 #undef C5
230 #undef C6
231 #undef C7
232 #define C1 22725
233 #define C2 21407
234 #define C3 19266
235 #define C4 16384
236 #define C5 12873
237 #define C6 8867
238 #define C7 4520
239 TABLE_SSE2
240 
241 #undef C1
242 #undef C2
243 #undef C3
244 #undef C4
245 #undef C5
246 #undef C6
247 #undef C7
248 #define C1 26722
249 #define C2 25172
250 #define C3 22654
251 #define C4 19266
252 #define C5 15137
253 #define C6 10426
254 #define C7 5315
255 TABLE_SSE2
256 
257 #undef C1
258 #undef C2
259 #undef C3
260 #undef C4
261 #undef C5
262 #undef C6
263 #undef C7
264 #define C1 29692
265 #define C2 27969
266 #define C3 25172
267 #define C4 21407
268 #define C5 16819
269 #define C6 11585
270 #define C7 5906
271 TABLE_SSE2
272 
273 #undef C1
274 #undef C2
275 #undef C3
276 #undef C4
277 #undef C5
278 #undef C6
279 #undef C7
280 #define C1 31521
281 #define C2 29692
282 #define C3 26722
283 #define C4 22725
284 #define C5 17855
285 #define C6 12299
286 #define C7 6270
287 TABLE_SSE2
288 }};
289 
290 #define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
291 
292 #define FDCT_COL(cpu, mm, mov)\
293 static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
294 {\
295  __asm__ volatile (\
296  #mov" 16(%0), %%"#mm"0 \n\t" \
297  #mov" 96(%0), %%"#mm"1 \n\t" \
298  #mov" %%"#mm"0, %%"#mm"2 \n\t" \
299  #mov" 32(%0), %%"#mm"3 \n\t" \
300  "paddsw %%"#mm"1, %%"#mm"0 \n\t" \
301  #mov" 80(%0), %%"#mm"4 \n\t" \
302  "psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
303  #mov" (%0), %%"#mm"5 \n\t" \
304  "paddsw %%"#mm"3, %%"#mm"4 \n\t" \
305  "paddsw 112(%0), %%"#mm"5 \n\t" \
306  "psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
307  #mov" %%"#mm"0, %%"#mm"6 \n\t" \
308  "psubsw %%"#mm"1, %%"#mm"2 \n\t" \
309  #mov" 16(%1), %%"#mm"1 \n\t" \
310  "psubsw %%"#mm"4, %%"#mm"0 \n\t" \
311  #mov" 48(%0), %%"#mm"7 \n\t" \
312  "pmulhw %%"#mm"0, %%"#mm"1 \n\t" \
313  "paddsw 64(%0), %%"#mm"7 \n\t" \
314  "psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
315  "paddsw %%"#mm"4, %%"#mm"6 \n\t" \
316  "psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
317  #mov" %%"#mm"5, %%"#mm"4 \n\t" \
318  "psubsw %%"#mm"7, %%"#mm"5 \n\t" \
319  "paddsw %%"#mm"5, %%"#mm"1 \n\t" \
320  "paddsw %%"#mm"7, %%"#mm"4 \n\t" \
321  "por (%2), %%"#mm"1 \n\t" \
322  "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
323  "pmulhw 16(%1), %%"#mm"5 \n\t" \
324  #mov" %%"#mm"4, %%"#mm"7 \n\t" \
325  "psubsw 80(%0), %%"#mm"3 \n\t" \
326  "psubsw %%"#mm"6, %%"#mm"4 \n\t" \
327  #mov" %%"#mm"1, 32(%3) \n\t" \
328  "paddsw %%"#mm"6, %%"#mm"7 \n\t" \
329  #mov" 48(%0), %%"#mm"1 \n\t" \
330  "psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
331  "psubsw 64(%0), %%"#mm"1 \n\t" \
332  #mov" %%"#mm"2, %%"#mm"6 \n\t" \
333  #mov" %%"#mm"4, 64(%3) \n\t" \
334  "paddsw %%"#mm"3, %%"#mm"2 \n\t" \
335  "pmulhw (%4), %%"#mm"2 \n\t" \
336  "psubsw %%"#mm"3, %%"#mm"6 \n\t" \
337  "pmulhw (%4), %%"#mm"6 \n\t" \
338  "psubsw %%"#mm"0, %%"#mm"5 \n\t" \
339  "por (%2), %%"#mm"5 \n\t" \
340  "psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
341  "por (%2), %%"#mm"2 \n\t" \
342  #mov" %%"#mm"1, %%"#mm"4 \n\t" \
343  #mov" (%0), %%"#mm"3 \n\t" \
344  "paddsw %%"#mm"6, %%"#mm"1 \n\t" \
345  "psubsw 112(%0), %%"#mm"3 \n\t" \
346  "psubsw %%"#mm"6, %%"#mm"4 \n\t" \
347  #mov" (%1), %%"#mm"0 \n\t" \
348  "psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
349  #mov" 32(%1), %%"#mm"6 \n\t" \
350  "pmulhw %%"#mm"1, %%"#mm"0 \n\t" \
351  #mov" %%"#mm"7, (%3) \n\t" \
352  "pmulhw %%"#mm"4, %%"#mm"6 \n\t" \
353  #mov" %%"#mm"5, 96(%3) \n\t" \
354  #mov" %%"#mm"3, %%"#mm"7 \n\t" \
355  #mov" 32(%1), %%"#mm"5 \n\t" \
356  "psubsw %%"#mm"2, %%"#mm"7 \n\t" \
357  "paddsw %%"#mm"2, %%"#mm"3 \n\t" \
358  "pmulhw %%"#mm"7, %%"#mm"5 \n\t" \
359  "paddsw %%"#mm"3, %%"#mm"0 \n\t" \
360  "paddsw %%"#mm"4, %%"#mm"6 \n\t" \
361  "pmulhw (%1), %%"#mm"3 \n\t" \
362  "por (%2), %%"#mm"0 \n\t" \
363  "paddsw %%"#mm"7, %%"#mm"5 \n\t" \
364  "psubsw %%"#mm"6, %%"#mm"7 \n\t" \
365  #mov" %%"#mm"0, 16(%3) \n\t" \
366  "paddsw %%"#mm"4, %%"#mm"5 \n\t" \
367  #mov" %%"#mm"7, 48(%3) \n\t" \
368  "psubsw %%"#mm"1, %%"#mm"3 \n\t" \
369  #mov" %%"#mm"5, 80(%3) \n\t" \
370  #mov" %%"#mm"3, 112(%3) \n\t" \
371  : \
372  : "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
373  "r" (out + offset), "r" (ocos_4_16)); \
374 }
375 
376 FDCT_COL(mmx, mm, movq)
377 FDCT_COL(sse2, xmm, movdqa)
378 
379 static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
380 {
381  __asm__ volatile(
382 #define FDCT_ROW_SSE2_H1(i,t) \
383  "movq " #i "(%0), %%xmm2 \n\t" \
384  "movq " #i "+8(%0), %%xmm0 \n\t" \
385  "movdqa " #t "+32(%1), %%xmm3 \n\t" \
386  "movdqa " #t "+48(%1), %%xmm7 \n\t" \
387  "movdqa " #t "(%1), %%xmm4 \n\t" \
388  "movdqa " #t "+16(%1), %%xmm5 \n\t"
389 
390 #define FDCT_ROW_SSE2_H2(i,t) \
391  "movq " #i "(%0), %%xmm2 \n\t" \
392  "movq " #i "+8(%0), %%xmm0 \n\t" \
393  "movdqa " #t "+32(%1), %%xmm3 \n\t" \
394  "movdqa " #t "+48(%1), %%xmm7 \n\t"
395 
396 #define FDCT_ROW_SSE2(i) \
397  "movq %%xmm2, %%xmm1 \n\t" \
398  "pshuflw $27, %%xmm0, %%xmm0 \n\t" \
399  "paddsw %%xmm0, %%xmm1 \n\t" \
400  "psubsw %%xmm0, %%xmm2 \n\t" \
401  "punpckldq %%xmm2, %%xmm1 \n\t" \
402  "pshufd $78, %%xmm1, %%xmm2 \n\t" \
403  "pmaddwd %%xmm2, %%xmm3 \n\t" \
404  "pmaddwd %%xmm1, %%xmm7 \n\t" \
405  "pmaddwd %%xmm5, %%xmm2 \n\t" \
406  "pmaddwd %%xmm4, %%xmm1 \n\t" \
407  "paddd %%xmm7, %%xmm3 \n\t" \
408  "paddd %%xmm2, %%xmm1 \n\t" \
409  "paddd %%xmm6, %%xmm3 \n\t" \
410  "paddd %%xmm6, %%xmm1 \n\t" \
411  "psrad %3, %%xmm3 \n\t" \
412  "psrad %3, %%xmm1 \n\t" \
413  "packssdw %%xmm3, %%xmm1 \n\t" \
414  "movdqa %%xmm1, " #i "(%4) \n\t"
415 
416  "movdqa (%2), %%xmm6 \n\t"
417  FDCT_ROW_SSE2_H1(0,0)
418  FDCT_ROW_SSE2(0)
419  FDCT_ROW_SSE2_H2(64,0)
420  FDCT_ROW_SSE2(64)
421 
422  FDCT_ROW_SSE2_H1(16,64)
423  FDCT_ROW_SSE2(16)
424  FDCT_ROW_SSE2_H2(112,64)
425  FDCT_ROW_SSE2(112)
426 
427  FDCT_ROW_SSE2_H1(32,128)
428  FDCT_ROW_SSE2(32)
429  FDCT_ROW_SSE2_H2(96,128)
430  FDCT_ROW_SSE2(96)
431 
432  FDCT_ROW_SSE2_H1(48,192)
433  FDCT_ROW_SSE2(48)
434  FDCT_ROW_SSE2_H2(80,192)
435  FDCT_ROW_SSE2(80)
436  :
437  : "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2),
438  "r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
439  XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
440  "%xmm4", "%xmm5", "%xmm6", "%xmm7")
441  );
442 }
443 
444 static av_always_inline void fdct_row_mmxext(const int16_t *in, int16_t *out,
445  const int16_t *table)
446 {
447  __asm__ volatile (
448  "pshufw $0x1B, 8(%0), %%mm5 \n\t"
449  "movq (%0), %%mm0 \n\t"
450  "movq %%mm0, %%mm1 \n\t"
451  "paddsw %%mm5, %%mm0 \n\t"
452  "psubsw %%mm5, %%mm1 \n\t"
453  "movq %%mm0, %%mm2 \n\t"
454  "punpckldq %%mm1, %%mm0 \n\t"
455  "punpckhdq %%mm1, %%mm2 \n\t"
456  "movq (%1), %%mm1 \n\t"
457  "movq 8(%1), %%mm3 \n\t"
458  "movq 16(%1), %%mm4 \n\t"
459  "movq 24(%1), %%mm5 \n\t"
460  "movq 32(%1), %%mm6 \n\t"
461  "movq 40(%1), %%mm7 \n\t"
462  "pmaddwd %%mm0, %%mm1 \n\t"
463  "pmaddwd %%mm2, %%mm3 \n\t"
464  "pmaddwd %%mm0, %%mm4 \n\t"
465  "pmaddwd %%mm2, %%mm5 \n\t"
466  "pmaddwd %%mm0, %%mm6 \n\t"
467  "pmaddwd %%mm2, %%mm7 \n\t"
468  "pmaddwd 48(%1), %%mm0 \n\t"
469  "pmaddwd 56(%1), %%mm2 \n\t"
470  "paddd %%mm1, %%mm3 \n\t"
471  "paddd %%mm4, %%mm5 \n\t"
472  "paddd %%mm6, %%mm7 \n\t"
473  "paddd %%mm0, %%mm2 \n\t"
474  "movq (%2), %%mm0 \n\t"
475  "paddd %%mm0, %%mm3 \n\t"
476  "paddd %%mm0, %%mm5 \n\t"
477  "paddd %%mm0, %%mm7 \n\t"
478  "paddd %%mm0, %%mm2 \n\t"
479  "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
480  "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
481  "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
482  "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
483  "packssdw %%mm5, %%mm3 \n\t"
484  "packssdw %%mm2, %%mm7 \n\t"
485  "movq %%mm3, (%3) \n\t"
486  "movq %%mm7, 8(%3) \n\t"
487  :
488  : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
489 }
490 
491 static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
492 {
493  //FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
494  __asm__ volatile(
495  "movd 12(%0), %%mm1 \n\t"
496  "punpcklwd 8(%0), %%mm1 \n\t"
497  "movq %%mm1, %%mm2 \n\t"
498  "psrlq $0x20, %%mm1 \n\t"
499  "movq 0(%0), %%mm0 \n\t"
500  "punpcklwd %%mm2, %%mm1 \n\t"
501  "movq %%mm0, %%mm5 \n\t"
502  "paddsw %%mm1, %%mm0 \n\t"
503  "psubsw %%mm1, %%mm5 \n\t"
504  "movq %%mm0, %%mm2 \n\t"
505  "punpckldq %%mm5, %%mm0 \n\t"
506  "punpckhdq %%mm5, %%mm2 \n\t"
507  "movq 0(%1), %%mm1 \n\t"
508  "movq 8(%1), %%mm3 \n\t"
509  "movq 16(%1), %%mm4 \n\t"
510  "movq 24(%1), %%mm5 \n\t"
511  "movq 32(%1), %%mm6 \n\t"
512  "movq 40(%1), %%mm7 \n\t"
513  "pmaddwd %%mm0, %%mm1 \n\t"
514  "pmaddwd %%mm2, %%mm3 \n\t"
515  "pmaddwd %%mm0, %%mm4 \n\t"
516  "pmaddwd %%mm2, %%mm5 \n\t"
517  "pmaddwd %%mm0, %%mm6 \n\t"
518  "pmaddwd %%mm2, %%mm7 \n\t"
519  "pmaddwd 48(%1), %%mm0 \n\t"
520  "pmaddwd 56(%1), %%mm2 \n\t"
521  "paddd %%mm1, %%mm3 \n\t"
522  "paddd %%mm4, %%mm5 \n\t"
523  "paddd %%mm6, %%mm7 \n\t"
524  "paddd %%mm0, %%mm2 \n\t"
525  "movq (%2), %%mm0 \n\t"
526  "paddd %%mm0, %%mm3 \n\t"
527  "paddd %%mm0, %%mm5 \n\t"
528  "paddd %%mm0, %%mm7 \n\t"
529  "paddd %%mm0, %%mm2 \n\t"
530  "psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
531  "psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
532  "psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
533  "psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
534  "packssdw %%mm5, %%mm3 \n\t"
535  "packssdw %%mm2, %%mm7 \n\t"
536  "movq %%mm3, 0(%3) \n\t"
537  "movq %%mm7, 8(%3) \n\t"
538  :
539  : "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
540 }
541 
542 void ff_fdct_mmx(int16_t *block)
543 {
544  DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
545  int16_t * block1= (int16_t*)align_tmp;
546  const int16_t *table= tab_frw_01234567;
547  int i;
548 
549  fdct_col_mmx(block, block1, 0);
550  fdct_col_mmx(block, block1, 4);
551 
552  for(i=8;i>0;i--) {
553  fdct_row_mmx(block1, block, table);
554  block1 += 8;
555  table += 32;
556  block += 8;
557  }
558 }
559 
560 #endif /* HAVE_MMX_INLINE */
561 
562 #if HAVE_MMXEXT_INLINE
563 
564 void ff_fdct_mmxext(int16_t *block)
565 {
566  DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
567  int16_t *block1= (int16_t*)align_tmp;
568  const int16_t *table= tab_frw_01234567;
569  int i;
570 
571  fdct_col_mmx(block, block1, 0);
572  fdct_col_mmx(block, block1, 4);
573 
574  for(i=8;i>0;i--) {
575  fdct_row_mmxext(block1, block, table);
576  block1 += 8;
577  table += 32;
578  block += 8;
579  }
580 }
581 
582 #endif /* HAVE_MMXEXT_INLINE */
583 
584 #if HAVE_SSE2_INLINE
585 
586 void ff_fdct_sse2(int16_t *block)
587 {
588  DECLARE_ALIGNED(16, int64_t, align_tmp)[16];
589  int16_t * const block1= (int16_t*)align_tmp;
590 
591  fdct_col_sse2(block, block1, 0);
592  fdct_row_sse2(block1, block);
593 }
594 
595 #endif /* HAVE_SSE2_INLINE */
SHIFT_FRW_ROW
#define SHIFT_FRW_ROW
Definition: xvid_idct_mmi.c:38
r
const char * r
Definition: vf_curves.c:116
mem_internal.h
out
FILE * out
Definition: movenc.c:54
ff_fdct_mmxext
void ff_fdct_mmxext(int16_t *block)
fdct.h
table
static const uint16_t table[]
Definition: prosumer.c:206
S
#define S(s, c, i)
Definition: flacdsp_template.c:46
RND_FRW_ROW
#define RND_FRW_ROW
Definition: xvid_idct_mmi.c:39
ff_fdct_sse2
void ff_fdct_sse2(int16_t *block)
int32_t
int32_t
Definition: audio_convert.c:194
asm.h
ff_fdct_mmx
void ff_fdct_mmx(int16_t *block)
XMM_CLOBBERS_ONLY
#define XMM_CLOBBERS_ONLY(...)
Definition: asm.h:99
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem.h:117
in
uint8_t pi<< 24) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi - 0x80) *(1.0f/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_U8, uint8_t,(*(const uint8_t *) pi - 0x80) *(1.0/(1<< 7))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S16, int16_t,(*(const int16_t *) pi >> 8)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S16, int16_t, *(const int16_t *) pi *(1.0f/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S16, int16_t, *(const int16_t *) pi *(1.0/(1<< 15))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_S32, int32_t,(*(const int32_t *) pi >> 24)+0x80) CONV_FUNC_GROUP(AV_SAMPLE_FMT_FLT, float, AV_SAMPLE_FMT_S32, int32_t, *(const int32_t *) pi *(1.0f/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_DBL, double, AV_SAMPLE_FMT_S32, int32_t, *(const int32_t *) pi *(1.0/(1U<< 31))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_FLT, float, av_clip_uint8(lrintf(*(const float *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_FLT, float, av_clip_int16(lrintf(*(const float *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_FLT, float, av_clipl_int32(llrintf(*(const float *) pi *(1U<< 31)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_U8, uint8_t, AV_SAMPLE_FMT_DBL, double, av_clip_uint8(lrint(*(const double *) pi *(1<< 7))+0x80)) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S16, int16_t, AV_SAMPLE_FMT_DBL, double, av_clip_int16(lrint(*(const double *) pi *(1<< 15)))) CONV_FUNC_GROUP(AV_SAMPLE_FMT_S32, int32_t, AV_SAMPLE_FMT_DBL, double, av_clipl_int32(llrint(*(const double *) pi *(1U<< 31)))) #define SET_CONV_FUNC_GROUP(ofmt, ifmt) static void set_generic_function(AudioConvert *ac) { } void ff_audio_convert_free(AudioConvert **ac) { if(! 
*ac) return;ff_dither_free(&(*ac) ->dc);av_freep(ac);} AudioConvert *ff_audio_convert_alloc(AVAudioResampleContext *avr, enum AVSampleFormat out_fmt, enum AVSampleFormat in_fmt, int channels, int sample_rate, int apply_map) { AudioConvert *ac;int in_planar, out_planar;ac=av_mallocz(sizeof(*ac));if(!ac) return NULL;ac->avr=avr;ac->out_fmt=out_fmt;ac->in_fmt=in_fmt;ac->channels=channels;ac->apply_map=apply_map;if(avr->dither_method !=AV_RESAMPLE_DITHER_NONE &&av_get_packed_sample_fmt(out_fmt)==AV_SAMPLE_FMT_S16 &&av_get_bytes_per_sample(in_fmt) > 2) { ac->dc=ff_dither_alloc(avr, out_fmt, in_fmt, channels, sample_rate, apply_map);if(!ac->dc) { av_free(ac);return NULL;} return ac;} in_planar=ff_sample_fmt_is_planar(in_fmt, channels);out_planar=ff_sample_fmt_is_planar(out_fmt, channels);if(in_planar==out_planar) { ac->func_type=CONV_FUNC_TYPE_FLAT;ac->planes=in_planar ? ac->channels :1;} else if(in_planar) ac->func_type=CONV_FUNC_TYPE_INTERLEAVE;else ac->func_type=CONV_FUNC_TYPE_DEINTERLEAVE;set_generic_function(ac);if(ARCH_AARCH64) ff_audio_convert_init_aarch64(ac);if(ARCH_ARM) ff_audio_convert_init_arm(ac);if(ARCH_X86) ff_audio_convert_init_x86(ac);return ac;} int ff_audio_convert(AudioConvert *ac, AudioData *out, AudioData *in) { int use_generic=1;int len=in->nb_samples;int p;if(ac->dc) { av_log(ac->avr, AV_LOG_TRACE, "%d samples - audio_convert: %s to %s (dithered)\n", len, av_get_sample_fmt_name(ac->in_fmt), av_get_sample_fmt_name(ac->out_fmt));return ff_convert_dither(ac-> in
Definition: audio_convert.c:326
i
int i
Definition: input.c:407
common.h
av_always_inline
#define av_always_inline
Definition: attributes.h:49
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
block
The exact code depends on how similar the blocks are and how related they are to the block
Definition: filter_design.txt:207
block1
static int16_t block1[64]
Definition: dct.c:117