FFmpeg
xvid_idct_mmi.c
Go to the documentation of this file.
1 /*
2  * Loongson SIMD optimized xvid idct
3  *
4  * Copyright (c) 2015 Loongson Technology Corporation Limited
5  * Copyright (c) 2015 Zhou Xiaoyong <zhouxiaoyong@loongson.cn>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "libavutil/mem_internal.h"
25 
26 #include "idctdsp_mips.h"
27 #include "xvididct_mips.h"
28 
/*
 * Fixed-point precision parameters. The *_INV_* group is derived from
 * BITS_INV_ACC and the *_FRW_* group from BITS_FRW_ACC.
 * NOTE(review): none of these macros is referenced in the visible part of
 * this file; the assembly macros below hard-code the shift counts 11 and 6,
 * which equal SHIFT_INV_ROW and SHIFT_INV_COL for BITS_INV_ACC == 5.
 * Keep them in sync by hand if BITS_INV_ACC is ever changed.
 */
29 #define BITS_INV_ACC 5 // 4 or 5 for IEEE
30 #define SHIFT_INV_ROW (16 - BITS_INV_ACC) // 11 when BITS_INV_ACC is 5
31 #define SHIFT_INV_COL (1 + BITS_INV_ACC) // 6 when BITS_INV_ACC is 5
32 #define RND_INV_ROW (1024 * (6 - BITS_INV_ACC)) // 1024 when BITS_INV_ACC is 5
33 #define RND_INV_COL (16 * (BITS_INV_ACC - 3)) // 32 when BITS_INV_ACC is 5
34 #define RND_INV_CORR (RND_INV_COL - 1) // 31 when BITS_INV_ACC is 5
35 
36 #define BITS_FRW_ACC 3 // 2 or 3 for accuracy
37 #define SHIFT_FRW_COL BITS_FRW_ACC // 3
38 #define SHIFT_FRW_ROW (BITS_FRW_ACC + 17) // 20 when BITS_FRW_ACC is 3
39 #define RND_FRW_ROW (262144*(BITS_FRW_ACC - 1)) // 524288 when BITS_FRW_ACC is 3
40 
/*
 * Column-pass constants. Each value is replicated across the four 16-bit
 * lanes of one 64-bit word so it can be applied to four columns at once.
 * DCT_8_INV_COL loads the words through asm operand %3 at byte offsets
 * 0, 8, 16 and 24 and refers to them as tg_1_16, tg_2_16, (tg_3_16 - 1)
 * and ocos_4_16. The row order must therefore not be changed.
 */
41 DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4*4] = {
42  13036, 13036, 13036, 13036, // tg_1_16:     tan(1*pi/16) * (1 << 16), rounded
43  27146, 27146, 27146, 27146, // tg_2_16:     tan(2*pi/16) * (1 << 16), rounded
44  -21746,-21746,-21746,-21746, // tg_3_16 - 1: (tan(3*pi/16) - 1) * (1 << 16), rounded
45  23170, 23170, 23170, 23170 // ocos_4_16:   cos(pi/4) * (1 << 15), rounded
46 };
47 
/*
 * Per-row rounding terms for the row pass. Entry i is a pair of identical
 * 32-bit values (8 bytes); ff_xvid_idct_mmi passes 8*i(%1) as the A4
 * argument of DCT_8_INV_ROW_MMI for row i, where the pair is added to the
 * even-part sums before the arithmetic right shift.
 */
48 DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2*8] = {
49  65536,65536, // row 0
50  3597, 3597, // row 1
51  2260, 2260, // row 2
52  1203, 1203, // row 3
53  0, 0, // row 4
54  120, 120, // row 5
55  512, 512, // row 6
56  512, 512 // row 7
57 };
58 
/*
 * Row-pass coefficient tables: four groups of 32 int16 values (64 bytes
 * each). Within a group, DCT_8_INV_ROW_MMI loads eight 64-bit words at byte
 * offsets 0, 8, ..., 56; the per-line comments name the coefficient held in
 * each 16-bit lane. ff_xvid_idct_mmi selects the group per row through the
 * A3 argument (64*g(%2)):
 *   group 0 -> rows 0 and 4
 *   group 1 -> rows 1 and 7
 *   group 2 -> rows 2 and 6
 *   group 3 -> rows 3 and 5
 */
59 DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmi)[32*4] = {
/* group 0 (byte offset 0): rows 0 and 4 */
60  16384, 21407, 16384, 8867, // w05 w04 w01 w00
61  16384, 8867,-16384,-21407, // w07 w06 w03 w02
62  16384, -8867, 16384,-21407, // w13 w12 w09 w08
63  -16384, 21407, 16384, -8867, // w15 w14 w11 w10
64  22725, 19266, 19266, -4520, // w21 w20 w17 w16
65  12873, 4520,-22725,-12873, // w23 w22 w19 w18
66  12873,-22725, 4520,-12873, // w29 w28 w25 w24
67  4520, 19266, 19266,-22725, // w31 w30 w27 w26
68 
/* group 1 (byte offset 64): rows 1 and 7 */
69  22725, 29692, 22725, 12299, // w05 w04 w01 w00
70  22725, 12299,-22725,-29692, // w07 w06 w03 w02
71  22725,-12299, 22725,-29692, // w13 w12 w09 w08
72  -22725, 29692, 22725,-12299, // w15 w14 w11 w10
73  31521, 26722, 26722, -6270, // w21 w20 w17 w16
74  17855, 6270,-31521,-17855, // w23 w22 w19 w18
75  17855,-31521, 6270,-17855, // w29 w28 w25 w24
76  6270, 26722, 26722,-31521, // w31 w30 w27 w26
77 
/* group 2 (byte offset 128): rows 2 and 6 */
78  21407, 27969, 21407, 11585, // w05 w04 w01 w00
79  21407, 11585,-21407,-27969, // w07 w06 w03 w02
80  21407,-11585, 21407,-27969, // w13 w12 w09 w08
81  -21407, 27969, 21407,-11585, // w15 w14 w11 w10
82  29692, 25172, 25172, -5906, // w21 w20 w17 w16
83  16819, 5906,-29692,-16819, // w23 w22 w19 w18
84  16819,-29692, 5906,-16819, // w29 w28 w25 w24
85  5906, 25172, 25172,-29692, // w31 w30 w27 w26
86 
/* group 3 (byte offset 192): rows 3 and 5 */
87  19266, 25172, 19266, 10426, // w05 w04 w01 w00
88  19266, 10426,-19266,-25172, // w07 w06 w03 w02
89  19266,-10426, 19266,-25172, // w13 w12 w09 w08
90  -19266, 25172, 19266,-10426, // w15 w14 w11 w10
91  26722, 22654, 22654, -5315, // w21 w20 w17 w16
92  15137, 5315,-26722,-15137, // w23 w22 w19 w18
93  15137,-26722, 5315,-15137, // w29 w28 w25 w24
94  5315, 22654, 22654,-26722, // w31 w30 w27 w26
95 };
96 
/*
 * Expands to the assembly text for the row pass over ONE row of eight
 * 16-bit coefficients. The arguments are stringified and pasted into the
 * instruction text, so each must be an assembler memory operand such as
 * 1*16(%0):
 *   A1  input row: x3..x0 at A1, x7..x4 at 8+A1
 *   A2  output row: y3..y0 stored at A2, y7..y4 at 8+A2
 *   A3  start of one 64-byte coefficient group (w00..w31, eight 64-bit words)
 *   A4  64-bit rounding word (two 32-bit lanes) added to the even sums
 *
 * Outline: the even inputs (x0,x2,x4,x6) and odd inputs (x1,x3,x5,x7) are
 * shuffled into separate registers, multiplied and pair-wise accumulated
 * with the coefficients (pmaddhw) into 32-bit sums a0..a3 and b0..b3, then
 * y = (a +/- b) >> 11 is computed and packed back to 16 bits with signed
 * saturation (packsswh).
 *
 * Scratch registers: integer $10 and FP $f0, $f2, ..., $f18. $f16 is reused
 * in turn as shuffle selector, coefficient temporary and shift count, so
 * the instruction order matters.
 */
97 #define DCT_8_INV_ROW_MMI(A1,A2,A3,A4) \
98  "dli $10, 0x88 \n\t" \
99  "ldc1 $f4, "#A1" \n\t" /* 0; x3 x2 x1 x0 */\
100  "dmtc1 $10, $f16 \n\t" \
101  "ldc1 $f10, 8+"#A1" \n\t" /* 1; x7 x6 x5 x4 */\
102  "ldc1 $f6, "#A3" \n\t" /* 3; w05 w04 w01 w00 */\
103  "pshufh $f0, $f4, $f16 \n\t" /* x2 x0 x2 x0 */\
104  "ldc1 $f8, 8+"#A3" \n\t" /* 4; w07 w06 w03 w02 */\
105  "ldc1 $f12, 32+"#A3" \n\t" /* 6; w21 w20 w17 w16 */\
106  "pmaddhw $f6, $f6, $f0 \n\t" /* x2*w05+x0*w04 x2*w01+x0*w00 */\
107  "dli $10, 0xdd \n\t" \
108  "pshufh $f2, $f10, $f16 \n\t" /* x6 x4 x6 x4 */\
109  "dmtc1 $10, $f16 \n\t" \
110  "pmaddhw $f8, $f8, $f2 \n\t" /* x6*w07+x4*w06 x6*w03+x4*w02 */\
111  "ldc1 $f14, 40+"#A3" \n\t" /* 7; w23 w22 w19 w18 */\
112  "pshufh $f4, $f4, $f16 \n\t" /* x3 x1 x3 x1 */\
113  "pmaddhw $f12, $f12, $f4 \n\t" /* x3*w21+x1*w20 x3*w17+x1*w16 */\
114  "pshufh $f10, $f10, $f16 \n\t" /* x7 x5 x7 x5 */\
115  "ldc1 $f18, "#A4" \n\t" /* rounder for this row */\
116  "pmaddhw $f14, $f14, $f10 \n\t" /* x7*w23+x5*w22 x7*w19+x5*w18 */\
117  "paddw $f6, $f6, $f18 \n\t" /* + rounder (A4) */\
118  "ldc1 $f16, 16+"#A3" \n\t" \
119  "pmaddhw $f0, $f0, $f16 \n\t" /* x2*w13+x0*w12 x2*w09+x0*w08 */\
120  "ldc1 $f16, 24+"#A3" \n\t" \
121  "paddw $f6, $f6, $f8 \n\t" /* 4; a1=sum(even1) a0=sum(even0) */\
122  "pmaddhw $f2, $f2, $f16 \n\t" /* x6*w15+x4*w14 x6*w11+x4*w10 */\
123  "ldc1 $f16, 48+"#A3" \n\t" \
124  "pmaddhw $f4, $f4, $f16 \n\t" /* x3*w29+x1*w28 x3*w25+x1*w24 */\
125  "ldc1 $f16, 56+"#A3" \n\t" \
126  "paddw $f12, $f12, $f14 \n\t" /* 7; b1=sum(odd1) b0=sum(odd0) */\
127  "dli $10, 11 \n\t" /* row shift count */\
128  "pmaddhw $f10, $f10, $f16 \n\t" /* x7*w31+x5*w30 x7*w27+x5*w26 */\
129  "dmtc1 $10, $f16 \n\t" \
130  "psubw $f8, $f6, $f12 \n\t" /* 6; a1-b1 a0-b0 */\
131  "paddw $f6, $f6, $f12 \n\t" /* a1+b1 a0+b0 */\
132  "paddw $f0, $f0, $f18 \n\t" /* + rounder (A4) */\
133  "psraw $f6, $f6, $f16 \n\t" /* y1=a1+b1 y0=a0+b0 */\
134  "paddw $f0, $f0, $f2 \n\t" /* 1; a3=sum(even3) a2=sum(even2) */\
135  "paddw $f4, $f4, $f10 \n\t" /* 5; b3=sum(odd3) b2=sum(odd2) */\
136  "psraw $f8, $f8, $f16 \n\t" /* y6=a1-b1 y7=a0-b0 */\
137  "psubw $f14, $f0, $f4 \n\t" /* 2; a3-b3 a2-b2 */\
138  "paddw $f0, $f0, $f4 \n\t" /* a3+b3 a2+b2 */\
139  "psraw $f0, $f0, $f16 \n\t" /* y3=a3+b3 y2=a2+b2 */\
140  "psraw $f14, $f14, $f16 \n\t" /* y4=a3-b3 y5=a2-b2 */\
141  "dli $10, 0xb1 \n\t" \
142  "packsswh $f6, $f6, $f0 \n\t" /* 0; y3 y2 y1 y0 */\
143  "dmtc1 $10, $f16 \n\t" \
144  "packsswh $f14, $f14, $f8 \n\t" /* 4; y6 y7 y4 y5 */\
145  "sdc1 $f6, "#A2" \n\t" /* 3; save y3 y2 y1 y0 */\
146  "pshufh $f14, $f14, $f16 \n\t" /* y7 y6 y5 y4 */\
147  "sdc1 $f14, 8+"#A2" \n\t" /* 7; save y7 y6 y5 y4 */\
148 
149 
/*
 * Expands to the assembly text for the column pass over FOUR adjacent
 * columns (one 64-bit word = four 16-bit lanes per row). The arguments are
 * stringified and pasted into the instruction text:
 *   A1  memory operand of row 0 of the input strip; row k is read at k*16+A1
 *   A2  memory operand of row 0 of the output strip; row k is stored at k*16+A2
 * The constants are read through asm operand %3 at byte offsets 0, 8, 16, 24.
 *
 * Outline: the odd rows give b0..b3, the even rows give a0..a3, and the
 * results dst0..dst7 are (a +/- b) >> 6 using 16-bit saturating adds and
 * subtracts (paddsh/psubsh).
 *
 * b0 and b3 are parked in rows 3 and 5 of A2 and reloaded later. Those
 * stores happen after rows 3 and 5 of A1 have been loaded, so the macro
 * also works when A1 and A2 name the same memory, which is how
 * ff_xvid_idct_mmi invokes it.
 *
 * Scratch registers: integer $10 and FP $f0, $f2, ..., $f16.
 */
150 #define DCT_8_INV_COL(A1,A2) \
151  "ldc1 $f2, 2*8(%3) \n\t" /* tg_3_16-1 */\
152  "ldc1 $f6, 16*3+"#A1" \n\t" /* x3 */\
153  "ldc1 $f10, 16*5+"#A1" \n\t" /* x5 */\
154  "pmulhh $f0, $f2, $f6 \n\t" /* x3*(tg_3_16-1) */\
155  "ldc1 $f4, 0(%3) \n\t" /* tg_1_16 */\
156  "pmulhh $f2, $f2, $f10 \n\t" /* x5*(tg_3_16-1) */\
157  "ldc1 $f14, 16*7+"#A1" \n\t" /* x7 */\
158  "ldc1 $f12, 16*1+"#A1" \n\t" /* x1 */\
159  "pmulhh $f8, $f4, $f14 \n\t" /* x7*tg_1_16 */\
160  "paddsh $f0, $f0, $f6 \n\t" /* x3*tg_3_16 */\
161  "pmulhh $f4, $f4, $f12 \n\t" /* x1*tg_1_16 */\
162  "paddsh $f2, $f2, $f6 \n\t" /* x3+x5*(tg_3_16-1) */\
163  "psubsh $f0, $f0, $f10 \n\t" /* x3*tg_3_16-x5 = tm35 */\
164  "ldc1 $f6, 3*8(%3) \n\t" /* ocos_4_16 */\
165  "paddsh $f2, $f2, $f10 \n\t" /* x3+x5*tg_3_16 = tp35 */\
166  "paddsh $f8, $f8, $f12 \n\t" /* x1+tg_1_16*x7 = tp17 */\
167  "psubsh $f4, $f4, $f14 \n\t" /* x1*tg_1_16-x7 = tm17 */\
168  "paddsh $f10, $f8, $f2 \n\t" /* tp17+tp35 = b0 */\
169  "psubsh $f12, $f4, $f0 \n\t" /* tm17-tm35 = b3 */\
170  "psubsh $f8, $f8, $f2 \n\t" /* tp17-tp35 = t1 */\
171  "paddsh $f4, $f4, $f0 \n\t" /* tm17+tm35 = t2 */\
172  "ldc1 $f14, 1*8(%3) \n\t" /* tg_2_16 */\
173  "sdc1 $f10, 3*16+"#A2" \n\t" /* save b0 */\
174  "paddsh $f2, $f8, $f4 \n\t" /* t1+t2 */\
175  "sdc1 $f12, 5*16+"#A2" \n\t" /* save b3 */\
176  "psubsh $f8, $f8, $f4 \n\t" /* t1-t2 */\
177  "ldc1 $f10, 2*16+"#A1" \n\t" /* x2 */\
178  "ldc1 $f12, 6*16+"#A1" \n\t" /* x6 */\
179  "pmulhh $f0, $f14, $f10 \n\t" /* x2*tg_2_16 */\
180  "pmulhh $f14, $f14, $f12 \n\t" /* x6*tg_2_16 */\
181  "pmulhh $f2, $f2, $f6 \n\t" /* ocos_4_16*(t1+t2) = b1/2 */\
182  "ldc1 $f4, 0*16+"#A1" \n\t" /* x0 */\
183  "pmulhh $f8, $f8, $f6 \n\t" /* ocos_4_16*(t1-t2) = b2/2 */\
184  "psubsh $f0, $f0, $f12 \n\t" /* x2*tg_2_16-x6 = tm26 */\
185  "ldc1 $f12, 4*16+"#A1" \n\t" /* x4 */\
186  "paddsh $f14, $f14, $f10 \n\t" /* x2+x6*tg_2_16 = tp26 */\
187  "psubsh $f6, $f4, $f12 \n\t" /* x0-x4 = tm04 */\
188  "paddsh $f4, $f4, $f12 \n\t" /* x0+x4 = tp04 */\
189  "paddsh $f10, $f4, $f14 \n\t" /* tp04+tp26 = a0 */\
190  "psubsh $f12, $f6, $f0 \n\t" /* tm04-tm26 = a2 */\
191  "psubsh $f4, $f4, $f14 \n\t" /* tp04-tp26 = a3 */\
192  "paddsh $f6, $f6, $f0 \n\t" /* tm04+tm26 = a1 */\
193  "paddsh $f2, $f2, $f2 \n\t" /* b1 */\
194  "paddsh $f8, $f8, $f8 \n\t" /* b2 */\
195  "psubsh $f14, $f6, $f2 \n\t" /* a1-b1 */\
196  "dli $10, 6 \n\t" /* column shift count */\
197  "paddsh $f6, $f6, $f2 \n\t" /* a1+b1 */\
198  "dmtc1 $10, $f16 \n\t" \
199  "psubsh $f0, $f12, $f8 \n\t" /* a2-b2 */\
200  "paddsh $f12, $f12, $f8 \n\t" /* a2+b2 */\
201  "psrah $f6, $f6, $f16 \n\t" /* dst1 */\
202  "psrah $f12, $f12, $f16 \n\t" /* dst2 */\
203  "ldc1 $f2, 3*16+"#A2" \n\t" /* load b0 */\
204  "psrah $f14, $f14, $f16 \n\t" /* dst6 */\
205  "psrah $f0, $f0, $f16 \n\t" /* dst5 */\
206  "sdc1 $f6, 1*16+"#A2" \n\t" \
207  "psubsh $f8, $f10, $f2 \n\t" /* a0-b0 */\
208  "paddsh $f10, $f10, $f2 \n\t" /* a0+b0 */\
209  "sdc1 $f12, 2*16+"#A2" \n\t" \
210  "ldc1 $f6, 5*16+"#A2" \n\t" /* load b3 */\
211  "psrah $f10, $f10, $f16 \n\t" /* dst0 */\
212  "psrah $f8, $f8, $f16 \n\t" /* dst7 */\
213  "sdc1 $f0, 5*16+"#A2" \n\t" \
214  "psubsh $f12, $f4, $f6 \n\t" /* a3-b3 */\
215  "paddsh $f4, $f4, $f6 \n\t" /* a3+b3 */\
216  "sdc1 $f14, 6*16+"#A2" \n\t" \
217  "sdc1 $f10, 0*16+"#A2" \n\t" \
218  "psrah $f4, $f4, $f16 \n\t" /* dst3 */\
219  "sdc1 $f8, 7*16+"#A2" \n\t" \
220  "psrah $f12, $f12, $f16 \n\t" /* dst4 */\
221  "sdc1 $f4, 3*16+"#A2" \n\t" \
222  "sdc1 $f12, 4*16+"#A2" \n\t" \
223 
224 
225 void ff_xvid_idct_mmi(int16_t *block)
226 {
227  __asm__ volatile (
228  //# Process each row
229  DCT_8_INV_ROW_MMI(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
230  DCT_8_INV_ROW_MMI(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
231  DCT_8_INV_ROW_MMI(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
232  DCT_8_INV_ROW_MMI(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
233  DCT_8_INV_ROW_MMI(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
234  DCT_8_INV_ROW_MMI(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
235  DCT_8_INV_ROW_MMI(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
236  DCT_8_INV_ROW_MMI(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))
237  //# Process the columns (4 at a time)
238  DCT_8_INV_COL(0(%0), 0(%0))
239  DCT_8_INV_COL(8(%0), 8(%0))
240  ::"r"(block),"r"(rounder_0),"r"(tab_i_04_mmi),"r"(tg_1_16)
241  : "$10"
242  );
243 }
244 
/**
 * Inverse-transform @p block in place, then write the result to @p dest
 * through ff_put_pixels_clamped_mmi().
 *
 * @param dest      destination pixels
 * @param line_size stride of @p dest, forwarded unchanged to
 *                  ff_put_pixels_clamped_mmi()
 * @param block     64 coefficients; overwritten by the inverse transform
 */
void ff_xvid_idct_put_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    /* The coefficients must be transformed before they are stored as pixels. */
    ff_xvid_idct_mmi(block);
    ff_put_pixels_clamped_mmi(block, dest, line_size);
}
250 
/**
 * Inverse-transform @p block in place, then add the result to @p dest
 * through ff_add_pixels_clamped_mmi().
 *
 * @param dest      destination pixels the residual is added to
 * @param line_size stride of @p dest, forwarded unchanged to
 *                  ff_add_pixels_clamped_mmi()
 * @param block     64 coefficients; overwritten by the inverse transform
 */
void ff_xvid_idct_add_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    /* The coefficients must be transformed before they are added to the pixels. */
    ff_xvid_idct_mmi(block);
    ff_add_pixels_clamped_mmi(block, dest, line_size);
}
mem_internal.h
ff_add_pixels_clamped_mmi
void ff_add_pixels_clamped_mmi(const int16_t *block, uint8_t *av_restrict pixels, ptrdiff_t line_size)
Definition: idctdsp_mmi.c:150
ff_xvid_idct_add_mmi
void ff_xvid_idct_add_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
Definition: xvid_idct_mmi.c:251
tab_i_04_mmi
static const int16_t tab_i_04_mmi[32 *4]
Definition: xvid_idct_mmi.c:59
rounder_0
static const int32_t rounder_0[2 *8]
Definition: xvid_idct_mmi.c:48
tg_1_16
static const int16_t tg_1_16[4 *4]
Definition: xvid_idct_mmi.c:41
xvididct_mips.h
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem_internal.h:87
ff_put_pixels_clamped_mmi
void ff_put_pixels_clamped_mmi(const int16_t *block, uint8_t *av_restrict pixels, ptrdiff_t line_size)
Definition: idctdsp_mmi.c:28
ff_xvid_idct_put_mmi
void ff_xvid_idct_put_mmi(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
Definition: xvid_idct_mmi.c:245
__asm__
__asm__(".macro parse_r var r\n\t" "\\var = -1\n\t" _IFC_REG(0) _IFC_REG(1) _IFC_REG(2) _IFC_REG(3) _IFC_REG(4) _IFC_REG(5) _IFC_REG(6) _IFC_REG(7) _IFC_REG(8) _IFC_REG(9) _IFC_REG(10) _IFC_REG(11) _IFC_REG(12) _IFC_REG(13) _IFC_REG(14) _IFC_REG(15) _IFC_REG(16) _IFC_REG(17) _IFC_REG(18) _IFC_REG(19) _IFC_REG(20) _IFC_REG(21) _IFC_REG(22) _IFC_REG(23) _IFC_REG(24) _IFC_REG(25) _IFC_REG(26) _IFC_REG(27) _IFC_REG(28) _IFC_REG(29) _IFC_REG(30) _IFC_REG(31) ".iflt \\var\n\t" ".error \"Unable to parse register name \\r\"\n\t" ".endif\n\t" ".endm")
idctdsp_mips.h
ff_xvid_idct_mmi
void ff_xvid_idct_mmi(int16_t *block)
Definition: xvid_idct_mmi.c:225
int32_t
int32_t
Definition: audioconvert.c:56
DCT_8_INV_ROW_MMI
#define DCT_8_INV_ROW_MMI(A1, A2, A3, A4)
Definition: xvid_idct_mmi.c:97
block
The exact code depends on how similar the blocks are and how related they are to the block
Definition: filter_design.txt:207
DCT_8_INV_COL
#define DCT_8_INV_COL(A1, A2)
Definition: xvid_idct_mmi.c:150