/*
 * Loongson SIMD optimized vc1dsp
 *
 * Copyright (c) 2019 Loongson Technology Corporation Limited
 * gxw <guxiwei-hf@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "vc1dsp_mips.h"
#include "constants.h"
#include "libavutil/mips/generic_macros_msa.h"

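/*
 * In-place 8x8 inverse transform for VC-1 (SMPTE 421M). Both passes use
 * the 8-point coefficient set {12, 16, 15, 9, 6, 4}: the first pass
 * rounds with (x + 4) >> 3, the second with (x + 64) >> 7, adding an
 * extra +1 to half of the second-pass outputs before the shift.
 */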
void ff_vc1_inv_trans_8x8_msa(int16_t block[64])
{
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v4i32 in_r0, in_r1, in_r2, in_r3, in_r4, in_r5, in_r6, in_r7;
    v4i32 in_l0, in_l1, in_l2, in_l3, in_l4, in_l5, in_l6, in_l7;
    v4i32 t_r1, t_r2, t_r3, t_r4, t_r5, t_r6, t_r7, t_r8;
    v4i32 t_l1, t_l2, t_l3, t_l4, t_l5, t_l6, t_l7, t_l8;
    v4i32 cnst_12 = {12, 12, 12, 12};
    v4i32 cnst_4 = {4, 4, 4, 4};
    v4i32 cnst_16 = {16, 16, 16, 16};
    v4i32 cnst_6 = {6, 6, 6, 6};
    v4i32 cnst_15 = {15, 15, 15, 15};
    v4i32 cnst_9 = {9, 9, 9, 9};
    v4i32 cnst_1 = {1, 1, 1, 1};
    v4i32 cnst_64 = {64, 64, 64, 64};

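    /* Load the eight rows of the block and sign-extend each one into a
     * right (low) and a left (high) v4i32 half for 32-bit arithmetic. */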
    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    UNPCK_SH_SW(in0, in_r0, in_l0);
    UNPCK_SH_SW(in1, in_r1, in_l1);
    UNPCK_SH_SW(in2, in_r2, in_l2);
    UNPCK_SH_SW(in3, in_r3, in_l3);
    UNPCK_SH_SW(in4, in_r4, in_l4);
    UNPCK_SH_SW(in5, in_r5, in_l5);
    UNPCK_SH_SW(in6, in_r6, in_l6);
    UNPCK_SH_SW(in7, in_r7, in_l7);
    // First pass, rounded with (x + 4) >> 3
    t_r1 = cnst_12 * (in_r0 + in_r4) + cnst_4;
    t_l1 = cnst_12 * (in_l0 + in_l4) + cnst_4;
    t_r2 = cnst_12 * (in_r0 - in_r4) + cnst_4;
    t_l2 = cnst_12 * (in_l0 - in_l4) + cnst_4;
    t_r3 = cnst_16 * in_r2 + cnst_6 * in_r6;
    t_l3 = cnst_16 * in_l2 + cnst_6 * in_l6;
    t_r4 = cnst_6 * in_r2 - cnst_16 * in_r6;
    t_l4 = cnst_6 * in_l2 - cnst_16 * in_l6;

    ADD4(t_r1, t_r3, t_l1, t_l3, t_r2, t_r4, t_l2, t_l4, t_r5, t_l5, t_r6, t_l6);
    SUB4(t_r2, t_r4, t_l2, t_l4, t_r1, t_r3, t_l1, t_l3, t_r7, t_l7, t_r8, t_l8);
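    /* t_*5..t_*8 are the even-part butterfly outputs; the odd part below
     * uses the coefficients {16, 15, 9, 4}. */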
    t_r1 = cnst_16 * in_r1 + cnst_15 * in_r3 + cnst_9 * in_r5 + cnst_4 * in_r7;
    t_l1 = cnst_16 * in_l1 + cnst_15 * in_l3 + cnst_9 * in_l5 + cnst_4 * in_l7;
    t_r2 = cnst_15 * in_r1 - cnst_4 * in_r3 - cnst_16 * in_r5 - cnst_9 * in_r7;
    t_l2 = cnst_15 * in_l1 - cnst_4 * in_l3 - cnst_16 * in_l5 - cnst_9 * in_l7;
    t_r3 = cnst_9 * in_r1 - cnst_16 * in_r3 + cnst_4 * in_r5 + cnst_15 * in_r7;
    t_l3 = cnst_9 * in_l1 - cnst_16 * in_l3 + cnst_4 * in_l5 + cnst_15 * in_l7;
    t_r4 = cnst_4 * in_r1 - cnst_9 * in_r3 + cnst_15 * in_r5 - cnst_16 * in_r7;
    t_l4 = cnst_4 * in_l1 - cnst_9 * in_l3 + cnst_15 * in_l5 - cnst_16 * in_l7;

    in_r0 = (t_r5 + t_r1) >> 3;
    in_l0 = (t_l5 + t_l1) >> 3;
    in_r1 = (t_r6 + t_r2) >> 3;
    in_l1 = (t_l6 + t_l2) >> 3;
    in_r2 = (t_r7 + t_r3) >> 3;
    in_l2 = (t_l7 + t_l3) >> 3;
    in_r3 = (t_r8 + t_r4) >> 3;
    in_l3 = (t_l8 + t_l4) >> 3;

    in_r4 = (t_r8 - t_r4) >> 3;
    in_l4 = (t_l8 - t_l4) >> 3;
    in_r5 = (t_r7 - t_r3) >> 3;
    in_l5 = (t_l7 - t_l3) >> 3;
    in_r6 = (t_r6 - t_r2) >> 3;
    in_l6 = (t_l6 - t_l2) >> 3;
    in_r7 = (t_r5 - t_r1) >> 3;
    in_l7 = (t_l5 - t_l1) >> 3;
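    /* Transpose the four 4x4 quadrants in place so the second pass can
     * again operate on vector lanes. */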
    TRANSPOSE4x4_SW_SW(in_r0, in_r1, in_r2, in_r3, in_r0, in_r1, in_r2, in_r3);
    TRANSPOSE4x4_SW_SW(in_l0, in_l1, in_l2, in_l3, in_l0, in_l1, in_l2, in_l3);
    TRANSPOSE4x4_SW_SW(in_r4, in_r5, in_r6, in_r7, in_r4, in_r5, in_r6, in_r7);
    TRANSPOSE4x4_SW_SW(in_l4, in_l5, in_l6, in_l7, in_l4, in_l5, in_l6, in_l7);
    // Second pass, rounded with (x + 64) >> 7 (+1 on half of the outputs)
    t_r1 = cnst_12 * (in_r0 + in_l0) + cnst_64;
    t_l1 = cnst_12 * (in_r4 + in_l4) + cnst_64;
    t_r2 = cnst_12 * (in_r0 - in_l0) + cnst_64;
    t_l2 = cnst_12 * (in_r4 - in_l4) + cnst_64;
    t_r3 = cnst_16 * in_r2 + cnst_6 * in_l2;
    t_l3 = cnst_16 * in_r6 + cnst_6 * in_l6;
    t_r4 = cnst_6 * in_r2 - cnst_16 * in_l2;
    t_l4 = cnst_6 * in_r6 - cnst_16 * in_l6;

    ADD4(t_r1, t_r3, t_l1, t_l3, t_r2, t_r4, t_l2, t_l4, t_r5, t_l5, t_r6, t_l6);
    SUB4(t_r2, t_r4, t_l2, t_l4, t_r1, t_r3, t_l1, t_l3, t_r7, t_l7, t_r8, t_l8);
    t_r1 = cnst_16 * in_r1 + cnst_15 * in_r3 + cnst_9 * in_l1 + cnst_4 * in_l3;
    t_l1 = cnst_16 * in_r5 + cnst_15 * in_r7 + cnst_9 * in_l5 + cnst_4 * in_l7;
    t_r2 = cnst_15 * in_r1 - cnst_4 * in_r3 - cnst_16 * in_l1 - cnst_9 * in_l3;
    t_l2 = cnst_15 * in_r5 - cnst_4 * in_r7 - cnst_16 * in_l5 - cnst_9 * in_l7;
    t_r3 = cnst_9 * in_r1 - cnst_16 * in_r3 + cnst_4 * in_l1 + cnst_15 * in_l3;
    t_l3 = cnst_9 * in_r5 - cnst_16 * in_r7 + cnst_4 * in_l5 + cnst_15 * in_l7;
    t_r4 = cnst_4 * in_r1 - cnst_9 * in_r3 + cnst_15 * in_l1 - cnst_16 * in_l3;
    t_l4 = cnst_4 * in_r5 - cnst_9 * in_r7 + cnst_15 * in_l5 - cnst_16 * in_l7;

    in_r0 = (t_r5 + t_r1) >> 7;
    in_l0 = (t_l5 + t_l1) >> 7;
    in_r1 = (t_r6 + t_r2) >> 7;
    in_l1 = (t_l6 + t_l2) >> 7;
    in_r2 = (t_r7 + t_r3) >> 7;
    in_l2 = (t_l7 + t_l3) >> 7;
    in_r3 = (t_r8 + t_r4) >> 7;
    in_l3 = (t_l8 + t_l4) >> 7;

    in_r4 = (t_r8 - t_r4 + cnst_1) >> 7;
    in_l4 = (t_l8 - t_l4 + cnst_1) >> 7;
    in_r5 = (t_r7 - t_r3 + cnst_1) >> 7;
    in_l5 = (t_l7 - t_l3 + cnst_1) >> 7;
    in_r6 = (t_r6 - t_r2 + cnst_1) >> 7;
    in_l6 = (t_l6 - t_l2 + cnst_1) >> 7;
    in_r7 = (t_r5 - t_r1 + cnst_1) >> 7;
    in_l7 = (t_l5 - t_l1 + cnst_1) >> 7;
    PCKEV_H4_SH(in_l0, in_r0, in_l1, in_r1, in_l2, in_r2, in_l3, in_r3,
                in0, in1, in2, in3);
    PCKEV_H4_SH(in_l4, in_r4, in_l5, in_r5, in_l6, in_r6, in_l7, in_r7,
                in4, in5, in6, in7);
    ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, block, 8);
}

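/*
 * 4x8 inverse transform plus reconstruction: the 4-point transform
 * (coefficients {17, 22, 10}) runs across the 4-wide rows, the 8-point
 * transform down the 8-tall columns, and the result is added to the
 * dest pixels with clipping to [0, 255].
 */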
void ff_vc1_inv_trans_4x8_msa(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v4i32 in_r0, in_r1, in_r2, in_r3, in_r4, in_r5, in_r6, in_r7;
    v4i32 t1, t2, t3, t4, t5, t6, t7, t8;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 zero_m = { 0 };
    v4i32 cnst_17 = {17, 17, 17, 17};
    v4i32 cnst_22 = {22, 22, 22, 22};
    v4i32 cnst_10 = {10, 10, 10, 10};
    v4i32 cnst_12 = {12, 12, 12, 12};
    v4i32 cnst_64 = {64, 64, 64, 64};
    v4i32 cnst_16 = {16, 16, 16, 16};
    v4i32 cnst_15 = {15, 15, 15, 15};
    v4i32 cnst_4 = {4, 4, 4, 4};
    v4i32 cnst_6 = {6, 6, 6, 6};
    v4i32 cnst_9 = {9, 9, 9, 9};
    v4i32 cnst_1 = {1, 1, 1, 1};

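    /* Only four coefficients per row are in use for the 4-wide block,
     * so sign-extend just the right (low) half of each loaded row. */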
    LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    UNPCK_R_SH_SW(in0, in_r0);
    UNPCK_R_SH_SW(in1, in_r1);
    UNPCK_R_SH_SW(in2, in_r2);
    UNPCK_R_SH_SW(in3, in_r3);
    UNPCK_R_SH_SW(in4, in_r4);
    UNPCK_R_SH_SW(in5, in_r5);
    UNPCK_R_SH_SW(in6, in_r6);
    UNPCK_R_SH_SW(in7, in_r7);
    // First pass: 4-point transform across the 4-wide rows
    TRANSPOSE4x4_SW_SW(in_r0, in_r1, in_r2, in_r3, in_r0, in_r1, in_r2, in_r3);
    TRANSPOSE4x4_SW_SW(in_r4, in_r5, in_r6, in_r7, in_r4, in_r5, in_r6, in_r7);
    t1 = cnst_17 * (in_r0 + in_r2) + cnst_4;
    t5 = cnst_17 * (in_r4 + in_r6) + cnst_4;
    t2 = cnst_17 * (in_r0 - in_r2) + cnst_4;
    t6 = cnst_17 * (in_r4 - in_r6) + cnst_4;
    t3 = cnst_22 * in_r1 + cnst_10 * in_r3;
    t7 = cnst_22 * in_r5 + cnst_10 * in_r7;
    t4 = cnst_22 * in_r3 - cnst_10 * in_r1;
    t8 = cnst_22 * in_r7 - cnst_10 * in_r5;

    in_r0 = (t1 + t3) >> 3;
    in_r4 = (t5 + t7) >> 3;
    in_r1 = (t2 - t4) >> 3;
    in_r5 = (t6 - t8) >> 3;
    in_r2 = (t2 + t4) >> 3;
    in_r6 = (t6 + t8) >> 3;
    in_r3 = (t1 - t3) >> 3;
    in_r7 = (t5 - t7) >> 3;
    TRANSPOSE4x4_SW_SW(in_r0, in_r1, in_r2, in_r3, in_r0, in_r1, in_r2, in_r3);
    TRANSPOSE4x4_SW_SW(in_r4, in_r5, in_r6, in_r7, in_r4, in_r5, in_r6, in_r7);
    PCKEV_H4_SH(in_r1, in_r0, in_r3, in_r2, in_r5, in_r4, in_r7, in_r6,
                in0, in1, in2, in3);
    ST_D8(in0, in1, in2, in3, 0, 1, 0, 1, 0, 1, 0, 1, block, 8);
    // Second pass: 8-point transform down the 8-tall columns
    t1 = cnst_12 * (in_r0 + in_r4) + cnst_64;
    t2 = cnst_12 * (in_r0 - in_r4) + cnst_64;
    t3 = cnst_16 * in_r2 + cnst_6 * in_r6;
    t4 = cnst_6 * in_r2 - cnst_16 * in_r6;
    t5 = t1 + t3, t6 = t2 + t4;
    t7 = t2 - t4, t8 = t1 - t3;
    t1 = cnst_16 * in_r1 + cnst_15 * in_r3 + cnst_9 * in_r5 + cnst_4 * in_r7;
    t2 = cnst_15 * in_r1 - cnst_4 * in_r3 - cnst_16 * in_r5 - cnst_9 * in_r7;
    t3 = cnst_9 * in_r1 - cnst_16 * in_r3 + cnst_4 * in_r5 + cnst_15 * in_r7;
    t4 = cnst_4 * in_r1 - cnst_9 * in_r3 + cnst_15 * in_r5 - cnst_16 * in_r7;
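    /* Load the eight dest rows and zero-extend the pixels to 32 bits so
     * the reconstruction can be added and clipped per lane. */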
    LD_SW8(dest, linesize, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
    ILVR_B8_SW(zero_m, dst0, zero_m, dst1, zero_m, dst2, zero_m, dst3,
               zero_m, dst4, zero_m, dst5, zero_m, dst6, zero_m, dst7,
               dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
    ILVR_H4_SW(zero_m, dst0, zero_m, dst1, zero_m, dst2, zero_m, dst3,
               dst0, dst1, dst2, dst3);
    ILVR_H4_SW(zero_m, dst4, zero_m, dst5, zero_m, dst6, zero_m, dst7,
               dst4, dst5, dst6, dst7);
    in_r0 = (t5 + t1) >> 7;
    in_r1 = (t6 + t2) >> 7;
    in_r2 = (t7 + t3) >> 7;
    in_r3 = (t8 + t4) >> 7;
    in_r4 = (t8 - t4 + cnst_1) >> 7;
    in_r5 = (t7 - t3 + cnst_1) >> 7;
    in_r6 = (t6 - t2 + cnst_1) >> 7;
    in_r7 = (t5 - t1 + cnst_1) >> 7;
    ADD4(in_r0, dst0, in_r1, dst1, in_r2, dst2, in_r3, dst3,
         in_r0, in_r1, in_r2, in_r3);
    ADD4(in_r4, dst4, in_r5, dst5, in_r6, dst6, in_r7, dst7,
         in_r4, in_r5, in_r6, in_r7);
    CLIP_SW8_0_255(in_r0, in_r1, in_r2, in_r3, in_r4, in_r5, in_r6, in_r7);
    PCKEV_H4_SH(in_r1, in_r0, in_r3, in_r2, in_r5, in_r4, in_r7, in_r6,
                in0, in1, in2, in3);
    PCKEV_B2_SH(in1, in0, in3, in2, in0, in1);
    ST_W8(in0, in1, 0, 1, 2, 3, 0, 1, 2, 3, dest, linesize);
}

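/*
 * 8x4 inverse transform plus reconstruction: the 8-point transform runs
 * across the 8-wide rows, the 4-point transform down the 4-tall columns,
 * and the result is added to the dest pixels with clipping to [0, 255].
 */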
void ff_vc1_inv_trans_8x4_msa(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
{
    v4i32 in0, in1, in2, in3, in4, in5, in6, in7;
    v4i32 t1, t2, t3, t4, t5, t6, t7, t8;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v16i8 zero_m = { 0 };
    v4i32 cnst_17 = {17, 17, 17, 17};
    v4i32 cnst_22 = {22, 22, 22, 22};
    v4i32 cnst_10 = {10, 10, 10, 10};
    v4i32 cnst_12 = {12, 12, 12, 12};
    v4i32 cnst_64 = {64, 64, 64, 64};
    v4i32 cnst_16 = {16, 16, 16, 16};
    v4i32 cnst_15 = {15, 15, 15, 15};
    v4i32 cnst_4 = {4, 4, 4, 4};
    v4i32 cnst_6 = {6, 6, 6, 6};
    v4i32 cnst_9 = {9, 9, 9, 9};

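    /* Split each 8-wide row into right (in0..in3) and left (in4..in7)
     * halves and transpose them for the horizontal first pass. */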
    LD_SW4(block, 8, t1, t2, t3, t4);
    UNPCK_SH_SW(t1, in0, in4);
    UNPCK_SH_SW(t2, in1, in5);
    UNPCK_SH_SW(t3, in2, in6);
    UNPCK_SH_SW(t4, in3, in7);
    TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, in0, in1, in2, in3);
    TRANSPOSE4x4_SW_SW(in4, in5, in6, in7, in4, in5, in6, in7);
    // First pass: 8-point transform across the 8-wide rows
    t1 = cnst_12 * (in0 + in4) + cnst_4;
    t2 = cnst_12 * (in0 - in4) + cnst_4;
    t3 = cnst_16 * in2 + cnst_6 * in6;
    t4 = cnst_6 * in2 - cnst_16 * in6;
    t5 = t1 + t3, t6 = t2 + t4;
    t7 = t2 - t4, t8 = t1 - t3;
    t1 = cnst_16 * in1 + cnst_15 * in3 + cnst_9 * in5 + cnst_4 * in7;
    t2 = cnst_15 * in1 - cnst_4 * in3 - cnst_16 * in5 - cnst_9 * in7;
    t3 = cnst_9 * in1 - cnst_16 * in3 + cnst_4 * in5 + cnst_15 * in7;
    t4 = cnst_4 * in1 - cnst_9 * in3 + cnst_15 * in5 - cnst_16 * in7;
    in0 = (t5 + t1) >> 3;
    in1 = (t6 + t2) >> 3;
    in2 = (t7 + t3) >> 3;
    in3 = (t8 + t4) >> 3;
    in4 = (t8 - t4) >> 3;
    in5 = (t7 - t3) >> 3;
    in6 = (t6 - t2) >> 3;
    in7 = (t5 - t1) >> 3;
    TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, in0, in1, in2, in3);
    TRANSPOSE4x4_SW_SW(in4, in5, in6, in7, in4, in5, in6, in7);
    PCKEV_H4_SW(in4, in0, in5, in1, in6, in2, in7, in3, t1, t2, t3, t4);
    ST_SW4(t1, t2, t3, t4, block, 8);
    // Second pass: 4-point transform down the 4-tall columns
    LD_SW4(dest, linesize, dst0, dst1, dst2, dst3);
    ILVR_B4_SW(zero_m, dst0, zero_m, dst1, zero_m, dst2, zero_m, dst3,
               dst0, dst1, dst2, dst3);
    ILVL_H4_SW(zero_m, dst0, zero_m, dst1, zero_m, dst2, zero_m, dst3,
               dst4, dst5, dst6, dst7);
    ILVR_H4_SW(zero_m, dst0, zero_m, dst1, zero_m, dst2, zero_m, dst3,
               dst0, dst1, dst2, dst3);
    // Right part
    t1 = cnst_17 * (in0 + in2) + cnst_64;
    t2 = cnst_17 * (in0 - in2) + cnst_64;
    t3 = cnst_22 * in1 + cnst_10 * in3;
    t4 = cnst_22 * in3 - cnst_10 * in1;
    in0 = (t1 + t3) >> 7;
    in1 = (t2 - t4) >> 7;
    in2 = (t2 + t4) >> 7;
    in3 = (t1 - t3) >> 7;
    ADD4(in0, dst0, in1, dst1, in2, dst2, in3, dst3, in0, in1, in2, in3);
    CLIP_SW4_0_255(in0, in1, in2, in3);
    // Left part
    t5 = cnst_17 * (in4 + in6) + cnst_64;
    t6 = cnst_17 * (in4 - in6) + cnst_64;
    t7 = cnst_22 * in5 + cnst_10 * in7;
    t8 = cnst_22 * in7 - cnst_10 * in5;
    in4 = (t5 + t7) >> 7;
    in5 = (t6 - t8) >> 7;
    in6 = (t6 + t8) >> 7;
    in7 = (t5 - t7) >> 7;
    ADD4(in4, dst4, in5, dst5, in6, dst6, in7, dst7, in4, in5, in6, in7);
    CLIP_SW4_0_255(in4, in5, in6, in7);
    PCKEV_H4_SW(in4, in0, in5, in1, in6, in2, in7, in3, in0, in1, in2, in3);
    PCKEV_B2_SW(in1, in0, in3, in2, in0, in1);
    ST_D4(in0, in1, 0, 1, 0, 1, dest, linesize);
}

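/*
 * 8x8 quarter-pel interpolation for the cases where both hmode and vmode
 * are nonzero: a 4-tap bicubic filter is applied vertically, the
 * intermediate is transposed, and the corresponding horizontal filter is
 * applied before transposing back. para_value holds the tap sets for the
 * three sub-pel positions; shift_value determines the first-pass
 * normalization shift, while the second pass always shifts by 7.
 */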
static void put_vc1_mspel_mc_h_v_msa(uint8_t *dst, const uint8_t *src,
                                     ptrdiff_t stride, int hmode, int vmode,
                                     int rnd)
{
    v8i16 in_r0, in_r1, in_r2, in_r3, in_l0, in_l1, in_l2, in_l3;
    v8i16 t0, t1, t2, t3, t4, t5, t6, t7;
    v8i16 t8, t9, t10, t11, t12, t13, t14, t15;
    v8i16 cnst_para0, cnst_para1, cnst_para2, cnst_para3, cnst_r;
    static const int para_value[][4] = {{4, 53, 18, 3},
                                        {1, 9, 9, 1},
                                        {3, 18, 53, 4}};
    static const int shift_value[] = {0, 5, 1, 5};
    int shift = (shift_value[hmode] + shift_value[vmode]) >> 1;
    int r = (1 << (shift - 1)) + rnd - 1;
    cnst_r = __msa_fill_h(r);
    src -= 1, src -= stride;
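    /* Broadcast the vertical taps; each output row below is computed as
     * p1*x1 + p2*x2 - p0*x0 - p3*x3 over a sliding four-row window. */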
    cnst_para0 = __msa_fill_h(para_value[vmode - 1][0]);
    cnst_para1 = __msa_fill_h(para_value[vmode - 1][1]);
    cnst_para2 = __msa_fill_h(para_value[vmode - 1][2]);
    cnst_para3 = __msa_fill_h(para_value[vmode - 1][3]);
    LD_SH4(src, stride, in_l0, in_l1, in_l2, in_l3);
    UNPCK_UB_SH(in_l0, in_r0, in_l0);
    UNPCK_UB_SH(in_l1, in_r1, in_l1);
    UNPCK_UB_SH(in_l2, in_r2, in_l2);
    UNPCK_UB_SH(in_l3, in_r3, in_l3);
    // row 0
    t0 = cnst_para1 * in_r1 + cnst_para2 * in_r2
         - cnst_para0 * in_r0 - cnst_para3 * in_r3;
    t8 = cnst_para1 * in_l1 + cnst_para2 * in_l2
         - cnst_para0 * in_l0 - cnst_para3 * in_l3;
    in_l0 = LD_SH(src + 4 * stride);
    UNPCK_UB_SH(in_l0, in_r0, in_l0);
    // row 1
    t1 = cnst_para1 * in_r2 + cnst_para2 * in_r3
         - cnst_para0 * in_r1 - cnst_para3 * in_r0;
    t9 = cnst_para1 * in_l2 + cnst_para2 * in_l3
         - cnst_para0 * in_l1 - cnst_para3 * in_l0;
    in_l1 = LD_SH(src + 5 * stride);
    UNPCK_UB_SH(in_l1, in_r1, in_l1);
    // row 2
    t2 = cnst_para1 * in_r3 + cnst_para2 * in_r0
         - cnst_para0 * in_r2 - cnst_para3 * in_r1;
    t10 = cnst_para1 * in_l3 + cnst_para2 * in_l0
          - cnst_para0 * in_l2 - cnst_para3 * in_l1;
    in_l2 = LD_SH(src + 6 * stride);
    UNPCK_UB_SH(in_l2, in_r2, in_l2);
    // row 3
    t3 = cnst_para1 * in_r0 + cnst_para2 * in_r1
         - cnst_para0 * in_r3 - cnst_para3 * in_r2;
    t11 = cnst_para1 * in_l0 + cnst_para2 * in_l1
          - cnst_para0 * in_l3 - cnst_para3 * in_l2;
    in_l3 = LD_SH(src + 7 * stride);
    UNPCK_UB_SH(in_l3, in_r3, in_l3);
    // row 4
    t4 = cnst_para1 * in_r1 + cnst_para2 * in_r2
         - cnst_para0 * in_r0 - cnst_para3 * in_r3;
    t12 = cnst_para1 * in_l1 + cnst_para2 * in_l2
          - cnst_para0 * in_l0 - cnst_para3 * in_l3;
    in_l0 = LD_SH(src + 8 * stride);
    UNPCK_UB_SH(in_l0, in_r0, in_l0);
    // row 5
    t5 = cnst_para1 * in_r2 + cnst_para2 * in_r3
         - cnst_para0 * in_r1 - cnst_para3 * in_r0;
    t13 = cnst_para1 * in_l2 + cnst_para2 * in_l3
          - cnst_para0 * in_l1 - cnst_para3 * in_l0;
    in_l1 = LD_SH(src + 9 * stride);
    UNPCK_UB_SH(in_l1, in_r1, in_l1);
    // row 6
    t6 = cnst_para1 * in_r3 + cnst_para2 * in_r0
         - cnst_para0 * in_r2 - cnst_para3 * in_r1;
    t14 = cnst_para1 * in_l3 + cnst_para2 * in_l0
          - cnst_para0 * in_l2 - cnst_para3 * in_l1;
    in_l2 = LD_SH(src + 10 * stride);
    UNPCK_UB_SH(in_l2, in_r2, in_l2);
    // row 7
    t7 = cnst_para1 * in_r0 + cnst_para2 * in_r1
         - cnst_para0 * in_r3 - cnst_para3 * in_r2;
    t15 = cnst_para1 * in_l0 + cnst_para2 * in_l1
          - cnst_para0 * in_l3 - cnst_para3 * in_l2;

    ADD4(t0, cnst_r, t1, cnst_r, t2, cnst_r, t3, cnst_r, t0, t1, t2, t3);
    ADD4(t4, cnst_r, t5, cnst_r, t6, cnst_r, t7, cnst_r, t4, t5, t6, t7);
    ADD4(t8, cnst_r, t9, cnst_r, t10, cnst_r, t11, cnst_r,
         t8, t9, t10, t11);
    ADD4(t12, cnst_r, t13, cnst_r, t14, cnst_r, t15, cnst_r,
         t12, t13, t14, t15);
    t0 >>= shift, t1 >>= shift, t2 >>= shift, t3 >>= shift;
    t4 >>= shift, t5 >>= shift, t6 >>= shift, t7 >>= shift;
    t8 >>= shift, t9 >>= shift, t10 >>= shift, t11 >>= shift;
    t12 >>= shift, t13 >>= shift, t14 >>= shift, t15 >>= shift;
    TRANSPOSE8x8_SH_SH(t0, t1, t2, t3, t4, t5, t6, t7,
                       t0, t1, t2, t3, t4, t5, t6, t7);
    TRANSPOSE8x8_SH_SH(t8, t9, t10, t11, t12, t13, t14, t15,
                       t8, t9, t10, t11, t12, t13, t14, t15);
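    /* Switch to the horizontal taps; this pass always normalizes with
     * (x + 64 - rnd) >> 7. */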
    cnst_para0 = __msa_fill_h(para_value[hmode - 1][0]);
    cnst_para1 = __msa_fill_h(para_value[hmode - 1][1]);
    cnst_para2 = __msa_fill_h(para_value[hmode - 1][2]);
    cnst_para3 = __msa_fill_h(para_value[hmode - 1][3]);
    r = 64 - rnd;
    cnst_r = __msa_fill_h(r);
    // col 0 ~ 7
    t0 = cnst_para1 * t1 + cnst_para2 * t2 - cnst_para0 * t0 - cnst_para3 * t3;
    t1 = cnst_para1 * t2 + cnst_para2 * t3 - cnst_para0 * t1 - cnst_para3 * t4;
    t2 = cnst_para1 * t3 + cnst_para2 * t4 - cnst_para0 * t2 - cnst_para3 * t5;
    t3 = cnst_para1 * t4 + cnst_para2 * t5 - cnst_para0 * t3 - cnst_para3 * t6;
    t4 = cnst_para1 * t5 + cnst_para2 * t6 - cnst_para0 * t4 - cnst_para3 * t7;
    t5 = cnst_para1 * t6 + cnst_para2 * t7 - cnst_para0 * t5 - cnst_para3 * t8;
    t6 = cnst_para1 * t7 + cnst_para2 * t8 - cnst_para0 * t6 - cnst_para3 * t9;
    t7 = cnst_para1 * t8 + cnst_para2 * t9 - cnst_para0 * t7 - cnst_para3 * t10;
    ADD4(t0, cnst_r, t1, cnst_r, t2, cnst_r, t3, cnst_r, t0, t1, t2, t3);
    ADD4(t4, cnst_r, t5, cnst_r, t6, cnst_r, t7, cnst_r, t4, t5, t6, t7);
    t0 >>= 7, t1 >>= 7, t2 >>= 7, t3 >>= 7;
    t4 >>= 7, t5 >>= 7, t6 >>= 7, t7 >>= 7;
    TRANSPOSE8x8_SH_SH(t0, t1, t2, t3, t4, t5, t6, t7,
                       t0, t1, t2, t3, t4, t5, t6, t7);
    CLIP_SH8_0_255(t0, t1, t2, t3, t4, t5, t6, t7);
    PCKEV_B4_SH(t1, t0, t3, t2, t5, t4, t7, t6, t0, t1, t2, t3);
    ST_D8(t0, t1, t2, t3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}

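/*
 * PUT_VC1_MSPEL_MC_MSA instantiates two exported functions per filter
 * pair: the plain variant covers one 8x8 block, and the _16 variant
 * tiles four 8x8 calls over a 16x16 block.
 */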
#define PUT_VC1_MSPEL_MC_MSA(hmode, vmode)                                    \
void ff_put_vc1_mspel_mc ## hmode ## vmode ## _msa(uint8_t *dst,              \
                                                   const uint8_t *src,        \
                                                   ptrdiff_t stride, int rnd) \
{                                                                             \
    put_vc1_mspel_mc_h_v_msa(dst, src, stride, hmode, vmode, rnd);            \
}                                                                             \
void ff_put_vc1_mspel_mc ## hmode ## vmode ## _16_msa(uint8_t *dst,           \
                                                      const uint8_t *src,     \
                                                      ptrdiff_t stride,       \
                                                      int rnd)                \
{                                                                             \
    put_vc1_mspel_mc_h_v_msa(dst, src, stride, hmode, vmode, rnd);            \
    put_vc1_mspel_mc_h_v_msa(dst + 8, src + 8, stride, hmode, vmode, rnd);    \
    dst += 8 * stride, src += 8 * stride;                                     \
    put_vc1_mspel_mc_h_v_msa(dst, src, stride, hmode, vmode, rnd);            \
    put_vc1_mspel_mc_h_v_msa(dst + 8, src + 8, stride, hmode, vmode, rnd);    \
}

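/* Instantiate all nine (hmode, vmode) sub-pel filter combinations. */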
PUT_VC1_MSPEL_MC_MSA(1, 1);
PUT_VC1_MSPEL_MC_MSA(1, 2);
PUT_VC1_MSPEL_MC_MSA(1, 3);

PUT_VC1_MSPEL_MC_MSA(2, 1);
PUT_VC1_MSPEL_MC_MSA(2, 2);
PUT_VC1_MSPEL_MC_MSA(2, 3);

PUT_VC1_MSPEL_MC_MSA(3, 1);
PUT_VC1_MSPEL_MC_MSA(3, 2);
PUT_VC1_MSPEL_MC_MSA(3, 3);