FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
simple_idct_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
22 #include "idctdsp_mips.h"
23 
/*
 * In-place 8x8 inverse DCT on 16-bit coefficients using MIPS MSA vectors.
 *
 * block is read as eight v8i16 vectors (LD_SH8, stride 8) and overwritten
 * with the result (ST_SW8, stride 8).
 *
 * The transform runs as two passes with identical arithmetic, each preceded
 * by an 8x8 transpose. Pass 1 adds a (1 << 10) bias and shifts right by 11;
 * pass 2 adds const_val1 and shifts right by 20.
 */
24 static void simple_idct_msa(int16_t *block)
25 {
26  int32_t const_val;
    /* Lane k (k = 1..7) is the multiplier splatted below as wk; lane 0 is
     * never read. */
27  v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
28  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
29  v8i16 w1, w3, w5, w7;
30  v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
31  v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
32  v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
33  v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
34  v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
35  v4i32 w2, w4, w6;
36  v8i16 select_vec, temp;
37  v8i16 zero = { 0 };
38  v4i32 const_val0 = __msa_ldi_w(1);
39  v4i32 const_val1 = __msa_ldi_w(1);
40 
41  LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    /* Every word lane becomes 1 << 10: the bias added before the >> 11 of
     * pass 1. */
42  const_val0 <<= 10;
    /* (1 << 19) / 16383 is 32, so const_val is 16383 * 32 = 524256. It is
     * inserted into lane 0 and splatted to all lanes: the bias added before
     * the >> 20 of pass 2. */
43  const_val = 16383 * ((1 << 19) / 16383);
44  const_val1 = __msa_insert_w(const_val0, 0, const_val);
45  const_val1 = __msa_splati_w(const_val1, 0);
46  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
47  in0, in1, in2, in3, in4, in5, in6, in7);
    /* A lane of select_vec is all-ones where in1..in7 are all zero in that
     * lane (unsigned compare "< 1" on their OR). */
48  select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
49  select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
50  UNPCK_SH_SW(in0, a0_r, a0_l);
51  UNPCK_SH_SW(in2, temp3_r, temp3_l);
    /* Value substituted for the computed pass-1 result in lanes flagged by
     * select_vec (see the __msa_bmnz_v calls below). */
52  temp = in0 << 3;
    /* Splat weights 2, 4 and 6 and interleave them with zero to obtain
     * 32-bit multiplier lanes. */
53  w2 = (v4i32) __msa_splati_h(weights, 2);
54  w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
55  w4 = (v4i32) __msa_splati_h(weights, 4);
56  w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
57  w6 = (v4i32) __msa_splati_h(weights, 6);
58  w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    /* Even part of pass 1: a0..a3 are built from in0, in2, in4 and in6. */
59  MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
60  ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
61  MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
62  temp1_r, temp1_l, temp2_r, temp2_l);
63  BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
64  temp2_l, temp2_r, temp1_l, temp1_r,
65  a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
66  UNPCK_SH_SW(in4, temp0_r, temp0_l);
67  UNPCK_SH_SW(in6, temp3_r, temp3_l);
68  MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
69  MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
70  temp2_r, temp2_l, temp1_r, temp1_l);
71  ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
72  SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
73  a1_r, a1_l, a2_r, a2_l);
74  ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
75  a3_r, a3_l, a0_r, a0_l);
76  SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
77  ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
78  SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    /* Odd part of pass 1: b0..b3 are dot products of the interleaved
     * (in1, in3) pairs with const0..const3, accumulated with the
     * (in5, in7) pairs times const4..const7. */
79  ILVRL_H2_SW(in1, in3, b3_r, b3_l);
80  SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
81  ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
82  ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
83  const0, const1, const2, const3);
84  ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
85  const5 = __msa_ilvod_h(-w1, -w5);
86  const7 = __msa_ilvod_h(w3, -w1);
87  DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
88  b0_r, b1_r, b2_r, b3_r);
89  DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
90  const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
91  DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
92  b0_l, b1_l, b2_l, b3_l);
93  DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
94  const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    /* Combine even (a*) and odd (b*) parts; results land in temp0..temp3 and
     * a3..a0. NOTE(review): presumably sums and differences respectively,
     * confirm against the BUTTERFLY_16 definition. */
95  BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
96  b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
97  temp0_r, temp0_l, temp1_r, temp1_l,
98  temp2_r, temp2_l, temp3_r, temp3_l,
99  a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    /* Scale pass 1 down by 11 bits and pack the 32-bit lanes to 16 bits. */
100  SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
101  SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
102  PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
103  temp2_l, temp2_r, temp3_l, temp3_r,
104  temp0_r, temp1_r, temp2_r, temp3_r);
    /* Where select_vec is set, take temp (in0 << 3) instead of the computed
     * value. */
105  in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
106  (v16u8) select_vec);
107  in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
108  (v16u8) select_vec);
109  in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
110  (v16u8) select_vec);
111  in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
112  (v16u8) select_vec);
113  SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
114  SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
115  PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
116  a0_r, a1_r, a2_r, a3_r);
117  in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
118  in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
119  in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
120  in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
121  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
122  in0, in1, in2, in3, in4, in5, in6, in7);
123 
    /* Pass 2: same even/odd arithmetic on the transposed pass-1 output, with
     * const_val1 as the bias, a 20-bit shift and no select_vec substitution. */
124  UNPCK_SH_SW(in0, a0_r, a0_l);
125  UNPCK_SH_SW(in2, temp3_r, temp3_l);
126  w2 = (v4i32) __msa_splati_h(weights, 2);
127  w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
128  w4 = (v4i32) __msa_splati_h(weights, 4);
129  w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
130  w6 = (v4i32) __msa_splati_h(weights, 6);
131  w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
132  MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
133  ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
134  MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
135  temp1_r, temp1_l, temp2_r, temp2_l);
136  BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
137  temp2_l, temp2_r, temp1_l, temp1_r,
138  a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
139  UNPCK_SH_SW(in4, temp0_r, temp0_l);
140  UNPCK_SH_SW(in6, temp3_r, temp3_l);
141  MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
142  MUL4(w2, temp3_r, w2, temp3_l, w6, temp3_r, w6, temp3_l,
143  temp2_r, temp2_l, temp1_r, temp1_l);
144  ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
145  SUB4(a1_r, temp0_r, a1_l, temp0_l, a2_r, temp0_r, a2_l, temp0_l,
146  a1_r, a1_l, a2_r, a2_l);
147  ADD4(a3_r, temp0_r, a3_l, temp0_l, a0_r, temp1_r, a0_l, temp1_l,
148  a3_r, a3_l, a0_r, a0_l);
149  SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
150  ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
151  SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
152  ILVRL_H2_SW(in1, in3, b3_r, b3_l);
153  SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
154  ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
155  const0, const1, const2, const3);
156  DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
157  b0_r, b1_r, b2_r, b3_r);
158  DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
159  b0_l, b1_l, b2_l, b3_l);
160  ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
161  ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
162  const5 = __msa_ilvod_h(-w1, -w5);
163  const7 = __msa_ilvod_h(w3, -w1);
164  DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
165  const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
166  DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
167  const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
168  BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
169  b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
170  temp0_r, temp0_l, temp1_r, temp1_l,
171  temp2_r, temp2_l, temp3_r, temp3_l,
172  a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    /* Scale pass 2 down by 20 bits and pack to 16-bit lanes. */
173  SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
174  SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
175  PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
176  temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
177  SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
178  SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
179  PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
180  a0_r, a1_r, a2_r, a3_r);
    /* Write the eight result vectors back over the input, in the order
     * temp0..temp3, a3..a0. */
181  ST_SW8(temp0_r, temp1_r, temp2_r, temp3_r, a3_r, a2_r, a1_r, a0_r,
182  block, 8);
183 }
184 
/*
 * 8x8 inverse DCT whose result is clipped with CLIP_SH_0_255, packed to
 * bytes and stored to dst, 64 bits per row, dst_stride bytes apart.
 *
 * block is read as eight v8i16 vectors (LD_SH8, stride 8); this function
 * does not write to block.
 *
 * The arithmetic is two passes, each preceded by an 8x8 transpose: pass 1
 * adds a (1 << 10) bias and shifts right by 11, pass 2 adds const_val1 and
 * shifts right by 20.
 */
185 static void simple_idct_put_msa(uint8_t *dst, int32_t dst_stride,
186  int16_t *block)
187 {
188  int32_t const_val;
189  uint64_t tmp0, tmp1, tmp2, tmp3;
    /* Lane k (k = 1..7) is the multiplier splatted below as wk; lane 0 is
     * never read. */
190  v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
191  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
192  v8i16 w1, w3, w5, w7;
193  v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
194  v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
195  v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
196  v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
197  v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
198  v4i32 w2, w4, w6;
199  v8i16 select_vec, temp;
200  v8i16 zero = { 0 };
201  v4i32 const_val0 = __msa_ldi_w(1);
202  v4i32 const_val1 = __msa_ldi_w(1);
203 
204  LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    /* Every word lane becomes 1 << 10: the bias added before the >> 11 of
     * pass 1. */
205  const_val0 <<= 10;
    /* (1 << 19) / 16383 is 32, so const_val is 16383 * 32 = 524256. It is
     * inserted into lane 0 and splatted to all lanes: the bias added before
     * the >> 20 of pass 2. */
206  const_val = 16383 * ((1 << 19) / 16383);
207  const_val1 = __msa_insert_w(const_val0, 0, const_val);
208  const_val1 = __msa_splati_w(const_val1, 0);
209  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
210  in0, in1, in2, in3, in4, in5, in6, in7);
    /* A lane of select_vec is all-ones where in1..in7 are all zero in that
     * lane (unsigned compare "< 1" on their OR). */
211  select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
212  select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
213  UNPCK_SH_SW(in0, a0_r, a0_l);
214  UNPCK_SH_SW(in2, temp3_r, temp3_l);
    /* Value substituted for the computed pass-1 result in lanes flagged by
     * select_vec (see the __msa_bmnz_v calls below). */
215  temp = in0 << 3;
    /* Splat weights 2, 4 and 6 and interleave them with zero to obtain
     * 32-bit multiplier lanes. */
216  w2 = (v4i32) __msa_splati_h(weights, 2);
217  w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
218  w4 = (v4i32) __msa_splati_h(weights, 4);
219  w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
220  w6 = (v4i32) __msa_splati_h(weights, 6);
221  w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    /* Even part of pass 1: a0..a3 are built from in0, in2, in4 and in6. */
222  MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
223  ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
224  MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
225  MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
226  BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
227  temp2_l, temp2_r, temp1_l, temp1_r,
228  a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
229  UNPCK_SH_SW(in4, temp0_r, temp0_l);
230  UNPCK_SH_SW(in6, temp3_r, temp3_l);
231  MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
232  MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
233  MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
234  ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
235  SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
236  SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
237  ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
238  ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
239  SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
240  ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
241  SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
    /* Odd part of pass 1: b0..b3 are dot products of the interleaved
     * (in1, in3) pairs with const0..const3, accumulated with the
     * (in5, in7) pairs times const4..const7. */
242  ILVRL_H2_SW(in1, in3, b3_r, b3_l);
243  SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
244  ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
245  ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
246  const0, const1, const2, const3);
247  ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
248  const5 = __msa_ilvod_h(-w1, -w5);
249  const7 = __msa_ilvod_h(w3, -w1);
250  DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
251  b0_r, b1_r, b2_r, b3_r);
252  DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
253  const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
254  DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
255  b0_l, b1_l, b2_l, b3_l);
256  DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
257  const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    /* Combine even (a*) and odd (b*) parts; results land in temp0..temp3 and
     * a3..a0. NOTE(review): presumably sums and differences respectively,
     * confirm against the BUTTERFLY_16 definition. */
258  BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
259  b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
260  temp0_r, temp0_l, temp1_r, temp1_l,
261  temp2_r, temp2_l, temp3_r, temp3_l,
262  a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    /* Scale pass 1 down by 11 bits and pack the 32-bit lanes to 16 bits. */
263  SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
264  SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
265  PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
266  temp2_l, temp2_r, temp3_l, temp3_r,
267  temp0_r, temp1_r, temp2_r, temp3_r);
    /* Where select_vec is set, take temp (in0 << 3) instead of the computed
     * value. */
268  in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
269  (v16u8) select_vec);
270  in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
271  (v16u8) select_vec);
272  in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
273  (v16u8) select_vec);
274  in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
275  (v16u8) select_vec);
276  SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
277  SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
278  PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
279  a0_r, a1_r, a2_r, a3_r);
280  in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
281  in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
282  in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
283  in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
284  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
285  in0, in1, in2, in3, in4, in5, in6, in7);
    /* Pass 2: same even/odd arithmetic on the transposed pass-1 output, with
     * const_val1 as the bias, a 20-bit shift and no select_vec substitution. */
286  UNPCK_SH_SW(in0, a0_r, a0_l);
287  UNPCK_SH_SW(in2, temp3_r, temp3_l);
288  w2 = (v4i32) __msa_splati_h(weights, 2);
289  w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
290  w4 = (v4i32) __msa_splati_h(weights, 4);
291  w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
292  w6 = (v4i32) __msa_splati_h(weights, 6);
293  w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
294  MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
295  ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
296  MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
297  MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
298  BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
299  temp2_l, temp2_r, temp1_l, temp1_r,
300  a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
301  UNPCK_SH_SW(in4, temp0_r, temp0_l);
302  UNPCK_SH_SW(in6, temp3_r, temp3_l);
303  MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
304  MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
305  MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
306  ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
307  SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
308  SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
309  ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
310  ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
311  SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
312  ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
313  SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
314  ILVRL_H2_SW(in1, in3, b3_r, b3_l);
315  SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
316  ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
317  const0, const1, const2, const3);
318  DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
319  b0_r, b1_r, b2_r, b3_r);
320  DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
321  b0_l, b1_l, b2_l, b3_l);
322  ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
323  ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
324  const5 = __msa_ilvod_h(-w1, -w5);
325  const7 = __msa_ilvod_h(w3, -w1);
326  DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
327  const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
328  DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
329  const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
330  BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
331  b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
332  temp0_r, temp0_l, temp1_r, temp1_l,
333  temp2_r, temp2_l, temp3_r, temp3_l,
334  a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    /* Scale pass 2 down by 20 bits and pack to 16-bit lanes. */
335  SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
336  SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
337  SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
338  SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
339  PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
340  temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
341  PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
342  a0_r, a1_r, a2_r, a3_r);
    /* First four output rows: clip, pack halfwords to bytes, then pull
     * 64-bit element 1 out of each packed vector for SD4 to store. */
343  temp0_r = (v4i32) CLIP_SH_0_255(temp0_r);
344  temp1_r = (v4i32) CLIP_SH_0_255(temp1_r);
345  temp2_r = (v4i32) CLIP_SH_0_255(temp2_r);
346  temp3_r = (v4i32) CLIP_SH_0_255(temp3_r);
347  PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r,
348  temp2_r, temp2_r, temp3_r, temp3_r,
349  temp0_r, temp1_r, temp2_r, temp3_r);
350  tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1);
351  tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1);
352  tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1);
353  tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1);
354  SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
355  dst += 4 * dst_stride;
    /* Last four output rows: tmp0..tmp3 carry a3, a2, a1, a0 in that
     * order. */
356  a0_r = (v4i32) CLIP_SH_0_255(a0_r);
357  a1_r = (v4i32) CLIP_SH_0_255(a1_r);
358  a2_r = (v4i32) CLIP_SH_0_255(a2_r);
359  a3_r = (v4i32) CLIP_SH_0_255(a3_r);
360  PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r,
361  a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r);
362  tmp3 = __msa_copy_u_d((v2i64) a0_r, 1);
363  tmp2 = __msa_copy_u_d((v2i64) a1_r, 1);
364  tmp1 = __msa_copy_u_d((v2i64) a2_r, 1);
365  tmp0 = __msa_copy_u_d((v2i64) a3_r, 1);
366  SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
    /* NOTE(review): dst is not read again after this increment. */
367  dst += 4 * dst_stride;
368 }
369 
/*
 * 8x8 inverse DCT whose result is added to the bytes already in dst, clipped
 * with CLIP_SH_0_255, packed to bytes and stored back to dst, 64 bits per
 * row, dst_stride bytes apart.
 *
 * block is read as eight v8i16 vectors (LD_SH8, stride 8); this function
 * does not write to block.
 *
 * The arithmetic is two passes, each preceded by an 8x8 transpose: pass 1
 * adds a (1 << 10) bias and shifts right by 11, pass 2 adds const_val1 and
 * shifts right by 20. Unlike the sibling functions, w2/w4/w6 and
 * const0..const7 are built once in pass 1 and reused unchanged in pass 2.
 */
370 static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride,
371  int16_t *block)
372 {
373  int32_t const_val;
374  uint64_t tmp0, tmp1, tmp2, tmp3;
    /* Lane k (k = 1..7) is the multiplier splatted below as wk; lane 0 is
     * never read. */
375  v8i16 weights = { 0, 22725, 21407, 19266, 16383, 12873, 8867, 4520 };
376  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
377  v8i16 w1, w3, w5, w7;
378  v8i16 const0, const1, const2, const3, const4, const5, const6, const7;
379  v4i32 temp0_r, temp1_r, temp2_r, temp3_r;
380  v4i32 temp4_r, temp5_r, temp6_r, temp7_r, temp8_r;
381  v4i32 temp0_l, temp1_l, temp2_l, temp3_l;
382  v4i32 temp4_l, temp5_l, temp6_l, temp7_l, temp8_l;
383  v4i32 a0_r, a1_r, a2_r, a3_r, a0_l, a1_l, a2_l, a3_l;
384  v4i32 b0_r, b1_r, b2_r, b3_r, b0_l, b1_l, b2_l, b3_l;
385  v4i32 w2, w4, w6;
386  v8i16 select_vec, temp;
387  v8i16 zero = { 0 };
388  v4i32 const_val0 = __msa_ldi_w(1);
389  v4i32 const_val1 = __msa_ldi_w(1);
390 
    /* Every word lane becomes 1 << 10: the bias added before the >> 11 of
     * pass 1. */
391  const_val0 <<= 10;
    /* (1 << 19) / 16383 is 32, so const_val is 16383 * 32 = 524256. It is
     * inserted into lane 0 and splatted to all lanes: the bias added before
     * the >> 20 of pass 2. */
392  const_val = 16383 * ((1 << 19) / 16383);
393  const_val1 = __msa_insert_w(const_val0, 0, const_val);
394  const_val1 = __msa_splati_w(const_val1, 0);
395  LD_SH8(block, 8, in0, in1, in2, in3, in4, in5, in6, in7);
396  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
397  in0, in1, in2, in3, in4, in5, in6, in7);
398 
    /* A lane of select_vec is all-ones where in1..in7 are all zero in that
     * lane (unsigned compare "< 1" on their OR). */
399  select_vec = in1 | in2 | in3 | in4 | in5 | in6 | in7;
400  select_vec = __msa_clti_u_h((v8u16) select_vec, 1);
    /* All pass-1 inputs are widened or interleaved up front: in0 -> a0,
     * in2 -> temp3, (in1, in3) -> b3, in4 -> temp4, in6 -> temp7,
     * (in5, in7) -> temp8. */
401  UNPCK_SH_SW(in0, a0_r, a0_l);
402  UNPCK_SH_SW(in2, temp3_r, temp3_l);
403  ILVRL_H2_SW(in1, in3, b3_r, b3_l);
404  UNPCK_SH_SW(in4, temp4_r, temp4_l);
405  UNPCK_SH_SW(in6, temp7_r, temp7_l);
406  ILVRL_H2_SW(in5, in7, temp8_r, temp8_l);
    /* Value substituted for the computed pass-1 result in lanes flagged by
     * select_vec (see the __msa_bmnz_v calls below). */
407  temp = in0 << 3;
    /* Odd part of pass 1: b0..b3 are dot products of the (in1, in3) pairs
     * with const0..const3, accumulated with the (in5, in7) pairs times
     * const4..const7. */
408  SPLATI_H4_SH(weights, 1, 3, 5, 7, w1, w3, w5, w7);
409  ILVR_H4_SH(w1, w3, w3, -w7, w5, -w1, w7, -w5,
410  const0, const1, const2, const3);
411  ILVR_H2_SH(w5, w7, w7, w3, const4, const6);
412  const5 = __msa_ilvod_h(-w1, -w5);
413  const7 = __msa_ilvod_h(w3, -w1);
414  DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
415  b0_r, b1_r, b2_r, b3_r);
416  DPADD_SH4_SW(temp8_r, temp8_r, temp8_r, temp8_r,
417  const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
418  DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
419  b0_l, b1_l, b2_l, b3_l);
420  DPADD_SH4_SW(temp8_l, temp8_l, temp8_l, temp8_l,
421  const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
    /* Splat weights 2, 4 and 6 and interleave them with zero to obtain
     * 32-bit multiplier lanes. */
422  w2 = (v4i32) __msa_splati_h(weights, 2);
423  w2 = (v4i32) __msa_ilvr_h(zero, (v8i16) w2);
424  w4 = (v4i32) __msa_splati_h(weights, 4);
425  w4 = (v4i32) __msa_ilvr_h(zero, (v8i16) w4);
426  w6 = (v4i32) __msa_splati_h(weights, 6);
427  w6 = (v4i32) __msa_ilvr_h(zero, (v8i16) w6);
    /* Even part of pass 1: a0..a3 are built from in0, in2, in4 and in6. */
428  MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
429  ADD2(a0_r, const_val0, a0_l, const_val0, temp0_r, temp0_l);
430  MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
431  MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
432  BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
433  temp2_l, temp2_r, temp1_l, temp1_r,
434  a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
435  MUL2(temp4_r, w4, temp4_l, w4, temp4_r, temp4_l);
436  MUL2(temp7_r, w2, temp7_l, w2, temp6_r, temp6_l);
437  MUL2(temp7_r, w6, temp7_l, w6, temp5_r, temp5_l);
438  ADD2(a0_r, temp4_r, a0_l, temp4_l, a0_r, a0_l);
439  SUB2(a1_r, temp4_r, a1_l, temp4_l, a1_r, a1_l);
440  SUB2(a2_r, temp4_r, a2_l, temp4_l, a2_r, a2_l);
441  ADD2(a3_r, temp4_r, a3_l, temp4_l, a3_r, a3_l);
442  ADD2(a0_r, temp5_r, a0_l, temp5_l, a0_r, a0_l);
443  SUB2(a1_r, temp6_r, a1_l, temp6_l, a1_r, a1_l);
444  ADD2(a2_r, temp6_r, a2_l, temp6_l, a2_r, a2_l);
445  SUB2(a3_r, temp5_r, a3_l, temp5_l, a3_r, a3_l);
    /* Combine even (a*) and odd (b*) parts; results land in temp0..temp3 and
     * a3..a0. NOTE(review): presumably sums and differences respectively,
     * confirm against the BUTTERFLY_16 definition. */
446  BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
447  b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
448  temp0_r, temp0_l, temp1_r, temp1_l,
449  temp2_r, temp2_l, temp3_r, temp3_l,
450  a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    /* Scale pass 1 down by 11 bits and pack the 32-bit lanes to 16 bits. */
451  SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 11);
452  SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 11);
453  PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r,
454  temp2_l, temp2_r, temp3_l, temp3_r,
455  temp0_r, temp1_r, temp2_r, temp3_r);
    /* Where select_vec is set, take temp (in0 << 3) instead of the computed
     * value. */
456  in0 = (v8i16) __msa_bmnz_v((v16u8) temp0_r, (v16u8) temp,
457  (v16u8) select_vec);
458  in1 = (v8i16) __msa_bmnz_v((v16u8) temp1_r, (v16u8) temp,
459  (v16u8) select_vec);
460  in2 = (v8i16) __msa_bmnz_v((v16u8) temp2_r, (v16u8) temp,
461  (v16u8) select_vec);
462  in3 = (v8i16) __msa_bmnz_v((v16u8) temp3_r, (v16u8) temp,
463  (v16u8) select_vec);
464  SRA_4V(a3_r, a3_l, a2_r, a2_l, 11);
465  SRA_4V(a1_r, a1_l, a0_r, a0_l, 11);
466  PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
467  a0_r, a1_r, a2_r, a3_r);
468  in4 = (v8i16) __msa_bmnz_v((v16u8) a3_r, (v16u8) temp, (v16u8) select_vec);
469  in5 = (v8i16) __msa_bmnz_v((v16u8) a2_r, (v16u8) temp, (v16u8) select_vec);
470  in6 = (v8i16) __msa_bmnz_v((v16u8) a1_r, (v16u8) temp, (v16u8) select_vec);
471  in7 = (v8i16) __msa_bmnz_v((v16u8) a0_r, (v16u8) temp, (v16u8) select_vec);
472  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
473  in0, in1, in2, in3, in4, in5, in6, in7);
474 
    /* Pass 2: same even/odd arithmetic on the transposed pass-1 output, with
     * const_val1 as the bias and a 20-bit shift. w2/w4/w6 and
     * const0..const7 still hold the values computed in pass 1. */
475  UNPCK_SH_SW(in0, a0_r, a0_l);
476  UNPCK_SH_SW(in2, temp3_r, temp3_l);
477  MUL2(a0_r, w4, a0_l, w4, a0_r, a0_l);
478  ADD2(a0_r, const_val1, a0_l, const_val1, temp0_r, temp0_l);
479  MUL2(w2, temp3_r, w2, temp3_l, temp1_r, temp1_l);
480  MUL2(w6, temp3_r, w6, temp3_l, temp2_r, temp2_l);
481  BUTTERFLY_8(temp0_r, temp0_l, temp0_r, temp0_l,
482  temp2_l, temp2_r, temp1_l, temp1_r,
483  a0_r, a0_l, a1_r, a1_l, a2_l, a2_r, a3_l, a3_r);
484  UNPCK_SH_SW(in4, temp0_r, temp0_l);
485  UNPCK_SH_SW(in6, temp3_r, temp3_l);
486  MUL2(temp0_r, w4, temp0_l, w4, temp0_r, temp0_l);
487  MUL2(w2, temp3_r, w2, temp3_l, temp2_r, temp2_l);
488  MUL2(w6, temp3_r, w6, temp3_l, temp1_r, temp1_l);
489  ADD2(a0_r, temp0_r, a0_l, temp0_l, a0_r, a0_l);
490  SUB2(a1_r, temp0_r, a1_l, temp0_l, a1_r, a1_l);
491  SUB2(a2_r, temp0_r, a2_l, temp0_l, a2_r, a2_l);
492  ADD2(a3_r, temp0_r, a3_l, temp0_l, a3_r, a3_l);
493  ADD2(a0_r, temp1_r, a0_l, temp1_l, a0_r, a0_l);
494  SUB2(a1_r, temp2_r, a1_l, temp2_l, a1_r, a1_l);
495  ADD2(a2_r, temp2_r, a2_l, temp2_l, a2_r, a2_l);
496  SUB2(a3_r, temp1_r, a3_l, temp1_l, a3_r, a3_l);
497  ILVRL_H2_SW(in1, in3, b3_r, b3_l);
498  ILVRL_H2_SW(in5, in7, temp0_r, temp0_l);
499  DOTP_SH4_SW(b3_r, b3_r, b3_r, b3_r, const0, const1, const2, const3,
500  b0_r, b1_r, b2_r, b3_r);
501  DOTP_SH4_SW(b3_l, b3_l, b3_l, b3_l, const0, const1, const2, const3,
502  b0_l, b1_l, b2_l, b3_l);
503  DPADD_SH4_SW(temp0_r, temp0_r, temp0_r, temp0_r,
504  const4, const5, const6, const7, b0_r, b1_r, b2_r, b3_r);
505  DPADD_SH4_SW(temp0_l, temp0_l, temp0_l, temp0_l,
506  const4, const5, const6, const7, b0_l, b1_l, b2_l, b3_l);
507  BUTTERFLY_16(a0_r, a0_l, a1_r, a1_l, a2_r, a2_l, a3_r, a3_l,
508  b3_l, b3_r, b2_l, b2_r, b1_l, b1_r, b0_l, b0_r,
509  temp0_r, temp0_l, temp1_r, temp1_l,
510  temp2_r, temp2_l, temp3_r, temp3_l,
511  a3_l, a3_r, a2_l, a2_r, a1_l, a1_r, a0_l, a0_r);
    /* First four output rows: scale by 20 bits, pack to 16 bits, add the
     * existing dst bytes (widened to 16 bits by interleaving with zero),
     * clip, pack to bytes and store. */
512  SRA_4V(temp0_r, temp0_l, temp1_r, temp1_l, 20);
513  SRA_4V(temp2_r, temp2_l, temp3_r, temp3_l, 20);
514  LD_SH4(dst, dst_stride, in0, in1, in2, in3);
515  PCKEV_H4_SW(temp0_l, temp0_r, temp1_l, temp1_r, temp2_l, temp2_r,
516  temp3_l, temp3_r, temp0_r, temp1_r, temp2_r, temp3_r);
517  ILVR_B4_SW(zero, in0, zero, in1, zero, in2, zero, in3,
518  temp0_l, temp1_l, temp2_l, temp3_l);
519  temp0_r = (v4i32) ((v8i16) (temp0_r) + (v8i16) (temp0_l));
520  temp1_r = (v4i32) ((v8i16) (temp1_r) + (v8i16) (temp1_l));
521  temp2_r = (v4i32) ((v8i16) (temp2_r) + (v8i16) (temp2_l));
522  temp3_r = (v4i32) ((v8i16) (temp3_r) + (v8i16) (temp3_l));
523  temp0_r = (v4i32) CLIP_SH_0_255(temp0_r);
524  temp1_r = (v4i32) CLIP_SH_0_255(temp1_r);
525  temp2_r = (v4i32) CLIP_SH_0_255(temp2_r);
526  temp3_r = (v4i32) CLIP_SH_0_255(temp3_r);
527  PCKEV_B4_SW(temp0_r, temp0_r, temp1_r, temp1_r,
528  temp2_r, temp2_r, temp3_r, temp3_r,
529  temp0_r, temp1_r, temp2_r, temp3_r);
530  tmp0 = __msa_copy_u_d((v2i64) temp0_r, 1);
531  tmp1 = __msa_copy_u_d((v2i64) temp1_r, 1);
532  tmp2 = __msa_copy_u_d((v2i64) temp2_r, 1);
533  tmp3 = __msa_copy_u_d((v2i64) temp3_r, 1);
534  SD4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
535 
    /* Last four output rows, starting at dst + 4 * dst_stride: the same
     * steps applied to a3, a2, a1, a0 in that order. */
536  SRA_4V(a3_r, a3_l, a2_r, a2_l, 20);
537  SRA_4V(a1_r, a1_l, a0_r, a0_l, 20);
538  LD_SH4(dst + 4 * dst_stride, dst_stride, in4, in5, in6, in7);
539  PCKEV_H4_SW(a0_l, a0_r, a1_l, a1_r, a2_l, a2_r, a3_l, a3_r,
540  a0_r, a1_r, a2_r, a3_r);
541  ILVR_B4_SW(zero, in4, zero, in5, zero, in6, zero, in7,
542  a3_l, a2_l, a1_l, a0_l);
543  a3_r = (v4i32) ((v8i16) (a3_r) + (v8i16) (a3_l));
544  a2_r = (v4i32) ((v8i16) (a2_r) + (v8i16) (a2_l));
545  a1_r = (v4i32) ((v8i16) (a1_r) + (v8i16) (a1_l));
546  a0_r = (v4i32) ((v8i16) (a0_r) + (v8i16) (a0_l));
547  a3_r = (v4i32) CLIP_SH_0_255(a3_r);
548  a2_r = (v4i32) CLIP_SH_0_255(a2_r);
549  a1_r = (v4i32) CLIP_SH_0_255(a1_r);
550  a0_r = (v4i32) CLIP_SH_0_255(a0_r);
551  PCKEV_B4_SW(a0_r, a0_r, a1_r, a1_r,
552  a2_r, a2_r, a3_r, a3_r, a0_r, a1_r, a2_r, a3_r);
553  tmp0 = __msa_copy_u_d((v2i64) a3_r, 1);
554  tmp1 = __msa_copy_u_d((v2i64) a2_r, 1);
555  tmp2 = __msa_copy_u_d((v2i64) a1_r, 1);
556  tmp3 = __msa_copy_u_d((v2i64) a0_r, 1);
557  SD4(tmp0, tmp1, tmp2, tmp3, dst + 4 * dst_stride, dst_stride);
558 }
559 
/* Externally visible entry point: forwards block unchanged to the file-local
 * simple_idct_msa(), which transforms it in place. */
560 void ff_simple_idct_msa(int16_t *block)
561 {
562  simple_idct_msa(block);
563 }
564 
/* Externally visible entry point: forwards dst, dst_stride and block
 * unchanged to the file-local simple_idct_put_msa(), which stores the
 * clipped transform result to dst. */
565 void ff_simple_idct_put_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
566 {
567  simple_idct_put_msa(dst, dst_stride, block);
568 }
569 
/* Externally visible entry point: forwards dst, dst_stride and block
 * unchanged to the file-local simple_idct_add_msa(), which adds the
 * transform result to the existing contents of dst. */
570 void ff_simple_idct_add_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
571 {
572  simple_idct_add_msa(dst, dst_stride, block);
573 }
else temp
Definition: vf_mcdeint.c:257
#define ILVR_H4_SH(...)
#define MUL2(in0, in1, in2, in3, out0, out1)
void ff_simple_idct_msa(int16_t *block)
#define ILVRL_H2_SW(...)
#define ST_SW8(in0, in1, in2, in3, in4, in5, in6, in7,pdst, stride)
#define DOTP_SH4_SW(...)
#define SRA_4V(in0, in1, in2, in3, shift)
uint8_t
#define CLIP_SH_0_255(in)
#define SPLATI_H4_SH(...)
#define PCKEV_B4_SW(...)
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
static void simple_idct_msa(int16_t *block)
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
#define DPADD_SH4_SW(...)
#define zero
Definition: regdef.h:64
#define TRANSPOSE8x8_SH_SH(...)
#define LD_SH8(...)
#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,in8, in9,in10, in11, in12, in13, in14, in15,out0, out1, out2, out3, out4, out5, out6, out7,out8, out9, out10, out11, out12, out13, out14, out15)
int32_t
#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,out0, out1, out2, out3, out4, out5, out6, out7)
void ff_simple_idct_add_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
#define UNPCK_SH_SW(in, out0, out1)
static void simple_idct_add_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
void ff_simple_idct_put_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
#define ILVR_B4_SW(...)
#define ADD2(in0, in1, in2, in3, out0, out1)
#define SD4(in0, in1, in2, in3, pdst, stride)
#define PCKEV_H4_SW(...)
#define SUB2(in0, in1, in2, in3, out0, out1)
#define ILVR_H2_SH(...)
#define LD_SH4(...)
static void simple_idct_put_msa(uint8_t *dst, int32_t dst_stride, int16_t *block)
static int16_t block[64]
Definition: dct-test.c:110