FFmpeg
vp9_lpf_msa.c
1 /*
2  * Copyright (c) 2015 Shivraj Patil (Shivraj.Patil@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavcodec/vp9dsp.h"
22 #include "libavutil/mips/generic_macros_msa.h"
23 #include "vp9dsp_mips.h"
24 
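/* filter4: the 4-tap VP9 loop filter applied to p1, p0, q0, q1. Pixels are
 * converted to signed by XOR with 0x80, the correction
 * 3 * (q0 - p0) + clip(p1 - q1) is computed at 16-bit precision, saturated
 * to 7 bits and applied under the filter mask; p1/q1 are only adjusted for
 * non-hev pixels. The "8W" variant widens only the low (right) 8 bytes, so
 * it serves a single 8-pixel edge per call. */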
25 #define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
26  p1_out, p0_out, q0_out, q1_out) \
27 { \
28  v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
29  v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
30  v8i16 q0_sub_p0_r, filt_r, cnst3h; \
31  \
32  p1_m = (v16i8) __msa_xori_b(p1_in, 0x80); \
33  p0_m = (v16i8) __msa_xori_b(p0_in, 0x80); \
34  q0_m = (v16i8) __msa_xori_b(q0_in, 0x80); \
35  q1_m = (v16i8) __msa_xori_b(q1_in, 0x80); \
36  \
37  filt = __msa_subs_s_b(p1_m, q1_m); \
38  filt = filt & (v16i8) hev_in; \
39  q0_sub_p0 = q0_m - p0_m; \
40  filt_sign = __msa_clti_s_b(filt, 0); \
41  \
42  cnst3h = __msa_ldi_h(3); \
43  q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
44  q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h); \
45  filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt); \
46  filt_r += q0_sub_p0_r; \
47  filt_r = __msa_sat_s_h(filt_r, 7); \
48  \
49  /* combine left and right part */ \
50  filt = __msa_pckev_b((v16i8) filt_r, (v16i8) filt_r); \
51  \
52  filt = filt & (v16i8) mask_in; \
53  cnst4b = __msa_ldi_b(4); \
54  filt1 = __msa_adds_s_b(filt, cnst4b); \
55  filt1 >>= 3; \
56  \
57  cnst3b = __msa_ldi_b(3); \
58  filt2 = __msa_adds_s_b(filt, cnst3b); \
59  filt2 >>= 3; \
60  \
61  q0_m = __msa_subs_s_b(q0_m, filt1); \
62  q0_out = __msa_xori_b((v16u8) q0_m, 0x80); \
63  p0_m = __msa_adds_s_b(p0_m, filt2); \
64  p0_out = __msa_xori_b((v16u8) p0_m, 0x80); \
65  \
66  filt = __msa_srari_b(filt1, 1); \
67  hev_in = __msa_xori_b((v16u8) hev_in, 0xff); \
68  filt = filt & (v16i8) hev_in; \
69  \
70  q1_m = __msa_subs_s_b(q1_m, filt); \
71  q1_out = __msa_xori_b((v16u8) q1_m, 0x80); \
72  p1_m = __msa_adds_s_b(p1_m, filt); \
73  p1_out = __msa_xori_b((v16u8) p1_m, 0x80); \
74 }
75 
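/* Full-width variant of the filter4 macro above: the 16-bit arithmetic is
 * done on both the low and high halves (ilvr + ilvl), so all 16 bytes,
 * i.e. two adjacent 8-pixel edges, are filtered in one invocation. */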
76 #define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in, \
77  p1_out, p0_out, q0_out, q1_out) \
78 { \
79  v16i8 p1_m, p0_m, q0_m, q1_m, q0_sub_p0, filt_sign; \
80  v16i8 filt, filt1, filt2, cnst4b, cnst3b; \
81  v8i16 q0_sub_p0_r, q0_sub_p0_l, filt_l, filt_r, cnst3h; \
82  \
83  p1_m = (v16i8) __msa_xori_b(p1_in, 0x80); \
84  p0_m = (v16i8) __msa_xori_b(p0_in, 0x80); \
85  q0_m = (v16i8) __msa_xori_b(q0_in, 0x80); \
86  q1_m = (v16i8) __msa_xori_b(q1_in, 0x80); \
87  \
88  filt = __msa_subs_s_b(p1_m, q1_m); \
89  \
90  filt = filt & (v16i8) hev_in; \
91  \
92  q0_sub_p0 = q0_m - p0_m; \
93  filt_sign = __msa_clti_s_b(filt, 0); \
94  \
95  cnst3h = __msa_ldi_h(3); \
96  q0_sub_p0_r = (v8i16) __msa_ilvr_b(q0_sub_p0, q0_sub_p0); \
97  q0_sub_p0_r = __msa_dotp_s_h((v16i8) q0_sub_p0_r, (v16i8) cnst3h); \
98  filt_r = (v8i16) __msa_ilvr_b(filt_sign, filt); \
99  filt_r += q0_sub_p0_r; \
100  filt_r = __msa_sat_s_h(filt_r, 7); \
101  \
102  q0_sub_p0_l = (v8i16) __msa_ilvl_b(q0_sub_p0, q0_sub_p0); \
103  q0_sub_p0_l = __msa_dotp_s_h((v16i8) q0_sub_p0_l, (v16i8) cnst3h); \
104  filt_l = (v8i16) __msa_ilvl_b(filt_sign, filt); \
105  filt_l += q0_sub_p0_l; \
106  filt_l = __msa_sat_s_h(filt_l, 7); \
107  \
108  filt = __msa_pckev_b((v16i8) filt_l, (v16i8) filt_r); \
109  filt = filt & (v16i8) mask_in; \
110  \
111  cnst4b = __msa_ldi_b(4); \
112  filt1 = __msa_adds_s_b(filt, cnst4b); \
113  filt1 >>= 3; \
114  \
115  cnst3b = __msa_ldi_b(3); \
116  filt2 = __msa_adds_s_b(filt, cnst3b); \
117  filt2 >>= 3; \
118  \
119  q0_m = __msa_subs_s_b(q0_m, filt1); \
120  q0_out = __msa_xori_b((v16u8) q0_m, 0x80); \
121  p0_m = __msa_adds_s_b(p0_m, filt2); \
122  p0_out = __msa_xori_b((v16u8) p0_m, 0x80); \
123  \
124  filt = __msa_srari_b(filt1, 1); \
125  hev_in = __msa_xori_b((v16u8) hev_in, 0xff); \
126  filt = filt & (v16i8) hev_in; \
127  \
128  q1_m = __msa_subs_s_b(q1_m, filt); \
129  q1_out = __msa_xori_b((v16u8) q1_m, 0x80); \
130  p1_m = __msa_adds_s_b(p1_m, filt); \
131  p1_out = __msa_xori_b((v16u8) p1_m, 0x80); \
132 }
133 
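/* flat: per-byte mask that is 0xff where |p2-p0|, |q2-q0|, |p3-p0| and
 * |q3-q0| (together with the |p1-p0| / |q1-q0| maximum passed in through
 * flat_out) are all <= 1, selecting the filter8 path. Note the macro also
 * ANDs with a variable named 'mask' that must exist in the caller. */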
134 #define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out) \
135 { \
136  v16u8 tmp, p2_a_sub_p0, q2_a_sub_q0, p3_a_sub_p0, q3_a_sub_q0; \
137  v16u8 zero_in = { 0 }; \
138  \
139  tmp = __msa_ori_b(zero_in, 1); \
140  p2_a_sub_p0 = __msa_asub_u_b(p2_in, p0_in); \
141  q2_a_sub_q0 = __msa_asub_u_b(q2_in, q0_in); \
142  p3_a_sub_p0 = __msa_asub_u_b(p3_in, p0_in); \
143  q3_a_sub_q0 = __msa_asub_u_b(q3_in, q0_in); \
144  \
145  p2_a_sub_p0 = __msa_max_u_b(p2_a_sub_p0, q2_a_sub_q0); \
146  flat_out = __msa_max_u_b(p2_a_sub_p0, flat_out); \
147  p3_a_sub_p0 = __msa_max_u_b(p3_a_sub_p0, q3_a_sub_q0); \
148  flat_out = __msa_max_u_b(p3_a_sub_p0, flat_out); \
149  \
150  flat_out = (tmp < (v16u8) flat_out); \
151  flat_out = __msa_xori_b(flat_out, 0xff); \
152  flat_out = flat_out & (mask); \
153 }
154 
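/* flat2: the same <= 1 flatness test as VP9_FLAT4 but on the outer samples
 * p7..p4 and q4..q7, further ANDed with flat_in; it gates the wide
 * (filter16) path. */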
155 #define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, \
156  q5_in, q6_in, q7_in, flat_in, flat2_out) \
157 { \
158  v16u8 tmp, zero_in = { 0 }; \
159  v16u8 p4_a_sub_p0, q4_a_sub_q0, p5_a_sub_p0, q5_a_sub_q0; \
160  v16u8 p6_a_sub_p0, q6_a_sub_q0, p7_a_sub_p0, q7_a_sub_q0; \
161  \
162  tmp = __msa_ori_b(zero_in, 1); \
163  p4_a_sub_p0 = __msa_asub_u_b(p4_in, p0_in); \
164  q4_a_sub_q0 = __msa_asub_u_b(q4_in, q0_in); \
165  p5_a_sub_p0 = __msa_asub_u_b(p5_in, p0_in); \
166  q5_a_sub_q0 = __msa_asub_u_b(q5_in, q0_in); \
167  p6_a_sub_p0 = __msa_asub_u_b(p6_in, p0_in); \
168  q6_a_sub_q0 = __msa_asub_u_b(q6_in, q0_in); \
169  p7_a_sub_p0 = __msa_asub_u_b(p7_in, p0_in); \
170  q7_a_sub_q0 = __msa_asub_u_b(q7_in, q0_in); \
171  \
172  p4_a_sub_p0 = __msa_max_u_b(p4_a_sub_p0, q4_a_sub_q0); \
173  flat2_out = __msa_max_u_b(p5_a_sub_p0, q5_a_sub_q0); \
174  flat2_out = __msa_max_u_b(p4_a_sub_p0, flat2_out); \
175  p6_a_sub_p0 = __msa_max_u_b(p6_a_sub_p0, q6_a_sub_q0); \
176  flat2_out = __msa_max_u_b(p6_a_sub_p0, flat2_out); \
177  p7_a_sub_p0 = __msa_max_u_b(p7_a_sub_p0, q7_a_sub_q0); \
178  flat2_out = __msa_max_u_b(p7_a_sub_p0, flat2_out); \
179  \
180  flat2_out = (tmp < (v16u8) flat2_out); \
181  flat2_out = __msa_xori_b(flat2_out, 0xff); \
182  flat2_out = flat2_out & flat_in; \
183 }
184 
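/* filter8: computes the six flat-segment outputs p2..q2. Inputs are the
 * zero-extended 16-bit rows p3..q3; every output is a weighted 8-sample
 * sum rounded as (sum + 4) >> 3 by __msa_srari_h. */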
185 #define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, \
186  q0_in, q1_in, q2_in, q3_in, \
187  p2_filt8_out, p1_filt8_out, p0_filt8_out, \
188  q0_filt8_out, q1_filt8_out, q2_filt8_out) \
189 { \
190  v8u16 tmp0, tmp1, tmp2; \
191  \
192  tmp2 = p2_in + p1_in + p0_in; \
193  tmp0 = p3_in << 1; \
194  \
195  tmp0 = tmp0 + tmp2 + q0_in; \
196  tmp1 = tmp0 + p3_in + p2_in; \
197  p2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
198  \
199  tmp1 = tmp0 + p1_in + q1_in; \
200  p1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
201  \
202  tmp1 = q2_in + q1_in + q0_in; \
203  tmp2 = tmp2 + tmp1; \
204  tmp0 = tmp2 + (p0_in); \
205  tmp0 = tmp0 + (p3_in); \
206  p0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp0, 3); \
207  \
208  tmp0 = q2_in + q3_in; \
209  tmp0 = p0_in + tmp1 + tmp0; \
210  tmp1 = q3_in + q3_in; \
211  tmp1 = tmp1 + tmp0; \
212  q2_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
213  \
214  tmp0 = tmp2 + q3_in; \
215  tmp1 = tmp0 + q0_in; \
216  q0_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
217  \
218  tmp1 = tmp0 - p2_in; \
219  tmp0 = q1_in + q3_in; \
220  tmp1 = tmp0 + tmp1; \
221  q1_filt8_out = (v8i16) __msa_srari_h((v8i16) tmp1, 3); \
222 }
223 
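/* Edge decisions: hev_out is 0xff where |p1-p0| or |q1-q0| exceeds thresh
 * (high edge variance), mask_out is 0xff where the edge passes the VP9
 * limit / b_limit tests (i.e. the filter may run), and flat_out carries
 * max(|p1-p0|, |q1-q0|) for reuse by VP9_FLAT4. */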
224 #define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \
225  q0_in, q1_in, q2_in, q3_in, \
226  limit_in, b_limit_in, thresh_in, \
227  hev_out, mask_out, flat_out) \
228 { \
229  v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
230  v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
231  \
232  /* absolute subtraction of pixel values */ \
233  p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \
234  p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \
235  p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \
236  q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \
237  q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \
238  q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \
239  p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \
240  p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \
241  \
242  /* calculation of hev */ \
243  flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
244  hev_out = thresh_in < (v16u8) flat_out; \
245  \
246  /* calculation of mask */ \
247  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \
248  p1_asub_q1_m >>= 1; \
249  p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
250  \
251  mask_out = b_limit_in < p0_asub_q0_m; \
252  mask_out = __msa_max_u_b(flat_out, mask_out); \
253  p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
254  mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
255  q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
256  mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
257  \
258  mask_out = limit_in < (v16u8) mask_out; \
259  mask_out = __msa_xori_b(mask_out, 0xff); \
260 }
261 
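/* 8-pixel edge, filter4 only: loads the eight rows p3..q3 straddling the
 * edge, filters p1..q1 and writes the four modified rows back as 64-bit
 * stores. */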
262 void ff_loop_filter_v_4_8_msa(uint8_t *src, ptrdiff_t pitch,
263  int32_t b_limit_ptr,
264  int32_t limit_ptr,
265  int32_t thresh_ptr)
266 {
267  uint64_t p1_d, p0_d, q0_d, q1_d;
268  v16u8 mask, hev, flat, thresh, b_limit, limit;
269  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
270 
271  /* load vector elements */
272  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
273 
274  thresh = (v16u8) __msa_fill_b(thresh_ptr);
275  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
276  limit = (v16u8) __msa_fill_b(limit_ptr);
277 
278  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
279  hev, mask, flat);
280  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
281  q1_out);
282 
283  p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
284  p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
285  q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
286  q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
287  SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
288 }
289 
290 
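/* Two adjacent 8-pixel edges at once: the per-edge thresh/b_limit/limit
 * values arrive packed in the low and second-lowest bytes of the int32
 * arguments and are splatted into the two halves of each control vector. */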
291 void ff_loop_filter_v_44_16_msa(uint8_t *src, ptrdiff_t pitch,
292  int32_t b_limit_ptr,
293  int32_t limit_ptr,
294  int32_t thresh_ptr)
295 {
296  v16u8 mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
297  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
298 
299  /* load vector elements */
300  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
301 
302  thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
303  thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
304  thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
305 
306  b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
307  b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
308  b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
309 
310  limit0 = (v16u8) __msa_fill_b(limit_ptr);
311  limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
312  limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
313 
314  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
315  hev, mask, flat);
316  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
317 
318  ST_UB4(p1, p0, q0, q1, (src - 2 * pitch), pitch);
319 }
320 
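/* 8-pixel edge with the filter4/filter8 decision: when the flat mask is
 * all zero only the 4-tap results are stored; otherwise the filter8
 * outputs are merged per pixel with __msa_bmnz_v wherever flat is set. */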
321 void ff_loop_filter_v_8_8_msa(uint8_t *src, ptrdiff_t pitch,
322  int32_t b_limit_ptr,
323  int32_t limit_ptr,
324  int32_t thresh_ptr)
325 {
326  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
327  v16u8 mask, hev, flat, thresh, b_limit, limit;
328  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
329  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
330  v8i16 p2_filter8, p1_filter8, p0_filter8;
331  v8i16 q0_filter8, q1_filter8, q2_filter8;
332  v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
333  v16i8 zero = { 0 };
334 
335  /* load vector elements */
336  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
337 
338  thresh = (v16u8) __msa_fill_b(thresh_ptr);
339  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
340  limit = (v16u8) __msa_fill_b(limit_ptr);
341 
342  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
343  hev, mask, flat);
344  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
345  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
346  q1_out);
347 
348  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
349 
350  /* if flat is zero for all pixels, then no need to calculate other filter */
351  if (__msa_test_bz_v(flat)) {
352  p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
353  p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
354  q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
355  q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
356  SD4(p1_d, p0_d, q0_d, q1_d, (src - 2 * pitch), pitch);
357  } else {
358  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
359  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
360  q2_r, q3_r);
361  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filter8,
362  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
363 
364  /* convert 16 bit output data into 8 bit */
365  PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
366  zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
367  q0_filter8);
368  PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8, q2_filter8);
369 
370  /* store pixel values */
371  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
372  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
373  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
374  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
375  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
376  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
377 
378  p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
379  p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
380  p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
381  q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
382  q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
383  q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
384 
385  src -= 3 * pitch;
386 
387  SD4(p2_d, p1_d, p0_d, q0_d, src, pitch);
388  src += (4 * pitch);
389  SD(q1_d, src);
390  src += pitch;
391  SD(q2_d, src);
392  }
393 }
394 
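/* The two-digit suffixes give the filter width used on each 8-column half
 * of a 16-pixel edge: _88_ may apply filter8 to both halves, _84_ only to
 * the first (the upper half of the flat mask is cleared), and _48_ only to
 * the second (the lower half is cleared). */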
395 void ff_loop_filter_v_88_16_msa(uint8_t *src, ptrdiff_t pitch,
396  int32_t b_limit_ptr,
397  int32_t limit_ptr,
398  int32_t thresh_ptr)
399 {
400  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
401  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
402  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
403  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
404  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
405  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
406  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
407  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
408  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
409  v16u8 zero = { 0 };
410 
411  /* load vector elements */
412  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
413 
414  thresh = (v16u8) __msa_fill_b(thresh_ptr);
415  tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
416  thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
417 
418  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
419  tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
420  b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
421 
422  limit = (v16u8) __msa_fill_b(limit_ptr);
423  tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
424  limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
425 
426  /* mask and hev */
427  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
428  hev, mask, flat);
429  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
430  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
431  q1_out);
432 
433  /* if flat is zero for all pixels, then no need to calculate other filter */
434  if (__msa_test_bz_v(flat)) {
435  ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
436  } else {
437  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
438  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
439  q2_r, q3_r);
440  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
441  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
442 
443  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
444  p0_l);
445  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
446  q3_l);
447  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
448  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
449 
450  /* convert 16 bit output data into 8 bit */
451  PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
452  p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
453  p0_filt8_r, q0_filt8_r);
454  PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r,
455  q1_filt8_r, q2_filt8_r);
456 
457  /* store pixel values */
458  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
459  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
460  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
461  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
462  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
463  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
464 
465  src -= 3 * pitch;
466 
467  ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
468  src += (4 * pitch);
469  ST_UB2(q1_out, q2_out, src, pitch);
470  src += (2 * pitch);
471  }
472 }
473 
474 void ff_loop_filter_v_84_16_msa(uint8_t *src, ptrdiff_t pitch,
475  int32_t b_limit_ptr,
476  int32_t limit_ptr,
477  int32_t thresh_ptr)
478 {
479  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
480  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
481  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
482  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
483  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
484  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
485  v16u8 zero = { 0 };
486 
487  /* load vector elements */
488  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
489 
490  thresh = (v16u8) __msa_fill_b(thresh_ptr);
491  tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
492  thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
493 
494  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
495  tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
496  b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
497 
498  limit = (v16u8) __msa_fill_b(limit_ptr);
499  tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
500  limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
501 
502  /* mask and hev */
503  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
504  hev, mask, flat);
505  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
506  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
507  q1_out);
508 
509  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
510 
511  /* if flat is zero for all pixels, then no need to calculate other filter */
512  if (__msa_test_bz_v(flat)) {
513  ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
514  } else {
515  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
516  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
517  q2_r, q3_r);
518  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
519  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
520 
521  /* convert 16 bit output data into 8 bit */
522  PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
523  p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
524  p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
525  PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
526  q1_filt8_r, q2_filt8_r);
527 
528  /* store pixel values */
529  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
530  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
531  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
532  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
533  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
534  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
535 
536  src -= 3 * pitch;
537 
538  ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
539  src += (4 * pitch);
540  ST_UB2(q1_out, q2_out, src, pitch);
541  src += (2 * pitch);
542  }
543 }
544 
545 void ff_loop_filter_v_48_16_msa(uint8_t *src, ptrdiff_t pitch,
546  int32_t b_limit_ptr,
547  int32_t limit_ptr,
548  int32_t thresh_ptr)
549 {
550  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
551  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
552  v16u8 flat, mask, hev, tmp, thresh, b_limit, limit;
553  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
554  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
555  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
556  v16u8 zero = { 0 };
557 
558  /* load vector elements */
559  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
560 
561  thresh = (v16u8) __msa_fill_b(thresh_ptr);
562  tmp = (v16u8) __msa_fill_b(thresh_ptr >> 8);
563  thresh = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) thresh);
564 
565  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
566  tmp = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
567  b_limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) b_limit);
568 
569  limit = (v16u8) __msa_fill_b(limit_ptr);
570  tmp = (v16u8) __msa_fill_b(limit_ptr >> 8);
571  limit = (v16u8) __msa_ilvr_d((v2i64) tmp, (v2i64) limit);
572 
573  /* mask and hev */
574  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
575  hev, mask, flat);
576  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
577  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
578  q1_out);
579 
580  flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
581 
582  /* if flat is zero for all pixels, then no need to calculate other filter */
583  if (__msa_test_bz_v(flat)) {
584  ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
585  } else {
586  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
587  p0_l);
588  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
589  q3_l);
590  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
591  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
592 
593  /* convert 16 bit output data into 8 bit */
594  PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
595  p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
596  p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
597  PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
598  q1_filt8_l, q2_filt8_l);
599 
600  /* store pixel values */
601  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
602  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
603  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
604  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
605  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
606  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
607 
608  src -= 3 * pitch;
609 
610  ST_UB4(p2_out, p1_out, p0_out, q0_out, src, pitch);
611  src += (4 * pitch);
612  ST_UB2(q1_out, q2_out, src, pitch);
613  src += (2 * pitch);
614  }
615 }
616 
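/* First pass of the 16-pixel wide filter: runs filter4 + filter8 across
 * the whole edge and, unless the flat mask is all zero (return value 1),
 * saves the six filter8 rows followed by the flat mask into the
 * 16-byte-strided filter48 scratch buffer for the second pass. */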
617 static int32_t vp9_hz_lpf_t4_and_t8_16w(uint8_t *src, ptrdiff_t pitch,
618  uint8_t *filter48,
619  int32_t b_limit_ptr,
620  int32_t limit_ptr,
621  int32_t thresh_ptr)
622 {
623  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
624  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
625  v16u8 flat, mask, hev, thresh, b_limit, limit;
626  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
627  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
628  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
629  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
630  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
631  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
632  v16u8 zero = { 0 };
633 
634  /* load vector elements */
635  LD_UB8(src - (4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
636 
637  thresh = (v16u8) __msa_fill_b(thresh_ptr);
638  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
639  limit = (v16u8) __msa_fill_b(limit_ptr);
640 
641  /* mask and hev */
642  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
643  hev, mask, flat);
644  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
645  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
646  q1_out);
647 
648  /* if flat is zero for all pixels, then no need to calculate other filter */
649  if (__msa_test_bz_v(flat)) {
650  ST_UB4(p1_out, p0_out, q0_out, q1_out, (src - 2 * pitch), pitch);
651 
652  return 1;
653  } else {
654  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
655  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r,
656  q2_r, q3_r);
657  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
658  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
659 
660  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
661  p0_l);
662  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
663  q3_l);
664  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
665  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
666 
667  /* convert 16 bit output data into 8 bit */
668  PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
669  p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
670  p0_filt8_r, q0_filt8_r);
671  PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
672  q2_filt8_r);
673 
674  /* store pixel values */
675  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
676  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
677  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
678  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
679  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
680  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
681 
682  ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
683  filter48 += (4 * 16);
684  ST_UB2(q1_out, q2_out, filter48, 16);
685  filter48 += (2 * 16);
686  ST_UB(flat, filter48);
687 
688  return 0;
689  }
690 }
691 
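/* Second pass: reloads the flat mask and filter8 rows from filter48,
 * evaluates flat2 over p7..q7 and, where it is set, produces the wide
 * filter16 outputs p6..q6. Each output is a rounded 16-sample sum
 * ((sum + 8) >> 4) maintained incrementally: every row adds the incoming
 * sample and subtracts the outgoing one. */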
692 static void vp9_hz_lpf_t16_16w(uint8_t *src, ptrdiff_t pitch, uint8_t *filter48)
693 {
694  v16u8 flat, flat2, filter8;
695  v16i8 zero = { 0 };
696  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
697  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
698  v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
699  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
700  v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
701  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
702  v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
703  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
704  v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
705  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
706  v8i16 l_out, r_out;
707 
708  flat = LD_UB(filter48 + 96);
709 
710  LD_UB8((src - 8 * pitch), pitch, p7, p6, p5, p4, p3, p2, p1, p0);
711  LD_UB8(src, pitch, q0, q1, q2, q3, q4, q5, q6, q7);
712  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
713 
714  /* if flat2 is zero for all pixels, then no need to calculate other filter */
715  if (__msa_test_bz_v(flat2)) {
716  LD_UB4(filter48, 16, p2, p1, p0, q0);
717  LD_UB2(filter48 + 4 * 16, 16, q1, q2);
718 
719  src -= 3 * pitch;
720  ST_UB4(p2, p1, p0, q0, src, pitch);
721  src += (4 * pitch);
722  ST_UB2(q1, q2, src, pitch);
723  } else {
724  src -= 7 * pitch;
725 
726  ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
727  zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
728  p3_r_in, p2_r_in, p1_r_in, p0_r_in);
729 
730  q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
731 
732  tmp0_r = p7_r_in << 3;
733  tmp0_r -= p7_r_in;
734  tmp0_r += p6_r_in;
735  tmp0_r += q0_r_in;
736  tmp1_r = p6_r_in + p5_r_in;
737  tmp1_r += p4_r_in;
738  tmp1_r += p3_r_in;
739  tmp1_r += p2_r_in;
740  tmp1_r += p1_r_in;
741  tmp1_r += p0_r_in;
742  tmp1_r += tmp0_r;
743  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
744 
745  ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
746  p5_l_in, p4_l_in);
747  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
748  p1_l_in, p0_l_in);
749  q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
750 
751  tmp0_l = p7_l_in << 3;
752  tmp0_l -= p7_l_in;
753  tmp0_l += p6_l_in;
754  tmp0_l += q0_l_in;
755  tmp1_l = p6_l_in + p5_l_in;
756  tmp1_l += p4_l_in;
757  tmp1_l += p3_l_in;
758  tmp1_l += p2_l_in;
759  tmp1_l += p1_l_in;
760  tmp1_l += p0_l_in;
761  tmp1_l += tmp0_l;
762  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
763 
764  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
765  p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
766  ST_UB(p6, src);
767  src += pitch;
768 
769  /* p5 */
770  q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
771  tmp0_r = p5_r_in - p6_r_in;
772  tmp0_r += q1_r_in;
773  tmp0_r -= p7_r_in;
774  tmp1_r += tmp0_r;
775  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
776 
777  q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
778  tmp0_l = p5_l_in - p6_l_in;
779  tmp0_l += q1_l_in;
780  tmp0_l -= p7_l_in;
781  tmp1_l += tmp0_l;
782  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
783 
784  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
785  p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
786  ST_UB(p5, src);
787  src += pitch;
788 
789  /* p4 */
790  q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
791  tmp0_r = p4_r_in - p5_r_in;
792  tmp0_r += q2_r_in;
793  tmp0_r -= p7_r_in;
794  tmp1_r += tmp0_r;
795  r_out = (v8i16) __msa_srari_h((v8i16) tmp1_r, 4);
796 
797  q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
798  tmp0_l = p4_l_in - p5_l_in;
799  tmp0_l += q2_l_in;
800  tmp0_l -= p7_l_in;
801  tmp1_l += tmp0_l;
802  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
803 
804  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
805  p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
806  ST_UB(p4, src);
807  src += pitch;
808 
809  /* p3 */
810  q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
811  tmp0_r = p3_r_in - p4_r_in;
812  tmp0_r += q3_r_in;
813  tmp0_r -= p7_r_in;
814  tmp1_r += tmp0_r;
815  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
816 
817  q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
818  tmp0_l = p3_l_in - p4_l_in;
819  tmp0_l += q3_l_in;
820  tmp0_l -= p7_l_in;
821  tmp1_l += tmp0_l;
822  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
823 
824  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
825  p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
826  ST_UB(p3, src);
827  src += pitch;
828 
829  /* p2 */
830  q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
831  filter8 = LD_UB(filter48);
832  tmp0_r = p2_r_in - p3_r_in;
833  tmp0_r += q4_r_in;
834  tmp0_r -= p7_r_in;
835  tmp1_r += tmp0_r;
836  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
837 
838  q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
839  tmp0_l = p2_l_in - p3_l_in;
840  tmp0_l += q4_l_in;
841  tmp0_l -= p7_l_in;
842  tmp1_l += tmp0_l;
843  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
844 
845  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
846  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
847  ST_UB(filter8, src);
848  src += pitch;
849 
850  /* p1 */
851  q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
852  filter8 = LD_UB(filter48 + 16);
853  tmp0_r = p1_r_in - p2_r_in;
854  tmp0_r += q5_r_in;
855  tmp0_r -= p7_r_in;
856  tmp1_r += tmp0_r;
857  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
858 
859  q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
860  tmp0_l = p1_l_in - p2_l_in;
861  tmp0_l += q5_l_in;
862  tmp0_l -= p7_l_in;
863  tmp1_l += tmp0_l;
864  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
865 
866  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
867  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
868  ST_UB(filter8, src);
869  src += pitch;
870 
871  /* p0 */
872  q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
873  filter8 = LD_UB(filter48 + 32);
874  tmp0_r = p0_r_in - p1_r_in;
875  tmp0_r += q6_r_in;
876  tmp0_r -= p7_r_in;
877  tmp1_r += tmp0_r;
878  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
879 
880  q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
881  tmp0_l = p0_l_in - p1_l_in;
882  tmp0_l += q6_l_in;
883  tmp0_l -= p7_l_in;
884  tmp1_l += tmp0_l;
885  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
886 
887  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
888  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
889  ST_UB(filter8, src);
890  src += pitch;
891 
892  /* q0 */
893  q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
894  filter8 = LD_UB(filter48 + 48);
895  tmp0_r = q7_r_in - p0_r_in;
896  tmp0_r += q0_r_in;
897  tmp0_r -= p7_r_in;
898  tmp1_r += tmp0_r;
899  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
900 
901  q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
902  tmp0_l = q7_l_in - p0_l_in;
903  tmp0_l += q0_l_in;
904  tmp0_l -= p7_l_in;
905  tmp1_l += tmp0_l;
906  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
907 
908  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
909  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
910  ST_UB(filter8, src);
911  src += pitch;
912 
913  /* q1 */
914  filter8 = LD_UB(filter48 + 64);
915  tmp0_r = q7_r_in - q0_r_in;
916  tmp0_r += q1_r_in;
917  tmp0_r -= p6_r_in;
918  tmp1_r += tmp0_r;
919  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
920 
921  tmp0_l = q7_l_in - q0_l_in;
922  tmp0_l += q1_l_in;
923  tmp0_l -= p6_l_in;
924  tmp1_l += tmp0_l;
925  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
926 
927  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
928  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
929  ST_UB(filter8, src);
930  src += pitch;
931 
932  /* q2 */
933  filter8 = LD_UB(filter48 + 80);
934  tmp0_r = q7_r_in - q1_r_in;
935  tmp0_r += q2_r_in;
936  tmp0_r -= p5_r_in;
937  tmp1_r += tmp0_r;
938  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
939 
940  tmp0_l = q7_l_in - q1_l_in;
941  tmp0_l += q2_l_in;
942  tmp0_l -= p5_l_in;
943  tmp1_l += tmp0_l;
944  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
945 
946  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
947  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
948  ST_UB(filter8, src);
949  src += pitch;
950 
951  /* q3 */
952  tmp0_r = q7_r_in - q2_r_in;
953  tmp0_r += q3_r_in;
954  tmp0_r -= p4_r_in;
955  tmp1_r += tmp0_r;
956  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
957 
958  tmp0_l = q7_l_in - q2_l_in;
959  tmp0_l += q3_l_in;
960  tmp0_l -= p4_l_in;
961  tmp1_l += tmp0_l;
962  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
963 
964  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
965  q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
966  ST_UB(q3, src);
967  src += pitch;
968 
969  /* q4 */
970  tmp0_r = q7_r_in - q3_r_in;
971  tmp0_r += q4_r_in;
972  tmp0_r -= p3_r_in;
973  tmp1_r += tmp0_r;
974  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
975 
976  tmp0_l = q7_l_in - q3_l_in;
977  tmp0_l += q4_l_in;
978  tmp0_l -= p3_l_in;
979  tmp1_l += tmp0_l;
980  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
981 
982  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
983  q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
984  ST_UB(q4, src);
985  src += pitch;
986 
987  /* q5 */
988  tmp0_r = q7_r_in - q4_r_in;
989  tmp0_r += q5_r_in;
990  tmp0_r -= p2_r_in;
991  tmp1_r += tmp0_r;
992  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
993 
994  tmp0_l = q7_l_in - q4_l_in;
995  tmp0_l += q5_l_in;
996  tmp0_l -= p2_l_in;
997  tmp1_l += tmp0_l;
998  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
999 
1000  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
1001  q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
1002  ST_UB(q5, src);
1003  src += pitch;
1004 
1005  /* q6 */
1006  tmp0_r = q7_r_in - q5_r_in;
1007  tmp0_r += q6_r_in;
1008  tmp0_r -= p1_r_in;
1009  tmp1_r += tmp0_r;
1010  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1011 
1012  tmp0_l = q7_l_in - q5_l_in;
1013  tmp0_l += q6_l_in;
1014  tmp0_l -= p1_l_in;
1015  tmp1_l += tmp0_l;
1016  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
1017 
1018  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
1019  q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
1020  ST_UB(q6, src);
1021  }
1022 }
1023 
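/* 16-pixel wide-filter entry point: the two passes communicate through the
 * on-stack filter48 buffer; the expensive second pass is skipped when the
 * first pass reports an early exit. */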
1024 void ff_loop_filter_v_16_16_msa(uint8_t *src, ptrdiff_t pitch,
1025  int32_t b_limit_ptr,
1026  int32_t limit_ptr,
1027  int32_t thresh_ptr)
1028 {
1029  uint8_t filter48[16 * 8] ALLOC_ALIGNED(ALIGNMENT);
1030  uint8_t early_exit = 0;
1031 
1032  early_exit = vp9_hz_lpf_t4_and_t8_16w(src, pitch, &filter48[0],
1033  b_limit_ptr, limit_ptr, thresh_ptr);
1034 
1035  if (0 == early_exit) {
1036  vp9_hz_lpf_t16_16w(src, pitch, filter48);
1037  }
1038 }
1039 
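/* 8-pixel version of the wide filter, done in a single function: filter4,
 * filter8 and (when flat2 is set) the filter16 outputs are computed on the
 * low 8 columns only, two rows at a time, and written back with 64-bit
 * stores. */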
1040 void ff_loop_filter_v_16_8_msa(uint8_t *src, ptrdiff_t pitch,
1041  int32_t b_limit_ptr,
1042  int32_t limit_ptr,
1043  int32_t thresh_ptr)
1044 {
1045  uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
1046  uint64_t dword0, dword1;
1047  v16u8 flat2, mask, hev, flat, thresh, b_limit, limit;
1048  v16u8 p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
1049  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1050  v16u8 p0_filter16, p1_filter16;
1051  v8i16 p2_filter8, p1_filter8, p0_filter8;
1052  v8i16 q0_filter8, q1_filter8, q2_filter8;
1053  v8u16 p7_r, p6_r, p5_r, p4_r, q7_r, q6_r, q5_r, q4_r;
1054  v8u16 p3_r, p2_r, p1_r, p0_r, q3_r, q2_r, q1_r, q0_r;
1055  v16i8 zero = { 0 };
1056  v8u16 tmp0, tmp1, tmp2;
1057 
1058  /* load vector elements */
1059  LD_UB8((src - 4 * pitch), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1060 
1061  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1062  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1063  limit = (v16u8) __msa_fill_b(limit_ptr);
1064 
1065  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1066  hev, mask, flat);
1067  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1068  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1069  q1_out);
1070 
1071  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1072 
1073  /* if flat is zero for all pixels, then no need to calculate other filter */
1074  if (__msa_test_bz_v(flat)) {
1075  p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
1076  p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
1077  q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
1078  q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
1079  SD4(p1_d, p0_d, q0_d, q1_d, src - 2 * pitch, pitch);
1080  } else {
1081  /* convert 8 bit input data into 16 bit */
1082  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero,
1083  q1, zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r,
1084  q1_r, q2_r, q3_r);
1085  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r,
1086  p2_filter8, p1_filter8, p0_filter8, q0_filter8,
1087  q1_filter8, q2_filter8);
1088 
1089  /* convert 16 bit output data into 8 bit */
1090  PCKEV_B4_SH(zero, p2_filter8, zero, p1_filter8, zero, p0_filter8,
1091  zero, q0_filter8, p2_filter8, p1_filter8, p0_filter8,
1092  q0_filter8);
1093  PCKEV_B2_SH(zero, q1_filter8, zero, q2_filter8, q1_filter8,
1094  q2_filter8);
1095 
1096  /* store pixel values */
1097  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filter8, flat);
1098  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filter8, flat);
1099  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filter8, flat);
1100  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filter8, flat);
1101  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filter8, flat);
1102  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filter8, flat);
1103 
1104  /* load 16 vector elements */
1105  LD_UB4((src - 8 * pitch), pitch, p7, p6, p5, p4);
1106  LD_UB4(src + (4 * pitch), pitch, q4, q5, q6, q7);
1107 
1108  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1109 
1110  /* if flat2 is zero for all pixels, then no need to calculate other filter */
1111  if (__msa_test_bz_v(flat2)) {
1112  p2_d = __msa_copy_u_d((v2i64) p2_out, 0);
1113  p1_d = __msa_copy_u_d((v2i64) p1_out, 0);
1114  p0_d = __msa_copy_u_d((v2i64) p0_out, 0);
1115  q0_d = __msa_copy_u_d((v2i64) q0_out, 0);
1116  q1_d = __msa_copy_u_d((v2i64) q1_out, 0);
1117  q2_d = __msa_copy_u_d((v2i64) q2_out, 0);
1118 
1119  SD4(p2_d, p1_d, p0_d, q0_d, src - 3 * pitch, pitch);
1120  SD(q1_d, src + pitch);
1121  SD(q2_d, src + 2 * pitch);
1122  } else {
1123  /* LSB(right) 8 pixel operation */
1124  ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, q4,
1125  zero, q5, zero, q6, zero, q7, p7_r, p6_r, p5_r, p4_r,
1126  q4_r, q5_r, q6_r, q7_r);
1127 
1128  tmp0 = p7_r << 3;
1129  tmp0 -= p7_r;
1130  tmp0 += p6_r;
1131  tmp0 += q0_r;
1132 
1133  src -= 7 * pitch;
1134 
1135  /* calculation of p6 and p5 */
1136  tmp1 = p6_r + p5_r + p4_r + p3_r;
1137  tmp1 += (p2_r + p1_r + p0_r);
1138  tmp1 += tmp0;
1139  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1140  tmp0 = p5_r - p6_r + q1_r - p7_r;
1141  tmp1 += tmp0;
1142  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1143  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1144  p0_filter16, p1_filter16);
1145  p0_filter16 = __msa_bmnz_v(p6, p0_filter16, flat2);
1146  p1_filter16 = __msa_bmnz_v(p5, p1_filter16, flat2);
1147  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1148  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1149  SD(dword0, src);
1150  src += pitch;
1151  SD(dword1, src);
1152  src += pitch;
1153 
1154  /* calculation of p4 and p3 */
1155  tmp0 = p4_r - p5_r + q2_r - p7_r;
1156  tmp2 = p3_r - p4_r + q3_r - p7_r;
1157  tmp1 += tmp0;
1158  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1159  tmp1 += tmp2;
1160  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1161  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1162  p0_filter16, p1_filter16);
1163  p0_filter16 = __msa_bmnz_v(p4, p0_filter16, flat2);
1164  p1_filter16 = __msa_bmnz_v(p3, p1_filter16, flat2);
1165  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1166  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1167  SD(dword0, src);
1168  src += pitch;
1169  SD(dword1, src);
1170  src += pitch;
1171 
1172  /* calculation of p2 and p1 */
1173  tmp0 = p2_r - p3_r + q4_r - p7_r;
1174  tmp2 = p1_r - p2_r + q5_r - p7_r;
1175  tmp1 += tmp0;
1176  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1177  tmp1 += tmp2;
1178  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1179  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1180  p0_filter16, p1_filter16);
1181  p0_filter16 = __msa_bmnz_v(p2_out, p0_filter16, flat2);
1182  p1_filter16 = __msa_bmnz_v(p1_out, p1_filter16, flat2);
1183  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1184  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1185  SD(dword0, src);
1186  src += pitch;
1187  SD(dword1, src);
1188  src += pitch;
1189 
1190  /* calculation of p0 and q0 */
1191  tmp0 = (p0_r - p1_r) + (q6_r - p7_r);
1192  tmp2 = (q7_r - p0_r) + (q0_r - p7_r);
1193  tmp1 += tmp0;
1194  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1195  tmp1 += tmp2;
1196  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1197  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1198  p0_filter16, p1_filter16);
1199  p0_filter16 = __msa_bmnz_v(p0_out, p0_filter16, flat2);
1200  p1_filter16 = __msa_bmnz_v(q0_out, p1_filter16, flat2);
1201  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1202  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1203  SD(dword0, src);
1204  src += pitch;
1205  SD(dword1, src);
1206  src += pitch;
1207 
1208  /* calculation of q1 and q2 */
1209  tmp0 = q7_r - q0_r + q1_r - p6_r;
1210  tmp2 = q7_r - q1_r + q2_r - p5_r;
1211  tmp1 += tmp0;
1212  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1213  tmp1 += tmp2;
1214  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1215  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1216  p0_filter16, p1_filter16);
1217  p0_filter16 = __msa_bmnz_v(q1_out, p0_filter16, flat2);
1218  p1_filter16 = __msa_bmnz_v(q2_out, p1_filter16, flat2);
1219  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1220  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1221  SD(dword0, src);
1222  src += pitch;
1223  SD(dword1, src);
1224  src += pitch;
1225 
1226  /* calculation of q3 and q4 */
1227  tmp0 = (q7_r - q2_r) + (q3_r - p4_r);
1228  tmp2 = (q7_r - q3_r) + (q4_r - p3_r);
1229  tmp1 += tmp0;
1230  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1231  tmp1 += tmp2;
1232  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1233  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1234  p0_filter16, p1_filter16);
1235  p0_filter16 = __msa_bmnz_v(q3, p0_filter16, flat2);
1236  p1_filter16 = __msa_bmnz_v(q4, p1_filter16, flat2);
1237  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1238  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1239  SD(dword0, src);
1240  src += pitch;
1241  SD(dword1, src);
1242  src += pitch;
1243 
1244  /* calculation of q5 and q6 */
1245  tmp0 = (q7_r - q4_r) + (q5_r - p2_r);
1246  tmp2 = (q7_r - q5_r) + (q6_r - p1_r);
1247  tmp1 += tmp0;
1248  p0_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1249  tmp1 += tmp2;
1250  p1_filter16 = (v16u8) __msa_srari_h((v8i16) tmp1, 4);
1251  PCKEV_B2_UB(zero, p0_filter16, zero, p1_filter16,
1252  p0_filter16, p1_filter16);
1253  p0_filter16 = __msa_bmnz_v(q5, p0_filter16, flat2);
1254  p1_filter16 = __msa_bmnz_v(q6, p1_filter16, flat2);
1255  dword0 = __msa_copy_u_d((v2i64) p0_filter16, 0);
1256  dword1 = __msa_copy_u_d((v2i64) p1_filter16, 0);
1257  SD(dword0, src);
1258  src += pitch;
1259  SD(dword1, src);
1260  }
1261  }
1262 }
1263 
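/* The _h_ variants below filter across a vertical edge: rows are loaded
 * starting 4 bytes left of the edge, transposed so that p3..q3 become
 * vectors of column samples, filtered with the same macros as above, then
 * transposed back through interleaves and 4x4 / 2x4 stores. */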
1264 void ff_loop_filter_h_4_8_msa(uint8_t *src, ptrdiff_t pitch,
1265  int32_t b_limit_ptr,
1266  int32_t limit_ptr,
1267  int32_t thresh_ptr)
1268 {
1269  v16u8 mask, hev, flat, limit, thresh, b_limit;
1270  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1271  v8i16 vec0, vec1, vec2, vec3;
1272 
1273  LD_UB8((src - 4), pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1274 
1275  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1276  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1277  limit = (v16u8) __msa_fill_b(limit_ptr);
1278 
1279  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
1280  p3, p2, p1, p0, q0, q1, q2, q3);
1281  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1282  hev, mask, flat);
1283  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1284  ILVR_B2_SH(p0, p1, q1, q0, vec0, vec1);
1285  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1286 
1287  src -= 2;
1288  ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
1289  src += 4 * pitch;
1290  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1291 }
1292 
1293 void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch,
1294  int32_t b_limit_ptr,
1295  int32_t limit_ptr,
1296  int32_t thresh_ptr)
1297 {
1298  v16u8 mask, hev, flat;
1299  v16u8 thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
1300  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1301  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1302  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1303  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1304 
1305  LD_UB8(src - 4, pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1306  LD_UB8(src - 4 + (8 * pitch), pitch,
1307  row8, row9, row10, row11, row12, row13, row14, row15);
1308 
1309  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1310  row8, row9, row10, row11, row12, row13, row14, row15,
1311  p3, p2, p1, p0, q0, q1, q2, q3);
1312 
1313  thresh0 = (v16u8) __msa_fill_b(thresh_ptr);
1314  thresh1 = (v16u8) __msa_fill_b(thresh_ptr >> 8);
1315  thresh0 = (v16u8) __msa_ilvr_d((v2i64) thresh1, (v2i64) thresh0);
1316 
1317  b_limit0 = (v16u8) __msa_fill_b(b_limit_ptr);
1318  b_limit1 = (v16u8) __msa_fill_b(b_limit_ptr >> 8);
1319  b_limit0 = (v16u8) __msa_ilvr_d((v2i64) b_limit1, (v2i64) b_limit0);
1320 
1321  limit0 = (v16u8) __msa_fill_b(limit_ptr);
1322  limit1 = (v16u8) __msa_fill_b(limit_ptr >> 8);
1323  limit0 = (v16u8) __msa_ilvr_d((v2i64) limit1, (v2i64) limit0);
1324 
1325  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
1326  hev, mask, flat);
1327  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
1328  ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
1329  ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
1330  ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
1331  ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
1332 
1333  src -= 2;
1334 
1335  ST4x8_UB(tmp2, tmp3, src, pitch);
1336  src += (8 * pitch);
1337  ST4x8_UB(tmp4, tmp5, src, pitch);
1338 }
1339 
1340 void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch,
1341  int32_t b_limit_ptr,
1342  int32_t limit_ptr,
1343  int32_t thresh_ptr)
1344 {
1345  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1346  v16u8 p1_out, p0_out, q0_out, q1_out;
1347  v16u8 flat, mask, hev, thresh, b_limit, limit;
1348  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1349  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1350  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1351  v16u8 zero = { 0 };
1352  v8i16 vec0, vec1, vec2, vec3, vec4;
1353 
1354  /* load vector elements */
1355  LD_UB8(src - 4, pitch, p3, p2, p1, p0, q0, q1, q2, q3);
1356 
1357  TRANSPOSE8x8_UB_UB(p3, p2, p1, p0, q0, q1, q2, q3,
1358  p3, p2, p1, p0, q0, q1, q2, q3);
1359 
1360  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1361  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1362  limit = (v16u8) __msa_fill_b(limit_ptr);
1363 
1364  /* mask and hev */
1365  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1366  hev, mask, flat);
1367  /* flat4 */
1368  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1369  /* filter4 */
1370  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1371  q1_out);
1372 
1373  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1374 
1375  /* if flat is zero for all pixels, then no need to calculate other filter */
1376  if (__msa_test_bz_v(flat)) {
1377  /* Store 4 pixels p1 - q1 */
1378  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1379  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1380 
1381  src -= 2;
1382  ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
1383  src += 4 * pitch;
1384  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1385  } else {
1386  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1387  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1388  q3_r);
1389  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1390  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1391  /* convert 16 bit output data into 8 bit */
1392  PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r, p0_filt8_r,
1393  p0_filt8_r, q0_filt8_r, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1394  p0_filt8_r, q0_filt8_r);
1395  PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r, q1_filt8_r,
1396  q2_filt8_r);
1397 
1398  /* store pixel values */
1399  p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1400  p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1401  p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1402  q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1403  q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1404  q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1405 
1406  /* Store 6 pixels p2 - q2 */
1407  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1408  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1409  vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1410 
1411  src -= 3;
1412  ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
1413  ST2x4_UB(vec4, 0, src + 4, pitch);
1414  src += (4 * pitch);
1415  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1416  ST2x4_UB(vec4, 4, src + 4, pitch);
1417  }
1418 }
1419 
1420 void ff_loop_filter_h_88_16_msa(uint8_t *src, ptrdiff_t pitch,
1421  int32_t b_limit_ptr,
1422  int32_t limit_ptr,
1423  int32_t thresh_ptr)
1424 {
1425  uint8_t *temp_src;
1426  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1427  v16u8 p1_out, p0_out, q0_out, q1_out;
1428  v16u8 flat, mask, hev, thresh, b_limit, limit;
1429  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1430  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1431  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1432  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1433  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1434  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1435  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1436  v16u8 zero = { 0 };
1437  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1438 
1439  temp_src = src - 4;
1440 
1441  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1442  temp_src += (8 * pitch);
1443  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1444 
1445  /* transpose 16x8 matrix into 8x16 */
1446  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1447  q3, q2, q1, q0, row12, row13, row14, row15,
1448  p3, p2, p1, p0, q0, q1, q2, q3);
1449 
1450  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1451  vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1452  thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1453 
1454  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1455  vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1456  b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1457 
1458  limit = (v16u8) __msa_fill_b(limit_ptr);
1459  vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1460  limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1461 
1462  /* mask and hev */
1463  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1464  hev, mask, flat);
1465  /* flat4 */
1466  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1467  /* filter4 */
1468  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1469  q1_out);
1470 
1471  /* if flat is zero for all pixels, then no need to calculate other filter */
1472  if (__msa_test_bz_v(flat)) {
1473  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1474  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1475  ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1476  ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1477 
1478  src -= 2;
1479  ST4x8_UB(vec2, vec3, src, pitch);
1480  src += 8 * pitch;
1481  ST4x8_UB(vec4, vec5, src, pitch);
1482  } else {
1483  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1484  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1485  q3_r);
1486  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1487  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1488 
1489  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1490  p0_l);
1491  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1492  q3_l);
1493 
1494  /* filter8 */
1495  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1496  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1497 
1498  /* convert 16 bit output data into 8 bit */
1499  PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
1500  p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
1501  p0_filt8_r, q0_filt8_r);
1502  PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
1503  q2_filt8_r);
1504 
1505  /* store pixel values */
1506  p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1507  p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1508  p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1509  q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1510  q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1511  q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1512 
1513  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1514  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1515  ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1516  ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1517  ILVRL_B2_SH(q2, q1, vec2, vec5);
1518 
1519  src -= 3;
1520  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1521  ST2x4_UB(vec2, 0, src + 4, pitch);
1522  src += (4 * pitch);
1523  ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
1524  ST2x4_UB(vec2, 4, src + 4, pitch);
1525  src += (4 * pitch);
1526  ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
1527  ST2x4_UB(vec5, 0, src + 4, pitch);
1528  src += (4 * pitch);
1529  ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
1530  ST2x4_UB(vec5, 4, src + 4, pitch);
1531  }
1532 }
1533 
1534 void ff_loop_filter_h_84_16_msa(uint8_t *src, ptrdiff_t pitch,
1535  int32_t b_limit_ptr,
1536  int32_t limit_ptr,
1537  int32_t thresh_ptr)
1538 {
1539  uint8_t *temp_src;
1540  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1541  v16u8 p1_out, p0_out, q0_out, q1_out;
1542  v16u8 flat, mask, hev, thresh, b_limit, limit;
1543  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1544  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1545  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1546  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1547  v16u8 zero = { 0 };
1548  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1549 
1550  temp_src = src - 4;
1551 
1552  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1553  temp_src += (8 * pitch);
1554  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1555 
1556  /* transpose 16x8 matrix into 8x16 */
1557  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1558  q3, q2, q1, q0, row12, row13, row14, row15,
1559  p3, p2, p1, p0, q0, q1, q2, q3);
1560 
1561  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1562  vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1563  thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1564 
1565  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1566  vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1567  b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1568 
1569  limit = (v16u8) __msa_fill_b(limit_ptr);
1570  vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1571  limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1572 
1573  /* mask and hev */
1574  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1575  hev, mask, flat);
1576  /* flat4 */
1577  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1578  /* filter4 */
1579  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1580  q1_out);
1581 
1582  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1583 
1584  /* if flat is zero for all pixels, then no need to calculate other filter */
1585  if (__msa_test_bz_v(flat)) {
1586  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1587  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1588  ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1589  ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1590 
1591  src -= 2;
1592  ST4x8_UB(vec2, vec3, src, pitch);
1593  src += 8 * pitch;
1594  ST4x8_UB(vec4, vec5, src, pitch);
1595  } else {
1596  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1597  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1598  q3_r);
1599  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1600  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1601 
1602  /* convert 16 bit output data into 8 bit */
1603  PCKEV_B4_SH(p2_filt8_r, p2_filt8_r, p1_filt8_r, p1_filt8_r,
1604  p0_filt8_r, p0_filt8_r, q0_filt8_r, q0_filt8_r,
1605  p2_filt8_r, p1_filt8_r, p0_filt8_r, q0_filt8_r);
1606  PCKEV_B2_SH(q1_filt8_r, q1_filt8_r, q2_filt8_r, q2_filt8_r,
1607  q1_filt8_r, q2_filt8_r);
1608 
1609  /* store pixel values */
1610  p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
1611  p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
1612  p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
1613  q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
1614  q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
1615  q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
1616 
1617  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1618  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1619  ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1620  ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1621  ILVRL_B2_SH(q2, q1, vec2, vec5);
1622 
1623  src -= 3;
1624  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1625  ST2x4_UB(vec2, 0, src + 4, pitch);
1626  src += (4 * pitch);
1627  ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
1628  ST2x4_UB(vec2, 4, src + 4, pitch);
1629  src += (4 * pitch);
1630  ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
1631  ST2x4_UB(vec5, 0, src + 4, pitch);
1632  src += (4 * pitch);
1633  ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
1634  ST2x4_UB(vec5, 4, src + 4, pitch);
1635  }
1636 }
1637 
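 /* (4,8) mix vertical-edge filter over 16 rows: the roles are swapped,
  * rows 0..7 only ever keep the filter4 result and rows 8..15 may take the
  * filter8 path (the lower half of flat is cleared via __msa_insve_d). */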
1638 void ff_loop_filter_h_48_16_msa(uint8_t *src, ptrdiff_t pitch,
1639  int32_t b_limit_ptr,
1640  int32_t limit_ptr,
1641  int32_t thresh_ptr)
1642 {
1643  uint8_t *temp_src;
1644  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1645  v16u8 p1_out, p0_out, q0_out, q1_out;
1646  v16u8 flat, mask, hev, thresh, b_limit, limit;
1647  v16u8 row4, row5, row6, row7, row12, row13, row14, row15;
1648  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
1649  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
1650  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
1651  v16u8 zero = { 0 };
1652  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1653 
1654  temp_src = src - 4;
1655 
1656  LD_UB8(temp_src, pitch, p0, p1, p2, p3, row4, row5, row6, row7);
1657  temp_src += (8 * pitch);
1658  LD_UB8(temp_src, pitch, q3, q2, q1, q0, row12, row13, row14, row15);
1659 
1660  /* transpose 16x8 matrix into 8x16 */
1661  TRANSPOSE16x8_UB_UB(p0, p1, p2, p3, row4, row5, row6, row7,
1662  q3, q2, q1, q0, row12, row13, row14, row15,
1663  p3, p2, p1, p0, q0, q1, q2, q3);
1664 
1665  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1666  vec0 = (v8i16) __msa_fill_b(thresh_ptr >> 8);
1667  thresh = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) thresh);
1668 
1669  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1670  vec0 = (v8i16) __msa_fill_b(b_limit_ptr >> 8);
1671  b_limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) b_limit);
1672 
1673  limit = (v16u8) __msa_fill_b(limit_ptr);
1674  vec0 = (v8i16) __msa_fill_b(limit_ptr >> 8);
1675  limit = (v16u8) __msa_ilvr_d((v2i64) vec0, (v2i64) limit);
1676 
1677  /* mask and hev */
1678  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1679  hev, mask, flat);
1680  /* flat4 */
1681  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1682  /* filter4 */
1683  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1684  q1_out);
1685 
1686  flat = (v16u8) __msa_insve_d((v2i64) flat, 0, (v2i64) zero);
1687 
1688  /* if flat is zero for all pixels, then no need to calculate other filter */
1689  if (__msa_test_bz_v(flat)) {
1690  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1691  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1692  ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1693  ILVRL_H2_SH(vec1, vec0, vec4, vec5);
1694 
1695  src -= 2;
1696  ST4x8_UB(vec2, vec3, src, pitch);
1697  src += 8 * pitch;
1698  ST4x8_UB(vec4, vec5, src, pitch);
1699  } else {
1700  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
1701  p0_l);
1702  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
1703  q3_l);
1704 
1705  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
1706  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
1707 
1708  /* convert 16 bit output data into 8 bit */
1709  PCKEV_B4_SH(p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
1710  p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l,
1711  p2_filt8_l, p1_filt8_l, p0_filt8_l, q0_filt8_l);
1712  PCKEV_B2_SH(q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
1713  q1_filt8_l, q2_filt8_l);
1714 
1715  /* store pixel values */
1716  p2 = __msa_bmnz_v(p2, (v16u8) p2_filt8_l, flat);
1717  p1 = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_l, flat);
1718  p0 = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_l, flat);
1719  q0 = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_l, flat);
1720  q1 = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_l, flat);
1721  q2 = __msa_bmnz_v(q2, (v16u8) q2_filt8_l, flat);
1722 
1723  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1724  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1725  ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
1726  ILVRL_H2_SH(vec1, vec0, vec6, vec7);
1727  ILVRL_B2_SH(q2, q1, vec2, vec5);
1728 
1729  src -= 3;
1730  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
1731  ST2x4_UB(vec2, 0, src + 4, pitch);
1732  src += (4 * pitch);
1733  ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
1734  ST2x4_UB(vec2, 4, src + 4, pitch);
1735  src += (4 * pitch);
1736  ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
1737  ST2x4_UB(vec5, 0, src + 4, pitch);
1738  src += (4 * pitch);
1739  ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
1740  ST2x4_UB(vec5, 4, src + 4, pitch);
1741  }
1742 }
1743 
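 /* Transpose a 16x8 pixel block (16 rows of 8 bytes around the vertical
  * edge) into an 8x16 scratch buffer so the filters can work on full
  * 16-byte vectors. */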
1744 static void vp9_transpose_16x8_to_8x16(uint8_t *input, int32_t in_pitch,
1745  uint8_t *output, int32_t out_pitch)
1746 {
1747  v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
1748  v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
1749  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1750 
1751  LD_UB8(input, in_pitch,
1752  p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
1753  /* 8x8 transpose */
1754  TRANSPOSE8x8_UB_UB(p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org,
1755  p0_org, p7, p6, p5, p4, p3, p2, p1, p0);
1756  /* 8x8 transpose */
1757  ILVL_B4_SB(p5_org, p7_org, p4_org, p6_org, p1_org, p3_org, p0_org, p2_org,
1758  tmp0, tmp1, tmp2, tmp3);
1759  ILVR_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp4, tmp6);
1760  ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
1761  ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
1762  ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
1763  SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);
1764 
1765  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1766  output += (8 * out_pitch);
1767  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1768 }
1769 
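 /* Inverse of the above: write the filtered 8x16 scratch buffer back as
  * 16 rows of 8 pixels. */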
1770 static void vp9_transpose_8x16_to_16x8(uint8_t *input, int32_t in_pitch,
1771  uint8_t *output, int32_t out_pitch)
1772 {
1773  v16u8 p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o;
1774  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1775 
1776  LD_UB8(input, in_pitch, p7, p6, p5, p4, p3, p2, p1, p0);
1777  LD_UB8(input + (8 * in_pitch), in_pitch, q0, q1, q2, q3, q4, q5, q6, q7);
1778  TRANSPOSE16x8_UB_UB(p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5,
1779  q6, q7, p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o);
1780  ST_UB8(p7_o, p6_o, p5_o, p4_o, p3_o, p2_o, p1_o, p0_o, output, out_pitch);
1781 }
1782 
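 /* Full 16x16 byte transpose, used by the 16-row wd=16 vertical filter
  * both before and after filtering. */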
1783 static void vp9_transpose_16x16(uint8_t *input, int32_t in_pitch,
1784  uint8_t *output, int32_t out_pitch)
1785 {
1786  v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
1787  v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
1788  v8i16 tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
1789  v4i32 tmp2, tmp3;
1790  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1791 
1792  LD_UB8(input, in_pitch, row0, row1, row2, row3, row4, row5, row6, row7);
1793  input += (8 * in_pitch);
1794  LD_UB8(input, in_pitch,
1795  row8, row9, row10, row11, row12, row13, row14, row15);
1796 
1797  TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
1798  row8, row9, row10, row11, row12, row13, row14, row15,
1799  p7, p6, p5, p4, p3, p2, p1, p0);
1800 
1801  /* transpose 16x8 matrix into 8x16 */
1802  /* total 8 intermediate register and 32 instructions */
1803  q7 = (v16u8) __msa_ilvod_d((v2i64) row8, (v2i64) row0);
1804  q6 = (v16u8) __msa_ilvod_d((v2i64) row9, (v2i64) row1);
1805  q5 = (v16u8) __msa_ilvod_d((v2i64) row10, (v2i64) row2);
1806  q4 = (v16u8) __msa_ilvod_d((v2i64) row11, (v2i64) row3);
1807  q3 = (v16u8) __msa_ilvod_d((v2i64) row12, (v2i64) row4);
1808  q2 = (v16u8) __msa_ilvod_d((v2i64) row13, (v2i64) row5);
1809  q1 = (v16u8) __msa_ilvod_d((v2i64) row14, (v2i64) row6);
1810  q0 = (v16u8) __msa_ilvod_d((v2i64) row15, (v2i64) row7);
1811 
1812  ILVEV_B2_SH(q7, q6, q5, q4, tmp0, tmp1);
1813  tmp4 = (v8i16) __msa_ilvod_b((v16i8) q6, (v16i8) q7);
1814  tmp5 = (v8i16) __msa_ilvod_b((v16i8) q4, (v16i8) q5);
1815 
1816  ILVEV_B2_UB(q3, q2, q1, q0, q5, q7);
1817  tmp6 = (v8i16) __msa_ilvod_b((v16i8) q2, (v16i8) q3);
1818  tmp7 = (v8i16) __msa_ilvod_b((v16i8) q0, (v16i8) q1);
1819 
1820  ILVEV_H2_SW(tmp0, tmp1, q5, q7, tmp2, tmp3);
1821  q0 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1822  q4 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1823 
1824  tmp2 = (v4i32) __msa_ilvod_h(tmp1, tmp0);
1825  tmp3 = (v4i32) __msa_ilvod_h((v8i16) q7, (v8i16) q5);
1826  q2 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1827  q6 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1828 
1829  ILVEV_H2_SW(tmp4, tmp5, tmp6, tmp7, tmp2, tmp3);
1830  q1 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1831  q5 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1832 
1833  tmp2 = (v4i32) __msa_ilvod_h(tmp5, tmp4);
1834  tmp3 = (v4i32) __msa_ilvod_h(tmp7, tmp6);
1835  q3 = (v16u8) __msa_ilvev_w(tmp3, tmp2);
1836  q7 = (v16u8) __msa_ilvod_w(tmp3, tmp2);
1837 
1838  ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
1839  output += (8 * out_pitch);
1840  ST_UB8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_pitch);
1841 }
1842 
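 /* First stage of the 8-row wd=16 vertical filter: run filter4/filter8 on
  * the transposed block, store the filter8 outputs and the flat mask into
  * the 16-byte-strided filter48 scratch area, and return 1 (early exit)
  * after writing the filter4 result when no pixel needs filter8. */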
1843 static int32_t vp9_vt_lpf_t4_and_t8_8w(uint8_t *src, uint8_t *filter48,
1844  uint8_t *src_org, int32_t pitch_org,
1845  int32_t b_limit_ptr,
1846  int32_t limit_ptr,
1847  int32_t thresh_ptr)
1848 {
1849  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
1850  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
1851  v16u8 flat, mask, hev, thresh, b_limit, limit;
1852  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
1853  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
1854  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
1855  v16i8 zero = { 0 };
1856  v8i16 vec0, vec1, vec2, vec3;
1857 
1858  /* load vector elements */
1859  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
1860 
1861  thresh = (v16u8) __msa_fill_b(thresh_ptr);
1862  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
1863  limit = (v16u8) __msa_fill_b(limit_ptr);
1864 
1865  /* mask and hev */
1866  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
1867  hev, mask, flat);
1868  /* flat4 */
1869  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
1870  /* filter4 */
1871  VP9_LPF_FILTER4_8W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
1872  q1_out);
1873 
1874  flat = (v16u8) __msa_ilvr_d((v2i64) zero, (v2i64) flat);
1875 
1876  /* if flat is zero for all pixels, then no need to calculate other filter */
1877  if (__msa_test_bz_v(flat)) {
1878  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
1879  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
1880  ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
1881  return 1;
1882  } else {
1883  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
1884  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
1885  q3_r);
1886  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
1887  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
1888 
1889  /* convert 16 bit output data into 8 bit */
1890  p2_r = (v8u16) __msa_pckev_b((v16i8) p2_filt8_r, (v16i8) p2_filt8_r);
1891  p1_r = (v8u16) __msa_pckev_b((v16i8) p1_filt8_r, (v16i8) p1_filt8_r);
1892  p0_r = (v8u16) __msa_pckev_b((v16i8) p0_filt8_r, (v16i8) p0_filt8_r);
1893  q0_r = (v8u16) __msa_pckev_b((v16i8) q0_filt8_r, (v16i8) q0_filt8_r);
1894  q1_r = (v8u16) __msa_pckev_b((v16i8) q1_filt8_r, (v16i8) q1_filt8_r);
1895  q2_r = (v8u16) __msa_pckev_b((v16i8) q2_filt8_r, (v16i8) q2_filt8_r);
1896 
1897  /* store pixel values */
1898  p2_out = __msa_bmnz_v(p2, (v16u8) p2_r, flat);
1899  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_r, flat);
1900  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_r, flat);
1901  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_r, flat);
1902  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_r, flat);
1903  q2_out = __msa_bmnz_v(q2, (v16u8) q2_r, flat);
1904 
1905  ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
1906  filter48 += (4 * 16);
1907  ST_UB2(q1_out, q2_out, filter48, 16);
1908  filter48 += (2 * 16);
1909  ST_UB(flat, filter48);
1910 
1911  return 0;
1912  }
1913 }
1914 
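 /* Second stage: if flat2 is zero everywhere, the filter4/filter8 results
  * kept in filter48 are written straight back to the original image and 1
  * is returned; otherwise the wd=16 filter is applied in the transposed
  * buffer (masked by flat2) and 0 is returned so the caller transposes the
  * result back. */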
1915 static int32_t vp9_vt_lpf_t16_8w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
1916  uint8_t *filter48)
1917 {
1918  v16i8 zero = { 0 };
1919  v16u8 filter8, flat, flat2;
1920  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
1921  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
1922  v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
1923  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
1924  v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
1925  v8u16 tmp0_r, tmp1_r;
1926  v8i16 r_out;
1927 
1928  flat = LD_UB(filter48 + 6 * 16);
1929 
1930  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
1931  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
1932 
1933  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
1934 
1935  /* if flat2 is zero for all pixels, then no need to calculate other filter */
1936  if (__msa_test_bz_v(flat2)) {
1937  v8i16 vec0, vec1, vec2, vec3, vec4;
1938 
1939  LD_UB4(filter48, 16, p2, p1, p0, q0);
1940  LD_UB2(filter48 + 4 * 16, 16, q1, q2);
1941 
1942  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
1943  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
1944  vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
1945 
1946  src_org -= 3;
1947  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
1948  ST2x4_UB(vec2, 0, (src_org + 4), pitch);
1949  src_org += (4 * pitch);
1950  ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
1951  ST2x4_UB(vec2, 4, (src_org + 4), pitch);
1952 
1953  return 1;
1954  } else {
1955  src -= 7 * 16;
1956 
1957  ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
1958  zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
1959  p3_r_in, p2_r_in, p1_r_in, p0_r_in);
1960  q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
1961 
1962  tmp0_r = p7_r_in << 3;
1963  tmp0_r -= p7_r_in;
1964  tmp0_r += p6_r_in;
1965  tmp0_r += q0_r_in;
1966  tmp1_r = p6_r_in + p5_r_in;
1967  tmp1_r += p4_r_in;
1968  tmp1_r += p3_r_in;
1969  tmp1_r += p2_r_in;
1970  tmp1_r += p1_r_in;
1971  tmp1_r += p0_r_in;
1972  tmp1_r += tmp0_r;
1973 
1974  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1975  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1976  p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
1977  ST8x1_UB(p6, src);
1978  src += 16;
1979 
1980  /* p5 */
1981  q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
1982  tmp0_r = p5_r_in - p6_r_in;
1983  tmp0_r += q1_r_in;
1984  tmp0_r -= p7_r_in;
1985  tmp1_r += tmp0_r;
1986  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1987  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
1988  p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
1989  ST8x1_UB(p5, src);
1990  src += 16;
1991 
1992  /* p4 */
1993  q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
1994  tmp0_r = p4_r_in - p5_r_in;
1995  tmp0_r += q2_r_in;
1996  tmp0_r -= p7_r_in;
1997  tmp1_r += tmp0_r;
1998  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
1999  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2000  p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
2001  ST8x1_UB(p4, src);
2002  src += 16;
2003 
2004  /* p3 */
2005  q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
2006  tmp0_r = p3_r_in - p4_r_in;
2007  tmp0_r += q3_r_in;
2008  tmp0_r -= p7_r_in;
2009  tmp1_r += tmp0_r;
2010  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2011  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2012  p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
2013  ST8x1_UB(p3, src);
2014  src += 16;
2015 
2016  /* p2 */
2017  q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
2018  filter8 = LD_UB(filter48);
2019  tmp0_r = p2_r_in - p3_r_in;
2020  tmp0_r += q4_r_in;
2021  tmp0_r -= p7_r_in;
2022  tmp1_r += tmp0_r;
2023  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2024  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2025  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2026  ST8x1_UB(filter8, src);
2027  src += 16;
2028 
2029  /* p1 */
2030  q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
2031  filter8 = LD_UB(filter48 + 16);
2032  tmp0_r = p1_r_in - p2_r_in;
2033  tmp0_r += q5_r_in;
2034  tmp0_r -= p7_r_in;
2035  tmp1_r += tmp0_r;
2036  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2037  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2038  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2039  ST8x1_UB(filter8, src);
2040  src += 16;
2041 
2042  /* p0 */
2043  q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
2044  filter8 = LD_UB(filter48 + 32);
2045  tmp0_r = p0_r_in - p1_r_in;
2046  tmp0_r += q6_r_in;
2047  tmp0_r -= p7_r_in;
2048  tmp1_r += tmp0_r;
2049  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2050  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2051  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2052  ST8x1_UB(filter8, src);
2053  src += 16;
2054 
2055  /* q0 */
2056  q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
2057  filter8 = LD_UB(filter48 + 48);
2058  tmp0_r = q7_r_in - p0_r_in;
2059  tmp0_r += q0_r_in;
2060  tmp0_r -= p7_r_in;
2061  tmp1_r += tmp0_r;
2062  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2063  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2064  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2065  ST8x1_UB(filter8, src);
2066  src += 16;
2067 
2068  /* q1 */
2069  filter8 = LD_UB(filter48 + 64);
2070  tmp0_r = q7_r_in - q0_r_in;
2071  tmp0_r += q1_r_in;
2072  tmp0_r -= p6_r_in;
2073  tmp1_r += tmp0_r;
2074  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2075  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2076  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2077  ST8x1_UB(filter8, src);
2078  src += 16;
2079 
2080  /* q2 */
2081  filter8 = LD_UB(filter48 + 80);
2082  tmp0_r = q7_r_in - q1_r_in;
2083  tmp0_r += q2_r_in;
2084  tmp0_r -= p5_r_in;
2085  tmp1_r += tmp0_r;
2086  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2087  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2088  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2089  ST8x1_UB(filter8, src);
2090  src += 16;
2091 
2092  /* q3 */
2093  tmp0_r = q7_r_in - q2_r_in;
2094  tmp0_r += q3_r_in;
2095  tmp0_r -= p4_r_in;
2096  tmp1_r += tmp0_r;
2097  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2098  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2099  q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2100  ST8x1_UB(q3, src);
2101  src += 16;
2102 
2103  /* q4 */
2104  tmp0_r = q7_r_in - q3_r_in;
2105  tmp0_r += q4_r_in;
2106  tmp0_r -= p3_r_in;
2107  tmp1_r += tmp0_r;
2108  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2109  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2110  q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2111  ST8x1_UB(q4, src);
2112  src += 16;
2113 
2114  /* q5 */
2115  tmp0_r = q7_r_in - q4_r_in;
2116  tmp0_r += q5_r_in;
2117  tmp0_r -= p2_r_in;
2118  tmp1_r += tmp0_r;
2119  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2120  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2121  q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2122  ST8x1_UB(q5, src);
2123  src += 16;
2124 
2125  /* q6 */
2126  tmp0_r = q7_r_in - q5_r_in;
2127  tmp0_r += q6_r_in;
2128  tmp0_r -= p1_r_in;
2129  tmp1_r += tmp0_r;
2130  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2131  r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
2132  q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2133  ST8x1_UB(q6, src);
2134 
2135  return 0;
2136  }
2137 }
2138 
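 /* 8-row wd=16 vertical-edge filter: transpose 16x8 pixels into a local
  * aligned buffer (its tail doubles as the filter48 scratch area), run the
  * t4/t8 and t16 stages there, and transpose back only when neither stage
  * took the early exit, since those paths store their results directly. */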
2139 void ff_loop_filter_h_16_8_msa(uint8_t *src, ptrdiff_t pitch,
2140  int32_t b_limit_ptr,
2141  int32_t limit_ptr,
2142  int32_t thresh_ptr)
2143 {
2144  uint8_t early_exit = 0;
2145  uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
2146  uint8_t *filter48 = &transposed_input[16 * 16];
2147 
2148  vp9_transpose_16x8_to_8x16(src - 8, pitch, transposed_input, 16);
2149 
2150  early_exit = vp9_vt_lpf_t4_and_t8_8w((transposed_input + 16 * 8),
2151  &filter48[0], src, pitch,
2152  b_limit_ptr, limit_ptr, thresh_ptr);
2153 
2154  if (0 == early_exit) {
2155  early_exit = vp9_vt_lpf_t16_8w((transposed_input + 16 * 8), src, pitch,
2156  &filter48[0]);
2157 
2158  if (0 == early_exit) {
2159  vp9_transpose_8x16_to_16x8(transposed_input, 16, src - 8, pitch);
2160  }
2161  }
2162 }
2163 
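 /* 16-row counterpart of vp9_vt_lpf_t4_and_t8_16w's 8-row sibling above:
  * both 8-lane halves of the transposed block are filtered, and the
  * filter8 outputs plus the flat mask are saved into filter48. */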
2164 static int32_t vp9_vt_lpf_t4_and_t8_16w(uint8_t *src, uint8_t *filter48,
2165  uint8_t *src_org, ptrdiff_t pitch,
2166  int32_t b_limit_ptr,
2167  int32_t limit_ptr,
2168  int32_t thresh_ptr)
2169 {
2170  v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
2171  v16u8 p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
2172  v16u8 flat, mask, hev, thresh, b_limit, limit;
2173  v8u16 p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r;
2174  v8u16 p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
2175  v8i16 p2_filt8_r, p1_filt8_r, p0_filt8_r;
2176  v8i16 q0_filt8_r, q1_filt8_r, q2_filt8_r;
2177  v8i16 p2_filt8_l, p1_filt8_l, p0_filt8_l;
2178  v8i16 q0_filt8_l, q1_filt8_l, q2_filt8_l;
2179  v16i8 zero = { 0 };
2180  v8i16 vec0, vec1, vec2, vec3, vec4, vec5;
2181 
2182  /* load vector elements */
2183  LD_UB8(src - (4 * 16), 16, p3, p2, p1, p0, q0, q1, q2, q3);
2184 
2185  thresh = (v16u8) __msa_fill_b(thresh_ptr);
2186  b_limit = (v16u8) __msa_fill_b(b_limit_ptr);
2187  limit = (v16u8) __msa_fill_b(limit_ptr);
2188 
2189  /* mask and hev */
2190  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
2191  hev, mask, flat);
2192  /* flat4 */
2193  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
2194  /* filter4 */
2195  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
2196  q1_out);
2197 
2198  /* if flat is zero for all pixels, then no need to calculate other filter */
2199  if (__msa_test_bz_v(flat)) {
2200  ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2201  ILVRL_H2_SH(vec1, vec0, vec2, vec3);
2202  ILVL_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
2203  ILVRL_H2_SH(vec1, vec0, vec4, vec5);
2204 
2205  src_org -= 2;
2206  ST4x8_UB(vec2, vec3, src_org, pitch);
2207  src_org += 8 * pitch;
2208  ST4x8_UB(vec4, vec5, src_org, pitch);
2209 
2210  return 1;
2211  } else {
2212  ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
2213  zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
2214  q3_r);
2215  VP9_FILTER8(p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r, q3_r, p2_filt8_r,
2216  p1_filt8_r, p0_filt8_r, q0_filt8_r, q1_filt8_r, q2_filt8_r);
2217  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
2218  p0_l);
2219  ILVL_B4_UH(zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l, q2_l,
2220  q3_l);
2221  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
2222  p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
2223 
2224  /* convert 16 bit output data into 8 bit */
2225  PCKEV_B4_SH(p2_filt8_l, p2_filt8_r, p1_filt8_l, p1_filt8_r, p0_filt8_l,
2226  p0_filt8_r, q0_filt8_l, q0_filt8_r, p2_filt8_r, p1_filt8_r,
2227  p0_filt8_r, q0_filt8_r);
2228  PCKEV_B2_SH(q1_filt8_l, q1_filt8_r, q2_filt8_l, q2_filt8_r, q1_filt8_r,
2229  q2_filt8_r);
2230 
2231  /* store pixel values */
2232  p2_out = __msa_bmnz_v(p2, (v16u8) p2_filt8_r, flat);
2233  p1_out = __msa_bmnz_v(p1_out, (v16u8) p1_filt8_r, flat);
2234  p0_out = __msa_bmnz_v(p0_out, (v16u8) p0_filt8_r, flat);
2235  q0_out = __msa_bmnz_v(q0_out, (v16u8) q0_filt8_r, flat);
2236  q1_out = __msa_bmnz_v(q1_out, (v16u8) q1_filt8_r, flat);
2237  q2_out = __msa_bmnz_v(q2, (v16u8) q2_filt8_r, flat);
2238 
2239  ST_UB4(p2_out, p1_out, p0_out, q0_out, filter48, 16);
2240  filter48 += (4 * 16);
2241  ST_UB2(q1_out, q2_out, filter48, 16);
2242  filter48 += (2 * 16);
2243  ST_UB(flat, filter48);
2244 
2245  return 0;
2246  }
2247 }
2248 
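 /* 16-row counterpart of vp9_vt_lpf_t16_8w: same two-stage logic, but both
  * the right and left halves of every vector are filtered and stored with
  * full 16-byte writes. */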
2249 static int32_t vp9_vt_lpf_t16_16w(uint8_t *src, uint8_t *src_org, ptrdiff_t pitch,
2250  uint8_t *filter48)
2251 {
2252  v16u8 flat, flat2, filter8;
2253  v16i8 zero = { 0 };
2254  v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
2255  v8u16 p7_r_in, p6_r_in, p5_r_in, p4_r_in;
2256  v8u16 p3_r_in, p2_r_in, p1_r_in, p0_r_in;
2257  v8u16 q7_r_in, q6_r_in, q5_r_in, q4_r_in;
2258  v8u16 q3_r_in, q2_r_in, q1_r_in, q0_r_in;
2259  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
2260  v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
2261  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
2262  v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
2263  v8u16 tmp0_r, tmp1_r, tmp0_l, tmp1_l;
2264  v8i16 l_out, r_out;
2265 
2266  flat = LD_UB(filter48 + 6 * 16);
2267 
2268  LD_UB8((src - 8 * 16), 16, p7, p6, p5, p4, p3, p2, p1, p0);
2269  LD_UB8(src, 16, q0, q1, q2, q3, q4, q5, q6, q7);
2270 
2271  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
2272 
2273  /* if flat2 is zero for all pixels, then no need to calculate other filter */
2274  if (__msa_test_bz_v(flat2)) {
2275  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2276 
2277  LD_UB4(filter48, 16, p2, p1, p0, q0);
2278  LD_UB2(filter48 + 4 * 16, 16, q1, q2);
2279 
2280  ILVR_B2_SH(p1, p2, q0, p0, vec0, vec1);
2281  ILVRL_H2_SH(vec1, vec0, vec3, vec4);
2282  ILVL_B2_SH(p1, p2, q0, p0, vec0, vec1);
2283  ILVRL_H2_SH(vec1, vec0, vec6, vec7);
2284  ILVRL_B2_SH(q2, q1, vec2, vec5);
2285 
2286  src_org -= 3;
2287  ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
2288  ST2x4_UB(vec2, 0, (src_org + 4), pitch);
2289  src_org += (4 * pitch);
2290  ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
2291  ST2x4_UB(vec2, 4, (src_org + 4), pitch);
2292  src_org += (4 * pitch);
2293  ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
2294  ST2x4_UB(vec5, 0, (src_org + 4), pitch);
2295  src_org += (4 * pitch);
2296  ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
2297  ST2x4_UB(vec5, 4, (src_org + 4), pitch);
2298 
2299  return 1;
2300  } else {
2301  src -= 7 * 16;
2302 
2303  ILVR_B8_UH(zero, p7, zero, p6, zero, p5, zero, p4, zero, p3, zero, p2,
2304  zero, p1, zero, p0, p7_r_in, p6_r_in, p5_r_in, p4_r_in,
2305  p3_r_in, p2_r_in, p1_r_in, p0_r_in);
2306  q0_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q0);
2307 
2308  tmp0_r = p7_r_in << 3;
2309  tmp0_r -= p7_r_in;
2310  tmp0_r += p6_r_in;
2311  tmp0_r += q0_r_in;
2312  tmp1_r = p6_r_in + p5_r_in;
2313  tmp1_r += p4_r_in;
2314  tmp1_r += p3_r_in;
2315  tmp1_r += p2_r_in;
2316  tmp1_r += p1_r_in;
2317  tmp1_r += p0_r_in;
2318  tmp1_r += tmp0_r;
2319  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2320 
2321  ILVL_B4_UH(zero, p7, zero, p6, zero, p5, zero, p4, p7_l_in, p6_l_in,
2322  p5_l_in, p4_l_in);
2323  ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l_in, p2_l_in,
2324  p1_l_in, p0_l_in);
2325  q0_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q0);
2326 
2327  tmp0_l = p7_l_in << 3;
2328  tmp0_l -= p7_l_in;
2329  tmp0_l += p6_l_in;
2330  tmp0_l += q0_l_in;
2331  tmp1_l = p6_l_in + p5_l_in;
2332  tmp1_l += p4_l_in;
2333  tmp1_l += p3_l_in;
2334  tmp1_l += p2_l_in;
2335  tmp1_l += p1_l_in;
2336  tmp1_l += p0_l_in;
2337  tmp1_l += tmp0_l;
2338  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2339 
2340  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2341  p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
2342  ST_UB(p6, src);
2343  src += 16;
2344 
2345  /* p5 */
2346  q1_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q1);
2347  tmp0_r = p5_r_in - p6_r_in;
2348  tmp0_r += q1_r_in;
2349  tmp0_r -= p7_r_in;
2350  tmp1_r += tmp0_r;
2351  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2352  q1_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q1);
2353  tmp0_l = p5_l_in - p6_l_in;
2354  tmp0_l += q1_l_in;
2355  tmp0_l -= p7_l_in;
2356  tmp1_l += tmp0_l;
2357  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2358  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2359  p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
2360  ST_UB(p5, src);
2361  src += 16;
2362 
2363  /* p4 */
2364  q2_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q2);
2365  tmp0_r = p4_r_in - p5_r_in;
2366  tmp0_r += q2_r_in;
2367  tmp0_r -= p7_r_in;
2368  tmp1_r += tmp0_r;
2369  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2370  q2_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q2);
2371  tmp0_l = p4_l_in - p5_l_in;
2372  tmp0_l += q2_l_in;
2373  tmp0_l -= p7_l_in;
2374  tmp1_l += tmp0_l;
2375  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2376  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2377  p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
2378  ST_UB(p4, src);
2379  src += 16;
2380 
2381  /* p3 */
2382  q3_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q3);
2383  tmp0_r = p3_r_in - p4_r_in;
2384  tmp0_r += q3_r_in;
2385  tmp0_r -= p7_r_in;
2386  tmp1_r += tmp0_r;
2387  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2388  q3_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q3);
2389  tmp0_l = p3_l_in - p4_l_in;
2390  tmp0_l += q3_l_in;
2391  tmp0_l -= p7_l_in;
2392  tmp1_l += tmp0_l;
2393  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2394  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2395  p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
2396  ST_UB(p3, src);
2397  src += 16;
2398 
2399  /* p2 */
2400  q4_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q4);
2401  filter8 = LD_UB(filter48);
2402  tmp0_r = p2_r_in - p3_r_in;
2403  tmp0_r += q4_r_in;
2404  tmp0_r -= p7_r_in;
2405  tmp1_r += tmp0_r;
2406  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2407  q4_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q4);
2408  tmp0_l = p2_l_in - p3_l_in;
2409  tmp0_l += q4_l_in;
2410  tmp0_l -= p7_l_in;
2411  tmp1_l += tmp0_l;
2412  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2413  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2414  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2415  ST_UB(filter8, src);
2416  src += 16;
2417 
2418  /* p1 */
2419  q5_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q5);
2420  filter8 = LD_UB(filter48 + 16);
2421  tmp0_r = p1_r_in - p2_r_in;
2422  tmp0_r += q5_r_in;
2423  tmp0_r -= p7_r_in;
2424  tmp1_r += tmp0_r;
2425  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2426  q5_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q5);
2427  tmp0_l = p1_l_in - p2_l_in;
2428  tmp0_l += q5_l_in;
2429  tmp0_l -= p7_l_in;
2430  tmp1_l += tmp0_l;
2431  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2432  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2433  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2434  ST_UB(filter8, src);
2435  src += 16;
2436 
2437  /* p0 */
2438  q6_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q6);
2439  filter8 = LD_UB(filter48 + 32);
2440  tmp0_r = p0_r_in - p1_r_in;
2441  tmp0_r += q6_r_in;
2442  tmp0_r -= p7_r_in;
2443  tmp1_r += tmp0_r;
2444  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2445  q6_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q6);
2446  tmp0_l = p0_l_in - p1_l_in;
2447  tmp0_l += q6_l_in;
2448  tmp0_l -= p7_l_in;
2449  tmp1_l += tmp0_l;
2450  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2451  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2452  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2453  ST_UB(filter8, src);
2454  src += 16;
2455 
2456  /* q0 */
2457  q7_r_in = (v8u16) __msa_ilvr_b(zero, (v16i8) q7);
2458  filter8 = LD_UB(filter48 + 48);
2459  tmp0_r = q7_r_in - p0_r_in;
2460  tmp0_r += q0_r_in;
2461  tmp0_r -= p7_r_in;
2462  tmp1_r += tmp0_r;
2463  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2464  q7_l_in = (v8u16) __msa_ilvl_b(zero, (v16i8) q7);
2465  tmp0_l = q7_l_in - p0_l_in;
2466  tmp0_l += q0_l_in;
2467  tmp0_l -= p7_l_in;
2468  tmp1_l += tmp0_l;
2469  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2470  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2471  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2472  ST_UB(filter8, src);
2473  src += 16;
2474 
2475  /* q1 */
2476  filter8 = LD_UB(filter48 + 64);
2477  tmp0_r = q7_r_in - q0_r_in;
2478  tmp0_r += q1_r_in;
2479  tmp0_r -= p6_r_in;
2480  tmp1_r += tmp0_r;
2481  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2482  tmp0_l = q7_l_in - q0_l_in;
2483  tmp0_l += q1_l_in;
2484  tmp0_l -= p6_l_in;
2485  tmp1_l += tmp0_l;
2486  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2487  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2488  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2489  ST_UB(filter8, src);
2490  src += 16;
2491 
2492  /* q2 */
2493  filter8 = LD_UB(filter48 + 80);
2494  tmp0_r = q7_r_in - q1_r_in;
2495  tmp0_r += q2_r_in;
2496  tmp0_r -= p5_r_in;
2497  tmp1_r += tmp0_r;
2498  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2499  tmp0_l = q7_l_in - q1_l_in;
2500  tmp0_l += q2_l_in;
2501  tmp0_l -= p5_l_in;
2502  tmp1_l += tmp0_l;
2503  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2504  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2505  filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
2506  ST_UB(filter8, src);
2507  src += 16;
2508 
2509  /* q3 */
2510  tmp0_r = q7_r_in - q2_r_in;
2511  tmp0_r += q3_r_in;
2512  tmp0_r -= p4_r_in;
2513  tmp1_r += tmp0_r;
2514  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2515  tmp0_l = q7_l_in - q2_l_in;
2516  tmp0_l += q3_l_in;
2517  tmp0_l -= p4_l_in;
2518  tmp1_l += tmp0_l;
2519  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2520  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2521  q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
2522  ST_UB(q3, src);
2523  src += 16;
2524 
2525  /* q4 */
2526  tmp0_r = q7_r_in - q3_r_in;
2527  tmp0_r += q4_r_in;
2528  tmp0_r -= p3_r_in;
2529  tmp1_r += tmp0_r;
2530  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2531  tmp0_l = q7_l_in - q3_l_in;
2532  tmp0_l += q4_l_in;
2533  tmp0_l -= p3_l_in;
2534  tmp1_l += tmp0_l;
2535  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2536  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2537  q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
2538  ST_UB(q4, src);
2539  src += 16;
2540 
2541  /* q5 */
2542  tmp0_r = q7_r_in - q4_r_in;
2543  tmp0_r += q5_r_in;
2544  tmp0_r -= p2_r_in;
2545  tmp1_r += tmp0_r;
2546  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2547  tmp0_l = q7_l_in - q4_l_in;
2548  tmp0_l += q5_l_in;
2549  tmp0_l -= p2_l_in;
2550  tmp1_l += tmp0_l;
2551  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2552  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2553  q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
2554  ST_UB(q5, src);
2555  src += 16;
2556 
2557  /* q6 */
2558  tmp0_r = q7_r_in - q5_r_in;
2559  tmp0_r += q6_r_in;
2560  tmp0_r -= p1_r_in;
2561  tmp1_r += tmp0_r;
2562  r_out = __msa_srari_h((v8i16) tmp1_r, 4);
2563  tmp0_l = q7_l_in - q5_l_in;
2564  tmp0_l += q6_l_in;
2565  tmp0_l -= p1_l_in;
2566  tmp1_l += tmp0_l;
2567  l_out = __msa_srari_h((v8i16) tmp1_l, 4);
2568  r_out = (v8i16) __msa_pckev_b((v16i8) l_out, (v16i8) r_out);
2569  q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
2570  ST_UB(q6, src);
2571 
2572  return 0;
2573  }
2574 }
2575 
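 /* 16-row wd=16 vertical-edge filter: same structure as the 8-row version,
  * but using a full 16x16 transpose in both directions. */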
2576 void ff_loop_filter_h_16_16_msa(uint8_t *src, ptrdiff_t pitch,
2577  int32_t b_limit_ptr,
2578  int32_t limit_ptr,
2579  int32_t thresh_ptr)
2580 {
2581  uint8_t early_exit = 0;
2582  uint8_t transposed_input[16 * 24] ALLOC_ALIGNED(ALIGNMENT);
2583  uint8_t *filter48 = &transposed_input[16 * 16];
2584 
2585  vp9_transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
2586 
2587  early_exit = vp9_vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8),
2588  &filter48[0], src, pitch,
2589  b_limit_ptr, limit_ptr, thresh_ptr);
2590 
2591  if (0 == early_exit) {
2592  early_exit = vp9_vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch,
2593  &filter48[0]);
2594 
2595  if (0 == early_exit) {
2596  vp9_transpose_16x16(transposed_input, 16, (src - 8), pitch);
2597  }
2598  }
2599 }