hevc_mc_uni_msa.c
/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

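/* This file implements the uni-directional (non-weighted) HEVC motion
 * compensation kernels for MIPS using the 128-bit MSA vector intrinsics:
 * plain block copies plus 8-tap and 4-tap horizontal, vertical and 2-D
 * (hv) interpolation filters for the block widths HEVC uses (4 to 64).
 * Helpers such as LD_UB, ST_UB, VSHF_B2_SB, DOTP_SB2_SH and
 * HEVC_FILT_8TAP come from the MSA macro headers included above. */
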
static void copy_width8_msa(uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    if (2 == height) {
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (6 == height) {
        LD4(src, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            LD4(src, src_stride, out4, out5, out6, out7);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void copy_width12_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
    dst += (8 * dst_stride);
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
}

static void copy_width16_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (12 == height) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        dst += (8 * dst_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        dst += (4 * dst_stride);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
                   src7);
            src += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
                   dst_stride);
            dst += (8 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void copy_width24_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    for (cnt = 4; cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD4(src + 16, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        LD4(src + 16, src_stride, out4, out5, out6, out7);
        src += (4 * src_stride);

        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        SD4(out0, out1, out2, out3, dst + 16, dst_stride);
        dst += (4 * dst_stride);
        SD4(out4, out5, out6, out7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width32_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width48_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
        src += (4 * src_stride);

        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width64_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_UB4(src, 16, src4, src5, src6, src7);
        src += src_stride;
        LD_UB4(src, 16, src8, src9, src10, src11);
        src += src_stride;
        LD_UB4(src, 16, src12, src13, src14, src15);
        src += src_stride;

        ST_UB4(src0, src1, src2, src3, dst, 16);
        dst += dst_stride;
        ST_UB4(src4, src5, src6, src7, dst, 16);
        dst += dst_stride;
        ST_UB4(src8, src9, src10, src11, dst, 16);
        dst += dst_stride;
        ST_UB4(src12, src13, src14, src15, dst, 16);
        dst += dst_stride;
    }
}

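/* Shuffle-mask table for the horizontal filters. Each 16-byte row is a
 * control vector for the VSHF byte shuffles that gather the sliding
 * pixel windows fed to the tap-pair dot products: row 0 serves the
 * 8-wide cases; rows 1 and 2 serve the 4-wide cases, where two 4-pixel
 * rows share one vector and the 16+ indices reach into the second
 * source register of the two-register shuffle. */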
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};

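/* 8-tap filtering expressed as four dot-product/accumulate steps on
 * signed byte data: vec0..vec3 hold interleaved pixel pairs for taps
 * (0,1), (2,3), (4,5), (6,7), and filt0..filt3 the matching duplicated
 * coefficient pairs. The two partial sums are combined with a
 * saturating add, i.e. per output pixel roughly
 * sat16(p0*f0 + p1*f1 + ... + p7*f7) at 16-bit precision. */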
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,                 \
                            filt0, filt1, filt2, filt3)             \
( {                                                                 \
    v8i16 tmp0, tmp1;                                               \
                                                                    \
    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);             \
    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);      \
    tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2);             \
    tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3);      \
    tmp0 = __msa_adds_s_h(tmp0, tmp1);                              \
                                                                    \
    tmp0;                                                           \
} )

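/* The HORIZ_* helpers below apply the same dot-product scheme to whole
 * vectors: VSHF_B2/VSHF_B4 gather the shifted pixel windows through the
 * masks, DOTP starts a tap-pair sum and DPADD accumulates onto it. The
 * 4WID variants pack two 4-pixel rows per vector; the 8WID variants
 * produce one 8-pixel row per output vector. */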
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                 \
                                   mask0, mask1, mask2, mask3,             \
                                   filt0, filt1, filt2, filt3,             \
                                   out0, out1)                             \
{                                                                          \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;  \
    v8i16 res0_m, res1_m, res2_m, res3_m;                                  \
                                                                           \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);      \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m);             \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);      \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m);            \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);      \
    DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m);             \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);      \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m);            \
    ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);               \
}

#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, mask2, mask3,               \
                                   filt0, filt1, filt2, filt3,               \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
    v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m;    \
                                                                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                res0_m, res1_m, res2_m, res3_m);                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,  \
                res4_m, res5_m, res6_m, res7_m);                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
                 res0_m, res1_m, res2_m, res3_m);                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
                 res4_m, res5_m, res6_m, res7_m);                            \
    ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m,      \
                res7_m, out0, out1, out2, out3);                             \
}

#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)           \
( {                                                             \
    v8i16 tmp0;                                                 \
                                                                \
    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \
    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
                                                                \
    tmp0;                                                       \
} )

#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
                                   mask0, mask1, filt0, filt1,         \
                                   out0, out1)                         \
{                                                                      \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                              \
                                                                       \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
}

#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, filt0, filt1,               \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                    \
                                                                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                out0, out1, out2, out3);                                     \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);        \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3);                                    \
}

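/* Horizontal 8-tap luma filters. Common flow: back src up by 3 (the
 * filter is centered on the output pixel), splat the eight coefficients
 * into tap-pair vectors, XOR the inputs with 128 to work in signed
 * bytes, filter, then round (SRARI by 6), saturate to 7 bits and pack
 * back to unsigned bytes; PCKEV_XORI128_UB undoes the earlier bias. */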
static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1;

    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    SRARI_H2_SH(out0, out1, 6);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);

    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (4 == height) {
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    } else if (16 == height) {
        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
    }
}

static void common_hz_8t_8x4_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    tmp0 = PCKEV_XORI128_UB(out0, out1);
    tmp1 = PCKEV_XORI128_UB(out2, out3);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
}

static void common_hz_8t_8x8mult_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (4 == height) {
        common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
    } else {
        common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
                                 height);
    }
}

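/* 12-wide is handled as an 8-wide column (masks built from row 0 of
 * mc_filt_mask_arr) plus a 4-wide column at offset +8 (masks from
 * row 1), sharing one load of the filter coefficients. */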
static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint8_t *src1_ptr, *dst1;
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00, tmp0, tmp1;

    mask00 = LD_UB(&mc_filt_mask_arr[0]);
    mask0 = LD_UB(&mc_filt_mask_arr[16]);

    src1_ptr = src - 3;
    dst1 = dst;

    dst = dst1 + 8;
    src = src1_ptr + 8;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask00 + 2;
    mask2 = mask00 + 4;
    mask3 = mask00 + 6;
    mask4 = mask0 + 2;
    mask5 = mask0 + 4;
    mask6 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* 8 width */
        LD_SB4(src1_ptr, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src1_ptr += (4 * src_stride);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask00, mask1,
                                   mask2, mask3, filt0, filt1, filt2, filt3,
                                   out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
        dst1 += (4 * dst_stride);

        /* 4 width */
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);
        HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask4, mask5,
                                   mask6, filt0, filt1, filt2, filt3, out0,
                                   out1);
        SRARI_H2_SH(out0, out1, 6);
        SAT_SH2_SH(out0, out1, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (2 * src_stride);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10;
    v8i16 out11, filt;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 16, src_stride, src1, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (2 * src_stride);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
                    out8, out2, out9);
        DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2, out4,
                    out10, out6, out11);
        DOTP_SB2_SH(vec1, vec3, filt2, filt2, out5, out7);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
                     out4, out10, out6, out11);
        DPADD_SB2_SH(vec5, vec7, filt3, filt3, out5, out7);
        ADDS_SH4_SH(out0, out4, out8, out10, out2, out6, out9, out11, out0,
                    out8, out2, out9);
        ADDS_SH2_SH(out1, out5, out3, out7, out1, out3);
        SRARI_H4_SH(out0, out8, out2, out9, 6);
        SRARI_H2_SH(out1, out3, 6);
        SAT_SH4_SH(out0, out8, out2, out9, 7);
        SAT_SH2_SH(out1, out3, 7);
        out = PCKEV_XORI128_UB(out8, out9);
        ST8x2_UB(out, dst + 16, dst_stride);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0 = LD_SB(src);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src1 = __msa_sldi_b(src2, src0, 8);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);

        src0 = LD_SB(src);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src1 = __msa_sldi_b(src2, src0, 8);
        src += src_stride;

        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;

        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}

static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v8i16 filt, out0, out1, out2, out3, out4, out5, out6;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB3(src, 16, src0, src2, src3);
        src1 = __msa_sldi_b(src2, src0, 8);

        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out3, out4);
        out5 = __msa_dpadd_s_h(out5, vec2, filt3);
        ADDS_SH2_SH(out0, out3, out1, out4, out0, out1);
        out2 = __msa_adds_s_h(out2, out5);
        SRARI_H2_SH(out0, out1, 6);
        out6 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out6, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);

        src1 = LD_SB(src + 40);
        src += src_stride;
        src1 = (v16i8) __msa_xori_b((v16u8) src1, 128);

        VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask4, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask5, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask6, mask2, mask2,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
        VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask7, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out3, out4);
        out5 = __msa_dpadd_s_h(out5, vec2, filt3);
        ADDS_SH2_SH(out0, out3, out1, out4, out3, out4);
        out5 = __msa_adds_s_h(out2, out5);
        SRARI_H2_SH(out3, out4, 6);
        out5 = __msa_srari_h(out5, 6);
        SAT_SH3_SH(out3, out4, out5, 7);
        out = PCKEV_XORI128_UB(out6, out3);
        ST_UB(out, dst + 16);
        out = PCKEV_XORI128_UB(out4, out5);
        ST_UB(out, dst + 32);
        dst += dst_stride;
    }
}

static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height; loop_cnt--;) {
        src0 = LD_SB(src);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src1 = __msa_sldi_b(src2, src0, 8);

        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                                   mask2, mask3, filt0, filt1, filt2, filt3,
                                   out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);

        src0 = LD_SB(src + 32);
        src2 = LD_SB(src + 48);
        src3 = LD_SB(src + 56);
        src1 = __msa_sldi_b(src2, src0, 8);
        src += src_stride;

        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                                   mask2, mask3, filt0, filt1, filt2, filt3,
                                   out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst + 32);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 48);
        dst += dst_stride;
    }
}

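/* Vertical 8-tap filters. Instead of shuffling within a row, adjacent
 * rows are interleaved pairwise (ILVR_B for the low halves, ILVL_B for
 * the high halves) so that each byte pair holds two vertically adjacent
 * pixels, which feeds the same dot-product macros as the horizontal
 * path. A sliding window of seven history rows is carried across loop
 * iterations. */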
static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v16u8 out;
    v8i16 filt, out10, out32;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
               src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
               src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
        XORI_B2_128_SB(src8776, src10998);
        out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
                                    filt1, filt2, filt3);
        out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
                                    filt1, filt2, filt3);
        SRARI_H2_SH(out10, out32, 6);
        SAT_SH2_SH(out10, out32, 7);
        out = PCKEV_XORI128_UB(out10, out32);
        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        src2110 = src6554;
        src4332 = src8776;
        src6554 = src10998;
        src6 = src10;
    }
}

static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
               src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
                                     filt0, filt1, filt2, filt3);
        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
                                     filt0, filt1, filt2, filt3);
        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
                                     filt0, filt1, filt2, filt3);
        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
                                     filt0, filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    uint32_t out2, out3;
    uint64_t out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
    v16i8 res2, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 vec01, vec23, vec45, vec67, tmp0, tmp1, tmp2;
    v8i16 filt, filt0, filt1, filt2, filt3;
    v4i32 mask = { 2, 6, 2, 6 };

    src -= (3 * src_stride);

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* 4 width */
    VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);
    VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
    VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src7, src8);
        XORI_B2_128_SB(src7, src8);
        src += (2 * src_stride);

        ILVR_B4_SH(src1, src0, src3, src2, src5, src4, src7, src6,
                   vec01, vec23, vec45, vec67);
        tmp0 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
                                   filt2, filt3);
        ILVR_B4_SH(src2, src1, src4, src3, src6, src5, src8, src7, vec01,
                   vec23, vec45, vec67);
        tmp1 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
                                   filt2, filt3);

        /* 4 width */
        VSHF_W2_SB(src6, src7, src7, src8, mask, mask, vec6, vec7);
        ILVR_B4_SH(vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec01,
                   vec23, vec45, vec67);
        tmp2 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
                                   filt2, filt3);
        SRARI_H2_SH(tmp0, tmp1, 6);
        tmp2 = __msa_srari_h(tmp2, 6);
        SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
        PCKEV_B3_SB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, res0, res1, res2);
        XORI_B3_128_SB(res0, res1, res2);

        out0 = __msa_copy_u_d((v2i64) res0, 0);
        out1 = __msa_copy_u_d((v2i64) res1, 0);
        out2 = __msa_copy_u_w((v4i32) res2, 0);
        out3 = __msa_copy_u_w((v4i32) res2, 1);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;

        src0 = src2;
        src1 = src3;
        src2 = src4;
        src3 = src5;
        src4 = src6;
        src5 = src7;
        src6 = src8;
        vec0 = vec2;
        vec1 = vec3;
        vec2 = vec4;
        vec3 = vec5;
        vec4 = vec6;
        vec5 = vec7;
    }
}

static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
               src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
               src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
                                     filt0, filt1, filt2, filt3);
        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
                                     filt0, filt1, filt2, filt3);
        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
                                     filt0, filt1, filt2, filt3);
        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
                                     filt0, filt1, filt2, filt3);
        out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
                                     filt0, filt1, filt2, filt3);
        out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
                                     filt0, filt1, filt2, filt3);
        out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
                                     filt0, filt1, filt2, filt3);
        out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
                                     filt0, filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

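/* Width-multiple driver: runs the 16-wide vertical filter over
 * (width / 16) column strips; the 24-, 32-, 48- and 64-wide entry
 * points below are thin wrappers around it (24-wide adds an 8-wide
 * strip at offset +16). */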
static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            XORI_B4_128_SB(src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_r, src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_l, src87_l, src98_l, src109_l);
            out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
                                         filt0, filt1, filt2, filt3);
            out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
                                         filt0, filt1, filt2, filt3);
            out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
                                         filt0, filt1, filt2, filt3);
            out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
                                         filt0, filt1, filt2, filt3);
            out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
                                         filt0, filt1, filt2, filt3);
            out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
                                         filt0, filt1, filt2, filt3);
            out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
                                         filt0, filt1, filt2, filt3);
            out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
                                         filt0, filt1, filt2, filt3);
            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r,
                        out3_l, out3_r, tmp0, tmp1, tmp2, tmp3);
            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}

static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              16);

    common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
                        height);
}

static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              32);
}

static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              48);
}

static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              64);
}

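/* 2-D (hv) 8-tap filters: a horizontal 8-tap pass produces 16-bit
 * intermediates biased by const_vec = 128 << 6, which cancels the -128
 * byte bias introduced by the XOR (the filter taps sum to 64); a
 * vertical 8-tap pass on those intermediates then runs at 32-bit
 * precision via HEVC_FILT_8TAP, followed by >> 6, rounding, clipping
 * to 0..255 and packing back to bytes. */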
static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
    v4i32 dst0_r, dst1_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
    v8i16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    dst30 = const_vec;
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                 dst30, dst30, dst30, dst30);
    dst41 = const_vec;
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                 dst41, dst41, dst41, dst41);
    dst52 = const_vec;
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                 dst52, dst52, dst52, dst52);
    dst63 = const_vec;
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                 dst63, dst63, dst63, dst63);

    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
               dst10_r, dst21_r, dst32_r);
    dst43_r = __msa_ilvl_h(dst41, dst30);
    dst54_r = __msa_ilvl_h(dst52, dst41);
    dst65_r = __msa_ilvl_h(dst63, dst52);
    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 1; loop_cnt--;) {
        LD_SB2(src, src_stride, src7, src8);
        src += 2 * src_stride;
        XORI_B2_128_SB(src7, src8);

        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst87 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst87, dst87, dst87, dst87);

        dst76_r = __msa_ilvr_h(dst87, dst66);
        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst87_r = __msa_vshf_h(mask4, dst87, dst87);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        dst0_r >>= 6;
        dst1_r >>= 6;
        SRARI_W2_SW(dst0_r, dst1_r, 6);
        dst0_r = CLIP_SW_0_255(dst0_r);
        dst1_r = CLIP_SW_0_255(dst1_r);

        HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
        ST4x2_UB(dst0_r, dst, dst_stride);
        dst += (2 * dst_stride);

        dst10_r = dst32_r;
        dst32_r = dst54_r;
        dst54_r = dst76_r;
        dst21_r = dst43_r;
        dst43_r = dst65_r;
        dst65_r = dst87_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
    }
}

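/* Generic hv kernel processing the block as (width / 8) strips of 8
 * pixels, producing two output rows per iteration; the wider hv entry
 * points below delegate to it, and the 12-wide case adds a 4-wide strip
 * at offset +8. */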
1405  int32_t src_stride,
1406  uint8_t *dst,
1407  int32_t dst_stride,
1408  const int8_t *filter_x,
1409  const int8_t *filter_y,
1411 {
1412  uint32_t loop_cnt, cnt;
1413  uint8_t *src_tmp;
1414  uint8_t *dst_tmp;
1415  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1416  v8i16 filt0, filt1, filt2, filt3;
1417  v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1418  v16i8 mask1, mask2, mask3;
1419  v8i16 filter_vec, const_vec;
1420  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1421  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1422  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1423  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1424  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1425  v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1426  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1427  v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1428  v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1429 
1430  src -= ((3 * src_stride) + 3);
1431  const_vec = __msa_ldi_h(128);
1432  const_vec <<= 6;
1433 
1434  filter_vec = LD_SH(filter_x);
1435  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1436 
1437  filter_vec = LD_SH(filter_y);
1438  vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1439  filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1440 
1441  SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1442 
1443  mask1 = mask0 + 2;
1444  mask2 = mask0 + 4;
1445  mask3 = mask0 + 6;
1446 
1447  for (cnt = width >> 3; cnt--;) {
1448  src_tmp = src;
1449  dst_tmp = dst;
1450 
1451  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1452  src_tmp += (7 * src_stride);
1453  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1454 
1455  /* row 0 row 1 row 2 row 3 */
1456  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1457  vec0, vec1, vec2, vec3);
1458  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1459  vec4, vec5, vec6, vec7);
1460  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1461  vec8, vec9, vec10, vec11);
1462  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1463  vec12, vec13, vec14, vec15);
1464  dst0 = const_vec;
1465  DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1466  dst0, dst0, dst0, dst0);
1467  dst1 = const_vec;
1468  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1469  dst1, dst1, dst1, dst1);
1470  dst2 = const_vec;
1471  DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1472  dst2, dst2, dst2, dst2);
1473  dst3 = const_vec;
1474  DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1475  dst3, dst3, dst3, dst3);
1476 
1477  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1478  vec0, vec1, vec2, vec3);
1479  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1480  vec4, vec5, vec6, vec7);
1481  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1482  vec8, vec9, vec10, vec11);
1483  dst4 = const_vec;
1484  DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1485  dst4, dst4, dst4, dst4);
1486  dst5 = const_vec;
1487  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1488  dst5, dst5, dst5, dst5);
1489  dst6 = const_vec;
1490  DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1491  dst6, dst6, dst6, dst6);
1492 
1493  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1494  dst10_r, dst32_r, dst54_r, dst21_r);
1495  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1496  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1497  dst10_l, dst32_l, dst54_l, dst21_l);
1498  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1499 
1500  for (loop_cnt = height >> 1; loop_cnt--;) {
1501  LD_SB2(src_tmp, src_stride, src7, src8);
1502  XORI_B2_128_SB(src7, src8);
1503  src_tmp += 2 * src_stride;
1504 
1505  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1506  vec0, vec1, vec2, vec3);
1507  dst7 = const_vec;
1508  DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1509  dst7, dst7, dst7, dst7);
1510 
1511  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1512  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1513  filt_h0, filt_h1, filt_h2, filt_h3);
1514  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1515  filt_h0, filt_h1, filt_h2, filt_h3);
1516  dst0_r >>= 6;
1517  dst0_l >>= 6;
1518 
1519  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1520  vec0, vec1, vec2, vec3);
1521  dst8 = const_vec;
1522  DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1523  dst8, dst8, dst8, dst8);
1524 
1525  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1526  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1527  filt_h0, filt_h1, filt_h2, filt_h3);
1528  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1529  filt_h0, filt_h1, filt_h2, filt_h3);
1530  dst1_r >>= 6;
1531  dst1_l >>= 6;
1532  SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1533  dst0_r = CLIP_SW_0_255(dst0_r);
1534  dst0_l = CLIP_SW_0_255(dst0_l);
1535  dst1_r = CLIP_SW_0_255(dst1_r);
1536  dst1_l = CLIP_SW_0_255(dst1_l);
1537 
1538  HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
1539  ST8x2_UB(dst0_r, dst_tmp, dst_stride);
1540  dst_tmp += (2 * dst_stride);
1541 
1542  dst10_r = dst32_r;
1543  dst32_r = dst54_r;
1544  dst54_r = dst76_r;
1545  dst10_l = dst32_l;
1546  dst32_l = dst54_l;
1547  dst54_l = dst76_l;
1548  dst21_r = dst43_r;
1549  dst43_r = dst65_r;
1550  dst65_r = dst87_r;
1551  dst21_l = dst43_l;
1552  dst43_l = dst65_l;
1553  dst65_l = dst87_l;
1554  dst6 = dst8;
1555  }
1556 
1557  src += 8;
1558  dst += 8;
1559  }
1560 }
1561 
1562 static void hevc_hv_uni_8t_8w_msa(uint8_t *src,
1563  int32_t src_stride,
1564  uint8_t *dst,
1565  int32_t dst_stride,
1566  const int8_t *filter_x,
1567  const int8_t *filter_y,
1568  int32_t height)
1569 {
1570  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1571  filter_x, filter_y, height, 8);
1572 }
1573 
1574 static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
1575  int32_t src_stride,
1576  uint8_t *dst,
1577  int32_t dst_stride,
1578  const int8_t *filter_x,
1579  const int8_t *filter_y,
1580  int32_t height)
1581 {
1582  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1583  filter_x, filter_y, height, 8);
1584 
1585  hevc_hv_uni_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
1586  filter_x, filter_y, height);
1587 }
1588 
1589 static void hevc_hv_uni_8t_16w_msa(uint8_t *src,
1590  int32_t src_stride,
1591  uint8_t *dst,
1592  int32_t dst_stride,
1593  const int8_t *filter_x,
1594  const int8_t *filter_y,
1595  int32_t height)
1596 {
1597  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1598  filter_x, filter_y, height, 16);
1599 }
1600 
1601 static void hevc_hv_uni_8t_24w_msa(uint8_t *src,
1602  int32_t src_stride,
1603  uint8_t *dst,
1604  int32_t dst_stride,
1605  const int8_t *filter_x,
1606  const int8_t *filter_y,
1607  int32_t height)
1608 {
1609  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1610  filter_x, filter_y, height, 24);
1611 }
1612 
1613 static void hevc_hv_uni_8t_32w_msa(uint8_t *src,
1614  int32_t src_stride,
1615  uint8_t *dst,
1616  int32_t dst_stride,
1617  const int8_t *filter_x,
1618  const int8_t *filter_y,
1619  int32_t height)
1620 {
1621  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1622  filter_x, filter_y, height, 32);
1623 }
1624 
1625 static void hevc_hv_uni_8t_48w_msa(uint8_t *src,
1626  int32_t src_stride,
1627  uint8_t *dst,
1628  int32_t dst_stride,
1629  const int8_t *filter_x,
1630  const int8_t *filter_y,
1631  int32_t height)
1632 {
1633  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1634  filter_x, filter_y, height, 48);
1635 }
1636 
1637 static void hevc_hv_uni_8t_64w_msa(uint8_t *src,
1638  int32_t src_stride,
1639  uint8_t *dst,
1640  int32_t dst_stride,
1641  const int8_t *filter_x,
1642  const int8_t *filter_y,
1643  int32_t height)
1644 {
1645  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1646  filter_x, filter_y, height, 64);
1647 }
1648 
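/* Horizontal uni-prediction 4-tap filters. Source bytes are biased to
 * signed range (XORI 128), shuffled with mask0/mask1 into per-tap byte
 * pairs and dot-product accumulated; results are rounded (srari >> 6),
 * saturated to 7 bits and packed back to unsigned bytes by
 * PCKEV_XORI128, which also undoes the sign bias. */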
1649 static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
1650  uint8_t *dst, int32_t dst_stride,
1651  const int8_t *filter)
1652 {
1653  v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
1654  v16u8 out;
1655  v8i16 filt, res0;
1656 
1657  mask0 = LD_SB(&mc_filt_mask_arr[16]);
1658  src -= 1;
1659 
1660  /* rearranging filter */
1661  filt = LD_SH(filter);
1662  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1663 
1664  mask1 = mask0 + 2;
1665 
1666  LD_SB2(src, src_stride, src0, src1);
1667  XORI_B2_128_SB(src0, src1);
1668  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1669  res0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1);
1670  res0 = __msa_srari_h(res0, 6);
1671  res0 = __msa_sat_s_h(res0, 7);
1672  out = PCKEV_XORI128_UB(res0, res0);
1673  ST4x2_UB(out, dst, dst_stride);
1674 }
1675 
1676 static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
1677  uint8_t *dst, int32_t dst_stride,
1678  const int8_t *filter)
1679 {
1680  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1681  v8i16 filt, out0, out1;
1682  v16u8 out;
1683 
1684  mask0 = LD_SB(&mc_filt_mask_arr[16]);
1685  src -= 1;
1686 
1687  /* rearranging filter */
1688  filt = LD_SH(filter);
1689  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1690 
1691  mask1 = mask0 + 2;
1692 
1693  LD_SB4(src, src_stride, src0, src1, src2, src3);
1694  XORI_B4_128_SB(src0, src1, src2, src3);
1695  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1696  filt0, filt1, out0, out1);
1697  SRARI_H2_SH(out0, out1, 6);
1698  SAT_SH2_SH(out0, out1, 7);
1699  out = PCKEV_XORI128_UB(out0, out1);
1700  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1701 }
1702 
1703 static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
1704  uint8_t *dst, int32_t dst_stride,
1705  const int8_t *filter)
1706 {
1707  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1708  v16u8 out;
1709  v8i16 filt, out0, out1, out2, out3;
1710 
1711  mask0 = LD_SB(&mc_filt_mask_arr[16]);
1712  src -= 1;
1713 
1714  /* rearranging filter */
1715  filt = LD_SH(filter);
1716  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1717 
1718  mask1 = mask0 + 2;
1719 
1720  LD_SB4(src, src_stride, src0, src1, src2, src3);
1721  src += (4 * src_stride);
1722 
1723  XORI_B4_128_SB(src0, src1, src2, src3);
1724  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1725  filt0, filt1, out0, out1);
1726  LD_SB4(src, src_stride, src0, src1, src2, src3);
1727  XORI_B4_128_SB(src0, src1, src2, src3);
1728  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1729  filt0, filt1, out2, out3);
1730  SRARI_H4_SH(out0, out1, out2, out3, 6);
1731  SAT_SH4_SH(out0, out1, out2, out3, 7);
1732  out = PCKEV_XORI128_UB(out0, out1);
1733  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1734  dst += (4 * dst_stride);
1735  out = PCKEV_XORI128_UB(out2, out3);
1736  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1737 }
1738 
1739 static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
1740  uint8_t *dst, int32_t dst_stride,
1741  const int8_t *filter)
1742 {
1743  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1744  v16i8 filt0, filt1, mask0, mask1;
1745  v16u8 out;
1746  v8i16 filt, out0, out1, out2, out3;
1747 
1748  mask0 = LD_SB(&mc_filt_mask_arr[16]);
1749  src -= 1;
1750 
1751  /* rearranging filter */
1752  filt = LD_SH(filter);
1753  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1754 
1755  mask1 = mask0 + 2;
1756 
1757  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1758  src += (8 * src_stride);
1759  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1760  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1761  filt0, filt1, out0, out1);
1762  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
1763  filt0, filt1, out2, out3);
1764  SRARI_H4_SH(out0, out1, out2, out3, 6);
1765  SAT_SH4_SH(out0, out1, out2, out3, 7);
1766  out = PCKEV_XORI128_UB(out0, out1);
1767  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1768  dst += (4 * dst_stride);
1769  out = PCKEV_XORI128_UB(out2, out3);
1770  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1771  dst += (4 * dst_stride);
1772 
1773  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1774  src += (8 * src_stride);
1775  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1776  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1777  filt0, filt1, out0, out1);
1778  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
1779  filt0, filt1, out2, out3);
1780  SRARI_H4_SH(out0, out1, out2, out3, 6);
1781  SAT_SH4_SH(out0, out1, out2, out3, 7);
1782  out = PCKEV_XORI128_UB(out0, out1);
1783  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1784  dst += (4 * dst_stride);
1785  out = PCKEV_XORI128_UB(out2, out3);
1786  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1787 }
1788 
1789 static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
1790  uint8_t *dst, int32_t dst_stride,
1791  const int8_t *filter, int32_t height)
1792 {
1793  if (2 == height) {
1794  common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
1795  } else if (4 == height) {
1796  common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1797  } else if (8 == height) {
1798  common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
1799  } else if (16 == height) {
1800  common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
1801  }
1802 }
1803 
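/* 6-wide: rows are filtered as 8-wide, ST6x4_UB stores only the left
 * 6 bytes of each row */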
1804 static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
1805  uint8_t *dst, int32_t dst_stride,
1806  const int8_t *filter, int32_t height)
1807 {
1808  uint32_t loop_cnt;
1809  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1810  v16u8 out4, out5;
1811  v8i16 filt, out0, out1, out2, out3;
1812 
1813  mask0 = LD_SB(&mc_filt_mask_arr[0]);
1814  src -= 1;
1815 
1816  /* rearranging filter */
1817  filt = LD_SH(filter);
1818  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1819 
1820  mask1 = mask0 + 2;
1821 
1822  for (loop_cnt = (height >> 2); loop_cnt--;) {
1823  LD_SB4(src, src_stride, src0, src1, src2, src3);
1824  src += (4 * src_stride);
1825 
1826  XORI_B4_128_SB(src0, src1, src2, src3);
1827  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
1828  filt1, out0, out1, out2, out3);
1829  SRARI_H4_SH(out0, out1, out2, out3, 6);
1830  SAT_SH4_SH(out0, out1, out2, out3, 7);
1831 
1832  out4 = PCKEV_XORI128_UB(out0, out1);
1833  out5 = PCKEV_XORI128_UB(out2, out3);
1834  ST6x4_UB(out4, out5, dst, dst_stride);
1835  dst += (4 * dst_stride);
1836  }
1837 }
1838 
1839 static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
1840  uint8_t *dst, int32_t dst_stride,
1841  const int8_t *filter, int32_t height)
1842 {
1843  uint32_t loop_cnt;
1844  v16i8 src0, src1, filt0, filt1, mask0, mask1;
1845  v16u8 out;
1846  v8i16 filt, vec0, vec1, vec2, vec3;
1847 
1848  mask0 = LD_SB(&mc_filt_mask_arr[0]);
1849  src -= 1;
1850 
1851  filt = LD_SH(filter);
1852  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1853 
1854  mask1 = mask0 + 2;
1855 
1856  for (loop_cnt = (height >> 1); loop_cnt--;) {
1857  LD_SB2(src, src_stride, src0, src1);
1858  src += (2 * src_stride);
1859 
1860  XORI_B2_128_SB(src0, src1);
1861  VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1862  DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
1863  VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
1864  DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
1865  SRARI_H2_SH(vec0, vec1, 6);
1866  SAT_SH2_SH(vec0, vec1, 7);
1867  out = PCKEV_XORI128_UB(vec0, vec1);
1868  ST8x2_UB(out, dst, dst_stride);
1869  dst += (2 * dst_stride);
1870  }
1871 }
1872 
1873 static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
1874  uint8_t *dst, int32_t dst_stride,
1875  const int8_t *filter, int32_t height)
1876 {
1877  uint32_t loop_cnt;
1878  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1879  v16u8 tmp0, tmp1;
1880  v8i16 filt, out0, out1, out2, out3;
1881 
1882  mask0 = LD_SB(&mc_filt_mask_arr[0]);
1883  src -= 1;
1884 
1885  /* rearranging filter */
1886  filt = LD_SH(filter);
1887  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1888 
1889  mask1 = mask0 + 2;
1890 
1891  for (loop_cnt = (height >> 2); loop_cnt--;) {
1892  LD_SB4(src, src_stride, src0, src1, src2, src3);
1893  src += (4 * src_stride);
1894 
1895  XORI_B4_128_SB(src0, src1, src2, src3);
1896  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
1897  filt1, out0, out1, out2, out3);
1898  SRARI_H4_SH(out0, out1, out2, out3, 6);
1899  SAT_SH4_SH(out0, out1, out2, out3, 7);
1900  tmp0 = PCKEV_XORI128_UB(out0, out1);
1901  tmp1 = PCKEV_XORI128_UB(out2, out3);
1902  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
1903  dst += (4 * dst_stride);
1904  }
1905 }
1906 
1907 static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
1908  uint8_t *dst, int32_t dst_stride,
1909  const int8_t *filter, int32_t height)
1910 {
1911  if ((2 == height) || (6 == height)) {
1912  common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
1913  height);
1914  } else {
1915  common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
1916  height);
1917  }
1918 }
1919 
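/* 12-wide: mask0/mask1 produce the first 8 output pixels of each row;
 * mask2/mask3 (from mc_filt_mask_arr[32]) gather the remaining 4 pixels
 * of two rows into a single vector */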
1920 static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
1921  uint8_t *dst, int32_t dst_stride,
1922  const int8_t *filter, int32_t height)
1923 {
1924  uint32_t loop_cnt;
1925  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
1926  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
1927  v16i8 vec10, vec11;
1928  v16u8 tmp0, tmp1;
1929  v8i16 filt, out0, out1, out2, out3, out4, out5;
1930 
1931  mask0 = LD_SB(&mc_filt_mask_arr[0]);
1932  mask2 = LD_SB(&mc_filt_mask_arr[32]);
1933 
1934  src -= 1;
1935 
1936  /* rearranging filter */
1937  filt = LD_SH(filter);
1938  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1939 
1940  mask1 = mask0 + 2;
1941  mask3 = mask2 + 2;
1942 
1943  for (loop_cnt = (height >> 2); loop_cnt--;) {
1944  LD_SB4(src, src_stride, src0, src1, src2, src3);
1945  src += (4 * src_stride);
1946 
1947  XORI_B4_128_SB(src0, src1, src2, src3);
1948  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
1949  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
1950  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
1951  DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1952  out2, out3, out4, out5);
1953  DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
1954  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
1955  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
1956  VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
1957  DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
1958  out2, out3, out4, out5);
1959  DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
1960  SRARI_H4_SH(out0, out1, out2, out3, 6);
1961  SRARI_H2_SH(out4, out5, 6);
1962  SAT_SH4_SH(out0, out1, out2, out3, 7);
1963  SAT_SH2_SH(out4, out5, 7);
1964  tmp0 = PCKEV_XORI128_UB(out2, out3);
1965  tmp1 = PCKEV_XORI128_UB(out4, out5);
1966  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
1967  tmp0 = PCKEV_XORI128_UB(out0, out1);
1968  ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
1969  dst += (4 * dst_stride);
1970  }
1971 }
1972 
1973 static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride,
1974  uint8_t *dst, int32_t dst_stride,
1975  const int8_t *filter, int32_t height)
1976 {
1977  uint32_t loop_cnt;
1978  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1979  v16i8 filt0, filt1, mask0, mask1;
1980  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
1981  v16u8 out;
1982 
1983  mask0 = LD_SB(&mc_filt_mask_arr[0]);
1984  src -= 1;
1985 
1986  /* rearranging filter */
1987  filt = LD_SH(filter);
1988  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1989 
1990  mask1 = mask0 + 2;
1991 
1992  for (loop_cnt = (height >> 2); loop_cnt--;) {
1993  LD_SB4(src, src_stride, src0, src2, src4, src6);
1994  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1995  src += (4 * src_stride);
1996 
1997  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1998  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
1999  filt1, out0, out1, out2, out3);
2000  HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
2001  filt1, out4, out5, out6, out7);
2002  SRARI_H4_SH(out0, out1, out2, out3, 6);
2003  SRARI_H4_SH(out4, out5, out6, out7, 6);
2004  SAT_SH4_SH(out0, out1, out2, out3, 7);
2005  SAT_SH4_SH(out4, out5, out6, out7, 7);
2006  out = PCKEV_XORI128_UB(out0, out1);
2007  ST_UB(out, dst);
2008  dst += dst_stride;
2009  out = PCKEV_XORI128_UB(out2, out3);
2010  ST_UB(out, dst);
2011  dst += dst_stride;
2012  out = PCKEV_XORI128_UB(out4, out5);
2013  ST_UB(out, dst);
2014  dst += dst_stride;
2015  out = PCKEV_XORI128_UB(out6, out7);
2016  ST_UB(out, dst);
2017  dst += dst_stride;
2018  }
2019 }
2020 
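/* 24-wide: the left 16 outputs use mask0/mask1 within srcN plus
 * mask00/mask11 to reach across the (srcN, srcN+1) boundary; the right
 * 8 outputs come from the second vector alone and are stored via dst1 */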
2021 static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
2022  uint8_t *dst, int32_t dst_stride,
2023  const int8_t *filter, int32_t height)
2024 {
2025  uint8_t *dst1 = dst + 16;
2026  uint32_t loop_cnt;
2027  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2028  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2029  v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2030  v8i16 filt, out0, out1, out2, out3;
2031  v16u8 tmp0, tmp1;
2032 
2033  mask0 = LD_SB(&mc_filt_mask_arr[0]);
2034  src -= 1;
2035 
2036  /* rearranging filter */
2037  filt = LD_SH(filter);
2038  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2039 
2040  mask1 = mask0 + 2;
2041  mask00 = mask0 + 8;
2042  mask11 = mask0 + 10;
2043 
2044  for (loop_cnt = (height >> 2); loop_cnt--;) {
2045  LD_SB4(src, src_stride, src0, src2, src4, src6);
2046  LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2047  src += (4 * src_stride);
2048 
2049  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2050  VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
2051  VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
2052  VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
2053  VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
2054  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2055  out0, out1, out2, out3);
2056  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2057  out0, out1, out2, out3);
2058  SRARI_H4_SH(out0, out1, out2, out3, 6);
2059  SAT_SH4_SH(out0, out1, out2, out3, 7);
2060  tmp0 = PCKEV_XORI128_UB(out0, out1);
2061  ST_UB(tmp0, dst);
2062  dst += dst_stride;
2063  tmp0 = PCKEV_XORI128_UB(out2, out3);
2064  ST_UB(tmp0, dst);
2065  dst += dst_stride;
2066 
2067  VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2068  VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2069  VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2070  VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2071  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2072  out0, out1, out2, out3);
2073  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2074  out0, out1, out2, out3);
2075  SRARI_H4_SH(out0, out1, out2, out3, 6);
2076  SAT_SH4_SH(out0, out1, out2, out3, 7);
2077  tmp0 = PCKEV_XORI128_UB(out0, out1);
2078  ST_UB(tmp0, dst);
2079  dst += dst_stride;
2080  tmp0 = PCKEV_XORI128_UB(out2, out3);
2081  ST_UB(tmp0, dst);
2082  dst += dst_stride;
2083 
2084  /* 8 width */
2085  VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2086  VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2087  VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
2088  VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2089 
2090  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2091  out0, out1, out2, out3);
2092  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2093  out0, out1, out2, out3);
2094 
2095  SRARI_H4_SH(out0, out1, out2, out3, 6);
2096  SAT_SH4_SH(out0, out1, out2, out3, 7);
2097  tmp0 = PCKEV_XORI128_UB(out0, out1);
2098  tmp1 = PCKEV_XORI128_UB(out2, out3);
2099  ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
2100  dst1 += (4 * dst_stride);
2101  }
2102 }
2103 
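/* 32-wide: loads at offsets 0, 16 and 24 plus an 8-byte SLDI splice
 * rebuild the overlapping middle vector (bytes 8..23) for each row */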
2104 static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride,
2105  uint8_t *dst, int32_t dst_stride,
2106  const int8_t *filter, int32_t height)
2107 {
2108  uint32_t loop_cnt;
2109  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2110  v16i8 filt0, filt1, mask0, mask1;
2111  v16u8 out;
2112  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2113 
2114  mask0 = LD_SB(&mc_filt_mask_arr[0]);
2115  src -= 1;
2116 
2117  /* rearranging filter */
2118  filt = LD_SH(filter);
2119  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2120 
2121  mask1 = mask0 + 2;
2122 
2123  for (loop_cnt = (height >> 1); loop_cnt--;) {
2124  src0 = LD_SB(src);
2125  src2 = LD_SB(src + 16);
2126  src3 = LD_SB(src + 24);
2127  src += src_stride;
2128  src4 = LD_SB(src);
2129  src6 = LD_SB(src + 16);
2130  src7 = LD_SB(src + 24);
2131  SLDI_B2_SB(src2, src6, src0, src4, src1, src5, 8);
2132  src += src_stride;
2133 
2134  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2135  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2136  filt0, filt1, out0, out1, out2, out3);
2137  HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2138  filt0, filt1, out4, out5, out6, out7);
2139  SRARI_H4_SH(out0, out1, out2, out3, 6);
2140  SRARI_H4_SH(out4, out5, out6, out7, 6);
2141  SAT_SH4_SH(out0, out1, out2, out3, 7);
2142  SAT_SH4_SH(out4, out5, out6, out7, 7);
2143  out = PCKEV_XORI128_UB(out0, out1);
2144  ST_UB(out, dst);
2145  out = PCKEV_XORI128_UB(out2, out3);
2146  ST_UB(out, dst + 16);
2147  dst += dst_stride;
2148  out = PCKEV_XORI128_UB(out4, out5);
2149  ST_UB(out, dst);
2150  out = PCKEV_XORI128_UB(out6, out7);
2151  ST_UB(out, dst + 16);
2152  dst += dst_stride;
2153  }
2154 }
2155 
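/* Vertical uni-prediction 4-tap filters. Consecutive rows are
 * interleaved byte-wise (ILVR/ILVL) so a single dot-product taps the
 * same column across four rows; the 4-wide variants pack two row pairs
 * into one vector with ilvr_d. */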
2156 static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
2157  uint8_t *dst, int32_t dst_stride,
2158  const int8_t *filter)
2159 {
2160  v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
2161  v16i8 src2110, src4332, filt0, filt1;
2162  v16u8 out;
2163  v8i16 filt, out10;
2164 
2165  src -= src_stride;
2166 
2167  filt = LD_SH(filter);
2168  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2169 
2170  LD_SB3(src, src_stride, src0, src1, src2);
2171  src += (3 * src_stride);
2172 
2173  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2174  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2175  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2176  LD_SB2(src, src_stride, src3, src4);
2177  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2178  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2179  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2180  out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
2181  out10 = __msa_srari_h(out10, 6);
2182  out10 = __msa_sat_s_h(out10, 7);
2183  out = PCKEV_XORI128_UB(out10, out10);
2184  ST4x2_UB(out, dst, dst_stride);
2185 }
2186 
2187 static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
2188  uint8_t *dst, int32_t dst_stride,
2189  const int8_t *filter, int32_t height)
2190 {
2191  uint32_t loop_cnt;
2192  v16i8 src0, src1, src2, src3, src4, src5;
2193  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2194  v16i8 src2110, src4332, filt0, filt1;
2195  v8i16 filt, out10, out32;
2196  v16u8 out;
2197 
2198  src -= src_stride;
2199 
2200  filt = LD_SH(filter);
2201  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2202 
2203  LD_SB3(src, src_stride, src0, src1, src2);
2204  src += (3 * src_stride);
2205 
2206  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2207 
2208  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2209  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2210 
2211  for (loop_cnt = (height >> 2); loop_cnt--;) {
2212  LD_SB3(src, src_stride, src3, src4, src5);
2213  src += (3 * src_stride);
2214  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2215  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2216  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2217  out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
2218 
2219  src2 = LD_SB(src);
2220  src += (src_stride);
2221  ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
2222  src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2223  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2224  out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
2225  SRARI_H2_SH(out10, out32, 6);
2226  SAT_SH2_SH(out10, out32, 7);
2227  out = PCKEV_XORI128_UB(out10, out32);
2228  ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2229  dst += (4 * dst_stride);
2230  }
2231 }
2232 
2233 static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
2234  uint8_t *dst, int32_t dst_stride,
2235  const int8_t *filter, int32_t height)
2236 {
2237  if (2 == height) {
2238  common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2239  } else {
2240  common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
2241  height);
2242  }
2243 }
2244 
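/* 6-wide vertical: filtered as 8-wide, only 6 bytes per row are stored */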
2245 static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
2246  uint8_t *dst, int32_t dst_stride,
2247  const int8_t *filter, int32_t height)
2248 {
2249  uint32_t loop_cnt;
2250  v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, out0, out1;
2251  v8i16 vec01, vec12, vec23, vec30, tmp0, tmp1, tmp2, tmp3;
2252  v8i16 filt, filt0, filt1;
2253 
2254  src -= src_stride;
2255 
2256  /* rearranging filter_y */
2257  filt = LD_SH(filter);
2258  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2259 
2260  LD_UB3(src, src_stride, src0, src1, src2);
2261  src += (3 * src_stride);
2262 
2263  vec0 = (v16u8) __msa_xori_b((v16u8) src0, 128);
2264  vec1 = (v16u8) __msa_xori_b((v16u8) src1, 128);
2265  vec2 = (v16u8) __msa_xori_b((v16u8) src2, 128);
2266 
2267  for (loop_cnt = (height >> 2); loop_cnt--;) {
2268  LD_UB4(src, src_stride, src3, src0, src1, src2);
2269  src += (4 * src_stride);
2270 
2271  vec3 = (v16u8) __msa_xori_b((v16u8) src3, 128);
2272  ILVR_B2_SH(vec1, vec0, vec3, vec2, vec01, vec23);
2273  tmp0 = FILT_4TAP_DPADD_S_H(vec01, vec23, filt0, filt1);
2274 
2275  vec0 = __msa_xori_b((v16u8) src0, 128);
2276  ILVR_B2_SH(vec2, vec1, vec0, vec3, vec12, vec30);
2277  tmp1 = FILT_4TAP_DPADD_S_H(vec12, vec30, filt0, filt1);
2278 
2279  vec1 = __msa_xori_b((v16u8) src1, 128);
2280  vec01 = (v8i16) __msa_ilvr_b((v16i8) vec1, (v16i8) vec0);
2281  tmp2 = FILT_4TAP_DPADD_S_H(vec23, vec01, filt0, filt1);
2282 
2283  vec2 = __msa_xori_b((v16u8) src2, 128);
2284  vec12 = (v8i16) __msa_ilvr_b((v16i8) vec2, (v16i8) vec1);
2285  tmp3 = FILT_4TAP_DPADD_S_H(vec30, vec12, filt0, filt1);
2286 
2287  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
2288  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
2289  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
2290  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
2291  ST6x4_UB(out0, out1, dst, dst_stride);
2292  dst += (4 * dst_stride);
2293  }
2294 }
2295 
2296 static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
2297  uint8_t *dst, int32_t dst_stride,
2298  const int8_t *filter)
2299 {
2300  v16i8 src0, src1, src2, src3, src4;
2301  v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
2302  v16u8 out;
2303 
2304  src -= src_stride;
2305 
2306  /* rearranging filter_y */
2307  filt = LD_SH(filter);
2308  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2309 
2310  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2311  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2312  ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
2313  tmp0 = FILT_4TAP_DPADD_S_H(src01, src23, filt0, filt1);
2314  ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
2315  tmp1 = FILT_4TAP_DPADD_S_H(src12, src34, filt0, filt1);
2316  SRARI_H2_SH(tmp0, tmp1, 6);
2317  SAT_SH2_SH(tmp0, tmp1, 7);
2318  out = PCKEV_XORI128_UB(tmp0, tmp1);
2319  ST8x2_UB(out, dst, dst_stride);
2320 }
2321 
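/* 8x6: two passes of three rows each; every pass packs three 8-wide
 * results and stores them as three 64-bit copies */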
2322 static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
2323  uint8_t *dst, int32_t dst_stride,
2324  const int8_t *filter)
2325 {
2326  uint32_t loop_cnt;
2327  uint64_t out0, out1, out2;
2328  v16i8 src0, src1, src2, src3, src4, src5;
2329  v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2330  v8i16 filt, filt0, filt1;
2331 
2332  src -= src_stride;
2333 
2334  /* rearranging filter_y */
2335  filt = LD_SH(filter);
2336  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2337 
2338  LD_SB3(src, src_stride, src0, src1, src2);
2339  src += (3 * src_stride);
2340 
2341  XORI_B3_128_SB(src0, src1, src2);
2342  ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
2343 
2344  for (loop_cnt = 2; loop_cnt--;) {
2345  LD_SB3(src, src_stride, src3, src4, src5);
2346  src += (3 * src_stride);
2347 
2348  XORI_B3_128_SB(src3, src4, src5);
2349  ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
2350  tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1);
2351  tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt0, filt1);
2352  tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt0, filt1);
2353  SRARI_H2_SH(tmp0, tmp1, 6);
2354  tmp2 = __msa_srari_h(tmp2, 6);
2355  SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
2356  PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
2357  XORI_B2_128_SH(tmp0, tmp2);
2358 
2359  out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2360  out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2361  out2 = __msa_copy_u_d((v2i64) tmp2, 0);
2362  SD(out0, dst);
2363  dst += dst_stride;
2364  SD(out1, dst);
2365  dst += dst_stride;
2366  SD(out2, dst);
2367  dst += dst_stride;
2368 
2369  src2 = src5;
2370  vec0 = vec3;
2371  vec2 = vec4;
2372  }
2373 }
2374 
2375 static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2376  uint8_t *dst, int32_t dst_stride,
2377  const int8_t *filter, int32_t height)
2378 {
2379  uint32_t loop_cnt;
2380  v16i8 src0, src1, src2, src7, src8, src9, src10;
2381  v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2382  v16u8 tmp0, tmp1;
2383  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
2384 
2385  src -= src_stride;
2386 
2387  filt = LD_SH(filter);
2388  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2389 
2390  LD_SB3(src, src_stride, src0, src1, src2);
2391  src += (3 * src_stride);
2392 
2393  XORI_B3_128_SB(src0, src1, src2);
2394  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2395 
2396  for (loop_cnt = (height >> 2); loop_cnt--;) {
2397  LD_SB4(src, src_stride, src7, src8, src9, src10);
2398  src += (4 * src_stride);
2399 
2400  XORI_B4_128_SB(src7, src8, src9, src10);
2401  ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
2402  src72_r, src87_r, src98_r, src109_r);
2403  out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
2404  out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
2405  out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
2406  out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
2407  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2408  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2409  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
2410  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
2411  ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2412  dst += (4 * dst_stride);
2413 
2414  src10_r = src98_r;
2415  src21_r = src109_r;
2416  src2 = src10;
2417  }
2418 }
2419 
2420 static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
2421  uint8_t *dst, int32_t dst_stride,
2422  const int8_t *filter, int32_t height)
2423 {
2424  if (2 == height) {
2425  common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
2426  } else if (6 == height) {
2427  common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
2428  } else {
2429  common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
2430  filter, height);
2431  }
2432 }
2433 
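/* 12-wide vertical: the left 8 columns use plain row interleaves; the
 * right 4 columns (bytes 8..11) of adjacent rows are gathered by a word
 * shuffle (mask {2, 6, 2, 6}) and filtered as one extra 8-wide stream */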
2434 static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
2435  uint8_t *dst, int32_t dst_stride,
2436  const int8_t *filter, int32_t height)
2437 {
2438  uint32_t loop_cnt;
2439  v16i8 src0, src1, src2, src3, src4, src5, src6;
2440  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2441  v16u8 out0, out1;
2442  v8i16 src10, src21, src32, src43, src54, src65, src87, src109, src1211;
2443  v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, filt, filt0, filt1;
2444  v4u32 mask = { 2, 6, 2, 6 };
2445 
2446  /* rearranging filter_y */
2447  filt = LD_SH(filter);
2448  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2449 
2450  src -= src_stride;
2451 
2452  LD_SB3(src, src_stride, src0, src1, src2);
2453  src += (3 * src_stride);
2454 
2455  XORI_B3_128_SB(src0, src1, src2);
2456  VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);
2457 
2458  for (loop_cnt = (height >> 2); loop_cnt--;) {
2459  LD_SB4(src, src_stride, src3, src4, src5, src6);
2460  src += (4 * src_stride);
2461 
2462  XORI_B4_128_SB(src3, src4, src5, src6);
2463  ILVR_B2_SH(src1, src0, src3, src2, src10, src32);
2464  VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
2465  VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);
2466  tmp0 = FILT_4TAP_DPADD_S_H(src10, src32, filt0, filt1);
2467  ILVR_B4_SH(src2, src1, src4, src3, src5, src4, src6, src5,
2468  src21, src43, src54, src65);
2469  tmp1 = FILT_4TAP_DPADD_S_H(src21, src43, filt0, filt1);
2470  tmp2 = FILT_4TAP_DPADD_S_H(src32, src54, filt0, filt1);
2471  tmp3 = FILT_4TAP_DPADD_S_H(src43, src65, filt0, filt1);
2472  ILVR_B3_SH(vec1, vec0, vec3, vec2, vec5, vec4, src87, src109, src1211);
2473  tmp4 = FILT_4TAP_DPADD_S_H(src87, src109, filt0, filt1);
2474  tmp5 = FILT_4TAP_DPADD_S_H(src109, src1211, filt0, filt1);
2475  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
2476  SRARI_H2_SH(tmp4, tmp5, 6);
2477  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
2478  SAT_SH2_SH(tmp4, tmp5, 7);
2479  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
2480  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
2481  ST8x4_UB(out0, out1, dst, dst_stride);
2482  out0 = PCKEV_XORI128_UB(tmp4, tmp5);
2483  ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
2484  dst += (4 * dst_stride);
2485 
2486  src0 = src4;
2487  src1 = src5;
2488  src2 = src6;
2489  vec0 = vec4;
2490  vec1 = vec5;
2492  }
2493 }
2494 
2495 static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
2496  uint8_t *dst, int32_t dst_stride,
2497  const int8_t *filter, int32_t height)
2498 {
2499  uint32_t loop_cnt;
2500  v16i8 src0, src1, src2, src3, src4, src5, src6;
2501  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
2502  v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
2503  v16u8 tmp0, tmp1, tmp2, tmp3;
2504  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2505 
2506  src -= src_stride;
2507 
2508  filt = LD_SH(filter);
2509  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2510 
2511  LD_SB3(src, src_stride, src0, src1, src2);
2512  src += (3 * src_stride);
2513 
2514  XORI_B3_128_SB(src0, src1, src2);
2515  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2516  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2517 
2518  for (loop_cnt = (height >> 2); loop_cnt--;) {
2519  LD_SB4(src, src_stride, src3, src4, src5, src6);
2520  src += (4 * src_stride);
2521 
2522  XORI_B4_128_SB(src3, src4, src5, src6);
2523  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2524  src32_r, src43_r, src54_r, src65_r);
2525  ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2526  src32_l, src43_l, src54_l, src65_l);
2527  out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
2528  out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
2529  out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
2530  out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
2531  out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
2532  out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
2533  out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
2534  out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
2535  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2536  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
2537  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2538  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2539  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2540  out3_r, tmp0, tmp1, tmp2, tmp3);
2541  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
2542  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
2543  dst += (4 * dst_stride);
2544 
2545  src10_r = src54_r;
2546  src21_r = src65_r;
2547  src10_l = src54_l;
2548  src21_l = src65_l;
2549  src2 = src6;
2550  }
2551 }
2552 
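/* 24-wide vertical: 16 columns are filtered using both right and left
 * interleaves, plus an 8-wide column at offset 16 using right
 * interleaves only */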
2553 static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
2554  uint8_t *dst, int32_t dst_stride,
2555  const int8_t *filter, int32_t height)
2556 {
2557  uint32_t loop_cnt;
2558  uint64_t out0, out1;
2559  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2560  v16i8 src11, filt0, filt1;
2561  v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
2562  v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
2563  v16u8 out;
2564  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
2565 
2566  src -= src_stride;
2567 
2568  filt = LD_SH(filter);
2569  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2570 
2571  /* 16 width */
2572  LD_SB3(src, src_stride, src0, src1, src2);
2573  XORI_B3_128_SB(src0, src1, src2);
2574  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2575  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2576 
2577  /* 8 width */
2578  LD_SB3(src + 16, src_stride, src6, src7, src8);
2579  src += (3 * src_stride);
2580  XORI_B3_128_SB(src6, src7, src8);
2581  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2582 
2583  for (loop_cnt = (height >> 2); loop_cnt--;) {
2584  /* 16 width */
2585  LD_SB2(src, src_stride, src3, src4);
2586  XORI_B2_128_SB(src3, src4);
2587  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2588  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2589 
2590  /* 8 width */
2591  LD_SB2(src + 16, src_stride, src9, src10);
2592  src += (2 * src_stride);
2593  XORI_B2_128_SB(src9, src10);
2594  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2595 
2596  /* 16 width */
2597  out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
2598  out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
2599  out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
2600  out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
2601 
2602  /* 8 width */
2603  out2_r = FILT_4TAP_DPADD_S_H(src76_r, src98_r, filt0, filt1);
2604  out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
2605 
2606  /* 16 + 8 width */
2607  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2608  SRARI_H2_SH(out0_l, out1_l, 6);
2609  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2610  SAT_SH2_SH(out0_l, out1_l, 7);
2611  out = PCKEV_XORI128_UB(out0_r, out0_l);
2612  ST_UB(out, dst);
2613  PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
2614  XORI_B2_128_SH(out2_r, out3_r);
2615  out0 = __msa_copy_u_d((v2i64) out2_r, 0);
2616  out1 = __msa_copy_u_d((v2i64) out3_r, 0);
2617  SD(out0, dst + 16);
2618  dst += dst_stride;
2619  out = PCKEV_XORI128_UB(out1_r, out1_l);
2620  ST_UB(out, dst);
2621  SD(out1, dst + 16);
2622  dst += dst_stride;
2623 
2624  /* 16 width */
2625  LD_SB2(src, src_stride, src5, src2);
2626  XORI_B2_128_SB(src5, src2);
2627  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2628  ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
2629 
2630  /* 8 width */
2631  LD_SB2(src + 16, src_stride, src11, src8);
2632  src += (2 * src_stride);
2633  XORI_B2_128_SB(src11, src8);
2634  ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
2635 
2636  /* 16 width */
2637  out0_r = FILT_4TAP_DPADD_S_H(src32_r, src10_r, filt0, filt1);
2638  out0_l = FILT_4TAP_DPADD_S_H(src32_l, src10_l, filt0, filt1);
2639  out1_r = FILT_4TAP_DPADD_S_H(src43_r, src21_r, filt0, filt1);
2640  out1_l = FILT_4TAP_DPADD_S_H(src43_l, src21_l, filt0, filt1);
2641 
2642  /* 8 width */
2643  out2_r = FILT_4TAP_DPADD_S_H(src98_r, src76_r, filt0, filt1);
2644  out3_r = FILT_4TAP_DPADD_S_H(src109_r, src87_r, filt0, filt1);
2645 
2646  /* 16 + 8 width */
2647  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2648  SRARI_H2_SH(out0_l, out1_l, 6);
2649  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2650  SAT_SH2_SH(out0_l, out1_l, 7);
2651  out = PCKEV_XORI128_UB(out0_r, out0_l);
2652  ST_UB(out, dst);
2653  out = PCKEV_XORI128_UB(out2_r, out2_r);
2654  ST8x1_UB(out, dst + 16);
2655  dst += dst_stride;
2656  out = PCKEV_XORI128_UB(out1_r, out1_l);
2657  ST_UB(out, dst);
2658  out = PCKEV_XORI128_UB(out3_r, out3_r);
2659  ST8x1_UB(out, dst + 16);
2660  dst += dst_stride;
2661  }
2662 }
2663 
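/* width a multiple of 32: each 32-column chunk is processed as two
 * 16-wide streams, advancing two rows per inner iteration */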
2664 static void common_vt_4t_32w_mult_msa(uint8_t *src, int32_t src_stride,
2665  uint8_t *dst, int32_t dst_stride,
2666  const int8_t *filter, int32_t height,
2667  int32_t width)
2668 {
2669  uint32_t loop_cnt, cnt;
2670  uint8_t *dst_tmp, *src_tmp;
2671  v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
2672  v16i8 src10_r, src32_r, src76_r, src98_r;
2673  v16i8 src21_r, src43_r, src87_r, src109_r;
2674  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2675  v16i8 src10_l, src32_l, src76_l, src98_l;
2676  v16i8 src21_l, src43_l, src87_l, src109_l;
2677  v8i16 filt;
2678  v16i8 filt0, filt1;
2679  v16u8 out;
2680 
2681  src -= src_stride;
2682 
2683  filt = LD_SH(filter);
2684  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2685 
2686  for (cnt = (width >> 5); cnt--;) {
2687  dst_tmp = dst;
2688  src_tmp = src;
2689 
2690  /* 16 width */
2691  LD_SB3(src_tmp, src_stride, src0, src1, src2);
2692  XORI_B3_128_SB(src0, src1, src2);
2693 
2694  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2695  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2696 
2697  /* next 16 width */
2698  LD_SB3(src_tmp + 16, src_stride, src6, src7, src8);
2699  src_tmp += (3 * src_stride);
2700 
2701  XORI_B3_128_SB(src6, src7, src8);
2702  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2703  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
2704 
2705  for (loop_cnt = (height >> 1); loop_cnt--;) {
2706  /* 16 width */
2707  LD_SB2(src_tmp, src_stride, src3, src4);
2708  XORI_B2_128_SB(src3, src4);
2709  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2710  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2711 
2712  /* 16 width */
2713  out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
2714  out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
2715  out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
2716  out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
2717 
2718  /* 16 width */
2719  SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6);
2720  SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
2721  out = PCKEV_XORI128_UB(out0_r, out0_l);
2722  ST_UB(out, dst_tmp);
2723  out = PCKEV_XORI128_UB(out1_r, out1_l);
2724  ST_UB(out, dst_tmp + dst_stride);
2725 
2726  src10_r = src32_r;
2727  src21_r = src43_r;
2728  src10_l = src32_l;
2729  src21_l = src43_l;
2730  src2 = src4;
2731 
2732  /* next 16 width */
2733  LD_SB2(src_tmp + 16, src_stride, src9, src10);
2734  src_tmp += (2 * src_stride);
2735  XORI_B2_128_SB(src9, src10);
2736  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2737  ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
2738 
2739  /* next 16 width */
2740  out2_r = FILT_4TAP_DPADD_S_H(src76_r, src98_r, filt0, filt1);
2741  out2_l = FILT_4TAP_DPADD_S_H(src76_l, src98_l, filt0, filt1);
2742  out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
2743  out3_l = FILT_4TAP_DPADD_S_H(src87_l, src109_l, filt0, filt1);
2744 
2745  /* next 16 width */
2746  SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6);
2747  SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
2748  out = PCKEV_XORI128_UB(out2_r, out2_l);
2749  ST_UB(out, dst_tmp + 16);
2750  out = PCKEV_XORI128_UB(out3_r, out3_l);
2751  ST_UB(out, dst_tmp + 16 + dst_stride);
2752 
2753  dst_tmp += 2 * dst_stride;
2754 
2755  src76_r = src98_r;
2756  src87_r = src109_r;
2757  src76_l = src98_l;
2758  src87_l = src109_l;
2759  src8 = src10;
2760  }
2761 
2762  src += 32;
2763  dst += 32;
2764  }
2765 }
2766 
2767 static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
2768  uint8_t *dst, int32_t dst_stride,
2769  const int8_t *filter, int32_t height)
2770 {
2771  common_vt_4t_32w_mult_msa(src, src_stride, dst, dst_stride,
2772  filter, height, 32);
2773 }
2774 
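/* 2-D (horizontal then vertical) uni-prediction 4-tap filters. The
 * horizontal pass yields 16-bit intermediates pre-biased with
 * const_vec = 128 << 6, which compensates the XORI 128 input bias;
 * the vertical pass (HEVC_FILT_4TAP) filters those, then shifts by 6,
 * rounds, clips to [0, 255] and packs to bytes. */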
2775 static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
2776  int32_t src_stride,
2777  uint8_t *dst,
2778  int32_t dst_stride,
2779  const int8_t *filter_x,
2780  const int8_t *filter_y,
2781  int32_t height)
2782 {
2783  v16i8 src0, src1, src2, src3, src4;
2784  v8i16 filt0, filt1;
2785  v4i32 filt_h0, filt_h1;
2786  v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2787  v16i8 mask1;
2788  v8i16 filter_vec, const_vec;
2789  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2790  v8i16 dst0, dst1, dst2, dst3, dst4;
2791  v4i32 dst0_r, dst1_r;
2792  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
2793 
2794  src -= (src_stride + 1);
2795 
2796  filter_vec = LD_SH(filter_x);
2797  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2798 
2799  filter_vec = LD_SH(filter_y);
2800  vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
2801  filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
2802 
2803  SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
2804 
2805  mask1 = mask0 + 2;
2806 
2807  const_vec = __msa_ldi_h(128);
2808  const_vec <<= 6;
2809 
2810  LD_SB3(src, src_stride, src0, src1, src2);
2811  src += (3 * src_stride);
2812 
2813  XORI_B3_128_SB(src0, src1, src2);
2814 
2815  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2816  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2817  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2818 
2819  dst0 = const_vec;
2820  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2821  dst1 = const_vec;
2822  DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
2823  dst2 = const_vec;
2824  DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
2825 
2826  ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
2827  LD_SB2(src, src_stride, src3, src4);
2828  XORI_B2_128_SB(src3, src4);
2829 
2830  /* row 3 */
2831  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2832  dst3 = const_vec;
2833  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2834 
2835  dst32_r = __msa_ilvr_h(dst3, dst2);
2836  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
2837  dst0_r >>= 6;
2838 
2839  /* row 4 */
2840  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2841  dst4 = const_vec;
2842  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2843 
2844  dst43_r = __msa_ilvr_h(dst4, dst3);
2845  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
2846  dst1_r >>= 6;
2847 
2848  dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
2849  dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 6);
2850  dst0_r = (v4i32) CLIP_SH_0_255(dst0_r);
2851  dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
2852 
2853  ST4x2_UB(dst0_r, dst, dst_stride);
2854 }
2855 
2856 static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
2857  int32_t src_stride,
2858  uint8_t *dst,
2859  int32_t dst_stride,
2860  const int8_t *filter_x,
2861  const int8_t *filter_y,
2862  int32_t height)
2863 {
2864  v16i8 src0, src1, src2, src3, src4, src5, src6;
2865  v8i16 filt0, filt1;
2866  v4i32 filt_h0, filt_h1;
2867  v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2868  v16i8 mask1;
2869  v8i16 filter_vec, const_vec;
2870  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2871  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2872  v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
2873  v8i16 out0_r, out1_r;
2874  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
2875 
2876  src -= (src_stride + 1);
2877 
2878  filter_vec = LD_SH(filter_x);
2879  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2880 
2881  filter_vec = LD_SH(filter_y);
2882  vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
2883  filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
2884 
2885  SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
2886 
2887  mask1 = mask0 + 2;
2888 
2889  const_vec = __msa_ldi_h(128);
2890  const_vec <<= 6;
2891 
2892  LD_SB3(src, src_stride, src0, src1, src2);
2893  src += (3 * src_stride);
2894 
2895  XORI_B3_128_SB(src0, src1, src2);
2896 
2897  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2898  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2899  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2900 
2901  dst0 = const_vec;
2902  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2903  dst1 = const_vec;
2904  DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
2905  dst2 = const_vec;
2906  DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
2907 
2908  ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
2909  LD_SB4(src, src_stride, src3, src4, src5, src6);
2910  XORI_B4_128_SB(src3, src4, src5, src6);
2911 
2912  /* row 3 */
2913  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2914  dst3 = const_vec;
2915  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2916 
2917  dst32_r = __msa_ilvr_h(dst3, dst2);
2918  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
2919  dst0_r >>= 6;
2920 
2921  /* row 4 */
2922  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2923  dst4 = const_vec;
2924  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2925 
2926  dst43_r = __msa_ilvr_h(dst4, dst3);
2927  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
2928  dst1_r >>= 6;
2929 
2930  /* row 5 (dst10_r is reused below for the 5/4 interleave) */
2931  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2932  dst5 = const_vec;
2933  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2934 
2935  dst10_r = __msa_ilvr_h(dst5, dst4);
2936  dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
2937  dst2_r >>= 6;
2938 
2939  /* row 6 */
2940  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
2941  dst2 = const_vec;
2942  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2943 
2944  dst21_r = __msa_ilvr_h(dst2, dst5);
2945  dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
2946  dst3_r >>= 6;
2947 
2948  PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, out0_r, out1_r);
2949  SRARI_H2_SH(out0_r, out1_r, 6);
2950  CLIP_SH2_0_255(out0_r, out1_r);
2951  out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);
2952 
2953  ST4x4_UB(out0_r, out0_r, 0, 1, 2, 3, dst, dst_stride);
2954 }
2955 
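/* 4-wide, height a multiple of 8: eight new rows per loop; the last two
 * interleaves stay in dst10_r/dst21_r and the last horizontal result in
 * dst2, seeding the next iteration */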
2956 static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
2957  int32_t src_stride,
2958  uint8_t *dst,
2959  int32_t dst_stride,
2960  const int8_t *filter_x,
2961  const int8_t *filter_y,
2962  int32_t height)
2963 {
2964  uint32_t loop_cnt;
2965  v16i8 src0, src1, src2, src3, src4, src5;
2966  v16i8 src6, src7, src8, src9, src10;
2967  v8i16 filt0, filt1;
2968  v4i32 filt_h0, filt_h1;
2969  v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2970  v16i8 mask1;
2971  v8i16 filter_vec, const_vec;
2972  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2973  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
2974  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
2975  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
2976  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
2977  v8i16 out0_r, out1_r, out2_r, out3_r;
2978 
2979  src -= (src_stride + 1);
2980 
2981  filter_vec = LD_SH(filter_x);
2982  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2983 
2984  filter_vec = LD_SH(filter_y);
2985  vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
2986  filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
2987 
2988  SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
2989 
2990  mask1 = mask0 + 2;
2991 
2992  const_vec = __msa_ldi_h(128);
2993  const_vec <<= 6;
2994 
2995  LD_SB3(src, src_stride, src0, src1, src2);
2996  src += (3 * src_stride);
2997 
2998  XORI_B3_128_SB(src0, src1, src2);
2999 
3000  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3001  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3002  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3003 
3004  dst0 = const_vec;
3005  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3006  dst1 = const_vec;
3007  DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3008  dst2 = const_vec;
3009  DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3010 
3011  ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
3012 
3013  for (loop_cnt = height >> 3; loop_cnt--;) {
3014  LD_SB8(src, src_stride,
3015  src3, src4, src5, src6, src7, src8, src9, src10);
3016  src += (8 * src_stride);
3017 
3018  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3019 
3020  /* row 3 */
3021  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3022  dst3 = const_vec;
3023  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3024 
3025  dst32_r = __msa_ilvr_h(dst3, dst2);
3026  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3027  dst0_r >>= 6;
3028 
3029  /* row 4 */
3030  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3031  dst4 = const_vec;
3032  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3033 
3034  dst43_r = __msa_ilvr_h(dst4, dst3);
3035  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3036  dst1_r >>= 6;
3037 
3038  /* row 5 */
3039  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3040  dst5 = const_vec;
3041  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3042 
3043  dst54_r = __msa_ilvr_h(dst5, dst4);
3044  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3045  dst2_r >>= 6;
3046 
3047  /* row 6 */
3048  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3049  dst6 = const_vec;
3050  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
3051 
3052  dst65_r = __msa_ilvr_h(dst6, dst5);
3053  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3054  dst3_r >>= 6;
3055 
3056  /* row 7 */
3057  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3058  dst7 = const_vec;
3059  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
3060 
3061  dst76_r = __msa_ilvr_h(dst7, dst6);
3062  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3063  dst4_r >>= 6;
3064 
3065  /* row 8 */
3066  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
3067  dst8 = const_vec;
3068  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
3069 
3070  dst87_r = __msa_ilvr_h(dst8, dst7);
3071  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3072  dst5_r >>= 6;
3073 
3074  /* row 9 (dst10_r is reused below for the 9/8 interleave) */
3075  VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
3076  dst9 = const_vec;
3077  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
3078 
3079  dst10_r = __msa_ilvr_h(dst9, dst8);
3080  dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
3081  dst6_r >>= 6;
3082 
3083  /* row 10 (dst2 and dst21_r carry the filter state into the next loop) */
3084  VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
3085  dst2 = const_vec;
3086  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3087 
3088  dst21_r = __msa_ilvr_h(dst2, dst9);
3089  dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
3090  dst7_r >>= 6;
3091 
3092  PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
3093  dst5_r, dst4_r, dst7_r, dst6_r,
3094  out0_r, out1_r, out2_r, out3_r);
3095 
3096  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3097  CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
3098 
3099  PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3100  ST4x8_UB(out0_r, out1_r, dst, dst_stride);
3101  dst += (8 * dst_stride);
3102  }
3103 }
3104 
3105 static void hevc_hv_uni_4t_4w_msa(uint8_t *src,
3106  int32_t src_stride,
3107  uint8_t *dst,
3108  int32_t dst_stride,
3109  const int8_t *filter_x,
3110  const int8_t *filter_y,
3111  int32_t height)
3112 {
3113  if (2 == height) {
3114  hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
3115  filter_x, filter_y, height);
3116  } else if (4 == height) {
3117  hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
3118  filter_x, filter_y, height);
3119  } else if (0 == (height % 8)) {
3120  hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
3121  filter_x, filter_y, height);
3122  }
3123 }
3124 
3125 static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
3126  int32_t src_stride,
3127  uint8_t *dst,
3128  int32_t dst_stride,
3129  const int8_t *filter_x,
3130  const int8_t *filter_y,
3131  int32_t height)
3132 {
3133  uint32_t loop_cnt;
3134  v16i8 src0, src1, src2, src3, src4, src5, src6;
3135  v8i16 filt0, filt1;
3136  v4i32 filt_h0, filt_h1;
3137  v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3138  v16i8 mask1;
3139  v8i16 filter_vec, const_vec;
3140  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3141  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3142  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3143  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3144  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3145  v8i16 out0_r, out1_r, out2_r, out3_r;
3146 
3147  src -= (src_stride + 1);
3148 
3149  filter_vec = LD_SH(filter_x);
3150  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3151 
3152  filter_vec = LD_SH(filter_y);
3153  vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3154  filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3155 
3156  SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3157 
3158  mask1 = mask0 + 2;
3159 
3160  const_vec = __msa_ldi_h(128);
3161  const_vec <<= 6;
3162 
3163  LD_SB3(src, src_stride, src0, src1, src2);
3164  src += (3 * src_stride);
3165 
3166  XORI_B3_128_SB(src0, src1, src2);
3167 
3168  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3169  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3170  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3171 
3172  dst0 = const_vec;
3173  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3174  dst1 = const_vec;
3175  DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3176  dst2 = const_vec;
3177  DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3178 
3179  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3180  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3181 
3182  for (loop_cnt = height >> 2; loop_cnt--;) {
3183  LD_SB4(src, src_stride, src3, src4, src5, src6);
3184  src += (4 * src_stride);
3185 
3186  XORI_B4_128_SB(src3, src4, src5, src6);
3187 
3188  /* row 3 */
3189  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3190  dst3 = const_vec;
3191  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3192 
3193  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3194  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3195  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3196  dst0_r >>= 6;
3197  dst0_l >>= 6;
3198 
3199  /* row 4 */
3200  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3201  dst4 = const_vec;
3202  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3203 
3204  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3205  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3206  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3207  dst1_r >>= 6;
3208  dst1_l >>= 6;
3209 
3210  /* row 5 */
3211  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3212  dst5 = const_vec;
3213  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3214 
3215  ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
3216  dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
3217  dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
3218 
3219  dst2_r >>= 6;
3220  dst2_l >>= 6;
3221 
3222  /* row 6 */
3223  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3224  dst2 = const_vec;
3225  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3226 
3227  ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
3228  dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
3229  dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
3230 
3231  dst3_r >>= 6;
3232  dst3_l >>= 6;
3233 
3234  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3235  dst2_l, dst2_r, dst3_l, dst3_r,
3236  out0_r, out1_r, out2_r, out3_r);
3237 
3238  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3239  CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
3240 
3241  PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3242  ST6x4_UB(out0_r, out1_r, dst, dst_stride);
3243  dst += (4 * dst_stride);
3244  }
3245 }
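The loop above keeps only a three-row vertical window live between iterations: after rows 3..6 are produced, dst2 holds row 6's horizontal result and dst21_r/dst10_r hold the (5,6)/(4,5) interleaves, so the next pass continues without reloading. A scalar sketch of that sliding-window recurrence (names illustrative):

#include <stdint.h>

/* Sliding-window sketch of the vertical recurrence: h0..h2 are the three
 * horizontal results carried in from the previous iteration, hin feeds one
 * new row per output row, and only the newest three rows stay live. */
static void vt_4tap_window(int16_t h0, int16_t h1, int16_t h2,
                           const int16_t *hin, const int16_t fy[4],
                           int32_t *out, int32_t rows)
{
    for (int32_t r = 0; r < rows; r++) {
        int16_t h3 = hin[r];                     /* new row enters window  */

        out[r] = (fy[0] * h0 + fy[1] * h1 +
                  fy[2] * h2 + fy[3] * h3) >> 6; /* HEVC_FILT_4TAP, >>= 6  */
        h0 = h1;
        h1 = h2;
        h2 = h3;                                 /* slide the window       */
    }
}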
3246 
3247 static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
3248  int32_t src_stride,
3249  uint8_t *dst,
3250  int32_t dst_stride,
3251  const int8_t *filter_x,
3252  const int8_t *filter_y,
3253  int32_t height)
3254 {
3255  v16i8 src0, src1, src2, src3, src4;
3256  v8i16 filt0, filt1;
3257  v4i32 filt_h0, filt_h1;
3258  v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3259  v16i8 mask1;
3260  v8i16 filter_vec, const_vec;
3261  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3262  v8i16 dst0, dst1, dst2, dst3, dst4;
3263  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3264  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3265  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3266  v8i16 out0_r, out1_r;
3267 
3268  src -= (src_stride + 1);
3269 
3270  filter_vec = LD_SH(filter_x);
3271  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3272 
3273  filter_vec = LD_SH(filter_y);
3274  vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3275  filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3276 
3277  SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3278 
3279  mask1 = mask0 + 2;
3280 
3281  const_vec = __msa_ldi_h(128);
3282  const_vec <<= 6;
3283 
3284  LD_SB3(src, src_stride, src0, src1, src2);
3285  src += (3 * src_stride);
3286 
3287  XORI_B3_128_SB(src0, src1, src2);
3288 
3289  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3290  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3291  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3292 
3293  dst0 = const_vec;
3294  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3295  dst1 = const_vec;
3296  DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3297  dst2 = const_vec;
3298  DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3299 
3300  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3301  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3302 
3303  LD_SB2(src, src_stride, src3, src4);
3304  XORI_B2_128_SB(src3, src4);
3305 
3306  /* row 3 */
3307  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3308  dst3 = const_vec;
3309  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3310 
3311  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3312  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3313  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3314  dst0_r >>= 6;
3315  dst0_l >>= 6;
3316 
3317  /* row 4 */
3318  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3319  dst4 = const_vec;
3320  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3321 
3322  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3323  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3324  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3325  dst1_r >>= 6;
3326  dst1_l >>= 6;
3327 
3328  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
3329  SRARI_H2_SH(out0_r, out1_r, 6);
3330  CLIP_SH2_0_255(out0_r, out1_r);
3331  out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);
3332 
3333  ST8x2_UB(out0_r, dst, dst_stride);
3334 }
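The y-filter setup repeated in each kernel, __msa_clti_s_b(filter_vec, 0) followed by __msa_ilvr_b, is a vector sign extension: clti produces an all-ones byte wherever a tap is negative, and the interleave places that byte above each int8 tap, yielding the int16 lanes that SPLATI_W2_SW then splats as 32-bit tap pairs. A scalar sketch of the same widening (widen_taps is an illustrative name; assumes two's complement, as the MSA code itself does):

#include <stdint.h>

/* Scalar equivalent of the clti_s_b + ilvr_b pairing: prepend 0xFF above
 * each negative int8 tap and 0x00 above each non-negative one. */
static void widen_taps(const int8_t taps[4], int16_t out[4])
{
    for (int i = 0; i < 4; i++) {
        uint16_t sign = taps[i] < 0 ? 0xFF00 : 0x0000; /* clti_s_b mask */

        out[i] = (int16_t) (sign | (uint8_t) taps[i]); /* ilvr_b        */
    }
}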
3335 
3336 static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
3337  int32_t src_stride,
3338  uint8_t *dst,
3339  int32_t dst_stride,
3340  const int8_t *filter_x,
3341  const int8_t *filter_y,
3342  int32_t height)
3343 {
3344  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3345  v8i16 filt0, filt1;
3346  v4i32 filt_h0, filt_h1;
3347  v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3348  v16i8 mask1;
3349  v8i16 filter_vec, const_vec;
3350  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3351  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3352  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3353  v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3354  v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3355  v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3356  v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3357  v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3358  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
3359 
3360  src -= (src_stride + 1);
3361 
3362  filter_vec = LD_SH(filter_x);
3363  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3364 
3365  filter_vec = LD_SH(filter_y);
3366  vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3367  filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3368 
3369  SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3370 
3371  mask1 = mask0 + 2;
3372 
3373  const_vec = __msa_ldi_h(128);
3374  const_vec <<= 6;
3375 
3376  LD_SB3(src, src_stride, src0, src1, src2);
3377  src += (3 * src_stride);
3378 
3379  XORI_B3_128_SB(src0, src1, src2);
3380 
3381  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3382  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3383  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3384 
3385  dst0 = const_vec;
3386  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3387  dst1 = const_vec;
3388  DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3389  dst2 = const_vec;
3390  DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3391 
3392  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3393  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3394 
3395  LD_SB2(src, src_stride, src3, src4);
3396  src += (2 * src_stride);
3397 
3398  XORI_B2_128_SB(src3, src4);
3399 
3400  /* row 3 */
3401  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3402  dst3 = const_vec;
3403  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3404 
3405  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3406  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3407  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3408 
3409  dst0_r >>= 6;
3410  dst0_l >>= 6;
3411 
3412  /* row 4 */
3413  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3414  dst4 = const_vec;
3415  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3416 
3417  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3418  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3419  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3420  dst1_r >>= 6;
3421  dst1_l >>= 6;
3422 
3423  LD_SB2(src, src_stride, src5, src6);
3424  src += (2 * src_stride);
3425 
3426  XORI_B2_128_SB(src5, src6);
3427 
3428  /* row 5 */
3429  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3430  dst5 = const_vec;
3431  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3432 
3433  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3434  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3435  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3436  dst2_r >>= 6;
3437  dst2_l >>= 6;
3438 
3439  /* row 6 */
3440  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3441  dst6 = const_vec;
3442  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
3443 
3444  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3445  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3446  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3447  dst3_r >>= 6;
3448  dst3_l >>= 6;
3449 
3450  LD_SB2(src, src_stride, src7, src8);
3451  src += (2 * src_stride);
3452 
3453  XORI_B2_128_SB(src7, src8);
3454 
3455  /* row 7 */
3456  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3457  dst7 = const_vec;
3458  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
3459 
3460  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
3461  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3462  dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
3463 
3464  dst4_r >>= 6;
3465  dst4_l >>= 6;
3466 
3467  /* row 8 */
3468  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
3469  dst8 = const_vec;
3470  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
3471 
3472  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
3473  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3474  dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
3475  dst5_r >>= 6;
3476  dst5_l >>= 6;
3477 
3478  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3479  dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
3480  PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
3481  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3482  SRARI_H2_SH(out4_r, out5_r, 6);
3483  CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
3484  CLIP_SH2_0_255(out4_r, out5_r);
3485 
3486  PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3487  out2_r = (v8i16) __msa_pckev_b((v16i8) out5_r, (v16i8) out4_r);
3488 
3489  ST8x4_UB(out0_r, out1_r, dst, dst_stride);
3490  dst += (4 * dst_stride);
3491  ST8x2_UB(out2_r, dst, dst_stride);
3492 }
3493 
3494 static void hevc_hv_uni_4t_8w_mult_msa(uint8_t *src,
3495  int32_t src_stride,
3496  uint8_t *dst,
3497  int32_t dst_stride,
3498  const int8_t *filter_x,
3499  const int8_t *filter_y,
3500  int32_t height,
3501  int32_t width)
3502 {
3503  uint32_t loop_cnt, cnt;
3504  uint8_t *src_tmp;
3505  uint8_t *dst_tmp;
3506  v16i8 src0, src1, src2, src3, src4, src5, src6;
3507  v8i16 filt0, filt1;
3508  v4i32 filt_h0, filt_h1;
3509  v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3510  v16i8 mask1;
3511  v8i16 filter_vec, const_vec;
3512  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3513  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3514  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3515  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3516  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3517  v8i16 out0_r, out1_r, out2_r, out3_r;
3518 
3519  src -= (src_stride + 1);
3520 
3521  filter_vec = LD_SH(filter_x);
3522  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3523 
3524  filter_vec = LD_SH(filter_y);
3525  vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3526  filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3527 
3528  SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3529 
3530  mask1 = mask0 + 2;
3531 
3532  const_vec = __msa_ldi_h(128);
3533  const_vec <<= 6;
3534 
3535  for (cnt = width >> 3; cnt--;) {
3536  src_tmp = src;
3537  dst_tmp = dst;
3538 
3539  LD_SB3(src_tmp, src_stride, src0, src1, src2);
3540  src_tmp += (3 * src_stride);
3541 
3542  XORI_B3_128_SB(src0, src1, src2);
3543 
3544  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3545  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3546  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3547 
3548  dst0 = const_vec;
3549  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3550  dst1 = const_vec;
3551  DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3552  dst2 = const_vec;
3553  DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3554 
3555  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3556  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3557 
3558  for (loop_cnt = height >> 2; loop_cnt--;) {
3559  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3560  src_tmp += (4 * src_stride);
3561 
3562  XORI_B4_128_SB(src3, src4, src5, src6);
3563 
3564  /* row 3 */
3565  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3566  dst3 = const_vec;
3567  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3568 
3569  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3570  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3571  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3572 
3573  dst0_r >>= 6;
3574  dst0_l >>= 6;
3575 
3576  /* row 4 */
3577  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3578  dst4 = const_vec;
3579  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3580 
3581  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3582  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3583  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3584  dst1_r >>= 6;
3585  dst1_l >>= 6;
3586 
3587  /* row 5 */
3588  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3589  dst5 = const_vec;
3590  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3591 
3592  ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
3593  dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
3594  dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
3595 
3596  dst2_r >>= 6;
3597  dst2_l >>= 6;
3598 
3599  /* row 6 */
3600  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3601  dst2 = const_vec;
3602  DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3603 
3604  ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
3605  dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
3606  dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
3607 
3608  dst3_r >>= 6;
3609  dst3_l >>= 6;
3610 
3611  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3612  dst2_l, dst2_r, dst3_l, dst3_r,
3613  out0_r, out1_r, out2_r, out3_r);
3614 
3615  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3616  CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
3617 
3618  PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3619  ST8x4_UB(out0_r, out1_r, dst_tmp, dst_stride);
3620  dst_tmp += (4 * dst_stride);
3621  }
3622 
3623  src += 8;
3624  dst += 8;
3625  }
3626 }
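hevc_hv_uni_4t_8w_mult_msa walks the block as width/8 independent 8-pixel columns, re-running the three-row prologue and the four-rows-per-pass vertical loop from saved src/dst pointers before stepping both bases right by 8; the 16/24/32-wide wrappers below simply pass the matching width. The control skeleton, with the filtering elided (tile_8wide is an illustrative name):

#include <stdint.h>

/* Control skeleton of the column-tiled kernel: width/8 independent 8-wide
 * columns, each with its own 3-row prologue and a 4-rows-per-pass loop. */
static void tile_8wide(uint8_t *src, int32_t src_stride,
                       uint8_t *dst, int32_t dst_stride,
                       int32_t height, int32_t width)
{
    for (uint32_t cnt = width >> 3; cnt--; src += 8, dst += 8) {
        uint8_t *src_tmp = src;
        uint8_t *dst_tmp = dst;

        /* 3-row prologue would run here on src_tmp */
        src_tmp += 3 * src_stride;

        for (uint32_t loop_cnt = height >> 2; loop_cnt--;) {
            /* filter and store one 8x4 block at dst_tmp here */
            src_tmp += 4 * src_stride;
            dst_tmp += 4 * dst_stride;
        }
    }
}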
3627 
3628 static void hevc_hv_uni_4t_8w_msa(uint8_t *src,
3629  int32_t src_stride,
3630  uint8_t *dst,
3631  int32_t dst_stride,
3632  const int8_t *filter_x,
3633  const int8_t *filter_y,
3634  int32_t height)
3635 {
3636  if (2 == height) {
3637  hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
3638  filter_x, filter_y, height);
3639  } else if (6 == height) {
3640  hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
3641  filter_x, filter_y, height);
3642  } else if (0 == (height % 4)) {
3643  hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3644  filter_x, filter_y, height, 8);
3645  }
3646 }
3647 
3648 static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
3649  int32_t src_stride,
3650  uint8_t *dst,
3651  int32_t dst_stride,
3652  const int8_t *filter_x,
3653  const int8_t *filter_y,
3654  int32_t height)
3655 {
3656  hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3657  filter_x, filter_y, height, 8);
3658 
3659  hevc_hv_uni_4t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
3660  filter_x, filter_y, height);
3661 }
3662 
3663 static void hevc_hv_uni_4t_16w_msa(uint8_t *src,
3664  int32_t src_stride,
3665  uint8_t *dst,
3666  int32_t dst_stride,
3667  const int8_t *filter_x,
3668  const int8_t *filter_y,
3669  int32_t height)
3670 {
3671  hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3672  filter_x, filter_y, height, 16);
3673 }
3674 
3675 static void hevc_hv_uni_4t_24w_msa(uint8_t *src,
3676  int32_t src_stride,
3677  uint8_t *dst,
3678  int32_t dst_stride,
3679  const int8_t *filter_x,
3680  const int8_t *filter_y,
3681  int32_t height)
3682 {
3683  hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3684  filter_x, filter_y, height, 24);
3685 }
3686 
3687 static void hevc_hv_uni_4t_32w_msa(uint8_t *src,
3688  int32_t src_stride,
3689  uint8_t *dst,
3690  int32_t dst_stride,
3691  const int8_t *filter_x,
3692  const int8_t *filter_y,
3693  int32_t height)
3694 {
3695  hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3696  filter_x, filter_y, height, 32);
3697 }
3698 
3699 #define UNI_MC_COPY(WIDTH) \
3700 void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
3701  ptrdiff_t dst_stride, \
3702  uint8_t *src, \
3703  ptrdiff_t src_stride, \
3704  int height, \
3705  intptr_t mx, \
3706  intptr_t my, \
3707  int width) \
3708 { \
3709  copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height); \
3710 }
3711 
3712 UNI_MC_COPY(8);
3713 UNI_MC_COPY(12);
3714 UNI_MC_COPY(16);
3715 UNI_MC_COPY(24);
3716 UNI_MC_COPY(32);
3717 UNI_MC_COPY(48);
3718 UNI_MC_COPY(64);
3719 
3720 #undef UNI_MC_COPY
3721 
3722 #define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
3723 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
3724  ptrdiff_t \
3725  dst_stride, \
3726  uint8_t *src, \
3727  ptrdiff_t \
3728  src_stride, \
3729  int height, \
3730  intptr_t mx, \
3731  intptr_t my, \
3732  int width) \
3733 { \
3734  const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
3735  \
3736  common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
3737  filter, height); \
3738 }
3739 
3740 UNI_MC(qpel, h, 4, 8, hz, mx);
3741 UNI_MC(qpel, h, 8, 8, hz, mx);
3742 UNI_MC(qpel, h, 12, 8, hz, mx);
3743 UNI_MC(qpel, h, 16, 8, hz, mx);
3744 UNI_MC(qpel, h, 24, 8, hz, mx);
3745 UNI_MC(qpel, h, 32, 8, hz, mx);
3746 UNI_MC(qpel, h, 48, 8, hz, mx);
3747 UNI_MC(qpel, h, 64, 8, hz, mx);
3748 
3749 UNI_MC(qpel, v, 4, 8, vt, my);
3750 UNI_MC(qpel, v, 8, 8, vt, my);
3751 UNI_MC(qpel, v, 12, 8, vt, my);
3752 UNI_MC(qpel, v, 16, 8, vt, my);
3753 UNI_MC(qpel, v, 24, 8, vt, my);
3754 UNI_MC(qpel, v, 32, 8, vt, my);
3755 UNI_MC(qpel, v, 48, 8, vt, my);
3756 UNI_MC(qpel, v, 64, 8, vt, my);
3757 
3758 UNI_MC(epel, h, 4, 4, hz, mx);
3759 UNI_MC(epel, h, 6, 4, hz, mx);
3760 UNI_MC(epel, h, 8, 4, hz, mx);
3761 UNI_MC(epel, h, 12, 4, hz, mx);
3762 UNI_MC(epel, h, 16, 4, hz, mx);
3763 UNI_MC(epel, h, 24, 4, hz, mx);
3764 UNI_MC(epel, h, 32, 4, hz, mx);
3765 
3766 UNI_MC(epel, v, 4, 4, vt, my);
3767 UNI_MC(epel, v, 6, 4, vt, my);
3768 UNI_MC(epel, v, 8, 4, vt, my);
3769 UNI_MC(epel, v, 12, 4, vt, my);
3770 UNI_MC(epel, v, 16, 4, vt, my);
3771 UNI_MC(epel, v, 24, 4, vt, my);
3772 UNI_MC(epel, v, 32, 4, vt, my);
3773 
3774 #undef UNI_MC
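Expanded by hand for one instantiation, UNI_MC(qpel, h, 4, 8, hz, mx) produces a thin wrapper that selects the qpel filter row from the fractional offset mx and forwards to the common horizontal kernel:

void ff_hevc_put_hevc_uni_qpel_h4_8_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                        uint8_t *src, ptrdiff_t src_stride,
                                        int height, intptr_t mx,
                                        intptr_t my, int width)
{
    const int8_t *filter = ff_hevc_qpel_filters[mx - 1];

    common_hz_8t_4w_msa(src, src_stride, dst, dst_stride, filter, height);
}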
3775 
3776 #define UNI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \
3777 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
3778  ptrdiff_t \
3779  dst_stride, \
3780  uint8_t *src, \
3781  ptrdiff_t \
3782  src_stride, \
3783  int height, \
3784  intptr_t mx, \
3785  intptr_t my, \
3786  int width) \
3787 { \
3788  const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
3789  const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
3790  \
3791  hevc_##DIR1##_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \
3792  dst_stride, filter_x, \
3793  filter_y, height); \
3794 }
3795 
3796 UNI_MC_HV(qpel, hv, 4, 8, hv);
3797 UNI_MC_HV(qpel, hv, 8, 8, hv);
3798 UNI_MC_HV(qpel, hv, 12, 8, hv);
3799 UNI_MC_HV(qpel, hv, 16, 8, hv);
3800 UNI_MC_HV(qpel, hv, 24, 8, hv);
3801 UNI_MC_HV(qpel, hv, 32, 8, hv);
3802 UNI_MC_HV(qpel, hv, 48, 8, hv);
3803 UNI_MC_HV(qpel, hv, 64, 8, hv);
3804 
3805 UNI_MC_HV(epel, hv, 4, 4, hv);
3806 UNI_MC_HV(epel, hv, 6, 4, hv);
3807 UNI_MC_HV(epel, hv, 8, 4, hv);
3808 UNI_MC_HV(epel, hv, 12, 4, hv);
3809 UNI_MC_HV(epel, hv, 16, 4, hv);
3810 UNI_MC_HV(epel, hv, 24, 4, hv);
3811 UNI_MC_HV(epel, hv, 32, 4, hv);
3812 
3813 #undef UNI_MC_HV
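Likewise, UNI_MC_HV(epel, hv, 4, 4, hv) expands by hand to a wrapper that picks both filter rows, one from mx and one from my, and calls the 4-tap hv kernel defined above:

void ff_hevc_put_hevc_uni_epel_hv4_8_msa(uint8_t *dst, ptrdiff_t dst_stride,
                                         uint8_t *src, ptrdiff_t src_stride,
                                         int height, intptr_t mx,
                                         intptr_t my, int width)
{
    const int8_t *filter_x = ff_hevc_epel_filters[mx - 1];
    const int8_t *filter_y = ff_hevc_epel_filters[my - 1];

    hevc_hv_uni_4t_4w_msa(src, src_stride, dst, dst_stride,
                          filter_x, filter_y, height);
}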