/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "hevcdsp_mips.h"
#include "hevc_macros_msa.h"

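/* Byte-shuffle control vectors for VSHF.B: the first row gathers the
 * overlapping source bytes for 8-pixel-wide filtering; the next two rows
 * serve the 4-pixel-wide cases, where indices >= 16 select bytes from the
 * second source operand so two rows can be filtered per vector. */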
static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};

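/* Horizontally filter four 4-pixel rows with an 8-tap filter.  Each
 * VSHF_B2_SB gathers one tap pair's input bytes from a pair of source rows,
 * DOTP_SB2_SH starts the 16-bit accumulation with filt0 and the DPADD_SB2_SH
 * calls add the remaining taps; out0/out1 each hold two filtered rows. */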
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                \
                                   mask0, mask1, mask2, mask3,            \
                                   filt0, filt1, filt2, filt3,            \
                                   out0, out1)                            \
{                                                                         \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
                                                                          \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);     \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);     \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);               \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);     \
    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);               \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);     \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1);               \
}

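/* Same as above for four 8-pixel rows, one output vector per row.  The taps
 * are accumulated in the order filt0, filt2, filt1, filt3 so the vec0..vec3
 * and vec4..vec7 temporaries can each be reused for two tap positions. */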
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, mask2, mask3,              \
                                   filt0, filt1, filt2, filt3,              \
                                   out0, out1, out2, out3)                  \
{                                                                           \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;   \
                                                                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);       \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
                out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);       \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2,       \
                 filt2, out0, out1, out2, out3);                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);       \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1,       \
                 filt1, out0, out1, out2, out3);                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);       \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3,       \
                 filt3, out0, out1, out2, out3);                            \
}

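/* 4-tap counterparts of the two macros above. */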
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,            \
                                   mask0, mask1, filt0, filt1,        \
                                   out0, out1)                        \
{                                                                     \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                             \
                                                                      \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);            \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);           \
}

#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, filt0, filt1,              \
                                   out0, out1, out2, out3)                  \
{                                                                           \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                   \
                                                                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);       \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
                out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);       \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1,       \
                 filt1, out0, out1, out2, out3);                            \
}

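/* Plain block copies (the unfiltered MC case): 8-wide rows go through
 * 64-bit GPR loads/stores (LD/SD), wider rows through 16-byte vector
 * loads/stores, unrolled four or eight rows at a time. */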
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    if (2 == height) {
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (6 == height) {
        LD4(src, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            LD4(src, src_stride, out4, out5, out6, out7);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

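/* 12-wide copy of a fixed 16 rows; the height argument goes unused. */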
static void copy_width12_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
    dst += (8 * dst_stride);
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
}

static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (12 == height) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        dst += (8 * dst_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        dst += (4 * dst_stride);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
                   src7);
            src += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
                   dst_stride);
            dst += (8 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void copy_width24_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    for (cnt = 4; cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD4(src + 16, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        LD4(src + 16, src_stride, out4, out5, out6, out7);
        src += (4 * src_stride);

        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        SD4(out0, out1, out2, out3, dst + 16, dst_stride);
        dst += (4 * dst_stride);
        SD4(out4, out5, out6, out7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width32_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width48_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
        src += (4 * src_stride);

        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width64_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_UB4(src, 16, src4, src5, src6, src7);
        src += src_stride;
        LD_UB4(src, 16, src8, src9, src10, src11);
        src += src_stride;
        LD_UB4(src, 16, src12, src13, src14, src15);
        src += src_stride;

        ST_UB4(src0, src1, src2, src3, dst, 16);
        dst += dst_stride;
        ST_UB4(src4, src5, src6, src7, dst, 16);
        dst += dst_stride;
        ST_UB4(src8, src9, src10, src11, dst, 16);
        dst += dst_stride;
        ST_UB4(src12, src13, src14, src15, dst, 16);
        dst += dst_stride;
    }
}

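/* Horizontal 8-tap luma filtering.  src is rebased by -3 so the shuffle
 * masks cover the full 8-tap window around each output pixel.  Input bytes
 * are XORed with 128 to map unsigned pixels onto signed values for the
 * signed dot products; SRARI rounds and shifts by 6, SAT clamps to signed
 * 8-bit range and PCKEV_XORI128_UB packs to bytes and undoes the offset. */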
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    SRARI_H2_SH(out0, out1, 6);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}

static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

static void common_hz_8t_4x16_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
    dst += (8 * dst_stride);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);

    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

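/* Only heights 4, 8 and 16 are handled for 4-wide blocks. */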
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (4 == height) {
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    } else if (16 == height) {
        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
    }
}

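/* 8-wide horizontal 8-tap, four rows per iteration; the body is the
 * shuffle/dot-product sequence of HORIZ_8TAP_8WID_4VECS_FILT expanded
 * inline. */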
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0,
                    filt0, out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2,
                     filt2, out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1,
                     filt1, out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3,
                     filt3, out0, out1, out2, out3);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

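/* 12-wide horizontal 8-tap: each row is processed as an 8-wide part
 * (mask00..mask3 over bytes 0..15) plus a 4-wide part (mask0/mask4..mask6
 * over bytes 8..23), four rows per iteration over a fixed 16-row block. */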
static void common_hz_8t_12w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
    v16u8 tmp0, tmp1, tmp2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3, out4, out5;

    mask00 = LD_UB(&ff_hevc_mask_arr[0]);
    mask0 = LD_UB(&ff_hevc_mask_arr[16]);

    src = src - 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask00 + 2;
    mask2 = mask00 + 4;
    mask3 = mask00 + 6;
    mask4 = mask0 + 2;
    mask5 = mask0 + 4;
    mask6 = mask0 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        /* 8 width */
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        /* 4 width */
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
                    out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
                     out1, out2, out3);

        /* 4 width */
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
        DPADD_SB2_SH(vec6, vec7, filt3, filt3, out4, out5);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SRARI_H2_SH(out4, out5, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH2_SH(out4, out5, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        tmp2 = PCKEV_XORI128_UB(out4, out5);

        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

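/* 16-wide horizontal 8-tap: each row is loaded as two overlapping 16-byte
 * vectors (offsets 0 and 8) and filtered as two 8-wide halves. */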
static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);

        LD_SB2(src, src_stride, src4, src6);
        LD_SB2(src + 8, src_stride, src5, src7);
        src += (2 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

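/* 24-wide horizontal 8-tap over a fixed 32-row block; the mask4..mask7
 * shuffles span the seam between the two 16-byte loads of a row. */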
static void common_hz_8t_24w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 out0, out1, out2, out3, out8, out9, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 16, src_stride, src1, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (2 * src_stride);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
                    out8, out2, out9);
        DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
        DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec1, vec3, filt2, filt2, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt3, filt3, out1, out3);
        SRARI_H4_SH(out0, out8, out2, out9, 6);
        SRARI_H2_SH(out1, out3, 6);
        SAT_SH4_SH(out0, out8, out2, out9, 7);
        SAT_SH2_SH(out1, out3, 7);
        out = PCKEV_XORI128_UB(out8, out9);
        ST_D2(out, 0, 1, dst + 16, dst_stride);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

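/* 32-wide horizontal 8-tap, two rows per iteration, four 8-wide segments
 * per row. */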
static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        src4 = LD_SB(src);
        src5 = LD_SB(src + 8);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src4, src5, src6, src7);

        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);

        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}

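/* 48-wide horizontal 8-tap over a fixed 64-row block.  Loads at offsets 0,
 * 8, 16, 32 and 40 cover each row; the mask4..mask7 shuffles bridge the gap
 * between src2 and src3. */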
static void common_hz_8t_48w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
    v16i8 src4;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 32);
        src4 = LD_SB(src + 40);
        src += src_stride;

        XORI_B4_128_SB(src0, src1, src2, src3);
        src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out3 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);

        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out2 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out2, 7);
        out = PCKEV_XORI128_UB(out3, out0);
        ST_UB(out, dst + 16);
        out = PCKEV_XORI128_UB(out1, out2);
        ST_UB(out, dst + 32);
        dst += dst_stride;
    }
}

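/* 64-wide horizontal 8-tap, one row per iteration as eight 8-wide
 * segments. */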
static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 res0, res1, res2, res3, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
        src += src_stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 16);

        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst + 32);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 48);
        dst += dst_stride;
    }
}

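/* Vertical 8-tap for 4-wide blocks.  Rows are interleaved bytewise (ILVR_B)
 * and pairs of 4-wide interleaves are merged with ILVR_D, so each signed dot
 * product filters two output rows at once.  The main loop produces eight
 * rows per iteration; the tail loop handles remaining rows two at a time. */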
static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint32_t res = (height & 0x07) >> 1;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v8i16 filt, out10, out32, out54, out76;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
               src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        LD_SB4(src, src_stride, src11, src12, src13, src14);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
        ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
                   src12111110, src14131312);
        XORI_B2_128_SB(src8776, src10998);
        XORI_B2_128_SB(src12111110, src14131312);

        DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
        DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
        DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
        DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
        DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
        DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
        DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
        DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
        SRARI_H2_SH(out10, out32, 6);
        SRARI_H2_SH(out54, out76, 6);
        SAT_SH2_SH(out10, out32, 7);
        SAT_SH2_SH(out54, out76, 7);
        out0 = PCKEV_XORI128_UB(out10, out32);
        out1 = PCKEV_XORI128_UB(out54, out76);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
    for (; res--;) {
        LD_SB2(src, src_stride, src7, src8);
        src += 2 * src_stride;
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_r, (v2i64) src76_r);
        src8776 = (v16i8) __msa_xori_b((v16u8) src8776, 128);
        out10 = __msa_dotp_s_h(src2110, filt0);
        out10 = __msa_dpadd_s_h(out10, src4332, filt1);
        out10 = __msa_dpadd_s_h(out10, src6554, filt2);
        out10 = __msa_dpadd_s_h(out10, src8776, filt3);
        out10 = __msa_srari_h(out10, 6);
        out10 = __msa_sat_s_h(out10, 7);
        out0 = (v16u8) __msa_pckev_b((v16i8) out10, (v16i8) out10);
        out0 = __msa_xori_b(out0, 128);
        ST_W2(out0, 0, 1, dst, dst_stride);
        dst += 2 * dst_stride;
        src2110 = src4332;
        src4332 = src6554;
        src6554 = src8776;
        src6 = src8;
    }
}

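/* Vertical 8-tap for 8-wide blocks: a sliding window of six interleaved row
 * pairs (src10_r..src65_r) is shifted down by four rows per iteration. */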
static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
                    filt0, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
                     filt1, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
                     filt2, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
                     filt3, out0_r, out1_r, out2_r, out3_r);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

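/* Vertical 8-tap for 12-wide blocks over a fixed 16-row block: full
 * 16-column filtering (right and left interleaves), but only 8 + 4 bytes are
 * stored per row. */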
static void common_vt_8t_12w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint32_t out2, out3;
    uint64_t out0, out1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);

        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
        out1 = __msa_copy_u_d((v2i64) tmp1, 0);
        out2 = __msa_copy_u_w((v4i32) tmp0, 2);
        out3 = __msa_copy_u_w((v4i32) tmp1, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;
        out0 = __msa_copy_u_d((v2i64) tmp2, 0);
        out1 = __msa_copy_u_d((v2i64) tmp3, 0);
        out2 = __msa_copy_u_w((v4i32) tmp2, 2);
        out3 = __msa_copy_u_w((v4i32) tmp3, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

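/* Vertical 8-tap for 16-wide blocks; same scheme as the 12-wide version,
 * with full 16-byte stores. */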
static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

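/* Column-tiled variant of the 16-wide vertical filter: the block is
 * processed in 16-pixel-wide stripes, used by the width 24..64 wrappers
 * below. */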
static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    const uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            XORI_B4_128_SB(src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
            out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r,
                                       filt0, filt1, filt2, filt3);
            out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r,
                                       filt0, filt1, filt2, filt3);
            out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r,
                                       filt0, filt1, filt2, filt3);
            out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r,
                                       filt0, filt1, filt2, filt3);
            out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l,
                                       filt0, filt1, filt2, filt3);
            out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l,
                                       filt0, filt1, filt2, filt3);
            out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l,
                                       filt0, filt1, filt2, filt3);
            out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l,
                                       filt0, filt1, filt2, filt3);
            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}

static void common_vt_8t_24w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              16);

    common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
                        height);
}

static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              32);
}

static void common_vt_8t_48w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              48);
}

static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              64);
}

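/* 4-wide 2D (horizontal + vertical) 8-tap filtering.  The horizontal pass
 * produces 16-bit intermediates with two rows packed per vector (dst30
 * holds rows 3 and 0, etc.); the vertical pass runs 32-bit dot products on
 * interleaved halfword columns.  A plain arithmetic shift by 6 normalizes
 * the first pass and a rounding shift by 6 the second, before saturating
 * and packing to bytes. */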
static void hevc_hv_uni_8t_4w_msa(const uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    uint32_t loop_cnt;
    uint32_t res = height & 0x07;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
               src14);
        src += (8 * src_stride);
        XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);

        VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                   filt3);
        dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);
        dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                   filt2, filt3);
        dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                    filt2, filt3);

        dst76_r = __msa_ilvr_h(dst117, dst66);
        ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
        ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
        ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
        dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
        dst1110_r = __msa_ilvr_h(dst117, dst1410);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
        SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
        out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst32_r = dst1110_r;
        dst54_r = dst1312_r;
        dst21_r = dst109_r;
        dst43_r = dst1211_r;
        dst65_r = dst1413_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
    }
    if (res) {
        LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
               src14);
        XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);

        VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                   filt3);
        dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);
        dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                   filt2, filt3);
        dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                    filt2, filt3);

        dst76_r = __msa_ilvr_h(dst117, dst66);
        ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
        ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
        ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
        dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
        dst1110_r = __msa_ilvr_h(dst117, dst1410);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
        SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1544  PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1545  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1546  out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1547  if (res == 2) {
1548  ST_W2(out0, 0, 1, dst, dst_stride);
1549  } else if (res == 4) {
1550  ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
1551  } else {
1552  ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
1553  ST_W2(out1, 0, 1, dst + 4 * dst_stride, dst_stride);
1554  }
1555  }
1556 }
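
/* Reference model (not compiled): a minimal scalar sketch of what the
 * HV 8-tap uni paths in this file vectorize, assuming 8-bit samples and
 * HEVC's 6-bit normalized taps (each filter sums to 64). The vector code
 * XORs source bytes with 128 so it can use signed byte dot products;
 * because the taps sum to 64 the bias cancels through the two >>6
 * stages, and PCKEV_XORI128_UB flips the sign bit back when packing to
 * unsigned bytes.
 *
 *     int x, y, i, j;
 *     for (y = 0; y < height; y++)
 *         for (x = 0; x < width; x++) {
 *             int32_t acc = 0;
 *             for (j = 0; j < 8; j++) {               // vertical taps
 *                 int32_t hsum = 0;
 *                 for (i = 0; i < 8; i++)             // horizontal taps
 *                     hsum += filter_x[i] * src[(y + j - 3) * src_stride + x + i - 3];
 *                 acc += filter_y[j] * hsum;
 *             }
 *             dst[y * dst_stride + x] = av_clip_uint8(((acc >> 6) + 32) >> 6);
 *         }
 */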
1557 
1558 static void hevc_hv_uni_8t_8multx2mult_msa(const uint8_t *src,
1559  int32_t src_stride,
1560  uint8_t *dst,
1561  int32_t dst_stride,
1562  const int8_t *filter_x,
1563  const int8_t *filter_y,
1564  int32_t height, int32_t width)
1565 {
1566  uint32_t loop_cnt, cnt;
1567  const uint8_t *src_tmp;
1568  uint8_t *dst_tmp;
1569  v16u8 out;
1570  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1571  v8i16 filt0, filt1, filt2, filt3;
1572  v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1573  v16i8 mask1, mask2, mask3;
1574  v8i16 filter_vec;
1575  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1576  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1577  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1578  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1579  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1580  v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1581  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1582  v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1583  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1584 
1585  src -= ((3 * src_stride) + 3);
1586 
1587  filter_vec = LD_SH(filter_x);
1588  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1589 
1590  filter_vec = LD_SH(filter_y);
1591  UNPCK_R_SB_SH(filter_vec, filter_vec);
1592 
1593  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1594 
1595  mask1 = mask0 + 2;
1596  mask2 = mask0 + 4;
1597  mask3 = mask0 + 6;
1598 
1599  for (cnt = width >> 3; cnt--;) {
1600  src_tmp = src;
1601  dst_tmp = dst;
1602 
1603  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1604  src_tmp += (7 * src_stride);
1605  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1606 
1607  /* row 0 row 1 row 2 row 3 */
1608  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1609  vec0, vec1, vec2, vec3);
1610  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1611  vec4, vec5, vec6, vec7);
1612  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1613  vec8, vec9, vec10, vec11);
1614  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1615  vec12, vec13, vec14, vec15);
1616  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1617  filt3);
1618  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1619  filt3);
1620  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1621  filt3);
1622  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1623  filt2, filt3);
1624 
1625  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1626  vec0, vec1, vec2, vec3);
1627  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1628  vec4, vec5, vec6, vec7);
1629  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1630  vec8, vec9, vec10, vec11);
1631  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1632  filt3);
1633  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1634  filt3);
1635  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1636  filt3);
1637 
1638  for (loop_cnt = height >> 1; loop_cnt--;) {
1639  LD_SB2(src_tmp, src_stride, src7, src8);
1640  XORI_B2_128_SB(src7, src8);
1641  src_tmp += 2 * src_stride;
1642 
1643  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1644  dst10_r, dst32_r, dst54_r, dst21_r);
1645  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1646  dst10_l, dst32_l, dst54_l, dst21_l);
1647  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1648  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1649 
1650  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1651  vec0, vec1, vec2, vec3);
1652  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1653  filt2, filt3);
1654 
1655  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1656  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1657  filt_h0, filt_h1, filt_h2, filt_h3);
1658  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1659  filt_h0, filt_h1, filt_h2, filt_h3);
1660  dst0_r >>= 6;
1661  dst0_l >>= 6;
1662 
1663  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1664  vec0, vec1, vec2, vec3);
1665  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1666  filt2, filt3);
1667 
1668  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1669  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1670  filt_h0, filt_h1, filt_h2, filt_h3);
1671  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1672  filt_h0, filt_h1, filt_h2, filt_h3);
1673  dst1_r >>= 6;
1674  dst1_l >>= 6;
1675  SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1676  SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1677 
1678  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1679  out = PCKEV_XORI128_UB(dst0, dst1);
1680  ST_D2(out, 0, 1, dst_tmp, dst_stride);
1681  dst_tmp += (2 * dst_stride);
1682 
1683  dst0 = dst2;
1684  dst1 = dst3;
1685  dst2 = dst4;
1686  dst3 = dst5;
1687  dst4 = dst6;
1688  dst5 = dst7;
1689  dst6 = dst8;
1690  }
1691 
1692  src += 8;
1693  dst += 8;
1694  }
1695 }
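
/* Processing note: the kernel above walks the block in 8-pixel-wide
 * column strips (cnt = width >> 3), two output rows per inner iteration.
 * Only the seven most recent rows of horizontally filtered data are live
 * at a time, so the loop tail shifts the window (dst0 = dst2, ...,
 * dst6 = dst8) instead of refiltering. The same idea in scalar form,
 * with hypothetical helper names:
 *
 *     int16_t win[8][8];                   // 7 live rows + 1 incoming
 *     hfilter_rows(win, src, 7);           // prologue: rows 0..6
 *     for (row = 0; row < height; row++) {
 *         hfilter_rows(&win[7], src + (row + 7) * src_stride, 1);
 *         vfilter_store(dst + row * dst_stride, win);
 *         memmove(win[0], win[1], 7 * sizeof(win[0]));   // slide by one
 *     }
 */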
1696 
1697 static void hevc_hv_uni_8t_8w_msa(const uint8_t *src,
1698  int32_t src_stride,
1699  uint8_t *dst,
1700  int32_t dst_stride,
1701  const int8_t *filter_x,
1702  const int8_t *filter_y,
1703  int32_t height)
1704 {
1705  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1706  filter_x, filter_y, height, 8);
1707 }
1708 
1709 static void hevc_hv_uni_8t_12w_msa(const uint8_t *src,
1710  int32_t src_stride,
1711  uint8_t *dst,
1712  int32_t dst_stride,
1713  const int8_t *filter_x,
1714  const int8_t *filter_y,
1715  int32_t height)
1716 {
1717  uint32_t loop_cnt;
1718  const uint8_t *src_tmp;
1719  uint8_t *dst_tmp;
1720  v16u8 out0, out1;
1721  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1722  v16i8 src11, src12, src13, src14;
1723  v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1724  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1725  v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1726  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1727  v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1728  v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1729  v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
1730  v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
1731  v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
1732  v8i16 dst1413_r, dst87_l, filter_vec;
1733  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1734  v4i32 dst0_l, dst1_l;
1735 
1736  src -= ((3 * src_stride) + 3);
1737 
1738  filter_vec = LD_SH(filter_x);
1739  SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1740 
1741  filter_vec = LD_SH(filter_y);
1742  UNPCK_R_SB_SH(filter_vec, filter_vec);
1743 
1744  SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1745 
1746  mask0 = LD_SB(ff_hevc_mask_arr);
1747  mask1 = mask0 + 2;
1748  mask2 = mask0 + 4;
1749  mask3 = mask0 + 6;
1750 
1751  src_tmp = src;
1752  dst_tmp = dst;
1753 
1754  LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1755  src_tmp += (7 * src_stride);
1756  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1757 
1758  /* row 0 row 1 row 2 row 3 */
1759  VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1760  VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1761  VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1762  vec11);
1763  VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1764  vec15);
1765  dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1766  filt3);
1767  dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1768  filt3);
1769  dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1770  filt3);
1771  dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1772  filt2, filt3);
1773 
1774  VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1775  VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1776  VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1777  vec11);
1778  dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1779  filt3);
1780  dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1781  filt3);
1782  dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1783  filt3);
1784 
1785  for (loop_cnt = 8; loop_cnt--;) {
1786  LD_SB2(src_tmp, src_stride, src7, src8);
1787  XORI_B2_128_SB(src7, src8);
1788  src_tmp += 2 * src_stride;
1789 
1790  ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
1791  dst32_r, dst54_r, dst21_r);
1792  ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
1793  dst32_l, dst54_l, dst21_l);
1794  ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1795  ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1796 
1797  VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1798  vec3);
1799  dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1800  filt3);
1801 
1802  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1803  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1804  filt_h0, filt_h1, filt_h2, filt_h3);
1805  dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1806  filt_h0, filt_h1, filt_h2, filt_h3);
1807  dst0_r >>= 6;
1808  dst0_l >>= 6;
1809 
1810  VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1811  vec3);
1812  dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1813  filt3);
1814 
1815  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1816  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1817  filt_h0, filt_h1, filt_h2, filt_h3);
1818  dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1819  filt_h0, filt_h1, filt_h2, filt_h3);
1820  dst1_r >>= 6;
1821  dst1_l >>= 6;
1822  SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1823  SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1824 
1825  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1826  out0 = PCKEV_XORI128_UB(dst0, dst1);
1827  ST_D2(out0, 0, 1, dst_tmp, dst_stride);
1828  dst_tmp += (2 * dst_stride);
1829 
1830  dst0 = dst2;
1831  dst1 = dst3;
1832  dst2 = dst4;
1833  dst3 = dst5;
1834  dst4 = dst6;
1835  dst5 = dst7;
1836  dst6 = dst8;
1837  }
1838 
1839  src += 8;
1840  dst += 8;
1841 
1842  mask4 = LD_SB(ff_hevc_mask_arr + 16);
1843  mask5 = mask4 + 2;
1844  mask6 = mask4 + 4;
1845  mask7 = mask4 + 6;
1846 
1847  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1848  src += (7 * src_stride);
1849  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1850 
1851  VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1852  VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
1853  VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1854  vec11);
1855  VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
1856  vec15);
1857 
1858  dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1859  filt3);
1860  dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1861  filt3);
1862  dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1863  filt3);
1864  dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1865  filt3);
1866 
1867  ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1868  ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1869  ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1870 
1871  dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1872 
1873  for (loop_cnt = 2; loop_cnt--;) {
1874  LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1875  src14);
1876  src += (8 * src_stride);
1877  XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1878 
1879  VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
1880  vec3);
1881  VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
1882  vec7);
1883  VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1884  vec11);
1885  VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
1886  vec14, vec15);
1887 
1888  dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1889  filt3);
1890  dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1891  filt3);
1892  dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1893  filt2, filt3);
1894  dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1895  filt2, filt3);
1896 
1897  dst76_r = __msa_ilvr_h(dst117, dst66);
1898  ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
1899  ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
1900  ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1901  dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1902  dst1110_r = __msa_ilvr_h(dst117, dst1410);
1903 
1904  dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1905  filt_h1, filt_h2, filt_h3);
1906  dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1907  filt_h1, filt_h2, filt_h3);
1908  dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1909  filt_h1, filt_h2, filt_h3);
1910  dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1911  filt_h1, filt_h2, filt_h3);
1912  dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1913  filt_h1, filt_h2, filt_h3);
1914  dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1915  filt_h1, filt_h2, filt_h3);
1916  dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1917  filt_h1, filt_h2, filt_h3);
1918  dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
1919  filt_h0, filt_h1, filt_h2, filt_h3);
1920 
1921  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1922  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1923  SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1924  SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1925  SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1926  SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1927  PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1928  PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1929  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1930  out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1931  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1932  dst += (8 * dst_stride);
1933 
1934  dst10_r = dst98_r;
1935  dst32_r = dst1110_r;
1936  dst54_r = dst1312_r;
1937  dst21_r = dst109_r;
1938  dst43_r = dst1211_r;
1939  dst65_r = dst1413_r;
1940  dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1941  }
1942 }
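
/* Width-12 note: the function above splits the block into an 8-wide
 * strip followed by a 4-wide strip. The 8-wide half uses the plain
 * single-source masks; the 4-wide half switches to ff_hevc_mask_arr + 16,
 * whose indices of 16 and above select bytes from the second VSHF
 * source, so one shuffle gathers the 4-wide window of two rows at once:
 *
 *     VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, ...);
 *     // indices  0..15 pick from src0 (one row)
 *     // indices 16..31 pick from src3 (a later row)
 */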
1943 
1944 static void hevc_hv_uni_8t_16w_msa(const uint8_t *src,
1945  int32_t src_stride,
1946  uint8_t *dst,
1947  int32_t dst_stride,
1948  const int8_t *filter_x,
1949  const int8_t *filter_y,
1950  int32_t height)
1951 {
1952  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1953  filter_x, filter_y, height, 16);
1954 }
1955 
1956 static void hevc_hv_uni_8t_24w_msa(const uint8_t *src,
1957  int32_t src_stride,
1958  uint8_t *dst,
1959  int32_t dst_stride,
1960  const int8_t *filter_x,
1961  const int8_t *filter_y,
1962  int32_t height)
1963 {
1964  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1965  filter_x, filter_y, height, 24);
1966 }
1967 
1968 static void hevc_hv_uni_8t_32w_msa(const uint8_t *src,
1969  int32_t src_stride,
1970  uint8_t *dst,
1971  int32_t dst_stride,
1972  const int8_t *filter_x,
1973  const int8_t *filter_y,
1974  int32_t height)
1975 {
1976  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1977  filter_x, filter_y, height, 32);
1978 }
1979 
1980 static void hevc_hv_uni_8t_48w_msa(const uint8_t *src,
1981  int32_t src_stride,
1982  uint8_t *dst,
1983  int32_t dst_stride,
1984  const int8_t *filter_x,
1985  const int8_t *filter_y,
1986  int32_t height)
1987 {
1988  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1989  filter_x, filter_y, height, 48);
1990 }
1991 
1992 static void hevc_hv_uni_8t_64w_msa(const uint8_t *src,
1993  int32_t src_stride,
1994  uint8_t *dst,
1995  int32_t dst_stride,
1996  const int8_t *filter_x,
1997  const int8_t *filter_y,
1998  int32_t height)
1999 {
2000  hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2001  filter_x, filter_y, height, 64);
2002 }
2003 
2004 static void common_hz_4t_4x2_msa(const uint8_t *src, int32_t src_stride,
2005  uint8_t *dst, int32_t dst_stride,
2006  const int8_t *filter)
2007 {
2008  v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
2009  v16u8 out;
2010  v8i16 filt, res0;
2011 
2012  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2013  src -= 1;
2014 
2015  /* rearranging filter */
2016  filt = LD_SH(filter);
2017  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2018 
2019  mask1 = mask0 + 2;
2020 
2021  LD_SB2(src, src_stride, src0, src1);
2022  XORI_B2_128_SB(src0, src1);
2023  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2024  res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2025  res0 = __msa_srari_h(res0, 6);
2026  res0 = __msa_sat_s_h(res0, 7);
2027  out = PCKEV_XORI128_UB(res0, res0);
2028  ST_W2(out, 0, 1, dst, dst_stride);
2029 }
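
/* Scalar model (not compiled) of the epilogue shared by the 4-tap
 * horizontal paths here, assuming 6-bit normalized taps: filter, add the
 * rounding constant 32, shift right by 6, clip to 8 bits.
 * __msa_srari_h / SRARI_* is the rounding shift, __msa_sat_s_h / SAT_*
 * the signed saturation, and PCKEV_XORI128_UB the pack back to unsigned
 * bytes.
 *
 *     int32_t sum = 0;
 *     for (k = 0; k < 4; k++)
 *         sum += filter[k] * src[x + k - 1];   // src was moved back by 1
 *     dst[x] = av_clip_uint8((sum + 32) >> 6);
 */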
2030 
2031 static void common_hz_4t_4x4_msa(const uint8_t *src, int32_t src_stride,
2032  uint8_t *dst, int32_t dst_stride,
2033  const int8_t *filter)
2034 {
2035  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2036  v8i16 filt, out0, out1;
2037  v16u8 out;
2038 
2039  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2040  src -= 1;
2041 
2042  /* rearranging filter */
2043  filt = LD_SH(filter);
2044  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2045 
2046  mask1 = mask0 + 2;
2047 
2048  LD_SB4(src, src_stride, src0, src1, src2, src3);
2049  XORI_B4_128_SB(src0, src1, src2, src3);
2050  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2051  filt0, filt1, out0, out1);
2052  SRARI_H2_SH(out0, out1, 6);
2053  SAT_SH2_SH(out0, out1, 7);
2054  out = PCKEV_XORI128_UB(out0, out1);
2055  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2056 }
2057 
2058 static void common_hz_4t_4x8_msa(const uint8_t *src, int32_t src_stride,
2059  uint8_t *dst, int32_t dst_stride,
2060  const int8_t *filter)
2061 {
2062  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2063  v16u8 out;
2064  v8i16 filt, out0, out1, out2, out3;
2065 
2066  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2067  src -= 1;
2068 
2069  /* rearranging filter */
2070  filt = LD_SH(filter);
2071  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2072 
2073  mask1 = mask0 + 2;
2074 
2075  LD_SB4(src, src_stride, src0, src1, src2, src3);
2076  src += (4 * src_stride);
2077 
2078  XORI_B4_128_SB(src0, src1, src2, src3);
2079  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2080  filt0, filt1, out0, out1);
2081  LD_SB4(src, src_stride, src0, src1, src2, src3);
2082  XORI_B4_128_SB(src0, src1, src2, src3);
2083  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2084  filt0, filt1, out2, out3);
2085  SRARI_H4_SH(out0, out1, out2, out3, 6);
2086  SAT_SH4_SH(out0, out1, out2, out3, 7);
2087  out = PCKEV_XORI128_UB(out0, out1);
2088  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2089  out = PCKEV_XORI128_UB(out2, out3);
2090  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2091 }
2092 
2093 static void common_hz_4t_4x16_msa(const uint8_t *src, int32_t src_stride,
2094  uint8_t *dst, int32_t dst_stride,
2095  const int8_t *filter)
2096 {
2097  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2098  v16i8 filt0, filt1, mask0, mask1;
2099  v16u8 out;
2100  v8i16 filt, out0, out1, out2, out3;
2101 
2102  mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2103  src -= 1;
2104 
2105  /* rearranging filter */
2106  filt = LD_SH(filter);
2107  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2108 
2109  mask1 = mask0 + 2;
2110 
2111  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2112  src += (8 * src_stride);
2113  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2114  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2115  filt0, filt1, out0, out1);
2116  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2117  filt0, filt1, out2, out3);
2118  SRARI_H4_SH(out0, out1, out2, out3, 6);
2119  SAT_SH4_SH(out0, out1, out2, out3, 7);
2120  out = PCKEV_XORI128_UB(out0, out1);
2121  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2122  out = PCKEV_XORI128_UB(out2, out3);
2123  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2124  dst += (8 * dst_stride);
2125 
2126  LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2127  src += (8 * src_stride);
2128  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2129  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2130  filt0, filt1, out0, out1);
2131  HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2132  filt0, filt1, out2, out3);
2133  SRARI_H4_SH(out0, out1, out2, out3, 6);
2134  SAT_SH4_SH(out0, out1, out2, out3, 7);
2135  out = PCKEV_XORI128_UB(out0, out1);
2136  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2137  out = PCKEV_XORI128_UB(out2, out3);
2138  ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2139 }
2140 
2141 static void common_hz_4t_4w_msa(const uint8_t *src, int32_t src_stride,
2142  uint8_t *dst, int32_t dst_stride,
2143  const int8_t *filter, int32_t height)
2144 {
2145  if (2 == height) {
2146  common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2147  } else if (4 == height) {
2148  common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
2149  } else if (8 == height) {
2150  common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
2151  } else if (16 == height) {
2152  common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
2153  }
2154 }
2155 
2156 static void common_hz_4t_6w_msa(const uint8_t *src, int32_t src_stride,
2157  uint8_t *dst, int32_t dst_stride,
2158  const int8_t *filter, int32_t height)
2159 {
2160  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2161  v16u8 out4, out5;
2162  v8i16 filt, out0, out1, out2, out3;
2163 
2164  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2165  src -= 1;
2166 
2167  /* rearranging filter */
2168  filt = LD_SH(filter);
2169  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2170 
2171  mask1 = mask0 + 2;
2172 
2173  LD_SB4(src, src_stride, src0, src1, src2, src3);
2174  src += (4 * src_stride);
2175 
2176  XORI_B4_128_SB(src0, src1, src2, src3);
2177  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2178  filt1, out0, out1, out2, out3);
2179  SRARI_H4_SH(out0, out1, out2, out3, 6);
2180  SAT_SH4_SH(out0, out1, out2, out3, 7);
2181  out4 = PCKEV_XORI128_UB(out0, out1);
2182  out5 = PCKEV_XORI128_UB(out2, out3);
2183  ST_W2(out4, 0, 2, dst, dst_stride);
2184  ST_H2(out4, 2, 6, dst + 4, dst_stride);
2185  ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2186  ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2187  dst += (4 * dst_stride);
2188 
2189  LD_SB4(src, src_stride, src0, src1, src2, src3);
2190  src += (4 * src_stride);
2191 
2192  XORI_B4_128_SB(src0, src1, src2, src3);
2193  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2194  filt1, out0, out1, out2, out3);
2195  SRARI_H4_SH(out0, out1, out2, out3, 6);
2196  SAT_SH4_SH(out0, out1, out2, out3, 7);
2197  out4 = PCKEV_XORI128_UB(out0, out1);
2198  out5 = PCKEV_XORI128_UB(out2, out3);
2199  ST_W2(out4, 0, 2, dst, dst_stride);
2200  ST_H2(out4, 2, 6, dst + 4, dst_stride);
2201  ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2202  ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2203 }
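
/* Width-6 store note: there is no 6-byte vector store, so each output
 * row is written as a 32-bit word (pixels 0..3) plus a 16-bit halfword
 * (pixels 4..5); the word/halfword indices in ST_W2/ST_H2 select the
 * packed rows inside the vector. Scalar equivalent for one row, with a
 * hypothetical row_bytes[] holding the 6 packed pixels:
 *
 *     memcpy(dst,     &row_bytes[0], 4);   // pixels 0..3
 *     memcpy(dst + 4, &row_bytes[4], 2);   // pixels 4..5
 */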
2204 
2205 static void common_hz_4t_8x2mult_msa(const uint8_t *src, int32_t src_stride,
2206  uint8_t *dst, int32_t dst_stride,
2207  const int8_t *filter, int32_t height)
2208 {
2209  uint32_t loop_cnt;
2210  v16i8 src0, src1, filt0, filt1, mask0, mask1;
2211  v16u8 out;
2212  v8i16 filt, vec0, vec1, vec2, vec3;
2213 
2214  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2215  src -= 1;
2216 
2217  filt = LD_SH(filter);
2218  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2219 
2220  mask1 = mask0 + 2;
2221 
2222  for (loop_cnt = (height >> 1); loop_cnt--;) {
2223  LD_SB2(src, src_stride, src0, src1);
2224  src += (2 * src_stride);
2225 
2226  XORI_B2_128_SB(src0, src1);
2227  VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2228  DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
2229  VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
2230  DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
2231  SRARI_H2_SH(vec0, vec1, 6);
2232  SAT_SH2_SH(vec0, vec1, 7);
2233  out = PCKEV_XORI128_UB(vec0, vec1);
2234  ST_D2(out, 0, 1, dst, dst_stride);
2235  dst += (2 * dst_stride);
2236  }
2237 }
2238 
2239 static void common_hz_4t_8x4mult_msa(const uint8_t *src, int32_t src_stride,
2240  uint8_t *dst, int32_t dst_stride,
2241  const int8_t *filter, int32_t height)
2242 {
2243  uint32_t loop_cnt;
2244  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2245  v16u8 tmp0, tmp1;
2246  v8i16 filt, out0, out1, out2, out3;
2247 
2248  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2249  src -= 1;
2250 
2251  /* rearranging filter */
2252  filt = LD_SH(filter);
2253  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2254 
2255  mask1 = mask0 + 2;
2256 
2257  for (loop_cnt = (height >> 2); loop_cnt--;) {
2258  LD_SB4(src, src_stride, src0, src1, src2, src3);
2259  src += (4 * src_stride);
2260 
2261  XORI_B4_128_SB(src0, src1, src2, src3);
2262  HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2263  filt1, out0, out1, out2, out3);
2264  SRARI_H4_SH(out0, out1, out2, out3, 6);
2265  SAT_SH4_SH(out0, out1, out2, out3, 7);
2266  tmp0 = PCKEV_XORI128_UB(out0, out1);
2267  tmp1 = PCKEV_XORI128_UB(out2, out3);
2268  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2269  dst += (4 * dst_stride);
2270  }
2271 }
2272 
2273 static void common_hz_4t_8w_msa(const uint8_t *src, int32_t src_stride,
2274  uint8_t *dst, int32_t dst_stride,
2275  const int8_t *filter, int32_t height)
2276 {
2277  if ((2 == height) || (6 == height)) {
2278  common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
2279  height);
2280  } else {
2281  common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
2282  height);
2283  }
2284 }
2285 
2286 static void common_hz_4t_12w_msa(const uint8_t *src, int32_t src_stride,
2287  uint8_t *dst, int32_t dst_stride,
2288  const int8_t *filter, int32_t height)
2289 {
2290  uint32_t loop_cnt;
2291  v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
2292  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2293  v16i8 vec10, vec11;
2294  v16u8 tmp0, tmp1;
2295  v8i16 filt, out0, out1, out2, out3, out4, out5;
2296 
2297  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2298  mask2 = LD_SB(&ff_hevc_mask_arr[32]);
2299 
2300  src -= 1;
2301 
2302  /* rearranging filter */
2303  filt = LD_SH(filter);
2304  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2305 
2306  mask1 = mask0 + 2;
2307  mask3 = mask2 + 2;
2308 
2309  for (loop_cnt = 4; loop_cnt--;) {
2310  LD_SB4(src, src_stride, src0, src1, src2, src3);
2311  src += (4 * src_stride);
2312 
2313  XORI_B4_128_SB(src0, src1, src2, src3);
2314  VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
2315  DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
2316  VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
2317  DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
2318  SRARI_H2_SH(out0, out1, 6);
2319  SAT_SH2_SH(out0, out1, 7);
2320  tmp0 = PCKEV_XORI128_UB(out0, out1);
2321  ST_W4(tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
2322 
2323  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
2324  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
2325  DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2326  out2, out3, out4, out5);
2327  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
2328  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
2329  DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
2330  out2, out3, out4, out5);
2331  SRARI_H4_SH(out2, out3, out4, out5, 6);
2332  SAT_SH4_SH(out2, out3, out4, out5, 7);
2333  tmp0 = PCKEV_XORI128_UB(out2, out3);
2334  tmp1 = PCKEV_XORI128_UB(out4, out5);
2335  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2336  dst += (4 * dst_stride);
2337  }
2338 }
2339 
2340 static void common_hz_4t_16w_msa(const uint8_t *src, int32_t src_stride,
2341  uint8_t *dst, int32_t dst_stride,
2342  const int8_t *filter, int32_t height)
2343 {
2344  uint32_t loop_cnt;
2345  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2346  v16i8 filt0, filt1, mask0, mask1;
2347  v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2348  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2349  v16u8 out;
2350 
2351  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2352  src -= 1;
2353 
2354  /* rearranging filter */
2355  filt = LD_SH(filter);
2356  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2357 
2358  mask1 = mask0 + 2;
2359 
2360  for (loop_cnt = (height >> 2); loop_cnt--;) {
2361  LD_SB4(src, src_stride, src0, src2, src4, src6);
2362  LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2363  src += (4 * src_stride);
2364 
2365  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2366 
2367  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2368  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2369  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2370  out0, out1, out2, out3);
2371  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2372  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2373  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2374  out0, out1, out2, out3);
2375  SRARI_H4_SH(out0, out1, out2, out3, 6);
2376  SAT_SH4_SH(out0, out1, out2, out3, 7);
2377  out = PCKEV_XORI128_UB(out0, out1);
2378  ST_UB(out, dst);
2379  dst += dst_stride;
2380  out = PCKEV_XORI128_UB(out2, out3);
2381  ST_UB(out, dst);
2382  dst += dst_stride;
2383 
2384  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2385  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2386  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2387  out4, out5, out6, out7);
2388  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2389  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2390  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2391  out4, out5, out6, out7);
2392  SRARI_H4_SH(out4, out5, out6, out7, 6);
2393  SAT_SH4_SH(out4, out5, out6, out7, 7);
2394  out = PCKEV_XORI128_UB(out4, out5);
2395  ST_UB(out, dst);
2396  dst += dst_stride;
2397  out = PCKEV_XORI128_UB(out6, out7);
2398  ST_UB(out, dst);
2399  dst += dst_stride;
2400  }
2401 }
2402 
2403 static void common_hz_4t_24w_msa(const uint8_t *src, int32_t src_stride,
2404  uint8_t *dst, int32_t dst_stride,
2405  const int8_t *filter, int32_t height)
2406 {
2407  uint8_t *dst1 = dst + 16;
2408  uint32_t loop_cnt;
2409  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2410  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2411  v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2412  v8i16 filt, out0, out1, out2, out3;
2413  v16u8 tmp0, tmp1;
2414 
2415  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2416  src -= 1;
2417 
2418  /* rearranging filter */
2419  filt = LD_SH(filter);
2420  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2421 
2422  mask1 = mask0 + 2;
2423  mask00 = mask0 + 8;
2424  mask11 = mask0 + 10;
2425 
2426  for (loop_cnt = 8; loop_cnt--;) {
2427  LD_SB4(src, src_stride, src0, src2, src4, src6);
2428  LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2429  src += (4 * src_stride);
2430 
2431  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2432  VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
2433  VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
2434  VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
2435  VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
2436  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2437  out0, out1, out2, out3);
2438  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2439  out0, out1, out2, out3);
2440  SRARI_H4_SH(out0, out1, out2, out3, 6);
2441  SAT_SH4_SH(out0, out1, out2, out3, 7);
2442  tmp0 = PCKEV_XORI128_UB(out0, out1);
2443  ST_UB(tmp0, dst);
2444  dst += dst_stride;
2445  tmp0 = PCKEV_XORI128_UB(out2, out3);
2446  ST_UB(tmp0, dst);
2447  dst += dst_stride;
2448 
2449  VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2450  VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2451  VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2452  VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2453  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2454  out0, out1, out2, out3);
2455  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2456  out0, out1, out2, out3);
2457  SRARI_H4_SH(out0, out1, out2, out3, 6);
2458  SAT_SH4_SH(out0, out1, out2, out3, 7);
2459  tmp0 = PCKEV_XORI128_UB(out0, out1);
2460  ST_UB(tmp0, dst);
2461  dst += dst_stride;
2462  tmp0 = PCKEV_XORI128_UB(out2, out3);
2463  ST_UB(tmp0, dst);
2464  dst += dst_stride;
2465 
2466  /* 8 width */
2467  VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2468  VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2469  VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
2470  VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2471 
2472  DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2473  out0, out1, out2, out3);
2474  DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2475  out0, out1, out2, out3);
2476 
2477  SRARI_H4_SH(out0, out1, out2, out3, 6);
2478  SAT_SH4_SH(out0, out1, out2, out3, 7);
2479  tmp0 = PCKEV_XORI128_UB(out0, out1);
2480  tmp1 = PCKEV_XORI128_UB(out2, out3);
2481  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst1, dst_stride);
2482  dst1 += (4 * dst_stride);
2483  }
2484 }
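
/* Width-24 note: mask00 = mask0 + 8 moves the shuffle window to pixel 8,
 * and its highest entries reach index 16, i.e. into the second VSHF
 * source, so VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, ...)
 * covers both pixels 0..7 (src0 alone) and pixels 8..15, whose 4-tap
 * windows straddle the boundary between the first and second 16-byte
 * loads of the row.
 */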
2485 
2486 static void common_hz_4t_32w_msa(const uint8_t *src, int32_t src_stride,
2487  uint8_t *dst, int32_t dst_stride,
2488  const int8_t *filter, int32_t height)
2489 {
2490  uint32_t loop_cnt;
2491  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2492  v16i8 filt0, filt1, mask0, mask1;
2493  v16u8 out;
2494  v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2495  v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2496 
2497  mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2498  src -= 1;
2499 
2500  /* rearranging filter */
2501  filt = LD_SH(filter);
2502  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2503 
2504  mask1 = mask0 + 2;
2505 
2506  for (loop_cnt = (height >> 1); loop_cnt--;) {
2507  src0 = LD_SB(src);
2508  src1 = LD_SB(src + 8);
2509  src2 = LD_SB(src + 16);
2510  src3 = LD_SB(src + 24);
2511  src += src_stride;
2512  src4 = LD_SB(src);
2513  src5 = LD_SB(src + 8);
2514  src6 = LD_SB(src + 16);
2515  src7 = LD_SB(src + 24);
2516  src += src_stride;
2517 
2518  XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2519 
2520  VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2521  VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2522  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2523  out0, out1, out2, out3);
2524  VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2525  VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2526  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2527  out0, out1, out2, out3);
2528 
2529  VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2530  VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2531  DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2532  out4, out5, out6, out7);
2533  VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2534  VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2535  DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2536  out4, out5, out6, out7);
2537  SRARI_H4_SH(out0, out1, out2, out3, 6);
2538  SRARI_H4_SH(out4, out5, out6, out7, 6);
2539  SAT_SH4_SH(out0, out1, out2, out3, 7);
2540  SAT_SH4_SH(out4, out5, out6, out7, 7);
2541  out = PCKEV_XORI128_UB(out0, out1);
2542  ST_UB(out, dst);
2543  out = PCKEV_XORI128_UB(out2, out3);
2544  ST_UB(out, dst + 16);
2545  dst += dst_stride;
2546  out = PCKEV_XORI128_UB(out4, out5);
2547  ST_UB(out, dst);
2548  out = PCKEV_XORI128_UB(out6, out7);
2549  ST_UB(out, dst + 16);
2550  dst += dst_stride;
2551  }
2552 }
2553 
2554 static void common_vt_4t_4x2_msa(const uint8_t *src, int32_t src_stride,
2555  uint8_t *dst, int32_t dst_stride,
2556  const int8_t *filter)
2557 {
2558  v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
2559  v16i8 src2110, src4332, filt0, filt1;
2560  v16u8 out;
2561  v8i16 filt, out10;
2562 
2563  src -= src_stride;
2564 
2565  filt = LD_SH(filter);
2566  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2567 
2568  LD_SB3(src, src_stride, src0, src1, src2);
2569  src += (3 * src_stride);
2570 
2571  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2572  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2573  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2574  LD_SB2(src, src_stride, src3, src4);
2575  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2576  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2577  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2578  out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2579  out10 = __msa_srari_h(out10, 6);
2580  out10 = __msa_sat_s_h(out10, 7);
2581  out = PCKEV_XORI128_UB(out10, out10);
2582  ST_W2(out, 0, 1, dst, dst_stride);
2583 }
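
/* Vertical 4-tap layout note: ILVR_B interleaves two adjacent rows
 * byte-wise (r0p0, r1p0, r0p1, r1p1, ...), so every 16-bit lane of the
 * interleaved vector carries the vertical sample pair one tap pair
 * needs, and HEVC_FILT_4TAP_SH reduces to two signed byte dot products.
 * Scalar model (not compiled) for one output sample:
 *
 *     int32_t sum = 0;
 *     for (k = 0; k < 4; k++)
 *         sum += filter[k] * src[(y + k - 1) * src_stride + x];
 *     dst[y * dst_stride + x] = av_clip_uint8((sum + 32) >> 6);
 */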
2584 
2585 static void common_vt_4t_4x4multiple_msa(const uint8_t *src, int32_t src_stride,
2586  uint8_t *dst, int32_t dst_stride,
2587  const int8_t *filter, int32_t height)
2588 {
2589  uint32_t loop_cnt;
2590  v16i8 src0, src1, src2, src3, src4, src5;
2591  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2592  v16i8 src2110, src4332, filt0, filt1;
2593  v8i16 filt, out10, out32;
2594  v16u8 out;
2595 
2596  src -= src_stride;
2597 
2598  filt = LD_SH(filter);
2599  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2600 
2601  LD_SB3(src, src_stride, src0, src1, src2);
2602  src += (3 * src_stride);
2603 
2604  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2605 
2606  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2607  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2608 
2609  for (loop_cnt = (height >> 2); loop_cnt--;) {
2610  LD_SB3(src, src_stride, src3, src4, src5);
2611  src += (3 * src_stride);
2612  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2613  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2614  src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2615  out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2616 
2617  src2 = LD_SB(src);
2618  src += (src_stride);
2619  ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
2620  src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2621  src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2622  out32 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
2623  SRARI_H2_SH(out10, out32, 6);
2624  SAT_SH2_SH(out10, out32, 7);
2625  out = PCKEV_XORI128_UB(out10, out32);
2626  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2627  dst += (4 * dst_stride);
2628  }
2629 }
2630 
2631 static void common_vt_4t_4w_msa(const uint8_t *src, int32_t src_stride,
2632  uint8_t *dst, int32_t dst_stride,
2633  const int8_t *filter, int32_t height)
2634 {
2635  if (2 == height) {
2636  common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2637  } else {
2638  common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
2639  height);
2640  }
2641 }
2642 
2643 static void common_vt_4t_6w_msa(const uint8_t *src, int32_t src_stride,
2644  uint8_t *dst, int32_t dst_stride,
2645  const int8_t *filter, int32_t height)
2646 {
2647  v16u8 out0, out1;
2648  v16i8 src0, src1, src2, src3, src4, src5, src6;
2649  v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2650  v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec;
2651 
2652  src -= src_stride;
2653 
2654  filter_vec = LD_SH(filter);
2655  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2656 
2657  LD_SB3(src, src_stride, src0, src1, src2);
2658  src += (3 * src_stride);
2659  XORI_B3_128_SB(src0, src1, src2);
2660  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2661 
2662  LD_SB2(src, src_stride, src3, src4);
2663  src += (2 * src_stride);
2664  XORI_B2_128_SB(src3, src4);
2665  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2666 
2667  dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2668  dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2669 
2670  LD_SB2(src, src_stride, src5, src6);
2671  src += (2 * src_stride);
2672  XORI_B2_128_SB(src5, src6);
2673  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2674 
2675  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2676  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2677 
2678  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2679  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2680  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2681  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2682  ST_W2(out0, 0, 2, dst, dst_stride);
2683  ST_H2(out0, 2, 6, dst + 4, dst_stride);
2684  ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2685  ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2686  dst += (4 * dst_stride);
2687 
2688  LD_SB2(src, src_stride, src3, src4);
2689  src += (2 * src_stride);
2690  XORI_B2_128_SB(src3, src4);
2691  ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r);
2692 
2693  dst0_r = HEVC_FILT_4TAP_SH(src54_r, src32_r, filt0, filt1);
2694  dst1_r = HEVC_FILT_4TAP_SH(src65_r, src43_r, filt0, filt1);
2695 
2696  LD_SB2(src, src_stride, src5, src6);
2697  src += (2 * src_stride);
2698  XORI_B2_128_SB(src5, src6);
2699  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2700 
2701  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2702  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2703 
2704  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2705  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2706  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2707  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2708  ST_W2(out0, 0, 2, dst, dst_stride);
2709  ST_H2(out0, 2, 6, dst + 4, dst_stride);
2710  ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2711  ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2712 }
2713 
2714 static void common_vt_4t_8x2_msa(const uint8_t *src, int32_t src_stride,
2715  uint8_t *dst, int32_t dst_stride,
2716  const int8_t *filter)
2717 {
2718  v16i8 src0, src1, src2, src3, src4;
2719  v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
2720  v16u8 out;
2721 
2722  src -= src_stride;
2723 
2724  /* rearranging filter_y */
2725  filt = LD_SH(filter);
2726  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2727 
2728  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2729  XORI_B5_128_SB(src0, src1, src2, src3, src4);
2730  ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
2731  tmp0 = HEVC_FILT_4TAP_SH(src01, src23, filt0, filt1);
2732  ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
2733  tmp1 = HEVC_FILT_4TAP_SH(src12, src34, filt0, filt1);
2734  SRARI_H2_SH(tmp0, tmp1, 6);
2735  SAT_SH2_SH(tmp0, tmp1, 7);
2736  out = PCKEV_XORI128_UB(tmp0, tmp1);
2737  ST_D2(out, 0, 1, dst, dst_stride);
2738 }
2739 
2740 static void common_vt_4t_8x6_msa(const uint8_t *src, int32_t src_stride,
2741  uint8_t *dst, int32_t dst_stride,
2742  const int8_t *filter)
2743 {
2744  uint32_t loop_cnt;
2745  uint64_t out0, out1, out2;
2746  v16i8 src0, src1, src2, src3, src4, src5;
2747  v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2748  v8i16 filt, filt0, filt1;
2749 
2750  src -= src_stride;
2751 
2752  /* rearranging filter_y */
2753  filt = LD_SH(filter);
2754  SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2755 
2756  LD_SB3(src, src_stride, src0, src1, src2);
2757  src += (3 * src_stride);
2758 
2759  XORI_B3_128_SB(src0, src1, src2);
2760  ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
2761 
2762  for (loop_cnt = 2; loop_cnt--;) {
2763  LD_SB3(src, src_stride, src3, src4, src5);
2764  src += (3 * src_stride);
2765 
2766  XORI_B3_128_SB(src3, src4, src5);
2767  ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
2768  tmp0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2769  tmp1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2770  tmp2 = HEVC_FILT_4TAP_SH(vec1, vec4, filt0, filt1);
2771  SRARI_H2_SH(tmp0, tmp1, 6);
2772  tmp2 = __msa_srari_h(tmp2, 6);
2773  SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
2774  PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
2775  XORI_B2_128_SH(tmp0, tmp2);
2776 
2777  out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2778  out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2779  out2 = __msa_copy_u_d((v2i64) tmp2, 0);
2780  SD(out0, dst);
2781  dst += dst_stride;
2782  SD(out1, dst);
2783  dst += dst_stride;
2784  SD(out2, dst);
2785  dst += dst_stride;
2786 
2787  src2 = src5;
2788  vec0 = vec3;
2789  vec2 = vec4;
2790  }
2791 }
2792 
2793 static void common_vt_4t_8x4mult_msa(const uint8_t *src, int32_t src_stride,
2794  uint8_t *dst, int32_t dst_stride,
2795  const int8_t *filter, int32_t height)
2796 {
2797  uint32_t loop_cnt;
2798  v16i8 src0, src1, src2, src7, src8, src9, src10;
2799  v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2800  v16u8 tmp0, tmp1;
2801  v8i16 filt, out0_r, out1_r, out2_r, out3_r;
2802 
2803  src -= src_stride;
2804 
2805  filt = LD_SH(filter);
2806  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2807 
2808  LD_SB3(src, src_stride, src0, src1, src2);
2809  src += (3 * src_stride);
2810 
2811  XORI_B3_128_SB(src0, src1, src2);
2812  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2813 
2814  for (loop_cnt = (height >> 2); loop_cnt--;) {
2815  LD_SB4(src, src_stride, src7, src8, src9, src10);
2816  src += (4 * src_stride);
2817 
2818  XORI_B4_128_SB(src7, src8, src9, src10);
2819  ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
2820  src72_r, src87_r, src98_r, src109_r);
2821  out0_r = HEVC_FILT_4TAP_SH(src10_r, src72_r, filt0, filt1);
2822  out1_r = HEVC_FILT_4TAP_SH(src21_r, src87_r, filt0, filt1);
2823  out2_r = HEVC_FILT_4TAP_SH(src72_r, src98_r, filt0, filt1);
2824  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2825  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2826  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2827  tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
2828  tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
2829  ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2830  dst += (4 * dst_stride);
2831 
2832  src10_r = src98_r;
2833  src21_r = src109_r;
2834  src2 = src10;
2835  }
2836 }
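
/* Sliding-window note: the height loop above carries only three input
 * rows between iterations; the last two interleaved pairs (src98_r,
 * src109_r) become the next iteration's (src10_r, src21_r) and src10
 * becomes the new src2, so each source row is loaded and sign-converted
 * exactly once.
 */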
2837 
2838 static void common_vt_4t_8w_msa(const uint8_t *src, int32_t src_stride,
2839  uint8_t *dst, int32_t dst_stride,
2840  const int8_t *filter, int32_t height)
2841 {
2842  if (2 == height) {
2843  common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
2844  } else if (6 == height) {
2845  common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
2846  } else {
2847  common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
2848  filter, height);
2849  }
2850 }
2851 
2852 static void common_vt_4t_12w_msa(const uint8_t *src, int32_t src_stride,
2853  uint8_t *dst, int32_t dst_stride,
2854  const int8_t *filter, int32_t height)
2855 {
2856  uint32_t loop_cnt;
2857  v16i8 src0, src1, src2, src3, src4, src5, src6;
2858  v16u8 out0, out1;
2859  v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2860  v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
2861  v16i8 src2110, src4332, src6554;
2862  v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1;
2863  v8i16 filter_vec;
2864 
2865  src -= (1 * src_stride);
2866 
2867  filter_vec = LD_SH(filter);
2868  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2869 
2870  LD_SB3(src, src_stride, src0, src1, src2);
2871  src += (3 * src_stride);
2872 
2873  XORI_B3_128_SB(src0, src1, src2);
2874  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2875  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2876  src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
2877 
2878  for (loop_cnt = 4; loop_cnt--;) {
2879  LD_SB4(src, src_stride, src3, src4, src5, src6);
2880  src += (4 * src_stride);
2881 
2882  XORI_B4_128_SB(src3, src4, src5, src6);
2883  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2884  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2885  src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
2886  ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2887  ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
2888  src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
2889 
2890  dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2891  dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2892  dst0_l = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2893  dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2894  dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2895  dst1_l = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
2896 
2897  SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2898  SRARI_H2_SH(dst0_l, dst1_l, 6);
2899  SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2900  SAT_SH2_SH(dst0_l, dst1_l, 7);
2901  out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2902  out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2903  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2904  out0 = PCKEV_XORI128_UB(dst0_l, dst1_l);
2905  ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
2906  dst += (4 * dst_stride);
2907 
2908  src2 = src6;
2909  src10_r = src54_r;
2910  src21_r = src65_r;
2911  src2110 = src6554;
2912  }
2913 }
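
/* Width-12 vertical note: columns 0..7 travel through the _r interleave
 * stream, while columns 8..11 of two row pairs are folded into a single
 * vector (src2110 = ilvr_d of the two _l interleaves), so the 4-wide
 * tail costs one extra HEVC_FILT_4TAP_SH per two output rows.
 */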
2914 
2915 static void common_vt_4t_16w_msa(const uint8_t *src, int32_t src_stride,
2916  uint8_t *dst, int32_t dst_stride,
2917  const int8_t *filter, int32_t height)
2918 {
2919  uint32_t loop_cnt;
2920  v16i8 src0, src1, src2, src3, src4, src5, src6;
2921  v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
2922  v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
2923  v16u8 tmp0, tmp1, tmp2, tmp3;
2924  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2925 
2926  src -= src_stride;
2927 
2928  filt = LD_SH(filter);
2929  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2930 
2931  LD_SB3(src, src_stride, src0, src1, src2);
2932  src += (3 * src_stride);
2933 
2934  XORI_B3_128_SB(src0, src1, src2);
2935  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2936  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2937 
2938  for (loop_cnt = (height >> 2); loop_cnt--;) {
2939  LD_SB4(src, src_stride, src3, src4, src5, src6);
2940  src += (4 * src_stride);
2941 
2942  XORI_B4_128_SB(src3, src4, src5, src6);
2943  ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2944  src32_r, src43_r, src54_r, src65_r);
2945  ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2946  src32_l, src43_l, src54_l, src65_l);
2947  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2948  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2949  out2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2950  out3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2951  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2952  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2953  out2_l = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
2954  out3_l = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
2955  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2956  SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
2957  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2958  SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2959  PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2960  out3_r, tmp0, tmp1, tmp2, tmp3);
2961  XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
2962  ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
2963  dst += (4 * dst_stride);
2964 
2965  src10_r = src54_r;
2966  src21_r = src65_r;
2967  src10_l = src54_l;
2968  src21_l = src65_l;
2969  src2 = src6;
2970  }
2971 }
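
/* Width-16 note: each 16-pixel row pair is filtered as two independent
 * halves: ILVR_B interleaves the low 8 bytes of adjacent rows and
 * ILVL_B the high 8, producing the _r and _l streams that are filtered
 * separately and re-joined by PCKEV_B4_UB before the 16-byte stores.
 */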
2972 
2973 static void common_vt_4t_24w_msa(const uint8_t *src, int32_t src_stride,
2974  uint8_t *dst, int32_t dst_stride,
2975  const int8_t *filter, int32_t height)
2976 {
2977  uint32_t loop_cnt;
2978  uint64_t out0, out1;
2979  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2980  v16i8 src11, filt0, filt1;
2981  v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
2982  v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
2983  v16u8 out;
2984  v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
2985 
2986  src -= src_stride;
2987 
2988  filt = LD_SH(filter);
2989  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2990 
2991  /* 16 width */
2992  LD_SB3(src, src_stride, src0, src1, src2);
2993  XORI_B3_128_SB(src0, src1, src2);
2994  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2995  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2996 
2997  /* 8 width */
2998  LD_SB3(src + 16, src_stride, src6, src7, src8);
2999  src += (3 * src_stride);
3000  XORI_B3_128_SB(src6, src7, src8);
3001  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3002 
3003  for (loop_cnt = 8; loop_cnt--;) {
3004  /* 16 width */
3005  LD_SB2(src, src_stride, src3, src4);
3006  XORI_B2_128_SB(src3, src4);
3007  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3008  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3009 
3010  /* 8 width */
3011  LD_SB2(src + 16, src_stride, src9, src10);
3012  src += (2 * src_stride);
3013  XORI_B2_128_SB(src9, src10);
3014  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3015 
3016  /* 16 width */
3017  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3018  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
3019  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3020  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
3021 
3022  /* 8 width */
3023  out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3024  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3025 
3026  /* 16 + 8 width */
3027  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3028  SRARI_H2_SH(out0_l, out1_l, 6);
3029  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3030  SAT_SH2_SH(out0_l, out1_l, 7);
3031  out = PCKEV_XORI128_UB(out0_r, out0_l);
3032  ST_UB(out, dst);
3033  PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
3034  XORI_B2_128_SH(out2_r, out3_r);
3035  out0 = __msa_copy_u_d((v2i64) out2_r, 0);
3036  out1 = __msa_copy_u_d((v2i64) out3_r, 0);
3037  SD(out0, dst + 16);
3038  dst += dst_stride;
3039  out = PCKEV_XORI128_UB(out1_r, out1_l);
3040  ST_UB(out, dst);
3041  SD(out1, dst + 16);
3042  dst += dst_stride;
3043 
3044  /* 16 width */
3045  LD_SB2(src, src_stride, src5, src2);
3046  XORI_B2_128_SB(src5, src2);
3047  ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3048  ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3049 
3050  /* 8 width */
3051  LD_SB2(src + 16, src_stride, src11, src8);
3052  src += (2 * src_stride);
3053  XORI_B2_128_SB(src11, src8);
3054  ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3055 
3056  /* 16 width */
3057  out0_r = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
3058  out0_l = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
3059  out1_r = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
3060  out1_l = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
3061 
3062  /* 8 width */
3063  out2_r = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
3064  out3_r = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
3065 
3066  /* 16 + 8 width */
3067  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3068  SRARI_H2_SH(out0_l, out1_l, 6);
3069  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3070  SAT_SH2_SH(out0_l, out1_l, 7);
3071  out = PCKEV_XORI128_UB(out0_r, out0_l);
3072  ST_UB(out, dst);
3073  out = PCKEV_XORI128_UB(out2_r, out2_r);
3074  ST_D1(out, 0, dst + 16);
3075  dst += dst_stride;
3076  out = PCKEV_XORI128_UB(out1_r, out1_l);
3077  ST_UB(out, dst);
3078  out = PCKEV_XORI128_UB(out3_r, out3_r);
3079  ST_D1(out, 0, dst + 16);
3080  dst += dst_stride;
3081  }
3082 }
3083 
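/* Vertical 4-tap filter for 32-pixel-wide blocks: two independent 16-wide
 * columns are filtered per pass, two rows per loop iteration, each column
 * rolling its own src*_r/src*_l interleave state between iterations. */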
3084 static void common_vt_4t_32w_msa(const uint8_t *src, int32_t src_stride,
3085  uint8_t *dst, int32_t dst_stride,
3086  const int8_t *filter, int32_t height)
3087 {
3088  uint32_t loop_cnt;
3089  v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3090  v16i8 src10_r, src32_r, src76_r, src98_r;
3091  v16i8 src21_r, src43_r, src87_r, src109_r;
3092  v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3093  v16i8 src10_l, src32_l, src76_l, src98_l;
3094  v16i8 src21_l, src43_l, src87_l, src109_l;
3095  v8i16 filt;
3096  v16i8 filt0, filt1;
3097  v16u8 out;
3098 
3099  src -= src_stride;
3100 
3101  filt = LD_SH(filter);
3102  SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
3103 
3104  /* 16 width */
3105  LD_SB3(src, src_stride, src0, src1, src2);
3106  XORI_B3_128_SB(src0, src1, src2);
3107 
3108  ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3109  ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3110 
3111  /* next 16 width */
3112  LD_SB3(src + 16, src_stride, src6, src7, src8);
3113  src += (3 * src_stride);
3114 
3115  XORI_B3_128_SB(src6, src7, src8);
3116  ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3117  ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3118 
3119  for (loop_cnt = (height >> 1); loop_cnt--;) {
3120  /* 16 width */
3121  LD_SB2(src, src_stride, src3, src4);
3122  XORI_B2_128_SB(src3, src4);
3123  ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3124  ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3125 
3126  /* 16 width */
3127  out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3128  out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
3129  out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3130  out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
3131 
3132  /* 16 width */
3133  SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6);
3134  SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
3135  out = PCKEV_XORI128_UB(out0_r, out0_l);
3136  ST_UB(out, dst);
3137  out = PCKEV_XORI128_UB(out1_r, out1_l);
3138  ST_UB(out, dst + dst_stride);
3139 
3140  src10_r = src32_r;
3141  src21_r = src43_r;
3142  src10_l = src32_l;
3143  src21_l = src43_l;
3144  src2 = src4;
3145 
3146  /* next 16 width */
3147  LD_SB2(src + 16, src_stride, src9, src10);
3148  src += (2 * src_stride);
3149  XORI_B2_128_SB(src9, src10);
3150  ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3151  ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3152 
3153  /* next 16 width */
3154  out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3155  out2_l = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
3156  out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3157  out3_l = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
3158 
3159  /* next 16 width */
3160  SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6);
3161  SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
3162  out = PCKEV_XORI128_UB(out2_r, out2_l);
3163  ST_UB(out, dst + 16);
3164  out = PCKEV_XORI128_UB(out3_r, out3_l);
3165  ST_UB(out, dst + 16 + dst_stride);
3166 
3167  dst += 2 * dst_stride;
3168 
3169  src76_r = src98_r;
3170  src87_r = src109_r;
3171  src76_l = src98_l;
3172  src87_l = src109_l;
3173  src8 = src10;
3174  }
3175 }
3176 
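/* 2-D (horizontal then vertical) 4-tap filtering for 4x2 blocks.  The
 * horizontal pass keeps 16-bit intermediates with no rounding; the
 * vertical pass widens to 32 bits.  The two ">> 6" stages undo the 64x64
 * gain of the two coefficient sets, with rounding applied in the second
 * stage.  mask0 comes from ff_hevc_mask_arr + 16: the 4-width shuffle
 * patterns pick four pixels from each of two rows held in one register,
 * so a single HEVC_FILT_4TAP_SH filters two rows at a time (dst20 holds
 * rows 2 and 0, dst31 rows 3 and 1, and so on). */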
3177 static void hevc_hv_uni_4t_4x2_msa(const uint8_t *src,
3178  int32_t src_stride,
3179  uint8_t *dst,
3180  int32_t dst_stride,
3181  const int8_t *filter_x,
3182  const int8_t *filter_y)
3183 {
3184  v16u8 out;
3185  v16i8 src0, src1, src2, src3, src4;
3186  v8i16 filt0, filt1;
3187  v8i16 filt_h0, filt_h1;
3188  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3189  v16i8 mask1;
3190  v8i16 filter_vec, tmp;
3191  v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3192  v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
3193  v4i32 dst0, dst1;
3194 
3195  src -= (src_stride + 1);
3196 
3197  filter_vec = LD_SH(filter_x);
3198  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3199 
3200  filter_vec = LD_SH(filter_y);
3201  UNPCK_R_SB_SH(filter_vec, filter_vec);
3202 
3203  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3204 
3205  mask1 = mask0 + 2;
3206 
3207  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3208  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3209 
3210  VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
3211  VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
3212  VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
3213 
3214  dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3215  dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3216  dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3217 
3218  ILVRL_H2_SH(dst31, dst20, dst10, dst32);
3219  ILVRL_H2_SH(dst42, dst31, dst21, dst43);
3220 
3221  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3222  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3223  dst0 >>= 6;
3224  dst1 >>= 6;
3225  tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3226  tmp = __msa_srari_h(tmp, 6);
3227  tmp = __msa_sat_s_h(tmp, 7);
3228  out = PCKEV_XORI128_UB(tmp, tmp);
3229  ST_W2(out, 0, 1, dst, dst_stride);
3230 }
3231 
3232 static void hevc_hv_uni_4t_4x4_msa(const uint8_t *src,
3233  int32_t src_stride,
3234  uint8_t *dst,
3235  int32_t dst_stride,
3236  const int8_t *filter_x,
3237  const int8_t *filter_y)
3238 {
3239  v16u8 out;
3240  v16i8 src0, src1, src2, src3, src4, src5, src6;
3241  v8i16 filt0, filt1;
3242  v8i16 filt_h0, filt_h1;
3243  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3244  v16i8 mask1;
3245  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3246  v8i16 filter_vec, tmp0, tmp1;
3247  v8i16 dst30, dst41, dst52, dst63;
3248  v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
3249  v4i32 dst0, dst1, dst2, dst3;
3250 
3251  src -= (src_stride + 1);
3252 
3253  filter_vec = LD_SH(filter_x);
3254  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3255 
3256  filter_vec = LD_SH(filter_y);
3257  UNPCK_R_SB_SH(filter_vec, filter_vec);
3258 
3259  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3260 
3261  mask1 = mask0 + 2;
3262 
3263  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3264  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3265 
3266  VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
3267  VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
3268  VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
3269  VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
3270 
3271  dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3272  dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3273  dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3274  dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3275 
3276  ILVRL_H2_SH(dst41, dst30, dst10, dst43);
3277  ILVRL_H2_SH(dst52, dst41, dst21, dst54);
3278  ILVRL_H2_SH(dst63, dst52, dst32, dst65);
3279  dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3280  dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3281  dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
3282  dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
3283  SRA_4V(dst0, dst1, dst2, dst3, 6);
3284  PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3285  SRARI_H2_SH(tmp0, tmp1, 6);
3286  SAT_SH2_SH(tmp0, tmp1, 7);
3287  out = PCKEV_XORI128_UB(tmp0, tmp1);
3288  ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
3289 }
3290 
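/* 2-D 4-tap filter for 4-wide blocks whose height is a multiple of 8.
 * Three horizontally filtered rows stay live across iterations (dst10_r,
 * dst21_r and the splatted dst22), so each pass only needs the eight new
 * rows it loads; pairs of rows share a register throughout (dst73 holds
 * rows 7 and 3, dst84 rows 8 and 4, ...). */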
3291 static void hevc_hv_uni_4t_4multx8mult_msa(const uint8_t *src,
3292  int32_t src_stride,
3293  uint8_t *dst,
3294  int32_t dst_stride,
3295  const int8_t *filter_x,
3296  const int8_t *filter_y,
3297  int32_t height)
3298 {
3299  uint32_t loop_cnt;
3300  v16u8 out0, out1;
3301  v16i8 src0, src1, src2, src3, src4, src5;
3302  v16i8 src6, src7, src8, src9, src10;
3303  v8i16 filt0, filt1;
3304  v8i16 filt_h0, filt_h1;
3305  v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3306  v16i8 mask1;
3307  v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3;
3308  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3309  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3310  v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3311  v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3312  v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3313  v8i16 dst98_r, dst109_r;
3314 
3315  src -= (src_stride + 1);
3316 
3317  filter_vec = LD_SH(filter_x);
3318  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3319 
3320  filter_vec = LD_SH(filter_y);
3321  UNPCK_R_SB_SH(filter_vec, filter_vec);
3322 
3323  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3324 
3325  mask1 = mask0 + 2;
3326 
3327  LD_SB3(src, src_stride, src0, src1, src2);
3328  src += (3 * src_stride);
3329
3330  XORI_B3_128_SB(src0, src1, src2);
3331 
3332  VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
3333  VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
3334  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3335  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3336  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
3337  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3338 
3339  for (loop_cnt = height >> 3; loop_cnt--;) {
3340  LD_SB8(src, src_stride,
3341  src3, src4, src5, src6, src7, src8, src9, src10);
3342  src += (8 * src_stride);
3343 
3344  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3345 
3346  VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
3347  VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
3348  VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
3349  VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
3350 
3351  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3352  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3353  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3354  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3355 
3356  dst32_r = __msa_ilvr_h(dst73, dst22);
3357  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3358  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3359  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3360  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3361  dst76_r = __msa_ilvr_h(dst22, dst106);
3362 
3363  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3364  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3365  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3366  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3367  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3368  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3369  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3370  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3371  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3372  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3373  PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
3374  dst5_r, dst4_r, dst7_r, dst6_r,
3375  tmp0, tmp1, tmp2, tmp3);
3376  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3377  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3378  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3379  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3380  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3381  dst += (8 * dst_stride);
3382 
3383  dst10_r = dst98_r;
3384  dst21_r = dst109_r;
3385  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
3386  }
3387 }
3388 
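/* Dispatch on height for the 4-wide hv path: the HEVC block sizes that
 * reach it are 2, 4 and multiples of 8; other heights are left unhandled
 * here. */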
3389 static void hevc_hv_uni_4t_4w_msa(const uint8_t *src,
3390  int32_t src_stride,
3391  uint8_t *dst,
3392  int32_t dst_stride,
3393  const int8_t *filter_x,
3394  const int8_t *filter_y,
3395  int32_t height)
3396 {
3397  if (2 == height) {
3398  hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
3399  filter_x, filter_y);
3400  } else if (4 == height) {
3401  hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
3402  filter_x, filter_y);
3403  } else if (0 == (height % 8)) {
3404  hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
3405  filter_x, filter_y, height);
3406  }
3407 }
3408 
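/* 2-D 4-tap filter for 6-wide blocks, computed as a full 8-wide filter
 * over eight rows: the left 4 columns are stored with ST_W8 and the
 * remaining 2 columns with ST_H8 from the high interleave halves, which
 * PCKEV_D2_SH packs pairwise so two rows share each vertical filter call.
 * The height argument is not consulted; the routine is written for the
 * 8-row case. */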
3409 static void hevc_hv_uni_4t_6w_msa(const uint8_t *src,
3410  int32_t src_stride,
3411  uint8_t *dst,
3412  int32_t dst_stride,
3413  const int8_t *filter_x,
3414  const int8_t *filter_y,
3415  int32_t height)
3416 {
3417  v16u8 out0, out1, out2;
3418  v16i8 src0, src1, src2, src3, src4, src5, src6;
3419  v16i8 src7, src8, src9, src10;
3420  v8i16 filt0, filt1;
3421  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3422  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3423  v16i8 mask1;
3424  v8i16 filt_h0, filt_h1, filter_vec;
3425  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
3426  v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3427  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3428  v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
3429  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3430  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3431  v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
3432  v8i16 dst98_l, dst65_l, dst54_l, dst76_l, dst87_l, dst109_l;
3433  v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
3434 
3435  src -= (src_stride + 1);
3436 
3437  filter_vec = LD_SH(filter_x);
3438  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3439 
3440  filter_vec = LD_SH(filter_y);
3441  UNPCK_R_SB_SH(filter_vec, filter_vec);
3442 
3443  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3444 
3445  mask1 = mask0 + 2;
3446 
3447  LD_SB3(src, src_stride, src0, src1, src2);
3448  src += (3 * src_stride);
3449 
3450  XORI_B3_128_SB(src0, src1, src2);
3451
3452  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3453  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3454  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3455 
3456  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3457  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3458  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3459 
3460  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3461  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3462 
3463  LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3464  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3465 
3466  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3467  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3468  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3469  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3470 
3471  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3472  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3473  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3474  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3475 
3476  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3477  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
3478  VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
3479  VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
3480 
3481  dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3482  dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3483  dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3484  dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3485 
3486  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3487  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3488  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3489  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3490  ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
3491  ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
3492  ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
3493  ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
3494 
3495  PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
3496  PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
3497  dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
3498 
3499  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3500  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3501  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3502  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3503  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3504  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3505  dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3506  dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3507  dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
3508  dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
3509  dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
3510  dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
3511  SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3512  SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3513  SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
3514  PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
3515  PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
3516  PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
3517  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3518  SRARI_H2_SH(tmp4, tmp5, 6);
3519  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3520  SAT_SH2_SH(tmp4, tmp5, 7);
3521  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3522  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3523  out2 = PCKEV_XORI128_UB(tmp4, tmp5);
3524  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3525  ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
3526 }
3527 
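/* 2-D 4-tap filter for a single 8x2 block: five source rows yield the two
 * output rows, and both right and left interleave halves of the
 * horizontal intermediates feed the vertical filter. */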
3528 static void hevc_hv_uni_4t_8x2_msa(const uint8_t *src,
3529  int32_t src_stride,
3530  uint8_t *dst,
3531  int32_t dst_stride,
3532  const int8_t *filter_x,
3533  const int8_t *filter_y)
3534 {
3535  v16u8 out;
3536  v16i8 src0, src1, src2, src3, src4;
3537  v8i16 filt0, filt1;
3538  v8i16 filt_h0, filt_h1, filter_vec;
3539  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3540  v16i8 mask1;
3541  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3542  v8i16 dst0, dst1, dst2, dst3, dst4;
3543  v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3544  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3545  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3546  v8i16 out0_r, out1_r;
3547 
3548  src -= (src_stride + 1);
3549 
3550  filter_vec = LD_SH(filter_x);
3551  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3552 
3553  filter_vec = LD_SH(filter_y);
3554  UNPCK_R_SB_SH(filter_vec, filter_vec);
3555 
3556  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3557 
3558  mask1 = mask0 + 2;
3559 
3560  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3561  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3562 
3563  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3564  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3565  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3566  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3567  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3568 
3569  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3570  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3571  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3572  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3573  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
3574  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3575  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3576  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3577  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3578  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3579  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3580  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3581  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3582  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3583  PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
3584  SRARI_H2_SH(out0_r, out1_r, 6);
3585  SAT_SH2_SH(out0_r, out1_r, 7);
3586  out = PCKEV_XORI128_UB(out0_r, out1_r);
3587  ST_D2(out, 0, 1, dst, dst_stride);
3588 }
3589 
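/* 2-D 4-tap filter for width8mult adjacent 8-wide columns, four rows
 * each; used below for the 8x4 (width8mult == 1) and 16x4
 * (width8mult == 2) cases. */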
3590 static void hevc_hv_uni_4t_8multx4_msa(const uint8_t *src,
3591  int32_t src_stride,
3592  uint8_t *dst,
3593  int32_t dst_stride,
3594  const int8_t *filter_x,
3595  const int8_t *filter_y,
3596  int32_t width8mult)
3597 {
3598  uint32_t cnt;
3599  v16u8 out0, out1;
3600  v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
3601  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3602  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
3603  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
3604  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3605  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3606  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3607 
3608  src -= (src_stride + 1);
3609 
3610  filter_vec = LD_SH(filter_x);
3611  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3612 
3613  filter_vec = LD_SH(filter_y);
3614  UNPCK_R_SB_SH(filter_vec, filter_vec);
3615 
3616  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3617 
3618  mask0 = LD_SB(ff_hevc_mask_arr);
3619  mask1 = mask0 + 2;
3620 
3621  for (cnt = width8mult; cnt--;) {
3622  LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3623  src += 8;
3624  XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3625 
3626  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3627  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3628  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3629 
3630  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3631  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3632  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3633 
3634  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3635  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3636 
3637  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3638  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3639  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3640  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3641 
3642  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3643  dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3644  dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3645  dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3646 
3647  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3648  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3649  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3650  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3651 
3652  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3653  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3654  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3655  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3656  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3657  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3658  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3659  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3660 
3661  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3662  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3663 
3664  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3665  dst3_r, tmp0, tmp1, tmp2, tmp3);
3666  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3667  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3668  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3669  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3670  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3671  dst += 8;
3672  }
3673 }
3674 
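/* 2-D 4-tap filter for a single 8x6 block: nine source rows are filtered
 * horizontally up front, then the six vertical results are produced in
 * one straight-line pass. */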
3675 static void hevc_hv_uni_4t_8x6_msa(const uint8_t *src,
3676  int32_t src_stride,
3677  uint8_t *dst,
3678  int32_t dst_stride,
3679  const int8_t *filter_x,
3680  const int8_t *filter_y)
3681 {
3682  v16u8 out0, out1, out2;
3683  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3684  v8i16 filt0, filt1;
3685  v8i16 filt_h0, filt_h1, filter_vec;
3686  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3687  v16i8 mask1;
3688  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3689  v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
3690  v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3691  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3692  v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3693  v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3694  v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3695  v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3696  v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3697  v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
3698 
3699  src -= (src_stride + 1);
3700 
3701  filter_vec = LD_SH(filter_x);
3702  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3703 
3704  filter_vec = LD_SH(filter_y);
3705  UNPCK_R_SB_SH(filter_vec, filter_vec);
3706 
3707  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3708 
3709  mask1 = mask0 + 2;
3710 
3711  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3712  src += (5 * src_stride);
3713  LD_SB4(src, src_stride, src5, src6, src7, src8);
3714 
3715  XORI_B5_128_SB(src0, src1, src2, src3, src4);
3716  XORI_B4_128_SB(src5, src6, src7, src8);
3717 
3718  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3719  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3720  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3721  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3722  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3723  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
3724  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
3725  VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
3726  VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
3727 
3728  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3729  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3730  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3731  dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3732  dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
3733  dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
3734  dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
3735  dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
3736  dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
3737 
3738  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3739  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3740  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3741  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3742  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3743  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3744  ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
3745  ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
3746 
3747  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3748  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3749  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3750  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3751  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3752  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3753  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3754  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3755  dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3756  dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
3757  dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3758  dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
3759 
3760  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3761  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3762  SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
3763  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3764  dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
3765  PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
3766  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3767  SRARI_H2_SH(out4_r, out5_r, 6);
3768  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3769  SAT_SH2_SH(out4_r, out5_r, 7);
3770  out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3771  out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3772  out2 = PCKEV_XORI128_UB(out4_r, out5_r);
3773 
3774  ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3775  ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
3776 }
3777 
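/* Generic 2-D 4-tap filter for (8 * width8mult)-wide blocks whose height
 * is a multiple of 4.  Each 8-wide column keeps a rolling window of
 * horizontally filtered rows (dst10_*, dst21_*, dst2), so only four new
 * rows are loaded and filtered per inner iteration. */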
3778 static void hevc_hv_uni_4t_8multx4mult_msa(const uint8_t *src,
3779  int32_t src_stride,
3780  uint8_t *dst,
3781  int32_t dst_stride,
3782  const int8_t *filter_x,
3783  const int8_t *filter_y,
3784  int32_t height,
3785  int32_t width8mult)
3786 {
3787  uint32_t loop_cnt, cnt;
3788  const uint8_t *src_tmp;
3789  uint8_t *dst_tmp;
3790  v16u8 out0, out1;
3791  v16i8 src0, src1, src2, src3, src4, src5, src6;
3792  v8i16 filt0, filt1;
3793  v8i16 filt_h0, filt_h1, filter_vec;
3794  v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3795  v16i8 mask1;
3796  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3797  v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3798  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3799  v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3800  v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3801  v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
3802  v8i16 out0_r, out1_r, out2_r, out3_r;
3803 
3804  src -= (src_stride + 1);
3805 
3806  filter_vec = LD_SH(filter_x);
3807  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3808 
3809  filter_vec = LD_SH(filter_y);
3810  UNPCK_R_SB_SH(filter_vec, filter_vec);
3811 
3812  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3813 
3814  mask1 = mask0 + 2;
3815 
3816  for (cnt = width8mult; cnt--;) {
3817  src_tmp = src;
3818  dst_tmp = dst;
3819 
3820  LD_SB3(src_tmp, src_stride, src0, src1, src2);
3821  src_tmp += (3 * src_stride);
3822 
3823  XORI_B3_128_SB(src0, src1, src2);
3824
3825  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3826  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3827  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3828 
3829  dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3830  dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3831  dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3832 
3833  ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3834  ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3835 
3836  for (loop_cnt = (height >> 2); loop_cnt--;) {
3837  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3838  src_tmp += (4 * src_stride);
3839 
3840  XORI_B4_128_SB(src3, src4, src5, src6);
3841 
3842  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3843  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3844  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3845  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3846 
3847  dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3848  dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3849  dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3850  dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3851 
3852  ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3853  ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3854  ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3855  ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3856 
3857  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3858  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3859  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3860  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3861  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3862  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3863  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3864  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3865 
3866  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3867  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3868 
3869  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3870  dst2_l, dst2_r, dst3_l, dst3_r,
3871  out0_r, out1_r, out2_r, out3_r);
3872 
3873  SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3874  SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3875  out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3876  out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3877  ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
3878  dst_tmp += (4 * dst_stride);
3879 
3880  dst10_r = dst54_r;
3881  dst10_l = dst54_l;
3882  dst21_r = dst65_r;
3883  dst21_l = dst65_l;
3884  dst2 = dst6;
3885  }
3886 
3887  src += 8;
3888  dst += 8;
3889  }
3890 }
3891 
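/* Dispatch on height for the 8-wide hv path: dedicated kernels for the
 * 2-, 4- and 6-row cases, the generic multiple-of-4 loop otherwise. */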
3892 static void hevc_hv_uni_4t_8w_msa(const uint8_t *src,
3893  int32_t src_stride,
3894  uint8_t *dst,
3895  int32_t dst_stride,
3896  const int8_t *filter_x,
3897  const int8_t *filter_y,
3898  int32_t height)
3899 {
3900  if (2 == height) {
3901  hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
3902  filter_x, filter_y);
3903  } else if (4 == height) {
3904  hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride,
3905  filter_x, filter_y, 1);
3906  } else if (6 == height) {
3907  hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
3908  filter_x, filter_y);
3909  } else if (0 == (height % 4)) {
3910  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
3911  filter_x, filter_y, height, 1);
3912  }
3913 }
3914 
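/* 2-D 4-tap filter for 12-wide blocks: an 8-wide column is filtered first
 * (four iterations of four rows), then the remaining 4-wide column reuses
 * the paired-row masks from ff_hevc_mask_arr + 16 (two iterations of
 * eight rows).  Both loops are sized for the 16-row blocks of this
 * width. */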
3915 static void hevc_hv_uni_4t_12w_msa(const uint8_t *src,
3916  int32_t src_stride,
3917  uint8_t *dst,
3918  int32_t dst_stride,
3919  const int8_t *filter_x,
3920  const int8_t *filter_y,
3921  int32_t height)
3922 {
3923  uint32_t loop_cnt;
3924  const uint8_t *src_tmp;
3925  uint8_t *dst_tmp;
3926  v16u8 out0, out1;
3927  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3928  v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3929  v16i8 mask0, mask1, mask2, mask3;
3930  v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
3931  v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
3932  v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3933  v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
3934  v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3935  v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3936  v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3937  v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3938 
3939  src -= (src_stride + 1);
3940 
3941  filter_vec = LD_SH(filter_x);
3942  SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3943 
3944  filter_vec = LD_SH(filter_y);
3945  UNPCK_R_SB_SH(filter_vec, filter_vec);
3946 
3947  SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3948 
3949  mask0 = LD_SB(ff_hevc_mask_arr);
3950  mask1 = mask0 + 2;
3951 
3952  src_tmp = src;
3953  dst_tmp = dst;
3954 
3955  LD_SB3(src_tmp, src_stride, src0, src1, src2);
3956  src_tmp += (3 * src_stride);
3957 
3958  XORI_B3_128_SB(src0, src1, src2);
3959
3960  VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3961  VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3962  VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3963 
3964  dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3965  dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3966  dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3967 
3968  ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3969  ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3970 
3971  for (loop_cnt = 4; loop_cnt--;) {
3972  LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3973  src_tmp += (4 * src_stride);
3974  XORI_B4_128_SB(src3, src4, src5, src6);
3975 
3976  VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3977  VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3978  VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3979  VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3980 
3981  dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3982  dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3983  dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3984  dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3985 
3986  ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3987  ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3988  ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3989  ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3990 
3991  dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3992  dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3993  dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3994  dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3995  dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3996  dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3997  dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3998  dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3999 
4000  SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4001  SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4002 
4003  PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4004  dst3_r, tmp0, tmp1, tmp2, tmp3);
4005  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
4006  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4007  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4008  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4009  ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
4010  dst_tmp += (4 * dst_stride);
4011 
4012  dst10_r = dst54_r;
4013  dst10_l = dst54_l;
4014  dst21_r = dst65_r;
4015  dst21_l = dst65_l;
4016  dsth2 = dsth6;
4017  }
4018 
4019  src += 8;
4020  dst += 8;
4021 
4022  mask2 = LD_SB(ff_hevc_mask_arr + 16);
4023  mask3 = mask2 + 2;
4024 
4025  LD_SB3(src, src_stride, src0, src1, src2);
4026  src += (3 * src_stride);
4027  XORI_B3_128_SB(src0, src1, src2);
4028  VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
4029  VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
4030 
4031  dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4032  dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4033 
4034  ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
4035  dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4036 
4037  for (loop_cnt = 2; loop_cnt--;) {
4038  LD_SB8(src, src_stride,
4039  src3, src4, src5, src6, src7, src8, src9, src10);
4040  src += (8 * src_stride);
4041  XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4042  VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
4043  VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
4044  VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
4045  VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
4046 
4047  dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4048  dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4049  dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4050  dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4051 
4052  dst32_r = __msa_ilvr_h(dst73, dst22);
4053  ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
4054  ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4055  ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4056  dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4057  dst76_r = __msa_ilvr_h(dst22, dst106);
4058 
4059  dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4060  dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4061  dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4062  dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4063  dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4064  dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4065  dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4066  dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4067  SRA_4V(dst0, dst1, dst2, dst3, 6);
4068  SRA_4V(dst4, dst5, dst6, dst7, 6);
4069  PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
4070  tmp0, tmp1, tmp2, tmp3);
4071  SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
4072  SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4073  out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4074  out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4075  ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4076  dst += (8 * dst_stride);
4077 
4078  dst10_r = dst98_r;
4079  dst21_r = dst109_r;
4080  dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
4081  }
4082 }
4083 
4084 static void hevc_hv_uni_4t_16w_msa(const uint8_t *src,
4085  int32_t src_stride,
4086  uint8_t *dst,
4087  int32_t dst_stride,
4088  const int8_t *filter_x,
4089  const int8_t *filter_y,
4090  int32_t height)
4091 {
4092  if (4 == height) {
4093  hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride, filter_x,
4094  filter_y, 2);
4095  } else {
4096  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4097  filter_x, filter_y, height, 2);
4098  }
4099 }
4100 
4101 static void hevc_hv_uni_4t_24w_msa(const uint8_t *src,
4102  int32_t src_stride,
4103  uint8_t *dst,
4104  int32_t dst_stride,
4105  const int8_t *filter_x,
4106  const int8_t *filter_y,
4107  int32_t height)
4108 {
4109  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4110  filter_x, filter_y, height, 3);
4111 }
4112 
4113 static void hevc_hv_uni_4t_32w_msa(const uint8_t *src,
4114  int32_t src_stride,
4115  uint8_t *dst,
4116  int32_t dst_stride,
4117  const int8_t *filter_x,
4118  const int8_t *filter_y,
4119  int32_t height)
4120 {
4121  hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4122  filter_x, filter_y, height, 4);
4123 }
4124 
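/* The macros below instantiate the public, avcodec-facing entry points.
 * UNI_MC_COPY(W) emits ff_hevc_put_hevc_uni_pel_pixels<W>_8_msa, a plain
 * block copy for full-pel motion vectors.  A hypothetical call (a sketch;
 * the real call sites live in the HEVC DSP dispatch tables):
 *
 *     // copy a 16x16 block of 8-bit samples
 *     ff_hevc_put_hevc_uni_pel_pixels16_8_msa(dst, dst_stride,
 *                                             src, src_stride,
 *                                             16, 0, 0, 16);
 */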
4125 #define UNI_MC_COPY(WIDTH) \
4126 void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
4127  ptrdiff_t dst_stride, \
4128  const uint8_t *src, \
4129  ptrdiff_t src_stride, \
4130  int height, \
4131  intptr_t mx, \
4132  intptr_t my, \
4133  int width) \
4134 { \
4135  copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height); \
4136 }
4137 
4138 UNI_MC_COPY(8);
4139 UNI_MC_COPY(12);
4140 UNI_MC_COPY(16);
4141 UNI_MC_COPY(24);
4142 UNI_MC_COPY(32);
4143 UNI_MC_COPY(48);
4144 UNI_MC_COPY(64);
4145 
4146 #undef UNI_MC_COPY
4147 
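/* UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) wires one 1-D kernel to
 * its public name: PEL selects the filter family (qpel = 8-tap luma,
 * epel = 4-tap chroma), DIR/DIR1 the axis (h/hz or v/vt), and FILT_DIR
 * the fractional offset (mx or my) used to index the coefficient table.
 * For reference, the HEVC half-sample chroma filter is {-4, 36, 36, -4}
 * and the half-sample luma filter is {-1, 4, -11, 40, 40, -11, 4, -1};
 * each sums to 64, which is the gain the ">> 6" rounding stages assume. */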
4148 #define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
4149 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
4150  ptrdiff_t dst_stride, \
4151  const uint8_t *src, \
4152  ptrdiff_t src_stride, \
4153  int height, \
4154  intptr_t mx, \
4155  intptr_t my, \
4156  int width) \
4157 { \
4158  const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR]; \
4159  \
4160  common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
4161  filter, height); \
4162 }
4163 
4164 UNI_MC(qpel, h, 4, 8, hz, mx);
4165 UNI_MC(qpel, h, 8, 8, hz, mx);
4166 UNI_MC(qpel, h, 12, 8, hz, mx);
4167 UNI_MC(qpel, h, 16, 8, hz, mx);
4168 UNI_MC(qpel, h, 24, 8, hz, mx);
4169 UNI_MC(qpel, h, 32, 8, hz, mx);
4170 UNI_MC(qpel, h, 48, 8, hz, mx);
4171 UNI_MC(qpel, h, 64, 8, hz, mx);
4172 
4173 UNI_MC(qpel, v, 4, 8, vt, my);
4174 UNI_MC(qpel, v, 8, 8, vt, my);
4175 UNI_MC(qpel, v, 12, 8, vt, my);
4176 UNI_MC(qpel, v, 16, 8, vt, my);
4177 UNI_MC(qpel, v, 24, 8, vt, my);
4178 UNI_MC(qpel, v, 32, 8, vt, my);
4179 UNI_MC(qpel, v, 48, 8, vt, my);
4180 UNI_MC(qpel, v, 64, 8, vt, my);
4181 
4182 UNI_MC(epel, h, 4, 4, hz, mx);
4183 UNI_MC(epel, h, 6, 4, hz, mx);
4184 UNI_MC(epel, h, 8, 4, hz, mx);
4185 UNI_MC(epel, h, 12, 4, hz, mx);
4186 UNI_MC(epel, h, 16, 4, hz, mx);
4187 UNI_MC(epel, h, 24, 4, hz, mx);
4188 UNI_MC(epel, h, 32, 4, hz, mx);
4189 
4190 UNI_MC(epel, v, 4, 4, vt, my);
4191 UNI_MC(epel, v, 6, 4, vt, my);
4192 UNI_MC(epel, v, 8, 4, vt, my);
4193 UNI_MC(epel, v, 12, 4, vt, my);
4194 UNI_MC(epel, v, 16, 4, vt, my);
4195 UNI_MC(epel, v, 24, 4, vt, my);
4196 UNI_MC(epel, v, 32, 4, vt, my);
4197 
4198 #undef UNI_MC
4199 
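/* UNI_MC_HV(PEL, WIDTH, TAP) emits the 2-D entry points: both mx and my
 * are fractional, so a horizontal and a vertical filter are fetched from
 * the same table and handed to the hevc_hv_uni_* kernels above. */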
4200 #define UNI_MC_HV(PEL, WIDTH, TAP) \
4201 void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
4202  ptrdiff_t dst_stride, \
4203  const uint8_t *src, \
4204  ptrdiff_t src_stride, \
4205  int height, \
4206  intptr_t mx, \
4207  intptr_t my, \
4208  int width) \
4209 { \
4210  const int8_t *filter_x = ff_hevc_##PEL##_filters[mx]; \
4211  const int8_t *filter_y = ff_hevc_##PEL##_filters[my]; \
4212  \
4213  hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
4214  filter_x, filter_y, height); \
4215 }
4216 
4217 UNI_MC_HV(qpel, 4, 8);
4218 UNI_MC_HV(qpel, 8, 8);
4219 UNI_MC_HV(qpel, 12, 8);
4220 UNI_MC_HV(qpel, 16, 8);
4221 UNI_MC_HV(qpel, 24, 8);
4222 UNI_MC_HV(qpel, 32, 8);
4223 UNI_MC_HV(qpel, 48, 8);
4224 UNI_MC_HV(qpel, 64, 8);
4225 
4226 UNI_MC_HV(epel, 4, 4);
4227 UNI_MC_HV(epel, 6, 4);
4228 UNI_MC_HV(epel, 8, 4);
4229 UNI_MC_HV(epel, 12, 4);
4230 UNI_MC_HV(epel, 16, 4);
4231 UNI_MC_HV(epel, 24, 4);
4232 UNI_MC_HV(epel, 32, 4);
4233 
4234 #undef UNI_MC_HV
common_vt_8t_16w_msa
static void common_vt_8t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:1154
XORI_B2_128_SH
#define XORI_B2_128_SH(...)
Definition: generic_macros_msa.h:1836
VSHF_B2_SB
#define VSHF_B2_SB(...)
Definition: generic_macros_msa.h:662
UNI_MC
#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)
Definition: hevc_mc_uni_msa.c:4148
LD_SB4
#define LD_SB4(...)
Definition: generic_macros_msa.h:297
common_vt_8t_24w_msa
static void common_vt_8t_24w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:1322
LD_UB8
#define LD_UB8(...)
Definition: generic_macros_msa.h:335
ILVR_H2_SH
#define ILVR_H2_SH(...)
Definition: generic_macros_msa.h:1392
hevc_hv_uni_4t_8x2_msa
static void hevc_hv_uni_4t_8x2_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
Definition: hevc_mc_uni_msa.c:3528
HORIZ_8TAP_4WID_4VECS_FILT
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1)
Definition: hevc_mc_uni_msa.c:34
common_hz_4t_8x4mult_msa
static void common_hz_4t_8x4mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2239
DPADD_SB2_SH
#define DPADD_SB2_SH(...)
Definition: generic_macros_msa.h:833
hevc_hv_uni_4t_8multx4_msa
static void hevc_hv_uni_4t_8multx4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t width8mult)
Definition: hevc_mc_uni_msa.c:3590
SRARI_H2_SH
#define SRARI_H2_SH(...)
Definition: generic_macros_msa.h:2059
out
FILE * out
Definition: movenc.c:54
common_hz_4t_4x4_msa
static void common_hz_4t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:2031
SPLATI_H4_SH
#define SPLATI_H4_SH(...)
Definition: generic_macros_msa.h:1674
common_vt_4t_32w_msa
static void common_vt_4t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:3084
src1
const pixel * src1
Definition: h264pred_template.c:421
SAT_SH4_SH
#define SAT_SH4_SH(...)
Definition: generic_macros_msa.h:1615
common_hz_8t_4x16_msa
static void common_hz_8t_4x16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:352
VSHF_B3_SB
#define VSHF_B3_SB(in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, out0, out1, out2)
Definition: vp8_mc_lsx.c:54
PCKEV_H2_SW
#define PCKEV_H2_SW(...)
Definition: generic_macros_msa.h:1760
hevc_hv_uni_4t_4multx8mult_msa
static void hevc_hv_uni_4t_4multx8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:3291
common_vt_4t_8x6_msa
static void common_vt_4t_8x6_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:2740
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
SRARI_H4_SH
#define SRARI_H4_SH(...)
Definition: generic_macros_msa.h:2067
common_hz_4t_16w_msa
static void common_hz_4t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2340
SAT_SH2_SH
#define SAT_SH2_SH(...)
Definition: generic_macros_msa.h:1601
copy_width32_msa
static void copy_width32_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:219
ST_D1
#define ST_D1(in, idx, pdst)
Definition: generic_macros_msa.h:485
hevc_hv_uni_8t_12w_msa
static void hevc_hv_uni_8t_12w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1709
LD_SH
#define LD_SH(...)
Definition: generic_macros_msa.h:35
common_hz_8t_24w_msa
static void common_hz_8t_24w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:610
hevc_hv_uni_8t_8multx2mult_msa
static void hevc_hv_uni_8t_8multx2mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width)
Definition: hevc_mc_uni_msa.c:1558
common_vt_4t_4x2_msa
static void common_vt_4t_4x2_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:2554
VSHF_B4_SB
#define VSHF_B4_SB(...)
Definition: generic_macros_msa.h:680
PCKEV_B4_UB
#define PCKEV_B4_UB(...)
Definition: generic_macros_msa.h:1739
filter
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce then the filter should push the output frames on the output link immediately As an exception to the previous rule if the input frame is enough to produce several output frames then the filter needs output only at least one per link The additional frames can be left buffered in the filter
Definition: filter_design.txt:228
ST_UB8
#define ST_UB8(...)
Definition: generic_macros_msa.h:391
hevc_hv_uni_8t_64w_msa
static void hevc_hv_uni_8t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1992
ST_UB4
#define ST_UB4(...)
Definition: generic_macros_msa.h:374
SAT_SH3_SH
#define SAT_SH3_SH(...)
Definition: generic_macros_msa.h:1608
DOTP_SB2_SH
#define DOTP_SB2_SH(...)
Definition: generic_macros_msa.h:768
common_hz_8t_4w_msa
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:408
hevc_hv_uni_8t_4w_msa
static void hevc_hv_uni_8t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1357
XORI_B4_128_SB
#define XORI_B4_128_SB(...)
Definition: generic_macros_msa.h:1851
copy_width16_msa
static void copy_width16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:160
hevc_hv_uni_8t_48w_msa
static void hevc_hv_uni_8t_48w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1980
generic_macros_msa.h
ST_W4
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
Definition: vp8_lpf_lsx.c:234
hevc_hv_uni_8t_32w_msa
static void hevc_hv_uni_8t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1968
ST12x8_UB
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)
Definition: generic_macros_msa.h:527
common_hz_8t_4x8_msa
static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:315
LD_SB
#define LD_SB(...)
Definition: generic_macros_msa.h:33
LD_UB
#define LD_UB(...)
Definition: generic_macros_msa.h:32
LD_SB5
#define LD_SB5(...)
Definition: generic_macros_msa.h:308
aligned
static int aligned(int val)
Definition: dashdec.c:170
copy_width24_msa
static void copy_width24_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:196
common_hz_4t_4w_msa
static void common_hz_4t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2141
SW
#define SW(val, pdst)
Definition: generic_macros_msa.h:167
copy_width64_msa
static void copy_width64_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:257
common_vt_4t_8x2_msa
static void common_vt_4t_8x2_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:2714
ILVL_H2_SH
#define ILVL_H2_SH(...)
Definition: generic_macros_msa.h:1292
common_hz_8t_64w_msa
static void common_hz_8t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:827
width
#define width
HEVC_FILT_8TAP_SH
#define HEVC_FILT_8TAP_SH(in0, in1, in2, in3, filt0, filt1, filt2, filt3)
Definition: hevc_macros_msa.h:24
ST_H8
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:429
hevc_hv_uni_4t_24w_msa
static void hevc_hv_uni_4t_24w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:4101
common_hz_8t_8w_msa
static void common_hz_8t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:421
UNPCK_R_SB_SH
#define UNPCK_R_SB_SH(in, out)
Definition: generic_macros_msa.h:2156
SRA_4V
#define SRA_4V(in0, in1, in2, in3, shift)
Definition: generic_macros_msa.h:1939
SAT_SW4_SW
#define SAT_SW4_SW(...)
Definition: generic_macros_msa.h:1639
common_vt_4t_24w_msa
static void common_vt_4t_24w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2973
PCKEV_H4_SH
#define PCKEV_H4_SH(...)
Definition: generic_macros_msa.h:1768
HEVC_FILT_8TAP
#define HEVC_FILT_8TAP(in0, in1, in2, in3, filt0, filt1, filt2, filt3)
Definition: hevc_macros_msa.h:35
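As a rough scalar reference (an illustrative assumption, not the vector code) for what the HEVC_FILT_8TAP* dot-product macros accumulate per output sample:

/* One 8-tap filtered sample, before the rounding shift and clip the
 * callers apply (typically (sum + 32) >> 6, clipped to 0..255 for
 * 8-bit uni prediction). The 4-tap chroma case is analogous. */
static int hevc_8tap_sample(const uint8_t *src, const int8_t *filter)
{
    int sum = 0;
    for (int k = 0; k < 8; k++)
        sum += src[k] * filter[k];
    return sum;
}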
common_hz_4t_6w_msa
static void common_hz_4t_6w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2156
ILVR_B3_SH
#define ILVR_B3_SH(...)
Definition: generic_macros_msa.h:1351
common_vt_8t_8w_msa
static void common_vt_8t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:997
common_hz_4t_4x16_msa
static void common_hz_4t_4x16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:2093
hevc_macros_msa.h
common_hz_4t_8x2mult_msa
static void common_hz_4t_8x2mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2205
ILVR_B4_SB
#define ILVR_B4_SB(...)
Definition: generic_macros_msa.h:1360
LD2
#define LD2(psrc, stride, out0, out1)
Definition: generic_macros_msa.h:223
ILVR_D2_SB
#define ILVR_D2_SB(...)
Definition: generic_macros_msa.h:1444
HORIZ_4TAP_4WID_4VECS_FILT
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, out0, out1)
Definition: hevc_mc_uni_msa.c:76
XORI_B5_128_SB
#define XORI_B5_128_SB(...)
Definition: generic_macros_msa.h:1859
ILVRL_H2_SH
#define ILVRL_H2_SH(...)
Definition: generic_macros_msa.h:1508
common_vt_8t_12w_msa
static void common_vt_8t_12w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:1052
ff_hevc_mask_arr
static const uint8_t ff_hevc_mask_arr[16 * 3]
Definition: hevc_mc_uni_msa.c:25
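Read as VSHF_B byte indices, the mask pattern 0,1, 1,2, 2,3, ... gathers overlapping pixel pairs, so each paired signed dot product advances the filter window by one pixel; indices 16 and above select from the second source vector of the VSHF_B2 wrapper. A scalar model of that selection (an assumption, for illustration only):

/* Scalar model (assumption) of the two-source byte shuffle the masks
 * drive: indices 0..15 pick from the first source, 16..31 from the
 * second, matching how the VSHF_B2_* macros are used in this file. */
static void vshf_b_model(const uint8_t *a, const uint8_t *b,
                         const uint8_t *mask, uint8_t *out)
{
    for (int i = 0; i < 16; i++)
        out[i] = (mask[i] < 16) ? a[mask[i]] : b[mask[i] - 16];
}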
hevc_hv_uni_4t_12w_msa
static void hevc_hv_uni_4t_12w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:3915
DOTP_SB3_SH
#define DOTP_SB3_SH(...)
Definition: generic_macros_msa.h:776
hevc_hv_uni_8t_16w_msa
static void hevc_hv_uni_8t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1944
common_hz_8t_32w_msa
static void common_hz_8t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:680
common_hz_4t_8w_msa
static void common_hz_4t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2273
common_hz_4t_4x2_msa
static void common_hz_4t_4x2_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:2004
common_hz_8t_16w_msa
static void common_hz_8t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:552
common_vt_8t_4w_msa
static void common_vt_8t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:906
ST_W2
#define ST_W2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:450
HORIZ_4TAP_8WID_4VECS_FILT
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1, out0, out1, out2, out3)
Definition: hevc_mc_uni_msa.c:88
hevc_hv_uni_8t_24w_msa
static void hevc_hv_uni_8t_24w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1956
common_vt_4t_4w_msa
static void common_vt_4t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2631
ILVR_D3_SB
#define ILVR_D3_SB(...)
Definition: generic_macros_msa.h:1452
hevcdsp_mips.h
VSHF_B2_SH
#define VSHF_B2_SH(...)
Definition: generic_macros_msa.h:664
PCKEV_XORI128_UB
#define PCKEV_XORI128_UB(in0, in1)
Definition: generic_macros_msa.h:2751
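The XORI_*_128 / PCKEV_XORI128_UB pairing implements the usual signed-bias trick: XOR with 128 maps unsigned pixels [0..255] onto signed bytes [-128..127] so signed 8-bit dot products can be used, and XORing the packed result with 128 removes the bias again. A scalar round-trip (an assumption, for illustration):

#include <stdint.h>

static uint8_t xor128_roundtrip(uint8_t pix)
{
    int8_t spix = (int8_t)(pix ^ 128); /* to the signed domain */
    return (uint8_t)(spix ^ 128);      /* back to the pixel domain */
}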
common_hz_4t_24w_msa
static void common_hz_4t_24w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2403
LD_SB7
#define LD_SB7(...)
Definition: generic_macros_msa.h:327
SD4
#define SD4(in0, in1, in2, in3, pdst, stride)
Definition: generic_macros_msa.h:256
LD_UB4
#define LD_UB4(...)
Definition: generic_macros_msa.h:296
XORI_B8_128_SB
#define XORI_B8_128_SB(...)
Definition: generic_macros_msa.h:1880
ILVR_B2_SB
#define ILVR_B2_SB(...)
Definition: generic_macros_msa.h:1338
XORI_B2_128_SB
#define XORI_B2_128_SB(...)
Definition: generic_macros_msa.h:1835
hevc_hv_uni_8t_8w_msa
static void hevc_hv_uni_8t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:1697
hevc_hv_uni_4t_16w_msa
static void hevc_hv_uni_4t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:4084
height
#define height
common_vt_8t_16w_mult_msa
static void common_vt_8t_16w_mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t width)
Definition: hevc_mc_uni_msa.c:1233
hevc_hv_uni_4t_8x6_msa
static void hevc_hv_uni_4t_8x6_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
Definition: hevc_mc_uni_msa.c:3675
common_vt_4t_8x4mult_msa
static void common_vt_4t_8x4mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2793
hevc_hv_uni_4t_8w_msa
static void hevc_hv_uni_4t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:3892
common_vt_4t_8w_msa
static void common_vt_4t_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2838
ST_D2
#define ST_D2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:491
SPLATI_H2_SH
#define SPLATI_H2_SH(...)
Definition: generic_macros_msa.h:1656
HORIZ_8TAP_8WID_4VECS_FILT
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3, filt0, filt1, filt2, filt3, out0, out1, out2, out3)
Definition: hevc_mc_uni_msa.c:51
copy_width48_msa
static void copy_width48_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:236
SPLATI_W4_SH
#define SPLATI_W4_SH(...)
Definition: generic_macros_msa.h:1700
HEVC_FILT_4TAP_SH
#define HEVC_FILT_4TAP_SH(in0, in1, filt0, filt1)
Definition: hevc_macros_msa.h:46
UNI_MC_COPY
#define UNI_MC_COPY(WIDTH)
Definition: hevc_mc_uni_msa.c:4125
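UNI_MC_COPY(WIDTH) generates the per-width copy entry points built on the copy_widthN_msa helpers listed here. As a plain-C sketch (an assumption; the MSA versions use vector loads/stores) of what those helpers compute:

#include <stdint.h>
#include <string.h>

static void copy_width_scalar(const uint8_t *src, int32_t src_stride,
                              uint8_t *dst, int32_t dst_stride,
                              int32_t height, int32_t width)
{
    for (; height > 0; height--) { /* straight row-by-row copy */
        memcpy(dst, src, width);
        src += src_stride;
        dst += dst_stride;
    }
}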
PCKEV_B2_SH
#define PCKEV_B2_SH(...)
Definition: generic_macros_msa.h:1721
hevc_hv_uni_4t_6w_msa
static void hevc_hv_uni_4t_6w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:3409
UNI_MC_HV
#define UNI_MC_HV(PEL, WIDTH, TAP)
Definition: hevc_mc_uni_msa.c:4200
common_vt_4t_6w_msa
static void common_vt_4t_6w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2643
XORI_B4_128_UB
#define XORI_B4_128_UB(...)
Definition: generic_macros_msa.h:1850
src2
const pixel *src2
Definition: h264pred_template.c:422
ST_W8
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3, idx4, idx5, idx6, idx7, pdst, stride)
Definition: generic_macros_msa.h:470
LD4
#define LD4(psrc, stride, out0, out1, out2, out3)
Definition: generic_macros_msa.h:228
ILVL_B2_SB
#define ILVL_B2_SB(...)
Definition: generic_macros_msa.h:1263
filt
static const int8_t filt[NUMTAPS * 2]
Definition: af_earwax.c:39
SPLATI_H4_SB
#define SPLATI_H4_SB(...)
Definition: generic_macros_msa.h:1673
DPADD_SB4_SH
#define DPADD_SB4_SH(...)
Definition: generic_macros_msa.h:841
hevc_hv_uni_4t_4w_msa
static void hevc_hv_uni_4t_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:3389
ST_H2
#define ST_H2(in, idx0, idx1, pdst, stride)
Definition: generic_macros_msa.h:409
SPLATI_W2_SH
#define SPLATI_W2_SH(...)
Definition: generic_macros_msa.h:1692
common_vt_4t_4x4multiple_msa
static void common_vt_4t_4x4multiple_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2585
LD_SB3
#define LD_SB3(...)
Definition: generic_macros_msa.h:289
ILVL_H4_SH
#define ILVL_H4_SH(...)
Definition: generic_macros_msa.h:1301
copy_width8_msa
static void copy_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:104
ST_UB
#define ST_UB(...)
Definition: generic_macros_msa.h:40
hevc_hv_uni_4t_4x4_msa
static void hevc_hv_uni_4t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
Definition: hevc_mc_uni_msa.c:3232
common_hz_4t_12w_msa
static void common_hz_4t_12w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2286
common_vt_8t_64w_msa
static void common_vt_8t_64w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:1349
copy_width12_msa
static void copy_width12_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hevc_mc_uni_msa.c:146
common_hz_8t_4x4_msa
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:286
common_hz_4t_32w_msa
static void common_hz_4t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2486
ST_D4
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
Definition: generic_macros_msa.h:499
DOTP_SB4_SH
#define DOTP_SB4_SH(...)
Definition: generic_macros_msa.h:784
ILVL_B4_SB
#define ILVL_B4_SB(...)
Definition: generic_macros_msa.h:1274
LD_SB8
#define LD_SB8(...)
Definition: generic_macros_msa.h:336
SRARI_W4_SW
#define SRARI_W4_SW(...)
Definition: generic_macros_msa.h:2092
src0
const pixel *const src0
Definition: h264pred_template.c:420
common_vt_8t_48w_msa
static void common_vt_8t_48w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:1341
common_hz_8t_12w_msa
static void common_hz_8t_12w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:473
XORI_B7_128_SB
#define XORI_B7_128_SB(...)
Definition: generic_macros_msa.h:1873
src
pixel *src
Definition: h264pred_template.c:418
HEVC_FILT_4TAP
#define HEVC_FILT_4TAP(in0, in1, filt0, filt1)
Definition: hevc_macros_msa.h:64
int32_t
int32_t
Definition: audioconvert.c:56
common_vt_4t_12w_msa
static void common_vt_4t_12w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2852
common_hz_8t_48w_msa
static void common_hz_8t_48w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:741
h
h
Definition: vp9dsp_template.c:2038
ILVR_H4_SH
#define ILVR_H4_SH(...)
Definition: generic_macros_msa.h:1408
common_hz_4t_4x8_msa
static void common_hz_4t_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
Definition: hevc_mc_uni_msa.c:2058
hevc_hv_uni_4t_32w_msa
static void hevc_hv_uni_4t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
Definition: hevc_mc_uni_msa.c:4113
common_vt_8t_32w_msa
static void common_vt_8t_32w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:1333
ILVR_B2_SH
#define ILVR_B2_SH(...)
Definition: generic_macros_msa.h:1340
PCKEV_D2_SH
#define PCKEV_D2_SH(...)
Definition: generic_macros_msa.h:1789
hevc_hv_uni_4t_4x2_msa
static void hevc_hv_uni_4t_4x2_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
Definition: hevc_mc_uni_msa.c:3177
hevc_hv_uni_4t_8multx4mult_msa
static void hevc_hv_uni_4t_8multx4mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width8mult)
Definition: hevc_mc_uni_msa.c:3778
SD
#define SD
Definition: ccaption_dec.c:940
SPLATI_H2_SB
#define SPLATI_H2_SB(...)
Definition: generic_macros_msa.h:1655
PCKEV_H2_SH
#define PCKEV_H2_SH(...)
Definition: generic_macros_msa.h:1759
XORI_B3_128_SB
#define XORI_B3_128_SB(...)
Definition: generic_macros_msa.h:1843
common_vt_4t_16w_msa
static void common_vt_4t_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
Definition: hevc_mc_uni_msa.c:2915
LD_SB2
#define LD_SB2(...)
Definition: generic_macros_msa.h:278