h264chroma_msa.c
1 /*
2  * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "h264chroma_mips.h"
23 
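/* Byte-shuffle control vectors for the VSHF instructions used by the
 * horizontal and horizontal+vertical filters below.  Each 16-byte row
 * gathers pairs of horizontally adjacent pixels (indices >= 16 select bytes
 * from the second source operand) so that one unsigned byte dot product per
 * output pixel evaluates the 2-tap filter; the row is chosen by byte offset
 * (0, 32, 48 or 64) according to block width and to whether one or two
 * input rows are packed into a single vector. */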
24 static const uint8_t chroma_mask_arr[16 * 5] = {
25  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
26  0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
27  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
28  0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
29  0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
30 };
31 
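/*
 * All kernels below implement the H.264 2-tap (bilinear) chroma
 * interpolation.  A scalar sketch of the horizontal case, with tap weights
 * cA + cB == 8 (chroma_1d_ref is a hypothetical reference helper, shown
 * only to illustrate what the vector code computes; the vertical filters
 * use src[x + stride] as the second tap instead of src[x + 1]):
 *
 *     static void chroma_1d_ref(const uint8_t *src, uint8_t *dst, int stride,
 *                               int cA, int cB, int w, int h)
 *     {
 *         for (int y = 0; y < h; y++) {
 *             for (int x = 0; x < w; x++)
 *                 dst[x] = (cA * src[x] + cB * src[x + 1] + 4) >> 3;
 *             src += stride;
 *             dst += stride;
 *         }
 *     }
 *
 * The vector code produces the same value as ((dotp << 3) + 32) >> 6: the
 * DOTP_* macros form the weighted pixel sums, the "<<= 3" / SLLI_4V shifts
 * plus SRARI_* by 6 perform the rounding right shift by 3, and SAT_UH* with
 * argument 7 clamps the intermediate to 8 bits before PCKEV_B* packs it
 * back to bytes.  The combined horizontal+vertical kernels keep the
 * per-row horizontal sums unscaled and apply the two vertical weights
 * directly, i.e. (cV0 * hz_row0 + cV1 * hz_row1 + 32) >> 6.
 */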
32 static void avc_chroma_hz_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
33  uint32_t coeff0, uint32_t coeff1)
34 {
35  uint16_t out0, out1;
36  v16i8 src0, src1;
37  v8u16 res_r;
38  v8i16 res;
39  v16i8 mask;
40  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
41  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
42  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
43 
44  mask = LD_SB(&chroma_mask_arr[0]);
45 
46  LD_SB2(src, stride, src0, src1);
47 
48  src0 = __msa_vshf_b(mask, src1, src0);
49  res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
50  res_r <<= 3;
51  res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
52  res_r = __msa_sat_u_h(res_r, 7);
53  res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
54 
55  out0 = __msa_copy_u_h(res, 0);
56  out1 = __msa_copy_u_h(res, 2);
57 
58  SH(out0, dst);
59  dst += stride;
60  SH(out1, dst);
61 }
62 
63 static void avc_chroma_hz_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
64  uint32_t coeff0, uint32_t coeff1)
65 {
66  v16u8 src0, src1, src2, src3;
67  v8u16 res_r;
68  v8i16 res;
69  v16i8 mask;
70  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
71  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
72  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
73 
74  mask = LD_SB(&chroma_mask_arr[64]);
75 
76  LD_UB4(src, stride, src0, src1, src2, src3);
77 
78  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
79 
80  src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
81 
82  res_r = __msa_dotp_u_h(src0, coeff_vec);
83  res_r <<= 3;
84  res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
85  res_r = __msa_sat_u_h(res_r, 7);
86  res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
87 
88  ST2x4_UB(res, 0, dst, stride);
89 }
90 
91 static void avc_chroma_hz_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
92  uint32_t coeff0, uint32_t coeff1,
93  int32_t height)
94 {
95  if (2 == height) {
96  avc_chroma_hz_2x2_msa(src, dst, stride, coeff0, coeff1);
97  } else if (4 == height) {
98  avc_chroma_hz_2x4_msa(src, dst, stride, coeff0, coeff1);
99  }
100 }
101 
102 static void avc_chroma_hz_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
103  uint32_t coeff0, uint32_t coeff1)
104 {
105  v16i8 src0, src1;
106  v8u16 res_r;
107  v4i32 res;
108  v16i8 mask;
109  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
110  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
111  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
112 
113  mask = LD_SB(&chroma_mask_arr[0]);
114 
115  LD_SB2(src, stride, src0, src1);
116 
117  src0 = __msa_vshf_b(mask, src1, src0);
118  res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
119  res_r <<= 3;
120  res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
121  res_r = __msa_sat_u_h(res_r, 7);
122  res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
123 
124  ST4x2_UB(res, dst, stride);
125 }
126 
127 static void avc_chroma_hz_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
128  uint32_t coeff0, uint32_t coeff1)
129 {
130  v16u8 src0, src1, src2, src3, out;
131  v8u16 res0_r, res1_r;
132  v16i8 mask;
133  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
134  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
135  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
136 
137  mask = LD_SB(&chroma_mask_arr[0]);
138 
139  LD_UB4(src, stride, src0, src1, src2, src3);
140  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
141  DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
142  res0_r <<= 3;
143  res1_r <<= 3;
144  SRARI_H2_UH(res0_r, res1_r, 6);
145  SAT_UH2_UH(res0_r, res1_r, 7);
146  out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
147  ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
148 }
149 
150 static void avc_chroma_hz_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
151  uint32_t coeff0, uint32_t coeff1)
152 {
153  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, out0, out1;
154  v16i8 mask;
155  v8u16 res0, res1, res2, res3;
156  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
157  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
158  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
159 
160  mask = LD_SB(&chroma_mask_arr[0]);
161 
162  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
163  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
164  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
165  DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0, res1);
166  DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
167  SLLI_4V(res0, res1, res2, res3, 3);
168  SRARI_H4_UH(res0, res1, res2, res3, 6);
169  SAT_UH4_UH(res0, res1, res2, res3, 7);
170  PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
171  ST4x8_UB(out0, out1, dst, stride);
172 }
173 
174 static void avc_chroma_hz_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
175  uint32_t coeff0, uint32_t coeff1,
176  int32_t height)
177 {
178  if (2 == height) {
179  avc_chroma_hz_4x2_msa(src, dst, stride, coeff0, coeff1);
180  } else if (4 == height) {
181  avc_chroma_hz_4x4_msa(src, dst, stride, coeff0, coeff1);
182  } else if (8 == height) {
183  avc_chroma_hz_4x8_msa(src, dst, stride, coeff0, coeff1);
184  }
185 }
186 
187 static void avc_chroma_hz_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
188  uint32_t coeff0, uint32_t coeff1)
189 {
190  v16u8 src0, src1, src2, src3, out0, out1;
191  v8u16 res0, res1, res2, res3;
192  v16i8 mask;
193  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
194  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
195  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
196 
197  mask = LD_SB(&chroma_mask_arr[32]);
198  LD_UB4(src, stride, src0, src1, src2, src3);
199  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
200  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
201  DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
202  coeff_vec, res0, res1, res2, res3);
203  SLLI_4V(res0, res1, res2, res3, 3);
204  SRARI_H4_UH(res0, res1, res2, res3, 6);
205  SAT_UH4_UH(res0, res1, res2, res3, 7);
206  PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
207  ST8x4_UB(out0, out1, dst, stride);
208 }
209 
210 static void avc_chroma_hz_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
211  uint32_t coeff0, uint32_t coeff1)
212 {
213  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
214  v16u8 out0, out1, out2, out3;
215  v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
216  v16i8 mask;
217  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
218  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
219  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
220 
221  mask = LD_SB(&chroma_mask_arr[32]);
222 
223  LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
224  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
225  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
226  VSHF_B2_UB(src4, src4, src5, src5, mask, mask, src4, src5);
227  VSHF_B2_UB(src6, src6, src7, src7, mask, mask, src6, src7);
228  DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
229  coeff_vec, res0, res1, res2, res3);
230  DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
231  coeff_vec, res4, res5, res6, res7);
232  SLLI_4V(res0, res1, res2, res3, 3);
233  SLLI_4V(res4, res5, res6, res7, 3);
234  SRARI_H4_UH(res0, res1, res2, res3, 6);
235  SRARI_H4_UH(res4, res5, res6, res7, 6);
236  SAT_UH4_UH(res0, res1, res2, res3, 7);
237  SAT_UH4_UH(res4, res5, res6, res7, 7);
238  PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
239  PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
240  ST8x8_UB(out0, out1, out2, out3, dst, stride);
241 }
242 
243 static void avc_chroma_hz_nonmult_msa(uint8_t *src, uint8_t *dst,
244  int32_t stride, uint32_t coeff0,
245  uint32_t coeff1, int32_t height)
246 {
247  uint32_t row;
248  v16u8 src0, src1, src2, src3, out0, out1;
249  v8u16 res0, res1, res2, res3;
250  v16i8 mask;
251  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
252  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
253  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
254 
255  mask = LD_SB(&chroma_mask_arr[32]);
256 
257  for (row = height >> 2; row--;) {
258  LD_UB4(src, stride, src0, src1, src2, src3);
259  src += (4 * stride);
260 
261  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
262  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
263  DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
264  coeff_vec, res0, res1, res2, res3);
265  SLLI_4V(res0, res1, res2, res3, 3);
266  SRARI_H4_UH(res0, res1, res2, res3, 6);
267  SAT_UH4_UH(res0, res1, res2, res3, 7);
268  PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
269  ST8x4_UB(out0, out1, dst, stride);
270  dst += (4 * stride);
271  }
272 
273  if (0 != (height % 4)) {
274  for (row = (height % 4); row--;) {
275  src0 = LD_UB(src);
276  src += stride;
277 
278  src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
279 
280  res0 = __msa_dotp_u_h(src0, coeff_vec);
281  res0 <<= 3;
282  res0 = (v8u16) __msa_srari_h((v8i16) res0, 6);
283  res0 = __msa_sat_u_h(res0, 7);
284  res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
285 
286  ST8x1_UB(res0, dst);
287  dst += stride;
288  }
289  }
290 }
291 
292 static void avc_chroma_hz_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
293  uint32_t coeff0, uint32_t coeff1,
294  int32_t height)
295 {
296  if (4 == height) {
297  avc_chroma_hz_8x4_msa(src, dst, stride, coeff0, coeff1);
298  } else if (8 == height) {
299  avc_chroma_hz_8x8_msa(src, dst, stride, coeff0, coeff1);
300  } else {
301  avc_chroma_hz_nonmult_msa(src, dst, stride, coeff0, coeff1, height);
302  }
303 }
304 
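/* Vertical-only filters: vertically adjacent rows are paired with the
 * ILVR_B* interleaves instead of a shuffle mask and then run through the
 * same dot-product / shift / round / saturate sequence as the horizontal
 * kernels. */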
305 static void avc_chroma_vt_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
306  uint32_t coeff0, uint32_t coeff1)
307 {
308  uint16_t out0, out1;
309  v16i8 src0, src1, src2;
310  v16u8 tmp0, tmp1;
311  v8i16 res;
312  v8u16 res_r;
313  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
314  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
315  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
316 
317  LD_SB3(src, stride, src0, src1, src2);
318 
319  ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
320 
321  tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
322 
323  res_r = __msa_dotp_u_h(tmp0, coeff_vec);
324  res_r <<= 3;
325  res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
326  res_r = __msa_sat_u_h(res_r, 7);
327  res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
328 
329  out0 = __msa_copy_u_h(res, 0);
330  out1 = __msa_copy_u_h(res, 2);
331 
332  SH(out0, dst);
333  dst += stride;
334  SH(out1, dst);
335 }
336 
337 static void avc_chroma_vt_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
338  uint32_t coeff0, uint32_t coeff1)
339 {
340  v16u8 src0, src1, src2, src3, src4;
341  v16u8 tmp0, tmp1, tmp2, tmp3;
342  v8i16 res;
343  v8u16 res_r;
344  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
345  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
346  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
347 
348  LD_UB5(src, stride, src0, src1, src2, src3, src4);
349  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
350  tmp0, tmp1, tmp2, tmp3);
351  ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
352 
353  tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
354 
355  res_r = __msa_dotp_u_h(tmp0, coeff_vec);
356  res_r <<= 3;
357  res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
358  res_r = __msa_sat_u_h(res_r, 7);
359 
360  res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
361 
362  ST2x4_UB(res, 0, dst, stride);
363 }
364 
365 static void avc_chroma_vt_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
366  uint32_t coeff0, uint32_t coeff1,
367  int32_t height)
368 {
369  if (2 == height) {
370  avc_chroma_vt_2x2_msa(src, dst, stride, coeff0, coeff1);
371  } else if (4 == height) {
372  avc_chroma_vt_2x4_msa(src, dst, stride, coeff0, coeff1);
373  }
374 }
375 
376 static void avc_chroma_vt_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
377  uint32_t coeff0, uint32_t coeff1)
378 {
379  v16u8 src0, src1, src2;
380  v16u8 tmp0, tmp1;
381  v4i32 res;
382  v8u16 res_r;
383  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
384  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
385  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
386 
387  LD_UB3(src, stride, src0, src1, src2);
388  ILVR_B2_UB(src1, src0, src2, src1, tmp0, tmp1);
389 
390  tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
391  res_r = __msa_dotp_u_h(tmp0, coeff_vec);
392  res_r <<= 3;
393  res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
394  res_r = __msa_sat_u_h(res_r, 7);
395  res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
396 
397  ST4x2_UB(res, dst, stride);
398 }
399 
400 static void avc_chroma_vt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
401  uint32_t coeff0, uint32_t coeff1)
402 {
403  v16u8 src0, src1, src2, src3, src4;
404  v16u8 tmp0, tmp1, tmp2, tmp3;
405  v16u8 out;
406  v8u16 res0_r, res1_r;
407  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
408  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
409  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
410 
411  LD_UB5(src, stride, src0, src1, src2, src3, src4);
412  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
413  tmp3);
414  ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
415  DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
416  res0_r <<= 3;
417  res1_r <<= 3;
418  SRARI_H2_UH(res0_r, res1_r, 6);
419  SAT_UH2_UH(res0_r, res1_r, 7);
420  out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
421  ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
422 }
423 
424 static void avc_chroma_vt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
425  uint32_t coeff0, uint32_t coeff1)
426 {
427  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
428  v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
429  v8u16 res0, res1, res2, res3;
430  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
431  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
432  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
433 
434  LD_UB5(src, stride, src0, src1, src2, src3, src4);
435  src += (5 * stride);
436  LD_UB4(src, stride, src5, src6, src7, src8);
437  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, tmp0, tmp1, tmp2,
438  tmp3);
439  ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
440  tmp7);
441  ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
442  ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
443  DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
444  DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
445  SLLI_4V(res0, res1, res2, res3, 3);
446  SRARI_H4_UH(res0, res1, res2, res3, 6);
447  SAT_UH4_UH(res0, res1, res2, res3, 7);
448  PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
449  ST4x8_UB(out0, out1, dst, stride);
450 }
451 
452 static void avc_chroma_vt_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
453  uint32_t coeff0, uint32_t coeff1,
454  int32_t height)
455 {
456  if (2 == height) {
457  avc_chroma_vt_4x2_msa(src, dst, stride, coeff0, coeff1);
458  } else if (4 == height) {
459  avc_chroma_vt_4x4_msa(src, dst, stride, coeff0, coeff1);
460  } else if (8 == height) {
461  avc_chroma_vt_4x8_msa(src, dst, stride, coeff0, coeff1);
462  }
463 }
464 
465 static void avc_chroma_vt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
466  uint32_t coeff0, uint32_t coeff1)
467 {
468  v16u8 src0, src1, src2, src3, src4, out0, out1;
469  v8u16 res0, res1, res2, res3;
470  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
471  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
472  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
473 
474  LD_UB5(src, stride, src0, src1, src2, src3, src4);
475  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
476  src3);
477  DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
478  coeff_vec, res0, res1, res2, res3);
479  SLLI_4V(res0, res1, res2, res3, 3);
480  SRARI_H4_UH(res0, res1, res2, res3, 6);
481  SAT_UH4_UH(res0, res1, res2, res3, 7);
482  PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
483  ST8x4_UB(out0, out1, dst, stride);
484 }
485 
486 static void avc_chroma_vt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
487  uint32_t coeff0, uint32_t coeff1)
488 {
489  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
490  v16u8 out0, out1, out2, out3;
491  v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
492  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
493  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
494  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
495 
496  LD_UB5(src, stride, src0, src1, src2, src3, src4);
497  src += (5 * stride);
498  LD_UB4(src, stride, src5, src6, src7, src8);
499  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3, src0, src1, src2,
500  src3);
501  ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, src4, src5, src6,
502  src7);
503  DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
504  coeff_vec, res0, res1, res2, res3);
505  DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
506  coeff_vec, res4, res5, res6, res7);
507  SLLI_4V(res0, res1, res2, res3, 3);
508  SLLI_4V(res4, res5, res6, res7, 3);
509  SRARI_H4_UH(res0, res1, res2, res3, 6);
510  SRARI_H4_UH(res4, res5, res6, res7, 6);
511  SAT_UH4_UH(res0, res1, res2, res3, 7);
512  SAT_UH4_UH(res4, res5, res6, res7, 7);
513  PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
514  PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
515  ST8x8_UB(out0, out1, out2, out3, dst, stride);
516 }
517 
518 static void avc_chroma_vt_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
519  uint32_t coeff0, uint32_t coeff1,
520  int32_t height)
521 {
522  if (4 == height) {
523  avc_chroma_vt_8x4_msa(src, dst, stride, coeff0, coeff1);
524  } else if (8 == height) {
525  avc_chroma_vt_8x8_msa(src, dst, stride, coeff0, coeff1);
526  }
527 }
528 
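/* Combined horizontal + vertical filters: each input row is first filtered
 * horizontally (VSHF_B* + DOTP_* with the byte weights), then the per-row
 * sums are scaled by the halfword vertical weights (MUL2/MUL4), added and
 * rounded with SRARI_* by 6. */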
529 static void avc_chroma_hv_2x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
530  uint32_t coef_hor0, uint32_t coef_hor1,
531  uint32_t coef_ver0, uint32_t coef_ver1)
532 {
533  uint16_t out0, out1;
534  v16u8 src0, src1, src2;
535  v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
536  v8i16 res_vert;
537  v16i8 mask;
538  v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
539  v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
540  v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
541  v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
542  v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
543 
544  mask = LD_SB(&chroma_mask_arr[48]);
545 
546  LD_UB3(src, stride, src0, src1, src2);
547  VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
548  DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
549  MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
550 
551  res_vt0 += res_vt1;
552  res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
553  res_vt0 = __msa_sat_u_h(res_vt0, 7);
554  res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
555 
556  out0 = __msa_copy_u_h(res_vert, 0);
557  out1 = __msa_copy_u_h(res_vert, 1);
558 
559  SH(out0, dst);
560  dst += stride;
561  SH(out1, dst);
562 }
563 
564 static void avc_chroma_hv_2x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
565  uint32_t coef_hor0, uint32_t coef_hor1,
566  uint32_t coef_ver0, uint32_t coef_ver1)
567 {
568  v16u8 src0, src1, src2, src3, src4;
569  v16u8 tmp0, tmp1, tmp2, tmp3;
570  v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
571  v8i16 res;
572  v16i8 mask;
573  v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
574  v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
575  v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
576  v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
577  v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
578 
579  mask = LD_SB(&chroma_mask_arr[48]);
580 
581  LD_UB5(src, stride, src0, src1, src2, src3, src4);
582 
583  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
584  VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
585  ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
586  DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
587  MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
588 
589  res_vt0 += res_vt1;
590  res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
591  res_vt0 = __msa_sat_u_h(res_vt0, 7);
592 
593  res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
594 
595  ST2x4_UB(res, 0, dst, stride);
596 }
597 
598 static void avc_chroma_hv_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
599  uint32_t coef_hor0, uint32_t coef_hor1,
600  uint32_t coef_ver0, uint32_t coef_ver1,
601  int32_t height)
602 {
603  if (2 == height) {
604  avc_chroma_hv_2x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
605  coef_ver1);
606  } else if (4 == height) {
607  avc_chroma_hv_2x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
608  coef_ver1);
609  }
610 }
611 
612 static void avc_chroma_hv_4x2_msa(uint8_t *src, uint8_t *dst, int32_t stride,
613  uint32_t coef_hor0, uint32_t coef_hor1,
614  uint32_t coef_ver0, uint32_t coef_ver1)
615 {
616  v16u8 src0, src1, src2;
617  v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
618  v16i8 mask;
619  v4i32 res;
620  v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
621  v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
622  v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
623  v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
624  v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
625 
626  mask = LD_SB(&chroma_mask_arr[0]);
627  LD_UB3(src, stride, src0, src1, src2);
628  VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
629  DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
630  MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
631 
632  res_vt0 += res_vt1;
633  res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
634  res_vt0 = __msa_sat_u_h(res_vt0, 7);
635  res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
636 
637  ST4x2_UB(res, dst, stride);
638 }
639 
640 static void avc_chroma_hv_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
641  uint32_t coef_hor0, uint32_t coef_hor1,
642  uint32_t coef_ver0, uint32_t coef_ver1)
643 {
644  v16u8 src0, src1, src2, src3, src4;
645  v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
646  v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
647  v16i8 mask;
648  v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
649  v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
650  v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
651  v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
652  v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
653  v4i32 res0, res1;
654 
655  mask = LD_SB(&chroma_mask_arr[0]);
656 
657  LD_UB5(src, stride, src0, src1, src2, src3, src4);
658  VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
659  VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
660  DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
661  coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
662  res_hz3);
663  MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
664  res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
665  ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
666  SRARI_H2_UH(res_vt0, res_vt1, 6);
667  SAT_UH2_UH(res_vt0, res_vt1, 7);
668  PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
669  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, stride);
670 }
671 
672 static void avc_chroma_hv_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
673  uint32_t coef_hor0, uint32_t coef_hor1,
674  uint32_t coef_ver0, uint32_t coef_ver1)
675 {
676  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
677  v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
678  v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
679  v16i8 mask;
680  v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
681  v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
682  v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
683  v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
684  v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
685 
686  mask = LD_SB(&chroma_mask_arr[0]);
687 
688  LD_UB5(src, stride, src0, src1, src2, src3, src4);
689  src += (5 * stride);
690  LD_UB4(src, stride, src5, src6, src7, src8);
691 
692  VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
693  VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
694  VSHF_B2_UB(src4, src5, src5, src6, mask, mask, src4, src5);
695  VSHF_B2_UB(src6, src7, src7, src8, mask, mask, src6, src7);
696  DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
697  coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
698  DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
699  coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
700  MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
701  res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
702  MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
703  res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
704  ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
705  ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
706  SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
707  SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
708  PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
709  ST4x8_UB(res0, res1, dst, stride);
710 }
711 
712 static void avc_chroma_hv_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
713  uint32_t coef_hor0, uint32_t coef_hor1,
714  uint32_t coef_ver0, uint32_t coef_ver1,
715  int32_t height)
716 {
717  if (2 == height) {
718  avc_chroma_hv_4x2_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
719  coef_ver1);
720  } else if (4 == height) {
721  avc_chroma_hv_4x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
722  coef_ver1);
723  } else if (8 == height) {
724  avc_chroma_hv_4x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
725  coef_ver1);
726  }
727 }
728 
729 static void avc_chroma_hv_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
730  uint32_t coef_hor0, uint32_t coef_hor1,
731  uint32_t coef_ver0, uint32_t coef_ver1)
732 {
733  v16u8 src0, src1, src2, src3, src4, out0, out1;
734  v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
735  v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
736  v16i8 mask;
737  v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
738  v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
739  v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
740  v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
741  v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
742 
743  mask = LD_SB(&chroma_mask_arr[32]);
744 
745  src0 = LD_UB(src);
746  src += stride;
747 
748  src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
749  res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
750 
751  LD_UB4(src, stride, src1, src2, src3, src4);
752  src += (4 * stride);
753 
754  VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
755  VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
756  DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
757  coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
758  MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
759  res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
760 
761  res_vt0 += (res_hz0 * coeff_vt_vec1);
762  res_vt1 += (res_hz1 * coeff_vt_vec1);
763  res_vt2 += (res_hz2 * coeff_vt_vec1);
764  res_vt3 += (res_hz3 * coeff_vt_vec1);
765 
766  SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
767  SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
768  PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
769  ST8x4_UB(out0, out1, dst, stride);
770 }
771 
772 static void avc_chroma_hv_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
773  uint32_t coef_hor0, uint32_t coef_hor1,
774  uint32_t coef_ver0, uint32_t coef_ver1)
775 {
776  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
777  v16u8 out0, out1, out2, out3;
778  v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
779  v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
780  v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
781  v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
782  v16i8 mask;
783  v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
784  v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
785  v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
786  v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
787  v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
788 
789  mask = LD_SB(&chroma_mask_arr[32]);
790 
791  LD_UB5(src, stride, src0, src1, src2, src3, src4);
792  src += (5 * stride);
793  LD_UB4(src, stride, src5, src6, src7, src8);
794  src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
795  VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
796  VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
797  VSHF_B2_UB(src5, src5, src6, src6, mask, mask, src5, src6);
798  VSHF_B2_UB(src7, src7, src8, src8, mask, mask, src7, src8);
799  res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
800  DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
801  coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
802  res_hz4);
803  DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
804  coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
805  MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
806  coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
807  res_vt3);
808  MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
809  coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
810  res_vt7);
811  res_vt0 += (res_hz0 * coeff_vt_vec1);
812  res_vt1 += (res_hz1 * coeff_vt_vec1);
813  res_vt2 += (res_hz2 * coeff_vt_vec1);
814  res_vt3 += (res_hz3 * coeff_vt_vec1);
815  res_vt4 += (res_hz4 * coeff_vt_vec1);
816  res_vt5 += (res_hz5 * coeff_vt_vec1);
817  res_vt6 += (res_hz6 * coeff_vt_vec1);
818  res_vt7 += (res_hz7 * coeff_vt_vec1);
819  SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
820  SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
821  SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
822  SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
823  PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
824  PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
825  ST8x8_UB(out0, out1, out2, out3, dst, stride);
826 }
827 
828 static void avc_chroma_hv_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
829  uint32_t coef_hor0, uint32_t coef_hor1,
830  uint32_t coef_ver0, uint32_t coef_ver1,
831  int32_t height)
832 {
833  if (4 == height) {
834  avc_chroma_hv_8x4_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
835  coef_ver1);
836  } else if (8 == height) {
837  avc_chroma_hv_8x8_msa(src, dst, stride, coef_hor0, coef_hor1, coef_ver0,
838  coef_ver1);
839  }
840 }
841 
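/* *_and_aver_dst_* variants: same filtering as above, but the result is
 * averaged with the bytes already in dst (__msa_aver_u_b) before being
 * stored, for use by the averaging chroma motion-compensation entry
 * points. */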
842 static void avc_chroma_hz_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
843  uint8_t *dst, int32_t dst_stride,
844  uint32_t coeff0, uint32_t coeff1)
845 {
846  uint16_t out0, out1;
847  uint32_t load0, load1;
848  v16i8 src0, src1;
849  v16u8 dst_data = { 0 };
850  v8u16 res_r;
851  v16u8 res;
852  v16i8 mask;
853  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
854  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
855  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
856 
857  mask = LD_SB(&chroma_mask_arr[0]);
858 
859  LD_SB2(src, src_stride, src0, src1);
860 
861  load0 = LW(dst);
862  load1 = LW(dst + dst_stride);
863 
864  INSERT_W2_UB(load0, load1, dst_data);
865 
866  src0 = __msa_vshf_b(mask, src1, src0);
867 
868  res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
869  res_r <<= 3;
870  res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
871  res_r = __msa_sat_u_h(res_r, 7);
872 
873  res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
874  dst_data = __msa_aver_u_b(res, dst_data);
875 
876  out0 = __msa_copy_u_h((v8i16) dst_data, 0);
877  out1 = __msa_copy_u_h((v8i16) dst_data, 2);
878 
879  SH(out0, dst);
880  dst += dst_stride;
881  SH(out1, dst);
882 }
883 
884 static void avc_chroma_hz_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
885  uint8_t *dst, int32_t dst_stride,
886  uint32_t coeff0, uint32_t coeff1)
887 {
888  v16u8 src0, src1, src2, src3;
889  v16u8 dst0, dst1, dst2, dst3;
890  v8u16 res_r;
891  v16i8 res, mask;
892  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
893  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
894  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
895 
896  mask = LD_SB(&chroma_mask_arr[64]);
897 
898  LD_UB4(src, src_stride, src0, src1, src2, src3);
899  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
900 
901  dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
902  dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
903  dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
904 
905  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
906 
907  src0 = (v16u8) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
908 
909  res_r = __msa_dotp_u_h(src0, coeff_vec);
910  res_r <<= 3;
911  res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
912  res_r = __msa_sat_u_h(res_r, 7);
913 
914  res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
915  dst0 = __msa_aver_u_b((v16u8) res, dst0);
916 
917  ST2x4_UB(dst0, 0, dst, dst_stride);
918 }
919 
920 static void avc_chroma_hz_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
921  uint8_t *dst, int32_t dst_stride,
922  uint32_t coeff0, uint32_t coeff1)
923 {
924  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
925  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
926  v8u16 res0_r, res1_r;
927  v16u8 res0, res1, mask;
928  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
929  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
930  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
931 
932  mask = LD_UB(&chroma_mask_arr[64]);
933 
934  LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
935  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
936 
937  dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
938  dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
939  dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
940 
941  dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5);
942  dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6);
943  dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7);
944 
945  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
946  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, src4, src6);
947  ILVR_D2_UB(src2, src0, src6, src4, src0, src4);
948  DOTP_UB2_UH(src0, src4, coeff_vec, coeff_vec, res0_r, res1_r);
949 
950  res0_r <<= 3;
951  res1_r <<= 3;
952 
953  SRARI_H2_UH(res0_r, res1_r, 6);
954  SAT_UH2_UH(res0_r, res1_r, 7);
955  PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
956  AVER_UB2_UB(res0, dst0, res1, dst4, dst0, dst4);
957 
958  ST2x4_UB(dst0, 0, dst, dst_stride);
959  dst += (4 * dst_stride);
960  ST2x4_UB(dst4, 0, dst, dst_stride);
961 }
962 
963 static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
964  uint8_t *dst, int32_t dst_stride,
965  uint32_t coeff0, uint32_t coeff1,
966  int32_t height)
967 {
968  if (2 == height) {
969  avc_chroma_hz_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
970  coeff0, coeff1);
971  } else if (4 == height) {
972  avc_chroma_hz_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
973  coeff0, coeff1);
974  } else if (8 == height) {
975  avc_chroma_hz_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
976  coeff0, coeff1);
977  }
978 }
979 
980 static void avc_chroma_hz_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
981  uint8_t *dst, int32_t dst_stride,
982  uint32_t coeff0, uint32_t coeff1)
983 {
984  uint32_t load0, load1;
985  v16i8 src0, src1;
986  v16u8 dst_data = { 0 };
987  v8u16 res_r;
988  v16i8 res, mask;
989  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
990  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
991  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
992 
993  mask = LD_SB(&chroma_mask_arr[0]);
994 
995  LD_SB2(src, src_stride, src0, src1);
996 
997  load0 = LW(dst);
998  load1 = LW(dst + dst_stride);
999 
1000  INSERT_W2_UB(load0, load1, dst_data);
1001 
1002  src0 = __msa_vshf_b(mask, src1, src0);
1003 
1004  res_r = __msa_dotp_u_h((v16u8) src0, coeff_vec);
1005  res_r <<= 3;
1006  res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
1007  res_r = __msa_sat_u_h(res_r, 7);
1008  res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
1009  dst_data = __msa_aver_u_b((v16u8) res, dst_data);
1010 
1011  ST4x2_UB(dst_data, dst, dst_stride);
1012 }
1013 
1014 static void avc_chroma_hz_and_aver_dst_4x4multiple_msa(uint8_t *src,
1015  int32_t src_stride,
1016  uint8_t *dst,
1017  int32_t dst_stride,
1018  uint32_t coeff0,
1019  uint32_t coeff1,
1020  int32_t height)
1021 {
1022  uint32_t load0, load1;
1023  uint32_t row;
1024  v16u8 src0, src1, src2, src3;
1025  v16u8 dst0 = { 0 };
1026  v16u8 dst1 = { 0 };
1027  v8u16 res0_r, res1_r;
1028  v16u8 res0, res1, mask;
1029  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1030  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1031  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1032 
1033  mask = LD_UB(&chroma_mask_arr[0]);
1034 
1035  for (row = (height >> 2); row--;) {
1036  LD_UB4(src, src_stride, src0, src1, src2, src3);
1037  src += (4 * src_stride);
1038 
1039  load0 = LW(dst);
1040  load1 = LW(dst + dst_stride);
1041 
1042  INSERT_W2_UB(load0, load1, dst0);
1043 
1044  load0 = LW(dst + 2 * dst_stride);
1045  load1 = LW(dst + 3 * dst_stride);
1046 
1047  INSERT_W2_UB(load0, load1, dst1);
1048 
1049  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, src0, src2);
1050  DOTP_UB2_UH(src0, src2, coeff_vec, coeff_vec, res0_r, res1_r);
1051 
1052  res0_r <<= 3;
1053  res1_r <<= 3;
1054 
1055  SRARI_H2_UH(res0_r, res1_r, 6);
1056  SAT_UH2_UH(res0_r, res1_r, 7);
1057  PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
1058  AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
1059 
1060  ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
1061  dst += (4 * dst_stride);
1062  }
1063 }
1064 
1065 static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
1066  uint8_t *dst, int32_t dst_stride,
1067  uint32_t coeff0, uint32_t coeff1,
1068  int32_t height)
1069 {
1070  if (2 == height) {
1071  avc_chroma_hz_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
1072  coeff0, coeff1);
1073  } else {
1074  avc_chroma_hz_and_aver_dst_4x4multiple_msa(src, src_stride,
1075  dst, dst_stride,
1076  coeff0, coeff1, height);
1077  }
1078 }
1079 
1080 static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
1081  uint8_t *dst, int32_t dst_stride,
1082  uint32_t coeff0, uint32_t coeff1,
1083  int32_t height)
1084 {
1085  uint32_t row;
1086  v16u8 src0, src1, src2, src3, out0, out1;
1087  v8u16 res0, res1, res2, res3;
1088  v16u8 dst0, dst1, dst2, dst3;
1089  v16i8 mask;
1090  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1091  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1092  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1093 
1094  mask = LD_SB(&chroma_mask_arr[32]);
1095 
1096  for (row = height >> 2; row--;) {
1097  LD_UB4(src, src_stride, src0, src1, src2, src3);
1098  src += (4 * src_stride);
1099  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1100  VSHF_B2_UB(src0, src0, src1, src1, mask, mask, src0, src1);
1101  VSHF_B2_UB(src2, src2, src3, src3, mask, mask, src2, src3);
1102  DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
1103  coeff_vec, res0, res1, res2, res3);
1104  SLLI_4V(res0, res1, res2, res3, 3);
1105  SRARI_H4_UH(res0, res1, res2, res3, 6);
1106  SAT_UH4_UH(res0, res1, res2, res3, 7);
1107  PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1108  PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1109  AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1110  ST8x4_UB(out0, out1, dst, dst_stride);
1111  dst += (4 * dst_stride);
1112  }
1113 }
1114 
1115 static void avc_chroma_vt_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
1116  uint8_t *dst, int32_t dst_stride,
1117  uint32_t coeff0, uint32_t coeff1)
1118 {
1119  uint16_t out0, out1;
1120  uint32_t load0, load1;
1121  v16i8 src0, src1, src2, tmp0, tmp1, res;
1122  v16u8 dst_data = { 0 };
1123  v8u16 res_r;
1124  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1125  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1126  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1127 
1128  LD_SB3(src, src_stride, src0, src1, src2);
1129  load0 = LW(dst);
1130  load1 = LW(dst + dst_stride);
1131 
1132  INSERT_W2_UB(load0, load1, dst_data);
1133 
1134  ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
1135 
1136  tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
1137  res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
1138  res_r <<= 3;
1139  res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
1140  res_r = __msa_sat_u_h(res_r, 7);
1141  res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
1142  dst_data = __msa_aver_u_b((v16u8) res, dst_data);
1143  out0 = __msa_copy_u_h((v8i16) dst_data, 0);
1144  out1 = __msa_copy_u_h((v8i16) dst_data, 2);
1145 
1146  SH(out0, dst);
1147  dst += dst_stride;
1148  SH(out1, dst);
1149 }
1150 
1151 static void avc_chroma_vt_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
1152  uint8_t *dst, int32_t dst_stride,
1153  uint32_t coeff0, uint32_t coeff1)
1154 {
1155  uint32_t load0, load1;
1156  v16i8 src0, src1, src2, src3, src4;
1157  v16u8 tmp0, tmp1, tmp2, tmp3;
1158  v8u16 res_r;
1159  v8i16 res;
1160  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1161  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1162  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1163  v16u8 dst_data = { 0 };
1164 
1165  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1166 
1167  load0 = LW(dst);
1168  load1 = LW(dst + dst_stride);
1169 
1170  dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, load0);
1171  dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, load1);
1172 
1173  load0 = LW(dst + 2 * dst_stride);
1174  load1 = LW(dst + 3 * dst_stride);
1175 
1176  dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, load0);
1177  dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, load1);
1178 
1179  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1180  tmp0, tmp1, tmp2, tmp3);
1181  ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
1182 
1183  tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
1184 
1185  res_r = __msa_dotp_u_h(tmp0, coeff_vec);
1186  res_r <<= 3;
1187  res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
1188  res_r = __msa_sat_u_h(res_r, 7);
1189 
1190  res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
1191  res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
1192 
1193  ST2x4_UB(res, 0, dst, dst_stride);
1194  dst += (4 * dst_stride);
1195 }
1196 
1197 static void avc_chroma_vt_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
1198  uint8_t *dst, int32_t dst_stride,
1199  uint32_t coeff0, uint32_t coeff1)
1200 {
1201  uint32_t load0, load1, load2, load3;
1202  v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1203  v16u8 tmp0, tmp1, tmp2, tmp3;
1204  v8i16 res;
1205  v8u16 res_r;
1206  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1207  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1208  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1209  v16u8 dst_data0 = { 0 };
1210  v16u8 dst_data1 = { 0 };
1211 
1212  LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1213  src += (5 * src_stride);
1214  LD_SB4(src, src_stride, src5, src6, src7, src8);
1215 
1216  LW4(dst, dst_stride, load0, load1, load2, load3);
1217 
1218  dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 0, load0);
1219  dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 1, load1);
1220  dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 2, load2);
1221  dst_data0 = (v16u8) __msa_insert_h((v8i16) dst_data0, 3, load3);
1222 
1223  LW4(dst + 4 * dst_stride, dst_stride, load0, load1, load2, load3);
1224 
1225  dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 0, load0);
1226  dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 1, load1);
1227  dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 2, load2);
1228  dst_data1 = (v16u8) __msa_insert_h((v8i16) dst_data1, 3, load3);
1229 
1230  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1231  tmp0, tmp1, tmp2, tmp3);
1232 
1233  ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
1234 
1235  tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
1236 
1237  res_r = __msa_dotp_u_h(tmp0, coeff_vec);
1238  res_r <<= 3;
1239  res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
1240  res_r = __msa_sat_u_h(res_r, 7);
1241 
1242  res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
1243  res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data0);
1244 
1245  ST2x4_UB(res, 0, dst, dst_stride);
1246  dst += (4 * dst_stride);
1247 
1248  ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
1249  tmp0, tmp1, tmp2, tmp3);
1250 
1251  ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
1252 
1253  tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
1254 
1255  res_r = __msa_dotp_u_h(tmp0, coeff_vec);
1256  res_r <<= 3;
1257  res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
1258  res_r = __msa_sat_u_h(res_r, 7);
1259 
1260  res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
1261  res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data1);
1262 
1263  ST2x4_UB(res, 0, dst, dst_stride);
1264 }
1265 
1266 static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
1267  uint8_t *dst, int32_t dst_stride,
1268  uint32_t coeff0, uint32_t coeff1,
1269  int32_t height)
1270 {
1271  if (2 == height) {
1272  avc_chroma_vt_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
1273  coeff0, coeff1);
1274  } else if (4 == height) {
1275  avc_chroma_vt_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
1276  coeff0, coeff1);
1277  } else if (8 == height) {
1278  avc_chroma_vt_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
1279  coeff0, coeff1);
1280  }
1281 }
1282 
1283 static void avc_chroma_vt_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
1284  uint8_t *dst, int32_t dst_stride,
1285  uint32_t coeff0, uint32_t coeff1)
1286 {
1287  uint32_t load0, load1;
1288  v16i8 src0, src1, src2, tmp0, tmp1;
1289  v16u8 dst_data = { 0 };
1290  v8u16 res_r;
1291  v16u8 res;
1292  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1293  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1294  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1295 
1296  LD_SB3(src, src_stride, src0, src1, src2);
1297 
1298  load0 = LW(dst);
1299  load1 = LW(dst + dst_stride);
1300 
1301  INSERT_W2_UB(load0, load1, dst_data);
1302  ILVR_B2_SB(src1, src0, src2, src1, tmp0, tmp1);
1303 
1304  tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
1305 
1306  res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
1307  res_r <<= 3;
1308  res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
1309  res_r = __msa_sat_u_h(res_r, 7);
1310  res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
1311  res = __msa_aver_u_b(res, dst_data);
1312 
1313  ST4x2_UB(res, dst, dst_stride);
1314 }
1315 
1316 static void avc_chroma_vt_and_aver_dst_4x4mul_msa(uint8_t *src,
1317  int32_t src_stride,
1318  uint8_t *dst,
1319  int32_t dst_stride,
1320  uint32_t coeff0,
1321  uint32_t coeff1,
1322  int32_t height)
1323 {
1324  uint32_t load0, load1, row;
1325  v16i8 src0, src1, src2, src3, src4;
1326  v16u8 tmp0, tmp1, tmp2, tmp3;
1327  v16u8 dst0 = { 0 };
1328  v16u8 dst1 = { 0 };
1329  v8u16 res0_r, res1_r;
1330  v16u8 res0, res1;
1331  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1332  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1333  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1334 
1335  src0 = LD_SB(src);
1336  src += src_stride;
1337 
1338  for (row = (height >> 2); row--;) {
1339  LD_SB4(src, src_stride, src1, src2, src3, src4);
1340  src += (4 * src_stride);
1341 
1342  load0 = LW(dst);
1343  load1 = LW(dst + dst_stride);
1344 
1345  INSERT_W2_UB(load0, load1, dst0);
1346  load0 = LW(dst + 2 * dst_stride);
1347  load1 = LW(dst + 3 * dst_stride);
1348  INSERT_W2_UB(load0, load1, dst1);
1349 
1350  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1351  tmp0, tmp1, tmp2, tmp3);
1352  ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
1353  DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
1354 
1355  res0_r <<= 3;
1356  res1_r <<= 3;
1357 
1358  SRARI_H2_UH(res0_r, res1_r, 6);
1359  SAT_UH2_UH(res0_r, res1_r, 7);
1360  PCKEV_B2_UB(res0_r, res0_r, res1_r, res1_r, res0, res1);
1361  AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
1362 
1363  ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
1364  dst += (4 * dst_stride);
1365  src0 = src4;
1366  }
1367 }
1368 
1369 static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
1370  uint8_t *dst, int32_t dst_stride,
1371  uint32_t coeff0, uint32_t coeff1,
1372  int32_t height)
1373 {
1374  if (2 == height) {
1375  avc_chroma_vt_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
1376  coeff0, coeff1);
1377  } else {
1378  avc_chroma_vt_and_aver_dst_4x4mul_msa(src, src_stride, dst, dst_stride,
1379  coeff0, coeff1, height);
1380  }
1381 }
1382 
1383 static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
1384  uint8_t *dst, int32_t dst_stride,
1385  uint32_t coeff0, uint32_t coeff1,
1386  int32_t height)
1387 {
1388  uint32_t row;
1389  v16u8 src0, src1, src2, src3, src4;
1390  v16u8 out0, out1;
1391  v8u16 res0, res1, res2, res3;
1392  v16u8 dst0, dst1, dst2, dst3;
1393  v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1394  v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1395  v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1396 
1397  src0 = LD_UB(src);
1398  src += src_stride;
1399 
1400  for (row = height >> 2; row--;) {
1401  LD_UB4(src, src_stride, src1, src2, src3, src4);
1402  src += (4 * src_stride);
1403  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1404  ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1405  src0, src1, src2, src3);
1406  DOTP_UB4_UH(src0, src1, src2, src3, coeff_vec, coeff_vec, coeff_vec,
1407  coeff_vec, res0, res1, res2, res3);
1408  SLLI_4V(res0, res1, res2, res3, 3);
1409  SRARI_H4_UH(res0, res1, res2, res3, 6);
1410  SAT_UH4_UH(res0, res1, res2, res3, 7);
1411  PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
1412  PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1413  AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1414  ST8x4_UB(out0, out1, dst, dst_stride);
1415 
1416  dst += (4 * dst_stride);
1417  src0 = src4;
1418  }
1419 }
1420 
1421 static void avc_chroma_hv_and_aver_dst_2x2_msa(uint8_t *src, int32_t src_stride,
1422  uint8_t *dst, int32_t dst_stride,
1423  uint32_t coef_hor0,
1424  uint32_t coef_hor1,
1425  uint32_t coef_ver0,
1426  uint32_t coef_ver1)
1427 {
1428  uint16_t out0, out1;
1429  v16u8 dst0, dst1;
1430  v16u8 src0, src1, src2;
1431  v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
1432  v16i8 res, mask;
1433  v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1434  v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1435  v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1436  v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1437  v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1438 
1439  mask = LD_SB(&chroma_mask_arr[48]);
1440 
1441  LD_UB3(src, src_stride, src0, src1, src2);
1442  LD_UB2(dst, dst_stride, dst0, dst1);
1443  VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
1444  DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
1445  MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
1446 
1447  res_vt0 += res_vt1;
1448  res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
1449  res_vt0 = __msa_sat_u_h(res_vt0, 7);
1450  res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
1451  dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
1452  dst0 = __msa_aver_u_b((v16u8) res, dst0);
1453  out0 = __msa_copy_u_h((v8i16) dst0, 0);
1454  out1 = __msa_copy_u_h((v8i16) dst0, 1);
1455 
1456  SH(out0, dst);
1457  dst += dst_stride;
1458  SH(out1, dst);
1459 }
1460 
1461 static void avc_chroma_hv_and_aver_dst_2x4_msa(uint8_t *src, int32_t src_stride,
1462  uint8_t *dst, int32_t dst_stride,
1463  uint32_t coef_hor0,
1464  uint32_t coef_hor1,
1465  uint32_t coef_ver0,
1466  uint32_t coef_ver1)
1467 {
1468  v16u8 src0, src1, src2, src3, src4;
1469  v16u8 tmp0, tmp1, tmp2, tmp3;
1470  v16u8 dst0, dst1, dst2, dst3;
1471  v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
1472  v16i8 res, mask;
1473  v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1474  v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1475  v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1476  v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1477  v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1478 
1479  mask = LD_SB(&chroma_mask_arr[48]);
1480 
1481  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
1482  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1483  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
1484  VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
1485  ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
1486  DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
1487  MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
1488 
1489  res_vt0 += res_vt1;
1490  res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
1491  res_vt0 = __msa_sat_u_h(res_vt0, 7);
1492  res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
1493 
1494  dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
1495  dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
1496  dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
1497  dst0 = __msa_aver_u_b((v16u8) res, dst0);
1498 
1499  ST2x4_UB(dst0, 0, dst, dst_stride);
1500 }
1501 
1502 static void avc_chroma_hv_and_aver_dst_2x8_msa(uint8_t *src, int32_t src_stride,
1503  uint8_t *dst, int32_t dst_stride,
1504  uint32_t coef_hor0,
1505  uint32_t coef_hor1,
1506  uint32_t coef_ver0,
1507  uint32_t coef_ver1)
1508 {
1509  v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1510  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1511  v16u8 tmp0, tmp1, tmp2, tmp3;
1512  v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
1513  v16i8 res, mask;
1514  v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1515  v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1516  v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1517  v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1518  v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1519 
1520  mask = LD_SB(&chroma_mask_arr[48]);
1521 
1522  LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
1523  src += (5 * src_stride);
1524  LD_UB4(src, src_stride, src5, src6, src7, src8);
1525 
1526  LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
1527 
1528  dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 1, (v8i16) dst1);
1529  dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 2, (v8i16) dst2);
1530  dst0 = (v16u8) __msa_insve_h((v8i16) dst0, 3, (v8i16) dst3);
1531 
1532  dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 1, (v8i16) dst5);
1533  dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 2, (v8i16) dst6);
1534  dst4 = (v16u8) __msa_insve_h((v8i16) dst4, 3, (v8i16) dst7);
1535 
1536  VSHF_B2_UB(src0, src1, src2, src3, mask, mask, tmp0, tmp1);
1537  VSHF_B2_UB(src1, src2, src3, src4, mask, mask, tmp2, tmp3);
1538  ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
1539  VSHF_B2_UB(src4, src5, src6, src7, mask, mask, tmp0, tmp1);
1540  VSHF_B2_UB(src5, src6, src7, src8, mask, mask, tmp2, tmp3);
1541  ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, src4, src5);
1542  DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
1543  MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
1544 
1545  res_vt0 += res_vt1;
1546  res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
1547  res_vt0 = __msa_sat_u_h(res_vt0, 7);
1548  res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
1549  dst0 = __msa_aver_u_b((v16u8) res, dst0);
1550 
1551  ST2x4_UB(dst0, 0, dst, dst_stride);
1552  dst += (4 * dst_stride);
1553 
1554  DOTP_UB2_UH(src4, src5, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
1555  MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
1556 
1557  res_vt0 += res_vt1;
1558  res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
1559  res_vt0 = __msa_sat_u_h(res_vt0, 7);
1560  res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
1561  dst4 = __msa_aver_u_b((v16u8) res, dst4);
1562 
1563  ST2x4_UB(dst4, 0, dst, dst_stride);
1564 }
1565 
1566 static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, int32_t src_stride,
1567  uint8_t *dst, int32_t dst_stride,
1568  uint32_t coef_hor0,
1569  uint32_t coef_hor1,
1570  uint32_t coef_ver0,
1571  uint32_t coef_ver1,
1572  int32_t height)
1573 {
1574  if (2 == height) {
1575  avc_chroma_hv_and_aver_dst_2x2_msa(src, src_stride, dst, dst_stride,
1576  coef_hor0, coef_hor1,
1577  coef_ver0, coef_ver1);
1578  } else if (4 == height) {
1579  avc_chroma_hv_and_aver_dst_2x4_msa(src, src_stride, dst, dst_stride,
1580  coef_hor0, coef_hor1,
1581  coef_ver0, coef_ver1);
1582  } else if (8 == height) {
1583  avc_chroma_hv_and_aver_dst_2x8_msa(src, src_stride, dst, dst_stride,
1584  coef_hor0, coef_hor1,
1585  coef_ver0, coef_ver1);
1586  }
1587 }
1588 
1589 static void avc_chroma_hv_and_aver_dst_4x2_msa(uint8_t *src, int32_t src_stride,
1590  uint8_t *dst, int32_t dst_stride,
1591  uint32_t coef_hor0,
1592  uint32_t coef_hor1,
1593  uint32_t coef_ver0,
1594  uint32_t coef_ver1)
1595 {
1596  v16u8 src0, src1, src2;
1597  v16u8 dst0, dst1;
1598  v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
1599  v16i8 res, mask;
1600  v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1601  v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1602  v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1603  v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1604  v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1605 
1606  mask = LD_SB(&chroma_mask_arr[0]);
1607 
1608  LD_UB3(src, src_stride, src0, src1, src2);
1609  LD_UB2(dst, dst_stride, dst0, dst1);
1610  VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
1611  DOTP_UB2_UH(src0, src1, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1);
1612  MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
1613 
1614  res_vt0 += res_vt1;
1615  res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
1616  res_vt0 = __msa_sat_u_h(res_vt0, 7);
1617  res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
1618  dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
1619  dst0 = __msa_aver_u_b((v16u8) res, dst0);
1620 
1621  ST4x2_UB(dst0, dst, dst_stride);
1622 }
1623 
1624 static void avc_chroma_hv_and_aver_dst_4x4mul_msa(uint8_t *src,
1625  int32_t src_stride,
1626  uint8_t *dst,
1627  int32_t dst_stride,
1628  uint32_t coef_hor0,
1629  uint32_t coef_hor1,
1630  uint32_t coef_ver0,
1631  uint32_t coef_ver1,
1632  int32_t height)
1633 {
1634  uint32_t row;
1635  v16u8 src0, src1, src2, src3, src4;
1636  v16u8 dst0, dst1, dst2, dst3;
1637  v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
1638  v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
1639  v16i8 mask;
1640  v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1641  v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1642  v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1643  v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1644  v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1645  v16u8 res0, res1;
1646 
1647  mask = LD_SB(&chroma_mask_arr[0]);
1648 
1649  src0 = LD_UB(src);
1650  src += src_stride;
1651 
1652  for (row = (height >> 2); row--;) {
1653  LD_UB4(src, src_stride, src1, src2, src3, src4);
1654  src += (4 * src_stride);
1655 
1656  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1657 
1658  VSHF_B2_UB(src0, src1, src1, src2, mask, mask, src0, src1);
1659  VSHF_B2_UB(src2, src3, src3, src4, mask, mask, src2, src3);
1660  DOTP_UB4_UH(src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
1661  coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
1662  res_hz3);
1663  MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
1664  coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
1665  res_vt3);
1666  ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
1667  SRARI_H2_UH(res_vt0, res_vt1, 6);
1668  SAT_UH2_UH(res_vt0, res_vt1, 7);
1669  PCKEV_B2_UB(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
1670 
1671  dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
1672  dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
1673 
1674  AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
1675 
1676  ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
1677  dst += (4 * dst_stride);
1678  src0 = src4;
1679  }
1680 }
1681 
 1682 static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, int32_t src_stride,
 1683  uint8_t *dst, int32_t dst_stride,
1684  uint32_t coef_hor0,
1685  uint32_t coef_hor1,
1686  uint32_t coef_ver0,
1687  uint32_t coef_ver1,
1688  int32_t height)
1689 {
1690  if (2 == height) {
1691  avc_chroma_hv_and_aver_dst_4x2_msa(src, src_stride, dst, dst_stride,
1692  coef_hor0, coef_hor1,
1693  coef_ver0, coef_ver1);
1694  } else {
1695  avc_chroma_hv_and_aver_dst_4x4mul_msa(src, src_stride, dst, dst_stride,
1696  coef_hor0, coef_hor1,
1697  coef_ver0, coef_ver1, height);
1698  }
1699 }
1700 
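 /* 8-wide H+V kernel with averaging: processes four rows per iteration and
  * carries the horizontally filtered last row (res_hz0 = res_hz4) into the
  * next iteration, so each source row is filtered horizontally only once. */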
 1701 static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, int32_t src_stride,
 1702  uint8_t *dst, int32_t dst_stride,
1703  uint32_t coef_hor0,
1704  uint32_t coef_hor1,
1705  uint32_t coef_ver0,
1706  uint32_t coef_ver1,
1707  int32_t height)
1708 {
1709  uint32_t row;
1710  v16u8 src0, src1, src2, src3, src4, out0, out1;
1711  v8u16 res_hz0, res_hz1, res_hz2;
1712  v8u16 res_hz3, res_hz4;
1713  v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
1714  v16u8 dst0, dst1, dst2, dst3;
1715  v16i8 mask;
1716  v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1717  v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1718  v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1719  v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1720  v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1721 
1722  mask = LD_SB(&chroma_mask_arr[32]);
1723 
1724  src0 = LD_UB(src);
1725  src += src_stride;
1726 
1727  src0 = (v16u8) __msa_vshf_b(mask, (v16i8) src0, (v16i8) src0);
1728  res_hz0 = __msa_dotp_u_h(src0, coeff_hz_vec);
1729 
1730  for (row = (height >> 2); row--;) {
1731  LD_UB4(src, src_stride, src1, src2, src3, src4);
1732  src += (4 * src_stride);
1733 
1734  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1735  VSHF_B2_UB(src1, src1, src2, src2, mask, mask, src1, src2);
1736  VSHF_B2_UB(src3, src3, src4, src4, mask, mask, src3, src4);
1737  DOTP_UB4_UH(src1, src2, src3, src4, coeff_hz_vec, coeff_hz_vec,
1738  coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
1739  res_hz4);
1740  MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
1741  coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
1742  res_vt3);
1743 
1744  res_vt0 += (res_hz0 * coeff_vt_vec1);
1745  res_vt1 += (res_hz1 * coeff_vt_vec1);
1746  res_vt2 += (res_hz2 * coeff_vt_vec1);
1747  res_vt3 += (res_hz3 * coeff_vt_vec1);
1748 
1749  SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
1750  SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
1751 
1752  PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
1753  PCKEV_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1754  AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
1755  ST8x4_UB(out0, out1, dst, dst_stride);
1756  dst += (4 * dst_stride);
1757 
1758  res_hz0 = res_hz4;
1759  }
1760 }
1761 
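 /* Unfiltered fast paths: copy_width8_msa and the avg_width4/avg_width8
  * helpers below serve the x == 0 && y == 0 case of the mc entry points,
  * copying or byte-averaging whole rows without any interpolation. */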
1762 static void copy_width8_msa(uint8_t *src, int32_t src_stride,
1763  uint8_t *dst, int32_t dst_stride,
1764  int32_t height)
1765 {
1766  int32_t cnt;
1767  uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
1768  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1769 
1770  if (0 == height % 12) {
1771  for (cnt = (height / 12); cnt--;) {
1772  LD_UB8(src, src_stride,
1773  src0, src1, src2, src3, src4, src5, src6, src7);
1774  src += (8 * src_stride);
1775 
1776  out0 = __msa_copy_u_d((v2i64) src0, 0);
1777  out1 = __msa_copy_u_d((v2i64) src1, 0);
1778  out2 = __msa_copy_u_d((v2i64) src2, 0);
1779  out3 = __msa_copy_u_d((v2i64) src3, 0);
1780  out4 = __msa_copy_u_d((v2i64) src4, 0);
1781  out5 = __msa_copy_u_d((v2i64) src5, 0);
1782  out6 = __msa_copy_u_d((v2i64) src6, 0);
1783  out7 = __msa_copy_u_d((v2i64) src7, 0);
1784 
1785  SD4(out0, out1, out2, out3, dst, dst_stride);
1786  dst += (4 * dst_stride);
1787  SD4(out4, out5, out6, out7, dst, dst_stride);
1788  dst += (4 * dst_stride);
1789 
1790  LD_UB4(src, src_stride, src0, src1, src2, src3);
1791  src += (4 * src_stride);
1792 
1793  out0 = __msa_copy_u_d((v2i64) src0, 0);
1794  out1 = __msa_copy_u_d((v2i64) src1, 0);
1795  out2 = __msa_copy_u_d((v2i64) src2, 0);
1796  out3 = __msa_copy_u_d((v2i64) src3, 0);
1797 
1798  SD4(out0, out1, out2, out3, dst, dst_stride);
1799  dst += (4 * dst_stride);
1800  }
1801  } else if (0 == height % 8) {
1802  for (cnt = height >> 3; cnt--;) {
1803  LD_UB8(src, src_stride,
1804  src0, src1, src2, src3, src4, src5, src6, src7);
1805  src += (8 * src_stride);
1806 
1807  out0 = __msa_copy_u_d((v2i64) src0, 0);
1808  out1 = __msa_copy_u_d((v2i64) src1, 0);
1809  out2 = __msa_copy_u_d((v2i64) src2, 0);
1810  out3 = __msa_copy_u_d((v2i64) src3, 0);
1811  out4 = __msa_copy_u_d((v2i64) src4, 0);
1812  out5 = __msa_copy_u_d((v2i64) src5, 0);
1813  out6 = __msa_copy_u_d((v2i64) src6, 0);
1814  out7 = __msa_copy_u_d((v2i64) src7, 0);
1815 
1816  SD4(out0, out1, out2, out3, dst, dst_stride);
1817  dst += (4 * dst_stride);
1818  SD4(out4, out5, out6, out7, dst, dst_stride);
1819  dst += (4 * dst_stride);
1820  }
1821  } else if (0 == height % 4) {
1822  for (cnt = (height / 4); cnt--;) {
1823  LD_UB4(src, src_stride, src0, src1, src2, src3);
1824  src += (4 * src_stride);
1825  out0 = __msa_copy_u_d((v2i64) src0, 0);
1826  out1 = __msa_copy_u_d((v2i64) src1, 0);
1827  out2 = __msa_copy_u_d((v2i64) src2, 0);
1828  out3 = __msa_copy_u_d((v2i64) src3, 0);
1829 
1830  SD4(out0, out1, out2, out3, dst, dst_stride);
1831  dst += (4 * dst_stride);
1832  }
1833  } else if (0 == height % 2) {
1834  for (cnt = (height / 2); cnt--;) {
1835  LD_UB2(src, src_stride, src0, src1);
1836  src += (2 * src_stride);
1837  out0 = __msa_copy_u_d((v2i64) src0, 0);
1838  out1 = __msa_copy_u_d((v2i64) src1, 0);
1839 
1840  SD(out0, dst);
1841  dst += dst_stride;
1842  SD(out1, dst);
1843  dst += dst_stride;
1844  }
1845  }
1846 }
1847 
1848 static void avg_width4_msa(uint8_t *src, int32_t src_stride,
1849  uint8_t *dst, int32_t dst_stride,
1850  int32_t height)
1851 {
1852  int32_t cnt;
1853  uint32_t out0, out1, out2, out3;
1854  v16u8 src0, src1, src2, src3;
1855  v16u8 dst0, dst1, dst2, dst3;
1856 
1857  if (0 == (height % 4)) {
1858  for (cnt = (height / 4); cnt--;) {
1859  LD_UB4(src, src_stride, src0, src1, src2, src3);
1860  src += (4 * src_stride);
1861 
1862  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1863 
1864  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
1865  dst0, dst1, dst2, dst3);
1866 
1867  out0 = __msa_copy_u_w((v4i32) dst0, 0);
1868  out1 = __msa_copy_u_w((v4i32) dst1, 0);
1869  out2 = __msa_copy_u_w((v4i32) dst2, 0);
1870  out3 = __msa_copy_u_w((v4i32) dst3, 0);
1871  SW4(out0, out1, out2, out3, dst, dst_stride);
1872  dst += (4 * dst_stride);
1873  }
1874  } else if (0 == (height % 2)) {
1875  for (cnt = (height / 2); cnt--;) {
1876  LD_UB2(src, src_stride, src0, src1);
1877  src += (2 * src_stride);
1878 
1879  LD_UB2(dst, dst_stride, dst0, dst1);
1880 
1881  AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
1882 
1883  out0 = __msa_copy_u_w((v4i32) dst0, 0);
1884  out1 = __msa_copy_u_w((v4i32) dst1, 0);
1885  SW(out0, dst);
1886  dst += dst_stride;
1887  SW(out1, dst);
1888  dst += dst_stride;
1889  }
1890  }
1891 }
1892 
1893 static void avg_width8_msa(uint8_t *src, int32_t src_stride,
1894  uint8_t *dst, int32_t dst_stride,
1895  int32_t height)
1896 {
1897  int32_t cnt;
1898  uint64_t out0, out1, out2, out3;
1899  v16u8 src0, src1, src2, src3;
1900  v16u8 dst0, dst1, dst2, dst3;
1901 
1902  for (cnt = (height / 4); cnt--;) {
1903  LD_UB4(src, src_stride, src0, src1, src2, src3);
1904  src += (4 * src_stride);
1905  LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1906 
1907  AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
1908  dst0, dst1, dst2, dst3);
1909 
1910  out0 = __msa_copy_u_d((v2i64) dst0, 0);
1911  out1 = __msa_copy_u_d((v2i64) dst1, 0);
1912  out2 = __msa_copy_u_d((v2i64) dst2, 0);
1913  out3 = __msa_copy_u_d((v2i64) dst3, 0);
1914  SD4(out0, out1, out2, out3, dst, dst_stride);
1915  dst += (4 * dst_stride);
1916  }
1917 }
1918 
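 /*
  * Exported entry points. x and y are the chroma fractional offsets (0..7,
  * enforced by av_assert2 below); the H+V path is used when both are
  * non-zero, a horizontal- or vertical-only path when only one is, and a
  * plain copy (or average) otherwise. As a scalar sketch, each predicted
  * pixel follows the usual H.264 chroma interpolation, with a..d the four
  * neighbouring source samples:
  *
  *     pred = ((8 - x) * (8 - y) * a + x * (8 - y) * b +
  *             (8 - x) * y * c + x * y * d + 32) >> 6;
  *
  * The put variants store pred directly, the avg variants store
  * (dst + pred + 1) >> 1.
  */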
 1919 void ff_put_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
 1920  ptrdiff_t stride, int height, int x, int y)
1921 {
1922  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1923 
1924  if (x && y) {
1925  avc_chroma_hv_8w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
1926  } else if (x) {
1927  avc_chroma_hz_8w_msa(src, dst, stride, x, (8 - x), height);
1928  } else if (y) {
1929  avc_chroma_vt_8w_msa(src, dst, stride, y, (8 - y), height);
1930  } else {
1931  copy_width8_msa(src, stride, dst, stride, height);
1932  }
1933 }
1934 
 1935 void ff_put_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
 1936  ptrdiff_t stride, int height, int x, int y)
1937 {
1938  int32_t cnt;
1939 
1940  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1941 
1942  if (x && y) {
1943  avc_chroma_hv_4w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
1944  } else if (x) {
1945  avc_chroma_hz_4w_msa(src, dst, stride, x, (8 - x), height);
1946  } else if (y) {
1947  avc_chroma_vt_4w_msa(src, dst, stride, y, (8 - y), height);
1948  } else {
1949  for (cnt = height; cnt--;) {
1950  *((uint32_t *) dst) = *((uint32_t *) src);
1951 
1952  src += stride;
1953  dst += stride;
1954  }
1955  }
1956 }
1957 
 1958 void ff_put_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
 1959  ptrdiff_t stride, int height, int x, int y)
1960 {
1961  int32_t cnt;
1962 
1963  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1964 
1965  if (x && y) {
1966  avc_chroma_hv_2w_msa(src, dst, stride, x, (8 - x), y, (8 - y), height);
1967  } else if (x) {
1968  avc_chroma_hz_2w_msa(src, dst, stride, x, (8 - x), height);
1969  } else if (y) {
1970  avc_chroma_vt_2w_msa(src, dst, stride, y, (8 - y), height);
1971  } else {
1972  for (cnt = height; cnt--;) {
1973  *((uint16_t *) dst) = *((uint16_t *) src);
1974 
1975  src += stride;
1976  dst += stride;
1977  }
1978  }
1979 }
1980 
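 /* The avg_* entry points mirror the put_* ones above but feed the existing
  * destination through the *_and_aver_dst_* kernels, i.e. the interpolated
  * result is rounded-averaged with dst instead of overwriting it. */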
 1981 void ff_avg_h264_chroma_mc8_msa(uint8_t *dst, uint8_t *src,
 1982  ptrdiff_t stride, int height, int x, int y)
1983 {
1984  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1985 
1986 
1987  if (x && y) {
1988  avc_chroma_hv_and_aver_dst_8w_msa(src, stride, dst,
1989  stride, x, (8 - x), y,
1990  (8 - y), height);
1991  } else if (x) {
1992  avc_chroma_hz_and_aver_dst_8w_msa(src, stride, dst,
1993  stride, x, (8 - x), height);
1994  } else if (y) {
1995  avc_chroma_vt_and_aver_dst_8w_msa(src, stride, dst,
1996  stride, y, (8 - y), height);
1997  } else {
1998  avg_width8_msa(src, stride, dst, stride, height);
1999  }
2000 }
2001 
 2002 void ff_avg_h264_chroma_mc4_msa(uint8_t *dst, uint8_t *src,
 2003  ptrdiff_t stride, int height, int x, int y)
2004 {
2005  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2006 
2007  if (x && y) {
2008  avc_chroma_hv_and_aver_dst_4w_msa(src, stride, dst,
2009  stride, x, (8 - x), y,
2010  (8 - y), height);
2011  } else if (x) {
2012  avc_chroma_hz_and_aver_dst_4w_msa(src, stride, dst,
2013  stride, x, (8 - x), height);
2014  } else if (y) {
2015  avc_chroma_vt_and_aver_dst_4w_msa(src, stride, dst,
2016  stride, y, (8 - y), height);
2017  } else {
2018  avg_width4_msa(src, stride, dst, stride, height);
2019  }
2020 }
2021 
 2022 void ff_avg_h264_chroma_mc2_msa(uint8_t *dst, uint8_t *src,
 2023  ptrdiff_t stride, int height, int x, int y)
2024 {
2025  int32_t cnt;
2026 
2027  av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2028 
2029  if (x && y) {
2030  avc_chroma_hv_and_aver_dst_2w_msa(src, stride, dst,
2031  stride, x, (8 - x), y,
2032  (8 - y), height);
2033  } else if (x) {
2034  avc_chroma_hz_and_aver_dst_2w_msa(src, stride, dst,
2035  stride, x, (8 - x), height);
2036  } else if (y) {
2037  avc_chroma_vt_and_aver_dst_2w_msa(src, stride, dst,
2038  stride, y, (8 - y), height);
2039  } else {
2040  for (cnt = height; cnt--;) {
2041  dst[0] = (dst[0] + src[0] + 1) >> 1;
2042  dst[1] = (dst[1] + src[1] + 1) >> 1;
2043 
2044  src += stride;
2045  dst += stride;
2046  }
2047  }
2048 }