/*
 * FFmpeg — libavcodec/mips/hpeldsp_msa.c
 * Half-pel pixel interpolation (bilinear) primitives, MIPS MSA SIMD versions.
 */
1 /*
2  * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
23 
/* Pack the even-indexed bytes of in0/in1 into one vector, take the
 * rounding unsigned byte average of that vector with "dst", and store
 * the 16-byte result at pdst. */
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                      \
{                                                                 \
    v16u8 tmp_m;                                                  \
                                                                  \
    tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);      \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);                   \
    ST_UB(tmp_m, (pdst));                                         \
}
32 
/* Pack the even-indexed bytes of four vector pairs (in0..in7) and store
 * the four resulting 16-byte vectors to pdst, "stride" bytes apart. */
#define PCKEV_ST_SB4(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
    uint8_t *pdst_m = (uint8_t *) (pdst);                                   \
                                                                            \
    PCKEV_B4_SB(in0, in1, in2, in3, in4, in5, in6, in7,                     \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                            \
    ST_SB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst_m, stride);                 \
}
42 
/* Pack the even bytes of in1..in4 into two vectors, pack the low
 * doublewords of the four destination vectors dst0..dst3 into two
 * vectors, average each pair (rounding), and store the result as four
 * 8-byte rows at pdst with the given stride. */
#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
                           pdst, stride)                                \
{                                                                       \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
    uint8_t *pdst_m = (uint8_t *) (pdst);                               \
                                                                        \
    PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);                    \
    PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                \
    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);        \
    ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride);                  \
}
54 
/* Horizontal bilinear interpolation for a 4-pixel-wide block:
 * each output pixel is the rounding average of a source pixel and its
 * right neighbour.  Processes "height" rows, two rows per iteration. */
static void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t out0, out1;
    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
    v16i8 zeros = { 0 };

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        /* shift each row left by one byte so lane i holds pixel i+1 */
        SLDI_B2_UB(zeros, src0, zeros, src1, 1, src0_sld1, src1_sld1);
        AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);

        /* only the low word (4 pixels) of each result is written out */
        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;
    }
}
79 
/* Horizontal bilinear interpolation for an 8-pixel-wide block:
 * rounding average of each pixel with its right neighbour.
 * Processes "height" rows, four rows per iteration. */
static void common_hz_bil_8w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16i8 zeros = { 0 };

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        /* right neighbours obtained by a 1-byte left shift of each row */
        SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
                   src0_sld1, src1_sld1, src2_sld1, src3_sld1);
        /* average and store four 8-byte rows */
        AVER_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
                      src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
99 
/* Horizontal bilinear interpolation for a 16-pixel-wide block:
 * rounding average of the rows loaded at src and at src + 1.
 * Processes "height" rows, eight rows per iteration. */
static void common_hz_bil_16w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        /* src0..src7: aligned rows; src8..src15: the same rows offset by 1 */
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src8, src9, src10, src11, src12, src13, src14, src15);
        src += (8 * src_stride);

        AVER_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                       dst, dst_stride);
        dst += (4 * dst_stride);

        AVER_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                       dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
123 
/* Horizontal bilinear interpolation, no-rounding variant, for an 8x8
 * block: uses the truncating average (AVE) of each pixel and its right
 * neighbour instead of the rounding average. */
static void common_hz_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16i8 src4_sld1, src5_sld1, src6_sld1, src7_sld1;
    v16i8 zeros = { 0 };

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);

    /* right neighbours via 1-byte left shift of each row */
    SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
    SLDI_B4_SB(zeros, src4, zeros, src5, zeros, src6, zeros, src7, 1,
               src4_sld1, src5_sld1, src6_sld1, src7_sld1);

    AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST8x4_UB(src4, src4_sld1, src5, src5_sld1,
                 src6, src6_sld1, src7, src7_sld1, dst, dst_stride);
}
146 
/* Horizontal bilinear interpolation, no-rounding variant, for four rows
 * of an 8-pixel-wide block (truncating average of pixel and right
 * neighbour). */
static void common_hz_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16i8 zeros = { 0 };

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    /* right neighbours via 1-byte left shift of each row */
    SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
    AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
}
159 
/* Horizontal bilinear interpolation, no-rounding variant, for a 16x16
 * block: truncating average of the rows at src and src + 1.  Loads for
 * the next group of rows are interleaved with the stores of the current
 * group (software pipelining). */
static void common_hz_bil_no_rnd_16x16_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src8, src9, src10, src11, src12, src13, src14, src15);
    src += (8 * src_stride);

    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                  dst, dst_stride);
    dst += (4 * dst_stride);

    /* reload rows 8..11 into the registers just consumed */
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    LD_UB4((src + 1), src_stride, src8, src9, src10, src11);
    src += (4 * src_stride);

    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                  dst, dst_stride);
    dst += (4 * dst_stride);

    /* reload rows 12..15 */
    LD_UB4(src, src_stride, src4, src5, src6, src7);
    LD_UB4((src + 1), src_stride, src12, src13, src14, src15);
    src += (4 * src_stride);

    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                  dst, dst_stride);
}
194 
/* Horizontal bilinear interpolation, no-rounding variant, for eight
 * rows of a 16-pixel-wide block: truncating average of the rows at src
 * and src + 1. */
static void common_hz_bil_no_rnd_8x16_msa(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src8, src9, src10, src11, src12, src13, src14, src15);

    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                  dst, dst_stride);
}
212 
/* Horizontal bilinear interpolation for a 4-pixel-wide block, averaged
 * with the existing destination pixels (avg_pixels semantics):
 * result = aver(aver(src, src + 1), dst).  Two rows per iteration. */
static void common_hz_bil_and_aver_dst_4w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t dst0, dst1, out0, out1;
    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
    v16u8 tmp0 = { 0 };
    v16u8 tmp1 = { 0 };
    v16i8 zeros = { 0 };

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        /* right neighbours via 1-byte left shift */
        SLDI_B2_UB(zeros, src0, zeros, src1, 1, src0_sld1, src1_sld1);

        /* load 4 existing destination pixels per row into vector lanes */
        dst0 = LW(dst);
        dst1 = LW(dst + dst_stride);
        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);

        /* bilinear average, then average with the destination */
        AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);
        AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;
    }
}
247 
/* Horizontal bilinear interpolation for an 8-pixel-wide block, averaged
 * with the existing destination pixels.  Four rows per iteration. */
static void common_hz_bil_and_aver_dst_8w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16i8 zeros = { 0 };

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        /* right neighbours via 1-byte left shift */
        SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
                   src0_sld1, src1_sld1, src2_sld1, src3_sld1);

        /* bilinear average, average with dst, store four 8-byte rows */
        AVER_DST_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, src2, src2_sld1,
                          src3, src3_sld1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
269 
/* Horizontal bilinear interpolation for a 16-pixel-wide block, averaged
 * with the existing destination pixels.  Eight rows per iteration. */
static void common_hz_bil_and_aver_dst_16w_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        /* src0..src7: aligned rows; src8..src15: same rows offset by 1 */
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src8, src9, src10, src11, src12, src13, src14, src15);
        src += (8 * src_stride);

        AVER_DST_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
                           dst, dst_stride);
        dst += (4 * dst_stride);
        AVER_DST_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
                           dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
293 
/* Vertical bilinear interpolation for a 4-pixel-wide block: rounding
 * average of each row with the row below.  Two output rows per
 * iteration; the last loaded row is carried over between iterations. */
static void common_vt_bil_4w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t out0, out1;
    v16u8 src0, src1, src2, res0, res1;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        AVER_UB2_UB(src0, src1, src1, src2, res0, res1);

        /* only the low word (4 pixels) of each result is written out */
        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;

        /* carry last row into the next iteration */
        src0 = src2;
    }
}
321 
/* Vertical bilinear interpolation for an 8-pixel-wide block: rounding
 * average of each row with the row below.  Four output rows per
 * iteration; the last loaded row is carried over between iterations. */
static void common_vt_bil_8w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        AVER_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                      dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry last row into the next iteration */
        src0 = src4;
    }
}
343 
/* Vertical bilinear interpolation for a 16-pixel-wide block: rounding
 * average of each row with the row below.  Eight output rows per
 * iteration; the last loaded row is carried over between iterations. */
static void common_vt_bil_16w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
        src += (8 * src_stride);

        AVER_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                       dst, dst_stride);
        dst += (4 * dst_stride);
        AVER_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                       dst, dst_stride);
        dst += (4 * dst_stride);

        /* carry last row into the next iteration */
        src0 = src8;
    }
}
368 
/* Vertical bilinear interpolation, no-rounding variant, for an 8x8
 * block: truncating average (AVE) of each row with the row below.
 * Needs 9 input rows for 8 output rows. */
static void common_vt_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    src8 = LD_UB(src);

    AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                 dst, dst_stride);
    dst += (4 * dst_stride);

    AVE_ST8x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                 dst, dst_stride);
}
385 
/* Vertical bilinear interpolation, no-rounding variant, for four rows
 * of an 8-pixel-wide block: truncating average of each row with the row
 * below (5 input rows for 4 output rows). */
static void common_vt_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4;

    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
    AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                 dst, dst_stride);
}
395 
/* Vertical bilinear interpolation, no-rounding variant, for a 16x16
 * block: truncating average of each row with the row below (17 input
 * rows for 16 output rows). */
static void common_vt_bil_no_rnd_16x16_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15, src16;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    LD_UB8(src, src_stride,
           src8, src9, src10, src11, src12, src13, src14, src15);
    src += (8 * src_stride);
    src16 = LD_UB(src);

    AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src8, src9, src9, src10, src10, src11, src11, src12,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src12, src13, src13, src14,
                  src14, src15, src15, src16, dst, dst_stride);
}
422 
/* Vertical bilinear interpolation, no-rounding variant, for eight rows
 * of a 16-pixel-wide block: truncating average of each row with the row
 * below (9 input rows for 8 output rows). */
static void common_vt_bil_no_rnd_8x16_msa(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    src8 = LD_UB(src);

    AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                  dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                  dst, dst_stride);
}
439 
/* Vertical bilinear interpolation for a 4-pixel-wide block, averaged
 * with the existing destination pixels:
 * result = aver(aver(row, row_below), dst).  Two rows per iteration. */
static void common_vt_bil_and_aver_dst_4w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t out0, out1, dst0, dst1;
    v16u8 src0, src1, src2;
    v16u8 tmp0 = { 0 };
    v16u8 tmp1 = { 0 };
    v16u8 res0, res1;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src1, src2);
        src += (2 * src_stride);
        /* load 4 existing destination pixels per row into vector lanes */
        dst0 = LW(dst);
        dst1 = LW(dst + dst_stride);
        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);
        /* vertical average, then average with the destination */
        AVER_UB2_UB(src0, src1, src1, src2, res0, res1);
        AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;
        /* carry last row into the next iteration */
        src0 = src2;
    }
}
473 
/* Vertical bilinear interpolation for an 8-pixel-wide block, averaged
 * with the existing destination pixels.  Four rows per iteration. */
static void common_vt_bil_and_aver_dst_8w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        /* vertical average, average with dst, store four 8-byte rows */
        AVER_DST_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                          dst, dst_stride);
        dst += (4 * dst_stride);
        /* carry last row into the next iteration */
        src0 = src4;
    }
}
495 
/* Vertical bilinear interpolation for a 16-pixel-wide block, averaged
 * with the existing destination pixels.  Eight rows per iteration. */
static void common_vt_bil_and_aver_dst_16w_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 res0, res1, res2, res3, res4, res5, res6, res7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    src0 = LD_UB(src);
    src += src_stride;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
        src += (8 * src_stride);
        /* vertical averages of consecutive row pairs */
        AVER_UB4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                    res0, res1, res2, res3);
        AVER_UB4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                    res4, res5, res6, res7);

        /* average each result with the existing destination row */
        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        AVER_UB4_UB(dst0, res0, dst1, res1, dst2, res2, dst3, res3,
                    res0, res1, res2, res3);
        AVER_UB4_UB(dst4, res4, dst5, res5, dst6, res6, dst7, res7,
                    res4, res5, res6, res7);
        ST_UB8(res0, res1, res2, res3, res4, res5, res6, res7, dst, dst_stride);
        dst += (8 * dst_stride);

        /* carry last row into the next iteration */
        src0 = src8;
    }
}
528 
/* Horizontal + vertical (2x2) bilinear interpolation for a 4-pixel-wide
 * block: each output pixel is (a + b + c + d + 2) >> 2 of a 2x2 source
 * neighbourhood (rounding via SRARI).  Two rows per iteration. */
static void common_hv_bil_4w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t res0, res1;
    v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
    v16u8 src0_r, src1_r, src2_r, res;
    v8u16 add0, add1, add2, sum0, sum1;
    v16i8 zeros = { 0 };

    src0 = LD_SB(src);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        /* right neighbours via 1-byte left shift of each row */
        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
                   src1_sld1, src2_sld1);
        /* interleave pixel with right neighbour, then horizontally add
         * byte pairs to get per-pixel (a + b) sums in halfwords */
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2,
                   src0_r, src1_r, src2_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        /* add vertically adjacent row sums, then (sum + 2) >> 2 */
        ADD2(add0, add1, add1, add2, sum0, sum1);
        SRARI_H2_UH(sum0, sum1, 2);
        res = (v16u8) __msa_pckev_b((v16i8) sum1, (v16i8) sum0);
        res0 = __msa_copy_u_w((v4i32) res, 0);
        res1 = __msa_copy_u_w((v4i32) res, 2);
        SW(res0, dst);
        dst += dst_stride;
        SW(res1, dst);
        dst += dst_stride;

        /* carry last row into the next iteration */
        src0 = src2;
    }
}
565 
/* Horizontal + vertical (2x2) bilinear interpolation for an
 * 8-pixel-wide block: (a + b + c + d + 2) >> 2 per output pixel.
 * Four rows per iteration. */
static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 uint8_t height)
{
    uint8_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
    v8u16 add0, add1, add2, add3, add4;
    v8u16 sum0, sum1, sum2, sum3;
    v16i8 zeros = { 0 };

    src0 = LD_SB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        /* right neighbours via 1-byte left shift of each row */
        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
                   src1_sld1, src2_sld1);
        SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
        /* per-row horizontal pair sums via interleave + horizontal add */
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
                   src1_r, src2_r);
        ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        HADD_UB2_UH(src3_r, src4_r, add3, add4);
        /* vertical sums of adjacent rows, then (sum + 2) >> 2 */
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
             sum0, sum1, sum2, sum3);
        SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
        PCKEV_B2_SB(sum1, sum0, sum3, sum2, src0, src1);
        ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
        /* carry last row into the next iteration */
        src0 = src4;
    }
}
602 
/* Horizontal + vertical (2x2) bilinear interpolation for a
 * 16-pixel-wide block: (a + b + c + d + 2) >> 2 per output pixel.
 * The 16-byte rows are split into right (_r) and left (_l) interleaved
 * halves to keep the halfword sums in-register.  Eight rows per
 * iteration. */
static void common_hv_bil_16w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v8u16 src7_l, src8_l;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        /* src0..src8: 9 aligned rows; src9..src17: same rows offset by 1 */
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src9, src10, src11, src12, src13, src14, src15, src16);
        src += (8 * src_stride);

        src8 = LD_UB(src);
        src17 = LD_UB(src + 1);

        /* interleave each row with its right-shifted copy (both halves) */
        ILVRL_B2_UH(src9, src0, src0_r, src0_l);
        ILVRL_B2_UH(src10, src1, src1_r, src1_l);
        ILVRL_B2_UH(src11, src2, src2_r, src2_l);
        ILVRL_B2_UH(src12, src3, src3_r, src3_l);
        ILVRL_B2_UH(src13, src4, src4_r, src4_l);
        ILVRL_B2_UH(src14, src5, src5_r, src5_l);
        ILVRL_B2_UH(src15, src6, src6_r, src6_l);
        ILVRL_B2_UH(src16, src7, src7_r, src7_l);
        ILVRL_B2_UH(src17, src8, src8_r, src8_l);
        /* horizontal pair sums per row (in place) */
        HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
        HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
        HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
        HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
        HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
        HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
        /* vertical sums of adjacent rows */
        ADD4(src0_r, src1_r, src1_r, src2_r, src2_r, src3_r, src3_r, src4_r,
             sum0_r, sum1_r, sum2_r, sum3_r);
        ADD4(src4_r, src5_r, src5_r, src6_r, src6_r, src7_r, src7_r, src8_r,
             sum4_r, sum5_r, sum6_r, sum7_r);
        ADD4(src0_l, src1_l, src1_l, src2_l, src2_l, src3_l, src3_l, src4_l,
             sum0_l, sum1_l, sum2_l, sum3_l);
        ADD4(src4_l, src5_l, src5_l, src6_l, src6_l, src7_l, src7_l, src8_l,
             sum4_l, sum5_l, sum6_l, sum7_l);
        /* rounding shift: (sum + 2) >> 2 */
        SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2);
        SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
        SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
        SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
        PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, sum2_l, sum2_r,
                     sum3_l, sum3_r, dst, dst_stride);
        dst += (4 * dst_stride);
        PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r, sum6_l, sum6_r,
                     sum7_l, sum7_r, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
660 
/* Horizontal + vertical (2x2) bilinear interpolation, no-rounding
 * variant, for an 8x8 block: (a + b + c + d + 1) >> 2 per output pixel
 * (+1 added explicitly, then plain arithmetic shift — no SRARI). */
static void common_hv_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16u8 src4_sld1, src5_sld1, src6_sld1, src7_sld1, src8_sld1;
    v8u16 src0_r, src1_r, src2_r, src3_r;
    v8u16 src4_r, src5_r, src6_r, src7_r, src8_r;
    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
    v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
    v16i8 out0, out1;
    v16i8 zeros = { 0 };

    /* 9 input rows for 8 output rows */
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    src8 = LD_UB(src);

    /* right neighbours via 1-byte left shift of each row */
    SLDI_B4_UB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
    SLDI_B3_UB(zeros, src4, zeros, src5, zeros, src6, 1, src4_sld1,
               src5_sld1, src6_sld1);
    SLDI_B2_UB(zeros, src7, zeros, src8, 1, src7_sld1, src8_sld1);
    /* per-row horizontal pair sums via interleave + horizontal add */
    ILVR_B4_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src3_sld1,
               src3, src0_r, src1_r, src2_r, src3_r);
    ILVR_B3_UH(src4_sld1, src4, src5_sld1, src5, src6_sld1, src6, src4_r,
               src5_r, src6_r);
    ILVR_B2_UH(src7_sld1, src7, src8_sld1, src8, src7_r, src8_r);
    HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
    HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
    HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);

    /* vertical sums of adjacent rows, +1 for the no-rounding bias */
    sum0 = add0 + add1 + 1;
    sum1 = add1 + add2 + 1;
    sum2 = add2 + add3 + 1;
    sum3 = add3 + add4 + 1;
    sum4 = add4 + add5 + 1;
    sum5 = add5 + add6 + 1;
    sum6 = add6 + add7 + 1;
    sum7 = add7 + add8 + 1;

    SRA_4V(sum0, sum1, sum2, sum3, 2);
    SRA_4V(sum4, sum5, sum6, sum7, 2);
    PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    PCKEV_B2_SB(sum5, sum4, sum7, sum6, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
708 
/* Horizontal + vertical (2x2) bilinear interpolation, no-rounding
 * variant, for four rows of an 8-pixel-wide block:
 * (a + b + c + d + 1) >> 2 per output pixel. */
static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
{
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r;
    v8u16 add0, add1, add2, add3, add4;
    v8u16 sum0, sum1, sum2, sum3;
    v16i8 out0, out1;
    v16i8 zeros = { 0 };

    /* 5 input rows for 4 output rows */
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    src4 = LD_SB(src);

    /* right neighbours via 1-byte left shift of each row */
    SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
               src1_sld1, src2_sld1);
    SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
    /* per-row horizontal pair sums via interleave + horizontal add */
    ILVR_B3_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
               src1_r, src2_r);
    ILVR_B2_UH(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
    HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
    HADD_UB2_UH(src3_r, src4_r, add3, add4);

    /* vertical sums of adjacent rows, +1 for the no-rounding bias */
    sum0 = add0 + add1 + 1;
    sum1 = add1 + add2 + 1;
    sum2 = add2 + add3 + 1;
    sum3 = add3 + add4 + 1;

    SRA_4V(sum0, sum1, sum2, sum3, 2);
    PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}
742 
/* Horizontal + vertical (2x2) bilinear interpolation, no-rounding
 * variant, for a 16x16 block: (a + b + c + d + 1) >> 2 per output
 * pixel.  The block is processed as two 16x8 halves; loads for the
 * second half are interleaved with the stores of the first half
 * (software pipelining). */
static void common_hv_bil_no_rnd_16x16_msa(const uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v8u16 src7_l, src8_l;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;

    /* first half: 9 aligned rows plus the same rows offset by 1 */
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src9, src10, src11, src12, src13, src14, src15, src16);
    src += (8 * src_stride);
    src8 = LD_UB(src);
    src17 = LD_UB(src + 1);

    /* interleave each row with its right-shifted copy (both halves) */
    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
    ILVRL_B2_UH(src17, src8, src8_r, src8_l);

    /* horizontal pair sums per row (in place) */
    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);

    /* vertical sums of adjacent rows, +1 for the no-rounding bias */
    sum0_r = src0_r + src1_r + 1;
    sum1_r = src1_r + src2_r + 1;
    sum2_r = src2_r + src3_r + 1;
    sum3_r = src3_r + src4_r + 1;
    sum4_r = src4_r + src5_r + 1;
    sum5_r = src5_r + src6_r + 1;
    sum6_r = src6_r + src7_r + 1;
    sum7_r = src7_r + src8_r + 1;
    sum0_l = src0_l + src1_l + 1;
    sum1_l = src1_l + src2_l + 1;
    sum2_l = src2_l + src3_l + 1;
    sum3_l = src3_l + src4_l + 1;
    sum4_l = src4_l + src5_l + 1;
    sum5_l = src5_l + src6_l + 1;
    sum6_l = src6_l + src7_l + 1;
    sum7_l = src7_l + src8_l + 1;

    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
    dst += (4 * dst_stride);

    /* start loading the second half while the first is still stored */
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src9, src10, src11, src12, src13, src14, src15, src16);
    src += (8 * src_stride);
    src8 = LD_UB(src);
    src17 = LD_UB(src + 1);

    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
    dst += (4 * dst_stride);

    /* second half: identical processing on rows 8..16 */
    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
    ILVRL_B2_UH(src17, src8, src8_r, src8_l);

    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);

    sum0_r = src0_r + src1_r + 1;
    sum1_r = src1_r + src2_r + 1;
    sum2_r = src2_r + src3_r + 1;
    sum3_r = src3_r + src4_r + 1;
    sum4_r = src4_r + src5_r + 1;
    sum5_r = src5_r + src6_r + 1;
    sum6_r = src6_r + src7_r + 1;
    sum7_r = src7_r + src8_r + 1;
    sum0_l = src0_l + src1_l + 1;
    sum1_l = src1_l + src2_l + 1;
    sum2_l = src2_l + src3_l + 1;
    sum3_l = src3_l + src4_l + 1;
    sum4_l = src4_l + src5_l + 1;
    sum5_l = src5_l + src6_l + 1;
    sum6_l = src6_l + src7_l + 1;
    sum7_l = src7_l + src8_l + 1;

    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
    dst += (4 * dst_stride);
    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
}
859 
/* Horizontal + vertical (2x2) bilinear interpolation, no-rounding
 * variant, for eight rows of a 16-pixel-wide block:
 * (a + b + c + d + 1) >> 2 per output pixel. */
static void common_hv_bil_no_rnd_8x16_msa(const uint8_t *src,
                                          int32_t src_stride,
                                          uint8_t *dst, int32_t dst_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v8u16 src7_l, src8_l;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;

    /* 9 aligned rows plus the same rows offset by 1 */
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src9, src10, src11, src12, src13, src14, src15, src16);
    src += (8 * src_stride);
    src8 = LD_UB(src);
    src17 = LD_UB(src + 1);

    /* interleave each row with its right-shifted copy (both halves) */
    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
    ILVRL_B2_UH(src17, src8, src8_r, src8_l);

    /* horizontal pair sums per row (in place) */
    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);

    /* vertical sums of adjacent rows, +1 for the no-rounding bias */
    sum0_r = src0_r + src1_r + 1;
    sum1_r = src1_r + src2_r + 1;
    sum2_r = src2_r + src3_r + 1;
    sum3_r = src3_r + src4_r + 1;
    sum4_r = src4_r + src5_r + 1;
    sum5_r = src5_r + src6_r + 1;
    sum6_r = src6_r + src7_r + 1;
    sum7_r = src7_r + src8_r + 1;
    sum0_l = src0_l + src1_l + 1;
    sum1_l = src1_l + src2_l + 1;
    sum2_l = src2_l + src3_l + 1;
    sum3_l = src3_l + src4_l + 1;
    sum4_l = src4_l + src5_l + 1;
    sum5_l = src5_l + src6_l + 1;
    sum6_l = src6_l + src7_l + 1;
    sum7_l = src7_l + src8_l + 1;

    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
    dst += (4 * dst_stride);
    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
}
923 
/* Horizontal+vertical bilinear interpolation, 4 columns wide, with the
 * result averaged into the existing destination ("avg" semantics).
 *
 * Two output rows are produced per loop iteration.  Each source row is
 * combined with its 1-byte-shifted copy (horizontal pair), vertically
 * adjacent rows are added, and SRARI performs a rounded >> 2.  The packed
 * result is then byte-averaged with the current dst contents before the
 * low 32 bits of each row are stored.
 *
 * height must be even (loop runs height >> 1 times).
 */
static void common_hv_bil_and_aver_dst_4w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    uint32_t out0, out1;
    v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
    v16u8 src0_r, src1_r, src2_r;
    v8u16 add0, add1, add2, sum0, sum1;
    v16u8 dst0, dst1, res0, res1;
    v16i8 zeros = { 0 };

    src0 = LD_SB(src);
    src += src_stride;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        LD_UB2(dst, dst_stride, dst0, dst1);
        /* Shift each row left by one byte to get the x+1 neighbours. */
        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
                   src1_sld1, src2_sld1);
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
                   src1_r, src2_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        /* Vertical combine of adjacent row sums. */
        ADD2(add0, add1, add1, add2, sum0, sum1);
        /* Rounded divide by 4. */
        SRARI_H2_UH(sum0, sum1, 2);
        PCKEV_B2_UB(sum0, sum0, sum1, sum1, res0, res1);
        /* Average the interpolated rows with what is already in dst. */
        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;

        /* Last loaded row becomes the first row of the next iteration. */
        src0 = src2;
    }
}
965 
/* Horizontal+vertical bilinear interpolation, 8 columns wide, averaged
 * into the destination.  Four output rows per iteration; height must be
 * a multiple of 4 (loop runs height >> 2 times).
 *
 * Same scheme as the 4-column version: each row is added to its
 * 1-byte-shifted copy, vertically adjacent row sums are combined, SRARI
 * gives a rounded >> 2, and the packed bytes are averaged with dst before
 * being stored as 8-byte rows.
 */
static void common_hv_bil_and_aver_dst_8w_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride,
                                              uint8_t height)
{
    uint8_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
    v8u16 add0, add1, add2, add3, add4;
    v8u16 sum0, sum1, sum2, sum3;
    v16i8 zeros = { 0 };

    src0 = LD_SB(src);
    src += src_stride;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        /* x+1 neighbours via a 1-byte shift of each row. */
        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
                   src1_sld1, src2_sld1);
        SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
                   src1_r, src2_r);
        ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        HADD_UB2_UH(src3_r, src4_r, add3, add4);
        /* Vertical combine, rounded >> 2, then pack + average with dst. */
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
             sum0, sum1, sum2, sum3);
        SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
        PCKEV_AVG_ST8x4_UB(sum0, dst0, sum1, dst1,
                           sum2, dst2, sum3, dst3, dst, dst_stride);
        dst += (4 * dst_stride);
        /* Carry the last row into the next iteration. */
        src0 = src4;
    }
}
1005 
/* Horizontal+vertical bilinear interpolation, 16 columns wide, averaged
 * into the destination.  Eight output rows per iteration; height must be
 * a multiple of 8 (loop runs height >> 3 times).
 *
 * Unlike the narrower variants, the x+1 neighbours are fetched with a
 * second set of loads from src + 1 instead of byte shifts.  Nine row
 * pairs are interleaved (right/left halves), horizontally added,
 * vertically combined, rounded with SRARI >> 2, and finally byte-averaged
 * with the existing dst rows via PCKEV_AVG_ST_UB.
 */
static void common_hv_bil_and_aver_dst_16w_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride,
                                               uint8_t height)
{
    uint8_t loop_cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11, src12, src13, src14, src15, src16, src17;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v16u8 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v16u8 src7_l, src8_l;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        /* Rows 0..7 at both x and x+1, plus the ninth row pair below. */
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src9, src10, src11, src12, src13, src14, src15, src16);
        src += (8 * src_stride);

        src8 = LD_UB(src);
        src17 = LD_UB(src + 1);

        ILVRL_B2_UB(src9, src0, src0_r, src0_l);
        ILVRL_B2_UB(src10, src1, src1_r, src1_l);
        ILVRL_B2_UB(src11, src2, src2_r, src2_l);
        ILVRL_B2_UB(src12, src3, src3_r, src3_l);
        ILVRL_B2_UB(src13, src4, src4_r, src4_l);
        ILVRL_B2_UB(src14, src5, src5_r, src5_l);
        ILVRL_B2_UB(src15, src6, src6_r, src6_l);
        ILVRL_B2_UB(src16, src7, src7_r, src7_l);
        ILVRL_B2_UB(src17, src8, src8_r, src8_l);
        /* Right halves: horizontal add then vertical combine. */
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
        HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_r, sum1_r,
             sum2_r, sum3_r);
        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_r, sum5_r,
             sum6_r, sum7_r);
        /* Left halves, same computation. */
        HADD_UB3_UH(src0_l, src1_l, src2_l, add0, add1, add2);
        HADD_UB3_UH(src3_l, src4_l, src5_l, add3, add4, add5);
        HADD_UB3_UH(src6_l, src7_l, src8_l, add6, add7, add8);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_l, sum1_l,
             sum2_l, sum3_l);
        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_l, sum5_l,
             sum6_l, sum7_l);
        /* Rounded divide by 4. */
        SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2);
        SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
        SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
        SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
        /* Pack, average with existing dst rows, and store one row each. */
        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        PCKEV_AVG_ST_UB(sum0_l, sum0_r, dst0, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum1_l, sum1_r, dst1, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum2_l, sum2_r, dst2, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum3_l, sum3_r, dst3, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum4_l, sum4_r, dst4, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum5_l, sum5_r, dst5, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum6_l, sum6_r, dst6, dst);
        dst += dst_stride;
        PCKEV_AVG_ST_UB(sum7_l, sum7_r, dst7, dst);
        dst += dst_stride;
    }
}
1077 
/* Plain copy of an 8-byte-wide block of 'height' rows.
 *
 * Rows are loaded as full 16-byte vectors but only the low 8 bytes are
 * written out (double-word element 0 extracted with __msa_copy_u_d).
 * Separate unrolled paths handle heights that are multiples of 12, 8,
 * 4 or 2; a height matching none of these copies nothing.
 */
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (0 == height % 12) {
        /* 12 rows per iteration: 8 rows, then 4 rows. */
        for (cnt = (height / 12); cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);

            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 8) {
        /* 8 rows per iteration. */
        for (cnt = height >> 3; cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 4) {
        /* 4 rows per iteration. */
        for (cnt = (height / 4); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 2) {
        /* 2 rows per iteration. */
        for (cnt = (height / 2); cnt--;) {
            LD_UB2(src, src_stride, src0, src1);
            src += (2 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);

            SD(out0, dst);
            dst += dst_stride;
            SD(out1, dst);
            dst += dst_stride;
        }
    }
}
1163 
1164 static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
1165  uint8_t *dst, int32_t dst_stride,
1167 {
1168  int32_t cnt, loop_cnt;
1169  const uint8_t *src_tmp;
1170  uint8_t *dst_tmp;
1171  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1172 
1173  for (cnt = (width >> 4); cnt--;) {
1174  src_tmp = src;
1175  dst_tmp = dst;
1176 
1177  for (loop_cnt = (height >> 3); loop_cnt--;) {
1178  LD_UB8(src_tmp, src_stride,
1179  src0, src1, src2, src3, src4, src5, src6, src7);
1180  src_tmp += (8 * src_stride);
1181 
1182  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
1183  dst_tmp, dst_stride);
1184  dst_tmp += (8 * dst_stride);
1185  }
1186 
1187  src += 16;
1188  dst += 16;
1189  }
1190 }
1191 
/* Plain copy of a 16-byte-wide block of 'height' rows.
 *
 * Heights divisible by 12 use an unrolled 8+4 row path; heights divisible
 * by 8 delegate to the generic 16xN/8-row copier; heights divisible by 4
 * copy four rows per iteration.  Other heights copy nothing.
 */
static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (0 == height % 12) {
        for (cnt = (height / 12); cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
                   dst, dst_stride);
            dst += (8 * dst_stride);

            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 8) {
        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
    } else if (0 == height % 4) {
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}
1225 
/* Byte-average a 4-byte-wide source block into dst: dst = avg(src, dst).
 *
 * Only the low 32 bits of each averaged vector row are written.  Heights
 * divisible by 4 use a four-row path, otherwise even heights use a
 * two-row path; odd heights process nothing.
 */
static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           int32_t height)
{
    int32_t cnt;
    uint32_t out0, out1, out2, out3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;

    if (0 == (height % 4)) {
        for (cnt = (height / 4); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                        dst0, dst1, dst2, dst3);

            /* Store only word element 0 (4 pixels) of each row. */
            out0 = __msa_copy_u_w((v4i32) dst0, 0);
            out1 = __msa_copy_u_w((v4i32) dst1, 0);
            out2 = __msa_copy_u_w((v4i32) dst2, 0);
            out3 = __msa_copy_u_w((v4i32) dst3, 0);
            SW4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 2)) {
        for (cnt = (height / 2); cnt--;) {
            LD_UB2(src, src_stride, src0, src1);
            src += (2 * src_stride);

            LD_UB2(dst, dst_stride, dst0, dst1);

            AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);

            out0 = __msa_copy_u_w((v4i32) dst0, 0);
            out1 = __msa_copy_u_w((v4i32) dst1, 0);
            SW(out0, dst);
            dst += dst_stride;
            SW(out1, dst);
            dst += dst_stride;
        }
    }
}
1270 
/* Byte-average an 8-byte-wide source block into dst: dst = avg(src, dst).
 *
 * Processes four rows per iteration (height is assumed to be a multiple
 * of 4; any remainder rows are skipped).  Only the low 8 bytes of each
 * averaged vector row are written.
 */
static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
                           int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;

    for (cnt = (height / 4); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                    dst0, dst1, dst2, dst3);

        /* Store only double-word element 0 (8 pixels) of each row. */
        out0 = __msa_copy_u_d((v2i64) dst0, 0);
        out1 = __msa_copy_u_d((v2i64) dst1, 0);
        out2 = __msa_copy_u_d((v2i64) dst2, 0);
        out3 = __msa_copy_u_d((v2i64) dst3, 0);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
1296 
/* Byte-average a 16-byte-wide source block into dst: dst = avg(src, dst).
 *
 * Processes eight full 16-byte rows per iteration (height is assumed to
 * be a multiple of 8; any remainder rows are skipped).
 */
static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for (cnt = (height / 8); cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                    dst0, dst1, dst2, dst3);
        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
                    dst4, dst5, dst6, dst7);
        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
1318 
/* 16-wide "put" entry points: overwrite block with the (interpolated)
 * source.  Each simply delegates to the matching static MSA kernel,
 * swapping the (block, pixels) argument order into (src, dst). */

/* Plain copy, no interpolation. */
void ff_put_pixels16_msa(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h)
{
    copy_width16_msa(pixels, line_size, block, line_size, h);
}

/* Horizontal bilinear kernel (x2 case). */
void ff_put_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hz_bil_16w_msa(pixels, line_size, block, line_size, h);
}

/* Vertical bilinear kernel (y2 case). */
void ff_put_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_vt_bil_16w_msa(pixels, line_size, block, line_size, h);
}

/* Combined horizontal+vertical bilinear kernel (xy2 case). */
void ff_put_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    common_hv_bil_16w_msa(pixels, line_size, block, line_size, h);
}
1342 
/* 8-wide "put" entry points, delegating to the 8-column MSA kernels. */

/* Plain copy, no interpolation. */
void ff_put_pixels8_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h)
{
    copy_width8_msa(pixels, line_size, block, line_size, h);
}

/* Horizontal bilinear kernel (x2 case). */
void ff_put_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_8w_msa(pixels, line_size, block, line_size, h);
}

/* Vertical bilinear kernel (y2 case). */
void ff_put_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_8w_msa(pixels, line_size, block, line_size, h);
}

/* Combined horizontal+vertical bilinear kernel (xy2 case). */
void ff_put_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_8w_msa(pixels, line_size, block, line_size, h);
}
1366 
/* 4-wide "put" entry points, delegating to the 4-column MSA kernels. */

/* Horizontal bilinear kernel (x2 case). */
void ff_put_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_4w_msa(pixels, line_size, block, line_size, h);
}

/* Vertical bilinear kernel (y2 case). */
void ff_put_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_4w_msa(pixels, line_size, block, line_size, h);
}

/* Combined horizontal+vertical bilinear kernel (xy2 case). */
void ff_put_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_4w_msa(pixels, line_size, block, line_size, h);
}
1384 
/* 16-wide "no rounding" put entry points.  Only the two heights used by
 * callers (16 and 8) are implemented; any other h is a no-op. */

/* Horizontal bilinear, no-rounding variant. */
void ff_put_no_rnd_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    if (h == 16) {
        common_hz_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
    } else if (h == 8) {
        common_hz_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
    }
}

/* Vertical bilinear, no-rounding variant. */
void ff_put_no_rnd_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    if (h == 16) {
        common_vt_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
    } else if (h == 8) {
        common_vt_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
    }
}
1404 
1406  const uint8_t *pixels,
1407  ptrdiff_t line_size, int h)
1408 {
1409  if (h == 16) {
1410  common_hv_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
1411  } else if (h == 8) {
1412  common_hv_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
1413  }
1414 }
1415 
/* 8-wide "no rounding" put entry points.  Only h == 8 and h == 4 are
 * handled; any other h is a no-op. */

/* Horizontal bilinear, no-rounding variant. */
void ff_put_no_rnd_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                                  ptrdiff_t line_size, int h)
{
    if (h == 8) {
        common_hz_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_hz_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
    }
}

/* Vertical bilinear, no-rounding variant. */
void ff_put_no_rnd_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                                  ptrdiff_t line_size, int h)
{
    if (h == 8) {
        common_vt_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_vt_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
    }
}

/* Combined horizontal+vertical bilinear, no-rounding variant. */
void ff_put_no_rnd_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    if (h == 8) {
        common_hv_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_hv_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
    }
}
1445 
/* 16-wide "avg" entry points: the interpolated source is byte-averaged
 * with the existing block contents (the *_and_aver_dst_* kernels). */

/* Average source into block, no interpolation. */
void ff_avg_pixels16_msa(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h)
{
    avg_width16_msa(pixels, line_size, block, line_size, h);
}

/* Horizontal bilinear, averaged into block. */
void ff_avg_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hz_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
}

/* Vertical bilinear, averaged into block. */
void ff_avg_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_vt_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
}

/* Horizontal+vertical bilinear, averaged into block. */
void ff_avg_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    common_hv_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
}
1469 
/* 8-wide "avg" entry points, delegating to the 8-column averaging
 * kernels. */

/* Average source into block, no interpolation. */
void ff_avg_pixels8_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h)
{
    avg_width8_msa(pixels, line_size, block, line_size, h);
}

/* Horizontal bilinear, averaged into block. */
void ff_avg_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
}

/* Vertical bilinear, averaged into block. */
void ff_avg_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
}

/* Horizontal+vertical bilinear, averaged into block. */
void ff_avg_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
}
1493 
/* 4-wide "avg" entry points, delegating to the 4-column averaging
 * kernels. */

/* Average source into block, no interpolation. */
void ff_avg_pixels4_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h)
{
    avg_width4_msa(pixels, line_size, block, line_size, h);
}

/* Horizontal bilinear, averaged into block. */
void ff_avg_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
}

/* Vertical bilinear, averaged into block. */
void ff_avg_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
}

/* Horizontal+vertical bilinear, averaged into block. */
void ff_avg_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
}
common_vt_bil_no_rnd_8x16_msa
static void common_vt_bil_no_rnd_8x16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
Definition: hpeldsp_msa.c:423
LD_SB4
#define LD_SB4(...)
Definition: generic_macros_msa.h:297
ff_put_pixels8_xy2_msa
void ff_put_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1361
LD_UB8
#define LD_UB8(...)
Definition: generic_macros_msa.h:335
copy_width16_msa
static void copy_width16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hpeldsp_msa.c:1192
common_hz_bil_no_rnd_8x16_msa
static void common_hz_bil_no_rnd_8x16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
Definition: hpeldsp_msa.c:195
src1
const pixel * src1
Definition: h264pred_template.c:421
ff_avg_pixels8_x2_msa
void ff_avg_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1476
ff_put_pixels16_xy2_msa
void ff_put_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1337
ILVR_B3_UB
#define ILVR_B3_UB(...)
Definition: generic_macros_msa.h:1348
AVER_UB2_UB
#define AVER_UB2_UB(...)
Definition: generic_macros_msa.h:595
common_hz_bil_4w_msa
static void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t height)
Definition: hpeldsp_msa.c:55
common_hv_bil_no_rnd_16x16_msa
static void common_hv_bil_no_rnd_16x16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
Definition: hpeldsp_msa.c:743
ILVR_B4_UH
#define ILVR_B4_UH(...)
Definition: generic_macros_msa.h:1361
copy_16multx8mult_msa
static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t width)
Definition: hpeldsp_msa.c:1164
common_hv_bil_4w_msa
static void common_hv_bil_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t height)
Definition: hpeldsp_msa.c:529
ILVR_B2_UH
#define ILVR_B2_UH(...)
Definition: generic_macros_msa.h:1339
ff_avg_pixels16_xy2_msa
void ff_avg_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1464
common_hz_bil_16w_msa
static void common_hz_bil_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t height)
Definition: hpeldsp_msa.c:100
ff_put_pixels16_y2_msa
void ff_put_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1331
AVER_ST16x4_UB
#define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)
Definition: generic_macros_msa.h:2626
ff_put_no_rnd_pixels8_x2_msa
void ff_put_no_rnd_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1416
LD_UB5
#define LD_UB5(...)
Definition: generic_macros_msa.h:307
avg_width16_msa
static void avg_width16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hpeldsp_msa.c:1297
ST_UB8
#define ST_UB8(...)
Definition: generic_macros_msa.h:391
AVER_UB4_UB
#define AVER_UB4_UB(...)
Definition: generic_macros_msa.h:603
ST_UB4
#define ST_UB4(...)
Definition: generic_macros_msa.h:374
SRARI_H4_UH
#define SRARI_H4_UH(...)
Definition: generic_macros_msa.h:2066
ADD4
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
Definition: generic_macros_msa.h:2123
ff_put_pixels4_y2_msa
void ff_put_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1373
common_hv_bil_8w_msa
static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t height)
Definition: hpeldsp_msa.c:566
ff_avg_pixels4_x2_msa
void ff_avg_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1500
common_vt_bil_and_aver_dst_4w_msa
static void common_vt_bil_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t height)
Definition: hpeldsp_msa.c:440
ADD2
#define ADD2(in0, in1, in2, in3, out0, out1)
Definition: generic_macros_msa.h:2118
HADD_UB2_UH
#define HADD_UB2_UH(...)
Definition: generic_macros_msa.h:1067
generic_macros_msa.h
ff_put_no_rnd_pixels8_xy2_msa
void ff_put_no_rnd_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1436
ff_avg_pixels4_y2_msa
void ff_avg_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1506
common_vt_bil_16w_msa
static void common_vt_bil_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t height)
Definition: hpeldsp_msa.c:344
LD_SB
#define LD_SB(...)
Definition: generic_macros_msa.h:33
common_hz_bil_and_aver_dst_4w_msa
static void common_hz_bil_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t height)
Definition: hpeldsp_msa.c:213
ff_put_pixels16_x2_msa
void ff_put_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1325
SLDI_B3_SB
#define SLDI_B3_SB(...)
Definition: generic_macros_msa.h:634
LD_UB
#define LD_UB(...)
Definition: generic_macros_msa.h:32
common_vt_bil_no_rnd_4x8_msa
static void common_vt_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
Definition: hpeldsp_msa.c:386
avg_width8_msa
static void avg_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hpeldsp_msa.c:1271
SW
#define SW(val, pdst)
Definition: generic_macros_msa.h:167
common_hv_bil_and_aver_dst_16w_msa
static void common_hv_bil_and_aver_dst_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t height)
Definition: hpeldsp_msa.c:1006
PCKEV_AVG_ST8x4_UB
#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, pdst, stride)
Definition: hpeldsp_msa.c:43
width
#define width
SLDI_B4_UB
#define SLDI_B4_UB(...)
Definition: generic_macros_msa.h:643
ff_avg_pixels16_y2_msa
void ff_avg_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1458
SRA_4V
#define SRA_4V(in0, in1, in2, in3, shift)
Definition: generic_macros_msa.h:1939
common_vt_bil_and_aver_dst_16w_msa
static void common_vt_bil_and_aver_dst_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t height)
Definition: hpeldsp_msa.c:496
common_hv_bil_and_aver_dst_8w_msa
static void common_hv_bil_and_aver_dst_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t height)
Definition: hpeldsp_msa.c:966
ff_avg_pixels8_msa
void ff_avg_pixels8_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1470
common_vt_bil_no_rnd_8x8_msa
static void common_vt_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
Definition: hpeldsp_msa.c:369
ff_put_pixels8_x2_msa
void ff_put_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1349
ff_put_pixels4_xy2_msa
void ff_put_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1379
common_vt_bil_and_aver_dst_8w_msa
static void common_vt_bil_and_aver_dst_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t height)
Definition: hpeldsp_msa.c:474
PCKEV_B2_UB
#define PCKEV_B2_UB(...)
Definition: generic_macros_msa.h:1720
ILVRL_B2_UB
#define ILVRL_B2_UB(...)
Definition: generic_macros_msa.h:1495
ff_put_no_rnd_pixels16_xy2_msa
void ff_put_no_rnd_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1405
SLDI_B3_UB
#define SLDI_B3_UB(...)
Definition: generic_macros_msa.h:633
SLDI_B2_SB
#define SLDI_B2_SB(...)
Definition: generic_macros_msa.h:623
ff_put_no_rnd_pixels8_y2_msa
void ff_put_no_rnd_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1426
AVE_ST8x4_UB
#define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)
Definition: generic_macros_msa.h:2540
ff_avg_pixels16_x2_msa
void ff_avg_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1452
ff_put_pixels16_msa
void ff_put_pixels16_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1319
ILVRL_B2_UH
#define ILVRL_B2_UH(...)
Definition: generic_macros_msa.h:1497
LW
#define LW(psrc)
Definition: generic_macros_msa.h:104
AVER_DST_ST16x4_UB
#define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)
Definition: generic_macros_msa.h:2678
hpeldsp_mips.h
SD4
#define SD4(in0, in1, in2, in3, pdst, stride)
Definition: generic_macros_msa.h:256
LD_UB4
#define LD_UB4(...)
Definition: generic_macros_msa.h:296
ff_avg_pixels4_msa
void ff_avg_pixels4_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1494
common_hz_bil_8w_msa
static void common_hz_bil_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t height)
Definition: hpeldsp_msa.c:80
HADD_UB3_UH
#define HADD_UB3_UH(...)
Definition: generic_macros_msa.h:1074
height
#define height
ff_avg_pixels8_y2_msa
void ff_avg_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1482
common_hv_bil_16w_msa
static void common_hv_bil_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t height)
Definition: hpeldsp_msa.c:603
ff_put_no_rnd_pixels16_y2_msa
void ff_put_no_rnd_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1395
SW4
#define SW4(in0, in1, in2, in3, pdst, stride)
Definition: generic_macros_msa.h:241
ff_avg_pixels8_xy2_msa
void ff_avg_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1488
ff_put_pixels8_y2_msa
void ff_put_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1355
common_hv_bil_no_rnd_4x8_msa
static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
Definition: hpeldsp_msa.c:709
common_hz_bil_no_rnd_4x8_msa
static void common_hz_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
Definition: hpeldsp_msa.c:147
SRARI_H2_UH
#define SRARI_H2_UH(...)
Definition: generic_macros_msa.h:2058
src2
const pixel * src2
Definition: h264pred_template.c:422
ff_put_pixels4_x2_msa
void ff_put_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1367
ff_put_pixels8_msa
void ff_put_pixels8_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1343
avg_width4_msa
static void avg_width4_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hpeldsp_msa.c:1226
common_vt_bil_4w_msa
static void common_vt_bil_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t height)
Definition: hpeldsp_msa.c:294
common_hv_bil_no_rnd_8x8_msa
static void common_hv_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
Definition: hpeldsp_msa.c:661
AVER_DST_ST8x4_UB
#define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)
Definition: generic_macros_msa.h:2650
copy_width8_msa
static void copy_width8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
Definition: hpeldsp_msa.c:1078
ILVR_B2_UB
#define ILVR_B2_UB(...)
Definition: generic_macros_msa.h:1337
LD_UB2
#define LD_UB2(...)
Definition: generic_macros_msa.h:277
common_hz_bil_no_rnd_8x8_msa
static void common_hz_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
Definition: hpeldsp_msa.c:124
ff_avg_pixels16_msa
void ff_avg_pixels16_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1446
SLDI_B4_SB
#define SLDI_B4_SB(...)
Definition: generic_macros_msa.h:644
common_hv_bil_and_aver_dst_4w_msa
static void common_hv_bil_and_aver_dst_4w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t height)
Definition: hpeldsp_msa.c:924
ST_D4
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
Definition: generic_macros_msa.h:499
LD_SB8
#define LD_SB8(...)
Definition: generic_macros_msa.h:336
src0
const pixel *const src0
Definition: h264pred_template.c:420
AVE_ST16x4_UB
#define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)
Definition: generic_macros_msa.h:2571
common_vt_bil_no_rnd_16x16_msa
static void common_vt_bil_no_rnd_16x16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
Definition: hpeldsp_msa.c:396
common_hz_bil_no_rnd_16x16_msa
static void common_hz_bil_no_rnd_16x16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
Definition: hpeldsp_msa.c:160
common_hv_bil_no_rnd_8x16_msa
static void common_hv_bil_no_rnd_8x16_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
Definition: hpeldsp_msa.c:860
ILVR_B3_UH
#define ILVR_B3_UH(...)
Definition: generic_macros_msa.h:1350
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
PCKEV_B2_SB
#define PCKEV_B2_SB(...)
Definition: generic_macros_msa.h:1719
int32_t
int32_t
Definition: audioconvert.c:56
SLDI_B2_UB
#define SLDI_B2_UB(...)
Definition: generic_macros_msa.h:622
AVER_ST8x4_UB
#define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)
Definition: generic_macros_msa.h:2597
ff_avg_pixels4_xy2_msa
void ff_avg_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1512
block
The exact code depends on how similar the blocks are and how related they are to the block
Definition: filter_design.txt:207
h
h
Definition: vp9dsp_template.c:2038
common_hz_bil_and_aver_dst_8w_msa
static void common_hz_bil_and_aver_dst_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t height)
Definition: hpeldsp_msa.c:248
PCKEV_AVG_ST_UB
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)
Definition: hpeldsp_msa.c:24
common_hz_bil_and_aver_dst_16w_msa
static void common_hz_bil_and_aver_dst_16w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t height)
Definition: hpeldsp_msa.c:270
common_vt_bil_8w_msa
static void common_vt_bil_8w_msa(const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, uint8_t height)
Definition: hpeldsp_msa.c:322
SD
#define SD
Definition: ccaption_dec.c:924
ff_put_no_rnd_pixels16_x2_msa
void ff_put_no_rnd_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
Definition: hpeldsp_msa.c:1385
LD_SB2
#define LD_SB2(...)
Definition: generic_macros_msa.h:278
PCKEV_ST_SB4
#define PCKEV_ST_SB4(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)
Definition: hpeldsp_msa.c:33