FFmpeg
h264pred_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 - 2017 Shivraj Patil (Shivraj.Patil@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
22 #include "h264dsp_mips.h"
23 
/**
 * Vertical prediction for an 8x8 block: one 8-byte row is read from src
 * and written to eight consecutive rows of dst.
 *
 * @param src        first of the 8 bytes to replicate
 * @param dst        top-left byte of the destination block
 * @param dst_stride distance in bytes between destination rows
 */
static void intra_predict_vert_8x8_msa(uint8_t *src, uint8_t *dst,
                                       int32_t dst_stride)
{
    uint64_t top_row = LD(src);
    uint8_t *lower_half = dst + (4 * dst_stride);

    /* Rows 0-3 first, then rows 4-7. */
    SD4(top_row, top_row, top_row, top_row, dst, dst_stride);
    SD4(top_row, top_row, top_row, top_row, lower_half, dst_stride);
}
33 
34 static void intra_predict_vert_16x16_msa(uint8_t *src, uint8_t *dst,
35  int32_t dst_stride)
36 {
37  v16u8 out = LD_UB(src);
38 
39  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
40  dst += (8 * dst_stride);
41  ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride);
42 }
43 
/**
 * Horizontal prediction for an 8x8 block: for each of the eight rows, the
 * byte at src[row * src_stride] is replicated across the 8 bytes of the
 * matching destination row.
 *
 * @param src        first of the eight bytes to replicate (one per row)
 * @param src_stride distance in bytes between the source bytes
 * @param dst        top-left byte of the destination block
 * @param dst_stride distance in bytes between destination rows
 */
static void intra_predict_horiz_8x8_msa(uint8_t *src, int32_t src_stride,
                                        uint8_t *dst, int32_t dst_stride)
{
    /*
     * The splat constant must be unsigned: an unsuffixed 0x0101010101010101
     * has a signed 64-bit type, and multiplying it by a pixel value >= 128
     * overflows that type, which is undefined behaviour.
     */
    const uint64_t splat = 0x0101010101010101ULL;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    out0 = src[0 * src_stride] * splat;
    out1 = src[1 * src_stride] * splat;
    out2 = src[2 * src_stride] * splat;
    out3 = src[3 * src_stride] * splat;
    out4 = src[4 * src_stride] * splat;
    out5 = src[5 * src_stride] * splat;
    out6 = src[6 * src_stride] * splat;
    out7 = src[7 * src_stride] * splat;

    SD4(out0, out1, out2, out3, dst, dst_stride);
    dst += (4 * dst_stride);
    SD4(out4, out5, out6, out7, dst, dst_stride);
}
62 
63 static void intra_predict_horiz_16x16_msa(uint8_t *src, int32_t src_stride,
64  uint8_t *dst, int32_t dst_stride)
65 {
66  uint8_t inp0, inp1, inp2, inp3;
67  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
68  v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
69 
70  inp0 = src[0 * src_stride];
71  inp1 = src[1 * src_stride];
72  inp2 = src[2 * src_stride];
73  inp3 = src[3 * src_stride];
74  src0 = (v16u8) __msa_fill_b(inp0);
75  src1 = (v16u8) __msa_fill_b(inp1);
76  src2 = (v16u8) __msa_fill_b(inp2);
77  src3 = (v16u8) __msa_fill_b(inp3);
78  inp0 = src[4 * src_stride];
79  inp1 = src[5 * src_stride];
80  inp2 = src[6 * src_stride];
81  inp3 = src[7 * src_stride];
82  src4 = (v16u8) __msa_fill_b(inp0);
83  src5 = (v16u8) __msa_fill_b(inp1);
84  src6 = (v16u8) __msa_fill_b(inp2);
85  src7 = (v16u8) __msa_fill_b(inp3);
86  inp0 = src[ 8 * src_stride];
87  inp1 = src[ 9 * src_stride];
88  inp2 = src[10 * src_stride];
89  inp3 = src[11 * src_stride];
90  src8 = (v16u8) __msa_fill_b(inp0);
91  src9 = (v16u8) __msa_fill_b(inp1);
92  src10 = (v16u8) __msa_fill_b(inp2);
93  src11 = (v16u8) __msa_fill_b(inp3);
94  inp0 = src[12 * src_stride];
95  inp1 = src[13 * src_stride];
96  inp2 = src[14 * src_stride];
97  inp3 = src[15 * src_stride];
98  src12 = (v16u8) __msa_fill_b(inp0);
99  src13 = (v16u8) __msa_fill_b(inp1);
100  src14 = (v16u8) __msa_fill_b(inp2);
101  src15 = (v16u8) __msa_fill_b(inp3);
102 
103  ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
104  dst += (8 * dst_stride);
105  ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15,
106  dst, dst_stride);
107 }
108 
/*
 * Generates intra_predict_<val>dc_8x8_msa(dst, dst_stride), which fills an
 * 8x8 block with the constant byte 'val': the value is broadcast into a
 * vector, its low 64 bits are extracted and stored to eight rows.
 */
#define INTRA_PREDICT_VALDC_8X8_MSA(val)                                       \
static void intra_predict_##val##dc_8x8_msa(uint8_t *dst, int32_t dst_stride)  \
{                                                                              \
    v16i8 splat = __msa_fill_b(val);                                           \
    uint64_t row = __msa_copy_u_d((v2i64) splat, 0);                           \
    uint8_t *lower_half = dst + (4 * dst_stride);                              \
                                                                               \
    SD4(row, row, row, row, dst, dst_stride);                                  \
    SD4(row, row, row, row, lower_half, dst_stride);                           \
}
119 
122 
/*
 * Generates intra_predict_<val>dc_16x16_msa(dst, dst_stride), which fills a
 * 16x16 block with the constant byte 'val' by storing a broadcast vector
 * to sixteen rows.
 */
#define INTRA_PREDICT_VALDC_16X16_MSA(val)                                     \
static void intra_predict_##val##dc_16x16_msa(uint8_t *dst,                    \
                                              int32_t dst_stride)              \
{                                                                              \
    v16u8 row = (v16u8) __msa_fill_b(val);                                     \
    uint8_t *lower_half = dst + (8 * dst_stride);                              \
                                                                               \
    ST_UB8(row, row, row, row, row, row, row, row, dst, dst_stride);           \
    ST_UB8(row, row, row, row, row, row, row, row, lower_half, dst_stride);    \
}
133 
136 
/*
 * Body of intra_predict_plane_8x8_msa(uint8_t *src, int32_t stride); the
 * signature line is absent from this listing and is taken from the symbol
 * index at the end of the page.
 *
 * Derives one gradient from the row above the block and one from the
 * column to its left, then overwrites the 8x8 block at src with a clipped
 * linear ramp built from those two gradients.
 */
138 {
139  uint8_t lpcnt;
140  int32_t res, res0, res1, res2, res3;
141  uint64_t out0, out1;
 /* Byte pairs (3,5) (2,6) (1,7) (0,8): positions mirrored around index 4. */
142  v16i8 shf_mask = { 3, 5, 2, 6, 1, 7, 0, 8, 3, 5, 2, 6, 1, 7, 0, 8 };
143  v8i16 short_multiplier = { 1, 2, 3, 4, 1, 2, 3, 4 };
144  v4i32 int_multiplier = { 0, 1, 2, 3 };
145  v16u8 src_top;
146  v8i16 vec9, vec10, vec11;
147  v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8;
148  v2i64 sum;
149
 /* Load starts one row up and one column left of the block. */
150  src_top = LD_UB(src - (stride + 1));
151  src_top = (v16u8) __msa_vshf_b(shf_mask, (v16i8) src_top, (v16i8) src_top);
152
 /*
  * Pairwise differences of the shuffled bytes, weighted 1..4 and reduced
  * to a single sum (presumably later-minus-earlier sample, matching the
  * scalar expression for res1 below -- confirm against the MSA reference).
  */
153  vec9 = __msa_hsub_u_h(src_top, src_top);
154  vec9 *= short_multiplier;
155  vec8 = __msa_hadd_s_w(vec9, vec9);
156  sum = __msa_hadd_s_d(vec8, vec8);
157
158  res0 = __msa_copy_s_w((v4i32) sum, 0);
159
 /* The same weighted differences, taken down the left neighbour column. */
160  res1 = (src[4 * stride - 1] - src[2 * stride - 1]) +
161  2 * (src[5 * stride - 1] - src[stride - 1]) +
162  3 * (src[6 * stride - 1] - src[-1]) +
163  4 * (src[7 * stride - 1] - src[-stride - 1]);
164
 /* Scale both gradients: (17 * g + 16) >> 5. */
165  res0 *= 17;
166  res1 *= 17;
167  res0 = (res0 + 16) >> 5;
168  res1 = (res1 + 16) >> 5;
169
 /*
  * Starting value for the top-left sample: 16 * (left neighbour of row 7 +
  * top neighbour of column 7 + 1) minus three times both gradients.
  */
170  res3 = 3 * (res0 + res1);
171  res2 = 16 * (src[7 * stride - 1] + src[-stride + 7] + 1);
172  res = res2 - res3;
173
174  vec8 = __msa_fill_w(res0);
175  vec4 = __msa_fill_w(res);
176  vec2 = __msa_fill_w(res1);
 /* vec5: per-column offsets for columns 0-3; vec3: step to columns 4-7. */
177  vec5 = vec8 * int_multiplier;
178  vec3 = vec8 * 4;
179
 /* Four iterations, each producing and storing two rows. */
180  for (lpcnt = 4; lpcnt--;) {
181  vec0 = vec5;
182  vec0 += vec4;
183  vec1 = vec0 + vec3;
184  vec6 = vec5;
 /* Advance the row base by the vertical gradient for the second row. */
185  vec4 += vec2;
186  vec6 += vec4;
187  vec7 = vec6 + vec3;
188
 /* >> 5, narrow to 16 bit, clip to 0..255, narrow to bytes. */
189  SRA_4V(vec0, vec1, vec6, vec7, 5);
190  PCKEV_H2_SH(vec1, vec0, vec7, vec6, vec10, vec11);
191  CLIP_SH2_0_255(vec10, vec11);
192  PCKEV_B2_SH(vec10, vec10, vec11, vec11, vec10, vec11);
193
194  out0 = __msa_copy_s_d((v2i64) vec10, 0);
195  out1 = __msa_copy_s_d((v2i64) vec11, 0);
196  SD(out0, src);
197  src += stride;
198  SD(out1, src);
199  src += stride;
200
 /* Row base for the first row of the next iteration. */
201  vec4 += vec2;
202  }
203 }
204 
/*
 * Body of intra_predict_plane_16x16_msa(uint8_t *src, int32_t stride); the
 * signature line is absent from this listing and is taken from the symbol
 * index at the end of the page.
 *
 * 16x16 counterpart of the 8x8 plane predictor: derives one gradient from
 * the row above and one from the column to the left, then overwrites the
 * 16x16 block at src with a clipped linear ramp.
 */
206 {
207  uint8_t lpcnt;
208  int32_t res0, res1, res2, res3;
209  uint64_t load0, load1;
 /* Byte pairs (7,8) (6,9) ... (0,15): positions mirrored around the middle. */
210  v16i8 shf_mask = { 7, 8, 6, 9, 5, 10, 4, 11, 3, 12, 2, 13, 1, 14, 0, 15 };
211  v8i16 short_multiplier = { 1, 2, 3, 4, 5, 6, 7, 8 };
212  v4i32 int_multiplier = { 0, 1, 2, 3 };
213  v16u8 src_top = { 0 };
214  v16u8 store0, store1;
215  v8i16 vec9, vec10, vec11, vec12;
216  v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, res_add;
217  v4i32 reg0, reg1, reg2, reg3;
218
 /*
  * Two 8-byte loads from the row above, starting one column left of the
  * block; the second starts 9 bytes later, so the byte at offset 8 of that
  * row (the column-7 neighbour) is not loaded.
  */
219  load0 = LD(src - (stride + 1));
220  load1 = LD(src - (stride + 1) + 9);
221
222  INSERT_D2_UB(load0, load1, src_top);
223
224  src_top = (v16u8) __msa_vshf_b(shf_mask, (v16i8) src_top, (v16i8) src_top);
225
 /* Pairwise differences weighted 1..8, then reduced to partial sums. */
226  vec9 = __msa_hsub_u_h(src_top, src_top);
227  vec9 *= short_multiplier;
228  vec8 = __msa_hadd_s_w(vec9, vec9);
229  res_add = (v4i32) __msa_hadd_s_d(vec8, vec8);
230
 /* Add the two partial sums (word elements 0 and 2). */
231  res0 = __msa_copy_s_w(res_add, 0) + __msa_copy_s_w(res_add, 2);
232
 /* The same weighted differences, taken down the left neighbour column. */
233  res1 = (src[8 * stride - 1] - src[6 * stride - 1]) +
234  2 * (src[9 * stride - 1] - src[5 * stride - 1]) +
235  3 * (src[10 * stride - 1] - src[4 * stride - 1]) +
236  4 * (src[11 * stride - 1] - src[3 * stride - 1]) +
237  5 * (src[12 * stride - 1] - src[2 * stride - 1]) +
238  6 * (src[13 * stride - 1] - src[stride - 1]) +
239  7 * (src[14 * stride - 1] - src[-1]) +
240  8 * (src[15 * stride - 1] - src[-1 * stride - 1]);
241
 /* Scale both gradients: (5 * g + 32) >> 6. */
242  res0 *= 5;
243  res1 *= 5;
244  res0 = (res0 + 32) >> 6;
245  res1 = (res1 + 32) >> 6;
246
 /*
  * Starting value for the top-left sample: 16 * (left neighbour of row 15 +
  * top neighbour of column 15 + 1) minus seven times both gradients.
  */
247  res3 = 7 * (res0 + res1);
248  res2 = 16 * (src[15 * stride - 1] + src[-stride + 15] + 1);
249  res2 -= res3;
250
251  vec8 = __msa_fill_w(res0);
252  vec4 = __msa_fill_w(res2);
253  vec5 = __msa_fill_w(res1);
 /* vec7: offsets for columns 0-3; vec6: step between groups of 4 columns. */
254  vec6 = vec8 * 4;
255  vec7 = vec8 * int_multiplier;
256
 /* Eight iterations, each producing and storing two rows (vec*, reg*). */
257  for (lpcnt = 8; lpcnt--;) {
258  vec0 = vec7;
259  reg0 = vec7;
260  vec0 += vec4;
 /* Advance the row base by the vertical gradient for the second row. */
261  vec4 += vec5;
262  reg0 += vec4;
263  vec1 = vec0 + vec6;
264  reg1 = reg0 + vec6;
265  vec2 = vec1 + vec6;
266  reg2 = reg1 + vec6;
267  vec3 = vec2 + vec6;
268  reg3 = reg2 + vec6;
269
 /* >> 5, narrow to 16 bit, clip to 0..255, narrow to bytes, store. */
270  SRA_4V(vec0, vec1, vec2, vec3, 5);
271  SRA_4V(reg0, reg1, reg2, reg3, 5);
272  PCKEV_H2_SH(vec1, vec0, vec3, vec2, vec9, vec10);
273  PCKEV_H2_SH(reg1, reg0, reg3, reg2, vec11, vec12);
274  CLIP_SH2_0_255(vec9, vec10);
275  CLIP_SH2_0_255(vec11, vec12);
276  PCKEV_B2_UB(vec10, vec9, vec12, vec11, store0, store1);
277  ST_UB2(store0, store1, src, stride);
278  src += 2 * stride;
279
 /* Row base for the first row of the next iteration. */
280  vec4 += vec5;
281  }
282 }
283 
/*
 * Body of intra_predict_dc_4blk_8x8_msa(uint8_t *src, int32_t stride); the
 * signature line is absent from this listing and is taken from the symbol
 * index at the end of the page.
 *
 * Fills the 8x8 block at src with four separate DC values, one per 4x4
 * quadrant, computed from the row above and the column to the left.
 */
285 {
286  uint32_t src0, src1, src3, src2;
287  uint32_t out0, out1, out2, out3;
288  uint64_t store0, store1;
289  v16u8 src_top;
290  v8u16 add;
291  v4u32 sum;
292
 /*
  * Two widening pairwise adds over the row above; word elements 0 and 1
  * are presumably the sums of top bytes 0-3 and 4-7 -- confirm against the
  * MSA reference.
  */
293  src_top = LD_UB(src - stride);
294  add = __msa_hadd_u_h((v16u8) src_top, (v16u8) src_top);
295  sum = __msa_hadd_u_w(add, add);
296  src0 = __msa_copy_u_w((v4i32) sum, 0);
297  src1 = __msa_copy_u_w((v4i32) sum, 1);
 /* Add left neighbours of rows 0-3 to the first top sum. */
298  src0 += src[0 * stride - 1];
299  src0 += src[1 * stride - 1];
300  src0 += src[2 * stride - 1];
301  src0 += src[3 * stride - 1];
 /* Sum of the left neighbours of rows 4-7. */
302  src2 = src[4 * stride - 1];
303  src2 += src[5 * stride - 1];
304  src2 += src[6 * stride - 1];
305  src2 += src[7 * stride - 1];
 /*
  * Rounded averages: src0 and src3 average eight samples (>> 3), src1 and
  * src2 average four samples (>> 2). src3 must be computed before src1 and
  * src2 are overwritten.
  */
306  src0 = (src0 + 4) >> 3;
307  src3 = (src1 + src2 + 4) >> 3;
308  src1 = (src1 + 2) >> 2;
309  src2 = (src2 + 2) >> 2;
 /* Replicate each DC value into all four bytes of a 32-bit word. */
310  out0 = src0 * 0x01010101;
311  out1 = src1 * 0x01010101;
312  out2 = src2 * 0x01010101;
313  out3 = src3 * 0x01010101;
 /* One 8-byte row pattern for rows 0-3 and one for rows 4-7. */
314  store0 = ((uint64_t) out1 << 32) | out0;
315  store1 = ((uint64_t) out3 << 32) | out2;
316
317  SD4(store0, store0, store0, store0, src, stride);
318  src += (4 * stride);
319  SD4(store1, store1, store1, store1, src, stride);
320 }
321 
/*
 * Body of intra_predict_hor_dc_8x8_msa(uint8_t *src, int32_t stride); the
 * signature line is absent from this listing and is taken from the symbol
 * index at the end of the page.
 *
 * Fills rows 0-3 with the rounded average of the left neighbours of rows
 * 0-3 and rows 4-7 with the rounded average of the left neighbours of
 * rows 4-7.
 */
323 {
324  uint32_t src0, src1;
325  uint64_t out0, out1;
326
327  src0 = src[0 * stride - 1];
328  src0 += src[1 * stride - 1];
329  src0 += src[2 * stride - 1];
330  src0 += src[3 * stride - 1];
331  src1 = src[4 * stride - 1];
332  src1 += src[5 * stride - 1];
333  src1 += src[6 * stride - 1];
334  src1 += src[7 * stride - 1];
335  src0 = (src0 + 2) >> 2;
336  src1 = (src1 + 2) >> 2;
 /*
  * Replicate each average into all 8 bytes of a row.
  * NOTE(review): the unsuffixed constant has a signed 64-bit type, so the
  * multiplication is signed and overflows for averages >= 128; a ULL
  * suffix on the constant would make it well-defined.
  */
337  out0 = src0 * 0x0101010101010101;
338  out1 = src1 * 0x0101010101010101;
339
340  SD4(out0, out0, out0, out0, src, stride);
341  src += (4 * stride);
342  SD4(out1, out1, out1, out1, src, stride);
343 }
344 
/*
 * Body of intra_predict_vert_dc_8x8_msa(uint8_t *src, int32_t stride); the
 * signature line is absent from this listing and is taken from the symbol
 * index at the end of the page.
 *
 * Builds one 8-byte row pattern from the row above the block and stores it
 * to all eight rows at src.
 */
346 {
347  uint64_t out0;
 /* Selects byte 0 for output bytes 0-3 and byte 4 for output bytes 4-7. */
348  v16i8 mask = { 0, 0, 0, 0, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0 };
349  v16u8 src_top, res0;
350  v8u16 add;
351  v4u32 sum;
352
 /*
  * Two widening pairwise adds over the row above, then a shift by 2
  * (presumably rounding, giving the average of each group of four top
  * bytes -- confirm against the MSA reference).
  */
353  src_top = LD_UB(src - stride);
354  add = __msa_hadd_u_h(src_top, src_top);
355  sum = __msa_hadd_u_w(add, add);
356  sum = (v4u32) __msa_srari_w((v4i32) sum, 2);
357  res0 = (v16u8) __msa_vshf_b(mask, (v16i8) sum, (v16i8) sum);
358  out0 = __msa_copy_u_d((v2i64) res0, 0);
359
360  SD4(out0, out0, out0, out0, src, stride);
361  src += (4 * stride);
362  SD4(out0, out0, out0, out0, src, stride);
363 }
364 
/*
 * Body of intra_predict_mad_cow_dc_l0t_8x8_msa(uint8_t *src, int32_t stride);
 * the signature line is absent from this listing and is taken from the
 * symbol index at the end of the page.
 *
 * DC prediction that reads the row above and only the left neighbours of
 * rows 0-3; the left neighbours of rows 4-7 are never accessed.
 */
366 {
367  uint32_t src0, src1, src2;
368  uint32_t out0, out1, out2;
369  uint64_t store0, store1;
370  v16u8 src_top;
371  v8u16 add;
372  v4u32 sum;
373
 /* Presumably sums of top bytes 0-3 (src0) and 4-7 (src1) -- confirm. */
374  src_top = LD_UB(src - stride);
375  add = __msa_hadd_u_h(src_top, src_top);
376  sum = __msa_hadd_u_w(add, add);
377  src0 = __msa_copy_u_w((v4i32) sum, 0);
378  src1 = __msa_copy_u_w((v4i32) sum, 1);
379
 /* Sum of the left neighbours of rows 0-3. */
380  src2 = src[0 * stride - 1];
381  src2 += src[1 * stride - 1];
382  src2 += src[2 * stride - 1];
383  src2 += src[3 * stride - 1];
 /* src2: eight-sample average; src0, src1: four-sample averages. */
384  src2 = (src0 + src2 + 4) >> 3;
385  src0 = (src0 + 2) >> 2;
386  src1 = (src1 + 2) >> 2;
387  out0 = src0 * 0x01010101;
388  out1 = src1 * 0x01010101;
389  out2 = src2 * 0x01010101;
 /*
  * Both row patterns share out1 in the upper 32 bits; rows 0-3 use out2
  * and rows 4-7 use out0 in the lower 32 bits.
  */
390  store1 = ((uint64_t) out1 << 32);
391  store0 = store1 | ((uint64_t) out2);
392  store1 = store1 | ((uint64_t) out0);
393
394  SD4(store0, store0, store0, store0, src, stride);
395  src += (4 * stride);
396  SD4(store1, store1, store1, store1, src, stride);
397 }
398 
/*
 * Body of intra_predict_mad_cow_dc_0lt_8x8_msa(uint8_t *src, int32_t stride);
 * the signature line is absent from this listing and is taken from the
 * symbol index at the end of the page.
 *
 * DC prediction that reads the row above and only the left neighbours of
 * rows 4-7; the left neighbours of rows 0-3 are never accessed.
 */
400 {
401  uint32_t src0, src1, src2, src3;
402  uint32_t out0, out1, out2, out3;
403  uint64_t store0, store1;
404  v16u8 src_top;
405  v8u16 add;
406  v4u32 sum;
407
 /* Presumably sums of top bytes 0-3 (src0) and 4-7 (src1) -- confirm. */
408  src_top = LD_UB(src - stride);
409  add = __msa_hadd_u_h(src_top, src_top);
410  sum = __msa_hadd_u_w(add, add);
411  src0 = __msa_copy_u_w((v4i32) sum, 0);
412  src1 = __msa_copy_u_w((v4i32) sum, 1);
413
 /* Sum of the left neighbours of rows 4-7. */
414  src2 = src[4 * stride - 1];
415  src2 += src[5 * stride - 1];
416  src2 += src[6 * stride - 1];
417  src2 += src[7 * stride - 1];
 /*
  * src3 is an eight-sample average and must be computed before src1 and
  * src2 are reduced to four-sample averages.
  */
418  src0 = (src0 + 2) >> 2;
419  src3 = (src1 + src2 + 4) >> 3;
420  src1 = (src1 + 2) >> 2;
421  src2 = (src2 + 2) >> 2;
422
423  out0 = src0 * 0x01010101;
424  out1 = src1 * 0x01010101;
425  out2 = src2 * 0x01010101;
426  out3 = src3 * 0x01010101;
 /* One 8-byte row pattern for rows 0-3 and one for rows 4-7. */
427  store0 = ((uint64_t) out1 << 32) | out0;
428  store1 = ((uint64_t) out3 << 32) | out2;
429
430  SD4(store0, store0, store0, store0, src, stride);
431  src += (4 * stride);
432  SD4(store1, store1, store1, store1, src, stride);
433 }
434 
/*
 * Body of intra_predict_mad_cow_dc_l00_8x8_msa(uint8_t *src, int32_t stride);
 * the signature line is absent from this listing and is taken from the
 * symbol index at the end of the page.
 *
 * Fills rows 0-3 with the rounded average of the left neighbours of rows
 * 0-3 and rows 4-7 with the constant byte 0x80.
 */
436 {
437  uint32_t src0;
438  uint64_t out0, out1;
439
440  src0 = src[0 * stride - 1];
441  src0 += src[1 * stride - 1];
442  src0 += src[2 * stride - 1];
443  src0 += src[3 * stride - 1];
444  src0 = (src0 + 2) >> 2;
 /*
  * NOTE(review): the unsuffixed constant has a signed 64-bit type, so the
  * multiplication is signed and overflows for averages >= 128; a ULL
  * suffix on the constant would make it well-defined.
  */
445  out0 = src0 * 0x0101010101010101;
446  out1 = 0x8080808080808080;
447
448  SD4(out0, out0, out0, out0, src, stride);
449  src += (4 * stride);
450  SD4(out1, out1, out1, out1, src, stride);
451 }
452 
/*
 * Body of intra_predict_mad_cow_dc_0l0_8x8_msa(uint8_t *src, int32_t stride);
 * the signature line is absent from this listing and is taken from the
 * symbol index at the end of the page.
 *
 * Fills rows 0-3 with the constant byte 0x80 and rows 4-7 with the rounded
 * average of the left neighbours of rows 4-7.
 */
454 {
455  uint32_t src0;
456  uint64_t out0, out1;
457
458  src0 = src[4 * stride - 1];
459  src0 += src[5 * stride - 1];
460  src0 += src[6 * stride - 1];
461  src0 += src[7 * stride - 1];
462  src0 = (src0 + 2) >> 2;
463
464  out0 = 0x8080808080808080;
 /*
  * NOTE(review): the unsuffixed constant has a signed 64-bit type, so the
  * multiplication is signed and overflows for averages >= 128; a ULL
  * suffix on the constant would make it well-defined.
  */
465  out1 = src0 * 0x0101010101010101;
466
467  SD4(out0, out0, out0, out0, src, stride);
468  src += (4 * stride);
469  SD4(out1, out1, out1, out1, src, stride);
470 }
471 
/*
 * NOTE(review): remnant of ff_h264_intra_predict_plane_8x8_msa(uint8_t *src,
 * ptrdiff_t stride) according to the symbol index (definition at line 472).
 * The signature and the body statement are missing from this listing;
 * presumably the body forwards to intra_predict_plane_8x8_msa -- confirm
 * against the real source file.
 */
473 {
475 }
476 
/*
 * NOTE(review): remnant of ff_h264_intra_predict_dc_4blk_8x8_msa(uint8_t *src,
 * ptrdiff_t stride) according to the symbol index (definition at line 477).
 * The signature and the body statement are missing from this listing;
 * presumably the body forwards to intra_predict_dc_4blk_8x8_msa -- confirm
 * against the real source file.
 */
478 {
480 }
481 
/*
 * NOTE(review): remnant of ff_h264_intra_predict_hor_dc_8x8_msa(uint8_t *src,
 * ptrdiff_t stride) according to the symbol index (definition at line 482).
 * The signature and the body statement are missing from this listing;
 * presumably the body forwards to intra_predict_hor_dc_8x8_msa -- confirm
 * against the real source file.
 */
483 {
485 }
486 
/*
 * NOTE(review): remnant of ff_h264_intra_predict_vert_dc_8x8_msa(uint8_t *src,
 * ptrdiff_t stride) according to the symbol index (definition at line 487).
 * The signature and the body statement are missing from this listing;
 * presumably the body forwards to intra_predict_vert_dc_8x8_msa -- confirm
 * against the real source file.
 */
488 {
490 }
491 
/*
 * NOTE(review): remnant of ff_h264_intra_predict_mad_cow_dc_l0t_8x8_msa(
 * uint8_t *src, ptrdiff_t stride) according to the symbol index (definition
 * at line 492). The first signature line and the body statement are missing
 * from this listing; presumably the body forwards to
 * intra_predict_mad_cow_dc_l0t_8x8_msa -- confirm against the real source.
 */
493  ptrdiff_t stride)
494 {
496 }
497 
/*
 * NOTE(review): remnant of ff_h264_intra_predict_mad_cow_dc_0lt_8x8_msa(
 * uint8_t *src, ptrdiff_t stride) according to the symbol index (definition
 * at line 498). The first signature line and the body statement are missing
 * from this listing; presumably the body forwards to
 * intra_predict_mad_cow_dc_0lt_8x8_msa -- confirm against the real source.
 */
499  ptrdiff_t stride)
500 {
502 }
503 
/*
 * NOTE(review): remnant of ff_h264_intra_predict_mad_cow_dc_l00_8x8_msa(
 * uint8_t *src, ptrdiff_t stride) according to the symbol index (definition
 * at line 504). The first signature line and the body statement are missing
 * from this listing; presumably the body forwards to
 * intra_predict_mad_cow_dc_l00_8x8_msa -- confirm against the real source.
 */
505  ptrdiff_t stride)
506 {
508 }
509 
/*
 * NOTE(review): remnant of ff_h264_intra_predict_mad_cow_dc_0l0_8x8_msa(
 * uint8_t *src, ptrdiff_t stride) according to the symbol index (definition
 * at line 510). The first signature line and the body statement are missing
 * from this listing; presumably the body forwards to
 * intra_predict_mad_cow_dc_0l0_8x8_msa -- confirm against the real source.
 */
511  ptrdiff_t stride)
512 {
514 }
515 
/*
 * NOTE(review): remnant of ff_h264_intra_predict_plane_16x16_msa(uint8_t *src,
 * ptrdiff_t stride) according to the symbol index (definition at line 516).
 * The signature and the body statement are missing from this listing;
 * presumably the body forwards to intra_predict_plane_16x16_msa -- confirm
 * against the real source file.
 */
517 {
519 }
520 
/*
 * Public entry point for 8x8 vertical prediction; the block is predicted
 * in place, so the destination pointer is src itself.
 * NOTE(review): the statement at line 525 is missing from this listing;
 * presumably it calls intra_predict_vert_8x8_msa with the row above src as
 * the source -- confirm against the real source file.
 */
521 void ff_h264_intra_pred_vert_8x8_msa(uint8_t *src, ptrdiff_t stride)
522 {
523  uint8_t *dst = src;
524
526 }
527 
/*
 * Public entry point for 8x8 horizontal prediction; the block is predicted
 * in place, so the destination pointer is src itself.
 * NOTE(review): the statement at line 532 is missing from this listing;
 * presumably it calls intra_predict_horiz_8x8_msa with the column left of
 * src as the source -- confirm against the real source file.
 */
528 void ff_h264_intra_pred_horiz_8x8_msa(uint8_t *src, ptrdiff_t stride)
529 {
530  uint8_t *dst = src;
531
533 }
534 
535 void ff_h264_intra_pred_dc_16x16_msa(uint8_t *src, ptrdiff_t stride)
536 {
537  uint8_t *src_top = src - stride;
538  uint8_t *src_left = src - 1;
539  uint8_t *dst = src;
540  uint32_t addition = 0;
541  v16u8 src_above, out;
542  v8u16 sum_above;
543  v4u32 sum_top;
544  v2u64 sum;
545 
546  src_above = LD_UB(src_top);
547 
548  sum_above = __msa_hadd_u_h(src_above, src_above);
549  sum_top = __msa_hadd_u_w(sum_above, sum_above);
550  sum = __msa_hadd_u_d(sum_top, sum_top);
551  sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum);
552  sum = __msa_hadd_u_d(sum_top, sum_top);
553  addition = __msa_copy_u_w((v4i32) sum, 0);
554  addition += src_left[ 0 * stride];
555  addition += src_left[ 1 * stride];
556  addition += src_left[ 2 * stride];
557  addition += src_left[ 3 * stride];
558  addition += src_left[ 4 * stride];
559  addition += src_left[ 5 * stride];
560  addition += src_left[ 6 * stride];
561  addition += src_left[ 7 * stride];
562  addition += src_left[ 8 * stride];
563  addition += src_left[ 9 * stride];
564  addition += src_left[10 * stride];
565  addition += src_left[11 * stride];
566  addition += src_left[12 * stride];
567  addition += src_left[13 * stride];
568  addition += src_left[14 * stride];
569  addition += src_left[15 * stride];
570  addition = (addition + 16) >> 5;
571  out = (v16u8) __msa_fill_b(addition);
572 
573  ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
574  dst += (8 * stride);
575  ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
576 }
577 
/*
 * NOTE(review): remnant of ff_h264_intra_pred_vert_16x16_msa(uint8_t *src,
 * ptrdiff_t stride) according to the symbol index (definition at line 578).
 * The signature and the statement at line 582 are missing from this
 * listing; presumably the latter calls intra_predict_vert_16x16_msa with
 * the row above src as the source -- confirm against the real source file.
 * The block is predicted in place, so the destination is src itself.
 */
579 {
580  uint8_t *dst = src;
581
583 }
584 
/*
 * NOTE(review): remnant of ff_h264_intra_pred_horiz_16x16_msa(uint8_t *src,
 * ptrdiff_t stride) according to the symbol index (definition at line 585).
 * The signature and the statement at line 589 are missing from this
 * listing; presumably the latter calls intra_predict_horiz_16x16_msa with
 * the column left of src as the source -- confirm against the real source.
 * The block is predicted in place, so the destination is src itself.
 */
586 {
587  uint8_t *dst = src;
588
590 }
591 
/*
 * Body of ff_h264_intra_pred_dc_left_16x16_msa(uint8_t *src, ptrdiff_t
 * stride); the signature line is absent from this listing and is taken
 * from the symbol index at the end of the page.
 *
 * Fills the 16x16 block at src with the rounded average of its sixteen
 * left neighbours.
 */
593 {
594  uint8_t *src_left = src - 1;
595  uint8_t *dst = src;
596  uint32_t addition;
597  v16u8 out;
598
 /* Sum the byte left of each of the sixteen rows. */
599  addition = src_left[ 0 * stride];
600  addition += src_left[ 1 * stride];
601  addition += src_left[ 2 * stride];
602  addition += src_left[ 3 * stride];
603  addition += src_left[ 4 * stride];
604  addition += src_left[ 5 * stride];
605  addition += src_left[ 6 * stride];
606  addition += src_left[ 7 * stride];
607  addition += src_left[ 8 * stride];
608  addition += src_left[ 9 * stride];
609  addition += src_left[10 * stride];
610  addition += src_left[11 * stride];
611  addition += src_left[12 * stride];
612  addition += src_left[13 * stride];
613  addition += src_left[14 * stride];
614  addition += src_left[15 * stride];
615
 /* Rounded division by 16, then broadcast to a full vector row. */
616  addition = (addition + 8) >> 4;
617  out = (v16u8) __msa_fill_b(addition);
618
619  ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
620  dst += (8 * stride);
621  ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
622 }
623 
/*
 * Body of ff_h264_intra_pred_dc_top_16x16_msa(uint8_t *src, ptrdiff_t
 * stride); the signature line is absent from this listing and is taken
 * from the symbol index at the end of the page.
 *
 * Fills the 16x16 block at src with a single byte derived from the row
 * above the block.
 */
625 {
626  uint8_t *src_top = src - stride;
627  uint8_t *dst = src;
628  v16u8 src_above, out;
629  v8u16 sum_above;
630  v4u32 sum_top;
631  v2u64 sum;
632
633  src_above = LD_UB(src_top);
634
 /*
  * Widening pairwise adds reduce the row above to one sum; the shift by 4
  * is presumably a rounding divide by 16 -- confirm against the MSA
  * reference. Byte 0 of the result is then replicated to every lane.
  */
635  sum_above = __msa_hadd_u_h(src_above, src_above);
636  sum_top = __msa_hadd_u_w(sum_above, sum_above);
637  sum = __msa_hadd_u_d(sum_top, sum_top);
638  sum_top = (v4u32) __msa_pckev_w((v4i32) sum, (v4i32) sum);
639  sum = __msa_hadd_u_d(sum_top, sum_top);
640  sum = (v2u64) __msa_srari_d((v2i64) sum, 4);
641  out = (v16u8) __msa_splati_b((v16i8) sum, 0);
642
643  ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
644  dst += (8 * stride);
645  ST_UB8(out, out, out, out, out, out, out, out, dst, stride);
646 }
647 
/*
 * Body of ff_h264_intra_pred_dc_128_8x8_msa(uint8_t *src, ptrdiff_t stride);
 * the signature line is absent from this listing and is taken from the
 * symbol index at the end of the page.
 *
 * Fills the 8x8 block at src with the constant byte 128; no neighbouring
 * samples are read.
 */
649 {
650  uint64_t out;
651  v16u8 store;
652
 /* Broadcast 128 to a vector and take its low 64 bits as one row. */
653  store = (v16u8) __msa_fill_b(128);
654  out = __msa_copy_u_d((v2i64) store, 0);
655
656  SD4(out, out, out, out, src, stride);
657  src += (4 * stride);
658  SD4(out, out, out, out, src, stride);
659 }
660 
/*
 * Body of ff_h264_intra_pred_dc_128_16x16_msa(uint8_t *src, ptrdiff_t
 * stride); the signature line is absent from this listing and is taken
 * from the symbol index at the end of the page.
 *
 * Fills the 16x16 block at src with the constant byte 128; no neighbouring
 * samples are read.
 */
662 {
663  v16u8 out;
664
665  out = (v16u8) __msa_fill_b(128);
666
667  ST_UB8(out, out, out, out, out, out, out, out, src, stride);
668  src += (8 * stride);
669  ST_UB8(out, out, out, out, out, out, out, out, src, stride);
670 }
671 
/*
 * Public entry point: forwards to intra_predict_127dc_8x8_msa, the name
 * produced by INTRA_PREDICT_VALDC_8X8_MSA for val = 127, which fills the
 * 8x8 block at src with that constant byte.
 */
672 void ff_vp8_pred8x8_127_dc_8_msa(uint8_t *src, ptrdiff_t stride)
673 {
674  intra_predict_127dc_8x8_msa(src, stride);
675 }
676 
/*
 * Public entry point: forwards to intra_predict_129dc_8x8_msa, the name
 * produced by INTRA_PREDICT_VALDC_8X8_MSA for val = 129, which fills the
 * 8x8 block at src with that constant byte.
 */
677 void ff_vp8_pred8x8_129_dc_8_msa(uint8_t *src, ptrdiff_t stride)
678 {
679  intra_predict_129dc_8x8_msa(src, stride);
680 }
681 
/*
 * Public entry point: forwards to intra_predict_127dc_16x16_msa, the name
 * produced by INTRA_PREDICT_VALDC_16X16_MSA for val = 127, which fills the
 * 16x16 block at src with that constant byte.
 */
682 void ff_vp8_pred16x16_127_dc_8_msa(uint8_t *src, ptrdiff_t stride)
683 {
684  intra_predict_127dc_16x16_msa(src, stride);
685 }
686 
/*
 * Public entry point: forwards to intra_predict_129dc_16x16_msa, the name
 * produced by INTRA_PREDICT_VALDC_16X16_MSA for val = 129, which fills the
 * 16x16 block at src with that constant byte.
 */
687 void ff_vp8_pred16x16_129_dc_8_msa(uint8_t *src, ptrdiff_t stride)
688 {
689  intra_predict_129dc_16x16_msa(src, stride);
690 }
ff_h264_intra_pred_horiz_16x16_msa
void ff_h264_intra_pred_horiz_16x16_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:585
intra_predict_plane_8x8_msa
static void intra_predict_plane_8x8_msa(uint8_t *src, int32_t stride)
Definition: h264pred_msa.c:137
out
FILE * out
Definition: movenc.c:54
ST_UB2
#define ST_UB2(...)
Definition: generic_macros_msa.h:363
src1
const pixel * src1
Definition: h264pred_template.c:421
intra_predict_vert_dc_8x8_msa
static void intra_predict_vert_dc_8x8_msa(uint8_t *src, int32_t stride)
Definition: h264pred_msa.c:345
ff_h264_intra_pred_dc_16x16_msa
void ff_h264_intra_pred_dc_16x16_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:535
ff_h264_intra_pred_dc_128_16x16_msa
void ff_h264_intra_pred_dc_128_16x16_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:661
INTRA_PREDICT_VALDC_16X16_MSA
#define INTRA_PREDICT_VALDC_16X16_MSA(val)
Definition: h264pred_msa.c:123
ff_h264_intra_predict_mad_cow_dc_0l0_8x8_msa
void ff_h264_intra_predict_mad_cow_dc_0l0_8x8_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:510
ff_h264_intra_pred_vert_8x8_msa
void ff_h264_intra_pred_vert_8x8_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:521
ST_UB8
#define ST_UB8(...)
Definition: generic_macros_msa.h:391
ff_h264_intra_pred_dc_left_16x16_msa
void ff_h264_intra_pred_dc_left_16x16_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:592
generic_macros_msa.h
INTRA_PREDICT_VALDC_8X8_MSA
#define INTRA_PREDICT_VALDC_8X8_MSA(val)
Definition: h264pred_msa.c:109
intra_predict_vert_8x8_msa
static void intra_predict_vert_8x8_msa(uint8_t *src, uint8_t *dst, int32_t dst_stride)
Definition: h264pred_msa.c:24
intra_predict_dc_4blk_8x8_msa
static void intra_predict_dc_4blk_8x8_msa(uint8_t *src, int32_t stride)
Definition: h264pred_msa.c:284
LD_UB
#define LD_UB(...)
Definition: generic_macros_msa.h:32
intra_predict_hor_dc_8x8_msa
static void intra_predict_hor_dc_8x8_msa(uint8_t *src, int32_t stride)
Definition: h264pred_msa.c:322
mask
static const uint16_t mask[17]
Definition: lzw.c:38
ff_h264_intra_predict_mad_cow_dc_l0t_8x8_msa
void ff_h264_intra_predict_mad_cow_dc_l0t_8x8_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:492
ff_h264_intra_predict_vert_dc_8x8_msa
void ff_h264_intra_predict_vert_dc_8x8_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:487
intra_predict_horiz_8x8_msa
static void intra_predict_horiz_8x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
Definition: h264pred_msa.c:44
ff_h264_intra_predict_plane_8x8_msa
void ff_h264_intra_predict_plane_8x8_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:472
SRA_4V
#define SRA_4V(in0, in1, in2, in3, shift)
Definition: generic_macros_msa.h:1939
intra_predict_mad_cow_dc_l0t_8x8_msa
static void intra_predict_mad_cow_dc_l0t_8x8_msa(uint8_t *src, int32_t stride)
Definition: h264pred_msa.c:365
intra_predict_mad_cow_dc_0l0_8x8_msa
static void intra_predict_mad_cow_dc_0l0_8x8_msa(uint8_t *src, int32_t stride)
Definition: h264pred_msa.c:453
ff_h264_intra_pred_dc_128_8x8_msa
void ff_h264_intra_pred_dc_128_8x8_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:648
h264dsp_mips.h
PCKEV_B2_UB
#define PCKEV_B2_UB(...)
Definition: generic_macros_msa.h:1720
ff_h264_intra_pred_horiz_8x8_msa
void ff_h264_intra_pred_horiz_8x8_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:528
ff_h264_intra_predict_mad_cow_dc_0lt_8x8_msa
void ff_h264_intra_predict_mad_cow_dc_0lt_8x8_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:498
CLIP_SH2_0_255
#define CLIP_SH2_0_255(in0, in1)
Definition: generic_macros_msa.h:941
intra_predict_mad_cow_dc_l00_8x8_msa
static void intra_predict_mad_cow_dc_l00_8x8_msa(uint8_t *src, int32_t stride)
Definition: h264pred_msa.c:435
ff_h264_intra_pred_dc_top_16x16_msa
void ff_h264_intra_pred_dc_top_16x16_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:624
SD4
#define SD4(in0, in1, in2, in3, pdst, stride)
Definition: generic_macros_msa.h:256
ff_h264_intra_pred_vert_16x16_msa
void ff_h264_intra_pred_vert_16x16_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:578
PCKEV_B2_SH
#define PCKEV_B2_SH(...)
Definition: generic_macros_msa.h:1721
ff_h264_intra_predict_dc_4blk_8x8_msa
void ff_h264_intra_predict_dc_4blk_8x8_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:477
src2
const pixel * src2
Definition: h264pred_template.c:422
ff_vp8_pred8x8_127_dc_8_msa
void ff_vp8_pred8x8_127_dc_8_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:672
ff_vp8_pred8x8_129_dc_8_msa
void ff_vp8_pred8x8_129_dc_8_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:677
intra_predict_plane_16x16_msa
static void intra_predict_plane_16x16_msa(uint8_t *src, int32_t stride)
Definition: h264pred_msa.c:205
stride
#define stride
Definition: h264pred_template.c:537
ff_vp8_pred16x16_127_dc_8_msa
void ff_vp8_pred16x16_127_dc_8_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:682
ff_h264_intra_predict_hor_dc_8x8_msa
void ff_h264_intra_predict_hor_dc_8x8_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:482
ff_vp8_pred16x16_129_dc_8_msa
void ff_vp8_pred16x16_129_dc_8_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:687
intra_predict_vert_16x16_msa
static void intra_predict_vert_16x16_msa(uint8_t *src, uint8_t *dst, int32_t dst_stride)
Definition: h264pred_msa.c:34
src0
const pixel *const src0
Definition: h264pred_template.c:420
ff_h264_intra_predict_mad_cow_dc_l00_8x8_msa
void ff_h264_intra_predict_mad_cow_dc_l00_8x8_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:504
ff_h264_intra_predict_plane_16x16_msa
void ff_h264_intra_predict_plane_16x16_msa(uint8_t *src, ptrdiff_t stride)
Definition: h264pred_msa.c:516
add
static float add(float src0, float src1)
Definition: dnn_backend_native_layer_mathbinary.c:35
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
int32_t
int32_t
Definition: audioconvert.c:56
intra_predict_mad_cow_dc_0lt_8x8_msa
static void intra_predict_mad_cow_dc_0lt_8x8_msa(uint8_t *src, int32_t stride)
Definition: h264pred_msa.c:399
LD
#define LD(psrc)
Definition: generic_macros_msa.h:137
intra_predict_horiz_16x16_msa
static void intra_predict_horiz_16x16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride)
Definition: h264pred_msa.c:63
INSERT_D2_UB
#define INSERT_D2_UB(...)
Definition: generic_macros_msa.h:1169
SD
#define SD
Definition: ccaption_dec.c:929
PCKEV_H2_SH
#define PCKEV_H2_SH(...)
Definition: generic_macros_msa.h:1759