21 #include "libavcodec/hevc/dec.h"
26 -32, -26, -21, -17, -13, -9, -5, -2, 0, 2, 5, 9, 13, 17, 21, 26, 32
30 32, 26, 21, 17, 13, 9, 5, 2, 0, -2, -5, -9, -13, -17, -21, -26
33 #define HEVC_PRED_PLANAR_16x2(src0_r, src0_l, tmp0, tmp1, vec0, vec1, \
34 mul_val_h0, mul_val_h1, mul_val_h2, mul_val_h3, \
35 res0, res1, mul_val_b0, mul_val_b1, round) \
37 v8i16 res0_m, res1_m, res2_m, res3_m; \
39 MUL4(mul_val_h0, vec0, mul_val_h2, vec0, mul_val_h0, vec1, \
40 mul_val_h2, vec1, res0_m, res1_m, res2_m, res3_m); \
42 res0_m += mul_val_h1 * tmp0; \
43 res1_m += mul_val_h3 * tmp0; \
44 res2_m += mul_val_h1 * tmp0; \
45 res3_m += mul_val_h3 * tmp0; \
47 res0_m += mul_val_b0 * src0_r; \
48 res1_m += mul_val_b0 * src0_l; \
49 res2_m += (mul_val_b0 - 1) * src0_r; \
50 res3_m += (mul_val_b0 - 1) * src0_l; \
52 res0_m += mul_val_b1 * tmp1; \
53 res1_m += mul_val_b1 * tmp1; \
54 res2_m += (mul_val_b1 + 1) * tmp1; \
55 res3_m += (mul_val_b1 + 1) * tmp1; \
57 SRARI_H4_SH(res0_m, res1_m, res2_m, res3_m, round); \
58 PCKEV_B2_SH(res1_m, res0_m, res3_m, res2_m, res0, res1); \
62 const uint8_t *src_left,
68 v8i16 vec0, vec1, vec2;
71 src_data =
LW(src_top);
72 SW4(src_data, src_data, src_data, src_data,
dst,
stride);
75 src_data =
LW(src_left);
77 vec2 = (v8i16) __msa_insert_w((v4i32) vec2, 0, src_data);
79 vec0 = __msa_fill_h(src_left[-1]);
80 vec1 = __msa_fill_h(src_top[0]);
82 vec2 = (v8i16) __msa_ilvr_b(
zero, (v16i8) vec2);
88 for (col = 0; col < 4; col++) {
95 const uint8_t *src_left,
99 uint8_t *tmp_dst =
dst;
101 uint16_t val0, val1, val2, val3;
103 v8i16 vec0, vec1, vec2;
106 src_data1 =
LD(src_top);
108 for (row = 8; row--;) {
109 SD(src_data1, tmp_dst);
114 src_data1 =
LD(src_left);
116 vec2 = (v8i16) __msa_insert_d((v2i64)
zero, 0, src_data1);
118 vec0 = __msa_fill_h(src_left[-1]);
119 vec1 = __msa_fill_h(src_top[0]);
121 vec2 = (v8i16) __msa_ilvr_b(
zero, (v16i8) vec2);
150 const uint8_t *src_left,
155 uint8_t *tmp_dst =
dst;
158 v8i16 vec0, vec1, vec2, vec3;
162 for (row = 16; row--;) {
170 vec0 = __msa_fill_h(src_left[-1]);
171 vec1 = __msa_fill_h(src_top[0]);
174 SUB2(vec2, vec0, vec3, vec0, vec2, vec3);
179 ADD2(vec2, vec1, vec3, vec1, vec2, vec3);
182 src = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
184 for (col = 0; col < 16; col++) {
191 const uint8_t *src_left,
195 uint32_t val0, val1, val2, val3;
197 v8i16 src0_r, src_top_val, src_left_val;
200 val0 = src_left[0] * 0x01010101;
201 val1 = src_left[1] * 0x01010101;
202 val2 = src_left[2] * 0x01010101;
203 val3 = src_left[3] * 0x01010101;
208 src0 = (v16i8) __msa_insert_w((v4i32)
src0, 0, val0);
209 src_top_val = __msa_fill_h(src_top[-1]);
210 src_left_val = __msa_fill_h(src_left[0]);
212 src0_r = (v8i16) __msa_ilvr_b(
zero,
src0);
214 src0_r -= src_top_val;
216 src0_r += src_left_val;
218 src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
219 val0 = __msa_copy_s_w((v4i32)
src0, 0);
225 const uint8_t *src_left,
229 uint64_t val0, val1, val2, val3;
231 v8i16 src0_r, src_top_val, src_left_val;
234 val0 = src_left[0] * 0x0101010101010101;
235 val1 = src_left[1] * 0x0101010101010101;
236 val2 = src_left[2] * 0x0101010101010101;
237 val3 = src_left[3] * 0x0101010101010101;
240 val0 = src_left[4] * 0x0101010101010101;
241 val1 = src_left[5] * 0x0101010101010101;
242 val2 = src_left[6] * 0x0101010101010101;
243 val3 = src_left[7] * 0x0101010101010101;
248 src0 = (v16i8) __msa_insert_d((v2i64)
src0, 0, val0);
249 src_top_val = __msa_fill_h(src_top[-1]);
250 src_left_val = __msa_fill_h(src_left[0]);
252 src0_r = (v8i16) __msa_ilvr_b(
zero,
src0);
254 src0_r -= src_top_val;
256 src0_r += src_left_val;
258 src0 = __msa_pckev_b((v16i8) src0_r, (v16i8) src0_r);
259 val0 = __msa_copy_s_d((v2i64)
src0, 0);
265 const uint8_t *src_left,
269 uint8_t *tmp_dst =
dst;
271 uint8_t inp0, inp1, inp2, inp3;
273 v8i16 src0_r, src0_l, src_left_val, src_top_val;
275 src_left_val = __msa_fill_h(src_left[0]);
277 for (row = 4; row--;) {
284 src0 = __msa_fill_b(inp0);
285 src1 = __msa_fill_b(inp1);
286 src2 = __msa_fill_b(inp2);
287 src3 = __msa_fill_b(inp3);
295 src_top_val = __msa_fill_h(src_top[-1]);
298 SUB2(src0_r, src_top_val, src0_l, src_top_val, src0_r, src0_l);
303 ADD2(src0_r, src_left_val, src0_l, src_left_val, src0_r, src0_l);
305 src0 = __msa_pckev_b((v16i8) src0_l, (v16i8) src0_r);
311 const uint8_t *src_left,
315 uint8_t inp0, inp1, inp2, inp3;
318 for (row = 0; row < 8; row++) {
319 inp0 = src_left[row * 4];
320 inp1 = src_left[row * 4 + 1];
321 inp2 = src_left[row * 4 + 2];
322 inp3 = src_left[row * 4 + 3];
324 src0 = __msa_fill_b(inp0);
325 src1 = __msa_fill_b(inp1);
326 src2 = __msa_fill_b(inp2);
327 src3 = __msa_fill_b(inp3);
341 const uint8_t *src_left,
345 uint8_t *tmp_dst =
dst;
346 uint32_t addition = 0;
347 uint32_t val0, val1, val2;
351 v8u16 sum, vec0, vec1;
356 sum = __msa_hadd_u_h((v16u8)
src, (v16u8)
src);
357 sum = (v8u16) __msa_hadd_u_w(sum, sum);
358 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
359 sum = (v8u16) __msa_srari_w((v4i32) sum, 3);
360 addition = __msa_copy_u_w((v4i32) sum, 0);
361 store = (v16u8) __msa_fill_b(addition);
362 val0 = __msa_copy_u_w((v4i32) store, 0);
372 vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
373 store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
374 val1 = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
375 store = (v16u8) __msa_insert_b((v16i8) store, 0, val1);
376 val0 = __msa_copy_u_w((v4i32) store, 0);
385 ADD2(val0, addition, val1, addition, val0, val1);
395 tmp_dst[
stride * 1] = val0;
396 tmp_dst[
stride * 2] = val1;
397 tmp_dst[
stride * 3] = val2;
402 const uint8_t *src_left,
406 uint8_t *tmp_dst =
dst;
407 uint32_t row, col,
val;
408 uint32_t addition = 0;
412 v8u16 sum, vec0, vec1;
418 sum = __msa_hadd_u_h((v16u8)
src, (v16u8)
src);
419 sum = (v8u16) __msa_hadd_u_w(sum, sum);
420 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
421 sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
422 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
423 sum = (v8u16) __msa_srari_w((v4i32) sum, 4);
424 addition = __msa_copy_u_w((v4i32) sum, 0);
425 store = (v16u8) __msa_fill_b(addition);
426 val0 = __msa_copy_u_d((v2i64) store, 0);
428 for (row = 8; row--;) {
439 vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
440 store = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec1);
441 val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
442 store = (v16u8) __msa_insert_b((v16i8) store, 0,
val);
443 val0 = __msa_copy_u_d((v2i64) store, 0);
447 src = (v16u8) __msa_insert_d((v2i64)
src, 0, val0);
448 vec1 = (v8u16) __msa_ilvr_b(
zero, (v16i8)
src);
449 vec0 = (v8u16) __msa_fill_h(addition);
452 vec1 = (v8u16) __msa_srari_h((v8i16) vec1, 2);
454 for (col = 1; col < 8; col++) {
455 tmp_dst[
stride * col] = vec1[col];
461 const uint8_t *src_left,
465 uint8_t *tmp_dst =
dst;
466 uint32_t row, col,
val;
467 uint32_t addition = 0;
468 v16u8 src_above1, store, src_left1;
469 v8u16 sum, sum_above, sum_left;
470 v8u16 vec0, vec1, vec2;
473 src_above1 =
LD_UB(src_top);
474 src_left1 =
LD_UB(src_left);
476 HADD_UB2_UH(src_above1, src_left1, sum_above, sum_left);
477 sum = sum_above + sum_left;
478 sum = (v8u16) __msa_hadd_u_w(sum, sum);
479 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
480 sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
481 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
482 sum = (v8u16) __msa_srari_w((v4i32) sum, 5);
483 addition = __msa_copy_u_w((v4i32) sum, 0);
484 store = (v16u8) __msa_fill_b(addition);
486 for (row = 16; row--;) {
492 vec0 = (v8u16) __msa_ilvr_b(
zero, (v16i8) store);
494 ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
496 ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
498 store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
499 val = (src_left[0] + 2 * addition + src_top[0] + 2) >> 2;
500 store = (v16u8) __msa_insert_b((v16i8) store, 0,
val);
501 ST_UB(store, tmp_dst);
504 vec0 = (v8u16) __msa_fill_h(addition);
506 ADD2(vec1, vec0, vec2, vec0, vec1, vec2);
508 store = (v16u8) __msa_pckev_b((v16i8) vec2, (v16i8) vec1);
510 for (col = 1; col < 16; col++) {
511 tmp_dst[
stride * col] = store[col];
517 const uint8_t *src_left,
521 v16u8 src_above1, src_above2, store, src_left1, src_left2;
522 v8u16 sum_above1, sum_above2;
523 v8u16 sum_left1, sum_left2;
524 v8u16 sum, sum_above, sum_left;
526 LD_UB2(src_top, 16, src_above1, src_above2);
527 LD_UB2(src_left, 16, src_left1, src_left2);
528 HADD_UB2_UH(src_above1, src_above2, sum_above1, sum_above2);
529 HADD_UB2_UH(src_left1, src_left2, sum_left1, sum_left2);
530 sum_above = sum_above1 + sum_above2;
531 sum_left = sum_left1 + sum_left2;
532 sum = sum_above + sum_left;
533 sum = (v8u16) __msa_hadd_u_w(sum, sum);
534 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
535 sum = (v8u16) __msa_pckev_w((v4i32) sum, (v4i32) sum);
536 sum = (v8u16) __msa_hadd_u_d((v4u32) sum, (v4u32) sum);
537 sum = (v8u16) __msa_srari_w((v4i32) sum, 6);
538 store = (v16u8) __msa_splati_b((v16i8) sum, 0);
540 for (row = 16; row--;) {
549 const uint8_t *src_left,
553 v16i8 src_vec0, src_vec1;
554 v8i16 src_vec0_r, src1_r, tmp0, tmp1, mul_val1;
555 v8i16 vec0, vec1, vec2, vec3, res0, res1, res2, res3;
556 v8i16 mul_val0 = { 3, 2, 1, 0, 1, 2, 3, 4 };
562 mul_val1 = (v8i16) __msa_pckod_d((v2i64) mul_val0, (v2i64) mul_val0);
564 src_vec0 = (v16i8) __msa_insert_w((v4i32)
zero, 0,
src0);
565 src_vec1 = (v16i8) __msa_insert_w((v4i32)
zero, 0,
src1);
568 SPLATI_H4_SH(src1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
570 tmp0 = __msa_fill_h(src_top[4]);
571 tmp1 = __msa_fill_h(src_left[4]);
573 MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
574 res0, res1, res2, res3);
576 res0 += mul_val1 * tmp0;
577 res1 += mul_val1 * tmp0;
578 res2 += mul_val1 * tmp0;
579 res3 += mul_val1 * tmp0;
581 res0 += 3 * src_vec0_r;
582 res1 += 2 * src_vec0_r;
591 src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
596 const uint8_t *src_left,
600 v16i8 src_vec0, src_vec1, src_vec2, src_vec3;
601 v8i16 src_vec0_r, src_vec1_r;
602 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
603 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
604 v8i16 tmp0, tmp1, tmp2;
605 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
606 v8i16 mul_val0 = { 7, 6, 5, 4, 3, 2, 1, 0 };
612 src_vec0 = (v16i8) __msa_insert_d((v2i64)
zero, 0,
src0);
613 src_vec1 = (v16i8) __msa_insert_d((v2i64)
zero, 0,
src1);
616 SPLATI_H4_SH(src_vec1_r, 0, 1, 2, 3, vec0, vec1, vec2, vec3);
617 SPLATI_H4_SH(src_vec1_r, 4, 5, 6, 7, vec4, vec5, vec6, vec7);
619 tmp0 = __msa_fill_h(src_top[8]);
620 tmp1 = __msa_fill_h(src_left[8]);
622 MUL4(mul_val0, vec0, mul_val0, vec1, mul_val0, vec2, mul_val0, vec3,
623 res0, res1, res2, res3);
624 MUL4(mul_val0, vec4, mul_val0, vec5, mul_val0, vec6, mul_val0, vec7,
625 res4, res5, res6, res7);
627 tmp2 = mul_val1 * tmp0;
637 res0 += 7 * src_vec0_r;
638 res1 += 6 * src_vec0_r;
639 res2 += 5 * src_vec0_r;
640 res3 += 4 * src_vec0_r;
641 res4 += 3 * src_vec0_r;
642 res5 += 2 * src_vec0_r;
656 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
657 src_vec0, src_vec1, src_vec2, src_vec3);
659 ST_D8(src_vec0, src_vec1, src_vec2, src_vec3, 0, 1, 0, 1,
664 const uint8_t *src_left,
668 v8i16 src0_r, src1_r, src0_l, src1_l;
670 v8i16 res0, res1, tmp0, tmp1;
671 v8i16 mul_val2, mul_val3;
672 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
673 v8i16 mul_val0 = { 15, 14, 13, 12, 11, 10, 9, 8 };
681 mul_val2 = mul_val0 - 8;
682 mul_val3 = mul_val1 + 8;
684 tmp0 = __msa_fill_h(src_top[16]);
685 tmp1 = __msa_fill_h(src_left[16]);
689 mul_val0, mul_val1, mul_val2, mul_val3,
690 res0, res1, 15, 1, 5);
696 mul_val0, mul_val1, mul_val2, mul_val3,
697 res0, res1, 13, 3, 5);
703 mul_val0, mul_val1, mul_val2, mul_val3,
704 res0, res1, 11, 5, 5);
710 mul_val0, mul_val1, mul_val2, mul_val3,
711 res0, res1, 9, 7, 5);
717 mul_val0, mul_val1, mul_val2, mul_val3,
718 res0, res1, 7, 9, 5);
724 mul_val0, mul_val1, mul_val2, mul_val3,
725 res0, res1, 5, 11, 5);
731 mul_val0, mul_val1, mul_val2, mul_val3,
732 res0, res1, 3, 13, 5);
738 mul_val0, mul_val1, mul_val2, mul_val3,
739 res0, res1, 1, 15, 5);
744 const uint8_t *src_left,
749 v8i16 src0_r, src1_r, src0_l, src1_l;
750 v8i16 vec0, vec1, res0, res1;
752 v8i16 mul_val2, mul_val3;
753 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
754 v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
756 tmp0 = __msa_fill_h(src_top[32 -
offset]);
757 tmp1 = __msa_fill_h(src_left[32]);
767 mul_val2 = mul_val0 - 8;
768 mul_val3 = mul_val1 + 8;
772 mul_val0, mul_val1, mul_val2, mul_val3,
773 res0, res1, 31, 1, 6);
779 mul_val0, mul_val1, mul_val2, mul_val3,
780 res0, res1, 29, 3, 6);
786 mul_val0, mul_val1, mul_val2, mul_val3,
787 res0, res1, 27, 5, 6);
793 mul_val0, mul_val1, mul_val2, mul_val3,
794 res0, res1, 25, 7, 6);
800 mul_val0, mul_val1, mul_val2, mul_val3,
801 res0, res1, 23, 9, 6);
807 mul_val0, mul_val1, mul_val2, mul_val3,
808 res0, res1, 21, 11, 6);
814 mul_val0, mul_val1, mul_val2, mul_val3,
815 res0, res1, 19, 13, 6);
821 mul_val0, mul_val1, mul_val2, mul_val3,
822 res0, res1, 17, 15, 6);
827 const uint8_t *src_left,
832 v8i16 src0_r, src1_r, src0_l, src1_l;
833 v8i16 vec0, vec1, res0, res1, tmp0, tmp1;
834 v8i16 mul_val2, mul_val3;
835 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
836 v8i16 mul_val0 = { 31, 30, 29, 28, 27, 26, 25, 24 };
838 tmp0 = __msa_fill_h(src_top[32 -
offset]);
839 tmp1 = __msa_fill_h(src_left[16]);
849 mul_val2 = mul_val0 - 8;
850 mul_val3 = mul_val1 + 8;
854 mul_val0, mul_val1, mul_val2, mul_val3,
855 res0, res1, 15, 17, 6);
861 mul_val0, mul_val1, mul_val2, mul_val3,
862 res0, res1, 13, 19, 6);
868 mul_val0, mul_val1, mul_val2, mul_val3,
869 res0, res1, 11, 21, 6);
875 mul_val0, mul_val1, mul_val2, mul_val3,
876 res0, res1, 9, 23, 6);
882 mul_val0, mul_val1, mul_val2, mul_val3,
883 res0, res1, 7, 25, 6);
889 mul_val0, mul_val1, mul_val2, mul_val3,
890 res0, res1, 5, 27, 6);
896 mul_val0, mul_val1, mul_val2, mul_val3,
897 res0, res1, 3, 29, 6);
903 mul_val0, mul_val1, mul_val2, mul_val3,
904 res0, res1, 1, 31, 6);
909 const uint8_t *src_left,
924 const uint8_t *src_left,
929 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
930 uint8_t ref_array[3 * 32 + 4];
931 uint8_t *ref_tmp = ref_array + 4;
934 int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
935 int32_t idx2, fact_val2, idx3, fact_val3;
939 v16i8 top0, top1, top2, top3;
942 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
943 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
946 inv_angle_val = inv_angle[
mode - 18];
951 if (angle < 0 && last < -1) {
952 inv_angle_val = inv_angle[
mode - 18];
957 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
958 offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
959 ref_tmp[h_cnt] = src_left[
offset];
965 idx0 = angle_loop >> 5;
966 fact_val0 = angle_loop & 31;
969 idx1 = angle_loop >> 5;
970 fact_val1 = angle_loop & 31;
973 idx2 = angle_loop >> 5;
974 fact_val2 = angle_loop & 31;
977 idx3 = angle_loop >> 5;
978 fact_val3 = angle_loop & 31;
985 fact0 = __msa_fill_h(fact_val0);
986 fact1 = __msa_fill_h(32 - fact_val0);
988 fact2 = __msa_fill_h(fact_val1);
989 fact3 = __msa_fill_h(32 - fact_val1);
991 fact4 = __msa_fill_h(fact_val2);
992 fact5 = __msa_fill_h(32 - fact_val2);
994 fact6 = __msa_fill_h(fact_val3);
995 fact7 = __msa_fill_h(32 - fact_val3);
997 ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
998 ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
1000 diff0, diff2, diff4, diff6);
1002 diff1, diff3, diff5, diff7);
1003 ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
1004 ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
1005 MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
1007 diff1 += diff0 * fact1;
1008 diff3 += diff2 * fact3;
1011 dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
1016 const uint8_t *src_left,
1021 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1022 uint8_t ref_array[3 * 32 + 4];
1023 uint8_t *ref_tmp = ref_array + 8;
1025 const uint8_t *src_left_tmp = src_left - 1;
1027 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1028 int32_t idx2, fact_val2, idx3, fact_val3;
1030 int32_t inv_angle_val, inv_angle_val_loop;
1032 v16i8 top0, top1, top2, top3;
1033 v16u8 dst_val0, dst_val1;
1034 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1035 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1038 inv_angle_val = inv_angle[
mode - 18];
1039 last = (angle) >> 2;
1044 inv_angle_val_loop = inv_angle_val * last;
1050 SW(tmp1, ref_tmp + 4);
1051 SW(tmp2, ref_tmp + 8);
1053 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1054 offset = (inv_angle_val_loop + 128) >> 8;
1055 ref_tmp[h_cnt] = src_left_tmp[
offset];
1056 inv_angle_val_loop += inv_angle_val;
1061 for (v_cnt = 0; v_cnt < 2; v_cnt++) {
1062 idx0 = (angle_loop) >> 5;
1063 fact_val0 = (angle_loop) & 31;
1064 angle_loop += angle;
1066 idx1 = (angle_loop) >> 5;
1067 fact_val1 = (angle_loop) & 31;
1068 angle_loop += angle;
1070 idx2 = (angle_loop) >> 5;
1071 fact_val2 = (angle_loop) & 31;
1072 angle_loop += angle;
1074 idx3 = (angle_loop) >> 5;
1075 fact_val3 = (angle_loop) & 31;
1076 angle_loop += angle;
1083 fact0 = __msa_fill_h(fact_val0);
1084 fact1 = __msa_fill_h(32 - fact_val0);
1085 fact2 = __msa_fill_h(fact_val1);
1086 fact3 = __msa_fill_h(32 - fact_val1);
1087 fact4 = __msa_fill_h(fact_val2);
1088 fact5 = __msa_fill_h(32 - fact_val2);
1089 fact6 = __msa_fill_h(fact_val3);
1090 fact7 = __msa_fill_h(32 - fact_val3);
1097 SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
1098 diff1, diff3, diff5, diff7);
1099 MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
1100 diff1, diff3, diff5, diff7);
1102 diff1 += diff0 * fact1;
1103 diff3 += diff2 * fact3;
1104 diff5 += diff4 * fact5;
1105 diff7 += diff6 * fact7;
1108 PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
1115 const uint8_t *src_left,
1120 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1121 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1122 int32_t idx2, fact_val2, idx3, fact_val3;
1125 int32_t inv_angle_val, inv_angle_val_loop;
1126 uint8_t ref_array[3 * 32 + 4];
1127 uint8_t *ref_tmp = ref_array + 16;
1129 const uint8_t *src_left_tmp = src_left - 1;
1131 v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
1132 v16i8 dst0, dst1, dst2, dst3;
1133 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1134 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1135 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1138 inv_angle_val = inv_angle[
mode - 18];
1144 inv_angle_val_loop = inv_angle_val * last;
1147 tmp0 =
LW(
ref + 16);
1148 ST_UB(top0, ref_tmp);
1149 SW(tmp0, ref_tmp + 16);
1151 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1152 offset = (inv_angle_val_loop + 128) >> 8;
1153 ref_tmp[h_cnt] = src_left_tmp[
offset];
1154 inv_angle_val_loop += inv_angle_val;
1159 for (v_cnt = 4; v_cnt--;) {
1160 idx0 = (angle_loop) >> 5;
1161 fact_val0 = (angle_loop) & 31;
1162 angle_loop += angle;
1164 idx1 = (angle_loop) >> 5;
1165 fact_val1 = (angle_loop) & 31;
1166 angle_loop += angle;
1168 idx2 = (angle_loop) >> 5;
1169 fact_val2 = (angle_loop) & 31;
1170 angle_loop += angle;
1172 idx3 = (angle_loop) >> 5;
1173 fact_val3 = (angle_loop) & 31;
1174 angle_loop += angle;
1181 fact0 = __msa_fill_h(fact_val0);
1182 fact1 = __msa_fill_h(32 - fact_val0);
1183 fact2 = __msa_fill_h(fact_val1);
1184 fact3 = __msa_fill_h(32 - fact_val1);
1185 fact4 = __msa_fill_h(fact_val2);
1186 fact5 = __msa_fill_h(32 - fact_val2);
1187 fact6 = __msa_fill_h(fact_val3);
1188 fact7 = __msa_fill_h(32 - fact_val3);
1190 SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1191 top1, top3, top5, top7);
1201 MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
1202 diff2, diff3, diff6, diff7);
1203 MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
1204 diff10, diff11, diff14, diff15);
1206 diff2 += diff0 * fact1;
1207 diff3 += diff1 * fact1;
1208 diff6 += diff4 * fact3;
1209 diff7 += diff5 * fact3;
1210 diff10 += diff8 * fact5;
1211 diff11 += diff9 * fact5;
1212 diff14 += diff12 * fact7;
1213 diff15 += diff13 * fact7;
1217 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1218 dst0, dst1, dst2, dst3);
1225 const uint8_t *src_left,
1230 int16_t inv_angle[] = { -256, -315, -390, -482, -630, -910, -1638, -4096 };
1231 uint8_t ref_array[3 * 32 + 4];
1234 const uint8_t *src_left_tmp = src_left - 1;
1235 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1236 int32_t tmp0, tmp1, tmp2, tmp3;
1238 int32_t inv_angle_val, inv_angle_val_loop;
1240 v16u8 top0, top1, top2, top3, top4, top5, top6, top7;
1241 v16i8 dst0, dst1, dst2, dst3;
1242 v8i16 fact0, fact1, fact2, fact3;
1243 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1244 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1246 ref_tmp = ref_array + 32;
1249 inv_angle_val = inv_angle[
mode - 18];
1255 inv_angle_val_loop = inv_angle_val * last;
1262 ST_UB2(top0, top1, ref_tmp, 16);
1268 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1269 offset = (inv_angle_val_loop + 128) >> 8;
1270 ref_tmp[h_cnt] = src_left_tmp[
offset];
1271 inv_angle_val_loop += inv_angle_val;
1277 for (v_cnt = 16; v_cnt--;) {
1278 idx0 = (angle_loop) >> 5;
1279 fact_val0 = (angle_loop) & 31;
1280 angle_loop += angle;
1282 idx1 = (angle_loop) >> 5;
1283 fact_val1 = (angle_loop) & 31;
1284 angle_loop += angle;
1293 fact0 = __msa_fill_h(fact_val0);
1294 fact1 = __msa_fill_h(32 - fact_val0);
1295 fact2 = __msa_fill_h(fact_val1);
1296 fact3 = __msa_fill_h(32 - fact_val1);
1301 SLDI_B4_UB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1302 top1, top3, top5, top7);
1312 MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
1313 diff2, diff3, diff6, diff7);
1314 MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
1315 diff10, diff11, diff14, diff15);
1317 diff2 += diff0 * fact1;
1318 diff3 += diff1 * fact1;
1319 diff6 += diff4 * fact1;
1320 diff7 += diff5 * fact1;
1321 diff10 += diff8 * fact3;
1322 diff11 += diff9 * fact3;
1323 diff14 += diff12 * fact3;
1324 diff15 += diff13 * fact3;
1328 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1329 dst0, dst1, dst2, dst3);
1339 const uint8_t *src_left,
1344 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1345 uint8_t ref_array[3 * 32 + 4];
1346 uint8_t *ref_tmp = ref_array + 4;
1349 int32_t h_cnt, idx0, fact_val0, idx1, fact_val1;
1350 int32_t idx2, fact_val2, idx3, fact_val3;
1351 int32_t angle, angle_loop, inv_angle_val;
1353 v16i8 dst_val0, dst_val1;
1354 v16u8 top0, top1, top2, top3;
1356 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1357 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1365 inv_angle_val = inv_angle[
mode - 11];
1370 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1371 offset = -1 + ((h_cnt * inv_angle_val + 128) >> 8);
1372 ref_tmp[h_cnt] = src_top[
offset];
1378 idx0 = angle_loop >> 5;
1379 fact_val0 = angle_loop & 31;
1380 angle_loop += angle;
1382 idx1 = angle_loop >> 5;
1383 fact_val1 = angle_loop & 31;
1384 angle_loop += angle;
1386 idx2 = angle_loop >> 5;
1387 fact_val2 = angle_loop & 31;
1388 angle_loop += angle;
1390 idx3 = angle_loop >> 5;
1391 fact_val3 = angle_loop & 31;
1398 fact0 = __msa_fill_h(fact_val0);
1399 fact1 = __msa_fill_h(32 - fact_val0);
1400 fact2 = __msa_fill_h(fact_val1);
1401 fact3 = __msa_fill_h(32 - fact_val1);
1402 fact4 = __msa_fill_h(fact_val2);
1403 fact5 = __msa_fill_h(32 - fact_val2);
1404 fact6 = __msa_fill_h(fact_val3);
1405 fact7 = __msa_fill_h(32 - fact_val3);
1407 ILVR_D2_SH(fact2, fact0, fact6, fact4, fact0, fact2);
1408 ILVR_D2_SH(fact3, fact1, fact7, fact5, fact1, fact3);
1410 diff0, diff2, diff4, diff6);
1412 diff1, diff3, diff5, diff7);
1413 ILVR_D2_SH(diff2, diff0, diff6, diff4, diff0, diff2);
1414 ILVR_D2_SH(diff3, diff1, diff7, diff5, diff1, diff3);
1415 MUL2(diff1, fact0, diff3, fact2, diff1, diff3);
1417 diff1 += diff0 * fact1;
1418 diff3 += diff2 * fact3;
1421 PCKEV_B2_SB(diff1, diff1, diff3, diff3, dst_val0, dst_val1);
1423 diff0 = (v8i16) __msa_pckev_b(dst_val1, dst_val0);
1424 diff1 = (v8i16) __msa_pckod_b(dst_val1, dst_val0);
1426 diff2 = (v8i16) __msa_pckev_w((v4i32) diff1, (v4i32) diff0);
1428 dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
1429 dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);
1436 const uint8_t *src_left,
1441 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1442 uint8_t ref_array[3 * 32 + 4];
1443 uint8_t *ref_tmp = ref_array + 8;
1445 const uint8_t *src_top_tmp = src_top - 1;
1448 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1449 int32_t idx2, fact_val2, idx3, fact_val3;
1450 int32_t angle, angle_loop, inv_angle_val;
1451 v16i8 top0, top1, top2, top3;
1452 v16i8 dst_val0, dst_val1, dst_val2, dst_val3;
1453 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1454 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1457 last = (angle) >> 2;
1462 inv_angle_val = inv_angle[
mode - 11];
1468 SW(tmp1, ref_tmp + 4);
1469 SW(tmp2, ref_tmp + 8);
1471 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1472 offset = (h_cnt * inv_angle_val + 128) >> 8;
1473 ref_tmp[h_cnt] = src_top_tmp[
offset];
1479 for (v_cnt = 0; v_cnt < 2; v_cnt++) {
1482 idx0 = angle_loop >> 5;
1483 fact_val0 = angle_loop & 31;
1484 angle_loop += angle;
1486 idx1 = angle_loop >> 5;
1487 fact_val1 = angle_loop & 31;
1488 angle_loop += angle;
1490 idx2 = angle_loop >> 5;
1491 fact_val2 = angle_loop & 31;
1492 angle_loop += angle;
1494 idx3 = angle_loop >> 5;
1495 fact_val3 = angle_loop & 31;
1496 angle_loop += angle;
1503 fact0 = __msa_fill_h(fact_val0);
1504 fact1 = __msa_fill_h(32 - fact_val0);
1505 fact2 = __msa_fill_h(fact_val1);
1506 fact3 = __msa_fill_h(32 - fact_val1);
1507 fact4 = __msa_fill_h(fact_val2);
1508 fact5 = __msa_fill_h(32 - fact_val2);
1509 fact6 = __msa_fill_h(fact_val3);
1510 fact7 = __msa_fill_h(32 - fact_val3);
1516 SLDI_B4_SH(diff1, diff0, diff3, diff2, diff5, diff4, diff7, diff6, 2,
1517 diff1, diff3, diff5, diff7);
1518 MUL4(diff1, fact0, diff3, fact2, diff5, fact4, diff7, fact6,
1519 diff1, diff3, diff5, diff7);
1521 diff1 += diff0 * fact1;
1522 diff3 += diff2 * fact3;
1523 diff5 += diff4 * fact5;
1524 diff7 += diff6 * fact7;
1527 PCKEV_B4_SB(diff1, diff1, diff3, diff3, diff5, diff5, diff7, diff7,
1528 dst_val0, dst_val1, dst_val2, dst_val3);
1529 ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
1531 ST_W8(diff3, diff4, 0, 1, 2, 3, 0, 1, 2, 3, dst_org,
stride);
1537 const uint8_t *src_left,
1542 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1543 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1;
1544 int32_t idx2, fact_val2, idx3, fact_val3, tmp0;
1545 v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
1546 v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
1547 v8i16 fact0, fact1, fact2, fact3, fact4, fact5, fact6, fact7;
1548 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1549 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1551 uint8_t ref_array[3 * 32 + 4];
1552 uint8_t *ref_tmp = ref_array + 16;
1553 const uint8_t *
ref, *src_top_tmp = src_top - 1;
1558 last = (angle) >> 1;
1563 inv_angle_val = inv_angle[
mode - 11];
1566 tmp0 =
LW(
ref + 16);
1567 ST_SB(top0, ref_tmp);
1568 SW(tmp0, ref_tmp + 16);
1570 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1571 offset = (h_cnt * inv_angle_val + 128) >> 8;
1572 ref_tmp[h_cnt] = src_top_tmp[
offset];
1578 for (v_cnt = 0; v_cnt < 4; v_cnt++) {
1581 idx0 = angle_loop >> 5;
1582 fact_val0 = angle_loop & 31;
1583 angle_loop += angle;
1585 idx1 = angle_loop >> 5;
1586 fact_val1 = angle_loop & 31;
1587 angle_loop += angle;
1589 idx2 = angle_loop >> 5;
1590 fact_val2 = angle_loop & 31;
1591 angle_loop += angle;
1593 idx3 = angle_loop >> 5;
1594 fact_val3 = angle_loop & 31;
1595 angle_loop += angle;
1602 fact0 = __msa_fill_h(fact_val0);
1603 fact1 = __msa_fill_h(32 - fact_val0);
1604 fact2 = __msa_fill_h(fact_val1);
1605 fact3 = __msa_fill_h(32 - fact_val1);
1606 fact4 = __msa_fill_h(fact_val2);
1607 fact5 = __msa_fill_h(32 - fact_val2);
1608 fact6 = __msa_fill_h(fact_val3);
1609 fact7 = __msa_fill_h(32 - fact_val3);
1611 SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1612 top1, top3, top5, top7);
1623 MUL4(diff2, fact0, diff3, fact0, diff6, fact2, diff7, fact2,
1624 diff2, diff3, diff6, diff7);
1625 MUL4(diff10, fact4, diff11, fact4, diff14, fact6, diff15, fact6,
1626 diff10, diff11, diff14, diff15);
1628 diff2 += diff0 * fact1;
1629 diff3 += diff1 * fact1;
1630 diff6 += diff4 * fact3;
1631 diff7 += diff5 * fact3;
1632 diff10 += diff8 * fact5;
1633 diff11 += diff9 * fact5;
1634 diff14 += diff12 * fact7;
1635 diff15 += diff13 * fact7;
1639 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1640 dst_val0, dst_val1, dst_val2, dst_val3);
1641 ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
1642 ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
1645 ST_W8(diff4, diff5, 0, 1, 2, 3, 0, 1, 2, 3, dst_org,
stride);
1647 ST_W8(diff6, diff7, 0, 1, 2, 3, 0, 1, 2, 3, dst_org,
stride);
1653 const uint8_t *src_left,
1658 int16_t inv_angle[] = { -4096, -1638, -910, -630, -482, -390, -315 };
1659 int32_t h_cnt, v_cnt, idx0, fact_val0, idx1, fact_val1, tmp0;
1660 v16i8 top0, top1, dst_val0, top2, top3, dst_val1;
1661 v16i8 top4, top5, dst_val2, top6, top7, dst_val3;
1662 v8i16 fact0, fact1, fact2, fact3;
1663 v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
1664 v8i16 diff8, diff9, diff10, diff11, diff12, diff13, diff14, diff15;
1666 uint8_t ref_array[3 * 32 + 4];
1667 uint8_t *ref_tmp = ref_array + 32;
1668 const uint8_t *
ref, *src_top_tmp = src_top - 1;
1678 inv_angle_val = inv_angle[
mode - 11];
1681 tmp0 =
LW(
ref + 32);
1682 ST_SB2(top0, top1, ref_tmp, 16);
1683 SW(tmp0, ref_tmp + 32);
1685 for (h_cnt = last; h_cnt <= -1; h_cnt++) {
1686 offset = (h_cnt * inv_angle_val + 128) >> 8;
1687 ref_tmp[h_cnt] = src_top_tmp[
offset];
1693 for (v_cnt = 0; v_cnt < 16; v_cnt++) {
1695 idx0 = angle_loop >> 5;
1696 fact_val0 = angle_loop & 31;
1697 angle_loop += angle;
1699 idx1 = angle_loop >> 5;
1700 fact_val1 = angle_loop & 31;
1701 angle_loop += angle;
1710 fact0 = __msa_fill_h(fact_val0);
1711 fact1 = __msa_fill_h(32 - fact_val0);
1712 fact2 = __msa_fill_h(fact_val1);
1713 fact3 = __msa_fill_h(32 - fact_val1);
1718 SLDI_B4_SB(top1, top0, top3, top2, top5, top4, top7, top6, 1,
1719 top1, top3, top5, top7);
1730 MUL4(diff2, fact0, diff3, fact0, diff6, fact0, diff7, fact0,
1731 diff2, diff3, diff6, diff7);
1732 MUL4(diff10, fact2, diff11, fact2, diff14, fact2, diff15, fact2,
1733 diff10, diff11, diff14, diff15);
1735 diff2 += diff0 * fact1;
1736 diff3 += diff1 * fact1;
1737 diff6 += diff4 * fact1;
1738 diff7 += diff5 * fact1;
1739 diff10 += diff8 * fact3;
1740 diff11 += diff9 * fact3;
1741 diff14 += diff12 * fact3;
1742 diff15 += diff13 * fact3;
1746 PCKEV_B4_SB(diff3, diff2, diff7, diff6, diff11, diff10, diff15, diff14,
1747 dst_val0, dst_val1, dst_val2, dst_val3);
1751 ST_H8(diff0, 0, 1, 2, 3, 4, 5, 6, 7, dst_org,
stride)
1753 ST_H8(diff1, 0, 1, 2, 3, 4, 5, 6, 7, dst_org,
stride)
1755 ST_H8(diff2, 0, 1, 2, 3, 4, 5, 6, 7, dst_org,
stride)
1757 ST_H8(diff3, 0, 1, 2, 3, 4, 5, 6, 7, dst_org,
stride)
1773 for (row = 32; row--;) {
1780 const uint8_t *src_top,
1781 const uint8_t *src_left,
1788 const uint8_t *src_top,
1789 const uint8_t *src_left,
1796 const uint8_t *src_top,
1797 const uint8_t *src_left,
1804 const uint8_t *src_top,
1805 const uint8_t *src_left,
1812 const uint8_t *src_left,
1835 const uint8_t *src_top,
1836 const uint8_t *src_left,
1841 }
else if (
mode == 26) {
1843 }
else if (
mode >= 18) {
1853 const uint8_t *src_top,
1854 const uint8_t *src_left,
1859 }
else if (
mode == 26) {
1861 }
else if (
mode >= 18) {
1871 const uint8_t *src_top,
1872 const uint8_t *src_left,
1877 }
else if (
mode == 26) {
1879 }
else if (
mode >= 18) {
1889 const uint8_t *src_top,
1890 const uint8_t *src_left,
1895 }
else if (
mode == 26) {
1897 }
else if (
mode >= 18) {
1907 int x0,
int y0,
int c_idx)
1913 int hshift =
sps->hshift[c_idx];
1915 int size_in_luma_h = 16 << hshift;
1916 int size_in_tbs_h = size_in_luma_h >>
sps->log2_min_tb_size;
1917 int size_in_luma_v = 16 <<
vshift;
1918 int size_in_tbs_v = size_in_luma_v >>
sps->log2_min_tb_size;
1919 int x = x0 >> hshift;
1921 int x_tb = (x0 >>
sps->log2_min_tb_size) &
sps->tb_mask;
1922 int y_tb = (y0 >>
sps->log2_min_tb_size) &
sps->tb_mask;
1925 pps->min_tb_addr_zs[(y_tb) * (
sps->tb_mask + 2) + (x_tb)];
1927 ptrdiff_t
stride =
s->frame->linesize[c_idx] /
sizeof(uint8_t);
1928 uint8_t *
src = (uint8_t *)
s->frame->data[c_idx] + x + y *
stride;
1930 int min_pu_width =
sps->min_pu_width;
/*
 * Scratch storage for the left/top reference samples and their smoothed
 * copies.  Each array holds 2 * 32 + 1 bytes; the working pointers below
 * start one byte into their array so that index -1 (read later as
 * left[-1]) still lies inside the array.
 */
1935 uint8_t left_array[2 * 32 + 1];
1936 uint8_t filtered_left_array[2 * 32 + 1];
1937 uint8_t top_array[2 * 32 + 1];
1938 uint8_t filtered_top_array[2 * 32 + 1];
/* Working pointers, offset by one element so that [-1] is addressable. */
1940 uint8_t *
left = left_array + 1;
1941 uint8_t *top = top_array + 1;
1942 uint8_t *filtered_left = filtered_left_array + 1;
1943 uint8_t *filtered_top = filtered_top_array + 1;
1946 pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) &
sps->tb_mask) *
1947 (
sps->tb_mask + 2) + (x_tb - 1)];
1953 pps->min_tb_addr_zs[(y_tb - 1) * (
sps->tb_mask + 2) +
1954 ((x_tb + size_in_tbs_h) &
sps->tb_mask)];
/*
 * bottom_left_size =
 *     (min(y0 + 2 * size_in_luma_v, sps->height) - (y0 + size_in_luma_v))
 *         >> vshift
 * i.e. the vertical extent below the block, clamped at sps->height
 * (presumably the picture height in luma samples) and then scaled down by
 * vshift.
 */
1956 int bottom_left_size =
1957 (((y0 + 2 * size_in_luma_v) >
1958 (
sps->height) ? (
sps->height) : (y0 +
1959 2 * size_in_luma_v)) -
1960 (y0 + size_in_luma_v)) >>
vshift;
/*
 * Same computation horizontally:
 *     (min(x0 + 2 * size_in_luma_h, sps->width) - (x0 + size_in_luma_h))
 *         >> hshift
 */
1961 int top_right_size =
1962 (((x0 + 2 * size_in_luma_h) >
1963 (
sps->width) ? (
sps->width) : (x0 + 2 * size_in_luma_h)) -
1964 (x0 + size_in_luma_h)) >> hshift;
1966 if (
pps->constrained_intra_pred_flag == 1) {
1967 int size_in_luma_pu_v = ((size_in_luma_v) >>
sps->log2_min_pu_size);
1968 int size_in_luma_pu_h = ((size_in_luma_h) >>
sps->log2_min_pu_size);
1969 int on_pu_edge_x = !(x0 & ((1 <<
sps->log2_min_pu_size) - 1));
1970 int on_pu_edge_y = !(y0 & ((1 <<
sps->log2_min_pu_size) - 1));
1971 if (!size_in_luma_pu_h)
1972 size_in_luma_pu_h++;
1973 if (cand_bottom_left == 1 && on_pu_edge_x) {
1974 int x_left_pu = ((x0 - 1) >>
sps->log2_min_pu_size);
1976 ((y0 + size_in_luma_v) >>
sps->log2_min_pu_size);
1978 ((size_in_luma_pu_v) >
1979 (
sps->min_pu_height -
1980 y_bottom_pu) ? (
sps->min_pu_height -
1981 y_bottom_pu) : (size_in_luma_pu_v));
1982 cand_bottom_left = 0;
1983 for (
i = 0;
i <
max;
i += 2)
1985 ((
s->cur_frame->tab_mvf[(x_left_pu) +
1987 i) * min_pu_width]).pred_flag ==
1990 if (cand_left == 1 && on_pu_edge_x) {
1991 int x_left_pu = ((x0 - 1) >>
sps->log2_min_pu_size);
1992 int y_left_pu = ((y0) >>
sps->log2_min_pu_size);
1994 ((size_in_luma_pu_v) >
1995 (
sps->min_pu_height -
1996 y_left_pu) ? (
sps->min_pu_height -
1997 y_left_pu) : (size_in_luma_pu_v));
1999 for (
i = 0;
i <
max;
i += 2)
2001 ((
s->cur_frame->tab_mvf[(x_left_pu) +
2003 i) * min_pu_width]).pred_flag ==
2006 if (cand_up_left == 1) {
2007 int x_left_pu = ((x0 - 1) >>
sps->log2_min_pu_size);
2008 int y_top_pu = ((y0 - 1) >>
sps->log2_min_pu_size);
2010 (
s->cur_frame->tab_mvf[(x_left_pu) +
2011 (y_top_pu) * min_pu_width]).pred_flag ==
2014 if (cand_up == 1 && on_pu_edge_y) {
2015 int x_top_pu = ((x0) >>
sps->log2_min_pu_size);
2016 int y_top_pu = ((y0 - 1) >>
sps->log2_min_pu_size);
2018 ((size_in_luma_pu_h) >
2019 (
sps->min_pu_width -
2020 x_top_pu) ? (
sps->min_pu_width -
2021 x_top_pu) : (size_in_luma_pu_h));
2023 for (
i = 0;
i <
max;
i += 2)
2025 ((
s->cur_frame->tab_mvf[(x_top_pu +
i) +
2027 min_pu_width]).pred_flag ==
PF_INTRA);
2029 if (cand_up_right == 1 && on_pu_edge_y) {
2030 int y_top_pu = ((y0 - 1) >>
sps->log2_min_pu_size);
2032 ((x0 + size_in_luma_h) >>
sps->log2_min_pu_size);
2034 ((size_in_luma_pu_h) >
2035 (
sps->min_pu_width -
2036 x_right_pu) ? (
sps->min_pu_width -
2037 x_right_pu) : (size_in_luma_pu_h));
2039 for (
i = 0;
i <
max;
i += 2)
2041 ((
s->cur_frame->tab_mvf[(x_right_pu +
i) +
2043 min_pu_width]).pred_flag ==
PF_INTRA);
2046 vec0 = (v16u8) __msa_ldi_b(128);
2050 ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2062 if (cand_up_right) {
2064 ST_UB(vec0, (top + 16));
2068 ((
src[(16 + top_right_size - 1) +
stride * (-1)]) *
2070 for (
i = 0;
i < (16 - top_right_size);
i += 4)
2076 for (
i = 0;
i < 16;
i++)
2078 if (cand_bottom_left) {
2079 for (
i = 16;
i < 16 + bottom_left_size;
i++)
2083 ((
src[(-1) +
stride * (16 + bottom_left_size - 1)]) *
2085 for (
i = 0;
i < (16 - bottom_left_size);
i += 4)
2091 if (
pps->constrained_intra_pred_flag == 1) {
2092 if (cand_bottom_left || cand_left || cand_up_left || cand_up
2095 x0 + ((2 * 16) << hshift) <
2096 sps->width ? 2 * 16 : (
sps->width - x0) >> hshift;
2098 y0 + ((2 * 16) <<
vshift) <
2100 int j = 16 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2101 if (!cand_up_right) {
2102 size_max_x = x0 + ((16) << hshift) <
sps->width ?
2103 16 : (
sps->width - x0) >> hshift;
2105 if (!cand_bottom_left) {
2106 size_max_y = y0 + ((16) <<
vshift) <
sps->height ?
2109 if (cand_bottom_left || cand_left || cand_up_left) {
2112 !((
s->cur_frame->tab_mvf[(((x0 +
2113 ((-1) << hshift)) >>
sps->
2114 log2_min_pu_size)) + (((y0 +
2119 * min_pu_width]).pred_flag ==
2123 ((
s->cur_frame->tab_mvf[(((x0 +
2124 ((-1) << hshift)) >>
sps->
2125 log2_min_pu_size)) + (((y0 + ((j)
2130 * min_pu_width]).pred_flag ==
PF_INTRA)) {
2132 while (j < size_max_x
2134 !((
s->cur_frame->tab_mvf[(((x0 +
2135 ((j) << hshift)) >>
sps->
2136 log2_min_pu_size)) + (((y0 +
2141 * min_pu_width]).pred_flag ==
2144 for (
i = j;
i > (j) - (j + 1);
i--)
2146 ((
s->cur_frame->tab_mvf[(((x0 +
2148 1) << hshift)) >>
sps->
2149 log2_min_pu_size)) + (((y0 +
2154 * min_pu_width]).pred_flag ==
2156 top[
i - 1] = top[
i];
2161 while (j < size_max_x
2163 !((
s->cur_frame->tab_mvf[(((x0 +
2164 ((j) << hshift)) >>
sps->
2165 log2_min_pu_size)) + (((y0 + ((-1)
2170 * min_pu_width]).pred_flag ==
2175 for (
i = j;
i > (j) - (j + 1);
i--)
2177 ((
s->cur_frame->tab_mvf[(((x0 +
2180 sps->log2_min_pu_size))
2184 sps->log2_min_pu_size))
2186 min_pu_width]).pred_flag ==
2188 top[
i - 1] = top[
i];
2190 for (
i = j;
i > (j) - (j);
i--)
2192 ((
s->cur_frame->tab_mvf[(((x0 +
2195 sps->log2_min_pu_size))
2199 sps->log2_min_pu_size))
2201 min_pu_width]).pred_flag ==
2203 top[
i - 1] = top[
i];
2209 if (cand_bottom_left || cand_left) {
2210 a = ((
left[-1]) * 0x01010101U);
2211 for (
i = 0;
i < (0) + (size_max_y);
i += 4)
2213 ((
s->cur_frame->tab_mvf[(((x0 +
2214 ((-1) << hshift)) >>
sps->
2215 log2_min_pu_size)) + (((y0 +
2220 * min_pu_width]).pred_flag ==
2224 a = ((
left[
i + 3]) * 0x01010101U);
2227 vec0 = (v16u8) __msa_fill_b(
left[-1]);
2231 if (!cand_bottom_left) {
2233 vec0 = (v16u8) __msa_fill_b(
left[15]);
2237 if (x0 != 0 && y0 != 0) {
2238 a = ((
left[size_max_y - 1]) * 0x01010101U);
2239 for (
i = (size_max_y - 1);
2240 i > (size_max_y - 1) - (size_max_y);
i -= 4)
2242 ((
s->cur_frame->tab_mvf[(((x0 +
2243 ((-1) << hshift)) >>
sps->
2244 log2_min_pu_size)) + (((y0 +
2250 * min_pu_width]).pred_flag ==
2254 a = ((
left[
i - 3]) * 0x01010101U);
2256 ((
s->cur_frame->tab_mvf[(((x0 +
2257 ((-1) << hshift)) >>
sps->
2258 log2_min_pu_size)) + (((y0 + ((-1)
2263 * min_pu_width]).pred_flag ==
PF_INTRA))
2265 }
else if (x0 == 0) {
2267 uint32_t pix = ((0) * 0x01010101U);
2268 for (
i = 0;
i < (size_max_y);
i += 4)
2272 a = ((
left[size_max_y - 1]) * 0x01010101U);
2273 for (
i = (size_max_y - 1);
2274 i > (size_max_y - 1) - (size_max_y);
i -= 4)
2276 ((
s->cur_frame->tab_mvf[(((x0 +
2277 ((-1) << hshift)) >>
sps->
2278 log2_min_pu_size)) + (((y0 +
2284 * min_pu_width]).pred_flag ==
2288 a = ((
left[
i - 3]) * 0x01010101U);
2292 a = ((
left[-1]) * 0x01010101U);
2293 for (
i = 0;
i < (0) + (size_max_x);
i += 4)
2295 ((
s->cur_frame->tab_mvf[(((x0 +
2296 ((
i) << hshift)) >>
sps->
2297 log2_min_pu_size)) + (((y0 + ((-1)
2302 * min_pu_width]).pred_flag ==
2306 a = ((top[
i + 3]) * 0x01010101U);
2311 if (!cand_bottom_left) {
2313 vec0 = (v16u8) __msa_fill_b(
left[15]);
2317 }
else if (cand_up_left) {
2318 vec0 = (v16u8) __msa_fill_b(
left[-1]);
2323 }
else if (cand_up) {
2326 vec0 = (v16u8) __msa_fill_b(
left[-1]);
2332 }
else if (cand_up_right) {
2333 vec0 = (v16u8) __msa_fill_b(top[16]);
2346 vec0 = (v16u8) __msa_ldi_b(128);
2348 ST_UB2(vec0, vec0, top, 16);
2354 vec0 = (v16u8) __msa_fill_b(
left[16]);
2357 if (!cand_up_left) {
2361 vec0 = (v16u8) __msa_fill_b(
left[-1]);
2364 if (!cand_up_right) {
2365 vec0 = (v16u8) __msa_fill_b(top[15]);
2366 ST_UB(vec0, (top + 16));
/*
 * Reference-sample smoothing.  Only considered when the SPS does not
 * disable intra smoothing and the plane is either c_idx 0 or the stream
 * uses chroma_format_idc 3.
 */
2372 if (!
sps->intra_smoothing_disabled
2373 && (c_idx == 0 ||
sps->chroma_format_idc == 3)) {
/* Distance thresholds; this code path indexes entry [4 - 3], i.e. 1. */
2375 int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
/*
 * min(|mode - 26|, |mode - 10|), written out as nested conditional
 * expressions: the smaller of the two absolute differences is selected.
 */
2376 int min_dist_vert_hor =
2377 (((((int) (
mode - 26
U)) >=
2378 0 ? ((
int) (
mode - 26
U)) : (-((int) (
mode - 26
U))))) >
2379 ((((int) (
mode - 10
U)) >=
2380 0 ? ((
int) (
mode - 10
U)) : (-((int) (
mode - 10
U)))))
2381 ? ((((int) (
mode - 10
U)) >=
2382 0 ? ((
int) (
mode - 10
U)) : (-((int) (
mode - 10
U)))))
2383 : ((((int) (
mode - 26
U)) >=
2384 0 ? ((
int) (
mode - 26
U)) : (-((int) (
mode - 26
U))))));
/* Filter only when the mode is further than the threshold from both 10 and 26. */
2385 if (min_dist_vert_hor > intra_hor_ver_dist_thresh[4 - 3]) {
/* The last sample (index 2 * 16 - 1) of each edge is copied unfiltered. */
2386 filtered_left[2 * 16 - 1] =
left[2 * 16 - 1];
2387 filtered_top[2 * 16 - 1] = top[2 * 16 - 1];
/*
 * Left edge, index 2 * 16 - 2 down to 0.  Only the tail of the statement
 * is visible in this view; presumably the same 3-tap form as the top loop
 * below.
 */
2388 for (
i = 2 * 16 - 2;
i >= 0;
i--)
2390 left[
i - 1] + 2) >> 2;
/*
 * Corner value: (left[0] + 2 * left[-1] + top[0] + 2) >> 2.  The
 * destination of this assignment is not visible in this view.
 */
2393 (
left[0] + 2 *
left[-1] + top[0] + 2) >> 2;
/*
 * Top edge: 3-tap (1, 2, 1) filter with rounding,
 * (top[i + 1] + 2 * top[i] + top[i - 1] + 2) >> 2; at i == 0 this reads
 * top[-1].
 */
2394 for (
i = 2 * 16 - 2;
i >= 0;
i--)
2395 filtered_top[
i] = (top[
i + 1] + 2 * top[
i] +
2396 top[
i - 1] + 2) >> 2;
/* From here on, left refers to the smoothed copy. */
2397 left = filtered_left;
2405 s->hpc.pred_planar[4 - 2] ((uint8_t *)
src, (uint8_t *) top,
2409 s->hpc.pred_dc((uint8_t *)
src, (uint8_t *) top,
2413 s->hpc.pred_angular[4 - 2] ((uint8_t *)
src, (uint8_t *) top,
/* Vector temporaries used by the interpolation code further down. */
2422 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2423 v8i16 res0, res1, res2, res3;
/*
 * Interpolation weights for positions i = 0..7: mul_val0 holds (63 - i)
 * and mul_val1 holds (i + 1).  Later code subtracts/adds multiples of 8
 * to obtain the weights for subsequent groups of eight positions.
 */
2424 v8i16 mul_val0 = { 63, 62, 61, 60, 59, 58, 57, 56 };
2425 v8i16 mul_val1 = { 1, 2, 3, 4, 5, 6, 7, 8 };
2429 int hshift =
sps->hshift[c_idx];
2431 int size_in_luma_h = 32 << hshift;
2432 int size_in_tbs_h = size_in_luma_h >>
sps->log2_min_tb_size;
2433 int size_in_luma_v = 32 <<
vshift;
2434 int size_in_tbs_v = size_in_luma_v >>
sps->log2_min_tb_size;
2435 int x = x0 >> hshift;
2437 int x_tb = (x0 >>
sps->log2_min_tb_size) &
sps->tb_mask;
2438 int y_tb = (y0 >>
sps->log2_min_tb_size) &
sps->tb_mask;
2441 pps->min_tb_addr_zs[(y_tb) * (
sps->tb_mask + 2) + (x_tb)];
2443 ptrdiff_t
stride =
s->frame->linesize[c_idx] /
sizeof(uint8_t);
2444 uint8_t *
src = (uint8_t *)
s->frame->data[c_idx] + x + y *
stride;
2446 int min_pu_width =
sps->min_pu_width;
/*
 * Scratch storage for the left/top reference samples and their smoothed
 * copies.  Each array holds 2 * 32 + 1 bytes.  The working pointers below
 * start one byte in, so the index range -1 .. 63 used later (e.g. top[-1]
 * and top[63]) exactly fits the array.
 */
2451 uint8_t left_array[2 * 32 + 1];
2452 uint8_t filtered_left_array[2 * 32 + 1];
2453 uint8_t top_array[2 * 32 + 1];
2454 uint8_t filtered_top_array[2 * 32 + 1];
/* Working pointers, offset by one element so that [-1] is addressable. */
2456 uint8_t *
left = left_array + 1;
2457 uint8_t *top = top_array + 1;
2458 uint8_t *filtered_left = filtered_left_array + 1;
2459 uint8_t *filtered_top = filtered_top_array + 1;
2462 pps->min_tb_addr_zs[((y_tb + size_in_tbs_v) &
sps->tb_mask) *
2463 (
sps->tb_mask + 2) + (x_tb - 1)];
2469 pps->min_tb_addr_zs[(y_tb - 1) * (
sps->tb_mask + 2) +
2470 ((x_tb + size_in_tbs_h) &
sps->tb_mask)];
/*
 * bottom_left_size =
 *     (min(y0 + 2 * size_in_luma_v, sps->height) - (y0 + size_in_luma_v))
 *         >> vshift
 * i.e. the vertical extent below the block, clamped at sps->height
 * (presumably the picture height in luma samples) and then scaled down by
 * vshift.
 */
2472 int bottom_left_size =
2473 (((y0 + 2 * size_in_luma_v) >
2474 (
sps->height) ? (
sps->height) : (y0 +
2475 2 * size_in_luma_v)) -
2476 (y0 + size_in_luma_v)) >>
vshift;
/*
 * Same computation horizontally:
 *     (min(x0 + 2 * size_in_luma_h, sps->width) - (x0 + size_in_luma_h))
 *         >> hshift
 */
2477 int top_right_size =
2478 (((x0 + 2 * size_in_luma_h) >
2479 (
sps->width) ? (
sps->width) : (x0 + 2 * size_in_luma_h)) -
2480 (x0 + size_in_luma_h)) >> hshift;
2482 if (
pps->constrained_intra_pred_flag == 1) {
2483 int size_in_luma_pu_v = ((size_in_luma_v) >>
sps->log2_min_pu_size);
2484 int size_in_luma_pu_h = ((size_in_luma_h) >>
sps->log2_min_pu_size);
2485 int on_pu_edge_x = !(x0 & ((1 <<
sps->log2_min_pu_size) - 1));
2486 int on_pu_edge_y = !(y0 & ((1 <<
sps->log2_min_pu_size) - 1));
2487 if (!size_in_luma_pu_h)
2488 size_in_luma_pu_h++;
2489 if (cand_bottom_left == 1 && on_pu_edge_x) {
2490 int x_left_pu = ((x0 - 1) >>
sps->log2_min_pu_size);
2492 ((y0 + size_in_luma_v) >>
sps->log2_min_pu_size);
2494 ((size_in_luma_pu_v) >
2495 (
sps->min_pu_height -
2496 y_bottom_pu) ? (
sps->min_pu_height -
2497 y_bottom_pu) : (size_in_luma_pu_v));
2498 cand_bottom_left = 0;
2499 for (
i = 0;
i <
max;
i += 2)
2501 ((
s->cur_frame->tab_mvf[(x_left_pu) +
2503 i) * min_pu_width]).pred_flag ==
2506 if (cand_left == 1 && on_pu_edge_x) {
2507 int x_left_pu = ((x0 - 1) >>
sps->log2_min_pu_size);
2508 int y_left_pu = ((y0) >>
sps->log2_min_pu_size);
2510 ((size_in_luma_pu_v) >
2511 (
sps->min_pu_height -
2512 y_left_pu) ? (
sps->min_pu_height -
2513 y_left_pu) : (size_in_luma_pu_v));
2515 for (
i = 0;
i <
max;
i += 2)
2517 ((
s->cur_frame->tab_mvf[(x_left_pu) +
2519 i) * min_pu_width]).pred_flag ==
2522 if (cand_up_left == 1) {
2523 int x_left_pu = ((x0 - 1) >>
sps->log2_min_pu_size);
2524 int y_top_pu = ((y0 - 1) >>
sps->log2_min_pu_size);
2526 (
s->cur_frame->tab_mvf[(x_left_pu) +
2527 (y_top_pu) * min_pu_width]).pred_flag ==
2530 if (cand_up == 1 && on_pu_edge_y) {
2531 int x_top_pu = ((x0) >>
sps->log2_min_pu_size);
2532 int y_top_pu = ((y0 - 1) >>
sps->log2_min_pu_size);
2534 ((size_in_luma_pu_h) >
2535 (
sps->min_pu_width -
2536 x_top_pu) ? (
sps->min_pu_width -
2537 x_top_pu) : (size_in_luma_pu_h));
2539 for (
i = 0;
i <
max;
i += 2)
2541 ((
s->cur_frame->tab_mvf[(x_top_pu +
i) +
2543 min_pu_width]).pred_flag ==
PF_INTRA);
2545 if (cand_up_right == 1 && on_pu_edge_y) {
2546 int y_top_pu = ((y0 - 1) >>
sps->log2_min_pu_size);
2548 ((x0 + size_in_luma_h) >>
sps->log2_min_pu_size);
2550 ((size_in_luma_pu_h) >
2551 (
sps->min_pu_width -
2552 x_right_pu) ? (
sps->min_pu_width -
2553 x_right_pu) : (size_in_luma_pu_h));
2555 for (
i = 0;
i <
max;
i += 2)
2557 ((
s->cur_frame->tab_mvf[(x_right_pu +
i) +
2559 min_pu_width]).pred_flag ==
PF_INTRA);
2561 vec0 = (v16u8) __msa_ldi_b(128);
2564 ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2574 ST_UB2(vec0, vec1, top, 16);
2577 if (cand_up_right) {
2579 ST_UB2(vec0, vec1, (top + 32), 16);
2582 ((
src[(32 + top_right_size - 1) +
stride * (-1)]) *
2584 for (
i = 0;
i < (32 - top_right_size);
i += 4)
2590 for (
i = 0;
i < 32;
i++)
2592 if (cand_bottom_left) {
2593 for (
i = 32;
i < 32 + bottom_left_size;
i++)
2597 ((
src[(-1) +
stride * (32 + bottom_left_size - 1)]) *
2599 for (
i = 0;
i < (32 - bottom_left_size);
i += 4)
2605 if (
pps->constrained_intra_pred_flag == 1) {
2606 if (cand_bottom_left || cand_left || cand_up_left || cand_up
2609 x0 + ((2 * 32) << hshift) <
2610 sps->width ? 2 * 32 : (
sps->width - x0) >> hshift;
2612 y0 + ((2 * 32) <<
vshift) <
2614 int j = 32 + (cand_bottom_left ? bottom_left_size : 0) - 1;
2615 if (!cand_up_right) {
2616 size_max_x = x0 + ((32) << hshift) <
sps->width ?
2617 32 : (
sps->width - x0) >> hshift;
2619 if (!cand_bottom_left) {
2620 size_max_y = y0 + ((32) <<
vshift) <
sps->height ?
2623 if (cand_bottom_left || cand_left || cand_up_left) {
2626 !((
s->cur_frame->tab_mvf[(((x0 +
2627 ((-1) << hshift)) >>
sps->
2628 log2_min_pu_size)) + (((y0 +
2633 * min_pu_width]).pred_flag ==
2637 ((
s->cur_frame->tab_mvf[(((x0 +
2638 ((-1) << hshift)) >>
sps->
2639 log2_min_pu_size)) + (((y0 + ((j)
2644 * min_pu_width]).pred_flag ==
PF_INTRA)) {
2646 while (j < size_max_x
2648 !((
s->cur_frame->tab_mvf[(((x0 +
2649 ((j) << hshift)) >>
sps->
2650 log2_min_pu_size)) + (((y0 +
2655 * min_pu_width]).pred_flag ==
2658 for (
i = j;
i > (j) - (j + 1);
i--)
2660 ((
s->cur_frame->tab_mvf[(((x0 +
2662 1) << hshift)) >>
sps->
2663 log2_min_pu_size)) + (((y0 +
2668 * min_pu_width]).pred_flag ==
2670 top[
i - 1] = top[
i];
2675 while (j < size_max_x
2677 !((
s->cur_frame->tab_mvf[(((x0 +
2678 ((j) << hshift)) >>
sps->
2679 log2_min_pu_size)) + (((y0 + ((-1)
2684 * min_pu_width]).pred_flag ==
2689 for (
i = j;
i > (j) - (j + 1);
i--)
2691 ((
s->cur_frame->tab_mvf[(((x0 +
2694 sps->log2_min_pu_size))
2698 sps->log2_min_pu_size))
2700 min_pu_width]).pred_flag ==
2702 top[
i - 1] = top[
i];
2704 for (
i = j;
i > (j) - (j);
i--)
2706 ((
s->cur_frame->tab_mvf[(((x0 +
2709 sps->log2_min_pu_size))
2713 sps->log2_min_pu_size))
2715 min_pu_width]).pred_flag ==
2717 top[
i - 1] = top[
i];
2723 if (cand_bottom_left || cand_left) {
2724 a = ((
left[-1]) * 0x01010101U);
2725 for (
i = 0;
i < (0) + (size_max_y);
i += 4)
2727 ((
s->cur_frame->tab_mvf[(((x0 +
2728 ((-1) << hshift)) >>
sps->
2729 log2_min_pu_size)) + (((y0 +
2734 * min_pu_width]).pred_flag ==
2738 a = ((
left[
i + 3]) * 0x01010101U);
2741 vec0 = (v16u8) __msa_fill_b(
left[-1]);
2745 if (!cand_bottom_left) {
2746 vec0 = (v16u8) __msa_fill_b(
left[31]);
2750 if (x0 != 0 && y0 != 0) {
2751 a = ((
left[size_max_y - 1]) * 0x01010101U);
2752 for (
i = (size_max_y - 1);
2753 i > (size_max_y - 1) - (size_max_y);
i -= 4)
2755 ((
s->cur_frame->tab_mvf[(((x0 +
2756 ((-1) << hshift)) >>
sps->
2757 log2_min_pu_size)) + (((y0 +
2763 * min_pu_width]).pred_flag ==
2767 a = ((
left[
i - 3]) * 0x01010101U);
2769 ((
s->cur_frame->tab_mvf[(((x0 +
2770 ((-1) << hshift)) >>
sps->
2771 log2_min_pu_size)) + (((y0 + ((-1)
2776 * min_pu_width]).pred_flag ==
PF_INTRA))
2778 }
else if (x0 == 0) {
2780 uint32_t pix = ((0) * 0x01010101U);
2781 for (
i = 0;
i < (size_max_y);
i += 4)
2785 a = ((
left[size_max_y - 1]) * 0x01010101U);
2786 for (
i = (size_max_y - 1);
2787 i > (size_max_y - 1) - (size_max_y);
i -= 4)
2789 ((
s->cur_frame->tab_mvf[(((x0 +
2790 ((-1) << hshift)) >>
sps->
2791 log2_min_pu_size)) + (((y0 +
2797 * min_pu_width]).pred_flag ==
2801 a = ((
left[
i - 3]) * 0x01010101U);
2805 a = ((
left[-1]) * 0x01010101U);
2806 for (
i = 0;
i < (0) + (size_max_x);
i += 4)
2808 ((
s->cur_frame->tab_mvf[(((x0 +
2809 ((
i) << hshift)) >>
sps->
2810 log2_min_pu_size)) + (((y0 + ((-1)
2815 * min_pu_width]).pred_flag ==
2819 a = ((top[
i + 3]) * 0x01010101U);
2824 if (!cand_bottom_left) {
2826 vec0 = (v16u8) __msa_fill_b(
left[31]);
2829 }
else if (cand_up_left) {
2830 vec0 = (v16u8) __msa_fill_b(
left[-1]);
2835 }
else if (cand_up) {
2838 vec0 = (v16u8) __msa_fill_b(
left[-1]);
2844 }
else if (cand_up_right) {
2845 vec0 = (v16u8) __msa_fill_b(top[32]);
2847 ST_UB2(vec0, vec0, top, 16);
2859 vec0 = (v16u8) __msa_ldi_b(128);
2861 ST_UB4(vec0, vec0, vec0, vec0, top, 16);
2867 vec0 = (v16u8) __msa_fill_b(
left[32]);
2871 if (!cand_up_left) {
2875 vec0 = (v16u8) __msa_fill_b(
left[-1]);
2877 ST_UB2(vec0, vec0, top, 16);
2879 if (!cand_up_right) {
2880 vec0 = (v16u8) __msa_fill_b(top[31]);
2882 ST_UB2(vec0, vec0, (top + 32), 16);
/*
 * Reference-sample smoothing.  Only considered when the SPS does not
 * disable intra smoothing and the plane is either c_idx 0 or the stream
 * uses chroma_format_idc 3.
 */
2888 if (!
sps->intra_smoothing_disabled
2889 && (c_idx == 0 ||
sps->chroma_format_idc == 3)) {
/* Distance thresholds; this code path indexes entry [5 - 3], i.e. 0. */
2891 int intra_hor_ver_dist_thresh[] = { 7, 1, 0 };
/*
 * min(|mode - 26|, |mode - 10|), written out as nested conditional
 * expressions: the smaller of the two absolute differences is selected.
 */
2892 int min_dist_vert_hor =
2893 (((((int) (
mode - 26
U)) >=
2894 0 ? ((
int) (
mode - 26
U)) : (-((int) (
mode - 26
U))))) >
2895 ((((int) (
mode - 10
U)) >=
2896 0 ? ((
int) (
mode - 10
U)) : (-((int) (
mode - 10
U)))))
2897 ? ((((int) (
mode - 10
U)) >=
2898 0 ? ((
int) (
mode - 10
U)) : (-((int) (
mode - 10
U)))))
2899 : ((((int) (
mode - 26
U)) >=
2900 0 ? ((
int) (
mode - 26
U)) : (-((int) (
mode - 26
U))))));
/* Filter only when the mode is further than the threshold from both 10 and 26. */
2901 if (min_dist_vert_hor > intra_hor_ver_dist_thresh[5 - 3]) {
/* 1 << (8 - 5) == 8: flatness limit used by the test below. */
2902 int threshold = 1 << (8 - 5);
/*
 * Strong-smoothing test: requires the SPS flag and
 * |top[-1] + top[63] - 2 * top[31]| < threshold, plus a second test that
 * ends in "2 * left[31]" (presumably the same expression on the left
 * edge; parts of the condition are not visible in this view).
 */
2903 if (
sps->strong_intra_smoothing_enabled
2905 && ((top[-1] + top[63] - 2 * top[31]) >=
2906 0 ? (top[-1] + top[63] -
2907 2 * top[31]) : (-(top[-1] + top[63] -
2908 2 * top[31]))) < threshold
2912 2 *
left[31]))) < threshold) {
/* The two end points of the top row are kept as they are. */
2915 filtered_top[-1] = top[-1];
2916 filtered_top[63] = top[63];
/*
 * Scalar linear interpolation between top[-1] and top[63] with weights
 * (63 - i) and (i + 1), rounded by + 32 and shifted right by 6.  The
 * destination of the assignment is not visible in this view.
 * NOTE(review): the vector code that follows also stores to filtered_top
 * and filtered_top + 32, so this loop looks redundant - confirm before
 * removing.
 */
2919 for (
i = 0;
i < 63;
i++) {
2921 ((63 -
i) * top[-1] + (
i + 1) * top[63] + 32) >> 6;
/*
 * Vector form of the interpolation between top[-1] and top[63].
 * tmp0/tmp1 broadcast the two end points to every 16-bit lane.
 */
2924 tmp0 = __msa_fill_h(top[-1]);
2925 tmp1 = __msa_fill_h(top[63]);
/*
 * Weights for the next groups of eight positions: tmp2..tmp4 continue the
 * (63 - i) sequence of mul_val0, tmp5..tmp7 continue the (i + 1) sequence
 * of mul_val1.
 */
2927 tmp2 = mul_val0 - 8;
2928 tmp3 = mul_val0 - 16;
2929 tmp4 = mul_val0 - 24;
2930 tmp5 = mul_val1 + 8;
2931 tmp6 = mul_val1 + 16;
2932 tmp7 = mul_val1 + 24;
/*
 * Weighted sum: the top[-1] term first, then the top[63] term is added.
 * The initialisation of res1..res3 is not visible in this view
 * (presumably tmp2..tmp4 * tmp0).
 */
2934 res0 = mul_val0 * tmp0;
2938 res0 += mul_val1 * tmp1;
2939 res1 += tmp5 * tmp1;
2940 res2 += tmp6 * tmp1;
2941 res3 += tmp7 * tmp1;
/*
 * Shift right by 6; srari is the rounding variant per the MSA
 * documentation, which corresponds to the "+ 32) >> 6" of the scalar
 * formula.
 */
2943 res0 = __msa_srari_h(res0, 6);
2944 res1 = __msa_srari_h(res1, 6);
2945 res2 = __msa_srari_h(res2, 6);
2946 res3 = __msa_srari_h(res3, 6);
/* Narrow the 16-bit results to bytes, two result vectors per output vector. */
2948 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2949 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
/* Presumably writes vec0 at filtered_top and vec1 16 bytes later (positions 0..31). */
2951 ST_UB2(vec0, vec1, filtered_top, 16);
/*
 * Second half: same scheme with the weight offsets advanced by 32.  Note
 * that res0 and res3 are used here as holders for weights before they
 * receive results; the statements in between are not all visible in this
 * view.
 */
2953 res0 = mul_val0 - 32;
2954 tmp2 = mul_val0 - 40;
2955 tmp3 = mul_val0 - 48;
2956 tmp4 = mul_val0 - 56;
2957 res3 = mul_val1 + 32;
2958 tmp5 = mul_val1 + 40;
2959 tmp6 = mul_val1 + 48;
2960 tmp7 = mul_val1 + 56;
2965 res0 += res3 * tmp1;
2967 res1 += tmp5 * tmp1;
2968 res2 += tmp6 * tmp1;
2969 res3 += tmp7 * tmp1;
2971 res0 = __msa_srari_h(res0, 6);
2972 res1 = __msa_srari_h(res1, 6);
2973 res2 = __msa_srari_h(res2, 6);
2974 res3 = __msa_srari_h(res3, 6);
2976 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
2977 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
/* Presumably positions 32..63. */
2979 ST_UB2(vec0, vec1, (filtered_top + 32), 16);
/* Re-assert the exact end point after the vector stores. */
2981 filtered_top[63] = top[63];
/*
 * Same vector interpolation as for the top row, now between left[-1] and
 * left[63].  The stores of vec0/vec1 are not visible in this view.
 */
2983 tmp0 = __msa_fill_h(
left[-1]);
2984 tmp1 = __msa_fill_h(
left[63]);
/* Weights for positions 8..31, derived from mul_val0 / mul_val1. */
2986 tmp2 = mul_val0 - 8;
2987 tmp3 = mul_val0 - 16;
2988 tmp4 = mul_val0 - 24;
2989 tmp5 = mul_val1 + 8;
2990 tmp6 = mul_val1 + 16;
2991 tmp7 = mul_val1 + 24;
/*
 * Weighted sum of the two end points.  The initialisation of res1..res3
 * is not visible in this view.
 */
2993 res0 = mul_val0 * tmp0;
2997 res0 += mul_val1 * tmp1;
2998 res1 += tmp5 * tmp1;
2999 res2 += tmp6 * tmp1;
3000 res3 += tmp7 * tmp1;
/* Shift right by 6 (rounding variant per the MSA documentation). */
3002 res0 = __msa_srari_h(res0, 6);
3003 res1 = __msa_srari_h(res1, 6);
3004 res2 = __msa_srari_h(res2, 6);
3005 res3 = __msa_srari_h(res3, 6);
/* Narrow the 16-bit results to bytes. */
3007 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3008 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
/*
 * Second half with the weight offsets advanced by 32; res0 and res3
 * temporarily hold weights before receiving results.
 */
3012 res0 = mul_val0 - 32;
3013 tmp2 = mul_val0 - 40;
3014 tmp3 = mul_val0 - 48;
3015 tmp4 = mul_val0 - 56;
3016 res3 = mul_val1 + 32;
3017 tmp5 = mul_val1 + 40;
3018 tmp6 = mul_val1 + 48;
3019 tmp7 = mul_val1 + 56;
3024 res0 += res3 * tmp1;
3026 res1 += tmp5 * tmp1;
3027 res2 += tmp6 * tmp1;
3028 res3 += tmp7 * tmp1;
3030 res0 = __msa_srari_h(res0, 6);
3031 res1 = __msa_srari_h(res1, 6);
3032 res2 = __msa_srari_h(res2, 6);
3033 res3 = __msa_srari_h(res3, 6);
3035 vec0 = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
3036 vec1 = (v16u8) __msa_pckev_b((v16i8) res3, (v16i8) res2);
/*
 * Regular smoothing path.  The last sample (index 2 * 32 - 1) of each
 * edge is copied unfiltered.
 */
3044 filtered_left[2 * 32 - 1] =
left[2 * 32 - 1];
3045 filtered_top[2 * 32 - 1] = top[2 * 32 - 1];
/*
 * Left edge, index 2 * 32 - 2 down to 0.  Only the tail of the statement
 * is visible in this view; presumably the same 3-tap form as the top loop
 * below.
 */
3046 for (
i = 2 * 32 - 2;
i >= 0;
i--)
3048 left[
i - 1] + 2) >> 2;
/*
 * Corner value: (left[0] + 2 * left[-1] + top[0] + 2) >> 2.  The
 * destination of this assignment is not visible in this view.
 */
3051 (
left[0] + 2 *
left[-1] + top[0] + 2) >> 2;
/*
 * Top edge: 3-tap (1, 2, 1) filter with rounding,
 * (top[i + 1] + 2 * top[i] + top[i - 1] + 2) >> 2; at i == 0 this reads
 * top[-1].
 */
3052 for (
i = 2 * 32 - 2;
i >= 0;
i--)
3053 filtered_top[
i] = (top[
i + 1] + 2 * top[
i] +
3054 top[
i - 1] + 2) >> 2;
/* From here on, left refers to the smoothed copy. */
3055 left = filtered_left;
3064 s->hpc.pred_planar[3] ((uint8_t *)
src, (uint8_t *) top,
3068 s->hpc.pred_dc((uint8_t *)
src, (uint8_t *) top,
3072 s->hpc.pred_angular[3] ((uint8_t *)
src, (uint8_t *) top,