37 int32_t dp00, dq00, dp30, dq30, d00, d30;
39 int32_t dp04, dq04, dp34, dq34, d04, d34;
40 int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
42 uint64_t dst_val0, dst_val1;
43 v16u8 dst0, dst1, dst2, dst3, dst4, dst5;
44 v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
49 v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
51 v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
53 dp00 =
abs(p2[0] - (p1[0] << 1) + p0[0]);
54 dq00 =
abs(q2[0] - (
q1[0] << 1) +
q0[0]);
55 dp30 =
abs(p2[3] - (p1[3] << 1) + p0[3]);
56 dq30 =
abs(q2[3] - (
q1[3] << 1) +
q0[3]);
59 dp04 =
abs(p2[4] - (p1[4] << 1) + p0[4]);
60 dq04 =
abs(q2[4] - (
q1[4] << 1) +
q0[4]);
61 dp34 =
abs(p2[7] - (p1[7] << 1) + p0[7]);
62 dq34 =
abs(q2[7] - (
q1[7] << 1) +
q0[7]);
66 p_is_pcm0 = p_is_pcm[0];
67 p_is_pcm4 = p_is_pcm[1];
68 q_is_pcm0 = q_is_pcm[0];
69 q_is_pcm4 = q_is_pcm[1];
71 cmp0 = __msa_fill_d(p_is_pcm0);
72 cmp1 = __msa_fill_d(p_is_pcm4);
73 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
74 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
76 d0030 = (d00 + d30) >= beta;
77 d0434 = (d04 + d34) >= beta;
79 cmp0 = (v2i64) __msa_fill_w(d0030);
80 cmp1 = (v2i64) __msa_fill_w(d0434);
81 cmp3 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0);
82 cmp3 = (v2i64) __msa_ceqi_w((v4i32) cmp3, 0);
84 if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
91 cmp0 = __msa_fill_d(q_is_pcm0);
92 cmp1 = __msa_fill_d(q_is_pcm4);
93 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
94 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
99 tc250 = ((tc0 * 5 + 1) >> 1);
101 tc254 = ((tc4 * 5 + 1) >> 1);
103 cmp0 = (v2i64) __msa_fill_h(tc0);
104 cmp1 = (v2i64) __msa_fill_h(tc4);
107 p3_src, p2_src, p1_src, p0_src);
113 flag0 =
abs(p3[0] - p0[0]) +
abs(q3[0] -
q0[0]) < beta30 &&
114 abs(p0[0] -
q0[0]) < tc250;
115 flag0 = flag0 && (
abs(p3[3] - p0[3]) +
abs(q3[3] -
q0[3]) < beta30 &&
116 abs(p0[3] -
q0[3]) < tc250 && (d00 << 1) < beta20 &&
117 (d30 << 1) < beta20);
119 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
121 q0_src, q1_src, q2_src, q3_src);
122 flag1 =
abs(p3[4] - p0[4]) +
abs(q3[4] -
q0[4]) < beta30 &&
123 abs(p0[4] -
q0[4]) < tc254;
124 flag1 = flag1 && (
abs(p3[7] - p0[7]) +
abs(q3[7] -
q0[7]) < beta30 &&
125 abs(p0[7] -
q0[7]) < tc254 && (d04 << 1) < beta20 &&
126 (d34 << 1) < beta20);
128 cmp0 = (v2i64) __msa_fill_w(flag0);
129 cmp1 = (v2i64) __msa_fill_w(flag1);
130 cmp2 = (v2i64) __msa_ilvev_w((v4i32) cmp1, (v4i32) cmp0);
131 cmp2 = (v2i64) __msa_ceqi_w((v4i32) cmp2, 0);
133 if (flag0 && flag1) {
139 temp0 = (p1_src + p0_src + q0_src);
140 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
141 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
142 temp2 = (v8i16) (temp1 - p2_src);
143 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
144 dst0 = (v16u8) (temp2 + (v8i16) p2_src);
146 temp1 = temp0 + p2_src;
147 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
148 temp2 = (v8i16) (temp1 - p1_src);
149 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
150 dst1 = (v16u8) (temp2 + (v8i16) p1_src);
152 temp1 = (temp0 << 1) + p2_src + q1_src;
153 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
154 temp2 = (v8i16) (temp1 - p0_src);
155 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
156 dst2 = (v16u8) (temp2 + (v8i16) p0_src);
158 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
159 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
160 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
163 temp0 = (q1_src + p0_src + q0_src);
165 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
166 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
167 temp2 = (v8i16) (temp1 - q2_src);
168 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
169 dst5 = (v16u8) (temp2 + (v8i16) q2_src);
171 temp1 = temp0 + q2_src;
172 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
173 temp2 = (v8i16) (temp1 - q1_src);
174 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
175 dst4 = (v16u8) (temp2 + (v8i16) q1_src);
177 temp1 = (temp0 << 1) + p1_src + q2_src;
178 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
179 temp2 = (v8i16) (temp1 - q0_src);
180 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
181 dst3 = (v16u8) (temp2 + (v8i16) q0_src);
183 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
184 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
185 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
189 dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
192 PCKEV_B2_UB(p1_src, p2_src, q0_src, p0_src, dst3, dst4);
193 dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) q1_src);
195 dst0 = __msa_bmz_v(dst0, dst3, (v16u8) cmp3);
196 dst1 = __msa_bmz_v(dst1, dst4, (v16u8) cmp3);
197 dst2 = __msa_bmz_v(dst2, dst5, (v16u8) cmp3);
199 dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
200 dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
206 }
else if (flag0 == flag1) {
210 diff0 = (v8i16) (q0_src - p0_src);
211 diff1 = (v8i16) (q1_src - p1_src);
212 diff0 = (diff0 << 3) + diff0;
213 diff1 = (diff1 << 1) + diff1;
214 delta0 = diff0 - diff1;
215 delta0 = __msa_srari_h(delta0, 4);
217 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
218 abs_delta0 = __msa_add_a_h(delta0, (v8i16)
zero);
219 abs_delta0 = (v8u16) abs_delta0 < temp1;
221 delta0 =
CLIP_SH(delta0, tc_neg, tc_pos);
223 temp0 = (v8u16) (delta0 + p0_src);
225 temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
226 (v16u8) p_is_pcm_vec);
228 temp2 = (v8i16) (q0_src - delta0);
230 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
231 (v16u8) q_is_pcm_vec);
233 p_is_pcm_vec = ~p_is_pcm_vec;
234 q_is_pcm_vec = ~q_is_pcm_vec;
235 tmp = (beta + (beta >> 1)) >> 3;
236 cmp0 = __msa_fill_d(dp00 + dp30 <
tmp);
237 cmp1 = __msa_fill_d(dp04 + dp34 <
tmp);
238 cmp0 = __msa_ilvev_d(cmp1, cmp0);
239 cmp0 = __msa_ceqi_d(cmp0, 0);
240 p_is_pcm_vec = p_is_pcm_vec | cmp0;
242 cmp0 = __msa_fill_d(dq00 + dq30 <
tmp);
243 cmp1 = __msa_fill_d(dq04 + dq34 <
tmp);
244 cmp0 = __msa_ilvev_d(cmp1, cmp0);
245 cmp0 = __msa_ceqi_d(cmp0, 0);
246 q_is_pcm_vec = q_is_pcm_vec | cmp0;
251 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
252 delta1 -= (v8i16) p1_src;
255 delta1 =
CLIP_SH(delta1, tc_neg, tc_pos);
256 delta1 = (v8i16) p1_src + (v8i16) delta1;
258 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
259 (v16u8) p_is_pcm_vec);
261 delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
262 delta2 = delta2 - (v8i16) q1_src;
263 delta2 = delta2 - delta0;
264 delta2 = delta2 >> 1;
265 delta2 =
CLIP_SH(delta2, tc_neg, tc_pos);
266 delta2 = (v8i16) q1_src + (v8i16) delta2;
268 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
269 (v16u8) q_is_pcm_vec);
271 dst1 = (v16u8) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
273 dst2 = (v16u8) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
275 dst3 = (v16u8) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
277 dst4 = (v16u8) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
283 PCKEV_B2_UB(p0_src, p1_src, q1_src, q0_src, dst2, dst3);
285 dst0 = __msa_bmz_v(dst0, dst2, (v16u8) cmp3);
286 dst1 = __msa_bmz_v(dst1, dst3, (v16u8) cmp3);
297 temp0 = (p1_src + p0_src + q0_src);
298 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
299 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
300 temp2 = (v8i16) (temp1 - p2_src);
301 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
302 dst0 = (v16u8) (temp2 + (v8i16) p2_src);
304 temp1 = temp0 + p2_src;
305 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
306 temp2 = (v8i16) (temp1 - p1_src);
307 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
308 dst1 = (v16u8) (temp2 + (v8i16) p1_src);
310 temp1 = (temp0 << 1) + p2_src + q1_src;
311 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
312 temp2 = (v8i16) (temp1 - p0_src);
313 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
314 dst2 = (v16u8) (temp2 + (v8i16) p0_src);
316 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
317 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
318 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
321 temp0 = (q1_src + p0_src + q0_src);
323 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
324 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
325 temp2 = (v8i16) (temp1 - q2_src);
326 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
327 dst5 = (v16u8) (temp2 + (v8i16) q2_src);
329 temp1 = temp0 + q2_src;
330 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
331 temp2 = (v8i16) (temp1 - q1_src);
332 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
333 dst4 = (v16u8) (temp2 + (v8i16) q1_src);
335 temp1 = (temp0 << 1) + p1_src + q2_src;
336 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
337 temp2 = (v8i16) (temp1 - q0_src);
338 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
339 dst3 = (v16u8) (temp2 + (v8i16) q0_src);
341 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
342 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
343 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
347 dst2 = (v16u8) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
354 diff0 = (v8i16) (q0_src - p0_src);
355 diff1 = (v8i16) (q1_src - p1_src);
356 diff0 = (diff0 << 3) + diff0;
357 diff1 = (diff1 << 1) + diff1;
358 delta0 = diff0 - diff1;
359 delta0 = __msa_srari_h(delta0, 4);
361 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
362 abs_delta0 = __msa_add_a_h(delta0, (v8i16)
zero);
363 abs_delta0 = (v8u16) abs_delta0 < temp1;
365 delta0 =
CLIP_SH(delta0, tc_neg, tc_pos);
367 temp0 = (v8u16) (delta0 + p0_src);
369 temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
370 (v16u8) p_is_pcm_vec);
372 temp2 = (v8i16) (q0_src - delta0);
374 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
375 (v16u8) q_is_pcm_vec);
377 p_is_pcm_vec = ~p_is_pcm_vec;
378 q_is_pcm_vec = ~q_is_pcm_vec;
379 tmp = (beta + (beta >> 1)) >> 3;
380 cmp0 = __msa_fill_d(dp00 + dp30 <
tmp);
381 cmp1 = __msa_fill_d(dp04 + dp34 <
tmp);
382 cmp0 = __msa_ilvev_d(cmp1, cmp0);
383 p_is_pcm_vec = p_is_pcm_vec | __msa_ceqi_d(cmp0, 0);
385 cmp0 = __msa_fill_d(dq00 + dq30 <
tmp);
386 cmp1 = __msa_fill_d(dq04 + dq34 <
tmp);
387 cmp0 = __msa_ilvev_d(cmp1, cmp0);
388 q_is_pcm_vec = q_is_pcm_vec | __msa_ceqi_d(cmp0, 0);
393 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
394 delta1 -= (v8i16) p1_src;
397 delta1 =
CLIP_SH(delta1, tc_neg, tc_pos);
398 delta1 = (v8i16) p1_src + (v8i16) delta1;
400 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
401 (v16u8) p_is_pcm_vec);
403 delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
404 delta2 = delta2 - (v8i16) q1_src;
405 delta2 = delta2 - delta0;
406 delta2 = delta2 >> 1;
407 delta2 =
CLIP_SH(delta2, tc_neg, tc_pos);
408 delta2 = (v8i16) q1_src + (v8i16) delta2;
410 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
411 (v16u8) q_is_pcm_vec);
413 delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
415 temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
417 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
419 delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
424 PCKEV_B2_UB(delta1, p2_src, temp2, temp0, dst3, dst4);
425 dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) delta2);
428 dst0 = __msa_bmnz_v(dst0, dst3, (v16u8) cmp2);
429 dst1 = __msa_bmnz_v(dst1, dst4, (v16u8) cmp2);
430 dst2 = __msa_bmnz_v(dst2, dst5, (v16u8) cmp2);
433 PCKEV_B2_UB(p1_src, p2_src, q0_src, p0_src, dst3, dst4);
434 dst5 = (v16u8) __msa_pckev_b((v16i8) q2_src, (v16i8) q1_src);
436 dst0 = __msa_bmz_v(dst0, dst3, (v16u8) cmp3);
437 dst1 = __msa_bmz_v(dst1, dst4, (v16u8) cmp3);
438 dst2 = __msa_bmz_v(dst2, dst5, (v16u8) cmp3);
440 dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
441 dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
461 int32_t dp00, dq00, dp30, dq30, d00, d30;
463 int32_t dp04, dq04, dp34, dq34, d04, d34;
464 int32_t tc0, p_is_pcm0, q_is_pcm0, beta30, beta20, tc250;
465 int32_t tc4, p_is_pcm4, q_is_pcm4, tc254,
tmp;
466 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
467 v2i64 cmp0, cmp1, cmp2, p_is_pcm_vec, q_is_pcm_vec;
471 v8i16 tc_pos, tc_neg;
472 v8i16 diff0, diff1, delta0, delta1, delta2, abs_delta0;
474 v8u16 p3_src, p2_src, p1_src, p0_src, q0_src, q1_src, q2_src, q3_src;
476 dp00 =
abs(p3[-3] - (p3[-2] << 1) + p3[-1]);
477 dq00 =
abs(p3[2] - (p3[1] << 1) + p3[0]);
478 dp30 =
abs(p2[-3] - (p2[-2] << 1) + p2[-1]);
479 dq30 =
abs(p2[2] - (p2[1] << 1) + p2[0]);
482 p_is_pcm0 = p_is_pcm[0];
483 q_is_pcm0 = q_is_pcm[0];
485 dp04 =
abs(p1[-3] - (p1[-2] << 1) + p1[-1]);
486 dq04 =
abs(p1[2] - (p1[1] << 1) + p1[0]);
487 dp34 =
abs(p0[-3] - (p0[-2] << 1) + p0[-1]);
488 dq34 =
abs(p0[2] - (p0[1] << 1) + p0[0]);
491 p_is_pcm4 = p_is_pcm[1];
492 q_is_pcm4 = q_is_pcm[1];
494 cmp0 = __msa_fill_d(p_is_pcm0);
495 cmp1 = __msa_fill_d(p_is_pcm4);
496 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
497 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
499 d0030 = (d00 + d30) >= beta;
500 d0434 = (d04 + d34) >= beta;
502 cmp0 = __msa_fill_d(d0030);
503 cmp1 = __msa_fill_d(d0434);
504 cmp3 = __msa_ilvev_d(cmp1, cmp0);
505 cmp3 = (v2i64) __msa_ceqi_d(cmp3, 0);
507 if ((!p_is_pcm0 || !p_is_pcm4 || !q_is_pcm0 || !q_is_pcm4) &&
508 (!d0030 || !d0434)) {
513 cmp0 = __msa_fill_d(q_is_pcm0);
514 cmp1 = __msa_fill_d(q_is_pcm4);
515 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
516 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
521 tc250 = ((tc0 * 5 + 1) >> 1);
524 tc254 = ((tc4 * 5 + 1) >> 1);
525 cmp0 = (v2i64) __msa_fill_h(tc0 << 1);
526 cmp1 = (v2i64) __msa_fill_h(tc4 << 1);
527 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
530 q2_src, q3_src, p3_src, p2_src, p1_src, p0_src,
531 q0_src, q1_src, q2_src, q3_src);
533 flag0 =
abs(p3[-4] - p3[-1]) +
abs(p3[3] - p3[0]) < beta30 &&
534 abs(p3[-1] - p3[0]) < tc250;
535 flag0 = flag0 && (
abs(p2[-4] - p2[-1]) +
abs(p2[3] - p2[0]) < beta30 &&
536 abs(p2[-1] - p2[0]) < tc250 && (d00 << 1) < beta20 &&
537 (d30 << 1) < beta20);
538 cmp0 = __msa_fill_d(flag0);
540 p3_src, p2_src, p1_src, p0_src);
542 flag1 =
abs(p1[-4] - p1[-1]) +
abs(p1[3] - p1[0]) < beta30 &&
543 abs(p1[-1] - p1[0]) < tc254;
544 flag1 = flag1 && (
abs(p0[-4] - p0[-1]) +
abs(p0[3] - p0[0]) < beta30 &&
545 abs(p0[-1] - p0[0]) < tc254 && (d04 << 1) < beta20 &&
546 (d34 << 1) < beta20);
548 q0_src, q1_src, q2_src, q3_src);
550 cmp1 = __msa_fill_d(flag1);
551 cmp2 = __msa_ilvev_d(cmp1, cmp0);
552 cmp2 = __msa_ceqi_d(cmp2, 0);
554 if (flag0 && flag1) {
559 temp0 = (p1_src + p0_src + q0_src);
561 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
562 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
563 temp2 = (v8i16) (temp1 - p2_src);
564 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
565 dst0 = (v16u8) (temp2 + (v8i16) p2_src);
567 temp1 = temp0 + p2_src;
568 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
569 temp2 = (v8i16) (temp1 - p1_src);
570 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
571 dst1 = (v16u8) (temp2 + (v8i16) p1_src);
573 temp1 = (temp0 << 1) + p2_src + q1_src;
574 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
575 temp2 = (v8i16) (temp1 - p0_src);
576 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
577 dst2 = (v16u8) (temp2 + (v8i16) p0_src);
579 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
580 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
581 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
584 temp0 = (q1_src + p0_src + q0_src);
585 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
586 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
587 temp2 = (v8i16) (temp1 - q2_src);
588 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
589 dst5 = (v16u8) (temp2 + (v8i16) q2_src);
591 temp1 = temp0 + q2_src;
592 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
593 temp2 = (v8i16) (temp1 - q1_src);
594 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
595 dst4 = (v16u8) (temp2 + (v8i16) q1_src);
597 temp1 = (temp0 << 1) + p1_src + q2_src;
598 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
599 temp2 = (v8i16) (temp1 - q0_src);
600 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
601 dst3 = (v16u8) (temp2 + (v8i16) q0_src);
603 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
604 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
605 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
607 }
else if (flag0 == flag1) {
612 diff0 = (v8i16) (q0_src - p0_src);
613 diff1 = (v8i16) (q1_src - p1_src);
614 diff0 = (diff0 << 3) + diff0;
615 diff1 = (diff1 << 1) + diff1;
616 delta0 = diff0 - diff1;
617 delta0 = __msa_srari_h(delta0, 4);
619 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
620 abs_delta0 = __msa_add_a_h(delta0, (v8i16)
zero);
621 abs_delta0 = (v8u16) abs_delta0 < temp1;
623 delta0 =
CLIP_SH(delta0, tc_neg, tc_pos);
624 temp0 = (v8u16) (delta0 + p0_src);
626 temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
627 (v16u8) p_is_pcm_vec);
629 temp2 = (v8i16) (q0_src - delta0);
631 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
632 (v16u8) q_is_pcm_vec);
634 tmp = ((beta + (beta >> 1)) >> 3);
635 cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) <
tmp));
636 cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) <
tmp));
637 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
638 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
640 cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 <
tmp));
641 cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 <
tmp));
642 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
643 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
648 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
649 delta1 -= (v8i16) p1_src;
652 delta1 =
CLIP_SH(delta1, tc_neg, tc_pos);
653 delta1 = (v8i16) p1_src + (v8i16) delta1;
655 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
656 (v16u8) p_is_pcm_vec);
658 delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
659 delta2 = delta2 - (v8i16) q1_src;
660 delta2 = delta2 - delta0;
661 delta2 = delta2 >> 1;
662 delta2 =
CLIP_SH(delta2, tc_neg, tc_pos);
663 delta2 = (v8i16) q1_src + (v8i16) delta2;
665 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
666 (v16u8) q_is_pcm_vec);
668 dst0 = __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
670 dst1 = __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
672 dst2 = __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
674 dst3 = __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
678 dst0 = __msa_bmz_v(dst0, (v16u8) p1_src, (v16u8) cmp3);
679 dst1 = __msa_bmz_v(dst1, (v16u8) p0_src, (v16u8) cmp3);
680 dst2 = __msa_bmz_v(dst2, (v16u8) q0_src, (v16u8) cmp3);
681 dst3 = __msa_bmz_v(dst3, (v16u8) q1_src, (v16u8) cmp3);
691 tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
692 tmp3 = __msa_copy_u_w((v4i32) dst0, 1);
698 tmp2 = __msa_copy_u_w((v4i32) dst0, 2);
699 tmp3 = __msa_copy_u_w((v4i32) dst0, 3);
705 tmp2 = __msa_copy_u_w((v4i32) dst1, 0);
706 tmp3 = __msa_copy_u_w((v4i32) dst1, 1);
712 tmp2 = __msa_copy_u_w((v4i32) dst1, 2);
713 tmp3 = __msa_copy_u_w((v4i32) dst1, 3);
724 temp0 = (p1_src + p0_src + q0_src);
726 temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
727 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
728 temp2 = (v8i16) (temp1 - p2_src);
729 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
730 dst0 = (v16u8) (temp2 + (v8i16) p2_src);
732 temp1 = temp0 + p2_src;
733 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
734 temp2 = (v8i16) (temp1 - p1_src);
735 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
736 dst1 = (v16u8) (temp2 + (v8i16) p1_src);
738 temp1 = (temp0 << 1) + p2_src + q1_src;
739 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
740 temp2 = (v8i16) (temp1 - p0_src);
741 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
742 dst2 = (v16u8) (temp2 + (v8i16) p0_src);
744 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
745 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) p_is_pcm_vec);
746 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) p_is_pcm_vec);
749 temp0 = (q1_src + p0_src + q0_src);
750 temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
751 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
752 temp2 = (v8i16) (temp1 - q2_src);
753 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
754 dst5 = (v16u8) (temp2 + (v8i16) q2_src);
756 temp1 = temp0 + q2_src;
757 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
758 temp2 = (v8i16) (temp1 - q1_src);
759 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
760 dst4 = (v16u8) (temp2 + (v8i16) q1_src);
762 temp1 = (temp0 << 1) + p1_src + q2_src;
763 temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
764 temp2 = (v8i16) (temp1 - q0_src);
765 temp2 =
CLIP_SH(temp2, tc_neg, tc_pos);
766 dst3 = (v16u8) (temp2 + (v8i16) q0_src);
768 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
769 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) q_is_pcm_vec);
770 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) q_is_pcm_vec);
777 diff0 = (v8i16) (q0_src - p0_src);
778 diff1 = (v8i16) (q1_src - p1_src);
779 diff0 = (diff0 << 3) + diff0;
780 diff1 = (diff1 << 1) + diff1;
781 delta0 = diff0 - diff1;
782 delta0 = __msa_srari_h(delta0, 4);
784 temp1 = (v8u16) ((tc_pos << 3) + (tc_pos << 1));
785 abs_delta0 = __msa_add_a_h(delta0, (v8i16)
zero);
786 abs_delta0 = (v8u16) abs_delta0 < temp1;
788 delta0 =
CLIP_SH(delta0, tc_neg, tc_pos);
790 temp0 = (v8u16) (delta0 + p0_src);
792 temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
793 (v16u8) p_is_pcm_vec);
795 temp2 = (v8i16) (q0_src - delta0);
797 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
798 (v16u8) q_is_pcm_vec);
800 tmp = (beta + (beta >> 1)) >> 3;
801 cmp0 = __msa_fill_d(!p_is_pcm0 && ((dp00 + dp30) <
tmp));
802 cmp1 = __msa_fill_d(!p_is_pcm4 && ((dp04 + dp34) <
tmp));
803 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
804 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
806 cmp0 = (v2i64) __msa_fill_h((!q_is_pcm0) && (dq00 + dq30 <
tmp));
807 cmp1 = (v2i64) __msa_fill_h((!q_is_pcm4) && (dq04 + dq34 <
tmp));
808 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
809 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
814 delta1 = (v8i16) __msa_aver_u_h(p2_src, p0_src);
815 delta1 -= (v8i16) p1_src;
818 delta1 =
CLIP_SH(delta1, tc_neg, tc_pos);
819 delta1 = (v8i16) p1_src + (v8i16) delta1;
821 delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
822 (v16u8) p_is_pcm_vec);
824 delta2 = (v8i16) __msa_aver_u_h(q0_src, q2_src);
825 delta2 = delta2 - (v8i16) q1_src;
826 delta2 = delta2 - delta0;
827 delta2 = delta2 >> 1;
828 delta2 =
CLIP_SH(delta2, tc_neg, tc_pos);
829 delta2 = (v8i16) q1_src + (v8i16) delta2;
831 delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
832 (v16u8) q_is_pcm_vec);
833 delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
835 temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
837 temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
839 delta2 = (v8i16) __msa_bmz_v((v16u8) delta2, (v16u8) q1_src,
844 dst2 = __msa_bmnz_v(dst2, (v16u8) temp0, (v16u8) cmp2);
845 dst3 = __msa_bmnz_v(dst3, (v16u8) temp2, (v16u8) cmp2);
846 dst1 = __msa_bmnz_v(dst1, (v16u8) delta1, (v16u8) cmp2);
847 dst4 = __msa_bmnz_v(dst4, (v16u8) delta2, (v16u8) cmp2);
848 dst0 = __msa_bmnz_v(dst0, (v16u8) p2_src, (v16u8) cmp2);
849 dst5 = __msa_bmnz_v(dst5, (v16u8) q2_src, (v16u8) cmp2);
852 dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) cmp3);
853 dst1 = __msa_bmz_v(dst1, (v16u8) p1_src, (v16u8) cmp3);
854 dst2 = __msa_bmz_v(dst2, (v16u8) p0_src, (v16u8) cmp3);
855 dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) cmp3);
856 dst4 = __msa_bmz_v(dst4, (v16u8) q1_src, (v16u8) cmp3);
857 dst5 = __msa_bmz_v(dst5, (v16u8) q2_src, (v16u8) cmp3);
860 PCKEV_B4_UB(dst2, dst0, dst3, dst1, dst4, dst4, dst5, dst5, dst0, dst1,
871 tmp2 = __msa_copy_u_w((v4i32) dst0, 0);
872 tmp3 = __msa_copy_u_w((v4i32) dst0, 1);
873 tmp0 = __msa_copy_u_h((v8i16) dst2, 0);
874 tmp1 = __msa_copy_u_h((v8i16) dst2, 2);
882 tmp2 = __msa_copy_u_w((v4i32) dst0, 2);
883 tmp3 = __msa_copy_u_w((v4i32) dst0, 3);
884 tmp0 = __msa_copy_u_h((v8i16) dst2, 4);
885 tmp1 = __msa_copy_u_h((v8i16) dst2, 6);
893 tmp2 = __msa_copy_u_w((v4i32) dst1, 0);
894 tmp3 = __msa_copy_u_w((v4i32) dst1, 1);
895 tmp0 = __msa_copy_u_h((v8i16) dst3, 0);
896 tmp1 = __msa_copy_u_h((v8i16) dst3, 2);
904 tmp2 = __msa_copy_u_w((v4i32) dst1, 2);
905 tmp3 = __msa_copy_u_w((v4i32) dst1, 3);
906 tmp0 = __msa_copy_u_h((v8i16) dst3, 4);
907 tmp1 = __msa_copy_u_h((v8i16) dst3, 6);
924 v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
925 v8u16 p1, p0,
q0,
q1;
926 v8i16 tc_pos, tc_neg;
928 v8i16 temp0, temp1,
delta;
930 if (!(
tc[0] <= 0) || !(
tc[1] <= 0)) {
931 cmp0 = (v2i64) __msa_fill_h(
tc[0]);
932 cmp1 = (v2i64) __msa_fill_h(
tc[1]);
933 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
936 cmp0 = __msa_fill_d(p_is_pcm[0]);
937 cmp1 = __msa_fill_d(p_is_pcm[1]);
938 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
939 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
941 cmp0 = __msa_fill_d(q_is_pcm[0]);
942 cmp1 = __msa_fill_d(q_is_pcm[1]);
943 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
944 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
951 ILVR_B4_UH(
zero, p1,
zero, p0,
zero,
q0,
zero,
q1, p1, p0,
q0,
q1);
953 temp0 = (v8i16) (
q0 - p0);
954 temp1 = (v8i16) (p1 -
q1);
957 delta = __msa_srari_h((v8i16) temp0, 3);
960 temp0 = (v8i16) ((v8i16) p0 +
delta);
962 temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
963 (v16u8) p_is_pcm_vec);
965 temp1 = (v8i16) ((v8i16)
q0 -
delta);
967 temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8)
q0,
968 (v16u8) q_is_pcm_vec);
970 tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
971 temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
972 temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8)
q0, (v16u8) tc_pos);
974 temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0);
983 v2i64 cmp0, cmp1, p_is_pcm_vec, q_is_pcm_vec;
984 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
985 v8u16 p1, p0,
q0,
q1;
986 v8i16 tc_pos, tc_neg;
988 v8i16 temp0, temp1,
delta;
990 if (!(
tc[0] <= 0) || !(
tc[1] <= 0)) {
991 cmp0 = (v2i64) __msa_fill_h(
tc[0]);
992 cmp1 = (v2i64) __msa_fill_h(
tc[1]);
993 tc_pos = (v8i16) __msa_ilvev_d(cmp1, cmp0);
996 cmp0 = __msa_fill_d(p_is_pcm[0]);
997 cmp1 = __msa_fill_d(p_is_pcm[1]);
998 p_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
999 p_is_pcm_vec = __msa_ceqi_d(p_is_pcm_vec, 0);
1001 cmp0 = __msa_fill_d(q_is_pcm[0]);
1002 cmp1 = __msa_fill_d(q_is_pcm[1]);
1003 q_is_pcm_vec = __msa_ilvev_d(cmp1, cmp0);
1004 q_is_pcm_vec = __msa_ceqi_d(q_is_pcm_vec, 0);
1010 ILVR_B4_UH(
zero, p1,
zero, p0,
zero,
q0,
zero,
q1, p1, p0,
q0,
q1);
1012 temp0 = (v8i16) (
q0 - p0);
1013 temp1 = (v8i16) (p1 -
q1);
1016 delta = __msa_srari_h((v8i16) temp0, 3);
1019 temp0 = (v8i16) ((v8i16) p0 +
delta);
1021 temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
1022 (v16u8) p_is_pcm_vec);
1024 temp1 = (v8i16) ((v8i16)
q0 -
delta);
1026 temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8)
q0,
1027 (v16u8) q_is_pcm_vec);
1029 tc_pos = (v8i16) __msa_clei_s_d((v2i64) tc_pos, 0);
1030 temp0 = (v8i16) __msa_bmnz_v((v16u8) temp0, (v16u8) p0, (v16u8) tc_pos);
1031 temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8)
q0, (v16u8) tc_pos);
1033 temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0);
1036 ST_H8(temp0, 0, 1, 2, 3, 4, 5, 6, 7,
src,
stride);
1043 int16_t *sao_offset_val,
1047 v16i8 src0_r, src1_r;
1049 v16i8 dst0, offset0, offset1;
1052 offset_val =
LD_SB(sao_offset_val + 1);
1053 offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
1055 offset_val = __msa_pckev_b(offset_val, offset_val);
1056 offset1 = (v16i8) __msa_insve_w((v4i32)
zero, 3, (v4i32) offset_val);
1057 offset0 = __msa_sld_b(offset1,
zero, 28 - ((sao_left_class) & 31));
1058 offset1 = __msa_sld_b(
zero, offset1, 28 - ((sao_left_class) & 31));
1063 if (!((sao_left_class > 12) & (sao_left_class < 29))) {
1064 SWAP(offset0, offset1);
1068 src += (4 * src_stride);
1072 src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
1073 mask = __msa_srli_b(src0_r, 3);
1074 offset = __msa_vshf_b(
mask, offset1, offset0);
1076 src0_r = (v16i8) __msa_xori_b((v16u8) src0_r, 128);
1077 dst0 = __msa_adds_s_b(src0_r,
offset);
1078 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1084 ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
1085 dst += (4 * dst_stride);
1090 src0_r = (v16i8) __msa_pckev_w((v4i32) src1_r, (v4i32) src0_r);
1091 mask = __msa_srli_b(src0_r, 3);
1092 offset = __msa_vshf_b(
mask, offset1, offset0);
1094 src0_r = (v16i8) __msa_xori_b((v16u8) src0_r, 128);
1095 dst0 = __msa_adds_s_b(src0_r,
offset);
1096 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1099 ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
1105 int16_t *sao_offset_val,
1109 v16i8 src0_r, src1_r, mask0, mask1;
1110 v16i8 offset_mask0, offset_mask1, offset_val;
1111 v16i8 offset0, offset1, dst0, dst1;
1114 offset_val =
LD_SB(sao_offset_val + 1);
1115 offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
1116 offset_val = __msa_pckev_b(offset_val, offset_val);
1117 offset1 = (v16i8) __msa_insve_w((v4i32)
zero, 3, (v4i32) offset_val);
1118 offset0 = __msa_sld_b(offset1,
zero, 28 - ((sao_left_class) & 31));
1119 offset1 = __msa_sld_b(
zero, offset1, 28 - ((sao_left_class) & 31));
1124 if (!((sao_left_class > 12) & (sao_left_class < 29))) {
1125 SWAP(offset0, offset1);
1129 src += src_stride << 2;
1133 mask0 = __msa_srli_b(src0_r, 3);
1134 mask1 = __msa_srli_b(src1_r, 3);
1136 offset_mask0 = __msa_vshf_b(mask0, offset1, offset0);
1137 offset_mask1 = __msa_vshf_b(mask1, offset1, offset0);
1144 dst0 = __msa_adds_s_b(src0_r, offset_mask0);
1145 dst1 = __msa_adds_s_b(src1_r, offset_mask1);
1150 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
1151 dst += dst_stride << 2;
1156 mask0 = __msa_srli_b(src0_r, 3);
1157 mask1 = __msa_srli_b(src1_r, 3);
1159 offset_mask0 = __msa_vshf_b(mask0, offset1, offset0);
1160 offset_mask1 = __msa_vshf_b(mask1, offset1, offset0);
1164 dst0 = __msa_adds_s_b(src0_r, offset_mask0);
1165 dst1 = __msa_adds_s_b(src1_r, offset_mask1);
1170 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
1178 int16_t *sao_offset_val,
1183 v16i8 out0, out1, out2, out3;
1184 v16i8 mask0, mask1, mask2, mask3;
1185 v16i8 tmp0, tmp1, tmp2, tmp3, offset_val;
1186 v16i8 offset0, offset1;
1189 offset_val =
LD_SB(sao_offset_val + 1);
1190 offset_val = (v16i8) __msa_pckev_d((v2i64) offset_val, (v2i64) offset_val);
1191 offset_val = __msa_pckev_b(offset_val, offset_val);
1192 offset1 = (v16i8) __msa_insve_w((v4i32)
zero, 3, (v4i32) offset_val);
1193 offset0 = __msa_sld_b(offset1,
zero, 28 - ((sao_left_class) & 31));
1194 offset1 = __msa_sld_b(
zero, offset1, 28 - ((sao_left_class) & 31));
1196 if (!((sao_left_class > 12) & (sao_left_class < 29))) {
1197 SWAP(offset0, offset1);
1204 for (w_cnt = 16; w_cnt <
width; w_cnt += 16) {
1205 mask0 = __msa_srli_b((v16i8)
src0, 3);
1206 mask1 = __msa_srli_b((v16i8)
src1, 3);
1207 mask2 = __msa_srli_b((v16i8) src2, 3);
1208 mask3 = __msa_srli_b((v16i8) src3, 3);
1210 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1,
1212 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3,
1216 out0 = __msa_adds_s_b((v16i8)
src0, tmp0);
1217 out1 = __msa_adds_s_b((v16i8)
src1, tmp1);
1218 out2 = __msa_adds_s_b((v16i8) src2, tmp2);
1219 out3 = __msa_adds_s_b((v16i8) src3, tmp3);
1226 ST_SB4(out0, out1, out2, out3, dst + w_cnt - 16, dst_stride);
1229 mask0 = __msa_srli_b((v16i8)
src0, 3);
1230 mask1 = __msa_srli_b((v16i8)
src1, 3);
1231 mask2 = __msa_srli_b((v16i8) src2, 3);
1232 mask3 = __msa_srli_b((v16i8) src3, 3);
1234 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask0, mask1, tmp0,
1236 VSHF_B2_SB(offset0, offset1, offset0, offset1, mask2, mask3, tmp2,
1240 out0 = __msa_adds_s_b((v16i8)
src0, tmp0);
1241 out1 = __msa_adds_s_b((v16i8)
src1, tmp1);
1242 out2 = __msa_adds_s_b((v16i8) src2, tmp2);
1243 out3 = __msa_adds_s_b((v16i8) src3, tmp3);
1247 ST_SB4(out0, out1, out2, out3, dst + w_cnt - 16, dst_stride);
1249 src += src_stride << 2;
1250 dst += dst_stride << 2;
1259 int16_t *sao_offset_val,
1262 uint32_t dst_val0, dst_val1;
1263 v16u8 cmp_minus10, diff_minus10, diff_minus11, src_minus10, src_minus11;
1264 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1265 v16i8 sao_offset =
LD_SB(sao_offset_val);
1267 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1270 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1274 LD_UB2(
src, src_stride, src_minus10, src_minus11);
1277 src += (2 * src_stride);
1279 src_minus10 = (v16u8) __msa_pckev_d((v2i64) src_minus11,
1280 (v2i64) src_minus10);
1282 src0 = (v16i8) __msa_sldi_b(
zero, (v16i8) src_minus10, 1);
1283 src_plus10 = (v16i8) __msa_sldi_b(
zero, (v16i8) src_minus10, 2);
1285 cmp_minus10 = ((v16u8)
src0 == src_minus10);
1286 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1287 cmp_minus10 = (src_minus10 < (v16u8)
src0);
1288 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1290 cmp_minus10 = ((v16u8)
src0 == (v16u8) src_plus10);
1291 diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
1292 cmp_minus10 = ((v16u8) src_plus10 < (v16u8)
src0);
1293 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1295 offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1298 LD_UB2(
src, src_stride, src_minus10, src_minus11);
1303 src0 = (v16i8) __msa_xori_b((v16u8)
src0, 128);
1305 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1307 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1308 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1315 src_minus10 = (v16u8) __msa_pckev_d((v2i64) src_minus11,
1316 (v2i64) src_minus10);
1318 src0 = (v16i8) __msa_sldi_b(
zero, (v16i8) src_minus10, 1);
1319 src_plus10 = (v16i8) __msa_sldi_b(
zero, (v16i8) src_minus10, 2);
1321 cmp_minus10 = ((v16u8)
src0 == src_minus10);
1322 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1323 cmp_minus10 = (src_minus10 < (v16u8)
src0);
1324 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1326 cmp_minus10 = ((v16u8)
src0 == (v16u8) src_plus10);
1327 diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
1328 cmp_minus10 = ((v16u8) src_plus10 < (v16u8)
src0);
1329 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1331 offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1335 src0 = (v16i8) __msa_xori_b((v16u8)
src0, 128);
1337 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1339 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1340 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1351 int16_t *sao_offset_val,
1354 uint64_t dst_val0, dst_val1;
1355 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1356 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1357 v16u8 cmp_minus10, diff_minus10, diff_minus11;
1358 v16u8
src0,
src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
1361 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1365 LD_UB2(
src, src_stride, src_minus10, src_minus11);
1368 src += (src_stride << 1);
1371 SLDI_B2_0_UB(src_minus10, src_minus11, src_plus10, src_plus11, 2);
1373 PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10,
1374 src_minus10, src_plus10);
1375 src0 = (v16u8) __msa_pckev_d((v2i64)
src1, (v2i64)
src0);
1377 cmp_minus10 = (
src0 == src_minus10);
1378 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1379 cmp_minus10 = (src_minus10 <
src0);
1380 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1382 cmp_minus10 = (
src0 == src_plus10);
1383 diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
1384 cmp_minus10 = (src_plus10 <
src0);
1385 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1387 offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1390 LD_UB2(
src, src_stride, src_minus10, src_minus11);
1396 dst0 = (v16u8) __msa_adds_s_b((v16i8)
src0,
offset);
1397 dst0 = __msa_xori_b(dst0, 128);
1399 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1400 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1408 SLDI_B2_0_UB(src_minus10, src_minus11, src_plus10, src_plus11, 2);
1410 PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10, src_minus10,
1412 src0 = (v16u8) __msa_pckev_d((v2i64)
src1, (v2i64)
src0);
1414 cmp_minus10 = ((v16u8)
src0 == src_minus10);
1415 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1416 cmp_minus10 = (src_minus10 < (v16u8)
src0);
1417 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1419 cmp_minus10 = (
src0 == src_plus10);
1420 diff_minus11 = __msa_nor_v(cmp_minus10, cmp_minus10);
1421 cmp_minus10 = (src_plus10 <
src0);
1422 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus10);
1424 offset = (v16i8) diff_minus10 + (v16i8) diff_minus11 + 2;
1430 dst0 = (v16u8) __msa_adds_s_b((v16i8)
src0,
offset);
1431 dst0 = __msa_xori_b(dst0, 128);
1433 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1434 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1444 int16_t *sao_offset_val,
1448 uint8_t *dst_ptr, *src_minus1;
1450 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1451 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1453 v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1454 v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1455 v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1457 v16u8 src10, src11, src12, src13, dst0, dst1, dst2, dst3;
1458 v16u8 src_minus10, src_minus11, src_minus12, src_minus13;
1459 v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
1460 v16i8 src_zero0, src_zero1, src_zero2, src_zero3;
1461 v16i8 src_plus10, src_plus11, src_plus12, src_plus13;
1463 sao_offset =
LD_SB(sao_offset_val);
1464 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1467 src_minus1 =
src - 1;
1468 LD_UB4(src_minus1, src_stride,
1469 src_minus10, src_minus11, src_minus12, src_minus13);
1471 for (v_cnt = 0; v_cnt <
width; v_cnt += 16) {
1473 dst_ptr = dst + v_cnt;
1474 LD_UB4(src_minus1, src_stride, src10, src11, src12, src13);
1476 SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_zero0,
1478 SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_zero2,
1480 SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_plus10,
1482 SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_plus12,
1485 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1486 cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
1487 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1488 cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
1489 cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
1490 cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
1491 cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
1492 cmp_plus13 = ((v16u8) src_zero3 == (v16u8) src_plus13);
1494 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1495 diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
1496 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1497 diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
1498 diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
1499 diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
1500 diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
1501 diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
1503 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1504 cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
1505 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1506 cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
1507 cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
1508 cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
1509 cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
1510 cmp_plus13 = ((v16u8) src_plus13 < (v16u8) src_zero3);
1512 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1513 diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
1514 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1515 diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
1516 diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
1517 diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
1518 diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
1519 diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
1521 offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
1522 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask0,
1523 offset_mask0, offset_mask0, offset_mask0);
1524 offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
1525 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask1,
1526 offset_mask1, offset_mask1, offset_mask1);
1527 offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
1528 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask2,
1529 offset_mask2, offset_mask2, offset_mask2);
1530 offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
1531 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset, offset_mask3,
1532 offset_mask3, offset_mask3, offset_mask3);
1536 dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
1537 dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
1538 dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
1539 dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);
1543 src_minus10 = src10;
1544 ST_UB(dst0, dst_ptr);
1545 src_minus11 = src11;
1546 ST_UB(dst1, dst_ptr + dst_stride);
1547 src_minus12 = src12;
1548 ST_UB(dst2, dst_ptr + (dst_stride << 1));
1549 src_minus13 = src13;
1550 ST_UB(dst3, dst_ptr + (dst_stride * 3));
1553 src += (src_stride << 2);
1554 dst += (dst_stride << 2);
1562 int16_t *sao_offset_val,
1565 uint32_t dst_val0, dst_val1;
1566 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1567 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1569 v16i8 sao_offset =
LD_SB(sao_offset_val);
1570 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1571 v16u8 src_minus10, src_minus11, src10, src11;
1572 v16i8 src_zero0, src_zero1;
1574 v8i16 offset_mask0, offset_mask1;
1576 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1579 LD_UB2(
src - src_stride, src_stride, src_minus10, src_minus11);
1580 LD_UB2(
src + src_stride, src_stride, src10, src11);
1583 src += (src_stride << 1);
1585 src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1586 src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1587 src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1588 src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1590 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1591 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1592 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1593 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1595 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1596 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1597 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1598 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1600 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1601 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1603 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1604 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1609 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1610 dst0 = __msa_adds_s_b(dst0,
offset);
1611 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1613 src_minus10 = src10;
1614 src_minus11 = src11;
1617 LD_UB2(
src + src_stride, src_stride, src10, src11);
1619 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1620 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1628 src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1629 src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1630 src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1631 src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1633 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1634 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1635 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1636 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1638 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1639 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1640 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1641 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1643 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1644 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1646 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1647 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1652 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1653 dst0 = __msa_adds_s_b(dst0,
offset);
1654 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1656 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1657 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1667 int16_t *sao_offset_val,
1670 uint64_t dst_val0, dst_val1;
1671 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1672 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1674 v16i8 src_zero0, src_zero1, dst0;
1675 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1676 v16u8 src_minus10, src_minus11, src10, src11;
1677 v8i16 offset_mask0, offset_mask1;
1679 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1682 LD_UB2(
src - src_stride, src_stride, src_minus10, src_minus11);
1683 LD_UB2(
src + src_stride, src_stride, src10, src11);
1686 src += (src_stride << 1);
1688 src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1689 src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1690 src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1691 src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1693 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1694 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1695 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1696 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1698 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1699 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1700 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1701 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1703 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1704 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1706 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1707 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1712 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1713 dst0 = __msa_adds_s_b(dst0,
offset);
1714 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1716 src_minus10 = src10;
1717 src_minus11 = src11;
1720 LD_UB2(
src + src_stride, src_stride, src10, src11);
1722 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1723 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1730 src_minus10 = (v16u8) __msa_ilvr_b((v16i8) src10, (v16i8) src_minus10);
1731 src_zero0 = __msa_ilvr_b((v16i8) src_minus11, (v16i8) src_minus11);
1732 src_minus11 = (v16u8) __msa_ilvr_b((v16i8) src11, (v16i8) src_minus11);
1733 src_zero1 = __msa_ilvr_b((v16i8) src10, (v16i8) src10);
1735 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1736 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1737 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1738 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1740 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1741 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1742 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1743 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1745 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1746 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1748 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1749 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1754 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1755 dst0 = __msa_adds_s_b(dst0,
offset);
1756 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1758 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1759 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
1777 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1778 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1779 v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
1780 v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
1781 v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
1783 v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
1784 v16u8 src12, dst2, src13, dst3;
1785 v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
1787 sao_offset =
LD_SB(sao_offset_val);
1788 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1790 for (v_cnt = 0; v_cnt <
width; v_cnt += 16) {
1791 src = src_orig + v_cnt;
1792 dst = dst_orig + v_cnt;
1794 LD_UB2(
src - src_stride, src_stride, src_minus10, src_minus11);
1796 for (h_cnt = (
height >> 2); h_cnt--;) {
1797 LD_UB4(
src + src_stride, src_stride, src10, src11, src12, src13);
1799 cmp_minus10 = (src_minus11 == src_minus10);
1800 cmp_plus10 = (src_minus11 == src10);
1801 cmp_minus11 = (src10 == src_minus11);
1802 cmp_plus11 = (src10 == src11);
1803 cmp_minus12 = (src11 == src10);
1804 cmp_plus12 = (src11 == src12);
1805 cmp_minus13 = (src12 == src11);
1806 cmp_plus13 = (src12 == src13);
1808 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1809 diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
1810 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1811 diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
1812 diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
1813 diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
1814 diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
1815 diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
1817 cmp_minus10 = (src_minus10 < src_minus11);
1818 cmp_plus10 = (src10 < src_minus11);
1819 cmp_minus11 = (src_minus11 < src10);
1820 cmp_plus11 = (src11 < src10);
1821 cmp_minus12 = (src10 < src11);
1822 cmp_plus12 = (src12 < src11);
1823 cmp_minus13 = (src11 < src12);
1824 cmp_plus13 = (src13 < src12);
1826 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1827 diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
1828 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1829 diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
1830 diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
1831 diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
1832 diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
1833 diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
1835 offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
1836 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1837 offset_mask0, offset_mask0, offset_mask0, offset_mask0);
1838 offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
1839 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1840 offset_mask1, offset_mask1, offset_mask1, offset_mask1);
1841 offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
1842 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1843 offset_mask2, offset_mask2, offset_mask2, offset_mask2);
1844 offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
1845 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
1846 offset_mask3, offset_mask3, offset_mask3, offset_mask3);
1848 src_minus10 = src12;
1851 dst0 = (v16u8) __msa_adds_s_b((v16i8) src_minus11, offset_mask0);
1852 dst1 = (v16u8) __msa_adds_s_b((v16i8) src10, offset_mask1);
1853 dst2 = (v16u8) __msa_adds_s_b((v16i8) src11, offset_mask2);
1854 dst3 = (v16u8) __msa_adds_s_b((v16i8) src12, offset_mask3);
1857 src_minus11 = src13;
1859 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
1861 src += (src_stride << 2);
1862 dst += (dst_stride << 2);
1871 int16_t *sao_offset_val,
1875 uint32_t dst_val0, dst_val1;
1876 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1877 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1879 v16u8 cmp_minus10, diff_minus10, src_minus10, cmp_minus11, diff_minus11;
1880 v16u8 src_minus11, src10, src11;
1881 v16i8 src_plus0, src_zero0, src_plus1, src_zero1, dst0;
1882 v8i16 offset_mask0, offset_mask1;
1884 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
1889 LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
1890 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
1893 src_orig += (src_stride << 1);
1895 SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
1898 ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
1900 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
1903 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1904 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1905 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1906 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1908 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1909 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1910 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1911 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1913 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1914 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1916 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1917 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1922 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1923 dst0 = __msa_adds_s_b(dst0,
offset);
1924 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1926 src_minus10 = src10;
1927 src_minus11 = src11;
1930 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
1932 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1933 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1941 SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
1944 ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
1946 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
1949 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
1950 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
1951 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
1952 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
1954 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
1955 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
1956 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
1957 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
1959 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
1960 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
1962 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
1963 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
1968 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1969 dst0 = __msa_adds_s_b(dst0,
offset);
1970 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
1972 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
1973 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
1983 int16_t *sao_offset_val,
1987 uint64_t dst_val0, dst_val1;
1988 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1989 v16u8 const1 = (v16u8) __msa_ldi_b(1);
1991 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
1992 v16u8 src_minus10, src10, src_minus11, src11;
1993 v16i8 src_zero0, src_plus10, src_zero1, src_plus11, dst0;
1994 v8i16 offset_mask0, offset_mask1;
1996 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
2000 LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
2001 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2004 src_orig += (src_stride << 1);
2006 SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
2009 ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
2010 src_minus10, src_minus11);
2011 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1,
2012 src_zero0, src_zero1);
2014 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2015 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2016 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2017 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2019 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2020 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2021 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2022 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2024 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2025 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2027 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2028 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2033 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2034 dst0 = __msa_adds_s_b(dst0,
offset);
2035 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2037 src_minus10 = src10;
2038 src_minus11 = src11;
2041 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2043 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2044 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
2051 SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
2053 ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11, src_minus10,
2055 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
2058 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2059 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2060 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2061 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2063 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2064 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2065 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2066 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2068 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2069 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2071 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2072 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2077 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2078 dst0 = __msa_adds_s_b(dst0,
offset);
2079 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2081 src_minus10 = src10;
2082 src_minus11 = src11;
2085 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2087 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2088 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
2106 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
2107 v16u8 const1 = (v16u8) __msa_ldi_b(1);
2108 v16u8 cmp_minus10, cmp_plus10, diff_minus10, diff_plus10, cmp_minus11;
2109 v16u8 cmp_plus11, diff_minus11, diff_plus11, cmp_minus12, cmp_plus12;
2110 v16u8 diff_minus12, diff_plus12, cmp_minus13, cmp_plus13, diff_minus13;
2111 v16u8 diff_plus13, src_minus14, src_plus13;
2112 v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3;
2113 v16u8 src10, src_minus10, dst0, src11, src_minus11, dst1;
2114 v16u8 src12, src_minus12, dst2, src13, src_minus13, dst3;
2115 v16i8 src_zero0, src_plus10, src_zero1, src_plus11, src_zero2, src_plus12;
2116 v16i8 src_zero3, sao_offset;
2118 sao_offset =
LD_SB(sao_offset_val);
2119 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
2124 LD_UB4(src_orig, src_stride, src_minus11, src_minus12, src_minus13,
2127 for (v_cnt = 0; v_cnt <
width; v_cnt += 16) {
2128 src_minus10 =
LD_UB(src_orig - src_stride);
2129 LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
2130 src_plus13 =
LD_UB(
src + 1 + v_cnt + (src_stride << 2));
2133 SLDI_B2_SB(src10, src11, src_minus11, src_minus12, src_zero0,
2135 SLDI_B2_SB(src12, src13, src_minus13, src_minus14, src_zero2,
2137 SLDI_B2_SB(src11, src12, src_minus12, src_minus13, src_plus10,
2140 src_plus12 = __msa_sldi_b((v16i8) src13, (v16i8) src_minus14, 2);
2142 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2143 cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
2144 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2145 cmp_plus11 = ((v16u8) src_zero1 == (v16u8) src_plus11);
2146 cmp_minus12 = ((v16u8) src_zero2 == src_minus12);
2147 cmp_plus12 = ((v16u8) src_zero2 == (v16u8) src_plus12);
2148 cmp_minus13 = ((v16u8) src_zero3 == src_minus13);
2149 cmp_plus13 = ((v16u8) src_zero3 == src_plus13);
2151 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2152 diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
2153 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2154 diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
2155 diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
2156 diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
2157 diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
2158 diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
2160 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2161 cmp_plus10 = ((v16u8) src_plus10 < (v16u8) src_zero0);
2162 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2163 cmp_plus11 = ((v16u8) src_plus11 < (v16u8) src_zero1);
2164 cmp_minus12 = (src_minus12 < (v16u8) src_zero2);
2165 cmp_plus12 = ((v16u8) src_plus12 < (v16u8) src_zero2);
2166 cmp_minus13 = (src_minus13 < (v16u8) src_zero3);
2167 cmp_plus13 = (src_plus13 < (v16u8) src_zero3);
2169 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2170 diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
2171 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2172 diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
2173 diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
2174 diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
2175 diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
2176 diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
2178 offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
2179 offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
2180 offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
2181 offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
2183 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2184 offset_mask0, offset_mask0, offset_mask0, offset_mask0);
2185 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2186 offset_mask1, offset_mask1, offset_mask1, offset_mask1);
2187 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2188 offset_mask2, offset_mask2, offset_mask2, offset_mask2);
2189 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2190 offset_mask3, offset_mask3, offset_mask3, offset_mask3);
2194 dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
2195 dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
2196 dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
2197 dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);
2201 src_minus11 = src10;
2202 src_minus12 = src11;
2203 src_minus13 = src12;
2204 src_minus14 = src13;
2206 ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
2210 src += (src_stride << 2);
2211 dst += (dst_stride << 2);
/* NOTE(review): fragment of an HEVC SAO diagonal edge-offset filter (MSA
 * SIMD, 4-pixel-wide column, two rows per step).  The function header and a
 * number of interior lines (the SW stores, the edge_idx/sao_offset lookup
 * shuffles, loop control) are elided from this chunk, so the comments below
 * describe only the statements that are visible. */
2219 int16_t *sao_offset_val,
2223 uint32_t dst_val0, dst_val1;
/* Lookup table remapping the neighbour-sign sum (0..4) to an SAO offset
 * slot; the flat class (sum == 2) maps to slot 0 (no offset). */
2224 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
2225 v16u8 const1 = (v16u8) __msa_ldi_b(1);
2227 v16i8 src_zero0, src_zero1, dst0;
2228 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
2229 v16u8 src_minus10, src10, src_minus11, src11;
2230 v8i16 offset_mask0, offset_mask1;
/* Pack the 16-bit SAO offsets down to bytes (duplicated in both halves). */
2232 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
/* Load rows y-1 and y, then rows y+1 and y+2. */
2236 LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
2237 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2240 src_orig += (src_stride << 1);
/* Byte-shift the centre rows by 1 and the rows above by 2 so each diagonal
 * neighbour lands in the same byte lane as its centre pixel, then
 * interleave both neighbours (and duplicate the centres) so a single
 * comparison covers the pair. */
2242 SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
2243 SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
2245 ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2247 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
/* Per-lane sign(centre - neighbour): the inverted equality mask (nor) gives
 * 0xFF (-1) where they differ; bmnz then overwrites lanes where the
 * neighbour is smaller with +1, leaving 0 for equal and -1 for larger. */
2250 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2251 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2252 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2253 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2255 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2256 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2257 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2258 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
/* Horizontal add folds the interleaved pair of neighbour signs per centre
 * pixel; +2 biases the sum into the edge-class range 0..4. */
2260 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2261 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2263 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2264 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
/* (Elided here: the VSHF lookups mapping the class through edge_idx and
 * then sao_offset -- presumably as in the 16-wide variant; confirm against
 * the full file.)  The xori-128 / adds_s_b / xori-128 triple below performs
 * an unsigned add of the offset clamped to [0, 255]. */
2269 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2270 dst0 = __msa_adds_s_b(dst0,
offset);
2271 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
/* Rows y+1/y+2 become the rows above for the next pair. */
2273 src_minus10 = src10;
2274 src_minus11 = src11;
2277 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
/* Extract the two 4-byte results (the SW stores are elided here). */
2279 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
2280 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
/* Second pass: the same sign/offset computation repeated for the next row
 * pair (the surrounding loop control is elided from this chunk). */
2289 SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
2290 SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
2292 ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2294 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
2297 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2298 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2299 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2300 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2302 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2303 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2304 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2305 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2307 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2308 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2310 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2311 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2316 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2317 dst0 = __msa_adds_s_b(dst0,
offset);
2318 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2320 dst_val0 = __msa_copy_u_w((v4i32) dst0, 0);
2321 dst_val1 = __msa_copy_u_w((v4i32) dst0, 2);
/* NOTE(review): fragment of an HEVC SAO diagonal edge-offset filter (MSA
 * SIMD, 8-pixel-wide column, two rows per step; same structure as the
 * 4-wide variant but with 64-bit result extracts).  The function header and
 * several interior lines (SD stores, lookup shuffles, loop control) are
 * elided from this chunk; comments cover only the visible statements. */
2333 int16_t *sao_offset_val,
2337 uint64_t dst_val0, dst_val1;
/* Edge-class (0..4) to offset-slot remap; flat class (2) -> slot 0. */
2338 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
2339 v16u8 const1 = (v16u8) __msa_ldi_b(1);
2341 v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
2342 v16u8 src_minus10, src10, src_minus11, src11;
2343 v16i8 src_zero0, src_zero1, dst0;
2344 v8i16 offset_mask0, offset_mask1;
/* Pack the 16-bit SAO offsets down to bytes. */
2346 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
/* Rows y-1 and y, then rows y+1 and y+2. */
2350 LD_UB2(src_orig - src_stride, src_stride, src_minus10, src_minus11);
2351 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
2354 src_orig += (src_stride << 1);
/* Align centres (shift 1) and diagonal neighbours (shift 2), then
 * interleave so both neighbours of each centre sit in adjacent lanes. */
2356 SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
2357 SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
2358 ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2360 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
/* Per-lane sign(centre - neighbour): nor of the equality mask gives -1
 * where different; bmnz patches +1 where the neighbour is smaller. */
2363 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2364 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2365 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2366 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2368 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2369 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2370 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2371 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
/* Fold the two neighbour signs per centre pixel; +2 -> class 0..4. */
2373 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2374 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2376 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2377 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
/* (Elided: VSHF lookups through edge_idx/sao_offset -- confirm in the full
 * file.)  xori-128 / saturating add / xori-128 = unsigned add clamped to
 * [0, 255]. */
2382 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2383 dst0 = __msa_adds_s_b(dst0,
offset);
2384 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
/* Carry rows y+1/y+2 forward as the rows above the next pair. */
2386 src_minus10 = src10;
2387 src_minus11 = src11;
2390 LD_UB2(src_orig + src_stride, src_stride, src10, src11);
/* Extract the two 8-byte results (the SD stores are elided here). */
2392 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2393 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
/* Second pass: identical computation for the next row pair (enclosing loop
 * control is elided from this chunk). */
2401 SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
2402 SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
2403 ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
2405 ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
2408 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2409 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2410 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2411 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2413 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2414 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2415 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2416 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2418 offset_mask0 = (v8i16) (__msa_hadd_u_h(diff_minus10, diff_minus10) + 2);
2419 offset_mask1 = (v8i16) (__msa_hadd_u_h(diff_minus11, diff_minus11) + 2);
2421 offset = __msa_pckev_b((v16i8) offset_mask1, (v16i8) offset_mask0);
2422 dst0 = __msa_pckev_b((v16i8) src_zero1, (v16i8) src_zero0);
2427 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2428 dst0 = __msa_adds_s_b(dst0,
offset);
2429 dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
2431 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
2432 dst_val1 = __msa_copy_u_d((v2i64) dst0, 1);
/* NOTE(review): fragment of an HEVC SAO diagonal edge-offset filter (MSA
 * SIMD) processing the block 16 pixels wide x 4 rows per iteration.  The
 * function header, some carry/store lines and the closing loop braces are
 * elided from this chunk; comments below describe only the visible logic. */
/* Edge-class (0..4) to offset-slot remap; flat class (2) -> slot 0. */
2451 v16i8 edge_idx = { 1, 2, 0, 3, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
2452 v16u8 const1 = (v16u8) __msa_ldi_b(1);
2453 v16u8 dst0, dst1, dst2, dst3;
2454 v16u8 cmp_minus10, cmp_minus11, cmp_minus12, cmp_minus13, cmp_plus10;
2455 v16u8 cmp_plus11, cmp_plus12, cmp_plus13, diff_minus10, diff_minus11;
2456 v16u8 diff_minus12, diff_minus13, diff_plus10, diff_plus11, diff_plus12;
2457 v16u8 diff_plus13, src10, src11, src12, src13, src_minus10, src_minus11;
2458 v16u8 src_plus10, src_plus11, src_plus12, src_plus13;
2459 v16i8 src_minus12, src_minus13, src_zero0, src_zero1, src_zero2, src_zero3;
2460 v16i8 offset_mask0, offset_mask1, offset_mask2, offset_mask3, sao_offset;
/* Load the eight 16-bit SAO offsets and pack them down to bytes. */
2462 sao_offset =
LD_SB(sao_offset_val);
2463 sao_offset = __msa_pckev_b(sao_offset, sao_offset);
/* Preload the four rows of the current stripe. */
2469 LD_UB4(src_orig, src_stride, src_minus11, src_plus10, src_plus11,
/* Walk the stripe 16 pixels at a time. */
2472 for (v_cnt = 0; v_cnt <
width; v_cnt += 16) {
/* Fetch the row above (offset +2 so its diagonal neighbour aligns after
 * the shifts below), the next 16-byte column of all four rows, and the
 * row four below. */
2473 src_minus10 =
LD_UB(src_orig + 2 - src_stride);
2474 LD_UB4(src_orig + 16, src_stride, src10, src11, src12, src13);
2475 src_plus13 =
LD_UB(src_orig + (src_stride << 2));
/* For each of the four rows: sldi by 1 builds the centre pixels, sldi by
 * 2 builds the second diagonal neighbour; compare centre against both
 * neighbours for equality. */
2478 src_zero0 = __msa_sldi_b((v16i8) src10, (v16i8) src_minus11, 1);
2479 cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
2480 cmp_plus10 = ((v16u8) src_zero0 == src_plus10);
2482 src_zero1 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 1);
2483 src_minus11 = (v16u8) __msa_sldi_b((v16i8) src10,
2484 (v16i8) src_minus11, 2);
2485 cmp_minus11 = ((v16u8) src_zero1 == src_minus11);
2486 cmp_plus11 = ((v16u8) src_zero1 == src_plus11);
2488 src_zero2 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 1);
2489 src_minus12 = __msa_sldi_b((v16i8) src11, (v16i8) src_plus10, 2);
2490 cmp_minus12 = ((v16u8) src_zero2 == (v16u8) src_minus12);
2491 cmp_plus12 = ((v16u8) src_zero2 == src_plus12);
2493 src_zero3 = __msa_sldi_b((v16i8) src13, (v16i8) src_plus12, 1);
2494 src_minus13 = __msa_sldi_b((v16i8) src12, (v16i8) src_plus11, 2);
2495 cmp_minus13 = ((v16u8) src_zero3 == (v16u8) src_minus13);
2496 cmp_plus13 = ((v16u8) src_zero3 == src_plus13);
/* Invert the equality masks: diff_* = -1 (0xFF) where centre differs
 * from the neighbour. */
2498 diff_minus10 = __msa_nor_v(cmp_minus10, cmp_minus10);
2499 diff_plus10 = __msa_nor_v(cmp_plus10, cmp_plus10);
2500 diff_minus11 = __msa_nor_v(cmp_minus11, cmp_minus11);
2501 diff_plus11 = __msa_nor_v(cmp_plus11, cmp_plus11);
2502 diff_minus12 = __msa_nor_v(cmp_minus12, cmp_minus12);
2503 diff_plus12 = __msa_nor_v(cmp_plus12, cmp_plus12);
2504 diff_minus13 = __msa_nor_v(cmp_minus13, cmp_minus13);
2505 diff_plus13 = __msa_nor_v(cmp_plus13, cmp_plus13);
/* Compare again for "neighbour < centre" ... */
2507 cmp_minus10 = (src_minus10 < (v16u8) src_zero0);
2508 cmp_plus10 = (src_plus10 < (v16u8) src_zero0);
2509 cmp_minus11 = (src_minus11 < (v16u8) src_zero1);
2510 cmp_plus11 = (src_plus11 < (v16u8) src_zero1);
2511 cmp_minus12 = ((v16u8) src_minus12 < (v16u8) src_zero2);
2512 cmp_plus12 = (src_plus12 < (v16u8) src_zero2);
2513 cmp_minus13 = ((v16u8) src_minus13 < (v16u8) src_zero3);
2514 cmp_plus13 = (src_plus13 < (v16u8) src_zero3);
/* ... and overwrite those lanes with +1, so each diff_* byte is
 * sign(centre - neighbour) in {-1, 0, +1}. */
2516 diff_minus10 = __msa_bmnz_v(diff_minus10, const1, cmp_minus10);
2517 diff_plus10 = __msa_bmnz_v(diff_plus10, const1, cmp_plus10);
2518 diff_minus11 = __msa_bmnz_v(diff_minus11, const1, cmp_minus11);
2519 diff_plus11 = __msa_bmnz_v(diff_plus11, const1, cmp_plus11);
2520 diff_minus12 = __msa_bmnz_v(diff_minus12, const1, cmp_minus12);
2521 diff_plus12 = __msa_bmnz_v(diff_plus12, const1, cmp_plus12);
2522 diff_minus13 = __msa_bmnz_v(diff_minus13, const1, cmp_minus13);
2523 diff_plus13 = __msa_bmnz_v(diff_plus13, const1, cmp_plus13);
/* Sum of the two neighbour signs, biased by 2 -> edge class 0..4. */
2525 offset_mask0 = 2 + (v16i8) diff_minus10 + (v16i8) diff_plus10;
2526 offset_mask1 = 2 + (v16i8) diff_minus11 + (v16i8) diff_plus11;
2527 offset_mask2 = 2 + (v16i8) diff_minus12 + (v16i8) diff_plus12;
2528 offset_mask3 = 2 + (v16i8) diff_minus13 + (v16i8) diff_plus13;
/* Two-stage byte shuffle: class -> edge_idx remap -> per-pixel SAO
 * offset byte. */
2530 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2531 offset_mask0, offset_mask0, offset_mask0, offset_mask0);
2532 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2533 offset_mask1, offset_mask1, offset_mask1, offset_mask1);
2534 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2535 offset_mask2, offset_mask2, offset_mask2, offset_mask2);
2536 VSHF_B2_SB(edge_idx, edge_idx, sao_offset, sao_offset,
2537 offset_mask3, offset_mask3, offset_mask3, offset_mask3);
/* Saturating add of the offsets to the centre pixels (the xori-128
 * signed-bias steps usually paired with adds_s_b appear to be in lines
 * elided from this chunk -- confirm in the full file). */
2541 dst0 = (v16u8) __msa_adds_s_b((v16i8) src_zero0, offset_mask0);
2542 dst1 = (v16u8) __msa_adds_s_b((v16i8) src_zero1, offset_mask1);
2543 dst2 = (v16u8) __msa_adds_s_b((v16i8) src_zero2, offset_mask2);
2544 dst3 = (v16u8) __msa_adds_s_b((v16i8) src_zero3, offset_mask3);
/* Carry the freshly loaded first row into the next 16-pixel step (the
 * matching carries for the other rows are elided here). */
2548 src_minus11 = src10;
2553 ST_UB4(dst0, dst1, dst2, dst3, dst_orig, dst_stride);
/* Advance to the next group of four rows (the enclosing loop control is
 * elided from this chunk). */
2557 src += (src_stride << 2);
2558 dst += (dst_stride << 2);
2563 ptrdiff_t src_stride,
2571 ptrdiff_t src_stride,
2579 ptrdiff_t src_stride,
2587 ptrdiff_t src_stride,
2595 ptrdiff_t stride_dst, ptrdiff_t stride_src,
2596 int16_t *sao_offset_val,
int sao_left_class,
2601 sao_left_class, sao_offset_val,
2610 sao_left_class, sao_offset_val,
height);
2618 sao_left_class, sao_offset_val,
height);
2623 ptrdiff_t stride_dst,
2624 int16_t *sao_offset_val,