FFmpeg
vp9dsp.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Ronald S. Bultje <rsbultje@gmail.com>
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License along
17  * with FFmpeg; if not, write to the Free Software Foundation, Inc.,
18  * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19  */
20 
21 #include <math.h>
22 #include <string.h>
23 #include "checkasm.h"
24 #include "libavcodec/vp9data.h"
25 #include "libavcodec/vp9.h"
26 #include "libavutil/common.h"
27 #include "libavutil/emms.h"
28 #include "libavutil/internal.h"
29 #include "libavutil/intreadwrite.h"
30 #include "libavutil/mathematics.h"
31 #include "libavutil/mem_internal.h"
32 
/* One mask per tested bit depth (8, 10, 12), selected with
 * (bit_depth - 8) >> 1 and ANDed into every random 32-bit word so that the
 * packed samples never exceed the pixel range of that depth. */
static const uint32_t pixel_mask[3] = { 0xffffffff, 0x03ff03ff, 0x0fff0fff };
/* Bytes per pixel at the bit depth under test: 1 for 8 bpp, 2 otherwise. */
#define SIZEOF_PIXEL ((bit_depth + 7) / 8)

/* Fill the intra-prediction edge buffers with random in-range pixels.
 * Expects a, l, size and bit_depth to be in scope at the expansion site.
 * The "a" fill starts 4 bytes before a and covers at least 8 pixels; the
 * "l" fill covers exactly size pixels. */
#define randomize_buffers()                                             \
    do {                                                                \
        const uint32_t px_mask = pixel_mask[(bit_depth - 8) >> 1];      \
        int byte;                                                       \
        for (byte = -4; byte < SIZEOF_PIXEL * FFMAX(8, size); byte += 4) \
            AV_WN32A(a + byte, rnd() & px_mask);                        \
        for (byte = 0; byte < size * SIZEOF_PIXEL; byte += 4)           \
            AV_WN32A(l + byte, rnd() & px_mask);                        \
    } while (0)
49 
50 static void check_ipred(void)
51 {
52  LOCAL_ALIGNED_32(uint8_t, a_buf, [64 * 2]);
53  uint8_t *a = &a_buf[32 * 2];
54  LOCAL_ALIGNED_32(uint8_t, l, [32 * 2]);
55  LOCAL_ALIGNED_32(uint8_t, dst0, [32 * 32 * 2]);
56  LOCAL_ALIGNED_32(uint8_t, dst1, [32 * 32 * 2]);
57  VP9DSPContext dsp;
58  int tx, mode, bit_depth;
59  declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t stride,
60  const uint8_t *left, const uint8_t *top);
61  static const char *const mode_names[N_INTRA_PRED_MODES] = {
62  [VERT_PRED] = "vert",
63  [HOR_PRED] = "hor",
64  [DC_PRED] = "dc",
65  [DIAG_DOWN_LEFT_PRED] = "diag_downleft",
66  [DIAG_DOWN_RIGHT_PRED] = "diag_downright",
67  [VERT_RIGHT_PRED] = "vert_right",
68  [HOR_DOWN_PRED] = "hor_down",
69  [VERT_LEFT_PRED] = "vert_left",
70  [HOR_UP_PRED] = "hor_up",
71  [TM_VP8_PRED] = "tm",
72  [LEFT_DC_PRED] = "dc_left",
73  [TOP_DC_PRED] = "dc_top",
74  [DC_128_PRED] = "dc_128",
75  [DC_127_PRED] = "dc_127",
76  [DC_129_PRED] = "dc_129",
77  };
78 
79  for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
80  ff_vp9dsp_init(&dsp, bit_depth, 0);
81  for (tx = 0; tx < 4; tx++) {
82  int size = 4 << tx;
83 
84  for (mode = 0; mode < N_INTRA_PRED_MODES; mode++) {
85  if (check_func(dsp.intra_pred[tx][mode], "vp9_%s_%dx%d_%dbpp",
86  mode_names[mode], size, size, bit_depth)) {
88  call_ref(dst0, size * SIZEOF_PIXEL, l, a);
89  call_new(dst1, size * SIZEOF_PIXEL, l, a);
90  if (memcmp(dst0, dst1, size * size * SIZEOF_PIXEL))
91  fail();
92  bench_new(dst1, size * SIZEOF_PIXEL,l, a);
93  }
94  }
95  }
96  }
97  report("ipred");
98 }
99 
#undef randomize_buffers

/* Fill dst and src (sz x sz pixels each) with random in-range pixels and
 * store the per-pixel residual src - dst in coef: as int16_t at 8 bpp, as
 * int32_t otherwise. Expects src, dst, coef, sz, x, y and bit_depth to be
 * in scope at the expansion site. */
#define randomize_buffers()                                              \
    do {                                                                 \
        const uint32_t px_mask = pixel_mask[(bit_depth - 8) >> 1];       \
        for (y = 0; y < sz; y++) {                                       \
            for (x = 0; x < sz * SIZEOF_PIXEL; x += 4) {                 \
                const uint32_t dst_word = rnd() & px_mask;               \
                const uint32_t src_word = rnd() & px_mask;               \
                AV_WN32A(dst + y * sz * SIZEOF_PIXEL + x, dst_word);     \
                AV_WN32A(src + y * sz * SIZEOF_PIXEL + x, src_word);     \
            }                                                            \
            for (x = 0; x < sz; x++) {                                   \
                if (bit_depth != 8) {                                    \
                    ((int32_t *) coef)[y * sz + x] =                     \
                        ((uint16_t *) src)[y * sz + x] -                 \
                        ((uint16_t *) dst)[y * sz + x];                  \
                } else {                                                 \
                    coef[y * sz + x] = src[y * sz + x] - dst[y * sz + x]; \
                }                                                        \
            }                                                            \
        }                                                                \
    } while(0)
122 
123 // wht function copied from libvpx
124 static void fwht_1d(double *out, const double *in, int sz)
125 {
126  double t0 = in[0] + in[1];
127  double t3 = in[3] - in[2];
128  double t4 = trunc((t0 - t3) * 0.5);
129  double t1 = t4 - in[1];
130  double t2 = t4 - in[2];
131 
132  out[0] = t0 - t2;
133  out[1] = t2;
134  out[2] = t3 + t1;
135  out[3] = t1;
136 }
137 
138 // standard DCT-II
139 static void fdct_1d(double *out, const double *in, int sz)
140 {
141  int k, n;
142 
143  for (k = 0; k < sz; k++) {
144  out[k] = 0.0;
145  for (n = 0; n < sz; n++)
146  out[k] += in[n] * cos(M_PI * (2 * n + 1) * k / (sz * 2.0));
147  }
148  out[0] *= M_SQRT1_2;
149 }
150 
151 // see "Towards jointly optimal spatial prediction and adaptive transform in
152 // video/image coding", by J. Han, A. Saxena, and K. Rose
153 // IEEE Proc. ICASSP, pp. 726-729, Mar. 2010.
154 static void fadst4_1d(double *out, const double *in, int sz)
155 {
156  int k, n;
157 
158  for (k = 0; k < sz; k++) {
159  out[k] = 0.0;
160  for (n = 0; n < sz; n++)
161  out[k] += in[n] * sin(M_PI * (n + 1) * (2 * k + 1) / (sz * 2.0 + 1.0));
162  }
163 }
164 
165 // see "A Butterfly Structured Design of The Hybrid Transform Coding Scheme",
166 // by Jingning Han, Yaowu Xu, and Debargha Mukherjee
167 // http://static.googleusercontent.com/media/research.google.com/en//pubs/archive/41418.pdf
168 static void fadst_1d(double *out, const double *in, int sz)
169 {
170  int k, n;
171 
172  for (k = 0; k < sz; k++) {
173  out[k] = 0.0;
174  for (n = 0; n < sz; n++)
175  out[k] += in[n] * sin(M_PI * (2 * n + 1) * (2 * k + 1) / (sz * 4.0));
176  }
177 }
178 
179 typedef void (*ftx1d_fn)(double *out, const double *in, int sz);
180 static void ftx_2d(double *out, const double *in, enum TxfmMode tx,
181  enum TxfmType txtp, int sz)
182 {
183  static const double scaling_factors[5][4] = {
184  { 4.0, 16.0 * M_SQRT1_2 / 3.0, 16.0 * M_SQRT1_2 / 3.0, 32.0 / 9.0 },
185  { 2.0, 2.0, 2.0, 2.0 },
186  { 1.0, 1.0, 1.0, 1.0 },
187  { 0.25 },
188  { 4.0 }
189  };
190  static const ftx1d_fn ftx1d_tbl[5][4][2] = {
191  {
192  { fdct_1d, fdct_1d },
193  { fadst4_1d, fdct_1d },
194  { fdct_1d, fadst4_1d },
195  { fadst4_1d, fadst4_1d },
196  }, {
197  { fdct_1d, fdct_1d },
198  { fadst_1d, fdct_1d },
199  { fdct_1d, fadst_1d },
200  { fadst_1d, fadst_1d },
201  }, {
202  { fdct_1d, fdct_1d },
203  { fadst_1d, fdct_1d },
204  { fdct_1d, fadst_1d },
205  { fadst_1d, fadst_1d },
206  }, {
207  { fdct_1d, fdct_1d },
208  }, {
209  { fwht_1d, fwht_1d },
210  },
211  };
212  double temp[1024];
213  double scaling_factor = scaling_factors[tx][txtp];
214  int i, j;
215 
216  // cols
217  for (i = 0; i < sz; ++i) {
218  double temp_out[32];
219 
220  ftx1d_tbl[tx][txtp][0](temp_out, &in[i * sz], sz);
221  // scale and transpose
222  for (j = 0; j < sz; ++j)
223  temp[j * sz + i] = temp_out[j] * scaling_factor;
224  }
225 
226  // rows
227  for (i = 0; i < sz; i++)
228  ftx1d_tbl[tx][txtp][1](&out[i * sz], &temp[i * sz], sz);
229 }
230 
231 static void ftx(int16_t *buf, enum TxfmMode tx,
232  enum TxfmType txtp, int sz, int bit_depth)
233 {
234  double ind[1024], outd[1024];
235  int n;
236 
237  emms_c();
238  for (n = 0; n < sz * sz; n++) {
239  if (bit_depth == 8)
240  ind[n] = buf[n];
241  else
242  ind[n] = ((int32_t *) buf)[n];
243  }
244  ftx_2d(outd, ind, tx, txtp, sz);
245  for (n = 0; n < sz * sz; n++) {
246  if (bit_depth == 8)
247  buf[n] = lrint(outd[n]);
248  else
249  ((int32_t *) buf)[n] = lrint(outd[n]);
250  }
251 }
252 
253 static int copy_subcoefs(int16_t *out, const int16_t *in, enum TxfmMode tx,
254  enum TxfmType txtp, int sz, int sub, int bit_depth)
255 {
256  // copy the topleft coefficients such that the return value (being the
257  // coefficient scantable index for the eob token) guarantees that only
258  // the topleft $sub out of $sz (where $sz >= $sub) coefficients in both
259  // dimensions are non-zero. This leads to braching to specific optimized
260  // simd versions (e.g. dc-only) so that we get full asm coverage in this
261  // test
262 
263  int n;
264  const int16_t *scan = ff_vp9_scans[tx][txtp];
265  int eob;
266 
267  for (n = 0; n < sz * sz; n++) {
268  int rc = scan[n], rcx = rc % sz, rcy = rc / sz;
269 
270  // find eob for this sub-idct
271  if (rcx >= sub || rcy >= sub)
272  break;
273 
274  // copy coef
275  if (bit_depth == 8) {
276  out[rc] = in[rc];
277  } else {
278  AV_COPY32(&out[rc * 2], &in[rc * 2]);
279  }
280  }
281 
282  eob = n;
283 
284  for (; n < sz * sz; n++) {
285  int rc = scan[n];
286 
287  // zero
288  if (bit_depth == 8) {
289  out[rc] = 0;
290  } else {
291  AV_ZERO32(&out[rc * 2]);
292  }
293  }
294 
295  return eob;
296 }
297 
/*
 * Return 1 if the first sz bytes of c are all zero, 0 otherwise.
 * The buffer is scanned 32 bits (two int16_t) at a time with AV_RN32A.
 */
static int is_zero(const int16_t *c, int sz)
{
    const size_t n_coefs = sz / sizeof(int16_t);
    size_t i = 0;

    while (i < n_coefs) {
        if (AV_RN32A(&c[i]))
            return 0;
        i += 2;
    }

    return 1;
}
308 
309 #define SIZEOF_COEF (2 * ((bit_depth + 7) / 8))
310 
311 static void check_itxfm(void)
312 {
313  LOCAL_ALIGNED_32(uint8_t, src, [32 * 32 * 2]);
314  LOCAL_ALIGNED_32(uint8_t, dst, [32 * 32 * 2]);
315  LOCAL_ALIGNED_32(uint8_t, dst0, [32 * 32 * 2]);
316  LOCAL_ALIGNED_32(uint8_t, dst1, [32 * 32 * 2]);
317  LOCAL_ALIGNED_32(int16_t, coef, [32 * 32 * 2]);
318  LOCAL_ALIGNED_32(int16_t, subcoef0, [32 * 32 * 2]);
319  LOCAL_ALIGNED_32(int16_t, subcoef1, [32 * 32 * 2]);
320  declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
321  VP9DSPContext dsp;
322  int y, x, tx, txtp, bit_depth, sub;
323  static const char *const txtp_types[N_TXFM_TYPES] = {
324  [DCT_DCT] = "dct_dct", [DCT_ADST] = "adst_dct",
325  [ADST_DCT] = "dct_adst", [ADST_ADST] = "adst_adst"
326  };
327 
328  for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
329  ff_vp9dsp_init(&dsp, bit_depth, 0);
330 
331  for (tx = TX_4X4; tx <= N_TXFM_SIZES /* 4 = lossless */; tx++) {
332  int sz = 4 << (tx & 3);
333  int n_txtps = tx < TX_32X32 ? N_TXFM_TYPES : 1;
334 
335  for (txtp = 0; txtp < n_txtps; txtp++) {
336  // skip testing sub-IDCTs for WHT or ADST since they don't
337  // implement it in any of the SIMD functions. If they do,
338  // consider changing this to ensure we have complete test
339  // coverage. Test sub=1 for dc-only, then 2, 4, 8, 12, etc,
340  // since the arm version can distinguish them at that level.
341  for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz;
342  sub < 4 ? (sub <<= 1) : (sub += 4)) {
343  if (check_func(dsp.itxfm_add[tx][txtp],
344  "vp9_inv_%s_%dx%d_sub%d_add_%d",
345  tx == 4 ? "wht_wht" : txtp_types[txtp],
346  sz, sz, sub, bit_depth)) {
347  int eob;
348 
350  ftx(coef, tx, txtp, sz, bit_depth);
351 
352  if (sub < sz) {
353  eob = copy_subcoefs(subcoef0, coef, tx, txtp,
354  sz, sub, bit_depth);
355  } else {
356  eob = sz * sz;
357  memcpy(subcoef0, coef, sz * sz * SIZEOF_COEF);
358  }
359 
360  memcpy(dst0, dst, sz * sz * SIZEOF_PIXEL);
361  memcpy(dst1, dst, sz * sz * SIZEOF_PIXEL);
362  memcpy(subcoef1, subcoef0, sz * sz * SIZEOF_COEF);
363  call_ref(dst0, sz * SIZEOF_PIXEL, subcoef0, eob);
364  call_new(dst1, sz * SIZEOF_PIXEL, subcoef1, eob);
365  if (memcmp(dst0, dst1, sz * sz * SIZEOF_PIXEL) ||
366  !is_zero(subcoef0, sz * sz * SIZEOF_COEF) ||
367  !is_zero(subcoef1, sz * sz * SIZEOF_COEF))
368  fail();
369 
370  bench_new(dst, sz * SIZEOF_PIXEL, coef, eob);
371  }
372  }
373  }
374  }
375  }
376  report("itxfm");
377 }
378 
#undef randomize_buffers

/*
 * Store one clipped sample at index (pos) + (line) * jstride of buf0.
 * With 1-byte pixels the value is clipped by av_clip_uint8(); otherwise
 * buf0 is addressed as uint16_t and the value is clipped to bit_depth bits.
 * Expects buf0, jstride and bit_depth to be in scope at the expansion site.
 */
#define setpx(pos, line, val) \
    do { \
        if (SIZEOF_PIXEL != 1) { \
            ((uint16_t *)buf0)[(pos) + (line) * jstride] = av_clip_uintp2(val, bit_depth); \
        } else { \
            buf0[(pos) + (line) * jstride] = av_clip_uint8(val); \
        } \
    } while (0)

// val may be an assignment expression (e.g. "p0 = q0"), so it is deliberately
// left unparenthesized: the random deviation becomes part of the value that
// is assigned.
// setdx: store val plus a random offset in [-dev, +dev].
#define setdx(pos, line, val, dev) setpx(pos, line, val-(dev)+(rnd()%((dev)*2+1)))
// setsx: same as setdx, with dev scaled up by (bit_depth - 8) bits.
#define setsx(pos, line, val, dev) setdx(pos, line, val, (dev) << (bit_depth - 8))
/*
 * Fill eight lines across a loop-filter edge in buf0 with pixels chosen to
 * hit each filter path: two lines each for flat16, flat8, regular
 * filtering and "off" (fully random).
 *
 * bidx selects the entry of the E/F/I threshold arrays to use, lineoff is
 * the index of the first line to fill and str is the step between taps
 * when dir is non-zero. H and buf1 are accepted but not used here.
 * Offsets 0..7 lie on the q side of the edge and -1..-8 on the p side.
 */
static void randomize_loopfilter_buffers(int bidx, int lineoff, int str,
                                         int bit_depth, int dir, const int *E,
                                         const int *F, const int *H, const int *I,
                                         uint8_t *buf0, uint8_t *buf1)
{
    const uint32_t mask = (1 << bit_depth) - 1;
    /* istride steps between lines, jstride (read by setpx) between taps */
    const int istride = dir ? 1 : 16;
    const int jstride = dir ? str : 1;
    const int first   = dir ? lineoff : lineoff * 16;
    int line, tap;

    for (line = 0; line < 8; line++) {
        const int idx = first + line * istride;

        switch (line >> 1) {
        case 0: { /* flat16: all 8 taps on each side stay within F */
            int p0, q0;
            setpx(idx, 0, q0 = rnd() & mask);
            setsx(idx, -1, p0 = q0, E[bidx] >> 2);
            for (tap = 1; tap < 8; tap++) {
                setsx(idx, -1 - tap, p0, F[bidx]);
                setsx(idx, tap, q0, F[bidx]);
            }
            break;
        }
        case 1: { /* flat8: 4 taps within F, outer 4 fully random */
            int p0, q0;
            setpx(idx, 0, q0 = rnd() & mask);
            setsx(idx, -1, p0 = q0, E[bidx] >> 2);
            for (tap = 1; tap < 4; tap++) {
                setsx(idx, -1 - tap, p0, F[bidx]);
                setsx(idx, tap, q0, F[bidx]);
            }
            for (tap = 4; tap < 8; tap++) {
                setpx(idx, -1 - tap, rnd() & mask);
                setpx(idx, tap, rnd() & mask);
            }
            break;
        }
        case 2: { /* regular: neighbouring taps differ by at most I */
            int p2, p1, p0, q0, q1, q2;
            setpx(idx, 0, q0 = rnd() & mask);
            setsx(idx, 1, q1 = q0, I[bidx]);
            setsx(idx, 2, q2 = q1, I[bidx]);
            setsx(idx, 3, q2, I[bidx]);
            setsx(idx, -1, p0 = q0, E[bidx] >> 2);
            setsx(idx, -2, p1 = p0, I[bidx]);
            setsx(idx, -3, p2 = p1, I[bidx]);
            setsx(idx, -4, p2, I[bidx]);
            for (tap = 4; tap < 8; tap++) {
                setpx(idx, -1 - tap, rnd() & mask);
                setpx(idx, tap, rnd() & mask);
            }
            break;
        }
        default: /* off: every tap fully random */
            for (tap = 0; tap < 8; tap++) {
                setpx(idx, -1 - tap, rnd() & mask);
                setpx(idx, tap, rnd() & mask);
            }
            break;
        }
    }
}
/* Convenience wrapper that forwards the caller's bit_depth, dir, threshold
 * arrays and buffers, which must be in scope at the expansion site. */
#define randomize_buffers(set, line0, pitch) \
    randomize_loopfilter_buffers(set, line0, pitch, bit_depth, dir, \
                                 E, F, H, I, buf0, buf1)
451 
452 static void check_loopfilter(void)
453 {
454  LOCAL_ALIGNED_32(uint8_t, base0, [32 + 16 * 16 * 2]);
455  LOCAL_ALIGNED_32(uint8_t, base1, [32 + 16 * 16 * 2]);
456  VP9DSPContext dsp;
457  int dir, wd, wd2, bit_depth;
458  static const char *const dir_name[2] = { "h", "v" };
459  static const int E[2] = { 20, 28 }, I[2] = { 10, 16 };
460  static const int H[2] = { 7, 11 }, F[2] = { 1, 1 };
461  declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
462 
463  for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
464  ff_vp9dsp_init(&dsp, bit_depth, 0);
465 
466  for (dir = 0; dir < 2; dir++) {
467  int midoff = (dir ? 8 * 8 : 8) * SIZEOF_PIXEL;
468  int midoff_aligned = (dir ? 8 * 8 : 16) * SIZEOF_PIXEL;
469  uint8_t *buf0 = base0 + midoff_aligned;
470  uint8_t *buf1 = base1 + midoff_aligned;
471 
472  for (wd = 0; wd < 3; wd++) {
473  // 4/8/16wd_8px
474  if (check_func(dsp.loop_filter_8[wd][dir],
475  "vp9_loop_filter_%s_%d_8_%dbpp",
476  dir_name[dir], 4 << wd, bit_depth)) {
477  randomize_buffers(0, 0, 8);
478  memcpy(buf1 - midoff, buf0 - midoff,
479  16 * 8 * SIZEOF_PIXEL);
480  call_ref(buf0, 16 * SIZEOF_PIXEL >> dir, E[0], I[0], H[0]);
481  call_new(buf1, 16 * SIZEOF_PIXEL >> dir, E[0], I[0], H[0]);
482  if (memcmp(buf0 - midoff, buf1 - midoff, 16 * 8 * SIZEOF_PIXEL))
483  fail();
484  bench_new(buf1, 16 * SIZEOF_PIXEL >> dir, E[0], I[0], H[0]);
485  }
486  }
487 
488  midoff = (dir ? 16 * 8 : 8) * SIZEOF_PIXEL;
489  midoff_aligned = (dir ? 16 * 8 : 16) * SIZEOF_PIXEL;
490 
491  buf0 = base0 + midoff_aligned;
492  buf1 = base1 + midoff_aligned;
493 
494  // 16wd_16px loopfilter
495  if (check_func(dsp.loop_filter_16[dir],
496  "vp9_loop_filter_%s_16_16_%dbpp",
497  dir_name[dir], bit_depth)) {
498  randomize_buffers(0, 0, 16);
499  randomize_buffers(0, 8, 16);
500  memcpy(buf1 - midoff, buf0 - midoff, 16 * 16 * SIZEOF_PIXEL);
501  call_ref(buf0, 16 * SIZEOF_PIXEL, E[0], I[0], H[0]);
502  call_new(buf1, 16 * SIZEOF_PIXEL, E[0], I[0], H[0]);
503  if (memcmp(buf0 - midoff, buf1 - midoff, 16 * 16 * SIZEOF_PIXEL))
504  fail();
505  bench_new(buf1, 16 * SIZEOF_PIXEL, E[0], I[0], H[0]);
506  }
507 
508  for (wd = 0; wd < 2; wd++) {
509  for (wd2 = 0; wd2 < 2; wd2++) {
510  // mix2 loopfilter
511  if (check_func(dsp.loop_filter_mix2[wd][wd2][dir],
512  "vp9_loop_filter_mix2_%s_%d%d_16_%dbpp",
513  dir_name[dir], 4 << wd, 4 << wd2, bit_depth)) {
514  randomize_buffers(0, 0, 16);
515  randomize_buffers(1, 8, 16);
516  memcpy(buf1 - midoff, buf0 - midoff, 16 * 16 * SIZEOF_PIXEL);
517 #define M(a) (((a)[1] << 8) | (a)[0])
518  call_ref(buf0, 16 * SIZEOF_PIXEL, M(E), M(I), M(H));
519  call_new(buf1, 16 * SIZEOF_PIXEL, M(E), M(I), M(H));
520  if (memcmp(buf0 - midoff, buf1 - midoff, 16 * 16 * SIZEOF_PIXEL))
521  fail();
522  bench_new(buf1, 16 * SIZEOF_PIXEL, M(E), M(I), M(H));
523 #undef M
524  }
525  }
526  }
527  }
528  }
529  report("loopfilter");
530 }
531 
#undef setsx
#undef setpx
#undef setdx
#undef randomize_buffers

/* Geometry of the motion-compensation test buffers; these expand in terms
 * of size, bit_depth and buf at the point of use. */
/* bytes in one size x size destination block */
#define DST_BUF_SIZE (size * size * SIZEOF_PIXEL)
/* source row pitch, in pixels */
#define SRC_BUF_STRIDE 72
/* bytes in the source area: size + 7 rows of SRC_BUF_STRIDE pixels */
#define SRC_BUF_SIZE ((size + 7) * SRC_BUF_STRIDE * SIZEOF_PIXEL)
/* source pointer handed to the mc functions: 3 rows and 3 pixels into buf */
#define src (buf + 3 * SIZEOF_PIXEL * (SRC_BUF_STRIDE + 1))

/* Fill the source area with random in-range pixels. When op == 1, also
 * give dst0 and dst1 identical random contents. Expects buf, dst0, dst1,
 * op, size and bit_depth to be in scope at the expansion site. */
#define randomize_buffers()                                          \
    do {                                                             \
        const uint32_t px_mask = pixel_mask[(bit_depth - 8) >> 1];   \
        int byte;                                                    \
        for (byte = 0; byte < SRC_BUF_SIZE; byte += 4)               \
            AV_WN32A(buf + byte, rnd() & px_mask);                   \
        if (op == 1) {                                               \
            for (byte = 0; byte < DST_BUF_SIZE; byte += 4) {         \
                const uint32_t word = rnd() & px_mask;               \
                AV_WN32A(dst0 + byte, word);                         \
                AV_WN32A(dst1 + byte, word);                         \
            }                                                        \
        }                                                            \
    } while (0)
558 
559 static void check_mc(void)
560 {
561  LOCAL_ALIGNED_32(uint8_t, buf, [72 * 72 * 2]);
562  LOCAL_ALIGNED_32(uint8_t, dst0, [64 * 64 * 2]);
563  LOCAL_ALIGNED_32(uint8_t, dst1, [64 * 64 * 2]);
564  VP9DSPContext dsp;
565  int op, hsize, bit_depth, filter, dx, dy;
566  declare_func_emms(AV_CPU_FLAG_MMX | AV_CPU_FLAG_MMXEXT, void, uint8_t *dst, ptrdiff_t dst_stride,
567  const uint8_t *ref, ptrdiff_t ref_stride,
568  int h, int mx, int my);
569  static const char *const filter_names[4] = {
570  "8tap_smooth", "8tap_regular", "8tap_sharp", "bilin"
571  };
572  static const char *const subpel_names[2][2] = { { "", "h" }, { "v", "hv" } };
573  static const char *const op_names[2] = { "put", "avg" };
574  char str[256];
575 
576  for (op = 0; op < 2; op++) {
577  for (bit_depth = 8; bit_depth <= 12; bit_depth += 2) {
578  ff_vp9dsp_init(&dsp, bit_depth, 0);
579  for (hsize = 0; hsize < 5; hsize++) {
580  int size = 64 >> hsize;
581 
582  for (filter = 0; filter < 4; filter++) {
583  for (dx = 0; dx < 2; dx++) {
584  for (dy = 0; dy < 2; dy++) {
585  if (dx || dy) {
586  snprintf(str, sizeof(str),
587  "%s_%s_%d%s", op_names[op],
588  filter_names[filter], size,
589  subpel_names[dy][dx]);
590  } else {
591  snprintf(str, sizeof(str),
592  "%s%d", op_names[op], size);
593  }
594  if (check_func(dsp.mc[hsize][filter][op][dx][dy],
595  "vp9_%s_%dbpp", str, bit_depth)) {
596  int mx = dx ? 1 + (rnd() % 14) : 0;
597  int my = dy ? 1 + (rnd() % 14) : 0;
599  call_ref(dst0, size * SIZEOF_PIXEL,
601  size, mx, my);
602  call_new(dst1, size * SIZEOF_PIXEL,
604  size, mx, my);
605  if (memcmp(dst0, dst1, DST_BUF_SIZE))
606  fail();
607 
608  // simd implementations for each filter of subpel
609  // functions are identical
610  if (filter >= 1 && filter <= 2) continue;
611  // 10/12 bpp for bilin are identical
612  if (bit_depth == 12 && filter == 3) continue;
613 
614  bench_new(dst1, size * SIZEOF_PIXEL,
616  size, mx, my);
617  }
618  }
619  }
620  }
621  }
622  }
623  }
624  report("mc");
625 }
626 
/* Run all VP9 DSP checks defined in this file, one group after another. */
void checkasm_check_vp9dsp(void)
{
    check_ipred();
    check_itxfm();
    check_loopfilter();
    check_mc();
}
declare_func_emms
#define declare_func_emms(cpu_flags, ret,...)
Definition: checkasm.h:190
q1
static const uint8_t q1[256]
Definition: twofish.c:100
fwht_1d
static void fwht_1d(double *out, const double *in, int sz)
Definition: vp9dsp.c:124
setpx
#define setpx(a, b, c)
Definition: vp9dsp.c:381
mem_internal.h
DC_128_PRED
@ DC_128_PRED
Definition: vp9.h:58
out
FILE * out
Definition: movenc.c:55
N_TXFM_TYPES
@ N_TXFM_TYPES
Definition: vp9.h:42
mask
int mask
Definition: mediacodecdec_common.c:154
VP9DSPContext::loop_filter_8
void(* loop_filter_8[3][2])(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)
Definition: vp9dsp.h:81
mode
Definition: swscale.c:52
TM_VP8_PRED
@ TM_VP8_PRED
Definition: vp9.h:55
DC_PRED
@ DC_PRED
Definition: vp9.h:48
VP9DSPContext
Definition: vp9dsp.h:40
check_func
#define check_func(func,...)
Definition: checkasm.h:184
randomize_loopfilter_buffers
static void randomize_loopfilter_buffers(int bidx, int lineoff, int str, int bit_depth, int dir, const int *E, const int *F, const int *H, const int *I, uint8_t *buf0, uint8_t *buf1)
Definition: vp9dsp.c:393
VERT_LEFT_PRED
@ VERT_LEFT_PRED
Definition: vp9.h:53
fadst4_1d
static void fadst4_1d(double *out, const double *in, int sz)
Definition: vp9dsp.c:154
F
#define F(x)
filter
void(* filter)(uint8_t *src, int stride, int qscale)
Definition: h263dsp.c:29
mathematics.h
call_ref
#define call_ref(...)
Definition: checkasm.h:199
bit_depth
static void bit_depth(AudioStatsContext *s, const uint64_t *const mask, uint8_t *depth)
Definition: af_astats.c:246
check_itxfm
static void check_itxfm(void)
Definition: vp9dsp.c:311
N_TXFM_SIZES
@ N_TXFM_SIZES
Definition: vp9.h:32
mx
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t mx
Definition: dsp.h:53
DC_127_PRED
@ DC_127_PRED
Definition: vp9.h:59
VP9DSPContext::loop_filter_mix2
void(* loop_filter_mix2[2][2][2])(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)
Definition: vp9dsp.h:103
fail
#define fail()
Definition: checkasm.h:193
VERT_PRED
@ VERT_PRED
Definition: vp9.h:46
trunc
static __device__ float trunc(float a)
Definition: cuda_runtime.h:179
SIZEOF_PIXEL
#define SIZEOF_PIXEL
Definition: vp9dsp.c:34
checkasm.h
DIAG_DOWN_RIGHT_PRED
@ DIAG_DOWN_RIGHT_PRED
Definition: vp9.h:50
copy_subcoefs
static int copy_subcoefs(int16_t *out, const int16_t *in, enum TxfmMode tx, enum TxfmType txtp, int sz, int sub, int bit_depth)
Definition: vp9dsp.c:253
check_mc
static void check_mc(void)
Definition: vp9dsp.c:559
fadst_1d
static void fadst_1d(double *out, const double *in, int sz)
Definition: vp9dsp.c:168
setsx
#define setsx(a, b, c, d)
Definition: vp9dsp.c:392
lrint
#define lrint
Definition: tablegen.h:53
rnd
#define rnd()
Definition: checkasm.h:177
SIZEOF_COEF
#define SIZEOF_COEF
Definition: vp9dsp.c:309
HOR_PRED
@ HOR_PRED
Definition: vp9.h:47
emms_c
#define emms_c()
Definition: emms.h:63
intreadwrite.h
src
#define src
Definition: vp9dsp.c:540
AV_ZERO32
#define AV_ZERO32(d)
Definition: intreadwrite.h:662
op
static int op(uint8_t **dst, const uint8_t *dst_end, GetByteContext *gb, int pixel, int count, int *x, int width, int linesize)
Perform decode operation.
Definition: anm.c:76
ff_vp9_scans
const int16_t *const ff_vp9_scans[5][4]
Definition: vp9data.c:600
vp9data.h
SRC_BUF_STRIDE
#define SRC_BUF_STRIDE
Definition: vp9dsp.c:538
LEFT_DC_PRED
@ LEFT_DC_PRED
Definition: vp9.h:56
check_loopfilter
static void check_loopfilter(void)
Definition: vp9dsp.c:452
ff_vp9dsp_init
av_cold void ff_vp9dsp_init(VP9DSPContext *dsp, int bpp, int bitexact)
Definition: vp9dsp.c:88
ftx_2d
static void ftx_2d(double *out, const double *in, enum TxfmMode tx, enum TxfmType txtp, int sz)
Definition: vp9dsp.c:180
ftx
static void ftx(int16_t *buf, enum TxfmMode tx, enum TxfmType txtp, int sz, int bit_depth)
Definition: vp9dsp.c:231
q0
static const uint8_t q0[256]
Definition: twofish.c:81
E
#define E
Definition: avdct.c:33
my
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t my
Definition: dsp.h:53
DCT_ADST
@ DCT_ADST
Definition: vp9.h:39
call_new
#define call_new(...)
Definition: checkasm.h:302
LOCAL_ALIGNED_32
#define LOCAL_ALIGNED_32(t, v,...)
Definition: mem_internal.h:132
M
#define M(a)
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
VP9DSPContext::itxfm_add
void(* itxfm_add[N_TXFM_SIZES+1][N_TXFM_TYPES])(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob)
Definition: vp9dsp.h:71
VP9DSPContext::intra_pred
void(* intra_pred[N_TXFM_SIZES][N_INTRA_PRED_MODES])(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, const uint8_t *top)
Definition: vp9dsp.h:52
TxfmMode
TxfmMode
Definition: vp9.h:27
vp9.h
DCT_DCT
@ DCT_DCT
Definition: vp9.h:38
TxfmType
TxfmType
Definition: vp9.h:37
pixel_mask
static const uint32_t pixel_mask[3]
Definition: vp9dsp.c:33
N_INTRA_PRED_MODES
@ N_INTRA_PRED_MODES
Definition: vp9.h:61
dst
uint8_t ptrdiff_t const uint8_t ptrdiff_t int intptr_t intptr_t int int16_t * dst
Definition: dsp.h:83
size
int size
Definition: twinvq_data.h:10344
VERT_RIGHT_PRED
@ VERT_RIGHT_PRED
Definition: vp9.h:51
VP9DSPContext::mc
vp9_mc_func mc[5][N_FILTERS][2][2][2]
Definition: vp9dsp.h:115
TX_4X4
@ TX_4X4
Definition: vp9.h:28
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
H
#define H
Definition: pixlet.c:39
M_PI
#define M_PI
Definition: mathematics.h:67
ftx1d_fn
void(* ftx1d_fn)(double *out, const double *in, int sz)
Definition: vp9dsp.c:179
emms.h
randomize_buffers
#define randomize_buffers()
Definition: vp9dsp.c:542
report
#define report
Definition: checkasm.h:196
is_zero
static int is_zero(const int16_t *c, int sz)
Definition: vp9dsp.c:298
bench_new
#define bench_new(...)
Definition: checkasm.h:373
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:256
fdct_1d
static void fdct_1d(double *out, const double *in, int sz)
Definition: vp9dsp.c:139
DC_129_PRED
@ DC_129_PRED
Definition: vp9.h:60
internal.h
common.h
ADST_ADST
@ ADST_ADST
Definition: vp9.h:41
AV_COPY32
#define AV_COPY32(d, s)
Definition: intreadwrite.h:634
AV_RN32A
#define AV_RN32A(p)
Definition: intreadwrite.h:522
stride
#define stride
Definition: h264pred_template.c:536
left
Tag MUST be and< 10hcoeff half pel interpolation filter coefficients, hcoeff[0] are the 2 middle coefficients[1] are the next outer ones and so on, resulting in a filter like:...eff[2], hcoeff[1], hcoeff[0], hcoeff[0], hcoeff[1], hcoeff[2] ... the sign of the coefficients is not explicitly stored but alternates after each coeff and coeff[0] is positive, so ...,+,-,+,-,+,+,-,+,-,+,... hcoeff[0] is not explicitly stored but found by subtracting the sum of all stored coefficients with signs from 32 hcoeff[0]=32 - hcoeff[1] - hcoeff[2] - ... a good choice for hcoeff and htaps is htaps=6 hcoeff={40,-10, 2} an alternative which requires more computations at both encoder and decoder side and may or may not be better is htaps=8 hcoeff={42,-14, 6,-2}ref_frames minimum of the number of available reference frames and max_ref_frames for example the first frame after a key frame always has ref_frames=1spatial_decomposition_type wavelet type 0 is a 9/7 symmetric compact integer wavelet 1 is a 5/3 symmetric compact integer wavelet others are reserved stored as delta from last, last is reset to 0 if always_reset||keyframeqlog quality(logarithmic quantizer scale) stored as delta from last, last is reset to 0 if always_reset||keyframemv_scale stored as delta from last, last is reset to 0 if always_reset||keyframe FIXME check that everything works fine if this changes between framesqbias dequantization bias stored as delta from last, last is reset to 0 if always_reset||keyframeblock_max_depth maximum depth of the block tree stored as delta from last, last is reset to 0 if always_reset||keyframequant_table quantization tableHighlevel bitstream structure:==============================--------------------------------------------|Header|--------------------------------------------|------------------------------------|||Block0||||split?||||yes no||||......... intra?||||:Block01 :yes no||||:Block02 :....... 
..........||||:Block03 ::y DC ::ref index:||||:Block04 ::cb DC ::motion x :||||......... :cr DC ::motion y :||||....... ..........|||------------------------------------||------------------------------------|||Block1|||...|--------------------------------------------|------------ ------------ ------------|||Y subbands||Cb subbands||Cr subbands||||--- ---||--- ---||--- ---|||||LL0||HL0||||LL0||HL0||||LL0||HL0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||LH0||HH0||||LH0||HH0||||LH0||HH0|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HL1||LH1||||HL1||LH1||||HL1||LH1|||||--- ---||--- ---||--- ---||||--- ---||--- ---||--- ---|||||HH1||HL2||||HH1||HL2||||HH1||HL2|||||...||...||...|||------------ ------------ ------------|--------------------------------------------Decoding process:=================------------|||Subbands|------------||||------------|Intra DC||||LL0 subband prediction ------------|\ Dequantization ------------------- \||Reference frames|\ IDWT|------- -------|Motion \|||Frame 0||Frame 1||Compensation . OBMC v -------|------- -------|--------------. \------> Frame n output Frame Frame<----------------------------------/|...|------------------- Range Coder:============Binary Range Coder:------------------- The implemented range coder is an adapted version based upon "Range encoding: an algorithm for removing redundancy from a digitised message." by G. N. N. Martin. The symbols encoded by the Snow range coder are bits(0|1). The associated probabilities are not fix but change depending on the symbol mix seen so far. 
bit seen|new state ---------+----------------------------------------------- 0|256 - state_transition_table[256 - old_state];1|state_transition_table[old_state];state_transition_table={ 0, 0, 0, 0, 0, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 190, 191, 192, 194, 194, 195, 196, 197, 198, 199, 200, 201, 202, 202, 204, 205, 206, 207, 208, 209, 209, 210, 211, 212, 213, 215, 215, 216, 217, 218, 219, 220, 220, 222, 223, 224, 225, 226, 227, 227, 229, 229, 230, 231, 232, 234, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 248, 0, 0, 0, 0, 0, 0, 0};FIXME Range Coding of integers:------------------------- FIXME Neighboring Blocks:===================left and top are set to the respective blocks unless they are outside of the image in which case they are set to the Null block top-left is set to the top left block unless it is outside of the image in which case it is set to the left block if this block has no larger parent block or it is at the left side of its parent block and the top right block is not outside of the image then the top right block is used for top-right else the top-left block is used Null block y, cb, cr are 128 level, ref, mx and my are 0 Motion Vector 
Prediction:=========================1. the motion vectors of all the neighboring blocks are scaled to compensate for the difference of reference frames scaled_mv=(mv *(256 *(current_reference+1)/(mv.reference+1))+128)> the median of the scaled left
Definition: snow.txt:386
M_SQRT1_2
#define M_SQRT1_2
Definition: mathematics.h:103
AV_CPU_FLAG_MMX
#define AV_CPU_FLAG_MMX
standard MMX
Definition: cpu.h:30
HOR_UP_PRED
@ HOR_UP_PRED
Definition: vp9.h:54
mode
mode
Definition: ebur128.h:83
ref
static int ref[MAX_W *MAX_W]
Definition: jpeg2000dwt.c:117
temp
else temp
Definition: vf_mcdeint.c:263
HOR_DOWN_PRED
@ HOR_DOWN_PRED
Definition: vp9.h:52
AV_CPU_FLAG_MMXEXT
#define AV_CPU_FLAG_MMXEXT
SSE integer functions or AMD MMX ext.
Definition: cpu.h:31
TX_32X32
@ TX_32X32
Definition: vp9.h:31
TOP_DC_PRED
@ TOP_DC_PRED
Definition: vp9.h:57
ADST_DCT
@ ADST_DCT
Definition: vp9.h:40
DIAG_DOWN_LEFT_PRED
@ DIAG_DOWN_LEFT_PRED
Definition: vp9.h:49
int32_t
int32_t
Definition: audioconvert.c:56
block
The exact code depends on how similar the blocks are and how related they are to the block
Definition: filter_design.txt:207
h
h
Definition: vp9dsp_template.c:2070
VP9DSPContext::loop_filter_16
void(* loop_filter_16[2])(uint8_t *dst, ptrdiff_t stride, int mb_lim, int lim, int hev_thr)
Definition: vp9dsp.h:89
snprintf
#define snprintf
Definition: snprintf.h:34
checkasm_check_vp9dsp
void checkasm_check_vp9dsp(void)
Definition: vp9dsp.c:627
check_ipred
static void check_ipred(void)
Definition: vp9dsp.c:50
DST_BUF_SIZE
#define DST_BUF_SIZE
Definition: vp9dsp.c:537