/*
 * VP9 compatible video decoder
 *
 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
 * Copyright (C) 2013 Clément Bœsch <u pkh me>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "avcodec.h"
#include "get_bits.h"
#include "internal.h"
#include "thread.h"
#include "videodsp.h"
#include "vp56.h"
#include "vp9.h"
#include "vp9data.h"
#include "vp9dsp.h"
#include "libavutil/avassert.h"

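// 24-bit sync code (bytes 0x49 0x83 0x42) that prefixes the uncompressed
// header of keyframes and intra-only frames; checked below via get_bits_long()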
#define VP9_SYNCCODE 0x498342

enum CompPredMode {
    PRED_SINGLEREF,
    PRED_COMPREF,
    PRED_SWITCHABLE,
};

enum BlockLevel {
    BL_64X64,
    BL_32X32,
    BL_16X16,
    BL_8X8,
};

enum BlockSize {
    BS_64x64,
    BS_64x32,
    BS_32x64,
    BS_32x32,
    BS_32x16,
    BS_16x32,
    BS_16x16,
    BS_16x8,
    BS_8x16,
    BS_8x8,
    BS_8x4,
    BS_4x8,
    BS_4x4,
    N_BS_SIZES,
};

struct VP9mvrefPair {
    VP56mv mv[2];
    int8_t ref[2];
};

typedef struct VP9Frame {
    ThreadFrame tf;
    AVBufferRef *extradata;
    uint8_t *segmentation_map;
    struct VP9mvrefPair *mv;
} VP9Frame;

struct VP9Filter {
    uint8_t level[8 * 8];
    uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
                              [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
};

typedef struct VP9Block {
    uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
    enum FilterMode filter;
    VP56mv mv[4 /* b_idx */][2 /* ref */];
    enum BlockSize bs;
    enum TxfmMode tx, uvtx;
    enum BlockLevel bl;
} VP9Block;

typedef struct VP9Context {
    VP9DSPContext dsp;
    VideoDSPContext vdsp;
    GetBitContext gb;
    VP56RangeCoder c;
    VP56RangeCoder *c_b;
    unsigned c_b_size;
    VP9Block *b_base, *b;
    int pass, uses_2pass, last_uses_2pass;
    int row, row7, col, col7;
    ptrdiff_t y_stride, uv_stride;

    // bitstream header
    uint8_t profile;
    uint8_t keyframe, last_keyframe;
    uint8_t invisible;
    uint8_t use_last_frame_mvs;
    uint8_t errorres;
    uint8_t colorspace;
    uint8_t fullrange;
    uint8_t intraonly;
    uint8_t resetctx;
    uint8_t refreshrefmask;
    uint8_t highprecisionmvs;
    enum FilterMode filtermode;
    uint8_t allowcompinter;
    uint8_t fixcompref;
    uint8_t refreshctx;
    uint8_t parallelmode;
    uint8_t framectxid;
    uint8_t refidx[3];
    uint8_t signbias[3];
    uint8_t varcompref[2];
    ThreadFrame refs[8];
#define CUR_FRAME 0
#define LAST_FRAME 1
    VP9Frame frames[2];

    struct {
        uint8_t level;
        int8_t sharpness;
        uint8_t lim_lut[64];
        uint8_t mblim_lut[64];
    } filter;
    struct {
        uint8_t enabled;
        int8_t mode[2];
        int8_t ref[4];
    } lf_delta;
    uint8_t yac_qi;
    int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
    uint8_t lossless;
    struct {
        uint8_t enabled;
        uint8_t temporal;
        uint8_t absolute_vals;
        uint8_t update_map;
        struct {
            uint8_t q_enabled;
            uint8_t lf_enabled;
            uint8_t ref_enabled;
            uint8_t skip_enabled;
            uint8_t ref_val;
            int16_t q_val;
            int8_t lf_val;
            int16_t qmul[2][2];
            uint8_t lflvl[4][2];
        } feat[8];
    } segmentation;
    struct {
        unsigned log2_tile_cols, log2_tile_rows;
        unsigned tile_cols, tile_rows;
        unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
    } tiling;
    unsigned sb_cols, sb_rows, rows, cols;
    struct {
        prob_context p;
        uint8_t coef[4][2][2][6][6][3];
    } prob_ctx[4];
    struct {
        prob_context p;
        uint8_t coef[4][2][2][6][6][11];
        uint8_t seg[7];
        uint8_t segpred[3];
    } prob;
    struct {
        unsigned y_mode[4][10];
        unsigned uv_mode[10][10];
        unsigned filter[4][3];
        unsigned mv_mode[7][4];
        unsigned intra[4][2];
        unsigned comp[5][2];
        unsigned single_ref[5][2][2];
        unsigned comp_ref[5][2];
        unsigned tx32p[2][4];
        unsigned tx16p[2][3];
        unsigned tx8p[2][2];
        unsigned skip[3][2];
        unsigned mv_joint[4];
        struct {
            unsigned sign[2];
            unsigned classes[11];
            unsigned class0[2];
            unsigned bits[10][2];
            unsigned class0_fp[2][4];
            unsigned fp[4];
            unsigned class0_hp[2];
            unsigned hp[2];
        } mv_comp[2];
        unsigned partition[4][4][4];
        unsigned coef[4][2][2][6][6][3];
        unsigned eob[4][2][2][6][6][2];
    } counts;
    enum TxfmMode txfmmode;
    enum CompPredMode comppredmode;

    // contextual (left/above) cache
    DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
    DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
    DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
    DECLARE_ALIGNED(8, uint8_t, left_uv_nnz_ctx)[2][8];
    DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
    DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
    uint8_t *above_partition_ctx;
    uint8_t *above_mode_ctx;
    // FIXME maybe merge some of the below in a flags field?
    uint8_t *above_y_nnz_ctx;
    uint8_t *above_uv_nnz_ctx[2];
    uint8_t *above_skip_ctx; // 1bit
    uint8_t *above_txfm_ctx; // 2bit
    uint8_t *above_segpred_ctx; // 1bit
    uint8_t *above_intra_ctx; // 1bit
    uint8_t *above_comp_ctx; // 1bit
    uint8_t *above_ref_ctx; // 2bit
    uint8_t *above_filter_ctx;
    VP56mv (*above_mv_ctx)[2];

    // whole-frame cache
    uint8_t *intra_pred_data[3];
    struct VP9Filter *lflvl;

    // block reconstruction intermediates
    int block_alloc_using_2pass;
    int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
    uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
    struct { int x, y; } min_mv, max_mv;
    DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
} VP9Context;

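// per-blocksize block width/height, in units of 4x4 blocks (row 0) and 8x8
// blocks (row 1); e.g. BS_64x64 is { 16, 16 } and { 8, 8 }; the sub-8x8
// sizes clamp to 1 in the 8x8-unit row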
static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
    {
        { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
        { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
    }, {
        { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
        { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
    }
};

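// Frame extradata layout: 64 * sb_cols * sb_rows bytes of segmentation map
// (one byte per 8x8 block), followed by one struct VP9mvrefPair (two MVs
// plus two reference indices) per 8x8 block.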
static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
{
    VP9Context *s = ctx->priv_data;
    int ret, sz;

    if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
        return ret;
    sz = 64 * s->sb_cols * s->sb_rows;
    if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
        ff_thread_release_buffer(ctx, &f->tf);
        return AVERROR(ENOMEM);
    }

    f->segmentation_map = f->extradata->data;
    f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);

    // retain segmentation map if it doesn't update
    if (s->segmentation.enabled && !s->segmentation.update_map &&
        !s->intraonly && !s->keyframe && !s->errorres) {
        memcpy(f->segmentation_map, s->frames[CUR_FRAME].segmentation_map, sz);
    }

    return 0;
}

static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
{
    ff_thread_release_buffer(ctx, &f->tf);
    av_buffer_unref(&f->extradata);
}

static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
{
    int res;

    if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
        return res;
    } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
        vp9_unref_frame(ctx, dst);
        return AVERROR(ENOMEM);
    }

    dst->segmentation_map = src->segmentation_map;
    dst->mv = src->mv;

    return 0;
}

static int update_size(AVCodecContext *ctx, int w, int h)
{
    VP9Context *s = ctx->priv_data;
    uint8_t *p;

    av_assert0(w > 0 && h > 0);

    if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height)
        return 0;

    ctx->width = w;
    ctx->height = h;
    s->sb_cols = (w + 63) >> 6;
    s->sb_rows = (h + 63) >> 6;
    s->cols = (w + 7) >> 3;
    s->rows = (h + 7) >> 3;

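    // carve one contiguous allocation into the per-superblock-column "above"
    // context arrays; assign() hands out successive slices of p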
#define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
    av_freep(&s->intra_pred_data[0]);
    p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
    if (!p)
        return AVERROR(ENOMEM);
    assign(s->intra_pred_data[0],  uint8_t *,             64);
    assign(s->intra_pred_data[1],  uint8_t *,             32);
    assign(s->intra_pred_data[2],  uint8_t *,             32);
    assign(s->above_y_nnz_ctx,     uint8_t *,             16);
    assign(s->above_mode_ctx,      uint8_t *,             16);
    assign(s->above_mv_ctx,        VP56mv(*)[2],          16);
    assign(s->above_partition_ctx, uint8_t *,              8);
    assign(s->above_skip_ctx,      uint8_t *,              8);
    assign(s->above_txfm_ctx,      uint8_t *,              8);
    assign(s->above_uv_nnz_ctx[0], uint8_t *,              8);
    assign(s->above_uv_nnz_ctx[1], uint8_t *,              8);
    assign(s->above_segpred_ctx,   uint8_t *,              8);
    assign(s->above_intra_ctx,     uint8_t *,              8);
    assign(s->above_comp_ctx,      uint8_t *,              8);
    assign(s->above_ref_ctx,       uint8_t *,              8);
    assign(s->above_filter_ctx,    uint8_t *,              8);
    assign(s->lflvl,               struct VP9Filter *,     1);
#undef assign

    // these will be re-allocated a little later
    av_freep(&s->b_base);
    av_freep(&s->block_base);

    return 0;
}

static int update_block_buffers(AVCodecContext *ctx)
{
    VP9Context *s = ctx->priv_data;

    if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->uses_2pass)
        return 0;

    av_free(s->b_base);
    av_free(s->block_base);
    if (s->uses_2pass) {
        int sbs = s->sb_cols * s->sb_rows;

        s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
        s->block_base = av_mallocz((64 * 64 + 128) * sbs * 3);
        if (!s->b_base || !s->block_base)
            return AVERROR(ENOMEM);
        s->uvblock_base[0] = s->block_base + sbs * 64 * 64;
        s->uvblock_base[1] = s->uvblock_base[0] + sbs * 32 * 32;
        s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * 32 * 32);
        s->uveob_base[0] = s->eob_base + 256 * sbs;
        s->uveob_base[1] = s->uveob_base[0] + 64 * sbs;
    } else {
        s->b_base = av_malloc(sizeof(VP9Block));
        s->block_base = av_mallocz((64 * 64 + 128) * 3);
        if (!s->b_base || !s->block_base)
            return AVERROR(ENOMEM);
        s->uvblock_base[0] = s->block_base + 64 * 64;
        s->uvblock_base[1] = s->uvblock_base[0] + 32 * 32;
        s->eob_base = (uint8_t *) (s->uvblock_base[1] + 32 * 32);
        s->uveob_base[0] = s->eob_base + 256;
        s->uveob_base[1] = s->uveob_base[0] + 64;
    }
    s->block_alloc_using_2pass = s->uses_2pass;

    return 0;
}

// for some reason the sign bit is at the end, not the start, of a bit sequence
static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
{
    int v = get_bits(gb, n);
    return get_bits1(gb) ? -v : v;
}
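// e.g. for n = 4, the bits 0101 followed by sign bit 1 decode to -5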

static av_always_inline int inv_recenter_nonneg(int v, int m)
{
    return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
}
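// folds a non-negative delta v back around the old value m: e.g. for m = 10,
// v = 0, 1, 2, 3, 4 map to 10, 9, 11, 8, 12 (odd v steps below m, even v
// steps above it), while v > 2 * m is returned as-is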

// differential forward probability updates
static int update_prob(VP56RangeCoder *c, int p)
{
    static const int inv_map_table[254] = {
          7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176,
        189, 202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,
         10,  11,  12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,
         25,  26,  27,  28,  29,  30,  31,  32,  34,  35,  36,  37,  38,  39,
         40,  41,  42,  43,  44,  45,  47,  48,  49,  50,  51,  52,  53,  54,
         55,  56,  57,  58,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
         70,  71,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
         86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  99, 100,
        101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
        116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
        131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
        146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
        161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
        177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
        192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
        207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
        222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
        237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
        252, 253,
    };
    int d;

    /* This code is trying to do a differential probability update. For a
     * current probability A in the range [1, 255], the difference to a new
     * probability of any value can be expressed differentially as 1-A, 255-A
     * where some part of this (absolute range) exists both in positive as
     * well as the negative part, whereas another part only exists in one
     * half. We're trying to code this shared part differentially, i.e.
     * times two where the value of the lowest bit specifies the sign, and
     * the single part is then coded on top of this. This absolute difference
     * then again has a value of [0, 254], but a bigger value in this range
     * indicates that we're further away from the original value A, so we
     * can code this as a VLC code, since higher values are increasingly
     * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
     * updates vs. the 'fine, exact' updates further down the range, which
     * adds one extra dimension to this differential update model. */

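    /* The VLC below codes d in four ranges: prefix 0 plus 4 bits for [0, 16),
     * prefix 10 plus 4 bits for [16, 32), prefix 110 plus 5 bits for [32, 64),
     * and prefix 111 plus 7 (sometimes 8) bits for the rest; e.g. the bits
     * 0|0101 decode to d = 5, and 110|00011 to d = 32 + 3 = 35. */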
    if (!vp8_rac_get(c)) {
        d = vp8_rac_get_uint(c, 4) + 0;
    } else if (!vp8_rac_get(c)) {
        d = vp8_rac_get_uint(c, 4) + 16;
    } else if (!vp8_rac_get(c)) {
        d = vp8_rac_get_uint(c, 5) + 32;
    } else {
        d = vp8_rac_get_uint(c, 7);
        if (d >= 65)
            d = (d << 1) - 65 + vp8_rac_get(c);
        d += 64;
    }

    return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
                    255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
}

static int decode_frame_header(AVCodecContext *ctx,
                               const uint8_t *data, int size, int *ref)
{
    VP9Context *s = ctx->priv_data;
    int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
    int last_invisible;
    const uint8_t *data2;

    /* general header */
    if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
        av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
        return res;
    }
    if (get_bits(&s->gb, 2) != 0x2) { // frame marker
        av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
        return AVERROR_INVALIDDATA;
    }
    s->profile = get_bits1(&s->gb);
    if (get_bits1(&s->gb)) { // reserved bit
        av_log(ctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
        return AVERROR_INVALIDDATA;
    }
    if (get_bits1(&s->gb)) {
        *ref = get_bits(&s->gb, 3);
        return 0;
    }
    s->last_uses_2pass = s->uses_2pass;
    s->last_keyframe = s->keyframe;
    s->keyframe = !get_bits1(&s->gb);
    last_invisible = s->invisible;
    s->invisible = !get_bits1(&s->gb);
    s->errorres = get_bits1(&s->gb);
    s->use_last_frame_mvs = !s->errorres && !last_invisible;
    if (s->keyframe) {
        if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
            av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
            return AVERROR_INVALIDDATA;
        }
        s->colorspace = get_bits(&s->gb, 3);
        if (s->colorspace == 7) { // RGB = profile 1
            av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
            return AVERROR_INVALIDDATA;
        }
        s->fullrange = get_bits1(&s->gb);
        // for profile 1, here follows the subsampling bits
        s->refreshrefmask = 0xff;
        w = get_bits(&s->gb, 16) + 1;
        h = get_bits(&s->gb, 16) + 1;
        if (get_bits1(&s->gb)) // display size
            skip_bits(&s->gb, 32);
    } else {
        s->intraonly = s->invisible ? get_bits1(&s->gb) : 0;
        s->resetctx = s->errorres ? 0 : get_bits(&s->gb, 2);
        if (s->intraonly) {
            if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
                av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
                return AVERROR_INVALIDDATA;
            }
            s->refreshrefmask = get_bits(&s->gb, 8);
            w = get_bits(&s->gb, 16) + 1;
            h = get_bits(&s->gb, 16) + 1;
            if (get_bits1(&s->gb)) // display size
                skip_bits(&s->gb, 32);
        } else {
            s->refreshrefmask = get_bits(&s->gb, 8);
            s->refidx[0] = get_bits(&s->gb, 3);
            s->signbias[0] = get_bits1(&s->gb);
            s->refidx[1] = get_bits(&s->gb, 3);
            s->signbias[1] = get_bits1(&s->gb);
            s->refidx[2] = get_bits(&s->gb, 3);
            s->signbias[2] = get_bits1(&s->gb);
            if (!s->refs[s->refidx[0]].f->data[0] ||
                !s->refs[s->refidx[1]].f->data[0] ||
                !s->refs[s->refidx[2]].f->data[0]) {
                av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
                return AVERROR_INVALIDDATA;
            }
            if (get_bits1(&s->gb)) {
                w = s->refs[s->refidx[0]].f->width;
                h = s->refs[s->refidx[0]].f->height;
            } else if (get_bits1(&s->gb)) {
                w = s->refs[s->refidx[1]].f->width;
                h = s->refs[s->refidx[1]].f->height;
            } else if (get_bits1(&s->gb)) {
                w = s->refs[s->refidx[2]].f->width;
                h = s->refs[s->refidx[2]].f->height;
            } else {
                w = get_bits(&s->gb, 16) + 1;
                h = get_bits(&s->gb, 16) + 1;
            }
            // Note that in this code, "CUR_FRAME" is actually before we
            // have formally allocated a frame, and thus actually represents
            // the _last_ frame
            s->use_last_frame_mvs &= s->frames[CUR_FRAME].tf.f->width == w &&
                                     s->frames[CUR_FRAME].tf.f->height == h;
            if (get_bits1(&s->gb)) // display size
                skip_bits(&s->gb, 32);
            s->highprecisionmvs = get_bits1(&s->gb);
            s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
                                                get_bits(&s->gb, 2);
            s->allowcompinter = s->signbias[0] != s->signbias[1] ||
                                s->signbias[0] != s->signbias[2];
            if (s->allowcompinter) {
                if (s->signbias[0] == s->signbias[1]) {
                    s->fixcompref = 2;
                    s->varcompref[0] = 0;
                    s->varcompref[1] = 1;
                } else if (s->signbias[0] == s->signbias[2]) {
                    s->fixcompref = 1;
                    s->varcompref[0] = 0;
                    s->varcompref[1] = 2;
                } else {
                    s->fixcompref = 0;
                    s->varcompref[0] = 1;
                    s->varcompref[1] = 2;
                }
            }
        }
    }
    s->refreshctx = s->errorres ? 0 : get_bits1(&s->gb);
    s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
    s->framectxid = c = get_bits(&s->gb, 2);

    /* loopfilter header data */
    s->filter.level = get_bits(&s->gb, 6);
    sharp = get_bits(&s->gb, 3);
    // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
    // the old cache values since they are still valid
    if (s->filter.sharpness != sharp)
        memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
    s->filter.sharpness = sharp;
    if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
        if (get_bits1(&s->gb)) {
            for (i = 0; i < 4; i++)
                if (get_bits1(&s->gb))
                    s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
            for (i = 0; i < 2; i++)
                if (get_bits1(&s->gb))
                    s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
        }
    } else {
        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
    }

    /* quantization header data */
    s->yac_qi = get_bits(&s->gb, 8);
    s->ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
    s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
    s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
    s->lossless = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
                  s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;

    /* segmentation header info */
    if ((s->segmentation.enabled = get_bits1(&s->gb))) {
        if ((s->segmentation.update_map = get_bits1(&s->gb))) {
            for (i = 0; i < 7; i++)
                s->prob.seg[i] = get_bits1(&s->gb) ?
                    get_bits(&s->gb, 8) : 255;
            if ((s->segmentation.temporal = get_bits1(&s->gb))) {
                for (i = 0; i < 3; i++)
                    s->prob.segpred[i] = get_bits1(&s->gb) ?
                        get_bits(&s->gb, 8) : 255;
            }
        }
        if ((!s->segmentation.update_map || s->segmentation.temporal) &&
            (w != s->frames[CUR_FRAME].tf.f->width ||
             h != s->frames[CUR_FRAME].tf.f->height)) {
            av_log(ctx, AV_LOG_ERROR,
                   "Reference segmap (temp=%d,update=%d) enabled on size-change!\n",
                   s->segmentation.temporal, s->segmentation.update_map);
            return AVERROR_INVALIDDATA;
        }

        if (get_bits1(&s->gb)) {
            s->segmentation.absolute_vals = get_bits1(&s->gb);
            for (i = 0; i < 8; i++) {
                if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
                    s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
                if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
                    s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
                if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
                    s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
                s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
            }
        }
    } else {
        s->segmentation.feat[0].q_enabled = 0;
        s->segmentation.feat[0].lf_enabled = 0;
        s->segmentation.feat[0].skip_enabled = 0;
        s->segmentation.feat[0].ref_enabled = 0;
    }

    // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
    for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
        int qyac, qydc, quvac, quvdc, lflvl, sh;

        if (s->segmentation.feat[i].q_enabled) {
            if (s->segmentation.absolute_vals)
                qyac = s->segmentation.feat[i].q_val;
            else
                qyac = s->yac_qi + s->segmentation.feat[i].q_val;
        } else {
            qyac = s->yac_qi;
        }
        qydc = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
        quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
        quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
        qyac = av_clip_uintp2(qyac, 8);

        s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[qydc];
        s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[qyac];
        s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[quvdc];
        s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[quvac];

        sh = s->filter.level >= 32;
        if (s->segmentation.feat[i].lf_enabled) {
            if (s->segmentation.absolute_vals)
                lflvl = s->segmentation.feat[i].lf_val;
            else
                lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
        } else {
            lflvl = s->filter.level;
        }
        s->segmentation.feat[i].lflvl[0][0] =
        s->segmentation.feat[i].lflvl[0][1] =
            av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
        for (j = 1; j < 4; j++) {
            s->segmentation.feat[i].lflvl[j][0] =
                av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
                                         s->lf_delta.mode[0]) << sh), 6);
            s->segmentation.feat[i].lflvl[j][1] =
                av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
                                         s->lf_delta.mode[1]) << sh), 6);
        }
    }

    /* tiling info */
    if ((res = update_size(ctx, w, h)) < 0) {
        av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d\n", w, h);
        return res;
    }
    for (s->tiling.log2_tile_cols = 0;
         (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
         s->tiling.log2_tile_cols++) ;
    for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
    max = FFMAX(0, max - 1);
    while (max > s->tiling.log2_tile_cols) {
        if (get_bits1(&s->gb))
            s->tiling.log2_tile_cols++;
        else
            break;
    }
    s->tiling.log2_tile_rows = decode012(&s->gb);
    s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
    if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
        s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
        s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
                                 sizeof(VP56RangeCoder) * s->tiling.tile_cols);
        if (!s->c_b) {
            av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
            return AVERROR(ENOMEM);
        }
    }

    if (s->keyframe || s->errorres || s->intraonly) {
        s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
        s->prob_ctx[3].p = vp9_default_probs;
        memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
        memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
        memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
        memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
               sizeof(vp9_default_coef_probs));
    }

    // next 16 bits is size of the rest of the header (arith-coded)
    size2 = get_bits(&s->gb, 16);
    data2 = align_get_bits(&s->gb);
    if (size2 > size - (data2 - data)) {
        av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
        return AVERROR_INVALIDDATA;
    }
    ff_vp56_init_range_decoder(&s->c, data2, size2);
    if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
        av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
        return AVERROR_INVALIDDATA;
    }

    if (s->keyframe || s->intraonly) {
        memset(s->counts.coef, 0, sizeof(s->counts.coef) + sizeof(s->counts.eob));
    } else {
        memset(&s->counts, 0, sizeof(s->counts));
    }
    // FIXME is it faster to not copy here, but do it down in the fw updates
    // as explicit copies if the fw update is missing (and skip the copy upon
    // fw update)?
    s->prob.p = s->prob_ctx[c].p;

    // txfm updates
    if (s->lossless) {
        s->txfmmode = TX_4X4;
    } else {
        s->txfmmode = vp8_rac_get_uint(&s->c, 2);
        if (s->txfmmode == 3)
            s->txfmmode += vp8_rac_get(&s->c);

        if (s->txfmmode == TX_SWITCHABLE) {
            for (i = 0; i < 2; i++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
            for (i = 0; i < 2; i++)
                for (j = 0; j < 2; j++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.tx16p[i][j] =
                            update_prob(&s->c, s->prob.p.tx16p[i][j]);
            for (i = 0; i < 2; i++)
                for (j = 0; j < 3; j++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.tx32p[i][j] =
                            update_prob(&s->c, s->prob.p.tx32p[i][j]);
        }
    }

    // coef updates
    for (i = 0; i < 4; i++) {
        uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
        if (vp8_rac_get(&s->c)) {
            for (j = 0; j < 2; j++)
                for (k = 0; k < 2; k++)
                    for (l = 0; l < 6; l++)
                        for (m = 0; m < 6; m++) {
                            uint8_t *p = s->prob.coef[i][j][k][l][m];
                            uint8_t *r = ref[j][k][l][m];
                            if (m >= 3 && l == 0) // dc only has 3 pt
                                break;
                            for (n = 0; n < 3; n++) {
                                if (vp56_rac_get_prob_branchy(&s->c, 252)) {
                                    p[n] = update_prob(&s->c, r[n]);
                                } else {
                                    p[n] = r[n];
                                }
                            }
                            p[3] = 0;
                        }
        } else {
            for (j = 0; j < 2; j++)
                for (k = 0; k < 2; k++)
                    for (l = 0; l < 6; l++)
                        for (m = 0; m < 6; m++) {
                            uint8_t *p = s->prob.coef[i][j][k][l][m];
                            uint8_t *r = ref[j][k][l][m];
                            if (m > 3 && l == 0) // dc only has 3 pt
                                break;
                            memcpy(p, r, 3);
                            p[3] = 0;
                        }
        }
        if (s->txfmmode == i)
            break;
    }

    // mode updates
    for (i = 0; i < 3; i++)
        if (vp56_rac_get_prob_branchy(&s->c, 252))
            s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
    if (!s->keyframe && !s->intraonly) {
        for (i = 0; i < 7; i++)
            for (j = 0; j < 3; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_mode[i][j] =
                        update_prob(&s->c, s->prob.p.mv_mode[i][j]);

        if (s->filtermode == FILTER_SWITCHABLE)
            for (i = 0; i < 4; i++)
                for (j = 0; j < 2; j++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.filter[i][j] =
                            update_prob(&s->c, s->prob.p.filter[i][j]);

        for (i = 0; i < 4; i++)
            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);

        if (s->allowcompinter) {
            s->comppredmode = vp8_rac_get(&s->c);
            if (s->comppredmode)
                s->comppredmode += vp8_rac_get(&s->c);
            if (s->comppredmode == PRED_SWITCHABLE)
                for (i = 0; i < 5; i++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.comp[i] =
                            update_prob(&s->c, s->prob.p.comp[i]);
        } else {
            s->comppredmode = PRED_SINGLEREF;
        }

        if (s->comppredmode != PRED_COMPREF) {
            for (i = 0; i < 5; i++) {
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.single_ref[i][0] =
                        update_prob(&s->c, s->prob.p.single_ref[i][0]);
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.single_ref[i][1] =
                        update_prob(&s->c, s->prob.p.single_ref[i][1]);
            }
        }

        if (s->comppredmode != PRED_SINGLEREF) {
            for (i = 0; i < 5; i++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.comp_ref[i] =
                        update_prob(&s->c, s->prob.p.comp_ref[i]);
        }

        for (i = 0; i < 4; i++)
            for (j = 0; j < 9; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.y_mode[i][j] =
                        update_prob(&s->c, s->prob.p.y_mode[i][j]);

        for (i = 0; i < 4; i++)
            for (j = 0; j < 4; j++)
                for (k = 0; k < 3; k++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.partition[3 - i][j][k] =
                            update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);

        // mv fields don't use the update_prob subexp model for some reason
        for (i = 0; i < 3; i++)
            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

        for (i = 0; i < 2; i++) {
            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            for (j = 0; j < 10; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].classes[j] =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            if (vp56_rac_get_prob_branchy(&s->c, 252))
                s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            for (j = 0; j < 10; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].bits[j] =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
        }

        for (i = 0; i < 2; i++) {
            for (j = 0; j < 2; j++)
                for (k = 0; k < 3; k++)
                    if (vp56_rac_get_prob_branchy(&s->c, 252))
                        s->prob.p.mv_comp[i].class0_fp[j][k] =
                            (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

            for (j = 0; j < 3; j++)
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].fp[j] =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
        }

        if (s->highprecisionmvs) {
            for (i = 0; i < 2; i++) {
                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].class0_hp =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;

                if (vp56_rac_get_prob_branchy(&s->c, 252))
                    s->prob.p.mv_comp[i].hp =
                        (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
            }
        }
    }

    return (data2 - data) + size2;
}

static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
                                      VP9Context *s)
{
    dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
    dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
}

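// Gather a reference-MV candidate for block b: first scan the spatial
// neighbours listed per block size in mv_ref_blk_off[] (stored as
// { col, row } deltas), then the co-located MV from the previous frame, then
// both scans again accepting MVs that use a different reference (with the
// sign inverted when the two references' sign biases differ). idx selects
// whether the first or the second distinct candidate found is returned.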
static void find_ref_mvs(VP9Context *s,
                         VP56mv *pmv, int ref, int z, int idx, int sb)
{
    static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
        [BS_64x64] = {{  3, -1 }, { -1,  3 }, {  4, -1 }, { -1,  4 },
                      { -1, -1 }, {  0, -1 }, { -1,  0 }, {  6, -1 }},
        [BS_64x32] = {{  0, -1 }, { -1,  0 }, {  4, -1 }, { -1,  2 },
                      { -1, -1 }, {  0, -3 }, { -3,  0 }, {  2, -1 }},
        [BS_32x64] = {{ -1,  0 }, {  0, -1 }, { -1,  4 }, {  2, -1 },
                      { -1, -1 }, { -3,  0 }, {  0, -3 }, { -1,  2 }},
        [BS_32x32] = {{  1, -1 }, { -1,  1 }, {  2, -1 }, { -1,  2 },
                      { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
        [BS_32x16] = {{  0, -1 }, { -1,  0 }, {  2, -1 }, { -1, -1 },
                      { -1,  1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
        [BS_16x32] = {{ -1,  0 }, {  0, -1 }, { -1,  2 }, { -1, -1 },
                      {  1, -1 }, { -3,  0 }, {  0, -3 }, { -3, -3 }},
        [BS_16x16] = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1,  1 },
                      { -1, -1 }, {  0, -3 }, { -3,  0 }, { -3, -3 }},
        [BS_16x8]  = {{  0, -1 }, { -1,  0 }, {  1, -1 }, { -1, -1 },
                      {  0, -2 }, { -2,  0 }, { -2, -1 }, { -1, -2 }},
        [BS_8x16]  = {{ -1,  0 }, {  0, -1 }, { -1,  1 }, { -1, -1 },
                      { -2,  0 }, {  0, -2 }, { -1, -2 }, { -2, -1 }},
        [BS_8x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
        [BS_8x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
        [BS_4x8]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
        [BS_4x4]   = {{  0, -1 }, { -1,  0 }, { -1, -1 }, {  0, -2 },
                      { -2,  0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
    };
    VP9Block *b = s->b;
    int row = s->row, col = s->col, row7 = s->row7;
#define INVALID_MV 0x80008000U
    uint32_t mem = INVALID_MV;
    const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
    int i;

#define RETURN_DIRECT_MV(mv) \
    do { \
        uint32_t m = AV_RN32A(&mv); \
        if (!idx) { \
            AV_WN32A(pmv, m); \
            return; \
        } else if (mem == INVALID_MV) { \
            mem = m; \
        } else if (m != mem) { \
            AV_WN32A(pmv, m); \
            return; \
        } \
    } while (0)

    if (sb >= 0) {
        if (sb == 2 || sb == 1) {
            RETURN_DIRECT_MV(b->mv[0][z]);
        } else if (sb == 3) {
            RETURN_DIRECT_MV(b->mv[2][z]);
            RETURN_DIRECT_MV(b->mv[1][z]);
            RETURN_DIRECT_MV(b->mv[0][z]);
        }

#define RETURN_MV(mv) \
    do { \
        if (sb > 0) { \
            VP56mv tmp; \
            uint32_t m; \
            clamp_mv(&tmp, &mv, s); \
            m = AV_RN32A(&tmp); \
            if (!idx) { \
                AV_WN32A(pmv, m); \
                return; \
            } else if (mem == INVALID_MV) { \
                mem = m; \
            } else if (m != mem) { \
                AV_WN32A(pmv, m); \
                return; \
            } \
        } else { \
            uint32_t m = AV_RN32A(&mv); \
            if (!idx) { \
                clamp_mv(pmv, &mv, s); \
                return; \
            } else if (mem == INVALID_MV) { \
                mem = m; \
            } else if (m != mem) { \
                clamp_mv(pmv, &mv, s); \
                return; \
            } \
        } \
    } while (0)

        if (row > 0) {
            struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
            if (mv->ref[0] == ref) {
                RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
            } else if (mv->ref[1] == ref) {
                RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
            }
        }
        if (col > s->tiling.tile_col_start) {
            struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
            if (mv->ref[0] == ref) {
                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
            } else if (mv->ref[1] == ref) {
                RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
            }
        }
        i = 2;
    } else {
        i = 0;
    }

    // previously coded MVs in this neighbourhood, using same reference frame
    for (; i < 8; i++) {
        int c = p[i][0] + col, r = p[i][1] + row;

        if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
            struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];

            if (mv->ref[0] == ref) {
                RETURN_MV(mv->mv[0]);
            } else if (mv->ref[1] == ref) {
                RETURN_MV(mv->mv[1]);
            }
        }
    }

    // MV at this position in previous frame, using same reference frame
    if (s->use_last_frame_mvs) {
        struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];

        if (!s->last_uses_2pass)
            ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
        if (mv->ref[0] == ref) {
            RETURN_MV(mv->mv[0]);
        } else if (mv->ref[1] == ref) {
            RETURN_MV(mv->mv[1]);
        }
    }

#define RETURN_SCALE_MV(mv, scale) \
    do { \
        if (scale) { \
            VP56mv mv_temp = { -mv.x, -mv.y }; \
            RETURN_MV(mv_temp); \
        } else { \
            RETURN_MV(mv); \
        } \
    } while (0)

    // previously coded MVs in this neighbourhood, using different reference frame
    for (i = 0; i < 8; i++) {
        int c = p[i][0] + col, r = p[i][1] + row;

        if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
            struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];

            if (mv->ref[0] != ref && mv->ref[0] >= 0) {
                RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
            }
            if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
                // BUG - libvpx has this condition regardless of whether
                // we used the first ref MV and pre-scaling
                AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
                RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
            }
        }
    }

    // MV at this position in previous frame, using different reference frame
    if (s->use_last_frame_mvs) {
        struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];

        // no need to await_progress, because we already did that above
        if (mv->ref[0] != ref && mv->ref[0] >= 0) {
            RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
        }
        if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
            // BUG - libvpx has this condition regardless of whether
            // we used the first ref MV and pre-scaling
            AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
            RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
        }
    }

    AV_ZERO32(pmv);
#undef INVALID_MV
#undef RETURN_MV
#undef RETURN_SCALE_MV
}

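// Decode one MV component delta (idx 0 = y, 1 = x). The magnitude n is built
// as (integer part << 3) | (2-bit fractional part << 1) | high-precision bit,
// i.e. in 1/8-pel units, and the result is returned as +/-(n + 1).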
static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
{
    int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
    int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
                                s->prob.p.mv_comp[idx].classes);

    s->counts.mv_comp[idx].sign[sign]++;
    s->counts.mv_comp[idx].classes[c]++;
    if (c) {
        int m;

        for (n = 0, m = 0; m < c; m++) {
            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
            n |= bit << m;
            s->counts.mv_comp[idx].bits[m][bit]++;
        }
        n <<= 3;
        bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
        n |= bit << 1;
        s->counts.mv_comp[idx].fp[bit]++;
        if (hp) {
            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
            s->counts.mv_comp[idx].hp[bit]++;
            n |= bit;
        } else {
            n |= 1;
            // bug in libvpx - we count for bw entropy purposes even if the
            // bit wasn't coded
            s->counts.mv_comp[idx].hp[1]++;
        }
        n += 8 << c;
    } else {
        n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
        s->counts.mv_comp[idx].class0[n]++;
        bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
                               s->prob.p.mv_comp[idx].class0_fp[n]);
        s->counts.mv_comp[idx].class0_fp[n][bit]++;
        n = (n << 3) | (bit << 1);
        if (hp) {
            bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
            s->counts.mv_comp[idx].class0_hp[bit]++;
            n |= bit;
        } else {
            n |= 1;
            // bug in libvpx - we count for bw entropy purposes even if the
            // bit wasn't coded
            s->counts.mv_comp[idx].class0_hp[1]++;
        }
    }

    return sign ? -(n + 1) : (n + 1);
}

static void fill_mv(VP9Context *s,
                    VP56mv *mv, int mode, int sb)
{
    VP9Block *b = s->b;

    if (mode == ZEROMV) {
        AV_ZERO64(mv);
    } else {
        int hp;

        // FIXME cache this value and reuse for other subblocks
        find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
                     mode == NEWMV ? -1 : sb);
        // FIXME maybe move this code into find_ref_mvs()
        if ((mode == NEWMV || sb == -1) &&
            !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
            if (mv[0].y & 1) {
                if (mv[0].y < 0)
                    mv[0].y++;
                else
                    mv[0].y--;
            }
            if (mv[0].x & 1) {
                if (mv[0].x < 0)
                    mv[0].x++;
                else
                    mv[0].x--;
            }
        }
        if (mode == NEWMV) {
            enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
                                              s->prob.p.mv_joint);

            s->counts.mv_joint[j]++;
            if (j >= MV_JOINT_V)
                mv[0].y += read_mv_component(s, 0, hp);
            if (j & 1)
                mv[0].x += read_mv_component(s, 1, hp);
        }

        if (b->comp) {
            // FIXME cache this value and reuse for other subblocks
            find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
                         mode == NEWMV ? -1 : sb);
            if ((mode == NEWMV || sb == -1) &&
                !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
                if (mv[1].y & 1) {
                    if (mv[1].y < 0)
                        mv[1].y++;
                    else
                        mv[1].y--;
                }
                if (mv[1].x & 1) {
                    if (mv[1].x < 0)
                        mv[1].x++;
                    else
                        mv[1].x--;
                }
            }
            if (mode == NEWMV) {
                enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
                                                  s->prob.p.mv_joint);

                s->counts.mv_joint[j]++;
                if (j >= MV_JOINT_V)
                    mv[1].y += read_mv_component(s, 0, hp);
                if (j & 1)
                    mv[1].x += read_mv_component(s, 1, hp);
            }
        }
    }
}

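// splat value v over a w x h region of a 2D context array; w is 1, 2, 4 or 8,
// so each row can be written with one (or two) aligned stores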
static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
                                       ptrdiff_t stride, int v)
{
    switch (w) {
    case 1:
        do {
            *ptr = v;
            ptr += stride;
        } while (--h);
        break;
    case 2: {
        int v16 = v * 0x0101;
        do {
            AV_WN16A(ptr, v16);
            ptr += stride;
        } while (--h);
        break;
    }
    case 4: {
        uint32_t v32 = v * 0x01010101;
        do {
            AV_WN32A(ptr, v32);
            ptr += stride;
        } while (--h);
        break;
    }
    case 8: {
#if HAVE_FAST_64BIT
        uint64_t v64 = v * 0x0101010101010101ULL;
        do {
            AV_WN64A(ptr, v64);
            ptr += stride;
        } while (--h);
#else
        uint32_t v32 = v * 0x01010101;
        do {
            AV_WN32A(ptr, v32);
            AV_WN32A(ptr + 4, v32);
            ptr += stride;
        } while (--h);
#endif
        break;
    }
    }
}

static void decode_mode(AVCodecContext *ctx)
{
    static const uint8_t left_ctx[N_BS_SIZES] = {
        0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
    };
    static const uint8_t above_ctx[N_BS_SIZES] = {
        0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
    };
    static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
        TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
        TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
    };
    VP9Context *s = ctx->priv_data;
    VP9Block *b = s->b;
    int row = s->row, col = s->col, row7 = s->row7;
    enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
    int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
    int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]), y;
    int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
    int vref, filter_id;

    if (!s->segmentation.enabled) {
        b->seg_id = 0;
    } else if (s->keyframe || s->intraonly) {
        b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg);
    } else if (!s->segmentation.update_map ||
               (s->segmentation.temporal &&
                vp56_rac_get_prob_branchy(&s->c,
                    s->prob.segpred[s->above_segpred_ctx[col] +
                                    s->left_segpred_ctx[row7]]))) {
        if (!s->errorres) {
            int pred = 8, x;
            uint8_t *refsegmap = s->frames[LAST_FRAME].segmentation_map;

            if (!s->last_uses_2pass)
                ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
            for (y = 0; y < h4; y++)
                for (x = 0; x < w4; x++)
                    pred = FFMIN(pred, refsegmap[(y + row) * 8 * s->sb_cols + x + col]);
            av_assert1(pred < 8);
            b->seg_id = pred;
        } else {
            b->seg_id = 0;
        }

        memset(&s->above_segpred_ctx[col], 1, w4);
        memset(&s->left_segpred_ctx[row7], 1, h4);
    } else {
        b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
                                     s->prob.seg);

        memset(&s->above_segpred_ctx[col], 0, w4);
        memset(&s->left_segpred_ctx[row7], 0, h4);
    }
    if (s->segmentation.enabled &&
        (s->segmentation.update_map || s->keyframe || s->intraonly)) {
        setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
                  w4, h4, 8 * s->sb_cols, b->seg_id);
    }

    b->skip = s->segmentation.enabled &&
              s->segmentation.feat[b->seg_id].skip_enabled;
    if (!b->skip) {
        int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
        b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
        s->counts.skip[c][b->skip]++;
    }

    if (s->keyframe || s->intraonly) {
        b->intra = 1;
    } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
        b->intra = !s->segmentation.feat[b->seg_id].ref_val;
    } else {
        int c, bit;

        if (have_a && have_l) {
            c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
            c += (c == 2);
        } else {
            c = have_a ? 2 * s->above_intra_ctx[col] :
                have_l ? 2 * s->left_intra_ctx[row7] : 0;
        }
        bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
        s->counts.intra[c][bit]++;
        b->intra = !bit;
    }

    if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
        int c;
        if (have_a) {
            if (have_l) {
                c = (s->above_skip_ctx[col] ? max_tx :
                     s->above_txfm_ctx[col]) +
                    (s->left_skip_ctx[row7] ? max_tx :
                     s->left_txfm_ctx[row7]) > max_tx;
            } else {
                c = s->above_skip_ctx[col] ? 1 :
                    (s->above_txfm_ctx[col] * 2 > max_tx);
            }
        } else if (have_l) {
            c = s->left_skip_ctx[row7] ? 1 :
                (s->left_txfm_ctx[row7] * 2 > max_tx);
        } else {
            c = 1;
        }
        switch (max_tx) {
        case TX_32X32:
            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
            if (b->tx) {
                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
                if (b->tx == 2)
                    b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
            }
            s->counts.tx32p[c][b->tx]++;
            break;
        case TX_16X16:
            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
            if (b->tx)
                b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
            s->counts.tx16p[c][b->tx]++;
            break;
        case TX_8X8:
            b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
            s->counts.tx8p[c][b->tx]++;
            break;
        case TX_4X4:
            b->tx = TX_4X4;
            break;
        }
    } else {
        b->tx = FFMIN(max_tx, s->txfmmode);
    }

    if (s->keyframe || s->intraonly) {
        uint8_t *a = &s->above_mode_ctx[col * 2];
        uint8_t *l = &s->left_mode_ctx[(row7) << 1];

        b->comp = 0;
        if (b->bs > BS_8x8) {
            // FIXME the memory storage intermediates here aren't really
            // necessary, they're just there to make the code slightly
            // simpler for now
            b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                    vp9_default_kf_ymode_probs[a[0]][l[0]]);
            if (b->bs != BS_8x4) {
                b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                 vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
                l[0] = a[1] = b->mode[1];
            } else {
                l[0] = a[1] = b->mode[1] = b->mode[0];
            }
            if (b->bs != BS_4x8) {
                b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                        vp9_default_kf_ymode_probs[a[0]][l[1]]);
                if (b->bs != BS_8x4) {
                    b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                     vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
                    l[1] = a[1] = b->mode[3];
                } else {
                    l[1] = a[1] = b->mode[3] = b->mode[2];
                }
            } else {
                b->mode[2] = b->mode[0];
                l[1] = a[1] = b->mode[3] = b->mode[1];
            }
        } else {
            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                          vp9_default_kf_ymode_probs[*a][*l]);
            b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
            // FIXME this can probably be optimized
            memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
            memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
        }
        b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                     vp9_default_kf_uvmode_probs[b->mode[3]]);
    } else if (b->intra) {
        b->comp = 0;
        if (b->bs > BS_8x8) {
            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                          s->prob.p.y_mode[0]);
            s->counts.y_mode[0][b->mode[0]]++;
            if (b->bs != BS_8x4) {
                b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                              s->prob.p.y_mode[0]);
                s->counts.y_mode[0][b->mode[1]]++;
            } else {
                b->mode[1] = b->mode[0];
            }
            if (b->bs != BS_4x8) {
                b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                              s->prob.p.y_mode[0]);
                s->counts.y_mode[0][b->mode[2]]++;
                if (b->bs != BS_8x4) {
                    b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                                  s->prob.p.y_mode[0]);
                    s->counts.y_mode[0][b->mode[3]]++;
                } else {
                    b->mode[3] = b->mode[2];
                }
            } else {
                b->mode[2] = b->mode[0];
                b->mode[3] = b->mode[1];
            }
        } else {
            static const uint8_t size_group[10] = {
                3, 3, 3, 3, 2, 2, 2, 1, 1, 1
            };
            int sz = size_group[b->bs];

            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                          s->prob.p.y_mode[sz]);
            b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
            s->counts.y_mode[sz][b->mode[3]]++;
        }
        b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
                                     s->prob.p.uv_mode[b->mode[3]]);
        s->counts.uv_mode[b->mode[3]][b->uvmode]++;
    } else {
        static const uint8_t inter_mode_ctx_lut[14][14] = {
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
            { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
        };

        if (s->segmentation.feat[b->seg_id].ref_enabled) {
            av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
            b->comp = 0;
            b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
        } else {
            // read comp_pred flag
            if (s->comppredmode != PRED_SWITCHABLE) {
                b->comp = s->comppredmode == PRED_COMPREF;
            } else {
                int c;

                // FIXME add intra as ref=0xff (or -1) to make these easier?
                if (have_a) {
                    if (have_l) {
                        if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
                            c = 4;
                        } else if (s->above_comp_ctx[col]) {
                            c = 2 + (s->left_intra_ctx[row7] ||
                                     s->left_ref_ctx[row7] == s->fixcompref);
                        } else if (s->left_comp_ctx[row7]) {
                            c = 2 + (s->above_intra_ctx[col] ||
                                     s->above_ref_ctx[col] == s->fixcompref);
                        } else {
                            c = (!s->above_intra_ctx[col] &&
                                 s->above_ref_ctx[col] == s->fixcompref) ^
                                (!s->left_intra_ctx[row7] &&
                                 s->left_ref_ctx[row & 7] == s->fixcompref);
                        }
                    } else {
                        c = s->above_comp_ctx[col] ? 3 :
                            (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
                    }
                } else if (have_l) {
                    c = s->left_comp_ctx[row7] ? 3 :
                        (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
                } else {
                    c = 1;
                }
                b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
                s->counts.comp[c][b->comp]++;
            }

            // read actual references
            // FIXME probably cache a few variables here to prevent repetitive
            // memory accesses below
            if (b->comp) /* two references */ {
                int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;

                b->ref[fix_idx] = s->fixcompref;
                // FIXME can this codeblob be replaced by some sort of LUT?
                if (have_a) {
                    if (have_l) {
                        if (s->above_intra_ctx[col]) {
                            if (s->left_intra_ctx[row7]) {
                                c = 2;
                            } else {
                                c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
                            }
                        } else if (s->left_intra_ctx[row7]) {
                            c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
                        } else {
                            int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];

                            if (refl == refa && refa == s->varcompref[1]) {
                                c = 0;
                            } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
                                if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
                                    (refl == s->fixcompref && refa == s->varcompref[0])) {
                                    c = 4;
                                } else {
                                    c = (refa == refl) ? 3 : 1;
                                }
                            } else if (!s->left_comp_ctx[row7]) {
                                if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
                                    c = 1;
                                } else {
                                    c = (refl == s->varcompref[1] &&
                                         refa != s->varcompref[1]) ? 2 : 4;
                                }
                            } else if (!s->above_comp_ctx[col]) {
                                if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
                                    c = 1;
                                } else {
                                    c = (refa == s->varcompref[1] &&
                                         refl != s->varcompref[1]) ? 2 : 4;
                                }
                            } else {
                                c = (refl == refa) ? 4 : 2;
                            }
                        }
                    } else {
                        if (s->above_intra_ctx[col]) {
                            c = 2;
                        } else if (s->above_comp_ctx[col]) {
                            c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
                        } else {
                            c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
                        }
                    }
                } else if (have_l) {
                    if (s->left_intra_ctx[row7]) {
                        c = 2;
                    } else if (s->left_comp_ctx[row7]) {
                        c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
                    } else {
                        c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
                    }
                } else {
                    c = 2;
                }
                bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
                b->ref[var_idx] = s->varcompref[bit];
                s->counts.comp_ref[c][bit]++;
            } else /* single reference */ {
                int bit, c;

                if (have_a && !s->above_intra_ctx[col]) {
                    if (have_l && !s->left_intra_ctx[row7]) {
                        if (s->left_comp_ctx[row7]) {
                            if (s->above_comp_ctx[col]) {
                                c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
                                         !s->above_ref_ctx[col]);
                            } else {
                                c = (3 * !s->above_ref_ctx[col]) +
                                    (!s->fixcompref || !s->left_ref_ctx[row7]);
                            }
                        } else if (s->above_comp_ctx[col]) {
                            c = (3 * !s->left_ref_ctx[row7]) +
                                (!s->fixcompref || !s->above_ref_ctx[col]);
                        } else {
                            c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
                        }
                    } else if (s->above_intra_ctx[col]) {
                        c = 2;
                    } else if (s->above_comp_ctx[col]) {
                        c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
                    } else {
                        c = 4 * (!s->above_ref_ctx[col]);
                    }
                } else if (have_l && !s->left_intra_ctx[row7]) {
                    if (s->left_intra_ctx[row7]) {
                        c = 2;
                    } else if (s->left_comp_ctx[row7]) {
                        c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
                    } else {
                        c = 4 * (!s->left_ref_ctx[row7]);
                    }
                } else {
                    c = 2;
                }
                bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
                s->counts.single_ref[c][0][bit]++;
                if (!bit) {
                    b->ref[0] = 0;
                } else {
                    // FIXME can this codeblob be replaced by some sort of LUT?
                    if (have_a) {
                        if (have_l) {
                            if (s->left_intra_ctx[row7]) {
                                if (s->above_intra_ctx[col]) {
                                    c = 2;
                                } else if (s->above_comp_ctx[col]) {
                                    c = 1 + 2 * (s->fixcompref == 1 ||
                                                 s->above_ref_ctx[col] == 1);
                                } else if (!s->above_ref_ctx[col]) {
                                    c = 3;
                                } else {
                                    c = 4 * (s->above_ref_ctx[col] == 1);
                                }
                            } else if (s->above_intra_ctx[col]) {
                                if (s->left_intra_ctx[row7]) {
                                    c = 2;
                                } else if (s->left_comp_ctx[row7]) {
                                    c = 1 + 2 * (s->fixcompref == 1 ||
                                                 s->left_ref_ctx[row7] == 1);
                                } else if (!s->left_ref_ctx[row7]) {
                                    c = 3;
                                } else {
                                    c = 4 * (s->left_ref_ctx[row7] == 1);
                                }
                            } else if (s->above_comp_ctx[col]) {
                                if (s->left_comp_ctx[row7]) {
                                    if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
                                        c = 3 * (s->fixcompref == 1 ||
                                                 s->left_ref_ctx[row7] == 1);
                                    } else {
                                        c = 2;
                                    }
                                } else if (!s->left_ref_ctx[row7]) {
                                    c = 1 + 2 * (s->fixcompref == 1 ||
                                                 s->above_ref_ctx[col] == 1);
                                } else {
                                    c = 3 * (s->left_ref_ctx[row7] == 1) +
                                        (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
                                }
                            } else if (s->left_comp_ctx[row7]) {
                                if (!s->above_ref_ctx[col]) {
                                    c = 1 + 2 * (s->fixcompref == 1 ||
                                                 s->left_ref_ctx[row7] == 1);
                                } else {
                                    c = 3 * (s->above_ref_ctx[col] == 1) +
                                        (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
                                }
                            } else if (!s->above_ref_ctx[col]) {
                                if (!s->left_ref_ctx[row7]) {
                                    c = 3;
                                } else {
                                    c = 4 * (s->left_ref_ctx[row7] == 1);
                                }
                            } else if (!s->left_ref_ctx[row7]) {
                                c = 4 * (s->above_ref_ctx[col] == 1);
                            } else {
                                c = 2 * (s->left_ref_ctx[row7] == 1) +
                                    2 * (s->above_ref_ctx[col] == 1);
                            }
                        } else {
                            if (s->above_intra_ctx[col] ||
                                (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
                                c = 2;
                            } else if (s->above_comp_ctx[col]) {
                                c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
                            } else {
                                c = 4 * (s->above_ref_ctx[col] == 1);
                            }
                        }
                    } else if (have_l) {
                        if (s->left_intra_ctx[row7] ||
                            (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
                            c = 2;
                        } else if (s->left_comp_ctx[row7]) {
                            c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
                        } else {
                            c = 4 * (s->left_ref_ctx[row7] == 1);
                        }
                    } else {
                        c = 2;
                    }
                    bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
                    s->counts.single_ref[c][1][bit]++;
                    b->ref[0] = 1 + bit;
                }
            }
        }

        if (b->bs <= BS_8x8) {
            if (s->segmentation.feat[b->seg_id].skip_enabled) {
                b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
            } else {
                static const uint8_t off[10] = {
                    3, 0, 0, 1, 0, 0, 0, 0, 0, 0
                };

                // FIXME this needs to use the LUT tables from find_ref_mvs
                // because not all are -1,0/0,-1
                int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
                                          [s->left_mode_ctx[row7 + off[b->bs]]];

                b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                              s->prob.p.mv_mode[c]);
                b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
                s->counts.mv_mode[c][b->mode[0] - 10]++;
            }
        }

        if (s->filtermode == FILTER_SWITCHABLE) {
            int c;

            if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
                if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
                    c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
                        s->left_filter_ctx[row7] : 3;
                } else {
                    c = s->above_filter_ctx[col];
                }
            } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
                c = s->left_filter_ctx[row7];
            } else {
                c = 3;
            }

            filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
                                         s->prob.p.filter[c]);
            s->counts.filter[c][filter_id]++;
            b->filter = vp9_filter_lut[filter_id];
        } else {
            b->filter = s->filtermode;
        }

        if (b->bs > BS_8x8) {
            int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];

            b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                          s->prob.p.mv_mode[c]);
            s->counts.mv_mode[c][b->mode[0] - 10]++;
            fill_mv(s, b->mv[0], b->mode[0], 0);

            if (b->bs != BS_8x4) {
                b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                              s->prob.p.mv_mode[c]);
                s->counts.mv_mode[c][b->mode[1] - 10]++;
                fill_mv(s, b->mv[1], b->mode[1], 1);
            } else {
                b->mode[1] = b->mode[0];
                AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
                AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
            }

            if (b->bs != BS_4x8) {
                b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                              s->prob.p.mv_mode[c]);
                s->counts.mv_mode[c][b->mode[2] - 10]++;
                fill_mv(s, b->mv[2], b->mode[2], 2);

                if (b->bs != BS_8x4) {
                    b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
                                                  s->prob.p.mv_mode[c]);
                    s->counts.mv_mode[c][b->mode[3] - 10]++;
                    fill_mv(s, b->mv[3], b->mode[3], 3);
                } else {
                    b->mode[3] = b->mode[2];
                    AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
                    AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
                }
            } else {
                b->mode[2] = b->mode[0];
                AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
                AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
                b->mode[3] = b->mode[1];
                AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
                AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
            }
        } else {
            fill_mv(s, b->mv[0], b->mode[0], -1);
            AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
            AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
            AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
            AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
            AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
            AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
        }

        vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
    }
1895 
1896 #if HAVE_FAST_64BIT
1897 #define SPLAT_CTX(var, val, n) \
1898  switch (n) { \
1899  case 1: var = val; break; \
1900  case 2: AV_WN16A(&var, val * 0x0101); break; \
1901  case 4: AV_WN32A(&var, val * 0x01010101); break; \
1902  case 8: AV_WN64A(&var, val * 0x0101010101010101ULL); break; \
1903  case 16: { \
1904  uint64_t v64 = val * 0x0101010101010101ULL; \
1905  AV_WN64A( &var, v64); \
1906  AV_WN64A(&((uint8_t *) &var)[8], v64); \
1907  break; \
1908  } \
1909  }
1910 #else
1911 #define SPLAT_CTX(var, val, n) \
1912  switch (n) { \
1913  case 1: var = val; break; \
1914  case 2: AV_WN16A(&var, val * 0x0101); break; \
1915  case 4: AV_WN32A(&var, val * 0x01010101); break; \
1916  case 8: { \
1917  uint32_t v32 = val * 0x01010101; \
1918  AV_WN32A( &var, v32); \
1919  AV_WN32A(&((uint8_t *) &var)[4], v32); \
1920  break; \
1921  } \
1922  case 16: { \
1923  uint32_t v32 = val * 0x01010101; \
1924  AV_WN32A( &var, v32); \
1925  AV_WN32A(&((uint8_t *) &var)[4], v32); \
1926  AV_WN32A(&((uint8_t *) &var)[8], v32); \
1927  AV_WN32A(&((uint8_t *) &var)[12], v32); \
1928  break; \
1929  } \
1930  }
1931 #endif
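 /* Editorial note (added, not in the decoder source): SPLAT_CTX replicates a
  * byte across a wider word by multiplying with 0x0101...01, so one aligned
  * store sets 2/4/8/16 context entries at once instead of looping. A minimal
  * sketch of the trick:
  *
  *     uint8_t  val = 0x2a;
  *     uint32_t v32 = val * 0x01010101;             // 0x2a2a2a2a
  *     uint64_t v64 = val * 0x0101010101010101ULL;  // 0x2a2a2a2a2a2a2a2a
  *
  * The HAVE_FAST_64BIT split merely picks 64-bit stores where they are
  * cheap and falls back to paired 32-bit stores otherwise.
  */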
1932 
1933  switch (bwh_tab[1][b->bs][0]) {
1934 #define SET_CTXS(dir, off, n) \
1935  do { \
1936  SPLAT_CTX(s->dir##_skip_ctx[off], b->skip, n); \
1937  SPLAT_CTX(s->dir##_txfm_ctx[off], b->tx, n); \
1938  SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
1939  if (!s->keyframe && !s->intraonly) { \
1940  SPLAT_CTX(s->dir##_intra_ctx[off], b->intra, n); \
1941  SPLAT_CTX(s->dir##_comp_ctx[off], b->comp, n); \
1942  SPLAT_CTX(s->dir##_mode_ctx[off], b->mode[3], n); \
1943  if (!b->intra) { \
1944  SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
1945  if (s->filtermode == FILTER_SWITCHABLE) { \
1946  SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
1947  } \
1948  } \
1949  } \
1950  } while (0)
1951  case 1: SET_CTXS(above, col, 1); break;
1952  case 2: SET_CTXS(above, col, 2); break;
1953  case 4: SET_CTXS(above, col, 4); break;
1954  case 8: SET_CTXS(above, col, 8); break;
1955  }
1956  switch (bwh_tab[1][b->bs][1]) {
1957  case 1: SET_CTXS(left, row7, 1); break;
1958  case 2: SET_CTXS(left, row7, 2); break;
1959  case 4: SET_CTXS(left, row7, 4); break;
1960  case 8: SET_CTXS(left, row7, 8); break;
1961  }
1962 #undef SPLAT_CTX
1963 #undef SET_CTXS
1964 
1965  if (!s->keyframe && !s->intraonly) {
1966  if (b->bs > BS_8x8) {
1967  int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1968 
1969  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
1970  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
1971  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
1972  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
1973  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
1974  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
1975  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
1976  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
1977  } else {
1978  int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1979 
1980  for (n = 0; n < w4 * 2; n++) {
1981  AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
1982  AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
1983  }
1984  for (n = 0; n < h4 * 2; n++) {
1985  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
1986  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
1987  }
1988  }
1989  }
1990 
1991  // FIXME kinda ugly
1992  for (y = 0; y < h4; y++) {
1993  int x, o = (row + y) * s->sb_cols * 8 + col;
1994  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];
1995 
1996  if (b->intra) {
1997  for (x = 0; x < w4; x++) {
1998  mv[x].ref[0] =
1999  mv[x].ref[1] = -1;
2000  }
2001  } else if (b->comp) {
2002  for (x = 0; x < w4; x++) {
2003  mv[x].ref[0] = b->ref[0];
2004  mv[x].ref[1] = b->ref[1];
2005  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2006  AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2007  }
2008  } else {
2009  for (x = 0; x < w4; x++) {
2010  mv[x].ref[0] = b->ref[0];
2011  mv[x].ref[1] = -1;
2012  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2013  }
2014  }
2015  }
2016 }
2017 
2018 // FIXME merge cnt/eob arguments?
2019 static av_always_inline int
2020 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2021  int is_tx32x32, unsigned (*cnt)[6][3],
2022  unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2023  int nnz, const int16_t *scan, const int16_t (*nb)[2],
2024  const int16_t *band_counts, const int16_t *qmul)
2025 {
2026  int i = 0, band = 0, band_left = band_counts[band];
2027  uint8_t *tp = p[0][nnz];
2028  uint8_t cache[1024];
2029 
2030  do {
2031  int val, rc;
2032 
2033  val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2034  eob[band][nnz][val]++;
2035  if (!val)
2036  break;
2037 
2038  skip_eob:
2039  if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2040  cnt[band][nnz][0]++;
2041  if (!--band_left)
2042  band_left = band_counts[++band];
2043  cache[scan[i]] = 0;
2044  nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2045  tp = p[band][nnz];
2046  if (++i == n_coeffs)
2047  break; // invalid input; blocks should end with EOB
2048  goto skip_eob;
2049  }
2050 
2051  rc = scan[i];
2052  if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2053  cnt[band][nnz][1]++;
2054  val = 1;
2055  cache[rc] = 1;
2056  } else {
2057  // fill in p[3-10] (model fill) - only once per frame for each pos
2058  if (!tp[3])
2059  memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2060 
2061  cnt[band][nnz][2]++;
2062  if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2063  if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2064  cache[rc] = val = 2;
2065  } else {
2066  val = 3 + vp56_rac_get_prob(c, tp[5]);
2067  cache[rc] = 3;
2068  }
2069  } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2070  cache[rc] = 4;
2071  if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2072  val = 5 + vp56_rac_get_prob(c, 159);
2073  } else {
2074  val = 7 + (vp56_rac_get_prob(c, 165) << 1);
2075  val += vp56_rac_get_prob(c, 145);
2076  }
2077  } else { // cat 3-6
2078  cache[rc] = 5;
2079  if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2080  if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2081  val = 11 + (vp56_rac_get_prob(c, 173) << 2);
2082  val += (vp56_rac_get_prob(c, 148) << 1);
2083  val += vp56_rac_get_prob(c, 140);
2084  } else {
2085  val = 19 + (vp56_rac_get_prob(c, 176) << 3);
2086  val += (vp56_rac_get_prob(c, 155) << 2);
2087  val += (vp56_rac_get_prob(c, 140) << 1);
2088  val += vp56_rac_get_prob(c, 135);
2089  }
2090  } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2091  val = 35 + (vp56_rac_get_prob(c, 180) << 4);
2092  val += (vp56_rac_get_prob(c, 157) << 3);
2093  val += (vp56_rac_get_prob(c, 141) << 2);
2094  val += (vp56_rac_get_prob(c, 134) << 1);
2095  val += vp56_rac_get_prob(c, 130);
2096  } else {
2097  val = 67 + (vp56_rac_get_prob(c, 254) << 13);
2098  val += (vp56_rac_get_prob(c, 254) << 12);
2099  val += (vp56_rac_get_prob(c, 254) << 11);
2100  val += (vp56_rac_get_prob(c, 252) << 10);
2101  val += (vp56_rac_get_prob(c, 249) << 9);
2102  val += (vp56_rac_get_prob(c, 243) << 8);
2103  val += (vp56_rac_get_prob(c, 230) << 7);
2104  val += (vp56_rac_get_prob(c, 196) << 6);
2105  val += (vp56_rac_get_prob(c, 177) << 5);
2106  val += (vp56_rac_get_prob(c, 153) << 4);
2107  val += (vp56_rac_get_prob(c, 140) << 3);
2108  val += (vp56_rac_get_prob(c, 133) << 2);
2109  val += (vp56_rac_get_prob(c, 130) << 1);
2110  val += vp56_rac_get_prob(c, 129);
2111  }
2112  }
2113  }
2114  if (!--band_left)
2115  band_left = band_counts[++band];
2116  if (is_tx32x32)
2117  coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
2118  else
2119  coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
2120  nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2121  tp = p[band][nnz];
2122  } while (++i < n_coeffs);
2123 
2124  return i;
2125 }
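 /* Editorial note (added): the cascade above implements the standard VP9
  * coefficient token classes. Assuming the usual spec naming, the decoded
  * magnitudes fall into these ranges:
  *
  *     two..four : 2..4               (tp[4]/tp[5])
  *     cat1      : 5  + 1 bit   = 5..6
  *     cat2      : 7  + 2 bits  = 7..10
  *     cat3      : 11 + 3 bits  = 11..18
  *     cat4      : 19 + 4 bits  = 19..34
  *     cat5      : 35 + 5 bits  = 35..66
  *     cat6      : 67 + 14 bits = 67..16450
  *
  * e.g. cat6 tops out at 67 + (2^14 - 1) = 16450, matching the fourteen
  * vp56_rac_get_prob() calls with shifts 13..0.
  */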
2126 
2127 static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2128  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2129  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2130  const int16_t (*nb)[2], const int16_t *band_counts,
2131  const int16_t *qmul)
2132 {
2133  return decode_coeffs_b_generic(c, coef, n_coeffs, 0, cnt, eob, p,
2134  nnz, scan, nb, band_counts, qmul);
2135 }
2136 
2137 static int decode_coeffs_b32(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2138  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2139  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2140  const int16_t (*nb)[2], const int16_t *band_counts,
2141  const int16_t *qmul)
2142 {
2143  return decode_coeffs_b_generic(c, coef, n_coeffs, 1, cnt, eob, p,
2144  nnz, scan, nb, band_counts, qmul);
2145 }
2146 
2147 static void decode_coeffs(AVCodecContext *ctx)
2148 {
2149  VP9Context *s = ctx->priv_data;
2150  VP9Block *b = s->b;
2151  int row = s->row, col = s->col;
2152  uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2153  unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2154  unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2155  int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
2156  int end_x = FFMIN(2 * (s->cols - col), w4);
2157  int end_y = FFMIN(2 * (s->rows - row), h4);
2158  int n, pl, x, y, res;
2159  int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
2160  int tx = 4 * s->lossless + b->tx;
2161  const int16_t * const *yscans = vp9_scans[tx];
2162  const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2163  const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2164  const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2165  uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2166  uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
2167  static const int16_t band_counts[4][8] = {
2168  { 1, 2, 3, 4, 3, 16 - 13 },
2169  { 1, 2, 3, 4, 11, 64 - 21 },
2170  { 1, 2, 3, 4, 11, 256 - 21 },
2171  { 1, 2, 3, 4, 11, 1024 - 21 },
2172  };
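 /* Editorial note (added): each band_counts[] row sums to the coefficient
  * count of its tx size, with the last entry written as "total minus the
  * rest": for TX_4X4, 1+2+3+4+3 = 13 and 16-13 = 3 gives 16 coefficients;
  * the TX_32X32 row likewise totals 1024. band_left reaching zero is what
  * advances the band index in the decode loop above.
  */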
2173  const int16_t *y_band_counts = band_counts[b->tx];
2174  const int16_t *uv_band_counts = band_counts[b->uvtx];
2175 
2176 #define MERGE(la, end, step, rd) \
2177  for (n = 0; n < end; n += step) \
2178  la[n] = !!rd(&la[n])
2179 #define MERGE_CTX(step, rd) \
2180  do { \
2181  MERGE(l, end_y, step, rd); \
2182  MERGE(a, end_x, step, rd); \
2183  } while (0)
2184 
2185 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2186  for (n = 0, y = 0; y < end_y; y += step) { \
2187  for (x = 0; x < end_x; x += step, n += step * step) { \
2188  enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2189  res = decode_coeffs_b##v(&s->c, s->block + 16 * n, 16 * step * step, \
2190  c, e, p, a[x] + l[y], yscans[txtp], \
2191  ynbs[txtp], y_band_counts, qmul[0]); \
2192  a[x] = l[y] = !!res; \
2193  if (step >= 4) { \
2194  AV_WN16A(&s->eob[n], res); \
2195  } else { \
2196  s->eob[n] = res; \
2197  } \
2198  } \
2199  }
2200 
2201 #define SPLAT(la, end, step, cond) \
2202  if (step == 2) { \
2203  for (n = 1; n < end; n += step) \
2204  la[n] = la[n - 1]; \
2205  } else if (step == 4) { \
2206  if (cond) { \
2207  for (n = 0; n < end; n += step) \
2208  AV_WN32A(&la[n], la[n] * 0x01010101); \
2209  } else { \
2210  for (n = 0; n < end; n += step) \
2211  memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2212  } \
2213  } else /* step == 8 */ { \
2214  if (cond) { \
2215  if (HAVE_FAST_64BIT) { \
2216  for (n = 0; n < end; n += step) \
2217  AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2218  } else { \
2219  for (n = 0; n < end; n += step) { \
2220  uint32_t v32 = la[n] * 0x01010101; \
2221  AV_WN32A(&la[n], v32); \
2222  AV_WN32A(&la[n + 4], v32); \
2223  } \
2224  } \
2225  } else { \
2226  for (n = 0; n < end; n += step) \
2227  memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2228  } \
2229  }
2230 #define SPLAT_CTX(step) \
2231  do { \
2232  SPLAT(a, end_x, step, end_x == w4); \
2233  SPLAT(l, end_y, step, end_y == h4); \
2234  } while (0)
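 /* Editorial note (added): the nnz contexts are stored at 4-pixel
  * granularity. For tx sizes above 4x4, MERGE_CTX first collapses each
  * group of 2/4/8 entries into a single 0/1 flag (any nonzero byte -> 1),
  * the coefficient loop then reads and writes one flag per transform block,
  * and SPLAT_CTX re-expands that flag over the whole group afterwards.
  */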
2235 
2236  /* y tokens */
2237  switch (b->tx) {
2238  case TX_4X4:
2239  DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2240  break;
2241  case TX_8X8:
2242  MERGE_CTX(2, AV_RN16A);
2243  DECODE_Y_COEF_LOOP(2, 0,);
2244  SPLAT_CTX(2);
2245  break;
2246  case TX_16X16:
2247  MERGE_CTX(4, AV_RN32A);
2248  DECODE_Y_COEF_LOOP(4, 0,);
2249  SPLAT_CTX(4);
2250  break;
2251  case TX_32X32:
2252  MERGE_CTX(8, AV_RN64A);
2253  DECODE_Y_COEF_LOOP(8, 0, 32);
2254  SPLAT_CTX(8);
2255  break;
2256  }
2257 
2258 #define DECODE_UV_COEF_LOOP(step) \
2259  for (n = 0, y = 0; y < end_y; y += step) { \
2260  for (x = 0; x < end_x; x += step, n += step * step) { \
2261  res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n, \
2262  16 * step * step, c, e, p, a[x] + l[y], \
2263  uvscan, uvnb, uv_band_counts, qmul[1]); \
2264  a[x] = l[y] = !!res; \
2265  if (step >= 4) { \
2266  AV_WN16A(&s->uveob[pl][n], res); \
2267  } else { \
2268  s->uveob[pl][n] = res; \
2269  } \
2270  } \
2271  }
2272 
2273  p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2274  c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2275  e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2276  w4 >>= 1;
2277  h4 >>= 1;
2278  end_x >>= 1;
2279  end_y >>= 1;
2280  for (pl = 0; pl < 2; pl++) {
2281  a = &s->above_uv_nnz_ctx[pl][col];
2282  l = &s->left_uv_nnz_ctx[pl][row & 7];
2283  switch (b->uvtx) {
2284  case TX_4X4:
2285  DECODE_UV_COEF_LOOP(1);
2286  break;
2287  case TX_8X8:
2288  MERGE_CTX(2, AV_RN16A);
2289  DECODE_UV_COEF_LOOP(2);
2290  SPLAT_CTX(2);
2291  break;
2292  case TX_16X16:
2293  MERGE_CTX(4, AV_RN32A);
2294  DECODE_UV_COEF_LOOP(4);
2295  SPLAT_CTX(4);
2296  break;
2297  case TX_32X32:
2298  MERGE_CTX(8, AV_RN64A);
2299  // a 64x64 (max) uv block can only ever contain one tx32x32 block,
2300  // so there is no need to loop
2301  res = decode_coeffs_b32(&s->c, s->uvblock[pl],
2302  1024, c, e, p, a[0] + l[0],
2303  uvscan, uvnb, uv_band_counts, qmul[1]);
2304  a[0] = l[0] = !!res;
2305  AV_WN16A(&s->uveob[pl][0], res);
2306  SPLAT_CTX(8);
2307  break;
2308  }
2309  }
2310 }
2311 
2312 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2313  uint8_t *dst_edge, ptrdiff_t stride_edge,
2314  uint8_t *dst_inner, ptrdiff_t stride_inner,
2315  uint8_t *l, int col, int x, int w,
2316  int row, int y, enum TxfmMode tx,
2317  int p)
2318 {
2319  int have_top = row > 0 || y > 0;
2320  int have_left = col > s->tiling.tile_col_start || x > 0;
2321  int have_right = x < w - 1;
2322  static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2323  [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2324  { DC_127_PRED, VERT_PRED } },
2325  [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2326  { HOR_PRED, HOR_PRED } },
2327  [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2328  { LEFT_DC_PRED, DC_PRED } },
2329  [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
2330  { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
2331  [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2332  { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2333  [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
2334  { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
2335  [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
2336  { HOR_DOWN_PRED, HOR_DOWN_PRED } },
2337  [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
2338  { DC_127_PRED, VERT_LEFT_PRED } },
2339  [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2340  { HOR_UP_PRED, HOR_UP_PRED } },
2341  [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2342  { HOR_PRED, TM_VP8_PRED } },
2343  };
2344  static const struct {
2345  uint8_t needs_left:1;
2346  uint8_t needs_top:1;
2347  uint8_t needs_topleft:1;
2348  uint8_t needs_topright:1;
2349  } edges[N_INTRA_PRED_MODES] = {
2350  [VERT_PRED] = { .needs_top = 1 },
2351  [HOR_PRED] = { .needs_left = 1 },
2352  [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2353  [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2354  [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2355  [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2356  [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2357  [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2358  [HOR_UP_PRED] = { .needs_left = 1 },
2359  [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2360  [LEFT_DC_PRED] = { .needs_left = 1 },
2361  [TOP_DC_PRED] = { .needs_top = 1 },
2362  [DC_128_PRED] = { 0 },
2363  [DC_127_PRED] = { 0 },
2364  [DC_129_PRED] = { 0 }
2365  };
2366 
2367  av_assert2(mode >= 0 && mode < 10);
2368  mode = mode_conv[mode][have_left][have_top];
2369  if (edges[mode].needs_top) {
2370  uint8_t *top, *topleft;
2371  int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
2372  int n_px_need_tr = 0;
2373 
2374  if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2375  n_px_need_tr = 4;
2376 
2377  // if top of sb64-row, use s->intra_pred_data[] instead of
2378  // dst[-stride] for intra prediction (it contains pre- instead of
2379  // post-loopfilter data)
2380  if (have_top) {
2381  top = !(row & 7) && !y ?
2382  s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2383  y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2384  if (have_left)
2385  topleft = !(row & 7) && !y ?
2386  s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2387  y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2388  &dst_inner[-stride_inner];
2389  }
2390 
2391  if (have_top &&
2392  (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2393  (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2394  n_px_need + n_px_need_tr <= n_px_have) {
2395  *a = top;
2396  } else {
2397  if (have_top) {
2398  if (n_px_need <= n_px_have) {
2399  memcpy(*a, top, n_px_need);
2400  } else {
2401  memcpy(*a, top, n_px_have);
2402  memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
2403  n_px_need - n_px_have);
2404  }
2405  } else {
2406  memset(*a, 127, n_px_need);
2407  }
2408  if (edges[mode].needs_topleft) {
2409  if (have_left && have_top) {
2410  (*a)[-1] = topleft[-1];
2411  } else {
2412  (*a)[-1] = have_top ? 129 : 127;
2413  }
2414  }
2415  if (tx == TX_4X4 && edges[mode].needs_topright) {
2416  if (have_top && have_right &&
2417  n_px_need + n_px_need_tr <= n_px_have) {
2418  memcpy(&(*a)[4], &top[4], 4);
2419  } else {
2420  memset(&(*a)[4], (*a)[3], 4);
2421  }
2422  }
2423  }
2424  }
2425  if (edges[mode].needs_left) {
2426  if (have_left) {
2427  int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !p) - y) * 4;
2428  uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2429  ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2430 
2431  if (n_px_need <= n_px_have) {
2432  for (i = 0; i < n_px_need; i++)
2433  l[n_px_need - 1 - i] = dst[i * stride - 1];
2434  } else {
2435  for (i = 0; i < n_px_have; i++)
2436  l[n_px_need - 1 - i] = dst[i * stride - 1];
2437  memset(l, l[n_px_need - n_px_have], n_px_need - n_px_have);
2438  }
2439  } else {
2440  memset(l, 129, 4 << tx);
2441  }
2442  }
2443 
2444  return mode;
2445 }
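 /* Editorial note (added): mode_conv[] substitutes prediction modes when a
  * needed neighbour is missing. For instance, HOR_PRED without a left
  * neighbour becomes DC_129_PRED and VERT_PRED without a top neighbour
  * becomes DC_127_PRED, i.e. flat predictors at exactly the fill values
  * (129/127) used for missing edges above, so the substituted mode only
  * ever reads pixels that the edges[] handling actually provides.
  */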
2446 
2447 static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2448 {
2449  VP9Context *s = ctx->priv_data;
2450  VP9Block *b = s->b;
2451  int row = s->row, col = s->col;
2452  int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2453  int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2454  int end_x = FFMIN(2 * (s->cols - col), w4);
2455  int end_y = FFMIN(2 * (s->rows - row), h4);
2456  int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2457  int uvstep1d = 1 << b->uvtx, p;
2458  uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
2459  LOCAL_ALIGNED_32(uint8_t, a_buf, [64]);
2460  LOCAL_ALIGNED_32(uint8_t, l, [32]);
2461 
2462  for (n = 0, y = 0; y < end_y; y += step1d) {
2463  uint8_t *ptr = dst, *ptr_r = dst_r;
2464  for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d,
2465  ptr_r += 4 * step1d, n += step) {
2466  int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2467  y * 2 + x : 0];
2468  uint8_t *a = &a_buf[32];
2469  enum TxfmType txtp = vp9_intra_txfm_type[mode];
2470  int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2471 
2472  mode = check_intra_mode(s, mode, &a, ptr_r,
2473  s->frames[CUR_FRAME].tf.f->linesize[0],
2474  ptr, s->y_stride, l,
2475  col, x, w4, row, y, b->tx, 0);
2476  s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2477  if (eob)
2478  s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2479  s->block + 16 * n, eob);
2480  }
2481  dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
2482  dst += 4 * step1d * s->y_stride;
2483  }
2484 
2485  // U/V
2486  w4 >>= 1;
2487  end_x >>= 1;
2488  end_y >>= 1;
2489  step = 1 << (b->uvtx * 2);
2490  for (p = 0; p < 2; p++) {
2491  dst = s->dst[1 + p];
2492  dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2493  for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2494  uint8_t *ptr = dst, *ptr_r = dst_r;
2495  for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d,
2496  ptr_r += 4 * uvstep1d, n += step) {
2497  int mode = b->uvmode;
2498  uint8_t *a = &a_buf[16];
2499  int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2500 
2501  mode = check_intra_mode(s, mode, &a, ptr_r,
2502  s->frames[CUR_FRAME].tf.f->linesize[1],
2503  ptr, s->uv_stride, l,
2504  col, x, w4, row, y, b->uvtx, p + 1);
2505  s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2506  if (eob)
2507  s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2508  s->uvblock[p] + 16 * n, eob);
2509  }
2510  dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
2511  dst += 4 * uvstep1d * s->uv_stride;
2512  }
2513  }
2514 }
2515 
2516 static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2517  uint8_t *dst, ptrdiff_t dst_stride,
2518  const uint8_t *ref, ptrdiff_t ref_stride,
2519  ThreadFrame *ref_frame,
2520  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2521  int bw, int bh, int w, int h)
2522 {
2523  int mx = mv->x, my = mv->y, th;
2524 
2525  y += my >> 3;
2526  x += mx >> 3;
2527  ref += y * ref_stride + x;
2528  mx &= 7;
2529  my &= 7;
2530  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2531  // we use +7 because the last 7 pixels of each sbrow can be changed in
2532  // the longest loopfilter of the next sbrow
2533  th = (y + bh + 4 * !!my + 7) >> 6;
2534  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2535  if (x < !!mx * 3 || y < !!my * 3 ||
2536  x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2537  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2538  ref - !!my * 3 * ref_stride - !!mx * 3,
2539  80, ref_stride,
2540  bw + !!mx * 7, bh + !!my * 7,
2541  x - !!mx * 3, y - !!my * 3, w, h);
2542  ref = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2543  ref_stride = 80;
2544  }
2545  mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2546 }
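 /* Editorial note (added): the await-progress threshold converts the
  * bottom-most referenced luma row into a 64-pixel sbrow index. A worked
  * example: y = 100, bh = 16 and my != 0 give
  * th = (100 + 16 + 4 + 7) >> 6 = 1, so decoding waits only until the
  * reference frame's thread has finished sbrow 1, not the whole frame.
  */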
2547 
2548 static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2549  uint8_t *dst_u, uint8_t *dst_v,
2550  ptrdiff_t dst_stride,
2551  const uint8_t *ref_u, ptrdiff_t src_stride_u,
2552  const uint8_t *ref_v, ptrdiff_t src_stride_v,
2553  ThreadFrame *ref_frame,
2554  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2555  int bw, int bh, int w, int h)
2556 {
2557  int mx = mv->x, my = mv->y, th;
2558 
2559  y += my >> 4;
2560  x += mx >> 4;
2561  ref_u += y * src_stride_u + x;
2562  ref_v += y * src_stride_v + x;
2563  mx &= 15;
2564  my &= 15;
2565  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2566  // we use +7 because the last 7 pixels of each sbrow can be changed in
2567  // the longest loopfilter of the next sbrow
2568  th = (y + bh + 4 * !!my + 7) >> 5;
2569  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2570  if (x < !!mx * 3 || y < !!my * 3 ||
2571  x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2572  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2573  ref_u - !!my * 3 * src_stride_u - !!mx * 3,
2574  80, src_stride_u,
2575  bw + !!mx * 7, bh + !!my * 7,
2576  x - !!mx * 3, y - !!my * 3, w, h);
2577  ref_u = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2578  mc[!!mx][!!my](dst_u, dst_stride, ref_u, 80, bh, mx, my);
2579 
2580  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2581  ref_v - !!my * 3 * src_stride_v - !!mx * 3,
2582  80, src_stride_v,
2583  bw + !!mx * 7, bh + !!my * 7,
2584  x - !!mx * 3, y - !!my * 3, w, h);
2585  ref_v = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2586  mc[!!mx][!!my](dst_v, dst_stride, ref_v, 80, bh, mx, my);
2587  } else {
2588  mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2589  mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2590  }
2591 }
2592 
2593 static void inter_recon(AVCodecContext *ctx)
2594 {
2595  static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
2596  { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
2597  { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
2598  };
2599  VP9Context *s = ctx->priv_data;
2600  VP9Block *b = s->b;
2601  int row = s->row, col = s->col;
2602  ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]], *tref2;
2603  AVFrame *ref1 = tref1->f, *ref2;
2604  int w1 = ref1->width, h1 = ref1->height, w2, h2;
2605  ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;
2606 
2607  if (b->comp) {
2608  tref2 = &s->refs[s->refidx[b->ref[1]]];
2609  ref2 = tref2->f;
2610  w2 = ref2->width;
2611  h2 = ref2->height;
2612  }
2613 
2614  // y inter pred
2615  if (b->bs > BS_8x8) {
2616  if (b->bs == BS_8x4) {
2617  mc_luma_dir(s, s->dsp.mc[3][b->filter][0], s->dst[0], ls_y,
2618  ref1->data[0], ref1->linesize[0], tref1,
2619  row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1);
2620  mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
2621  s->dst[0] + 4 * ls_y, ls_y,
2622  ref1->data[0], ref1->linesize[0], tref1,
2623  (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1);
2624 
2625  if (b->comp) {
2626  mc_luma_dir(s, s->dsp.mc[3][b->filter][1], s->dst[0], ls_y,
2627  ref2->data[0], ref2->linesize[0], tref2,
2628  row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2);
2629  mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
2630  s->dst[0] + 4 * ls_y, ls_y,
2631  ref2->data[0], ref2->linesize[0], tref2,
2632  (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2);
2633  }
2634  } else if (b->bs == BS_4x8) {
2635  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2636  ref1->data[0], ref1->linesize[0], tref1,
2637  row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1);
2638  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2639  ref1->data[0], ref1->linesize[0], tref1,
2640  row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1);
2641 
2642  if (b->comp) {
2643  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2644  ref2->data[0], ref2->linesize[0], tref2,
2645  row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2);
2646  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2647  ref2->data[0], ref2->linesize[0], tref2,
2648  row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2);
2649  }
2650  } else {
2651  av_assert2(b->bs == BS_4x4);
2652 
2653  // FIXME if two horizontally adjacent blocks have the same MV,
2654  // do a w8 instead of a w4 call
2655  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2656  ref1->data[0], ref1->linesize[0], tref1,
2657  row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1);
2658  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2659  ref1->data[0], ref1->linesize[0], tref1,
2660  row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1);
2661  mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2662  s->dst[0] + 4 * ls_y, ls_y,
2663  ref1->data[0], ref1->linesize[0], tref1,
2664  (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1);
2665  mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2666  s->dst[0] + 4 * ls_y + 4, ls_y,
2667  ref1->data[0], ref1->linesize[0], tref1,
2668  (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1);
2669 
2670  if (b->comp) {
2671  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2672  ref2->data[0], ref2->linesize[0], tref2,
2673  row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2);
2674  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2675  ref2->data[0], ref2->linesize[0], tref2,
2676  row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2);
2677  mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2678  s->dst[0] + 4 * ls_y, ls_y,
2679  ref2->data[0], ref2->linesize[0], tref2,
2680  (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2);
2681  mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2682  s->dst[0] + 4 * ls_y + 4, ls_y,
2683  ref2->data[0], ref2->linesize[0], tref2,
2684  (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2);
2685  }
2686  }
2687  } else {
2688  int bwl = bwlog_tab[0][b->bs];
2689  int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
2690 
2691  mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], s->dst[0], ls_y,
2692  ref1->data[0], ref1->linesize[0], tref1,
2693  row << 3, col << 3, &b->mv[0][0], bw, bh, w1, h1);
2694 
2695  if (b->comp)
2696  mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], s->dst[0], ls_y,
2697  ref2->data[0], ref2->linesize[0], tref2,
2698  row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2);
2699  }
2700 
2701  // uv inter pred
2702  {
2703  int bwl = bwlog_tab[1][b->bs];
2704  int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
2705  VP56mv mvuv;
2706 
2707  w1 = (w1 + 1) >> 1;
2708  h1 = (h1 + 1) >> 1;
2709  if (b->comp) {
2710  w2 = (w2 + 1) >> 1;
2711  h2 = (h2 + 1) >> 1;
2712  }
2713  if (b->bs > BS_8x8) {
2714  mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
2715  mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
2716  } else {
2717  mvuv = b->mv[0][0];
2718  }
2719 
2720  mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
2721  s->dst[1], s->dst[2], ls_uv,
2722  ref1->data[1], ref1->linesize[1],
2723  ref1->data[2], ref1->linesize[2], tref1,
2724  row << 2, col << 2, &mvuv, bw, bh, w1, h1);
2725 
2726  if (b->comp) {
2727  if (b->bs > BS_8x8) {
2728  mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
2729  mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
2730  } else {
2731  mvuv = b->mv[0][1];
2732  }
2733  mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
2734  s->dst[1], s->dst[2], ls_uv,
2735  ref2->data[1], ref2->linesize[1],
2736  ref2->data[2], ref2->linesize[2], tref2,
2737  row << 2, col << 2, &mvuv, bw, bh, w2, h2);
2738  }
2739  }
2740 
2741  if (!b->skip) {
2742  /* mostly copied from intra_recon() */
2743 
2744  int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2745  int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2746  int end_x = FFMIN(2 * (s->cols - col), w4);
2747  int end_y = FFMIN(2 * (s->rows - row), h4);
2748  int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2749  int uvstep1d = 1 << b->uvtx, p;
2750  uint8_t *dst = s->dst[0];
2751 
2752  // y itxfm add
2753  for (n = 0, y = 0; y < end_y; y += step1d) {
2754  uint8_t *ptr = dst;
2755  for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
2756  int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2757 
2758  if (eob)
2759  s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
2760  s->block + 16 * n, eob);
2761  }
2762  dst += 4 * s->y_stride * step1d;
2763  }
2764 
2765  // uv itxfm add
2766  end_x >>= 1;
2767  end_y >>= 1;
2768  step = 1 << (b->uvtx * 2);
2769  for (p = 0; p < 2; p++) {
2770  dst = s->dst[p + 1];
2771  for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2772  uint8_t *ptr = dst;
2773  for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
2774  int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2775 
2776  if (eob)
2777  s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2778  s->uvblock[p] + 16 * n, eob);
2779  }
2780  dst += 4 * uvstep1d * s->uv_stride;
2781  }
2782  }
2783  }
2784 }
2785 
2786 static av_always_inline void mask_edges(struct VP9Filter *lflvl, int is_uv,
2787  int row_and_7, int col_and_7,
2788  int w, int h, int col_end, int row_end,
2789  enum TxfmMode tx, int skip_inter)
2790 {
2791  // FIXME I'm pretty sure all loops can be replaced by a single LUT if
2792  // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
2793  // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
2794  // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
2795 
2796  // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
2797  // edges. This means that for UV, we work on two subsampled blocks at
2798  // a time, and we only use the topleft block's mode information to set
2799  // things like block strength. Thus, for any block size smaller than
2800  // 16x16, ignore the odd portion of the block.
2801  if (tx == TX_4X4 && is_uv) {
2802  if (h == 1) {
2803  if (row_and_7 & 1)
2804  return;
2805  if (!row_end)
2806  h += 1;
2807  }
2808  if (w == 1) {
2809  if (col_and_7 & 1)
2810  return;
2811  if (!col_end)
2812  w += 1;
2813  }
2814  }
2815 
2816  if (tx == TX_4X4 && !skip_inter) {
2817  int t = 1 << col_and_7, m_col = (t << w) - t, y;
2818  int m_col_odd = (t << (w - 1)) - t;
2819 
2820  // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
2821  if (is_uv) {
2822  int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;
2823 
2824  for (y = row_and_7; y < h + row_and_7; y++) {
2825  int col_mask_id = 2 - !(y & 7);
2826 
2827  lflvl->mask[is_uv][0][y][1] |= m_row_8;
2828  lflvl->mask[is_uv][0][y][2] |= m_row_4;
2829  // for odd lines, if the odd col is not being filtered,
2830  // skip odd row also:
2831  // .---. <-- a
2832  // | |
2833  // |___| <-- b
2834  // ^ ^
2835  // c d
2836  //
2837  // if a/c are even row/col and b/d are odd, and d is skipped,
2838  // e.g. right edge of size-66x66.webm, then skip b also (bug)
2839  if ((col_end & 1) && (y & 1)) {
2840  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
2841  } else {
2842  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
2843  }
2844  }
2845  } else {
2846  int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;
2847 
2848  for (y = row_and_7; y < h + row_and_7; y++) {
2849  int col_mask_id = 2 - !(y & 3);
2850 
2851  lflvl->mask[is_uv][0][y][1] |= m_row_8; // row edge
2852  lflvl->mask[is_uv][0][y][2] |= m_row_4;
2853  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
2854  lflvl->mask[is_uv][0][y][3] |= m_col;
2855  lflvl->mask[is_uv][1][y][3] |= m_col;
2856  }
2857  }
2858  } else {
2859  int y, t = 1 << col_and_7, m_col = (t << w) - t;
2860 
2861  if (!skip_inter) {
2862  int mask_id = (tx == TX_8X8);
2863  int l2 = tx + is_uv - 1, step1d = 1 << l2;
2864  static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
2865  int m_row = m_col & masks[l2];
2866 
2867  // at odd UV tx16/tx32 col/row loopfilter edges, force the 8px-wide
2868  // loopfilter so we don't filter past the visible edge
2869  if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
2870  int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
2871  int m_row_8 = m_row - m_row_16;
2872 
2873  for (y = row_and_7; y < h + row_and_7; y++) {
2874  lflvl->mask[is_uv][0][y][0] |= m_row_16;
2875  lflvl->mask[is_uv][0][y][1] |= m_row_8;
2876  }
2877  } else {
2878  for (y = row_and_7; y < h + row_and_7; y++)
2879  lflvl->mask[is_uv][0][y][mask_id] |= m_row;
2880  }
2881 
2882  if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
2883  for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
2884  lflvl->mask[is_uv][1][y][0] |= m_col;
2885  if (y - row_and_7 == h - 1)
2886  lflvl->mask[is_uv][1][y][1] |= m_col;
2887  } else {
2888  for (y = row_and_7; y < h + row_and_7; y += step1d)
2889  lflvl->mask[is_uv][1][y][mask_id] |= m_col;
2890  }
2891  } else if (tx != TX_4X4) {
2892  int mask_id;
2893 
2894  mask_id = (tx == TX_8X8) || (is_uv && h == 1);
2895  lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
2896  mask_id = (tx == TX_8X8) || (is_uv && w == 1);
2897  for (y = row_and_7; y < h + row_and_7; y++)
2898  lflvl->mask[is_uv][0][y][mask_id] |= t;
2899  } else if (is_uv) {
2900  int t8 = t & 0x01, t4 = t - t8;
2901 
2902  for (y = row_and_7; y < h + row_and_7; y++) {
2903  lflvl->mask[is_uv][0][y][2] |= t4;
2904  lflvl->mask[is_uv][0][y][1] |= t8;
2905  }
2906  lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
2907  } else {
2908  int t8 = t & 0x11, t4 = t - t8;
2909 
2910  for (y = row_and_7; y < h + row_and_7; y++) {
2911  lflvl->mask[is_uv][0][y][2] |= t4;
2912  lflvl->mask[is_uv][0][y][1] |= t8;
2913  }
2914  lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
2915  }
2916  }
2917 }
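 /* Editorial note (added): the column masks are plain bitfields over the
  * eight 8-pixel units of a superblock row. A worked example:
  * col_and_7 = 2 and w = 3 give t = 1 << 2 = 4 and
  * m_col = (4 << 3) - 4 = 28 = 0b00011100, i.e. bits set exactly for the
  * three 8-pixel columns the block covers.
  */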
2918 
2919 static void decode_b(AVCodecContext *ctx, int row, int col,
2920  struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
2921  enum BlockLevel bl, enum BlockPartition bp)
2922 {
2923  VP9Context *s = ctx->priv_data;
2924  VP9Block *b = s->b;
2925  enum BlockSize bs = bl * 3 + bp;
2926  int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
2927  int emu[2];
2928  AVFrame *f = s->frames[CUR_FRAME].tf.f;
2929 
2930  s->row = row;
2931  s->row7 = row & 7;
2932  s->col = col;
2933  s->col7 = col & 7;
2934  s->min_mv.x = -(128 + col * 64);
2935  s->min_mv.y = -(128 + row * 64);
2936  s->max_mv.x = 128 + (s->cols - col - w4) * 64;
2937  s->max_mv.y = 128 + (s->rows - row - h4) * 64;
2938  if (s->pass < 2) {
2939  b->bs = bs;
2940  b->bl = bl;
2941  b->bp = bp;
2942  decode_mode(ctx);
2943  b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));
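 /* Editorial note (added): the chroma planes are subsampled by 2, so when
  * the luma tx size spans the full block width or height (w4 * 2 or h4 * 2
  * equals 1 << b->tx in 4-pixel units), the uv tx must be one size smaller;
  * e.g. a 16x16 block with b->tx == TX_16X16 gets uvtx = TX_8X8. */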
2944 
2945  if (!b->skip) {
2946  decode_coeffs(ctx);
2947  } else {
2948  int row7 = s->row7;
2949 
2950 #define SPLAT_ZERO_CTX(v, n) \
2951  switch (n) { \
2952  case 1: v = 0; break; \
2953  case 2: AV_ZERO16(&v); break; \
2954  case 4: AV_ZERO32(&v); break; \
2955  case 8: AV_ZERO64(&v); break; \
2956  case 16: AV_ZERO128(&v); break; \
2957  }
2958 #define SPLAT_ZERO_YUV(dir, var, off, n) \
2959  do { \
2960  SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
2961  SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
2962  SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
2963  } while (0)
2964 
2965  switch (w4) {
2966  case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1); break;
2967  case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2); break;
2968  case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4); break;
2969  case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8); break;
2970  }
2971  switch (h4) {
2972  case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1); break;
2973  case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2); break;
2974  case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4); break;
2975  case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8); break;
2976  }
2977  }
2978  if (s->pass == 1) {
2979  s->b++;
2980  s->block += w4 * h4 * 64;
2981  s->uvblock[0] += w4 * h4 * 16;
2982  s->uvblock[1] += w4 * h4 * 16;
2983  s->eob += 4 * w4 * h4;
2984  s->uveob[0] += w4 * h4;
2985  s->uveob[1] += w4 * h4;
2986 
2987  return;
2988  }
2989  }
2990 
2991  // use emulated overhang buffers if the stride of the target buffer can't
2992  // hold the block. This allows us to support emu-edge and the like even
2993  // with large block overhangs
2994  emu[0] = (col + w4) * 8 > f->linesize[0] ||
2995  (row + h4) > s->rows;
2996  emu[1] = (col + w4) * 4 > f->linesize[1] ||
2997  (row + h4) > s->rows;
2998  if (emu[0]) {
2999  s->dst[0] = s->tmp_y;
3000  s->y_stride = 64;
3001  } else {
3002  s->dst[0] = f->data[0] + yoff;
3003  s->y_stride = f->linesize[0];
3004  }
3005  if (emu[1]) {
3006  s->dst[1] = s->tmp_uv[0];
3007  s->dst[2] = s->tmp_uv[1];
3008  s->uv_stride = 32;
3009  } else {
3010  s->dst[1] = f->data[1] + uvoff;
3011  s->dst[2] = f->data[2] + uvoff;
3012  s->uv_stride = f->linesize[1];
3013  }
3014  if (b->intra) {
3015  intra_recon(ctx, yoff, uvoff);
3016  } else {
3017  inter_recon(ctx);
3018  }
3019  if (emu[0]) {
3020  int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3021 
3022  for (n = 0; o < w; n++) {
3023  int bw = 64 >> n;
3024 
3025  av_assert2(n <= 4);
3026  if (w & bw) {
3027  s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o, f->linesize[0],
3028  s->tmp_y + o, 64, h, 0, 0);
3029  o += bw;
3030  }
3031  }
3032  }
3033  if (emu[1]) {
3034  int w = FFMIN(s->cols - col, w4) * 4, h = FFMIN(s->rows - row, h4) * 4, n, o = 0;
3035 
3036  for (n = 1; o < w; n++) {
3037  int bw = 64 >> n;
3038 
3039  av_assert2(n <= 4);
3040  if (w & bw) {
3041  s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o, f->linesize[1],
3042  s->tmp_uv[0] + o, 32, h, 0, 0);
3043  s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o, f->linesize[2],
3044  s->tmp_uv[1] + o, 32, h, 0, 0);
3045  o += bw;
3046  }
3047  }
3048  }
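 /* Editorial note (added): the copy-back loops above decompose the dirty
  * width into power-of-two MC copy calls, one per set bit of w. A worked
  * example: w = 40 is written back as one 32-wide copy (n = 1) followed by
  * one 8-wide copy (n = 3), since 40 = 32 + 8. */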
3049 
3050  // pick filter level and find edges to apply filter to
3051  if (s->filter.level &&
3052  (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3053  [b->mode[3] != ZEROMV]) > 0) {
3054  int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3055  int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
3056 
3057  setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3058  mask_edges(lflvl, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3059  mask_edges(lflvl, 1, row7, col7, x_end, y_end,
3060  s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3061  s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3062  b->uvtx, skip_inter);
3063 
3064  if (!s->filter.lim_lut[lvl]) {
3065  int sharp = s->filter.sharpness;
3066  int limit = lvl;
3067 
3068  if (sharp > 0) {
3069  limit >>= (sharp + 3) >> 2;
3070  limit = FFMIN(limit, 9 - sharp);
3071  }
3072  limit = FFMAX(limit, 1);
3073 
3074  s->filter.lim_lut[lvl] = limit;
3075  s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
3076  }
3077  }
3078 
3079  if (s->pass == 2) {
3080  s->b++;
3081  s->block += w4 * h4 * 64;
3082  s->uvblock[0] += w4 * h4 * 16;
3083  s->uvblock[1] += w4 * h4 * 16;
3084  s->eob += 4 * w4 * h4;
3085  s->uveob[0] += w4 * h4;
3086  s->uveob[1] += w4 * h4;
3087  }
3088 }
3089 
3090 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3091  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3092 {
3093  VP9Context *s = ctx->priv_data;
3094  int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3095  (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
3096  const uint8_t *p = s->keyframe ? vp9_default_kf_partition_probs[bl][c] :
3097  s->prob.p.partition[bl][c];
3098  enum BlockPartition bp;
3099  ptrdiff_t hbs = 4 >> bl;
3100  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3101  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3102 
3103  if (bl == BL_8X8) {
3104  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3105  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3106  } else if (col + hbs < s->cols) { // FIXME why not <=?
3107  if (row + hbs < s->rows) { // FIXME why not <=?
3108  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3109  switch (bp) {
3110  case PARTITION_NONE:
3111  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3112  break;
3113  case PARTITION_H:
3114  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3115  yoff += hbs * 8 * y_stride;
3116  uvoff += hbs * 4 * uv_stride;
3117  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
3118  break;
3119  case PARTITION_V:
3120  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3121  yoff += hbs * 8;
3122  uvoff += hbs * 4;
3123  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3124  break;
3125  case PARTITION_SPLIT:
3126  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3127  decode_sb(ctx, row, col + hbs, lflvl,
3128  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3129  yoff += hbs * 8 * y_stride;
3130  uvoff += hbs * 4 * uv_stride;
3131  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3132  decode_sb(ctx, row + hbs, col + hbs, lflvl,
3133  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3134  break;
3135  default:
3136  av_assert0(0);
3137  }
3138  } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3139  bp = PARTITION_SPLIT;
3140  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3141  decode_sb(ctx, row, col + hbs, lflvl,
3142  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3143  } else {
3144  bp = PARTITION_H;
3145  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3146  }
3147  } else if (row + hbs < s->rows) { // FIXME why not <=?
3148  if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3149  bp = PARTITION_SPLIT;
3150  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3151  yoff += hbs * 8 * y_stride;
3152  uvoff += hbs * 4 * uv_stride;
3153  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3154  } else {
3155  bp = PARTITION_V;
3156  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3157  }
3158  } else {
3159  bp = PARTITION_SPLIT;
3160  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3161  }
3162  s->counts.partition[bl][c][bp]++;
3163 }
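 /* Editorial note (added): the context c packs two neighbour bits, bit 0
  * from the above and bit 1 from the left partition context at this block
  * level, selecting one of four probability sets per level. At the right
  * and bottom frame edges only partitions that keep blocks inside the
  * frame are representable, which is why a single binary probability
  * (p[1] or p[2]) is read there instead of the full partition tree.
  */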
3164 
3165 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3166  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3167 {
3168  VP9Context *s = ctx->priv_data;
3169  VP9Block *b = s->b;
3170  ptrdiff_t hbs = 4 >> bl;
3171  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3172  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3173 
3174  if (bl == BL_8X8) {
3175  av_assert2(b->bl == BL_8X8);
3176  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3177  } else if (s->b->bl == bl) {
3178  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3179  if (b->bp == PARTITION_H && row + hbs < s->rows) {
3180  yoff += hbs * 8 * y_stride;
3181  uvoff += hbs * 4 * uv_stride;
3182  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3183  } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3184  yoff += hbs * 8;
3185  uvoff += hbs * 4;
3186  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
3187  }
3188  } else {
3189  decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3190  if (col + hbs < s->cols) { // FIXME why not <=?
3191  if (row + hbs < s->rows) {
3192  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs,
3193  uvoff + 4 * hbs, bl + 1);
3194  yoff += hbs * 8 * y_stride;
3195  uvoff += hbs * 4 * uv_stride;
3196  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3197  decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3198  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3199  } else {
3200  yoff += hbs * 8;
3201  uvoff += hbs * 4;
3202  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3203  }
3204  } else if (row + hbs < s->rows) {
3205  yoff += hbs * 8 * y_stride;
3206  uvoff += hbs * 4 * uv_stride;
3207  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3208  }
3209  }
3210 }
3211 
3212 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3213  int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3214 {
3215  VP9Context *s = ctx->priv_data;
3216  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3217  uint8_t *dst = f->data[0] + yoff, *lvl = lflvl->level;
3218  ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
3219  int y, x, p;
3220 
3221  // FIXME to what extent can we interleave the v/h loopfilter calls? E.g.
3222  // if you think of them as acting on a 8x8 block max, we can interleave
3223  // each v/h within the single x loop, but that only works if we work on
3224  // 8 pixel blocks, and we won't always do that (we want at least 16px
3225  // to use SSE2 optimizations, perhaps 32 for AVX2)
3226 
3227  // filter edges between columns, Y plane (e.g. block1 | block2)
3228  for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
3229  uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
3230  uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
3231  unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3232  unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3233  unsigned hm = hm1 | hm2 | hm13 | hm23;
3234 
3235  for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
3236  if (hm1 & x) {
3237  int L = *l, H = L >> 4;
3238  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3239 
3240  if (col || x > 1) {
3241  if (hmask1[0] & x) {
3242  if (hmask2[0] & x) {
3243  av_assert2(l[8] == L);
3244  s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
3245  } else {
3246  s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
3247  }
3248  } else if (hm2 & x) {
3249  L = l[8];
3250  H |= (L >> 4) << 8;
3251  E |= s->filter.mblim_lut[L] << 8;
3252  I |= s->filter.lim_lut[L] << 8;
3253  s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3254  [!!(hmask2[1] & x)]
3255  [0](ptr, ls_y, E, I, H);
3256  } else {
3257  s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3258  [0](ptr, ls_y, E, I, H);
3259  }
3260  }
3261  } else if (hm2 & x) {
3262  int L = l[8], H = L >> 4;
3263  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3264 
3265  if (col || x > 1) {
3266  s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3267  [0](ptr + 8 * ls_y, ls_y, E, I, H);
3268  }
3269  }
3270  if (hm13 & x) {
3271  int L = *l, H = L >> 4;
3272  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3273 
3274  if (hm23 & x) {
3275  L = l[8];
3276  H |= (L >> 4) << 8;
3277  E |= s->filter.mblim_lut[L] << 8;
3278  I |= s->filter.lim_lut[L] << 8;
3279  s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
3280  } else {
3281  s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
3282  }
3283  } else if (hm23 & x) {
3284  int L = l[8], H = L >> 4;
3285  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3286 
3287  s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
3288  }
3289  }
3290  }
3291 
3292  // block1
3293  // filter edges between rows, Y plane (e.g. ------)
3294  // block2
3295  dst = f->data[0] + yoff;
3296  lvl = lflvl->level;
3297  for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
3298  uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
3299  unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
3300 
3301  for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
3302  if (row || y) {
3303  if (vm & x) {
3304  int L = *l, H = L >> 4;
3305  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3306 
3307  if (vmask[0] & x) {
3308  if (vmask[0] & (x << 1)) {
3309  av_assert2(l[1] == L);
3310  s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
3311  } else {
3312  s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
3313  }
3314  } else if (vm & (x << 1)) {
3315  L = l[1];
3316  H |= (L >> 4) << 8;
3317  E |= s->filter.mblim_lut[L] << 8;
3318  I |= s->filter.lim_lut[L] << 8;
3319  s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3320  [!!(vmask[1] & (x << 1))]
3321  [1](ptr, ls_y, E, I, H);
3322  } else {
3323  s->dsp.loop_filter_8[!!(vmask[1] & x)]
3324  [1](ptr, ls_y, E, I, H);
3325  }
3326  } else if (vm & (x << 1)) {
3327  int L = l[1], H = L >> 4;
3328  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3329 
3330  s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
3331  [1](ptr + 8, ls_y, E, I, H);
3332  }
3333  }
3334  if (vm3 & x) {
3335  int L = *l, H = L >> 4;
3336  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3337 
3338  if (vm3 & (x << 1)) {
3339  L = l[1];
3340  H |= (L >> 4) << 8;
3341  E |= s->filter.mblim_lut[L] << 8;
3342  I |= s->filter.lim_lut[L] << 8;
3343  s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
3344  } else {
3345  s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
3346  }
3347  } else if (vm3 & (x << 1)) {
3348  int L = l[1], H = L >> 4;
3349  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3350 
3351  s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
3352  }
3353  }
3354  }
3355 
3356  // same principle but for U/V planes
3357  for (p = 0; p < 2; p++) {
3358  lvl = lflvl->level;
3359  dst = f->data[1 + p] + uvoff;
3360  for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
3361  uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
3362  uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
3363  unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
3364  unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;
3365 
3366  for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
3367  if (col || x > 1) {
3368  if (hm1 & x) {
3369  int L = *l, H = L >> 4;
3370  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3371 
3372  if (hmask1[0] & x) {
3373  if (hmask2[0] & x) {
3374  av_assert2(l[16] == L);
3375  s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
3376  } else {
3377  s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
3378  }
3379  } else if (hm2 & x) {
3380  L = l[16];
3381  H |= (L >> 4) << 8;
3382  E |= s->filter.mblim_lut[L] << 8;
3383  I |= s->filter.lim_lut[L] << 8;
3384  s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3385  [!!(hmask2[1] & x)]
3386  [0](ptr, ls_uv, E, I, H);
3387  } else {
3388  s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3389  [0](ptr, ls_uv, E, I, H);
3390  }
3391  } else if (hm2 & x) {
3392  int L = l[16], H = L >> 4;
3393  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3394 
3395  s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3396  [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
3397  }
3398  }
3399  if (x & 0xAA)
3400  l += 2;
3401  }
3402  }
3403  lvl = lflvl->level;
3404  dst = f->data[1 + p] + uvoff;
3405  for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
3406  uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
3407  unsigned vm = vmask[0] | vmask[1] | vmask[2];
3408 
3409  for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
3410  if (row || y) {
3411  if (vm & x) {
3412  int L = *l, H = L >> 4;
3413  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3414 
3415  if (vmask[0] & x) {
3416  if (vmask[0] & (x << 2)) {
3417  av_assert2(l[2] == L);
3418  s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
3419  } else {
3420  s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
3421  }
3422  } else if (vm & (x << 2)) {
3423  L = l[2];
3424  H |= (L >> 4) << 8;
3425  E |= s->filter.mblim_lut[L] << 8;
3426  I |= s->filter.lim_lut[L] << 8;
3427  s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3428  [!!(vmask[1] & (x << 2))]
3429  [1](ptr, ls_uv, E, I, H);
3430  } else {
3431  s->dsp.loop_filter_8[!!(vmask[1] & x)]
3432  [1](ptr, ls_uv, E, I, H);
3433  }
3434  } else if (vm & (x << 2)) {
3435  int L = l[2], H = L >> 4;
3436  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3437 
3438  s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
3439  [1](ptr + 8, ls_uv, E, I, H);
3440  }
3441  }
3442  }
3443  if (y & 1)
3444  lvl += 16;
3445  }
3446  }
3447 }
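 /* Editorial note (added): when two adjacent 8-pixel edges carry different
  * filter levels, the code above packs both into 16-bit E/I/H values (low
  * byte = first edge, high byte = second) and calls a loop_filter_mix2
  * variant, filtering both edges in one DSP call instead of two narrow
  * ones.
  */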
3448 
3449 static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
3450 {
3451  int sb_start = ( idx * n) >> log2_n;
3452  int sb_end = ((idx + 1) * n) >> log2_n;
3453  *start = FFMIN(sb_start, n) << 3;
3454  *end = FFMIN(sb_end, n) << 3;
3455 }
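 /* Editorial note (added): a worked example: with n = 11 superblock columns
  * and log2_n = 1 (two tile columns), tile 0 spans sb 0..5 and tile 1 spans
  * sb 5..11; the << 3 converts superblock units into 8x8-block units, so
  * the returned ranges are 0..40 and 40..88.
  */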
3456 
3457 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3458  int max_count, int update_factor)
3459 {
3460  unsigned ct = ct0 + ct1, p2, p1;
3461 
3462  if (!ct)
3463  return;
3464 
3465  p1 = *p;
3466  p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3467  p2 = av_clip(p2, 1, 255);
3468  ct = FFMIN(ct, max_count);
3469  update_factor = FASTDIV(update_factor * ct, max_count);
3470 
3471  // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3472  *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
3473 }
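 /* Editorial note (added): a worked example: ct0 = 30, ct1 = 10,
  * max_count = 20, update_factor = 128 and *p = 100. The empirical
  * probability is p2 = (30 * 256 + 20) / 40 = 192; ct saturates at 20, so
  * the factor stays 128 and *p becomes
  * 100 + (((192 - 100) * 128 + 128) >> 8) = 146, i.e. pulled halfway
  * toward the observed frequency.
  */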
3474 
3475 static void adapt_probs(VP9Context *s)
3476 {
3477  int i, j, k, l, m;
3478  prob_context *p = &s->prob_ctx[s->framectxid].p;
3479  int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
3480 
3481  // coefficients
3482  for (i = 0; i < 4; i++)
3483  for (j = 0; j < 2; j++)
3484  for (k = 0; k < 2; k++)
3485  for (l = 0; l < 6; l++)
3486  for (m = 0; m < 6; m++) {
3487  uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
3488  unsigned *e = s->counts.eob[i][j][k][l][m];
3489  unsigned *c = s->counts.coef[i][j][k][l][m];
3490 
3491  if (l == 0 && m >= 3) // dc only has 3 pt
3492  break;
3493 
3494  adapt_prob(&pp[0], e[0], e[1], 24, uf);
3495  adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3496  adapt_prob(&pp[2], c[1], c[2], 24, uf);
3497  }
3498 
3499  if (s->keyframe || s->intraonly) {
3500  memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3501  memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3502  memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3503  memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
3504  return;
3505  }
3506 
3507  // skip flag
3508  for (i = 0; i < 3; i++)
3509  adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3510 
3511  // intra/inter flag
3512  for (i = 0; i < 4; i++)
3513  adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3514 
3515  // comppred flag
3516  if (s->comppredmode == PRED_SWITCHABLE) {
3517  for (i = 0; i < 5; i++)
3518  adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3519  }
3520 
3521  // reference frames
3522  if (s->comppredmode != PRED_SINGLEREF) {
3523  for (i = 0; i < 5; i++)
3524  adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3525  s->counts.comp_ref[i][1], 20, 128);
3526  }
3527 
3528  if (s->comppredmode != PRED_COMPREF) {
3529  for (i = 0; i < 5; i++) {
3530  uint8_t *pp = p->single_ref[i];
3531  unsigned (*c)[2] = s->counts.single_ref[i];
3532 
3533  adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3534  adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3535  }
3536  }
3537 
3538  // block partitioning
3539  for (i = 0; i < 4; i++)
3540  for (j = 0; j < 4; j++) {
3541  uint8_t *pp = p->partition[i][j];
3542  unsigned *c = s->counts.partition[i][j];
3543 
3544  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3545  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3546  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3547  }
3548 
3549  // tx size
3550  if (s->txfmmode == TX_SWITCHABLE) {
3551  for (i = 0; i < 2; i++) {
3552  unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3553 
3554  adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3555  adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3556  adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3557  adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3558  adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3559  adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3560  }
3561  }
3562 
3563  // interpolation filter
3564  if (s->filtermode == FILTER_SWITCHABLE) {
3565  for (i = 0; i < 4; i++) {
3566  uint8_t *pp = p->filter[i];
3567  unsigned *c = s->counts.filter[i];
3568 
3569  adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3570  adapt_prob(&pp[1], c[1], c[2], 20, 128);
3571  }
3572  }
3573 
3574  // inter modes
3575  for (i = 0; i < 7; i++) {
3576  uint8_t *pp = p->mv_mode[i];
3577  unsigned *c = s->counts.mv_mode[i];
3578 
3579  adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3580  adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3581  adapt_prob(&pp[2], c[1], c[3], 20, 128);
3582  }
3583 
3584  // mv joints
3585  {
3586  uint8_t *pp = p->mv_joint;
3587  unsigned *c = s->counts.mv_joint;
3588 
3589  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3590  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3591  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3592  }
3593 
3594  // mv components
3595  for (i = 0; i < 2; i++) {
3596  uint8_t *pp;
3597  unsigned *c, (*c2)[2], sum;
3598 
3599  adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3600  s->counts.mv_comp[i].sign[1], 20, 128);
3601 
3602  pp = p->mv_comp[i].classes;
3603  c = s->counts.mv_comp[i].classes;
3604  sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3605  adapt_prob(&pp[0], c[0], sum, 20, 128);
3606  sum -= c[1];
3607  adapt_prob(&pp[1], c[1], sum, 20, 128);
3608  sum -= c[2] + c[3];
3609  adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3610  adapt_prob(&pp[3], c[2], c[3], 20, 128);
3611  sum -= c[4] + c[5];
3612  adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3613  adapt_prob(&pp[5], c[4], c[5], 20, 128);
3614  sum -= c[6];
3615  adapt_prob(&pp[6], c[6], sum, 20, 128);
3616  adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3617  adapt_prob(&pp[8], c[7], c[8], 20, 128);
3618  adapt_prob(&pp[9], c[9], c[10], 20, 128);
3619 
3620  adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3621  s->counts.mv_comp[i].class0[1], 20, 128);
3622  pp = p->mv_comp[i].bits;
3623  c2 = s->counts.mv_comp[i].bits;
3624  for (j = 0; j < 10; j++)
3625  adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3626 
3627  for (j = 0; j < 2; j++) {
3628  pp = p->mv_comp[i].class0_fp[j];
3629  c = s->counts.mv_comp[i].class0_fp[j];
3630  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3631  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3632  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3633  }
3634  pp = p->mv_comp[i].fp;
3635  c = s->counts.mv_comp[i].fp;
3636  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3637  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3638  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3639 
3640  if (s->highprecisionmvs) {
3641  adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3642  s->counts.mv_comp[i].class0_hp[1], 20, 128);
3643  adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3644  s->counts.mv_comp[i].hp[1], 20, 128);
3645  }
3646  }
3647 
3648  // y intra modes
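 /* editorial note: the initial sum skips c[2] because DC_PRED is index 2 in
  * FFmpeg's IntraPredMode ordering and the tree root splits DC against all
  * other modes; the uv loop below follows the same pattern. */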
3649  for (i = 0; i < 4; i++) {
3650  uint8_t *pp = p->y_mode[i];
3651  unsigned *c = s->counts.y_mode[i], sum, s2;
3652 
3653  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3654  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3655  sum -= c[TM_VP8_PRED];
3656  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3657  sum -= c[VERT_PRED];
3658  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3659  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3660  sum -= s2;
3661  adapt_prob(&pp[3], s2, sum, 20, 128);
3662  s2 -= c[HOR_PRED];
3663  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3664  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3665  sum -= c[DIAG_DOWN_LEFT_PRED];
3666  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3667  sum -= c[VERT_LEFT_PRED];
3668  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3669  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3670  }
3671 
3672  // uv intra modes
3673  for (i = 0; i < 10; i++) {
3674  uint8_t *pp = p->uv_mode[i];
3675  unsigned *c = s->counts.uv_mode[i], sum, s2;
3676 
3677  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3678  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3679  sum -= c[TM_VP8_PRED];
3680  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3681  sum -= c[VERT_PRED];
3682  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3683  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3684  sum -= s2;
3685  adapt_prob(&pp[3], s2, sum, 20, 128);
3686  s2 -= c[HOR_PRED];
3687  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3688  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3689  sum -= c[DIAG_DOWN_LEFT_PRED];
3690  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3691  sum -= c[VERT_LEFT_PRED];
3692  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3693  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3694  }
3695 }
3696 
3697 static void free_buffers(VP9Context *s)
3698 {
3699  av_freep(&s->intra_pred_data[0]);
3700  av_freep(&s->b_base);
3701  av_freep(&s->block_base);
3702 }
3703 
3704 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3705 {
3706  VP9Context *s = ctx->priv_data;
3707  int i;
3708 
3709  for (i = 0; i < 2; i++) {
3710  if (s->frames[i].tf.f->data[0])
3711  vp9_unref_frame(ctx, &s->frames[i]);
3712  av_frame_free(&s->frames[i].tf.f);
3713  }
3714  for (i = 0; i < 8; i++) {
3715  if (s->refs[i].f->data[0])
3716  ff_thread_release_buffer(ctx, &s->refs[i]);
3717  av_frame_free(&s->refs[i].f);
3718  if (s->next_refs[i].f->data[0])
3719  ff_thread_release_buffer(ctx, &s->next_refs[i]);
3720  av_frame_free(&s->next_refs[i].f);
3721  }
3722  free_buffers(s);
3723  av_freep(&s->c_b);
3724  s->c_b_size = 0;
3725 
3726  return 0;
3727 }
3728 
3729 
3730 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3731  int *got_frame, AVPacket *pkt)
3732 {
3733  const uint8_t *data = pkt->data;
3734  int size = pkt->size;
3735  VP9Context *s = ctx->priv_data;
3736  int res, tile_row, tile_col, i, ref, row, col;
3737  ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3738  AVFrame *f;
3739 
3740  if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
3741  return res;
3742  } else if (res == 0) {
3743  if (!s->refs[ref].f->data[0]) {
3744  av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3745  return AVERROR_INVALIDDATA;
3746  }
3747  if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
3748  return res;
3749  *got_frame = 1;
3750  return 0;
3751  }
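 /* editorial note: a header size of 0 denotes a "show existing frame"
  * packet, handled above by re-outputting the requested reference without
  * decoding any new data. */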
3752  data += res;
3753  size -= res;
3754 
3755  if (s->frames[LAST_FRAME].tf.f->data[0])
3756  vp9_unref_frame(ctx, &s->frames[LAST_FRAME]);
3757  if (!s->keyframe && s->frames[CUR_FRAME].tf.f->data[0] &&
3758  (res = vp9_ref_frame(ctx, &s->frames[LAST_FRAME], &s->frames[CUR_FRAME])) < 0)
3759  return res;
3760  if (s->frames[CUR_FRAME].tf.f->data[0])
3761  vp9_unref_frame(ctx, &s->frames[CUR_FRAME]);
3762  if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
3763  return res;
3764  f = s->frames[CUR_FRAME].tf.f;
3765  f->key_frame = s->keyframe;
3766  f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
3767  ls_y = f->linesize[0];
3768  ls_uv = f->linesize[1];
3769 
3770  // ref frame setup
3771  for (i = 0; i < 8; i++) {
3772  if (s->next_refs[i].f->data[0])
3773  ff_thread_release_buffer(ctx, &s->next_refs[i]);
3774  if (s->refreshrefmask & (1 << i)) {
3775  res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
3776  } else {
3777  res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
3778  }
3779  if (res < 0)
3780  return res;
3781  }
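 /* editorial note: refresh decisions are staged in next_refs instead of
  * being applied to refs directly, so an error during tile decoding leaves
  * the visible reference slots intact; refs is only updated from next_refs
  * once the frame has fully decoded (see below). */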
3782 
3783  // main tile decode loop
3784  memset(s->above_partition_ctx, 0, s->cols);
3785  memset(s->above_skip_ctx, 0, s->cols);
3786  if (s->keyframe || s->intraonly) {
3787  memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
3788  } else {
3789  memset(s->above_mode_ctx, NEARESTMV, s->cols);
3790  }
3791  memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
3792  memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
3793  memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
3794  memset(s->above_segpred_ctx, 0, s->cols);
3795  s->pass = s->uses_2pass =
3796   ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
3797  if ((res = update_block_buffers(ctx)) < 0) {
3798  av_log(ctx, AV_LOG_ERROR,
3799  "Failed to allocate block buffers\n");
3800  return res;
3801  }
3802  if (s->refreshctx && s->parallelmode) {
3803  int j, k, l, m;
3804 
3805  for (i = 0; i < 4; i++) {
3806  for (j = 0; j < 2; j++)
3807  for (k = 0; k < 2; k++)
3808  for (l = 0; l < 6; l++)
3809  for (m = 0; m < 6; m++)
3810  memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
3811  s->prob.coef[i][j][k][l][m], 3);
3812  if (s->txfmmode == i)
3813  break;
3814  }
3815  s->prob_ctx[s->framectxid].p = s->prob.p;
3816  ff_thread_finish_setup(ctx);
3817  }
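 /* editorial note: in parallelmode no backward adaptation happens, so the
  * frame context written back above is fully determined by the header; that
  * is what makes it safe to release dependent frame threads this early via
  * ff_thread_finish_setup(). */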
3818 
3819  do {
3820  yoff = uvoff = 0;
3821  s->b = s->b_base;
3822  s->block = s->block_base;
3823  s->uvblock[0] = s->uvblock_base[0];
3824  s->uvblock[1] = s->uvblock_base[1];
3825  s->eob = s->eob_base;
3826  s->uveob[0] = s->uveob_base[0];
3827  s->uveob[1] = s->uveob_base[1];
3828 
3829  for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
3830  set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
3831  tile_row, s->tiling.log2_tile_rows, s->sb_rows);
3832  if (s->pass != 2) {
3833  for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3834  unsigned tile_size;
3835 
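 /* editorial note: every tile except the last one in the frame carries a
  * 32-bit big-endian length prefix; the final tile simply occupies the
  * remainder of the packet. */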
3836  if (tile_col == s->tiling.tile_cols - 1 &&
3837  tile_row == s->tiling.tile_rows - 1) {
3838  tile_size = size;
3839  } else {
3840  tile_size = AV_RB32(data);
3841  data += 4;
3842  size -= 4;
3843  }
3844  if (tile_size > size) {
3845  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3846  return AVERROR_INVALIDDATA;
3847  }
3848  ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
3849  if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
3850  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3851  return AVERROR_INVALIDDATA;
3852  }
3853  data += tile_size;
3854  size -= tile_size;
3855  }
3856  }
3857 
3858  for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
3859  row += 8, yoff += ls_y * 64, uvoff += ls_uv * 32) {
3860  struct VP9Filter *lflvl_ptr = s->lflvl;
3861  ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
3862 
3863  for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3864  set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
3865  tile_col, s->tiling.log2_tile_cols, s->sb_cols);
3866 
3867  if (s->pass != 2) {
3868  memset(s->left_partition_ctx, 0, 8);
3869  memset(s->left_skip_ctx, 0, 8);
3870  if (s->keyframe || s->intraonly) {
3871  memset(s->left_mode_ctx, DC_PRED, 16);
3872  } else {
3873  memset(s->left_mode_ctx, NEARESTMV, 8);
3874  }
3875  memset(s->left_y_nnz_ctx, 0, 16);
3876  memset(s->left_uv_nnz_ctx, 0, 16);
3877  memset(s->left_segpred_ctx, 0, 8);
3878 
3879  memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
3880  }
3881 
3882  for (col = s->tiling.tile_col_start;
3883  col < s->tiling.tile_col_end;
3884  col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3885  // FIXME integrate with lf code (i.e. zero after each
3886  // use, similar to invtxfm coefficients, or similar)
3887  if (s->pass != 1) {
3888  memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
3889  }
3890 
3891  if (s->pass == 2) {
3892  decode_sb_mem(ctx, row, col, lflvl_ptr,
3893  yoff2, uvoff2, BL_64X64);
3894  } else {
3895  decode_sb(ctx, row, col, lflvl_ptr,
3896  yoff2, uvoff2, BL_64X64);
3897  }
3898  }
3899  if (s->pass != 2) {
3900  memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
3901  }
3902  }
3903 
3904  if (s->pass == 1) {
3905  continue;
3906  }
3907 
3908  // backup pre-loopfilter reconstruction data for intra
3909  // prediction of next row of sb64s
3910  if (row + 8 < s->rows) {
3911  memcpy(s->intra_pred_data[0],
3912  f->data[0] + yoff + 63 * ls_y,
3913  8 * s->cols);
3914  memcpy(s->intra_pred_data[1],
3915  f->data[1] + uvoff + 31 * ls_uv,
3916  4 * s->cols);
3917  memcpy(s->intra_pred_data[2],
3918  f->data[2] + uvoff + 31 * ls_uv,
3919  4 * s->cols);
3920  }
3921 
3922  // loopfilter one row
3923  if (s->filter.level) {
3924  yoff2 = yoff;
3925  uvoff2 = uvoff;
3926  lflvl_ptr = s->lflvl;
3927  for (col = 0; col < s->cols;
3928  col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3929  loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
3930  }
3931  }
3932 
3933  // FIXME maybe we can make this more finegrained by running the
3934  // loopfilter per-block instead of after each sbrow
3935  // In fact that would also make intra pred left preparation easier?
3936  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
3937  }
3938  }
3939 
3940  if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
3941  adapt_probs(s);
3942  ff_thread_finish_setup(ctx);
3943  }
3944  } while (s->pass++ == 1);
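 /* editorial note: with frame threading this loop runs twice, pass 1 parsing
  * the bitstream into the block buffers and pass 2 reconstructing from them;
  * otherwise s->pass stays 0 and the body runs exactly once. */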
3945  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3946 
3947  // ref frame setup
3948  for (i = 0; i < 8; i++) {
3949  if (s->refs[i].f->data[0])
3950  ff_thread_release_buffer(ctx, &s->refs[i]);
3951  ff_thread_ref_frame(&s->refs[i], &s->next_refs[i]);
3952  }
3953 
3954  if (!s->invisible) {
3955  if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
3956  return res;
3957  *got_frame = 1;
3958  }
3959 
3960  return 0;
3961 }
3962 
3963 static void vp9_decode_flush(AVCodecContext *ctx)
3964 {
3965  VP9Context *s = ctx->priv_data;
3966  int i;
3967 
3968  for (i = 0; i < 2; i++)
3969  vp9_unref_frame(ctx, &s->frames[i]);
3970  for (i = 0; i < 8; i++)
3971  ff_thread_release_buffer(ctx, &s->refs[i]);
3972 }
3973 
3974 static int init_frames(AVCodecContext *ctx)
3975 {
3976  VP9Context *s = ctx->priv_data;
3977  int i;
3978 
3979  for (i = 0; i < 2; i++) {
3980  s->frames[i].tf.f = av_frame_alloc();
3981  if (!s->frames[i].tf.f) {
3982  vp9_decode_free(ctx);
3983  av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3984  return AVERROR(ENOMEM);
3985  }
3986  }
3987  for (i = 0; i < 8; i++) {
3988  s->refs[i].f = av_frame_alloc();
3989  s->next_refs[i].f = av_frame_alloc();
3990  if (!s->refs[i].f || !s->next_refs[i].f) {
3991  vp9_decode_free(ctx);
3992  av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3993  return AVERROR(ENOMEM);
3994  }
3995  }
3996 
3997  return 0;
3998 }
3999 
4000 static av_cold int vp9_decode_init(AVCodecContext *ctx)
4001 {
4002  VP9Context *s = ctx->priv_data;
4003 
4004  ctx->internal->allocate_progress = 1;
4005  ctx->pix_fmt = AV_PIX_FMT_YUV420P;
4006  ff_vp9dsp_init(&s->dsp);
4007  ff_videodsp_init(&s->vdsp, 8);
4008  s->filter.sharpness = -1;
4009 
4010  return init_frames(ctx);
4011 }
4012 
4013 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4014 {
4015  return init_frames(avctx);
4016 }
4017 
4018 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
4019 {
4020  int i, res;
4021  VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
4022 
4023  // detect size changes in other threads
4024  if (s->intra_pred_data[0] &&
4025  (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols || s->rows != ssrc->rows)) {
4026  free_buffers(s);
4027  }
4028 
4029  for (i = 0; i < 2; i++) {
4030  if (s->frames[i].tf.f->data[0])
4031  vp9_unref_frame(dst, &s->frames[i]);
4032  if (ssrc->frames[i].tf.f->data[0]) {
4033  if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
4034  return res;
4035  }
4036  }
4037  for (i = 0; i < 8; i++) {
4038  if (s->refs[i].f->data[0])
4039  ff_thread_release_buffer(dst, &s->refs[i]);
4040  if (ssrc->next_refs[i].f->data[0]) {
4041  if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
4042  return res;
4043  }
4044  }
4045 
4046  s->invisible = ssrc->invisible;
4047  s->keyframe = ssrc->keyframe;
4048  s->uses_2pass = ssrc->uses_2pass;
4049  memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4050  memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
4051  if (ssrc->segmentation.enabled) {
4052  memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
4053  sizeof(s->segmentation.feat));
4054  }
4055 
4056  return 0;
4057 }
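/* editorial note: only state that must survive across frames is copied to
 * the destination thread context: frame and reference buffers, the saved
 * probability contexts, loop-filter deltas and segmentation features; the
 * rest is re-derived from each frame's header. */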
4058 
4059 AVCodec ff_vp9_decoder = {
4060  .name = "vp9",
4061  .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
4062  .type = AVMEDIA_TYPE_VIDEO,
4063  .id = AV_CODEC_ID_VP9,
4064  .priv_data_size = sizeof(VP9Context),
4065  .init = vp9_decode_init,
4066  .close = vp9_decode_free,
4067  .decode = vp9_decode_frame,
4068  .capabilities = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
4069  .flush = vp9_decode_flush,
4070  .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4071  .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4072 };