vp9.c
1 /*
2  * VP9 compatible video decoder
3  *
4  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5  * Copyright (C) 2013 Clément Bœsch <u pkh me>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "avcodec.h"
25 #include "get_bits.h"
26 #include "internal.h"
27 #include "thread.h"
28 #include "videodsp.h"
29 #include "vp56.h"
30 #include "vp9.h"
31 #include "vp9data.h"
32 #include "vp9dsp.h"
33 #include "libavutil/avassert.h"
34 
35 #define VP9_SYNCCODE 0x498342
36 
37 enum CompPredMode {
38  PRED_SINGLEREF,
39  PRED_COMPREF,
40  PRED_SWITCHABLE,
41 };
42 
43 enum BlockLevel {
44  BL_64X64,
45  BL_32X32,
46  BL_16X16,
47  BL_8X8,
48 };
49 
50 enum BlockSize {
51  BS_64x64,
52  BS_64x32,
53  BS_32x64,
54  BS_32x32,
55  BS_32x16,
56  BS_16x32,
57  BS_16x16,
58  BS_16x8,
59  BS_8x16,
60  BS_8x8,
61  BS_8x4,
62  BS_4x8,
63  BS_4x4,
64  N_BS_SIZES,
65 };
66 
67 struct VP9mvrefPair {
68  VP56mv mv[2];
69  int8_t ref[2];
70 };
71 
72 typedef struct VP9Frame {
73  ThreadFrame tf;
74  AVBufferRef *extradata;
75  uint8_t *segmentation_map;
76  struct VP9mvrefPair *mv;
77 } VP9Frame;
78 
79 struct VP9Filter {
80  uint8_t level[8 * 8];
81  uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
82  [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
83 };
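/* Layout note: one VP9Filter covers a 64x64 superblock. level[] holds one
 * loop filter strength per 8x8 block within it, and mask[][][][] carries one
 * bit per column flagging which edges of each transform size (16x16, 8x8,
 * 4x4, inner 4x4) need filtering, as annotated in the declaration above. */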
84 
85 typedef struct VP9Block {
86  uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
87  enum FilterMode filter;
88  VP56mv mv[4 /* b_idx */][2 /* ref */];
89  enum BlockSize bs;
90  enum TxfmMode tx, uvtx;
91  enum BlockLevel bl;
93 } VP9Block;
94 
95 typedef struct VP9Context {
96  VP9DSPContext dsp;
97  VideoDSPContext vdsp;
98  GetBitContext gb;
99  VP56RangeCoder c;
100  VP56RangeCoder *c_b;
101  unsigned c_b_size;
102  VP9Block *b_base, *b;
103  int pass, uses_2pass, last_uses_2pass;
104  int row, row7, col, col7;
106  ptrdiff_t y_stride, uv_stride;
107 
108  // bitstream header
129  ThreadFrame refs[8], next_refs[8];
130 #define CUR_FRAME 0
131 #define LAST_FRAME 1
132  VP9Frame frames[2];
133 
134  struct {
135  uint8_t level;
136  int8_t sharpness;
137  uint8_t lim_lut[64];
138  uint8_t mblim_lut[64];
139  } filter;
140  struct {
141  uint8_t enabled;
142  int8_t mode[2];
143  int8_t ref[4];
144  } lf_delta;
145  uint8_t yac_qi;
146  int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
147  uint8_t lossless;
148  struct {
149  uint8_t enabled;
150  uint8_t temporal;
151  uint8_t absolute_vals;
152  uint8_t update_map;
153  struct {
154  uint8_t q_enabled;
155  uint8_t lf_enabled;
156  uint8_t skip_enabled;
157  uint8_t ref_enabled;
158  uint8_t ref_val;
159  int16_t q_val;
160  int8_t lf_val;
161  int16_t qmul[2][2];
162  uint8_t lflvl[4][2];
163  } feat[8];
164  } segmentation;
165  struct {
166  unsigned log2_tile_cols, log2_tile_rows;
167  unsigned tile_cols, tile_rows;
168  unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
169  } tiling;
170  unsigned sb_cols, sb_rows, rows, cols;
171  struct {
172  prob_context p;
173  uint8_t coef[4][2][2][6][6][3];
174  } prob_ctx[4];
175  struct {
176  prob_context p;
177  uint8_t coef[4][2][2][6][6][11];
178  uint8_t seg[7];
179  uint8_t segpred[3];
180  } prob;
181  struct {
182  unsigned y_mode[4][10];
183  unsigned uv_mode[10][10];
184  unsigned filter[4][3];
185  unsigned mv_mode[7][4];
186  unsigned intra[4][2];
187  unsigned comp[5][2];
188  unsigned single_ref[5][2][2];
189  unsigned comp_ref[5][2];
190  unsigned tx32p[2][4];
191  unsigned tx16p[2][3];
192  unsigned tx8p[2][2];
193  unsigned skip[3][2];
194  unsigned mv_joint[4];
195  struct {
196  unsigned sign[2];
197  unsigned classes[11];
198  unsigned class0[2];
199  unsigned bits[10][2];
200  unsigned class0_fp[2][4];
201  unsigned fp[4];
202  unsigned class0_hp[2];
203  unsigned hp[2];
204  } mv_comp[2];
205  unsigned partition[4][4][4];
206  unsigned coef[4][2][2][6][6][3];
207  unsigned eob[4][2][2][6][6][2];
208  } counts;
209  enum TxfmMode txfmmode;
210  enum CompPredMode comppredmode;
211 
212  // contextual (left/above) cache
213  DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
214  DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
215  DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
216  DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][8];
217  DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
218  DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
219  DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
220  DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
221  DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
222  DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
223  DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
224  DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
225  uint8_t *above_partition_ctx;
226  uint8_t *above_mode_ctx;
227  // FIXME maybe merge some of the below in a flags field?
228  uint8_t *above_y_nnz_ctx;
229  uint8_t *above_uv_nnz_ctx[2];
230  uint8_t *above_skip_ctx; // 1bit
231  uint8_t *above_txfm_ctx; // 2bit
232  uint8_t *above_segpred_ctx; // 1bit
233  uint8_t *above_intra_ctx; // 1bit
234  uint8_t *above_comp_ctx; // 1bit
235  uint8_t *above_ref_ctx; // 2bit
236  uint8_t *above_filter_ctx;
237  VP56mv (*above_mv_ctx)[2];
238 
239  // whole-frame cache
240  uint8_t *intra_pred_data[3];
241  struct VP9Filter *lflvl;
242  DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[71*80];
243 
244  // block reconstruction intermediates
245  int block_alloc_using_2pass;
246  int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
247  uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
248  struct { int x, y; } min_mv, max_mv;
249  DECLARE_ALIGNED(32, uint8_t, tmp_y)[64*64];
250  DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
251 } VP9Context;
252 
253 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
254  {
255  { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
256  { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
257  }, {
258  { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
259  { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
260  }
261 };
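/* bwh_tab[0] gives block width/height in 4x4 units, bwh_tab[1] in 8x8 units;
 * e.g. for BS_16x8, bwh_tab[0][BS_16x8] = { 4, 2 } and
 * bwh_tab[1][BS_16x8] = { 2, 1 } (sub-8x8 sizes round up to one 8x8). */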
262 
263 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
264 {
265  VP9Context *s = ctx->priv_data;
266  int ret, sz;
267 
268  if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
269  return ret;
270  sz = 64 * s->sb_cols * s->sb_rows;
271  if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
272  ff_thread_release_buffer(ctx, &f->tf);
273  return AVERROR(ENOMEM);
274  }
275 
276  f->segmentation_map = f->extradata->data;
277  f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
278 
279  // retain segmentation map if it doesn't update
280  if (s->segmentation.enabled && !s->segmentation.update_map &&
281  !s->intraonly && !s->keyframe) {
282  memcpy(f->segmentation_map, s->frames[LAST_FRAME].segmentation_map, sz);
283  }
284 
285  return 0;
286 }
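/* The extradata buffer allocated above is carved in two: the first sz bytes
 * (one per 8x8 block, 64 per 64x64 superblock) are the segmentation map, and
 * the following sz VP9mvrefPair entries store each 8x8 block's MVs and
 * reference indices, used as temporal MV predictors for the next frame. */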
287 
288 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
289 {
290  ff_thread_release_buffer(ctx, &f->tf);
291  av_buffer_unref(&f->extradata);
292 }
293 
294 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
295 {
296  int res;
297 
298  if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
299  return res;
300  } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
301  vp9_unref_frame(ctx, dst);
302  return AVERROR(ENOMEM);
303  }
304 
305  dst->segmentation_map = src->segmentation_map;
306  dst->mv = src->mv;
307 
308  return 0;
309 }
310 
311 static int update_size(AVCodecContext *ctx, int w, int h)
312 {
313  VP9Context *s = ctx->priv_data;
314  uint8_t *p;
315 
316  av_assert0(w > 0 && h > 0);
317 
318  if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height)
319  return 0;
320 
321  ctx->width = w;
322  ctx->height = h;
323  s->sb_cols = (w + 63) >> 6;
324  s->sb_rows = (h + 63) >> 6;
325  s->cols = (w + 7) >> 3;
326  s->rows = (h + 7) >> 3;
327 
328 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
329  av_freep(&s->intra_pred_data[0]);
330  p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
331  if (!p)
332  return AVERROR(ENOMEM);
333  assign(s->intra_pred_data[0], uint8_t *, 64);
334  assign(s->intra_pred_data[1], uint8_t *, 32);
335  assign(s->intra_pred_data[2], uint8_t *, 32);
336  assign(s->above_y_nnz_ctx, uint8_t *, 16);
337  assign(s->above_mode_ctx, uint8_t *, 16);
338  assign(s->above_mv_ctx, VP56mv(*)[2], 16);
339  assign(s->above_partition_ctx, uint8_t *, 8);
340  assign(s->above_skip_ctx, uint8_t *, 8);
341  assign(s->above_txfm_ctx, uint8_t *, 8);
342  assign(s->above_uv_nnz_ctx[0], uint8_t *, 8);
343  assign(s->above_uv_nnz_ctx[1], uint8_t *, 8);
344  assign(s->above_segpred_ctx, uint8_t *, 8);
345  assign(s->above_intra_ctx, uint8_t *, 8);
346  assign(s->above_comp_ctx, uint8_t *, 8);
347  assign(s->above_ref_ctx, uint8_t *, 8);
348  assign(s->above_filter_ctx, uint8_t *, 8);
349  assign(s->lflvl, struct VP9Filter *, 1);
350 #undef assign
351 
352  // these will be re-allocated a little later
353  av_freep(&s->b_base);
354  av_freep(&s->block_base);
355 
356  return 0;
357 }
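/* Sketch of the assign() pattern above (illustrative, not part of the file):
 * one allocation is carved into several per-column arrays, so freeing the
 * first pointer releases all of them:
 *
 *     uint8_t *p = av_malloc(sb_cols * (size_a + size_b));
 *     s->ctx_a = p; p += sb_cols * size_a;
 *     s->ctx_b = p; p += sb_cols * size_b;
 *
 * which is why only s->intra_pred_data[0] is av_freep()'d on reallocation. */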
358 
359 static int update_block_buffers(AVCodecContext *ctx)
360 {
361  VP9Context *s = ctx->priv_data;
362 
363  if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->uses_2pass)
364  return 0;
365 
366  av_free(s->b_base);
367  av_free(s->block_base);
368  if (s->uses_2pass) {
369  int sbs = s->sb_cols * s->sb_rows;
370 
371  s->b_base = av_malloc(sizeof(VP9Block) * s->cols * s->rows);
372  s->block_base = av_mallocz((64 * 64 + 128) * sbs * 3);
373  if (!s->b_base || !s->block_base)
374  return AVERROR(ENOMEM);
375  s->uvblock_base[0] = s->block_base + sbs * 64 * 64;
376  s->uvblock_base[1] = s->uvblock_base[0] + sbs * 32 * 32;
377  s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * 32 * 32);
378  s->uveob_base[0] = s->eob_base + 256 * sbs;
379  s->uveob_base[1] = s->uveob_base[0] + 64 * sbs;
380  } else {
381  s->b_base = av_malloc(sizeof(VP9Block));
382  s->block_base = av_mallocz((64 * 64 + 128) * 3);
383  if (!s->b_base || !s->block_base)
384  return AVERROR(ENOMEM);
385  s->uvblock_base[0] = s->block_base + 64 * 64;
386  s->uvblock_base[1] = s->uvblock_base[0] + 32 * 32;
387  s->eob_base = (uint8_t *) (s->uvblock_base[1] + 32 * 32);
388  s->uveob_base[0] = s->eob_base + 256;
389  s->uveob_base[1] = s->uveob_base[0] + 64;
390  }
391  s->block_alloc_using_2pass = s->uses_2pass;
392 
393  return 0;
394 }
395 
396 // for some reason the sign bit is at the end, not the start, of a bit sequence
397 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
398 {
399  int v = get_bits(gb, n);
400  return get_bits1(gb) ? -v : v;
401 }
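/* Example: get_sbits_inv(gb, 4) on the bits 0101 1 first reads the magnitude
 * 5, then the trailing sign bit (1 = negative), yielding -5. */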
402 
403 static av_always_inline int inv_recenter_nonneg(int v, int m)
404 {
405  return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
406 }
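/* Example: for m = 10, v = 0, 1, 2, 3, 4, ... maps to 10, 9, 11, 8, 12, ...,
 * alternating around m; any v > 2*m maps to itself, since the range below m
 * is already exhausted. */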
407 
408 // differential forward probability updates
409 static int update_prob(VP56RangeCoder *c, int p)
410 {
411  static const int inv_map_table[254] = {
412  7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
413  189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
414  10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
415  25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
416  40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
417  55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
418  70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
419  86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
420  101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
421  116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
422  131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
423  146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
424  161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
425  177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
426  192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
427  207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
428  222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
429  237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
430  252, 253,
431  };
432  int d;
433 
434  /* This code does a differential probability update. For a current
435  * probability A in the range [1, 255], the difference to any new
436  * probability lies in [1-A, 255-A]; part of that (absolute) range
437  * exists in both the positive and the negative half, whereas the
438  * rest exists in only one half. The shared part is coded
439  * differentially, i.e. times two with the lowest bit giving the
440  * sign, and the one-sided part is coded on top of that. The
441  * resulting absolute difference again lies in [0, 254], and a bigger
442  * value in this range means we are further away from the original
443  * value A, so it can be coded as a VLC, since higher values are
444  * increasingly unlikely. The first 20 values in inv_map_table[]
445  * allow 'cheap, rough' updates vs. the 'fine, exact' updates further
446  * down the range, which adds one extra dimension to this
447  * differential update model. */
448 
449  if (!vp8_rac_get(c)) {
450  d = vp8_rac_get_uint(c, 4) + 0;
451  } else if (!vp8_rac_get(c)) {
452  d = vp8_rac_get_uint(c, 4) + 16;
453  } else if (!vp8_rac_get(c)) {
454  d = vp8_rac_get_uint(c, 5) + 32;
455  } else {
456  d = vp8_rac_get_uint(c, 7);
457  if (d >= 65)
458  d = (d << 1) - 65 + vp8_rac_get(c);
459  d += 64;
460  }
461 
462  return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
463  255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
464 }
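/* Example: the VLC above yields a code index d in [0,15], [16,31], [32,63]
 * or 64 and up, with larger (less likely) indices costing more bits;
 * inv_map_table[d] maps the index to an absolute delta (the first 20 entries
 * are the coarse steps 7, 20, ..., 254), and inv_recenter_nonneg() folds
 * that unsigned delta back around the old probability p. */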
465 
466 static int decode_frame_header(AVCodecContext *ctx,
467  const uint8_t *data, int size, int *ref)
468 {
469  VP9Context *s = ctx->priv_data;
470  int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
471  int last_invisible;
472  const uint8_t *data2;
473 
474  /* general header */
475  if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
476  av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
477  return res;
478  }
479  if (get_bits(&s->gb, 2) != 0x2) { // frame marker
480  av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
481  return AVERROR_INVALIDDATA;
482  }
483  s->profile = get_bits1(&s->gb);
484  if (get_bits1(&s->gb)) { // reserved bit
485  av_log(ctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
486  return AVERROR_INVALIDDATA;
487  }
488  if (get_bits1(&s->gb)) {
489  *ref = get_bits(&s->gb, 3);
490  return 0;
491  }
492  s->last_uses_2pass = s->uses_2pass;
493  s->last_keyframe = s->keyframe;
494  s->keyframe = !get_bits1(&s->gb);
495  last_invisible = s->invisible;
496  s->invisible = !get_bits1(&s->gb);
497  s->errorres = get_bits1(&s->gb);
498  s->use_last_frame_mvs = !s->errorres && !last_invisible;
499  if (s->keyframe) {
500  if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
501  av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
502  return AVERROR_INVALIDDATA;
503  }
504  s->colorspace = get_bits(&s->gb, 3);
505  if (s->colorspace == 7) { // RGB = profile 1
506  av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
507  return AVERROR_INVALIDDATA;
508  }
509  s->fullrange = get_bits1(&s->gb);
510  // for profile 1, the subsampling bits would follow here
511  s->refreshrefmask = 0xff;
512  w = get_bits(&s->gb, 16) + 1;
513  h = get_bits(&s->gb, 16) + 1;
514  if (get_bits1(&s->gb)) // display size
515  skip_bits(&s->gb, 32);
516  } else {
517  s->intraonly = s->invisible ? get_bits1(&s->gb) : 0;
518  s->resetctx = s->errorres ? 0 : get_bits(&s->gb, 2);
519  if (s->intraonly) {
520  if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
521  av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
522  return AVERROR_INVALIDDATA;
523  }
524  s->refreshrefmask = get_bits(&s->gb, 8);
525  w = get_bits(&s->gb, 16) + 1;
526  h = get_bits(&s->gb, 16) + 1;
527  if (get_bits1(&s->gb)) // display size
528  skip_bits(&s->gb, 32);
529  } else {
530  s->refreshrefmask = get_bits(&s->gb, 8);
531  s->refidx[0] = get_bits(&s->gb, 3);
532  s->signbias[0] = get_bits1(&s->gb);
533  s->refidx[1] = get_bits(&s->gb, 3);
534  s->signbias[1] = get_bits1(&s->gb);
535  s->refidx[2] = get_bits(&s->gb, 3);
536  s->signbias[2] = get_bits1(&s->gb);
537  if (!s->refs[s->refidx[0]].f->data[0] ||
538  !s->refs[s->refidx[1]].f->data[0] ||
539  !s->refs[s->refidx[2]].f->data[0]) {
540  av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
541  return AVERROR_INVALIDDATA;
542  }
543  if (get_bits1(&s->gb)) {
544  w = s->refs[s->refidx[0]].f->width;
545  h = s->refs[s->refidx[0]].f->height;
546  } else if (get_bits1(&s->gb)) {
547  w = s->refs[s->refidx[1]].f->width;
548  h = s->refs[s->refidx[1]].f->height;
549  } else if (get_bits1(&s->gb)) {
550  w = s->refs[s->refidx[2]].f->width;
551  h = s->refs[s->refidx[2]].f->height;
552  } else {
553  w = get_bits(&s->gb, 16) + 1;
554  h = get_bits(&s->gb, 16) + 1;
555  }
556  // Note that "CUR_FRAME" here refers to the frame slot that has not
557  // yet been reassigned for this decode call, so at this point it
558  // still holds the previously decoded (i.e. the _last_) frame.
559  s->use_last_frame_mvs &= s->frames[CUR_FRAME].tf.f->width == w &&
560  s->frames[CUR_FRAME].tf.f->height == h;
561  if (get_bits1(&s->gb)) // display size
562  skip_bits(&s->gb, 32);
563  s->highprecisionmvs = get_bits1(&s->gb);
564  s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
565  get_bits(&s->gb, 2);
566  s->allowcompinter = s->signbias[0] != s->signbias[1] ||
567  s->signbias[0] != s->signbias[2];
568  if (s->allowcompinter) {
569  if (s->signbias[0] == s->signbias[1]) {
570  s->fixcompref = 2;
571  s->varcompref[0] = 0;
572  s->varcompref[1] = 1;
573  } else if (s->signbias[0] == s->signbias[2]) {
574  s->fixcompref = 1;
575  s->varcompref[0] = 0;
576  s->varcompref[1] = 2;
577  } else {
578  s->fixcompref = 0;
579  s->varcompref[0] = 1;
580  s->varcompref[1] = 2;
581  }
582  }
583  }
584  }
585  s->refreshctx = s->errorres ? 0 : get_bits1(&s->gb);
586  s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
587  s->framectxid = c = get_bits(&s->gb, 2);
588 
589  /* loopfilter header data */
590  s->filter.level = get_bits(&s->gb, 6);
591  sharp = get_bits(&s->gb, 3);
592  // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
593  // the old cache values since they are still valid
594  if (s->filter.sharpness != sharp)
595  memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
596  s->filter.sharpness = sharp;
597  if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
598  if (get_bits1(&s->gb)) {
599  for (i = 0; i < 4; i++)
600  if (get_bits1(&s->gb))
601  s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
602  for (i = 0; i < 2; i++)
603  if (get_bits1(&s->gb))
604  s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
605  }
606  } else {
607  memset(&s->lf_delta, 0, sizeof(s->lf_delta));
608  }
609 
610  /* quantization header data */
611  s->yac_qi = get_bits(&s->gb, 8);
612  s->ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
613  s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
614  s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
615  s->lossless = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
616  s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
617 
618  /* segmentation header info */
619  if ((s->segmentation.enabled = get_bits1(&s->gb))) {
620  if ((s->segmentation.update_map = get_bits1(&s->gb))) {
621  for (i = 0; i < 7; i++)
622  s->prob.seg[i] = get_bits1(&s->gb) ?
623  get_bits(&s->gb, 8) : 255;
624  if ((s->segmentation.temporal = get_bits1(&s->gb))) {
625  for (i = 0; i < 3; i++)
626  s->prob.segpred[i] = get_bits1(&s->gb) ?
627  get_bits(&s->gb, 8) : 255;
628  }
629  }
630  if ((!s->segmentation.update_map || s->segmentation.temporal) &&
631  (w != s->frames[CUR_FRAME].tf.f->width ||
632  h != s->frames[CUR_FRAME].tf.f->height)) {
633  av_log(ctx, AV_LOG_ERROR,
634  "Reference segmap (temp=%d,update=%d) enabled on size-change!\n",
635  s->segmentation.temporal, s->segmentation.update_map);
636  return AVERROR_INVALIDDATA;
637  }
638 
639  if (get_bits1(&s->gb)) {
640  s->segmentation.absolute_vals = get_bits1(&s->gb);
641  for (i = 0; i < 8; i++) {
642  if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
643  s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
644  if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
645  s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
646  if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
647  s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
648  s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
649  }
650  }
651  } else {
652  s->segmentation.feat[0].q_enabled = 0;
653  s->segmentation.feat[0].lf_enabled = 0;
654  s->segmentation.feat[0].skip_enabled = 0;
655  s->segmentation.feat[0].ref_enabled = 0;
656  }
657 
658  // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
659  for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
660  int qyac, qydc, quvac, quvdc, lflvl, sh;
661 
662  if (s->segmentation.feat[i].q_enabled) {
663  if (s->segmentation.absolute_vals)
664  qyac = s->segmentation.feat[i].q_val;
665  else
666  qyac = s->yac_qi + s->segmentation.feat[i].q_val;
667  } else {
668  qyac = s->yac_qi;
669  }
670  qydc = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
671  quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
672  quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
673  qyac = av_clip_uintp2(qyac, 8);
674 
675  s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[qydc];
676  s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[qyac];
677  s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[quvdc];
678  s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[quvac];
679 
680  sh = s->filter.level >= 32;
681  if (s->segmentation.feat[i].lf_enabled) {
682  if (s->segmentation.absolute_vals)
683  lflvl = s->segmentation.feat[i].lf_val;
684  else
685  lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
686  } else {
687  lflvl = s->filter.level;
688  }
689  s->segmentation.feat[i].lflvl[0][0] =
690  s->segmentation.feat[i].lflvl[0][1] =
691  av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
692  for (j = 1; j < 4; j++) {
693  s->segmentation.feat[i].lflvl[j][0] =
694  av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
695  s->lf_delta.mode[0]) << sh), 6);
696  s->segmentation.feat[i].lflvl[j][1] =
697  av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
698  s->lf_delta.mode[1]) << sh), 6);
699  }
700  }
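/* Worked example: yac_qi = 100 and ydc_qdelta = -2 give qydc = 98, so
 * qmul[0][0] = vp9_dc_qlookup[98] and qmul[0][1] = vp9_ac_qlookup[100];
 * with filter.level = 36 (so sh = 1) and lf_delta.ref[0] = 1, the ref-0
 * loop filter level becomes av_clip_uintp2(36 + (1 << 1), 6) = 38. */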
701 
702  /* tiling info */
703  if ((res = update_size(ctx, w, h)) < 0) {
704  av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d\n", w, h);
705  return res;
706  }
707  for (s->tiling.log2_tile_cols = 0;
708  (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
709  s->tiling.log2_tile_cols++) ;
710  for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
711  max = FFMAX(0, max - 1);
712  while (max > s->tiling.log2_tile_cols) {
713  if (get_bits1(&s->gb))
714  s->tiling.log2_tile_cols++;
715  else
716  break;
717  }
718  s->tiling.log2_tile_rows = decode012(&s->gb);
719  s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
720  if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
721  s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
722  s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
723  sizeof(VP56RangeCoder) * s->tiling.tile_cols);
724  if (!s->c_b) {
725  av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
726  return AVERROR(ENOMEM);
727  }
728  }
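/* Example: a 1920-pixel-wide frame has sb_cols = 30, so log2_tile_cols
 * starts at 0 (30 >> 0 <= 64) and max = 2 (30 >> 3 < 4); at most two more
 * bits are then read, allowing 1, 2 or 4 tile columns. */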
729 
730  if (s->keyframe || s->errorres || s->intraonly) {
731  s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
732  s->prob_ctx[3].p = vp9_default_probs;
733  memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
734  sizeof(vp9_default_coef_probs));
735  memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
736  sizeof(vp9_default_coef_probs));
737  memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
738  sizeof(vp9_default_coef_probs));
739  memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
740  sizeof(vp9_default_coef_probs));
741  }
742 
743  // the next 16 bits are the size of the rest of the header (arith-coded)
744  size2 = get_bits(&s->gb, 16);
745  data2 = align_get_bits(&s->gb);
746  if (size2 > size - (data2 - data)) {
747  av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
748  return AVERROR_INVALIDDATA;
749  }
750  ff_vp56_init_range_decoder(&s->c, data2, size2);
751  if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
752  av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
753  return AVERROR_INVALIDDATA;
754  }
755 
756  if (s->keyframe || s->intraonly) {
757  memset(s->counts.coef, 0, sizeof(s->counts.coef) + sizeof(s->counts.eob));
758  } else {
759  memset(&s->counts, 0, sizeof(s->counts));
760  }
761  // FIXME is it faster to not copy here, but do it down in the fw updates
762  // as explicit copies if the fw update is missing (and skip the copy upon
763  // fw update)?
764  s->prob.p = s->prob_ctx[c].p;
765 
766  // txfm updates
767  if (s->lossless) {
768  s->txfmmode = TX_4X4;
769  } else {
770  s->txfmmode = vp8_rac_get_uint(&s->c, 2);
771  if (s->txfmmode == 3)
772  s->txfmmode += vp8_rac_get(&s->c);
773 
774  if (s->txfmmode == TX_SWITCHABLE) {
775  for (i = 0; i < 2; i++)
776  if (vp56_rac_get_prob_branchy(&s->c, 252))
777  s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
778  for (i = 0; i < 2; i++)
779  for (j = 0; j < 2; j++)
780  if (vp56_rac_get_prob_branchy(&s->c, 252))
781  s->prob.p.tx16p[i][j] =
782  update_prob(&s->c, s->prob.p.tx16p[i][j]);
783  for (i = 0; i < 2; i++)
784  for (j = 0; j < 3; j++)
785  if (vp56_rac_get_prob_branchy(&s->c, 252))
786  s->prob.p.tx32p[i][j] =
787  update_prob(&s->c, s->prob.p.tx32p[i][j]);
788  }
789  }
790 
791  // coef updates
792  for (i = 0; i < 4; i++) {
793  uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
794  if (vp8_rac_get(&s->c)) {
795  for (j = 0; j < 2; j++)
796  for (k = 0; k < 2; k++)
797  for (l = 0; l < 6; l++)
798  for (m = 0; m < 6; m++) {
799  uint8_t *p = s->prob.coef[i][j][k][l][m];
800  uint8_t *r = ref[j][k][l][m];
801  if (m >= 3 && l == 0) // dc only has 3 pt
802  break;
803  for (n = 0; n < 3; n++) {
804  if (vp56_rac_get_prob_branchy(&s->c, 252)) {
805  p[n] = update_prob(&s->c, r[n]);
806  } else {
807  p[n] = r[n];
808  }
809  }
810  p[3] = 0;
811  }
812  } else {
813  for (j = 0; j < 2; j++)
814  for (k = 0; k < 2; k++)
815  for (l = 0; l < 6; l++)
816  for (m = 0; m < 6; m++) {
817  uint8_t *p = s->prob.coef[i][j][k][l][m];
818  uint8_t *r = ref[j][k][l][m];
819  if (m > 3 && l == 0) // dc only has 3 pt
820  break;
821  memcpy(p, r, 3);
822  p[3] = 0;
823  }
824  }
825  if (s->txfmmode == i)
826  break;
827  }
828 
829  // mode updates
830  for (i = 0; i < 3; i++)
831  if (vp56_rac_get_prob_branchy(&s->c, 252))
832  s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
833  if (!s->keyframe && !s->intraonly) {
834  for (i = 0; i < 7; i++)
835  for (j = 0; j < 3; j++)
836  if (vp56_rac_get_prob_branchy(&s->c, 252))
837  s->prob.p.mv_mode[i][j] =
838  update_prob(&s->c, s->prob.p.mv_mode[i][j]);
839 
840  if (s->filtermode == FILTER_SWITCHABLE)
841  for (i = 0; i < 4; i++)
842  for (j = 0; j < 2; j++)
843  if (vp56_rac_get_prob_branchy(&s->c, 252))
844  s->prob.p.filter[i][j] =
845  update_prob(&s->c, s->prob.p.filter[i][j]);
846 
847  for (i = 0; i < 4; i++)
848  if (vp56_rac_get_prob_branchy(&s->c, 252))
849  s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
850 
851  if (s->allowcompinter) {
852  s->comppredmode = vp8_rac_get(&s->c);
853  if (s->comppredmode)
854  s->comppredmode += vp8_rac_get(&s->c);
855  if (s->comppredmode == PRED_SWITCHABLE)
856  for (i = 0; i < 5; i++)
857  if (vp56_rac_get_prob_branchy(&s->c, 252))
858  s->prob.p.comp[i] =
859  update_prob(&s->c, s->prob.p.comp[i]);
860  } else {
861  s->comppredmode = PRED_SINGLEREF;
862  }
863 
864  if (s->comppredmode != PRED_COMPREF) {
865  for (i = 0; i < 5; i++) {
866  if (vp56_rac_get_prob_branchy(&s->c, 252))
867  s->prob.p.single_ref[i][0] =
868  update_prob(&s->c, s->prob.p.single_ref[i][0]);
869  if (vp56_rac_get_prob_branchy(&s->c, 252))
870  s->prob.p.single_ref[i][1] =
871  update_prob(&s->c, s->prob.p.single_ref[i][1]);
872  }
873  }
874 
875  if (s->comppredmode != PRED_SINGLEREF) {
876  for (i = 0; i < 5; i++)
877  if (vp56_rac_get_prob_branchy(&s->c, 252))
878  s->prob.p.comp_ref[i] =
879  update_prob(&s->c, s->prob.p.comp_ref[i]);
880  }
881 
882  for (i = 0; i < 4; i++)
883  for (j = 0; j < 9; j++)
884  if (vp56_rac_get_prob_branchy(&s->c, 252))
885  s->prob.p.y_mode[i][j] =
886  update_prob(&s->c, s->prob.p.y_mode[i][j]);
887 
888  for (i = 0; i < 4; i++)
889  for (j = 0; j < 4; j++)
890  for (k = 0; k < 3; k++)
891  if (vp56_rac_get_prob_branchy(&s->c, 252))
892  s->prob.p.partition[3 - i][j][k] =
893  update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
894 
895  // mv fields don't use the update_prob subexp model for some reason
896  for (i = 0; i < 3; i++)
897  if (vp56_rac_get_prob_branchy(&s->c, 252))
898  s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
899 
900  for (i = 0; i < 2; i++) {
901  if (vp56_rac_get_prob_branchy(&s->c, 252))
902  s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
903 
904  for (j = 0; j < 10; j++)
905  if (vp56_rac_get_prob_branchy(&s->c, 252))
906  s->prob.p.mv_comp[i].classes[j] =
907  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
908 
909  if (vp56_rac_get_prob_branchy(&s->c, 252))
910  s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
911 
912  for (j = 0; j < 10; j++)
913  if (vp56_rac_get_prob_branchy(&s->c, 252))
914  s->prob.p.mv_comp[i].bits[j] =
915  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
916  }
917 
918  for (i = 0; i < 2; i++) {
919  for (j = 0; j < 2; j++)
920  for (k = 0; k < 3; k++)
921  if (vp56_rac_get_prob_branchy(&s->c, 252))
922  s->prob.p.mv_comp[i].class0_fp[j][k] =
923  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
924 
925  for (j = 0; j < 3; j++)
926  if (vp56_rac_get_prob_branchy(&s->c, 252))
927  s->prob.p.mv_comp[i].fp[j] =
928  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
929  }
930 
931  if (s->highprecisionmvs) {
932  for (i = 0; i < 2; i++) {
933  if (vp56_rac_get_prob_branchy(&s->c, 252))
934  s->prob.p.mv_comp[i].class0_hp =
935  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
936 
937  if (vp56_rac_get_prob_branchy(&s->c, 252))
938  s->prob.p.mv_comp[i].hp =
939  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
940  }
941  }
942  }
943 
944  return (data2 - data) + size2;
945 }
946 
947 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
948  VP9Context *s)
949 {
950  dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
951  dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
952 }
953 
954 static void find_ref_mvs(VP9Context *s,
955  VP56mv *pmv, int ref, int z, int idx, int sb)
956 {
957  static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
958  [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
959  { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
960  [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
961  { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
962  [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
963  { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
964  [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
965  { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
966  [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
967  { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
968  [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
969  { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
970  [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
971  { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
972  [BS_16x8] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
973  { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
974  [BS_8x16] = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
975  { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
976  [BS_8x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
977  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
978  [BS_8x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
979  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
980  [BS_4x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
981  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
982  [BS_4x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
983  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
984  };
985  VP9Block *b = s->b;
986  int row = s->row, col = s->col, row7 = s->row7;
987  const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
988 #define INVALID_MV 0x80008000U
989  uint32_t mem = INVALID_MV;
990  int i;
991 
992 #define RETURN_DIRECT_MV(mv) \
993  do { \
994  uint32_t m = AV_RN32A(&mv); \
995  if (!idx) { \
996  AV_WN32A(pmv, m); \
997  return; \
998  } else if (mem == INVALID_MV) { \
999  mem = m; \
1000  } else if (m != mem) { \
1001  AV_WN32A(pmv, m); \
1002  return; \
1003  } \
1004  } while (0)
1005 
1006  if (sb >= 0) {
1007  if (sb == 2 || sb == 1) {
1008  RETURN_DIRECT_MV(b->mv[0][z]);
1009  } else if (sb == 3) {
1010  RETURN_DIRECT_MV(b->mv[2][z]);
1011  RETURN_DIRECT_MV(b->mv[1][z]);
1012  RETURN_DIRECT_MV(b->mv[0][z]);
1013  }
1014 
1015 #define RETURN_MV(mv) \
1016  do { \
1017  if (sb > 0) { \
1018  VP56mv tmp; \
1019  uint32_t m; \
1020  clamp_mv(&tmp, &mv, s); \
1021  m = AV_RN32A(&tmp); \
1022  if (!idx) { \
1023  AV_WN32A(pmv, m); \
1024  return; \
1025  } else if (mem == INVALID_MV) { \
1026  mem = m; \
1027  } else if (m != mem) { \
1028  AV_WN32A(pmv, m); \
1029  return; \
1030  } \
1031  } else { \
1032  uint32_t m = AV_RN32A(&mv); \
1033  if (!idx) { \
1034  clamp_mv(pmv, &mv, s); \
1035  return; \
1036  } else if (mem == INVALID_MV) { \
1037  mem = m; \
1038  } else if (m != mem) { \
1039  clamp_mv(pmv, &mv, s); \
1040  return; \
1041  } \
1042  } \
1043  } while (0)
1044 
1045  if (row > 0) {
1046  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1047  if (mv->ref[0] == ref) {
1048  RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1049  } else if (mv->ref[1] == ref) {
1050  RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
1051  }
1052  }
1053  if (col > s->tiling.tile_col_start) {
1054  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1055  if (mv->ref[0] == ref) {
1056  RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1057  } else if (mv->ref[1] == ref) {
1058  RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1059  }
1060  }
1061  i = 2;
1062  } else {
1063  i = 0;
1064  }
1065 
1066  // previously coded MVs in this neighbourhood, using same reference frame
1067  for (; i < 8; i++) {
1068  int c = p[i][0] + col, r = p[i][1] + row;
1069 
1070  if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1071  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1072 
1073  if (mv->ref[0] == ref) {
1074  RETURN_MV(mv->mv[0]);
1075  } else if (mv->ref[1] == ref) {
1076  RETURN_MV(mv->mv[1]);
1077  }
1078  }
1079  }
1080 
1081  // MV at this position in previous frame, using same reference frame
1082  if (s->use_last_frame_mvs) {
1083  struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1084 
1085  if (!s->last_uses_2pass)
1086  ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1087  if (mv->ref[0] == ref) {
1088  RETURN_MV(mv->mv[0]);
1089  } else if (mv->ref[1] == ref) {
1090  RETURN_MV(mv->mv[1]);
1091  }
1092  }
1093 
1094 #define RETURN_SCALE_MV(mv, scale) \
1095  do { \
1096  if (scale) { \
1097  VP56mv mv_temp = { -mv.x, -mv.y }; \
1098  RETURN_MV(mv_temp); \
1099  } else { \
1100  RETURN_MV(mv); \
1101  } \
1102  } while (0)
1103 
1104  // previously coded MVs in this neighbourhood, using different reference frame
1105  for (i = 0; i < 8; i++) {
1106  int c = p[i][0] + col, r = p[i][1] + row;
1107 
1108  if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1109  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1110 
1111  if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1112  RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1113  }
1114  if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1115  // BUG - libvpx has this condition regardless of whether
1116  // we used the first ref MV and pre-scaling
1117  AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1118  RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1119  }
1120  }
1121  }
1122 
1123  // MV at this position in previous frame, using different reference frame
1124  if (s->use_last_frame_mvs) {
1125  struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1126 
1127  // no need to await_progress, because we already did that above
1128  if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1129  RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1130  }
1131  if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1132  // BUG - libvpx has this condition regardless of whether
1133  // we used the first ref MV and pre-scaling
1134  AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1135  RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1136  }
1137  }
1138 
1139  AV_ZERO32(pmv);
1140 #undef INVALID_MV
1141 #undef RETURN_MV
1142 #undef RETURN_SCALE_MV
1143 }
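/* Candidate order tried above: already-decoded sub-block MVs of this block
 * (sb >= 1), the adjacent left/above 8x8 blocks, up to 8 spatial neighbours
 * from mv_ref_blk_off[], the co-located MV in the previous frame, then the
 * same spatial/temporal candidates again accepting other reference frames
 * (sign-inverted when the two references' sign biases differ), and finally
 * (0,0); idx selects whether the first or the second distinct candidate is
 * returned. */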
1144 
1145 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1146 {
1147  int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1148  int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1149  s->prob.p.mv_comp[idx].classes);
1150 
1151  s->counts.mv_comp[idx].sign[sign]++;
1152  s->counts.mv_comp[idx].classes[c]++;
1153  if (c) {
1154  int m;
1155 
1156  for (n = 0, m = 0; m < c; m++) {
1157  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1158  n |= bit << m;
1159  s->counts.mv_comp[idx].bits[m][bit]++;
1160  }
1161  n <<= 3;
1162  bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1163  n |= bit << 1;
1164  s->counts.mv_comp[idx].fp[bit]++;
1165  if (hp) {
1166  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1167  s->counts.mv_comp[idx].hp[bit]++;
1168  n |= bit;
1169  } else {
1170  n |= 1;
1171  // bug in libvpx - we count for bw entropy purposes even if the
1172  // bit wasn't coded
1173  s->counts.mv_comp[idx].hp[1]++;
1174  }
1175  n += 8 << c;
1176  } else {
1177  n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1178  s->counts.mv_comp[idx].class0[n]++;
1179  bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1180  s->prob.p.mv_comp[idx].class0_fp[n]);
1181  s->counts.mv_comp[idx].class0_fp[n][bit]++;
1182  n = (n << 3) | (bit << 1);
1183  if (hp) {
1184  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1185  s->counts.mv_comp[idx].class0_hp[bit]++;
1186  n |= bit;
1187  } else {
1188  n |= 1;
1189  // bug in libvpx - we count for bw entropy purposes even if the
1190  // bit wasn't coded
1191  s->counts.mv_comp[idx].class0_hp[1]++;
1192  }
1193  }
1194 
1195  return sign ? -(n + 1) : (n + 1);
1196 }
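/* Example: class c = 2 contributes a base of 8 << 2 = 32; two integer bits,
 * a 2-bit fractional ('fp') part and the high-precision bit are packed as
 * n = (bits << 3) | (fp << 1) | hp, then n += 8 << c, and the returned value
 * +/-(n + 1) is in 1/8-pel units. */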
1197 
1198 static void fill_mv(VP9Context *s,
1199  VP56mv *mv, int mode, int sb)
1200 {
1201  VP9Block *b = s->b;
1202 
1203  if (mode == ZEROMV) {
1204  AV_ZERO64(mv);
1205  } else {
1206  int hp;
1207 
1208  // FIXME cache this value and reuse for other subblocks
1209  find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1210  mode == NEWMV ? -1 : sb);
1211  // FIXME maybe move this code into find_ref_mvs()
1212  if ((mode == NEWMV || sb == -1) &&
1213  !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1214  if (mv[0].y & 1) {
1215  if (mv[0].y < 0)
1216  mv[0].y++;
1217  else
1218  mv[0].y--;
1219  }
1220  if (mv[0].x & 1) {
1221  if (mv[0].x < 0)
1222  mv[0].x++;
1223  else
1224  mv[0].x--;
1225  }
1226  }
1227  if (mode == NEWMV) {
1228  enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1229  s->prob.p.mv_joint);
1230 
1231  s->counts.mv_joint[j]++;
1232  if (j >= MV_JOINT_V)
1233  mv[0].y += read_mv_component(s, 0, hp);
1234  if (j & 1)
1235  mv[0].x += read_mv_component(s, 1, hp);
1236  }
1237 
1238  if (b->comp) {
1239  // FIXME cache this value and reuse for other subblocks
1240  find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1241  mode == NEWMV ? -1 : sb);
1242  if ((mode == NEWMV || sb == -1) &&
1243  !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1244  if (mv[1].y & 1) {
1245  if (mv[1].y < 0)
1246  mv[1].y++;
1247  else
1248  mv[1].y--;
1249  }
1250  if (mv[1].x & 1) {
1251  if (mv[1].x < 0)
1252  mv[1].x++;
1253  else
1254  mv[1].x--;
1255  }
1256  }
1257  if (mode == NEWMV) {
1258  enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1259  s->prob.p.mv_joint);
1260 
1261  s->counts.mv_joint[j]++;
1262  if (j >= MV_JOINT_V)
1263  mv[1].y += read_mv_component(s, 0, hp);
1264  if (j & 1)
1265  mv[1].x += read_mv_component(s, 1, hp);
1266  }
1267  }
1268  }
1269 }
1270 
1271 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1272  ptrdiff_t stride, int v)
1273 {
1274  switch (w) {
1275  case 1:
1276  do {
1277  *ptr = v;
1278  ptr += stride;
1279  } while (--h);
1280  break;
1281  case 2: {
1282  int v16 = v * 0x0101;
1283  do {
1284  AV_WN16A(ptr, v16);
1285  ptr += stride;
1286  } while (--h);
1287  break;
1288  }
1289  case 4: {
1290  uint32_t v32 = v * 0x01010101;
1291  do {
1292  AV_WN32A(ptr, v32);
1293  ptr += stride;
1294  } while (--h);
1295  break;
1296  }
1297  case 8: {
1298 #if HAVE_FAST_64BIT
1299  uint64_t v64 = v * 0x0101010101010101ULL;
1300  do {
1301  AV_WN64A(ptr, v64);
1302  ptr += stride;
1303  } while (--h);
1304 #else
1305  uint32_t v32 = v * 0x01010101;
1306  do {
1307  AV_WN32A(ptr, v32);
1308  AV_WN32A(ptr + 4, v32);
1309  ptr += stride;
1310  } while (--h);
1311 #endif
1312  break;
1313  }
1314  }
1315 }
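/* setctx_2d() splats one value over a w x h region of a per-8x8-block map
 * with the given stride, e.g. stamping a block's seg_id into the
 * 8 * sb_cols wide segmentation map in decode_mode() below. */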
1316 
1317 static void decode_mode(AVCodecContext *ctx)
1318 {
1319  static const uint8_t left_ctx[N_BS_SIZES] = {
1320  0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1321  };
1322  static const uint8_t above_ctx[N_BS_SIZES] = {
1323  0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
1324  };
1325  static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1326  TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1327  TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1328  };
1329  VP9Context *s = ctx->priv_data;
1330  VP9Block *b = s->b;
1331  int row = s->row, col = s->col, row7 = s->row7;
1332  enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
1333  int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
1334  int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]), y;
1335  int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
1336  int vref, filter_id;
1337 
1338  if (!s->segmentation.enabled) {
1339  b->seg_id = 0;
1340  } else if (s->keyframe || s->intraonly) {
1341  b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg);
1342  } else if (!s->segmentation.update_map ||
1343  (s->segmentation.temporal &&
1344  vp56_rac_get_prob_branchy(&s->c,
1345  s->prob.segpred[s->above_segpred_ctx[col] +
1346  s->left_segpred_ctx[row7]]))) {
1347  int pred = 8, x;
1348  uint8_t *refsegmap = s->frames[LAST_FRAME].segmentation_map;
1349 
1350  if (!s->last_uses_2pass)
1351  ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1352  for (y = 0; y < h4; y++)
1353  for (x = 0; x < w4; x++)
1354  pred = FFMIN(pred, refsegmap[(y + row) * 8 * s->sb_cols + x + col]);
1355  av_assert1(pred < 8);
1356  b->seg_id = pred;
1357 
1358  memset(&s->above_segpred_ctx[col], 1, w4);
1359  memset(&s->left_segpred_ctx[row7], 1, h4);
1360  } else {
1361  b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1362  s->prob.seg);
1363 
1364  memset(&s->above_segpred_ctx[col], 0, w4);
1365  memset(&s->left_segpred_ctx[row7], 0, h4);
1366  }
1367  if (s->segmentation.enabled &&
1368  (s->segmentation.update_map || s->keyframe || s->intraonly)) {
1369  setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1370  w4, h4, 8 * s->sb_cols, b->seg_id);
1371  }
1372 
1373  b->skip = s->segmentation.enabled &&
1374  s->segmentation.feat[b->seg_id].skip_enabled;
1375  if (!b->skip) {
1376  int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1377  b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1378  s->counts.skip[c][b->skip]++;
1379  }
1380 
1381  if (s->keyframe || s->intraonly) {
1382  b->intra = 1;
1383  } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
1384  b->intra = !s->segmentation.feat[b->seg_id].ref_val;
1385  } else {
1386  int c, bit;
1387 
1388  if (have_a && have_l) {
1389  c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1390  c += (c == 2);
1391  } else {
1392  c = have_a ? 2 * s->above_intra_ctx[col] :
1393  have_l ? 2 * s->left_intra_ctx[row7] : 0;
1394  }
1395  bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1396  s->counts.intra[c][bit]++;
1397  b->intra = !bit;
1398  }
1399 
1400  if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
1401  int c;
1402  if (have_a) {
1403  if (have_l) {
1404  c = (s->above_skip_ctx[col] ? max_tx :
1405  s->above_txfm_ctx[col]) +
1406  (s->left_skip_ctx[row7] ? max_tx :
1407  s->left_txfm_ctx[row7]) > max_tx;
1408  } else {
1409  c = s->above_skip_ctx[col] ? 1 :
1410  (s->above_txfm_ctx[col] * 2 > max_tx);
1411  }
1412  } else if (have_l) {
1413  c = s->left_skip_ctx[row7] ? 1 :
1414  (s->left_txfm_ctx[row7] * 2 > max_tx);
1415  } else {
1416  c = 1;
1417  }
1418  switch (max_tx) {
1419  case TX_32X32:
1420  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1421  if (b->tx) {
1422  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1423  if (b->tx == 2)
1424  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1425  }
1426  s->counts.tx32p[c][b->tx]++;
1427  break;
1428  case TX_16X16:
1429  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1430  if (b->tx)
1431  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1432  s->counts.tx16p[c][b->tx]++;
1433  break;
1434  case TX_8X8:
1435  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1436  s->counts.tx8p[c][b->tx]++;
1437  break;
1438  case TX_4X4:
1439  b->tx = TX_4X4;
1440  break;
1441  }
1442  } else {
1443  b->tx = FFMIN(max_tx, s->txfmmode);
1444  }
1445 
1446  if (s->keyframe || s->intraonly) {
1447  uint8_t *a = &s->above_mode_ctx[col * 2];
1448  uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1449 
1450  b->comp = 0;
1451  if (b->bs > BS_8x8) {
1452  // FIXME the memory storage intermediates here aren't really
1453  // necessary, they're just there to make the code slightly
1454  // simpler for now
1455  b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1456  vp9_default_kf_ymode_probs[a[0]][l[0]]);
1457  if (b->bs != BS_8x4) {
1458  b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1459  vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1460  l[0] = a[1] = b->mode[1];
1461  } else {
1462  l[0] = a[1] = b->mode[1] = b->mode[0];
1463  }
1464  if (b->bs != BS_4x8) {
1465  b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1466  vp9_default_kf_ymode_probs[a[0]][l[1]]);
1467  if (b->bs != BS_8x4) {
1468  b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1469  vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1470  l[1] = a[1] = b->mode[3];
1471  } else {
1472  l[1] = a[1] = b->mode[3] = b->mode[2];
1473  }
1474  } else {
1475  b->mode[2] = b->mode[0];
1476  l[1] = a[1] = b->mode[3] = b->mode[1];
1477  }
1478  } else {
1479  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1480  vp9_default_kf_ymode_probs[*a][*l]);
1481  b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1482  // FIXME this can probably be optimized
1483  memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1484  memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1485  }
1486  b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1487  vp9_default_kf_uvmode_probs[b->mode[3]]);
1488  } else if (b->intra) {
1489  b->comp = 0;
1490  if (b->bs > BS_8x8) {
1491  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1492  s->prob.p.y_mode[0]);
1493  s->counts.y_mode[0][b->mode[0]]++;
1494  if (b->bs != BS_8x4) {
1495  b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1496  s->prob.p.y_mode[0]);
1497  s->counts.y_mode[0][b->mode[1]]++;
1498  } else {
1499  b->mode[1] = b->mode[0];
1500  }
1501  if (b->bs != BS_4x8) {
1502  b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1503  s->prob.p.y_mode[0]);
1504  s->counts.y_mode[0][b->mode[2]]++;
1505  if (b->bs != BS_8x4) {
1506  b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1507  s->prob.p.y_mode[0]);
1508  s->counts.y_mode[0][b->mode[3]]++;
1509  } else {
1510  b->mode[3] = b->mode[2];
1511  }
1512  } else {
1513  b->mode[2] = b->mode[0];
1514  b->mode[3] = b->mode[1];
1515  }
1516  } else {
1517  static const uint8_t size_group[10] = {
1518  3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1519  };
1520  int sz = size_group[b->bs];
1521 
1522  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1523  s->prob.p.y_mode[sz]);
1524  b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1525  s->counts.y_mode[sz][b->mode[3]]++;
1526  }
1527  b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1528  s->prob.p.uv_mode[b->mode[3]]);
1529  s->counts.uv_mode[b->mode[3]][b->uvmode]++;
1530  } else {
1531  static const uint8_t inter_mode_ctx_lut[14][14] = {
1532  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1533  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1534  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1535  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1536  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1537  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1538  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1539  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1540  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1541  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1542  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1543  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1544  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1545  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
1546  };
1547 
1548  if (s->segmentation.feat[b->seg_id].ref_enabled) {
1549  av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
1550  b->comp = 0;
1551  b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
1552  } else {
1553  // read comp_pred flag
1554  if (s->comppredmode != PRED_SWITCHABLE) {
1555  b->comp = s->comppredmode == PRED_COMPREF;
1556  } else {
1557  int c;
1558 
1559  // FIXME add intra as ref=0xff (or -1) to make these easier?
1560  if (have_a) {
1561  if (have_l) {
1562  if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1563  c = 4;
1564  } else if (s->above_comp_ctx[col]) {
1565  c = 2 + (s->left_intra_ctx[row7] ||
1566  s->left_ref_ctx[row7] == s->fixcompref);
1567  } else if (s->left_comp_ctx[row7]) {
1568  c = 2 + (s->above_intra_ctx[col] ||
1569  s->above_ref_ctx[col] == s->fixcompref);
1570  } else {
1571  c = (!s->above_intra_ctx[col] &&
1572  s->above_ref_ctx[col] == s->fixcompref) ^
1573  (!s->left_intra_ctx[row7] &&
1574  s->left_ref_ctx[row & 7] == s->fixcompref);
1575  }
1576  } else {
1577  c = s->above_comp_ctx[col] ? 3 :
1578  (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
1579  }
1580  } else if (have_l) {
1581  c = s->left_comp_ctx[row7] ? 3 :
1582  (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
1583  } else {
1584  c = 1;
1585  }
1586  b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1587  s->counts.comp[c][b->comp]++;
1588  }
1589 
1590  // read actual references
1591  // FIXME probably cache a few variables here to prevent repetitive
1592  // memory accesses below
1593  if (b->comp) /* two references */ {
1594  int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
1595 
1596  b->ref[fix_idx] = s->fixcompref;
1597  // FIXME can this codeblob be replaced by some sort of LUT?
1598  if (have_a) {
1599  if (have_l) {
1600  if (s->above_intra_ctx[col]) {
1601  if (s->left_intra_ctx[row7]) {
1602  c = 2;
1603  } else {
1604  c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1605  }
1606  } else if (s->left_intra_ctx[row7]) {
1607  c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
1608  } else {
1609  int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1610 
1611  if (refl == refa && refa == s->varcompref[1]) {
1612  c = 0;
1613  } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1614  if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
1615  (refl == s->fixcompref && refa == s->varcompref[0])) {
1616  c = 4;
1617  } else {
1618  c = (refa == refl) ? 3 : 1;
1619  }
1620  } else if (!s->left_comp_ctx[row7]) {
1621  if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
1622  c = 1;
1623  } else {
1624  c = (refl == s->varcompref[1] &&
1625  refa != s->varcompref[1]) ? 2 : 4;
1626  }
1627  } else if (!s->above_comp_ctx[col]) {
1628  if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
1629  c = 1;
1630  } else {
1631  c = (refa == s->varcompref[1] &&
1632  refl != s->varcompref[1]) ? 2 : 4;
1633  }
1634  } else {
1635  c = (refl == refa) ? 4 : 2;
1636  }
1637  }
1638  } else {
1639  if (s->above_intra_ctx[col]) {
1640  c = 2;
1641  } else if (s->above_comp_ctx[col]) {
1642  c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
1643  } else {
1644  c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
1645  }
1646  }
1647  } else if (have_l) {
1648  if (s->left_intra_ctx[row7]) {
1649  c = 2;
1650  } else if (s->left_comp_ctx[row7]) {
1651  c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1652  } else {
1653  c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1654  }
1655  } else {
1656  c = 2;
1657  }
1658  bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1659  b->ref[var_idx] = s->varcompref[bit];
1660  s->counts.comp_ref[c][bit]++;
1661  } else /* single reference */ {
1662  int bit, c;
1663 
1664  if (have_a && !s->above_intra_ctx[col]) {
1665  if (have_l && !s->left_intra_ctx[row7]) {
1666  if (s->left_comp_ctx[row7]) {
1667  if (s->above_comp_ctx[col]) {
1668  c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
1669  !s->above_ref_ctx[col]);
1670  } else {
1671  c = (3 * !s->above_ref_ctx[col]) +
1672  (!s->fixcompref || !s->left_ref_ctx[row7]);
1673  }
1674  } else if (s->above_comp_ctx[col]) {
1675  c = (3 * !s->left_ref_ctx[row7]) +
1676  (!s->fixcompref || !s->above_ref_ctx[col]);
1677  } else {
1678  c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1679  }
1680  } else if (s->above_intra_ctx[col]) {
1681  c = 2;
1682  } else if (s->above_comp_ctx[col]) {
1683  c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
1684  } else {
1685  c = 4 * (!s->above_ref_ctx[col]);
1686  }
1687  } else if (have_l && !s->left_intra_ctx[row7]) {
1688  if (s->left_intra_ctx[row7]) {
1689  c = 2;
1690  } else if (s->left_comp_ctx[row7]) {
1691  c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
1692  } else {
1693  c = 4 * (!s->left_ref_ctx[row7]);
1694  }
1695  } else {
1696  c = 2;
1697  }
1698  bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1699  s->counts.single_ref[c][0][bit]++;
1700  if (!bit) {
1701  b->ref[0] = 0;
1702  } else {
1703  // FIXME can this codeblob be replaced by some sort of LUT?
1704  if (have_a) {
1705  if (have_l) {
1706  if (s->left_intra_ctx[row7]) {
1707  if (s->above_intra_ctx[col]) {
1708  c = 2;
1709  } else if (s->above_comp_ctx[col]) {
1710  c = 1 + 2 * (s->fixcompref == 1 ||
1711  s->above_ref_ctx[col] == 1);
1712  } else if (!s->above_ref_ctx[col]) {
1713  c = 3;
1714  } else {
1715  c = 4 * (s->above_ref_ctx[col] == 1);
1716  }
1717  } else if (s->above_intra_ctx[col]) {
1718  if (s->left_intra_ctx[row7]) {
1719  c = 2;
1720  } else if (s->left_comp_ctx[row7]) {
1721  c = 1 + 2 * (s->fixcompref == 1 ||
1722  s->left_ref_ctx[row7] == 1);
1723  } else if (!s->left_ref_ctx[row7]) {
1724  c = 3;
1725  } else {
1726  c = 4 * (s->left_ref_ctx[row7] == 1);
1727  }
1728  } else if (s->above_comp_ctx[col]) {
1729  if (s->left_comp_ctx[row7]) {
1730  if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1731  c = 3 * (s->fixcompref == 1 ||
1732  s->left_ref_ctx[row7] == 1);
1733  } else {
1734  c = 2;
1735  }
1736  } else if (!s->left_ref_ctx[row7]) {
1737  c = 1 + 2 * (s->fixcompref == 1 ||
1738  s->above_ref_ctx[col] == 1);
1739  } else {
1740  c = 3 * (s->left_ref_ctx[row7] == 1) +
1741  (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1742  }
1743  } else if (s->left_comp_ctx[row7]) {
1744  if (!s->above_ref_ctx[col]) {
1745  c = 1 + 2 * (s->fixcompref == 1 ||
1746  s->left_ref_ctx[row7] == 1);
1747  } else {
1748  c = 3 * (s->above_ref_ctx[col] == 1) +
1749  (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1750  }
1751  } else if (!s->above_ref_ctx[col]) {
1752  if (!s->left_ref_ctx[row7]) {
1753  c = 3;
1754  } else {
1755  c = 4 * (s->left_ref_ctx[row7] == 1);
1756  }
1757  } else if (!s->left_ref_ctx[row7]) {
1758  c = 4 * (s->above_ref_ctx[col] == 1);
1759  } else {
1760  c = 2 * (s->left_ref_ctx[row7] == 1) +
1761  2 * (s->above_ref_ctx[col] == 1);
1762  }
1763  } else {
1764  if (s->above_intra_ctx[col] ||
1765  (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1766  c = 2;
1767  } else if (s->above_comp_ctx[col]) {
1768  c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1769  } else {
1770  c = 4 * (s->above_ref_ctx[col] == 1);
1771  }
1772  }
1773  } else if (have_l) {
1774  if (s->left_intra_ctx[row7] ||
1775  (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1776  c = 2;
1777  } else if (s->left_comp_ctx[row7]) {
1778  c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1779  } else {
1780  c = 4 * (s->left_ref_ctx[row7] == 1);
1781  }
1782  } else {
1783  c = 2;
1784  }
1785  bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1786  s->counts.single_ref[c][1][bit]++;
1787  b->ref[0] = 1 + bit;
1788  }
1789  }
1790  }
1791 
1792  if (b->bs <= BS_8x8) {
1793  if (s->segmentation.feat[b->seg_id].skip_enabled) {
1794  b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1795  } else {
1796  static const uint8_t off[10] = {
1797  3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1798  };
1799 
1800  // FIXME this needs to use the LUT tables from find_ref_mvs
1801  // because not all are -1,0/0,-1
1802  int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1803  [s->left_mode_ctx[row7 + off[b->bs]]];
1804 
1805  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1806  s->prob.p.mv_mode[c]);
1807  b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1808  s->counts.mv_mode[c][b->mode[0] - 10]++;
1809  }
1810  }
1811 
1812  if (s->filtermode == FILTER_SWITCHABLE) {
1813  int c;
1814 
1815  if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1816  if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1817  c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1818  s->left_filter_ctx[row7] : 3;
1819  } else {
1820  c = s->above_filter_ctx[col];
1821  }
1822  } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1823  c = s->left_filter_ctx[row7];
1824  } else {
1825  c = 3;
1826  }
1827 
1828  filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1829  s->prob.p.filter[c]);
1830  s->counts.filter[c][filter_id]++;
1831  b->filter = vp9_filter_lut[filter_id];
1832  } else {
1833  b->filter = s->filtermode;
1834  }
1835 
1836  if (b->bs > BS_8x8) {
1837  int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1838 
1839  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1840  s->prob.p.mv_mode[c]);
1841  s->counts.mv_mode[c][b->mode[0] - 10]++;
1842  fill_mv(s, b->mv[0], b->mode[0], 0);
1843 
1844  if (b->bs != BS_8x4) {
1845  b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1846  s->prob.p.mv_mode[c]);
1847  s->counts.mv_mode[c][b->mode[1] - 10]++;
1848  fill_mv(s, b->mv[1], b->mode[1], 1);
1849  } else {
1850  b->mode[1] = b->mode[0];
1851  AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1852  AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1853  }
1854 
1855  if (b->bs != BS_4x8) {
1856  b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1857  s->prob.p.mv_mode[c]);
1858  s->counts.mv_mode[c][b->mode[2] - 10]++;
1859  fill_mv(s, b->mv[2], b->mode[2], 2);
1860 
1861  if (b->bs != BS_8x4) {
1862  b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1863  s->prob.p.mv_mode[c]);
1864  s->counts.mv_mode[c][b->mode[3] - 10]++;
1865  fill_mv(s, b->mv[3], b->mode[3], 3);
1866  } else {
1867  b->mode[3] = b->mode[2];
1868  AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1869  AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1870  }
1871  } else {
1872  b->mode[2] = b->mode[0];
1873  AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1874  AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1875  b->mode[3] = b->mode[1];
1876  AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1877  AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
1878  }
1879  } else {
1880  fill_mv(s, b->mv[0], b->mode[0], -1);
1881  AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1882  AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1883  AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
1884  AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1885  AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1886  AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
1887  }
1888 
1889  vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
1890  }
1891 
1892 #if HAVE_FAST_64BIT
1893 #define SPLAT_CTX(var, val, n) \
1894  switch (n) { \
1895  case 1: var = val; break; \
1896  case 2: AV_WN16A(&var, val * 0x0101); break; \
1897  case 4: AV_WN32A(&var, val * 0x01010101); break; \
1898  case 8: AV_WN64A(&var, val * 0x0101010101010101ULL); break; \
1899  case 16: { \
1900  uint64_t v64 = val * 0x0101010101010101ULL; \
1901  AV_WN64A( &var, v64); \
1902  AV_WN64A(&((uint8_t *) &var)[8], v64); \
1903  break; \
1904  } \
1905  }
1906 #else
1907 #define SPLAT_CTX(var, val, n) \
1908  switch (n) { \
1909  case 1: var = val; break; \
1910  case 2: AV_WN16A(&var, val * 0x0101); break; \
1911  case 4: AV_WN32A(&var, val * 0x01010101); break; \
1912  case 8: { \
1913  uint32_t v32 = val * 0x01010101; \
1914  AV_WN32A( &var, v32); \
1915  AV_WN32A(&((uint8_t *) &var)[4], v32); \
1916  break; \
1917  } \
1918  case 16: { \
1919  uint32_t v32 = val * 0x01010101; \
1920  AV_WN32A( &var, v32); \
1921  AV_WN32A(&((uint8_t *) &var)[4], v32); \
1922  AV_WN32A(&((uint8_t *) &var)[8], v32); \
1923  AV_WN32A(&((uint8_t *) &var)[12], v32); \
1924  break; \
1925  } \
1926  }
1927 #endif
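 /* Illustrative note (added annotation, not part of the upstream source):
  * SPLAT_CTX broadcasts one byte over n context entries with as few aligned
  * stores as possible. Multiplying by 0x0101... replicates the byte into
  * every lane, e.g. for val = 3 and n = 4:
  *
  *     3 * 0x01010101 = 0x03030303   // written with a single AV_WN32A()
  *
  * Without fast 64-bit stores, the 8-byte case falls back to two 32-bit
  * stores and the 16-byte case to four, all of the same replicated pattern. */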
1928 
1929  switch (bwh_tab[1][b->bs][0]) {
1930 #define SET_CTXS(dir, off, n) \
1931  do { \
1932  SPLAT_CTX(s->dir##_skip_ctx[off], b->skip, n); \
1933  SPLAT_CTX(s->dir##_txfm_ctx[off], b->tx, n); \
1934  SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
1935  if (!s->keyframe && !s->intraonly) { \
1936  SPLAT_CTX(s->dir##_intra_ctx[off], b->intra, n); \
1937  SPLAT_CTX(s->dir##_comp_ctx[off], b->comp, n); \
1938  SPLAT_CTX(s->dir##_mode_ctx[off], b->mode[3], n); \
1939  if (!b->intra) { \
1940  SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
1941  if (s->filtermode == FILTER_SWITCHABLE) { \
1942  SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
1943  } \
1944  } \
1945  } \
1946  } while (0)
1947  case 1: SET_CTXS(above, col, 1); break;
1948  case 2: SET_CTXS(above, col, 2); break;
1949  case 4: SET_CTXS(above, col, 4); break;
1950  case 8: SET_CTXS(above, col, 8); break;
1951  }
1952  switch (bwh_tab[1][b->bs][1]) {
1953  case 1: SET_CTXS(left, row7, 1); break;
1954  case 2: SET_CTXS(left, row7, 2); break;
1955  case 4: SET_CTXS(left, row7, 4); break;
1956  case 8: SET_CTXS(left, row7, 8); break;
1957  }
1958 #undef SPLAT_CTX
1959 #undef SET_CTXS
1960 
1961  if (!s->keyframe && !s->intraonly) {
1962  if (b->bs > BS_8x8) {
1963  int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1964 
1965  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
1966  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
1967  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
1968  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
1969  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
1970  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
1971  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
1972  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
1973  } else {
1974  int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1975 
1976  for (n = 0; n < w4 * 2; n++) {
1977  AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
1978  AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
1979  }
1980  for (n = 0; n < h4 * 2; n++) {
1981  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
1982  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
1983  }
1984  }
1985  }
1986 
1987  // FIXME kinda ugly
1988  for (y = 0; y < h4; y++) {
1989  int x, o = (row + y) * s->sb_cols * 8 + col;
1990  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];
1991 
1992  if (b->intra) {
1993  for (x = 0; x < w4; x++) {
1994  mv[x].ref[0] =
1995  mv[x].ref[1] = -1;
1996  }
1997  } else if (b->comp) {
1998  for (x = 0; x < w4; x++) {
1999  mv[x].ref[0] = b->ref[0];
2000  mv[x].ref[1] = b->ref[1];
2001  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2002  AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2003  }
2004  } else {
2005  for (x = 0; x < w4; x++) {
2006  mv[x].ref[0] = b->ref[0];
2007  mv[x].ref[1] = -1;
2008  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2009  }
2010  }
2011  }
2012 }
2013 
2014 // FIXME merge cnt/eob arguments?
2015 static av_always_inline int
2016 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2017  int is_tx32x32, unsigned (*cnt)[6][3],
2018  unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2019  int nnz, const int16_t *scan, const int16_t (*nb)[2],
2020  const int16_t *band_counts, const int16_t *qmul)
2021 {
2022  int i = 0, band = 0, band_left = band_counts[band];
2023  uint8_t *tp = p[0][nnz];
2024  uint8_t cache[1024];
2025 
2026  do {
2027  int val, rc;
2028 
2029  val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2030  eob[band][nnz][val]++;
2031  if (!val)
2032  break;
2033 
2034  skip_eob:
2035  if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2036  cnt[band][nnz][0]++;
2037  if (!--band_left)
2038  band_left = band_counts[++band];
2039  cache[scan[i]] = 0;
2040  nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2041  tp = p[band][nnz];
2042  if (++i == n_coeffs)
2043  break; // invalid input; blocks should end with EOB
2044  goto skip_eob;
2045  }
2046 
2047  rc = scan[i];
2048  if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2049  cnt[band][nnz][1]++;
2050  val = 1;
2051  cache[rc] = 1;
2052  } else {
2053  // fill in p[3-10] (model fill) - only once per frame for each pos
2054  if (!tp[3])
2055  memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2056 
2057  cnt[band][nnz][2]++;
2058  if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2059  if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2060  cache[rc] = val = 2;
2061  } else {
2062  val = 3 + vp56_rac_get_prob(c, tp[5]);
2063  cache[rc] = 3;
2064  }
2065  } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2066  cache[rc] = 4;
2067  if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2068  val = 5 + vp56_rac_get_prob(c, 159);
2069  } else {
2070  val = 7 + (vp56_rac_get_prob(c, 165) << 1);
2071  val += vp56_rac_get_prob(c, 145);
2072  }
2073  } else { // cat 3-6
2074  cache[rc] = 5;
2075  if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2076  if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2077  val = 11 + (vp56_rac_get_prob(c, 173) << 2);
2078  val += (vp56_rac_get_prob(c, 148) << 1);
2079  val += vp56_rac_get_prob(c, 140);
2080  } else {
2081  val = 19 + (vp56_rac_get_prob(c, 176) << 3);
2082  val += (vp56_rac_get_prob(c, 155) << 2);
2083  val += (vp56_rac_get_prob(c, 140) << 1);
2084  val += vp56_rac_get_prob(c, 135);
2085  }
2086  } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2087  val = 35 + (vp56_rac_get_prob(c, 180) << 4);
2088  val += (vp56_rac_get_prob(c, 157) << 3);
2089  val += (vp56_rac_get_prob(c, 141) << 2);
2090  val += (vp56_rac_get_prob(c, 134) << 1);
2091  val += vp56_rac_get_prob(c, 130);
2092  } else {
2093  val = 67 + (vp56_rac_get_prob(c, 254) << 13);
2094  val += (vp56_rac_get_prob(c, 254) << 12);
2095  val += (vp56_rac_get_prob(c, 254) << 11);
2096  val += (vp56_rac_get_prob(c, 252) << 10);
2097  val += (vp56_rac_get_prob(c, 249) << 9);
2098  val += (vp56_rac_get_prob(c, 243) << 8);
2099  val += (vp56_rac_get_prob(c, 230) << 7);
2100  val += (vp56_rac_get_prob(c, 196) << 6);
2101  val += (vp56_rac_get_prob(c, 177) << 5);
2102  val += (vp56_rac_get_prob(c, 153) << 4);
2103  val += (vp56_rac_get_prob(c, 140) << 3);
2104  val += (vp56_rac_get_prob(c, 133) << 2);
2105  val += (vp56_rac_get_prob(c, 130) << 1);
2106  val += vp56_rac_get_prob(c, 129);
2107  }
2108  }
2109  }
2110  if (!--band_left)
2111  band_left = band_counts[++band];
2112  if (is_tx32x32)
2113  coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
2114  else
2115  coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
2116  nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2117  tp = p[band][nnz];
2118  } while (++i < n_coeffs);
2119 
2120  return i;
2121 }
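 /* Illustrative note (added annotation, not part of the upstream source):
  * the branches above decode the standard VP9 coefficient token categories;
  * each adds a fixed base to extra bits coded with static probabilities:
  *
  *     two/three/four :  2..4
  *     cat1 (1 bit)   :  5..6
  *     cat2 (2 bits)  :  7..10
  *     cat3 (3 bits)  : 11..18
  *     cat4 (4 bits)  : 19..34
  *     cat5 (5 bits)  : 35..66
  *     cat6 (14 bits) : 67..
  *
  * The magnitude is then signed with one raw bit (vp8_rac_get) and scaled
  * by qmul[], halved for 32x32 transforms to compensate their scaling. */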
2122 
2123 static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2124  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2125  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2126  const int16_t (*nb)[2], const int16_t *band_counts,
2127  const int16_t *qmul)
2128 {
2129  return decode_coeffs_b_generic(c, coef, n_coeffs, 0, cnt, eob, p,
2130  nnz, scan, nb, band_counts, qmul);
2131 }
2132 
2133 static int decode_coeffs_b32(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2134  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2135  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2136  const int16_t (*nb)[2], const int16_t *band_counts,
2137  const int16_t *qmul)
2138 {
2139  return decode_coeffs_b_generic(c, coef, n_coeffs, 1, cnt, eob, p,
2140  nnz, scan, nb, band_counts, qmul);
2141 }
2142 
2143 static void decode_coeffs(AVCodecContext *ctx)
2144 {
2145  VP9Context *s = ctx->priv_data;
2146  VP9Block *b = s->b;
2147  int row = s->row, col = s->col;
2148  uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2149  unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2150  unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2151  int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
2152  int end_x = FFMIN(2 * (s->cols - col), w4);
2153  int end_y = FFMIN(2 * (s->rows - row), h4);
2154  int n, pl, x, y, res;
2155  int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
2156  int tx = 4 * s->lossless + b->tx;
2157  const int16_t * const *yscans = vp9_scans[tx];
2158  const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2159  const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2160  const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2161  uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2162  uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
2163  static const int16_t band_counts[4][8] = {
2164  { 1, 2, 3, 4, 3, 16 - 13 },
2165  { 1, 2, 3, 4, 11, 64 - 21 },
2166  { 1, 2, 3, 4, 11, 256 - 21 },
2167  { 1, 2, 3, 4, 11, 1024 - 21 },
2168  };
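 /* Illustrative note (added annotation, not part of the upstream source):
  * each band_counts[] row partitions one transform size's scan order into
  * the six coefficient-probability bands; the entries sum to the number of
  * coefficients, e.g. 1 + 2 + 3 + 4 + 3 + (16 - 13) = 16 for TX_4X4, and
  * the last TX_32X32 band spans the remaining 1024 - 21 = 1003 positions.
  * band_left in decode_coeffs_b_generic() counts down through these
  * entries to pick the band for every scan position. */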
2169  const int16_t *y_band_counts = band_counts[b->tx];
2170  const int16_t *uv_band_counts = band_counts[b->uvtx];
2171 
2172 #define MERGE(la, end, step, rd) \
2173  for (n = 0; n < end; n += step) \
2174  la[n] = !!rd(&la[n])
2175 #define MERGE_CTX(step, rd) \
2176  do { \
2177  MERGE(l, end_y, step, rd); \
2178  MERGE(a, end_x, step, rd); \
2179  } while (0)
2180 
2181 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2182  for (n = 0, y = 0; y < end_y; y += step) { \
2183  for (x = 0; x < end_x; x += step, n += step * step) { \
2184  enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2185  res = decode_coeffs_b##v(&s->c, s->block + 16 * n, 16 * step * step, \
2186  c, e, p, a[x] + l[y], yscans[txtp], \
2187  ynbs[txtp], y_band_counts, qmul[0]); \
2188  a[x] = l[y] = !!res; \
2189  if (step >= 4) { \
2190  AV_WN16A(&s->eob[n], res); \
2191  } else { \
2192  s->eob[n] = res; \
2193  } \
2194  } \
2195  }
2196 
2197 #define SPLAT(la, end, step, cond) \
2198  if (step == 2) { \
2199  for (n = 1; n < end; n += step) \
2200  la[n] = la[n - 1]; \
2201  } else if (step == 4) { \
2202  if (cond) { \
2203  for (n = 0; n < end; n += step) \
2204  AV_WN32A(&la[n], la[n] * 0x01010101); \
2205  } else { \
2206  for (n = 0; n < end; n += step) \
2207  memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2208  } \
2209  } else /* step == 8 */ { \
2210  if (cond) { \
2211  if (HAVE_FAST_64BIT) { \
2212  for (n = 0; n < end; n += step) \
2213  AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2214  } else { \
2215  for (n = 0; n < end; n += step) { \
2216  uint32_t v32 = la[n] * 0x01010101; \
2217  AV_WN32A(&la[n], v32); \
2218  AV_WN32A(&la[n + 4], v32); \
2219  } \
2220  } \
2221  } else { \
2222  for (n = 0; n < end; n += step) \
2223  memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2224  } \
2225  }
2226 #define SPLAT_CTX(step) \
2227  do { \
2228  SPLAT(a, end_x, step, end_x == w4); \
2229  SPLAT(l, end_y, step, end_y == h4); \
2230  } while (0)
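 /* Illustrative note (added annotation, not part of the upstream source):
  * the nnz context arrays store one entry per 4x4 unit, while transforms
  * larger than 4x4 need a single context per transform block. MERGE_CTX()
  * therefore collapses each group of step entries into one 0/1 flag before
  * decoding (one 2/4/8-byte aligned read via AV_RN16A/32A/64A), and
  * SPLAT_CTX() expands the decoded flag back over all covered 4x4
  * positions, so neighbouring blocks of any size read consistent
  * above/left contexts. */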
2231 
2232  /* y tokens */
2233  switch (b->tx) {
2234  case TX_4X4:
2235  DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2236  break;
2237  case TX_8X8:
2238  MERGE_CTX(2, AV_RN16A);
2239  DECODE_Y_COEF_LOOP(2, 0,);
2240  SPLAT_CTX(2);
2241  break;
2242  case TX_16X16:
2243  MERGE_CTX(4, AV_RN32A);
2244  DECODE_Y_COEF_LOOP(4, 0,);
2245  SPLAT_CTX(4);
2246  break;
2247  case TX_32X32:
2248  MERGE_CTX(8, AV_RN64A);
2249  DECODE_Y_COEF_LOOP(8, 0, 32);
2250  SPLAT_CTX(8);
2251  break;
2252  }
2253 
2254 #define DECODE_UV_COEF_LOOP(step) \
2255  for (n = 0, y = 0; y < end_y; y += step) { \
2256  for (x = 0; x < end_x; x += step, n += step * step) { \
2257  res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n, \
2258  16 * step * step, c, e, p, a[x] + l[y], \
2259  uvscan, uvnb, uv_band_counts, qmul[1]); \
2260  a[x] = l[y] = !!res; \
2261  if (step >= 4) { \
2262  AV_WN16A(&s->uveob[pl][n], res); \
2263  } else { \
2264  s->uveob[pl][n] = res; \
2265  } \
2266  } \
2267  }
2268 
2269  p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2270  c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2271  e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2272  w4 >>= 1;
2273  h4 >>= 1;
2274  end_x >>= 1;
2275  end_y >>= 1;
2276  for (pl = 0; pl < 2; pl++) {
2277  a = &s->above_uv_nnz_ctx[pl][col];
2278  l = &s->left_uv_nnz_ctx[pl][row & 7];
2279  switch (b->uvtx) {
2280  case TX_4X4:
2281  DECODE_UV_COEF_LOOP(1);
2282  break;
2283  case TX_8X8:
2284  MERGE_CTX(2, AV_RN16A);
2285  DECODE_UV_COEF_LOOP(2);
2286  SPLAT_CTX(2);
2287  break;
2288  case TX_16X16:
2289  MERGE_CTX(4, AV_RN32A);
2290  DECODE_UV_COEF_LOOP(4);
2291  SPLAT_CTX(4);
2292  break;
2293  case TX_32X32:
2294  MERGE_CTX(8, AV_RN64A);
2295  // a 64x64 (max) uv block can only ever contain one tx32x32 block,
2296  // so there is no need to loop
2297  res = decode_coeffs_b32(&s->c, s->uvblock[pl],
2298  1024, c, e, p, a[0] + l[0],
2299  uvscan, uvnb, uv_band_counts, qmul[1]);
2300  a[0] = l[0] = !!res;
2301  AV_WN16A(&s->uveob[pl][0], res);
2302  SPLAT_CTX(8);
2303  break;
2304  }
2305  }
2306 }
2307 
2308 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2309  uint8_t *dst_edge, ptrdiff_t stride_edge,
2310  uint8_t *dst_inner, ptrdiff_t stride_inner,
2311  uint8_t *l, int col, int x, int w,
2312  int row, int y, enum TxfmMode tx,
2313  int p)
2314 {
2315  int have_top = row > 0 || y > 0;
2316  int have_left = col > s->tiling.tile_col_start || x > 0;
2317  int have_right = x < w - 1;
2318  static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2319  [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2320  { DC_127_PRED, VERT_PRED } },
2321  [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2322  { HOR_PRED, HOR_PRED } },
2323  [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2324  { LEFT_DC_PRED, DC_PRED } },
2325  [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
2326  { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
2327  [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2328  { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2329  [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
2330  { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
2331  [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
2332  { HOR_DOWN_PRED, HOR_DOWN_PRED } },
2333  [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
2334  { DC_127_PRED, VERT_LEFT_PRED } },
2335  [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2336  { HOR_UP_PRED, HOR_UP_PRED } },
2337  [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2338  { HOR_PRED, TM_VP8_PRED } },
2339  };
2340  static const struct {
2341  uint8_t needs_left:1;
2342  uint8_t needs_top:1;
2343  uint8_t needs_topleft:1;
2344  uint8_t needs_topright:1;
2345  } edges[N_INTRA_PRED_MODES] = {
2346  [VERT_PRED] = { .needs_top = 1 },
2347  [HOR_PRED] = { .needs_left = 1 },
2348  [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2349  [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2350  [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2351  [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2352  [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2353  [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2354  [HOR_UP_PRED] = { .needs_left = 1 },
2355  [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2356  [LEFT_DC_PRED] = { .needs_left = 1 },
2357  [TOP_DC_PRED] = { .needs_top = 1 },
2358  [DC_128_PRED] = { 0 },
2359  [DC_127_PRED] = { 0 },
2360  [DC_129_PRED] = { 0 }
2361  };
2362 
2363  av_assert2(mode >= 0 && mode < 10);
2364  mode = mode_conv[mode][have_left][have_top];
2365  if (edges[mode].needs_top) {
2366  uint8_t *top, *topleft;
2367  int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
2368  int n_px_need_tr = 0;
2369 
2370  if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2371  n_px_need_tr = 4;
2372 
2373  // if top of sb64-row, use s->intra_pred_data[] instead of
2374  // dst[-stride] for intra prediction (it contains pre- instead of
2375  // post-loopfilter data)
2376  if (have_top) {
2377  top = !(row & 7) && !y ?
2378  s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2379  y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2380  if (have_left)
2381  topleft = !(row & 7) && !y ?
2382  s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2383  y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2384  &dst_inner[-stride_inner];
2385  }
2386 
2387  if (have_top &&
2388  (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2389  (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2390  n_px_need + n_px_need_tr <= n_px_have) {
2391  *a = top;
2392  } else {
2393  if (have_top) {
2394  if (n_px_need <= n_px_have) {
2395  memcpy(*a, top, n_px_need);
2396  } else {
2397  memcpy(*a, top, n_px_have);
2398  memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
2399  n_px_need - n_px_have);
2400  }
2401  } else {
2402  memset(*a, 127, n_px_need);
2403  }
2404  if (edges[mode].needs_topleft) {
2405  if (have_left && have_top) {
2406  (*a)[-1] = topleft[-1];
2407  } else {
2408  (*a)[-1] = have_top ? 129 : 127;
2409  }
2410  }
2411  if (tx == TX_4X4 && edges[mode].needs_topright) {
2412  if (have_top && have_right &&
2413  n_px_need + n_px_need_tr <= n_px_have) {
2414  memcpy(&(*a)[4], &top[4], 4);
2415  } else {
2416  memset(&(*a)[4], (*a)[3], 4);
2417  }
2418  }
2419  }
2420  }
2421  if (edges[mode].needs_left) {
2422  if (have_left) {
2423  int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !p) - y) * 4;
2424  uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2425  ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2426 
2427  if (n_px_need <= n_px_have) {
2428  for (i = 0; i < n_px_need; i++)
2429  l[n_px_need - 1 - i] = dst[i * stride - 1];
2430  } else {
2431  for (i = 0; i < n_px_have; i++)
2432  l[n_px_need - 1 - i] = dst[i * stride - 1];
2433  memset(l, l[n_px_need - n_px_have], n_px_need - n_px_have);
2434  }
2435  } else {
2436  memset(l, 129, 4 << tx);
2437  }
2438  }
2439 
2440  return mode;
2441 }
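 /* Illustrative note (added annotation, not part of the upstream source):
  * when a prediction edge is unavailable, the fixed substitute values from
  * the VP9 spec are used: missing top pixels become 127, missing left
  * pixels 129, and a missing top-left corner 129 (or 127 when the top row
  * is missing too). A DC_PRED block at the frame origin, for example, has
  * neither edge, so mode_conv[] remaps it to DC_128_PRED and no
  * neighbouring pixels are read at all. */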
2442 
2443 static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2444 {
2445  VP9Context *s = ctx->priv_data;
2446  VP9Block *b = s->b;
2447  int row = s->row, col = s->col;
2448  int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2449  int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2450  int end_x = FFMIN(2 * (s->cols - col), w4);
2451  int end_y = FFMIN(2 * (s->rows - row), h4);
2452  int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2453  int uvstep1d = 1 << b->uvtx, p;
2454  uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
2455  LOCAL_ALIGNED_32(uint8_t, a_buf, [64]);
2456  LOCAL_ALIGNED_32(uint8_t, l, [32]);
2457 
2458  for (n = 0, y = 0; y < end_y; y += step1d) {
2459  uint8_t *ptr = dst, *ptr_r = dst_r;
2460  for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d,
2461  ptr_r += 4 * step1d, n += step) {
2462  int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2463  y * 2 + x : 0];
2464  uint8_t *a = &a_buf[32];
2465  enum TxfmType txtp = vp9_intra_txfm_type[mode];
2466  int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2467 
2468  mode = check_intra_mode(s, mode, &a, ptr_r,
2469  s->frames[CUR_FRAME].tf.f->linesize[0],
2470  ptr, s->y_stride, l,
2471  col, x, w4, row, y, b->tx, 0);
2472  s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2473  if (eob)
2474  s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2475  s->block + 16 * n, eob);
2476  }
2477  dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
2478  dst += 4 * step1d * s->y_stride;
2479  }
2480 
2481  // U/V
2482  h4 >>= 1;
2483  w4 >>= 1;
2484  end_x >>= 1;
2485  end_y >>= 1;
2486  step = 1 << (b->uvtx * 2);
2487  for (p = 0; p < 2; p++) {
2488  dst = s->dst[1 + p];
2489  dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2490  for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2491  uint8_t *ptr = dst, *ptr_r = dst_r;
2492  for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d,
2493  ptr_r += 4 * uvstep1d, n += step) {
2494  int mode = b->uvmode;
2495  uint8_t *a = &a_buf[16];
2496  int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2497 
2498  mode = check_intra_mode(s, mode, &a, ptr_r,
2499  s->frames[CUR_FRAME].tf.f->linesize[1],
2500  ptr, s->uv_stride, l,
2501  col, x, w4, row, y, b->uvtx, p + 1);
2502  s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2503  if (eob)
2504  s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2505  s->uvblock[p] + 16 * n, eob);
2506  }
2507  dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
2508  dst += 4 * uvstep1d * s->uv_stride;
2509  }
2510  }
2511 }
2512 
2513 static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2514  uint8_t *dst, ptrdiff_t dst_stride,
2515  const uint8_t *ref, ptrdiff_t ref_stride,
2516  ThreadFrame *ref_frame,
2517  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2518  int bw, int bh, int w, int h)
2519 {
2520  int mx = mv->x, my = mv->y, th;
2521 
2522  y += my >> 3;
2523  x += mx >> 3;
2524  ref += y * ref_stride + x;
2525  mx &= 7;
2526  my &= 7;
2527  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2528  // we use +7 because the last 7 pixels of each sbrow can be changed in
2529  // the longest loopfilter of the next sbrow
2530  th = (y + bh + 4 * !!my + 7) >> 6;
2531  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2532  if (x < !!mx * 3 || y < !!my * 3 ||
2533  x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2534  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2535  ref - !!my * 3 * ref_stride - !!mx * 3,
2536  80, ref_stride,
2537  bw + !!mx * 7, bh + !!my * 7,
2538  x - !!mx * 3, y - !!my * 3, w, h);
2539  ref = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2540  ref_stride = 80;
2541  }
2542  mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2543 }
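 /* Illustrative note (added annotation, not part of the upstream source):
  * luma motion vectors are in 1/8-pel units, so mv >> 3 is the integer
  * offset and mv & 7 the subpel phase; e.g. mv->x = 37 moves 4 full pixels
  * with phase 5/8 (the chroma variant below uses 1/16-pel: mv >> 4, mv & 15).
  * th is the last sb64 row of the reference that the 8-tap filter plus the
  * 7-pixel loopfilter overhang can touch, so with frame threading
  * ff_thread_await_progress() only waits until that row has been decoded. */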
2544 
2545 static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2546  uint8_t *dst_u, uint8_t *dst_v,
2547  ptrdiff_t dst_stride,
2548  const uint8_t *ref_u, ptrdiff_t src_stride_u,
2549  const uint8_t *ref_v, ptrdiff_t src_stride_v,
2550  ThreadFrame *ref_frame,
2551  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2552  int bw, int bh, int w, int h)
2553 {
2554  int mx = mv->x, my = mv->y, th;
2555 
2556  y += my >> 4;
2557  x += mx >> 4;
2558  ref_u += y * src_stride_u + x;
2559  ref_v += y * src_stride_v + x;
2560  mx &= 15;
2561  my &= 15;
2562  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2563  // we use +7 because the last 7 pixels of each sbrow can be changed in
2564  // the longest loopfilter of the next sbrow
2565  th = (y + bh + 4 * !!my + 7) >> 5;
2566  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2567  if (x < !!mx * 3 || y < !!my * 3 ||
2568  x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2569  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2570  ref_u - !!my * 3 * src_stride_u - !!mx * 3,
2571  80, src_stride_u,
2572  bw + !!mx * 7, bh + !!my * 7,
2573  x - !!mx * 3, y - !!my * 3, w, h);
2574  ref_u = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2575  mc[!!mx][!!my](dst_u, dst_stride, ref_u, 80, bh, mx, my);
2576 
2577  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2578  ref_v - !!my * 3 * src_stride_v - !!mx * 3,
2579  80, src_stride_v,
2580  bw + !!mx * 7, bh + !!my * 7,
2581  x - !!mx * 3, y - !!my * 3, w, h);
2582  ref_v = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2583  mc[!!mx][!!my](dst_v, dst_stride, ref_v, 80, bh, mx, my);
2584  } else {
2585  mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2586  mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2587  }
2588 }
2589 
2590 static void inter_recon(AVCodecContext *ctx)
2591 {
2592  static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
2593  { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
2594  { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
2595  };
2596  VP9Context *s = ctx->priv_data;
2597  VP9Block *b = s->b;
2598  int row = s->row, col = s->col;
2599  ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]], *tref2;
2600  AVFrame *ref1 = tref1->f, *ref2;
2601  int w1 = ref1->width, h1 = ref1->height, w2, h2;
2602  ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;
2603 
2604  if (b->comp) {
2605  tref2 = &s->refs[s->refidx[b->ref[1]]];
2606  ref2 = tref2->f;
2607  w2 = ref2->width;
2608  h2 = ref2->height;
2609  }
2610 
2611  // y inter pred
2612  if (b->bs > BS_8x8) {
2613  if (b->bs == BS_8x4) {
2614  mc_luma_dir(s, s->dsp.mc[3][b->filter][0], s->dst[0], ls_y,
2615  ref1->data[0], ref1->linesize[0], tref1,
2616  row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1);
2617  mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
2618  s->dst[0] + 4 * ls_y, ls_y,
2619  ref1->data[0], ref1->linesize[0], tref1,
2620  (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1);
2621 
2622  if (b->comp) {
2623  mc_luma_dir(s, s->dsp.mc[3][b->filter][1], s->dst[0], ls_y,
2624  ref2->data[0], ref2->linesize[0], tref2,
2625  row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2);
2626  mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
2627  s->dst[0] + 4 * ls_y, ls_y,
2628  ref2->data[0], ref2->linesize[0], tref2,
2629  (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2);
2630  }
2631  } else if (b->bs == BS_4x8) {
2632  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2633  ref1->data[0], ref1->linesize[0], tref1,
2634  row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1);
2635  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2636  ref1->data[0], ref1->linesize[0], tref1,
2637  row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1);
2638 
2639  if (b->comp) {
2640  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2641  ref2->data[0], ref2->linesize[0], tref2,
2642  row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2);
2643  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2644  ref2->data[0], ref2->linesize[0], tref2,
2645  row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2);
2646  }
2647  } else {
2648  av_assert2(b->bs == BS_4x4);
2649 
2650  // FIXME if two horizontally adjacent blocks have the same MV,
2651  // do a w8 instead of a w4 call
2652  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2653  ref1->data[0], ref1->linesize[0], tref1,
2654  row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1);
2655  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2656  ref1->data[0], ref1->linesize[0], tref1,
2657  row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1);
2658  mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2659  s->dst[0] + 4 * ls_y, ls_y,
2660  ref1->data[0], ref1->linesize[0], tref1,
2661  (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1);
2662  mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2663  s->dst[0] + 4 * ls_y + 4, ls_y,
2664  ref1->data[0], ref1->linesize[0], tref1,
2665  (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1);
2666 
2667  if (b->comp) {
2668  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2669  ref2->data[0], ref2->linesize[0], tref2,
2670  row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2);
2671  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2672  ref2->data[0], ref2->linesize[0], tref2,
2673  row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2);
2674  mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2675  s->dst[0] + 4 * ls_y, ls_y,
2676  ref2->data[0], ref2->linesize[0], tref2,
2677  (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2);
2678  mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2679  s->dst[0] + 4 * ls_y + 4, ls_y,
2680  ref2->data[0], ref2->linesize[0], tref2,
2681  (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2);
2682  }
2683  }
2684  } else {
2685  int bwl = bwlog_tab[0][b->bs];
2686  int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
2687 
2688  mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], s->dst[0], ls_y,
2689  ref1->data[0], ref1->linesize[0], tref1,
2690  row << 3, col << 3, &b->mv[0][0],bw, bh, w1, h1);
2691 
2692  if (b->comp)
2693  mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], s->dst[0], ls_y,
2694  ref2->data[0], ref2->linesize[0], tref2,
2695  row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2);
2696  }
2697 
2698  // uv inter pred
2699  {
2700  int bwl = bwlog_tab[1][b->bs];
2701  int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
2702  VP56mv mvuv;
2703 
2704  w1 = (w1 + 1) >> 1;
2705  h1 = (h1 + 1) >> 1;
2706  if (b->comp) {
2707  w2 = (w2 + 1) >> 1;
2708  h2 = (h2 + 1) >> 1;
2709  }
2710  if (b->bs > BS_8x8) {
2711  mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
2712  mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
2713  } else {
2714  mvuv = b->mv[0][0];
2715  }
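 /* Illustrative note (added annotation, not part of the upstream source):
  * chroma is subsampled 2x2, so a sub-8x8 partition shares one chroma MV:
  * the rounded mean of its four luma MVs. E.g. components 3, 4, 5 and 6
  * give ROUNDED_DIV(18, 4) = 5; the division rounds instead of truncating. */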
2716 
2717  mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
2718  s->dst[1], s->dst[2], ls_uv,
2719  ref1->data[1], ref1->linesize[1],
2720  ref1->data[2], ref1->linesize[2], tref1,
2721  row << 2, col << 2, &mvuv, bw, bh, w1, h1);
2722 
2723  if (b->comp) {
2724  if (b->bs > BS_8x8) {
2725  mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
2726  mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
2727  } else {
2728  mvuv = b->mv[0][1];
2729  }
2730  mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
2731  s->dst[1], s->dst[2], ls_uv,
2732  ref2->data[1], ref2->linesize[1],
2733  ref2->data[2], ref2->linesize[2], tref2,
2734  row << 2, col << 2, &mvuv, bw, bh, w2, h2);
2735  }
2736  }
2737 
2738  if (!b->skip) {
2739  /* mostly copied from intra_recon() */
2740 
2741  int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2742  int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2743  int end_x = FFMIN(2 * (s->cols - col), w4);
2744  int end_y = FFMIN(2 * (s->rows - row), h4);
2745  int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2746  int uvstep1d = 1 << b->uvtx, p;
2747  uint8_t *dst = s->dst[0];
2748 
2749  // y itxfm add
2750  for (n = 0, y = 0; y < end_y; y += step1d) {
2751  uint8_t *ptr = dst;
2752  for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
2753  int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2754 
2755  if (eob)
2756  s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
2757  s->block + 16 * n, eob);
2758  }
2759  dst += 4 * s->y_stride * step1d;
2760  }
2761 
2762  // uv itxfm add
2763  h4 >>= 1;
2764  w4 >>= 1;
2765  end_x >>= 1;
2766  end_y >>= 1;
2767  step = 1 << (b->uvtx * 2);
2768  for (p = 0; p < 2; p++) {
2769  dst = s->dst[p + 1];
2770  for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2771  uint8_t *ptr = dst;
2772  for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
2773  int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2774 
2775  if (eob)
2776  s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2777  s->uvblock[p] + 16 * n, eob);
2778  }
2779  dst += 4 * uvstep1d * s->uv_stride;
2780  }
2781  }
2782  }
2783 }
2784 
2785 static av_always_inline void mask_edges(struct VP9Filter *lflvl, int is_uv,
2786  int row_and_7, int col_and_7,
2787  int w, int h, int col_end, int row_end,
2788  enum TxfmMode tx, int skip_inter)
2789 {
2790  // FIXME I'm pretty sure all loops can be replaced by a single LUT if
2791  // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
2792  // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
2793  // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
2794 
2795  // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
2796  // edges. This means that for UV, we work on two subsampled blocks at
2797  // a time, and we only use the topleft block's mode information to set
2798  // things like block strength. Thus, for any block size smaller than
2799  // 16x16, ignore the odd portion of the block.
2800  if (tx == TX_4X4 && is_uv) {
2801  if (h == 1) {
2802  if (row_and_7 & 1)
2803  return;
2804  if (!row_end)
2805  h += 1;
2806  }
2807  if (w == 1) {
2808  if (col_and_7 & 1)
2809  return;
2810  if (!col_end)
2811  w += 1;
2812  }
2813  }
2814 
2815  if (tx == TX_4X4 && !skip_inter) {
2816  int t = 1 << col_and_7, m_col = (t << w) - t, y;
2817  int m_col_odd = (t << (w - 1)) - t;
2818 
2819  // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
2820  if (is_uv) {
2821  int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;
2822 
2823  for (y = row_and_7; y < h + row_and_7; y++) {
2824  int col_mask_id = 2 - !(y & 7);
2825 
2826  lflvl->mask[is_uv][0][y][1] |= m_row_8;
2827  lflvl->mask[is_uv][0][y][2] |= m_row_4;
2828  // for odd lines, if the odd col is not being filtered,
2829  // skip odd row also:
2830  // .---. <-- a
2831  // | |
2832  // |___| <-- b
2833  // ^ ^
2834  // c d
2835  //
2836  // if a/c are even row/col and b/d are odd, and d is skipped,
2837  // e.g. right edge of size-66x66.webm, then skip b also (bug)
2838  if ((col_end & 1) && (y & 1)) {
2839  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
2840  } else {
2841  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
2842  }
2843  }
2844  } else {
2845  int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;
2846 
2847  for (y = row_and_7; y < h + row_and_7; y++) {
2848  int col_mask_id = 2 - !(y & 3);
2849 
2850  lflvl->mask[is_uv][0][y][1] |= m_row_8; // row edge
2851  lflvl->mask[is_uv][0][y][2] |= m_row_4;
2852  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
2853  lflvl->mask[is_uv][0][y][3] |= m_col;
2854  lflvl->mask[is_uv][1][y][3] |= m_col;
2855  }
2856  }
2857  } else {
2858  int y, t = 1 << col_and_7, m_col = (t << w) - t;
2859 
2860  if (!skip_inter) {
2861  int mask_id = (tx == TX_8X8);
2862  int l2 = tx + is_uv - 1, step1d = 1 << l2;
2863  static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
2864  int m_row = m_col & masks[l2];
2865 
2866  // at odd UV col/row tx16/tx32 loopfilter edges, force the 8-wide
2867  // loopfilter to prevent going off the visible edge.
2868  if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
2869  int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
2870  int m_row_8 = m_row - m_row_16;
2871 
2872  for (y = row_and_7; y < h + row_and_7; y++) {
2873  lflvl->mask[is_uv][0][y][0] |= m_row_16;
2874  lflvl->mask[is_uv][0][y][1] |= m_row_8;
2875  }
2876  } else {
2877  for (y = row_and_7; y < h + row_and_7; y++)
2878  lflvl->mask[is_uv][0][y][mask_id] |= m_row;
2879  }
2880 
2881  if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
2882  for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
2883  lflvl->mask[is_uv][1][y][0] |= m_col;
2884  if (y - row_and_7 == h - 1)
2885  lflvl->mask[is_uv][1][y][1] |= m_col;
2886  } else {
2887  for (y = row_and_7; y < h + row_and_7; y += step1d)
2888  lflvl->mask[is_uv][1][y][mask_id] |= m_col;
2889  }
2890  } else if (tx != TX_4X4) {
2891  int mask_id;
2892 
2893  mask_id = (tx == TX_8X8) || (is_uv && h == 1);
2894  lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
2895  mask_id = (tx == TX_8X8) || (is_uv && w == 1);
2896  for (y = row_and_7; y < h + row_and_7; y++)
2897  lflvl->mask[is_uv][0][y][mask_id] |= t;
2898  } else if (is_uv) {
2899  int t8 = t & 0x01, t4 = t - t8;
2900 
2901  for (y = row_and_7; y < h + row_and_7; y++) {
2902  lflvl->mask[is_uv][0][y][2] |= t4;
2903  lflvl->mask[is_uv][0][y][1] |= t8;
2904  }
2905  lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
2906  } else {
2907  int t8 = t & 0x11, t4 = t - t8;
2908 
2909  for (y = row_and_7; y < h + row_and_7; y++) {
2910  lflvl->mask[is_uv][0][y][2] |= t4;
2911  lflvl->mask[is_uv][0][y][1] |= t8;
2912  }
2913  lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
2914  }
2915  }
2916 }
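 /* Illustrative note (added annotation, not part of the upstream source):
  * the masks built above are 8-bit column runs per 4x4 row. With
  * t = 1 << col_and_7 and m_col = (t << w) - t, a block starting at 4x4
  * column 2 with w = 3 gives t = 4 and m_col = (4 << 3) - 4 = 0x1c, i.e.
  * bits 2..4 set. Each run is OR-ed into the appropriate filter-width
  * plane (16/8/4/inner-4) for column and row edges, and loopfilter_sb()
  * later walks these masks bit by bit. */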
2917 
2918 static void decode_b(AVCodecContext *ctx, int row, int col,
2919  struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
2920  enum BlockLevel bl, enum BlockPartition bp)
2921 {
2922  VP9Context *s = ctx->priv_data;
2923  VP9Block *b = s->b;
2924  enum BlockSize bs = bl * 3 + bp;
2925  int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
2926  int emu[2];
2927  AVFrame *f = s->frames[CUR_FRAME].tf.f;
2928 
2929  s->row = row;
2930  s->row7 = row & 7;
2931  s->col = col;
2932  s->col7 = col & 7;
2933  s->min_mv.x = -(128 + col * 64);
2934  s->min_mv.y = -(128 + row * 64);
2935  s->max_mv.x = 128 + (s->cols - col - w4) * 64;
2936  s->max_mv.y = 128 + (s->rows - row - h4) * 64;
2937  if (s->pass < 2) {
2938  b->bs = bs;
2939  b->bl = bl;
2940  b->bp = bp;
2941  decode_mode(ctx);
2942  b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));
2943 
2944  if (!b->skip) {
2945  decode_coeffs(ctx);
2946  } else {
2947  int row7 = s->row7;
2948 
2949 #define SPLAT_ZERO_CTX(v, n) \
2950  switch (n) { \
2951  case 1: v = 0; break; \
2952  case 2: AV_ZERO16(&v); break; \
2953  case 4: AV_ZERO32(&v); break; \
2954  case 8: AV_ZERO64(&v); break; \
2955  case 16: AV_ZERO128(&v); break; \
2956  }
2957 #define SPLAT_ZERO_YUV(dir, var, off, n) \
2958  do { \
2959  SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
2960  SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
2961  SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
2962  } while (0)
2963 
2964  switch (w4) {
2965  case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1); break;
2966  case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2); break;
2967  case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4); break;
2968  case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8); break;
2969  }
2970  switch (h4) {
2971  case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1); break;
2972  case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2); break;
2973  case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4); break;
2974  case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8); break;
2975  }
2976  }
2977  if (s->pass == 1) {
2978  s->b++;
2979  s->block += w4 * h4 * 64;
2980  s->uvblock[0] += w4 * h4 * 16;
2981  s->uvblock[1] += w4 * h4 * 16;
2982  s->eob += 4 * w4 * h4;
2983  s->uveob[0] += w4 * h4;
2984  s->uveob[1] += w4 * h4;
2985 
2986  return;
2987  }
2988  }
2989 
2990  // use emulated overhangs if the stride of the target buffer can't hold
2991  // the block. This allows us to support emu-edge and so on even with
2992  // large block overhangs
2993  emu[0] = (col + w4) * 8 > f->linesize[0] ||
2994  (row + h4) > s->rows;
2995  emu[1] = (col + w4) * 4 > f->linesize[1] ||
2996  (row + h4) > s->rows;
2997  if (emu[0]) {
2998  s->dst[0] = s->tmp_y;
2999  s->y_stride = 64;
3000  } else {
3001  s->dst[0] = f->data[0] + yoff;
3002  s->y_stride = f->linesize[0];
3003  }
3004  if (emu[1]) {
3005  s->dst[1] = s->tmp_uv[0];
3006  s->dst[2] = s->tmp_uv[1];
3007  s->uv_stride = 32;
3008  } else {
3009  s->dst[1] = f->data[1] + uvoff;
3010  s->dst[2] = f->data[2] + uvoff;
3011  s->uv_stride = f->linesize[1];
3012  }
3013  if (b->intra) {
3014  intra_recon(ctx, yoff, uvoff);
3015  } else {
3016  inter_recon(ctx);
3017  }
3018  if (emu[0]) {
3019  int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3020 
3021  for (n = 0; o < w; n++) {
3022  int bw = 64 >> n;
3023 
3024  av_assert2(n <= 4);
3025  if (w & bw) {
3026  s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o, f->linesize[0],
3027  s->tmp_y + o, 64, h, 0, 0);
3028  o += bw;
3029  }
3030  }
3031  }
3032  if (emu[1]) {
3033  int w = FFMIN(s->cols - col, w4) * 4, h = FFMIN(s->rows - row, h4) * 4, n, o = 0;
3034 
3035  for (n = 1; o < w; n++) {
3036  int bw = 64 >> n;
3037 
3038  av_assert2(n <= 4);
3039  if (w & bw) {
3040  s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o, f->linesize[1],
3041  s->tmp_uv[0] + o, 32, h, 0, 0);
3042  s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o, f->linesize[2],
3043  s->tmp_uv[1] + o, 32, h, 0, 0);
3044  o += bw;
3045  }
3046  }
3047  }
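 /* Illustrative note (added annotation, not part of the upstream source):
  * the two copy-back loops above split the visible block width into
  * power-of-two chunks so the plain-copy mc[n][0][0][0][0] functions
  * (64 >> n pixels wide) can be reused; e.g. w = 40 is written back as
  * one 32-wide copy followed by one 8-wide copy. */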
3048 
3049  // pick filter level and find edges to apply filter to
3050  if (s->filter.level &&
3051  (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3052  [b->mode[3] != ZEROMV]) > 0) {
3053  int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3054  int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
3055 
3056  setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3057  mask_edges(lflvl, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3058  mask_edges(lflvl, 1, row7, col7, x_end, y_end,
3059  s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3060  s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3061  b->uvtx, skip_inter);
3062 
3063  if (!s->filter.lim_lut[lvl]) {
3064  int sharp = s->filter.sharpness;
3065  int limit = lvl;
3066 
3067  if (sharp > 0) {
3068  limit >>= (sharp + 3) >> 2;
3069  limit = FFMIN(limit, 9 - sharp);
3070  }
3071  limit = FFMAX(limit, 1);
3072 
3073  s->filter.lim_lut[lvl] = limit;
3074  s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
3075  }
3076  }
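 /* Illustrative note (added annotation, not part of the upstream source):
  * worked example of the lazily filled limit LUT above, for lvl = 36 and
  * sharpness = 4:
  *     limit = 36 >> ((4 + 3) >> 2) = 18, then FFMIN(18, 9 - 4) = 5
  *     lim_lut[36]   = FFMAX(5, 1) = 5
  *     mblim_lut[36] = 2 * (36 + 2) + 5 = 81
  * these become the I and E thresholds used in loopfilter_sb(). */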
3077 
3078  if (s->pass == 2) {
3079  s->b++;
3080  s->block += w4 * h4 * 64;
3081  s->uvblock[0] += w4 * h4 * 16;
3082  s->uvblock[1] += w4 * h4 * 16;
3083  s->eob += 4 * w4 * h4;
3084  s->uveob[0] += w4 * h4;
3085  s->uveob[1] += w4 * h4;
3086  }
3087 }
3088 
3089 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3090  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3091 {
3092  VP9Context *s = ctx->priv_data;
3093  int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3094  (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
3095  const uint8_t *p = s->keyframe ? vp9_default_kf_partition_probs[bl][c] :
3096  s->prob.p.partition[bl][c];
3097  enum BlockPartition bp;
3098  ptrdiff_t hbs = 4 >> bl;
3099  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3100  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3101 
3102  if (bl == BL_8X8) {
3103  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3104  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3105  } else if (col + hbs < s->cols) { // FIXME why not <=?
3106  if (row + hbs < s->rows) { // FIXME why not <=?
3107  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3108  switch (bp) {
3109  case PARTITION_NONE:
3110  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3111  break;
3112  case PARTITION_H:
3113  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3114  yoff += hbs * 8 * y_stride;
3115  uvoff += hbs * 4 * uv_stride;
3116  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
3117  break;
3118  case PARTITION_V:
3119  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3120  yoff += hbs * 8;
3121  uvoff += hbs * 4;
3122  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3123  break;
3124  case PARTITION_SPLIT:
3125  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3126  decode_sb(ctx, row, col + hbs, lflvl,
3127  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3128  yoff += hbs * 8 * y_stride;
3129  uvoff += hbs * 4 * uv_stride;
3130  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3131  decode_sb(ctx, row + hbs, col + hbs, lflvl,
3132  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3133  break;
3134  default:
3135  av_assert0(0);
3136  }
3137  } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3138  bp = PARTITION_SPLIT;
3139  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3140  decode_sb(ctx, row, col + hbs, lflvl,
3141  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3142  } else {
3143  bp = PARTITION_H;
3144  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3145  }
3146  } else if (row + hbs < s->rows) { // FIXME why not <=?
3147  if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3148  bp = PARTITION_SPLIT;
3149  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3150  yoff += hbs * 8 * y_stride;
3151  uvoff += hbs * 4 * uv_stride;
3152  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3153  } else {
3154  bp = PARTITION_V;
3155  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3156  }
3157  } else {
3158  bp = PARTITION_SPLIT;
3159  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3160  }
3161  s->counts.partition[bl][c][bp]++;
3162 }
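 /* Illustrative note (added annotation, not part of the upstream source):
  * at frame edges only some partitions are legal, so the full
  * vp9_partition_tree is bypassed: p[1] picks PARTITION_SPLIT over
  * PARTITION_H when the bottom half of the block is off-frame, p[2] picks
  * PARTITION_SPLIT over PARTITION_V when the right half is, and when both
  * halves are off-frame the split is implicit and no bit is read. */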
3163 
3164 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3165  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3166 {
3167  VP9Context *s = ctx->priv_data;
3168  VP9Block *b = s->b;
3169  ptrdiff_t hbs = 4 >> bl;
3170  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3171  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3172 
3173  if (bl == BL_8X8) {
3174  av_assert2(b->bl == BL_8X8);
3175  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3176  } else if (s->b->bl == bl) {
3177  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3178  if (b->bp == PARTITION_H && row + hbs < s->rows) {
3179  yoff += hbs * 8 * y_stride;
3180  uvoff += hbs * 4 * uv_stride;
3181  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3182  } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3183  yoff += hbs * 8;
3184  uvoff += hbs * 4;
3185  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
3186  }
3187  } else {
3188  decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3189  if (col + hbs < s->cols) { // FIXME why not <=?
3190  if (row + hbs < s->rows) {
3191  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs,
3192  uvoff + 4 * hbs, bl + 1);
3193  yoff += hbs * 8 * y_stride;
3194  uvoff += hbs * 4 * uv_stride;
3195  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3196  decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3197  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3198  } else {
3199  yoff += hbs * 8;
3200  uvoff += hbs * 4;
3201  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3202  }
3203  } else if (row + hbs < s->rows) {
3204  yoff += hbs * 8 * y_stride;
3205  uvoff += hbs * 4 * uv_stride;
3206  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3207  }
3208  }
3209 }
3210 
3211 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3212  int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3213 {
3214  VP9Context *s = ctx->priv_data;
3215  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3216  uint8_t *dst = f->data[0] + yoff, *lvl = lflvl->level;
3217  ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
3218  int y, x, p;
3219 
3220  // FIXME to what extent can we interleave the v/h loopfilter calls? E.g.
3221  // if you think of them as acting on a 8x8 block max, we can interleave
3222  // each v/h within the single x loop, but that only works if we work on
3223  // 8 pixel blocks, and we won't always do that (we want at least 16px
3224  // to use SSE2 optimizations, perhaps 32 for AVX2)
3225 
3226  // filter edges between columns, Y plane (e.g. block1 | block2)
3227  for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
3228  uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
3229  uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
3230  unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3231  unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3232  unsigned hm = hm1 | hm2 | hm13 | hm23;
3233 
3234  for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
3235  if (hm1 & x) {
3236  int L = *l, H = L >> 4;
3237  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3238 
3239  if (col || x > 1) {
3240  if (hmask1[0] & x) {
3241  if (hmask2[0] & x) {
3242  av_assert2(l[8] == L);
3243  s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
3244  } else {
3245  s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
3246  }
3247  } else if (hm2 & x) {
3248  L = l[8];
3249  H |= (L >> 4) << 8;
3250  E |= s->filter.mblim_lut[L] << 8;
3251  I |= s->filter.lim_lut[L] << 8;
3252  s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3253  [!!(hmask2[1] & x)]
3254  [0](ptr, ls_y, E, I, H);
3255  } else {
3256  s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3257  [0](ptr, ls_y, E, I, H);
3258  }
3259  }
3260  } else if (hm2 & x) {
3261  int L = l[8], H = L >> 4;
3262  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3263 
3264  if (col || x > 1) {
3265  s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3266  [0](ptr + 8 * ls_y, ls_y, E, I, H);
3267  }
3268  }
3269  if (hm13 & x) {
3270  int L = *l, H = L >> 4;
3271  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3272 
3273  if (hm23 & x) {
3274  L = l[8];
3275  H |= (L >> 4) << 8;
3276  E |= s->filter.mblim_lut[L] << 8;
3277  I |= s->filter.lim_lut[L] << 8;
3278  s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
3279  } else {
3280  s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
3281  }
3282  } else if (hm23 & x) {
3283  int L = l[8], H = L >> 4;
3284  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3285 
3286  s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
3287  }
3288  }
3289  }
3290 
3291  // block1
3292  // filter edges between rows, Y plane (e.g. ------)
3293  // block2
3294  dst = f->data[0] + yoff;
3295  lvl = lflvl->level;
3296  for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
3297  uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
3298  unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
3299 
3300  for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
3301  if (row || y) {
3302  if (vm & x) {
3303  int L = *l, H = L >> 4;
3304  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3305 
3306  if (vmask[0] & x) {
3307  if (vmask[0] & (x << 1)) {
3308  av_assert2(l[1] == L);
3309  s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
3310  } else {
3311  s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
3312  }
3313  } else if (vm & (x << 1)) {
3314  L = l[1];
3315  H |= (L >> 4) << 8;
3316  E |= s->filter.mblim_lut[L] << 8;
3317  I |= s->filter.lim_lut[L] << 8;
3318  s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3319  [!!(vmask[1] & (x << 1))]
3320  [1](ptr, ls_y, E, I, H);
3321  } else {
3322  s->dsp.loop_filter_8[!!(vmask[1] & x)]
3323  [1](ptr, ls_y, E, I, H);
3324  }
3325  } else if (vm & (x << 1)) {
3326  int L = l[1], H = L >> 4;
3327  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3328 
3329  s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
3330  [1](ptr + 8, ls_y, E, I, H);
3331  }
3332  }
3333  if (vm3 & x) {
3334  int L = *l, H = L >> 4;
3335  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3336 
3337  if (vm3 & (x << 1)) {
3338  L = l[1];
3339  H |= (L >> 4) << 8;
3340  E |= s->filter.mblim_lut[L] << 8;
3341  I |= s->filter.lim_lut[L] << 8;
3342  s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
3343  } else {
3344  s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
3345  }
3346  } else if (vm3 & (x << 1)) {
3347  int L = l[1], H = L >> 4;
3348  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3349 
3350  s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
3351  }
3352  }
3353  }
3354 
3355  // same principle but for U/V planes
3356  for (p = 0; p < 2; p++) {
3357  lvl = lflvl->level;
3358  dst = f->data[1 + p] + uvoff;
3359  for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
3360  uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
3361  uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
3362  unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
3363  unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;
3364 
3365  for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
3366  if (col || x > 1) {
3367  if (hm1 & x) {
3368  int L = *l, H = L >> 4;
3369  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3370 
3371  if (hmask1[0] & x) {
3372  if (hmask2[0] & x) {
3373  av_assert2(l[16] == L);
3374  s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
3375  } else {
3376  s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
3377  }
3378  } else if (hm2 & x) {
3379  L = l[16];
3380  H |= (L >> 4) << 8;
3381  E |= s->filter.mblim_lut[L] << 8;
3382  I |= s->filter.lim_lut[L] << 8;
3383  s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3384  [!!(hmask2[1] & x)]
3385  [0](ptr, ls_uv, E, I, H);
3386  } else {
3387  s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3388  [0](ptr, ls_uv, E, I, H);
3389  }
3390  } else if (hm2 & x) {
3391  int L = l[16], H = L >> 4;
3392  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3393 
3394  s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3395  [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
3396  }
3397  }
3398  if (x & 0xAA)
3399  l += 2;
3400  }
3401  }
3402  lvl = lflvl->level;
3403  dst = f->data[1 + p] + uvoff;
3404  for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
3405  uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
3406  unsigned vm = vmask[0] | vmask[1] | vmask[2];
3407 
3408  for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
3409  if (row || y) {
3410  if (vm & x) {
3411  int L = *l, H = L >> 4;
3412  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3413 
3414  if (vmask[0] & x) {
3415  if (vmask[0] & (x << 2)) {
3416  av_assert2(l[2] == L);
3417  s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
3418  } else {
3419  s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
3420  }
3421  } else if (vm & (x << 2)) {
3422  L = l[2];
3423  H |= (L >> 4) << 8;
3424  E |= s->filter.mblim_lut[L] << 8;
3425  I |= s->filter.lim_lut[L] << 8;
3426  s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3427  [!!(vmask[1] & (x << 2))]
3428  [1](ptr, ls_uv, E, I, H);
3429  } else {
3430  s->dsp.loop_filter_8[!!(vmask[1] & x)]
3431  [1](ptr, ls_uv, E, I, H);
3432  }
3433  } else if (vm & (x << 2)) {
3434  int L = l[2], H = L >> 4;
3435  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3436 
3437  s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
3438  [1](ptr + 8, ls_uv, E, I, H);
3439  }
3440  }
3441  }
3442  if (y & 1)
3443  lvl += 16;
3444  }
3445  }
3446 }
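 /* Illustrative note (added annotation, not part of the upstream source):
  * L above is the filter level of one 8px edge. When two adjacent edges
  * are filtered in a single loop_filter_mix2 call, the second edge's
  * H/E/I thresholds are packed into bits 8..15 of the same arguments,
  * which is why the second level is fetched from l[8] (Y columns), l[1]
  * (Y rows), l[16] (UV columns) or l[2] (UV rows). */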
3447 
3448 static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
3449 {
3450  int sb_start = ( idx * n) >> log2_n;
3451  int sb_end = ((idx + 1) * n) >> log2_n;
3452  *start = FFMIN(sb_start, n) << 3;
3453  *end = FFMIN(sb_end, n) << 3;
3454 }
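 /* Illustrative note (added annotation, not part of the upstream source):
  * tiles split the superblock grid with shifts; e.g. log2_n = 1 (two tile
  * columns) and n = 11 sb64 columns gives tile 0 the sb range [0, 5) and
  * tile 1 the range [5, 11), returned <<3 in 8x8-block units: 0..40 and
  * 40..88. */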
3455 
3456 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3457  int max_count, int update_factor)
3458 {
3459  unsigned ct = ct0 + ct1, p2, p1;
3460 
3461  if (!ct)
3462  return;
3463 
3464  p1 = *p;
3465  p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3466  p2 = av_clip(p2, 1, 255);
3467  ct = FFMIN(ct, max_count);
3468  update_factor = FASTDIV(update_factor * ct, max_count);
3469 
3470  // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3471  *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
3472 }
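 /* Illustrative note (added annotation, not part of the upstream source):
  * worked example of the adaptation above, with *p = 128, ct0 = 30,
  * ct1 = 10, max_count = 20 and update_factor = 128:
  *     p2 = ((30 << 8) + 20) / 40 = 192   (empirical branch-0 probability)
  *     ct = FFMIN(40, 20) = 20, so update_factor stays 128
  *     *p = 128 + (((192 - 128) * 128 + 128) >> 8) = 160
  * i.e. the stored probability moves halfway towards the observed one. */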
3473 
3474 static void adapt_probs(VP9Context *s)
3475 {
3476  int i, j, k, l, m;
3477  prob_context *p = &s->prob_ctx[s->framectxid].p;
3478  int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
3479 
3480  // coefficients
3481  for (i = 0; i < 4; i++)
3482  for (j = 0; j < 2; j++)
3483  for (k = 0; k < 2; k++)
3484  for (l = 0; l < 6; l++)
3485  for (m = 0; m < 6; m++) {
3486  uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
3487  unsigned *e = s->counts.eob[i][j][k][l][m];
3488  unsigned *c = s->counts.coef[i][j][k][l][m];
3489 
3490  if (l == 0 && m >= 3) // dc only has 3 pt
3491  break;
3492 
3493  adapt_prob(&pp[0], e[0], e[1], 24, uf);
3494  adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3495  adapt_prob(&pp[2], c[1], c[2], 24, uf);
3496  }
3497 
3498  if (s->keyframe || s->intraonly) {
3499  memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3500  memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3501  memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3502  memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
3503  return;
3504  }
3505 
3506  // skip flag
3507  for (i = 0; i < 3; i++)
3508  adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3509 
3510  // intra/inter flag
3511  for (i = 0; i < 4; i++)
3512  adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3513 
3514  // comppred flag
3515  if (s->comppredmode == PRED_SWITCHABLE) {
3516  for (i = 0; i < 5; i++)
3517  adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3518  }
3519 
3520  // reference frames
3521  if (s->comppredmode != PRED_SINGLEREF) {
3522  for (i = 0; i < 5; i++)
3523  adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3524  s->counts.comp_ref[i][1], 20, 128);
3525  }
3526 
3527  if (s->comppredmode != PRED_COMPREF) {
3528  for (i = 0; i < 5; i++) {
3529  uint8_t *pp = p->single_ref[i];
3530  unsigned (*c)[2] = s->counts.single_ref[i];
3531 
3532  adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3533  adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3534  }
3535  }
3536 
3537  // block partitioning
3538  for (i = 0; i < 4; i++)
3539  for (j = 0; j < 4; j++) {
3540  uint8_t *pp = p->partition[i][j];
3541  unsigned *c = s->counts.partition[i][j];
3542 
3543  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3544  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3545  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3546  }
3547 
3548  // tx size
3549  if (s->txfmmode == TX_SWITCHABLE) {
3550  for (i = 0; i < 2; i++) {
3551  unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3552 
3553  adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3554  adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3555  adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3556  adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3557  adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3558  adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3559  }
3560  }
3561 
3562  // interpolation filter
3563  if (s->filtermode == FILTER_SWITCHABLE) {
3564  for (i = 0; i < 4; i++) {
3565  uint8_t *pp = p->filter[i];
3566  unsigned *c = s->counts.filter[i];
3567 
3568  adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3569  adapt_prob(&pp[1], c[1], c[2], 20, 128);
3570  }
3571  }
3572 
3573  // inter modes
3574  for (i = 0; i < 7; i++) {
3575  uint8_t *pp = p->mv_mode[i];
3576  unsigned *c = s->counts.mv_mode[i];
3577 
3578  adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3579  adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3580  adapt_prob(&pp[2], c[1], c[3], 20, 128);
3581  }
3582 
3583  // mv joints
3584  {
3585  uint8_t *pp = p->mv_joint;
3586  unsigned *c = s->counts.mv_joint;
3587 
3588  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3589  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3590  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3591  }
3592 
3593  // mv components
3594  for (i = 0; i < 2; i++) {
3595  uint8_t *pp;
3596  unsigned *c, (*c2)[2], sum;
3597 
3598  adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3599  s->counts.mv_comp[i].sign[1], 20, 128);
3600 
3601  pp = p->mv_comp[i].classes;
3602  c = s->counts.mv_comp[i].classes;
3603  sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3604  adapt_prob(&pp[0], c[0], sum, 20, 128);
3605  sum -= c[1];
3606  adapt_prob(&pp[1], c[1], sum, 20, 128);
3607  sum -= c[2] + c[3];
3608  adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3609  adapt_prob(&pp[3], c[2], c[3], 20, 128);
3610  sum -= c[4] + c[5];
3611  adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3612  adapt_prob(&pp[5], c[4], c[5], 20, 128);
3613  sum -= c[6];
3614  adapt_prob(&pp[6], c[6], sum, 20, 128);
3615  adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3616  adapt_prob(&pp[8], c[7], c[8], 20, 128);
3617  adapt_prob(&pp[9], c[9], c[10], 20, 128);
3618 
3619  adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3620  s->counts.mv_comp[i].class0[1], 20, 128);
3621  pp = p->mv_comp[i].bits;
3622  c2 = s->counts.mv_comp[i].bits;
3623  for (j = 0; j < 10; j++)
3624  adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3625 
3626  for (j = 0; j < 2; j++) {
3627  pp = p->mv_comp[i].class0_fp[j];
3628  c = s->counts.mv_comp[i].class0_fp[j];
3629  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3630  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3631  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3632  }
3633  pp = p->mv_comp[i].fp;
3634  c = s->counts.mv_comp[i].fp;
3635  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3636  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3637  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3638 
3639  if (s->highprecisionmvs) {
3640  adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3641  s->counts.mv_comp[i].class0_hp[1], 20, 128);
3642  adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3643  s->counts.mv_comp[i].hp[1], 20, 128);
3644  }
3645  }
3646 
3647  // y intra modes
3648  for (i = 0; i < 4; i++) {
3649  uint8_t *pp = p->y_mode[i];
3650  unsigned *c = s->counts.y_mode[i], sum, s2;
3651 
3652  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3653  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3654  sum -= c[TM_VP8_PRED];
3655  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3656  sum -= c[VERT_PRED];
3657  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3658  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3659  sum -= s2;
3660  adapt_prob(&pp[3], s2, sum, 20, 128);
3661  s2 -= c[HOR_PRED];
3662  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3663  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3664  sum -= c[DIAG_DOWN_LEFT_PRED];
3665  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3666  sum -= c[VERT_LEFT_PRED];
3667  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3668  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3669  }
3670 
3671  // uv intra modes
3672  for (i = 0; i < 10; i++) {
3673  uint8_t *pp = p->uv_mode[i];
3674  unsigned *c = s->counts.uv_mode[i], sum, s2;
3675 
3676  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3677  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3678  sum -= c[TM_VP8_PRED];
3679  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3680  sum -= c[VERT_PRED];
3681  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3682  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3683  sum -= s2;
3684  adapt_prob(&pp[3], s2, sum, 20, 128);
3685  s2 -= c[HOR_PRED];
3686  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3687  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3688  sum -= c[DIAG_DOWN_LEFT_PRED];
3689  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3690  sum -= c[VERT_LEFT_PRED];
3691  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3692  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3693  }
3694 }
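
Most of the unrolled blocks above follow one pattern: an n-ary symbol is coded as a chain of binary decisions ("symbol k" vs "any later symbol"), so tree node k is adapted with count c[k] against the tail sum. A hypothetical helper (adapt_tree is not in vp9.c; it relies on adapt_prob() above) makes that explicit for the degenerate chains such as mv_joint, partition, tx32p and the interpolation filter; the inter-mode, intra-mode and mv-class trees branch or order their symbols differently and are therefore unrolled by hand:

    /* Hypothetical: adapt_tree(p->mv_joint, s->counts.mv_joint, 4)
     * reproduces the three mv_joint updates above. */
    static void adapt_tree(uint8_t *pp, const unsigned *c, int n_symbols)
    {
        unsigned tail = 0;
        int k;

        for (k = 1; k < n_symbols; k++)
            tail += c[k];                            /* everything after symbol 0 */
        for (k = 0; k < n_symbols - 1; k++) {
            adapt_prob(&pp[k], c[k], tail, 20, 128); /* "symbol k" vs "later" */
            tail -= c[k + 1];
        }
    }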
3695 
3696 static void free_buffers(VP9Context *s)
3697 {
3698  av_freep(&s->intra_pred_data[0]);
3699  av_freep(&s->b_base);
3700  av_freep(&s->block_base);
3701 }
3702 
3703 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3704 {
3705  VP9Context *s = ctx->priv_data;
3706  int i;
3707 
3708  for (i = 0; i < 2; i++) {
3709  if (s->frames[i].tf.f->data[0])
3710  vp9_unref_frame(ctx, &s->frames[i]);
3711  av_frame_free(&s->frames[i].tf.f);
3712  }
3713  for (i = 0; i < 8; i++) {
3714  if (s->refs[i].f->data[0])
3715  ff_thread_release_buffer(ctx, &s->refs[i]);
3716  av_frame_free(&s->refs[i].f);
3717  if (s->next_refs[i].f->data[0])
3718  ff_thread_release_buffer(ctx, &s->next_refs[i]);
3719  av_frame_free(&s->next_refs[i].f);
3720  }
3721  free_buffers(s);
3722  av_freep(&s->c_b);
3723  s->c_b_size = 0;
3724 
3725  return 0;
3726 }
3727 
3728 
3729 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3730  int *got_frame, AVPacket *pkt)
3731 {
3732  const uint8_t *data = pkt->data;
3733  int size = pkt->size;
3734  VP9Context *s = ctx->priv_data;
3735  int res, tile_row, tile_col, i, ref, row, col;
3736  ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3737  AVFrame *f;
3738 
3739  if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
3740  return res;
3741  } else if (res == 0) {
3742  if (!s->refs[ref].f->data[0]) {
3743  av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3744  return AVERROR_INVALIDDATA;
3745  }
3746  if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
3747  return res;
3748  *got_frame = 1;
3749  return 0;
3750  }
3751  data += res;
3752  size -= res;
3753 
3754  if (s->frames[LAST_FRAME].tf.f->data[0])
3755  vp9_unref_frame(ctx, &s->frames[LAST_FRAME]);
3756  if (!s->keyframe && s->frames[CUR_FRAME].tf.f->data[0] &&
3757  (res = vp9_ref_frame(ctx, &s->frames[LAST_FRAME], &s->frames[CUR_FRAME])) < 0)
3758  return res;
3759  if (s->frames[CUR_FRAME].tf.f->data[0])
3760  vp9_unref_frame(ctx, &s->frames[CUR_FRAME]);
3761  if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
3762  return res;
3763  f = s->frames[CUR_FRAME].tf.f;
3764  f->key_frame = s->keyframe;
3765  f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
3766  ls_y = f->linesize[0];
3767  ls_uv = f->linesize[1];
3768 
3769  // ref frame setup
3770  for (i = 0; i < 8; i++) {
3771  if (s->next_refs[i].f->data[0])
3772  ff_thread_release_buffer(ctx, &s->next_refs[i]);
3773  if (s->refreshrefmask & (1 << i)) {
3774  res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
3775  } else {
3776  res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
3777  }
3778  if (res < 0)
3779  return res;
3780  }
3781 
3782  // main tile decode loop
3783  memset(s->above_partition_ctx, 0, s->cols);
3784  memset(s->above_skip_ctx, 0, s->cols);
3785  if (s->keyframe || s->intraonly) {
3786  memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
3787  } else {
3788  memset(s->above_mode_ctx, NEARESTMV, s->cols);
3789  }
3790  memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
3791  memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
3792  memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
3793  memset(s->above_segpred_ctx, 0, s->cols);
3794  s->pass = s->uses_2pass =
3795  ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
3796  if ((res = update_block_buffers(ctx)) < 0) {
3797  av_log(ctx, AV_LOG_ERROR,
3798  "Failed to allocate block buffers\n");
3799  return res;
3800  }
3801  if (s->refreshctx && s->parallelmode) {
3802  int j, k, l, m;
3803 
3804  for (i = 0; i < 4; i++) {
3805  for (j = 0; j < 2; j++)
3806  for (k = 0; k < 2; k++)
3807  for (l = 0; l < 6; l++)
3808  for (m = 0; m < 6; m++)
3809  memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
3810  s->prob.coef[i][j][k][l][m], 3);
3811  if (s->txfmmode == i)
3812  break;
3813  }
3814  s->prob_ctx[s->framectxid].p = s->prob.p;
3815  ff_thread_finish_setup(ctx);
3816  }
3817 
3818  do {
3819  yoff = uvoff = 0;
3820  s->b = s->b_base;
3821  s->block = s->block_base;
3822  s->uvblock[0] = s->uvblock_base[0];
3823  s->uvblock[1] = s->uvblock_base[1];
3824  s->eob = s->eob_base;
3825  s->uveob[0] = s->uveob_base[0];
3826  s->uveob[1] = s->uveob_base[1];
3827 
3828  for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
3829  set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
3830  tile_row, s->tiling.log2_tile_rows, s->sb_rows);
3831  if (s->pass != 2) {
3832  for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3833  unsigned tile_size;
3834 
3835  if (tile_col == s->tiling.tile_cols - 1 &&
3836  tile_row == s->tiling.tile_rows - 1) {
3837  tile_size = size;
3838  } else {
3839  tile_size = AV_RB32(data);
3840  data += 4;
3841  size -= 4;
3842  }
3843  if (tile_size > size) {
3844  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3845  return AVERROR_INVALIDDATA;
3846  }
3847  ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
3848  if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
3849  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3850  return AVERROR_INVALIDDATA;
3851  }
3852  data += tile_size;
3853  size -= tile_size;
3854  }
3855  }
3856 
3857  for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
3858  row += 8, yoff += ls_y * 64, uvoff += ls_uv * 32) {
3859  struct VP9Filter *lflvl_ptr = s->lflvl;
3860  ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
3861 
3862  for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3863  set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
3864  tile_col, s->tiling.log2_tile_cols, s->sb_cols);
3865 
3866  if (s->pass != 2) {
3867  memset(s->left_partition_ctx, 0, 8);
3868  memset(s->left_skip_ctx, 0, 8);
3869  if (s->keyframe || s->intraonly) {
3870  memset(s->left_mode_ctx, DC_PRED, 16);
3871  } else {
3872  memset(s->left_mode_ctx, NEARESTMV, 8);
3873  }
3874  memset(s->left_y_nnz_ctx, 0, 16);
3875  memset(s->left_uv_nnz_ctx, 0, 16);
3876  memset(s->left_segpred_ctx, 0, 8);
3877 
3878  memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
3879  }
3880 
3881  for (col = s->tiling.tile_col_start;
3882  col < s->tiling.tile_col_end;
3883  col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3884  // FIXME integrate with lf code (i.e. zero after each
3885  // use, similar to invtxfm coefficients, or similar)
3886  if (s->pass != 1) {
3887  memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
3888  }
3889 
3890  if (s->pass == 2) {
3891  decode_sb_mem(ctx, row, col, lflvl_ptr,
3892  yoff2, uvoff2, BL_64X64);
3893  } else {
3894  decode_sb(ctx, row, col, lflvl_ptr,
3895  yoff2, uvoff2, BL_64X64);
3896  }
3897  }
3898  if (s->pass != 2) {
3899  memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
3900  }
3901  }
3902 
3903  if (s->pass == 1) {
3904  continue;
3905  }
3906 
3907  // backup pre-loopfilter reconstruction data for intra
3908  // prediction of next row of sb64s
3909  if (row + 8 < s->rows) {
3910  memcpy(s->intra_pred_data[0],
3911  f->data[0] + yoff + 63 * ls_y,
3912  8 * s->cols);
3913  memcpy(s->intra_pred_data[1],
3914  f->data[1] + uvoff + 31 * ls_uv,
3915  4 * s->cols);
3916  memcpy(s->intra_pred_data[2],
3917  f->data[2] + uvoff + 31 * ls_uv,
3918  4 * s->cols);
3919  }
3920 
3921  // loopfilter one row
3922  if (s->filter.level) {
3923  yoff2 = yoff;
3924  uvoff2 = uvoff;
3925  lflvl_ptr = s->lflvl;
3926  for (col = 0; col < s->cols;
3927  col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3928  loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
3929  }
3930  }
3931 
3932  // FIXME maybe we can make this more fine-grained by running the
3933  // loopfilter per-block instead of after each sbrow
3934  // In fact that would also make intra pred left preparation easier?
3935  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
3936  }
3937  }
3938 
3939  if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
3940  adapt_probs(s);
3941  ff_thread_finish_setup(ctx);
3942  }
3943  } while (s->pass++ == 1);
3944  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3945 
3946  // ref frame setup
3947  for (i = 0; i < 8; i++) {
3948  if (s->refs[i].f->data[0])
3949  ff_thread_release_buffer(ctx, &s->refs[i]);
3950  ff_thread_ref_frame(&s->refs[i], &s->next_refs[i]);
3951  }
3952 
3953  if (!s->invisible) {
3954  if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
3955  return res;
3956  *got_frame = 1;
3957  }
3958 
3959  return 0;
3960 }
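
The tile framing consumed above is simple: every tile except the final one is preceded by a 32-bit big-endian payload length, and the final tile runs to the end of the packet. A hedged sketch of just that framing, flattening the row/column loops into a single tile count (illustrative only; rb32() stands in for AV_RB32()):

    #include <stdint.h>
    #include <stddef.h>

    static unsigned rb32(const uint8_t *p)
    {
        return ((unsigned)p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
    }

    /* Returns 0 if all tiles fit in the buffer, -1 on overrun. */
    static int walk_tiles(const uint8_t *data, size_t size, int n_tiles)
    {
        int t;

        for (t = 0; t < n_tiles; t++) {
            size_t tile_size;

            if (t == n_tiles - 1) {
                tile_size = size;      /* last tile: implicit length */
            } else {
                if (size < 4)
                    return -1;         /* extra guard; not in the loop above */
                tile_size = rb32(data);
                data += 4;
                size -= 4;
            }
            if (tile_size > size)
                return -1;
            /* the decoder initializes a range coder over data[0..tile_size) here */
            data += tile_size;
            size -= tile_size;
        }
        return 0;
    }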
3961 
3962 static void vp9_decode_flush(AVCodecContext *ctx)
3963 {
3964  VP9Context *s = ctx->priv_data;
3965  int i;
3966 
3967  for (i = 0; i < 2; i++)
3968  vp9_unref_frame(ctx, &s->frames[i]);
3969  for (i = 0; i < 8; i++)
3970  ff_thread_release_buffer(ctx, &s->refs[i]);
3971 }
3972 
3973 static int init_frames(AVCodecContext *ctx)
3974 {
3975  VP9Context *s = ctx->priv_data;
3976  int i;
3977 
3978  for (i = 0; i < 2; i++) {
3979  s->frames[i].tf.f = av_frame_alloc();
3980  if (!s->frames[i].tf.f) {
3981  vp9_decode_free(ctx);
3982  av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3983  return AVERROR(ENOMEM);
3984  }
3985  }
3986  for (i = 0; i < 8; i++) {
3987  s->refs[i].f = av_frame_alloc();
3988  s->next_refs[i].f = av_frame_alloc();
3989  if (!s->refs[i].f || !s->next_refs[i].f) {
3990  vp9_decode_free(ctx);
3991  av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3992  return AVERROR(ENOMEM);
3993  }
3994  }
3995 
3996  return 0;
3997 }
3998 
3999 static av_cold int vp9_decode_init(AVCodecContext *ctx)
4000 {
4001  VP9Context *s = ctx->priv_data;
4002 
4003  ctx->internal->allocate_progress = 1;
4004  ctx->pix_fmt = AV_PIX_FMT_YUV420P;
4005  ff_vp9dsp_init(&s->dsp);
4006  ff_videodsp_init(&s->vdsp, 8);
4007  s->filter.sharpness = -1;
4008 
4009  return init_frames(ctx);
4010 }
4011 
4012 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4013 {
4014  return init_frames(avctx);
4015 }
4016 
4017 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
4018 {
4019  int i, res;
4020  VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
4021 
4022  // detect size changes in other threads
4023  if (s->intra_pred_data[0] &&
4024  (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols || s->rows != ssrc->rows)) {
4025  free_buffers(s);
4026  }
4027 
4028  for (i = 0; i < 2; i++) {
4029  if (s->frames[i].tf.f->data[0])
4030  vp9_unref_frame(dst, &s->frames[i]);
4031  if (ssrc->frames[i].tf.f->data[0]) {
4032  if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
4033  return res;
4034  }
4035  }
4036  for (i = 0; i < 8; i++) {
4037  if (s->refs[i].f->data[0])
4038  ff_thread_release_buffer(dst, &s->refs[i]);
4039  if (ssrc->next_refs[i].f->data[0]) {
4040  if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
4041  return res;
4042  }
4043  }
4044 
4045  s->invisible = ssrc->invisible;
4046  s->keyframe = ssrc->keyframe;
4047  s->uses_2pass = ssrc->uses_2pass;
4048  memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4049  memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
4050  if (ssrc->segmentation.enabled) {
4051  memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
4052  sizeof(s->segmentation.feat));
4053  }
4054 
4055  return 0;
4056 }
4057 
4058 AVCodec ff_vp9_decoder = {
4059  .name = "vp9",
4060  .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
4061  .type = AVMEDIA_TYPE_VIDEO,
4062  .id = AV_CODEC_ID_VP9,
4063  .priv_data_size = sizeof(VP9Context),
4064  .init = vp9_decode_init,
4065  .close = vp9_decode_free,
4066  .decode = vp9_decode_frame,
4067  .capabilities = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
4068  .flush = vp9_decode_flush,
4069  .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4070  .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4071 };
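
For reference, a hedged caller-side sketch using the era-appropriate public libavcodec API (error handling trimmed; decode_one and its parameters are illustrative, out is a caller-allocated AVFrame):

    #include <libavcodec/avcodec.h>
    #include <libavutil/mem.h>

    static int decode_one(AVPacket *pkt, AVFrame *out)
    {
        AVCodec *codec = avcodec_find_decoder(AV_CODEC_ID_VP9);
        AVCodecContext *avctx;
        int got_frame = 0, ret;

        if (!codec || !(avctx = avcodec_alloc_context3(codec)))
            return AVERROR(ENOMEM);
        if ((ret = avcodec_open2(avctx, codec, NULL)) < 0)
            return ret;
        ret = avcodec_decode_video2(avctx, out, &got_frame, pkt);
        /* got_frame stays 0 for invisible (altref) frames, see above */
        avcodec_close(avctx);
        av_free(avctx);
        return ret < 0 ? ret : got_frame;
    }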