FFmpeg: vp9.c
1 /*
2  * VP9 compatible video decoder
3  *
4  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5  * Copyright (C) 2013 Clément Bœsch <u pkh me>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "avcodec.h"
25 #include "get_bits.h"
26 #include "internal.h"
27 #include "thread.h"
28 #include "videodsp.h"
29 #include "vp56.h"
30 #include "vp9.h"
31 #include "vp9data.h"
32 #include "vp9dsp.h"
33 #include "libavutil/avassert.h"
34 
35 #define VP9_SYNCCODE 0x498342
36 
37 enum CompPredMode {
38  PRED_SINGLEREF,
39  PRED_COMPREF,
40  PRED_SWITCHABLE,
41 };
42 
43 enum BlockLevel {
44  BL_64X64,
45  BL_32X32,
46  BL_16X16,
47  BL_8X8,
48 };
49 
50 enum BlockSize {
51  BS_64x64,
52  BS_64x32,
53  BS_32x64,
54  BS_32x32,
55  BS_32x16,
56  BS_16x32,
57  BS_16x16,
58  BS_16x8,
59  BS_8x16,
60  BS_8x8,
61  BS_8x4,
62  BS_4x8,
63  BS_4x4,
64  N_BS_SIZES,
65 };
66 
67 struct VP9mvrefPair {
68  VP56mv mv[2];
69  int8_t ref[2];
70 };
71 
72 typedef struct VP9Frame {
73  ThreadFrame tf;
74  AVBufferRef *extradata;
75  uint8_t *segmentation_map;
76  struct VP9mvrefPair *mv;
77 } VP9Frame;
78 
79 struct VP9Filter {
80  uint8_t level[8 * 8];
81  uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
82  [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
83 };
84 
85 typedef struct VP9Block {
86  uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
87  enum FilterMode filter;
88  VP56mv mv[4 /* b_idx */][2 /* ref */];
89  enum BlockSize bs;
90  enum TxfmMode tx, uvtx;
91  enum BlockLevel bl;
93 } VP9Block;
94 
95 typedef struct VP9Context {
101  unsigned c_b_size;
104  int row, row7, col, col7;
106  ptrdiff_t y_stride, uv_stride;
107 
108  // bitstream header
130 #define CUR_FRAME 0
131 #define LAST_FRAME 1
133 
134  struct {
136  int8_t sharpness;
139  } filter;
140  struct {
142  int8_t mode[2];
143  int8_t ref[4];
144  } lf_delta;
148 #define MAX_SEGMENT 8
149  struct {
154  struct {
160  int16_t q_val;
161  int8_t lf_val;
162  int16_t qmul[2][2];
163  uint8_t lflvl[4][2];
164  } feat[MAX_SEGMENT];
165  } segmentation;
166  struct {
168  unsigned tile_cols, tile_rows;
170  } tiling;
171  unsigned sb_cols, sb_rows, rows, cols;
172  struct {
174  uint8_t coef[4][2][2][6][6][3];
175  } prob_ctx[4];
176  struct {
177  prob_context p;
178  uint8_t coef[4][2][2][6][6][11];
181  } prob;
182  struct {
183  unsigned y_mode[4][10];
184  unsigned uv_mode[10][10];
185  unsigned filter[4][3];
186  unsigned mv_mode[7][4];
187  unsigned intra[4][2];
188  unsigned comp[5][2];
189  unsigned single_ref[5][2][2];
190  unsigned comp_ref[5][2];
191  unsigned tx32p[2][4];
192  unsigned tx16p[2][3];
193  unsigned tx8p[2][2];
194  unsigned skip[3][2];
195  unsigned mv_joint[4];
196  struct {
197  unsigned sign[2];
198  unsigned classes[11];
199  unsigned class0[2];
200  unsigned bits[10][2];
201  unsigned class0_fp[2][4];
202  unsigned fp[4];
203  unsigned class0_hp[2];
204  unsigned hp[2];
205  } mv_comp[2];
206  unsigned partition[4][4][4];
207  unsigned coef[4][2][2][6][6][3];
208  unsigned eob[4][2][2][6][6][2];
209  } counts;
212 
213  // contextual (left/above) cache
228  // FIXME maybe merge some of the below in a flags field?
239 
240  // whole-frame cache
242  struct VP9Filter *lflvl;
244 
245  // block reconstruction intermediates
247  int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
249  struct { int x, y; } min_mv, max_mv;
251  DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
252 } VP9Context;
253 
254 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
255  {
256  { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
257  { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
258  }, {
259  { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
260  { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
261  }
262 };
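/* Editorial note (not part of the original source): bwh_tab[0][bs] holds the
 * block width/height in units of 4x4 sub-blocks and bwh_tab[1][bs] the same
 * in units of 8x8 blocks (clamped to a minimum of 1), indexed by enum
 * BlockSize; e.g. a 64x32 block is { 16, 8 } in 4x4 units and { 8, 4 } in
 * 8x8 units. */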
263 
264 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
265 {
266  VP9Context *s = ctx->priv_data;
267  int ret, sz;
268 
269  if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
270  return ret;
271  sz = 64 * s->sb_cols * s->sb_rows;
272  if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
273  ff_thread_release_buffer(ctx, &f->tf);
274  return AVERROR(ENOMEM);
275  }
276 
277  f->segmentation_map = f->extradata->data;
278  f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
279 
280  // retain segmentation map if it doesn't update
281  if (s->segmentation.enabled && !s->segmentation.update_map &&
282  !s->intraonly && !s->keyframe && !s->errorres &&
283  ctx->active_thread_type != FF_THREAD_FRAME) {
284  memcpy(f->segmentation_map, s->frames[LAST_FRAME].segmentation_map, sz);
285  }
286 
287  return 0;
288 }
289 
290 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
291 {
292  ff_thread_release_buffer(ctx, &f->tf);
293  av_buffer_unref(&f->extradata);
294 }
295 
296 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
297 {
298  int res;
299 
300  if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
301  return res;
302  } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
303  vp9_unref_frame(ctx, dst);
304  return AVERROR(ENOMEM);
305  }
306 
307  dst->segmentation_map = src->segmentation_map;
308  dst->mv = src->mv;
309 
310  return 0;
311 }
312 
313 static int update_size(AVCodecContext *ctx, int w, int h)
314 {
315  VP9Context *s = ctx->priv_data;
316  uint8_t *p;
317 
318  av_assert0(w > 0 && h > 0);
319 
320  if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height)
321  return 0;
322 
323  ctx->width = w;
324  ctx->height = h;
325  s->sb_cols = (w + 63) >> 6;
326  s->sb_rows = (h + 63) >> 6;
327  s->cols = (w + 7) >> 3;
328  s->rows = (h + 7) >> 3;
329 
330 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
331  av_freep(&s->intra_pred_data[0]);
332  p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
333  if (!p)
334  return AVERROR(ENOMEM);
335  assign(s->intra_pred_data[0], uint8_t *, 64);
336  assign(s->intra_pred_data[1], uint8_t *, 32);
337  assign(s->intra_pred_data[2], uint8_t *, 32);
338  assign(s->above_y_nnz_ctx, uint8_t *, 16);
339  assign(s->above_mode_ctx, uint8_t *, 16);
340  assign(s->above_mv_ctx, VP56mv(*)[2], 16);
342  assign(s->above_skip_ctx, uint8_t *, 8);
343  assign(s->above_txfm_ctx, uint8_t *, 8);
344  assign(s->above_uv_nnz_ctx[0], uint8_t *, 8);
345  assign(s->above_uv_nnz_ctx[1], uint8_t *, 8);
346  assign(s->above_segpred_ctx, uint8_t *, 8);
347  assign(s->above_intra_ctx, uint8_t *, 8);
348  assign(s->above_comp_ctx, uint8_t *, 8);
349  assign(s->above_ref_ctx, uint8_t *, 8);
350  assign(s->above_filter_ctx, uint8_t *, 8);
351  assign(s->lflvl, struct VP9Filter *, 1);
352 #undef assign
353 
354  // these will be re-allocated a little later
355  av_freep(&s->b_base);
356  av_freep(&s->block_base);
357 
358  return 0;
359 }
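/* Editorial illustration (not part of the original source): the assign()
 * macro above carves a single av_malloc()'d slab into consecutive per-column
 * context arrays, so one allocation/free covers all "above"-row caches.
 * A minimal sketch of the same pattern, with hypothetical names and sizes:
 *
 *     uint8_t *p = av_malloc(s->sb_cols * (64 + 16));
 *     assign(ctx_a, uint8_t *, 64);  // ctx_a = p;      p += sb_cols * 64
 *     assign(ctx_b, uint8_t *, 16);  // ctx_b = old p;  p += sb_cols * 16
 */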
360 
361 static int update_block_buffers(AVCodecContext *ctx)
362 {
363  VP9Context *s = ctx->priv_data;
364 
365  if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->uses_2pass)
366  return 0;
367 
368  av_free(s->b_base);
369  av_free(s->block_base);
370  if (s->uses_2pass) {
371  int sbs = s->sb_cols * s->sb_rows;
372 
373  s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
374  s->block_base = av_mallocz((64 * 64 + 128) * sbs * 3);
375  if (!s->b_base || !s->block_base)
376  return AVERROR(ENOMEM);
377  s->uvblock_base[0] = s->block_base + sbs * 64 * 64;
378  s->uvblock_base[1] = s->uvblock_base[0] + sbs * 32 * 32;
379  s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * 32 * 32);
380  s->uveob_base[0] = s->eob_base + 256 * sbs;
381  s->uveob_base[1] = s->uveob_base[0] + 64 * sbs;
382  } else {
383  s->b_base = av_malloc(sizeof(VP9Block));
384  s->block_base = av_mallocz((64 * 64 + 128) * 3);
385  if (!s->b_base || !s->block_base)
386  return AVERROR(ENOMEM);
387  s->uvblock_base[0] = s->block_base + 64 * 64;
388  s->uvblock_base[1] = s->uvblock_base[0] + 32 * 32;
389  s->eob_base = (uint8_t *) (s->uvblock_base[1] + 32 * 32);
390  s->uveob_base[0] = s->eob_base + 256;
391  s->uveob_base[1] = s->uveob_base[0] + 64;
392  }
393  s->block_alloc_using_2pass = s->uses_2pass;
394 
395  return 0;
396 }
397 
398 // for some reason the sign bit is at the end, not the start, of a bit sequence
399 static int get_sbits_inv(GetBitContext *gb, int n)
400 {
401  int v = get_bits(gb, n);
402  return get_bits1(gb) ? -v : v;
403 }
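/* Editorial example (not part of the original source): with n = 4, the bit
 * sequence 0101 followed by a sign bit of 1 decodes as v = 5 and then -5;
 * a trailing sign bit of 0 would give +5. */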
404 
405 static int inv_recenter_nonneg(int v, int m)
406 {
407  return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
408 }
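/* Editorial example (not part of the original source): inv_recenter_nonneg()
 * maps a coded distance v back to a value near m, alternating below/above m:
 * v = 0, 1, 2, 3, 4, ... -> m, m-1, m+1, m-2, m+2, ...; values v > 2*m (which
 * cannot be re-centered symmetrically) are passed through unchanged.
 * E.g. m = 10: v = 4 -> 12, v = 5 -> 7, v = 25 -> 25. */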
409 
410 // differential forward probability updates
411 static int update_prob(VP56RangeCoder *c, int p)
412 {
413  static const int inv_map_table[254] = {
414  7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
415  189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
416  10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
417  25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
418  40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
419  55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
420  70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
421  86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
422  101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
423  116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
424  131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
425  146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
426  161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
427  177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
428  192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
429  207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
430  222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
431  237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
432  252, 253,
433  };
434  int d;
435 
436  /* This code performs a differential probability update. For a current
437  * probability A in the range [1, 255], the difference to any new
438  * probability lies in the range [1-A, 255-A]. Part of that absolute
439  * range exists on both the positive and the negative side, while the
440  * rest exists on one side only. The shared part is coded
441  * differentially, i.e. doubled, with the lowest bit carrying the sign,
442  * and the one-sided part is then coded on top of that. The resulting
443  * absolute difference again lies in [0, 254], and a bigger value in
444  * this range means we are further away from the original value A, so
445  * it can be coded as a VLC, since higher values are increasingly
446  * unlikely. The first 20 entries of inv_map_table[] provide 'cheap,
447  * rough' updates, as opposed to the 'fine, exact' updates further down
448  * the range, which adds one extra dimension to this differential
449  * update model. */
450 
451  if (!vp8_rac_get(c)) {
452  d = vp8_rac_get_uint(c, 4) + 0;
453  } else if (!vp8_rac_get(c)) {
454  d = vp8_rac_get_uint(c, 4) + 16;
455  } else if (!vp8_rac_get(c)) {
456  d = vp8_rac_get_uint(c, 5) + 32;
457  } else {
458  d = vp8_rac_get_uint(c, 7);
459  if (d >= 65)
460  d = (d << 1) - 65 + vp8_rac_get(c);
461  d += 64;
462  }
463 
464  return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
465  255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
466 }
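/* Editorial worked example (not part of the original source), assuming the
 * range coder yields the first branch: vp8_rac_get() == 0 and then
 * vp8_rac_get_uint(c, 4) == 3, so d = 3 and inv_map_table[3] = 46 (one of
 * the 20 'cheap, rough' steps). For an old probability p = 100 (<= 128) the
 * new value is 1 + inv_recenter_nonneg(46, 99) = 1 + (99 + 46 / 2) = 123. */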
467 
468 static int decode_frame_header(AVCodecContext *ctx,
469  const uint8_t *data, int size, int *ref)
470 {
471  VP9Context *s = ctx->priv_data;
472  int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
473  int last_invisible;
474  const uint8_t *data2;
475 
476  /* general header */
477  if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
478  av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
479  return res;
480  }
481  if (get_bits(&s->gb, 2) != 0x2) { // frame marker
482  av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
483  return AVERROR_INVALIDDATA;
484  }
485  s->profile = get_bits1(&s->gb);
486  if (get_bits1(&s->gb)) { // reserved bit
487  av_log(ctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
488  return AVERROR_INVALIDDATA;
489  }
490  if (get_bits1(&s->gb)) {
491  *ref = get_bits(&s->gb, 3);
492  return 0;
493  }
494  s->last_uses_2pass = s->uses_2pass;
495  s->last_keyframe = s->keyframe;
496  s->keyframe = !get_bits1(&s->gb);
497  last_invisible = s->invisible;
498  s->invisible = !get_bits1(&s->gb);
499  s->errorres = get_bits1(&s->gb);
500  s->use_last_frame_mvs = !s->errorres && !last_invisible;
501  if (s->keyframe) {
502  if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
503  av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
504  return AVERROR_INVALIDDATA;
505  }
506  s->colorspace = get_bits(&s->gb, 3);
507  if (s->colorspace == 7) { // RGB = profile 1
508  av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
509  return AVERROR_INVALIDDATA;
510  }
511  s->fullrange = get_bits1(&s->gb);
512  // for profile 1, here follows the subsampling bits
513  s->refreshrefmask = 0xff;
514  w = get_bits(&s->gb, 16) + 1;
515  h = get_bits(&s->gb, 16) + 1;
516  if (get_bits1(&s->gb)) // display size
517  skip_bits(&s->gb, 32);
518  } else {
519  s->intraonly = s->invisible ? get_bits1(&s->gb) : 0;
520  s->resetctx = s->errorres ? 0 : get_bits(&s->gb, 2);
521  if (s->intraonly) {
522  if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
523  av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
524  return AVERROR_INVALIDDATA;
525  }
526  s->refreshrefmask = get_bits(&s->gb, 8);
527  w = get_bits(&s->gb, 16) + 1;
528  h = get_bits(&s->gb, 16) + 1;
529  if (get_bits1(&s->gb)) // display size
530  skip_bits(&s->gb, 32);
531  } else {
532  s->refreshrefmask = get_bits(&s->gb, 8);
533  s->refidx[0] = get_bits(&s->gb, 3);
534  s->signbias[0] = get_bits1(&s->gb);
535  s->refidx[1] = get_bits(&s->gb, 3);
536  s->signbias[1] = get_bits1(&s->gb);
537  s->refidx[2] = get_bits(&s->gb, 3);
538  s->signbias[2] = get_bits1(&s->gb);
539  if (!s->refs[s->refidx[0]].f->data[0] ||
540  !s->refs[s->refidx[1]].f->data[0] ||
541  !s->refs[s->refidx[2]].f->data[0]) {
542  av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
543  return AVERROR_INVALIDDATA;
544  }
545  if (get_bits1(&s->gb)) {
546  w = s->refs[s->refidx[0]].f->width;
547  h = s->refs[s->refidx[0]].f->height;
548  } else if (get_bits1(&s->gb)) {
549  w = s->refs[s->refidx[1]].f->width;
550  h = s->refs[s->refidx[1]].f->height;
551  } else if (get_bits1(&s->gb)) {
552  w = s->refs[s->refidx[2]].f->width;
553  h = s->refs[s->refidx[2]].f->height;
554  } else {
555  w = get_bits(&s->gb, 16) + 1;
556  h = get_bits(&s->gb, 16) + 1;
557  }
558  // Note that at this point "CUR_FRAME" has not been (re)allocated for
559  // the frame being decoded yet, and thus actually still represents
560  // the _last_ frame
561  s->use_last_frame_mvs &= s->frames[CUR_FRAME].tf.f->width == w &&
562  s->frames[CUR_FRAME].tf.f->height == h;
563  if (get_bits1(&s->gb)) // display size
564  skip_bits(&s->gb, 32);
565  s->highprecisionmvs = get_bits1(&s->gb);
566  s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
567  get_bits(&s->gb, 2);
568  s->allowcompinter = s->signbias[0] != s->signbias[1] ||
569  s->signbias[0] != s->signbias[2];
570  if (s->allowcompinter) {
571  if (s->signbias[0] == s->signbias[1]) {
572  s->fixcompref = 2;
573  s->varcompref[0] = 0;
574  s->varcompref[1] = 1;
575  } else if (s->signbias[0] == s->signbias[2]) {
576  s->fixcompref = 1;
577  s->varcompref[0] = 0;
578  s->varcompref[1] = 2;
579  } else {
580  s->fixcompref = 0;
581  s->varcompref[0] = 1;
582  s->varcompref[1] = 2;
583  }
584  }
585  }
586  }
587  s->refreshctx = s->errorres ? 0 : get_bits1(&s->gb);
588  s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
589  s->framectxid = c = get_bits(&s->gb, 2);
590 
591  /* loopfilter header data */
592  s->filter.level = get_bits(&s->gb, 6);
593  sharp = get_bits(&s->gb, 3);
594  // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
595  // the old cache values since they are still valid
596  if (s->filter.sharpness != sharp)
597  memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
598  s->filter.sharpness = sharp;
599  if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
600  if (get_bits1(&s->gb)) {
601  for (i = 0; i < 4; i++)
602  if (get_bits1(&s->gb))
603  s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
604  for (i = 0; i < 2; i++)
605  if (get_bits1(&s->gb))
606  s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
607  }
608  } else {
609  memset(&s->lf_delta, 0, sizeof(s->lf_delta));
610  }
611 
612  /* quantization header data */
613  s->yac_qi = get_bits(&s->gb, 8);
614  s->ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
615  s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
616  s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
617  s->lossless = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
618  s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
619 
620  /* segmentation header info */
621  if ((s->segmentation.enabled = get_bits1(&s->gb))) {
622  if ((s->segmentation.update_map = get_bits1(&s->gb))) {
623  for (i = 0; i < 7; i++)
624  s->prob.seg[i] = get_bits1(&s->gb) ?
625  get_bits(&s->gb, 8) : 255;
626  if ((s->segmentation.temporal = get_bits1(&s->gb))) {
627  for (i = 0; i < 3; i++)
628  s->prob.segpred[i] = get_bits1(&s->gb) ?
629  get_bits(&s->gb, 8) : 255;
630  }
631  }
632  if ((!s->segmentation.update_map || s->segmentation.temporal) &&
633  (w != s->frames[CUR_FRAME].tf.f->width ||
634  h != s->frames[CUR_FRAME].tf.f->height)) {
635  av_log(ctx, AV_LOG_ERROR,
636  "Reference segmap (temp=%d,update=%d) enabled on size-change!\n",
637  s->segmentation.temporal, s->segmentation.update_map);
638  return AVERROR_INVALIDDATA;
639  }
640 
641  if (get_bits1(&s->gb)) {
642  s->segmentation.absolute_vals = get_bits1(&s->gb);
643  for (i = 0; i < 8; i++) {
644  if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
645  s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
646  if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
647  s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
648  if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
649  s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
650  s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
651  }
652  }
653  } else {
654  s->segmentation.feat[0].q_enabled = 0;
655  s->segmentation.feat[0].lf_enabled = 0;
656  s->segmentation.feat[0].skip_enabled = 0;
657  s->segmentation.feat[0].ref_enabled = 0;
658  }
659 
660  // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
661  for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
662  int qyac, qydc, quvac, quvdc, lflvl, sh;
663 
664  if (s->segmentation.feat[i].q_enabled) {
665  if (s->segmentation.absolute_vals)
666  qyac = s->segmentation.feat[i].q_val;
667  else
668  qyac = s->yac_qi + s->segmentation.feat[i].q_val;
669  } else {
670  qyac = s->yac_qi;
671  }
672  qydc = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
673  quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
674  quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
675  qyac = av_clip_uintp2(qyac, 8);
676 
677  s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[qydc];
678  s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[qyac];
679  s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[quvdc];
680  s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[quvac];
681 
682  sh = s->filter.level >= 32;
683  if (s->segmentation.feat[i].lf_enabled) {
684  if (s->segmentation.absolute_vals)
685  lflvl = s->segmentation.feat[i].lf_val;
686  else
687  lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
688  } else {
689  lflvl = s->filter.level;
690  }
691  s->segmentation.feat[i].lflvl[0][0] =
692  s->segmentation.feat[i].lflvl[0][1] =
693  av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
694  for (j = 1; j < 4; j++) {
695  s->segmentation.feat[i].lflvl[j][0] =
696  av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
697  s->lf_delta.mode[0]) << sh), 6);
698  s->segmentation.feat[i].lflvl[j][1] =
699  av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
700  s->lf_delta.mode[1]) << sh), 6);
701  }
702  }
703 
704  /* tiling info */
705  if ((res = update_size(ctx, w, h)) < 0) {
706  av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d\n", w, h);
707  return res;
708  }
709  for (s->tiling.log2_tile_cols = 0;
710  (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
711  s->tiling.log2_tile_cols++) ;
712  for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
713  max = FFMAX(0, max - 1);
714  while (max > s->tiling.log2_tile_cols) {
715  if (get_bits1(&s->gb))
716  s->tiling.log2_tile_cols++;
717  else
718  break;
719  }
720  s->tiling.log2_tile_rows = decode012(&s->gb);
721  s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
722  if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
723  s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
724  s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
725  sizeof(VP56RangeCoder) * s->tiling.tile_cols);
726  if (!s->c_b) {
727  av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
728  return AVERROR(ENOMEM);
729  }
730  }
731 
732  if (s->keyframe || s->errorres || s->intraonly) {
733  s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
734  s->prob_ctx[3].p = vp9_default_probs;
735  memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
736  sizeof(vp9_default_coef_probs));
737  memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
738  sizeof(vp9_default_coef_probs));
739  memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
740  sizeof(vp9_default_coef_probs));
741  memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
742  sizeof(vp9_default_coef_probs));
743  }
744 
745  // the next 16 bits give the size of the rest of the header (arith-coded)
746  size2 = get_bits(&s->gb, 16);
747  data2 = align_get_bits(&s->gb);
748  if (size2 > size - (data2 - data)) {
749  av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
750  return AVERROR_INVALIDDATA;
751  }
752  ff_vp56_init_range_decoder(&s->c, data2, size2);
753  if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
754  av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
755  return AVERROR_INVALIDDATA;
756  }
757 
758  if (s->keyframe || s->intraonly) {
759  memset(s->counts.coef, 0, sizeof(s->counts.coef) + sizeof(s->counts.eob));
760  } else {
761  memset(&s->counts, 0, sizeof(s->counts));
762  }
763  // FIXME is it faster to not copy here, but do it down in the fw updates
764  // as explicit copies if the fw update is missing (and skip the copy upon
765  // fw update)?
766  s->prob.p = s->prob_ctx[c].p;
767 
768  // txfm updates
769  if (s->lossless) {
770  s->txfmmode = TX_4X4;
771  } else {
772  s->txfmmode = vp8_rac_get_uint(&s->c, 2);
773  if (s->txfmmode == 3)
774  s->txfmmode += vp8_rac_get(&s->c);
775 
776  if (s->txfmmode == TX_SWITCHABLE) {
777  for (i = 0; i < 2; i++)
778  if (vp56_rac_get_prob_branchy(&s->c, 252))
779  s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
780  for (i = 0; i < 2; i++)
781  for (j = 0; j < 2; j++)
782  if (vp56_rac_get_prob_branchy(&s->c, 252))
783  s->prob.p.tx16p[i][j] =
784  update_prob(&s->c, s->prob.p.tx16p[i][j]);
785  for (i = 0; i < 2; i++)
786  for (j = 0; j < 3; j++)
787  if (vp56_rac_get_prob_branchy(&s->c, 252))
788  s->prob.p.tx32p[i][j] =
789  update_prob(&s->c, s->prob.p.tx32p[i][j]);
790  }
791  }
792 
793  // coef updates
794  for (i = 0; i < 4; i++) {
795  uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
796  if (vp8_rac_get(&s->c)) {
797  for (j = 0; j < 2; j++)
798  for (k = 0; k < 2; k++)
799  for (l = 0; l < 6; l++)
800  for (m = 0; m < 6; m++) {
801  uint8_t *p = s->prob.coef[i][j][k][l][m];
802  uint8_t *r = ref[j][k][l][m];
803  if (m >= 3 && l == 0) // dc only has 3 pt
804  break;
805  for (n = 0; n < 3; n++) {
806  if (vp56_rac_get_prob_branchy(&s->c, 252)) {
807  p[n] = update_prob(&s->c, r[n]);
808  } else {
809  p[n] = r[n];
810  }
811  }
812  p[3] = 0;
813  }
814  } else {
815  for (j = 0; j < 2; j++)
816  for (k = 0; k < 2; k++)
817  for (l = 0; l < 6; l++)
818  for (m = 0; m < 6; m++) {
819  uint8_t *p = s->prob.coef[i][j][k][l][m];
820  uint8_t *r = ref[j][k][l][m];
821  if (m > 3 && l == 0) // dc only has 3 pt
822  break;
823  memcpy(p, r, 3);
824  p[3] = 0;
825  }
826  }
827  if (s->txfmmode == i)
828  break;
829  }
830 
831  // mode updates
832  for (i = 0; i < 3; i++)
833  if (vp56_rac_get_prob_branchy(&s->c, 252))
834  s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
835  if (!s->keyframe && !s->intraonly) {
836  for (i = 0; i < 7; i++)
837  for (j = 0; j < 3; j++)
838  if (vp56_rac_get_prob_branchy(&s->c, 252))
839  s->prob.p.mv_mode[i][j] =
840  update_prob(&s->c, s->prob.p.mv_mode[i][j]);
841 
842  if (s->filtermode == FILTER_SWITCHABLE)
843  for (i = 0; i < 4; i++)
844  for (j = 0; j < 2; j++)
845  if (vp56_rac_get_prob_branchy(&s->c, 252))
846  s->prob.p.filter[i][j] =
847  update_prob(&s->c, s->prob.p.filter[i][j]);
848 
849  for (i = 0; i < 4; i++)
850  if (vp56_rac_get_prob_branchy(&s->c, 252))
851  s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
852 
853  if (s->allowcompinter) {
854  s->comppredmode = vp8_rac_get(&s->c);
855  if (s->comppredmode)
856  s->comppredmode += vp8_rac_get(&s->c);
857  if (s->comppredmode == PRED_SWITCHABLE)
858  for (i = 0; i < 5; i++)
859  if (vp56_rac_get_prob_branchy(&s->c, 252))
860  s->prob.p.comp[i] =
861  update_prob(&s->c, s->prob.p.comp[i]);
862  } else {
863  s->comppredmode = PRED_SINGLEREF;
864  }
865 
866  if (s->comppredmode != PRED_COMPREF) {
867  for (i = 0; i < 5; i++) {
868  if (vp56_rac_get_prob_branchy(&s->c, 252))
869  s->prob.p.single_ref[i][0] =
870  update_prob(&s->c, s->prob.p.single_ref[i][0]);
871  if (vp56_rac_get_prob_branchy(&s->c, 252))
872  s->prob.p.single_ref[i][1] =
873  update_prob(&s->c, s->prob.p.single_ref[i][1]);
874  }
875  }
876 
877  if (s->comppredmode != PRED_SINGLEREF) {
878  for (i = 0; i < 5; i++)
879  if (vp56_rac_get_prob_branchy(&s->c, 252))
880  s->prob.p.comp_ref[i] =
881  update_prob(&s->c, s->prob.p.comp_ref[i]);
882  }
883 
884  for (i = 0; i < 4; i++)
885  for (j = 0; j < 9; j++)
886  if (vp56_rac_get_prob_branchy(&s->c, 252))
887  s->prob.p.y_mode[i][j] =
888  update_prob(&s->c, s->prob.p.y_mode[i][j]);
889 
890  for (i = 0; i < 4; i++)
891  for (j = 0; j < 4; j++)
892  for (k = 0; k < 3; k++)
893  if (vp56_rac_get_prob_branchy(&s->c, 252))
894  s->prob.p.partition[3 - i][j][k] =
895  update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
896 
897  // mv fields don't use the update_prob subexp model for some reason
898  for (i = 0; i < 3; i++)
899  if (vp56_rac_get_prob_branchy(&s->c, 252))
900  s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
901 
902  for (i = 0; i < 2; i++) {
903  if (vp56_rac_get_prob_branchy(&s->c, 252))
904  s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
905 
906  for (j = 0; j < 10; j++)
907  if (vp56_rac_get_prob_branchy(&s->c, 252))
908  s->prob.p.mv_comp[i].classes[j] =
909  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
910 
911  if (vp56_rac_get_prob_branchy(&s->c, 252))
912  s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
913 
914  for (j = 0; j < 10; j++)
915  if (vp56_rac_get_prob_branchy(&s->c, 252))
916  s->prob.p.mv_comp[i].bits[j] =
917  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
918  }
919 
920  for (i = 0; i < 2; i++) {
921  for (j = 0; j < 2; j++)
922  for (k = 0; k < 3; k++)
923  if (vp56_rac_get_prob_branchy(&s->c, 252))
924  s->prob.p.mv_comp[i].class0_fp[j][k] =
925  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
926 
927  for (j = 0; j < 3; j++)
928  if (vp56_rac_get_prob_branchy(&s->c, 252))
929  s->prob.p.mv_comp[i].fp[j] =
930  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
931  }
932 
933  if (s->highprecisionmvs) {
934  for (i = 0; i < 2; i++) {
935  if (vp56_rac_get_prob_branchy(&s->c, 252))
936  s->prob.p.mv_comp[i].class0_hp =
937  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
938 
939  if (vp56_rac_get_prob_branchy(&s->c, 252))
940  s->prob.p.mv_comp[i].hp =
941  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
942  }
943  }
944  }
945 
946  return (data2 - data) + size2;
947 }
948 
949 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
950  VP9Context *s)
951 {
952  dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
953  dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
954 }
955 
956 static void find_ref_mvs(VP9Context *s,
957  VP56mv *pmv, int ref, int z, int idx, int sb)
958 {
959  static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
960  [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
961  { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
962  [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
963  { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
964  [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
965  { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
966  [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
967  { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
968  [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
969  { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
970  [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
971  { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
972  [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
973  { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
974  [BS_16x8] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
975  { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
976  [BS_8x16] = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
977  { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
978  [BS_8x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
979  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
980  [BS_8x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
981  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
982  [BS_4x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
983  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
984  [BS_4x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
985  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
986  };
987  VP9Block *b = s->b;
988  int row = s->row, col = s->col, row7 = s->row7;
989  const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
990 #define INVALID_MV 0x80008000U
991  uint32_t mem = INVALID_MV;
992  int i;
993 
994 #define RETURN_DIRECT_MV(mv) \
995  do { \
996  uint32_t m = AV_RN32A(&mv); \
997  if (!idx) { \
998  AV_WN32A(pmv, m); \
999  return; \
1000  } else if (mem == INVALID_MV) { \
1001  mem = m; \
1002  } else if (m != mem) { \
1003  AV_WN32A(pmv, m); \
1004  return; \
1005  } \
1006  } while (0)
1007 
1008  if (sb >= 0) {
1009  if (sb == 2 || sb == 1) {
1010  RETURN_DIRECT_MV(b->mv[0][z]);
1011  } else if (sb == 3) {
1012  RETURN_DIRECT_MV(b->mv[2][z]);
1013  RETURN_DIRECT_MV(b->mv[1][z]);
1014  RETURN_DIRECT_MV(b->mv[0][z]);
1015  }
1016 
1017 #define RETURN_MV(mv) \
1018  do { \
1019  if (sb > 0) { \
1020  VP56mv tmp; \
1021  uint32_t m; \
1022  clamp_mv(&tmp, &mv, s); \
1023  m = AV_RN32A(&tmp); \
1024  if (!idx) { \
1025  AV_WN32A(pmv, m); \
1026  return; \
1027  } else if (mem == INVALID_MV) { \
1028  mem = m; \
1029  } else if (m != mem) { \
1030  AV_WN32A(pmv, m); \
1031  return; \
1032  } \
1033  } else { \
1034  uint32_t m = AV_RN32A(&mv); \
1035  if (!idx) { \
1036  clamp_mv(pmv, &mv, s); \
1037  return; \
1038  } else if (mem == INVALID_MV) { \
1039  mem = m; \
1040  } else if (m != mem) { \
1041  clamp_mv(pmv, &mv, s); \
1042  return; \
1043  } \
1044  } \
1045  } while (0)
1046 
1047  if (row > 0) {
1048  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1049  if (mv->ref[0] == ref) {
1050  RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1051  } else if (mv->ref[1] == ref) {
1052  RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
1053  }
1054  }
1055  if (col > s->tiling.tile_col_start) {
1056  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1057  if (mv->ref[0] == ref) {
1058  RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1059  } else if (mv->ref[1] == ref) {
1060  RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1061  }
1062  }
1063  i = 2;
1064  } else {
1065  i = 0;
1066  }
1067 
1068  // previously coded MVs in this neighbourhood, using same reference frame
1069  for (; i < 8; i++) {
1070  int c = p[i][0] + col, r = p[i][1] + row;
1071 
1072  if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1073  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1074 
1075  if (mv->ref[0] == ref) {
1076  RETURN_MV(mv->mv[0]);
1077  } else if (mv->ref[1] == ref) {
1078  RETURN_MV(mv->mv[1]);
1079  }
1080  }
1081  }
1082 
1083  // MV at this position in previous frame, using same reference frame
1084  if (s->use_last_frame_mvs) {
1085  struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1086 
1087  if (!s->last_uses_2pass)
1088  ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1089  if (mv->ref[0] == ref) {
1090  RETURN_MV(mv->mv[0]);
1091  } else if (mv->ref[1] == ref) {
1092  RETURN_MV(mv->mv[1]);
1093  }
1094  }
1095 
1096 #define RETURN_SCALE_MV(mv, scale) \
1097  do { \
1098  if (scale) { \
1099  VP56mv mv_temp = { -mv.x, -mv.y }; \
1100  RETURN_MV(mv_temp); \
1101  } else { \
1102  RETURN_MV(mv); \
1103  } \
1104  } while (0)
1105 
1106  // previously coded MVs in this neighbourhood, using different reference frame
1107  for (i = 0; i < 8; i++) {
1108  int c = p[i][0] + col, r = p[i][1] + row;
1109 
1110  if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1111  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1112 
1113  if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1114  RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1115  }
1116  if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1117  // BUG - libvpx has this condition regardless of whether
1118  // we used the first ref MV and pre-scaling
1119  AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1120  RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1121  }
1122  }
1123  }
1124 
1125  // MV at this position in previous frame, using different reference frame
1126  if (s->use_last_frame_mvs) {
1127  struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1128 
1129  // no need to await_progress, because we already did that above
1130  if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1131  RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1132  }
1133  if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1134  // BUG - libvpx has this condition regardless of whether
1135  // we used the first ref MV and pre-scaling
1136  AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1137  RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1138  }
1139  }
1140 
1141  AV_ZERO32(pmv);
1142 #undef INVALID_MV
1143 #undef RETURN_MV
1144 #undef RETURN_SCALE_MV
1145 }
1146 
1147 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1148 {
1149  int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1150  int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1151  s->prob.p.mv_comp[idx].classes);
1152 
1153  s->counts.mv_comp[idx].sign[sign]++;
1154  s->counts.mv_comp[idx].classes[c]++;
1155  if (c) {
1156  int m;
1157 
1158  for (n = 0, m = 0; m < c; m++) {
1159  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1160  n |= bit << m;
1161  s->counts.mv_comp[idx].bits[m][bit]++;
1162  }
1163  n <<= 3;
1164  bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1165  n |= bit << 1;
1166  s->counts.mv_comp[idx].fp[bit]++;
1167  if (hp) {
1168  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1169  s->counts.mv_comp[idx].hp[bit]++;
1170  n |= bit;
1171  } else {
1172  n |= 1;
1173  // bug in libvpx - we count for bw entropy purposes even if the
1174  // bit wasn't coded
1175  s->counts.mv_comp[idx].hp[1]++;
1176  }
1177  n += 8 << c;
1178  } else {
1179  n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1180  s->counts.mv_comp[idx].class0[n]++;
1181  bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1182  s->prob.p.mv_comp[idx].class0_fp[n]);
1183  s->counts.mv_comp[idx].class0_fp[n][bit]++;
1184  n = (n << 3) | (bit << 1);
1185  if (hp) {
1186  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1187  s->counts.mv_comp[idx].class0_hp[bit]++;
1188  n |= bit;
1189  } else {
1190  n |= 1;
1191  // bug in libvpx - we count for bw entropy purposes even if the
1192  // bit wasn't coded
1193  s->counts.mv_comp[idx].class0_hp[1]++;
1194  }
1195  }
1196 
1197  return sign ? -(n + 1) : (n + 1);
1198 }
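/* Editorial worked example (not part of the original source): for class c = 1
 * the magnitude is assembled as ((integer bits) << 3 | fp << 1 | hp) + (8 << c),
 * in 1/8-pel units. E.g. integer bit = 1, fp = 2, hp = 1 gives
 * n = (1 << 3 | 2 << 1 | 1) + 16 = 29, and with sign = 1 the function
 * returns -(29 + 1) = -30. */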
1199 
1200 static void fill_mv(VP9Context *s,
1201  VP56mv *mv, int mode, int sb)
1202 {
1203  VP9Block *b = s->b;
1204 
1205  if (mode == ZEROMV) {
1206  AV_ZERO64(mv);
1207  } else {
1208  int hp;
1209 
1210  // FIXME cache this value and reuse for other subblocks
1211  find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1212  mode == NEWMV ? -1 : sb);
1213  // FIXME maybe move this code into find_ref_mvs()
1214  if ((mode == NEWMV || sb == -1) &&
1215  !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1216  if (mv[0].y & 1) {
1217  if (mv[0].y < 0)
1218  mv[0].y++;
1219  else
1220  mv[0].y--;
1221  }
1222  if (mv[0].x & 1) {
1223  if (mv[0].x < 0)
1224  mv[0].x++;
1225  else
1226  mv[0].x--;
1227  }
1228  }
1229  if (mode == NEWMV) {
1230  enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1231  s->prob.p.mv_joint);
1232 
1233  s->counts.mv_joint[j]++;
1234  if (j >= MV_JOINT_V)
1235  mv[0].y += read_mv_component(s, 0, hp);
1236  if (j & 1)
1237  mv[0].x += read_mv_component(s, 1, hp);
1238  }
1239 
1240  if (b->comp) {
1241  // FIXME cache this value and reuse for other subblocks
1242  find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1243  mode == NEWMV ? -1 : sb);
1244  if ((mode == NEWMV || sb == -1) &&
1245  !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1246  if (mv[1].y & 1) {
1247  if (mv[1].y < 0)
1248  mv[1].y++;
1249  else
1250  mv[1].y--;
1251  }
1252  if (mv[1].x & 1) {
1253  if (mv[1].x < 0)
1254  mv[1].x++;
1255  else
1256  mv[1].x--;
1257  }
1258  }
1259  if (mode == NEWMV) {
1260  enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1261  s->prob.p.mv_joint);
1262 
1263  s->counts.mv_joint[j]++;
1264  if (j >= MV_JOINT_V)
1265  mv[1].y += read_mv_component(s, 0, hp);
1266  if (j & 1)
1267  mv[1].x += read_mv_component(s, 1, hp);
1268  }
1269  }
1270  }
1271 }
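/* Editorial note (not part of the original source): the odd/even fixup above
 * rounds each odd MV component towards zero when high-precision MVs are
 * disabled or the vector is too large, so the 1/8-pel value becomes a
 * multiple of 2 (quarter-pel precision), e.g. y = -7 becomes -6 and
 * y = 7 becomes 6. */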
1272 
1273 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1274  ptrdiff_t stride, int v)
1275 {
1276  switch (w) {
1277  case 1:
1278  do {
1279  *ptr = v;
1280  ptr += stride;
1281  } while (--h);
1282  break;
1283  case 2: {
1284  int v16 = v * 0x0101;
1285  do {
1286  AV_WN16A(ptr, v16);
1287  ptr += stride;
1288  } while (--h);
1289  break;
1290  }
1291  case 4: {
1292  uint32_t v32 = v * 0x01010101;
1293  do {
1294  AV_WN32A(ptr, v32);
1295  ptr += stride;
1296  } while (--h);
1297  break;
1298  }
1299  case 8: {
1300 #if HAVE_FAST_64BIT
1301  uint64_t v64 = v * 0x0101010101010101ULL;
1302  do {
1303  AV_WN64A(ptr, v64);
1304  ptr += stride;
1305  } while (--h);
1306 #else
1307  uint32_t v32 = v * 0x01010101;
1308  do {
1309  AV_WN32A(ptr, v32);
1310  AV_WN32A(ptr + 4, v32);
1311  ptr += stride;
1312  } while (--h);
1313 #endif
1314  break;
1315  }
1316  }
1317 }
1318 
1319 static void decode_mode(AVCodecContext *ctx)
1320 {
1321  static const uint8_t left_ctx[N_BS_SIZES] = {
1322  0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1323  };
1324  static const uint8_t above_ctx[N_BS_SIZES] = {
1325  0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
1326  };
1327  static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1328  TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1329  TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1330  };
1331  VP9Context *s = ctx->priv_data;
1332  VP9Block *b = s->b;
1333  int row = s->row, col = s->col, row7 = s->row7;
1334  enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
1335  int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
1336  int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]), y;
1337  int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
1338  int vref, filter_id;
1339 
1340  if (!s->segmentation.enabled) {
1341  b->seg_id = 0;
1342  } else if (s->keyframe || s->intraonly) {
1343  b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg);
1344  } else if (!s->segmentation.update_map ||
1345  (s->segmentation.temporal &&
1346  vp56_rac_get_prob_branchy(&s->c,
1347  s->prob.segpred[s->above_segpred_ctx[col] +
1348  s->left_segpred_ctx[row7]]))) {
1349  if (!s->errorres) {
1350  int pred = 8, x;
1351  uint8_t *refsegmap = s->frames[LAST_FRAME].segmentation_map;
1352 
1353  if (!s->last_uses_2pass)
1354  ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1355  for (y = 0; y < h4; y++) {
1356  int idx_base = (y + row) * 8 * s->sb_cols + col;
1357  for (x = 0; x < w4; x++)
1358  pred = FFMIN(pred, refsegmap[idx_base + x]);
1360  // FIXME maybe retain reference to previous frame as
1361  // segmap reference instead of copying the whole map
1362  // into a new buffer
1363  memcpy(&s->frames[CUR_FRAME].segmentation_map[idx_base],
1364  &refsegmap[idx_base], w4);
1365  }
1366  }
1367  av_assert1(pred < 8);
1368  b->seg_id = pred;
1369  } else {
1370  b->seg_id = 0;
1371  }
1372 
1373  memset(&s->above_segpred_ctx[col], 1, w4);
1374  memset(&s->left_segpred_ctx[row7], 1, h4);
1375  } else {
1376  b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1377  s->prob.seg);
1378 
1379  memset(&s->above_segpred_ctx[col], 0, w4);
1380  memset(&s->left_segpred_ctx[row7], 0, h4);
1381  }
1382  if (s->segmentation.enabled &&
1383  (s->segmentation.update_map || s->keyframe || s->intraonly)) {
1384  setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1385  w4, h4, 8 * s->sb_cols, b->seg_id);
1386  }
1387 
1388  b->skip = s->segmentation.enabled &&
1389  s->segmentation.feat[b->seg_id].skip_enabled;
1390  if (!b->skip) {
1391  int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1392  b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1393  s->counts.skip[c][b->skip]++;
1394  }
1395 
1396  if (s->keyframe || s->intraonly) {
1397  b->intra = 1;
1398  } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
1399  b->intra = !s->segmentation.feat[b->seg_id].ref_val;
1400  } else {
1401  int c, bit;
1402 
1403  if (have_a && have_l) {
1404  c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1405  c += (c == 2);
1406  } else {
1407  c = have_a ? 2 * s->above_intra_ctx[col] :
1408  have_l ? 2 * s->left_intra_ctx[row7] : 0;
1409  }
1410  bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1411  s->counts.intra[c][bit]++;
1412  b->intra = !bit;
1413  }
1414 
1415  if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
1416  int c;
1417  if (have_a) {
1418  if (have_l) {
1419  c = (s->above_skip_ctx[col] ? max_tx :
1420  s->above_txfm_ctx[col]) +
1421  (s->left_skip_ctx[row7] ? max_tx :
1422  s->left_txfm_ctx[row7]) > max_tx;
1423  } else {
1424  c = s->above_skip_ctx[col] ? 1 :
1425  (s->above_txfm_ctx[col] * 2 > max_tx);
1426  }
1427  } else if (have_l) {
1428  c = s->left_skip_ctx[row7] ? 1 :
1429  (s->left_txfm_ctx[row7] * 2 > max_tx);
1430  } else {
1431  c = 1;
1432  }
1433  switch (max_tx) {
1434  case TX_32X32:
1435  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1436  if (b->tx) {
1437  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1438  if (b->tx == 2)
1439  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1440  }
1441  s->counts.tx32p[c][b->tx]++;
1442  break;
1443  case TX_16X16:
1444  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1445  if (b->tx)
1446  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1447  s->counts.tx16p[c][b->tx]++;
1448  break;
1449  case TX_8X8:
1450  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1451  s->counts.tx8p[c][b->tx]++;
1452  break;
1453  case TX_4X4:
1454  b->tx = TX_4X4;
1455  break;
1456  }
1457  } else {
1458  b->tx = FFMIN(max_tx, s->txfmmode);
1459  }
1460 
1461  if (s->keyframe || s->intraonly) {
1462  uint8_t *a = &s->above_mode_ctx[col * 2];
1463  uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1464 
1465  b->comp = 0;
1466  if (b->bs > BS_8x8) {
1467  // FIXME the memory storage intermediates here aren't really
1468  // necessary, they're just there to make the code slightly
1469  // simpler for now
1470  b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1471  vp9_default_kf_ymode_probs[a[0]][l[0]]);
1472  if (b->bs != BS_8x4) {
1473  b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1474  vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1475  l[0] = a[1] = b->mode[1];
1476  } else {
1477  l[0] = a[1] = b->mode[1] = b->mode[0];
1478  }
1479  if (b->bs != BS_4x8) {
1480  b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1481  vp9_default_kf_ymode_probs[a[0]][l[1]]);
1482  if (b->bs != BS_8x4) {
1483  b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1484  vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1485  l[1] = a[1] = b->mode[3];
1486  } else {
1487  l[1] = a[1] = b->mode[3] = b->mode[2];
1488  }
1489  } else {
1490  b->mode[2] = b->mode[0];
1491  l[1] = a[1] = b->mode[3] = b->mode[1];
1492  }
1493  } else {
1494  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1495  vp9_default_kf_ymode_probs[*a][*l]);
1496  b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1497  // FIXME this can probably be optimized
1498  memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1499  memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1500  }
1501  b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1502  vp9_default_kf_uvmode_probs[b->mode[3]]);
1503  } else if (b->intra) {
1504  b->comp = 0;
1505  if (b->bs > BS_8x8) {
1506  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1507  s->prob.p.y_mode[0]);
1508  s->counts.y_mode[0][b->mode[0]]++;
1509  if (b->bs != BS_8x4) {
1510  b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1511  s->prob.p.y_mode[0]);
1512  s->counts.y_mode[0][b->mode[1]]++;
1513  } else {
1514  b->mode[1] = b->mode[0];
1515  }
1516  if (b->bs != BS_4x8) {
1517  b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1518  s->prob.p.y_mode[0]);
1519  s->counts.y_mode[0][b->mode[2]]++;
1520  if (b->bs != BS_8x4) {
1521  b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1522  s->prob.p.y_mode[0]);
1523  s->counts.y_mode[0][b->mode[3]]++;
1524  } else {
1525  b->mode[3] = b->mode[2];
1526  }
1527  } else {
1528  b->mode[2] = b->mode[0];
1529  b->mode[3] = b->mode[1];
1530  }
1531  } else {
1532  static const uint8_t size_group[10] = {
1533  3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1534  };
1535  int sz = size_group[b->bs];
1536 
1537  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1538  s->prob.p.y_mode[sz]);
1539  b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1540  s->counts.y_mode[sz][b->mode[3]]++;
1541  }
1542  b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1543  s->prob.p.uv_mode[b->mode[3]]);
1544  s->counts.uv_mode[b->mode[3]][b->uvmode]++;
1545  } else {
1546  static const uint8_t inter_mode_ctx_lut[14][14] = {
1547  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1548  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1549  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1550  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1551  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1552  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1553  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1554  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1555  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1556  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1557  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1558  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1559  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1560  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
1561  };
1562 
1563  if (s->segmentation.feat[b->seg_id].ref_enabled) {
1564  av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
1565  b->comp = 0;
1566  b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
1567  } else {
1568  // read comp_pred flag
1569  if (s->comppredmode != PRED_SWITCHABLE) {
1570  b->comp = s->comppredmode == PRED_COMPREF;
1571  } else {
1572  int c;
1573 
1574  // FIXME add intra as ref=0xff (or -1) to make these easier?
1575  if (have_a) {
1576  if (have_l) {
1577  if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1578  c = 4;
1579  } else if (s->above_comp_ctx[col]) {
1580  c = 2 + (s->left_intra_ctx[row7] ||
1581  s->left_ref_ctx[row7] == s->fixcompref);
1582  } else if (s->left_comp_ctx[row7]) {
1583  c = 2 + (s->above_intra_ctx[col] ||
1584  s->above_ref_ctx[col] == s->fixcompref);
1585  } else {
1586  c = (!s->above_intra_ctx[col] &&
1587  s->above_ref_ctx[col] == s->fixcompref) ^
1588  (!s->left_intra_ctx[row7] &&
1589  s->left_ref_ctx[row & 7] == s->fixcompref);
1590  }
1591  } else {
1592  c = s->above_comp_ctx[col] ? 3 :
1593  (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
1594  }
1595  } else if (have_l) {
1596  c = s->left_comp_ctx[row7] ? 3 :
1597  (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
1598  } else {
1599  c = 1;
1600  }
1601  b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1602  s->counts.comp[c][b->comp]++;
1603  }
1604 
1605  // read actual references
1606  // FIXME probably cache a few variables here to prevent repetitive
1607  // memory accesses below
1608  if (b->comp) /* two references */ {
1609  int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
1610 
1611  b->ref[fix_idx] = s->fixcompref;
1612  // FIXME can this codeblob be replaced by some sort of LUT?
1613  if (have_a) {
1614  if (have_l) {
1615  if (s->above_intra_ctx[col]) {
1616  if (s->left_intra_ctx[row7]) {
1617  c = 2;
1618  } else {
1619  c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1620  }
1621  } else if (s->left_intra_ctx[row7]) {
1622  c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
1623  } else {
1624  int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1625 
1626  if (refl == refa && refa == s->varcompref[1]) {
1627  c = 0;
1628  } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1629  if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
1630  (refl == s->fixcompref && refa == s->varcompref[0])) {
1631  c = 4;
1632  } else {
1633  c = (refa == refl) ? 3 : 1;
1634  }
1635  } else if (!s->left_comp_ctx[row7]) {
1636  if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
1637  c = 1;
1638  } else {
1639  c = (refl == s->varcompref[1] &&
1640  refa != s->varcompref[1]) ? 2 : 4;
1641  }
1642  } else if (!s->above_comp_ctx[col]) {
1643  if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
1644  c = 1;
1645  } else {
1646  c = (refa == s->varcompref[1] &&
1647  refl != s->varcompref[1]) ? 2 : 4;
1648  }
1649  } else {
1650  c = (refl == refa) ? 4 : 2;
1651  }
1652  }
1653  } else {
1654  if (s->above_intra_ctx[col]) {
1655  c = 2;
1656  } else if (s->above_comp_ctx[col]) {
1657  c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
1658  } else {
1659  c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
1660  }
1661  }
1662  } else if (have_l) {
1663  if (s->left_intra_ctx[row7]) {
1664  c = 2;
1665  } else if (s->left_comp_ctx[row7]) {
1666  c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1667  } else {
1668  c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1669  }
1670  } else {
1671  c = 2;
1672  }
1673  bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1674  b->ref[var_idx] = s->varcompref[bit];
1675  s->counts.comp_ref[c][bit]++;
1676  } else /* single reference */ {
1677  int bit, c;
1678 
1679  if (have_a && !s->above_intra_ctx[col]) {
1680  if (have_l && !s->left_intra_ctx[row7]) {
1681  if (s->left_comp_ctx[row7]) {
1682  if (s->above_comp_ctx[col]) {
1683  c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
1684  !s->above_ref_ctx[col]);
1685  } else {
1686  c = (3 * !s->above_ref_ctx[col]) +
1687  (!s->fixcompref || !s->left_ref_ctx[row7]);
1688  }
1689  } else if (s->above_comp_ctx[col]) {
1690  c = (3 * !s->left_ref_ctx[row7]) +
1691  (!s->fixcompref || !s->above_ref_ctx[col]);
1692  } else {
1693  c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1694  }
1695  } else if (s->above_intra_ctx[col]) {
1696  c = 2;
1697  } else if (s->above_comp_ctx[col]) {
1698  c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
1699  } else {
1700  c = 4 * (!s->above_ref_ctx[col]);
1701  }
1702  } else if (have_l && !s->left_intra_ctx[row7]) {
1703  if (s->left_intra_ctx[row7]) {
1704  c = 2;
1705  } else if (s->left_comp_ctx[row7]) {
1706  c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
1707  } else {
1708  c = 4 * (!s->left_ref_ctx[row7]);
1709  }
1710  } else {
1711  c = 2;
1712  }
1713  bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1714  s->counts.single_ref[c][0][bit]++;
1715  if (!bit) {
1716  b->ref[0] = 0;
1717  } else {
1718  // FIXME can this codeblob be replaced by some sort of LUT?
1719  if (have_a) {
1720  if (have_l) {
1721  if (s->left_intra_ctx[row7]) {
1722  if (s->above_intra_ctx[col]) {
1723  c = 2;
1724  } else if (s->above_comp_ctx[col]) {
1725  c = 1 + 2 * (s->fixcompref == 1 ||
1726  s->above_ref_ctx[col] == 1);
1727  } else if (!s->above_ref_ctx[col]) {
1728  c = 3;
1729  } else {
1730  c = 4 * (s->above_ref_ctx[col] == 1);
1731  }
1732  } else if (s->above_intra_ctx[col]) {
1733  if (s->left_intra_ctx[row7]) {
1734  c = 2;
1735  } else if (s->left_comp_ctx[row7]) {
1736  c = 1 + 2 * (s->fixcompref == 1 ||
1737  s->left_ref_ctx[row7] == 1);
1738  } else if (!s->left_ref_ctx[row7]) {
1739  c = 3;
1740  } else {
1741  c = 4 * (s->left_ref_ctx[row7] == 1);
1742  }
1743  } else if (s->above_comp_ctx[col]) {
1744  if (s->left_comp_ctx[row7]) {
1745  if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1746  c = 3 * (s->fixcompref == 1 ||
1747  s->left_ref_ctx[row7] == 1);
1748  } else {
1749  c = 2;
1750  }
1751  } else if (!s->left_ref_ctx[row7]) {
1752  c = 1 + 2 * (s->fixcompref == 1 ||
1753  s->above_ref_ctx[col] == 1);
1754  } else {
1755  c = 3 * (s->left_ref_ctx[row7] == 1) +
1756  (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1757  }
1758  } else if (s->left_comp_ctx[row7]) {
1759  if (!s->above_ref_ctx[col]) {
1760  c = 1 + 2 * (s->fixcompref == 1 ||
1761  s->left_ref_ctx[row7] == 1);
1762  } else {
1763  c = 3 * (s->above_ref_ctx[col] == 1) +
1764  (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1765  }
1766  } else if (!s->above_ref_ctx[col]) {
1767  if (!s->left_ref_ctx[row7]) {
1768  c = 3;
1769  } else {
1770  c = 4 * (s->left_ref_ctx[row7] == 1);
1771  }
1772  } else if (!s->left_ref_ctx[row7]) {
1773  c = 4 * (s->above_ref_ctx[col] == 1);
1774  } else {
1775  c = 2 * (s->left_ref_ctx[row7] == 1) +
1776  2 * (s->above_ref_ctx[col] == 1);
1777  }
1778  } else {
1779  if (s->above_intra_ctx[col] ||
1780  (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1781  c = 2;
1782  } else if (s->above_comp_ctx[col]) {
1783  c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1784  } else {
1785  c = 4 * (s->above_ref_ctx[col] == 1);
1786  }
1787  }
1788  } else if (have_l) {
1789  if (s->left_intra_ctx[row7] ||
1790  (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1791  c = 2;
1792  } else if (s->left_comp_ctx[row7]) {
1793  c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1794  } else {
1795  c = 4 * (s->left_ref_ctx[row7] == 1);
1796  }
1797  } else {
1798  c = 2;
1799  }
1800  bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1801  s->counts.single_ref[c][1][bit]++;
1802  b->ref[0] = 1 + bit;
1803  }
1804  }
1805  }
1806 
1807  if (b->bs <= BS_8x8) {
1808  if (s->segmentation.feat[b->seg_id].skip_enabled) {
1809  b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1810  } else {
1811  static const uint8_t off[10] = {
1812  3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1813  };
1814 
1815  // FIXME this needs to use the LUT tables from find_ref_mvs
1816  // because not all are -1,0/0,-1
1817  int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1818  [s->left_mode_ctx[row7 + off[b->bs]]];
1819 
1820  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1821  s->prob.p.mv_mode[c]);
1822  b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1823  s->counts.mv_mode[c][b->mode[0] - 10]++;
1824  }
1825  }
1826 
1827  if (s->filtermode == FILTER_SWITCHABLE) {
1828  int c;
1829 
1830  if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1831  if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1832  c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1833  s->left_filter_ctx[row7] : 3;
1834  } else {
1835  c = s->above_filter_ctx[col];
1836  }
1837  } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1838  c = s->left_filter_ctx[row7];
1839  } else {
1840  c = 3;
1841  }
1842 
1843  filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1844  s->prob.p.filter[c]);
1845  s->counts.filter[c][filter_id]++;
1846  b->filter = vp9_filter_lut[filter_id];
1847  } else {
1848  b->filter = s->filtermode;
1849  }
1850 
1851  if (b->bs > BS_8x8) {
1852  int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1853 
1854  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1855  s->prob.p.mv_mode[c]);
1856  s->counts.mv_mode[c][b->mode[0] - 10]++;
1857  fill_mv(s, b->mv[0], b->mode[0], 0);
1858 
1859  if (b->bs != BS_8x4) {
1860  b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1861  s->prob.p.mv_mode[c]);
1862  s->counts.mv_mode[c][b->mode[1] - 10]++;
1863  fill_mv(s, b->mv[1], b->mode[1], 1);
1864  } else {
1865  b->mode[1] = b->mode[0];
1866  AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1867  AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1868  }
1869 
1870  if (b->bs != BS_4x8) {
1871  b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1872  s->prob.p.mv_mode[c]);
1873  s->counts.mv_mode[c][b->mode[2] - 10]++;
1874  fill_mv(s, b->mv[2], b->mode[2], 2);
1875 
1876  if (b->bs != BS_8x4) {
1877  b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1878  s->prob.p.mv_mode[c]);
1879  s->counts.mv_mode[c][b->mode[3] - 10]++;
1880  fill_mv(s, b->mv[3], b->mode[3], 3);
1881  } else {
1882  b->mode[3] = b->mode[2];
1883  AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1884  AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1885  }
1886  } else {
1887  b->mode[2] = b->mode[0];
1888  AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1889  AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1890  b->mode[3] = b->mode[1];
1891  AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1892  AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
1893  }
1894  } else {
1895  fill_mv(s, b->mv[0], b->mode[0], -1);
1896  AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1897  AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1898  AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
1899  AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1900  AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1901  AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
1902  }
1903 
1904  vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
1905  }
1906 
1907 #if HAVE_FAST_64BIT
1908 #define SPLAT_CTX(var, val, n) \
1909  switch (n) { \
1910  case 1: var = val; break; \
1911  case 2: AV_WN16A(&var, val * 0x0101); break; \
1912  case 4: AV_WN32A(&var, val * 0x01010101); break; \
1913  case 8: AV_WN64A(&var, val * 0x0101010101010101ULL); break; \
1914  case 16: { \
1915  uint64_t v64 = val * 0x0101010101010101ULL; \
1916  AV_WN64A( &var, v64); \
1917  AV_WN64A(&((uint8_t *) &var)[8], v64); \
1918  break; \
1919  } \
1920  }
1921 #else
1922 #define SPLAT_CTX(var, val, n) \
1923  switch (n) { \
1924  case 1: var = val; break; \
1925  case 2: AV_WN16A(&var, val * 0x0101); break; \
1926  case 4: AV_WN32A(&var, val * 0x01010101); break; \
1927  case 8: { \
1928  uint32_t v32 = val * 0x01010101; \
1929  AV_WN32A( &var, v32); \
1930  AV_WN32A(&((uint8_t *) &var)[4], v32); \
1931  break; \
1932  } \
1933  case 16: { \
1934  uint32_t v32 = val * 0x01010101; \
1935  AV_WN32A( &var, v32); \
1936  AV_WN32A(&((uint8_t *) &var)[4], v32); \
1937  AV_WN32A(&((uint8_t *) &var)[8], v32); \
1938  AV_WN32A(&((uint8_t *) &var)[12], v32); \
1939  break; \
1940  } \
1941  }
1942 #endif
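 // Multiplying a byte value by 0x0101... in the macros above broadcasts it
 // across the whole write: e.g. splatting val = 3 over n = 4 context entries
 // stores 3 * 0x01010101 = 0x03030303 with one aligned 32-bit write; without
 // a fast 64-bit path the 8- and 16-byte cases fall back to 32-bit stores.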
1943 
1944  switch (bwh_tab[1][b->bs][0]) {
1945 #define SET_CTXS(dir, off, n) \
1946  do { \
1947  SPLAT_CTX(s->dir##_skip_ctx[off], b->skip, n); \
1948  SPLAT_CTX(s->dir##_txfm_ctx[off], b->tx, n); \
1949  SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
1950  if (!s->keyframe && !s->intraonly) { \
1951  SPLAT_CTX(s->dir##_intra_ctx[off], b->intra, n); \
1952  SPLAT_CTX(s->dir##_comp_ctx[off], b->comp, n); \
1953  SPLAT_CTX(s->dir##_mode_ctx[off], b->mode[3], n); \
1954  if (!b->intra) { \
1955  SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
1956  if (s->filtermode == FILTER_SWITCHABLE) { \
1957  SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
1958  } \
1959  } \
1960  } \
1961  } while (0)
1962  case 1: SET_CTXS(above, col, 1); break;
1963  case 2: SET_CTXS(above, col, 2); break;
1964  case 4: SET_CTXS(above, col, 4); break;
1965  case 8: SET_CTXS(above, col, 8); break;
1966  }
1967  switch (bwh_tab[1][b->bs][1]) {
1968  case 1: SET_CTXS(left, row7, 1); break;
1969  case 2: SET_CTXS(left, row7, 2); break;
1970  case 4: SET_CTXS(left, row7, 4); break;
1971  case 8: SET_CTXS(left, row7, 8); break;
1972  }
1973 #undef SPLAT_CTX
1974 #undef SET_CTXS
1975 
1976  if (!s->keyframe && !s->intraonly) {
1977  if (b->bs > BS_8x8) {
1978  int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1979 
1980  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
1981  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
1982  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
1983  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
1984  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
1985  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
1986  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
1987  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
1988  } else {
1989  int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1990 
1991  for (n = 0; n < w4 * 2; n++) {
1992  AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
1993  AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
1994  }
1995  for (n = 0; n < h4 * 2; n++) {
1996  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
1997  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
1998  }
1999  }
2000  }
2001 
2002  // FIXME kinda ugly
2003  for (y = 0; y < h4; y++) {
2004  int x, o = (row + y) * s->sb_cols * 8 + col;
2005  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];
2006 
2007  if (b->intra) {
2008  for (x = 0; x < w4; x++) {
2009  mv[x].ref[0] =
2010  mv[x].ref[1] = -1;
2011  }
2012  } else if (b->comp) {
2013  for (x = 0; x < w4; x++) {
2014  mv[x].ref[0] = b->ref[0];
2015  mv[x].ref[1] = b->ref[1];
2016  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2017  AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2018  }
2019  } else {
2020  for (x = 0; x < w4; x++) {
2021  mv[x].ref[0] = b->ref[0];
2022  mv[x].ref[1] = -1;
2023  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2024  }
2025  }
2026  }
2027 }
2028 
2029 // FIXME merge cnt/eob arguments?
2030 static av_always_inline int
2031 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2032  int is_tx32x32, unsigned (*cnt)[6][3],
2033  unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2034  int nnz, const int16_t *scan, const int16_t (*nb)[2],
2035  const int16_t *band_counts, const int16_t *qmul)
2036 {
2037  int i = 0, band = 0, band_left = band_counts[band];
2038  uint8_t *tp = p[0][nnz];
2039  uint8_t cache[1024];
2040 
2041  do {
2042  int val, rc;
2043 
2044  val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2045  eob[band][nnz][val]++;
2046  if (!val)
2047  break;
2048 
2049  skip_eob:
2050  if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2051  cnt[band][nnz][0]++;
2052  if (!--band_left)
2053  band_left = band_counts[++band];
2054  cache[scan[i]] = 0;
2055  nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2056  tp = p[band][nnz];
2057  if (++i == n_coeffs)
2058  break; //invalid input; blocks should end with EOB
2059  goto skip_eob;
2060  }
2061 
2062  rc = scan[i];
2063  if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2064  cnt[band][nnz][1]++;
2065  val = 1;
2066  cache[rc] = 1;
2067  } else {
2068  // fill in p[3-10] (model fill) - only once per frame for each pos
2069  if (!tp[3])
2070  memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2071 
2072  cnt[band][nnz][2]++;
2073  if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2074  if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2075  cache[rc] = val = 2;
2076  } else {
2077  val = 3 + vp56_rac_get_prob(c, tp[5]);
2078  cache[rc] = 3;
2079  }
2080  } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2081  cache[rc] = 4;
2082  if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2083  val = 5 + vp56_rac_get_prob(c, 159);
2084  } else {
2085  val = 7 + (vp56_rac_get_prob(c, 165) << 1);
2086  val += vp56_rac_get_prob(c, 145);
2087  }
2088  } else { // cat 3-6
2089  cache[rc] = 5;
2090  if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2091  if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2092  val = 11 + (vp56_rac_get_prob(c, 173) << 2);
2093  val += (vp56_rac_get_prob(c, 148) << 1);
2094  val += vp56_rac_get_prob(c, 140);
2095  } else {
2096  val = 19 + (vp56_rac_get_prob(c, 176) << 3);
2097  val += (vp56_rac_get_prob(c, 155) << 2);
2098  val += (vp56_rac_get_prob(c, 140) << 1);
2099  val += vp56_rac_get_prob(c, 135);
2100  }
2101  } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2102  val = 35 + (vp56_rac_get_prob(c, 180) << 4);
2103  val += (vp56_rac_get_prob(c, 157) << 3);
2104  val += (vp56_rac_get_prob(c, 141) << 2);
2105  val += (vp56_rac_get_prob(c, 134) << 1);
2106  val += vp56_rac_get_prob(c, 130);
2107  } else {
2108  val = 67 + (vp56_rac_get_prob(c, 254) << 13);
2109  val += (vp56_rac_get_prob(c, 254) << 12);
2110  val += (vp56_rac_get_prob(c, 254) << 11);
2111  val += (vp56_rac_get_prob(c, 252) << 10);
2112  val += (vp56_rac_get_prob(c, 249) << 9);
2113  val += (vp56_rac_get_prob(c, 243) << 8);
2114  val += (vp56_rac_get_prob(c, 230) << 7);
2115  val += (vp56_rac_get_prob(c, 196) << 6);
2116  val += (vp56_rac_get_prob(c, 177) << 5);
2117  val += (vp56_rac_get_prob(c, 153) << 4);
2118  val += (vp56_rac_get_prob(c, 140) << 3);
2119  val += (vp56_rac_get_prob(c, 133) << 2);
2120  val += (vp56_rac_get_prob(c, 130) << 1);
2121  val += vp56_rac_get_prob(c, 129);
2122  }
2123  }
2124  }
2125  if (!--band_left)
2126  band_left = band_counts[++band];
2127  if (is_tx32x32)
2128  coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
2129  else
2130  coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
2131  nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2132  tp = p[band][nnz];
2133  } while (++i < n_coeffs);
2134 
2135  return i;
2136 }
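 // In the loop above, the context (nnz) for the next coefficient is the
 // rounded average of its two already-decoded neighbours recorded in
 // cache[]: e.g. neighbour token classes 1 and 2 give (1 + 1 + 2) >> 1 = 2,
 // which selects the probability set p[band][2].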
2137 
2138 static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2139  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2140  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2141  const int16_t (*nb)[2], const int16_t *band_counts,
2142  const int16_t *qmul)
2143 {
2144  return decode_coeffs_b_generic(c, coef, n_coeffs, 0, cnt, eob, p,
2145  nnz, scan, nb, band_counts, qmul);
2146 }
2147 
2148 static int decode_coeffs_b32(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2149  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2150  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2151  const int16_t (*nb)[2], const int16_t *band_counts,
2152  const int16_t *qmul)
2153 {
2154  return decode_coeffs_b_generic(c, coef, n_coeffs, 1, cnt, eob, p,
2155  nnz, scan, nb, band_counts, qmul);
2156 }
2157 
2158 static void decode_coeffs(AVCodecContext *ctx)
2159 {
2160  VP9Context *s = ctx->priv_data;
2161  VP9Block *b = s->b;
2162  int row = s->row, col = s->col;
2163  uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2164  unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2165  unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2166  int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
2167  int end_x = FFMIN(2 * (s->cols - col), w4);
2168  int end_y = FFMIN(2 * (s->rows - row), h4);
2169  int n, pl, x, y, res;
2170  int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
2171  int tx = 4 * s->lossless + b->tx;
2172  const int16_t * const *yscans = vp9_scans[tx];
2173  const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2174  const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2175  const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2176  uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2177  uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
2178  static const int16_t band_counts[4][8] = {
2179  { 1, 2, 3, 4, 3, 16 - 13 },
2180  { 1, 2, 3, 4, 11, 64 - 21 },
2181  { 1, 2, 3, 4, 11, 256 - 21 },
2182  { 1, 2, 3, 4, 11, 1024 - 21 },
2183  };
2184  const int16_t *y_band_counts = band_counts[b->tx];
2185  const int16_t *uv_band_counts = band_counts[b->uvtx];
2186 
2187 #define MERGE(la, end, step, rd) \
2188  for (n = 0; n < end; n += step) \
2189  la[n] = !!rd(&la[n])
2190 #define MERGE_CTX(step, rd) \
2191  do { \
2192  MERGE(l, end_y, step, rd); \
2193  MERGE(a, end_x, step, rd); \
2194  } while (0)
2195 
2196 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2197  for (n = 0, y = 0; y < end_y; y += step) { \
2198  for (x = 0; x < end_x; x += step, n += step * step) { \
2199  enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2200  res = decode_coeffs_b##v(&s->c, s->block + 16 * n, 16 * step * step, \
2201  c, e, p, a[x] + l[y], yscans[txtp], \
2202  ynbs[txtp], y_band_counts, qmul[0]); \
2203  a[x] = l[y] = !!res; \
2204  if (step >= 4) { \
2205  AV_WN16A(&s->eob[n], res); \
2206  } else { \
2207  s->eob[n] = res; \
2208  } \
2209  } \
2210  }
2211 
2212 #define SPLAT(la, end, step, cond) \
2213  if (step == 2) { \
2214  for (n = 1; n < end; n += step) \
2215  la[n] = la[n - 1]; \
2216  } else if (step == 4) { \
2217  if (cond) { \
2218  for (n = 0; n < end; n += step) \
2219  AV_WN32A(&la[n], la[n] * 0x01010101); \
2220  } else { \
2221  for (n = 0; n < end; n += step) \
2222  memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2223  } \
2224  } else /* step == 8 */ { \
2225  if (cond) { \
2226  if (HAVE_FAST_64BIT) { \
2227  for (n = 0; n < end; n += step) \
2228  AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2229  } else { \
2230  for (n = 0; n < end; n += step) { \
2231  uint32_t v32 = la[n] * 0x01010101; \
2232  AV_WN32A(&la[n], v32); \
2233  AV_WN32A(&la[n + 4], v32); \
2234  } \
2235  } \
2236  } else { \
2237  for (n = 0; n < end; n += step) \
2238  memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2239  } \
2240  }
2241 #define SPLAT_CTX(step) \
2242  do { \
2243  SPLAT(a, end_x, step, end_x == w4); \
2244  SPLAT(l, end_y, step, end_y == h4); \
2245  } while (0)
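 // For transform sizes above 4x4, MERGE_CTX first collapses the per-4x4
 // above/left nonzero flags down to one flag per transform block (e.g. a
 // 16x16 tx reads 4 context bytes at once via AV_RN32A), and SPLAT_CTX
 // expands the decoded result back out afterwards, so neighbouring blocks of
 // any tx size keep seeing per-4x4 granularity context.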
2246 
2247  /* y tokens */
2248  switch (b->tx) {
2249  case TX_4X4:
2250  DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2251  break;
2252  case TX_8X8:
2253  MERGE_CTX(2, AV_RN16A);
2254  DECODE_Y_COEF_LOOP(2, 0,);
2255  SPLAT_CTX(2);
2256  break;
2257  case TX_16X16:
2258  MERGE_CTX(4, AV_RN32A);
2259  DECODE_Y_COEF_LOOP(4, 0,);
2260  SPLAT_CTX(4);
2261  break;
2262  case TX_32X32:
2263  MERGE_CTX(8, AV_RN64A);
2264  DECODE_Y_COEF_LOOP(8, 0, 32);
2265  SPLAT_CTX(8);
2266  break;
2267  }
2268 
2269 #define DECODE_UV_COEF_LOOP(step) \
2270  for (n = 0, y = 0; y < end_y; y += step) { \
2271  for (x = 0; x < end_x; x += step, n += step * step) { \
2272  res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n, \
2273  16 * step * step, c, e, p, a[x] + l[y], \
2274  uvscan, uvnb, uv_band_counts, qmul[1]); \
2275  a[x] = l[y] = !!res; \
2276  if (step >= 4) { \
2277  AV_WN16A(&s->uveob[pl][n], res); \
2278  } else { \
2279  s->uveob[pl][n] = res; \
2280  } \
2281  } \
2282  }
2283 
2284  p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2285  c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2286  e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2287  w4 >>= 1;
2288  h4 >>= 1;
2289  end_x >>= 1;
2290  end_y >>= 1;
2291  for (pl = 0; pl < 2; pl++) {
2292  a = &s->above_uv_nnz_ctx[pl][col];
2293  l = &s->left_uv_nnz_ctx[pl][row & 7];
2294  switch (b->uvtx) {
2295  case TX_4X4:
2296  DECODE_UV_COEF_LOOP(1);
2297  break;
2298  case TX_8X8:
2299  MERGE_CTX(2, AV_RN16A);
2300  DECODE_UV_COEF_LOOP(2);
2301  SPLAT_CTX(2);
2302  break;
2303  case TX_16X16:
2304  MERGE_CTX(4, AV_RN32A);
2305  DECODE_UV_COEF_LOOP(4);
2306  SPLAT_CTX(4);
2307  break;
2308  case TX_32X32:
2309  MERGE_CTX(8, AV_RN64A);
2310  // a 64x64 (max) uv block can ever only contain 1 tx32x32 block
2311  // so there is no need to loop
2312  res = decode_coeffs_b32(&s->c, s->uvblock[pl],
2313  1024, c, e, p, a[0] + l[0],
2314  uvscan, uvnb, uv_band_counts, qmul[1]);
2315  a[0] = l[0] = !!res;
2316  AV_WN16A(&s->uveob[pl][0], res);
2317  SPLAT_CTX(8);
2318  break;
2319  }
2320  }
2321 }
2322 
2323 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2324  uint8_t *dst_edge, ptrdiff_t stride_edge,
2325  uint8_t *dst_inner, ptrdiff_t stride_inner,
2326  uint8_t *l, int col, int x, int w,
2327  int row, int y, enum TxfmMode tx,
2328  int p)
2329 {
2330  int have_top = row > 0 || y > 0;
2331  int have_left = col > s->tiling.tile_col_start || x > 0;
2332  int have_right = x < w - 1;
2333  static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2334  [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2335  { DC_127_PRED, VERT_PRED } },
2336  [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2337  { HOR_PRED, HOR_PRED } },
2338  [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2339  { LEFT_DC_PRED, DC_PRED } },
2340  [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
2341  { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
2342  [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2343  { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2344  [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
2345  { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
2346  [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
2347  { HOR_DOWN_PRED, HOR_DOWN_PRED } },
2348  [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
2349  { DC_127_PRED, VERT_LEFT_PRED } },
2350  [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2351  { HOR_UP_PRED, HOR_UP_PRED } },
2352  [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2353  { HOR_PRED, TM_VP8_PRED } },
2354  };
2355  static const struct {
2356  uint8_t needs_left:1;
2357  uint8_t needs_top:1;
2358  uint8_t needs_topleft:1;
2359  uint8_t needs_topright:1;
2360  uint8_t invert_left:1;
2361  } edges[N_INTRA_PRED_MODES] = {
2362  [VERT_PRED] = { .needs_top = 1 },
2363  [HOR_PRED] = { .needs_left = 1 },
2364  [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2365  [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2366  [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2367  [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2368  [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2369  [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2370  [HOR_UP_PRED] = { .needs_left = 1, .invert_left = 1 },
2371  [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2372  [LEFT_DC_PRED] = { .needs_left = 1 },
2373  [TOP_DC_PRED] = { .needs_top = 1 },
2374  [DC_128_PRED] = { 0 },
2375  [DC_127_PRED] = { 0 },
2376  [DC_129_PRED] = { 0 }
2377  };
2378 
2379  av_assert2(mode >= 0 && mode < 10);
2380  mode = mode_conv[mode][have_left][have_top];
2381  if (edges[mode].needs_top) {
2382  uint8_t *top, *topleft;
2383  int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
2384  int n_px_need_tr = 0;
2385 
2386  if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2387  n_px_need_tr = 4;
2388 
2389  // if top of sb64-row, use s->intra_pred_data[] instead of
2390  // dst[-stride] for intra prediction (it contains pre- instead of
2391  // post-loopfilter data)
2392  if (have_top) {
2393  top = !(row & 7) && !y ?
2394  s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2395  y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2396  if (have_left)
2397  topleft = !(row & 7) && !y ?
2398  s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2399  y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2400  &dst_inner[-stride_inner];
2401  }
2402 
2403  if (have_top &&
2404  (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2405  (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2406  n_px_need + n_px_need_tr <= n_px_have) {
2407  *a = top;
2408  } else {
2409  if (have_top) {
2410  if (n_px_need <= n_px_have) {
2411  memcpy(*a, top, n_px_need);
2412  } else {
2413  memcpy(*a, top, n_px_have);
2414  memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
2415  n_px_need - n_px_have);
2416  }
2417  } else {
2418  memset(*a, 127, n_px_need);
2419  }
2420  if (edges[mode].needs_topleft) {
2421  if (have_left && have_top) {
2422  (*a)[-1] = topleft[-1];
2423  } else {
2424  (*a)[-1] = have_top ? 129 : 127;
2425  }
2426  }
2427  if (tx == TX_4X4 && edges[mode].needs_topright) {
2428  if (have_top && have_right &&
2429  n_px_need + n_px_need_tr <= n_px_have) {
2430  memcpy(&(*a)[4], &top[4], 4);
2431  } else {
2432  memset(&(*a)[4], (*a)[3], 4);
2433  }
2434  }
2435  }
2436  }
2437  if (edges[mode].needs_left) {
2438  if (have_left) {
2439  int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !p) - y) * 4;
2440  uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2441  ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2442 
2443  if (edges[mode].invert_left) {
2444  if (n_px_need <= n_px_have) {
2445  for (i = 0; i < n_px_need; i++)
2446  l[i] = dst[i * stride - 1];
2447  } else {
2448  for (i = 0; i < n_px_have; i++)
2449  l[i] = dst[i * stride - 1];
2450  memset(&l[n_px_have], l[n_px_have - 1], n_px_need - n_px_have);
2451  }
2452  } else {
2453  if (n_px_need <= n_px_have) {
2454  for (i = 0; i < n_px_need; i++)
2455  l[n_px_need - 1 - i] = dst[i * stride - 1];
2456  } else {
2457  for (i = 0; i < n_px_have; i++)
2458  l[n_px_need - 1 - i] = dst[i * stride - 1];
2459  memset(l, l[n_px_need - n_px_have], n_px_need - n_px_have);
2460  }
2461  }
2462  } else {
2463  memset(l, 129, 4 << tx);
2464  }
2465  }
2466 
2467  return mode;
2468 }
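 /* check_intra_mode() thus does two jobs: it remaps the requested mode to a
  * DC/edge variant when the neighbours it needs are unavailable (mode_conv),
  * and it gathers the top/top-left/top-right and left reference pixels into
  * the caller's a[]/l[] buffers, padding with 127/129 or replicating the
  * last available pixel at frame and tile edges. */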
2469 
2470 static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2471 {
2472  VP9Context *s = ctx->priv_data;
2473  VP9Block *b = s->b;
2474  int row = s->row, col = s->col;
2475  int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2476  int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2477  int end_x = FFMIN(2 * (s->cols - col), w4);
2478  int end_y = FFMIN(2 * (s->rows - row), h4);
2479  int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2480  int uvstep1d = 1 << b->uvtx, p;
2481  uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
2482  LOCAL_ALIGNED_32(uint8_t, a_buf, [64]);
2483  LOCAL_ALIGNED_32(uint8_t, l, [32]);
2484 
2485  for (n = 0, y = 0; y < end_y; y += step1d) {
2486  uint8_t *ptr = dst, *ptr_r = dst_r;
2487  for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d,
2488  ptr_r += 4 * step1d, n += step) {
2489  int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2490  y * 2 + x : 0];
2491  uint8_t *a = &a_buf[32];
2492  enum TxfmType txtp = vp9_intra_txfm_type[mode];
2493  int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2494 
2495  mode = check_intra_mode(s, mode, &a, ptr_r,
2496  s->frames[CUR_FRAME].tf.f->linesize[0],
2497  ptr, s->y_stride, l,
2498  col, x, w4, row, y, b->tx, 0);
2499  s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2500  if (eob)
2501  s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2502  s->block + 16 * n, eob);
2503  }
2504  dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
2505  dst += 4 * step1d * s->y_stride;
2506  }
2507 
2508  // U/V
2509  w4 >>= 1;
2510  end_x >>= 1;
2511  end_y >>= 1;
2512  step = 1 << (b->uvtx * 2);
2513  for (p = 0; p < 2; p++) {
2514  dst = s->dst[1 + p];
2515  dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2516  for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2517  uint8_t *ptr = dst, *ptr_r = dst_r;
2518  for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d,
2519  ptr_r += 4 * uvstep1d, n += step) {
2520  int mode = b->uvmode;
2521  uint8_t *a = &a_buf[16];
2522  int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2523 
2524  mode = check_intra_mode(s, mode, &a, ptr_r,
2525  s->frames[CUR_FRAME].tf.f->linesize[1],
2526  ptr, s->uv_stride, l,
2527  col, x, w4, row, y, b->uvtx, p + 1);
2528  s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2529  if (eob)
2530  s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2531  s->uvblock[p] + 16 * n, eob);
2532  }
2533  dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
2534  dst += 4 * uvstep1d * s->uv_stride;
2535  }
2536  }
2537 }
2538 
2539 static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2540  uint8_t *dst, ptrdiff_t dst_stride,
2541  const uint8_t *ref, ptrdiff_t ref_stride,
2542  ThreadFrame *ref_frame,
2543  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2544  int bw, int bh, int w, int h)
2545 {
2546  int mx = mv->x, my = mv->y, th;
2547 
2548  y += my >> 3;
2549  x += mx >> 3;
2550  ref += y * ref_stride + x;
2551  mx &= 7;
2552  my &= 7;
2553  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2554  // we use +7 because the last 7 pixels of each sbrow can be changed in
2555  // the longest loopfilter of the next sbrow
2556  th = (y + bh + 4 * !!my + 7) >> 6;
2557  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2558  if (x < !!mx * 3 || y < !!my * 3 ||
2559  x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2560  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2561  ref - !!my * 3 * ref_stride - !!mx * 3,
2562  80, ref_stride,
2563  bw + !!mx * 7, bh + !!my * 7,
2564  x - !!mx * 3, y - !!my * 3, w, h);
2565  ref = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2566  ref_stride = 80;
2567  }
2568  mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2569 }
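 // Luma MVs are in eighth-pel units: mv >> 3 gives the integer offset and
 // mv & 7 the sub-pel phase, which is doubled (mx << 1, my << 1) because the
 // mc functions take a sixteenth-pel phase, as the chroma path below passes
 // directly. For example mv->x = 13 yields an integer offset of 1 and a
 // filter phase of 10.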
2570 
2571 static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2572  uint8_t *dst_u, uint8_t *dst_v,
2573  ptrdiff_t dst_stride,
2574  const uint8_t *ref_u, ptrdiff_t src_stride_u,
2575  const uint8_t *ref_v, ptrdiff_t src_stride_v,
2576  ThreadFrame *ref_frame,
2577  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2578  int bw, int bh, int w, int h)
2579 {
2580  int mx = mv->x, my = mv->y, th;
2581 
2582  y += my >> 4;
2583  x += mx >> 4;
2584  ref_u += y * src_stride_u + x;
2585  ref_v += y * src_stride_v + x;
2586  mx &= 15;
2587  my &= 15;
2588  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2589  // we use +7 because the last 7 pixels of each sbrow can be changed in
2590  // the longest loopfilter of the next sbrow
2591  th = (y + bh + 4 * !!my + 7) >> 5;
2592  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2593  if (x < !!mx * 3 || y < !!my * 3 ||
2594  x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2595  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2596  ref_u - !!my * 3 * src_stride_u - !!mx * 3,
2597  80, src_stride_u,
2598  bw + !!mx * 7, bh + !!my * 7,
2599  x - !!mx * 3, y - !!my * 3, w, h);
2600  ref_u = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2601  mc[!!mx][!!my](dst_u, dst_stride, ref_u, 80, bh, mx, my);
2602 
2603  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2604  ref_v - !!my * 3 * src_stride_v - !!mx * 3,
2605  80, src_stride_v,
2606  bw + !!mx * 7, bh + !!my * 7,
2607  x - !!mx * 3, y - !!my * 3, w, h);
2608  ref_v = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2609  mc[!!mx][!!my](dst_v, dst_stride, ref_v, 80, bh, mx, my);
2610  } else {
2611  mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2612  mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2613  }
2614 }
2615 
2616 static void inter_recon(AVCodecContext *ctx)
2617 {
2618  static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
2619  { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
2620  { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
2621  };
2622  VP9Context *s = ctx->priv_data;
2623  VP9Block *b = s->b;
2624  int row = s->row, col = s->col;
2625  ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]], *tref2;
2626  AVFrame *ref1 = tref1->f, *ref2;
2627  int w1 = ref1->width, h1 = ref1->height, w2, h2;
2628  ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;
2629 
2630  if (b->comp) {
2631  tref2 = &s->refs[s->refidx[b->ref[1]]];
2632  ref2 = tref2->f;
2633  w2 = ref2->width;
2634  h2 = ref2->height;
2635  }
2636 
2637  // y inter pred
2638  if (b->bs > BS_8x8) {
2639  if (b->bs == BS_8x4) {
2640  mc_luma_dir(s, s->dsp.mc[3][b->filter][0], s->dst[0], ls_y,
2641  ref1->data[0], ref1->linesize[0], tref1,
2642  row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1);
2643  mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
2644  s->dst[0] + 4 * ls_y, ls_y,
2645  ref1->data[0], ref1->linesize[0], tref1,
2646  (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1);
2647 
2648  if (b->comp) {
2649  mc_luma_dir(s, s->dsp.mc[3][b->filter][1], s->dst[0], ls_y,
2650  ref2->data[0], ref2->linesize[0], tref2,
2651  row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2);
2652  mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
2653  s->dst[0] + 4 * ls_y, ls_y,
2654  ref2->data[0], ref2->linesize[0], tref2,
2655  (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2);
2656  }
2657  } else if (b->bs == BS_4x8) {
2658  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2659  ref1->data[0], ref1->linesize[0], tref1,
2660  row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1);
2661  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2662  ref1->data[0], ref1->linesize[0], tref1,
2663  row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1);
2664 
2665  if (b->comp) {
2666  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2667  ref2->data[0], ref2->linesize[0], tref2,
2668  row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2);
2669  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2670  ref2->data[0], ref2->linesize[0], tref2,
2671  row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2);
2672  }
2673  } else {
2674  av_assert2(b->bs == BS_4x4);
2675 
2676  // FIXME if two horizontally adjacent blocks have the same MV,
2677  // do a w8 instead of a w4 call
2678  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2679  ref1->data[0], ref1->linesize[0], tref1,
2680  row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1);
2681  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2682  ref1->data[0], ref1->linesize[0], tref1,
2683  row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1);
2684  mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2685  s->dst[0] + 4 * ls_y, ls_y,
2686  ref1->data[0], ref1->linesize[0], tref1,
2687  (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1);
2688  mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2689  s->dst[0] + 4 * ls_y + 4, ls_y,
2690  ref1->data[0], ref1->linesize[0], tref1,
2691  (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1);
2692 
2693  if (b->comp) {
2694  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2695  ref2->data[0], ref2->linesize[0], tref2,
2696  row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2);
2697  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2698  ref2->data[0], ref2->linesize[0], tref2,
2699  row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2);
2700  mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2701  s->dst[0] + 4 * ls_y, ls_y,
2702  ref2->data[0], ref2->linesize[0], tref2,
2703  (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2);
2704  mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2705  s->dst[0] + 4 * ls_y + 4, ls_y,
2706  ref2->data[0], ref2->linesize[0], tref2,
2707  (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2);
2708  }
2709  }
2710  } else {
2711  int bwl = bwlog_tab[0][b->bs];
2712  int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
2713 
2714  mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], s->dst[0], ls_y,
2715  ref1->data[0], ref1->linesize[0], tref1,
2716  row << 3, col << 3, &b->mv[0][0],bw, bh, w1, h1);
2717 
2718  if (b->comp)
2719  mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], s->dst[0], ls_y,
2720  ref2->data[0], ref2->linesize[0], tref2,
2721  row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2);
2722  }
2723 
2724  // uv inter pred
2725  {
2726  int bwl = bwlog_tab[1][b->bs];
2727  int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
2728  VP56mv mvuv;
2729 
2730  w1 = (w1 + 1) >> 1;
2731  h1 = (h1 + 1) >> 1;
2732  if (b->comp) {
2733  w2 = (w2 + 1) >> 1;
2734  h2 = (h2 + 1) >> 1;
2735  }
2736  if (b->bs > BS_8x8) {
2737  mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
2738  mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
2739  } else {
2740  mvuv = b->mv[0][0];
2741  }
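 // With a single chroma prediction covering the whole sub-8x8 block, the
 // four luma MVs are averaged with rounding: e.g. x components of 3, 4, 4
 // and 5 eighth-pel units give ROUNDED_DIV(16, 4) = 4.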
2742 
2743  mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
2744  s->dst[1], s->dst[2], ls_uv,
2745  ref1->data[1], ref1->linesize[1],
2746  ref1->data[2], ref1->linesize[2], tref1,
2747  row << 2, col << 2, &mvuv, bw, bh, w1, h1);
2748 
2749  if (b->comp) {
2750  if (b->bs > BS_8x8) {
2751  mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
2752  mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
2753  } else {
2754  mvuv = b->mv[0][1];
2755  }
2756  mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
2757  s->dst[1], s->dst[2], ls_uv,
2758  ref2->data[1], ref2->linesize[1],
2759  ref2->data[2], ref2->linesize[2], tref2,
2760  row << 2, col << 2, &mvuv, bw, bh, w2, h2);
2761  }
2762  }
2763 
2764  if (!b->skip) {
2765  /* mostly copied from intra_recon() */
2766 
2767  int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2768  int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2769  int end_x = FFMIN(2 * (s->cols - col), w4);
2770  int end_y = FFMIN(2 * (s->rows - row), h4);
2771  int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2772  int uvstep1d = 1 << b->uvtx, p;
2773  uint8_t *dst = s->dst[0];
2774 
2775  // y itxfm add
2776  for (n = 0, y = 0; y < end_y; y += step1d) {
2777  uint8_t *ptr = dst;
2778  for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
2779  int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2780 
2781  if (eob)
2782  s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
2783  s->block + 16 * n, eob);
2784  }
2785  dst += 4 * s->y_stride * step1d;
2786  }
2787 
2788  // uv itxfm add
2789  end_x >>= 1;
2790  end_y >>= 1;
2791  step = 1 << (b->uvtx * 2);
2792  for (p = 0; p < 2; p++) {
2793  dst = s->dst[p + 1];
2794  for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2795  uint8_t *ptr = dst;
2796  for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
2797  int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2798 
2799  if (eob)
2800  s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2801  s->uvblock[p] + 16 * n, eob);
2802  }
2803  dst += 4 * uvstep1d * s->uv_stride;
2804  }
2805  }
2806  }
2807 }
2808 
2809 static av_always_inline void mask_edges(struct VP9Filter *lflvl, int is_uv,
2810  int row_and_7, int col_and_7,
2811  int w, int h, int col_end, int row_end,
2812  enum TxfmMode tx, int skip_inter)
2813 {
2814  // FIXME I'm pretty sure all loops can be replaced by a single LUT if
2815  // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
2816  // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
2817  // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
2818 
2819  // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
2820  // edges. This means that for UV, we work on two subsampled blocks at
2821  // a time, and we only use the topleft block's mode information to set
2822  // things like block strength. Thus, for any block size smaller than
2823  // 16x16, ignore the odd portion of the block.
2824  if (tx == TX_4X4 && is_uv) {
2825  if (h == 1) {
2826  if (row_and_7 & 1)
2827  return;
2828  if (!row_end)
2829  h += 1;
2830  }
2831  if (w == 1) {
2832  if (col_and_7 & 1)
2833  return;
2834  if (!col_end)
2835  w += 1;
2836  }
2837  }
2838 
2839  if (tx == TX_4X4 && !skip_inter) {
2840  int t = 1 << col_and_7, m_col = (t << w) - t, y;
2841  int m_col_odd = (t << (w - 1)) - t;
2842 
2843  // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
2844  if (is_uv) {
2845  int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;
2846 
2847  for (y = row_and_7; y < h + row_and_7; y++) {
2848  int col_mask_id = 2 - !(y & 7);
2849 
2850  lflvl->mask[is_uv][0][y][1] |= m_row_8;
2851  lflvl->mask[is_uv][0][y][2] |= m_row_4;
2852  // for odd lines, if the odd col is not being filtered,
2853  // skip odd row also:
2854  // .---. <-- a
2855  // | |
2856  // |___| <-- b
2857  // ^ ^
2858  // c d
2859  //
2860  // if a/c are even row/col and b/d are odd, and d is skipped,
2861  // e.g. right edge of size-66x66.webm, then skip b also (bug)
2862  if ((col_end & 1) && (y & 1)) {
2863  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
2864  } else {
2865  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
2866  }
2867  }
2868  } else {
2869  int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;
2870 
2871  for (y = row_and_7; y < h + row_and_7; y++) {
2872  int col_mask_id = 2 - !(y & 3);
2873 
2874  lflvl->mask[is_uv][0][y][1] |= m_row_8; // row edge
2875  lflvl->mask[is_uv][0][y][2] |= m_row_4;
2876  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
2877  lflvl->mask[is_uv][0][y][3] |= m_col;
2878  lflvl->mask[is_uv][1][y][3] |= m_col;
2879  }
2880  }
2881  } else {
2882  int y, t = 1 << col_and_7, m_col = (t << w) - t;
2883 
2884  if (!skip_inter) {
2885  int mask_id = (tx == TX_8X8);
2886  int l2 = tx + is_uv - 1, step1d = 1 << l2;
2887  static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
2888  int m_row = m_col & masks[l2];
2889 
2890  // at odd UV col/row edges tx16/tx32 loopfilter edges, force
2891  // 8wd loopfilter to prevent going off the visible edge.
2892  if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
2893  int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
2894  int m_row_8 = m_row - m_row_16;
2895 
2896  for (y = row_and_7; y < h + row_and_7; y++) {
2897  lflvl->mask[is_uv][0][y][0] |= m_row_16;
2898  lflvl->mask[is_uv][0][y][1] |= m_row_8;
2899  }
2900  } else {
2901  for (y = row_and_7; y < h + row_and_7; y++)
2902  lflvl->mask[is_uv][0][y][mask_id] |= m_row;
2903  }
2904 
2905  if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
2906  for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
2907  lflvl->mask[is_uv][1][y][0] |= m_col;
2908  if (y - row_and_7 == h - 1)
2909  lflvl->mask[is_uv][1][y][1] |= m_col;
2910  } else {
2911  for (y = row_and_7; y < h + row_and_7; y += step1d)
2912  lflvl->mask[is_uv][1][y][mask_id] |= m_col;
2913  }
2914  } else if (tx != TX_4X4) {
2915  int mask_id;
2916 
2917  mask_id = (tx == TX_8X8) || (is_uv && h == 1);
2918  lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
2919  mask_id = (tx == TX_8X8) || (is_uv && w == 1);
2920  for (y = row_and_7; y < h + row_and_7; y++)
2921  lflvl->mask[is_uv][0][y][mask_id] |= t;
2922  } else if (is_uv) {
2923  int t8 = t & 0x01, t4 = t - t8;
2924 
2925  for (y = row_and_7; y < h + row_and_7; y++) {
2926  lflvl->mask[is_uv][0][y][2] |= t4;
2927  lflvl->mask[is_uv][0][y][1] |= t8;
2928  }
2929  lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
2930  } else {
2931  int t8 = t & 0x11, t4 = t - t8;
2932 
2933  for (y = row_and_7; y < h + row_and_7; y++) {
2934  lflvl->mask[is_uv][0][y][2] |= t4;
2935  lflvl->mask[is_uv][0][y][1] |= t8;
2936  }
2937  lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
2938  }
2939  }
2940 }
2941 
2942 static void decode_b(AVCodecContext *ctx, int row, int col,
2943  struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
2944  enum BlockLevel bl, enum BlockPartition bp)
2945 {
2946  VP9Context *s = ctx->priv_data;
2947  VP9Block *b = s->b;
2948  enum BlockSize bs = bl * 3 + bp;
2949  int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
2950  int emu[2];
2951  AVFrame *f = s->frames[CUR_FRAME].tf.f;
2952 
2953  s->row = row;
2954  s->row7 = row & 7;
2955  s->col = col;
2956  s->col7 = col & 7;
2957  s->min_mv.x = -(128 + col * 64);
2958  s->min_mv.y = -(128 + row * 64);
2959  s->max_mv.x = 128 + (s->cols - col - w4) * 64;
2960  s->max_mv.y = 128 + (s->rows - row - h4) * 64;
2961  if (s->pass < 2) {
2962  b->bs = bs;
2963  b->bl = bl;
2964  b->bp = bp;
2965  decode_mode(ctx);
2966  b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));
2967 
2968  if (!b->skip) {
2969  decode_coeffs(ctx);
2970  } else {
2971  int row7 = s->row7;
2972 
2973 #define SPLAT_ZERO_CTX(v, n) \
2974  switch (n) { \
2975  case 1: v = 0; break; \
2976  case 2: AV_ZERO16(&v); break; \
2977  case 4: AV_ZERO32(&v); break; \
2978  case 8: AV_ZERO64(&v); break; \
2979  case 16: AV_ZERO128(&v); break; \
2980  }
2981 #define SPLAT_ZERO_YUV(dir, var, off, n) \
2982  do { \
2983  SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
2984  SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
2985  SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
2986  } while (0)
2987 
2988  switch (w4) {
2989  case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1); break;
2990  case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2); break;
2991  case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4); break;
2992  case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8); break;
2993  }
2994  switch (h4) {
2995  case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1); break;
2996  case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2); break;
2997  case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4); break;
2998  case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8); break;
2999  }
3000  }
3001  if (s->pass == 1) {
3002  s->b++;
3003  s->block += w4 * h4 * 64;
3004  s->uvblock[0] += w4 * h4 * 16;
3005  s->uvblock[1] += w4 * h4 * 16;
3006  s->eob += 4 * w4 * h4;
3007  s->uveob[0] += w4 * h4;
3008  s->uveob[1] += w4 * h4;
3009 
3010  return;
3011  }
3012  }
3013 
3014  // use emulated overhangs if the stride of the target buffer can't hold
3015  // them. This allows us to support emu-edge and so on even if we have
3016  // large block overhangs
3017  emu[0] = (col + w4) * 8 > f->linesize[0] ||
3018  (row + h4) > s->rows;
3019  emu[1] = (col + w4) * 4 > f->linesize[1] ||
3020  (row + h4) > s->rows;
3021  if (emu[0]) {
3022  s->dst[0] = s->tmp_y;
3023  s->y_stride = 64;
3024  } else {
3025  s->dst[0] = f->data[0] + yoff;
3026  s->y_stride = f->linesize[0];
3027  }
3028  if (emu[1]) {
3029  s->dst[1] = s->tmp_uv[0];
3030  s->dst[2] = s->tmp_uv[1];
3031  s->uv_stride = 32;
3032  } else {
3033  s->dst[1] = f->data[1] + uvoff;
3034  s->dst[2] = f->data[2] + uvoff;
3035  s->uv_stride = f->linesize[1];
3036  }
3037  if (b->intra) {
3038  intra_recon(ctx, yoff, uvoff);
3039  } else {
3040  inter_recon(ctx);
3041  }
3042  if (emu[0]) {
3043  int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3044 
3045  for (n = 0; o < w; n++) {
3046  int bw = 64 >> n;
3047 
3048  av_assert2(n <= 4);
3049  if (w & bw) {
3050  s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o, f->linesize[0],
3051  s->tmp_y + o, 64, h, 0, 0);
3052  o += bw;
3053  }
3054  }
3055  }
3056  if (emu[1]) {
3057  int w = FFMIN(s->cols - col, w4) * 4, h = FFMIN(s->rows - row, h4) * 4, n, o = 0;
3058 
3059  for (n = 1; o < w; n++) {
3060  int bw = 64 >> n;
3061 
3062  av_assert2(n <= 4);
3063  if (w & bw) {
3064  s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o, f->linesize[1],
3065  s->tmp_uv[0] + o, 32, h, 0, 0);
3066  s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o, f->linesize[2],
3067  s->tmp_uv[1] + o, 32, h, 0, 0);
3068  o += bw;
3069  }
3070  }
3071  }
3072 
3073  // pick filter level and find edges to apply filter to
3074  if (s->filter.level &&
3075  (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3076  [b->mode[3] != ZEROMV]) > 0) {
3077  int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3078  int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
3079 
3080  setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3081  mask_edges(lflvl, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3082  mask_edges(lflvl, 1, row7, col7, x_end, y_end,
3083  s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3084  s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3085  b->uvtx, skip_inter);
3086 
3087  if (!s->filter.lim_lut[lvl]) {
3088  int sharp = s->filter.sharpness;
3089  int limit = lvl;
3090 
3091  if (sharp > 0) {
3092  limit >>= (sharp + 3) >> 2;
3093  limit = FFMIN(limit, 9 - sharp);
3094  }
3095  limit = FFMAX(limit, 1);
3096 
3097  s->filter.lim_lut[lvl] = limit;
3098  s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
3099  }
3100  }
3101 
3102  if (s->pass == 2) {
3103  s->b++;
3104  s->block += w4 * h4 * 64;
3105  s->uvblock[0] += w4 * h4 * 16;
3106  s->uvblock[1] += w4 * h4 * 16;
3107  s->eob += 4 * w4 * h4;
3108  s->uveob[0] += w4 * h4;
3109  s->uveob[1] += w4 * h4;
3110  }
3111 }
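 /* Note the pass handling above: on pass 1 decode_b() stops right after
  * bitstream parsing, having advanced the per-block b/coefficient buffers;
  * on pass 2 parsing is skipped (s->pass < 2 guard) and only reconstruction
  * and loop-filter masking are done, advancing the same buffers at the end;
  * pass 0 does both in a single call. */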
3112 
3113 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3114  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3115 {
3116  VP9Context *s = ctx->priv_data;
3117  int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3118  (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
3119  const uint8_t *p = s->keyframe ? vp9_default_kf_partition_probs[bl][c] :
3120  s->prob.p.partition[bl][c];
3121  enum BlockPartition bp;
3122  ptrdiff_t hbs = 4 >> bl;
3123  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3124  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3125 
3126  if (bl == BL_8X8) {
3127  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3128  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3129  } else if (col + hbs < s->cols) { // FIXME why not <=?
3130  if (row + hbs < s->rows) { // FIXME why not <=?
3131  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3132  switch (bp) {
3133  case PARTITION_NONE:
3134  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3135  break;
3136  case PARTITION_H:
3137  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3138  yoff += hbs * 8 * y_stride;
3139  uvoff += hbs * 4 * uv_stride;
3140  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
3141  break;
3142  case PARTITION_V:
3143  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3144  yoff += hbs * 8;
3145  uvoff += hbs * 4;
3146  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3147  break;
3148  case PARTITION_SPLIT:
3149  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3150  decode_sb(ctx, row, col + hbs, lflvl,
3151  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3152  yoff += hbs * 8 * y_stride;
3153  uvoff += hbs * 4 * uv_stride;
3154  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3155  decode_sb(ctx, row + hbs, col + hbs, lflvl,
3156  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3157  break;
3158  default:
3159  av_assert0(0);
3160  }
3161  } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3162  bp = PARTITION_SPLIT;
3163  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3164  decode_sb(ctx, row, col + hbs, lflvl,
3165  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3166  } else {
3167  bp = PARTITION_H;
3168  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3169  }
3170  } else if (row + hbs < s->rows) { // FIXME why not <=?
3171  if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3172  bp = PARTITION_SPLIT;
3173  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3174  yoff += hbs * 8 * y_stride;
3175  uvoff += hbs * 4 * uv_stride;
3176  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3177  } else {
3178  bp = PARTITION_V;
3179  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3180  }
3181  } else {
3182  bp = PARTITION_SPLIT;
3183  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3184  }
3185  s->counts.partition[bl][c][bp]++;
3186 }
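 // hbs = 4 >> bl is the half-block step in 8x8-block units, so a split at
 // BL_64X64 advances 32 luma pixels (hbs * 8). Near the right/bottom frame
 // edge only the partitions that keep the first sub-block visible are coded,
 // hence the single vp56_rac_get_prob_branchy() read (or none at all when
 // both halves fall outside) instead of the full partition tree.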
3187 
3188 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3189  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3190 {
3191  VP9Context *s = ctx->priv_data;
3192  VP9Block *b = s->b;
3193  ptrdiff_t hbs = 4 >> bl;
3194  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3195  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3196 
3197  if (bl == BL_8X8) {
3198  av_assert2(b->bl == BL_8X8);
3199  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3200  } else if (s->b->bl == bl) {
3201  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3202  if (b->bp == PARTITION_H && row + hbs < s->rows) {
3203  yoff += hbs * 8 * y_stride;
3204  uvoff += hbs * 4 * uv_stride;
3205  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3206  } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3207  yoff += hbs * 8;
3208  uvoff += hbs * 4;
3209  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
3210  }
3211  } else {
3212  decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3213  if (col + hbs < s->cols) { // FIXME why not <=?
3214  if (row + hbs < s->rows) {
3215  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs,
3216  uvoff + 4 * hbs, bl + 1);
3217  yoff += hbs * 8 * y_stride;
3218  uvoff += hbs * 4 * uv_stride;
3219  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3220  decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3221  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3222  } else {
3223  yoff += hbs * 8;
3224  uvoff += hbs * 4;
3225  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3226  }
3227  } else if (row + hbs < s->rows) {
3228  yoff += hbs * 8 * y_stride;
3229  uvoff += hbs * 4 * uv_stride;
3230  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3231  }
3232  }
3233 }
3234 
3235 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3236  int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3237 {
3238  VP9Context *s = ctx->priv_data;
3239  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3240  uint8_t *dst = f->data[0] + yoff, *lvl = lflvl->level;
3241  ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
3242  int y, x, p;
3243 
3244  // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3245  // if you think of them as acting on a 8x8 block max, we can interleave
3246  // each v/h within the single x loop, but that only works if we work on
3247  // 8 pixel blocks, and we won't always do that (we want at least 16px
3248  // to use SSE2 optimizations, perhaps 32 for AVX2)
3249 
3250  // filter edges between columns, Y plane (e.g. block1 | block2)
3251  for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
3252  uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
3253  uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
3254  unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3255  unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3256  unsigned hm = hm1 | hm2 | hm13 | hm23;
3257 
3258  for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
3259  if (hm1 & x) {
3260  int L = *l, H = L >> 4;
3261  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3262 
3263  if (col || x > 1) {
3264  if (hmask1[0] & x) {
3265  if (hmask2[0] & x) {
3266  av_assert2(l[8] == L);
3267  s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
3268  } else {
3269  s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
3270  }
3271  } else if (hm2 & x) {
3272  L = l[8];
3273  H |= (L >> 4) << 8;
3274  E |= s->filter.mblim_lut[L] << 8;
3275  I |= s->filter.lim_lut[L] << 8;
3276  s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3277  [!!(hmask2[1] & x)]
3278  [0](ptr, ls_y, E, I, H);
3279  } else {
3280  s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3281  [0](ptr, ls_y, E, I, H);
3282  }
3283  }
3284  } else if (hm2 & x) {
3285  int L = l[8], H = L >> 4;
3286  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3287 
3288  if (col || x > 1) {
3289  s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3290  [0](ptr + 8 * ls_y, ls_y, E, I, H);
3291  }
3292  }
3293  if (hm13 & x) {
3294  int L = *l, H = L >> 4;
3295  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3296 
3297  if (hm23 & x) {
3298  L = l[8];
3299  H |= (L >> 4) << 8;
3300  E |= s->filter.mblim_lut[L] << 8;
3301  I |= s->filter.lim_lut[L] << 8;
3302  s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
3303  } else {
3304  s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
3305  }
3306  } else if (hm23 & x) {
3307  int L = l[8], H = L >> 4;
3308  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3309 
3310  s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
3311  }
3312  }
3313  }
3314 
3315  // block1
3316  // filter edges between rows, Y plane (e.g. ------)
3317  // block2
3318  dst = f->data[0] + yoff;
3319  lvl = lflvl->level;
3320  for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
3321  uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
3322  unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
3323 
3324  for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
3325  if (row || y) {
3326  if (vm & x) {
3327  int L = *l, H = L >> 4;
3328  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3329 
3330  if (vmask[0] & x) {
3331  if (vmask[0] & (x << 1)) {
3332  av_assert2(l[1] == L);
3333  s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
3334  } else {
3335  s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
3336  }
3337  } else if (vm & (x << 1)) {
3338  L = l[1];
3339  H |= (L >> 4) << 8;
3340  E |= s->filter.mblim_lut[L] << 8;
3341  I |= s->filter.lim_lut[L] << 8;
3342  s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3343  [!!(vmask[1] & (x << 1))]
3344  [1](ptr, ls_y, E, I, H);
3345  } else {
3346  s->dsp.loop_filter_8[!!(vmask[1] & x)]
3347  [1](ptr, ls_y, E, I, H);
3348  }
3349  } else if (vm & (x << 1)) {
3350  int L = l[1], H = L >> 4;
3351  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3352 
3353  s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
3354  [1](ptr + 8, ls_y, E, I, H);
3355  }
3356  }
3357  if (vm3 & x) {
3358  int L = *l, H = L >> 4;
3359  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3360 
3361  if (vm3 & (x << 1)) {
3362  L = l[1];
3363  H |= (L >> 4) << 8;
3364  E |= s->filter.mblim_lut[L] << 8;
3365  I |= s->filter.lim_lut[L] << 8;
3366  s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
3367  } else {
3368  s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
3369  }
3370  } else if (vm3 & (x << 1)) {
3371  int L = l[1], H = L >> 4;
3372  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3373 
3374  s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
3375  }
3376  }
3377  }
3378 
3379  // same principle but for U/V planes
3380  for (p = 0; p < 2; p++) {
3381  lvl = lflvl->level;
3382  dst = f->data[1 + p] + uvoff;
3383  for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
3384  uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
3385  uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
3386  unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
3387  unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;
3388 
3389  for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
3390  if (col || x > 1) {
3391  if (hm1 & x) {
3392  int L = *l, H = L >> 4;
3393  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3394 
3395  if (hmask1[0] & x) {
3396  if (hmask2[0] & x) {
3397  av_assert2(l[16] == L);
3398  s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
3399  } else {
3400  s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
3401  }
3402  } else if (hm2 & x) {
3403  L = l[16];
3404  H |= (L >> 4) << 8;
3405  E |= s->filter.mblim_lut[L] << 8;
3406  I |= s->filter.lim_lut[L] << 8;
3407  s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3408  [!!(hmask2[1] & x)]
3409  [0](ptr, ls_uv, E, I, H);
3410  } else {
3411  s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3412  [0](ptr, ls_uv, E, I, H);
3413  }
3414  } else if (hm2 & x) {
3415  int L = l[16], H = L >> 4;
3416  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3417 
3418  s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3419  [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
3420  }
3421  }
3422  if (x & 0xAA)
3423  l += 2;
3424  }
3425  }
3426  lvl = lflvl->level;
3427  dst = f->data[1 + p] + uvoff;
3428  for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
3429  uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
3430  unsigned vm = vmask[0] | vmask[1] | vmask[2];
3431 
3432  for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
3433  if (row || y) {
3434  if (vm & x) {
3435  int L = *l, H = L >> 4;
3436  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3437 
3438  if (vmask[0] & x) {
3439  if (vmask[0] & (x << 2)) {
3440  av_assert2(l[2] == L);
3441  s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
3442  } else {
3443  s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
3444  }
3445  } else if (vm & (x << 2)) {
3446  L = l[2];
3447  H |= (L >> 4) << 8;
3448  E |= s->filter.mblim_lut[L] << 8;
3449  I |= s->filter.lim_lut[L] << 8;
3450  s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3451  [!!(vmask[1] & (x << 2))]
3452  [1](ptr, ls_uv, E, I, H);
3453  } else {
3454  s->dsp.loop_filter_8[!!(vmask[1] & x)]
3455  [1](ptr, ls_uv, E, I, H);
3456  }
3457  } else if (vm & (x << 2)) {
3458  int L = l[2], H = L >> 4;
3459  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3460 
3461  s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
3462  [1](ptr + 8, ls_uv, E, I, H);
3463  }
3464  }
3465  }
3466  if (y & 1)
3467  lvl += 16;
3468  }
3469  }
3470 }
3471 
3472 static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
3473 {
3474  int sb_start = ( idx * n) >> log2_n;
3475  int sb_end = ((idx + 1) * n) >> log2_n;
3476  *start = FFMIN(sb_start, n) << 3;
3477  *end = FFMIN(sb_end, n) << 3;
3478 }
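 // Example: with log2_n = 1 and n = 9 superblock columns, tile 0 gets sb
 // columns 0..3 (start 0, end 32 in 8x8-block units) and tile 1 gets sb
 // columns 4..8 (start 32, end 72).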
3479 
3480 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3481  int max_count, int update_factor)
3482 {
3483  unsigned ct = ct0 + ct1, p2, p1;
3484 
3485  if (!ct)
3486  return;
3487 
3488  p1 = *p;
3489  p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3490  p2 = av_clip(p2, 1, 255);
3491  ct = FFMIN(ct, max_count);
3492  update_factor = FASTDIV(update_factor * ct, max_count);
3493 
3494  // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3495  *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
3496 }
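 // Worked example: p1 = 128, ct0 = 30, ct1 = 10, max_count = 20 and
 // update_factor = 128 give p2 = (30 * 256 + 20) / 40 = 192; ct is clamped
 // to 20 so the factor stays 128, and the new probability becomes
 // 128 + (((192 - 128) * 128 + 128) >> 8) = 160, i.e. halfway from the old
 // estimate toward the observed frequency.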
3497 
3498 static void adapt_probs(VP9Context *s)
3499 {
3500  int i, j, k, l, m;
3501  prob_context *p = &s->prob_ctx[s->framectxid].p;
3502  int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
3503 
3504  // coefficients
3505  for (i = 0; i < 4; i++)
3506  for (j = 0; j < 2; j++)
3507  for (k = 0; k < 2; k++)
3508  for (l = 0; l < 6; l++)
3509  for (m = 0; m < 6; m++) {
3510  uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
3511  unsigned *e = s->counts.eob[i][j][k][l][m];
3512  unsigned *c = s->counts.coef[i][j][k][l][m];
3513 
3514  if (l == 0 && m >= 3) // dc only has 3 pt
3515  break;
3516 
3517  adapt_prob(&pp[0], e[0], e[1], 24, uf);
3518  adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3519  adapt_prob(&pp[2], c[1], c[2], 24, uf);
3520  }
3521 
3522  if (s->keyframe || s->intraonly) {
3523  memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3524  memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3525  memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3526  memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
3527  return;
3528  }
3529 
3530  // skip flag
3531  for (i = 0; i < 3; i++)
3532  adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3533 
3534  // intra/inter flag
3535  for (i = 0; i < 4; i++)
3536  adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3537 
3538  // comppred flag
3539  if (s->comppredmode == PRED_SWITCHABLE) {
3540  for (i = 0; i < 5; i++)
3541  adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3542  }
3543 
3544  // reference frames
3545  if (s->comppredmode != PRED_SINGLEREF) {
3546  for (i = 0; i < 5; i++)
3547  adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3548  s->counts.comp_ref[i][1], 20, 128);
3549  }
3550 
3551  if (s->comppredmode != PRED_COMPREF) {
3552  for (i = 0; i < 5; i++) {
3553  uint8_t *pp = p->single_ref[i];
3554  unsigned (*c)[2] = s->counts.single_ref[i];
3555 
3556  adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3557  adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3558  }
3559  }
3560 
3561  // block partitioning
3562  for (i = 0; i < 4; i++)
3563  for (j = 0; j < 4; j++) {
3564  uint8_t *pp = p->partition[i][j];
3565  unsigned *c = s->counts.partition[i][j];
3566 
3567  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3568  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3569  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3570  }
3571 
3572  // tx size
3573  if (s->txfmmode == TX_SWITCHABLE) {
3574  for (i = 0; i < 2; i++) {
3575  unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3576 
3577  adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3578  adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3579  adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3580  adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3581  adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3582  adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3583  }
3584  }
3585 
3586  // interpolation filter
3587  if (s->filtermode == FILTER_SWITCHABLE) {
3588  for (i = 0; i < 4; i++) {
3589  uint8_t *pp = p->filter[i];
3590  unsigned *c = s->counts.filter[i];
3591 
3592  adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3593  adapt_prob(&pp[1], c[1], c[2], 20, 128);
3594  }
3595  }
3596 
3597  // inter modes
3598  for (i = 0; i < 7; i++) {
3599  uint8_t *pp = p->mv_mode[i];
3600  unsigned *c = s->counts.mv_mode[i];
3601 
3602  adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3603  adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3604  adapt_prob(&pp[2], c[1], c[3], 20, 128);
3605  }
3606 
3607  // mv joints
3608  {
3609  uint8_t *pp = p->mv_joint;
3610  unsigned *c = s->counts.mv_joint;
3611 
3612  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3613  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3614  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3615  }
3616 
3617  // mv components
3618  for (i = 0; i < 2; i++) {
3619  uint8_t *pp;
3620  unsigned *c, (*c2)[2], sum;
3621 
3622  adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3623  s->counts.mv_comp[i].sign[1], 20, 128);
3624 
3625  pp = p->mv_comp[i].classes;
3626  c = s->counts.mv_comp[i].classes;
3627  sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3628  adapt_prob(&pp[0], c[0], sum, 20, 128);
3629  sum -= c[1];
3630  adapt_prob(&pp[1], c[1], sum, 20, 128);
3631  sum -= c[2] + c[3];
3632  adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3633  adapt_prob(&pp[3], c[2], c[3], 20, 128);
3634  sum -= c[4] + c[5];
3635  adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3636  adapt_prob(&pp[5], c[4], c[5], 20, 128);
3637  sum -= c[6];
3638  adapt_prob(&pp[6], c[6], sum, 20, 128);
3639  adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3640  adapt_prob(&pp[8], c[7], c[8], 20, 128);
3641  adapt_prob(&pp[9], c[9], c[10], 20, 128);
3642 
3643  adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3644  s->counts.mv_comp[i].class0[1], 20, 128);
3645  pp = p->mv_comp[i].bits;
3646  c2 = s->counts.mv_comp[i].bits;
3647  for (j = 0; j < 10; j++)
3648  adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3649 
3650  for (j = 0; j < 2; j++) {
3651  pp = p->mv_comp[i].class0_fp[j];
3652  c = s->counts.mv_comp[i].class0_fp[j];
3653  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3654  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3655  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3656  }
3657  pp = p->mv_comp[i].fp;
3658  c = s->counts.mv_comp[i].fp;
3659  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3660  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3661  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3662 
3663  if (s->highprecisionmvs) {
3664  adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3665  s->counts.mv_comp[i].class0_hp[1], 20, 128);
3666  adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3667  s->counts.mv_comp[i].hp[1], 20, 128);
3668  }
3669  }
3670 
3671  // y intra modes
3672  for (i = 0; i < 4; i++) {
3673  uint8_t *pp = p->y_mode[i];
3674  unsigned *c = s->counts.y_mode[i], sum, s2;
3675 
3676  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3677  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3678  sum -= c[TM_VP8_PRED];
3679  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3680  sum -= c[VERT_PRED];
3681  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3682  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3683  sum -= s2;
3684  adapt_prob(&pp[3], s2, sum, 20, 128);
3685  s2 -= c[HOR_PRED];
3686  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3687  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3688  sum -= c[DIAG_DOWN_LEFT_PRED];
3689  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3690  sum -= c[VERT_LEFT_PRED];
3691  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3692  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3693  }
3694 
3695  // uv intra modes
3696  for (i = 0; i < 10; i++) {
3697  uint8_t *pp = p->uv_mode[i];
3698  unsigned *c = s->counts.uv_mode[i], sum, s2;
3699 
3700  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3701  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3702  sum -= c[TM_VP8_PRED];
3703  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3704  sum -= c[VERT_PRED];
3705  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3706  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3707  sum -= s2;
3708  adapt_prob(&pp[3], s2, sum, 20, 128);
3709  s2 -= c[HOR_PRED];
3710  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3711  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3712  sum -= c[DIAG_DOWN_LEFT_PRED];
3713  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3714  sum -= c[VERT_LEFT_PRED];
3715  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3716  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3717  }
3718 }
3719 
3720 static void free_buffers(VP9Context *s)
3721 {
3722  av_freep(&s->intra_pred_data[0]);
3723  av_freep(&s->b_base);
3724  av_freep(&s->block_base);
3725 }
3726 
3727 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3728 {
3729  VP9Context *s = ctx->priv_data;
3730  int i;
3731 
3732  for (i = 0; i < 2; i++) {
3733  if (s->frames[i].tf.f->data[0])
3734  vp9_unref_frame(ctx, &s->frames[i]);
3735  av_frame_free(&s->frames[i].tf.f);
3736  }
3737  for (i = 0; i < 8; i++) {
3738  if (s->refs[i].f->data[0])
3739  ff_thread_release_buffer(ctx, &s->refs[i]);
3740  av_frame_free(&s->refs[i].f);
3741  if (s->next_refs[i].f->data[0])
3742  ff_thread_release_buffer(ctx, &s->next_refs[i]);
3743  av_frame_free(&s->next_refs[i].f);
3744  }
3745  free_buffers(s);
3746  av_freep(&s->c_b);
3747  s->c_b_size = 0;
3748 
3749  return 0;
3750 }
3751 
3752 
3753 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3754  int *got_frame, AVPacket *pkt)
3755 {
3756  const uint8_t *data = pkt->data;
3757  int size = pkt->size;
3758  VP9Context *s = ctx->priv_data;
3759  int res, tile_row, tile_col, i, ref, row, col;
3760  ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3761  AVFrame *f;
3762 
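 /* decode_frame_header() returns the number of header bytes consumed, or 0
  * for a packet that only re-displays an already decoded reference frame
  * (VP9 "show existing frame"); in that case the reference is returned
  * directly and no new frame is coded. */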
3763  if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
3764  return res;
3765  } else if (res == 0) {
3766  if (!s->refs[ref].f->data[0]) {
3767  av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3768  return AVERROR_INVALIDDATA;
3769  }
3770  if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
3771  return res;
3772  *got_frame = 1;
3773  return pkt->size;
3774  }
3775  data += res;
3776  size -= res;
3777 
3778  if (s->frames[LAST_FRAME].tf.f->data[0])
3779  vp9_unref_frame(ctx, &s->frames[LAST_FRAME]);
3780  if (!s->keyframe && s->frames[CUR_FRAME].tf.f->data[0] &&
3781  (res = vp9_ref_frame(ctx, &s->frames[LAST_FRAME], &s->frames[CUR_FRAME])) < 0)
3782  return res;
3783  if (s->frames[CUR_FRAME].tf.f->data[0])
3784  vp9_unref_frame(ctx, &s->frames[CUR_FRAME]);
3785  if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
3786  return res;
3787  f = s->frames[CUR_FRAME].tf.f;
3788  f->key_frame = s->keyframe;
3789  f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
3790  ls_y = f->linesize[0];
3791  ls_uv = f->linesize[1];
3792 
3793  // ref frame setup
3794  for (i = 0; i < 8; i++) {
3795  if (s->next_refs[i].f->data[0])
3796  ff_thread_release_buffer(ctx, &s->next_refs[i]);
3797  if (s->refreshrefmask & (1 << i)) {
3798  res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
3799  } else {
3800  res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
3801  }
3802  if (res < 0)
3803  return res;
3804  }
3805 
3806  if (s->fullrange)
3807  ctx->color_range = AVCOL_RANGE_JPEG;
3808  else
3809  ctx->color_range = AVCOL_RANGE_MPEG;
3810 
3811  switch (s->colorspace) {
3812  case 1: ctx->colorspace = AVCOL_SPC_BT470BG; break;
3813  case 2: ctx->colorspace = AVCOL_SPC_BT709; break;
3814  case 3: ctx->colorspace = AVCOL_SPC_SMPTE170M; break;
3815  case 4: ctx->colorspace = AVCOL_SPC_SMPTE240M; break;
3816  }
3817 
3818  // main tile decode loop
3819  memset(s->above_partition_ctx, 0, s->cols);
3820  memset(s->above_skip_ctx, 0, s->cols);
3821  if (s->keyframe || s->intraonly) {
3822  memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
3823  } else {
3824  memset(s->above_mode_ctx, NEARESTMV, s->cols);
3825  }
3826  memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
3827  memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
3828  memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
3829  memset(s->above_segpred_ctx, 0, s->cols);
3830  s->pass = s->uses_2pass =
3831  ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
3832  if ((res = update_block_buffers(ctx)) < 0) {
3833  av_log(ctx, AV_LOG_ERROR,
3834  "Failed to allocate block buffers\n");
3835  return res;
3836  }
3837  if (s->refreshctx && s->parallelmode) {
3838  int j, k, l, m;
3839 
3840  for (i = 0; i < 4; i++) {
3841  for (j = 0; j < 2; j++)
3842  for (k = 0; k < 2; k++)
3843  for (l = 0; l < 6; l++)
3844  for (m = 0; m < 6; m++)
3845  memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
3846  s->prob.coef[i][j][k][l][m], 3);
3847  if (s->txfmmode == i)
3848  break;
3849  }
3850  s->prob_ctx[s->framectxid].p = s->prob.p;
3851  ff_thread_finish_setup(ctx);
3852  } else if (!s->refreshctx) {
3853  ff_thread_finish_setup(ctx);
3854  }
3855 
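 /* The loop below runs once in normal operation. With frame threading and
  * backward probability updates enabled (s->uses_2pass) it runs twice:
  * pass 1 parses the bitstream and stores per-block data, pass 2 performs
  * the actual reconstruction and loop filtering. */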
3856  do {
3857  yoff = uvoff = 0;
3858  s->b = s->b_base;
3859  s->block = s->block_base;
3860  s->uvblock[0] = s->uvblock_base[0];
3861  s->uvblock[1] = s->uvblock_base[1];
3862  s->eob = s->eob_base;
3863  s->uveob[0] = s->uveob_base[0];
3864  s->uveob[1] = s->uveob_base[1];
3865 
3866  for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
3867  set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
3868  tile_row, s->tiling.log2_tile_rows, s->sb_rows);
3869  if (s->pass != 2) {
3870  for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3871  unsigned tile_size;
3872 
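 /* Every tile except the last one in the frame is prefixed with a 32-bit
  * big-endian byte count; the last tile uses the remaining packet data. */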
3873  if (tile_col == s->tiling.tile_cols - 1 &&
3874  tile_row == s->tiling.tile_rows - 1) {
3875  tile_size = size;
3876  } else {
3877  tile_size = AV_RB32(data);
3878  data += 4;
3879  size -= 4;
3880  }
3881  if (tile_size > size) {
3882  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3883  return AVERROR_INVALIDDATA;
3884  }
3885  ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
3886  if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
3887  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3888  return AVERROR_INVALIDDATA;
3889  }
3890  data += tile_size;
3891  size -= tile_size;
3892  }
3893  }
3894 
3895  for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
3896  row += 8, yoff += ls_y * 64, uvoff += ls_uv * 32) {
3897  struct VP9Filter *lflvl_ptr = s->lflvl;
3898  ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
3899 
3900  for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3901  set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
3902  tile_col, s->tiling.log2_tile_cols, s->sb_cols);
3903 
3904  if (s->pass != 2) {
3905  memset(s->left_partition_ctx, 0, 8);
3906  memset(s->left_skip_ctx, 0, 8);
3907  if (s->keyframe || s->intraonly) {
3908  memset(s->left_mode_ctx, DC_PRED, 16);
3909  } else {
3910  memset(s->left_mode_ctx, NEARESTMV, 8);
3911  }
3912  memset(s->left_y_nnz_ctx, 0, 16);
3913  memset(s->left_uv_nnz_ctx, 0, 16);
3914  memset(s->left_segpred_ctx, 0, 8);
3915 
3916  memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
3917  }
3918 
3919  for (col = s->tiling.tile_col_start;
3920  col < s->tiling.tile_col_end;
3921  col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3922  // FIXME integrate with lf code (i.e. zero after each
3923  // use, similar to invtxfm coefficients, or similar)
3924  if (s->pass != 1) {
3925  memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
3926  }
3927 
3928  if (s->pass == 2) {
3929  decode_sb_mem(ctx, row, col, lflvl_ptr,
3930  yoff2, uvoff2, BL_64X64);
3931  } else {
3932  decode_sb(ctx, row, col, lflvl_ptr,
3933  yoff2, uvoff2, BL_64X64);
3934  }
3935  }
3936  if (s->pass != 2) {
3937  memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
3938  }
3939  }
3940 
3941  if (s->pass == 1) {
3942  continue;
3943  }
3944 
3945  // backup pre-loopfilter reconstruction data for intra
3946  // prediction of next row of sb64s
3947  if (row + 8 < s->rows) {
3948  memcpy(s->intra_pred_data[0],
3949  f->data[0] + yoff + 63 * ls_y,
3950  8 * s->cols);
3951  memcpy(s->intra_pred_data[1],
3952  f->data[1] + uvoff + 31 * ls_uv,
3953  4 * s->cols);
3954  memcpy(s->intra_pred_data[2],
3955  f->data[2] + uvoff + 31 * ls_uv,
3956  4 * s->cols);
3957  }
3958 
3959  // loopfilter one row
3960  if (s->filter.level) {
3961  yoff2 = yoff;
3962  uvoff2 = uvoff;
3963  lflvl_ptr = s->lflvl;
3964  for (col = 0; col < s->cols;
3965  col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3966  loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
3967  }
3968  }
3969 
3970  // FIXME maybe we can make this more finegrained by running the
3971  // loopfilter per-block instead of after each sbrow
3972  // In fact that would also make intra pred left preparation easier?
3973  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
3974  }
3975  }
3976 
3977  if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
3978  adapt_probs(s);
3979  ff_thread_finish_setup(ctx);
3980  }
3981  } while (s->pass++ == 1);
3982  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3983 
3984  // ref frame setup
3985  for (i = 0; i < 8; i++) {
3986  if (s->refs[i].f->data[0])
3987  ff_thread_release_buffer(ctx, &s->refs[i]);
3988  ff_thread_ref_frame(&s->refs[i], &s->next_refs[i]);
3989  }
3990 
3991  if (!s->invisible) {
3992  if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
3993  return res;
3994  *got_frame = 1;
3995  }
3996 
3997  return pkt->size;
3998 }
3999 
4000 static void vp9_decode_flush(AVCodecContext *ctx)
4001 {
4002  VP9Context *s = ctx->priv_data;
4003  int i;
4004 
4005  for (i = 0; i < 2; i++)
4006  vp9_unref_frame(ctx, &s->frames[i]);
4007  for (i = 0; i < 8; i++)
4008  ff_thread_release_buffer(ctx, &s->refs[i]);
4009 }
4010 
4011 static int init_frames(AVCodecContext *ctx)
4012 {
4013  VP9Context *s = ctx->priv_data;
4014  int i;
4015 
4016  for (i = 0; i < 2; i++) {
4017  s->frames[i].tf.f = av_frame_alloc();
4018  if (!s->frames[i].tf.f) {
4019  vp9_decode_free(ctx);
4020  av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4021  return AVERROR(ENOMEM);
4022  }
4023  }
4024  for (i = 0; i < 8; i++) {
4025  s->refs[i].f = av_frame_alloc();
4026  s->next_refs[i].f = av_frame_alloc();
4027  if (!s->refs[i].f || !s->next_refs[i].f) {
4028  vp9_decode_free(ctx);
4029  av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4030  return AVERROR(ENOMEM);
4031  }
4032  }
4033 
4034  return 0;
4035 }
4036 
4037 static av_cold int vp9_decode_init(AVCodecContext *ctx)
4038 {
4039  VP9Context *s = ctx->priv_data;
4040 
4041  ctx->internal->allocate_progress = 1;
4042  ctx->pix_fmt = AV_PIX_FMT_YUV420P;
4043  ff_vp9dsp_init(&s->dsp);
4044  ff_videodsp_init(&s->vdsp, 8);
4045  s->filter.sharpness = -1;
4046 
4047  return init_frames(ctx);
4048 }
4049 
4050 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4051 {
4052  return init_frames(avctx);
4053 }
4054 
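/* Frame threading: copy the decoder state the next decoding thread needs
 * (reference frames, probability contexts, loop filter deltas and
 * segmentation features) from the previous thread's context. */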
4055 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
4056 {
4057  int i, res;
4058  VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
4059 
4060  // detect size changes in other threads
4061  if (s->intra_pred_data[0] &&
4062  (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols || s->rows != ssrc->rows)) {
4063  free_buffers(s);
4064  }
4065 
4066  for (i = 0; i < 2; i++) {
4067  if (s->frames[i].tf.f->data[0])
4068  vp9_unref_frame(dst, &s->frames[i]);
4069  if (ssrc->frames[i].tf.f->data[0]) {
4070  if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
4071  return res;
4072  }
4073  }
4074  for (i = 0; i < 8; i++) {
4075  if (s->refs[i].f->data[0])
4076  ff_thread_release_buffer(dst, &s->refs[i]);
4077  if (ssrc->next_refs[i].f->data[0]) {
4078  if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
4079  return res;
4080  }
4081  }
4082 
4083  s->invisible = ssrc->invisible;
4084  s->keyframe = ssrc->keyframe;
4085  s->uses_2pass = ssrc->uses_2pass;
4086  memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4087  memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
4088  if (ssrc->segmentation.enabled) {
4089  memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
4090  sizeof(s->segmentation.feat));
4091  }
4092 
4093  return 0;
4094 }
4095 
4096 AVCodec ff_vp9_decoder = {
4097  .name = "vp9",
4098  .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
4099  .type = AVMEDIA_TYPE_VIDEO,
4100  .id = AV_CODEC_ID_VP9,
4101  .priv_data_size = sizeof(VP9Context),
4102  .init = vp9_decode_init,
4103  .close = vp9_decode_free,
4104  .decode = vp9_decode_frame,
4105  .capabilities = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
4106  .flush = vp9_decode_flush,
4107  .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4108  .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4109 };