vp9.c
1 /*
2  * VP9 compatible video decoder
3  *
4  * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
5  * Copyright (C) 2013 Clément Bœsch <u pkh me>
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23 
24 #include "avcodec.h"
25 #include "get_bits.h"
26 #include "internal.h"
27 #include "thread.h"
28 #include "videodsp.h"
29 #include "vp56.h"
30 #include "vp9.h"
31 #include "vp9data.h"
32 #include "vp9dsp.h"
33 #include "libavutil/avassert.h"
34 
35 #define VP9_SYNCCODE 0x498342
36 
37 enum CompPredMode {
38  PRED_SINGLEREF,
39  PRED_COMPREF,
40  PRED_SWITCHABLE,
41 };
42 
43 enum BlockLevel {
44  BL_64X64,
45  BL_32X32,
46  BL_16X16,
47  BL_8X8,
48 };
49 
50 enum BlockSize {
51  BS_64x64,
52  BS_64x32,
53  BS_32x64,
54  BS_32x32,
55  BS_32x16,
56  BS_16x32,
57  BS_16x16,
58  BS_16x8,
59  BS_8x16,
60  BS_8x8,
61  BS_8x4,
62  BS_4x8,
63  BS_4x4,
64  N_BS_SIZES,
65 };
66 
67 struct VP9mvrefPair {
68  VP56mv mv[2];
69  int8_t ref[2];
70 };
71 
72 typedef struct VP9Frame {
73  ThreadFrame tf;
74  AVBufferRef *extradata;
75  uint8_t *segmentation_map;
76  struct VP9mvrefPair *mv;
77 } VP9Frame;
78 
79 struct VP9Filter {
80  uint8_t level[8 * 8];
81  uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
82  [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
83 };
84 
85 typedef struct VP9Block {
86  uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
87  enum FilterMode filter;
88  VP56mv mv[4 /* b_idx */][2 /* ref */];
89  enum BlockSize bs;
90  enum TxfmMode tx, uvtx;
91  enum BlockLevel bl;
92  enum BlockPartition bp;
93 } VP9Block;
94 
95 typedef struct VP9Context {
96  VP9DSPContext dsp;
97  VideoDSPContext vdsp;
98  GetBitContext gb;
99  VP56RangeCoder c;
100  VP56RangeCoder *c_b;
101  unsigned c_b_size;
102  VP9Block *b, *b_base;
103  int pass, uses_2pass, last_uses_2pass;
104  int row, row7, col, col7;
105  uint8_t *dst[3];
106  ptrdiff_t y_stride, uv_stride;
107 
108  // bitstream header
109  uint8_t profile;
110  uint8_t keyframe, last_keyframe;
111  uint8_t invisible;
112  uint8_t use_last_frame_mvs;
113  uint8_t errorres;
114  uint8_t colorspace;
115  uint8_t fullrange;
116  uint8_t intraonly;
117  uint8_t resetctx;
118  uint8_t refreshrefmask;
119  uint8_t highprecisionmvs;
120  enum FilterMode filtermode;
121  uint8_t allowcompinter;
122  uint8_t fixcompref;
123  uint8_t refreshctx;
124  uint8_t parallelmode;
125  uint8_t framectxid;
126  uint8_t refidx[3];
127  uint8_t signbias[3];
128  uint8_t varcompref[2];
129  ThreadFrame refs[8], next_refs[8];
130 #define CUR_FRAME 0
131 #define LAST_FRAME 1
132  VP9Frame frames[2];
133 
134  struct {
135  uint8_t level;
136  int8_t sharpness;
137  uint8_t lim_lut[64];
138  uint8_t mblim_lut[64];
139  } filter;
140  struct {
141  uint8_t enabled;
142  int8_t mode[2];
143  int8_t ref[4];
144  } lf_delta;
145  uint8_t yac_qi;
146  int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
147  uint8_t lossless;
148 #define MAX_SEGMENT 8
149  struct {
150  uint8_t enabled;
151  uint8_t temporal;
152  uint8_t absolute_vals;
153  uint8_t update_map;
154  struct {
155  uint8_t q_enabled;
156  uint8_t lf_enabled;
157  uint8_t ref_enabled;
158  uint8_t skip_enabled;
159  uint8_t ref_val;
160  int16_t q_val;
161  int8_t lf_val;
162  int16_t qmul[2][2];
163  uint8_t lflvl[4][2];
164  } feat[MAX_SEGMENT];
165  } segmentation;
166  struct {
167  unsigned log2_tile_cols, log2_tile_rows;
168  unsigned tile_cols, tile_rows;
169  unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
170  } tiling;
171  unsigned sb_cols, sb_rows, rows, cols;
172  struct {
173  prob_context p;
174  uint8_t coef[4][2][2][6][6][3];
175  } prob_ctx[4];
176  struct {
177  prob_context p;
178  uint8_t coef[4][2][2][6][6][11];
179  uint8_t seg[7];
180  uint8_t segpred[3];
181  } prob;
182  struct {
183  unsigned y_mode[4][10];
184  unsigned uv_mode[10][10];
185  unsigned filter[4][3];
186  unsigned mv_mode[7][4];
187  unsigned intra[4][2];
188  unsigned comp[5][2];
189  unsigned single_ref[5][2][2];
190  unsigned comp_ref[5][2];
191  unsigned tx32p[2][4];
192  unsigned tx16p[2][3];
193  unsigned tx8p[2][2];
194  unsigned skip[3][2];
195  unsigned mv_joint[4];
196  struct {
197  unsigned sign[2];
198  unsigned classes[11];
199  unsigned class0[2];
200  unsigned bits[10][2];
201  unsigned class0_fp[2][4];
202  unsigned fp[4];
203  unsigned class0_hp[2];
204  unsigned hp[2];
205  } mv_comp[2];
206  unsigned partition[4][4][4];
207  unsigned coef[4][2][2][6][6][3];
208  unsigned eob[4][2][2][6][6][2];
209  } counts;
210  enum TxfmMode txfmmode;
211  enum CompPredMode comppredmode;
212 
213  // contextual (left/above) cache
214  DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
215  DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
216  DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
217  DECLARE_ALIGNED(16, uint8_t, left_uv_nnz_ctx)[2][8];
218  DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
219  DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
220  DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
221  DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
222  DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
223  DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
224  DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
225  DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
226  uint8_t *above_partition_ctx;
227  uint8_t *above_mode_ctx;
228  // FIXME maybe merge some of the below in a flags field?
229  uint8_t *above_y_nnz_ctx;
230  VP56mv (*above_mv_ctx)[2];
231  uint8_t *above_uv_nnz_ctx[2];
232  uint8_t *above_skip_ctx; // 1bit
233  uint8_t *above_txfm_ctx; // 2bit
234  uint8_t *above_segpred_ctx; // 1bit
235  uint8_t *above_intra_ctx; // 1bit
236  uint8_t *above_comp_ctx; // 1bit
237  uint8_t *above_ref_ctx; // 2bit
238  uint8_t *above_filter_ctx;
239 
240  // whole-frame cache
241  uint8_t *intra_pred_data[3];
242  struct VP9Filter *lflvl;
243  DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[71*80];
244 
245  // block reconstruction intermediates
246  int block_alloc_using_2pass;
247  int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
248  uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
249  struct { int x, y; } min_mv, max_mv;
250  DECLARE_ALIGNED(32, uint8_t, tmp_y)[64*64];
251  DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
252 } VP9Context;
253 
254 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
255  {
256  { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
257  { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
258  }, {
259  { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
260  { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
261  }
262 };
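/* Editorial note (an inference from how the table is indexed below, not an
 * original comment): bwh_tab[0][bs] appears to give block width/height in
 * 4-pixel units and bwh_tab[1][bs] in 8-pixel units (rounded up), indexed
 * by enum BlockSize: e.g. bwh_tab[1][BS_64x64] = { 8, 8 } and
 * bwh_tab[1][BS_4x4] = { 1, 1 }. */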
263 
264 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
265 {
266  VP9Context *s = ctx->priv_data;
267  int ret, sz;
268 
269  if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
270  return ret;
271  sz = 64 * s->sb_cols * s->sb_rows;
272  if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
273  ff_thread_release_buffer(ctx, &f->tf);
274  return AVERROR(ENOMEM);
275  }
276 
277  f->segmentation_map = f->extradata->data;
278  f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
279 
280  // retain segmentation map if it doesn't update
281  if (s->segmentation.enabled && !s->segmentation.update_map &&
282  !s->intraonly && !s->keyframe && !s->errorres) {
283  memcpy(f->segmentation_map, s->frames[LAST_FRAME].segmentation_map, sz);
284  }
285 
286  return 0;
287 }
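/* Editorial note (layout as implied by the allocation above, not an
 * original comment): the single extradata buffer packs the per-frame
 * segmentation map (sz = 64 bytes per 64x64 superblock, i.e. one byte per
 * 8x8 block) followed by one struct VP9mvrefPair per 8x8 block:
 *
 *   f->extradata->data      ... sz bytes                         -> segmentation_map
 *   f->extradata->data + sz ... sz * sizeof(VP9mvrefPair) bytes  -> mv
 */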
288 
289 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
290 {
291  ff_thread_release_buffer(ctx, &f->tf);
292  av_buffer_unref(&f->extradata);
293 }
294 
295 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
296 {
297  int res;
298 
299  if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
300  return res;
301  } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
302  vp9_unref_frame(ctx, dst);
303  return AVERROR(ENOMEM);
304  }
305 
306  dst->segmentation_map = src->segmentation_map;
307  dst->mv = src->mv;
308 
309  return 0;
310 }
311 
312 static int update_size(AVCodecContext *ctx, int w, int h)
313 {
314  VP9Context *s = ctx->priv_data;
315  uint8_t *p;
316 
317  av_assert0(w > 0 && h > 0);
318 
319  if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height)
320  return 0;
321 
322  ctx->width = w;
323  ctx->height = h;
324  s->sb_cols = (w + 63) >> 6;
325  s->sb_rows = (h + 63) >> 6;
326  s->cols = (w + 7) >> 3;
327  s->rows = (h + 7) >> 3;
328 
329 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
330  av_freep(&s->intra_pred_data[0]);
331  p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
332  if (!p)
333  return AVERROR(ENOMEM);
334  assign(s->intra_pred_data[0], uint8_t *, 64);
335  assign(s->intra_pred_data[1], uint8_t *, 32);
336  assign(s->intra_pred_data[2], uint8_t *, 32);
337  assign(s->above_y_nnz_ctx, uint8_t *, 16);
338  assign(s->above_mode_ctx, uint8_t *, 16);
339  assign(s->above_mv_ctx, VP56mv(*)[2], 16);
340  assign(s->above_partition_ctx, uint8_t *, 8);
341  assign(s->above_skip_ctx, uint8_t *, 8);
342  assign(s->above_txfm_ctx, uint8_t *, 8);
343  assign(s->above_uv_nnz_ctx[0], uint8_t *, 8);
344  assign(s->above_uv_nnz_ctx[1], uint8_t *, 8);
345  assign(s->above_segpred_ctx, uint8_t *, 8);
346  assign(s->above_intra_ctx, uint8_t *, 8);
347  assign(s->above_comp_ctx, uint8_t *, 8);
348  assign(s->above_ref_ctx, uint8_t *, 8);
349  assign(s->above_filter_ctx, uint8_t *, 8);
350  assign(s->lflvl, struct VP9Filter *, 1);
351 #undef assign
352 
353  // these will be re-allocated a little later
354  av_freep(&s->b_base);
355  av_freep(&s->block_base);
356 
357  return 0;
358 }
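/* Editorial note (a sanity check of the allocation above, not an original
 * comment): the 240 is the sum of the per-superblock-column uint8_t context
 * sizes assigned with the assign() macro: 64 + 32 + 32 (intra_pred_data) +
 * 16 + 16 (y_nnz/mode) + 10 * 8 (partition, skip, txfm, uv_nnz[0],
 * uv_nnz[1], segpred, intra, comp, ref, filter) = 240; above_mv_ctx and
 * lflvl are accounted for separately in the malloc size. */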
359 
360 static int update_block_buffers(AVCodecContext *ctx)
361 {
362  VP9Context *s = ctx->priv_data;
363 
364  if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->uses_2pass)
365  return 0;
366 
367  av_free(s->b_base);
368  av_free(s->block_base);
369  if (s->uses_2pass) {
370  int sbs = s->sb_cols * s->sb_rows;
371 
372  s->b_base = av_malloc_array(s->cols * s->rows, sizeof(VP9Block));
373  s->block_base = av_mallocz((64 * 64 + 128) * sbs * 3);
374  if (!s->b_base || !s->block_base)
375  return AVERROR(ENOMEM);
376  s->uvblock_base[0] = s->block_base + sbs * 64 * 64;
377  s->uvblock_base[1] = s->uvblock_base[0] + sbs * 32 * 32;
378  s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * 32 * 32);
379  s->uveob_base[0] = s->eob_base + 256 * sbs;
380  s->uveob_base[1] = s->uveob_base[0] + 64 * sbs;
381  } else {
382  s->b_base = av_malloc(sizeof(VP9Block));
383  s->block_base = av_mallocz((64 * 64 + 128) * 3);
384  if (!s->b_base || !s->block_base)
385  return AVERROR(ENOMEM);
386  s->uvblock_base[0] = s->block_base + 64 * 64;
387  s->uvblock_base[1] = s->uvblock_base[0] + 32 * 32;
388  s->eob_base = (uint8_t *) (s->uvblock_base[1] + 32 * 32);
389  s->uveob_base[0] = s->eob_base + 256;
390  s->uveob_base[1] = s->uveob_base[0] + 64;
391  }
392  s->block_alloc_using_2pass = s->uses_2pass;
393 
394  return 0;
395 }
396 
397 // for some reason the sign bit is at the end, not the start, of a bit sequence
398 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
399 {
400  int v = get_bits(gb, n);
401  return get_bits1(gb) ? -v : v;
402 }
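/* Editorial illustration (not an original comment): with n = 4, the bit
 * string 0101 followed by sign bit 1 decodes as v = 5 negated, i.e. -5;
 * a trailing sign bit of 0 would yield +5. */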
403 
404 static av_always_inline int inv_recenter_nonneg(int v, int m)
405 {
406  return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
407 }
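/* Editorial illustration (not an original comment): inv_recenter_nonneg()
 * re-centers a non-negative delta v around a previous value m, alternating
 * sides until one side is exhausted. For m = 100: v=0 -> 100, v=1 -> 99,
 * v=2 -> 101, v=3 -> 98, ..., v=200 -> 200, and any v > 2*m (e.g. 201)
 * is returned unchanged. */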
408 
409 // differential forward probability updates
410 static int update_prob(VP56RangeCoder *c, int p)
411 {
412  static const int inv_map_table[254] = {
413  7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
414  189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
415  10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
416  25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
417  40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
418  55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
419  70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
420  86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
421  101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
422  116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
423  131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
424  146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
425  161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
426  177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
427  192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
428  207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
429  222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
430  237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
431  252, 253,
432  };
433  int d;
434 
435  /* This code is trying to do a differential probability update. For a
436  * current probability A in the range [1, 255], the difference to a new
437  * probability of any value can be expressed differentially as 1-A,255-A
438  * where some part of this (absolute range) exists both in positive as
439  * well as the negative part, whereas another part only exists in one
440  * half. We're trying to code this shared part differentially, i.e.
441  * times two where the value of the lowest bit specifies the sign, and
442  * the single part is then coded on top of this. This absolute difference
443  * then again has a value of [0,254], but a bigger value in this range
444  * indicates that we're further away from the original value A, so we
445  * can code this as a VLC code, since higher values are increasingly
446  * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
447  * updates vs. the 'fine, exact' updates further down the range, which
448  * adds one extra dimension to this differential update model. */
449 
450  if (!vp8_rac_get(c)) {
451  d = vp8_rac_get_uint(c, 4) + 0;
452  } else if (!vp8_rac_get(c)) {
453  d = vp8_rac_get_uint(c, 4) + 16;
454  } else if (!vp8_rac_get(c)) {
455  d = vp8_rac_get_uint(c, 5) + 32;
456  } else {
457  d = vp8_rac_get_uint(c, 7);
458  if (d >= 65)
459  d = (d << 1) - 65 + vp8_rac_get(c);
460  d += 64;
461  }
462 
463  return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
464  255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
465 }
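/* Editorial illustration (not an original comment): suppose the current
 * probability is p = 128 and the coder takes the shortest branch with
 * d = vp8_rac_get_uint(c, 4) = 0. Then inv_map_table[0] = 7 and, since
 * p <= 128, the new probability is 1 + inv_recenter_nonneg(7, 127) =
 * 1 + (127 - 4) = 124; small d values thus stay close to the old p. */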
466 
467 static int decode_frame_header(AVCodecContext *ctx,
468  const uint8_t *data, int size, int *ref)
469 {
470  VP9Context *s = ctx->priv_data;
471  int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
472  int last_invisible;
473  const uint8_t *data2;
474 
475  /* general header */
476  if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
477  av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
478  return res;
479  }
480  if (get_bits(&s->gb, 2) != 0x2) { // frame marker
481  av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
482  return AVERROR_INVALIDDATA;
483  }
484  s->profile = get_bits1(&s->gb);
485  if (get_bits1(&s->gb)) { // reserved bit
486  av_log(ctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
487  return AVERROR_INVALIDDATA;
488  }
489  if (get_bits1(&s->gb)) {
490  *ref = get_bits(&s->gb, 3);
491  return 0;
492  }
493  s->last_uses_2pass = s->uses_2pass;
494  s->last_keyframe = s->keyframe;
495  s->keyframe = !get_bits1(&s->gb);
496  last_invisible = s->invisible;
497  s->invisible = !get_bits1(&s->gb);
498  s->errorres = get_bits1(&s->gb);
499  s->use_last_frame_mvs = !s->errorres && !last_invisible;
500  if (s->keyframe) {
501  if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
502  av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
503  return AVERROR_INVALIDDATA;
504  }
505  s->colorspace = get_bits(&s->gb, 3);
506  if (s->colorspace == 7) { // RGB = profile 1
507  av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
508  return AVERROR_INVALIDDATA;
509  }
510  s->fullrange = get_bits1(&s->gb);
511  // for profile 1, here follows the subsampling bits
512  s->refreshrefmask = 0xff;
513  w = get_bits(&s->gb, 16) + 1;
514  h = get_bits(&s->gb, 16) + 1;
515  if (get_bits1(&s->gb)) // display size
516  skip_bits(&s->gb, 32);
517  } else {
518  s->intraonly = s->invisible ? get_bits1(&s->gb) : 0;
519  s->resetctx = s->errorres ? 0 : get_bits(&s->gb, 2);
520  if (s->intraonly) {
521  if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
522  av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
523  return AVERROR_INVALIDDATA;
524  }
525  s->refreshrefmask = get_bits(&s->gb, 8);
526  w = get_bits(&s->gb, 16) + 1;
527  h = get_bits(&s->gb, 16) + 1;
528  if (get_bits1(&s->gb)) // display size
529  skip_bits(&s->gb, 32);
530  } else {
531  s->refreshrefmask = get_bits(&s->gb, 8);
532  s->refidx[0] = get_bits(&s->gb, 3);
533  s->signbias[0] = get_bits1(&s->gb);
534  s->refidx[1] = get_bits(&s->gb, 3);
535  s->signbias[1] = get_bits1(&s->gb);
536  s->refidx[2] = get_bits(&s->gb, 3);
537  s->signbias[2] = get_bits1(&s->gb);
538  if (!s->refs[s->refidx[0]].f->data[0] ||
539  !s->refs[s->refidx[1]].f->data[0] ||
540  !s->refs[s->refidx[2]].f->data[0]) {
541  av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
542  return AVERROR_INVALIDDATA;
543  }
544  if (get_bits1(&s->gb)) {
545  w = s->refs[s->refidx[0]].f->width;
546  h = s->refs[s->refidx[0]].f->height;
547  } else if (get_bits1(&s->gb)) {
548  w = s->refs[s->refidx[1]].f->width;
549  h = s->refs[s->refidx[1]].f->height;
550  } else if (get_bits1(&s->gb)) {
551  w = s->refs[s->refidx[2]].f->width;
552  h = s->refs[s->refidx[2]].f->height;
553  } else {
554  w = get_bits(&s->gb, 16) + 1;
555  h = get_bits(&s->gb, 16) + 1;
556  }
557  // Note that in this code, "CUR_FRAME" is actually before we
558  // have formally allocated a frame, and thus actually represents
559  // the _last_ frame
560  s->use_last_frame_mvs &= s->frames[CUR_FRAME].tf.f->width == w &&
561  s->frames[CUR_FRAME].tf.f->height == h;
562  if (get_bits1(&s->gb)) // display size
563  skip_bits(&s->gb, 32);
564  s->highprecisionmvs = get_bits1(&s->gb);
565  s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
566  get_bits(&s->gb, 2);
567  s->allowcompinter = s->signbias[0] != s->signbias[1] ||
568  s->signbias[0] != s->signbias[2];
569  if (s->allowcompinter) {
570  if (s->signbias[0] == s->signbias[1]) {
571  s->fixcompref = 2;
572  s->varcompref[0] = 0;
573  s->varcompref[1] = 1;
574  } else if (s->signbias[0] == s->signbias[2]) {
575  s->fixcompref = 1;
576  s->varcompref[0] = 0;
577  s->varcompref[1] = 2;
578  } else {
579  s->fixcompref = 0;
580  s->varcompref[0] = 1;
581  s->varcompref[1] = 2;
582  }
583  }
584  }
585  }
586  s->refreshctx = s->errorres ? 0 : get_bits1(&s->gb);
587  s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
588  s->framectxid = c = get_bits(&s->gb, 2);
589 
590  /* loopfilter header data */
591  s->filter.level = get_bits(&s->gb, 6);
592  sharp = get_bits(&s->gb, 3);
593  // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
594  // the old cache values since they are still valid
595  if (s->filter.sharpness != sharp)
596  memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
597  s->filter.sharpness = sharp;
598  if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
599  if (get_bits1(&s->gb)) {
600  for (i = 0; i < 4; i++)
601  if (get_bits1(&s->gb))
602  s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
603  for (i = 0; i < 2; i++)
604  if (get_bits1(&s->gb))
605  s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
606  }
607  } else {
608  memset(&s->lf_delta, 0, sizeof(s->lf_delta));
609  }
610 
611  /* quantization header data */
612  s->yac_qi = get_bits(&s->gb, 8);
613  s->ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
614  s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
615  s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
616  s->lossless = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
617  s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
618 
619  /* segmentation header info */
620  if ((s->segmentation.enabled = get_bits1(&s->gb))) {
621  if ((s->segmentation.update_map = get_bits1(&s->gb))) {
622  for (i = 0; i < 7; i++)
623  s->prob.seg[i] = get_bits1(&s->gb) ?
624  get_bits(&s->gb, 8) : 255;
625  if ((s->segmentation.temporal = get_bits1(&s->gb))) {
626  for (i = 0; i < 3; i++)
627  s->prob.segpred[i] = get_bits1(&s->gb) ?
628  get_bits(&s->gb, 8) : 255;
629  }
630  }
631  if ((!s->segmentation.update_map || s->segmentation.temporal) &&
632  (w != s->frames[CUR_FRAME].tf.f->width ||
633  h != s->frames[CUR_FRAME].tf.f->height)) {
634  av_log(ctx, AV_LOG_ERROR,
635  "Reference segmap (temp=%d,update=%d) enabled on size-change!\n",
636  s->segmentation.temporal, s->segmentation.update_map);
637  return AVERROR_INVALIDDATA;
638  }
639 
640  if (get_bits1(&s->gb)) {
641  s->segmentation.absolute_vals = get_bits1(&s->gb);
642  for (i = 0; i < 8; i++) {
643  if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
644  s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
645  if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
646  s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
647  if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
648  s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
649  s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
650  }
651  }
652  } else {
653  s->segmentation.feat[0].q_enabled = 0;
654  s->segmentation.feat[0].lf_enabled = 0;
655  s->segmentation.feat[0].skip_enabled = 0;
656  s->segmentation.feat[0].ref_enabled = 0;
657  }
658 
659  // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
660  for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
661  int qyac, qydc, quvac, quvdc, lflvl, sh;
662 
663  if (s->segmentation.feat[i].q_enabled) {
664  if (s->segmentation.absolute_vals)
665  qyac = s->segmentation.feat[i].q_val;
666  else
667  qyac = s->yac_qi + s->segmentation.feat[i].q_val;
668  } else {
669  qyac = s->yac_qi;
670  }
671  qydc = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
672  quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
673  quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
674  qyac = av_clip_uintp2(qyac, 8);
675 
676  s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[qydc];
677  s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[qyac];
678  s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[quvdc];
679  s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[quvac];
680 
681  sh = s->filter.level >= 32;
682  if (s->segmentation.feat[i].lf_enabled) {
683  if (s->segmentation.absolute_vals)
684  lflvl = s->segmentation.feat[i].lf_val;
685  else
686  lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
687  } else {
688  lflvl = s->filter.level;
689  }
690  s->segmentation.feat[i].lflvl[0][0] =
691  s->segmentation.feat[i].lflvl[0][1] =
692  av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
693  for (j = 1; j < 4; j++) {
694  s->segmentation.feat[i].lflvl[j][0] =
695  av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
696  s->lf_delta.mode[0]) << sh), 6);
697  s->segmentation.feat[i].lflvl[j][1] =
698  av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
699  s->lf_delta.mode[1]) << sh), 6);
700  }
701  }
702 
703  /* tiling info */
704  if ((res = update_size(ctx, w, h)) < 0) {
705  av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d\n", w, h);
706  return res;
707  }
708  for (s->tiling.log2_tile_cols = 0;
709  (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
710  s->tiling.log2_tile_cols++) ;
711  for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
712  max = FFMAX(0, max - 1);
713  while (max > s->tiling.log2_tile_cols) {
714  if (get_bits1(&s->gb))
715  s->tiling.log2_tile_cols++;
716  else
717  break;
718  }
719  s->tiling.log2_tile_rows = decode012(&s->gb);
720  s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
721  if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
722  s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
723  s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
724  sizeof(VP56RangeCoder) * s->tiling.tile_cols);
725  if (!s->c_b) {
726  av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
727  return AVERROR(ENOMEM);
728  }
729  }
730 
731  if (s->keyframe || s->errorres || s->intraonly) {
732  s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
733  s->prob_ctx[3].p = vp9_default_probs;
734  memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
735  sizeof(vp9_default_coef_probs));
736  memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
737  sizeof(vp9_default_coef_probs));
738  memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
739  sizeof(vp9_default_coef_probs));
740  memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
741  sizeof(vp9_default_coef_probs));
742  }
743 
744  // next 16 bits is size of the rest of the header (arith-coded)
745  size2 = get_bits(&s->gb, 16);
746  data2 = align_get_bits(&s->gb);
747  if (size2 > size - (data2 - data)) {
748  av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
749  return AVERROR_INVALIDDATA;
750  }
751  ff_vp56_init_range_decoder(&s->c, data2, size2);
752  if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
753  av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
754  return AVERROR_INVALIDDATA;
755  }
756 
757  if (s->keyframe || s->intraonly) {
758  memset(s->counts.coef, 0, sizeof(s->counts.coef) + sizeof(s->counts.eob));
759  } else {
760  memset(&s->counts, 0, sizeof(s->counts));
761  }
762  // FIXME is it faster to not copy here, but do it down in the fw updates
763  // as explicit copies if the fw update is missing (and skip the copy upon
764  // fw update)?
765  s->prob.p = s->prob_ctx[c].p;
766 
767  // txfm updates
768  if (s->lossless) {
769  s->txfmmode = TX_4X4;
770  } else {
771  s->txfmmode = vp8_rac_get_uint(&s->c, 2);
772  if (s->txfmmode == 3)
773  s->txfmmode += vp8_rac_get(&s->c);
774 
775  if (s->txfmmode == TX_SWITCHABLE) {
776  for (i = 0; i < 2; i++)
777  if (vp56_rac_get_prob_branchy(&s->c, 252))
778  s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
779  for (i = 0; i < 2; i++)
780  for (j = 0; j < 2; j++)
781  if (vp56_rac_get_prob_branchy(&s->c, 252))
782  s->prob.p.tx16p[i][j] =
783  update_prob(&s->c, s->prob.p.tx16p[i][j]);
784  for (i = 0; i < 2; i++)
785  for (j = 0; j < 3; j++)
786  if (vp56_rac_get_prob_branchy(&s->c, 252))
787  s->prob.p.tx32p[i][j] =
788  update_prob(&s->c, s->prob.p.tx32p[i][j]);
789  }
790  }
791 
792  // coef updates
793  for (i = 0; i < 4; i++) {
794  uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
795  if (vp8_rac_get(&s->c)) {
796  for (j = 0; j < 2; j++)
797  for (k = 0; k < 2; k++)
798  for (l = 0; l < 6; l++)
799  for (m = 0; m < 6; m++) {
800  uint8_t *p = s->prob.coef[i][j][k][l][m];
801  uint8_t *r = ref[j][k][l][m];
802  if (m >= 3 && l == 0) // dc only has 3 pt
803  break;
804  for (n = 0; n < 3; n++) {
805  if (vp56_rac_get_prob_branchy(&s->c, 252)) {
806  p[n] = update_prob(&s->c, r[n]);
807  } else {
808  p[n] = r[n];
809  }
810  }
811  p[3] = 0;
812  }
813  } else {
814  for (j = 0; j < 2; j++)
815  for (k = 0; k < 2; k++)
816  for (l = 0; l < 6; l++)
817  for (m = 0; m < 6; m++) {
818  uint8_t *p = s->prob.coef[i][j][k][l][m];
819  uint8_t *r = ref[j][k][l][m];
820  if (m > 3 && l == 0) // dc only has 3 pt
821  break;
822  memcpy(p, r, 3);
823  p[3] = 0;
824  }
825  }
826  if (s->txfmmode == i)
827  break;
828  }
829 
830  // mode updates
831  for (i = 0; i < 3; i++)
832  if (vp56_rac_get_prob_branchy(&s->c, 252))
833  s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
834  if (!s->keyframe && !s->intraonly) {
835  for (i = 0; i < 7; i++)
836  for (j = 0; j < 3; j++)
837  if (vp56_rac_get_prob_branchy(&s->c, 252))
838  s->prob.p.mv_mode[i][j] =
839  update_prob(&s->c, s->prob.p.mv_mode[i][j]);
840 
841  if (s->filtermode == FILTER_SWITCHABLE)
842  for (i = 0; i < 4; i++)
843  for (j = 0; j < 2; j++)
844  if (vp56_rac_get_prob_branchy(&s->c, 252))
845  s->prob.p.filter[i][j] =
846  update_prob(&s->c, s->prob.p.filter[i][j]);
847 
848  for (i = 0; i < 4; i++)
849  if (vp56_rac_get_prob_branchy(&s->c, 252))
850  s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
851 
852  if (s->allowcompinter) {
853  s->comppredmode = vp8_rac_get(&s->c);
854  if (s->comppredmode)
855  s->comppredmode += vp8_rac_get(&s->c);
856  if (s->comppredmode == PRED_SWITCHABLE)
857  for (i = 0; i < 5; i++)
858  if (vp56_rac_get_prob_branchy(&s->c, 252))
859  s->prob.p.comp[i] =
860  update_prob(&s->c, s->prob.p.comp[i]);
861  } else {
862  s->comppredmode = PRED_SINGLEREF;
863  }
864 
865  if (s->comppredmode != PRED_COMPREF) {
866  for (i = 0; i < 5; i++) {
867  if (vp56_rac_get_prob_branchy(&s->c, 252))
868  s->prob.p.single_ref[i][0] =
869  update_prob(&s->c, s->prob.p.single_ref[i][0]);
870  if (vp56_rac_get_prob_branchy(&s->c, 252))
871  s->prob.p.single_ref[i][1] =
872  update_prob(&s->c, s->prob.p.single_ref[i][1]);
873  }
874  }
875 
876  if (s->comppredmode != PRED_SINGLEREF) {
877  for (i = 0; i < 5; i++)
878  if (vp56_rac_get_prob_branchy(&s->c, 252))
879  s->prob.p.comp_ref[i] =
880  update_prob(&s->c, s->prob.p.comp_ref[i]);
881  }
882 
883  for (i = 0; i < 4; i++)
884  for (j = 0; j < 9; j++)
885  if (vp56_rac_get_prob_branchy(&s->c, 252))
886  s->prob.p.y_mode[i][j] =
887  update_prob(&s->c, s->prob.p.y_mode[i][j]);
888 
889  for (i = 0; i < 4; i++)
890  for (j = 0; j < 4; j++)
891  for (k = 0; k < 3; k++)
892  if (vp56_rac_get_prob_branchy(&s->c, 252))
893  s->prob.p.partition[3 - i][j][k] =
894  update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
895 
896  // mv fields don't use the update_prob subexp model for some reason
897  for (i = 0; i < 3; i++)
898  if (vp56_rac_get_prob_branchy(&s->c, 252))
899  s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
900 
901  for (i = 0; i < 2; i++) {
902  if (vp56_rac_get_prob_branchy(&s->c, 252))
903  s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
904 
905  for (j = 0; j < 10; j++)
906  if (vp56_rac_get_prob_branchy(&s->c, 252))
907  s->prob.p.mv_comp[i].classes[j] =
908  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
909 
910  if (vp56_rac_get_prob_branchy(&s->c, 252))
911  s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
912 
913  for (j = 0; j < 10; j++)
914  if (vp56_rac_get_prob_branchy(&s->c, 252))
915  s->prob.p.mv_comp[i].bits[j] =
916  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
917  }
918 
919  for (i = 0; i < 2; i++) {
920  for (j = 0; j < 2; j++)
921  for (k = 0; k < 3; k++)
922  if (vp56_rac_get_prob_branchy(&s->c, 252))
923  s->prob.p.mv_comp[i].class0_fp[j][k] =
924  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
925 
926  for (j = 0; j < 3; j++)
927  if (vp56_rac_get_prob_branchy(&s->c, 252))
928  s->prob.p.mv_comp[i].fp[j] =
929  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
930  }
931 
932  if (s->highprecisionmvs) {
933  for (i = 0; i < 2; i++) {
934  if (vp56_rac_get_prob_branchy(&s->c, 252))
935  s->prob.p.mv_comp[i].class0_hp =
936  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
937 
938  if (vp56_rac_get_prob_branchy(&s->c, 252))
939  s->prob.p.mv_comp[i].hp =
940  (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
941  }
942  }
943  }
944 
945  return (data2 - data) + size2;
946 }
947 
948 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
949  VP9Context *s)
950 {
951  dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
952  dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
953 }
954 
955 static void find_ref_mvs(VP9Context *s,
956  VP56mv *pmv, int ref, int z, int idx, int sb)
957 {
958  static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
959  [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
960  { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
961  [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
962  { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
963  [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
964  { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
965  [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
966  { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
967  [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
968  { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
969  [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
970  { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
971  [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
972  { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
973  [BS_16x8] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
974  { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
975  [BS_8x16] = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
976  { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
977  [BS_8x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
978  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
979  [BS_8x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
980  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
981  [BS_4x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
982  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
983  [BS_4x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
984  { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
985  };
986  VP9Block *b = s->b;
987  int row = s->row, col = s->col, row7 = s->row7;
988  const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
989 #define INVALID_MV 0x80008000U
990  uint32_t mem = INVALID_MV;
991  int i;
992 
993 #define RETURN_DIRECT_MV(mv) \
994  do { \
995  uint32_t m = AV_RN32A(&mv); \
996  if (!idx) { \
997  AV_WN32A(pmv, m); \
998  return; \
999  } else if (mem == INVALID_MV) { \
1000  mem = m; \
1001  } else if (m != mem) { \
1002  AV_WN32A(pmv, m); \
1003  return; \
1004  } \
1005  } while (0)
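/* Editorial note (not an original comment): idx selects which distinct
 * candidate MV to return. With idx == 0 the first candidate found is
 * returned immediately; with idx == 1 the first candidate is remembered
 * in 'mem' and the search continues until a second, different MV is
 * found. */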
1006 
1007  if (sb >= 0) {
1008  if (sb == 2 || sb == 1) {
1009  RETURN_DIRECT_MV(b->mv[0][z]);
1010  } else if (sb == 3) {
1011  RETURN_DIRECT_MV(b->mv[2][z]);
1012  RETURN_DIRECT_MV(b->mv[1][z]);
1013  RETURN_DIRECT_MV(b->mv[0][z]);
1014  }
1015 
1016 #define RETURN_MV(mv) \
1017  do { \
1018  if (sb > 0) { \
1019  VP56mv tmp; \
1020  uint32_t m; \
1021  clamp_mv(&tmp, &mv, s); \
1022  m = AV_RN32A(&tmp); \
1023  if (!idx) { \
1024  AV_WN32A(pmv, m); \
1025  return; \
1026  } else if (mem == INVALID_MV) { \
1027  mem = m; \
1028  } else if (m != mem) { \
1029  AV_WN32A(pmv, m); \
1030  return; \
1031  } \
1032  } else { \
1033  uint32_t m = AV_RN32A(&mv); \
1034  if (!idx) { \
1035  clamp_mv(pmv, &mv, s); \
1036  return; \
1037  } else if (mem == INVALID_MV) { \
1038  mem = m; \
1039  } else if (m != mem) { \
1040  clamp_mv(pmv, &mv, s); \
1041  return; \
1042  } \
1043  } \
1044  } while (0)
1045 
1046  if (row > 0) {
1047  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1048  if (mv->ref[0] == ref) {
1049  RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1050  } else if (mv->ref[1] == ref) {
1051  RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
1052  }
1053  }
1054  if (col > s->tiling.tile_col_start) {
1055  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1056  if (mv->ref[0] == ref) {
1057  RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1058  } else if (mv->ref[1] == ref) {
1059  RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1060  }
1061  }
1062  i = 2;
1063  } else {
1064  i = 0;
1065  }
1066 
1067  // previously coded MVs in this neighbourhood, using same reference frame
1068  for (; i < 8; i++) {
1069  int c = p[i][0] + col, r = p[i][1] + row;
1070 
1071  if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1072  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1073 
1074  if (mv->ref[0] == ref) {
1075  RETURN_MV(mv->mv[0]);
1076  } else if (mv->ref[1] == ref) {
1077  RETURN_MV(mv->mv[1]);
1078  }
1079  }
1080  }
1081 
1082  // MV at this position in previous frame, using same reference frame
1083  if (s->use_last_frame_mvs) {
1084  struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1085 
1086  if (!s->last_uses_2pass)
1087  ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1088  if (mv->ref[0] == ref) {
1089  RETURN_MV(mv->mv[0]);
1090  } else if (mv->ref[1] == ref) {
1091  RETURN_MV(mv->mv[1]);
1092  }
1093  }
1094 
1095 #define RETURN_SCALE_MV(mv, scale) \
1096  do { \
1097  if (scale) { \
1098  VP56mv mv_temp = { -mv.x, -mv.y }; \
1099  RETURN_MV(mv_temp); \
1100  } else { \
1101  RETURN_MV(mv); \
1102  } \
1103  } while (0)
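/* Editorial note (not an original comment): when the candidate MV's
 * reference frame has the opposite sign bias, i.e. points in the opposite
 * temporal direction, the MV is used with both components negated. */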
1104 
1105  // previously coded MVs in this neighbourhood, using different reference frame
1106  for (i = 0; i < 8; i++) {
1107  int c = p[i][0] + col, r = p[i][1] + row;
1108 
1109  if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1110  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1111 
1112  if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1113  RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1114  }
1115  if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1116  // BUG - libvpx has this condition regardless of whether
1117  // we used the first ref MV and pre-scaling
1118  AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1119  RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1120  }
1121  }
1122  }
1123 
1124  // MV at this position in previous frame, using different reference frame
1125  if (s->use_last_frame_mvs) {
1126  struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1127 
1128  // no need to await_progress, because we already did that above
1129  if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1130  RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1131  }
1132  if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1133  // BUG - libvpx has this condition regardless of whether
1134  // we used the first ref MV and pre-scaling
1135  AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1136  RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1137  }
1138  }
1139 
1140  AV_ZERO32(pmv);
1141 #undef INVALID_MV
1142 #undef RETURN_MV
1143 #undef RETURN_SCALE_MV
1144 }
1145 
1146 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
1147 {
1148  int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1149  int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1150  s->prob.p.mv_comp[idx].classes);
1151 
1152  s->counts.mv_comp[idx].sign[sign]++;
1153  s->counts.mv_comp[idx].classes[c]++;
1154  if (c) {
1155  int m;
1156 
1157  for (n = 0, m = 0; m < c; m++) {
1158  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1159  n |= bit << m;
1160  s->counts.mv_comp[idx].bits[m][bit]++;
1161  }
1162  n <<= 3;
1163  bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1164  n |= bit << 1;
1165  s->counts.mv_comp[idx].fp[bit]++;
1166  if (hp) {
1167  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1168  s->counts.mv_comp[idx].hp[bit]++;
1169  n |= bit;
1170  } else {
1171  n |= 1;
1172  // bug in libvpx - we count for bw entropy purposes even if the
1173  // bit wasn't coded
1174  s->counts.mv_comp[idx].hp[1]++;
1175  }
1176  n += 8 << c;
1177  } else {
1178  n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1179  s->counts.mv_comp[idx].class0[n]++;
1180  bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1181  s->prob.p.mv_comp[idx].class0_fp[n]);
1182  s->counts.mv_comp[idx].class0_fp[n][bit]++;
1183  n = (n << 3) | (bit << 1);
1184  if (hp) {
1185  bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1186  s->counts.mv_comp[idx].class0_hp[bit]++;
1187  n |= bit;
1188  } else {
1189  n |= 1;
1190  // bug in libvpx - we count for bw entropy purposes even if the
1191  // bit wasn't coded
1192  s->counts.mv_comp[idx].class0_hp[1]++;
1193  }
1194  }
1195 
1196  return sign ? -(n + 1) : (n + 1);
1197 }
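/* Editorial illustration (not an original comment): the returned magnitude
 * is n + 1 in 1/8-pel units, where n packs integer bits << 3 |
 * fractional (fp) << 1 | high-precision (hp) bit. E.g. for class c = 1,
 * extra bit 1, fp = 2, hp = 1: n = (1 << 3 | 2 << 1 | 1) + (8 << 1) = 29,
 * so the component decodes to +/-30 depending on the sign bit. */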
1198 
1199 static void fill_mv(VP9Context *s,
1200  VP56mv *mv, int mode, int sb)
1201 {
1202  VP9Block *b = s->b;
1203 
1204  if (mode == ZEROMV) {
1205  AV_ZERO64(mv);
1206  } else {
1207  int hp;
1208 
1209  // FIXME cache this value and reuse for other subblocks
1210  find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1211  mode == NEWMV ? -1 : sb);
1212  // FIXME maybe move this code into find_ref_mvs()
1213  if ((mode == NEWMV || sb == -1) &&
1214  !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1215  if (mv[0].y & 1) {
1216  if (mv[0].y < 0)
1217  mv[0].y++;
1218  else
1219  mv[0].y--;
1220  }
1221  if (mv[0].x & 1) {
1222  if (mv[0].x < 0)
1223  mv[0].x++;
1224  else
1225  mv[0].x--;
1226  }
1227  }
1228  if (mode == NEWMV) {
1229  enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1230  s->prob.p.mv_joint);
1231 
1232  s->counts.mv_joint[j]++;
1233  if (j >= MV_JOINT_V)
1234  mv[0].y += read_mv_component(s, 0, hp);
1235  if (j & 1)
1236  mv[0].x += read_mv_component(s, 1, hp);
1237  }
1238 
1239  if (b->comp) {
1240  // FIXME cache this value and reuse for other subblocks
1241  find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1242  mode == NEWMV ? -1 : sb);
1243  if ((mode == NEWMV || sb == -1) &&
1244  !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1245  if (mv[1].y & 1) {
1246  if (mv[1].y < 0)
1247  mv[1].y++;
1248  else
1249  mv[1].y--;
1250  }
1251  if (mv[1].x & 1) {
1252  if (mv[1].x < 0)
1253  mv[1].x++;
1254  else
1255  mv[1].x--;
1256  }
1257  }
1258  if (mode == NEWMV) {
1259  enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1260  s->prob.p.mv_joint);
1261 
1262  s->counts.mv_joint[j]++;
1263  if (j >= MV_JOINT_V)
1264  mv[1].y += read_mv_component(s, 0, hp);
1265  if (j & 1)
1266  mv[1].x += read_mv_component(s, 1, hp);
1267  }
1268  }
1269  }
1270 }
1271 
1272 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1273  ptrdiff_t stride, int v)
1274 {
1275  switch (w) {
1276  case 1:
1277  do {
1278  *ptr = v;
1279  ptr += stride;
1280  } while (--h);
1281  break;
1282  case 2: {
1283  int v16 = v * 0x0101;
1284  do {
1285  AV_WN16A(ptr, v16);
1286  ptr += stride;
1287  } while (--h);
1288  break;
1289  }
1290  case 4: {
1291  uint32_t v32 = v * 0x01010101;
1292  do {
1293  AV_WN32A(ptr, v32);
1294  ptr += stride;
1295  } while (--h);
1296  break;
1297  }
1298  case 8: {
1299 #if HAVE_FAST_64BIT
1300  uint64_t v64 = v * 0x0101010101010101ULL;
1301  do {
1302  AV_WN64A(ptr, v64);
1303  ptr += stride;
1304  } while (--h);
1305 #else
1306  uint32_t v32 = v * 0x01010101;
1307  do {
1308  AV_WN32A(ptr, v32);
1309  AV_WN32A(ptr + 4, v32);
1310  ptr += stride;
1311  } while (--h);
1312 #endif
1313  break;
1314  }
1315  }
1316 }
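/* Editorial note (not an original comment): setctx_2d() splats a single
 * context value over a w x h rectangle of bytes; decode_mode() below uses
 * it e.g. to write b->seg_id into the per-frame segmentation map for every
 * 8x8 block covered by the current block. */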
1317 
1318 static void decode_mode(AVCodecContext *ctx)
1319 {
1320  static const uint8_t left_ctx[N_BS_SIZES] = {
1321  0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1322  };
1323  static const uint8_t above_ctx[N_BS_SIZES] = {
1324  0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
1325  };
1326  static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1327  TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1328  TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1329  };
1330  VP9Context *s = ctx->priv_data;
1331  VP9Block *b = s->b;
1332  int row = s->row, col = s->col, row7 = s->row7;
1333  enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
1334  int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
1335  int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]), y;
1336  int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
1337  int vref, filter_id;
1338 
1339  if (!s->segmentation.enabled) {
1340  b->seg_id = 0;
1341  } else if (s->keyframe || s->intraonly) {
1342  b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg);
1343  } else if (!s->segmentation.update_map ||
1344  (s->segmentation.temporal &&
1345  vp56_rac_get_prob_branchy(&s->c,
1346  s->prob.segpred[s->above_segpred_ctx[col] +
1347  s->left_segpred_ctx[row7]]))) {
1348  if (!s->errorres) {
1349  int pred = 8, x;
1350  uint8_t *refsegmap = s->frames[LAST_FRAME].segmentation_map;
1351 
1352  if (!s->last_uses_2pass)
1353  ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1354  for (y = 0; y < h4; y++)
1355  for (x = 0; x < w4; x++)
1356  pred = FFMIN(pred, refsegmap[(y + row) * 8 * s->sb_cols + x + col]);
1357  av_assert1(pred < 8);
1358  b->seg_id = pred;
1359  } else {
1360  b->seg_id = 0;
1361  }
1362 
1363  memset(&s->above_segpred_ctx[col], 1, w4);
1364  memset(&s->left_segpred_ctx[row7], 1, h4);
1365  } else {
1366  b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1367  s->prob.seg);
1368 
1369  memset(&s->above_segpred_ctx[col], 0, w4);
1370  memset(&s->left_segpred_ctx[row7], 0, h4);
1371  }
1372  if (s->segmentation.enabled &&
1373  (s->segmentation.update_map || s->keyframe || s->intraonly)) {
1374  setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1375  w4, h4, 8 * s->sb_cols, b->seg_id);
1376  }
1377 
1378  b->skip = s->segmentation.enabled &&
1379  s->segmentation.feat[b->seg_id].skip_enabled;
1380  if (!b->skip) {
1381  int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1382  b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1383  s->counts.skip[c][b->skip]++;
1384  }
1385 
1386  if (s->keyframe || s->intraonly) {
1387  b->intra = 1;
1388  } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
1389  b->intra = !s->segmentation.feat[b->seg_id].ref_val;
1390  } else {
1391  int c, bit;
1392 
1393  if (have_a && have_l) {
1394  c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1395  c += (c == 2);
1396  } else {
1397  c = have_a ? 2 * s->above_intra_ctx[col] :
1398  have_l ? 2 * s->left_intra_ctx[row7] : 0;
1399  }
1400  bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1401  s->counts.intra[c][bit]++;
1402  b->intra = !bit;
1403  }
1404 
1405  if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
1406  int c;
1407  if (have_a) {
1408  if (have_l) {
1409  c = (s->above_skip_ctx[col] ? max_tx :
1410  s->above_txfm_ctx[col]) +
1411  (s->left_skip_ctx[row7] ? max_tx :
1412  s->left_txfm_ctx[row7]) > max_tx;
1413  } else {
1414  c = s->above_skip_ctx[col] ? 1 :
1415  (s->above_txfm_ctx[col] * 2 > max_tx);
1416  }
1417  } else if (have_l) {
1418  c = s->left_skip_ctx[row7] ? 1 :
1419  (s->left_txfm_ctx[row7] * 2 > max_tx);
1420  } else {
1421  c = 1;
1422  }
1423  switch (max_tx) {
1424  case TX_32X32:
1425  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1426  if (b->tx) {
1427  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1428  if (b->tx == 2)
1429  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1430  }
1431  s->counts.tx32p[c][b->tx]++;
1432  break;
1433  case TX_16X16:
1434  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1435  if (b->tx)
1436  b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1437  s->counts.tx16p[c][b->tx]++;
1438  break;
1439  case TX_8X8:
1440  b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1441  s->counts.tx8p[c][b->tx]++;
1442  break;
1443  case TX_4X4:
1444  b->tx = TX_4X4;
1445  break;
1446  }
1447  } else {
1448  b->tx = FFMIN(max_tx, s->txfmmode);
1449  }
1450 
1451  if (s->keyframe || s->intraonly) {
1452  uint8_t *a = &s->above_mode_ctx[col * 2];
1453  uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1454 
1455  b->comp = 0;
1456  if (b->bs > BS_8x8) {
1457  // FIXME the memory storage intermediates here aren't really
1458  // necessary, they're just there to make the code slightly
1459  // simpler for now
1460  b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1461  vp9_default_kf_ymode_probs[a[0]][l[0]]);
1462  if (b->bs != BS_8x4) {
1463  b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1464  vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1465  l[0] = a[1] = b->mode[1];
1466  } else {
1467  l[0] = a[1] = b->mode[1] = b->mode[0];
1468  }
1469  if (b->bs != BS_4x8) {
1470  b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1471  vp9_default_kf_ymode_probs[a[0]][l[1]]);
1472  if (b->bs != BS_8x4) {
1473  b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1474  vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1475  l[1] = a[1] = b->mode[3];
1476  } else {
1477  l[1] = a[1] = b->mode[3] = b->mode[2];
1478  }
1479  } else {
1480  b->mode[2] = b->mode[0];
1481  l[1] = a[1] = b->mode[3] = b->mode[1];
1482  }
1483  } else {
1484  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1485  vp9_default_kf_ymode_probs[*a][*l]);
1486  b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1487  // FIXME this can probably be optimized
1488  memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1489  memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1490  }
1491  b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1492  vp9_default_kf_uvmode_probs[b->mode[3]]);
1493  } else if (b->intra) {
1494  b->comp = 0;
1495  if (b->bs > BS_8x8) {
1496  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1497  s->prob.p.y_mode[0]);
1498  s->counts.y_mode[0][b->mode[0]]++;
1499  if (b->bs != BS_8x4) {
1500  b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1501  s->prob.p.y_mode[0]);
1502  s->counts.y_mode[0][b->mode[1]]++;
1503  } else {
1504  b->mode[1] = b->mode[0];
1505  }
1506  if (b->bs != BS_4x8) {
1507  b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1508  s->prob.p.y_mode[0]);
1509  s->counts.y_mode[0][b->mode[2]]++;
1510  if (b->bs != BS_8x4) {
1511  b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1512  s->prob.p.y_mode[0]);
1513  s->counts.y_mode[0][b->mode[3]]++;
1514  } else {
1515  b->mode[3] = b->mode[2];
1516  }
1517  } else {
1518  b->mode[2] = b->mode[0];
1519  b->mode[3] = b->mode[1];
1520  }
1521  } else {
1522  static const uint8_t size_group[10] = {
1523  3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1524  };
1525  int sz = size_group[b->bs];
1526 
1527  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1528  s->prob.p.y_mode[sz]);
1529  b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1530  s->counts.y_mode[sz][b->mode[3]]++;
1531  }
1532  b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1533  s->prob.p.uv_mode[b->mode[3]]);
1534  s->counts.uv_mode[b->mode[3]][b->uvmode]++;
1535  } else {
1536  static const uint8_t inter_mode_ctx_lut[14][14] = {
1537  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1538  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1539  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1540  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1541  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1542  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1543  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1544  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1545  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1546  { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1547  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1548  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1549  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1550  { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
1551  };
1552 
1553  if (s->segmentation.feat[b->seg_id].ref_enabled) {
1554  av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
1555  b->comp = 0;
1556  b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
1557  } else {
1558  // read comp_pred flag
1559  if (s->comppredmode != PRED_SWITCHABLE) {
1560  b->comp = s->comppredmode == PRED_COMPREF;
1561  } else {
1562  int c;
1563 
1564  // FIXME add intra as ref=0xff (or -1) to make these easier?
1565  if (have_a) {
1566  if (have_l) {
1567  if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1568  c = 4;
1569  } else if (s->above_comp_ctx[col]) {
1570  c = 2 + (s->left_intra_ctx[row7] ||
1571  s->left_ref_ctx[row7] == s->fixcompref);
1572  } else if (s->left_comp_ctx[row7]) {
1573  c = 2 + (s->above_intra_ctx[col] ||
1574  s->above_ref_ctx[col] == s->fixcompref);
1575  } else {
1576  c = (!s->above_intra_ctx[col] &&
1577  s->above_ref_ctx[col] == s->fixcompref) ^
1578  (!s->left_intra_ctx[row7] &&
1579  s->left_ref_ctx[row & 7] == s->fixcompref);
1580  }
1581  } else {
1582  c = s->above_comp_ctx[col] ? 3 :
1583  (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
1584  }
1585  } else if (have_l) {
1586  c = s->left_comp_ctx[row7] ? 3 :
1587  (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
1588  } else {
1589  c = 1;
1590  }
1591  b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1592  s->counts.comp[c][b->comp]++;
1593  }
1594 
1595  // read actual references
1596  // FIXME probably cache a few variables here to prevent repetitive
1597  // memory accesses below
1598  if (b->comp) /* two references */ {
1599  int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
1600 
1601  b->ref[fix_idx] = s->fixcompref;
1602  // FIXME can this codeblob be replaced by some sort of LUT?
1603  if (have_a) {
1604  if (have_l) {
1605  if (s->above_intra_ctx[col]) {
1606  if (s->left_intra_ctx[row7]) {
1607  c = 2;
1608  } else {
1609  c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1610  }
1611  } else if (s->left_intra_ctx[row7]) {
1612  c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
1613  } else {
1614  int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1615 
1616  if (refl == refa && refa == s->varcompref[1]) {
1617  c = 0;
1618  } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1619  if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
1620  (refl == s->fixcompref && refa == s->varcompref[0])) {
1621  c = 4;
1622  } else {
1623  c = (refa == refl) ? 3 : 1;
1624  }
1625  } else if (!s->left_comp_ctx[row7]) {
1626  if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
1627  c = 1;
1628  } else {
1629  c = (refl == s->varcompref[1] &&
1630  refa != s->varcompref[1]) ? 2 : 4;
1631  }
1632  } else if (!s->above_comp_ctx[col]) {
1633  if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
1634  c = 1;
1635  } else {
1636  c = (refa == s->varcompref[1] &&
1637  refl != s->varcompref[1]) ? 2 : 4;
1638  }
1639  } else {
1640  c = (refl == refa) ? 4 : 2;
1641  }
1642  }
1643  } else {
1644  if (s->above_intra_ctx[col]) {
1645  c = 2;
1646  } else if (s->above_comp_ctx[col]) {
1647  c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
1648  } else {
1649  c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
1650  }
1651  }
1652  } else if (have_l) {
1653  if (s->left_intra_ctx[row7]) {
1654  c = 2;
1655  } else if (s->left_comp_ctx[row7]) {
1656  c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1657  } else {
1658  c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1659  }
1660  } else {
1661  c = 2;
1662  }
1663  bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1664  b->ref[var_idx] = s->varcompref[bit];
1665  s->counts.comp_ref[c][bit]++;
1666  } else /* single reference */ {
1667  int bit, c;
1668 
1669  if (have_a && !s->above_intra_ctx[col]) {
1670  if (have_l && !s->left_intra_ctx[row7]) {
1671  if (s->left_comp_ctx[row7]) {
1672  if (s->above_comp_ctx[col]) {
1673  c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
1674  !s->above_ref_ctx[col]);
1675  } else {
1676  c = (3 * !s->above_ref_ctx[col]) +
1677  (!s->fixcompref || !s->left_ref_ctx[row7]);
1678  }
1679  } else if (s->above_comp_ctx[col]) {
1680  c = (3 * !s->left_ref_ctx[row7]) +
1681  (!s->fixcompref || !s->above_ref_ctx[col]);
1682  } else {
1683  c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1684  }
1685  } else if (s->above_intra_ctx[col]) {
1686  c = 2;
1687  } else if (s->above_comp_ctx[col]) {
1688  c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
1689  } else {
1690  c = 4 * (!s->above_ref_ctx[col]);
1691  }
1692  } else if (have_l && !s->left_intra_ctx[row7]) {
1693  if (s->left_intra_ctx[row7]) {
1694  c = 2;
1695  } else if (s->left_comp_ctx[row7]) {
1696  c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
1697  } else {
1698  c = 4 * (!s->left_ref_ctx[row7]);
1699  }
1700  } else {
1701  c = 2;
1702  }
1703  bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1704  s->counts.single_ref[c][0][bit]++;
1705  if (!bit) {
1706  b->ref[0] = 0;
1707  } else {
1708  // FIXME can this codeblob be replaced by some sort of LUT?
1709  if (have_a) {
1710  if (have_l) {
1711  if (s->left_intra_ctx[row7]) {
1712  if (s->above_intra_ctx[col]) {
1713  c = 2;
1714  } else if (s->above_comp_ctx[col]) {
1715  c = 1 + 2 * (s->fixcompref == 1 ||
1716  s->above_ref_ctx[col] == 1);
1717  } else if (!s->above_ref_ctx[col]) {
1718  c = 3;
1719  } else {
1720  c = 4 * (s->above_ref_ctx[col] == 1);
1721  }
1722  } else if (s->above_intra_ctx[col]) {
1723  if (s->left_intra_ctx[row7]) {
1724  c = 2;
1725  } else if (s->left_comp_ctx[row7]) {
1726  c = 1 + 2 * (s->fixcompref == 1 ||
1727  s->left_ref_ctx[row7] == 1);
1728  } else if (!s->left_ref_ctx[row7]) {
1729  c = 3;
1730  } else {
1731  c = 4 * (s->left_ref_ctx[row7] == 1);
1732  }
1733  } else if (s->above_comp_ctx[col]) {
1734  if (s->left_comp_ctx[row7]) {
1735  if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1736  c = 3 * (s->fixcompref == 1 ||
1737  s->left_ref_ctx[row7] == 1);
1738  } else {
1739  c = 2;
1740  }
1741  } else if (!s->left_ref_ctx[row7]) {
1742  c = 1 + 2 * (s->fixcompref == 1 ||
1743  s->above_ref_ctx[col] == 1);
1744  } else {
1745  c = 3 * (s->left_ref_ctx[row7] == 1) +
1746  (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1747  }
1748  } else if (s->left_comp_ctx[row7]) {
1749  if (!s->above_ref_ctx[col]) {
1750  c = 1 + 2 * (s->fixcompref == 1 ||
1751  s->left_ref_ctx[row7] == 1);
1752  } else {
1753  c = 3 * (s->above_ref_ctx[col] == 1) +
1754  (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1755  }
1756  } else if (!s->above_ref_ctx[col]) {
1757  if (!s->left_ref_ctx[row7]) {
1758  c = 3;
1759  } else {
1760  c = 4 * (s->left_ref_ctx[row7] == 1);
1761  }
1762  } else if (!s->left_ref_ctx[row7]) {
1763  c = 4 * (s->above_ref_ctx[col] == 1);
1764  } else {
1765  c = 2 * (s->left_ref_ctx[row7] == 1) +
1766  2 * (s->above_ref_ctx[col] == 1);
1767  }
1768  } else {
1769  if (s->above_intra_ctx[col] ||
1770  (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1771  c = 2;
1772  } else if (s->above_comp_ctx[col]) {
1773  c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1774  } else {
1775  c = 4 * (s->above_ref_ctx[col] == 1);
1776  }
1777  }
1778  } else if (have_l) {
1779  if (s->left_intra_ctx[row7] ||
1780  (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1781  c = 2;
1782  } else if (s->left_comp_ctx[row7]) {
1783  c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1784  } else {
1785  c = 4 * (s->left_ref_ctx[row7] == 1);
1786  }
1787  } else {
1788  c = 2;
1789  }
1790  bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1791  s->counts.single_ref[c][1][bit]++;
1792  b->ref[0] = 1 + bit;
1793  }
1794  }
1795  }
1796 
1797  if (b->bs <= BS_8x8) {
1798  if (s->segmentation.feat[b->seg_id].skip_enabled) {
1799  b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1800  } else {
1801  static const uint8_t off[10] = {
1802  3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1803  };
1804 
1805  // FIXME this needs to use the LUT tables from find_ref_mvs
1806  // because not all are -1,0/0,-1
1807  int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1808  [s->left_mode_ctx[row7 + off[b->bs]]];
1809 
1810  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1811  s->prob.p.mv_mode[c]);
1812  b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1813  s->counts.mv_mode[c][b->mode[0] - 10]++;
1814  }
1815  }
1816 
1817  if (s->filtermode == FILTER_SWITCHABLE) {
1818  int c;
1819 
1820  if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1821  if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1822  c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1823  s->left_filter_ctx[row7] : 3;
1824  } else {
1825  c = s->above_filter_ctx[col];
1826  }
1827  } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1828  c = s->left_filter_ctx[row7];
1829  } else {
1830  c = 3;
1831  }
1832 
1833  filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1834  s->prob.p.filter[c]);
1835  s->counts.filter[c][filter_id]++;
1836  b->filter = vp9_filter_lut[filter_id];
1837  } else {
1838  b->filter = s->filtermode;
1839  }
1840 
1841  if (b->bs > BS_8x8) {
1842  int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1843 
1844  b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1845  s->prob.p.mv_mode[c]);
1846  s->counts.mv_mode[c][b->mode[0] - 10]++;
1847  fill_mv(s, b->mv[0], b->mode[0], 0);
1848 
1849  if (b->bs != BS_8x4) {
1850  b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1851  s->prob.p.mv_mode[c]);
1852  s->counts.mv_mode[c][b->mode[1] - 10]++;
1853  fill_mv(s, b->mv[1], b->mode[1], 1);
1854  } else {
1855  b->mode[1] = b->mode[0];
1856  AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1857  AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1858  }
1859 
1860  if (b->bs != BS_4x8) {
1861  b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1862  s->prob.p.mv_mode[c]);
1863  s->counts.mv_mode[c][b->mode[2] - 10]++;
1864  fill_mv(s, b->mv[2], b->mode[2], 2);
1865 
1866  if (b->bs != BS_8x4) {
1867  b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1868  s->prob.p.mv_mode[c]);
1869  s->counts.mv_mode[c][b->mode[3] - 10]++;
1870  fill_mv(s, b->mv[3], b->mode[3], 3);
1871  } else {
1872  b->mode[3] = b->mode[2];
1873  AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1874  AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1875  }
1876  } else {
1877  b->mode[2] = b->mode[0];
1878  AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1879  AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1880  b->mode[3] = b->mode[1];
1881  AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1882  AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
1883  }
1884  } else {
1885  fill_mv(s, b->mv[0], b->mode[0], -1);
1886  AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1887  AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1888  AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
1889  AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1890  AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1891  AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
1892  }
1893 
1894  vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
1895  }
1896 
1897 #if HAVE_FAST_64BIT
1898 #define SPLAT_CTX(var, val, n) \
1899  switch (n) { \
1900  case 1: var = val; break; \
1901  case 2: AV_WN16A(&var, val * 0x0101); break; \
1902  case 4: AV_WN32A(&var, val * 0x01010101); break; \
1903  case 8: AV_WN64A(&var, val * 0x0101010101010101ULL); break; \
1904  case 16: { \
1905  uint64_t v64 = val * 0x0101010101010101ULL; \
1906  AV_WN64A( &var, v64); \
1907  AV_WN64A(&((uint8_t *) &var)[8], v64); \
1908  break; \
1909  } \
1910  }
1911 #else
1912 #define SPLAT_CTX(var, val, n) \
1913  switch (n) { \
1914  case 1: var = val; break; \
1915  case 2: AV_WN16A(&var, val * 0x0101); break; \
1916  case 4: AV_WN32A(&var, val * 0x01010101); break; \
1917  case 8: { \
1918  uint32_t v32 = val * 0x01010101; \
1919  AV_WN32A( &var, v32); \
1920  AV_WN32A(&((uint8_t *) &var)[4], v32); \
1921  break; \
1922  } \
1923  case 16: { \
1924  uint32_t v32 = val * 0x01010101; \
1925  AV_WN32A( &var, v32); \
1926  AV_WN32A(&((uint8_t *) &var)[4], v32); \
1927  AV_WN32A(&((uint8_t *) &var)[8], v32); \
1928  AV_WN32A(&((uint8_t *) &var)[12], v32); \
1929  break; \
1930  } \
1931  }
1932 #endif
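 /* A hedged note on the byte-splat trick both SPLAT_CTX variants rely on:
  * multiplying an 8-bit value by a constant with 0x01 in every byte
  * replicates the value across the whole word, e.g.
  * 0x03 * 0x01010101 == 0x03030303, so a single aligned store fills
  * 2/4/8/16 context bytes at once instead of a per-byte loop. */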
1933 
1934  switch (bwh_tab[1][b->bs][0]) {
1935 #define SET_CTXS(dir, off, n) \
1936  do { \
1937  SPLAT_CTX(s->dir##_skip_ctx[off], b->skip, n); \
1938  SPLAT_CTX(s->dir##_txfm_ctx[off], b->tx, n); \
1939  SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
1940  if (!s->keyframe && !s->intraonly) { \
1941  SPLAT_CTX(s->dir##_intra_ctx[off], b->intra, n); \
1942  SPLAT_CTX(s->dir##_comp_ctx[off], b->comp, n); \
1943  SPLAT_CTX(s->dir##_mode_ctx[off], b->mode[3], n); \
1944  if (!b->intra) { \
1945  SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
1946  if (s->filtermode == FILTER_SWITCHABLE) { \
1947  SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
1948  } \
1949  } \
1950  } \
1951  } while (0)
1952  case 1: SET_CTXS(above, col, 1); break;
1953  case 2: SET_CTXS(above, col, 2); break;
1954  case 4: SET_CTXS(above, col, 4); break;
1955  case 8: SET_CTXS(above, col, 8); break;
1956  }
1957  switch (bwh_tab[1][b->bs][1]) {
1958  case 1: SET_CTXS(left, row7, 1); break;
1959  case 2: SET_CTXS(left, row7, 2); break;
1960  case 4: SET_CTXS(left, row7, 4); break;
1961  case 8: SET_CTXS(left, row7, 8); break;
1962  }
1963 #undef SPLAT_CTX
1964 #undef SET_CTXS
1965 
1966  if (!s->keyframe && !s->intraonly) {
1967  if (b->bs > BS_8x8) {
1968  int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1969 
1970  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
1971  AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
1972  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
1973  AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
1974  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
1975  AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
1976  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
1977  AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
1978  } else {
1979  int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1980 
1981  for (n = 0; n < w4 * 2; n++) {
1982  AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
1983  AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
1984  }
1985  for (n = 0; n < h4 * 2; n++) {
1986  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
1987  AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
1988  }
1989  }
1990  }
1991 
1992  // FIXME kinda ugly
1993  for (y = 0; y < h4; y++) {
1994  int x, o = (row + y) * s->sb_cols * 8 + col;
1995  struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];
1996 
1997  if (b->intra) {
1998  for (x = 0; x < w4; x++) {
1999  mv[x].ref[0] =
2000  mv[x].ref[1] = -1;
2001  }
2002  } else if (b->comp) {
2003  for (x = 0; x < w4; x++) {
2004  mv[x].ref[0] = b->ref[0];
2005  mv[x].ref[1] = b->ref[1];
2006  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2007  AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2008  }
2009  } else {
2010  for (x = 0; x < w4; x++) {
2011  mv[x].ref[0] = b->ref[0];
2012  mv[x].ref[1] = -1;
2013  AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2014  }
2015  }
2016  }
2017 }
2018 
2019 // FIXME merge cnt/eob arguments?
2020 static av_always_inline int
2021 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2022  int is_tx32x32, unsigned (*cnt)[6][3],
2023  unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2024  int nnz, const int16_t *scan, const int16_t (*nb)[2],
2025  const int16_t *band_counts, const int16_t *qmul)
2026 {
2027  int i = 0, band = 0, band_left = band_counts[band];
2028  uint8_t *tp = p[0][nnz];
2029  uint8_t cache[1024];
2030 
2031  do {
2032  int val, rc;
2033 
2034  val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2035  eob[band][nnz][val]++;
2036  if (!val)
2037  break;
2038 
2039  skip_eob:
2040  if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2041  cnt[band][nnz][0]++;
2042  if (!--band_left)
2043  band_left = band_counts[++band];
2044  cache[scan[i]] = 0;
2045  nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2046  tp = p[band][nnz];
2047  if (++i == n_coeffs)
2048  break; // invalid input; blocks should end with EOB
2049  goto skip_eob;
2050  }
2051 
2052  rc = scan[i];
2053  if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2054  cnt[band][nnz][1]++;
2055  val = 1;
2056  cache[rc] = 1;
2057  } else {
2058  // fill in p[3-10] (model fill) - only once per frame for each pos
2059  if (!tp[3])
2060  memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2061 
2062  cnt[band][nnz][2]++;
2063  if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2064  if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2065  cache[rc] = val = 2;
2066  } else {
2067  val = 3 + vp56_rac_get_prob(c, tp[5]);
2068  cache[rc] = 3;
2069  }
2070  } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2071  cache[rc] = 4;
2072  if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2073  val = 5 + vp56_rac_get_prob(c, 159);
2074  } else {
2075  val = 7 + (vp56_rac_get_prob(c, 165) << 1);
2076  val += vp56_rac_get_prob(c, 145);
2077  }
2078  } else { // cat 3-6
2079  cache[rc] = 5;
2080  if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2081  if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2082  val = 11 + (vp56_rac_get_prob(c, 173) << 2);
2083  val += (vp56_rac_get_prob(c, 148) << 1);
2084  val += vp56_rac_get_prob(c, 140);
2085  } else {
2086  val = 19 + (vp56_rac_get_prob(c, 176) << 3);
2087  val += (vp56_rac_get_prob(c, 155) << 2);
2088  val += (vp56_rac_get_prob(c, 140) << 1);
2089  val += vp56_rac_get_prob(c, 135);
2090  }
2091  } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2092  val = 35 + (vp56_rac_get_prob(c, 180) << 4);
2093  val += (vp56_rac_get_prob(c, 157) << 3);
2094  val += (vp56_rac_get_prob(c, 141) << 2);
2095  val += (vp56_rac_get_prob(c, 134) << 1);
2096  val += vp56_rac_get_prob(c, 130);
2097  } else {
2098  val = 67 + (vp56_rac_get_prob(c, 254) << 13);
2099  val += (vp56_rac_get_prob(c, 254) << 12);
2100  val += (vp56_rac_get_prob(c, 254) << 11);
2101  val += (vp56_rac_get_prob(c, 252) << 10);
2102  val += (vp56_rac_get_prob(c, 249) << 9);
2103  val += (vp56_rac_get_prob(c, 243) << 8);
2104  val += (vp56_rac_get_prob(c, 230) << 7);
2105  val += (vp56_rac_get_prob(c, 196) << 6);
2106  val += (vp56_rac_get_prob(c, 177) << 5);
2107  val += (vp56_rac_get_prob(c, 153) << 4);
2108  val += (vp56_rac_get_prob(c, 140) << 3);
2109  val += (vp56_rac_get_prob(c, 133) << 2);
2110  val += (vp56_rac_get_prob(c, 130) << 1);
2111  val += vp56_rac_get_prob(c, 129);
2112  }
2113  }
2114  }
2115  if (!--band_left)
2116  band_left = band_counts[++band];
2117  if (is_tx32x32)
2118  coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
2119  else
2120  coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
2121  nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2122  tp = p[band][nnz];
2123  } while (++i < n_coeffs);
2124 
2125  return i;
2126 }
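/* For reference (derived from the token branches above; every extra
 * probability read contributes one remainder bit), the coefficient
 * magnitudes each branch can produce:
 *   one                1
 *   two                2
 *   three..four        3..4      (1 bit)
 *   cat1               5..6      (1 bit)
 *   cat2               7..10     (2 bits)
 *   cat3              11..18     (3 bits)
 *   cat4              19..34     (4 bits)
 *   cat5              35..66     (5 bits)
 *   cat6              67..16450  (14 bits)
 * e.g. cat4 with remainder bits 1,0,1,1 decodes 19 + 8 + 0 + 2 + 1 = 30. */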
2127 
2128 static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2129  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2130  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2131  const int16_t (*nb)[2], const int16_t *band_counts,
2132  const int16_t *qmul)
2133 {
2134  return decode_coeffs_b_generic(c, coef, n_coeffs, 0, cnt, eob, p,
2135  nnz, scan, nb, band_counts, qmul);
2136 }
2137 
2138 static int decode_coeffs_b32(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2139  unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2140  uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2141  const int16_t (*nb)[2], const int16_t *band_counts,
2142  const int16_t *qmul)
2143 {
2144  return decode_coeffs_b_generic(c, coef, n_coeffs, 1, cnt, eob, p,
2145  nnz, scan, nb, band_counts, qmul);
2146 }
2147 
2148 static void decode_coeffs(AVCodecContext *ctx)
2149 {
2150  VP9Context *s = ctx->priv_data;
2151  VP9Block *b = s->b;
2152  int row = s->row, col = s->col;
2153  uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2154  unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2155  unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
2156  int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
2157  int end_x = FFMIN(2 * (s->cols - col), w4);
2158  int end_y = FFMIN(2 * (s->rows - row), h4);
2159  int n, pl, x, y, res;
2160  int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
2161  int tx = 4 * s->lossless + b->tx;
2162  const int16_t * const *yscans = vp9_scans[tx];
2163  const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2164  const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2165  const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2166  uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2167  uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
2168  static const int16_t band_counts[4][8] = {
2169  { 1, 2, 3, 4, 3, 16 - 13 },
2170  { 1, 2, 3, 4, 11, 64 - 21 },
2171  { 1, 2, 3, 4, 11, 256 - 21 },
2172  { 1, 2, 3, 4, 11, 1024 - 21 },
2173  };
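 /* Sanity check on the table above: each row sums to the coefficient count
  * of its transform size, e.g. 1+2+3+4+3+3 = 16 for TX_4X4 and
  * 1+2+3+4+11+1003 = 1024 for TX_32X32, so every scan position falls into
  * exactly one of the six probability bands. */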
2174  const int16_t *y_band_counts = band_counts[b->tx];
2175  const int16_t *uv_band_counts = band_counts[b->uvtx];
2176 
2177 #define MERGE(la, end, step, rd) \
2178  for (n = 0; n < end; n += step) \
2179  la[n] = !!rd(&la[n])
2180 #define MERGE_CTX(step, rd) \
2181  do { \
2182  MERGE(l, end_y, step, rd); \
2183  MERGE(a, end_x, step, rd); \
2184  } while (0)
2185 
2186 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2187  for (n = 0, y = 0; y < end_y; y += step) { \
2188  for (x = 0; x < end_x; x += step, n += step * step) { \
2189  enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2190  res = decode_coeffs_b##v(&s->c, s->block + 16 * n, 16 * step * step, \
2191  c, e, p, a[x] + l[y], yscans[txtp], \
2192  ynbs[txtp], y_band_counts, qmul[0]); \
2193  a[x] = l[y] = !!res; \
2194  if (step >= 4) { \
2195  AV_WN16A(&s->eob[n], res); \
2196  } else { \
2197  s->eob[n] = res; \
2198  } \
2199  } \
2200  }
2201 
2202 #define SPLAT(la, end, step, cond) \
2203  if (step == 2) { \
2204  for (n = 1; n < end; n += step) \
2205  la[n] = la[n - 1]; \
2206  } else if (step == 4) { \
2207  if (cond) { \
2208  for (n = 0; n < end; n += step) \
2209  AV_WN32A(&la[n], la[n] * 0x01010101); \
2210  } else { \
2211  for (n = 0; n < end; n += step) \
2212  memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2213  } \
2214  } else /* step == 8 */ { \
2215  if (cond) { \
2216  if (HAVE_FAST_64BIT) { \
2217  for (n = 0; n < end; n += step) \
2218  AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2219  } else { \
2220  for (n = 0; n < end; n += step) { \
2221  uint32_t v32 = la[n] * 0x01010101; \
2222  AV_WN32A(&la[n], v32); \
2223  AV_WN32A(&la[n + 4], v32); \
2224  } \
2225  } \
2226  } else { \
2227  for (n = 0; n < end; n += step) \
2228  memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2229  } \
2230  }
2231 #define SPLAT_CTX(step) \
2232  do { \
2233  SPLAT(a, end_x, step, end_x == w4); \
2234  SPLAT(l, end_y, step, end_y == h4); \
2235  } while (0)
2236 
2237  /* y tokens */
2238  switch (b->tx) {
2239  case TX_4X4:
2240  DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2241  break;
2242  case TX_8X8:
2243  MERGE_CTX(2, AV_RN16A);
2244  DECODE_Y_COEF_LOOP(2, 0,);
2245  SPLAT_CTX(2);
2246  break;
2247  case TX_16X16:
2248  MERGE_CTX(4, AV_RN32A);
2249  DECODE_Y_COEF_LOOP(4, 0,);
2250  SPLAT_CTX(4);
2251  break;
2252  case TX_32X32:
2253  MERGE_CTX(8, AV_RN64A);
2254  DECODE_Y_COEF_LOOP(8, 0, 32);
2255  SPLAT_CTX(8);
2256  break;
2257  }
2258 
2259 #define DECODE_UV_COEF_LOOP(step) \
2260  for (n = 0, y = 0; y < end_y; y += step) { \
2261  for (x = 0; x < end_x; x += step, n += step * step) { \
2262  res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n, \
2263  16 * step * step, c, e, p, a[x] + l[y], \
2264  uvscan, uvnb, uv_band_counts, qmul[1]); \
2265  a[x] = l[y] = !!res; \
2266  if (step >= 4) { \
2267  AV_WN16A(&s->uveob[pl][n], res); \
2268  } else { \
2269  s->uveob[pl][n] = res; \
2270  } \
2271  } \
2272  }
2273 
2274  p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2275  c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2276  e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2277  w4 >>= 1;
2278  h4 >>= 1;
2279  end_x >>= 1;
2280  end_y >>= 1;
2281  for (pl = 0; pl < 2; pl++) {
2282  a = &s->above_uv_nnz_ctx[pl][col];
2283  l = &s->left_uv_nnz_ctx[pl][row & 7];
2284  switch (b->uvtx) {
2285  case TX_4X4:
2286  DECODE_UV_COEF_LOOP(1);
2287  break;
2288  case TX_8X8:
2289  MERGE_CTX(2, AV_RN16A);
2290  DECODE_UV_COEF_LOOP(2);
2291  SPLAT_CTX(2);
2292  break;
2293  case TX_16X16:
2294  MERGE_CTX(4, AV_RN32A);
2295  DECODE_UV_COEF_LOOP(4);
2296  SPLAT_CTX(4);
2297  break;
2298  case TX_32X32:
2299  MERGE_CTX(8, AV_RN64A);
2300  // a 64x64 (max) uv block can only ever contain one tx32x32 block,
2301  // so there is no need to loop
2302  res = decode_coeffs_b32(&s->c, s->uvblock[pl],
2303  1024, c, e, p, a[0] + l[0],
2304  uvscan, uvnb, uv_band_counts, qmul[1]);
2305  a[0] = l[0] = !!res;
2306  AV_WN16A(&s->uveob[pl][0], res);
2307  SPLAT_CTX(8);
2308  break;
2309  }
2310  }
2311 }
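/* A rough standalone sketch (illustration only, not decoder code) of what the
 * MERGE_CTX/SPLAT_CTX pairing in decode_coeffs accomplishes for an 8x8
 * transform: the per-4x4 nnz context bytes covered by one transform block are
 * collapsed into a single flag before decoding, and each unit's decoded eob
 * flag is broadcast back over the same bytes afterwards. */
#if 0
static void merge_splat_step2(uint8_t *ctx, int end)
{
    int n;
    for (n = 0; n < end; n += 2)          /* MERGE, step == 2 */
        ctx[n] = !!(ctx[n] | ctx[n + 1]); /* any nonzero 4x4 -> one flag */
    /* ... decode_coeffs_b() then stores each unit's !!eob in ctx[n] ... */
    for (n = 1; n < end; n += 2)          /* SPLAT the flag back over both */
        ctx[n] = ctx[n - 1];
}
#endif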
2312 
2313 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2314  uint8_t *dst_edge, ptrdiff_t stride_edge,
2315  uint8_t *dst_inner, ptrdiff_t stride_inner,
2316  uint8_t *l, int col, int x, int w,
2317  int row, int y, enum TxfmMode tx,
2318  int p)
2319 {
2320  int have_top = row > 0 || y > 0;
2321  int have_left = col > s->tiling.tile_col_start || x > 0;
2322  int have_right = x < w - 1;
2323  static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2324  [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2325  { DC_127_PRED, VERT_PRED } },
2326  [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2327  { HOR_PRED, HOR_PRED } },
2328  [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2329  { LEFT_DC_PRED, DC_PRED } },
2330  [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
2331  { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
2332  [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2333  { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2334  [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
2335  { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
2336  [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
2337  { HOR_DOWN_PRED, HOR_DOWN_PRED } },
2338  [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
2339  { DC_127_PRED, VERT_LEFT_PRED } },
2340  [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2341  { HOR_UP_PRED, HOR_UP_PRED } },
2342  [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2343  { HOR_PRED, TM_VP8_PRED } },
2344  };
2345  static const struct {
2346  uint8_t needs_left:1;
2347  uint8_t needs_top:1;
2348  uint8_t needs_topleft:1;
2349  uint8_t needs_topright:1;
2350  } edges[N_INTRA_PRED_MODES] = {
2351  [VERT_PRED] = { .needs_top = 1 },
2352  [HOR_PRED] = { .needs_left = 1 },
2353  [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2354  [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2355  [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2356  [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2357  [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2358  [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2359  [HOR_UP_PRED] = { .needs_left = 1 },
2360  [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2361  [LEFT_DC_PRED] = { .needs_left = 1 },
2362  [TOP_DC_PRED] = { .needs_top = 1 },
2363  [DC_128_PRED] = { 0 },
2364  [DC_127_PRED] = { 0 },
2365  [DC_129_PRED] = { 0 }
2366  };
2367 
2368  av_assert2(mode >= 0 && mode < 10);
2369  mode = mode_conv[mode][have_left][have_top];
2370  if (edges[mode].needs_top) {
2371  uint8_t *top, *topleft;
2372  int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
2373  int n_px_need_tr = 0;
2374 
2375  if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2376  n_px_need_tr = 4;
2377 
2378  // if top of sb64-row, use s->intra_pred_data[] instead of
2379  // dst[-stride] for intra prediction (it contains pre- instead of
2380  // post-loopfilter data)
2381  if (have_top) {
2382  top = !(row & 7) && !y ?
2383  s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2384  y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2385  if (have_left)
2386  topleft = !(row & 7) && !y ?
2387  s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2388  y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2389  &dst_inner[-stride_inner];
2390  }
2391 
2392  if (have_top &&
2393  (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2394  (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2395  n_px_need + n_px_need_tr <= n_px_have) {
2396  *a = top;
2397  } else {
2398  if (have_top) {
2399  if (n_px_need <= n_px_have) {
2400  memcpy(*a, top, n_px_need);
2401  } else {
2402  memcpy(*a, top, n_px_have);
2403  memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
2404  n_px_need - n_px_have);
2405  }
2406  } else {
2407  memset(*a, 127, n_px_need);
2408  }
2409  if (edges[mode].needs_topleft) {
2410  if (have_left && have_top) {
2411  (*a)[-1] = topleft[-1];
2412  } else {
2413  (*a)[-1] = have_top ? 129 : 127;
2414  }
2415  }
2416  if (tx == TX_4X4 && edges[mode].needs_topright) {
2417  if (have_top && have_right &&
2418  n_px_need + n_px_need_tr <= n_px_have) {
2419  memcpy(&(*a)[4], &top[4], 4);
2420  } else {
2421  memset(&(*a)[4], (*a)[3], 4);
2422  }
2423  }
2424  }
2425  }
2426  if (edges[mode].needs_left) {
2427  if (have_left) {
2428  int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !p) - y) * 4;
2429  uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2430  ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2431 
2432  if (n_px_need <= n_px_have) {
2433  for (i = 0; i < n_px_need; i++)
2434  l[n_px_need - 1 - i] = dst[i * stride - 1];
2435  } else {
2436  for (i = 0; i < n_px_have; i++)
2437  l[n_px_need - 1 - i] = dst[i * stride - 1];
2438  memset(l, l[n_px_need - n_px_have], n_px_need - n_px_have);
2439  }
2440  } else {
2441  memset(l, 129, 4 << tx);
2442  }
2443  }
2444 
2445  return mode;
2446 }
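/* Hedged summary of the fallback pixels synthesized above when real edges
 * are unavailable: a missing top row is filled with 127, a missing left
 * column with 129, and a missing top-left corner becomes 129 when the top
 * row exists and 127 otherwise, mirroring the DC_127_PRED/DC_128_PRED/
 * DC_129_PRED substitute modes selected through mode_conv[]. */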
2447 
2448 static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2449 {
2450  VP9Context *s = ctx->priv_data;
2451  VP9Block *b = s->b;
2452  int row = s->row, col = s->col;
2453  int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2454  int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2455  int end_x = FFMIN(2 * (s->cols - col), w4);
2456  int end_y = FFMIN(2 * (s->rows - row), h4);
2457  int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2458  int uvstep1d = 1 << b->uvtx, p;
2459  uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
2460  LOCAL_ALIGNED_32(uint8_t, a_buf, [64]);
2461  LOCAL_ALIGNED_32(uint8_t, l, [32]);
2462 
2463  for (n = 0, y = 0; y < end_y; y += step1d) {
2464  uint8_t *ptr = dst, *ptr_r = dst_r;
2465  for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d,
2466  ptr_r += 4 * step1d, n += step) {
2467  int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2468  y * 2 + x : 0];
2469  uint8_t *a = &a_buf[32];
2470  enum TxfmType txtp = vp9_intra_txfm_type[mode];
2471  int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2472 
2473  mode = check_intra_mode(s, mode, &a, ptr_r,
2474  s->frames[CUR_FRAME].tf.f->linesize[0],
2475  ptr, s->y_stride, l,
2476  col, x, w4, row, y, b->tx, 0);
2477  s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2478  if (eob)
2479  s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2480  s->block + 16 * n, eob);
2481  }
2482  dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
2483  dst += 4 * step1d * s->y_stride;
2484  }
2485 
2486  // U/V
2487  w4 >>= 1;
2488  end_x >>= 1;
2489  end_y >>= 1;
2490  step = 1 << (b->uvtx * 2);
2491  for (p = 0; p < 2; p++) {
2492  dst = s->dst[1 + p];
2493  dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2494  for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2495  uint8_t *ptr = dst, *ptr_r = dst_r;
2496  for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d,
2497  ptr_r += 4 * uvstep1d, n += step) {
2498  int mode = b->uvmode;
2499  uint8_t *a = &a_buf[16];
2500  int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2501 
2502  mode = check_intra_mode(s, mode, &a, ptr_r,
2503  s->frames[CUR_FRAME].tf.f->linesize[1],
2504  ptr, s->uv_stride, l,
2505  col, x, w4, row, y, b->uvtx, p + 1);
2506  s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2507  if (eob)
2508  s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2509  s->uvblock[p] + 16 * n, eob);
2510  }
2511  dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
2512  dst += 4 * uvstep1d * s->uv_stride;
2513  }
2514  }
2515 }
2516 
2517 static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2518  uint8_t *dst, ptrdiff_t dst_stride,
2519  const uint8_t *ref, ptrdiff_t ref_stride,
2520  ThreadFrame *ref_frame,
2521  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2522  int bw, int bh, int w, int h)
2523 {
2524  int mx = mv->x, my = mv->y, th;
2525 
2526  y += my >> 3;
2527  x += mx >> 3;
2528  ref += y * ref_stride + x;
2529  mx &= 7;
2530  my &= 7;
2531  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2532  // we use +7 because the last 7 pixels of each sbrow can be changed in
2533  // the longest loopfilter of the next sbrow
2534  th = (y + bh + 4 * !!my + 7) >> 6;
2535  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2536  if (x < !!mx * 3 || y < !!my * 3 ||
2537  x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2538  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2539  ref - !!my * 3 * ref_stride - !!mx * 3,
2540  80, ref_stride,
2541  bw + !!mx * 7, bh + !!my * 7,
2542  x - !!mx * 3, y - !!my * 3, w, h);
2543  ref = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2544  ref_stride = 80;
2545  }
2546  mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
2547 }
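/* Standalone sketch (illustration only, not decoder code) of the
 * motion-vector split performed above: luma MVs are in 1/8-pel units, so the
 * integer displacement is mv >> 3 and the subpel phase mv & 7; the phase is
 * later doubled (mx << 1, my << 1) because the shared mc[] functions take
 * 1/16-pel phases. */
#if 0
static void split_luma_mv(int mv, int *int_part, int *phase)
{
    *int_part = mv >> 3; /* arithmetic shift floors: -13 >> 3 == -2 */
    *phase    = mv & 7;  /* non-negative remainder: -13 & 7 == 3 */
    /* so mv = -13 resolves to offset -2 plus a 3/8-pel phase */
}
#endif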
2548 
2549 static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2550  uint8_t *dst_u, uint8_t *dst_v,
2551  ptrdiff_t dst_stride,
2552  const uint8_t *ref_u, ptrdiff_t src_stride_u,
2553  const uint8_t *ref_v, ptrdiff_t src_stride_v,
2554  ThreadFrame *ref_frame,
2555  ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2556  int bw, int bh, int w, int h)
2557 {
2558  int mx = mv->x, my = mv->y, th;
2559 
2560  y += my >> 4;
2561  x += mx >> 4;
2562  ref_u += y * src_stride_u + x;
2563  ref_v += y * src_stride_v + x;
2564  mx &= 15;
2565  my &= 15;
2566  // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2567  // we use +7 because the last 7 pixels of each sbrow can be changed in
2568  // the longest loopfilter of the next sbrow
2569  th = (y + bh + 4 * !!my + 7) >> 5;
2570  ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
2571  if (x < !!mx * 3 || y < !!my * 3 ||
2572  x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2573  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2574  ref_u - !!my * 3 * src_stride_u - !!mx * 3,
2575  80, src_stride_u,
2576  bw + !!mx * 7, bh + !!my * 7,
2577  x - !!mx * 3, y - !!my * 3, w, h);
2578  ref_u = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2579  mc[!!mx][!!my](dst_u, dst_stride, ref_u, 80, bh, mx, my);
2580 
2581  s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2582  ref_v - !!my * 3 * src_stride_v - !!mx * 3,
2583  80, src_stride_v,
2584  bw + !!mx * 7, bh + !!my * 7,
2585  x - !!mx * 3, y - !!my * 3, w, h);
2586  ref_v = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2587  mc[!!mx][!!my](dst_v, dst_stride, ref_v, 80, bh, mx, my);
2588  } else {
2589  mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2590  mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
2591  }
2592 }
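/* Worked example (illustrative numbers) of the progress row awaited above:
 * for a chroma block at y = 70 with bh = 8 and a fractional my,
 * th = (70 + 8 + 4 + 7) >> 5 = 2, i.e. decoding waits until the reference
 * thread has finished the 32-px chroma sbrow containing the lowest source
 * line plus the 4 filter taps and the 7 rows its next sbrow's loopfilter
 * may still modify. */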
2593 
2594 static void inter_recon(AVCodecContext *ctx)
2595 {
2596  static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
2597  { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
2598  { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
2599  };
2600  VP9Context *s = ctx->priv_data;
2601  VP9Block *b = s->b;
2602  int row = s->row, col = s->col;
2603  ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]], *tref2;
2604  AVFrame *ref1 = tref1->f, *ref2;
2605  int w1 = ref1->width, h1 = ref1->height, w2, h2;
2606  ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;
2607 
2608  if (b->comp) {
2609  tref2 = &s->refs[s->refidx[b->ref[1]]];
2610  ref2 = tref2->f;
2611  w2 = ref2->width;
2612  h2 = ref2->height;
2613  }
2614 
2615  // y inter pred
2616  if (b->bs > BS_8x8) {
2617  if (b->bs == BS_8x4) {
2618  mc_luma_dir(s, s->dsp.mc[3][b->filter][0], s->dst[0], ls_y,
2619  ref1->data[0], ref1->linesize[0], tref1,
2620  row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1);
2621  mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
2622  s->dst[0] + 4 * ls_y, ls_y,
2623  ref1->data[0], ref1->linesize[0], tref1,
2624  (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1);
2625 
2626  if (b->comp) {
2627  mc_luma_dir(s, s->dsp.mc[3][b->filter][1], s->dst[0], ls_y,
2628  ref2->data[0], ref2->linesize[0], tref2,
2629  row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2);
2630  mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
2631  s->dst[0] + 4 * ls_y, ls_y,
2632  ref2->data[0], ref2->linesize[0], tref2,
2633  (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2);
2634  }
2635  } else if (b->bs == BS_4x8) {
2636  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2637  ref1->data[0], ref1->linesize[0], tref1,
2638  row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1);
2639  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2640  ref1->data[0], ref1->linesize[0], tref1,
2641  row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1);
2642 
2643  if (b->comp) {
2644  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2645  ref2->data[0], ref2->linesize[0], tref2,
2646  row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2);
2647  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2648  ref2->data[0], ref2->linesize[0], tref2,
2649  row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2);
2650  }
2651  } else {
2652  av_assert2(b->bs == BS_4x4);
2653 
2654  // FIXME if two horizontally adjacent blocks have the same MV,
2655  // do a w8 instead of a w4 call
2656  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2657  ref1->data[0], ref1->linesize[0], tref1,
2658  row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1);
2659  mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2660  ref1->data[0], ref1->linesize[0], tref1,
2661  row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1);
2662  mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2663  s->dst[0] + 4 * ls_y, ls_y,
2664  ref1->data[0], ref1->linesize[0], tref1,
2665  (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1);
2666  mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2667  s->dst[0] + 4 * ls_y + 4, ls_y,
2668  ref1->data[0], ref1->linesize[0], tref1,
2669  (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1);
2670 
2671  if (b->comp) {
2672  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2673  ref2->data[0], ref2->linesize[0], tref2,
2674  row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2);
2675  mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2676  ref2->data[0], ref2->linesize[0], tref2,
2677  row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2);
2678  mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2679  s->dst[0] + 4 * ls_y, ls_y,
2680  ref2->data[0], ref2->linesize[0], tref2,
2681  (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2);
2682  mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2683  s->dst[0] + 4 * ls_y + 4, ls_y,
2684  ref2->data[0], ref2->linesize[0], tref2,
2685  (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2);
2686  }
2687  }
2688  } else {
2689  int bwl = bwlog_tab[0][b->bs];
2690  int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
2691 
2692  mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], s->dst[0], ls_y,
2693  ref1->data[0], ref1->linesize[0], tref1,
2694  row << 3, col << 3, &b->mv[0][0], bw, bh, w1, h1);
2695 
2696  if (b->comp)
2697  mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], s->dst[0], ls_y,
2698  ref2->data[0], ref2->linesize[0], tref2,
2699  row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2);
2700  }
2701 
2702  // uv inter pred
2703  {
2704  int bwl = bwlog_tab[1][b->bs];
2705  int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
2706  VP56mv mvuv;
2707 
2708  w1 = (w1 + 1) >> 1;
2709  h1 = (h1 + 1) >> 1;
2710  if (b->comp) {
2711  w2 = (w2 + 1) >> 1;
2712  h2 = (h2 + 1) >> 1;
2713  }
2714  if (b->bs > BS_8x8) {
2715  mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
2716  mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
2717  } else {
2718  mvuv = b->mv[0][0];
2719  }
2720 
2721  mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
2722  s->dst[1], s->dst[2], ls_uv,
2723  ref1->data[1], ref1->linesize[1],
2724  ref1->data[2], ref1->linesize[2], tref1,
2725  row << 2, col << 2, &mvuv, bw, bh, w1, h1);
2726 
2727  if (b->comp) {
2728  if (b->bs > BS_8x8) {
2729  mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
2730  mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
2731  } else {
2732  mvuv = b->mv[0][1];
2733  }
2734  mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
2735  s->dst[1], s->dst[2], ls_uv,
2736  ref2->data[1], ref2->linesize[1],
2737  ref2->data[2], ref2->linesize[2], tref2,
2738  row << 2, col << 2, &mvuv, bw, bh, w2, h2);
2739  }
2740  }
2741 
2742  if (!b->skip) {
2743  /* mostly copied from intra_recon() */
2744 
2745  int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2746  int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2747  int end_x = FFMIN(2 * (s->cols - col), w4);
2748  int end_y = FFMIN(2 * (s->rows - row), h4);
2749  int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2750  int uvstep1d = 1 << b->uvtx, p;
2751  uint8_t *dst = s->dst[0];
2752 
2753  // y itxfm add
2754  for (n = 0, y = 0; y < end_y; y += step1d) {
2755  uint8_t *ptr = dst;
2756  for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
2757  int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2758 
2759  if (eob)
2760  s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
2761  s->block + 16 * n, eob);
2762  }
2763  dst += 4 * s->y_stride * step1d;
2764  }
2765 
2766  // uv itxfm add
2767  end_x >>= 1;
2768  end_y >>= 1;
2769  step = 1 << (b->uvtx * 2);
2770  for (p = 0; p < 2; p++) {
2771  dst = s->dst[p + 1];
2772  for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2773  uint8_t *ptr = dst;
2774  for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
2775  int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2776 
2777  if (eob)
2778  s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2779  s->uvblock[p] + 16 * n, eob);
2780  }
2781  dst += 4 * uvstep1d * s->uv_stride;
2782  }
2783  }
2784  }
2785 }
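/* Worked example of the sub-8x8 chroma MV average above: x components
 * (-5, -3, -2, 0) sum to -10, and ROUNDED_DIV(-10, 4) = (-10 - 2) / 4 = -3,
 * i.e. the four luma MVs are averaged with rounding away from zero before
 * being applied to the half-resolution chroma planes. */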
2786 
2787 static av_always_inline void mask_edges(struct VP9Filter *lflvl, int is_uv,
2788  int row_and_7, int col_and_7,
2789  int w, int h, int col_end, int row_end,
2790  enum TxfmMode tx, int skip_inter)
2791 {
2792  // FIXME I'm pretty sure all loops can be replaced by a single LUT if
2793  // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
2794  // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
2795  // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
2796 
2797  // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
2798  // edges. This means that for UV, we work on two subsampled blocks at
2799  // a time, and we only use the topleft block's mode information to set
2800  // things like block strength. Thus, for any block size smaller than
2801  // 16x16, ignore the odd portion of the block.
2802  if (tx == TX_4X4 && is_uv) {
2803  if (h == 1) {
2804  if (row_and_7 & 1)
2805  return;
2806  if (!row_end)
2807  h += 1;
2808  }
2809  if (w == 1) {
2810  if (col_and_7 & 1)
2811  return;
2812  if (!col_end)
2813  w += 1;
2814  }
2815  }
2816 
2817  if (tx == TX_4X4 && !skip_inter) {
2818  int t = 1 << col_and_7, m_col = (t << w) - t, y;
2819  int m_col_odd = (t << (w - 1)) - t;
2820 
2821  // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
2822  if (is_uv) {
2823  int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;
2824 
2825  for (y = row_and_7; y < h + row_and_7; y++) {
2826  int col_mask_id = 2 - !(y & 7);
2827 
2828  lflvl->mask[is_uv][0][y][1] |= m_row_8;
2829  lflvl->mask[is_uv][0][y][2] |= m_row_4;
2830  // for odd lines, if the odd col is not being filtered,
2831  // skip odd row also:
2832  // .---. <-- a
2833  // | |
2834  // |___| <-- b
2835  // ^ ^
2836  // c d
2837  //
2838  // if a/c are even row/col and b/d are odd, and d is skipped,
2839  // e.g. right edge of size-66x66.webm, then skip b also (bug)
2840  if ((col_end & 1) && (y & 1)) {
2841  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
2842  } else {
2843  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
2844  }
2845  }
2846  } else {
2847  int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;
2848 
2849  for (y = row_and_7; y < h + row_and_7; y++) {
2850  int col_mask_id = 2 - !(y & 3);
2851 
2852  lflvl->mask[is_uv][0][y][1] |= m_row_8; // row edge
2853  lflvl->mask[is_uv][0][y][2] |= m_row_4;
2854  lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
2855  lflvl->mask[is_uv][0][y][3] |= m_col;
2856  lflvl->mask[is_uv][1][y][3] |= m_col;
2857  }
2858  }
2859  } else {
2860  int y, t = 1 << col_and_7, m_col = (t << w) - t;
2861 
2862  if (!skip_inter) {
2863  int mask_id = (tx == TX_8X8);
2864  int l2 = tx + is_uv - 1, step1d = 1 << l2;
2865  static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
2866  int m_row = m_col & masks[l2];
2867 
2868  // at odd UV tx16/tx32 col/row loopfilter edges, force the
2869  // 8px-wide loopfilter to prevent going off the visible edge.
2870  if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
2871  int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
2872  int m_row_8 = m_row - m_row_16;
2873 
2874  for (y = row_and_7; y < h + row_and_7; y++) {
2875  lflvl->mask[is_uv][0][y][0] |= m_row_16;
2876  lflvl->mask[is_uv][0][y][1] |= m_row_8;
2877  }
2878  } else {
2879  for (y = row_and_7; y < h + row_and_7; y++)
2880  lflvl->mask[is_uv][0][y][mask_id] |= m_row;
2881  }
2882 
2883  if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
2884  for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
2885  lflvl->mask[is_uv][1][y][0] |= m_col;
2886  if (y - row_and_7 == h - 1)
2887  lflvl->mask[is_uv][1][y][1] |= m_col;
2888  } else {
2889  for (y = row_and_7; y < h + row_and_7; y += step1d)
2890  lflvl->mask[is_uv][1][y][mask_id] |= m_col;
2891  }
2892  } else if (tx != TX_4X4) {
2893  int mask_id;
2894 
2895  mask_id = (tx == TX_8X8) || (is_uv && h == 1);
2896  lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
2897  mask_id = (tx == TX_8X8) || (is_uv && w == 1);
2898  for (y = row_and_7; y < h + row_and_7; y++)
2899  lflvl->mask[is_uv][0][y][mask_id] |= t;
2900  } else if (is_uv) {
2901  int t8 = t & 0x01, t4 = t - t8;
2902 
2903  for (y = row_and_7; y < h + row_and_7; y++) {
2904  lflvl->mask[is_uv][0][y][2] |= t4;
2905  lflvl->mask[is_uv][0][y][1] |= t8;
2906  }
2907  lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
2908  } else {
2909  int t8 = t & 0x11, t4 = t - t8;
2910 
2911  for (y = row_and_7; y < h + row_and_7; y++) {
2912  lflvl->mask[is_uv][0][y][2] |= t4;
2913  lflvl->mask[is_uv][0][y][1] |= t8;
2914  }
2915  lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
2916  }
2917  }
2918 }
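/* Standalone sketch (illustration only, not decoder code) of the column-mask
 * arithmetic used throughout mask_edges: with one bit per 4-px column, a
 * block starting at column c and spanning w columns owns a contiguous run of
 * w bits. */
#if 0
static unsigned col_mask(int col_and_7, int w)
{
    unsigned t = 1u << col_and_7;
    return (t << w) - t; /* e.g. col_and_7 = 2, w = 3 -> 0x1c = 0b0011100 */
}
#endif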
2919 
2920 static void decode_b(AVCodecContext *ctx, int row, int col,
2921  struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
2922  enum BlockLevel bl, enum BlockPartition bp)
2923 {
2924  VP9Context *s = ctx->priv_data;
2925  VP9Block *b = s->b;
2926  enum BlockSize bs = bl * 3 + bp;
2927  int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
2928  int emu[2];
2929  AVFrame *f = s->frames[CUR_FRAME].tf.f;
2930 
2931  s->row = row;
2932  s->row7 = row & 7;
2933  s->col = col;
2934  s->col7 = col & 7;
2935  s->min_mv.x = -(128 + col * 64);
2936  s->min_mv.y = -(128 + row * 64);
2937  s->max_mv.x = 128 + (s->cols - col - w4) * 64;
2938  s->max_mv.y = 128 + (s->rows - row - h4) * 64;
2939  if (s->pass < 2) {
2940  b->bs = bs;
2941  b->bl = bl;
2942  b->bp = bp;
2943  decode_mode(ctx);
2944  b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));
2945 
2946  if (!b->skip) {
2947  decode_coeffs(ctx);
2948  } else {
2949  int row7 = s->row7;
2950 
2951 #define SPLAT_ZERO_CTX(v, n) \
2952  switch (n) { \
2953  case 1: v = 0; break; \
2954  case 2: AV_ZERO16(&v); break; \
2955  case 4: AV_ZERO32(&v); break; \
2956  case 8: AV_ZERO64(&v); break; \
2957  case 16: AV_ZERO128(&v); break; \
2958  }
2959 #define SPLAT_ZERO_YUV(dir, var, off, n) \
2960  do { \
2961  SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
2962  SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
2963  SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
2964  } while (0)
2965 
2966  switch (w4) {
2967  case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1); break;
2968  case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2); break;
2969  case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4); break;
2970  case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8); break;
2971  }
2972  switch (h4) {
2973  case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1); break;
2974  case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2); break;
2975  case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4); break;
2976  case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8); break;
2977  }
2978  }
2979  if (s->pass == 1) {
2980  s->b++;
2981  s->block += w4 * h4 * 64;
2982  s->uvblock[0] += w4 * h4 * 16;
2983  s->uvblock[1] += w4 * h4 * 16;
2984  s->eob += 4 * w4 * h4;
2985  s->uveob[0] += w4 * h4;
2986  s->uveob[1] += w4 * h4;
2987 
2988  return;
2989  }
2990  }
2991 
2992  // use emulated overhangs if the stride of the target buffer can't hold
2993  // them. This allows us to support emu-edge and so on even if we have
2994  // large block overhangs
2995  emu[0] = (col + w4) * 8 > f->linesize[0] ||
2996  (row + h4) > s->rows;
2997  emu[1] = (col + w4) * 4 > f->linesize[1] ||
2998  (row + h4) > s->rows;
2999  if (emu[0]) {
3000  s->dst[0] = s->tmp_y;
3001  s->y_stride = 64;
3002  } else {
3003  s->dst[0] = f->data[0] + yoff;
3004  s->y_stride = f->linesize[0];
3005  }
3006  if (emu[1]) {
3007  s->dst[1] = s->tmp_uv[0];
3008  s->dst[2] = s->tmp_uv[1];
3009  s->uv_stride = 32;
3010  } else {
3011  s->dst[1] = f->data[1] + uvoff;
3012  s->dst[2] = f->data[2] + uvoff;
3013  s->uv_stride = f->linesize[1];
3014  }
3015  if (b->intra) {
3016  intra_recon(ctx, yoff, uvoff);
3017  } else {
3018  inter_recon(ctx);
3019  }
3020  if (emu[0]) {
3021  int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3022 
3023  for (n = 0; o < w; n++) {
3024  int bw = 64 >> n;
3025 
3026  av_assert2(n <= 4);
3027  if (w & bw) {
3028  s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o, f->linesize[0],
3029  s->tmp_y + o, 64, h, 0, 0);
3030  o += bw;
3031  }
3032  }
3033  }
3034  if (emu[1]) {
3035  int w = FFMIN(s->cols - col, w4) * 4, h = FFMIN(s->rows - row, h4) * 4, n, o = 0;
3036 
3037  for (n = 1; o < w; n++) {
3038  int bw = 64 >> n;
3039 
3040  av_assert2(n <= 4);
3041  if (w & bw) {
3042  s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o, f->linesize[1],
3043  s->tmp_uv[0] + o, 32, h, 0, 0);
3044  s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o, f->linesize[2],
3045  s->tmp_uv[1] + o, 32, h, 0, 0);
3046  o += bw;
3047  }
3048  }
3049  }
3050 
3051  // pick filter level and find edges to apply filter to
3052  if (s->filter.level &&
3053  (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3054  [b->mode[3] != ZEROMV]) > 0) {
3055  int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3056  int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
3057 
3058  setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3059  mask_edges(lflvl, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3060  mask_edges(lflvl, 1, row7, col7, x_end, y_end,
3061  s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3062  s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3063  b->uvtx, skip_inter);
3064 
3065  if (!s->filter.lim_lut[lvl]) {
3066  int sharp = s->filter.sharpness;
3067  int limit = lvl;
3068 
3069  if (sharp > 0) {
3070  limit >>= (sharp + 3) >> 2;
3071  limit = FFMIN(limit, 9 - sharp);
3072  }
3073  limit = FFMAX(limit, 1);
3074 
3075  s->filter.lim_lut[lvl] = limit;
3076  s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
3077  }
3078  }
3079 
3080  if (s->pass == 2) {
3081  s->b++;
3082  s->block += w4 * h4 * 64;
3083  s->uvblock[0] += w4 * h4 * 16;
3084  s->uvblock[1] += w4 * h4 * 16;
3085  s->eob += 4 * w4 * h4;
3086  s->uveob[0] += w4 * h4;
3087  s->uveob[1] += w4 * h4;
3088  }
3089 }
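/* Worked example of the lazily filled filter-limit LUTs above: for lvl = 36
 * and sharpness = 5, limit = 36 >> ((5 + 3) >> 2) = 9, clamped to
 * FFMIN(9, 9 - 5) = 4, giving lim_lut[36] = 4 and
 * mblim_lut[36] = 2 * (36 + 2) + 4 = 80. */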
3090 
3091 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3092  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3093 {
3094  VP9Context *s = ctx->priv_data;
3095  int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3096  (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
3097  const uint8_t *p = s->keyframe ? vp9_default_kf_partition_probs[bl][c] :
3098  s->prob.p.partition[bl][c];
3099  enum BlockPartition bp;
3100  ptrdiff_t hbs = 4 >> bl;
3101  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3102  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3103 
3104  if (bl == BL_8X8) {
3105  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3106  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3107  } else if (col + hbs < s->cols) { // FIXME why not <=?
3108  if (row + hbs < s->rows) { // FIXME why not <=?
3109  bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3110  switch (bp) {
3111  case PARTITION_NONE:
3112  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3113  break;
3114  case PARTITION_H:
3115  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3116  yoff += hbs * 8 * y_stride;
3117  uvoff += hbs * 4 * uv_stride;
3118  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
3119  break;
3120  case PARTITION_V:
3121  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3122  yoff += hbs * 8;
3123  uvoff += hbs * 4;
3124  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3125  break;
3126  case PARTITION_SPLIT:
3127  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3128  decode_sb(ctx, row, col + hbs, lflvl,
3129  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3130  yoff += hbs * 8 * y_stride;
3131  uvoff += hbs * 4 * uv_stride;
3132  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3133  decode_sb(ctx, row + hbs, col + hbs, lflvl,
3134  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3135  break;
3136  default:
3137  av_assert0(0);
3138  }
3139  } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3140  bp = PARTITION_SPLIT;
3141  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3142  decode_sb(ctx, row, col + hbs, lflvl,
3143  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3144  } else {
3145  bp = PARTITION_H;
3146  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3147  }
3148  } else if (row + hbs < s->rows) { // FIXME why not <=?
3149  if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3150  bp = PARTITION_SPLIT;
3151  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3152  yoff += hbs * 8 * y_stride;
3153  uvoff += hbs * 4 * uv_stride;
3154  decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3155  } else {
3156  bp = PARTITION_V;
3157  decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3158  }
3159  } else {
3160  bp = PARTITION_SPLIT;
3161  decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3162  }
3163  s->counts.partition[bl][c][bp]++;
3164 }
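/* A hedged sketch (illustration only, not decoder code) of the partition
 * context computed at the top of decode_sb: one bit is taken from each
 * neighbour's partition context byte, selected by block level, yielding
 * c in 0..3 (0 when neither neighbour was split this small, 3 when both
 * were). */
#if 0
static int partition_ctx(uint8_t above, uint8_t left, int bl /* 0..3 */)
{
    return ((above >> (3 - bl)) & 1) | (((left >> (3 - bl)) & 1) << 1);
}
#endif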
3165 
3166 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3167  ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3168 {
3169  VP9Context *s = ctx->priv_data;
3170  VP9Block *b = s->b;
3171  ptrdiff_t hbs = 4 >> bl;
3172  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3173  ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3174 
3175  if (bl == BL_8X8) {
3176  av_assert2(b->bl == BL_8X8);
3177  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3178  } else if (s->b->bl == bl) {
3179  decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3180  if (b->bp == PARTITION_H && row + hbs < s->rows) {
3181  yoff += hbs * 8 * y_stride;
3182  uvoff += hbs * 4 * uv_stride;
3183  decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3184  } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3185  yoff += hbs * 8;
3186  uvoff += hbs * 4;
3187  decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
3188  }
3189  } else {
3190  decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3191  if (col + hbs < s->cols) { // FIXME why not <=?
3192  if (row + hbs < s->rows) {
3193  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs,
3194  uvoff + 4 * hbs, bl + 1);
3195  yoff += hbs * 8 * y_stride;
3196  uvoff += hbs * 4 * uv_stride;
3197  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3198  decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3199  yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3200  } else {
3201  yoff += hbs * 8;
3202  uvoff += hbs * 4;
3203  decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3204  }
3205  } else if (row + hbs < s->rows) {
3206  yoff += hbs * 8 * y_stride;
3207  uvoff += hbs * 4 * uv_stride;
3208  decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3209  }
3210  }
3211 }
3212 
3213 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3214  int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3215 {
3216  VP9Context *s = ctx->priv_data;
3217  AVFrame *f = s->frames[CUR_FRAME].tf.f;
3218  uint8_t *dst = f->data[0] + yoff, *lvl = lflvl->level;
3219  ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
3220  int y, x, p;
3221 
3222  // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3223  // if you think of them as acting on a 8x8 block max, we can interleave
3224  // each v/h within the single x loop, but that only works if we work on
3225  // 8 pixel blocks, and we won't always do that (we want at least 16px
3226  // to use SSE2 optimizations, perhaps 32 for AVX2)
3227 
3228  // filter edges between columns, Y plane (e.g. block1 | block2)
3229  for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
3230  uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
3231  uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
3232  unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3233  unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3234  unsigned hm = hm1 | hm2 | hm13 | hm23;
3235 
3236  for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
3237  if (hm1 & x) {
3238  int L = *l, H = L >> 4;
3239  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3240 
3241  if (col || x > 1) {
3242  if (hmask1[0] & x) {
3243  if (hmask2[0] & x) {
3244  av_assert2(l[8] == L);
3245  s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
3246  } else {
3247  s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
3248  }
3249  } else if (hm2 & x) {
3250  L = l[8];
3251  H |= (L >> 4) << 8;
3252  E |= s->filter.mblim_lut[L] << 8;
3253  I |= s->filter.lim_lut[L] << 8;
3254  s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3255  [!!(hmask2[1] & x)]
3256  [0](ptr, ls_y, E, I, H);
3257  } else {
3258  s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3259  [0](ptr, ls_y, E, I, H);
3260  }
3261  }
3262  } else if (hm2 & x) {
3263  int L = l[8], H = L >> 4;
3264  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3265 
3266  if (col || x > 1) {
3267  s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3268  [0](ptr + 8 * ls_y, ls_y, E, I, H);
3269  }
3270  }
3271  if (hm13 & x) {
3272  int L = *l, H = L >> 4;
3273  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3274 
3275  if (hm23 & x) {
3276  L = l[8];
3277  H |= (L >> 4) << 8;
3278  E |= s->filter.mblim_lut[L] << 8;
3279  I |= s->filter.lim_lut[L] << 8;
3280  s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
3281  } else {
3282  s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
3283  }
3284  } else if (hm23 & x) {
3285  int L = l[8], H = L >> 4;
3286  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3287 
3288  s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
3289  }
3290  }
3291  }
3292 
3293  // block1
3294  // filter edges between rows, Y plane (e.g. ------)
3295  // block2
3296  dst = f->data[0] + yoff;
3297  lvl = lflvl->level;
3298  for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
3299  uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
3300  unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
3301 
3302  for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
3303  if (row || y) {
3304  if (vm & x) {
3305  int L = *l, H = L >> 4;
3306  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3307 
3308  if (vmask[0] & x) {
3309  if (vmask[0] & (x << 1)) {
3310  av_assert2(l[1] == L);
3311  s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
3312  } else {
3313  s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
3314  }
3315  } else if (vm & (x << 1)) {
3316  L = l[1];
3317  H |= (L >> 4) << 8;
3318  E |= s->filter.mblim_lut[L] << 8;
3319  I |= s->filter.lim_lut[L] << 8;
3320  s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3321  [!!(vmask[1] & (x << 1))]
3322  [1](ptr, ls_y, E, I, H);
3323  } else {
3324  s->dsp.loop_filter_8[!!(vmask[1] & x)]
3325  [1](ptr, ls_y, E, I, H);
3326  }
3327  } else if (vm & (x << 1)) {
3328  int L = l[1], H = L >> 4;
3329  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3330 
3331  s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
3332  [1](ptr + 8, ls_y, E, I, H);
3333  }
3334  }
3335  if (vm3 & x) {
3336  int L = *l, H = L >> 4;
3337  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3338 
3339  if (vm3 & (x << 1)) {
3340  L = l[1];
3341  H |= (L >> 4) << 8;
3342  E |= s->filter.mblim_lut[L] << 8;
3343  I |= s->filter.lim_lut[L] << 8;
3344  s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
3345  } else {
3346  s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
3347  }
3348  } else if (vm3 & (x << 1)) {
3349  int L = l[1], H = L >> 4;
3350  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3351 
3352  s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
3353  }
3354  }
3355  }
3356 
3357  // same principle but for U/V planes
3358  for (p = 0; p < 2; p++) {
3359  lvl = lflvl->level;
3360  dst = f->data[1 + p] + uvoff;
3361  for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
3362  uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
3363  uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
3364  unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
3365  unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;
3366 
3367  for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
3368  if (col || x > 1) {
3369  if (hm1 & x) {
3370  int L = *l, H = L >> 4;
3371  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3372 
3373  if (hmask1[0] & x) {
3374  if (hmask2[0] & x) {
3375  av_assert2(l[16] == L);
3376  s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
3377  } else {
3378  s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
3379  }
3380  } else if (hm2 & x) {
3381  L = l[16];
3382  H |= (L >> 4) << 8;
3383  E |= s->filter.mblim_lut[L] << 8;
3384  I |= s->filter.lim_lut[L] << 8;
3385  s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3386  [!!(hmask2[1] & x)]
3387  [0](ptr, ls_uv, E, I, H);
3388  } else {
3389  s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3390  [0](ptr, ls_uv, E, I, H);
3391  }
3392  } else if (hm2 & x) {
3393  int L = l[16], H = L >> 4;
3394  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3395 
3396  s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3397  [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
3398  }
3399  }
3400  if (x & 0xAA)
3401  l += 2;
3402  }
3403  }
3404  lvl = lflvl->level;
3405  dst = f->data[1 + p] + uvoff;
3406  for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
3407  uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
3408  unsigned vm = vmask[0] | vmask[1] | vmask[2];
3409 
3410  for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
3411  if (row || y) {
3412  if (vm & x) {
3413  int L = *l, H = L >> 4;
3414  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3415 
3416  if (vmask[0] & x) {
3417  if (vmask[0] & (x << 2)) {
3418  av_assert2(l[2] == L);
3419  s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
3420  } else {
3421  s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
3422  }
3423  } else if (vm & (x << 2)) {
3424  L = l[2];
3425  H |= (L >> 4) << 8;
3426  E |= s->filter.mblim_lut[L] << 8;
3427  I |= s->filter.lim_lut[L] << 8;
3428  s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3429  [!!(vmask[1] & (x << 2))]
3430  [1](ptr, ls_uv, E, I, H);
3431  } else {
3432  s->dsp.loop_filter_8[!!(vmask[1] & x)]
3433  [1](ptr, ls_uv, E, I, H);
3434  }
3435  } else if (vm & (x << 2)) {
3436  int L = l[2], H = L >> 4;
3437  int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3438 
3439  s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
3440  [1](ptr + 8, ls_uv, E, I, H);
3441  }
3442  }
3443  }
3444  if (y & 1)
3445  lvl += 16;
3446  }
3447  }
3448 }
3449 
3450 static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
3451 {
3452  int sb_start = ( idx * n) >> log2_n;
3453  int sb_end = ((idx + 1) * n) >> log2_n;
3454  *start = FFMIN(sb_start, n) << 3;
3455  *end = FFMIN(sb_end, n) << 3;
3456 }
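/* Worked example for set_tile_offset: with n = 9 superblock columns and
 * log2_n = 1 (two tile columns), tile 0 gets sb columns [0,4) and tile 1
 * [4,9); the << 3 converts superblocks to 8x8-block units, so the stored
 * ranges become [0,32) and [32,72). */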
3457 
3458 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3459  int max_count, int update_factor)
3460 {
3461  unsigned ct = ct0 + ct1, p2, p1;
3462 
3463  if (!ct)
3464  return;
3465 
3466  p1 = *p;
3467  p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3468  p2 = av_clip(p2, 1, 255);
3469  ct = FFMIN(ct, max_count);
3470  update_factor = FASTDIV(update_factor * ct, max_count);
3471 
3472  // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3473  *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
3474 }
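/* Worked example of adapt_prob: counts ct0 = 30, ct1 = 10 give the rounded
 * empirical probability p2 = (30 * 256 + 20) / 40 = 192; ct is clamped to
 * max_count = 20 so the blend factor stays update_factor = 128, and an old
 * p1 = 128 becomes 128 + (((192 - 128) * 128 + 128) >> 8) = 160, i.e. the
 * probability moves halfway toward the observed statistics. */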
3475 
3476 static void adapt_probs(VP9Context *s)
3477 {
3478  int i, j, k, l, m;
3479  prob_context *p = &s->prob_ctx[s->framectxid].p;
3480  int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;
3481 
3482  // coefficients
3483  for (i = 0; i < 4; i++)
3484  for (j = 0; j < 2; j++)
3485  for (k = 0; k < 2; k++)
3486  for (l = 0; l < 6; l++)
3487  for (m = 0; m < 6; m++) {
3488  uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
3489  unsigned *e = s->counts.eob[i][j][k][l][m];
3490  unsigned *c = s->counts.coef[i][j][k][l][m];
3491 
3492  if (l == 0 && m >= 3) // dc only has 3 pt
3493  break;
3494 
3495  adapt_prob(&pp[0], e[0], e[1], 24, uf);
3496  adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
3497  adapt_prob(&pp[2], c[1], c[2], 24, uf);
3498  }
3499 
3500  if (s->keyframe || s->intraonly) {
3501  memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
3502  memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
3503  memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
3504  memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));
3505  return;
3506  }
3507 
3508  // skip flag
3509  for (i = 0; i < 3; i++)
3510  adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);
3511 
3512  // intra/inter flag
3513  for (i = 0; i < 4; i++)
3514  adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);
3515 
3516  // comppred flag
3517  if (s->comppredmode == PRED_SWITCHABLE) {
3518  for (i = 0; i < 5; i++)
3519  adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);
3520  }
3521 
3522  // reference frames
3523  if (s->comppredmode != PRED_SINGLEREF) {
3524  for (i = 0; i < 5; i++)
3525  adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
3526  s->counts.comp_ref[i][1], 20, 128);
3527  }
3528 
3529  if (s->comppredmode != PRED_COMPREF) {
3530  for (i = 0; i < 5; i++) {
3531  uint8_t *pp = p->single_ref[i];
3532  unsigned (*c)[2] = s->counts.single_ref[i];
3533 
3534  adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
3535  adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);
3536  }
3537  }
3538 
3539  // block partitioning
3540  for (i = 0; i < 4; i++)
3541  for (j = 0; j < 4; j++) {
3542  uint8_t *pp = p->partition[i][j];
3543  unsigned *c = s->counts.partition[i][j];
3544 
3545  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3546  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3547  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3548  }
3549 
3550  // tx size
3551  if (s->txfmmode == TX_SWITCHABLE) {
3552  for (i = 0; i < 2; i++) {
3553  unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];
3554 
3555  adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
3556  adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
3557  adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
3558  adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
3559  adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
3560  adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);
3561  }
3562  }
3563 
3564  // interpolation filter
3565  if (s->filtermode == FILTER_SWITCHABLE) {
3566  for (i = 0; i < 4; i++) {
3567  uint8_t *pp = p->filter[i];
3568  unsigned *c = s->counts.filter[i];
3569 
3570  adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
3571  adapt_prob(&pp[1], c[1], c[2], 20, 128);
3572  }
3573  }
3574 
3575  // inter modes
3576  for (i = 0; i < 7; i++) {
3577  uint8_t *pp = p->mv_mode[i];
3578  unsigned *c = s->counts.mv_mode[i];
3579 
3580  adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
3581  adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
3582  adapt_prob(&pp[2], c[1], c[3], 20, 128);
3583  }
3584 
3585  // mv joints
3586  {
3587  uint8_t *pp = p->mv_joint;
3588  unsigned *c = s->counts.mv_joint;
3589 
3590  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3591  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3592  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3593  }
3594 
3595  // mv components
3596  for (i = 0; i < 2; i++) {
3597  uint8_t *pp;
3598  unsigned *c, (*c2)[2], sum;
3599 
3600  adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
3601  s->counts.mv_comp[i].sign[1], 20, 128);
3602 
3603  pp = p->mv_comp[i].classes;
3604  c = s->counts.mv_comp[i].classes;
3605  sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
3606  adapt_prob(&pp[0], c[0], sum, 20, 128);
3607  sum -= c[1];
3608  adapt_prob(&pp[1], c[1], sum, 20, 128);
3609  sum -= c[2] + c[3];
3610  adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
3611  adapt_prob(&pp[3], c[2], c[3], 20, 128);
3612  sum -= c[4] + c[5];
3613  adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
3614  adapt_prob(&pp[5], c[4], c[5], 20, 128);
3615  sum -= c[6];
3616  adapt_prob(&pp[6], c[6], sum, 20, 128);
3617  adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
3618  adapt_prob(&pp[8], c[7], c[8], 20, 128);
3619  adapt_prob(&pp[9], c[9], c[10], 20, 128);
3620 
3621  adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
3622  s->counts.mv_comp[i].class0[1], 20, 128);
3623  pp = p->mv_comp[i].bits;
3624  c2 = s->counts.mv_comp[i].bits;
3625  for (j = 0; j < 10; j++)
3626  adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);
3627 
3628  for (j = 0; j < 2; j++) {
3629  pp = p->mv_comp[i].class0_fp[j];
3630  c = s->counts.mv_comp[i].class0_fp[j];
3631  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3632  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3633  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3634  }
3635  pp = p->mv_comp[i].fp;
3636  c = s->counts.mv_comp[i].fp;
3637  adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
3638  adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
3639  adapt_prob(&pp[2], c[2], c[3], 20, 128);
3640 
3641  if (s->highprecisionmvs) {
3642  adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
3643  s->counts.mv_comp[i].class0_hp[1], 20, 128);
3644  adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
3645  s->counts.mv_comp[i].hp[1], 20, 128);
3646  }
3647  }
3648 
3649  // y intra modes
3650  for (i = 0; i < 4; i++) {
3651  uint8_t *pp = p->y_mode[i];
3652  unsigned *c = s->counts.y_mode[i], sum, s2;
3653 
3654  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3655  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3656  sum -= c[TM_VP8_PRED];
3657  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3658  sum -= c[VERT_PRED];
3659  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3660  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3661  sum -= s2;
3662  adapt_prob(&pp[3], s2, sum, 20, 128);
3663  s2 -= c[HOR_PRED];
3664  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3665  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3666  sum -= c[DIAG_DOWN_LEFT_PRED];
3667  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3668  sum -= c[VERT_LEFT_PRED];
3669  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3670  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3671  }
3672 
3673  // uv intra modes
3674  for (i = 0; i < 10; i++) {
3675  uint8_t *pp = p->uv_mode[i];
3676  unsigned *c = s->counts.uv_mode[i], sum, s2;
3677 
3678  sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
3679  adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
3680  sum -= c[TM_VP8_PRED];
3681  adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
3682  sum -= c[VERT_PRED];
3683  adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
3684  s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
3685  sum -= s2;
3686  adapt_prob(&pp[3], s2, sum, 20, 128);
3687  s2 -= c[HOR_PRED];
3688  adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
3689  adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
3690  sum -= c[DIAG_DOWN_LEFT_PRED];
3691  adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
3692  sum -= c[VERT_LEFT_PRED];
3693  adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
3694  adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3695  }
3696 }
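/* Editor's note: every block above follows one pattern: fold the leaf counts
 * of a binary symbol tree from the root down, adapting each node probability
 * against "this branch" vs "the sum of everything under the other branch".
 * A minimal sketch for a 4-leaf tree (hypothetical helper mirroring the
 * mv_joint/partition code above; not part of the file):
 */
#if 0
static void demo_adapt_tree4(uint8_t prob[3], const unsigned c[4])
{
    adapt_prob(&prob[0], c[0], c[1] + c[2] + c[3], 20, 128); /* leaf 0 vs rest */
    adapt_prob(&prob[1], c[1], c[2] + c[3],        20, 128); /* leaf 1 vs 2,3  */
    adapt_prob(&prob[2], c[2], c[3],               20, 128); /* leaf 2 vs 3    */
}
#endif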
3697 
3698 static void free_buffers(VP9Context *s)
3699 {
3700  av_freep(&s->intra_pred_data[0]);
3701  av_freep(&s->b_base);
3702  av_freep(&s->block_base);
3703 }
3704 
3705 static av_cold int vp9_decode_free(AVCodecContext *ctx)
3706 {
3707  VP9Context *s = ctx->priv_data;
3708  int i;
3709 
3710  for (i = 0; i < 2; i++) {
3711  if (s->frames[i].tf.f->data[0])
3712  vp9_unref_frame(ctx, &s->frames[i]);
3713  av_frame_free(&s->frames[i].tf.f);
3714  }
3715  for (i = 0; i < 8; i++) {
3716  if (s->refs[i].f->data[0])
3717  ff_thread_release_buffer(ctx, &s->refs[i]);
3718  av_frame_free(&s->refs[i].f);
3719  if (s->next_refs[i].f->data[0])
3720  ff_thread_release_buffer(ctx, &s->next_refs[i]);
3721  av_frame_free(&s->next_refs[i].f);
3722  }
3723  free_buffers(s);
3724  av_freep(&s->c_b);
3725  s->c_b_size = 0;
3726 
3727  return 0;
3728 }
3729 
3730 
3731 static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
3732  int *got_frame, AVPacket *pkt)
3733 {
3734  const uint8_t *data = pkt->data;
3735  int size = pkt->size;
3736  VP9Context *s = ctx->priv_data;
3737  int res, tile_row, tile_col, i, ref, row, col;
3738  ptrdiff_t yoff, uvoff, ls_y, ls_uv;
3739  AVFrame *f;
3740 
3741  if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
3742  return res;
3743  } else if (res == 0) {
3744  if (!s->refs[ref].f->data[0]) {
3745  av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
3746  return AVERROR_INVALIDDATA;
3747  }
3748  if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)
3749  return res;
3750  *got_frame = 1;
3751  return 0;
3752  }
3753  data += res;
3754  size -= res;
3755 
3756  if (s->frames[LAST_FRAME].tf.f->data[0])
3757  vp9_unref_frame(ctx, &s->frames[LAST_FRAME]);
3758  if (!s->keyframe && s->frames[CUR_FRAME].tf.f->data[0] &&
3759  (res = vp9_ref_frame(ctx, &s->frames[LAST_FRAME], &s->frames[CUR_FRAME])) < 0)
3760  return res;
3761  if (s->frames[CUR_FRAME].tf.f->data[0])
3762  vp9_unref_frame(ctx, &s->frames[CUR_FRAME]);
3763  if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
3764  return res;
3765  f = s->frames[CUR_FRAME].tf.f;
3766  f->key_frame = s->keyframe;
3767  f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
3768  ls_y = f->linesize[0];
3769  ls_uv = f->linesize[1];
3770 
3771  // ref frame setup
3772  for (i = 0; i < 8; i++) {
3773  if (s->next_refs[i].f->data[0])
3774  ff_thread_release_buffer(ctx, &s->next_refs[i]);
3775  if (s->refreshrefmask & (1 << i)) {
3776  res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
3777  } else {
3778  res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);
3779  }
3780  if (res < 0)
3781  return res;
3782  }
3783 
3784  if (s->fullrange)
3785  ctx->color_range = AVCOL_RANGE_JPEG;
3786  else
3787  ctx->color_range = AVCOL_RANGE_MPEG;
3788 
3789  switch (s->colorspace) {
3790  case 1: ctx->colorspace = AVCOL_SPC_BT470BG; break;
3791  case 2: ctx->colorspace = AVCOL_SPC_BT709; break;
3792  case 3: ctx->colorspace = AVCOL_SPC_SMPTE170M; break;
3793  case 4: ctx->colorspace = AVCOL_SPC_SMPTE240M; break;
3794  }
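/* Editor's note (hedged observation, not part of the file): the raw
 * s->colorspace values come from the frame header's color_space field; any
 * value outside 1..4 simply leaves ctx->colorspace unchanged here. */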
3795 
3796  // main tile decode loop
3797  memset(s->above_partition_ctx, 0, s->cols);
3798  memset(s->above_skip_ctx, 0, s->cols);
3799  if (s->keyframe || s->intraonly) {
3800  memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
3801  } else {
3802  memset(s->above_mode_ctx, NEARESTMV, s->cols);
3803  }
3804  memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
3805  memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
3806  memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
3807  memset(s->above_segpred_ctx, 0, s->cols);
3808  s->pass = s->uses_2pass =
3809  ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
3810  if ((res = update_block_buffers(ctx)) < 0) {
3811  av_log(ctx, AV_LOG_ERROR,
3812  "Failed to allocate block buffers\n");
3813  return res;
3814  }
3815  if (s->refreshctx && s->parallelmode) {
3816  int j, k, l, m;
3817 
3818  for (i = 0; i < 4; i++) {
3819  for (j = 0; j < 2; j++)
3820  for (k = 0; k < 2; k++)
3821  for (l = 0; l < 6; l++)
3822  for (m = 0; m < 6; m++)
3823  memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
3824  s->prob.coef[i][j][k][l][m], 3);
3825  if (s->txfmmode == i)
3826  break;
3827  }
3828  s->prob_ctx[s->framectxid].p = s->prob.p;
3829  ff_thread_finish_setup(ctx);
3830  } else if (!s->refreshctx) {
3831  ff_thread_finish_setup(ctx);
3832  }
3833 
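/* Editor's note (descriptive summary, not part of the file): the loop below
 * runs once in the normal case and twice when s->uses_2pass is set (frame
 * threading with in-frame context refresh): pass 1 only parses the bitstream
 * and stores per-block data, pass 2 replays it via decode_sb_mem() to
 * reconstruct and loop-filter the pixels. */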
3834  do {
3835  yoff = uvoff = 0;
3836  s->b = s->b_base;
3837  s->block = s->block_base;
3838  s->uvblock[0] = s->uvblock_base[0];
3839  s->uvblock[1] = s->uvblock_base[1];
3840  s->eob = s->eob_base;
3841  s->uveob[0] = s->uveob_base[0];
3842  s->uveob[1] = s->uveob_base[1];
3843 
3844  for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
3845  set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
3846  tile_row, s->tiling.log2_tile_rows, s->sb_rows);
3847  if (s->pass != 2) {
3848  for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3849  unsigned tile_size;
3850 
3851  if (tile_col == s->tiling.tile_cols - 1 &&
3852  tile_row == s->tiling.tile_rows - 1) {
3853  tile_size = size;
3854  } else {
3855  tile_size = AV_RB32(data);
3856  data += 4;
3857  size -= 4;
3858  }
3859  if (tile_size > size) {
3860  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3861  return AVERROR_INVALIDDATA;
3862  }
3863  ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
3864  if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
3865  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3866  return AVERROR_INVALIDDATA;
3867  }
3868  data += tile_size;
3869  size -= tile_size;
3870  }
3871  }
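/* Editor's note (illustrative, not part of the file): the loop above walks
 * the packet's tile layout
 *
 *   [size0][tile0 data][size1][tile1 data] ... [last tile data]
 *
 * where each size is a 32-bit big-endian prefix and only the very last tile
 * of the frame omits it, consuming whatever bytes remain. */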
3872 
3873  for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
3874  row += 8, yoff += ls_y * 64, uvoff += ls_uv * 32) {
3875  struct VP9Filter *lflvl_ptr = s->lflvl;
3876  ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;
3877 
3878  for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
3879  set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
3880  tile_col, s->tiling.log2_tile_cols, s->sb_cols);
3881 
3882  if (s->pass != 2) {
3883  memset(s->left_partition_ctx, 0, 8);
3884  memset(s->left_skip_ctx, 0, 8);
3885  if (s->keyframe || s->intraonly) {
3886  memset(s->left_mode_ctx, DC_PRED, 16);
3887  } else {
3888  memset(s->left_mode_ctx, NEARESTMV, 8);
3889  }
3890  memset(s->left_y_nnz_ctx, 0, 16);
3891  memset(s->left_uv_nnz_ctx, 0, 16);
3892  memset(s->left_segpred_ctx, 0, 8);
3893 
3894  memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));
3895  }
3896 
3897  for (col = s->tiling.tile_col_start;
3898  col < s->tiling.tile_col_end;
3899  col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3900  // FIXME integrate with lf code (i.e. zero after each
3901  // use, similar to invtxfm coefficients, or similar)
3902  if (s->pass != 1) {
3903  memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));
3904  }
3905 
3906  if (s->pass == 2) {
3907  decode_sb_mem(ctx, row, col, lflvl_ptr,
3908  yoff2, uvoff2, BL_64X64);
3909  } else {
3910  decode_sb(ctx, row, col, lflvl_ptr,
3911  yoff2, uvoff2, BL_64X64);
3912  }
3913  }
3914  if (s->pass != 2) {
3915  memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));
3916  }
3917  }
3918 
3919  if (s->pass == 1) {
3920  continue;
3921  }
3922 
3923  // backup pre-loopfilter reconstruction data for intra
3924  // prediction of next row of sb64s
3925  if (row + 8 < s->rows) {
3926  memcpy(s->intra_pred_data[0],
3927  f->data[0] + yoff + 63 * ls_y,
3928  8 * s->cols);
3929  memcpy(s->intra_pred_data[1],
3930  f->data[1] + uvoff + 31 * ls_uv,
3931  4 * s->cols);
3932  memcpy(s->intra_pred_data[2],
3933  f->data[2] + uvoff + 31 * ls_uv,
3934  4 * s->cols);
3935  }
3936 
3937  // loopfilter one row
3938  if (s->filter.level) {
3939  yoff2 = yoff;
3940  uvoff2 = uvoff;
3941  lflvl_ptr = s->lflvl;
3942  for (col = 0; col < s->cols;
3943  col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
3944  loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);
3945  }
3946  }
3947 
3948  // FIXME maybe we can make this more finegrained by running the
3949  // loopfilter per-block instead of after each sbrow
3950  // In fact that would also make intra pred left preparation easier?
3951  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);
3952  }
3953  }
3954 
3955  if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
3956  adapt_probs(s);
3957  ff_thread_finish_setup(ctx);
3958  }
3959  } while (s->pass++ == 1);
3960  ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
3961 
3962  // ref frame setup
3963  for (i = 0; i < 8; i++) {
3964  if (s->refs[i].f->data[0])
3965  ff_thread_release_buffer(ctx, &s->refs[i]);
3966  ff_thread_ref_frame(&s->refs[i], &s->next_refs[i]);
3967  }
3968 
3969  if (!s->invisible) {
3970  if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
3971  return res;
3972  *got_frame = 1;
3973  }
3974 
3975  return 0;
3976 }
3977 
3978 static void vp9_decode_flush(AVCodecContext *ctx)
3979 {
3980  VP9Context *s = ctx->priv_data;
3981  int i;
3982 
3983  for (i = 0; i < 2; i++)
3984  vp9_unref_frame(ctx, &s->frames[i]);
3985  for (i = 0; i < 8; i++)
3986  ff_thread_release_buffer(ctx, &s->refs[i]);
3987 }
3988 
3989 static int init_frames(AVCodecContext *ctx)
3990 {
3991  VP9Context *s = ctx->priv_data;
3992  int i;
3993 
3994  for (i = 0; i < 2; i++) {
3995  s->frames[i].tf.f = av_frame_alloc();
3996  if (!s->frames[i].tf.f) {
3997  vp9_decode_free(ctx);
3998  av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3999  return AVERROR(ENOMEM);
4000  }
4001  }
4002  for (i = 0; i < 8; i++) {
4003  s->refs[i].f = av_frame_alloc();
4004  s->next_refs[i].f = av_frame_alloc();
4005  if (!s->refs[i].f || !s->next_refs[i].f) {
4006  vp9_decode_free(ctx);
4007  av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
4008  return AVERROR(ENOMEM);
4009  }
4010  }
4011 
4012  return 0;
4013 }
4014 
4015 static av_cold int vp9_decode_init(AVCodecContext *ctx)
4016 {
4017  VP9Context *s = ctx->priv_data;
4018 
4019  ctx->internal->allocate_progress = 1;
4020  ctx->pix_fmt = AV_PIX_FMT_YUV420P;
4021  ff_vp9dsp_init(&s->dsp);
4022  ff_videodsp_init(&s->vdsp, 8);
4023  s->filter.sharpness = -1;
4024 
4025  return init_frames(ctx);
4026 }
4027 
4028 static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
4029 {
4030  return init_frames(avctx);
4031 }
4032 
4033 static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
4034 {
4035  int i, res;
4036  VP9Context *s = dst->priv_data, *ssrc = src->priv_data;
4037 
4038  // detect size changes in other threads
4039  if (s->intra_pred_data[0] &&
4040  (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols || s->rows != ssrc->rows)) {
4041  free_buffers(s);
4042  }
4043 
4044  for (i = 0; i < 2; i++) {
4045  if (s->frames[i].tf.f->data[0])
4046  vp9_unref_frame(dst, &s->frames[i]);
4047  if (ssrc->frames[i].tf.f->data[0]) {
4048  if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
4049  return res;
4050  }
4051  }
4052  for (i = 0; i < 8; i++) {
4053  if (s->refs[i].f->data[0])
4054  ff_thread_release_buffer(dst, &s->refs[i]);
4055  if (ssrc->next_refs[i].f->data[0]) {
4056  if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)
4057  return res;
4058  }
4059  }
4060 
4061  s->invisible = ssrc->invisible;
4062  s->keyframe = ssrc->keyframe;
4063  s->uses_2pass = ssrc->uses_2pass;
4064  memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
4065  memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
4066  if (ssrc->segmentation.enabled) {
4067  memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
4068  sizeof(s->segmentation.feat));
4069  }
4070 
4071  return 0;
4072 }
4073 
4074 AVCodec ff_vp9_decoder = {
4075  .name = "vp9",
4076  .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
4077  .type = AVMEDIA_TYPE_VIDEO,
4078  .id = AV_CODEC_ID_VP9,
4079  .priv_data_size = sizeof(VP9Context),
4080  .init = vp9_decode_init,
4081  .close = vp9_decode_free,
4082  .decode = vp9_decode_frame,
4083  .capabilities = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
4084  .flush = vp9_decode_flush,
4085  .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4086  .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),
4087 };
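/* Editor's note: a minimal, hypothetical usage sketch of this decoder through
 * the public libavcodec API of the same era (error handling trimmed; the
 * helper name is invented for illustration, not part of the file): */
#if 0
#include <libavcodec/avcodec.h>

static int decode_one_vp9_packet(AVPacket *pkt, AVFrame *frame)
{
    AVCodec *codec = avcodec_find_decoder(AV_CODEC_ID_VP9);
    AVCodecContext *avctx = avcodec_alloc_context3(codec);
    int got_frame = 0, ret;

    if (!codec || !avctx || avcodec_open2(avctx, codec, NULL) < 0)
        return AVERROR(EINVAL);
    /* dispatches internally to vp9_decode_frame() above */
    ret = avcodec_decode_video2(avctx, frame, &got_frame, pkt);
    avcodec_close(avctx);
    av_free(avctx);
    return ret < 0 ? ret : got_frame;
}
#endif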