FFmpeg
af_arnndn.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2018 Gregor Richards
3  * Copyright (c) 2017 Mozilla
4  * Copyright (c) 2005-2009 Xiph.Org Foundation
5  * Copyright (c) 2007-2008 CSIRO
6  * Copyright (c) 2008-2011 Octasic Inc.
7  * Copyright (c) Jean-Marc Valin
8  * Copyright (c) 2019 Paul B Mahol
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  *
14  * - Redistributions of source code must retain the above copyright
15  * notice, this list of conditions and the following disclaimer.
16  *
17  * - Redistributions in binary form must reproduce the above copyright
18  * notice, this list of conditions and the following disclaimer in the
19  * documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
25  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 #include "libavutil/avassert.h"
35 #include "libavutil/file_open.h"
36 #include "libavutil/float_dsp.h"
37 #include "libavutil/mem_internal.h"
38 #include "libavutil/opt.h"
39 #include "libavutil/tx.h"
40 #include "avfilter.h"
41 #include "audio.h"
42 #include "filters.h"
43 #include "formats.h"
44 
45 #define FRAME_SIZE_SHIFT 2
46 #define FRAME_SIZE (120<<FRAME_SIZE_SHIFT)
47 #define WINDOW_SIZE (2*FRAME_SIZE)
48 #define FREQ_SIZE (FRAME_SIZE + 1)
49 
50 #define PITCH_MIN_PERIOD 60
51 #define PITCH_MAX_PERIOD 768
52 #define PITCH_FRAME_SIZE 960
53 #define PITCH_BUF_SIZE (PITCH_MAX_PERIOD+PITCH_FRAME_SIZE)
54 
55 #define SQUARE(x) ((x)*(x))
56 
57 #define NB_BANDS 22
58 
59 #define CEPS_MEM 8
60 #define NB_DELTA_CEPS 6
61 
62 #define NB_FEATURES (NB_BANDS+3*NB_DELTA_CEPS+2)
63 
64 #define WEIGHTS_SCALE (1.f/256)
65 
66 #define MAX_NEURONS 128
67 
68 #define ACTIVATION_TANH 0
69 #define ACTIVATION_SIGMOID 1
70 #define ACTIVATION_RELU 2
71 
72 #define Q15ONE 1.0f
73 
74 typedef struct DenseLayer {
75  const float *bias;
76  const float *input_weights;
77  int nb_inputs;
80 } DenseLayer;
81 
82 typedef struct GRULayer {
83  const float *bias;
84  const float *input_weights;
85  const float *recurrent_weights;
86  int nb_inputs;
89 } GRULayer;
90 
91 typedef struct RNNModel {
94 
96  const GRULayer *vad_gru;
97 
100 
103 
106 
109 } RNNModel;
110 
111 typedef struct RNNState {
116 } RNNState;
117 
118 typedef struct DenoiseState {
121  int memid;
125  float last_gain;
127  float mem_hp_x[2];
128  float lastg[NB_BANDS];
133 } DenoiseState;
134 
135 typedef struct AudioRNNContext {
136  const AVClass *class;
137 
138  char *model_name;
139  float mix;
140 
141  int channels;
143 
146 
148 
151 
152 #define F_ACTIVATION_TANH 0
153 #define F_ACTIVATION_SIGMOID 1
154 #define F_ACTIVATION_RELU 2
155 
156 static void rnnoise_model_free(RNNModel *model)
157 {
158 #define FREE_MAYBE(ptr) do { if (ptr) free(ptr); } while (0)
159 #define FREE_DENSE(name) do { \
160  if (model->name) { \
161  av_free((void *) model->name->input_weights); \
162  av_free((void *) model->name->bias); \
163  av_free((void *) model->name); \
164  } \
165  } while (0)
166 #define FREE_GRU(name) do { \
167  if (model->name) { \
168  av_free((void *) model->name->input_weights); \
169  av_free((void *) model->name->recurrent_weights); \
170  av_free((void *) model->name->bias); \
171  av_free((void *) model->name); \
172  } \
173  } while (0)
174 
175  if (!model)
176  return;
177  FREE_DENSE(input_dense);
178  FREE_GRU(vad_gru);
179  FREE_GRU(noise_gru);
180  FREE_GRU(denoise_gru);
181  FREE_DENSE(denoise_output);
182  FREE_DENSE(vad_output);
183  av_free(model);
184 }
185 
186 static int rnnoise_model_from_file(FILE *f, RNNModel **rnn)
187 {
188  RNNModel *ret = NULL;
189  DenseLayer *input_dense;
190  GRULayer *vad_gru;
191  GRULayer *noise_gru;
192  GRULayer *denoise_gru;
193  DenseLayer *denoise_output;
194  DenseLayer *vad_output;
195  int in;
196 
197  if (fscanf(f, "rnnoise-nu model file version %d\n", &in) != 1 || in != 1)
198  return AVERROR_INVALIDDATA;
199 
200  ret = av_calloc(1, sizeof(RNNModel));
201  if (!ret)
202  return AVERROR(ENOMEM);
203 
204 #define ALLOC_LAYER(type, name) \
205  name = av_calloc(1, sizeof(type)); \
206  if (!name) { \
207  rnnoise_model_free(ret); \
208  return AVERROR(ENOMEM); \
209  } \
210  ret->name = name
211 
212  ALLOC_LAYER(DenseLayer, input_dense);
213  ALLOC_LAYER(GRULayer, vad_gru);
214  ALLOC_LAYER(GRULayer, noise_gru);
215  ALLOC_LAYER(GRULayer, denoise_gru);
216  ALLOC_LAYER(DenseLayer, denoise_output);
217  ALLOC_LAYER(DenseLayer, vad_output);
218 
219 #define INPUT_VAL(name) do { \
220  if (fscanf(f, "%d", &in) != 1 || in < 0 || in > 128) { \
221  rnnoise_model_free(ret); \
222  return AVERROR(EINVAL); \
223  } \
224  name = in; \
225  } while (0)
226 
227 #define INPUT_ACTIVATION(name) do { \
228  int activation; \
229  INPUT_VAL(activation); \
230  switch (activation) { \
231  case F_ACTIVATION_SIGMOID: \
232  name = ACTIVATION_SIGMOID; \
233  break; \
234  case F_ACTIVATION_RELU: \
235  name = ACTIVATION_RELU; \
236  break; \
237  default: \
238  name = ACTIVATION_TANH; \
239  } \
240  } while (0)
241 
242 #define INPUT_ARRAY(name, len) do { \
243  float *values = av_calloc((len), sizeof(float)); \
244  if (!values) { \
245  rnnoise_model_free(ret); \
246  return AVERROR(ENOMEM); \
247  } \
248  name = values; \
249  for (int i = 0; i < (len); i++) { \
250  if (fscanf(f, "%d", &in) != 1) { \
251  rnnoise_model_free(ret); \
252  return AVERROR(EINVAL); \
253  } \
254  values[i] = in; \
255  } \
256  } while (0)
257 
258 #define INPUT_ARRAY3(name, len0, len1, len2) do { \
259  float *values = av_calloc(FFALIGN((len0), 4) * FFALIGN((len1), 4) * (len2), sizeof(float)); \
260  if (!values) { \
261  rnnoise_model_free(ret); \
262  return AVERROR(ENOMEM); \
263  } \
264  name = values; \
265  for (int k = 0; k < (len0); k++) { \
266  for (int i = 0; i < (len2); i++) { \
267  for (int j = 0; j < (len1); j++) { \
268  if (fscanf(f, "%d", &in) != 1) { \
269  rnnoise_model_free(ret); \
270  return AVERROR(EINVAL); \
271  } \
272  values[j * (len2) * FFALIGN((len0), 4) + i * FFALIGN((len0), 4) + k] = in; \
273  } \
274  } \
275  } \
276  } while (0)
277 
278 #define NEW_LINE() do { \
279  int c; \
280  while ((c = fgetc(f)) != EOF) { \
281  if (c == '\n') \
282  break; \
283  } \
284  } while (0)
285 
286 #define INPUT_DENSE(name) do { \
287  INPUT_VAL(name->nb_inputs); \
288  INPUT_VAL(name->nb_neurons); \
289  ret->name ## _size = name->nb_neurons; \
290  INPUT_ACTIVATION(name->activation); \
291  NEW_LINE(); \
292  INPUT_ARRAY(name->input_weights, name->nb_inputs * name->nb_neurons); \
293  NEW_LINE(); \
294  INPUT_ARRAY(name->bias, name->nb_neurons); \
295  NEW_LINE(); \
296  } while (0)
297 
298 #define INPUT_GRU(name) do { \
299  INPUT_VAL(name->nb_inputs); \
300  INPUT_VAL(name->nb_neurons); \
301  ret->name ## _size = name->nb_neurons; \
302  INPUT_ACTIVATION(name->activation); \
303  NEW_LINE(); \
304  INPUT_ARRAY3(name->input_weights, name->nb_inputs, name->nb_neurons, 3); \
305  NEW_LINE(); \
306  INPUT_ARRAY3(name->recurrent_weights, name->nb_neurons, name->nb_neurons, 3); \
307  NEW_LINE(); \
308  INPUT_ARRAY(name->bias, name->nb_neurons * 3); \
309  NEW_LINE(); \
310  } while (0)
311 
312  INPUT_DENSE(input_dense);
313  INPUT_GRU(vad_gru);
314  INPUT_GRU(noise_gru);
315  INPUT_GRU(denoise_gru);
316  INPUT_DENSE(denoise_output);
317  INPUT_DENSE(vad_output);
318 
319  if (vad_output->nb_neurons != 1) {
321  return AVERROR(EINVAL);
322  }
323 
324  *rnn = ret;
325 
326  return 0;
327 }
328 
330 {
331  static const enum AVSampleFormat sample_fmts[] = {
334  };
335  int ret, sample_rates[] = { 48000, -1 };
336 
338  if (ret < 0)
339  return ret;
340 
342  if (ret < 0)
343  return ret;
344 
346 }
347 
349 {
350  AVFilterContext *ctx = inlink->dst;
351  AudioRNNContext *s = ctx->priv;
352  int ret = 0;
353 
354  s->channels = inlink->ch_layout.nb_channels;
355 
356  if (!s->st)
357  s->st = av_calloc(s->channels, sizeof(DenoiseState));
358  if (!s->st)
359  return AVERROR(ENOMEM);
360 
361  for (int i = 0; i < s->channels; i++) {
362  DenoiseState *st = &s->st[i];
363 
364  st->rnn[0].model = s->model[0];
365  st->rnn[0].vad_gru_state = av_calloc(sizeof(float), FFALIGN(s->model[0]->vad_gru_size, 16));
366  st->rnn[0].noise_gru_state = av_calloc(sizeof(float), FFALIGN(s->model[0]->noise_gru_size, 16));
367  st->rnn[0].denoise_gru_state = av_calloc(sizeof(float), FFALIGN(s->model[0]->denoise_gru_size, 16));
368  if (!st->rnn[0].vad_gru_state ||
369  !st->rnn[0].noise_gru_state ||
370  !st->rnn[0].denoise_gru_state)
371  return AVERROR(ENOMEM);
372  }
373 
374  for (int i = 0; i < s->channels; i++) {
375  DenoiseState *st = &s->st[i];
376 
377  if (!st->tx)
378  ret = av_tx_init(&st->tx, &st->tx_fn, AV_TX_FLOAT_FFT, 0, WINDOW_SIZE, NULL, 0);
379  if (ret < 0)
380  return ret;
381 
382  if (!st->txi)
383  ret = av_tx_init(&st->txi, &st->txi_fn, AV_TX_FLOAT_FFT, 1, WINDOW_SIZE, NULL, 0);
384  if (ret < 0)
385  return ret;
386  }
387 
388  return ret;
389 }
390 
/**
 * Filter N samples through a two-state recursive section.
 *
 * Each output sample is the input sample plus mem[0]; the two state values
 * are then refreshed from the input and the freshly produced output using
 * the coefficient pairs b[] and a[].
 *
 * @param y   output samples, N entries
 * @param mem filter state, read and updated in place
 * @param x   input samples, N entries
 * @param b   two coefficients applied to the input sample
 * @param a   two coefficients applied to the output sample
 * @param N   number of samples to process
 */
static void biquad(float *y, float mem[2], const float *x,
                   const float *b, const float *a, int N)
{
    int n = 0;

    while (n < N) {
        const float in  = x[n];
        const float res = in + mem[0];

        /* state update uses the old mem[1] before it is overwritten */
        mem[0] = mem[1] + (b[0]*in - a[0]*res);
        mem[1] = (b[1]*in - a[1]*res);
        y[n]   = res;
        n++;
    }
}
404 
405 #define RNN_MOVE(dst, src, n) (memmove((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
406 #define RNN_CLEAR(dst, n) (memset((dst), 0, (n)*sizeof(*(dst))))
407 #define RNN_COPY(dst, src, n) (memcpy((dst), (src), (n)*sizeof(*(dst)) + 0*((dst)-(src)) ))
408 
409 static void forward_transform(DenoiseState *st, AVComplexFloat *out, const float *in)
410 {
413 
414  for (int i = 0; i < WINDOW_SIZE; i++) {
415  x[i].re = in[i];
416  x[i].im = 0;
417  }
418 
419  st->tx_fn(st->tx, y, x, sizeof(float));
420 
421  RNN_COPY(out, y, FREQ_SIZE);
422 }
423 
424 static void inverse_transform(DenoiseState *st, float *out, const AVComplexFloat *in)
425 {
428 
429  RNN_COPY(x, in, FREQ_SIZE);
430 
431  for (int i = FREQ_SIZE; i < WINDOW_SIZE; i++) {
432  x[i].re = x[WINDOW_SIZE - i].re;
433  x[i].im = -x[WINDOW_SIZE - i].im;
434  }
435 
436  st->txi_fn(st->txi, y, x, sizeof(float));
437 
438  for (int i = 0; i < WINDOW_SIZE; i++)
439  out[i] = y[i].re / WINDOW_SIZE;
440 }
441 
442 static const uint8_t eband5ms[] = {
443 /*0 200 400 600 800 1k 1.2 1.4 1.6 2k 2.4 2.8 3.2 4k 4.8 5.6 6.8 8k 9.6 12k 15.6 20k*/
444  0, 1, 2, 3, 4, 5, 6, 7, 8, 10, 12, 14, 16, 20, 24, 28, 34, 40, 48, 60, 78, 100
445 };
446 
447 static void compute_band_energy(float *bandE, const AVComplexFloat *X)
448 {
449  float sum[NB_BANDS] = {0};
450 
451  for (int i = 0; i < NB_BANDS - 1; i++) {
452  int band_size;
453 
454  band_size = (eband5ms[i + 1] - eband5ms[i]) << FRAME_SIZE_SHIFT;
455  for (int j = 0; j < band_size; j++) {
456  float tmp, frac = (float)j / band_size;
457 
458  tmp = SQUARE(X[(eband5ms[i] << FRAME_SIZE_SHIFT) + j].re);
459  tmp += SQUARE(X[(eband5ms[i] << FRAME_SIZE_SHIFT) + j].im);
460  sum[i] += (1.f - frac) * tmp;
461  sum[i + 1] += frac * tmp;
462  }
463  }
464 
465  sum[0] *= 2;
466  sum[NB_BANDS - 1] *= 2;
467 
468  for (int i = 0; i < NB_BANDS; i++)
469  bandE[i] = sum[i];
470 }
471 
472 static void compute_band_corr(float *bandE, const AVComplexFloat *X, const AVComplexFloat *P)
473 {
474  float sum[NB_BANDS] = { 0 };
475 
476  for (int i = 0; i < NB_BANDS - 1; i++) {
477  int band_size;
478 
479  band_size = (eband5ms[i + 1] - eband5ms[i]) << FRAME_SIZE_SHIFT;
480  for (int j = 0; j < band_size; j++) {
481  float tmp, frac = (float)j / band_size;
482 
483  tmp = X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].re * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].re;
484  tmp += X[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].im * P[(eband5ms[i]<<FRAME_SIZE_SHIFT) + j].im;
485  sum[i] += (1 - frac) * tmp;
486  sum[i + 1] += frac * tmp;
487  }
488  }
489 
490  sum[0] *= 2;
491  sum[NB_BANDS-1] *= 2;
492 
493  for (int i = 0; i < NB_BANDS; i++)
494  bandE[i] = sum[i];
495 }
496 
497 static void frame_analysis(AudioRNNContext *s, DenoiseState *st, AVComplexFloat *X, float *Ex, const float *in)
498 {
499  LOCAL_ALIGNED_32(float, x, [WINDOW_SIZE]);
500 
502  RNN_COPY(x + FRAME_SIZE, in, FRAME_SIZE);
503  RNN_COPY(st->analysis_mem, in, FRAME_SIZE);
504  s->fdsp->vector_fmul(x, x, s->window, WINDOW_SIZE);
505  forward_transform(st, X, x);
506  compute_band_energy(Ex, X);
507 }
508 
509 static void frame_synthesis(AudioRNNContext *s, DenoiseState *st, float *out, const AVComplexFloat *y)
510 {
511  LOCAL_ALIGNED_32(float, x, [WINDOW_SIZE]);
512  const float *src = st->history;
513  const float mix = s->mix;
514  const float imix = 1.f - FFMAX(mix, 0.f);
515 
516  inverse_transform(st, x, y);
517  s->fdsp->vector_fmul(x, x, s->window, WINDOW_SIZE);
518  s->fdsp->vector_fmac_scalar(x, st->synthesis_mem, 1.f, FRAME_SIZE);
519  RNN_COPY(out, x, FRAME_SIZE);
521 
522  for (int n = 0; n < FRAME_SIZE; n++)
523  out[n] = out[n] * mix + src[n] * imix;
524 }
525 
/**
 * Accumulate four neighbouring cross-correlation lags at once.
 *
 * For k = 0..3, sum[k] is increased by x[j] * y[j + k] for every j in
 * [0, len). The caller provides the initial values of sum[]. Besides the
 * first three entries of y, which are always loaded, y[j + 3] is read for
 * each processed j.
 *
 * @param x   first sequence, len entries
 * @param y   second sequence
 * @param sum four running sums, updated in place
 * @param len number of products per lag
 */
static inline void xcorr_kernel(const float *x, const float *y, float sum[4], int len)
{
    /* win[0..3] holds y[j..j+3] for the sample currently processed */
    float win[4];

    win[0] = y[0];
    win[1] = y[1];
    win[2] = y[2];
    win[3] = 0;

    for (int j = 0; j < len; j++) {
        const float xj = x[j];

        win[3] = y[j + 3];
        sum[0] += xj * win[0];
        sum[1] += xj * win[1];
        sum[2] += xj * win[2];
        sum[3] += xj * win[3];

        /* slide the window by one sample */
        win[0] = win[1];
        win[1] = win[2];
        win[2] = win[3];
    }
}
594 
/**
 * Dot product of the first N entries of x and y.
 *
 * @return the accumulated sum, 0 when N is not positive
 */
static inline float celt_inner_prod(const float *x,
                                    const float *y, int N)
{
    float acc = 0.f;
    int n = 0;

    while (n < N) {
        acc += x[n] * y[n];
        n++;
    }

    return acc;
}
605 
/**
 * Cross-correlate x against y for the lags 0 .. max_pitch - 1.
 *
 * xcorr[lag] receives the sum of x[j] * y[lag + j] over j in [0, len).
 * Lags are handled four at a time through xcorr_kernel(); whatever is left
 * when max_pitch is not a multiple of four goes through celt_inner_prod().
 *
 * @param x         reference sequence, len entries
 * @param y         sequence that is shifted against x
 * @param xcorr     output, max_pitch entries
 * @param len       number of products per lag
 * @param max_pitch number of lags to compute
 */
static void celt_pitch_xcorr(const float *x, const float *y,
                             float *xcorr, int len, int max_pitch)
{
    int lag = 0;

    while (lag < max_pitch - 3) {
        float acc[4] = { 0, 0, 0, 0 };

        xcorr_kernel(x, y + lag, acc, len);

        for (int k = 0; k < 4; k++)
            xcorr[lag + k] = acc[k];
        lag += 4;
    }

    /* remaining lags, one by one */
    while (lag < max_pitch) {
        xcorr[lag] = celt_inner_prod(x, y + lag, len);
        lag++;
    }
}
626 
627 static int celt_autocorr(const float *x, /* in: [0...n-1] samples x */
628  float *ac, /* out: [0...lag-1] ac values */
629  const float *window,
630  int overlap,
631  int lag,
632  int n)
633 {
634  int fastN = n - lag;
635  int shift;
636  const float *xptr;
637  float xx[PITCH_BUF_SIZE>>1];
638 
639  if (overlap == 0) {
640  xptr = x;
641  } else {
642  for (int i = 0; i < n; i++)
643  xx[i] = x[i];
644  for (int i = 0; i < overlap; i++) {
645  xx[i] = x[i] * window[i];
646  xx[n-i-1] = x[n-i-1] * window[i];
647  }
648  xptr = xx;
649  }
650 
651  shift = 0;
652  celt_pitch_xcorr(xptr, xptr, ac, fastN, lag+1);
653 
654  for (int k = 0; k <= lag; k++) {
655  float d = 0.f;
656 
657  for (int i = k + fastN; i < n; i++)
658  d += xptr[i] * xptr[i-k];
659  ac[k] += d;
660  }
661 
662  return shift;
663 }
664 
/**
 * Derive p prediction coefficients from autocorrelation values.
 *
 * The coefficients are built up one order at a time; the iteration stops
 * early once the remaining error falls below 0.001 * ac[0]. When ac[0] is
 * zero, all coefficients are left at zero.
 *
 * @param lpc output, p coefficients
 * @param ac  input, p + 1 autocorrelation values
 * @param p   number of coefficients
 */
static void celt_lpc(float *lpc,      /* out: [0...p-1] LPC coefficients      */
                     const float *ac, /* in:  [0...p] autocorrelation values  */
                     int p)
{
    float err = ac[0];

    RNN_CLEAR(lpc, p);
    if (ac[0] == 0)
        return;

    for (int order = 0; order < p; order++) {
        float acc = 0;
        float refl;

        /* reflection coefficient of this order */
        for (int j = 0; j < order; j++)
            acc += (lpc[j] * ac[order - j]);
        acc += ac[order + 1];
        refl = -acc/err;

        /* fold it into the coefficients found so far, pairwise from both ends */
        lpc[order] = refl;
        for (int j = 0; j < (order + 1) >> 1; j++) {
            const float lo = lpc[j];
            const float hi = lpc[order-1-j];

            lpc[j]         = lo + (refl*hi);
            lpc[order-1-j] = hi + (refl*lo);
        }

        err = err - (refl * refl *err);
        /* Bail out once we get 30 dB gain */
        if (err < .001f * ac[0])
            break;
    }
}
697 
/**
 * Five-tap filter over N samples with persistent history.
 *
 * y[i] = x[i] + num[0]*h0 + ... + num[4]*h4, where h0..h4 are the five most
 * recent earlier input samples (initially taken from mem[]). Each x[i] is
 * read before y[i] is stored, so x and y may be the same buffer.
 *
 * @param x   input samples
 * @param num five tap weights
 * @param y   output samples
 * @param N   number of samples
 * @param mem five history values, read on entry and written back on exit
 */
static void celt_fir5(const float *x,
                      const float *num,
                      float *y,
                      int N,
                      float *mem)
{
    float tap[5], hist[5];

    for (int k = 0; k < 5; k++) {
        tap[k]  = num[k];
        hist[k] = mem[k];
    }

    for (int i = 0; i < N; i++) {
        const float cur = x[i];
        float acc = cur;

        for (int k = 0; k < 5; k++)
            acc += (tap[k]*hist[k]);

        /* shift the history and insert the newest input sample */
        for (int k = 4; k > 0; k--)
            hist[k] = hist[k - 1];
        hist[0] = cur;

        y[i] = acc;
    }

    for (int k = 0; k < 5; k++)
        mem[k] = hist[k];
}
740 
741 static void pitch_downsample(float *x[], float *x_lp,
742  int len, int C)
743 {
744  float ac[5];
745  float tmp=Q15ONE;
746  float lpc[4], mem[5]={0,0,0,0,0};
747  float lpc2[5];
748  float c1 = .8f;
749 
750  for (int i = 1; i < len >> 1; i++)
751  x_lp[i] = .5f * (.5f * (x[0][(2*i-1)]+x[0][(2*i+1)])+x[0][2*i]);
752  x_lp[0] = .5f * (.5f * (x[0][1])+x[0][0]);
753  if (C==2) {
754  for (int i = 1; i < len >> 1; i++)
755  x_lp[i] += (.5f * (.5f * (x[1][(2*i-1)]+x[1][(2*i+1)])+x[1][2*i]));
756  x_lp[0] += .5f * (.5f * (x[1][1])+x[1][0]);
757  }
758 
759  celt_autocorr(x_lp, ac, NULL, 0, 4, len>>1);
760 
761  /* Noise floor -40 dB */
762  ac[0] *= 1.0001f;
763  /* Lag windowing */
764  for (int i = 1; i <= 4; i++) {
765  /*ac[i] *= exp(-.5*(2*M_PI*.002*i)*(2*M_PI*.002*i));*/
766  ac[i] -= ac[i]*(.008f*i)*(.008f*i);
767  }
768 
769  celt_lpc(lpc, ac, 4);
770  for (int i = 0; i < 4; i++) {
771  tmp = .9f * tmp;
772  lpc[i] = (lpc[i] * tmp);
773  }
774  /* Add a zero */
775  lpc2[0] = lpc[0] + .8f;
776  lpc2[1] = lpc[1] + (c1 * lpc[0]);
777  lpc2[2] = lpc[2] + (c1 * lpc[1]);
778  lpc2[3] = lpc[3] + (c1 * lpc[2]);
779  lpc2[4] = (c1 * lpc[3]);
780  celt_fir5(x_lp, lpc2, x_lp, len>>1, mem);
781 }
782 
/**
 * Two dot products that share the same left operand.
 *
 * @param x   common operand, N entries
 * @param y01 right operand of the first product
 * @param y02 right operand of the second product
 * @param N   number of entries
 * @param xy1 receives the dot product of x and y01
 * @param xy2 receives the dot product of x and y02
 */
static inline void dual_inner_prod(const float *x, const float *y01, const float *y02,
                                   int N, float *xy1, float *xy2)
{
    float acc1 = 0, acc2 = 0;
    int n = 0;

    while (n < N) {
        const float xn = x[n];

        acc1 += (xn * y01[n]);
        acc2 += (xn * y02[n]);
        n++;
    }

    *xy1 = acc1;
    *xy2 = acc2;
}
796 
797 static float compute_pitch_gain(float xy, float xx, float yy)
798 {
799  return xy / sqrtf(1.f + xx * yy);
800 }
801 
802 static const uint8_t second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2};
803 static float remove_doubling(float *x, int maxperiod, int minperiod, int N,
804  int *T0_, int prev_period, float prev_gain)
805 {
806  int k, i, T, T0;
807  float g, g0;
808  float pg;
809  float xy,xx,yy,xy2;
810  float xcorr[3];
811  float best_xy, best_yy;
812  int offset;
813  int minperiod0;
814  float yy_lookup[PITCH_MAX_PERIOD+1];
815 
816  minperiod0 = minperiod;
817  maxperiod /= 2;
818  minperiod /= 2;
819  *T0_ /= 2;
820  prev_period /= 2;
821  N /= 2;
822  x += maxperiod;
823  if (*T0_>=maxperiod)
824  *T0_=maxperiod-1;
825 
826  T = T0 = *T0_;
827  dual_inner_prod(x, x, x-T0, N, &xx, &xy);
828  yy_lookup[0] = xx;
829  yy=xx;
830  for (i = 1; i <= maxperiod; i++) {
831  yy = yy+(x[-i] * x[-i])-(x[N-i] * x[N-i]);
832  yy_lookup[i] = FFMAX(0, yy);
833  }
834  yy = yy_lookup[T0];
835  best_xy = xy;
836  best_yy = yy;
837  g = g0 = compute_pitch_gain(xy, xx, yy);
838  /* Look for any pitch at T/k */
839  for (k = 2; k <= 15; k++) {
840  int T1, T1b;
841  float g1;
842  float cont=0;
843  float thresh;
844  T1 = (2*T0+k)/(2*k);
845  if (T1 < minperiod)
846  break;
847  /* Look for another strong correlation at T1b */
848  if (k==2)
849  {
850  if (T1+T0>maxperiod)
851  T1b = T0;
852  else
853  T1b = T0+T1;
854  } else
855  {
856  T1b = (2*second_check[k]*T0+k)/(2*k);
857  }
858  dual_inner_prod(x, &x[-T1], &x[-T1b], N, &xy, &xy2);
859  xy = .5f * (xy + xy2);
860  yy = .5f * (yy_lookup[T1] + yy_lookup[T1b]);
861  g1 = compute_pitch_gain(xy, xx, yy);
862  if (FFABS(T1-prev_period)<=1)
863  cont = prev_gain;
864  else if (FFABS(T1-prev_period)<=2 && 5 * k * k < T0)
865  cont = prev_gain * .5f;
866  else
867  cont = 0;
868  thresh = FFMAX(.3f, (.7f * g0) - cont);
869  /* Bias against very high pitch (very short period) to avoid false-positives
870  due to short-term correlation */
871  if (T1<3*minperiod)
872  thresh = FFMAX(.4f, (.85f * g0) - cont);
873  else if (T1<2*minperiod)
874  thresh = FFMAX(.5f, (.9f * g0) - cont);
875  if (g1 > thresh)
876  {
877  best_xy = xy;
878  best_yy = yy;
879  T = T1;
880  g = g1;
881  }
882  }
883  best_xy = FFMAX(0, best_xy);
884  if (best_yy <= best_xy)
885  pg = Q15ONE;
886  else
887  pg = best_xy/(best_yy + 1);
888 
889  for (k = 0; k < 3; k++)
890  xcorr[k] = celt_inner_prod(x, x-(T+k-1), N);
891  if ((xcorr[2]-xcorr[0]) > .7f * (xcorr[1]-xcorr[0]))
892  offset = 1;
893  else if ((xcorr[0]-xcorr[2]) > (.7f * (xcorr[1] - xcorr[2])))
894  offset = -1;
895  else
896  offset = 0;
897  if (pg > g)
898  pg = g;
899  *T0_ = 2*T+offset;
900 
901  if (*T0_<minperiod0)
902  *T0_=minperiod0;
903  return pg;
904 }
905 
/**
 * Pick the two lags with the highest normalised correlation.
 *
 * Only lags with a positive xcorr[] value are considered. Candidates are
 * ranked by xcorr^2 / energy, compared by cross-multiplication so that no
 * division is needed; the energy of the len samples of y starting at the
 * current lag is updated incrementally and kept at 1 or above.
 *
 * @param xcorr      correlation per lag, max_pitch entries
 * @param y          signal the energies are taken from
 * @param len        length of the energy window
 * @param max_pitch  number of lags
 * @param best_pitch output: [0] best lag, [1] second best lag
 */
static void find_best_pitch(float *xcorr, float *y, int len,
                            int max_pitch, int *best_pitch)
{
    float top_num[2] = { -1, -1 };
    float top_den[2] = { 0, 0 };
    float energy = 1.f;

    best_pitch[0] = 0;
    best_pitch[1] = 1;

    for (int j = 0; j < len; j++)
        energy += y[j] * y[j];

    for (int lag = 0; lag < max_pitch; lag++) {
        if (xcorr[lag] > 0) {
            /* Considering the range of the correlation, scaling first should
               avoid both underflows and overflows (inf) when squaring it */
            const float scaled = xcorr[lag] * 1e-12f;
            const float num = scaled * scaled;

            if ((num * top_den[1]) > (top_num[1] * energy)) {
                if ((num * top_den[0]) > (top_num[0] * energy)) {
                    /* new best: the former best moves to second place */
                    top_num[1]    = top_num[0];
                    top_den[1]    = top_den[0];
                    best_pitch[1] = best_pitch[0];
                    top_num[0]    = num;
                    top_den[0]    = energy;
                    best_pitch[0] = lag;
                } else {
                    top_num[1]    = num;
                    top_den[1]    = energy;
                    best_pitch[1] = lag;
                }
            }
        }

        /* move the energy window forward by one sample */
        energy += y[lag+len]*y[lag+len] - y[lag] * y[lag];
        energy = FFMAX(1, energy);
    }
}
952 
953 static void pitch_search(const float *x_lp, float *y,
954  int len, int max_pitch, int *pitch)
955 {
956  int lag;
957  int best_pitch[2]={0,0};
958  int offset;
959 
960  float x_lp4[WINDOW_SIZE];
961  float y_lp4[WINDOW_SIZE];
962  float xcorr[WINDOW_SIZE];
963 
964  lag = len+max_pitch;
965 
966  /* Downsample by 2 again */
967  for (int j = 0; j < len >> 2; j++)
968  x_lp4[j] = x_lp[2*j];
969  for (int j = 0; j < lag >> 2; j++)
970  y_lp4[j] = y[2*j];
971 
972  /* Coarse search with 4x decimation */
973 
974  celt_pitch_xcorr(x_lp4, y_lp4, xcorr, len>>2, max_pitch>>2);
975 
976  find_best_pitch(xcorr, y_lp4, len>>2, max_pitch>>2, best_pitch);
977 
978  /* Finer search with 2x decimation */
979  for (int i = 0; i < max_pitch >> 1; i++) {
980  float sum;
981  xcorr[i] = 0;
982  if (FFABS(i-2*best_pitch[0])>2 && FFABS(i-2*best_pitch[1])>2)
983  continue;
984  sum = celt_inner_prod(x_lp, y+i, len>>1);
985  xcorr[i] = FFMAX(-1, sum);
986  }
987 
988  find_best_pitch(xcorr, y, len>>1, max_pitch>>1, best_pitch);
989 
990  /* Refine by pseudo-interpolation */
991  if (best_pitch[0] > 0 && best_pitch[0] < (max_pitch >> 1) - 1) {
992  float a, b, c;
993 
994  a = xcorr[best_pitch[0] - 1];
995  b = xcorr[best_pitch[0]];
996  c = xcorr[best_pitch[0] + 1];
997  if (c - a > .7f * (b - a))
998  offset = 1;
999  else if (a - c > .7f * (b-c))
1000  offset = -1;
1001  else
1002  offset = 0;
1003  } else {
1004  offset = 0;
1005  }
1006 
1007  *pitch = 2 * best_pitch[0] - offset;
1008 }
1009 
1010 static void dct(AudioRNNContext *s, float *out, const float *in)
1011 {
1012  for (int i = 0; i < NB_BANDS; i++) {
1013  float sum;
1014 
1015  sum = s->fdsp->scalarproduct_float(in, s->dct_table[i], FFALIGN(NB_BANDS, 4));
1016  out[i] = sum * sqrtf(2.f / 22);
1017  }
1018 }
1019 
1021  float *Ex, float *Ep, float *Exp, float *features, const float *in)
1022 {
1023  float E = 0;
1024  float *ceps_0, *ceps_1, *ceps_2;
1025  float spec_variability = 0;
1026  LOCAL_ALIGNED_32(float, Ly, [NB_BANDS]);
1027  LOCAL_ALIGNED_32(float, p, [WINDOW_SIZE]);
1028  float pitch_buf[PITCH_BUF_SIZE>>1];
1029  int pitch_index;
1030  float gain;
1031  float *(pre[1]);
1032  float tmp[NB_BANDS];
1033  float follow, logMax;
1034 
1035  frame_analysis(s, st, X, Ex, in);
1038  pre[0] = &st->pitch_buf[0];
1039  pitch_downsample(pre, pitch_buf, PITCH_BUF_SIZE, 1);
1040  pitch_search(pitch_buf+(PITCH_MAX_PERIOD>>1), pitch_buf, PITCH_FRAME_SIZE,
1041  PITCH_MAX_PERIOD-3*PITCH_MIN_PERIOD, &pitch_index);
1042  pitch_index = PITCH_MAX_PERIOD-pitch_index;
1043 
1045  PITCH_FRAME_SIZE, &pitch_index, st->last_period, st->last_gain);
1046  st->last_period = pitch_index;
1047  st->last_gain = gain;
1048 
1049  for (int i = 0; i < WINDOW_SIZE; i++)
1050  p[i] = st->pitch_buf[PITCH_BUF_SIZE-WINDOW_SIZE-pitch_index+i];
1051 
1052  s->fdsp->vector_fmul(p, p, s->window, WINDOW_SIZE);
1053  forward_transform(st, P, p);
1054  compute_band_energy(Ep, P);
1055  compute_band_corr(Exp, X, P);
1056 
1057  for (int i = 0; i < NB_BANDS; i++)
1058  Exp[i] = Exp[i] / sqrtf(.001f+Ex[i]*Ep[i]);
1059 
1060  dct(s, tmp, Exp);
1061 
1062  for (int i = 0; i < NB_DELTA_CEPS; i++)
1063  features[NB_BANDS+2*NB_DELTA_CEPS+i] = tmp[i];
1064 
1065  features[NB_BANDS+2*NB_DELTA_CEPS] -= 1.3;
1066  features[NB_BANDS+2*NB_DELTA_CEPS+1] -= 0.9;
1067  features[NB_BANDS+3*NB_DELTA_CEPS] = .01*(pitch_index-300);
1068  logMax = -2;
1069  follow = -2;
1070 
1071  for (int i = 0; i < NB_BANDS; i++) {
1072  Ly[i] = log10f(1e-2f + Ex[i]);
1073  Ly[i] = FFMAX(logMax-7, FFMAX(follow-1.5, Ly[i]));
1074  logMax = FFMAX(logMax, Ly[i]);
1075  follow = FFMAX(follow-1.5, Ly[i]);
1076  E += Ex[i];
1077  }
1078 
1079  if (E < 0.04f) {
1080  /* If there's no audio, avoid messing up the state. */
1081  RNN_CLEAR(features, NB_FEATURES);
1082  return 1;
1083  }
1084 
1085  dct(s, features, Ly);
1086  features[0] -= 12;
1087  features[1] -= 4;
1088  ceps_0 = st->cepstral_mem[st->memid];
1089  ceps_1 = (st->memid < 1) ? st->cepstral_mem[CEPS_MEM+st->memid-1] : st->cepstral_mem[st->memid-1];
1090  ceps_2 = (st->memid < 2) ? st->cepstral_mem[CEPS_MEM+st->memid-2] : st->cepstral_mem[st->memid-2];
1091 
1092  for (int i = 0; i < NB_BANDS; i++)
1093  ceps_0[i] = features[i];
1094 
1095  st->memid++;
1096  for (int i = 0; i < NB_DELTA_CEPS; i++) {
1097  features[i] = ceps_0[i] + ceps_1[i] + ceps_2[i];
1098  features[NB_BANDS+i] = ceps_0[i] - ceps_2[i];
1099  features[NB_BANDS+NB_DELTA_CEPS+i] = ceps_0[i] - 2*ceps_1[i] + ceps_2[i];
1100  }
1101  /* Spectral variability features. */
1102  if (st->memid == CEPS_MEM)
1103  st->memid = 0;
1104 
1105  for (int i = 0; i < CEPS_MEM; i++) {
1106  float mindist = 1e15f;
1107  for (int j = 0; j < CEPS_MEM; j++) {
1108  float dist = 0.f;
1109  for (int k = 0; k < NB_BANDS; k++) {
1110  float tmp;
1111 
1112  tmp = st->cepstral_mem[i][k] - st->cepstral_mem[j][k];
1113  dist += tmp*tmp;
1114  }
1115 
1116  if (j != i)
1117  mindist = FFMIN(mindist, dist);
1118  }
1119 
1120  spec_variability += mindist;
1121  }
1122 
1123  features[NB_BANDS+3*NB_DELTA_CEPS+1] = spec_variability/CEPS_MEM-2.1;
1124 
1125  return 0;
1126 }
1127 
1128 static void interp_band_gain(float *g, const float *bandE)
1129 {
1130  memset(g, 0, sizeof(*g) * FREQ_SIZE);
1131 
1132  for (int i = 0; i < NB_BANDS - 1; i++) {
1133  const int band_size = (eband5ms[i + 1] - eband5ms[i]) << FRAME_SIZE_SHIFT;
1134 
1135  for (int j = 0; j < band_size; j++) {
1136  float frac = (float)j / band_size;
1137 
1138  g[(eband5ms[i] << FRAME_SIZE_SHIFT) + j] = (1.f - frac) * bandE[i] + frac * bandE[i + 1];
1139  }
1140  }
1141 }
1142 
1143 static void pitch_filter(AVComplexFloat *X, const AVComplexFloat *P, const float *Ex, const float *Ep,
1144  const float *Exp, const float *g)
1145 {
1146  float newE[NB_BANDS];
1147  float r[NB_BANDS];
1148  float norm[NB_BANDS];
1149  float rf[FREQ_SIZE] = {0};
1150  float normf[FREQ_SIZE]={0};
1151 
1152  for (int i = 0; i < NB_BANDS; i++) {
1153  if (Exp[i]>g[i]) r[i] = 1;
1154  else r[i] = SQUARE(Exp[i])*(1-SQUARE(g[i]))/(.001 + SQUARE(g[i])*(1-SQUARE(Exp[i])));
1155  r[i] = sqrtf(av_clipf(r[i], 0, 1));
1156  r[i] *= sqrtf(Ex[i]/(1e-8+Ep[i]));
1157  }
1158  interp_band_gain(rf, r);
1159  for (int i = 0; i < FREQ_SIZE; i++) {
1160  X[i].re += rf[i]*P[i].re;
1161  X[i].im += rf[i]*P[i].im;
1162  }
1163  compute_band_energy(newE, X);
1164  for (int i = 0; i < NB_BANDS; i++) {
1165  norm[i] = sqrtf(Ex[i] / (1e-8+newE[i]));
1166  }
1167  interp_band_gain(normf, norm);
1168  for (int i = 0; i < FREQ_SIZE; i++) {
1169  X[i].re *= normf[i];
1170  X[i].im *= normf[i];
1171  }
1172 }
1173 
/*
 * Lookup table used by tansig_approx(): tanh(x) sampled at x = 0.04 * k
 * for k = 0..200 (x in [0, 8]), rounded to six decimals. Only the
 * non-negative half is stored; tansig_approx() restores the sign itself.
 */
static const float tansig_table[201] = {
    0.000000f, 0.039979f, 0.079830f, 0.119427f, 0.158649f,
    0.197375f, 0.235496f, 0.272905f, 0.309507f, 0.345214f,
    0.379949f, 0.413644f, 0.446244f, 0.477700f, 0.507977f,
    0.537050f, 0.564900f, 0.591519f, 0.616909f, 0.641077f,
    0.664037f, 0.685809f, 0.706419f, 0.725897f, 0.744277f,
    0.761594f, 0.777888f, 0.793199f, 0.807569f, 0.821040f,
    0.833655f, 0.845456f, 0.856485f, 0.866784f, 0.876393f,
    0.885352f, 0.893698f, 0.901468f, 0.908698f, 0.915420f,
    0.921669f, 0.927473f, 0.932862f, 0.937863f, 0.942503f,
    0.946806f, 0.950795f, 0.954492f, 0.957917f, 0.961090f,
    0.964028f, 0.966747f, 0.969265f, 0.971594f, 0.973749f,
    0.975743f, 0.977587f, 0.979293f, 0.980869f, 0.982327f,
    0.983675f, 0.984921f, 0.986072f, 0.987136f, 0.988119f,
    0.989027f, 0.989867f, 0.990642f, 0.991359f, 0.992020f,
    0.992631f, 0.993196f, 0.993718f, 0.994199f, 0.994644f,
    0.995055f, 0.995434f, 0.995784f, 0.996108f, 0.996407f,
    0.996682f, 0.996937f, 0.997172f, 0.997389f, 0.997590f,
    0.997775f, 0.997946f, 0.998104f, 0.998249f, 0.998384f,
    0.998508f, 0.998623f, 0.998728f, 0.998826f, 0.998916f,
    0.999000f, 0.999076f, 0.999147f, 0.999213f, 0.999273f,
    0.999329f, 0.999381f, 0.999428f, 0.999472f, 0.999513f,
    0.999550f, 0.999585f, 0.999617f, 0.999646f, 0.999673f,
    0.999699f, 0.999722f, 0.999743f, 0.999763f, 0.999781f,
    0.999798f, 0.999813f, 0.999828f, 0.999841f, 0.999853f,
    0.999865f, 0.999875f, 0.999885f, 0.999893f, 0.999902f,
    0.999909f, 0.999916f, 0.999923f, 0.999929f, 0.999934f,
    0.999939f, 0.999944f, 0.999948f, 0.999952f, 0.999956f,
    0.999959f, 0.999962f, 0.999965f, 0.999968f, 0.999970f,
    0.999973f, 0.999975f, 0.999977f, 0.999978f, 0.999980f,
    0.999982f, 0.999983f, 0.999984f, 0.999986f, 0.999987f,
    0.999988f, 0.999989f, 0.999990f, 0.999990f, 0.999991f,
    0.999992f, 0.999992f, 0.999993f, 0.999994f, 0.999994f,
    0.999994f, 0.999995f, 0.999995f, 0.999996f, 0.999996f,
    0.999996f, 0.999997f, 0.999997f, 0.999997f, 0.999997f,
    0.999997f, 0.999998f, 0.999998f, 0.999998f, 0.999998f,
    0.999998f, 0.999998f, 0.999999f, 0.999999f, 0.999999f,
    0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
    0.999999f, 0.999999f, 0.999999f, 0.999999f, 0.999999f,
    1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
    1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
    1.000000f,
};
1217 
1218 static inline float tansig_approx(float x)
1219 {
1220  float y, dy;
1221  float sign=1;
1222  int i;
1223 
1224  /* Tests are reversed to catch NaNs */
1225  if (!(x<8))
1226  return 1;
1227  if (!(x>-8))
1228  return -1;
1229  /* Another check in case of -ffast-math */
1230 
1231  if (isnan(x))
1232  return 0;
1233 
1234  if (x < 0) {
1235  x=-x;
1236  sign=-1;
1237  }
1238  i = (int)floor(.5f+25*x);
1239  x -= .04f*i;
1240  y = tansig_table[i];
1241  dy = 1-y*y;
1242  y = y + x*dy*(1 - y*x);
1243  return sign*y;
1244 }
1245 
/** Approximate the logistic sigmoid as 0.5 + 0.5 * tansig_approx(x / 2). */
static inline float sigmoid_approx(float x)
{
    const float half_tanh = .5f * tansig_approx(.5f * x);

    return .5f + half_tanh;
}
1250 
1251 static void compute_dense(const DenseLayer *layer, float *output, const float *input)
1252 {
1253  const int N = layer->nb_neurons, M = layer->nb_inputs, stride = N;
1254 
1255  for (int i = 0; i < N; i++) {
1256  /* Compute update gate. */
1257  float sum = layer->bias[i];
1258 
1259  for (int j = 0; j < M; j++)
1260  sum += layer->input_weights[j * stride + i] * input[j];
1261 
1262  output[i] = WEIGHTS_SCALE * sum;
1263  }
1264 
1265  if (layer->activation == ACTIVATION_SIGMOID) {
1266  for (int i = 0; i < N; i++)
1268  } else if (layer->activation == ACTIVATION_TANH) {
1269  for (int i = 0; i < N; i++)
1270  output[i] = tansig_approx(output[i]);
1271  } else if (layer->activation == ACTIVATION_RELU) {
1272  for (int i = 0; i < N; i++)
1273  output[i] = FFMAX(0, output[i]);
1274  } else {
1275  av_assert0(0);
1276  }
1277 }
1278 
1279 static void compute_gru(AudioRNNContext *s, const GRULayer *gru, float *state, const float *input)
1280 {
1281  LOCAL_ALIGNED_32(float, z, [MAX_NEURONS]);
1282  LOCAL_ALIGNED_32(float, r, [MAX_NEURONS]);
1283  LOCAL_ALIGNED_32(float, h, [MAX_NEURONS]);
1284  const int M = gru->nb_inputs;
1285  const int N = gru->nb_neurons;
1286  const int AN = FFALIGN(N, 4);
1287  const int AM = FFALIGN(M, 4);
1288  const int stride = 3 * AN, istride = 3 * AM;
1289 
1290  for (int i = 0; i < N; i++) {
1291  /* Compute update gate. */
1292  float sum = gru->bias[i];
1293 
1294  sum += s->fdsp->scalarproduct_float(gru->input_weights + i * istride, input, AM);
1295  sum += s->fdsp->scalarproduct_float(gru->recurrent_weights + i * stride, state, AN);
1296  z[i] = sigmoid_approx(WEIGHTS_SCALE * sum);
1297  }
1298 
1299  for (int i = 0; i < N; i++) {
1300  /* Compute reset gate. */
1301  float sum = gru->bias[N + i];
1302 
1303  sum += s->fdsp->scalarproduct_float(gru->input_weights + AM + i * istride, input, AM);
1304  sum += s->fdsp->scalarproduct_float(gru->recurrent_weights + AN + i * stride, state, AN);
1305  r[i] = sigmoid_approx(WEIGHTS_SCALE * sum);
1306  }
1307 
1308  for (int i = 0; i < N; i++) {
1309  /* Compute output. */
1310  float sum = gru->bias[2 * N + i];
1311 
1312  sum += s->fdsp->scalarproduct_float(gru->input_weights + 2 * AM + i * istride, input, AM);
1313  for (int j = 0; j < N; j++)
1314  sum += gru->recurrent_weights[2 * AN + i * stride + j] * state[j] * r[j];
1315 
1316  if (gru->activation == ACTIVATION_SIGMOID)
1317  sum = sigmoid_approx(WEIGHTS_SCALE * sum);
1318  else if (gru->activation == ACTIVATION_TANH)
1319  sum = tansig_approx(WEIGHTS_SCALE * sum);
1320  else if (gru->activation == ACTIVATION_RELU)
1321  sum = FFMAX(0, WEIGHTS_SCALE * sum);
1322  else
1323  av_assert0(0);
1324  h[i] = z[i] * state[i] + (1.f - z[i]) * sum;
1325  }
1326 
1327  RNN_COPY(state, h, N);
1328 }
1329 
1330 #define INPUT_SIZE 42
1331 
1332 static void compute_rnn(AudioRNNContext *s, RNNState *rnn, float *gains, float *vad, const float *input)
1333 {
1334  LOCAL_ALIGNED_32(float, dense_out, [MAX_NEURONS]);
1335  LOCAL_ALIGNED_32(float, noise_input, [MAX_NEURONS * 3]);
1336  LOCAL_ALIGNED_32(float, denoise_input, [MAX_NEURONS * 3]);
1337 
1338  compute_dense(rnn->model->input_dense, dense_out, input);
1339  compute_gru(s, rnn->model->vad_gru, rnn->vad_gru_state, dense_out);
1340  compute_dense(rnn->model->vad_output, vad, rnn->vad_gru_state);
1341 
1342  memcpy(noise_input, dense_out, rnn->model->input_dense_size * sizeof(float));
1343  memcpy(noise_input + rnn->model->input_dense_size,
1344  rnn->vad_gru_state, rnn->model->vad_gru_size * sizeof(float));
1345  memcpy(noise_input + rnn->model->input_dense_size + rnn->model->vad_gru_size,
1346  input, INPUT_SIZE * sizeof(float));
1347 
1348  compute_gru(s, rnn->model->noise_gru, rnn->noise_gru_state, noise_input);
1349 
1350  memcpy(denoise_input, rnn->vad_gru_state, rnn->model->vad_gru_size * sizeof(float));
1351  memcpy(denoise_input + rnn->model->vad_gru_size,
1352  rnn->noise_gru_state, rnn->model->noise_gru_size * sizeof(float));
1353  memcpy(denoise_input + rnn->model->vad_gru_size + rnn->model->noise_gru_size,
1354  input, INPUT_SIZE * sizeof(float));
1355 
1356  compute_gru(s, rnn->model->denoise_gru, rnn->denoise_gru_state, denoise_input);
1358 }
1359 
1360 static float rnnoise_channel(AudioRNNContext *s, DenoiseState *st, float *out, const float *in,
1361  int disabled)
1362 {
1365  float x[FRAME_SIZE];
1366  float Ex[NB_BANDS], Ep[NB_BANDS];
1367  LOCAL_ALIGNED_32(float, Exp, [NB_BANDS]);
1368  float features[NB_FEATURES];
1369  float g[NB_BANDS];
1370  float gf[FREQ_SIZE];
1371  float vad_prob = 0;
1372  float *history = st->history;
1373  static const float a_hp[2] = {-1.99599, 0.99600};
1374  static const float b_hp[2] = {-2, 1};
1375  int silence;
1376 
1377  biquad(x, st->mem_hp_x, in, b_hp, a_hp, FRAME_SIZE);
1378  silence = compute_frame_features(s, st, X, P, Ex, Ep, Exp, features, x);
1379 
1380  if (!silence && !disabled) {
1381  compute_rnn(s, &st->rnn[0], g, &vad_prob, features);
1382  pitch_filter(X, P, Ex, Ep, Exp, g);
1383  for (int i = 0; i < NB_BANDS; i++) {
1384  float alpha = .6f;
1385 
1386  g[i] = FFMAX(g[i], alpha * st->lastg[i]);
1387  st->lastg[i] = g[i];
1388  }
1389 
1390  interp_band_gain(gf, g);
1391 
1392  for (int i = 0; i < FREQ_SIZE; i++) {
1393  X[i].re *= gf[i];
1394  X[i].im *= gf[i];
1395  }
1396  }
1397 
1398  frame_synthesis(s, st, out, X);
1399  memcpy(history, in, FRAME_SIZE * sizeof(*history));
1400 
1401  return vad_prob;
1402 }
1403 
/* Frame pair handed to the rnnoise_channels() slice-threading callback. */
typedef struct ThreadData {
    AVFrame *in, *out; /* frame to read samples from / frame to write them to */
} ThreadData;
1407 
1408 static int rnnoise_channels(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
1409 {
1410  AudioRNNContext *s = ctx->priv;
1411  ThreadData *td = arg;
1412  AVFrame *in = td->in;
1413  AVFrame *out = td->out;
1414  const int start = (out->ch_layout.nb_channels * jobnr) / nb_jobs;
1415  const int end = (out->ch_layout.nb_channels * (jobnr+1)) / nb_jobs;
1416 
1417  for (int ch = start; ch < end; ch++) {
1418  rnnoise_channel(s, &s->st[ch],
1419  (float *)out->extended_data[ch],
1420  (const float *)in->extended_data[ch],
1421  ctx->is_disabled);
1422  }
1423 
1424  return 0;
1425 }
1426 
1428 {
1429  AVFilterContext *ctx = inlink->dst;
1430  AVFilterLink *outlink = ctx->outputs[0];
1431  AVFrame *out = NULL;
1432  ThreadData td;
1433 
1434  out = ff_get_audio_buffer(outlink, FRAME_SIZE);
1435  if (!out) {
1436  av_frame_free(&in);
1437  return AVERROR(ENOMEM);
1438  }
1439  out->pts = in->pts;
1440 
1441  td.in = in; td.out = out;
1444 
1445  av_frame_free(&in);
1446  return ff_filter_frame(outlink, out);
1447 }
1448 
1450 {
1451  AVFilterLink *inlink = ctx->inputs[0];
1452  AVFilterLink *outlink = ctx->outputs[0];
1453  AVFrame *in = NULL;
1454  int ret;
1455 
1457 
1459  if (ret < 0)
1460  return ret;
1461 
1462  if (ret > 0)
1463  return filter_frame(inlink, in);
1464 
1465  FF_FILTER_FORWARD_STATUS(inlink, outlink);
1466  FF_FILTER_FORWARD_WANTED(outlink, inlink);
1467 
1468  return FFERROR_NOT_READY;
1469 }
1470 
1472 {
1473  AudioRNNContext *s = ctx->priv;
1474  int ret;
1475  FILE *f;
1476 
1477  if (!s->model_name)
1478  return AVERROR(EINVAL);
1479  f = avpriv_fopen_utf8(s->model_name, "r");
1480  if (!f) {
1481  av_log(ctx, AV_LOG_ERROR, "Failed to open model file: %s\n", s->model_name);
1482  return AVERROR(EINVAL);
1483  }
1484 
1485  ret = rnnoise_model_from_file(f, model);
1486  fclose(f);
1487  if (!*model || ret < 0)
1488  return ret;
1489 
1490  return 0;
1491 }
1492 
1494 {
1495  AudioRNNContext *s = ctx->priv;
1496  int ret;
1497 
1498  s->fdsp = avpriv_float_dsp_alloc(0);
1499  if (!s->fdsp)
1500  return AVERROR(ENOMEM);
1501 
1502  ret = open_model(ctx, &s->model[0]);
1503  if (ret < 0)
1504  return ret;
1505 
1506  for (int i = 0; i < FRAME_SIZE; i++) {
1507  s->window[i] = sin(.5*M_PI*sin(.5*M_PI*(i+.5)/FRAME_SIZE) * sin(.5*M_PI*(i+.5)/FRAME_SIZE));
1508  s->window[WINDOW_SIZE - 1 - i] = s->window[i];
1509  }
1510 
1511  for (int i = 0; i < NB_BANDS; i++) {
1512  for (int j = 0; j < NB_BANDS; j++) {
1513  s->dct_table[j][i] = cosf((i + .5f) * j * M_PI / NB_BANDS);
1514  if (j == 0)
1515  s->dct_table[j][i] *= sqrtf(.5);
1516  }
1517  }
1518 
1519  return 0;
1520 }
1521 
1522 static void free_model(AVFilterContext *ctx, int n)
1523 {
1524  AudioRNNContext *s = ctx->priv;
1525 
1526  rnnoise_model_free(s->model[n]);
1527  s->model[n] = NULL;
1528 
1529  for (int ch = 0; ch < s->channels && s->st; ch++) {
1530  av_freep(&s->st[ch].rnn[n].vad_gru_state);
1531  av_freep(&s->st[ch].rnn[n].noise_gru_state);
1532  av_freep(&s->st[ch].rnn[n].denoise_gru_state);
1533  }
1534 }
1535 
1536 static int process_command(AVFilterContext *ctx, const char *cmd, const char *args,
1537  char *res, int res_len, int flags)
1538 {
1539  AudioRNNContext *s = ctx->priv;
1540  int ret;
1541 
1542  ret = ff_filter_process_command(ctx, cmd, args, res, res_len, flags);
1543  if (ret < 0)
1544  return ret;
1545 
1546  ret = open_model(ctx, &s->model[1]);
1547  if (ret < 0)
1548  return ret;
1549 
1550  FFSWAP(RNNModel *, s->model[0], s->model[1]);
1551  for (int ch = 0; ch < s->channels; ch++)
1552  FFSWAP(RNNState, s->st[ch].rnn[0], s->st[ch].rnn[1]);
1553 
1554  ret = config_input(ctx->inputs[0]);
1555  if (ret < 0) {
1556  for (int ch = 0; ch < s->channels; ch++)
1557  FFSWAP(RNNState, s->st[ch].rnn[0], s->st[ch].rnn[1]);
1558  FFSWAP(RNNModel *, s->model[0], s->model[1]);
1559  return ret;
1560  }
1561 
1562  free_model(ctx, 1);
1563  return 0;
1564 }
1565 
1567 {
1568  AudioRNNContext *s = ctx->priv;
1569 
1570  av_freep(&s->fdsp);
1571  free_model(ctx, 0);
1572  for (int ch = 0; ch < s->channels && s->st; ch++) {
1573  av_tx_uninit(&s->st[ch].tx);
1574  av_tx_uninit(&s->st[ch].txi);
1575  }
1576  av_freep(&s->st);
1577 }
1578 
/* Single audio input pad; config_input() runs when the link is configured. */
static const AVFilterPad inputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_AUDIO,
        .config_props = config_input,
    },
};
1586 
/* Single audio output pad. */
static const AVFilterPad outputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_AUDIO,
    },
};
1593 
1594 #define OFFSET(x) offsetof(AudioRNNContext, x)
1595 #define AF AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_RUNTIME_PARAM
1596 
/*
 * User options. "model"/"m" are two names for the same string field;
 * "mix" is a float in [-1, 1] defaulting to 1. All options carry the
 * runtime flag (see AF), so they can be changed through process_command().
 */
static const AVOption arnndn_options[] = {
    { "model", "set model name", OFFSET(model_name), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, AF },
    { "m", "set model name", OFFSET(model_name), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, AF },
    { "mix", "set output vs input mix", OFFSET(mix), AV_OPT_TYPE_FLOAT, {.dbl=1.0},-1, 1, AF },
    { NULL }
};
1603 
1604 AVFILTER_DEFINE_CLASS(arnndn);
1605 
1607  .name = "arnndn",
1608  .description = NULL_IF_CONFIG_SMALL("Reduce noise from speech using Recurrent Neural Networks."),
1609  .priv_size = sizeof(AudioRNNContext),
1610  .priv_class = &arnndn_class,
1611  .activate = activate,
1612  .init = init,
1613  .uninit = uninit,
1619  .process_command = process_command,
1620 };
error
static void error(const char *err)
Definition: target_bsf_fuzzer.c:31
M
#define M(a, b)
Definition: vp3dsp.c:48
compute_dense
static void compute_dense(const DenseLayer *layer, float *output, const float *input)
Definition: af_arnndn.c:1251
ff_get_audio_buffer
AVFrame * ff_get_audio_buffer(AVFilterLink *link, int nb_samples)
Request an audio samples buffer with a specific set of permissions.
Definition: audio.c:100
AV_SAMPLE_FMT_FLTP
@ AV_SAMPLE_FMT_FLTP
float, planar
Definition: samplefmt.h:66
PITCH_MAX_PERIOD
#define PITCH_MAX_PERIOD
Definition: af_arnndn.c:51
td
#define td
Definition: regdef.h:70
pitch_downsample
static void pitch_downsample(float *x[], float *x_lp, int len, int C)
Definition: af_arnndn.c:741
WEIGHTS_SCALE
#define WEIGHTS_SCALE
Definition: af_arnndn.c:64
mix
static int mix(int c0, int c1)
Definition: 4xm.c:717
DenoiseState::synthesis_mem
float synthesis_mem[FRAME_SIZE]
Definition: af_arnndn.c:122
r
const char * r
Definition: vf_curves.c:126
AVERROR
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
opt.h
activate
static int activate(AVFilterContext *ctx)
Definition: af_arnndn.c:1449
mem_internal.h
GRULayer::activation
int activation
Definition: af_arnndn.c:88
out
FILE * out
Definition: movenc.c:54
dual_inner_prod
static void dual_inner_prod(const float *x, const float *y01, const float *y02, int N, float *xy1, float *xy2)
Definition: af_arnndn.c:783
ff_filter_frame
int ff_filter_frame(AVFilterLink *link, AVFrame *frame)
Send a frame of data to the next filter.
Definition: avfilter.c:1009
sample_fmts
static enum AVSampleFormat sample_fmts[]
Definition: adpcmenc.c:947
FFERROR_NOT_READY
return FFERROR_NOT_READY
Definition: filter_design.txt:204
FREE_GRU
#define FREE_GRU(name)
AVTXContext
Definition: tx_priv.h:228
output
filter_frame For filters that do not use the this method is called when a frame is pushed to the filter s input It can be called at any time except in a reentrant way If the input frame is enough to produce output
Definition: filter_design.txt:225
inlink
The exact code depends on how similar the blocks are and how related they are to the and needs to apply these operations to the correct inlink or outlink if there are several Macros are available to factor that when no extra processing is inlink
Definition: filter_design.txt:212
ff_set_common_samplerates_from_list
int ff_set_common_samplerates_from_list(AVFilterContext *ctx, const int *samplerates)
Equivalent to ff_set_common_samplerates(ctx, ff_make_format_list(samplerates))
Definition: formats.c:733
PITCH_MIN_PERIOD
#define PITCH_MIN_PERIOD
Definition: af_arnndn.c:50
av_frame_free
void av_frame_free(AVFrame **frame)
Free the frame and any dynamically allocated objects in it, e.g.
Definition: frame.c:116
GRULayer::nb_neurons
int nb_neurons
Definition: af_arnndn.c:87
RNNState::noise_gru_state
float * noise_gru_state
Definition: af_arnndn.c:113
uninit
static av_cold void uninit(AVFilterContext *ctx)
Definition: af_arnndn.c:1566
inverse_transform
static void inverse_transform(DenoiseState *st, float *out, const AVComplexFloat *in)
Definition: af_arnndn.c:424
im
float im
Definition: fft.c:79
AVFrame
This structure describes decoded (raw) audio or video data.
Definition: frame.h:325
tmp
static uint8_t tmp[11]
Definition: aes_ctr.c:28
AVFrame::pts
int64_t pts
Presentation timestamp in time_base units (time when frame should be shown to user).
Definition: frame.h:432
DenoiseState::lastg
float lastg[NB_BANDS]
Definition: af_arnndn.c:128
AVOption
AVOption.
Definition: opt.h:251
OFFSET
#define OFFSET(x)
Definition: af_arnndn.c:1594
b
#define b
Definition: input.c:41
arnndn_options
static const AVOption arnndn_options[]
Definition: af_arnndn.c:1597
FILTER_QUERY_FUNC
#define FILTER_QUERY_FUNC(func)
Definition: internal.h:167
frame_synthesis
static void frame_synthesis(AudioRNNContext *s, DenoiseState *st, float *out, const AVComplexFloat *y)
Definition: af_arnndn.c:509
NB_DELTA_CEPS
#define NB_DELTA_CEPS
Definition: af_arnndn.c:60
RNNModel::input_dense_size
int input_dense_size
Definition: af_arnndn.c:92
AVComplexFloat
Definition: tx.h:27
FFMAX
#define FFMAX(a, b)
Definition: macros.h:47
AVFilter::name
const char * name
Filter name.
Definition: avfilter.h:175
c1
static const uint64_t c1
Definition: murmur3.c:51
ThreadData::out
AVFrame * out
Definition: af_adeclick.c:473
AVChannelLayout::nb_channels
int nb_channels
Number of channels in this layout.
Definition: channel_layout.h:311
ThreadData::in
AVFrame * in
Definition: af_adecorrelate.c:154
tansig_table
static const float tansig_table[201]
Definition: af_arnndn.c:1174
find_best_pitch
static void find_best_pitch(float *xcorr, float *y, int len, int max_pitch, int *best_pitch)
Definition: af_arnndn.c:906
FF_FILTER_FORWARD_STATUS_BACK
#define FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink)
Forward the status on an output link to an input link.
Definition: filters.h:199
process_command
static int process_command(AVFilterContext *ctx, const char *cmd, const char *args, char *res, int res_len, int flags)
Definition: af_arnndn.c:1536
av_tx_init
av_cold int av_tx_init(AVTXContext **ctx, av_tx_fn *tx, enum AVTXType type, int inv, int len, const void *scale, uint64_t flags)
Initialize a transform context with the given configuration (i)MDCTs with an odd length are currently...
Definition: tx.c:883
DenoiseState::memid
int memid
Definition: af_arnndn.c:121
RNN_CLEAR
#define RNN_CLEAR(dst, n)
Definition: af_arnndn.c:406
GRULayer::nb_inputs
int nb_inputs
Definition: af_arnndn.c:86
compute_band_energy
static void compute_band_energy(float *bandE, const AVComplexFloat *X)
Definition: af_arnndn.c:447
formats.h
compute_rnn
static void compute_rnn(AudioRNNContext *s, RNNState *rnn, float *gains, float *vad, const float *input)
Definition: af_arnndn.c:1332
DenoiseState::txi
AVTXContext * txi
Definition: af_arnndn.c:131
X
@ X
Definition: vf_addroi.c:26
free_model
static void free_model(AVFilterContext *ctx, int n)
Definition: af_arnndn.c:1522
RNNState::denoise_gru_state
float * denoise_gru_state
Definition: af_arnndn.c:114
rnnoise_channels
static int rnnoise_channels(AVFilterContext *ctx, void *arg, int jobnr, int nb_jobs)
Definition: af_arnndn.c:1408
ACTIVATION_RELU
#define ACTIVATION_RELU
Definition: af_arnndn.c:70
AVComplexFloat::im
float im
Definition: tx.h:28
DenoiseState::mem_hp_x
float mem_hp_x[2]
Definition: af_arnndn.c:127
window
static SDL_Window * window
Definition: ffplay.c:365
cosf
#define cosf(x)
Definition: libm.h:78
log10f
#define log10f(x)
Definition: libm.h:414
AudioRNNContext::model
RNNModel * model[2]
Definition: af_arnndn.c:147
rnnoise_model_free
static void rnnoise_model_free(RNNModel *model)
Definition: af_arnndn.c:156
AudioRNNContext::st
DenoiseState * st
Definition: af_arnndn.c:142
DenoiseState::cepstral_mem
float cepstral_mem[CEPS_MEM][NB_BANDS]
Definition: af_arnndn.c:120
SQUARE
#define SQUARE(x)
Definition: af_arnndn.c:55
AF
#define AF
Definition: af_arnndn.c:1595
DenseLayer::bias
const float * bias
Definition: af_arnndn.c:75
AVFilterPad
A filter pad used for either input or output.
Definition: internal.h:49
FREQ_SIZE
#define FREQ_SIZE
Definition: af_arnndn.c:48
T
#define T(x)
Definition: vpx_arith.h:29
compute_band_corr
static void compute_band_corr(float *bandE, const AVComplexFloat *X, const AVComplexFloat *P)
Definition: af_arnndn.c:472
DenoiseState::history
float history[FRAME_SIZE]
Definition: af_arnndn.c:129
C
s EdgeDetect Foobar g libavfilter vf_edgedetect c libavfilter vf_foobar c edit libavfilter and add an entry for foobar following the pattern of the other filters edit libavfilter allfilters and add an entry for foobar following the pattern of the other filters configure make j< whatever > ffmpeg ffmpeg i you should get a foobar png with Lena edge detected That s your new playground is ready Some little details about what s going which in turn will define variables for the build system and the C
Definition: writing_filters.txt:58
avassert.h
AV_LOG_ERROR
#define AV_LOG_ERROR
Something went wrong and cannot losslessly be recovered.
Definition: log.h:180
av_cold
#define av_cold
Definition: attributes.h:90
av_tx_fn
void(* av_tx_fn)(AVTXContext *s, void *out, void *in, ptrdiff_t stride)
Function pointer to a function to perform the transform.
Definition: tx.h:127
float
float
Definition: af_crystalizer.c:122
MAX_NEURONS
#define MAX_NEURONS
Definition: af_arnndn.c:66
s
#define s(width, name)
Definition: cbs_vp9.c:256
frame_analysis
static void frame_analysis(AudioRNNContext *s, DenoiseState *st, AVComplexFloat *X, float *Ex, const float *in)
Definition: af_arnndn.c:497
DenseLayer::nb_inputs
int nb_inputs
Definition: af_arnndn.c:77
CEPS_MEM
#define CEPS_MEM
Definition: af_arnndn.c:59
floor
static __device__ float floor(float a)
Definition: cuda_runtime.h:173
inputs
static const AVFilterPad inputs[]
Definition: af_arnndn.c:1579
g
const char * g
Definition: vf_curves.c:127
celt_inner_prod
static float celt_inner_prod(const float *x, const float *y, int N)
Definition: af_arnndn.c:595
AVMEDIA_TYPE_AUDIO
@ AVMEDIA_TYPE_AUDIO
Definition: avutil.h:202
ff_set_common_formats_from_list
int ff_set_common_formats_from_list(AVFilterContext *ctx, const int *fmts)
Equivalent to ff_set_common_formats(ctx, ff_make_format_list(fmts))
Definition: formats.c:755
av_assert0
#define av_assert0(cond)
assert() equivalent, that is always enabled.
Definition: avassert.h:37
filters.h
AV_TX_FLOAT_FFT
@ AV_TX_FLOAT_FFT
Standard complex to complex FFT with sample data type of AVComplexFloat, AVComplexDouble or AVComplex...
Definition: tx.h:47
ctx
AVFormatContext * ctx
Definition: movenc.c:48
RNNModel::vad_gru_size
int vad_gru_size
Definition: af_arnndn.c:95
xi
#define xi(width, name, var, range_min, range_max, subs,...)
Definition: cbs_h2645.c:402
rnnoise_model_from_file
static int rnnoise_model_from_file(FILE *f, RNNModel **rnn)
Definition: af_arnndn.c:186
ff_af_arnndn
const AVFilter ff_af_arnndn
Definition: af_arnndn.c:1606
config_input
static int config_input(AVFilterLink *inlink)
Definition: af_arnndn.c:348
FRAME_SIZE_SHIFT
#define FRAME_SIZE_SHIFT
Definition: af_arnndn.c:45
state
static struct @344 state
ACTIVATION_TANH
#define ACTIVATION_TANH
Definition: af_arnndn.c:68
FILTER_INPUTS
#define FILTER_INPUTS(array)
Definition: internal.h:190
file_open.h
E
#define E
Definition: avdct.c:32
arg
const char * arg
Definition: jacosubdec.c:67
FFABS
#define FFABS(a)
Absolute value, Note, INT_MIN / INT64_MIN result in undefined behavior as they are not representable ...
Definition: common.h:64
if
if(ret)
Definition: filter_design.txt:179
RNNModel::vad_gru
const GRULayer * vad_gru
Definition: af_arnndn.c:96
AVClass
Describe the class of an AVClass context structure.
Definition: log.h:66
ff_inlink_consume_samples
int ff_inlink_consume_samples(AVFilterLink *link, unsigned min, unsigned max, AVFrame **rframe)
Take samples from the link's FIFO and update the link's stats.
Definition: avfilter.c:1423
NULL
#define NULL
Definition: coverity.c:32
LOCAL_ALIGNED_32
#define LOCAL_ALIGNED_32(t, v,...)
Definition: mem_internal.h:137
sigmoid_approx
static float sigmoid_approx(float x)
Definition: af_arnndn.c:1246
RNNModel::denoise_gru_size
int denoise_gru_size
Definition: af_arnndn.c:101
RNNModel::vad_output
const DenseLayer * vad_output
Definition: af_arnndn.c:108
isnan
#define isnan(x)
Definition: libm.h:340
GRULayer::recurrent_weights
const float * recurrent_weights
Definition: af_arnndn.c:85
outputs
static const AVFilterPad outputs[]
Definition: af_arnndn.c:1587
FREE_DENSE
#define FREE_DENSE(name)
PITCH_BUF_SIZE
#define PITCH_BUF_SIZE
Definition: af_arnndn.c:53
sqrtf
static __device__ float sqrtf(float a)
Definition: cuda_runtime.h:184
PITCH_FRAME_SIZE
#define PITCH_FRAME_SIZE
Definition: af_arnndn.c:52
av_clipf
av_clipf
Definition: af_crystalizer.c:122
ff_set_common_all_channel_counts
int ff_set_common_all_channel_counts(AVFilterContext *ctx)
Equivalent to ff_set_common_channel_layouts(ctx, ff_all_channel_counts())
Definition: formats.c:721
RNNModel::input_dense
const DenseLayer * input_dense
Definition: af_arnndn.c:93
c
Undefined Behavior In the C some operations are like signed integer dereferencing freed accessing outside allocated Undefined Behavior must not occur in a C it is not safe even if the output of undefined operations is unused The unsafety may seem nit picking but Optimizing compilers have in fact optimized code on the assumption that no undefined Behavior occurs Optimizing code based on wrong assumptions can and has in some cases lead to effects beyond the output of computations The signed integer overflow problem in speed critical code Code which is highly optimized and works with signed integers sometimes has the problem that often the output of the computation does not c
Definition: undefined.txt:32
DenseLayer::input_weights
const float * input_weights
Definition: af_arnndn.c:76
float_dsp.h
biquad
static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N)
Definition: af_arnndn.c:391
DenoiseState::pitch_buf
float pitch_buf[PITCH_BUF_SIZE]
Definition: af_arnndn.c:123
f
f
Definition: af_crystalizer.c:122
INPUT_SIZE
#define INPUT_SIZE
Definition: af_arnndn.c:1330
NULL_IF_CONFIG_SMALL
#define NULL_IF_CONFIG_SMALL(x)
Return NULL if CONFIG_SMALL is true, otherwise the argument without modification.
Definition: internal.h:115
NB_BANDS
#define NB_BANDS
Definition: af_arnndn.c:57
P
#define P
shift
static int shift(int a, int b)
Definition: bonk.c:253
DenseLayer::nb_neurons
int nb_neurons
Definition: af_arnndn.c:78
AV_SAMPLE_FMT_NONE
@ AV_SAMPLE_FMT_NONE
Definition: samplefmt.h:56
celt_autocorr
static int celt_autocorr(const float *x, float *ac, const float *window, int overlap, int lag, int n)
Definition: af_arnndn.c:627
WINDOW_SIZE
#define WINDOW_SIZE
Definition: af_arnndn.c:47
AVComplexFloat::re
float re
Definition: tx.h:28
AudioRNNContext::mix
float mix
Definition: af_arnndn.c:139
AVFloatDSPContext
Definition: float_dsp.h:24
RNNModel::noise_gru_size
int noise_gru_size
Definition: af_arnndn.c:98
celt_lpc
static void celt_lpc(float *lpc, const float *ac, int p)
Definition: af_arnndn.c:665
ff_filter_process_command
int ff_filter_process_command(AVFilterContext *ctx, const char *cmd, const char *arg, char *res, int res_len, int flags)
Generic processing of user supplied commands that are set in the same way as the filter options.
Definition: avfilter.c:873
DenoiseState::rnn
RNNState rnn[2]
Definition: af_arnndn.c:130
a
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
Definition: undefined.txt:41
RNN_MOVE
#define RNN_MOVE(dst, src, n)
Definition: af_arnndn.c:405
offset
it s the only field you need to keep assuming you have a context There is some magic you don t need to care about around this just let it vf offset
Definition: writing_filters.txt:86
FF_FILTER_FORWARD_WANTED
FF_FILTER_FORWARD_WANTED(outlink, inlink)
N
#define N
Definition: af_mcompand.c:53
RNNModel::denoise_gru
const GRULayer * denoise_gru
Definition: af_arnndn.c:102
input
and forward the test the status of outputs and forward it to the corresponding return FFERROR_NOT_READY If the filters stores internally one or a few frame for some input
Definition: filter_design.txt:172
DenoiseState::last_gain
float last_gain
Definition: af_arnndn.c:125
M_PI
#define M_PI
Definition: mathematics.h:52
av_tx_uninit
av_cold void av_tx_uninit(AVTXContext **ctx)
Frees a context and sets *ctx to NULL, does nothing when *ctx == NULL.
Definition: tx.c:294
AudioRNNContext::channels
int channels
Definition: af_arnndn.c:141
DenoiseState::tx
AVTXContext * tx
Definition: af_arnndn.c:131
sample_rates
sample_rates
Definition: ffmpeg_filter.c:156
ACTIVATION_SIGMOID
#define ACTIVATION_SIGMOID
Definition: af_arnndn.c:69
AudioRNNContext::model_name
char * model_name
Definition: af_arnndn.c:138
AV_OPT_TYPE_FLOAT
@ AV_OPT_TYPE_FLOAT
Definition: opt.h:228
DECLARE_ALIGNED
#define DECLARE_ALIGNED(n, t, v)
Definition: mem.h:116
i
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:269
DenoiseState
Definition: af_arnndn.c:118
RNN_COPY
#define RNN_COPY(dst, src, n)
Definition: af_arnndn.c:407
AVFrame::extended_data
uint8_t ** extended_data
pointers to the data planes/channels.
Definition: frame.h:386
ff_filter_get_nb_threads
int ff_filter_get_nb_threads(AVFilterContext *ctx)
Get number of threads for current filter instance.
Definition: avfilter.c:793
AVSampleFormat
AVSampleFormat
Audio sample formats.
Definition: samplefmt.h:55
ThreadData
Used for passing data between threads.
Definition: dsddec.c:69
interp_band_gain
static void interp_band_gain(float *g, const float *bandE)
Definition: af_arnndn.c:1128
FFMIN
#define FFMIN(a, b)
Definition: macros.h:49
dct
static void dct(AudioRNNContext *s, float *out, const float *in)
Definition: af_arnndn.c:1010
AudioRNNContext
Definition: af_arnndn.c:135
FRAME_SIZE
#define FRAME_SIZE
Definition: af_arnndn.c:46
len
int len
Definition: vorbis_enc_data.h:426
AudioRNNContext::dct_table
float dct_table[FFALIGN(NB_BANDS, 4)][FFALIGN(NB_BANDS, 4)]
Definition: af_arnndn.c:145
AVFilterPad::name
const char * name
Pad name.
Definition: internal.h:55
avpriv_fopen_utf8
FILE * avpriv_fopen_utf8(const char *path, const char *mode)
Open a file using a UTF-8 filename.
Definition: file_open.c:159
av_calloc
void * av_calloc(size_t nmemb, size_t size)
Definition: mem.c:272
stride
#define stride
Definition: h264pred_template.c:537
AVFilter
Filter definition.
Definition: avfilter.h:171
open_model
static int open_model(AVFilterContext *ctx, RNNModel **model)
Definition: af_arnndn.c:1471
ret
ret
Definition: filter_design.txt:187
RNNModel
Definition: af_arnndn.c:91
FFSWAP
#define FFSWAP(type, a, b)
Definition: macros.h:52
compute_frame_features
static int compute_frame_features(AudioRNNContext *s, DenoiseState *st, AVComplexFloat *X, AVComplexFloat *P, float *Ex, float *Ep, float *Exp, float *features, const float *in)
Definition: af_arnndn.c:1020
DenseLayer
Definition: af_arnndn.c:74
GRULayer::input_weights
const float * input_weights
Definition: af_arnndn.c:84
AudioRNNContext::window
float window[WINDOW_SIZE]
Definition: af_arnndn.c:144
second_check
static const uint8_t second_check[16]
Definition: af_arnndn.c:802
remove_doubling
static float remove_doubling(float *x, int maxperiod, int minperiod, int N, int *T0_, int prev_period, float prev_gain)
Definition: af_arnndn.c:803
RNNModel::denoise_output_size
int denoise_output_size
Definition: af_arnndn.c:104
compute_pitch_gain
static float compute_pitch_gain(float xy, float xx, float yy)
Definition: af_arnndn.c:797
AVFILTER_DEFINE_CLASS
AVFILTER_DEFINE_CLASS(arnndn)
xcorr_kernel
static void xcorr_kernel(const float *x, const float *y, float sum[4], int len)
Definition: af_arnndn.c:526
RNNModel::vad_output_size
int vad_output_size
Definition: af_arnndn.c:107
pitch_search
static void pitch_search(const float *x_lp, float *y, int len, int max_pitch, int *pitch)
Definition: af_arnndn.c:953
pitch_filter
static void pitch_filter(AVComplexFloat *X, const AVComplexFloat *P, const float *Ex, const float *Ep, const float *Exp, const float *g)
Definition: af_arnndn.c:1143
avfilter.h
celt_pitch_xcorr
static void celt_pitch_xcorr(const float *x, const float *y, float *xcorr, int len, int max_pitch)
Definition: af_arnndn.c:606
RNNState::vad_gru_state
float * vad_gru_state
Definition: af_arnndn.c:112
INPUT_GRU
#define INPUT_GRU(name)
rnnoise_channel
static float rnnoise_channel(AudioRNNContext *s, DenoiseState *st, float *out, const float *in, int disabled)
Definition: af_arnndn.c:1360
celt_fir5
static void celt_fir5(const float *x, const float *num, float *y, int N, float *mem)
Definition: af_arnndn.c:698
filter_frame
static int filter_frame(AVFilterLink *inlink, AVFrame *in)
Definition: af_arnndn.c:1427
AVFilterContext
An instance of a filter.
Definition: avfilter.h:415
DenoiseState::pitch_enh_buf
float pitch_enh_buf[PITCH_BUF_SIZE]
Definition: af_arnndn.c:124
AVFILTER_FLAG_SLICE_THREADS
#define AVFILTER_FLAG_SLICE_THREADS
The filter supports multithreading by splitting frames into multiple parts and processing them concurrently.
Definition: avfilter.h:127
tansig_approx
static float tansig_approx(float x)
Definition: af_arnndn.c:1218
AudioRNNContext::fdsp
AVFloatDSPContext * fdsp
Definition: af_arnndn.c:149
Q15ONE
#define Q15ONE
Definition: af_arnndn.c:72
DenoiseState::last_period
int last_period
Definition: af_arnndn.c:126
audio.h
DenoiseState::tx_fn
av_tx_fn tx_fn
Definition: af_arnndn.c:132
query_formats
static int query_formats(AVFilterContext *ctx)
Definition: af_arnndn.c:329
forward_transform
static void forward_transform(DenoiseState *st, AVComplexFloat *out, const float *in)
Definition: af_arnndn.c:409
av_free
#define av_free(p)
Definition: tableprint_vlc.h:33
FF_FILTER_FORWARD_STATUS
FF_FILTER_FORWARD_STATUS(inlink, outlink)
FFALIGN
#define FFALIGN(x, a)
Definition: macros.h:78
alpha
static const int16_t alpha[]
Definition: ilbcdata.h:55
FILTER_OUTPUTS
#define FILTER_OUTPUTS(array)
Definition: internal.h:191
av_freep
#define av_freep(p)
Definition: tableprint_vlc.h:34
src
INIT_CLIP pixel * src
Definition: h264pred_template.c:418
avpriv_float_dsp_alloc
av_cold AVFloatDSPContext * avpriv_float_dsp_alloc(int bit_exact)
Allocate a float DSP context.
Definition: float_dsp.c:135
DenoiseState::txi_fn
av_tx_fn txi_fn
Definition: af_arnndn.c:132
d
d
Definition: ffmpeg_filter.c:156
AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL
#define AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL
Same as AVFILTER_FLAG_SUPPORT_TIMELINE_GENERIC, except that the filter will have its filter_frame() callback(s) called as usual even when the enable expression is false.
Definition: avfilter.h:160
flags
#define flags(name, subs,...)
Definition: cbs_av1.c:561
DenseLayer::activation
int activation
Definition: af_arnndn.c:79
RNNModel::denoise_output
const DenseLayer * denoise_output
Definition: af_arnndn.c:105
av_log
#define av_log(a,...)
Definition: tableprint_vlc.h:27
AVERROR_INVALIDDATA
#define AVERROR_INVALIDDATA
Invalid data found when processing input.
Definition: error.h:61
h
h
Definition: vp9dsp_template.c:2038
RNNState
Definition: af_arnndn.c:111
ALLOC_LAYER
#define ALLOC_LAYER(type, name)
AV_OPT_TYPE_STRING
@ AV_OPT_TYPE_STRING
Definition: opt.h:229
GRULayer
Definition: af_arnndn.c:82
ff_filter_execute
static av_always_inline int ff_filter_execute(AVFilterContext *ctx, avfilter_action_func *func, void *arg, int *ret, int nb_jobs)
Definition: internal.h:142
int
int
Definition: ffmpeg_filter.c:156
compute_gru
static void compute_gru(AudioRNNContext *s, const GRULayer *gru, float *state, const float *input)
Definition: af_arnndn.c:1279
eband5ms
static const uint8_t eband5ms[]
Definition: af_arnndn.c:442
GRULayer::bias
const float * bias
Definition: af_arnndn.c:83
INPUT_DENSE
#define INPUT_DENSE(name)
RNNModel::noise_gru
const GRULayer * noise_gru
Definition: af_arnndn.c:99
NB_FEATURES
#define NB_FEATURES
Definition: af_arnndn.c:62
init
static av_cold int init(AVFilterContext *ctx)
Definition: af_arnndn.c:1493
tx.h
re
float re
Definition: fft.c:79
RNNState::model
RNNModel * model
Definition: af_arnndn.c:115
DenoiseState::analysis_mem
float analysis_mem[FRAME_SIZE]
Definition: af_arnndn.c:119