Go to the documentation of this file.
45 #define PSY_3GPP_THR_SPREAD_HI 1.5f // spreading factor for low-to-hi threshold spreading (15 dB/Bark)
46 #define PSY_3GPP_THR_SPREAD_LOW 3.0f // spreading factor for hi-to-low threshold spreading (30 dB/Bark)
48 #define PSY_3GPP_EN_SPREAD_HI_L1 2.0f
50 #define PSY_3GPP_EN_SPREAD_HI_L2 1.5f
52 #define PSY_3GPP_EN_SPREAD_HI_S 1.5f
54 #define PSY_3GPP_EN_SPREAD_LOW_L 3.0f
56 #define PSY_3GPP_EN_SPREAD_LOW_S 2.0f
58 #define PSY_3GPP_RPEMIN 0.01f
59 #define PSY_3GPP_RPELEV 2.0f
61 #define PSY_3GPP_C1 3.0f
62 #define PSY_3GPP_C2 1.3219281f
63 #define PSY_3GPP_C3 0.55935729f
65 #define PSY_SNR_1DB 7.9432821e-1f
66 #define PSY_SNR_25DB 3.1622776e-3f
68 #define PSY_3GPP_SAVE_SLOPE_L -0.46666667f
69 #define PSY_3GPP_SAVE_SLOPE_S -0.36363637f
70 #define PSY_3GPP_SAVE_ADD_L -0.84285712f
71 #define PSY_3GPP_SAVE_ADD_S -0.75f
72 #define PSY_3GPP_SPEND_SLOPE_L 0.66666669f
73 #define PSY_3GPP_SPEND_SLOPE_S 0.81818181f
74 #define PSY_3GPP_SPEND_ADD_L -0.35f
75 #define PSY_3GPP_SPEND_ADD_S -0.26111111f
76 #define PSY_3GPP_CLIP_LO_L 0.2f
77 #define PSY_3GPP_CLIP_LO_S 0.2f
78 #define PSY_3GPP_CLIP_HI_L 0.95f
79 #define PSY_3GPP_CLIP_HI_S 0.75f
81 #define PSY_3GPP_AH_THR_LONG 0.5f
82 #define PSY_3GPP_AH_THR_SHORT 0.63f
84 #define PSY_PE_FORGET_SLOPE 511
92 #define PSY_3GPP_BITS_TO_PE(bits) ((bits) * 1.18f)
93 #define PSY_3GPP_PE_TO_BITS(bits) ((bits) / 1.18f)
96 #define PSY_LAME_FIR_LEN 21
97 #define AAC_BLOCK_SIZE_LONG 1024
98 #define AAC_BLOCK_SIZE_SHORT 128
99 #define AAC_NUM_BLOCKS_SHORT 8
100 #define PSY_LAME_NUM_SUBBLOCKS 2
106 #define PSY_LAME_PE_GAP 12
107 #define PSY_LAME_PE_QUIET 0.4f
108 #define PSY_LAME_PE_RED 0.45f
241 -8.65163e-18 * 2, -0.00851586 * 2, -6.74764e-18 * 2, 0.0209036 * 2,
242 -3.36639e-17 * 2, -0.0438162 * 2, -1.54175e-17 * 2, 0.0931738 * 2,
243 -5.52212e-17 * 2, -0.313819 * 2
252 int lower_range = 12, upper_range = 12;
260 for (
i = 1;
i < 13;
i++) {
301 return 13.3f *
atanf(0.00076
f *
f) + 3.5f *
atanf((
f / 7500.0
f) * (
f / 7500.0
f));
312 return 3.64 * pow(
f, -0.8)
313 - 6.8 *
exp(-0.6 * (
f - 3.4) * (
f - 3.4))
314 + 6.0 *
exp(-0.15 * (
f - 8.7) * (
f - 8.7))
315 + (0.6 + 0.04 * add) * 0.001 *
f *
f *
f *
f;
322 float prev, minscale, minath, minsnr, pe_min;
326 const float num_bark =
calc_bark((
float)bandwidth);
332 if (!
ctx->model_priv_data)
334 pctx =
ctx->model_priv_data;
339 chan_bitrate = (int)(chan_bitrate / 120.0 * (
ctx->avctx->global_quality ?
ctx->avctx->global_quality : 120));
347 ctx->bitres.size -=
ctx->bitres.size % 8;
350 for (j = 0; j < 2; j++) {
352 const uint8_t *band_sizes =
ctx->bands[j];
353 float line_to_frequency =
ctx->avctx->sample_rate / (j ? 256.f : 2048.0f);
354 float avg_chan_bits = chan_bitrate * (j ? 128.0f : 1024.0f) /
ctx->avctx->sample_rate;
363 for (
g = 0;
g <
ctx->num_bands[j];
g++) {
366 coeffs[
g].
barks = (bark + prev) / 2.0;
369 for (
g = 0;
g <
ctx->num_bands[j] - 1;
g++) {
371 float bark_width = coeffs[
g+1].
barks - coeffs->
barks;
374 coeff->spread_low[1] =
ff_exp10(-bark_width * en_spread_low);
376 pe_min = bark_pe * bark_width;
377 minsnr =
exp2(pe_min / band_sizes[
g]) - 1.5f;
381 for (
g = 0;
g <
ctx->num_bands[j];
g++) {
382 minscale =
ath(start * line_to_frequency,
ATH_ADD);
383 for (
i = 1;
i < band_sizes[
g];
i++)
385 coeffs[
g].
ath = minscale - minath;
386 start += band_sizes[
g];
397 for (
i = 0;
i <
ctx->avctx->ch_layout.nb_channels;
i++)
422 0xB6, 0x6C, 0xD8, 0xB2, 0x66, 0xC6, 0x96, 0x36, 0x36
430 const int16_t *audio,
436 int attack_ratio = br <= 16000 ? 18 : 10;
439 uint8_t grouping = 0;
445 int switch_to_eight = 0;
446 float sum = 0.0, sum2 = 0.0;
449 for (
i = 0;
i < 8;
i++) {
450 for (j = 0; j < 128; j++) {
457 for (
i = 0;
i < 8;
i++) {
458 if (
s[
i] > pch->win_energy * attack_ratio) {
464 pch->win_energy = pch->win_energy*7/8 + sum2/64;
466 wi.window_type[1] = prev_type;
474 grouping = pch->next_grouping;
490 pch->next_window_seq = next_type;
492 for (
i = 0;
i < 3;
i++)
493 wi.window_type[
i] = prev_type;
504 for (
i = 0;
i < 8;
i++) {
505 if (!((grouping >>
i) & 1))
507 wi.grouping[lastgrp]++;
524 float clipped_pe, bit_save, bit_spend, bit_factor, fill_level, forgetful_min_pe;
528 fill_level =
av_clipf((
float)
ctx->fill_level /
size, clip_low, clip_high);
530 bit_save = (fill_level + bitsave_add) * bitsave_slope;
531 assert(bit_save <= 0.3f && bit_save >= -0.05000001
f);
532 bit_spend = (fill_level + bitspend_add) * bitspend_slope;
533 assert(bit_spend <= 0.5f && bit_spend >= -0.1
f);
540 bit_factor = 1.0f - bit_save + ((bit_spend - bit_save) / (
ctx->pe.max -
ctx->pe.min)) * (clipped_pe -
ctx->pe.min);
548 ctx->pe.min =
FFMIN(pe, forgetful_min_pe);
554 ctx->frame_bits * bit_factor,
584 float thr_avg, reduction;
586 if(active_lines == 0.0)
589 thr_avg =
exp2f((
a - pe) / (4.0
f * active_lines));
590 reduction =
exp2f((
a - desired_pe) / (4.0
f * active_lines)) - thr_avg;
592 return FFMAX(reduction, 0.0
f);
598 float thr = band->
thr;
602 thr =
sqrtf(thr) + reduction;
621 const uint8_t *band_sizes,
const float *coefs,
const int cutoff)
624 int start = 0, wstart = 0;
627 for (
g = 0;
g < num_bands;
g++) {
630 float form_factor = 0.0f;
633 if (wstart < cutoff) {
634 for (
i = 0;
i < band_sizes[
g];
i++) {
635 band->
energy += coefs[start+
i] * coefs[start+
i];
643 start += band_sizes[
g];
644 wstart += band_sizes[
g];
662 hpfsmpl[
i] = (sum1 + sum2) * 32768.0
f;
675 float desired_bits, desired_pe, delta_pe, reduction=
NAN, spread_en[128] = {0};
676 float a = 0.0f, active_lines = 0.0f, norm_fac = 0.0f;
677 float pe = pctx->chan_bitrate > 32000 ? 0.0f :
FFMAX(50.0
f, 100.0
f - pctx->chan_bitrate * 100.0f / 32000.0f);
678 const int num_bands =
ctx->num_bands[wi->num_windows == 8];
679 const uint8_t *band_sizes =
ctx->bands[wi->num_windows == 8];
680 AacPsyCoeffs *coeffs = pctx->psy_coef[wi->num_windows == 8];
683 const int cutoff = bandwidth * 2048 / wi->num_windows /
ctx->avctx->sample_rate;
686 calc_thr_3gpp(wi, num_bands, pch, band_sizes, coefs, cutoff);
689 for (
w = 0;
w < wi->num_windows*16;
w += 16) {
693 spread_en[0] =
bands[0].energy;
694 for (
g = 1;
g < num_bands;
g++) {
696 spread_en[
w+
g] =
FFMAX(
bands[
g].energy, spread_en[
w+
g-1] * coeffs[
g].spread_hi[1]);
698 for (
g = num_bands - 2;
g >= 0;
g--) {
700 spread_en[
w+
g] =
FFMAX(spread_en[
w+
g], spread_en[
w+
g+1] * coeffs[
g].spread_low[1]);
703 for (
g = 0;
g < num_bands;
g++) {
718 if (spread_en[
w+
g] * avoid_hole_thr > band->
energy || coeffs[
g].min_snr > 1.0f)
731 desired_pe = pe * (
ctx->avctx->global_quality ?
ctx->avctx->global_quality : 120) / (2 * 2.5
f * 120.0
f);
736 if (
ctx->bitres.bits > 0) {
741 pctx->pe.max =
FFMAX(pe, pctx->pe.max);
742 pctx->pe.min =
FFMIN(pe, pctx->pe.min);
751 if (
ctx->bitres.bits > 0)
756 ctx->bitres.alloc = desired_bits;
758 if (desired_pe < pe) {
760 for (
w = 0;
w < wi->num_windows*16;
w += 16) {
765 for (
g = 0;
g < num_bands;
g++) {
777 for (
i = 0;
i < 2;
i++) {
778 float pe_no_ah = 0.0f, desired_pe_no_ah;
779 active_lines =
a = 0.0f;
780 for (
w = 0;
w < wi->num_windows*16;
w += 16) {
781 for (
g = 0;
g < num_bands;
g++) {
785 pe_no_ah += band->
pe;
791 desired_pe_no_ah =
FFMAX(desired_pe - (pe - pe_no_ah), 0.0
f);
792 if (active_lines > 0.0
f)
796 for (
w = 0;
w < wi->num_windows*16;
w += 16) {
797 for (
g = 0;
g < num_bands;
g++) {
800 if (active_lines > 0.0
f)
803 if (band->
thr > 0.0f)
810 delta_pe = desired_pe - pe;
811 if (
fabs(delta_pe) > 0.05
f * desired_pe)
815 if (pe < 1.15
f * desired_pe) {
817 norm_fac = norm_fac ? 1.0f / norm_fac : 0;
818 for (
w = 0;
w < wi->num_windows*16;
w += 16) {
819 for (
g = 0;
g < num_bands;
g++) {
823 float delta_sfb_pe = band->
norm_fac * norm_fac * delta_pe;
824 float thr = band->
thr;
836 while (pe > desired_pe &&
g--) {
837 for (
w = 0;
w < wi->num_windows*16;
w+= 16) {
850 for (
w = 0;
w < wi->num_windows*16;
w += 16) {
851 for (
g = 0;
g < num_bands;
g++) {
862 memcpy(pch->prev_band, pch->band,
sizeof(pch->band));
890 for (ch = 0; ch < group->
num_ch; ch++) {
925 ctx->next_window_seq = blocktype;
929 const float *la,
int channel,
int prev_type)
934 int uselongblock = 1;
941 const float *pf = hpfsmpl;
956 energy_short[0] += energy_subshort[
i];
962 for (; pf < pfe; pf++)
979 float frame_peak = 1.0f;
981 frame_peak =
FFMAX(frame_peak, energy_subshort[
i]);
984 float thr = pch->attack_threshold;
989 if (attack_intensity[
i] > thr)
999 const float u = energy_short[
i - 1];
1000 const float v = energy_short[
i];
1001 const float m =
FFMAX(
u, v);
1003 if (
u < 2.3
f * v && v < 2.3
f *
u) {
1004 if (
i == 1 && attacks[0] < attacks[
i])
1009 att_sum += attacks[
i];
1012 if (pch->next_attack0_zero)
1016 if (attacks[0] <= pch->prev_attack)
1019 att_sum += attacks[0];
1027 if (attacks[
i] && attacks[
i-1])
1031 pch->frames_since_short = uselongblock ? pch->frames_since_short + 1 : 0;
1054 for (
i = 0;
i < 8;
i++) {
1055 if (!((pch->next_grouping >>
i) & 1))
1067 for (
i = 0;
i < 9;
i++) {
1082 .
name =
"3GPP TS 26.403-inspired model",
#define PSY_LAME_PE_QUIET
pre-onset must be below this fraction of the frame peak
#define PSY_LAME_PE_RED
attack-threshold multiplier for a qualifying isolated onset
float spread_low[2]
spreading factor for high-to-low threshold spreading in long frame
static av_always_inline double ff_exp10(double x)
Compute 10^x for floating point values.
static av_cold int psy_3gpp_init(FFPsyContext *ctx)
Filter the word “frame” indicates either a video frame or a group of audio as stored in an AVFrame structure Format for each input and each output the list of supported formats For video that means pixel format For audio that means channel sample they are references to shared objects When the negotiation mechanism computes the intersection of the formats supported at each end of a all references to both lists are replaced with a reference to the intersection And when a single format is eventually chosen for a link amongst the remaining all references to the list are updated That means that if a filter requires that its input and output have the same format amongst a supported all it has to do is use a reference to the same list of formats query_formats can leave some formats unset and return AVERROR(EAGAIN) to cause the negotiation mechanism toagain later. That can be used by filters with complex requirements to use the format negotiated on one link to set the formats supported on another. Frame references ownership and permissions
static av_unused FFPsyWindowInfo psy_3gpp_window(FFPsyContext *ctx, const int16_t *audio, const int16_t *la, int channel, int prev_type)
Tell encoder which window types to use.
static float lame_calc_attack_threshold(int bitrate)
Calculate the ABR attack threshold from the above LAME psymodel table.
#define PSY_PE_FORGET_SLOPE
static FFPsyWindowInfo psy_lame_window(FFPsyContext *ctx, const float *audio, const float *la, int channel, int prev_type)
float thr
energy threshold
static void calc_thr_3gpp(const FFPsyWindowInfo *wi, const int num_bands, AacPsyChannel *pch, const uint8_t *band_sizes, const float *coefs, const int cutoff)
#define PSY_3GPP_PE_TO_BITS(bits)
#define AV_CODEC_FLAG_QSCALE
Use fixed qscale.
static av_cold float calc_bark(float f)
Calculate Bark value for given line.
float nz_lines
number of non-zero spectral lines
#define PSY_3GPP_CLIP_LO_S
#define u(width, name, range_min, range_max)
#define PSY_3GPP_AH_THR_LONG
int window_shape
window shape (sine/KBD/whatever)
static float calc_pe_3gpp(AacPsyBand *band)
float min
minimum allowed PE for bit factor calculation
#define PSY_3GPP_SPEND_SLOPE_L
#define PSY_3GPP_THR_SPREAD_HI
constants for 3GPP AAC psychoacoustic model
int fill_level
bit reservoir fill level
int nb_channels
Number of channels in this layout.
float spread_hi[2]
spreading factor for low-to-high threshold spreading in long frame
trying all byte sequences megabyte in length and selecting the best looking sequence will yield cases to try But a word about quality
static void lame_apply_block_type(AacPsyChannel *ctx, FFPsyWindowInfo *wi, int uselongblock)
psychoacoustic model frame type-dependent coefficients
int64_t rc_frame_num
frame the rewind state was saved for
AVChannelLayout ch_layout
Audio channel layout.
static av_cold void lame_window_init(AacPsyContext *ctx, AVCodecContext *avctx)
LAME psy model specific initialization.
float st_lrm
short threshold for L, R, and M channels
#define PSY_3GPP_EN_SPREAD_HI_S
#define PSY_LAME_PE_GAP
min consecutive long frames before the relaxation applies
#define PSY_3GPP_SPEND_ADD_L
int flags
AV_CODEC_FLAG_*.
float barks
Bark value for each spectral band in long frame.
float prev_energy_subshort[AAC_NUM_BLOCKS_SHORT *PSY_LAME_NUM_SUBBLOCKS]
static __device__ float fabsf(float a)
windowing related information
int64_t bit_rate
Total stream bitrate in bit/s, 0 if not available.
float previous
allowed PE of the previous frame
const FFPsyModel ff_aac_psy_model
uint8_t num_ch
number of channels in this group
int rc_first_ch
first channel analyzed in that frame
LAME psy model preset struct.
int64_t rc_frame_num
frame this channel last saved rewind state for
#define PSY_3GPP_CLIP_HI_S
information for single band used by 3GPP TS26.403-inspired psychoacoustic model
int global_quality
Global quality for codecs which cannot change it per frame.
int flags
Flags modifying the (de)muxer behaviour.
int quality
Quality to map the rest of the values to.
float pe_const
constant part of the PE calculation
3GPP TS26.403-inspired psychoacoustic model specific data
int next_attack0_zero
whether attack[0] of the next frame is zero
static AVFormatContext * ctx
static float calc_reduction_3gpp(float a, float desired_pe, float pe, float active_lines)
static const uint8_t window_grouping[9]
window grouping information stored as bits (0 - new group, 1 - group continues)
#define AAC_BLOCK_SIZE_SHORT
short block size
struct AacPsyContext::@41 pe
static const float bands[]
static av_cold float ath(float f, float add)
Calculate ATH value for given frequency.
static int calc_bit_demand(AacPsyContext *ctx, float pe, int bits, int size, int short_window)
#define PSY_3GPP_AH_THR_SHORT
static void psy_hp_filter(const float *firbuf, float *hpfsmpl, const float *psy_fir_coeffs)
static float iir_filter(int in, float state[2])
IIR filter used in block switching decision.
static const PsyLamePreset psy_vbr_map[]
LAME psy model preset table for constant quality.
int window_type[3]
window type (short/long/transitional, etc.) - current, previous and next
static __device__ float fabs(float a)
static const PsyLamePreset psy_abr_map[]
LAME psy model preset table for ABR.
int64_t bit_rate
the average bitrate
static av_cold void psy_3gpp_end(FFPsyContext *apc)
#define PSY_3GPP_BITS_TO_PE(bits)
single band psychoacoustic information
static __device__ float sqrtf(float a)
int grouping[8]
window grouping (for e.g. AAC)
float max
maximum allowed PE for bit factor calculation
float iir_state[2]
hi-pass IIR filter state
AacPsyCoeffs psy_coef[2][64]
float thr_quiet
threshold in quiet
#define AAC_BLOCK_SIZE_LONG
long block size
AacPsyBand band[128]
bands information
#define i(width, name, range_min, range_max)
static float calc_reduced_thr_3gpp(AacPsyBand *band, float min_snr, float reduction)
float ath
absolute threshold of hearing per bands
float active_lines
number of active spectral lines
#define AAC_NUM_BLOCKS_SHORT
number of blocks in a short sequence
#define PSY_LAME_FIR_LEN
LAME psy model FIR order.
AacPsyBand rc_prev_band[128]
prev_band as it was entering the frame
The reader does not expect b to be semantically here and if the code is changed by maybe adding a a division or other the signedness will almost certainly be mistaken To avoid this confusion a new type was SUINT is the C unsigned type but it holds a signed int to use the same example SUINT a
#define PSY_3GPP_CLIP_LO_L
int avoid_holes
hole avoidance flag
#define PSY_3GPP_THR_SPREAD_LOW
#define PSY_3GPP_SAVE_ADD_S
#define PSY_3GPP_SPEND_ADD_S
int frames_since_short
consecutive long frames (pre-echo-aware isolated-onset gate)
static const float psy_fir_coeffs[]
LAME psy model FIR coefficient table.
float attack_threshold
attack threshold for this channel
float norm_fac
normalization factor for linearization
#define PSY_3GPP_CLIP_HI_L
float pe
perceptual entropy
void * av_calloc(size_t nmemb, size_t size)
psychoacoustic information for an arbitrary group of channels
enum WindowSequence next_window_seq
window sequence to be used in the next frame
float win_energy
sliding average of channel energy
single/pair channel context for psychoacoustic model
float correction
PE correction factor.
void * model_priv_data
psychoacoustic model implementation private data
#define PSY_3GPP_SAVE_SLOPE_S
#define PSY_3GPP_EN_SPREAD_HI_L1
uint8_t next_grouping
stored grouping scheme for the next frame (in case of 8 short window sequence)
main external API structure.
#define PSY_LAME_NUM_SUBBLOCKS
Number of sub-blocks in each short block.
float global_quality
normalized global quality taken from avctx
static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int channel, const float *coefs, const FFPsyWindowInfo *wi)
Calculate band thresholds as suggested in 3GPP TS26.403.
codec-specific psychoacoustic model implementation
IDirect3DDxgiInterfaceAccess _COM_Outptr_ void ** p
int frame_bits
average bits per frame
FFPsyChannelGroup * ff_psy_find_group(FFPsyContext *ctx, int channel)
Determine what group a channel belongs to.
static void psy_3gpp_analyze(FFPsyContext *ctx, int channel, const float **coeffs, const FFPsyWindowInfo *wi)
#define PSY_3GPP_EN_SPREAD_LOW_L
int chan_bitrate
bitrate per channel
#define PSY_3GPP_SAVE_SLOPE_L
static const double coeff[2][5]
#define PSY_3GPP_SPEND_SLOPE_S
#define FF_QP2LAMBDA
factor to convert from H.263 QP to lambda
#define PSY_3GPP_EN_SPREAD_LOW_S
int prev_attack
attack value for the last short block in the previous sequence
context used by psychoacoustic model
AacPsyBand prev_band[128]
bands information from the previous frame
int num_windows
number of windows in a frame
#define PSY_3GPP_SAVE_ADD_L