#include <math.h>
#include "avcodec.h"
#include "get_bits.h"
#include "put_bits.h"
#include "wmavoice_data.h"
#include "celp_math.h"
#include "celp_filters.h"
#include "acelp_vectors.h"
#include "acelp_filters.h"
#include "lsp.h"
#include "libavutil/lzo.h"
#include "dct.h"
#include "rdft.h"
#include "sinewin.h"
Go to the source code of this file.
Data Structures | |
struct | frame_type_desc |
Description of frame types. More... | |
struct | WMAVoiceContext |
WMA Voice decoding context. More... | |
Defines | |
#define | MAX_BLOCKS 8 |
maximum number of blocks per frame | |
#define | MAX_LSPS 16 |
maximum filter order | |
#define | MAX_LSPS_ALIGN16 16 |
same as MAX_LSPS; needs to be multiple of 16 for ASM input buffer alignment | |
#define | MAX_FRAMES 3 |
maximum number of frames per superframe | |
#define | MAX_FRAMESIZE 160 |
maximum number of samples per frame | |
#define | MAX_SIGNAL_HISTORY 416 |
maximum excitation signal history | |
#define | MAX_SFRAMESIZE (MAX_FRAMESIZE * MAX_FRAMES) |
maximum number of samples per superframe | |
#define | SFRAME_CACHE_MAXSIZE 256 |
maximum cache size for frame data that was split over two packets | |
#define | VLC_NBITS 6 |
number of bits to read per VLC iteration | |
#define | log_range(var, assign) |
Enumerations | |
enum | { ACB_TYPE_NONE = 0, ACB_TYPE_ASYMMETRIC = 1, ACB_TYPE_HAMMING = 2 } |
Adaptive codebook types. More... | |
enum | { FCB_TYPE_SILENCE = 0, FCB_TYPE_HARDCODED = 1, FCB_TYPE_AW_PULSES = 2, FCB_TYPE_EXC_PULSES = 3 } |
Fixed codebook types. More... | |
Functions | |
static av_cold int | decode_vbmtree (GetBitContext *gb, int8_t vbm_tree[25]) |
Set up the variable bit mode (VBM) tree from container extradata. | |
static av_cold int | wmavoice_decode_init (AVCodecContext *ctx) |
Set up decoder with parameters from demuxer (extradata etc.). | |
static void | dequant_lsps (double *lsps, int num, const uint16_t *values, const uint16_t *sizes, int n_stages, const uint8_t *table, const double *mul_q, const double *base_q) |
Dequantize LSPs. | |
static int | pRNG (int frame_cntr, int block_num, int block_size) |
Generate a random number from frame_cntr and block_num, which will lie in the range [0, 1000 - block_size] (so it can be used as an index into a table of size 1000 from which you want to read block_size entries). | |
static void | synth_block_hardcoded (WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, const struct frame_type_desc *frame_desc, float *excitation) |
Parse hardcoded signal for a single block. | |
static void | synth_block_fcb_acb (WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, int block_pitch_sh2, const struct frame_type_desc *frame_desc, float *excitation) |
Parse FCB/ACB signal for a single block. | |
static void | synth_block (WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, int block_pitch_sh2, const double *lsps, const double *prev_lsps, const struct frame_type_desc *frame_desc, float *excitation, float *synth) |
Parse data in a single block. | |
static int | synth_frame (AVCodecContext *ctx, GetBitContext *gb, int frame_idx, float *samples, const double *lsps, const double *prev_lsps, float *excitation, float *synth) |
Synthesize output samples for a single frame. | |
static void | stabilize_lsps (double *lsps, int num) |
Ensure minimum value for first item, maximum value for last value, proper spacing between each value and proper ordering. | |
static int | check_bits_for_superframe (GetBitContext *orig_gb, WMAVoiceContext *s) |
Test if there's enough bits to read 1 superframe. | |
static int | synth_superframe (AVCodecContext *ctx, float *samples, int *data_size) |
Synthesize output samples for a single superframe. | |
static int | parse_packet_header (WMAVoiceContext *s) |
Parse the packet header at the start of each packet (input data to this decoder). | |
static void | copy_bits (PutBitContext *pb, const uint8_t *data, int size, GetBitContext *gb, int nbits) |
Copy (unaligned) bits from gb/data/size to pb. | |
static int | wmavoice_decode_packet (AVCodecContext *ctx, void *data, int *data_size, AVPacket *avpkt) |
Packet decoding: a packet is anything that the (ASF) demuxer contains, and we expect that the demuxer / application provides it to us as such (else you'll probably get garbage as output). | |
static av_cold int | wmavoice_decode_end (AVCodecContext *ctx) |
static av_cold void | wmavoice_flush (AVCodecContext *ctx) |
Postfilter functions | |
Postfilter functions (gain control, Wiener denoise filter, DC filter, Kalman smoothing, plus surrounding code to wrap it) | |
static void | adaptive_gain_control (float *out, const float *in, const float *speech_synth, int size, float alpha, float *gain_mem) |
Adaptive gain control (as used in postfilter). | |
static int | kalman_smoothen (WMAVoiceContext *s, int pitch, const float *in, float *out, int size) |
Kalman smoothing function. | |
static float | tilt_factor (const float *lpcs, int n_lpcs) |
Get the tilt factor of a formant filter from its transfer function. | |
static void | calc_input_response (WMAVoiceContext *s, float *lpcs, int fcb_type, float *coeffs, int remainder) |
Derive denoise filter coefficients (in real domain) from the LPCs. | |
static void | wiener_denoise (WMAVoiceContext *s, int fcb_type, float *synth_pf, int size, const float *lpcs) |
This function applies a Wiener filter on the (noisy) speech signal as a means to denoise it. | |
static void | postfilter (WMAVoiceContext *s, const float *synth, float *samples, int size, const float *lpcs, float *zero_exc_pf, int fcb_type, int pitch) |
Averaging projection filter, the postfilter used in WMAVoice. | |
LSP dequantization routines | |
LSP dequantization routines, for 10/16LSPs and independent/residual coding.
| |
static void | dequant_lsp10i (GetBitContext *gb, double *lsps) |
Parse 10 independently-coded LSPs. | |
static void | dequant_lsp10r (GetBitContext *gb, double *i_lsps, const double *old, double *a1, double *a2, int q_mode) |
Parse 10 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames from them (residual coding). | |
static void | dequant_lsp16i (GetBitContext *gb, double *lsps) |
Parse 16 independently-coded LSPs. | |
static void | dequant_lsp16r (GetBitContext *gb, double *i_lsps, const double *old, double *a1, double *a2, int q_mode) |
Parse 16 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames from them (residual coding). | |
Pitch-adaptive window coding functions | |
The next few functions are for pitch-adaptive window coding. | |
static void | aw_parse_coords (WMAVoiceContext *s, GetBitContext *gb, const int *pitch) |
Parse the offset of the first pitch-adaptive window pulses, and the distribution of pulses between the two blocks in this frame. | |
static void | aw_pulse_set2 (WMAVoiceContext *s, GetBitContext *gb, int block_idx, AMRFixed *fcb) |
Apply second set of pitch-adaptive window pulses. | |
static void | aw_pulse_set1 (WMAVoiceContext *s, GetBitContext *gb, int block_idx, AMRFixed *fcb) |
Apply first set of pitch-adaptive window pulses. | |
Variables | |
static VLC | frame_type_vlc |
Frame type VLC coding. | |
static struct frame_type_desc | frame_descs [17] |
Description of frame types. | |
AVCodec | ff_wmavoice_decoder |
Definition in file wmavoice.c.
#define log_range | ( | var, | |||
assign | ) |
#define MAX_BLOCKS 8 |
maximum number of blocks per frame
Definition at line 43 of file wmavoice.c.
Referenced by synth_frame().
#define MAX_FRAMES 3 |
maximum number of frames per superframe
Definition at line 47 of file wmavoice.c.
Referenced by check_bits_for_superframe(), and synth_superframe().
#define MAX_FRAMESIZE 160 |
#define MAX_LSPS 16 |
maximum filter order
Definition at line 44 of file wmavoice.c.
Referenced by synth_block(), synth_frame(), synth_superframe(), and wmavoice_flush().
#define MAX_LSPS_ALIGN16 16 |
same as MAX_LSPS; needs to be multiple
of 16 for ASM input buffer alignment
Definition at line 45 of file wmavoice.c.
Referenced by postfilter(), and wmavoice_flush().
#define MAX_SFRAMESIZE (MAX_FRAMESIZE * MAX_FRAMES) |
maximum number of samples per superframe
Definition at line 50 of file wmavoice.c.
Referenced by synth_superframe().
#define MAX_SIGNAL_HISTORY 416 |
maximum excitation signal history
Definition at line 49 of file wmavoice.c.
Referenced by synth_superframe(), wmavoice_decode_init(), and wmavoice_flush().
#define SFRAME_CACHE_MAXSIZE 256 |
maximum cache size for frame data that
was split over two packets
Definition at line 52 of file wmavoice.c.
Referenced by wmavoice_decode_packet().
#define VLC_NBITS 6 |
number of bits to read per VLC iteration
Definition at line 54 of file wmavoice.c.
Referenced by decode_vbmtree().
anonymous enum |
Adaptive codebook types.
ACB_TYPE_NONE | no adaptive codebook (only hardcoded fixed) |
ACB_TYPE_ASYMMETRIC |
adaptive codebook with per-frame pitch, which we interpolate to get a per-sample pitch.
Signal is generated using an asymmetric sinc window function
|
ACB_TYPE_HAMMING |
Per-block pitch with signal generation using a Hamming sinc window function.
|
Definition at line 64 of file wmavoice.c.
anonymous enum |
Fixed codebook types.
Definition at line 79 of file wmavoice.c.
static void adaptive_gain_control | ( | float * | out, | |
const float * | in, | |||
const float * | speech_synth, | |||
int | size, | |||
float | alpha, | |||
float * | gain_mem | |||
) | [static] |
Adaptive gain control (as used in postfilter).
Identical to ff_adaptive_gain_control() in acelp_vectors.c, except that the energy here is calculated using sum(abs(...)), whereas the other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
out | output buffer for filtered samples | |
in | input buffer containing the samples as they are after the postfilter steps so far | |
speech_synth | input buffer containing speech synth before postfilter | |
size | input buffer size | |
alpha | exponential filter factor | |
gain_mem | pointer to filter memory (single float) |
Definition at line 465 of file wmavoice.c.
Referenced by postfilter().
static void aw_parse_coords | ( | WMAVoiceContext * | s, | |
GetBitContext * | gb, | |||
const int * | pitch | |||
) | [static] |
Parse the offset of the first pitch-adaptive window pulses, and the distribution of pulses between the two blocks in this frame.
s | WMA Voice decoding context private data | |
gb | bit I/O context | |
pitch | pitch for each block in this frame |
Definition at line 993 of file wmavoice.c.
Referenced by synth_frame().
static void aw_pulse_set1 | ( | WMAVoiceContext * | s, | |
GetBitContext * | gb, | |||
int | block_idx, | |||
AMRFixed * | fcb | |||
) | [static] |
Apply first set of pitch-adaptive window pulses.
s | WMA Voice decoding context private data | |
gb | bit I/O context | |
block_idx | block index in frame [0, 1] | |
fcb | storage location for fixed codebook pulse info |
Definition at line 1133 of file wmavoice.c.
Referenced by synth_block_fcb_acb().
static void aw_pulse_set2 | ( | WMAVoiceContext * | s, | |
GetBitContext * | gb, | |||
int | block_idx, | |||
AMRFixed * | fcb | |||
) | [static] |
Apply second set of pitch-adaptive window pulses.
s | WMA Voice decoding context private data | |
gb | bit I/O context | |
block_idx | block index in frame [0, 1] | |
fcb | structure containing fixed codebook vector info |
Definition at line 1044 of file wmavoice.c.
Referenced by synth_block_fcb_acb().
static void calc_input_response | ( | WMAVoiceContext * | s, | |
float * | lpcs, | |||
int | fcb_type, | |||
float * | coeffs, | |||
int | remainder | |||
) | [static] |
Derive denoise filter coefficients (in real domain) from the LPCs.
Definition at line 564 of file wmavoice.c.
Referenced by wiener_denoise().
static int check_bits_for_superframe | ( | GetBitContext * | orig_gb, | |
WMAVoiceContext * | s | |||
) | [static] |
Test if there's enough bits to read 1 superframe.
orig_gb | bit I/O context used for reading. This function does not modify the state of the bitreader; it only uses it to copy the current stream position | |
s | WMA Voice decoding context private data |
Definition at line 1640 of file wmavoice.c.
Referenced by synth_superframe().
static void copy_bits | ( | PutBitContext * | pb, | |
const uint8_t * | data, | |||
int | size, | |||
GetBitContext * | gb, | |||
int | nbits | |||
) | [static] |
Copy (unaligned) bits from gb/data/size to pb.
pb | target buffer to copy bits into | |
data | source buffer to copy bits from | |
size | size of the source data, in bytes | |
gb | bit I/O context specifying the current position in the source data. This function might use this to align the bit position to a whole-byte boundary before calling ff_copy_bits() on aligned source data | |
nbits | the amount of bits to copy from source to target |
Definition at line 1882 of file wmavoice.c.
static av_cold int decode_vbmtree | ( | GetBitContext * | gb, | |
int8_t | vbm_tree[25] | |||
) | [static] |
Set up the variable bit mode (VBM) tree from container extradata.
gb | bit I/O context. The bit context (s->gb) should be loaded with byte 23-46 of the container extradata (i.e. the ones containing the VBM tree). | |
vbm_tree | pointer to array to which the decoded VBM tree will be written. |
Definition at line 301 of file wmavoice.c.
Referenced by wmavoice_decode_init().
static void dequant_lsp10i | ( | GetBitContext * | gb, | |
double * | lsps | |||
) | [static] |
Parse 10 independently-coded LSPs.
Definition at line 848 of file wmavoice.c.
Referenced by dequant_lsp10r(), and synth_superframe().
static void dequant_lsp10r | ( | GetBitContext * | gb, | |
double * | i_lsps, | |||
const double * | old, | |||
double * | a1, | |||
double * | a2, | |||
int | q_mode | |||
) | [static] |
Parse 10 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames from them (residual coding).
Definition at line 874 of file wmavoice.c.
Referenced by synth_superframe().
static void dequant_lsp16i | ( | GetBitContext * | gb, | |
double * | lsps | |||
) | [static] |
Parse 16 independently-coded LSPs.
Definition at line 910 of file wmavoice.c.
Referenced by dequant_lsp16r(), and synth_superframe().
static void dequant_lsp16r | ( | GetBitContext * | gb, | |
double * | i_lsps, | |||
const double * | old, | |||
double * | a1, | |||
double * | a2, | |||
int | q_mode | |||
) | [static] |
Parse 16 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames from them (residual coding).
Definition at line 943 of file wmavoice.c.
Referenced by synth_superframe().
static void dequant_lsps | ( | double * | lsps, | |
int | num, | |||
const uint16_t * | values, | |||
const uint16_t * | sizes, | |||
int | n_stages, | |||
const uint8_t * | table, | |||
const double * | mul_q, | |||
const double * | base_q | |||
) | [static] |
Dequantize LSPs.
lsps | output pointer to the array that will hold the LSPs | |
num | number of LSPs to be dequantized | |
values | quantized values, contains n_stages values | |
sizes | range (i.e. max value) of each quantized value | |
n_stages | number of dequantization runs | |
table | dequantization table to be used | |
mul_q | LSF multiplier | |
base_q | base (lowest) LSF values |
Definition at line 816 of file wmavoice.c.
Referenced by dequant_lsp10i(), dequant_lsp10r(), dequant_lsp16i(), and dequant_lsp16r().
static int kalman_smoothen | ( | WMAVoiceContext * | s, | |
int | pitch, | |||
const float * | in, | |||
float * | out, | |||
int | size | |||
) | [static] |
Kalman smoothing function.
This function looks pitch +/- 3 samples back into history to find the best fitting curve (the one giving the optimal gain of the two signals, i.e. the highest dot product between the two), and then uses that signal history to smoothen the output of the speech synthesis filter.
s | WMA Voice decoding context | |
pitch | pitch of the speech signal | |
in | input speech signal | |
out | output pointer for smoothened signal | |
size | input/output buffer size |
Definition at line 505 of file wmavoice.c.
Referenced by postfilter().
static int parse_packet_header | ( | WMAVoiceContext * | s | ) | [static] |
Parse the packet header at the start of each packet (input data to this decoder).
s | WMA Voice decoding context private data |
Definition at line 1847 of file wmavoice.c.
Referenced by gxf_header(), gxf_packet(), gxf_resync_media(), and wmavoice_decode_packet().
static void postfilter | ( | WMAVoiceContext * | s, | |
const float * | synth, | |||
float * | samples, | |||
int | size, | |||
const float * | lpcs, | |||
float * | zero_exc_pf, | |||
int | fcb_type, | |||
int | pitch | |||
) | [static] |
Averaging projection filter, the postfilter used in WMAVoice.
This uses the following steps:
s | WMAVoice decoding context | |
synth | Speech synthesis output (before postfilter) | |
samples | Output buffer for filtered samples | |
size | Buffer size of synth & samples | |
lpcs | Generated LPCs used for speech synthesis | |
zero_exc_pf | destination for zero synthesis filter (16-byte aligned) | |
fcb_type | Frame type (silence, hardcoded, AW-pulses or FCB-pulses) | |
pitch | Pitch of the input signal |
Definition at line 762 of file wmavoice.c.
static int pRNG | ( | int | frame_cntr, | |
int | block_num, | |||
int | block_size | |||
) | [static] |
Generate a random number from frame_cntr and block_num, which will lie in the range [0, 1000 - block_size] (so it can be used as an index into a table of size 1000 from which you want to read block_size entries).
frame_cntr | current frame number | |
block_num | current block index | |
block_size | amount of entries we want to read from a table that has 1000 entries |
Definition at line 1194 of file wmavoice.c.
Referenced by synth_block_hardcoded().
static void stabilize_lsps | ( | double * | lsps, | |
int | num | |||
) | [static] |
Ensure minimum value for first item, maximum value for last value, proper spacing between each value and proper ordering.
lsps | array of LSPs | |
num | size of LSP array |
Definition at line 1602 of file wmavoice.c.
Referenced by synth_superframe().
static void synth_block | ( | WMAVoiceContext * | s, | |
GetBitContext * | gb, | |||
int | block_idx, | |||
int | size, | |||
int | block_pitch_sh2, | |||
const double * | lsps, | |||
const double * | prev_lsps, | |||
const struct frame_type_desc * | frame_desc, | |||
float * | excitation, | |||
float * | synth | |||
) | [static] |
Parse data in a single block.
s | WMA Voice decoding context private data | |
gb | bit I/O context | |
block_idx | index of the to-be-read block | |
size | amount of samples to be read in this block | |
block_pitch_sh2 | pitch for this block << 2 | |
lsps | LSPs for (the end of) this frame | |
prev_lsps | LSPs for the last frame | |
frame_desc | frame type descriptor | |
excitation | target memory for the ACB+FCB interpolated signal | |
synth | target memory for the speech synthesis filter output |
Definition at line 1384 of file wmavoice.c.
Referenced by synth_frame().
static void synth_block_fcb_acb | ( | WMAVoiceContext * | s, | |
GetBitContext * | gb, | |||
int | block_idx, | |||
int | size, | |||
int | block_pitch_sh2, | |||
const struct frame_type_desc * | frame_desc, | |||
float * | excitation | |||
) | [static] |
Parse FCB/ACB signal for a single block.
Definition at line 1261 of file wmavoice.c.
Referenced by synth_block().
static void synth_block_hardcoded | ( | WMAVoiceContext * | s, | |
GetBitContext * | gb, | |||
int | block_idx, | |||
int | size, | |||
const struct frame_type_desc * | frame_desc, | |||
float * | excitation | |||
) | [static] |
Parse hardcoded signal for a single block.
Definition at line 1230 of file wmavoice.c.
Referenced by synth_block().
static int synth_frame | ( | AVCodecContext * | ctx, | |
GetBitContext * | gb, | |||
int | frame_idx, | |||
float * | samples, | |||
const double * | lsps, | |||
const double * | prev_lsps, | |||
float * | excitation, | |||
float * | synth | |||
) | [static] |
Synthesize output samples for a single frame.
ctx | WMA Voice decoder context | |
gb | bit I/O context (s->gb or one for cross-packet superframes) | |
frame_idx | Frame number within superframe [0-2] | |
samples | pointer to output sample buffer, has space for at least 160 samples | |
lsps | LSP array | |
prev_lsps | array of previous frame's LSPs | |
excitation | target buffer for excitation signal | |
synth | target buffer for synthesized speech data |
Definition at line 1427 of file wmavoice.c.
Referenced by synth_superframe().
static int synth_superframe | ( | AVCodecContext * | ctx, | |
float * | samples, | |||
int * | data_size | |||
) | [static] |
Synthesize output samples for a single superframe.
If we have any data cached in s->sframe_cache, that will be used instead of whatever is loaded in s->gb.
WMA Voice superframes contain 3 frames, each containing 160 audio samples, to give a total of 480 samples per superframe. See synth_frame() for frame parsing. In addition to the 3 frames, superframes can also contain the LSPs, if these are specified globally for all frames (residual coding); they can alternatively be specified individually per frame (see the s->has_residual_lsps option). A superframe can also specify the number of samples encoded in it (if less than 480), which is usually used to prevent blanks at track boundaries.
ctx | WMA Voice decoder context | |
samples | pointer to output buffer for voice samples | |
data_size | pointer containing the size of samples on input, and the amount of samples filled on output |
Definition at line 1728 of file wmavoice.c.
Referenced by wmavoice_decode_packet().
static float tilt_factor | ( | const float * | lpcs, | |
int | n_lpcs | |||
) | [static] |
Get the tilt factor of a formant filter from its transfer function.
lpcs | LPC coefficients | |
n_lpcs | Size of LPC buffer |
Definition at line 551 of file wmavoice.c.
static void wiener_denoise | ( | WMAVoiceContext * | s, | |
int | fcb_type, | |||
float * | synth_pf, | |||
int | size, | |||
const float * | lpcs | |||
) | [static] |
This function applies a Wiener filter on the (noisy) speech signal as a means to denoise it.
s | WMA Voice decoding context | |
fcb_type | Frame (codebook) type | |
synth_pf | input: the noisy speech signal, output: denoised speech data; should be 16-byte aligned (for ASM purposes) | |
size | size of the speech data | |
lpcs | LPCs used to synthesize this frame's speech data |
Definition at line 680 of file wmavoice.c.
Referenced by postfilter().
static av_cold int wmavoice_decode_end | ( | AVCodecContext * | ctx | ) | [static] |
Definition at line 1992 of file wmavoice.c.
static av_cold int wmavoice_decode_init | ( | AVCodecContext * | ctx | ) | [static] |
Set up decoder with parameters from demuxer (extradata etc.).
Extradata layout:
Definition at line 336 of file wmavoice.c.
static int wmavoice_decode_packet | ( | AVCodecContext * | ctx, | |
void * | data, | |||
int * | data_size, | |||
AVPacket * | avpkt | |||
) | [static] |
Packet decoding: a packet is anything that the (ASF) demuxer contains, and we expect that the demuxer / application provides it to us as such (else you'll probably get garbage as output).
Every packet has a size of ctx->block_align bytes, starts with a packet header (see parse_packet_header()), and then a series of superframes. Superframe boundaries may exceed packets, i.e. superframes can split data over multiple (two) packets.
For more information about frames, see synth_superframe().
Definition at line 1911 of file wmavoice.c.
static av_cold void wmavoice_flush | ( | AVCodecContext * | ctx | ) | [static] |
Definition at line 2006 of file wmavoice.c.
Initial value:
{ "wmavoice", AVMEDIA_TYPE_AUDIO, CODEC_ID_WMAVOICE, sizeof(WMAVoiceContext), wmavoice_decode_init, NULL, wmavoice_decode_end, wmavoice_decode_packet, CODEC_CAP_SUBFRAMES, .flush = wmavoice_flush, .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"), }
Definition at line 2034 of file wmavoice.c.
struct frame_type_desc frame_descs[17] [static] |
VLC frame_type_vlc [static] |