FFmpeg
|
Windows Media Audio Voice compatible decoder. More...
#include <math.h>
#include "libavutil/channel_layout.h"
#include "libavutil/float_dsp.h"
#include "libavutil/mem.h"
#include "avcodec.h"
#include "internal.h"
#include "get_bits.h"
#include "put_bits.h"
#include "wmavoice_data.h"
#include "celp_filters.h"
#include "acelp_vectors.h"
#include "acelp_filters.h"
#include "lsp.h"
#include "dct.h"
#include "rdft.h"
#include "sinewin.h"
Go to the source code of this file.
Data Structures | |
struct | frame_type_desc |
Description of frame types. More... | |
struct | WMAVoiceContext |
WMA Voice decoding context. More... | |
Macros | |
#define | MAX_BLOCKS 8 |
maximum number of blocks per frame | |
#define | MAX_LSPS 16 |
maximum filter order | |
#define | MAX_LSPS_ALIGN16 16 |
same as MAX_LSPS; needs to be a multiple of 16 for ASM input buffer alignment | |
#define | MAX_FRAMES 3 |
maximum number of frames per superframe | |
#define | MAX_FRAMESIZE 160 |
maximum number of samples per frame | |
#define | MAX_SIGNAL_HISTORY 416 |
maximum excitation signal history | |
#define | MAX_SFRAMESIZE (MAX_FRAMESIZE * MAX_FRAMES) |
maximum number of samples per superframe | |
#define | SFRAME_CACHE_MAXSIZE 256 |
maximum cache size for frame data that was split over two packets | |
#define | VLC_NBITS 6 |
number of bits to read per VLC iteration | |
#define | log_range(var, assign) |
Enumerations | |
enum | { ACB_TYPE_NONE = 0, ACB_TYPE_ASYMMETRIC = 1, ACB_TYPE_HAMMING = 2 } |
Adaptive codebook types. More... | |
enum | { FCB_TYPE_SILENCE = 0, FCB_TYPE_HARDCODED = 1, FCB_TYPE_AW_PULSES = 2, FCB_TYPE_EXC_PULSES = 3 } |
Fixed codebook types. More... | |
Functions | |
static av_cold int | decode_vbmtree (GetBitContext *gb, int8_t vbm_tree[25]) |
Set up the variable bit mode (VBM) tree from container extradata. | |
static av_cold void | wmavoice_init_static_data (AVCodec *codec) |
static av_cold int | wmavoice_decode_init (AVCodecContext *ctx) |
Set up decoder with parameters from demuxer (extradata etc.). | |
static void | dequant_lsps (double *lsps, int num, const uint16_t *values, const uint16_t *sizes, int n_stages, const uint8_t *table, const double *mul_q, const double *base_q) |
Dequantize LSPs. | |
static int | pRNG (int frame_cntr, int block_num, int block_size) |
Generate a random number from frame_cntr and block_num, which will lie in the range [0, 1000 - block_size] (so it can be used as an index in a table of size 1000 of which you want to read block_size entries). | |
static void | synth_block_hardcoded (WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, const struct frame_type_desc *frame_desc, float *excitation) |
Parse hardcoded signal for a single block. | |
static void | synth_block_fcb_acb (WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, int block_pitch_sh2, const struct frame_type_desc *frame_desc, float *excitation) |
Parse FCB/ACB signal for a single block. | |
static void | synth_block (WMAVoiceContext *s, GetBitContext *gb, int block_idx, int size, int block_pitch_sh2, const double *lsps, const double *prev_lsps, const struct frame_type_desc *frame_desc, float *excitation, float *synth) |
Parse data in a single block. | |
static int | synth_frame (AVCodecContext *ctx, GetBitContext *gb, int frame_idx, float *samples, const double *lsps, const double *prev_lsps, float *excitation, float *synth) |
Synthesize output samples for a single frame. | |
static void | stabilize_lsps (double *lsps, int num) |
Ensure minimum value for first item, maximum value for last value, proper spacing between each value and proper ordering. | |
static int | check_bits_for_superframe (GetBitContext *orig_gb, WMAVoiceContext *s) |
Test if there's enough bits to read 1 superframe. | |
static int | synth_superframe (AVCodecContext *ctx, AVFrame *frame, int *got_frame_ptr) |
Synthesize output samples for a single superframe. | |
static int | parse_packet_header (WMAVoiceContext *s) |
Parse the packet header at the start of each packet (input data to this decoder). | |
static void | copy_bits (PutBitContext *pb, const uint8_t *data, int size, GetBitContext *gb, int nbits) |
Copy (unaligned) bits from gb/data/size to pb. | |
static int | wmavoice_decode_packet (AVCodecContext *ctx, void *data, int *got_frame_ptr, AVPacket *avpkt) |
Packet decoding: a packet is anything that the (ASF) demuxer contains, and we expect that the demuxer / application provides it to us as such (else you'll probably get garbage as output). | |
static av_cold int | wmavoice_decode_end (AVCodecContext *ctx) |
static av_cold void | wmavoice_flush (AVCodecContext *ctx) |
Postfilter functions | |
Postfilter functions (gain control, Wiener denoise filter, DC filter, Kalman smoothing, plus surrounding code to wrap it) | |
static void | adaptive_gain_control (float *out, const float *in, const float *speech_synth, int size, float alpha, float *gain_mem) |
Adaptive gain control (as used in postfilter). | |
static int | kalman_smoothen (WMAVoiceContext *s, int pitch, const float *in, float *out, int size) |
Kalman smoothing function. | |
static float | tilt_factor (const float *lpcs, int n_lpcs) |
Get the tilt factor of a formant filter from its transfer function. | |
static void | calc_input_response (WMAVoiceContext *s, float *lpcs, int fcb_type, float *coeffs, int remainder) |
Derive denoise filter coefficients (in real domain) from the LPCs. | |
static void | wiener_denoise (WMAVoiceContext *s, int fcb_type, float *synth_pf, int size, const float *lpcs) |
This function applies a Wiener filter on the (noisy) speech signal as a means to denoise it. | |
static void | postfilter (WMAVoiceContext *s, const float *synth, float *samples, int size, const float *lpcs, float *zero_exc_pf, int fcb_type, int pitch) |
Averaging projection filter, the postfilter used in WMAVoice. | |
LSP dequantization routines | |
LSP dequantization routines, for 10/16LSPs and independent/residual coding.
| |
static void | dequant_lsp10i (GetBitContext *gb, double *lsps) |
Parse 10 independently-coded LSPs. | |
static void | dequant_lsp10r (GetBitContext *gb, double *i_lsps, const double *old, double *a1, double *a2, int q_mode) |
Parse 10 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames from them (residual coding). | |
static void | dequant_lsp16i (GetBitContext *gb, double *lsps) |
Parse 16 independently-coded LSPs. | |
static void | dequant_lsp16r (GetBitContext *gb, double *i_lsps, const double *old, double *a1, double *a2, int q_mode) |
Parse 16 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames from them (residual coding). | |
Pitch-adaptive window coding functions | |
The next few functions are for pitch-adaptive window coding. | |
static void | aw_parse_coords (WMAVoiceContext *s, GetBitContext *gb, const int *pitch) |
Parse the offset of the first pitch-adaptive window pulses, and the distribution of pulses between the two blocks in this frame. | |
static int | aw_pulse_set2 (WMAVoiceContext *s, GetBitContext *gb, int block_idx, AMRFixed *fcb) |
Apply second set of pitch-adaptive window pulses. | |
static void | aw_pulse_set1 (WMAVoiceContext *s, GetBitContext *gb, int block_idx, AMRFixed *fcb) |
Apply first set of pitch-adaptive window pulses. | |
Variables | |
static VLC | frame_type_vlc |
Frame type VLC coding. | |
static struct frame_type_desc | frame_descs [17] |
AVCodec | ff_wmavoice_decoder |
Windows Media Audio Voice compatible decoder.
Definition in file wmavoice.c.
#define MAX_BLOCKS 8 |
maximum number of blocks per frame
Definition at line 46 of file wmavoice.c.
Referenced by synth_frame().
#define MAX_LSPS 16 |
maximum filter order
Definition at line 47 of file wmavoice.c.
Referenced by synth_block(), synth_frame(), synth_superframe(), and wmavoice_flush().
#define MAX_LSPS_ALIGN16 16 |
same as MAX_LSPS; needs to be multiple
of 16 for ASM input buffer alignment
Definition at line 48 of file wmavoice.c.
Referenced by postfilter(), and wmavoice_flush().
#define MAX_FRAMES 3 |
maximum number of frames per superframe
Definition at line 50 of file wmavoice.c.
Referenced by check_bits_for_superframe(), and synth_superframe().
#define MAX_FRAMESIZE 160 |
maximum number of samples per frame
Definition at line 51 of file wmavoice.c.
Referenced by aw_parse_coords(), aw_pulse_set1(), aw_pulse_set2(), postfilter(), synth_block_fcb_acb(), synth_block_hardcoded(), synth_frame(), and synth_superframe().
#define MAX_SIGNAL_HISTORY 416 |
maximum excitation signal history
Definition at line 52 of file wmavoice.c.
Referenced by synth_superframe(), wmavoice_decode_init(), and wmavoice_flush().
#define MAX_SFRAMESIZE (MAX_FRAMESIZE * MAX_FRAMES) |
maximum number of samples per superframe
Definition at line 53 of file wmavoice.c.
Referenced by synth_superframe().
#define SFRAME_CACHE_MAXSIZE 256 |
maximum cache size for frame data that
was split over two packets
Definition at line 55 of file wmavoice.c.
Referenced by wmavoice_decode_packet().
#define VLC_NBITS 6 |
number of bits to read per VLC iteration
Definition at line 57 of file wmavoice.c.
Referenced by wmavoice_init_static_data().
#define log_range | ( | var, | |
assign | |||
) |
anonymous enum |
Adaptive codebook types.
ACB_TYPE_NONE |
no adaptive codebook (only hardcoded fixed) |
ACB_TYPE_ASYMMETRIC |
adaptive codebook with per-frame pitch, which we interpolate to get a per-sample pitch. Signal is generated using an asymmetric sinc window function
|
ACB_TYPE_HAMMING |
Per-block pitch with signal generation using a Hamming sinc window function.
|
Definition at line 67 of file wmavoice.c.
anonymous enum |
Fixed codebook types.
Definition at line 82 of file wmavoice.c.
|
static |
Set up the variable bit mode (VBM) tree from container extradata.
gb | bit I/O context. The bit context (s->gb) should be loaded with byte 23-46 of the container extradata (i.e. the ones containing the VBM tree). |
vbm_tree | pointer to array to which the decoded VBM tree will be written. |
Definition at line 304 of file wmavoice.c.
Referenced by wmavoice_decode_init().
Definition at line 318 of file wmavoice.c.
|
static |
Set up decoder with parameters from demuxer (extradata etc.).
Extradata layout:
Definition at line 343 of file wmavoice.c.
|
static |
Adaptive gain control (as used in postfilter).
Identical to ff_adaptive_gain_control() in acelp_vectors.c, except that the energy here is calculated using sum(abs(...)), whereas the other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
out | output buffer for filtered samples |
in | input buffer containing the samples as they are after the postfilter steps so far |
speech_synth | input buffer containing speech synth before postfilter |
size | input buffer size |
alpha | exponential filter factor |
gain_mem | pointer to filter memory (single float) |
Definition at line 474 of file wmavoice.c.
Referenced by postfilter().
|
static |
Kalman smoothing function.
This function looks pitch +/- 3 samples back into history to find the best fitting curve (the one giving the optimal gain of the two signals, i.e. the highest dot product between the two), and then uses that signal history to smooth the output of the speech synthesis filter.
s | WMA Voice decoding context |
pitch | pitch of the speech signal |
in | input speech signal |
out | output pointer for smoothened signal |
size | input/output buffer size |
Definition at line 514 of file wmavoice.c.
Referenced by postfilter().
|
static |
Get the tilt factor of a formant filter from its transfer function.
lpcs | LPC coefficients |
n_lpcs | Size of LPC buffer |
Definition at line 560 of file wmavoice.c.
Referenced by calc_input_response(), and wiener_denoise().
|
static |
Derive denoise filter coefficients (in real domain) from the LPCs.
Definition at line 573 of file wmavoice.c.
Referenced by wiener_denoise().
|
static |
This function applies a Wiener filter on the (noisy) speech signal as a means to denoise it.
s | WMA Voice decoding context |
fcb_type | Frame (codebook) type |
synth_pf | input: the noisy speech signal, output: denoised speech data; should be 16-byte aligned (for ASM purposes) |
size | size of the speech data |
lpcs | LPCs used to synthesize this frame's speech data |
Definition at line 690 of file wmavoice.c.
Referenced by postfilter().
|
static |
Averaging projection filter, the postfilter used in WMAVoice.
This uses the following steps:
s | WMAVoice decoding context |
synth | Speech synthesis output (before postfilter) |
samples | Output buffer for filtered samples |
size | Buffer size of synth & samples |
lpcs | Generated LPCs used for speech synthesis |
zero_exc_pf | destination for zero synthesis filter (16-byte aligned) |
fcb_type | Frame type (silence, hardcoded, AW-pulses or FCB-pulses) |
pitch | Pitch of the input signal |
Definition at line 772 of file wmavoice.c.
Referenced by synth_frame().
|
static |
Dequantize LSPs.
lsps | output pointer to the array that will hold the LSPs |
num | number of LSPs to be dequantized |
values | quantized values, contains n_stages values |
sizes | range (i.e. max value) of each quantized value |
n_stages | number of dequantization runs |
table | dequantization table to be used |
mul_q | LSF multiplier |
base_q | base (lowest) LSF values |
Definition at line 826 of file wmavoice.c.
Referenced by dequant_lsp10i(), dequant_lsp10r(), dequant_lsp16i(), and dequant_lsp16r().
|
static |
Parse 10 independently-coded LSPs.
Definition at line 858 of file wmavoice.c.
Referenced by dequant_lsp10r(), and synth_superframe().
|
static |
Parse 10 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames from them (residual coding).
Definition at line 884 of file wmavoice.c.
Referenced by synth_superframe().
|
static |
Parse 16 independently-coded LSPs.
Definition at line 920 of file wmavoice.c.
Referenced by dequant_lsp16r(), and synth_superframe().
|
static |
Parse 16 independently-coded LSPs, and then derive the tables to generate LSPs for the other frames from them (residual coding).
Definition at line 953 of file wmavoice.c.
Referenced by synth_superframe().
|
static |
Parse the offset of the first pitch-adaptive window pulses, and the distribution of pulses between the two blocks in this frame.
s | WMA Voice decoding context private data |
gb | bit I/O context |
pitch | pitch for each block in this frame |
Definition at line 1003 of file wmavoice.c.
Referenced by synth_frame().
|
static |
Apply second set of pitch-adaptive window pulses.
s | WMA Voice decoding context private data |
gb | bit I/O context |
block_idx | block index in frame [0, 1] |
fcb | structure containing fixed codebook vector info |
Definition at line 1055 of file wmavoice.c.
Referenced by synth_block_fcb_acb().
|
static |
Apply first set of pitch-adaptive window pulses.
s | WMA Voice decoding context private data |
gb | bit I/O context |
block_idx | block index in frame [0, 1] |
fcb | storage location for fixed codebook pulse info |
Definition at line 1145 of file wmavoice.c.
Referenced by synth_block_fcb_acb().
|
static |
Generate a random number from frame_cntr and block_num, which will lie in the range [0, 1000 - block_size] (so it can be used as an index in a table of size 1000 of which you want to read block_size entries).
frame_cntr | current frame number |
block_num | current block index |
block_size | amount of entries we want to read from a table that has 1000 entries |
Definition at line 1206 of file wmavoice.c.
Referenced by synth_block_fcb_acb(), and synth_block_hardcoded().
|
static |
Parse hardcoded signal for a single block.
Definition at line 1242 of file wmavoice.c.
Referenced by synth_block().
|
static |
Parse FCB/ACB signal for a single block.
Definition at line 1273 of file wmavoice.c.
Referenced by synth_block().
|
static |
Parse data in a single block.
s | WMA Voice decoding context private data |
gb | bit I/O context |
block_idx | index of the to-be-read block |
size | amount of samples to be read in this block |
block_pitch_sh2 | pitch for this block << 2 |
lsps | LSPs for (the end of) this frame |
prev_lsps | LSPs for the last frame |
frame_desc | frame type descriptor |
excitation | target memory for the ACB+FCB interpolated signal |
synth | target memory for the speech synthesis filter output |
Definition at line 1408 of file wmavoice.c.
Referenced by synth_frame().
|
static |
Synthesize output samples for a single frame.
ctx | WMA Voice decoder context |
gb | bit I/O context (s->gb or one for cross-packet superframes) |
frame_idx | Frame number within superframe [0-2] |
samples | pointer to output sample buffer, has space for at least 160 samples |
lsps | LSP array |
prev_lsps | array of previous frame's LSPs |
excitation | target buffer for excitation signal |
synth | target buffer for synthesized speech data |
Definition at line 1451 of file wmavoice.c.
Referenced by synth_superframe().
|
static |
Ensure minimum value for first item, maximum value for last value, proper spacing between each value and proper ordering.
lsps | array of LSPs |
num | size of LSP array |
Definition at line 1627 of file wmavoice.c.
Referenced by synth_superframe().
|
static |
Test if there's enough bits to read 1 superframe.
orig_gb | bit I/O context used for reading. This function does not modify the state of the bitreader; it only uses it to copy the current stream position |
s | WMA Voice decoding context private data |
Definition at line 1665 of file wmavoice.c.
Referenced by synth_superframe().
|
static |
Synthesize output samples for a single superframe.
If we have any data cached in s->sframe_cache, that will be used instead of whatever is loaded in s->gb.
WMA Voice superframes contain 3 frames, each containing 160 audio samples, to give a total of 480 samples per superframe. See synth_frame() for frame parsing. In addition to 3 frames, superframes can also contain the LSPs (if these are globally specified for all frames (residually); they can also be specified individually per-frame, see the s->has_residual_lsps option), and can specify the number of samples encoded in this superframe (if less than 480), usually used to prevent blanks at track boundaries.
ctx | WMA Voice decoder context |
Definition at line 1750 of file wmavoice.c.
Referenced by wmavoice_decode_packet().
|
static |
Parse the packet header at the start of each packet (input data to this decoder).
s | WMA Voice decoding context private data |
Definition at line 1882 of file wmavoice.c.
Referenced by wmavoice_decode_packet().
|
static |
Copy (unaligned) bits from gb/data/size to pb.
pb | target buffer to copy bits into |
data | source buffer to copy bits from |
size | size of the source data, in bytes |
gb | bit I/O context specifying the current position in the source data. This function might use this to align the bit position to a whole-byte boundary before calling avpriv_copy_bits() on aligned source data |
nbits | the amount of bits to copy from source to target |
Definition at line 1917 of file wmavoice.c.
Referenced by wmavoice_decode_packet().
|
static |
Packet decoding: a packet is anything that the (ASF) demuxer contains, and we expect that the demuxer / application provides it to us as such (else you'll probably get garbage as output).
Every packet has a size of ctx->block_align bytes, starts with a packet header (see parse_packet_header()), and then a series of superframes. Superframe boundaries may exceed packets, i.e. superframes can split data over multiple (two) packets.
For more information about frames, see synth_superframe().
Definition at line 1946 of file wmavoice.c.
|
static |
Definition at line 2021 of file wmavoice.c.
|
static |
Definition at line 2035 of file wmavoice.c.
|
static |
Definition at line 62 of file wmavoice.c.
|
static |
Referenced by check_bits_for_superframe(), and synth_frame().
AVCodec ff_wmavoice_decoder |
Definition at line 2063 of file wmavoice.c.