Files
gemigreerd-racket-sound-lib/ffmpeg-audio/ffmpeg_audio.cpp
T
2026-04-26 11:22:31 +02:00

833 lines
21 KiB
C++

/*
* Acinerella audio-only decoder.
*
* This file is intentionally written as C++ internally, but exports a stable
* C ABI. That gives us RAII, std::string and std::vector internally, while a
* C or Racket FFI caller still sees a simple C interface.
*
* What this decoder does:
*
* 1. Open a media file with FFmpeg/libavformat.
* 2. Find audio streams.
* 3. Read compressed packets from the container.
* 4. Decode packets with the modern avcodec_send_packet() /
* avcodec_receive_frame() API.
* 5. Convert decoded audio to one predictable output format:
*
* signed 32-bit integer PCM
* interleaved / packed
* native endian
*
* This is suitable for feeding to libao as 32-bit PCM.
*
* Important FFmpeg vocabulary:
*
* Container/demuxer:
* The file format layer: mp3, mp4/m4a, ogg, wav, etc.
* FFmpeg represents this with AVFormatContext.
*
* Stream:
* A file may contain one or more streams. For this audio-only API we only
* care about streams whose codec_type is AVMEDIA_TYPE_AUDIO.
*
* Packet:
* Compressed data belonging to one stream. One packet may decode to zero,
* one, or multiple decoded frames.
*
* Frame:
* Decoded audio samples, but not necessarily in the format we want. MP3,
* for example, may decode to planar float. We therefore use libswresample
* to normalize everything to signed 32-bit interleaved PCM.
*/
#include "ffmpeg_audio.h"
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <string>
#include <vector>
extern "C" {
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavutil/avutil.h>
#include <libavutil/channel_layout.h>
#include <libavutil/samplefmt.h>
#include <libswresample/swresample.h>
}
/*
 * Fixed output sample format used by every decoder in this file:
 * signed 32-bit PCM. The byte width is derived from the bit width so the
 * two values cannot drift apart.
 */
static constexpr int AC_AUDIO_OUTPUT_BITS = 32;
static constexpr int AC_AUDIO_OUTPUT_BYTES = AC_AUDIO_OUTPUT_BITS / 8;
static constexpr AVSampleFormat AC_AUDIO_OUTPUT_FMT = AV_SAMPLE_FMT_S32;
/*
 * Metadata.
 *
 * C code often stores tags in fixed-size arrays such as
 *
 *     char title[512];
 *
 * which truncates long UTF-8 metadata and wastes space. Because the
 * implementation is C++, the text fields are std::string instead. The public
 * C API only exposes const char* getters.
 *
 * Numeric fields default to -1.
 */
struct __fmpg_file_info__ {
    std::string title;
    std::string author;
    std::string album;
    std::string genre;
    std::string comment;
    std::string copyright;
    int year = -1;
    int track = -1;
    int64_t duration = -1; /* milliseconds */
    int bitrate = -1;

    /* Put every field back to its default: empty strings and -1 numbers. */
    void clear() {
        *this = __fmpg_file_info__{};
    }
};
/*
 * __fmpg_instance__ owns the opened media file.
 *
 * AVFormatContext is FFmpeg's demuxer/container object. It knows which streams
 * the file contains and can read compressed packets from it.
 *
 * The destructor closes format_ctx, so the object must never be copied: two
 * copies would both try to close the same context. Copying (and with it the
 * implicit move) is therefore disabled; instances are only handled through
 * pointers by the C API.
 */
struct __fmpg_instance__ {
    bool opened = false;                   /* set once ac_open_file succeeded */
    AVFormatContext *format_ctx = nullptr; /* owned; closed in the destructor */
    fmpg_file_info info;                   /* tags copied out of format_ctx */

    __fmpg_instance__() = default;
    __fmpg_instance__(const __fmpg_instance__ &) = delete;
    __fmpg_instance__ &operator=(const __fmpg_instance__ &) = delete;

    ~__fmpg_instance__() {
        if (format_ctx) {
            avformat_close_input(&format_ctx);
        }
    }
};
/*
 * A package wraps one FFmpeg AVPacket.
 *
 * The old Acinerella name was "package". FFmpeg calls this a packet. It is not
 * decoded audio yet; it is compressed data read from the container.
 *
 * The AVPacket is allocated in the constructor and released in the
 * destructor. Copying is disabled because a copy would free the same packet
 * a second time. Allocation may fail, in which case packet is left null and
 * the creator has to check for that.
 */
struct __fmpg_package__ {
    int stream_index = -1;
    int64_t pts = AV_NOPTS_VALUE;
    AVPacket *packet = nullptr; /* owned */

    __fmpg_package__() : packet(av_packet_alloc()) {}
    __fmpg_package__(const __fmpg_package__ &) = delete;
    __fmpg_package__ &operator=(const __fmpg_package__ &) = delete;

    ~__fmpg_package__() {
        av_packet_free(&packet);
    }
};
/*
 * __fmpg_decoder__ owns the actual audio decoder and resampler for one stream.
 *
 * codec_ctx, frame and swr_ctx are released in the destructor. Copying is
 * disabled because a copy would release the same FFmpeg objects twice. The
 * instance pointer is borrowed, not owned.
 */
struct __fmpg_decoder__ {
    fmpg_instance *instance = nullptr;   /* borrowed */
    int stream_index = -1;
    const AVCodec *codec = nullptr;
    AVCodecContext *codec_ctx = nullptr; /* owned */
    AVFrame *frame = nullptr;            /* owned; reused for every decoded frame */
    SwrContext *swr_ctx = nullptr;       /* owned */
    fmpg_audio_info audio_info{};
    std::vector<uint8_t> pcm;            /* converted output of the latest call */
    double timecode = 0.0;

    __fmpg_decoder__() = default;
    __fmpg_decoder__(const __fmpg_decoder__ &) = delete;
    __fmpg_decoder__ &operator=(const __fmpg_decoder__ &) = delete;

    ~__fmpg_decoder__() {
        avcodec_free_context(&codec_ctx);
        av_frame_free(&frame);
        swr_free(&swr_ctx);
    }
};
/* Return s unchanged, or the empty string literal when s is null. */
static const char *empty_if_null(const char *s) {
    if (s == nullptr) {
        return "";
    }
    return s;
}
/*
 * C-string view of s. An empty string yields the "" literal rather than a
 * pointer into the std::string itself.
 */
static const char *string_c_str(const std::string &s) {
    if (s.empty()) {
        return "";
    }
    return s.c_str();
}
static std::string get_metadata_string(const AVFormatContext *ctx,
const char *key) {
const AVDictionaryEntry *entry =
av_dict_get(ctx->metadata, key, nullptr, 0);
return entry && entry->value ? std::string(entry->value)
: std::string();
}
/*
 * Look up one tag in the container-level metadata dictionary and parse its
 * leading decimal integer.
 *
 * Returns -1 ("unknown") when the tag is missing or empty, when the text does
 * not start with a number, or when the number does not fit in an int.
 * Trailing text after the number is ignored, so "3/12" parses as 3 and
 * "2024-05-01" as 2024.
 *
 * std::strtoll is used instead of std::atoi because atoi has undefined
 * behaviour for out-of-range input and cannot report "no digits at all".
 */
static int get_metadata_int(const AVFormatContext *ctx, const char *key) {
    const AVDictionaryEntry *entry =
        av_dict_get(ctx->metadata, key, nullptr, 0);
    if (!entry || !entry->value || !*entry->value) {
        return -1;
    }

    char *end = nullptr;
    const long long parsed = std::strtoll(entry->value, &end, 10);
    if (end == entry->value) {
        /* No digits were consumed: the tag is not numeric. */
        return -1;
    }
    /*
     * long long is wider than int, so a saturated strtoll result is rejected
     * by this range check as well.
     */
    if (parsed < std::numeric_limits<int>::min() ||
        parsed > std::numeric_limits<int>::max()) {
        return -1;
    }
    return static_cast<int>(parsed);
}
/*
 * Copy the container-level tags and basic properties of the opened file into
 * self->info. Expects self->format_ctx to be an opened context.
 *
 *  - year:     FFmpeg's generic tag for the release date is "date"; "year" is
 *              tried first for files that carry it literally, then the
 *              leading number of "date" is used.
 *  - bitrate:  AVFormatContext::bit_rate is 64-bit; it is clamped to INT_MAX
 *              instead of being truncated by the cast.
 *  - duration: converted from AV_TIME_BASE units to milliseconds, or -1 when
 *              FFmpeg reports AV_NOPTS_VALUE. Dividing by
 *              (AV_TIME_BASE / 1000) gives the same truncated result as
 *              multiplying by 1000 first, without the overflow risk.
 */
static void fill_metadata(fmpg_instance *self) {
    AVFormatContext *ctx = self->format_ctx;
    fmpg_file_info &info = self->info;

    info.clear();
    info.title = get_metadata_string(ctx, "title");
    info.author = get_metadata_string(ctx, "artist");
    info.album = get_metadata_string(ctx, "album");
    info.genre = get_metadata_string(ctx, "genre");
    info.comment = get_metadata_string(ctx, "comment");
    info.copyright = get_metadata_string(ctx, "copyright");

    info.year = get_metadata_int(ctx, "year");
    if (info.year < 0) {
        info.year = get_metadata_int(ctx, "date");
    }
    info.track = get_metadata_int(ctx, "track");

    const int64_t max_int = std::numeric_limits<int>::max();
    info.bitrate = ctx->bit_rate > max_int
                       ? std::numeric_limits<int>::max()
                       : static_cast<int>(ctx->bit_rate);

    info.duration = ctx->duration == AV_NOPTS_VALUE
                        ? -1
                        : ctx->duration / (AV_TIME_BASE / 1000);
}
static bool valid_stream_index(const fmpg_instance *instance, int stream_index)
{
return instance && instance->opened && instance->format_ctx &&
stream_index >= 0 &&
stream_index < static_cast<int>(instance->format_ctx->nb_streams);
}
/*
 * Allocate a new, unopened instance. Returns nullptr when construction
 * throws; no exception crosses the C boundary.
 */
fmpg_instance * ac_init(void) {
    fmpg_instance *created = nullptr;
    try {
        created = new fmpg_instance();
    } catch (...) {
        created = nullptr;
    }
    return created;
}
/*
 * Destroy an instance created by ac_init. The destructor closes the file if
 * it is still open. Passing nullptr is a no-op.
 */
void ac_free(fmpg_instance * instance) {
    if (instance != nullptr) {
        delete instance;
    }
}
/*
 * Open filename on an instance that is not open yet.
 *
 * Returns 1 on success. Returns 0 when an argument is null, the instance is
 * already open, or FFmpeg cannot open or probe the file; on an FFmpeg failure
 * the instance is closed again before returning.
 */
int ac_open_file(fmpg_instance * instance,
                 const char *filename) {
    const bool usable =
        instance != nullptr && !instance->opened && filename != nullptr;
    if (!usable) {
        return 0;
    }

    /*
     * Step 1: open the container and let FFmpeg guess its format. Only the
     * demuxing layer is set up here; no codec is opened.
     */
    const int open_rc = avformat_open_input(&instance->format_ctx,
                                            empty_if_null(filename),
                                            nullptr,
                                            nullptr);
    if (open_rc >= 0) {
        /*
         * Step 2: read enough packets to discover stream metadata such as
         * sample rate, channel layout, codec id, duration and tags.
         */
        const int probe_rc =
            avformat_find_stream_info(instance->format_ctx, nullptr);
        if (probe_rc >= 0) {
            fill_metadata(instance);
            instance->opened = true;
            return 1;
        }
    }

    ac_close(instance);
    return 0;
}
/*
 * Close the file held by instance, if any, and reset its state so that
 * ac_open_file accepts it again. Passing nullptr is a no-op.
 */
void ac_close(fmpg_instance * instance) {
    if (instance != nullptr) {
        if (instance->format_ctx != nullptr) {
            avformat_close_input(&instance->format_ctx);
        }
        instance->info.clear();
        instance->opened = false;
    }
}
/* 1 when instance is non-null and currently has a file open, else 0. */
int ac_is_open(fmpg_instance * instance)
{
    if (instance == nullptr) {
        return 0;
    }
    return instance->opened ? 1 : 0;
}
/*
 * Count the streams of the opened file whose codec type is audio.
 * Returns 0 when the instance is null or not open.
 */
int ac_get_audio_stream_count(fmpg_instance * instance)
{
    if (instance == nullptr || !instance->opened ||
        instance->format_ctx == nullptr) {
        return 0;
    }

    const AVFormatContext *fmt = instance->format_ctx;
    AVStream *const *first = fmt->streams;
    AVStream *const *last = first + fmt->nb_streams;
    const auto audio_total =
        std::count_if(first, last, [](const AVStream *stream) {
            const AVCodecParameters *params = stream->codecpar;
            return params != nullptr &&
                   params->codec_type == AVMEDIA_TYPE_AUDIO;
        });
    return static_cast<int>(audio_total);
}
/*
 * Ask FFmpeg for its preferred audio stream.
 * Returns the stream index, or -1 when the instance is not open or
 * av_find_best_stream reports a negative result.
 */
int ac_get_default_audio_stream(fmpg_instance * instance)
{
    const bool open = instance != nullptr && instance->opened &&
                      instance->format_ctx != nullptr;
    if (!open) {
        return -1;
    }

    const int best = av_find_best_stream(instance->format_ctx,
                                         AVMEDIA_TYPE_AUDIO,
                                         -1,
                                         -1,
                                         nullptr,
                                         0);
    if (best < 0) {
        return -1;
    }
    return best;
}
/*
 * Describe the PCM that a decoder for stream_index will produce.
 *
 * *info is zeroed first, so it holds all zeros on every failure path that is
 * reached with a non-null info. Sample rate and channel count come from the
 * stream; bit and byte width are the fixed output format of this library.
 *
 * Returns 1 when the stream is an audio stream with a positive sample rate
 * and channel count, otherwise 0.
 */
int ac_get_audio_info(fmpg_instance * instance, int stream_index, fmpg_audio_info *info)
{
    if (info == nullptr) {
        return 0;
    }
    std::memset(info, 0, sizeof(*info));

    if (!valid_stream_index(instance, stream_index)) {
        return 0;
    }

    const AVCodecParameters *params =
        instance->format_ctx->streams[stream_index]->codecpar;
    const bool is_audio =
        params != nullptr && params->codec_type == AVMEDIA_TYPE_AUDIO;
    if (!is_audio) {
        return 0;
    }

    info->sample_rate = params->sample_rate;
    info->channels = params->ch_layout.nb_channels;
    info->bits_per_sample = AC_AUDIO_OUTPUT_BITS;
    info->bytes_per_sample = AC_AUDIO_OUTPUT_BYTES;

    if (info->sample_rate > 0 && info->channels > 0) {
        return 1;
    }
    return 0;
}
/*
 * Borrow the metadata record stored inside instance.
 * Returns nullptr for a null instance.
 */
const fmpg_file_info *ac_get_file_info(fmpg_instance * instance)
{
    if (instance == nullptr) {
        return nullptr;
    }
    return &instance->info;
}
/* Title tag as a C string; "" when info is null or the tag is empty. */
const char * ac_file_info_title(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return "";
    }
    return string_c_str(info->title);
}
/* Author tag as a C string; "" when info is null or the tag is empty. */
const char *ac_file_info_author(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return "";
    }
    return string_c_str(info->author);
}
/* Album tag as a C string; "" when info is null or the tag is empty. */
const char *ac_file_info_album(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return "";
    }
    return string_c_str(info->album);
}
/* Genre tag as a C string; "" when info is null or the tag is empty. */
const char *ac_file_info_genre(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return "";
    }
    return string_c_str(info->genre);
}
/* Comment tag as a C string; "" when info is null or the tag is empty. */
const char *ac_file_info_comment(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return "";
    }
    return string_c_str(info->comment);
}
/* Copyright tag as a C string; "" when info is null or the tag is empty. */
const char *ac_file_info_copyright(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return "";
    }
    return string_c_str(info->copyright);
}
/* Stored year value; -1 when info is null. */
int ac_file_info_year(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return -1;
    }
    return info->year;
}
/* Stored track value; -1 when info is null. */
int ac_file_info_track(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return -1;
    }
    return info->track;
}
/* Stored duration in milliseconds; -1 when info is null. */
int64_t ac_file_info_duration(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return -1;
    }
    return info->duration;
}
/* Stored bitrate value; -1 when info is null. */
int ac_file_info_bitrate(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return -1;
    }
    return info->bitrate;
}
/*
 * Read the next compressed packet from the container.
 *
 * The packet may belong to any stream (audio, video, subtitles, ...); the
 * caller inspects the stream index and feeds only matching packets to a
 * decoder. The recorded timestamp is the packet's dts when it has one,
 * otherwise its pts.
 *
 * Returns a new package that the caller releases with ac_free_package, or
 * nullptr when the instance is not open, allocation fails, or av_read_frame
 * reports an error or end of file.
 */
fmpg_package * ac_read_package(fmpg_instance * instance)
{
    const bool readable = instance != nullptr && instance->opened &&
                          instance->format_ctx != nullptr;
    if (!readable) {
        return nullptr;
    }

    fmpg_package *result = nullptr;
    try {
        result = new fmpg_package();
    } catch (...) {
        return nullptr;
    }

    /* The AVPacket itself may have failed to allocate; only read if not. */
    const bool filled =
        result->packet != nullptr &&
        av_read_frame(instance->format_ctx, result->packet) >= 0;
    if (!filled) {
        delete result;
        return nullptr;
    }

    const AVPacket *raw = result->packet;
    result->stream_index = raw->stream_index;
    if (raw->dts != AV_NOPTS_VALUE) {
        result->pts = raw->dts;
    } else {
        result->pts = raw->pts;
    }
    return result;
}
/* Release a package returned by ac_read_package. nullptr is a no-op. */
void ac_free_package(fmpg_package * package)
{
    if (package != nullptr) {
        delete package;
    }
}
/* Index of the stream the package was read from; -1 for a null package. */
int ac_package_stream_index(fmpg_package * package)
{
    if (package == nullptr) {
        return -1;
    }
    return package->stream_index;
}
/*
 * Find, allocate, configure and open the decoder described by par.
 *
 * Results are stored in dec->codec and dec->codec_ctx as soon as they exist,
 * so a partially built context is still released by the decoder's destructor
 * when this function returns false.
 */
static bool init_codec_context(fmpg_decoder *dec, const AVCodecParameters *par)
{
    const AVCodec *found = avcodec_find_decoder(par->codec_id);
    dec->codec = found;
    if (found == nullptr) {
        return false;
    }

    AVCodecContext *ctx = avcodec_alloc_context3(found);
    dec->codec_ctx = ctx;
    if (ctx == nullptr) {
        return false;
    }

    /* Copy the stream's codec parameters into the decoder context. */
    const bool configured = avcodec_parameters_to_context(ctx, par) >= 0;

    /* Open the decoder; from here on packets can be sent to it. */
    return configured && avcodec_open2(ctx, found, nullptr) >= 0;
}
/*
 * Create and initialise the libswresample context for dec.
 *
 * Input and output share the decoder's channel layout and sample rate; only
 * the sample format changes, to the library's fixed output format.
 * Returns false when the decoder reports no channels or no sample rate, or
 * when libswresample refuses the configuration.
 */
static bool init_resampler(fmpg_decoder *dec)
{
    const AVCodecContext *ctx = dec->codec_ctx;
    const AVChannelLayout *layout = &ctx->ch_layout;
    const int rate = ctx->sample_rate;

    if (layout->nb_channels <= 0 || rate <= 0) {
        return false;
    }

    const int alloc_rc = swr_alloc_set_opts2(&dec->swr_ctx,
                                             layout,              /* out layout */
                                             AC_AUDIO_OUTPUT_FMT, /* out format */
                                             rate,                /* out rate   */
                                             layout,              /* in layout  */
                                             ctx->sample_fmt,     /* in format  */
                                             rate,                /* in rate    */
                                             0,
                                             nullptr);
    if (alloc_rc < 0) {
        return false;
    }
    return swr_init(dec->swr_ctx) >= 0;
}
/*
 * Build a decoder for one audio stream of an opened instance.
 *
 * Setup runs in three stages - codec context, reusable frame, resampler -
 * and stops at the first failure. Returns the new decoder, which the caller
 * releases with ac_free_decoder, or nullptr when the stream is not a usable
 * audio stream or any stage fails.
 */
fmpg_decoder * ac_create_decoder(fmpg_instance * instance, int stream_index)
{
    if (!valid_stream_index(instance, stream_index)) {
        return nullptr;
    }

    fmpg_audio_info stream_info{};
    if (!ac_get_audio_info(instance, stream_index, &stream_info)) {
        return nullptr;
    }

    fmpg_decoder *created = nullptr;
    try {
        created = new fmpg_decoder();
    } catch (...) {
        return nullptr;
    }
    created->instance = instance;
    created->stream_index = stream_index;
    created->audio_info = stream_info;

    const AVCodecParameters *params =
        instance->format_ctx->streams[stream_index]->codecpar;

    bool ok = init_codec_context(created, params);
    if (ok) {
        created->frame = av_frame_alloc();
        ok = created->frame != nullptr;
    }
    if (ok) {
        ok = init_resampler(created);
    }

    if (!ok) {
        /* The destructor releases whatever was set up so far. */
        delete created;
        return nullptr;
    }
    return created;
}
/* Release a decoder returned by ac_create_decoder. nullptr is a no-op. */
void ac_free_decoder(fmpg_decoder * decoder)
{
    if (decoder != nullptr) {
        delete decoder;
    }
}
/*
 * Append bytes bytes from src to the decoder's PCM buffer.
 *
 * The buffer is never allowed to grow past INT_MAX bytes, because its size is
 * reported to callers as an int. Returns false when that limit would be
 * exceeded or the allocation throws; appending zero bytes always succeeds.
 */
static bool append_bytes(fmpg_decoder *dec, const uint8_t *src, size_t bytes)
{
    if (bytes == 0) {
        return true;
    }

    const size_t limit = static_cast<size_t>(std::numeric_limits<int>::max());
    const size_t room = limit - dec->pcm.size();
    if (bytes > room) {
        return false;
    }

    try {
        dec->pcm.insert(dec->pcm.end(), src, src + bytes);
    } catch (...) {
        return false;
    }
    return true;
}
/*
 * Convert one decoded frame to the output format and append it to dec->pcm.
 *
 * A frame without samples (or a decoder without channels) is skipped and
 * counts as success. Returns false when libswresample or the buffer size
 * computation fails, or when the PCM buffer cannot grow.
 */
static bool append_converted_frame(fmpg_decoder *dec, const AVFrame *frame)
{
    const int channel_count = dec->codec_ctx->ch_layout.nb_channels;
    if (channel_count <= 0 || frame->nb_samples <= 0) {
        return true;
    }

    /* Packed (align = 1) byte size of sample_count output samples. */
    const auto packed_size = [channel_count](int sample_count) {
        return av_samples_get_buffer_size(nullptr,
                                          channel_count,
                                          sample_count,
                                          AC_AUDIO_OUTPUT_FMT,
                                          1);
    };

    /*
     * Ask the resampler for an upper bound on the output sample count rather
     * than assuming it equals the input count; the resampler can hold
     * delayed samples internally.
     */
    const int capacity_samples =
        swr_get_out_samples(dec->swr_ctx, frame->nb_samples);
    if (capacity_samples <= 0) {
        return false;
    }
    const int capacity_bytes = packed_size(capacity_samples);
    if (capacity_bytes <= 0) {
        return false;
    }

    std::vector<uint8_t> scratch(static_cast<size_t>(capacity_bytes));
    uint8_t *planes[1] = { scratch.data() };

    /* The actual conversion to interleaved output samples. */
    const int converted =
        swr_convert(dec->swr_ctx,
                    planes,
                    capacity_samples,
                    const_cast<const uint8_t **>(frame->data),
                    frame->nb_samples);
    if (converted < 0) {
        return false;
    }

    const int converted_bytes = packed_size(converted);
    if (converted_bytes < 0) {
        return false;
    }
    return append_bytes(dec,
                        scratch.data(),
                        static_cast<size_t>(converted_bytes));
}
/*
 * Pull every frame the decoder currently has ready and append its converted
 * PCM to dec->pcm.
 *
 * Returns 1 when at least one frame was received, 0 when none was, and -1 on
 * a decoder or conversion error. The reusable frame is unreferenced after
 * every received frame, including on the error path.
 */
static int receive_available_frames(fmpg_decoder *dec)
{
    bool got_frame = false;

    while (true) {
        const int rc = avcodec_receive_frame(dec->codec_ctx, dec->frame);
        if (rc == AVERROR(EAGAIN) || rc == AVERROR_EOF) {
            /* Needs more input, or fully drained: nothing more for now. */
            break;
        }
        if (rc < 0) {
            return -1;
        }

        const bool stored = append_converted_frame(dec, dec->frame);
        av_frame_unref(dec->frame);
        if (!stored) {
            return -1;
        }
        got_frame = true;
    }

    return got_frame ? 1 : 0;
}
/*
 * Set dec->timecode (seconds) from the packet timestamp, using the time base
 * of the stream the packet belongs to.
 *
 * The timecode is left untouched when the packet has no timestamp, or when
 * the decoder's instance is no longer open. After ac_close() the instance's
 * format_ctx is null, so reading the stream table without this check would
 * dereference a null pointer. The stream index is range-checked for the same
 * reason.
 */
static void update_timecode_from_packet(fmpg_decoder *dec, const fmpg_package *pkg)
{
    if (!dec || !pkg || pkg->pts == AV_NOPTS_VALUE) {
        return;
    }
    if (!valid_stream_index(dec->instance, pkg->stream_index)) {
        return;
    }
    const AVStream *stream =
        dec->instance->format_ctx->streams[pkg->stream_index];
    dec->timecode = static_cast<double>(pkg->pts) * av_q2d(stream->time_base);
}
/*
 * Decode one package with the given decoder.
 *
 * The decoder's PCM buffer is cleared first and then filled with everything
 * this call produces; read it with ac_decoder_buffer and
 * ac_decoder_buffer_size.
 *
 * Returns 1 when at least one frame was decoded during this call. Returns 0
 * when an argument is null, the package belongs to another stream, FFmpeg
 * reports an error, or the decoder produced no frame yet.
 */
int ac_decode_package(fmpg_package * package, fmpg_decoder * decoder)
{
    if (!package || !decoder || !package->packet ||
        package->stream_index != decoder->stream_index) {
        return 0;
    }
    decoder->pcm.clear();
    update_timecode_from_packet(decoder, package);

    /*
     * Modern FFmpeg decoding is a two-step queue-like API:
     *
     *   1. send compressed packet
     *   2. receive all decoded frames currently available
     *
     * A single packet can produce multiple frames, especially with codecs
     * that buffer internally. All produced PCM blocks are concatenated.
     */
    bool produced = false;

    int ret = avcodec_send_packet(decoder->codec_ctx, package->packet);
    if (ret == AVERROR(EAGAIN)) {
        /*
         * The decoder's output queue is full: drain it, then resend. Frames
         * drained here are already in the PCM buffer, so they must count
         * towards the return value even if the resent packet yields nothing.
         */
        const int drained = receive_available_frames(decoder);
        if (drained < 0) {
            return 0;
        }
        produced = drained > 0;
        ret = avcodec_send_packet(decoder->codec_ctx, package->packet);
    }
    if (ret < 0) {
        return 0;
    }

    const int received = receive_available_frames(decoder);
    if (received < 0) {
        return 0;
    }
    return (produced || received > 0) ? 1 : 0;
}
int ac_flush_decoder(fmpg_decoder * decoder)
{
if (!decoder) {
return 0;
}
decoder->pcm.clear();
/*
* Sending NULL tells FFmpeg that no more input is coming and that delayed
* decoded frames should be drained.
*/
const int ret = avcodec_send_packet(decoder->codec_ctx, nullptr);
if (ret < 0 && ret != AVERROR_EOF) {
return 0;
}
const int produced = receive_available_frames(decoder);
if (produced < 0) {
return 0;
}
/* Drain possible delayed samples from libswresample as well. */
const int channels = decoder->codec_ctx->ch_layout.nb_channels;
for (;;) {
const int delay =
static_cast<int>(swr_get_delay(decoder->swr_ctx,
decoder->codec_ctx->sample_rate));
if (delay <= 0) {
break;
}
const int max_bytes =
av_samples_get_buffer_size(nullptr,
channels,
delay,
AC_AUDIO_OUTPUT_FMT,
1);
if (max_bytes <= 0) {
break;
}
std::vector<uint8_t> tmp(static_cast<size_t>(max_bytes));
uint8_t *out_planes[1] = { tmp.data() };
const int out_samples =
swr_convert(decoder->swr_ctx,
out_planes,
delay,
nullptr,
0);
if (out_samples <= 0) {
break;
}
const int used_bytes =
av_samples_get_buffer_size(nullptr,
channels,
out_samples,
AC_AUDIO_OUTPUT_FMT,
1);
if (used_bytes < 0 ||
!append_bytes(decoder, tmp.data(), static_cast<size_t>(used_bytes))) {
break;
}
}
return decoder->pcm.empty() ? 0 : 1;
}
/*
 * Seek the decoder's stream to target_pos_ms (milliseconds).
 *
 * The seek uses AVSEEK_FLAG_BACKWARD. On success the decoder's timecode is
 * set to the requested position, its PCM buffer is emptied, and both the
 * codec and the resampler are reset so that no pre-seek data leaks into later
 * output.
 *
 * Returns 1 on success, 0 when the decoder has no open file behind it, the
 * seek fails, or the resampler cannot be re-initialised.
 */
int ac_seek_ms(fmpg_decoder * decoder, int64_t target_pos_ms)
{
    if (decoder == nullptr || decoder->instance == nullptr ||
        decoder->instance->format_ctx == nullptr) {
        return 0;
    }

    AVFormatContext *fmt = decoder->instance->format_ctx;
    const AVStream *stream = fmt->streams[decoder->stream_index];

    /* milliseconds -> AV_TIME_BASE units -> the stream's own time base */
    const int64_t target_us = av_rescale(target_pos_ms, AV_TIME_BASE, 1000);
    const int64_t target_ts =
        av_rescale_q(target_us, AV_TIME_BASE_Q, stream->time_base);

    const int seek_rc = av_seek_frame(fmt,
                                      decoder->stream_index,
                                      target_ts,
                                      AVSEEK_FLAG_BACKWARD);
    if (seek_rc < 0) {
        return 0;
    }

    decoder->pcm.clear();
    decoder->timecode = target_pos_ms / 1000.0;

    /* Old buffered data no longer belongs to the new seek position. */
    avcodec_flush_buffers(decoder->codec_ctx);

    /* Reset resampler delay/state too. */
    swr_close(decoder->swr_ctx);
    if (swr_init(decoder->swr_ctx) < 0) {
        return 0;
    }
    return 1;
}
/*
 * Pointer to the PCM held by the decoder, or nullptr when the decoder is
 * null or its PCM buffer is empty. The memory belongs to the decoder.
 */
const uint8_t *ac_decoder_buffer(fmpg_decoder * decoder)
{
    if (decoder == nullptr || decoder->pcm.empty()) {
        return nullptr;
    }
    return decoder->pcm.data();
}
/*
 * Number of PCM bytes currently held by the decoder. Returns 0 for a null
 * decoder, and also when the size would not fit in an int.
 */
int ac_decoder_buffer_size(fmpg_decoder * decoder)
{
    if (decoder == nullptr) {
        return 0;
    }
    const size_t held = decoder->pcm.size();
    const size_t limit = static_cast<size_t>(std::numeric_limits<int>::max());
    if (held > limit) {
        return 0;
    }
    return static_cast<int>(held);
}
/* The decoder's stored timecode; 0.0 for a null decoder. */
double ac_decoder_timecode(fmpg_decoder * decoder)
{
    if (decoder == nullptr) {
        return 0.0;
    }
    return decoder->timecode;
}
/* Index of the stream this decoder was created for; -1 for a null decoder. */
int ac_decoder_stream_index(fmpg_decoder * decoder)
{
    if (decoder == nullptr) {
        return -1;
    }
    return decoder->stream_index;
}