Files
2026-06-08 08:48:50 +02:00

902 lines
25 KiB
C++

/*
* Audio-only FFmpeg wrapper with a plain C ABI.
*
* This implementation intentionally hides FFmpeg concepts from the public API:
*
* - no stream_index in the API;
* - no AVPacket/package object in the API;
* - no explicit decoder object in the API;
* - no metadata/tag API exposed to C callers.
*
* Internally the instance owns everything needed to decode one selected audio
* stream. The caller simply opens a file and repeatedly calls fmpg_decode_next().
*/
#include "ffmpeg_audio.h"
#include "../ffi_version.h"
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <vector>
extern "C" {
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavutil/avutil.h>
#include <libavutil/channel_layout.h>
#include <libavutil/samplefmt.h>
#include <libswresample/swresample.h>
}
/*
 * Diagnostic helpers writing "<type>:<function>: <message>\n" to stderr.
 *
 * `type` must be a string literal. For MSG1..MSG3 `msg` must be a printf-style
 * string literal consuming the trailing arguments; for MSG0 `msg` is printed
 * verbatim through "%s".
 *
 * Each macro is wrapped in do { } while (0) so it expands to exactly one
 * statement and stays correct inside an unbraced if/else. The line is emitted
 * with a single fprintf call so it is not split into three separate writes.
 */
#define MSG0(type, msg) \
    do { \
        fprintf(stderr, type ":%s: %s\n", __FUNCTION__, msg); \
    } while (0)
#define MSG1(type, msg, a) \
    do { \
        fprintf(stderr, type ":%s: " msg "\n", __FUNCTION__, a); \
    } while (0)
#define MSG2(type, msg, a, b) \
    do { \
        fprintf(stderr, type ":%s: " msg "\n", __FUNCTION__, a, b); \
    } while (0)
#define MSG3(type, msg, a, b, c) \
    do { \
        fprintf(stderr, type ":%s: " msg "\n", __FUNCTION__, a, b, c); \
    } while (0)

/* Informational messages. */
#define INFO0(msg) MSG0("info", msg)
#define INFO1(msg, a) MSG1("info", msg, a)
#define INFO2(msg, a, b) MSG2("info", msg, a, b)
#define INFO3(msg, a, b, c) MSG3("info", msg, a, b, c)

/* Error messages. */
#define ERROR0(msg) MSG0("error", msg)
#define ERROR1(msg, a) MSG1("error", msg, a)
#define ERROR2(msg, a, b) MSG2("error", msg, a, b)
#define ERROR3(msg, a, b, c) MSG3("error", msg, a, b, c)
/*
 * Fixed output PCM format: every decoded buffer is converted to
 * AV_SAMPLE_FMT_S32, i.e. 32 bits / 4 bytes per sample per channel.
 */
static constexpr AVSampleFormat FMPG_OUTPUT_FMT{AV_SAMPLE_FMT_S32};
static constexpr int FMPG_OUTPUT_BITS{32};
static constexpr int FMPG_OUTPUT_BYTES{FMPG_OUTPUT_BITS / 8};
/*
 * Cached description of the selected audio stream.
 * Negative durations / index mean "unknown" or "nothing selected".
 */
struct audio_info_storage {
    int audio_stream_count = 0;
    int selected_stream_index = -1; /* Internal FFmpeg stream index. */
    int sample_rate = 0;
    int channels = 0;
    int64_t duration_ms = -1;
    int64_t duration_samples = -1; /* Output sample frames. */

    /* Restore every field to its default-initialised value. */
    void clear()
    {
        *this = audio_info_storage{};
    }
};
/*
 * Decoder-side state owned by an instance: the FFmpeg codec context, the
 * reusable frame, the resampler, the PCM output buffer of the last decode
 * call and the bookkeeping of sample positions.
 *
 * The struct owns raw FFmpeg handles that are released in the destructor, so
 * copying is disabled: a member-wise copy would free the same handles twice.
 */
struct decoder_storage {
    const AVCodec *codec = nullptr;
    AVCodecContext *codec_ctx = nullptr;
    AVFrame *frame = nullptr;
    SwrContext *swr_ctx = nullptr;
    std::vector<uint8_t> pcm;
    bool eof_seen = false;
    bool decoder_drained = false;
    double timecode = 0.0;
    int64_t last_samples = 0;
    int64_t buffer_start_sample = 0;
    int64_t next_sample_position = 0;
    /* >= 0 while a seek has requested us to discard decoded pre-roll samples. */
    int64_t discard_until_sample = -1;

    decoder_storage() = default;
    decoder_storage(const decoder_storage &) = delete;
    decoder_storage &operator=(const decoder_storage &) = delete;

    /* Drop the current PCM buffer; the next buffer starts at the running position. */
    void clear_output()
    {
        pcm.clear();
        last_samples = 0;
        buffer_start_sample = next_sample_position;
    }

    /* Release all FFmpeg objects and reset every field to its initial value. */
    void free_ffmpeg()
    {
        avcodec_free_context(&codec_ctx);
        av_frame_free(&frame);
        swr_free(&swr_ctx);
        codec = nullptr;
        pcm.clear();
        eof_seen = false;
        decoder_drained = false;
        timecode = 0.0;
        last_samples = 0;
        buffer_start_sample = 0;
        next_sample_position = 0;
        discard_until_sample = -1;
    }

    ~decoder_storage()
    {
        free_ffmpeg();
    }
};
/*
 * One wrapper instance: the demuxer context plus the cached stream info and
 * decoder state for the selected audio stream.
 *
 * The instance owns format_ctx and closes it in the destructor, so copying is
 * disabled: a member-wise copy would close the same context twice.
 */
struct __fmpg_instance__ {
    bool opened = false;
    AVFormatContext *format_ctx = nullptr;
    audio_info_storage audio_info;
    decoder_storage decoder;

    __fmpg_instance__() = default;
    __fmpg_instance__(const __fmpg_instance__ &) = delete;
    __fmpg_instance__ &operator=(const __fmpg_instance__ &) = delete;

    ~__fmpg_instance__()
    {
        if (format_ctx) {
            avformat_close_input(&format_ctx);
        }
    }
};
static int count_audio_streams(const AVFormatContext *ctx)
{
if (!ctx) {
return 0;
}
int count = 0;
for (unsigned i = 0; i < ctx->nb_streams; ++i) {
const AVCodecParameters *par = ctx->streams[i]->codecpar;
if (par && par->codec_type == AVMEDIA_TYPE_AUDIO) {
++count;
}
}
return count;
}
/* Convert seconds to milliseconds, rounding half up; negative input yields -1. */
static int64_t milliseconds_from_seconds(double seconds)
{
    const bool unknown = seconds < 0.0;
    return unknown ? int64_t{-1}
                   : static_cast<int64_t>(seconds * 1000.0 + 0.5);
}
/*
 * Convert seconds to a sample-frame count at `sample_rate`, rounding half up.
 * Returns -1 for a negative time or a non-positive sample rate.
 */
static int64_t samples_from_seconds(double seconds, int sample_rate)
{
    const bool valid = !(seconds < 0.0) && sample_rate > 0;
    if (!valid) {
        return -1;
    }
    return static_cast<int64_t>(seconds * static_cast<double>(sample_rate) + 0.5);
}
/* Stream duration in seconds, or -1.0 when the stream is null or has no duration. */
static double stream_duration_seconds(const AVStream *stream)
{
    const bool known = stream != nullptr && stream->duration != AV_NOPTS_VALUE;
    if (!known) {
        return -1.0;
    }
    /* duration is expressed in units of the stream time base. */
    return static_cast<double>(stream->duration) * av_q2d(stream->time_base);
}
/* Container duration in seconds, or -1.0 when the context is null or has no duration. */
static double format_duration_seconds(const AVFormatContext *ctx)
{
    const bool known = ctx != nullptr && ctx->duration != AV_NOPTS_VALUE;
    if (!known) {
        return -1.0;
    }
    /* The container duration is expressed in AV_TIME_BASE units. */
    return static_cast<double>(ctx->duration) / static_cast<double>(AV_TIME_BASE);
}
/*
 * Convert a timestamp in `stream`'s time base to a sample-frame position at
 * `sample_rate`. Returns -1 when the stream is null, the timestamp is
 * AV_NOPTS_VALUE, the rate is not positive, or the resulting time is negative.
 */
static int64_t timestamp_to_samples(int64_t timestamp,
                                    const AVStream *stream,
                                    int sample_rate)
{
    const bool convertible = stream != nullptr &&
                             timestamp != AV_NOPTS_VALUE &&
                             sample_rate > 0;
    if (!convertible) {
        return -1;
    }
    return samples_from_seconds(
        static_cast<double>(timestamp) * av_q2d(stream->time_base),
        sample_rate);
}
/*
 * Select the best audio stream of the opened container and cache its
 * properties in self->audio_info.
 *
 * Returns false when no audio stream is found or its parameters are unusable
 * (non-audio, non-positive sample rate or channel count). The duration is
 * taken from the stream and falls back to the container duration.
 */
static bool fill_audio_info(fmpg_instance *self)
{
    audio_info_storage &info = self->audio_info;
    AVFormatContext *fmt = self->format_ctx;

    info.clear();
    info.audio_stream_count = count_audio_streams(fmt);

    const int chosen =
        av_find_best_stream(fmt, AVMEDIA_TYPE_AUDIO, -1, -1, nullptr, 0);
    if (chosen < 0) {
        return false;
    }

    AVStream *stream = fmt->streams[chosen];
    const AVCodecParameters *params = stream->codecpar;
    const bool usable = params != nullptr &&
                        params->codec_type == AVMEDIA_TYPE_AUDIO &&
                        params->sample_rate > 0 &&
                        params->ch_layout.nb_channels > 0;
    if (!usable) {
        return false;
    }

    info.selected_stream_index = chosen;
    info.sample_rate = params->sample_rate;
    info.channels = params->ch_layout.nb_channels;

    const double stream_seconds = stream_duration_seconds(stream);
    const double seconds = stream_seconds < 0.0 ? format_duration_seconds(fmt)
                                                : stream_seconds;
    info.duration_ms = milliseconds_from_seconds(seconds);
    info.duration_samples = samples_from_seconds(seconds, info.sample_rate);
    return true;
}
static bool instance_ready(const fmpg_instance *instance)
{
return instance && instance->opened && instance->format_ctx &&
instance->audio_info.selected_stream_index >= 0 &&
instance->decoder.codec_ctx && instance->decoder.swr_ctx;
}
/*
 * Find, allocate and open the decoder for the selected audio stream and
 * allocate the reusable frame.
 *
 * Returns false on any failure. Partially created objects stay in
 * self->decoder and are released by decoder_storage::free_ffmpeg().
 */
static bool init_codec_context(fmpg_instance *self)
{
    decoder_storage &dec = self->decoder;
    const int stream_index = self->audio_info.selected_stream_index;
    const AVStream *stream = self->format_ctx->streams[stream_index];
    const AVCodecParameters *par = stream->codecpar;

    dec.codec = avcodec_find_decoder(par->codec_id);
    if (!dec.codec) {
        return false;
    }
    dec.codec_ctx = avcodec_alloc_context3(dec.codec);
    if (!dec.codec_ctx) {
        return false;
    }
    if (avcodec_parameters_to_context(dec.codec_ctx, par) < 0) {
        return false;
    }
    /*
     * Tell the decoder which time base packet timestamps use. Without it
     * libavcodec cannot adjust frame timestamps when it skips or trims
     * samples (encoder delay / padding).
     */
    dec.codec_ctx->pkt_timebase = stream->time_base;
    if (avcodec_open2(dec.codec_ctx, dec.codec, nullptr) < 0) {
        return false;
    }
    dec.frame = av_frame_alloc();
    return dec.frame != nullptr;
}
/*
 * Create and initialise the resampler that converts decoder output to
 * FMPG_OUTPUT_FMT, keeping the decoder's channel layout and sample rate.
 * Returns false when the decoder parameters are invalid or libswresample fails.
 */
static bool init_resampler(fmpg_instance *self)
{
    decoder_storage &dec = self->decoder;
    const AVCodecContext *codec = dec.codec_ctx;
    const AVChannelLayout *layout = &codec->ch_layout;
    const int rate = codec->sample_rate;

    if (layout->nb_channels <= 0 || rate <= 0) {
        return false;
    }

    /* Output side first, then input side; only the sample format differs. */
    const int rc = swr_alloc_set_opts2(&dec.swr_ctx,
                                       layout, FMPG_OUTPUT_FMT, rate,
                                       layout, codec->sample_fmt, rate,
                                       0, nullptr);
    if (rc < 0) {
        return false;
    }
    return swr_init(dec.swr_ctx) >= 0;
}
/* Reset the decoder state, then build the codec context and the resampler. */
static bool init_decoder(fmpg_instance *self)
{
    self->decoder.free_ffmpeg();
    if (!init_codec_context(self)) {
        return false;
    }
    return init_resampler(self);
}
/*
 * Compare the FFmpeg major versions this file was compiled against with the
 * ones of the libraries loaded at run time. Every mismatch is logged to
 * stderr. Returns 1 when all four libraries match, 0 otherwise.
 */
int fmpg_compatible_ffmpeg()
{
    /* Logs a mismatch for one library and reports whether the majors agree. */
    const auto same_major = [](const char *lib, int built, int loaded) {
        if (built != loaded) {
            ERROR3("FFMPEG %s Major versions not equal. Compile time: %d, runtime: %d", lib, built, loaded);
            return false;
        }
        return true;
    };

    const int loaded_avformat = AV_VERSION_MAJOR(avformat_version());
    const int loaded_avcodec = AV_VERSION_MAJOR(avcodec_version());
    const int loaded_swresample = AV_VERSION_MAJOR(swresample_version());
    const int loaded_avutil = AV_VERSION_MAJOR(avutil_version());

    /* All four checks always run so every mismatch gets reported. */
    const bool avformat_ok =
        same_major("AVFormat", LIBAVFORMAT_VERSION_MAJOR, loaded_avformat);
    const bool avcodec_ok =
        same_major("AVCodec", LIBAVCODEC_VERSION_MAJOR, loaded_avcodec);
    const bool swresample_ok =
        same_major("SWResample", LIBSWRESAMPLE_VERSION_MAJOR, loaded_swresample);
    const bool avutil_ok =
        same_major("AVUtil", LIBAVUTIL_VERSION_MAJOR, loaded_avutil);

    return (avformat_ok && avcodec_ok && swresample_ok && avutil_ok) ? 1 : 0;
}
/*
 * Create a new, closed instance.
 *
 * Returns nullptr when the run-time FFmpeg libraries do not match the major
 * versions this file was compiled against, or when allocation fails. The
 * result must be released with fmpg_free().
 */
fmpg_instance *fmpg_init(void)
{
    if (!fmpg_compatible_ffmpeg()) {
        ERROR0("Compiled major ffmpeg version != runtime major version, not compatible.");
        return nullptr;
    }
    try {
        return new fmpg_instance();
    } catch (...) {
        /* Never let a C++ exception cross the C ABI. */
        return nullptr;
    }
}
/* Destroy an instance created by fmpg_init(); a null pointer is ignored. */
void fmpg_free(fmpg_instance *instance)
{
    if (instance == nullptr) {
        return;
    }
    delete instance;
}
/*
 * Open `filename`, select its best audio stream and prepare the decoder.
 *
 * Returns 1 on success. Returns 0 when the arguments are invalid, the
 * instance is already open or still holds a format context, or any FFmpeg
 * step fails; on failure the instance is closed again.
 */
int fmpg_open_file(fmpg_instance *instance, const char *filename)
{
    const bool usable = instance != nullptr && filename != nullptr &&
                        !instance->opened && instance->format_ctx == nullptr;
    if (!usable) {
        return 0;
    }

    /* Each step only runs when all previous ones succeeded. */
    bool ok = avformat_open_input(&instance->format_ctx,
                                  filename, nullptr, nullptr) >= 0;
    ok = ok && avformat_find_stream_info(instance->format_ctx, nullptr) >= 0;
    ok = ok && fill_audio_info(instance);
    ok = ok && init_decoder(instance);

    if (!ok) {
        fmpg_close(instance);
        return 0;
    }
    instance->opened = true;
    return 1;
}
/*
 * Release the decoder and the format context and mark the instance closed.
 * The instance itself stays valid and can open another file afterwards.
 */
void fmpg_close(fmpg_instance *instance)
{
    if (instance == nullptr) {
        return;
    }

    instance->opened = false;
    instance->audio_info.clear();

    /* Decoder objects go first, then the demuxer they were created from. */
    instance->decoder.free_ffmpeg();
    if (instance->format_ctx != nullptr) {
        avformat_close_input(&instance->format_ctx);
    }
}
/* 1 when the instance is open and ready to decode, 0 otherwise. */
int fmpg_is_open(fmpg_instance *instance)
{
    if (instance_ready(instance)) {
        return 1;
    }
    return 0;
}
/* Number of audio streams found in the opened file; 0 when not open. */
int fmpg_audio_stream_count(fmpg_instance *instance)
{
    if (instance == nullptr || !instance->opened) {
        return 0;
    }
    return instance->audio_info.audio_stream_count;
}
/* Sample rate of the selected audio stream; 0 when the instance is not ready. */
int fmpg_audio_sample_rate(fmpg_instance *instance)
{
    if (!instance_ready(instance)) {
        return 0;
    }
    return instance->audio_info.sample_rate;
}
/* Channel count of the selected audio stream; 0 when the instance is not ready. */
int fmpg_audio_channels(fmpg_instance *instance)
{
    if (!instance_ready(instance)) {
        return 0;
    }
    return instance->audio_info.channels;
}
/* Bits per output sample; fixed by the output format, independent of the instance. */
int fmpg_audio_bits_per_sample(fmpg_instance * /* instance */)
{
    const int bits = FMPG_OUTPUT_BITS;
    return bits;
}
/* Bytes per output sample; fixed by the output format, independent of the instance. */
int fmpg_audio_bytes_per_sample(fmpg_instance * /* instance */)
{
    const int bytes = FMPG_OUTPUT_BYTES;
    return bytes;
}
/* Cached duration in milliseconds; -1 when the instance is not ready. */
int64_t fmpg_duration_ms(fmpg_instance *instance)
{
    if (!instance_ready(instance)) {
        return -1;
    }
    return instance->audio_info.duration_ms;
}
/* Cached duration in output sample frames; -1 when the instance is not ready. */
int64_t fmpg_duration_samples(fmpg_instance *instance)
{
    if (!instance_ready(instance)) {
        return -1;
    }
    return instance->audio_info.duration_samples;
}
/*
 * Append `bytes` bytes from `src` to the PCM output buffer.
 *
 * Returns false when the buffer would grow beyond INT_MAX bytes (its size is
 * reported to callers as an int) or when the allocation fails.
 */
static bool append_bytes(decoder_storage &dec,
                         const uint8_t *src,
                         size_t bytes)
{
    if (bytes == 0) {
        return true;
    }

    const size_t limit = static_cast<size_t>(std::numeric_limits<int>::max());
    if (bytes > limit - dec.pcm.size()) {
        return false;
    }

    try {
        dec.pcm.insert(dec.pcm.end(), src, src + bytes);
    } catch (...) {
        return false;
    }
    return true;
}
/*
 * Convert one decoded frame to the output format and append the samples to
 * the PCM buffer, honouring a pending post-seek discard request.
 *
 * Returns true when the frame was handled (including "nothing to append"),
 * false on a conversion or allocation error.
 */
static bool append_converted_frame(fmpg_instance *self,
                                   const AVFrame *frame)
{
    decoder_storage &dec = self->decoder;
    const int channels = self->audio_info.channels;
    const int sample_rate = self->audio_info.sample_rate;
    if (channels <= 0 || frame->nb_samples <= 0) {
        return true;
    }
    const int max_out_samples = swr_get_out_samples(dec.swr_ctx,
                                                    frame->nb_samples);
    if (max_out_samples <= 0) {
        return false;
    }
    const int max_bytes = av_samples_get_buffer_size(nullptr,
                                                     channels,
                                                     max_out_samples,
                                                     FMPG_OUTPUT_FMT,
                                                     1);
    if (max_bytes <= 0) {
        return false;
    }
    /* Allocation failure must not escape as an exception through the C ABI. */
    std::vector<uint8_t> tmp;
    try {
        tmp.resize(static_cast<size_t>(max_bytes));
    } catch (...) {
        return false;
    }
    uint8_t *out_planes[1] = { tmp.data() };
    /*
     * extended_data (not data) is the complete per-channel pointer array:
     * data[] only has room for AV_NUM_DATA_POINTERS planes, which is not
     * enough for planar audio with more channels.
     */
    const int out_samples = swr_convert(dec.swr_ctx,
                                        out_planes,
                                        max_out_samples,
                                        const_cast<const uint8_t **>(frame->extended_data),
                                        frame->nb_samples);
    if (out_samples < 0) {
        return false;
    }
    if (out_samples == 0) {
        /* The resampler buffered the input; this is not an error. */
        return true;
    }
    const int used_bytes = av_samples_get_buffer_size(nullptr,
                                                      channels,
                                                      out_samples,
                                                      FMPG_OUTPUT_FMT,
                                                      1);
    if (used_bytes < 0) {
        return false;
    }
    const int stream_index = self->audio_info.selected_stream_index;
    const AVStream *stream = self->format_ctx->streams[stream_index];
    int64_t frame_start = timestamp_to_samples(frame->best_effort_timestamp,
                                               stream,
                                               sample_rate);
    if (frame_start < 0) {
        /* No usable timestamp: continue from the running position. */
        frame_start = dec.next_sample_position;
    }
    int64_t keep_start = frame_start;
    int keep_samples = out_samples;
    size_t byte_offset = 0;
    /*
     * After seeking, FFmpeg may first return decoded samples from before the
     * requested position. Discard them so public sample positions refer to the
     * actual music position requested by the caller.
     */
    if (dec.discard_until_sample >= 0) {
        const int64_t target = dec.discard_until_sample;
        const int64_t frame_end = frame_start + out_samples;
        if (frame_end <= target) {
            /* Whole frame lies before the target; keep discarding. */
            dec.next_sample_position = frame_end;
            return true;
        }
        if (frame_start < target) {
            /* Frame straddles the target: drop its leading part. */
            const int64_t drop = target - frame_start;
            if (drop > 0 && drop < out_samples) {
                byte_offset = static_cast<size_t>(drop) *
                              static_cast<size_t>(channels) *
                              FMPG_OUTPUT_BYTES;
                keep_samples = static_cast<int>(out_samples - drop);
                keep_start = target;
            }
        }
        dec.discard_until_sample = -1;
    }
    if (keep_samples <= 0) {
        dec.next_sample_position = frame_start + out_samples;
        return true;
    }
    if (dec.pcm.empty()) {
        /* First samples of this buffer define its start position. */
        dec.buffer_start_sample = keep_start;
        dec.timecode = static_cast<double>(keep_start) /
                       static_cast<double>(sample_rate);
    }
    const size_t keep_bytes = static_cast<size_t>(keep_samples) *
                              static_cast<size_t>(channels) *
                              FMPG_OUTPUT_BYTES;
    if (!append_bytes(dec, tmp.data() + byte_offset, keep_bytes)) {
        return false;
    }
    dec.last_samples += keep_samples;
    dec.next_sample_position = keep_start + keep_samples;
    return true;
}
/*
 * Pull every frame the decoder currently has ready and append it to the PCM
 * buffer.
 *
 * Returns 1 when a frame was received and the buffer holds samples, 0 when
 * not, and -1 on a decode or conversion error. Sets decoder_drained once the
 * decoder reports end of stream.
 */
static int receive_available_frames(fmpg_instance *self)
{
    decoder_storage &dec = self->decoder;
    bool got_samples = false;

    while (true) {
        const int status = avcodec_receive_frame(dec.codec_ctx, dec.frame);
        if (status == AVERROR_EOF) {
            dec.decoder_drained = true;
            break;
        }
        if (status == AVERROR(EAGAIN)) {
            break; /* Decoder needs more input. */
        }
        if (status < 0) {
            return -1;
        }

        const bool appended = append_converted_frame(self, dec.frame);
        av_frame_unref(dec.frame);
        if (!appended) {
            return -1;
        }
        if (dec.last_samples > 0) {
            got_samples = true;
        }
    }
    return got_samples ? 1 : 0;
}
/*
 * Read packets until one belongs to the selected audio stream; packets of
 * other streams are dropped. Returns false as soon as av_read_frame() fails
 * (end of file or read error).
 */
static bool read_selected_audio_packet(fmpg_instance *self, AVPacket *pkt)
{
    const int wanted = self->audio_info.selected_stream_index;

    while (av_read_frame(self->format_ctx, pkt) >= 0) {
        if (pkt->stream_index == wanted) {
            return true;
        }
        av_packet_unref(pkt);
    }
    return false;
}
/*
 * Flush the samples still buffered inside the resampler into the PCM buffer.
 *
 * Returns 1 when samples were appended, 0 when nothing was left, and -1 when
 * memory for the scratch or PCM buffer could not be obtained.
 */
static int drain_resampler(fmpg_instance *self)
{
    decoder_storage &dec = self->decoder;
    const int channels = self->audio_info.channels;
    const int sample_rate = self->audio_info.sample_rate;
    int produced = 0;
    /* Scratch buffer reused across iterations. */
    std::vector<uint8_t> tmp;
    for (;;) {
        const int delay = static_cast<int>(swr_get_delay(dec.swr_ctx,
                                                         sample_rate));
        if (delay <= 0) {
            break;
        }
        const int max_bytes = av_samples_get_buffer_size(nullptr,
                                                         channels,
                                                         delay,
                                                         FMPG_OUTPUT_FMT,
                                                         1);
        if (max_bytes <= 0) {
            break;
        }
        /* Allocation failure must not escape as an exception through the C ABI. */
        try {
            tmp.resize(static_cast<size_t>(max_bytes));
        } catch (...) {
            return -1;
        }
        uint8_t *out_planes[1] = { tmp.data() };
        /* Null input asks the resampler to flush its internal buffer. */
        const int out_samples = swr_convert(dec.swr_ctx,
                                            out_planes,
                                            delay,
                                            nullptr,
                                            0);
        if (out_samples <= 0) {
            break;
        }
        const int used_bytes = av_samples_get_buffer_size(nullptr,
                                                          channels,
                                                          out_samples,
                                                          FMPG_OUTPUT_FMT,
                                                          1);
        if (used_bytes < 0) {
            break;
        }
        if (dec.pcm.empty()) {
            dec.buffer_start_sample = dec.next_sample_position;
            dec.timecode = static_cast<double>(dec.buffer_start_sample) /
                           static_cast<double>(sample_rate);
        }
        if (!append_bytes(dec, tmp.data(), static_cast<size_t>(used_bytes))) {
            return -1;
        }
        dec.last_samples += out_samples;
        dec.next_sample_position += out_samples;
        produced = 1;
    }
    return produced;
}
/*
 * Decode the next chunk of audio into the instance's PCM buffer.
 *
 * The previous buffer is discarded first. Packets of the selected stream are
 * fed to the decoder until at least one sample is available; at end of file
 * the decoder and then the resampler are drained.
 *
 * Returns 1 when the buffer holds new samples, 0 at end of stream, on error,
 * or when the instance is not ready.
 */
int fmpg_decode_next(fmpg_instance *instance)
{
    if (!instance_ready(instance)) {
        return 0;
    }
    decoder_storage &dec = instance->decoder;
    dec.clear_output();

    /* First return any frames that are already pending in the decoder. */
    int produced = receive_available_frames(instance);
    if (produced < 0) {
        return 0;
    }
    if (produced > 0 && !dec.pcm.empty()) {
        return 1;
    }

    /* Scoped owner: every return path below releases the packet. */
    struct packet_guard {
        AVPacket *pkt;
        ~packet_guard() { av_packet_free(&pkt); }
    } guard{ av_packet_alloc() };
    if (!guard.pkt) {
        return 0;
    }

    while (!dec.eof_seen) {
        if (!read_selected_audio_packet(instance, guard.pkt)) {
            dec.eof_seen = true;
            av_packet_unref(guard.pkt);
            break;
        }
        int ret = avcodec_send_packet(dec.codec_ctx, guard.pkt);
        if (ret == AVERROR(EAGAIN)) {
            /*
             * The decoder did not consume the packet because it has output
             * pending. Drain that output, then offer the same packet again
             * instead of dropping it.
             */
            produced = receive_available_frames(instance);
            if (produced < 0) {
                return 0;
            }
            ret = avcodec_send_packet(dec.codec_ctx, guard.pkt);
        }
        av_packet_unref(guard.pkt);
        if (ret < 0) {
            return 0;
        }
        produced = receive_available_frames(instance);
        if (produced < 0) {
            return 0;
        }
        /* Samples may stem from either receive call above. */
        if (!dec.pcm.empty()) {
            return 1;
        }
    }

    if (!dec.decoder_drained) {
        /* A null packet puts the decoder into draining mode. */
        const int ret = avcodec_send_packet(dec.codec_ctx, nullptr);
        if (ret < 0 && ret != AVERROR_EOF) {
            return 0;
        }
        produced = receive_available_frames(instance);
        if (produced < 0) {
            return 0;
        }
        if (produced > 0 && !dec.pcm.empty()) {
            return 1;
        }
    }

    produced = drain_resampler(instance);
    return produced > 0 && !dec.pcm.empty() ? 1 : 0;
}
/*
 * Seek to `target_pos_ms` milliseconds.
 *
 * The demuxer seeks backwards to a position at or before the target; the
 * decoder and resampler are reset and the samples decoded before the target
 * are marked for discarding. Returns 1 on success, 0 on invalid arguments or
 * when seeking / re-initialising the resampler fails.
 */
int fmpg_seek_ms(fmpg_instance *instance, int64_t target_pos_ms)
{
    if (target_pos_ms < 0 || !instance_ready(instance)) {
        return 0;
    }

    const int index = instance->audio_info.selected_stream_index;
    AVStream *stream = instance->format_ctx->streams[index];

    /* milliseconds -> AV_TIME_BASE units -> stream time base */
    const int64_t target_us = av_rescale(target_pos_ms, AV_TIME_BASE, 1000);
    const int64_t target_ts =
        av_rescale_q(target_us, AV_TIME_BASE_Q, stream->time_base);

    const int seek_rc = av_seek_frame(instance->format_ctx,
                                      index,
                                      target_ts,
                                      AVSEEK_FLAG_BACKWARD);
    if (seek_rc < 0) {
        return 0;
    }

    decoder_storage &dec = instance->decoder;
    const double target_seconds = target_pos_ms / 1000.0;
    const int64_t target_samples =
        samples_from_seconds(target_seconds, instance->audio_info.sample_rate);

    /* Throw away everything buffered from before the seek. */
    avcodec_flush_buffers(dec.codec_ctx);
    swr_close(dec.swr_ctx);
    if (swr_init(dec.swr_ctx) < 0) {
        return 0;
    }

    const int64_t start_sample = std::max<int64_t>(target_samples, 0);
    dec.pcm.clear();
    dec.last_samples = 0;
    dec.buffer_start_sample = start_sample;
    dec.next_sample_position = start_sample;
    dec.discard_until_sample = target_samples;
    dec.timecode = target_seconds;
    dec.eof_seen = false;
    dec.decoder_drained = false;
    return 1;
}
/* Pointer to the current PCM buffer, or nullptr when there is no instance or no data. */
const uint8_t *fmpg_buffer(fmpg_instance *instance)
{
    if (instance == nullptr || instance->decoder.pcm.empty()) {
        return nullptr;
    }
    return instance->decoder.pcm.data();
}
/* Size of the current PCM buffer in bytes; 0 without an instance or when it does not fit an int. */
int fmpg_buffer_size(fmpg_instance *instance)
{
    if (instance == nullptr) {
        return 0;
    }
    const size_t bytes = instance->decoder.pcm.size();
    const size_t limit = static_cast<size_t>(std::numeric_limits<int>::max());
    return bytes > limit ? 0 : static_cast<int>(bytes);
}
/* Number of sample frames in the current PCM buffer; 0 without an instance. */
int64_t fmpg_buffer_samples(fmpg_instance *instance)
{
    if (instance == nullptr) {
        return 0;
    }
    return instance->decoder.last_samples;
}
/* Sample position of the first frame in the current PCM buffer; 0 without an instance. */
int64_t fmpg_buffer_start_sample(fmpg_instance *instance)
{
    if (instance == nullptr) {
        return 0;
    }
    return instance->decoder.buffer_start_sample;
}
/* Sample position just past the current PCM buffer (start + length); 0 without an instance. */
int64_t fmpg_buffer_end_sample(fmpg_instance *instance)
{
    if (instance == nullptr) {
        return 0;
    }
    const decoder_storage &dec = instance->decoder;
    return dec.buffer_start_sample + dec.last_samples;
}
/* Running sample position of the decoder; 0 without an instance. */
int64_t fmpg_sample_position(fmpg_instance *instance)
{
    if (instance == nullptr) {
        return 0;
    }
    return instance->decoder.next_sample_position;
}
/* Stored timecode in seconds (buffer start or last seek target); 0.0 without an instance. */
double fmpg_timecode(fmpg_instance *instance)
{
    if (instance == nullptr) {
        return 0.0;
    }
    return instance->decoder.timecode;
}
/*
 * Human-readable summary of the FFmpeg library versions this file was
 * compiled against.
 *
 * The text is formatted once into static storage, so the returned pointer
 * stays valid for the lifetime of the program and must not be freed.
 */
const char *fmpg_ffmpeg_version()
{
    /* Formatted on first use; no heap allocation that could fail. */
    struct formatted_version {
        char text[256];
        formatted_version()
        {
            snprintf(text, sizeof(text),
                     "avformat: %d.%d.%d (%d), avcodec: %d.%d.%d (%d), swresample: %d.%d.%d (%d), avutil: %d.%d.%d (%d)",
                     LIBAVFORMAT_VERSION_MAJOR, LIBAVFORMAT_VERSION_MINOR, LIBAVFORMAT_VERSION_MICRO, LIBAVFORMAT_VERSION_INT,
                     LIBAVCODEC_VERSION_MAJOR, LIBAVCODEC_VERSION_MINOR, LIBAVCODEC_VERSION_MICRO, LIBAVCODEC_VERSION_INT,
                     LIBSWRESAMPLE_VERSION_MAJOR, LIBSWRESAMPLE_VERSION_MINOR, LIBSWRESAMPLE_VERSION_MICRO, LIBSWRESAMPLE_VERSION_INT,
                     LIBAVUTIL_VERSION_MAJOR, LIBAVUTIL_VERSION_MINOR, LIBAVUTIL_VERSION_MICRO, LIBAVUTIL_VERSION_INT);
        }
    };
    static const formatted_version version;
    return version.text;
}
/*
 * Format a packed FFmpeg version integer as "major.minor.micro".
 *
 * The result lives in a static buffer that is overwritten by the next call;
 * it must not be freed, and concurrent calls are not synchronised.
 */
const char *fmpg_int_version2string(int ver)
{
    /* Static array instead of an unchecked malloc that could return null. */
    static char version[64];
    const unsigned major = static_cast<unsigned>(AV_VERSION_MAJOR(ver));
    const unsigned minor = static_cast<unsigned>(AV_VERSION_MINOR(ver));
    const unsigned micro = static_cast<unsigned>(AV_VERSION_MICRO(ver));
    snprintf(version, sizeof(version), "%u.%u.%u", major, minor, micro);
    return version;
}
/* Version number of this wrapper, as reported by ffi_version(). */
int fmpg_version()
{
    const int version = ffi_version();
    return version;
}