/*
 * Acinerella audio-only decoder.
 *
 * This file is intentionally written as C++ internally, but exports a stable
 * C ABI. That gives us RAII, std::string and std::vector internally, while a
 * C or Racket FFI caller still sees a simple C interface.
 *
 * What this decoder does:
 *
 *   1. Open a media file with FFmpeg/libavformat.
 *   2. Find audio streams.
 *   3. Read compressed packets from the container.
 *   4. Decode packets with the modern avcodec_send_packet() /
 *      avcodec_receive_frame() API.
 *   5. Convert decoded audio to one predictable output format:
 *
 *        signed 32-bit integer PCM
 *        interleaved / packed
 *        native endian
 *
 * This is suitable for feeding to libao as 32-bit PCM.
 *
 * Important FFmpeg vocabulary:
 *
 *   Container/demuxer:
 *     The file format layer: mp3, mp4/m4a, ogg, wav, etc.
 *     FFmpeg represents this with AVFormatContext.
 *
 *   Stream:
 *     A file may contain one or more streams. For this audio-only API we only
 *     care about streams whose codec_type is AVMEDIA_TYPE_AUDIO.
 *
 *   Packet:
 *     Compressed data belonging to one stream. One packet may decode to zero,
 *     one, or multiple decoded frames.
 *
 *   Frame:
 *     Decoded audio samples, but not necessarily in the format we want. MP3,
 *     for example, may decode to planar float. We therefore use libswresample
 *     to normalize everything to signed 32-bit interleaved PCM.
 *
 * Error handling rule for this file: no C++ exception may leave a function
 * that is reachable from the exported ac_* API, because the callers are C or
 * FFI code. Every allocation that can throw is therefore wrapped locally.
 */

#include "ffmpeg_audio.h"

/*
 * NOTE(review): the header names below were reconstructed from the symbols
 * this file uses -- confirm them against the build before merging.
 */
#include <cerrno>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <new>
#include <string>
#include <vector>

extern "C" {
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavutil/channel_layout.h>
#include <libavutil/mathematics.h>
#include <libavutil/samplefmt.h>
#include <libswresample/swresample.h>
}

static constexpr int AC_AUDIO_OUTPUT_BITS = 32;
static constexpr int AC_AUDIO_OUTPUT_BYTES = 4;
static constexpr AVSampleFormat AC_AUDIO_OUTPUT_FMT = AV_SAMPLE_FMT_S32;

/*
 * Metadata.
 *
 * This used to be the kind of place where C code often used fixed-size arrays:
 *
 *   char title[512];
 *
 * That is simple, but truncates long UTF-8 metadata and wastes space. Since the
 * implementation is C++, std::string is the natural representation. The public
 * C API only exposes const char* getters.
 *
 * Numeric fields use -1 for "unknown".
 */
struct __fmpg_file_info__ {
    std::string title;
    std::string author;
    std::string album;
    std::string genre;
    std::string comment;
    std::string copyright;
    int year = -1;
    int track = -1;
    int64_t duration = -1; /* milliseconds */
    int bitrate = -1;

    /* Reset every field to its "unknown" value. */
    void clear()
    {
        title.clear();
        author.clear();
        album.clear();
        genre.clear();
        comment.clear();
        copyright.clear();
        year = -1;
        track = -1;
        duration = -1;
        bitrate = -1;
    }
};

/*
 * __fmpg_instance__ owns the opened media file.
 *
 * AVFormatContext is FFmpeg's demuxer/container object. It knows which streams
 * the file contains and can read compressed packets from it.
 *
 * Copying is disabled: two copies would both close the same AVFormatContext.
 */
struct __fmpg_instance__ {
    bool opened = false;
    AVFormatContext *format_ctx = nullptr;
    fmpg_file_info info;

    __fmpg_instance__() = default;
    __fmpg_instance__(const __fmpg_instance__ &) = delete;
    __fmpg_instance__ &operator=(const __fmpg_instance__ &) = delete;

    ~__fmpg_instance__()
    {
        if (format_ctx) {
            avformat_close_input(&format_ctx);
        }
    }
};

/*
 * A package wraps one FFmpeg AVPacket.
 *
 * The old Acinerella name was "package". FFmpeg calls this a packet. It is not
 * decoded audio yet; it is compressed data read from the container.
 *
 * `packet` may be null if av_packet_alloc() failed; creators must check it.
 * Copying is disabled because the destructor frees the packet.
 */
struct __fmpg_package__ {
    int stream_index = -1;
    int64_t pts = AV_NOPTS_VALUE;
    AVPacket *packet = nullptr;

    __fmpg_package__() : packet(av_packet_alloc()) {}
    __fmpg_package__(const __fmpg_package__ &) = delete;
    __fmpg_package__ &operator=(const __fmpg_package__ &) = delete;

    ~__fmpg_package__() { av_packet_free(&packet); }
};

/*
 * __fmpg_decoder__ owns the actual audio decoder and resampler for one stream.
 *
 * `instance` is a non-owning back pointer; this struct never frees it.
 * `swr_in_fmt` / `swr_channels` record what the resampler was configured with
 * so that later frames can be checked against it.
 * `pcm` holds the output of the most recent decode/flush call; `scratch` is a
 * reusable conversion buffer so that no allocation is needed per frame once
 * it has grown to its working size.
 */
struct __fmpg_decoder__ {
    fmpg_instance *instance = nullptr;
    int stream_index = -1;
    const AVCodec *codec = nullptr;
    AVCodecContext *codec_ctx = nullptr;
    AVFrame *frame = nullptr;
    SwrContext *swr_ctx = nullptr;
    AVSampleFormat swr_in_fmt = AV_SAMPLE_FMT_NONE;
    int swr_channels = 0;
    fmpg_audio_info audio_info{};
    std::vector<uint8_t> pcm;
    std::vector<uint8_t> scratch;
    double timecode = 0.0;

    __fmpg_decoder__() = default;
    __fmpg_decoder__(const __fmpg_decoder__ &) = delete;
    __fmpg_decoder__ &operator=(const __fmpg_decoder__ &) = delete;

    ~__fmpg_decoder__()
    {
        /* All three FFmpeg free functions accept a pointer to a null handle. */
        avcodec_free_context(&codec_ctx);
        av_frame_free(&frame);
        swr_free(&swr_ctx);
    }
};

/* Map a null C string to "" so it can be passed where a string is required. */
static const char *empty_if_null(const char *s)
{
    return s ? s : "";
}

/* C-string view of `s`; c_str() is already "" for an empty string. */
static const char *string_c_str(const std::string &s)
{
    return s.c_str();
}

/* Container-level tag `key` as a string, or an empty string if absent. */
static std::string get_metadata_string(const AVFormatContext *ctx, const char *key)
{
    const AVDictionaryEntry *entry = av_dict_get(ctx->metadata, key, nullptr, 0);
    return entry && entry->value ? std::string(entry->value) : std::string();
}

/*
 * Leading decimal integer of container-level tag `key`, or -1 if the tag is
 * absent, does not start with a number, or does not fit in an int.
 *
 * Only the leading integer is parsed, so a value such as "3/12" yields 3.
 */
static int get_metadata_int(const AVFormatContext *ctx, const char *key)
{
    const AVDictionaryEntry *entry = av_dict_get(ctx->metadata, key, nullptr, 0);
    if (!entry || !entry->value || !*entry->value) {
        return -1;
    }

    char *end = nullptr;
    errno = 0;
    const long value = std::strtol(entry->value, &end, 10);
    if (end == entry->value || errno == ERANGE ||
        value < std::numeric_limits<int>::min() ||
        value > std::numeric_limits<int>::max()) {
        return -1;
    }
    return static_cast<int>(value);
}

/*
 * Populate self->info from the container-level metadata of the opened file.
 *
 * May throw std::bad_alloc (std::string assignments); the caller is
 * responsible for keeping that from crossing the C ABI.
 */
static void fill_metadata(fmpg_instance *self)
{
    AVFormatContext *ctx = self->format_ctx;
    fmpg_file_info &info = self->info;

    info.clear();
    info.title = get_metadata_string(ctx, "title");
    info.author = get_metadata_string(ctx, "artist");
    info.album = get_metadata_string(ctx, "album");
    info.genre = get_metadata_string(ctx, "genre");
    info.comment = get_metadata_string(ctx, "comment");
    info.copyright = get_metadata_string(ctx, "copyright");

    /*
     * FFmpeg's generic tag for the release date is "date"; "year" is only
     * present when a demuxer passes it through verbatim. Try both.
     */
    info.year = get_metadata_int(ctx, "year");
    if (info.year < 0) {
        info.year = get_metadata_int(ctx, "date");
    }

    info.track = get_metadata_int(ctx, "track");
    info.bitrate = static_cast<int>(ctx->bit_rate);
    info.duration = ctx->duration == AV_NOPTS_VALUE
                        ? -1
                        : ctx->duration * 1000 / AV_TIME_BASE;
}

/* True if `instance` is open and `stream_index` addresses one of its streams. */
static bool valid_stream_index(const fmpg_instance *instance, int stream_index)
{
    return instance && instance->opened && instance->format_ctx &&
           stream_index >= 0 &&
           stream_index < static_cast<int>(instance->format_ctx->nb_streams);
}

/* Allocate a closed instance. Returns null on allocation failure. */
fmpg_instance *ac_init(void)
{
    try {
        return new fmpg_instance();
    } catch (...) {
        return nullptr;
    }
}

/* Destroy an instance (closing its file if still open). Null is accepted. */
void ac_free(fmpg_instance *instance)
{
    delete instance;
}

/*
 * Open `filename` on a not-yet-opened instance and read its metadata.
 * Returns 1 on success, 0 on any failure (the instance is left closed).
 */
int ac_open_file(fmpg_instance *instance, const char *filename)
{
    if (!instance || instance->opened || !filename) {
        return 0;
    }

    /*
     * avformat_open_input opens the file and guesses the container format.
     * The codec is not opened here. This is only the demuxing layer.
     */
    if (avformat_open_input(&instance->format_ctx, empty_if_null(filename),
                            nullptr, nullptr) < 0) {
        ac_close(instance);
        return 0;
    }

    /*
     * Read enough packets to discover stream metadata such as sample rate,
     * channel layout, codec id, duration and tags.
     */
    if (avformat_find_stream_info(instance->format_ctx, nullptr) < 0) {
        ac_close(instance);
        return 0;
    }

    /* Copying tags allocates; do not let bad_alloc reach the C caller. */
    try {
        fill_metadata(instance);
    } catch (...) {
        ac_close(instance);
        return 0;
    }

    instance->opened = true;
    return 1;
}

/* Close the file (if any) and reset the metadata. Null is accepted. */
void ac_close(fmpg_instance *instance)
{
    if (!instance) {
        return;
    }

    if (instance->format_ctx) {
        avformat_close_input(&instance->format_ctx);
    }

    instance->opened = false;
    instance->info.clear();
}

/* 1 if `instance` is non-null and currently open, else 0. */
int ac_is_open(fmpg_instance *instance)
{
    return instance && instance->opened ? 1 : 0;
}

/* Number of streams whose codec type is audio; 0 if the instance is not open. */
int ac_get_audio_stream_count(fmpg_instance *instance)
{
    if (!instance || !instance->opened || !instance->format_ctx) {
        return 0;
    }

    int count = 0;
    for (unsigned i = 0; i < instance->format_ctx->nb_streams; ++i) {
        const AVCodecParameters *par = instance->format_ctx->streams[i]->codecpar;
        if (par && par->codec_type == AVMEDIA_TYPE_AUDIO) {
            ++count;
        }
    }
    return count;
}

/* Index of the audio stream chosen by av_find_best_stream, or -1 if none. */
int ac_get_default_audio_stream(fmpg_instance *instance)
{
    if (!instance || !instance->opened || !instance->format_ctx) {
        return -1;
    }

    const int idx = av_find_best_stream(instance->format_ctx, AVMEDIA_TYPE_AUDIO,
                                        -1, -1, nullptr, 0);
    return idx >= 0 ? idx : -1;
}

/*
 * Fill `info` with the output format for audio stream `stream_index`.
 *
 * `info` is zeroed first. Sample rate and channel count come from the stream's
 * codec parameters; bits/bytes per sample are the fixed S32 output format.
 * Returns 1 if the stream is audio with a positive rate and channel count.
 */
int ac_get_audio_info(fmpg_instance *instance, int stream_index, fmpg_audio_info *info)
{
    if (!info) {
        return 0;
    }

    std::memset(info, 0, sizeof(*info));

    if (!valid_stream_index(instance, stream_index)) {
        return 0;
    }

    const AVCodecParameters *par =
        instance->format_ctx->streams[stream_index]->codecpar;
    if (!par || par->codec_type != AVMEDIA_TYPE_AUDIO) {
        return 0;
    }

    info->sample_rate = par->sample_rate;
    info->channels = par->ch_layout.nb_channels;
    info->bits_per_sample = AC_AUDIO_OUTPUT_BITS;
    info->bytes_per_sample = AC_AUDIO_OUTPUT_BYTES;

    return info->sample_rate > 0 && info->channels > 0 ? 1 : 0;
}

/* Pointer to the instance's metadata (owned by the instance), or null. */
const fmpg_file_info *ac_get_file_info(fmpg_instance *instance)
{
    return instance ? &instance->info : nullptr;
}

/*
 * Metadata getters. String getters never return null ("" when unknown or when
 * `info` is null); the returned pointer refers to storage inside `info`.
 * Numeric getters return -1 when `info` is null.
 */
const char *ac_file_info_title(const fmpg_file_info *info)
{
    return info ? string_c_str(info->title) : "";
}

const char *ac_file_info_author(const fmpg_file_info *info)
{
    return info ? string_c_str(info->author) : "";
}

const char *ac_file_info_album(const fmpg_file_info *info)
{
    return info ? string_c_str(info->album) : "";
}

const char *ac_file_info_genre(const fmpg_file_info *info)
{
    return info ? string_c_str(info->genre) : "";
}

const char *ac_file_info_comment(const fmpg_file_info *info)
{
    return info ? string_c_str(info->comment) : "";
}

const char *ac_file_info_copyright(const fmpg_file_info *info)
{
    return info ? string_c_str(info->copyright) : "";
}

int ac_file_info_year(const fmpg_file_info *info)
{
    return info ? info->year : -1;
}

int ac_file_info_track(const fmpg_file_info *info)
{
    return info ? info->track : -1;
}

int64_t ac_file_info_duration(const fmpg_file_info *info)
{
    return info ? info->duration : -1;
}

int ac_file_info_bitrate(const fmpg_file_info *info)
{
    return info ? info->bitrate : -1;
}

/*
 * Read the next compressed packet from the container.
 *
 * Returns a new package the caller must release with ac_free_package(), or
 * null on allocation failure or when av_read_frame() reports an error/EOF.
 */
fmpg_package *ac_read_package(fmpg_instance *instance)
{
    if (!instance || !instance->opened || !instance->format_ctx) {
        return nullptr;
    }

    fmpg_package *pkg = nullptr;
    try {
        pkg = new fmpg_package();
    } catch (...) {
        return nullptr;
    }

    if (!pkg->packet) {
        delete pkg;
        return nullptr;
    }

    /*
     * av_read_frame reads one compressed packet. This may be audio, video,
     * subtitles, or another stream type. The caller can inspect stream_index
     * and only feed audio packets to the matching decoder.
     */
    if (av_read_frame(instance->format_ctx, pkg->packet) < 0) {
        delete pkg;
        return nullptr;
    }

    pkg->stream_index = pkg->packet->stream_index;

    /* The package timestamp prefers dts and falls back to pts. */
    pkg->pts = pkg->packet->dts != AV_NOPTS_VALUE ? pkg->packet->dts
                                                  : pkg->packet->pts;
    return pkg;
}

/* Release a package returned by ac_read_package(). Null is accepted. */
void ac_free_package(fmpg_package *package)
{
    delete package;
}

/* Stream index the package belongs to, or -1 for a null package. */
int ac_package_stream_index(fmpg_package *package)
{
    return package ? package->stream_index : -1;
}

/*
 * Find, allocate and open the decoder for `par`.
 * On failure partially created state stays in `dec`; its destructor frees it.
 */
static bool init_codec_context(fmpg_decoder *dec, const AVCodecParameters *par)
{
    dec->codec = avcodec_find_decoder(par->codec_id);
    if (!dec->codec) {
        return false;
    }

    dec->codec_ctx = avcodec_alloc_context3(dec->codec);
    if (!dec->codec_ctx) {
        return false;
    }

    /*
     * Copy stream codec parameters into the active decoder context.
     */
    if (avcodec_parameters_to_context(dec->codec_ctx, par) < 0) {
        return false;
    }

    /*
     * Open the actual decoder. From this point on, packets can be sent to it.
     */
    return avcodec_open2(dec->codec_ctx, dec->codec, nullptr) >= 0;
}

/*
 * Create the resampler from the opened codec context and remember the input
 * format and channel count it was configured with.
 */
static bool init_resampler(fmpg_decoder *dec)
{
    const AVCodecContext *cc = dec->codec_ctx;
    const AVChannelLayout *layout = &cc->ch_layout;
    if (layout->nb_channels <= 0 || cc->sample_rate <= 0) {
        return false;
    }

    /*
     * We do not change sample rate or channel layout. We only normalize the
     * sample format to signed 32-bit integer PCM.
     */
    if (swr_alloc_set_opts2(&dec->swr_ctx,
                            layout, AC_AUDIO_OUTPUT_FMT, cc->sample_rate,
                            layout, cc->sample_fmt, cc->sample_rate,
                            0, nullptr) < 0) {
        return false;
    }

    dec->swr_in_fmt = cc->sample_fmt;
    dec->swr_channels = layout->nb_channels;

    return swr_init(dec->swr_ctx) >= 0;
}

/*
 * Create a decoder for audio stream `stream_index` of an open instance.
 *
 * The decoder keeps a non-owning pointer to `instance`. Returns null if the
 * stream is not a usable audio stream or any FFmpeg setup step fails.
 */
fmpg_decoder *ac_create_decoder(fmpg_instance *instance, int stream_index)
{
    if (!valid_stream_index(instance, stream_index)) {
        return nullptr;
    }

    fmpg_audio_info info{};
    if (!ac_get_audio_info(instance, stream_index, &info)) {
        return nullptr;
    }

    fmpg_decoder *dec = nullptr;
    try {
        dec = new fmpg_decoder();
    } catch (...) {
        return nullptr;
    }

    dec->instance = instance;
    dec->stream_index = stream_index;
    dec->audio_info = info;

    const AVCodecParameters *par =
        instance->format_ctx->streams[stream_index]->codecpar;

    bool ok = init_codec_context(dec, par);
    if (ok) {
        dec->frame = av_frame_alloc();
        ok = dec->frame != nullptr;
    }
    if (ok) {
        ok = init_resampler(dec);
    }

    if (!ok) {
        delete dec; /* destructor releases whatever was created so far */
        return nullptr;
    }
    return dec;
}

/* Destroy a decoder. Null is accepted. */
void ac_free_decoder(fmpg_decoder *decoder)
{
    delete decoder;
}

/*
 * Append `bytes` bytes from `src` to dec->pcm.
 *
 * The total is capped at INT_MAX because ac_decoder_buffer_size() reports the
 * size as an int. Returns false if the cap would be exceeded or allocation
 * fails; dec->pcm is unchanged in the cap case.
 */
static bool append_bytes(fmpg_decoder *dec, const uint8_t *src, size_t bytes)
{
    if (!bytes) {
        return true;
    }

    const size_t limit = static_cast<size_t>(std::numeric_limits<int>::max());
    const size_t old_size = dec->pcm.size();
    if (old_size > limit || bytes > limit - old_size) {
        return false;
    }

    try {
        dec->pcm.resize(old_size + bytes);
    } catch (...) {
        return false;
    }

    std::memcpy(dec->pcm.data() + old_size, src, bytes);
    return true;
}

/*
 * Run swr_convert with room for `out_capacity` output samples and append the
 * result to dec->pcm.
 *
 * `in` / `in_samples` may be null / 0 to drain samples buffered inside the
 * resampler. Returns the number of samples appended (possibly 0), or -1 on
 * error. Never throws.
 */
static int convert_and_append(fmpg_decoder *dec, const uint8_t **in,
                              int in_samples, int out_capacity)
{
    /* Size by the channel count the resampler was configured to output. */
    const int channels = dec->swr_channels;

    const int max_bytes = av_samples_get_buffer_size(
        nullptr, channels, out_capacity, AC_AUDIO_OUTPUT_FMT, 1);
    if (max_bytes <= 0) {
        return -1;
    }

    try {
        dec->scratch.resize(static_cast<size_t>(max_bytes));
    } catch (...) {
        return -1;
    }

    uint8_t *out_planes[1] = { dec->scratch.data() };

    /*
     * swr_convert performs the actual conversion to S32 interleaved PCM.
     */
    const int out_samples =
        swr_convert(dec->swr_ctx, out_planes, out_capacity, in, in_samples);
    if (out_samples < 0) {
        return -1;
    }
    if (out_samples == 0) {
        /*
         * Not an error: the resampler produced nothing this time.
         * av_samples_get_buffer_size() rejects a sample count of 0, so this
         * case must be handled before asking for the used size.
         */
        return 0;
    }

    const int used_bytes = av_samples_get_buffer_size(
        nullptr, channels, out_samples, AC_AUDIO_OUTPUT_FMT, 1);
    if (used_bytes < 0 ||
        !append_bytes(dec, dec->scratch.data(), static_cast<size_t>(used_bytes))) {
        return -1;
    }
    return out_samples;
}

/*
 * Convert one decoded frame to S32 interleaved PCM and append it to dec->pcm.
 * Returns false on error; an empty frame is not an error.
 */
static bool append_converted_frame(fmpg_decoder *dec, const AVFrame *frame)
{
    if (frame->nb_samples <= 0) {
        return true;
    }

    /*
     * The resampler was configured once, at decoder creation. A frame with a
     * different sample format or channel count would make swr_convert read
     * the frame's buffers with the wrong layout, so refuse it.
     */
    if (frame->format != static_cast<int>(dec->swr_in_fmt) ||
        frame->ch_layout.nb_channels != dec->swr_channels) {
        return false;
    }

    /*
     * swr_get_out_samples gives a safe upper bound for the number of output
     * samples. The resampler can have internal delay, so this is safer than
     * assuming input sample count equals output sample count.
     */
    const int max_out_samples = swr_get_out_samples(dec->swr_ctx, frame->nb_samples);
    if (max_out_samples <= 0) {
        return false;
    }

    /*
     * extended_data, not data: for planar audio with more channels than
     * AV_NUM_DATA_POINTERS only extended_data holds every plane pointer.
     */
    const uint8_t **in_planes = const_cast<const uint8_t **>(frame->extended_data);

    return convert_and_append(dec, in_planes, frame->nb_samples, max_out_samples) >= 0;
}

/*
 * Pull every frame the decoder currently has ready and append it to dec->pcm.
 * Returns 1 if at least one frame was received, 0 if none, -1 on error.
 */
static int receive_available_frames(fmpg_decoder *dec)
{
    int produced = 0;

    for (;;) {
        const int ret = avcodec_receive_frame(dec->codec_ctx, dec->frame);
        if (ret == AVERROR(EAGAIN) || ret == AVERROR_EOF) {
            return produced;
        }
        if (ret < 0) {
            return -1;
        }

        const bool ok = append_converted_frame(dec, dec->frame);
        av_frame_unref(dec->frame);
        if (!ok) {
            return -1;
        }
        produced = 1;
    }
}

/*
 * Set dec->timecode (seconds) from the package timestamp and the time base of
 * the package's stream. Does nothing if the package has no timestamp or the
 * owning instance is no longer open with that stream index.
 */
static void update_timecode_from_packet(fmpg_decoder *dec, const fmpg_package *pkg)
{
    if (!dec || !pkg || pkg->pts == AV_NOPTS_VALUE) {
        return;
    }
    if (!valid_stream_index(dec->instance, pkg->stream_index)) {
        return;
    }

    const AVStream *stream = dec->instance->format_ctx->streams[pkg->stream_index];
    dec->timecode = pkg->pts * av_q2d(stream->time_base);
}

/*
 * Decode one package with `decoder`.
 *
 * The decoder's PCM buffer is cleared first and then holds everything this
 * call produced. Returns 1 if PCM is available in the buffer afterwards,
 * 0 if the package belongs to another stream, an error occurred, or the
 * decoder produced no output yet.
 */
int ac_decode_package(fmpg_package *package, fmpg_decoder *decoder)
{
    if (!package || !decoder || !package->packet ||
        package->stream_index != decoder->stream_index) {
        return 0;
    }

    decoder->pcm.clear();
    update_timecode_from_packet(decoder, package);

    /*
     * Modern FFmpeg decoding is a two-step queue-like API:
     *
     *   1. send compressed packet
     *   2. receive all decoded frames currently available
     *
     * A single packet can produce multiple frames, especially with codecs that
     * buffer internally. We concatenate all produced PCM blocks.
     */
    int ret = avcodec_send_packet(decoder->codec_ctx, package->packet);
    if (ret == AVERROR(EAGAIN)) {
        /* Decoder output queue is full: drain it, then retry the packet. */
        if (receive_available_frames(decoder) < 0) {
            return 0;
        }
        ret = avcodec_send_packet(decoder->codec_ctx, package->packet);
    }
    if (ret < 0) {
        return 0;
    }

    if (receive_available_frames(decoder) < 0) {
        return 0;
    }

    /*
     * Report by buffer content rather than by the last receive call, so PCM
     * produced while draining on EAGAIN above is not lost to the caller.
     */
    return decoder->pcm.empty() ? 0 : 1;
}

/*
 * Signal end of input and collect all delayed output from the decoder and the
 * resampler into the decoder's PCM buffer (cleared first).
 * Returns 1 if any PCM was produced, else 0.
 */
int ac_flush_decoder(fmpg_decoder *decoder)
{
    if (!decoder) {
        return 0;
    }

    decoder->pcm.clear();

    /*
     * Sending NULL tells FFmpeg that no more input is coming and that delayed
     * decoded frames should be drained.
     */
    const int ret = avcodec_send_packet(decoder->codec_ctx, nullptr);
    if (ret < 0 && ret != AVERROR_EOF) {
        return 0;
    }

    if (receive_available_frames(decoder) < 0) {
        return 0;
    }

    /* Drain possible delayed samples from libswresample as well. */
    for (;;) {
        const int64_t delay =
            swr_get_delay(decoder->swr_ctx, decoder->codec_ctx->sample_rate);
        if (delay <= 0 || delay > std::numeric_limits<int>::max()) {
            break;
        }
        /* Best effort: stop on error or when nothing more comes out. */
        if (convert_and_append(decoder, nullptr, 0, static_cast<int>(delay)) <= 0) {
            break;
        }
    }

    return decoder->pcm.empty() ? 0 : 1;
}

/*
 * Seek the decoder's stream to `target_pos_ms` (milliseconds) and reset the
 * decoder and resampler state. Returns 1 on success, 0 on failure.
 */
int ac_seek_ms(fmpg_decoder *decoder, int64_t target_pos_ms)
{
    /* Also guards against an instance that was closed or reopened meanwhile. */
    if (!decoder || !valid_stream_index(decoder->instance, decoder->stream_index)) {
        return 0;
    }

    AVFormatContext *fmt = decoder->instance->format_ctx;
    const AVStream *stream = fmt->streams[decoder->stream_index];

    const int64_t pos_us = av_rescale(target_pos_ms, AV_TIME_BASE, 1000);
    const int64_t stream_ts = av_rescale_q(pos_us, AV_TIME_BASE_Q, stream->time_base);

    if (av_seek_frame(fmt, decoder->stream_index, stream_ts, AVSEEK_FLAG_BACKWARD) < 0) {
        return 0;
    }

    decoder->timecode = target_pos_ms / 1000.0;
    decoder->pcm.clear();

    /* Old buffered data no longer belongs to the new seek position. */
    avcodec_flush_buffers(decoder->codec_ctx);

    /* Reset resampler delay/state too. */
    swr_close(decoder->swr_ctx);
    return swr_init(decoder->swr_ctx) >= 0 ? 1 : 0;
}

/*
 * PCM produced by the last decode/flush call, or null if there is none.
 * The pointer refers to the decoder's internal buffer.
 */
const uint8_t *ac_decoder_buffer(fmpg_decoder *decoder)
{
    return decoder && !decoder->pcm.empty() ? decoder->pcm.data() : nullptr;
}

/* Size in bytes of the buffer returned by ac_decoder_buffer(). */
int ac_decoder_buffer_size(fmpg_decoder *decoder)
{
    if (!decoder ||
        decoder->pcm.size() > static_cast<size_t>(std::numeric_limits<int>::max())) {
        return 0;
    }
    return static_cast<int>(decoder->pcm.size());
}

/* Current position in seconds (from the last package or seek), 0.0 if null. */
double ac_decoder_timecode(fmpg_decoder *decoder)
{
    return decoder ? decoder->timecode : 0.0;
}

/* Stream index the decoder was created for, or -1 for a null decoder. */
int ac_decoder_stream_index(fmpg_decoder *decoder)
{
    return decoder ? decoder->stream_index : -1;
}