gemigreerd-racket-sound-lib/ffmpeg-audio/ffmpeg_audio.cpp

/*
 * Acinerella audio-only decoder.
 *
 * This file is intentionally written as C++ internally, but exports a stable
 * C ABI. That gives us RAII, std::string and std::vector internally, while a
 * C or Racket FFI caller still sees a simple C interface.
 *
 * What this decoder does:
 *
 *   1. Open a media file with FFmpeg/libavformat.
 *   2. Find audio streams.
 *   3. Read compressed packets from the container.
 *   4. Decode packets with the modern avcodec_send_packet() /
 *      avcodec_receive_frame() API.
 *   5. Convert decoded audio to one predictable output format:
 *
 *          signed 32-bit integer PCM
 *          interleaved / packed
 *          native endian
 *
 *      This is suitable for feeding to libao as 32-bit PCM.
 *
 * Important FFmpeg vocabulary:
 *
 *   Container/demuxer:
 *      The file format layer: mp3, mp4/m4a, ogg, wav, etc.
 *      FFmpeg represents this with AVFormatContext.
 *
 *   Stream:
 *      A file may contain one or more streams. For this audio-only API we only
 *      care about streams whose codec_type is AVMEDIA_TYPE_AUDIO.
 *
 *   Packet:
 *      Compressed data belonging to one stream. One packet may decode to zero,
 *      one, or multiple decoded frames.
 *
 *   Frame:
 *      Decoded audio samples, but not necessarily in the format we want. MP3,
 *      for example, may decode to planar float. We therefore use libswresample
 *      to normalize everything to signed 32-bit interleaved PCM.
 */

#include "ffmpeg_audio.h"

#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <limits>
#include <string>
#include <vector>

extern "C" {
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libavutil/avutil.h>
#include <libavutil/channel_layout.h>
#include <libavutil/samplefmt.h>
#include <libswresample/swresample.h>
}

/*
 * Fixed output format: the sample format handed to the resampler, and the
 * per-sample size reported through fmpg_audio_info.
 */
static constexpr AVSampleFormat AC_AUDIO_OUTPUT_FMT{AV_SAMPLE_FMT_S32};
static constexpr int AC_AUDIO_OUTPUT_BYTES{4};
static constexpr int AC_AUDIO_OUTPUT_BITS{AC_AUDIO_OUTPUT_BYTES * 8};

/*
 * Metadata.
 *
 * Plain C code would typically store tags in fixed-size arrays such as
 *
 *   char title[512];
 *
 * which truncates long UTF-8 values and wastes space. This implementation is
 * C++, so the tags live in std::string members; the public C API only hands
 * out const char* through getter functions.
 *
 * Numeric fields hold -1 while no value is set.
 */
struct __fmpg_file_info__ {
    std::string title;
    std::string author;
    std::string album;
    std::string genre;
    std::string comment;
    std::string copyright;

    int year{-1};
    int track{-1};
    int64_t duration{-1}; /* milliseconds */
    int bitrate{-1};

    /* Put every field back into its "not set" state. */
    void clear() {
        for (std::string *text :
             {&title, &author, &album, &genre, &comment, &copyright}) {
            text->clear();
        }

        year = track = bitrate = -1;
        duration = -1;
    }
};

/*
 * __fmpg_instance__ owns the opened media file.
 *
 * AVFormatContext is FFmpeg's demuxer/container object. It knows which streams
 * the file contains and can read compressed packets from it.
 *
 * The destructor closes format_ctx. A copied instance would hold the same raw
 * pointer and close it a second time, so copying is disabled (which also
 * suppresses the implicit move operations).
 */
struct __fmpg_instance__ {
    bool opened = false;
    AVFormatContext *format_ctx = nullptr;
    fmpg_file_info info;

    /* Declaring the deleted copy operations removes the implicit default
     * constructor, so it is requested explicitly. */
    __fmpg_instance__() = default;
    __fmpg_instance__(const __fmpg_instance__ &) = delete;
    __fmpg_instance__ &operator=(const __fmpg_instance__ &) = delete;

    ~__fmpg_instance__() {
        if (format_ctx) {
            avformat_close_input(&format_ctx);
        }
    }
};

/*
 * A package wraps one FFmpeg AVPacket.
 *
 * The old Acinerella name was "package". FFmpeg calls this a packet. It is not
 * decoded audio yet; it is compressed data read from the container.
 *
 * The AVPacket is allocated in the constructor and released in the
 * destructor. Copying is disabled because a copy would share the raw pointer
 * and free the same packet twice. `packet` may be null if av_packet_alloc()
 * failed; callers must check it before use.
 */
struct __fmpg_package__ {
    int stream_index = -1;
    int64_t pts = AV_NOPTS_VALUE;
    AVPacket *packet = nullptr;

    __fmpg_package__() : packet(av_packet_alloc()) {}

    __fmpg_package__(const __fmpg_package__ &) = delete;
    __fmpg_package__ &operator=(const __fmpg_package__ &) = delete;

    ~__fmpg_package__() {
        av_packet_free(&packet);
    }
};

/*
 * __fmpg_decoder__ owns the actual audio decoder and resampler for one stream.
 *
 * `instance` is a non-owning back pointer; it is never freed here. The codec
 * context, the reusable frame and the resampler are released by the
 * destructor. Copying is disabled because a copy would share those raw
 * pointers and free them twice.
 */
struct __fmpg_decoder__ {
    fmpg_instance *instance = nullptr;
    int stream_index = -1;

    const AVCodec *codec = nullptr;
    AVCodecContext *codec_ctx = nullptr;
    AVFrame *frame = nullptr;
    SwrContext *swr_ctx = nullptr;

    fmpg_audio_info audio_info{};
    std::vector<uint8_t> pcm; /* converted output of the latest decode/flush */
    double timecode = 0.0;    /* seconds */

    /* Declaring the deleted copy operations removes the implicit default
     * constructor, so it is requested explicitly. */
    __fmpg_decoder__() = default;
    __fmpg_decoder__(const __fmpg_decoder__ &) = delete;
    __fmpg_decoder__ &operator=(const __fmpg_decoder__ &) = delete;

    ~__fmpg_decoder__() {
        avcodec_free_context(&codec_ctx);
        av_frame_free(&frame);
        swr_free(&swr_ctx);
    }
};

/* Returns `s` unchanged, or the empty string literal when `s` is null. */
static const char *empty_if_null(const char *s) {
    if (s == nullptr) {
        return "";
    }

    return s;
}

/*
 * C-string view of `s`. An empty string yields the "" literal rather than a
 * pointer into `s`; otherwise the pointer is s.c_str().
 */
static const char *string_c_str(const std::string &s) {
    if (s.empty()) {
        return "";
    }

    return s.c_str();
}

static std::string get_metadata_string(const AVFormatContext *ctx,
                                       const char *key) {
    const AVDictionaryEntry *entry =
        av_dict_get(ctx->metadata, key, nullptr, 0);

    return entry && entry->value ? std::string(entry->value)
                                 : std::string();
}

/*
 * Looks up `key` in the container-level metadata dictionary and parses the
 * leading decimal integer of its value (so "3/12" yields 3).
 *
 * Returns -1 when the tag is missing, empty, does not start with a number, or
 * does not fit in an int. std::atoi was replaced because its behaviour is
 * undefined for out-of-range input and because it returns 0 for non-numeric
 * text, which cannot be told apart from a genuine 0.
 */
static int get_metadata_int(const AVFormatContext *ctx, const char *key) {
    const AVDictionaryEntry *entry =
        av_dict_get(ctx->metadata, key, nullptr, 0);

    if (!entry || !entry->value || !*entry->value) {
        return -1;
    }

    char *end = nullptr;
    const long long parsed = std::strtoll(entry->value, &end, 10);

    /* No digits consumed: the tag is not numeric. */
    if (end == entry->value) {
        return -1;
    }

    /* strtoll saturates on overflow, which also lands outside int range. */
    if (parsed < std::numeric_limits<int>::min() ||
        parsed > std::numeric_limits<int>::max()) {
        return -1;
    }

    return static_cast<int>(parsed);
}

/*
 * Refreshes self->info from the opened container.
 *
 * Text tags are copied from the container metadata dictionary; missing tags
 * become empty strings. Numeric fields are -1 when unknown, except bitrate,
 * which mirrors ctx->bit_rate. Duration is converted from AV_TIME_BASE units
 * to milliseconds.
 */
static void fill_metadata(fmpg_instance *self) {
    AVFormatContext *ctx = self->format_ctx;

    self->info.clear();
    self->info.title = get_metadata_string(ctx, "title");
    self->info.author = get_metadata_string(ctx, "artist");
    self->info.album = get_metadata_string(ctx, "album");
    self->info.genre = get_metadata_string(ctx, "genre");
    self->info.comment = get_metadata_string(ctx, "comment");
    self->info.copyright = get_metadata_string(ctx, "copyright");

    /*
     * FFmpeg's generic tag name for the release date is "date" (for example
     * "2019" or "2019-05-01"); "year" is not one of its generic keys. Keep
     * honouring an explicit "year" tag, but fall back to the leading number
     * of "date" so the year is not reported as unknown for most files.
     */
    int year = get_metadata_int(ctx, "year");
    if (year < 0) {
        year = get_metadata_int(ctx, "date");
    }
    self->info.year = year;

    self->info.track = get_metadata_int(ctx, "track");

    /* bit_rate is 64-bit; clamp instead of silently truncating to int. */
    const int64_t int_max = std::numeric_limits<int>::max();
    self->info.bitrate =
        static_cast<int>(std::min<int64_t>(ctx->bit_rate, int_max));

    self->info.duration =
        ctx->duration == AV_NOPTS_VALUE
            ? -1
            : ctx->duration * 1000 / AV_TIME_BASE;
}

/*
 * True when `instance` is an opened file with a live format context and
 * `stream_index` lies inside [0, nb_streams).
 */
static bool valid_stream_index(const fmpg_instance *instance, int stream_index)
{
    if (!instance || !instance->opened || !instance->format_ctx) {
        return false;
    }

    if (stream_index < 0) {
        return false;
    }

    const int stream_total =
        static_cast<int>(instance->format_ctx->nb_streams);
    return stream_index < stream_total;
}

/*
 * Allocates a fresh, unopened instance. Returns nullptr if construction
 * throws; no exception escapes to the caller.
 */
fmpg_instance *ac_init(void) {
    fmpg_instance *created = nullptr;

    try {
        created = new fmpg_instance();
    } catch (...) {
        created = nullptr;
    }

    return created;
}

/*
 * Destroys an instance obtained from ac_init(). The instance destructor
 * closes the format context if one is still open. Passing nullptr is a no-op.
 */
void  ac_free(fmpg_instance * instance) {
    delete instance;
}

/*
 * Opens `filename` on an instance that is not yet open, probes its streams
 * and fills the metadata. Returns 1 on success; returns 0 on bad arguments,
 * on an already-open instance, or on any FFmpeg failure (in which case the
 * instance is reset through ac_close()).
 */
int ac_open_file(fmpg_instance *instance, const char *filename) {
    const bool usable = instance && !instance->opened && filename;
    if (!usable) {
        return 0;
    }

    /*
     * avformat_open_input opens the file and guesses the container format.
     * The codec is not opened here. This is only the demuxing layer.
     */
    const int open_rc = avformat_open_input(&instance->format_ctx,
                                            empty_if_null(filename),
                                            nullptr,
                                            nullptr);

    /*
     * avformat_find_stream_info reads enough packets to discover stream
     * metadata such as sample rate, channel layout, codec id, duration and
     * tags. It is only attempted when the open succeeded.
     */
    const bool probed =
        open_rc >= 0 &&
        avformat_find_stream_info(instance->format_ctx, nullptr) >= 0;

    if (!probed) {
        ac_close(instance);
        return 0;
    }

    fill_metadata(instance);
    instance->opened = true;
    return 1;
}

/*
 * Closes the container (if any), marks the instance as not opened and wipes
 * the cached metadata. The instance itself stays allocated. Null is ignored.
 */
void ac_close(fmpg_instance *instance) {
    if (instance == nullptr) {
        return;
    }

    if (instance->format_ctx != nullptr) {
        avformat_close_input(&instance->format_ctx);
    }

    instance->info.clear();
    instance->opened = false;
}

/* Returns 1 when `instance` is non-null and currently opened, otherwise 0. */
int ac_is_open(fmpg_instance *instance)
{
    if (instance == nullptr) {
        return 0;
    }

    return instance->opened ? 1 : 0;
}

/*
 * Counts the streams whose codec parameters say AVMEDIA_TYPE_AUDIO.
 * Returns 0 for a null or unopened instance.
 */
int ac_get_audio_stream_count(fmpg_instance *instance)
{
    if (!instance || !instance->opened || !instance->format_ctx) {
        return 0;
    }

    AVStream **first = instance->format_ctx->streams;
    AVStream **last = first + instance->format_ctx->nb_streams;

    const auto is_audio = [](const AVStream *stream) {
        const AVCodecParameters *par = stream->codecpar;
        return par != nullptr && par->codec_type == AVMEDIA_TYPE_AUDIO;
    };

    return static_cast<int>(std::count_if(first, last, is_audio));
}

/*
 * Asks FFmpeg (av_find_best_stream) for the preferred audio stream.
 * Returns its index, or -1 when the instance is unusable or FFmpeg reports
 * an error.
 */
int ac_get_default_audio_stream(fmpg_instance *instance)
{
    const bool usable =
        instance && instance->opened && instance->format_ctx;
    if (!usable) {
        return -1;
    }

    const int best = av_find_best_stream(instance->format_ctx,
                                         AVMEDIA_TYPE_AUDIO,
                                         -1,
                                         -1,
                                         nullptr,
                                         0);
    if (best < 0) {
        return -1;
    }

    return best;
}

/*
 * Fills `info` for the audio stream at `stream_index`.
 *
 * `info` is zeroed first whenever it is non-null, so it is in a defined state
 * even on failure. Sample rate and channel count come from the stream's codec
 * parameters; bits/bytes per sample describe the fixed output format.
 *
 * Returns 1 when the stream is an audio stream with a positive sample rate
 * and channel count, otherwise 0.
 */
int ac_get_audio_info(fmpg_instance *instance, int stream_index, fmpg_audio_info *info)
{
    if (info == nullptr) {
        return 0;
    }

    std::memset(info, 0, sizeof(*info));

    if (!valid_stream_index(instance, stream_index)) {
        return 0;
    }

    const AVStream *stream = instance->format_ctx->streams[stream_index];
    const AVCodecParameters *par = stream->codecpar;

    const bool is_audio = par && par->codec_type == AVMEDIA_TYPE_AUDIO;
    if (!is_audio) {
        return 0;
    }

    info->sample_rate = par->sample_rate;
    info->channels = par->ch_layout.nb_channels;
    info->bits_per_sample = AC_AUDIO_OUTPUT_BITS;
    info->bytes_per_sample = AC_AUDIO_OUTPUT_BYTES;

    if (info->sample_rate <= 0 || info->channels <= 0) {
        return 0;
    }

    return 1;
}

/*
 * Returns a pointer to the metadata stored inside `instance`, or nullptr for
 * a null instance. The pointer refers to a member of the instance.
 */
const fmpg_file_info *ac_get_file_info(fmpg_instance *instance)
{
    if (instance == nullptr) {
        return nullptr;
    }

    return &instance->info;
}

/*
 * Metadata getters.
 *
 * String getters return "" for a null `info` or an unset field; otherwise the
 * pointer refers to the std::string held inside `info`. Numeric getters
 * return -1 for a null `info`, otherwise the stored value.
 */
const char *ac_file_info_title(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return "";
    }
    return string_c_str(info->title);
}

const char *ac_file_info_author(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return "";
    }
    return string_c_str(info->author);
}

const char *ac_file_info_album(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return "";
    }
    return string_c_str(info->album);
}

const char *ac_file_info_genre(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return "";
    }
    return string_c_str(info->genre);
}

const char *ac_file_info_comment(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return "";
    }
    return string_c_str(info->comment);
}

const char *ac_file_info_copyright(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return "";
    }
    return string_c_str(info->copyright);
}

int ac_file_info_year(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return -1;
    }
    return info->year;
}

int ac_file_info_track(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return -1;
    }
    return info->track;
}

/* Duration is stored in milliseconds. */
int64_t ac_file_info_duration(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return -1;
    }
    return info->duration;
}

int ac_file_info_bitrate(const fmpg_file_info *info)
{
    if (info == nullptr) {
        return -1;
    }
    return info->bitrate;
}

/*
 * Reads the next compressed packet from the container and wraps it in a
 * newly allocated package, to be released with ac_free_package().
 *
 * Returns nullptr when the instance is unusable, when allocation fails, or
 * when av_read_frame() reports an error or end of file.
 */
fmpg_package *ac_read_package(fmpg_instance *instance)
{
    const bool usable =
        instance && instance->opened && instance->format_ctx;
    if (!usable) {
        return nullptr;
    }

    fmpg_package *fresh = nullptr;

    try {
        fresh = new fmpg_package();
    } catch (...) {
        return nullptr;
    }

    /*
     * av_read_frame reads one compressed packet. This may be audio, video,
     * subtitles, or another stream type. The caller can inspect stream_index
     * and only feed audio packets to the matching decoder. The read is only
     * attempted when the AVPacket itself could be allocated.
     */
    const bool got_packet =
        fresh->packet != nullptr &&
        av_read_frame(instance->format_ctx, fresh->packet) >= 0;

    if (!got_packet) {
        delete fresh;
        return nullptr;
    }

    const AVPacket *raw = fresh->packet;
    fresh->stream_index = raw->stream_index;

    /* Prefer the decode timestamp; use the presentation one if it is unset. */
    fresh->pts = raw->dts;
    if (fresh->pts == AV_NOPTS_VALUE) {
        fresh->pts = raw->pts;
    }

    return fresh;
}

/*
 * Destroys a package returned by ac_read_package(); the package destructor
 * frees the wrapped AVPacket. Passing nullptr is a no-op.
 */
void ac_free_package(fmpg_package * package)
{
    delete package;
}

/* Stream index the package was read from, or -1 for a null package. */
int ac_package_stream_index(fmpg_package *package)
{
    if (package == nullptr) {
        return -1;
    }

    return package->stream_index;
}

/*
 * Finds a decoder for `par`, allocates a codec context in `dec`, copies the
 * stream parameters into it and opens the decoder.
 *
 * Returns false at the first failing step. Anything already stored in `dec`
 * is left there; it is released when `dec` is destroyed.
 */
static bool init_codec_context(fmpg_decoder *dec, const AVCodecParameters *par)
{
    dec->codec = avcodec_find_decoder(par->codec_id);
    if (dec->codec == nullptr) {
        return false;
    }

    dec->codec_ctx = avcodec_alloc_context3(dec->codec);
    if (dec->codec_ctx == nullptr) {
        return false;
    }

    /* Copy stream codec parameters into the active decoder context. */
    const bool params_copied =
        avcodec_parameters_to_context(dec->codec_ctx, par) >= 0;

    /* Open the actual decoder. After this, packets can be sent to it. */
    return params_copied &&
           avcodec_open2(dec->codec_ctx, dec->codec, nullptr) >= 0;
}

/*
 * Creates and initializes the libswresample context of `dec`.
 *
 * Input and output share the codec context's channel layout and sample rate;
 * only the sample format changes, to AC_AUDIO_OUTPUT_FMT. Returns false when
 * the codec context reports no channels or no sample rate, or when
 * libswresample fails.
 */
static bool init_resampler(fmpg_decoder *dec)
{
    const AVCodecContext *codec = dec->codec_ctx;
    const AVChannelLayout *layout = &codec->ch_layout;
    const int rate = codec->sample_rate;

    if (layout->nb_channels <= 0 || rate <= 0) {
        return false;
    }

    const int alloc_rc = swr_alloc_set_opts2(&dec->swr_ctx,
                                             layout,
                                             AC_AUDIO_OUTPUT_FMT,
                                             rate,
                                             layout,
                                             codec->sample_fmt,
                                             rate,
                                             0,
                                             nullptr);
    if (alloc_rc < 0) {
        return false;
    }

    return swr_init(dec->swr_ctx) >= 0;
}

/*
 * Builds a decoder for the audio stream `stream_index` of `instance`.
 *
 * Steps: validate the stream, allocate the decoder object, open the codec,
 * allocate the reusable frame, set up the resampler. Returns nullptr if any
 * step fails; a partially built decoder is destroyed before returning.
 * Release the result with ac_free_decoder().
 */
fmpg_decoder *ac_create_decoder(fmpg_instance *instance, int stream_index)
{
    if (!valid_stream_index(instance, stream_index)) {
        return nullptr;
    }

    fmpg_audio_info stream_info{};
    if (!ac_get_audio_info(instance, stream_index, &stream_info)) {
        return nullptr;
    }

    fmpg_decoder *created = nullptr;

    try {
        created = new fmpg_decoder();
    } catch (...) {
        return nullptr;
    }

    created->instance = instance;
    created->stream_index = stream_index;
    created->audio_info = stream_info;

    const AVCodecParameters *par =
        instance->format_ctx->streams[stream_index]->codecpar;

    /* Each stage runs only if the previous one succeeded. */
    bool ready = init_codec_context(created, par);

    if (ready) {
        created->frame = av_frame_alloc();
        ready = created->frame != nullptr;
    }

    if (ready) {
        ready = init_resampler(created);
    }

    if (!ready) {
        delete created;
        return nullptr;
    }

    return created;
}

/*
 * Destroys a decoder returned by ac_create_decoder(); the decoder destructor
 * frees the codec context, the frame and the resampler. The instance it
 * points to is not touched. Passing nullptr is a no-op.
 */
void ac_free_decoder(fmpg_decoder * decoder)
{
    delete decoder;
}

/*
 * Appends `bytes` bytes from `src` to the decoder's PCM buffer.
 *
 * Returns false, leaving the buffer unchanged, when the buffer would grow
 * beyond INT_MAX bytes (its size is reported as an int) or when allocation
 * fails. Appending zero bytes always succeeds.
 */
static bool append_bytes(fmpg_decoder *dec, const uint8_t *src, size_t bytes)
{
    if (bytes == 0) {
        return true;
    }

    const size_t int_limit =
        static_cast<size_t>(std::numeric_limits<int>::max());
    const size_t room_left = int_limit - dec->pcm.size();

    if (bytes > room_left) {
        return false;
    }

    try {
        dec->pcm.insert(dec->pcm.end(), src, src + bytes);
    } catch (...) {
        return false;
    }

    return true;
}

/*
 * Converts one decoded frame to the fixed output format and appends the
 * result to dec->pcm.
 *
 * Returns true on success, including the "nothing to convert" case of an
 * empty frame. Returns false when libswresample reports an error, when the
 * scratch buffer cannot be allocated, or when the PCM buffer cannot grow.
 */
static bool append_converted_frame(fmpg_decoder *dec, const AVFrame *frame)
{
    const int channels = dec->codec_ctx->ch_layout.nb_channels;

    if (channels <= 0 || frame->nb_samples <= 0) {
        return true;
    }

    /*
     * swr_get_out_samples gives a safe upper bound for the number of output
     * samples. The resampler can have internal delay, so this is safer than
     * assuming input sample count equals output sample count.
     */
    const int max_out_samples =
        swr_get_out_samples(dec->swr_ctx, frame->nb_samples);

    if (max_out_samples <= 0) {
        return false;
    }

    const int max_bytes =
        av_samples_get_buffer_size(nullptr,
                                   channels,
                                   max_out_samples,
                                   AC_AUDIO_OUTPUT_FMT,
                                   1);

    if (max_bytes <= 0) {
        return false;
    }

    /*
     * This function is reached from the exported C API, so an allocation
     * failure must be reported as an error instead of escaping as a C++
     * exception.
     */
    std::vector<uint8_t> tmp;
    try {
        tmp.resize(static_cast<size_t>(max_bytes));
    } catch (...) {
        return false;
    }

    uint8_t *out_planes[1] = { tmp.data() };

    /*
     * swr_convert performs the actual conversion to S32 interleaved PCM.
     *
     * The input planes are taken from extended_data rather than data: data
     * has only AV_NUM_DATA_POINTERS entries, so planar audio with more
     * channels than that can only be addressed through extended_data. For
     * frames with fewer channels both refer to the same planes.
     */
    const int out_samples =
        swr_convert(dec->swr_ctx,
                    out_planes,
                    max_out_samples,
                    const_cast<const uint8_t **>(frame->extended_data),
                    frame->nb_samples);

    if (out_samples < 0) {
        return false;
    }

    const int used_bytes =
        av_samples_get_buffer_size(nullptr,
                                   channels,
                                   out_samples,
                                   AC_AUDIO_OUTPUT_FMT,
                                   1);

    if (used_bytes < 0) {
        return false;
    }

    return append_bytes(dec, tmp.data(), static_cast<size_t>(used_bytes));
}

/*
 * Pulls every frame the codec currently has ready and appends its converted
 * samples to dec->pcm.
 *
 * Returns 1 if at least one frame was handled, 0 if none was available, and
 * -1 on a decoder or conversion error. The shared frame is unreferenced after
 * every successful receive.
 */
static int receive_available_frames(fmpg_decoder *dec)
{
    bool got_any = false;

    while (true) {
        const int rc = avcodec_receive_frame(dec->codec_ctx, dec->frame);

        /* Decoder needs more input, or has been fully drained. */
        if (rc == AVERROR(EAGAIN) || rc == AVERROR_EOF) {
            break;
        }

        if (rc < 0) {
            return -1;
        }

        const bool converted = append_converted_frame(dec, dec->frame);
        av_frame_unref(dec->frame);

        if (!converted) {
            return -1;
        }

        got_any = true;
    }

    return got_any ? 1 : 0;
}

/*
 * Sets dec->timecode (seconds) from the package timestamp, using the time
 * base of the stream the package came from.
 *
 * The timecode is left unchanged when the package has no timestamp, or when
 * the owning instance is no longer usable for that stream index. The latter
 * covers an instance that was closed with ac_close() while the decoder still
 * exists: format_ctx is null then and must not be dereferenced.
 */
static void update_timecode_from_packet(fmpg_decoder *dec, const fmpg_package *pkg)
{
    if (!dec || !pkg || pkg->pts == AV_NOPTS_VALUE) {
        return;
    }

    if (!valid_stream_index(dec->instance, pkg->stream_index)) {
        return;
    }

    const AVStream *stream =
        dec->instance->format_ctx->streams[pkg->stream_index];
    dec->timecode = pkg->pts * av_q2d(stream->time_base);
}

/*
 * Decodes one package with the given decoder.
 *
 * The decoder's PCM buffer is cleared first and then filled with everything
 * this call produces. Returns 1 if at least one frame was decoded, 0 if the
 * arguments are invalid, the package belongs to another stream, nothing was
 * produced, or an error occurred.
 */
int ac_decode_package(fmpg_package * package, fmpg_decoder * decoder)
{
    if (!package || !decoder || !package->packet ||
        package->stream_index != decoder->stream_index) {
        return 0;
    }

    decoder->pcm.clear();
    update_timecode_from_packet(decoder, package);

    /*
     * Modern FFmpeg decoding is a two-step queue-like API:
     *
     *   1. send compressed packet
     *   2. receive all decoded frames currently available
     *
     * A single packet can produce multiple frames, especially with codecs that
     * buffer internally. We concatenate all produced PCM blocks.
     */
    int ret = avcodec_send_packet(decoder->codec_ctx, package->packet);

    /*
     * EAGAIN means the decoder wants its pending output collected before it
     * accepts more input. Frames drained here already sit in the PCM buffer,
     * so they must count towards the result even if the retried packet yields
     * nothing new.
     */
    int drained = 0;

    if (ret == AVERROR(EAGAIN)) {
        drained = receive_available_frames(decoder);
        if (drained < 0) {
            return 0;
        }
        ret = avcodec_send_packet(decoder->codec_ctx, package->packet);
    }

    if (ret < 0) {
        return 0;
    }

    const int received = receive_available_frames(decoder);
    if (received < 0) {
        return 0;
    }

    return (drained > 0 || received > 0) ? 1 : 0;
}

/*
 * Drains the decoder at end of input.
 *
 * The PCM buffer is cleared, the codec is told that no more packets will
 * arrive, all delayed frames are collected, and finally any samples still
 * held by the resampler are flushed. Returns 1 if the PCM buffer holds data
 * afterwards, otherwise 0 (including on error and for a null decoder).
 */
int ac_flush_decoder(fmpg_decoder * decoder)
{
    if (!decoder) {
        return 0;
    }

    decoder->pcm.clear();

    /*
     * Sending NULL tells FFmpeg that no more input is coming and that delayed
     * decoded frames should be drained. AVERROR_EOF is tolerated here.
     */
    const int ret = avcodec_send_packet(decoder->codec_ctx, nullptr);
    if (ret < 0 && ret != AVERROR_EOF) {
        return 0;
    }

    if (receive_available_frames(decoder) < 0) {
        return 0;
    }

    /* Drain possible delayed samples from libswresample as well. */
    const int channels = decoder->codec_ctx->ch_layout.nb_channels;

    /* Scratch buffer, reused across iterations. */
    std::vector<uint8_t> tmp;

    for (;;) {
        const int delay =
            static_cast<int>(swr_get_delay(decoder->swr_ctx,
                                           decoder->codec_ctx->sample_rate));

        if (delay <= 0) {
            break;
        }

        const int max_bytes =
            av_samples_get_buffer_size(nullptr,
                                       channels,
                                       delay,
                                       AC_AUDIO_OUTPUT_FMT,
                                       1);

        if (max_bytes <= 0) {
            break;
        }

        /*
         * This is an exported C function: an allocation failure must not
         * escape as a C++ exception. Stop draining and report what we have.
         */
        try {
            tmp.resize(static_cast<size_t>(max_bytes));
        } catch (...) {
            break;
        }

        uint8_t *out_planes[1] = { tmp.data() };

        /* A null input with zero samples asks swr_convert to flush. */
        const int out_samples =
            swr_convert(decoder->swr_ctx,
                        out_planes,
                        delay,
                        nullptr,
                        0);

        if (out_samples <= 0) {
            break;
        }

        const int used_bytes =
            av_samples_get_buffer_size(nullptr,
                                       channels,
                                       out_samples,
                                       AC_AUDIO_OUTPUT_FMT,
                                       1);

        if (used_bytes < 0 ||
            !append_bytes(decoder, tmp.data(), static_cast<size_t>(used_bytes))) {
            break;
        }
    }

    return decoder->pcm.empty() ? 0 : 1;
}

/*
 * Seeks the decoder's stream to `target_pos_ms` (milliseconds).
 *
 * On success the timecode is set to the requested position, the PCM buffer
 * is emptied, and both the codec and the resampler are reset so that no data
 * from the old position leaks into the new one. Returns 1 on success and 0
 * on invalid arguments, seek failure, or resampler re-initialization failure.
 */
int ac_seek_ms(fmpg_decoder * decoder, int64_t target_pos_ms)
{
    /*
     * Validate against the instance's current state: it may have been closed
     * since the decoder was created, in which case streams[] must not be
     * indexed.
     */
    if (!decoder ||
        !valid_stream_index(decoder->instance, decoder->stream_index)) {
        return 0;
    }

    AVFormatContext *format_ctx = decoder->instance->format_ctx;
    const AVStream *stream = format_ctx->streams[decoder->stream_index];

    /* Milliseconds -> AV_TIME_BASE units -> stream time base. */
    const int64_t pos_us = av_rescale(target_pos_ms, AV_TIME_BASE, 1000);
    const int64_t stream_ts = av_rescale_q(pos_us, AV_TIME_BASE_Q, stream->time_base);

    if (av_seek_frame(format_ctx,
                      decoder->stream_index,
                      stream_ts,
                      AVSEEK_FLAG_BACKWARD) < 0) {
        return 0;
    }

    decoder->timecode = target_pos_ms / 1000.0;
    decoder->pcm.clear();

    /* Old buffered data no longer belongs to the new seek position. */
    avcodec_flush_buffers(decoder->codec_ctx);

    /* Reset resampler delay/state too. */
    swr_close(decoder->swr_ctx);
    return swr_init(decoder->swr_ctx) >= 0 ? 1 : 0;
}

/*
 * Pointer to the decoder's PCM bytes, or nullptr when the decoder is null or
 * the buffer is empty. The pointer refers to storage owned by the decoder.
 */
const uint8_t *ac_decoder_buffer(fmpg_decoder *decoder)
{
    if (decoder == nullptr || decoder->pcm.empty()) {
        return nullptr;
    }

    return decoder->pcm.data();
}

/*
 * Size of the decoder's PCM buffer in bytes. Returns 0 for a null decoder or
 * when the size does not fit in an int.
 */
int ac_decoder_buffer_size(fmpg_decoder *decoder)
{
    if (decoder == nullptr) {
        return 0;
    }

    const size_t stored = decoder->pcm.size();
    const size_t int_limit =
        static_cast<size_t>(std::numeric_limits<int>::max());

    return stored <= int_limit ? static_cast<int>(stored) : 0;
}

/* Current decoder timecode in seconds, or 0.0 for a null decoder. */
double ac_decoder_timecode(fmpg_decoder *decoder)
{
    if (decoder == nullptr) {
        return 0.0;
    }

    return decoder->timecode;
}

/* Stream index the decoder was created for, or -1 for a null decoder. */
int ac_decoder_stream_index(fmpg_decoder *decoder)
{
    if (decoder == nullptr) {
        return -1;
    }

    return decoder->stream_index;
}