简体   繁体   中英

Resampling audio with FFMPEG LibAV

Well, since FFMPEG documentation and code examples are absolute garbage, I guess my only choice is to go here and ask.

So what I'm trying to do is simply record audio from the microphone and write it to a file. So I initialize my input and output formats, I get an audio packet, decode it, resample, encode and write. But every time I try to play the audio there's only a stub of data. It seems like for some reason it writes only a start packet, which is still very strange — let me explain why:

if((response = swr_config_frame(resampleContext, audioOutputFrame, frame)) < 0) qDebug() << "can't configure frame!" <<  av_make_error(response);

if((response = swr_convert_frame(resampleContext, audioOutputFrame, frame)) < 0) qDebug() << "can't resample frame!" <<  av_make_error(response);

Here's the code I'm using to resample. My frame has data but swr_convert_frame writes empty data to audioOutputFrame

How do I fix that? FFMPEG is literally driving me crazy.

Here's the full code of my class

VideoReader.h

#ifndef VIDEOREADER_H
#define VIDEOREADER_H

// FFmpeg is a plain-C library: wrap its headers in extern "C" so the C++
// compiler does not apply name mangling to the declared symbols.
extern "C"
{
#include <libavcodec/avcodec.h>
#include <libavformat/avformat.h>
#include <libswscale/swscale.h>
#include <libavdevice/avdevice.h>
#include "libavutil/audio_fifo.h"
#include "libavformat/avio.h"
#include "libswresample/swresample.h"
#include <inttypes.h>
}

#include <QString>
#include <QElapsedTimer>

// Captures video + audio from a DirectShow device, re-encodes both streams
// and muxes them into an output file, while also exposing decoded RGB frames
// to the caller through readFrame().
class VideoReader
{
public:
    VideoReader();

    // Opens the capture device, the output container/file and the audio
    // resampler.  Returns false (after logging) on any failure.
    bool open(const char* filename);
    // Reads input packets until one frame has been decoded (and, once the
    // encoders exist, converted/encoded/written).
    bool fillFrame();
    // Decodes the next frame and converts it into the caller-supplied RGB0
    // buffer (must hold width * height * 4 bytes).
    bool readFrame(uint8_t *&frameData);
    // Flushes the encoders, writes the container trailer and releases all
    // FFmpeg resources.
    void close();

    // Dimensions of the captured video; valid after open() succeeds.
    int width, height;

private:
    bool configInput();
    bool configOutput(const char *filename);
    bool configResampler();

    // Encodes `frame` (NULL enters drain mode) and writes the resulting
    // packets to the output container.
    bool encode(AVFrame *frame, AVCodecContext *encoderContext, AVPacket *outputPacket, int streamIndex, QString type);

    // Stream indices inside the input container; reused for the output
    // streams, which are created in the same order.
    int audioStreamIndex = -1;
    int videoStreamIndex = -1;

    // First observed pts per stream; used to rebase output timestamps to 0.
    int64_t videoStartPts = 0;
    int64_t audioStartPts = 0;

    AVFormatContext* inputFormatContext = nullptr;
    AVFormatContext* outputFormatContext = nullptr;

    AVCodecContext* videoDecoderContext = nullptr;
    AVCodecContext* videoEncoderContext = nullptr;

    AVCodecContext* audioDecoderContext = nullptr;
    AVCodecContext* audioEncoderContext = nullptr;

    // Frames receiving decoded input data.
    AVFrame* videoInputFrame = nullptr;
    AVFrame* audioInputFrame = nullptr;

    // Frames holding converted data ready for the encoders.
    AVFrame* videoOutputFrame = nullptr;
    AVFrame* audioOutputFrame = nullptr;

    AVPacket* inputPacket = nullptr;

    AVPacket* videoOutputPacket = nullptr;
    AVPacket* audioOutputPacket = nullptr;

    // innerScaleContext: decoded frame -> RGB0 for readFrame() consumers.
    // outerScaleContext: decoded frame -> YUV420P for the video encoder.
    SwsContext* innerScaleContext = nullptr;
    SwsContext* outerScaleContext = nullptr;

    SwrContext *resampleContext = nullptr;
};

#endif // VIDEOREADER_H

VideoReader.cpp

#include "VideoReader.h"

#include <QDebug>

static const char* av_make_error(int errnum)
{
    static char str[AV_ERROR_MAX_STRING_SIZE];
    memset(str, 0, sizeof(str));
    return av_make_error_string(str, AV_ERROR_MAX_STRING_SIZE, errnum);
}

// Every member is initialized by its in-class default; the real setup work
// happens in open().
VideoReader::VideoReader() = default;

// Runs the three setup stages in order; short-circuits at the first stage
// that fails (each stage logs its own diagnostics).
bool VideoReader::open(const char *filename)
{
    return configInput()
        && configOutput(filename)
        && configResampler();
}

// Reads input packets until one frame of either stream has been decoded.
// Once the encoders exist (after configOutput()), the decoded frame is also
// converted, encoded and written to the output container; during the priming
// call from configInput() the encoder contexts are still null and only the
// decode step runs.
bool VideoReader::fillFrame()
{
    // Convert the decoded camera frame to the video encoder's YUV420P layout.
    auto convertToYUV = [=](AVFrame* frame)
    {
        int response = 0;

        if((response = sws_scale(outerScaleContext, frame->data, frame->linesize, 0, videoEncoderContext->height, videoOutputFrame->data, videoOutputFrame->linesize)) < 0) qDebug() << "can't rescale" << av_make_error(response);
    };

    // Resample decoded audio into the encoder's format/rate/layout.
    // NOTE(review): swr_convert_frame() buffers input internally; when the
    // input frame carries fewer samples than audioOutputFrame->nb_samples the
    // output frame can come back empty.  Draining via swr_get_delay() +
    // swr_convert() (or collecting samples in an AVAudioFifo) is needed for
    // complete audio output — TODO confirm against the capture device.
    auto convertAudio = [this](AVFrame* frame)
    {
        int response = 0;

        if((response = swr_convert_frame(resampleContext, audioOutputFrame, frame)) < 0) qDebug() << "can't resample frame!" << av_make_error(response);
    };

    // Decode one packet, then (when an encoder is configured) convert, encode
    // and write it.  Returns true when a frame was fully processed, false when
    // the decoder needs more input or an error occurred.  The input packet is
    // unreferenced on every path so it can be reused by the read loop.
    auto decodeEncode = [=](AVPacket* inputPacket, AVFrame* inputFrame, AVCodecContext* decoderContext,
                            AVPacket* outputPacket, AVFrame* outputFrame, AVCodecContext* encoderContext,
                            std::function<void (AVFrame*)> convertFunc,
                            int streamIndex, int64_t startPts, QString type)
    {
        int response = avcodec_send_packet(decoderContext, inputPacket);
        if(response < 0) { qDebug() << "failed to send" << type << "packet!" <<  av_make_error(response); av_packet_unref(inputPacket); return false; }

        response = avcodec_receive_frame(decoderContext, inputFrame);
        if(response == AVERROR(EAGAIN) || response == AVERROR_EOF) { av_packet_unref(inputPacket); return false; }
        else if (response < 0) { qDebug() << "failed to decode" << type << "frame!" << response << av_make_error(response); av_packet_unref(inputPacket); return false; }

        bool ok = true;

        if(encoderContext)
        {
            // Rebase timestamps so the output stream starts at pts 0.
            outputFrame->pts = inputPacket->pts - startPts;

            convertFunc(inputFrame);
            ok = encode(outputFrame, encoderContext, outputPacket, streamIndex, type);
        }

        av_packet_unref(inputPacket);

        return ok;
    };

    while(av_read_frame(inputFormatContext, inputPacket) >= 0) //actually read packet
    {
        if(inputPacket->stream_index == videoStreamIndex)
        {
            if(!videoStartPts) videoStartPts = inputPacket->pts;
            if(decodeEncode(inputPacket, videoInputFrame, videoDecoderContext, videoOutputPacket, videoOutputFrame, videoEncoderContext, convertToYUV, videoStreamIndex, videoStartPts, "video")) break;
        }
        else if(inputPacket->stream_index == audioStreamIndex)
        {
            if(!audioStartPts) audioStartPts = inputPacket->pts;
            if(decodeEncode(inputPacket, audioInputFrame, audioDecoderContext, audioOutputPacket, audioOutputFrame, audioEncoderContext, convertAudio, audioStreamIndex, audioStartPts, "audio")) break;
        }
        else
        {
            // Packet from a stream we do not handle — release it instead of
            // leaking the reference.
            av_packet_unref(inputPacket);
        }
    }

    return true;
}

// Decodes the next frame and converts it into the caller-provided buffer as
// tightly-packed RGB0 pixels; `frameData` must hold width * height * 4 bytes.
bool VideoReader::readFrame(uint8_t *&frameData)
{
    if(!fillFrame()) { qDebug() << "readFrame method failed!"; return false; };

    const int bytesPerPixel = 4;

    // sws_scale() expects per-plane pointers and strides; RGB0 occupies a
    // single plane, so only slot 0 is populated.
    uint8_t* planePointers[bytesPerPixel] = {frameData, NULL, NULL, NULL};
    int planeStrides[bytesPerPixel] = {videoInputFrame->width * bytesPerPixel, 0, 0, 0};

    sws_scale(innerScaleContext, videoInputFrame->data, videoInputFrame->linesize, 0, videoInputFrame->height, planePointers, planeStrides);

    return true;
}

void VideoReader::close()
{
    encode(NULL, videoEncoderContext, videoOutputPacket, videoStreamIndex, "video");
    encode(NULL, audioEncoderContext, audioOutputPacket, audioStreamIndex, "audio");

    if(av_write_trailer(outputFormatContext) < 0) { qDebug() << "failed to write trailer"; };

    avformat_close_input(&outputFormatContext);
    avformat_free_context(outputFormatContext);
    avformat_close_input(&inputFormatContext);
    avformat_free_context(inputFormatContext);

    av_frame_free(&videoInputFrame);
    av_frame_free(&audioInputFrame);

    av_frame_free(&videoOutputFrame);
    av_frame_free(&audioOutputFrame);

    av_packet_free(&inputPacket);

    av_packet_free(&videoOutputPacket);
    av_packet_free(&audioOutputPacket);

    avcodec_free_context(&videoDecoderContext);
    avcodec_free_context(&videoEncoderContext);

    avcodec_free_context(&audioDecoderContext);
    avcodec_free_context(&audioEncoderContext);

    sws_freeContext(innerScaleContext);
    sws_freeContext(outerScaleContext);

    swr_free(&resampleContext);
}

bool VideoReader::configInput()
{
    avdevice_register_all();

    inputFormatContext = avformat_alloc_context();

    if(!inputFormatContext) { qDebug() << "can't create context!"; return false; }

    const char* inputFormatName = "dshow";/*"gdigrab"*/
    AVInputFormat* inputFormat = av_find_input_format(inputFormatName);

    if(!inputFormat){ qDebug() << "Can't find" << inputFormatName; return false; }

    AVDictionary* options = NULL;
    av_dict_set(&options, "framerate", "30", 0);
    av_dict_set(&options, "video_size", "1920x1080", 0);

    if(avformat_open_input(&inputFormatContext, "video=HD USB Camera:audio=Microphone (High Definition Audio Device)" /*"desktop"*/, inputFormat, &options) != 0) { qDebug() << "can't open video file!"; return false; }

    AVCodecParameters* videoCodecParams = nullptr;
    AVCodecParameters* audioCodecParams = nullptr;
    AVCodec* videoDecoder = nullptr;
    AVCodec* audioDecoder = nullptr;

    for (uint i = 0; i < inputFormatContext->nb_streams; ++i)
    {
        auto stream = inputFormatContext->streams[i];
        auto codecParams = stream->codecpar;

        if(codecParams->codec_type == AVMEDIA_TYPE_AUDIO) { audioStreamIndex = i; audioDecoder = avcodec_find_decoder(codecParams->codec_id); audioCodecParams = codecParams; }
        if(codecParams->codec_type == AVMEDIA_TYPE_VIDEO) { videoStreamIndex = i; videoDecoder = avcodec_find_decoder(codecParams->codec_id); videoCodecParams = codecParams; }

        if(audioStreamIndex != -1 && videoStreamIndex != -1) break;
    }

    if(audioStreamIndex == -1) { qDebug() << "failed to find audio stream inside file"; return false; }
    if(videoStreamIndex == -1) { qDebug() << "failed to find video stream inside file"; return false; }

    auto configureCodecContext = [=](AVCodecContext*& context, AVCodec* decoder, AVCodecParameters* params, AVFrame*& frame, QString type)
    {
        context = avcodec_alloc_context3(decoder);
        if(!context) { qDebug() << "failed to create" << type << "decoder context!"; return false; }

        if(avcodec_parameters_to_context(context, params) < 0) { qDebug() << "can't initialize input" << type << "decoder context"; return false; }

        if(avcodec_open2(context, decoder, NULL) < 0) { qDebug() << "can't open" << type << "decoder"; return false; }

        frame = av_frame_alloc();
        if(!frame) { qDebug() << "can't allocate" << type << "frame"; return false; }

        return true;
    };

    if(!configureCodecContext(videoDecoderContext, videoDecoder, videoCodecParams, videoInputFrame, "video")) return false;
    if(!configureCodecContext(audioDecoderContext, audioDecoder, audioCodecParams, audioInputFrame, "audio")) return false;

    audioDecoderContext->channel_layout = AV_CH_LAYOUT_STEREO;
    audioInputFrame->channel_layout = audioDecoderContext->channel_layout;

    inputPacket = av_packet_alloc();
    if(!inputPacket) { qDebug() << "can't allocate input packet!";  return false; }

    //first frame, needed fo initialization
    if(!fillFrame()) { qDebug() << "Failed to fill frame on init!"; return false; };

    width = videoDecoderContext->width;
    height = videoDecoderContext->height;

    innerScaleContext = sws_getContext(width, height, videoDecoderContext->pix_fmt,
                                       width, height, AV_PIX_FMT_RGB0,
                                       SWS_FAST_BILINEAR,
                                       NULL,
                                       NULL,
                                       NULL);

    outerScaleContext = sws_getContext(width, height, videoDecoderContext->pix_fmt,
                                       width, height, AV_PIX_FMT_YUV420P,
                                       SWS_FAST_BILINEAR,
                                       NULL,
                                       NULL,
                                       NULL);


    if(!innerScaleContext) { qDebug() << "failed to initialize scaler context"; return false; }

    return true;
}

bool VideoReader::configOutput(const char *filename)
{
    avformat_alloc_output_context2(&outputFormatContext, NULL, NULL, filename);
    if(!outputFormatContext) { qDebug() << "failed to create output context"; return false; }

    AVOutputFormat* outputFormat = outputFormatContext->oformat;

    auto prepareOutputContext = [=](AVCodecContext*& encoderContext,
                                    std::function<void (AVCodecContext*, AVCodec*)> configureContextFunc,
                                    std::function<void (AVFrame*)> configureFrameFunc,
                                    AVCodecID codecId, AVFrame*& frame, AVPacket*& packet, QString type)
    {
        auto stream = avformat_new_stream(outputFormatContext, NULL);
        if(!stream) { qDebug() << "failed to allocate output" << type << "stream"; return false; }

        AVCodec* encoder = avcodec_find_encoder(codecId);
        if(!encoder) { qDebug() << "failed to find" << type << "encoder!"; return false; }

        encoderContext = avcodec_alloc_context3(encoder);
        if(!encoderContext) { qDebug() << "failed to create video encoder context!"; return false; }

        configureContextFunc(encoderContext, encoder);

        int result = avcodec_open2(encoderContext, encoder, NULL);
        if(result < 0) { qDebug() << "failed to open audio encoder" << av_make_error(result); return false; }
        if(avcodec_parameters_from_context(stream->codecpar, encoderContext) < 0) { qDebug() << "failed to copy parameters to audio output stream"; return false; }

        packet = av_packet_alloc();
        if(!packet) {qDebug() << "failed allocate output" << type << "packet"; return false;}

        frame = av_frame_alloc();
        if(!frame) { qDebug() << "can't allocate output" << type << "frame"; return false; }

        configureFrameFunc(frame);

        av_frame_get_buffer(frame, 0);

        return true;
    };

    auto configureAudioFrame = [=](AVFrame* frame)
    {
        frame->nb_samples = audioEncoderContext->frame_size;
        frame->format = audioEncoderContext->sample_fmt;
        frame->sample_rate = audioEncoderContext->sample_rate;
        frame->channel_layout = av_get_default_channel_layout(audioDecoderContext->channels);
    };

    auto configureAudioEncoderContext = [=](AVCodecContext* encoderContext, AVCodec* encoder)
    {
        encoderContext->bit_rate = 64000;
        encoderContext->sample_fmt = encoder->sample_fmts[0];
        encoderContext->sample_rate = 44100;
        encoderContext->codec_type = AVMEDIA_TYPE_AUDIO;
        encoderContext->channel_layout = AV_CH_LAYOUT_STEREO;
        encoderContext->channels = av_get_channel_layout_nb_channels(encoderContext->channel_layout);
    };

    auto configureVideoFrame = [=](AVFrame* frame)
    {
        frame->format = videoEncoderContext->pix_fmt;
        frame->width  = videoEncoderContext->width;
        frame->height = videoEncoderContext->height;
    };

    auto configureVideoEncoderContext = [=](AVCodecContext* encoderContext, AVCodec* encoder)
    {
        encoderContext->width = videoDecoderContext->width;
        encoderContext->height = videoDecoderContext->height;
        encoderContext->pix_fmt = encoder->pix_fmts[0];
        encoderContext->gop_size = 10;
        encoderContext->max_b_frames = 1;
        encoderContext->framerate = AVRational{30, 1};
        encoderContext->time_base = AVRational{1, 30};

        av_opt_set(encoderContext->priv_data, "preset", "ultrafast", 0);
        av_opt_set(encoderContext->priv_data, "tune", "zerolatency", 0);
    };

    if(!prepareOutputContext(videoEncoderContext, configureVideoEncoderContext, configureVideoFrame, outputFormat->video_codec, videoOutputFrame, videoOutputPacket, "video")) return false;
    if(!prepareOutputContext(audioEncoderContext, configureAudioEncoderContext, configureAudioFrame, outputFormat->audio_codec, audioOutputFrame, audioOutputPacket, "audio")) return false;

    if(outputFormat->flags & AVFMT_GLOBALHEADER) outputFormat->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;

    int result = 0;
    if(!(outputFormat->flags & AVFMT_NOFILE))
        if((result = avio_open(&outputFormatContext->pb, filename, AVIO_FLAG_WRITE)) < 0)
            { qDebug() << "failed to open file" <<  av_make_error(result); return false; }

    result = avformat_write_header(outputFormatContext, NULL);
    if(result < 0) {qDebug() << "failed to write header!" << av_make_error(result); return false; }

    return true;
}

bool VideoReader::configResampler()
{

    resampleContext = swr_alloc_set_opts(NULL,
                                         av_get_default_channel_layout(audioEncoderContext->channels),
                                         audioEncoderContext->sample_fmt,
                                         audioEncoderContext->sample_rate,
                                         av_get_default_channel_layout(audioDecoderContext->channels),
                                         audioDecoderContext->sample_fmt,
                                         audioDecoderContext->sample_rate,
                                         0, NULL);
    if (!resampleContext) { qDebug() << "Could not allocate resample context"; return false; }

    int error;
    if ((error = swr_init(resampleContext)) < 0) { qDebug() << "Could not open resample context"; swr_free(&resampleContext); return false; }

    return true;
}

// Feeds `frame` to the encoder and writes every packet it produces to the
// output container.  Passing NULL as `frame` puts the encoder into drain
// mode (used by close() to flush buffered packets).
bool VideoReader::encode(AVFrame* frame, AVCodecContext* encoderContext, AVPacket* outputPacket, int streamIndex, QString type)
{
    int response = avcodec_send_frame(encoderContext, frame);
    if(response < 0) { qDebug() << "failed to send" << type << "frame" << av_make_error(response); return false; }

    for(;;)
    {
        response = avcodec_receive_packet(encoderContext, outputPacket);

        // EAGAIN: the encoder needs more input; EOF: fully drained.  Either
        // way there is nothing more to write right now.
        if(response == AVERROR(EAGAIN) || response == AVERROR_EOF) { av_packet_unref(outputPacket); break; }
        if(response < 0) { qDebug() << "failed to encode" << type << "frame!" << response << av_make_error(response); return false; }

        outputPacket->stream_index = streamIndex;

        // Rescale timestamps from the input stream's time base to the output
        // stream's before handing the packet to the muxer.
        AVStream *sourceStream = inputFormatContext->streams[streamIndex];
        AVStream *targetStream = outputFormatContext->streams[streamIndex];
        av_packet_rescale_ts(outputPacket, sourceStream->time_base, targetStream->time_base);

        response = av_interleaved_write_frame(outputFormatContext, outputPacket);
        if(response != 0) { qDebug() << "Failed to write" << type << "packet!" <<  av_make_error(response); av_packet_unref(outputPacket); return false; }

        av_packet_unref(outputPacket);
    }

    return true;
}

I could try to write down shorter example if needed

As far as I know, here are several circumstances that swr_convert_frame might write nothing:

  1. You did not initialize your output frame correctly. If so, checkout the following snippet:
  audioFrame = av_frame_alloc();
  if (audioFrame == NULL) {
    // error handling
  }
  audioFrame->format = /* the sample format you'd like to use */; 
  audioFrame->channel_layout = audioCodecContext->channel_layout;
  audioFrame->nb_samples = audioCodecContext->frame_size;
  if (av_frame_get_buffer(audioFrame, 0) < 0) {
    // error handling
  }
  2. The samples in your input frame are not enough for producing a complete output frame. If so, you need swr_get_delay .
if (swr_convert(swrContext, audioFrame->data,
                audioFrame->nb_samples,
                (uint8_t const**)frame->data, frame->nb_samples) < 0) {
  // handle error
}
// do stuff with your audioFrame
...

while (swr_get_delay(swrContext, audioCodecContext->sample_rate)
       > audioFrame->nb_samples) {
  if (swr_convert(swrContext, audioFrame->data,
                  audioFrame->nb_samples, NULL, 0) < 0) {
    // handle error
  }
  // do stuff with your audioFrame
}


Anyway, more information — at least a minimal reproducible example — should be provided for further diagnosis.

I have to agree that documentation of libav is too poor, it used to drive me crazy too. But cursing authors of libav won't do any help, and what's more, Open Source contributors do not owe you anything.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM