简体   繁体   English

使用 libav* 库的 FFMPEG 音频转码

[英]FFMPEG audio transcoding using libav* libraries

I am writing an audio transcoding application using the FFmpeg libraries.我正在使用 FFmpeg 库编写音频转码应用程序。 Here is my code:这是我的代码:

     * File:   main.cpp
     * Author: vinod
     * Compile with "g++ -std=c++11 -o audiotranscode main.cpp -lavformat -lavcodec -lavutil -lavfilter"

    #if !defined PRId64 || PRI_MACROS_BROKEN
    #undef PRId64
    #define PRId64 "lld"

    #define __STDC_FORMAT_MACROS

    #ifdef   __cplusplus
    extern "C" {

    #include <stdio.h>
    #include <stdlib.h>
    #include <sys/types.h>
    #include <stdint.h>
    #include <libavutil/imgutils.h>
    #include <libavutil/samplefmt.h>
    #include <libavutil/frame.h>
    #include <libavutil/timestamp.h>
    #include <libavformat/avformat.h>
    #include <libavfilter/avfilter.h>
    #include <libavfilter/buffersrc.h>
    #include <libavfilter/buffersink.h>
    #include <libswscale/swscale.h>
    #include <libavutil/opt.h>

    #ifdef   __cplusplus

    #include <iostream>
    using namespace std;

    /* Index of the selected input audio stream, and the "got output" flags
     * filled in by the decode / encode calls. */
    int select_stream;
    int got_frame;
    int got_packet;

    /* Input (demuxer) and output (muxer) contexts. */
    AVFormatContext *in_fmt_ctx = nullptr;
    AVFormatContext *out_fmt_ctx = nullptr;

    /* Decoder / encoder descriptions. */
    AVCodec *dec_codec = nullptr;
    AVCodec *enc_codec = nullptr;

    /* The single audio stream created in the output file. */
    AVStream *audio_st = nullptr;

    /* Codec contexts for encoding and decoding. */
    AVCodecContext *enc_ctx = nullptr;
    AVCodecContext *dec_ctx = nullptr;

    /* Decoded frame and the frame pulled back out of the filter graph. */
    AVFrame *pFrame = nullptr;
    AVFrame *pFrameFiltered = nullptr;

    /* Filter graph together with its source and sink endpoints. */
    AVFilterGraph *filter_graph = nullptr;
    AVFilterContext *buffersrc_ctx = nullptr;
    AVFilterContext *buffersink_ctx = nullptr;

    /* Packet reused while reading the input. */
    AVPacket packet;

    /* Input and output file names. */
    std::string inFileName = "/home/vinod/vinod/Media/univac.webm";
    std::string outFileName = "audio_extracted.m4a";

    /* Target encoding parameters. */
    int target_bit_rate = 128000;
    int sample_rate = 22050;
    int channels = 1;
    AVSampleFormat sample_fmt = AV_SAMPLE_FMT_S16;

    /* Filter chain parsed between the buffer source and the buffer sink. */
    std::string filter_description = "aresample=22050,aformat=sample_fmts=s16:channel_layouts=mono";

    int log_averror(int errcode)
            char *errbuf = (char *) calloc(AV_ERROR_MAX_STRING_SIZE, sizeof(char));
            av_strerror(errcode, errbuf, AV_ERROR_MAX_STRING_SIZE);
            std::cout << "Error - " << errbuf << std::endl;
            delete [] errbuf;
            return -1;

     * Initialize conversion filter */
    /* Build the audio filter graph used between decoder and encoder.
     *
     * Creates an "abuffer" source named "in" and an "abuffersink" named "out"
     * inside a freshly allocated graph, restricts the sink through the
     * "sample_fmts", "channel_layouts" and "sample_rates" options, and then
     * parses the global filter_description between the two endpoints.
     * Results are stored in the globals filter_graph, buffersrc_ctx and
     * buffersink_ctx.
     *
     * Returns 0 at the end of the visible success path; the visible failure
     * branches return -1 or the negative libav error code.
     *
     * NOTE(review): this listing is incomplete - brace-only lines, trailing
     * argument lines and some if-bodies are missing. Restore them from the
     * original source before compiling. */
    int initialize_audio_filter()
            char args[512];
            int ret;
            AVFilter *buffersrc = avfilter_get_by_name("abuffer");
            AVFilter *buffersink = avfilter_get_by_name("abuffersink");
            AVFilterInOut *outputs = avfilter_inout_alloc();
            AVFilterInOut *inputs = avfilter_inout_alloc();
            filter_graph = avfilter_graph_alloc();
            /* Terminated lists handed to the sink options below: the global
             * sample_fmt, and the channel layout / sample rate read from the
             * codec context of output stream 0. */
            const enum AVSampleFormat out_sample_fmts[] = {sample_fmt, AV_SAMPLE_FMT_NONE};
            const int64_t out_channel_layouts[] = {av_get_default_channel_layout(out_fmt_ctx -> streams[0] -> codec -> channels), -1};
            const int out_sample_rates[] = {out_fmt_ctx -> streams[0] -> codec -> sample_rate, -1};

            /* The decoder reported no channel layout: derive a default one
             * from its channel count. */
            if (!dec_ctx->channel_layout)
                    dec_ctx->channel_layout = av_get_default_channel_layout(dec_ctx->channels);

            /* Describe the source pad: time base of the selected input stream,
             * followed by sample rate, sample format and channel layout.
             * NOTE(review): the remaining snprintf arguments are missing from
             * this listing. */
            snprintf(args, sizeof(args), "time_base=%d/%d:sample_rate=%d:sample_fmt=%s:channel_layout=0x%" PRIx64,
                     in_fmt_ctx -> streams[select_stream] -> time_base.num, in_fmt_ctx -> streams[select_stream] -> time_base.den,
            ret = avfilter_graph_create_filter(&buffersrc_ctx, buffersrc, "in", args, NULL, filter_graph);

            /* This branch returns -1, whereas the later branches return ret. */
            if (ret < 0) {
                    av_log(NULL, AV_LOG_ERROR, "Cannot create buffer source\n");
                    return -1;

            ret = avfilter_graph_create_filter(&buffersink_ctx, buffersink, "out", NULL, NULL, filter_graph);

            if (ret < 0) {
                    av_log(NULL, AV_LOG_ERROR, "Cannot create buffer sink\n");
                    return ret;

            /* Constrain the sink's sample formats.
             * NOTE(review): the trailing argument(s) of this call are cut off. */
            ret = av_opt_set_int_list(buffersink_ctx, "sample_fmts", out_sample_fmts, -1,

            if (ret < 0) {
                    av_log(NULL, AV_LOG_ERROR, "Cannot set output sample format\n");
                    return ret;

            /* Constrain the sink's channel layouts (call is cut off as above). */
            ret = av_opt_set_int_list(buffersink_ctx, "channel_layouts", out_channel_layouts, -1,

            if (ret < 0) {
                    av_log(NULL, AV_LOG_ERROR, "Cannot set output channel layout\n");
                    return ret;

            /* Constrain the sink's sample rates (call is cut off as above). */
            ret = av_opt_set_int_list(buffersink_ctx, "sample_rates", out_sample_rates, -1,

            if (ret < 0) {
                    av_log(NULL, AV_LOG_ERROR, "Cannot set output sample rate\n");
                    return ret;

            /* Endpoints for the filter graph. */
            /* "outputs" is labelled "in" and points at the buffer source. */
            outputs -> name = av_strdup("in");
            outputs -> filter_ctx = buffersrc_ctx;
            outputs -> pad_idx = 0;
            outputs -> next = NULL;
            /* Endpoints for the filter graph. */
            /* "inputs" is labelled "out" and points at the buffer sink. */
            inputs -> name = av_strdup("out");
            inputs -> filter_ctx = buffersink_ctx;
            inputs -> pad_idx = 0;
            inputs -> next = NULL;
            /* Local copy of the global filter chain description. */
            string filter_desc = filter_description;

            /* Parse the textual chain between the two endpoints.
             * NOTE(review): the body of this if is missing from the listing. */
            if ((ret = avfilter_graph_parse_ptr(filter_graph, filter_desc.c_str(), &inputs, &outputs, NULL)) < 0) {

            /* Configure the assembled graph.
             * NOTE(review): the body of this if is missing from the listing. */
            if ((ret = avfilter_graph_config(filter_graph, NULL)) < 0) {

            /* Print summary of the sink buffer
             * Note: args buffer is reused to store channel layout string */
            AVFilterLink *outlink = buffersink_ctx->inputs[0];
            av_get_channel_layout_string(args, sizeof(args), -1, outlink->channel_layout);
            /* NOTE(review): the last av_log argument (presumably args) and the
             * closing parenthesis are missing from the listing. */
            av_log(NULL, AV_LOG_INFO, "Output: srate:%dHz fmt:%s chlayout:%s\n",
                   (int) outlink->sample_rate,
                   (char *) av_x_if_null(av_get_sample_fmt_name((AVSampleFormat) outlink->format), "?"),
            return 0;

    /* Program entry point: transcode the audio stream of inFileName into
     * outFileName.
     *
     * Visible steps: open the input and pick an audio stream, open its
     * decoder, allocate the output context and encoder, open the output file
     * and write the header, then loop read packet -> decode -> filter ->
     * encode -> write, and finally drain the encoder.
     *
     * argc / argv are not used in the visible code.
     * Returns 0 at the end of the visible success path; the visible failure
     * branches return -1 or a negative libav error code.
     *
     * NOTE(review): this listing is incomplete - brace-only lines, trailing
     * argument lines and some statements are missing. Restore them from the
     * original source before compiling. */
    int main(int argc, char **argv)
            int ret;
            cout << "Hello World" << endl;

            /* open input file, and allocate format context */
            if (avformat_open_input(&in_fmt_ctx, inFileName.c_str(), NULL, NULL) < 0) {
                    std::cout << "error opening input file - " << inFileName << std::endl;
                    return -1;

            /* retrieve stream information */
            /* NOTE(review): no return is visible in this failure branch -
             * confirm whether execution is meant to continue. */
            if (avformat_find_stream_info(in_fmt_ctx, NULL) < 0) {
                    std::cerr << "Could not find stream information in the input file " << inFileName << std::endl;

            /* Dump format details */
            printf("\n ---------------------------------------------------------------------- \n");
            av_dump_format(in_fmt_ctx, 0, inFileName.c_str(), 0);
            printf("\n ---------------------------------------------------------------------- \n");
            /* Choose a audio stream */
            /* The matching decoder is returned through dec_codec. */
            select_stream = av_find_best_stream(in_fmt_ctx, AVMEDIA_TYPE_AUDIO, -1, -1, &dec_codec, 0);

            if (select_stream == AVERROR_STREAM_NOT_FOUND) {
                    std::cerr << "No audio stream found" << std::endl;
                    return -1;

            if (select_stream == AVERROR_DECODER_NOT_FOUND) {
                    std::cerr << "No suitable decoder found" << std::endl;
                    return -1;

            /* Use the codec context embedded in the selected input stream and
             * set its "refcounted_frames" option to 1 before opening it. */
            dec_ctx = in_fmt_ctx -> streams[ select_stream] -> codec;
            av_opt_set_int(dec_ctx, "refcounted_frames", 1, 0);

            /* init the audio decoder */
            if ((ret = avcodec_open2(dec_ctx, dec_codec, NULL)) < 0) {
                    av_log(NULL, AV_LOG_ERROR, "Cannot open audio decoder\n");
                    return ret;

            /* allocate output context */
            /* NOTE(review): the trailing argument(s) of this call are cut off
             * (presumably the output file name). */
            ret = avformat_alloc_output_context2(&out_fmt_ctx, NULL, NULL,

            if (ret < 0) {
                    std::cerr << "Could not create output context for the file " << outFileName << std::endl;
                    return -1;

            /* find the encoder */
            /* The codec id is the output format's default audio codec. */
            enum AVCodecID codec_id = out_fmt_ctx -> oformat -> audio_codec;
            enc_codec = avcodec_find_encoder(codec_id);

            if (!(enc_codec)) {
                    std::cerr << "Could not find encoder for - " << avcodec_get_name(codec_id) << std::endl;
                    return -1;

            /* add a new stream */
            audio_st = avformat_new_stream(out_fmt_ctx, enc_codec);

            /* NOTE(review): no return is visible in this failure branch, yet
             * audio_st is dereferenced immediately below. */
            if (!audio_st) {
                    std::cerr << "Could not add audio stream - " << std::endl;

            /* Initialise audio codec */
            /* Encoder parameters come from the file-scope target settings. */
            audio_st -> id = out_fmt_ctx -> nb_streams - 1;
            enc_ctx = audio_st -> codec;
            enc_ctx -> codec_id = codec_id;
            enc_ctx -> codec_type = AVMEDIA_TYPE_AUDIO;
            enc_ctx -> bit_rate = target_bit_rate;
            enc_ctx -> sample_rate = sample_rate;
            enc_ctx -> sample_fmt = sample_fmt;
            enc_ctx -> channels = channels;
            enc_ctx -> channel_layout = av_get_default_channel_layout(enc_ctx -> channels);

            /* Some formats want stream headers to be separate. */
            if (out_fmt_ctx -> oformat -> flags & AVFMT_GLOBALHEADER) {
                    enc_ctx -> flags |= CODEC_FLAG_GLOBAL_HEADER;

            /* Open the encoder on the codec context of output stream 0. */
            ret = avcodec_open2(out_fmt_ctx -> streams[0] -> codec, enc_codec, NULL);

            if (ret < 0) {
                    std::cerr << "Could not create codec context for the file " << outFileName << std::endl;
                    return -1;

            /* Initialize filter */
            /* NOTE(review): no call to initialize_audio_filter() is visible
             * here, although buffersrc_ctx / buffersink_ctx are used in the
             * loop below - presumably the call was lost from the listing. */

            /* Open the output file unless the format has AVFMT_NOFILE set. */
            if (!(out_fmt_ctx -> oformat -> flags & AVFMT_NOFILE)) {
                    /* This local ret shadows the function-level ret.
                     * NOTE(review): the trailing argument(s) of avio_open are
                     * cut off. */
                    int ret = avio_open(& out_fmt_ctx -> pb, outFileName.c_str(),

                    if (ret < 0) {
                            return -1;

            /* Write header */
            /* NOTE(review): the inner test looks at ret, which does not hold
             * the result of avformat_write_header() - confirm against the
             * original source. */
            if (avformat_write_header(out_fmt_ctx, NULL) < 0) {
                    if (ret < 0) {
                            return -1;

            /* Allocate frame */
            pFrame = av_frame_alloc();

            if (!pFrame) {
                    std::cerr << "Could not allocate frame\n";
                    return -1;

            /* Second frame, filled from the filter graph sink. */
            pFrameFiltered = av_frame_alloc();

            if (!pFrameFiltered) {
                    std::cerr << "Could not allocate frame\n";
                    return -1;

            packet.data = NULL;
            packet.size = 0;

            /* Read packet from the stream */
            while (av_read_frame(in_fmt_ctx, &packet) >= 0) {
                    /* Only packets of the selected audio stream are decoded. */
                    if (packet.stream_index == select_stream) {
                            ret = avcodec_decode_audio4(dec_ctx, pFrame, &got_frame, &packet);

                            if (ret < 0) {
                                    return ret;

                            /* These prints and the pts assignment run before
                             * got_frame is checked.
                             * NOTE(review): "%ld" is used for the pts values;
                             * if they are int64_t, PRId64 would be the
                             * portable conversion - confirm. */
                            printf("Decoded packet pts : %ld ", packet.pts);
                            printf("Frame Best Effor pts : %ld \n", pFrame->best_effort_timestamp);
                            /* Set frame pts */
                            pFrame -> pts = av_frame_get_best_effort_timestamp(pFrame);

                            if (got_frame) {
                                    /* push the decoded frame into the filtergraph */
                                    ret = av_buffersrc_add_frame_flags(buffersrc_ctx, pFrame, AV_BUFFERSRC_FLAG_KEEP_REF);

                                    if (ret < 0) {
                                            return ret;

                                    /* pull filtered frames from the filtergraph */
                                    while (1) {
                                            ret = av_buffersink_get_frame(buffersink_ctx, pFrameFiltered);

                                            /* NOTE(review): the body of this if is missing
                                             * (presumably it leaves the pull loop). */
                                            if ((ret == AVERROR(EAGAIN)) || (ret == AVERROR_EOF)) {

                                            if (ret < 0) {
                                                    printf("Error while getting filtered frames from filtergraph\n");
                                                    return -1;

                                            /* Initialize the packets */
                                            AVPacket encodedPacket = {0};
                                            /* Encode the filtered frame; got_packet reports
                                             * whether a packet was produced. */
                                            ret = avcodec_encode_audio2(out_fmt_ctx -> streams[0] -> codec, &encodedPacket, pFrameFiltered, &got_packet);

                                            if (!ret && got_packet && encodedPacket.size) {
                                                    /* Set correct pts and dts */
                                                    /* Rescale from the sink input link's time base
                                                     * to the output stream's time base. */
                                                    if (encodedPacket.pts != AV_NOPTS_VALUE) {
                                                            encodedPacket.pts = av_rescale_q(encodedPacket.pts, buffersink_ctx -> inputs[0] -> time_base,
                                                                                             out_fmt_ctx -> streams[0] -> time_base);

                                                    if (encodedPacket.dts != AV_NOPTS_VALUE) {
                                                            encodedPacket.dts = av_rescale_q(encodedPacket.dts, buffersink_ctx -> inputs[0] -> time_base,
                                                                                             out_fmt_ctx -> streams[0] -> time_base);

                                                    printf("Encoded packet pts %ld\n", encodedPacket.pts);
                                                    /* Write the compressed frame to the media file. */
                                                    ret = av_interleaved_write_frame(out_fmt_ctx, &encodedPacket);

                                                    if (ret < 0) {
                                                            return -1;
                                            /* Encoder reported an error. */
                                            } else if (ret < 0) {
                                                    return -1;



            /* Flush delayed frames from encoder*/
            /* The loop is entered only if got_packet is still non-zero from the
             * last encode call above; a NULL frame is passed on each pass. */
            while (got_packet) {
                    AVPacket encodedPacket = {0};
                    ret = avcodec_encode_audio2(out_fmt_ctx -> streams[0] -> codec, &encodedPacket, NULL, &got_packet);

                    if (!ret && got_packet && encodedPacket.size) {
                            /* Set correct pts and dts */
                            if (encodedPacket.pts != AV_NOPTS_VALUE) {
                                    encodedPacket.pts = av_rescale_q(encodedPacket.pts, buffersink_ctx -> inputs[0] -> time_base,
                                                                     out_fmt_ctx -> streams[0] -> time_base);

                            if (encodedPacket.dts != AV_NOPTS_VALUE) {
                                    encodedPacket.dts = av_rescale_q(encodedPacket.dts, buffersink_ctx -> inputs[0] -> time_base,
                                                                     out_fmt_ctx -> streams[0] -> time_base);

                            printf("Encoded packet pts %ld\n", encodedPacket.pts);
                            /* Write the compressed frame to the media file. */
                            ret = av_interleaved_write_frame(out_fmt_ctx, &encodedPacket);

                            if (ret < 0) {
                                    return -1;
                    } else if (ret < 0) {
                            return -1;

            /* Write Trailer */
            /* NOTE(review): no trailer-writing call is visible here. */

            /* NOTE(review): the statement guarded by this if is missing from
             * the listing. */
            if (dec_ctx)


            /* Close the output file if one was opened by this program. */
            if (!(out_fmt_ctx -> oformat -> flags & AVFMT_NOFILE))
                    avio_close(out_fmt_ctx -> pb);
            return 0;

The audio file after transcoding has the same duration as the input.转码后的音频文件与输入的时长相同。 But it is completely noisy.但它完全是噪音。 Can somebody tell me what I am doing wrong here?有人可以告诉我我在这里做错了什么吗?

I have found out where the problem was and it has been resolved.我发现问题出在哪里并且已经解决。

When the output file was opened in Audacity, it could be seen that unwanted silences had been inserted into the audio signal.在 Audacity 中打开输出文件时,可以看到音频信号中插入了不需要的静音。 The problem was with the 'number of samples per frame' supplied to the encoder.问题在于提供给编码器的“每帧样本数”。

Different codecs expect different frame sizes for encoding.不同的编解码器期望不同的帧大小进行编码。 The AAC encoder expects a frame size of 1024 samples. This can be seen by inspecting enc_ctx->frame_size after avcodec_open2() has been called.AAC 编码器期望的帧大小为 1024 个样本。这可以通过在调用 avcodec_open2() 之后查看 enc_ctx->frame_size 来确认。

The filter needs to supply the encoder with frames of 1024 samples per channel.过滤器需要向编码器提供每个通道包含 1024 个样本的帧。 So in my code, pFrameFiltered needs to have exactly 1024 samples per channel.所以在我的代码中, pFrameFiltered 每个通道需要正好有 1024 个样本。 If it has fewer than 1024, the encoder appends zeros to pad it to 1024 samples and then encodes it.如果少于 1024 个,编码器会补零使其达到 1024 个样本,然后再进行编码。

This can be solved either by maintaining our own FIFO queue or by using one of the FFmpeg audio filters.这可以通过维护我们自己的 FIFO 队列或使用 FFmpeg 提供的音频过滤器来解决。 We need to use the filter asetnsamples=n=1024:p=0, as explained in the FFmpeg filters documentation.我们需要使用过滤器 asetnsamples=n=1024:p=0 ,如 FFmpeg 过滤器文档所述。 So the required change was:所以需要的改动是:

`string filter_description = "aresample=22050,aformat=sample_fmts=s16:channel_layouts=mono,asetnsamples=n=1024:p=0";`

Just play around with the value of n in the filter to understand better.只需在过滤器中调整n的值即可更好地理解。 Check the enc_ctx->frame_size field set by avcodec_open2( ) and set the value of n appropriately.检查 avcodec_open2( ) 设置的enc_ctx->frame_size字段并适当设置n的值。

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

粤ICP备18138465号  © 2020-2024 STACKOOM.COM