
Upsample and encode audio stream

Basically, after transcoding pcm_alaw 8 kHz to mp3, I can hear only a brief, unrecognizable burst of sound in the first 1-2 seconds. So something is wrong with the pts/dts handling, the packed-to-planar conversion, or the upsampling.

My application transcodes an rtsp camera stream to a file, video and audio. Video works fine, and audio remuxing does as well. Now I have a pcm_alaw 8 kHz audio stream and want to transcode it to an mp4 file along with the video.

The code is quite cumbersome to reduce to a reproducible part, so first I want to know whether my logic is right. Here is my draft process (assume all errors are checked and handled):

Create the encoder:

    codec_ = avcodec_find_encoder(AV_CODEC_ID_MP3);

    enc_ctx_ = avcodec_alloc_context3(codec_);

    enc_ctx_->bit_rate = 64000;
    enc_ctx_->codec_type = AVMEDIA_TYPE_AUDIO;

    enc_ctx_->sample_fmt   = codec_->sample_fmts ? codec_->sample_fmts[0] : AV_SAMPLE_FMT_S32P;

    // functions from here https://www.ffmpeg.org/doxygen/4.1/encode_audio_8c-example.html
    enc_ctx_->sample_rate    = select_sample_rate(codec_);
    enc_ctx_->channel_layout = select_channel_layout(codec_);
    enc_ctx_->channels       = av_get_channel_layout_nb_channels(enc_ctx_->channel_layout);
    enc_ctx_->time_base = (AVRational){1, enc_ctx_->sample_rate};
    enc_ctx_->strict_std_compliance = FF_COMPLIANCE_EXPERIMENTAL;

    if (is_global_header) {
        enc_ctx_->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
    }

    avcodec_open2(enc_ctx_, codec_, nullptr);
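The `select_sample_rate()` helper referenced above comes from the linked FFmpeg encode_audio example: it walks the encoder's zero-terminated `supported_samplerates` list and picks the rate closest to 44100 Hz. A self-contained sketch of that selection logic (the function name and the standalone signature taking a plain `int *` are mine; the real helper takes the `AVCodec *` and reads `codec->supported_samplerates`):

```cpp
#include <cassert>
#include <cstdlib>

// Pick the supported rate closest to 44100 Hz, mirroring select_sample_rate()
// from the FFmpeg encode_audio example. A NULL list means the encoder accepts
// any rate, in which case the example defaults to 44100.
static int select_sample_rate_from(const int *supported) {
    if (!supported)
        return 44100;  // encoder accepts any rate
    int best = 0;
    for (const int *p = supported; *p; ++p) {
        if (!best || std::abs(44100 - *p) < std::abs(44100 - best))
            best = *p;
    }
    return best;
}
```

For an mp3 encoder, whose supported list includes 44100, this is why the draft ends up upsampling 8 kHz input to 44.1 kHz.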

Create the resampler (in_frame):

    audio_fifo_ = av_audio_fifo_alloc(enc_ctx_->sample_fmt, enc_ctx_->channels, 1);
       
    in_ch_layout_    = in_frame->channel_layout;
    in_sample_fmt_   = in_frame->format;
    in_sample_rate_  = in_frame->sample_rate;

    swr_ctx_ = swr_alloc_set_opts(NULL,                       // we're allocating a new context
                             enc_ctx_->channel_layout,        // out_ch_layout
                             enc_ctx_->sample_fmt,            // out_sample_fmt
                             enc_ctx_->sample_rate,           // out_sample_rate
                             in_frame->channel_layout,        // in_ch_layout
                             (AVSampleFormat)in_frame->format, // in_sample_fmt
                             in_frame->sample_rate,            // in_sample_rate
                             0,                                // log_offset
                             NULL);                            // log_ctx
                             
    swr_init(swr_ctx_);

Resample (in_frame, start_pts, start_dts):

    auto resampled_frame = av_frame_alloc();

    auto dst_nb_samples = av_rescale_rnd(swr_get_delay(swr_ctx_, in_frame->sample_rate) +
                                    in_frame->nb_samples, enc_ctx_->sample_rate, in_frame->sample_rate, AV_ROUND_UP);

    // resampled_frame->nb_samples     = dst_nb_samples;
    resampled_frame->format         = enc_ctx_->sample_fmt;
    resampled_frame->channel_layout = enc_ctx_->channel_layout;
    // resampled_frame->channels       = enc_ctx_->channels;
    resampled_frame->sample_rate    = enc_ctx_->sample_rate;

    error = swr_convert_frame(swr_ctx_, resampled_frame, in_frame);

    /* Make the FIFO as large as it needs to be to hold both,
     * the old and the new samples. */
    if (av_audio_fifo_size(audio_fifo_) < dst_nb_samples) {
        av_audio_fifo_realloc(audio_fifo_, dst_nb_samples);
    }

    /* Store the new samples in the FIFO buffer. */
    auto nb_samples = av_audio_fifo_write(audio_fifo_,
                                          reinterpret_cast<void **>(resampled_frame->extended_data),
                                          resampled_frame->nb_samples);


    int delay = 0;
    // trying to split resampled frame to desired chunks
    while (av_audio_fifo_size(audio_fifo_) > 0) {
        const int frame_size = FFMIN(av_audio_fifo_size(audio_fifo_), enc_ctx_->frame_size);

        auto out_frame = av_frame_alloc();


        out_frame->nb_samples       = frame_size;
        out_frame->format           = enc_ctx_->sample_fmt;
        out_frame->channel_layout   = enc_ctx_->channel_layout;
        out_frame->channels         = enc_ctx_->channels;
        out_frame->sample_rate      = enc_ctx_->sample_rate;

        av_frame_get_buffer(out_frame, 0);
        
        av_audio_fifo_read(audio_fifo_, (void **)out_frame->data, frame_size);

        // ***** tried both cases
        out_frame->pts = in_frame->pts + delay;
        out_frame->pkt_dts = in_frame->pkt_dts + delay;
        // swr_next_pts(swr_ctx_, in_frame->pts) + delay;
        // swr_next_pts(swr_ctx_, in_frame->pkt_dts) + delay;

        result.push_back(out_frame);

        delay += frame_size;
    }

    return result;
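The `dst_nb_samples` computation above is the standard way to size the output when the rates differ: `av_rescale_rnd(delay + in_nb_samples, out_rate, in_rate, AV_ROUND_UP)` is just a ceiling-rounded rescale. A plain-integer sketch of that arithmetic (the standalone function is mine, for illustration; in the draft the real call also adds `swr_get_delay()` to account for samples buffered inside the resampler):

```cpp
#include <cassert>
#include <cstdint>

// Ceiling-rounded rescale, the arithmetic behind
// av_rescale_rnd(in_samples, out_rate, in_rate, AV_ROUND_UP):
// out = ceil(in_samples * out_rate / in_rate).
static int64_t dst_samples_round_up(int64_t in_samples, int64_t out_rate,
                                    int64_t in_rate) {
    return (in_samples * out_rate + in_rate - 1) / in_rate;  // ceiling division
}
```

For example, a 320-sample pcm_alaw frame at 8 kHz becomes at most 1764 samples at 44.1 kHz (320 * 44100 / 8000 = 1764 exactly, so no rounding happens in this case).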

Encode and mux (in_frame):

    bool DoesNeedResample(const AVFrame *in_frame) {
        assert(("DoesNeedResample: in_frame is empty", in_frame));
        assert(("DoesNeedResample: encoder is not started", is_init_));

        if (in_frame->sample_rate != enc_ctx_->sample_rate ||
            in_frame->channel_layout != enc_ctx_->channel_layout ||
            in_frame->channels != enc_ctx_->channels ||
            in_frame->format != enc_ctx_->sample_fmt) {
            return true;
        }

        return false;
    }

    av_frame_make_writable(in_frame);


    streamserver::AVFrames encoding_frames;
    if (DoesNeedResample(in_frame)) {
        encoding_frames = Resample(in_frame, 
        av_rescale_q(in_frame->pts, in_audio_stream_timebase_, out_audio_stream_->time_base),
        av_rescale_q(in_frame->pkt_dts, in_audio_stream_timebase_, out_audio_stream_->time_base));
    } else {
        encoding_frames.push_back(av_frame_clone(in_frame));
    }


    for (auto frame : encoding_frames) {
        if ((err = avcodec_send_frame(encoder_ctx, frame)) < 0) {
            AVFrameFree(&frame);
        }

        while (err >= 0) {
            av_init_packet(pkt_);
            pkt_->data = NULL;
            pkt_->size = 0;

            err = avcodec_receive_packet(encoder_ctx, pkt_);
            if (err == AVERROR(EAGAIN) || err == AVERROR_EOF) {
                break;
            } else if (err < 0) {
                break;
            }

            pkt_->stream_index = out_audio_stream_->index;

            av_interleaved_write_frame(ofmt_ctx_, pkt_);
        }

        av_packet_unref(pkt_);
    }
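One step the loop above appears to omit: the encoder emits packet timestamps in `enc_ctx_->time_base` (here `{1, sample_rate}`), and they need to be converted to `out_audio_stream_->time_base` before `av_interleaved_write_frame`, e.g. with `av_packet_rescale_ts(pkt_, enc_ctx_->time_base, out_audio_stream_->time_base)`. A self-contained sketch of the rescale arithmetic behind that call (the standalone function and its argument layout are mine; libav's `av_rescale_q` does the same `ts * src_tb / dst_tb` with rounding-to-nearest):

```cpp
#include <cassert>
#include <cstdint>

// Convert a timestamp counted in src_num/src_den seconds-per-tick into
// dst_num/dst_den ticks, rounding to nearest -- the math done by
// av_rescale_q() / av_packet_rescale_ts().
static int64_t rescale_ts(int64_t ts, int64_t src_num, int64_t src_den,
                          int64_t dst_num, int64_t dst_den) {
    return (ts * src_num * dst_den + (src_den * dst_num) / 2) /
           (src_den * dst_num);
}
```

For example, one encoder frame of 1152 samples in `{1, 44100}` lands at tick 2351 in a `{1, 90000}` stream timebase, and one second stays one second across any pair of timebases.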

The sound in the resulting video is corrupted; see the first paragraph for a description.

In https://www.ffmpeg.org/doxygen/4.1/transcode_aac_8c-example.html there are these lines:

        /*
        * Perform a sanity check so that the number of converted samples is
        * not greater than the number of samples to be converted.
        * If the sample rates differ, this case has to be handled differently
        */
        av_assert0(output_codec_context->sample_rate == input_codec_context->sample_rate);

How should such cases be handled? I tried to split the resampled frames via a FIFO in the example above!
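For what it's worth, the FIFO splitting itself is the usual answer to that assertion: when the rates differ you buffer the resampled samples and read them out in `enc_ctx_->frame_size` chunks. The part of the draft that looks suspect is the pts bookkeeping: `out_frame->pts = in_frame->pts + delay` adds a sample count in the *output* rate to a pts in the *input* stream's timebase, mixing units. A self-contained sketch of the bookkeeping under the assumption (mine, not from the post) that chunk pts are counted in the output timebase `{1, out_sample_rate}`, so consecutive chunks advance by exactly `frame_size` samples:

```cpp
#include <cassert>
#include <cstdint>
#include <algorithm>
#include <vector>

// Timestamp bookkeeping for the FIFO-splitting loop, reduced to pure logic:
// given the pts of the first output sample (in {1, out_sample_rate} ticks),
// return the pts of each frame_size-sample chunk.
static std::vector<int64_t> chunk_pts(int64_t first_pts, int total_samples,
                                      int frame_size) {
    std::vector<int64_t> pts;
    for (int done = 0; done < total_samples;
         done += std::min(frame_size, total_samples - done))
        pts.push_back(first_pts + done);  // each chunk starts done samples in
    return pts;
}
```

So 1764 resampled samples split against an mp3 frame size of 1152 yield two chunks at pts 0 and 1152 (the second one short, left in the FIFO until more input arrives in a real implementation).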

The logic and steps for transcoding audio with upsampling were right. The problem was outside the scope of this topic, so anyone may use this thread as a foundation for their own code if needed.
