Upsample and encode audio stream

Question

Basically, after transcoding pcm_alaw 8 kHz audio to MP3, I can hear only a brief, unrecognizable sound in the first 1-2 seconds. So something is wrong with the pts/dts, the packed-to-planar conversion, or the upsampling.

My application transcodes an RTSP camera stream (video and audio) to a file. Video works fine, and so does audio remuxing. Now I have a pcm_alaw 8 kHz audio stream and want to transcode it into an MP4 file along with the video.

The code is too cumbersome to reduce to a reproducible example, so first I want to know whether my logic is right. Here is my draft process (assume all errors are checked and handled):

create encoder:

    // Look up the MP3 encoder and allocate a codec context bound to it.
    codec_ = avcodec_find_encoder(AV_CODEC_ID_MP3);

    enc_ctx_ = avcodec_alloc_context3(codec_);

    enc_ctx_->bit_rate = 64000;
    enc_ctx_->codec_type = AVMEDIA_TYPE_AUDIO;

    // Take the first sample format the encoder lists; fall back to S32P when it lists none.
    enc_ctx_->sample_fmt   = codec_->sample_fmts ? codec_->sample_fmts[0] : AV_SAMPLE_FMT_S32P;

    // functions from here https://www.ffmpeg.org/doxygen/4.1/encode_audio_8c-example.html
    enc_ctx_->sample_rate    = select_sample_rate(codec_);
    enc_ctx_->channel_layout = select_channel_layout(codec_);
    enc_ctx_->channels       = av_get_channel_layout_nb_channels(enc_ctx_->channel_layout);
    // Encoder time base is 1/sample_rate. Aggregate initialization is used instead of the
    // C compound literal `(AVRational){...}`, which is not standard C++.
    enc_ctx_->time_base = AVRational{1, enc_ctx_->sample_rate};
    enc_ctx_->strict_std_compliance = FF_COMPLIANCE_EXPERIMENTAL;

    // Request global headers when the caller asked for them via is_global_header.
    if (is_global_header) {
        enc_ctx_->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
    }

    avcodec_open2(enc_ctx_, codec_, nullptr);

create resampler (in_frame):

    // FIFO for converted samples, using the encoder's sample format and channel count.
    // It is allocated with room for a single sample and resized later as needed.
    // (The original line had an unbalanced extra ')' here.)
    audio_fifo_ = av_audio_fifo_alloc(enc_ctx_->sample_fmt, enc_ctx_->channels, 1);

    // Record the input parameters carried by in_frame.
    in_ch_layout_ = in_frame->channel_layout;
    in_sample_fmt = in_frame->format;
    in_sample_rate_ = in_frame->sample_rate;

    // Resampler: input parameters come from in_frame, output parameters from the encoder context.
    swr_ctx_ = swr_alloc_set_opts(nullptr,                    // we're allocating a new context
                             enc_ctx_->channel_layout,        // out_ch_layout
                             enc_ctx_->sample_fmt,            // out_sample_fmt
                             enc_ctx_->sample_rate,           // out_sample_rate
                             in_frame->channel_layout,        // in_ch_layout
                             (AVSampleFormat)in_frame->format, // in_sample_fmt
                             in_frame->sample_rate,            // in_sample_rate
                             0,                                // log_offset
                             nullptr);                         // log_ctx

    swr_init(swr_ctx_);

resample (in_frame, start_pts, start_dts):

    auto resampled_frame = av_frame_alloc();

    auto dst_nb_samples = av_rescale_rnd(swr_get_delay(swr_ctx_, in_frame->sample_rate) +
                                    in_frame->nb_samples, enc_ctx_->sample_rate, in_frame->sample_rate, AV_ROUND_UP);

    // resampled_frame->nb_samples     = dst_nb_samples;
    resampled_frame->format         = enc_ctx_->sample_fmt;
    resampled_frame->channel_layout = enc_ctx_->channel_layout;
    // resampled_frame->channels       = enc_ctx_->channels;
    resampled_frame->sample_rate    = enc_ctx_->sample_rate;

    error = swr_convert_frame(swr_ctx_, resampled_frame, in_frame);

    /* Make the FIFO as large as it needs to be to hold both,
     * the old and the new samples. */
    if (av_audio_fifo_size(audio_fifo_) < dst_nb_samples) {
        av_audio_fifo_realloc(audio_fifo_, dst_nb_samples);
    }

    /* Store the new samples in the FIFO buffer. */
    auto nb_samples = av_audio_fifo_write(audio_fifo_,
                                          reinterpret_cast<void **>(resampled_frame->extended_data),
                                          resampled_frame->nb_samples);


    int delay = 0;
    // trying to split resampled frame to desired chunks
    while (av_audio_fifo_size(audio_fifo_) > 0) {
        const int frame_size = FFMIN(av_audio_fifo_size(audio_fifo_), enc_ctx_->frame_size);

        auto out_frame = av_frame_alloc();


        out_frame->nb_samples       = frame_size;
        out_frame->format           = enc_ctx_->sample_fmt;
        out_frame->channel_layout   = enc_ctx_->channel_layout;
        out_frame->channels         = enc_ctx_->channels;
        out_frame->sample_rate      = enc_ctx_->sample_rate;

        av_frame_get_buffer(out_frame, 0);
        
        av_audio_fifo_read(audio_fifo_, (void **)out_frame->data, frame_size) < frame_size);

    // ***** tried both cases
        out_frame->pts = in_frame->pts + delay;
        out_frame->pkt_dts = in_frame->pkt_dts + delay;
        // swr_next_pts(swr_ctx_, in_frame->pts) + delay;
        // swr_next_pts(swr_ctx_, in_frame->pkt_dts) + delay;

        result.push_back(out_frame);

        delay += frame_size;
    }

    return result;

encoding and muxing (in_frame):

    /// Reports whether in_frame has to go through the resampler before encoding.
    ///
    /// @param in_frame  Frame to inspect; asserted to be non-null.
    /// @return true when the frame's sample rate, channel layout, channel count or
    ///         sample format differs from the matching field of enc_ctx_; false
    ///         when all four agree.
    bool DoesNeedResample(const AVFrame * in_frame) {
        assert(("DoesNeedResample: in_frame is empty", in_frame));
        assert(("DoesNeedResample: encoder is not started", is_init_));

        const bool rate_matches   = in_frame->sample_rate == enc_ctx_->sample_rate;
        const bool layout_matches = in_frame->channel_layout == enc_ctx_->channel_layout;
        const bool count_matches  = in_frame->channels == enc_ctx_->channels;
        const bool format_matches = in_frame->format == enc_ctx_->sample_fmt;

        // Any single mismatch is enough to require resampling.
        return !(rate_matches && layout_matches && count_matches && format_matches);
    }

    // Encode in_frame (resampling first when its parameters differ from the encoder's)
    // and write every packet the encoder produces to the output container.
    av_frame_make_writable(in_frame);


    // Build the list of frames to feed to the encoder.
    streamserver::AVFrames encoding_frames;
    if (DoesNeedResample(in_frame)) {
        // pts and pkt_dts are rescaled from the input stream time base to the
        // output stream time base before being passed to Resample.
        encoding_frames = Resample(in_frame, 
        av_rescale_q(in_frame->pts, in_audio_stream_timebase_, out_audio_stream_->time_base),
        av_rescale_q(in_frame->pkt_dts, in_audio_stream_timebase_, out_audio_stream_->time_base));
    } else {
        // Parameters already match the encoder: encode a clone of the input frame as-is.
        encoding_frames.push_back(av_frame_clone(in_frame));
    }


    for (auto frame : encoding_frames) {
        // NOTE(review): the frame is released only on this send-failure path; confirm
        // who frees frames that were sent successfully.
        if ((err = avcodec_send_frame(encoder_ctx, frame)) < 0) {
            AVFrameFree(&frame);
        }

        // Drain every packet the encoder has ready; skipped entirely when sending failed.
        while (err >= 0) {
            pkt_->data = NULL;
            pkt_->size = 0;
            av_init_packet(pkt_);

            err = avcodec_receive_packet(encoder_ctx, pkt_);
            // Both branches leave the loop: EAGAIN/EOF and any other negative result.
            if (err == AVERROR(EAGAIN) || err == AVERROR_EOF) {
                break;
            } else if (err < 0) {
                break;
            }

            pkt_->stream_index = out_audio_stream_->index;

            // NOTE(review): packet timestamps are written as produced by the encoder, with no
            // rescale in this snippet; confirm encoder and output stream time bases agree.
            av_interleaved_write_frame(ofmt_ctx_, pkt_);
        }

        av_packet_unref(pkt_);
    }

The sound in the resulting video is corrupted; see the first paragraph for a description.

In https://www.ffmpeg.org/doxygen/4.1/transcode_aac_8c-example.html there are lines:

        /*
        * Perform a sanity check so that the number of converted samples is
        * not greater than the number of samples to be converted.
        * If the sample rates differ, this case has to be handled differently
        */
        av_assert0(output_codec_context->sample_rate == input_codec_context->sample_rate);

How should such cases be handled? I tried to split the resampled frames via the FIFO in the example above.

Answer 1

The logic and steps for transcoding audio with upsampling were right; the problem was outside the scope of this question. So anyone may use this thread as a foundation for their own code if needed.

Upsample and encode audio stream

Question

1 answers

solution1
0 ACCEPTED 2022-06-08 11:59:52

Upsample and encode audio stream

Question

1 answers

solution1 0 ACCEPTED 2022-06-08 11:59:52

solution1
0 ACCEPTED 2022-06-08 11:59:52