[英]Upsample and encode audio stream
基本上在將 pcm_alaw 8khz 轉碼為 mp3 后,我在前 1-2 秒內只能聽到一些簡短甚至是快速的聲音,無法識別的聲音。 因此,pts/dts、打包到平面轉換或上采樣有問題。
我的應用程序將 rtsp 相機流轉碼為文件。 視頻和音頻。 視頻可以正常工作,音頻也可以重新混合。 現在我有 pcm_alaw 8khz 音頻流,想將它與視頻一起轉碼為 mp4 文件。
代碼構建可重現部分非常麻煩,所以首先我想知道我的邏輯是否正確。 這是我的草稿流程(假設檢查並處理了所有錯誤):
創建編碼器:
// Create and open the MP3 encoder.
// NOTE(review): select_sample_rate() (from the linked encode_audio example)
// picks the codec's highest supported rate, so the encoder will NOT run at
// the input's 8 kHz — the input must be upsampled before encoding.
codec_ = avcodec_find_encoder(AV_CODEC_ID_MP3);
enc_ctx_ = avcodec_alloc_context3(codec_);
enc_ctx_->bit_rate = 64000;
enc_ctx_->codec_type = AVMEDIA_TYPE_AUDIO;
// Use the encoder's first (preferred) sample format; fall back to planar s32.
enc_ctx_->sample_fmt = codec_->sample_fmts ? codec_->sample_fmts[0] : AV_SAMPLE_FMT_S32P;
// functions from here https://www.ffmpeg.org/doxygen/4.1/encode_audio_8c-example.html
enc_ctx_->sample_rate = select_sample_rate(codec_);
enc_ctx_->channel_layout = select_channel_layout(codec_);
enc_ctx_->channels = av_get_channel_layout_nb_channels(enc_ctx_->channel_layout);
// Audio encoder time base: one tick per output sample.
enc_ctx_->time_base = (AVRational){1, enc_ctx_->sample_rate};
enc_ctx_->strict_std_compliance = FF_COMPLIANCE_EXPERIMENTAL;
// Container formats such as MP4 need the codec extradata in a global header.
if (is_global_header) {
enc_ctx_->flags |= AV_CODEC_FLAG_GLOBAL_HEADER;
}
avcodec_open2(enc_ctx_, codec_, nullptr);
創建重采樣器(in_frame):
// Create the sample FIFO and the resampler from the first input frame.
// fix: the original av_audio_fifo_alloc() call had an unbalanced ')'.
// The FIFO starts with room for 1 sample; av_audio_fifo_write() grows it.
audio_fifo_ = av_audio_fifo_alloc(enc_ctx_->sample_fmt, enc_ctx_->channels, 1);
// Remember the input parameters so a later format change can be detected.
in_ch_layout_ = in_frame->channel_layout;
in_sample_fmt = in_frame->format; // NOTE(review): other members use a trailing '_' — confirm naming
in_sample_rate_ = in_frame->sample_rate;
swr_ctx_ = swr_alloc_set_opts(NULL,                             // allocate a new context
                              enc_ctx_->channel_layout,         // out_ch_layout
                              enc_ctx_->sample_fmt,             // out_sample_fmt
                              enc_ctx_->sample_rate,            // out_sample_rate
                              in_frame->channel_layout,         // in_ch_layout
                              (AVSampleFormat)in_frame->format, // in_sample_fmt
                              in_frame->sample_rate,            // in_sample_rate
                              0,                                // log_offset
                              NULL);                            // log_ctx
swr_init(swr_ctx_);
重采樣(in_frame,start_pts,start_dts):
auto resampled_frame = av_frame_alloc();
auto dst_nb_samples = av_rescale_rnd(swr_get_delay(swr_ctx_, in_frame->sample_rate) +
in_frame->nb_samples, enc_ctx_->sample_rate, in_frame->sample_rate, AV_ROUND_UP);
// resampled_frame->nb_samples = dst_nb_samples;
resampled_frame->format = enc_ctx_->sample_fmt;
resampled_frame->channel_layout = enc_ctx_->channel_layout;
// resampled_frame->channels = enc_ctx_->channels;
resampled_frame->sample_rate = enc_ctx_->sample_rate;
error = swr_convert_frame(swr_ctx_, resampled_frame, in_frame);
/* Make the FIFO as large as it needs to be to hold both,
* the old and the new samples. */
if (av_audio_fifo_size(audio_fifo_) < dst_nb_samples) {
av_audio_fifo_realloc(audio_fifo_, dst_nb_samples);
}
/* Store the new samples in the FIFO buffer. */
auto nb_samples = av_audio_fifo_write(audio_fifo_,
reinterpret_cast<void **>(resampled_frame->extended_data),
resampled_frame->nb_samples);
int delay = 0;
// trying to split resampled frame to desired chunks
while (av_audio_fifo_size(audio_fifo_) > 0) {
const int frame_size = FFMIN(av_audio_fifo_size(audio_fifo_), enc_ctx_->frame_size);
auto out_frame = av_frame_alloc();
out_frame->nb_samples = frame_size;
out_frame->format = enc_ctx_->sample_fmt;
out_frame->channel_layout = enc_ctx_->channel_layout;
out_frame->channels = enc_ctx_->channels;
out_frame->sample_rate = enc_ctx_->sample_rate;
av_frame_get_buffer(out_frame, 0);
av_audio_fifo_read(audio_fifo_, (void **)out_frame->data, frame_size) < frame_size);
// ***** tried both cases
out_frame->pts = in_frame->pts + delay;
out_frame->pkt_dts = in_frame->pkt_dts + delay;
// swr_next_pts(swr_ctx_, in_frame->pts) + delay;
// swr_next_pts(swr_ctx_, in_frame->pkt_dts) + delay;
result.push_back(out_frame);
delay += frame_size;
}
return result;
編碼和復用(in_frame):
// Returns true when in_frame's audio parameters differ from the encoder's,
// i.e. the frame must pass through the resampler before being encoded.
bool DoesNeedResample(const AVFrame * in_frame) {
assert(("DoesNeedResample: in_frame is empty", in_frame));
assert(("DoesNeedResample: encoder is not started", is_init_));
// The frame can go straight to the encoder only if every parameter matches.
const bool matches_encoder =
    in_frame->sample_rate == enc_ctx_->sample_rate &&
    in_frame->channel_layout == enc_ctx_->channel_layout &&
    in_frame->channels == enc_ctx_->channels &&
    in_frame->format == enc_ctx_->sample_fmt;
return !matches_encoder;
}
// Encode one decoded audio frame (resampling first if needed) and mux it.
av_frame_make_writable(in_frame);
streamserver::AVFrames encoding_frames;
if (DoesNeedResample(in_frame)) {
// Hand Resample() timestamps in the ENCODER time base {1, sample_rate} so
// it can offset them by whole output samples. (The original rescaled them
// to the output STREAM time base, where a sample-count offset is
// meaningless after upsampling.)
encoding_frames = Resample(in_frame,
    av_rescale_q(in_frame->pts, in_audio_stream_timebase_, enc_ctx_->time_base),
    av_rescale_q(in_frame->pkt_dts, in_audio_stream_timebase_, enc_ctx_->time_base));
} else {
encoding_frames.push_back(av_frame_clone(in_frame));
}
for (auto frame : encoding_frames) {
err = avcodec_send_frame(encoder_ctx, frame);
// avcodec_send_frame() does not take ownership: free the frame on both the
// success and the error path (the original leaked it on success).
AVFrameFree(&frame);
while (err >= 0) {
// avcodec_receive_packet() unrefs and refills pkt_ itself; the deprecated
// av_init_packet() / data = NULL dance is unnecessary.
err = avcodec_receive_packet(encoder_ctx, pkt_);
if (err == AVERROR(EAGAIN) || err == AVERROR_EOF) {
break;
} else if (err < 0) {
break;
}
// fix: encoder output timestamps are in enc_ctx_->time_base; convert them
// to the muxer stream's time base before writing.
av_packet_rescale_ts(pkt_, enc_ctx_->time_base, out_audio_stream_->time_base);
pkt_->stream_index = out_audio_stream_->index;
// av_interleaved_write_frame() takes ownership of the packet reference.
av_interleaved_write_frame(ofmt_ctx_, pkt_);
}
av_packet_unref(pkt_);
}
結果視頻中的聲音已損壞,請參閱第一段的說明。
在https://www.ffmpeg.org/doxygen/4.1/transcode_aac_8c-example.html中有幾行:
/*
* Perform a sanity check so that the number of converted samples is
* not greater than the number of samples to be converted.
* If the sample rates differ, this case has to be handled differently
*/
av_assert0(output_codec_context->sample_rate == input_codec_context->sample_rate);
如何處理此類情況? 在上面的示例中,我嘗試通過 fifo 拆分重新采樣的幀!
使用上採樣對音頻進行轉碼的整體邏輯和步驟是正確的;實際出錯的原因與本主題描述的流程無關。因此,如有需要,任何人都可以將此討論串作為自己代碼的基礎。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.