Azure 文本到语音到 PyAudio Stream

Question

I am trying to stream the output of an Azure text-to-speech instance to my speaker with PyAudio, using Microsoft's sample code. 我正在尝试使用微软的示例代码，通过 PyAudio 将 Azure 文本到语音实例的输出以流的方式发送到我的扬声器。

I tried to write to PyAudio's stream inside Azure's callback method `write`, but it gives me this error: 我试图在 Azure 的回调方法 `write` 中写入 PyAudio 的 stream，但它给了我这个错误：

`my_stream.write(audio_buffer)
File "/opt/homebrew/lib/python3.10/site-packages/pyaudio.py", line 589
, in write pa.write_stream(self._stream, frames, num_frames,
TypeError: argument 2 must be read-only bytes-like object, not memoryview`

How do I handle Azure's output so that the PyAudio stream accepts it as audio data? 我如何处理 Azure 的输出，以便 PyAudio 的 stream 接受它作为音频数据？

Full code:完整代码：

`import azure.cognitiveservices.speech as speechsdk
import os, sys, pyaudio
# Shared PyAudio instance; used below to query the sample size and open the
# playback stream.
pa = pyaudio.PyAudio()

# The text that the synthesizer is asked to speak.
my_text = "My emotional experiences are varied, but mostly involve trying to find a balance between understanding others’ feelings and managing my own. I also explore the intersection of emotion and technology through affective computing and related research."

# Playback parameters for the PyAudio output stream ('width' and 'frames' are
# not read anywhere in the code shown here).
# NOTE(review): these values have to match the audio format the synthesizer
# actually delivers; no output format is set on the speech config in this code,
# so verify that 44100 Hz and this channel count are what arrives -- TODO confirm.
voc_data = {
    'channels': 1 if sys.platform == 'darwin' else 2,
    'rate': 44100,
    'width': pa.get_sample_size(pyaudio.paInt16),
    'format': pyaudio.paInt16,
    'frames': []
}

# Output stream that the synthesis callback writes audio chunks to.
my_stream = pa.open(format=voc_data['format'],
                    channels=voc_data['channels'],
                    rate=voc_data['rate'],
                    output=True)

# Azure credentials come from the environment; os.getenv yields None when a
# variable is not set.
speech_key = os.getenv('SPEECH_KEY')
service_region = os.getenv('SPEECH_REGION')

def speech_synthesis_to_push_audio_output_stream():
    """Synthesize ``my_text`` and push the resulting audio to a callback stream.

    Builds a ``SpeechSynthesizer`` whose audio output is a
    ``PushAudioOutputStream`` backed by the callback class defined below, then
    synthesizes the module-level ``my_text`` in a loop. Each audio chunk handed
    to the callback is accumulated in memory and also written to the
    module-level ``my_stream``.
    """
    class PushAudioOutputStreamSampleCallback(speechsdk.audio.PushAudioOutputStreamCallback):
        """
        Example class that implements the PushAudioOutputStreamCallback, which is used to show
        how to push output audio to a stream
        """
        def __init__(self) -> None:
            super().__init__()
            # Everything received so far, and whether close() has been called.
            self._audio_data = bytes(0)
            self._closed = False
        def write(self, audio_buffer: memoryview) -> int:
            """
            The callback function which is invoked when the synthesizer has an output audio chunk
            to write out

            Returns the number of bytes in ``audio_buffer``.
            """
            # Keep a copy of the chunk alongside all previously received audio.
            self._audio_data += audio_buffer
            # NOTE(review): this is the call named in the traceback quoted in the
            # question -- PyAudio rejects the memoryview. Passing
            # bytes(audio_buffer) instead looks like the fix; confirm against the
            # PyAudio documentation.
            my_stream.write(audio_buffer)
            print("{} bytes received.".format(audio_buffer.nbytes))
            return audio_buffer.nbytes

        def close(self) -> None:
            """
            The callback function which is invoked when the synthesizer is about to close the
            stream.
            """
            self._closed = True
            print("Push audio output stream closed.")

        def get_audio_data(self) -> bytes:
            """Return all audio bytes accumulated by write() so far."""
            return self._audio_data

        def get_audio_size(self) -> int:
            """Return the number of audio bytes accumulated by write() so far."""
            return len(self._audio_data)

    # Creates an instance of a speech config with specified subscription key and service region.
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # Creates customized instance of PushAudioOutputStreamCallback
    stream_callback = PushAudioOutputStreamSampleCallback()
    # Creates audio output stream from the callback
    push_stream = speechsdk.audio.PushAudioOutputStream(stream_callback)
    # Creates a speech synthesizer using push stream as audio output.
    stream_config = speechsdk.audio.AudioOutputConfig(stream=push_stream)
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=stream_config)

    # Receives a text from console input and synthesizes it to stream output.
    # NOTE(review): the console-input block below is commented out, so this loop
    # has no break and synthesizes my_text over and over; the cleanup code after
    # the loop is never reached as written.
    while True:
    #    print("Enter some text that you want to synthesize, Ctrl-Z to exit")
    #    try:
    #        text = input()
    #    except EOFError:
    #        break
        result = speech_synthesizer.speak_text_async(my_text).get()
        # Check result
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            # NOTE(review): `text` is only assigned in the commented-out input()
            # block above, so in the code shown this line would raise NameError;
            # `my_text` is presumably what was intended -- confirm.
            print("Speech synthesized for text [{}], and the audio was written to output stream.".format(text))
        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print("Speech synthesis canceled: {}".format(cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(cancellation_details.error_details))
        # Destroys result which is necessary for destroying speech synthesizer
        del result

    # Destroys the synthesizer in order to close the output stream.
    del speech_synthesizer

    print("Totally {} bytes received.".format(stream_callback.get_audio_size()))

speech_synthesis_to_push_audio_output_stream()`

Answer 1

Here is a workaround: instead of using a stream, use a file. 这里有一个解决方法：不使用 stream，而是使用文件。 The audio is stored in a file, and then we simply read that file and play it using PyAudio. 音频会先存储到文件中，然后我们只需读取该文件并使用 PyAudio 播放它。

# Dependencies
import  os
import  azure.cognitiveservices.speech  as  speechsdk
import  pyaudio
import  wave

# Workaround script: synthesize the entered text into a WAV file with Azure,
# then read that file back and play it through PyAudio.
speech_config = speechsdk.SpeechConfig(subscription="<Key>", region="<Region>")

# Audio Config: the synthesizer writes its output to this WAV file, and the
# same path is opened again below for playback.
filename = 'background.wav'
audio_config = speechsdk.audio.AudioOutputConfig(filename=filename)

speech_config.speech_synthesis_voice_name = 'en-US-JennyNeural'

speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)

print("Enter the Text:- ")
text = input()

# .get() blocks until the asynchronous synthesis call has finished.
speech_synthesis_result = speech_synthesizer.speak_text_async(text).get()
print("Conversion is Complete")

# Number of frames read from the file and written to the device per iteration.
chunk = 1024

p = pyaudio.PyAudio()
try:
    # The context manager closes the wave file even if playback fails.
    with wave.open(filename, 'rb') as file:
        # Open the output stream with the format recorded in the WAV header so
        # playback parameters always match the file.
        stream = p.open(format=p.get_format_from_width(file.getsampwidth()),
                        channels=file.getnchannels(),
                        rate=file.getframerate(),
                        output=True)
        try:
            data = file.readframes(chunk)
            print("Starting Audio")
            # readframes() returns bytes and yields b'' at end of file. The
            # previous test `data != ''` compared bytes with str, which is
            # always unequal in Python 3, so the loop never terminated.
            while data:
                stream.write(data)
                data = file.readframes(chunk)
        finally:
            stream.stop_stream()
            stream.close()
finally:
    # Release PortAudio resources whether or not playback succeeded.
    p.terminate()

在此处输入图像描述

Note that this approach takes more space and more time, because the audio is first written to a file and then read back. 请注意，这种方法会占用更多的空间和时间，因为音频要先写入文件，然后再读取回来。

Azure 文本到语音到 PyAudio Stream

问题描述

1 个解决方案

解决方案1
0 2023-01-31 10:36:37

Azure 文本到语音到 PyAudio Stream

问题描述

1 个解决方案

解决方案1 0 2023-01-31 10:36:37

解决方案1
0 2023-01-31 10:36:37