[英]How to spawn child thread from main thread while main thread keeps executing in Python
我正在制作一個應用程序來將從麥克風錄制的音頻轉換為文本。 錄音的長度可能很長,比如 3 小時,所以我猜最好將其轉換為持續時間短的波形文件,比如一分鍾左右,然后生成一個子線程,在其中執行音頻到文本的操作,而主線程可以開始錄制下一分鍾。 音頻到文本的操作比錄制部分快得多,因此時間不會成為問題。
這是我認為它應該如何工作的流程圖。
我正在使用pyaudio
錄制音頻。 它的代碼:
import pyaudio
import wave
import time
def read_audio(stream):
    """Record ~10 seconds of audio from *stream* and save it to 'record.wav'.

    NOTE(review): relies on the module-level PyAudio instance ``p`` for the
    sample width. The stream is stopped and closed here, so the caller must
    open a fresh stream for every call.
    """
    chunk = 1024                      # samples per buffer read
    sample_format = pyaudio.paInt16   # 16 bits per sample
    channels = 2
    fs = 44100                        # sample rate (Hz)
    seconds = 10                      # length of one recording
    filename = 'record.wav'
    frames = []                       # raw byte chunks read from the stream

    # Read enough chunks to cover `seconds` of audio.
    for _ in range(int(fs / chunk * seconds)):
        frames.append(stream.read(chunk))

    # Save the recorded data as a WAV file.
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(p.get_sample_size(sample_format))
        wf.setframerate(fs)
        wf.writeframes(b''.join(frames))

    # Stop and close the stream; a new one is needed for the next take.
    stream.stop_stream()
    stream.close()
# --- module-level recording setup ---
p = pyaudio.PyAudio()  # interface to PortAudio
chunk = 1024           # record in chunks of 1024 samples
sample_format = pyaudio.paInt16  # 16 bits per sample
channels = 2
fs = 44100             # sample rate (Hz)

stream = p.open(format=sample_format, channels=channels, rate=fs,
                frames_per_buffer=chunk, input=True)
read_audio(stream)
p.terminate()  # Terminate the PortAudio interface
對於語音識別,使用 Google 的 API speech_recognition
。 它的代碼:
import speech_recognition as sr
def convert():
    """Transcribe 'record.wav' with Google's speech API and append the text
    to 'test.txt'.

    Recognition errors are deliberately swallowed (best-effort): unknown
    speech prints an empty line, API failures print the error message.
    """
    sound = "record.wav"
    r = sr.Recognizer()
    with sr.AudioFile(sound) as source:
        r.adjust_for_ambient_noise(source)  # calibrate for background noise
        print("Converting Audio To Text and saving to file..... ")
        audio = r.listen(source)
        try:
            value = r.recognize_google(audio)  ##### API call to google for speech recognition
            # On Python 2, `str is bytes` is True and the text must be
            # encoded to UTF-8 before writing.
            if str is bytes:
                result = u"{}".format(value).encode("utf-8")
            else:
                result = "{}".format(value)
            with open("test.txt", "a") as f:
                f.write(result)
            print("Done !\n\n")
        except sr.UnknownValueError:
            # Speech was unintelligible; skip this clip silently.
            print("")
        except sr.RequestError as e:
            print("{0}".format(e))
        except KeyboardInterrupt:
            pass


convert()
由於 GIL,Python 從來都不是真正的多線程,但這在您的情況下可能並不重要,因為您正在使用 api 調用來為您進行語音識別。
所以你可以試試這個來啟動一個線程來做轉換
from threading import Thread
t = Thread(target=convert)  # run convert() on a background thread
t.start()  # main thread keeps executing (e.g. keeps recording)
在您嘗試轉換下一分鍾之前,您可能會嘗試加入最后一個線程以確保它已完成
t.join()
您可能還可以使用 asyncio 庫
雖然這可能有點矯枉過正,但我可能會使用多處理庫。 在您的情況下,您可能有一個不斷錄制和保存新聲音文件的偵聽器工作進程,以及一個不斷尋找新文件並轉換它們的轉換工作進程。
如果需要,這將允許您編寫更強大的系統。 例如,如果您失去了互聯網連接並且在幾分鍾內無法通過谷歌 api 轉換您的聲音文件,那么錄音機工作人員將繼續保存聲音文件而不關心在互聯網連接恢復時會得到處理的聲音文件。
無論如何,這里有一個你可以使用的轉換工作進程的小例子。
import multiprocessing as mp
import os
from pathlib import Path
from time import sleep
class ConversionWorker:
    """Polls a directory for .wav files, converts each one to text, then
    deletes the sound file.

    NOTE(review): ``convert()`` is assumed to be defined elsewhere; it is
    called without arguments, so the file path and the output file are not
    actually wired through here.
    """

    def __init__(self, sound_file_directory_path: str, text_save_filepath: str):
        self.sound_directory_path = Path(sound_file_directory_path)
        self.text_filepath = Path(text_save_filepath)

    def run(self):
        """Loop forever: convert then remove every .wav found, then nap."""
        while True:
            # find and convert all wav files in the target directory
            for wav_path in self.sound_directory_path.glob('*.wav'):
                # convert from path, save to self.text_filepath
                convert()
                # we can delete the sound file after converting it
                os.remove(wav_path)
            # sleep for a bit since files only appear about once a minute
            sleep(5)
def main():
    """Run the conversion worker in a child process while recording happens
    in this process, then stop the worker."""
    worker = ConversionWorker(
        sound_file_directory_path='path/to/sounds',
        text_save_filepath='path/to/text',
    )
    proc = mp.Process(target=worker.run)
    proc.start()
    # do the recording and saving for as long as you want
    proc.terminate()
我解決這個問題的方法是受到 Jeremy Bare 的啟發。 我決定發布一個關於我最終是如何做到的答案,所以如果有人想做類似的事情,那么他們可以獲得完整的代碼。
import speech_recognition as sr
import pyaudio
import wave
import time
import threading
import os
def read_audio(stream, filename):
    """Record ~10 seconds of audio from *stream* and save it as WAV *filename*.

    Stops and closes *stream* when done, so the caller must open a new
    stream for the next clip. NOTE(review): relies on the module-level
    PyAudio instance ``p`` for the sample width.
    """
    chunk = 1024                      # samples per buffer read
    sample_format = pyaudio.paInt16   # 16 bits per sample
    channels = 2
    fs = 44100                        # sample rate (Hz)
    seconds = 10                      # number of seconds to record at once
    frames = []                       # raw byte chunks read from the stream

    # Read enough chunks to cover `seconds` of audio.
    for _ in range(int(fs / chunk * seconds)):
        frames.append(stream.read(chunk))

    # Save the recorded data as a WAV file.
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(p.get_sample_size(sample_format))
        wf.setframerate(fs)
        wf.writeframes(b''.join(frames))

    # Stop and close the stream.
    stream.stop_stream()
    stream.close()
def convert(i):
    """Transcribe 'record<i>.wav' to text, append it to 'test.txt' and delete
    the wav file once recognized.

    A negative *i* is a no-op: the first loop iteration passes ``i - 1 == -1``
    because nothing has been recorded yet.
    """
    if i < 0:
        return  # nothing recorded yet

    sound = 'record' + str(i) + '.wav'
    r = sr.Recognizer()
    with sr.AudioFile(sound) as source:
        r.adjust_for_ambient_noise(source)  # calibrate for background noise
        print("Converting Audio To Text and saving to file..... ")
        audio = r.listen(source)
        try:
            value = r.recognize_google(audio)  ##### API call to google for speech recognition
            os.remove(sound)  # delete the clip once it has been recognized
            # On Python 2, `str is bytes` is True and the text must be
            # encoded to UTF-8 before writing.
            if str is bytes:
                result = u"{}".format(value).encode("utf-8")
            else:
                result = "{}".format(value)
            # `with` closes the file; the redundant explicit close is gone.
            with open("test.txt", "a") as f:
                f.write(result)
                f.write(" ")
        except sr.UnknownValueError:
            # Speech was unintelligible; skip this clip silently.
            print("")
        except sr.RequestError as e:
            print("{0}".format(e))
        except KeyboardInterrupt:
            pass
# Module-level recording configuration shared by save_audios()/read_audio().
p = pyaudio.PyAudio()  # interface to PortAudio
chunk = 1024           # record in chunks of 1024 samples
sample_format = pyaudio.paInt16  # 16 bits per sample
channels = 2
fs = 44100             # sample rate (Hz)
def save_audios(i):
    """Open a fresh input stream and record clip number *i* to 'record<i>.wav'."""
    stream = p.open(format=sample_format, channels=channels, rate=fs,
                    frames_per_buffer=chunk, input=True)
    read_audio(stream, 'record' + str(i) + '.wav')
# Overlap recording and conversion: while clip i is being recorded on one
# thread, clip i-1 is transcribed on another.
flag = False  # set once all clips have been recorded (fixes a NameError risk)

for i in range(30 // 10):  # total seconds to record / seconds per clip
    t1 = threading.Thread(target=save_audios, args=[i])
    # Convert the clip recorded in the PREVIOUS iteration; the first
    # iteration passes -1, which convert() ignores.
    t2 = threading.Thread(target=convert, args=[i - 1])
    t1.start()
    t2.start()
    t1.join()
    t2.join()

# The last clip (index i) has not been converted inside the loop yet.
if i == 2:  # NOTE(review): hard-coded last index; should track the loop bound
    flag = True
if flag:
    convert(i)
p.terminate()  # release the PortAudio interface
read_audio()
和convert()
函數與問題中發布的幾乎相同,但是,它們現在將 integer 作為參數。 這對於確定他們將在該特定線程中處理的文件號很有用。
由於必須先進行錄制,因此語音識別 function 傳遞的值比提供給錄制 function 的 integer 的值小 1。 這確保它僅適用於錄制的音頻。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.