使用 Watson for Python 实现连续实时语音到文本

Question

I'm trying to create a small Python program that lets me use my mic to get text in real time from the Watson server, similar to how it works here.我正在尝试创建一个小型 Python 程序，让我可以用麦克风从 Watson 服务器实时获取文本，类似于此处的工作方式。

This is the code I have come up with, but it only gets the text after I finish recording:这是我想出的代码，但它要等我完成录制后才能得到文本：

import pyaudio
import json
from watson_developer_cloud import SpeechToTextV1

# Audio capture settings. RATE and CHANNELS are also used to build the
# content type sent to Watson, so the two can never disagree.
CHUNK = 1024                # frames requested from the microphone per read
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 44100
RECORD_SECONDS = 10

p = pyaudio.PyAudio()

frames = []

try:
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)
    try:
        print("* recording")

        # Capture the whole recording into memory, one CHUNK at a time.
        for _ in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
            frames.append(stream.read(CHUNK))

        print("* done recording")
    finally:
        # Release the input stream even if a read fails part-way through.
        stream.stop_stream()
        stream.close()
finally:
    p.terminate()

data_feed = b''.join(frames)

speech_to_text = SpeechToTextV1(
    username='secret',
    password='secret too',
    x_watson_learning_opt_out=False
)

# The complete recording is uploaded in a single request, so no text can come
# back before recording has finished.
result = speech_to_text.recognize(
    data_feed,
    content_type="audio/l16;rate={};channels={}".format(RATE, CHANNELS),
    word_confidence=True,
    max_alternatives=4,
    word_alternatives_threshold=0.5,
    model="en-US_BroadbandModel",
    continuous=True)

j = json.dumps(result, indent=2)
print(j)

Answer 1

I went ahead and created a program from scratch to connect to the Watson server using websockets.我继续从头开始创建一个程序，以使用 websockets 连接到 Watson 服务器。 It still isn't doing exactly what I expect but it is very close.它仍然没有完全按照我的预期做，但已经非常接近了。

The audio is being sent to the server in real time, but I only get the transcript after the recording finishes.音频正在实时发送到服务器，但我要到录制完成后才能收到转录文本。

import asyncio
import websockets
import json
import requests
import pyaudio
import time

# Variables to use for recording audio
CHUNK = 1024                # frames requested from the microphone per read
FORMAT = pyaudio.paInt16
CHANNELS = 2
RATE = 16000

p = pyaudio.PyAudio()

# This is the language model to use to transcribe the audio
model = "en-US_BroadbandModel"

# These are the urls we will be using to communicate with Watson.
# token_url and url are derived from default_url and model so the values
# cannot drift apart.
default_url = "https://stream.watsonplatform.net/speech-to-text/api"
token_url = "https://stream.watsonplatform.net/authorization/api/v1/token?" \
            "url=" + default_url
url = "wss://stream.watsonplatform.net/speech-to-text/api/v1/recognize?model=" + model

# BlueMix app credentials
username = ""   # Your Bluemix App username
password = ""   # Your Bluemix App password

# Send a request to get an authorization key
r = requests.get(token_url, auth=(username, password))
# Fail here on bad credentials instead of sending an error body as the token.
r.raise_for_status()
auth_token = r.text
token_header = {"X-Watson-Authorization-Token": auth_token}

# Params to use for Watson API
# NOTE(review): IBM's WebSocket examples appear to spell the audio-format key
# "content-type" (with a hyphen) -- confirm which spelling the service honours.
params = {
    "word_confidence": True,
    "content_type": "audio/l16;rate={};channels={}".format(RATE, CHANNELS),
    "action": "start",
    "interim_results": True
}

# Opens the stream to start recording from the default microphone
# NOTE(review): output=True is requested although nothing in this script
# writes audio back to the stream -- confirm it is actually needed.
stream = p.open(format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                output=True,
                frames_per_buffer=CHUNK)


async def send_audio(ws):
    """Stream microphone audio to Watson over ``ws`` for about 20 seconds.

    Reads ``CHUNK`` frames at a time from the module-level ``stream`` and
    sends each buffer as one message on ``ws``. Once more than 20 seconds
    have elapsed, a JSON ``{"action": "stop"}`` message is sent and the
    coroutine finishes. Any exception is printed and ends the coroutine
    instead of propagating.

    The audio stream and the PyAudio instance are released on every exit
    path.

    Args:
        ws: Open websocket connection providing an awaitable ``send``.

    Returns:
        Always ``False``.
    """
    # Starts recording of microphone
    print("* READY *")

    loop = asyncio.get_event_loop()
    start = time.time()
    try:
        while True:
            print(".")
            # stream.read() blocks until CHUNK frames have been captured.
            # Running it in a worker thread keeps the event loop free, so
            # the receiving coroutine can print interim transcripts while
            # recording is still in progress.
            data = await loop.run_in_executor(None, stream.read, CHUNK)
            await ws.send(data)
            if time.time() - start > 20:    # Records for n seconds
                await ws.send(json.dumps({'action': 'stop'}))
                return False
    except Exception as e:
        # Best effort: report the problem and stop sending.
        print(e)
        return False
    finally:
        # Stop the stream and terminate the recording
        stream.stop_stream()
        stream.close()
        p.terminate()


async def speech_to_text():
    """Open the Watson websocket, stream audio, and print transcripts.

    Sends the module-level ``params`` as the opening message, prints the
    first reply, schedules ``send_audio`` on the same connection, and then
    prints the first alternative's transcript of every result message that
    arrives.

    The loop ends at the first message that carries no usable transcript
    structure (presumably a state or error notification). The connection is
    closed by the ``async with`` block on return.

    Returns:
        Always ``False``.
    """
    async with websockets.connect(url, extra_headers=token_header) as conn:
        # Send request to watson and waits for the listening response
        await conn.send(json.dumps(params))
        rec = await conn.recv()
        print(rec)
        asyncio.ensure_future(send_audio(conn))

        # Keeps receiving transcript until a non-result message arrives
        while True:
            parsed = json.loads(await conn.recv())
            if "results" not in parsed:
                # Leaving the ``async with`` block closes the connection; a
                # bare ``conn.close()`` call would only create a coroutine
                # that is never awaited.
                return False

            results = parsed["results"]
            if not results:
                # An empty result list has nothing to print; keep listening
                # rather than failing on results[0].
                continue

            try:
                transcript = results[0]["alternatives"][0]["transcript"]
            except (KeyError, IndexError):
                return False
            print(transcript)

# Starts the application loop
loop = asyncio.get_event_loop()
try:
    loop.run_until_complete(speech_to_text())
finally:
    # Close the loop even if the coroutine raises or the run is interrupted.
    loop.close()

So all I want now is to get the transcript while I am recording through the microphone.所以我现在想要的就是在我通过麦克风录音的同时获得转录文本。

使用 Watson for Python 实现连续实时语音到文本

问题描述

1 个解决方案

解决方案1
1 2017-10-29 18:40:44

使用 Watson for Python 实现连续实时语音到文本

问题描述

1 个解决方案

解决方案1 1 2017-10-29 18:40:44

解决方案1
1 2017-10-29 18:40:44