jarvis/_stt.py

import torch
import sounddevice as sd
import speech_recognition as sr
import time
import numpy
from glob import glob

device = torch.device('cpu')
model, decoder, utils = torch.hub.load(repo_or_dir='snakers4/silero-models',
                                       model='silero_stt',
                                       language='en', # en, ru
                                       device=device)
(read_batch, split_into_batches,
 read_audio, prepare_model_input) = utils


def callback(_r, audio):
    try:
        # CONVERT raw wav data to NumPy array
        # wav_raw = audio.get_wav_data()
        # data_s16 = numpy.frombuffer(wav_raw, dtype=numpy.int16, count=len(wav_raw) // 2, offset=0)
        # np_audio = data_s16 * 0.5 ** 15

        # Play it via sounddevice
        #sd.play(np_audio, m.SAMPLE_RATE)
        #time.sleep(len(np_audio) / m.SAMPLE_RATE)
        #sd.stop()

        print("Распознание ...")

        # TODO: fix crutch, pass audio data directly as a model input of Silero STT
        with open('speech.wav', 'wb') as f:
            f.write(audio.get_wav_data())

        test_files = glob('speech.wav')
        batches = split_into_batches(test_files, batch_size=10)
        input = prepare_model_input(read_batch(batches[0]),
                                    device=device)

        output = model(input)
        for example in output:
            print(decoder(example.cpu()))

        # voice = recognizer.recognize_google(audio, language="ru-RU").lower()
        # print("[log] Распознано: " + voice)

    except sr.UnknownValueError:
        print("[log] Голос не распознан!")


# запуск
r = sr.Recognizer()
r.pause_threshold = 0.5
m = sr.Microphone(device_index=1)

with m as source:
    r.adjust_for_ambient_noise(source)

stop_listening = r.listen_in_background(m, callback)
while True: time.sleep(0.1)