mirror of
https://github.com/Priler/jarvis.git
synced 2026-05-26 23:19:46 +00:00
61 lines
1.8 KiB
Python
61 lines
1.8 KiB
Python
import torch
|
|
import sounddevice as sd
|
|
import speech_recognition as sr
|
|
import time
|
|
import numpy
|
|
from glob import glob
|
|
|
|
# All inference in this script runs on the CPU.
device = torch.device('cpu')

# Load the Silero speech-to-text bundle through torch.hub. The call returns
# the model, a decoder for its output, and a tuple of helper functions.
model, decoder, utils = torch.hub.load(
    repo_or_dir='snakers4/silero-models',
    model='silero_stt',
    language='en',  # original note lists 'en' and 'ru' as options
    device=device,
)

# Expose the helpers under individual names; the order matches the tuple
# layout the original code unpacked.
read_batch, split_into_batches, read_audio, prepare_model_input = utils
|
|
|
|
|
|
def callback(_r, audio):
    """Transcribe one captured phrase with Silero STT and print the result.

    Used as the callback handed to ``Recognizer.listen_in_background``.
    The captured audio is written to ``speech.wav`` and that file is fed
    through the Silero helpers, the model and the decoder; each decoded
    transcription is printed to stdout.

    Args:
        _r: Recognizer instance supplied by the caller (unused here).
        audio: Captured audio object; must provide ``get_wav_data()``
            returning the WAV-encoded bytes.

    Returns:
        None. Errors are reported on stdout instead of being raised, so a
        single failed phrase does not terminate the caller's listening loop.
    """
    wav_path = 'speech.wav'

    try:
        print("Распознание ...")

        # TODO: fix crutch, pass audio data directly as a model input of Silero STT
        with open(wav_path, 'wb') as f:
            f.write(audio.get_wav_data())

        # The file was just written, so it is passed directly instead of
        # being looked up again through glob().
        batches = split_into_batches([wav_path], batch_size=10)
        model_input = prepare_model_input(read_batch(batches[0]),
                                          device=device)

        for example in model(model_input):
            print(decoder(example.cpu()))

    except sr.UnknownValueError:
        print("[log] Голос не распознан!")
    except Exception as exc:
        # Boundary handler: this function runs as a background-listener
        # callback, and an exception escaping from it would stop further
        # recognition without any visible message.
        print("[log] Ошибка распознавания: " + str(exc))
|
|
|
|
|
|
# Startup: configure the recognizer, calibrate against ambient noise, then
# recognize speech in the background until the user interrupts the script.
r = sr.Recognizer()
r.pause_threshold = 0.5
# NOTE(review): the input device index is hard-coded to 1 — confirm it
# matches the intended microphone on the target machine.
m = sr.Microphone(device_index=1)

with m as source:
    r.adjust_for_ambient_noise(source)

# listen_in_background returns a function that stops the background listener.
stop_listening = r.listen_in_background(m, callback)

try:
    # Keep the main thread alive while the listener does the work.
    while True:
        time.sleep(0.1)
except KeyboardInterrupt:
    # Shut the listener down on Ctrl+C instead of exiting with a traceback.
    stop_listening()
|