mirror of
https://github.com/Priler/jarvis.git
synced 2026-05-26 07:08:11 +00:00
vosk usage optimization
This commit is contained in:
@@ -36,6 +36,10 @@ fn main_loop() -> Result<(), ()> {
|
||||
// recognize wake-word
|
||||
match listener::data_callback(&frame_buffer) {
|
||||
Some(_keyword_index) => {
|
||||
// reset speech recognizer
|
||||
stt::reset_wake_recognizer();
|
||||
stt::reset_speech_recognizer();
|
||||
|
||||
// wake-word activated, process further commands
|
||||
// capture current time
|
||||
start = SystemTime::now();
|
||||
@@ -62,11 +66,36 @@ fn main_loop() -> Result<(), ()> {
|
||||
// filter recognized voice
|
||||
// @TODO. Better recognized voice filtration.
|
||||
recognized_voice = recognized_voice.to_lowercase();
|
||||
|
||||
// answer again if it's activation phrase repeated
|
||||
if recognized_voice.contains(config::VOSK_FETCH_PHRASE) {
|
||||
info!("Wake word detected during chaining, reactivating...");
|
||||
|
||||
// play greet sound
|
||||
audio::play_sound(&sounds_directory.join(format!(
|
||||
"{}.wav",
|
||||
config::ASSISTANT_GREET_PHRASES
|
||||
.choose(&mut rand::thread_rng())
|
||||
.unwrap()
|
||||
)));
|
||||
|
||||
// reset timer and continue listening
|
||||
start = SystemTime::now();
|
||||
stt::reset_speech_recognizer();
|
||||
continue 'voice_recognition;
|
||||
}
|
||||
|
||||
// filter out activation phrase from command
|
||||
for tbr in config::ASSISTANT_PHRASES_TBR {
|
||||
recognized_voice = recognized_voice.replace(tbr, "");
|
||||
}
|
||||
recognized_voice = recognized_voice.trim().into();
|
||||
|
||||
// skip if nothing left after filtering (*evil laugh*)
|
||||
if recognized_voice.is_empty() {
|
||||
continue 'voice_recognition;
|
||||
}
|
||||
|
||||
// infer command (try intent recognition first, fallback to levenshtein)
|
||||
let cmd_result = if let Some((intent_id, confidence)) =
|
||||
rt.block_on(intent::classify(&recognized_voice))
|
||||
@@ -74,7 +103,7 @@ fn main_loop() -> Result<(), ()> {
|
||||
info!("Intent recognized: {} (confidence: {:.2})", intent_id, confidence);
|
||||
intent::get_command_by_intent(COMMANDS_LIST.get().unwrap(), &intent_id)
|
||||
} else {
|
||||
info!("Intent not recognized, trying levenshtein fallback...");
|
||||
info!("Intent not recognized, trying levenshtein fallback ...");
|
||||
commands::fetch_command(&recognized_voice, COMMANDS_LIST.get().unwrap())
|
||||
};
|
||||
|
||||
@@ -119,6 +148,9 @@ fn main_loop() -> Result<(), ()> {
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
|
||||
// reset wake recognizer
|
||||
stt::reset_wake_recognizer();
|
||||
}
|
||||
}
|
||||
None => (),
|
||||
|
||||
@@ -95,53 +95,95 @@ pub fn commands_hash(commands: &Vec<JCommandsList>) -> String {
|
||||
}
|
||||
|
||||
|
||||
// @TODO. NLU or something similar is required in order to infer commands with the highest possible accuracy.
|
||||
pub fn fetch_command<'a>(
|
||||
phrase: &str,
|
||||
commands: &'a Vec<JCommandsList>,
|
||||
) -> Option<(&'a PathBuf, &'a JCommand)> {
|
||||
// result scmd
|
||||
let mut result_scmd: Option<(&PathBuf, &JCommand)> = None;
|
||||
let mut current_max_ratio = config::CMD_RATIO_THRESHOLD;
|
||||
let mut result: Option<(&PathBuf, &JCommand)> = None;
|
||||
let mut best_score = config::CMD_RATIO_THRESHOLD;
|
||||
|
||||
// convert fetch phrase to sequence
|
||||
let fetch_phrase_chars = phrase.chars().collect::<Vec<_>>();
|
||||
// normalize input
|
||||
let phrase = phrase.trim().to_lowercase();
|
||||
if phrase.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
// list all the commands
|
||||
for cmd in commands {
|
||||
// list all subcommands
|
||||
for scmd in &cmd.commands {
|
||||
// list all phrases in command
|
||||
for cmd_phrase in &scmd.phrases {
|
||||
// convert cmd phrase to sequence
|
||||
let cmd_phrase_chars = cmd_phrase.chars().collect::<Vec<_>>();
|
||||
let phrase_chars: Vec<char> = phrase.chars().collect();
|
||||
let phrase_words: Vec<&str> = phrase.split_whitespace().collect();
|
||||
|
||||
// compare fetch phrase with cmd phrase
|
||||
let ratio = ratio(&fetch_phrase_chars, &cmd_phrase_chars);
|
||||
|
||||
// return, if it fits the given threshold
|
||||
if ratio >= current_max_ratio {
|
||||
result_scmd = Some((&cmd.path, &scmd));
|
||||
current_max_ratio = ratio;
|
||||
// println!("Ratio is: {}", ratio);
|
||||
// return Some((&cmd.path, &scmd))
|
||||
for cmd_list in commands {
|
||||
for cmd in &cmd_list.commands {
|
||||
for cmd_phrase in &cmd.phrases {
|
||||
let cmd_phrase = cmd_phrase.trim().to_lowercase();
|
||||
let cmd_phrase_chars: Vec<char> = cmd_phrase.chars().collect();
|
||||
|
||||
// character-level similarity
|
||||
let char_ratio = ratio(&phrase_chars, &cmd_phrase_chars);
|
||||
|
||||
// word-level similarity (handles word order)
|
||||
let cmd_words: Vec<&str> = cmd_phrase.split_whitespace().collect();
|
||||
let word_score = word_overlap_score(&phrase_words, &cmd_words);
|
||||
|
||||
// combined score (weighted average)
|
||||
let score = (char_ratio * 0.6) + (word_score * 0.4);
|
||||
|
||||
// early exit on perfect match
|
||||
if score >= 99.0 {
|
||||
debug!("Perfect match: '{}' -> '{}'", phrase, cmd_phrase);
|
||||
return Some((&cmd_list.path, cmd));
|
||||
}
|
||||
|
||||
if score > best_score {
|
||||
best_score = score;
|
||||
result = Some((&cmd_list.path, cmd));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if let Some((cmd_path, scmd)) = result_scmd {
|
||||
debug!("Ratio is: {}", current_max_ratio);
|
||||
if let Some((cmd_path, cmd)) = result {
|
||||
info!(
|
||||
"CMD is: {cmd_path:?}, SCMD is: {scmd:?}, Ratio is: {}",
|
||||
current_max_ratio
|
||||
"Fuzzy match: '{}' -> cmd '{}' (score: {:.1}%)",
|
||||
phrase, cmd.id, best_score
|
||||
);
|
||||
Some((&cmd_path, &scmd))
|
||||
Some((cmd_path, cmd))
|
||||
} else {
|
||||
debug!("No match for '{}' (best: {:.1}%)", phrase, best_score);
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn word_overlap_score(input_words: &[&str], cmd_words: &[&str]) -> f64 {
|
||||
if input_words.is_empty() || cmd_words.is_empty() {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
let mut matched = 0.0;
|
||||
|
||||
for input_word in input_words {
|
||||
// find best matching word in command
|
||||
let best_word_match = cmd_words
|
||||
.iter()
|
||||
.map(|cmd_word| {
|
||||
let iw: Vec<char> = input_word.chars().collect();
|
||||
let cw: Vec<char> = cmd_word.chars().collect();
|
||||
ratio(&iw, &cw)
|
||||
})
|
||||
.fold(0.0_f64, |a, b| a.max(b));
|
||||
|
||||
// count as match if word similarity > 70%
|
||||
if best_word_match > 70.0 {
|
||||
matched += best_word_match / 100.0;
|
||||
}
|
||||
}
|
||||
|
||||
// normalize by max word count
|
||||
let max_words = input_words.len().max(cmd_words.len()) as f64;
|
||||
(matched / max_words) * 100.0
|
||||
}
|
||||
|
||||
|
||||
// @TODO. Rewrite executors by executor type struct. (with match arms)
|
||||
pub fn execute_exe(exe: &str, args: &Vec<String>) -> std::io::Result<Child> {
|
||||
Command::new(exe).args(args).spawn()
|
||||
|
||||
@@ -133,6 +133,16 @@ pub const VOSK_MODEL_PATH: &str = "resources/vosk/model_small";
|
||||
pub const VOSK_FETCH_PHRASE: &str = "джарвис";
|
||||
pub const VOSK_MIN_RATIO: f64 = 70.0;
|
||||
|
||||
// Wake-word confidence level (higher = stricter):
//   0.7 - lenient, expect false positives
//   0.8 - balanced
//   0.9 - strict
|
||||
pub const VOSK_WAKE_CONFIDENCE: f32 = 0.9;
|
||||
|
||||
pub const VOSK_SPEECH_RECOGNIZER_MAX_ALTERNATIVES: u16 = 3;
|
||||
pub const VOSK_SPEECH_RECOGNIZER_WORDS: bool = false;
|
||||
pub const VOSK_SPEECH_PARTIAL_WORDS: bool = false;
|
||||
|
||||
// IRE (intents recognition)
|
||||
pub const INTENT_CLASSIFIER_MIN_CONFIDENCE: f64 = 0.75;
|
||||
|
||||
|
||||
@@ -4,33 +4,60 @@ pub fn init() -> Result<(), ()> {
|
||||
Ok(()) // nothing to init for Vosk
|
||||
}
|
||||
|
||||
// @TODO. Make it better somehow (more accurate or with higher sensitivity).
|
||||
pub fn data_callback(frame_buffer: &[i16]) -> Option<i32> {
|
||||
// recognize & convert to sequence
|
||||
let recognized_phrase = stt::recognize(&frame_buffer, true).unwrap_or("".into());
|
||||
|
||||
if !recognized_phrase.trim().is_empty() {
|
||||
info!("Vosk wake-word debug info:");
|
||||
info!("rec: {}", recognized_phrase);
|
||||
let recognized_phrases = recognized_phrase.split_whitespace();
|
||||
for phrase in recognized_phrases {
|
||||
let recognized_phrase_chars = phrase.trim().to_lowercase().chars().collect::<Vec<_>>();
|
||||
|
||||
// compare
|
||||
let compare_ratio = seqdiff::ratio(
|
||||
&config::VOSK_FETCH_PHRASE.chars().collect::<Vec<_>>(),
|
||||
&recognized_phrase_chars,
|
||||
);
|
||||
info!("og phrase: {:?}", &config::VOSK_FETCH_PHRASE);
|
||||
info!("recognized phrase: {:?}", &recognized_phrase_chars);
|
||||
info!("compare ratio: {}", compare_ratio);
|
||||
|
||||
if compare_ratio >= config::VOSK_MIN_RATIO {
|
||||
info!("Phrase activated.");
|
||||
return Some(0);
|
||||
}
|
||||
if let Some((recognized, _confidence)) = stt::recognize_wake_word(frame_buffer) {
|
||||
let recognized = recognized.trim().to_lowercase();
|
||||
|
||||
// skip unknown/empty
|
||||
if recognized.is_empty() || recognized == "[unk]" {
|
||||
return None;
|
||||
}
|
||||
|
||||
info!("Wake word candidate: '{}'", recognized);
|
||||
|
||||
// verify with seqdiff ratio
|
||||
let wake_chars: Vec<char> = config::VOSK_FETCH_PHRASE.chars().collect();
|
||||
let recognized_chars: Vec<char> = recognized.chars().collect();
|
||||
let similarity = seqdiff::ratio(&wake_chars, &recognized_chars);
|
||||
|
||||
info!("Similarity: {:.1}% ('{}' vs '{}')", similarity, recognized, config::VOSK_FETCH_PHRASE);
|
||||
|
||||
if similarity >= config::VOSK_MIN_RATIO {
|
||||
info!("Wake word activated!");
|
||||
return Some(0);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
None
|
||||
}
|
||||
|
||||
// @TODO. Make it better somehow (more accurate or with higher sensitivity).
|
||||
// pub fn data_callback(frame_buffer: &[i16]) -> Option<i32> {
|
||||
// // recognize & convert to sequence
|
||||
// let recognized_phrase = stt::recognize(&frame_buffer, true).unwrap_or("".into());
|
||||
|
||||
// if !recognized_phrase.trim().is_empty() {
|
||||
// info!("Vosk wake-word debug info:");
|
||||
// info!("rec: {}", recognized_phrase);
|
||||
// let recognized_phrases = recognized_phrase.split_whitespace();
|
||||
// for phrase in recognized_phrases {
|
||||
// let recognized_phrase_chars = phrase.trim().to_lowercase().chars().collect::<Vec<_>>();
|
||||
|
||||
// // compare
|
||||
// let compare_ratio = seqdiff::ratio(
|
||||
// &config::VOSK_FETCH_PHRASE.chars().collect::<Vec<_>>(),
|
||||
// &recognized_phrase_chars,
|
||||
// );
|
||||
// info!("og phrase: {:?}", &config::VOSK_FETCH_PHRASE);
|
||||
// info!("recognized phrase: {:?}", &recognized_phrase_chars);
|
||||
// info!("compare ratio: {}", compare_ratio);
|
||||
|
||||
// if compare_ratio >= config::VOSK_MIN_RATIO {
|
||||
// info!("Phrase activated.");
|
||||
// return Some(0);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
// None
|
||||
// }
|
||||
|
||||
@@ -8,6 +8,11 @@ use crate::config::structs::SpeechToTextEngine;
|
||||
|
||||
use crate::vosk_models;
|
||||
// use vosk_models::{scan_vosk_models, get_model_path, VoskModelInfo};
|
||||
pub use self::vosk::init_vosk;
|
||||
pub use self::vosk::recognize_wake_word;
|
||||
pub use self::vosk::recognize_speech;
|
||||
pub use self::vosk::reset_speech_recognizer;
|
||||
pub use self::vosk::reset_wake_recognizer;
|
||||
|
||||
static STT_TYPE: OnceCell<SpeechToTextEngine> = OnceCell::new();
|
||||
|
||||
@@ -33,9 +38,16 @@ pub fn init() -> Result<(), ()> {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
pub fn recognize(data: &[i16], partial: bool) -> Option<String> {
|
||||
match STT_TYPE.get().unwrap() {
|
||||
SpeechToTextEngine::Vosk => vosk::recognize(data, partial),
|
||||
pub fn recognize(data: &[i16], include_partial: bool) -> Option<String> {
|
||||
if include_partial {
|
||||
vosk::recognize_wake_word(data).map(|(text, _)| text)
|
||||
} else {
|
||||
vosk::recognize_speech(data)
|
||||
}
|
||||
}
|
||||
|
||||
// pub fn recognize(data: &[i16], partial: bool) -> Option<String> {
|
||||
// match STT_TYPE.get().unwrap() {
|
||||
// SpeechToTextEngine::Vosk => vosk::recognize(data, partial),
|
||||
// }
|
||||
// }
|
||||
|
||||
@@ -9,80 +9,154 @@ use crate::stt::vosk_models;
|
||||
use crate::DB;
|
||||
|
||||
static MODEL: OnceCell<Model> = OnceCell::new();
|
||||
static RECOGNIZER: OnceCell<Mutex<Recognizer>> = OnceCell::new();
|
||||
static WAKE_RECOGNIZER: OnceCell<Mutex<Recognizer>> = OnceCell::new();
|
||||
static SPEECH_RECOGNIZER: OnceCell<Mutex<Recognizer>> = OnceCell::new();
|
||||
|
||||
pub fn init_vosk() -> Result<(), String> {
|
||||
if RECOGNIZER.get().is_some() {
|
||||
if MODEL.get().is_some() {
|
||||
return Ok(());
|
||||
} // already initialized
|
||||
|
||||
let model_path = get_configured_model_path()?;
|
||||
info!("Loading Vosk model from: {}", model_path.display());
|
||||
|
||||
let model = Model::new(model_path.to_str().unwrap())
|
||||
.ok_or_else(|| format!("Failed to load Vosk model from: {}", model_path.display()))?;
|
||||
|
||||
let mut recognizer = Recognizer::new(&model, 16000.0)
|
||||
.ok_or("Failed to create Vosk recognizer")?;
|
||||
//let mut recognizer = Recognizer::new(&model, 16000.0)
|
||||
// .ok_or("Failed to create Vosk recognizer")?;
|
||||
let wake_phrases: &[&str] = &[
|
||||
config::VOSK_FETCH_PHRASE,
|
||||
"[unk]",
|
||||
"джон",
|
||||
"джони",
|
||||
"джей",
|
||||
"джонстон",
|
||||
"привет",
|
||||
"давай",
|
||||
];
|
||||
let mut wake_recognizer = Recognizer::new_with_grammar(&model, 16000.0, wake_phrases)
|
||||
.ok_or("Failed to create wake word recognizer")?;
|
||||
|
||||
recognizer.set_max_alternatives(10);
|
||||
recognizer.set_words(true);
|
||||
recognizer.set_partial_words(true);
|
||||
wake_recognizer.set_max_alternatives(1); // required for confidence check later on
|
||||
|
||||
MODEL.set(model);
|
||||
RECOGNIZER.set(Mutex::new(recognizer));
|
||||
let mut speech_recognizer = Recognizer::new(&model, 16000.0)
|
||||
.ok_or("Failed to create speech recognizer")?;
|
||||
|
||||
speech_recognizer.set_max_alternatives(config::VOSK_SPEECH_RECOGNIZER_MAX_ALTERNATIVES);
|
||||
speech_recognizer.set_words(config::VOSK_SPEECH_RECOGNIZER_WORDS);
|
||||
speech_recognizer.set_partial_words(config::VOSK_SPEECH_PARTIAL_WORDS);
|
||||
|
||||
MODEL.set(model).map_err(|_| "Model already set")?;
|
||||
WAKE_RECOGNIZER.set(Mutex::new(wake_recognizer)).map_err(|_| "Wake recognizer already set")?;
|
||||
SPEECH_RECOGNIZER.set(Mutex::new(speech_recognizer)).map_err(|_| "Speech recognizer already set")?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn recognize(data: &[i16], include_partial: bool) -> Option<String> {
|
||||
let state = RECOGNIZER
|
||||
.get()
|
||||
.unwrap()
|
||||
.lock()
|
||||
.unwrap()
|
||||
.accept_waveform(data);
|
||||
|
||||
match state {
|
||||
Ok(ds) => {
|
||||
match ds {
|
||||
DecodingState::Running => {
|
||||
if include_partial {
|
||||
Some(
|
||||
RECOGNIZER
|
||||
.get()
|
||||
.unwrap()
|
||||
.lock()
|
||||
.unwrap()
|
||||
.partial_result()
|
||||
.partial
|
||||
.into(),
|
||||
)
|
||||
} else {
|
||||
None
|
||||
pub fn recognize_wake_word(data: &[i16]) -> Option<(String, f32)> {
|
||||
let mut recognizer = WAKE_RECOGNIZER.get()?.lock().unwrap();
|
||||
|
||||
match recognizer.accept_waveform(data) {
|
||||
Ok(DecodingState::Running) => {
|
||||
// partials don't have confidence, skip them
|
||||
None
|
||||
}
|
||||
Ok(DecodingState::Finalized) => {
|
||||
let result = recognizer.result();
|
||||
|
||||
// compensate confidence issues
|
||||
if let Some(alternatives) = result.multiple() {
|
||||
if let Some(best) = alternatives.alternatives.first() {
|
||||
if !best.text.is_empty() {
|
||||
return Some((best.text.to_string(), best.confidence));
|
||||
}
|
||||
}
|
||||
DecodingState::Finalized => {
|
||||
// Result will always be multiple because we called set_max_alternatives
|
||||
RECOGNIZER
|
||||
.get()
|
||||
.unwrap()
|
||||
.lock()
|
||||
.unwrap()
|
||||
.result()
|
||||
.multiple()
|
||||
.and_then(|m| m.alternatives.first().map(|a| a.text.to_string()))
|
||||
}
|
||||
DecodingState::Failed => None,
|
||||
}
|
||||
},
|
||||
Err(err) => {
|
||||
error!("Vosk accept waveform error.\nError details: {}", err);
|
||||
|
||||
None
|
||||
|
||||
None
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn recognize_speech(data: &[i16]) -> Option<String> {
|
||||
let mut recognizer = SPEECH_RECOGNIZER.get()?.lock().unwrap();
|
||||
|
||||
match recognizer.accept_waveform(data) {
|
||||
Ok(DecodingState::Finalized) => {
|
||||
recognizer.result()
|
||||
.multiple()
|
||||
.and_then(|m| m.alternatives.first().map(|a| a.text.to_string()))
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub fn reset_speech_recognizer() {
|
||||
if let Some(recognizer) = SPEECH_RECOGNIZER.get() {
|
||||
recognizer.lock().unwrap().reset();
|
||||
}
|
||||
}
|
||||
|
||||
pub fn reset_wake_recognizer() {
|
||||
if let Some(recognizer) = WAKE_RECOGNIZER.get() {
|
||||
recognizer.lock().unwrap().reset();
|
||||
}
|
||||
}
|
||||
|
||||
// pub fn recognize(data: &[i16], include_partial: bool) -> Option<String> {
|
||||
// let state = RECOGNIZER
|
||||
// .get()
|
||||
// .unwrap()
|
||||
// .lock()
|
||||
// .unwrap()
|
||||
// .accept_waveform(data);
|
||||
|
||||
// match state {
|
||||
// Ok(ds) => {
|
||||
// match ds {
|
||||
// DecodingState::Running => {
|
||||
// if include_partial {
|
||||
// Some(
|
||||
// RECOGNIZER
|
||||
// .get()
|
||||
// .unwrap()
|
||||
// .lock()
|
||||
// .unwrap()
|
||||
// .partial_result()
|
||||
// .partial
|
||||
// .into(),
|
||||
// )
|
||||
// } else {
|
||||
// None
|
||||
// }
|
||||
// }
|
||||
// DecodingState::Finalized => {
|
||||
// // Result will always be multiple because we called set_max_alternatives
|
||||
// RECOGNIZER
|
||||
// .get()
|
||||
// .unwrap()
|
||||
// .lock()
|
||||
// .unwrap()
|
||||
// .result()
|
||||
// .multiple()
|
||||
// .and_then(|m| m.alternatives.first().map(|a| a.text.to_string()))
|
||||
// }
|
||||
// DecodingState::Failed => None,
|
||||
// }
|
||||
// },
|
||||
// Err(err) => {
|
||||
// error!("Vosk accept waveform error.\nError details: {}", err);
|
||||
|
||||
// None
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
|
||||
fn get_configured_model_path() -> Result<std::path::PathBuf, String> {
|
||||
// try to get from settings
|
||||
if let Some(db) = DB.get() {
|
||||
|
||||
Reference in New Issue
Block a user