vosk usage optimization

This commit is contained in:
Priler
2026-01-05 04:20:43 +05:00
parent cab53abcbe
commit a640e6caea
6 changed files with 305 additions and 108 deletions

View File

@@ -36,6 +36,10 @@ fn main_loop() -> Result<(), ()> {
// recognize wake-word
match listener::data_callback(&frame_buffer) {
Some(_keyword_index) => {
// reset both the wake-word and the speech recognizer
stt::reset_wake_recognizer();
stt::reset_speech_recognizer();
// wake-word activated, process further commands
// capture current time
start = SystemTime::now();
@@ -62,11 +66,36 @@ fn main_loop() -> Result<(), ()> {
// filter recognized voice
// @TODO. Better recognized voice filtration.
recognized_voice = recognized_voice.to_lowercase();
// answer again if it's activation phrase repeated
if recognized_voice.contains(config::VOSK_FETCH_PHRASE) {
info!("Wake word detected during chaining, reactivating...");
// play greet sound
audio::play_sound(&sounds_directory.join(format!(
"{}.wav",
config::ASSISTANT_GREET_PHRASES
.choose(&mut rand::thread_rng())
.unwrap()
)));
// reset timer and continue listening
start = SystemTime::now();
stt::reset_speech_recognizer();
continue 'voice_recognition;
}
// filter out activation phrase from command
for tbr in config::ASSISTANT_PHRASES_TBR {
recognized_voice = recognized_voice.replace(tbr, "");
}
recognized_voice = recognized_voice.trim().into();
// skip if nothing left after filtering (*evil laugh*)
if recognized_voice.is_empty() {
continue 'voice_recognition;
}
// infer command (try intent recognition first, fallback to levenshtein)
let cmd_result = if let Some((intent_id, confidence)) =
rt.block_on(intent::classify(&recognized_voice))
@@ -74,7 +103,7 @@ fn main_loop() -> Result<(), ()> {
info!("Intent recognized: {} (confidence: {:.2})", intent_id, confidence);
intent::get_command_by_intent(COMMANDS_LIST.get().unwrap(), &intent_id)
} else {
info!("Intent not recognized, trying levenshtein fallback...");
info!("Intent not recognized, trying levenshtein fallback ...");
commands::fetch_command(&recognized_voice, COMMANDS_LIST.get().unwrap())
};
@@ -119,6 +148,9 @@ fn main_loop() -> Result<(), ()> {
}
_ => (),
}
// reset wake recognizer
stt::reset_wake_recognizer();
}
}
None => (),

View File

@@ -95,53 +95,95 @@ pub fn commands_hash(commands: &Vec<JCommandsList>) -> String {
}
// @TODO. NLU or something similar is required in order to infer commands with the highest possible accuracy.
pub fn fetch_command<'a>(
phrase: &str,
commands: &'a Vec<JCommandsList>,
) -> Option<(&'a PathBuf, &'a JCommand)> {
// result scmd
let mut result_scmd: Option<(&PathBuf, &JCommand)> = None;
let mut current_max_ratio = config::CMD_RATIO_THRESHOLD;
let mut result: Option<(&PathBuf, &JCommand)> = None;
let mut best_score = config::CMD_RATIO_THRESHOLD;
// convert fetch phrase to sequence
let fetch_phrase_chars = phrase.chars().collect::<Vec<_>>();
// normalize input
let phrase = phrase.trim().to_lowercase();
if phrase.is_empty() {
return None;
}
// list all the commands
for cmd in commands {
// list all subcommands
for scmd in &cmd.commands {
// list all phrases in command
for cmd_phrase in &scmd.phrases {
// convert cmd phrase to sequence
let cmd_phrase_chars = cmd_phrase.chars().collect::<Vec<_>>();
let phrase_chars: Vec<char> = phrase.chars().collect();
let phrase_words: Vec<&str> = phrase.split_whitespace().collect();
// compare fetch phrase with cmd phrase
let ratio = ratio(&fetch_phrase_chars, &cmd_phrase_chars);
// return, if it fits the given threshold
if ratio >= current_max_ratio {
result_scmd = Some((&cmd.path, &scmd));
current_max_ratio = ratio;
// println!("Ratio is: {}", ratio);
// return Some((&cmd.path, &scmd))
for cmd_list in commands {
for cmd in &cmd_list.commands {
for cmd_phrase in &cmd.phrases {
let cmd_phrase = cmd_phrase.trim().to_lowercase();
let cmd_phrase_chars: Vec<char> = cmd_phrase.chars().collect();
// character-level similarity
let char_ratio = ratio(&phrase_chars, &cmd_phrase_chars);
// word-level similarity (handles word order)
let cmd_words: Vec<&str> = cmd_phrase.split_whitespace().collect();
let word_score = word_overlap_score(&phrase_words, &cmd_words);
// combined score (weighted average)
let score = (char_ratio * 0.6) + (word_score * 0.4);
// early exit on perfect match
if score >= 99.0 {
debug!("Perfect match: '{}' -> '{}'", phrase, cmd_phrase);
return Some((&cmd_list.path, cmd));
}
if score > best_score {
best_score = score;
result = Some((&cmd_list.path, cmd));
}
}
}
}
if let Some((cmd_path, scmd)) = result_scmd {
debug!("Ratio is: {}", current_max_ratio);
if let Some((cmd_path, cmd)) = result {
info!(
"CMD is: {cmd_path:?}, SCMD is: {scmd:?}, Ratio is: {}",
current_max_ratio
"Fuzzy match: '{}' -> cmd '{}' (score: {:.1}%)",
phrase, cmd.id, best_score
);
Some((&cmd_path, &scmd))
Some((cmd_path, cmd))
} else {
debug!("No match for '{}' (best: {:.1}%)", phrase, best_score);
None
}
}
/// Scores, as a percentage, how well the words of `input_words` are covered by `cmd_words`.
///
/// Each input word is compared against every command word with `ratio`, whose result is
/// treated as a percentage. The best similarity per input word is kept, and it only
/// contributes (as `similarity / 100`) when it is strictly above 70. The sum is then
/// normalized by the larger of the two word counts, so extra words on either side lower
/// the score. Word order is irrelevant.
///
/// Returns `0.0` when either slice is empty.
///
/// NOTE(review): an input word does not "consume" the command word it matched, so repeated
/// input words can all match the same command word — confirm this is intended.
fn word_overlap_score(input_words: &[&str], cmd_words: &[&str]) -> f64 {
    if input_words.is_empty() || cmd_words.is_empty() {
        return 0.0;
    }

    // Decode every command word once up front, rather than once per (input, command) pair.
    let cmd_chars: Vec<Vec<char>> = cmd_words
        .iter()
        .map(|cmd_word| cmd_word.chars().collect())
        .collect();

    let mut matched = 0.0;
    for input_word in input_words {
        // Likewise decode the input word once, outside the per-command-word comparison.
        let input_chars: Vec<char> = input_word.chars().collect();

        // Best similarity this input word achieves against any command word.
        let best_word_match = cmd_chars
            .iter()
            .map(|cmd_word_chars| ratio(&input_chars, cmd_word_chars))
            .fold(0.0_f64, f64::max);

        // Only sufficiently similar words count, weighted by how similar they are.
        if best_word_match > 70.0 {
            matched += best_word_match / 100.0;
        }
    }

    // Normalize by the longer side so unmatched words on either side reduce the score.
    let max_words = input_words.len().max(cmd_words.len()) as f64;
    (matched / max_words) * 100.0
}
// @TODO. Rewrite executors by executor type struct. (with match arms)
pub fn execute_exe(exe: &str, args: &Vec<String>) -> std::io::Result<Child> {
Command::new(exe).args(args).spawn()

View File

@@ -133,6 +133,16 @@ pub const VOSK_MODEL_PATH: &str = "resources/vosk/model_small";
pub const VOSK_FETCH_PHRASE: &str = "джарвис";
pub const VOSK_MIN_RATIO: f64 = 70.0;
// 0.7 lenient, expect false positives
// 0.8 balanced
// 0.9 strict
// etc
pub const VOSK_WAKE_CONFIDENCE: f32 = 0.9;
pub const VOSK_SPEECH_RECOGNIZER_MAX_ALTERNATIVES: u16 = 3;
pub const VOSK_SPEECH_RECOGNIZER_WORDS: bool = false;
pub const VOSK_SPEECH_PARTIAL_WORDS: bool = false;
// IRE (intents recognition)
pub const INTENT_CLASSIFIER_MIN_CONFIDENCE: f64 = 0.75;

View File

@@ -4,33 +4,60 @@ pub fn init() -> Result<(), ()> {
Ok(()) // nothing to init for Vosk
}
// @TODO. Make it better somehow (more accurate or with higher sensitivity).
pub fn data_callback(frame_buffer: &[i16]) -> Option<i32> {
// recognize & convert to sequence
let recognized_phrase = stt::recognize(&frame_buffer, true).unwrap_or("".into());
if !recognized_phrase.trim().is_empty() {
info!("Vosk wake-word debug info:");
info!("rec: {}", recognized_phrase);
let recognized_phrases = recognized_phrase.split_whitespace();
for phrase in recognized_phrases {
let recognized_phrase_chars = phrase.trim().to_lowercase().chars().collect::<Vec<_>>();
// compare
let compare_ratio = seqdiff::ratio(
&config::VOSK_FETCH_PHRASE.chars().collect::<Vec<_>>(),
&recognized_phrase_chars,
);
info!("og phrase: {:?}", &config::VOSK_FETCH_PHRASE);
info!("recognized phrase: {:?}", &recognized_phrase_chars);
info!("compare ratio: {}", compare_ratio);
if compare_ratio >= config::VOSK_MIN_RATIO {
info!("Phrase activated.");
return Some(0);
}
if let Some((recognized, _confidence)) = stt::recognize_wake_word(frame_buffer) {
let recognized = recognized.trim().to_lowercase();
// skip unknown/empty
if recognized.is_empty() || recognized == "[unk]" {
return None;
}
info!("Wake word candidate: '{}'", recognized);
// verify with seqdiff ratio
let wake_chars: Vec<char> = config::VOSK_FETCH_PHRASE.chars().collect();
let recognized_chars: Vec<char> = recognized.chars().collect();
let similarity = seqdiff::ratio(&wake_chars, &recognized_chars);
info!("Similarity: {:.1}% ('{}' vs '{}')", similarity, recognized, config::VOSK_FETCH_PHRASE);
if similarity >= config::VOSK_MIN_RATIO {
info!("Wake word activated!");
return Some(0);
}
}
None
}
// @TODO. Make it better somehow (more accurate or with higher sensitivity).
// pub fn data_callback(frame_buffer: &[i16]) -> Option<i32> {
// // recognize & convert to sequence
// let recognized_phrase = stt::recognize(&frame_buffer, true).unwrap_or("".into());
// if !recognized_phrase.trim().is_empty() {
// info!("Vosk wake-word debug info:");
// info!("rec: {}", recognized_phrase);
// let recognized_phrases = recognized_phrase.split_whitespace();
// for phrase in recognized_phrases {
// let recognized_phrase_chars = phrase.trim().to_lowercase().chars().collect::<Vec<_>>();
// // compare
// let compare_ratio = seqdiff::ratio(
// &config::VOSK_FETCH_PHRASE.chars().collect::<Vec<_>>(),
// &recognized_phrase_chars,
// );
// info!("og phrase: {:?}", &config::VOSK_FETCH_PHRASE);
// info!("recognized phrase: {:?}", &recognized_phrase_chars);
// info!("compare ratio: {}", compare_ratio);
// if compare_ratio >= config::VOSK_MIN_RATIO {
// info!("Phrase activated.");
// return Some(0);
// }
// }
// }
// None
// }

View File

@@ -8,6 +8,11 @@ use crate::config::structs::SpeechToTextEngine;
use crate::vosk_models;
// use vosk_models::{scan_vosk_models, get_model_path, VoskModelInfo};
pub use self::vosk::init_vosk;
pub use self::vosk::recognize_wake_word;
pub use self::vosk::recognize_speech;
pub use self::vosk::reset_speech_recognizer;
pub use self::vosk::reset_wake_recognizer;
static STT_TYPE: OnceCell<SpeechToTextEngine> = OnceCell::new();
@@ -33,9 +38,16 @@ pub fn init() -> Result<(), ()> {
Ok(())
}
pub fn recognize(data: &[i16], partial: bool) -> Option<String> {
match STT_TYPE.get().unwrap() {
SpeechToTextEngine::Vosk => vosk::recognize(data, partial),
/// Thin dispatcher over the two Vosk recognizers.
///
/// With `include_partial == true` the samples go to `vosk::recognize_wake_word` and only the
/// recognized text is kept (the second tuple element is discarded). Otherwise the samples
/// go to `vosk::recognize_speech`.
pub fn recognize(data: &[i16], include_partial: bool) -> Option<String> {
    match include_partial {
        true => vosk::recognize_wake_word(data).map(|pair| pair.0),
        false => vosk::recognize_speech(data),
    }
}
// pub fn recognize(data: &[i16], partial: bool) -> Option<String> {
// match STT_TYPE.get().unwrap() {
// SpeechToTextEngine::Vosk => vosk::recognize(data, partial),
// }
// }

View File

@@ -9,80 +9,154 @@ use crate::stt::vosk_models;
use crate::DB;
static MODEL: OnceCell<Model> = OnceCell::new();
static RECOGNIZER: OnceCell<Mutex<Recognizer>> = OnceCell::new();
static WAKE_RECOGNIZER: OnceCell<Mutex<Recognizer>> = OnceCell::new();
static SPEECH_RECOGNIZER: OnceCell<Mutex<Recognizer>> = OnceCell::new();
pub fn init_vosk() -> Result<(), String> {
if RECOGNIZER.get().is_some() {
if MODEL.get().is_some() {
return Ok(());
} // already initialized
let model_path = get_configured_model_path()?;
info!("Loading Vosk model from: {}", model_path.display());
let model = Model::new(model_path.to_str().unwrap())
.ok_or_else(|| format!("Failed to load Vosk model from: {}", model_path.display()))?;
let mut recognizer = Recognizer::new(&model, 16000.0)
.ok_or("Failed to create Vosk recognizer")?;
//let mut recognizer = Recognizer::new(&model, 16000.0)
// .ok_or("Failed to create Vosk recognizer")?;
let wake_phrases: &[&str] = &[
config::VOSK_FETCH_PHRASE,
"[unk]",
"джон",
"джони",
"джей",
"джонстон",
"привет",
"давай",
];
let mut wake_recognizer = Recognizer::new_with_grammar(&model, 16000.0, wake_phrases)
.ok_or("Failed to create wake word recognizer")?;
recognizer.set_max_alternatives(10);
recognizer.set_words(true);
recognizer.set_partial_words(true);
wake_recognizer.set_max_alternatives(1); // required for confidence check later on
MODEL.set(model);
RECOGNIZER.set(Mutex::new(recognizer));
let mut speech_recognizer = Recognizer::new(&model, 16000.0)
.ok_or("Failed to create speech recognizer")?;
speech_recognizer.set_max_alternatives(config::VOSK_SPEECH_RECOGNIZER_MAX_ALTERNATIVES);
speech_recognizer.set_words(config::VOSK_SPEECH_RECOGNIZER_WORDS);
speech_recognizer.set_partial_words(config::VOSK_SPEECH_PARTIAL_WORDS);
MODEL.set(model).map_err(|_| "Model already set")?;
WAKE_RECOGNIZER.set(Mutex::new(wake_recognizer)).map_err(|_| "Wake recognizer already set")?;
SPEECH_RECOGNIZER.set(Mutex::new(speech_recognizer)).map_err(|_| "Speech recognizer already set")?;
Ok(())
}
pub fn recognize(data: &[i16], include_partial: bool) -> Option<String> {
let state = RECOGNIZER
.get()
.unwrap()
.lock()
.unwrap()
.accept_waveform(data);
match state {
Ok(ds) => {
match ds {
DecodingState::Running => {
if include_partial {
Some(
RECOGNIZER
.get()
.unwrap()
.lock()
.unwrap()
.partial_result()
.partial
.into(),
)
} else {
None
pub fn recognize_wake_word(data: &[i16]) -> Option<(String, f32)> {
let mut recognizer = WAKE_RECOGNIZER.get()?.lock().unwrap();
match recognizer.accept_waveform(data) {
Ok(DecodingState::Running) => {
// partials don't have confidence, skip them
None
}
Ok(DecodingState::Finalized) => {
let result = recognizer.result();
// compensate confidence issues
if let Some(alternatives) = result.multiple() {
if let Some(best) = alternatives.alternatives.first() {
if !best.text.is_empty() {
return Some((best.text.to_string(), best.confidence));
}
}
DecodingState::Finalized => {
// Result will always be multiple because we called set_max_alternatives
RECOGNIZER
.get()
.unwrap()
.lock()
.unwrap()
.result()
.multiple()
.and_then(|m| m.alternatives.first().map(|a| a.text.to_string()))
}
DecodingState::Failed => None,
}
},
Err(err) => {
error!("Vosk accept waveform error.\nError details: {}", err);
None
None
}
_ => None,
}
}
/// Feeds `data` to the shared speech recognizer and returns the text of its first alternative.
///
/// Returns `None` when:
/// - `SPEECH_RECOGNIZER` has not been set yet,
/// - `accept_waveform` reports anything other than `Ok(DecodingState::Finalized)`
///   (errors are ignored the same way as non-final states),
/// - the result is not a multi-alternative result, or it has no alternatives.
///
/// Panics if the recognizer lock cannot be acquired (`lock().unwrap()`).
pub fn recognize_speech(data: &[i16]) -> Option<String> {
    let cell = SPEECH_RECOGNIZER.get()?;
    let mut guard = cell.lock().unwrap();

    // Nothing to report until the recognizer has finalized an utterance.
    if !matches!(guard.accept_waveform(data), Ok(DecodingState::Finalized)) {
        return None;
    }

    let multiple = guard.result().multiple()?;
    let top = multiple.alternatives.first()?;
    Some(top.text.to_string())
}
/// Calls `reset()` on the shared speech recognizer.
///
/// Does nothing when `SPEECH_RECOGNIZER` has not been set. Panics if the recognizer lock
/// cannot be acquired (`lock().unwrap()`).
pub fn reset_speech_recognizer() {
    match SPEECH_RECOGNIZER.get() {
        Some(cell) => {
            let mut guard = cell.lock().unwrap();
            guard.reset();
        }
        None => {}
    }
}
/// Calls `reset()` on the shared wake-word recognizer.
///
/// Does nothing when `WAKE_RECOGNIZER` has not been set. Panics if the recognizer lock
/// cannot be acquired (`lock().unwrap()`).
pub fn reset_wake_recognizer() {
    match WAKE_RECOGNIZER.get() {
        Some(cell) => {
            let mut guard = cell.lock().unwrap();
            guard.reset();
        }
        None => {}
    }
}
// pub fn recognize(data: &[i16], include_partial: bool) -> Option<String> {
// let state = RECOGNIZER
// .get()
// .unwrap()
// .lock()
// .unwrap()
// .accept_waveform(data);
// match state {
// Ok(ds) => {
// match ds {
// DecodingState::Running => {
// if include_partial {
// Some(
// RECOGNIZER
// .get()
// .unwrap()
// .lock()
// .unwrap()
// .partial_result()
// .partial
// .into(),
// )
// } else {
// None
// }
// }
// DecodingState::Finalized => {
// // Result will always be multiple because we called set_max_alternatives
// RECOGNIZER
// .get()
// .unwrap()
// .lock()
// .unwrap()
// .result()
// .multiple()
// .and_then(|m| m.alternatives.first().map(|a| a.text.to_string()))
// }
// DecodingState::Failed => None,
// }
// },
// Err(err) => {
// error!("Vosk accept waveform error.\nError details: {}", err);
// None
// }
// }
// }
fn get_configured_model_path() -> Result<std::path::PathBuf, String> {
// try to get from settings
if let Some(db) = DB.get() {