mirror of
https://github.com/Priler/jarvis.git
synced 2026-05-26 07:08:11 +00:00
VAD fixes + some calibrations
This commit is contained in:
337
crates/jarvis-app/src/_app.rs
Normal file
337
crates/jarvis-app/src/_app.rs
Normal file
@@ -0,0 +1,337 @@
|
||||
use std::sync::mpsc::Receiver;
|
||||
use std::time::SystemTime;
|
||||
|
||||
use jarvis_core::{audio_buffer::AudioRingBuffer, audio, audio_processing, commands, config, listener, recorder, stt, COMMANDS_LIST, intent, voices, ipc::{self, IpcEvent}};
|
||||
use rand::prelude::*;
|
||||
|
||||
use crate::should_stop;
|
||||
|
||||
/// Two-phase voice-activity state used while listening to the microphone.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum VadState {
    /// Initial state: no speech has been detected yet.
    WaitingForVoice,
    /// Speech has been detected and is considered ongoing.
    VoiceActive,
}
|
||||
|
||||
pub fn start(text_cmd_rx: Receiver<String>) -> Result<(), ()> {
|
||||
// start the loop
|
||||
main_loop(text_cmd_rx)
|
||||
}
|
||||
|
||||
fn main_loop(text_cmd_rx: Receiver<String>) -> Result<(), ()> {
|
||||
let rt = tokio::runtime::Runtime::new().expect("Failed to create tokio runtime");
|
||||
let mut start: SystemTime;
|
||||
// let sounds_directory = audio::get_sound_directory().unwrap();
|
||||
let frame_length: usize = 512; // default for every wake-word engine
|
||||
let sample_rate: usize = 16000;
|
||||
let mut frame_buffer: Vec<i16> = vec![0; frame_length];
|
||||
|
||||
// ring buffer: keep last 2 seconds of audio
|
||||
let mut audio_buffer = AudioRingBuffer::new(2.0, frame_length, sample_rate);
|
||||
|
||||
// VAD state
|
||||
let mut vad_state = VadState::WaitingForVoice;
|
||||
let mut silence_frames: u32 = 0;
|
||||
|
||||
// how many frames of silence before we consider speech ended
|
||||
// 1.5 seconds = 1.5 * (16000 / 512) ≈ 47 frames
|
||||
// @TODO: Put this to config
|
||||
let silence_threshold: u32 = ((1.5 * sample_rate as f32) / frame_length as f32) as u32;
|
||||
|
||||
// play some startup phrase
|
||||
// audio::play_sound(&sounds_directory.join("run.wav"));
|
||||
voices::play_greet();
|
||||
|
||||
// start recording
|
||||
match recorder::start_recording() {
|
||||
Ok(_) => info!("Recording started."),
|
||||
Err(_) => {
|
||||
error!("Cannot start recording.");
|
||||
return Err(()); // quit
|
||||
}
|
||||
}
|
||||
|
||||
// notify GUI we're ready
|
||||
ipc::send(IpcEvent::Idle);
|
||||
|
||||
// DEBUG counter
|
||||
let mut frame_count: u32 = 0;
|
||||
|
||||
// the loop
|
||||
'wake_word: loop {
|
||||
// check for stop signal
|
||||
if should_stop() {
|
||||
info!("Stop signal received, shutting down...");
|
||||
voices::play_goodbye();
|
||||
ipc::send(IpcEvent::Stopping);
|
||||
break;
|
||||
}
|
||||
|
||||
// check for text commands
|
||||
if let Ok(text) = text_cmd_rx.try_recv() {
|
||||
process_text_command(&text, &rt);
|
||||
continue 'wake_word;
|
||||
}
|
||||
|
||||
// read from microphone
|
||||
recorder::read_microphone(&mut frame_buffer);
|
||||
|
||||
// DEBUG: check raw audio
|
||||
frame_count += 1;
|
||||
let raw_rms = calculate_rms(&frame_buffer);
|
||||
|
||||
if frame_count % 100 == 0 {
|
||||
info!("DEBUG [{}]: raw_rms={:.0}", frame_count, raw_rms);
|
||||
}
|
||||
|
||||
// check if we're getting any audio at all
|
||||
if frame_count == 100 && raw_rms < 10.0 {
|
||||
warn!("WARNING: Microphone appears to be silent! RMS={:.0}", raw_rms);
|
||||
}
|
||||
|
||||
// process audio (gain -> noise suppression -> VAD)
|
||||
let processed = audio_processing::process(&frame_buffer);
|
||||
|
||||
if frame_count % 100 == 0 {
|
||||
info!("DEBUG [{}]: is_voice={}, vad_conf={:.2}, processed_rms={:.0}",
|
||||
frame_count,
|
||||
processed.is_voice,
|
||||
processed.vad_confidence,
|
||||
calculate_rms(&processed.samples)
|
||||
);
|
||||
}
|
||||
|
||||
// skip if no voice detected (vad)
|
||||
if !processed.is_voice {
|
||||
continue 'wake_word;
|
||||
}
|
||||
|
||||
// DEBUG: we passed VAD
|
||||
if frame_count % 50 == 0 {
|
||||
info!("DEBUG: Voice detected, checking wake word...");
|
||||
}
|
||||
|
||||
// recognize wake-word
|
||||
match listener::data_callback(&frame_buffer) {
|
||||
Some(_keyword_index) => {
|
||||
// notify GUI
|
||||
ipc::send(IpcEvent::WakeWordDetected);
|
||||
|
||||
// reset some things
|
||||
stt::reset_wake_recognizer();
|
||||
stt::reset_speech_recognizer();
|
||||
audio_processing::reset();
|
||||
|
||||
// wake-word activated, process further commands
|
||||
// capture current time
|
||||
start = SystemTime::now();
|
||||
silence_frames = 0;
|
||||
|
||||
// play some reply phrase
|
||||
// @TODO. Make it via commands or upcoming events system.
|
||||
voices::play_reply();
|
||||
|
||||
|
||||
// notify GUI we're listening
|
||||
ipc::send(IpcEvent::Listening);
|
||||
|
||||
// wait for voice commands
|
||||
'voice_recognition: loop {
|
||||
// check for stop
|
||||
if should_stop() {
|
||||
break 'wake_word;
|
||||
}
|
||||
|
||||
// read from microphone
|
||||
recorder::read_microphone(&mut frame_buffer);
|
||||
|
||||
// process first
|
||||
let processed = audio_processing::process(&frame_buffer);
|
||||
|
||||
// detect silence, return to wake-word if silence
|
||||
if processed.is_voice {
|
||||
silence_frames = 0;
|
||||
} else {
|
||||
silence_frames += 1;
|
||||
if silence_frames > config::VAD_SILENCE_FRAMES * 2 {
|
||||
info!("Long silence detected, returning to wake word mode.");
|
||||
break 'voice_recognition;
|
||||
}
|
||||
}
|
||||
|
||||
// stt part (without partials)
|
||||
if let Some(mut recognized_voice) = stt::recognize(&frame_buffer, false) {
|
||||
// something was recognized
|
||||
info!("Recognized voice: {}", recognized_voice);
|
||||
|
||||
// notify GUI
|
||||
ipc::send(IpcEvent::SpeechRecognized {
|
||||
text: recognized_voice.clone(),
|
||||
});
|
||||
|
||||
// filter recognized voice
|
||||
// @TODO. Better recognized voice filtration.
|
||||
recognized_voice = recognized_voice.to_lowercase();
|
||||
|
||||
// answer again if it's activation phrase repeated
|
||||
if recognized_voice.contains(config::VOSK_FETCH_PHRASE) {
|
||||
info!("Wake word detected during chaining, reactivating...");
|
||||
|
||||
// play greet sound
|
||||
// audio::play_sound(&sounds_directory.join(format!(
|
||||
// "{}.wav",
|
||||
// config::ASSISTANT_GREET_PHRASES
|
||||
// .choose(&mut rand::thread_rng())
|
||||
// .unwrap()
|
||||
// )));
|
||||
voices::play_reply();
|
||||
|
||||
// reset timer and continue listening
|
||||
start = SystemTime::now();
|
||||
silence_frames = 0;
|
||||
stt::reset_speech_recognizer();
|
||||
|
||||
ipc::send(IpcEvent::Listening);
|
||||
continue 'voice_recognition;
|
||||
}
|
||||
|
||||
// filter out activation phrase from command
|
||||
for tbr in config::ASSISTANT_PHRASES_TBR {
|
||||
recognized_voice = recognized_voice.replace(tbr, "");
|
||||
}
|
||||
recognized_voice = recognized_voice.trim().into();
|
||||
|
||||
// skip if nothing left after filtering (*evil laugh*)
|
||||
if recognized_voice.is_empty() {
|
||||
continue 'voice_recognition;
|
||||
}
|
||||
|
||||
// execute command (shared executor)
|
||||
execute_command(&recognized_voice, &rt);
|
||||
|
||||
// return to wake-word listening after command execution (no matter successful or not)
|
||||
break 'voice_recognition;
|
||||
}
|
||||
|
||||
// only recognize voice for a certain period of time
|
||||
match start.elapsed() {
|
||||
Ok(elapsed) if elapsed > config::CMS_WAIT_DELAY => {
|
||||
// return to wake-word listening after N seconds
|
||||
break 'voice_recognition;
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
|
||||
// reset things
|
||||
stt::reset_wake_recognizer();
|
||||
audio_processing::reset();
|
||||
ipc::send(IpcEvent::Idle);
|
||||
}
|
||||
}
|
||||
None => (),
|
||||
}
|
||||
}
|
||||
|
||||
// cleanup
|
||||
recorder::stop_recording().ok();
|
||||
ipc::send(IpcEvent::Stopping);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
|
||||
// process text command from GUI
|
||||
fn process_text_command(text: &str, rt: &tokio::runtime::Runtime) {
|
||||
info!("Processing text command: {}", text);
|
||||
|
||||
ipc::send(IpcEvent::SpeechRecognized { text: text.to_string() });
|
||||
|
||||
// filter text same as voice
|
||||
let mut filtered = text.to_lowercase();
|
||||
for tbr in config::ASSISTANT_PHRASES_TBR {
|
||||
filtered = filtered.replace(tbr, "");
|
||||
}
|
||||
let filtered = filtered.trim();
|
||||
|
||||
if filtered.is_empty() {
|
||||
ipc::send(IpcEvent::Idle);
|
||||
return;
|
||||
}
|
||||
|
||||
execute_command(filtered, rt);
|
||||
}
|
||||
|
||||
// shared command execution logic (manual & voice)
|
||||
fn execute_command(text: &str, rt: &tokio::runtime::Runtime) {
|
||||
let commands_list = match COMMANDS_LIST.get() {
|
||||
Some(c) => c,
|
||||
None => {
|
||||
ipc::send(IpcEvent::Error { message: "Commands not loaded".to_string() });
|
||||
ipc::send(IpcEvent::Idle);
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
// let sounds_directory = audio::get_sound_directory().unwrap();
|
||||
|
||||
// try intent recognition first, fallback to levenshtein
|
||||
let cmd_result = if let Some((intent_id, confidence)) =
|
||||
rt.block_on(intent::classify(text))
|
||||
{
|
||||
info!("Intent recognized: {} (confidence: {:.2})", intent_id, confidence);
|
||||
intent::get_command_by_intent(commands_list, &intent_id)
|
||||
} else {
|
||||
info!("Intent not recognized, trying levenshtein fallback...");
|
||||
commands::fetch_command(text, commands_list)
|
||||
};
|
||||
|
||||
if let Some((cmd_path, cmd_config)) = cmd_result {
|
||||
info!("Command found: {:?}", cmd_path);
|
||||
|
||||
match commands::execute_command(&cmd_path, &cmd_config) {
|
||||
Ok(_) => {
|
||||
info!("Command executed successfully");
|
||||
voices::play_ok(); // command executed sound
|
||||
ipc::send(IpcEvent::CommandExecuted {
|
||||
id: cmd_config.id.clone(),
|
||||
success: true,
|
||||
});
|
||||
}
|
||||
Err(msg) => {
|
||||
error!("Error executing command: {}", msg);
|
||||
voices::play_error();
|
||||
ipc::send(IpcEvent::CommandExecuted {
|
||||
id: cmd_config.id.clone(),
|
||||
success: false,
|
||||
});
|
||||
ipc::send(IpcEvent::Error { message: msg.to_string() });
|
||||
}
|
||||
}
|
||||
} else {
|
||||
info!("No command found for: {}", text);
|
||||
// play "not understood" sound
|
||||
// audio::play_sound(&sounds_directory.join("not_understand.wav"));
|
||||
voices::play_not_found();
|
||||
ipc::send(IpcEvent::Error {
|
||||
message: format!("Command not found: {}", text)
|
||||
});
|
||||
}
|
||||
|
||||
ipc::send(IpcEvent::Idle);
|
||||
}
|
||||
|
||||
|
||||
/// Wake-word keyword callback; currently a no-op.
fn keyword_callback(keyword_index: i32) {
    // The index is not used yet; this binding only silences the
    // unused-parameter warning without changing the signature.
    let _ = keyword_index;
}
|
||||
|
||||
pub fn close(code: i32) {
|
||||
info!("Closing application.");
|
||||
voices::play_goodbye();
|
||||
ipc::send(IpcEvent::Stopping);
|
||||
std::process::exit(code);
|
||||
}
|
||||
|
||||
/// Root-mean-square amplitude of a slice of `i16` samples.
///
/// Returns `0.0` for an empty slice. Squares are accumulated in `f64` and the
/// result is narrowed to `f32` at the end.
fn calculate_rms(samples: &[i16]) -> f32 {
    if samples.is_empty() {
        return 0.0;
    }

    // `f64::from` is a lossless widening of each sample.
    let sum_of_squares: f64 = samples.iter().map(|&s| f64::from(s).powi(2)).sum();
    (sum_of_squares / samples.len() as f64).sqrt() as f32
}
|
||||
@@ -1,43 +1,52 @@
|
||||
use std::sync::mpsc::Receiver;
|
||||
use std::time::SystemTime;
|
||||
|
||||
use jarvis_core::{audio, audio_processing, commands, config, listener, recorder, stt, COMMANDS_LIST, intent, voices, ipc::{self, IpcEvent}};
|
||||
use rand::prelude::*;
|
||||
use jarvis_core::{audio_buffer::AudioRingBuffer, audio_processing, commands, config, listener, recorder, stt, COMMANDS_LIST, intent, voices, ipc::{self, IpcEvent}};
|
||||
|
||||
use crate::should_stop;
|
||||
|
||||
// VAD state machine
|
||||
#[derive(Debug, Clone, Copy, PartialEq)]
|
||||
enum VadState {
|
||||
WaitingForVoice,
|
||||
VoiceActive,
|
||||
}
|
||||
|
||||
pub fn start(text_cmd_rx: Receiver<String>) -> Result<(), ()> {
|
||||
// start the loop
|
||||
main_loop(text_cmd_rx)
|
||||
}
|
||||
|
||||
fn main_loop(text_cmd_rx: Receiver<String>) -> Result<(), ()> {
|
||||
let rt = tokio::runtime::Runtime::new().expect("Failed to create tokio runtime");
|
||||
let mut start: SystemTime;
|
||||
// let sounds_directory = audio::get_sound_directory().unwrap();
|
||||
let frame_length: usize = 512; // default for every wake-word engine
|
||||
let frame_length: usize = 512;
|
||||
let sample_rate: usize = 16000;
|
||||
let mut frame_buffer: Vec<i16> = vec![0; frame_length];
|
||||
|
||||
// ring buffer: keeps last 2 seconds of audio (pre-roll)
|
||||
let mut audio_buffer = AudioRingBuffer::new(2.0, frame_length, sample_rate);
|
||||
|
||||
// VAD state
|
||||
let mut vad_state = VadState::WaitingForVoice;
|
||||
let mut silence_frames: u32 = 0;
|
||||
|
||||
// play some startup phrase
|
||||
// audio::play_sound(&sounds_directory.join("run.wav"));
|
||||
|
||||
// how many frames of silence before we consider speech ended
|
||||
// 1.5 seconds = 1.5 * (16000 / 512) ≈ 47 frames
|
||||
let silence_threshold: u32 = ((1.5 * sample_rate as f32) / frame_length as f32) as u32;
|
||||
|
||||
voices::play_greet();
|
||||
|
||||
// start recording
|
||||
match recorder::start_recording() {
|
||||
Ok(_) => info!("Recording started."),
|
||||
Err(_) => {
|
||||
error!("Cannot start recording.");
|
||||
return Err(()); // quit
|
||||
return Err(());
|
||||
}
|
||||
}
|
||||
|
||||
// notify GUI we're ready
|
||||
ipc::send(IpcEvent::Idle);
|
||||
|
||||
// the loop
|
||||
// ### WAKE WORD DETECTION LOOP
|
||||
'wake_word: loop {
|
||||
// check for stop signal
|
||||
if should_stop() {
|
||||
info!("Stop signal received, shutting down...");
|
||||
voices::play_goodbye();
|
||||
@@ -45,145 +54,78 @@ fn main_loop(text_cmd_rx: Receiver<String>) -> Result<(), ()> {
|
||||
break;
|
||||
}
|
||||
|
||||
// check for text commands
|
||||
if let Ok(text) = text_cmd_rx.try_recv() {
|
||||
process_text_command(&text, &rt);
|
||||
continue 'wake_word;
|
||||
}
|
||||
|
||||
// read from microphone
|
||||
recorder::read_microphone(&mut frame_buffer);
|
||||
|
||||
// process audio (gain -> noise suppression -> VAD)
|
||||
let processed = audio_processing::process(&frame_buffer);
|
||||
|
||||
// skip if no voice detected (vad)
|
||||
if !processed.is_voice {
|
||||
continue 'wake_word;
|
||||
}
|
||||
|
||||
// recognize wake-word
|
||||
match listener::data_callback(&frame_buffer) {
|
||||
Some(_keyword_index) => {
|
||||
// notify GUI
|
||||
ipc::send(IpcEvent::WakeWordDetected);
|
||||
|
||||
// reset some things
|
||||
stt::reset_wake_recognizer();
|
||||
stt::reset_speech_recognizer();
|
||||
audio_processing::reset();
|
||||
|
||||
// wake-word activated, process further commands
|
||||
// capture current time
|
||||
start = SystemTime::now();
|
||||
silence_frames = 0;
|
||||
|
||||
// play some reply phrase
|
||||
// @TODO. Make it via commands or upcoming events system.
|
||||
voices::play_reply();
|
||||
|
||||
|
||||
// notify GUI we're listening
|
||||
ipc::send(IpcEvent::Listening);
|
||||
|
||||
// wait for voice commands
|
||||
'voice_recognition: loop {
|
||||
// check for stop
|
||||
if should_stop() {
|
||||
break 'wake_word;
|
||||
|
||||
match vad_state {
|
||||
VadState::WaitingForVoice => {
|
||||
// always buffer audio
|
||||
audio_buffer.push(&frame_buffer);
|
||||
|
||||
if processed.is_voice {
|
||||
// voice started! flush buffer to Vosk
|
||||
info!("VAD: Voice started, flushing {} buffered frames", audio_buffer.len());
|
||||
|
||||
for buffered_frame in audio_buffer.drain_all() {
|
||||
listener::data_callback(&buffered_frame);
|
||||
}
|
||||
|
||||
// read from microphone
|
||||
recorder::read_microphone(&mut frame_buffer);
|
||||
|
||||
// process first
|
||||
let processed = audio_processing::process(&frame_buffer);
|
||||
|
||||
// detect silence, return to wake-word if silence
|
||||
if processed.is_voice {
|
||||
silence_frames = 0;
|
||||
} else {
|
||||
silence_frames += 1;
|
||||
if silence_frames > config::VAD_SILENCE_FRAMES * 2 {
|
||||
info!("Long silence detected, returning to wake word mode.");
|
||||
break 'voice_recognition;
|
||||
}
|
||||
}
|
||||
|
||||
// stt part (without partials)
|
||||
if let Some(mut recognized_voice) = stt::recognize(&frame_buffer, false) {
|
||||
// something was recognized
|
||||
info!("Recognized voice: {}", recognized_voice);
|
||||
|
||||
// notify GUI
|
||||
ipc::send(IpcEvent::SpeechRecognized {
|
||||
text: recognized_voice.clone(),
|
||||
});
|
||||
|
||||
// filter recognized voice
|
||||
// @TODO. Better recognized voice filtration.
|
||||
recognized_voice = recognized_voice.to_lowercase();
|
||||
|
||||
// answer again if it's activation phrase repeated
|
||||
if recognized_voice.contains(config::VOSK_FETCH_PHRASE) {
|
||||
info!("Wake word detected during chaining, reactivating...");
|
||||
|
||||
// play greet sound
|
||||
// audio::play_sound(&sounds_directory.join(format!(
|
||||
// "{}.wav",
|
||||
// config::ASSISTANT_GREET_PHRASES
|
||||
// .choose(&mut rand::thread_rng())
|
||||
// .unwrap()
|
||||
// )));
|
||||
voices::play_reply();
|
||||
|
||||
// reset timer and continue listening
|
||||
start = SystemTime::now();
|
||||
silence_frames = 0;
|
||||
stt::reset_speech_recognizer();
|
||||
|
||||
ipc::send(IpcEvent::Listening);
|
||||
continue 'voice_recognition;
|
||||
}
|
||||
|
||||
// filter out activation phrase from command
|
||||
for tbr in config::ASSISTANT_PHRASES_TBR {
|
||||
recognized_voice = recognized_voice.replace(tbr, "");
|
||||
}
|
||||
recognized_voice = recognized_voice.trim().into();
|
||||
|
||||
// skip if nothing left after filtering (*evil laugh*)
|
||||
if recognized_voice.is_empty() {
|
||||
continue 'voice_recognition;
|
||||
}
|
||||
|
||||
// execute command (shared executor)
|
||||
execute_command(&recognized_voice, &rt);
|
||||
|
||||
// return to wake-word listening after command execution (no matter successful or not)
|
||||
break 'voice_recognition;
|
||||
}
|
||||
|
||||
// only recognize voice for a certain period of time
|
||||
match start.elapsed() {
|
||||
Ok(elapsed) if elapsed > config::CMS_WAIT_DELAY => {
|
||||
// return to wake-word listening after N seconds
|
||||
break 'voice_recognition;
|
||||
}
|
||||
_ => (),
|
||||
}
|
||||
|
||||
// reset things
|
||||
|
||||
vad_state = VadState::VoiceActive;
|
||||
silence_frames = 0;
|
||||
}
|
||||
}
|
||||
|
||||
VadState::VoiceActive => {
|
||||
// feed to wake word detector
|
||||
if let Some(_keyword_index) = listener::data_callback(&frame_buffer) {
|
||||
// WAKE WORD DETECTED!
|
||||
info!("Wake word activated!");
|
||||
ipc::send(IpcEvent::WakeWordDetected);
|
||||
|
||||
stt::reset_wake_recognizer();
|
||||
stt::reset_speech_recognizer();
|
||||
audio_processing::reset();
|
||||
|
||||
voices::play_reply();
|
||||
ipc::send(IpcEvent::Listening);
|
||||
|
||||
// enter voice recognition mode
|
||||
recognize_command(&mut frame_buffer, &rt, frame_length, sample_rate);
|
||||
|
||||
// reset state after command
|
||||
vad_state = VadState::WaitingForVoice;
|
||||
silence_frames = 0;
|
||||
audio_buffer.clear();
|
||||
stt::reset_wake_recognizer();
|
||||
audio_processing::reset();
|
||||
ipc::send(IpcEvent::Idle);
|
||||
|
||||
continue 'wake_word;
|
||||
}
|
||||
|
||||
// track silence
|
||||
if processed.is_voice {
|
||||
silence_frames = 0;
|
||||
} else {
|
||||
silence_frames += 1;
|
||||
|
||||
if silence_frames > silence_threshold {
|
||||
// silence timeout, back to waiting
|
||||
debug!("VAD: Silence timeout, returning to wait state");
|
||||
vad_state = VadState::WaitingForVoice;
|
||||
silence_frames = 0;
|
||||
stt::reset_wake_recognizer();
|
||||
}
|
||||
}
|
||||
}
|
||||
None => (),
|
||||
}
|
||||
}
|
||||
|
||||
// cleanup
|
||||
recorder::stop_recording().ok();
|
||||
ipc::send(IpcEvent::Stopping);
|
||||
|
||||
@@ -191,13 +133,129 @@ fn main_loop(text_cmd_rx: Receiver<String>) -> Result<(), ()> {
|
||||
}
|
||||
|
||||
|
||||
// process text command from GUI
|
||||
// Voice recognition for command after wake word
|
||||
fn recognize_command(
|
||||
frame_buffer: &mut [i16],
|
||||
rt: &tokio::runtime::Runtime,
|
||||
frame_length: usize,
|
||||
sample_rate: usize,
|
||||
) {
|
||||
let mut audio_buffer = AudioRingBuffer::new(2.0, frame_length, sample_rate);
|
||||
let mut vad_state = VadState::WaitingForVoice;
|
||||
let mut silence_frames: u32 = 0;
|
||||
let mut start = SystemTime::now();
|
||||
|
||||
// longer silence threshold for commands (user might pause to think)
|
||||
// 2 seconds
|
||||
let silence_threshold: u32 = ((2.0 * sample_rate as f32) / frame_length as f32) as u32;
|
||||
|
||||
loop {
|
||||
if crate::should_stop() {
|
||||
return;
|
||||
}
|
||||
|
||||
recorder::read_microphone(frame_buffer);
|
||||
let processed = audio_processing::process(frame_buffer);
|
||||
|
||||
match vad_state {
|
||||
VadState::WaitingForVoice => {
|
||||
audio_buffer.push(frame_buffer);
|
||||
|
||||
if processed.is_voice {
|
||||
// flush buffer to STT
|
||||
for buffered_frame in audio_buffer.drain_all() {
|
||||
stt::recognize(&buffered_frame, false);
|
||||
}
|
||||
vad_state = VadState::VoiceActive;
|
||||
silence_frames = 0;
|
||||
}
|
||||
}
|
||||
|
||||
VadState::VoiceActive => {
|
||||
// feed to STT
|
||||
if let Some(mut recognized_voice) = stt::recognize(frame_buffer, false) {
|
||||
info!("Recognized voice: {}", recognized_voice);
|
||||
|
||||
ipc::send(IpcEvent::SpeechRecognized {
|
||||
text: recognized_voice.clone(),
|
||||
});
|
||||
|
||||
recognized_voice = recognized_voice.to_lowercase();
|
||||
|
||||
// check if wake word repeated (reactivate)
|
||||
if recognized_voice.contains(config::VOSK_FETCH_PHRASE) {
|
||||
info!("Wake word detected during chaining, reactivating...");
|
||||
voices::play_reply();
|
||||
stt::reset_speech_recognizer();
|
||||
ipc::send(IpcEvent::Listening);
|
||||
|
||||
// reset for next command
|
||||
vad_state = VadState::WaitingForVoice;
|
||||
silence_frames = 0;
|
||||
start = SystemTime::now();
|
||||
audio_buffer.clear();
|
||||
continue;
|
||||
}
|
||||
|
||||
// filter activation phrases
|
||||
for tbr in config::ASSISTANT_PHRASES_TBR {
|
||||
recognized_voice = recognized_voice.replace(tbr, "");
|
||||
}
|
||||
recognized_voice = recognized_voice.trim().to_string();
|
||||
|
||||
if recognized_voice.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// execute command and check if we should chain
|
||||
let should_chain = execute_command(&recognized_voice, rt);
|
||||
|
||||
if should_chain {
|
||||
// chain: reset and continue listening
|
||||
info!("Chaining enabled, continuing to listen...");
|
||||
stt::reset_speech_recognizer();
|
||||
vad_state = VadState::WaitingForVoice;
|
||||
silence_frames = 0;
|
||||
start = SystemTime::now();
|
||||
audio_buffer.clear();
|
||||
ipc::send(IpcEvent::Listening);
|
||||
continue;
|
||||
} else {
|
||||
// no chain: return to wake word
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// track silence
|
||||
if processed.is_voice {
|
||||
silence_frames = 0;
|
||||
} else {
|
||||
silence_frames += 1;
|
||||
|
||||
if silence_frames > silence_threshold {
|
||||
info!("Long silence detected, returning to wake word mode.");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// timeout
|
||||
if let Ok(elapsed) = start.elapsed() {
|
||||
if elapsed > config::CMS_WAIT_DELAY {
|
||||
info!("Command timeout, returning to wake word mode.");
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn process_text_command(text: &str, rt: &tokio::runtime::Runtime) {
|
||||
info!("Processing text command: {}", text);
|
||||
|
||||
ipc::send(IpcEvent::SpeechRecognized { text: text.to_string() });
|
||||
|
||||
// filter text same as voice
|
||||
let mut filtered = text.to_lowercase();
|
||||
for tbr in config::ASSISTANT_PHRASES_TBR {
|
||||
filtered = filtered.replace(tbr, "");
|
||||
@@ -209,23 +267,22 @@ fn process_text_command(text: &str, rt: &tokio::runtime::Runtime) {
|
||||
return;
|
||||
}
|
||||
|
||||
// text commands never chain
|
||||
execute_command(filtered, rt);
|
||||
}
|
||||
|
||||
// shared command execution logic (manual & voice)
|
||||
fn execute_command(text: &str, rt: &tokio::runtime::Runtime) {
|
||||
|
||||
// Execute command, returns true if chaining should continue
|
||||
fn execute_command(text: &str, rt: &tokio::runtime::Runtime) -> bool {
|
||||
let commands_list = match COMMANDS_LIST.get() {
|
||||
Some(c) => c,
|
||||
None => {
|
||||
ipc::send(IpcEvent::Error { message: "Commands not loaded".to_string() });
|
||||
ipc::send(IpcEvent::Idle);
|
||||
return;
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
// let sounds_directory = audio::get_sound_directory().unwrap();
|
||||
|
||||
// try intent recognition first, fallback to levenshtein
|
||||
let cmd_result = if let Some((intent_id, confidence)) =
|
||||
rt.block_on(intent::classify(text))
|
||||
{
|
||||
@@ -240,13 +297,15 @@ fn execute_command(text: &str, rt: &tokio::runtime::Runtime) {
|
||||
info!("Command found: {:?}", cmd_path);
|
||||
|
||||
match commands::execute_command(&cmd_path, &cmd_config) {
|
||||
Ok(_) => {
|
||||
Ok(chain) => {
|
||||
info!("Command executed successfully");
|
||||
voices::play_ok(); // command executed sound
|
||||
voices::play_ok();
|
||||
ipc::send(IpcEvent::CommandExecuted {
|
||||
id: cmd_config.id.clone(),
|
||||
success: true,
|
||||
});
|
||||
ipc::send(IpcEvent::Idle);
|
||||
return chain; // return chain status from command
|
||||
}
|
||||
Err(msg) => {
|
||||
error!("Error executing command: {}", msg);
|
||||
@@ -260,8 +319,6 @@ fn execute_command(text: &str, rt: &tokio::runtime::Runtime) {
|
||||
}
|
||||
} else {
|
||||
info!("No command found for: {}", text);
|
||||
// play "not understood" sound
|
||||
// audio::play_sound(&sounds_directory.join("not_understand.wav"));
|
||||
voices::play_not_found();
|
||||
ipc::send(IpcEvent::Error {
|
||||
message: format!("Command not found: {}", text)
|
||||
@@ -269,14 +326,13 @@ fn execute_command(text: &str, rt: &tokio::runtime::Runtime) {
|
||||
}
|
||||
|
||||
ipc::send(IpcEvent::Idle);
|
||||
false // no chain on error or not found
|
||||
}
|
||||
|
||||
|
||||
fn keyword_callback(keyword_index: i32) {}
|
||||
|
||||
pub fn close(code: i32) {
|
||||
info!("Closing application.");
|
||||
voices::play_goodbye();
|
||||
ipc::send(IpcEvent::Stopping);
|
||||
std::process::exit(code);
|
||||
}
|
||||
}
|
||||
41
crates/jarvis-core/src/audio_buffer.rs
Normal file
41
crates/jarvis-core/src/audio_buffer.rs
Normal file
@@ -0,0 +1,41 @@
|
||||
use std::collections::VecDeque;
|
||||
|
||||
/// Bounded FIFO of audio frames that keeps only the most recent ones.
///
/// Once the buffer holds `max_frames` frames, pushing another frame evicts
/// the oldest one.
#[derive(Debug)]
pub struct AudioRingBuffer {
    /// Buffered frames, oldest at the front.
    buffer: VecDeque<Vec<i16>>,
    /// Maximum number of frames retained at any time.
    max_frames: usize,
}

impl AudioRingBuffer {
    /// Creates a buffer that holds `seconds` worth of audio, given the number
    /// of samples per frame (`frame_size`) and samples per second
    /// (`sample_rate`).
    ///
    /// The capacity is `seconds * sample_rate / frame_size`, computed in
    /// floating point and truncated toward zero, so a fractional
    /// frames-per-second rate is not rounded down before scaling. A
    /// `frame_size` of 0 yields a zero-capacity buffer instead of panicking.
    pub fn new(seconds: f32, frame_size: usize, sample_rate: usize) -> Self {
        let max_frames = if frame_size == 0 {
            0
        } else {
            // Float-to-integer `as` casts saturate; NaN and negative values become 0.
            (seconds * sample_rate as f32 / frame_size as f32) as usize
        };

        Self {
            buffer: VecDeque::with_capacity(max_frames),
            max_frames,
        }
    }

    /// Pushes a copy of `frame`, dropping the oldest frame if the buffer is
    /// full. A zero-capacity buffer stores nothing.
    pub fn push(&mut self, frame: &[i16]) {
        if self.max_frames == 0 {
            return;
        }
        if self.buffer.len() >= self.max_frames {
            self.buffer.pop_front();
        }
        self.buffer.push_back(frame.to_vec());
    }

    /// Removes and returns all buffered frames, oldest first, leaving the
    /// buffer empty.
    pub fn drain_all(&mut self) -> Vec<Vec<i16>> {
        self.buffer.drain(..).collect()
    }

    /// Number of frames currently buffered.
    pub fn len(&self) -> usize {
        self.buffer.len()
    }

    /// Returns `true` when no frames are buffered.
    pub fn is_empty(&self) -> bool {
        self.buffer.is_empty()
    }

    /// Discards all buffered frames.
    pub fn clear(&mut self) {
        self.buffer.clear();
    }
}
|
||||
@@ -163,8 +163,8 @@ pub const DEFAULT_VAD: VadBackend = VadBackend::Energy;
|
||||
pub const DEFAULT_GAIN_NORMALIZER: bool = false;
|
||||
|
||||
// VAD settings
|
||||
pub const VAD_ENERGY_THRESHOLD: f32 = 500.0; // RMS threshold for energy-based VAD
|
||||
pub const VAD_NNNOISELESS_THRESHOLD: f32 = 0.5; // probability threshold for nnnoiseless
|
||||
pub const VAD_ENERGY_THRESHOLD: f32 = 100.0; // RMS threshold for energy-based VAD
|
||||
pub const VAD_NNNOISELESS_THRESHOLD: f32 = 0.8; // probability threshold for nnnoiseless
|
||||
pub const VAD_SILENCE_FRAMES: u32 = 15; // frames of silence before speech end (~480ms)
|
||||
|
||||
// gain normalizer settings
|
||||
|
||||
@@ -63,7 +63,7 @@ settings-stt-engine = Speech recognition
|
||||
settings-intent-engine = Intent recognition
|
||||
settings-intent-engine-desc = Select neural network for command recognition.
|
||||
settings-noise-suppression = Noise suppression
|
||||
settings-noise-suppression-desc = Reduces background noise.
|
||||
settings-noise-suppression-desc = Reduces background noise. May negatively affect recognition.
|
||||
settings-vad = Voice detection (VAD)
|
||||
settings-vad-desc = Skips silence, saves CPU resources.
|
||||
settings-gain-normalizer = Gain normalizer
|
||||
|
||||
@@ -63,7 +63,7 @@ settings-stt-engine = Распознавание речи
|
||||
settings-intent-engine = Определение намерения
|
||||
settings-intent-engine-desc = Выберите нейросеть для распознавания команд.
|
||||
settings-noise-suppression = Шумоподавление
|
||||
settings-noise-suppression-desc = Уменьшает фоновый шум.
|
||||
settings-noise-suppression-desc = Уменьшает фоновый шум. Может негативно влиять на распознавание.
|
||||
settings-vad = Определение голоса (VAD)
|
||||
settings-vad-desc = Пропускает тишину, экономит ресурсы CPU.
|
||||
settings-gain-normalizer = Нормализация громкости
|
||||
|
||||
@@ -63,7 +63,7 @@ settings-stt-engine = Розпізнавання мовлення
|
||||
settings-intent-engine = Визначення наміру
|
||||
settings-intent-engine-desc = Виберіть нейромережу для розпізнавання команд.
|
||||
settings-noise-suppression = Шумозаглушення
|
||||
settings-noise-suppression-desc = Зменшує фоновий шум.
|
||||
settings-noise-suppression-desc = Зменшує фоновий шум. Може негативно впливати на розпізнавання.
|
||||
settings-vad = Визначення голосу (VAD)
|
||||
settings-vad-desc = Пропускає тишу, економить ресурси CPU.
|
||||
settings-gain-normalizer = Нормалізація гучності
|
||||
|
||||
@@ -36,6 +36,8 @@ pub mod ipc;
|
||||
|
||||
pub mod voices;
|
||||
|
||||
pub mod audio_buffer;
|
||||
|
||||
// shared statics
|
||||
// pub static APP_DIR: Lazy<PathBuf> = Lazy::new(|| std::env::current_dir().unwrap());
|
||||
pub static APP_DIR: Lazy<PathBuf> = Lazy::new(|| {
|
||||
|
||||
Reference in New Issue
Block a user