use hound::SampleFormat; use std::io::Read; use whisper_rs::{WhisperVadContext, WhisperVadContextParams, WhisperVadParams, WhisperVadSegment}; fn main() { let model_path = std::env::args() .nth(1) .expect("Please specify path to VAD model as argument 1"); let wav_path = std::env::args() .nth(2) .expect("Please specify path to WAV file as argument 2"); let wav_reader = hound::WavReader::open(wav_path).expect("failed to open wav file"); assert_eq!( wav_reader.spec().sample_rate, 16000, "expected 16kHz sample rate" ); assert_eq!(wav_reader.spec().channels, 1, "expected mono audio"); let samples = decode_to_float(wav_reader); let mut vad_ctx_params = WhisperVadContextParams::default(); vad_ctx_params.set_n_threads(1); vad_ctx_params.set_use_gpu(false); // Note this context could be held in a global Mutex or similar // There's no restrictions on where the output can be sent after it's used, // as it just holds a C-style array internally with no references to the model. let mut vad_ctx = WhisperVadContext::new(&model_path, vad_ctx_params).expect("failed to load model"); let vad_params = WhisperVadParams::new(); let result = vad_ctx .segments_from_samples(vad_params, &samples) .expect("failed to run VAD"); for WhisperVadSegment { start, end } in result { println!( "detected speech between {}s and {}s", // each segment is in centiseconds so must be modified start / 100.0, end / 100.0 ); } } fn decode_to_float(rdr: hound::WavReader) -> Vec { match rdr.spec().sample_format { SampleFormat::Float => rdr .into_samples::() .map(|x| x.expect("expected fp32 WAV file")) .collect(), SampleFormat::Int => rdr .into_samples::() .map(|x| x.expect("expected i16 WAV file") as f32 / 32768.0) .collect(), } }