From 05d072ffc4dd8a4f88452e3f15096e5fb2bd218b Mon Sep 17 00:00:00 2001 From: James Bruska Date: Thu, 23 Mar 2023 12:38:54 -0400 Subject: [PATCH 1/8] Updated Cargo.toml and audio_transcription example to not fail cargo test --- Cargo.toml | 5 ++++- examples/audio_transcription.rs | 7 ++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 1de051c..36fdd00 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,8 +16,11 @@ repository = "https://github.com/tazz4843/whisper-rs" [dependencies] whisper-rs-sys = { path = "sys", version = "0.3" } +[dev-dependencies] +hound = "3.5.0" + [features] simd = [] [package.metadata.docs.rs] -features = ["simd"] \ No newline at end of file +features = ["simd"] diff --git a/examples/audio_transcription.rs b/examples/audio_transcription.rs index bf6b3d8..7831f6d 100644 --- a/examples/audio_transcription.rs +++ b/examples/audio_transcription.rs @@ -1,9 +1,10 @@ // This example is not going to build in this folder. -// You need to copy this code into your project and add the whisper_rs dependency in your cargo.toml +// You need to copy this code into your project and add the dependencies whisper_rs and hound in your cargo.toml use std::fs::File; use std::io::Write; use whisper_rs::{FullParams, SamplingStrategy, WhisperContext}; +use hound; /// Loads a context and model, processes an audio file, and prints the resulting transcript to stdout. fn main() { @@ -14,7 +15,7 @@ fn main() { // Create a params object for running the model. // Currently, only the Greedy sampling strategy is implemented, with BeamSearch as a WIP. // The number of past samples to consider defaults to 0. - let mut params = FullParams::new(SamplingStrategy::Greedy { n_past: 0 }); + let mut params = FullParams::new(SamplingStrategy::Greedy { best_of: 0 }); // Edit params as needed. // Set the number of threads to use to 1. @@ -22,7 +23,7 @@ fn main() { // Enable translation. 
params.set_translate(true); // Set the language to translate to to English. - params.set_language("en"); + params.set_language(Some("en")); // Disable anything that prints to stdout. params.set_print_special(false); params.set_print_progress(false); From 31260475dc3af6d617d6672d41feedf08e0a3653 Mon Sep 17 00:00:00 2001 From: James Bruska Date: Thu, 23 Mar 2023 12:41:26 -0400 Subject: [PATCH 2/8] Change assert_stereo_to_mono_simd test to exhibit issue of odd length value --- src/utilities.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/utilities.rs b/src/utilities.rs index 4d210b8..7fdc057 100644 --- a/src/utilities.rs +++ b/src/utilities.rs @@ -120,7 +120,7 @@ mod test { pub fn assert_stereo_to_mono_simd() { // fake some sample data, of 1028 elements let mut samples = Vec::with_capacity(1028); - for i in 0..1028 { + for i in 0..1029 { samples.push(i as f32); } let mono_simd = convert_stereo_to_mono_audio_simd(&samples); From 1873288db0a9bda8173429b4807fda8f1e3722a4 Mon Sep 17 00:00:00 2001 From: James Bruska Date: Thu, 23 Mar 2023 13:44:53 -0400 Subject: [PATCH 3/8] Fixed odd value length bug in convert_stereo_to_mono_audio functions --- src/utilities.rs | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/utilities.rs b/src/utilities.rs index 7fdc057..2da86ed 100644 --- a/src/utilities.rs +++ b/src/utilities.rs @@ -54,6 +54,7 @@ pub fn convert_integer_to_float_audio_simd(samples: &[i16]) -> Vec { /// Convert 32 bit floating point stereo PCM audio to 32 bit floating point mono PCM audio. /// +/// If there are an odd number of samples, the last sample is dropped. /// This variant does not use SIMD instructions. /// /// # Arguments @@ -62,15 +63,12 @@ pub fn convert_integer_to_float_audio_simd(samples: &[i16]) -> Vec { /// # Returns /// A vector of 32 bit floating point mono PCM audio samples. 
pub fn convert_stereo_to_mono_audio(samples: &[f32]) -> Vec { - let mut mono = Vec::with_capacity(samples.len() / 2); - for i in (0..samples.len()).step_by(2) { - mono.push((samples[i] + samples[i + 1]) / 2.0); - } - mono + samples.chunks_exact(2).map(|x| (x[0] + x[1]) / 2.0).collect() } /// Convert 32 bit floating point stereo PCM audio to 32 bit floating point mono PCM audio. /// +/// If there are an odd number of samples, the last sample is dropped. /// This variant uses SIMD instructions, and as such is only available on /// nightly Rust. /// @@ -104,9 +102,7 @@ pub fn convert_stereo_to_mono_audio_simd(samples: &[f32]) -> Vec { // Handle the remainder. // do this normally because it's only a few samples and the overhead of // converting to SIMD is not worth it. - for i in (0..remainder.len()).step_by(2) { - mono.push((remainder[i] + remainder[i + 1]) / 2.0); - } + mono.extend(convert_stereo_to_mono_audio(remainder)); mono } From 445a072bdae821b7801367c65f755fc86eb52239 Mon Sep 17 00:00:00 2001 From: James Bruska Date: Thu, 23 Mar 2023 13:46:36 -0400 Subject: [PATCH 4/8] Turned off warning for unused variable in audio_transcription example --- examples/audio_transcription.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/audio_transcription.rs b/examples/audio_transcription.rs index 7831f6d..d795c39 100644 --- a/examples/audio_transcription.rs +++ b/examples/audio_transcription.rs @@ -32,6 +32,7 @@ fn main() { // Open the audio file. 
let mut reader = hound::WavReader::open("audio.wav").expect("failed to open file"); + #[allow(unused_variables)] let hound::WavSpec { channels, sample_rate, From bad88c38d831c621d768230eacf83283211786eb Mon Sep 17 00:00:00 2001 From: James Bruska Date: Sun, 26 Mar 2023 11:44:24 -0400 Subject: [PATCH 5/8] Change comments for convert_stereo_to_mono_audio functions --- src/utilities.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/utilities.rs b/src/utilities.rs index 2da86ed..f521f7e 100644 --- a/src/utilities.rs +++ b/src/utilities.rs @@ -54,7 +54,7 @@ pub fn convert_integer_to_float_audio_simd(samples: &[i16]) -> Vec { /// Convert 32 bit floating point stereo PCM audio to 32 bit floating point mono PCM audio. /// -/// If there are an odd number of samples, the last sample is dropped. +/// If there are an odd number of samples, the last half-sample is dropped. /// This variant does not use SIMD instructions. /// /// # Arguments @@ -68,7 +68,7 @@ pub fn convert_stereo_to_mono_audio(samples: &[f32]) -> Vec { /// Convert 32 bit floating point stereo PCM audio to 32 bit floating point mono PCM audio. /// -/// If there are an odd number of samples, the last sample is dropped. +/// If there are an odd number of samples, the last half-sample is dropped. /// This variant uses SIMD instructions, and as such is only available on /// nightly Rust. /// From 30ff41989b2879e805454361b3ede345e1f85264 Mon Sep 17 00:00:00 2001 From: James Bruska Date: Mon, 27 Mar 2023 10:35:23 -0400 Subject: [PATCH 6/8] Ran cargo fmt --- examples/audio_transcription.rs | 2 +- src/utilities.rs | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/audio_transcription.rs b/examples/audio_transcription.rs index d795c39..b2f3445 100644 --- a/examples/audio_transcription.rs +++ b/examples/audio_transcription.rs @@ -1,10 +1,10 @@ // This example is not going to build in this folder. 
// You need to copy this code into your project and add the dependencies whisper_rs and hound in your cargo.toml +use hound; use std::fs::File; use std::io::Write; use whisper_rs::{FullParams, SamplingStrategy, WhisperContext}; -use hound; /// Loads a context and model, processes an audio file, and prints the resulting transcript to stdout. fn main() { diff --git a/src/utilities.rs b/src/utilities.rs index f521f7e..79bb9c2 100644 --- a/src/utilities.rs +++ b/src/utilities.rs @@ -63,7 +63,10 @@ pub fn convert_integer_to_float_audio_simd(samples: &[i16]) -> Vec { /// # Returns /// A vector of 32 bit floating point mono PCM audio samples. pub fn convert_stereo_to_mono_audio(samples: &[f32]) -> Vec { - samples.chunks_exact(2).map(|x| (x[0] + x[1]) / 2.0).collect() + samples + .chunks_exact(2) + .map(|x| (x[0] + x[1]) / 2.0) + .collect() } /// Convert 32 bit floating point stereo PCM audio to 32 bit floating point mono PCM audio. From d8271e31d09b3a189177a33bc21819a0f7875eee Mon Sep 17 00:00:00 2001 From: James Bruska Date: Mon, 27 Mar 2023 11:49:13 -0400 Subject: [PATCH 7/8] Changed convert_stereo_to_mono_audio to return a Result --- examples/audio_transcription.rs | 5 ++-- examples/basic_use.rs | 6 ++-- src/utilities.rs | 49 ++++++++++++++++++++++++--------- 3 files changed, 43 insertions(+), 17 deletions(-) diff --git a/examples/audio_transcription.rs b/examples/audio_transcription.rs index b2f3445..7ab716d 100644 --- a/examples/audio_transcription.rs +++ b/examples/audio_transcription.rs @@ -7,7 +7,7 @@ use std::io::Write; use whisper_rs::{FullParams, SamplingStrategy, WhisperContext}; /// Loads a context and model, processes an audio file, and prints the resulting transcript to stdout. -fn main() { +fn main() -> Result<(), &'static str> { // Load a context and model. 
let mut ctx = WhisperContext::new("example/path/to/model/whisper.cpp/models/ggml-base.en.bin") .expect("failed to load model"); @@ -52,7 +52,7 @@ fn main() { // These utilities are provided for convenience, but can be replaced with custom conversion logic. // SIMD variants of these functions are also available on nightly Rust (see the docs). if channels == 2 { - audio = whisper_rs::convert_stereo_to_mono_audio(&audio); + audio = whisper_rs::convert_stereo_to_mono_audio(&audio)?; } else if channels != 1 { panic!(">2 channels unsupported"); } @@ -85,4 +85,5 @@ fn main() { file.write_all(line.as_bytes()) .expect("failed to write to file"); } + Ok(()) } diff --git a/examples/basic_use.rs b/examples/basic_use.rs index 8d0f219..727deba 100644 --- a/examples/basic_use.rs +++ b/examples/basic_use.rs @@ -5,7 +5,7 @@ use whisper_rs::{FullParams, SamplingStrategy, WhisperContext}; // note that running this example will not do anything, as it is just a // demonstration of how to use the library, and actual usage requires // more dependencies than the base library. 
-pub fn usage() { +pub fn usage() -> Result<(), &'static str> { // load a context and model let mut ctx = WhisperContext::new("path/to/model").expect("failed to load model"); @@ -38,7 +38,7 @@ pub fn usage() { // SIMD variants of these functions are also available, but only on nightly Rust: see the docs let audio_data = whisper_rs::convert_stereo_to_mono_audio( &whisper_rs::convert_integer_to_float_audio(&audio_data), - ); + )?; // now we can run the model ctx.full(params, &audio_data[..]) @@ -52,6 +52,8 @@ pub fn usage() { let end_timestamp = ctx.full_get_segment_t1(i); println!("[{} - {}]: {}", start_timestamp, end_timestamp, segment); } + + Ok(()) } fn main() { diff --git a/src/utilities.rs b/src/utilities.rs index 79bb9c2..b976475 100644 --- a/src/utilities.rs +++ b/src/utilities.rs @@ -54,7 +54,6 @@ pub fn convert_integer_to_float_audio_simd(samples: &[i16]) -> Vec { /// Convert 32 bit floating point stereo PCM audio to 32 bit floating point mono PCM audio. /// -/// If there are an odd number of samples, the last half-sample is dropped. /// This variant does not use SIMD instructions. /// /// # Arguments @@ -62,16 +61,20 @@ pub fn convert_integer_to_float_audio_simd(samples: &[i16]) -> Vec { /// /// # Returns /// A vector of 32 bit floating point mono PCM audio samples. -pub fn convert_stereo_to_mono_audio(samples: &[f32]) -> Vec { - samples +pub fn convert_stereo_to_mono_audio(samples: &[f32]) -> Result, &'static str> { + if samples.len() & 1 != 0 { + return Err("The stereo audio vector has an odd number of samples. \ + This means a half-sample is missing somewhere"); + } + + Ok(samples .chunks_exact(2) .map(|x| (x[0] + x[1]) / 2.0) - .collect() + .collect()) } /// Convert 32 bit floating point stereo PCM audio to 32 bit floating point mono PCM audio. /// -/// If there are an odd number of samples, the last half-sample is dropped. /// This variant uses SIMD instructions, and as such is only available on /// nightly Rust. 
/// @@ -81,7 +84,7 @@ pub fn convert_stereo_to_mono_audio(samples: &[f32]) -> Vec { /// # Returns /// A vector of 32 bit floating point mono PCM audio samples. #[cfg(feature = "simd")] -pub fn convert_stereo_to_mono_audio_simd(samples: &[f32]) -> Vec { +pub fn convert_stereo_to_mono_audio_simd(samples: &[f32]) -> Result, &'static str> { let mut mono = Vec::with_capacity(samples.len() / 2); let div_array = f32x16::splat(2.0); @@ -105,9 +108,9 @@ pub fn convert_stereo_to_mono_audio_simd(samples: &[f32]) -> Vec { // Handle the remainder. // do this normally because it's only a few samples and the overhead of // converting to SIMD is not worth it. - mono.extend(convert_stereo_to_mono_audio(remainder)); + mono.extend(convert_stereo_to_mono_audio(remainder)?); - mono + Ok(mono) } #[cfg(feature = "simd")] @@ -115,13 +118,33 @@ pub fn convert_stereo_to_mono_audio_simd(samples: &[f32]) -> Vec { mod test { use super::*; + #[test] + pub fn assert_stereo_to_mono_err() { + // fake some sample data + let samples = (0u16..1029).map(f32::from).collect::>(); + let mono = convert_stereo_to_mono_audio(&samples); + assert!(mono.is_err()); + } +} + +#[cfg(feature = "simd")] +#[cfg(test)] +mod test_simd { + use super::*; + #[test] pub fn assert_stereo_to_mono_simd() { - // fake some sample data, of 1028 elements - let mut samples = Vec::with_capacity(1028); - for i in 0..1029 { - samples.push(i as f32); - } + // fake some sample data + let samples = (0u16..1028).map(f32::from).collect::>(); + let mono_simd = convert_stereo_to_mono_audio_simd(&samples); + let mono = convert_stereo_to_mono_audio(&samples); + assert_eq!(mono_simd, mono); + } + + #[test] + pub fn assert_stereo_to_mono_simd_err() { + // fake some sample data + let samples = (0u16..1029).map(f32::from).collect::>(); let mono_simd = convert_stereo_to_mono_audio_simd(&samples); let mono = convert_stereo_to_mono_audio(&samples); assert_eq!(mono_simd, mono); From 9a3efcca5f6dd3008ab32a3fd126ff497e268748 Mon Sep 17 00:00:00 2001 
From: James Bruska Date: Mon, 27 Mar 2023 11:51:46 -0400 Subject: [PATCH 8/8] Changed version to 0.5.0 due to public API change --- Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index 36fdd00..117dabf 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,7 +4,7 @@ exclude = ["examples/full_usage"] [package] name = "whisper-rs" -version = "0.4.0" +version = "0.5.0" edition = "2021" description = "Rust bindings for whisper.cpp" license = "Unlicense"