feat: add zeroclaw-robot-kit crate for AI-powered robotics
Standalone robot toolkit providing AI agents with physical world interaction. Features: - 6 tools: drive, look, listen, speak, sense, emote - Multiple backends: ROS2, serial, GPIO, mock - Independent SafetyMonitor with E-stop, collision avoidance - Designed for Raspberry Pi 5 + Ollama offline operation - 55 unit/integration tests - Complete Pi 5 hardware setup guide
This commit is contained in:
parent
431287184b
commit
0dfc707c49
18 changed files with 4444 additions and 9 deletions
229
crates/robot-kit/src/speak.rs
Normal file
229
crates/robot-kit/src/speak.rs
Normal file
|
|
@ -0,0 +1,229 @@
|
|||
//! Speak Tool - Text-to-speech via Piper
|
||||
//!
|
||||
//! Converts text to speech using Piper TTS (fast, offline, runs on Pi).
|
||||
//! Plays audio through the speaker.
|
||||
|
||||
use crate::config::RobotConfig;
|
||||
use crate::traits::{Tool, ToolResult};
|
||||
use anyhow::Result;
|
||||
use async_trait::async_trait;
|
||||
use serde_json::{json, Value};
|
||||
use std::path::PathBuf;
|
||||
|
||||
pub struct SpeakTool {
|
||||
config: RobotConfig,
|
||||
audio_dir: PathBuf,
|
||||
}
|
||||
|
||||
impl SpeakTool {
|
||||
pub fn new(config: RobotConfig) -> Self {
|
||||
let audio_dir = directories::UserDirs::new()
|
||||
.map(|d| d.home_dir().join(".zeroclaw/tts_cache"))
|
||||
.unwrap_or_else(|| PathBuf::from("/tmp/zeroclaw_tts"));
|
||||
|
||||
let _ = std::fs::create_dir_all(&audio_dir);
|
||||
|
||||
Self { config, audio_dir }
|
||||
}
|
||||
|
||||
/// Generate speech using Piper and play it
|
||||
async fn speak(&self, text: &str, emotion: &str) -> Result<()> {
|
||||
let piper_path = &self.config.audio.piper_path;
|
||||
let voice = &self.config.audio.piper_voice;
|
||||
let speaker_device = &self.config.audio.speaker_device;
|
||||
|
||||
// Model path
|
||||
let model_path = directories::UserDirs::new()
|
||||
.map(|d| d.home_dir().join(format!(".zeroclaw/models/piper/{}.onnx", voice)))
|
||||
.unwrap_or_else(|| PathBuf::from(format!("/usr/local/share/piper/{}.onnx", voice)));
|
||||
|
||||
// Adjust text based on emotion (simple SSML-like modifications)
|
||||
let processed_text = match emotion {
|
||||
"excited" => format!("{}!", text.trim_end_matches('.')),
|
||||
"sad" => text.to_string(), // Piper doesn't support prosody, but we keep the hook
|
||||
"whisper" => text.to_string(),
|
||||
_ => text.to_string(),
|
||||
};
|
||||
|
||||
// Generate WAV file
|
||||
let output_path = self.audio_dir.join("speech.wav");
|
||||
|
||||
// Pipe text to piper, output to WAV
|
||||
let mut piper = tokio::process::Command::new(piper_path)
|
||||
.args([
|
||||
"--model", model_path.to_str().unwrap(),
|
||||
"--output_file", output_path.to_str().unwrap(),
|
||||
])
|
||||
.stdin(std::process::Stdio::piped())
|
||||
.spawn()?;
|
||||
|
||||
// Write text to stdin
|
||||
if let Some(mut stdin) = piper.stdin.take() {
|
||||
use tokio::io::AsyncWriteExt;
|
||||
stdin.write_all(processed_text.as_bytes()).await?;
|
||||
}
|
||||
|
||||
let status = piper.wait().await?;
|
||||
if !status.success() {
|
||||
anyhow::bail!("Piper TTS failed");
|
||||
}
|
||||
|
||||
// Play audio using aplay
|
||||
let play_result = tokio::process::Command::new("aplay")
|
||||
.args([
|
||||
"-D", speaker_device,
|
||||
output_path.to_str().unwrap(),
|
||||
])
|
||||
.output()
|
||||
.await?;
|
||||
|
||||
if !play_result.status.success() {
|
||||
// Fallback: try paplay (PulseAudio)
|
||||
let fallback = tokio::process::Command::new("paplay")
|
||||
.arg(output_path.to_str().unwrap())
|
||||
.output()
|
||||
.await?;
|
||||
|
||||
if !fallback.status.success() {
|
||||
anyhow::bail!(
|
||||
"Audio playback failed. Tried aplay and paplay.\n{}",
|
||||
String::from_utf8_lossy(&play_result.stderr)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Play a sound effect
|
||||
async fn play_sound(&self, sound: &str) -> Result<()> {
|
||||
let sounds_dir = directories::UserDirs::new()
|
||||
.map(|d| d.home_dir().join(".zeroclaw/sounds"))
|
||||
.unwrap_or_else(|| PathBuf::from("/usr/local/share/zeroclaw/sounds"));
|
||||
|
||||
let sound_file = sounds_dir.join(format!("{}.wav", sound));
|
||||
|
||||
if !sound_file.exists() {
|
||||
anyhow::bail!("Sound file not found: {}", sound_file.display());
|
||||
}
|
||||
|
||||
let speaker_device = &self.config.audio.speaker_device;
|
||||
let output = tokio::process::Command::new("aplay")
|
||||
.args(["-D", speaker_device, sound_file.to_str().unwrap()])
|
||||
.output()
|
||||
.await?;
|
||||
|
||||
if !output.status.success() {
|
||||
anyhow::bail!("Sound playback failed");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl Tool for SpeakTool {
|
||||
fn name(&self) -> &str {
|
||||
"speak"
|
||||
}
|
||||
|
||||
fn description(&self) -> &str {
|
||||
"Speak text out loud using text-to-speech. The robot will say the given text \
|
||||
through its speaker. Can also play sound effects like 'beep', 'chime', 'laugh'."
|
||||
}
|
||||
|
||||
fn parameters_schema(&self) -> Value {
|
||||
json!({
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"text": {
|
||||
"type": "string",
|
||||
"description": "The text to speak out loud"
|
||||
},
|
||||
"emotion": {
|
||||
"type": "string",
|
||||
"enum": ["neutral", "excited", "sad", "whisper"],
|
||||
"description": "Emotional tone. Default 'neutral'."
|
||||
},
|
||||
"sound": {
|
||||
"type": "string",
|
||||
"description": "Play a sound effect instead of speaking (e.g., 'beep', 'chime', 'laugh', 'alert')"
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
async fn execute(&self, args: Value) -> Result<ToolResult> {
|
||||
// Check if playing a sound effect
|
||||
if let Some(sound) = args["sound"].as_str() {
|
||||
return match self.play_sound(sound).await {
|
||||
Ok(()) => Ok(ToolResult {
|
||||
success: true,
|
||||
output: format!("Played sound: {}", sound),
|
||||
error: None,
|
||||
}),
|
||||
Err(e) => Ok(ToolResult {
|
||||
success: false,
|
||||
output: String::new(),
|
||||
error: Some(format!("Sound playback failed: {e}")),
|
||||
}),
|
||||
};
|
||||
}
|
||||
|
||||
// Speak text
|
||||
let text = args["text"]
|
||||
.as_str()
|
||||
.ok_or_else(|| anyhow::anyhow!("Missing 'text' parameter (or use 'sound' for effects)"))?;
|
||||
|
||||
if text.is_empty() {
|
||||
return Ok(ToolResult {
|
||||
success: false,
|
||||
output: String::new(),
|
||||
error: Some("Cannot speak empty text".to_string()),
|
||||
});
|
||||
}
|
||||
|
||||
// Limit text length for safety
|
||||
if text.len() > 1000 {
|
||||
return Ok(ToolResult {
|
||||
success: false,
|
||||
output: String::new(),
|
||||
error: Some("Text too long (max 1000 characters)".to_string()),
|
||||
});
|
||||
}
|
||||
|
||||
let emotion = args["emotion"].as_str().unwrap_or("neutral");
|
||||
|
||||
match self.speak(text, emotion).await {
|
||||
Ok(()) => Ok(ToolResult {
|
||||
success: true,
|
||||
output: format!("Said: \"{}\"", text),
|
||||
error: None,
|
||||
}),
|
||||
Err(e) => Ok(ToolResult {
|
||||
success: false,
|
||||
output: String::new(),
|
||||
error: Some(format!("Speech failed: {e}")),
|
||||
}),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn speak_tool_name() {
|
||||
let tool = SpeakTool::new(RobotConfig::default());
|
||||
assert_eq!(tool.name(), "speak");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn speak_tool_schema() {
|
||||
let tool = SpeakTool::new(RobotConfig::default());
|
||||
let schema = tool.parameters_schema();
|
||||
assert!(schema["properties"]["text"].is_object());
|
||||
assert!(schema["properties"]["emotion"].is_object());
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue