feat: add zeroclaw-robot-kit crate for AI-powered robotics

Standalone robot toolkit providing AI agents with physical world interaction. Features: - 6 tools: drive, look, listen, speak, sense, emote - Multiple backends: ROS2, serial, GPIO, mock - Independent SafetyMonitor with E-stop, collision avoidance - Designed for Raspberry Pi 5 + Ollama offline operation - 55 unit/integration tests - Complete Pi 5 hardware setup guide
2026-02-17 10:25:54 -06:00 · 2026-02-17 10:25:54 -06:00 · 0dfc707c49
commit 0dfc707c49
parent 431287184b
18 changed files with 4444 additions and 9 deletions
--- a/crates/robot-kit/src/speak.rs
+++ b/crates/robot-kit/src/speak.rs
@ -0,0 +1,229 @@
+//! Speak Tool - Text-to-speech via Piper
+//!
+//! Converts text to speech using Piper TTS (fast, offline, runs on Pi).
+//! Plays audio through the speaker.
+
+use crate::config::RobotConfig;
+use crate::traits::{Tool, ToolResult};
+use anyhow::Result;
+use async_trait::async_trait;
+use serde_json::{json, Value};
+use std::path::PathBuf;
+
+/// Text-to-speech tool: synthesizes speech with Piper and plays WAV audio
+/// through the configured speaker.
+pub struct SpeakTool {
+    /// Robot configuration; its `audio` section supplies the Piper binary
+    /// path, the Piper voice name, and the speaker device.
+    config: RobotConfig,
+    /// Directory that receives the synthesized WAV file.
+    audio_dir: PathBuf,
+}
+
+impl SpeakTool {
+    /// Creates a speak tool that takes its audio settings from `config`.
+    ///
+    /// Generated audio is written to `~/.zeroclaw/tts_cache`, or to
+    /// `/tmp/zeroclaw_tts` when no home directory can be determined.
+    pub fn new(config: RobotConfig) -> Self {
+        let audio_dir = directories::UserDirs::new()
+            .map(|d| d.home_dir().join(".zeroclaw/tts_cache"))
+            .unwrap_or_else(|| PathBuf::from("/tmp/zeroclaw_tts"));
+
+        // Best-effort: construction stays infallible, so a failure to create
+        // the directory is deliberately ignored here.
+        let _ = std::fs::create_dir_all(&audio_dir);
+
+        Self { config, audio_dir }
+    }
+
+    /// Generate speech using Piper and play it.
+    ///
+    /// `emotion` only influences punctuation: `"excited"` strips trailing
+    /// periods and appends `!`; every other value leaves the text as is.
+    ///
+    /// # Errors
+    ///
+    /// Fails when the text is blank, when Piper cannot be started or exits
+    /// unsuccessfully, when the text cannot be written to Piper's stdin, or
+    /// when neither `aplay` nor `paplay` can play the result.
+    async fn speak(&self, text: &str, emotion: &str) -> Result<()> {
+        let piper_path = &self.config.audio.piper_path;
+        let voice = &self.config.audio.piper_voice;
+
+        // Piper reads stdin line by line, so fold the text into one line to
+        // keep multi-line input as a single utterance in the single output
+        // file. NOTE(review): confirm against the deployed Piper version.
+        let single_line = text
+            .lines()
+            .map(str::trim)
+            .filter(|line| !line.is_empty())
+            .collect::<Vec<_>>()
+            .join(" ");
+        if single_line.is_empty() {
+            // Nothing to synthesize; without this guard a stale speech.wav
+            // from an earlier call could be played instead.
+            anyhow::bail!("Cannot speak empty text");
+        }
+
+        // Model path
+        let model_path = directories::UserDirs::new()
+            .map(|d| d.home_dir().join(format!(".zeroclaw/models/piper/{}.onnx", voice)))
+            .unwrap_or_else(|| PathBuf::from(format!("/usr/local/share/piper/{}.onnx", voice)));
+
+        // Adjust text based on emotion (simple SSML-like modifications)
+        let processed_text = match emotion {
+            "excited" => format!("{}!", single_line.trim_end_matches('.')),
+            // "sad" and "whisper" are accepted as hooks but currently leave
+            // the text untouched, like any other value.
+            _ => single_line,
+        };
+
+        // Generate WAV file
+        let output_path = self.audio_dir.join("speech.wav");
+
+        // Paths are passed as OS strings, so a non-UTF-8 home directory can
+        // no longer cause a panic.
+        let mut piper = tokio::process::Command::new(piper_path)
+            .arg("--model")
+            .arg(&model_path)
+            .arg("--output_file")
+            .arg(&output_path)
+            .stdin(std::process::Stdio::piped())
+            .spawn()?;
+
+        // Feed the text on stdin. The handle is dropped right after the write
+        // so Piper sees EOF; the write result is only inspected after the
+        // child has been waited on, so a failed write never skips the wait.
+        let write_result = match piper.stdin.take() {
+            Some(mut stdin) => {
+                use tokio::io::AsyncWriteExt;
+                stdin.write_all(processed_text.as_bytes()).await
+            }
+            None => Ok(()),
+        };
+
+        let status = piper.wait().await?;
+        if !status.success() {
+            anyhow::bail!("Piper TTS failed");
+        }
+        write_result?;
+
+        self.play_wav(&output_path).await
+    }
+
+    /// Play a sound effect
+    ///
+    /// `sound` names a WAV file (without the `.wav` extension) relative to
+    /// `~/.zeroclaw/sounds`, or `/usr/local/share/zeroclaw/sounds` when no
+    /// home directory can be determined.
+    ///
+    /// # Errors
+    ///
+    /// Fails when `sound` is empty or would resolve outside the sounds
+    /// directory, when the file does not exist, or when playback fails.
+    async fn play_sound(&self, sound: &str) -> Result<()> {
+        // `sound` arrives from tool arguments: accept only plain relative
+        // components so absolute paths and `..` cannot escape the sounds dir.
+        let stays_inside = !sound.is_empty()
+            && std::path::Path::new(sound)
+                .components()
+                .all(|part| matches!(part, std::path::Component::Normal(_)));
+        if !stays_inside {
+            anyhow::bail!("Invalid sound name: {}", sound);
+        }
+
+        let sounds_dir = directories::UserDirs::new()
+            .map(|d| d.home_dir().join(".zeroclaw/sounds"))
+            .unwrap_or_else(|| PathBuf::from("/usr/local/share/zeroclaw/sounds"));
+
+        let sound_file = sounds_dir.join(format!("{}.wav", sound));
+
+        if !sound_file.exists() {
+            anyhow::bail!("Sound file not found: {}", sound_file.display());
+        }
+
+        self.play_wav(&sound_file).await
+    }
+
+    /// Plays `wav` with `aplay` on the configured speaker device, falling
+    /// back to `paplay` (PulseAudio) when `aplay` is unavailable or fails.
+    ///
+    /// # Errors
+    ///
+    /// Fails when both players fail; the message carries `aplay`'s stderr
+    /// or the reason it could not be started.
+    async fn play_wav(&self, wav: &std::path::Path) -> Result<()> {
+        let speaker_device = &self.config.audio.speaker_device;
+
+        let aplay = tokio::process::Command::new("aplay")
+            .arg("-D")
+            .arg(speaker_device)
+            .arg(wav)
+            .output()
+            .await;
+
+        // A spawn error (e.g. aplay not installed) is treated like a failed
+        // run so that the fallback below still gets its chance.
+        let aplay_error = match aplay {
+            Ok(out) if out.status.success() => return Ok(()),
+            Ok(out) => String::from_utf8_lossy(&out.stderr).into_owned(),
+            Err(e) => format!("could not run aplay: {e}"),
+        };
+
+        // Fallback: try paplay (PulseAudio)
+        let paplay = tokio::process::Command::new("paplay")
+            .arg(wav)
+            .output()
+            .await;
+
+        match paplay {
+            Ok(out) if out.status.success() => Ok(()),
+            _ => anyhow::bail!(
+                "Audio playback failed. Tried aplay and paplay.\n{}",
+                aplay_error
+            ),
+        }
+    }
+}
+
+#[async_trait]
+impl Tool for SpeakTool {
+    /// Name under which the tool is exposed.
+    fn name(&self) -> &str {
+        "speak"
+    }
+
+    /// Human-readable summary of what the tool does.
+    fn description(&self) -> &str {
+        "Speak text out loud using text-to-speech. The robot will say the given text \
+         through its speaker. Can also play sound effects like 'beep', 'chime', 'laugh'."
+    }
+
+    /// JSON schema of the accepted arguments: `text`, `emotion`, and `sound`.
+    fn parameters_schema(&self) -> Value {
+        json!({
+            "type": "object",
+            "properties": {
+                "text": {
+                    "type": "string",
+                    "description": "The text to speak out loud"
+                },
+                "emotion": {
+                    "type": "string",
+                    "enum": ["neutral", "excited", "sad", "whisper"],
+                    "description": "Emotional tone. Default 'neutral'."
+                },
+                "sound": {
+                    "type": "string",
+                    "description": "Play a sound effect instead of speaking (e.g., 'beep', 'chime', 'laugh', 'alert')"
+                }
+            }
+        })
+    }
+
+    /// Plays a sound effect when `sound` is given, otherwise speaks `text`.
+    ///
+    /// Playback and synthesis failures, blank text, and over-long text are
+    /// reported as a `ToolResult` with `success: false`.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Err` only when neither a string `sound` nor a string `text`
+    /// argument is present.
+    async fn execute(&self, args: Value) -> Result<ToolResult> {
+        // Upper bound on spoken text, counted in characters to match the
+        // error message (a byte count would reject non-ASCII text early).
+        const MAX_TEXT_CHARS: usize = 1000;
+
+        // Check if playing a sound effect
+        if let Some(sound) = args["sound"].as_str() {
+            return match self.play_sound(sound).await {
+                Ok(()) => Ok(ToolResult {
+                    success: true,
+                    output: format!("Played sound: {}", sound),
+                    error: None,
+                }),
+                Err(e) => Ok(ToolResult {
+                    success: false,
+                    output: String::new(),
+                    error: Some(format!("Sound playback failed: {e}")),
+                }),
+            };
+        }
+
+        // Speak text
+        let text = args["text"]
+            .as_str()
+            .ok_or_else(|| anyhow::anyhow!("Missing 'text' parameter (or use 'sound' for effects)"))?;
+
+        // Whitespace-only text has nothing to say either, so treat it as empty.
+        if text.trim().is_empty() {
+            return Ok(ToolResult {
+                success: false,
+                output: String::new(),
+                error: Some("Cannot speak empty text".to_string()),
+            });
+        }
+
+        // Limit text length for safety
+        if text.chars().count() > MAX_TEXT_CHARS {
+            return Ok(ToolResult {
+                success: false,
+                output: String::new(),
+                error: Some("Text too long (max 1000 characters)".to_string()),
+            });
+        }
+
+        let emotion = args["emotion"].as_str().unwrap_or("neutral");
+
+        match self.speak(text, emotion).await {
+            Ok(()) => Ok(ToolResult {
+                success: true,
+                output: format!("Said: \"{}\"", text),
+                error: None,
+            }),
+            Err(e) => Ok(ToolResult {
+                success: false,
+                output: String::new(),
+                error: Some(format!("Speech failed: {e}")),
+            }),
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Builds a tool from the default robot configuration.
+    fn default_tool() -> SpeakTool {
+        SpeakTool::new(RobotConfig::default())
+    }
+
+    /// The tool reports "speak" as its name.
+    #[test]
+    fn speak_tool_name() {
+        assert_eq!(default_tool().name(), "speak");
+    }
+
+    /// The schema describes both `text` and `emotion` as objects.
+    #[test]
+    fn speak_tool_schema() {
+        let schema = default_tool().parameters_schema();
+        for key in ["text", "emotion"] {
+            assert!(schema["properties"][key].is_object());
+        }
+    }
+}