zeroclaw/crates/robot-kit/src/look.rs
Lumi-node 0dfc707c49 feat: add zeroclaw-robot-kit crate for AI-powered robotics
Standalone robot toolkit providing AI agents with physical world interaction.

Features:
- 6 tools: drive, look, listen, speak, sense, emote
- Multiple backends: ROS2, serial, GPIO, mock
- Independent SafetyMonitor with E-stop, collision avoidance
- Designed for Raspberry Pi 5 + Ollama offline operation
- 55 unit/integration tests
- Complete Pi 5 hardware setup guide
2026-02-18 14:19:17 +08:00

240 lines
8 KiB
Rust

//! Look Tool - Camera capture + vision model description
//!
//! Captures an image from the camera and optionally describes it
//! using a local vision model (LLaVA, Moondream) via Ollama.
use crate::config::RobotConfig;
use crate::traits::{Tool, ToolResult};
use anyhow::Result;
use async_trait::async_trait;
use serde_json::{json, Value};
use std::path::{Path, PathBuf};
/// Tool that captures a camera frame and optionally describes it with a
/// local vision model (e.g. LLaVA, Moondream) served by Ollama.
pub struct LookTool {
    // Robot-wide configuration; this tool reads the `camera` section
    // (device, width/height, vision_model, ollama_url).
    config: RobotConfig,
    // Directory where captured JPEGs are written
    // (~/.zeroclaw/captures, or /tmp/zeroclaw_captures as fallback).
    capture_dir: PathBuf,
}
impl LookTool {
pub fn new(config: RobotConfig) -> Self {
let capture_dir = directories::UserDirs::new()
.map(|d| d.home_dir().join(".zeroclaw/captures"))
.unwrap_or_else(|| PathBuf::from("/tmp/zeroclaw_captures"));
// Ensure capture directory exists
let _ = std::fs::create_dir_all(&capture_dir);
Self { config, capture_dir }
}
/// Capture image using ffmpeg (works with most cameras)
async fn capture_image(&self) -> Result<PathBuf> {
let timestamp = chrono::Utc::now().format("%Y%m%d_%H%M%S");
let filename = self.capture_dir.join(format!("capture_{}.jpg", timestamp));
let device = &self.config.camera.device;
let width = self.config.camera.width;
let height = self.config.camera.height;
// Use ffmpeg for broad camera compatibility
let output = tokio::process::Command::new("ffmpeg")
.args([
"-f", "v4l2",
"-video_size", &format!("{}x{}", width, height),
"-i", device,
"-frames:v", "1",
"-y", // Overwrite
filename.to_str().unwrap(),
])
.output()
.await?;
if !output.status.success() {
// Fallback: try fswebcam (simpler, often works on Pi)
let fallback = tokio::process::Command::new("fswebcam")
.args([
"-r", &format!("{}x{}", width, height),
"--no-banner",
"-d", device,
filename.to_str().unwrap(),
])
.output()
.await?;
if !fallback.status.success() {
anyhow::bail!(
"Camera capture failed. Tried ffmpeg and fswebcam.\n\
ffmpeg: {}\n\
fswebcam: {}",
String::from_utf8_lossy(&output.stderr),
String::from_utf8_lossy(&fallback.stderr)
);
}
}
Ok(filename)
}
/// Describe image using vision model via Ollama
async fn describe_image(&self, image_path: &PathBuf, prompt: &str) -> Result<String> {
let model = &self.config.camera.vision_model;
if model == "none" {
return Ok("Vision model disabled. Image captured only.".to_string());
}
// Read image as base64
let image_bytes = tokio::fs::read(image_path).await?;
let base64_image = base64::Engine::encode(
&base64::engine::general_purpose::STANDARD,
&image_bytes,
);
// Call Ollama with image
let client = reqwest::Client::new();
let response = client
.post(format!("{}/api/generate", self.config.camera.ollama_url))
.json(&json!({
"model": model,
"prompt": prompt,
"images": [base64_image],
"stream": false
}))
.timeout(std::time::Duration::from_secs(60))
.send()
.await?;
if !response.status().is_success() {
anyhow::bail!("Ollama vision request failed: {}", response.status());
}
let result: Value = response.json().await?;
let description = result["response"]
.as_str()
.unwrap_or("No description generated")
.to_string();
Ok(description)
}
}
#[async_trait]
impl Tool for LookTool {
    /// Registered tool name.
    fn name(&self) -> &str {
        "look"
    }

    /// Summary shown to the agent when it selects a tool.
    fn description(&self) -> &str {
        "Capture an image from the robot's camera and optionally describe what is seen. \
         Use this to observe the environment, find objects, or identify people."
    }

    /// JSON schema for the tool's arguments.
    fn parameters_schema(&self) -> Value {
        json!({
            "type": "object",
            "properties": {
                "action": {
                    "type": "string",
                    "enum": ["capture", "describe", "find"],
                    "description": "capture=just take photo, describe=photo+AI description, find=look for specific thing"
                },
                "prompt": {
                    "type": "string",
                    "description": "For 'describe': what to focus on. For 'find': what to look for."
                }
            },
            "required": ["action"]
        })
    }

    /// Run the tool: every action begins with a capture; "describe" and
    /// "find" then consult the vision model.
    async fn execute(&self, args: Value) -> Result<ToolResult> {
        let Some(action) = args["action"].as_str() else {
            return Err(anyhow::anyhow!("Missing 'action' parameter"));
        };

        // A failed capture is reported as an unsuccessful ToolResult, not an Err.
        let image_path = match self.capture_image().await {
            Err(e) => {
                return Ok(ToolResult {
                    success: false,
                    output: String::new(),
                    error: Some(format!("Camera capture failed: {e}")),
                })
            }
            Ok(path) => path,
        };

        if action == "capture" {
            return Ok(ToolResult {
                success: true,
                output: format!("Image captured: {}", image_path.display()),
                error: None,
            });
        }

        if action == "describe" {
            let prompt = args["prompt"].as_str().unwrap_or(
                "Describe what you see in this image. Be specific about people, objects, and the environment.",
            );
            let result = match self.describe_image(&image_path, prompt).await {
                Ok(description) => ToolResult {
                    success: true,
                    output: format!("I see: {}", description),
                    error: None,
                },
                Err(e) => ToolResult {
                    success: false,
                    output: format!(
                        "Image captured at {} but description failed",
                        image_path.display()
                    ),
                    error: Some(e.to_string()),
                },
            };
            return Ok(result);
        }

        if action == "find" {
            let target = args["prompt"].as_str().ok_or_else(|| {
                anyhow::anyhow!("'find' action requires 'prompt' specifying what to find")
            })?;
            let prompt = format!(
                "Look at this image and determine: Is there a {} visible? \
                 If yes, describe where it is (left, right, center, near, far). \
                 If no, say 'Not found' and describe what you do see.",
                target
            );
            let result = match self.describe_image(&image_path, &prompt).await {
                Ok(description) => ToolResult {
                    success: true,
                    output: description,
                    error: None,
                },
                Err(e) => ToolResult {
                    success: false,
                    output: String::new(),
                    error: Some(e.to_string()),
                },
            };
            return Ok(result);
        }

        Ok(ToolResult {
            success: false,
            output: String::new(),
            error: Some(format!("Unknown action: {action}")),
        })
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // The tool must register under the fixed name "look".
    #[test]
    fn look_tool_name() {
        assert_eq!(LookTool::new(RobotConfig::default()).name(), "look");
    }

    // The parameter schema must expose an `action` property object.
    #[test]
    fn look_tool_schema() {
        let schema = LookTool::new(RobotConfig::default()).parameters_schema();
        assert!(schema["properties"]["action"].is_object());
    }
}