From 04bf94443fcbf71002a44351bc2968e41ada2728 Mon Sep 17 00:00:00 2001 From: Chummy Date: Tue, 17 Feb 2026 00:31:45 +0800 Subject: [PATCH] feat(browser): add optional computer-use sidecar backend (#335) --- README.md | 21 +- src/config/mod.rs | 10 +- src/config/schema.rs | 78 +++++- src/cost/tracker.rs | 2 +- src/onboard/wizard.rs | 8 +- src/tools/browser.rs | 517 +++++++++++++++++++++++++++++++++++- src/tools/git_operations.rs | 2 + src/tools/mod.rs | 11 +- 8 files changed, 625 insertions(+), 24 deletions(-) diff --git a/README.md b/README.md index ac9a8b2..97619ea 100644 --- a/README.md +++ b/README.md @@ -305,15 +305,34 @@ encrypt = true # API keys encrypted with local key file [browser] enabled = false # opt-in browser_open + browser tools allowed_domains = ["docs.rs"] # required when browser is enabled -backend = "agent_browser" # "agent_browser" (default), "rust_native", "auto" +backend = "agent_browser" # "agent_browser" (default), "rust_native", "computer_use", "auto" native_headless = true # applies when backend uses rust-native native_webdriver_url = "http://127.0.0.1:9515" # WebDriver endpoint (chromedriver/selenium) # native_chrome_path = "/usr/bin/chromium" # optional explicit browser binary for driver +[browser.computer_use] +endpoint = "http://127.0.0.1:8787/v1/actions" # computer-use sidecar HTTP endpoint +timeout_ms = 15000 # per-action timeout +allow_remote_endpoint = false # secure default: only private/localhost endpoint +window_allowlist = [] # optional window title/process allowlist hints +# api_key = "..." # optional bearer token for sidecar +# max_coordinate_x = 3840 # optional coordinate guardrail +# max_coordinate_y = 2160 # optional coordinate guardrail + # Rust-native backend build flag: # cargo build --release --features browser-native # Ensure a WebDriver server is running, e.g. 
chromedriver --port=9515 +# Computer-use sidecar contract (MVP) +# POST browser.computer_use.endpoint +# Request: { +# "action": "mouse_click", +# "params": {"x": 640, "y": 360, "button": "left"}, +# "policy": {"allowed_domains": [...], "window_allowlist": [...], "max_coordinate_x": 3840, "max_coordinate_y": 2160}, +# "metadata": {"session_name": "...", "source": "zeroclaw.browser", "version": "..."} +# } +# Response: {"success": true, "data": {...}} or {"success": false, "error": "..."} + [composio] enabled = false # opt-in: 1000+ OAuth apps via composio.dev # api_key = "cmp_..." # optional: stored encrypted when [secrets].encrypt = true diff --git a/src/config/mod.rs b/src/config/mod.rs index e53b597..3103f42 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -2,11 +2,11 @@ pub mod schema; #[allow(unused_imports)] pub use schema::{ - AuditConfig, AutonomyConfig, BrowserConfig, ChannelsConfig, ComposioConfig, Config, CostConfig, - DelegateAgentConfig, DiscordConfig, DockerRuntimeConfig, GatewayConfig, HeartbeatConfig, - HttpRequestConfig, IMessageConfig, IdentityConfig, LarkConfig, MatrixConfig, MemoryConfig, - ModelRouteConfig, ObservabilityConfig, ReliabilityConfig, ResourceLimitsConfig, RuntimeConfig, - SandboxBackend, SandboxConfig, SchedulerConfig, SecretsConfig, SecurityConfig, SlackConfig, + AuditConfig, AutonomyConfig, BrowserComputerUseConfig, BrowserConfig, ChannelsConfig, + ComposioConfig, Config, DelegateAgentConfig, DiscordConfig, DockerRuntimeConfig, GatewayConfig, + HeartbeatConfig, HttpRequestConfig, IMessageConfig, IdentityConfig, LarkConfig, MatrixConfig, + MemoryConfig, ModelRouteConfig, ObservabilityConfig, ReliabilityConfig, ResourceLimitsConfig, + RuntimeConfig, SandboxBackend, SandboxConfig, SecretsConfig, SecurityConfig, SlackConfig, TelegramConfig, TunnelConfig, WebhookConfig, }; diff --git a/src/config/schema.rs b/src/config/schema.rs index 8a66124..622e12d 100644 --- a/src/config/schema.rs +++ b/src/config/schema.rs @@ -419,6 
+419,53 @@ impl Default for SecretsConfig { // ── Browser (friendly-service browsing only) ─────────────────── +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BrowserComputerUseConfig { + /// Sidecar endpoint for computer-use actions (OS-level mouse/keyboard/screenshot) + #[serde(default = "default_browser_computer_use_endpoint")] + pub endpoint: String, + /// Optional bearer token for computer-use sidecar + #[serde(default)] + pub api_key: Option, + /// Per-action request timeout in milliseconds + #[serde(default = "default_browser_computer_use_timeout_ms")] + pub timeout_ms: u64, + /// Allow remote/public endpoint for computer-use sidecar (default: false) + #[serde(default)] + pub allow_remote_endpoint: bool, + /// Optional window title/process allowlist forwarded to sidecar policy + #[serde(default)] + pub window_allowlist: Vec, + /// Optional X-axis boundary for coordinate-based actions + #[serde(default)] + pub max_coordinate_x: Option, + /// Optional Y-axis boundary for coordinate-based actions + #[serde(default)] + pub max_coordinate_y: Option, +} + +fn default_browser_computer_use_endpoint() -> String { + "http://127.0.0.1:8787/v1/actions".into() +} + +fn default_browser_computer_use_timeout_ms() -> u64 { + 15_000 +} + +impl Default for BrowserComputerUseConfig { + fn default() -> Self { + Self { + endpoint: default_browser_computer_use_endpoint(), + api_key: None, + timeout_ms: default_browser_computer_use_timeout_ms(), + allow_remote_endpoint: false, + window_allowlist: Vec::new(), + max_coordinate_x: None, + max_coordinate_y: None, + } + } +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct BrowserConfig { /// Enable `browser_open` tool (opens URLs in Brave without scraping) @@ -430,7 +477,7 @@ pub struct BrowserConfig { /// Browser session name (for agent-browser automation) #[serde(default)] pub session_name: Option, - /// Browser automation backend: "agent_browser" | "rust_native" | "auto" + /// Browser automation backend: 
"agent_browser" | "rust_native" | "computer_use" | "auto" #[serde(default = "default_browser_backend")] pub backend: String, /// Headless mode for rust-native backend @@ -442,6 +489,9 @@ pub struct BrowserConfig { /// Optional Chrome/Chromium executable path for rust-native backend #[serde(default)] pub native_chrome_path: Option, + /// Computer-use sidecar configuration + #[serde(default)] + pub computer_use: BrowserComputerUseConfig, } fn default_browser_backend() -> String { @@ -462,6 +512,7 @@ impl Default for BrowserConfig { native_headless: default_true(), native_webdriver_url: default_browser_webdriver_url(), native_chrome_path: None, + computer_use: BrowserComputerUseConfig::default(), } } } @@ -2334,6 +2385,12 @@ default_temperature = 0.7 assert!(b.native_headless); assert_eq!(b.native_webdriver_url, "http://127.0.0.1:9515"); assert!(b.native_chrome_path.is_none()); + assert_eq!(b.computer_use.endpoint, "http://127.0.0.1:8787/v1/actions"); + assert_eq!(b.computer_use.timeout_ms, 15_000); + assert!(!b.computer_use.allow_remote_endpoint); + assert!(b.computer_use.window_allowlist.is_empty()); + assert!(b.computer_use.max_coordinate_x.is_none()); + assert!(b.computer_use.max_coordinate_y.is_none()); } #[test] @@ -2346,6 +2403,15 @@ default_temperature = 0.7 native_headless: false, native_webdriver_url: "http://localhost:4444".into(), native_chrome_path: Some("/usr/bin/chromium".into()), + computer_use: BrowserComputerUseConfig { + endpoint: "https://computer-use.example.com/v1/actions".into(), + api_key: Some("test-token".into()), + timeout_ms: 8_000, + allow_remote_endpoint: true, + window_allowlist: vec!["Chrome".into(), "Visual Studio Code".into()], + max_coordinate_x: Some(3840), + max_coordinate_y: Some(2160), + }, }; let toml_str = toml::to_string(&b).unwrap(); let parsed: BrowserConfig = toml::from_str(&toml_str).unwrap(); @@ -2359,6 +2425,16 @@ default_temperature = 0.7 parsed.native_chrome_path.as_deref(), Some("/usr/bin/chromium") ); + assert_eq!( + 
parsed.computer_use.endpoint, + "https://computer-use.example.com/v1/actions" + ); + assert_eq!(parsed.computer_use.api_key.as_deref(), Some("test-token")); + assert_eq!(parsed.computer_use.timeout_ms, 8_000); + assert!(parsed.computer_use.allow_remote_endpoint); + assert_eq!(parsed.computer_use.window_allowlist.len(), 2); + assert_eq!(parsed.computer_use.max_coordinate_x, Some(3840)); + assert_eq!(parsed.computer_use.max_coordinate_y, Some(2160)); } #[test] diff --git a/src/cost/tracker.rs b/src/cost/tracker.rs index 16b874f..697f381 100644 --- a/src/cost/tracker.rs +++ b/src/cost/tracker.rs @@ -1,5 +1,5 @@ use super::types::{BudgetCheck, CostRecord, CostSummary, ModelStats, TokenUsage, UsagePeriod}; -use crate::config::CostConfig; +use crate::config::schema::CostConfig; use anyhow::{anyhow, Context, Result}; use chrono::{Datelike, NaiveDate, Utc}; use std::collections::HashMap; diff --git a/src/onboard/wizard.rs b/src/onboard/wizard.rs index ddac80e..0bf285b 100644 --- a/src/onboard/wizard.rs +++ b/src/onboard/wizard.rs @@ -110,7 +110,7 @@ pub fn run_wizard() -> Result { autonomy: AutonomyConfig::default(), runtime: RuntimeConfig::default(), reliability: crate::config::ReliabilityConfig::default(), - scheduler: crate::config::SchedulerConfig::default(), + scheduler: crate::config::schema::SchedulerConfig::default(), model_routes: Vec::new(), heartbeat: HeartbeatConfig::default(), channels_config, @@ -122,7 +122,7 @@ pub fn run_wizard() -> Result { browser: BrowserConfig::default(), http_request: crate::config::HttpRequestConfig::default(), identity: crate::config::IdentityConfig::default(), - cost: crate::config::CostConfig::default(), + cost: crate::config::schema::CostConfig::default(), hardware: hardware_config, agents: std::collections::HashMap::new(), security: crate::config::SecurityConfig::default(), @@ -307,7 +307,7 @@ pub fn run_quick_setup( autonomy: AutonomyConfig::default(), runtime: RuntimeConfig::default(), reliability: 
crate::config::ReliabilityConfig::default(), - scheduler: crate::config::SchedulerConfig::default(), + scheduler: crate::config::schema::SchedulerConfig::default(), model_routes: Vec::new(), heartbeat: HeartbeatConfig::default(), channels_config: ChannelsConfig::default(), @@ -319,7 +319,7 @@ pub fn run_quick_setup( browser: BrowserConfig::default(), http_request: crate::config::HttpRequestConfig::default(), identity: crate::config::IdentityConfig::default(), - cost: crate::config::CostConfig::default(), + cost: crate::config::schema::CostConfig::default(), hardware: HardwareConfig::default(), agents: std::collections::HashMap::new(), security: crate::config::SecurityConfig::default(), diff --git a/src/tools/browser.rs b/src/tools/browser.rs index ec469d6..c6a0ba9 100644 --- a/src/tools/browser.rs +++ b/src/tools/browser.rs @@ -3,18 +3,48 @@ //! By default this uses Vercel's `agent-browser` CLI for automation. //! Optionally, a Rust-native backend can be enabled at build time via //! `--features browser-native` and selected through config. +//! Computer-use (OS-level) actions are supported via an optional sidecar endpoint. use super::traits::{Tool, ToolResult}; use crate::security::SecurityPolicy; +use anyhow::Context; use async_trait::async_trait; use serde::{Deserialize, Serialize}; use serde_json::{json, Value}; +use std::net::ToSocketAddrs; use std::process::Stdio; use std::sync::Arc; +use std::time::Duration; use tokio::process::Command; use tracing::debug; -/// Browser automation tool using agent-browser CLI +/// Computer-use sidecar settings. 
+#[derive(Debug, Clone)] +pub struct ComputerUseConfig { + pub endpoint: String, + pub api_key: Option, + pub timeout_ms: u64, + pub allow_remote_endpoint: bool, + pub window_allowlist: Vec, + pub max_coordinate_x: Option, + pub max_coordinate_y: Option, +} + +impl Default for ComputerUseConfig { + fn default() -> Self { + Self { + endpoint: "http://127.0.0.1:8787/v1/actions".into(), + api_key: None, + timeout_ms: 15_000, + allow_remote_endpoint: false, + window_allowlist: Vec::new(), + max_coordinate_x: None, + max_coordinate_y: None, + } + } +} + +/// Browser automation tool using pluggable backends. pub struct BrowserTool { security: Arc, allowed_domains: Vec, @@ -23,6 +53,7 @@ pub struct BrowserTool { native_headless: bool, native_webdriver_url: String, native_chrome_path: Option, + computer_use: ComputerUseConfig, #[cfg(feature = "browser-native")] native_state: tokio::sync::Mutex, } @@ -31,6 +62,7 @@ pub struct BrowserTool { enum BrowserBackendKind { AgentBrowser, RustNative, + ComputerUse, Auto, } @@ -38,6 +70,7 @@ enum BrowserBackendKind { enum ResolvedBackend { AgentBrowser, RustNative, + ComputerUse, } impl BrowserBackendKind { @@ -46,9 +79,10 @@ impl BrowserBackendKind { match key.as_str() { "agent_browser" | "agentbrowser" => Ok(Self::AgentBrowser), "rust_native" | "native" => Ok(Self::RustNative), + "computer_use" | "computeruse" => Ok(Self::ComputerUse), "auto" => Ok(Self::Auto), _ => anyhow::bail!( - "Unsupported browser backend '{raw}'. Use 'agent_browser', 'rust_native', or 'auto'" + "Unsupported browser backend '{raw}'. Use 'agent_browser', 'rust_native', 'computer_use', or 'auto'" ), } } @@ -57,6 +91,7 @@ impl BrowserBackendKind { match self { Self::AgentBrowser => "agent_browser", Self::RustNative => "rust_native", + Self::ComputerUse => "computer_use", Self::Auto => "auto", } } @@ -70,6 +105,17 @@ struct AgentBrowserResponse { error: Option, } +/// Response format from computer-use sidecar. 
+#[derive(Debug, Deserialize)] +struct ComputerUseResponse { + #[serde(default)] + success: Option, + #[serde(default)] + data: Option, + #[serde(default)] + error: Option, +} + /// Supported browser actions #[derive(Debug, Clone, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] @@ -151,9 +197,11 @@ impl BrowserTool { true, "http://127.0.0.1:9515".into(), None, + ComputerUseConfig::default(), ) } + #[allow(clippy::too_many_arguments)] pub fn new_with_backend( security: Arc, allowed_domains: Vec, @@ -162,6 +210,7 @@ impl BrowserTool { native_headless: bool, native_webdriver_url: String, native_chrome_path: Option, + computer_use: ComputerUseConfig, ) -> Self { Self { security, @@ -171,6 +220,7 @@ impl BrowserTool { native_headless, native_webdriver_url, native_chrome_path, + computer_use, #[cfg(feature = "browser-native")] native_state: tokio::sync::Mutex::new(native_backend::NativeBrowserState::default()), } @@ -216,6 +266,52 @@ impl BrowserTool { } } + fn computer_use_endpoint_url(&self) -> anyhow::Result { + if self.computer_use.timeout_ms == 0 { + anyhow::bail!("browser.computer_use.timeout_ms must be > 0"); + } + + let endpoint = self.computer_use.endpoint.trim(); + if endpoint.is_empty() { + anyhow::bail!("browser.computer_use.endpoint cannot be empty"); + } + + let parsed = reqwest::Url::parse(endpoint).map_err(|_| { + anyhow::anyhow!( + "Invalid browser.computer_use.endpoint: '{endpoint}'. Expected http(s) URL" + ) + })?; + + let scheme = parsed.scheme(); + if scheme != "http" && scheme != "https" { + anyhow::bail!("browser.computer_use.endpoint must use http:// or https://"); + } + + let host = parsed + .host_str() + .ok_or_else(|| anyhow::anyhow!("browser.computer_use.endpoint must include host"))?; + + let host_is_private = is_private_host(host); + if !self.computer_use.allow_remote_endpoint && !host_is_private { + anyhow::bail!( + "browser.computer_use.endpoint host '{host}' is public. 
Set browser.computer_use.allow_remote_endpoint=true to allow it" + ); + } + + if self.computer_use.allow_remote_endpoint && !host_is_private && scheme != "https" { + anyhow::bail!( + "browser.computer_use.endpoint must use https:// when allow_remote_endpoint=true and host is public" + ); + } + + Ok(parsed) + } + + fn computer_use_available(&self) -> anyhow::Result { + let endpoint = self.computer_use_endpoint_url()?; + Ok(endpoint_reachable(&endpoint, Duration::from_millis(500))) + } + async fn resolve_backend(&self) -> anyhow::Result { let configured = self.configured_backend()?; @@ -243,6 +339,14 @@ impl BrowserTool { } Ok(ResolvedBackend::RustNative) } + BrowserBackendKind::ComputerUse => { + if !self.computer_use_available()? { + anyhow::bail!( + "browser.backend='computer_use' but sidecar endpoint is unreachable. Check browser.computer_use.endpoint and sidecar status" + ); + } + Ok(ResolvedBackend::ComputerUse) + } BrowserBackendKind::Auto => { if Self::rust_native_compiled() && self.rust_native_available() { return Ok(ResolvedBackend::RustNative); @@ -251,14 +355,31 @@ impl BrowserTool { return Ok(ResolvedBackend::AgentBrowser); } + let computer_use_err = match self.computer_use_available() { + Ok(true) => return Ok(ResolvedBackend::ComputerUse), + Ok(false) => None, + Err(err) => Some(err.to_string()), + }; + if Self::rust_native_compiled() { + if let Some(err) = computer_use_err { + anyhow::bail!( + "browser.backend='auto' found no usable backend (agent-browser missing, rust-native unavailable, computer-use invalid: {err})" + ); + } anyhow::bail!( - "browser.backend='auto' found no usable backend (agent-browser missing, rust-native unavailable)" + "browser.backend='auto' found no usable backend (agent-browser missing, rust-native unavailable, computer-use sidecar unreachable)" ) } + if let Some(err) = computer_use_err { + anyhow::bail!( + "browser.backend='auto' needs agent-browser CLI, browser-native, or valid computer-use sidecar (error: {err})" + ); + } 
+ anyhow::bail!( - "browser.backend='auto' needs agent-browser CLI, or build with --features browser-native" + "browser.backend='auto' needs agent-browser CLI, browser-native, or computer-use sidecar" ) } } @@ -523,6 +644,179 @@ impl BrowserTool { } } + fn validate_coordinate(&self, key: &str, value: i64, max: Option) -> anyhow::Result<()> { + if value < 0 { + anyhow::bail!("'{key}' must be >= 0") + } + if let Some(limit) = max { + if limit < 0 { + anyhow::bail!("Configured coordinate limit for '{key}' must be >= 0") + } + if value > limit { + anyhow::bail!("'{key}'={value} exceeds configured limit {limit}") + } + } + Ok(()) + } + + fn read_required_i64( + &self, + params: &serde_json::Map, + key: &str, + ) -> anyhow::Result { + params + .get(key) + .and_then(Value::as_i64) + .ok_or_else(|| anyhow::anyhow!("Missing or invalid '{key}' parameter")) + } + + fn validate_computer_use_action( + &self, + action: &str, + params: &serde_json::Map, + ) -> anyhow::Result<()> { + match action { + "open" => { + let url = params + .get("url") + .and_then(Value::as_str) + .ok_or_else(|| anyhow::anyhow!("Missing 'url' for open action"))?; + self.validate_url(url)?; + } + "mouse_move" | "mouse_click" => { + let x = self.read_required_i64(params, "x")?; + let y = self.read_required_i64(params, "y")?; + self.validate_coordinate("x", x, self.computer_use.max_coordinate_x)?; + self.validate_coordinate("y", y, self.computer_use.max_coordinate_y)?; + } + "mouse_drag" => { + let from_x = self.read_required_i64(params, "from_x")?; + let from_y = self.read_required_i64(params, "from_y")?; + let to_x = self.read_required_i64(params, "to_x")?; + let to_y = self.read_required_i64(params, "to_y")?; + self.validate_coordinate("from_x", from_x, self.computer_use.max_coordinate_x)?; + self.validate_coordinate("to_x", to_x, self.computer_use.max_coordinate_x)?; + self.validate_coordinate("from_y", from_y, self.computer_use.max_coordinate_y)?; + self.validate_coordinate("to_y", to_y, 
self.computer_use.max_coordinate_y)?; + } + _ => {} + } + Ok(()) + } + + async fn execute_computer_use_action( + &self, + action: &str, + args: &Value, + ) -> anyhow::Result<ToolResult> { + let endpoint = self.computer_use_endpoint_url()?; + + let mut params = args + .as_object() + .cloned() + .ok_or_else(|| anyhow::anyhow!("browser args must be a JSON object"))?; + params.remove("action"); + + self.validate_computer_use_action(action, &params)?; + + let payload = json!({ + "action": action, + "params": params, + "policy": { + "allowed_domains": self.allowed_domains, + "window_allowlist": self.computer_use.window_allowlist, + "max_coordinate_x": self.computer_use.max_coordinate_x, + "max_coordinate_y": self.computer_use.max_coordinate_y, + }, + "metadata": { + "session_name": self.session_name, + "source": "zeroclaw.browser", + "version": env!("CARGO_PKG_VERSION"), + } + }); + + let client = reqwest::Client::new(); + let mut request = client + .post(endpoint) + .timeout(Duration::from_millis(self.computer_use.timeout_ms)) + .json(&payload); + + if let Some(api_key) = self.computer_use.api_key.as_deref() { + let token = api_key.trim(); + if !token.is_empty() { + request = request.bearer_auth(token); + } + } + + let response = request.send().await.with_context(|| { + format!( + "Failed to call computer-use sidecar at {}", + self.computer_use.endpoint + ) + })?; + + let status = response.status(); + let body = response + .text() + .await + .context("Failed to read computer-use sidecar response body")?; + + if let Ok(parsed) = serde_json::from_str::<ComputerUseResponse>(&body) { + if status.is_success() && parsed.success.unwrap_or(true) { + let output = parsed + .data + .map(|data| serde_json::to_string_pretty(&data).unwrap_or_default()) + .unwrap_or_else(|| { + serde_json::to_string_pretty(&json!({ + "backend": "computer_use", + "action": action, + "ok": true, + })) + .unwrap_or_default() + }); + + return Ok(ToolResult { + success: true, + output, + error: None, + }); + } + + let error =
parsed.error.or_else(|| { + if status.is_success() && parsed.success == Some(false) { + Some("computer-use sidecar returned success=false".to_string()) + } else { + Some(format!( + "computer-use sidecar request failed with status {status}" + )) + } + }); + + return Ok(ToolResult { + success: false, + output: String::new(), + error, + }); + } + + if status.is_success() { + return Ok(ToolResult { + success: true, + output: body, + error: None, + }); + } + + Ok(ToolResult { + success: false, + output: String::new(), + error: Some(format!( + "computer-use sidecar request failed with status {status}: {}", + body.trim() + )), + }) + } + async fn execute_action( &self, action: BrowserAction, @@ -531,6 +825,9 @@ impl BrowserTool { match backend { ResolvedBackend::AgentBrowser => self.execute_agent_browser_action(action).await, ResolvedBackend::RustNative => self.execute_rust_native_action(action).await, + ResolvedBackend::ComputerUse => anyhow::bail!( + "Internal error: computer_use backend must be handled before BrowserAction parsing" + ), } } @@ -564,10 +861,12 @@ impl Tool for BrowserTool { } fn description(&self) -> &str { - "Web browser automation with pluggable backends (agent-browser or rust-native). \ - Supports navigation, clicking, filling forms, screenshots, and page snapshots. \ - Use 'snapshot' to map interactive elements to refs (@e1, @e2), then use refs for \ - precise interaction. Enforces browser.allowed_domains for open actions." + concat!( + "Web/browser automation with pluggable backends (agent-browser, rust-native, computer_use). ", + "Supports DOM actions plus optional OS-level actions (mouse_move, mouse_click, mouse_drag, ", + "key_type, key_press, screen_capture) through a computer-use sidecar. Use 'snapshot' to map ", + "interactive elements to refs (@e1, @e2). Enforces browser.allowed_domains for open actions." 
+ ) } fn parameters_schema(&self) -> Value { @@ -578,8 +877,10 @@ impl Tool for BrowserTool { "type": "string", "enum": ["open", "snapshot", "click", "fill", "type", "get_text", "get_title", "get_url", "screenshot", "wait", "press", - "hover", "scroll", "is_visible", "close", "find"], - "description": "Browser action to perform" + "hover", "scroll", "is_visible", "close", "find", + "mouse_move", "mouse_click", "mouse_drag", "key_type", + "key_press", "screen_capture"], + "description": "Browser action to perform (OS-level actions require backend=computer_use)" }, "url": { "type": "string", @@ -601,6 +902,35 @@ impl Tool for BrowserTool { "type": "string", "description": "Key to press (Enter, Tab, Escape, etc.)" }, + "x": { + "type": "integer", + "description": "Screen X coordinate (computer_use: mouse_move/mouse_click)" + }, + "y": { + "type": "integer", + "description": "Screen Y coordinate (computer_use: mouse_move/mouse_click)" + }, + "from_x": { + "type": "integer", + "description": "Drag source X coordinate (computer_use: mouse_drag)" + }, + "from_y": { + "type": "integer", + "description": "Drag source Y coordinate (computer_use: mouse_drag)" + }, + "to_x": { + "type": "integer", + "description": "Drag target X coordinate (computer_use: mouse_drag)" + }, + "to_y": { + "type": "integer", + "description": "Drag target Y coordinate (computer_use: mouse_drag)" + }, + "button": { + "type": "string", + "enum": ["left", "right", "middle"], + "description": "Mouse button for computer_use mouse_click" + }, "direction": { "type": "string", "enum": ["up", "down", "left", "right"], @@ -688,6 +1018,18 @@ impl Tool for BrowserTool { .and_then(|v| v.as_str()) .ok_or_else(|| anyhow::anyhow!("Missing 'action' parameter"))?; + if !is_supported_browser_action(action_str) { + return Ok(ToolResult { + success: false, + output: String::new(), + error: Some(format!("Unknown action: {action_str}")), + }); + } + + if backend == ResolvedBackend::ComputerUse { + return 
self.execute_computer_use_action(action_str, &args).await; + } + let action = match action_str { "open" => { let url = args @@ -839,7 +1181,14 @@ impl Tool for BrowserTool { return Ok(ToolResult { success: false, output: String::new(), - error: Some(format!("Unknown action: {action_str}")), + error: Some(format!( + "Action '{action_str}' is unavailable for backend '{}'", + match backend { + ResolvedBackend::AgentBrowser => "agent_browser", + ResolvedBackend::RustNative => "rust_native", + ResolvedBackend::ComputerUse => "computer_use", + } + )), }); } }; @@ -1523,6 +1872,34 @@ mod native_backend { // ── Helper functions ───────────────────────────────────────────── +fn is_supported_browser_action(action: &str) -> bool { + matches!( + action, + "open" + | "snapshot" + | "click" + | "fill" + | "type" + | "get_text" + | "get_title" + | "get_url" + | "screenshot" + | "wait" + | "press" + | "hover" + | "scroll" + | "is_visible" + | "close" + | "find" + | "mouse_move" + | "mouse_click" + | "mouse_drag" + | "key_type" + | "key_press" + | "screen_capture" + ) +} + fn normalize_domains(domains: Vec) -> Vec { domains .into_iter() @@ -1531,6 +1908,30 @@ fn normalize_domains(domains: Vec) -> Vec { .collect() } +fn endpoint_reachable(endpoint: &reqwest::Url, timeout: Duration) -> bool { + let host = match endpoint.host_str() { + Some(host) if !host.is_empty() => host, + _ => return false, + }; + + let port = match endpoint.port_or_known_default() { + Some(port) => port, + None => return false, + }; + + let mut addrs = match (host, port).to_socket_addrs() { + Ok(addrs) => addrs, + Err(_) => return false, + }; + + let addr = match addrs.next() { + Some(addr) => addr, + None => return false, + }; + + std::net::TcpStream::connect_timeout(&addr, timeout).is_ok() +} + fn extract_host(url_str: &str) -> anyhow::Result { // Simple host extraction without url crate let url = url_str.trim(); @@ -1746,6 +2147,10 @@ mod tests { BrowserBackendKind::parse("rust-native").unwrap(), 
BrowserBackendKind::RustNative ); + assert_eq!( + BrowserBackendKind::parse("computer_use").unwrap(), + BrowserBackendKind::ComputerUse + ); assert_eq!( BrowserBackendKind::parse("auto").unwrap(), BrowserBackendKind::Auto @@ -1778,10 +2183,100 @@ mod tests { true, "http://127.0.0.1:9515".into(), None, + ComputerUseConfig::default(), ); assert_eq!(tool.configured_backend().unwrap(), BrowserBackendKind::Auto); } + #[test] + fn browser_tool_accepts_computer_use_backend_config() { + let security = Arc::new(SecurityPolicy::default()); + let tool = BrowserTool::new_with_backend( + security, + vec!["example.com".into()], + None, + "computer_use".into(), + true, + "http://127.0.0.1:9515".into(), + None, + ComputerUseConfig::default(), + ); + assert_eq!( + tool.configured_backend().unwrap(), + BrowserBackendKind::ComputerUse + ); + } + + #[test] + fn computer_use_endpoint_rejects_public_http_by_default() { + let security = Arc::new(SecurityPolicy::default()); + let tool = BrowserTool::new_with_backend( + security, + vec!["example.com".into()], + None, + "computer_use".into(), + true, + "http://127.0.0.1:9515".into(), + None, + ComputerUseConfig { + endpoint: "http://computer-use.example.com/v1/actions".into(), + ..ComputerUseConfig::default() + }, + ); + + assert!(tool.computer_use_endpoint_url().is_err()); + } + + #[test] + fn computer_use_endpoint_requires_https_for_public_remote() { + let security = Arc::new(SecurityPolicy::default()); + let tool = BrowserTool::new_with_backend( + security, + vec!["example.com".into()], + None, + "computer_use".into(), + true, + "http://127.0.0.1:9515".into(), + None, + ComputerUseConfig { + endpoint: "https://computer-use.example.com/v1/actions".into(), + allow_remote_endpoint: true, + ..ComputerUseConfig::default() + }, + ); + + assert!(tool.computer_use_endpoint_url().is_ok()); + } + + #[test] + fn computer_use_coordinate_validation_applies_limits() { + let security = Arc::new(SecurityPolicy::default()); + let tool = 
BrowserTool::new_with_backend( + security, + vec!["example.com".into()], + None, + "computer_use".into(), + true, + "http://127.0.0.1:9515".into(), + None, + ComputerUseConfig { + max_coordinate_x: Some(100), + max_coordinate_y: Some(100), + ..ComputerUseConfig::default() + }, + ); + + assert!(tool + .validate_coordinate("x", 50, tool.computer_use.max_coordinate_x) + .is_ok()); + assert!(tool + .validate_coordinate("x", 101, tool.computer_use.max_coordinate_x) + .is_err()); + assert!(tool + .validate_coordinate("y", -1, tool.computer_use.max_coordinate_y) + .is_err()); + } + #[test] fn browser_tool_name() { let security = Arc::new(SecurityPolicy::default()); diff --git a/src/tools/git_operations.rs b/src/tools/git_operations.rs index e20113a..d01243a 100644 --- a/src/tools/git_operations.rs +++ b/src/tools/git_operations.rs @@ -2,6 +2,8 @@ use super::traits::{Tool, ToolResult}; use crate::security::{AutonomyLevel, SecurityPolicy}; use async_trait::async_trait; use serde_json::json; +#[cfg(test)] +use std::path::Path; use std::sync::Arc; /// Git operations tool for structured repository management. 
diff --git a/src/tools/mod.rs b/src/tools/mod.rs index 964ba5b..d239c5e 100644 --- a/src/tools/mod.rs +++ b/src/tools/mod.rs @@ -15,7 +15,7 @@ pub mod screenshot; pub mod shell; pub mod traits; -pub use browser::BrowserTool; +pub use browser::{BrowserTool, ComputerUseConfig}; pub use browser_open::BrowserOpenTool; pub use composio::ComposioTool; pub use delegate::DelegateTool; @@ -131,6 +131,15 @@ pub fn all_tools_with_runtime( browser_config.native_headless, browser_config.native_webdriver_url.clone(), browser_config.native_chrome_path.clone(), + ComputerUseConfig { + endpoint: browser_config.computer_use.endpoint.clone(), + api_key: browser_config.computer_use.api_key.clone(), + timeout_ms: browser_config.computer_use.timeout_ms, + allow_remote_endpoint: browser_config.computer_use.allow_remote_endpoint, + window_allowlist: browser_config.computer_use.window_allowlist.clone(), + max_coordinate_x: browser_config.computer_use.max_coordinate_x, + max_coordinate_y: browser_config.computer_use.max_coordinate_y, + }, ))); }