feat: add multimodal image marker support with Ollama vision
This commit is contained in:
parent
63aacb09ff
commit
dcd0bf641d
21 changed files with 1152 additions and 78 deletions
|
|
@ -51,6 +51,22 @@ Notes:
|
||||||
- Model cache previews come from `zeroclaw models refresh --provider <ID>`.
|
- Model cache previews come from `zeroclaw models refresh --provider <ID>`.
|
||||||
- These are runtime chat commands, not CLI subcommands.
|
- These are runtime chat commands, not CLI subcommands.
|
||||||
|
|
||||||
|
## Inbound Image Marker Protocol
|
||||||
|
|
||||||
|
ZeroClaw supports multimodal input through inline message markers:
|
||||||
|
|
||||||
|
- Syntax: ``[IMAGE:<source>]``
|
||||||
|
- `<source>` can be:
|
||||||
|
- Local file path
|
||||||
|
- Data URI (`data:image/...;base64,...`)
|
||||||
|
- Remote URL only when `[multimodal].allow_remote_fetch = true`
|
||||||
|
|
||||||
|
Operational notes:
|
||||||
|
|
||||||
|
- Marker parsing applies to user-role messages before provider calls.
|
||||||
|
- Provider capability is enforced at runtime: if the selected provider does not support vision, the request fails with a structured capability error (`capability=vision`).
|
||||||
|
- Linq webhook `media` parts with `image/*` MIME type are automatically converted to this marker format.
|
||||||
|
|
||||||
## Channel Matrix
|
## Channel Matrix
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
@ -349,4 +365,3 @@ If a specific channel task crashes or exits, the channel supervisor in `channels
|
||||||
- `Channel message worker crashed:`
|
- `Channel message worker crashed:`
|
||||||
|
|
||||||
These messages indicate automatic restart behavior is active, and you should inspect preceding logs for root cause.
|
These messages indicate automatic restart behavior is active, and you should inspect preceding logs for root cause.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -62,6 +62,24 @@ Notes:
|
||||||
- `reasoning_enabled = true` explicitly requests reasoning for supported providers (`think: true` on `ollama`).
|
- `reasoning_enabled = true` explicitly requests reasoning for supported providers (`think: true` on `ollama`).
|
||||||
- Unset keeps provider defaults.
|
- Unset keeps provider defaults.
|
||||||
|
|
||||||
|
## `[multimodal]`
|
||||||
|
|
||||||
|
| Key | Default | Purpose |
|
||||||
|
|---|---|---|
|
||||||
|
| `max_images` | `4` | Maximum image markers accepted per request |
|
||||||
|
| `max_image_size_mb` | `5` | Per-image size limit before base64 encoding |
|
||||||
|
| `allow_remote_fetch` | `false` | Allow fetching `http(s)` image URLs from markers |
|
||||||
|
|
||||||
|
Notes:
|
||||||
|
|
||||||
|
- Runtime accepts image markers in user messages with syntax: ``[IMAGE:<source>]``.
|
||||||
|
- Supported sources:
|
||||||
|
- Local file path (for example ``[IMAGE:/tmp/screenshot.png]``)
|
||||||
|
- Data URI (for example ``[IMAGE:data:image/png;base64,...]``)
|
||||||
|
- Remote URL only when `allow_remote_fetch = true`
|
||||||
|
- Allowed MIME types: `image/png`, `image/jpeg`, `image/webp`, `image/gif`, `image/bmp`.
|
||||||
|
- When the active provider does not support vision, requests fail with a structured capability error (`capability=vision`) instead of silently dropping images.
|
||||||
|
|
||||||
## `[gateway]`
|
## `[gateway]`
|
||||||
|
|
||||||
| Key | Default | Purpose |
|
| Key | Default | Purpose |
|
||||||
|
|
|
||||||
|
|
@ -56,6 +56,13 @@ credential is not reused for fallback providers.
|
||||||
| `lmstudio` | `lm-studio` | Yes | (optional; local by default) |
|
| `lmstudio` | `lm-studio` | Yes | (optional; local by default) |
|
||||||
| `nvidia` | `nvidia-nim`, `build.nvidia.com` | No | `NVIDIA_API_KEY` |
|
| `nvidia` | `nvidia-nim`, `build.nvidia.com` | No | `NVIDIA_API_KEY` |
|
||||||
|
|
||||||
|
### Ollama Vision Notes
|
||||||
|
|
||||||
|
- Provider ID: `ollama`
|
||||||
|
- Vision input is supported through user message image markers: ``[IMAGE:<source>]``.
|
||||||
|
- After multimodal normalization, ZeroClaw sends image payloads through Ollama's native `messages[].images` field.
|
||||||
|
- If a non-vision provider is selected, ZeroClaw returns a structured capability error instead of silently ignoring images.
|
||||||
|
|
||||||
### Bedrock Notes
|
### Bedrock Notes
|
||||||
|
|
||||||
- Provider ID: `bedrock` (alias: `aws-bedrock`)
|
- Provider ID: `bedrock` (alias: `aws-bedrock`)
|
||||||
|
|
|
||||||
|
|
@ -1,8 +1,11 @@
|
||||||
use crate::approval::{ApprovalManager, ApprovalRequest, ApprovalResponse};
|
use crate::approval::{ApprovalManager, ApprovalRequest, ApprovalResponse};
|
||||||
use crate::config::Config;
|
use crate::config::Config;
|
||||||
use crate::memory::{self, Memory, MemoryCategory};
|
use crate::memory::{self, Memory, MemoryCategory};
|
||||||
|
use crate::multimodal;
|
||||||
use crate::observability::{self, Observer, ObserverEvent};
|
use crate::observability::{self, Observer, ObserverEvent};
|
||||||
use crate::providers::{self, ChatMessage, ChatRequest, Provider, ToolCall};
|
use crate::providers::{
|
||||||
|
self, ChatMessage, ChatRequest, Provider, ProviderCapabilityError, ToolCall,
|
||||||
|
};
|
||||||
use crate::runtime;
|
use crate::runtime;
|
||||||
use crate::security::SecurityPolicy;
|
use crate::security::SecurityPolicy;
|
||||||
use crate::tools::{self, Tool};
|
use crate::tools::{self, Tool};
|
||||||
|
|
@ -826,6 +829,7 @@ pub(crate) async fn agent_turn(
|
||||||
model: &str,
|
model: &str,
|
||||||
temperature: f64,
|
temperature: f64,
|
||||||
silent: bool,
|
silent: bool,
|
||||||
|
multimodal_config: &crate::config::MultimodalConfig,
|
||||||
max_tool_iterations: usize,
|
max_tool_iterations: usize,
|
||||||
) -> Result<String> {
|
) -> Result<String> {
|
||||||
run_tool_call_loop(
|
run_tool_call_loop(
|
||||||
|
|
@ -839,6 +843,7 @@ pub(crate) async fn agent_turn(
|
||||||
silent,
|
silent,
|
||||||
None,
|
None,
|
||||||
"channel",
|
"channel",
|
||||||
|
multimodal_config,
|
||||||
max_tool_iterations,
|
max_tool_iterations,
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
|
|
@ -859,6 +864,7 @@ pub(crate) async fn run_tool_call_loop(
|
||||||
silent: bool,
|
silent: bool,
|
||||||
approval: Option<&ApprovalManager>,
|
approval: Option<&ApprovalManager>,
|
||||||
channel_name: &str,
|
channel_name: &str,
|
||||||
|
multimodal_config: &crate::config::MultimodalConfig,
|
||||||
max_tool_iterations: usize,
|
max_tool_iterations: usize,
|
||||||
on_delta: Option<tokio::sync::mpsc::Sender<String>>,
|
on_delta: Option<tokio::sync::mpsc::Sender<String>>,
|
||||||
) -> Result<String> {
|
) -> Result<String> {
|
||||||
|
|
@ -873,6 +879,21 @@ pub(crate) async fn run_tool_call_loop(
|
||||||
let use_native_tools = provider.supports_native_tools() && !tool_specs.is_empty();
|
let use_native_tools = provider.supports_native_tools() && !tool_specs.is_empty();
|
||||||
|
|
||||||
for _iteration in 0..max_iterations {
|
for _iteration in 0..max_iterations {
|
||||||
|
let image_marker_count = multimodal::count_image_markers(history);
|
||||||
|
if image_marker_count > 0 && !provider.supports_vision() {
|
||||||
|
return Err(ProviderCapabilityError {
|
||||||
|
provider: provider_name.to_string(),
|
||||||
|
capability: "vision".to_string(),
|
||||||
|
message: format!(
|
||||||
|
"received {image_marker_count} image marker(s), but this provider does not support vision input"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
.into());
|
||||||
|
}
|
||||||
|
|
||||||
|
let prepared_messages =
|
||||||
|
multimodal::prepare_messages_for_provider(history, multimodal_config).await?;
|
||||||
|
|
||||||
observer.record_event(&ObserverEvent::LlmRequest {
|
observer.record_event(&ObserverEvent::LlmRequest {
|
||||||
provider: provider_name.to_string(),
|
provider: provider_name.to_string(),
|
||||||
model: model.to_string(),
|
model: model.to_string(),
|
||||||
|
|
@ -893,7 +914,7 @@ pub(crate) async fn run_tool_call_loop(
|
||||||
match provider
|
match provider
|
||||||
.chat(
|
.chat(
|
||||||
ChatRequest {
|
ChatRequest {
|
||||||
messages: history,
|
messages: &prepared_messages.messages,
|
||||||
tools: request_tools,
|
tools: request_tools,
|
||||||
},
|
},
|
||||||
model,
|
model,
|
||||||
|
|
@ -1404,6 +1425,7 @@ pub async fn run(
|
||||||
false,
|
false,
|
||||||
Some(&approval_manager),
|
Some(&approval_manager),
|
||||||
"cli",
|
"cli",
|
||||||
|
&config.multimodal,
|
||||||
config.agent.max_tool_iterations,
|
config.agent.max_tool_iterations,
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
|
|
@ -1530,6 +1552,7 @@ pub async fn run(
|
||||||
false,
|
false,
|
||||||
Some(&approval_manager),
|
Some(&approval_manager),
|
||||||
"cli",
|
"cli",
|
||||||
|
&config.multimodal,
|
||||||
config.agent.max_tool_iterations,
|
config.agent.max_tool_iterations,
|
||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
|
|
@ -1757,6 +1780,7 @@ pub async fn process_message(config: Config, message: &str) -> Result<String> {
|
||||||
&model_name,
|
&model_name,
|
||||||
config.default_temperature,
|
config.default_temperature,
|
||||||
true,
|
true,
|
||||||
|
&config.multimodal,
|
||||||
config.agent.max_tool_iterations,
|
config.agent.max_tool_iterations,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
|
|
@ -1765,6 +1789,10 @@ pub async fn process_message(config: Config, message: &str) -> Result<String> {
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
use async_trait::async_trait;
|
||||||
|
use base64::{engine::general_purpose::STANDARD, Engine as _};
|
||||||
|
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_scrub_credentials() {
|
fn test_scrub_credentials() {
|
||||||
|
|
@ -1785,8 +1813,191 @@ mod tests {
|
||||||
assert!(scrubbed.contains("public"));
|
assert!(scrubbed.contains("public"));
|
||||||
}
|
}
|
||||||
use crate::memory::{Memory, MemoryCategory, SqliteMemory};
|
use crate::memory::{Memory, MemoryCategory, SqliteMemory};
|
||||||
|
use crate::observability::NoopObserver;
|
||||||
|
use crate::providers::traits::ProviderCapabilities;
|
||||||
|
use crate::providers::ChatResponse;
|
||||||
use tempfile::TempDir;
|
use tempfile::TempDir;
|
||||||
|
|
||||||
|
struct NonVisionProvider {
|
||||||
|
calls: Arc<AtomicUsize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl Provider for NonVisionProvider {
|
||||||
|
async fn chat_with_system(
|
||||||
|
&self,
|
||||||
|
_system_prompt: Option<&str>,
|
||||||
|
_message: &str,
|
||||||
|
_model: &str,
|
||||||
|
_temperature: f64,
|
||||||
|
) -> anyhow::Result<String> {
|
||||||
|
self.calls.fetch_add(1, Ordering::SeqCst);
|
||||||
|
Ok("ok".to_string())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct VisionProvider {
|
||||||
|
calls: Arc<AtomicUsize>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_trait]
|
||||||
|
impl Provider for VisionProvider {
|
||||||
|
fn capabilities(&self) -> ProviderCapabilities {
|
||||||
|
ProviderCapabilities {
|
||||||
|
native_tool_calling: false,
|
||||||
|
vision: true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn chat_with_system(
|
||||||
|
&self,
|
||||||
|
_system_prompt: Option<&str>,
|
||||||
|
_message: &str,
|
||||||
|
_model: &str,
|
||||||
|
_temperature: f64,
|
||||||
|
) -> anyhow::Result<String> {
|
||||||
|
self.calls.fetch_add(1, Ordering::SeqCst);
|
||||||
|
Ok("ok".to_string())
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn chat(
|
||||||
|
&self,
|
||||||
|
request: ChatRequest<'_>,
|
||||||
|
_model: &str,
|
||||||
|
_temperature: f64,
|
||||||
|
) -> anyhow::Result<ChatResponse> {
|
||||||
|
self.calls.fetch_add(1, Ordering::SeqCst);
|
||||||
|
let marker_count = crate::multimodal::count_image_markers(request.messages);
|
||||||
|
if marker_count == 0 {
|
||||||
|
anyhow::bail!("expected image markers in request messages");
|
||||||
|
}
|
||||||
|
|
||||||
|
if request.tools.is_some() {
|
||||||
|
anyhow::bail!("no tools should be attached for this test");
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(ChatResponse {
|
||||||
|
text: Some("vision-ok".to_string()),
|
||||||
|
tool_calls: Vec::new(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn run_tool_call_loop_returns_structured_error_for_non_vision_provider() {
|
||||||
|
let calls = Arc::new(AtomicUsize::new(0));
|
||||||
|
let provider = NonVisionProvider {
|
||||||
|
calls: Arc::clone(&calls),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut history = vec![ChatMessage::user(
|
||||||
|
"please inspect [IMAGE:data:image/png;base64,iVBORw0KGgo=]".to_string(),
|
||||||
|
)];
|
||||||
|
let tools_registry: Vec<Box<dyn Tool>> = Vec::new();
|
||||||
|
let observer = NoopObserver;
|
||||||
|
|
||||||
|
let err = run_tool_call_loop(
|
||||||
|
&provider,
|
||||||
|
&mut history,
|
||||||
|
&tools_registry,
|
||||||
|
&observer,
|
||||||
|
"mock-provider",
|
||||||
|
"mock-model",
|
||||||
|
0.0,
|
||||||
|
true,
|
||||||
|
None,
|
||||||
|
"cli",
|
||||||
|
&crate::config::MultimodalConfig::default(),
|
||||||
|
3,
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.expect_err("provider without vision support should fail");
|
||||||
|
|
||||||
|
assert!(err.to_string().contains("provider_capability_error"));
|
||||||
|
assert!(err.to_string().contains("capability=vision"));
|
||||||
|
assert_eq!(calls.load(Ordering::SeqCst), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn run_tool_call_loop_rejects_oversized_image_payload() {
|
||||||
|
let calls = Arc::new(AtomicUsize::new(0));
|
||||||
|
let provider = VisionProvider {
|
||||||
|
calls: Arc::clone(&calls),
|
||||||
|
};
|
||||||
|
|
||||||
|
let oversized_payload = STANDARD.encode(vec![0_u8; (1024 * 1024) + 1]);
|
||||||
|
let mut history = vec![ChatMessage::user(format!(
|
||||||
|
"[IMAGE:data:image/png;base64,{oversized_payload}]"
|
||||||
|
))];
|
||||||
|
|
||||||
|
let tools_registry: Vec<Box<dyn Tool>> = Vec::new();
|
||||||
|
let observer = NoopObserver;
|
||||||
|
let multimodal = crate::config::MultimodalConfig {
|
||||||
|
max_images: 4,
|
||||||
|
max_image_size_mb: 1,
|
||||||
|
allow_remote_fetch: false,
|
||||||
|
};
|
||||||
|
|
||||||
|
let err = run_tool_call_loop(
|
||||||
|
&provider,
|
||||||
|
&mut history,
|
||||||
|
&tools_registry,
|
||||||
|
&observer,
|
||||||
|
"mock-provider",
|
||||||
|
"mock-model",
|
||||||
|
0.0,
|
||||||
|
true,
|
||||||
|
None,
|
||||||
|
"cli",
|
||||||
|
&multimodal,
|
||||||
|
3,
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.expect_err("oversized payload must fail");
|
||||||
|
|
||||||
|
assert!(err
|
||||||
|
.to_string()
|
||||||
|
.contains("multimodal image size limit exceeded"));
|
||||||
|
assert_eq!(calls.load(Ordering::SeqCst), 0);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn run_tool_call_loop_accepts_valid_multimodal_request_flow() {
|
||||||
|
let calls = Arc::new(AtomicUsize::new(0));
|
||||||
|
let provider = VisionProvider {
|
||||||
|
calls: Arc::clone(&calls),
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut history = vec![ChatMessage::user(
|
||||||
|
"Analyze this [IMAGE:data:image/png;base64,iVBORw0KGgo=]".to_string(),
|
||||||
|
)];
|
||||||
|
let tools_registry: Vec<Box<dyn Tool>> = Vec::new();
|
||||||
|
let observer = NoopObserver;
|
||||||
|
|
||||||
|
let result = run_tool_call_loop(
|
||||||
|
&provider,
|
||||||
|
&mut history,
|
||||||
|
&tools_registry,
|
||||||
|
&observer,
|
||||||
|
"mock-provider",
|
||||||
|
"mock-model",
|
||||||
|
0.0,
|
||||||
|
true,
|
||||||
|
None,
|
||||||
|
"cli",
|
||||||
|
&crate::config::MultimodalConfig::default(),
|
||||||
|
3,
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.expect("valid multimodal payload should pass");
|
||||||
|
|
||||||
|
assert_eq!(result, "vision-ok");
|
||||||
|
assert_eq!(calls.load(Ordering::SeqCst), 1);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn parse_tool_calls_extracts_single_call() {
|
fn parse_tool_calls_extracts_single_call() {
|
||||||
let response = r#"Let me check that.
|
let response = r#"Let me check that.
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,28 @@ impl LinqChannel {
|
||||||
&self.from_phone
|
&self.from_phone
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn media_part_to_image_marker(part: &serde_json::Value) -> Option<String> {
|
||||||
|
let source = part
|
||||||
|
.get("url")
|
||||||
|
.or_else(|| part.get("value"))
|
||||||
|
.and_then(|value| value.as_str())
|
||||||
|
.map(str::trim)
|
||||||
|
.filter(|value| !value.is_empty())?;
|
||||||
|
|
||||||
|
let mime_type = part
|
||||||
|
.get("mime_type")
|
||||||
|
.and_then(|value| value.as_str())
|
||||||
|
.map(str::trim)
|
||||||
|
.unwrap_or_default()
|
||||||
|
.to_ascii_lowercase();
|
||||||
|
|
||||||
|
if !mime_type.starts_with("image/") {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
|
||||||
|
Some(format!("[IMAGE:{source}]"))
|
||||||
|
}
|
||||||
|
|
||||||
/// Parse an incoming webhook payload from Linq and extract messages.
|
/// Parse an incoming webhook payload from Linq and extract messages.
|
||||||
///
|
///
|
||||||
/// Linq webhook envelope:
|
/// Linq webhook envelope:
|
||||||
|
|
@ -124,25 +146,36 @@ impl LinqChannel {
|
||||||
return messages;
|
return messages;
|
||||||
};
|
};
|
||||||
|
|
||||||
let text_parts: Vec<&str> = parts
|
let content_parts: Vec<String> = parts
|
||||||
.iter()
|
.iter()
|
||||||
.filter_map(|part| {
|
.filter_map(|part| {
|
||||||
let part_type = part.get("type").and_then(|t| t.as_str())?;
|
let part_type = part.get("type").and_then(|t| t.as_str())?;
|
||||||
if part_type == "text" {
|
match part_type {
|
||||||
part.get("value").and_then(|v| v.as_str())
|
"text" => part
|
||||||
|
.get("value")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.map(ToString::to_string),
|
||||||
|
"media" | "image" => {
|
||||||
|
if let Some(marker) = Self::media_part_to_image_marker(part) {
|
||||||
|
Some(marker)
|
||||||
} else {
|
} else {
|
||||||
// Skip media parts for now
|
tracing::debug!("Linq: skipping unsupported {part_type} part");
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
_ => {
|
||||||
tracing::debug!("Linq: skipping {part_type} part");
|
tracing::debug!("Linq: skipping {part_type} part");
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
}
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
if text_parts.is_empty() {
|
if content_parts.is_empty() {
|
||||||
return messages;
|
return messages;
|
||||||
}
|
}
|
||||||
|
|
||||||
let content = text_parts.join("\n");
|
let content = content_parts.join("\n").trim().to_string();
|
||||||
|
|
||||||
if content.is_empty() {
|
if content.is_empty() {
|
||||||
return messages;
|
return messages;
|
||||||
|
|
@ -496,7 +529,7 @@ mod tests {
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn linq_parse_media_only_skipped() {
|
fn linq_parse_media_only_translated_to_image_marker() {
|
||||||
let ch = LinqChannel::new("tok".into(), "+15551234567".into(), vec!["*".into()]);
|
let ch = LinqChannel::new("tok".into(), "+15551234567".into(), vec!["*".into()]);
|
||||||
let payload = serde_json::json!({
|
let payload = serde_json::json!({
|
||||||
"event_type": "message.received",
|
"event_type": "message.received",
|
||||||
|
|
@ -516,7 +549,32 @@ mod tests {
|
||||||
});
|
});
|
||||||
|
|
||||||
let msgs = ch.parse_webhook_payload(&payload);
|
let msgs = ch.parse_webhook_payload(&payload);
|
||||||
assert!(msgs.is_empty(), "Media-only messages should be skipped");
|
assert_eq!(msgs.len(), 1);
|
||||||
|
assert_eq!(msgs[0].content, "[IMAGE:https://example.com/image.jpg]");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn linq_parse_media_non_image_still_skipped() {
|
||||||
|
let ch = LinqChannel::new("tok".into(), "+15551234567".into(), vec!["*".into()]);
|
||||||
|
let payload = serde_json::json!({
|
||||||
|
"event_type": "message.received",
|
||||||
|
"data": {
|
||||||
|
"chat_id": "chat-789",
|
||||||
|
"from": "+1234567890",
|
||||||
|
"is_from_me": false,
|
||||||
|
"message": {
|
||||||
|
"id": "msg-abc",
|
||||||
|
"parts": [{
|
||||||
|
"type": "media",
|
||||||
|
"url": "https://example.com/sound.mp3",
|
||||||
|
"mime_type": "audio/mpeg"
|
||||||
|
}]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
let msgs = ch.parse_webhook_payload(&payload);
|
||||||
|
assert!(msgs.is_empty(), "Non-image media should still be skipped");
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
|
||||||
|
|
@ -139,6 +139,7 @@ struct ChannelRuntimeContext {
|
||||||
provider_runtime_options: providers::ProviderRuntimeOptions,
|
provider_runtime_options: providers::ProviderRuntimeOptions,
|
||||||
workspace_dir: Arc<PathBuf>,
|
workspace_dir: Arc<PathBuf>,
|
||||||
message_timeout_secs: u64,
|
message_timeout_secs: u64,
|
||||||
|
multimodal: crate::config::MultimodalConfig,
|
||||||
}
|
}
|
||||||
|
|
||||||
fn conversation_memory_key(msg: &traits::ChannelMessage) -> String {
|
fn conversation_memory_key(msg: &traits::ChannelMessage) -> String {
|
||||||
|
|
@ -810,6 +811,7 @@ async fn process_channel_message(ctx: Arc<ChannelRuntimeContext>, msg: traits::C
|
||||||
true,
|
true,
|
||||||
None,
|
None,
|
||||||
msg.channel.as_str(),
|
msg.channel.as_str(),
|
||||||
|
&ctx.multimodal,
|
||||||
ctx.max_tool_iterations,
|
ctx.max_tool_iterations,
|
||||||
delta_tx,
|
delta_tx,
|
||||||
),
|
),
|
||||||
|
|
@ -2062,6 +2064,7 @@ pub async fn start_channels(config: Config) -> Result<()> {
|
||||||
provider_runtime_options,
|
provider_runtime_options,
|
||||||
workspace_dir: Arc::new(config.workspace_dir.clone()),
|
workspace_dir: Arc::new(config.workspace_dir.clone()),
|
||||||
message_timeout_secs,
|
message_timeout_secs,
|
||||||
|
multimodal: config.multimodal.clone(),
|
||||||
});
|
});
|
||||||
|
|
||||||
run_message_dispatch_loop(rx, runtime_ctx, max_in_flight_messages).await;
|
run_message_dispatch_loop(rx, runtime_ctx, max_in_flight_messages).await;
|
||||||
|
|
@ -2559,6 +2562,7 @@ mod tests {
|
||||||
provider_runtime_options: providers::ProviderRuntimeOptions::default(),
|
provider_runtime_options: providers::ProviderRuntimeOptions::default(),
|
||||||
workspace_dir: Arc::new(std::env::temp_dir()),
|
workspace_dir: Arc::new(std::env::temp_dir()),
|
||||||
message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
|
message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
|
||||||
|
multimodal: crate::config::MultimodalConfig::default(),
|
||||||
});
|
});
|
||||||
|
|
||||||
process_channel_message(
|
process_channel_message(
|
||||||
|
|
@ -2613,6 +2617,7 @@ mod tests {
|
||||||
provider_runtime_options: providers::ProviderRuntimeOptions::default(),
|
provider_runtime_options: providers::ProviderRuntimeOptions::default(),
|
||||||
workspace_dir: Arc::new(std::env::temp_dir()),
|
workspace_dir: Arc::new(std::env::temp_dir()),
|
||||||
message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
|
message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
|
||||||
|
multimodal: crate::config::MultimodalConfig::default(),
|
||||||
});
|
});
|
||||||
|
|
||||||
process_channel_message(
|
process_channel_message(
|
||||||
|
|
@ -2676,6 +2681,7 @@ mod tests {
|
||||||
provider_runtime_options: providers::ProviderRuntimeOptions::default(),
|
provider_runtime_options: providers::ProviderRuntimeOptions::default(),
|
||||||
workspace_dir: Arc::new(std::env::temp_dir()),
|
workspace_dir: Arc::new(std::env::temp_dir()),
|
||||||
message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
|
message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
|
||||||
|
multimodal: crate::config::MultimodalConfig::default(),
|
||||||
});
|
});
|
||||||
|
|
||||||
process_channel_message(
|
process_channel_message(
|
||||||
|
|
@ -2760,6 +2766,7 @@ mod tests {
|
||||||
provider_runtime_options: providers::ProviderRuntimeOptions::default(),
|
provider_runtime_options: providers::ProviderRuntimeOptions::default(),
|
||||||
workspace_dir: Arc::new(std::env::temp_dir()),
|
workspace_dir: Arc::new(std::env::temp_dir()),
|
||||||
message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
|
message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
|
||||||
|
multimodal: crate::config::MultimodalConfig::default(),
|
||||||
});
|
});
|
||||||
|
|
||||||
process_channel_message(
|
process_channel_message(
|
||||||
|
|
@ -2820,6 +2827,7 @@ mod tests {
|
||||||
provider_runtime_options: providers::ProviderRuntimeOptions::default(),
|
provider_runtime_options: providers::ProviderRuntimeOptions::default(),
|
||||||
workspace_dir: Arc::new(std::env::temp_dir()),
|
workspace_dir: Arc::new(std::env::temp_dir()),
|
||||||
message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
|
message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
|
||||||
|
multimodal: crate::config::MultimodalConfig::default(),
|
||||||
});
|
});
|
||||||
|
|
||||||
process_channel_message(
|
process_channel_message(
|
||||||
|
|
@ -2875,6 +2883,7 @@ mod tests {
|
||||||
provider_runtime_options: providers::ProviderRuntimeOptions::default(),
|
provider_runtime_options: providers::ProviderRuntimeOptions::default(),
|
||||||
workspace_dir: Arc::new(std::env::temp_dir()),
|
workspace_dir: Arc::new(std::env::temp_dir()),
|
||||||
message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
|
message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
|
||||||
|
multimodal: crate::config::MultimodalConfig::default(),
|
||||||
});
|
});
|
||||||
|
|
||||||
process_channel_message(
|
process_channel_message(
|
||||||
|
|
@ -2981,6 +2990,7 @@ mod tests {
|
||||||
provider_runtime_options: providers::ProviderRuntimeOptions::default(),
|
provider_runtime_options: providers::ProviderRuntimeOptions::default(),
|
||||||
workspace_dir: Arc::new(std::env::temp_dir()),
|
workspace_dir: Arc::new(std::env::temp_dir()),
|
||||||
message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
|
message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
|
||||||
|
multimodal: crate::config::MultimodalConfig::default(),
|
||||||
});
|
});
|
||||||
|
|
||||||
let (tx, rx) = tokio::sync::mpsc::channel::<traits::ChannelMessage>(4);
|
let (tx, rx) = tokio::sync::mpsc::channel::<traits::ChannelMessage>(4);
|
||||||
|
|
@ -3054,6 +3064,7 @@ mod tests {
|
||||||
provider_runtime_options: providers::ProviderRuntimeOptions::default(),
|
provider_runtime_options: providers::ProviderRuntimeOptions::default(),
|
||||||
workspace_dir: Arc::new(std::env::temp_dir()),
|
workspace_dir: Arc::new(std::env::temp_dir()),
|
||||||
message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
|
message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
|
||||||
|
multimodal: crate::config::MultimodalConfig::default(),
|
||||||
});
|
});
|
||||||
|
|
||||||
process_channel_message(
|
process_channel_message(
|
||||||
|
|
@ -3451,6 +3462,7 @@ mod tests {
|
||||||
provider_runtime_options: providers::ProviderRuntimeOptions::default(),
|
provider_runtime_options: providers::ProviderRuntimeOptions::default(),
|
||||||
workspace_dir: Arc::new(std::env::temp_dir()),
|
workspace_dir: Arc::new(std::env::temp_dir()),
|
||||||
message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
|
message_timeout_secs: CHANNEL_MESSAGE_TIMEOUT_SECS,
|
||||||
|
multimodal: crate::config::MultimodalConfig::default(),
|
||||||
});
|
});
|
||||||
|
|
||||||
process_channel_message(
|
process_channel_message(
|
||||||
|
|
|
||||||
|
|
@ -9,11 +9,11 @@ pub use schema::{
|
||||||
DelegateAgentConfig, DiscordConfig, DockerRuntimeConfig, EmbeddingRouteConfig, GatewayConfig,
|
DelegateAgentConfig, DiscordConfig, DockerRuntimeConfig, EmbeddingRouteConfig, GatewayConfig,
|
||||||
HardwareConfig, HardwareTransport, HeartbeatConfig, HttpRequestConfig, IMessageConfig,
|
HardwareConfig, HardwareTransport, HeartbeatConfig, HttpRequestConfig, IMessageConfig,
|
||||||
IdentityConfig, LarkConfig, MatrixConfig, MemoryConfig, ModelRouteConfig, ObservabilityConfig,
|
IdentityConfig, LarkConfig, MatrixConfig, MemoryConfig, ModelRouteConfig, ObservabilityConfig,
|
||||||
PeripheralBoardConfig, PeripheralsConfig, ProxyConfig, ProxyScope, QueryClassificationConfig,
|
MultimodalConfig, ObservabilityConfig, PeripheralBoardConfig, PeripheralsConfig, ProxyConfig,
|
||||||
ReliabilityConfig, ResourceLimitsConfig, RuntimeConfig, SandboxBackend, SandboxConfig,
|
ProxyScope, QueryClassificationConfig, ReliabilityConfig, ResourceLimitsConfig, RuntimeConfig,
|
||||||
SchedulerConfig, SecretsConfig, SecurityConfig, SlackConfig, StorageConfig,
|
SandboxBackend, SandboxConfig, SchedulerConfig, SecretsConfig, SecurityConfig, SlackConfig,
|
||||||
StorageProviderConfig, StorageProviderSection, StreamMode, TelegramConfig, TunnelConfig,
|
StorageConfig, StorageProviderConfig, StorageProviderSection, StreamMode, TelegramConfig,
|
||||||
WebSearchConfig, WebhookConfig,
|
TunnelConfig, WebSearchConfig, WebhookConfig,
|
||||||
};
|
};
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|
|
||||||
|
|
@ -124,6 +124,9 @@ pub struct Config {
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub http_request: HttpRequestConfig,
|
pub http_request: HttpRequestConfig,
|
||||||
|
|
||||||
|
#[serde(default)]
|
||||||
|
pub multimodal: MultimodalConfig,
|
||||||
|
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub web_search: WebSearchConfig,
|
pub web_search: WebSearchConfig,
|
||||||
|
|
||||||
|
|
@ -284,6 +287,46 @@ impl Default for AgentConfig {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
|
||||||
|
pub struct MultimodalConfig {
|
||||||
|
/// Maximum number of image attachments accepted per request.
|
||||||
|
#[serde(default = "default_multimodal_max_images")]
|
||||||
|
pub max_images: usize,
|
||||||
|
/// Maximum image payload size in MiB before base64 encoding.
|
||||||
|
#[serde(default = "default_multimodal_max_image_size_mb")]
|
||||||
|
pub max_image_size_mb: usize,
|
||||||
|
/// Allow fetching remote image URLs (http/https). Disabled by default.
|
||||||
|
#[serde(default)]
|
||||||
|
pub allow_remote_fetch: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn default_multimodal_max_images() -> usize {
|
||||||
|
4
|
||||||
|
}
|
||||||
|
|
||||||
|
fn default_multimodal_max_image_size_mb() -> usize {
|
||||||
|
5
|
||||||
|
}
|
||||||
|
|
||||||
|
impl MultimodalConfig {
|
||||||
|
/// Clamp configured values to safe runtime bounds.
|
||||||
|
pub fn effective_limits(&self) -> (usize, usize) {
|
||||||
|
let max_images = self.max_images.clamp(1, 16);
|
||||||
|
let max_image_size_mb = self.max_image_size_mb.clamp(1, 20);
|
||||||
|
(max_images, max_image_size_mb)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for MultimodalConfig {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self {
|
||||||
|
max_images: default_multimodal_max_images(),
|
||||||
|
max_image_size_mb: default_multimodal_max_image_size_mb(),
|
||||||
|
allow_remote_fetch: false,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// ── Identity (AIEOS / OpenClaw format) ──────────────────────────
|
// ── Identity (AIEOS / OpenClaw format) ──────────────────────────
|
||||||
|
|
||||||
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
|
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
|
||||||
|
|
@ -2534,6 +2577,7 @@ impl Default for Config {
|
||||||
secrets: SecretsConfig::default(),
|
secrets: SecretsConfig::default(),
|
||||||
browser: BrowserConfig::default(),
|
browser: BrowserConfig::default(),
|
||||||
http_request: HttpRequestConfig::default(),
|
http_request: HttpRequestConfig::default(),
|
||||||
|
multimodal: MultimodalConfig::default(),
|
||||||
web_search: WebSearchConfig::default(),
|
web_search: WebSearchConfig::default(),
|
||||||
proxy: ProxyConfig::default(),
|
proxy: ProxyConfig::default(),
|
||||||
identity: IdentityConfig::default(),
|
identity: IdentityConfig::default(),
|
||||||
|
|
@ -3502,6 +3546,7 @@ default_temperature = 0.7
|
||||||
secrets: SecretsConfig::default(),
|
secrets: SecretsConfig::default(),
|
||||||
browser: BrowserConfig::default(),
|
browser: BrowserConfig::default(),
|
||||||
http_request: HttpRequestConfig::default(),
|
http_request: HttpRequestConfig::default(),
|
||||||
|
multimodal: MultimodalConfig::default(),
|
||||||
web_search: WebSearchConfig::default(),
|
web_search: WebSearchConfig::default(),
|
||||||
proxy: ProxyConfig::default(),
|
proxy: ProxyConfig::default(),
|
||||||
agent: AgentConfig::default(),
|
agent: AgentConfig::default(),
|
||||||
|
|
@ -3656,6 +3701,7 @@ tool_dispatcher = "xml"
|
||||||
secrets: SecretsConfig::default(),
|
secrets: SecretsConfig::default(),
|
||||||
browser: BrowserConfig::default(),
|
browser: BrowserConfig::default(),
|
||||||
http_request: HttpRequestConfig::default(),
|
http_request: HttpRequestConfig::default(),
|
||||||
|
multimodal: MultimodalConfig::default(),
|
||||||
web_search: WebSearchConfig::default(),
|
web_search: WebSearchConfig::default(),
|
||||||
proxy: ProxyConfig::default(),
|
proxy: ProxyConfig::default(),
|
||||||
agent: AgentConfig::default(),
|
agent: AgentConfig::default(),
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@
|
||||||
use crate::channels::{Channel, LinqChannel, SendMessage, WhatsAppChannel};
|
use crate::channels::{Channel, LinqChannel, SendMessage, WhatsAppChannel};
|
||||||
use crate::config::Config;
|
use crate::config::Config;
|
||||||
use crate::memory::{self, Memory, MemoryCategory};
|
use crate::memory::{self, Memory, MemoryCategory};
|
||||||
use crate::providers::{self, Provider};
|
use crate::providers::{self, ChatMessage, Provider, ProviderCapabilityError};
|
||||||
use crate::runtime;
|
use crate::runtime;
|
||||||
use crate::security::pairing::{constant_time_eq, is_public_bind, PairingGuard};
|
use crate::security::pairing::{constant_time_eq, is_public_bind, PairingGuard};
|
||||||
use crate::security::SecurityPolicy;
|
use crate::security::SecurityPolicy;
|
||||||
|
|
@ -666,6 +666,52 @@ async fn persist_pairing_tokens(config: Arc<Mutex<Config>>, pairing: &PairingGua
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn run_gateway_chat_with_multimodal(
|
||||||
|
state: &AppState,
|
||||||
|
provider_label: &str,
|
||||||
|
message: &str,
|
||||||
|
) -> anyhow::Result<String> {
|
||||||
|
let user_messages = vec![ChatMessage::user(message)];
|
||||||
|
let image_marker_count = crate::multimodal::count_image_markers(&user_messages);
|
||||||
|
if image_marker_count > 0 && !state.provider.supports_vision() {
|
||||||
|
return Err(ProviderCapabilityError {
|
||||||
|
provider: provider_label.to_string(),
|
||||||
|
capability: "vision".to_string(),
|
||||||
|
message: format!(
|
||||||
|
"received {image_marker_count} image marker(s), but this provider does not support vision input"
|
||||||
|
),
|
||||||
|
}
|
||||||
|
.into());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Keep webhook/gateway prompts aligned with channel behavior by injecting
|
||||||
|
// workspace-aware system context before model invocation.
|
||||||
|
let system_prompt = {
|
||||||
|
let config_guard = state.config.lock();
|
||||||
|
crate::channels::build_system_prompt(
|
||||||
|
&config_guard.workspace_dir,
|
||||||
|
&state.model,
|
||||||
|
&[], // tools - empty for simple chat
|
||||||
|
&[], // skills
|
||||||
|
Some(&config_guard.identity),
|
||||||
|
None, // bootstrap_max_chars - use default
|
||||||
|
)
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut messages = Vec::with_capacity(1 + user_messages.len());
|
||||||
|
messages.push(ChatMessage::system(system_prompt));
|
||||||
|
messages.extend(user_messages);
|
||||||
|
|
||||||
|
let multimodal_config = state.config.lock().multimodal.clone();
|
||||||
|
let prepared =
|
||||||
|
crate::multimodal::prepare_messages_for_provider(&messages, &multimodal_config).await?;
|
||||||
|
|
||||||
|
state
|
||||||
|
.provider
|
||||||
|
.chat_with_history(&prepared.messages, &state.model, state.temperature)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
/// Webhook request body
|
/// Webhook request body
|
||||||
#[derive(serde::Deserialize)]
|
#[derive(serde::Deserialize)]
|
||||||
pub struct WebhookBody {
|
pub struct WebhookBody {
|
||||||
|
|
@ -787,30 +833,7 @@ async fn handle_webhook(
|
||||||
messages_count: 1,
|
messages_count: 1,
|
||||||
});
|
});
|
||||||
|
|
||||||
// Build system prompt with workspace context (IDENTITY.md, AGENTS.md, etc.)
|
match run_gateway_chat_with_multimodal(&state, &provider_label, message).await {
|
||||||
let system_prompt = {
|
|
||||||
let config_guard = state.config.lock();
|
|
||||||
crate::channels::build_system_prompt(
|
|
||||||
&config_guard.workspace_dir,
|
|
||||||
&state.model,
|
|
||||||
&[], // tools - empty for simple chat
|
|
||||||
&[], // skills
|
|
||||||
Some(&config_guard.identity),
|
|
||||||
None, // bootstrap_max_chars - use default
|
|
||||||
)
|
|
||||||
};
|
|
||||||
|
|
||||||
// Call the LLM with separate system prompt
|
|
||||||
match state
|
|
||||||
.provider
|
|
||||||
.chat_with_system(
|
|
||||||
Some(&system_prompt),
|
|
||||||
message,
|
|
||||||
&state.model,
|
|
||||||
state.temperature,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
Ok(response) => {
|
Ok(response) => {
|
||||||
let duration = started_at.elapsed();
|
let duration = started_at.elapsed();
|
||||||
state
|
state
|
||||||
|
|
@ -994,6 +1017,12 @@ async fn handle_whatsapp_message(
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process each message
|
// Process each message
|
||||||
|
let provider_label = state
|
||||||
|
.config
|
||||||
|
.lock()
|
||||||
|
.default_provider
|
||||||
|
.clone()
|
||||||
|
.unwrap_or_else(|| "unknown".to_string());
|
||||||
for msg in &messages {
|
for msg in &messages {
|
||||||
tracing::info!(
|
tracing::info!(
|
||||||
"WhatsApp message from {}: {}",
|
"WhatsApp message from {}: {}",
|
||||||
|
|
@ -1010,30 +1039,7 @@ async fn handle_whatsapp_message(
|
||||||
.await;
|
.await;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Build system prompt with workspace context (IDENTITY.md, AGENTS.md, etc.)
|
match run_gateway_chat_with_multimodal(&state, &provider_label, &msg.content).await {
|
||||||
let system_prompt = {
|
|
||||||
let config_guard = state.config.lock();
|
|
||||||
crate::channels::build_system_prompt(
|
|
||||||
&config_guard.workspace_dir,
|
|
||||||
&state.model,
|
|
||||||
&[], // tools - empty for simple chat
|
|
||||||
&[], // skills
|
|
||||||
Some(&config_guard.identity),
|
|
||||||
None, // bootstrap_max_chars - use default
|
|
||||||
)
|
|
||||||
};
|
|
||||||
|
|
||||||
// Call the LLM with separate system prompt
|
|
||||||
match state
|
|
||||||
.provider
|
|
||||||
.chat_with_system(
|
|
||||||
Some(&system_prompt),
|
|
||||||
&msg.content,
|
|
||||||
&state.model,
|
|
||||||
state.temperature,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
Ok(response) => {
|
Ok(response) => {
|
||||||
// Send reply via WhatsApp
|
// Send reply via WhatsApp
|
||||||
if let Err(e) = wa
|
if let Err(e) = wa
|
||||||
|
|
@ -1124,6 +1130,12 @@ async fn handle_linq_webhook(
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process each message
|
// Process each message
|
||||||
|
let provider_label = state
|
||||||
|
.config
|
||||||
|
.lock()
|
||||||
|
.default_provider
|
||||||
|
.clone()
|
||||||
|
.unwrap_or_else(|| "unknown".to_string());
|
||||||
for msg in &messages {
|
for msg in &messages {
|
||||||
tracing::info!(
|
tracing::info!(
|
||||||
"Linq message from {}: {}",
|
"Linq message from {}: {}",
|
||||||
|
|
@ -1141,11 +1153,7 @@ async fn handle_linq_webhook(
|
||||||
}
|
}
|
||||||
|
|
||||||
// Call the LLM
|
// Call the LLM
|
||||||
match state
|
match run_gateway_chat_with_multimodal(&state, &provider_label, &msg.content).await {
|
||||||
.provider
|
|
||||||
.simple_chat(&msg.content, &state.model, state.temperature)
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
Ok(response) => {
|
Ok(response) => {
|
||||||
// Send reply via Linq
|
// Send reply via Linq
|
||||||
if let Err(e) = linq
|
if let Err(e) = linq
|
||||||
|
|
|
||||||
|
|
@ -55,6 +55,7 @@ pub mod identity;
|
||||||
pub mod integrations;
|
pub mod integrations;
|
||||||
pub mod memory;
|
pub mod memory;
|
||||||
pub mod migration;
|
pub mod migration;
|
||||||
|
pub mod multimodal;
|
||||||
pub mod observability;
|
pub mod observability;
|
||||||
pub mod onboard;
|
pub mod onboard;
|
||||||
pub mod peripherals;
|
pub mod peripherals;
|
||||||
|
|
|
||||||
|
|
@ -58,6 +58,7 @@ mod identity;
|
||||||
mod integrations;
|
mod integrations;
|
||||||
mod memory;
|
mod memory;
|
||||||
mod migration;
|
mod migration;
|
||||||
|
mod multimodal;
|
||||||
mod observability;
|
mod observability;
|
||||||
mod onboard;
|
mod onboard;
|
||||||
mod peripherals;
|
mod peripherals;
|
||||||
|
|
|
||||||
568
src/multimodal.rs
Normal file
568
src/multimodal.rs
Normal file
|
|
@ -0,0 +1,568 @@
|
||||||
|
use crate::config::{build_runtime_proxy_client_with_timeouts, MultimodalConfig};
|
||||||
|
use crate::providers::ChatMessage;
|
||||||
|
use base64::{engine::general_purpose::STANDARD, Engine as _};
|
||||||
|
use reqwest::Client;
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
const IMAGE_MARKER_PREFIX: &str = "[IMAGE:";
|
||||||
|
const ALLOWED_IMAGE_MIME_TYPES: &[&str] = &[
|
||||||
|
"image/png",
|
||||||
|
"image/jpeg",
|
||||||
|
"image/webp",
|
||||||
|
"image/gif",
|
||||||
|
"image/bmp",
|
||||||
|
];
|
||||||
|
|
||||||
|
#[derive(Debug, Clone)]
|
||||||
|
pub struct PreparedMessages {
|
||||||
|
pub messages: Vec<ChatMessage>,
|
||||||
|
pub contains_images: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, thiserror::Error)]
|
||||||
|
pub enum MultimodalError {
|
||||||
|
#[error("multimodal image limit exceeded: max_images={max_images}, found={found}")]
|
||||||
|
TooManyImages { max_images: usize, found: usize },
|
||||||
|
|
||||||
|
#[error("multimodal image size limit exceeded for '{input}': {size_bytes} bytes > {max_bytes} bytes")]
|
||||||
|
ImageTooLarge {
|
||||||
|
input: String,
|
||||||
|
size_bytes: usize,
|
||||||
|
max_bytes: usize,
|
||||||
|
},
|
||||||
|
|
||||||
|
#[error("multimodal image MIME type is not allowed for '{input}': {mime}")]
|
||||||
|
UnsupportedMime { input: String, mime: String },
|
||||||
|
|
||||||
|
#[error("multimodal remote image fetch is disabled for '{input}'")]
|
||||||
|
RemoteFetchDisabled { input: String },
|
||||||
|
|
||||||
|
#[error("multimodal image source not found or unreadable: '{input}'")]
|
||||||
|
ImageSourceNotFound { input: String },
|
||||||
|
|
||||||
|
#[error("invalid multimodal image marker '{input}': {reason}")]
|
||||||
|
InvalidMarker { input: String, reason: String },
|
||||||
|
|
||||||
|
#[error("failed to download remote image '{input}': {reason}")]
|
||||||
|
RemoteFetchFailed { input: String, reason: String },
|
||||||
|
|
||||||
|
#[error("failed to read local image '{input}': {reason}")]
|
||||||
|
LocalReadFailed { input: String, reason: String },
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn parse_image_markers(content: &str) -> (String, Vec<String>) {
|
||||||
|
let mut refs = Vec::new();
|
||||||
|
let mut cleaned = String::with_capacity(content.len());
|
||||||
|
let mut cursor = 0usize;
|
||||||
|
|
||||||
|
while let Some(rel_start) = content[cursor..].find(IMAGE_MARKER_PREFIX) {
|
||||||
|
let start = cursor + rel_start;
|
||||||
|
cleaned.push_str(&content[cursor..start]);
|
||||||
|
|
||||||
|
let marker_start = start + IMAGE_MARKER_PREFIX.len();
|
||||||
|
let Some(rel_end) = content[marker_start..].find(']') else {
|
||||||
|
cleaned.push_str(&content[start..]);
|
||||||
|
cursor = content.len();
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
|
||||||
|
let end = marker_start + rel_end;
|
||||||
|
let candidate = content[marker_start..end].trim();
|
||||||
|
|
||||||
|
if candidate.is_empty() {
|
||||||
|
cleaned.push_str(&content[start..=end]);
|
||||||
|
} else {
|
||||||
|
refs.push(candidate.to_string());
|
||||||
|
}
|
||||||
|
|
||||||
|
cursor = end + 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if cursor < content.len() {
|
||||||
|
cleaned.push_str(&content[cursor..]);
|
||||||
|
}
|
||||||
|
|
||||||
|
(cleaned.trim().to_string(), refs)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn count_image_markers(messages: &[ChatMessage]) -> usize {
|
||||||
|
messages
|
||||||
|
.iter()
|
||||||
|
.filter(|m| m.role == "user")
|
||||||
|
.map(|m| parse_image_markers(&m.content).1.len())
|
||||||
|
.sum()
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn contains_image_markers(messages: &[ChatMessage]) -> bool {
|
||||||
|
count_image_markers(messages) > 0
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn extract_ollama_image_payload(image_ref: &str) -> Option<String> {
|
||||||
|
if image_ref.starts_with("data:") {
|
||||||
|
let comma_idx = image_ref.find(',')?;
|
||||||
|
let (_, payload) = image_ref.split_at(comma_idx + 1);
|
||||||
|
let payload = payload.trim();
|
||||||
|
if payload.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(payload.to_string())
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
Some(image_ref.trim().to_string()).filter(|value| !value.is_empty())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn prepare_messages_for_provider(
|
||||||
|
messages: &[ChatMessage],
|
||||||
|
config: &MultimodalConfig,
|
||||||
|
) -> anyhow::Result<PreparedMessages> {
|
||||||
|
let (max_images, max_image_size_mb) = config.effective_limits();
|
||||||
|
let max_bytes = max_image_size_mb.saturating_mul(1024 * 1024);
|
||||||
|
|
||||||
|
let found_images = count_image_markers(messages);
|
||||||
|
if found_images > max_images {
|
||||||
|
return Err(MultimodalError::TooManyImages {
|
||||||
|
max_images,
|
||||||
|
found: found_images,
|
||||||
|
}
|
||||||
|
.into());
|
||||||
|
}
|
||||||
|
|
||||||
|
if found_images == 0 {
|
||||||
|
return Ok(PreparedMessages {
|
||||||
|
messages: messages.to_vec(),
|
||||||
|
contains_images: false,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
let remote_client = build_runtime_proxy_client_with_timeouts("provider.ollama", 30, 10);
|
||||||
|
|
||||||
|
let mut normalized_messages = Vec::with_capacity(messages.len());
|
||||||
|
for message in messages {
|
||||||
|
if message.role != "user" {
|
||||||
|
normalized_messages.push(message.clone());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let (cleaned_text, refs) = parse_image_markers(&message.content);
|
||||||
|
if refs.is_empty() {
|
||||||
|
normalized_messages.push(message.clone());
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut normalized_refs = Vec::with_capacity(refs.len());
|
||||||
|
for reference in refs {
|
||||||
|
let data_uri =
|
||||||
|
normalize_image_reference(&reference, config, max_bytes, &remote_client).await?;
|
||||||
|
normalized_refs.push(data_uri);
|
||||||
|
}
|
||||||
|
|
||||||
|
let content = compose_multimodal_message(&cleaned_text, &normalized_refs);
|
||||||
|
normalized_messages.push(ChatMessage {
|
||||||
|
role: message.role.clone(),
|
||||||
|
content,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(PreparedMessages {
|
||||||
|
messages: normalized_messages,
|
||||||
|
contains_images: true,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn compose_multimodal_message(text: &str, data_uris: &[String]) -> String {
|
||||||
|
let mut content = String::new();
|
||||||
|
let trimmed = text.trim();
|
||||||
|
|
||||||
|
if !trimmed.is_empty() {
|
||||||
|
content.push_str(trimmed);
|
||||||
|
content.push_str("\n\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
for (index, data_uri) in data_uris.iter().enumerate() {
|
||||||
|
if index > 0 {
|
||||||
|
content.push('\n');
|
||||||
|
}
|
||||||
|
content.push_str(IMAGE_MARKER_PREFIX);
|
||||||
|
content.push_str(data_uri);
|
||||||
|
content.push(']');
|
||||||
|
}
|
||||||
|
|
||||||
|
content
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn normalize_image_reference(
|
||||||
|
source: &str,
|
||||||
|
config: &MultimodalConfig,
|
||||||
|
max_bytes: usize,
|
||||||
|
remote_client: &Client,
|
||||||
|
) -> anyhow::Result<String> {
|
||||||
|
if source.starts_with("data:") {
|
||||||
|
return normalize_data_uri(source, max_bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
if source.starts_with("http://") || source.starts_with("https://") {
|
||||||
|
if !config.allow_remote_fetch {
|
||||||
|
return Err(MultimodalError::RemoteFetchDisabled {
|
||||||
|
input: source.to_string(),
|
||||||
|
}
|
||||||
|
.into());
|
||||||
|
}
|
||||||
|
|
||||||
|
return normalize_remote_image(source, max_bytes, remote_client).await;
|
||||||
|
}
|
||||||
|
|
||||||
|
normalize_local_image(source, max_bytes).await
|
||||||
|
}
|
||||||
|
|
||||||
|
fn normalize_data_uri(source: &str, max_bytes: usize) -> anyhow::Result<String> {
|
||||||
|
let Some(comma_idx) = source.find(',') else {
|
||||||
|
return Err(MultimodalError::InvalidMarker {
|
||||||
|
input: source.to_string(),
|
||||||
|
reason: "expected data URI payload".to_string(),
|
||||||
|
}
|
||||||
|
.into());
|
||||||
|
};
|
||||||
|
|
||||||
|
let header = &source[..comma_idx];
|
||||||
|
let payload = source[comma_idx + 1..].trim();
|
||||||
|
|
||||||
|
if !header.contains(";base64") {
|
||||||
|
return Err(MultimodalError::InvalidMarker {
|
||||||
|
input: source.to_string(),
|
||||||
|
reason: "only base64 data URIs are supported".to_string(),
|
||||||
|
}
|
||||||
|
.into());
|
||||||
|
}
|
||||||
|
|
||||||
|
let mime = header
|
||||||
|
.trim_start_matches("data:")
|
||||||
|
.split(';')
|
||||||
|
.next()
|
||||||
|
.unwrap_or_default()
|
||||||
|
.trim()
|
||||||
|
.to_ascii_lowercase();
|
||||||
|
|
||||||
|
validate_mime(source, &mime)?;
|
||||||
|
|
||||||
|
let decoded = STANDARD
|
||||||
|
.decode(payload)
|
||||||
|
.map_err(|error| MultimodalError::InvalidMarker {
|
||||||
|
input: source.to_string(),
|
||||||
|
reason: format!("invalid base64 payload: {error}"),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
validate_size(source, decoded.len(), max_bytes)?;
|
||||||
|
|
||||||
|
Ok(format!("data:{mime};base64,{}", STANDARD.encode(decoded)))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn normalize_remote_image(
|
||||||
|
source: &str,
|
||||||
|
max_bytes: usize,
|
||||||
|
remote_client: &Client,
|
||||||
|
) -> anyhow::Result<String> {
|
||||||
|
let response = remote_client.get(source).send().await.map_err(|error| {
|
||||||
|
MultimodalError::RemoteFetchFailed {
|
||||||
|
input: source.to_string(),
|
||||||
|
reason: error.to_string(),
|
||||||
|
}
|
||||||
|
})?;
|
||||||
|
|
||||||
|
let status = response.status();
|
||||||
|
if !status.is_success() {
|
||||||
|
return Err(MultimodalError::RemoteFetchFailed {
|
||||||
|
input: source.to_string(),
|
||||||
|
reason: format!("HTTP {status}"),
|
||||||
|
}
|
||||||
|
.into());
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(content_length) = response.content_length() {
|
||||||
|
let content_length = content_length as usize;
|
||||||
|
validate_size(source, content_length, max_bytes)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let content_type = response
|
||||||
|
.headers()
|
||||||
|
.get(reqwest::header::CONTENT_TYPE)
|
||||||
|
.and_then(|value| value.to_str().ok())
|
||||||
|
.map(ToString::to_string);
|
||||||
|
|
||||||
|
let bytes = response
|
||||||
|
.bytes()
|
||||||
|
.await
|
||||||
|
.map_err(|error| MultimodalError::RemoteFetchFailed {
|
||||||
|
input: source.to_string(),
|
||||||
|
reason: error.to_string(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
validate_size(source, bytes.len(), max_bytes)?;
|
||||||
|
|
||||||
|
let mime = detect_mime(None, bytes.as_ref(), content_type.as_deref()).ok_or_else(|| {
|
||||||
|
MultimodalError::UnsupportedMime {
|
||||||
|
input: source.to_string(),
|
||||||
|
mime: "unknown".to_string(),
|
||||||
|
}
|
||||||
|
})?;
|
||||||
|
|
||||||
|
validate_mime(source, &mime)?;
|
||||||
|
|
||||||
|
Ok(format!("data:{mime};base64,{}", STANDARD.encode(bytes)))
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn normalize_local_image(source: &str, max_bytes: usize) -> anyhow::Result<String> {
|
||||||
|
let path = Path::new(source);
|
||||||
|
if !path.exists() || !path.is_file() {
|
||||||
|
return Err(MultimodalError::ImageSourceNotFound {
|
||||||
|
input: source.to_string(),
|
||||||
|
}
|
||||||
|
.into());
|
||||||
|
}
|
||||||
|
|
||||||
|
let metadata =
|
||||||
|
tokio::fs::metadata(path)
|
||||||
|
.await
|
||||||
|
.map_err(|error| MultimodalError::LocalReadFailed {
|
||||||
|
input: source.to_string(),
|
||||||
|
reason: error.to_string(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
validate_size(source, metadata.len() as usize, max_bytes)?;
|
||||||
|
|
||||||
|
let bytes = tokio::fs::read(path)
|
||||||
|
.await
|
||||||
|
.map_err(|error| MultimodalError::LocalReadFailed {
|
||||||
|
input: source.to_string(),
|
||||||
|
reason: error.to_string(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
validate_size(source, bytes.len(), max_bytes)?;
|
||||||
|
|
||||||
|
let mime =
|
||||||
|
detect_mime(Some(path), &bytes, None).ok_or_else(|| MultimodalError::UnsupportedMime {
|
||||||
|
input: source.to_string(),
|
||||||
|
mime: "unknown".to_string(),
|
||||||
|
})?;
|
||||||
|
|
||||||
|
validate_mime(source, &mime)?;
|
||||||
|
|
||||||
|
Ok(format!("data:{mime};base64,{}", STANDARD.encode(bytes)))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn validate_size(source: &str, size_bytes: usize, max_bytes: usize) -> anyhow::Result<()> {
|
||||||
|
if size_bytes > max_bytes {
|
||||||
|
return Err(MultimodalError::ImageTooLarge {
|
||||||
|
input: source.to_string(),
|
||||||
|
size_bytes,
|
||||||
|
max_bytes,
|
||||||
|
}
|
||||||
|
.into());
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn validate_mime(source: &str, mime: &str) -> anyhow::Result<()> {
|
||||||
|
if ALLOWED_IMAGE_MIME_TYPES
|
||||||
|
.iter()
|
||||||
|
.any(|allowed| *allowed == mime)
|
||||||
|
{
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
Err(MultimodalError::UnsupportedMime {
|
||||||
|
input: source.to_string(),
|
||||||
|
mime: mime.to_string(),
|
||||||
|
}
|
||||||
|
.into())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn detect_mime(
|
||||||
|
path: Option<&Path>,
|
||||||
|
bytes: &[u8],
|
||||||
|
header_content_type: Option<&str>,
|
||||||
|
) -> Option<String> {
|
||||||
|
if let Some(header_mime) = header_content_type.and_then(normalize_content_type) {
|
||||||
|
return Some(header_mime);
|
||||||
|
}
|
||||||
|
|
||||||
|
if let Some(path) = path {
|
||||||
|
if let Some(ext) = path.extension().and_then(|value| value.to_str()) {
|
||||||
|
if let Some(mime) = mime_from_extension(ext) {
|
||||||
|
return Some(mime.to_string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
mime_from_magic(bytes).map(ToString::to_string)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn normalize_content_type(content_type: &str) -> Option<String> {
|
||||||
|
let mime = content_type.split(';').next()?.trim().to_ascii_lowercase();
|
||||||
|
if mime.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(mime)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn mime_from_extension(ext: &str) -> Option<&'static str> {
|
||||||
|
match ext.to_ascii_lowercase().as_str() {
|
||||||
|
"png" => Some("image/png"),
|
||||||
|
"jpg" | "jpeg" => Some("image/jpeg"),
|
||||||
|
"webp" => Some("image/webp"),
|
||||||
|
"gif" => Some("image/gif"),
|
||||||
|
"bmp" => Some("image/bmp"),
|
||||||
|
_ => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn mime_from_magic(bytes: &[u8]) -> Option<&'static str> {
|
||||||
|
if bytes.len() >= 8 && bytes.starts_with(&[0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n']) {
|
||||||
|
return Some("image/png");
|
||||||
|
}
|
||||||
|
|
||||||
|
if bytes.len() >= 3 && bytes.starts_with(&[0xff, 0xd8, 0xff]) {
|
||||||
|
return Some("image/jpeg");
|
||||||
|
}
|
||||||
|
|
||||||
|
if bytes.len() >= 6 && (bytes.starts_with(b"GIF87a") || bytes.starts_with(b"GIF89a")) {
|
||||||
|
return Some("image/gif");
|
||||||
|
}
|
||||||
|
|
||||||
|
if bytes.len() >= 12 && bytes.starts_with(b"RIFF") && &bytes[8..12] == b"WEBP" {
|
||||||
|
return Some("image/webp");
|
||||||
|
}
|
||||||
|
|
||||||
|
if bytes.len() >= 2 && bytes.starts_with(b"BM") {
|
||||||
|
return Some("image/bmp");
|
||||||
|
}
|
||||||
|
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn parse_image_markers_extracts_multiple_markers() {
|
||||||
|
let input = "Check this [IMAGE:/tmp/a.png] and this [IMAGE:https://example.com/b.jpg]";
|
||||||
|
let (cleaned, refs) = parse_image_markers(input);
|
||||||
|
|
||||||
|
assert_eq!(cleaned, "Check this and this");
|
||||||
|
assert_eq!(refs.len(), 2);
|
||||||
|
assert_eq!(refs[0], "/tmp/a.png");
|
||||||
|
assert_eq!(refs[1], "https://example.com/b.jpg");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn parse_image_markers_keeps_invalid_empty_marker() {
|
||||||
|
let input = "hello [IMAGE:] world";
|
||||||
|
let (cleaned, refs) = parse_image_markers(input);
|
||||||
|
|
||||||
|
assert_eq!(cleaned, "hello [IMAGE:] world");
|
||||||
|
assert!(refs.is_empty());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn prepare_messages_normalizes_local_image_to_data_uri() {
|
||||||
|
let temp = tempfile::tempdir().unwrap();
|
||||||
|
let image_path = temp.path().join("sample.png");
|
||||||
|
|
||||||
|
// Minimal PNG signature bytes are enough for MIME detection.
|
||||||
|
std::fs::write(
|
||||||
|
&image_path,
|
||||||
|
[0x89, b'P', b'N', b'G', b'\r', b'\n', 0x1a, b'\n'],
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let messages = vec![ChatMessage::user(format!(
|
||||||
|
"Please inspect this screenshot [IMAGE:{}]",
|
||||||
|
image_path.display()
|
||||||
|
))];
|
||||||
|
|
||||||
|
let prepared = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
|
||||||
|
.await
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
assert!(prepared.contains_images);
|
||||||
|
assert_eq!(prepared.messages.len(), 1);
|
||||||
|
|
||||||
|
let (cleaned, refs) = parse_image_markers(&prepared.messages[0].content);
|
||||||
|
assert_eq!(cleaned, "Please inspect this screenshot");
|
||||||
|
assert_eq!(refs.len(), 1);
|
||||||
|
assert!(refs[0].starts_with("data:image/png;base64,"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn prepare_messages_rejects_too_many_images() {
|
||||||
|
let messages = vec![ChatMessage::user(
|
||||||
|
"[IMAGE:/tmp/1.png]\n[IMAGE:/tmp/2.png]".to_string(),
|
||||||
|
)];
|
||||||
|
|
||||||
|
let config = MultimodalConfig {
|
||||||
|
max_images: 1,
|
||||||
|
max_image_size_mb: 5,
|
||||||
|
allow_remote_fetch: false,
|
||||||
|
};
|
||||||
|
|
||||||
|
let error = prepare_messages_for_provider(&messages, &config)
|
||||||
|
.await
|
||||||
|
.expect_err("should reject image count overflow");
|
||||||
|
|
||||||
|
assert!(error
|
||||||
|
.to_string()
|
||||||
|
.contains("multimodal image limit exceeded"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn prepare_messages_rejects_remote_url_when_disabled() {
|
||||||
|
let messages = vec![ChatMessage::user(
|
||||||
|
"Look [IMAGE:https://example.com/img.png]".to_string(),
|
||||||
|
)];
|
||||||
|
|
||||||
|
let error = prepare_messages_for_provider(&messages, &MultimodalConfig::default())
|
||||||
|
.await
|
||||||
|
.expect_err("should reject remote image URL when fetch is disabled");
|
||||||
|
|
||||||
|
assert!(error
|
||||||
|
.to_string()
|
||||||
|
.contains("multimodal remote image fetch is disabled"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn prepare_messages_rejects_oversized_local_image() {
|
||||||
|
let temp = tempfile::tempdir().unwrap();
|
||||||
|
let image_path = temp.path().join("big.png");
|
||||||
|
|
||||||
|
let bytes = vec![0u8; 1024 * 1024 + 1];
|
||||||
|
std::fs::write(&image_path, bytes).unwrap();
|
||||||
|
|
||||||
|
let messages = vec![ChatMessage::user(format!(
|
||||||
|
"[IMAGE:{}]",
|
||||||
|
image_path.display()
|
||||||
|
))];
|
||||||
|
let config = MultimodalConfig {
|
||||||
|
max_images: 4,
|
||||||
|
max_image_size_mb: 1,
|
||||||
|
allow_remote_fetch: false,
|
||||||
|
};
|
||||||
|
|
||||||
|
let error = prepare_messages_for_provider(&messages, &config)
|
||||||
|
.await
|
||||||
|
.expect_err("should reject oversized local image");
|
||||||
|
|
||||||
|
assert!(error
|
||||||
|
.to_string()
|
||||||
|
.contains("multimodal image size limit exceeded"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn extract_ollama_image_payload_supports_data_uris() {
|
||||||
|
let payload = extract_ollama_image_payload("data:image/png;base64,abcd==")
|
||||||
|
.expect("payload should be extracted");
|
||||||
|
assert_eq!(payload, "abcd==");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -173,6 +173,7 @@ pub async fn run_wizard() -> Result<Config> {
|
||||||
secrets: secrets_config,
|
secrets: secrets_config,
|
||||||
browser: BrowserConfig::default(),
|
browser: BrowserConfig::default(),
|
||||||
http_request: crate::config::HttpRequestConfig::default(),
|
http_request: crate::config::HttpRequestConfig::default(),
|
||||||
|
multimodal: crate::config::MultimodalConfig::default(),
|
||||||
web_search: crate::config::WebSearchConfig::default(),
|
web_search: crate::config::WebSearchConfig::default(),
|
||||||
proxy: crate::config::ProxyConfig::default(),
|
proxy: crate::config::ProxyConfig::default(),
|
||||||
identity: crate::config::IdentityConfig::default(),
|
identity: crate::config::IdentityConfig::default(),
|
||||||
|
|
@ -391,6 +392,7 @@ pub async fn run_quick_setup(
|
||||||
secrets: SecretsConfig::default(),
|
secrets: SecretsConfig::default(),
|
||||||
browser: BrowserConfig::default(),
|
browser: BrowserConfig::default(),
|
||||||
http_request: crate::config::HttpRequestConfig::default(),
|
http_request: crate::config::HttpRequestConfig::default(),
|
||||||
|
multimodal: crate::config::MultimodalConfig::default(),
|
||||||
web_search: crate::config::WebSearchConfig::default(),
|
web_search: crate::config::WebSearchConfig::default(),
|
||||||
proxy: crate::config::ProxyConfig::default(),
|
proxy: crate::config::ProxyConfig::default(),
|
||||||
identity: crate::config::IdentityConfig::default(),
|
identity: crate::config::IdentityConfig::default(),
|
||||||
|
|
|
||||||
|
|
@ -639,6 +639,7 @@ impl Provider for BedrockProvider {
|
||||||
fn capabilities(&self) -> ProviderCapabilities {
|
fn capabilities(&self) -> ProviderCapabilities {
|
||||||
ProviderCapabilities {
|
ProviderCapabilities {
|
||||||
native_tool_calling: true,
|
native_tool_calling: true,
|
||||||
|
vision: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -898,6 +898,7 @@ impl Provider for OpenAiCompatibleProvider {
|
||||||
fn capabilities(&self) -> crate::providers::traits::ProviderCapabilities {
|
fn capabilities(&self) -> crate::providers::traits::ProviderCapabilities {
|
||||||
crate::providers::traits::ProviderCapabilities {
|
crate::providers::traits::ProviderCapabilities {
|
||||||
native_tool_calling: true,
|
native_tool_calling: true,
|
||||||
|
vision: false,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -13,8 +13,8 @@ pub mod traits;
|
||||||
|
|
||||||
#[allow(unused_imports)]
|
#[allow(unused_imports)]
|
||||||
pub use traits::{
|
pub use traits::{
|
||||||
ChatMessage, ChatRequest, ChatResponse, ConversationMessage, Provider, ToolCall,
|
ChatMessage, ChatRequest, ChatResponse, ConversationMessage, Provider, ProviderCapabilityError,
|
||||||
ToolResultMessage,
|
ToolCall, ToolResultMessage,
|
||||||
};
|
};
|
||||||
|
|
||||||
use compatible::{AuthStyle, OpenAiCompatibleProvider};
|
use compatible::{AuthStyle, OpenAiCompatibleProvider};
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,7 @@
|
||||||
use crate::providers::traits::{ChatMessage, ChatResponse, Provider, ToolCall};
|
use crate::multimodal;
|
||||||
|
use crate::providers::traits::{
|
||||||
|
ChatMessage, ChatResponse, Provider, ProviderCapabilities, ToolCall,
|
||||||
|
};
|
||||||
use async_trait::async_trait;
|
use async_trait::async_trait;
|
||||||
use reqwest::Client;
|
use reqwest::Client;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
@ -30,6 +33,8 @@ struct Message {
|
||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
content: Option<String>,
|
content: Option<String>,
|
||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
images: Option<Vec<String>>,
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
tool_calls: Option<Vec<OutgoingToolCall>>,
|
tool_calls: Option<Vec<OutgoingToolCall>>,
|
||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
tool_name: Option<String>,
|
tool_name: Option<String>,
|
||||||
|
|
@ -166,6 +171,31 @@ impl OllamaProvider {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn convert_user_message_content(&self, content: &str) -> (Option<String>, Option<Vec<String>>) {
|
||||||
|
let (cleaned, image_refs) = multimodal::parse_image_markers(content);
|
||||||
|
if image_refs.is_empty() {
|
||||||
|
return (Some(content.to_string()), None);
|
||||||
|
}
|
||||||
|
|
||||||
|
let images: Vec<String> = image_refs
|
||||||
|
.iter()
|
||||||
|
.filter_map(|reference| multimodal::extract_ollama_image_payload(reference))
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if images.is_empty() {
|
||||||
|
return (Some(content.to_string()), None);
|
||||||
|
}
|
||||||
|
|
||||||
|
let cleaned = cleaned.trim();
|
||||||
|
let content = if cleaned.is_empty() {
|
||||||
|
None
|
||||||
|
} else {
|
||||||
|
Some(cleaned.to_string())
|
||||||
|
};
|
||||||
|
|
||||||
|
(content, Some(images))
|
||||||
|
}
|
||||||
|
|
||||||
/// Convert internal chat history format to Ollama's native tool-call message schema.
|
/// Convert internal chat history format to Ollama's native tool-call message schema.
|
||||||
///
|
///
|
||||||
/// `run_tool_call_loop` stores native assistant/tool entries as JSON strings in
|
/// `run_tool_call_loop` stores native assistant/tool entries as JSON strings in
|
||||||
|
|
@ -205,6 +235,7 @@ impl OllamaProvider {
|
||||||
return Message {
|
return Message {
|
||||||
role: "assistant".to_string(),
|
role: "assistant".to_string(),
|
||||||
content,
|
content,
|
||||||
|
images: None,
|
||||||
tool_calls: Some(outgoing_calls),
|
tool_calls: Some(outgoing_calls),
|
||||||
tool_name: None,
|
tool_name: None,
|
||||||
};
|
};
|
||||||
|
|
@ -238,15 +269,28 @@ impl OllamaProvider {
|
||||||
return Message {
|
return Message {
|
||||||
role: "tool".to_string(),
|
role: "tool".to_string(),
|
||||||
content,
|
content,
|
||||||
|
images: None,
|
||||||
tool_calls: None,
|
tool_calls: None,
|
||||||
tool_name,
|
tool_name,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if message.role == "user" {
|
||||||
|
let (content, images) = self.convert_user_message_content(&message.content);
|
||||||
|
return Message {
|
||||||
|
role: "user".to_string(),
|
||||||
|
content,
|
||||||
|
images,
|
||||||
|
tool_calls: None,
|
||||||
|
tool_name: None,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
Message {
|
Message {
|
||||||
role: message.role.clone(),
|
role: message.role.clone(),
|
||||||
content: Some(message.content.clone()),
|
content: Some(message.content.clone()),
|
||||||
|
images: None,
|
||||||
tool_calls: None,
|
tool_calls: None,
|
||||||
tool_name: None,
|
tool_name: None,
|
||||||
}
|
}
|
||||||
|
|
@ -398,6 +442,13 @@ impl OllamaProvider {
|
||||||
|
|
||||||
#[async_trait]
|
#[async_trait]
|
||||||
impl Provider for OllamaProvider {
|
impl Provider for OllamaProvider {
|
||||||
|
fn capabilities(&self) -> ProviderCapabilities {
|
||||||
|
ProviderCapabilities {
|
||||||
|
native_tool_calling: true,
|
||||||
|
vision: true,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async fn chat_with_system(
|
async fn chat_with_system(
|
||||||
&self,
|
&self,
|
||||||
system_prompt: Option<&str>,
|
system_prompt: Option<&str>,
|
||||||
|
|
@ -413,14 +464,17 @@ impl Provider for OllamaProvider {
|
||||||
messages.push(Message {
|
messages.push(Message {
|
||||||
role: "system".to_string(),
|
role: "system".to_string(),
|
||||||
content: Some(sys.to_string()),
|
content: Some(sys.to_string()),
|
||||||
|
images: None,
|
||||||
tool_calls: None,
|
tool_calls: None,
|
||||||
tool_name: None,
|
tool_name: None,
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let (user_content, user_images) = self.convert_user_message_content(message);
|
||||||
messages.push(Message {
|
messages.push(Message {
|
||||||
role: "user".to_string(),
|
role: "user".to_string(),
|
||||||
content: Some(message.to_string()),
|
content: user_content,
|
||||||
|
images: user_images,
|
||||||
tool_calls: None,
|
tool_calls: None,
|
||||||
tool_name: None,
|
tool_name: None,
|
||||||
});
|
});
|
||||||
|
|
@ -862,4 +916,34 @@ mod tests {
|
||||||
assert_eq!(converted[1].content.as_deref(), Some("ok"));
|
assert_eq!(converted[1].content.as_deref(), Some("ok"));
|
||||||
assert!(converted[1].tool_calls.is_none());
|
assert!(converted[1].tool_calls.is_none());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn convert_messages_extracts_images_from_user_marker() {
|
||||||
|
let provider = OllamaProvider::new(None, None);
|
||||||
|
let messages = vec![ChatMessage {
|
||||||
|
role: "user".into(),
|
||||||
|
content: "Inspect this screenshot [IMAGE:data:image/png;base64,abcd==]".into(),
|
||||||
|
}];
|
||||||
|
|
||||||
|
let converted = provider.convert_messages(&messages);
|
||||||
|
assert_eq!(converted.len(), 1);
|
||||||
|
assert_eq!(converted[0].role, "user");
|
||||||
|
assert_eq!(
|
||||||
|
converted[0].content.as_deref(),
|
||||||
|
Some("Inspect this screenshot")
|
||||||
|
);
|
||||||
|
let images = converted[0]
|
||||||
|
.images
|
||||||
|
.as_ref()
|
||||||
|
.expect("images should be present");
|
||||||
|
assert_eq!(images, &vec!["abcd==".to_string()]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn capabilities_include_native_tools_and_vision() {
|
||||||
|
let provider = OllamaProvider::new(None, None);
|
||||||
|
let caps = <OllamaProvider as Provider>::capabilities(&provider);
|
||||||
|
assert!(caps.native_tool_calling);
|
||||||
|
assert!(caps.vision);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -511,6 +511,12 @@ impl Provider for ReliableProvider {
|
||||||
.unwrap_or(false)
|
.unwrap_or(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn supports_vision(&self) -> bool {
|
||||||
|
self.providers
|
||||||
|
.iter()
|
||||||
|
.any(|(_, provider)| provider.supports_vision())
|
||||||
|
}
|
||||||
|
|
||||||
async fn chat_with_tools(
|
async fn chat_with_tools(
|
||||||
&self,
|
&self,
|
||||||
messages: &[ChatMessage],
|
messages: &[ChatMessage],
|
||||||
|
|
|
||||||
|
|
@ -158,6 +158,12 @@ impl Provider for RouterProvider {
|
||||||
.unwrap_or(false)
|
.unwrap_or(false)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn supports_vision(&self) -> bool {
|
||||||
|
self.providers
|
||||||
|
.iter()
|
||||||
|
.any(|(_, provider)| provider.supports_vision())
|
||||||
|
}
|
||||||
|
|
||||||
async fn warmup(&self) -> anyhow::Result<()> {
|
async fn warmup(&self) -> anyhow::Result<()> {
|
||||||
for (name, provider) in &self.providers {
|
for (name, provider) in &self.providers {
|
||||||
tracing::info!(provider = name, "Warming up routed provider");
|
tracing::info!(provider = name, "Warming up routed provider");
|
||||||
|
|
|
||||||
|
|
@ -192,6 +192,15 @@ pub enum StreamError {
|
||||||
Io(#[from] std::io::Error),
|
Io(#[from] std::io::Error),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Structured error returned when a requested capability is not supported.
|
||||||
|
#[derive(Debug, Clone, thiserror::Error)]
|
||||||
|
#[error("provider_capability_error provider={provider} capability={capability} message={message}")]
|
||||||
|
pub struct ProviderCapabilityError {
|
||||||
|
pub provider: String,
|
||||||
|
pub capability: String,
|
||||||
|
pub message: String,
|
||||||
|
}
|
||||||
|
|
||||||
/// Provider capabilities declaration.
|
/// Provider capabilities declaration.
|
||||||
///
|
///
|
||||||
/// Describes what features a provider supports, enabling intelligent
|
/// Describes what features a provider supports, enabling intelligent
|
||||||
|
|
@ -205,6 +214,8 @@ pub struct ProviderCapabilities {
|
||||||
///
|
///
|
||||||
/// When `false`, tools must be injected via system prompt as text.
|
/// When `false`, tools must be injected via system prompt as text.
|
||||||
pub native_tool_calling: bool,
|
pub native_tool_calling: bool,
|
||||||
|
/// Whether the provider supports vision / image inputs.
|
||||||
|
pub vision: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Provider-specific tool payload formats.
|
/// Provider-specific tool payload formats.
|
||||||
|
|
@ -351,6 +362,11 @@ pub trait Provider: Send + Sync {
|
||||||
self.capabilities().native_tool_calling
|
self.capabilities().native_tool_calling
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Whether provider supports multimodal vision input.
|
||||||
|
fn supports_vision(&self) -> bool {
|
||||||
|
self.capabilities().vision
|
||||||
|
}
|
||||||
|
|
||||||
/// Warm up the HTTP connection pool (TLS handshake, DNS, HTTP/2 setup).
|
/// Warm up the HTTP connection pool (TLS handshake, DNS, HTTP/2 setup).
|
||||||
/// Default implementation is a no-op; providers with HTTP clients should override.
|
/// Default implementation is a no-op; providers with HTTP clients should override.
|
||||||
async fn warmup(&self) -> anyhow::Result<()> {
|
async fn warmup(&self) -> anyhow::Result<()> {
|
||||||
|
|
@ -458,6 +474,7 @@ mod tests {
|
||||||
fn capabilities(&self) -> ProviderCapabilities {
|
fn capabilities(&self) -> ProviderCapabilities {
|
||||||
ProviderCapabilities {
|
ProviderCapabilities {
|
||||||
native_tool_calling: true,
|
native_tool_calling: true,
|
||||||
|
vision: true,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -539,18 +556,22 @@ mod tests {
|
||||||
fn provider_capabilities_default() {
|
fn provider_capabilities_default() {
|
||||||
let caps = ProviderCapabilities::default();
|
let caps = ProviderCapabilities::default();
|
||||||
assert!(!caps.native_tool_calling);
|
assert!(!caps.native_tool_calling);
|
||||||
|
assert!(!caps.vision);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn provider_capabilities_equality() {
|
fn provider_capabilities_equality() {
|
||||||
let caps1 = ProviderCapabilities {
|
let caps1 = ProviderCapabilities {
|
||||||
native_tool_calling: true,
|
native_tool_calling: true,
|
||||||
|
vision: false,
|
||||||
};
|
};
|
||||||
let caps2 = ProviderCapabilities {
|
let caps2 = ProviderCapabilities {
|
||||||
native_tool_calling: true,
|
native_tool_calling: true,
|
||||||
|
vision: false,
|
||||||
};
|
};
|
||||||
let caps3 = ProviderCapabilities {
|
let caps3 = ProviderCapabilities {
|
||||||
native_tool_calling: false,
|
native_tool_calling: false,
|
||||||
|
vision: false,
|
||||||
};
|
};
|
||||||
|
|
||||||
assert_eq!(caps1, caps2);
|
assert_eq!(caps1, caps2);
|
||||||
|
|
@ -563,6 +584,12 @@ mod tests {
|
||||||
assert!(provider.supports_native_tools());
|
assert!(provider.supports_native_tools());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn supports_vision_reflects_capabilities_default_mapping() {
|
||||||
|
let provider = CapabilityMockProvider;
|
||||||
|
assert!(provider.supports_vision());
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn tools_payload_variants() {
|
fn tools_payload_variants() {
|
||||||
// Test Gemini variant
|
// Test Gemini variant
|
||||||
|
|
|
||||||
|
|
@ -235,7 +235,9 @@ impl ProxyConfigTool {
|
||||||
}
|
}
|
||||||
|
|
||||||
if args.get("enabled").is_none() && touched_proxy_url {
|
if args.get("enabled").is_none() && touched_proxy_url {
|
||||||
proxy.enabled = true;
|
// Keep auto-enable behavior when users provide a proxy URL, but
|
||||||
|
// auto-disable when all proxy URLs are cleared in the same update.
|
||||||
|
proxy.enabled = proxy.has_any_proxy_url();
|
||||||
}
|
}
|
||||||
|
|
||||||
proxy.no_proxy = proxy.normalized_no_proxy();
|
proxy.no_proxy = proxy.normalized_no_proxy();
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue