feat: add multimodal image marker support with Ollama vision
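
Webhook, WhatsApp, and Linq handling now route through a shared
run_gateway_chat_with_multimodal helper: it counts image markers in the
incoming message, fails fast with a ProviderCapabilityError when the active
provider lacks vision support, injects the workspace-aware system prompt,
and runs multimodal preparation before calling chat_with_history.

For orientation, a standalone sketch of the marker-counting idea. The
"[image:<path>]" syntax is an assumed placeholder, not the crate's actual
format; the real scanning lives in crate::multimodal::count_image_markers
and operates on ChatMessage values rather than raw text:

// Illustrative only -- the marker syntax is an assumption, not the crate's API.
fn count_image_markers_in_text(text: &str) -> usize {
    text.matches("[image:").count()
}

fn main() {
    let msg = "compare [image:/tmp/a.png] with [image:/tmp/b.png]";
    assert_eq!(count_image_markers_in_text(msg), 2);
}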

Author: Chummy
Date:   2026-02-19 20:24:56 +08:00
parent 63aacb09ff
commit dcd0bf641d
21 changed files with 1152 additions and 78 deletions

@@ -10,7 +10,7 @@
 use crate::channels::{Channel, LinqChannel, SendMessage, WhatsAppChannel};
 use crate::config::Config;
 use crate::memory::{self, Memory, MemoryCategory};
-use crate::providers::{self, Provider};
+use crate::providers::{self, ChatMessage, Provider, ProviderCapabilityError};
 use crate::runtime;
 use crate::security::pairing::{constant_time_eq, is_public_bind, PairingGuard};
 use crate::security::SecurityPolicy;
@@ -666,6 +666,52 @@ async fn persist_pairing_tokens(config: Arc<Mutex<Config>>, pairing: &PairingGua
     Ok(())
 }
+
+async fn run_gateway_chat_with_multimodal(
+    state: &AppState,
+    provider_label: &str,
+    message: &str,
+) -> anyhow::Result<String> {
+    let user_messages = vec![ChatMessage::user(message)];
+    let image_marker_count = crate::multimodal::count_image_markers(&user_messages);
+    if image_marker_count > 0 && !state.provider.supports_vision() {
+        return Err(ProviderCapabilityError {
+            provider: provider_label.to_string(),
+            capability: "vision".to_string(),
+            message: format!(
+                "received {image_marker_count} image marker(s), but this provider does not support vision input"
+            ),
+        }
+        .into());
+    }
+
+    // Keep webhook/gateway prompts aligned with channel behavior by injecting
+    // workspace-aware system context before model invocation.
+    let system_prompt = {
+        let config_guard = state.config.lock();
+        crate::channels::build_system_prompt(
+            &config_guard.workspace_dir,
+            &state.model,
+            &[], // tools - empty for simple chat
+            &[], // skills
+            Some(&config_guard.identity),
+            None, // bootstrap_max_chars - use default
+        )
+    };
+
+    let mut messages = Vec::with_capacity(1 + user_messages.len());
+    messages.push(ChatMessage::system(system_prompt));
+    messages.extend(user_messages);
+
+    let multimodal_config = state.config.lock().multimodal.clone();
+    let prepared =
+        crate::multimodal::prepare_messages_for_provider(&messages, &multimodal_config).await?;
+
+    state
+        .provider
+        .chat_with_history(&prepared.messages, &state.model, state.temperature)
+        .await
+}
 
 /// Webhook request body
 #[derive(serde::Deserialize)]
 pub struct WebhookBody {
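
Note for reviewers: the .into() above converts ProviderCapabilityError into
anyhow::Error, which requires the type to implement std::error::Error. A
minimal sketch of a compatible definition, assuming the real type in
crate::providers carries exactly the three fields constructed in the helper:

use std::fmt;

#[derive(Debug)]
pub struct ProviderCapabilityError {
    pub provider: String,
    pub capability: String,
    pub message: String,
}

impl fmt::Display for ProviderCapabilityError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Surfaces which provider refused which capability, and why.
        write!(
            f,
            "provider `{}` does not support `{}`: {}",
            self.provider, self.capability, self.message
        )
    }
}

impl std::error::Error for ProviderCapabilityError {}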
@@ -787,30 +833,7 @@ async fn handle_webhook(
         messages_count: 1,
     });
 
-    // Build system prompt with workspace context (IDENTITY.md, AGENTS.md, etc.)
-    let system_prompt = {
-        let config_guard = state.config.lock();
-        crate::channels::build_system_prompt(
-            &config_guard.workspace_dir,
-            &state.model,
-            &[], // tools - empty for simple chat
-            &[], // skills
-            Some(&config_guard.identity),
-            None, // bootstrap_max_chars - use default
-        )
-    };
-
-    // Call the LLM with separate system prompt
-    match state
-        .provider
-        .chat_with_system(
-            Some(&system_prompt),
-            message,
-            &state.model,
-            state.temperature,
-        )
-        .await
-    {
+    match run_gateway_chat_with_multimodal(&state, &provider_label, message).await {
         Ok(response) => {
             let duration = started_at.elapsed();
             state
@@ -994,6 +1017,12 @@ async fn handle_whatsapp_message(
     }
 
     // Process each message
+    let provider_label = state
+        .config
+        .lock()
+        .default_provider
+        .clone()
+        .unwrap_or_else(|| "unknown".to_string());
     for msg in &messages {
         tracing::info!(
             "WhatsApp message from {}: {}",
@@ -1010,30 +1039,7 @@ async fn handle_whatsapp_message(
                 .await;
         }
 
-        // Build system prompt with workspace context (IDENTITY.md, AGENTS.md, etc.)
-        let system_prompt = {
-            let config_guard = state.config.lock();
-            crate::channels::build_system_prompt(
-                &config_guard.workspace_dir,
-                &state.model,
-                &[], // tools - empty for simple chat
-                &[], // skills
-                Some(&config_guard.identity),
-                None, // bootstrap_max_chars - use default
-            )
-        };
-
-        // Call the LLM with separate system prompt
-        match state
-            .provider
-            .chat_with_system(
-                Some(&system_prompt),
-                &msg.content,
-                &state.model,
-                state.temperature,
-            )
-            .await
-        {
+        match run_gateway_chat_with_multimodal(&state, &provider_label, &msg.content).await {
             Ok(response) => {
                 // Send reply via WhatsApp
                 if let Err(e) = wa
@@ -1124,6 +1130,12 @@ async fn handle_linq_webhook(
     }
 
     // Process each message
+    let provider_label = state
+        .config
+        .lock()
+        .default_provider
+        .clone()
+        .unwrap_or_else(|| "unknown".to_string());
     for msg in &messages {
         tracing::info!(
             "Linq message from {}: {}",
@@ -1141,11 +1153,7 @@ async fn handle_linq_webhook(
         }
 
         // Call the LLM
-        match state
-            .provider
-            .simple_chat(&msg.content, &state.model, state.temperature)
-            .await
-        {
+        match run_gateway_chat_with_multimodal(&state, &provider_label, &msg.content).await {
             Ok(response) => {
                 // Send reply via Linq
                 if let Err(e) = linq
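
All three handlers now share the same capability gate. A sketch of how a
supports_vision() hook could be expressed -- the trait name and shape here
are assumptions, shown only to make the gate's semantics concrete; the real
method lives on the Provider trait in crate::providers:

pub trait VisionCapability {
    // Conservative default: the gate in run_gateway_chat_with_multimodal
    // rejects image markers unless a backend explicitly opts in.
    fn supports_vision(&self) -> bool {
        false
    }
}

struct OllamaVisionModel;

impl VisionCapability for OllamaVisionModel {
    // Ollama vision models (per the commit title) would opt in.
    fn supports_vision(&self) -> bool {
        true
    }
}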