feat: add multimodal image marker support with Ollama vision

This commit is contained in:
Chummy 2026-02-19 20:24:56 +08:00
parent 63aacb09ff
commit dcd0bf641d
21 changed files with 1152 additions and 78 deletions

View file

@ -1,4 +1,7 @@
use crate::providers::traits::{ChatMessage, ChatResponse, Provider, ToolCall};
use crate::multimodal;
use crate::providers::traits::{
ChatMessage, ChatResponse, Provider, ProviderCapabilities, ToolCall,
};
use async_trait::async_trait;
use reqwest::Client;
use serde::{Deserialize, Serialize};
@ -30,6 +33,8 @@ struct Message {
#[serde(skip_serializing_if = "Option::is_none")]
content: Option<String>,
#[serde(skip_serializing_if = "Option::is_none")]
images: Option<Vec<String>>,
#[serde(skip_serializing_if = "Option::is_none")]
tool_calls: Option<Vec<OutgoingToolCall>>,
#[serde(skip_serializing_if = "Option::is_none")]
tool_name: Option<String>,
@ -166,6 +171,31 @@ impl OllamaProvider {
}
}
/// Split a user message into the text and image parts of an Ollama message.
///
/// The content is passed through `multimodal::parse_image_markers`, which
/// yields a cleaned text and a list of image references. Each reference is
/// offered to `multimodal::extract_ollama_image_payload`; references for
/// which it returns `None` are dropped.
///
/// Returns `(content, images)`:
/// - When no payload could be produced (no references at all, or none
///   convertible), the original `content` is returned unchanged and
///   `images` is `None`.
/// - Otherwise `images` holds the payloads and `content` is the trimmed
///   cleaned text, or `None` if nothing but whitespace remains.
fn convert_user_message_content(&self, content: &str) -> (Option<String>, Option<Vec<String>>) {
    let (stripped, refs) = multimodal::parse_image_markers(content);

    let mut payloads: Vec<String> = Vec::new();
    for reference in refs.iter() {
        if let Some(payload) = multimodal::extract_ollama_image_payload(reference) {
            payloads.push(payload);
        }
    }

    // An empty reference list and a list with no usable payloads both fall
    // back to sending the message text exactly as it was received.
    if payloads.is_empty() {
        return (Some(content.to_string()), None);
    }

    let text = stripped.trim();
    let text = (!text.is_empty()).then(|| text.to_string());
    (text, Some(payloads))
}
/// Convert internal chat history format to Ollama's native tool-call message schema.
///
/// `run_tool_call_loop` stores native assistant/tool entries as JSON strings in
@ -205,6 +235,7 @@ impl OllamaProvider {
return Message {
role: "assistant".to_string(),
content,
images: None,
tool_calls: Some(outgoing_calls),
tool_name: None,
};
@ -238,15 +269,28 @@ impl OllamaProvider {
return Message {
role: "tool".to_string(),
content,
images: None,
tool_calls: None,
tool_name,
};
}
}
if message.role == "user" {
let (content, images) = self.convert_user_message_content(&message.content);
return Message {
role: "user".to_string(),
content,
images,
tool_calls: None,
tool_name: None,
};
}
Message {
role: message.role.clone(),
content: Some(message.content.clone()),
images: None,
tool_calls: None,
tool_name: None,
}
@ -398,6 +442,13 @@ impl OllamaProvider {
#[async_trait]
impl Provider for OllamaProvider {
/// Report this provider's capability flags.
///
/// Both `native_tool_calling` and `vision` are set to `true`.
fn capabilities(&self) -> ProviderCapabilities {
    ProviderCapabilities {
        vision: true,
        native_tool_calling: true,
    }
}
async fn chat_with_system(
&self,
system_prompt: Option<&str>,
@ -413,14 +464,17 @@ impl Provider for OllamaProvider {
messages.push(Message {
role: "system".to_string(),
content: Some(sys.to_string()),
images: None,
tool_calls: None,
tool_name: None,
});
}
let (user_content, user_images) = self.convert_user_message_content(message);
messages.push(Message {
role: "user".to_string(),
content: Some(message.to_string()),
content: user_content,
images: user_images,
tool_calls: None,
tool_name: None,
});
@ -862,4 +916,34 @@ mod tests {
assert_eq!(converted[1].content.as_deref(), Some("ok"));
assert!(converted[1].tool_calls.is_none());
}
/// A user message carrying an `[IMAGE:data:...;base64,...]` marker should be
/// converted into a message whose text no longer contains the marker and
/// whose `images` list holds the base64 payload.
#[test]
fn convert_messages_extracts_images_from_user_marker() {
    let provider = OllamaProvider::new(None, None);
    let history = vec![ChatMessage {
        role: "user".into(),
        content: "Inspect this screenshot [IMAGE:data:image/png;base64,abcd==]".into(),
    }];

    let converted = provider.convert_messages(&history);
    assert_eq!(converted.len(), 1);

    let user_msg = &converted[0];
    assert_eq!(user_msg.role, "user");
    assert_eq!(user_msg.content.as_deref(), Some("Inspect this screenshot"));

    let payloads = user_msg.images.as_ref().expect("images should be present");
    assert_eq!(payloads, &vec!["abcd==".to_string()]);
}
/// The provider's `Provider::capabilities` implementation should report both
/// native tool calling and vision as enabled.
#[test]
fn capabilities_include_native_tools_and_vision() {
    let ollama = OllamaProvider::new(None, None);
    // Call through the trait path so the `Provider` implementation is the one exercised.
    let reported = Provider::capabilities(&ollama);
    assert!(reported.native_tool_calling);
    assert!(reported.vision);
}
}