fix(tools): harden schema cleaner edge cases
This commit is contained in:
parent
e871c9550b
commit
9b465e2940
1 changed files with 152 additions and 72 deletions
|
|
@ -1,24 +1,17 @@
|
||||||
//! JSON Schema cleaning and validation for LLM tool calling compatibility.
|
//! JSON Schema cleaning and validation for LLM tool-calling compatibility.
|
||||||
//!
|
//!
|
||||||
//! Different LLM providers support different subsets of JSON Schema. This module
|
//! Different providers support different subsets of JSON Schema. This module
|
||||||
//! normalizes tool schemas to maximize compatibility across providers (Gemini,
|
//! normalizes tool schemas to improve cross-provider compatibility while
|
||||||
//! Anthropic, OpenAI) while preserving semantic meaning.
|
//! preserving semantic intent.
|
||||||
//!
|
//!
|
||||||
//! # Why Schema Cleaning?
|
//! ## What this module does
|
||||||
//!
|
//!
|
||||||
//! LLM providers reject schemas with unsupported keywords, causing tool calls to fail:
|
//! 1. Removes unsupported keywords per provider strategy
|
||||||
//! - **Gemini**: Rejects `$ref`, `additionalProperties`, `minLength`, `pattern`, etc.
|
//! 2. Resolves local `$ref` entries from `$defs` and `definitions`
|
||||||
//! - **Anthropic**: Generally permissive but doesn't support `$ref` resolution
|
//! 3. Flattens literal `anyOf` / `oneOf` unions into `enum`
|
||||||
//! - **OpenAI**: Supports most keywords but has quirks with `anyOf`/`oneOf`
|
//! 4. Strips nullable variants from unions and `type` arrays
|
||||||
//!
|
//! 5. Converts `const` to single-value `enum`
|
||||||
//! # What This Module Does
|
//! 6. Detects circular references and stops recursion safely
|
||||||
//!
|
|
||||||
//! 1. **Removes unsupported keywords** - Strips provider-specific incompatible fields
|
|
||||||
//! 2. **Resolves `$ref`** - Inlines referenced schemas from `$defs`/`definitions`
|
|
||||||
//! 3. **Flattens unions** - Converts `anyOf`/`oneOf` with literals to `enum`
|
|
||||||
//! 4. **Strips null variants** - Removes `type: null` from unions (most providers don't need it)
|
|
||||||
//! 5. **Normalizes types** - Converts `const` to `enum`, handles type arrays
|
|
||||||
//! 6. **Prevents cycles** - Detects and breaks circular `$ref` chains
|
|
||||||
//!
|
//!
|
||||||
//! # Example
|
//! # Example
|
||||||
//!
|
//!
|
||||||
|
|
@ -31,17 +24,17 @@
|
||||||
//! "properties": {
|
//! "properties": {
|
||||||
//! "name": {
|
//! "name": {
|
||||||
//! "type": "string",
|
//! "type": "string",
|
||||||
//! "minLength": 1, // ← Gemini rejects this
|
//! "minLength": 1, // Gemini rejects this
|
||||||
//! "pattern": "^[a-z]+$" // ← Gemini rejects this
|
//! "pattern": "^[a-z]+$" // Gemini rejects this
|
||||||
//! },
|
//! },
|
||||||
//! "age": {
|
//! "age": {
|
||||||
//! "$ref": "#/$defs/Age" // ← Needs resolution
|
//! "$ref": "#/$defs/Age" // Needs resolution
|
||||||
//! }
|
//! }
|
||||||
//! },
|
//! },
|
||||||
//! "$defs": {
|
//! "$defs": {
|
||||||
//! "Age": {
|
//! "Age": {
|
||||||
//! "type": "integer",
|
//! "type": "integer",
|
||||||
//! "minimum": 0 // ← Gemini rejects this
|
//! "minimum": 0 // Gemini rejects this
|
||||||
//! }
|
//! }
|
||||||
//! }
|
//! }
|
||||||
//! });
|
//! });
|
||||||
|
|
@ -58,29 +51,10 @@
|
||||||
//! // }
|
//! // }
|
||||||
//! ```
|
//! ```
|
||||||
//!
|
//!
|
||||||
//! # Design Philosophy (vs OpenClaw)
|
|
||||||
//!
|
|
||||||
//! **OpenClaw** (TypeScript):
|
|
||||||
//! - Focuses primarily on Gemini compatibility
|
|
||||||
//! - Uses recursive object traversal with mutation
|
|
||||||
//! - ~350 lines of complex nested logic
|
|
||||||
//!
|
|
||||||
//! **Zeroclaw** (this module):
|
|
||||||
//! - ✅ **Multi-provider support** - Configurable for different LLMs
|
|
||||||
//! - ✅ **Immutable by default** - Creates new schemas, preserves originals
|
|
||||||
//! - ✅ **Performance** - Uses efficient Rust patterns (Cow, match)
|
|
||||||
//! - ✅ **Safety** - No runtime panics, comprehensive error handling
|
|
||||||
//! - ✅ **Extensible** - Easy to add new cleaning strategies
|
|
||||||
|
|
||||||
use serde_json::{json, Map, Value};
|
use serde_json::{json, Map, Value};
|
||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
|
|
||||||
/// Keywords that Gemini's Cloud Code Assist API rejects.
|
/// Keywords that Gemini rejects for tool schemas.
|
||||||
///
|
|
||||||
/// Based on real-world testing, Gemini rejects schemas with these keywords,
|
|
||||||
/// even though they're valid in JSON Schema draft 2020-12.
|
|
||||||
///
|
|
||||||
/// Reference: OpenClaw `clean-for-gemini.ts`
|
|
||||||
pub const GEMINI_UNSUPPORTED_KEYWORDS: &[&str] = &[
|
pub const GEMINI_UNSUPPORTED_KEYWORDS: &[&str] = &[
|
||||||
// Schema composition
|
// Schema composition
|
||||||
"$ref",
|
"$ref",
|
||||||
|
|
@ -88,33 +62,27 @@ pub const GEMINI_UNSUPPORTED_KEYWORDS: &[&str] = &[
|
||||||
"$id",
|
"$id",
|
||||||
"$defs",
|
"$defs",
|
||||||
"definitions",
|
"definitions",
|
||||||
|
|
||||||
// Property constraints
|
// Property constraints
|
||||||
"additionalProperties",
|
"additionalProperties",
|
||||||
"patternProperties",
|
"patternProperties",
|
||||||
|
|
||||||
// String constraints
|
// String constraints
|
||||||
"minLength",
|
"minLength",
|
||||||
"maxLength",
|
"maxLength",
|
||||||
"pattern",
|
"pattern",
|
||||||
"format",
|
"format",
|
||||||
|
|
||||||
// Number constraints
|
// Number constraints
|
||||||
"minimum",
|
"minimum",
|
||||||
"maximum",
|
"maximum",
|
||||||
"multipleOf",
|
"multipleOf",
|
||||||
|
|
||||||
// Array constraints
|
// Array constraints
|
||||||
"minItems",
|
"minItems",
|
||||||
"maxItems",
|
"maxItems",
|
||||||
"uniqueItems",
|
"uniqueItems",
|
||||||
|
|
||||||
// Object constraints
|
// Object constraints
|
||||||
"minProperties",
|
"minProperties",
|
||||||
"maxProperties",
|
"maxProperties",
|
||||||
|
|
||||||
// Non-standard
|
// Non-standard
|
||||||
"examples", // OpenAPI keyword, not JSON Schema
|
"examples", // OpenAPI keyword, not JSON Schema
|
||||||
];
|
];
|
||||||
|
|
||||||
/// Keywords that should be preserved during cleaning (metadata).
|
/// Keywords that should be preserved during cleaning (metadata).
|
||||||
|
|
@ -139,7 +107,7 @@ impl CleaningStrategy {
|
||||||
match self {
|
match self {
|
||||||
Self::Gemini => GEMINI_UNSUPPORTED_KEYWORDS,
|
Self::Gemini => GEMINI_UNSUPPORTED_KEYWORDS,
|
||||||
Self::Anthropic => &["$ref", "$defs", "definitions"], // Anthropic doesn't resolve refs
|
Self::Anthropic => &["$ref", "$defs", "definitions"], // Anthropic doesn't resolve refs
|
||||||
Self::OpenAI => &[], // OpenAI is most permissive
|
Self::OpenAI => &[], // OpenAI is most permissive
|
||||||
Self::Conservative => &["$ref", "$defs", "definitions", "additionalProperties"],
|
Self::Conservative => &["$ref", "$defs", "definitions", "additionalProperties"],
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -202,9 +170,9 @@ impl SchemaCleanr {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
// ────────────────────────────────────────────────────────────────────
|
// --------------------------------------------------------------------
|
||||||
// Internal implementation
|
// Internal implementation
|
||||||
// ────────────────────────────────────────────────────────────────────
|
// --------------------------------------------------------------------
|
||||||
|
|
||||||
/// Extract $defs and definitions into a flat map for reference resolution.
|
/// Extract $defs and definitions into a flat map for reference resolution.
|
||||||
fn extract_defs(obj: &Map<String, Value>) -> HashMap<String, Value> {
|
fn extract_defs(obj: &Map<String, Value>) -> HashMap<String, Value> {
|
||||||
|
|
@ -236,9 +204,11 @@ impl SchemaCleanr {
|
||||||
) -> Value {
|
) -> Value {
|
||||||
match schema {
|
match schema {
|
||||||
Value::Object(obj) => Self::clean_object(obj, defs, strategy, ref_stack),
|
Value::Object(obj) => Self::clean_object(obj, defs, strategy, ref_stack),
|
||||||
Value::Array(arr) => {
|
Value::Array(arr) => Value::Array(
|
||||||
Value::Array(arr.into_iter().map(|v| Self::clean_with_defs(v, defs, strategy, ref_stack)).collect())
|
arr.into_iter()
|
||||||
}
|
.map(|v| Self::clean_with_defs(v, defs, strategy, ref_stack))
|
||||||
|
.collect(),
|
||||||
|
),
|
||||||
other => other,
|
other => other,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -265,6 +235,7 @@ impl SchemaCleanr {
|
||||||
// Build cleaned object
|
// Build cleaned object
|
||||||
let mut cleaned = Map::new();
|
let mut cleaned = Map::new();
|
||||||
let unsupported: HashSet<&str> = strategy.unsupported_keywords().iter().copied().collect();
|
let unsupported: HashSet<&str> = strategy.unsupported_keywords().iter().copied().collect();
|
||||||
|
let has_union = obj.contains_key("anyOf") || obj.contains_key("oneOf");
|
||||||
|
|
||||||
for (key, value) in obj {
|
for (key, value) in obj {
|
||||||
// Skip unsupported keywords
|
// Skip unsupported keywords
|
||||||
|
|
@ -279,7 +250,7 @@ impl SchemaCleanr {
|
||||||
cleaned.insert("enum".to_string(), json!([value]));
|
cleaned.insert("enum".to_string(), json!([value]));
|
||||||
}
|
}
|
||||||
// Skip type if we have anyOf/oneOf (they define the type)
|
// Skip type if we have anyOf/oneOf (they define the type)
|
||||||
"type" if cleaned.contains_key("anyOf") || cleaned.contains_key("oneOf") => {
|
"type" if has_union => {
|
||||||
// Skip
|
// Skip
|
||||||
}
|
}
|
||||||
// Handle type arrays (remove null)
|
// Handle type arrays (remove null)
|
||||||
|
|
@ -300,9 +271,15 @@ impl SchemaCleanr {
|
||||||
let cleaned_value = Self::clean_union(value, defs, strategy, ref_stack);
|
let cleaned_value = Self::clean_union(value, defs, strategy, ref_stack);
|
||||||
cleaned.insert(key, cleaned_value);
|
cleaned.insert(key, cleaned_value);
|
||||||
}
|
}
|
||||||
// Keep all other keys as-is
|
// Keep all other keys, cleaning nested objects/arrays recursively.
|
||||||
_ => {
|
_ => {
|
||||||
cleaned.insert(key, value);
|
let cleaned_value = match value {
|
||||||
|
Value::Object(_) | Value::Array(_) => {
|
||||||
|
Self::clean_with_defs(value, defs, strategy, ref_stack)
|
||||||
|
}
|
||||||
|
other => other,
|
||||||
|
};
|
||||||
|
cleaned.insert(key, cleaned_value);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
@ -326,7 +303,7 @@ impl SchemaCleanr {
|
||||||
|
|
||||||
// Try to resolve local ref (#/$defs/Name or #/definitions/Name)
|
// Try to resolve local ref (#/$defs/Name or #/definitions/Name)
|
||||||
if let Some(def_name) = Self::parse_local_ref(ref_value) {
|
if let Some(def_name) = Self::parse_local_ref(ref_value) {
|
||||||
if let Some(definition) = defs.get(def_name) {
|
if let Some(definition) = defs.get(def_name.as_str()) {
|
||||||
ref_stack.insert(ref_value.to_string());
|
ref_stack.insert(ref_value.to_string());
|
||||||
let cleaned = Self::clean_with_defs(definition.clone(), defs, strategy, ref_stack);
|
let cleaned = Self::clean_with_defs(definition.clone(), defs, strategy, ref_stack);
|
||||||
ref_stack.remove(ref_value);
|
ref_stack.remove(ref_value);
|
||||||
|
|
@ -340,18 +317,41 @@ impl SchemaCleanr {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Parse a local JSON Pointer ref (#/$defs/Name).
|
/// Parse a local JSON Pointer ref (#/$defs/Name).
|
||||||
fn parse_local_ref(ref_value: &str) -> Option<&str> {
|
fn parse_local_ref(ref_value: &str) -> Option<String> {
|
||||||
ref_value
|
ref_value
|
||||||
.strip_prefix("#/$defs/")
|
.strip_prefix("#/$defs/")
|
||||||
.or_else(|| ref_value.strip_prefix("#/definitions/"))
|
.or_else(|| ref_value.strip_prefix("#/definitions/"))
|
||||||
.map(Self::decode_json_pointer)
|
.map(Self::decode_json_pointer)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Decode JSON Pointer escaping (~0 = ~, ~1 = /).
|
/// Decode JSON Pointer escaping (`~0` = `~`, `~1` = `/`).
|
||||||
fn decode_json_pointer(segment: &str) -> &str {
|
fn decode_json_pointer(segment: &str) -> String {
|
||||||
// Simplified: in practice, most definition names don't need decoding
|
if !segment.contains('~') {
|
||||||
// Full implementation would use a Cow<str> to handle ~0/~1 escaping
|
return segment.to_string();
|
||||||
segment
|
}
|
||||||
|
|
||||||
|
let mut decoded = String::with_capacity(segment.len());
|
||||||
|
let mut chars = segment.chars().peekable();
|
||||||
|
|
||||||
|
while let Some(ch) = chars.next() {
|
||||||
|
if ch == '~' {
|
||||||
|
match chars.peek().copied() {
|
||||||
|
Some('0') => {
|
||||||
|
chars.next();
|
||||||
|
decoded.push('~');
|
||||||
|
}
|
||||||
|
Some('1') => {
|
||||||
|
chars.next();
|
||||||
|
decoded.push('/');
|
||||||
|
}
|
||||||
|
_ => decoded.push('~'),
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
decoded.push(ch);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
decoded
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Try to simplify anyOf/oneOf to a simpler form.
|
/// Try to simplify anyOf/oneOf to a simpler form.
|
||||||
|
|
@ -421,7 +421,7 @@ impl SchemaCleanr {
|
||||||
|
|
||||||
/// Try to flatten anyOf/oneOf with only literal values to enum.
|
/// Try to flatten anyOf/oneOf with only literal values to enum.
|
||||||
///
|
///
|
||||||
/// Example: `anyOf: [{const: "a"}, {const: "b"}]` → `{type: "string", enum: ["a", "b"]}`
|
/// Example: `anyOf: [{const: "a"}, {const: "b"}]` -> `{type: "string", enum: ["a", "b"]}`
|
||||||
fn try_flatten_literal_union(variants: &[Value]) -> Option<Value> {
|
fn try_flatten_literal_union(variants: &[Value]) -> Option<Value> {
|
||||||
if variants.is_empty() {
|
if variants.is_empty() {
|
||||||
return None;
|
return None;
|
||||||
|
|
@ -473,10 +473,13 @@ impl SchemaCleanr {
|
||||||
.filter(|v| v.as_str() != Some("null"))
|
.filter(|v| v.as_str() != Some("null"))
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
if non_null.len() == 1 {
|
match non_null.len() {
|
||||||
non_null[0].clone()
|
0 => Value::String("null".to_string()),
|
||||||
} else {
|
1 => non_null
|
||||||
Value::Array(non_null)
|
.into_iter()
|
||||||
|
.next()
|
||||||
|
.unwrap_or(Value::String("null".to_string())),
|
||||||
|
_ => Value::Array(non_null),
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
value
|
value
|
||||||
|
|
@ -740,8 +743,12 @@ mod tests {
|
||||||
|
|
||||||
let cleaned = SchemaCleanr::clean_for_gemini(schema);
|
let cleaned = SchemaCleanr::clean_for_gemini(schema);
|
||||||
|
|
||||||
assert!(cleaned["properties"]["user"]["properties"]["name"].get("minLength").is_none());
|
assert!(cleaned["properties"]["user"]["properties"]["name"]
|
||||||
assert!(cleaned["properties"]["user"].get("additionalProperties").is_none());
|
.get("minLength")
|
||||||
|
.is_none());
|
||||||
|
assert!(cleaned["properties"]["user"]
|
||||||
|
.get("additionalProperties")
|
||||||
|
.is_none());
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
@ -755,4 +762,77 @@ mod tests {
|
||||||
// Should simplify to just "string"
|
// Should simplify to just "string"
|
||||||
assert_eq!(cleaned["type"], "string");
|
assert_eq!(cleaned["type"], "string");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_type_array_only_null_preserved() {
|
||||||
|
let schema = json!({
|
||||||
|
"type": ["null"]
|
||||||
|
});
|
||||||
|
|
||||||
|
let cleaned = SchemaCleanr::clean_for_gemini(schema);
|
||||||
|
|
||||||
|
assert_eq!(cleaned["type"], "null");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_ref_with_json_pointer_escape() {
|
||||||
|
let schema = json!({
|
||||||
|
"$ref": "#/$defs/Foo~1Bar",
|
||||||
|
"$defs": {
|
||||||
|
"Foo/Bar": {
|
||||||
|
"type": "string"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
let cleaned = SchemaCleanr::clean_for_gemini(schema);
|
||||||
|
|
||||||
|
assert_eq!(cleaned["type"], "string");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_skip_type_when_non_simplifiable_union_exists() {
|
||||||
|
let schema = json!({
|
||||||
|
"type": "object",
|
||||||
|
"oneOf": [
|
||||||
|
{
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"a": { "type": "string" }
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "object",
|
||||||
|
"properties": {
|
||||||
|
"b": { "type": "number" }
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
});
|
||||||
|
|
||||||
|
let cleaned = SchemaCleanr::clean_for_gemini(schema);
|
||||||
|
|
||||||
|
assert!(cleaned.get("type").is_none());
|
||||||
|
assert!(cleaned.get("oneOf").is_some());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_clean_nested_unknown_schema_keyword() {
|
||||||
|
let schema = json!({
|
||||||
|
"not": {
|
||||||
|
"$ref": "#/$defs/Age"
|
||||||
|
},
|
||||||
|
"$defs": {
|
||||||
|
"Age": {
|
||||||
|
"type": "integer",
|
||||||
|
"minimum": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
let cleaned = SchemaCleanr::clean_for_gemini(schema);
|
||||||
|
|
||||||
|
assert_eq!(cleaned["not"]["type"], "integer");
|
||||||
|
assert!(cleaned["not"].get("minimum").is_none());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue