//! JSON Schema cleaning and validation for LLM tool-calling compatibility. //! //! Different providers support different subsets of JSON Schema. This module //! normalizes tool schemas to improve cross-provider compatibility while //! preserving semantic intent. //! //! ## What this module does //! //! 1. Removes unsupported keywords per provider strategy //! 2. Resolves local `$ref` entries from `$defs` and `definitions` //! 3. Flattens literal `anyOf` / `oneOf` unions into `enum` //! 4. Strips nullable variants from unions and `type` arrays //! 5. Converts `const` to single-value `enum` //! 6. Detects circular references and stops recursion safely //! //! # Example //! //! ```rust //! use serde_json::json; //! use zeroclaw::tools::schema::SchemaCleanr; //! //! let dirty_schema = json!({ //! "type": "object", //! "properties": { //! "name": { //! "type": "string", //! "minLength": 1, // Gemini rejects this //! "pattern": "^[a-z]+$" // Gemini rejects this //! }, //! "age": { //! "$ref": "#/$defs/Age" // Needs resolution //! } //! }, //! "$defs": { //! "Age": { //! "type": "integer", //! "minimum": 0 // Gemini rejects this //! } //! } //! }); //! //! let cleaned = SchemaCleanr::clean_for_gemini(dirty_schema); //! //! // Result: //! // { //! // "type": "object", //! // "properties": { //! // "name": { "type": "string" }, //! // "age": { "type": "integer" } //! // } //! // } //! ``` //! use serde_json::{json, Map, Value}; use std::collections::{HashMap, HashSet}; /// Keywords that Gemini rejects for tool schemas. pub const GEMINI_UNSUPPORTED_KEYWORDS: &[&str] = &[ // Schema composition "$ref", "$schema", "$id", "$defs", "definitions", // Property constraints "additionalProperties", "patternProperties", // String constraints "minLength", "maxLength", "pattern", "format", // Number constraints "minimum", "maximum", "multipleOf", // Array constraints "minItems", "maxItems", "uniqueItems", // Object constraints "minProperties", "maxProperties", // Non-standard "examples", // OpenAPI keyword, not JSON Schema ]; /// Keywords that should be preserved during cleaning (metadata). const SCHEMA_META_KEYS: &[&str] = &["description", "title", "default"]; /// Schema cleaning strategies for different LLM providers. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum CleaningStrategy { /// Gemini (Google AI / Vertex AI) - Most restrictive Gemini, /// Anthropic Claude - Moderately permissive Anthropic, /// OpenAI GPT - Most permissive OpenAI, /// Conservative: Remove only universally unsupported keywords Conservative, } impl CleaningStrategy { /// Get the list of unsupported keywords for this strategy. pub fn unsupported_keywords(self) -> &'static [&'static str] { match self { Self::Gemini => GEMINI_UNSUPPORTED_KEYWORDS, Self::Anthropic => &["$ref", "$defs", "definitions"], // Anthropic doesn't resolve refs Self::OpenAI => &[], // OpenAI is most permissive Self::Conservative => &["$ref", "$defs", "definitions", "additionalProperties"], } } } /// JSON Schema cleaner optimized for LLM tool calling. pub struct SchemaCleanr; impl SchemaCleanr { /// Clean schema for Gemini compatibility (strictest). /// /// This is the most aggressive cleaning strategy, removing all keywords /// that Gemini's API rejects. pub fn clean_for_gemini(schema: Value) -> Value { Self::clean(schema, CleaningStrategy::Gemini) } /// Clean schema for Anthropic compatibility. pub fn clean_for_anthropic(schema: Value) -> Value { Self::clean(schema, CleaningStrategy::Anthropic) } /// Clean schema for OpenAI compatibility (most permissive). pub fn clean_for_openai(schema: Value) -> Value { Self::clean(schema, CleaningStrategy::OpenAI) } /// Clean schema with specified strategy. pub fn clean(schema: Value, strategy: CleaningStrategy) -> Value { // Extract $defs for reference resolution let defs = if let Some(obj) = schema.as_object() { Self::extract_defs(obj) } else { HashMap::new() }; Self::clean_with_defs(schema, &defs, strategy, &mut HashSet::new()) } /// Validate that a schema is suitable for LLM tool calling. /// /// Returns an error if the schema is invalid or missing required fields. pub fn validate(schema: &Value) -> anyhow::Result<()> { let obj = schema .as_object() .ok_or_else(|| anyhow::anyhow!("Schema must be an object"))?; // Must have 'type' field if !obj.contains_key("type") { anyhow::bail!("Schema missing required 'type' field"); } // If type is 'object', should have 'properties' if let Some(Value::String(t)) = obj.get("type") { if t == "object" && !obj.contains_key("properties") { tracing::warn!("Object schema without 'properties' field may cause issues"); } } Ok(()) } // -------------------------------------------------------------------- // Internal implementation // -------------------------------------------------------------------- /// Extract $defs and definitions into a flat map for reference resolution. fn extract_defs(obj: &Map) -> HashMap { let mut defs = HashMap::new(); // Extract from $defs (JSON Schema 2019-09+) if let Some(Value::Object(defs_obj)) = obj.get("$defs") { for (key, value) in defs_obj { defs.insert(key.clone(), value.clone()); } } // Extract from definitions (JSON Schema draft-07) if let Some(Value::Object(defs_obj)) = obj.get("definitions") { for (key, value) in defs_obj { defs.insert(key.clone(), value.clone()); } } defs } /// Recursively clean a schema value. fn clean_with_defs( schema: Value, defs: &HashMap, strategy: CleaningStrategy, ref_stack: &mut HashSet, ) -> Value { match schema { Value::Object(obj) => Self::clean_object(obj, defs, strategy, ref_stack), Value::Array(arr) => Value::Array( arr.into_iter() .map(|v| Self::clean_with_defs(v, defs, strategy, ref_stack)) .collect(), ), other => other, } } /// Clean an object schema. fn clean_object( obj: Map, defs: &HashMap, strategy: CleaningStrategy, ref_stack: &mut HashSet, ) -> Value { // Handle $ref resolution if let Some(Value::String(ref_value)) = obj.get("$ref") { return Self::resolve_ref(ref_value, &obj, defs, strategy, ref_stack); } // Handle anyOf/oneOf simplification if obj.contains_key("anyOf") || obj.contains_key("oneOf") { if let Some(simplified) = Self::try_simplify_union(&obj, defs, strategy, ref_stack) { return simplified; } } // Build cleaned object let mut cleaned = Map::new(); let unsupported: HashSet<&str> = strategy.unsupported_keywords().iter().copied().collect(); let has_union = obj.contains_key("anyOf") || obj.contains_key("oneOf"); for (key, value) in obj { // Skip unsupported keywords if unsupported.contains(key.as_str()) { continue; } // Special handling for specific keys match key.as_str() { // Convert const to enum "const" => { cleaned.insert("enum".to_string(), json!([value])); } // Skip type if we have anyOf/oneOf (they define the type) "type" if has_union => { // Skip } // Handle type arrays (remove null) "type" if matches!(value, Value::Array(_)) => { let cleaned_value = Self::clean_type_array(value); cleaned.insert(key, cleaned_value); } // Recursively clean nested schemas "properties" => { let cleaned_value = Self::clean_properties(value, defs, strategy, ref_stack); cleaned.insert(key, cleaned_value); } "items" => { let cleaned_value = Self::clean_with_defs(value, defs, strategy, ref_stack); cleaned.insert(key, cleaned_value); } "anyOf" | "oneOf" | "allOf" => { let cleaned_value = Self::clean_union(value, defs, strategy, ref_stack); cleaned.insert(key, cleaned_value); } // Keep all other keys, cleaning nested objects/arrays recursively. _ => { let cleaned_value = match value { Value::Object(_) | Value::Array(_) => { Self::clean_with_defs(value, defs, strategy, ref_stack) } other => other, }; cleaned.insert(key, cleaned_value); } } } Value::Object(cleaned) } /// Resolve a $ref to its definition. fn resolve_ref( ref_value: &str, obj: &Map, defs: &HashMap, strategy: CleaningStrategy, ref_stack: &mut HashSet, ) -> Value { // Prevent circular references if ref_stack.contains(ref_value) { tracing::warn!("Circular $ref detected: {}", ref_value); return Self::preserve_meta(obj, Value::Object(Map::new())); } // Try to resolve local ref (#/$defs/Name or #/definitions/Name) if let Some(def_name) = Self::parse_local_ref(ref_value) { if let Some(definition) = defs.get(def_name.as_str()) { ref_stack.insert(ref_value.to_string()); let cleaned = Self::clean_with_defs(definition.clone(), defs, strategy, ref_stack); ref_stack.remove(ref_value); return Self::preserve_meta(obj, cleaned); } } // Can't resolve: return empty object with metadata tracing::warn!("Cannot resolve $ref: {}", ref_value); Self::preserve_meta(obj, Value::Object(Map::new())) } /// Parse a local JSON Pointer ref (#/$defs/Name). fn parse_local_ref(ref_value: &str) -> Option { ref_value .strip_prefix("#/$defs/") .or_else(|| ref_value.strip_prefix("#/definitions/")) .map(Self::decode_json_pointer) } /// Decode JSON Pointer escaping (`~0` = `~`, `~1` = `/`). fn decode_json_pointer(segment: &str) -> String { if !segment.contains('~') { return segment.to_string(); } let mut decoded = String::with_capacity(segment.len()); let mut chars = segment.chars().peekable(); while let Some(ch) = chars.next() { if ch == '~' { match chars.peek().copied() { Some('0') => { chars.next(); decoded.push('~'); } Some('1') => { chars.next(); decoded.push('/'); } _ => decoded.push('~'), } } else { decoded.push(ch); } } decoded } /// Try to simplify anyOf/oneOf to a simpler form. fn try_simplify_union( obj: &Map, defs: &HashMap, strategy: CleaningStrategy, ref_stack: &mut HashSet, ) -> Option { let union_key = if obj.contains_key("anyOf") { "anyOf" } else if obj.contains_key("oneOf") { "oneOf" } else { return None; }; let variants = obj.get(union_key)?.as_array()?; // Clean all variants first let cleaned_variants: Vec = variants .iter() .map(|v| Self::clean_with_defs(v.clone(), defs, strategy, ref_stack)) .collect(); // Strip null variants let non_null: Vec = cleaned_variants .into_iter() .filter(|v| !Self::is_null_schema(v)) .collect(); // If only one variant remains after stripping nulls, return it if non_null.len() == 1 { return Some(Self::preserve_meta(obj, non_null[0].clone())); } // Try to flatten to enum if all variants are literals if let Some(enum_value) = Self::try_flatten_literal_union(&non_null) { return Some(Self::preserve_meta(obj, enum_value)); } None } /// Check if a schema represents null type. fn is_null_schema(value: &Value) -> bool { if let Some(obj) = value.as_object() { // { const: null } if let Some(Value::Null) = obj.get("const") { return true; } // { enum: [null] } if let Some(Value::Array(arr)) = obj.get("enum") { if arr.len() == 1 && matches!(arr[0], Value::Null) { return true; } } // { type: "null" } if let Some(Value::String(t)) = obj.get("type") { if t == "null" { return true; } } } false } /// Try to flatten anyOf/oneOf with only literal values to enum. /// /// Example: `anyOf: [{const: "a"}, {const: "b"}]` -> `{type: "string", enum: ["a", "b"]}` fn try_flatten_literal_union(variants: &[Value]) -> Option { if variants.is_empty() { return None; } let mut all_values = Vec::new(); let mut common_type: Option = None; for variant in variants { let obj = variant.as_object()?; // Extract literal value from const or single-item enum let literal_value = if let Some(const_val) = obj.get("const") { const_val.clone() } else if let Some(Value::Array(arr)) = obj.get("enum") { if arr.len() == 1 { arr[0].clone() } else { return None; } } else { return None; }; // Check type consistency let variant_type = obj.get("type")?.as_str()?; match &common_type { None => common_type = Some(variant_type.to_string()), Some(t) if t != variant_type => return None, _ => {} } all_values.push(literal_value); } common_type.map(|t| { json!({ "type": t, "enum": all_values }) }) } /// Clean type array, removing null. fn clean_type_array(value: Value) -> Value { if let Value::Array(types) = value { let non_null: Vec = types .into_iter() .filter(|v| v.as_str() != Some("null")) .collect(); match non_null.len() { 0 => Value::String("null".to_string()), 1 => non_null .into_iter() .next() .unwrap_or(Value::String("null".to_string())), _ => Value::Array(non_null), } } else { value } } /// Clean properties object. fn clean_properties( value: Value, defs: &HashMap, strategy: CleaningStrategy, ref_stack: &mut HashSet, ) -> Value { if let Value::Object(props) = value { let cleaned: Map = props .into_iter() .map(|(k, v)| (k, Self::clean_with_defs(v, defs, strategy, ref_stack))) .collect(); Value::Object(cleaned) } else { value } } /// Clean union (anyOf/oneOf/allOf). fn clean_union( value: Value, defs: &HashMap, strategy: CleaningStrategy, ref_stack: &mut HashSet, ) -> Value { if let Value::Array(variants) = value { let cleaned: Vec = variants .into_iter() .map(|v| Self::clean_with_defs(v, defs, strategy, ref_stack)) .collect(); Value::Array(cleaned) } else { value } } /// Preserve metadata (description, title, default) from source to target. fn preserve_meta(source: &Map, mut target: Value) -> Value { if let Value::Object(target_obj) = &mut target { for &key in SCHEMA_META_KEYS { if let Some(value) = source.get(key) { target_obj.insert(key.to_string(), value.clone()); } } } target } } #[cfg(test)] mod tests { use super::*; #[test] fn test_remove_unsupported_keywords() { let schema = json!({ "type": "string", "minLength": 1, "maxLength": 100, "pattern": "^[a-z]+$", "description": "A lowercase string" }); let cleaned = SchemaCleanr::clean_for_gemini(schema); assert_eq!(cleaned["type"], "string"); assert_eq!(cleaned["description"], "A lowercase string"); assert!(cleaned.get("minLength").is_none()); assert!(cleaned.get("maxLength").is_none()); assert!(cleaned.get("pattern").is_none()); } #[test] fn test_resolve_ref() { let schema = json!({ "type": "object", "properties": { "age": { "$ref": "#/$defs/Age" } }, "$defs": { "Age": { "type": "integer", "minimum": 0 } } }); let cleaned = SchemaCleanr::clean_for_gemini(schema); assert_eq!(cleaned["properties"]["age"]["type"], "integer"); assert!(cleaned["properties"]["age"].get("minimum").is_none()); // Stripped by Gemini strategy assert!(cleaned.get("$defs").is_none()); } #[test] fn test_flatten_literal_union() { let schema = json!({ "anyOf": [ { "const": "admin", "type": "string" }, { "const": "user", "type": "string" }, { "const": "guest", "type": "string" } ] }); let cleaned = SchemaCleanr::clean_for_gemini(schema); assert_eq!(cleaned["type"], "string"); assert!(cleaned["enum"].is_array()); let enum_values = cleaned["enum"].as_array().unwrap(); assert_eq!(enum_values.len(), 3); assert!(enum_values.contains(&json!("admin"))); assert!(enum_values.contains(&json!("user"))); assert!(enum_values.contains(&json!("guest"))); } #[test] fn test_strip_null_from_union() { let schema = json!({ "oneOf": [ { "type": "string" }, { "type": "null" } ] }); let cleaned = SchemaCleanr::clean_for_gemini(schema); // Should simplify to just { type: "string" } assert_eq!(cleaned["type"], "string"); assert!(cleaned.get("oneOf").is_none()); } #[test] fn test_const_to_enum() { let schema = json!({ "const": "fixed_value", "description": "A constant" }); let cleaned = SchemaCleanr::clean_for_gemini(schema); assert_eq!(cleaned["enum"], json!(["fixed_value"])); assert_eq!(cleaned["description"], "A constant"); assert!(cleaned.get("const").is_none()); } #[test] fn test_preserve_metadata() { let schema = json!({ "$ref": "#/$defs/Name", "description": "User's name", "title": "Name Field", "default": "Anonymous", "$defs": { "Name": { "type": "string" } } }); let cleaned = SchemaCleanr::clean_for_gemini(schema); assert_eq!(cleaned["type"], "string"); assert_eq!(cleaned["description"], "User's name"); assert_eq!(cleaned["title"], "Name Field"); assert_eq!(cleaned["default"], "Anonymous"); } #[test] fn test_circular_ref_prevention() { let schema = json!({ "type": "object", "properties": { "parent": { "$ref": "#/$defs/Node" } }, "$defs": { "Node": { "type": "object", "properties": { "child": { "$ref": "#/$defs/Node" } } } } }); // Should not panic on circular reference let cleaned = SchemaCleanr::clean_for_gemini(schema); assert_eq!(cleaned["properties"]["parent"]["type"], "object"); // Circular reference should be broken } #[test] fn test_validate_schema() { let valid = json!({ "type": "object", "properties": { "name": { "type": "string" } } }); assert!(SchemaCleanr::validate(&valid).is_ok()); let invalid = json!({ "properties": { "name": { "type": "string" } } }); assert!(SchemaCleanr::validate(&invalid).is_err()); } #[test] fn test_strategy_differences() { let schema = json!({ "type": "string", "minLength": 1, "description": "A string field" }); // Gemini: Most restrictive (removes minLength) let gemini = SchemaCleanr::clean_for_gemini(schema.clone()); assert!(gemini.get("minLength").is_none()); assert_eq!(gemini["type"], "string"); assert_eq!(gemini["description"], "A string field"); // OpenAI: Most permissive (keeps minLength) let openai = SchemaCleanr::clean_for_openai(schema.clone()); assert_eq!(openai["minLength"], 1); // OpenAI allows validation keywords assert_eq!(openai["type"], "string"); } #[test] fn test_nested_properties() { let schema = json!({ "type": "object", "properties": { "user": { "type": "object", "properties": { "name": { "type": "string", "minLength": 1 } }, "additionalProperties": false } } }); let cleaned = SchemaCleanr::clean_for_gemini(schema); assert!(cleaned["properties"]["user"]["properties"]["name"] .get("minLength") .is_none()); assert!(cleaned["properties"]["user"] .get("additionalProperties") .is_none()); } #[test] fn test_type_array_null_removal() { let schema = json!({ "type": ["string", "null"] }); let cleaned = SchemaCleanr::clean_for_gemini(schema); // Should simplify to just "string" assert_eq!(cleaned["type"], "string"); } #[test] fn test_type_array_only_null_preserved() { let schema = json!({ "type": ["null"] }); let cleaned = SchemaCleanr::clean_for_gemini(schema); assert_eq!(cleaned["type"], "null"); } #[test] fn test_ref_with_json_pointer_escape() { let schema = json!({ "$ref": "#/$defs/Foo~1Bar", "$defs": { "Foo/Bar": { "type": "string" } } }); let cleaned = SchemaCleanr::clean_for_gemini(schema); assert_eq!(cleaned["type"], "string"); } #[test] fn test_skip_type_when_non_simplifiable_union_exists() { let schema = json!({ "type": "object", "oneOf": [ { "type": "object", "properties": { "a": { "type": "string" } } }, { "type": "object", "properties": { "b": { "type": "number" } } } ] }); let cleaned = SchemaCleanr::clean_for_gemini(schema); assert!(cleaned.get("type").is_none()); assert!(cleaned.get("oneOf").is_some()); } #[test] fn test_clean_nested_unknown_schema_keyword() { let schema = json!({ "not": { "$ref": "#/$defs/Age" }, "$defs": { "Age": { "type": "integer", "minimum": 0 } } }); let cleaned = SchemaCleanr::clean_for_gemini(schema); assert_eq!(cleaned["not"]["type"], "integer"); assert!(cleaned["not"].get("minimum").is_none()); } }