zeroclaw/src/memory/vector.rs
argenis de la rosa 1fd51f1984 fix: resolve all clippy --all-targets warnings across 15 files
- gateway/mod.rs: move send_json before test module (items_after_test_module)
- memory/vector.rs: fix float_cmp, cast_precision_loss, approx_constant
- memory/chunker.rs: fix format_collect, format_push_string, write_with_newline
- memory/sqlite.rs: fix useless_vec
- heartbeat/engine.rs: fix format_collect, write_with_newline
- config/schema.rs: fix needless_raw_string_hashes
- tools/composio.rs: fix needless_raw_string_hashes
- integrations/registry.rs: fix uninlined_format_args, unused import
- tunnel/mod.rs: fix doc_markdown
- skills/mod.rs: allow similar_names in test module
- channels/cli.rs: fix unreadable_literal
- observability/mod.rs: fix manual_string_new
- runtime/mod.rs: fix manual_string_new
- examples/custom_memory.rs: add Default impl (new_without_default)
- examples/custom_channel.rs: fix needless_borrows_for_generic_args
2026-02-14 03:52:57 -05:00

402 lines
12 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Vector operations — cosine similarity, normalization, hybrid merge.
/// Cosine similarity between two vectors, clamped to the range 0.0 to 1.0.
///
/// Returns `0.0` when the slices are empty or differ in length, when the product of
/// the two norms is non-finite or smaller than `f64::EPSILON` (e.g. a zero vector or
/// NaN/infinite components), or when the resulting ratio is non-finite.
/// Negative similarities (vectors pointing in opposite directions) are clamped to `0.0`.
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.is_empty() || a.len() != b.len() {
        return 0.0;
    }

    // Accumulate the dot product and both squared norms in f64, in a single pass.
    let (dot, sq_norm_a, sq_norm_b) = a.iter().zip(b).fold(
        (0.0_f64, 0.0_f64, 0.0_f64),
        |(dot, sq_a, sq_b), (&x, &y)| {
            let (x, y) = (f64::from(x), f64::from(y));
            (dot + x * y, sq_a + x * x, sq_b + y * y)
        },
    );

    let magnitude = sq_norm_a.sqrt() * sq_norm_b.sqrt();
    let usable = magnitude.is_finite() && magnitude >= f64::EPSILON;
    if !usable {
        return 0.0;
    }

    match dot / magnitude {
        ratio if ratio.is_finite() => {
            // Clamp to [0, 1] — embeddings are typically positive
            #[allow(clippy::cast_possible_truncation)]
            let clamped = ratio.clamp(0.0, 1.0) as f32;
            clamped
        }
        _ => 0.0,
    }
}
/// Encode a slice of `f32` values as a flat byte buffer.
///
/// Each value contributes its 4 little-endian bytes, in input order, so the output
/// length is always `v.len() * 4`.
pub fn vec_to_bytes(v: &[f32]) -> Vec<u8> {
    let mut encoded = Vec::with_capacity(v.len() * 4);
    encoded.extend(v.iter().flat_map(|value| value.to_le_bytes()));
    encoded
}
/// Decode a byte buffer into `f32` values, reading 4 little-endian bytes per value.
///
/// Only complete 4-byte groups are decoded; up to 3 trailing bytes that do not form
/// a full group are ignored.
pub fn bytes_to_vec(bytes: &[u8]) -> Vec<f32> {
    let mut decoded = Vec::with_capacity(bytes.len() / 4);
    for word in bytes.chunks_exact(4) {
        // `chunks_exact(4)` only yields 4-byte slices; the fallback arm mirrors the
        // defensive all-zero default for any other length.
        let raw: [u8; 4] = match *word {
            [b0, b1, b2, b3] => [b0, b1, b2, b3],
            _ => [0; 4],
        };
        decoded.push(f32::from_le_bytes(raw));
    }
    decoded
}
/// One entry in the output of a hybrid (vector + keyword) merge.
#[derive(Debug, Clone)]
pub struct ScoredResult {
    /// Identifier of the result; entries from both sources are matched on this value.
    pub id: String,
    /// Score from the vector result list, or `None` if the id did not appear there.
    pub vector_score: Option<f32>,
    /// Keyword score after division by the largest keyword score, or `None` if the id
    /// did not appear in the keyword result list.
    pub keyword_score: Option<f32>,
    /// Weighted sum of the two scores, with a missing score counted as `0.0`.
    pub final_score: f32,
}
/// Hybrid merge: combine vector and keyword results with weighted fusion.
///
/// Vector scores are used as given (cosine similarity is expected to already lie in
/// the range 0.0 to 1.0). Keyword scores are divided by the largest keyword score so
/// the best keyword hit maps to 1.0; if no keyword score reaches `f32::EPSILON`
/// (empty list, all zero, or all negative) the divisor is 1.0 and the scores pass
/// through unchanged. The fused score is:
/// `final_score` = `vector_weight` * `vector_score` + `keyword_weight` * `keyword_score`
/// where a score missing from one source counts as 0.0.
///
/// Deduplicates by id, keeping the best (highest) score from each source when an id
/// is repeated within one list.
///
/// The output is sorted by `final_score` descending, with NaN scores last and ties
/// broken by ascending id so the order (and therefore what survives truncation) does
/// not depend on hash-map iteration order. At most `limit` results are returned.
pub fn hybrid_merge(
    vector_results: &[(String, f32)],  // (id, cosine_similarity)
    keyword_results: &[(String, f32)], // (id, bm25_score)
    vector_weight: f32,
    keyword_weight: f32,
    limit: usize,
) -> Vec<ScoredResult> {
    use std::collections::HashMap;

    // Highest of the previously stored score (if any) and a new candidate.
    // `f32::max` returns the non-NaN operand, so a NaN never displaces a real score.
    fn best_of(previous: Option<f32>, candidate: f32) -> f32 {
        previous.map_or(candidate, |p| p.max(candidate))
    }

    let mut map: HashMap<String, ScoredResult> = HashMap::new();

    // Vector scores need no rescaling (already 0.0 to 1.0 from cosine similarity).
    for (id, score) in vector_results {
        map.entry(id.clone())
            .and_modify(|r| r.vector_score = Some(best_of(r.vector_score, *score)))
            .or_insert_with(|| ScoredResult {
                id: id.clone(),
                vector_score: Some(*score),
                keyword_score: None,
                final_score: 0.0,
            });
    }

    // Normalize keyword scores (BM25 can be any positive number)
    let max_kw = keyword_results
        .iter()
        .map(|(_, s)| *s)
        .fold(0.0_f32, f32::max);
    // Guard against division by (near) zero when there is no positive keyword score.
    let max_kw = if max_kw < f32::EPSILON { 1.0 } else { max_kw };
    for (id, score) in keyword_results {
        let normalized = score / max_kw;
        map.entry(id.clone())
            .and_modify(|r| r.keyword_score = Some(best_of(r.keyword_score, normalized)))
            .or_insert_with(|| ScoredResult {
                id: id.clone(),
                vector_score: None,
                keyword_score: Some(normalized),
                final_score: 0.0,
            });
    }

    // Compute final scores
    let mut results: Vec<ScoredResult> = map
        .into_values()
        .map(|mut r| {
            let vs = r.vector_score.unwrap_or(0.0);
            let ks = r.keyword_score.unwrap_or(0.0);
            r.final_score = vector_weight * vs + keyword_weight * ks;
            r
        })
        .collect();

    // Map NaN to the lowest possible key so the comparator is a genuine total order
    // (a requirement of `sort_by`) and NaN-scored entries sink to the end.
    let rank_key = |score: f32| if score.is_nan() { f32::NEG_INFINITY } else { score };
    results.sort_by(|a, b| {
        rank_key(b.final_score)
            .partial_cmp(&rank_key(a.final_score))
            .unwrap_or(std::cmp::Ordering::Equal)
            // Ids are unique after deduplication, so this makes the order deterministic.
            .then_with(|| a.id.cmp(&b.id))
    });
    results.truncate(limit);
    results
}
// Unit tests for cosine similarity, f32 <-> byte serialization, and hybrid merging.
// Exact float comparisons and the literal 3.14 are intentional in test data, hence
// the lint allowances below.
#[cfg(test)]
#[allow(
    clippy::float_cmp,
    clippy::approx_constant,
    clippy::cast_precision_loss,
    clippy::cast_possible_truncation
)]
mod tests {
    use super::*;

    #[test]
    fn cosine_identical_vectors() {
        let v = vec![1.0, 2.0, 3.0];
        let sim = cosine_similarity(&v, &v);
        assert!((sim - 1.0).abs() < 0.001);
    }

    #[test]
    fn cosine_orthogonal_vectors() {
        let a = vec![1.0, 0.0, 0.0];
        let b = vec![0.0, 1.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!(sim.abs() < 0.001);
    }

    #[test]
    fn cosine_similar_vectors() {
        let a = vec![1.0, 2.0, 3.0];
        let b = vec![1.1, 2.1, 3.1];
        let sim = cosine_similarity(&a, &b);
        assert!(sim > 0.99);
    }

    #[test]
    fn cosine_empty_returns_zero() {
        assert_eq!(cosine_similarity(&[], &[]), 0.0);
    }

    #[test]
    fn cosine_mismatched_lengths() {
        assert_eq!(cosine_similarity(&[1.0], &[1.0, 2.0]), 0.0);
    }

    #[test]
    fn cosine_zero_vector() {
        let a = vec![0.0, 0.0, 0.0];
        let b = vec![1.0, 2.0, 3.0];
        assert_eq!(cosine_similarity(&a, &b), 0.0);
    }

    #[test]
    fn vec_bytes_roundtrip() {
        let original = vec![1.0_f32, -2.5, 3.14, 0.0, f32::MAX];
        let bytes = vec_to_bytes(&original);
        let restored = bytes_to_vec(&bytes);
        assert_eq!(original, restored);
    }

    #[test]
    fn vec_bytes_empty() {
        let bytes = vec_to_bytes(&[]);
        assert!(bytes.is_empty());
        let restored = bytes_to_vec(&bytes);
        assert!(restored.is_empty());
    }

    #[test]
    fn hybrid_merge_vector_only() {
        let vec_results = vec![("a".into(), 0.9), ("b".into(), 0.5)];
        let merged = hybrid_merge(&vec_results, &[], 0.7, 0.3, 10);
        assert_eq!(merged.len(), 2);
        assert_eq!(merged[0].id, "a");
        assert!(merged[0].final_score > merged[1].final_score);
    }

    #[test]
    fn hybrid_merge_keyword_only() {
        let kw_results = vec![("x".into(), 10.0), ("y".into(), 5.0)];
        let merged = hybrid_merge(&[], &kw_results, 0.7, 0.3, 10);
        assert_eq!(merged.len(), 2);
        assert_eq!(merged[0].id, "x");
    }

    #[test]
    fn hybrid_merge_deduplicates() {
        let vec_results = vec![("a".into(), 0.9)];
        let kw_results = vec![("a".into(), 10.0)];
        let merged = hybrid_merge(&vec_results, &kw_results, 0.7, 0.3, 10);
        assert_eq!(merged.len(), 1);
        assert_eq!(merged[0].id, "a");
        // Should have both scores
        assert!(merged[0].vector_score.is_some());
        assert!(merged[0].keyword_score.is_some());
        // Final score should be higher than either source's contribution alone:
        // the vector part (0.7 * 0.9) and the keyword part (0.3 * 1.0 = 0.3).
        assert!(merged[0].final_score > 0.7 * 0.9);
        assert!(merged[0].final_score > 0.3);
    }

    #[test]
    fn hybrid_merge_respects_limit() {
        let vec_results: Vec<(String, f32)> = (0..20)
            .map(|i| (format!("item_{i}"), 1.0 - i as f32 * 0.05))
            .collect();
        let merged = hybrid_merge(&vec_results, &[], 1.0, 0.0, 5);
        assert_eq!(merged.len(), 5);
    }

    #[test]
    fn hybrid_merge_empty_inputs() {
        let merged = hybrid_merge(&[], &[], 0.7, 0.3, 10);
        assert!(merged.is_empty());
    }

    // ── Edge cases: cosine similarity ────────────────────────────

    #[test]
    fn cosine_nan_returns_zero() {
        let a = vec![f32::NAN, 1.0, 2.0];
        let b = vec![1.0, 2.0, 3.0];
        let sim = cosine_similarity(&a, &b);
        // NaN propagates into the norm, so the non-finite denominator check yields 0.0.
        assert_eq!(sim, 0.0, "Expected 0.0, got {sim}");
    }

    #[test]
    fn cosine_infinity_returns_zero_or_finite() {
        let a = vec![f32::INFINITY, 1.0];
        let b = vec![1.0, 2.0];
        let sim = cosine_similarity(&a, &b);
        assert!(sim.is_finite(), "Expected finite, got {sim}");
    }

    #[test]
    fn cosine_negative_values() {
        let a = vec![-1.0, -2.0, -3.0];
        let b = vec![-1.0, -2.0, -3.0];
        // Identical negative vectors point the same way → cosine = 1.0, which the
        // [0, 1] clamp leaves untouched.
        let sim = cosine_similarity(&a, &b);
        assert!((sim - 1.0).abs() < 0.001);
    }

    #[test]
    fn cosine_opposite_vectors_clamped() {
        let a = vec![1.0, 0.0];
        let b = vec![-1.0, 0.0];
        // Cosine = -1.0, clamped to 0.0
        let sim = cosine_similarity(&a, &b);
        assert!(sim.abs() < f32::EPSILON);
    }

    #[test]
    fn cosine_high_dimensional() {
        let a: Vec<f32> = (0..1536).map(|i| (f64::from(i) * 0.001) as f32).collect();
        let b: Vec<f32> = (0..1536)
            .map(|i| (f64::from(i) * 0.001 + 0.0001) as f32)
            .collect();
        let sim = cosine_similarity(&a, &b);
        assert!(
            sim > 0.99,
            "High-dim similar vectors should be close: {sim}"
        );
    }

    #[test]
    fn cosine_single_element() {
        assert!((cosine_similarity(&[5.0], &[5.0]) - 1.0).abs() < 0.001);
        assert!(cosine_similarity(&[5.0], &[-5.0]).abs() < f32::EPSILON);
    }

    #[test]
    fn cosine_both_zero_vectors() {
        let a = vec![0.0, 0.0];
        let b = vec![0.0, 0.0];
        assert!(cosine_similarity(&a, &b).abs() < f32::EPSILON);
    }

    // ── Edge cases: vec↔bytes serialization ──────────────────────

    #[test]
    fn bytes_to_vec_non_aligned_truncates() {
        // 5 bytes → only first 4 used (1 float), last byte dropped
        let bytes = vec![0u8, 0, 0, 0, 0xFF];
        let result = bytes_to_vec(&bytes);
        assert_eq!(result.len(), 1);
        assert!(result[0].abs() < f32::EPSILON);
    }

    #[test]
    fn bytes_to_vec_three_bytes_returns_empty() {
        let bytes = vec![1u8, 2, 3];
        let result = bytes_to_vec(&bytes);
        assert!(result.is_empty());
    }

    #[test]
    fn vec_bytes_roundtrip_special_values() {
        let special = vec![f32::MIN, f32::MAX, f32::EPSILON, -0.0, 0.0];
        let bytes = vec_to_bytes(&special);
        let restored = bytes_to_vec(&bytes);
        assert_eq!(special.len(), restored.len());
        // Compare bit patterns so that -0.0 and 0.0 are told apart.
        for (a, b) in special.iter().zip(restored.iter()) {
            assert_eq!(a.to_bits(), b.to_bits());
        }
    }

    #[test]
    fn vec_bytes_roundtrip_nan_preserves_bits() {
        let nan_vec = vec![f32::NAN];
        let bytes = vec_to_bytes(&nan_vec);
        let restored = bytes_to_vec(&bytes);
        assert_eq!(restored.len(), 1);
        assert!(restored[0].is_nan());
        // NaN != NaN, so the payload has to be compared through its bit pattern.
        assert_eq!(restored[0].to_bits(), nan_vec[0].to_bits());
    }

    // ── Edge cases: hybrid merge ─────────────────────────────────

    #[test]
    fn hybrid_merge_limit_zero() {
        let vec_results = vec![("a".into(), 0.9)];
        let merged = hybrid_merge(&vec_results, &[], 0.7, 0.3, 0);
        assert!(merged.is_empty());
    }

    #[test]
    fn hybrid_merge_zero_weights() {
        let vec_results = vec![("a".into(), 0.9)];
        let kw_results = vec![("b".into(), 10.0)];
        let merged = hybrid_merge(&vec_results, &kw_results, 0.0, 0.0, 10);
        // Both entries must still be returned, otherwise the loop below is vacuous.
        assert_eq!(merged.len(), 2);
        // All final scores should be 0.0
        for r in &merged {
            assert!(r.final_score.abs() < f32::EPSILON);
        }
    }

    #[test]
    fn hybrid_merge_negative_keyword_scores() {
        // BM25 scores are negated in our code, but raw negatives shouldn't crash
        let kw_results = vec![("a".into(), -5.0), ("b".into(), -1.0)];
        let merged = hybrid_merge(&[], &kw_results, 0.7, 0.3, 10);
        assert_eq!(merged.len(), 2);
        // Should still produce finite scores
        for r in &merged {
            assert!(r.final_score.is_finite());
        }
    }

    #[test]
    fn hybrid_merge_duplicate_ids_in_same_source() {
        let vec_results = vec![("a".into(), 0.9), ("a".into(), 0.5)];
        let merged = hybrid_merge(&vec_results, &[], 1.0, 0.0, 10);
        // Should deduplicate — only 1 entry for "a"
        assert_eq!(merged.len(), 1);
    }

    #[test]
    fn hybrid_merge_large_bm25_normalization() {
        let kw_results = vec![("a".into(), 1000.0), ("b".into(), 500.0), ("c".into(), 1.0)];
        let merged = hybrid_merge(&[], &kw_results, 0.0, 1.0, 10);
        // "a" should have normalized score of 1.0
        assert!((merged[0].keyword_score.unwrap() - 1.0).abs() < 0.001);
        // "b" should have 0.5
        assert!((merged[1].keyword_score.unwrap() - 0.5).abs() < 0.001);
    }

    #[test]
    fn hybrid_merge_single_item() {
        let merged = hybrid_merge(&[("only".into(), 0.8)], &[], 0.7, 0.3, 10);
        assert_eq!(merged.len(), 1);
        assert_eq!(merged[0].id, "only");
    }
}