feat: full-stack search engine — FTS5, vector search, hybrid merge, embedding cache, chunker
The Full Stack (All Custom): - Vector DB: embeddings stored as BLOB, cosine similarity in pure Rust - Keyword Search: FTS5 virtual tables with BM25 scoring + auto-sync triggers - Hybrid Merge: weighted fusion of vector + keyword results (configurable weights) - Embeddings: provider abstraction (OpenAI, custom URL, noop fallback) - Chunking: line-based markdown chunker with heading preservation - Caching: embedding_cache table with LRU eviction - Safe Reindex: rebuild FTS5 + re-embed missing vectors New modules: - src/memory/embeddings.rs — EmbeddingProvider trait + OpenAI + Noop + factory - src/memory/vector.rs — cosine similarity, vec↔bytes, ScoredResult, hybrid_merge - src/memory/chunker.rs — markdown-aware document splitting Upgraded: - src/memory/sqlite.rs — FTS5 schema, embedding column, hybrid recall, cache, reindex - src/config/schema.rs — MemoryConfig expanded with embedding/search settings - All callers updated to pass api_key for embedding provider 739 tests passing, 0 clippy warnings (Rust 1.93.1), cargo-deny clean
This commit is contained in:
parent
4fceba0740
commit
0e7f501fd6
10 changed files with 1423 additions and 96 deletions
234
src/memory/vector.rs
Normal file
234
src/memory/vector.rs
Normal file
|
|
@ -0,0 +1,234 @@
|
|||
// Vector operations — cosine similarity, f32 ↔ byte (de)serialization, hybrid score merge.
|
||||
|
||||
/// Cosine similarity between two vectors, clamped to `0.0..=1.0`.
///
/// Sums are accumulated in `f64` to limit rounding error and the result is
/// narrowed back to `f32`.
///
/// Returns `0.0` when:
/// - the slices differ in length or are empty,
/// - either vector has (near-)zero magnitude,
/// - any component is NaN or infinite (the similarity is undefined, and a
///   NaN result would otherwise propagate to callers),
/// - the raw cosine is negative (it is clamped to the lower bound).
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }

    let mut dot = 0.0_f64;
    let mut norm_a = 0.0_f64;
    let mut norm_b = 0.0_f64;

    for (&x, &y) in a.iter().zip(b) {
        let (x, y) = (f64::from(x), f64::from(y));
        dot += x * y;
        norm_a += x * x;
        norm_b += y * y;
    }

    let denom = norm_a.sqrt() * norm_b.sqrt();
    // A non-finite magnitude product means some component was NaN/inf.
    // `NaN < EPSILON` is false and `clamp` passes NaN through, so this has to
    // be rejected explicitly. A finite `denom` implies every component was
    // finite, hence `dot` is finite too.
    if !denom.is_finite() || denom < f64::EPSILON {
        return 0.0;
    }

    // Clamp to [0, 1] — embeddings are typically positive.
    // The value is within [0, 1] here, so narrowing to f32 only loses precision.
    #[allow(clippy::cast_possible_truncation)]
    let sim = (dot / denom).clamp(0.0, 1.0) as f32;
    sim
}
|
||||
|
||||
/// Encode a slice of `f32` values as a flat byte buffer.
///
/// Every value contributes its four little-endian bytes, in input order, so
/// the output holds `4 * v.len()` bytes.
pub fn vec_to_bytes(v: &[f32]) -> Vec<u8> {
    v.iter().flat_map(|value| value.to_le_bytes()).collect()
}
|
||||
|
||||
/// Decode a little-endian byte buffer back into `f32` values.
///
/// The input is consumed in groups of four bytes. Trailing bytes that do not
/// make up a complete group are ignored.
pub fn bytes_to_vec(bytes: &[u8]) -> Vec<f32> {
    let mut values = Vec::new();
    for group in bytes.chunks_exact(4) {
        let raw = match *group {
            [b0, b1, b2, b3] => [b0, b1, b2, b3],
            // `chunks_exact(4)` only hands out 4-byte slices; the zero
            // fallback is kept for parity but is not expected to be hit.
            _ => [0; 4],
        };
        values.push(f32::from_le_bytes(raw));
    }
    values
}
|
||||
|
||||
/// One entry in a hybrid (vector + keyword) search result list.
#[derive(Clone, Debug)]
pub struct ScoredResult {
    /// Identifier of the matched item.
    pub id: String,
    /// Score contributed by the vector source; `None` if the id was absent there.
    pub vector_score: Option<f32>,
    /// Score contributed by the keyword source; `None` if the id was absent there.
    pub keyword_score: Option<f32>,
    /// Combined score used for ranking.
    pub final_score: f32,
}
|
||||
|
||||
/// Hybrid merge: combine vector and keyword results with weighted fusion.
|
||||
///
|
||||
/// Normalizes each score set to [0, 1], then computes:
|
||||
/// `final_score` = `vector_weight` * `vector_score` + `keyword_weight` * `keyword_score`
|
||||
///
|
||||
/// Deduplicates by id, keeping the best score from each source.
|
||||
pub fn hybrid_merge(
|
||||
vector_results: &[(String, f32)], // (id, cosine_similarity)
|
||||
keyword_results: &[(String, f32)], // (id, bm25_score)
|
||||
vector_weight: f32,
|
||||
keyword_weight: f32,
|
||||
limit: usize,
|
||||
) -> Vec<ScoredResult> {
|
||||
use std::collections::HashMap;
|
||||
|
||||
let mut map: HashMap<String, ScoredResult> = HashMap::new();
|
||||
|
||||
// Normalize vector scores (already 0–1 from cosine similarity)
|
||||
for (id, score) in vector_results {
|
||||
map.entry(id.clone())
|
||||
.and_modify(|r| r.vector_score = Some(*score))
|
||||
.or_insert_with(|| ScoredResult {
|
||||
id: id.clone(),
|
||||
vector_score: Some(*score),
|
||||
keyword_score: None,
|
||||
final_score: 0.0,
|
||||
});
|
||||
}
|
||||
|
||||
// Normalize keyword scores (BM25 can be any positive number)
|
||||
let max_kw = keyword_results
|
||||
.iter()
|
||||
.map(|(_, s)| *s)
|
||||
.fold(0.0_f32, f32::max);
|
||||
let max_kw = if max_kw < f32::EPSILON { 1.0 } else { max_kw };
|
||||
|
||||
for (id, score) in keyword_results {
|
||||
let normalized = score / max_kw;
|
||||
map.entry(id.clone())
|
||||
.and_modify(|r| r.keyword_score = Some(normalized))
|
||||
.or_insert_with(|| ScoredResult {
|
||||
id: id.clone(),
|
||||
vector_score: None,
|
||||
keyword_score: Some(normalized),
|
||||
final_score: 0.0,
|
||||
});
|
||||
}
|
||||
|
||||
// Compute final scores
|
||||
let mut results: Vec<ScoredResult> = map
|
||||
.into_values()
|
||||
.map(|mut r| {
|
||||
let vs = r.vector_score.unwrap_or(0.0);
|
||||
let ks = r.keyword_score.unwrap_or(0.0);
|
||||
r.final_score = vector_weight * vs + keyword_weight * ks;
|
||||
r
|
||||
})
|
||||
.collect();
|
||||
|
||||
results.sort_by(|a, b| {
|
||||
b.final_score
|
||||
.partial_cmp(&a.final_score)
|
||||
.unwrap_or(std::cmp::Ordering::Equal)
|
||||
});
|
||||
results.truncate(limit);
|
||||
results
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Tolerance used for approximate float comparisons.
    const TOLERANCE: f32 = 0.001;

    // ── cosine_similarity ──────────────────────────────────────────

    /// A vector compared with itself has similarity ~1.
    #[test]
    fn cosine_identical_vectors() {
        let v = [1.0_f32, 2.0, 3.0];
        let sim = cosine_similarity(&v, &v);
        assert!((sim - 1.0).abs() < TOLERANCE);
    }

    /// Perpendicular unit vectors have similarity ~0.
    #[test]
    fn cosine_orthogonal_vectors() {
        let x_axis = [1.0_f32, 0.0, 0.0];
        let y_axis = [0.0_f32, 1.0, 0.0];
        let sim = cosine_similarity(&x_axis, &y_axis);
        assert!(sim.abs() < TOLERANCE);
    }

    /// Nearly parallel vectors score very close to 1.
    #[test]
    fn cosine_similar_vectors() {
        let base = [1.0_f32, 2.0, 3.0];
        let nudged = [1.1_f32, 2.1, 3.1];
        let sim = cosine_similarity(&base, &nudged);
        assert!(sim > 0.99);
    }

    /// Empty inputs short-circuit to 0.
    #[test]
    fn cosine_empty_returns_zero() {
        let sim = cosine_similarity(&[], &[]);
        assert_eq!(sim, 0.0);
    }

    /// Inputs of different lengths short-circuit to 0.
    #[test]
    fn cosine_mismatched_lengths() {
        let sim = cosine_similarity(&[1.0], &[1.0, 2.0]);
        assert_eq!(sim, 0.0);
    }

    /// A zero-magnitude vector yields 0 rather than dividing by zero.
    #[test]
    fn cosine_zero_vector() {
        let zeros = [0.0_f32; 3];
        let other = [1.0_f32, 2.0, 3.0];
        assert_eq!(cosine_similarity(&zeros, &other), 0.0);
    }

    // ── vec_to_bytes / bytes_to_vec ────────────────────────────────

    /// Encoding then decoding reproduces the original values exactly.
    #[test]
    fn vec_bytes_roundtrip() {
        let original = vec![1.0_f32, -2.5, 3.14, 0.0, f32::MAX];
        let restored = bytes_to_vec(&vec_to_bytes(&original));
        assert_eq!(original, restored);
    }

    /// Empty input stays empty in both directions.
    #[test]
    fn vec_bytes_empty() {
        let encoded = vec_to_bytes(&[]);
        assert!(encoded.is_empty());

        let decoded = bytes_to_vec(&encoded);
        assert!(decoded.is_empty());
    }

    // ── hybrid_merge ───────────────────────────────────────────────

    /// With only vector hits, ranking follows the vector scores.
    #[test]
    fn hybrid_merge_vector_only() {
        let vec_results = vec![("a".to_string(), 0.9), ("b".to_string(), 0.5)];
        let merged = hybrid_merge(&vec_results, &[], 0.7, 0.3, 10);

        assert_eq!(merged.len(), 2);
        assert_eq!(merged[0].id, "a");
        assert!(merged[0].final_score > merged[1].final_score);
    }

    /// With only keyword hits, the highest keyword score ranks first.
    #[test]
    fn hybrid_merge_keyword_only() {
        let kw_results = vec![("x".to_string(), 10.0), ("y".to_string(), 5.0)];
        let merged = hybrid_merge(&[], &kw_results, 0.7, 0.3, 10);

        assert_eq!(merged.len(), 2);
        assert_eq!(merged[0].id, "x");
    }

    /// An id present in both sources collapses into a single entry.
    #[test]
    fn hybrid_merge_deduplicates() {
        let vec_results = vec![("a".to_string(), 0.9)];
        let kw_results = vec![("a".to_string(), 10.0)];
        let merged = hybrid_merge(&vec_results, &kw_results, 0.7, 0.3, 10);

        assert_eq!(merged.len(), 1);
        let only = &merged[0];
        assert_eq!(only.id, "a");
        // Should have both scores
        assert!(only.vector_score.is_some());
        assert!(only.keyword_score.is_some());
        // Final score should be higher than either alone
        assert!(only.final_score > 0.7 * 0.9);
    }

    /// No more than `limit` entries are returned.
    #[test]
    fn hybrid_merge_respects_limit() {
        let mut vec_results: Vec<(String, f32)> = Vec::new();
        for i in 0..20 {
            vec_results.push((format!("item_{i}"), 1.0 - i as f32 * 0.05));
        }

        let merged = hybrid_merge(&vec_results, &[], 1.0, 0.0, 5);
        assert_eq!(merged.len(), 5);
    }

    /// Two empty inputs produce an empty result.
    #[test]
    fn hybrid_merge_empty_inputs() {
        assert!(hybrid_merge(&[], &[], 0.7, 0.3, 10).is_empty());
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue