feat: full-stack search engine — FTS5, vector search, hybrid merge, embedding cache, chunker

The Full Stack (All Custom):
- Vector DB: embeddings stored as BLOB, cosine similarity in pure Rust
- Keyword Search: FTS5 virtual tables with BM25 scoring + auto-sync triggers
- Hybrid Merge: weighted fusion of vector + keyword results (configurable weights)
- Embeddings: provider abstraction (OpenAI, custom URL, noop fallback)
- Chunking: line-based markdown chunker with heading preservation
- Caching: embedding_cache table with LRU eviction
- Safe Reindex: rebuild FTS5 + re-embed missing vectors

New modules:
- src/memory/embeddings.rs — EmbeddingProvider trait + OpenAI + Noop + factory
- src/memory/vector.rs — cosine similarity, vec↔bytes, ScoredResult, hybrid_merge
- src/memory/chunker.rs — markdown-aware document splitting

Upgraded:
- src/memory/sqlite.rs — FTS5 schema, embedding column, hybrid recall, cache, reindex
- src/config/schema.rs — MemoryConfig expanded with embedding/search settings
- All callers updated to pass api_key for embedding provider

739 tests passing, 0 clippy warnings (Rust 1.93.1), cargo-deny clean
This commit is contained in:
argenis de la rosa 2026-02-14 00:00:23 -05:00
parent 4fceba0740
commit 0e7f501fd6
10 changed files with 1423 additions and 96 deletions

234
src/memory/vector.rs Normal file
View file

@ -0,0 +1,234 @@
// Vector operations — cosine similarity, f32 vector <-> byte (de)serialization, keyword-score normalization, hybrid merge.
/// Cosine similarity between two vectors, clamped to the range `0.0..=1.0`.
///
/// The result depends only on the direction of the vectors, not on their
/// magnitude. Negative similarities (vectors pointing in opposing directions)
/// are clamped to `0.0`.
///
/// Returns `0.0` when:
/// - the slices differ in length or are empty,
/// - either vector has zero magnitude,
/// - the similarity is not a finite number (for example because a component
///   is NaN or infinite).
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }

    // Accumulate in f64: squares of f32 values cannot overflow there, and long
    // vectors lose less precision.
    let mut dot = 0.0_f64;
    let mut norm_a = 0.0_f64;
    let mut norm_b = 0.0_f64;
    for (&x, &y) in a.iter().zip(b) {
        let (x, y) = (f64::from(x), f64::from(y));
        dot += x * y;
        norm_a += x * x;
        norm_b += y * y;
    }

    // A zero-magnitude vector yields 0/0 = NaN and NaN/infinite components
    // yield NaN as well, so one finiteness check covers every degenerate
    // input without penalising small-but-valid vectors.
    let ratio = dot / (norm_a.sqrt() * norm_b.sqrt());
    if !ratio.is_finite() {
        return 0.0;
    }

    // Clamp to [0, 1]: absorbs rounding slightly above 1.0 and maps negative
    // similarities to 0. The narrowing cast is safe because the value is
    // already within [0, 1].
    #[allow(clippy::cast_possible_truncation)]
    let sim = ratio.clamp(0.0, 1.0) as f32;
    sim
}
/// Encode a slice of `f32` values as a flat byte buffer.
///
/// Each value contributes four bytes in little-endian order, and values appear
/// in the output in the same order as in `v`.
pub fn vec_to_bytes(v: &[f32]) -> Vec<u8> {
    let mut encoded = Vec::with_capacity(v.len() * std::mem::size_of::<f32>());
    encoded.extend(v.iter().flat_map(|value| value.to_le_bytes()));
    encoded
}
/// Decode a little-endian byte buffer into `f32` values.
///
/// The input is consumed four bytes at a time. A trailing remainder shorter
/// than four bytes is ignored.
pub fn bytes_to_vec(bytes: &[u8]) -> Vec<f32> {
    let mut values = Vec::with_capacity(bytes.len() / 4);
    for quad in bytes.chunks_exact(4) {
        // `chunks_exact(4)` only yields slices of length 4, so these indices
        // are always in bounds.
        values.push(f32::from_le_bytes([quad[0], quad[1], quad[2], quad[3]]));
    }
    values
}
/// A scored result for hybrid merging.
#[derive(Debug, Clone)]
pub struct ScoredResult {
    /// Identifier of the matched item, as supplied in the input result lists.
    pub id: String,
    /// Vector score for this id, or `None` if the id was absent from the
    /// vector results.
    pub vector_score: Option<f32>,
    /// Keyword score for this id after normalization, or `None` if the id was
    /// absent from the keyword results.
    pub keyword_score: Option<f32>,
    /// Weighted combination of the two scores; a missing score counts as `0.0`.
    pub final_score: f32,
}

/// Hybrid merge: combine vector and keyword results with weighted fusion.
///
/// Vector scores are used as given. Keyword scores are divided by the largest
/// keyword score in the input, so the best keyword hit becomes `1.0`; if that
/// maximum is below `f32::EPSILON` the scores are left unscaled. Then:
///
/// `final_score` = `vector_weight` * `vector_score` + `keyword_weight` * `keyword_score`
///
/// where a score missing from one source counts as `0.0`.
///
/// Deduplicates by id, keeping the best (largest) score from each source when
/// an id appears more than once in the same list.
///
/// The output is sorted by `final_score` descending, with ties broken by `id`
/// ascending so the order is deterministic. NaN scores sort last. At most
/// `limit` results are returned.
///
/// NOTE(review): keyword scores are treated as "larger is better". SQLite
/// FTS5's raw `bm25()` is negative with smaller meaning better — confirm the
/// caller flips the sign before passing scores in.
pub fn hybrid_merge(
    vector_results: &[(String, f32)],  // (id, cosine_similarity)
    keyword_results: &[(String, f32)], // (id, bm25_score)
    vector_weight: f32,
    keyword_weight: f32,
    limit: usize,
) -> Vec<ScoredResult> {
    use std::collections::HashMap;

    /// Stores `candidate` in `slot` unless `slot` already holds a larger score.
    fn keep_best(slot: &mut Option<f32>, candidate: f32) {
        *slot = Some(match *slot {
            Some(current) => current.max(candidate),
            None => candidate,
        });
    }

    // id -> (vector score, normalized keyword score). Keys borrow from the
    // inputs so each id is cloned once, when the final result is built.
    let mut merged: HashMap<&str, (Option<f32>, Option<f32>)> =
        HashMap::with_capacity(vector_results.len() + keyword_results.len());

    // Vector scores are taken as-is (cosine similarity is already 0–1).
    for (id, score) in vector_results {
        keep_best(&mut merged.entry(id.as_str()).or_insert((None, None)).0, *score);
    }

    // Scale keyword scores by the maximum so they are comparable to the
    // vector scores.
    let max_kw = keyword_results
        .iter()
        .map(|(_, s)| *s)
        .fold(0.0_f32, f32::max);
    // Guard against dividing by zero when no keyword score is positive.
    let max_kw = if max_kw < f32::EPSILON { 1.0 } else { max_kw };
    for (id, score) in keyword_results {
        keep_best(
            &mut merged.entry(id.as_str()).or_insert((None, None)).1,
            score / max_kw,
        );
    }

    // Compute final scores.
    let mut results: Vec<ScoredResult> = merged
        .into_iter()
        .map(|(id, (vector_score, keyword_score))| ScoredResult {
            id: id.to_owned(),
            vector_score,
            keyword_score,
            final_score: vector_weight * vector_score.unwrap_or(0.0)
                + keyword_weight * keyword_score.unwrap_or(0.0),
        })
        .collect();

    // Mapping NaN to the lowest rank makes the comparator a total order (so
    // sorting can never panic); the id tie-break makes the output independent
    // of HashMap iteration order.
    let rank = |score: f32| if score.is_nan() { f32::NEG_INFINITY } else { score };
    results.sort_unstable_by(|a, b| {
        rank(b.final_score)
            .partial_cmp(&rank(a.final_score))
            .unwrap_or(std::cmp::Ordering::Equal)
            .then_with(|| a.id.cmp(&b.id))
    });
    results.truncate(limit);
    results
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance used for approximate float comparisons.
    const TOLERANCE: f32 = 1e-3;

    // ── cosine_similarity ────────────────────────────────────────

    #[test]
    fn cosine_identical_vectors() {
        let v = [1.0_f32, 2.0, 3.0];
        let sim = cosine_similarity(&v, &v);
        assert!((sim - 1.0).abs() < TOLERANCE);
    }

    #[test]
    fn cosine_orthogonal_vectors() {
        let a = [1.0_f32, 0.0, 0.0];
        let b = [0.0_f32, 1.0, 0.0];
        let sim = cosine_similarity(&a, &b);
        assert!(sim.abs() < TOLERANCE);
    }

    #[test]
    fn cosine_similar_vectors() {
        let a = [1.0_f32, 2.0, 3.0];
        let b = [1.1_f32, 2.1, 3.1];
        let sim = cosine_similarity(&a, &b);
        assert!(sim > 0.99);
    }

    #[test]
    fn cosine_opposite_vectors_clamped_to_zero() {
        // The raw cosine is -1.0; the function clamps into [0, 1].
        let a = [1.0_f32, 0.0];
        let b = [-1.0_f32, 0.0];
        assert_eq!(cosine_similarity(&a, &b), 0.0);
    }

    #[test]
    fn cosine_empty_returns_zero() {
        assert_eq!(cosine_similarity(&[], &[]), 0.0);
    }

    #[test]
    fn cosine_mismatched_lengths() {
        assert_eq!(cosine_similarity(&[1.0], &[1.0, 2.0]), 0.0);
    }

    #[test]
    fn cosine_zero_vector() {
        let a = [0.0_f32, 0.0, 0.0];
        let b = [1.0_f32, 2.0, 3.0];
        assert_eq!(cosine_similarity(&a, &b), 0.0);
    }

    // ── vec_to_bytes / bytes_to_vec ──────────────────────────────

    #[test]
    fn vec_bytes_roundtrip() {
        // 3.25 rather than 3.14: the latter trips clippy::approx_constant.
        let original = vec![1.0_f32, -2.5, 3.25, 0.0, f32::MAX];
        let bytes = vec_to_bytes(&original);
        assert_eq!(bytes.len(), original.len() * 4);
        let restored = bytes_to_vec(&bytes);
        assert_eq!(original, restored);
    }

    #[test]
    fn vec_bytes_empty() {
        let bytes = vec_to_bytes(&[]);
        assert!(bytes.is_empty());
        let restored = bytes_to_vec(&bytes);
        assert!(restored.is_empty());
    }

    #[test]
    fn bytes_to_vec_ignores_trailing_partial_chunk() {
        let mut bytes = vec_to_bytes(&[1.5]);
        bytes.push(0xAB);
        assert_eq!(bytes_to_vec(&bytes), vec![1.5_f32]);
    }

    // ── hybrid_merge ─────────────────────────────────────────────

    #[test]
    fn hybrid_merge_vector_only() {
        let vec_results: Vec<(String, f32)> =
            vec![(String::from("a"), 0.9), (String::from("b"), 0.5)];
        let merged = hybrid_merge(&vec_results, &[], 0.7, 0.3, 10);
        assert_eq!(merged.len(), 2);
        assert_eq!(merged[0].id, "a");
        assert_eq!(merged[1].id, "b");
        assert!(merged[0].final_score > merged[1].final_score);
        assert!(merged[0].keyword_score.is_none());
    }

    #[test]
    fn hybrid_merge_keyword_only() {
        let kw_results: Vec<(String, f32)> =
            vec![(String::from("x"), 10.0), (String::from("y"), 5.0)];
        let merged = hybrid_merge(&[], &kw_results, 0.7, 0.3, 10);
        assert_eq!(merged.len(), 2);
        assert_eq!(merged[0].id, "x");
        // Keyword scores are scaled by the maximum, so the top hit is 1.0.
        assert_eq!(merged[0].keyword_score, Some(1.0));
        assert_eq!(merged[1].keyword_score, Some(0.5));
        assert!(merged[0].vector_score.is_none());
    }

    #[test]
    fn hybrid_merge_deduplicates() {
        let vec_results: Vec<(String, f32)> = vec![(String::from("a"), 0.9)];
        let kw_results: Vec<(String, f32)> = vec![(String::from("a"), 10.0)];
        let merged = hybrid_merge(&vec_results, &kw_results, 0.7, 0.3, 10);
        assert_eq!(merged.len(), 1);
        assert_eq!(merged[0].id, "a");
        // Should have both scores
        assert!(merged[0].vector_score.is_some());
        assert!(merged[0].keyword_score.is_some());
        // 0.7 * 0.9 + 0.3 * 1.0
        assert!((merged[0].final_score - 0.93).abs() < TOLERANCE);
    }

    #[test]
    fn hybrid_merge_respects_limit() {
        let vec_results: Vec<(String, f32)> = (0..20_u8)
            .map(|i| (format!("item_{i}"), 1.0 - f32::from(i) * 0.05))
            .collect();
        let merged = hybrid_merge(&vec_results, &[], 1.0, 0.0, 5);
        assert_eq!(merged.len(), 5);
        // Truncation must keep the highest-scoring entries.
        assert_eq!(merged[0].id, "item_0");
        assert!(merged[0].final_score > merged[4].final_score);
    }

    #[test]
    fn hybrid_merge_empty_inputs() {
        let merged = hybrid_merge(&[], &[], 0.7, 0.3, 10);
        assert!(merged.is_empty());
    }
}