feat: full-stack search engine — FTS5, vector search, hybrid merge, embedding cache, chunker

The Full Stack (All Custom):
- Vector DB: embeddings stored as BLOB, cosine similarity in pure Rust
- Keyword Search: FTS5 virtual tables with BM25 scoring + auto-sync triggers
- Hybrid Merge: weighted fusion of vector + keyword results (configurable weights)
- Embeddings: provider abstraction (OpenAI, custom URL, noop fallback)
- Chunking: line-based markdown chunker with heading preservation
- Caching: embedding_cache table with LRU eviction
- Safe Reindex: rebuild FTS5 + re-embed missing vectors

New modules:
- src/memory/embeddings.rs — EmbeddingProvider trait + OpenAI + Noop + factory
- src/memory/vector.rs — cosine similarity, vec↔bytes, ScoredResult, hybrid_merge
- src/memory/chunker.rs — markdown-aware document splitting

Upgraded:
- src/memory/sqlite.rs — FTS5 schema, embedding column, hybrid recall, cache, reindex
- src/config/schema.rs — MemoryConfig expanded with embedding/search settings
- All callers updated to pass api_key for embedding provider

739 tests passing, 0 clippy warnings (Rust 1.93.1), cargo-deny clean
This commit is contained in:
argenis de la rosa 2026-02-14 00:00:23 -05:00
parent 4fceba0740
commit 0e7f501fd6
10 changed files with 1423 additions and 96 deletions

View file

@ -1,6 +1,9 @@
pub mod chunker;
pub mod embeddings;
pub mod markdown;
pub mod sqlite;
pub mod traits;
pub mod vector;
pub use markdown::MarkdownMemory;
pub use sqlite::SqliteMemory;
@ -10,14 +13,34 @@ pub use traits::{MemoryCategory, MemoryEntry};
use crate::config::MemoryConfig;
use std::path::Path;
use std::sync::Arc;
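/// Factory: create the memory backend selected by `config.backend`.
///
/// `"sqlite"` builds a `SqliteMemory` wired to an embedding provider (created from
/// the embedding settings in `config` and the optional `api_key`), using the configured
/// vector/keyword weights and embedding cache size. `"markdown"` and `"none"` use
/// `MarkdownMemory`. Any other value logs a warning about falling back to markdown.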
pub fn create_memory(
config: &MemoryConfig,
workspace_dir: &Path,
api_key: Option<&str>,
) -> anyhow::Result<Box<dyn Memory>> {
match config.backend.as_str() {
"sqlite" => Ok(Box::new(SqliteMemory::new(workspace_dir)?)),
"sqlite" => {
let embedder: Arc<dyn embeddings::EmbeddingProvider> =
Arc::from(embeddings::create_embedding_provider(
&config.embedding_provider,
api_key,
&config.embedding_model,
config.embedding_dimensions,
));
#[allow(clippy::cast_possible_truncation)]
let mem = SqliteMemory::with_embedder(
workspace_dir,
embedder,
config.vector_weight as f32,
config.keyword_weight as f32,
config.embedding_cache_size,
)?;
Ok(Box::new(mem))
}
"markdown" | "none" => Ok(Box::new(MarkdownMemory::new(workspace_dir))),
other => {
tracing::warn!("Unknown memory backend '{other}', falling back to markdown");
@ -36,9 +59,9 @@ mod tests {
let tmp = TempDir::new().unwrap();
let cfg = MemoryConfig {
backend: "sqlite".into(),
auto_save: true,
..MemoryConfig::default()
};
let mem = create_memory(&cfg, tmp.path()).unwrap();
let mem = create_memory(&cfg, tmp.path(), None).unwrap();
assert_eq!(mem.name(), "sqlite");
}
@ -47,9 +70,9 @@ mod tests {
let tmp = TempDir::new().unwrap();
let cfg = MemoryConfig {
backend: "markdown".into(),
auto_save: true,
..MemoryConfig::default()
};
let mem = create_memory(&cfg, tmp.path()).unwrap();
let mem = create_memory(&cfg, tmp.path(), None).unwrap();
assert_eq!(mem.name(), "markdown");
}
@ -58,9 +81,9 @@ mod tests {
let tmp = TempDir::new().unwrap();
let cfg = MemoryConfig {
backend: "none".into(),
auto_save: true,
..MemoryConfig::default()
};
let mem = create_memory(&cfg, tmp.path()).unwrap();
let mem = create_memory(&cfg, tmp.path(), None).unwrap();
assert_eq!(mem.name(), "markdown");
}
@ -69,9 +92,9 @@ mod tests {
let tmp = TempDir::new().unwrap();
let cfg = MemoryConfig {
backend: "redis".into(),
auto_save: true,
..MemoryConfig::default()
};
let mem = create_memory(&cfg, tmp.path()).unwrap();
let mem = create_memory(&cfg, tmp.path(), None).unwrap();
assert_eq!(mem.name(), "markdown");
}
}