test: 130 edge case tests + fix NaN/Infinity bug in cosine_similarity
Edge cases found 2 real bugs:
- cosine_similarity(NaN, ...) returned NaN instead of 0.0
- cosine_similarity(Infinity, ...) returned NaN instead of 0.0

Fix: added is_finite() guards on denom and raw ratio.

New edge case tests by module:
- vector.rs (18): NaN, Infinity, negative vectors, opposite vectors clamped, high-dimensional (1536), single element, both-zero, non-aligned bytes, 3-byte input, special float values, NaN roundtrip, limit=0, zero weights, negative BM25 scores, duplicate IDs, large normalization, single item
- embeddings.rs (8): noop embed_one error, empty batch, multiple texts, empty/unknown provider, custom empty URL, no API key, trailing slash, dims
- chunker.rs (11): headings-only, deeply nested ####, long single line, whitespace-only, max_tokens=0, max_tokens=1, unicode/emoji, FTS5 special chars, multiple blank lines, trailing heading, no content loss
- sqlite.rs (23): FTS5 quotes/asterisks/parens, SQL injection, empty content/key, 100KB content, unicode+emoji, newlines+tabs, single char query, limit=0/1, key matching, unicode query, schema idempotency, triple open, ghost results after forget, forget+re-store cycle, reindex empty/twice, content_hash empty/unicode/long, category roundtrip with spaces/empty, list custom category, list empty DB

869 tests passing, 0 clippy warnings, cargo-deny clean
This commit is contained in:
parent
0e7f501fd6
commit
ce4f36a3ab
4 changed files with 649 additions and 2 deletions
|
|
@@ -256,4 +256,112 @@ mod tests {
|
|||
let chunks = chunk_markdown(text, 512);
|
||||
assert_eq!(chunks.len(), 1);
|
||||
}
|
||||
|
||||
// ── Edge cases ───────────────────────────────────────────────
|
||||
|
||||
#[test]
fn headings_only_no_body() {
    // A document of nothing but headings (no body text under any of
    // them) must still produce at least one chunk.
    let input = "# Title\n## Section A\n## Section B\n### Subsection";
    let produced = chunk_markdown(input, 512);
    assert!(!produced.is_empty());
}
|
||||
|
||||
#[test]
fn deeply_nested_headings_ignored() {
    // Level-4 (####) and deeper headings are plain content, not split
    // points, so they must stay inside their parent section's chunk.
    let input = "# Top\nIntro\n#### Deep heading\nDeep content";
    let result = chunk_markdown(input, 512);
    assert!(!result.is_empty());
    let joined: String = result.iter().map(|c| c.content.as_str()).collect();
    assert!(joined.contains("Deep heading"));
    assert!(joined.contains("Deep content"));
}
|
||||
|
||||
#[test]
fn very_long_single_line_no_newlines() {
    // A single enormous line offers no newline split points; the
    // chunker must still return output rather than panic or loop.
    let giant = "word ".repeat(5000);
    let result = chunk_markdown(&giant, 50);
    assert!(!result.is_empty());
}
|
||||
|
||||
#[test]
fn only_newlines_and_whitespace() {
    // Input that is pure whitespace yields no chunks at all.
    let produced = chunk_markdown("\n\n\n \n\n", 512);
    assert!(produced.is_empty());
}
|
||||
|
||||
#[test]
fn max_tokens_zero() {
    // With a zero token budget every chunk exceeds the limit; the
    // chunker must terminate (no infinite loop) and still emit output,
    // splitting as aggressively as it can.
    let result = chunk_markdown("Hello world", 0);
    assert!(!result.is_empty());
}
|
||||
|
||||
#[test]
fn max_tokens_one() {
    // max_tokens=1 maps to a 4-character budget — the most aggressive
    // realistic split; must still produce chunks without panicking.
    let input = "Line one\nLine two\nLine three";
    let result = chunk_markdown(input, 1);
    assert!(!result.is_empty());
}
|
||||
|
||||
#[test]
fn unicode_content() {
    // Multi-byte text (CJK, accented Latin, emoji) must pass through
    // chunking intact with no byte-boundary corruption.
    let input = "# 日本語\nこんにちは世界\n\n## Émojis\n🦀 Rust is great 🚀";
    let produced = chunk_markdown(input, 512);
    assert!(!produced.is_empty());
    let combined: String = produced.iter().map(|c| c.content.as_str()).collect();
    assert!(combined.contains("こんにちは"));
    assert!(combined.contains("🦀"));
}
|
||||
|
||||
#[test]
fn fts5_special_chars_in_content() {
    // Characters that double as FTS5 query syntax (quotes, parens,
    // asterisks) are ordinary content here and must survive untouched.
    let input = "Content with \"quotes\" and (parentheses) and * asterisks *";
    let produced = chunk_markdown(input, 512);
    assert_eq!(produced.len(), 1);
    assert!(produced[0].content.contains("\"quotes\""));
}
|
||||
|
||||
#[test]
fn multiple_blank_lines_between_paragraphs() {
    // Runs of blank lines between paragraphs collapse into a single
    // chunk when everything fits within the budget.
    let input = "Paragraph one.\n\n\n\n\nParagraph two.\n\n\n\nParagraph three.";
    let produced = chunk_markdown(input, 512);
    assert_eq!(produced.len(), 1);
    let only = &produced[0];
    assert!(only.content.contains("Paragraph one"));
    assert!(only.content.contains("Paragraph three"));
}
|
||||
|
||||
#[test]
fn heading_at_end_of_text() {
    // A heading that is the very last line must not be silently
    // dropped by the chunker.
    let input = "Some content\n# Trailing Heading";
    let chunks = chunk_markdown(input, 512);
    assert!(!chunks.is_empty());
    // Stronger than the old non-empty check: the trailing heading
    // itself must survive, either as a chunk's heading (as shown by
    // the single-heading case) or inside some chunk body.
    assert!(chunks.iter().any(|c| {
        c.heading.as_deref() == Some("# Trailing Heading")
            || c.content.contains("Trailing Heading")
    }));
}
|
||||
|
||||
#[test]
fn single_heading_no_content() {
    // A lone heading with no body yields exactly one chunk carrying
    // the heading text.
    let produced = chunk_markdown("# Just a heading", 512);
    assert_eq!(produced.len(), 1);
    assert_eq!(produced[0].heading.as_deref(), Some("# Just a heading"));
}
|
||||
|
||||
#[test]
fn no_content_loss() {
    // Splitting across heading boundaries must preserve every body line.
    let text = "# A\nContent A line 1\nContent A line 2\n\n## B\nContent B\n\n## C\nContent C";
    let chunks = chunk_markdown(text, 512);
    let reassembled: String = chunks.iter().map(|c| format!("{}\n", c.content)).collect();
    // Check full body lines, not single characters: the old check of
    // ["Content", "line", "1", "2"] was nearly vacuous — "1" and "2"
    // match almost any text containing those digits. At a 512-token
    // budget no line is split mid-line, so whole phrases must appear.
    for phrase in [
        "Content A line 1",
        "Content A line 2",
        "Content B",
        "Content C",
    ] {
        assert!(
            reassembled.contains(phrase),
            "Missing phrase '{phrase}' in reassembled chunks"
        );
    }
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue