test: 130 edge case tests + fix NaN/Infinity bug in cosine_similarity

Edge cases found 2 real bugs:
- cosine_similarity(NaN, ...) returned NaN instead of 0.0
- cosine_similarity(Infinity, ...) returned NaN instead of 0.0
Fix: added is_finite() guards on denom and raw ratio.

New edge case tests by module:
- vector.rs (18): NaN, Infinity, negative vectors, opposite vectors clamped,
  high-dimensional (1536), single element, both-zero, non-aligned bytes,
  3-byte input, special float values, NaN roundtrip, limit=0, zero weights,
  negative BM25 scores, duplicate IDs, large normalization, single item
- embeddings.rs (8): noop embed_one error, empty batch, multiple texts,
  empty/unknown provider, custom empty URL, no API key, trailing slash, dims
- chunker.rs (12): headings-only, deeply nested ####, long single line,
  whitespace-only, max_tokens=0, max_tokens=1, unicode/emoji, FTS5 special
  chars, multiple blank lines, trailing heading, single heading, no content loss
- sqlite.rs (23): FTS5 quotes/asterisks/parens, SQL injection, empty
  content/key, 100KB content, unicode+emoji, newlines+tabs, single char
  query, limit=0/1, key matching, unicode query, schema idempotency,
  triple open, ghost results after forget, forget+re-store cycle,
  reindex empty/twice, content_hash empty/unicode/long, category
  roundtrip with spaces/empty, list custom category, list empty DB

869 tests passing, 0 clippy warnings, cargo-deny clean
This commit is contained in:
argenis de la rosa 2026-02-14 00:28:55 -05:00
parent 0e7f501fd6
commit ce4f36a3ab
4 changed files with 649 additions and 2 deletions

View file

@ -256,4 +256,112 @@ mod tests {
let chunks = chunk_markdown(text, 512);
assert_eq!(chunks.len(), 1);
}
// ── Edge cases ───────────────────────────────────────────────
/// A document consisting solely of heading lines must still yield at least one chunk.
#[test]
fn headings_only_no_body() {
    let headings_only = "# Title\n## Section A\n## Section B\n### Subsection";
    // Only non-emptiness is pinned here; the exact number of chunks produced
    // for body-less headings is deliberately left unchecked.
    let produced = chunk_markdown(headings_only, 512);
    assert!(!produced.is_empty());
}
/// A level-4 heading (`####`) is expected not to act as a split point.
///
/// The assertions verify only that the deep heading's text and the body that
/// follows it both survive chunking.
#[test]
fn deeply_nested_headings_ignored() {
    let doc = "# Top\nIntro\n#### Deep heading\nDeep content";
    let pieces = chunk_markdown(doc, 512);
    assert!(!pieces.is_empty());

    // Concatenate every chunk body so the checks do not depend on where
    // chunk boundaries happen to fall.
    let mut joined = String::new();
    for piece in pieces.iter() {
        joined.push_str(&piece.content);
    }

    assert!(joined.contains("Deep heading"));
    assert!(joined.contains("Deep content"));
}
/// One enormous line without any newline must be chunked without panicking
/// and must produce at least one chunk.
#[test]
fn very_long_single_line_no_newlines() {
    // 5000 repetitions of "word " and not a single line break, so there is
    // no line boundary available to split on.
    let giant_line = "word ".repeat(5000);
    let pieces = chunk_markdown(giant_line.as_str(), 50);
    assert!(!pieces.is_empty());
}
/// Input made up exclusively of newlines and spaces yields no chunks at all.
#[test]
fn only_newlines_and_whitespace() {
    let blank_input = "\n\n\n \n\n";
    let pieces = chunk_markdown(blank_input, 512);
    assert!(pieces.is_empty());
}
/// A token budget of zero must neither panic nor loop forever, and must still
/// return at least one chunk for non-empty input.
#[test]
fn max_tokens_zero() {
    // NOTE(review): a budget of 0 presumably maps to a zero-character limit
    // inside the chunker, so every piece is over budget and splitting is
    // maximal — confirm against the chunker implementation.
    let budget = 0;
    let pieces = chunk_markdown("Hello world", budget);
    assert!(!pieces.is_empty());
}
/// A token budget of one forces very aggressive splitting but must still
/// produce output for a small multi-line document.
#[test]
fn max_tokens_one() {
    // NOTE(review): the original author equates 1 token with a 4-character
    // limit — confirm against the chunker implementation.
    let three_lines = "Line one\nLine two\nLine three";
    assert!(!chunk_markdown(three_lines, 1).is_empty());
}
/// Multi-byte text (CJK, accented Latin, emoji) must pass through chunking intact.
#[test]
fn unicode_content() {
    let doc = "# 日本語\nこんにちは世界\n\n## Émojis\n🦀 Rust is great 🚀";
    let pieces = chunk_markdown(doc, 512);
    assert!(!pieces.is_empty());

    // Merge all chunk bodies; the checks below are independent of chunk boundaries.
    let mut merged = String::new();
    for piece in pieces.iter() {
        merged.push_str(&piece.content);
    }

    assert!(merged.contains("こんにちは"));
    assert!(merged.contains("🦀"));
}
/// Characters that are special in FTS5 query syntax (quotes, parentheses,
/// asterisks) must be preserved verbatim in the chunk content.
#[test]
fn fts5_special_chars_in_content() {
    // Raw string literals avoid escaping the embedded double quotes.
    let doc = r#"Content with "quotes" and (parentheses) and * asterisks *"#;
    let pieces = chunk_markdown(doc, 512);
    assert_eq!(pieces.len(), 1);

    let only = &pieces[0];
    assert!(only.content.contains(r#""quotes""#));
}
/// Runs of blank lines between paragraphs do not cause extra chunks when the
/// whole text fits within the budget.
#[test]
fn multiple_blank_lines_between_paragraphs() {
    let doc = "Paragraph one.\n\n\n\n\nParagraph two.\n\n\n\nParagraph three.";
    let pieces = chunk_markdown(doc, 512);

    // Everything fits in a single chunk.
    assert_eq!(pieces.len(), 1);

    let body = &pieces[0].content;
    assert!(body.contains("Paragraph one"));
    assert!(body.contains("Paragraph three"));
}
/// A heading that is the very last line (with no body after it) must not
/// break chunking.
#[test]
fn heading_at_end_of_text() {
    let doc = "Some content\n# Trailing Heading";
    assert!(!chunk_markdown(doc, 512).is_empty());
}
/// A lone heading line yields exactly one chunk whose `heading` field holds
/// the full heading line, including its `#` marker.
#[test]
fn single_heading_no_content() {
    let doc = "# Just a heading";
    let pieces = chunk_markdown(doc, 512);
    assert_eq!(pieces.len(), 1);

    let only = &pieces[0];
    assert_eq!(only.heading.as_deref(), Some("# Just a heading"));
}
/// Every body line of a multi-section document must survive chunking.
///
/// The previous version only looked for the words "Content", "line", "1" and
/// "2", all of which occur in section A alone, so silently dropping sections
/// B and C would have gone unnoticed. This version checks each non-heading,
/// non-blank line of the fixture.
#[test]
fn no_content_loss() {
    let text = "# A\nContent A line 1\nContent A line 2\n\n## B\nContent B\n\n## C\nContent C";
    let chunks = chunk_markdown(text, 512);

    // Rebuild one searchable string from all chunk bodies, newline-separated.
    let mut reassembled = String::new();
    for chunk in chunks.iter() {
        reassembled.push_str(&chunk.content);
        reassembled.push('\n');
    }

    // Derive the expected lines from the fixture itself so the check stays in
    // sync with it. Heading lines are skipped: whether heading text is kept in
    // `content` or only in the `heading` field is not visible from this test.
    let body_lines = text
        .lines()
        .filter(|line| !line.trim().is_empty() && !line.starts_with('#'));

    for line in body_lines {
        assert!(
            reassembled.contains(line),
            "Missing line '{line}' in reassembled chunks"
        );
    }
}
}