test: 130 edge case tests + fix NaN/Infinity bug in cosine_similarity
Edge cases found 2 real bugs:
- cosine_similarity(NaN, ...) returned NaN instead of 0.0
- cosine_similarity(Infinity, ...) returned NaN instead of 0.0

Fix: added is_finite() guards on denom and raw ratio.

New edge case tests by module:
- vector.rs (18): NaN, Infinity, negative vectors, opposite vectors clamped, high-dimensional (1536), single element, both-zero, non-aligned bytes, 3-byte input, special float values, NaN roundtrip, limit=0, zero weights, negative BM25 scores, duplicate IDs, large normalization, single item
- embeddings.rs (8): noop embed_one error, empty batch, multiple texts, empty/unknown provider, custom empty URL, no API key, trailing slash, dims
- chunker.rs (11): headings-only, deeply nested ####, long single line, whitespace-only, max_tokens=0, max_tokens=1, unicode/emoji, FTS5 special chars, multiple blank lines, trailing heading, no content loss
- sqlite.rs (23): FTS5 quotes/asterisks/parens, SQL injection, empty content/key, 100KB content, unicode+emoji, newlines+tabs, single char query, limit=0/1, key matching, unicode query, schema idempotency, triple open, ghost results after forget, forget+re-store cycle, reindex empty/twice, content_hash empty/unicode/long, category roundtrip with spaces/empty, list custom category, list empty DB

869 tests passing, 0 clippy warnings, cargo-deny clean
This commit is contained in:
parent
0e7f501fd6
commit
ce4f36a3ab
4 changed files with 649 additions and 2 deletions
|
|
@@ -256,4 +256,112 @@ mod tests {
|
|||
let chunks = chunk_markdown(text, 512);
|
||||
assert_eq!(chunks.len(), 1);
|
||||
}
|
||||
|
||||
// ── Edge cases ───────────────────────────────────────────────
|
||||
|
||||
#[test]
fn headings_only_no_body() {
    // A document of nothing but headings (no body text under any of
    // them) must still produce at least one chunk.
    let input = "# Title\n## Section A\n## Section B\n### Subsection";
    let produced = chunk_markdown(input, 512);
    assert!(!produced.is_empty());
}
|
||||
|
||||
#[test]
fn deeply_nested_headings_ignored() {
    // Level-4 (####) and deeper headings are plain content, not split
    // points, so they must stay inside their parent section's chunk.
    let input = "# Top\nIntro\n#### Deep heading\nDeep content";
    let result = chunk_markdown(input, 512);
    assert!(!result.is_empty());
    let joined: String = result.iter().map(|c| c.content.as_str()).collect();
    assert!(joined.contains("Deep heading"));
    assert!(joined.contains("Deep content"));
}
|
||||
|
||||
#[test]
fn very_long_single_line_no_newlines() {
    // A single enormous line offers no newline split points; the
    // chunker must still return output rather than panic or loop.
    let giant = "word ".repeat(5000);
    let result = chunk_markdown(&giant, 50);
    assert!(!result.is_empty());
}
|
||||
|
||||
#[test]
fn only_newlines_and_whitespace() {
    // Input that is pure whitespace yields no chunks at all.
    let produced = chunk_markdown("\n\n\n \n\n", 512);
    assert!(produced.is_empty());
}
|
||||
|
||||
#[test]
fn max_tokens_zero() {
    // With a zero token budget every chunk exceeds the limit; the
    // chunker must terminate (no infinite loop) and still emit output,
    // splitting as aggressively as it can.
    let result = chunk_markdown("Hello world", 0);
    assert!(!result.is_empty());
}
|
||||
|
||||
#[test]
fn max_tokens_one() {
    // max_tokens=1 maps to a 4-character budget — the most aggressive
    // realistic split; must still produce chunks without panicking.
    let input = "Line one\nLine two\nLine three";
    let result = chunk_markdown(input, 1);
    assert!(!result.is_empty());
}
|
||||
|
||||
#[test]
fn unicode_content() {
    // Multi-byte text (CJK, accented Latin, emoji) must pass through
    // chunking intact with no byte-boundary corruption.
    let input = "# 日本語\nこんにちは世界\n\n## Émojis\n🦀 Rust is great 🚀";
    let produced = chunk_markdown(input, 512);
    assert!(!produced.is_empty());
    let combined: String = produced.iter().map(|c| c.content.as_str()).collect();
    assert!(combined.contains("こんにちは"));
    assert!(combined.contains("🦀"));
}
|
||||
|
||||
#[test]
fn fts5_special_chars_in_content() {
    // Characters that double as FTS5 query syntax (quotes, parens,
    // asterisks) are ordinary content here and must survive untouched.
    let input = "Content with \"quotes\" and (parentheses) and * asterisks *";
    let produced = chunk_markdown(input, 512);
    assert_eq!(produced.len(), 1);
    assert!(produced[0].content.contains("\"quotes\""));
}
|
||||
|
||||
#[test]
fn multiple_blank_lines_between_paragraphs() {
    // Runs of blank lines between paragraphs collapse into a single
    // chunk when everything fits within the budget.
    let input = "Paragraph one.\n\n\n\n\nParagraph two.\n\n\n\nParagraph three.";
    let produced = chunk_markdown(input, 512);
    assert_eq!(produced.len(), 1);
    let only = &produced[0];
    assert!(only.content.contains("Paragraph one"));
    assert!(only.content.contains("Paragraph three"));
}
|
||||
|
||||
#[test]
fn heading_at_end_of_text() {
    // A heading that is the very last line must not be silently
    // dropped by the chunker.
    let input = "Some content\n# Trailing Heading";
    let chunks = chunk_markdown(input, 512);
    assert!(!chunks.is_empty());
    // Stronger than the old non-empty check: the trailing heading
    // itself must survive, either as a chunk's heading (as shown by
    // the single-heading case) or inside some chunk body.
    assert!(chunks.iter().any(|c| {
        c.heading.as_deref() == Some("# Trailing Heading")
            || c.content.contains("Trailing Heading")
    }));
}
|
||||
|
||||
#[test]
fn single_heading_no_content() {
    // A lone heading with no body yields exactly one chunk carrying
    // the heading text.
    let produced = chunk_markdown("# Just a heading", 512);
    assert_eq!(produced.len(), 1);
    assert_eq!(produced[0].heading.as_deref(), Some("# Just a heading"));
}
|
||||
|
||||
#[test]
fn no_content_loss() {
    // Splitting across heading boundaries must preserve every body line.
    let text = "# A\nContent A line 1\nContent A line 2\n\n## B\nContent B\n\n## C\nContent C";
    let chunks = chunk_markdown(text, 512);
    let reassembled: String = chunks.iter().map(|c| format!("{}\n", c.content)).collect();
    // Check full body lines, not single characters: the old check of
    // ["Content", "line", "1", "2"] was nearly vacuous — "1" and "2"
    // match almost any text containing those digits. At a 512-token
    // budget no line is split mid-line, so whole phrases must appear.
    for phrase in [
        "Content A line 1",
        "Content A line 2",
        "Content B",
        "Content C",
    ] {
        assert!(
            reassembled.contains(phrase),
            "Missing phrase '{phrase}' in reassembled chunks"
        );
    }
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue