Fixes Issue #55: Unicode string truncation causes panics with non-ASCII input.

Previously, code used byte-index slicing (`&s[..n]`), which panics when the slice boundary falls in the middle of a multi-byte UTF-8 character (emoji, CJK, accented characters).

Changes:
- Added `truncate_with_ellipsis()` helper in `src/util.rs` that uses `char_indices()` to find safe character boundaries.
- Replaced 2 unsafe truncations in `src/channels/mod.rs` with the safe helper.
- Added 12 comprehensive tests covering emoji, CJK, accented chars, and edge cases.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
134 lines
4.5 KiB
Rust
134 lines
4.5 KiB
Rust
//! Utility functions for ZeroClaw.
|
|
//!
|
|
//! This module contains reusable helper functions used across the codebase.
|
|
|
|
/// Truncate a string to at most `max_chars` characters, appending "..." if truncated.
///
/// This function safely handles multi-byte UTF-8 characters (emoji, CJK, accented characters)
/// by cutting on character boundaries instead of raw byte indices, so it never panics on
/// non-ASCII input.
///
/// A "character" here is a Rust `char` (a Unicode scalar value), not a user-perceived
/// grapheme cluster: a sequence built from several scalar values (for example a base letter
/// followed by a combining accent) counts as several characters and may be split.
///
/// # Arguments
/// * `s` - The string to truncate
/// * `max_chars` - Maximum number of characters to keep (excluding "...")
///
/// # Returns
/// * Original string (as an owned `String`) if it has at most `max_chars` characters
/// * The first `max_chars` characters with "..." appended otherwise; the result is therefore
///   up to `max_chars + 3` characters long
///
/// # Examples
/// ```
/// use zeroclaw::util::truncate_with_ellipsis;
///
/// // ASCII string - no truncation needed
/// assert_eq!(truncate_with_ellipsis("hello", 10), "hello");
///
/// // ASCII string - truncation needed
/// assert_eq!(truncate_with_ellipsis("hello world", 5), "hello...");
///
/// // Multi-byte UTF-8 (emoji) - safe truncation.
/// // The 8 kept characters are `H e l l o ␠ 🦀 ␠`, so the trailing space is preserved.
/// assert_eq!(truncate_with_ellipsis("Hello 🦀 World", 8), "Hello 🦀 ...");
/// assert_eq!(truncate_with_ellipsis("😀😀😀😀", 2), "😀😀...");
///
/// // Empty string
/// assert_eq!(truncate_with_ellipsis("", 10), "");
///
/// // A limit of zero keeps nothing but still signals truncation
/// assert_eq!(truncate_with_ellipsis("hello", 0), "...");
/// ```
pub fn truncate_with_ellipsis(s: &str, max_chars: usize) -> String {
    // `nth(max_chars)` yields the byte offset of the first character that must be dropped.
    // That offset is always a valid char boundary, so slicing up to it cannot panic.
    match s.char_indices().nth(max_chars) {
        Some((idx, _)) => format!("{}...", &s[..idx]),
        // Fewer than or exactly `max_chars` characters: nothing to cut.
        None => s.to_string(),
    }
}
|
|
|
|
#[cfg(test)]
mod tests {
    //! Unit tests for `truncate_with_ellipsis`, covering ASCII input, multi-byte UTF-8
    //! input (accented, CJK, emoji) and boundary values of `max_chars`.

    use super::*;

    #[test]
    fn test_truncate_ascii_no_truncation() {
        // ASCII string shorter than limit - no change
        assert_eq!(truncate_with_ellipsis("hello", 10), "hello");
        assert_eq!(truncate_with_ellipsis("hello world", 50), "hello world");
    }

    #[test]
    fn test_truncate_ascii_with_truncation() {
        // ASCII string longer than limit - truncates
        assert_eq!(truncate_with_ellipsis("hello world", 5), "hello...");
        assert_eq!(
            truncate_with_ellipsis("This is a long message", 10),
            "This is a ..."
        );
    }

    #[test]
    fn test_truncate_empty_string() {
        assert_eq!(truncate_with_ellipsis("", 10), "");
    }

    #[test]
    fn test_truncate_at_exact_boundary() {
        // String exactly at boundary - no truncation
        assert_eq!(truncate_with_ellipsis("hello", 5), "hello");
    }

    #[test]
    fn test_truncate_emoji_single() {
        // Single emoji (4 bytes) - should not panic
        let s = "🦀";
        assert_eq!(truncate_with_ellipsis(s, 10), s);
        assert_eq!(truncate_with_ellipsis(s, 1), s);
    }

    #[test]
    fn test_truncate_emoji_multiple() {
        // Multiple emoji - safe truncation at character boundary
        let s = "😀😀😀😀"; // 4 emoji, each 4 bytes = 16 bytes total
        assert_eq!(truncate_with_ellipsis(s, 2), "😀😀...");
        assert_eq!(truncate_with_ellipsis(s, 3), "😀😀😀...");
    }

    #[test]
    fn test_truncate_mixed_ascii_emoji() {
        // Mixed ASCII and emoji; the 8th kept character is the space after the crab
        assert_eq!(truncate_with_ellipsis("Hello 🦀 World", 8), "Hello 🦀 ...");
        assert_eq!(truncate_with_ellipsis("Hi 😊", 10), "Hi 😊");
    }

    #[test]
    fn test_truncate_cjk_characters() {
        // CJK characters (Chinese - each is 3 bytes).
        // Byte slicing would panic here: `&s[..50]` lands inside the 17th character,
        // which occupies bytes 48..51.
        let s = "这是一个测试消息用来触发崩溃的中文"; // 17 characters, 51 bytes
        assert_eq!(s.chars().count(), 17);

        // 50 bytes is ~16 characters, so truncate to 16 characters instead.
        let result = truncate_with_ellipsis(s, 16);

        // Exactly the first 16 characters survive, followed by the ellipsis.
        assert_eq!(result, "这是一个测试消息用来触发崩溃的中...");
        assert!(result.ends_with("..."));
        assert_eq!(result.chars().count(), 16 + 3);
    }

    #[test]
    fn test_truncate_accented_characters() {
        // Accented characters (2 bytes each in UTF-8)
        let s = "café résumé naïve";
        assert_eq!(truncate_with_ellipsis(s, 10), "café résum...");
    }

    #[test]
    fn test_truncate_unicode_edge_case() {
        // Mix of 1-byte, 2-byte, 3-byte, and 4-byte characters
        let s = "aé你好🦀"; // 1 + 2 + 3 + 3 + 4 bytes = 13 bytes, 5 chars
        assert_eq!(s.chars().count(), 5);
        assert_eq!(truncate_with_ellipsis(s, 3), "aé你...");
    }

    #[test]
    fn test_truncate_long_string() {
        // Long ASCII string
        let s = "a".repeat(200);
        let result = truncate_with_ellipsis(&s, 50);
        assert_eq!(result.len(), 53); // 50 + "..."
        assert!(result.ends_with("..."));
    }

    #[test]
    fn test_truncate_zero_max_chars() {
        // Edge case: max_chars = 0 keeps no characters but still appends the ellipsis
        assert_eq!(truncate_with_ellipsis("hello", 0), "...");
    }
}
|