internationalization/
range.rs

1use alloc::format;
2use alloc::string::String;
3use alloc::vec::Vec;
4use core::ops::Range;
5
// Code point ranges for the scripts referenced by the locale table below.
//
// Each constant is a half-open `Range<u32>`: `start` is included, `end` is not.
//
// NOTE(review): most `end` values equal the last code point of the matching
// Unicode block (e.g. U+00FF, U+9FFF), which a half-open range leaves out.
// Confirm whether excluding that final code point is intended.

// --- Latin ---

/// Basic Latin: U+0020 through U+007E.
pub const BASIC_LATIN: Range<u32> = 0x0020..0x007F;
/// Latin-1 Supplement: U+00A0 through U+00FE.
pub const LATIN_1_SUPPLEMENT: Range<u32> = 0x00A0..0x00FF;
/// Latin Extended-A: U+0100 through U+017E.
pub const LATIN_EXTENDED_A: Range<u32> = 0x0100..0x017F;
/// Latin Extended-B: U+0180 through U+024E.
pub const LATIN_EXTENDED_B: Range<u32> = 0x0180..0x024F;
/// Latin Extended Additional: U+1E00 through U+1EFE.
pub const LATIN_EXTENDED_ADDITIONAL: Range<u32> = 0x1E00..0x1EFF;
/// Vietnamese letters: U+1EA0 through U+1EFE (a sub-range of
/// `LATIN_EXTENDED_ADDITIONAL`).
pub const VIETNAMESE_EXTENSIONS: Range<u32> = 0x1EA0..0x1EFF;

// --- Greek and Cyrillic ---

/// Greek: U+0370 through U+03FE.
pub const GREEK: Range<u32> = 0x0370..0x03FF;
/// Greek Extended: U+1F00 through U+1FFE.
pub const GREEK_EXTENDED: Range<u32> = 0x1F00..0x1FFF;
/// Cyrillic: U+0400 through U+04FE.
pub const CYRILLIC: Range<u32> = 0x0400..0x04FF;
/// Cyrillic Supplement: U+0500 through U+052E.
pub const CYRILLIC_SUPPLEMENT: Range<u32> = 0x0500..0x052F;

// --- Hebrew and Arabic ---

/// Hebrew: U+0590 through U+05FE.
pub const HEBREW: Range<u32> = 0x0590..0x05FF;
/// Arabic: U+0600 through U+06FE.
pub const ARABIC: Range<u32> = 0x0600..0x06FF;
/// Arabic Supplement: U+0750 through U+077E.
pub const ARABIC_SUPPLEMENT: Range<u32> = 0x0750..0x077F;

// --- South and Southeast Asian scripts ---

/// Devanagari: U+0900 through U+097E.
pub const DEVANAGARI: Range<u32> = 0x0900..0x097F;
/// Thai: U+0E00 through U+0E7E.
pub const THAI: Range<u32> = 0x0E00..0x0E7F;

// --- East Asian scripts ---

/// Hiragana: U+3040 through U+309E.
pub const HIRAGANA: Range<u32> = 0x3040..0x309F;
/// Katakana: U+30A0 through U+30FE.
pub const KATAKANA: Range<u32> = 0x30A0..0x30FF;
/// CJK Unified Ideographs: U+4E00 through U+9FFE.
pub const CJK_UNIFIED_IDEOGRAPHS: Range<u32> = 0x4E00..0x9FFF;
/// Hangul Syllables: U+AC00 through U+D7AE.
pub const HANGUL_SYLLABLES: Range<u32> = 0xAC00..0xD7AF;
25
26pub fn get_locale_ranges(locale: &str) -> Option<&'static [Range<u32>]> {
27    let ranges: &[Range<u32>] = match locale {
28        // English
29        "en" => &[BASIC_LATIN],
30
31        // French
32        "fr" => &[
33            BASIC_LATIN,
34            LATIN_1_SUPPLEMENT,
35            LATIN_EXTENDED_A,
36            LATIN_EXTENDED_B,
37        ],
38
39        // German
40        "de" => &[BASIC_LATIN, LATIN_1_SUPPLEMENT, LATIN_EXTENDED_A],
41
42        // Spanish
43        "es" => &[BASIC_LATIN, LATIN_1_SUPPLEMENT],
44
45        // Italian
46        "it" => &[BASIC_LATIN, LATIN_1_SUPPLEMENT],
47
48        // Portuguese
49        "pt" => &[BASIC_LATIN, LATIN_1_SUPPLEMENT, LATIN_EXTENDED_A],
50
51        // Dutch
52        "nl" => &[BASIC_LATIN, LATIN_1_SUPPLEMENT],
53
54        // Swedish, Norwegian, Danish
55        "sv" | "no" | "da" => &[BASIC_LATIN, LATIN_1_SUPPLEMENT],
56
57        // Polish
58        "pl" => &[BASIC_LATIN, LATIN_1_SUPPLEMENT, LATIN_EXTENDED_A],
59
60        // Czech, Slovak
61        "cs" | "sk" => &[BASIC_LATIN, LATIN_1_SUPPLEMENT, LATIN_EXTENDED_A],
62
63        // Hungarian
64        "hu" => &[BASIC_LATIN, LATIN_1_SUPPLEMENT, LATIN_EXTENDED_A],
65
66        // Romanian
67        "ro" => &[BASIC_LATIN, LATIN_1_SUPPLEMENT, LATIN_EXTENDED_A],
68
69        // Turkish
70        "tr" => &[BASIC_LATIN, LATIN_1_SUPPLEMENT, LATIN_EXTENDED_A],
71
72        // Russian, Ukrainian, Belarusian
73        "ru" | "uk" | "be" => &[BASIC_LATIN, CYRILLIC, CYRILLIC_SUPPLEMENT],
74
75        // Greek
76        "el" => &[BASIC_LATIN, GREEK, GREEK_EXTENDED],
77
78        // Arabic
79        "ar" => &[BASIC_LATIN, ARABIC, ARABIC_SUPPLEMENT],
80
81        // Hebrew
82        "he" => &[BASIC_LATIN, HEBREW],
83
84        // Japanese
85        "ja" => &[BASIC_LATIN, HIRAGANA, KATAKANA, CJK_UNIFIED_IDEOGRAPHS],
86
87        // Chinese (Simplified and Traditional)
88        "zh" | "zh-CN" | "zh-TW" => &[BASIC_LATIN, CJK_UNIFIED_IDEOGRAPHS],
89
90        // Korean
91        "ko" => &[BASIC_LATIN, HANGUL_SYLLABLES, CJK_UNIFIED_IDEOGRAPHS],
92
93        // Hindi
94        "hi" => &[BASIC_LATIN, DEVANAGARI],
95
96        // Thai
97        "th" => &[BASIC_LATIN, THAI],
98
99        // Vietnamese
100        "vi" => &[
101            BASIC_LATIN,
102            LATIN_1_SUPPLEMENT,
103            LATIN_EXTENDED_A,
104            LATIN_EXTENDED_ADDITIONAL,
105            VIETNAMESE_EXTENSIONS,
106        ],
107        _ => return None,
108    };
109
110    Some(ranges)
111}
112
/// Sorts `ranges` by start and coalesces every run of ranges that overlap or
/// touch into a single range.
///
/// Two ranges are joined when the later one starts at or before the end of
/// the range accumulated so far; the joined range keeps the larger end. The
/// result is ordered by start. An empty input produces an empty output.
pub fn merge_contiguous_ranges(ranges: Vec<Range<u32>>) -> Vec<Range<u32>> {
    let mut pending = ranges;
    // Stable sort: ranges sharing a start keep their input order.
    pending.sort_by_key(|span| span.start);

    let mut merged: Vec<Range<u32>> = Vec::new();
    for span in pending {
        match merged.last_mut() {
            // `span` begins inside, or exactly at the end of, the most recent
            // output range: grow that range instead of emitting a new one.
            Some(tail) if span.start <= tail.end => {
                tail.end = tail.end.max(span.end);
            }
            // First range, or a gap before `span`: start a new output range.
            _ => merged.push(span),
        }
    }

    merged
}
137
/// Renders a half-open range as decimal text.
///
/// A range holding exactly one value is written as that value (`5..6` gives
/// `"5"`); any other range is written as `first-last`, where `last` is the
/// final included value, `end - 1` (`32..127` gives `"32-126"`).
pub fn format_range(range: &Range<u32>) -> String {
    let first = range.start;

    // Exactly one value in the range: print it on its own.
    if first + 1 == range.end {
        return format!("{}", first);
    }

    // `end` is exclusive, so step back one to get the last included value.
    let last = range.end - 1;
    format!("{}-{}", first, last)
}
145
146pub fn format_ranges<'a>(ranges: impl IntoIterator<Item = &'a Range<u32>>) -> String {
147    ranges
148        .into_iter()
149        .map(format_range)
150        .collect::<Vec<String>>()
151        .join(",")
152}
153
#[cfg(test)]
mod tests {
    use super::*;

    use alloc::vec;

    /// Runs `merge_contiguous_ranges` on `input` and compares the whole result
    /// (length and every element) against `expected`.
    fn assert_merges_to(input: Vec<Range<u32>>, expected: &[Range<u32>]) {
        let merged = merge_contiguous_ranges(input);
        assert_eq!(merged.as_slice(), expected);
    }

    #[test]
    fn test_get_locale_ranges_english() {
        let ranges = get_locale_ranges("en").expect("`en` is a known locale");
        assert_eq!(ranges.len(), 1);
        assert_eq!(ranges.first(), Some(&BASIC_LATIN));
    }

    #[test]
    fn test_get_locale_ranges_french() {
        let ranges = get_locale_ranges("fr").expect("`fr` is a known locale");
        assert_eq!(ranges.len(), 4);
        for block in &[BASIC_LATIN, LATIN_1_SUPPLEMENT] {
            assert!(ranges.contains(block));
        }
    }

    #[test]
    fn test_get_locale_ranges_russian() {
        let ranges = get_locale_ranges("ru").expect("`ru` is a known locale");
        assert_eq!(ranges.len(), 3);
        assert!(ranges.contains(&CYRILLIC));
    }

    #[test]
    fn test_get_locale_ranges_japanese() {
        let ranges = get_locale_ranges("ja").expect("`ja` is a known locale");
        assert_eq!(ranges.len(), 4);
        for block in &[HIRAGANA, KATAKANA, CJK_UNIFIED_IDEOGRAPHS] {
            assert!(ranges.contains(block));
        }
    }

    #[test]
    fn test_get_locale_ranges_invalid() {
        for unknown in &["invalid", "xx"] {
            assert!(get_locale_ranges(unknown).is_none());
        }
    }

    #[test]
    fn test_merge_contiguous_ranges_empty() {
        assert_merges_to(vec![], &[]);
    }

    #[test]
    fn test_merge_contiguous_ranges_single() {
        assert_merges_to(vec![0x20..0x7F], &[0x20..0x7F]);
    }

    #[test]
    fn test_merge_contiguous_ranges_adjacent() {
        assert_merges_to(vec![0x20..0x7F, 0x7F..0xFF], &[0x20..0xFF]);
    }

    #[test]
    fn test_merge_contiguous_ranges_overlapping() {
        assert_merges_to(vec![0x20..0x80, 0x50..0xFF], &[0x20..0xFF]);
    }

    #[test]
    fn test_merge_contiguous_ranges_non_overlapping() {
        assert_merges_to(
            vec![0x20..0x7F, 0x100..0x17F],
            &[0x20..0x7F, 0x100..0x17F],
        );
    }

    #[test]
    fn test_merge_contiguous_ranges_unsorted() {
        assert_merges_to(
            vec![0x100..0x17F, 0x20..0x7F, 0x7F..0xFF],
            &[0x20..0xFF, 0x100..0x17F],
        );
    }

    #[test]
    fn test_merge_contiguous_ranges_multiple_groups() {
        assert_merges_to(
            vec![0x20..0x7F, 0x7F..0xFF, 0x200..0x2FF, 0x2FF..0x3FF],
            &[0x20..0xFF, 0x200..0x3FF],
        );
    }

    #[test]
    fn test_unicode_range_boundaries() {
        assert_eq!((BASIC_LATIN.start, BASIC_LATIN.end), (0x20, 0x7F));
        assert_eq!(
            (CJK_UNIFIED_IDEOGRAPHS.start, CJK_UNIFIED_IDEOGRAPHS.end),
            (0x4E00, 0x9FFF)
        );
    }

    #[test]
    fn test_get_locale_ranges_aliases() {
        // Chinese variants first, then the Scandinavian languages.
        let aliases = ["zh", "zh-CN", "zh-TW", "sv", "no", "da"];
        for locale in &aliases {
            assert!(get_locale_ranges(locale).is_some());
        }
    }
}