From 7a03b28e90323a5c11f08a634e6873e17ad735f7 Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Sat, 2 Aug 2025 23:21:12 +0100 Subject: [PATCH 01/10] fix: Fix panic in `unicode-table-generator` Commit 15acb0e0ec849453d9db928821cecc552bb51bfd introduced a panic when running `./x run tools/unicode-table-generator`. Fix it by undoing one of the refactors. --- src/tools/unicode-table-generator/src/raw_emitter.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index e9e0efc459442..45dc450c4f67d 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -339,10 +339,13 @@ impl Canonicalized { // We'll probably always have some slack though so this loop will still // be needed. for &w in unique_words { - unique_mapping.entry(w).or_insert_with(|| { + if !unique_mapping.contains_key(&w) { + assert_eq!( + unique_mapping.insert(w, UniqueMapping::Canonical(canonical_words.len())), + None + ); canonical_words.push(w); - UniqueMapping::Canonical(canonical_words.len()) - }); + } } assert_eq!(canonicalized_words.len() + canonical_words.len(), unique_words.len()); assert_eq!(unique_mapping.len(), unique_words.len()); From 55420b1d71c04904955e4b5c6d61f64283c310f2 Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Sat, 9 Aug 2025 23:52:11 +0100 Subject: [PATCH 02/10] refactor: Include table sizes in comment at top of `unicode_data.rs` To make changes in table size obvious from git diffs --- library/core/src/unicode/unicode_data.rs | 10 ++++++++++ src/tools/unicode-table-generator/src/main.rs | 20 +++++++++---------- 2 files changed, 19 insertions(+), 11 deletions(-) diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index b57234bbee9a2..6059f7d6450b1 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -1,4 +1,14 @@ ///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually! +// Alphabetic : 1727 bytes, 142759 codepoints in 757 ranges (U+000041 - U+0323B0) using skiplist +// Case_Ignorable : 1053 bytes, 2749 codepoints in 452 ranges (U+000027 - U+0E01F0) using skiplist +// Cased : 407 bytes, 4578 codepoints in 159 ranges (U+000041 - U+01F18A) using skiplist +// Cc : 9 bytes, 65 codepoints in 2 ranges (U+000000 - U+0000A0) using skiplist +// Grapheme_Extend : 887 bytes, 2193 codepoints in 375 ranges (U+000300 - U+0E01F0) using skiplist +// Lowercase : 935 bytes, 2569 codepoints in 675 ranges (U+000061 - U+01E944) using bitset +// N : 457 bytes, 1911 codepoints in 144 ranges (U+000030 - U+01FBFA) using skiplist +// Uppercase : 799 bytes, 1978 codepoints in 656 ranges (U+000041 - U+01F18A) using bitset +// White_Space : 256 bytes, 25 codepoints in 10 ranges (U+000009 - U+003001) using cascading +// Total : 6530 bytes #[inline(always)] const fn bitset_search< diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 6cdb82a87bdff..c1017142097f0 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -239,6 +239,11 @@ fn main() { std::fs::write(&path, generate_tests(&write_location, ranges_by_property)).unwrap(); } + let mut table_file = String::new(); + table_file.push_str( + "///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!\n", + ); + let mut total_bytes = 0; let mut modules = Vec::new(); for (property, ranges) in ranges_by_property { @@ -252,8 +257,8 @@ fn main() { } modules.push((property.to_lowercase().to_string(), emitter.file)); - println!( - "{:15}: {} bytes, {} codepoints in {} ranges ({} - {}) using {}", + table_file.push_str(&format!( + "// {:16}: {:5} bytes, {:6} codepoints in {:3} ranges (U+{:06X} - U+{:06X}) using {}\n", property, emitter.bytes_used, datapoints, @@ -261,15 +266,10 @@ fn main() { ranges.first().unwrap().start, ranges.last().unwrap().end, emitter.desc, - ); + )); total_bytes += emitter.bytes_used; } - - let mut table_file = String::new(); - - table_file.push_str( - "///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually!\n", - ); + table_file.push_str(&format!("// {:16}: {:5} bytes\n", "Total", total_bytes)); // Include the range search function table_file.push('\n'); @@ -296,8 +296,6 @@ fn main() { } std::fs::write(&write_location, format!("{}\n", table_file.trim_end())).unwrap(); - - println!("Total table sizes: {total_bytes} bytes"); } fn version() -> String { From 25d1876235d1ec7eebf89e857bd76444b2762816 Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Sun, 10 Aug 2025 00:18:04 +0100 Subject: [PATCH 03/10] refactor: Include size of case conversion tables Include the sizes of the `to_lowercase` and `to_uppercase` tables in the total size calculations. --- library/core/src/unicode/unicode_data.rs | 12 +++--- .../src/case_mapping.rs | 41 +++++++++++++------ src/tools/unicode-table-generator/src/main.rs | 7 +++- 3 files changed, 42 insertions(+), 18 deletions(-) diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index 6059f7d6450b1..787efcc091454 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -8,7 +8,9 @@ // N : 457 bytes, 1911 codepoints in 144 ranges (U+000030 - U+01FBFA) using skiplist // Uppercase : 799 bytes, 1978 codepoints in 656 ranges (U+000041 - U+01F18A) using bitset // White_Space : 256 bytes, 25 codepoints in 10 ranges (U+000009 - U+003001) using cascading -// Total : 6530 bytes +// to_lower : 11484 bytes +// to_upper : 13432 bytes +// Total : 31446 bytes #[inline(always)] const fn bitset_search< @@ -782,7 +784,7 @@ pub mod conversions { } } - static LOWERCASE_TABLE: &[(char, u32)] = &[ + static LOWERCASE_TABLE: &[(char, u32); 1434] = &[ ('\u{c0}', 224), ('\u{c1}', 225), ('\u{c2}', 226), ('\u{c3}', 227), ('\u{c4}', 228), ('\u{c5}', 229), ('\u{c6}', 230), ('\u{c7}', 231), ('\u{c8}', 232), ('\u{c9}', 233), ('\u{ca}', 234), ('\u{cb}', 235), ('\u{cc}', 236), ('\u{cd}', 237), ('\u{ce}', 238), @@ -1132,11 +1134,11 @@ pub mod conversions { ('\u{1e921}', 125251), ]; - static LOWERCASE_TABLE_MULTI: &[[char; 3]] = &[ + static LOWERCASE_TABLE_MULTI: &[[char; 3]; 1] = &[ ['i', '\u{307}', '\u{0}'], ]; - static UPPERCASE_TABLE: &[(char, u32)] = &[ + static UPPERCASE_TABLE: &[(char, u32); 1526] = &[ ('\u{b5}', 924), ('\u{df}', 4194304), ('\u{e0}', 192), ('\u{e1}', 193), ('\u{e2}', 194), ('\u{e3}', 195), ('\u{e4}', 196), ('\u{e5}', 197), ('\u{e6}', 198), ('\u{e7}', 199), ('\u{e8}', 200), ('\u{e9}', 201), ('\u{ea}', 202), ('\u{eb}', 203), ('\u{ec}', 204), @@ -1509,7 +1511,7 @@ pub mod conversions { ('\u{1e941}', 125215), ('\u{1e942}', 125216), ('\u{1e943}', 125217), ]; - static UPPERCASE_TABLE_MULTI: &[[char; 3]] = &[ + static UPPERCASE_TABLE_MULTI: &[[char; 3]; 102] = &[ ['S', 'S', '\u{0}'], ['\u{2bc}', 'N', '\u{0}'], ['J', '\u{30c}', '\u{0}'], ['\u{399}', '\u{308}', '\u{301}'], ['\u{3a5}', '\u{308}', '\u{301}'], ['\u{535}', '\u{552}', '\u{0}'], ['H', '\u{331}', '\u{0}'], ['T', '\u{308}', '\u{0}'], diff --git a/src/tools/unicode-table-generator/src/case_mapping.rs b/src/tools/unicode-table-generator/src/case_mapping.rs index 9c6454492e7e0..a8527ea9a429a 100644 --- a/src/tools/unicode-table-generator/src/case_mapping.rs +++ b/src/tools/unicode-table-generator/src/case_mapping.rs @@ -6,20 +6,22 @@ use crate::{UnicodeData, fmt_list}; const INDEX_MASK: u32 = 1 << 22; -pub(crate) fn generate_case_mapping(data: &UnicodeData) -> String { +pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) { let mut file = String::new(); write!(file, "const INDEX_MASK: u32 = 0x{INDEX_MASK:x};").unwrap(); file.push_str("\n\n"); file.push_str(HEADER.trim_start()); file.push('\n'); - file.push_str(&generate_tables("LOWER", &data.to_lower)); + let (lower_tables, lower_size) = generate_tables("LOWER", &data.to_lower); + file.push_str(&lower_tables); file.push_str("\n\n"); - file.push_str(&generate_tables("UPPER", &data.to_upper)); - file + let (upper_tables, upper_size) = generate_tables("UPPER", &data.to_upper); + file.push_str(&upper_tables); + (file, [lower_size, upper_size]) } -fn generate_tables(case: &str, data: &BTreeMap) -> String { +fn generate_tables(case: &str, data: &BTreeMap) -> (String, usize) { let mut mappings = Vec::with_capacity(data.len()); let mut multis = Vec::new(); @@ -46,16 +48,31 @@ fn generate_tables(case: &str, data: &BTreeMap) -> String } let mut tables = String::new(); - - write!(tables, "static {}CASE_TABLE: &[(char, u32)] = &[{}];", case, fmt_list(mappings)) - .unwrap(); + let mut size = 0; + + size += size_of_val(mappings.as_slice()); + write!( + tables, + "static {}CASE_TABLE: &[(char, u32); {}] = &[{}];", + case, + mappings.len(), + fmt_list(mappings), + ) + .unwrap(); tables.push_str("\n\n"); - write!(tables, "static {}CASE_TABLE_MULTI: &[[char; 3]] = &[{}];", case, fmt_list(multis)) - .unwrap(); - - tables + size += size_of_val(multis.as_slice()); + write!( + tables, + "static {}CASE_TABLE_MULTI: &[[char; 3]; {}] = &[{}];", + case, + multis.len(), + fmt_list(multis), + ) + .unwrap(); + + (tables, size) } struct CharEscape(char); diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index c1017142097f0..f755ad048e4a4 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -269,6 +269,11 @@ fn main() { )); total_bytes += emitter.bytes_used; } + let (conversions, sizes) = case_mapping::generate_case_mapping(&unicode_data); + for (name, size) in ["to_lower", "to_upper"].iter().zip(sizes) { + table_file.push_str(&format!("// {:16}: {:5} bytes\n", name, size)); + total_bytes += size; + } table_file.push_str(&format!("// {:16}: {:5} bytes\n", "Total", total_bytes)); // Include the range search function @@ -280,7 +285,7 @@ fn main() { table_file.push('\n'); - modules.push((String::from("conversions"), case_mapping::generate_case_mapping(&unicode_data))); + modules.push((String::from("conversions"), conversions)); for (name, contents) in modules { table_file.push_str("#[rustfmt::skip]\n"); From 4494509f7492577f84cbee466ab4033b94a34b2f Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Sun, 10 Aug 2025 01:10:15 +0100 Subject: [PATCH 04/10] refactor: rewrite `ranges_from_set` The `merge_ranges` function was very complicated and hard to understand. Forunately, we can use `slice::chunk_by` to achieve the same thing. --- src/tools/unicode-table-generator/src/main.rs | 83 ++++--------------- 1 file changed, 17 insertions(+), 66 deletions(-) diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index f755ad048e4a4..bf0511a2c77ef 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -187,33 +187,19 @@ fn load_data() -> UnicodeData { } } - let mut properties: HashMap<&'static str, Vec>> = properties + let mut properties: Vec<(&'static str, Vec>)> = properties .into_iter() - .map(|(k, v)| { - ( - k, - v.into_iter() - .flat_map(|codepoints| match codepoints { - Codepoints::Single(c) => c - .scalar() - .map(|ch| ch as u32..ch as u32 + 1) - .into_iter() - .collect::>(), - Codepoints::Range(c) => c - .into_iter() - .flat_map(|c| c.scalar().map(|ch| ch as u32..ch as u32 + 1)) - .collect::>(), - }) - .collect::>>(), - ) + .map(|(prop, codepoints)| { + let codepoints = codepoints + .into_iter() + .flatten() + .flat_map(|cp| cp.scalar()) + .map(u32::from) + .collect::>(); + (prop, ranges_from_set(&codepoints)) }) .collect(); - for ranges in properties.values_mut() { - merge_ranges(ranges); - } - - let mut properties = properties.into_iter().collect::>(); properties.sort_by_key(|p| p.0); UnicodeData { ranges: properties, to_lower, to_upper } } @@ -402,48 +388,13 @@ fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool } } +/// Group the elements of `set` into contigous ranges fn ranges_from_set(set: &[u32]) -> Vec> { - let mut ranges = set.iter().map(|e| (*e)..(*e + 1)).collect::>>(); - merge_ranges(&mut ranges); - ranges -} - -fn merge_ranges(ranges: &mut Vec>) { - loop { - let mut new_ranges = Vec::new(); - let mut idx_iter = 0..(ranges.len() - 1); - let mut should_insert_last = true; - while let Some(idx) = idx_iter.next() { - let cur = ranges[idx].clone(); - let next = ranges[idx + 1].clone(); - if cur.end == next.start { - if idx_iter.next().is_none() { - // We're merging the last element - should_insert_last = false; - } - new_ranges.push(cur.start..next.end); - } else { - // We're *not* merging the last element - should_insert_last = true; - new_ranges.push(cur); - } - } - if should_insert_last { - new_ranges.push(ranges.last().unwrap().clone()); - } - if new_ranges.len() == ranges.len() { - *ranges = new_ranges; - break; - } else { - *ranges = new_ranges; - } - } - - let mut last_end = None; - for range in ranges { - if let Some(last) = last_end { - assert!(range.start > last, "{range:?}"); - } - last_end = Some(range.end); - } + set.chunk_by(|a, b| a + 1 == *b) + .map(|chunk| { + let start = *chunk.first().unwrap(); + let end = *chunk.last().unwrap(); + start..(end + 1) + }) + .collect() } From 9ecbc4cd3d8e1261ca74fc93e056e0fd4f366385 Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Sun, 10 Aug 2025 01:46:00 +0100 Subject: [PATCH 05/10] refactor: `generate_tests` Rewrite `generate_tests` to be more idiomatic. --- src/tools/unicode-table-generator/src/main.rs | 97 +++++++++---------- 1 file changed, 45 insertions(+), 52 deletions(-) diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index bf0511a2c77ef..c9530fec48a00 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -72,6 +72,8 @@ //! or not. use std::collections::{BTreeMap, HashMap}; +use std::fmt; +use std::fmt::Write; use std::ops::Range; use ucd_parse::Codepoints; @@ -222,7 +224,7 @@ fn main() { let ranges_by_property = &unicode_data.ranges; if let Some(path) = test_path { - std::fs::write(&path, generate_tests(&write_location, ranges_by_property)).unwrap(); + std::fs::write(&path, generate_tests(ranges_by_property).unwrap()).unwrap(); } let mut table_file = String::new(); @@ -326,66 +328,57 @@ fn fmt_list(values: impl IntoIterator) -> String { out } -fn generate_tests(data_path: &str, ranges: &[(&str, Vec>)]) -> String { +fn generate_tests(ranges: &[(&str, Vec>)]) -> Result { let mut s = String::new(); - s.push_str("#![allow(incomplete_features, unused)]\n"); - s.push_str("#![feature(const_generics)]\n\n"); - s.push_str("\n#[allow(unused)]\nuse std::hint;\n"); - s.push_str(&format!("#[path = \"{data_path}\"]\n")); - s.push_str("mod unicode_data;\n\n"); - - s.push_str("\nfn main() {\n"); - + writeln!(s, "#![feature(core_intrinsics)]")?; + writeln!(s, "#![allow(internal_features, dead_code)]")?; + writeln!(s, "// ignore-tidy-filelength")?; + writeln!(s, "use std::intrinsics;")?; + writeln!(s, "mod unicode_data;")?; + writeln!(s, "fn main() {{")?; for (property, ranges) in ranges { - s.push_str(&format!(r#" println!("Testing {property}");"#)); - s.push('\n'); - s.push_str(&format!(" {}_true();\n", property.to_lowercase())); - s.push_str(&format!(" {}_false();\n", property.to_lowercase())); - let mut is_true = Vec::new(); - let mut is_false = Vec::new(); - for ch_num in 0..(std::char::MAX as u32) { - if std::char::from_u32(ch_num).is_none() { - continue; - } - if ranges.iter().any(|r| r.contains(&ch_num)) { - is_true.push(ch_num); - } else { - is_false.push(ch_num); - } - } - - s.push_str(&format!(" fn {}_true() {{\n", property.to_lowercase())); - generate_asserts(&mut s, property, &is_true, true); - s.push_str(" }\n\n"); - s.push_str(&format!(" fn {}_false() {{\n", property.to_lowercase())); - generate_asserts(&mut s, property, &is_false, false); - s.push_str(" }\n\n"); + let prop = property.to_lowercase(); + writeln!(s, r#" println!("Testing {prop}");"#)?; + writeln!(s, " {prop}_true();")?; + writeln!(s, " {prop}_false();")?; + let (is_true, is_false): (Vec<_>, Vec<_>) = (char::MIN..=char::MAX) + .filter(|c| !c.is_ascii()) + .map(u32::from) + .partition(|c| ranges.iter().any(|r| r.contains(c))); + + writeln!(s, " fn {prop}_true() {{")?; + generate_asserts(&mut s, &prop, &is_true, true)?; + writeln!(s, " }}")?; + + writeln!(s, " fn {prop}_false() {{")?; + generate_asserts(&mut s, &prop, &is_false, false)?; + writeln!(s, " }}")?; } - s.push('}'); - s + writeln!(s, "}}")?; + Ok(s) } -fn generate_asserts(s: &mut String, property: &str, points: &[u32], truthy: bool) { +fn generate_asserts( + s: &mut String, + prop: &str, + points: &[u32], + truthy: bool, +) -> Result<(), fmt::Error> { + let truthy = if truthy { "" } else { "!" }; for range in ranges_from_set(points) { - if range.end == range.start + 1 { - s.push_str(&format!( - " assert!({}unicode_data::{}::lookup({:?}), \"{}\");\n", - if truthy { "" } else { "!" }, - property.to_lowercase(), - std::char::from_u32(range.start).unwrap(), - range.start, - )); - } else { - s.push_str(&format!(" for chn in {range:?}u32 {{\n")); - s.push_str(&format!( - " assert!({}unicode_data::{}::lookup(std::char::from_u32(chn).unwrap()), \"{{:?}}\", chn);\n", - if truthy { "" } else { "!" }, - property.to_lowercase(), - )); - s.push_str(" }\n"); + let start = char::from_u32(range.start).unwrap(); + let end = char::from_u32(range.end - 1).unwrap(); + match range.len() { + 1 => writeln!(s, " assert!({truthy}unicode_data::{prop}::lookup({start:?}));")?, + _ => { + writeln!(s, " for c in {start:?}..={end:?} {{")?; + writeln!(s, " assert!({truthy}unicode_data::{prop}::lookup(c));")?; + writeln!(s, " }}")?; + } } } + Ok(()) } /// Group the elements of `set` into contigous ranges From 1aec3b8b341225543c90a302e57b61ff1532b332 Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Sun, 10 Aug 2025 02:13:03 +0100 Subject: [PATCH 06/10] refactor: Add tests for case conversions --- .../src/case_mapping.rs | 4 +- src/tools/unicode-table-generator/src/main.rs | 48 +++++++++++++++---- 2 files changed, 41 insertions(+), 11 deletions(-) diff --git a/src/tools/unicode-table-generator/src/case_mapping.rs b/src/tools/unicode-table-generator/src/case_mapping.rs index a8527ea9a429a..49aef3ec33ec7 100644 --- a/src/tools/unicode-table-generator/src/case_mapping.rs +++ b/src/tools/unicode-table-generator/src/case_mapping.rs @@ -21,11 +21,11 @@ pub(crate) fn generate_case_mapping(data: &UnicodeData) -> (String, [usize; 2]) (file, [lower_size, upper_size]) } -fn generate_tables(case: &str, data: &BTreeMap) -> (String, usize) { +fn generate_tables(case: &str, data: &BTreeMap) -> (String, usize) { let mut mappings = Vec::with_capacity(data.len()); let mut multis = Vec::new(); - for (&key, &(a, b, c)) in data.iter() { + for (&key, &[a, b, c]) in data.iter() { let key = char::from_u32(key).unwrap(); if key.is_ascii() { diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index c9530fec48a00..1d70eebdbc6c2 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -100,11 +100,11 @@ static PROPERTIES: &[&str] = &[ struct UnicodeData { ranges: Vec<(&'static str, Vec>)>, - to_upper: BTreeMap, - to_lower: BTreeMap, + to_upper: BTreeMap, + to_lower: BTreeMap, } -fn to_mapping(origin: u32, codepoints: Vec) -> Option<(u32, u32, u32)> { +fn to_mapping(origin: u32, codepoints: Vec) -> Option<[u32; 3]> { let mut a = None; let mut b = None; let mut c = None; @@ -125,7 +125,7 @@ fn to_mapping(origin: u32, codepoints: Vec) -> Option<(u32 } } - Some((a.unwrap(), b.unwrap_or(0), c.unwrap_or(0))) + Some([a.unwrap(), b.unwrap_or(0), c.unwrap_or(0)]) } static UNICODE_DIRECTORY: &str = "unicode-downloads"; @@ -165,12 +165,12 @@ fn load_data() -> UnicodeData { if let Some(mapped) = row.simple_lowercase_mapping && mapped != row.codepoint { - to_lower.insert(row.codepoint.value(), (mapped.value(), 0, 0)); + to_lower.insert(row.codepoint.value(), [mapped.value(), 0, 0]); } if let Some(mapped) = row.simple_uppercase_mapping && mapped != row.codepoint { - to_upper.insert(row.codepoint.value(), (mapped.value(), 0, 0)); + to_upper.insert(row.codepoint.value(), [mapped.value(), 0, 0]); } } @@ -224,7 +224,7 @@ fn main() { let ranges_by_property = &unicode_data.ranges; if let Some(path) = test_path { - std::fs::write(&path, generate_tests(ranges_by_property).unwrap()).unwrap(); + std::fs::write(&path, generate_tests(&unicode_data).unwrap()).unwrap(); } let mut table_file = String::new(); @@ -328,7 +328,7 @@ fn fmt_list(values: impl IntoIterator) -> String { out } -fn generate_tests(ranges: &[(&str, Vec>)]) -> Result { +fn generate_tests(data: &UnicodeData) -> Result { let mut s = String::new(); writeln!(s, "#![feature(core_intrinsics)]")?; writeln!(s, "#![allow(internal_features, dead_code)]")?; @@ -336,7 +336,7 @@ fn generate_tests(ranges: &[(&str, Vec>)]) -> Result>)]) -> Result = (char::MIN..=char::MAX) + .filter(|c| !c.is_ascii()) + .map(u32::from) + .filter(|c| !conversion.contains_key(c)) + .collect(); + let unmapped_ranges = ranges_from_set(&unmapped); + for range in unmapped_ranges { + let start = char::from_u32(range.start).unwrap(); + let end = char::from_u32(range.end - 1).unwrap(); + writeln!(s, " for c in {start:?}..={end:?} {{")?; + writeln!( + s, + r#" assert_eq!(unicode_data::conversions::{name}(c), [c, '\0', '\0']);"# + )?; + + writeln!(s, " }}")?; + } + } + writeln!(s, "}}")?; Ok(s) } From 373d540bf2cf00588786789e2b6a42f2907f9359 Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Sat, 2 Aug 2025 23:32:10 +0100 Subject: [PATCH 07/10] optimization: Don't include ASCII characters in Unicode tables The ASCII subset of Unicode is fixed and will never change, so we don't need to generate tables for it with every new Unicode version. This saves a few bytes of static data and speeds up `char::is_control` and `char::is_grapheme_extended` on ASCII inputs. Since the table lookup functions exported from the `unicode` module will give nonsensical errors on ASCII input (and in fact will panic in debug mode), I had to add some private wrapper methods to `char` which check for ASCII-ness first. --- library/alloc/src/str.rs | 5 +- library/core/src/char/methods.rs | 40 +- library/core/src/unicode/mod.rs | 4 +- library/core/src/unicode/unicode_data.rs | 531 ++++++++++-------- .../src/cascading_map.rs | 1 + src/tools/unicode-table-generator/src/main.rs | 1 + .../src/raw_emitter.rs | 1 + .../unicode-table-generator/src/skiplist.rs | 2 + 8 files changed, 333 insertions(+), 252 deletions(-) diff --git a/library/alloc/src/str.rs b/library/alloc/src/str.rs index 22cdd8ecde022..e772ac25a95c2 100644 --- a/library/alloc/src/str.rs +++ b/library/alloc/src/str.rs @@ -418,9 +418,8 @@ impl str { } fn case_ignorable_then_cased>(iter: I) -> bool { - use core::unicode::{Case_Ignorable, Cased}; - match iter.skip_while(|&c| Case_Ignorable(c)).next() { - Some(c) => Cased(c), + match iter.skip_while(|&c| c.is_case_ignorable()).next() { + Some(c) => c.is_cased(), None => false, } } diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 7ee0962721f5b..f751713bc3a04 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -950,7 +950,7 @@ impl char { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn is_control(self) -> bool { - unicode::Cc(self) + if self.is_ascii() { self.is_ascii_control() } else { unicode::Cc(self) } } /// Returns `true` if this `char` has the `Grapheme_Extend` property. @@ -965,7 +965,43 @@ impl char { #[must_use] #[inline] pub(crate) fn is_grapheme_extended(self) -> bool { - unicode::Grapheme_Extend(self) + !self.is_ascii() && unicode::Grapheme_Extend(self) + } + + /// Returns `true` if this `char` has the `Cased` property. + /// + /// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and + /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. + /// + /// [Unicode Standard]: https://www.unicode.org/versions/latest/ + /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt + #[must_use] + #[inline] + #[doc(hidden)] + #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] + pub fn is_cased(self) -> bool { + if self.is_ascii() { self.is_ascii_alphabetic() } else { unicode::Cased(self) } + } + + /// Returns `true` if this `char` has the `Case_Ignorable` property. + /// + /// `Case_Ignorable` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and + /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. + /// + /// [Unicode Standard]: https://www.unicode.org/versions/latest/ + /// [ucd]: https://www.unicode.org/reports/tr44/ + /// [`DerivedCoreProperties.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/DerivedCoreProperties.txt + #[must_use] + #[inline] + #[doc(hidden)] + #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] + pub fn is_case_ignorable(self) -> bool { + if self.is_ascii() { + matches!(self, '\'' | '.' | ':' | '^' | '`') + } else { + unicode::Case_Ignorable(self) + } } /// Returns `true` if this `char` has one of the general categories for numbers. diff --git a/library/core/src/unicode/mod.rs b/library/core/src/unicode/mod.rs index 49dbdeb1a6d1c..6f4bb60676855 100644 --- a/library/core/src/unicode/mod.rs +++ b/library/core/src/unicode/mod.rs @@ -3,12 +3,12 @@ // for use in alloc, not re-exported in std. #[rustfmt::skip] -pub use unicode_data::case_ignorable::lookup as Case_Ignorable; -pub use unicode_data::cased::lookup as Cased; pub use unicode_data::conversions; #[rustfmt::skip] pub(crate) use unicode_data::alphabetic::lookup as Alphabetic; +pub(crate) use unicode_data::case_ignorable::lookup as Case_Ignorable; +pub(crate) use unicode_data::cased::lookup as Cased; pub(crate) use unicode_data::cc::lookup as Cc; pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend; pub(crate) use unicode_data::lowercase::lookup as Lowercase; diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index 787efcc091454..eab2899057d8a 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -1,16 +1,16 @@ ///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually! -// Alphabetic : 1727 bytes, 142759 codepoints in 757 ranges (U+000041 - U+0323B0) using skiplist -// Case_Ignorable : 1053 bytes, 2749 codepoints in 452 ranges (U+000027 - U+0E01F0) using skiplist -// Cased : 407 bytes, 4578 codepoints in 159 ranges (U+000041 - U+01F18A) using skiplist -// Cc : 9 bytes, 65 codepoints in 2 ranges (U+000000 - U+0000A0) using skiplist +// Alphabetic : 1723 bytes, 142707 codepoints in 755 ranges (U+0000AA - U+0323B0) using skiplist +// Case_Ignorable : 1043 bytes, 2744 codepoints in 447 ranges (U+0000A8 - U+0E01F0) using skiplist +// Cased : 403 bytes, 4526 codepoints in 157 ranges (U+0000AA - U+01F18A) using skiplist +// Cc : 7 bytes, 32 codepoints in 1 ranges (U+000080 - U+0000A0) using skiplist // Grapheme_Extend : 887 bytes, 2193 codepoints in 375 ranges (U+000300 - U+0E01F0) using skiplist -// Lowercase : 935 bytes, 2569 codepoints in 675 ranges (U+000061 - U+01E944) using bitset -// N : 457 bytes, 1911 codepoints in 144 ranges (U+000030 - U+01FBFA) using skiplist -// Uppercase : 799 bytes, 1978 codepoints in 656 ranges (U+000041 - U+01F18A) using bitset -// White_Space : 256 bytes, 25 codepoints in 10 ranges (U+000009 - U+003001) using cascading +// Lowercase : 933 bytes, 2543 codepoints in 674 ranges (U+0000AA - U+01E944) using bitset +// N : 455 bytes, 1901 codepoints in 143 ranges (U+0000B2 - U+01FBFA) using skiplist +// Uppercase : 797 bytes, 1952 codepoints in 655 ranges (U+0000C0 - U+01F18A) using bitset +// White_Space : 256 bytes, 19 codepoints in 8 ranges (U+000085 - U+003001) using cascading // to_lower : 11484 bytes // to_upper : 13432 bytes -// Total : 31446 bytes +// Total : 31420 bytes #[inline(always)] const fn bitset_search< @@ -148,93 +148,100 @@ pub mod alphabetic { use super::ShortOffsetRunHeader; static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 53] = [ - ShortOffsetRunHeader::new(0, 706), ShortOffsetRunHeader::new(16, 4681), - ShortOffsetRunHeader::new(418, 5741), ShortOffsetRunHeader::new(456, 7958), - ShortOffsetRunHeader::new(556, 9398), ShortOffsetRunHeader::new(627, 11264), - ShortOffsetRunHeader::new(629, 12293), ShortOffsetRunHeader::new(667, 13312), - ShortOffsetRunHeader::new(691, 19904), ShortOffsetRunHeader::new(692, 42125), - ShortOffsetRunHeader::new(694, 42509), ShortOffsetRunHeader::new(698, 55204), - ShortOffsetRunHeader::new(788, 63744), ShortOffsetRunHeader::new(793, 64110), - ShortOffsetRunHeader::new(794, 64830), ShortOffsetRunHeader::new(816, 66176), - ShortOffsetRunHeader::new(857, 67383), ShortOffsetRunHeader::new(904, 73440), - ShortOffsetRunHeader::new(1221, 74650), ShortOffsetRunHeader::new(1232, 77712), - ShortOffsetRunHeader::new(1237, 78896), ShortOffsetRunHeader::new(1240, 82939), - ShortOffsetRunHeader::new(1244, 83527), ShortOffsetRunHeader::new(1246, 90368), - ShortOffsetRunHeader::new(1247, 92160), ShortOffsetRunHeader::new(1249, 92729), - ShortOffsetRunHeader::new(1250, 93504), ShortOffsetRunHeader::new(1265, 100344), - ShortOffsetRunHeader::new(1282, 101590), ShortOffsetRunHeader::new(1284, 110576), - ShortOffsetRunHeader::new(1287, 110883), ShortOffsetRunHeader::new(1294, 111356), - ShortOffsetRunHeader::new(1304, 113664), ShortOffsetRunHeader::new(1305, 119808), - ShortOffsetRunHeader::new(1315, 120486), ShortOffsetRunHeader::new(1352, 122624), - ShortOffsetRunHeader::new(1375, 123536), ShortOffsetRunHeader::new(1399, 124112), - ShortOffsetRunHeader::new(1403, 124896), ShortOffsetRunHeader::new(1409, 126464), - ShortOffsetRunHeader::new(1425, 127280), ShortOffsetRunHeader::new(1491, 131072), - ShortOffsetRunHeader::new(1497, 173792), ShortOffsetRunHeader::new(1498, 177978), - ShortOffsetRunHeader::new(1500, 183970), ShortOffsetRunHeader::new(1504, 191457), - ShortOffsetRunHeader::new(1506, 192094), ShortOffsetRunHeader::new(1508, 194560), - ShortOffsetRunHeader::new(1509, 195102), ShortOffsetRunHeader::new(1510, 196608), - ShortOffsetRunHeader::new(1511, 201547), ShortOffsetRunHeader::new(1512, 205744), - ShortOffsetRunHeader::new(1514, 1319856), + ShortOffsetRunHeader::new(0, 706), ShortOffsetRunHeader::new(12, 4681), + ShortOffsetRunHeader::new(414, 5741), ShortOffsetRunHeader::new(452, 7958), + ShortOffsetRunHeader::new(552, 9398), ShortOffsetRunHeader::new(623, 11264), + ShortOffsetRunHeader::new(625, 12293), ShortOffsetRunHeader::new(663, 13312), + ShortOffsetRunHeader::new(687, 19904), ShortOffsetRunHeader::new(688, 42125), + ShortOffsetRunHeader::new(690, 42509), ShortOffsetRunHeader::new(694, 55204), + ShortOffsetRunHeader::new(784, 63744), ShortOffsetRunHeader::new(789, 64110), + ShortOffsetRunHeader::new(790, 64830), ShortOffsetRunHeader::new(812, 66176), + ShortOffsetRunHeader::new(853, 67383), ShortOffsetRunHeader::new(900, 73440), + ShortOffsetRunHeader::new(1217, 74650), ShortOffsetRunHeader::new(1228, 77712), + ShortOffsetRunHeader::new(1233, 78896), ShortOffsetRunHeader::new(1236, 82939), + ShortOffsetRunHeader::new(1240, 83527), ShortOffsetRunHeader::new(1242, 90368), + ShortOffsetRunHeader::new(1243, 92160), ShortOffsetRunHeader::new(1245, 92729), + ShortOffsetRunHeader::new(1246, 93504), ShortOffsetRunHeader::new(1261, 100344), + ShortOffsetRunHeader::new(1278, 101590), ShortOffsetRunHeader::new(1280, 110576), + ShortOffsetRunHeader::new(1283, 110883), ShortOffsetRunHeader::new(1290, 111356), + ShortOffsetRunHeader::new(1300, 113664), ShortOffsetRunHeader::new(1301, 119808), + ShortOffsetRunHeader::new(1311, 120486), ShortOffsetRunHeader::new(1348, 122624), + ShortOffsetRunHeader::new(1371, 123536), ShortOffsetRunHeader::new(1395, 124112), + ShortOffsetRunHeader::new(1399, 124896), ShortOffsetRunHeader::new(1405, 126464), + ShortOffsetRunHeader::new(1421, 127280), ShortOffsetRunHeader::new(1487, 131072), + ShortOffsetRunHeader::new(1493, 173792), ShortOffsetRunHeader::new(1494, 177978), + ShortOffsetRunHeader::new(1496, 183970), ShortOffsetRunHeader::new(1500, 191457), + ShortOffsetRunHeader::new(1502, 192094), ShortOffsetRunHeader::new(1504, 194560), + ShortOffsetRunHeader::new(1505, 195102), ShortOffsetRunHeader::new(1506, 196608), + ShortOffsetRunHeader::new(1507, 201547), ShortOffsetRunHeader::new(1508, 205744), + ShortOffsetRunHeader::new(1510, 1319856), ]; - static OFFSETS: [u8; 1515] = [ - 65, 26, 6, 26, 47, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 0, 4, 12, 14, 5, 7, 1, 1, 1, 86, 1, 29, - 18, 1, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 2, 1, 6, 41, - 39, 14, 1, 1, 1, 2, 1, 2, 1, 1, 8, 27, 4, 4, 29, 11, 5, 56, 1, 7, 14, 102, 1, 8, 4, 8, 4, 3, - 10, 3, 2, 1, 16, 48, 13, 101, 24, 33, 9, 2, 4, 1, 5, 24, 2, 19, 19, 25, 7, 11, 5, 24, 1, 6, - 8, 1, 8, 42, 10, 12, 3, 7, 6, 76, 1, 16, 1, 3, 4, 15, 13, 19, 1, 8, 2, 2, 2, 22, 1, 7, 1, 1, - 3, 4, 3, 8, 2, 2, 2, 2, 1, 1, 8, 1, 4, 2, 1, 5, 12, 2, 10, 1, 4, 3, 1, 6, 4, 2, 2, 22, 1, 7, - 1, 2, 1, 2, 1, 2, 4, 5, 4, 2, 2, 2, 4, 1, 7, 4, 1, 1, 17, 6, 11, 3, 1, 9, 1, 3, 1, 22, 1, 7, - 1, 2, 1, 5, 3, 9, 1, 3, 1, 2, 3, 1, 15, 4, 21, 4, 4, 3, 1, 8, 2, 2, 2, 22, 1, 7, 1, 2, 1, 5, - 3, 8, 2, 2, 2, 2, 9, 2, 4, 2, 1, 5, 13, 1, 16, 2, 1, 6, 3, 3, 1, 4, 3, 2, 1, 1, 1, 2, 3, 2, - 3, 3, 3, 12, 4, 5, 3, 3, 1, 3, 3, 1, 6, 1, 40, 13, 1, 3, 1, 23, 1, 16, 3, 8, 1, 3, 1, 3, 8, - 2, 1, 3, 2, 1, 2, 4, 28, 4, 1, 8, 1, 3, 1, 23, 1, 10, 1, 5, 3, 8, 1, 3, 1, 3, 8, 2, 6, 2, 1, - 4, 13, 3, 12, 13, 1, 3, 1, 41, 2, 8, 1, 3, 1, 3, 1, 1, 5, 4, 7, 5, 22, 6, 1, 3, 1, 18, 3, - 24, 1, 9, 1, 1, 2, 7, 8, 6, 1, 1, 1, 8, 18, 2, 13, 58, 5, 7, 6, 1, 51, 2, 1, 1, 1, 5, 1, 24, - 1, 1, 1, 19, 1, 3, 2, 5, 1, 1, 6, 1, 14, 4, 32, 1, 63, 8, 1, 36, 4, 19, 4, 16, 1, 36, 67, - 55, 1, 1, 2, 5, 16, 64, 10, 4, 2, 38, 1, 1, 5, 1, 2, 43, 1, 0, 1, 4, 2, 7, 1, 1, 1, 4, 2, - 41, 1, 4, 2, 33, 1, 4, 2, 7, 1, 1, 1, 4, 2, 15, 1, 57, 1, 4, 2, 67, 37, 16, 16, 86, 2, 6, 3, - 0, 2, 17, 1, 26, 5, 75, 3, 11, 7, 20, 11, 21, 12, 20, 12, 13, 1, 3, 1, 2, 12, 52, 2, 19, 14, - 1, 4, 1, 67, 89, 7, 43, 5, 70, 10, 31, 1, 12, 4, 9, 23, 30, 2, 5, 11, 44, 4, 26, 54, 28, 4, - 63, 2, 20, 50, 1, 23, 2, 11, 3, 49, 52, 1, 15, 1, 8, 51, 42, 2, 4, 10, 44, 1, 11, 14, 55, - 22, 3, 10, 36, 2, 11, 5, 43, 2, 3, 41, 4, 1, 6, 1, 2, 3, 1, 5, 192, 19, 34, 11, 0, 2, 6, 2, - 38, 2, 6, 2, 8, 1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13, - 5, 3, 1, 7, 116, 1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4, - 1, 11, 2, 4, 5, 5, 4, 1, 17, 41, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, 1, 2, 56, 7, 1, - 16, 23, 9, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 32, 47, 1, 0, 3, 25, 9, 7, 5, 2, - 5, 4, 86, 6, 3, 1, 90, 1, 4, 5, 43, 1, 94, 17, 32, 48, 16, 0, 0, 64, 0, 67, 46, 2, 0, 3, 16, - 10, 2, 20, 47, 5, 8, 3, 113, 39, 9, 2, 103, 2, 67, 2, 2, 1, 1, 1, 8, 21, 20, 1, 33, 24, 52, - 12, 68, 1, 1, 44, 6, 3, 1, 1, 3, 10, 33, 5, 35, 13, 29, 3, 51, 1, 12, 15, 1, 16, 16, 10, 5, - 1, 55, 9, 14, 18, 23, 3, 69, 1, 1, 1, 1, 24, 3, 2, 16, 2, 4, 11, 6, 2, 6, 2, 6, 9, 7, 1, 7, - 1, 43, 1, 14, 6, 123, 21, 0, 12, 23, 4, 49, 0, 0, 2, 106, 38, 7, 12, 5, 5, 12, 1, 13, 1, 5, - 1, 1, 1, 2, 1, 2, 1, 108, 33, 0, 18, 64, 2, 54, 40, 12, 116, 5, 1, 135, 36, 26, 6, 26, 11, - 89, 3, 6, 2, 6, 2, 6, 2, 3, 35, 12, 1, 26, 1, 19, 1, 2, 1, 15, 2, 14, 34, 123, 69, 53, 0, - 29, 3, 49, 47, 32, 13, 30, 5, 43, 5, 30, 2, 36, 4, 8, 1, 5, 42, 158, 18, 36, 4, 36, 4, 40, - 8, 52, 12, 11, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15, 1, 7, 1, 2, 3, 52, 12, 0, 9, 22, 10, 8, 24, - 6, 1, 42, 1, 9, 69, 6, 2, 1, 1, 44, 1, 2, 3, 1, 2, 23, 10, 23, 9, 31, 65, 19, 1, 2, 10, 22, - 10, 26, 70, 56, 6, 2, 64, 4, 1, 2, 5, 8, 1, 3, 1, 29, 42, 29, 3, 29, 35, 8, 1, 28, 27, 54, - 10, 22, 10, 19, 13, 18, 110, 73, 55, 51, 13, 51, 13, 40, 34, 28, 3, 1, 5, 23, 250, 42, 1, 2, - 3, 2, 16, 3, 55, 1, 3, 29, 10, 1, 8, 22, 42, 18, 46, 21, 27, 23, 9, 70, 43, 5, 10, 57, 9, 1, - 13, 25, 23, 51, 17, 4, 8, 35, 3, 1, 9, 64, 1, 4, 9, 2, 10, 1, 1, 1, 35, 18, 1, 34, 2, 1, 6, - 4, 62, 7, 1, 1, 1, 4, 1, 15, 1, 10, 7, 57, 23, 4, 1, 8, 2, 2, 2, 22, 1, 7, 1, 2, 1, 5, 3, 8, - 2, 2, 2, 2, 3, 1, 6, 1, 5, 7, 28, 10, 1, 1, 2, 1, 1, 38, 1, 10, 1, 1, 2, 1, 1, 4, 1, 2, 3, - 1, 1, 1, 44, 66, 1, 3, 1, 4, 20, 3, 30, 66, 2, 2, 1, 1, 184, 54, 2, 7, 25, 6, 34, 63, 1, 1, - 3, 1, 59, 54, 2, 1, 71, 27, 2, 14, 21, 7, 185, 57, 103, 64, 31, 8, 2, 1, 2, 8, 1, 2, 1, 30, - 1, 2, 2, 2, 2, 4, 93, 8, 2, 46, 2, 6, 1, 1, 1, 2, 27, 51, 2, 10, 17, 72, 5, 1, 18, 73, 199, - 33, 31, 9, 1, 45, 1, 7, 1, 1, 49, 30, 2, 22, 1, 14, 73, 7, 1, 2, 1, 44, 3, 1, 1, 2, 1, 3, 1, - 1, 2, 2, 24, 6, 1, 2, 1, 37, 1, 2, 1, 4, 1, 1, 0, 23, 9, 17, 1, 41, 3, 3, 111, 1, 79, 0, - 102, 111, 17, 196, 0, 97, 15, 0, 17, 6, 25, 0, 5, 0, 0, 47, 0, 0, 7, 31, 17, 79, 17, 30, 18, - 48, 16, 4, 31, 21, 5, 19, 0, 45, 211, 64, 128, 75, 4, 57, 7, 17, 64, 2, 1, 1, 12, 2, 14, 0, - 8, 0, 41, 10, 0, 4, 1, 7, 1, 2, 1, 0, 15, 1, 29, 3, 2, 1, 14, 4, 8, 0, 0, 107, 5, 13, 3, 9, - 7, 10, 4, 1, 0, 85, 1, 71, 1, 2, 2, 1, 2, 2, 2, 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, - 7, 1, 28, 1, 4, 1, 5, 1, 1, 3, 7, 1, 0, 2, 25, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, - 25, 1, 31, 1, 25, 1, 8, 0, 31, 6, 6, 213, 7, 1, 17, 2, 7, 1, 2, 1, 5, 5, 62, 33, 1, 112, 45, - 10, 7, 16, 1, 0, 30, 18, 44, 0, 28, 228, 30, 2, 1, 0, 7, 1, 4, 1, 2, 1, 15, 1, 197, 59, 68, - 3, 1, 3, 1, 0, 4, 1, 27, 1, 2, 1, 1, 2, 1, 1, 10, 1, 4, 1, 1, 1, 1, 6, 1, 4, 1, 1, 1, 1, 1, - 1, 3, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 4, 1, 7, 1, 4, 1, 4, 1, 1, 1, - 10, 1, 17, 5, 3, 1, 5, 1, 17, 0, 26, 6, 26, 6, 26, 0, 0, 32, 0, 6, 222, 2, 0, 14, 0, 15, 0, - 0, 0, 0, 0, 5, 0, 0, + static OFFSETS: [u8; 1511] = [ + 170, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 0, 4, 12, 14, 5, 7, 1, 1, 1, 86, 1, 29, 18, 1, 2, 2, + 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 2, 1, 6, 41, 39, 14, 1, 1, + 1, 2, 1, 2, 1, 1, 8, 27, 4, 4, 29, 11, 5, 56, 1, 7, 14, 102, 1, 8, 4, 8, 4, 3, 10, 3, 2, 1, + 16, 48, 13, 101, 24, 33, 9, 2, 4, 1, 5, 24, 2, 19, 19, 25, 7, 11, 5, 24, 1, 6, 8, 1, 8, 42, + 10, 12, 3, 7, 6, 76, 1, 16, 1, 3, 4, 15, 13, 19, 1, 8, 2, 2, 2, 22, 1, 7, 1, 1, 3, 4, 3, 8, + 2, 2, 2, 2, 1, 1, 8, 1, 4, 2, 1, 5, 12, 2, 10, 1, 4, 3, 1, 6, 4, 2, 2, 22, 1, 7, 1, 2, 1, 2, + 1, 2, 4, 5, 4, 2, 2, 2, 4, 1, 7, 4, 1, 1, 17, 6, 11, 3, 1, 9, 1, 3, 1, 22, 1, 7, 1, 2, 1, 5, + 3, 9, 1, 3, 1, 2, 3, 1, 15, 4, 21, 4, 4, 3, 1, 8, 2, 2, 2, 22, 1, 7, 1, 2, 1, 5, 3, 8, 2, 2, + 2, 2, 9, 2, 4, 2, 1, 5, 13, 1, 16, 2, 1, 6, 3, 3, 1, 4, 3, 2, 1, 1, 1, 2, 3, 2, 3, 3, 3, 12, + 4, 5, 3, 3, 1, 3, 3, 1, 6, 1, 40, 13, 1, 3, 1, 23, 1, 16, 3, 8, 1, 3, 1, 3, 8, 2, 1, 3, 2, + 1, 2, 4, 28, 4, 1, 8, 1, 3, 1, 23, 1, 10, 1, 5, 3, 8, 1, 3, 1, 3, 8, 2, 6, 2, 1, 4, 13, 3, + 12, 13, 1, 3, 1, 41, 2, 8, 1, 3, 1, 3, 1, 1, 5, 4, 7, 5, 22, 6, 1, 3, 1, 18, 3, 24, 1, 9, 1, + 1, 2, 7, 8, 6, 1, 1, 1, 8, 18, 2, 13, 58, 5, 7, 6, 1, 51, 2, 1, 1, 1, 5, 1, 24, 1, 1, 1, 19, + 1, 3, 2, 5, 1, 1, 6, 1, 14, 4, 32, 1, 63, 8, 1, 36, 4, 19, 4, 16, 1, 36, 67, 55, 1, 1, 2, 5, + 16, 64, 10, 4, 2, 38, 1, 1, 5, 1, 2, 43, 1, 0, 1, 4, 2, 7, 1, 1, 1, 4, 2, 41, 1, 4, 2, 33, + 1, 4, 2, 7, 1, 1, 1, 4, 2, 15, 1, 57, 1, 4, 2, 67, 37, 16, 16, 86, 2, 6, 3, 0, 2, 17, 1, 26, + 5, 75, 3, 11, 7, 20, 11, 21, 12, 20, 12, 13, 1, 3, 1, 2, 12, 52, 2, 19, 14, 1, 4, 1, 67, 89, + 7, 43, 5, 70, 10, 31, 1, 12, 4, 9, 23, 30, 2, 5, 11, 44, 4, 26, 54, 28, 4, 63, 2, 20, 50, 1, + 23, 2, 11, 3, 49, 52, 1, 15, 1, 8, 51, 42, 2, 4, 10, 44, 1, 11, 14, 55, 22, 3, 10, 36, 2, + 11, 5, 43, 2, 3, 41, 4, 1, 6, 1, 2, 3, 1, 5, 192, 19, 34, 11, 0, 2, 6, 2, 38, 2, 6, 2, 8, 1, + 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13, 5, 3, 1, 7, 116, 1, + 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4, 1, 11, 2, 4, 5, 5, + 4, 1, 17, 41, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, 1, 2, 56, 7, 1, 16, 23, 9, 7, 1, + 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 7, 1, 32, 47, 1, 0, 3, 25, 9, 7, 5, 2, 5, 4, 86, 6, 3, + 1, 90, 1, 4, 5, 43, 1, 94, 17, 32, 48, 16, 0, 0, 64, 0, 67, 46, 2, 0, 3, 16, 10, 2, 20, 47, + 5, 8, 3, 113, 39, 9, 2, 103, 2, 67, 2, 2, 1, 1, 1, 8, 21, 20, 1, 33, 24, 52, 12, 68, 1, 1, + 44, 6, 3, 1, 1, 3, 10, 33, 5, 35, 13, 29, 3, 51, 1, 12, 15, 1, 16, 16, 10, 5, 1, 55, 9, 14, + 18, 23, 3, 69, 1, 1, 1, 1, 24, 3, 2, 16, 2, 4, 11, 6, 2, 6, 2, 6, 9, 7, 1, 7, 1, 43, 1, 14, + 6, 123, 21, 0, 12, 23, 4, 49, 0, 0, 2, 106, 38, 7, 12, 5, 5, 12, 1, 13, 1, 5, 1, 1, 1, 2, 1, + 2, 1, 108, 33, 0, 18, 64, 2, 54, 40, 12, 116, 5, 1, 135, 36, 26, 6, 26, 11, 89, 3, 6, 2, 6, + 2, 6, 2, 3, 35, 12, 1, 26, 1, 19, 1, 2, 1, 15, 2, 14, 34, 123, 69, 53, 0, 29, 3, 49, 47, 32, + 13, 30, 5, 43, 5, 30, 2, 36, 4, 8, 1, 5, 42, 158, 18, 36, 4, 36, 4, 40, 8, 52, 12, 11, 1, + 15, 1, 7, 1, 2, 1, 11, 1, 15, 1, 7, 1, 2, 3, 52, 12, 0, 9, 22, 10, 8, 24, 6, 1, 42, 1, 9, + 69, 6, 2, 1, 1, 44, 1, 2, 3, 1, 2, 23, 10, 23, 9, 31, 65, 19, 1, 2, 10, 22, 10, 26, 70, 56, + 6, 2, 64, 4, 1, 2, 5, 8, 1, 3, 1, 29, 42, 29, 3, 29, 35, 8, 1, 28, 27, 54, 10, 22, 10, 19, + 13, 18, 110, 73, 55, 51, 13, 51, 13, 40, 34, 28, 3, 1, 5, 23, 250, 42, 1, 2, 3, 2, 16, 3, + 55, 1, 3, 29, 10, 1, 8, 22, 42, 18, 46, 21, 27, 23, 9, 70, 43, 5, 10, 57, 9, 1, 13, 25, 23, + 51, 17, 4, 8, 35, 3, 1, 9, 64, 1, 4, 9, 2, 10, 1, 1, 1, 35, 18, 1, 34, 2, 1, 6, 4, 62, 7, 1, + 1, 1, 4, 1, 15, 1, 10, 7, 57, 23, 4, 1, 8, 2, 2, 2, 22, 1, 7, 1, 2, 1, 5, 3, 8, 2, 2, 2, 2, + 3, 1, 6, 1, 5, 7, 28, 10, 1, 1, 2, 1, 1, 38, 1, 10, 1, 1, 2, 1, 1, 4, 1, 2, 3, 1, 1, 1, 44, + 66, 1, 3, 1, 4, 20, 3, 30, 66, 2, 2, 1, 1, 184, 54, 2, 7, 25, 6, 34, 63, 1, 1, 3, 1, 59, 54, + 2, 1, 71, 27, 2, 14, 21, 7, 185, 57, 103, 64, 31, 8, 2, 1, 2, 8, 1, 2, 1, 30, 1, 2, 2, 2, 2, + 4, 93, 8, 2, 46, 2, 6, 1, 1, 1, 2, 27, 51, 2, 10, 17, 72, 5, 1, 18, 73, 199, 33, 31, 9, 1, + 45, 1, 7, 1, 1, 49, 30, 2, 22, 1, 14, 73, 7, 1, 2, 1, 44, 3, 1, 1, 2, 1, 3, 1, 1, 2, 2, 24, + 6, 1, 2, 1, 37, 1, 2, 1, 4, 1, 1, 0, 23, 9, 17, 1, 41, 3, 3, 111, 1, 79, 0, 102, 111, 17, + 196, 0, 97, 15, 0, 17, 6, 25, 0, 5, 0, 0, 47, 0, 0, 7, 31, 17, 79, 17, 30, 18, 48, 16, 4, + 31, 21, 5, 19, 0, 45, 211, 64, 128, 75, 4, 57, 7, 17, 64, 2, 1, 1, 12, 2, 14, 0, 8, 0, 41, + 10, 0, 4, 1, 7, 1, 2, 1, 0, 15, 1, 29, 3, 2, 1, 14, 4, 8, 0, 0, 107, 5, 13, 3, 9, 7, 10, 4, + 1, 0, 85, 1, 71, 1, 2, 2, 1, 2, 2, 2, 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, + 1, 4, 1, 5, 1, 1, 3, 7, 1, 0, 2, 25, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, + 1, 25, 1, 8, 0, 31, 6, 6, 213, 7, 1, 17, 2, 7, 1, 2, 1, 5, 5, 62, 33, 1, 112, 45, 10, 7, 16, + 1, 0, 30, 18, 44, 0, 28, 228, 30, 2, 1, 0, 7, 1, 4, 1, 2, 1, 15, 1, 197, 59, 68, 3, 1, 3, 1, + 0, 4, 1, 27, 1, 2, 1, 1, 2, 1, 1, 10, 1, 4, 1, 1, 1, 1, 6, 1, 4, 1, 1, 1, 1, 1, 1, 3, 1, 2, + 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 2, 4, 1, 7, 1, 4, 1, 4, 1, 1, 1, 10, 1, 17, + 5, 3, 1, 5, 1, 17, 0, 26, 6, 26, 6, 26, 0, 0, 32, 0, 6, 222, 2, 0, 14, 0, 15, 0, 0, 0, 0, 0, + 5, 0, 0, ]; + #[inline] pub fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + (c as u32) >= 0xaa && lookup_slow(c) + } + + #[inline(never)] + fn lookup_slow(c: char) -> bool { const { assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); let mut i = 0; @@ -254,62 +261,69 @@ pub mod case_ignorable { use super::ShortOffsetRunHeader; static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 37] = [ - ShortOffsetRunHeader::new(0, 688), ShortOffsetRunHeader::new(21, 4957), - ShortOffsetRunHeader::new(273, 5906), ShortOffsetRunHeader::new(275, 8125), - ShortOffsetRunHeader::new(385, 11388), ShortOffsetRunHeader::new(419, 12293), - ShortOffsetRunHeader::new(431, 40981), ShortOffsetRunHeader::new(443, 42232), - ShortOffsetRunHeader::new(445, 42508), ShortOffsetRunHeader::new(447, 64286), - ShortOffsetRunHeader::new(543, 65024), ShortOffsetRunHeader::new(547, 66045), - ShortOffsetRunHeader::new(577, 67456), ShortOffsetRunHeader::new(583, 68097), - ShortOffsetRunHeader::new(589, 68900), ShortOffsetRunHeader::new(601, 69291), - ShortOffsetRunHeader::new(609, 71727), ShortOffsetRunHeader::new(733, 71995), - ShortOffsetRunHeader::new(737, 72752), ShortOffsetRunHeader::new(765, 73459), - ShortOffsetRunHeader::new(795, 78896), ShortOffsetRunHeader::new(807, 90398), - ShortOffsetRunHeader::new(811, 92912), ShortOffsetRunHeader::new(815, 93504), - ShortOffsetRunHeader::new(821, 94031), ShortOffsetRunHeader::new(825, 110576), - ShortOffsetRunHeader::new(833, 113821), ShortOffsetRunHeader::new(839, 118528), - ShortOffsetRunHeader::new(843, 119143), ShortOffsetRunHeader::new(847, 121344), - ShortOffsetRunHeader::new(857, 122880), ShortOffsetRunHeader::new(869, 123566), - ShortOffsetRunHeader::new(885, 124139), ShortOffsetRunHeader::new(889, 125136), - ShortOffsetRunHeader::new(893, 127995), ShortOffsetRunHeader::new(897, 917505), - ShortOffsetRunHeader::new(899, 2032112), + ShortOffsetRunHeader::new(0, 688), ShortOffsetRunHeader::new(11, 4957), + ShortOffsetRunHeader::new(263, 5906), ShortOffsetRunHeader::new(265, 8125), + ShortOffsetRunHeader::new(375, 11388), ShortOffsetRunHeader::new(409, 12293), + ShortOffsetRunHeader::new(421, 40981), ShortOffsetRunHeader::new(433, 42232), + ShortOffsetRunHeader::new(435, 42508), ShortOffsetRunHeader::new(437, 64286), + ShortOffsetRunHeader::new(533, 65024), ShortOffsetRunHeader::new(537, 66045), + ShortOffsetRunHeader::new(567, 67456), ShortOffsetRunHeader::new(573, 68097), + ShortOffsetRunHeader::new(579, 68900), ShortOffsetRunHeader::new(591, 69291), + ShortOffsetRunHeader::new(599, 71727), ShortOffsetRunHeader::new(723, 71995), + ShortOffsetRunHeader::new(727, 72752), ShortOffsetRunHeader::new(755, 73459), + ShortOffsetRunHeader::new(785, 78896), ShortOffsetRunHeader::new(797, 90398), + ShortOffsetRunHeader::new(801, 92912), ShortOffsetRunHeader::new(805, 93504), + ShortOffsetRunHeader::new(811, 94031), ShortOffsetRunHeader::new(815, 110576), + ShortOffsetRunHeader::new(823, 113821), ShortOffsetRunHeader::new(829, 118528), + ShortOffsetRunHeader::new(833, 119143), ShortOffsetRunHeader::new(837, 121344), + ShortOffsetRunHeader::new(847, 122880), ShortOffsetRunHeader::new(859, 123566), + ShortOffsetRunHeader::new(875, 124139), ShortOffsetRunHeader::new(879, 125136), + ShortOffsetRunHeader::new(883, 127995), ShortOffsetRunHeader::new(887, 917505), + ShortOffsetRunHeader::new(889, 2032112), ]; - static OFFSETS: [u8; 905] = [ - 39, 1, 6, 1, 11, 1, 35, 1, 1, 1, 71, 1, 4, 1, 1, 1, 4, 1, 2, 2, 0, 192, 4, 2, 4, 1, 9, 2, - 1, 1, 251, 7, 207, 1, 5, 1, 49, 45, 1, 1, 1, 2, 1, 2, 1, 1, 44, 1, 11, 6, 10, 11, 1, 1, 35, - 1, 10, 21, 16, 1, 101, 8, 1, 10, 1, 4, 33, 1, 1, 1, 30, 27, 91, 11, 58, 11, 4, 1, 2, 1, 24, - 24, 43, 3, 44, 1, 7, 2, 5, 9, 41, 58, 55, 1, 1, 1, 4, 8, 4, 1, 3, 7, 10, 2, 13, 1, 15, 1, - 58, 1, 4, 4, 8, 1, 20, 2, 26, 1, 2, 2, 57, 1, 4, 2, 4, 2, 2, 3, 3, 1, 30, 2, 3, 1, 11, 2, - 57, 1, 4, 5, 1, 2, 4, 1, 20, 2, 22, 6, 1, 1, 58, 1, 2, 1, 1, 4, 8, 1, 7, 2, 11, 2, 30, 1, - 61, 1, 12, 1, 50, 1, 3, 1, 55, 1, 1, 3, 5, 3, 1, 4, 7, 2, 11, 2, 29, 1, 58, 1, 2, 1, 6, 1, - 5, 2, 20, 2, 28, 2, 57, 2, 4, 4, 8, 1, 20, 2, 29, 1, 72, 1, 7, 3, 1, 1, 90, 1, 2, 7, 11, 9, - 98, 1, 2, 9, 9, 1, 1, 7, 73, 2, 27, 1, 1, 1, 1, 1, 55, 14, 1, 5, 1, 2, 5, 11, 1, 36, 9, 1, - 102, 4, 1, 6, 1, 2, 2, 2, 25, 2, 4, 3, 16, 4, 13, 1, 2, 2, 6, 1, 15, 1, 94, 1, 0, 3, 0, 3, - 29, 2, 30, 2, 30, 2, 64, 2, 1, 7, 8, 1, 2, 11, 3, 1, 5, 1, 45, 5, 51, 1, 65, 2, 34, 1, 118, - 3, 4, 2, 9, 1, 6, 3, 219, 2, 2, 1, 58, 1, 1, 7, 1, 1, 1, 1, 2, 8, 6, 10, 2, 1, 39, 1, 8, 31, - 49, 4, 48, 1, 1, 5, 1, 1, 5, 1, 40, 9, 12, 2, 32, 4, 2, 2, 1, 3, 56, 1, 1, 2, 3, 1, 1, 3, - 58, 8, 2, 2, 64, 6, 82, 3, 1, 13, 1, 7, 4, 1, 6, 1, 3, 2, 50, 63, 13, 1, 34, 101, 0, 1, 1, - 3, 11, 3, 13, 3, 13, 3, 13, 2, 12, 5, 8, 2, 10, 1, 2, 1, 2, 5, 49, 5, 1, 10, 1, 1, 13, 1, - 16, 13, 51, 33, 0, 2, 113, 3, 125, 1, 15, 1, 96, 32, 47, 1, 0, 1, 36, 4, 3, 5, 5, 1, 93, 6, - 93, 3, 0, 1, 0, 6, 0, 1, 98, 4, 1, 10, 1, 1, 28, 4, 80, 2, 14, 34, 78, 1, 23, 3, 103, 3, 3, - 2, 8, 1, 3, 1, 4, 1, 25, 2, 5, 1, 151, 2, 26, 18, 13, 1, 38, 8, 25, 11, 46, 3, 48, 1, 2, 4, - 2, 2, 17, 1, 21, 2, 66, 6, 2, 2, 2, 2, 12, 1, 8, 1, 35, 1, 11, 1, 51, 1, 1, 3, 2, 2, 5, 2, - 1, 1, 27, 1, 14, 2, 5, 2, 1, 1, 100, 5, 9, 3, 121, 1, 2, 1, 4, 1, 0, 1, 147, 17, 0, 16, 3, - 1, 12, 16, 34, 1, 2, 1, 169, 1, 7, 1, 6, 1, 11, 1, 35, 1, 1, 1, 47, 1, 45, 2, 67, 1, 21, 3, - 0, 1, 226, 1, 149, 5, 0, 6, 1, 42, 1, 9, 0, 3, 1, 2, 5, 4, 40, 3, 4, 1, 165, 2, 0, 4, 38, 1, - 26, 5, 1, 1, 0, 2, 79, 4, 70, 11, 49, 4, 123, 1, 54, 15, 41, 1, 2, 2, 10, 3, 49, 4, 2, 2, 2, - 1, 4, 1, 10, 1, 50, 3, 36, 5, 1, 8, 62, 1, 12, 2, 52, 9, 10, 4, 2, 1, 95, 3, 2, 1, 1, 2, 6, - 1, 2, 1, 157, 1, 3, 8, 21, 2, 57, 2, 3, 1, 37, 7, 3, 5, 70, 6, 13, 1, 1, 1, 1, 1, 14, 2, 85, - 8, 2, 3, 1, 1, 23, 1, 84, 6, 1, 1, 4, 2, 1, 2, 238, 4, 6, 2, 1, 2, 27, 2, 85, 8, 2, 1, 1, 2, - 106, 1, 1, 1, 2, 6, 1, 1, 101, 1, 1, 1, 2, 4, 1, 5, 0, 9, 1, 2, 0, 2, 1, 1, 4, 1, 144, 4, 2, - 2, 4, 1, 32, 10, 40, 6, 2, 4, 8, 1, 9, 6, 2, 3, 46, 13, 1, 2, 0, 7, 1, 6, 1, 1, 82, 22, 2, - 7, 1, 2, 1, 2, 122, 6, 3, 1, 1, 2, 1, 7, 1, 1, 72, 2, 3, 1, 1, 1, 0, 2, 11, 2, 52, 5, 5, 1, - 1, 1, 23, 1, 0, 17, 6, 15, 0, 12, 3, 3, 0, 5, 59, 7, 9, 4, 0, 3, 40, 2, 0, 1, 63, 17, 64, 2, - 1, 2, 0, 4, 1, 7, 1, 2, 0, 2, 1, 4, 0, 46, 2, 23, 0, 3, 9, 16, 2, 7, 30, 4, 148, 3, 0, 55, - 4, 50, 8, 1, 14, 1, 22, 5, 1, 15, 0, 7, 1, 17, 2, 7, 1, 2, 1, 5, 5, 62, 33, 1, 160, 14, 0, - 1, 61, 4, 0, 5, 254, 2, 0, 7, 109, 8, 0, 5, 0, 1, 30, 96, 128, 240, 0, + static OFFSETS: [u8; 895] = [ + 168, 1, 4, 1, 1, 1, 4, 1, 2, 2, 0, 192, 4, 2, 4, 1, 9, 2, 1, 1, 251, 7, 207, 1, 5, 1, 49, + 45, 1, 1, 1, 2, 1, 2, 1, 1, 44, 1, 11, 6, 10, 11, 1, 1, 35, 1, 10, 21, 16, 1, 101, 8, 1, 10, + 1, 4, 33, 1, 1, 1, 30, 27, 91, 11, 58, 11, 4, 1, 2, 1, 24, 24, 43, 3, 44, 1, 7, 2, 5, 9, 41, + 58, 55, 1, 1, 1, 4, 8, 4, 1, 3, 7, 10, 2, 13, 1, 15, 1, 58, 1, 4, 4, 8, 1, 20, 2, 26, 1, 2, + 2, 57, 1, 4, 2, 4, 2, 2, 3, 3, 1, 30, 2, 3, 1, 11, 2, 57, 1, 4, 5, 1, 2, 4, 1, 20, 2, 22, 6, + 1, 1, 58, 1, 2, 1, 1, 4, 8, 1, 7, 2, 11, 2, 30, 1, 61, 1, 12, 1, 50, 1, 3, 1, 55, 1, 1, 3, + 5, 3, 1, 4, 7, 2, 11, 2, 29, 1, 58, 1, 2, 1, 6, 1, 5, 2, 20, 2, 28, 2, 57, 2, 4, 4, 8, 1, + 20, 2, 29, 1, 72, 1, 7, 3, 1, 1, 90, 1, 2, 7, 11, 9, 98, 1, 2, 9, 9, 1, 1, 7, 73, 2, 27, 1, + 1, 1, 1, 1, 55, 14, 1, 5, 1, 2, 5, 11, 1, 36, 9, 1, 102, 4, 1, 6, 1, 2, 2, 2, 25, 2, 4, 3, + 16, 4, 13, 1, 2, 2, 6, 1, 15, 1, 94, 1, 0, 3, 0, 3, 29, 2, 30, 2, 30, 2, 64, 2, 1, 7, 8, 1, + 2, 11, 3, 1, 5, 1, 45, 5, 51, 1, 65, 2, 34, 1, 118, 3, 4, 2, 9, 1, 6, 3, 219, 2, 2, 1, 58, + 1, 1, 7, 1, 1, 1, 1, 2, 8, 6, 10, 2, 1, 39, 1, 8, 31, 49, 4, 48, 1, 1, 5, 1, 1, 5, 1, 40, 9, + 12, 2, 32, 4, 2, 2, 1, 3, 56, 1, 1, 2, 3, 1, 1, 3, 58, 8, 2, 2, 64, 6, 82, 3, 1, 13, 1, 7, + 4, 1, 6, 1, 3, 2, 50, 63, 13, 1, 34, 101, 0, 1, 1, 3, 11, 3, 13, 3, 13, 3, 13, 2, 12, 5, 8, + 2, 10, 1, 2, 1, 2, 5, 49, 5, 1, 10, 1, 1, 13, 1, 16, 13, 51, 33, 0, 2, 113, 3, 125, 1, 15, + 1, 96, 32, 47, 1, 0, 1, 36, 4, 3, 5, 5, 1, 93, 6, 93, 3, 0, 1, 0, 6, 0, 1, 98, 4, 1, 10, 1, + 1, 28, 4, 80, 2, 14, 34, 78, 1, 23, 3, 103, 3, 3, 2, 8, 1, 3, 1, 4, 1, 25, 2, 5, 1, 151, 2, + 26, 18, 13, 1, 38, 8, 25, 11, 46, 3, 48, 1, 2, 4, 2, 2, 17, 1, 21, 2, 66, 6, 2, 2, 2, 2, 12, + 1, 8, 1, 35, 1, 11, 1, 51, 1, 1, 3, 2, 2, 5, 2, 1, 1, 27, 1, 14, 2, 5, 2, 1, 1, 100, 5, 9, + 3, 121, 1, 2, 1, 4, 1, 0, 1, 147, 17, 0, 16, 3, 1, 12, 16, 34, 1, 2, 1, 169, 1, 7, 1, 6, 1, + 11, 1, 35, 1, 1, 1, 47, 1, 45, 2, 67, 1, 21, 3, 0, 1, 226, 1, 149, 5, 0, 6, 1, 42, 1, 9, 0, + 3, 1, 2, 5, 4, 40, 3, 4, 1, 165, 2, 0, 4, 38, 1, 26, 5, 1, 1, 0, 2, 79, 4, 70, 11, 49, 4, + 123, 1, 54, 15, 41, 1, 2, 2, 10, 3, 49, 4, 2, 2, 2, 1, 4, 1, 10, 1, 50, 3, 36, 5, 1, 8, 62, + 1, 12, 2, 52, 9, 10, 4, 2, 1, 95, 3, 2, 1, 1, 2, 6, 1, 2, 1, 157, 1, 3, 8, 21, 2, 57, 2, 3, + 1, 37, 7, 3, 5, 70, 6, 13, 1, 1, 1, 1, 1, 14, 2, 85, 8, 2, 3, 1, 1, 23, 1, 84, 6, 1, 1, 4, + 2, 1, 2, 238, 4, 6, 2, 1, 2, 27, 2, 85, 8, 2, 1, 1, 2, 106, 1, 1, 1, 2, 6, 1, 1, 101, 1, 1, + 1, 2, 4, 1, 5, 0, 9, 1, 2, 0, 2, 1, 1, 4, 1, 144, 4, 2, 2, 4, 1, 32, 10, 40, 6, 2, 4, 8, 1, + 9, 6, 2, 3, 46, 13, 1, 2, 0, 7, 1, 6, 1, 1, 82, 22, 2, 7, 1, 2, 1, 2, 122, 6, 3, 1, 1, 2, 1, + 7, 1, 1, 72, 2, 3, 1, 1, 1, 0, 2, 11, 2, 52, 5, 5, 1, 1, 1, 23, 1, 0, 17, 6, 15, 0, 12, 3, + 3, 0, 5, 59, 7, 9, 4, 0, 3, 40, 2, 0, 1, 63, 17, 64, 2, 1, 2, 0, 4, 1, 7, 1, 2, 0, 2, 1, 4, + 0, 46, 2, 23, 0, 3, 9, 16, 2, 7, 30, 4, 148, 3, 0, 55, 4, 50, 8, 1, 14, 1, 22, 5, 1, 15, 0, + 7, 1, 17, 2, 7, 1, 2, 1, 5, 5, 62, 33, 1, 160, 14, 0, 1, 61, 4, 0, 5, 254, 2, 0, 7, 109, 8, + 0, 5, 0, 1, 30, 96, 128, 240, 0, ]; + #[inline] pub fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + (c as u32) >= 0xa8 && lookup_slow(c) + } + + #[inline(never)] + fn lookup_slow(c: char) -> bool { const { assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); let mut i = 0; @@ -329,33 +343,40 @@ pub mod cased { use super::ShortOffsetRunHeader; static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 22] = [ - ShortOffsetRunHeader::new(0, 4256), ShortOffsetRunHeader::new(55, 5024), - ShortOffsetRunHeader::new(65, 7296), ShortOffsetRunHeader::new(69, 7958), - ShortOffsetRunHeader::new(78, 9398), ShortOffsetRunHeader::new(153, 11264), - ShortOffsetRunHeader::new(155, 42560), ShortOffsetRunHeader::new(167, 43824), - ShortOffsetRunHeader::new(187, 64256), ShortOffsetRunHeader::new(193, 65313), - ShortOffsetRunHeader::new(197, 66560), ShortOffsetRunHeader::new(201, 67456), - ShortOffsetRunHeader::new(223, 68736), ShortOffsetRunHeader::new(231, 71840), - ShortOffsetRunHeader::new(239, 93760), ShortOffsetRunHeader::new(241, 119808), - ShortOffsetRunHeader::new(243, 120486), ShortOffsetRunHeader::new(280, 122624), - ShortOffsetRunHeader::new(303, 122928), ShortOffsetRunHeader::new(309, 125184), - ShortOffsetRunHeader::new(311, 127280), ShortOffsetRunHeader::new(313, 1241482), + ShortOffsetRunHeader::new(0, 4256), ShortOffsetRunHeader::new(51, 5024), + ShortOffsetRunHeader::new(61, 7296), ShortOffsetRunHeader::new(65, 7958), + ShortOffsetRunHeader::new(74, 9398), ShortOffsetRunHeader::new(149, 11264), + ShortOffsetRunHeader::new(151, 42560), ShortOffsetRunHeader::new(163, 43824), + ShortOffsetRunHeader::new(183, 64256), ShortOffsetRunHeader::new(189, 65313), + ShortOffsetRunHeader::new(193, 66560), ShortOffsetRunHeader::new(197, 67456), + ShortOffsetRunHeader::new(219, 68736), ShortOffsetRunHeader::new(227, 71840), + ShortOffsetRunHeader::new(235, 93760), ShortOffsetRunHeader::new(237, 119808), + ShortOffsetRunHeader::new(239, 120486), ShortOffsetRunHeader::new(276, 122624), + ShortOffsetRunHeader::new(299, 122928), ShortOffsetRunHeader::new(305, 125184), + ShortOffsetRunHeader::new(307, 127280), ShortOffsetRunHeader::new(309, 1241482), ]; - static OFFSETS: [u8; 319] = [ - 65, 26, 6, 26, 47, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 195, 1, 4, 4, 208, 1, 36, 7, 2, 30, 5, - 96, 1, 42, 4, 2, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 9, - 41, 0, 38, 1, 1, 5, 1, 2, 43, 1, 4, 0, 86, 2, 6, 0, 11, 5, 43, 2, 3, 64, 192, 64, 0, 2, 6, - 2, 38, 2, 6, 2, 8, 1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, - 13, 5, 3, 1, 7, 116, 1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, - 4, 1, 6, 4, 1, 2, 4, 5, 5, 4, 1, 17, 32, 3, 2, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, - 1, 0, 46, 18, 30, 132, 102, 3, 4, 1, 62, 2, 2, 1, 1, 1, 8, 21, 5, 1, 3, 0, 43, 1, 14, 6, 80, - 0, 7, 12, 5, 0, 26, 6, 26, 0, 80, 96, 36, 4, 36, 116, 11, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15, - 1, 7, 1, 2, 0, 1, 2, 3, 1, 42, 1, 9, 0, 51, 13, 51, 93, 22, 10, 22, 0, 64, 0, 64, 0, 85, 1, - 71, 1, 2, 2, 1, 2, 2, 2, 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, - 1, 1, 3, 7, 1, 0, 2, 25, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, - 8, 0, 10, 1, 20, 6, 6, 0, 62, 0, 68, 0, 26, 6, 26, 6, 26, 0, + static OFFSETS: [u8; 315] = [ + 170, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 195, 1, 4, 4, 208, 1, 36, 7, 2, 30, 5, 96, 1, 42, 4, + 2, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 9, 41, 0, 38, 1, 1, + 5, 1, 2, 43, 1, 4, 0, 86, 2, 6, 0, 11, 5, 43, 2, 3, 64, 192, 64, 0, 2, 6, 2, 38, 2, 6, 2, 8, + 1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13, 5, 3, 1, 7, 116, + 1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4, 1, 6, 4, 1, 2, 4, + 5, 5, 4, 1, 17, 32, 3, 2, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, 1, 0, 46, 18, 30, 132, + 102, 3, 4, 1, 62, 2, 2, 1, 1, 1, 8, 21, 5, 1, 3, 0, 43, 1, 14, 6, 80, 0, 7, 12, 5, 0, 26, 6, + 26, 0, 80, 96, 36, 4, 36, 116, 11, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15, 1, 7, 1, 2, 0, 1, 2, 3, + 1, 42, 1, 9, 0, 51, 13, 51, 93, 22, 10, 22, 0, 64, 0, 64, 0, 85, 1, 71, 1, 2, 2, 1, 2, 2, 2, + 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, 1, 1, 3, 7, 1, 0, 2, 25, + 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 8, 0, 10, 1, 20, 6, 6, 0, + 62, 0, 68, 0, 26, 6, 26, 6, 26, 0, ]; + #[inline] pub fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + (c as u32) >= 0xaa && lookup_slow(c) + } + + #[inline(never)] + fn lookup_slow(c: char) -> bool { const { assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); let mut i = 0; @@ -377,10 +398,17 @@ pub mod cc { static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 1] = [ ShortOffsetRunHeader::new(0, 1114272), ]; - static OFFSETS: [u8; 5] = [ - 0, 32, 95, 33, 0, + static OFFSETS: [u8; 3] = [ + 128, 32, 0, ]; + #[inline] pub fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + (c as u32) >= 0x80 && lookup_slow(c) + } + + #[inline(never)] + fn lookup_slow(c: char) -> bool { const { assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); let mut i = 0; @@ -449,6 +477,7 @@ pub mod grapheme_extend { ]; #[inline] pub fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); (c as u32) >= 0x300 && lookup_slow(c) } @@ -471,7 +500,7 @@ pub mod grapheme_extend { #[rustfmt::skip] pub mod lowercase { static BITSET_CHUNKS_MAP: [u8; 123] = [ - 14, 17, 0, 0, 9, 0, 0, 12, 13, 10, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 12, 17, 0, 0, 9, 0, 0, 13, 14, 10, 0, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 1, 0, 15, 0, 8, 0, 0, 11, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 19, 0, @@ -483,37 +512,37 @@ pub mod lowercase { [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 16, 14, 56, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 40, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 44, 0, 0, 0], - [0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 67, 43, 0, 52, 48, 50, 33], - [0, 0, 0, 0, 10, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 9, 57, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 3, 0, 16, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 27], [0, 0, 0, 62, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 71, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 34, 17, 23, 53, 54, 49, 47, 7, 35, 42, 0, 28, 12, 31], [0, 0, 46, 0, 56, 56, 56, 0, 22, 22, 69, 22, 36, 25, 24, 37], [0, 5, 70, 0, 29, 15, 75, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [0, 66, 34, 17, 23, 53, 54, 49, 47, 8, 35, 42, 0, 28, 13, 31], - [11, 60, 0, 6, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, 32, 0], + [10, 60, 0, 6, 0, 0, 30, 0, 0, 0, 0, 0, 0, 0, 32, 0], [16, 26, 22, 38, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [16, 51, 2, 21, 68, 9, 59, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [16, 51, 2, 21, 68, 8, 59, 0, 0, 0, 0, 0, 0, 0, 0, 0], [16, 72, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], - [65, 41, 55, 12, 77, 63, 18, 1, 7, 64, 76, 20, 73, 74, 4, 45], + [65, 41, 55, 11, 66, 63, 18, 13, 1, 64, 76, 20, 73, 74, 4, 45], ]; static BITSET_CANONICAL: [u64; 56] = [ 0b0000000000000000000000000000000000000000000000000000000000000000, - 0b1111111111111111110000000000000000000000000011111111111111111111, + 0b0000111111111111111111111111110000000000000000000000000011111111, 0b1010101010101010101010101010101010101010101010101010100000000010, 0b0000000000000111111111111111111111111111111111111111111111111111, 0b1111111111111111111111000000000000000000000000001111110111111111, 0b1000000000000010000000000000000000000000000000000000000000000000, 0b0000111111111111111111111111111111111111000000000000000000000000, - 0b0000111111111111111111111111110000000000000000000000000011111111, 0b1111111111111111111111111111111111111111111111111010101010000101, 0b1111111111111111111111111111111100000000000000000000000000000000, 0b1111111111111111111111111111110000000000000000000000000000000000, 0b1111111111111111111111110000000000000000000000000000000000000000, 0b1111111111111111111111000000000000000000000000001111111111101111, 0b1111111111111111111100000000000000000000000000010000000000000000, + 0b1111111111111111110000000000000000000000000011111111111111111111, 0b1111111111111111000000111111111111110111111111111111111111111111, 0b1111111111111111000000000000000000000000000000000100001111000000, 0b1111111111111111000000000000000000000000000000000000000000000000, @@ -557,13 +586,15 @@ pub mod lowercase { 0b1110011001010001001011010010101001001110001001000011000100101001, 0b1110101111000000000000000000000000001111111111111111111111111100, ]; - static BITSET_MAPPING: [(u8, u8); 22] = [ - (0, 64), (1, 188), (1, 186), (1, 183), (1, 176), (1, 109), (1, 124), (1, 126), (1, 66), - (1, 70), (1, 77), (2, 146), (2, 144), (2, 83), (3, 93), (3, 147), (3, 133), (4, 12), (4, 6), - (5, 187), (6, 78), (7, 132), + static BITSET_MAPPING: [(u8, u8); 21] = [ + (0, 64), (1, 184), (1, 182), (1, 179), (1, 172), (1, 161), (1, 146), (1, 144), (1, 140), + (1, 136), (1, 132), (2, 146), (2, 144), (2, 83), (3, 93), (3, 147), (3, 133), (4, 12), + (4, 6), (5, 187), (6, 78), ]; pub const fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + (c as u32) >= 0xaa && super::bitset_search( c as u32, &BITSET_CHUNKS_MAP, @@ -579,43 +610,50 @@ pub mod n { use super::ShortOffsetRunHeader; static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 42] = [ - ShortOffsetRunHeader::new(0, 1632), ShortOffsetRunHeader::new(9, 2406), - ShortOffsetRunHeader::new(15, 4160), ShortOffsetRunHeader::new(49, 4969), - ShortOffsetRunHeader::new(53, 5870), ShortOffsetRunHeader::new(55, 6470), - ShortOffsetRunHeader::new(63, 8304), ShortOffsetRunHeader::new(79, 9312), - ShortOffsetRunHeader::new(89, 10102), ShortOffsetRunHeader::new(93, 11517), - ShortOffsetRunHeader::new(95, 12295), ShortOffsetRunHeader::new(97, 12690), - ShortOffsetRunHeader::new(103, 42528), ShortOffsetRunHeader::new(115, 43056), - ShortOffsetRunHeader::new(119, 44016), ShortOffsetRunHeader::new(131, 65296), - ShortOffsetRunHeader::new(133, 65799), ShortOffsetRunHeader::new(135, 66273), - ShortOffsetRunHeader::new(141, 67672), ShortOffsetRunHeader::new(153, 68858), - ShortOffsetRunHeader::new(183, 69216), ShortOffsetRunHeader::new(189, 70736), - ShortOffsetRunHeader::new(209, 71248), ShortOffsetRunHeader::new(213, 71904), - ShortOffsetRunHeader::new(221, 72688), ShortOffsetRunHeader::new(225, 73552), - ShortOffsetRunHeader::new(233, 74752), ShortOffsetRunHeader::new(237, 90416), - ShortOffsetRunHeader::new(239, 92768), ShortOffsetRunHeader::new(241, 93552), - ShortOffsetRunHeader::new(249, 93824), ShortOffsetRunHeader::new(251, 118000), - ShortOffsetRunHeader::new(253, 119488), ShortOffsetRunHeader::new(255, 120782), - ShortOffsetRunHeader::new(261, 123200), ShortOffsetRunHeader::new(263, 123632), - ShortOffsetRunHeader::new(265, 124144), ShortOffsetRunHeader::new(267, 125127), - ShortOffsetRunHeader::new(271, 126065), ShortOffsetRunHeader::new(275, 127232), - ShortOffsetRunHeader::new(285, 130032), ShortOffsetRunHeader::new(287, 1244154), + ShortOffsetRunHeader::new(0, 1632), ShortOffsetRunHeader::new(7, 2406), + ShortOffsetRunHeader::new(13, 4160), ShortOffsetRunHeader::new(47, 4969), + ShortOffsetRunHeader::new(51, 5870), ShortOffsetRunHeader::new(53, 6470), + ShortOffsetRunHeader::new(61, 8304), ShortOffsetRunHeader::new(77, 9312), + ShortOffsetRunHeader::new(87, 10102), ShortOffsetRunHeader::new(91, 11517), + ShortOffsetRunHeader::new(93, 12295), ShortOffsetRunHeader::new(95, 12690), + ShortOffsetRunHeader::new(101, 42528), ShortOffsetRunHeader::new(113, 43056), + ShortOffsetRunHeader::new(117, 44016), ShortOffsetRunHeader::new(129, 65296), + ShortOffsetRunHeader::new(131, 65799), ShortOffsetRunHeader::new(133, 66273), + ShortOffsetRunHeader::new(139, 67672), ShortOffsetRunHeader::new(151, 68858), + ShortOffsetRunHeader::new(181, 69216), ShortOffsetRunHeader::new(187, 70736), + ShortOffsetRunHeader::new(207, 71248), ShortOffsetRunHeader::new(211, 71904), + ShortOffsetRunHeader::new(219, 72688), ShortOffsetRunHeader::new(223, 73552), + ShortOffsetRunHeader::new(231, 74752), ShortOffsetRunHeader::new(235, 90416), + ShortOffsetRunHeader::new(237, 92768), ShortOffsetRunHeader::new(239, 93552), + ShortOffsetRunHeader::new(247, 93824), ShortOffsetRunHeader::new(249, 118000), + ShortOffsetRunHeader::new(251, 119488), ShortOffsetRunHeader::new(253, 120782), + ShortOffsetRunHeader::new(259, 123200), ShortOffsetRunHeader::new(261, 123632), + ShortOffsetRunHeader::new(263, 124144), ShortOffsetRunHeader::new(265, 125127), + ShortOffsetRunHeader::new(269, 126065), ShortOffsetRunHeader::new(273, 127232), + ShortOffsetRunHeader::new(283, 130032), ShortOffsetRunHeader::new(285, 1244154), ]; - static OFFSETS: [u8; 289] = [ - 48, 10, 120, 2, 5, 1, 2, 3, 0, 10, 134, 10, 198, 10, 0, 10, 118, 10, 4, 6, 108, 10, 118, - 10, 118, 10, 2, 6, 110, 13, 115, 10, 8, 7, 103, 10, 104, 7, 7, 19, 109, 10, 96, 10, 118, 10, - 70, 20, 0, 10, 70, 10, 0, 20, 0, 3, 239, 10, 6, 10, 22, 10, 0, 10, 128, 11, 165, 10, 6, 10, - 182, 10, 86, 10, 134, 10, 6, 10, 0, 1, 3, 6, 6, 10, 198, 51, 2, 5, 0, 60, 78, 22, 0, 30, 0, - 1, 0, 1, 25, 9, 14, 3, 0, 4, 138, 10, 30, 8, 1, 15, 32, 10, 39, 15, 0, 10, 188, 10, 0, 6, - 154, 10, 38, 10, 198, 10, 22, 10, 86, 10, 0, 10, 0, 10, 0, 45, 12, 57, 17, 2, 0, 27, 36, 4, - 29, 1, 8, 1, 134, 5, 202, 10, 0, 8, 25, 7, 39, 9, 75, 5, 22, 6, 160, 2, 2, 16, 2, 46, 64, 9, - 52, 2, 30, 3, 75, 5, 104, 8, 24, 8, 41, 7, 0, 6, 48, 10, 6, 10, 0, 31, 158, 10, 42, 4, 112, - 7, 134, 30, 128, 10, 60, 10, 144, 10, 7, 20, 251, 10, 0, 10, 118, 10, 0, 10, 102, 10, 6, 20, - 76, 12, 0, 19, 93, 10, 0, 10, 86, 29, 227, 10, 70, 10, 0, 10, 102, 21, 0, 111, 0, 10, 0, 10, - 86, 10, 134, 10, 1, 7, 0, 10, 0, 23, 0, 10, 0, 20, 12, 20, 108, 25, 0, 50, 0, 10, 0, 10, 0, - 10, 247, 10, 0, 9, 128, 10, 0, 59, 1, 3, 1, 4, 76, 45, 1, 15, 0, 13, 0, 10, 0, + static OFFSETS: [u8; 287] = [ + 178, 2, 5, 1, 2, 3, 0, 10, 134, 10, 198, 10, 0, 10, 118, 10, 4, 6, 108, 10, 118, 10, 118, + 10, 2, 6, 110, 13, 115, 10, 8, 7, 103, 10, 104, 7, 7, 19, 109, 10, 96, 10, 118, 10, 70, 20, + 0, 10, 70, 10, 0, 20, 0, 3, 239, 10, 6, 10, 22, 10, 0, 10, 128, 11, 165, 10, 6, 10, 182, 10, + 86, 10, 134, 10, 6, 10, 0, 1, 3, 6, 6, 10, 198, 51, 2, 5, 0, 60, 78, 22, 0, 30, 0, 1, 0, 1, + 25, 9, 14, 3, 0, 4, 138, 10, 30, 8, 1, 15, 32, 10, 39, 15, 0, 10, 188, 10, 0, 6, 154, 10, + 38, 10, 198, 10, 22, 10, 86, 10, 0, 10, 0, 10, 0, 45, 12, 57, 17, 2, 0, 27, 36, 4, 29, 1, 8, + 1, 134, 5, 202, 10, 0, 8, 25, 7, 39, 9, 75, 5, 22, 6, 160, 2, 2, 16, 2, 46, 64, 9, 52, 2, + 30, 3, 75, 5, 104, 8, 24, 8, 41, 7, 0, 6, 48, 10, 6, 10, 0, 31, 158, 10, 42, 4, 112, 7, 134, + 30, 128, 10, 60, 10, 144, 10, 7, 20, 251, 10, 0, 10, 118, 10, 0, 10, 102, 10, 6, 20, 76, 12, + 0, 19, 93, 10, 0, 10, 86, 29, 227, 10, 70, 10, 0, 10, 102, 21, 0, 111, 0, 10, 0, 10, 86, 10, + 134, 10, 1, 7, 0, 10, 0, 23, 0, 10, 0, 20, 12, 20, 108, 25, 0, 50, 0, 10, 0, 10, 0, 10, 247, + 10, 0, 9, 128, 10, 0, 59, 1, 3, 1, 4, 76, 45, 1, 15, 0, 13, 0, 10, 0, ]; + #[inline] pub fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + (c as u32) >= 0xb2 && lookup_slow(c) + } + + #[inline(never)] + fn lookup_slow(c: char) -> bool { const { assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); let mut i = 0; @@ -633,34 +671,34 @@ pub mod n { #[rustfmt::skip] pub mod uppercase { static BITSET_CHUNKS_MAP: [u8; 125] = [ - 12, 15, 6, 6, 0, 6, 6, 2, 4, 11, 6, 16, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 6, 6, 5, 6, 14, 6, 10, 6, 6, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, - 6, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 13, 6, 6, - 6, 6, 9, 6, 3, + 3, 14, 6, 6, 0, 6, 6, 2, 5, 12, 6, 15, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 9, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 7, 6, 13, 6, 11, 6, 6, 1, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, + 6, 8, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 16, 6, 6, + 6, 6, 10, 6, 4, ]; static BITSET_INDEX_CHUNKS: [[u8; 16]; 17] = [ - [44, 44, 5, 35, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 5, 1], + [44, 44, 5, 35, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 5, 0], [44, 44, 5, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], - [44, 44, 40, 44, 44, 44, 44, 44, 17, 17, 63, 17, 43, 29, 24, 23], + [44, 44, 40, 44, 44, 44, 44, 44, 17, 17, 62, 17, 43, 29, 24, 23], + [44, 44, 44, 32, 36, 21, 22, 15, 13, 34, 44, 44, 44, 11, 30, 39], [44, 44, 44, 44, 9, 8, 45, 44, 44, 44, 44, 44, 44, 44, 44, 44], - [44, 44, 44, 44, 37, 28, 67, 44, 44, 44, 44, 44, 44, 44, 44, 44], - [44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 0, 44, 44, 44], + [44, 44, 44, 44, 37, 28, 66, 44, 44, 44, 44, 44, 44, 44, 44, 44], [44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], - [44, 44, 44, 44, 44, 44, 44, 44, 44, 55, 44, 44, 44, 44, 44, 44], - [44, 44, 44, 44, 44, 44, 44, 44, 44, 62, 61, 44, 20, 14, 16, 4], - [44, 44, 44, 44, 56, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], - [44, 44, 59, 44, 44, 31, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], - [44, 44, 60, 46, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], - [44, 49, 44, 32, 36, 21, 22, 15, 13, 34, 44, 44, 44, 11, 30, 39], - [52, 54, 26, 50, 12, 7, 25, 51, 41, 53, 6, 3, 66, 65, 64, 68], - [57, 44, 9, 47, 44, 42, 33, 44, 44, 44, 44, 44, 44, 44, 44, 44], - [58, 19, 2, 18, 10, 48, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], - [58, 38, 17, 27, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], + [44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 57, 44, 44, 44], + [44, 44, 44, 44, 44, 44, 44, 44, 44, 49, 44, 44, 44, 44, 44, 44], + [44, 44, 44, 44, 44, 44, 44, 44, 44, 61, 60, 44, 20, 14, 16, 4], + [44, 44, 44, 44, 50, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], + [44, 44, 53, 44, 44, 31, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], + [44, 44, 54, 46, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], + [51, 44, 9, 47, 44, 42, 33, 44, 44, 44, 44, 44, 44, 44, 44, 44], + [52, 19, 2, 18, 10, 48, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], + [52, 38, 17, 27, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44], + [58, 1, 26, 55, 12, 7, 25, 56, 41, 59, 6, 3, 65, 64, 63, 67], ]; static BITSET_CANONICAL: [u64; 44] = [ - 0b0000011111111111111111111111111000000000000000000000000000000000, 0b0000000000111111111111111111111111111111111111111111111111111111, + 0b1111111111111111111111110000000000000000000000000011111111111111, 0b0101010101010101010101010101010101010101010101010101010000000001, 0b0000011111111111111111111111110000000000000000000000000000000001, 0b0000000000100000000000000000000000010101010000010001101011110101, @@ -704,13 +742,15 @@ pub mod uppercase { 0b1111011111111111000000000000000000000000000000000000000000000000, 0b1111111100000000111111110000000000111111000000001111111100000000, ]; - static BITSET_MAPPING: [(u8, u8); 25] = [ - (0, 187), (0, 177), (0, 171), (0, 167), (0, 164), (0, 32), (0, 47), (0, 51), (0, 121), - (0, 117), (0, 109), (1, 150), (1, 148), (1, 142), (1, 134), (1, 131), (1, 64), (2, 164), - (2, 146), (2, 20), (3, 146), (3, 140), (3, 134), (4, 178), (4, 171), + static BITSET_MAPPING: [(u8, u8); 24] = [ + (0, 182), (0, 74), (0, 166), (0, 162), (0, 159), (0, 150), (0, 148), (0, 142), (0, 134), + (0, 131), (0, 64), (1, 66), (1, 70), (1, 83), (1, 12), (1, 8), (2, 164), (2, 146), (2, 20), + (3, 146), (3, 140), (3, 134), (4, 178), (4, 171), ]; pub const fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + (c as u32) >= 0xc0 && super::bitset_search( c as u32, &BITSET_CHUNKS_MAP, @@ -724,8 +764,8 @@ pub mod uppercase { #[rustfmt::skip] pub mod white_space { static WHITESPACE_MAP: [u8; 256] = [ - 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -736,6 +776,7 @@ pub mod white_space { ]; #[inline] pub const fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); match c as u32 >> 8 { 0 => WHITESPACE_MAP[c as usize & 0xff] & 1 != 0, 22 => c as u32 == 0x1680, diff --git a/src/tools/unicode-table-generator/src/cascading_map.rs b/src/tools/unicode-table-generator/src/cascading_map.rs index 78a7bba320870..56e6401908dcf 100644 --- a/src/tools/unicode-table-generator/src/cascading_map.rs +++ b/src/tools/unicode-table-generator/src/cascading_map.rs @@ -64,6 +64,7 @@ impl RawEmitter { writeln!(&mut self.file, "#[inline]").unwrap(); writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap(); writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap(); for arm in arms { writeln!(&mut self.file, " {arm},").unwrap(); diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 1d70eebdbc6c2..70455f0d5b879 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -196,6 +196,7 @@ fn load_data() -> UnicodeData { .into_iter() .flatten() .flat_map(|cp| cp.scalar()) + .filter(|c| !c.is_ascii()) .map(u32::from) .collect::>(); (prop, ranges_from_set(&codepoints)) diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index 45dc450c4f67d..7905247cd7a80 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -98,6 +98,7 @@ impl RawEmitter { self.blank_line(); writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap(); if first_code_point > 0x7f { writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} &&").unwrap(); } diff --git a/src/tools/unicode-table-generator/src/skiplist.rs b/src/tools/unicode-table-generator/src/skiplist.rs index 34c9802e122f6..660a8f342f7a7 100644 --- a/src/tools/unicode-table-generator/src/skiplist.rs +++ b/src/tools/unicode-table-generator/src/skiplist.rs @@ -99,6 +99,7 @@ impl RawEmitter { if first_code_point > 0x7f { writeln!(&mut self.file, "#[inline]").unwrap(); writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap(); writeln!(&mut self.file, " (c as u32) >= {first_code_point:#04x} && lookup_slow(c)") .unwrap(); writeln!(&mut self.file, "}}").unwrap(); @@ -107,6 +108,7 @@ impl RawEmitter { writeln!(&mut self.file, "fn lookup_slow(c: char) -> bool {{").unwrap(); } else { writeln!(&mut self.file, "pub fn lookup(c: char) -> bool {{").unwrap(); + writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap(); } writeln!(&mut self.file, " const {{").unwrap(); writeln!( From 9a1accc762543c92f63c530363738942b4329822 Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Sun, 10 Aug 2025 18:29:21 +0000 Subject: [PATCH 08/10] optimization: Eliminate `Cased` table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `Cased` is a derived property - it is the union of the `Lowercase` property, the `Uppercase` property, and the `Titlecase_Letter` generaral category. We already have lookup tables for `Lowercase` and `Uppercase`, and `Titlecase_Letter` is very small. So instead of duplicating a lookup table for `Cased`, just test each of those properties in turn. This probably will be slower than the old approach, but it is not a public API: it is only used in `string::to_lower` when deciding when a Greek "sigma" should be mapped to `ς` or to `σ`. This is a very rare case, so should not be performance sensitive. --- library/core/src/char/methods.rs | 12 ++- library/core/src/unicode/mod.rs | 2 +- library/core/src/unicode/unicode_data.rs | 90 ++++++++----------- src/tools/unicode-table-generator/src/main.rs | 2 +- 4 files changed, 45 insertions(+), 61 deletions(-) diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index f751713bc3a04..2dd510adc61a4 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -6,7 +6,7 @@ use crate::slice; use crate::str::from_utf8_unchecked_mut; use crate::ub_checks::assert_unsafe_precondition; use crate::unicode::printable::is_printable; -use crate::unicode::{self, conversions}; +use crate::unicode::{self, Case_Ignorable, conversions}; impl char { /// The lowest valid code point a `char` can have, `'\0'`. @@ -968,7 +968,7 @@ impl char { !self.is_ascii() && unicode::Grapheme_Extend(self) } - /// Returns `true` if this `char` has the `Cased` property. + /// Returns `true` if this `char` has the `Cased` derived property. /// /// `Cased` is described in Chapter 4 (Character Properties) of the [Unicode Standard] and /// specified in the [Unicode Character Database][ucd] [`DerivedCoreProperties.txt`]. @@ -981,7 +981,11 @@ impl char { #[doc(hidden)] #[unstable(feature = "char_internals", reason = "exposed only for libstd", issue = "none")] pub fn is_cased(self) -> bool { - if self.is_ascii() { self.is_ascii_alphabetic() } else { unicode::Cased(self) } + if self.is_ascii() { + self.is_ascii_alphabetic() + } else { + unicode::Lowercase(self) || unicode::Uppercase(self) || unicode::Lt(self) + } } /// Returns `true` if this `char` has the `Case_Ignorable` property. @@ -1000,7 +1004,7 @@ impl char { if self.is_ascii() { matches!(self, '\'' | '.' | ':' | '^' | '`') } else { - unicode::Case_Ignorable(self) + Case_Ignorable(self) } } diff --git a/library/core/src/unicode/mod.rs b/library/core/src/unicode/mod.rs index 6f4bb60676855..a2493ef2764dc 100644 --- a/library/core/src/unicode/mod.rs +++ b/library/core/src/unicode/mod.rs @@ -8,10 +8,10 @@ pub use unicode_data::conversions; #[rustfmt::skip] pub(crate) use unicode_data::alphabetic::lookup as Alphabetic; pub(crate) use unicode_data::case_ignorable::lookup as Case_Ignorable; -pub(crate) use unicode_data::cased::lookup as Cased; pub(crate) use unicode_data::cc::lookup as Cc; pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend; pub(crate) use unicode_data::lowercase::lookup as Lowercase; +pub(crate) use unicode_data::lt::lookup as Lt; pub(crate) use unicode_data::n::lookup as N; pub(crate) use unicode_data::uppercase::lookup as Uppercase; pub(crate) use unicode_data::white_space::lookup as White_Space; diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index eab2899057d8a..f451e114731c3 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -1,16 +1,16 @@ ///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually! // Alphabetic : 1723 bytes, 142707 codepoints in 755 ranges (U+0000AA - U+0323B0) using skiplist // Case_Ignorable : 1043 bytes, 2744 codepoints in 447 ranges (U+0000A8 - U+0E01F0) using skiplist -// Cased : 403 bytes, 4526 codepoints in 157 ranges (U+0000AA - U+01F18A) using skiplist // Cc : 7 bytes, 32 codepoints in 1 ranges (U+000080 - U+0000A0) using skiplist // Grapheme_Extend : 887 bytes, 2193 codepoints in 375 ranges (U+000300 - U+0E01F0) using skiplist // Lowercase : 933 bytes, 2543 codepoints in 674 ranges (U+0000AA - U+01E944) using bitset +// Lt : 33 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using skiplist // N : 455 bytes, 1901 codepoints in 143 ranges (U+0000B2 - U+01FBFA) using skiplist // Uppercase : 797 bytes, 1952 codepoints in 655 ranges (U+0000C0 - U+01F18A) using bitset // White_Space : 256 bytes, 19 codepoints in 8 ranges (U+000085 - U+003001) using cascading // to_lower : 11484 bytes // to_upper : 13432 bytes -// Total : 31420 bytes +// Total : 31050 bytes #[inline(always)] const fn bitset_search< @@ -338,59 +338,6 @@ pub mod case_ignorable { } } -#[rustfmt::skip] -pub mod cased { - use super::ShortOffsetRunHeader; - - static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 22] = [ - ShortOffsetRunHeader::new(0, 4256), ShortOffsetRunHeader::new(51, 5024), - ShortOffsetRunHeader::new(61, 7296), ShortOffsetRunHeader::new(65, 7958), - ShortOffsetRunHeader::new(74, 9398), ShortOffsetRunHeader::new(149, 11264), - ShortOffsetRunHeader::new(151, 42560), ShortOffsetRunHeader::new(163, 43824), - ShortOffsetRunHeader::new(183, 64256), ShortOffsetRunHeader::new(189, 65313), - ShortOffsetRunHeader::new(193, 66560), ShortOffsetRunHeader::new(197, 67456), - ShortOffsetRunHeader::new(219, 68736), ShortOffsetRunHeader::new(227, 71840), - ShortOffsetRunHeader::new(235, 93760), ShortOffsetRunHeader::new(237, 119808), - ShortOffsetRunHeader::new(239, 120486), ShortOffsetRunHeader::new(276, 122624), - ShortOffsetRunHeader::new(299, 122928), ShortOffsetRunHeader::new(305, 125184), - ShortOffsetRunHeader::new(307, 127280), ShortOffsetRunHeader::new(309, 1241482), - ]; - static OFFSETS: [u8; 315] = [ - 170, 1, 10, 1, 4, 1, 5, 23, 1, 31, 1, 195, 1, 4, 4, 208, 1, 36, 7, 2, 30, 5, 96, 1, 42, 4, - 2, 2, 2, 4, 1, 1, 6, 1, 1, 3, 1, 1, 1, 20, 1, 83, 1, 139, 8, 166, 1, 38, 9, 41, 0, 38, 1, 1, - 5, 1, 2, 43, 1, 4, 0, 86, 2, 6, 0, 11, 5, 43, 2, 3, 64, 192, 64, 0, 2, 6, 2, 38, 2, 6, 2, 8, - 1, 1, 1, 1, 1, 1, 1, 31, 2, 53, 1, 7, 1, 1, 3, 3, 1, 7, 3, 4, 2, 6, 4, 13, 5, 3, 1, 7, 116, - 1, 13, 1, 16, 13, 101, 1, 4, 1, 2, 10, 1, 1, 3, 5, 6, 1, 1, 1, 1, 1, 1, 4, 1, 6, 4, 1, 2, 4, - 5, 5, 4, 1, 17, 32, 3, 2, 0, 52, 0, 229, 6, 4, 3, 2, 12, 38, 1, 1, 5, 1, 0, 46, 18, 30, 132, - 102, 3, 4, 1, 62, 2, 2, 1, 1, 1, 8, 21, 5, 1, 3, 0, 43, 1, 14, 6, 80, 0, 7, 12, 5, 0, 26, 6, - 26, 0, 80, 96, 36, 4, 36, 116, 11, 1, 15, 1, 7, 1, 2, 1, 11, 1, 15, 1, 7, 1, 2, 0, 1, 2, 3, - 1, 42, 1, 9, 0, 51, 13, 51, 93, 22, 10, 22, 0, 64, 0, 64, 0, 85, 1, 71, 1, 2, 2, 1, 2, 2, 2, - 4, 1, 12, 1, 1, 1, 7, 1, 65, 1, 4, 2, 8, 1, 7, 1, 28, 1, 4, 1, 5, 1, 1, 3, 7, 1, 0, 2, 25, - 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 31, 1, 25, 1, 8, 0, 10, 1, 20, 6, 6, 0, - 62, 0, 68, 0, 26, 6, 26, 6, 26, 0, - ]; - #[inline] - pub fn lookup(c: char) -> bool { - debug_assert!(!c.is_ascii()); - (c as u32) >= 0xaa && lookup_slow(c) - } - - #[inline(never)] - fn lookup_slow(c: char) -> bool { - const { - assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); - let mut i = 0; - while i < SHORT_OFFSET_RUNS.len() { - assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len()); - i += 1; - } - } - // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX` - // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`. - unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) } - } -} - #[rustfmt::skip] pub mod cc { use super::ShortOffsetRunHeader; @@ -605,6 +552,39 @@ pub mod lowercase { } } +#[rustfmt::skip] +pub mod lt { + use super::ShortOffsetRunHeader; + + static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 3] = [ + ShortOffsetRunHeader::new(0, 453), ShortOffsetRunHeader::new(1, 8072), + ShortOffsetRunHeader::new(9, 1122301), + ]; + static OFFSETS: [u8; 21] = [ + 0, 1, 2, 1, 2, 1, 38, 1, 0, 8, 8, 8, 8, 8, 12, 1, 15, 1, 47, 1, 0, + ]; + #[inline] + pub fn lookup(c: char) -> bool { + debug_assert!(!c.is_ascii()); + (c as u32) >= 0x1c5 && lookup_slow(c) + } + + #[inline(never)] + fn lookup_slow(c: char) -> bool { + const { + assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); + let mut i = 0; + while i < SHORT_OFFSET_RUNS.len() { + assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len()); + i += 1; + } + } + // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX` + // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`. + unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) } + } +} + #[rustfmt::skip] pub mod n { use super::ShortOffsetRunHeader; diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 70455f0d5b879..9c014895445d6 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -90,7 +90,7 @@ static PROPERTIES: &[&str] = &[ "Alphabetic", "Lowercase", "Uppercase", - "Cased", + "Lt", "Case_Ignorable", "Grapheme_Extend", "White_Space", From b7fa8ef1336f996874ba48512a7b2996aeeefcd3 Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Sun, 10 Aug 2025 00:54:35 +0100 Subject: [PATCH 09/10] optimization: Use `match` for small sets If the number of codepoint ranges in a set is sufficiently small, it may be better to simply use a `match` expression rather than a lookup table. The instructions to implement the `match` may be slightly bigger than the table that it replaced (hard to predict, depends on architecture and whatever optimzations LLVM applies), but in return we elimate the lookup tables and avoid the slower binary search. --- library/core/src/unicode/unicode_data.rs | 99 ++++++------------- .../src/cascading_map.rs | 78 --------------- src/tools/unicode-table-generator/src/main.rs | 10 +- .../unicode-table-generator/src/match.rs | 30 ++++++ .../src/raw_emitter.rs | 15 ++- 5 files changed, 69 insertions(+), 163 deletions(-) delete mode 100644 src/tools/unicode-table-generator/src/cascading_map.rs create mode 100644 src/tools/unicode-table-generator/src/match.rs diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index f451e114731c3..93918eae6c867 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -1,16 +1,16 @@ ///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually! // Alphabetic : 1723 bytes, 142707 codepoints in 755 ranges (U+0000AA - U+0323B0) using skiplist // Case_Ignorable : 1043 bytes, 2744 codepoints in 447 ranges (U+0000A8 - U+0E01F0) using skiplist -// Cc : 7 bytes, 32 codepoints in 1 ranges (U+000080 - U+0000A0) using skiplist +// Cc : 0 bytes, 32 codepoints in 1 ranges (U+000080 - U+0000A0) using match // Grapheme_Extend : 887 bytes, 2193 codepoints in 375 ranges (U+000300 - U+0E01F0) using skiplist // Lowercase : 933 bytes, 2543 codepoints in 674 ranges (U+0000AA - U+01E944) using bitset -// Lt : 33 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using skiplist +// Lt : 0 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using match // N : 455 bytes, 1901 codepoints in 143 ranges (U+0000B2 - U+01FBFA) using skiplist // Uppercase : 797 bytes, 1952 codepoints in 655 ranges (U+0000C0 - U+01F18A) using bitset -// White_Space : 256 bytes, 19 codepoints in 8 ranges (U+000085 - U+003001) using cascading +// White_Space : 0 bytes, 19 codepoints in 8 ranges (U+000085 - U+003001) using match // to_lower : 11484 bytes // to_upper : 13432 bytes -// Total : 31050 bytes +// Total : 30754 bytes #[inline(always)] const fn bitset_search< @@ -340,33 +340,13 @@ pub mod case_ignorable { #[rustfmt::skip] pub mod cc { - use super::ShortOffsetRunHeader; - - static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 1] = [ - ShortOffsetRunHeader::new(0, 1114272), - ]; - static OFFSETS: [u8; 3] = [ - 128, 32, 0, - ]; #[inline] - pub fn lookup(c: char) -> bool { + pub const fn lookup(c: char) -> bool { debug_assert!(!c.is_ascii()); - (c as u32) >= 0x80 && lookup_slow(c) - } - - #[inline(never)] - fn lookup_slow(c: char) -> bool { - const { - assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); - let mut i = 0; - while i < SHORT_OFFSET_RUNS.len() { - assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len()); - i += 1; - } + match c as u32 { + 0x80..=0x9f => true, + _ => false, } - // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX` - // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`. - unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) } } } @@ -554,34 +534,22 @@ pub mod lowercase { #[rustfmt::skip] pub mod lt { - use super::ShortOffsetRunHeader; - - static SHORT_OFFSET_RUNS: [ShortOffsetRunHeader; 3] = [ - ShortOffsetRunHeader::new(0, 453), ShortOffsetRunHeader::new(1, 8072), - ShortOffsetRunHeader::new(9, 1122301), - ]; - static OFFSETS: [u8; 21] = [ - 0, 1, 2, 1, 2, 1, 38, 1, 0, 8, 8, 8, 8, 8, 12, 1, 15, 1, 47, 1, 0, - ]; #[inline] - pub fn lookup(c: char) -> bool { + pub const fn lookup(c: char) -> bool { debug_assert!(!c.is_ascii()); - (c as u32) >= 0x1c5 && lookup_slow(c) - } - - #[inline(never)] - fn lookup_slow(c: char) -> bool { - const { - assert!(SHORT_OFFSET_RUNS.last().unwrap().0 > char::MAX as u32); - let mut i = 0; - while i < SHORT_OFFSET_RUNS.len() { - assert!(SHORT_OFFSET_RUNS[i].start_index() < OFFSETS.len()); - i += 1; - } + match c as u32 { + 0x1c5 => true, + 0x1c8 => true, + 0x1cb => true, + 0x1f2 => true, + 0x1f88..=0x1f8f => true, + 0x1f98..=0x1f9f => true, + 0x1fa8..=0x1faf => true, + 0x1fbc => true, + 0x1fcc => true, + 0x1ffc => true, + _ => false, } - // SAFETY: We just ensured the last element of `SHORT_OFFSET_RUNS` is greater than `std::char::MAX` - // and the start indices of all elements in `SHORT_OFFSET_RUNS` are smaller than `OFFSETS.len()`. - unsafe { super::skip_search(c, &SHORT_OFFSET_RUNS, &OFFSETS) } } } @@ -743,25 +711,18 @@ pub mod uppercase { #[rustfmt::skip] pub mod white_space { - static WHITESPACE_MAP: [u8; 256] = [ - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]; #[inline] pub const fn lookup(c: char) -> bool { debug_assert!(!c.is_ascii()); - match c as u32 >> 8 { - 0 => WHITESPACE_MAP[c as usize & 0xff] & 1 != 0, - 22 => c as u32 == 0x1680, - 32 => WHITESPACE_MAP[c as usize & 0xff] & 2 != 0, - 48 => c as u32 == 0x3000, + match c as u32 { + 0x85 => true, + 0xa0 => true, + 0x1680 => true, + 0x2000..=0x200a => true, + 0x2028..=0x2029 => true, + 0x202f => true, + 0x205f => true, + 0x3000 => true, _ => false, } } diff --git a/src/tools/unicode-table-generator/src/cascading_map.rs b/src/tools/unicode-table-generator/src/cascading_map.rs deleted file mode 100644 index 56e6401908dcf..0000000000000 --- a/src/tools/unicode-table-generator/src/cascading_map.rs +++ /dev/null @@ -1,78 +0,0 @@ -use std::collections::HashMap; -use std::fmt::Write as _; -use std::ops::Range; - -use crate::fmt_list; -use crate::raw_emitter::RawEmitter; - -impl RawEmitter { - pub fn emit_cascading_map(&mut self, ranges: &[Range]) -> bool { - let mut map: [u8; 256] = [ - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - ]; - - let points = ranges - .iter() - .flat_map(|r| (r.start..r.end).collect::>()) - .collect::>(); - - println!("there are {} points", points.len()); - - // how many distinct ranges need to be counted? - let mut codepoints_by_high_bytes = HashMap::>::new(); - for point in points { - // assert that there is no whitespace over the 0x3000 range. - assert!(point <= 0x3000, "the highest unicode whitespace value has changed"); - let high_bytes = point as usize >> 8; - let codepoints = codepoints_by_high_bytes.entry(high_bytes).or_default(); - codepoints.push(point); - } - - let mut bit_for_high_byte = 1u8; - let mut arms = Vec::::new(); - - let mut high_bytes: Vec = codepoints_by_high_bytes.keys().copied().collect(); - high_bytes.sort(); - for high_byte in high_bytes { - let codepoints = codepoints_by_high_bytes.get_mut(&high_byte).unwrap(); - if codepoints.len() == 1 { - let ch = codepoints.pop().unwrap(); - arms.push(format!("{high_byte} => c as u32 == {ch:#04x}")); - continue; - } - // more than 1 codepoint in this arm - for codepoint in codepoints { - map[(*codepoint & 0xff) as usize] |= bit_for_high_byte; - } - arms.push(format!( - "{high_byte} => WHITESPACE_MAP[c as usize & 0xff] & {bit_for_high_byte} != 0" - )); - bit_for_high_byte <<= 1; - } - - writeln!(&mut self.file, "static WHITESPACE_MAP: [u8; 256] = [{}];", fmt_list(map.iter())) - .unwrap(); - self.bytes_used += 256; - - writeln!(&mut self.file, "#[inline]").unwrap(); - writeln!(&mut self.file, "pub const fn lookup(c: char) -> bool {{").unwrap(); - writeln!(&mut self.file, " debug_assert!(!c.is_ascii());").unwrap(); - writeln!(&mut self.file, " match c as u32 >> 8 {{").unwrap(); - for arm in arms { - writeln!(&mut self.file, " {arm},").unwrap(); - } - writeln!(&mut self.file, " _ => false,").unwrap(); - writeln!(&mut self.file, " }}").unwrap(); - writeln!(&mut self.file, "}}").unwrap(); - - true - } -} diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 9c014895445d6..68139d177666d 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -78,13 +78,13 @@ use std::ops::Range; use ucd_parse::Codepoints; -mod cascading_map; mod case_mapping; +mod r#match; mod raw_emitter; mod skiplist; mod unicode_download; -use raw_emitter::{RawEmitter, emit_codepoints, emit_whitespace}; +use raw_emitter::{RawEmitter, emit_codepoints}; static PROPERTIES: &[&str] = &[ "Alphabetic", @@ -239,11 +239,7 @@ fn main() { let datapoints = ranges.iter().map(|r| r.end - r.start).sum::(); let mut emitter = RawEmitter::new(); - if property == &"White_Space" { - emit_whitespace(&mut emitter, ranges); - } else { - emit_codepoints(&mut emitter, ranges); - } + emit_codepoints(&mut emitter, ranges); modules.push((property.to_lowercase().to_string(), emitter.file)); table_file.push_str(&format!( diff --git a/src/tools/unicode-table-generator/src/match.rs b/src/tools/unicode-table-generator/src/match.rs new file mode 100644 index 0000000000000..440178f919e70 --- /dev/null +++ b/src/tools/unicode-table-generator/src/match.rs @@ -0,0 +1,30 @@ +use std::fmt::{self, Write as _}; +use std::ops::Range; + +use crate::raw_emitter::RawEmitter; + +impl RawEmitter { + pub fn emit_match(&mut self, ranges: &[Range]) -> Result<(), fmt::Error> { + let arms: Vec<_> = ranges + .iter() + .map(|range| match range.len() { + 1 => format!("{:#x} => true,", range.start), + + // minus one because inclusive range pattern + _ => format!("{:#x}..={:#x} => true,", range.start, range.end - 1), + }) + .collect(); + + writeln!(self.file, "#[inline]")?; + writeln!(self.file, "pub const fn lookup(c: char) -> bool {{")?; + writeln!(self.file, " debug_assert!(!c.is_ascii());")?; + writeln!(self.file, " match c as u32 {{")?; + for arm in arms { + writeln!(self.file, " {arm}")?; + } + writeln!(self.file, " _ => false,")?; + writeln!(self.file, " }}")?; + writeln!(self.file, "}}")?; + Ok(()) + } +} diff --git a/src/tools/unicode-table-generator/src/raw_emitter.rs b/src/tools/unicode-table-generator/src/raw_emitter.rs index 7905247cd7a80..db7c6b42af17e 100644 --- a/src/tools/unicode-table-generator/src/raw_emitter.rs +++ b/src/tools/unicode-table-generator/src/raw_emitter.rs @@ -156,6 +156,12 @@ impl RawEmitter { pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range]) { emitter.blank_line(); + if ranges.len() <= 10 { + emitter.emit_match(ranges).unwrap(); + emitter.desc = String::from("match"); + return; + } + let mut bitset = emitter.clone(); let bitset_ok = bitset.emit_bitset(ranges).is_ok(); @@ -171,15 +177,6 @@ pub fn emit_codepoints(emitter: &mut RawEmitter, ranges: &[Range]) { } } -pub fn emit_whitespace(emitter: &mut RawEmitter, ranges: &[Range]) { - emitter.blank_line(); - - let mut cascading = emitter.clone(); - cascading.emit_cascading_map(ranges); - *emitter = cascading; - emitter.desc = String::from("cascading"); -} - struct Canonicalized { canonical_words: Vec, canonicalized_words: Vec<(u8, u8)>, From 3d5b2b81dfbebf9881fd6547a159bfcc64b5f88b Mon Sep 17 00:00:00 2001 From: Karl Meakin Date: Mon, 11 Aug 2025 00:57:29 +0000 Subject: [PATCH 10/10] refactor: Hard-code `char::is_control` According to https://www.unicode.org/policies/stability_policy.html#Property_Value, the set of codepoints in `Cc` will never change. So we can hard-code the patterns to match against instead of using a table. --- library/core/src/char/methods.rs | 6 +++++- library/core/src/unicode/mod.rs | 1 - library/core/src/unicode/unicode_data.rs | 13 ------------- src/tools/unicode-table-generator/src/main.rs | 1 - 4 files changed, 5 insertions(+), 16 deletions(-) diff --git a/library/core/src/char/methods.rs b/library/core/src/char/methods.rs index 2dd510adc61a4..838a4a61fbbd7 100644 --- a/library/core/src/char/methods.rs +++ b/library/core/src/char/methods.rs @@ -950,7 +950,11 @@ impl char { #[stable(feature = "rust1", since = "1.0.0")] #[inline] pub fn is_control(self) -> bool { - if self.is_ascii() { self.is_ascii_control() } else { unicode::Cc(self) } + // According to + // https://www.unicode.org/policies/stability_policy.html#Property_Value, + // the set of codepoints in `Cc` will never change. So we can hard-code + // the patterns to match against instead of using a table. + matches!(self, '\0'..='\x1f' | '\x7f'..='\u{9f}') } /// Returns `true` if this `char` has the `Grapheme_Extend` property. diff --git a/library/core/src/unicode/mod.rs b/library/core/src/unicode/mod.rs index a2493ef2764dc..db6f9c5b350fc 100644 --- a/library/core/src/unicode/mod.rs +++ b/library/core/src/unicode/mod.rs @@ -8,7 +8,6 @@ pub use unicode_data::conversions; #[rustfmt::skip] pub(crate) use unicode_data::alphabetic::lookup as Alphabetic; pub(crate) use unicode_data::case_ignorable::lookup as Case_Ignorable; -pub(crate) use unicode_data::cc::lookup as Cc; pub(crate) use unicode_data::grapheme_extend::lookup as Grapheme_Extend; pub(crate) use unicode_data::lowercase::lookup as Lowercase; pub(crate) use unicode_data::lt::lookup as Lt; diff --git a/library/core/src/unicode/unicode_data.rs b/library/core/src/unicode/unicode_data.rs index 93918eae6c867..ae1f1c29f8218 100644 --- a/library/core/src/unicode/unicode_data.rs +++ b/library/core/src/unicode/unicode_data.rs @@ -1,7 +1,6 @@ ///! This file is generated by `./x run src/tools/unicode-table-generator`; do not edit manually! // Alphabetic : 1723 bytes, 142707 codepoints in 755 ranges (U+0000AA - U+0323B0) using skiplist // Case_Ignorable : 1043 bytes, 2744 codepoints in 447 ranges (U+0000A8 - U+0E01F0) using skiplist -// Cc : 0 bytes, 32 codepoints in 1 ranges (U+000080 - U+0000A0) using match // Grapheme_Extend : 887 bytes, 2193 codepoints in 375 ranges (U+000300 - U+0E01F0) using skiplist // Lowercase : 933 bytes, 2543 codepoints in 674 ranges (U+0000AA - U+01E944) using bitset // Lt : 0 bytes, 31 codepoints in 10 ranges (U+0001C5 - U+001FFD) using match @@ -338,18 +337,6 @@ pub mod case_ignorable { } } -#[rustfmt::skip] -pub mod cc { - #[inline] - pub const fn lookup(c: char) -> bool { - debug_assert!(!c.is_ascii()); - match c as u32 { - 0x80..=0x9f => true, - _ => false, - } - } -} - #[rustfmt::skip] pub mod grapheme_extend { use super::ShortOffsetRunHeader; diff --git a/src/tools/unicode-table-generator/src/main.rs b/src/tools/unicode-table-generator/src/main.rs index 68139d177666d..85dc597b50a89 100644 --- a/src/tools/unicode-table-generator/src/main.rs +++ b/src/tools/unicode-table-generator/src/main.rs @@ -94,7 +94,6 @@ static PROPERTIES: &[&str] = &[ "Case_Ignorable", "Grapheme_Extend", "White_Space", - "Cc", "N", ];