3434#![allow(missing_docs, non_upper_case_globals, non_snake_case)]
3535'''
3636
# Three-part Unicode version number used by this script.
UNICODE_VERSION = (13, 0, 0)

# The same version rendered as a dotted string, e.g. "13.0.0".
UNICODE_VERSION_NUMBER = ".".join(str(part) for part in UNICODE_VERSION)
4040
@@ -54,7 +54,7 @@ def load_properties(f, interestingprops = None):
5454 re1 = re .compile (r"^ *([0-9A-F]+) *; *(\w+)" )
5555 re2 = re .compile (r"^ *([0-9A-F]+)\.\.([0-9A-F]+) *; *(\w+)" )
5656
57- for line in fileinput .input (os .path .basename (f )):
57+ for line in fileinput .input (os .path .basename (f ), openhook = fileinput . hook_encoded ( "utf-8" ) ):
5858 prop = None
5959 d_lo = 0
6060 d_hi = 0
@@ -81,6 +81,28 @@ def load_properties(f, interestingprops = None):
8181
8282 return props
8383
def load_confusables(f):
    """Fetch and parse a confusables data file into a list of mappings.

    ``f`` is passed to ``fetch`` and then read, as UTF-8, from
    ``os.path.basename(f)``.  Only lines made of two columns of
    space-terminated uppercase-hex code points (each column followed by a
    semicolon and a tab) are used; every other line is skipped.

    Returns a list of ``(source, targets)`` tuples in file order, where
    ``source`` is an integer code point and ``targets`` is the list of
    integer code points found in the second column.

    Raises ``Exception`` if the first column holds more than one code point.
    """
    fetch(f)
    line_re = re.compile(r"^((?:[0-9A-F]+ )+);\t((?:[0-9A-F]+ )+);\t\w*")
    confusables = []

    for line in fileinput.input(os.path.basename(f), openhook=fileinput.hook_encoded("utf-8")):
        m = line_re.match(line)
        if not m:
            continue
        sources = m.group(1).split()
        if len(sources) != 1:
            raise Exception('More than one code point in first column')
        # split() already discards surrounding whitespace, so no strip() is needed.
        source = int(sources[0], 16)
        targets = [int(code_point, 16) for code_point in m.group(2).split()]
        confusables.append((source, targets))

    return confusables
105+
84106def format_table_content (f , content , indent ):
85107 line = " " * indent
86108 first = True
@@ -99,6 +121,18 @@ def format_table_content(f, content, indent):
def escape_char(c):
    r"""Format the integer code point ``c`` as a Rust ``char`` literal.

    The code point is written in lowercase hex inside a ``\u{...}`` escape,
    wrapped in single quotes: ``0x41`` becomes ``'\u{41}'``.
    """
    # The backslash must be immediately followed by ``u``; a space between
    # them would not form a valid Rust unicode escape.
    return "'\\u{%x}'" % c
101123
def escape_char_list(l):
    """Format a sequence of code points as a bracketed, comma-separated list.

    Each element of ``l`` is rendered with ``escape_char``; the pieces are
    joined with ``", "`` and wrapped in ``[`` and ``]``.  An empty sequence
    yields ``"[]"``.
    """
    return "[" + ", ".join(escape_char(c) for c in l) + "]"
135+
102136def emit_table (f , name , t_data , t_type = "&'static [(char, char)]" , is_pub = True ,
103137 pfun = lambda x : "(%s,%s)" % (escape_char (x [0 ]), escape_char (x [1 ])), is_const = True ):
104138 pub_string = "const"
@@ -173,10 +207,45 @@ def emit_identifier_module(f):
173207 pfun = lambda x : "(%s,%s, IdentifierType::%s)" % (escape_char (x [0 ]), escape_char (x [1 ]), x [2 ]))
174208 f .write ("}\n \n " )
175209
def emit_confusable_detection_module(f):
    """Write the Rust ``confusable_detection`` module to ``f``.

    The module contains the ``char_confusable_prototype`` lookup function
    followed by the non-public ``CONFUSABLES`` table.  The table is loaded
    from ``confusables.txt`` and sorted by its first element (the source
    code point) before being handed to ``emit_table``.
    """
    # Module opening plus the lookup function; the closing brace of the
    # module is written last, after the table.
    module_head = """pub mod confusable_detection {

    #[inline]
    pub fn char_confusable_prototype(c: char) -> Option<&'static [char]> {
        // FIXME: do we want to special case ASCII here?
        match c as usize {
            _ => super::util::bsearch_value_table(c, CONFUSABLES)
        }
    }

"""
    f.write(module_head)
    f.write("    // Confusable table:\n")

    def format_entry(entry):
        # One table row: the source char paired with a borrowed array of targets.
        source, targets = entry
        return "(%s, &%s)" % (escape_char(source), escape_char_list(targets))

    table = sorted(load_confusables("confusables.txt"), key=lambda entry: entry[0])
    emit_table(f, "CONFUSABLES", table, "&'static [(char, &'static [char])]",
               is_pub=False, pfun=format_entry)
    f.write("}\n\n")
231+
232+
176233def emit_util_mod (f ):
177234 f .write ("""
178235pub mod util {
179236 use core::result::Result::{Ok, Err};
237+
238+ #[inline]
239+ pub fn bsearch_value_table<T: Copy>(c: char, r: &'static [(char, T)]) -> Option<T> {
240+ match r.binary_search_by_key(&c, |&(k, _)| k) {
241+ Ok(idx) => {
242+ let (_, v) = r[idx];
243+ Some(v)
244+ }
245+ Err(_) => None
246+ }
247+ }
248+
180249 #[inline]
181250 pub fn bsearch_range_table(c: char, r: &'static [(char,char)]) -> bool {
182251 use core::cmp::Ordering::{Equal, Less, Greater};
@@ -224,3 +293,5 @@ def emit_util_mod(f):
224293 emit_util_mod (rf )
225294 ### identifier module
226295 emit_identifier_module (rf )
296+ ### confusable_detection module
297+ emit_confusable_detection_module (rf )
0 commit comments