99import os
1010import subprocess
1111
# Exclusive upper bound on code point values: get_codepoints() uses it as the
# stop of range() when filling in trailing unassigned code points.
NUM_CODEPOINTS = 0x110000
13+
1314
1415def to_ranges (iter ):
1516 current = None
@@ -23,19 +24,25 @@ def to_ranges(iter):
2324 if current is not None :
2425 yield tuple (current )
2526
27+
def get_escaped(codepoints):
    """Yield the ``value`` of every code point that has to be escaped.

    A code point is selected when its ``class_`` is one of ``Cc``, ``Cf``,
    ``Cs``, ``Co``, ``Cn``, ``Zl``, ``Zp`` or ``Zs``; a falsy ``class_``
    (e.g. ``None``) is treated as ``Cn``.  The space character (U+0020) is
    never yielded even though its class is in that list.

    Args:
        codepoints: iterable of objects exposing ``value`` (int) and
            ``class_`` (str or ``None``) attributes.

    Yields:
        The integer ``value`` of each selected code point, in input order.
    """
    # Build the lookup set and the space constant once instead of
    # re-splitting the string for every code point.
    escaped_classes = frozenset("Cc Cf Cs Co Cn Zl Zp Zs".split())
    space = ord(" ")
    for c in codepoints:
        if (c.class_ or "Cn") in escaped_classes and c.value != space:
            yield c.value
3034
35+
def get_file(f):
    """Return an open text-mode handle on a local copy of the file at URL *f*.

    The local copy is looked up in the current working directory under the
    last path component of *f*.  If it does not exist yet, it is downloaded
    with ``curl -O`` (which saves under that same name) and then opened.

    Args:
        f: URL (or path) whose basename names the local file.

    Returns:
        An open file object; the caller is responsible for closing it.

    Raises:
        subprocess.CalledProcessError: if curl exits with a non-zero status.
        FileNotFoundError: if the file is still missing after the download.
    """
    local_name = os.path.basename(f)
    try:
        return open(local_name)
    except FileNotFoundError:
        # --fail makes curl exit non-zero on HTTP errors; without it an error
        # page would be saved as the data file and check=True would not fire.
        subprocess.run(["curl", "--fail", "-O", f], check=True)
        return open(local_name)
3742
# One code point record: ``value`` is the integer code point and ``class_`` is
# its class string, or None for the gap entries that get_codepoints() yields
# (get_escaped() treats a falsy class_ as "Cn").
Codepoint = namedtuple("Codepoint", "value class_")
45+
3946
4047def get_codepoints (f ):
4148 r = csv .reader (f , delimiter = ";" )
@@ -66,13 +73,14 @@ def get_codepoints(f):
6673 for c in range (prev_codepoint + 1 , NUM_CODEPOINTS ):
6774 yield Codepoint (c , None )
6875
76+
6977def compress_singletons (singletons ):
70- uppers = [] # (upper, # items in lowers)
78+ uppers = [] # (upper, # items in lowers)
7179 lowers = []
7280
7381 for i in singletons :
7482 upper = i >> 8
75- lower = i & 0xff
83+ lower = i & 0xFF
7684 if len (uppers ) == 0 or uppers [- 1 ][0 ] != upper :
7785 uppers .append ((upper , 1 ))
7886 else :
@@ -82,10 +90,11 @@ def compress_singletons(singletons):
8290
8391 return uppers , lowers
8492
93+
8594def compress_normal (normal ):
8695 # lengths 0x00..0x7f are encoded as 00, 01, ..., 7e, 7f
8796 # lengths 0x80..0x7fff are encoded as 80 80, 80 81, ..., ff fe, ff ff
88- compressed = [] # [truelen, (truelenaux), falselen, (falselenaux)]
97+ compressed = [] # [truelen, (truelenaux), falselen, (falselenaux)]
8998
9099 prev_start = 0
91100 for start , count in normal :
@@ -95,21 +104,22 @@ def compress_normal(normal):
95104
96105 assert truelen < 0x8000 and falselen < 0x8000
97106 entry = []
98- if truelen > 0x7f :
107+ if truelen > 0x7F :
99108 entry .append (0x80 | (truelen >> 8 ))
100- entry .append (truelen & 0xff )
109+ entry .append (truelen & 0xFF )
101110 else :
102- entry .append (truelen & 0x7f )
103- if falselen > 0x7f :
111+ entry .append (truelen & 0x7F )
112+ if falselen > 0x7F :
104113 entry .append (0x80 | (falselen >> 8 ))
105- entry .append (falselen & 0xff )
114+ entry .append (falselen & 0xFF )
106115 else :
107- entry .append (falselen & 0x7f )
116+ entry .append (falselen & 0x7F )
108117
109118 compressed .append (entry )
110119
111120 return compressed
112121
122+
113123def print_singletons (uppers , lowers , uppersname , lowersname ):
114124 print ("#[rustfmt::skip]" )
115125 print ("const {}: &[(u8, u8)] = &[" .format (uppersname ))
@@ -119,22 +129,26 @@ def print_singletons(uppers, lowers, uppersname, lowersname):
119129 print ("#[rustfmt::skip]" )
120130 print ("const {}: &[u8] = &[" .format (lowersname ))
121131 for i in range (0 , len (lowers ), 8 ):
122- print (" {}" .format (" " .join ("{:#04x}," .format (x ) for x in lowers [i :i + 8 ])))
132+ print (
133+ " {}" .format (" " .join ("{:#04x}," .format (x ) for x in lowers [i : i + 8 ]))
134+ )
123135 print ("];" )
124136
137+
def print_normal(normal, normalname):
    """Print *normal* to stdout as a Rust ``&[u8]`` constant.

    Emits a ``#[rustfmt::skip]`` attribute, the ``const`` header using
    *normalname*, one output line per entry of *normal*, and the closing
    ``];``.

    Args:
        normal: iterable of integer sequences; each sequence becomes one line
            of ``0x..,`` literals separated by single spaces.
        normalname: identifier to use for the emitted Rust constant.
    """
    print("#[rustfmt::skip]")
    print("const {}: &[u8] = &[".format(normalname))
    for entry in normal:
        row = " ".join("{:#04x},".format(byte) for byte in entry)
        # NOTE(review): whitespace in the captured source looks collapsed;
        # confirm the intended indent width of the emitted rows.
        print(" {}".format(row))
    print("];")
131144
145+
132146def main ():
133147 file = get_file ("https://www.unicode.org/Public/UNIDATA/UnicodeData.txt" )
134148
135149 codepoints = get_codepoints (file )
136150
137- CUTOFF = 0x10000
151+ CUTOFF = 0x10000
138152 singletons0 = []
139153 singletons1 = []
140154 normal0 = []
@@ -234,10 +248,11 @@ def main():
234248}\
235249 """ )
236250 print ()
237- print_singletons (singletons0u , singletons0l , 'SINGLETONS0U' , 'SINGLETONS0L' )
238- print_singletons (singletons1u , singletons1l , 'SINGLETONS1U' , 'SINGLETONS1L' )
239- print_normal (normal0 , 'NORMAL0' )
240- print_normal (normal1 , 'NORMAL1' )
251+ print_singletons (singletons0u , singletons0l , "SINGLETONS0U" , "SINGLETONS0L" )
252+ print_singletons (singletons1u , singletons1l , "SINGLETONS1U" , "SINGLETONS1L" )
253+ print_normal (normal0 , "NORMAL0" )
254+ print_normal (normal1 , "NORMAL1" )
255+
241256
# Run the generator only when this file is executed as a script.
if __name__ == "__main__":
    main()
0 commit comments