2525
2626import fileinput , re , os , sys , operator
2727
28+ bytes_old = 0
29+ bytes_new = 0
30+
2831preamble = '''// Copyright 2012-2015 The Rust Project Developers. See the COPYRIGHT
2932// file at the top-level directory of this distribution and at
3033// http://rust-lang.org/COPYRIGHT.
@@ -309,16 +312,36 @@ def emit_table(f, name, t_data, t_type = "&'static [(char, char)]", is_pub=True,
309312
310313def emit_trie_lookup_range_table (f ):
311314 f .write ("""
315+
316+ // BoolTrie is a trie for representing a set of Unicode codepoints. It is
317+ // implemented with postfix compression (sharing of identical child nodes),
318+ // which gives both compact size and fast lookup.
319+ //
320+ // The space of Unicode codepoints is divided into 3 subareas, each
321+ // represented by a trie with different depth. In the first (0..0x800), there
322+ // is no trie structure at all; each u64 entry corresponds to a bitvector
323+ // effectively holding 64 bool values.
324+ //
325+ // In the second (0x800..0x10000), each child of the root node represents a
326+ // 64-wide subrange, but instead of storing the full 64-bit value of the leaf,
327+ // the trie stores an 8-bit index into a shared table of leaf values. This
328+ // exploits the fact that in reasonable sets, many such leaves can be shared.
329+ //
330+ // In the third (0x10000..0x110000), each child of the root node represents a
331+ // 4096-wide subrange, and the trie stores an 8-bit index into a 64-byte slice
332+ // of a child tree. Each of these 64 bytes represents an index into the table
333+ // of shared 64-bit leaf values. This exploits the sparse structure in the
334+ // non-BMP range of most Unicode sets.
312335pub struct BoolTrie {
313336 // 0..0x800 (corresponding to 1 and 2 byte utf-8 sequences)
314337 r1: [u64; 32], // leaves
315338
316339 // 0x800..0x10000 (corresponding to 3 byte utf-8 sequences)
317- r2: [u8; 1024 ], // first level
340+ r2: [u8; 992 ], // first level
318341 r3: &'static [u64], // leaves
319342
320343 // 0x10000..0x110000 (corresponding to 4 byte utf-8 sequences)
321- r4: [u8; 272 ], // first level
344+ r4: [u8; 256 ], // first level
322345 r5: &'static [u8], // second level
323346 r6: &'static [u64], // leaves
324347}
@@ -332,10 +355,10 @@ def emit_trie_lookup_range_table(f):
332355 if c < 0x800 {
333356 trie_range_leaf(c, r.r1[c >> 6])
334357 } else if c < 0x10000 {
335- let child = r.r2[c >> 6];
358+ let child = r.r2[( c >> 6) - 0x20 ];
336359 trie_range_leaf(c, r.r3[child as usize])
337360 } else {
338- let child = r.r4[c >> 12];
361+ let child = r.r4[( c >> 12) - 0x10 ];
339362 let leaf = r.r5[((child as usize) << 6) + ((c >> 6) & 0x3f)];
340363 trie_range_leaf(c, r.r6[leaf as usize])
341364 }
@@ -356,6 +379,8 @@ def compute_trie(rawdata, chunksize):
356379 return (root , child_data )
357380
358381def emit_bool_trie (f , name , t_data , is_pub = True ):
382+ global bytes_old , bytes_new
383+ bytes_old += 8 * len (t_data )
359384 CHUNK = 64
360385 rawdata = [False ] * 0x110000 ;
361386 for (lo , hi ) in t_data :
@@ -383,7 +408,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
383408 # 0x800..0x10000 trie
384409 (r2 , r3 ) = compute_trie (chunks [0x800 / CHUNK : 0x10000 / CHUNK ], 64 / CHUNK )
385410 f .write (" r2: [\n " )
386- data = ',' .join (str (node ) for node in [ 255 ] * 32 + r2 )
411+ data = ',' .join (str (node ) for node in r2 )
387412 format_table_content (f , data , 12 )
388413 f .write ("\n ],\n " )
389414 f .write (" r3: &[\n " )
@@ -395,7 +420,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
395420 (mid , r6 ) = compute_trie (chunks [0x10000 / CHUNK : 0x110000 / CHUNK ], 64 / CHUNK )
396421 (r4 , r5 ) = compute_trie (mid , 64 )
397422 f .write (" r4: [\n " )
398- data = ',' .join (str (node ) for node in [ 255 ] * 16 + r4 )
423+ data = ',' .join (str (node ) for node in r4 )
399424 format_table_content (f , data , 12 )
400425 f .write ("\n ],\n " )
401426 f .write (" r5: &[\n " )
@@ -408,6 +433,7 @@ def emit_bool_trie(f, name, t_data, is_pub=True):
408433 f .write ("\n ],\n " )
409434
410435 f .write (" };\n \n " )
436+ bytes_new += 256 + 992 + 256 + 8 * len (r3 ) + len (r5 ) + 8 * len (r6 )
411437
412438def emit_property_module (f , mod , tbl , emit ):
413439 f .write ("pub mod %s {\n " % mod )
@@ -517,3 +543,4 @@ def emit_norm_module(f, canon, compat, combine, norm_props):
517543 # normalizations and conversions module
518544 emit_norm_module (rf , canon_decomp , compat_decomp , combines , norm_props )
519545 emit_conversions_module (rf , to_upper , to_lower , to_title )
546+ # Debug aid -- uncomment to report the total table size before/after trie encoding: print 'bytes before = %d, bytes after = %d' % (bytes_old, bytes_new)
0 commit comments