@@ -25,11 +25,11 @@ use crate::types::array::ArrayColumnBuilder;
2525use crate :: types:: decimal:: DecimalColumn ;
2626use crate :: types:: nullable:: NullableColumn ;
2727use crate :: types:: number:: NumberColumn ;
28+ use crate :: types:: string:: StringColumn ;
2829use crate :: types:: string:: StringColumnBuilder ;
2930use crate :: types:: AnyType ;
3031use crate :: types:: ArrayType ;
3132use crate :: types:: BooleanType ;
32- use crate :: types:: StringType ;
3333use crate :: types:: ValueType ;
3434use crate :: types:: VariantType ;
3535use crate :: with_decimal_type;
@@ -131,14 +131,8 @@ impl Column {
131131 filter,
132132 ) ,
133133 Column :: String ( column) => {
134- let bytes_per_row = column. data . len ( ) / filter. len ( ) . max ( 1 ) ;
135- let data_capacity = ( filter. len ( ) - filter. unset_bits ( ) ) * bytes_per_row;
136-
137- Self :: filter_scalar_types :: < StringType > (
138- column,
139- StringColumnBuilder :: with_capacity ( length, data_capacity) ,
140- filter,
141- )
134+ let column = Self :: filter_string_scalars ( column, filter) ;
135+ Column :: String ( column)
142136 }
143137 Column :: Timestamp ( column) => {
144138 let ts = Self :: filter_primitive_types ( column, filter) ;
@@ -246,7 +240,7 @@ impl Column {
246240
247241 // low-level API using unsafe to improve performance
248242 fn filter_primitive_types < T : Copy > ( values : & Buffer < T > , filter : & Bitmap ) -> Buffer < T > {
249- assert_eq ! ( values. len( ) , filter. len( ) ) ;
243+ debug_assert_eq ! ( values. len( ) , filter. len( ) ) ;
250244 let selected = filter. len ( ) - filter. unset_bits ( ) ;
251245 if selected == values. len ( ) {
252246 return values. clone ( ) ;
@@ -317,4 +311,87 @@ impl Column {
317311 unsafe { new. set_len ( selected) } ;
318312 new. into ( )
319313 }
314+
315+ // low-level API using unsafe to improve performance
316+ fn filter_string_scalars ( values : & StringColumn , filter : & Bitmap ) -> StringColumn {
317+ debug_assert_eq ! ( values. len( ) , filter. len( ) ) ;
318+ let selected = filter. len ( ) - filter. unset_bits ( ) ;
319+ if selected == values. len ( ) {
320+ return values. clone ( ) ;
321+ }
322+ let data = values. data . as_slice ( ) ;
323+ let offsets = values. offsets . as_slice ( ) ;
324+
325+ let mut res_offsets = Vec :: with_capacity ( selected + 1 ) ;
326+ res_offsets. push ( 0 ) ;
327+
328+ let mut res_data = vec ! [ ] ;
329+ let hint_size = data. len ( ) / ( values. len ( ) + 1 ) * selected;
330+
331+ static MAX_HINT_SIZE : usize = 1000000000 ;
332+ if hint_size < MAX_HINT_SIZE && values. len ( ) < MAX_HINT_SIZE {
333+ res_data. reserve ( hint_size)
334+ }
335+
336+ let mut pos = 0 ;
337+
338+ let ( mut slice, offset, mut length) = filter. as_slice ( ) ;
339+ if offset > 0 {
340+ // Consume the offset
341+ let n = 8 - offset;
342+ values
343+ . iter ( )
344+ . zip ( filter. iter ( ) )
345+ . take ( n)
346+ . for_each ( |( value, is_selected) | {
347+ if is_selected {
348+ res_data. extend_from_slice ( value) ;
349+ res_offsets. push ( res_data. len ( ) as u64 ) ;
350+ }
351+ } ) ;
352+ slice = & slice[ 1 ..] ;
353+ length -= n;
354+ pos += n;
355+ }
356+
357+ const CHUNK_SIZE : usize = 64 ;
358+ let mut mask_chunks = BitChunksExact :: < u64 > :: new ( slice, length) ;
359+
360+ for mut mask in mask_chunks. by_ref ( ) {
361+ if mask == u64:: MAX {
362+ let data = & data[ offsets[ pos] as usize ..offsets[ pos + CHUNK_SIZE ] as usize ] ;
363+ res_data. extend_from_slice ( data) ;
364+
365+ let mut last_len = * res_offsets. last ( ) . unwrap ( ) ;
366+ for i in 0 ..CHUNK_SIZE {
367+ last_len += offsets[ pos + i + 1 ] - offsets[ pos + i] ;
368+ res_offsets. push ( last_len) ;
369+ }
370+ } else {
371+ while mask != 0 {
372+ let n = mask. trailing_zeros ( ) as usize ;
373+ let data = & data[ offsets[ pos + n] as usize ..offsets[ pos + n + 1 ] as usize ] ;
374+ res_data. extend_from_slice ( data) ;
375+ res_offsets. push ( res_data. len ( ) as u64 ) ;
376+
377+ mask = mask & ( mask - 1 ) ;
378+ }
379+ }
380+ pos += CHUNK_SIZE ;
381+ }
382+
383+ for is_select in mask_chunks. remainder_iter ( ) {
384+ if is_select {
385+ let data = & data[ offsets[ pos] as usize ..offsets[ pos + 1 ] as usize ] ;
386+ res_data. extend_from_slice ( data) ;
387+ res_offsets. push ( res_data. len ( ) as u64 ) ;
388+ }
389+ pos += 1 ;
390+ }
391+
392+ StringColumn {
393+ data : res_data. into ( ) ,
394+ offsets : res_offsets. into ( ) ,
395+ }
396+ }
320397}
0 commit comments