Skip to content

Commit 28d9105

Browse files
committed
rearrange
1 parent 3f4bf60 commit 28d9105

File tree

1 file changed

+65
-64
lines changed

1 file changed

+65
-64
lines changed

datafusion/physical-expr/src/string_map.rs

Lines changed: 65 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -15,16 +15,8 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
//! Specialized HashMap that maps Strings/LargeStrings to values
19-
//!
20-
//! This is a specialized HashMap that is optimized for storing and emitting ArrowArrays
21-
//! as efficiently as possible by minimizing copying of the string values themselves
22-
//! both when inserting and when emitting the final array.
23-
//!
24-
//! Note it can be used as a HashSet by specifing the value type as `()`.
25-
//!
26-
//! This is used by the special `COUNT DISTINCT` string aggregate function to store the distinct values
27-
//! and by the `GROUP BY` operator to store the distinct values for each group when they are single strings
18+
//! [`ArrowStringMap`]: Specialized HashMap that maps
19+
//! Strings/LargeStrings to values.
2820
2921
use ahash::RandomState;
3022
use arrow_array::cast::AsArray;
@@ -37,63 +29,19 @@ use std::mem;
3729
use std::ops::Range;
3830
use std::sync::Arc;
3931

40-
/// Maximum size of a string that can be inlined in the hash table
41-
const SHORT_STRING_LEN: usize = mem::size_of::<usize>();
42-
43-
/// Entry that is stored in a `ArrowStringHashSet` that represents a string
44-
/// that is either stored inline or in the buffer
45-
///
46-
/// This helps the case where there are many short (less than 8 bytes) strings
47-
/// that are the same (e.g. "MA", "CA", "NY", "TX", etc)
48-
///
49-
/// ```text
50-
/// ┌──────────────────┐
51-
/// ─ ─ ─ ─ ─ ─ ─▶│... │
52-
/// │ │TheQuickBrownFox │
53-
/// │... │
54-
/// │ │ │
55-
/// └──────────────────┘
56-
/// │ buffer of u8
57-
///
58-
/// │
59-
/// ┌────────────────┬───────────────┬───────────────┐
60-
/// Storing │ │ starting byte │ length, in │
61-
/// "TheQuickBrownFox" │ hash value │ offset in │ bytes (not │
62-
/// (long string) │ │ buffer │ characters) │
63-
/// └────────────────┴───────────────┴───────────────┘
64-
/// 8 bytes 8 bytes 4 or 8
65-
///
66-
///
67-
/// ┌───────────────┬─┬─┬─┬─┬─┬─┬─┬─┬───────────────┐
68-
/// Storing "foobar" │ │ │ │ │ │ │ │ │ │ length, in │
69-
/// (short string) │ hash value │?│?│f│o│o│b│a│r│ bytes (not │
70-
/// │ │ │ │ │ │ │ │ │ │ characters) │
71-
/// └───────────────┴─┴─┴─┴─┴─┴─┴─┴─┴───────────────┘
72-
/// 8 bytes 8 bytes 4 or 8
73-
/// ```
74-
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
75-
struct Entry {
76-
/// hash of the string value (stored to avoid recomputing it in hash table
77-
/// check)
78-
hash: u64,
79-
/// if len =< SHORT_STRING_LEN: the string data inlined
80-
/// if len > SHORT_STRING_LEN, the offset of where the data starts
81-
offset_or_inline: usize,
82-
/// length of the string, in bytes
83-
len: usize,
84-
}
85-
86-
impl Entry {
87-
/// returns self.offset..self.offset + self.len
88-
fn range(&self) -> Range<usize> {
89-
self.offset_or_inline..self.offset_or_inline + self.len
90-
}
91-
}
92-
9332
/// HashSet optimized for storing `String` and `LargeString` values
9433
/// and producing the final set as a GenericStringArray with minimal copies.
9534
///
96-
/// Equivalent to `HashSet<String>` but with better performance for arrow data.
35+
/// This is a specialized HashMap that is optimized for storing and emitting
36+
/// ArrowArrays as efficiently as possible by minimizing copying of the string
37+
/// values themselves both when inserting and when emitting the final array.
38+
///
39+
/// Note it can be used as a HashSet by specifying the value type as `()`.
40+
///
41+
/// This is used by the special `COUNT DISTINCT` string aggregate function to
42+
/// store the distinct values and by the `GROUP BY` operator to store the
43+
/// distinct values for each group when they are single strings. Equivalent to
44+
/// `HashSet<String>` but with better performance for arrow data.
9745
struct ArrowStringMap<O> {
9846
/// Underlying hash set for each distinct string
9947
map: hashbrown::raw::RawTable<Entry>,
@@ -275,6 +223,59 @@ impl<O: OffsetSizeTrait> Debug for ArrowStringMap<O> {
275223
}
276224
}
277225

226+
/// Maximum size of a string that can be inlined in the hash table
227+
const SHORT_STRING_LEN: usize = mem::size_of::<usize>();
228+
229+
/// Entry that is stored in an `ArrowStringMap` that represents a string
230+
/// that is either stored inline or in the buffer
231+
///
232+
/// This helps the case where there are many short (less than 8 bytes) strings
233+
/// that are the same (e.g. "MA", "CA", "NY", "TX", etc)
234+
///
235+
/// ```text
236+
/// ┌──────────────────┐
237+
/// ─ ─ ─ ─ ─ ─ ─▶│... │
238+
/// │ │TheQuickBrownFox │
239+
/// │... │
240+
/// │ │ │
241+
/// └──────────────────┘
242+
/// │ buffer of u8
243+
///
244+
/// │
245+
/// ┌────────────────┬───────────────┬───────────────┐
246+
/// Storing │ │ starting byte │ length, in │
247+
/// "TheQuickBrownFox" │ hash value │ offset in │ bytes (not │
248+
/// (long string) │ │ buffer │ characters) │
249+
/// └────────────────┴───────────────┴───────────────┘
250+
/// 8 bytes 8 bytes 4 or 8
251+
///
252+
///
253+
/// ┌───────────────┬─┬─┬─┬─┬─┬─┬─┬─┬───────────────┐
254+
/// Storing "foobar" │ │ │ │ │ │ │ │ │ │ length, in │
255+
/// (short string) │ hash value │?│?│f│o│o│b│a│r│ bytes (not │
256+
/// │ │ │ │ │ │ │ │ │ │ characters) │
257+
/// └───────────────┴─┴─┴─┴─┴─┴─┴─┴─┴───────────────┘
258+
/// 8 bytes 8 bytes 4 or 8
259+
/// ```
260+
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy)]
261+
struct Entry {
262+
/// hash of the string value (stored to avoid recomputing it in hash table
263+
/// check)
264+
hash: u64,
265+
/// if len <= SHORT_STRING_LEN: the string data inlined
266+
/// if len > SHORT_STRING_LEN, the offset of where the data starts
267+
offset_or_inline: usize,
268+
/// length of the string, in bytes
269+
len: usize,
270+
}
271+
272+
impl Entry {
273+
/// returns self.offset_or_inline..self.offset_or_inline + self.len
274+
fn range(&self) -> Range<usize> {
275+
self.offset_or_inline..self.offset_or_inline + self.len
276+
}
277+
}
278+
278279
#[cfg(test)]
279280
mod tests {
280281
use super::*;

0 commit comments

Comments
 (0)