1515// specific language governing permissions and limitations
1616// under the License.
1717
18- //! Specialized HashMap that maps Strings/LargeStrings to values
19- //!
20- //! This is a specialized HashMap that is optimized for storing and emitting ArrowArrays
21- //! as efficiently as possible by minimizing copying of the string values themselves
22- //! both when inserting and when emitting the final array.
23- //!
24- //! Note it can be used as a HashSet by specifing the value type as `()`.
25- //!
26- //! This is used by the special `COUNT DISTINCT` string aggregate function to store the distinct values
27- //! and by the `GROUP BY` operator to store the distinct values for each group when they are single strings
18+ //! [`ArrowStringMap`]: Specialized HashMap that maps
19+ //! Strings/LargeStrings to values.
2820
2921use ahash:: RandomState ;
3022use arrow_array:: cast:: AsArray ;
@@ -37,63 +29,19 @@ use std::mem;
3729use std:: ops:: Range ;
3830use std:: sync:: Arc ;
3931
40- /// Maximum size of a string that can be inlined in the hash table
41- const SHORT_STRING_LEN : usize = mem:: size_of :: < usize > ( ) ;
42-
43- /// Entry that is stored in a `ArrowStringHashSet` that represents a string
44- /// that is either stored inline or in the buffer
45- ///
46- /// This helps the case where there are many short (less than 8 bytes) strings
47- /// that are the same (e.g. "MA", "CA", "NY", "TX", etc)
48- ///
49- /// ```text
50- /// ┌──────────────────┐
51- /// ─ ─ ─ ─ ─ ─ ─▶│... │
52- /// │ │TheQuickBrownFox │
53- /// │... │
54- /// │ │ │
55- /// └──────────────────┘
56- /// │ buffer of u8
57- ///
58- /// │
59- /// ┌────────────────┬───────────────┬───────────────┐
60- /// Storing │ │ starting byte │ length, in │
61- /// "TheQuickBrownFox" │ hash value │ offset in │ bytes (not │
62- /// (long string) │ │ buffer │ characters) │
63- /// └────────────────┴───────────────┴───────────────┘
64- /// 8 bytes 8 bytes 4 or 8
65- ///
66- ///
67- /// ┌───────────────┬─┬─┬─┬─┬─┬─┬─┬─┬───────────────┐
68- /// Storing "foobar" │ │ │ │ │ │ │ │ │ │ length, in │
69- /// (short string) │ hash value │?│?│f│o│o│b│a│r│ bytes (not │
70- /// │ │ │ │ │ │ │ │ │ │ characters) │
71- /// └───────────────┴─┴─┴─┴─┴─┴─┴─┴─┴───────────────┘
72- /// 8 bytes 8 bytes 4 or 8
73- /// ```
74- #[ derive( Debug , PartialEq , Eq , Hash , Clone , Copy ) ]
75- struct Entry {
76- /// hash of the string value (stored to avoid recomputing it in hash table
77- /// check)
78- hash : u64 ,
79- /// if len =< SHORT_STRING_LEN: the string data inlined
80- /// if len > SHORT_STRING_LEN, the offset of where the data starts
81- offset_or_inline : usize ,
82- /// length of the string, in bytes
83- len : usize ,
84- }
85-
86- impl Entry {
87- /// returns self.offset..self.offset + self.len
88- fn range ( & self ) -> Range < usize > {
89- self . offset_or_inline ..self . offset_or_inline + self . len
90- }
91- }
92-
9332/// HashSet optimized for storing `String` and `LargeString` values
9433/// and producing the final set as a GenericStringArray with minimal copies.
9534///
96- /// Equivalent to `HashSet<String>` but with better performance for arrow data.
35+ /// This is a specialized HashMap that is optimized for storing and emitting
36+ /// ArrowArrays as efficiently as possible by minimizing copying of the string
37+ /// values themselves both when inserting and when emitting the final array.
38+ ///
39+ /// Note it can be used as a HashSet by specifying the value type as `()`.
40+ ///
41+ /// This is used by the special `COUNT DISTINCT` string aggregate function to
42+ /// store the distinct values and by the `GROUP BY` operator to store the
43+ /// distinct values for each group when they are single strings. Equivalent to
44+ /// `HashSet<String>` but with better performance for arrow data.
9745struct ArrowStringMap < O > {
9846 /// Underlying hash set for each distinct string
9947 map : hashbrown:: raw:: RawTable < Entry > ,
@@ -275,6 +223,59 @@ impl<O: OffsetSizeTrait> Debug for ArrowStringMap<O> {
275223 }
276224}
277225
226+ /// Maximum size of a string that can be inlined in the hash table
227+ const SHORT_STRING_LEN : usize = mem:: size_of :: < usize > ( ) ;
228+
229+ /// Entry that is stored in an `ArrowStringMap` that represents a string
230+ /// that is either stored inline or in the buffer
231+ ///
232+ /// This helps the case where there are many short (up to `SHORT_STRING_LEN` bytes) strings
233+ /// that are the same (e.g. "MA", "CA", "NY", "TX", etc)
234+ ///
235+ /// ```text
236+ /// ┌──────────────────┐
237+ /// ─ ─ ─ ─ ─ ─ ─▶│... │
238+ /// │ │TheQuickBrownFox │
239+ /// │... │
240+ /// │ │ │
241+ /// └──────────────────┘
242+ /// │ buffer of u8
243+ ///
244+ /// │
245+ /// ┌────────────────┬───────────────┬───────────────┐
246+ /// Storing │ │ starting byte │ length, in │
247+ /// "TheQuickBrownFox" │ hash value │ offset in │ bytes (not │
248+ /// (long string) │ │ buffer │ characters) │
249+ /// └────────────────┴───────────────┴───────────────┘
250+ /// 8 bytes 8 bytes 4 or 8
251+ ///
252+ ///
253+ /// ┌───────────────┬─┬─┬─┬─┬─┬─┬─┬─┬───────────────┐
254+ /// Storing "foobar" │ │ │ │ │ │ │ │ │ │ length, in │
255+ /// (short string) │ hash value │?│?│f│o│o│b│a│r│ bytes (not │
256+ /// │ │ │ │ │ │ │ │ │ │ characters) │
257+ /// └───────────────┴─┴─┴─┴─┴─┴─┴─┴─┴───────────────┘
258+ /// 8 bytes 8 bytes 4 or 8
259+ /// ```
260+ #[ derive( Debug , PartialEq , Eq , Hash , Clone , Copy ) ]
261+ struct Entry {
262+ /// hash of the string value (stored to avoid recomputing it in hash table
263+ /// check)
264+ hash : u64 ,
265+ /// if len <= SHORT_STRING_LEN: the string data, inlined
266+ /// if len > SHORT_STRING_LEN: the offset of where the data starts
267+ offset_or_inline : usize ,
268+ /// length of the string, in bytes
269+ len : usize ,
270+ }
271+
272+ impl Entry {
273+ /// returns `self.offset_or_inline..self.offset_or_inline + self.len`
274+ fn range ( & self ) -> Range < usize > {
275+ self . offset_or_inline ..self . offset_or_inline + self . len
276+ }
277+ }
278+
278279#[ cfg( test) ]
279280mod tests {
280281 use super :: * ;
0 commit comments