@@ -23,17 +23,28 @@ use anyhow::anyhow;
2323use arrow_array:: RecordBatch ;
2424use arrow_json:: reader:: { infer_json_schema_from_iterator, ReaderBuilder } ;
2525use arrow_schema:: { DataType , Field , Fields , Schema } ;
26+ use chrono:: { DateTime , NaiveDateTime , Utc } ;
2627use datafusion:: arrow:: util:: bit_util:: round_upto_multiple_of_64;
2728use itertools:: Itertools ;
2829use serde_json:: Value ;
2930use std:: { collections:: HashMap , sync:: Arc } ;
3031use tracing:: error;
3132
3233use super :: EventFormat ;
33- use crate :: { metadata:: SchemaVersion , utils:: arrow:: get_field} ;
34+ use crate :: { metadata:: SchemaVersion , storage :: StreamType , utils:: arrow:: get_field} ;
3435
3536pub struct Event {
3637 pub json : Value ,
38+ ingestion_time : DateTime < Utc > ,
39+ }
40+
41+ impl Event {
42+ pub fn new ( json : Value ) -> Self {
43+ Self {
44+ json,
45+ ingestion_time : Utc :: now ( ) ,
46+ }
47+ }
3748}
3849
3950impl EventFormat for Event {
@@ -120,6 +131,82 @@ impl EventFormat for Event {
120131 Ok ( None ) => unreachable ! ( "all records are added to one rb" ) ,
121132 }
122133 }
134+
135+ fn into_event (
136+ self ,
137+ stream_name : String ,
138+ origin_size : u64 ,
139+ storage_schema : & HashMap < String , Arc < Field > > ,
140+ static_schema_flag : bool ,
141+ custom_partitions : Option < & String > ,
142+ time_partition : Option < & String > ,
143+ schema_version : SchemaVersion ,
144+ stream_type : StreamType ,
145+ ) -> Result < super :: Event , anyhow:: Error > {
146+ let custom_partition_values = match custom_partitions. as_ref ( ) {
147+ Some ( custom_partition) => {
148+ let custom_partitions = custom_partition. split ( ',' ) . collect_vec ( ) ;
149+ get_custom_partition_values ( & self . json , & custom_partitions)
150+ }
151+ None => HashMap :: new ( ) ,
152+ } ;
153+
154+ let parsed_timestamp = match time_partition {
155+ Some ( time_partition) => get_parsed_timestamp ( & self . json , time_partition) ?,
156+ _ => self . ingestion_time . naive_utc ( ) ,
157+ } ;
158+
159+ let ( rb, is_first_event) = self . into_recordbatch (
160+ storage_schema,
161+ static_schema_flag,
162+ time_partition,
163+ schema_version,
164+ ) ?;
165+
166+ Ok ( super :: Event {
167+ rb,
168+ stream_name,
169+ origin_format : "json" ,
170+ origin_size,
171+ is_first_event,
172+ parsed_timestamp,
173+ time_partition : None ,
174+ custom_partition_values,
175+ stream_type,
176+ } )
177+ }
178+ }
179+
180+ pub fn get_custom_partition_values (
181+ json : & Value ,
182+ custom_partition_list : & [ & str ] ,
183+ ) -> HashMap < String , String > {
184+ let mut custom_partition_values: HashMap < String , String > = HashMap :: new ( ) ;
185+ for custom_partition_field in custom_partition_list {
186+ let custom_partition_value = json. get ( custom_partition_field. trim ( ) ) . unwrap ( ) . to_owned ( ) ;
187+ let custom_partition_value = match custom_partition_value {
188+ e @ Value :: Number ( _) | e @ Value :: Bool ( _) => e. to_string ( ) ,
189+ Value :: String ( s) => s,
190+ _ => "" . to_string ( ) ,
191+ } ;
192+ custom_partition_values. insert (
193+ custom_partition_field. trim ( ) . to_string ( ) ,
194+ custom_partition_value,
195+ ) ;
196+ }
197+ custom_partition_values
198+ }
199+
200+ fn get_parsed_timestamp (
201+ json : & Value ,
202+ time_partition : & str ,
203+ ) -> Result < NaiveDateTime , anyhow:: Error > {
204+ let current_time = json
205+ . get ( time_partition)
206+ . ok_or_else ( || anyhow ! ( "Missing field for time partition in json: {time_partition}" ) ) ?;
207+ let parsed_time: DateTime < Utc > = serde_json:: from_value ( current_time. clone ( ) ) ?;
208+
209+ Ok ( parsed_time. naive_utc ( ) )
123210}
124211
125212// Returns arrow schema with the fields that are present in the request body
@@ -225,3 +312,37 @@ fn valid_type(data_type: &DataType, value: &Value, schema_version: SchemaVersion
225312 }
226313 }
227314}
315+
#[cfg(test)]
mod tests {
    use std::str::FromStr;

    use serde_json::json;

    use super::*;

    /// A valid RFC 3339 timestamp in the partition field is parsed to its UTC value.
    #[test]
    fn parse_time_partition_from_value() {
        let json = json!({"timestamp": "2025-05-15T15:30:00Z"});
        let parsed = get_parsed_timestamp(&json, "timestamp");

        let expected = NaiveDateTime::from_str("2025-05-15T15:30:00").unwrap();
        assert_eq!(parsed.unwrap(), expected);
    }

    /// An event without the partition field must be rejected.
    #[test]
    fn time_partition_not_in_json() {
        // The fixture deliberately lacks the "timestamp" key.
        let json = json!({"hello": "world!"});
        let parsed = get_parsed_timestamp(&json, "timestamp");

        assert!(parsed.is_err());
    }

    /// A partition field that is not a date-time must be rejected.
    #[test]
    fn time_partition_not_parseable_as_datetime() {
        // The key is present but its value is not a date-time.
        let json = json!({"timestamp": "not time"});
        let parsed = get_parsed_timestamp(&json, "timestamp");

        assert!(parsed.is_err());
    }
}
0 commit comments