@@ -23,22 +23,38 @@ use anyhow::anyhow;
2323use arrow_array:: RecordBatch ;
2424use arrow_json:: reader:: { infer_json_schema_from_iterator, ReaderBuilder } ;
2525use arrow_schema:: { DataType , Field , Fields , Schema } ;
26+ use chrono:: { DateTime , NaiveDateTime , Utc } ;
2627use datafusion:: arrow:: util:: bit_util:: round_upto_multiple_of_64;
2728use itertools:: Itertools ;
2829use serde_json:: Value ;
2930use std:: { collections:: HashMap , sync:: Arc } ;
3031use tracing:: error;
3132
3233use super :: EventFormat ;
33- use crate :: { metadata:: SchemaVersion , utils:: arrow:: get_field} ;
34+ use crate :: { metadata:: SchemaVersion , storage :: StreamType , utils:: arrow:: get_field} ;
3435
3536pub struct Event {
36- pub data : Value ,
37+ pub json : Value ,
38+ pub p_timestamp : DateTime < Utc > ,
39+ }
40+
41+ impl Event {
42+ pub fn new ( json : Value ) -> Self {
43+ Self {
44+ json,
45+ p_timestamp : Utc :: now ( ) ,
46+ }
47+ }
3748}
3849
3950impl EventFormat for Event {
4051 type Data = Vec < Value > ;
4152
53+ /// Returns the time at ingestion, i.e. the `p_timestamp` value
54+ fn get_p_timestamp ( & self ) -> DateTime < Utc > {
55+ self . p_timestamp
56+ }
57+
4258 // convert the incoming json to a vector of json values
4359 // also extract the arrow schema, tags and metadata from the incoming json
4460 fn to_data (
@@ -52,7 +68,7 @@ impl EventFormat for Event {
5268 // incoming event may be a single json or a json array
5369 // but Data (type defined above) is a vector of json values
5470 // hence we need to convert the incoming event to a vector of json values
55- let value_arr = match self . data {
71+ let value_arr = match self . json {
5672 Value :: Array ( arr) => arr,
5773 value @ Value :: Object ( _) => vec ! [ value] ,
5874 _ => unreachable ! ( "flatten would have failed beforehand" ) ,
@@ -120,6 +136,87 @@ impl EventFormat for Event {
120136 Ok ( None ) => unreachable ! ( "all records are added to one rb" ) ,
121137 }
122138 }
139+
140+ /// Converts a JSON event into a Parseable Event
141+ fn into_event (
142+ self ,
143+ stream_name : String ,
144+ origin_size : u64 ,
145+ storage_schema : & HashMap < String , Arc < Field > > ,
146+ static_schema_flag : bool ,
147+ custom_partitions : Option < & String > ,
148+ time_partition : Option < & String > ,
149+ schema_version : SchemaVersion ,
150+ stream_type : StreamType ,
151+ ) -> Result < super :: Event , anyhow:: Error > {
152+ let custom_partition_values = match custom_partitions. as_ref ( ) {
153+ Some ( custom_partition) => {
154+ let custom_partitions = custom_partition. split ( ',' ) . collect_vec ( ) ;
155+ extract_custom_partition_values ( & self . json , & custom_partitions)
156+ }
157+ None => HashMap :: new ( ) ,
158+ } ;
159+
160+ let parsed_timestamp = match time_partition {
161+ Some ( time_partition) => extract_and_parse_time ( & self . json , time_partition) ?,
162+ _ => self . p_timestamp . naive_utc ( ) ,
163+ } ;
164+
165+ let ( rb, is_first_event) = self . into_recordbatch (
166+ storage_schema,
167+ static_schema_flag,
168+ time_partition,
169+ schema_version,
170+ ) ?;
171+
172+ Ok ( super :: Event {
173+ rb,
174+ stream_name,
175+ origin_format : "json" ,
176+ origin_size,
177+ is_first_event,
178+ parsed_timestamp,
179+ time_partition : None ,
180+ custom_partition_values,
181+ stream_type,
182+ } )
183+ }
184+ }
185+
186+ /// Extracts custom partition values from provided JSON object
187+ /// e.g. `json: {"status": 400, "msg": "Hello, World!"}, custom_partition_list: ["status"]` returns `{"status" => 400}`
188+ pub fn extract_custom_partition_values (
189+ json : & Value ,
190+ custom_partition_list : & [ & str ] ,
191+ ) -> HashMap < String , String > {
192+ let mut custom_partition_values: HashMap < String , String > = HashMap :: new ( ) ;
193+ for custom_partition_field in custom_partition_list {
194+ let custom_partition_value = json. get ( custom_partition_field. trim ( ) ) . unwrap ( ) . to_owned ( ) ;
195+ let custom_partition_value = match custom_partition_value {
196+ e @ Value :: Number ( _) | e @ Value :: Bool ( _) => e. to_string ( ) ,
197+ Value :: String ( s) => s,
198+ _ => "" . to_string ( ) ,
199+ } ;
200+ custom_partition_values. insert (
201+ custom_partition_field. trim ( ) . to_string ( ) ,
202+ custom_partition_value,
203+ ) ;
204+ }
205+ custom_partition_values
206+ }
207+
208+ /// Returns the parsed timestamp of deignated time partition from json object
209+ /// e.g. `json: {"timestamp": "2025-05-15T15:30:00Z"}` returns `2025-05-15T15:30:00`
210+ fn extract_and_parse_time (
211+ json : & Value ,
212+ time_partition : & str ,
213+ ) -> Result < NaiveDateTime , anyhow:: Error > {
214+ let current_time = json
215+ . get ( time_partition)
216+ . ok_or_else ( || anyhow ! ( "Missing field for time partition in json: {time_partition}" ) ) ?;
217+ let parsed_time: DateTime < Utc > = serde_json:: from_value ( current_time. clone ( ) ) ?;
218+
219+ Ok ( parsed_time. naive_utc ( ) )
123220}
124221
125222// Returns arrow schema with the fields that are present in the request body
@@ -225,3 +322,37 @@ fn valid_type(data_type: &DataType, value: &Value, schema_version: SchemaVersion
225322 }
226323 }
227324}
325+
#[cfg(test)]
mod tests {
    use serde_json::json;

    use super::*;

    /// A datetime string under the partition key parses to the matching naive datetime.
    #[test]
    fn parse_time_parition_from_value() {
        let event = json!({ "timestamp": "2025-05-15T15:30:00Z" });
        let outcome = extract_and_parse_time(&event, "timestamp");

        let expected: NaiveDateTime = "2025-05-15T15:30:00".parse().unwrap();
        assert_eq!(outcome.unwrap(), expected);
    }

    /// A JSON object lacking the partition key yields an error.
    #[test]
    fn time_parition_not_in_json() {
        let event = json!({ "hello": "world!" });

        assert!(extract_and_parse_time(&event, "timestamp").is_err());
    }

    /// A value that is not a datetime yields an error.
    #[test]
    fn time_parition_not_parseable_as_datetime() {
        let event = json!({ "timestamp": "not time" });

        assert!(extract_and_parse_time(&event, "timestamp").is_err());
    }
}
0 commit comments