diff --git a/docs/integration-spark.md b/docs/integration-spark.md index 342cc58..7850c6c 100644 --- a/docs/integration-spark.md +++ b/docs/integration-spark.md @@ -73,7 +73,7 @@ from whyspark import new_profiling_session raw_df = spark.read.option("header", "true").csv("/databricks-datasets/timeseries/Fires/Fire_Department_Calls_for_Service.csv") df = raw_df.withColumn("call_date", to_timestamp(col("Call Date"), "MM/dd/YYYY")) -profiles = new_profiling_session(newProfilingSession("profilingSession"), name="fire_station_calls", time_colum="call_date") \ +profiles = df.new_profiling_session(newProfilingSession("profilingSession"), name="fire_station_calls", time_column="call_date") \ .groupBy("City", "Priority") \ .aggProfiles() pdf = profiles.toPandas() # you get a Pandas dataset profile of whylogs @@ -85,4 +85,4 @@ You can then extract and analyze individual profiles: from whylogs import DatasetProfile prof = DatasetProfile.parse_delimited(pdf['why_profile'][0])[0] # prof is a whylogs DatasetProfile that can be analyzed using utilities such as whylogs.viz -``` \ No newline at end of file +```