from .config import Config
def format_outname(prefix: str, se: bool, weekday: bool):
    '''
    Build the output signal name for a csv export.

    Parameters
    ----------
    prefix: obfuscated prefix prepended to the name when se is True
    se: whether standard errors will be written; requires a non-None prefix
    weekday: whether the sensor is weekday-adjusted (selects the base name)

    Returns
    -------
    the signal name string used in the output filename

    Raises
    ------
    AssertionError if se is True but no prefix is configured
    '''
    # write out results
    out_name = "smoothed_adj_cli" if weekday else "smoothed_cli"
    if se:
        # explicit raise instead of `assert` so the guard survives `python -O`
        if prefix is None:
            raise AssertionError("template has no obfuscated prefix")
        out_name = prefix + "_" + out_name
    return out_name
def format_df(df: pd.DataFrame, geo_id: str, se: bool, logger):
    '''
    Format the sensor dataframe and filter out anomalous rows before writing.

    Note: mutates ``df`` in place (scales ``val``) in addition to returning
    the filtered frame.

    Parameters
    ----------
    df: dataframe from output from update_sensor
    geo_id: geographic resolution, one of ["county", "state", "msa", "hrr", "nation", "hhs"]
    se: boolean to write out standard errors, if true, use an obfuscated name
    logger: logger used to report anomalies

    Returns
    -------
    filtered and formatted dataframe
    '''
    # report in percentage
    df['val'] = df['val'] * 100

    # NOTE(review): this line fell inside a gap of the visible diff;
    # reconstructed from the usage below -- confirm against the original file.
    val_isnull = df['val'].isnull()
    df_val_null = df[val_isnull]
    if not df_val_null.empty:
        logger.info("sensor value is nan, check pipeline")
    df = df[~val_isnull]

    # drop rows with implausibly large standard errors
    se_too_high = df['se'] >= 5
    df_se_too_high = df[se_too_high]
    if len(df_se_too_high) > 0:
        logger.info(f"standard error suspiciously high! investigate {geo_id}")
    df = df[~se_too_high]

    # drop rows with implausibly large sensor values
    sensor_too_high = df['val'] >= 90
    df_sensor_too_high = df[sensor_too_high]
    if len(df_sensor_too_high) > 0:
        # fixed copy-paste bug: this branch flags the sensor value, not the se
        logger.info(f"sensor value suspiciously high! investigate {geo_id}")
    df = df[~sensor_too_high]

    if se:
        # both p and std_err must be strictly positive to be reportable
        valid_cond = (df['se'] > 0) & (df['val'] > 0)
        invalid_df = df[~valid_cond]
        if len(invalid_df) > 0:
            logger.info("p=0, std_err=0 invalid")
        df = df[valid_cond]
    else:
        # se not requested: blank the column (rather than dropping it) so the
        # output schema stays stable; np.nan, since np.NAN is gone in numpy 2.0
        df["se"] = np.nan

    df["direction"] = np.nan
    df["sample_size"] = np.nan
    return df
5577
def write_to_csv(output_df: pd.DataFrame, prefix: str, geo_id: str, weekday: bool, se: bool, logger, output_path="."):
    """Write sensor values to csv, one file per date.

    Args:
        output_df: dataframe holding sensor rates, se, dates, and geo ids
        prefix: obfuscated prefix used in the output name when se is True
        geo_id: geographic resolution, one of ["county", "state", "msa", "hrr", "nation", "hhs"]
        weekday: whether the sensor is weekday-adjusted (selects the output name)
        se: boolean to write out standard errors, if true, use an obfuscated name
        logger: logger for progress and warning messages
        output_path: outfile path to write the csv (default is current directory)
    """
    out_name = format_outname(prefix, se, weekday)
    filtered_df = format_df(output_df, geo_id, se, logger)

    if se:
        logger.info(f"========= WARNING: WRITING SEs TO {out_name} =========")

    # iterate the dates that survived filtering: taking dates from the
    # unfiltered output_df would make get_group raise KeyError for a date
    # whose rows were all dropped by format_df
    for d, single_date_df in filtered_df.groupby('date'):
        filename = "%s/%s_%s_%s.csv" % (output_path,
                                        (d + Config.DAY_SHIFT).strftime("%Y%m%d"),
                                        geo_id,
                                        out_name)
        single_date_df = single_date_df.drop(columns=['date'])
        single_date_df.to_csv(filename, index=False, na_rep="NA")

        logger.debug(f"wrote {len(single_date_df)} rows for {geo_id}")
79107
80108def csv_to_df (filepath : str , startdate : datetime , enddate : datetime , dropdate : datetime , logger ) -> pd .DataFrame :
0 commit comments