[go: up one dir, main page]

Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat: Onboard COVID-19 Genome Sequence dataset #460

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Prev Previous commit
Next Next commit
Feat: Onboarding NHTSA dataset — production ready
  • Loading branch information
aurogoogle committed Aug 22, 2022
commit 383d645beab494dcd0b4b774ef10c79eeb5d75dc
Original file line number Diff line number Diff line change
Expand Up @@ -532,5 +532,11 @@
"type": "integer",
"description": "This data element records the number of drunk drivers involved in the crash. 00-99 Number of Drunk Drivers Involved in the Fatal Crash.",
"mode": "NULLABLE"
},
{
"name": "timestamp_of_crash",
"type": "timestamp",
"description": "This data element records the date and time on which the crash occurred.",
"mode": "NULLABLE"
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -544,5 +544,11 @@
"type": "integer",
"description": "This data element records the number of drunk drivers involved in the crash. 00-99 Number of Drunk Drivers Involved in the Fatal Crash.",
"mode": "NULLABLE"
},
{
"name": "timestamp_of_crash",
"type": "timestamp",
"description": "This data element records the date and time on which the crash occurred.",
"mode": "NULLABLE"
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -484,5 +484,11 @@
"type": "integer",
"description": "This data element records the number of drunk drivers involved in the crash. 00-99 Number of Drunk Drivers Involved in the Fatal Crash.",
"mode": "NULLABLE"
},
{
"name": "timestamp_of_crash",
"type": "timestamp",
"description": "This data element records the date and time on which the crash occurred.",
"mode": "NULLABLE"
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -772,5 +772,11 @@
"type": "string",
"description": "This data element identifies the attribute which best describes the location of this non-motorist with respect to the roadway at the time of the crash.",
"mode": "NULLABLE"
},
{
"name": "timestamp_of_crash",
"type": "timestamp",
"description": "This data element records the date and time on which the crash occurred.",
"mode": "NULLABLE"
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -706,5 +706,11 @@
"type": "string",
"description": "This data element identifies the attribute which best describes the location of this non-motorist with respect to the roadway at the time of the crash.",
"mode": "NULLABLE"
},
{
"name": "timestamp_of_crash",
"type": "timestamp",
"description": "This data element records the date and time on which the crash occurred.",
"mode": "NULLABLE"
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -712,5 +712,11 @@
"type": "string",
"description": "This data element identifies any mis-use of the helmet used by this person.",
"mode": "NULLABLE"
},
{
"name": "timestamp_of_crash",
"type": "timestamp",
"description": "This data element records the date and time on which the crash occurred.",
"mode": "NULLABLE"
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -724,5 +724,11 @@
"type": "string",
"description": "This element captures the completed/finished body class for an incomplete vehicle. An incomplete vehicle is completed by a final stage manufacturer. The intent of this data element is to capture the body class for incomplete vehicles when they are finished for road-use.",
"mode": "NULLABLE"
},
{
"name": "timestamp_of_crash",
"type": "timestamp",
"description": "This data element records the date and time on which the crash occurred.",
"mode": "NULLABLE"
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -1120,5 +1120,11 @@
"type": "string",
"description": "This data element records whether the driver was drinking and is derived from data elements in the Vehicle and Person data files. 0 No Drinking 1 Drinking -- Unknown",
"mode": "NULLABLE"
},
{
"name": "timestamp_of_crash",
"type": "timestamp",
"description": "This data element records the date and time on which the crash occurred.",
"mode": "NULLABLE"
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -1156,5 +1156,11 @@
"type": "string",
"description": "This data element records the vehicle identification number (VIN) of any trailing units of a combination vehicle.",
"mode": "NULLABLE"
},
{
"name": "timestamp_of_crash",
"type": "timestamp",
"description": "This data element records the date and time on which the crash occurred.",
"mode": "NULLABLE"
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -1180,5 +1180,11 @@
"type": "string",
"description": "This data element records the vehicle identification number (VIN) of any trailing units of a combination vehicle.",
"mode": "NULLABLE"
},
{
"name": "timestamp_of_crash",
"type": "timestamp",
"description": "This data element records the date and time on which the crash occurred.",
"mode": "NULLABLE"
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -1204,5 +1204,11 @@
"type": "string",
"description": "This element identifies the gross vehicle weight rating of any trailing units as identified by the manufacturer in the vehicle’s VIN.",
"mode": "NULLABLE"
},
{
"name": "timestamp_of_crash",
"type": "timestamp",
"description": "This data element records the date and time on which the crash occurred.",
"mode": "NULLABLE"
}
]
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,9 @@ def execute_pipeline(
field_separator=",",
rename_mappings_list=rename_mappings_list,
input_dtypes=input_dtypes,
input_csv_headers=input_csv_headers
input_csv_headers=input_csv_headers,
pipeline_name = pipeline_name,
process_year = process_year
)
if os.path.exists(target_file):
upload_file_to_gcs(
Expand Down Expand Up @@ -165,7 +167,9 @@ def process_source_file(
field_separator: str,
rename_mappings_list: dict,
input_dtypes: dict,
input_csv_headers: typing.List[str]
input_csv_headers: typing.List[str],
pipeline_name: str,
process_year : int
) -> pd.DataFrame:
unpack_file(source_file, source_file_unzip_dir, "zip")
logging.info(f"Opening source file {source_file}")
Expand All @@ -191,7 +195,9 @@ def process_source_file(
input_dtypes=input_dtypes,
target_file=target_file,
chunk_number=chunk_number,
rename_mappings_list=rename_mappings_list
rename_mappings_list=rename_mappings_list,
pipeline_name = pipeline_name,
process_year = process_year
)
data = []
chunk_number += 1
Expand All @@ -202,7 +208,9 @@ def process_source_file(
input_dtypes=input_dtypes,
target_file=target_file,
chunk_number=chunk_number,
rename_mappings_list=rename_mappings_list
rename_mappings_list=rename_mappings_list,
pipeline_name = pipeline_name,
process_year = process_year
)


Expand All @@ -212,7 +220,9 @@ def process_dataframe_chunk(
input_dtypes: dict,
target_file: str,
chunk_number: int,
rename_mappings_list: dict
rename_mappings_list: dict,
pipeline_name : str,
process_year : int
) -> None:
df = pd.DataFrame(
data,
Expand All @@ -227,7 +237,9 @@ def process_dataframe_chunk(
target_file_batch=target_file_batch,
target_file=target_file,
skip_header=(not chunk_number == 1),
rename_headers_list=rename_mappings_list
rename_headers_list=rename_mappings_list,
pipeline_name = pipeline_name,
process_year =process_year
)


Expand All @@ -246,20 +258,36 @@ def process_chunk(
target_file_batch: str,
target_file: str,
skip_header: bool,
rename_headers_list: dict
rename_headers_list: dict,
pipeline_name : str,
process_year :int
) -> None:
logging.info(f"Processing batch file {target_file_batch}")
df = rename_headers(df, rename_headers_list)
# pipeline_name= 'Accident'
# if pipeline_name == 'Accident':
# create_new_timestamp_column(df)
save_to_new_file(df, file_path=str(target_file_batch), sep="|")
append_batch_file(target_file_batch, target_file, skip_header, not (skip_header))
new_pipeline_name = (pipeline_name.split('-')[1]).lower().strip()
if new_pipeline_name in ["accident"]:
create_new_timestamp_column(df,new_pipeline_name,process_year)
save_to_new_file(df, file_path=str(target_file_batch), sep="|")
append_batch_file(target_file_batch, target_file, skip_header, not (skip_header))
elif new_pipeline_name in ["person","vehicle"]:
create_new_timestamp_column(df,new_pipeline_name,process_year)
else:
save_to_new_file(df, file_path=str(target_file_batch), sep="|")
append_batch_file(target_file_batch, target_file, skip_header, not (skip_header))

logging.info(f"Processing batch file {target_file_batch} completed")

def create_new_timestamp_column(df: pd.DataFrame):
df['timestamp_of_crash']=df['year_of_crash'].apply(lambda x : str(x))+'-'+df['month_of_crash'].apply(lambda x : "0"+str(x) if len(str(x))==1 else str(x))+'-'+df['day_of_crash'].apply(lambda x : "0"+str(x) if len(str(x))==1 else str(x))+' '+df['hour_of_crash'].apply(lambda x : "0"+str(x) if len(str(x))==1 else str(x))+':'+df['minute_of_crash'].apply(lambda x : "0"+str(x) if len(str(x))==1 else str(x))+':'+'00'+' UTC'
return df
def create_new_timestamp_column(
    df: pd.DataFrame, new_pipeline_name: str, process_year: int
) -> pd.DataFrame:
    """Add a ``timestamp_of_crash`` column ("YYYY-MM-DD HH:MM:00 UTC") to *df*.

    Rows whose hour/minute parts are out of range are dropped first. The
    ``accident`` file carries its own ``year_of_crash`` column; the
    ``person``/``vehicle`` files do not, so their year comes from
    *process_year* (the year of the pipeline run being processed).

    Args:
        df: chunk of the source file; mutated in place (rows dropped,
            column added) and also returned for convenience.
        new_pipeline_name: lower-cased pipeline suffix, e.g. "accident",
            "person", "vehicle"; any other value leaves *df* untouched.
        process_year: four-digit year used when *df* has no year column.

    Returns:
        The same ``DataFrame`` object, mutated in place.
    """
    if new_pipeline_name not in ("accident", "person", "vehicle"):
        return df

    # Drop rows whose time-of-day parts cannot form a valid timestamp.
    # NOTE(review): hour 24 still passes this filter and produces an
    # unparseable "24:MM" value — confirm whether the threshold should
    # be 23 (FARS codes unknown hours as 99, which this does drop).
    df.drop(df[df["hour_of_crash"].apply(lambda v: int(v)) > 24].index, inplace=True)
    df.drop(df[df["minute_of_crash"].apply(lambda v: int(v)) > 59].index, inplace=True)

    def _two_digits(column: str) -> pd.Series:
        # Zero-pad each value to at least two characters ("3" -> "03").
        return df[column].apply(lambda v: str(v).zfill(2))

    if new_pipeline_name == "accident":
        year = df["year_of_crash"].apply(lambda v: str(v))
    else:
        # person/vehicle files: year is constant for the whole run.
        year = str(process_year)

    df["timestamp_of_crash"] = (
        year
        + "-" + _two_digits("month_of_crash")
        + "-" + _two_digits("day_of_crash")
        + " " + _two_digits("hour_of_crash")
        + ":" + _two_digits("minute_of_crash")
        + ":00 UTC"
    )
    return df

def unpack_file(infile: str, dest_path: str, compression_type: str = "zip") -> None:
if compression_type == "zip":
Expand Down