update : violatn pipeline

GoogleCloudPlatform · nlarge-google · Aug 26, 2022 · Apr 16, 2022 · Aug 17, 2022 · Aug 18, 2022
commit 36af9617a057e700f3211b6979a5fce67ce96f91
diff --git a/...affic_fatalities/pipelines/_images/nhtsa_traffic_fatalities_violatn_2015_2020_schema.json b/...affic_fatalities/pipelines/_images/nhtsa_traffic_fatalities_violatn_2015_2020_schema.json
@@ -0,0 +1,39 @@
+[
+	{
+		"name": "state_number",
+		"type": "integer",
+		"description": "This data element identifies the state in which the crash occurred. The codes are from the General Services Administration’s (GSA) publication of worldwide Geographic Location Codes (GLC). For more info on the codes, please look at <C1/V1/D1/PC1/P1/NM1 State Number> section in the pdf: https://crashstats.nhtsa.dot.gov/Api/Public/ViewPublication/812315",
+		"mode": "NULLABLE"
+	},
+	{
+		"name": "state_name",
+		"type": "string",
+		"description": "This data element is the name of the state in which the crash occurred. It corresponds to the state_number code, which is taken from the General Services Administration’s (GSA) publication of worldwide Geographic Location Codes (GLC).",
+		"mode": "NULLABLE"
+	},
+	{
+		"name": "consecutive_number",
+		"type": "integer",
+		"description": "This data element is the unique case number assigned to each crash. It appears on each data file and is used to merge information from the data files together. Format: xxxxxx, i.e. two characters for the state code followed by four characters for the case number.",
+		"mode": "NULLABLE"
+	},
+	{
+		"name": "vehicle_number",
+		"type": "integer",
+		"description": "This data element is the consecutive number assigned to each vehicle in the case. This data element appears on each vehicle level data file and is used in conjunction with the ST_CASE data element to merge information from vehicle level data files. Values: 000-999, the assigned number of the motor vehicle.",
+		"mode": "NULLABLE"
+	},
+	{
+		"name": "violations_charged",
+		"type": "string",
+		"description": "This data element identifies all violations charged to this driver. For more info on the codes, please look at <D21 Violations Charged> section in the pdf: https://crashstats.nhtsa.dot.gov/Api/Public/ViewPublication/812315",
+		"mode": "NULLABLE"
+	},
+	{
+		"name": "violations_charged_name",
+		"type": "string",
+		"description": "This data element is the descriptive name of each violation charged to this driver, corresponding to the code in violations_charged. For more info on the codes, please look at <D21 Violations Charged> section in the pdf: https://crashstats.nhtsa.dot.gov/Api/Public/ViewPublication/812315",
+		"mode": "NULLABLE"
+	}
+
+]
diff --git a/datasets/nhtsa_traffic_fatalities/pipelines/nhtsa_traffic_fatalities/pipeline.yaml b/datasets/nhtsa_traffic_fatalities/pipelines/nhtsa_traffic_fatalities/pipeline.yaml
@@ -8096,7 +8096,64 @@ dag:
           request_ephemeral_storage: "10G"
           request_cpu: "1"
 
+    - operator: "KubernetesPodOperator"  # Task: CSV transform for the violatn 2015-2020 data, run in a Kubernetes pod
+      description: "Run CSV transform within kubernetes pod for violatn pipelines"
+      args:
+        task_id: "violatn_2015_2020_transform_csv"  # this id is also listed in graph_paths between create_cluster and delete_cluster
+        startup_timeout_seconds: 600  # seconds allowed for the pod to start
+        name: "violatn"
+        namespace: "composer"
+        service_account_name: "datasets"
+        image_pull_policy: "Always"  # Kubernetes pull policy: check the registry for the image on every pod start
+        image: "{{ var.json.nhtsa_traffic_fatalities.container_registry.run_csv_transform_kub }}"
+        env_vars:  # environment for the container; most values are templated from the nhtsa_traffic_fatalities JSON variable
+          PIPELINE_NAME: "{{ var.json.nhtsa_traffic_fatalities.violatn_2015_2020.pipeline_name }}"
+          SOURCE_URL: "{{ var.json.nhtsa_traffic_fatalities.violatn_2015_2020.source_url }}"
+          CHUNKSIZE: "{{ var.json.nhtsa_traffic_fatalities.violatn_2015_2020.chunksize }}"
+          SOURCE_ZIPFILE_EXTRACTED: "violatn_2015_2020.csv"  # hard-coded here, unlike the neighbouring templated values
+          SOURCE_FILE: "{{ var.json.nhtsa_traffic_fatalities.violatn_2015_2020.source_file }}"
+          # TARGET_FILE: "{{ var.json.nhtsa_traffic_fatalities.violatn_2015_2020.target_file }}"
+          PROJECT_ID: "{{ var.value.gcp_project }}"
+          DATASET_ID: "{{ var.json.nhtsa_traffic_fatalities.violatn_2015_2020.dataset_id }}"
+          TABLE_ID: "{{ var.json.nhtsa_traffic_fatalities.violatn_2015_2020.destination_table }}"
+          START_YEAR: "{{ var.json.nhtsa_traffic_fatalities.violatn_2015_2020.start_year }}"
+          END_YEAR: "{{ var.json.nhtsa_traffic_fatalities.violatn_2015_2020.end_year }}"
+          DROP_DEST_TABLE: "{{ var.json.nhtsa_traffic_fatalities.violatn_2015_2020.drop_dest_table }}"
+          TARGET_GCS_BUCKET: "{{ var.value.composer_bucket }}"
+          TARGET_GCS_PATH: "{{ var.json.nhtsa_traffic_fatalities.violatn_2015_2020.target_gcs_path }}"
+          SCHEMA_PATH: "{{ var.json.nhtsa_traffic_fatalities.violatn_2015_2020.schema_path }}"
+          INPUT_CSV_HEADERS: >-  # JSON array passed as a string; same names as the RENAME_MAPPINGS_LIST values
+            [
+              "state_number",
+              "state_name",
+              "consecutive_number",
+              "vehicle_number",
+              "violations_charged",
+              "violations_charged_name"
+            ]
+          INPUT_DTYPES: >-  # JSON object passed as a string; every listed column is typed "str"
+            {
+              "state_number": "str",
+              "state_name": "str",
+              "consecutive_number": "str",
+              "vehicle_number": "str",
+              "violations_charged": "str",
+              "violations_charged_name": "str"
+            }
+          RENAME_MAPPINGS_LIST: >-  # JSON object; presumably source column -> output column, confirm in the transform script
+            {
+              "STATE": "state_number",
+              "STATENAME": "state_name",
+              "ST_CASE": "consecutive_number",
+              "VEH_NO": "vehicle_number",
+              "MVIOLATN": "violations_charged",
+              "MVIOLATNNAME": "violations_charged_name"
+            }
+        resources:  # resource requests for the pod
+          request_ephemeral_storage: "10G"
+          request_cpu: "1"
+
 
 
   graph_paths:
-    - "create_cluster >> [ accident_2015_transform_csv,accident_2016_2019_transform_csv,accident_2020_transform_csv,cevent_2015_2020_transform_csv,damage_2015_2020_transform_csv,distract_2015_2020_transform_csv,drimpair_2015_2020_transform_csv,factor_2015_2020_transform_csv,maneuver_2015_2020_transform_csv,nmcrash_2015_2020_transform_csv,nmimpair_2015_2020_transform_csv,parkwork_2015_transform_csv,parkwork_2016_2017_transform_csv,parkwork_2018_transform_csv,parkwork_2019_transform_csv,parkwork_2020_transform_csv,pbtype_transform_csv,person_2015_2017_transform_csv,person_2018_transform_csv,person_2019_transform_csv,person_2020_transform_csv,safetyeq_2015_2016_transform_csv,safetyeq_2017_2020_transform_csv,vehicle_2015_transform_csv,vehicle_2016_2017_transform_csv,vehicle_2018_2019_transform_csv,vehicle_2020_transform_csv,vevent_2015_2020_transform_csv,vindecode_2015_transform_csv ] >> delete_cluster"
+    - "create_cluster >> [ accident_2015_transform_csv,accident_2016_2019_transform_csv,accident_2020_transform_csv,cevent_2015_2020_transform_csv,damage_2015_2020_transform_csv,distract_2015_2020_transform_csv,drimpair_2015_2020_transform_csv,factor_2015_2020_transform_csv,maneuver_2015_2020_transform_csv,nmcrash_2015_2020_transform_csv,nmimpair_2015_2020_transform_csv,parkwork_2015_transform_csv,parkwork_2016_2017_transform_csv,parkwork_2018_transform_csv,parkwork_2019_transform_csv,parkwork_2020_transform_csv,pbtype_transform_csv,person_2015_2017_transform_csv,person_2018_transform_csv,person_2019_transform_csv,person_2020_transform_csv,safetyeq_2015_2016_transform_csv,safetyeq_2017_2020_transform_csv,vehicle_2015_transform_csv,vehicle_2016_2017_transform_csv,vehicle_2018_2019_transform_csv,vehicle_2020_transform_csv,vevent_2015_2020_transform_csv,vindecode_2015_transform_csv,violatn_2015_2020_transform_csv ] >> delete_cluster"