# Copyright 2022 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This DAG is an expansion of data_analytics_dag.py. It runs a more complex
# Dataproc job, defined in data_analytics_process_expansion.py.

import datetime

from airflow import models
from airflow.providers.google.cloud.operators import dataproc
from airflow.providers.google.cloud.operators.bigquery import BigQueryInsertJobOperator
from airflow.providers.google.cloud.transfers.gcs_to_bigquery import (
    GCSToBigQueryOperator,
)
from airflow.utils.task_group import TaskGroup

PROJECT_NAME = "{{var.value.gcp_project}}"

# BigQuery configs
BQ_DESTINATION_DATASET_NAME = "precipitation_changes"
BQ_DESTINATION_TABLE_NAME = "ghcnd_stations_joined"
BQ_NORMALIZED_TABLE_NAME = "ghcnd_stations_normalized"
BQ_PRCP_MEAN_TABLE_NAME = "ghcnd_stations_prcp_mean"
BQ_SNOW_MEAN_TABLE_NAME = "ghcnd_stations_snow_mean"
BQ_PHX_PRCP_TABLE_NAME = "phx_annual_prcp"
BQ_PHX_SNOW_TABLE_NAME = "phx_annual_snow"

# Dataproc configs
BUCKET_NAME = "{{var.value.gcs_bucket}}"
PYSPARK_JAR = "gs://spark-lib/bigquery/spark-bigquery-with-dependencies_2.12-0.26.0.jar"
PROCESSING_PYTHON_FILE = f"gs://{BUCKET_NAME}/data_analytics_process_expansion.py"

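# The batch runs PROCESSING_PYTHON_FILE on Dataproc Serverless with the Spark
# BigQuery connector jar on the classpath. Everything in "args" is forwarded to
# data_analytics_process_expansion.py as positional command-line arguments, so
# the order here must match what that script expects.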
BATCH_ID = "data-processing-{{ ts_nodash | lower}}"  # Dataproc serverless only allows lowercase characters
BATCH_CONFIG = {
    "pyspark_batch": {
        "jar_file_uris": [PYSPARK_JAR],
        "main_python_file_uri": PROCESSING_PYTHON_FILE,
        "args": [
            BUCKET_NAME,
            f"{BQ_DESTINATION_DATASET_NAME}.{BQ_DESTINATION_TABLE_NAME}",
            f"{BQ_DESTINATION_DATASET_NAME}.{BQ_NORMALIZED_TABLE_NAME}",
            f"{BQ_DESTINATION_DATASET_NAME}.{BQ_PRCP_MEAN_TABLE_NAME}",
            f"{BQ_DESTINATION_DATASET_NAME}.{BQ_SNOW_MEAN_TABLE_NAME}",
            f"{BQ_DESTINATION_DATASET_NAME}.{BQ_PHX_PRCP_TABLE_NAME}",
            f"{BQ_DESTINATION_DATASET_NAME}.{BQ_PHX_SNOW_TABLE_NAME}",
        ],
    },
    "environment_config": {
        "execution_config": {
            "service_account": "{{var.value.dataproc_service_account}}"
        }
    },
}

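# Midnight at the start of the previous day, used as the DAG start_date below.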
yesterday = datetime.datetime.combine(
    datetime.datetime.today() - datetime.timedelta(1), datetime.datetime.min.time()
)

default_dag_args = {
    # Setting start date as yesterday starts the DAG immediately when it is
    # detected in the Cloud Storage bucket.
    "start_date": yesterday,
    # To email on failure or retry, set the 'email' arg to your address and
    # enable emailing here.
    "email_on_failure": False,
    "email_on_retry": False,
}

with models.DAG(
    "data_analytics_dag",
    # Continue to run DAG once per day
    schedule_interval=datetime.timedelta(days=1),
    default_args=default_dag_args,
) as dag:

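    # Submit the PySpark job defined in BATCH_CONFIG as a Dataproc Serverless
    # batch. Batch IDs must be unique, so BATCH_ID embeds the lowercased run
    # timestamp.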
    create_batch = dataproc.DataprocCreateBatchOperator(
        task_id="create_batch",
        project_id=PROJECT_NAME,
        region="{{ var.value.gce_region }}",
        batch=BATCH_CONFIG,
        batch_id=BATCH_ID,
    )

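    # Load the preprocessed GHCN stations CSV from the Cloud Storage bucket into
    # BigQuery, overwriting any existing rows (WRITE_TRUNCATE).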
    load_external_dataset = GCSToBigQueryOperator(
        task_id="run_bq_external_ingestion",
        bucket=BUCKET_NAME,
        source_objects=["ghcn-stations-processed.csv"],
        destination_project_dataset_table=f"{BQ_DESTINATION_DATASET_NAME}.ghcnd-stations-new",
        source_format="CSV",
        schema_fields=[
            {"name": "ID", "type": "STRING", "mode": "REQUIRED"},
            {"name": "LATITUDE", "type": "FLOAT", "mode": "REQUIRED"},
            {"name": "LONGITUDE", "type": "FLOAT", "mode": "REQUIRED"},
            {"name": "ELEVATION", "type": "FLOAT", "mode": "REQUIRED"},
            {"name": "STATE", "type": "STRING", "mode": "NULLABLE"},
            {"name": "NAME", "type": "STRING", "mode": "REQUIRED"},
        ],
        write_disposition="WRITE_TRUNCATE",
    )

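    # Join the station metadata with each annual GHCN-D public table (1997-2021)
    # and append the results to a single joined table for downstream processing.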
    with TaskGroup("join_bq_datasets") as bq_join_group:

        for year in range(1997, 2022):
            # Table holding this year's observations in the GHCN-D public dataset
            BQ_DATASET_NAME = f"bigquery-public-data.ghcn_d.ghcnd_{str(year)}"
            GHCND_STATIONS_JOIN_QUERY = f"""
            SELECT Stations.ID, Stations.LATITUDE, Stations.LONGITUDE,
            Stations.STATE, Table.DATE, Table.ELEMENT, Table.VALUE
            FROM `{PROJECT_NAME}.{BQ_DESTINATION_DATASET_NAME}.ghcnd-stations-new` AS Stations, {BQ_DATASET_NAME} AS Table
            WHERE Stations.ID = Table.id
            """

            bq_join_stations_data = BigQueryInsertJobOperator(
                task_id=f"bq_join_stations_data_{str(year)}",
                configuration={
                    "query": {
                        "query": GHCND_STATIONS_JOIN_QUERY,
                        "useLegacySql": False,
                        "destinationTable": {
                            "projectId": PROJECT_NAME,
                            "datasetId": BQ_DESTINATION_DATASET_NAME,
                            "tableId": BQ_DESTINATION_TABLE_NAME,
                        },
                        "writeDisposition": "WRITE_APPEND",
                    }
                },
                location="US",
            )

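    # Ingest the station CSV first, then run the per-year joins, and only then
    # launch the Dataproc Serverless processing job.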
    load_external_dataset >> bq_join_group >> create_batch