# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# [START composer_dataflow_dag]

| 18 | +"""Example Airflow DAG that creates a Cloud Dataproc cluster, runs the Hadoop |
| 19 | +wordcount example, and deletes the cluster. |
| 20 | +
|
| 21 | +This DAG relies on three Airflow variables |
| 22 | +https://airflow.apache.org/concepts.html#variables |
| 23 | +* project_id - Google Cloud Project ID to use for the Cloud Dataflow cluster. |
| 24 | +* gce_zone - Google Compute Engine zone where Cloud Dataflow cluster should be |
| 25 | + created. |
| 26 | +* gce_region - Google Compute Engine region where Cloud Dataflow cluster should be |
| 27 | + created. |
| 28 | +Learn more about the difference between the two here: |
| 29 | +https://cloud.google.com/compute/docs/regions-zones |
| 30 | +* bucket_path - Google Cloud Storage bucket where you've stored the User Defined |
| 31 | +Function (.js), the input file (.txt), and the JSON schema (.json). |
| 32 | +""" |

import datetime

from airflow import models
from airflow.contrib.operators.dataflow_operator import DataflowTemplateOperator
from airflow.utils.dates import days_ago
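
# NOTE: the contrib import above is the Airflow 1.10 path (Cloud Composer 1).
# On Airflow 2, the equivalent operator ships in the
# apache-airflow-providers-google package (as DataflowTemplatedJobStartOperator);
# check the provider documentation for the exact import path for your version.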

bucket_path = models.Variable.get("bucket_path")
project_id = models.Variable.get("project_id")
gce_zone = models.Variable.get("gce_zone")
gce_region = models.Variable.get("gce_region")
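
# These variables must be set before the scheduler parses this file. One way to
# create them (ENVIRONMENT_NAME and LOCATION below are placeholders for your
# Cloud Composer environment):
#
#   gcloud composer environments run ENVIRONMENT_NAME --location LOCATION \
#       variables -- --set project_id your-project-id
#
# You can also set them in the Airflow web interface under Admin > Variables.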

default_args = {
    # Tell Airflow to start one day ago, so that the DAG runs as soon as you
    # upload it.
    "start_date": days_ago(1),
    "dataflow_default_options": {
        "project": project_id,
        # Set to your region.
        "region": gce_region,
        # Set to your zone.
        "zone": gce_zone,
        # A subfolder for storing temporary files, such as the staged pipeline job.
        "temp_location": bucket_path + "/tmp/",
    },
}
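
# Airflow passes default_args to every task in the DAG, and the Dataflow
# operators merge the dataflow_default_options entry into each job's pipeline
# options, so the settings above apply to the task below without being repeated.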

# Define a DAG (directed acyclic graph) of tasks.
# Any task you create within the context manager is automatically added to the
# DAG object.
with models.DAG(
    # The id you will see in the Airflow web interface
    "composer_dataflow_dag",
    default_args=default_args,
    # The interval with which to schedule the DAG
    schedule_interval=datetime.timedelta(days=1),  # Override to match your needs
) as dag:

    start_template_job = DataflowTemplateOperator(
        # The task id of your job
        task_id="dataflow_operator_transform_csv_to_bq",
        # The name of the template that you're using.
        # The link below lists all of the templates you can use.
        # For versions in non-production environments, use the subfolder 'latest'.
        # https://cloud.google.com/dataflow/docs/guides/templates/provided-batch#gcstexttobigquery
        template="gs://dataflow-templates/latest/GCS_Text_to_BigQuery",
        # Use the link above to specify the correct parameters for your template.
        parameters={
            "javascriptTextTransformFunctionName": "transformCSVtoJSON",
            "JSONPath": bucket_path + "/jsonSchema.json",
            "javascriptTextTransformGcsPath": bucket_path + "/transformCSVtoJSON.js",
            "inputFilePattern": bucket_path + "/inputFile.txt",
            "outputTable": project_id + ":average_weather.average_weather",
            "bigQueryLoadingTemporaryDirectory": bucket_path + "/tmp/",
        },
    )
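
    # The GCS_Text_to_BigQuery template calls the JavaScript function named in
    # javascriptTextTransformFunctionName once per line of the input file and
    # expects it to return a JSON string matching the schema in jsonSchema.json;
    # see the template documentation linked above for the exact contract.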

# [END composer_dataflow_dag]