
Commit 9561f35

WIP: address PR comments
1 parent 35ec8cb commit 9561f35

File tree

4 files changed
+24 -23 lines changed
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-pytest==5.3.2
+pytest==6.0.0
Lines changed: 4 additions & 5 deletions

@@ -1,7 +1,6 @@
-grpcio==1.29.0
-google-auth==1.16.0
-google-auth-httplib2==0.0.3
-google-cloud==0.34.0
+#grpcio==1.29.0
+#google-auth==1.16.0
+#google-auth-httplib2==0.0.3
 google-cloud-storage==1.28.1
-google-cloud-dataproc==0.8.0
+google-cloud-dataproc==2.0.0
 google-cloud-bigquery==1.25.0
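
Note: the google-cloud-dataproc bump from 0.8.0 to 2.0.0 crosses a major rewrite of the generated clients, which is what drives the keyword-argument call sites in setup_test.py below. A minimal guard against running with the old library still installed, assuming Python 3.8+ for importlib.metadata:

import importlib.metadata

# Fail fast if an older google-cloud-dataproc is still installed.
version = importlib.metadata.version("google-cloud-dataproc")
if int(version.split(".")[0]) < 2:
    raise RuntimeError(f"google-cloud-dataproc {version} found; 2.x is required")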

data-science-onramp/data-ingestion/setup.py

Lines changed: 13 additions & 12 deletions
@@ -18,9 +18,6 @@
 from pyspark.sql.functions import date_format, expr, UserDefinedFunction, when
 from pyspark.sql.types import FloatType, StringType, StructField, StructType
 
-
-BUCKET_NAME = sys.argv[1]
-DATASET_NAME = sys.argv[2]
 TABLE = "bigquery-public-data.new_york_citibike.citibike_trips"
 CITIBIKE_TABLE_NAME = "RAW_DATA"
 EXTERNAL_TABLES = {
@@ -96,7 +93,7 @@ def gender(s):
 
 def convert_angle(angle):
     """Converts long and lat to DMS notation"""
-    if angle is None:
+    if not angle:
         return None
     degrees = int(angle)
     minutes = int((angle - degrees) * 60)
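
Worth flagging on the `is None` to truthiness swap: `not angle` is also true for 0.0, so a longitude or latitude lying exactly on the prime meridian or equator is now skipped as well. A quick illustration:

# "not angle" treats both None and 0.0 as missing values.
for angle in (None, 0.0, 40.7):
    print(angle, "->", "skipped" if not angle else "converted")
# None -> skipped
# 0.0  -> skipped (was converted under "angle is None")
# 40.7 -> converted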
@@ -105,19 +102,19 @@
     return random.choices([str(angle), new_angle], weights=[0.55, 0.45])[0]
 
 
-def create_bigquery_dataset():
+def create_bigquery_dataset(dataset_name):
     # Create BigQuery Dataset
     client = bigquery.Client()
-    dataset_id = f"{client.project}.{DATASET_NAME}"
+    dataset_id = f"{client.project}.{dataset_name}"
     dataset = bigquery.Dataset(dataset_id)
     dataset.location = "US"
     dataset = client.create_dataset(dataset)
 
 
-def write_to_bigquery(df, table_name):
+def write_to_bigquery(df, table_name, dataset_name):
     """Write a dataframe to BigQuery"""
     client = bigquery.Client()
-    dataset_id = f"{client.project}.{DATASET_NAME}"
+    dataset_id = f"{client.project}.{dataset_name}"
 
     # Saving the data to BigQuery
     df.write.format("bigquery").option("table", f"{dataset_id}.{table_name}").save()
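
With the dataset name threaded through as a parameter, both helpers are now usable without module-level globals. A usage sketch, with a hypothetical dataset name:

# Hypothetical dataset name; df is a Spark DataFrame built earlier.
create_bigquery_dataset("citibike_dataset")
write_to_bigquery(df, "RAW_DATA", "citibike_dataset")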
@@ -126,12 +123,16 @@ def write_to_bigquery(df, table_name):
 
 
 def main():
-    # Create a SparkSession under the name "setup". Viewable via the Spark UI
+    # Get command line arguments
+    BUCKET_NAME = sys.argv[1]
+    DATASET_NAME = sys.argv[2]
+
+    # Create a SparkSession under the name "setup"
     spark = SparkSession.builder.appName("setup").getOrCreate()
 
     spark.conf.set("temporaryGcsBucket", BUCKET_NAME)
 
-    create_bigquery_dataset()
+    create_bigquery_dataset(DATASET_NAME)
 
     # Whether we are running the job as a test
     test = False
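
Moving the sys.argv reads inside main() means importing the module no longer requires command-line arguments; only running it does. A sketch of why that matters for the test file below, assuming the file imports as a module named setup:

import setup   # safe now: nothing reads sys.argv[1]/sys.argv[2] at import time
# setup.main() is the only entry point that needs the bucket and dataset args.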
@@ -147,7 +148,7 @@ def main():
     for table_name, data in EXTERNAL_TABLES.items():
         df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"])
 
-        write_to_bigquery(df, table_name)
+        write_to_bigquery(df, table_name, DATASET_NAME)
 
     # Check if table exists
     try:
@@ -203,7 +204,7 @@ def main():
     df = df.union(dup_df)
 
     print("Uploading citibike dataset...")
-    write_to_bigquery(df, CITIBIKE_TABLE_NAME)
+    write_to_bigquery(df, CITIBIKE_TABLE_NAME, DATASET_NAME)
 
 
 if __name__ == "__main__":

data-science-onramp/data-ingestion/setup_test.py

Lines changed: 6 additions & 5 deletions
@@ -35,7 +35,7 @@
     "project_id": PROJECT_ID,
     "cluster_name": DATAPROC_CLUSTER,
     "config": {
-        "gce_cluster_config": {"zone_uri": "",},
+        "gce_cluster_config": {"zone_uri": ""},
         "master_config": {"num_instances": 1, "machine_type_uri": "n1-standard-8"},
         "worker_config": {"num_instances": 6, "machine_type_uri": "n1-standard-8"},
        "software_config": {
@@ -48,7 +48,7 @@
     "placement": {"cluster_name": DATAPROC_CLUSTER},
     "pyspark_job": {
         "main_python_file_uri": f"gs://{BUCKET_NAME}/{BUCKET_BLOB}",
-        "args": [BUCKET_NAME, BQ_DATASET, "--test",],
+        "args": [BUCKET_NAME, BQ_DATASET, "--test"],
         "jar_file_uris": ["gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar"],
     },
 }
@@ -58,10 +58,11 @@
 def setup_and_teardown_cluster():
     # Create cluster using cluster client
     cluster_client = dataproc.ClusterControllerClient(
-        client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"}
+        #client_options={"api_endpoint": f"{CLUSTER_REGION}-dataproc.googleapis.com:443"}
     )
+
     operation = cluster_client.create_cluster(
-        PROJECT_ID, CLUSTER_REGION, CLUSTER_CONFIG
+        project_id=PROJECT_ID, region=CLUSTER_REGION, cluster=CLUSTER_CONFIG
     )
 
     # Wait for cluster to provision
7172

7273
# Delete cluster
7374
operation = cluster_client.delete_cluster(
74-
PROJECT_ID, CLUSTER_REGION, DATAPROC_CLUSTER
75+
project_id=PROJECT_ID, region=CLUSTER_REGION, name=DATAPROC_CLUSTER
7576
)
7677
operation.result()
7778
