from pyspark.sql.functions import date_format, expr, UserDefinedFunction, when
from pyspark.sql.types import FloatType, StringType, StructField, StructType

-
-BUCKET_NAME = sys.argv[1]
-DATASET_NAME = sys.argv[2]
TABLE = "bigquery-public-data.new_york_citibike.citibike_trips"
CITIBIKE_TABLE_NAME = "RAW_DATA"
EXTERNAL_TABLES = {
@@ -96,7 +93,7 @@ def gender(s):

def convert_angle(angle):
    """Converts long and lat to DMS notation"""
-    if angle is None:
+    if not angle:
        return None
    degrees = int(angle)
    minutes = int((angle - degrees) * 60)
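One behavioral note on this hunk: the new truthiness check catches more than None. A minimal sketch of the difference, using hypothetical helper names (the rest of the function body is abridged by the hunk):

```python
# Sketch only, not part of the diff: swapping "is None" for "not angle"
# also treats an angle of exactly 0.0 (a valid coordinate) as missing.
def is_missing_old(angle):
    return angle is None   # only a genuine null short-circuits

def is_missing_new(angle):
    return not angle       # None, 0, 0.0, and -0.0 all short-circuit

print(is_missing_old(0.0))  # False: 0.0 would still be converted
print(is_missing_new(0.0))  # True: 0.0 is now skipped as well
```

If coordinates of exactly zero never occur in the dataset, the two checks behave identically.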
@@ -105,19 +102,19 @@ def convert_angle(angle):
    return random.choices([str(angle), new_angle], weights=[0.55, 0.45])[0]


-def create_bigquery_dataset():
+def create_bigquery_dataset(dataset_name):
    # Create BigQuery Dataset
    client = bigquery.Client()
-    dataset_id = f"{client.project}.{DATASET_NAME}"
+    dataset_id = f"{client.project}.{dataset_name}"
    dataset = bigquery.Dataset(dataset_id)
    dataset.location = "US"
    dataset = client.create_dataset(dataset)


-def write_to_bigquery(df, table_name):
+def write_to_bigquery(df, table_name, dataset_name):
    """Write a dataframe to BigQuery"""
    client = bigquery.Client()
-    dataset_id = f"{client.project}.{DATASET_NAME}"
+    dataset_id = f"{client.project}.{dataset_name}"

    # Saving the data to BigQuery
    df.write.format("bigquery").option("table", f"{dataset_id}.{table_name}").save()
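For illustration, a call site under the new signatures could look like the following; the dataset name and DataFrame here are placeholders, not values from this diff:

```python
# Hypothetical usage of the parameterized helpers ("my_dataset" and df are
# placeholders). The dataset now travels as an explicit argument instead of
# being read from the removed module-level DATASET_NAME global.
create_bigquery_dataset("my_dataset")
write_to_bigquery(df, "RAW_DATA", "my_dataset")
```

Threading the name through as a parameter keeps both helpers usable from code that never touches sys.argv.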
@@ -126,12 +123,16 @@ def write_to_bigquery(df, table_name):


def main():
-    # Create a SparkSession under the name "setup". Viewable via the Spark UI
+    # Get command line arguments
+    BUCKET_NAME = sys.argv[1]
+    DATASET_NAME = sys.argv[2]
+
+    # Create a SparkSession under the name "setup"
    spark = SparkSession.builder.appName("setup").getOrCreate()

    spark.conf.set("temporaryGcsBucket", BUCKET_NAME)

-    create_bigquery_dataset()
+    create_bigquery_dataset(DATASET_NAME)

    # Whether we are running the job as a test
    test = False
@@ -147,7 +148,7 @@ def main():
    for table_name, data in EXTERNAL_TABLES.items():
        df = spark.createDataFrame(pd.read_csv(data["url"]), schema=data["schema"])

-        write_to_bigquery(df, table_name)
+        write_to_bigquery(df, table_name, DATASET_NAME)

    # Check if table exists
    try:
@@ -203,7 +204,7 @@ def main():
    df = df.union(dup_df)

    print("Uploading citibike dataset...")
-    write_to_bigquery(df, CITIBIKE_TABLE_NAME)
+    write_to_bigquery(df, CITIBIKE_TABLE_NAME, DATASET_NAME)


if __name__ == "__main__":
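Since the two positional arguments are now read inside main(), they can also be injected programmatically. A sketch, assuming the script is importable as a module named setup and using placeholder argument values:

```python
# Sketch of a programmatic run; the module name "setup" and both argument
# values are assumptions, not taken from the diff.
import sys

import setup

# Mimic the command line "setup.py <bucket> <dataset>"; main() still
# performs the real work against Spark and BigQuery.
sys.argv = ["setup.py", "my-staging-bucket", "my_dataset"]
setup.main()
```

Before this change, the import line alone would have raised IndexError whenever sys.argv carried no job arguments, since the module-level assignments ran at import time.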