@@ -20,7 +20,6 @@
 import os
 import uuid
 
-import backoff
 from google.api_core.exceptions import Aborted, NotFound
 from google.cloud import bigquery
 from google.cloud import dataproc_v1 as dataproc
@@ -50,12 +49,13 @@
 PROCESSING_PYTHON_FILE = f"gs://{BUCKET_NAME}/{BUCKET_BLOB}"
 
 
-@pytest.fixture(scope="module")
-def test_dataproc_batch():
+@pytest.fixture(scope="function")
+def test_dataproc_batch(test_bucket, bq_dataset):
+    # check that the results table isn't there
+    with pytest.raises(NotFound):
+        BQ_CLIENT.get_table(f"{BQ_DATASET}.{BQ_WRITE_TABLE}")
 
-    BATCH_ID = (
-        f"summit-dag-test-{TEST_ID}"  # Dataproc serverless only allows lowercase characters
-    )
+    BATCH_ID = f"summit-dag-test-{TEST_ID}"  # Dataproc serverless only allows lowercase characters
     BATCH_CONFIG = {
         "pyspark_batch": {
             "jar_file_uris": [PYSPARK_JAR],
@@ -68,12 +68,48 @@ def test_dataproc_batch():
         },
     }
 
-    yield (BATCH_ID, BATCH_CONFIG)
+    # create a batch
     dataproc_client = dataproc.BatchControllerClient(
         client_options={
             "api_endpoint": f"{DATAPROC_REGION}-dataproc.googleapis.com:443"
         }
     )
+    request = dataproc.CreateBatchRequest(
+        parent=f"projects/{PROJECT_ID}/regions/{DATAPROC_REGION}",
+        batch=BATCH_CONFIG,
+        batch_id=BATCH_ID,
+    )
+    try:
+        # Make the request
+        operation = dataproc_client.create_batch(request=request)
+
+        print("Waiting for operation to complete...")
+
+        response = operation.result()
+    except Aborted as e:
+        # retry once if we see a flaky 409 "subnet not ready" error
+        if "/subnetworks/default" in str(e):
+            # delete the errored-out batch so we don't see an "AlreadyExists"
+            delete_request = dataproc.DeleteBatchRequest(
+                name=f"projects/{PROJECT_ID}/locations/{DATAPROC_REGION}/batches/{BATCH_ID}"
+            )
+            dataproc_client.delete_batch(request=delete_request)
+            # retry the creation operation once
+            create_request = dataproc.CreateBatchRequest(
+                parent=f"projects/{PROJECT_ID}/regions/{DATAPROC_REGION}",
+                batch=BATCH_CONFIG,
+                batch_id=BATCH_ID,
+            )
+            operation = dataproc_client.create_batch(request=create_request)
+
+            print("Waiting for operation to complete...")
+
+            response = operation.result()
+        else:
+            raise e
+
+    yield response
+
     request = dataproc.DeleteBatchRequest(
         name=f"projects/{PROJECT_ID}/locations/{DATAPROC_REGION}/batches/{BATCH_ID}"
     )
@@ -110,7 +146,7 @@ def test_bucket():
     bucket.delete(force=True)
 
 
-@pytest.fixture(autouse=True)
+@pytest.fixture(scope="module")
 def bq_dataset(test_bucket):
     # Create dataset and table for test CSV
     BQ_CLIENT.create_dataset(BQ_DATASET)
@@ -147,33 +183,9 @@ def bq_dataset(test_bucket):
         print(f"Ignoring NotFound on cleanup, details: {e}")
 
 
-# Retry if we see a flaky 409 "subnet not ready" exception
-@backoff.on_exception(backoff.expo, Aborted, max_tries=3)
 def test_process(test_dataproc_batch):
-    # check that the results table isnt there
-    with pytest.raises(NotFound):
-        BQ_CLIENT.get_table(f"{BQ_DATASET}.{BQ_WRITE_TABLE}")
-
-    # create a batch
-    dataproc_client = dataproc.BatchControllerClient(
-        client_options={
-            "api_endpoint": f"{DATAPROC_REGION}-dataproc.googleapis.com:443"
-        }
-    )
-    request = dataproc.CreateBatchRequest(
-        parent=f"projects/{PROJECT_ID}/regions/{DATAPROC_REGION}",
-        batch=test_dataproc_batch[1],
-        batch_id=test_dataproc_batch[0],
-    )
-    # Make the request
-    operation = dataproc_client.create_batch(request=request)
-
-    print("Waiting for operation to complete...")
-
-    response = operation.result()
 
-    # Handle the response
-    print(response)
+    print(test_dataproc_batch)
 
     # check that the results table is there now
-    assert BQ_CLIENT.get_table(f"{BQ_DATASET}.{BQ_WRITE_TABLE}").num_rows > 0
+    assert BQ_CLIENT.get_table(f"{BQ_DATASET}.{BQ_WRITE_TABLE}").num_rows > 0
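
For reference, the new fixture follows a create → yield → clean-up shape with a single manual retry around batch creation. The sketch below distills that pattern; it is not the module's actual code: PROJECT_ID, REGION, BATCH_ID, BATCH_CONFIG, and the helper create_batch_once are placeholder names, and the config values are assumptions.

import pytest
from google.api_core.exceptions import Aborted
from google.cloud import dataproc_v1 as dataproc

PROJECT_ID = "my-project"      # placeholder, not the module's real constant
REGION = "us-central1"         # placeholder
BATCH_ID = "example-batch-id"  # Dataproc serverless requires lowercase IDs
BATCH_CONFIG = {               # placeholder minimal config
    "pyspark_batch": {"main_python_file_uri": "gs://my-bucket/job.py"}
}


@pytest.fixture(scope="function")
def dataproc_batch():
    client = dataproc.BatchControllerClient(
        client_options={"api_endpoint": f"{REGION}-dataproc.googleapis.com:443"}
    )
    batch_name = f"projects/{PROJECT_ID}/locations/{REGION}/batches/{BATCH_ID}"

    def create_batch_once():
        # Create the batch and block until the long-running operation finishes.
        operation = client.create_batch(
            request=dataproc.CreateBatchRequest(
                parent=f"projects/{PROJECT_ID}/regions/{REGION}",
                batch=BATCH_CONFIG,
                batch_id=BATCH_ID,
            )
        )
        return operation.result()

    try:
        response = create_batch_once()
    except Aborted as e:
        # The flaky "subnet not ready" 409 surfaces as Aborted; retry exactly once.
        if "/subnetworks/default" not in str(e):
            raise
        # Delete the failed batch first so the retry doesn't hit AlreadyExists.
        client.delete_batch(request=dataproc.DeleteBatchRequest(name=batch_name))
        response = create_batch_once()

    yield response  # the dependent test runs here

    # Teardown runs after the test, whether it passed or failed.
    client.delete_batch(request=dataproc.DeleteBatchRequest(name=batch_name))

Presumably the retry moved from the removed @backoff decorator into the fixture because backoff re-ran the whole test, including the empty-table precondition and a second create with an already-used batch ID; retrying only the create call sidesteps both problems.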