@@ -53,11 +53,12 @@

import airflow
from airflow import settings
-from airflow.configuration import conf
from airflow.jobs import BaseJob
from airflow.models import DAG, DagModel, DagRun, Log, SlaMiss, \
    TaskInstance, Variable, XCom
from airflow.operators.python_operator import PythonOperator
+from airflow.version import version as airflow_version
+
import dateutil.parser
from sqlalchemy import and_, func
from sqlalchemy.exc import ProgrammingError
@@ -66,6 +67,7 @@
try:
    # airflow.utils.timezone is available from v1.10 onwards
    from airflow.utils import timezone
+
    now = timezone.utcnow
except ImportError:
    now = datetime.utcnow
@@ -79,9 +81,11 @@
DAG_OWNER_NAME = "operations"
# List of email address to send email alerts to if this job fails
ALERT_EMAIL_ADDRESSES = []
+# Airflow version used by the environment in list form, value stored in
+# airflow_version is in format e.g "1.10.15+composer"
+AIRFLOW_VERSION = airflow_version[:-len("+composer")].split(".")
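+# e.g. "1.10.15+composer" parses to ["1", "10", "15"]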
# Length to retain the log files if not already provided in the conf. If this
# is set to 30, the job will remove those files that are 30 days old or older.
-
DEFAULT_MAX_DB_ENTRY_AGE_IN_DAYS = int(
    Variable.get("airflow_db_cleanup__max_db_entry_age_in_days", 30))
# Prints the database entries which will be getting deleted; set to False
@@ -139,6 +143,7 @@
# Check for TaskReschedule model
try:
    from airflow.models import TaskReschedule
+
    DATABASE_OBJECTS.append({
        "airflow_db_model": TaskReschedule,
        "age_check_column": TaskReschedule.execution_date,
@@ -153,6 +158,7 @@
# Check for TaskFail model
try:
    from airflow.models import TaskFail
+
    DATABASE_OBJECTS.append({
        "airflow_db_model": TaskFail,
        "age_check_column": TaskFail.execution_date,
@@ -164,23 +170,10 @@
except Exception as e:
    logging.error(e)

-# Check for RenderedTaskInstanceFields model
-try:
-    from airflow.models import RenderedTaskInstanceFields
-    DATABASE_OBJECTS.append({
-        "airflow_db_model": RenderedTaskInstanceFields,
-        "age_check_column": RenderedTaskInstanceFields.execution_date,
-        "keep_last": False,
-        "keep_last_filters": None,
-        "keep_last_group_by": None
-    })
-
-except Exception as e:
-    logging.error(e)
-
# Check for ImportError model
try:
    from airflow.models import ImportError
+
    DATABASE_OBJECTS.append({
        "airflow_db_model": ImportError,
        "age_check_column": ImportError.timestamp,
@@ -193,34 +186,6 @@
except Exception as e:
    logging.error(e)

-# Check for celery executor
-airflow_executor = str(conf.get("core", "executor"))
-logging.info("Airflow Executor: " + str(airflow_executor))
-if (airflow_executor == "CeleryExecutor"):
-    logging.info("Including Celery Modules")
-    try:
-        from celery.backends.database.models import Task, TaskSet
-        DATABASE_OBJECTS.extend(({
-            "airflow_db_model": Task,
-            "age_check_column": Task.date_done,
-            "keep_last": False,
-            "keep_last_filters": None,
-            "keep_last_group_by": None,
-            "do_not_delete_by_dag_id": True
-        }, {
-            "airflow_db_model": TaskSet,
-            "age_check_column": TaskSet.date_done,
-            "keep_last": False,
-            "keep_last_filters": None,
-            "keep_last_group_by": None,
-            "do_not_delete_by_dag_id": True
-        }))
-
-    except Exception as e:
-        logging.error(e)
-
-session = settings.Session()
-
default_args = {
    "owner": DAG_OWNER_NAME,
    "depends_on_past": False,
@@ -252,7 +217,7 @@ def print_configuration_function(**context):
        max_db_entry_age_in_days = dag_run_conf.get(
            "maxDBEntryAgeInDays", None)
    logging.info("maxDBEntryAgeInDays from dag_run.conf: " + str(dag_run_conf))
-    if (max_db_entry_age_in_days is None or max_db_entry_age_in_days < 1):
+    if max_db_entry_age_in_days is None or max_db_entry_age_in_days < 1:
        logging.info(
            "maxDBEntryAgeInDays conf variable isn't included or Variable " +
            "value is less than 1. Using Default '" +
@@ -266,7 +231,6 @@ def print_configuration_function(**context):
    logging.info("max_db_entry_age_in_days: " + str(max_db_entry_age_in_days))
    logging.info("max_date: " + str(max_date))
    logging.info("enable_delete: " + str(ENABLE_DELETE))
-    logging.info("session: " + str(session))
    logging.info("")

    logging.info("Setting max_execution_date to XCom for Downstream Processes")
@@ -280,7 +244,57 @@ def print_configuration_function(**context):
    dag=dag)


+def build_query(session, airflow_db_model, age_check_column, max_date,
+                keep_last, keep_last_filters=None, keep_last_group_by=None):
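+    # returns a query for rows whose age_check_column is older than max_date;
+    # with keep_last set, rows matching the newest DagRun execution_date are kept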
+    query = session.query(airflow_db_model).options(
+        load_only(age_check_column))
+
+    logging.info("INITIAL QUERY : " + str(query))
+
+    if not keep_last:
+        query = query.filter(age_check_column <= max_date,)
+    else:
+        subquery = session.query(func.max(DagRun.execution_date))
+        # workaround for MySQL "table specified twice" issue
+        # https://github.com/teamclairvoyant/airflow-maintenance-dags/issues/41
+        if keep_last_filters is not None:
+            for entry in keep_last_filters:
+                subquery = subquery.filter(entry)
+
+            logging.info("SUB QUERY [keep_last_filters]: " + str(subquery))
+
+        if keep_last_group_by is not None:
+            subquery = subquery.group_by(keep_last_group_by)
+            logging.info(
+                "SUB QUERY [keep_last_group_by]: " +
+                str(subquery))
+
+        subquery = subquery.from_self()
+
+        query = query.filter(
+            and_(age_check_column.notin_(subquery)),
+            and_(age_check_column <= max_date))
+
+    return query
+
+
+def print_query(query, airflow_db_model, age_check_column):
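+    # logs every row the supplied delete query currently matches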
+    entries_to_delete = query.all()
+
+    logging.info("Query: " + str(query))
+    logging.info("Process will be Deleting the following " +
+                 str(airflow_db_model.__name__) + "(s):")
+    for entry in entries_to_delete:
+        date = str(entry.__dict__[str(age_check_column).split(".")[1]])
+        logging.info("\tEntry: " + str(entry) + ", Date: " + date)
+
+    logging.info("Process will be Deleting "
+                 + str(len(entries_to_delete)) + " "
+                 + str(airflow_db_model.__name__) + "(s)")
+
+
def cleanup_function(**context):
+    session = settings.Session()
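+    # this session is closed in the finally block at the end of the function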

    logging.info("Retrieving max_execution_date from XCom")
    max_date = context["ti"].xcom_pull(
@@ -310,67 +324,34 @@ def cleanup_function(**context):
    logging.info("Running Cleanup Process...")

    try:
-        query = session.query(airflow_db_model).options(
-            load_only(age_check_column))
-
-        logging.info("INITIAL QUERY : " + str(query))
-
-        if keep_last:
-
-            subquery = session.query(func.max(DagRun.execution_date))
-            # workaround for MySQL "table specified twice" issue
-            # https://github.com/teamclairvoyant/airflow-maintenance-dags/issues/41
-            if keep_last_filters is not None:
-                for entry in keep_last_filters:
-                    subquery = subquery.filter(entry)
-
-                logging.info("SUB QUERY [keep_last_filters]: " + str(subquery))
-
-            if keep_last_group_by is not None:
-                subquery = subquery.group_by(keep_last_group_by)
-                logging.info(
-                    "SUB QUERY [keep_last_group_by]: " +
-                    str(subquery))
-
-            subquery = subquery.from_self()
-
-            query = query.filter(
-                and_(age_check_column.notin_(subquery)),
-                and_(age_check_column <= max_date))
-
-        else:
-            query = query.filter(age_check_column <= max_date,)
-
-        if PRINT_DELETES:
-            entries_to_delete = query.all()
-
-            logging.info("Query: " + str(query))
-            logging.info("Process will be Deleting the following " +
-                         str(airflow_db_model.__name__) + "(s):")
-            for entry in entries_to_delete:
-                date = str(entry.__dict__[str(age_check_column).split(".")[1]])
-                logging.info("\tEntry: " + str(entry) + ", Date: " + date)
-
-            logging.info("Process will be Deleting "
-                         + str(len(entries_to_delete)) + " "
-                         + str(airflow_db_model.__name__) + "(s)")
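+        # "do_not_delete_by_dag_id" deletes every matching row in one query;
+        # otherwise the else branch below deletes rows one dag_id at a time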
+        if context["params"].get("do_not_delete_by_dag_id"):
+            query = build_query(session, airflow_db_model, age_check_column,
+                                max_date, keep_last, keep_last_filters,
+                                keep_last_group_by)
+            if PRINT_DELETES:
+                print_query(query, airflow_db_model, age_check_column)
+            if ENABLE_DELETE:
+                logging.info("Performing Delete...")
+                query.delete(synchronize_session=False)
+            session.commit()
        else:
-            logging.warn(
-                "You've opted to skip printing the db entries to be deleted. "
-                "Set PRINT_DELETES to True to show entries!!!")
-
-        if ENABLE_DELETE:
-            logging.info("Performing Delete...")
-            if context["params"].get("do_not_delete_by_dag_id"):
-                query.filter(age_check_column <= max_date).delete(synchronize_session=False)
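+            # process each dag_id separately, committing after every DAG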
+            dags = session.query(airflow_db_model.dag_id).distinct()
+            session.commit()
+
+            list_dags = [str(list(dag)[0]) for dag in dags]
+            for dag in list_dags:
+                query = build_query(session, airflow_db_model, age_check_column,
+                                    max_date, keep_last, keep_last_filters,
+                                    keep_last_group_by)
+                query = query.filter(airflow_db_model.dag_id == dag)
+                if PRINT_DELETES:
+                    print_query(query, airflow_db_model, age_check_column)
+                if ENABLE_DELETE:
+                    logging.info("Performing Delete...")
+                    query.delete(synchronize_session=False)
                session.commit()
-            else:
-                dags = session.query(airflow_db_model.dag_id).distinct()
-                list_dags = [str(list(dag)[0]) for dag in dags]
-                for dag in list_dags:
-                    query.filter(age_check_column <= max_date).filter(airflow_db_model.dag_id == dag).delete(synchronize_session=False)
-                    session.commit()
-        else:
+
+        if not ENABLE_DELETE:
            logging.warn("You've opted to skip deleting the db entries. "
                         "Set ENABLE_DELETE to True to delete entries!!!")

@@ -379,12 +360,13 @@ def cleanup_function(**context):
    except ProgrammingError as e:
        logging.error(e)
        logging.error(
-            str(airflow_db_model) + " is not present in the metadata."
-            "Skipping...")
+            str(airflow_db_model) + " is not present in the metadata. "
+            "Skipping...")
+    finally:
+        session.close()


for db_object in DATABASE_OBJECTS:
-
    cleanup_op = PythonOperator(
        task_id="cleanup_" + str(db_object["airflow_db_model"].__name__),
        python_callable=cleanup_function,