|
"""Export accepted NERC grant applications from Databank to a debug CSV.

The script reads a SQLAlchemy connection string from a local text file, runs
one query that renames Databank columns to the names DataMAD uses, pads the
result with the legacy Siebel columns that Databank does not supply, reorders
the columns to the Siebel layout and writes the result to
``./import_csvs/datamad_databank_debug.csv``.
"""
import pandas as pd
import sqlalchemy as sa
import numpy as np

# TODO: try to get both parent and child grants in the SQL query;
# for now only parent grants are present (i.e. lead PI grants).

# Load the connection string for SQLAlchemy.
# Each line is stripped so the trailing newline (or stray whitespace) never
# reaches create_engine(), and blank lines are skipped so the first entry is
# always a usable string.
with open('./local_temp/sql_alchemy_mysql_conn_string.txt', 'r') as fn:
    conn_str = [line.strip() for line in fn if line.strip()]

if not conn_str:
    # Fail with an explicit message instead of an opaque IndexError below.
    raise ValueError(
        "No connection string found in "
        "'./local_temp/sql_alchemy_mysql_conn_string.txt'"
    )

# Set up SQLAlchemy to use mysqlclient as the MySQL driver
# (SQLAlchemy is needed because pandas now only accepts a SQLAlchemy
# engine + SQL query as input, not pyodbc or similar).
engine = sa.create_engine(conn_str[0])  # mysqlclient connection for Docker

# engine = sa.create_engine('mysql+pyodbc://DataBank?charset=utf8mb4')  # pyodbc connection (used for Windows testing)

# SQL query to pull the information needed by DataMAD; the AS aliases rename
# the Databank columns to the DataMAD field names.
# NOTE(review): the query is capped with LIMIT 200 and groups by ApplicationID
# while selecting non-aggregated columns - presumably acceptable for this
# debug export, confirm before using it for a full import.
sql_datamad_renamed = "SELECT \
    fact_application.ApplicationID AS GRANTREFERENCE, \
    fact_application.ApplicationTitle AS PROJECT_TITLE, \
    dim_scheme.SchemeName AS SCHEME, \
    dim_opportunity.OpportunityName AS 'CALL', \
    dim_scheme.SchemeType AS GRANT_TYPE, \
    dim_person.FullName AS GRANT_HOLDER, \
    dim_person.Email AS EMAIL, \
    dim_organisation.OrganisationName AS RESEARCH_ORG, \
    dim_department.DepartmentName AS DEPARTMENT, \
    dim_application_date.ActualStartDate AS ACTUAL_START_DATE, \
    dim_application_date.ActualEndDate AS ACTUAL_END_DATE, \
    fact_application.AdministratingCouncil AS NEW_ADMINISTRATING_COUNCIL, \
    dim_application_date.ProposedStartDate AS PROPOSED_ST_DT, \
    dim_application_date.ProposedEndDate AS PROPOSED_END_DT, \
    fact_application.ApplicationStatus AS GRANT_STATUS, \
    dim_organisation.AddressLine1 AS ADDRESS1, \
    dim_organisation.TownOrCity AS CITY, \
    dim_organisation.PostCode AS POSTCODE, \
    fact_application.AwardedAmount AS 'AMOUNT', \
    dim_application_ext.RoutingClassification AS ROUTING_CLASSIFICATION, \
    dim_classification_area.SubjectArea AS SCIENCE_AREA, \
    dim_organisation.region AS GEOGRAPHIC_AREA, \
    dim_classification_area.ResearchTopic AS SECONDARY_CLASSIFICATION, \
    dim_application_ext.ApplicationSummary AS ABSTRACT \
    FROM fact_application \
    LEFT OUTER JOIN dim_scheme \
    ON fact_application.SchemeSKey = dim_scheme.SchemeSKey \
    LEFT OUTER JOIN dim_opportunity \
    ON fact_application.OpportunitySKey = dim_opportunity.OpportunitySKey \
    LEFT OUTER JOIN dim_person \
    ON fact_application.ApplicantPersonSKey = dim_person.PersonSKey \
    LEFT OUTER JOIN dim_department \
    ON fact_application.OrganisationDepartmentSKey = dim_department.OrganisationDepartmentSKey \
    LEFT OUTER JOIN dim_application_date \
    ON fact_application.ApplicationSKey = dim_application_date.ApplicationSKey \
    LEFT OUTER JOIN dim_organisation\
    ON fact_application.LeadOrganisationSKey = dim_organisation.OrganisationSKey \
    LEFT OUTER JOIN dim_application_ext\
    ON fact_application.ApplicationSKey = dim_application_ext.ApplicationSKey \
    LEFT OUTER JOIN dim_classification_area \
    ON fact_application.PrimaryClassificationAreaSKey = dim_classification_area.ClassificationAreaSKey \
    WHERE fact_application.AdministratingCouncil = 'NERC' AND fact_application.ApplicationStatus = 'ACCEPTED' \
    GROUP BY fact_application.ApplicationID \
    LIMIT 200"

# TODO: may need to add this join back in (trying to join on
# fact_application_team.LeadApplicantPersonSKey instead):
#
#     LEFT OUTER JOIN dim_person
#     ON fact_application.ApplicantPersonSKey = dim_person.PersonSKey

# Query data from Databank. The connection is opened in a `with` block so it
# is closed as soon as the result has been loaded into the DataFrame.
with engine.connect() as saconn:
    data_renamed = pd.read_sql(sql_datamad_renamed, saconn)

# Add blank columns to cover fields missing in Databank that were in Siebel.
# TODO: delete some of these once app_datamad is updated to remove the fields
# from the models.
# Definitely not needed: WORK_NUMBER, NCAS, NCEO, ADDRESS2, OVERALL_SCORE,
# PROPOSED_ST_DT_ORG and PROPOSED_END_DT_ORG.
# Try to find (or extract from another field): FACILITY, LEAD_GRANT,
# PARENT_GRANT and OBJECTIVES.
no_longer_needed_cols = ['WORK_NUMBER', 'NCAS', 'NCEO', 'ADDRESS2', 'OVERALL_SCORE',
                         'PROPOSED_ST_DT_ORG', 'PROPOSED_END_DT_ORG']
needed_cols = ['FACILITY', 'LEAD_GRANT', 'PARENT_GRANT', 'OBJECTIVES']

# One reindex appends both groups of placeholder columns; every new column is
# filled with NaN because it has no data in the query result.
data_renamed = data_renamed.reindex(
    columns=[*data_renamed.columns.tolist(), *no_longer_needed_cols, *needed_cols],
    fill_value=np.nan,
)

# Reorder columns to the Siebel order (easier to read for the user, not needed
# for import via Django). Columns not listed here, such as
# NEW_ADMINISTRATING_COUNCIL, are dropped by this selection.
col_order = ['GRANTREFERENCE', 'PROJECT_TITLE', 'SCHEME', 'CALL', 'GRANT_TYPE',
             'GRANT_HOLDER', 'WORK_NUMBER', 'EMAIL', 'RESEARCH_ORG',
             'DEPARTMENT', 'ACTUAL_START_DATE', 'ACTUAL_END_DATE',
             'NCAS', 'NCEO', 'PROPOSED_ST_DT', 'PROPOSED_END_DT',
             'GRANT_STATUS', 'ADDRESS1', 'ADDRESS2', 'CITY',
             'POSTCODE', 'LEAD_GRANT', 'PARENT_GRANT', 'AMOUNT',
             'ROUTING_CLASSIFICATION', 'SCIENCE_AREA',
             'GEOGRAPHIC_AREA', 'SECONDARY_CLASSIFICATION',
             'ABSTRACT', 'OBJECTIVES', 'FACILITY', 'OVERALL_SCORE',
             'PROPOSED_ST_DT_ORG', 'PROPOSED_END_DT_ORG']

data_renamed = data_renamed[col_order]

# Save the .csv file.
# NOTE(review): the DataFrame index is written as an unnamed first column;
# confirm whether the consumer of this file expects that.
data_renamed.to_csv("./import_csvs/datamad_databank_debug.csv")

# Presumably a no-op anchor for a debugger breakpoint; not used anywhere else.
pause = 1
0 commit comments