Skip to content

Commit

Permalink
update databricks code to revised manuscript
Browse files Browse the repository at this point in the history
  • Loading branch information
tomlincr committed Mar 18, 2022
1 parent 7e10a38 commit cb967c3
Show file tree
Hide file tree
Showing 22 changed files with 1,823 additions and 461 deletions.
6 changes: 3 additions & 3 deletions code/00_covariates/2-2_CALIBER_skinny.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@
# MAGIC
# MAGIC **Reviewer(s)**
# MAGIC
# MAGIC **Date last updated** 2021-10-05
# MAGIC **Date last updated** 2022-01-22
# MAGIC
# MAGIC **Date last reviewed** *NA*
# MAGIC
# MAGIC **Date last run** 2021-10-05
# MAGIC **Date last run** 2022-01-22
# MAGIC
# MAGIC **Changelog**
# MAGIC * `21-05-19 ` V1 initial version - single first date of code per patient
Expand Down Expand Up @@ -67,7 +67,7 @@

# Table names
gdppr_table = "dars_nic_391419_j3w9t_collab.gdppr_dars_nic_391419_j3w9t_archive" # No non-archive equivalent
hes_apc_table = "dars_nic_391419_j3w9t_collab.hes_apc_all_years" # Don't need archive as using ProductionDate
hes_apc_table = "dars_nic_391419_j3w9t_collab.hes_apc_all_years_archive"

# without dars_nic_391419_j3w9t_collab. prefix
output_table = "ccu013_caliber_skinny"
Expand Down
2 changes: 1 addition & 1 deletion code/00_covariates/2-4_CALIBER-categories_pre2020.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
# MAGIC
# MAGIC **Date last reviewed** *UNREVIEWED !!!*
# MAGIC
# MAGIC **Date last run** 2021-10-05
# MAGIC **Date last run** 2022-01-22
# MAGIC
# MAGIC **Changelog**
# MAGIC
Expand Down
52 changes: 28 additions & 24 deletions code/01_phenotype_engineering/CCU013_01_create_table_aliases.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,11 @@
# MAGIC
# MAGIC **Reviewer(s)**
# MAGIC
# MAGIC **Date last updated** 2021-08-16
# MAGIC **Date last updated** 2022-01-22
# MAGIC
# MAGIC **Date last reviewed**
# MAGIC
# MAGIC **Date last run** 2021-08-16
# MAGIC **Date last run** 2022-01-22
# MAGIC
# MAGIC **Data input**
# MAGIC This notebook uses the archive tables made by the data wranglers - selecting the latest data by `productionDate`. The `productionDate` variable is carried forward to master_phenotype in the `ccu13_tmp_gdppr` table, and will be saved in the main output tables; trajectory, severity and events, to ensure the data for the produced phenotypes is back traceable to source, for reproducibility.
Expand All @@ -46,21 +46,28 @@

# COMMAND ----------

# MAGIC %run /Workspaces/dars_nic_391419_j3w9t_collab/CCU013/COVID-19-SEVERITY-PHENOTYPING/CCU013_00_helper_functions

# COMMAND ----------

LatestProductionDate = spark.sql("SELECT MAX(ProductionDate) FROM dars_nic_391419_j3w9t_collab.wrang002b_data_version_batchids").first()[0]
LatestAPC = spark.sql("SELECT MAX(ADMIDATE) FROM dars_nic_391419_j3w9t_collab.hes_apc_all_years").first()[0]
print(f"Most recent Production Date: {LatestProductionDate} \n Maximum date in HES APC is {LatestAPC} which represents a common cut-off across all datasets")

# COMMAND ----------

from pyspark.sql.functions import lit, to_date, col, udf, substring, regexp_replace, max
from pyspark.sql import functions as f
from datetime import datetime
from pyspark.sql.types import DateType

start_date = '2020-01-01'
end_date = '2021-09-01' # The maximal date covered by all sources.
# end_date = '2021-09-01' # The maximal date covered by all sources.
end_date = '2021-11-30'
# NB common cut-off data across all data sources is implemented in CCU013_13_paper_subset_data_to_cohort

# COMMAND ----------

# MAGIC %run /Workspaces/dars_nic_391419_j3w9t_collab/CCU013/COVID-19-SEVERITY-PHENOTYPING/CCU013_00_helper_functions

# COMMAND ----------

# MAGIC %md
# MAGIC ## 1.0 Subsetting all source tables by dates

Expand All @@ -78,7 +85,8 @@

# COMMAND ----------

production_date = "2021-08-18 14:47:00.887883"
# production_date = "2021-08-18 14:47:00.887883"
production_date = "2022-01-20 14:58:52.353312"

# COMMAND ----------

Expand All @@ -105,12 +113,7 @@
# COMMAND ----------

# MAGIC %sql
# MAGIC SELECT min(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_sgss

# COMMAND ----------

# MAGIC %sql
# MAGIC SELECT max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_sgss
# MAGIC SELECT min(date), max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_sgss

# COMMAND ----------

Expand Down Expand Up @@ -139,7 +142,7 @@
# COMMAND ----------

# MAGIC %sql
# MAGIC SELECT max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_gdppr
# MAGIC SELECT min(date), max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_gdppr

# COMMAND ----------

Expand Down Expand Up @@ -169,7 +172,7 @@
# COMMAND ----------

# MAGIC %sql
# MAGIC SELECT max(death_date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_deaths
# MAGIC SELECT min(date), max(death_date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_deaths

# COMMAND ----------

Expand Down Expand Up @@ -200,7 +203,7 @@
# COMMAND ----------

# MAGIC %sql
# MAGIC SELECT max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_apc
# MAGIC SELECT min(date), max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_apc

# COMMAND ----------

Expand Down Expand Up @@ -232,7 +235,7 @@
# COMMAND ----------

# MAGIC %sql
# MAGIC SELECT max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_cc
# MAGIC SELECT min(date), max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_cc

# COMMAND ----------

Expand Down Expand Up @@ -289,14 +292,15 @@

# MAGIC %md
# MAGIC ### 1.7 CHESS
# MAGIC * Previously we weren't using the `_archive` table as it wasn't updated/didn't exist

# COMMAND ----------

chess = spark.sql('''SELECT PERSON_ID_DEID as person_id_deid, Typeofspecimen, Covid19, AdmittedToICU, Highflownasaloxygen, NoninvasiveMechanicalventilation, Invasivemechanicalventilation,
RespiratorySupportECMO, DateAdmittedICU, HospitalAdmissionDate, InfectionSwabDate as date, 'InfectionSwabDate' as date_is
FROM dars_nic_391419_j3w9t.chess_dars_nic_391419_j3w9t''')
#chess = spark.sql(f'''SELECT PERSON_ID_DEID as person_id_deid, Typeofspecimen, Covid19, AdmittedToICU, Highflownasaloxygen, NoninvasiveMechanicalventilation, Invasivemechanicalventilation,
# RespiratorySupportECMO, DateAdmittedICU, HospitalAdmissionDate, InfectionSwabDate as date, 'InfectionSwabDate' as date_is FROM #dars_nic_391419_j3w9t_collab.chess_dars_nic_391419_j3w9t_archive WHERE ProductionDate == "{production_date}"''')
# chess = spark.sql('''SELECT PERSON_ID_DEID as person_id_deid, Typeofspecimen, Covid19, AdmittedToICU, Highflownasaloxygen, NoninvasiveMechanicalventilation, Invasivemechanicalventilation,
# RespiratorySupportECMO, DateAdmittedICU, HospitalAdmissionDate, InfectionSwabDate as date, 'InfectionSwabDate' as date_is
# FROM dars_nic_391419_j3w9t.chess_dars_nic_391419_j3w9t''')
chess = spark.sql(f'''SELECT PERSON_ID_DEID as person_id_deid, Typeofspecimen, Covid19, AdmittedToICU, Highflownasaloxygen, NoninvasiveMechanicalventilation, Invasivemechanicalventilation,
RespiratorySupportECMO, DateAdmittedICU, HospitalAdmissionDate, InfectionSwabDate as date, 'InfectionSwabDate' as date_is FROM dars_nic_391419_j3w9t_collab.chess_dars_nic_391419_j3w9t_archive WHERE ProductionDate == "{production_date}"''')
chess = chess.filter(chess['Covid19'] == 'Yes')
chess = chess.filter(chess['person_id_deid'].isNotNull())
#chess = chess.filter((chess['date'] >= start_date) & (chess['date'] <= end_date))
Expand All @@ -311,7 +315,7 @@
# COMMAND ----------

# MAGIC %sql
# MAGIC SELECT max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_chess
# MAGIC SELECT min(date), max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_chess

# COMMAND ----------

Expand Down
Loading

0 comments on commit cb967c3

Please sign in to comment.