diff --git a/code/00_covariates/2-2_CALIBER_skinny.py b/code/00_covariates/2-2_CALIBER_skinny.py index 42f1aae..6d35973 100644 --- a/code/00_covariates/2-2_CALIBER_skinny.py +++ b/code/00_covariates/2-2_CALIBER_skinny.py @@ -22,11 +22,11 @@ # MAGIC # MAGIC **Reviewer(s)** # MAGIC -# MAGIC **Date last updated** 2021-10-05 +# MAGIC **Date last updated** 2022-01-22 # MAGIC # MAGIC **Date last reviewed** *NA* # MAGIC -# MAGIC **Date last run** 2021-10-05 +# MAGIC **Date last run** 2022-01-22 # MAGIC # MAGIC **Changelog** # MAGIC * `21-05-19 ` V1 initial eversion - single first date of code per patient @@ -67,7 +67,7 @@ # Table names gdppr_table = "dars_nic_391419_j3w9t_collab.gdppr_dars_nic_391419_j3w9t_archive" # No non-archive equivalent -hes_apc_table = "dars_nic_391419_j3w9t_collab.hes_apc_all_years" # Don't need archive as using ProductionDate +hes_apc_table = "dars_nic_391419_j3w9t_collab.hes_apc_all_years_archive" # without dars_nic_391419_j3w9t_collab. prefix output_table = "ccu013_caliber_skinny" diff --git a/code/00_covariates/2-4_CALIBER-categories_pre2020.py b/code/00_covariates/2-4_CALIBER-categories_pre2020.py index 914785d..ead40a3 100644 --- a/code/00_covariates/2-4_CALIBER-categories_pre2020.py +++ b/code/00_covariates/2-4_CALIBER-categories_pre2020.py @@ -42,7 +42,7 @@ # MAGIC # MAGIC **Date last reviewed** *UNREVIEWED !!!* # MAGIC -# MAGIC **Date last run** 2021-10-05 +# MAGIC **Date last run** 2022-01-22 # MAGIC # MAGIC **Changelog** # MAGIC diff --git a/code/01_phenotype_engineering/CCU013_01_create_table_aliases.py b/code/01_phenotype_engineering/CCU013_01_create_table_aliases.py index ab0a304..23acd95 100644 --- a/code/01_phenotype_engineering/CCU013_01_create_table_aliases.py +++ b/code/01_phenotype_engineering/CCU013_01_create_table_aliases.py @@ -16,11 +16,11 @@ # MAGIC # MAGIC **Reviewer(s)** # MAGIC -# MAGIC **Date last updated** 2021-08-16 +# MAGIC **Date last updated** 2022-01-22 # MAGIC # MAGIC **Date last reviewed** # MAGIC -# MAGIC 
**Date last run** 2021-08-16 +# MAGIC **Date last run** 2022-01-22 # MAGIC # MAGIC **Data input** # MAGIC This notebook uses the archive tables made by the data wranglers - selecting the latest data by `productionDate`. The `productionDate` variable is carried forward to master_phenotype in the `ccu013_tmp_gdppr` table, and will be saved in the main output tables; trajectory, severity and events, to ensure the data for the produced phenotypes is traceable back to source, for reproducibility. @@ -46,21 +46,28 @@ # COMMAND ---------- +# MAGIC %run /Workspaces/dars_nic_391419_j3w9t_collab/CCU013/COVID-19-SEVERITY-PHENOTYPING/CCU013_00_helper_functions + +# COMMAND ---------- + +LatestProductionDate = spark.sql("SELECT MAX(ProductionDate) FROM dars_nic_391419_j3w9t_collab.wrang002b_data_version_batchids").first()[0] +LatestAPC = spark.sql("SELECT MAX(ADMIDATE) FROM dars_nic_391419_j3w9t_collab.hes_apc_all_years").first()[0] +print(f"Most recent Production Date: {LatestProductionDate} \n Maximum date in HES APC is {LatestAPC} which represents a common cut-off across all datasets") + +# COMMAND ---------- + from pyspark.sql.functions import lit, to_date, col, udf, substring, regexp_replace, max from pyspark.sql import functions as f from datetime import datetime from pyspark.sql.types import DateType start_date = '2020-01-01' -end_date = '2021-09-01' # The maximal date covered by all sources. +# end_date = '2021-09-01' # The maximal date covered by all sources. 
+end_date = '2021-11-30' # NB common cut-off date across all data sources is implemented in CCU013_13_paper_subset_data_to_cohort # COMMAND ---------- -# MAGIC %run /Workspaces/dars_nic_391419_j3w9t_collab/CCU013/COVID-19-SEVERITY-PHENOTYPING/CCU013_00_helper_functions - -# COMMAND ---------- - # MAGIC %md # MAGIC ## 1.0 Subseting all source tables by dates @@ -78,7 +85,8 @@ # COMMAND ---------- -production_date = "2021-08-18 14:47:00.887883" +# production_date = "2021-08-18 14:47:00.887883" +production_date = "2022-01-20 14:58:52.353312" # COMMAND ---------- @@ -105,12 +113,7 @@ # COMMAND ---------- # MAGIC %sql -# MAGIC SELECT min(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_sgss - -# COMMAND ---------- - -# MAGIC %sql -# MAGIC SELECT max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_sgss +# MAGIC SELECT min(date), max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_sgss # COMMAND ---------- @@ -139,7 +142,7 @@ # COMMAND ---------- # MAGIC %sql -# MAGIC SELECT max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_gdppr +# MAGIC SELECT min(date), max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_gdppr # COMMAND ---------- @@ -169,7 +172,7 @@ # COMMAND ---------- # MAGIC %sql -# MAGIC SELECT max(death_date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_deaths +# MAGIC SELECT min(death_date), max(death_date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_deaths # COMMAND ---------- @@ -200,7 +203,7 @@ # COMMAND ---------- # MAGIC %sql -# MAGIC SELECT max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_apc +# MAGIC SELECT min(date), max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_apc # COMMAND ---------- @@ -232,7 +235,7 @@ # COMMAND ---------- # MAGIC %sql -# MAGIC SELECT max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_cc +# MAGIC SELECT min(date), max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_cc # COMMAND ---------- @@ -289,14 +292,15 @@ # MAGIC %md # MAGIC ### 1.7 CHESS +# MAGIC * Previously we weren't using the `_archive` 
table as it wasn't updated/didn't exist # COMMAND ---------- -chess = spark.sql('''SELECT PERSON_ID_DEID as person_id_deid, Typeofspecimen, Covid19, AdmittedToICU, Highflownasaloxygen, NoninvasiveMechanicalventilation, Invasivemechanicalventilation, - RespiratorySupportECMO, DateAdmittedICU, HospitalAdmissionDate, InfectionSwabDate as date, 'InfectionSwabDate' as date_is - FROM dars_nic_391419_j3w9t.chess_dars_nic_391419_j3w9t''') -#chess = spark.sql(f'''SELECT PERSON_ID_DEID as person_id_deid, Typeofspecimen, Covid19, AdmittedToICU, Highflownasaloxygen, NoninvasiveMechanicalventilation, Invasivemechanicalventilation, -# RespiratorySupportECMO, DateAdmittedICU, HospitalAdmissionDate, InfectionSwabDate as date, 'InfectionSwabDate' as date_is FROM #dars_nic_391419_j3w9t_collab.chess_dars_nic_391419_j3w9t_archive WHERE ProductionDate == "{production_date}"''') +# chess = spark.sql('''SELECT PERSON_ID_DEID as person_id_deid, Typeofspecimen, Covid19, AdmittedToICU, Highflownasaloxygen, NoninvasiveMechanicalventilation, Invasivemechanicalventilation, +# RespiratorySupportECMO, DateAdmittedICU, HospitalAdmissionDate, InfectionSwabDate as date, 'InfectionSwabDate' as date_is +# FROM dars_nic_391419_j3w9t.chess_dars_nic_391419_j3w9t''') +chess = spark.sql(f'''SELECT PERSON_ID_DEID as person_id_deid, Typeofspecimen, Covid19, AdmittedToICU, Highflownasaloxygen, NoninvasiveMechanicalventilation, Invasivemechanicalventilation, + RespiratorySupportECMO, DateAdmittedICU, HospitalAdmissionDate, InfectionSwabDate as date, 'InfectionSwabDate' as date_is FROM dars_nic_391419_j3w9t_collab.chess_dars_nic_391419_j3w9t_archive WHERE ProductionDate == "{production_date}"''') chess = chess.filter(chess['Covid19'] == 'Yes') chess = chess.filter(chess['person_id_deid'].isNotNull()) #chess = chess.filter((chess['date'] >= start_date) & (chess['date'] <= end_date)) @@ -311,7 +315,7 @@ # COMMAND ---------- # MAGIC %sql -# MAGIC SELECT max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_chess 
+# MAGIC SELECT min(date), max(date) FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_chess # COMMAND ---------- diff --git a/code/01_phenotype_engineering/CCU013_02_master_phenotypes.py b/code/01_phenotype_engineering/CCU013_02_master_phenotypes.py index 295bdf4..c2689cf 100644 --- a/code/01_phenotype_engineering/CCU013_02_master_phenotypes.py +++ b/code/01_phenotype_engineering/CCU013_02_master_phenotypes.py @@ -135,7 +135,9 @@ # MAGIC SELECT person_id_deid, date, # MAGIC "01_Covid_positive_test" as covid_phenotype, # MAGIC "" as clinical_code, -# MAGIC CASE WHEN REPORTING_LAB_ID = '840' THEN "pillar_2" ELSE "pillar_1" END as description, +# MAGIC -- TODO: wranglers please clarify whether LAB ID 840 is still the best means of identifying pillar 1 vs 2 +# MAGIC -- CASE WHEN REPORTING_LAB_ID = '840' THEN "pillar_2" ELSE "pillar_1" END as description, +# MAGIC "" as description, # MAGIC "confirmed" as covid_status, # MAGIC "" as code, # MAGIC "SGSS" as source, date_is @@ -158,7 +160,7 @@ # MAGIC SELECT person_id_deid, date, # MAGIC "01_GP_covid_diagnosis" as covid_phenotype, # MAGIC clinical_code, description, -# MAGIC "confirmed" as covid_status, --- NEED To inspect and identify which are only suspected! 
+# MAGIC "" as covid_status, --- See SNOMED code description # MAGIC "SNOMED" as code, # MAGIC "GDPPR" as source, date_is # MAGIC from cte_gdppr @@ -221,7 +223,7 @@ # MAGIC "02_Covid_admission" as covid_phenotype, # MAGIC "" as clinical_code, # MAGIC "HospitalAdmissionDate IS NOT null" as description, -# MAGIC "confirmed" as covid_status, +# MAGIC "" as covid_status, # MAGIC "CHESS" as source, # MAGIC "" as code, # MAGIC "HospitalAdmissionDate" as date_is @@ -265,7 +267,7 @@ # MAGIC "03_ICU_admission" as covid_phenotype, # MAGIC "" as clinical_code, # MAGIC "DateAdmittedICU IS NOT null" as description, -# MAGIC "confirmed" as covid_status, +# MAGIC "" as covid_status, # MAGIC "CHESS" as source, # MAGIC "" as code, # MAGIC "DateAdmittedICU" as date_is @@ -282,7 +284,7 @@ # MAGIC '03_ICU_admission' as covid_phenotype, # MAGIC "" as clinical_code, # MAGIC "id is in hes_cc table" as description, -# MAGIC "confirmed" as covid_status, +# MAGIC "" as covid_status, # MAGIC "" as code, # MAGIC 'HES CC' as source, cc.date_is, BRESSUPDAYS, ARESSUPDAYS # MAGIC FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_apc as apc @@ -305,7 +307,7 @@ # MAGIC "03_NIV_treatment" as covid_phenotype, # MAGIC "" as clinical_code, # MAGIC "Highflownasaloxygen OR NoninvasiveMechanicalventilation == Yes" as description, -# MAGIC "confirmed" as covid_status, +# MAGIC "" as covid_status, # MAGIC "CHESS" as source, # MAGIC "" as code, # MAGIC "HospitalAdmissionDate" as date_is -- Can't be any more precise @@ -324,7 +326,7 @@ # MAGIC '03_NIV_treatment' as covid_phenotype, # MAGIC "" as clinical_code, # MAGIC "bressupdays > 0" as description, -# MAGIC "confirmed" as covid_status, +# MAGIC "" as covid_status, # MAGIC "" as code, # MAGIC 'HES CC' as source, date_is, BRESSUPDAYS, ARESSUPDAYS # MAGIC FROM global_temp.ccu013_cc_covid @@ -341,7 +343,7 @@ # MAGIC when OPERTN_4_CONCAT LIKE "%E856%" THEN 'E85.6' Else '0' End) as clinical_code, # MAGIC (case when OPERTN_4_CONCAT LIKE "%E852%" THEN 'Non-invasive 
ventilation NEC' # MAGIC when OPERTN_4_CONCAT LIKE "%E856%" THEN 'Continuous positive airway pressure' Else '0' End) as description, -# MAGIC "confirmed" as covid_status, +# MAGIC "" as covid_status, # MAGIC "HES APC" as source, # MAGIC "OPCS" as code, date_is, SUSRECID # MAGIC FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_apc @@ -361,7 +363,7 @@ # MAGIC when PROCEDURE_CONCAT LIKE "%E856%" OR PROCEDURE_CONCAT LIKE "%E85.6%" THEN 'E85.6' Else '0' End) as clinical_code, # MAGIC (case when PROCEDURE_CONCAT LIKE "%E852%" OR PROCEDURE_CONCAT LIKE "%E85.2%" THEN 'Non-invasive ventilation NEC' # MAGIC when PROCEDURE_CONCAT LIKE "%E856%" OR PROCEDURE_CONCAT LIKE "%E85.6%" THEN 'Continuous positive airway pressure' Else '0' End) as description, -# MAGIC "confirmed" as covid_status, +# MAGIC "" as covid_status, # MAGIC "SUS" as source, # MAGIC "OPCS" as code, "PRIMARY_PROCEDURE_DATE" as date_is # MAGIC FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_sus @@ -386,7 +388,7 @@ # MAGIC '03_IMV_treatment' as covid_phenotype, # MAGIC "" as clinical_code, # MAGIC "ARESSUPDAYS > 0" as description, -# MAGIC "confirmed" as covid_status, +# MAGIC "" as covid_status, # MAGIC "" as code, # MAGIC 'HES CC' as source, date_is, BRESSUPDAYS, ARESSUPDAYS # MAGIC FROM global_temp.ccu013_cc_covid @@ -401,7 +403,7 @@ # MAGIC "03_IMV_treatment" as covid_phenotype, # MAGIC "" as clinical_code, # MAGIC "Invasivemechanicalventilation == Yes" as description, -# MAGIC "confirmed" as covid_status, +# MAGIC "" as covid_status, # MAGIC "CHESS" as source, # MAGIC "" as code, # MAGIC "DateAdmittedICU" as date_is -- Using ICU date as probably most of the IMV happened there, but may lose some records (250/10k) @@ -420,7 +422,7 @@ # MAGIC when OPERTN_4_CONCAT LIKE "%X56%" THEN 'X56' Else '0' End) as clinical_code, # MAGIC (case when OPERTN_4_CONCAT LIKE "%E851%" THEN 'Invasive ventilation' # MAGIC when OPERTN_4_CONCAT LIKE "%X56%" THEN 'Intubation of trachea' Else '0' End) as description, -# MAGIC "confirmed" as 
covid_status, +# MAGIC "" as covid_status, # MAGIC "HES APC" as source, # MAGIC "OPCS" as code, date_is, SUSRECID # MAGIC FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_apc @@ -438,7 +440,7 @@ # MAGIC when PROCEDURE_CONCAT LIKE "%X56%" THEN 'X56' Else '0' End) as clinical_code, # MAGIC (case when PROCEDURE_CONCAT LIKE "%E851%" OR PROCEDURE_CONCAT LIKE "%E85.1%" THEN 'Invasive ventilation' # MAGIC when PROCEDURE_CONCAT LIKE "%X56%" THEN 'Intubation of trachea' Else '0' End) as description, -# MAGIC "confirmed" as covid_status, +# MAGIC "" as covid_status, # MAGIC "SUS" as source, # MAGIC "OPCS" as code, "PRIMARY_PROCEDURE_DATE" as date_is # MAGIC FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_sus @@ -461,7 +463,7 @@ # MAGIC "03_ECMO_treatment" as covid_phenotype, # MAGIC "" as clinical_code, # MAGIC "RespiratorySupportECMO == Yes" as description, -# MAGIC "confirmed" as covid_status, +# MAGIC "" as covid_status, # MAGIC "CHESS" as source, # MAGIC "" as code, # MAGIC "DateAdmittedICU" as date_is -- Reasonable @@ -478,7 +480,7 @@ # MAGIC "03_ECMO_treatment" as covid_phenotype, # MAGIC "X58.1" as clinical_code, # MAGIC "Extracorporeal membrane oxygenation" as description, -# MAGIC "confirmed" as covid_status, +# MAGIC "" as covid_status, # MAGIC "HES APC" as source, # MAGIC "OPCS" as code, date_is, SUSRECID # MAGIC FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_apc @@ -495,7 +497,7 @@ # MAGIC "03_ECMO_treatment" as covid_phenotype, # MAGIC "X58.1" as clinical_code, # MAGIC "Extracorporeal membrane oxygenation" as description, -# MAGIC "confirmed" as covid_status, +# MAGIC "" as covid_status, # MAGIC "SUS" as source, # MAGIC "OPCS" as code, "PRIMARY_PROCEDURE_DATE" as date_is # MAGIC FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_sus @@ -614,6 +616,7 @@ # MAGIC AND (DISMETH = 4 -- died # MAGIC OR # MAGIC DISDEST = 79) -- discharge destination not applicable, died or stillborn +# MAGIC -- WARNING hard-coded study-start date # MAGIC AND (DISDATE >= TO_DATE("20200123", 
"yyyyMMdd")) -- death after study start # COMMAND ---------- @@ -832,7 +835,8 @@ # MAGIC -- OLD value after enforcing everyone has to be alive and in gdppr at study start and minimal followup 28 days # MAGIC -- OLD value @ 150621 3992872 after going back to no cohort subset and excluding pillar2-antigen and hes_op as sources # MAGIC -- Current value @220621 after accidental table deletion is still 3992872 - no damage done! -# MAGIC -- Current value @170821 - 5044357 +# MAGIC -- OLD value @170821 - 5044357 +# MAGIC # MAGIC SELECT count(DISTINCT person_id_deid) # MAGIC FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory @@ -862,7 +866,6 @@ # COMMAND ---------- -# This takes ages 4h drop_table("ccu013_covid_mild") create_table("ccu013_covid_mild") @@ -1009,7 +1012,7 @@ # MAGIC -- OLD value @ 04.06.21 = 3977185 # MAGIC -- OLD value @ 15.06.21 = 3772432 # MAGIC -- OLD value @ 15.06.21 = 3992872 -# MAGIC -- current value @ 17.08.21 = 5044357 +# MAGIC -- OLD value @ 17.08.21 = 5044357 # MAGIC SELECT count(DISTINCT person_id_deid) # MAGIC FROM dars_nic_391419_j3w9t_collab.ccu013_covid_severity @@ -1021,26 +1024,26 @@ # COMMAND ---------- # Uncomment to tidy -drop_table("ccu013_covid_mild") -drop_table("ccu013_covid_moderate") -drop_table("ccu013_covid_severe") -drop_table("ccu013_covid_severe_death") -drop_table("ccu013_covid_not_mild") -drop_table("ccu013_covid_not_mild_or_severe") -drop_table("ccu013_covid_severe_deaths") -drop_table("ccu013_tmp_apc") -drop_table("ccu013_tmp_cc") -drop_table("ccu013_tmp_op") -drop_table("ccu013_tmp_chess") -drop_table("ccu013_tmp_sgss") -drop_table("ccu013_tmp_deaths") -drop_table("ccu013_tmp_gdppr") -drop_table("ccu013_tmp_sus") -drop_table("ccu013_tmp_pillar2") -drop_table("tmp_ccu013_covid_trajectory_delta") -drop_table("tmp_ccu013_covid_trajectory") - -# COMMAND ---------- - -drop_table("ccu013_severity_paper_cohort_tmp") -drop_table("ccu013_jht_death_checks_tmp") +# drop_table("ccu013_covid_mild") +# 
drop_table("ccu013_covid_moderate") +# drop_table("ccu013_covid_severe") +# drop_table("ccu013_covid_severe_death") +# drop_table("ccu013_covid_not_mild") +# drop_table("ccu013_covid_not_mild_or_severe") +# drop_table("ccu013_covid_severe_deaths") +# drop_table("ccu013_tmp_apc") +# drop_table("ccu013_tmp_cc") +# drop_table("ccu013_tmp_op") +# drop_table("ccu013_tmp_chess") +# drop_table("ccu013_tmp_sgss") +# drop_table("ccu013_tmp_deaths") +# drop_table("ccu013_tmp_gdppr") +# drop_table("ccu013_tmp_sus") +# drop_table("ccu013_tmp_pillar2") +# drop_table("tmp_ccu013_covid_trajectory_delta") +# drop_table("tmp_ccu013_covid_trajectory") + +# COMMAND ---------- + +# drop_table("ccu013_severity_paper_cohort_tmp") +# drop_table("ccu013_jht_death_checks_tmp") diff --git a/code/01_phenotype_engineering/CCU013_04_events_severity.py b/code/01_phenotype_engineering/CCU013_04_events_severity.py index 750aac5..52deb1e 100644 --- a/code/01_phenotype_engineering/CCU013_04_events_severity.py +++ b/code/01_phenotype_engineering/CCU013_04_events_severity.py @@ -27,11 +27,14 @@ # MAGIC # MAGIC **Reviewer(s)** ⚠ UNREVIEWED # MAGIC -# MAGIC **Date last updated** 2021-09-09 +# MAGIC **Date last updated** 2022-01-22 # MAGIC # MAGIC **Date last reviewed** *NA* # MAGIC -# MAGIC **Date last run** 2021-09-09 +# MAGIC **Date last run** 2022-01-22 +# MAGIC +# MAGIC **Changelog** +# MAGIC * `2022-01-22` Renamed `critical_care` -> `ventilatory_support` acknowledging reviewer comments that 'critical care' implies more than just ventilatory treatment, e.g. 
nursing, monitoring # MAGIC # MAGIC **Data input** # MAGIC * `dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory` Specified in cell 4 below @@ -182,7 +185,7 @@ OR 03_ICU_admission = 1 OR 03_IMV_treatment = 1 OR 03_NIV_treatment = 1 - THEN '3_critical_care' + THEN '3_ventilatory_support' WHEN 02_Covid_admission = 1 THEN '2_hospitalised' @@ -200,15 +203,15 @@ # COMMAND ---------- # MAGIC %md -# MAGIC ## 3.4 `critical_care` aggregate variable +# MAGIC ## 3.4 `ventilatory_support` aggregate variable # MAGIC Chris: I find myself implementing this frequently in SQL for R analysis therefore will incorporate here # COMMAND ---------- -critical_care = spark.sql(f""" +ventilatory_support = spark.sql(f""" SELECT distinct person_id_deid, - 1 as critical_care + 1 as ventilatory_support FROM {trajectory_table} WHERE @@ -235,7 +238,7 @@ .join(severity, "person_id_deid", "left") \ - .join(critical_care, + .join(ventilatory_support, "person_id_deid", "left") \ .fillna(0) diff --git a/code/01_phenotype_engineering/CCU013_05_demographics.py b/code/01_phenotype_engineering/CCU013_05_demographics.py index e854d7a..b17bd1c 100644 --- a/code/01_phenotype_engineering/CCU013_05_demographics.py +++ b/code/01_phenotype_engineering/CCU013_05_demographics.py @@ -26,7 +26,7 @@ # MAGIC # MAGIC **Date last reviewed** *NA* # MAGIC -# MAGIC **Date last run** 2021-09-10 +# MAGIC **Date last run** 2022-01-22 # MAGIC # MAGIC **Data input** `ccu013_covid_events` *See cell 4* # MAGIC @@ -48,8 +48,19 @@ # COMMAND ---------- +# MAGIC %md +# MAGIC # Check master demographics rebuilt if updating prior to running! 
+# MAGIC Dependencies: +# MAGIC * [`1_demographics`](https://db.core.data.digital.nhs.uk/#notebook/3284853/command/3602362) Check production date +# MAGIC * [`2-1_CALIBER_codelist`](https://db.core.data.digital.nhs.uk/#notebook/3286046/command/3286047) does NOT need to be updated as codelists are static +# MAGIC * [`2-2_CALIBER_skinny`](https://db.core.data.digital.nhs.uk/#notebook/3286010/command/3286011) Check production date +# MAGIC * [`2-3_CALIBER_comorbidities_pre2020`](https://db.core.data.digital.nhs.uk/#notebook/3286218/command/3286219) No params +# MAGIC * [`2-4_CALIBER-categories_pre2020`](https://db.core.data.digital.nhs.uk/#notebook/3285967/command/3285968) No params + +# COMMAND ---------- + # Params -production_date = "2021-07-29 13:39:04.161949" +production_date = "2022-01-20 14:58:52.353312" # Notebook CCU013_01_create_table_aliases Cell 8 # COVID-19 events events_table = "dars_nic_391419_j3w9t_collab.ccu013_covid_events" @@ -127,6 +138,7 @@ # MAGIC %md # MAGIC ## 2.2 Long COVID +# MAGIC **NB not in analysis owing to coding issues as highlighted by OpenSAFELY paper** # COMMAND ---------- @@ -200,10 +212,12 @@ # COMMAND ---------- +# Old value pre @22.01.2022 update: 5044357 display(spark.sql(f"SELECT COUNT(*), COUNT(DISTINCT person_id_deid) FROM {events_table}")) # COMMAND ---------- +# Old value pre @22.01.2022 update: 5044357 display(spark.sql(f"SELECT COUNT(*), COUNT(DISTINCT person_id_deid) FROM dars_nic_391419_j3w9t_collab.{output_table}")) # COMMAND ---------- diff --git a/code/02_study_population/CCU013_11_paper_cohort_dp_skinny_record_unassembled.py b/code/02_study_population/CCU013_11_paper_cohort_dp_skinny_record_unassembled.py index f8f7860..579e670 100644 --- a/code/02_study_population/CCU013_11_paper_cohort_dp_skinny_record_unassembled.py +++ b/code/02_study_population/CCU013_11_paper_cohort_dp_skinny_record_unassembled.py @@ -12,11 +12,9 @@ # MAGIC # MAGIC **Reviewer(s)** Angela Wood # MAGIC -# MAGIC **Date last updated** 2021-08-17 +# 
MAGIC **Date last updated** 2022-01-22 # MAGIC -# MAGIC **Date last reviewed** 2021-08-17 -# MAGIC -# MAGIC **Date last run** 2021-08-17 +# MAGIC **Date last run** `1/22/2022, 11:51:50 AM ` # MAGIC # MAGIC **Data input** [HES, GDPPR, Deaths] # MAGIC @@ -55,7 +53,7 @@ # COMMAND ---------- -production_date = "2021-07-29 13:39:04.161949" +production_date = "2022-01-20 14:58:52.353312" # COMMAND ---------- @@ -168,33 +166,6 @@ # COMMAND ---------- -# MAGIC %sql -# MAGIC --- Pre conversion to spark! and using frozen data from Mehrdad -# MAGIC ---CREATE OR REPLACE GLOBAL TEMP VIEW ccu013_dp_gdppr_patients AS -# MAGIC --- SELECT NHS_NUMBER_DEID, -# MAGIC --- ETHNIC, -# MAGIC --- SEX, -# MAGIC --- DATE_OF_BIRTH, -# MAGIC --- DATE_OF_DEATH, -# MAGIC --- RECORD_DATE, -# MAGIC --- record_id, -# MAGIC --- dataset, -# MAGIC --- primary -# MAGIC --- FROM ( -# MAGIC --- SELECT NHS_NUMBER_DEID, -# MAGIC --- gdppr.ETHNIC, -# MAGIC --- gdppr.SEX, -# MAGIC --- to_date(string(YEAR_OF_BIRTH),"yyyy") as DATE_OF_BIRTH, -# MAGIC --- to_date(string(YEAR_OF_DEATH),"yyyy") as DATE_OF_DEATH, -# MAGIC --- REPORTING_PERIOD_END_DATE as RECORD_DATE, -- I got this off Natasha from Primary Care -# MAGIC --- NULL as record_id, -# MAGIC --- 'GDPPR' as dataset, -# MAGIC --- 1 as primary -# MAGIC --- FROM dars_nic_391419_j3w9t_collab.ccu003_direfcts_dataprep_1_gdppr_frzon28may_mm_210528 as gdppr -# MAGIC --- ) - -# COMMAND ---------- - # MAGIC %md GDPPR can also store the patient ethnicity in the `CODE` column as a SNOMED code, hence we need to bring this in as another record for the patient (but with null for the other features as they come from the generic record above) # COMMAND ---------- @@ -225,34 +196,6 @@ # COMMAND ---------- -# MAGIC %sql -# MAGIC --- Pre conversion to spark! 
and using frozen data from Mehrdad -# MAGIC ---CREATE OR REPLACE GLOBAL TEMP VIEW ccu013_dp_gdppr_patients_SNOMED AS -# MAGIC --- SELECT NHS_NUMBER_DEID, -# MAGIC --- ETHNIC, -# MAGIC --- SEX, -# MAGIC --- DATE_OF_BIRTH, -# MAGIC --- DATE_OF_DEATH, -# MAGIC --- RECORD_DATE, -# MAGIC --- record_id, -# MAGIC --- dataset, -# MAGIC --- primary -# MAGIC --- FROM ( -# MAGIC --- SELECT NHS_NUMBER_DEID, -# MAGIC --- eth.PrimaryCode as ETHNIC, -# MAGIC --- gdppr.SEX, -# MAGIC --- to_date(string(YEAR_OF_BIRTH),"yyyy") as DATE_OF_BIRTH, -# MAGIC --- to_date(string(YEAR_OF_DEATH),"yyyy") as DATE_OF_DEATH, -# MAGIC --- DATE as RECORD_DATE, -# MAGIC --- NULL as record_id, -# MAGIC --- 'GDPPR_snomed' as dataset, -# MAGIC --- 1 as primary -# MAGIC --- FROM dars_nic_391419_j3w9t_collab.ccu003_direfcts_dataprep_1_gdppr_frzon28may_mm_210528 as gdppr -# MAGIC --- INNER JOIN dss_corporate.gdppr_ethnicity_mappings eth on gdppr.CODE = eth.ConceptId -# MAGIC --- ) - -# COMMAND ---------- - # MAGIC %md ### Single death per patient # MAGIC In the deaths table (Civil registration deaths), some unfortunate people are down as dying twice. Let's take the most recent death date. @@ -280,23 +223,6 @@ # COMMAND ---------- -# MAGIC %sql -# MAGIC --- Pre conversion to spark! 
and using frozen data from Mehrdad -# MAGIC ---CREATE OR REPLACE GLOBAL TEMP VIEW ccu013_dp_single_patient_death AS -# MAGIC -# MAGIC ---SELECT * -# MAGIC ---FROM -# MAGIC --- (SELECT * , row_number() OVER (PARTITION BY DEC_CONF_NHS_NUMBER_CLEAN_DEID -# MAGIC --- ORDER BY REG_DATE desc, REG_DATE_OF_DEATH desc) as death_rank -# MAGIC --- FROM dars_nic_391419_j3w9t_collab.ccu003_direfcts_dataprep_1_deaths_frzon28may_mm_210528 -# MAGIC --- ) cte -# MAGIC ---WHERE death_rank = 1 -# MAGIC ---AND DEC_CONF_NHS_NUMBER_CLEAN_DEID IS NOT NULL -# MAGIC ---and TO_DATE(REG_DATE_OF_DEATH, "yyyyMMdd") > '1900-01-01' -# MAGIC ---AND TO_DATE(REG_DATE_OF_DEATH, "yyyyMMdd") <= current_date() - -# COMMAND ---------- - # MAGIC %md ## Combine Primary and Secondary Care along with Deaths data # MAGIC Flag some values as NULLs: # MAGIC - DATE_OF_DEATH flag the following as like NULL: 'NULL', "" empty strings (or just spaces), < 1900-01-01, after the current_date(), after the record_date (the person shouldn't be set to die in the future!) @@ -422,34 +348,6 @@ # COMMAND ---------- -# MAGIC %sql -# MAGIC --- Pre conversion to spark! 
and using frozen data from Mehrdad -# MAGIC ---CREATE OR REPLACE GLOBAL TEMP VIEW ccu013_dp_patient_dataset_presence_lookup AS -# MAGIC ---SELECT NHS_NUMBER_DEID, -# MAGIC --- COALESCE(deaths, 0) as deaths, -# MAGIC --- COALESCE(sgss, 0) as sgss, -# MAGIC --- COALESCE(gdppr,0) as gdppr, -# MAGIC --- COALESCE(hes_apc, 0) as hes_apc, -# MAGIC --- COALESCE(hes_op, 0) as hes_op, -# MAGIC --- COALESCE(hes_ae, 0) as hes_ae, -# MAGIC --- CASE WHEN hes_ae = 1 or hes_apc=1 or hes_op = 1 THEN 1 ELSE 0 END as hes -# MAGIC ---FROM ( -# MAGIC ---SELECT DISTINCT DEC_CONF_NHS_NUMBER_CLEAN_DEID as NHS_NUMBER_DEID, "deaths" as data_table, 1 as presence FROM dars_nic_391419_j3w9t_collab.ccu003_direfcts_dataprep_1_deaths_frzon28may_mm_210528 -# MAGIC ---union all -# MAGIC ---SELECT DISTINCT PERSON_ID_DEID as NHS_NUMBER_DEID, "sgss" as data_table, 1 as presence FROM dars_nic_391419_j3w9t_collab.ccu003_direfcts_dataprep_1_sgss_frzon28may_mm_210528 -# MAGIC ---union all -# MAGIC ---SELECT DISTINCT NHS_NUMBER_DEID, "gdppr" as data_table, 1 as presence FROM global_temp.ccu013_dp_gdppr_patients -# MAGIC ---union all -# MAGIC ---SELECT DISTINCT NHS_NUMBER_DEID, "hes_apc" as data_table, 1 as presence FROM global_temp.ccu013_dp_all_hes_apc -# MAGIC ---union all -# MAGIC ---SELECT DISTINCT NHS_NUMBER_DEID, "hes_ae" as data_table, 1 as presence FROM global_temp.ccu013_dp_all_hes_ae -# MAGIC ---union all -# MAGIC ---SELECT DISTINCT NHS_NUMBER_DEID, "hes_op" as data_table, 1 as presence FROM global_temp.ccu013_dp_all_hes_op -# MAGIC ---) -# MAGIC ---PIVOT (MAX(presence) FOR data_table in ("deaths", "sgss", "gdppr", "hes_apc", "hes_op", "hes_ae")) - -# COMMAND ---------- - drop_table(table_name='ccu013_dp_patient_dataset_presence_lookup') create_table(table_name='ccu013_dp_patient_dataset_presence_lookup') diff --git a/code/02_study_population/CCU013_12_paper_cohort_dp_skinny_patient_23_01_2020.py b/code/02_study_population/CCU013_12_paper_cohort_dp_skinny_patient_23_01_2020.py index 
029971f..77b017f 100644 --- a/code/02_study_population/CCU013_12_paper_cohort_dp_skinny_patient_23_01_2020.py +++ b/code/02_study_population/CCU013_12_paper_cohort_dp_skinny_patient_23_01_2020.py @@ -25,11 +25,9 @@ # MAGIC # MAGIC **Reviewer(s)** Angela Wood # MAGIC -# MAGIC **Date last updated** 2021-08-17 +# MAGIC **Date last updated** 2022-01-22 # MAGIC -# MAGIC **Date last reviewed** 2021-08-17 -# MAGIC -# MAGIC **Date last run** 2020-08-17 +# MAGIC **Date last run** `1/22/2022, 12:52:31 PM` # MAGIC # MAGIC **Data input** [**table**: `dars_nic_391419_j3w9t_collab.ccu013_dp_patient_skinny_unassembled`] # MAGIC @@ -71,7 +69,8 @@ # MAGIC %sql # MAGIC --- old value pre August update 106,011,394 -# MAGIC --- Current value @ 17.08.2021 107,194,954 +# MAGIC --- old value @ 17.08.2021 107,194,954 +# MAGIC --- Current value @ 22.01.2022 109,858,924 # MAGIC SELECT count(DISTINCT NHS_NUMBER_DEID) from dars_nic_391419_j3w9t_collab.ccu013_dp_patient_skinny_unassembled # COMMAND ---------- @@ -114,8 +113,9 @@ # COMMAND ---------- # MAGIC %sql -# MAGIC --- old value pre August update 99,531,011 -# MAGIC --- Current value @ 17.08.2021 99,934,779 +# MAGIC --- old value pre August update 99,531,011 +# MAGIC --- old value @ 17.08.2021 99,934,779 +# MAGIC --- Current value @ 22.01.2022 100,118,189 # MAGIC SELECT count(DISTINCT NHS_NUMBER_DEID) from dars_nic_391419_j3w9t_collab.ccu013_dp_patient_fields_ranked_pre_cutoff # COMMAND ---------- @@ -130,8 +130,7 @@ # MAGIC CASE WHEN ETHNICITY_CODE IN ('1','2','3','N','M','P') THEN "Black or Black British" # MAGIC WHEN ETHNICITY_CODE IN ('0','A','B','C') THEN "White" # MAGIC WHEN ETHNICITY_CODE IN ('4','5','6','L','K','J','H') THEN "Asian or Asian British" -# MAGIC WHEN ETHNICITY_CODE IN ('7','8','W','T','S','R') THEN "Other Ethnic Gr -# MAGIC oups" +# MAGIC WHEN ETHNICITY_CODE IN ('7','8','W','T','S','R') THEN "Other Ethnic Groups" # MAGIC WHEN ETHNICITY_CODE IN ('D','E','F','G') THEN "Mixed" # MAGIC WHEN ETHNICITY_CODE IN ('9','Z','X') 
THEN "Unknown" # MAGIC ELSE 'Unknown' END as ETHNIC_GROUP @@ -178,7 +177,8 @@ # MAGIC %sql # MAGIC --- old value pre August update 55,988,064 -# MAGIC --- Current value @ 17.08.2021 56,721,158 +# MAGIC --- old value @ 17.08.2021 56,721,158 +# MAGIC --- Current value @ 22.01.2022 57,175,620 # MAGIC SELECT count(DISTINCT NHS_NUMBER_DEID) FROM global_temp.ccu013_dp_skinny_patient_23_01_2020 # COMMAND ---------- @@ -193,12 +193,13 @@ # MAGIC %sql # MAGIC --- old value pre August update 55,988,064 -# MAGIC --- Current value @ 17.08.2021 56,721,158 +# MAGIC --- old value @ 17.08.2021 56,721,158 +# MAGIC --- Current value @ 22.01.2022 57,175,620 # MAGIC SELECT count(DISTINCT NHS_NUMBER_DEID) FROM global_temp.ccu013_dp_skinny_patient_23_01_2020_age # COMMAND ---------- -# MAGIC %md filter out the dead people - so we only have people which are **alive** on **1st Jan 2020** +# MAGIC %md filter out the dead people - so we only have people which are **alive** on **23rd Jan 2020** # COMMAND ---------- @@ -207,7 +208,7 @@ # MAGIC # MAGIC SELECT * # MAGIC FROM global_temp.ccu013_dp_skinny_patient_23_01_2020_age -# MAGIC WHERE COALESCE(DATE_OF_DEATH, '2199-01-01') > '2020-01-01' +# MAGIC WHERE COALESCE(DATE_OF_DEATH, '2199-01-01') > '2020-01-23' # COMMAND ---------- @@ -237,5 +238,6 @@ # MAGIC %sql # MAGIC --- old value pre August update 55,876,173 -# MAGIC --- Current value @ 17.08.2021 56,609,049 +# MAGIC --- old value @ 17.08.2021 56,609,049 +# MAGIC --- current value @ 22.01.2022 57,032,174 # MAGIC SELECT count(DISTINCT NHS_NUMBER_DEID) FROM dars_nic_391419_j3w9t_collab.ccu013_dp_skinny_patient_23_01_2020 diff --git a/code/02_study_population/CCU013_13_paper_subset_data_to_cohort.py b/code/02_study_population/CCU013_13_paper_subset_data_to_cohort.py index 80d852d..f706ba0 100644 --- a/code/02_study_population/CCU013_13_paper_subset_data_to_cohort.py +++ b/code/02_study_population/CCU013_13_paper_subset_data_to_cohort.py @@ -12,11 +12,11 @@ # MAGIC # MAGIC **Reviewer(s)** # MAGIC -# MAGIC **Date last updated** 2021-08-17 +# MAGIC 
**Date last updated** 2021-08-17 +# MAGIC **Date last updated** 2022-01-22 # MAGIC # MAGIC **Date last reviewed** # MAGIC -# MAGIC **Date last run** 2021-08-17 +# MAGIC **Date last run** 2022-01-22 # MAGIC # MAGIC **Data input** # MAGIC 1. Descriptive Paper methodology derived cohort @@ -62,49 +62,60 @@ # MAGIC %sql # MAGIC --- Individuals alive and registred in GDPPR on 23/01/2020 -# MAGIC --- Old value = 55,876,173 -# MAGIC --- value @ 170821 = 56,609,049 +# MAGIC --- Old value = 55,876,173 +# MAGIC --- Old @ 170821 = 56,609,049 +# MAGIC --- Current value @ 220122 = 57,032,174 # MAGIC SELECT count(DISTINCT NHS_NUMBER_DEID) FROM dars_nic_391419_j3w9t_collab.ccu013_dp_skinny_patient_23_01_2020 # COMMAND ---------- -# MAGIC %sql -# MAGIC SELECT * FROM dars_nic_391419_j3w9t_collab.ccu013_dp_skinny_patient_23_01_2020 -# MAGIC WHERE DATE_OF_DEATH <= "2020-03-20" - -# COMMAND ---------- - # MAGIC %md # MAGIC #### 1.1.1 Find patients who do not have minimum follow up time # MAGIC - Participants with non-fatal index events who had less than 28 days of follow up were excluded. # COMMAND ---------- +# MAGIC %sql +# MAGIC --- IMPORTANT check that no death date is larger than the study end date !!! 
+# MAGIC ---- As that would cause errors in the code below +# MAGIC SELECT MAX(date) +# MAGIC FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory +# MAGIC WHERE covid_phenotype == '04_Fatal_with_covid_diagnosis' OR +# MAGIC covid_phenotype == '04_Fatal_without_covid_diagnosis' OR +# MAGIC covid_phenotype == '04_Covid_inpatient_death' + +# COMMAND ---------- + from pyspark.sql.functions import * +# Warning - update study end date +study_end_date = lit(datetime(2021, 11, 30)) + all_fatal = spark.sql(""" SELECT person_id_deid, MIN(date) AS death_date FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory -WHERE covid_phenotype == '04_Fatal_with_covid_diagnosis' OR +WHERE (covid_phenotype == '04_Fatal_with_covid_diagnosis' OR covid_phenotype == '04_Fatal_without_covid_diagnosis' OR -covid_phenotype == '04_Covid_inpatient_death' +covid_phenotype == '04_Covid_inpatient_death') +AND date >= "2020-01-23" GROUP BY person_id_deid """) -# Get all none deaths dates +# Get first covid event dates for everyone, except those with ONLY fatal events followup_time = spark.sql(""" SELECT person_id_deid, MIN(date) AS first_covid_event FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory -WHERE covid_phenotype != '04_Fatal_with_covid_diagnosis' OR +WHERE (covid_phenotype != '04_Fatal_with_covid_diagnosis' OR covid_phenotype != '04_Fatal_without_covid_diagnosis' OR -covid_phenotype != '04_Covid_inpatient_death' +covid_phenotype != '04_Covid_inpatient_death') +AND date >= "2020-01-23" GROUP BY person_id_deid """) # Calculate elapsed number of days between earliest event and study end (except if fatal) followup_time = followup_time.join(all_fatal, ['person_id_deid'], how='left') followup_time = followup_time.select(['person_id_deid', 'first_covid_event', 'death_date']) -followup_time = followup_time.withColumn('study_end', lit(datetime(2021, 3, 31))) +followup_time = followup_time.withColumn('study_end', study_end_date) followup_time= 
followup_time.withColumn('followup_days', when(followup_time['death_date'].isNull(), datediff(followup_time['study_end'], followup_time['first_covid_event'])).otherwise(-1)) @@ -116,28 +127,53 @@ # COMMAND ---------- -# MAGIC %sql -- participants excluded due to lack of 28 days minimal followup time. -# MAGIC -# MAGIC --- NOTE THIS number also include patients who enter the study after the cutoff time +# MAGIC %md +# MAGIC Note that these counts are prior to joining on to skinny table, in other words could contain patients that don't meet the study inclusion + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC -- participants excluded due to lack of 28 days minimal followup time. +# MAGIC -- OLD With study_end as 2021, 3, 31 -> 1,280,138 +# MAGIC -- OLD With study_end as 2021, 5, 31 -> 1,081,496 +# MAGIC -- current with study_end as 2021, 11, 30 -> 917,278 # MAGIC SELECT count(DISTINCT person_id_deid) FROM global_temp.followup_time # MAGIC WHERE 28d_followup == 0 # COMMAND ---------- +# MAGIC %sql +# MAGIC --- CHECK that no follow-up time is less than -1 +# MAGIC SELECT * FROM global_temp.followup_time +# MAGIC where followup_days < -1 + +# COMMAND ---------- + # MAGIC %md # MAGIC #### 1.1.2 Subset trajectory table -# MAGIC Subset for cohor population - inclusion time and minimal followup +# MAGIC Subset for cohort population - inclusion time and minimum follow-up # COMMAND ---------- # MAGIC %sql +# MAGIC --- Current @ 22.01.22 = 8,714,594 # MAGIC SELECT count (DISTINCT person_id_deid) from dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory # COMMAND ---------- # MAGIC %sql +# MAGIC --- Current @ 22.01.22 = 8,714,455 +# MAGIC -- Removes only: 139 patients # MAGIC SELECT count (DISTINCT person_id_deid) from dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory -# MAGIC WHERE date >= "2020-01-23" AND date <= "2021-05-31" +# MAGIC WHERE date >= "2020-01-23" AND date <= "2021-11-30" + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC -- These are those records dated before 
index event +# MAGIC -- NB 597 unique IDs here, but these patients could also have event within study dates +# MAGIC SELECT * from dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory +# MAGIC WHERE date < "2020-01-23" OR date > "2021-11-30" # COMMAND ---------- @@ -150,18 +186,20 @@ # MAGIC dars_nic_391419_j3w9t_collab.ccu013_dp_skinny_patient_23_01_2020 tab2 # MAGIC ON # MAGIC tab1.person_id_deid = tab2.NHS_NUMBER_DEID -# MAGIC WHERE date >= "2020-01-23" AND date <= "2021-05-31" +# MAGIC WHERE date >= "2020-01-23" AND date <= "2021-11-30" # COMMAND ---------- # MAGIC %sql # MAGIC -- Value @ 150621 3567617 # MAGIC -- Value @ 170821 3705123 +# MAGIC -- Value @ 220122 8103909 # MAGIC SELECT count (DISTINCT person_id_deid) from global_temp.ccu013_covid_trajectory_paper_cohort_tmp # COMMAND ---------- # MAGIC %sql +# MAGIC -- Remove those based on minimum follow-up criteria # MAGIC CREATE OR REPLACE GLOBAL TEMP VIEW ccu013_covid_trajectory_paper_cohort as # MAGIC WITH list_patients_to_omit AS (SELECT person_id_deid from global_temp.followup_time WHERE 28d_followup == 0) # MAGIC SELECT /*+ BROADCAST(list_patients_to_omit) */ t.* FROM global_temp.ccu013_covid_trajectory_paper_cohort_tmp as t @@ -182,14 +220,16 @@ # MAGIC %sql # MAGIC -- value @ 150621 = 3454653 # MAGIC -- value @ 170821 = 3469528 +# MAGIC -- value @ 220122 = 7244925 # MAGIC SELECT count (DISTINCT person_id_deid) from dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort # COMMAND ---------- # MAGIC %sql -# MAGIC -- value @ 150621 = 8683174 -# MAGIC -- value @ 170821 = 8825738 -# MAGIC SELECT count (*) from dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort +# MAGIC -- value @ 150621 = 8683174 +# MAGIC -- value @ 170821 = 8825738 +# MAGIC -- value @ 220122 = 13990423 +# MAGIC SELECT count (*) as total_records from dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort # COMMAND ---------- @@ -207,13 +247,15 @@ # COMMAND ---------- # MAGIC %sql +# MAGIC -- OLD value 
5,044,357 +# MAGIC -- Current value 8,714,594 # MAGIC SELECT count (DISTINCT person_id_deid) from dars_nic_391419_j3w9t_collab.ccu013_covid_severity # COMMAND ---------- # MAGIC %sql # MAGIC CREATE OR REPLACE GLOBAL TEMP VIEW ccu013_covid_severity_paper_cohort AS -# MAGIC SELECT s.person_id_deid, s.date, s.covid_severity, s.ProductionDate FROM dars_nic_391419_j3w9t_collab.ccu013_covid_severity as s +# MAGIC SELECT DISTINCT s.person_id_deid, s.date, s.covid_severity, s.ProductionDate FROM dars_nic_391419_j3w9t_collab.ccu013_covid_severity as s # MAGIC INNER JOIN dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort as t # MAGIC ON s.person_id_deid == t.person_id_deid @@ -230,12 +272,18 @@ # COMMAND ---------- # MAGIC %sql -# MAGIC -- value @ 150621 = 3454653 -# MAGIC -- value @ 170821 = 3469528 +# MAGIC -- value @ 150621 = 3454653 +# MAGIC -- value @ 170821 = 3469528 +# MAGIC -- Current @ 220122 = 7244925 # MAGIC SELECT count(DISTINCT person_id_deid) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_severity_paper_cohort # COMMAND ---------- +# MAGIC %sql +# MAGIC SELECT count(*) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_severity_paper_cohort + +# COMMAND ---------- + # MAGIC %md # MAGIC # 2 Create input for patient trajectory plots # MAGIC - Create order and simplified phenotype groups for the plots @@ -353,6 +401,7 @@ # MAGIC %sql # MAGIC -- 56609049 +# MAGIC -- 57032174 # MAGIC SELECT count (distinct person_id_deid) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_graph_data # COMMAND ---------- @@ -402,22 +451,29 @@ # COMMAND ---------- # MAGIC %sql -# MAGIC -- 56491308 +# MAGIC -- old value - 56491308 +# MAGIC -- current value = 56945027 # MAGIC SELECT count(distinct person_id_deid) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_graph_data_wave1 # COMMAND ---------- # MAGIC %sql +# MAGIC --- Old = 57035046 +# MAGIC --- current value = 57490005 # MAGIC SELECT count(*) FROM 
dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_graph_data_wave1 # COMMAND ---------- # MAGIC %sql -# MAGIC SELECT count (distinct person_id_deid) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort_wave1 +# MAGIC --- Old = 263,839 +# MAGIC --- OBS Not sure this is in use any more +# MAGIC ---SELECT count (distinct person_id_deid) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort_wave1 # COMMAND ---------- # MAGIC %sql +# MAGIC --- 3456753 +# MAGIC --- 7232055 # MAGIC SELECT count (distinct person_id_deid) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_graph_data_wave1 # MAGIC ---SELECT * FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_graph_data_wave1 # MAGIC WHERE covid_severity != "00_unaffected" ---AND date <= date_add(TO_DATE("2020-05-29"),28) @@ -425,6 +481,8 @@ # COMMAND ---------- # MAGIC %sql +# MAGIC --- Old value = 53034555 +# MAGIC --- Current value @ = 49712972 # MAGIC SELECT count (distinct person_id_deid) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_graph_data_wave1 # MAGIC ---SELECT * FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_graph_data_wave1 # MAGIC WHERE covid_severity == "00_unaffected" ---AND date <= date_add(TO_DATE("2020-05-29"),28) @@ -437,6 +495,8 @@ # COMMAND ---------- # MAGIC %sql +# MAGIC --- Old value = 55774208 +# MAGIC --- New value @ 220122 = 56225024 # MAGIC --- Query to define all pople included in wave 2 # MAGIC --- This is used below to subset the trajectory graph data # MAGIC SELECT count(distinct a.person_id_deid) FROM @@ -577,7 +637,8 @@ # COMMAND ---------- # MAGIC %sql -# MAGIC -- 56491308 +# MAGIC -- OLD - = 56491308 +# MAGIC -- New @ 220122 = 56945027 # MAGIC SELECT count(distinct person_id_deid) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_graph_data_wave1_icu # COMMAND ---------- @@ -610,7 +671,8 @@ # COMMAND ---------- # MAGIC %sql -# MAGIC -- 55774208 +# MAGIC -- OLD - 55774208 +# MAGIC -- New = 56225024 # MAGIC 
SELECT count(distinct person_id_deid) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_graph_data_wave2_icu # COMMAND ---------- @@ -622,40 +684,40 @@ # COMMAND ---------- -import pyspark.sql.functions as funcs -from pyspark.sql.window import Window -reinfec = spark.sql(""" -SELECT person_id_deid, date FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory -WHERE covid_phenotype in ('01_Covid_positive_test') -""") +#import pyspark.sql.functions as funcs +#from pyspark.sql.window import Window +#reinfec = spark.sql(""" +#SELECT person_id_deid, date FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory +#WHERE covid_phenotype in ('01_Covid_positive_test') +#""") -reinfect_threshold = 90 # SIREN study +#reinfect_threshold = 90 # SIREN study # Find days between consecutive positive COVID tests # Define window to particion by -window = Window.partitionBy('person_id_deid').orderBy('date') +#window = Window.partitionBy('person_id_deid').orderBy('date') # Calculate difference in days per ID -reinfec = reinfec.withColumn("days_passed", funcs.datediff(reinfec.date, - funcs.lag(reinfec.date, 1).over(window))) +#reinfec = reinfec.withColumn("days_passed", funcs.datediff(reinfec.date, +# funcs.lag(reinfec.date, 1).over(window))) # Save to table -reinfec.createOrReplaceGlobalTempView("ccu013_covid_reinfection_days_between_positive_tests") -drop_table("ccu013_covid_reinfection_days_between_positive_tests") -create_table("ccu013_covid_reinfection_days_between_positive_tests") +#reinfec.createOrReplaceGlobalTempView("ccu013_covid_reinfection_days_between_positive_tests") +#drop_table("ccu013_covid_reinfection_days_between_positive_tests") +#create_table("ccu013_covid_reinfection_days_between_positive_tests") # Get the maximum difference in days between positive tests per individual -w = Window.partitionBy('person_id_deid') -reinfec_max_days = reinfec.withColumn('max_days_passed', f.max('days_passed').over(w))\ - .where(f.col('days_passed') == 
f.col('max_days_passed'))\ - .drop('max_days_passed') +#w = Window.partitionBy('person_id_deid') +#reinfec_max_days = reinfec.withColumn('max_days_passed', f.max('days_passed').over(w))\ +# .where(f.col('days_passed') == f.col('max_days_passed'))\ +# .drop('max_days_passed') ## Find reinfected using reinfect_threshold -reinfec_max_days = reinfec_max_days.withColumn('reinfected', f.when((f.col('days_passed') >= reinfect_threshold),1).otherwise(0)) -reinfec_max_days = reinfec_max_days.where(f.col('reinfected') == 1) +#reinfec_max_days = reinfec_max_days.withColumn('reinfected', f.when((f.col('days_passed') >= reinfect_threshold),1).otherwise(0)) +#reinfec_max_days = reinfec_max_days.where(f.col('reinfected') == 1) # Save to table -reinfec_max_days.createOrReplaceGlobalTempView("ccu013_covid_reinfected_after_90_days") -drop_table("ccu013_covid_reinfected_after_90_days") -create_table("ccu013_covid_reinfected_after_90_days") +#reinfec_max_days.createOrReplaceGlobalTempView("ccu013_covid_reinfected_after_90_days") +#drop_table("ccu013_covid_reinfected_after_90_days") +#create_table("ccu013_covid_reinfected_after_90_days") # COMMAND ---------- diff --git a/code/02_study_population/CCU013_14_paper_events_severity.py b/code/02_study_population/CCU013_14_paper_events_severity.py index 6be44ec..faca6fa 100644 --- a/code/02_study_population/CCU013_14_paper_events_severity.py +++ b/code/02_study_population/CCU013_14_paper_events_severity.py @@ -16,7 +16,7 @@ # MAGIC 3.1. `0_Covid_infection` as catch-all # MAGIC 3.2. `death_covid` if COVID-19 on death certificate, at any position # MAGIC 3.3. `severity` mutually exclusive worst healthcare event, not including death unless death the only dataset from which patient is ascertained -# MAGIC 3.4. `critical_care` aggregated variable +# MAGIC 3.4. `critical_care` -> `ventilatory_support` aggregated variable # MAGIC 4. Joins to produce cohort # MAGIC 4.2. Tests no duplicates # MAGIC 4.3. 
Creates delta table & optimises @@ -24,14 +24,15 @@ # MAGIC **Project(s)** CCU013 # MAGIC # MAGIC **Author(s)** Chris Tomlinson -# MAGIC -# MAGIC **Reviewer(s)** ⚠ UNREVIEWED -# MAGIC +# MAGIC # MAGIC **Date last updated** 2021-09-09 # MAGIC # MAGIC **Date last reviewed** *NA* # MAGIC -# MAGIC **Date last run** 2021-09-28 +# MAGIC **Date last run** `1/23/2022, 1:24:56 PM` +# MAGIC +# MAGIC **Changelog** +# MAGIC * `2022-01-23` Renamed `critical_care` -> `ventilatory_support` acknowledging reviewer comments that 'critical care' implies more than just ventilatory treatment, e.g. nursing, monitoring # MAGIC # MAGIC **Data input** # MAGIC * `dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort` Specified in cell 4 below @@ -183,7 +184,7 @@ OR 03_ICU_admission = 1 OR 03_IMV_treatment = 1 OR 03_NIV_treatment = 1 - THEN '3_critical_care' + THEN '3_ventilatory_support' WHEN 02_Covid_admission = 1 THEN '2_hospitalised' @@ -201,15 +202,15 @@ # COMMAND ---------- # MAGIC %md -# MAGIC ## 3.4 `critical_care` aggregate variable +# MAGIC ## 3.4 `ventilatory_support` aggregate variable # MAGIC Chris: I find myself implementing this frequently in SQL for R analysis therefore will incorporate here # COMMAND ---------- -critical_care = spark.sql(f""" +ventilatory_support = spark.sql(f""" SELECT distinct person_id_deid, - 1 as critical_care + 1 as ventilatory_support FROM {trajectory_table} WHERE @@ -236,7 +237,7 @@ .join(severity, "person_id_deid", "left") \ - .join(critical_care, + .join(ventilatory_support, "person_id_deid", "left") \ .fillna(0) @@ -273,11 +274,22 @@ # COMMAND ---------- # MAGIC %sql -# MAGIC -- 17/8: 3469528 -# MAGIC -- 28/9: 3469528 +# MAGIC -- 17/8/21: 3469528 +# MAGIC -- 28/9/21: 3469528 +# MAGIC -- 23/1/22: 7244925 # MAGIC SELECT COUNT(distinct person_id_deid) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_paper_cohort # COMMAND ---------- # MAGIC %sql # MAGIC SELECT COUNT(distinct person_id_deid) FROM 
dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Check that all those with <28d follow-up die + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC SELECT * FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_paper_cohort WHERE date_first > '2021-11-02' AND death != 1 diff --git a/code/02_study_population/CCU013_15_paper_demographics_comorbidities.py b/code/02_study_population/CCU013_15_paper_demographics_comorbidities.py index ae3edb3..5c5b2de 100644 --- a/code/02_study_population/CCU013_15_paper_demographics_comorbidities.py +++ b/code/02_study_population/CCU013_15_paper_demographics_comorbidities.py @@ -27,11 +27,11 @@ # MAGIC # MAGIC **Reviewer(s)** # MAGIC -# MAGIC **Date last updated** 2021-09-07 +# MAGIC **Date last updated** 2022-01-23 # MAGIC # MAGIC **Date last reviewed** *NA* # MAGIC -# MAGIC **Date last run** 2021-10-06 +# MAGIC **Date last run** `1/23/2022, 1:43:55 PM` # MAGIC # MAGIC **Data input** # MAGIC * **`dars_nic_391419_j3w9t_collab.ccu013_covid_events_paper_cohort`** @@ -66,7 +66,7 @@ # COMMAND ---------- # Params -production_date = "2021-07-29 13:39:04.161949" +production_date = "2022-01-20 14:58:52.353312" # COVID-19 events events_table = "dars_nic_391419_j3w9t_collab.ccu013_covid_events_paper_cohort" @@ -232,7 +232,7 @@ # COMMAND ---------- # MAGIC %sql -# MAGIC -- 3469528 +# MAGIC -- 10/6/21 3469528 # MAGIC SELECT COUNT(*), COUNT(DISTINCT person_id_deid) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort # COMMAND ---------- diff --git a/code/02_study_population/CCU013_17_paper_survival_cohort.py b/code/02_study_population/CCU013_17_paper_survival_cohort.py index fc5ba9f..1b6f5f7 100644 --- a/code/02_study_population/CCU013_17_paper_survival_cohort.py +++ b/code/02_study_population/CCU013_17_paper_survival_cohort.py @@ -8,8 +8,8 @@ from datetime import datetime from pyspark.sql.types import DateType -start_date = '2020-01-01' -end_date = '2021-07-29' 
# The maximal date covered by all sources. +start_date = '2020-01-23' +end_date = '2021-11-30' # The maximal date covered by all sources. # NB common cut-off data across all data sources is implemented in CCU013_13_paper_subset_data_to_cohort # COMMAND ---------- @@ -114,6 +114,7 @@ # MAGIC %md # MAGIC ## 2. Add mutex severity phenotype +# MAGIC * `03_critical_care_outside_ICU` -> `3_ventilatory_support_outside_ICU` # COMMAND ---------- @@ -129,7 +130,7 @@ (03_ECMO_treatment = 1 OR 03_IMV_treatment = 1 OR 03_NIV_treatment = 1) - THEN '03_critical_care_outside_ICU' + THEN '3_ventilatory_support_outside_ICU' WHEN 02_Covid_admission = 1 THEN '2_hospitalised' @@ -259,6 +260,7 @@ # COMMAND ---------- # MAGIC %sql +# MAGIC -- min date(2020-02-20), max date(2020-06-26), min followup_end(2020-05-29), max followup_end(2020-06-26) # MAGIC SELECT min(date), max(date), min(followup_end), max(followup_end) FROM global_temp.ccu013_covid_trajectory_paper_cohort_wave1 # COMMAND ---------- @@ -351,7 +353,7 @@ (03_ECMO_treatment = 1 OR 03_IMV_treatment = 1 OR 03_NIV_treatment = 1) - THEN '03_critical_care_outside_ICU' + THEN '3_ventilatory_support_outside_ICU' WHEN 02_Covid_admission = 1 THEN '2_hospitalised' @@ -465,6 +467,7 @@ # COMMAND ---------- # MAGIC %sql +# MAGIC -- min date(2020-09-30), max date(2021-03-12), min followup_end(2021-02-12), max followup_end(2021-03-12) # MAGIC SELECT min (date), max(date), min(followup_end), max(followup_end) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort_wave2 # COMMAND ---------- @@ -557,7 +560,7 @@ (03_ECMO_treatment = 1 OR 03_IMV_treatment = 1 OR 03_NIV_treatment = 1) - THEN '03_critical_care_outside_ICU' + THEN '3_ventilatory_support_outside_ICU' WHEN 02_Covid_admission = 1 THEN '2_hospitalised' @@ -632,11 +635,13 @@ # COMMAND ---------- # MAGIC %sql +# MAGIC --- 9/7/2021 = 2878573 # MAGIC SELECT COUNT(distinct person_id_deid) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort_wave2 # COMMAND 
---------- # MAGIC %sql +# MAGIC --- 9/7/2021 = 2878573 # MAGIC SELECT COUNT(distinct person_id_deid) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort_survival_wave2 # COMMAND ---------- @@ -661,10 +666,12 @@ # COMMAND ---------- # MAGIC %sql +# MAGIC --- 9/7/2021 = 964 # MAGIC select (SUM(dose2_prior_to_event)) FROM global_temp.ccu013_covid_trajectory_paper_cohort_wave2 # COMMAND ---------- # MAGIC %sql +# MAGIC --- 9/7/2021 = 1117322 # MAGIC SELECT count(distinct person_id_deid) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort_survival_wave2 a # MAGIC LEFT ANTI JOIN dars_nic_391419_j3w9t_collab.ccu013_vaccine_status_temp as b ON a.person_id_deid = b.person_id_deid diff --git a/code/02_study_population/CCU013_18_add_vaccination_status.py b/code/02_study_population/CCU013_18_add_vaccination_status.py index 8c42a8a..dfe8679 100644 --- a/code/02_study_population/CCU013_18_add_vaccination_status.py +++ b/code/02_study_population/CCU013_18_add_vaccination_status.py @@ -16,7 +16,7 @@ # MAGIC # MAGIC **Date last reviewed** # MAGIC -# MAGIC **Date last run** 2021-08-18 +# MAGIC **Date last run** 2022-01-23 # MAGIC # MAGIC **Data input** # MAGIC @@ -33,8 +33,8 @@ from datetime import datetime from pyspark.sql.types import DateType -start_date = '2020-01-01' -end_date = '2021-05-31' # The study end date +start_date = '2020-01-23' +end_date = '2021-11-30' # The study end date # COMMAND ---------- @@ -98,6 +98,13 @@ # COMMAND ---------- +# MAGIC %sql +# MAGIC --- 10/25/21 = obs 1,357,387, ids 1,357,387 +# MAGIC SELECT count(*) as observations, count(distinct person_id_deid) as individuals FROM dars_nic_391419_j3w9t_collab.ccu013_vaccine_status_paper_cohort +# MAGIC WHERE dose2 is not NULL + +# COMMAND ---------- + # MAGIC %md # MAGIC # Compareing outcome in vaccinated vs unvaccinated during wave 2 @@ -117,12 +124,13 @@ # COMMAND ---------- # MAGIC %sql -# MAGIC --- N = 56,609,049 - Total population +# MAGIC --- 9/13/21 N = 
56,609,049 - Total population # MAGIC SELECT count(distinct NHS_NUMBER_DEID) FROM dars_nic_391419_j3w9t_collab.ccu013_dp_skinny_patient_23_01_2020 # COMMAND ---------- # MAGIC %sql +# MAGIC --- 9/13/21 = 383,470 # MAGIC --- Get people who have died before wave2 # MAGIC SELECT count(distinct NHS_NUMBER_DEID) # MAGIC FROM dars_nic_391419_j3w9t_collab.ccu013_dp_skinny_patient_23_01_2020 AS a @@ -133,6 +141,7 @@ # COMMAND ---------- # MAGIC %sql +# MAGIC --- 9/13/21 = 511,742 # MAGIC --- Get all individuals who had a reported COVID-19 event prior to wave 2 start # MAGIC SELECT count(distinct a.person_id_deid) FROM (SELECT DISTINCT NHS_NUMBER_DEID as person_id_deid FROM dars_nic_391419_j3w9t_collab.ccu013_dp_skinny_patient_23_01_2020) as a # MAGIC INNER JOIN (SELECT person_id_deid FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort WHERE date < "2020-09-30") as t @@ -153,7 +162,7 @@ # COMMAND ---------- # MAGIC %sql -# MAGIC --- N = 55,774,208 - individuals WITHOUT a reported COVID-19 event prior to date and or All Alive! +# MAGIC --- 9/13/21; N = 55,774,208 - individuals WITHOUT a reported COVID-19 event prior to date and or All Alive! 
# MAGIC SELECT count(distinct person_id_deid) FROM global_temp.ccu013_no_covid_before_wave2 # COMMAND ---------- @@ -199,7 +208,7 @@ # MAGIC %sql # MAGIC --- count of vaccinated vs unvaccinated -# MAGIC --- Using 14 days as buffer; 9,562,898 | 19,638,856 | 26,572,454 +# MAGIC --- 23/01/21 = Using 14 days as buffer; 9,562,898 | 19,638,856 | 26,572,454 # MAGIC SELECT vaccine_status, count(vaccine_status) as count # MAGIC FROM dars_nic_391419_j3w9t_collab.ccu013_no_covid_before_wave2_vax_status # MAGIC group by vaccine_status @@ -213,6 +222,7 @@ # MAGIC %sql # MAGIC --- Get people who have died before 1st of feb 2021 +# MAGIC --- 9/13/21 = 595,024 # MAGIC SELECT count(distinct NHS_NUMBER_DEID) # MAGIC FROM dars_nic_391419_j3w9t_collab.ccu013_dp_skinny_patient_23_01_2020 AS a # MAGIC RIGHT JOIN (SELECT person_id_deid, min(death_date) as death_date FROM dars_nic_391419_j3w9t_collab.ccu013_tmp_deaths group by person_id_deid) as b @@ -223,6 +233,7 @@ # MAGIC %sql # MAGIC --- Get all individuals who had a reported COVID-19 before 1st of feb 2021 +# MAGIC --- 9/13/21 = 3,181,019 # MAGIC SELECT count(distinct a.person_id_deid) FROM (SELECT DISTINCT NHS_NUMBER_DEID as person_id_deid FROM dars_nic_391419_j3w9t_collab.ccu013_dp_skinny_patient_23_01_2020) as a # MAGIC INNER JOIN (SELECT person_id_deid FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort WHERE date < "2021-02-01") as t # MAGIC ON a.person_id_deid = t.person_id_deid @@ -242,7 +253,7 @@ # COMMAND ---------- # MAGIC %sql -# MAGIC --- N = 52,969,966 - individuals WITHOUT a reported COVID-19 event prior to date and or All Alive! +# MAGIC --- 9/13/21 N = 52,969,966 - individuals WITHOUT a reported COVID-19 event prior to date and or All Alive! 
# MAGIC SELECT count(distinct person_id_deid) FROM global_temp.ccu013_no_covid_before_feb_2021 # COMMAND ---------- @@ -283,7 +294,7 @@ # COMMAND ---------- # MAGIC %sql -# MAGIC --- Using 14 days as buffer; 3,040,532 | 421,811 | 49,507,623 +# MAGIC --- 9/13/21 = Using 14 days as buffer; 3,040,532 | 421,811 | 49,507,623 # MAGIC SELECT vaccine_status, count(vaccine_status) as count # MAGIC FROM dars_nic_391419_j3w9t_collab.ccu013_no_covid_before_feb_2021_vax_status # MAGIC group by vaccine_status @@ -371,6 +382,7 @@ # COMMAND ---------- # MAGIC %sql +# MAGIC --- 9/13/21 = 421811 vaccinated | 49507623 unvaccinated # MAGIC SELECT vaccine_status, count(vaccine_status) FROM dars_nic_391419_j3w9t_collab.ccu013_no_covid_before_feb_2021_vax_status_matching group by vaccine_status # COMMAND ---------- diff --git a/code/02_study_population/CCU013_20_whole_population_demographics_comorbidities.py b/code/02_study_population/CCU013_20_whole_population_demographics_comorbidities.py index aa0791d..b8c5da2 100644 --- a/code/02_study_population/CCU013_20_whole_population_demographics_comorbidities.py +++ b/code/02_study_population/CCU013_20_whole_population_demographics_comorbidities.py @@ -238,9 +238,9 @@ def test(table): # COMMAND ---------- # Params -production_date = "2021-07-29 13:39:04.161949" +production_date = "2022-01-20 14:58:52.353312" cohort_start = '2020-01-23' # For deaths -cohort_end = '2021-05-31' # For deaths MANUALLY CALCULATED FOR NOW across datasets +cohort_end = '2021-11-30' # For deaths MANUALLY CALCULATED FOR NOW across datasets # Population table i.e. 
the denominator for our work population_table = "dars_nic_391419_j3w9t_collab.ccu013_dp_skinny_patient_23_01_2020" @@ -406,3 +406,23 @@ def test(table): # MAGIC then 1 else 0 end) as covid # MAGIC FROM # MAGIC dars_nic_391419_j3w9t_collab.ccu013_paper_table_one_56million_denominator + +# COMMAND ---------- + +# MAGIC %sql +# MAGIC SELECT +# MAGIC COUNT(distinct person_id_deid) as population, +# MAGIC SUM(CASE WHEN severity = '0_positive' +# MAGIC OR severity = '1_gp' +# MAGIC OR severity = '2_hospitalised' +# MAGIC OR severity = '3_critical_care' +# MAGIC OR severity = '4_death_only' +# MAGIC then 1 else 0 end) as covid, +# MAGIC round(SUM(CASE WHEN severity = '0_positive' +# MAGIC OR severity = '1_gp' +# MAGIC OR severity = '2_hospitalised' +# MAGIC OR severity = '3_critical_care' +# MAGIC OR severity = '4_death_only' +# MAGIC then 1 else 0 end) / COUNT(distinct person_id_deid) * 100, 2) as percent_covid +# MAGIC FROM +# MAGIC dars_nic_391419_j3w9t_collab.ccu013_paper_table_one_56million_denominator diff --git a/code/03_analysis_databricks/Additional_numbers_for_paper.sql b/code/03_analysis_databricks/Additional_numbers_for_paper.sql index 625c1ed..cee97a5 100644 --- a/code/03_analysis_databricks/Additional_numbers_for_paper.sql +++ b/code/03_analysis_databricks/Additional_numbers_for_paper.sql @@ -1,7 +1,32 @@ -- Databricks notebook source -- MAGIC %md --- MAGIC # Chris's numbers --- MAGIC I'm going to try and lay this out in a structure mirroring the paper +-- MAGIC # Numbers for manuscript +-- MAGIC +-- MAGIC **Description** +-- MAGIC +-- MAGIC This notebook runs a list of `SQL` queries to extract the numbers (%) of each text entry in the `CCU013: COVID-19 Event Phenotypes` manuscript *"Understanding COVID-19 trajectories from a nationwide linked electronic health record cohort of 56 million people: phenotypes, severity, waves & vaccination"*. +-- MAGIC
+-- MAGIC The layout will follow that of the paper. +-- MAGIC +-- MAGIC +-- MAGIC **Project(s)** CCU013 +-- MAGIC +-- MAGIC **Author(s)** Chris Tomlinson +-- MAGIC +-- MAGIC **Reviewer(s)** +-- MAGIC +-- MAGIC **Date last updated** 2022-01-24 +-- MAGIC +-- MAGIC **Date last reviewed** *NA* +-- MAGIC +-- MAGIC **Date last run** `1/23/2022, 7:41:23 PM` +-- MAGIC +-- MAGIC ** Last export requested ** `1/23/2022` +-- MAGIC +-- MAGIC **Data input** +-- MAGIC * `ccu013_covid_trajectory_paper_cohort` +-- MAGIC * `ccu013_covid_events_paper_cohort` +-- MAGIC * `ccu013_paper_table_one_56million_denominator` -- COMMAND ---------- @@ -10,11 +35,95 @@ -- COMMAND ---------- +-- MAGIC %md +-- MAGIC > We identified X infected individuals (%) + +-- COMMAND ---------- + +-- MAGIC %sql +-- MAGIC SELECT +-- MAGIC COUNT(distinct person_id_deid) as population, +-- MAGIC SUM(CASE WHEN severity != 'no_covid' then 1 else 0 end) as covid, +-- MAGIC round(SUM(CASE WHEN severity != 'no_covid' then 1 else 0 end) / COUNT(distinct person_id_deid) * 100, 2) as percent_covid +-- MAGIC FROM +-- MAGIC dars_nic_391419_j3w9t_collab.ccu013_paper_table_one_56million_denominator + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC > with X recorded COVID-19 phenotypes + +-- COMMAND ---------- + +SELECT COUNT(*) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC > Of these, X (%) were hospitalised and Y (%) died. + +-- COMMAND ---------- + +SELECT + SUM(02_Covid_admission) as hospitalised, + round(SUM(02_Covid_admission)/COUNT(*)*100,2) as hospitalised_percent, + SUM(death) as died, + round(SUM(death)/COUNT(*)*100,2) as died_percent +FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_paper_cohort + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC > Of those hospitalised, X (%) were admitted to intensive care (ICU), Y (%) received non-invasive ventilation and Z (%) invasive ventilation. 
+ +-- COMMAND ---------- + +SELECT + SUM(03_ICU_admission) as ICU, + round(SUM(03_ICU_admission)/(SELECT COUNT(*) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_paper_cohort WHERE 02_Covid_admission = 1)*100,2) as ICU_percent, + SUM(03_NIV_treatment) as NIV, + round(SUM(03_NIV_treatment)/(SELECT COUNT(*) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_paper_cohort WHERE 02_Covid_admission = 1)*100,2) as NIV_percent, + SUM(03_IMV_treatment) as IMV, + round(SUM(03_IMV_treatment)/(SELECT COUNT(*) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_paper_cohort WHERE 02_Covid_admission = 1)*100,2) as IMV_percent +FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_paper_cohort + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC > X (%) COVID-19 related deaths occurred without diagnoses on the death certificate, but within 30 days of a positive test while Y (%) of cases were identified from mortality data alone with no prior phenotypes recorded. + +-- COMMAND ---------- + +SELECT + COUNT(*) as deaths, + SUM(04_Fatal_without_covid_diagnosis) as fatal_noDX, + round(SUM(04_Fatal_without_covid_diagnosis)/COUNT(*)*100,2) as fatal_noDX_percent +FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_paper_cohort +WHERE death = 1 + +-- COMMAND ---------- + +SELECT + SUM(CASE WHEN severity != 'no_covid' then 1 else 0 end) as covid, + SUM(death_covid) as death_covid, + SUM(CASE WHEN severity = '4_death_only' then 1 else 0 end) as death_only, + round(SUM(CASE WHEN severity = '4_death_only' then 1 else 0 end) /SUM(death_covid)*100,2) as death_only_percent_deaths, + round(SUM(CASE WHEN severity = '4_death_only' then 1 else 0 end) /SUM(CASE WHEN severity != 'no_covid' then 1 else 0 end)*100,2) as death_only_percent_cases +FROM + dars_nic_391419_j3w9t_collab.ccu013_paper_table_one_56million_denominator + +-- COMMAND ---------- + -- MAGIC %md -- MAGIC ## Methods -- COMMAND ---------- +-- MAGIC %md +-- MAGIC > Data cleaning, exploratory analysis, phenotype creation and 
cohort assembly was performed using Python (3.7) and Spark SQL (X) on Databricks Runtime 6.4 for Machine Learning. + +-- COMMAND ---------- + -- MAGIC %py -- MAGIC from pyspark import version -- MAGIC # Spark SQL version @@ -22,6 +131,11 @@ -- COMMAND ---------- +-- MAGIC %md +-- MAGIC > We assessed X previously described comorbidities, across 16 clinical specialities / organ systems, using validated CALIBER phenotypes + +-- COMMAND ---------- + SELECT COUNT(distinct phenotype) FROM @@ -107,7 +221,7 @@ GROUP BY -- MAGIC %md -- MAGIC ### Denominator --- MAGIC > Among 56,609,049 individuals registered with a general practitioner in England and alive on 23rd January 2020, +-- MAGIC > Among X individuals registered with a general practitioner in England and alive on 23rd January 2020, -- COMMAND ---------- @@ -120,7 +234,7 @@ FROM -- MAGIC %md -- MAGIC ### Individuals & Events + IR --- MAGIC > we identified 8,825,738 COVID-19 events in 3,469,528 individuals, representing an infection rate of 6.1%, +-- MAGIC > we identified X events in Y individuals, representing an infection rate of Z%, -- COMMAND ---------- @@ -139,16 +253,72 @@ FROM -- COMMAND ---------- SELECT - SUM(01_Covid_positive_test), - SUM(01_GP_covid_diagnosis), - SUM(02_Covid_admission), - SUM(03_NIV_treatment), - SUM(03_IMV_treatment), - SUM(03_ICU_admission), - SUM(03_ECMO_treatment), - SUM(04_Covid_inpatient_death), - SUM(04_Fatal_with_covid_diagnosis), - SUM(04_Fatal_without_covid_diagnosis) + "01_Covid_positive_test" as phenotype, + SUM(01_Covid_positive_test) as n, + round(SUM(01_Covid_positive_test)/COUNT(*)*100,2) as percentage +FROM + dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort +UNION ALL +SELECT + "01_GP_covid_diagnosis" as phenotype, + SUM(01_GP_covid_diagnosis) as n, + round(SUM(01_GP_covid_diagnosis)/COUNT(*)*100,2) as percentage +FROM + dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort +UNION ALL +SELECT + "02_Covid_admission" as 
phenotype, + SUM(02_Covid_admission) as n, + round(SUM(02_Covid_admission)/COUNT(*)*100,2) as percentage +FROM + dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort +UNION ALL +SELECT + "03_NIV_treatment" as phenotype, + SUM(03_NIV_treatment) as n, + round(SUM(03_NIV_treatment)/COUNT(*)*100,2) as percentage +FROM + dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort +UNION ALL +SELECT + "03_IMV_treatment" as phenotype, + SUM(03_IMV_treatment) as n, + round(SUM(03_IMV_treatment)/COUNT(*)*100,2) as percentage +FROM + dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort +UNION ALL +SELECT + "03_ICU_admission" as phenotype, + SUM(03_ICU_admission) as n, + round(SUM(03_ICU_admission)/COUNT(*)*100,2) as percentage +FROM + dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort +UNION ALL +SELECT + "03_ECMO_treatment" as phenotype, + SUM(03_ECMO_treatment) as n, + round(SUM(03_ECMO_treatment)/COUNT(*)*100,2) as percentage +FROM + dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort +UNION ALL +SELECT + "04_Covid_inpatient_death" as phenotype, + SUM(04_Covid_inpatient_death) as n, + round(SUM(04_Covid_inpatient_death)/COUNT(*)*100,2) as percentage +FROM + dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort +UNION ALL +SELECT + "04_Fatal_with_covid_diagnosis" as phenotype, + SUM(04_Fatal_with_covid_diagnosis) as n, + round(SUM(04_Fatal_with_covid_diagnosis)/COUNT(*)*100,2) as percentage +FROM + dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort +UNION ALL +SELECT + "04_Fatal_without_covid_diagnosis" as phenotype, + SUM(04_Fatal_without_covid_diagnosis) as n, + round(SUM(04_Fatal_without_covid_diagnosis)/COUNT(*)*100,2) as percentage FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort @@ -160,7 +330,7 @@ FROM -- COMMAND ---------- SELECT - SUM(CASE WHEN (03_NIV_treatment = 1 OR 03_IMV_treatment 
= 1 OR 03_ICU_admission = 1 OR 03_ECMO_treatment = 1) then 1 else 0 end) as n_critical_care, + SUM(CASE WHEN (03_NIV_treatment = 1 OR 03_IMV_treatment = 1 OR 03_ICU_admission = 1 OR 03_ECMO_treatment = 1) then 1 else 0 end) as n_ventilatory_support, SUM(CASE WHEN (04_Covid_inpatient_death = 1 OR 04_Fatal_with_covid_diagnosis = 1 OR 04_Fatal_without_covid_diagnosis = 1) then 1 else 0 end) as n_deaths, SUM(death) FROM @@ -172,7 +342,7 @@ FROM -- MAGIC ### Mutex Severities -- MAGIC These are mutually exclusive phenotypes representing the **worst/most severe** COVID-19 events experienced by each individual patient -- MAGIC --- MAGIC > Most individuals with COVID-19 event(s), (3094860 89.2%) avoided hospitalisation or death related to COVID-19. +-- MAGIC > Most individuals with COVID-19 event(s), (n=X, %) avoided hospitalisation or death related to COVID-19. -- COMMAND ---------- @@ -203,7 +373,7 @@ ORDER BY -- MAGIC %md -- MAGIC ### Ventilation --- MAGIC > Of those admitted to hospital, 52,672 (15%) received NIV, 37,620 (11%) were admitted to an ICU, 20,720 (6%) received IMV, 17,108 (4.8%) patients received both NIV and IMV and 534 received ECMO. +-- MAGIC > Of those admitted to hospital, X (%) received NIV, Y (%) were admitted to an ICU, Z (%) received IMV, A (%) patients received both NIV and IMV and B received ECMO. 
-- COMMAND ---------- @@ -231,8 +401,8 @@ FROM -- COMMAND ---------- SELECT - SUM(CASE WHEN 03_NIV_treatment = 1 then 1 else 0 end) as NIV_n, - SUM(CASE WHEN 03_IMV_treatment = 1 then 1 else 0 end) as IMV_n, + SUM(03_NIV_treatment) as NIV_n, + SUM(03_IMV_treatment) as IMV_n, SUM(CASE WHEN 03_NIV_treatment = 1 AND 03_IMV_treatment = 1 then 1 else 0 end) as both_n FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort @@ -246,8 +416,12 @@ FROM -- Outside ICU SELECT - SUM(CASE WHEN 03_NIV_treatment = 1 then 1 else 0 end) as NIV_n, - SUM(CASE WHEN 03_IMV_treatment = 1 then 1 else 0 end) as IMV_n + SUM(03_NIV_treatment) as NIV_n, + ROUND(SUM(03_NIV_treatment) / + (SELECT SUM(03_NIV_treatment) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort) * 100, 2) as NIV_percent_outICU, + SUM(CASE WHEN 03_IMV_treatment = 1 then 1 else 0 end) as IMV_n, + ROUND(SUM(03_IMV_treatment) / + (SELECT SUM(03_IMV_treatment) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort) * 100, 2) as IMV_percent_outICU FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort WHERE @@ -256,8 +430,80 @@ WHERE -- COMMAND ---------- -- MAGIC %md --- MAGIC ## Deaths: pathways to COVID-19 mortality --- MAGIC > Of the 138,762 individuals with a COVID-19 related death, 39,510 (28%) died without having ever been admitted to hospital and 13,083 (9%) died within 28-days of a COVID-19 event without a confirmed or suspected COVID-19 diagnosis listed on the death certificate. 
+-- MAGIC ## Mortality +-- MAGIC > X individuals died, representing a mortality rate of Y% + +-- COMMAND ---------- + +SELECT + SUM(death) as deaths_total, + round(SUM(death)/COUNT(*)*100,2) as mortality +FROM + dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC > Of these deaths, the majority occurred in patients who were hospitalised (%, n), however n (%) died without having ever been admitted to hospital. + +-- COMMAND ---------- + +SELECT + COUNT(*) as deaths_total, + SUM(02_Covid_admission) as deaths_hospital, + ROUND( SUM(02_Covid_admission)/COUNT(*)*100,2) as deaths_hospital_percent, + SUM(CASE WHEN 02_Covid_admission = 0 then 1 else 0 end) as deaths_NO_hospital_contact, + ROUND(SUM(CASE WHEN 02_Covid_admission = 0 then 1 else 0 end)/COUNT(*)*100,2) as deaths_NO_hospital_contact_percent +FROM + dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort +WHERE + death = 1 + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC ### Unheralded deaths +-- MAGIC > Notably we identified X unheralded COVID-19 deaths - individuals who died with COVID-19 as a recorded cause, but for whom no other COVID-19 phenotypes, such as positive tests or primary care diagnosis, were identified. 
+ +-- COMMAND ---------- + +-- See Table 1 main manuscript +SELECT + COUNT(*), + round(SUM(CASE WHEN age > 70 then 1 else 0 end)/COUNT(*)*100,2) as over70, + round(SUM(CASE WHEN ethnic_group == "White" then 1 else 0 end)/COUNT(*)*100,2) as white, + mean(multimorbidity) +FROM + dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort +WHERE + 04_Fatal_with_covid_diagnosis = 1 +AND + 01_Covid_positive_test = 0 +AND + 01_GP_covid_diagnosis = 0 +AND + 02_Covid_admission = 0 +AND + ventilatory_support = 0 + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC > X individuals died within 28-days of a COVID-19 event without a confirmed or suspected COVID-19 diagnosis listed on the death certificate. + +-- COMMAND ---------- + +SELECT + COUNT(*) +FROM + dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort +WHERE + 04_Fatal_without_covid_diagnosis = 1 + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC > Of the X individuals with a COVID-19 related death, Y (%) died without having ever been admitted to hospital and Z (%) died within 28-days of a COVID-19 event without a confirmed or suspected COVID-19 diagnosis listed on the death certificate. 
-- COMMAND ---------- @@ -290,7 +536,7 @@ FROM -- COMMAND ---------- SELECT - SUM(CASE WHEN (03_NIV_treatment = 1 OR 03_IMV_treatment = 1 OR 03_ICU_admission = 1 OR 03_ECMO_treatment = 1) AND death = 1 then 1 else 0 end) as deaths_critical_care, + SUM(CASE WHEN (03_NIV_treatment = 1 OR 03_IMV_treatment = 1 OR 03_ICU_admission = 1 OR 03_ECMO_treatment = 1) AND death = 1 then 1 else 0 end) as deaths_ventilatory_support, SUM(CASE WHEN 03_ICU_admission = 1 AND death = 1 then 1 else 0 end) as deaths_ICU FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort @@ -298,16 +544,10 @@ FROM -- COMMAND ---------- -- MAGIC %md --- MAGIC ### Patient characteristics - --- COMMAND ---------- - --- MAGIC %md --- MAGIC #### High Risk patients --- MAGIC > Amongst the 4,014,314 individuals classified as ?high risk?, 377,630 (9.4%) experienced COVID-19 and 40,286 died, a mortality rate of 11%. --- MAGIC --- MAGIC Now.. --- MAGIC > Amongst the 4,071,794 individuals classified as ?high risk?, 381,497 (9.4%) experienced COVID-19 and 41,446 died, a mortality rate of 11%. +-- MAGIC ### High Risk patients +-- MAGIC > Amongst the X individuals classified as 'high risk', Y (%) experienced COVID-19 and Z died, a mortality rate of A%. 
+-- MAGIC +-- MAGIC * Note significant change in results since preprint (ending May) due to vaccination/transmission -- COMMAND ---------- @@ -320,55 +560,22 @@ FROM -- COMMAND ---------- SELECT - SUM(CASE WHEN high_risk = 1 then 1 else 0 end) as total_high_risk, - SUM(CASE WHEN high_risk = 1 AND severity != 'no_covid' then 1 else 0 end) as infected, - SUM(CASE WHEN high_risk = 1 AND severity != 'no_covid' AND death_covid = 1 then 1 else 0 end) as died + COUNT(*) as total_high_risk, + SUM(CASE WHEN severity != 'no_covid' then 1 else 0 end) as infected, + round(SUM(CASE WHEN severity != 'no_covid' then 1 else 0 end) / COUNT(*) *100, 2) as infection_rate, + SUM(CASE WHEN severity != 'no_covid' AND death_covid = 1 then 1 else 0 end) as died, + round(SUM(CASE WHEN severity != 'no_covid' AND death_covid = 1 then 1 else 0 end) / SUM(CASE WHEN severity != 'no_covid' then 1 else 0 end) *100, 2) as mortality FROM dars_nic_391419_j3w9t_collab.ccu013_paper_table_one_56million_denominator - --- COMMAND ---------- - ---- TOTAL N of individuals tagged with 'High RIsk' -SELECT - ( --- TOTAL N of COVID-event individuals who were high-risk - SELECT - COUNT(distinct person_id_deid) - FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort - WHERE high_risk = 1 - ) as got_covid, - - ( --- DIED - SELECT - COUNT(distinct person_id_deid) - FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort - WHERE high_risk = 1 - AND (04_Covid_inpatient_death = 1 OR 04_Fatal_with_covid_diagnosis = 1 OR 04_Fatal_without_covid_diagnosis = 1) - ) as died, - - ( --- Calculate percentage DIED of THOSE WHO GOT COVID = mortality rate - ( --- DIED - SELECT - COUNT(distinct person_id_deid) - FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort - WHERE high_risk = 1 - AND (04_Covid_inpatient_death = 1 OR 04_Fatal_with_covid_diagnosis = 1 OR 04_Fatal_without_covid_diagnosis = 1) - ) - / - ( - SELECT - COUNT(distinct person_id_deid) - 
FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort - WHERE high_risk = 1 - ) - * 100 - ) as mortality_rate +WHERE + high_risk = 1 -- COMMAND ---------- -- MAGIC %md --- MAGIC ## Death (composite) in hospitalised & ICU +-- MAGIC ## Mortality in hospitalised patients -- MAGIC > --- MAGIC The composite of COVID-19 mortality (including deaths with a recorded diagnosis of COVID-19, deaths within 28 days of a positive test and deaths during a COVID-19 hospital admission) occurred in 28% (99,041 deaths) of hospitalised patients and 41% (15,369 deaths) for those admitted to intensive care. +-- MAGIC The composite of COVID-19 mortality (including deaths with a recorded diagnosis of COVID-19, deaths within 28 days of a positive test and deaths during a COVID-19 hospital admission) occurred in X% (n deaths) of hospitalised patients and Y% (n deaths) for those admitted to intensive care. -- COMMAND ---------- @@ -394,17 +601,17 @@ SELECT SUM(CASE WHEN (death = 1 and 02_Covid_admission = 1) then 1 else 0 end) as hosp_deaths, ROUND(SUM(CASE WHEN (death = 1 and 02_Covid_admission = 1) then 1 else 0 end)/SUM(02_Covid_admission)*100,2) as hosp_mortality, - SUM(CASE WHEN (death = 1 and 02_Covid_admission = 1 and critical_care = 0) then 1 else 0 end) as hosp_noCC_deaths, - ROUND(SUM(CASE WHEN (death = 1 and 02_Covid_admission = 1 and critical_care = 0) then 1 else 0 end)/SUM(CASE WHEN (02_Covid_admission = 1 and critical_care = 0) then 1 else 0 end) *100,2) as hosp_noCC_mortality, + SUM(CASE WHEN (death = 1 and 02_Covid_admission = 1 and ventilatory_support = 0) then 1 else 0 end) as hosp_noCC_deaths, + ROUND(SUM(CASE WHEN (death = 1 and 02_Covid_admission = 1 and ventilatory_support = 0) then 1 else 0 end)/SUM(CASE WHEN (02_Covid_admission = 1 and ventilatory_support = 0) then 1 else 0 end) *100,2) as hosp_noCC_mortality, - SUM(CASE WHEN (death = 1 and critical_care = 1) then 1 else 0 end) as cc_deaths, - ROUND(SUM(CASE WHEN (death = 1 and critical_care 
= 1) then 1 else 0 end)/SUM(critical_care)*100,2) as cc_mortality, + SUM(CASE WHEN (death = 1 and ventilatory_support = 1) then 1 else 0 end) as cc_deaths, + ROUND(SUM(CASE WHEN (death = 1 and ventilatory_support = 1) then 1 else 0 end)/SUM(ventilatory_support)*100,2) as cc_mortality, SUM(CASE WHEN (death = 1 and 03_ICU_admission = 1) then 1 else 0 end) as icu_deaths, ROUND(SUM(CASE WHEN (death = 1 and 03_ICU_admission = 1) then 1 else 0 end)/SUM(03_ICU_admission)*100,2) as icu_mortality, - SUM(CASE WHEN (death = 1 and critical_care = 1 and 03_ICU_admission = 0) then 1 else 0 end) as cc_outofICU_deaths, - ROUND(SUM(CASE WHEN (death = 1 and critical_care = 1 and 03_ICU_admission = 0) then 1 else 0 end)/SUM(CASE WHEN (critical_care = 1 and 03_ICU_admission = 0) then 1 else 0 end) *100,2) as cc_outofICU_mortality + SUM(CASE WHEN (death = 1 and ventilatory_support = 1 and 03_ICU_admission = 0) then 1 else 0 end) as cc_outofICU_deaths, + ROUND(SUM(CASE WHEN (death = 1 and ventilatory_support = 1 and 03_ICU_admission = 0) then 1 else 0 end)/SUM(CASE WHEN (ventilatory_support = 1 and 03_ICU_admission = 0) then 1 else 0 end) *100,2) as cc_outofICU_mortality FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort @@ -420,17 +627,17 @@ SELECT SUM(CASE WHEN (death = 1 and 02_Covid_admission = 1) then 1 else 0 end) as hosp_deaths, ROUND(SUM(CASE WHEN (death = 1 and 02_Covid_admission = 1) then 1 else 0 end)/SUM(02_Covid_admission)*100,2) as hosp_mortality, - SUM(CASE WHEN (death = 1 and 02_Covid_admission = 1 and critical_care = 0) then 1 else 0 end) as hosp_noCC_deaths, - ROUND(SUM(CASE WHEN (death = 1 and 02_Covid_admission = 1 and critical_care = 0) then 1 else 0 end)/SUM(CASE WHEN (02_Covid_admission = 1 and critical_care = 0) then 1 else 0 end) *100,2) as hosp_noCC_mortality, + SUM(CASE WHEN (death = 1 and 02_Covid_admission = 1 and ventilatory_support = 0) then 1 else 0 end) as hosp_noCC_deaths, + ROUND(SUM(CASE WHEN (death = 1 and 
02_Covid_admission = 1 and ventilatory_support = 0) then 1 else 0 end)/SUM(CASE WHEN (02_Covid_admission = 1 and ventilatory_support = 0) then 1 else 0 end) *100,2) as hosp_noCC_mortality, - SUM(CASE WHEN (death = 1 and critical_care = 1) then 1 else 0 end) as cc_deaths, - ROUND(SUM(CASE WHEN (death = 1 and critical_care = 1) then 1 else 0 end)/SUM(critical_care)*100,2) as cc_mortality, + SUM(CASE WHEN (death = 1 and ventilatory_support = 1) then 1 else 0 end) as cc_deaths, + ROUND(SUM(CASE WHEN (death = 1 and ventilatory_support = 1) then 1 else 0 end)/SUM(ventilatory_support)*100,2) as cc_mortality, SUM(CASE WHEN (death = 1 and 03_ICU_admission = 1) then 1 else 0 end) as icu_deaths, ROUND(SUM(CASE WHEN (death = 1 and 03_ICU_admission = 1) then 1 else 0 end)/SUM(03_ICU_admission)*100,2) as icu_mortality, - SUM(CASE WHEN (death = 1 and critical_care = 1 and 03_ICU_admission = 0) then 1 else 0 end) as cc_outofICU_deaths, - ROUND(SUM(CASE WHEN (death = 1 and critical_care = 1 and 03_ICU_admission = 0) then 1 else 0 end)/SUM(CASE WHEN (critical_care = 1 and 03_ICU_admission = 0) then 1 else 0 end) *100,2) as cc_outofICU_mortality + SUM(CASE WHEN (death = 1 and ventilatory_support = 1 and 03_ICU_admission = 0) then 1 else 0 end) as cc_outofICU_deaths, + ROUND(SUM(CASE WHEN (death = 1 and ventilatory_support = 1 and 03_ICU_admission = 0) then 1 else 0 end)/SUM(CASE WHEN (ventilatory_support = 1 and 03_ICU_admission = 0) then 1 else 0 end) *100,2) as cc_outofICU_mortality FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort WHERE @@ -447,17 +654,17 @@ SELECT SUM(CASE WHEN (death = 1 and 02_Covid_admission = 1) then 1 else 0 end) as hosp_deaths, ROUND(SUM(CASE WHEN (death = 1 and 02_Covid_admission = 1) then 1 else 0 end)/SUM(02_Covid_admission)*100,2) as hosp_mortality, - SUM(CASE WHEN (death = 1 and 02_Covid_admission = 1 and critical_care = 0) then 1 else 0 end) as hosp_noCC_deaths, - ROUND(SUM(CASE WHEN (death = 1 and 02_Covid_admission = 1 
and critical_care = 0) then 1 else 0 end)/SUM(CASE WHEN (02_Covid_admission = 1 and critical_care = 0) then 1 else 0 end) *100,2) as hosp_noCC_mortality, + SUM(CASE WHEN (death = 1 and 02_Covid_admission = 1 and ventilatory_support = 0) then 1 else 0 end) as hosp_noCC_deaths, + ROUND(SUM(CASE WHEN (death = 1 and 02_Covid_admission = 1 and ventilatory_support = 0) then 1 else 0 end)/SUM(CASE WHEN (02_Covid_admission = 1 and ventilatory_support = 0) then 1 else 0 end) *100,2) as hosp_noCC_mortality, - SUM(CASE WHEN (death = 1 and critical_care = 1) then 1 else 0 end) as cc_deaths, - ROUND(SUM(CASE WHEN (death = 1 and critical_care = 1) then 1 else 0 end)/SUM(critical_care)*100,2) as cc_mortality, + SUM(CASE WHEN (death = 1 and ventilatory_support = 1) then 1 else 0 end) as cc_deaths, + ROUND(SUM(CASE WHEN (death = 1 and ventilatory_support = 1) then 1 else 0 end)/SUM(ventilatory_support)*100,2) as cc_mortality, SUM(CASE WHEN (death = 1 and 03_ICU_admission = 1) then 1 else 0 end) as icu_deaths, ROUND(SUM(CASE WHEN (death = 1 and 03_ICU_admission = 1) then 1 else 0 end)/SUM(03_ICU_admission)*100,2) as icu_mortality, - SUM(CASE WHEN (death = 1 and critical_care = 1 and 03_ICU_admission = 0) then 1 else 0 end) as cc_outofICU_deaths, - ROUND(SUM(CASE WHEN (death = 1 and critical_care = 1 and 03_ICU_admission = 0) then 1 else 0 end)/SUM(CASE WHEN (critical_care = 1 and 03_ICU_admission = 0) then 1 else 0 end) *100,2) as cc_outofICU_mortality + SUM(CASE WHEN (death = 1 and ventilatory_support = 1 and 03_ICU_admission = 0) then 1 else 0 end) as cc_outofICU_deaths, + ROUND(SUM(CASE WHEN (death = 1 and ventilatory_support = 1 and 03_ICU_admission = 0) then 1 else 0 end)/SUM(CASE WHEN (ventilatory_support = 1 and 03_ICU_admission = 0) then 1 else 0 end) *100,2) as cc_outofICU_mortality FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort WHERE @@ -465,6 +672,124 @@ WHERE -- COMMAND ---------- +-- MAGIC %md +-- MAGIC ## Recording patterns across 
sources +-- MAGIC > Approximately X% of individuals with a positive test also received a primary care diagnosis, while N (%) had a positive test but no other record. + +-- COMMAND ---------- + +SELECT + SUM(01_Covid_positive_test) as positive_tests, + + SUM(CASE WHEN 01_Covid_positive_test = 1 and 01_GP_covid_diagnosis = 1 then 1 else 0 end) as positive_and_GP_Dx_n, + round(SUM(CASE WHEN 01_Covid_positive_test = 1 and 01_GP_covid_diagnosis = 1 then 1 else 0 end) / SUM(01_Covid_positive_test)*100,2) as positive_and_GP_Dx_percent, + + SUM(CASE WHEN 01_Covid_positive_test = 1 and + 01_GP_covid_diagnosis = 0 and + 02_Covid_admission = 0 and + 03_ICU_admission = 0 and + 03_NIV_treatment = 0 and + 03_IMV_treatment = 0 and + 03_ECMO_treatment = 0 and + 04_Fatal_with_covid_diagnosis = 0 and + 04_Fatal_without_covid_diagnosis = 0 and + 04_Covid_inpatient_death = 0 then 1 else 0 end) as positive_only_n, + round(SUM(CASE WHEN 01_Covid_positive_test = 1 and + 01_GP_covid_diagnosis = 0 and + 02_Covid_admission = 0 and + ventilatory_support = 0 and + death = 0 then 1 else 0 end) / SUM(01_Covid_positive_test)*100,2) as positive_only_percent +FROM + dars_nic_391419_j3w9t_collab.ccu013_covid_events_paper_cohort + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC > X% with a primary care record had no other evidence of COVID-19, as did X% with a secondary care record, and X COVID-19 cases were identified exclusively from mortality data with no prior COVID-19 events (Figure 2). A small number of individuals were identified only from PHE hospital surveillance data (CHESS, X individuals, X% of all hospitalisations). See Supplementary Figure 2, for further details on data source overlap. 
+
+-- COMMAND ----------
+
+SELECT
+  SUM(01_GP_covid_diagnosis) as GP_Dx_n,  -- individuals with a primary care (GP) COVID-19 diagnosis
+
+  SUM(CASE WHEN 01_GP_covid_diagnosis = 1 and
+    01_Covid_positive_test = 0 and
+    02_Covid_admission = 0 and
+    ventilatory_support = 0 and
+    death = 0 then 1 else 0 end) as GP_only_n,  -- GP diagnosis and no other recorded event
+  round(SUM(CASE WHEN 01_GP_covid_diagnosis = 1 and
+    01_Covid_positive_test = 0 and
+    02_Covid_admission = 0 and
+    ventilatory_support = 0 and
+    death = 0 then 1 else 0 end) / SUM(01_GP_covid_diagnosis)*100,2) as GP_only_percent,  -- % of all GP-diagnosed individuals
+
+  SUM(02_Covid_admission) as Hosp_n,  -- individuals with a COVID-19 hospital admission
+
+  SUM(CASE WHEN 02_Covid_admission = 1 and
+    01_Covid_positive_test = 0 and
+    01_GP_covid_diagnosis = 0 and
+    -- ventilatory_support is deliberately not excluded here: it is itself a secondary care record
+    death = 0 then 1 else 0 end) as Hosp_only_n,  -- admission with no test, GP diagnosis or death record
+  round(SUM(CASE WHEN 02_Covid_admission = 1 and
+    01_Covid_positive_test = 0 and
+    01_GP_covid_diagnosis = 0 and
+    death = 0 then 1 else 0 end) / SUM(02_Covid_admission)*100,2) as Hosp_only_percent  -- % of all hospitalised individuals
+
+FROM
+  dars_nic_391419_j3w9t_collab.ccu013_covid_events_paper_cohort
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC Alternative approach. **NB: requires running notebook `upset_datagen` to update table `ccu013_01_paper_upset` first.**
+-- MAGIC
+-- MAGIC **This takes a while!**
+ +-- COMMAND ---------- + +-- MAGIC %run /Workspaces/dars_nic_391419_j3w9t_collab/CCU013/COVID-19-SEVERITY-PHENOTYPING/03_analysis_databricks/upset_datagen + +-- COMMAND ---------- + +-- Alternative approach using upset table (run that first) +SELECT + COUNT(distinct person_id_deid) +FROM + dars_nic_391419_j3w9t_collab.ccu013_01_paper_upset +WHERE + SGSS = 1 +AND CHESS = 0 AND HES_APC = 0 AND HES_CC = 0 and GDPPR = 0 and SUS = 0 and deaths = 0 + +-- COMMAND ---------- + +-- Alternative approach using upset table (run that first) +SELECT + COUNT(distinct person_id_deid) +FROM + dars_nic_391419_j3w9t_collab.ccu013_01_paper_upset +WHERE + GDPPR = 1 +AND CHESS = 0 AND HES_APC = 0 AND HES_CC = 0 and SGSS = 0 and SUS = 0 and deaths = 0 + +-- COMMAND ---------- + +-- MAGIC %md +-- MAGIC > A small number of individuals were identified only from PHE hospital surveillance data (CHESS, X individuals, X% of all hospitalisations) + +-- COMMAND ---------- + +SELECT + COUNT(distinct person_id_deid) as individuals, + round(COUNT(*) / + (SELECT COUNT(*) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort WHERE 02_Covid_admission = 1)*100,2) as percentage_hospitalisations +FROM + dars_nic_391419_j3w9t_collab.ccu013_01_paper_upset +WHERE + CHESS = 1 +AND GDPPR = 0 AND HES_APC = 0 AND HES_CC = 0 and SGSS = 0 and SUS = 0 and deaths = 0 + +-- COMMAND ---------- + -- MAGIC %md -- MAGIC # Discussion @@ -472,7 +797,7 @@ WHERE -- MAGIC %md -- MAGIC ### Insights from linkage --- MAGIC > we identified 21,558 individuals who received NIV outside of ICU, 40% of all patients treated with NIV, +-- MAGIC > we identified X individuals who received NIV outside of ICU, X% of all patients treated with NIV, -- COMMAND ---------- @@ -534,11 +859,6 @@ FROM -- COMMAND ---------- --- MAGIC %py --- MAGIC 190750/56600000*100 - --- COMMAND ---------- - SELECT SUM(CASE WHEN date_first >= "2020-09-30" AND date_first <= "2021-02-12" then 1 else 0 end) as wave2_all, SUM(CASE WHEN 
date_first >= "2020-09-30" AND date_first <= "2021-02-12" AND 01_Covid_positive_test = 1 then 1 else 0 end) as wave2_test, diff --git a/code/03_analysis_databricks/Fig1-phenotypes_events_individuals_severity.sql b/code/03_analysis_databricks/Fig1-phenotypes_events_individuals_severity.sql index 8f4f170..c5cdd4c 100644 --- a/code/03_analysis_databricks/Fig1-phenotypes_events_individuals_severity.sql +++ b/code/03_analysis_databricks/Fig1-phenotypes_events_individuals_severity.sql @@ -1,19 +1,44 @@ -- Databricks notebook source -- MAGIC %md --- MAGIC Runs a series of queries to produce the numbers for: +-- MAGIC # Numbers for manuscript +-- MAGIC +-- MAGIC **Description** +-- MAGIC +-- MAGIC This notebook runs a list of `SQL` queries to extract the numbers (%) for **Figure 1** in the `CCU013: COVID-19 Event Phenotypes` manuscript *"Understanding COVID-19 trajectories from a nationwide linked electronic health record cohort of 56 million people: phenotypes, severity, waves & vaccination"*. +-- MAGIC
+-- MAGIC -- MAGIC > **Figure 1**: Flowchart of phenotyping COVID-19 severity phenotypes using seven linked data sources in 56.6 million people. Information derived from the following data sources: COVID-19 testing from SGSS (Second Generation Surveillance System) Pillars 1 & 2, including test from NHS hospitals for those with a clinical need and healthcare workers (Pillar 1) and swab testing from the wider population (Pillar 2). Primary care EHR diagnosis from GDPPR. Secondary care events from hospitalisation EHR from HES Admitted Patient Care (APC) and Critical Care (CC), SUS (Secondary Uses Service) and CHESS (COVID-19 Hospitalisations in England Surveillance System). Fatal COVID-19 events from national death registrations from the ONS, HES APC and SUS. Sources used to identify each step are indicated with data buckets on the left and COVID-19 events in rectangles on the right. Ventilation support is defined either as Non-Invasive Ventilation (NIV), Invasive Mechanical Ventilation (IMV) or Extracorporeal Membrane Oxygenation (ECMO). HES CC does not give info on ECMO treatments. Fatal COVID-19 events are defined as inpatient deaths registered from HES APC or SUS, or deaths any point in time with COVID-19 recorded as the cause of death (at any position on the death certificate) or within 28-days of the earliest COVID-19 ascertainment event irrespective of the cause of death recorded on the death certificate. In all sources, ontology terms for both suspected and confirmed diagnosis were used. (%) indicate the percentage of individuals with a given COVID-19 event phenotype out of all individuals with any event phenotype. 
+-- MAGIC +-- MAGIC **Project(s)** CCU013 +-- MAGIC +-- MAGIC **Author(s)** Chris Tomlinson +-- MAGIC +-- MAGIC **Reviewer(s)** +-- MAGIC +-- MAGIC **Date last updated** 2022-01-24 +-- MAGIC +-- MAGIC **Date last reviewed** *NA* +-- MAGIC +-- MAGIC **Date last run** `1/23/2022, 7:20:37 PM` +-- MAGIC +-- MAGIC ** Last export requested ** +-- MAGIC +-- MAGIC **Data input** +-- MAGIC * `ccu013_covid_trajectory_paper_cohort` +-- MAGIC * `ccu013_covid_events_paper_cohort` -- COMMAND ---------- -- MAGIC %md --- MAGIC # Phenotype | Events | Individuals +-- MAGIC # Phenotype | Events | Individuals | Percentages -- COMMAND ---------- SELECT covid_phenotype, COUNT(*) as events, - COUNT(distinct person_id_deid) as individuals + COUNT(distinct person_id_deid) as individuals, + round(COUNT(distinct person_id_deid) / (SELECT COUNT(distinct person_id_deid) FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_paper_cohort) *100,2) as percentage FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort GROUP BY diff --git a/code/03_analysis_databricks/Sup_Table_2-Codelists_frequencies.py b/code/03_analysis_databricks/ST2-Codelists_frequencies.py similarity index 96% rename from code/03_analysis_databricks/Sup_Table_2-Codelists_frequencies.py rename to code/03_analysis_databricks/ST2-Codelists_frequencies.py index 3fa70fc..80a974e 100644 --- a/code/03_analysis_databricks/Sup_Table_2-Codelists_frequencies.py +++ b/code/03_analysis_databricks/ST2-Codelists_frequencies.py @@ -18,7 +18,7 @@ # MAGIC # MAGIC **Date last reviewed** *NA* # MAGIC -# MAGIC **Date last run** 2021-10-04 +# MAGIC **Date last run** `1/23/2022, 7:41:23 PM` # MAGIC # MAGIC **Data input** # MAGIC * `ccu013_covid_trajectory_paper_cohort` @@ -54,7 +54,6 @@ clinical_code, code as terminology, description, - -- covid_status, -- Remove as no longer in manuscript source, COUNT(clinical_code) as n FROM diff --git a/code/03_analysis_databricks/ST3-CALIBER_counts.py 
b/code/03_analysis_databricks/ST3-CALIBER_counts.py new file mode 100644 index 0000000..aabe68d --- /dev/null +++ b/code/03_analysis_databricks/ST3-CALIBER_counts.py @@ -0,0 +1,110 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # Supplementary Table 3: CALIBER phenotype frequencies +# MAGIC +# MAGIC **Description** +# MAGIC +# MAGIC This notebook produces the number of distinct individuals with a CALIBER phenotype prior to 01/01/2020, as used when defining comorbidities for `CCU013: COVID-19 Event Phenotypes`. +# MAGIC +# MAGIC The output from these queries produces `Supplement table 3: 269 CALIBER phenotypes, aggregated into 16 categories, and the number of individuals within the study cohort identified from GDPPR (SNOMED-CT) and HES APC (ICD-10, OPCS-4).` within the manuscript `Characterising COVID-19 related events in a nationwide electronic health record cohort of 55.9 million people in England` +# MAGIC +# MAGIC **Project(s)** CCU013 +# MAGIC +# MAGIC **Author(s)** Chris Tomlinson +# MAGIC +# MAGIC **Reviewer(s)** +# MAGIC +# MAGIC **Date last updated** 2022-01-24 +# MAGIC +# MAGIC **Date last reviewed** *NA* +# MAGIC +# MAGIC **Date last run** `1/24/2022, 11:34:40 AM` +# MAGIC +# MAGIC **Last export requested** `1/24/2022` +# MAGIC +# MAGIC **Data input** +# MAGIC * `ccu013_covid_events_paper_cohort` +# MAGIC * `ccu013_caliber_comorbidities_pre2020` +# MAGIC * `ccu013_caliber_category_mapping` +# MAGIC +# MAGIC **Data output** +# MAGIC Export of this notebook. 
+# MAGIC
+# MAGIC **Software and versions** `python`
+# MAGIC
+# MAGIC **Packages and versions** `pyspark`, `databricks.koalas`, `pandas`
+
+# COMMAND ----------
+
+import databricks.koalas as ks
+import pandas as pd
+
+# COMMAND ----------
+
+# COVID-19 events table: supplies the person_id_deid values that define the study cohort
+events_table = "dars_nic_391419_j3w9t_collab.ccu013_covid_events_paper_cohort"
+
+# CALIBER phenotypes table (joined to the cohort on person_id_deid, then summed per column)
+phenos_table = "dars_nic_391419_j3w9t_collab.ccu013_caliber_comorbidities_pre2020"
+
+# COMMAND ----------
+
+patients = spark.sql(f"""SELECT person_id_deid FROM {events_table}""")
+phenos = spark.table(phenos_table)
+
+# Subset to cohort: the left join keeps every cohort member; unmatched phenotype values become 0
+counts = patients.join(phenos, 'person_id_deid', 'left') \
+  .fillna(0) \
+  .drop('person_id_deid')
+
+# Column sums: one total per remaining (phenotype) column
+counts = counts.to_koalas() \
+  .sum(axis=0) \
+  .reset_index()
+
+# Renaming operations prior to join: name the unnamed sum column, then use presentation names
+counts.columns = counts.columns.fillna('count')
+counts = counts.rename(columns={'index': 'Phenotype',
+                                'count': 'Individuals'})
+
+# COMMAND ----------
+
+# Get phenotype-category mapping
+category_dictionary_table = spark.table("dars_nic_391419_j3w9t_collab.ccu013_caliber_category_mapping") \
+  .drop('cat') \
+  .to_koalas() \
+  .rename(columns={'phenotype': 'Phenotype'})
+
+# Apply mapping with merge; sort by category, then by descending count within each category
+df = counts.merge(category_dictionary_table, on='Phenotype', how='left') \
+  .rename(columns={'category': 'Category'}) \
+  .sort_values(by=['Category', 'Individuals'], ascending=[True, False])
+# Mask counts 1-4 as '<5'. Do this last: it turns the counts into strings, which can no longer be sorted numerically. The optional '.0' is defensive, in case the sums are rendered as floats.
+df['Individuals'] = df['Individuals'].astype('str').str.replace(r'^[1-4](?:\.0)?$', '<5', regex=True)
+# Process text
+df = df.to_pandas()
+df['Category'] = df['Category'].str.capitalize()
+df['Phenotype'] = df['Phenotype'].str.capitalize()
+df = df.replace(regex='_', value=" ")
+# Manual corrections: restore acronyms and wording lost by capitalize() and underscore removal
+df = df.replace({'Benign neoplasm cin': 'Benign neoplasm/CIN',
+                 'Hiv': 'HIV',
+                 'Bacterial diseases excl tb': 'Bacterial diseases excl TB',
+                 'Copd': 'COPD',
+                 'Vitamin b12 deficiency anaemia': 'Vitamin B12 deficiency anaemia',
+                 'Rheumatic valve dz': 'Rheumatic valve disease',
+                 'Venous thromboembolic disease excl pe': 'Venous thromboembolic disease excl PE',
+                 'Stroke nos': 'Stroke NOS',
+                 'Secondary malignancy brain other cns and intracranial': 'Secondary malignancy brain other CNS and intracranial',
+                 'Primary malignancy brain other cns and intracranial': 'Primary malignancy brain other CNS and intracranial',
+                 'Monoclonal gammopathy of undetermined significance mgus': 'Monoclonal gammopathy of undetermined significance',
+                 'Viral diseases excluding chronic hepatitis hiv': 'Viral diseases excluding chronic hepatitis or HIV'})
+# Expand the abbreviations 'excl' / 'incl' (raw strings so that \s reaches the regex engine intact)
+df = df.replace(regex=r'\sexcl\s', value=" excluding ")
+df = df.replace(regex=r'\sincl\s', value=" including ")
+# Reorder
+df = df[['Category', 'Phenotype', 'Individuals']]
+# Set to display full Phenotype text
+pd.set_option('display.max_colwidth', None)
+
+display(df)
diff --git a/code/03_analysis_databricks/ST5-1vs2_deaths.sql b/code/03_analysis_databricks/ST5-1vs2_deaths.sql
new file mode 100644
index 0000000..bade390
--- /dev/null
+++ b/code/03_analysis_databricks/ST5-1vs2_deaths.sql
@@ -0,0 +1,371 @@
+-- Databricks notebook source
+-- MAGIC %md
+-- MAGIC ## ST5: Primary diagnosis on death certificate for deceased patients
+-- MAGIC > 142,224 primary diagnosis was found for the 139,818 individuals with COVID-19 on the death certificate, and 15,526 primary diagnosis for the 15,486 
dying without COVID-19 on the death certificate within 28 days of a COVID-19 event.
+
+-- COMMAND ----------
+
+SELECT
+  covid_phenotype,
+  COUNT(*) as records,
+  COUNT(distinct person_id_deid) as individuals
+FROM
+  dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort
+WHERE
+  covid_phenotype = '04_Fatal_with_covid_diagnosis' OR covid_phenotype = '04_Fatal_without_covid_diagnosis'
+GROUP BY
+  covid_phenotype
+
+-- COMMAND ----------
+
+-- Reproduce parameters from study (dates and ProductionDate used to filter the deaths table in the queries below)
+CREATE WIDGET TEXT start_date DEFAULT "2020-01-23";
+CREATE WIDGET TEXT end_date DEFAULT "2021-11-30";
+CREATE WIDGET TEXT production_date DEFAULT "2022-01-20 14:58:52.353312"
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC # 04_Fatal_with_covid_diagnosis
+
+-- COMMAND ----------
+
+SELECT
+  COUNT(*) as records,
+  COUNT(distinct person_id_deid) as individuals
+FROM
+(
+SELECT
+-- Remove duplicates, defined as same id/date/dx
+  distinct deaths.*
+FROM
+  (SELECT person_id_deid
+  FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort
+  WHERE covid_phenotype = '04_Fatal_with_covid_diagnosis' ) as trajectory
+INNER JOIN
+  (
+  SELECT
+    DEC_CONF_NHS_NUMBER_CLEAN_DEID as person_id_deid,
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') as date,
+    S_UNDERLYING_COD_ICD10 as Dx
+  FROM dars_nic_391419_j3w9t_collab.deaths_dars_nic_391419_j3w9t_archive
+-- Reproduce study criteria in CCU013_01_create_table_alias -> ccu013_tmp_deaths
+  WHERE
+    ProductionDate == "$production_date"
+  AND
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') >= "$start_date"
+  AND
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') <= "$end_date"
+  -- Remove missing Dx
+  AND
+    S_UNDERLYING_COD_ICD10 is not null
+  ) as deaths
+ON
+  trajectory.person_id_deid = deaths.person_id_deid
+  )
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC ## How many had >1 record
+
+-- COMMAND ----------
+
+SELECT
+  COUNT(*) as individuals,
+  SUM(CASE WHEN records = 1 then 1 else 0 end) as n_1record,
+  ROUND(SUM(CASE WHEN records = 1 then 1 else 0 end) / COUNT(*) * 100, 2) as percent_1record,
+  SUM(CASE WHEN records > 1 then 1 else 0 end) as n_2more_records,
+  ROUND(SUM(CASE WHEN records > 1 then 1 else 0 end) / COUNT(*) * 100, 2) as percent_2more_records,
+  MIN(records),
+  MAX(records)
+FROM
+(
+SELECT
+  person_id_deid,
+  COUNT(*) as records
+FROM
+(
+SELECT
+-- Remove duplicates, defined as same id/date/dx
+  distinct deaths.*
+FROM
+  (SELECT person_id_deid
+  FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort
+  WHERE covid_phenotype = '04_Fatal_with_covid_diagnosis' ) as trajectory
+INNER JOIN
+  (
+  SELECT
+    DEC_CONF_NHS_NUMBER_CLEAN_DEID as person_id_deid,
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') as date,
+    S_UNDERLYING_COD_ICD10 as Dx
+  FROM dars_nic_391419_j3w9t_collab.deaths_dars_nic_391419_j3w9t_archive
+-- Reproduce study criteria in CCU013_01_create_table_alias -> ccu013_tmp_deaths
+  WHERE
+    ProductionDate == "$production_date"
+  AND
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') >= "$start_date"
+  AND
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') <= "$end_date"
+  -- Remove missing Dx
+  AND
+    S_UNDERLYING_COD_ICD10 is not null
+  ) as deaths
+ON
+  trajectory.person_id_deid = deaths.person_id_deid
+  )
+GROUP BY
+  person_id_deid
+ORDER BY
+  records Desc
+  )
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC ## Primary Cause(s) of Death
+
+-- COMMAND ----------
+
+SELECT
+  Dx,
+  FIRST(description) as description,
+  COUNT(*) as registrations,
+  COUNT(distinct person_id_deid) as individuals,
+  ROUND(COUNT(distinct person_id_deid) /
+        (SELECT COUNT(distinct person_id_deid)
+        FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort
+        WHERE covid_phenotype = '04_Fatal_with_covid_diagnosis' ) * 100, 2) as percentage_individuals
+FROM
+(
+SELECT
+-- Remove duplicates, defined as same id/date/dx
+  distinct deaths.*,
+  icd.code,
+  icd.description
+FROM
+  (SELECT person_id_deid
+  FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort
+  WHERE covid_phenotype = '04_Fatal_with_covid_diagnosis' ) as trajectory
+-- OR covid_phenotype = '04_Fatal_without_covid_diagnosis'
+INNER JOIN
+  (
+  SELECT
+    DEC_CONF_NHS_NUMBER_CLEAN_DEID as person_id_deid,
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') as date,
+    S_UNDERLYING_COD_ICD10 as Dx
+  FROM dars_nic_391419_j3w9t_collab.deaths_dars_nic_391419_j3w9t_archive
+-- Reproduce study criteria in CCU013_01_create_table_alias -> ccu013_tmp_deaths
+  WHERE
+    ProductionDate == "$production_date"
+  AND
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') >= "$start_date"
+  AND
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') <= "$end_date"
+  -- Remove missing Dx
+  AND
+    S_UNDERLYING_COD_ICD10 is not null
+  ) as deaths
+ON
+  trajectory.person_id_deid = deaths.person_id_deid
+LEFT JOIN
+  (SELECT ALT_CODE as code, ICD10_DESCRIPTION as description FROM dss_corporate.icd10_group_chapter_v01) as icd
+ON
+  deaths.Dx = icd.code
+)
+GROUP BY Dx
+-- Protect against disclosure: only report codes with more than 5 individuals (nb expect the 1000 rows shown won't get close to counts of <5 anyway)
+HAVING individuals > 5
+ORDER BY individuals Desc
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC # 04_Fatal_without_covid_diagnosis
+-- MAGIC * NB to get the correct number of individuals the `AND S_UNDERLYING_COD_ICD10 is not null` filter has to be commented out. This is because this phenotype included any deaths within 28d of first COVID event, where COVID wasn't the diagnosis, and so the diagnosis could have been null
+
+-- COMMAND ----------
+
+SELECT
+  COUNT(*) as records,
+  COUNT(distinct person_id_deid) as individuals
+FROM
+(
+SELECT
+-- Remove duplicates, defined as same id/date/dx
+  distinct deaths.*
+FROM
+  (SELECT person_id_deid
+  FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort
+  WHERE covid_phenotype = '04_Fatal_without_covid_diagnosis' ) as trajectory
+INNER JOIN
+  (
+  SELECT
+    DEC_CONF_NHS_NUMBER_CLEAN_DEID as person_id_deid,
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') as date,
+    S_UNDERLYING_COD_ICD10 as Dx
+  FROM dars_nic_391419_j3w9t_collab.deaths_dars_nic_391419_j3w9t_archive
+-- Reproduce study criteria in CCU013_01_create_table_alias -> ccu013_tmp_deaths
+  WHERE
+    ProductionDate == "$production_date"
+  AND
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') >= "$start_date"
+  AND
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') <= "$end_date"
+-- Remove missing Dx
+-- AND
+-- S_UNDERLYING_COD_ICD10 is not null
+  ) as deaths
+ON
+  trajectory.person_id_deid = deaths.person_id_deid
+  )
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC **However, to get the right number of diagnoses (records) we should exclude nulls, because null isn't a diagnosis**
+
+-- COMMAND ----------
+
+SELECT
+  COUNT(*) as records,
+  COUNT(distinct person_id_deid) as individuals
+FROM
+(
+SELECT
+-- Remove duplicates, defined as same id/date/dx
+  distinct deaths.*
+FROM
+  (SELECT person_id_deid
+  FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort
+  WHERE covid_phenotype = '04_Fatal_without_covid_diagnosis' ) as trajectory
+INNER JOIN
+  (
+  SELECT
+    DEC_CONF_NHS_NUMBER_CLEAN_DEID as person_id_deid,
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') as date,
+    S_UNDERLYING_COD_ICD10 as Dx
+  FROM dars_nic_391419_j3w9t_collab.deaths_dars_nic_391419_j3w9t_archive
+-- Reproduce study criteria in CCU013_01_create_table_alias -> ccu013_tmp_deaths
+  WHERE
+    ProductionDate == "$production_date"
+  AND
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') >= "$start_date"
+  AND
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') <= "$end_date"
+-- Remove missing Dx
+  AND
+    S_UNDERLYING_COD_ICD10 is not null
+  ) as deaths
+ON
+  trajectory.person_id_deid = deaths.person_id_deid
+  )
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC ## How many had >1 record?
+
+-- COMMAND ----------
+
+SELECT
+  COUNT(*) as individuals,
+  SUM(CASE WHEN records = 1 then 1 else 0 end) as n_1record,
+  ROUND(SUM(CASE WHEN records = 1 then 1 else 0 end) / COUNT(*) * 100, 2) as percent_1record,
+  SUM(CASE WHEN records > 1 then 1 else 0 end) as n_2more_records,
+  ROUND(SUM(CASE WHEN records > 1 then 1 else 0 end) / COUNT(*) * 100, 2) as percent_2more_records,
+  MIN(records),
+  MAX(records)
+FROM
+(
+SELECT
+  person_id_deid,
+  COUNT(*) as records
+FROM
+(
+SELECT
+-- Remove duplicates, defined as same id/date/dx
+  distinct deaths.*
+FROM
+  (SELECT person_id_deid
+  FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort
+  WHERE covid_phenotype = '04_Fatal_without_covid_diagnosis' ) as trajectory
+INNER JOIN
+  (
+  SELECT
+    DEC_CONF_NHS_NUMBER_CLEAN_DEID as person_id_deid,
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') as date,
+    S_UNDERLYING_COD_ICD10 as Dx
+  FROM dars_nic_391419_j3w9t_collab.deaths_dars_nic_391419_j3w9t_archive
+-- Reproduce study criteria in CCU013_01_create_table_alias -> ccu013_tmp_deaths
+  WHERE
+    ProductionDate == "$production_date"
+  AND
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') >= "$start_date"
+  AND
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') <= "$end_date"
+  ) as deaths
+ON
+  trajectory.person_id_deid = deaths.person_id_deid
+  )
+GROUP BY
+  person_id_deid
+ORDER BY
+  records Desc
+  )
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC ## Primary cause(s) of death
+
+-- COMMAND ----------
+
+SELECT
+  Dx,
+  FIRST(description) as description,
+  COUNT(*) as registrations,
+  COUNT(distinct person_id_deid) as individuals,
+  ROUND(COUNT(distinct person_id_deid) /
+        (SELECT COUNT(distinct person_id_deid)
+        FROM
dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort
+        WHERE covid_phenotype = '04_Fatal_without_covid_diagnosis' ) * 100, 2) as percentage_individuals
+FROM
+(
+SELECT
+-- Remove duplicates, defined as same id/date/dx
+  distinct deaths.*,
+  icd.code,
+  icd.description
+FROM
+  (SELECT person_id_deid
+  FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort
+  WHERE covid_phenotype = '04_Fatal_without_covid_diagnosis' ) as trajectory
+INNER JOIN
+  (
+  SELECT
+    DEC_CONF_NHS_NUMBER_CLEAN_DEID as person_id_deid,
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') as date,
+    S_UNDERLYING_COD_ICD10 as Dx
+  FROM dars_nic_391419_j3w9t_collab.deaths_dars_nic_391419_j3w9t_archive
+-- Reproduce study criteria in CCU013_01_create_table_alias -> ccu013_tmp_deaths
+  WHERE
+    ProductionDate == "$production_date"
+  AND
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') >= "$start_date"
+  AND
+    to_date(REG_DATE_OF_DEATH,'yyyyMMdd') <= "$end_date"
+  -- Remove missing Dx
+-- AND
+-- S_UNDERLYING_COD_ICD10 is not null
+  ) as deaths
+ON
+  trajectory.person_id_deid = deaths.person_id_deid
+LEFT JOIN
+  (SELECT ALT_CODE as code, ICD10_DESCRIPTION as description FROM dss_corporate.icd10_group_chapter_v01) as icd
+ON
+  deaths.Dx = icd.code
+)
+GROUP BY Dx
+HAVING individuals > 5
+ORDER BY individuals Desc
diff --git a/code/03_analysis_databricks/ST6-Primary_v_Secondary_COVID_Dx.sql b/code/03_analysis_databricks/ST6-Primary_v_Secondary_COVID_Dx.sql
new file mode 100644
index 0000000..0a6677b
--- /dev/null
+++ b/code/03_analysis_databricks/ST6-Primary_v_Secondary_COVID_Dx.sql
@@ -0,0 +1,370 @@
+-- Databricks notebook source
+-- MAGIC %md
+-- MAGIC # Explore Primary diagnoses when COVID is Secondary
+-- MAGIC
+-- MAGIC Reviewer comments:
+-- MAGIC > How are COVID-19 hospitalizations defined? What are the diagnosis codes used? Was it **only defined by the primary diagnosis, or were secondary diagnoses considered?** What were the **primary diagnoses corresponding to the secondary diagnosis of COVID-19** as some may be associated with COVID-19 (such as respiratory or coagulopathy), but others may not be (such as a motor vehicle accident)? Again, these are important considerations in evaluating phenotypes and teaching about how to best approach these large-scale studies.
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC # Derive primary/secondary admission Dx from trajectory table
+-- MAGIC Outline:
+-- MAGIC 1. Get `MIN(date)` from `trajectory` table `WHERE covid_phenotype == "02_Covid_admission" AND source == "HES APC" `
+-- MAGIC     * This date is `EPISTART` see [Cell 22 in `CCU013_01_create_table_aliases`](https://db.core.data.digital.nhs.uk/#notebook/1753732/command/1783084)
+-- MAGIC 2. Get the `ADMIDATE` for these episodes from `HES APC`
+-- MAGIC 3. Find first episodes occurring on the first day of admission, i.e. where `EPISTART = ADMIDATE` and `EPIORDER = 1`
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC ## 1. Get ids from trajectory table
+-- MAGIC * Distinct id as can have multiple in trajectory table
+-- MAGIC * `Min(date)` as can have multiple in trajectory table
+-- MAGIC * `date` in trajectory table for HES APC admissions is `EPISTART`
+
+-- COMMAND ----------
+
+CREATE OR REPLACE GLOBAL TEMP VIEW ccu013_01_hosp_apc_ids as
+SELECT
+  distinct person_id_deid as PERSON_ID_DEID,
+  MIN(date) as EPISTART
+FROM
+  dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort
+WHERE
+  covid_phenotype == "02_Covid_admission"
+AND
+  source == "HES APC"
+GROUP BY
+  person_id_deid
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC ## 2. Retrieve version of HES APC matching our study
+-- MAGIC * NB these params are not exactly the same as those in the non-`paper_cohort` table which is more all encompassing
+-- MAGIC * Instead see manuscript
+-- MAGIC     * Pre-print:
+-- MAGIC         * `production_date` = `2021-07-29 13:39:04.161949`
+-- MAGIC         * `start_date` = `2020-01-23`
+-- MAGIC         * `end_date` = `2021-05-31`
+-- MAGIC     * Revision 1:
+-- MAGIC         * `production_date` = `2022-01-20 14:58:52.353312`
+-- MAGIC         * `start_date` = `2020-01-23`
+-- MAGIC         * `end_date` = `2021-11-30`
+
+-- COMMAND ----------
+
+-- Reproduce parameters from Revision 1 of the manuscript (the defaults below are the Revision 1 values, not the pre-print ones)
+CREATE WIDGET TEXT start_date DEFAULT "2020-01-23";
+CREATE WIDGET TEXT end_date DEFAULT "2021-11-30";
+-- Notebook: CCU013_11_paper_cohort_dp_skinny_record_unassembled
+-- Cell: 6
+CREATE WIDGET TEXT production_date DEFAULT "2022-01-20 14:58:52.353312"
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC NB this is those patients matching our study criteria, but all of their records rather than only the minimum (earliest). Hence we use the `DIAG_4_CONCAT LIKE` COVID filter.
+
+-- COMMAND ----------
+
+CREATE OR REPLACE GLOBAL TEMP VIEW ccu013_01_hes_apc as
+SELECT
+  PERSON_ID_DEID,
+  ADMIDATE,
+  EPISTART,
+  DIAG_4_01 as primary_Dx,
+  DIAG_4_CONCAT
+FROM
+  dars_nic_391419_j3w9t_collab.hes_apc_all_years_archive
+WHERE
+  ProductionDate == "$production_date"
+AND
+  (DIAG_4_CONCAT LIKE "%U071%" OR DIAG_4_CONCAT LIKE "%U072%")
+AND
+  (EPISTART >= "$start_date" AND EPISTART <= "$end_date")
+AND
+  PERSON_ID_DEID is not null
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC ## 3. Get `ADMIDATE`
+
+-- COMMAND ----------
+
+CREATE OR REPLACE GLOBAL TEMP VIEW ccu013_01_hosp_apc_admissions as
+SELECT
+  distinct trajectory.PERSON_ID_DEID as PERSON_ID_DEID,
+  MIN(ADMIDATE) as ADMIDATE
+FROM
+global_temp.ccu013_01_hosp_apc_ids as trajectory
+INNER JOIN
+(
+SELECT
+  PERSON_ID_DEID,
+  ADMIDATE,
+  EPISTART
+FROM
+  global_temp.ccu013_01_hes_apc
+) as apc
+ON
+  trajectory.PERSON_ID_DEID = apc.PERSON_ID_DEID
+AND
+  trajectory.EPISTART = apc.EPISTART
+GROUP BY
+  trajectory.PERSON_ID_DEID
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC # Find first EPISODE in ADMISSION
+-- MAGIC * First = `EPIORDER = 1`
+-- MAGIC * We use the 'full' HES APC table to join onto here, rather than our temp view which is COVID-subsetted, as COVID may not feature in this first episode
+
+-- COMMAND ----------
+
+CREATE OR REPLACE GLOBAL TEMP VIEW ccu013_01_hosp_apc_first_primary_dx as
+SELECT
+  apc_full.PERSON_ID_DEID,
+  apc_full.ADMIDATE,
+  apc_full.EPISTART,
+  apc_full.EPIORDER,
+  substring(apc_full.DIAG_4_01, 1, 1) as first_primary_chapter,
+  apc_full.DIAG_4_01 as first_primary_icd10,
+  -- Create covid primary Dx flag (1 when the primary diagnosis is U071 or U072)
+  CASE WHEN apc_full.DIAG_4_01 == "U071" OR apc_full.DIAG_4_01 == "U072" then 1 else 0 end as first_primary_covid
+FROM
+  global_temp.ccu013_01_hosp_apc_admissions as admission
+INNER JOIN
+(
+SELECT
+  PERSON_ID_DEID,
+  ADMIDATE,
+  EPISTART,
+  EPIORDER,
+  DIAG_4_01
+FROM
+  dars_nic_391419_j3w9t_collab.hes_apc_all_years_archive
+WHERE
+  ProductionDate == "$production_date"
+AND
+  EPIORDER = 1
+) as apc_full
+ON
+  admission.PERSON_ID_DEID = apc_full.PERSON_ID_DEID
+AND
+-- Episodes on the day of admission
+  admission.ADMIDATE = apc_full.EPISTART
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC # Counts
+
+-- COMMAND ----------
+
+SELECT
+  COUNT(*),
+  COUNT(distinct PERSON_ID_DEID)
+FROM
+  global_temp.ccu013_01_hosp_apc_first_primary_dx
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC ## Explore those with >1 'first' episode
+
+-- COMMAND ----------
+
+SELECT
+  PERSON_ID_DEID,
+  COUNT(*) as n
+FROM
+  global_temp.ccu013_01_hosp_apc_first_primary_dx
+GROUP BY
+  PERSON_ID_DEID
+HAVING n > 1
+
+-- COMMAND ----------
+
+SELECT
+  *
+FROM
+  global_temp.ccu013_01_hosp_apc_first_primary_dx as apc
+INNER JOIN
+  (SELECT
+    PERSON_ID_DEID,
+    COUNT(*) as n
+  FROM
+    global_temp.ccu013_01_hosp_apc_first_primary_dx
+  GROUP BY
+    PERSON_ID_DEID
+  HAVING n > 1) as ids
+ON
+  apc.PERSON_ID_DEID = ids.PERSON_ID_DEID
+ORDER BY
+  n Desc
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC ## How many pts had >1 episode on first day?
+
+-- COMMAND ----------
+
+SELECT
+  COUNT(*) as individuals,
+  SUM(CASE WHEN episodes = 1 then 1 else 0 end) as n_1epi,
+  ROUND(SUM(CASE WHEN episodes = 1 then 1 else 0 end) / COUNT(*) * 100, 2) as percent_1epi,
+  SUM(CASE WHEN episodes > 1 then 1 else 0 end) as n_2more_epi,
+  ROUND(SUM(CASE WHEN episodes > 1 then 1 else 0 end) / COUNT(*) * 100, 2) as percent_2more_epi,
+  MIN(episodes),
+  MAX(episodes)
+FROM
+  (
+  SELECT
+  distinct PERSON_ID_DEID,
+  COUNT(*) as episodes
+FROM
+  global_temp.ccu013_01_hosp_apc_first_primary_dx
+GROUP BY
+  PERSON_ID_DEID
+ORDER BY
+  episodes Desc
+  )
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC # Counts
+
+-- COMMAND ----------
+
+SELECT
+  COUNT(*) as episodes,
+  COUNT(distinct PERSON_ID_DEID) as individuals,
+  ROUND(COUNT(distinct PERSON_ID_DEID) / COUNT(*) * 100, 2) as percentage,
+  MAX(EPIORDER),
+  MIN(ADMIDATE),
+  MAX(ADMIDATE),
+  MIN(EPISTART),
+  MAX(EPISTART)
+FROM
+  global_temp.ccu013_01_hosp_apc_first_primary_dx
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC * We're looking for first episodes in admission, so it is plausible that both of these extend before study_start
+-- MAGIC * 2012 does seem very early, e.g. pt in hospital for 10 years
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC Boil down to 1 row per patient, single presence of COVID takes precedence over other codes
+
+-- COMMAND ----------
+
+SELECT
+  COUNT(*) as records,
+  -- should be 1 row per pt, check
+  COUNT(distinct PERSON_ID_DEID) as individuals,
+  SUM(first_primary_covid) as n_first_primary_covid,
+  ROUND(SUM(first_primary_covid)/COUNT(*)*100, 2) as percent_first_primary_covid,
+  COUNT(distinct PERSON_ID_DEID) - SUM(first_primary_covid) as n_NO_first_primary_covid
+FROM
+(
+SELECT
+  distinct PERSON_ID_DEID,
+  CASE WHEN SUM(first_primary_covid) >= 1 then 1 else 0 end as first_primary_covid
+FROM
+  global_temp.ccu013_01_hosp_apc_first_primary_dx
+GROUP BY
+  PERSON_ID_DEID
+)
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC The numbers are very similar to other attempts using different methods
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC # Primary diagnoses when COVID is secondary
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC ## By ICD-10 Chapter
+
+-- COMMAND ----------
+
+-- Build ICD-10 chapter dictionary (chapter = first character of the code)
+CREATE OR REPLACE GLOBAL TEMP VIEW ccu013_01_lkp_icd10_chapter as
+SELECT
+  DISTINCT substring(CODE, 1, 1) as chapter,
+  ICD10_CHAPTER_DESCRIPTION as description
+FROM
+  dss_corporate.icd10_group_chapter_v01
+
+-- COMMAND ----------
+
+SELECT
+  first_primary_chapter,
+  FIRST(description) as description,
+  COUNT(*) as records,
+  COUNT(distinct PERSON_ID_DEID) as individuals,
+  ROUND(COUNT(distinct PERSON_ID_DEID) / (SELECT COUNT(distinct PERSON_ID_DEID) FROM global_temp.ccu013_01_hosp_apc_first_primary_dx) * 100, 2) as percentage_individuals
+FROM
+global_temp.ccu013_01_hosp_apc_first_primary_dx as apc
+-- Don't group by ID for this as multiple episodes relevant
+LEFT JOIN
+  (SELECT chapter, description FROM global_temp.ccu013_01_lkp_icd10_chapter) as icd
+ON
+  apc.first_primary_chapter = icd.chapter
+-- WHERE
+-- first_primary_covid = 0
+GROUP BY first_primary_chapter
+ORDER BY individuals Desc
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC ## Full ICD-10 codes
+-- MAGIC * Includes % of individuals to show the % which receive a primary COVID-19 Dx within this table
+
+-- COMMAND ----------
+
+SELECT
+  first_primary_icd10,
+  FIRST(description) as description,
+  COUNT(*) as records,
+  COUNT(distinct PERSON_ID_DEID) as individuals,
+  ROUND(COUNT(distinct PERSON_ID_DEID) / (SELECT COUNT(distinct PERSON_ID_DEID) FROM global_temp.ccu013_01_hosp_apc_first_primary_dx) * 100, 2) as percentage_individuals
+FROM
+  global_temp.ccu013_01_hosp_apc_first_primary_dx as apc
+LEFT JOIN
+  (SELECT ALT_CODE as code, ICD10_DESCRIPTION as description FROM dss_corporate.icd10_group_chapter_v01) as icd
+ON
+  apc.first_primary_icd10 = icd.code
+-- WHERE
+-- first_primary_covid = 0
+GROUP BY first_primary_icd10
+-- Protect against disclosure: only report codes with more than 5 individuals (nb expect the 1000 rows shown won't get close to counts of <5 anyway)
+HAVING individuals > 5
+ORDER BY individuals Desc
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC # N. unique ICD-10 codes in primary position
+
+-- COMMAND ----------
+
+SELECT
+  COUNT(distinct first_primary_icd10) - 2 -- Subtract 2 COVID codes (U071, U072); assumes both occur in primary position - confirm
+FROM
+  global_temp.ccu013_01_hosp_apc_first_primary_dx
diff --git a/code/03_analysis_databricks/Sup_Table_3-CALIBER_counts.py b/code/03_analysis_databricks/Sup_Table_3-CALIBER_counts.py
deleted file mode 100644
index 0139bd7..0000000
--- a/code/03_analysis_databricks/Sup_Table_3-CALIBER_counts.py
+++ /dev/null
@@ -1,73 +0,0 @@
-# Databricks notebook source
-# MAGIC %md
-# MAGIC # Supplementary Table 3: CALIBER phenotype frequencies
-# MAGIC
-# MAGIC **Description**
-# MAGIC
-# MAGIC This notebook produces the number of distinct individuals with a CALIBER phenotype prior to 01/01/2020, as used when defining comorbidities for `CCU013: COVID-19 Event Phenotypes`.
-# MAGIC -# MAGIC The output from these queries produces `Supplement table 3: 269 CALIBER phenotypes, aggregated into 16 categories, and the number of individuals within the study cohort identified from GDPPR (SNOMED-CT) and HES APC (ICD-10, OPCS-4).` within the manuscript `Characterising COVID-19 related events in a nationwide electronic health record cohort of 55.9 million people in England` -# MAGIC -# MAGIC **Project(s)** CCU013 -# MAGIC -# MAGIC **Author(s)** Chris Tomlinson -# MAGIC -# MAGIC **Reviewer(s)** -# MAGIC -# MAGIC **Date last updated** 2021-10-04 -# MAGIC -# MAGIC **Date last reviewed** *NA* -# MAGIC -# MAGIC **Date last run** 2021-10-04 -# MAGIC -# MAGIC **Data input** -# MAGIC * `ccu013_covid_events_paper_cohort` -# MAGIC * `ccu013_caliber_comorbidities_pre2020` -# MAGIC * `ccu013_caliber_category_mapping` -# MAGIC -# MAGIC **Data output** -# MAGIC Export of this notebook. -# MAGIC -# MAGIC **Software and versions** `python` -# MAGIC -# MAGIC **Packages and versions** `pyspark` - -# COMMAND ---------- - -import databricks.koalas as ks - -# COMMAND ---------- - -# COVID-19 events -events_table = "dars_nic_391419_j3w9t_collab.ccu013_covid_events_paper_cohort" - -# CALIBER phenotypes table -phenos_table = "dars_nic_391419_j3w9t_collab.ccu013_caliber_comorbidities_pre2020" - -# COMMAND ---------- - -patients = spark.sql(f"""SELECT person_id_deid FROM {events_table}""") -phenos = spark.table(phenos_table) -# Subset to cohort -df = patients.join(phenos, 'person_id_deid', 'left') \ - .fillna(0) \ - .drop('person_id_deid') - -# Col sums -df = df.to_koalas() \ - .sum(axis=0) \ - .reset_index() - -# Renaming operations prior to join -df.columns = df.columns.fillna('count') -df = df.rename(columns={'index': 'phenotype'}) - -# COMMAND ---------- - -category_dictionary_table = spark.table("dars_nic_391419_j3w9t_collab.ccu013_caliber_category_mapping") \ - .drop('cat') \ - .to_koalas() - -# COMMAND ---------- - -df.merge(category_dictionary_table, 
on='phenotype', how='left')
diff --git a/code/03_analysis_databricks/upset_datagen.sql b/code/03_analysis_databricks/upset_datagen.sql
new file mode 100644
index 0000000..3ffa7fa
--- /dev/null
+++ b/code/03_analysis_databricks/upset_datagen.sql
@@ -0,0 +1,203 @@
+-- Databricks notebook source
+-- MAGIC %run /Workspaces/dars_nic_391419_j3w9t_collab/CCU013/COVID-19-SEVERITY-PHENOTYPING/CCU013_00_helper_functions
+
+-- COMMAND ----------
+
+SELECT * FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort LIMIT 5
+
+-- COMMAND ----------
+
+ SELECT * FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_paper_cohort LIMIT 5
+
+-- COMMAND ----------
+
+ SELECT distinct covid_phenotype FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC Therefore we're basically going to use the method used for creating the events table, only instead of pivoting on events we will pivot on data source
+
+-- COMMAND ----------
+
+-- MAGIC %py
+-- MAGIC import databricks.koalas
+-- MAGIC
+-- MAGIC events = spark.sql(f"""
+-- MAGIC SELECT
+-- MAGIC   person_id_deid,
+-- MAGIC   source,
+-- MAGIC   1 as value
+-- MAGIC FROM
+-- MAGIC   dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort
+-- MAGIC """) \
+-- MAGIC   .to_koalas() \
+-- MAGIC   .pivot(index='person_id_deid',
+-- MAGIC          columns='source',
+-- MAGIC          values='value') \
+-- MAGIC   .fillna(0) \
+-- MAGIC   .reset_index() \
+-- MAGIC   .to_spark()
+-- MAGIC
+-- MAGIC display(events)
+
+-- COMMAND ----------
+
+-- MAGIC %py
+-- MAGIC events.count()
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC # Join metadata
+
+-- COMMAND ----------
+
+ SELECT * FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort LIMIT 5
+
+-- COMMAND ----------
+
+-- MAGIC %py
+-- MAGIC X = spark.sql("""
+-- MAGIC SELECT
+-- MAGIC   person_id_deid,
+-- MAGIC   date_first,
+-- MAGIC   sex,
+-- MAGIC   ethnic_group,
+-- MAGIC   IMD_quintile,
+-- MAGIC   age
+-- MAGIC FROM
+-- MAGIC   dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort
+-- MAGIC """)
+-- MAGIC
+-- MAGIC cohort = events.join(X,
+-- MAGIC                      "person_id_deid",
+-- MAGIC                      "left") \
+-- MAGIC   .fillna(0)  # NOTE(review): this is applied after the join, so presumably a missing numeric demographic (e.g. age) also becomes 0 - confirm intended
+-- MAGIC
+-- MAGIC # Rename columns with spaces in them
+-- MAGIC cohort = cohort.withColumnRenamed("HES APC", "HES_APC") \
+-- MAGIC   .withColumnRenamed("HES CC", "HES_CC")
+-- MAGIC
+-- MAGIC display(cohort)
+
+-- COMMAND ----------
+
+-- MAGIC %py
+-- MAGIC cohort.createOrReplaceGlobalTempView("ccu013_01_paper_upset")
+-- MAGIC drop_table("ccu013_01_paper_upset")  # not defined in this notebook; presumably from the helper notebook loaded via %run at the top - confirm
+-- MAGIC create_table("ccu013_01_paper_upset")  # presumably persists the global temp view as the collab table queried in the next cell - confirm
+
+-- COMMAND ----------
+
+SELECT * FROM dars_nic_391419_j3w9t_collab.ccu013_01_paper_upset
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC # Queries
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC ## NIV
+
+-- COMMAND ----------
+
+-- MAGIC %py
+-- MAGIC import databricks.koalas
+-- MAGIC
+-- MAGIC NIV = spark.sql(f"""
+-- MAGIC SELECT
+-- MAGIC   person_id_deid,
+-- MAGIC   source,
+-- MAGIC   1 as value
+-- MAGIC FROM
+-- MAGIC   dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort
+-- MAGIC WHERE
+-- MAGIC   covid_phenotype = "03_NIV_treatment"
+-- MAGIC """) \
+-- MAGIC   .to_koalas() \
+-- MAGIC   .pivot(index='person_id_deid',
+-- MAGIC          columns='source',
+-- MAGIC          values='value') \
+-- MAGIC   .fillna(0) \
+-- MAGIC   .reset_index() \
+-- MAGIC   .to_spark()
+-- MAGIC
+-- MAGIC # Rename columns with spaces in them
+-- MAGIC NIV = NIV.withColumnRenamed("HES APC", "HES_APC") \
+-- MAGIC   .withColumnRenamed("HES CC", "HES_CC")
+-- MAGIC
+-- MAGIC NIV.createOrReplaceGlobalTempView("ccu013_01_upset_NIV")
+-- MAGIC
+-- MAGIC display(spark.sql("""
+-- MAGIC SELECT
+-- MAGIC   -- HES CC with OPCS-4
+-- MAGIC   SUM(HES_CC) as cc,
+-- MAGIC   SUM(CASE WHEN HES_CC = 1 and (HES_APC = 1 OR SUS = 1) THEN 1 else 0 end) as cc_opcs,
+-- MAGIC   ROUND(SUM(CASE WHEN HES_CC = 1 and (HES_APC = 1 OR SUS = 1) THEN 1 else 0 end)/SUM(HES_CC)*100, 2) as cc_opcs_percent,
+-- MAGIC   -- CHESS with OPCS_4
+-- MAGIC   SUM(CHESS) as chess,
+-- MAGIC   SUM(CASE WHEN CHESS = 1 and (HES_APC = 1 OR SUS = 1) THEN 1 else 0 end) as chess_opcs,
+-- MAGIC   ROUND(SUM(CASE WHEN CHESS = 1 and (HES_APC = 1 OR SUS = 1) THEN 1 else 0 end)/SUM(CHESS)*100, 2) as chess_opcs_percent
+-- MAGIC FROM
+-- MAGIC   global_temp.ccu013_01_upset_NIV
+-- MAGIC """))
+
+-- COMMAND ----------
+
+-- WRONG: this doesn't include those where ICU admission was detected from CHESS (only HES_CC = 0 is tested)
+SELECT
+  COUNT(*),
+  SUM(CASE WHEN HES_CC = 0 then 1 else 0 end) as niv_out_ICU
+FROM
+  global_temp.ccu013_01_upset_NIV
+
+-- COMMAND ----------
+
+-- MAGIC %md
+-- MAGIC ## IMV
+
+-- COMMAND ----------
+
+-- MAGIC %py
+-- MAGIC import databricks.koalas
+-- MAGIC
+-- MAGIC IMV = spark.sql(f"""
+-- MAGIC SELECT
+-- MAGIC   person_id_deid,
+-- MAGIC   source,
+-- MAGIC   1 as value
+-- MAGIC FROM
+-- MAGIC   dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort
+-- MAGIC WHERE
+-- MAGIC   covid_phenotype = "03_IMV_treatment"
+-- MAGIC """) \
+-- MAGIC   .to_koalas() \
+-- MAGIC   .pivot(index='person_id_deid',
+-- MAGIC          columns='source',
+-- MAGIC          values='value') \
+-- MAGIC   .fillna(0) \
+-- MAGIC   .reset_index() \
+-- MAGIC   .to_spark()
+-- MAGIC
+-- MAGIC # Rename columns with spaces in them
+-- MAGIC IMV = IMV.withColumnRenamed("HES APC", "HES_APC") \
+-- MAGIC   .withColumnRenamed("HES CC", "HES_CC")
+-- MAGIC
+-- MAGIC IMV.createOrReplaceGlobalTempView("ccu013_01_upset_IMV")
+-- MAGIC
+-- MAGIC display(spark.sql("""
+-- MAGIC SELECT
+-- MAGIC   -- HES CC with OPCS-4
+-- MAGIC   SUM(HES_CC) as cc,
+-- MAGIC   SUM(CASE WHEN HES_CC = 1 and (HES_APC = 1 OR SUS = 1) THEN 1 else 0 end) as cc_opcs,
+-- MAGIC   ROUND(SUM(CASE WHEN HES_CC = 1 and (HES_APC = 1 OR SUS = 1) THEN 1 else 0 end)/SUM(HES_CC)*100, 2) as cc_opcs_percent,
+-- MAGIC   -- CHESS with OPCS_4
+-- MAGIC   SUM(CHESS) as chess,
+-- MAGIC   SUM(CASE WHEN CHESS = 1 and (HES_APC = 1 OR SUS = 1) THEN 1 else 0 end) as chess_opcs,
+-- MAGIC   ROUND(SUM(CASE WHEN CHESS = 1 and (HES_APC = 1 OR SUS = 1) THEN 1 else 0 end)/SUM(CHESS)*100, 2) as chess_opcs_percent
+-- MAGIC FROM
+-- MAGIC   global_temp.ccu013_01_upset_IMV
+-- MAGIC """))