Skip to content

Commit

Permalink
update R code to revised manuscript
Browse files Browse the repository at this point in the history
  • Loading branch information
tomlincr committed Mar 18, 2022
1 parent cb967c3 commit 9cb5be1
Show file tree
Hide file tree
Showing 14 changed files with 620 additions and 511 deletions.
70 changes: 39 additions & 31 deletions code/04_analysis_R/CCU013_00_check_table_numbers.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
# Tables checked: ccu013_covid_severity OR ccu013_covid_severity_paper_cohort
# ccu013_covid_trajectory OR ccu013_covid_trajectory_paper_cohort
# Author: Johan Hilge Thygesen
# Last updated: 16.06.21
# Last updated: 22.01.22


# ---- Load tables ------------------
Expand All @@ -20,14 +20,19 @@ deaths <- dbGetQuery(con, "SELECT * FROM dars_nic_391419_j3w9t_collab.ccu003_dir

start_date = '2020-01-01' # None paper cohort
start_date = '2020-01-23'
end_date = '2021-07-29'
end_date = '2021-11-30'


# ----- Start checks

nrow(severity) # 3469528 - for paper_cohort
nrow(traject) # 8825738 - for paper_cohort
nrow(unique(traject)) # 8825738 - for paper_cohort
# Old Medxriv paper cohort number
# severity: 3469528
# trajectory: 8825738
# unique(trajectory): 8825738

nrow(severity) # Cohort 7,244,925 |8,714,594 Full Sample
nrow(traject) # Cohort 13,990,425 | 15,825,444 Full Sample
nrow(unique(traject)) # Cohort 13,990,425 | 15825444 Full Sample

## 1) Check that all samples have identifies
all(!is.na(severity[,1]))
Expand Down Expand Up @@ -74,6 +79,9 @@ if(any(traject[,'date']>as.Date(end_date, format = "%Y-%m-%d"), na.rm = T)){stop
head(traject[which(traject[,'date']<as.Date(start_date, format = "%Y-%m-%d")),])
table(traject[which(traject[,'date']<as.Date(start_date, format = "%Y-%m-%d")),"source"])
nrow(traject[which(traject[,'date']<as.Date(start_date, format = "%Y-%m-%d")),])
head(severity[which(severity[,'date']<as.Date(start_date, format = "%Y-%m-%d")),])
nrow(severity[which(severity[,'date']<as.Date(start_date, format = "%Y-%m-%d")),])


## 5) Check for overlap between death with and without diagnosis
withDiag <- unique(traject[which(traject[,"covid_phenotype"] == '04_Fatal_with_covid_diagnosis'), "person_id_deid"])
Expand All @@ -100,32 +108,32 @@ for(i in 1:length(icdcols)){
paste0("Deaths from covid found in patients wihout covid on certificate, death col:", icdcols[i]))}
}
head(icdcols)
#head(deaths)
#deaths[which(deaths[,"person_id_deid"]=="3DQOF5766RUCLZA"),]
#traject[which(traject[,1]=="07O8FWQ7ZTV038C"),]

# ## Multiple death dates
# test <- traject[which(traject$covid_phenotype %in% c("04_Fatal_with_covid_diagnosis","04_Fatal_without_covid_diagnosis")),]
# test2 <- as.data.frame(table(test[,1]))
# test2 <- test2[which(test2[,2]>1),]
# test2 <- test2[order(test2[,2], decreasing = T),]
# nrow(test2)
# head(test2)
# table(test2[,2])
# test[which(test[,1]== "001WF09LDQHE1X3"),]
# test[which(test[,1]== "05LTERN3NH4C7P5"),]
# traject[which(traject[,1]== "05LTERN3NH4C7P5"),]
#
#
# # Number of events per id
# test <- unique(traject[,c("person_id_deid", "covid_phenotype")])
# table(test$covid_phenotype)
#
# # Number of confirmed cases
# length(unique(traject[which(traject$covid_status == "confirmed"),"person_id_deid"]))
#
# # Number of positive tests
# length(unique(traject[which(traject$covid_phenotype == "01_Covid_positive"),"person_id_deid"]))

################################
### Additional checks for the PAPER_COHORT ONLY

### 8) Check that everyone with less than 28 days followup time dies! - otherwise the followup time is not working

## Find all ids who have less than 28 days followup time
myquery <- paste0("select distinct t.person_id_deid, first_event, date, covid_phenotype FROM (
Select person_id_deid, first_event FROM
(SELECT person_id_deid, min(date) as first_event FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort
group by person_id_deid)
WHERE first_event > \"", as.character(as.Date(end_date) - 28) ,"\") as f
INNER JOIN dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort as t
ON f.person_id_deid == t.person_id_deid
order by t.person_id_deid")
first_event <- dbGetQuery(con, myquery)

## How many with 28 days followup time?
length(unique(first_event$person_id_deid))

## How many of these die die
first_event_death <- first_event[which(strtrim(first_event$covid_phenotype,2) == "04"),]
table(first_event_death$covid_phenotype)
length(unique(first_event_death$person_id_deid))

## All okay
length(unique(first_event$person_id_deid))==length(unique(first_event_death$person_id_deid))


10 changes: 5 additions & 5 deletions code/04_analysis_R/CCU013_03_fig3_vaccine_KM_plots.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@ library(survminer)
folder_path <- "~/dars_nic_391419_j3w9t_collab/CCU013/output/km_plots"
basefilename <- "CCU013_km_"

source("~/dars_nic_391419_j3w9t_collab/CCU013/matching_function.R")
source("~/dars_nic_391419_j3w9t_collab/CCU013/04_analysis_R-GITHUB-v3/matching_function.R")

# Note this is exact matching, samples that cant be matched exactly are not included in the further analysis.

# Load sample for matching
## -------------------------------------------
d <- dbGetQuery(con, "SELECT * FROM dars_nic_391419_j3w9t_collab.ccu013_no_covid_before_feb_2021_vax_status_matching")

nrow(d) # 49,929,434
nrow(d) # 50357509 @230122 | old:49,929,434

case <- d[which(d$vaccine_status == "Vaccinated"), ]
ctrl <- d[which(d$vaccine_status == "Unvaccinated"), ]
Expand All @@ -24,15 +24,15 @@ m <- rbind(case,ctrl)
m <- data.table(m)
m <- m[-which(m$SEX %in% c(0,9)), ]
nrow(m) # 49928374
table(m$vaccinated) # Unvax = 49,506,564, Vax = 421,810
table(m$vaccinated) # Unvax = 49,932,299 Vax = 425,148 @230122 |OLD: Unvax = 49,506,564, Vax = 421,810

m$mvar <- do.call('paste0', m[,c("SEX","ethnicity","age_grp")])
#m$mvar <- do.call('paste0', m[,c("age_grp")])
matched_m <- smatch(m, treat = "vaccinated", mvar = 'mvar', ratio = 1, seed = 42)
matched_summary <- dcast(matched_m, SEX + ethnicity + age_grp ~vaccinated, value.var = 'id', fun.aggregate = length)
print(paste0(max(matched_m$id), " out of ", nrow(case), " samples was matached exactly!"))

## For dosage 2: "421809 out of 421811 samples was matached exactly!"
## For dosage 2: @230222 - "425147 out of 425148 samples was matached exactly!" | OLD: "421809 out of 421811 samples was matached exactly!"

# write out matching results
matched_m <- matched_m[order(matched_m$id), ]
Expand All @@ -57,7 +57,7 @@ table(unmatched[,".mvar"])
#-----------------------------------------------------------------------------------------------------------------------
## NOTE!! our date of interest is: end_date
start_date <- as.Date("2021-02-01")
end_date <- as.Date("2021-05-31")
end_date <- as.Date("2021-11-30") # as.Date("2021-05-31")
traject <- dbGetQuery(con, paste0("SELECT a.person_id_deid, date, covid_phenotype, death_date
FROM (SELECT person_id_deid, date, covid_phenotype FROM
dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort
Expand Down
24 changes: 12 additions & 12 deletions code/04_analysis_R/CCU013_04_fig3_KM_worst_outcome.R
Original file line number Diff line number Diff line change
Expand Up @@ -10,29 +10,29 @@ library(magrittr)
folder_path <- "~/dars_nic_391419_j3w9t_collab/CCU013/output/km_plots"
basefilename <- "CCU013_km_"

mycolors <- c("#74c476", "#006d2c", "#fe9929", "#ff6800", "#ff003e") # Positive test, GP diagnosis, Hospitalisation, ICU admission, Critical care outside ICU
mycolors <- c("#74c476", "#006d2c", "#fe9929", "#ff6800", "#ff003e") # Positive test, GP diagnosis, Hospitalisation, ICU admission, Ventilatory support outside ICU

# Main data
## Generated in Data bricks; Notebook CCU013_17_paper_survival_cohort
## Cohort data is used for stratifying on demographics; gender, age, ethnicity & IMD
cohort <- dbGetQuery(con, "SELECT person_id_deid, date_first, death, severity, death_fu_time FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort_survival")
nrow(cohort) # 3469528
nrow(cohort) # 7244925

## Wave population are used to generate wave comparison plots
wave1 <- dbGetQuery(con, "SELECT person_id_deid, date_first, death, severity, death_fu_time FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort_survival_wave1")
wave2 <- dbGetQuery(con, "SELECT person_id_deid, date_first, death, severity, death_fu_time FROM dars_nic_391419_j3w9t_collab.ccu013_covid_trajectory_paper_cohort_survival_wave2")
nrow(wave1) # 263839
nrow(wave2) # 2878573
nrow(wave1) # 264505
nrow(wave2) # 2889028
wave1[,"wave"] <- 1
wave2[,"wave"] <- 2
waves <- rbind(wave1, wave2)
waves <- as.data.frame(waves)

# Additional data
demo <- dbGetQuery(con, "SELECT * FROM dars_nic_391419_j3w9t_collab.ccu013_covid_events_demographics_paper_cohort")
nrow(demo) # 3469528
nrow(demo) # 7244925
vax <- dbGetQuery(con, "SELECT * FROM dars_nic_391419_j3w9t_collab.ccu013_no_covid_before_wave2_vax_status WHERE date is NOT NULL")
nrow(vax) # 2957762
nrow(vax) # 6731281

# Data prep
#----------------------------------------------------------------------------------------------------------------------
Expand Down Expand Up @@ -65,18 +65,18 @@ table(cohort$severity)
cohort[which(cohort[, "severity"] == "0_positive"), "severity"] <- "Positive test"
cohort[which(cohort[, "severity"] == "1_gp"), "severity"] <- "Primary care diagnosis"
cohort[which(cohort[, "severity"] == "2_hospitalised"), "severity"] <- "Hospitalisation"
cohort[which(cohort[, "severity"] == "03_critical_care_outside_ICU"), "severity"] <- "Critical care outside ICU"
cohort[which(cohort[, "severity"] == "3_ventilatory_support_outside_ICU"), "severity"] <- "Ventilatory support outside ICU"
cohort[which(cohort[, "severity"] == "4_icu_admission"), "severity"] <- "ICU admission"
cohort$severity <- factor(cohort$severity, levels = c("Positive test", "Primary care diagnosis", "Hospitalisation", "ICU admission", "Critical care outside ICU"))
cohort$severity <- factor(cohort$severity, levels = c("Positive test", "Primary care diagnosis", "Hospitalisation", "ICU admission", "Ventilatory support outside ICU"))
table(cohort$severity)

# Rename severity
waves[which(waves[, "severity"] == "0_positive"), "severity"] <- "Positive test"
waves[which(waves[, "severity"] == "1_gp"), "severity"] <- "Primary care diagnosis"
waves[which(waves[, "severity"] == "2_hospitalised"), "severity"] <- "Hospitalisation"
waves[which(waves[, "severity"] == "03_critical_care_outside_ICU"), "severity"] <- "Critical care outside ICU"
waves[which(waves[, "severity"] == "3_ventilatory_support_outside_ICU"), "severity"] <- "Ventilatory support outside ICU"
waves[which(waves[, "severity"] == "4_icu_admission"), "severity"] <- "ICU admission"
waves$severity <- factor(waves$severity, levels = c("Positive test", "Primary care diagnosis", "Hospitalisation", "ICU admission", "Critical care outside ICU"))
waves$severity <- factor(waves$severity, levels = c("Positive test", "Primary care diagnosis", "Hospitalisation", "ICU admission", "Ventilatory support outside ICU"))
table(waves$severity)


Expand Down Expand Up @@ -111,7 +111,7 @@ f1 <- survfit(Surv(death_fu_time, death) ~ severity,
pval.coord = c(3, 0.35),
break.time.by = 7,
legend.title = "",
legend.labs = c("Positive test", "Primary care diagnosis", "Hospitalisation", "ICU admission", "Critical care outside ICU"),
legend.labs = c("Positive test", "Primary care diagnosis", "Hospitalisation", "ICU admission", "Ventilatory support outside ICU"),
font.y = 8,
fontsize = 4,
title = 'COVID-19 Mortality - Stratified by worst healthcare presentation',
Expand All @@ -134,7 +134,7 @@ f2 <- survfit(Surv(death_fu_time, death) ~ severity,
pval.coord = c(3, 0.35),
break.time.by = 7,
legend.title = "",
legend.labs = c("Positive test", "Primary care diagnosis", "Hospitalisation", "ICU admission", "Critical care outside ICU"),
legend.labs = c("Positive test", "Primary care diagnosis", "Hospitalisation", "ICU admission", "Ventilatory support outside ICU"),
font.y = 8,
fontsize = 4,
title = 'COVID-19 Mortality - Stratified by worst healthcare presentation',
Expand Down
Loading

0 comments on commit 9cb5be1

Please sign in to comment.