From a4973b8522b02edc32c713bf516387215a91ddd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Mon, 24 Apr 2017 10:43:11 -0600 Subject: [PATCH 01/38] Fix comment header typo --- scripts/collect-raw-municipality.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/collect-raw-municipality.R b/scripts/collect-raw-municipality.R index 0fad7bf..5780d1a 100644 --- a/scripts/collect-raw-municipality.R +++ b/scripts/collect-raw-municipality.R @@ -140,7 +140,7 @@ file.remove(file_list) #------------------------------------------------------------------------------* -# Collect data from 2000-2010 period ---- +# Collect data from 2011-2015 period ---- #------------------------------------------------------------------------------* # Zip path From abe2bdd1e87cf535bb225616343e8a6ce84af127 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Mon, 24 Apr 2017 10:45:09 -0600 Subject: [PATCH 02/38] Improve parameter fixing docs --- scripts/collect-raw-municipality.R | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/scripts/collect-raw-municipality.R b/scripts/collect-raw-municipality.R index 5780d1a..cc86de0 100644 --- a/scripts/collect-raw-municipality.R +++ b/scripts/collect-raw-municipality.R @@ -178,20 +178,28 @@ read_population_2011 <- function(file_path, skip = 3){ # Report sheet (year) cat(" ", sheet) - fixes <- c( + + #--------------------------------------------------------------------------* + # Fix parameters for specific files / sheets + #--------------------------------------------------------------------------* + + # Fix individual skip parameters + fixes_skip <- c( "Totonicapan", "Suchitepequez", "Retalhuleu", "Quiche", "Alta Verapaz", "Peten", "Jutiapa", " Jalapa", "Chiquimula", "Izabal", "Santa Rosa" ) - # Configuration exceptions skip <- case_when( - # department == "Santa Rosa" & sheet == "2012" ~ 4, - # department == "Santa Rosa" & sheet == "2015" ~ 5, - department %in% fixes & sheet == "2012" ~ 4, - department %in% fixes & sheet == "2015" ~ 5, + department %in% fixes_skip & sheet == "2012" ~ 4, + department %in% fixes_skip & sheet == "2015" ~ 5, TRUE ~ skip ) + + #--------------------------------------------------------------------------* + # Get data + #--------------------------------------------------------------------------* + # Read file contents pop_sheet <- read_excel( path = file, sheet = sheet, skip = skip_lines , na = "?" From 0cf57a33142f7e4a17f0878bf89ccad4a4fb9c26 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Mon, 24 Apr 2017 14:30:43 -0600 Subject: [PATCH 03/38] Fix missing municipality names --- scripts/collect-raw-municipality.R | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/scripts/collect-raw-municipality.R b/scripts/collect-raw-municipality.R index cc86de0..838d2a2 100644 --- a/scripts/collect-raw-municipality.R +++ b/scripts/collect-raw-municipality.R @@ -195,6 +195,18 @@ read_population_2011 <- function(file_path, skip = 3){ TRUE ~ skip ) + # Fix missing municipality names + fix_municipalities <- function(.data){ + .data %>% + # Setup fixing rules + mutate( + municipality = case_when( + department == "Santa Rosa" & grepl("^X", municipality) ~ "Nueva Santa Rosa", + TRUE ~ municipality + ) + ) %>% + return() + } #--------------------------------------------------------------------------* # Get data @@ -238,6 +250,8 @@ read_population_2011 <- function(file_path, skip = 3){ ), department = department ) %>% + # Fix missingmunicipality names + fix_municipalities() %>% select(year, department, municipality, sex = sexo, age, population) %>% mutate( # Fix factors From 98074b51163eea422427c1e7cec96d93cf3d11c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Mon, 24 Apr 2017 17:10:06 -0600 Subject: [PATCH 04/38] Track lastest prerelease --- lastest-prerelease | 1 + 1 file changed, 1 insertion(+) create mode 100644 lastest-prerelease diff --git a/lastest-prerelease b/lastest-prerelease new file mode 100644 index 0000000..45c7a58 --- /dev/null +++ b/lastest-prerelease @@ -0,0 +1 @@ +v0.0.1 From 5a05e2b82fcbbd0694bc9f9732ef5c4bc0117618 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Thu, 27 Apr 2017 10:17:32 -0600 Subject: [PATCH 05/38] Report processed data path --- scripts/collect-raw-municipality.R | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/collect-raw-municipality.R b/scripts/collect-raw-municipality.R index 838d2a2..5b24e3d 100644 --- a/scripts/collect-raw-municipality.R +++ b/scripts/collect-raw-municipality.R @@ -590,12 +590,13 @@ pop_2016_2020 <- pop_2016_2020_predicted %>% population <- pop_2000_2015 %>% bind_rows(pop_2016_2020) +processed_file <- "data/processed/gt_2000_2020_municipality_population.RData" + # Save population data for use in R save( - population, file = "data/processed/gt_2000_2020_municipality_population.RData" + population, file = processed_file ) - - +cat(processed_file) # End of script From f9806bac58c29eb36006ff9516cd03498b91191a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Thu, 27 Apr 2017 10:18:07 -0600 Subject: [PATCH 06/38] Run processing script and load reported snapshot --- scripts/process-municipality-data.R | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 scripts/process-municipality-data.R diff --git a/scripts/process-municipality-data.R b/scripts/process-municipality-data.R new file mode 100644 index 0000000..a2f2fb2 --- /dev/null +++ b/scripts/process-municipality-data.R @@ -0,0 +1,19 @@ +#------------------------------------------------------------------------------* +# Produce population data at the municipality level +#------------------------------------------------------------------------------* + + +#------------------------------------------------------------------------------* +# Load data ---- +#------------------------------------------------------------------------------* + +# Prepare snapshot from raw data +processed_file <- system("Rscript scripts/collect-raw-municipality.R", intern = TRUE) +load(file = processed_file[length(processed_file)] ) + + +# Clean up +rm(processed_file) + + +# End of script From e5bcfdcaa3b053612e044524be435f92ecfabef3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Thu, 27 Apr 2017 10:19:17 -0600 Subject: [PATCH 07/38] Downlad snapshot from last prerelease --- scripts/get-processed-tarball.R | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 scripts/get-processed-tarball.R diff --git a/scripts/get-processed-tarball.R b/scripts/get-processed-tarball.R new file mode 100644 index 0000000..0441398 --- /dev/null +++ b/scripts/get-processed-tarball.R @@ -0,0 +1,31 @@ + + +library(package = "jsonlite") +library(package = "httr") +library(package = "tidyverse") + +# Set request parameters +prerelease <- scan(file = "lastest-prerelease", what = "character") + +releases <- GET( + paste0( + "https://api.github.com/repos/odeleongt/gt-population/releases/tags/", + prerelease + ) +) + +# Get asset uri +asset <- releases %>% + content(as = "text") %>% + fromJSON() %>% + map(~ifelse(is.list(.x), as.data.frame(list(.x)), .x)) %>% + flatten_df() %>% + pull(assets) %>% + GET %>% + content(as = "text") %>% + fromJSON() %>% + getElement("browser_download_url") + +# Download file +download.file(asset, "data/processed/gt_2000_2020_municipality_population.RData") + From c9974fc860117c21ae414f72189a910df1ae6c38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Thu, 27 Apr 2017 15:17:25 -0600 Subject: [PATCH 08/38] Setup vital statistics --- scripts/collect-raw-vital-stats.R | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 scripts/collect-raw-vital-stats.R diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R new file mode 100644 index 0000000..db23bf9 --- /dev/null +++ b/scripts/collect-raw-vital-stats.R @@ -0,0 +1,20 @@ +#------------------------------------------------------------------------------* +# Collect raw vital statistics at the department level +#------------------------------------------------------------------------------* + +#------------------------------------------------------------------------------* +# Prepare environment ---- +#------------------------------------------------------------------------------* + +# Load used packages +library(package = "lubridate") +library(package = "haven") +library(package = "tidyverse") + +# Set metadata +data_path <- "data/raw/vital-statistics/" + + + + +# End of script From ab62c7948d215704061e0d7358b2da06c0dbf3c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Thu, 27 Apr 2017 15:51:37 -0600 Subject: [PATCH 09/38] Prepare births data --- scripts/collect-raw-vital-stats.R | 50 +++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index db23bf9..66604fb 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -17,4 +17,54 @@ data_path <- "data/raw/vital-statistics/" +#------------------------------------------------------------------------------* +# Load births data ---- +#------------------------------------------------------------------------------* + +# Read labeled data +births <- list.files(path = data_path, pattern = "births") %>% + paste0(data_path, .) %>% + set_names(gsub("[^0-9]", "", .)) %>% + map(read_spss) %>% + map(~set_names(.x, tolower(iconv(names(.x), to = "ASCII//TRANSLIT")))) + +# Read labels from imported data +birth_labels <- births %>% + map(~map(.x, ~attr(.x, "labels"))) + +# Bind imported datasets +births <- births %>% + map(~map_df(.x, zap_labels)) %>% + map(~map_df(.x, as.character)) %>% + bind_rows(.id = "file_year") + +# Prepare birth "events" dataset +births <- births %>% + mutate( + record_date = ymd( + paste( + stringr::str_pad(anoreg, width = 2, side = "left", pad = "0"), + mesreg, "01", sep = "-" + ) + ), + anoocu = ifelse(is.na(anoocu), file_year, anoocu), + event_date = ymd( + paste( + stringr::str_pad(anoocu, width = 2, side = "left", pad = "0"), + mesocu, diaocu, sep = "-" + ) + ) + ) %>% + select( + # Record metadata + record_date, record_department = depreg, record_municipality = mupreg, + # Event data + event_date, event_department = depocu, event_municipality = mupocu, + # Mother residency location + mother_department = deprem, mother_municipality = muprem + ) + + + + # End of script From 69c863b5d41c4a3e1e3a55419e93729aa2a5d469 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Thu, 27 Apr 2017 16:09:50 -0600 Subject: [PATCH 10/38] Prepare deaths data --- scripts/collect-raw-vital-stats.R | 65 +++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index 66604fb..329a5c1 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -67,4 +67,69 @@ births <- births %>% +#------------------------------------------------------------------------------* +# Load deaths data ---- +#------------------------------------------------------------------------------* + +# Read labeled data +deaths <- list.files(path = data_path, pattern = "deaths") %>% + paste0(data_path, .) %>% + set_names(gsub("[^0-9]", "", .)) %>% + map(read_spss) %>% + map(~set_names(.x, tolower(iconv(names(.x), to = "ASCII//TRANSLIT")))) + +# Read labels from imported data +death_labels <- deaths %>% + map(~map(.x, ~attr(.x, "labels"))) + +# Bind imported datasets +deaths <- deaths %>% + map(~map_df(.x, zap_labels)) %>% + map(~map_df(.x, as.character)) %>% + bind_rows(.id = "file_year") + +# Prepare birth "events" dataset +deaths <- deaths %>% + # Ignore unknown ages + filter( + as.integer(edadif) < 999, + perdif != "9" + ) %>% + mutate( + record_date = ymd( + paste( + stringr::str_pad(anoreg, width = 2, side = "left", pad = "0"), + mesreg, "01", sep = "-" + ) + ), + anoocu = ifelse(is.na(anoocu), file_year, anoocu), + event_date = ymd( + paste( + stringr::str_pad(anoocu, width = 2, side = "left", pad = "0"), + mesocu, diaocu, sep = "-" + ) + ), + age_unit = recode( + perdif, + "1" = "days", + "2" = "months", + "3" = "years", + .default = NA_character_, + .missing = NA_character_ + ), + birth_date = event_date - period(edadif, units = age_unit), + age_days = as.integer(event_date - birth_date) + ) %>% + select( + # Record metadata + record_date, record_department = depreg, record_municipality = mupreg, + # Event data + event_date, event_department = depocu, event_municipality = mupocu, + # Deceased data + birth_date, age_days + ) + + + + # End of script From ebddf18be187511be11b3bafa1f3fe6bbed58eac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Thu, 27 Apr 2017 16:54:24 -0600 Subject: [PATCH 11/38] Add event year for joins --- scripts/collect-raw-vital-stats.R | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index 329a5c1..9185008 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -53,13 +53,14 @@ births <- births %>% stringr::str_pad(anoocu, width = 2, side = "left", pad = "0"), mesocu, diaocu, sep = "-" ) - ) + ), + event_year = year(event_date) ) %>% select( + # Event data + event_year, event_date, event_department = depocu, event_municipality = mupocu, # Record metadata record_date, record_department = depreg, record_municipality = mupreg, - # Event data - event_date, event_department = depocu, event_municipality = mupocu, # Mother residency location mother_department = deprem, mother_municipality = muprem ) @@ -109,6 +110,7 @@ deaths <- deaths %>% mesocu, diaocu, sep = "-" ) ), + event_year = year(event_date), age_unit = recode( perdif, "1" = "days", @@ -121,10 +123,10 @@ deaths <- deaths %>% age_days = as.integer(event_date - birth_date) ) %>% select( + # Event data + event_year, event_date, event_department = depocu, event_municipality = mupocu, # Record metadata record_date, record_department = depreg, record_municipality = mupreg, - # Event data - event_date, event_department = depocu, event_municipality = mupocu, # Deceased data birth_date, age_days ) From e6ec7316df042d0831e1fc542e1b57a0c99d565d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Thu, 27 Apr 2017 16:58:00 -0600 Subject: [PATCH 12/38] Mid years and births years --- scripts/collect-raw-vital-stats.R | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index 9185008..95831c1 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -68,6 +68,28 @@ births <- births %>% +#------------------------------------------------------------------------------* +# Calculate mid-year counts ---- +#------------------------------------------------------------------------------* + +# Mid year dates +mid_years <- births %>% + pull(event_date) %>% + year() %>% + unique() %>% + paste0("-07-01") %>% + ymd() %>% + data_frame(mid_year = .) %>% + group_by(mid_year) %>% + do({ + data_frame( + event_year = seq(min(year(births$event_date)), year(.$mid_year)) + ) + }) + + + + #------------------------------------------------------------------------------* # Load deaths data ---- #------------------------------------------------------------------------------* From 090c257339439b5fb057131400454d2c4e16c478 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Fri, 28 Apr 2017 17:23:06 -0600 Subject: [PATCH 13/38] Births by mid year --- scripts/collect-raw-vital-stats.R | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index 95831c1..d9bcf28 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -85,7 +85,18 @@ mid_years <- births %>% data_frame( event_year = seq(min(year(births$event_date)), year(.$mid_year)) ) - }) + }) %>% + ungroup() + +# Calculate mid year counts +mid_year_counts <- mid_years %>% + # Relevant births for each year + left_join(births) %>% + filter(event_date < mid_year) %>% + # Calculate age at mid year + mutate( + age_days = as.integer(mid_year - event_date) + ) From 06f6953dfc6e60ed146353152ea3aefef339b135 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Tue, 2 May 2017 10:41:22 -0600 Subject: [PATCH 14/38] Add age periods --- scripts/collect-raw-vital-stats.R | 49 +++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index d9bcf28..262e711 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -68,6 +68,55 @@ births <- births %>% +#------------------------------------------------------------------------------* +# Define age groups ---- +#------------------------------------------------------------------------------* + +# Mutually exclusive ages +age_periods_exclusive <- tibble( + correlative = c( + 1, 2, 3, 4, + 5, 6, 7, + 8 + ), + label = c( + # Mutually exclusive + "0-27 days", "28 days-<3 month", "3-5 months", "6-8 months", + "9-11 months", "12-23 months", "24-35 months", + "36-59 months" + ), + period = c( + # Mutually exclusive + days(27), months(3) - days(1), months(6) - days(1), months(9) - days(1), + months(12) - days(1), months(24) - days(1), months(36) - days(1), + months(60) - days(1) + ) +) + +# Cumulative 1 +age_periods_cumulative <- tibble( + correlative = c( + 1, 2, 3, + 4 + ), + label = c( + # Cumulative + "0-11 months", "12-59 months", "24-59 months", + "0-59 months" + ), + period = c( + # Cumulative + months(12) - days(1), months(60) - days(1), months(60) - days(1), + months(60) - days(1) + ) +) + +# One off +# "24-59 months", "0-59 months" + + + + #------------------------------------------------------------------------------* # Calculate mid-year counts ---- #------------------------------------------------------------------------------* From 66fc1c50dd3844383eace5a5efcb7b9c4e69a107 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Tue, 2 May 2017 16:23:32 -0600 Subject: [PATCH 15/38] Calculate mid-year ages --- scripts/collect-raw-vital-stats.R | 37 +++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 7 deletions(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index 262e711..382a70b 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -137,16 +137,39 @@ mid_years <- births %>% }) %>% ungroup() -# Calculate mid year counts -mid_year_counts <- mid_years %>% +# Calculate mid year ages given birth date +mid_year_ages <- mid_years %>% # Relevant births for each year - left_join(births) %>% + left_join(unique(select(births, event_year, event_date))) %>% filter(event_date < mid_year) %>% - # Calculate age at mid year - mutate( - age_days = as.integer(mid_year - event_date) - ) + # Get age at each mid year for each unique birth date + group_by(mid_year, event_date) %>% + do( + age_exclusive = mutate( + age_periods_exclusive, + age_date = .$event_date + period + ), + age_cumulative = mutate( + age_periods_cumulative, + age_date = .$event_date + period + ) + ) %>% + ungroup() + +# Label exclusive ages +exclusive_ages <- mid_year_ages %>% + unnest(age_exclusive) %>% + filter(age_date < mid_year) %>% + group_by(mid_year, event_date) %>% + filter(correlative == max(correlative)) + +# Label cumulative ages +cumulative_ages <- mid_year_ages %>% + unnest(age_cumulative) %>% + filter(age_date < mid_year) %>% + group_by(mid_year, event_date) %>% + filter(correlative == max(correlative)) From 0a3523294682c5279d7aebe214def9ed68591b1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Fri, 5 May 2017 12:15:09 -0600 Subject: [PATCH 16/38] Fix age group assignment and calculate mid-year alive counts --- scripts/collect-raw-vital-stats.R | 189 ++++++++++++++++++++---------- 1 file changed, 125 insertions(+), 64 deletions(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index 382a70b..dbac60f 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -72,48 +72,15 @@ births <- births %>% # Define age groups ---- #------------------------------------------------------------------------------* -# Mutually exclusive ages -age_periods_exclusive <- tibble( - correlative = c( - 1, 2, 3, 4, - 5, 6, 7, - 8 - ), - label = c( - # Mutually exclusive - "0-27 days", "28 days-<3 month", "3-5 months", "6-8 months", - "9-11 months", "12-23 months", "24-35 months", - "36-59 months" - ), - period = c( - # Mutually exclusive - days(27), months(3) - days(1), months(6) - days(1), months(9) - days(1), - months(12) - days(1), months(24) - days(1), months(36) - days(1), - months(60) - days(1) - ) -) - -# Cumulative 1 -age_periods_cumulative <- tibble( - correlative = c( - 1, 2, 3, - 4 - ), - label = c( - # Cumulative - "0-11 months", "12-59 months", "24-59 months", - "0-59 months" - ), - period = c( - # Cumulative - months(12) - days(1), months(60) - days(1), months(60) - days(1), - months(60) - days(1) - ) +# Order age groups for output +age_groups <- c( + "0-27 days", "28 days-<3 month", "3-5 months", "6-8 months", "9-11 months", + "0-11 months", + "12-23 months", "24-35 months", "36-59 months", + "0-59 months", + "12-59 months", "24-59 months" ) -# One off -# "24-59 months", "0-59 months" - @@ -128,7 +95,7 @@ mid_years <- births %>% unique() %>% paste0("-07-01") %>% ymd() %>% - data_frame(mid_year = .) %>% + data_frame(mid_year = ., event_year = NA) %>% group_by(mid_year) %>% do({ data_frame( @@ -141,35 +108,129 @@ mid_years <- births %>% mid_year_ages <- mid_years %>% # Relevant births for each year left_join(unique(select(births, event_year, event_date))) %>% - filter(event_date < mid_year) %>% - # Get age at each mid year for each unique birth date - group_by(mid_year, event_date) %>% - do( - age_exclusive = mutate( - age_periods_exclusive, - age_date = .$event_date + period + filter(event_date < mid_year) + + +labeled_ages <- mid_year_ages %>% + # Tag all births for both age group types + mutate( + group = "exclusive", + correlative = 1, + label = "0-27 days", + date_threshold = mid_year - days(27) + ) %>% + bind_rows( + mutate( + ., + group = "exclusive", + correlative = 2, + label = "28 days-<3 month", + date_threshold = mid_year - months(3) + ), + mutate( + ., + group = "exclusive", + correlative = 3, + label = "3-5 months", + date_threshold = mid_year - months(6) + ), + mutate( + ., + group = "exclusive", + correlative = 4, + label = "6-8 months", + date_threshold = mid_year - months(9) + ), + mutate( + ., + group = "exclusive", + correlative = 5, + label = "9-11 months", + date_threshold = mid_year - months(12) + ), + mutate( + ., + group = "exclusive", + correlative = 6, + label = "12-23 months", + date_threshold = mid_year - months(24) + ), + mutate( + ., + group = "exclusive", + correlative = 7, + label = "24-35 months", + date_threshold = mid_year - months(36) ), - age_cumulative = mutate( - age_periods_cumulative, - age_date = .$event_date + period + mutate( + ., + group = "exclusive", + correlative = 8, + label = "36-59 months", + date_threshold = mid_year - months(60) ) + ) %>% + # Assign possible age groups + mutate( + keep = event_date > date_threshold ) %>% - ungroup() - -# Label exclusive ages -exclusive_ages <- mid_year_ages %>% - unnest(age_exclusive) %>% - filter(age_date < mid_year) %>% + filter(keep) %>% + select(-keep, -date_threshold) %>% + # Pick oldest applicable age group + arrange(mid_year, event_date, correlative) %>% group_by(mid_year, event_date) %>% - filter(correlative == max(correlative)) + filter(correlative == min(correlative)) %>% + ungroup %>% + select(mid_year, event_date, label) -# Label cumulative ages -cumulative_ages <- mid_year_ages %>% - unnest(age_cumulative) %>% - filter(age_date < mid_year) %>% - group_by(mid_year, event_date) %>% - filter(correlative == max(correlative)) +# Label other cummulative age groups +labeled_ages2 <- mid_year_ages %>% + select(mid_year, event_date) %>% + mutate( + months_12 = mid_year - months(12), + months_24 = mid_year - months(25) + days(1), + months_59 = mid_year - months(60), + # born 12-59 months before midyear + "12-59 months" = event_date > months_59 & event_date <= months_12, + # born 24-59 months before midyear + "24-59 months" = event_date > months_59 & event_date <= months_24, + # Any age before 12 months + "0-11 months" = event_date > months_12 & event_date < mid_year, + # Any age before 60 months + "0-59 months" = event_date > months_59 & event_date < mid_year + ) %>% + select(-months_12, -months_24, -months_59) %>% + gather(key = label, value = keep, -mid_year, -event_date) %>% + filter(keep) %>% + select(-keep) + + +# Bind labeled birth dates +birth_age_groups <- labeled_ages %>% + bind_rows(labeled_ages2) + + +# Count live people by age group +alive <- births %>% + # Births by date for each location + count( + year = event_year, + department = event_department, municipality = event_municipality, + event_date + ) %>% + # Label with ages at each mid-year + left_join(birth_age_groups) %>% + # Only keep births inside the mid year pediods + filter(!is.na(label)) %>% + # Children alive by age group at each mid-year + count(year = year(mid_year), department, municipality, age_group = label) %>% + rename(alive = nn) %>% + mutate( + age_group = factor(age_group, levels = age_groups, ordered = TRUE) + ) %>% + arrange(department, municipality, year, age_group) + From b7b744e9be4fa96baa7556e0def94928029b4bfc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Mon, 8 May 2017 16:17:00 -0600 Subject: [PATCH 17/38] Keep complete 9-11 months groups --- scripts/collect-raw-vital-stats.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index dbac60f..a3c377b 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -172,7 +172,7 @@ labeled_ages <- mid_year_ages %>% ) %>% # Assign possible age groups mutate( - keep = event_date > date_threshold + keep = event_date >= date_threshold ) %>% filter(keep) %>% select(-keep, -date_threshold) %>% From a852360fdcb98a2676e5d6dc7ae560f41b92cd94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Mon, 8 May 2017 16:20:02 -0600 Subject: [PATCH 18/38] Ignore data before 2009-07-01 A complete cohort for these births can not be established, only succesive years can be completed with 2004-2008 population projections. --- scripts/collect-raw-vital-stats.R | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index a3c377b..cc46f5d 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -221,8 +221,14 @@ alive <- births %>% ) %>% # Label with ages at each mid-year left_join(birth_age_groups) %>% - # Only keep births inside the mid year pediods - filter(!is.na(label)) %>% + filter( + # Only keep births inside the mid year pediods + !is.na(label), + # Ignore first half of 2009 for every succesive year + (mid_year > ymd("2009-07-01") & event_date >= ymd("2009-07-01")), + # Ignore first year, which can not be completed + mid_year > ymd("2009-07-01") + ) %>% # Children alive by age group at each mid-year count(year = year(mid_year), department, municipality, age_group = label) %>% rename(alive = nn) %>% From b2b06e23c8b99d265755f7ce4ee7b113ae728a91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Mon, 8 May 2017 17:03:59 -0600 Subject: [PATCH 19/38] Fix one-of categories --- scripts/collect-raw-vital-stats.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index cc46f5d..28caba5 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -192,13 +192,13 @@ labeled_ages2 <- mid_year_ages %>% months_24 = mid_year - months(25) + days(1), months_59 = mid_year - months(60), # born 12-59 months before midyear - "12-59 months" = event_date > months_59 & event_date <= months_12, + "12-59 months" = event_date >= months_59 & event_date < months_12, # born 24-59 months before midyear - "24-59 months" = event_date > months_59 & event_date <= months_24, + "24-59 months" = event_date >= months_59 & event_date < months_24, # Any age before 12 months - "0-11 months" = event_date > months_12 & event_date < mid_year, + "0-11 months" = event_date >= months_12 & event_date < mid_year, # Any age before 60 months - "0-59 months" = event_date > months_59 & event_date < mid_year + "0-59 months" = event_date >= months_59 & event_date < mid_year ) %>% select(-months_12, -months_24, -months_59) %>% gather(key = label, value = keep, -mid_year, -event_date) %>% From 7970e2434d339f89fe9c4d055137e524cc34dfba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Mon, 8 May 2017 18:51:52 -0600 Subject: [PATCH 20/38] Add location data --- scripts/collect-raw-vital-stats.R | 41 +++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index 28caba5..21ee25b 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -17,6 +17,47 @@ data_path <- "data/raw/vital-statistics/" +#------------------------------------------------------------------------------* +# Load locations data ---- +#------------------------------------------------------------------------------* + +# Get municipality codes +municipalities <- sf::read_sf("data/raw/geo-data/municipalities.shp") %>% + select(muni_id = COD_MUNI, municipality = MUNICIPIO) %>% + mutate( + # Fix encoding and case + municipality = iconv(municipality, from = "Latin1", to = "ASCII//TRANSLIT"), + municipality = gsub("(^| )([a-z])", "\\1\\U\\2", municipality, perl = TRUE), + # Fix errors in names + municipality = recode( + muni_id, + "0413" = "San Andres Itzapa", + "2009" = "Quezaltepeque", + "0204" = "San Cristobal Acasaguastlan", + "0502" = "Santa Lucia Cotzumalguapa", + "0506" = "Tiquizate", + "0117" = "San Miguel Petapa", + "0111" = "San Raimundo", + "1314" = "San Rafael La Independencia", + "1320" = "San Sebastian Huehuetenango", + "1326" = "Santa Cruz Barillas", + "2107" = "Mataquescuintla", + "2217" = "Quezada", + "1420" = "Ixcan", + "1105" = "San Felipe", + "0712" = "San Antonio Palopo", + "0711" = "Santa Catarina Palopo", + "1004" = "San Bernardino", + "1009" = "San Pablo Jocopilas", + "1011" = "San Miguel Panan", + "0806" = "Santa Maria Chiquimula", + .default = municipality + ) + ) + + + + #------------------------------------------------------------------------------* # Load births data ---- #------------------------------------------------------------------------------* From dfabf50af6143346ace80743c8cdf987a920e70c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Mon, 8 May 2017 19:26:38 -0600 Subject: [PATCH 21/38] Label municipalities --- scripts/collect-raw-vital-stats.R | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index 21ee25b..ed45e8a 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -276,7 +276,21 @@ alive <- births %>% mutate( age_group = factor(age_group, levels = age_groups, ordered = TRUE) ) %>% - arrange(department, municipality, year, age_group) + arrange(department, municipality, year, age_group) %>% + # Fix municipality codes + mutate( + municipality = ifelse( + test = as.integer(municipality) < 99, + yes = paste0( + stringr::str_pad(department, width = 2, side = "left", pad = "0"), + stringr::str_pad(municipality, width = 2, side = "left", pad = "0") + ), + no = municipality + ) + ) %>% + # Tag with municipality names + rename(muni_id = municipality) %>% + left_join(municipalities) From 5cbdc5728d6d5121634b24beb9ae72a4c7e72192 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Mon, 8 May 2017 19:26:47 -0600 Subject: [PATCH 22/38] Filter needed departments --- scripts/collect-raw-vital-stats.R | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index ed45e8a..50049fc 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -277,6 +277,8 @@ alive <- births %>% age_group = factor(age_group, levels = age_groups, ordered = TRUE) ) %>% arrange(department, municipality, year, age_group) %>% + # Keep only used departments + filter(department %in% c("6", "9")) %>% # Fix municipality codes mutate( municipality = ifelse( From 51183d90722e58f6d5321ae652619faf7900711d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Mon, 8 May 2017 19:30:01 -0600 Subject: [PATCH 23/38] Label departments --- scripts/collect-raw-vital-stats.R | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index 50049fc..2885232 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -279,8 +279,8 @@ alive <- births %>% arrange(department, municipality, year, age_group) %>% # Keep only used departments filter(department %in% c("6", "9")) %>% - # Fix municipality codes mutate( + # Fix municipality codes municipality = ifelse( test = as.integer(municipality) < 99, yes = paste0( @@ -288,6 +288,12 @@ alive <- births %>% stringr::str_pad(municipality, width = 2, side = "left", pad = "0") ), no = municipality + ), + # Label departments + department = recode( + department, + "6" = "Santa Rosa", + "9" = "Quetzaltenango" ) ) %>% # Tag with municipality names From fbbf457ee0224774f6c98533800ccf0f3916bd24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Mon, 8 May 2017 19:30:15 -0600 Subject: [PATCH 24/38] Select used variables --- scripts/collect-raw-vital-stats.R | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index 2885232..39ac40f 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -298,7 +298,8 @@ alive <- births %>% ) %>% # Tag with municipality names rename(muni_id = municipality) %>% - left_join(municipalities) + left_join(municipalities) %>% + select(year, department, municipality, age_group, alive) From 40c4a9d9bab6b217ec189ab41ea4d653df4109c3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Tue, 9 May 2017 08:02:10 -0600 Subject: [PATCH 25/38] Use mother origin --- scripts/collect-raw-vital-stats.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index 39ac40f..77e7c1c 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -257,7 +257,7 @@ alive <- births %>% # Births by date for each location count( year = event_year, - department = event_department, municipality = event_municipality, + department = mother_department, municipality = mother_municipality, event_date ) %>% # Label with ages at each mid-year From 60b05e7cae356f02b9bdeac04625531bbda46cc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Tue, 9 May 2017 08:02:37 -0600 Subject: [PATCH 26/38] Get population from projections --- scripts/collect-raw-vital-stats.R | 45 +++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index 77e7c1c..ba84cd5 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -304,6 +304,51 @@ alive <- births %>% +#------------------------------------------------------------------------------* +# Fix missing ages for each mid-year ---- +#------------------------------------------------------------------------------* +# Use simple year age projections data +#------------------------------------------------------------------------------* + +# Prepare simple year projections from raw data +processed_file <- system("Rscript scripts/collect-raw-municipality.R", intern = TRUE) +load(file = processed_file[length(processed_file)] ) +rm(processed_file) + +# Sum simple year projections for +simple_year <- population %>% + filter( + between(year, 2010, 2011), # Years missing data + between(age, 0, 4), # Relevant ages + department %in% c("Santa Rosa", "Quetzaltenango") # Relevant departments + ) %>% + group_by(year, department, municipality, age) %>% + summarize(population = sum(population)) %>% + # Keep necessary data + filter( + (year == 2013 & age == 4) | + (year == 2012 & between(age, 3, 4)) | + (year == 2011 & between(age, 2, 4)) | + (year == 2010 & between(age, 1, 4)) + ) %>% + # Label age groups + mutate( + age_group = recode( + age, + "1" = "12-23 months", + "2" = "24-35 months", + "3" = "36-59 months", + "4" = "36-59 months" + ), + age_group = factor(age_group, levels = age_groups, ordered = TRUE) + ) %>% + # Summarize + group_by(year, department, municipality, age_group) %>% + summarize(alive = sum(population)) + + + + #------------------------------------------------------------------------------* # Load deaths data ---- #------------------------------------------------------------------------------* From 8547e56503667ff1f24a85d03c844625e470a7e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Tue, 16 May 2017 12:17:34 -0600 Subject: [PATCH 27/38] Use data for all study years --- scripts/collect-raw-vital-stats.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index ba84cd5..fff3e03 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -318,7 +318,7 @@ rm(processed_file) # Sum simple year projections for simple_year <- population %>% filter( - between(year, 2010, 2011), # Years missing data + between(year, 2010, 2015), # Years missing data between(age, 0, 4), # Relevant ages department %in% c("Santa Rosa", "Quetzaltenango") # Relevant departments ) %>% From 8d3e42af6392f08733f64a33e4ea4bffc792f12b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Tue, 16 May 2017 12:20:49 -0600 Subject: [PATCH 28/38] Process deaths data before counting population --- scripts/collect-raw-vital-stats.R | 90 +++++++++++++++---------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index fff3e03..ab1fbf1 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -304,51 +304,6 @@ alive <- births %>% -#------------------------------------------------------------------------------* -# Fix missing ages for each mid-year ---- -#------------------------------------------------------------------------------* -# Use simple year age projections data -#------------------------------------------------------------------------------* - -# Prepare simple year projections from raw data -processed_file <- system("Rscript scripts/collect-raw-municipality.R", intern = TRUE) -load(file = processed_file[length(processed_file)] ) -rm(processed_file) - -# Sum simple year projections for -simple_year <- population %>% - filter( - between(year, 2010, 2015), # Years missing data - between(age, 0, 4), # Relevant ages - department %in% c("Santa Rosa", "Quetzaltenango") # Relevant departments - ) %>% - group_by(year, department, municipality, age) %>% - summarize(population = sum(population)) %>% - # Keep necessary data - filter( - (year == 2013 & age == 4) | - (year == 2012 & between(age, 3, 4)) | - (year == 2011 & between(age, 2, 4)) | - (year == 2010 & between(age, 1, 4)) - ) %>% - # Label age groups - mutate( - age_group = recode( - age, - "1" = "12-23 months", - "2" = "24-35 months", - "3" = "36-59 months", - "4" = "36-59 months" - ), - age_group = factor(age_group, levels = age_groups, ordered = TRUE) - ) %>% - # Summarize - group_by(year, department, municipality, age_group) %>% - summarize(alive = sum(population)) - - - - #------------------------------------------------------------------------------* # Load deaths data ---- #------------------------------------------------------------------------------* @@ -415,4 +370,49 @@ deaths <- deaths %>% +#------------------------------------------------------------------------------* +# Fix missing ages for each mid-year ---- +#------------------------------------------------------------------------------* +# Use simple year age projections data +#------------------------------------------------------------------------------* + +# Prepare simple year projections from raw data +processed_file <- system("Rscript scripts/collect-raw-municipality.R", intern = TRUE) +load(file = processed_file[length(processed_file)] ) +rm(processed_file) + +# Sum simple year projections for +simple_year <- population %>% + filter( + between(year, 2010, 2015), # Years missing data + between(age, 0, 4), # Relevant ages + department %in% c("Santa Rosa", "Quetzaltenango") # Relevant departments + ) %>% + group_by(year, department, municipality, age) %>% + summarize(population = sum(population)) %>% + # Keep necessary data + filter( + (year == 2013 & age == 4) | + (year == 2012 & between(age, 3, 4)) | + (year == 2011 & between(age, 2, 4)) | + (year == 2010 & between(age, 1, 4)) + ) %>% + # Label age groups + mutate( + age_group = recode( + age, + "1" = "12-23 months", + "2" = "24-35 months", + "3" = "36-59 months", + "4" = "36-59 months" + ), + age_group = factor(age_group, levels = age_groups, ordered = TRUE) + ) %>% + # Summarize + group_by(year, department, municipality, age_group) %>% + summarize(alive = sum(population)) + + + + # End of script From 246b6c307b79508162827b76f4c99572e431eb90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Tue, 16 May 2017 12:22:29 -0600 Subject: [PATCH 29/38] Clarify count membership in variables (births, population) --- scripts/collect-raw-vital-stats.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index ab1fbf1..10f67c6 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -253,7 +253,7 @@ birth_age_groups <- labeled_ages %>% # Count live people by age group -alive <- births %>% +local_births <- births %>% # Births by date for each location count( year = event_year, @@ -270,9 +270,9 @@ alive <- births %>% # Ignore first year, which can not be completed mid_year > ymd("2009-07-01") ) %>% - # Children alive by age group at each mid-year + # Children born by age group at each mid-year count(year = year(mid_year), department, municipality, age_group = label) %>% - rename(alive = nn) %>% + rename(births = nn) %>% mutate( age_group = factor(age_group, levels = age_groups, ordered = TRUE) ) %>% @@ -299,7 +299,7 @@ alive <- births %>% # Tag with municipality names rename(muni_id = municipality) %>% left_join(municipalities) %>% - select(year, department, municipality, age_group, alive) + select(year, department, municipality, age_group, births) @@ -410,7 +410,7 @@ simple_year <- population %>% ) %>% # Summarize group_by(year, department, municipality, age_group) %>% - summarize(alive = sum(population)) + summarize(population = sum(population)) From 3f966a4c0ddbb229aca4f67a471ec808498b98b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Tue, 16 May 2017 12:53:09 -0600 Subject: [PATCH 30/38] Fix first age group --- scripts/collect-raw-vital-stats.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index 10f67c6..039d437 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -158,7 +158,7 @@ labeled_ages <- mid_year_ages %>% group = "exclusive", correlative = 1, label = "0-27 days", - date_threshold = mid_year - days(27) + date_threshold = mid_year - days(28) ) %>% bind_rows( mutate( From 054fe3beb441683aeffc4911610ebe6e892e8bc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Tue, 16 May 2017 13:45:56 -0600 Subject: [PATCH 31/38] Only count mid-year births for age groups <1 year --- scripts/collect-raw-vital-stats.R | 54 +++---------------------------- 1 file changed, 5 insertions(+), 49 deletions(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index 039d437..7b39f9a 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -128,6 +128,10 @@ age_groups <- c( #------------------------------------------------------------------------------* # Calculate mid-year counts ---- #------------------------------------------------------------------------------* +# Mid year counts for ages < 1 year used to estimate the proportion of children +# in each age group, which will be used to estimate the population in each +# age group from the official population projections. +#------------------------------------------------------------------------------* # Mid year dates mid_years <- births %>% @@ -188,27 +192,6 @@ labeled_ages <- mid_year_ages %>% correlative = 5, label = "9-11 months", date_threshold = mid_year - months(12) - ), - mutate( - ., - group = "exclusive", - correlative = 6, - label = "12-23 months", - date_threshold = mid_year - months(24) - ), - mutate( - ., - group = "exclusive", - correlative = 7, - label = "24-35 months", - date_threshold = mid_year - months(36) - ), - mutate( - ., - group = "exclusive", - correlative = 8, - label = "36-59 months", - date_threshold = mid_year - months(60) ) ) %>% # Assign possible age groups @@ -225,33 +208,6 @@ labeled_ages <- mid_year_ages %>% select(mid_year, event_date, label) -# Label other cummulative age groups -labeled_ages2 <- mid_year_ages %>% - select(mid_year, event_date) %>% - mutate( - months_12 = mid_year - months(12), - months_24 = mid_year - months(25) + days(1), - months_59 = mid_year - months(60), - # born 12-59 months before midyear - "12-59 months" = event_date >= months_59 & event_date < months_12, - # born 24-59 months before midyear - "24-59 months" = event_date >= months_59 & event_date < months_24, - # Any age before 12 months - "0-11 months" = event_date >= months_12 & event_date < mid_year, - # Any age before 60 months - "0-59 months" = event_date >= months_59 & event_date < mid_year - ) %>% - select(-months_12, -months_24, -months_59) %>% - gather(key = label, value = keep, -mid_year, -event_date) %>% - filter(keep) %>% - select(-keep) - - -# Bind labeled birth dates -birth_age_groups <- labeled_ages %>% - bind_rows(labeled_ages2) - - # Count live people by age group local_births <- births %>% # Births by date for each location @@ -261,7 +217,7 @@ local_births <- births %>% event_date ) %>% # Label with ages at each mid-year - left_join(birth_age_groups) %>% + left_join(labeled_ages) %>% filter( # Only keep births inside the mid year pediods !is.na(label), From d7ed5c30654e911b797c61aa31adce9ec0740ae1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Tue, 16 May 2017 16:05:07 -0600 Subject: [PATCH 32/38] Not relevant to have separate groups --- scripts/collect-raw-vital-stats.R | 4 ---- 1 file changed, 4 deletions(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index 7b39f9a..086f04e 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -159,7 +159,6 @@ mid_year_ages <- mid_years %>% labeled_ages <- mid_year_ages %>% # Tag all births for both age group types mutate( - group = "exclusive", correlative = 1, label = "0-27 days", date_threshold = mid_year - days(28) @@ -167,21 +166,18 @@ labeled_ages <- mid_year_ages %>% bind_rows( mutate( ., - group = "exclusive", correlative = 2, label = "28 days-<3 month", date_threshold = mid_year - months(3) ), mutate( ., - group = "exclusive", correlative = 3, label = "3-5 months", date_threshold = mid_year - months(6) ), mutate( ., - group = "exclusive", correlative = 4, label = "6-8 months", date_threshold = mid_year - months(9) From 6273e97ab2789eed5eb684219b18b19478bc80ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Tue, 16 May 2017 18:40:20 -0600 Subject: [PATCH 33/38] Filter relevant deaths --- scripts/collect-raw-vital-stats.R | 31 +++++++++++++++++++++---------- 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index 086f04e..e59e2c1 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -277,12 +277,16 @@ deaths <- deaths %>% map(~map_df(.x, as.character)) %>% bind_rows(.id = "file_year") -# Prepare birth "events" dataset -deaths <- deaths %>% - # Ignore unknown ages +# Prepare deaths "events" dataset +local_deaths <- deaths %>% filter( + # Ignore unknown ages as.integer(edadif) < 999, - perdif != "9" + perdif != "9", + # Ignore >= 1 year + perdif %in% c("1", "2"), + # Keep only used departments + depreg %in% c("6", "9") ) %>% mutate( record_date = ymd( @@ -307,16 +311,23 @@ deaths <- deaths %>% .default = NA_character_, .missing = NA_character_ ), - birth_date = event_date - period(edadif, units = age_unit), - age_days = as.integer(event_date - birth_date) + age_value = as.integer(edadif) ) %>% + # Calculate ages case by case + rowwise() %>% + do({ + bind_cols( + ., + data_frame(birth_date = .$event_date - period(.$age_value, units = first(.$age_unit))) + ) + }) %>% + ungroup %>% select( # Event data - event_year, event_date, event_department = depocu, event_municipality = mupocu, - # Record metadata - record_date, record_department = depreg, record_municipality = mupreg, + event_year, event_date, event_department = depreg, event_municipality = mupreg, # Deceased data - birth_date, age_days + age_value, age_unit, birth_date + ) ) From 46663594b23e3e550f35e43c19c60508d9f6e8ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Tue, 16 May 2017 18:41:03 -0600 Subject: [PATCH 34/38] Summarize deaths by age group --- scripts/collect-raw-vital-stats.R | 78 +++++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index e59e2c1..b45853c 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -328,6 +328,84 @@ local_deaths <- deaths %>% # Deceased data age_value, age_unit, birth_date ) + + +# Label death events +labeled_deaths <- local_deaths %>% + # Add mid year dates + left_join(mid_years) %>% + # Keep only relevant dates (i.e. died before mid-year) + filter( + event_date < mid_year, # died before mid-year + birth_date > (mid_year - years(1)) # born at most one year prior + ) %>% + # Count relevant deaths by birth date + count( + mid_year, event_department, event_municipality, birth_date + ) %>% + # Tag all births for both age group types + mutate( + correlative = 1, + label = "0-27 days", + date_threshold = mid_year - days(28) + ) %>% + bind_rows( + mutate( + ., + correlative = 2, + label = "28 days-<3 month", + date_threshold = mid_year - months(3) + ), + mutate( + ., + correlative = 3, + label = "3-5 months", + date_threshold = mid_year - months(6) + ), + mutate( + ., + correlative = 4, + label = "6-8 months", + date_threshold = mid_year - months(9) + ), + mutate( + ., + correlative = 5, + label = "9-11 months", + date_threshold = mid_year - months(12) + ) + ) %>% + # Assign possible age groups + mutate( + keep = birth_date >= date_threshold + ) %>% + filter(keep) %>% + select(-keep, -date_threshold) %>% + # Pick oldest applicable age group + arrange(mid_year, birth_date, correlative) %>% + group_by(mid_year, birth_date) %>% + filter(correlative == min(correlative)) %>% + ungroup %>% + # Label municipalities + left_join(municipalities, by = c(event_municipality = "muni_id")) %>% + mutate( + # Label departments + department = recode( + event_department, + `6` = "Santa Rosa", + `9` = "Quetzaltenango" + ), + # Configure age group as factor + label = factor(label, levels = age_groups, ordered = TRUE) + ) %>% + # Count deaths by age group + group_by( + year = year(mid_year), department, municipality, age_group = label + ) %>% + summarize( + deaths = sum(n) + ) %>% + ungroup ) From 088b2d701fc283ee0ece8aca72260f4c09efd5d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Tue, 16 May 2017 18:41:51 -0600 Subject: [PATCH 35/38] Get total alive by mid year --- scripts/collect-raw-vital-stats.R | 55 +++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index b45853c..8e39e63 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -406,6 +406,61 @@ labeled_deaths <- local_deaths %>% deaths = sum(n) ) %>% ungroup + + + + +#------------------------------------------------------------------------------* +# Calculate proportion contributed by each <1 year age group ---- +#------------------------------------------------------------------------------* + +# Get proportions by year, department, municipality and age group +proportion_age_group <- local_births %>% + left_join(labeled_deaths) %>% + mutate( + # Fill missing values with 0 + deaths = ifelse(is.na(deaths), 0, deaths), + # Get alive count at mid-year + alive = births - deaths + ) %>% + group_by(year, department, municipality) %>% + mutate( + proportion = alive / sum(alive) + ) + +# PLot proportions by year, department, municipality and age group +plot_age_groups <- proportion_age_group %>% + ggplot(aes(x = year, y = proportion)) + + geom_line( + aes(group = municipality), + alpha = 0.3 + ) + + geom_line( + data = summarize( + group_by(proportion_age_group, year, department, age_group), + proportion = mean(proportion) + ), + color = "red", size = 1 + ) + + facet_grid(age_group ~ department) + + theme_bw() + +# Summarize to single proportion by age group <1 year +proportion_age_group <- proportion_age_group %>% + group_by(age_group) %>% + summarize( + proportion = mean(proportion) + ) + +# Check deviations from uniform distribution assumption +proportion_age_group %>% + mutate( + group_month_interval = c(1, 2, 3, 3, 3), + uniform_proportion = group_month_interval / 12, + deviation = proportion - uniform_proportion + ) %>% + filter( + deviation > 0.01 ) From 09b30f6a68eb76e9cab307e1158e24fb1722b8f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Wed, 17 May 2017 11:21:05 -0600 Subject: [PATCH 36/38] Stop if proportion per group is not uniform --- scripts/collect-raw-vital-stats.R | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index 8e39e63..10e8b93 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -453,7 +453,7 @@ proportion_age_group <- proportion_age_group %>% ) # Check deviations from uniform distribution assumption -proportion_age_group %>% +different_rows <- proportion_age_group %>% mutate( group_month_interval = c(1, 2, 3, 3, 3), uniform_proportion = group_month_interval / 12, @@ -463,6 +463,8 @@ proportion_age_group %>% deviation > 0.01 ) +if( nrow(different_rows) > 0 ) stop("Data deviates from uniform assumption") + From adaa13fc1b057b29bd8c0abb3a06072b9b233d8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Wed, 17 May 2017 12:29:14 -0600 Subject: [PATCH 37/38] Estimate population by age group from official projections --- scripts/collect-raw-vital-stats.R | 34 ++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index 10e8b93..7425ec1 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -480,7 +480,7 @@ load(file = processed_file[length(processed_file)] ) rm(processed_file) # Sum simple year projections for -simple_year <- population %>% +flu_age_groups <- population %>% filter( between(year, 2010, 2015), # Years missing data between(age, 0, 4), # Relevant ages @@ -488,26 +488,40 @@ simple_year <- population %>% ) %>% group_by(year, department, municipality, age) %>% summarize(population = sum(population)) %>% - # Keep necessary data - filter( - (year == 2013 & age == 4) | - (year == 2012 & between(age, 3, 4)) | - (year == 2011 & between(age, 2, 4)) | - (year == 2010 & between(age, 1, 4)) + # Add population proportion for each age group <1 + left_join( + mutate(proportion_age_group, age = 0, age_group = as.character(age_group)) ) %>% - # Label age groups mutate( + # Estimate population for age groups <1 year + population = ifelse( + test = age == 0 & !is.na(proportion), + yes = population * proportion, + no = population + ), age_group = recode( age, "1" = "12-23 months", "2" = "24-35 months", "3" = "36-59 months", - "4" = "36-59 months" + "4" = "36-59 months", + .default = age_group ), age_group = factor(age_group, levels = age_groups, ordered = TRUE) ) %>% + # Add cummulative age_groups + bind_rows( + mutate(filter(., age_group < "12-23 months"), age_group = "0-11 months"), + mutate(., age_group = "0-59 months"), + mutate(filter(., age_group > "0-11 months"), age_group = "12-59 months"), + mutate(filter(., age_group > "12-23 months"), age_group = "24-59 months") + ) %>% + # Orger age groups + mutate( + age_group = factor(age_group, levels = age_groups, ordered = TRUE) + ) %>% # Summarize - group_by(year, department, municipality, age_group) %>% + group_by(department, municipality, year, age_group) %>% summarize(population = sum(population)) From 05be3105192fb489e1e869e53ca825c4f1d403e3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Oscar=20de=20Le=C3=B3n?= Date: Wed, 17 May 2017 12:43:35 -0600 Subject: [PATCH 38/38] Save estimated population --- .gitignore | 3 +++ output/.gitkeep | 0 scripts/collect-raw-vital-stats.R | 4 ++++ 3 files changed, 7 insertions(+) create mode 100644 output/.gitkeep diff --git a/.gitignore b/.gitignore index 0f6f437..f06796a 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ data/raw/* *.xls* *.RData *.rds + +# Ignore output +output/ diff --git a/output/.gitkeep b/output/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R index 7425ec1..8c3ce9d 100644 --- a/scripts/collect-raw-vital-stats.R +++ b/scripts/collect-raw-vital-stats.R @@ -525,6 +525,10 @@ flu_age_groups <- population %>% summarize(population = sum(population)) +# Save population estimates for the age groups +write_csv(flu_age_groups, path = "output/flu_edinburgh_population.csv") + + # End of script