diff --git a/.gitignore b/.gitignore index 0f6f437..f06796a 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,6 @@ data/raw/* *.xls* *.RData *.rds + +# Ignore output +output/ diff --git a/lastest-prerelease b/lastest-prerelease new file mode 100644 index 0000000..45c7a58 --- /dev/null +++ b/lastest-prerelease @@ -0,0 +1 @@ +v0.0.1 diff --git a/output/.gitkeep b/output/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/scripts/collect-raw-municipality.R b/scripts/collect-raw-municipality.R index 0fad7bf..5b24e3d 100644 --- a/scripts/collect-raw-municipality.R +++ b/scripts/collect-raw-municipality.R @@ -140,7 +140,7 @@ file.remove(file_list) #------------------------------------------------------------------------------* -# Collect data from 2000-2010 period ---- +# Collect data from 2011-2015 period ---- #------------------------------------------------------------------------------* # Zip path @@ -178,20 +178,40 @@ read_population_2011 <- function(file_path, skip = 3){ # Report sheet (year) cat(" ", sheet) - fixes <- c( + + #--------------------------------------------------------------------------* + # Fix parameters for specific files / sheets + #--------------------------------------------------------------------------* + + # Fix individual skip parameters + fixes_skip <- c( "Totonicapan", "Suchitepequez", "Retalhuleu", "Quiche", "Alta Verapaz", "Peten", "Jutiapa", " Jalapa", "Chiquimula", "Izabal", "Santa Rosa" ) - # Configuration exceptions skip <- case_when( - # department == "Santa Rosa" & sheet == "2012" ~ 4, - # department == "Santa Rosa" & sheet == "2015" ~ 5, - department %in% fixes & sheet == "2012" ~ 4, - department %in% fixes & sheet == "2015" ~ 5, + department %in% fixes_skip & sheet == "2012" ~ 4, + department %in% fixes_skip & sheet == "2015" ~ 5, TRUE ~ skip ) + # Fix missing municipality names + fix_municipalities <- function(.data){ + .data %>% + # Setup fixing rules + mutate( + municipality = case_when( + department == "Santa Rosa" & grepl("^X", municipality) ~ "Nueva Santa Rosa", + TRUE ~ municipality + ) + ) %>% + return() + } + + #--------------------------------------------------------------------------* + # Get data + #--------------------------------------------------------------------------* + # Read file contents pop_sheet <- read_excel( path = file, sheet = sheet, skip = skip_lines , na = "?" @@ -230,6 +250,8 @@ read_population_2011 <- function(file_path, skip = 3){ ), department = department ) %>% + # Fix missingmunicipality names + fix_municipalities() %>% select(year, department, municipality, sex = sexo, age, population) %>% mutate( # Fix factors @@ -568,12 +590,13 @@ pop_2016_2020 <- pop_2016_2020_predicted %>% population <- pop_2000_2015 %>% bind_rows(pop_2016_2020) +processed_file <- "data/processed/gt_2000_2020_municipality_population.RData" + # Save population data for use in R save( - population, file = "data/processed/gt_2000_2020_municipality_population.RData" + population, file = processed_file ) - - +cat(processed_file) # End of script diff --git a/scripts/collect-raw-vital-stats.R b/scripts/collect-raw-vital-stats.R new file mode 100644 index 0000000..8c3ce9d --- /dev/null +++ b/scripts/collect-raw-vital-stats.R @@ -0,0 +1,534 @@ +#------------------------------------------------------------------------------* +# Collect raw vital statistics at the department level +#------------------------------------------------------------------------------* + +#------------------------------------------------------------------------------* +# Prepare environment ---- +#------------------------------------------------------------------------------* + +# Load used packages +library(package = "lubridate") +library(package = "haven") +library(package = "tidyverse") + +# Set metadata +data_path <- "data/raw/vital-statistics/" + + + + +#------------------------------------------------------------------------------* +# Load locations data ---- +#------------------------------------------------------------------------------* + +# Get municipality codes +municipalities <- sf::read_sf("data/raw/geo-data/municipalities.shp") %>% + select(muni_id = COD_MUNI, municipality = MUNICIPIO) %>% + mutate( + # Fix encoding and case + municipality = iconv(municipality, from = "Latin1", to = "ASCII//TRANSLIT"), + municipality = gsub("(^| )([a-z])", "\\1\\U\\2", municipality, perl = TRUE), + # Fix errors in names + municipality = recode( + muni_id, + "0413" = "San Andres Itzapa", + "2009" = "Quezaltepeque", + "0204" = "San Cristobal Acasaguastlan", + "0502" = "Santa Lucia Cotzumalguapa", + "0506" = "Tiquizate", + "0117" = "San Miguel Petapa", + "0111" = "San Raimundo", + "1314" = "San Rafael La Independencia", + "1320" = "San Sebastian Huehuetenango", + "1326" = "Santa Cruz Barillas", + "2107" = "Mataquescuintla", + "2217" = "Quezada", + "1420" = "Ixcan", + "1105" = "San Felipe", + "0712" = "San Antonio Palopo", + "0711" = "Santa Catarina Palopo", + "1004" = "San Bernardino", + "1009" = "San Pablo Jocopilas", + "1011" = "San Miguel Panan", + "0806" = "Santa Maria Chiquimula", + .default = municipality + ) + ) + + + + +#------------------------------------------------------------------------------* +# Load births data ---- +#------------------------------------------------------------------------------* + +# Read labeled data +births <- list.files(path = data_path, pattern = "births") %>% + paste0(data_path, .) %>% + set_names(gsub("[^0-9]", "", .)) %>% + map(read_spss) %>% + map(~set_names(.x, tolower(iconv(names(.x), to = "ASCII//TRANSLIT")))) + +# Read labels from imported data +birth_labels <- births %>% + map(~map(.x, ~attr(.x, "labels"))) + +# Bind imported datasets +births <- births %>% + map(~map_df(.x, zap_labels)) %>% + map(~map_df(.x, as.character)) %>% + bind_rows(.id = "file_year") + +# Prepare birth "events" dataset +births <- births %>% + mutate( + record_date = ymd( + paste( + stringr::str_pad(anoreg, width = 2, side = "left", pad = "0"), + mesreg, "01", sep = "-" + ) + ), + anoocu = ifelse(is.na(anoocu), file_year, anoocu), + event_date = ymd( + paste( + stringr::str_pad(anoocu, width = 2, side = "left", pad = "0"), + mesocu, diaocu, sep = "-" + ) + ), + event_year = year(event_date) + ) %>% + select( + # Event data + event_year, event_date, event_department = depocu, event_municipality = mupocu, + # Record metadata + record_date, record_department = depreg, record_municipality = mupreg, + # Mother residency location + mother_department = deprem, mother_municipality = muprem + ) + + + + +#------------------------------------------------------------------------------* +# Define age groups ---- +#------------------------------------------------------------------------------* + +# Order age groups for output +age_groups <- c( + "0-27 days", "28 days-<3 month", "3-5 months", "6-8 months", "9-11 months", + "0-11 months", + "12-23 months", "24-35 months", "36-59 months", + "0-59 months", + "12-59 months", "24-59 months" +) + + + + +#------------------------------------------------------------------------------* +# Calculate mid-year counts ---- +#------------------------------------------------------------------------------* +# Mid year counts for ages < 1 year used to estimate the proportion of children +# in each age group, which will be used to estimate the population in each +# age group from the official population projections. +#------------------------------------------------------------------------------* + +# Mid year dates +mid_years <- births %>% + pull(event_date) %>% + year() %>% + unique() %>% + paste0("-07-01") %>% + ymd() %>% + data_frame(mid_year = ., event_year = NA) %>% + group_by(mid_year) %>% + do({ + data_frame( + event_year = seq(min(year(births$event_date)), year(.$mid_year)) + ) + }) %>% + ungroup() + +# Calculate mid year ages given birth date +mid_year_ages <- mid_years %>% + # Relevant births for each year + left_join(unique(select(births, event_year, event_date))) %>% + filter(event_date < mid_year) + + +labeled_ages <- mid_year_ages %>% + # Tag all births for both age group types + mutate( + correlative = 1, + label = "0-27 days", + date_threshold = mid_year - days(28) + ) %>% + bind_rows( + mutate( + ., + correlative = 2, + label = "28 days-<3 month", + date_threshold = mid_year - months(3) + ), + mutate( + ., + correlative = 3, + label = "3-5 months", + date_threshold = mid_year - months(6) + ), + mutate( + ., + correlative = 4, + label = "6-8 months", + date_threshold = mid_year - months(9) + ), + mutate( + ., + group = "exclusive", + correlative = 5, + label = "9-11 months", + date_threshold = mid_year - months(12) + ) + ) %>% + # Assign possible age groups + mutate( + keep = event_date >= date_threshold + ) %>% + filter(keep) %>% + select(-keep, -date_threshold) %>% + # Pick oldest applicable age group + arrange(mid_year, event_date, correlative) %>% + group_by(mid_year, event_date) %>% + filter(correlative == min(correlative)) %>% + ungroup %>% + select(mid_year, event_date, label) + + +# Count live people by age group +local_births <- births %>% + # Births by date for each location + count( + year = event_year, + department = mother_department, municipality = mother_municipality, + event_date + ) %>% + # Label with ages at each mid-year + left_join(labeled_ages) %>% + filter( + # Only keep births inside the mid year pediods + !is.na(label), + # Ignore first half of 2009 for every succesive year + (mid_year > ymd("2009-07-01") & event_date >= ymd("2009-07-01")), + # Ignore first year, which can not be completed + mid_year > ymd("2009-07-01") + ) %>% + # Children born by age group at each mid-year + count(year = year(mid_year), department, municipality, age_group = label) %>% + rename(births = nn) %>% + mutate( + age_group = factor(age_group, levels = age_groups, ordered = TRUE) + ) %>% + arrange(department, municipality, year, age_group) %>% + # Keep only used departments + filter(department %in% c("6", "9")) %>% + mutate( + # Fix municipality codes + municipality = ifelse( + test = as.integer(municipality) < 99, + yes = paste0( + stringr::str_pad(department, width = 2, side = "left", pad = "0"), + stringr::str_pad(municipality, width = 2, side = "left", pad = "0") + ), + no = municipality + ), + # Label departments + department = recode( + department, + "6" = "Santa Rosa", + "9" = "Quetzaltenango" + ) + ) %>% + # Tag with municipality names + rename(muni_id = municipality) %>% + left_join(municipalities) %>% + select(year, department, municipality, age_group, births) + + + + +#------------------------------------------------------------------------------* +# Load deaths data ---- +#------------------------------------------------------------------------------* + +# Read labeled data +deaths <- list.files(path = data_path, pattern = "deaths") %>% + paste0(data_path, .) %>% + set_names(gsub("[^0-9]", "", .)) %>% + map(read_spss) %>% + map(~set_names(.x, tolower(iconv(names(.x), to = "ASCII//TRANSLIT")))) + +# Read labels from imported data +death_labels <- deaths %>% + map(~map(.x, ~attr(.x, "labels"))) + +# Bind imported datasets +deaths <- deaths %>% + map(~map_df(.x, zap_labels)) %>% + map(~map_df(.x, as.character)) %>% + bind_rows(.id = "file_year") + +# Prepare deaths "events" dataset +local_deaths <- deaths %>% + filter( + # Ignore unknown ages + as.integer(edadif) < 999, + perdif != "9", + # Ignore >= 1 year + perdif %in% c("1", "2"), + # Keep only used departments + depreg %in% c("6", "9") + ) %>% + mutate( + record_date = ymd( + paste( + stringr::str_pad(anoreg, width = 2, side = "left", pad = "0"), + mesreg, "01", sep = "-" + ) + ), + anoocu = ifelse(is.na(anoocu), file_year, anoocu), + event_date = ymd( + paste( + stringr::str_pad(anoocu, width = 2, side = "left", pad = "0"), + mesocu, diaocu, sep = "-" + ) + ), + event_year = year(event_date), + age_unit = recode( + perdif, + "1" = "days", + "2" = "months", + "3" = "years", + .default = NA_character_, + .missing = NA_character_ + ), + age_value = as.integer(edadif) + ) %>% + # Calculate ages case by case + rowwise() %>% + do({ + bind_cols( + ., + data_frame(birth_date = .$event_date - period(.$age_value, units = first(.$age_unit))) + ) + }) %>% + ungroup %>% + select( + # Event data + event_year, event_date, event_department = depreg, event_municipality = mupreg, + # Deceased data + age_value, age_unit, birth_date + ) + + +# Label death events +labeled_deaths <- local_deaths %>% + # Add mid year dates + left_join(mid_years) %>% + # Keep only relevant dates (i.e. died before mid-year) + filter( + event_date < mid_year, # died before mid-year + birth_date > (mid_year - years(1)) # born at most one year prior + ) %>% + # Count relevant deaths by birth date + count( + mid_year, event_department, event_municipality, birth_date + ) %>% + # Tag all births for both age group types + mutate( + correlative = 1, + label = "0-27 days", + date_threshold = mid_year - days(28) + ) %>% + bind_rows( + mutate( + ., + correlative = 2, + label = "28 days-<3 month", + date_threshold = mid_year - months(3) + ), + mutate( + ., + correlative = 3, + label = "3-5 months", + date_threshold = mid_year - months(6) + ), + mutate( + ., + correlative = 4, + label = "6-8 months", + date_threshold = mid_year - months(9) + ), + mutate( + ., + correlative = 5, + label = "9-11 months", + date_threshold = mid_year - months(12) + ) + ) %>% + # Assign possible age groups + mutate( + keep = birth_date >= date_threshold + ) %>% + filter(keep) %>% + select(-keep, -date_threshold) %>% + # Pick oldest applicable age group + arrange(mid_year, birth_date, correlative) %>% + group_by(mid_year, birth_date) %>% + filter(correlative == min(correlative)) %>% + ungroup %>% + # Label municipalities + left_join(municipalities, by = c(event_municipality = "muni_id")) %>% + mutate( + # Label departments + department = recode( + event_department, + `6` = "Santa Rosa", + `9` = "Quetzaltenango" + ), + # Configure age group as factor + label = factor(label, levels = age_groups, ordered = TRUE) + ) %>% + # Count deaths by age group + group_by( + year = year(mid_year), department, municipality, age_group = label + ) %>% + summarize( + deaths = sum(n) + ) %>% + ungroup + + + + +#------------------------------------------------------------------------------* +# Calculate proportion contributed by each <1 year age group ---- +#------------------------------------------------------------------------------* + +# Get proportions by year, department, municipality and age group +proportion_age_group <- local_births %>% + left_join(labeled_deaths) %>% + mutate( + # Fill missing values with 0 + deaths = ifelse(is.na(deaths), 0, deaths), + # Get alive count at mid-year + alive = births - deaths + ) %>% + group_by(year, department, municipality) %>% + mutate( + proportion = alive / sum(alive) + ) + +# PLot proportions by year, department, municipality and age group +plot_age_groups <- proportion_age_group %>% + ggplot(aes(x = year, y = proportion)) + + geom_line( + aes(group = municipality), + alpha = 0.3 + ) + + geom_line( + data = summarize( + group_by(proportion_age_group, year, department, age_group), + proportion = mean(proportion) + ), + color = "red", size = 1 + ) + + facet_grid(age_group ~ department) + + theme_bw() + +# Summarize to single proportion by age group <1 year +proportion_age_group <- proportion_age_group %>% + group_by(age_group) %>% + summarize( + proportion = mean(proportion) + ) + +# Check deviations from uniform distribution assumption +different_rows <- proportion_age_group %>% + mutate( + group_month_interval = c(1, 2, 3, 3, 3), + uniform_proportion = group_month_interval / 12, + deviation = proportion - uniform_proportion + ) %>% + filter( + deviation > 0.01 + ) + +if( nrow(different_rows) > 0 ) stop("Data deviates from uniform assumption") + + + + +#------------------------------------------------------------------------------* +# Fix missing ages for each mid-year ---- +#------------------------------------------------------------------------------* +# Use simple year age projections data +#------------------------------------------------------------------------------* + +# Prepare simple year projections from raw data +processed_file <- system("Rscript scripts/collect-raw-municipality.R", intern = TRUE) +load(file = processed_file[length(processed_file)] ) +rm(processed_file) + +# Sum simple year projections for +flu_age_groups <- population %>% + filter( + between(year, 2010, 2015), # Years missing data + between(age, 0, 4), # Relevant ages + department %in% c("Santa Rosa", "Quetzaltenango") # Relevant departments + ) %>% + group_by(year, department, municipality, age) %>% + summarize(population = sum(population)) %>% + # Add population proportion for each age group <1 + left_join( + mutate(proportion_age_group, age = 0, age_group = as.character(age_group)) + ) %>% + mutate( + # Estimate population for age groups <1 year + population = ifelse( + test = age == 0 & !is.na(proportion), + yes = population * proportion, + no = population + ), + age_group = recode( + age, + "1" = "12-23 months", + "2" = "24-35 months", + "3" = "36-59 months", + "4" = "36-59 months", + .default = age_group + ), + age_group = factor(age_group, levels = age_groups, ordered = TRUE) + ) %>% + # Add cummulative age_groups + bind_rows( + mutate(filter(., age_group < "12-23 months"), age_group = "0-11 months"), + mutate(., age_group = "0-59 months"), + mutate(filter(., age_group > "0-11 months"), age_group = "12-59 months"), + mutate(filter(., age_group > "12-23 months"), age_group = "24-59 months") + ) %>% + # Orger age groups + mutate( + age_group = factor(age_group, levels = age_groups, ordered = TRUE) + ) %>% + # Summarize + group_by(department, municipality, year, age_group) %>% + summarize(population = sum(population)) + + +# Save population estimates for the age groups +write_csv(flu_age_groups, path = "output/flu_edinburgh_population.csv") + + + + +# End of script diff --git a/scripts/get-processed-tarball.R b/scripts/get-processed-tarball.R new file mode 100644 index 0000000..0441398 --- /dev/null +++ b/scripts/get-processed-tarball.R @@ -0,0 +1,31 @@ + + +library(package = "jsonlite") +library(package = "httr") +library(package = "tidyverse") + +# Set request parameters +prerelease <- scan(file = "lastest-prerelease", what = "character") + +releases <- GET( + paste0( + "https://api.github.com/repos/odeleongt/gt-population/releases/tags/", + prerelease + ) +) + +# Get asset uri +asset <- releases %>% + content(as = "text") %>% + fromJSON() %>% + map(~ifelse(is.list(.x), as.data.frame(list(.x)), .x)) %>% + flatten_df() %>% + pull(assets) %>% + GET %>% + content(as = "text") %>% + fromJSON() %>% + getElement("browser_download_url") + +# Download file +download.file(asset, "data/processed/gt_2000_2020_municipality_population.RData") + diff --git a/scripts/process-municipality-data.R b/scripts/process-municipality-data.R new file mode 100644 index 0000000..a2f2fb2 --- /dev/null +++ b/scripts/process-municipality-data.R @@ -0,0 +1,19 @@ +#------------------------------------------------------------------------------* +# Produce population data at the municipality level +#------------------------------------------------------------------------------* + + +#------------------------------------------------------------------------------* +# Load data ---- +#------------------------------------------------------------------------------* + +# Prepare snapshot from raw data +processed_file <- system("Rscript scripts/collect-raw-municipality.R", intern = TRUE) +load(file = processed_file[length(processed_file)] ) + + +# Clean up +rm(processed_file) + + +# End of script