From 3cb900d60d57de091bdcd407fe7f52bb51f44e93 Mon Sep 17 00:00:00 2001 From: camille-s Date: Mon, 29 Apr 2024 15:37:14 -0400 Subject: [PATCH] First commit: setup --- .gitignore | 11 ++++ README.qmd | 53 +++++++++++++++++ Snakefile | 171 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 235 insertions(+) create mode 100644 .gitignore create mode 100644 README.qmd create mode 100644 Snakefile diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ec01508 --- /dev/null +++ b/.gitignore @@ -0,0 +1,11 @@ +.Rproj.user +.Rhistory +.Rdata +.httr-oauth +.DS_Store +Digraph* +.vscode +.snakemake +.env +*_uploaded.json +*_downloaded.json \ No newline at end of file diff --git a/README.qmd b/README.qmd new file mode 100644 index 0000000..9ba557b --- /dev/null +++ b/README.qmd @@ -0,0 +1,53 @@ +--- +title: Data for legislative district profiles--2022 ACS, 2023 PLACES +engine: knitr +execute: + echo: false +format: gfm +--- + +# README + +Sources for the profile data are the most recent ACS (2022), CDC PLACES (2023 release), and USALEEP (not updated). There's no fresh analysis done in this repo---this is a way to assemble data from other projects with readable headings and descriptions, and prep it for each city's neighborhood profiles and online visualization. + +Datasets prepped for download from other repos are in their respective tagged releases to ensure their stability and reproducibility. 
Not all assets from each tag are used, but the files in those releases are: + +```{r} +#| message: false +repos <- list("2022acs" = "dist", + "cdc_aggs" = "v2023", + "scratchpad" = "meta") |> + tibble::enframe(name = "repo", value = "tag") |> + tidyr::unnest(tag) + +repos |> + purrr::pmap(function(repo, tag) { + q <- stringr::str_glue("gh release view {tag} --repo CT-Data-Haven/{repo} --json tagName,assets,url") + system(q, intern = TRUE) + }) |> + purrr::map(jsonlite::fromJSON) |> + purrr::map(dplyr::as_tibble) |> + purrr::map(tidyr::unnest, assets, names_sep = "_") |> + purrr::map(dplyr::select, tag = tagName, assets_name, url, updated = assets_updatedAt) |> + purrr::map(dplyr::mutate, repo = stringr::str_extract(url, "(?<=CT\\-Data\\-Haven\\/)(\\w+)(?=\\/)")) |> + dplyr::bind_rows() |> + dplyr::mutate(tag = stringr::str_glue("[{tag}]({url})")) |> + dplyr::group_by(repo, tag, updated) |> + dplyr::summarise(assets = toString(assets_name)) |> + knitr::kable() +``` + +This uses snakemake to build. 
Rules available are: + +```{bash} +snakemake --list-rules +``` + +Build process is as follows: + + +```{bash} +snakemake --filegraph | dot -T png > dag.png +``` + +![snakemake DAG](dag.png) \ No newline at end of file diff --git a/Snakefile b/Snakefile new file mode 100644 index 0000000..9ed9bb6 --- /dev/null +++ b/Snakefile @@ -0,0 +1,171 @@ +from dotenv import load_dotenv +import os +load_dotenv() +dw_key = os.getenv('DW_AUTH_TOKEN') +# ---- SETUP ---- +acs_year = 2022 +cdc_year = 2023 +houses = ['upper', 'lower'] + + +def r_with_args(script): + cmd = f'Rscript {script} {acs_year} {cdc_year}' + return cmd + +envvars: + 'DW_AUTH_TOKEN' +# ---- RULES ---- +rule download_data: + output: + acs = f'input_data/acs_town_basic_profile_{acs_year}.rds', + cdc = f'input_data/cdc_health_all_lvls_nhood_{cdc_year}.rds', + acs_head = '_utils/acs_indicator_headings.txt', + cdc_head = '_utils/cdc_indicators.txt', + flag = '.meta_downloaded.json', + params: + acs_year = acs_year, + cdc_year = cdc_year, + shell: + ''' + bash ./scripts/00a_download_data.sh {params.acs_year} {params.cdc_year} + ''' + +rule headings: + input: + rules.download_data.output.acs_head, + rules.download_data.output.cdc_head, + output: + headings = 'to_viz/indicators.json', + script: + 'scripts/00b_make_headings.R' + +rule legislators: + output: + legislators = '_utils/legislators.rds', + xwalk = '_utils/town_dist_xwalk.rds', + script: + 'scripts/00c_scrape_cga.R' + +rule notes: + input: + legislators = rules.legislators.output.legislators, + xwalk = rules.legislators.output.xwalk, + sources = '_utils/manual/sources.txt', + output: + notes = 'to_viz/notes.json', + members = 'to_viz/members.json', + script: + 'scripts/08_make_geo_notes.R' + +rule combine_datasets: + input: + rules.download_data.output.acs, + rules.download_data.output.cdc, + 'scripts/01_join_acs_health.R', + params: + acs_year = acs_year, + cdc_year = cdc_year, + output: + comb = f'output_data/all_legis_{acs_year}_acs_health_comb.rds', + 
script: + 'scripts/01_join_acs_health.R' + +rule distro: + input: + rules.headings.output.headings, + rules.combine_datasets.output.comb, + params: + acs_year = acs_year, + output: + expand('to_distro/{house}_legis_{year}_acs_health_comb.csv', house = houses, year = acs_year), + script: + 'scripts/02_prep_distro.R' + +rule viz_data: + input: + rules.combine_datasets.output.comb, + params: + acs_year = acs_year, + output: + viz = f'to_viz/legis_wide_{acs_year}.json', + script: + 'scripts/03_prep_json_to_viz.R' + + +rule make_shapes: + output: + expand('to_viz/shapes/{house}_topo.json', house = houses), + script: + 'scripts/04_make_shapefiles.R' + + +rule upload_shapes: + input: + rules.make_shapes.output, + output: + '.shapes_uploaded.json' + shell: + 'bash ./scripts/05_upload_shapes_release.sh {input}' + + +rule upload_viz_data: + input: + data = rules.viz_data.output.viz, + headings = rules.headings.output.headings, + notes = rules.notes.output.notes, + output: + '.viz_uploaded.json', + shell: + 'bash ./scripts/07_upload_data_release.sh {input.data} {input.headings} {input.notes}' + + +# rule sync_to_dw: +# input: +# rules.distro.output, +# output: +# '.dw_uploaded.json', +# params: +# key = os.environ['DW_AUTH_TOKEN'], +# year = acs_year, +# files = rules.distro.output, +# shell: +# ''' +# bash ./scripts/06_sync_to_dw.sh {params.key} {params.year} {params.files} +# ''' + + +# ---- MAIN TARGETS ---- + +rule readme: + input: + readme = 'README.qmd', + snakefile = 'Snakefile', + output: + md = 'README.md', + dag = 'dag.png', + shell: + 'quarto render {input.readme}' + +rule all: + default_target: True + input: + rules.readme.output.md, + rules.viz_data.output, + rules.distro.output, + rules.upload_shapes.output, + rules.upload_viz_data.output, + # rules.sync_to_dw.output, + rules.download_data.output.flag, + +# ---- CLEANUP ---- +rule clean: + shell: + ''' + rm -f to_distro/*.csv \ + to_viz/*.json \ + to_viz/shapes/*.json \ + input_data/*.rds \ + output_data/*.rds \ 
+ _utils/*.txt \ + _utils/*.rds + ''' \ No newline at end of file