Skip to content

Commit

Permalink
First commit: setup
Browse files Browse the repository at this point in the history
  • Loading branch information
camille-s committed Apr 29, 2024
0 parents commit 3cb900d
Show file tree
Hide file tree
Showing 3 changed files with 235 additions and 0 deletions.
11 changes: 11 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
.Rproj.user
.Rhistory
.Rdata
.httr-oauth
.DS_Store
Digraph*
.vscode
.snakemake
.env
*_uploaded.json
*_downloaded.json
53 changes: 53 additions & 0 deletions README.qmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
---
title: Data for legislative district profiles--2022 ACS, 2023 PLACES
engine: knitr
execute:
echo: false
format: gfm
---

# README

Sources for the profile data are the most recent ACS (2022), CDC PLACES (2023 release), and USALEEP (not updated). There's no fresh analysis done in this repo---this is a way to assemble data from other projects with readable headings and descriptions, and prep it for each city's neighborhood profiles and online visualization.

Datasets prepped for download from other repos are in their respective tagged releases to ensure their stability and reproducibility. Not all assets from each tag are used, but the files in those releases are:

```{r}
#| message: false
# Releases that supply the profile data: one repo/tag pair per row.
repos <- tidyr::unnest(
  tibble::enframe(
    list(
      "2022acs" = "dist",
      "cdc_aggs" = "v2023",
      "scratchpad" = "meta"
    ),
    name = "repo",
    value = "tag"
  ),
  tag
)

# For each release, ask the GitHub CLI for its metadata and flatten the JSON
# reply to one row per asset; then collapse to one table row per
# repo/tag/update time, with the tag rendered as a markdown link and the asset
# names joined into a single comma-separated string.
repos |>
  purrr::pmap(\(repo, tag) {
    cmd <- stringr::str_glue("gh release view {tag} --repo CT-Data-Haven/{repo} --json tagName,assets,url")
    release <- dplyr::as_tibble(jsonlite::fromJSON(system(cmd, intern = TRUE)))
    release |>
      tidyr::unnest(assets, names_sep = "_") |>
      dplyr::select(
        tag = tagName,
        assets_name,
        url,
        updated = assets_updatedAt
      ) |>
      dplyr::mutate(
        # Repo name is recovered from the release URL rather than the argument.
        repo = stringr::str_extract(url, "(?<=CT\\-Data\\-Haven\\/)(\\w+)(?=\\/)")
      )
  }) |>
  dplyr::bind_rows() |>
  dplyr::mutate(tag = stringr::str_glue("[{tag}]({url})")) |>
  dplyr::group_by(repo, tag, updated) |>
  dplyr::summarise(assets = toString(assets_name)) |>
  knitr::kable()
```

This uses snakemake to build. Rules available are:

```{bash}
snakemake --list-rules
```

Build process is as follows:


```{bash}
snakemake --filegraph | dot -T png > dag.png
```

![snakemake DAG](dag.png)
171 changes: 171 additions & 0 deletions Snakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
from dotenv import load_dotenv
import os
load_dotenv()
# Value of DW_AUTH_TOKEN from the environment (None if unset). No active rule
# in this file references dw_key; the commented-out sync_to_dw rule reads
# os.environ directly instead.
dw_key = os.getenv('DW_AUTH_TOKEN')
# ---- SETUP ----
# Data years: interpolated into input/output file names and passed to rules
# as params.
acs_year = 2022
cdc_year = 2023
# Expanded into one output file per entry by the distro and make_shapes rules.
# NOTE(review): presumably the two legislative chambers — confirm.
houses = ['upper', 'lower']


def r_with_args(script):
    """Return the shell command that runs `script` with Rscript.

    The module-level `acs_year` and `cdc_year` are appended, in that order,
    as positional command-line arguments.
    """
    return f'Rscript {script} {acs_year} {cdc_year}'

# Snakemake `envvars` directive listing DW_AUTH_TOKEN. No active rule in this
# file reads the variable; only the commented-out sync_to_dw rule does.
# NOTE(review): Snakemake is documented to fail at startup when a listed
# variable is unset — confirm this is still wanted while sync_to_dw is disabled.
envvars:
'DW_AUTH_TOKEN'
# ---- RULES ----
# Run scripts/00a_download_data.sh with the ACS year and CDC year as its two
# arguments. Declared outputs: the ACS and CDC .rds files under input_data/,
# two indicator-heading text files under _utils/, and the
# .meta_downloaded.json flag that rule `all` requests.
rule download_data:
output:
acs = f'input_data/acs_town_basic_profile_{acs_year}.rds',
cdc = f'input_data/cdc_health_all_lvls_nhood_{cdc_year}.rds',
acs_head = '_utils/acs_indicator_headings.txt',
cdc_head = '_utils/cdc_indicators.txt',
flag = '.meta_downloaded.json',
params:
acs_year = acs_year,
cdc_year = cdc_year,
shell:
'''
bash ./scripts/00a_download_data.sh {params.acs_year} {params.cdc_year}
'''

# Run scripts/00b_make_headings.R to write to_viz/indicators.json. Inputs are
# the two heading text files declared as outputs of download_data.
rule headings:
input:
rules.download_data.output.acs_head,
rules.download_data.output.cdc_head,
output:
headings = 'to_viz/indicators.json',
script:
'scripts/00b_make_headings.R'

# Run scripts/00c_scrape_cga.R to write the legislators and town-district
# crosswalk .rds files under _utils/. No inputs are declared.
# NOTE(review): the script name suggests it scrapes an external site, so the
# result may depend on when it runs — confirm.
rule legislators:
output:
legislators = '_utils/legislators.rds',
xwalk = '_utils/town_dist_xwalk.rds',
script:
'scripts/00c_scrape_cga.R'

# Run scripts/08_make_geo_notes.R to write to_viz/notes.json and
# to_viz/members.json. Inputs are both outputs of rule `legislators` plus
# _utils/manual/sources.txt, which no rule in this file produces.
rule notes:
input:
legislators = rules.legislators.output.legislators,
xwalk = rules.legislators.output.xwalk,
sources = '_utils/manual/sources.txt',
output:
notes = 'to_viz/notes.json',
members = 'to_viz/members.json',
script:
'scripts/08_make_geo_notes.R'

# Run scripts/01_join_acs_health.R on the downloaded ACS and CDC .rds files to
# write the combined .rds under output_data/. The script is also listed as an
# input (presumably so that edits to it mark the output as out of date).
rule combine_datasets:
input:
rules.download_data.output.acs,
rules.download_data.output.cdc,
'scripts/01_join_acs_health.R',
params:
acs_year = acs_year,
cdc_year = cdc_year,
output:
comb = f'output_data/all_legis_{acs_year}_acs_health_comb.rds',
script:
'scripts/01_join_acs_health.R'

# Run scripts/02_prep_distro.R on the headings JSON and the combined dataset.
# The expand() call declares one CSV per entry in `houses` under to_distro/,
# all for the single `acs_year`.
rule distro:
input:
rules.headings.output.headings,
rules.combine_datasets.output.comb,
params:
acs_year = acs_year,
output:
expand('to_distro/{house}_legis_{year}_acs_health_comb.csv', house = houses, year = acs_year),
script:
'scripts/02_prep_distro.R'

# Run scripts/03_prep_json_to_viz.R on the combined dataset to write
# to_viz/legis_wide_{acs_year}.json.
rule viz_data:
input:
rules.combine_datasets.output.comb,
params:
acs_year = acs_year,
output:
viz = f'to_viz/legis_wide_{acs_year}.json',
script:
'scripts/03_prep_json_to_viz.R'


# Run scripts/04_make_shapefiles.R to write one to_viz/shapes/{house}_topo.json
# per entry in `houses`. No inputs are declared.
rule make_shapes:
output:
expand('to_viz/shapes/{house}_topo.json', house = houses),
script:
'scripts/04_make_shapefiles.R'


# Pass every make_shapes output as arguments to
# scripts/05_upload_shapes_release.sh. The declared output is
# .shapes_uploaded.json.
# NOTE(review): presumably the script writes that file as a completion flag —
# confirm against the script.
rule upload_shapes:
input:
rules.make_shapes.output,
output:
'.shapes_uploaded.json'
shell:
'bash ./scripts/05_upload_shapes_release.sh {input}'


# Pass the viz data JSON, headings JSON, and notes JSON, in that order, to
# scripts/07_upload_data_release.sh. The declared output is .viz_uploaded.json.
# The members.json output of rule `notes` is not passed to the script.
rule upload_viz_data:
input:
data = rules.viz_data.output.viz,
headings = rules.headings.output.headings,
notes = rules.notes.output.notes,
output:
'.viz_uploaded.json',
shell:
'bash ./scripts/07_upload_data_release.sh {input.data} {input.headings} {input.notes}'


# rule sync_to_dw:
# input:
# rules.distro.output,
# output:
# '.dw_uploaded.json',
# params:
# key = os.environ['DW_AUTH_TOKEN'],
# year = acs_year,
# files = rules.distro.output,
# shell:
# '''
# bash ./scripts/06_sync_to_dw.sh {params.key} {params.year} {params.files}
# '''


# ---- MAIN TARGETS ----

# Render README.qmd with quarto. README.md and dag.png are the declared
# outputs; the Snakefile is listed as an input alongside the .qmd source.
# NOTE(review): dag.png is not named in the shell command, so it is presumably
# written by a chunk inside README.qmd — confirm.
rule readme:
input:
readme = 'README.qmd',
snakefile = 'Snakefile',
output:
md = 'README.md',
dag = 'dag.png',
shell:
'quarto render {input.readme}'

# Default target (default_target: True). Requests README.md, the viz JSON, the
# distribution CSVs, both upload flag files, and the download flag file.
# The sync_to_dw output stays commented out, matching the disabled rule.
rule all:
default_target: True
input:
rules.readme.output.md,
rules.viz_data.output,
rules.distro.output,
rules.upload_shapes.output,
rules.upload_viz_data.output,
# rules.sync_to_dw.output,
rules.download_data.output.flag,

# ---- CLEANUP ----
# Delete generated files with `rm -f`: CSVs in to_distro/, JSON in to_viz/ and
# to_viz/shapes/, .rds files in input_data/ and output_data/, and the .txt and
# .rds files directly under _utils/.
# Not matched by these globs: the dot-prefixed flag files in the project root
# (.meta_downloaded.json, .shapes_uploaded.json, .viz_uploaded.json),
# _utils/manual/sources.txt, README.md, and dag.png.
rule clean:
shell:
'''
rm -f to_distro/*.csv \
to_viz/*.json \
to_viz/shapes/*.json \
input_data/*.rds \
output_data/*.rds \
_utils/*.txt \
_utils/*.rds
'''

0 comments on commit 3cb900d

Please sign in to comment.