-
Notifications
You must be signed in to change notification settings - Fork 0
/
092.save-raws-to-Rdata.Rmd
102 lines (81 loc) · 3.21 KB
/
092.save-raws-to-Rdata.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
---
title: "Save raw data into .Rdata to be loaded in by analysis notebooks"
output:
  html_document:
    theme: flatly
    toc: true
    toc_float: true
    code_download: true
    highlight: tango
knit: (function(inputFile, encoding) {
  rmarkdown::render(inputFile, encoding = encoding, output_dir = "docs") })
---
```{r setup, include=FALSE}
# Data-wrangling / plotting packages used throughout this notebook.
library(tidyverse)
# Date helpers (ymd(), year()) used when parsing publication dates below.
library(lubridate)
# Project-local helper functions; their contents are not visible in this file.
source('utils/r-utils.R')
# Provides ne_countries(), used in the next chunk to load country polygons.
library(rnaturalearth)
# Default ggplot2 theme for the notebook: theme_bw() with legend titles removed.
theme_set(theme_bw() + theme(legend.title = element_blank()))
```
```{r}
# Country polygons at the 'medium' scale, requested as an sf object.
world <- rnaturalearth::ne_countries(
  scale = 'medium',
  returnclass = 'sf'
)
```
### General data read-in
```{r}
# Full-name table; exact duplicate rows are dropped up front.
all_full_names <- readr::read_tsv('data/names/full-names.tsv.xz') %>%
  distinct()

# Country lookup: ISO alpha-2 code, display name and World Bank region.
# The explicit column selection keeps only these three fields, so the sf
# geometry column is discarded here without needing a separate step.
nat_to_reg <- world %>%
  as_tibble() %>%
  select(countries = iso_a2, country_name = name, region = region_wb) %>%
  mutate(
    country_name = gsub('United States of America', 'United States', country_name)
  )

# Set the two-letter codes by hand for these three entries.
# NOTE(review): Somaliland is deliberately mapped to 'SO' here -- confirm that
# folding it into Somalia is the intended treatment downstream.
nat_to_reg[nat_to_reg$country_name == 'Norway', 'countries'] <- 'NO'
nat_to_reg[nat_to_reg$country_name == 'Somaliland', 'countries'] <- 'SO'
nat_to_reg[nat_to_reg$country_name == 'France', 'countries'] <- 'FR'

# Diagnostic output: countries that still have no code after the manual fixes.
nat_to_reg %>% filter(is.na(countries))

# PubMed articles published before 2020.
# `year` is the first four characters of publication_date parsed as a date;
# truncated = 2 lets ymd() accept values that lack a month and/or a day.
articles <- readr::read_tsv('data/pubmed/articles.tsv.xz') %>%
  mutate(
    year = substr(publication_date, 1, 4) %>% ymd(truncated = 2),
    publication_date = ymd(publication_date, truncated = 2)
  ) %>%
  filter(year(publication_date) < 2020)

# citations <- xml2::read_xml('data/pubmed/esummary/compbio-english.xml.xz')

# Corresponding-author records joined to their article metadata.
# fore_name_simple is forced to character so readr does not guess another type.
# adjusted_citations is the square root of (citation count + 1).
corr_authors <- readr::read_tsv(
  'data/names/corresponding-authors.tsv.xz',
  col_types = readr::cols(fore_name_simple = readr::col_character())
) %>%
  inner_join(articles, by = 'pmid') %>%
  mutate(adjusted_citations = sqrt(pmc_cited_by_count + 1))

# ISCB keynote speakers / honorees, matched to the simplified names in
# all_full_names by (fore_name, last_name).
keynotes <- readr::read_tsv('data/iscb/keynotes.tsv') %>%
  mutate(
    publication_date = ymd(year, truncated = 2),
    year = ymd(year, truncated = 2)
  ) %>%
  left_join(
    select(all_full_names, - full_name),
    by = c('fore_name', 'last_name')
  ) %>%
  # remove PSB, exclude ISCB Fellows and ISMB speakers in 2020 for now
  filter(year(year) < 2020, conference != 'PSB')

# Diagnostic output: keynote rows that found no match in all_full_names.
keynotes %>% filter(is.na(fore_name_simple))

# The ten journals with the most articles.
large_jours <- articles %>%
  count(journal, sort = TRUE) %>%
  head(10)

# Name-origin predictions, pinned to a specific commit of the source repo.
nationalize_df <- read_tsv('https://raw.githubusercontent.com/greenelab/wiki-nationality-estimate/6ab0feeca430ae9997dbaf8f81707359be50a17d/data/NamePrism_results_authors.tsv')
# The name column was previously addressed as `X1`, the name readr 1.x gives
# to an unnamed first column; readr >= 2.0 calls it `...1` instead. Renaming
# by position works with either version.
names(nationalize_df)[1] <- 'full_name'
nationalize_df <- nationalize_df %>%
  distinct(full_name, .keep_all = TRUE) %>%
  left_join(all_full_names, by = 'full_name')
```
- Number of articles from 1993-2019: `r nrow(articles)` (~ 100 articles with no authors).
- Number of last authors: `r nrow(corr_authors)`.
```{r}
# Interactive table with the number of corr_authors rows per publication year.
# FALSE is spelled out because `F` is an ordinary variable that can be reassigned.
corr_authors %>%
  count(year, name = 'Number of articles with last authors') %>%
  DT::datatable(rownames = FALSE)
```
**If we set a threshold of more than 200 articles a year, we should only consider articles from 1998 on.**
```{r}
# Keep only the years that contribute more than 200 rows to corr_authors.
corr_authors <- corr_authors %>%
  group_by(year) %>%
  filter(n() > 200) %>%
  ungroup()

# Number of rows remaining after the threshold is applied.
nrow(corr_authors)
```
```{r}
# Write the prepared objects to a single .Rdata file, passing them by name.
save(
  list = c(
    'nationalize_df',
    'nat_to_reg',
    'corr_authors',
    'keynotes',
    'large_jours',
    'world'
  ),
  file = 'Rdata/raws.Rdata'
)
```