It's now recommended to use this package instead: https://github.com/r-world-devs/GitStats
The aim of this package is to provide a wrapper on gh
to quickly get
you the key Github repo information you need. The code here is used within
Roche to let me quickly answer simple questions like:
- How many studies have more than 1 data scientist (and roughly what’s the commit split)
- What are the common languages being used (proxied through file type distribution within repos)
- Pull commit metadata to enrich other study info held in other systems
You can install the released version of GithubMetrics from CRAN with:
install.packages("GithubMetrics")
library(GithubMetrics)
library(tidyverse)
library(glue)
organisation <- "openpharma"
Pull all the repos present within an org (that I can see).
# Fetch the raw repository listing for the organisation, tidy it with
# gh_repos_clean(), then preview the resulting table.
repos_raw <- gh_repos_get(org = organisation)
repos_clean <- repos_raw %>%
  gh_repos_clean()
glimpse(repos_clean)
#> Rows: 14
#> Columns: 7
#> $ name <chr> "BBS-causality-training", "GithubMetrics", "facetsr", …
#> $ full_name <chr> "openpharma/BBS-causality-training", "openpharma/Githu…
#> $ size <int> 27, 118, 2163, 5435, 87, 939, 1817, 79487, 329, 0, 482…
#> $ updated_at <chr> "2021-01-29T18:01:35Z", "2021-02-03T07:07:43Z", "2020-…
#> $ default_branch <chr> "main", "master", "master", "master", "master", "maste…
#> $ language <chr> "R", "R", "R", "Unsure", "Python", "R", "C", "R", "R",…
#> $ MB <dbl> 0.0, 0.1, 2.1, 5.3, 0.1, 0.9, 1.8, 77.6, 0.3, 0.0, 0.5…
Realistically, research code is likely to be on Github Enterprise, so
the .api_url
and .token
parameters can be passed through to gh()
.
Commented code below shows how you can use an on-premise Github server.
# repos_raw <- gh_repos_get(
# org = organisation,
# .api_url = "https://github.roche.com/api/v3",
# .token = Sys.getenv("GITHUB_PAT_ROCHE")
# )
Get every commit for all the repos in this organisation.
# Collect the commit history for every repository with size > 0,
# looking back 3650 days (about ten years).
repo_all_commits <- repos_clean %>%
  filter(size > 0) %>%
  pull(full_name) %>%
  gh_commits_get(days_back = 10 * 365)
glimpse(repo_all_commits)
#> Rows: 1,762
#> Columns: 5
#> $ full_name <chr> "openpharma/BBS-causality-training", "openpharma/BBS-c…
#> $ author <chr> "heinzmann537", "heinzmann537", "heinzmann537", "epiji…
#> $ datetime <chr> "2021-01-29T18:00:10Z", "2021-01-29T12:55:54Z", "2021-…
#> $ sha <chr> "5ac98df2a99db3b50abae114e37c00e433903094", "059569252…
#> $ commit_message <chr> "Update variable naming ADALM", "Small change", "First…
Pull all the people that have committed in r
.
# Count commits per author, leaving out the placeholder and bot accounts.
contributors <- repo_all_commits %>%
  filter(!author %in% c(".gitconfig missing email", "actions-user")) %>%
  group_by(author) %>%
  summarise(commits = n())

# Attach the user details returned by gh_user_get() to each author.
contributors <- left_join(
  contributors,
  gh_user_get(contributors$author),
  by = c("author" = "username")
)

# Render the contributor table, most commits first.
contributors %>%
  arrange(desc(commits)) %>%
  mutate(
    # Avatar thumbnail followed by the username.
    contributor = glue('<img src="{avatar}" alt="" height="30"> {author}'),
    # Only build a hyperlink when a blog URL is present.
    blog = case_when(
      blog == "" ~ "",
      TRUE ~ as.character(glue('<a href="{blog}">link</a>'))
    ),
    # Time elapsed since the user was last active.
    last_active = Sys.Date() - last_active
  ) %>%
  select(contributor, commits, name, last_active, company, location, blog) %>%
  knitr::kable()
contributor | commits | name | last_active | company | location | blog |
---|---|---|---|---|---|---|
evanmiller | 936 | Evan Miller | 17 days | NA | Chicago, IL | link |
SHAESEN2 | 127 | Steven Haesendonckx | 20 days | NA | NA | |
diego-s | 122 | Diego S | 255 days | NA | NA | |
bailliem | 109 | Mark Baillie | 0 days | NA | Basel, CH | link |
epijim | 89 | James Black | 5 days | Roche | Basel, Switzerland | link |
jaredhobbs | 70 | Jared Hobbs | 89 days | YearEnd, Inc. | Salt Lake City, UT | link |
kalimu | 42 | Kamil Wais | 9 days | 7N / Roche | Rzeszów | link |
Jonnie-Bevan | 28 | NA | 63 days | NA | NA | |
cschaerfe | 21 | Charlotta | 118 days | NA | NA | |
davidanthoff | 12 | David Anthoff | 1 days | University of California, Berkeley | Berkeley, CA | link |
jar1karp | 12 | Jari Karppinen | 154 days | NA | NA | link |
mikmart | 12 | Mikko Marttila | 2 days | NA | NA | link |
reikoch | 8 | NA | 6 days | NA | NA | |
afeld | 6 | Aidan Feldman | 0 days | @GSA and personal projects | Brooklyn, NY | link |
erblast | 6 | Björn Oettinghaus | 22 days | NA | Switzerland | link |
lionel- | 6 | Lionel Henry | 70 days | @rstudio | NA | |
bpfoley | 5 | Brian Foley | 94 days | NA | Seattle, Washington | |
rebecca-albrecht | 4 | NA | 5 days | NA | NA | |
dazim | 3 | Tim Treis | 23 days | NA | Heidelberg, Germany | |
heinzmann537 | 3 | NA | 5 days | NA | NA | |
kentm4 | 3 | Matt Kent | 2 days | Genesis Research | NA | |
PaulJordan57 | 3 | NA | 19 days | NA | NA | |
galachad | 2 | Adam Foryś | 20 days | @Roche | Warsaw, Poland | link |
gerph | 2 | Charles Ferguson | 8 days | NA | NA | |
hadley | 2 | Hadley Wickham | 0 days | @rstudio | Houston, TX | link |
kawap | 2 | NA | 289 days | Roche / 7N | NA | |
kleschenko | 2 | Kostya Leschenko | 5 days | @datarobot | Lviv, Ukraine | |
kshedden | 2 | Kerby Shedden | 1 days | NA | NA | |
kurt-vd | 2 | Kurt Van Dijck | 63 days | NA | NA | |
mrocklin | 2 | Matthew Rocklin | 2 days | @coiled | San Juan Capistrano, CA | link |
thomas-neitmann | 2 | Thomas Neitmann | 1 days | Roche | Basel, Switzerland | link |
waddella | 2 | Adrian Waddell | 27 days | NA | NA | link |
ararslan | 1 | Alex Arslan | 0 days | Beacon Biosignals | Seattle, WA | |
ginberg | 1 | NA | 14 days | NA | Remote | link |
ivarref | 1 | Ivar Refsdal | 13 days | NA | Bergen, Norway | |
jonathon-love | 1 | Jonathon Love | 1 days | NA | NA | link |
Karissa | 1 | NA | 363 days | NA | NA | |
thanos-siadimas | 1 | NA | 1 days | NA | NA |
Pull a specific file using gh_file_get()
.
# Download the DESCRIPTION file and parse its text with desc::desc().
desc_formatted <- desc::desc(
  text = gh_file_get(
    repo = "GithubMetrics",
    org = "OpenPharma",
    file = "DESCRIPTION"
  )
)

# Show a few headline fields as a two-column table.
headline_fields <- c("Package", "Title", "Version")
knitr::kable(
  tibble::enframe(desc_formatted$get(headline_fields))
)
name | value |
---|---|
Package | GithubMetrics |
Title | Quickly get key metrics on Github repositaries |
Version | 0.1.0 |
Get all of the files present in the last commit of all the repos using
gh_repo_files_get()
.
# List the files present in the latest commit of each repository.
repo_files <- gh_repo_files_get(
  only_last_commit = TRUE,
  repo_commits = repo_all_commits
)
#> Pulling files in latest commit from 13 repos
glimpse(repo_files)
#> Rows: 1,311
#> Columns: 6
#> $ repo <chr> "openpharma/visR-docs", "openpharma/visR-docs", "openpharm…
#> $ file <chr> "readme.md", "docs", "docs/404.html", "docs/code_of_conduc…
#> $ sha_repo <chr> "5b35fdbc39b87a154c9426e363c8f5a2c83d66b0", "5b35fdbc39b87…
#> $ sha_commit <chr> "642856728e165746076a17c6522b9264f693f37d", "642856728e165…
#> $ extension <chr> "md", "docs", "html", "html", "html", "html", "png", "png"…
#> $ lang <chr> "Markdown", NA, "HTML", "HTML", "HTML", "HTML", NA, NA, NA…
# Per-repository file counts, with R and Python sources split out.
# `%in%` is used so that files with an unknown (NA) language count as 0.
repo_files %>%
  group_by(repo) %>%
  summarise(
    Files = n(),
    `R files` = sum(lang %in% "R"),
    `Python files` = sum(lang %in% c("Python", "Jupyter Notebook"))
  ) %>%
  knitr::kable(caption = "Types of files in the organisation")
repo | Files | R files | Python files |
---|---|---|---|
openpharma/BBS-causality-training | 4 | 2 | 0 |
openpharma/CTP | 100 | 30 | 0 |
openpharma/facetsr | 63 | 13 | 0 |
openpharma/GithubMetrics | 43 | 22 | 0 |
openpharma/openpharma.github.io | 76 | 1 | 0 |
openpharma/pypharma_nlp | 131 | 0 | 49 |
openpharma/RDO | 105 | 11 | 0 |
openpharma/ReadStat | 207 | 0 | 0 |
openpharma/sas7bdat | 8 | 0 | 2 |
openpharma/simaerep | 145 | 32 | 0 |
openpharma/syntrial | 67 | 24 | 0 |
openpharma/visR | 177 | 81 | 0 |
openpharma/visR-docs | 185 | 0 | 0 |
Types of files in the organisation
# Search the organisation's code for the term "tidyverse" and preview the hits.
results <- gh_repo_search(code = "tidyverse", organisation = organisation)
glimpse(results)
#> Rows: 12
#> Columns: 7
#> $ full_name <chr> "openpharma/GithubMetrics", "openpharma/GithubMetrics", "op…
#> $ name <chr> "GithubMetrics", "GithubMetrics", "GithubMetrics", "GithubM…
#> $ file_name <chr> "README.md", "README.Rmd", "DESCRIPTION", "test-gh_repos_XX…
#> $ path <chr> "README.md", "README.Rmd", "DESCRIPTION", "tests/testthat/t…
#> $ url <chr> "https://github.com/openpharma/GithubMetrics/blob/fa7764869…
#> $ score <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
#> $ lang <chr> "Markdown", "R", NA, "R", "Markdown", "R", "Markdown", "R",…
# Count the repositories in an organisation whose code mentions a term.
#
# Arguments:
#   x    Search term (here a package name), passed to gh_repo_search() as
#        `code`.
#   org  Organisation to search within.
#   wait Seconds to pause before searching, to stay under the search rate
#        limit. Use 0 to skip the pause.
#
# Returns a summary table with the columns Organisation, Package and Repos
# (the number of distinct `full_name` values among the hits), or NULL when
# the search does not return a data frame.
helper_gh_repo_search <- function(x, org = "openpharma", wait = 5) {
  ## Slow it down! as search has 30 calls a minute rate limit.
  ## On-prem the search rate limit is usually higher, so the pause is
  ## usually not needed there (pass `wait = 0`).
  if (wait > 0) {
    if (interactive()) {
      message("Wait ", wait, " seconds")
    }
    Sys.sleep(wait)
  }
  ## End slow down

  results <- gh_repo_search(
    code = x,
    organisation = org
  )

  ## A search without hits does not come back as a data frame (it appears
  ## to be a bare NA). Calling is.na() on a real result yields one value
  ## per cell, which is not a valid `if` condition, so test the type.
  if (!is.data.frame(results)) {
    return(NULL)
  }

  results %>%
    mutate(Package = x, Organisation = org) %>%
    group_by(Organisation, Package) %>%
    summarise(
      Repos = n_distinct(full_name), .groups = "drop"
    )
}
# Packages to look for, and the organisations to look in.
packages <- c(
  "tidyverse", "pkgdown", "dplyr", "data.table"
)
pharma_orgs <- c(
  "PHCAnalytics", "openpharma", "AstraZeneca",
  "Roche", "Genentech", "Novartis"
)

# Run every package search for each organisation in turn and stack the
# per-organisation summaries into one table. Searches that find nothing
# return NULL and so contribute no rows.
package_use <- map_df(
  pharma_orgs,
  function(org) {
    map_df(packages, helper_gh_repo_search, org = org)
  }
)
#> pkgdown does not appear in PHCAnalytics.
#> query = 'pkgdown in:file user:PHCAnalytics'
#> tidyverse does not appear in AstraZeneca.
#> query = 'tidyverse in:file user:AstraZeneca'
#> pkgdown does not appear in AstraZeneca.
#> query = 'pkgdown in:file user:AstraZeneca'
#> data.table does not appear in AstraZeneca.
#> query = 'data.table in:file user:AstraZeneca'
# One row per organisation and one column per package, plus a row total.
# Organisations with the most detected usage are listed first.
package_use %>%
  pivot_wider(values_from = "Repos", names_from = "Package") %>%
  # Sum every column except the first (Organisation), ignoring packages
  # that were not found (NA).
  mutate(Total = rowSums(.[, -1], na.rm = TRUE)) %>%
  arrange(desc(Total)) %>%
  knitr::kable(
    caption = "Package use detected within repositaries in Pharma orgs"
  )
Organisation | tidyverse | dplyr | data.table | pkgdown | Total |
---|---|---|---|---|---|
Novartis | 4 | 10 | 12 | 6 | 32 |
openpharma | 4 | 6 | 2 | 6 | 18 |
Roche | 3 | 2 | 3 | 3 | 11 |
Genentech | 3 | 3 | 3 | 2 | 11 |
PHCAnalytics | 2 | 4 | 4 | NA | 10 |
AstraZeneca | NA | 1 | NA | NA | 1 |
Package use detected within repositaries in Pharma orgs