Skip to content
This repository has been archived by the owner on Nov 23, 2023. It is now read-only.
/ GithubMetrics Public archive

gh wrapper for easier management of Github related study metadata

License

Unknown, MIT licenses found

Licenses found

Unknown
LICENSE
MIT
LICENSE.md
Notifications You must be signed in to change notification settings

openpharma/GithubMetrics

Repository files navigation

GithubMetrics

It's now recommended to use this package instead: https://github.com/r-world-devs/GitStats

R-CMD-check Codecov test coverage HitCount

The aim of this package is to provide a wrapper on gh to quickly get the key Github repo information you need. The code here is used within Roche to let me quickly answer simple questions like:

  • How many studies have more than 1 data scientist (and roughly what’s the commit split)
  • What are the common languages being used (proxied through file type distribution within repos)
  • Pull commit metadata to enrich other study info held in other systems

Installation

You can install the released version of GithubMetrics from CRAN with:

install.packages("GithubMetrics")

Setup

library(GithubMetrics)
library(tidyverse)
library(glue)

organisation <- "openpharma"

Repos in an org

Pull all the repos present within an org (that I can see).

# Fetch the raw repository listing for the organisation.
repos_raw <- gh_repos_get(org = organisation)

# Tidy the raw response; the resulting columns are shown in the output below.
repos_clean <- gh_repos_clean(repos_raw)

# Inspect the cleaned result.
glimpse(repos_clean)
#> Rows: 14
#> Columns: 7
#> $ name           <chr> "BBS-causality-training", "GithubMetrics", "facetsr", …
#> $ full_name      <chr> "openpharma/BBS-causality-training", "openpharma/Githu…
#> $ size           <int> 27, 118, 2163, 5435, 87, 939, 1817, 79487, 329, 0, 482…
#> $ updated_at     <chr> "2021-01-29T18:01:35Z", "2021-02-03T07:07:43Z", "2020-…
#> $ default_branch <chr> "main", "master", "master", "master", "master", "maste…
#> $ language       <chr> "R", "R", "R", "Unsure", "Python", "R", "C", "R", "R",…
#> $ MB             <dbl> 0.0, 0.1, 2.1, 5.3, 0.1, 0.9, 1.8, 77.6, 0.3, 0.0, 0.5…

Realistically, research code is likely to be on Github Enterprise, so the .api_url and .token parameters can be passed through to gh(). Commented code below shows how you can use an on-premise Github server.

# repos_raw <- gh_repos_get(
#   org = organisation,
#   .api_url = "https://github.roche.com/api/v3",
#   .token = Sys.getenv("GITHUB_PAT_ROCHE")
#   )

Commits

Get every commit for all the repos in this organisation.

# Request commits for every repo that has content (size > 0), looking back
# 365 * 10 days.
repo_all_commits <- gh_commits_get(
  repos_clean %>%
    filter(size > 0) %>%
    pull(full_name),
  days_back = 365 * 10
)

# Inspect the combined commit log.
glimpse(repo_all_commits)
#> Rows: 1,762
#> Columns: 5
#> $ full_name      <chr> "openpharma/BBS-causality-training", "openpharma/BBS-c…
#> $ author         <chr> "heinzmann537", "heinzmann537", "heinzmann537", "epiji…
#> $ datetime       <chr> "2021-01-29T18:00:10Z", "2021-01-29T12:55:54Z", "2021-…
#> $ sha            <chr> "5ac98df2a99db3b50abae114e37c00e433903094", "059569252…
#> $ commit_message <chr> "Update variable naming ADALM", "Small change", "First…

People

Pull all the people that have committed to repos in the organisation.

# Count commits per author, then drop the two pseudo-authors listed below.
contributors <- repo_all_commits %>%
  group_by(author) %>%
  summarise(commits = n()) %>%
  filter(!(author %in% c(".gitconfig missing email", "actions-user")))

# Attach the profile details returned by gh_user_get() to each author.
contributors <- left_join(
  contributors,
  gh_user_get(contributors$author),
  by = c("author" = "username")
)

# Render the contributors as a table, highest commit count first.
contributors %>%
  arrange(desc(commits)) %>%
  mutate(
    # Time elapsed between last_active and today.
    last_active = Sys.Date() - last_active,
    # Avatar thumbnail followed by the username.
    contributor = glue('<img src="{avatar}" alt="" height="30"> {author}'),
    # An empty blog field stays empty; anything else becomes a link.
    blog = case_when(
      blog == "" ~ "",
      TRUE ~ as.character(glue('<a href="{blog}">link</a>'))
    )
  ) %>%
  select(contributor, commits, name, last_active, company, location, blog) %>%
  knitr::kable()
contributor commits name last_active company location blog
evanmiller 936 Evan Miller 17 days NA Chicago, IL link
SHAESEN2 127 Steven Haesendonckx 20 days NA NA
diego-s 122 Diego S 255 days NA NA
bailliem 109 Mark Baillie 0 days NA Basel, CH link
epijim 89 James Black 5 days Roche Basel, Switzerland link
jaredhobbs 70 Jared Hobbs 89 days YearEnd, Inc. Salt Lake City, UT link
kalimu 42 Kamil Wais 9 days 7N / Roche Rzeszów link
Jonnie-Bevan 28 NA 63 days NA NA
cschaerfe 21 Charlotta 118 days NA NA
davidanthoff 12 David Anthoff 1 days University of California, Berkeley Berkeley, CA link
jar1karp 12 Jari Karppinen 154 days NA NA link
mikmart 12 Mikko Marttila 2 days NA NA link
reikoch 8 NA 6 days NA NA
afeld 6 Aidan Feldman 0 days @GSA and personal projects Brooklyn, NY link
erblast 6 Björn Oettinghaus 22 days NA Switzerland link
lionel- 6 Lionel Henry 70 days @rstudio NA
bpfoley 5 Brian Foley 94 days NA Seattle, Washington
rebecca-albrecht 4 NA 5 days NA NA
dazim 3 Tim Treis 23 days NA Heidelberg, Germany
heinzmann537 3 NA 5 days NA NA
kentm4 3 Matt Kent 2 days Genesis Research NA
PaulJordan57 3 NA 19 days NA NA
galachad 2 Adam Foryś 20 days @Roche Warsaw, Poland link
gerph 2 Charles Ferguson 8 days NA NA
hadley 2 Hadley Wickham 0 days @rstudio Houston, TX link
kawap 2 NA 289 days Roche / 7N NA
kleschenko 2 Kostya Leschenko 5 days @datarobot Lviv, Ukraine
kshedden 2 Kerby Shedden 1 days NA NA
kurt-vd 2 Kurt Van Dijck 63 days NA NA
mrocklin 2 Matthew Rocklin 2 days @coiled San Juan Capistrano, CA link
thomas-neitmann 2 Thomas Neitmann 1 days Roche Basel, Switzerland link
waddella 2 Adrian Waddell 27 days NA NA link
ararslan 1 Alex Arslan 0 days Beacon Biosignals Seattle, WA
ginberg 1 NA 14 days NA Remote link
ivarref 1 Ivar Refsdal 13 days NA Bergen, Norway
jonathon-love 1 Jonathon Love 1 days NA NA link
Karissa 1 NA 363 days NA NA
thanos-siadimas 1 NA 1 days NA NA

Files

Pull a specific file using gh_file_get().

# Download the DESCRIPTION file and hand its text to desc::desc() for parsing.
desc_formatted <- desc::desc(
  text = gh_file_get(
    repo = "GithubMetrics",
    org = "OpenPharma",
    file = "DESCRIPTION"
  )
)

# Show a few selected fields as a name/value table.
desc_formatted$get(c("Package", "Title", "Version")) %>%
  tibble::enframe() %>%
  knitr::kable()
name value
Package GithubMetrics
Title Quickly get key metrics on Github repositaries
Version 0.1.0

Get all of the files present in the last commit of all the repos using gh_repo_files_get().

# List the files for the repos in the commit log, restricted to the last
# commit of each (only_last_commit = TRUE).
repo_files <- gh_repo_files_get(
  repo_commits     = repo_all_commits,
  only_last_commit = TRUE
)
#> Pulling files in latest commit from 13 repos

# Inspect the file listing.
glimpse(repo_files)
#> Rows: 1,311
#> Columns: 6
#> $ repo       <chr> "openpharma/visR-docs", "openpharma/visR-docs", "openpharm…
#> $ file       <chr> "readme.md", "docs", "docs/404.html", "docs/code_of_conduc…
#> $ sha_repo   <chr> "5b35fdbc39b87a154c9426e363c8f5a2c83d66b0", "5b35fdbc39b87…
#> $ sha_commit <chr> "642856728e165746076a17c6522b9264f693f37d", "642856728e165…
#> $ extension  <chr> "md", "docs", "html", "html", "html", "html", "png", "png"…
#> $ lang       <chr> "Markdown", NA, "HTML", "HTML", "HTML", "HTML", NA, NA, NA…

# Per repo: total number of files plus the number of R and Python files.
repo_files %>%
  group_by(repo) %>%
  summarise(
    Files = n(),
    # %in% never yields NA, so files with no detected language count as 0.
    `R files` = sum(lang %in% "R"),
    `Python files` = sum(lang %in% c("Python", "Jupyter Notebook"))
  ) %>%
  knitr::kable(caption = "Types of files in the organisation")
repo Files R files Python files
openpharma/BBS-causality-training 4 2 0
openpharma/CTP 100 30 0
openpharma/facetsr 63 13 0
openpharma/GithubMetrics 43 22 0
openpharma/openpharma.github.io 76 1 0
openpharma/pypharma_nlp 131 0 49
openpharma/RDO 105 11 0
openpharma/ReadStat 207 0 0
openpharma/sas7bdat 8 0 2
openpharma/simaerep 145 32 0
openpharma/syntrial 67 24 0
openpharma/visR 177 81 0
openpharma/visR-docs 185 0 0

Types of files in the organisation

# Search the organisation's code for the term "tidyverse".
results <- gh_repo_search(code = "tidyverse", organisation = organisation)

# Inspect the search hits.
glimpse(results)
#> Rows: 12
#> Columns: 7
#> $ full_name <chr> "openpharma/GithubMetrics", "openpharma/GithubMetrics", "op…
#> $ name      <chr> "GithubMetrics", "GithubMetrics", "GithubMetrics", "GithubM…
#> $ file_name <chr> "README.md", "README.Rmd", "DESCRIPTION", "test-gh_repos_XX…
#> $ path      <chr> "README.md", "README.Rmd", "DESCRIPTION", "tests/testthat/t…
#> $ url       <chr> "https://github.com/openpharma/GithubMetrics/blob/fa7764869…
#> $ score     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
#> $ lang      <chr> "Markdown", "R", NA, "R", "Markdown", "R", "Markdown", "R",…
#' Count the repositories in an organisation whose code mentions a term
#'
#' Wraps `gh_repo_search()` and collapses its hits into one summary row.
#'
#' @param x Search term, passed to `gh_repo_search()` as `code` (used below
#'   with package names).
#' @param org Organisation to search, passed on as `organisation`.
#' @return A data frame with columns `Organisation`, `Package` and `Repos`
#'   (the number of distinct `full_name` values among the hits), or `NULL`
#'   when the search did not return a data frame.
helper_gh_repo_search <- function(x, org = "openpharma") {

  ## Slow it down! The search API has a rate limit of 30 calls a minute.
  ## On-prem the search rate limit is higher, so this is usually not needed.
  if (interactive()) {
    message("Wait 5 seconds")
  }
  Sys.sleep(5)
  ## End slow down

  results <- gh_repo_search(
    code = x,
    organisation = org
  )

  # Test the type rather than is.na(results): is.na() on a data frame returns
  # a logical matrix, and `if` errors on a condition whose length is not 1
  # (length > 1 since R 4.2, length 0 always). Anything that is not a data
  # frame (e.g. NA or NULL) is treated as "no hits".
  if (!is.data.frame(results)) {
    return(NULL)
  }

  results %>%
    mutate(Package = x, Organisation = org) %>%
    group_by(Organisation, Package) %>%
    summarise(
      Repos = n_distinct(full_name), .groups = "drop"
    )
}

# Package names to look for in each organisation's code.
packages <- c("tidyverse", "pkgdown", "dplyr", "data.table")

# Run every package search against every organisation, in the order listed,
# and stack the per-organisation summaries into a single data frame.
package_use <- c(
  "PHCAnalytics", "openpharma", "AstraZeneca",
  "Roche", "Genentech", "Novartis"
) %>%
  map_df(function(org_name) {
    map_df(packages, helper_gh_repo_search, org = org_name)
  })
#> pkgdown does not appear in PHCAnalytics.
#> query = 'pkgdown in:file  user:PHCAnalytics'
#> tidyverse does not appear in AstraZeneca.
#> query = 'tidyverse in:file  user:AstraZeneca'
#> pkgdown does not appear in AstraZeneca.
#> query = 'pkgdown in:file  user:AstraZeneca'
#> data.table does not appear in AstraZeneca.
#> query = 'data.table in:file  user:AstraZeneca'


# One row per organisation and one column per package, plus a row total;
# organisations with the highest total come first.
package_use %>%
  pivot_wider(names_from = "Package", values_from = "Repos") %>%
  # Drop the first column (Organisation) so only the counts are summed;
  # packages with no hits (NA) are ignored in the total.
  mutate(Total = rowSums(.[, -1], na.rm = TRUE)) %>%
  arrange(desc(Total)) %>%
  knitr::kable(
    caption = "Package use detected within repositaries in Pharma orgs"
  )
Organisation tidyverse dplyr data.table pkgdown Total
Novartis 4 10 12 6 32
openpharma 4 6 2 6 18
Roche 3 2 3 3 11
Genentech 3 3 3 2 11
PHCAnalytics 2 4 4 NA 10
AstraZeneca NA 1 NA NA 1

Package use detected within repositaries in Pharma orgs