GithubMetrics

It's now recomended to use this package instead: https://github.com/r-world-devs/GitStats

The aim of this package is to provide a wrapper on gh to quickly get you key Github repo information you need.The code here is used within Roche to quickly let me pull answer simple questions like:

How many studies have more than 1 data scientist (and roughly what’s the commit split)
What are the common languages being used (proxied through file type distribution within repos)
Pull commit metadata to enrich other study info held in other systems

Installation

You can install the released version of GithubMetrics from CRAN with:

install.packages("GithubMetrics")

Setup

library(GithubMetrics)
library(tidyverse)
library(glue)

organisation <- "openpharma"

Repos in an org

Pull all the repos present within an org (that I can see).

repos_raw <- gh_repos_get(
  org = organisation
  )

repos_clean <- gh_repos_clean(repos_raw)

glimpse(repos_clean) 
#> Rows: 14
#> Columns: 7
#> $ name           <chr> "BBS-causality-training", "GithubMetrics", "facetsr", …
#> $ full_name      <chr> "openpharma/BBS-causality-training", "openpharma/Githu…
#> $ size           <int> 27, 118, 2163, 5435, 87, 939, 1817, 79487, 329, 0, 482…
#> $ updated_at     <chr> "2021-01-29T18:01:35Z", "2021-02-03T07:07:43Z", "2020-…
#> $ default_branch <chr> "main", "master", "master", "master", "master", "maste…
#> $ language       <chr> "R", "R", "R", "Unsure", "Python", "R", "C", "R", "R",…
#> $ MB             <dbl> 0.0, 0.1, 2.1, 5.3, 0.1, 0.9, 1.8, 77.6, 0.3, 0.0, 0.5…

Realistically, research code is likely to be on Github Enterprise, so the .api_url and .token parameters can be passed through to gh(). Commented code below shows how you can use an on-premise Github server.

# repos_raw <- gh_repos_get(
#   org = organisation,
#   .api_url = "https://github.roche.com/api/v3",
#   .token = Sys.getenv("GITHUB_PAT_ROCHE")
#   )

Commits

Get every commit for all the repos in this organisation.

repo_all_commits <- gh_commits_get(
  repos_clean %>% filter(size > 0) %>% pull(full_name), 
  days_back = 365*10
)

glimpse(repo_all_commits)
#> Rows: 1,762
#> Columns: 5
#> $ full_name      <chr> "openpharma/BBS-causality-training", "openpharma/BBS-c…
#> $ author         <chr> "heinzmann537", "heinzmann537", "heinzmann537", "epiji…
#> $ datetime       <chr> "2021-01-29T18:00:10Z", "2021-01-29T12:55:54Z", "2021-…
#> $ sha            <chr> "5ac98df2a99db3b50abae114e37c00e433903094", "059569252…
#> $ commit_message <chr> "Update variable naming ADALM", "Small change", "First…

People

Pull all the people that have committed in r.

contributors <- repo_all_commits %>%
  group_by(author) %>%
  summarise(
    commits = n()
  ) %>%
  filter(!author %in% c(".gitconfig missing email","actions-user"))
  
contributors <- contributors %>%
  left_join(
    gh_user_get(contributors$author),
    by = c("author"="username")
  )

contributors %>%
  arrange(-commits) %>%
  mutate(
    last_active = Sys.Date() - last_active,
    contributor = glue('<img src="{avatar}" alt="" height="30"> {author}'),
    blog = case_when(
      blog == "" ~ "",
      TRUE ~ as.character(glue('<a href="{blog}">link</a>'))
      )
    ) %>%
  select(contributor,commits,name,last_active,company,location,blog) %>%
  knitr::kable(
    
  )

contributor	commits	name	last_active	company	location	blog
evanmiller	936	Evan Miller	17 days	NA	Chicago, IL	link
SHAESEN2	127	Steven Haesendonckx	20 days	NA	NA
diego-s	122	Diego S	255 days	NA	NA
bailliem	109	Mark Baillie	0 days	NA	Basel, CH	link
epijim	89	James Black	5 days	Roche	Basel, Switzerland	link
jaredhobbs	70	Jared Hobbs	89 days	YearEnd, Inc.	Salt Lake City, UT	link
kalimu	42	Kamil Wais	9 days	7N / Roche	Rzeszów	link
Jonnie-Bevan	28	NA	63 days	NA	NA
cschaerfe	21	Charlotta	118 days	NA	NA
davidanthoff	12	David Anthoff	1 days	University of California, Berkeley	Berkeley, CA	link
jar1karp	12	Jari Karppinen	154 days	NA	NA	link
mikmart	12	Mikko Marttila	2 days	NA	NA	link
reikoch	8	NA	6 days	NA	NA
afeld	6	Aidan Feldman	0 days	@GSA and personal projects	Brooklyn, NY	link
erblast	6	Björn Oettinghaus	22 days	NA	Switzerland	link
lionel-	6	Lionel Henry	70 days	@rstudio	NA
bpfoley	5	Brian Foley	94 days	NA	Seattle, Washington
rebecca-albrecht	4	NA	5 days	NA	NA
dazim	3	Tim Treis	23 days	NA	Heidelberg, Germany
heinzmann537	3	NA	5 days	NA	NA
kentm4	3	Matt Kent	2 days	Genesis Research	NA
PaulJordan57	3	NA	19 days	NA	NA
galachad	2	Adam Foryś	20 days	@Roche	Warsaw, Poland	link
gerph	2	Charles Ferguson	8 days	NA	NA
hadley	2	Hadley Wickham	0 days	@rstudio	Houston, TX	link
kawap	2	NA	289 days	Roche / 7N	NA
kleschenko	2	Kostya Leschenko	5 days	@datarobot	Lviv, Ukraine
kshedden	2	Kerby Shedden	1 days	NA	NA
kurt-vd	2	Kurt Van Dijck	63 days	NA	NA
mrocklin	2	Matthew Rocklin	2 days	@coiled	San Juan Capistrano, CA	link
thomas-neitmann	2	Thomas Neitmann	1 days	Roche	Basel, Switzerland	link
waddella	2	Adrian Waddell	27 days	NA	NA	link
ararslan	1	Alex Arslan	0 days	Beacon Biosignals	Seattle, WA
ginberg	1	NA	14 days	NA	Remote	link
ivarref	1	Ivar Refsdal	13 days	NA	Bergen, Norway
jonathon-love	1	Jonathon Love	1 days	NA	NA	link
Karissa	1	NA	363 days	NA	NA
thanos-siadimas	1	NA	1 days	NA	NA

Files

Pull a specific file using gh_file_get().

desc_formatted <- gh_file_get(
  repo = "GithubMetrics",
  org = "OpenPharma",
  file = "DESCRIPTION"
) %>%
  # format the description
  desc::desc(text = .)

# Print it
desc_formatted$get(c("Package","Title","Version")) %>%
  tibble::enframe() %>%
  knitr::kable()

name	value
Package	GithubMetrics
Title	Quickly get key metrics on Github repositaries
Version	0.1.0

Get all of the files present in the last commit of all the repos using gh_repo_files_get().

repo_files <- gh_repo_files_get(
  repo_commits = repo_all_commits,
  only_last_commit = TRUE
)
#> Pulling files in latest commit from 13 repos

glimpse(repo_files)
#> Rows: 1,311
#> Columns: 6
#> $ repo       <chr> "openpharma/visR-docs", "openpharma/visR-docs", "openpharm…
#> $ file       <chr> "readme.md", "docs", "docs/404.html", "docs/code_of_conduc…
#> $ sha_repo   <chr> "5b35fdbc39b87a154c9426e363c8f5a2c83d66b0", "5b35fdbc39b87…
#> $ sha_commit <chr> "642856728e165746076a17c6522b9264f693f37d", "642856728e165…
#> $ extension  <chr> "md", "docs", "html", "html", "html", "html", "png", "png"…
#> $ lang       <chr> "Markdown", NA, "HTML", "HTML", "HTML", "HTML", NA, NA, NA…

repo_files %>%
  group_by(repo) %>%
  summarise(
    Files = n(),
    `R files` = sum(lang %in% "R"),
    `Python files` = sum(lang %in% c("Python","Jupyter Notebook"))
  ) %>% knitr::kable(
    caption = "Types of files in the organisation"
  )

repo	Files	R files	Python files
openpharma/BBS-causality-training	4	2	0
openpharma/CTP	100	30	0
openpharma/facetsr	63	13	0
openpharma/GithubMetrics	43	22	0
openpharma/openpharma.github.io	76	1	0
openpharma/pypharma_nlp	131	0	49
openpharma/RDO	105	11	0
openpharma/ReadStat	207	0	0
openpharma/sas7bdat	8	0	2
openpharma/simaerep	145	32	0
openpharma/syntrial	67	24	0
openpharma/visR	177	81	0
openpharma/visR-docs	185	0	0

Types of files in the organisation

results <- gh_repo_search(
  code = "tidyverse",
  organisation = organisation
)

glimpse(results)
#> Rows: 12
#> Columns: 7
#> $ full_name <chr> "openpharma/GithubMetrics", "openpharma/GithubMetrics", "op…
#> $ name      <chr> "GithubMetrics", "GithubMetrics", "GithubMetrics", "GithubM…
#> $ file_name <chr> "README.md", "README.Rmd", "DESCRIPTION", "test-gh_repos_XX…
#> $ path      <chr> "README.md", "README.Rmd", "DESCRIPTION", "tests/testthat/t…
#> $ url       <chr> "https://github.com/openpharma/GithubMetrics/blob/fa7764869…
#> $ score     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
#> $ lang      <chr> "Markdown", "R", NA, "R", "Markdown", "R", "Markdown", "R",…

helper_gh_repo_search <- function(x, org = "openpharma"){
  
  ## Slow it down! as search has 30 calls a minute rate limit.
  ## If you prem the search rate limit is higher, so usually not needed
  if(interactive()){message("Wait 5 seconds")}
  Sys.sleep(5)
  ## End slow down
  
  
   results <- gh_repo_search(
      code = x,
      organisation = org
    ) 
   
  if(is.na(results)) {
    results <- return()
  }
  results %>% 
    mutate(Package = x, Organisation = org) %>%
    group_by(Organisation,Package) %>%
    summarise(
      Repos = n_distinct(full_name), .groups = "drop"
    )
}

packages <- c(
  "tidyverse","pkgdown","dplyr","data.table"
  )

package_use <- bind_rows(
  packages %>%
    map_df(
      helper_gh_repo_search, org = "PHCAnalytics"
    ),
  packages %>%
    map_df(
      helper_gh_repo_search, org = "openpharma"
    ),
  packages %>%
    map_df(
      helper_gh_repo_search, org = "AstraZeneca"
    ),
  packages %>%
    map_df(
      helper_gh_repo_search, org = "Roche"
    ),
  packages %>%
    map_df(
      helper_gh_repo_search, org = "Genentech"
    ),
  packages %>%
    map_df(
      helper_gh_repo_search, org = "Novartis"
    )
)
#> pkgdown does not appear in PHCAnalytics.
#> query = 'pkgdown in:file  user:PHCAnalytics'
#> tidyverse does not appear in AstraZeneca.
#> query = 'tidyverse in:file  user:AstraZeneca'
#> pkgdown does not appear in AstraZeneca.
#> query = 'pkgdown in:file  user:AstraZeneca'
#> data.table does not appear in AstraZeneca.
#> query = 'data.table in:file  user:AstraZeneca'


package_use %>%
  pivot_wider(names_from = "Package", values_from = "Repos") %>%
  mutate(Total = rowSums(.[,-1], na.rm = TRUE)) %>%
  arrange(-Total) %>%
  knitr::kable(
    caption = "Package use detected within repositaries in Pharma orgs"
  )

Organisation	tidyverse	dplyr	data.table	pkgdown	Total
Novartis	4	10	12	6	32
openpharma	4	6	2	6	18
Roche	3	2	3	3	11
Genentech	3	3	3	2	11
PHCAnalytics	2	4	4	NA	10
AstraZeneca	NA	1	NA	NA	1

Package use detected within repositaries in Pharma orgs

Name		Name	Last commit message	Last commit date
Latest commit History 73 Commits
.github		.github
R		R
data-raw		data-raw
data		data
man		man
tests		tests
vignettes		vignettes
.Rbuildignore		.Rbuildignore
.gitattributes		.gitattributes
.gitignore		.gitignore
CRAN-RELEASE		CRAN-RELEASE
DESCRIPTION		DESCRIPTION
GithubMetrics.Rproj		GithubMetrics.Rproj
LICENSE		LICENSE
LICENSE.md		LICENSE.md
Makefile		Makefile
NAMESPACE		NAMESPACE
NEWS.md		NEWS.md
README.Rmd		README.Rmd
README.md		README.md
_pkgdown.yml		_pkgdown.yml
codecov.yml		codecov.yml
cran-comments.md		cran-comments.md
renv.lock		renv.lock

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Licenses found

Repository files navigation

GithubMetrics

Installation

Setup

Repos in an org

Commits

People

Files

About

Licenses found

Releases 1

Contributors 2

Languages

License

Licenses found

openpharma/GithubMetrics

Folders and files

Latest commit

History

Repository files navigation

GithubMetrics

Installation

Setup

Repos in an org

Commits

People

Files

About

Topics

Resources

License

Licenses found

Stars

Watchers

Forks

Releases 1

Contributors 2

Languages