index.qmd

---
title: "`r yaml::read_yaml('settings.yml')$view` Task View analysis"
logo: images/logo_white.svg
date: last-modified
format: 
  dashboard:
    include-in-header: google-analytics.html
    orientation: columns
    nav-buttons: github
theme: 
  - cosmo
  - custom.scss
---

```{r}
# Hide the source code of every chunk in the rendered dashboard.
knitr::opts_chunk$set(echo = FALSE)
```

```{r, message = FALSE}
library(ctv)
library(pkgsearch)
library(dplyr)
library(purrr)
library(ggplot2)
library(yaml)
library(tidyr)
library(lubridate)
library(askpass)
library(stringr)
library(stringi)
library(glue)
library(jsonlite)
library(scales)
library(gt)
library(report)
# Make a minimal theme with an 18pt base font the default for all plots.
default_plot_theme <- theme_minimal(base_size = 18)
theme_set(default_plot_theme)
```

# Analysis

```{r}
# Start from an empty package list; the conditional chunks below prepend the
# packages they find to it.
pkgs <- NULL
```

```{r, eval=!is.null(yaml::read_yaml('settings.yml')$view)}
# Download the Markdown source of the configured task view from the
# cran-task-views GitHub organisation and prepend the packages it lists to
# `pkgs`.
view_name <- yaml::read_yaml("settings.yml")$view
ctv_md_path <- tempfile(fileext = ".md")
# Default branch name can change between repos so we check first: the
# contents endpoint returns the download URL of the file.
ctv_file_address <- gh::gh(
  "/repos/{owner}/{repo}/contents/{path}",
  owner = "cran-task-views",
  repo = view_name,
  path = paste0(view_name, ".md")
)
download.file(ctv_file_address$download_url, ctv_md_path)
view_pkgs <- purrr::pluck(ctv::read.ctv(ctv_md_path), "packagelist", "name")
pkgs <- c(view_pkgs, pkgs)
```

```{r, eval=!is.null(yaml::read_yaml('settings.yml')$packages)}
# Prepend the packages listed explicitly in settings.yml to `pkgs`.
extra_pkgs <- yaml::read_yaml("settings.yml")$packages
pkgs <- c(extra_pkgs, pkgs)
```

```{r}
# Drop duplicated package names, then stop knitting early when no package is
# left to analyse.
pkgs <- unique(pkgs)
no_packages <- length(pkgs) == 0
if (no_packages) {
  knitr::knit_exit("No packages found")
}
```

```{r, cache = TRUE}
# Fetch the CRAN release history of every package and stack the results into
# a single table with one row per returned history entry.
release_histories <- purrr::map(
  pkgs,
  \(pkg) pkgsearch::cran_package_history(pkg)
)
pkg_versions <- dplyr::mutate(
  dplyr::bind_rows(release_histories),
  # Parse the release timestamp into a date-time
  date = lubridate::ymd_hms(date),
  # Keep only the part of the `Maintainer` field before the first "<"
  maintainer = trimws(stringr::str_extract(Maintainer, "^[^<]+"))
)
```

```{r}
# Keep, for each package, the row(s) with the most recent release date.
ctv_pkg_descriptions <- dplyr::filter(
  pkg_versions,
  date == max(date),
  .by = Package
)
```

```{r eval=identical(yaml::read_yaml('settings.yml')$view, "Epidemiology")}
# Read the source repository table published in the Epidemiology task view
# repository.
epi_repos_url <- "https://raw.githubusercontent.com/cran-task-views/Epidemiology/main/data/source_repositories.csv"
gh_repos <- read.csv(epi_repos_url)
```

```{r eval=!identical(yaml::read_yaml('settings.yml')$view, "Epidemiology")}
# Derive an "owner/repo" GitHub identifier for each package from its
# DESCRIPTION: first from the `URL` field, falling back to an issue tracker
# link in `BugReports`. Packages without a match get NA.
url_repo_pattern <- "https://github.com/([^/]+/[^/,\\s]+)"
issues_repo_pattern <- "https://github.com/([^/]+/[^/,\\s]+)/issues"

gh_repos <- dplyr::transmute(
  ctv_pkg_descriptions,
  package = Package,
  github_repo = dplyr::coalesce(
    # Column 2 of the match matrix is the first capture group
    stringr::str_match(URL, url_repo_pattern)[, 2],
    stringr::str_match(BugReports, issues_repo_pattern)[, 2]
  )
)
```

```{r}
# Normalise a character vector of author names: transliterate to ASCII,
# replace punctuation with spaces, collapse runs of whitespace, drop the
# standalone words "User" and "Developer", and trim the result.
clean_author_names <- function(x) {
  x <- stringi::stri_trans_general(x, "ASCII")
  x <- stringr::str_replace_all(x, "[[:punct:]]", " ")
  x <- stringr::str_replace_all(x, "[[:space:]]+", " ")
  x <- stringr::str_remove_all(x, "\\b(User|Developer)\\b")
  trimws(x)
}

# One row per (release, author) with cleaned author and maintainer names.
authors <- pkg_versions |>
  dplyr::mutate(
    package = Package,
    # Authors parsed from `Authors@R`, formatted as "given family"
    parsed_authors_r = purrr::map(
      authoritative::parse_authors_r(`Authors@R`),
      \(persons) format(persons, include = c("given", "family"))
    ),
    # Authors parsed from the free-text `Author` field
    parsed_authors = authoritative::parse_authors(Author),
    .keep = "unused"
  ) |>
  dplyr::mutate(
    # Prefer the names parsed from `Author`, falling back to `Authors@R`
    auts = purrr::map(
      dplyr::coalesce(parsed_authors, parsed_authors_r),
      clean_author_names
    ),
    maintainer = trimws(
      stringr::str_remove_all(
        stringi::stri_trans_general(maintainer, "ASCII"),
        "[[:punct:]]"
      )
    ),
    .keep = "unused"
  ) |>
  tidyr::unnest_longer(auts) |>
  dplyr::mutate(
    # NOTE(review): presumably expands abbreviated names to the fullest
    # matching form found in the same vector - confirm in authoritative docs
    auts = authoritative::expand_names(auts, auts),
    # Standardize capitalization
    auts = stringr::str_to_title(auts),
    # Manual corrections for names the automated cleaning gets wrong
    auts = dplyr::case_when(
      auts == "Lozano Jose E" ~ "Jose Lozano Alonso",
      auts == "Kevis Weiss" ~ "Kevin Weiss",
      auts == "Ahmad family = Rabiee" ~ "Ahmad Rabiee",
      auts == "Ed Ionides" ~ "Edward Ionides",
      .default = auts
    )
  )
```

## Dependencies and popularity {width=33%}

### Row {height=20%}

#### Col

```{r}
#| content: valuebox
#| title: "Packages"
# Value box content: the number of rows of `ctv_pkg_descriptions`.
n_packages <- nrow(ctv_pkg_descriptions)
list(
  color = "#10BED2",
  value = n_packages
)
```

#### Col

```{r}
#| content: valuebox
#| title: "Package authors"
# Value box content: the number of distinct cleaned author names.
n_authors <- n_distinct(authors$auts)
list(
  color = "#AEC800",
  value = n_authors
)
```

### Row {height=20%}

```{r github_stars, cache = TRUE}
# Number of GitHub stars for every package with a known repository.
#
# The count is read from the `stargazers_count` field of the repository
# endpoint. Listing `/stargazers` and taking `length()` of the response only
# counts the first page of results, because that endpoint is paginated, so
# popular repositories would all be capped at the page size.
stars <- gh_repos |>
  # Drop rows with missing values, i.e. packages without a GitHub repository
  na.omit() |>
  dplyr::mutate(
    stars = purrr::map_int(
      github_repo,
      \(repo) as.integer(gh::gh("/repos/{repo}", repo = repo)$stargazers_count)
    )
  )
```

```{r}
#| content: valuebox
#| title: "GitHub stars"
# Value box content: total stars across repositories, formatted with a comma
# as thousands separator.
format_count <- scales::label_number(big.mark = ",")
total_stars <- sum(stars$stars)
list(
  color = "#EBE6E0",
  value = format_count(total_stars)
)
```

```{r cran-downloads, cache = TRUE}
# Query the cranlogs API for the daily downloads of all packages, from
# 2012-05-13 up to yesterday, and unnest them into one row per package-day.
pkgs_string <- glue::glue_collapse(pkgs, sep = ",")
yesterday <- Sys.Date() - 1

cranlogs_url <- glue::glue(
  "https://cranlogs.r-pkg.org/downloads/daily/2012-05-13:{yesterday}/{pkgs_string}"
)
cranlogs_response <- jsonlite::fromJSON(cranlogs_url)

ctv_cran_stats <- tidyr::unnest(
  dplyr::select(cranlogs_response, -start, -end),
  downloads
)
```

```{r}
#| content: valuebox
#| title: "CRAN downloads"
# Value box content: total downloads, abbreviated with a short-scale suffix.
format_short <- scales::label_number(scale_cut = scales::cut_short_scale())
total_downloads <- sum(ctv_cran_stats$downloads)
list(
  color = "#DEFF00",
  value = format_short(total_downloads)
)
```

### Row {.flow .tabset}

```{r}
# Count, for every release, its soft (Suggests) and hard (Depends, Imports,
# LinkingTo) dependencies, ignoring base packages, in long format with one
# row per release and dependency type.
base_pkgs <- tools:::.get_standard_package_names()$base
hard_dep_types <- c("Depends", "Imports", "LinkingTo")

dep_count <- pkg_versions |>
  mutate(
    nonbase_dependencies = purrr::map(
      dependencies,
      \(deps) deps[!deps$package %in% base_pkgs, ]
    ),
    Soft = map_int(
      nonbase_dependencies,
      \(deps) sum(deps$type == "Suggests")
    ),
    # A package listed under several hard types is counted once
    Hard = map_int(
      nonbase_dependencies,
      \(deps) n_distinct(deps$package[deps$type %in% hard_dep_types])
    )
  ) |>
  tidyr::pivot_longer(
    cols = c(Soft, Hard),
    names_to = "Dependency type",
    values_to = "count"
  )
```

```{r}
# Base plot shared by the two tabs below: faint points of dependency counts
# over time, plus a dashed reference line at 20 annotated as the CRAN limit
# on hard dependencies.
cran_limit_label <- "Maximum number of hard dependencies for CRAN"
cran_limit_colour <- "#F04A4C"
# Leave some headroom above the largest count
y_upper <- max(dep_count$count) + 2

dep_plot <- ggplot(
  dep_count,
  aes(x = date, y = count, color = `Dependency type`, fill = `Dependency type`)
) +
  geom_point(alpha = 0.07) +
  geom_hline(yintercept = 20, linetype = "dashed", color = cran_limit_colour) +
  annotate(
    "text",
    x = as.POSIXct("2006-06-01"),
    y = 23,
    label = cran_limit_label,
    hjust = 0.5,
    vjust = 1,
    color = cran_limit_colour
  ) +
  labs(
    title = "Number of hard and soft dependencies over time",
    subtitle = "(excluding base packages)",
    x = "Date",
    y = "Number of dependencies"
  ) +
  theme(legend.position = "bottom") +
  lims(y = c(0, y_upper))
```

```{r}
#| title: Data points
# Tab 1: overlay fully opaque points, coloured by dependency type.
dep_palette <- c(Soft = "#10BED2", Hard = "#AEC800")
dep_plot +
  geom_point() +
  scale_colour_manual(values = dep_palette)
```

```{r}
#| title: Trend line
# Tab 2: add a smoothed trend per dependency type. The colour legend is
# hidden so that only the fill legend is shown.
dep_palette <- c(Soft = "#10BED2", Hard = "#AEC800")
dep_plot +
  geom_smooth() +
  scale_colour_manual(values = dep_palette, guide = "none") +
  scale_fill_manual(values = dep_palette)
```


## Best practices {width=33%}

```{r}
# Good-practice checks that can be answered from the DESCRIPTION of the
# latest release of each package. One row per package, one logical column
# per check.

# NOTE(review): `[^./]` has no quantifier, so the optional subdomain group
# only matches single-character subdomains (e.g. not "www.") - confirm
# whether `[^./]+` was intended.
forge_pattern <- "\\bhttps?://([^./]\\.)?(github|gitlab|gitea|codeberg)\\."
testing_pkgs <- c("testthat", "testit", "unitizer", "RUnit", "tinytest")
deprecated_pkgs <- c("RUnit", "XML", "RCurl", "plyr", "reshape2")

# TRUE when the dependency table `deps` lists any of `candidates`
depends_on_any <- function(deps, candidates) {
  any(deps$package %in% candidates)
}

desc_checks <- ctv_pkg_descriptions |>
  transmute(
    package = Package,
    uses_forge = grepl(forge_pattern, URL) | grepl(forge_pattern, BugReports),
    uses_roxygen = !is.na(RoxygenNote),
    has_knitr_vignette = grepl("knitr", VignetteBuilder),
    uses_testing = purrr::map_lgl(
      dependencies,
      \(deps) depends_on_any(deps, testing_pkgs)
    ),
    has_no_deprecated = !purrr::map_lgl(
      dependencies,
      \(deps) depends_on_any(deps, deprecated_pkgs)
    ),
    uses_authors_r = !is.na(`Authors@R`),
    uses_orcid = grepl("<https://orcid.org/", Author, fixed = TRUE)
  )
```

```{r gh-repo-contents, cache = TRUE}
# List every path in the HEAD tree of each package's GitHub repository.
# `contents` is a list column holding one character vector of paths per repo.
list_repo_paths <- function(repo) {
  head_tree <- gh::gh(
    "/repos/{repo}/git/trees/HEAD?recursive=1",
    repo = repo
  )$tree
  purrr::map_chr(head_tree, "path")
}

gh_contents <- gh_repos |>
  # Drop rows with missing values, i.e. packages without a GitHub repository
  na.omit() |>
  mutate(contents = purrr::map(github_repo, list_repo_paths))
```

```{r gh-checks}
# Good-practice checks answered from the list of paths in each repository.
# One row per package, one logical column per check.

# Vectorised over repositories: TRUE when a repository's paths include at
# least one of `candidates`
has_any_path <- function(contents, candidates) {
  purrr::map_lgl(contents, \(paths) any(candidates %in% paths))
}

gh_checks <- gh_contents |>
  transmute(
    package = package,
    has_rmd_readme = has_any_path(contents, "README.Rmd"),
    has_md_license = has_any_path(contents, "LICENSE.md"),
    uses_pkgdown = has_any_path(
      contents,
      c("_pkgdown.yml", "_pkgdown.yaml", "pkgdown/pkgdown.yml")
    ),
    uses_gha = has_any_path(contents, ".github/workflows"),
    has_news = has_any_path(contents, "NEWS.md"),
    has_contributing = has_any_path(
      contents,
      c("CONTRIBUTING.md", ".github/CONTRIBUTING.md")
    ),
    has_coc = has_any_path(
      contents,
      c("CODE_OF_CONDUCT.md", ".github/CODE_OF_CONDUCT.md")
    )
  )
```

```{r}
# Full outer join of the repository-based and DESCRIPTION-based checks, so
# that packages present in only one table are kept (with NA for the checks
# of the other table).
all_checks <- merge(x = gh_checks, y = desc_checks, by = "package", all = TRUE)
# Number of packages, used as denominator for the pass rates
n_pkgs <- nrow(all_checks)
```

```{r}
# Lookup table describing each good-practice check:
# - check: name of the check column in `desc_checks` / `gh_checks`
# - description: short label
# - details: optional longer explanation (may be empty)
# - category: group name
check_descriptions <- tribble(
  ~check, ~description, ~details, ~category,
  "uses_forge", "Uses a software forge", "Links to a software forge (Codeberg, Gitea, GitHub, GitLab) in `URL` or `BugReports`", "Metadata",
  "uses_roxygen", "Uses Roxygen", "As indicated by the presence of the `RoxygenNote` field in `DESCRIPTION`", "Documentation",
  "has_knitr_vignette", "Has a knitr vignette", "", "Documentation",
  "uses_testing", "Uses a testing framework", "As indicated by the presence of testthat, testit, unitizer, RUnit, tinytest in `Suggests`", "Testing & CI",
  "has_no_deprecated", "Does not depend on deprecated packages", "XML, RCurl, RUnit, plyr, or reshape2 packages", "Sustainability",
  "uses_authors_r", "Has Authors@R field", "", "Metadata",
  "uses_orcid", "Has ORCID in Author field", "", "Metadata",
  "has_rmd_readme", "Has README.Rmd", "", "Documentation",
  "has_md_license", "Has LICENSE.md", "", "Metadata",
  "uses_pkgdown", "Uses pkgdown", "", "Documentation",
  "uses_gha", "Uses GitHub Actions", "", "Testing & CI",
  "has_news", "Has NEWS.md", "", "Documentation",
  "has_contributing", "Has a contributing guide", "", "Community",
  "has_coc", "Has a code of conduct", "", "Community"
)
```

```{r}
# Render the good-practice adherence table: one row per check, grouped by
# category, showing the percentage of packages that pass it.
all_checks |>
  # Transpose so that checks become rows and packages become columns
  tibble::column_to_rownames("package") |>
  t() |>
  as.data.frame() |>
  tibble::rownames_to_column("check") |>
  # Explicit join key: avoids relying on implicit key detection and keeps the
  # "Joining with `by = ...`" message out of the rendered output
  left_join(check_descriptions, by = "check") |>
  select(-check) |> 
  mutate(
    # Share of packages passing each check. Missing results (NA) count as
    # not passed, since they are skipped in the sum but kept in `n_pkgs`.
    passed = rowSums(pick(!c(description, details, category)), na.rm = TRUE) / n_pkgs * 100,
    # Drop the per-package columns once they have been summarised
    .keep = "unused"
  ) |>
  arrange(desc(passed)) |> 
  gt::gt(groupname_col = "category", rowname_col = "description") |> 
  # Show the details in small italics under the description
  gt::cols_merge(
    columns = c(description, details),
    pattern = "{1}<br><small><i>{2}</i></small>"
  ) |> 
  gt::cols_width(passed ~ gt::px(100)) |> 
  gt::fmt_number(
    columns = passed,
    decimals = 0,
    pattern = "{x}%"
  ) |> 
  gt::data_color(
    method = "numeric",
    palette = "viridis",
    domain = c(0, 100)
  ) |> 
  gt::tab_stub_indent(
    rows = gt::everything(),
    indent = 5
  ) |> 
  gt::tab_header(
    title = gt::md("Adherence to good practices"),
    subtitle = gt::md("Good practices as defined by [rOpenSci dev guide](https://devguide.ropensci.org/index.html), and [Epiverse-TRACE blueprints](https://epiverse-trace.github.io/blueprints/).")
  ) |> 
  gt::tab_options(
    column_labels.hidden = TRUE,
    row_group.font.weight = "bold",
    table.font.size = "80%"
  )
```

## Authorship trends and sustainability {width=33%} 

### Row {.flow}

```{r}
# Histogram of the year in which each person first appears as a maintainer
# and/or as an author of one of the packages.

# First year each maintainer name appears on a release
maintainers_by_year <- pkg_versions |> 
  group_by(name = maintainer) |> 
  summarise(
    maintainer_since = lubridate::year(min(date))
  )

# First year each author name appears on a release
authors_by_year <- authors |>
  group_by(name = auts) |> 
  summarise(
    author_since = lubridate::year(min(date))
  )

full_join(maintainers_by_year, authors_by_year, by = "name") |>
  tidyr::pivot_longer(
    cols = c(maintainer_since, author_since),
    names_to = "role",
    values_to = "year",
    # People holding only one of the two roles have a missing year for the
    # other; drop those rows here rather than letting the histogram discard
    # them with a warning
    values_drop_na = TRUE
  ) |> 
  dplyr::mutate(
    # "maintainer_since" -> "Maintainer", "author_since" -> "Author"
    role = tools::toTitleCase(gsub("_since$", "", role))
  ) |> 
  ggplot(aes(x = year, fill = role)) +
    # One bar per calendar year
    geom_histogram(binwidth = 1) +
    labs(
      title = "Recruitment of new authors & maintainers",
      x = "Date",
      y = "Number of new authors/maintainers",
      fill = ""
    ) +
    theme(legend.position = "bottom") +
    scale_fill_manual(values = c("Maintainer" = "#AEC800", "Author" = "#106BA0"))
```

### Row  {.flow}

```{r}
# NOTE(review): the sourced script presumably defines `archive_df`, which is
# used below but not created in this document - confirm.
source("_scripts/parse_archive.R")

# Number of archival events per year for the analysed packages, stored as
# negative counts.
ctv_archivals <- filter(archive_df, package %in% pkgs)
yearly_archivals <- ctv_archivals |>
  dplyr::mutate(
    year = lubridate::year(lubridate::ymd(archivals)),
    .keep = "unused"
  ) |>
  count(year, name = "archivals") |>
  dplyr::mutate(archivals = -archivals)
```

```{r}
# Stacked bar chart of yearly events: first releases, updates, maintainer
# transitions, and archivals (joined in from `yearly_archivals`).
pkg_versions |> 
  # Make sure releases are in chronological order within each package so
  # that `lag()` compares each release with the previous one
  dplyr::arrange(Package, date) |> 
  dplyr::mutate(
    # The first release is the earliest one. Comparing `Version` strings
    # would sort them lexicographically (e.g. "0.10.0" before "0.9.0").
    first_release = date == min(date), 
    # NA for the first release of each package, which has no predecessor
    maintainer_change = maintainer != dplyr::lag(maintainer),
    .by = Package
  ) |>
  dplyr::group_by(year = lubridate::year(date)) |> 
  summarise(
    updates = sum(!first_release),
    `first releases` = sum(first_release),
    # Skip the NA of first releases; otherwise every year containing a first
    # release would get an NA count
    `maintainer transitions` = sum(maintainer_change, na.rm = TRUE),
    .groups = "drop"
  ) |> 
  dplyr::full_join(yearly_archivals, by = "year") |> 
  tidyr::pivot_longer(
    cols = c(updates, `first releases`, `maintainer transitions`, archivals),
    names_to = "release_type",
    values_to = "yearly_releases",
    # Years present in only one side of the join have no value for the other
    values_drop_na = TRUE
  ) |>
  ggplot(aes(x = year, y = yearly_releases, fill = release_type)) +
    geom_col() +
    labs(
      title = "New releases, updates and archivals",
      x = "Date", y = "Number of events",
      fill = ""
    ) +
    theme(legend.position = "bottom") +
    scale_fill_brewer(palette = "Set1")
```

# About

::: {.card title="References"}

```{r, results='asis'}
# Print a short textual summary of the R session used for the analysis.
session_report <- report::report(sessionInfo())
summary(session_report)
```

<details><summary>Citations</summary>

```{r, results='asis'}
# Print the citations produced by report::cite_packages().
package_citations <- report::cite_packages()
package_citations
```

</details>

#### Related work

- [Lluís Revilla Sancho's blog posts about the CRAN ecosystem](https://llrs.dev/categories/cran/)
- ["Six degrees of Hadley Wickham: The CRAN co-authorship network", by David Schoch](https://blog.schochastics.net/posts/2024-01-17_six-degrees-of-hadley-wickham/)
- ["Historical Trends in R Package Structure and Interdependency on CRAN", by Mark Padgham and Noam Ross](https://mpadge.github.io/pkgstats-analyses/articles/pkgstats.html)
- ["More than 25 years of CRAN" keynote at useR!2024 by Kurt Hornik](https://raw.githubusercontent.com/gorinsimon/user2024-talks-files/main/attached_files/3_user2024_CRAN.pdf)

:::

::: {.card title="Authors and acknowledgements"}


This analysis was started by Hugo Gruson, before being picked up in an Epiverse hackathon at the WHO Collaboratory in Berlin in September 2023 by James Baker, Chathura Edirisuriya and Hugo Gruson. The final dashboard creation and automation were done by Hugo Gruson.

Additional contributions and reviews were provided by Chris Hartgerink.

#### Funding

Hugo Gruson was funded for this work by the Wellcome Trust 224140/Z/21/Z. 

:::