Merge pull request #382 from PGScatalog/integrate-utils-1.4.1

2.0.0 release
PGScatalog · Nov 1, 2024 · 205cbfd · 205cbfd
2 parents 9bd9c43 + b02f00b
commit 205cbfd
Show file tree

Hide file tree

Showing 27 changed files with 293 additions and 203 deletions.
diff --git a/CITATIONS.md b/CITATIONS.md
@@ -1,6 +1,6 @@
 # PGScatalog/pgsc_calc: Citations
 
-> Lambert, Wingfield _et al._ (2024) The Polygenic Score Catalog: new functionality and tools to enable FAIR research. medRxiv. doi:[10.1101/2024.05.29.24307783](https://doi.org/10.1101/2024.05.29.24307783).
+> Lambert, Wingfield _et al._ (2024) Enhancing the Polygenic Score Catalog with tools for score calculation and ancestry normalization. Nature Genetics. doi:[10.1038/s41588-024-01937-x](https://doi.org/10.1038/s41588-024-01937-x).
   
 ## [nf-core](https://pubmed.ncbi.nlm.nih.gov/32055031/)
 
@@ -13,11 +13,11 @@
 ## Pipeline tools
 
 * [PGS Catalog API](https://pubmed.ncbi.nlm.nih.gov/33692568/)
-    > Lambert SA, Gil L, Jupp S, Ritchie SC, Xu Y, Buniello A, McMahon A, Abraham G, Chapman M, Parkinson H, Danesh J. The Polygenic Score Catalog as an open database for reproducibility and systematic evaluation. Nature Genetics. 2021 Apr;53(4):420-5. doi: 10.1038/s41588-021-00783-5. PubMed PMID: 33692568.
+    > Lambert, Wingfield _et al._ (2024) Enhancing the Polygenic Score Catalog with tools for score calculation and ancestry normalization. Nature Genetics. doi:[10.1038/s41588-024-01937-x](https://doi.org/10.1038/s41588-024-01937-x).
 
 * [pygscatalog](https://github.com/PGScatalog/pygscatalog)
 
-    > Lambert, Wingfield _et al._ (2024) The Polygenic Score Catalog: new functionality and tools to enable FAIR research. medRxiv. doi:[10.1101/2024.05.29.24307783](https://doi.org/10.1101/2024.05.29.24307783).
+    > Lambert, Wingfield _et al._ (2024) Enhancing the Polygenic Score Catalog with tools for score calculation and ancestry normalization. Nature Genetics. doi:[10.1038/s41588-024-01937-x](https://doi.org/10.1038/s41588-024-01937-x).
     
 * [PLINK 2](https://pubmed.ncbi.nlm.nih.gov/25722852/)
     > Chang CC, Chow CC, Tellier LC, Vattikuti S, Purcell SM, Lee JJ. Second-generation PLINK: rising to the challenge of larger and richer datasets. Gigascience. 2015 Dec 1;4(1):s13742-015. doi: 10.1186/s13742-015-0047-8. PubMed PMID: 25722852. PubMed Central PMCID: PMC4342193.

diff --git a/README.md b/README.md
@@ -104,9 +104,9 @@ from Aoife McMahon (EBI). Development of new features, testing, and code review
 is ongoing including Inouye lab members (Rodrigo Canovas, Scott Ritchie) and others. If 
 you use the tool we ask you to cite our paper describing software and updated PGS Catalog resource:
 
-- >Lambert, Wingfield _et al._ (2024) The Polygenic Score Catalog: new functionality
-  and tools to enable FAIR research.  medRxiv.
-  doi:[10.1101/2024.05.29.24307783](https://doi.org/10.1101/2024.05.29.24307783).
+- >Lambert, Wingfield _et al._ (2024) Enhancing the Polygenic Score Catalog with tools for score 
+  calculation and ancestry normalization. Nature Genetics.
+  doi:[10.1038/s41588-024-01937-x](https://doi.org/10.1038/s41588-024-01937-x).
 
 This pipeline is distrubuted under an [Apache License](LICENSE) amd uses code and 
 infrastructure developed and maintained by the [nf-core](https://nf-co.re) community 

diff --git a/assets/report/img/PGS_Logo.png → assets/report/PGS_Logo.png b/assets/report/img/PGS_Logo.png → assets/report/PGS_Logo.png
diff --git a/assets/report/logo.css b/assets/report/logo.css
@@ -1,5 +1,5 @@
 .quarto-title-block .quarto-title-banner {
-  background-image: url(img/PGS_Logo.png), url(img/pgs_header_background.png);
+  background-image: url(PGS_Logo.png), url(pgs_header_background.png);
   background-size: 90px, 170px;
   background-position: left, right;
   background-repeat: no-repeat;

diff --git a/assets/report/img/pgs_header_background.png → assets/report/pgs_header_background.png b/assets/report/img/pgs_header_background.png → assets/report/pgs_header_background.png
diff --git a/assets/report/report.qmd b/assets/report/report.qmd
@@ -36,6 +36,10 @@ library(DT)
 library(tibble)
 library(forcats)
 library(readr)
+
+# prevent plots with small sample sets
+MINIMUM_N_SAMPLES <- 50
+LOW_SAMPLE_SIZE <- TRUE
 ```
 
 ```{r setup_logs, echo=FALSE}
@@ -64,6 +68,14 @@ log_df$sampleset <- gsub("_", " ", log_df$sampleset)  # page breaking issues
 cat command.txt | fold -w 80 -s | awk -F ' ' 'NR==1 { print "$", $0} NR>1 { print "    " $0}' | sed 's/$/\\/' | sed '$ s/.$//' 
 ```
 
+```{asis, echo = grepl("-profile test", readLines("command.txt"))}
+:::{.callout-tip}
+* If you're using the test profile, this report and these results are not biologically meaningful 
+* The test profile is only used to check that all software is installed and working correctly 
+* If you're reading this message, then that means everything is OK and you're ready to use your own data!
+:::
+```
+
 ## Version
 
 ```{r, echo=FALSE}
@@ -76,7 +88,6 @@ message(params$version)
 
 ```{r load_scorefiles}
 json_list <- jsonlite::fromJSON(params$log_scorefiles, simplifyVector = FALSE)
-json_scorefiles <- unlist(json_list, recursive=FALSE)
 
 link_traits <- function(trait_efo, mapped) {
   if (length(trait_efo) == 0) {
@@ -87,12 +98,12 @@ link_traits <- function(trait_efo, mapped) {
 }
 
 extract_traits <- function(x) {
-  trait_efo <- purrr::map(json_scorefiles, ~ extract_chr_handle_null(.x, "trait_efo"))
-  mapped <- purrr::map(json_scorefiles, ~ extract_chr_handle_null(.x, "trait_mapped"))
+  trait_efo <- purrr::map(x, ~ extract_chr_handle_null(.x$header, "trait_efo"))
+  mapped <- purrr::map(x, ~ extract_chr_handle_null(.x$header, "trait_mapped"))
   trait_display <- purrr::map2(trait_efo, mapped, link_traits)
   mapped_trait_links <- purrr::map_chr(trait_display, ~ paste(.x, collapse = "<br />"))
-  reported_traits <- purrr::map(json_scorefiles, ~ extract_chr_handle_null(.x, "trait_reported"))
-  purrr::map2(reported_traits, mapped_trait_links, ~ {
+  reported_traits <- purrr::map(x, ~ extract_chr_handle_null(.x, "trait_reported"))
+  purrr::map2_chr(reported_traits, mapped_trait_links, ~ {
     stringr::str_glue("<u>Reported trait:</u> {.x} <br /> <u>Mapped trait(s):</u> {.y}")
   })
 }
@@ -121,21 +132,18 @@ annotate_genome_build <- function(original_build, harmonised_build) {
   return(stringr::str_glue("<u>Original build:</u> {original_build} <br /> <u>Harmonised build:</u> {harmonised_build}"))
 }
 
-tibble::tibble(json = json_scorefiles) %>%
-  # extract fields from json list
-  mutate(pgs_id = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "pgs_id")),
-         pgs_name = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "pgs_name")),
-         pgp_id = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "pgp_id")),
-         citation = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "citation")),
-         # trait_efo = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "trait_efo")),
-         # trait_reported = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "trait_reported")),
-         # trait_mapped = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "trait_mapped")),
-         trait_display = extract_traits(.),
-         genome_build = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "genome_build")),
-         harmonised_build = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "HmPOS_build")),
-         n_variants = purrr::map_chr(json, ~ .x$variants_number),
-         accession = stringr::str_replace_all(names(json), "_", " ")
-         ) %>%
+# extract fields from json list
+tibble(
+  pgs_id = map_chr(json_list, "pgs_id"),
+  pgs_name = map_chr(json_list, ~ extract_chr_handle_null(.x$header, "pgs_name")),
+  pgp_id = map_chr(json_list, ~ extract_chr_handle_null(.x$header, "pgp_id")),
+  citation = map_chr(json_list, ~ extract_chr_handle_null(.x$header, "citation")),
+  trait_display = extract_traits(json_list),
+  genome_build = purrr::map_chr(json_list, ~ extract_chr_handle_null(.x$header, "genome_build")),
+  harmonised_build = purrr::map_chr(json_list, ~ extract_chr_handle_null(.x$header, "HmPOS_build")),
+  n_variants = purrr::map_chr(json_list, ~ extract_chr_handle_null(.x$header, "variants_number")),
+  compatible_effect_type = map_lgl(json_list, "compatible_effect_type"),
+  has_complex_alleles = map_lgl(json_list, "has_complex_alleles")) %>%
   # add links to pgs catalog identifiers
   mutate(pgs_id = purrr::map_chr(pgs_id, ~ link_pgscatalog(.x, "score")),
          pgp_id = purrr::map_chr(pgp_id, ~ link_pgscatalog(.x, "publication"))) %>%
@@ -144,31 +152,58 @@ tibble::tibble(json = json_scorefiles) %>%
          pgs_id = purrr::map2_chr(pgs_id, pgs_name, ~ add_note(.x, .y)),
          genome_build = purrr::map2_chr(genome_build, harmonised_build, ~ annotate_genome_build(.x, .y))) %>% 
   # pick columns
-  select(accession, pgs_id, pgp_id, trait_display, n_variants, genome_build) -> scorefile_metadata
+  select(pgs_id, pgp_id, trait_display, n_variants, genome_build, has_complex_alleles, compatible_effect_type) -> scorefile_metadata
 ```
 
 :::{.column-body-outset}
 
 ```{r, echo=FALSE}
+tooltip_text <- c(
+  "Polygenic Score ID" = "Unique identifier for the polygenic score.",
+  "Publication" = "Reference publication for the score.",
+  "Traits" = "Traits associated with the score.",
+  "Number of variants" = "Total number of genetic variants (defined in the header)",
+  "Genome build" = "The genome assembly version used.",
+  "Complex alleles present?" = "Describes if complex non-SNP alleles included in the scoring file, e.g. APOE/HLA. These variants are excluded from the PGS calculation in the current version",
+  "Effect types compatible?" = "Describes if the scoring file is compatible with the Calculator. Scores with dosage-specific weights are removed."
+)
+
 DT::datatable(
   scorefile_metadata,
   rownames = FALSE,
   escape = FALSE,
-  colnames = c(
-    "Scoring file" = "accession",
-    "Polygenic Score ID" = "pgs_id",
-    "Publication" = "pgp_id",
-    "Traits" = "trait_display",
-    "Number of variants" = "n_variants",
-    "Genome build" = "genome_build"
+  colnames = setNames(
+    paste0('<span title="', tooltip_text, '">', names(tooltip_text), '</span>'), 
+    NULL
   ),
   extensions = 'Buttons',
   options = list(dom = 'Bfrtip',
                  buttons = c('csv'))
-)
+) %>%
+    DT::formatStyle('has_complex_alleles', 
+                  backgroundColor = DT::styleEqual(c(FALSE, TRUE), c('#a6dba0', '#c2a5cf'))) %>% 
+    DT::formatStyle('compatible_effect_type', 
+                  backgroundColor = DT::styleEqual(c(FALSE, TRUE), c('#c2a5cf', '#a6dba0')))
+
+```
+
+:::
+
+```{asis, echo = any(!scorefile_metadata$compatible_effect_type)}
+::: {.callout-warning title="Incompatible effect types detected"}
+* Some scoring files contain variants with dosage dependent effect weights (for example, [PGS002253](https://www.pgscatalog.org/score/PGS002253/))
+* Scores with dosage-specific weights are removed from calculation 
+* Scores that contain variants with recessive, dominant, or additive effect types are supported
+:::
 ```
 
+```{asis, echo = any(scorefile_metadata$has_complex_alleles)}
+::: {.callout-warning title="Complex alleles detected"}
+* Some scoring files contain complex alleles (e.g. APOE / HLA / CYP)
+* These variants are excluded from the PGS calculation in the current version
+* Please check [Appendix A - Curation of PGS including complex alleles](https://www.pgscatalog.org/docs/curation) for more detailed information
 :::
+```
 
 # Variant matching 
 
@@ -386,18 +421,23 @@ pop_summary %>%
 scores <- readr::read_tsv(params$score_path) 
 n_scores <- length(unique(scores$PGS))
 n_samples <- length(unique(scores$IID))
-print(n_samples)
+if (n_samples < MINIMUM_N_SAMPLES) {
+  LOW_SAMPLE_SIZE <- TRUE
+} else {
+  LOW_SAMPLE_SIZE <- FALSE
+}
 ```
 
-```{asis, echo = any(table(scores$sampleset) < 50) && !params$run_ancestry}
+
+```{asis, echo = (LOW_SAMPLE_SIZE && !params$run_ancestry)}
 
 ::: {.callout-important title="Warning: small sampleset size (n < 50) detected"}
 * plink2 uses allele frequency data to [mean impute](https://www.cog-genomics.org/plink/2.0/score) the dosages of missing genotypes
 * Currently `pgsc_calc` disables mean-imputation in these small sample sets to make sure that the calculated PGS is as consistent with the genotype data as possible
 * With a small sample size, the resulting score sums may be inconsistent between samples
 * The average `([scorename]_AVG)` may be more applicable as it calculates an average weighting over all genotypes present
 
-In the future mean-imputation will be supported in small samplesets using ancestry-matched reference samplesets to ensure consistent calculation of score sums (e.g. 1000G Genomes).
+It's recommended to use `--run_ancestry` with small samplesets to ensure consistent calculation of score sums (e.g. 1000G Genomes).
 :::
 
 ```
@@ -419,24 +459,21 @@ In the future mean-imputation will be supported in small samplesets using ancest
 
 ### Score data 
 
-#### Score extract
+#### Density plot(s)
 
+```{asis, echo = !LOW_SAMPLE_SIZE}
 ::: {.callout-note}
-Below is a summary of the aggregated scores, which might be useful for debugging. See here for an explanation of [plink2](https://www.cog-genomics.org/plink/2.0/formats#sscore) column names
+The summary density plots show up to six scoring files
 :::
-
-```{r, echo = FALSE}
-scores %>%
-  tibble::as_tibble(.)
 ```
 
-#### Density plot(s)
-
-::: {.callout-note}
-The summary density plots show up to six scoring files
+```{asis, echo = LOW_SAMPLE_SIZE}
+::: {.callout-warning}
+Density plots are disabled for low sample sizes
 :::
+```
 
-```{r density_ancestry, echo=FALSE, message=FALSE, warning=FALSE, eval=params$run_ancestry}
+```{r density_ancestry, echo=FALSE, message=FALSE, warning=FALSE, eval=(!LOW_SAMPLE_SIZE & params$run_ancestry)}
 # Select which PGS to plot
 uscores <- unique(scores$PGS)
 uscores_plot <- uscores[1:min(length(uscores), 6)] # plot max 6 PGS
@@ -454,7 +491,7 @@ for(current_pgs in uscores_plot){
 }
 ```
 
-```{r, echo = FALSE, message=FALSE, warning=FALSE, eval=!params$run_ancestry}
+```{r, echo = FALSE, message=FALSE, warning=FALSE, eval=(!LOW_SAMPLE_SIZE & !params$run_ancestry)}
 scores %>%
   ungroup() %>%
   select(IID, sampleset, PGS, SUM) %>%
@@ -488,7 +525,9 @@ stringr::str_glue("{params$sampleset}/score/aggregated_scores.txt.gz")
 
 # Citation
 
-> Lambert, Wingfield, et al. (2024) The Polygenic Score Catalog: new functionality and tools to enable FAIR research. medRxiv. doi:[10.1101/2024.05.29.24307783](https://doi.org/10.1101/2024.05.29.24307783).
+> Samuel A. Lambert, Benjamin Wingfield, Joel T. Gibson, Laurent Gil, Santhi Ramachandran, Florent Yvon, Shirin Saverimuttu, Emily Tinsley, Elizabeth Lewis, Scott C. Ritchie, Jingqin Wu, Rodrigo Canovas, Aoife McMahon, Laura W. Harris, Helen Parkinson, Michael Inouye.
+Enhancing the Polygenic Score Catalog with tools for score calculation and ancestry normalization.
+Nature Genetics (2024) | doi: [10.1038/s41588-024-01937-x](https://doi.org/10.1038/s41588-024-01937-x)
 
 ::: {.callout-important}
 For scores from the PGS Catalog, please remember to cite the original publications from which they came (these are listed in the metadata table).
@@ -507,13 +546,12 @@ For scores from the PGS Catalog, please remember to cite the original publicatio
 # as of 2023-12-12 only non-default licenses are recorded in the scoring file header
 default_ebi_terms <- "PGS obtained from the Catalog should be cited appropriately, and used in accordance with any licensing restrictions set by the authors. See EBI Terms of Use (https://www.ebi.ac.uk/about/terms-of-use/) for additional details."
 
-tibble::tibble(json = json_scorefiles) %>%
-  mutate(pgs_id = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "pgs_id")),
-         license_text = purrr::map_chr(json, ~ extract_chr_handle_null(.x, "license"))) %>%
+tibble(
+    pgs_id = map_chr(json_list, "pgs_id"),
+    license_text = map_chr(json_list, ~ extract_chr_handle_null(.x$header, "license"))) %>%
   mutate(license_text = ifelse(license_text == "", default_ebi_terms, license_text)) %>%
   # display license terms for files in the PGS Catalog only (with a PGS ID)
   filter(startsWith(pgs_id, "PGS")) %>%
-  select(-json) %>%
   DT::datatable(., colnames = c(
       "PGS ID" = "pgs_id",
       "License text" = "license_text"

diff --git a/conf/modules.config b/conf/modules.config
@@ -38,8 +38,8 @@ process {
         ext.conda = "$projectDir/environments/pgscatalog_utils/environment.yml"
         ext.docker = 'ghcr.io/pgscatalog/pygscatalog'
         ext.singularity = 'oras://ghcr.io/pgscatalog/pygscatalog'
-        ext.docker_version = ':pgscatalog-utils-1.3.1'
-        ext.singularity_version = ':pgscatalog-utils-1.3.1-singularity'
+        ext.docker_version = ':pgscatalog-utils-1.4.4'
+        ext.singularity_version = ':pgscatalog-utils-1.4.4-singularity'
     }
 
     withLabel: plink2 {

diff --git a/docs/explanation/match.rst b/docs/explanation/match.rst
@@ -22,7 +22,7 @@ What is matching?
 
 The calculator carefully checks that variants (rows) in a scoring file are present in your target genomes.
 
-The matching procedure `is described in the preprint supplement <https://www.medrxiv.org/content/10.1101/2024.05.29.24307783v1.supplementary-material>`_. 
+The matching procedure `is described in supplement of our recent publication <https://www.nature.com/articles/s41588-024-01937-x#Sec6>`_.
 
 The matching procedure never makes any changes to target genome data and only seeks to match variants in the scoring file to the genome.