From dd1f58dfc5cf776dc8ba861e866bcf75013af9b1 Mon Sep 17 00:00:00 2001 From: Stephanie Spielman Date: Thu, 7 Nov 2024 14:18:11 -0500 Subject: [PATCH 01/12] Simplify the copyKAT script and do not setwd, but copy files instead. Also, add argument to use a threshold --- .../scripts/05_copyKAT.R | 95 ++++++++++--------- 1 file changed, 52 insertions(+), 43 deletions(-) diff --git a/analyses/cell-type-wilms-tumor-06/scripts/05_copyKAT.R b/analyses/cell-type-wilms-tumor-06/scripts/05_copyKAT.R index 24d58b76b..f5067b7e3 100644 --- a/analyses/cell-type-wilms-tumor-06/scripts/05_copyKAT.R +++ b/analyses/cell-type-wilms-tumor-06/scripts/05_copyKAT.R @@ -1,21 +1,21 @@ #!/usr/bin/env Rscript # Run `copyKAT` for one sample with or without a healthy reference -# copyKAT # # USAGE: # Rscript copyKAT.R \ -# --sample_id SCPCS000194 +# --sample_id SCPCS000194 \ +# --use_reference \ +# --distance \ # --ncore 16 # - +# Additional optional arguments include: +# --score_threshold: Annotation prediction score threshold to use when identifying cells to use in reference +# --seed: Integer to set the random seed library(optparse) library(Seurat) library(copykat) -library(fs) -library(jpeg) -library(png) # Parse arguments -------------------------------------------------------------- # set up arguments @@ -29,7 +29,7 @@ option_list <- list( make_option( opt_str = c("-c", "--n_core"), type = "integer", - default = 16, + default = 8, help = "number of cores used to run copyKAT" ), make_option( @@ -44,6 +44,12 @@ option_list <- list( default = "ref", help = "either to run copyKAT with or without reference normal cells" ), + make_option( + opt_str = c("-t", "--score_threshold"), + type = "numeric", + default = 0.85, + help = "Threshold prediction score from label transfer to consider a normal cell in the reference" + ), make_option( opt_str = "--seed", type = "integer", @@ -69,37 +75,29 @@ result_dir <- file.path(module_base, "results", opts$sample_id) # Create directories to save the results of copykat with/without reference using opts$distance -dir.create(file.path(result_dir, "05_copyKAT", opts$use_reference, opts$distance), recursive = TRUE) - -# define scratch directory for tempory saving the output of copykat -scratch_dir <- file.path(module_base, "scratch", opts$sample_id) -dir.create(scratch_dir, recursive = TRUE) - -# path for copykat rds output -name_file <- glue::glue("05_copykat_", opts$sample_id, "_", opts$use_reference, "_distance-", opts$distance, ".rds") -name_full <- file.path(result_dir, "05_copyKAT", opts$use_reference, opts$distance, name_file) - +output_dir <- file.path(result_dir, "05_copyKAT", opts$use_reference, opts$distance) +fs::dir_create(output_dir) -# path to scratch and final heatmap file to copy over -jpeg_file <- glue::glue(opts$sample_id, "_copykat_heatmap.jpeg") -scratch_jpeg <- file.path(scratch_dir, jpeg_file) -output_jpeg_ref <- file.path(result_dir, "05_copyKAT", opts$use_reference, opts$distance, glue::glue("05_copykat_", opts$sample_id, "_", opts$use_reference, "_distance-", opts$distance, "_copykat_heatmap.png")) - -# path to scratch and final .txt prediction file to copy over -prediction_file <- glue::glue(opts$sample_id, "_copykat_prediction.txt") -scratch_prediction <- file.path(scratch_dir, prediction_file) -output_prediction_ref <- file.path(result_dir, "05_copyKAT", opts$use_reference, opts$distance, glue::glue("05_copykat_", opts$sample_id, "_", opts$use_reference, "_distance-", opts$distance, "_copykat_prediction.txt")) +# Define output file names +output_rds <- file.path( + output_dir, + glue::glue("05_copykat_{opts$sample_id}_{opts$use_reference}_distance-{opts$distance}.rds") +) -# path to scratch and final .txt CNA file to copy over -CNA_file <- glue::glue(opts$sample_id, "_copykat_CNA_results.txt") -scratch_CNA <- file.path(scratch_dir, CNA_file) -output_CNA_ref <- file.path(result_dir, "05_copyKAT", opts$use_reference, opts$distance, glue::glue("05_copykat_", opts$sample_id, "_", opts$use_reference, "_distance-", opts$distance, "_copykat_CNA_results.txt")) +output_heatmap_file <- file.path( + output_dir, + glue::glue("05_copykat_{opts$sample_id}_{opts$use_reference}_distance-{opts$distance}_copykat_heatmap.jpg") +) +output_prediction_file <- file.path( + output_dir, + glue::glue("05_copykat_{opts$sample_id}_{opts$use_reference}_distance-{opts$distance}_copykat_prediction.txt") +) -# change working directory of the script to the scratch directory -# this ensures copykat files get saved to the right location -# there is no option to specify an output directory when running copykat -setwd(scratch_dir) +output_cna_file <- file.path( + output_dir, + glue::glue("05_copykat_{opts$sample_id}_{opts$use_reference}_distance-{opts$distance}_copykat_CNA_results.txt") +) # Read in data ----------------------------------------------------------------- srat <- readRDS( @@ -109,8 +107,16 @@ srat <- readRDS( # Extract raw counts ----------------------------------------------------------- exp.rawdata <- GetAssayData(object = srat, assay = "RNA", layer = "counts") -# Extract normal cells --------------------------------------------------------- -normal_cell <- WhichCells(object = srat, expression = fetal_kidney_predicted.compartment %in% c("endothelium", "immune")) +# Define normal cells, if reference should be used ---------------------------- +if (opts$use_reference == "ref") { + normal_cells <- WhichCells( + object = srat, + expression = fetal_kidney_predicted.compartment %in% c("endothelium", "immune") & + fetal_kidney_predicted.compartment.score > opts$score_threshold + ) +} else { + normal_cells <- "" +} # Run copyKAT without reference ------------------------------------------------ @@ -119,7 +125,7 @@ copykat.ref <- copykat( rawmat = exp.rawdata, sam.name = opts$sample_id, distance = opts$distance, - norm.cell.names = ifelse(opts$use_reference == "ref", normal_cell, ""), + norm.cell.names = normal_cells, genome = "hg20", n.cores = opts$n_core, id.type = "E", @@ -128,12 +134,15 @@ copykat.ref <- copykat( KS.cut = 0.05 ) -# Save copykat output reference ---------------------------------------- +# Clean up and copykat output reference ---------------------------------------- -saveRDS(copykat.ref, name_full) +saveRDS(copykat.ref, output_rds) -img <- readJPEG(scratch_jpeg) -writePNG(img, target = output_jpeg_ref) +# move output files to final location +fs::file_move(glue::glue("{opts$sample_id}_copykat_prediction.txt"), output_prediction_file) +fs::file_move(glue::glue("{opts$sample_id}_copykat_heatmap.jpeg"), output_heatmap_file) +fs::file_move(glue::glue("{opts$sample_id}_copykat_CNA_results.txt"), output_cna_file) -fs::file_move(scratch_prediction, output_prediction_ref, overwrite = TRUE) -fs::file_move(scratch_CNA, output_CNA_ref, overwrite = TRUE) +# remove extra files we don't need to save +fs::file_delete(glue::glue("{opts$sample_id}_copykat_CNA_raw_results_gene_by_cell.txt")) +fs::file_delete(glue::glue("{opts$sample_id}_copykat_clustering_results.rds")) From afb30c98956db5bf32b60b10293a366e8528e58c Mon Sep 17 00:00:00 2001 From: Stephanie Spielman Date: Thu, 7 Nov 2024 16:21:13 -0500 Subject: [PATCH 02/12] no longer using png files, and notebook styled --- .../05_copykat_exploration.Rmd | 89 +++++++++++-------- 1 file changed, 51 insertions(+), 38 deletions(-) diff --git a/analyses/cell-type-wilms-tumor-06/notebook_template/05_copykat_exploration.Rmd b/analyses/cell-type-wilms-tumor-06/notebook_template/05_copykat_exploration.Rmd index e3a68f693..52950010b 100644 --- a/analyses/cell-type-wilms-tumor-06/notebook_template/05_copykat_exploration.Rmd +++ b/analyses/cell-type-wilms-tumor-06/notebook_template/05_copykat_exploration.Rmd @@ -3,7 +3,7 @@ title: "Copykat CNV results exploration for `r params$sample_id`" author: "Maud PLASCHKA" date: "`r Sys.Date()`" params: - sample_id: "SCPCS000194" + sample_id: "SCPCS000179" seed: 12345 output: html_document: @@ -16,12 +16,11 @@ output: ```{r setup, include=FALSE} - -knitr::opts_chunk$set(echo = TRUE, - message=FALSE, - warnings=FALSE +knitr::opts_chunk$set( + echo = TRUE, + message = FALSE, + warnings = FALSE ) - ``` @@ -29,14 +28,12 @@ knitr::opts_chunk$set(echo = TRUE, ```{r} - subdiagnosis <- readr::read_tsv( file.path("..", "..", "..", "data", "current", "SCPCP000006", "single_cell_metadata.tsv"), show_col_types = FALSE ) |> dplyr::filter(scpca_sample_id == params$sample_id) |> dplyr::pull(subdiagnosis) - ``` This notebook explores using [`CopyKAT`](https://github.com/navinlabcode/copykat) to estimate tumor and normal cells in `r params$sample_id` from SCPCP000006. @@ -59,7 +56,7 @@ These results are read into this notebook and used to: ```{r packages, message=FALSE, warning=FALSE} library(Seurat) -library(SCpubr) # for plotting +library(SCpubr) # for plotting library(tidyverse) library(patchwork) @@ -87,14 +84,33 @@ In this notebook, we are working with the Wilms tumor sample defined in `r param We work with the pre-processed and labeled `Seurat` object that is the output of `02b_label-transfer_fetal_kidney_reference_Stewart.Rmd` saved in the `results` directory. ```{r} - result_dir <- file.path(module_base, "results", params$sample_id) predictions_paths <- list() full_ck_result_paths <- list() -for(ref_value in c("ref", "noref")){ - for(distance_value in c("euclidean", "spearman")){ - predictions_file <- glue::glue("05_copykat_", {params$sample_id}, "_",ref_value,"_distance-", distance_value, "_copykat_prediction.txt") - full_ck_result_file <- glue::glue("05_copykat_", {params$sample_id}, "_",ref_value,"_distance-", distance_value, "_copykat_CNA_results.txt") +for (ref_value in c("ref", "noref")) { + for (distance_value in c("euclidean", "spearman")) { + predictions_file <- glue::glue( + "05_copykat_", + { + params$sample_id + }, + "_", + ref_value, + "_distance-", + distance_value, + "_copykat_prediction.txt" + ) + full_ck_result_file <- glue::glue( + "05_copykat_", + { + params$sample_id + }, + "_", + ref_value, + "_distance-", + distance_value, + "_copykat_CNA_results.txt" + ) predictions_paths[[glue::glue(ref_value, "_", distance_value)]] <- file.path(result_dir, "05_copyKAT", ref_value, distance_value, predictions_file) full_ck_result_paths[[glue::glue(ref_value, "_", distance_value)]] <- file.path(result_dir, "05_copyKAT", ref_value, distance_value, full_ck_result_file) @@ -121,7 +137,7 @@ Here we defined function that will be used multiple time all along the notebook. ### Load the pre-processed `Seurat` object ```{r load, message=FALSE, warning=FALSE} # open the processed rds object -srat <- readRDS(file.path(result_dir, paste0("02b-fetal_kidney_label-transfer_", params$sample_id,".Rds"))) +srat <- readRDS(file.path(result_dir, paste0("02b-fetal_kidney_label-transfer_", params$sample_id, ".Rds"))) DefaultAssay(srat) <- "SCT" ``` @@ -134,21 +150,21 @@ Below we look at the heatmaps produced by `CopyKAT`. ##### Euclidean distance -![](`r file.path(result_dir, "05_copyKAT", "noref", "euclidean", glue::glue("05_copykat_", params$sample_id, "_noref_distance-euclidean_copykat_heatmap.png"))`) +![](`r file.path(result_dir, "05_copyKAT", "noref", "euclidean", glue::glue("05_copykat_", params$sample_id, "_noref_distance-euclidean_copykat_heatmap.jpg"))`) ##### Spearman distance -![](`r file.path(result_dir, "05_copyKAT", "noref", "spearman", glue::glue("05_copykat_", params$sample_id, "_noref_distance-spearman_copykat_heatmap.png"))`) +![](`r file.path(result_dir, "05_copyKAT", "noref", "spearman", glue::glue("05_copykat_", params$sample_id, "_noref_distance-spearman_copykat_heatmap.jpg"))`) #### Heatmap with endothelial cells as reference ##### Euclidean distance -![](`r file.path(result_dir, "05_copyKAT", "ref", "euclidean", glue::glue("05_copykat_", params$sample_id, "_ref_distance-euclidean_copykat_heatmap.png"))`) +![](`r file.path(result_dir, "05_copyKAT", "ref", "euclidean", glue::glue("05_copykat_", params$sample_id, "_ref_distance-euclidean_copykat_heatmap.jpg"))`) ##### Spearman distance -![](`r file.path(result_dir, "05_copyKAT", "ref", "spearman", glue::glue("05_copykat_", params$sample_id, "_ref_distance-spearman_copykat_heatmap.png"))`) +![](`r file.path(result_dir, "05_copyKAT", "ref", "spearman", glue::glue("05_copykat_", params$sample_id, "_ref_distance-spearman_copykat_heatmap.jpg"))`) #### UMAP @@ -158,19 +174,17 @@ We show a side by side UMAP with results from running `CopyKAT` both with and wi ```{r} # read in ck predictions from both reference types (no_normal and with_normal) -ck_results_df <- predictions_paths |> - purrr::map(readr::read_tsv) |> +ck_results_df <- predictions_paths |> + purrr::map(readr::read_tsv) |> dplyr::bind_rows(.id = "reference_used") # get umap coordinate umap_df <- srat[["umap"]]@cell.embeddings |> as.data.frame() |> - rownames_to_column('barcodes') + rownames_to_column("barcodes") cnv_df <- umap_df |> - dplyr::left_join(ck_results_df, by = c("barcodes" = "cell.names")) - - + dplyr::left_join(ck_results_df, by = c("barcodes" = "cell.names")) ``` ```{r} @@ -201,27 +215,26 @@ We might expect that tumor cells would show an increased estimated copy number i ```{r} # read in full gene by cell copy number detection results full_ck_results_df <- full_ck_result_paths |> - purrr::map(readr::read_tsv) |> - dplyr::bind_rows(.id = "reference_used") - + purrr::map(readr::read_tsv) |> + dplyr::bind_rows(.id = "reference_used") + # for every cell, calculate the mean detection level across all genes in a given chromosome full_cnv_df <- full_ck_results_df |> - tidyr::pivot_longer( + tidyr::pivot_longer( cols = -c( - reference_used, - chrom + reference_used, + chrom ), names_to = "barcodes", values_to = "cnv_detection" - ) |> - dplyr::group_by(chrom, barcodes, reference_used) |> - dplyr::summarise(mean_cnv_detection = mean(cnv_detection)) - + ) |> + dplyr::group_by(chrom, barcodes, reference_used) |> + dplyr::summarise(mean_cnv_detection = mean(cnv_detection)) + # join with cnv info cnv_df <- cnv_df |> - dplyr::left_join(full_cnv_df, by = c("barcodes", "reference_used")) |> - dplyr::filter(!is.na(chrom)) - + dplyr::left_join(full_cnv_df, by = c("barcodes", "reference_used")) |> + dplyr::filter(!is.na(chrom)) ``` Let's look at the distribution of CNV estimation in cells that are called aneuploid and diploid by `CopyKAT`. From 63e3806e3ff5ceee2457c71ccaf208c2ffca44e6 Mon Sep 17 00:00:00 2001 From: Stephanie Spielman Date: Fri, 8 Nov 2024 10:09:28 -0500 Subject: [PATCH 03/12] remove the jpeg packaged; not needed --- analyses/cell-type-wilms-tumor-06/renv.lock | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/analyses/cell-type-wilms-tumor-06/renv.lock b/analyses/cell-type-wilms-tumor-06/renv.lock index fe029e6e2..070935f95 100644 --- a/analyses/cell-type-wilms-tumor-06/renv.lock +++ b/analyses/cell-type-wilms-tumor-06/renv.lock @@ -3165,16 +3165,6 @@ ], "Hash": "8954069286b4b2b0d023d1b288dce978" }, - "jpeg": { - "Package": "jpeg", - "Version": "0.1-10", - "Source": "Repository", - "Repository": "RSPM", - "Requirements": [ - "R" - ], - "Hash": "031a0b683d001a7519202f0628fc0358" - }, "jquerylib": { "Package": "jquerylib", "Version": "0.1.4", From 9cf8fdab415290206abdba01f82a00b09062c34f Mon Sep 17 00:00:00 2001 From: Stephanie Spielman Date: Thu, 7 Nov 2024 13:28:45 -0500 Subject: [PATCH 04/12] update scripts/README --- .../cell-type-wilms-tumor-06/scripts/README.md | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/analyses/cell-type-wilms-tumor-06/scripts/README.md b/analyses/cell-type-wilms-tumor-06/scripts/README.md index 320dbee78..6a1e8ce89 100644 --- a/analyses/cell-type-wilms-tumor-06/scripts/README.md +++ b/analyses/cell-type-wilms-tumor-06/scripts/README.md @@ -7,17 +7,15 @@ This directory contains all scripts used for cell typing Wilms tumor samples fro This script is used to create the fetal kidney reference (Stewart et al) and the full fetal reference (Cao et al). -## `explore-cnv-methods.R` +## `explore-cnv-methods.sh` -This script is used to run 2 CNV inference methods `copykat` and `infercnv` for a selection of samples from SCPCP000006. +This script is used to run and explores 2 CNV inference methods `copykat` and `infercnv` for a selection of samples from SCPCP000006: -We selected previously 5 samples to test for different parameters of `copykat` and `infercnv`: - -- sample SCPCS000194 has > 85 % of cells predicted as kidney and 234 + 83 endothelium and immune cells. -- sample SCPCS000179 has > 94 % of cells predicted as kidney and 25 + 111 endothelium and immune cells. -- sample SCPCS000184 has > 96 % of cells predicted as kidney and 39 + 70 endothelium and immune cells. -- sample SCPCS000205 has > 89 % of cells predicted as kidney and 92 + 76 endothelium and immune cells. -- sample SCPCS000208 has > 95 % of cells predicted as kidney and 18 + 35 endothelium and immune cells. +- SCPCS000194 +- SCPCS000179 +- SCPCS000184 +- SCPCS000205 +- SCPCS000208 ### `06a_build-geneposition.R` From a631fc1c3dbc7bb43df2ad032d040c08c728eab8 Mon Sep 17 00:00:00 2001 From: Stephanie Spielman Date: Thu, 7 Nov 2024 16:22:14 -0500 Subject: [PATCH 05/12] Add shell script explore-cnv-methods to replace R script --- .../scripts/explore-cnv-methods.R | 87 --------------- .../scripts/explore-cnv-methods.sh | 105 ++++++++++++++++++ 2 files changed, 105 insertions(+), 87 deletions(-) delete mode 100644 analyses/cell-type-wilms-tumor-06/scripts/explore-cnv-methods.R create mode 100755 analyses/cell-type-wilms-tumor-06/scripts/explore-cnv-methods.sh diff --git a/analyses/cell-type-wilms-tumor-06/scripts/explore-cnv-methods.R b/analyses/cell-type-wilms-tumor-06/scripts/explore-cnv-methods.R deleted file mode 100644 index 237ab3a17..000000000 --- a/analyses/cell-type-wilms-tumor-06/scripts/explore-cnv-methods.R +++ /dev/null @@ -1,87 +0,0 @@ -# Define path ------------------------------------------------------------------- -# The base path for the OpenScPCA repository, found by its (hidden) .git directory -repository_base <- rprojroot::find_root(rprojroot::is_git_root) -# The path to this module -module_base <- file.path(repository_base, "analyses", "cell-type-wilms-tumor-06") - -# paths to the notebook templates and outputs directory -notebook_template_dir <- file.path(module_base, "notebook_template") -notebook_output_dir <- file.path(module_base, "notebook") - -# We run copyKAT and infercnv for a subselection of samples selected in "04_annotation_Across_Samples_exploration.Rmd" with and without healthy cells as reference - -for (sample_id in c("SCPCS000179", - "SCPCS000184", - "SCPCS000194", - "SCPCS000205", - "SCPCS000208")){ - - ############################################################################## - #################################`copykat` ################################## - ############################################################################## - - # We run and explore copykat using euclidian distance parameter and normal cell as reference - system(command = glue::glue("Rscript ", file.path(module_base,"scripts", "05_copyKAT.R"), " --sample_id ", sample_id, " --n_core ", 32, " --distance ", "euclidean", " --use_reference ", "ref")) - - # We run and explore copykat using spearman distance parameter and normal cell as reference - - system(command = glue::glue("Rscript ", file.path(module_base,"scripts", "05_copyKAT.R"), " --sample_id ", sample_id, " --n_core ", 32, " --distance ", "spearman", " --use_reference ", "ref")) - - # We run and explore copykat using euclidian distance parameter and without normal cell as reference - system(command = glue::glue("Rscript ", file.path(module_base,"scripts", "05_copyKAT.R"), " --sample_id ", sample_id, " --n_core ", 32, " --distance ", "euclidean", " --use_reference ", "noref")) - - # We run and explore copykat using spearman distance parameter and without normal cell as reference - - system(command = glue::glue("Rscript ", file.path(module_base,"scripts", "05_copyKAT.R"), " --sample_id ", sample_id, " --n_core ", 32, " --distance ", "spearman", " --use_reference ", "noref")) - - # We explore `copykat` result rendering one notebook per sample tested - - rmarkdown::render(input = file.path(notebook_template_dir, "05_copykat_exploration.Rmd"), - params = list(sample_id = sample_id, seed = 12345), - output_format = "html_document", - output_file = paste0("05_copykat_exploration_", sample_id,".html"), - output_dir = file.path(notebook_output_dir, sample_id)) - - ############################################################################## - #################################`infercnv` ################################# - ############################################################################## - - # We run and explore infercnv using immune cells as reference and no HMM model - system(command = glue::glue("Rscript ", file.path(module_base,"scripts", "06_infercnv.R"), " --sample_id ", sample_id, " --reference ", "immune", " --HMM ", "no")) - - # We run and explore infercnv using endothelial cells as reference and no HMM model - system(command = glue::glue("Rscript ", file.path(module_base,"scripts", "06_infercnv.R"), " --sample_id ", sample_id, " --reference ", "endothelium", " --HMM ", "no")) - - # We run and explore infercnv using no normal reference and no HMM model - system(command = glue::glue("Rscript ", file.path(module_base,"scripts", "06_infercnv.R"), " --sample_id ", sample_id, " --reference ", "none", " --HMM ", "no")) - - # We run and explore infercnv using both endothelial and immune cells as reference and no HMM model - - system(command = glue::glue("Rscript ", file.path(module_base,"scripts", "06_infercnv.R"), " --sample_id ", sample_id, " --reference ", "both", " --HMM ", "no")) - - # We run and explore infercnv using both endothelial and immune cells as reference and i3 HMM model - - system(command = glue::glue("Rscript ", file.path(module_base,"scripts", "06_infercnv.R"), " --sample_id ", sample_id, " --reference ", "both", " --HMM ", "i3")) - - # We run and explore infercnv using both endothelial and immune cells as reference and i6 HMM model - - system(command = glue::glue("Rscript ", file.path(module_base,"scripts", "06_infercnv.R"), " --sample_id ", sample_id, " --reference ", "both", " --HMM ", "i6")) - - # We run and explore infercnv using both endothelial and immune cells from all non-treated Wilms tumor patients as reference and i3 HMM model - system(command = glue::glue("Rscript ", file.path(module_base,"scripts", "06_infercnv.R"), " --sample_id ", sample_id, " --reference ", "pull", " --HMM ", "i3")) - - rmarkdown::render(input = file.path(notebook_template_dir, "06_cnv_infercnv_exploration.Rmd"), - params = list(sample_id = sample_id, seed = 12345), - output_format = "html_document", - output_file = paste0("06_infercnv_exploration_", sample_id,".html"), - output_dir = file.path(notebook_output_dir, sample_id)) - -} - - - - - - - - \ No newline at end of file diff --git a/analyses/cell-type-wilms-tumor-06/scripts/explore-cnv-methods.sh b/analyses/cell-type-wilms-tumor-06/scripts/explore-cnv-methods.sh new file mode 100755 index 000000000..b2fc24f7d --- /dev/null +++ b/analyses/cell-type-wilms-tumor-06/scripts/explore-cnv-methods.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +# This script performs exploratory analysis by running copyKAT and inferCNV +# on several samples under different conditions, including with and without a +# specified reference, and exploring their results in notebooks. +# +# Default usage: +# ./explore-cnv-methods.sh +# +# Usage if test data is being used: +# TESTING=1 ./explore-cnv-methods.sh +# +# By default, copyKAT will use 32 threads. +# This can be overridden with the THREADS variable: +# THREADS=16 ./explore-cnv-methods.sh +# + +set -euo pipefail + +TESTING=${TESTING:-0} +THREADS=${THREADS:-32} + +# Ensure script is being run from the _module directory_, which is one level up +# from this script's directory. +# This ensures access to the module renv environment +module_dir=$(dirname "${BASH_SOURCE[0]}") +cd ${module_dir} +cd .. + +# Define directories +notebook_template_dir="notebook_template" +results_dir="results" + +# Define argument to use for inferCNV when TESTING==1 +if [[ $TESTING -eq 1 ]]; then + test_string="--testing" +else + test_string="" +fi + +for sample_id in SCPCS000179; do # SCPCS000184 SCPCS000194 SCPCS000205 SCPCS000208; do + + # define notebook output directory + output_dir="${results}/${sample_id}" + mkdir -p ${output_dir} + + ############################################################################## + #################################`copykat` ################################## + ############################################################################## + + + # We run and explore copykat using euclidian distance parameter and normal cell as reference + Rscript scripts/05_copyKAT.R --sample_id ${sample_id} --n_core ${THREADS} --distance "euclidean" --use_reference "ref" + + # We run and explore copykat using spearman distance parameter and normal cell as reference + Rscript scripts/05_copyKAT.R --sample_id ${sample_id} --n_core ${THREADS} --distance "spearman" --use_reference "ref" + + # We run and explore copykat using euclidian distance parameter and without normal cell as reference + Rscript scripts/05_copyKAT.R --sample_id ${sample_id} --n_core ${THREADS} --distance "euclidean" --use_reference "noref" + + # We run and explore copykat using spearman distance parameter and without normal cell as reference + Rscript scripts/05_copyKAT.R --sample_id ${sample_id} --n_core ${THREADS} --distance "spearman" --use_reference "noref" + + # We explore `copykat` results for this sample + Rscript -e "rmarkdown::render(input = '${notebook_template_dir}/05_copykat_exploration.Rmd', + params = list(sample_id = '${sample_id}', seed = 12345), + output_format = 'html_document', + output_file = '05_copykat_exploration_${sample_id}.html', + output_dir = '${output_dir}')" + + + ############################################################################## + #################################`infercnv` ################################# + ############################################################################## + + # We run and explore infercnv using immune cells as reference and no HMM model + Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "immune" --HMM "no" ${test_string} + + # We run and explore infercnv using endothelial cells as reference and no HMM model + Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "endothelium" --HMM "no" ${test_string} + + # We run and explore infercnv using no normal reference and no HMM model + Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "none" --HMM "no" ${test_string} + + # We run and explore infercnv using both endothelial and immune cells as reference and no HMM model + Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "both" --HMM "no" ${test_string} + + # We run and explore infercnv using both endothelial and immune cells as reference and i3 HMM model + Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "both" --HMM "i3" ${test_string} + + # We run and explore infercnv using both endothelial and immune cells as reference and i6 HMM model + Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "both" --HMM "i6" ${test_string} + + # We run and explore infercnv using both endothelial and immune cells from all non-treated Wilms tumor patients as reference and i3 HMM model + Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "pull" --HMM "i3" ${test_string} + + + # We explore `inferCNV` results for this sample + Rscript -e "rmarkdown::render(input = '${notebook_template_dir}/06_cnv_infercnv_exploration.Rmd', + params = list(sample_id = '${sample_id}', seed = 12345), + output_format = 'html_document', + output_file = '06_infercnv_exploration_${sample_id}.html', + output_dir = '$output_dir')" +done + From 38ccc52739d16b20cdfc3ad8cbc6e7c3fb32014f Mon Sep 17 00:00:00 2001 From: Stephanie Spielman Date: Thu, 7 Nov 2024 16:25:17 -0500 Subject: [PATCH 06/12] update workflow script to use explore-cnv-methods.sh (shell script, not R script) --- analyses/cell-type-wilms-tumor-06/00_run_workflow.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/analyses/cell-type-wilms-tumor-06/00_run_workflow.sh b/analyses/cell-type-wilms-tumor-06/00_run_workflow.sh index aeb00964e..3a85815fd 100755 --- a/analyses/cell-type-wilms-tumor-06/00_run_workflow.sh +++ b/analyses/cell-type-wilms-tumor-06/00_run_workflow.sh @@ -130,7 +130,8 @@ if [[ $RUN_EXPLORATORY -eq 1 ]]; then # Run infercnv and copykat for a selection of samples # This script calls scripts/05_copyKAT.R and scripts/06_infercnv.R - Rscript scripts/explore-cnv-methods.R + # By default, copyKAT as called by this script uses 32 cores + TESTING=${IS_CI} ./scripts/explore-cnv-methods.sh fi From 8e447a70a9fcb94a9d6ee7bddc6141332b0cf2ac Mon Sep 17 00:00:00 2001 From: Stephanie Spielman Date: Fri, 8 Nov 2024 09:22:49 -0500 Subject: [PATCH 07/12] add THREADS variable to send to copykat step --- analyses/cell-type-wilms-tumor-06/00_run_workflow.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/analyses/cell-type-wilms-tumor-06/00_run_workflow.sh b/analyses/cell-type-wilms-tumor-06/00_run_workflow.sh index 3a85815fd..4c72722a4 100755 --- a/analyses/cell-type-wilms-tumor-06/00_run_workflow.sh +++ b/analyses/cell-type-wilms-tumor-06/00_run_workflow.sh @@ -10,6 +10,9 @@ # steps that do not directly contribute to final cell type annotations # should be run. Setting RUN_EXPLORATORY=1 will run those steps. # This variable is 0 by default. +# 3. The THREADS variable controls how many cores are used for inference +# with copyKAT, which is an exploratory step in the workflow. +# The variable is 32 by default. # # Default usage: # ./00_run_workflow.sh @@ -24,6 +27,7 @@ set -euo pipefail IS_CI=${TESTING:-0} RUN_EXPLORATORY=${RUN_EXPLORATORY:-0} +THREADS=${THREADS:-32} project_id="SCPCP000006" # Ensure script is being run from its directory @@ -131,7 +135,7 @@ if [[ $RUN_EXPLORATORY -eq 1 ]]; then # Run infercnv and copykat for a selection of samples # This script calls scripts/05_copyKAT.R and scripts/06_infercnv.R # By default, copyKAT as called by this script uses 32 cores - TESTING=${IS_CI} ./scripts/explore-cnv-methods.sh + THREADS=${THREADS} TESTING=${IS_CI} ./scripts/explore-cnv-methods.sh fi From 6eac2538c56842a870138b97354336aa750d0aab Mon Sep 17 00:00:00 2001 From: Stephanie Spielman Date: Fri, 8 Nov 2024 10:11:13 -0500 Subject: [PATCH 08/12] fix renv inconsistency - now activate is at the same version as lockfile --- .../cell-type-wilms-tumor-06/renv/activate.R | 105 ++---------------- 1 file changed, 9 insertions(+), 96 deletions(-) diff --git a/analyses/cell-type-wilms-tumor-06/renv/activate.R b/analyses/cell-type-wilms-tumor-06/renv/activate.R index 2ebe40347..d13f9932a 100644 --- a/analyses/cell-type-wilms-tumor-06/renv/activate.R +++ b/analyses/cell-type-wilms-tumor-06/renv/activate.R @@ -2,7 +2,7 @@ local({ # the requested version of renv - version <- "1.0.8" + version <- "1.0.7" attr(version, "sha") <- NULL # the project directory @@ -98,66 +98,6 @@ local({ unloadNamespace("renv") # load bootstrap tools - ansify <- function(text) { - if (renv_ansify_enabled()) - renv_ansify_enhanced(text) - else - renv_ansify_default(text) - } - - renv_ansify_enabled <- function() { - - override <- Sys.getenv("RENV_ANSIFY_ENABLED", unset = NA) - if (!is.na(override)) - return(as.logical(override)) - - pane <- Sys.getenv("RSTUDIO_CHILD_PROCESS_PANE", unset = NA) - if (identical(pane, "build")) - return(FALSE) - - testthat <- Sys.getenv("TESTTHAT", unset = "false") - if (tolower(testthat) %in% "true") - return(FALSE) - - iderun <- Sys.getenv("R_CLI_HAS_HYPERLINK_IDE_RUN", unset = "false") - if (tolower(iderun) %in% "false") - return(FALSE) - - TRUE - - } - - renv_ansify_default <- function(text) { - text - } - - renv_ansify_enhanced <- function(text) { - - # R help links - pattern <- "`\\?(renv::(?:[^`])+)`" - replacement <- "`\033]8;;ide:help:\\1\a?\\1\033]8;;\a`" - text <- gsub(pattern, replacement, text, perl = TRUE) - - # runnable code - pattern <- "`(renv::(?:[^`])+)`" - replacement <- "`\033]8;;ide:run:\\1\a\\1\033]8;;\a`" - text <- gsub(pattern, replacement, text, perl = TRUE) - - # return ansified text - text - - } - - renv_ansify_init <- function() { - - envir <- renv_envir_self() - if (renv_ansify_enabled()) - assign("ansify", renv_ansify_enhanced, envir = envir) - else - assign("ansify", renv_ansify_default, envir = envir) - - } - `%||%` <- function(x, y) { if (is.null(x)) y else x } @@ -202,10 +142,7 @@ local({ # compute common indent indent <- regexpr("[^[:space:]]", lines) common <- min(setdiff(indent, -1L)) - leave - text <- paste(substring(lines, common), collapse = "\n") - - # substitute in ANSI links for executable renv code - ansify(text) + paste(substring(lines, common), collapse = "\n") } @@ -369,11 +306,7 @@ local({ ) if ("headers" %in% names(formals(utils::download.file))) - { - headers <- renv_bootstrap_download_custom_headers(url) - if (length(headers) && is.character(headers)) - args$headers <- headers - } + args$headers <- renv_bootstrap_download_custom_headers(url) do.call(utils::download.file, args) @@ -452,22 +385,10 @@ local({ for (type in types) { for (repos in renv_bootstrap_repos()) { - # build arguments for utils::available.packages() call - args <- list(type = type, repos = repos) - - # add custom headers if available -- note that - # utils::available.packages() will pass this to download.file() - if ("headers" %in% names(formals(utils::download.file))) - { - headers <- renv_bootstrap_download_custom_headers(url) - if (length(headers) && is.character(headers)) - args$headers <- headers - } - # retrieve package database db <- tryCatch( as.data.frame( - do.call(utils::available.packages, args), + utils::available.packages(type = type, repos = repos), stringsAsFactors = FALSE ), error = identity @@ -549,14 +470,6 @@ local({ } - renv_bootstrap_github_token <- function() { - for (envvar in c("GITHUB_TOKEN", "GITHUB_PAT", "GH_TOKEN")) { - envval <- Sys.getenv(envvar, unset = NA) - if (!is.na(envval)) - return(envval) - } - } - renv_bootstrap_download_github <- function(version) { enabled <- Sys.getenv("RENV_BOOTSTRAP_FROM_GITHUB", unset = "TRUE") @@ -564,16 +477,16 @@ local({ return(FALSE) # prepare download options - token <- renv_bootstrap_github_token() - if (nzchar(Sys.which("curl")) && nzchar(token)) { + pat <- Sys.getenv("GITHUB_PAT") + if (nzchar(Sys.which("curl")) && nzchar(pat)) { fmt <- "--location --fail --header \"Authorization: token %s\"" - extra <- sprintf(fmt, token) + extra <- sprintf(fmt, pat) saved <- options("download.file.method", "download.file.extra") options(download.file.method = "curl", download.file.extra = extra) on.exit(do.call(base::options, saved), add = TRUE) - } else if (nzchar(Sys.which("wget")) && nzchar(token)) { + } else if (nzchar(Sys.which("wget")) && nzchar(pat)) { fmt <- "--header=\"Authorization: token %s\"" - extra <- sprintf(fmt, token) + extra <- sprintf(fmt, pat) saved <- options("download.file.method", "download.file.extra") options(download.file.method = "wget", download.file.extra = extra) on.exit(do.call(base::options, saved), add = TRUE) From 68e0269d7377106cdcb625e84675d245325709ed Mon Sep 17 00:00:00 2001 From: Stephanie Spielman Date: Fri, 8 Nov 2024 10:16:17 -0500 Subject: [PATCH 09/12] fix output_dir typo --- .../cell-type-wilms-tumor-06/scripts/explore-cnv-methods.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/analyses/cell-type-wilms-tumor-06/scripts/explore-cnv-methods.sh b/analyses/cell-type-wilms-tumor-06/scripts/explore-cnv-methods.sh index b2fc24f7d..191139742 100755 --- a/analyses/cell-type-wilms-tumor-06/scripts/explore-cnv-methods.sh +++ b/analyses/cell-type-wilms-tumor-06/scripts/explore-cnv-methods.sh @@ -41,7 +41,7 @@ fi for sample_id in SCPCS000179; do # SCPCS000184 SCPCS000194 SCPCS000205 SCPCS000208; do # define notebook output directory - output_dir="${results}/${sample_id}" + output_dir="${results_dir}/${sample_id}" mkdir -p ${output_dir} ############################################################################## @@ -100,6 +100,6 @@ for sample_id in SCPCS000179; do # SCPCS000184 SCPCS000194 SCPCS000205 SCPCS0002 params = list(sample_id = '${sample_id}', seed = 12345), output_format = 'html_document', output_file = '06_infercnv_exploration_${sample_id}.html', - output_dir = '$output_dir')" + output_dir = '${output_dir}')" done From 575b87d9720ce08b2aab5f7dc0d29ee448d23028 Mon Sep 17 00:00:00 2001 From: Stephanie Spielman Date: Fri, 8 Nov 2024 10:21:10 -0500 Subject: [PATCH 10/12] restore previous cores default --- analyses/cell-type-wilms-tumor-06/scripts/05_copyKAT.R | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/analyses/cell-type-wilms-tumor-06/scripts/05_copyKAT.R b/analyses/cell-type-wilms-tumor-06/scripts/05_copyKAT.R index f5067b7e3..01bd02823 100644 --- a/analyses/cell-type-wilms-tumor-06/scripts/05_copyKAT.R +++ b/analyses/cell-type-wilms-tumor-06/scripts/05_copyKAT.R @@ -7,7 +7,7 @@ # --sample_id SCPCS000194 \ # --use_reference \ # --distance \ -# --ncore 16 +# --ncore 8 # # Additional optional arguments include: # --score_threshold: Annotation prediction score threshold to use when identifying cells to use in reference @@ -29,7 +29,7 @@ option_list <- list( make_option( opt_str = c("-c", "--n_core"), type = "integer", - default = 8, + default = 16, help = "number of cores used to run copyKAT" ), make_option( From 048ac3d45fc58159a624aebf4baa474592b97066 Mon Sep 17 00:00:00 2001 From: Stephanie Spielman Date: Fri, 8 Nov 2024 10:30:48 -0500 Subject: [PATCH 11/12] remove testing argument (that is for next PR) and undo some temp testing code --- .../scripts/explore-cnv-methods.sh | 27 ++++++------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/analyses/cell-type-wilms-tumor-06/scripts/explore-cnv-methods.sh b/analyses/cell-type-wilms-tumor-06/scripts/explore-cnv-methods.sh index 191139742..5f9bab86b 100755 --- a/analyses/cell-type-wilms-tumor-06/scripts/explore-cnv-methods.sh +++ b/analyses/cell-type-wilms-tumor-06/scripts/explore-cnv-methods.sh @@ -7,9 +7,6 @@ # Default usage: # ./explore-cnv-methods.sh # -# Usage if test data is being used: -# TESTING=1 ./explore-cnv-methods.sh -# # By default, copyKAT will use 32 threads. # This can be overridden with the THREADS variable: # THREADS=16 ./explore-cnv-methods.sh @@ -17,7 +14,6 @@ set -euo pipefail -TESTING=${TESTING:-0} THREADS=${THREADS:-32} # Ensure script is being run from the _module directory_, which is one level up @@ -31,14 +27,7 @@ cd .. notebook_template_dir="notebook_template" results_dir="results" -# Define argument to use for inferCNV when TESTING==1 -if [[ $TESTING -eq 1 ]]; then - test_string="--testing" -else - test_string="" -fi - -for sample_id in SCPCS000179; do # SCPCS000184 SCPCS000194 SCPCS000205 SCPCS000208; do +for sample_id in SCPCS000179 SCPCS000184 SCPCS000194 SCPCS000205 SCPCS000208; do # define notebook output directory output_dir="${results_dir}/${sample_id}" @@ -74,25 +63,25 @@ for sample_id in SCPCS000179; do # SCPCS000184 SCPCS000194 SCPCS000205 SCPCS0002 ############################################################################## # We run and explore infercnv using immune cells as reference and no HMM model - Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "immune" --HMM "no" ${test_string} + Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "immune" --HMM "no" # We run and explore infercnv using endothelial cells as reference and no HMM model - Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "endothelium" --HMM "no" ${test_string} + Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "endothelium" --HMM "no" # We run and explore infercnv using no normal reference and no HMM model - Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "none" --HMM "no" ${test_string} + Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "none" --HMM "no" # We run and explore infercnv using both endothelial and immune cells as reference and no HMM model - Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "both" --HMM "no" ${test_string} + Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "both" --HMM "no" # We run and explore infercnv using both endothelial and immune cells as reference and i3 HMM model - Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "both" --HMM "i3" ${test_string} + Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "both" --HMM "i3" # We run and explore infercnv using both endothelial and immune cells as reference and i6 HMM model - Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "both" --HMM "i6" ${test_string} + Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "both" --HMM "i6" # We run and explore infercnv using both endothelial and immune cells from all non-treated Wilms tumor patients as reference and i3 HMM model - Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "pull" --HMM "i3" ${test_string} + Rscript scripts/06_infercnv.R --sample_id ${sample_id} --reference "pull" --HMM "i3" # We explore `inferCNV` results for this sample From f5c7b1cb3515152a4476c729529d47305be1c1cd Mon Sep 17 00:00:00 2001 From: Stephanie Spielman Date: Mon, 11 Nov 2024 11:02:47 -0500 Subject: [PATCH 12/12] update comments --- analyses/cell-type-wilms-tumor-06/scripts/05_copyKAT.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/analyses/cell-type-wilms-tumor-06/scripts/05_copyKAT.R b/analyses/cell-type-wilms-tumor-06/scripts/05_copyKAT.R index 01bd02823..b5a81a629 100644 --- a/analyses/cell-type-wilms-tumor-06/scripts/05_copyKAT.R +++ b/analyses/cell-type-wilms-tumor-06/scripts/05_copyKAT.R @@ -7,11 +7,11 @@ # --sample_id SCPCS000194 \ # --use_reference \ # --distance \ -# --ncore 8 +# --n_core 8 # # Additional optional arguments include: -# --score_threshold: Annotation prediction score threshold to use when identifying cells to use in reference -# --seed: Integer to set the random seed +# --score_threshold: Annotation prediction score threshold to use when identifying cells to use in reference. Default is 0.85 +# --seed: Integer to set the random seed. Default is 12345 library(optparse) library(Seurat)