morinlab · vladimirsouza · Apr 24, 2024 · Apr 24, 2024 · Apr 24, 2024 · Apr 25, 2024
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -30,6 +30,7 @@ Depends:
     R (>= 2.10)
 LazyData: true
 Imports:
+    circlize,
     data.table,
     dplyr,
     ggplot2,

diff --git a/NAMESPACE b/NAMESPACE
@@ -8,6 +8,8 @@ export(check_excess_params)
 export(collate_results)
 export(get_ashm_count_matrix)
 export(get_cn_segments)
+export(get_cn_states)
+export(get_cnv_and_ssm_status)
 export(get_coding_ssm)
 export(get_coding_ssm_status)
 export(get_colours)
@@ -23,6 +25,7 @@ export(id_ease)
 export(process_regions)
 export(region_to_chunks)
 export(review_hotspots)
+import(circlize)
 import(data.table, except = c("last", "first", "between", "transpose"))
 import(dplyr)
 import(ggplot2)

diff --git a/R/get_cn_states.R b/R/get_cn_states.R
@@ -0,0 +1,130 @@
+#' @title Get CN States.
+#'
+#' @description Get a copy number matrix for all samples based on segmented data in the database.
+#'
+#' @details This function returns CN states for the specified regions.
+#' For how to specify regions, refer to the parameter descriptions and function examples.
+#' Is this function not what you are looking for? Try one of the following, similar, functions; [GAMBLR.results::assign_cn_to_ssm], [GAMBLR.results::get_cn_segments], [GAMBLR.results::get_sample_cn_segments]
+#'
+#' @param regions_list A vector of regions in the format chrom:start-end.
+#' @param regions_bed A bed file with one row for each region you want to determine the CN state from.
+#' @param region_names Subset CN states on specific regions (gene symbols e.g FCGR2B).
+#' @param these_samples_metadata A metadata table to auto-subset the data to samples in that table before returning.
+#' @param this_seq_type Seq type for returned CN segments. One of "genome" (default) or "capture".
+#' @param all_cytobands Include all cytobands, default is set to FALSE. Currently only supports hg19.
+#' @param use_cytoband_name Use cytoband names instead of region names, e.g p36.33.
+#' @param missing_data_as_diploid Fill in any sample/region combinations with missing data as diploid (e.g., CN state like 2). Default is FALSE.
+#'
+#' @return Copy number matrix.
+#'
+#' @import dplyr circlize tibble stringr tidyr
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' #basic usage, generic lymphoma gene list
+#' cn_matrix = get_cn_states(regions_bed=GAMBLR.data::grch37_lymphoma_genes_bed)
+#'
+#' myc_region <- GAMBLR.utils::gene_to_region(
+#'  gene_symbol = "MYC",
+#'  projection = "grch37",
+#'  return_as = "region"
+#' )
+#'
+#' single_gene_cn <- get_cn_states(
+#'  regions_list = myc_region,
+#'  region_names = "MYC"
+#' )
+#'
+#' # For capture
+#' single_gene_cn <- get_cn_states(
+#'  regions_list = myc_region,
+#'  region_names = "MYC",
+#'  this_seq_type = "capture"
+#' )
+#' }
+#'
+get_cn_states = function(regions_list,
+                         regions_bed,
+                         region_names,
+                         these_samples_metadata,
+                         this_seq_type = "genome",
+                         all_cytobands = FALSE,
+                         use_cytoband_name = FALSE,
+                         missing_data_as_diploid = FALSE){
+
+  if(missing(these_samples_metadata)){
+    these_samples_metadata = get_gambl_metadata(seq_type_filter=this_seq_type)
+  }else{
+    these_samples_metadata = dplyr::filter(these_samples_metadata,seq_type==this_seq_type)
+  }
+  if(all_cytobands){
+    message("Currently, only grch37 is supported")
+  }
+  #retrieve the CN value for this region for every segment that overlaps it
+  bed2region=function(x){
+    paste0(x[1], ":", as.numeric(x[2]), "-", as.numeric(x[3]))
+  }
+  if(all_cytobands){
+    message("Cytobands are in respect to hg19. This will take awhile but it does work, trust me!")
+    use_cytoband_name = TRUE
+    regions_bed = circlize::read.cytoband(species = "hg19")$df
+    colnames(regions_bed) = c("chromosome_name", "start_position", "end_position", "name", "dunno")
+    if(use_cytoband_name){
+      regions_bed = mutate(regions_bed, region_name = paste0(str_remove(chromosome_name, pattern = "chr"), name))
+      region_names = pull(regions_bed, region_name)
+    }else{
+      region_names = pull(regions_bed, region_name)
+    }
+    regions = apply(regions_bed, 1, bed2region)
+    #use the cytobands from the circlize package (currently hg19 but can extend to hg38 once GAMBLR handles it) Has this been updated?
+  }else if(missing(regions_list)){
+    if(!missing(regions_bed)){
+      regions = apply(regions_bed, 1, bed2region)
+    }else{
+      warning("You must supply either regions_list or regions_df")
+    }
+  }else{
+    regions = regions_list
+  }
+  region_segs = lapply(regions,function(x){get_cn_segments(region = x, streamlined = TRUE, this_seq_type = this_seq_type)})
+  if(missing(region_names) & !use_cytoband_name){
+    region_names = regions
+  }
+  tibbled_data = tibble(region_segs, region_name = region_names)
+  unnested_df = tibbled_data %>%
+    unnest_longer(region_segs)
+
+  # check whether copy number segments were found
+  if(nrow(unnested_df) == 0){
+    warning("No copy number segments could be found using the specified parameters.")
+    return(NULL)
+  }
+
+  seg_df = data.frame(ID = unnested_df$region_segs$ID, CN = unnested_df$region_segs$CN,region_name = unnested_df$region_name)
+  #arbitrarily take the first segment for each region/ID combination
+  seg_df = seg_df %>%
+    dplyr::group_by(ID, region_name) %>%
+    dplyr::slice(1) %>%
+    dplyr::rename("sample_id" = "ID")
+
+  meta_arranged = these_samples_metadata %>%
+    dplyr::select(sample_id, pathology, lymphgen) %>%
+    arrange(pathology, lymphgen)
+
+  eg = expand_grid(sample_id = pull(meta_arranged, sample_id), region_name = as.character(unique(seg_df$region_name)))
+  all_cn = left_join(eg, seg_df, by = c("sample_id" = "sample_id", "region_name" = "region_name"))
+
+  #fill in any sample/region combinations with missing data as diploid
+  if(missing_data_as_diploid){
+    all_cn = mutate(all_cn, CN = replace_na(CN, 2))
+  }
+
+  cn_matrix = pivot_wider(all_cn, id_cols = "sample_id", names_from = "region_name", values_from = "CN") %>%
+    column_to_rownames("sample_id")
+
+  #order the regions the same way the user provided them for convenience
+  cn_matrix = cn_matrix[,region_names, drop=FALSE]
+
+  return(cn_matrix)
+}
diff --git a/R/get_cnv_and_ssm_status.R b/R/get_cnv_and_ssm_status.R
@@ -0,0 +1,240 @@
+#' @title Get CNV and coding SSM combined status
+#' 
+#' @description For each specified chromosome region (gene name), return status 1 if the copy number (CN) 
+#'   state is non-neutral, *i.e.* different from 2, or if the region contains any coding simple somatic mutation (SSM).
+#' 
+#' @details The user can choose from which regions are intended to return only copy number variation (CNV) status, 
+#'   only coding SSM status, or at least the presence of one of them. This behavior is controlled by the arguments 
+#'   `genes_and_cn_threshs` (column `cn_thresh`) and `only_cnv`.
+#'   
+#'   This function internally calls the `get_cn_states`, `get_ssm_by_samples` and `get_coding_ssm_status`functions. 
+#'   Therefore, many of its arguments are assigned to these functions. If needed, see the documentation of these 
+#'   functions for more information. 
+#'   
+#'   In the case of returning NA values, this is due to the `get_cn_segments` function not being able to internally 
+#'   return any copy number segments from the specified chromosome region.
+#' 
+#' @param genes_and_cn_threshs A data frame with columns "gene_id" and "cn_thresh". The "gene_id" column stores 
+#'   gene symbols (characters) which determine the regions to return CNV and/or coding SSM status. The "cn_thresh" 
+#'   column stores integers that mean the maximum or minimum CN states to return status 1 (contains CNV) for 
+#'   its respective gene. If this integer is below 2 (neutral CN state for diploids), it is taken as the maximum 
+#'   (gene consider as tumor suppressor); if above 2, it is the minimum (oncogene); if equal to 2, do not consider
+#'   CNV to return status.
+#' @param these_sample_ids A vector of sample IDs that you want results for.
+#' @param these_samples_metadata A metadata table with sample IDs that you want results for. Can be created with 
+#'   the `get_gambl_metadata` function.
+#' @param this_seq_type The seq type to get results for. Possible values are "genome" (default) or "capture".
+#' @param only_cnv A vector of gene names indicating the genes for which only CNV status should be considered, 
+#'   ignoring SSM status. Set this argument to "all" or "none" (default) to apply this behavior to all or none 
+#'   of the genes, respectively.
+#' @param projection Reference genome build. Possible values are "grch37" (default) or "hg38".
+#' @param from_flatfile Logical parameter indicating whether to use flat file to retrieve mutations. Set to FALSE 
+#' to use database instead. Default is TRUE.
+#' @param include_hotspots Logical parameter indicating whether hotspots object should also be tabulated. Default is TRUE.
+#' @param review_hotspots Logical parameter indicating whether hotspots object should be reviewed to include 
+#'   functionally relevant mutations or rare lymphoma-related genes. Default is TRUE.
+#' @param subset_from_merge Argument to internally pass to `get_ssm_by_samples` function. If set to TRUE, 
+#'   the data will be subset from a pre-merged MAF of samples with the specified this_seq_type, Instead of merging 
+#'   individual MAFs. Default is FALSE.
+#' @param augmented Argument to internally pass to the functions `get_ssm_by_samples` and `get_coding_ssm_status`. 
+#'   A logical parameter (default: TRUE). Set to FALSE to use multi-sample patients, instead of the original MAF 
+#'   from each sample.
+#' @param min_read_support_ssm Only consider SSMs with at least this many reads in t_alt_count (for cleaning 
+#'   up augmented MAFs).
+#'
+#' @return A data frame with CNV and SSM combined status.
+#' 
+#' @import dplyr
+#' @export
+#'
+#' @examples
+#' # Define samples
+#' these_sample_ids = c(
+#'   "BLGSP-71-06-00160-01A-03D",
+#'   "BLGSP-71-06-00252-01A-01D",
+#'   "BLGSP-71-19-00122-09A.1-01D",
+#'   "BLGSP-71-19-00523-09A-01D",
+#'   "BLGSP-71-21-00187-01A-01E",
+#'   "BLGSP-71-21-00188-01A-04E"
+#' )
+#' 
+#' # For MYC and SYNCRIP, return CNV and SSM combined status; for MIR17HG, 
+#' # return only CNV status; for CCND3 return only SSM status
+#' genes_and_cn_threshs = data.frame(
+#'   gene_id = c("MYC", "MIR17HG", "CCND3", "SYNCRIP"),
+#'   cn_thresh = c(3, 3, 2, 1)
+#' )
+#' get_cnv_and_ssm_status(
+#'   genes_and_cn_threshs = genes_and_cn_threshs,
+#'   these_sample_ids = these_sample_ids,
+#'   only_cnv = "MIR17HG",
+#' )
+#' 
+#' # For all genes of interest, return only CNV status
+#' genes_and_cn_threshs = data.frame(
+#'   gene_id = c("MYC", "MIR17HG", "SYNCRIP"),
+#'   cn_thresh = c(3, 3, 1)
+#' )
+#' get_cnv_and_ssm_status(
+#'   genes_and_cn_threshs = genes_and_cn_threshs,
+#'   these_sample_ids = these_sample_ids,
+#'   only_cnv = "all",
+#' )
+#' 
+get_cnv_and_ssm_status = function(genes_and_cn_threshs,
+                                  these_sample_ids = NULL,
+                                  these_samples_metadata = NULL,
+                                  this_seq_type = "genome",
+                                  only_cnv = "none",
+                                  projection = "grch37",
+                                  from_flatfile = TRUE,
+                                  include_hotspots = TRUE,
+                                  review_hotspots = TRUE,
+                                  subset_from_merge = FALSE,
+                                  augmented = TRUE,
+                                  min_read_support_ssm = 3){
+
+  # check parameters
+  stopifnot('`genes_and_cn_threshs` argument is missing.' = !missing(genes_and_cn_threshs))
+
+  stopifnot('`genes_and_cn_threshs` argument must be a data frame with columns "gene_id" (characters) and "cn_thresh" (integers).' = {
+    k = class(genes_and_cn_threshs) == "data.frame" &
+      all( c("gene_id", "cn_thresh") %in% names(genes_and_cn_threshs) )
+    if(k){
+      is.character(genes_and_cn_threshs$gene_id) &
+        is.numeric(genes_and_cn_threshs$cn_thresh) &
+        identical(genes_and_cn_threshs$cn_thresh, trunc(genes_and_cn_threshs$cn_thresh))
+    }
+  })
+
+  stopifnot('`projection` argument must be "grch37" or "hg38."' = projection %in% c("grch37", "hg38"))
+
+  stopifnot('`this_seq_type` argument must be "genome" or "capture."' = this_seq_type %in% c("genome", "capture"))
+
+  stopifnot('`only_cnv` argument must be "none", "all", or a subset of `genes_and_cn_threshs$gene_id`' = {
+    only_cnv == "none" |
+      only_cnv == "all" |
+      all(only_cnv %in% genes_and_cn_threshs$gene_id)
+  })
+
+  # get metadata with the dedicated helper function
+  these_samples_metadata = id_ease(
+    these_samples_metadata = these_samples_metadata,
+    these_sample_ids = these_sample_ids,
+    this_seq_type = this_seq_type,
+    verbose = FALSE
+  )
+
+  # get gene regions (don't use GAMBLR.utils::gene_to_region due to dependency issues)
+  if(projection == "grch37"){
+    gene_coordinates = GAMBLR.data::grch37_gene_coordinates
+    # chr_select = paste0(c(c(1:22),"X","Y"))
+  }else{
+    gene_coordinates = GAMBLR.data::hg38_gene_coordinates
+    # chr_select = paste0("chr", c(c(1:22),"X","Y"))
+  }
+  gene_coordinates = filter(gene_coordinates, gene_name %in% genes_and_cn_threshs$gene_id)
+  my_regions = with(gene_coordinates, paste0(chromosome, ":", start, "-", end)) %>% 
+    setNames(gene_coordinates$gene_name) %>% 
+    .[genes_and_cn_threshs$gene_id]
+
+  if(length(my_regions) < nrow(genes_and_cn_threshs)){
+    genes_and_cn_threshs = dplyr::filter(genes_and_cn_threshs, gene_id %in% names(my_regions))
+  }
+
+  ### cnv
+  thresh_2 = genes_and_cn_threshs$cn_thresh == 2
+  genes_and_cn_threshs_non_neutral = genes_and_cn_threshs[!thresh_2,]
+  check_cnv = nrow(genes_and_cn_threshs_non_neutral) > 0
+
+  if(check_cnv){
+    # get cn states
+    cn_matrix = get_cn_states(
+      regions_list = my_regions[genes_and_cn_threshs_non_neutral$gene_id],
+      region_names = genes_and_cn_threshs_non_neutral$gene_id,
+      these_samples_metadata = these_samples_metadata,
+      this_seq_type = this_seq_type
+    )
+    cn_matrix = cn_matrix[these_samples_metadata$sample_id,, drop=FALSE]
+
+    # get cnv status
+    cnv_status = mapply(function(cnstate, thresh){
+      if(thresh < 2){
+        cnstate <= thresh
+      }else if(thresh == 2){
+        cnstate != thresh
+      }else if(thresh > 2){
+        cnstate >= thresh
+      }
+    }, cn_matrix, genes_and_cn_threshs_non_neutral$cn_thresh, USE.NAMES = TRUE, SIMPLIFY = FALSE) %>% 
+      as.data.frame %>% 
+      {. * 1}
+    rownames(cnv_status) = rownames(cn_matrix)
+
+    # if only CNV statuses are desired, output them
+    if(only_cnv == "all"){
+      return(cnv_status)
+    }
+
+    # add cnv status as zero to genes whose cn threshold is 2
+    if(any(thresh_2)){
+      cnv_status = genes_and_cn_threshs$gene_id[thresh_2] %>% 
+        { matrix(0, nrow = nrow(cnv_status), ncol = length(.), dimnames = list(NULL, .)) } %>% 
+        cbind(cnv_status)
+    }
+    cnv_status = dplyr::select(cnv_status, genes_and_cn_threshs$gene_id)
+  }
+
+  ### ssm
+  # genes to get ssm status
+  if(only_cnv == "nome"){
+    genes_to_check_ssm = genes_and_cn_threshs$gene_id
+  }else{
+    genes_to_check_ssm = genes_and_cn_threshs$gene_id [ !(genes_and_cn_threshs$gene_id %in% only_cnv) ]
+  }
+
+  # get maf data
+  my_maf = get_ssm_by_samples(
+    these_samples_metadata = these_samples_metadata,
+    projection = projection,
+    this_seq_type = this_seq_type,
+    min_read_support = min_read_support_ssm,
+    these_genes = genes_to_check_ssm,
+    subset_from_merge = subset_from_merge,
+    augmented = augmented
+  )
+
+  # get ssm status
+  if(projection == "grch37"){
+    genome_build = "hg19"
+  }
+  ssm_status = get_coding_ssm_status(
+    gene_symbols = genes_to_check_ssm,
+    these_samples_metadata = these_samples_metadata,
+    maf_data = my_maf,
+    projection = projection,
+    genome_build = genome_build,
+    min_read_support = min_read_support_ssm,
+    from_flatfile = from_flatfile,
+    include_hotspots = include_hotspots,
+    include_silent = FALSE,
+    augmented = augmented
+  ) %>% column_to_rownames("sample_id")
+
+  # add missing regions to ssm_status as zero statuses
+  missing_regions = !(genes_and_cn_threshs$gene_id %in% names(ssm_status))
+  if(any(missing_regions)){
+    ssm_status = genes_and_cn_threshs$gene_id[missing_regions] %>% 
+      { matrix(0, nrow(ssm_status), length(.), dimnames = list(NULL, .)) } %>% 
+      cbind(ssm_status) 
+  }
+  ssm_status = dplyr::select(ssm_status, genes_and_cn_threshs$gene_id)
+  ssm_status = ssm_status[these_samples_metadata$sample_id,, drop=FALSE]
+
+  # combine cnv and ssm status
+  if(check_cnv){
+    (cnv_status | ssm_status) * 1
+  }else{
+    ssm_status
+  }
+}