Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add get_cnv_and_ssm_status .data version #84

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
1 change: 1 addition & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Depends:
R (>= 2.10)
LazyData: true
Imports:
circlize,
data.table,
dplyr,
ggplot2,
Expand Down
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ export(check_excess_params)
export(collate_results)
export(get_ashm_count_matrix)
export(get_cn_segments)
export(get_cn_states)
export(get_cnv_and_ssm_status)
export(get_coding_ssm)
export(get_coding_ssm_status)
export(get_colours)
Expand All @@ -23,6 +25,7 @@ export(id_ease)
export(process_regions)
export(region_to_chunks)
export(review_hotspots)
import(circlize)
import(data.table, except = c("last", "first", "between", "transpose"))
import(dplyr)
import(ggplot2)
Expand Down
130 changes: 130 additions & 0 deletions R/get_cn_states.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
#' @title Get CN States.
#'
#' @description Get a copy number matrix for all samples based on segmented data in the database.
#'
#' @details This function returns CN states for the specified regions.
#' For how to specify regions, refer to the parameter descriptions and function examples.
#' Is this function not what you are looking for? Try one of the following, similar, functions; [GAMBLR.results::assign_cn_to_ssm], [GAMBLR.results::get_cn_segments], [GAMBLR.results::get_sample_cn_segments]
#'
#' @param regions_list A vector of regions in the format chrom:start-end.
#' @param regions_bed A bed file with one row for each region you want to determine the CN state from.
#' @param region_names Subset CN states on specific regions (gene symbols e.g FCGR2B).
#' @param these_samples_metadata A metadata table to auto-subset the data to samples in that table before returning.
#' @param this_seq_type Seq type for returned CN segments. One of "genome" (default) or "capture".
#' @param all_cytobands Include all cytobands, default is set to FALSE. Currently only supports hg19.
#' @param use_cytoband_name Use cytoband names instead of region names, e.g p36.33.
#' @param missing_data_as_diploid Fill in any sample/region combinations with missing data as diploid (e.g., CN state like 2). Default is FALSE.
#'
#' @return Copy number matrix.
#'
#' @import dplyr circlize tibble stringr tidyr
#' @export
#'
#' @examples
#' \dontrun{
#' #basic usage, generic lymphoma gene list
#' cn_matrix = get_cn_states(regions_bed=GAMBLR.data::grch37_lymphoma_genes_bed)
#'
#' myc_region <- GAMBLR.utils::gene_to_region(
#' gene_symbol = "MYC",
#' projection = "grch37",
#' return_as = "region"
#' )
#'
#' single_gene_cn <- get_cn_states(
#' regions_list = myc_region,
#' region_names = "MYC"
#' )
#'
#' # For capture
#' single_gene_cn <- get_cn_states(
#' regions_list = myc_region,
#' region_names = "MYC",
#' this_seq_type = "capture"
#' )
#' }
#'
get_cn_states = function(regions_list,
regions_bed,
region_names,
these_samples_metadata,
this_seq_type = "genome",
all_cytobands = FALSE,
use_cytoband_name = FALSE,
missing_data_as_diploid = FALSE){

if(missing(these_samples_metadata)){
these_samples_metadata = get_gambl_metadata(seq_type_filter=this_seq_type)
}else{
these_samples_metadata = dplyr::filter(these_samples_metadata,seq_type==this_seq_type)
}
if(all_cytobands){
message("Currently, only grch37 is supported")
}
#retrieve the CN value for this region for every segment that overlaps it
bed2region=function(x){
paste0(x[1], ":", as.numeric(x[2]), "-", as.numeric(x[3]))
}
if(all_cytobands){
message("Cytobands are in respect to hg19. This will take awhile but it does work, trust me!")
use_cytoband_name = TRUE
regions_bed = circlize::read.cytoband(species = "hg19")$df
colnames(regions_bed) = c("chromosome_name", "start_position", "end_position", "name", "dunno")
if(use_cytoband_name){
regions_bed = mutate(regions_bed, region_name = paste0(str_remove(chromosome_name, pattern = "chr"), name))
region_names = pull(regions_bed, region_name)
}else{
region_names = pull(regions_bed, region_name)
}
regions = apply(regions_bed, 1, bed2region)
#use the cytobands from the circlize package (currently hg19 but can extend to hg38 once GAMBLR handles it) Has this been updated?
}else if(missing(regions_list)){
if(!missing(regions_bed)){
regions = apply(regions_bed, 1, bed2region)
}else{
warning("You must supply either regions_list or regions_df")
}
}else{
regions = regions_list
}
region_segs = lapply(regions,function(x){get_cn_segments(region = x, streamlined = TRUE, this_seq_type = this_seq_type)})
if(missing(region_names) & !use_cytoband_name){
region_names = regions
}
tibbled_data = tibble(region_segs, region_name = region_names)
unnested_df = tibbled_data %>%
unnest_longer(region_segs)

# check whether copy number segments were found
if(nrow(unnested_df) == 0){
warning("No copy number segments could be found using the specified parameters.")
return(NULL)
}

seg_df = data.frame(ID = unnested_df$region_segs$ID, CN = unnested_df$region_segs$CN,region_name = unnested_df$region_name)
#arbitrarily take the first segment for each region/ID combination
seg_df = seg_df %>%
dplyr::group_by(ID, region_name) %>%
dplyr::slice(1) %>%
dplyr::rename("sample_id" = "ID")

meta_arranged = these_samples_metadata %>%
dplyr::select(sample_id, pathology, lymphgen) %>%
arrange(pathology, lymphgen)

eg = expand_grid(sample_id = pull(meta_arranged, sample_id), region_name = as.character(unique(seg_df$region_name)))
all_cn = left_join(eg, seg_df, by = c("sample_id" = "sample_id", "region_name" = "region_name"))

#fill in any sample/region combinations with missing data as diploid
if(missing_data_as_diploid){
all_cn = mutate(all_cn, CN = replace_na(CN, 2))
}

cn_matrix = pivot_wider(all_cn, id_cols = "sample_id", names_from = "region_name", values_from = "CN") %>%
column_to_rownames("sample_id")

#order the regions the same way the user provided them for convenience
cn_matrix = cn_matrix[,region_names, drop=FALSE]

return(cn_matrix)
}
240 changes: 240 additions & 0 deletions R/get_cnv_and_ssm_status.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,240 @@
#' @title Get CNV and coding SSM combined status
#'
#' @description For each specified chromosome region (gene name), return status 1 if the copy number (CN)
#' state is non-neutral, *i.e.* different from 2, or if the region contains any coding simple somatic mutation (SSM).
#'
#' @details The user can choose from which regions are intended to return only copy number variation (CNV) status,
#' only coding SSM status, or at least the presence of one of them. This behavior is controlled by the arguments
#' `genes_and_cn_threshs` (column `cn_thresh`) and `only_cnv`.
#'
#' This function internally calls the `get_cn_states`, `get_ssm_by_samples` and `get_coding_ssm_status`functions.
#' Therefore, many of its arguments are assigned to these functions. If needed, see the documentation of these
#' functions for more information.
#'
#' In the case of returning NA values, this is due to the `get_cn_segments` function not being able to internally
#' return any copy number segments from the specified chromosome region.
#'
#' @param genes_and_cn_threshs A data frame with columns "gene_id" and "cn_thresh". The "gene_id" column stores
#' gene symbols (characters) which determine the regions to return CNV and/or coding SSM status. The "cn_thresh"
#' column stores integers that mean the maximum or minimum CN states to return status 1 (contains CNV) for
#' its respective gene. If this integer is below 2 (neutral CN state for diploids), it is taken as the maximum
#' (gene consider as tumor suppressor); if above 2, it is the minimum (oncogene); if equal to 2, do not consider
#' CNV to return status.
#' @param these_sample_ids A vector of sample IDs that you want results for.
#' @param these_samples_metadata A metadata table with sample IDs that you want results for. Can be created with
#' the `get_gambl_metadata` function.
#' @param this_seq_type The seq type to get results for. Possible values are "genome" (default) or "capture".
#' @param only_cnv A vector of gene names indicating the genes for which only CNV status should be considered,
#' ignoring SSM status. Set this argument to "all" or "none" (default) to apply this behavior to all or none
#' of the genes, respectively.
#' @param projection Reference genome build. Possible values are "grch37" (default) or "hg38".
#' @param from_flatfile Logical parameter indicating whether to use flat file to retrieve mutations. Set to FALSE
#' to use database instead. Default is TRUE.
#' @param include_hotspots Logical parameter indicating whether hotspots object should also be tabulated. Default is TRUE.
#' @param review_hotspots Logical parameter indicating whether hotspots object should be reviewed to include
#' functionally relevant mutations or rare lymphoma-related genes. Default is TRUE.
#' @param subset_from_merge Argument to internally pass to `get_ssm_by_samples` function. If set to TRUE,
#' the data will be subset from a pre-merged MAF of samples with the specified this_seq_type, Instead of merging
#' individual MAFs. Default is FALSE.
#' @param augmented Argument to internally pass to the functions `get_ssm_by_samples` and `get_coding_ssm_status`.
#' A logical parameter (default: TRUE). Set to FALSE to use multi-sample patients, instead of the original MAF
#' from each sample.
#' @param min_read_support_ssm Only consider SSMs with at least this many reads in t_alt_count (for cleaning
#' up augmented MAFs).
#'
#' @return A data frame with CNV and SSM combined status.
#'
#' @import dplyr
#' @export
#'
#' @examples
#' # Define samples
#' these_sample_ids = c(
#' "BLGSP-71-06-00160-01A-03D",
#' "BLGSP-71-06-00252-01A-01D",
#' "BLGSP-71-19-00122-09A.1-01D",
#' "BLGSP-71-19-00523-09A-01D",
#' "BLGSP-71-21-00187-01A-01E",
#' "BLGSP-71-21-00188-01A-04E"
#' )
#'
#' # For MYC and SYNCRIP, return CNV and SSM combined status; for MIR17HG,
#' # return only CNV status; for CCND3 return only SSM status
#' genes_and_cn_threshs = data.frame(
#' gene_id = c("MYC", "MIR17HG", "CCND3", "SYNCRIP"),
#' cn_thresh = c(3, 3, 2, 1)
#' )
#' get_cnv_and_ssm_status(
#' genes_and_cn_threshs = genes_and_cn_threshs,
#' these_sample_ids = these_sample_ids,
#' only_cnv = "MIR17HG",
#' )
#'
#' # For all genes of interest, return only CNV status
#' genes_and_cn_threshs = data.frame(
#' gene_id = c("MYC", "MIR17HG", "SYNCRIP"),
#' cn_thresh = c(3, 3, 1)
#' )
#' get_cnv_and_ssm_status(
#' genes_and_cn_threshs = genes_and_cn_threshs,
#' these_sample_ids = these_sample_ids,
#' only_cnv = "all",
#' )
#'
get_cnv_and_ssm_status = function(genes_and_cn_threshs,
these_sample_ids = NULL,
these_samples_metadata = NULL,
this_seq_type = "genome",
only_cnv = "none",
projection = "grch37",
from_flatfile = TRUE,
include_hotspots = TRUE,
review_hotspots = TRUE,
subset_from_merge = FALSE,
augmented = TRUE,
min_read_support_ssm = 3){

# check parameters
stopifnot('`genes_and_cn_threshs` argument is missing.' = !missing(genes_and_cn_threshs))

stopifnot('`genes_and_cn_threshs` argument must be a data frame with columns "gene_id" (characters) and "cn_thresh" (integers).' = {
k = class(genes_and_cn_threshs) == "data.frame" &
all( c("gene_id", "cn_thresh") %in% names(genes_and_cn_threshs) )
if(k){
is.character(genes_and_cn_threshs$gene_id) &
is.numeric(genes_and_cn_threshs$cn_thresh) &
identical(genes_and_cn_threshs$cn_thresh, trunc(genes_and_cn_threshs$cn_thresh))
}
})

stopifnot('`projection` argument must be "grch37" or "hg38."' = projection %in% c("grch37", "hg38"))

stopifnot('`this_seq_type` argument must be "genome" or "capture."' = this_seq_type %in% c("genome", "capture"))

stopifnot('`only_cnv` argument must be "none", "all", or a subset of `genes_and_cn_threshs$gene_id`' = {
only_cnv == "none" |
only_cnv == "all" |
all(only_cnv %in% genes_and_cn_threshs$gene_id)
})

# get metadata with the dedicated helper function
these_samples_metadata = id_ease(
these_samples_metadata = these_samples_metadata,
these_sample_ids = these_sample_ids,
this_seq_type = this_seq_type,
verbose = FALSE
)

# get gene regions (don't use GAMBLR.utils::gene_to_region due to dependency issues)
if(projection == "grch37"){
gene_coordinates = GAMBLR.data::grch37_gene_coordinates
# chr_select = paste0(c(c(1:22),"X","Y"))
}else{
gene_coordinates = GAMBLR.data::hg38_gene_coordinates
# chr_select = paste0("chr", c(c(1:22),"X","Y"))
}
gene_coordinates = filter(gene_coordinates, gene_name %in% genes_and_cn_threshs$gene_id)
my_regions = with(gene_coordinates, paste0(chromosome, ":", start, "-", end)) %>%
setNames(gene_coordinates$gene_name) %>%
.[genes_and_cn_threshs$gene_id]

if(length(my_regions) < nrow(genes_and_cn_threshs)){
genes_and_cn_threshs = dplyr::filter(genes_and_cn_threshs, gene_id %in% names(my_regions))
}

### cnv
thresh_2 = genes_and_cn_threshs$cn_thresh == 2
genes_and_cn_threshs_non_neutral = genes_and_cn_threshs[!thresh_2,]
check_cnv = nrow(genes_and_cn_threshs_non_neutral) > 0

if(check_cnv){
# get cn states
cn_matrix = get_cn_states(
regions_list = my_regions[genes_and_cn_threshs_non_neutral$gene_id],
region_names = genes_and_cn_threshs_non_neutral$gene_id,
these_samples_metadata = these_samples_metadata,
this_seq_type = this_seq_type
)
cn_matrix = cn_matrix[these_samples_metadata$sample_id,, drop=FALSE]

# get cnv status
cnv_status = mapply(function(cnstate, thresh){
if(thresh < 2){
cnstate <= thresh
}else if(thresh == 2){
cnstate != thresh
}else if(thresh > 2){
cnstate >= thresh
}
}, cn_matrix, genes_and_cn_threshs_non_neutral$cn_thresh, USE.NAMES = TRUE, SIMPLIFY = FALSE) %>%
as.data.frame %>%
{. * 1}
rownames(cnv_status) = rownames(cn_matrix)

# if only CNV statuses are desired, output them
if(only_cnv == "all"){
return(cnv_status)
}

# add cnv status as zero to genes whose cn threshold is 2
if(any(thresh_2)){
cnv_status = genes_and_cn_threshs$gene_id[thresh_2] %>%
{ matrix(0, nrow = nrow(cnv_status), ncol = length(.), dimnames = list(NULL, .)) } %>%
cbind(cnv_status)
}
cnv_status = dplyr::select(cnv_status, genes_and_cn_threshs$gene_id)
}

### ssm
# genes to get ssm status
if(only_cnv == "nome"){
genes_to_check_ssm = genes_and_cn_threshs$gene_id
}else{
genes_to_check_ssm = genes_and_cn_threshs$gene_id [ !(genes_and_cn_threshs$gene_id %in% only_cnv) ]
}

# get maf data
my_maf = get_ssm_by_samples(
these_samples_metadata = these_samples_metadata,
projection = projection,
this_seq_type = this_seq_type,
min_read_support = min_read_support_ssm,
these_genes = genes_to_check_ssm,
subset_from_merge = subset_from_merge,
augmented = augmented
)

# get ssm status
if(projection == "grch37"){
genome_build = "hg19"
}
ssm_status = get_coding_ssm_status(
gene_symbols = genes_to_check_ssm,
these_samples_metadata = these_samples_metadata,
maf_data = my_maf,
projection = projection,
genome_build = genome_build,
min_read_support = min_read_support_ssm,
from_flatfile = from_flatfile,
include_hotspots = include_hotspots,
include_silent = FALSE,
augmented = augmented
) %>% column_to_rownames("sample_id")

# add missing regions to ssm_status as zero statuses
missing_regions = !(genes_and_cn_threshs$gene_id %in% names(ssm_status))
if(any(missing_regions)){
ssm_status = genes_and_cn_threshs$gene_id[missing_regions] %>%
{ matrix(0, nrow(ssm_status), length(.), dimnames = list(NULL, .)) } %>%
cbind(ssm_status)
}
ssm_status = dplyr::select(ssm_status, genes_and_cn_threshs$gene_id)
ssm_status = ssm_status[these_samples_metadata$sample_id,, drop=FALSE]

# combine cnv and ssm status
if(check_cnv){
(cnv_status | ssm_status) * 1
}else{
ssm_status
}
}
Loading
Loading