Skip to content

Commit

Permalink
Fixphasing (#47)
Browse files Browse the repository at this point in the history
* feature: use all cells for phasing when cluster for phasing has a diploid region

* feature: enable the diploid phasing to only happen in specific chromosomes

* Increment version number to 0.9.0
  • Loading branch information
marcjwilliams1 authored May 31, 2023
1 parent 3ec4d49 commit e05a44b
Show file tree
Hide file tree
Showing 8 changed files with 182 additions and 11 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: signals
Title: Single Cell Genomes with Allele Specificity
Version: 0.8.0
Version: 0.9.0
Author@R: c(person("Marc", "Williams", email = "[email protected]",
role = c("aut", "cre")),
person("Tyler", "Funnell",
Expand Down
4 changes: 4 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
# signals 0.9.0

* Fix phasing issue that happens when the cluster identified to phase relative to has a diploid region

# signals 0.8.0

* Add option to mask bins during inference
Expand Down
56 changes: 51 additions & 5 deletions R/callHSCN.R
Original file line number Diff line number Diff line change
Expand Up @@ -483,9 +483,20 @@ prop_to_list <- function(haplotypes, prop, phasebyarm = FALSE) {
}

#' @export
phase_haplotypes_bychr <- function(haplotypes, chrlist, phasebyarm = FALSE) {
phase_haplotypes_bychr <- function(ascn,
haplotypes,
chrlist,
phasebyarm = FALSE,
global_phasing_for_balanced = TRUE,
chrs_for_global_phasing = NULL) {

haplotypes <- as.data.table(haplotypes)

#use all chromosomes if null
if (is.null(chrs_for_global_phasing)){
chrs_for_global_phasing <- unique(haplotypes$chr)
}

if (phasebyarm) {
haplotypes$chrarm <- paste0(haplotypes$chr, coord_to_arm(haplotypes$chr, haplotypes$start))
phased_haplotypes <- data.table()
Expand All @@ -496,6 +507,34 @@ phase_haplotypes_bychr <- function(haplotypes, chrlist, phasebyarm = FALSE) {
.[, c("allele1", "allele0") := NULL]
phased_haplotypes <- rbind(phased_haplotypes, phased_haplotypes_temp)
}
} else if (global_phasing_for_balanced == TRUE) {
phased_haplotypes <- data.table()
for (i in names(chrlist)) {
consensus_cn <- ascn %>%
dplyr::filter(cell_id %in% chrlist[[i]]) %>%
consensuscopynumber(.) %>%
dplyr::filter(state_phase == "Balanced")
if (i %in% chrs_for_global_phasing){
#phase haplotypes in diploid region using cells in cluster
phased_haplotypes_temp1 <- haplotypes[cell_id %in% chrlist[[i]] & chr == i & !(start %in% consensus_cn$start)] %>%
.[, lapply(.SD, sum), by = .(chr, start, end, hap_label), .SDcols = c("allele1", "allele0")] %>%
.[, phase := ifelse(allele0 < allele1, "allele0", "allele1")] %>%
.[, c("allele1", "allele0") := NULL]
#phase haplotypes in diploid region using all cells
phased_haplotypes_temp2 <- haplotypes[chr == i & (start %in% consensus_cn$start)] %>%
.[, lapply(.SD, sum), by = .(chr, start, end, hap_label), .SDcols = c("allele1", "allele0")] %>%
.[, phase := ifelse(allele0 < allele1, "allele0", "allele1")] %>%
.[, c("allele1", "allele0") := NULL]
phased_haplotypes <- rbind(phased_haplotypes, phased_haplotypes_temp1)
phased_haplotypes <- rbind(phased_haplotypes, phased_haplotypes_temp2)
} else{
phased_haplotypes_temp <- haplotypes[cell_id %in% chrlist[[i]] & chr == i] %>%
.[, lapply(.SD, sum), by = .(chr, start, end, hap_label), .SDcols = c("allele1", "allele0")] %>%
.[, phase := ifelse(allele0 < allele1, "allele0", "allele1")] %>%
.[, c("allele1", "allele0") := NULL]
phased_haplotypes <- rbind(phased_haplotypes, phased_haplotypes_temp)
}
}
} else {
phased_haplotypes <- data.table()
for (i in names(chrlist)) {
Expand All @@ -504,7 +543,7 @@ phase_haplotypes_bychr <- function(haplotypes, chrlist, phasebyarm = FALSE) {
.[, phase := ifelse(allele0 < allele1, "allele0", "allele1")] %>%
.[, c("allele1", "allele0") := NULL]
phased_haplotypes <- rbind(phased_haplotypes, phased_haplotypes_temp)
}
}
}

return(phased_haplotypes)
Expand Down Expand Up @@ -593,7 +632,9 @@ filter_haplotypes <- function(haplotypes, fraction){
#' @param firstpassfiltering Filter out cells with large discrepancy after first pass state assignment
#' @param smoothsingletons Remove singleton bins by smoothing over based on states in adjacent bins
#' @param fillmissing For bins with missing counts fill in values based on neighbouring bins
#'
#' @param global_phasing_for_diploid When using cluster_per_chr, use all cells for phasing diploid regions within the cluster
#' @param chrs_for_global_phasing Which chromosomes to phase using all cells for diploid regions, default is NULL which uses all chromosomes
#'
#' @return Haplotype specific copy number object
#'
#' @details The haplotype specific copy number object include the following additional columns
Expand Down Expand Up @@ -648,7 +689,9 @@ callHaplotypeSpecificCN <- function(CNbins,
filterhaplotypes = 0.1,
firstpassfiltering = TRUE,
smoothsingletons = TRUE,
fillmissing = TRUE) {
fillmissing = TRUE,
global_phasing_for_balanced = FALSE,
chrs_for_global_phasing = NULL) {
if (!clustering_method %in% c("copy", "breakpoints")) {
stop("Clustering method must be one of copy or breakpoints")
}
Expand Down Expand Up @@ -790,9 +833,12 @@ callHaplotypeSpecificCN <- function(CNbins,
propdf <- chrlist$propdf
chrlist <- chrlist$chrlist
phased_haplotypes <- phase_haplotypes_bychr(
ascn = ascn,
haplotypes = haplotypes,
chrlist = chrlist,
phasebyarm = phasebyarm
phasebyarm = phasebyarm,
global_phasing_for_balanced = global_phasing_for_balanced,
chrs_for_global_phasing = chrs_for_global_phasing
)
} else {
chrlist <- NULL
Expand Down
2 changes: 1 addition & 1 deletion man/callAlleleSpecificCN.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/callAlleleSpecificCNfromHSCN.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 8 additions & 2 deletions man/callHaplotypeSpecificCN.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion man/getBins.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

115 changes: 115 additions & 0 deletions tests/testthat/test-complexphasing.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@

loherror <- 0.02
sim_data_bb1 <- simulate_data_cohort(
clone_num = c(20),
clonal_events = list(
list("1" = c(2, 0))),
loherror = loherror,
coverage = 20,
rho = 0.02,
likelihood = "betabinomial",
nchr = 0
)

sim_data_bb2 <- simulate_data_cohort(
clone_num = c(20),
clonal_events = list(
list("1" = c(2, 1))),
loherror = loherror,
coverage = 20,
rho = 0.02,
likelihood = "betabinomial",
nchr = 0
)

sim_data_bb3 <- simulate_data_cohort(
clone_num = c(20),
clonal_events = list(
list("1" = c(2, 0))),
loherror = loherror,
coverage = 20,
rho = 0.02,
likelihood = "betabinomial",
nchr = 0
)

sim_data_bb4 <- simulate_data_cohort(
clone_num = c(20),
clonal_events = list(
list("1" = c(2, 1))),
loherror = loherror,
coverage = 20,
rho = 0.02,
likelihood = "betabinomial",
nchr = 0
)

start_switch <- 80e6
end_switch <- 150e6

sim_data_bb2$CNbins$cell_id <- sim_data_bb1$CNbins$cell_id
sim_data_bb2$haplotypes$cell_id <- sim_data_bb1$haplotypes$cell_id
sim_data_bb3$CNbins$cell_id <- sim_data_bb4$CNbins$cell_id
sim_data_bb3$haplotypes$cell_id <- sim_data_bb4$haplotypes$cell_id

cndata <- sim_data_bb1$CNbins %>%
dplyr::filter(start < start_switch | start > end_switch)
cndata <- cndata %>%
dplyr::bind_rows(sim_data_bb2$CNbins %>%
dplyr::filter(start > start_switch & start <= end_switch))
cndata <- cndata %>%
dplyr::bind_rows(sim_data_bb4$CNbins %>%
dplyr::filter(start < start_switch | start > end_switch))
cndata <- cndata %>%
dplyr::bind_rows(sim_data_bb3$CNbins %>%
dplyr::filter(start > start_switch & start <= end_switch))

hapsdata <- sim_data_bb1$haplotypes %>%
dplyr::filter(start < start_switch | start > end_switch)
hapsdata <- hapsdata %>%
dplyr::bind_rows(sim_data_bb2$haplotypes %>%
dplyr::filter(start > start_switch & start <= end_switch))
hapsdata <- hapsdata %>%
dplyr::bind_rows(sim_data_bb4$haplotypes %>%
dplyr::filter(start < start_switch | start > end_switch))
hapsdata <- hapsdata %>%
dplyr::bind_rows(sim_data_bb3$haplotypes %>%
dplyr::filter(start > start_switch & start <= end_switch))

#all the above is a hack to create a copy number structure where there is a small segment that is LOH that is unique to a clone,
#this causes phasing issues because phasing is done relative to the clone with the largest amount of LOH per chromosome.
#to see what this looks like see plotHeatmap

results <- callHaplotypeSpecificCN(cndata %>% dplyr::filter(chr == "1"),
hapsdata %>% dplyr::filter(chr == "1"),
likelihood = "betabinomial")
results_fix <- callHaplotypeSpecificCN(cndata %>% dplyr::filter(chr == "1"),
hapsdata %>% dplyr::filter(chr == "1"),
likelihood = "betabinomial",
global_phasing_for_balanced = TRUE)
#to check that specifying chromosomes works, here I'm specifying the "wrong" chromosome
#so results should match results and not results_fix
results_fix_chr <- callHaplotypeSpecificCN(cndata %>% dplyr::filter(chr == "1"),
hapsdata %>% dplyr::filter(chr == "1"),
likelihood = "betabinomial",
global_phasing_for_balanced = TRUE,
chrs_for_global_phasing = c("2"))

nsegs_per_cell <- create_segments(results$data, field = "state_phase") %>%
dplyr::group_by(cell_id) %>%
dplyr::summarize(n = dplyr::n())

nsegs_per_cell_fix <- create_segments(results_fix$data, field = "state_phase") %>%
dplyr::group_by(cell_id) %>%
dplyr::summarize(n = dplyr::n())

nsegs_per_cell_fix_chr <- create_segments(results_fix_chr$data, field = "state_phase") %>%
dplyr::group_by(cell_id) %>%
dplyr::summarize(n = dplyr::n())

test_that("Test that using all cells for phasing regions that are diploid in phasing clusters improves inference", {
expect_true(all(nsegs_per_cell_fix$n == 3))
expect_gt(nsegs_per_cell$n %>% mean, nsegs_per_cell_fix$n %>% mean)
expect_gt(nsegs_per_cell_fix_chr$n %>% mean, nsegs_per_cell_fix$n %>% mean)
})

0 comments on commit e05a44b

Please sign in to comment.