Skip to content

Commit

Permalink
**v.0.3.9**
Browse files Browse the repository at this point in the history
* I'm pleased to announce that `assigner` now works in parallel with **Windows**
* bug fix introduce in last commit in `write_gsi_sim` where the file was not
created properly from an internal module.
  • Loading branch information
thierrygosselin committed Nov 28, 2016
1 parent b92c38c commit 1643330
Show file tree
Hide file tree
Showing 16 changed files with 216 additions and 77 deletions.
8 changes: 4 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: assigner
Type: Package
Title: Assignment Analysis with GBS/RADseq Data using R
Version: 0.3.8
Date: 2016-11-24
Version: 0.3.9
Date: 2016-11-28
Encoding: UTF-8
Authors@R: c(
person("Thierry", "Gosselin", email = "[email protected]", role = c("aut", "cre")),
Expand All @@ -12,9 +12,9 @@ Authors@R: c(
Maintainer: Thierry Gosselin <[email protected]>
Description: Set of tools for assignment analysis, tailored for GBS/RADseq data.
Depends:
R (>= 3.0.1),
adegenet
R (>= 3.0.1)
Imports:
adegenet,
data.table,
dplyr (>= 0.5.0),
gdsfmt,
Expand Down
7 changes: 6 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -65,8 +65,12 @@ importFrom(dplyr,tally)
importFrom(dplyr,ungroup)
importFrom(lazyeval,interp)
importFrom(magrittr,"%>%")
importFrom(parallel,clusterExport)
importFrom(parallel,detectCores)
importFrom(parallel,makeCluster)
importFrom(parallel,mclapply)
importFrom(parallel,parLapply)
importFrom(parallel,stopCluster)
importFrom(purrr,discard)
importFrom(purrr,flatten)
importFrom(purrr,flatten_df)
Expand All @@ -82,11 +86,11 @@ importFrom(stackr,detect_biallelic_markers)
importFrom(stackr,detect_genomic_format)
importFrom(stackr,discard_monomorphic_markers)
importFrom(stackr,keep_common_markers)
importFrom(stackr,read_long_tidy_wide)
importFrom(stackr,snp_ld)
importFrom(stackr,stackr_imputations_module)
importFrom(stackr,stackr_maf_module)
importFrom(stackr,tidy_genomic_data)
importFrom(stackr,tidy_wide)
importFrom(stackr,write_genind)
importFrom(stats,as.dist)
importFrom(stats,dist)
Expand Down Expand Up @@ -114,3 +118,4 @@ importFrom(tidyr,unite)
importFrom(utils,combn)
importFrom(utils,count.fields)
importFrom(utils,download.file)
importFrom(utils,sessionInfo)
7 changes: 7 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,10 @@
# assigner v.0.3.9
* I'm pleased to announce that `assigner` now works in parallel with **Windows**
* bug fix introduce in last commit in `write_gsi_sim` where the file was not
created properly from an internal module.



# assigner v.0.3.8
* `assigner::fst_WC84` can now use [SNPRelate](https://github.com/zhengxwen/SNPRelate)
to compute Fst. The confidence intervals are not implemented, yet.
Expand Down
6 changes: 3 additions & 3 deletions R/assignment_mixture.R
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@

#' @export
#' @rdname assignment_mixture
#' @importFrom parallel mclapply detectCores
#' @importFrom parallel detectCores
#' @importFrom stringi stri_join stri_sub stri_replace_all_fixed stri_detect_fixed stri_replace_na
#' @importFrom adegenet optim.a.score predict.dapc dapc indNames
#' @importFrom dplyr select distinct n_distinct group_by ungroup rename arrange tally filter if_else mutate summarise left_join inner_join right_join anti_join semi_join full_join summarise_each_ funs sample_n sample_frac mutate_each summarise_each_ intersect
Expand Down Expand Up @@ -1048,7 +1048,7 @@ assignment_mixture <- function(
} # End of iterations for both with and without imputations

assignment.res <- NULL
assignment.res <- parallel::mclapply(
assignment.res <- .assigner_parallel(
X = marker.random.list,
FUN = assignment_random,
mc.preschedule = FALSE,
Expand Down Expand Up @@ -1293,7 +1293,7 @@ assignment_mixture <- function(

# using mclapply
assignment.res <- list()
assignment.res <- parallel::mclapply(
assignment.res <- .assigner_parallel(
X = marker.number,
FUN = assignment_marker_loop,
mc.preschedule = FALSE,
Expand Down
93 changes: 51 additions & 42 deletions R/assignment_ngs.R
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@

#' @export
#' @rdname assignment_ngs
#' @importFrom parallel mclapply detectCores
#' @importFrom parallel detectCores
#' @importFrom stringi stri_join stri_sub stri_replace_all_fixed stri_detect_fixed stri_replace_na
#' @importFrom dplyr select distinct n_distinct group_by ungroup rename arrange tally filter if_else mutate summarise left_join inner_join right_join anti_join semi_join full_join summarise_each_ funs sample_n sample_frac mutate_each summarise_each_
#' @importFrom stackr tidy_genomic_data change_pop_names stackr_imputations_module write_genind snp_ld keep_common_markers stackr_maf_module detect_genomic_format
Expand Down Expand Up @@ -481,7 +481,7 @@ assignment_ngs <- function(
# need to remove space in POP_ID name to work in gsi_sim
input$POP_ID <- stringi::stri_replace_all_fixed(input$POP_ID, pattern = " ", replacement = "_", vectorize_all = FALSE)

input <- stackr::change_pop_names(data = input)
input <- stackr::change_pop_names(data = input, pop.levels = pop.levels, pop.labels = pop.labels)

# create a strata.df
strata.df <- input %>%
Expand Down Expand Up @@ -538,7 +538,7 @@ assignment_ngs <- function(
message("Subsampling: not selected")
} else {
message("Subsampling: selected")
subsampling.individuals <- bind_rows(subsample.list)
subsampling.individuals <- dplyr::bind_rows(subsample.list)
readr::write_tsv(
x = subsampling.individuals,
path = paste0(directory, "subsampling_individuals.tsv"),
Expand All @@ -549,8 +549,7 @@ assignment_ngs <- function(
} # End subsampling

# unused objects
subsampling.individuals <- NULL
ind.pop.df <- NULL
subsampling.individuals <- ind.pop.df <- NULL

# assignment analysis --------------------------------------------------------
# Function:
Expand All @@ -573,7 +572,7 @@ assignment_ngs <- function(

# Keep only the subsample
input <- input %>%
semi_join(subsampling.individuals, by = c("POP_ID", "INDIVIDUALS"))
dplyr::semi_join(subsampling.individuals, by = c("POP_ID", "INDIVIDUALS"))

# unused object
subsampling.individuals <- data <- NULL
Expand Down Expand Up @@ -661,14 +660,21 @@ assignment_ngs <- function(
# Functions ----------------------------------------------------------------
# Assignment with gsi_sim
assignment_gsi_sim <- function(
data, select.markers,
markers.names, missing.data, i, m, holdout, filename, ...) {
data = NULL,
select.markers = NULL,
markers.names = NULL,
missing.data = NULL,
i = NULL,
m = NULL,
holdout = NULL,
filename = NULL,
...
) {
# data <- input #test
# missing.data <- "no.imputation" #test

data.select <- suppressWarnings(
data %>%
dplyr::semi_join(select.markers, by = "MARKERS") %>%
dplyr::semi_join(data, select.markers, by = "MARKERS") %>%
dplyr::arrange(POP_ID, INDIVIDUALS, MARKERS)
)

Expand All @@ -687,7 +693,7 @@ assignment_ngs <- function(
system(paste(gsi_sim_binary(), "-b", input.gsi, "--self-assign > ", output.gsi))

# Option remove the input file from directory to save space
if (keep.gsi.files == FALSE) file.remove(input.gsi)
if (!keep.gsi.files) file.remove(input.gsi)

# Get Assignment results -------------------------------------------------
# Keep track of the holdout individual
Expand All @@ -704,16 +710,15 @@ assignment_ngs <- function(
n.locus <- m

assignment <- suppressWarnings(
readr::read_delim(output.gsi, col_names = "ID", delim = "\t") %>%
suppressMessages(readr::read_delim(output.gsi, col_names = "ID", delim = "\t")) %>%
tidyr::separate(ID, c("KEEPER", "ASSIGN"), sep = ":/", extra = "warn") %>%
dplyr::filter(KEEPER == "SELF_ASSIGN_A_LA_GC_CSV") %>%
tidyr::separate(ASSIGN, c("INDIVIDUALS", "ASSIGN"), sep = ";", extra = "merge") %>%
tidyr::separate(ASSIGN, c("INFERRED", "OTHERS"), sep = ";", convert = TRUE, numerals = "no.loss", extra = "merge") %>%
tidyr::separate(OTHERS, c("SCORE", "OTHERS"), sep = ";;", convert = TRUE, numerals = "no.loss", extra = "merge") %>%
tidyr::separate(OTHERS, c("SECOND_BEST_POP", "OTHERS"), sep = ";", convert = TRUE, numerals = "no.loss", extra = "merge") %>%
tidyr::separate(OTHERS, c("SECOND_BEST_SCORE", "OTHERS"), sep = ";;", convert = TRUE, numerals = "no.loss")
)

)
assignment <- suppressWarnings(
assignment %>%
dplyr::mutate(INDIVIDUALS = as.character(INDIVIDUALS)) %>%
Expand Down Expand Up @@ -750,7 +755,7 @@ assignment_ngs <- function(
}

# Option remove the output file from directory to save space
if (keep.gsi.files == FALSE) file.remove(output.gsi)
if (!keep.gsi.files) file.remove(output.gsi)

# saving preliminary results
if (sampling.method == "ranked") {
Expand All @@ -772,9 +777,19 @@ assignment_ngs <- function(

# Assignment with adegenet
assignment_adegenet <- function(
data, select.markers,
adegenet.dapc.opt, adegenet.n.rep, adegenet.training,
parallel.core, markers.names, missing.data, i, m, holdout, ...) {
data = NULL,
select.markers = NULL,
adegenet.dapc.opt = "optim.a.score",
adegenet.n.rep = 30,
adegenet.training = 0.9,
parallel.core = parallel.core,
markers.names = NULL,
missing.data = NULL,
i = NULL,
m = NULL,
holdout = NULL,
...
) {
# data <- genind.object #test
# missing.data <- "no.imputation" #test
data.select <- data[loc = select.markers$MARKERS]
Expand All @@ -790,7 +805,7 @@ assignment_ngs <- function(
dapc.best.optim.a.score <- adegenet::optim.a.score(
adegenet::dapc(data.select,
n.da = length(levels(pop.data)),
n.pca = round((length(indNames(data.select))/3) - 1, 0)),
n.pca = round((length(adegenet::indNames(data.select))/3) - 1, 0)),
pop = pop.data,
plot = FALSE
)$best
Expand Down Expand Up @@ -827,18 +842,18 @@ assignment_ngs <- function(
if (sampling.method == "ranked") {

# Alpha-Score DAPC training data
training.data <- data.select[!indNames(data.select) %in% holdout$INDIVIDUALS] # training dataset
training.data <- data.select[!adegenet::indNames(data.select) %in% holdout$INDIVIDUALS] # training dataset
pop.training <- training.data@pop
pop.training <- droplevels(pop.training)

dapc.best.optim.a.score <- adegenet::optim.a.score(dapc(training.data, n.da = length(levels(pop.training)), n.pca = round(((length(indNames(training.data))/3) - 1), 0)), pop = pop.training, plot = FALSE)$best
dapc.best.optim.a.score <- adegenet::optim.a.score(dapc(training.data, n.da = length(levels(pop.training)), n.pca = round(((length(adegenet::indNames(training.data))/3) - 1), 0)), pop = pop.training, plot = FALSE)$best
message(stringi::stri_join("a-score optimisation for iteration:", i, sep = " "))

dapc.training <- adegenet::dapc(training.data, n.da = length(levels(pop.training)), n.pca = dapc.best.optim.a.score, pop = pop.training)
message(stringi::stri_join("DAPC of training data set for iteration:", i, sep = " "))

# DAPC holdout individuals
holdout.data <- data.select[indNames(data.select) %in% holdout$INDIVIDUALS] # holdout dataset
holdout.data <- data.select[adegenet::indNames(data.select) %in% holdout$INDIVIDUALS] # holdout dataset
pop.holdout <- holdout.data@pop
pop.holdout <- droplevels(pop.holdout)
assignment.levels <- levels(pop.holdout) # for figure
Expand Down Expand Up @@ -869,7 +884,7 @@ assignment_ngs <- function(
dplyr::select(POP_ID, ASSIGNMENT_PERC)
}
if (sampling.method == "ranked") {
assignment <- data.frame(INDIVIDUALS = indNames(holdout.data), POP_ID = pop.holdout, ASSIGN = dapc.predict.holdout$assign, dapc.predict.holdout$posterior) %>%
assignment <- data.frame(INDIVIDUALS = adegenet::indNames(holdout.data), POP_ID = pop.holdout, ASSIGN = dapc.predict.holdout$assign, dapc.predict.holdout$posterior) %>%
dplyr::rename(CURRENT = POP_ID, INFERRED = ASSIGN) %>%
dplyr::mutate(
CURRENT = factor(CURRENT, levels = rev.assignment.levels, ordered = TRUE),
Expand Down Expand Up @@ -951,12 +966,11 @@ assignment_ngs <- function(

message("Starting parallel computations for the assignment analysis: random markers")
message("For progress: monitor activity in the folder...")
mrl <- NULL
holdout <- NULL
mrl <- holdout <- NULL
assignment.random <- list()
assignment_random <- function(marker.random.list, ...) {
mrl <- data.frame(marker.random.list) # marker random list
# mrl <- marker.random.list[1] # test
mrl <- data.frame(marker.random.list) # marker random list
# mrl <- marker.random.list[[1]] # test
i <- as.numeric(unique(mrl$ITERATIONS)) # iteration
m <- as.numeric(unique(mrl$MARKER_NUMBER)) # number of marker selected

Expand Down Expand Up @@ -1059,8 +1073,7 @@ assignment_ngs <- function(
}

# unused objects
select.markers <- NULL
markers.names <- NULL
select.markers <- markers.names <- NULL
} # End with imputations

# Compile assignment results each marker number for the iteration
Expand All @@ -1075,7 +1088,9 @@ assignment_ngs <- function(
} # End of iterations for both with and without imputations

assignment.res <- NULL
assignment.res <- parallel::mclapply(

# assignment.res <- parallel::mclapply(
assignment.res <- .assigner_parallel(
X = marker.random.list,
FUN = assignment_random,
mc.preschedule = TRUE,
Expand Down Expand Up @@ -1322,7 +1337,7 @@ assignment_ngs <- function(
} else {
set.seed(random.seed)
}


# Create x (iterations) list of y (thl) proportion of individuals per pop.
if (stringi::stri_detect_fixed(thl, ".") & thl < 1) {
Expand Down Expand Up @@ -1461,9 +1476,6 @@ assignment_ngs <- function(
assignment_marker_loop <- function(m, ...) {
message("Marker number: ", m)
# m <- 100 # test
# m <- 150 # test
# m <- 200 # test
# m <- 400 # test
m <- as.numeric(m)
RANKING <- NULL
select.markers <- dplyr::filter(.data = fst.ranked, RANKING <= m) %>%
Expand Down Expand Up @@ -1564,16 +1576,13 @@ assignment_ngs <- function(
}

# unused objects
select.markers <- NULL
markers.names <- NULL
RANKING <- NULL
select.markers <- markers.names <- RANKING <- NULL
} # End with imputations

#compile assignment results each marker number for the iteration
if (is.null(imputation.method)) {# with imputations
assignment <- assignment.no.imp
fst.ranked.imp <- NULL
input.imp <- NULL
fst.ranked.imp <- input.imp <- NULL
} else {
assignment <- dplyr::bind_rows(assignment.no.imp, assignment.imp)
}
Expand Down Expand Up @@ -1619,7 +1628,8 @@ assignment_ngs <- function(

# using mclapply
assignment.res <- list()
assignment.res <- parallel::mclapply(
# assignment.res <- parallel::mclapply(
assignment.res <- .assigner_parallel(
X = iterations.list,
FUN = assignment_ranking,
mc.preschedule = TRUE,
Expand Down Expand Up @@ -1866,8 +1876,7 @@ assignment_ngs <- function(

# Summary of the subsampling iterations
if (iteration.subsample > 1) {
res.pop <- res %>%
dplyr::filter(CURRENT != "OVERALL") %>%
res.pop <- dplyr::filter(.data = res, CURRENT != "OVERALL") %>%
dplyr::group_by(CURRENT, MARKER_NUMBER, MISSING_DATA, METHOD) %>%
dplyr::rename(ASSIGNMENT_PERC = MEAN) %>%
dplyr::summarise(
Expand Down
Loading

0 comments on commit 1643330

Please sign in to comment.