diff --git a/ChangeLog b/ChangeLog index 4713b94..d308548 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,6 +1,16 @@ +2018-01-19 Li Jianfeng + + * added db_annovar_docm, db_annovar_intogen, + db_annovar_disgenet, db_annovar_cancer_hotspots in db + * rename db_cancer_hotspot to db_cancer_hotspots + * added RSEM, radia in github + * added rMATS, PRADA, IGV, Marina, PARADIGM, Meerkat, + vadir in nongithub + * source parse_version.R in local env + 2018-01-10 Li Jianfeng - * added GIGGLE (genomic search engine) + * added facets, GIGGLE (genomic search engine) in github * added absolute, hapseg, atlas2, beagle, contest in nongithub diff --git a/DESCRIPTION b/DESCRIPTION index f3e8bb3..8e31861 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BioInstaller Title: Lightweight Biology Software Installer -Version: 0.3.2.2 +Version: 0.3.2.3 Authors@R: person("Jianfeng", "Li", email = "lee_jianfeng@sjtu.edu.cn", role = c("aut", "cre")) Description: Can be used to install and download massive bioinformatics analysis softwares and databases, such as NGS reads mapping tools with its required databases.
diff --git a/R/versions.R b/R/versions.R index b0987bb..fac7492 100644 --- a/R/versions.R +++ b/R/versions.R @@ -41,7 +41,7 @@ use.github.response <- function(config) { nongithub2versions <- function(name) { script <- system.file("extdata", "scripts/parse_version.R", package = "BioInstaller") - source(script) + source(script, local = TRUE) text <- sprintf("get.%s.versions()", name) tryCatch(eval(parse(text = text)), error = function(e) { NULL diff --git a/inst/extdata/config/db/db_annovar.toml b/inst/extdata/config/db/db_annovar.toml index e5a0a92..56c957a 100644 --- a/inst/extdata/config/db/db_annovar.toml +++ b/inst/extdata/config/db/db_annovar.toml @@ -675,3 +675,28 @@ source_url = "http://bioinfo.rjh.com.cn/download/annovarR/humandb/tall_somatic_g version_available = "20171206" buildver_available = ["hg19", "hg38"] description = "Collected from recently published papers, 1) Recurrent SPI1 (PU.1) fusions in high-risk pediatric T cell acute lymphoblastic leukemia 2) The genomic landscape of pediatric and young adult T-lineage acute lymphoblastic leukemia 3) Identification of fusion genes and characterization of transcriptome features in T-cell acute lymphoblastic leukemia" + +[db_annovar_docm] +source_url = "http://bioinfo.rjh.com.cn/download/annovarR/humandb/{{buildver}}_{{version}}.txt.gz" +version_available = "docm_3.2" +buildver_available = ["hg19"] +description = "DoCM, the Database of Curated Mutations, is a highly curated database of known, disease-causing mutations that provides easily explorable variant lists with direct links to source citations for easy verification. http://docm.genome.wustl.edu/about" + +[db_annovar_intogen] +source_url = "http://bioinfo.rjh.com.cn/download/annovarR/humandb/{{buildver}}_{{version}}.sqlite.sql.gz" +version_available = "intogen_20180119" +buildver_available = ["hg19"] +description = "Merged intogen table intogen_mutations_catalog [Driver or passenger mutations]. 
https://www.intogen.org/downloads" +install = "#R#sql2sqlite('{{buildver}}_{{version}}.sqlite.sql', sqlite.path = '{{buildver}}_{{version}}.sqlite')#R#" + +[db_annovar_disgenet] +source_url = "http://bioinfo.rjh.com.cn/download/annovarR/humandb/{{version}}.txt.gz" +version_available = ["disgenet_befree_gene_disease", "disgenet_befree_rs_disease", "disgenet_curated_gene_disease", "disgenet_curated_variant_disease", "disgenet_gene_disease", "disgenet_pubannotator_variant_disease", "disgenet_rs_disease"] +buildver_available = ["hg19", "hg38"] +description = "See http://www.disgenet.org/web/DisGeNET/menu/downloads" + +[db_annovar_cancer_hotspots] +source_url = "http://bioinfo.rjh.com.cn/download/annovarR/humandb/{{version}}.txt.gz" +version_available = ["cancer_hotspots_v2"] +buildver_available = ["hg19", "hg38"] +description = "See http://cancerhotspots.org/#/home Hotspot Results V2 sheet1" diff --git a/inst/extdata/config/db/db_main.toml b/inst/extdata/config/db/db_main.toml index 82a8e5c..c0d6c0e 100644 --- a/inst/extdata/config/db/db_main.toml +++ b/inst/extdata/config/db/db_main.toml @@ -320,8 +320,8 @@ version_available = ["RNAedit", "dbSNP"] source_url = "http://bioinfo.rjh.com.cn/download/bioinstaller/docm/docm_{{version}}.txt.gz" version_available = ["3.2", "3_clinvar_export"] -[db_cancer_hotspot] -source_url = "http://bioinfo.rjh.com.cn/download/bioinstaller/cancer_hotspot/cancer_hotspot_{{version}}.txt.gz" +[db_cancer_hotspots] +source_url = "http://bioinfo.rjh.com.cn/download/bioinstaller/cancer_hotspots/cancer_hotspots_{{version}}.txt.gz" version_available = ["v1_sheet1", "v1_sheet2", "v2_sheet1", "v2_sheet2"] [db_intogen] diff --git a/inst/extdata/config/db/db_meta.toml b/inst/extdata/config/db/db_meta.toml index 8d9d8b3..a1c5dea 100644 --- a/inst/extdata/config/db/db_meta.toml +++ b/inst/extdata/config/db/db_meta.toml @@ -269,7 +269,7 @@ description = "DoCM, the Database of Curated Mutations, is a highly curated data publication = "A correspondence 
describing DoCM has been published in Nature Methods: DoCM: a database of curated mutations in cancer. Nature Methods (2016) doi:10.1038/nmeth.4000." tag = ["NGS", "database"] -[db.item.cancer_hotspot] +[db.item.cancer_hotspots] title = "A RESOURCE FOR STATISTICALLY SIGNIFICANT MUTATIONS IN CANCER" description = "This resource is maintained by the Kravis Center for Molecular Oncology at Memorial Sloan Kettering Cancer Center. It provides information about statistically significantly recurrent mutations identified in large scale cancer genomics data." publication = ["Chang et al., Accelerating discovery of functional mutant alleles in cancer. Cancer Discovery, 10.1158/2159-8290.CD-17-0321 (2017)", diff --git a/inst/extdata/config/github/github.toml b/inst/extdata/config/github/github.toml index 49755b8..fd5a1e9 100644 --- a/inst/extdata/config/github/github.toml +++ b/inst/extdata/config/github/github.toml @@ -623,3 +623,13 @@ github_url = "https://github.com/mskcc/facets" [facets.install] linux = "#R#devtools::install('.', build_vignettes = TRUE);devtools::install_github('mskcc/pctGCdata')#R#" mac = "#R#devtools::install('.', build_vignettes = TRUE);devtools::install_github('mskcc/pctGCdata')#R#" + +[resm] +github_url = "https://github.com/deweylab/RSEM" +[resm.install] +linux = "make;make ebseq; make install DESTDIR={{destdir}} prefix=''" +mac = "make;make ebseq; make install DESTDIR={{destdir}} prefix=''" +windows = "make cygwin=true; make ebseq; make install DESTDIR={{destdir}} prefix=''" + +[radia] +github_url = "https://github.com/aradenbaugh/radia/" diff --git a/inst/extdata/config/github/github_meta.toml b/inst/extdata/config/github/github_meta.toml index 854bc20..f5de4e8 100644 --- a/inst/extdata/config/github/github_meta.toml +++ b/inst/extdata/config/github/github_meta.toml @@ -58,3 +58,15 @@ publication = "Liu B, Guan D, Teng M, et al. 
rHAT: fast alignment of noisy long title = "GIGGLE: a search engine for large-scale integrated genome analysis" description = "GIGGLE is a genomics search engine that identifies and ranks the significance of genomic loci shared between query features and thousands of genome interval files. GIGGLE (https:// github.com/ryanlayer/giggle) scales to billions of intervals and is over three orders of magnitude faster than existing methods. Its speed extends the accessibility and utility of resources such as ENCODE , Roadmap Epigenomics, and GTE x by facilitating data integration and hypothesis generation." publication = "Layer, R.M. et al. GIGGLE: a search engine for large-scale integrated genome analysis. Nat Methods (2018)." + +[github.item.resm] +title = "RSEM: accurate quantification of gene and isoform expression from RNA-Seq data" +description = "RSEM is a software package for estimating gene and isoform expression levels from RNA-Seq data. The RSEM package provides an user-friendly interface, supports threads for parallel computation of the EM algorithm, single-end and paired-end read data, quality scores, variable-length reads and RSPD estimation. In addition, it provides posterior mean and 95% credibility interval estimates for expression levels. For visualization, It can generate BAM and Wiggle files in both transcript-coordinate and genomic-coordinate. Genomic-coordinate files can be visualized by both UCSC Genome browser and Broad Institute's Integrative Genomics Viewer (IGV). Transcript-coordinate files can be visualized by IGV. RSEM also has its own scripts to generate transcript read depth plots in pdf format. The unique feature of RSEM is, the read depth plots can be stacked, with read depth contributed to unique reads shown in black and contributed to multi-reads shown in red. In addition, models learned from data can also be visualized. Last but not least, RSEM contains a simulator." +publication = "Li B, Dewey C N. 
RSEM: accurate transcript quantification from RNA-Seq data with or without a reference genome[J]. BMC bioinformatics, 2011, 12(1): 323." + +[github.item.radia] +title = "RADIA: RNA and DNA Integrated Analysis for Somatic Mutation Detection" +description = """RADIA identifies RNA and DNA variants in BAM files. RADIA is typically run on 3 BAM files consisting of the Normal DNA, Tumor DNA and Tumor RNA. If no RNA is available from the tumor, then it is run on the normal/tumor pairs. For the normal DNA, RADIA outputs any differences compared to the reference which could be potential Germline mutations. For the tumor DNA, RADIA outputs any differences compared to the reference and the normal DNA which could be potential Somatic mutations. RADIA combines the tumor DNA and tumor RNA to augment the somatic mutation calls. It also uses the tumor RNA to identify potential RNA editing events. + +The DNA Only Method (DOM) uses just the tumor/normal pairs of DNA (ignoring the RNA), while the Triple BAM Method (TBM) uses all three datasets from the same patient to detect somatic mutations. The mutations from the TBM are further categorized into 2 sub-groups: RNA Confirmation and RNA Rescue calls. RNA Confirmation calls are those that are made by both the DOM and the TBM due to the strong read support in both the DNA and RNA. RNA Rescue calls are those that had very little DNA support, hence not called by the DOM, but strong RNA support, and thus called by the TBM. RNA Rescue calls are typically missed by traditional methods that only interrogate the DNA.""" +publication = "Radenbaugh AJ, Ma S, Ewing A, Stuart JM, Collisson EA, Zhu J, Haussler D. (2014) RADIA: RNA and DNA Integrated Analysis for Somatic Mutation Detection. PLoS ONE 9(11): e111516. 
doi:10.1371/journal.pone.0111516" diff --git a/inst/extdata/config/nongithub/nongithub.toml b/inst/extdata/config/nongithub/nongithub.toml index b2ec608..25c00bd 100644 --- a/inst/extdata/config/nongithub/nongithub.toml +++ b/inst/extdata/config/nongithub/nongithub.toml @@ -717,3 +717,36 @@ version_available = "1.0.24530" [contest.install] linux = "mkdir -p {{destdir}}/bin; cp {{download.dir}}/*.jar {{destdir}}/bin" mac = "mkdir -p {{destdir}}/bin; cp {{download.dir}}/*.jar {{destdir}}/bin" + +[rmats] +source_url = "https://sourceforge.net/projects/rnaseq-mats/files/MATS/rMATS.{{version}}.tgz" +version_available = ["4.0.1", "3.2.5"] + +[rmats_reffa] +source_url = "http://rmaps.cecsresearch.org/{{version}}/{{version}}index.tgz" +version_available = "STAR" + +[prada] +source_url = "https://sourceforge.net/projects/prada/files/pyPRADA/pyPRADA_{{version}}.tar.gz" +version_available = "1.2" + +[igv] +source_url = "http://data.broadinstitute.org/igv/projects/downloads/2.4/IGV_{{version}}.zip" +version_available = "2.4.6" + +[marina] +source_url = "http://bioinfo.rjh.com.cn/download/bioinstaller/marina_matlab/marina_matlab-{{version}}.tar.gz" +version_available = "4" + +[paradigm] +source_url = "http://paradigm.five3genomics.com/five3_paradigm_webapi.py" +version_available = "latest" + +[meerkat] +source_url = "http://bioinfo.rjh.com.cn/download/bioinstaller/meerkat/meerkat.{{version}}.tar.gz" +version_available = "0.189" + +[vadir] +source_url = ["http://bioinfo.rjh.com.cn/download/bioinstaller/vadir/VaDiR.tar.gz", + "ftp://penguin.genomics.cn/pub/10.5524/100001_101000/100360/VaDiR.tar.gz"] +version_available = "latest" diff --git a/inst/extdata/config/nongithub/nongithub_meta.toml b/inst/extdata/config/nongithub/nongithub_meta.toml index b137322..8cfb660 100644 --- a/inst/extdata/config/nongithub/nongithub_meta.toml +++ b/inst/extdata/config/nongithub/nongithub_meta.toml @@ -62,3 +62,63 @@ publication = ["S R Browning and B L Browning (2007) Rapid and accurate haplotyp 
title = "ContEst is a tool (and method) for estimating the amount of cross-sample contamination in next generation sequencing data. Using a Bayesian framework, contamination levels are estimated from array based genotypes and sequencing reads." description = "Here, we present ContEst, a tool for estimating the level of cross-individual contamination in next-generation sequencing data. We demonstrate the accuracy of ContEst across a range of contamination levels, sources and read depths using sequencing data mixed in silico at known concentrations. We applied our tool to published cancer sequencing datasets and report their estimated contamination levels." publication = "Cibulskis K, Mckenna A, Fennell T, et al. ContEst: estimating cross-contamination of human samples in next-generation sequencing data[J]. Bioinformatics, 2011, 27(18):2601-2602." + +[nongithub.item.rmats] +title = "Multivariate Analysis of Transcript Splicing (MATS)" +description = "MATS is a computational tool to detect differential alternative splicing events from RNA-Seq data. The statistical model of MATS calculates the P-value and false discovery rate that the difference in the isoform ratio of a gene between two conditions exceeds a given user-defined threshold. From the RNA-Seq data, MATS can automatically detect and analyze alternative splicing events corresponding to all major types of alternative splicing patterns. MATS handles replicate RNA-Seq data from both paired and unpaired study design." +publication = ["Shen S., Park JW., Lu ZX., Lin L., Henry MD., Wu YN., Zhou Q., Xing Y. rMATS: Robust and Flexible Detection of Differential Alternative Splicing from Replicate RNA-Seq Data. PNAS, 111(51):E5593-601. doi: 10.1073/pnas.1419161111", + "Park JW., Tokheim C., Shen S., Xing Y. Identifying differential alternative splicing events from RNA sequencing data using RNASeq-MATS. 
Methods in Molecular Biology: Deep Sequencing Data Analysis, 2013;1038:171-179 doi: 10.1007/978-1-62703-514-9_10", + "Shen S., Park JW., Huang J., Dittmar KA., Lu ZX., Zhou Q., Carstens RP., Xing Y. MATS: A Bayesian Framework for Flexible Detection of Differential Alternative Splicing from RNA-Seq Data. Nucleic Acids Research, 2012;40(8):e61 doi: 10.1093/nar/gkr1291"] + +[nongithub.item.prada] +title = "PRADA : Pipeline for RNA-Sequencing Data Analysis" +description = """Massively parallel sequencing of cDNA reverse transcribed from RNA (RNASeq) provides an accurate estimate of the quantity and composition of mRNAs. To characterize the transcriptome through the analysis of RNA-seq data, we developed PRADA. PRADA focuses on the processing and analysis of gene expression estimates, supervised and unsupervised gene fusion identification, and supervised intragenic deletion identification. +PRADA currently supports 7 modules to process and identify abnormalities from RNAseq data: +preprocess: Generates aligned and recalibrated BAM files. +expression: Generates gene expression (RPKM) and quality metrics. +fusion: Identifies candidate gene fusions. +guess-ft: Supervised search for fusion transcripts. +guess-if: Supervised search for intragenic fusions. +homology: Calculates homology between given two genes. +frame: Predicts functional consequence of fusion transcript""" +publication = "PRADA: pipeline for RNA sequencing data analysis[J]. Bioinformatics, 2014, 30(15): 2224-2226. https://doi.org/10.1093/bioinformatics/btu169" + +[nongithub.item.igv] +title = "The Integrative Genomics Viewer (IGV)" +description = "The Integrative Genomics Viewer (IGV) is a high-performance visualization tool for interactive exploration of large, integrated genomic datasets. It supports a wide variety of data types, including array-based and next-generation sequence data, and genomic annotations." +publication = ["Integrative Genomics Viewer. 
Nature Biotechnology 29, 24–26 (2011)", + "Integrative Genomics Viewer (IGV): high-performance genomics data visualization and exploration. Briefings in Bioinformatics 14, 178-192 (2013)."] + + +[nongithub.item.marina] +title = "Master Regulator Inference Algorithm" +description = "MARINA (Master Regulator Inference Algorithm): the MAster Regulator INference algorithm (MARINa) is designed to infer transcription factors (TFs) controlling the transition between the two phenotypes, A and B, and the maintenance of the latter phenotype. Expression at the mRNA level is often a poor predictor of a TF's regulatory activity and an even worse predictor of its biological relevance in regulating phenotype-specific programs. To obviate this problem, MARINa infers TF activity from the global transcriptional activation of its regulon (i.e. its activated and repressed targets) and its biological relevance by TF-regulon overlap with phenotype-specific programs." +publication = "Lefebvre C, Rajbhandari P, Alvarez MJ, Bandaru P, Lim WK, Sato M, Wang K, Sumazin P, Kustagi M, Bisikirska BC, Basso K, Beltrao P, Krogan N, Gautier J, Dalla-Favera R, Califano A. A human B-cell interactome identifies MYB and FOXM1 as master regulators of proliferation in germinal centers. Mol Syst Biol. 2010 Jun 8;6:377." + +[nongithub.item.paradigm] +title = "PAthway Representation and Analysis by Direct Inference on Graphical Models" +description = "High-dimensional ‘-omics’ profiling provides a detailed molecular view of individual cancers; however, understanding the mechanisms by which tumors evade cellular defenses requires deep knowledge of the underlying cellular pathways within each cancer sample. We extended the PARADIGM algorithm (Vaske et al., 2010, Bioinformatics, 26, i237–i245), a pathway analysis method for combining multiple ‘-omics’ data types, to learn the strength and direction of 9139 gene and protein interactions curated from the literature.
Using genomic and mRNA expression data from 1936 samples in The Cancer Genome Atlas (TCGA) cohort, we learned interactions that provided support for and relative strength of 7138 (78%) of the curated links. Gene set enrichment found that genes involved in the strongest interactions were significantly enriched for transcriptional regulation, apoptosis, cell cycle regulation and response to tumor cells. Within the TCGA breast cancer cohort, we assessed different interaction strengths between breast cancer subtypes, and found interactions associated with the MYC pathway and the ER alpha network to be among the most differential between basal and luminal A subtypes. PARADIGM with the Naive Bayesian assumption produced gene activity predictions that, when clustered, found groups of patients with better separation in survival than both the original version of PARADIGM and a version without the assumption. We found that this Naive Bayes assumption was valid for the vast majority of co-regulators, indicating that most co-regulators act independently on their shared target." +publication = "Sedgewick A J, Benz S C, Rabizadeh S, et al. Learning subgroup-specific regulatory interactions and regulator independence with PARADIGM[J]. Bioinformatics, 2013, 29(13): i62-i70. https://doi.org/10.1093/bioinformatics/btt229" + +[nongithub.item.meerkat] +title = "http://dx.doi.org/10.1016/j.cell.2013.04.010" +description = "Identification of somatic rearrangements in cancer genomes has accelerated through analysis of high-throughput sequencing data. However, characterization of complex structural alterations and their underlying mechanisms remains inadequate. Here, applying an algorithm to predict structural variations from short reads, we report a comprehensive catalog of somatic structural variations and the mechanisms generating them, using high-coverage whole-genome sequencing data from 140 patients across ten tumor types. 
We characterize the relative contributions of different types of rearrangements and their mutational mechanisms, find that ∼20% of the somatic deletions are complex deletions formed by replication errors, and describe the differences between the mutational mechanisms in somatic and germline alterations. Importantly, we provide detailed reconstructions of the events responsible for loss of CDKN2A/B and gain of EGFR in glioblastoma, revealing that these alterations can result from multiple mechanisms even in a single genome and that both DNA double-strand breaks and replication errors drive somatic rearrangements." +publication = "Yang L, Luquette L J, Gehlenborg N, et al. Diverse Mechanisms of Somatic Structural Variations in Human Cancer Genomes[J]. Cell, 2013, 153(4):919-29." + +[nongithub.item.vadir] +title = "VaDiR: an integrated approach to Variant Detection in RNA" +description = """Advances in next-generation DNA sequencing technologies are now enabling detailed +characterization of sequence variations in cancer genomes. With whole genome sequencing, variations in +coding and non-coding sequences can be discovered. But the cost associated with it is currently limiting its +general use in research. Whole exome sequencing is used to characterize sequence variations in coding regions, +but the cost associated with capture reagents and biases in capture rate limit its full use in research. Additional +limitations include uncertainty in assigning the functional significance of the mutations when these mutations +are observed in the non-coding region or in genes that are not expressed in cancer tissue. +We investigated the feasibility of uncovering mutations from expressed genes using RNA sequencing +datasets with a method called "VaDiR: Variant Detection in RNA" that integrates three variant callers, namely: +SNPiR, RVBoost and MuTect2.
The combination of all three methods, which we called Tier1 variants, +produced the highest precision with true positive mutations from RNA-seq that could be validated at the DNA +level. We also found that the integration of Tier1 variants with those called by MuTect2 and SNPiR produced +the highest recall with acceptable precision. Finally, we observed higher rate of mutation discovery in genes +that are expressed at higher levels.""" +publication = "Neums L, Suenaga S, Beyerlein P, et al. VaDiR: an integrated approach to Variant Detection in RNA[J]. GigaScience, 2017. https://doi.org/10.1093/gigascience/gix122"