From 14e1e7e6a242884fdfb8aa05009c427dceb3415c Mon Sep 17 00:00:00 2001 From: Sebastian Hollizeck Date: Tue, 15 Dec 2020 09:37:00 +1100 Subject: [PATCH] cleanup of spaghetti code through inheritance each cram version is now a subclass of the bam version. this allows changes in one place to affect both. --- .../variantcalling/multisample/__init__.py | 11 +- .../multisample/freebayes/__init__.py | 2 + .../freebayessomaticworkflow.py | 64 ++++-- .../freebayessomaticworkflow_cram.py | 30 +++ .../freebayessomaticworkflow_cram.py | 195 ------------------ .../multisample/mutect2/__init__.py | 2 + .../mutect2jointsomaticworkflow.py | 94 ++++++--- .../mutect2jointsomaticworkflow_cram.py | 37 ++++ .../mutect2jointsomaticworkflow_cram.py | 146 ------------- .../multisample/steps/__init__.py | 1 - .../steps/strelka2passanalysisstep1_cram.py | 105 ---------- .../steps/strelka2passanalysisstep2_cram.py | 96 --------- .../multisample/strelka2/__init__.py | 2 + .../multisample/strelka2/steps/__init__.py | 1 + .../steps/strelka2passanalysisstep1.py | 39 ++-- .../steps/strelka2passanalysisstep1_cram.py | 37 ++++ .../steps/strelka2passanalysisstep2.py | 31 ++- .../steps/strelka2passanalysisstep2_cram.py | 29 +++ .../{ => strelka2}/strelka2passworkflow.py | 87 ++++++-- .../strelka2/strelka2passworkflow_cram.py | 37 ++++ .../multisample/strelka2passworkflow_cram.py | 142 ------------- 21 files changed, 419 insertions(+), 769 deletions(-) create mode 100644 janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/freebayes/__init__.py rename janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/{ => freebayes}/freebayessomaticworkflow.py (75%) create mode 100644 janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/freebayes/freebayessomaticworkflow_cram.py delete mode 100644 janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/freebayessomaticworkflow_cram.py create mode 100644 
janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/mutect2/__init__.py rename janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/{ => mutect2}/mutect2jointsomaticworkflow.py (57%) create mode 100644 janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/mutect2/mutect2jointsomaticworkflow_cram.py delete mode 100644 janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/mutect2jointsomaticworkflow_cram.py delete mode 100644 janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/steps/__init__.py delete mode 100644 janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/steps/strelka2passanalysisstep1_cram.py delete mode 100644 janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/steps/strelka2passanalysisstep2_cram.py create mode 100644 janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/__init__.py create mode 100644 janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/steps/__init__.py rename janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/{ => strelka2}/steps/strelka2passanalysisstep1.py (76%) create mode 100644 janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/steps/strelka2passanalysisstep1_cram.py rename janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/{ => strelka2}/steps/strelka2passanalysisstep2.py (78%) create mode 100644 janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/steps/strelka2passanalysisstep2_cram.py rename janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/{ => strelka2}/strelka2passworkflow.py (59%) create mode 100644 janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/strelka2passworkflow_cram.py delete mode 100644 janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2passworkflow_cram.py diff 
--git a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/__init__.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/__init__.py index d296af114..39bd189b2 100644 --- a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/__init__.py +++ b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/__init__.py @@ -1,8 +1,3 @@ -from .freebayessomaticworkflow import FreeBayesSomaticWorkflow -from .freebayessomaticworkflow_cram import FreeBayesSomaticWorkflowCram - -from .mutect2jointsomaticworkflow import Mutect2JointSomaticWorkflow -from .mutect2jointsomaticworkflow_cram import Mutect2JointSomaticWorkflowCram - -from .strelka2passworkflow import Strelka2PassWorkflow -from .strelka2passworkflow_cram import Strelka2PassWorkflowCram +from .freebayes import * +from .mutect2 import * +from .strelka2 import * diff --git a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/freebayes/__init__.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/freebayes/__init__.py new file mode 100644 index 000000000..7cfc1f2cc --- /dev/null +++ b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/freebayes/__init__.py @@ -0,0 +1,2 @@ +from .freebayessomaticworkflow import FreeBayesSomaticWorkflow +from .freebayessomaticworkflow_cram import FreeBayesSomaticWorkflowCram diff --git a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/freebayessomaticworkflow.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/freebayes/freebayessomaticworkflow.py similarity index 75% rename from janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/freebayessomaticworkflow.py rename to janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/freebayes/freebayessomaticworkflow.py index a64871c9f..200fc0274 100644 --- 
a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/freebayessomaticworkflow.py +++ b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/freebayes/freebayessomaticworkflow.py @@ -1,13 +1,13 @@ from datetime import date -from janis_bioinformatics.data_types import BamBai, FastaFai +from janis_bioinformatics.data_types import FastaFai from janis_bioinformatics.tools import BioinformaticsWorkflow from janis_bioinformatics.tools.bcftools import BcfToolsNormLatest as BcfToolsNorm from janis_bioinformatics.tools.dawson import ( CallSomaticFreeBayes_0_1 as CallSomaticFreeBayes, ) from janis_bioinformatics.tools.dawson.createcallregions.base import CreateCallRegions -from janis_bioinformatics.tools.freebayes.versions import FreeBayes_1_3 as FreeBayes + from janis_bioinformatics.tools.htslib import BGZipLatest as BGZip, TabixLatest as Tabix from janis_bioinformatics.tools.vcflib import ( VcfAllelicPrimitivesLatest as VcfAllelicPrimitives, @@ -31,10 +31,10 @@ def tool_provider(self): return "Dawson Labs" def version(self): - return "0.1" + return "0.1.1" def bind_metadata(self): - self.metadata.version = "0.1" + self.metadata.version = "0.1.1" self.metadata.dateCreated = date(2019, 10, 18) self.metadata.dateUpdated = date(2020, 12, 10) @@ -52,23 +52,63 @@ def bind_metadata(self): This allows a joint somatic genotyping of multiple samples of the same individual. """.strip() + # this is a way to get the tool without spagetti code in bam and cram format + def getFreebayesTool(self): + from janis_bioinformatics.tools.freebayes.versions import ( + FreeBayes_1_3 as freebayes, + ) + + return freebayes + + def getFreebayesInputType(self): + from janis_bioinformatics.data_types import BamBai + + return BamBai + def constructor(self): - self.input("bams", Array(BamBai)) + self.input( + "bams", + Array(self.getFreebayesInputType()), + doc="All bams to be analysed. 
Samples can be split over multiple bams as well as multiple samples can be contained in one bam as long as the sample ids are set properly.", + ) - self.input("reference", FastaFai) - self.input("regionSize", int, default=10000000) + self.input( + "reference", + FastaFai, + doc="The reference the bams were aligned to, with a fai index.", + ) + self.input( + "regionSize", + int, + default=10000000, + doc="the size of the regions, to parallelise the analysis over. This needs to be adjusted if there are lots of samples or very high depth sequencing in the analysis.", + ) - self.input("normalSample", String) + self.input( + "normalSample", + String, + doc="The sample id of the normal sample, as it is specified in the bam header.", + ) # this is the coverage per sample that is the max we will analyse. It will automatically # multiplied by the amount of input bams we get - self.input("skipCov", Int(optional=True), default=500) + self.input( + "skipCov", + Int(optional=True), + default=500, + doc="The depth per sample, at which the variant calling process will skip a region. This is used to ignore regions with mapping issues, like the centromeres as well as heterochromatin. 
A good value is 3 times the maximum expected coverage.", + ) # the same is true for min cov - self.input("minCov", Int(optional=True), default=10) + self.input( + "minCov", + Int(optional=True), + default=10, + doc="Minimum coverage over all samples, to still call variants.", + ) - # this should be a conditional (if the callregions are supplied we use them, otherwise we + # this could be a conditional (if the callregions are supplied we use them, otherwise we # create them) self.step( "createCallRegions", @@ -79,7 +119,7 @@ def constructor(self): self.step( "callVariants", - FreeBayes( + self.getFreebayesTool()( bams=self.bams, reference=self.reference, pooledDiscreteFlag=True, diff --git a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/freebayes/freebayessomaticworkflow_cram.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/freebayes/freebayessomaticworkflow_cram.py new file mode 100644 index 000000000..3f1bad2eb --- /dev/null +++ b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/freebayes/freebayessomaticworkflow_cram.py @@ -0,0 +1,30 @@ +from .freebayessomaticworkflow import ( + FreeBayesSomaticWorkflow, +) + + +class FreeBayesSomaticWorkflowCram(FreeBayesSomaticWorkflow): + def id(self): + return "FreeBayesSomaticWorkflowCram" + + def friendly_name(self): + return "Freebayes somatic workflow (CRAM)" + + # this is a way to get the tool without spagetti code in bam and cram format + def getFreebayesTool(self): + from janis_bioinformatics.tools.freebayes.versions import ( + FreeBayesCram_1_3 as freebayes, + ) + + return freebayes + + def getFreebayesInputType(self): + from janis_bioinformatics.data_types import CramCrai + + return CramCrai + + +if __name__ == "__main__": + + wf = FreeBayesSomaticWorkflowCram() + wdl = wf.translate("wdl", to_console=True, to_disk=False, write_inputs_file=False) diff --git 
a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/freebayessomaticworkflow_cram.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/freebayessomaticworkflow_cram.py deleted file mode 100644 index 7a349a9fb..000000000 --- a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/freebayessomaticworkflow_cram.py +++ /dev/null @@ -1,195 +0,0 @@ -from datetime import date - -from janis_bioinformatics.data_types import CramCrai, FastaFai -from janis_bioinformatics.tools import BioinformaticsWorkflow -from janis_bioinformatics.tools.bcftools import BcfToolsNormLatest as BcfToolsNorm -from janis_bioinformatics.tools.dawson import ( - CallSomaticFreeBayes_0_1 as CallSomaticFreeBayes, -) -from janis_bioinformatics.tools.dawson.createcallregions.base import CreateCallRegions -from janis_bioinformatics.tools.freebayes.versions import FreeBayesCram_1_3 as FreeBayes -from janis_bioinformatics.tools.htslib import BGZipLatest as BGZip, TabixLatest as Tabix -from janis_bioinformatics.tools.vcflib import ( - VcfAllelicPrimitivesLatest as VcfAllelicPrimitives, - VcfCombineLatest as VcfCombine, - VcfFixUpLatest as VcfFixUp, - VcfStreamSortLatest as VcfStreamSort, - VcfUniqAllelesLatest as VcfUniqAlleles, - VcfUniqLatest as VcfUniq, -) -from janis_core import Array, Int, String - - -class FreeBayesSomaticWorkflowCram(BioinformaticsWorkflow): - def id(self): - return "FreeBayesSomaticWorkflowCram" - - def friendly_name(self): - return "Freebayes somatic workflow (CRAM)" - - def tool_provider(self): - return "Dawson Labs" - - def version(self): - return "0.1" - - def bind_metadata(self): - self.metadata.version = "0.1" - self.metadata.dateCreated = date(2019, 10, 18) - self.metadata.dateUpdated = date(2020, 12, 10) - - self.contributors = ["Sebastian Hollizeck"] - self.metadata.keywords = [ - "variants", - "freebayes", - "variant caller", - "multi sample", - ] - self.metadata.documentation = """ - This workflow uses the 
capabilities of freebayes to output all variants independent of the - diploid model which then in turn allows us to create a likelihood based difference between - the normal sample and an arbitrary amount of samples. - This allows a joint somatic genotyping of multiple samples of the same individual. - """.strip() - - def constructor(self): - - self.input("bams", Array(CramCrai)) - - self.input("reference", FastaFai) - self.input("regionSize", int, default=10000000) - - self.input("normalSample", String) - - # this is the coverage per sample that is the max we will analyse. It will automatically - # multiplied by the amount of input bams we get - self.input("skipCov", Int(optional=True), default=500) - - # the same is true for min cov - self.input("minCov", Int(optional=True), default=10) - - # this should be a conditional (if the callregions are supplied we use them, otherwise we - # create them) - self.step( - "createCallRegions", - CreateCallRegions( - reference=self.reference, regionSize=self.regionSize, equalize=True - ), - ) - - self.step( - "callVariants", - FreeBayes( - bams=self.bams, - reference=self.reference, - pooledDiscreteFlag=True, - gtQuals=True, - strictFlag=True, - pooledContinousFlag=True, - reportMaxGLFlag=True, - noABPriorsFlag=True, - maxNumOfAlleles=4, - noPartObsFlag=True, - region=self.createCallRegions.regions, - # here we multiply the skipCov input by the amount of input that we have - skipCov=(self.skipCov * self.bams.length()), - # things that are actually default, but janis does not recognize yet - useDupFlag=False, - minBaseQual=1, - minSupMQsum=0, - minSupQsum=0, - minCov=self.minCov, - # now here we are trying to play with the detection limits - # we set the fraction to be very low, to include ALL of the sites in a potential analysis - minAltFrac=0.01, - # and we want at least one sample that has two high quality variants OR multiple - # lower quality ones - minAltQSum=70, - # but we also want to have at least two reads overall 
with that variants - # we do not care if they are between samples or if they are in the same sample, but - # 2 is better than one - minAltTotal=2, - ), - scatter="region", - ) - # might actually rewrite this once everything works, to not combine the files here, but do - # all of it scattered and then only combine the final output - # self.step("combineRegions", VcfCombine(vcf=self.callVariants.out)) - - # - - # self.step("compressAll", BGZip(file=self.sortAll.out)) - # self.step("indexAll", Tabix(file=self.compressAll.out)) - - self.step( - "callSomatic", - CallSomaticFreeBayes( - vcf=self.callVariants.out, normalSampleName=self.normalSample - ), - # added for parallel - scatter="vcf", - ) - - self.step("combineRegions", VcfCombine(vcf=self.callSomatic.out)) - - # should not be necessary here, but just to be save - self.step( - "sortSomatic1", - VcfStreamSort(vcf=self.combineRegions.out, inMemoryFlag=True), - ) - - # no need to compress this here if it leads to problems when we dont have an index for the allelic allelicPrimitves - self.step( - "normalizeSomatic1", - BcfToolsNorm( - vcf=self.sortSomatic1.out, - reference=self.reference, - outputType="v", - outputFilename="normalised.vcf", - ), - ) - - self.step( - "allelicPrimitves", - VcfAllelicPrimitives( - vcf=self.normalizeSomatic1.out, - tagParsed="DECOMPOSED", - keepGenoFlag=True, - ), - ) - - self.step("fixSplitLines", VcfFixUp(vcf=self.allelicPrimitves.out)) - - self.step( - "sortSomatic2", VcfStreamSort(vcf=self.fixSplitLines.out, inMemoryFlag=True) - ) - - self.step( - "normalizeSomatic2", - BcfToolsNorm( - vcf=self.sortSomatic2.out, - reference=self.reference, - outputType="v", - outputFilename="normalised.vcf", - ), - ) - - self.step("uniqueAlleles", VcfUniqAlleles(vcf=self.normalizeSomatic2.out)) - - self.step( - "sortFinal", VcfStreamSort(vcf=self.uniqueAlleles.out, inMemoryFlag=True) - ) - - self.step("uniqVcf", VcfUniq(vcf=self.sortFinal.out)) - - self.step("compressFinal", 
BGZip(file=self.uniqVcf.out)) - - self.step("indexFinal", Tabix(inp=self.compressFinal.out)) - - self.output("somaticOutVcf", source=self.indexFinal) - - -if __name__ == "__main__": - - wf = FreeBayesSomaticWorkflow() - wdl = wf.translate("wdl", to_console=True, to_disk=False, write_inputs_file=False) diff --git a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/mutect2/__init__.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/mutect2/__init__.py new file mode 100644 index 000000000..ad3d5f107 --- /dev/null +++ b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/mutect2/__init__.py @@ -0,0 +1,2 @@ +from .mutect2jointsomaticworkflow import Mutect2JointSomaticWorkflow +from .mutect2jointsomaticworkflow_cram import Mutect2JointSomaticWorkflowCram diff --git a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/mutect2jointsomaticworkflow.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/mutect2/mutect2jointsomaticworkflow.py similarity index 57% rename from janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/mutect2jointsomaticworkflow.py rename to janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/mutect2/mutect2jointsomaticworkflow.py index 87ffb1554..e03377351 100644 --- a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/mutect2jointsomaticworkflow.py +++ b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/mutect2/mutect2jointsomaticworkflow.py @@ -1,8 +1,10 @@ from datetime import date from janis_core import Array, String -from janis_bioinformatics.data_types import BamBai, FastaWithDict, VcfTabix +from janis_bioinformatics.data_types import FastaWithDict, VcfTabix from janis_bioinformatics.tools import BioinformaticsWorkflow +from janis_core.operators.standard import FirstOperator + from janis_bioinformatics.tools.bcftools import ( BcfToolsConcat_1_9 as BcfToolsConcat, 
BcfToolsIndex_1_9 as BcfToolsIndex, @@ -10,7 +12,6 @@ ) from janis_bioinformatics.tools.dawson.createcallregions.base import CreateCallRegions -# these dont need cram, because they dont work on crams from janis_bioinformatics.tools.gatk4 import ( Gatk4CalculateContaminationLatest as CalculateContamination, Gatk4FilterMutectCallsLatest as FilterMutectCalls, @@ -18,14 +19,6 @@ Gatk4MergeMutectStatsLatest as MergeMutectStats, ) -from janis_bioinformatics.tools.gatk4.mutect2.versions import ( - GatkMutect2_4_1_8 as Mutect2, -) - -from janis_bioinformatics.tools.gatk4.getpileupsummaries.versions import ( - Gatk4GetPileUpSummaries_4_1_8 as GetPileUpSummaries, -) - class Mutect2JointSomaticWorkflow(BioinformaticsWorkflow): def id(self): @@ -38,10 +31,10 @@ def tool_provider(self): return "Dawson Labs" def version(self): - return "0.1" + return "0.1.1" def bind_metadata(self): - self.metadata.version = "0.1" + self.metadata.version = "0.1.1" self.metadata.dateCreated = date(2019, 10, 30) self.metadata.dateUpdated = date(2020, 12, 10) @@ -58,36 +51,91 @@ def bind_metadata(self): There are also som major tweaks we have to do for runtime, as the amount of data might overwhelm the tools otherwise. 
""".strip() + # this is a way to get the tool without spagetti code in bam and cram format + def getMutect2Tool(self): + from janis_bioinformatics.tools.gatk4.mutect2.versions import ( + GatkMutect2_4_1_8 as Mutect2, + ) + + return Mutect2 + + def getPileUpTool(self): + from janis_bioinformatics.tools.gatk4.getpileupsummaries.versions import ( + Gatk4GetPileUpSummaries_4_1_8 as Pileup, + ) + + return Pileup + + def getMutect2InputType(self): + from janis_bioinformatics.data_types import BamBai + + return BamBai + def constructor(self): # we have to split the bam into the ones of the normal sample (can be multiple) and the # tumor, because some tools only work with the tumor bams - self.input("normalBams", Array(BamBai)) - self.input("tumorBams", Array(BamBai)) + self.input( + "normalBams", + Array(self.getMutect2InputType()), + doc="The bams that make up the normal sample. Generally Mutect will expect one bam per sample, but as long as the sample ids in the bam header are set appropriatly, multiple bams per sample will work", + ) + self.input( + "tumorBams", + Array(self.getMutect2InputType()), + doc="The bams that contain the tumour samples. Generally Mutect will expect one bam per sample, but as long as the sample ids in the bam header are set appropriatly, multiple bams per sample will work", + ) # we also need the name of the normal sample (needs to be the name in the bams as well) - self.input("normalName", String) + self.input( + "normalName", + String, + doc="The sample id of the normal sample. This id will be used to distingiush reads from this sample from all other samples. This id needs to tbe the one set in the bam header", + ) - self.input("biallelicSites", VcfTabix) + self.input( + "biallelicSites", + VcfTabix, + doc="A vcf of common biallalic sites from a population. 
This will be used to estimate sample contamination.", + ) - self.input("reference", FastaWithDict) + self.input( + "reference", + FastaWithDict, + doc="A fasta and dict indexed reference, which needs to be the reference, the bams were aligned to.", + ) - self.input("regionSize", int, default=10000000) + self.input( + "regionSize", + int, + default=10000000, + doc="The size of the regions over which to parallelise the analysis. This should be adjusted, if there are lots of samples or a very high sequencing depth. default: 10M bp", + ) - self.input("panelOfNormals", VcfTabix) + self.input( + "panelOfNormals", + VcfTabix, + doc="The panel of normals, which summarises the technical and biological sites of errors. Its usually a good idea to generate this for your own cohort, but GATK suggests around 30 normals, so their panel is usually a good idea.", + ) - self.input("germlineResource", VcfTabix) + self.input( + "germlineResource", + VcfTabix, + doc="Vcf of germline variants. GATK provides this as well, but it can easily substituted with the newst gnomad etc vcf.", + ) self.step( "createCallRegions", CreateCallRegions( - reference=self.reference, regionSize=self.regionSize, equalize=True + reference=self.reference, + regionSize=self.regionSize, + equalize=True, ), ) self.step( "mutect2", - Mutect2( + self.getMutect2Tool()( tumorBams=self.tumorBams, normalBams=self.normalBams, normalSample=self.normalName, @@ -110,7 +158,7 @@ def constructor(self): self.step( "pileup", - GetPileUpSummaries( + self.getPileUpTool()( bam=self.tumorBams, sites=self.biallelicSites, intervals=self.biallelicSites, diff --git a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/mutect2/mutect2jointsomaticworkflow_cram.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/mutect2/mutect2jointsomaticworkflow_cram.py new file mode 100644 index 000000000..8c00c94b3 --- /dev/null +++ 
b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/mutect2/mutect2jointsomaticworkflow_cram.py @@ -0,0 +1,37 @@ +from .mutect2jointsomaticworkflow import ( + Mutect2JointSomaticWorkflow, +) + + +class Mutect2JointSomaticWorkflowCram(Mutect2JointSomaticWorkflow): + def id(self): + return "Mutect2JointSomaticWorkflowCram" + + def friendly_name(self): + return "Mutect2 joint somatic variant calling workflow (CRAM)" + + # this is a way to get the tool without spagetti code in bam and cram format + def getMutect2Tool(self): + from janis_bioinformatics.tools.gatk4.mutect2.versions import ( + GatkMutect2Cram_4_1_8 as Mutect2, + ) + + return Mutect2 + + def getPileUpTool(self): + from janis_bioinformatics.tools.gatk4.getpileupsummaries.versions import ( + Gatk4GetPileUpSummariesCram_4_1_8 as Pileup, + ) + + return Pileup + + def getMutect2InputType(self): + from janis_bioinformatics.data_types import CramCrai + + return CramCrai + + +if __name__ == "__main__": + + wf = Mutect2JointSomaticWorkflowCram() + wdl = wf.translate("wdl", to_console=True, to_disk=False, write_inputs_file=False) diff --git a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/mutect2jointsomaticworkflow_cram.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/mutect2jointsomaticworkflow_cram.py deleted file mode 100644 index 5ff1e2690..000000000 --- a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/mutect2jointsomaticworkflow_cram.py +++ /dev/null @@ -1,146 +0,0 @@ -from datetime import date - -from janis_core import Array, String -from janis_bioinformatics.data_types import CramCrai, FastaWithDict, VcfTabix -from janis_bioinformatics.tools import BioinformaticsWorkflow -from janis_bioinformatics.tools.bcftools import ( - BcfToolsConcat_1_9 as BcfToolsConcat, - BcfToolsIndex_1_9 as BcfToolsIndex, - BcfToolsNorm_1_9 as BcfToolsNorm, -) -from janis_bioinformatics.tools.dawson.createcallregions.base import 
CreateCallRegions - -# these dont need cram, because they dont work on crams -from janis_bioinformatics.tools.gatk4 import ( - Gatk4CalculateContaminationLatest as CalculateContamination, - Gatk4FilterMutectCallsLatest as FilterMutectCalls, - Gatk4LearnReadOrientationModelLatest as LearnReadOrientationModel, - Gatk4MergeMutectStatsLatest as MergeMutectStats, -) - -# for these we need the special cram version because they work directly on the cram -from janis_bioinformatics.tools.gatk4.mutect2.versions import ( - GatkMutect2Cram_4_1_8 as Mutect2, -) - -from janis_bioinformatics.tools.gatk4.getpileupsummaries.versions import ( - Gatk4GetPileUpSummariesCram_4_1_8 as GetPileUpSummaries, -) - - -class Mutect2JointSomaticWorkflowCram(BioinformaticsWorkflow): - def id(self): - return "Mutect2JointSomaticWorkflowCram" - - def friendly_name(self): - return "Mutect2 joint somatic variant calling workflow (CRAM)" - - def tool_provider(self): - return "Dawson Labs" - - def version(self): - return "0.1" - - def bind_metadata(self): - self.metadata.version = "0.1" - self.metadata.dateCreated = date(2019, 10, 30) - self.metadata.dateUpdated = date(2020, 12, 10) - - self.contributors = ["Sebastian Hollizeck"] - self.metadata.keywords = [ - "variants", - "mutect2", - "variant caller", - "multi sample", - ] - self.metadata.documentation = """ - This workflow uses the capability of mutect2 to call several samples at the same time and improve recall and accuracy through a joint model. - Most of these tools are still in a beta state and not intended for main production (as of 4.1.4.0) - There are also som major tweaks we have to do for runtime, as the amount of data might overwhelm the tools otherwise. 
- """.strip() - - def constructor(self): - - # we have to split the bam into the ones of the normal sample (can be multiple) and the - # tumor, because some tools only work with the tumor bams - self.input("normalBams", Array(CramCrai)) - self.input("tumorBams", Array(CramCrai)) - - # we also need the name of the normal sample (needs to be the name in the bams as well) - self.input("normalName", String) - - self.input("biallelicSites", VcfTabix) - - self.input("reference", FastaWithDict) - - self.input("regionSize", int, default=10000000) - - self.input("panelOfNormals", VcfTabix) - - self.input("germlineResource", VcfTabix) - - self.step( - "createCallRegions", - CreateCallRegions( - reference=self.reference, regionSize=self.regionSize, equalize=True - ), - ) - - self.step( - "mutect2", - Mutect2( - tumorBams=self.tumorBams, - normalBams=self.normalBams, - normalSample=self.normalName, - intervals=self.createCallRegions.regions, - reference=self.reference, - panelOfNormals=self.panelOfNormals, - germlineResource=self.germlineResource, - ), - scatter="intervals", - ) - - self.step("concat", BcfToolsConcat(vcf=self.mutect2.out)) - self.step("indexUnfiltered", BcfToolsIndex(vcf=self.concat.out)) - - self.step( - "learn", LearnReadOrientationModel(f1r2CountsFiles=self.mutect2.f1f2r_out) - ) - - self.step("mergeMutect2", MergeMutectStats(statsFiles=self.mutect2.stats)) - - self.step( - "pileup", - GetPileUpSummaries( - bam=self.tumorBams, - sites=self.biallelicSites, - intervals=self.biallelicSites, - reference=self.reference, - ), - ) - - self.step("contamination", CalculateContamination(pileupTable=self.pileup.out)) - - self.step( - "filtering", - FilterMutectCalls( - vcf=self.indexUnfiltered.out, - reference=self.reference, - segmentationFile=self.contamination.segOut, - contaminationTable=self.contamination.contOut, - readOrientationModel=self.learn.out, - statsFile=self.mergeMutect2.out, - ), - ) - - self.step( - "normalise", BcfToolsNorm(vcf=self.filtering.out, 
reference=self.reference) - ) - self.step("indexFiltered", BcfToolsIndex(vcf=self.normalise.out)) - self.output("out", source=self.indexFiltered.out) - - -if __name__ == "__main__": - - wf = Mutect2JointSomaticWorkflow() - wdl = wf.translate("wdl", to_console=True, to_disk=False, write_inputs_file=False) diff --git a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/steps/__init__.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/steps/__init__.py deleted file mode 100644 index 5a62cc8e1..000000000 --- a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/steps/__init__.py +++ /dev/null @@ -1 +0,0 @@ -#do not export the steps, these are just meant tobe used in workflows and not directly diff --git a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/steps/strelka2passanalysisstep1_cram.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/steps/strelka2passanalysisstep1_cram.py deleted file mode 100644 index 6111dec74..000000000 --- a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/steps/strelka2passanalysisstep1_cram.py +++ /dev/null @@ -1,105 +0,0 @@ -from datetime import date - -from janis_bioinformatics.data_types import BedTabix, CramCrai, FastaFai -from janis_bioinformatics.tools import BioinformaticsWorkflow -from janis_bioinformatics.tools.bcftools import ( - BcfToolsIndex_1_9 as BcfToolsIndex, - BcfToolsNorm_1_9 as BcfToolsNorm, -) -from janis_bioinformatics.tools.illumina.manta.manta import MantaCram_1_5_0 as Manta -from janis_bioinformatics.tools.illumina.strelkasomatic.strelkasomatic import ( - StrelkaSomaticCram_2_9_10 as Strelka, -) -from janis_core import Boolean, File - - -class Strelka2PassWorkflowStep1(BioinformaticsWorkflow): - def id(self): - return "Strelka2PassWorkflowStep1" - - def friendly_name(self): - return "Strelka 2Pass analysis step1" - - def tool_provider(self): - return "Dawson Labs" - - def 
version(self): - return "0.1" - - def bind_metadata(self): - self.metadata.version = "0.1" - self.metadata.dateCreated = date(2019, 10, 11) - self.metadata.dateUpdated = date(2020, 8, 4) - - self.metadata.contributors = ["Sebastian Hollizeck"] - self.metadata.keywords = [ - "variants", - "strelka2", - "variant caller", - "multi sample", - ] - self.metadata.documentation = """ - This is the first step for joint somatic variant calling - based on a 2pass analysis common in RNASeq. - - It runs manta and strelka on the bams as is best practice - for somatic variant calling with strelka2 - - It also normalises and indexes the output vcfs - """.strip() - - def constructor(self): - - self.input("normalBam", CramCrai) - self.input("tumorBam", CramCrai) - - self.input("reference", FastaFai) - self.input("callRegions", BedTabix(optional=True)) - self.input("exome", Boolean(optional=True), default=False) - self.input("configStrelka", File(optional=True)) - - self.step( - "manta", - Manta( - bam=self.normalBam, - tumorBam=self.tumorBam, - reference=self.reference, - callRegions=self.callRegions, - exome=self.exome, - ), - ) - self.step( - "strelka", - Strelka( - indelCandidates=self.manta.candidateSmallIndels, - normalBam=self.normalBam, - tumorBam=self.tumorBam, - reference=self.reference, - callRegions=self.callRegions, - exome=self.exome, - config=self.configStrelka, - ), - ) - self.step( - "normaliseSNVs", - BcfToolsNorm(vcf=self.strelka.snvs, reference=self.reference), - ) - self.step("indexSNVs", BcfToolsIndex(vcf=self.normaliseSNVs.out)) - - self.step( - "normaliseINDELs", - BcfToolsNorm(vcf=self.strelka.indels, reference=self.reference), - ) - self.step("indexINDELs", BcfToolsIndex(vcf=self.normaliseINDELs.out)) - - self.output("diploid", source=self.manta.diploidSV) - self.output("candIndels", source=self.manta.candidateSmallIndels) - self.output("indels", source=self.indexINDELs.out) - self.output("snvs", source=self.indexSNVs.out) - self.output("somaticSVs", 
source=self.manta.somaticSVs) - - -if __name__ == "__main__": - - wf = Strelka2PassWorkflowStep1() - wdl = wf.translate("wdl", to_console=True, to_disk=False, write_inputs_file=False) diff --git a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/steps/strelka2passanalysisstep2_cram.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/steps/strelka2passanalysisstep2_cram.py deleted file mode 100644 index 23eda60a1..000000000 --- a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/steps/strelka2passanalysisstep2_cram.py +++ /dev/null @@ -1,96 +0,0 @@ -from datetime import date - -from janis_bioinformatics.data_types import BedTabix, CramCrai, FastaFai, VcfTabix -from janis_bioinformatics.tools import BioinformaticsWorkflow -from janis_bioinformatics.tools.bcftools import ( - BcfToolsIndex_1_9 as BcfToolsIndex, - BcfToolsNorm_1_9 as BcfToolsNorm, -) -from janis_bioinformatics.tools.illumina.strelkasomatic.strelkasomatic import ( - StrelkaSomaticCram_2_9_10 as Strelka, -) -from janis_core import Array, Boolean, File - - -class Strelka2PassWorkflowStep2(BioinformaticsWorkflow): - def id(self): - return "Strelka2PassWorkflowStep2" - - def friendly_name(self): - return "Strelka 2Pass analysis step 2" - - def tool_provider(self): - return "Dawson Labs" - - def version(self): - return "0.1" - - def bind_metadata(self): - self.metadata.version = "0.1" - self.metadata.dateCreated = date(2019, 10, 11) - self.metadata.dateUpdated = date(2020, 8, 4) - - self.metadata.contributors = ["Sebastian Hollizeck"] - self.metadata.keywords = [ - "variants", - "strelka2", - "variant caller", - "multi sample", - ] - self.metadata.documentation = """ - This is the second step for joint somatic variant calling - based on a 2pass analysis common in RNASeq. - - It runs strelka2 again with the variants found in all of the other samples as input to be forced to genotype these. 
- - It also normalises and indexes the output vcfs - """.strip() - - def constructor(self): - - self.input("normalBam", CramCrai) - self.input("tumorBam", CramCrai) - - self.input("reference", FastaFai) - self.input("callRegions", BedTabix(optional=True)) - self.input("exome", Boolean(optional=True), default=False) - self.input("configStrelka", File(optional=True)) - - self.input("indelCandidates", Array(VcfTabix)) - self.input("strelkaSNVs", Array(VcfTabix)) - # self.input("strelkaIndels", Array(VcfTabix)) - - self.step( - "strelka2pass", - Strelka( - indelCandidates=self.indelCandidates, - # indelCandidates=self.strelkaIndels, - forcedgt=self.strelkaSNVs, - normalBam=self.normalBam, - tumorBam=self.tumorBam, - reference=self.reference, - callRegions=self.callRegions, - exome=self.exome, - config=self.configStrelka, - ), - ) - self.step( - "normaliseSNVs", - BcfToolsNorm(vcf=self.strelka2pass.snvs, reference=self.reference), - ) - self.step("indexSNVs", BcfToolsIndex(vcf=self.normaliseSNVs.out)) - - self.step( - "normaliseINDELs", - BcfToolsNorm(vcf=self.strelka2pass.indels, reference=self.reference), - ) - self.step("indexINDELs", BcfToolsIndex(vcf=self.normaliseINDELs.out)) - - self.output("indels", source=self.indexINDELs.out) - self.output("snvs", source=self.indexSNVs.out) - - -if __name__ == "__main__": - - wf = Strelka2PassWorkflowStep2() - wdl = wf.translate("wdl", to_console=True, to_disk=False, write_inputs_file=False) diff --git a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/__init__.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/__init__.py new file mode 100644 index 000000000..ad9ede8fc --- /dev/null +++ b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/__init__.py @@ -0,0 +1,2 @@ +from .strelka2passworkflow import Strelka2PassWorkflow +from .strelka2passworkflow_cram import Strelka2PassWorkflowCram diff --git 
a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/steps/__init__.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/steps/__init__.py new file mode 100644 index 000000000..0afedc0eb --- /dev/null +++ b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/steps/__init__.py @@ -0,0 +1 @@ +# dont export this, because we dont want people to use it other than in the workflow diff --git a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/steps/strelka2passanalysisstep1.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/steps/strelka2passanalysisstep1.py similarity index 76% rename from janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/steps/strelka2passanalysisstep1.py rename to janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/steps/strelka2passanalysisstep1.py index d7b429437..b2f357b14 100644 --- a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/steps/strelka2passanalysisstep1.py +++ b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/steps/strelka2passanalysisstep1.py @@ -1,15 +1,12 @@ from datetime import date -from janis_bioinformatics.data_types import BedTabix, BamBai, FastaFai +from janis_bioinformatics.data_types import BedTabix, FastaFai from janis_bioinformatics.tools import BioinformaticsWorkflow from janis_bioinformatics.tools.bcftools import ( BcfToolsIndex_1_9 as BcfToolsIndex, BcfToolsNorm_1_9 as BcfToolsNorm, ) -from janis_bioinformatics.tools.illumina.manta.manta import Manta_1_5_0 as Manta -from janis_bioinformatics.tools.illumina.strelkasomatic.strelkasomatic import ( - StrelkaSomatic_2_9_10 as Strelka, -) + from janis_core import Boolean, File @@ -24,12 +21,12 @@ def tool_provider(self): return "Dawson Labs" def version(self): - return "0.1" + return "0.1.1" def bind_metadata(self): - 
self.metadata.version = "0.1" + self.metadata.version = "0.1.1" self.metadata.dateCreated = date(2019, 10, 11) - self.metadata.dateUpdated = date(2020, 8, 4) + self.metadata.dateUpdated = date(2020, 12, 10) self.metadata.contributors = ["Sebastian Hollizeck"] self.metadata.keywords = [ @@ -48,10 +45,28 @@ def bind_metadata(self): It also normalises and indexes the output vcfs """.strip() + # this is a way to get the tool without spagetti code in bam and cram format + def getMantaTool(self): + from janis_bioinformatics.tools.illumina.manta.manta import Manta_1_5_0 as Manta + + return Manta + + def getStrelka2Tool(self): + from janis_bioinformatics.tools.illumina.strelkasomatic.strelkasomatic import ( + StrelkaSomatic_2_9_10 as Strelka, + ) + + return Strelka + + def getStrelka2InputType(self): + from janis_bioinformatics.data_types import BamBai + + return BamBai + def constructor(self): - self.input("normalBam", BamBai) - self.input("tumorBam", BamBai) + self.input("normalBam", self.getStrelka2InputType()) + self.input("tumorBam", self.getStrelka2InputType()) self.input("reference", FastaFai) self.input("callRegions", BedTabix(optional=True)) @@ -60,7 +75,7 @@ def constructor(self): self.step( "manta", - Manta( + self.getMantaTool()( bam=self.normalBam, tumorBam=self.tumorBam, reference=self.reference, @@ -70,7 +85,7 @@ def constructor(self): ) self.step( "strelka", - Strelka( + self.getStrelka2Tool()( indelCandidates=self.manta.candidateSmallIndels, normalBam=self.normalBam, tumorBam=self.tumorBam, diff --git a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/steps/strelka2passanalysisstep1_cram.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/steps/strelka2passanalysisstep1_cram.py new file mode 100644 index 000000000..567d8aa96 --- /dev/null +++ b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/steps/strelka2passanalysisstep1_cram.py @@ -0,0 +1,37 @@ +from 
.strelka2passanalysisstep1 import ( + Strelka2PassWorkflowStep1, +) + + +class Strelka2PassWorkflowStep1Cram(Strelka2PassWorkflowStep1): + def id(self): + return "Strelka2PassWorkflowStep1Cram" + + def friendly_name(self): + return "Strelka 2Pass analysis step1 (CRAM)" + + # this is a way to get the tool without spagetti code in bam and cram format + def getMantaTool(self): + from janis_bioinformatics.tools.illumina.manta.manta import ( + MantaCram_1_5_0 as Manta, + ) + + return Manta + + def getStrelka2Tool(self): + from janis_bioinformatics.tools.illumina.strelkasomatic.strelkasomatic import ( + StrelkaSomaticCram_2_9_10 as Strelka, + ) + + return Strelka + + def getStrelka2InputType(self): + from janis_bioinformatics.data_types import CramCrai + + return CramCrai + + +if __name__ == "__main__": + + wf = Strelka2PassWorkflowStep1Cram() + wdl = wf.translate("wdl", to_console=True, to_disk=False, write_inputs_file=False) diff --git a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/steps/strelka2passanalysisstep2.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/steps/strelka2passanalysisstep2.py similarity index 78% rename from janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/steps/strelka2passanalysisstep2.py rename to janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/steps/strelka2passanalysisstep2.py index da6b31173..44abed1a8 100644 --- a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/steps/strelka2passanalysisstep2.py +++ b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/steps/strelka2passanalysisstep2.py @@ -1,14 +1,12 @@ from datetime import date -from janis_bioinformatics.data_types import BedTabix, BamBai, FastaFai, VcfTabix +from janis_bioinformatics.data_types import BedTabix, FastaFai, VcfTabix from janis_bioinformatics.tools import BioinformaticsWorkflow from 
janis_bioinformatics.tools.bcftools import ( BcfToolsIndex_1_9 as BcfToolsIndex, BcfToolsNorm_1_9 as BcfToolsNorm, ) -from janis_bioinformatics.tools.illumina.strelkasomatic.strelkasomatic import ( - StrelkaSomatic_2_9_10 as Strelka, -) + from janis_core import Array, Boolean, File @@ -23,12 +21,12 @@ def tool_provider(self): return "Dawson Labs" def version(self): - return "0.1" + return "0.1.1" def bind_metadata(self): - self.metadata.version = "0.1" + self.metadata.version = "0.1.1" self.metadata.dateCreated = date(2019, 10, 11) - self.metadata.dateUpdated = date(2020, 8, 4) + self.metadata.dateUpdated = date(2020, 12, 10) self.metadata.contributors = ["Sebastian Hollizeck"] self.metadata.keywords = [ @@ -46,10 +44,23 @@ def bind_metadata(self): It also normalises and indexes the output vcfs """.strip() + # this is a way to get the tool without spagetti code in bam and cram format + def getStrelka2Tool(self): + from janis_bioinformatics.tools.illumina.strelkasomatic.strelkasomatic import ( + StrelkaSomatic_2_9_10 as Strelka, + ) + + return Strelka + + def getStrelka2InputType(self): + from janis_bioinformatics.data_types import BamBai + + return BamBai + def constructor(self): - self.input("normalBam", BamBai) - self.input("tumorBam", BamBai) + self.input("normalBam", self.getStrelka2InputType()) + self.input("tumorBam", self.getStrelka2InputType()) self.input("reference", FastaFai) self.input("callRegions", BedTabix(optional=True)) @@ -62,7 +73,7 @@ def constructor(self): self.step( "strelka2pass", - Strelka( + self.getStrelka2Tool()( indelCandidates=self.indelCandidates, # indelCandidates=self.strelkaIndels, forcedgt=self.strelkaSNVs, diff --git a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/steps/strelka2passanalysisstep2_cram.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/steps/strelka2passanalysisstep2_cram.py new file mode 100644 index 000000000..1af928cc2 --- /dev/null +++ 
b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/steps/strelka2passanalysisstep2_cram.py @@ -0,0 +1,29 @@ +from .strelka2passanalysisstep2 import ( + Strelka2PassWorkflowStep2, +) + + +class Strelka2PassWorkflowStep2Cram(Strelka2PassWorkflowStep2): + def id(self): + return "Strelka2PassWorkflowStep2Cram" + + def friendly_name(self): + return "Strelka 2Pass analysis step 2 (CRAM)" + + def getStrelka2Tool(self): + from janis_bioinformatics.tools.illumina.strelkasomatic.strelkasomatic import ( + StrelkaSomaticCram_2_9_10 as Strelka, + ) + + return Strelka + + def getStrelka2InputType(self): + from janis_bioinformatics.data_types import CramCrai + + return CramCrai + + +if __name__ == "__main__": + + wf = Strelka2PassWorkflowStep2Cram() + wdl = wf.translate("wdl", to_console=True, to_disk=False, write_inputs_file=False) diff --git a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2passworkflow.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/strelka2passworkflow.py similarity index 59% rename from janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2passworkflow.py rename to janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/strelka2passworkflow.py index 757bf449b..26f2f4012 100644 --- a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2passworkflow.py +++ b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/strelka2passworkflow.py @@ -1,16 +1,11 @@ from datetime import date -from janis_bioinformatics.data_types import BedTabix, BamBai, FastaFai +from janis_bioinformatics.data_types import BedTabix, FastaFai from janis_bioinformatics.tools import BioinformaticsWorkflow from janis_bioinformatics.tools.dawson import ( RefilterStrelka2Calls_0_1 as RefilterStrelka2Calls, ) -from .steps.strelka2passanalysisstep1 import ( - Strelka2PassWorkflowStep1, -) -from 
.steps.strelka2passanalysisstep2 import ( - Strelka2PassWorkflowStep2, -) + from janis_bioinformatics.tools.htslib import BGZipLatest as BGZip, TabixLatest as Tabix from janis_core import Array, Boolean, String, File, Int from janis_bioinformatics.data_types import VcfTabix @@ -27,10 +22,10 @@ def tool_provider(self): return "Dawson Labs" def version(self): - return "0.1" + return "0.2" def bind_metadata(self): - self.metadata.version = "0.1" + self.metadata.version = "0.2" self.metadata.dateCreated = date(2019, 10, 11) self.metadata.dateUpdated = date(2020, 12, 10) @@ -53,23 +48,77 @@ def bind_metadata(self): * output resuults """.strip() + # this is a way to get the tool without spagetti code in bam and cram format + def getStep1Tool(self): + from .steps.strelka2passanalysisstep1 import ( + Strelka2PassWorkflowStep1, + ) + + return Strelka2PassWorkflowStep1 + + def getStep2Tool(self): + from .steps.strelka2passanalysisstep2 import ( + Strelka2PassWorkflowStep2, + ) + + return Strelka2PassWorkflowStep2 + + def getStrelka2InputType(self): + from janis_bioinformatics.data_types import BamBai + + return BamBai + def constructor(self): - self.input("normalBam", BamBai) - self.input("tumorBams", Array(BamBai)) + self.input( + "normalBam", + self.getStrelka2InputType(), + doc="The bam of the normal sample. Strelka will assign any read in this bam to the normal sample, even if this bam contains multiple samples", + ) + self.input( + "tumorBams", + Array(self.getStrelka2InputType()), + doc="The bam of the tumour sample. 
Strelka will assign any read in this bam to the tumour sample, even if this bam contains multiple samples", + ) - self.input("reference", FastaFai) + self.input( + "reference", + FastaFai, + doc="The fai indexed fasta reference, the bams were aligned to.", + ) - self.input("configStrelka", File(optional=True)) - self.input("callRegions", BedTabix(optional=True)) - self.input("exome", Boolean(optional=True), default=False) + self.input( + "configStrelka", + File(optional=True), + doc="The possibly changed ini to use for Strelka2. This can be used to skip regions with extreme depth, like in heterochromatin regions, which lead to very long runtimes.", + ) + self.input( + "callRegions", + BedTabix(optional=True), + doc="The tabix indexed bed file of regions to restrict the analysis on. If this is unset, every site in the genome will be analysed.", + ) + self.input( + "exome", + Boolean(optional=True), + default=False, + doc="Sets the flag to analyse everything in exome mode. This will adjust the parameter for a non uniform coverage profile.", + ) - self.input("sampleNames", Array(String, optional=True)) - self.input("minAD", Int(optional=True), default=2) + self.input( + "sampleNames", + Array(String, optional=True), + doc="The names of the tumour samples. This will only be used to rename output files. 
If unset, the output will be numbered in the same order as the input files.", + ) + self.input( + "minAD", + Int(optional=True), + default=2, + doc="Minimum read support for a variant to be considered a true variant.", + ) self.step( "step1", - Strelka2PassWorkflowStep1( + self.getStep1Tool()( normalBam=self.normalBam, tumorBam=self.tumorBams, reference=self.reference, @@ -82,7 +131,7 @@ def constructor(self): self.step( "step2", - Strelka2PassWorkflowStep2( + self.getStep2Tool()( normalBam=self.normalBam, tumorBam=self.tumorBams, reference=self.reference, diff --git a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/strelka2passworkflow_cram.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/strelka2passworkflow_cram.py new file mode 100644 index 000000000..2940c0759 --- /dev/null +++ b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2/strelka2passworkflow_cram.py @@ -0,0 +1,37 @@ +from .strelka2passworkflow import ( + Strelka2PassWorkflow, +) + + +class Strelka2PassWorkflowCram(Strelka2PassWorkflow): + def id(self): + return "Strelka2PassWorkflowCram" + + def friendly_name(self): + return "Strelka 2Pass analysis (CRAM)" + + # this is a way to get the tool without spaghetti code in bam and cram format + def getStep1Tool(self): + from .steps.strelka2passanalysisstep1_cram import ( + Strelka2PassWorkflowStep1Cram as Strelka2PassWorkflowStep1, + ) + + return Strelka2PassWorkflowStep1 + + def getStep2Tool(self): + from .steps.strelka2passanalysisstep2_cram import ( + Strelka2PassWorkflowStep2Cram as Strelka2PassWorkflowStep2, + ) + + return Strelka2PassWorkflowStep2 + + def getStrelka2InputType(self): + from janis_bioinformatics.data_types import CramCrai + + return CramCrai + + +if __name__ == "__main__": + + wf = Strelka2PassWorkflowCram() + wdl = wf.translate("wdl", to_console=True, to_disk=False, write_inputs_file=False) diff --git 
a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2passworkflow_cram.py b/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2passworkflow_cram.py deleted file mode 100644 index 8f0c21905..000000000 --- a/janis_bioinformatics/tools/dawson/workflows/variantcalling/multisample/strelka2passworkflow_cram.py +++ /dev/null @@ -1,142 +0,0 @@ -from datetime import date - -from janis_bioinformatics.data_types import BedTabix, CramCrai, FastaFai -from janis_bioinformatics.tools import BioinformaticsWorkflow -from janis_bioinformatics.tools.dawson import ( - RefilterStrelka2Calls_0_1 as RefilterStrelka2Calls, -) -from .steps.strelka2passanalysisstep1_cram import ( - Strelka2PassWorkflowStep1, -) -from .steps.strelka2passanalysisstep2_cram import ( - Strelka2PassWorkflowStep2, -) -from janis_bioinformatics.tools.htslib import BGZipLatest as BGZip, TabixLatest as Tabix -from janis_core import Array, Boolean, String, File, Int -from janis_bioinformatics.data_types import VcfTabix - - -class Strelka2PassWorkflowCram(BioinformaticsWorkflow): - def id(self): - return "Strelka2PassWorkflowCram" - - def friendly_name(self): - return "Strelka 2Pass analysis (CRAM)" - - def tool_provider(self): - return "Dawson Labs" - - def version(self): - return "0.1" - - def bind_metadata(self): - self.metadata.version = "0.1" - self.metadata.dateCreated = date(2019, 10, 11) - self.metadata.dateUpdated = date(2020, 12, 10) - - self.metadata.contributors = ["Sebastian Hollizeck"] - self.metadata.keywords = [ - "variants", - "strelka2", - "variant caller", - "multi sample", - ] - self.metadata.documentation = """ - This is the full 2pass analysis workflow to do joint somatic variant calling with strelka2. - The idea is similar to the RNASeq 2pass analysis, when the input of the first analysis is used to guide the second analysis. 
- - The workflow will - * run manta - * run strelka with manata output - * run strelka with strelka and manta output - * reannotate the filter column - * output resuults - """.strip() - - def constructor(self): - - self.input("normalBam", CramCrai) - self.input("tumorBams", Array(CramCrai)) - - self.input("reference", FastaFai) - - self.input("configStrelka", File(optional=True)) - self.input("callRegions", BedTabix(optional=True)) - self.input("exome", Boolean(optional=True), default=False) - - self.input("sampleNames", Array(String, optional=True)) - self.input("minAD", Int(optional=True), default=2) - - self.step( - "step1", - Strelka2PassWorkflowStep1( - normalBam=self.normalBam, - tumorBam=self.tumorBams, - reference=self.reference, - callRegions=self.callRegions, - exome=self.exome, - configStrelka=self.configStrelka, - ), - scatter="tumorBam", - ) - - self.step( - "step2", - Strelka2PassWorkflowStep2( - normalBam=self.normalBam, - tumorBam=self.tumorBams, - reference=self.reference, - callRegions=self.callRegions, - strelkaSNVs=self.step1.snvs, - indelCandidates=self.step1.candIndels, - # as soon as janis allows flattening of arguments, we need this - # indelCandidates=self.step1.indels, - exome=self.exome, - configStrelka=self.configStrelka, - ), - scatter="tumorBam", - ) - - self.step( - "refilterSNVs", - RefilterStrelka2Calls( - inputFiles=self.step2.snvs, - sampleNames=self.sampleNames, - minAD=self.minAD, - ), - ) - self.step("compressSNVs", BGZip(file=self.refilterSNVs.out), scatter="file") - self.step("indexSNVs", Tabix(inp=self.compressSNVs.out), scatter="inp") - - self.step( - "refilterINDELs", - RefilterStrelka2Calls( - inputFiles=self.step2.indels, - sampleNames=self.sampleNames, - minAD=self.minAD, - ), - ) - self.step("compressINDELs", BGZip(file=self.refilterINDELs.out), scatter="file") - self.step("indexINDELs", Tabix(inp=self.compressINDELs.out), scatter="inp") - - self.output( - "snvs", - Array(VcfTabix), - source=self.indexSNVs, - 
output_folder=self.sampleNames, - ) - self.output( - "indels", - Array(VcfTabix), - source=self.indexINDELs, - output_folder=self.sampleNames, - ) - - # optional output from manta, but we know it will be created - self.output("svs", source=self.step1.somaticSVs, output_folder=self.sampleNames) - - -if __name__ == "__main__": - - wf = Strelka2PassWorkflow() - wdl = wf.translate("wdl", to_console=True, to_disk=False, write_inputs_file=False)