From 65a2afe0d89c60db9fa45676acde4d88dd9562a6 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Tue, 21 Mar 2023 15:34:45 +0100 Subject: [PATCH 1/5] allow for underscores fasta headers in differential mode. Include pseudocount as an option in CLI --- .pre-commit-config.yaml | 6 ++++++ ChangeLog | 16 +++++++++------- README.md | 11 +---------- src/AOS/atac.py | 10 +++++++++- src/AOS/helper.py | 4 ++-- src/AOS/preflight.py | 15 ++++++++++++--- src/AOS/rscripts/edger.R | 4 +++- src/AOS/rules/DE.smk | 1 + 8 files changed, 43 insertions(+), 24 deletions(-) create mode 100644 .pre-commit-config.yaml diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000..8838b6d --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,6 @@ +repos: +- repo: "https://github.com/pycqa/flake8" + rev: "6.0.0" + hooks: +- id: flake8 + exclude: ^(build) diff --git a/ChangeLog b/ChangeLog index 9e4291d..984ebb1 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,15 +1,17 @@ CHANGES ======= -* include frig fig, ignore > 1000 for fragsize, set threads motif cluster -* uropa change defaults + checks gtf -* make to preserve key order in yaml dump. Purge ESS -* switch to gene for uropa annotation as default +* allow underscores +* set python/numpy/macs version +* Update seqtools.yml + +v0.0.1 +------ + +* Wd (#14) +* Wd (#13) * Update README.md * Wd (#9) -* meme working -* babysteps into tobias/motifs -* purgebuild * Wd (#8) * Wd (#6) * update readme diff --git a/README.md b/README.md index 3eea5a1..decbdad 100644 --- a/README.md +++ b/README.md @@ -6,16 +6,7 @@ Downstream processing of ATAC data, including QC's and differential accessibilit All samples in a 'run' have to belong to the same group. That means that out of all peaks/sample, a union will be made. -Fasta headers are allowed to contain underscores, but not in 'field' 0 (space delimited): - -This is allowed: - - > >12 dna_sm:chromosome chromosome:GRCm38:12:1:120129022:1 REF - -This is not allowed: - - > >chr9_10L - > >NC_001573.1 +Fasta headers un field 0 (space delimited) are not allowed to contain a pipe character '|'. ## Installation diff --git a/src/AOS/atac.py b/src/AOS/atac.py index 152edb4..fdaa411 100644 --- a/src/AOS/atac.py +++ b/src/AOS/atac.py @@ -111,6 +111,13 @@ show_default=True, help='the feature in the GTF file (column 3) to use for peak annotation.' ) +@click.option( + '--pseudocount', + required=False, + default=8, + show_default=True, + help='Pseudocount to add to the count matrix prior to differential calling.' +) def main(bamdir, outputdir, gtf, @@ -125,7 +132,8 @@ def main(bamdir, mitostring, upstreamuro, downstreamuro, - featureuro + featureuro, + pseudocount ): # Init pf = Preflight(**locals()) diff --git a/src/AOS/helper.py b/src/AOS/helper.py index 57917e3..9c9b043 100644 --- a/src/AOS/helper.py +++ b/src/AOS/helper.py @@ -215,7 +215,7 @@ def tsv_to_bed(tsv, bed, grint): ) if grint == 2: gr2df = pd.DataFrame( - [i.split('_') for i in list(df[(df['logFC'] > 0) & (df['FDR'] < 0.05)]['peak_id'])] + [i.split('|') for i in list(df[(df['logFC'] > 0) & (df['FDR'] < 0.05)]['peak_id'])] ) gr2df.to_csv( bed, @@ -225,7 +225,7 @@ def tsv_to_bed(tsv, bed, grint): ) if grint == 1: gr1df = pd.DataFrame( - [i.split('_') for i in list(df[(df['logFC'] < 0) & (df['FDR'] < 0.05)]['peak_id'])] + [i.split('|') for i in list(df[(df['logFC'] < 0) & (df['FDR'] < 0.05)]['peak_id'])] ) gr1df.to_csv( bed, diff --git a/src/AOS/preflight.py b/src/AOS/preflight.py index 13b11d2..8c03c36 100644 --- a/src/AOS/preflight.py +++ b/src/AOS/preflight.py @@ -26,7 +26,8 @@ def __init__( mitostring, upstreamuro, downstreamuro, - featureuro + featureuro, + pseudocount ): def retabspath(_p): if _p: @@ -58,7 +59,9 @@ def retabspath(_p): 'mitostring': mitostring, 'upstream_uropa': upstreamuro, 'downstream_uropa': downstreamuro, - 'featureuro': featureuro + 'featureuro': featureuro, + 'pseudocount': pseudocount + } self.samples = [os.path.basename(x).replace('.bam', '') for x in glob.glob( os.path.join(os.path.abspath(bamdir), '*.bam') @@ -176,7 +179,13 @@ def checkFna(self): with open(_fna) as f: for line in f: if line.startswith('>'): - continue + _h = line.strip().split(' ')[0] + if '|' in _h: + sys.exit( + "'|' not allowed in fasta header. Please reformat {}".format( + _h + ) + ) else: ESS += len(line.strip()) - line.strip().lower().count('n') self.vars['genomesize'] = ESS diff --git a/src/AOS/rscripts/edger.R b/src/AOS/rscripts/edger.R index dd2b762..f617393 100644 --- a/src/AOS/rscripts/edger.R +++ b/src/AOS/rscripts/edger.R @@ -5,12 +5,14 @@ suppressMessages(library("edgeR")) mat = snakemake@input[[1]] samplesheet = snakemake@params[['samplesheet']] prefix = snakemake@params[['prefix']] +pseudocount = as.integer(snakemake@params[['pseudocount']]) gr1_subset = snakemake@params[['gr1_subset']] gr2_subset = snakemake@params[['gr2_subset']] interactionstr = snakemake@params[['interaction']] of = snakemake@params[['outputfolder']] edgeRtable = snakemake@output[['table']] + # Read in samplesheet. # rownames are samples, columns are factors. samplesheet = read.table( @@ -61,7 +63,7 @@ keep <- filterByExpr( countmat, design=design,min.count = 5, min.prop = 0.49 ) countmat <- countmat[keep,] -countmat <- countmat + 8 #pseudocount of 8 +countmat <- countmat + pseudocount countmat_disp <- estimateGLMCommonDisp(countmat, design, verbose=TRUE) # fit <- glmQLFit(countmat, design=design, dispersion = countmat_disp) diff --git a/src/AOS/rules/DE.smk b/src/AOS/rules/DE.smk index 2168096..97b927f 100644 --- a/src/AOS/rules/DE.smk +++ b/src/AOS/rules/DE.smk @@ -58,6 +58,7 @@ rule diffacc: samples = '{comparison}/samples.txt' params: samplesheet = config['files']['samplesheet'], + pseudocount = config['vars']['pseudocount'], prefix = lambda wildcards: prefix( config['comparison'][wildcards.comparison], ), From d3a73501fe594de15f9c0c4139f16c4d4309fc2b Mon Sep 17 00:00:00 2001 From: WardDeb Date: Tue, 21 Mar 2023 15:38:46 +0100 Subject: [PATCH 2/5] fix precommit --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8838b6d..903a704 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,5 +2,5 @@ repos: - repo: "https://github.com/pycqa/flake8" rev: "6.0.0" hooks: -- id: flake8 - exclude: ^(build) + - id: flake8 + exclude: ^(build) \ No newline at end of file From 87a9c0491d3656a003e058bb0579fea6ac311b01 Mon Sep 17 00:00:00 2001 From: WardDeb Date: Wed, 22 Mar 2023 10:31:24 +0100 Subject: [PATCH 3/5] cap numpy version for deeptools --- src/AOS/envs/seqtools.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/AOS/envs/seqtools.yml b/src/AOS/envs/seqtools.yml index 2d4c087..3e1a7c7 100644 --- a/src/AOS/envs/seqtools.yml +++ b/src/AOS/envs/seqtools.yml @@ -3,7 +3,7 @@ channels: - bioconda dependencies: - python == 3.9 - - numpy == 1.24.1 + - numpy == 1.20 - bedtools == 2.30.0 - bioconductor-edger == 3.32.0 - r-tidyverse From c5ea400b55e57fd1005f459de9505b63d2d6c6bc Mon Sep 17 00:00:00 2001 From: WardDeb Date: Wed, 22 Mar 2023 13:49:23 +0100 Subject: [PATCH 4/5] split deeptools outside of seqtools --- src/AOS/envs/deeptools.yml | 6 ++++++ src/AOS/envs/seqtools.yml | 9 +++------ src/AOS/preflight.py | 3 +++ src/AOS/rules/DE.smk | 2 +- src/AOS/rules/peaks.smk | 10 +++++----- src/AOS/rules/qc.smk | 4 ++-- 6 files changed, 20 insertions(+), 14 deletions(-) create mode 100644 src/AOS/envs/deeptools.yml diff --git a/src/AOS/envs/deeptools.yml b/src/AOS/envs/deeptools.yml new file mode 100644 index 0000000..e98d3e4 --- /dev/null +++ b/src/AOS/envs/deeptools.yml @@ -0,0 +1,6 @@ +channels: + - conda-forge + - bioconda +dependencies: + - deeptools + - samtools == 1.13 \ No newline at end of file diff --git a/src/AOS/envs/seqtools.yml b/src/AOS/envs/seqtools.yml index 3e1a7c7..0cdda06 100644 --- a/src/AOS/envs/seqtools.yml +++ b/src/AOS/envs/seqtools.yml @@ -3,14 +3,11 @@ channels: - bioconda dependencies: - python == 3.9 - - numpy == 1.20 - bedtools == 2.30.0 - bioconductor-edger == 3.32.0 - r-tidyverse - - macs2 == 2.2.7.1 + - r-essentials + - macs2 - samtools == 1.13 - htslib == 1.13 - - uropa >=4.0.0 - - pip - - pip: - - deeptools + - uropa >=4.0.0 \ No newline at end of file diff --git a/src/AOS/preflight.py b/src/AOS/preflight.py index 8c03c36..87a7de6 100644 --- a/src/AOS/preflight.py +++ b/src/AOS/preflight.py @@ -75,6 +75,9 @@ def retabspath(_p): ), 'meme': os.path.join( self.dirs['scriptsdir'], 'envs', 'meme.yml' + ), + 'deeptools': os.path.join( + self.dirs['scriptsdir'], 'envs', 'deeptools.yml' ) } self.rules = { diff --git a/src/AOS/rules/DE.smk b/src/AOS/rules/DE.smk index 97b927f..d448192 100644 --- a/src/AOS/rules/DE.smk +++ b/src/AOS/rules/DE.smk @@ -126,7 +126,7 @@ rule heatmaps: beds = lambda wildcards: wildcards.comparison + '/*bed', samples = lambda wildcards: readsamples(wildcards.comparison) threads: 10 - conda: config['envs']['seqtools'] + conda: config['envs']['deeptools'] shell:''' computeMatrix reference-point -R {params.beds} -S {params.samples} -o {output.matrix} \ --referencePoint center \ diff --git a/src/AOS/rules/peaks.smk b/src/AOS/rules/peaks.smk index 1394709..9706358 100644 --- a/src/AOS/rules/peaks.smk +++ b/src/AOS/rules/peaks.smk @@ -29,7 +29,7 @@ rule alSieve: output: shortb = 'sieve/{sample}.bam', qc = temp('qc/{sample}_sieve.txt') - conda: config['envs']['seqtools'] + conda: config['envs']['deeptools'] threads: 10 params: rar = '--blackListFileName {}'.format(config['files']['readattractingregions']), @@ -142,8 +142,8 @@ rule countmatrix: rar = '-bl {}'.format( config['files']['readattractingregions'] ) - threads: 10 - conda: config['envs']['seqtools'] + threads: 40 + conda: config['envs']['deeptools'] shell:''' multiBamSummary BED-file --BED {input.peaks} {params.rar} \ -p {threads} --outRawCounts {output.tsv} -o {output.npz} \ @@ -159,7 +159,7 @@ rule multibigwigsum: output: 'peakset/counts.bw.npz' threads: 1 - conda: config['envs']['seqtools'] + conda: config['envs']['deeptools'] shell:''' multiBigwigSummary BED-file --BED {input.peaks} -o {output} -b {input.samples} ''' @@ -172,7 +172,7 @@ rule plotPCA: threads: 1 params: colstr = PCA_colors(config) - conda: config['envs']['seqtools'] + conda: config['envs']['deeptools'] shell:''' plotPCA --corData {input.peakset} -o {output} --transpose --ntop 5000 {params.colstr} ''' \ No newline at end of file diff --git a/src/AOS/rules/qc.smk b/src/AOS/rules/qc.smk index f7deebc..d9af710 100644 --- a/src/AOS/rules/qc.smk +++ b/src/AOS/rules/qc.smk @@ -37,7 +37,7 @@ rule fragsize: output: 'qc/fragsize.tsv' threads: 10 - conda: config['envs']['seqtools'] + conda: config['envs']['deeptools'] shell:''' bamPEFragmentSize -b {input.bams} \ -p {threads} \ @@ -69,7 +69,7 @@ rule bigwigs: rar = config['files']['readattractingregions'], sample = "{sample}" threads: 10 - conda: config['envs']['seqtools'] + conda: config['envs']['deeptools'] shell:''' SCALEFAC=$(grep {params.sample} {input.scalefactors} | cut -f2 -d ' ') bamCoverage --scaleFactor $SCALEFAC \ From db664610d0f59f36636e4765067594a28d55f1d4 Mon Sep 17 00:00:00 2001 From: Ward D Date: Wed, 29 Mar 2023 15:17:12 +0200 Subject: [PATCH 5/5] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index decbdad..4e2a083 100644 --- a/README.md +++ b/README.md @@ -13,8 +13,8 @@ Fasta headers un field 0 (space delimited) are not allowed to contain a pipe cha set up the environment: > git clone git@github.com:maxplanck-ie/ATACofthesnake.git > cd ATACofthesnake -> conda create -n ATACofthesnake python=3 -> conda activate ATACofthesnake +> conda env create -f env.yml -n aos +> conda activate aos > pip install ./ ## Quickstart