feat: amplicon utils #612

Open · wants to merge 6 commits into master
2 changes: 2 additions & 0 deletions workflow/Snakefile
@@ -43,6 +43,8 @@ include: "rules/generate_output.smk"
include: "rules/benchmarking.smk"
include: "rules/long_read.smk"
include: "rules/lineage_variant_calling.smk"
include: "rules/amplicon_stats.smk"
include: "rules/freyja.smk"


if config["data-handling"]["use-data-handling"]:
6 changes: 6 additions & 0 deletions workflow/envs/freyja.yaml
@@ -0,0 +1,6 @@
channels:
  - conda-forge
  - bioconda
  - nodefaults
dependencies:
  - freyja =1.4.5
1 change: 1 addition & 0 deletions workflow/envs/pysam.yaml
@@ -9,3 +9,4 @@ dependencies:
  - requests =2.26
  - dnachisel =3.2
  - gffutils =0.10
  - natsort =5.5.0
1 change: 1 addition & 0 deletions workflow/report/lineage-variant-overview.rst
@@ -0,0 +1 @@
Overview report of all given variants from covariants.org, aggregated over all samples.
87 changes: 87 additions & 0 deletions workflow/rules/amplicon_stats.smk
@@ -0,0 +1,87 @@
rule get_ampliconstats:
    input:
        bed="resources/primer-v4.bed",
        bam="results/{date}/read-sorted/pe~position/{sample}.hardclipped.bam",
    output:
        "results/{date}/ampliconstats/{sample}.ampliconstats.txt",
    log:
        "logs/{date}/samtools/ampliconstats/{sample}.log",
    conda:
        "../envs/samtools.yaml"
    shell:
        "samtools ampliconstats {input.bed} {input.bam} > {output} 2> {log}"

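The downstream scripts in this PR consume these plain-text reports. As a hedged illustration only (this helper is not part of the PR, and the record layout is assumed from typical `samtools ampliconstats` output: a record code, the input file name, then one value per amplicon, tab-separated), the per-amplicon read counts in the FREADS rows could be pulled into pandas like this:

# Hypothetical helper, not included in this PR: extract per-amplicon read
# counts (FREADS rows) from a samtools ampliconstats report.
import pandas as pd


def read_freads(path: str) -> pd.Series:
    with open(path) as infile:
        for line in infile:
            fields = line.rstrip("\n").split("\t")
            if fields and fields[0] == "FREADS":
                counts = [int(x) for x in fields[2:]]
                return pd.Series(
                    counts,
                    index=[f"amplicon_{i}" for i in range(1, len(counts) + 1)],
                )
    raise ValueError(f"no FREADS record found in {path}")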

rule plot_amplicon_coverage:
    input:
        bedpe="resources/primer.bedpe",
        amp_stats=lambda wildcards: expand(
            "results/{{date}}/ampliconstats/{sample}.ampliconstats.txt",
            sample=get_samples_for_date(wildcards.date),
        ),
        names="resources/ww-sample-name-dict.csv",
        weeks="resources/ww-sample-week-dict.csv",
    output:
        plot="results/{date}/plots/amplicon-abundance/all.svg",
        stats="results/{date}/tables/amplicon-abundance/all.csv",
    params:
        samples=lambda wildcards: get_samples_for_date(wildcards.date),
    log:
        "logs/{date}/plot-amplicon-coverage.log",
    conda:
        "../envs/python.yaml"
    script:
        "../scripts/plot-amplicon-coverage.py"

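The referenced plot-amplicon-coverage.py is not part of this diff. Purely as a sketch of what such a script might do (assuming matplotlib and pandas are available in envs/python.yaml, and ignoring the sample-name and week dictionaries, whose formats are not shown here), the per-amplicon counts could be combined and plotted roughly like this:

# Rough sketch only — not the plot-amplicon-coverage.py referenced by this PR.
import matplotlib

matplotlib.use("Agg")  # render without a display
import matplotlib.pyplot as plt
import pandas as pd


def freads(path):
    # per-amplicon read counts, as in the parsing sketch above
    with open(path) as infile:
        for line in infile:
            fields = line.rstrip("\n").split("\t")
            if fields and fields[0] == "FREADS":
                return pd.Series([int(x) for x in fields[2:]])


counts = pd.DataFrame(
    {
        sample: freads(path)
        for sample, path in zip(snakemake.params.samples, snakemake.input.amp_stats)
    }
)
abundance = counts / counts.sum()  # fraction of reads per amplicon and sample

fig, ax = plt.subplots(figsize=(12, 6))
im = ax.imshow(abundance.T, aspect="auto", cmap="viridis")
ax.set_xlabel("amplicon")
ax.set_ylabel("sample")
ax.set_yticks(range(len(abundance.columns)))
ax.set_yticklabels(abundance.columns)
fig.colorbar(im, ax=ax, label="fraction of reads")
fig.tight_layout()
fig.savefig(snakemake.output.plot)
abundance.to_csv(snakemake.output.stats)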

rule amplicon_profiles:
    input:
        stats="results/{date}/ampliconstats/{sample}.ampliconstats.txt",
        var_df="results/{date}/lineage-variant-report/{sample}.csv",
    output:
        profiles="results/{date}/amplicon-profiles-sorted/{sample}.txt",
    log:
        "logs/{date}/amplicon-profiles/{sample}.log",
    threads: 32
    conda:
        "../envs/python.yaml"
    script:
        "../scripts/get-amplicon-profiles.py"


rule amplicon_profiles_snv:
    input:
        csv=lambda wildcards: expand(
            "results/{{date}}/lineage-variant-report/{sample}.csv",
            sample=get_samples_for_date(wildcards.date),
        ),
    output:
        ampprofile="results/{date}/amplicon-profiles-snv/all.csv",
    log:
        "logs/{date}/amplicon-profiles-snv/all.log",
    params:
        sample=lambda wildcards: get_samples_for_date(wildcards.date),
    threads: 32
    conda:
        "../envs/python.yaml"
    script:
        "../scripts/get-amplicon-profiles-snv.py"

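get-amplicon-profiles-snv.py is likewise not included in the diff. As a minimal illustration of what an aggregation over the per-sample lineage-variant-report CSVs could look like (the column names are taken from aggregate-lineage-variants.py further down; everything else is an assumption):

# Hypothetical stand-in for get-amplicon-profiles-snv.py — a plain
# concatenation of the per-sample variant reports into one table.
import pandas as pd

tables = []
for path, sample in zip(snakemake.input.csv, snakemake.params.sample):
    table = pd.read_csv(
        path, usecols=["Mutations", "Probability", "Frequency", "ReadDepth"]
    )
    table["Sample"] = sample  # keep track of which report each row came from
    tables.append(table)

pd.concat(tables).to_csv(snakemake.output.ampprofile, index=False)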

rule get_amplicon_stat_output:
    input:
        expand(
            "results/{date}/amplicon-profiles-snv/all.csv",
            date="2023-09-08",
        ),
        expand(
            "results/{date}/plots/amplicon-abundance/all.svg",
            date="2023-09-08",
        ),
        expand(
            "results/{date}/lineage-abundance/all.demix.csv",
            date="2023-09-08",
        ),
56 changes: 56 additions & 0 deletions workflow/rules/freyja.smk
@@ -0,0 +1,56 @@
rule freyja_variants:
    input:
        bam="results/{date}/read-clipping/hardclipped/pe/{sample}/{sample}.bam",
        ref="resources/genomes/MN908947.fasta",
    output:
        var="results/{date}/lineage-abundance/freyja/{sample}.variants.tsv",
        depth="results/{date}/lineage-abundance/freyja/{sample}.depths.txt",
    log:
        "logs/{date}/freyja/variants/{sample}.log",
    params:
        # pass the variants path without its .tsv suffix to --variants
        var=lambda w, output: os.path.splitext(output.var)[0],
    conda:
        "../envs/freyja.yaml"
    shell:
        "freyja variants {input.bam} --variants {params.var} --depths {output.depth} --ref {input.ref} > {log} 2>&1"


rule freyja_demix:
    input:
        var="results/{date}/lineage-abundance/freyja/{sample}.variants.tsv",
        depth="results/{date}/lineage-abundance/freyja/{sample}.depths.txt",
    output:
        "results/{date}/lineage-abundance/freyja/{sample}.demix.txt",
    log:
        "logs/{date}/freyja/demix/{sample}.log",
    conda:
        "../envs/freyja.yaml"
    shell:
        "freyja demix {input.var} {input.depth} --output {output} > {log} 2>&1"


rule aggregate_freyja:
    input:
        demix=lambda wildcards: expand(
            "results/{{date}}/lineage-abundance/freyja/{sample}.demix.txt",
            sample=get_timeseries_samples("sample"),
        ),
    output:
        all="results/{date}/lineage-abundance/freyja/all.demix.csv",
        all_count="results/{date}/lineage-abundance/freyja/all.count.csv",
        pivot="results/{date}/lineage-abundance/freyja/all.pivot.csv",
    log:
        "logs/{date}/aggregate_freyja/all.log",
    params:
        sample=lambda wildcards: get_timeseries_samples("sample"),
        location=lambda wildcards: get_timeseries_samples("location"),
        timestamp=lambda wildcards: get_timeseries_samples("timestamp"),
    conda:
        "../envs/python.yaml"
    script:
        "../scripts/aggregate-freyja.py"


rule get_aggregated_freyja:
    input:
        "results/2022-12-21/lineage-abundance/freyja/all.demix.csv",
57 changes: 53 additions & 4 deletions workflow/rules/lineage_variant_calling.smk
@@ -48,13 +48,62 @@ rule generate_lineage_variant_table:
        "../scripts/generate-lineage-variant-table.py"


-rule get_lineage_variant_table:
+rule aggregate_lineage_variants:
    input:
        csv=lambda wildcards: expand(
            "results/{{date}}/lineage-variant-report/{sample}.csv",
            sample=get_samples_for_date(wildcards.date),
        ),
        annotation="resources/annotation_known_variants.gff.gz",
    output:
        "results/{date}/lineage-variant-overview/all.csv",
    log:
        "logs/{date}/aggregate-lineage-variants/all.log",
    params:
        sample=lambda wildcards: get_samples_for_date(wildcards.date),
    conda:
        "../envs/pysam.yaml"
    script:
        "../scripts/aggregate-lineage-variants.py"


rule get_aggregated_lineage_variant_table:
    input:
        expand(
-            "results/{date}/lineage-variant-report/{sample}.csv",
-            date="2022-05-16",
-            sample=get_samples_for_date("2022-05-16"),
+            "results/{date}/lineage-variant-report/all.csv",
+            date="2022-12-21",
        ),


rule render_datavzrd_config:
    input:
        template=workflow.source_path(
            "../../resources/lineage-variant-overview.template.datavzrd.yaml"
        ),
        table="results/{date}/lineage-variant-overview/all.csv",
    output:
        "results/{date}/datavzrd/variant-table-model.yaml",
    log:
        "logs/{date}/yte/render-datavzrd-config/variant-table-model.log",
    template_engine:
        "yte"


rule render_lineage_variant_table:
    input:
        config="results/{date}/datavzrd/variant-table-model.yaml",
        table="results/{date}/lineage-variant-overview/all.csv",
    output:
        report(
            directory("results/{date}/lineage-variant-report/all"),
            htmlindex="index.html",
            caption="../report/lineage-variant-overview.rst",
            category="2. Variant Call Details",
            subcategory=" VOC variant overview",
        ),
    log:
        "logs/{date}/lineage-variant-overview/all.log",
    wrapper:
        "v2.1.0/utils/datavzrd"



use rule overview_table_html as generate_lineage_variant_report with:
57 changes: 57 additions & 0 deletions workflow/scripts/aggregate-lineage-variants.py
@@ -0,0 +1,57 @@
# Copyright 2022 Thomas Battenfeld, Alexander Thomas, Johannes Köster.
# Licensed under the BSD 2-Clause License (https://opensource.org/licenses/BSD-2-Clause)
# This file may not be copied, modified, or distributed
# except according to those terms.

import sys
import pandas as pd
import numpy as np
import gffutils
from natsort import natsort_keygen

sys.stderr = open(snakemake.log[0], "w")

data = pd.DataFrame()

for file, sample in zip(snakemake.input.csv, snakemake.params.sample):
    data = pd.concat(
        [
            data,
            pd.read_csv(
                file, usecols=["Mutations", "Probability", "Frequency", "ReadDepth"]
            ),
        ]
    )
    # only the columns of the file just read still carry their generic names,
    # so they are the ones prefixed with the current sample
    data.rename(columns={"Probability": sample + ": Probability"}, inplace=True)
    data.rename(columns={"Frequency": sample + ": VAF"}, inplace=True)
    data.rename(columns={"ReadDepth": sample + ": Read Depth"}, inplace=True)

# collapse duplicate mutation rows, keeping the maximum value per sample column
data = data.groupby("Mutations").agg(
    {column: "max" for column in data.columns if column != "Mutations"}
)
# add feature column for sorting
data["Features"] = data.index.to_series().str.extract(r"(.+)[:].+|\*")

# position of variant for sorting and change type
data["Position"] = data.index.to_series().str.extract(
r"(.*:?[A-Z]+|\*$|-)([0-9]+)([A-Z]+$|\*$|-)$"
)[1]
# data = data.astype({"Position": "int64"})

# generate sorting list from .gff with correct order of features
gff = gffutils.create_db(snakemake.input.annotation, dbfn=":memory:")
gene_start = {gene["gene_name"][0]: gene.start for gene in gff.features_of_type("gene")}
sorter = [k[0] for k in sorted(gene_start.items(), key=lambda item: item[1])]
sorterIndex = dict(zip(sorter, range(len(sorter))))
data["Features_Rank"] = data["Features"].map(sorterIndex)

data["Features_Rank"].replace(np.NaN, 12, inplace=True)

data.drop(index=["Lineage", "Similarity"], inplace=True)
data.sort_values(
    by=["Features_Rank", "Position"],
    ascending=[True, True],
    na_position="last",
    inplace=True,
    key=natsort_keygen(),
)

# drop the helper rank column; "Mutations" is already the index after the groupby
data.drop(columns=["Features_Rank"], inplace=True)
data.to_csv(snakemake.output[0])
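
One note on the natsort usage above, with a tiny standalone illustration (not part of the workflow): the extracted positions are strings, so a plain lexicographic sort would order them incorrectly, which is exactly what natsort_keygen avoids.

from natsort import natsort_keygen

positions = ["22995", "3037", "241", "23403"]
print(sorted(positions))                        # ['22995', '23403', '241', '3037']
print(sorted(positions, key=natsort_keygen()))  # ['241', '3037', '22995', '23403']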
50 changes: 50 additions & 0 deletions workflow/scripts/aggregate-freyja.py
@@ -0,0 +1,50 @@
# Copyright 2022 Thomas Battenfeld, Alexander Thomas, Johannes Köster.
# Licensed under the BSD 2-Clause License (https://opensource.org/licenses/BSD-2-Clause)
# This file may not be copied, modified, or distributed
# except according to those terms.

import sys

import pandas as pd
import numpy as np
from natsort import index_natsorted

# send tracebacks to the rule's log file, as in aggregate-lineage-variants.py
sys.stderr = open(snakemake.log[0], "w")

data = pd.DataFrame()
data2 = pd.DataFrame()

for file, sample, location, timestamp in zip(
    snakemake.input.demix,
    snakemake.params.sample,
    snakemake.params.location,
    snakemake.params.timestamp,
):
    lineages = []
    abundances = []
    # name, num = sample.split("-")
    with open(file, "r") as infile:
        for line in infile.read().splitlines():
            if line.startswith("lineages"):
                lineages = line.split("\t")[1].split(" ")
            elif line.startswith("abundances"):
                abundances = line.split("\t")[1].split(" ")
    rounded_abundances = ["%.2f" % float(elem) for elem in abundances]
    # one cell per timestamp and location, listing lineage:abundance pairs
    data.at[timestamp, location] = ", ".join(
        [lin + ":" + ab for lin, ab in zip(lineages, rounded_abundances)]
    )
    data2.at[timestamp, location] = len(lineages)

# natural sort of the timestamp index
data.sort_index(key=lambda x: np.argsort(index_natsorted(data.index)), inplace=True)
data.to_csv(snakemake.output.all)
data2.to_csv(snakemake.output.all_count)


data = pd.DataFrame()

for file, sample in zip(snakemake.input.demix, snakemake.params.sample):
    lineages = []
    abundances = []
    with open(file, "r") as infile:
        for line in infile.read().splitlines():
            if line.startswith("lineages"):
                lineages = line.split("\t")[1].split(" ")
            elif line.startswith("abundances"):
                abundances = line.split("\t")[1].split(" ")
    rounded_abundances = ["%.2f" % float(elem) for elem in abundances]
    # pivot: one row per lineage, one column per sample
    for lin, ab in zip(lineages, rounded_abundances):
        data.at[lin, sample] = ab

# natural sort of the lineage index
data.sort_index(key=lambda x: np.argsort(index_natsorted(data.index)), inplace=True)
# data.sort_index(key= lambda x: np.argsort(data.columns.to_series().str[1:].astype(int)), inplace=True)
# order sample columns by their trailing two digits
data = data[sorted(data.columns, key=lambda x: tuple(map(int, x[-2:])))]
data.to_csv(snakemake.output.pivot)
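
For context on what both loops parse: `freyja demix` writes a small key/value text file, and the script only relies on its "lineages" and "abundances" rows. A standalone illustration (lineage names and numbers below are invented):

# Illustrative content of a {sample}.demix.txt file (values invented);
# only the "lineages" and "abundances" rows are used above.
example = "lineages\tBQ.1.1 XBB.1.5\nabundances\t0.62000000 0.38000000\n"

for line in example.splitlines():
    if line.startswith("lineages"):
        lineages = line.split("\t")[1].split(" ")
    elif line.startswith("abundances"):
        abundances = line.split("\t")[1].split(" ")

print(dict(zip(lineages, ["%.2f" % float(a) for a in abundances])))
# {'BQ.1.1': '0.62', 'XBB.1.5': '0.38'}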