Skip to content

Commit

Permalink
Add workflow to postprocess the output of robomics/call_tad_cliques
Browse files Browse the repository at this point in the history
  • Loading branch information
robomics committed Aug 26, 2023
1 parent c7ac53b commit a69346e
Show file tree
Hide file tree
Showing 3 changed files with 245 additions and 0 deletions.
162 changes: 162 additions & 0 deletions bin/plot_maximal_clique_sizes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
#!/usr/bin/env python3

# Copyright (C) 2023 Roberto Rossini <[email protected]>
# SPDX-License-Identifier: MIT


import argparse
import pathlib
from typing import Dict, Iterable, List, Tuple

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns


def make_cli() -> argparse.ArgumentParser:
def existing_file(arg):
if (file := pathlib.Path(arg)).exists():
return file

raise FileNotFoundError(arg)

cli = argparse.ArgumentParser(description="Plot maximal clique size distribution across conditions.")

cli.add_argument(
"cliques",
nargs="+",
type=existing_file,
help="Path to one or more TSV file with the list of cliques.",
)
cli.add_argument(
"-o",
"--output-prefix",
required=True,
type=pathlib.Path,
help="Path to output prefix.",
)
cli.add_argument(
"--labels",
nargs="+",
type=str,
help="Sample labels to use for plotting.\n" "When not provided, labels are inferred from input file names.",
)

cli.add_argument(
"-s",
"--stat",
type=str,
choices={"count", "probability", "percent", "density"},
default="count",
help="Aggregate statistic to compute in each bin.",
)

cli.add_argument(
"--raise-on-empty-files",
action="store_true",
default=False,
help="Raise an exception when any of the input file(s) is empty.",
)

cli.add_argument(
"--force",
action="store_true",
default=False,
help="Force overwrite existing files.",
)

return cli


def handle_path_collisions(*paths: pathlib.Path) -> None:
collisions = [p for p in paths if p.exists()]

if len(collisions) != 0:
collisions = "\n - ".join((str(p) for p in collisions))
raise RuntimeError(
"Refusing to overwrite file(s):\n" f" - {collisions}\n" "Pass --force to overwrite existing file(s)."
)


def save_plot_to_file(fig: plt.Figure, outprefix: pathlib.Path, force: bool, close_after_save: bool = True) -> None:
png = outprefix.with_suffix(".png")
svg = outprefix.with_suffix(".svg")
if not force:
handle_path_collisions(png, svg)

outprefix.parent.mkdir(exist_ok=True, parents=True)

fig.savefig(png, bbox_inches="tight", dpi=300)
fig.savefig(svg, bbox_inches="tight")
if close_after_save:
plt.close(fig)


def read_cliques(cliques: pathlib.Path, raise_on_empty_files: bool) -> pd.DataFrame:
df = pd.read_table(cliques).rename(columns={"name": "clique"})
assert df.columns.tolist() == ["clique", "tad_ids", "size"]

if raise_on_empty_files and len(df) == 0:
raise RuntimeError(f"Unable to read any record from {cliques}")

return df.set_index("clique")


def compute_counts_minmax(cliques: Iterable[pd.DataFrame]) -> Tuple[int, int]:
lb = min((df["size"].min() for df in cliques))
ub = max((df["size"].max() for df in cliques))
return int(lb), int(ub)


def plot_maximal_clique_sizes(cliques: Dict[str, pd.DataFrame], stat: str) -> plt.Figure:
fig, ax = plt.subplots(1, 1)

data = []
for label, df in cliques.items():
data.extend([[label, size] for size in df["size"]])
df = pd.DataFrame(data, columns=["label", "size"])
sns.histplot(
df,
x="size",
hue="label",
multiple="dodge",
ax=ax,
shrink=0.8,
discrete=True,
stat=stat,
common_norm=False,
)

ax.set_xticks(range(df["size"].min(), df["size"].max() + 1))

return fig


def generate_labels(paths: Iterable[pathlib.Path]) -> List[str]:
labels = []
for p in paths:
labels.append(str(p.name).rstrip("".join(p.suffixes)))

return labels


def main():
args = vars(make_cli().parse_args())

path_to_cliques = args["cliques"]
labels = args.get("labels")

if labels is None:
labels = generate_labels(path_to_cliques)

if len(labels) != len(path_to_cliques):
raise RuntimeError(f"Expected {len(path_to_cliques)} labels, found {len(labels)}")

cliques = {label: read_cliques(path, args["raise_on_empty_files"]) for label, path in zip(labels, args["cliques"])}

fig = plot_maximal_clique_sizes(cliques, stat=args["stat"])
save_plot_to_file(fig, args["output_prefix"], args["force"])


if __name__ == "__main__":
main()
23 changes: 23 additions & 0 deletions configs/postprocess_call_tad_cliques.config
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// Copyright (C) 2023 Roberto Rossini <[email protected]>
//
// SPDX-License-Identifier: MIT

params {
data_dir = 'data'
input_dir = "${data_dir}/output/cliques"
output_dir = "${data_dir}/output/cliques"

cliques_cis = "${input_dir}/*_cis_cliques.tsv.gz"
cliques_trans = "${input_dir}/*_trans_cliques.tsv.gz"
cliques_all = "${input_dir}/*_all_cliques.tsv.gz"

cliques_cis_wt = "${input_dir}/aggregate_on_wt/*_cis_cliques.tsv.gz"
cliques_trans_wt = "${input_dir}/aggregate_on_wt/*_trans_cliques.tsv.gz"
cliques_all_wt = "${input_dir}/aggregate_on_wt/*_all_cliques.tsv.gz"

labels = ['MCF10A', 'MCF10AT1', 'MCF10CA1a']
}

process {
container = 'ghcr.io/paulsengroup/2022-mcf10a-cancer-progression/plotting:1.0.0'
}
60 changes: 60 additions & 0 deletions workflows/postprocess_call_tad_cliques.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#!/usr/bin/env nextflow
// Copyright (C) 2023 Roberto Rossini <[email protected]>
//
// SPDX-License-Identifier: MIT

nextflow.enable.dsl=2


workflow {
Channel.empty()
.mix(Channel.fromPath(params.cliques_cis, checkIfExists: true).collect().map { tuple("cis-only", it) })
.mix(Channel.fromPath(params.cliques_trans, checkIfExists: true).collect().map { tuple("trans-only", it) })
.mix(Channel.fromPath(params.cliques_all, checkIfExists: true).collect().map { tuple("all", it) })
.map{ tuple(it[0], it[1], params.labels.join(","), "") }
.set { cliques }

Channel.empty()
.mix(Channel.fromPath(params.cliques_cis_wt, checkIfExists: true).collect().map { tuple("cis-only", it) })
.mix(Channel.fromPath(params.cliques_trans_wt, checkIfExists: true).collect().map { tuple("trans-only", it) })
.mix(Channel.fromPath(params.cliques_all_wt, checkIfExists: true).collect().map { tuple("all", it) })
.map{ tuple(it[0], it[1], params.labels.join(","), "aggregate_on_wt") }
.set { cliques_wt }

plot_maximal_clique_sizes(
cliques.mix(cliques_wt)
)
}

process plot_maximal_clique_sizes {
publishDir "${params.output_dir}/${output_folder}/plots", mode: 'copy'


tag "${type}"

input:
tuple val(type),
path(cliques),
val(labels),
val(output_folder)

output:
tuple val(type),
path("*.png"), emit: png

tuple val(type),
path("*.svg"), emit: svg

shell:
'''
plot_maximal_clique_sizes.py \\
*{WT,T1,C1}_*cliques.tsv.gz \\
-o '!{type}_maximal_clique_size_abs'
plot_maximal_clique_sizes.py \\
*{WT,T1,C1}_*cliques.tsv.gz \\
--stat='density' \\
-o '!{type}_maximal_clique_size_rel'
'''
}

0 comments on commit a69346e

Please sign in to comment.