-
Notifications
You must be signed in to change notification settings - Fork 1
/
Snakefile
113 lines (97 loc) · 3.64 KB
/
Snakefile
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
chris_path = "/home/leect/workspace/slide"
samples = [
"mmu_dRNA_3T3_mion_1",
"mmu_dRNA_3T3_PION_1",
]
rule run_all:
input:
expand("prep/experiments/{s}/transcript_bin_distribution.tab", s = samples),
expand("prep/experiments/{s}/joined_features.tab", s=samples),
expand("prep/experiments/{s}/joined_features_expanded.tab", s=samples),
expand("prep/experiments/{s}/joined_features_expanded_agg{type}_cleaned.tab", s = samples, type = "max")
rule run_extract_bam_features:
input:
bam= chris_path + "/samples/{s}/align/reads.sanitize.noribo.toTranscriptome.sorted.uxi.bam",
bed= chris_path + "/data/mm10/genic_elements.mrna.bed",
output:
meta_coords= "{s}.meta_coordinates.tab",
hist5p= "{s}.hist5p.tab",
hist3p= "{s}.hist3p.tab",
pdf= "{s}.pdf",
shell:
"""
python scripts/extract_bam_meta_features.py \
-i {input.bam} \
-b {input.bed} \
--bins 10 \
-n {wildcards.s} \
--pdf {output.pdf} \
--ohist_5p {output.hist5p} \
--ohist_3p {output.hist3p} \
> {output.meta_coords}
"""
rule aggregate_transcript_meta_features:
input:
meta_coords = "{s}.meta_coordinates.tab",
output:
transcr_bin_distro = "prep/experiments/{s}/transcript_bin_distribution.tab",
shell:
"""
python scripts/aggregate-transcript-meta-features.py \
--input {input.meta_coords} \
--bins 10 \
> {output.transcr_bin_distro}
"""
rule join_features:
input:
transcr_bin_distro = "prep/experiments/{s}/transcript_bin_distribution.tab",
transc_feature = "prep/experiments/{s}/features_v1.csv"
params: key1 = "transcript", key2 = "transcript_id"
output: joined_table = "prep/experiments/{s}/joined_features.tab"
shell:
"""
awk 'BEGIN {{ FS = \",\"; OFS = \"\t\" }} {{$1=$1; print }}' {input.transc_feature} |
table-join.py -t - \
-g {input.transcr_bin_distro} \
-c {params.key1} \
-d {params.key2} > {output.joined_table}
"""
rule expand_hist_cols:
input:
a="prep/experiments/{s}/joined_features.tab"
output:
a="prep/experiments/{s}/joined_features_expanded.tab"
run:
import pandas as pd
df = pd.read_csv(input.a, sep = "\t", header = 0)
df[["hist5p_"+str(i) for i in range(10)]] = df['hist5p'].str.split(',', expand = True)
df[["hist3p_"+str(i) for i in range(10)]] = df['hist3p'].str.split(',', expand = True)
df.drop(["hist5p","hist3p"], axis=1, inplace = True)
df.to_csv(output.a, index = False, sep = "\t")
rule aggregate_features:
input:
"prep/experiments/{s}/joined_features_expanded.tab"
output:
"prep/experiments/{s}/joined_features_expanded_agg{type}_.tab"
shell:
"""
python prep/scripts/agg_per_gene.py \
-f {input} \
-g gene \
-c n_readlength \
-t {wildcards.type} \
-d '\t' \
-o {output}
"""
rule clean_halflives:
input:
a="prep/experiments/{s}/joined_features_expanded_agg{type}_.tab"
output:
a="prep/experiments/{s}/joined_features_expanded_agg{type}_cleaned.tab"
run:
import pandas as pd
df = pd.read_csv(input.a, sep = "\t", header = 0)
df = df[df["t5"] != "--"]
df["t5"] = pd.to_numeric(df["t5"])
df = df[(df["t5"] > 0.1) & (df["t5"]<12)]
df.to_csv(output.a, index = False, sep = "\t")