From 2837ae558e98a3984cd86c1889b602df4544ea7e Mon Sep 17 00:00:00 2001
From: Anusri Pampari
Date: Tue, 27 Aug 2024 06:59:01 -0700
Subject: [PATCH] upload json scripts
---
.../chrombpnet/atac_prepare.py | 116 +++++++
.../chrombpnet/dnase_prepare.py | 115 +++++++
.../READMES/counts.deepshap.README | 0
.../counts_contrib_upload/atac_tar.py | 215 +++++++++++++
.../atac_tar_k5_and_hep.py | 193 +++++++++++
.../counts_contrib_upload/dnase_tar.py | 222 +++++++++++++
.../dnase_tar_k5_and_hep.py | 209 ++++++++++++
.../bias_models/READMEs/bias.training.README | 63 ++++
.../atac_bias_model_chrombpnet.csv | 6 +
.../bias_models/atac_bias_model_upload.py | 10 +-
.../bias_models/atac_bias_upload_utils.py | 32 +-
.../chrombpnet/READMEs/bias.models.README | 92 ++++++
.../chrombpnet/READMEs/bias.training.README | 63 ++++
.../chrombpnet/READMEs/models.README | 98 ++++++
.../chrombpnet/READMEs/training.README | 66 ++++
.../chrombpnet/atac_bias_model_chrombpnet.csv | 26 ++
.../chrombpnet/atac_bias_model_upload.py | 260 +++++++++++++++
.../chrombpnet/atac_bias_upload_utils.py | 174 ++++++++++
.../dnase_bias_model_chrombpnet.csv | 0
.../chrombpnet/dnase_bias_model_upload.py | 283 ++++++++++++++++
.../chrombpnet/dnase_bias_upload_utils.py | 301 ++++++++++++++++++
.../get_gc_matched_negatives_test.py | 175 ++++++++++
.../make_test_negatives/run_script.py | 26 ++
.../make_test_negatives/run_script_dnase.py | 36 +++
.../chrombpnet/make_test_negatives/script.sh | 15 +
.../make_test_negatives/script_dnase.sh | 16 +
.../chrombpnet/model_upload_utils.py | 235 ++++++++++++++
.../bias_models/chrombpnet/temp.sh | 2 +
.../chrombpnet/READMEs/bias.models.README | 92 ++++++
.../chrombpnet/READMEs/bias.training.README | 63 ++++
.../chrombpnet/atac_bias_model_chrombpnet.csv | 26 ++
.../atac_prepare_file_for_upload_models.py | 159 +++++++++
.../dnase_prepare_file_for_upload_models.py | 204 ++++++++++++
.../chrombpnet/upload_utils.py | 281 ++++++++++++++++
.../dnase_prepare_file_for_upload_models.py | 2 +-
.../chrombpnet_models/upload_utils.py | 6 +-
.../READMEs/modisco.report.README | 0
.../modisco_uploads/atac_prepare.py | 75 +++++
.../modisco_uploads/dnase_prepare.py | 75 +++++
.../chrombpnet/READMEs/bc.predicted.README | 71 +++++
.../chrombpnet/READMEs/predicted.README | 71 +++++
.../chrombpnet/atac_prepare.py | 32 ++
.../chrombpnet/atac_prepare_tar.py | 139 ++++++++
.../chrombpnet/atac_prepare_tar_w_bias.py | 139 ++++++++
.../chrombpnet/dnase_prepare.py | 31 ++
.../chrombpnet/dnase_prepare_tar.py | 146 +++++++++
.../chrombpnet/dnase_prepare_tar_w_bias.py | 144 +++++++++
.../dnase_prepare_tar.py | 0
.../READMES/profile.deepshap.README | 0
.../profile_contrib_upload/atac_tar.py | 215 +++++++++++++
.../atac_tar_k5_and_hep.py | 193 +++++++++++
.../profile_contrib_upload/dnase_tar.py | 221 +++++++++++++
.../dnase_tar_k5_and_hep.py | 212 ++++++++++++
53 files changed, 5627 insertions(+), 19 deletions(-)
create mode 100644 upload_jsons/upload_jsons_scripts/counts_bigwig_uploads/chrombpnet/atac_prepare.py
create mode 100644 upload_jsons/upload_jsons_scripts/counts_bigwig_uploads/chrombpnet/dnase_prepare.py
create mode 100644 upload_jsons/upload_jsons_scripts/counts_contrib_upload/READMES/counts.deepshap.README
create mode 100644 upload_jsons/upload_jsons_scripts/counts_contrib_upload/atac_tar.py
create mode 100644 upload_jsons/upload_jsons_scripts/counts_contrib_upload/atac_tar_k5_and_hep.py
create mode 100644 upload_jsons/upload_jsons_scripts/counts_contrib_upload/dnase_tar.py
create mode 100644 upload_jsons/upload_jsons_scripts/counts_contrib_upload/dnase_tar_k5_and_hep.py
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/bias_models/READMEs/bias.training.README
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_model_chrombpnet.csv
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/bias.models.README
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/bias.training.README
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/models.README
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/training.README
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/atac_bias_model_chrombpnet.csv
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/atac_bias_model_upload.py
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/atac_bias_upload_utils.py
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/dnase_bias_model_chrombpnet.csv
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/dnase_bias_model_upload.py
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/dnase_bias_upload_utils.py
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/get_gc_matched_negatives_test.py
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/run_script.py
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/run_script_dnase.py
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/script.sh
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/script_dnase.sh
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/model_upload_utils.py
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/temp.sh
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/READMEs/bias.models.README
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/READMEs/bias.training.README
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/atac_bias_model_chrombpnet.csv
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/atac_prepare_file_for_upload_models.py
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/dnase_prepare_file_for_upload_models.py
create mode 100644 upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/upload_utils.py
create mode 100644 upload_jsons/upload_jsons_scripts/modisco_uploads/READMEs/modisco.report.README
create mode 100644 upload_jsons/upload_jsons_scripts/modisco_uploads/atac_prepare.py
create mode 100644 upload_jsons/upload_jsons_scripts/modisco_uploads/dnase_prepare.py
create mode 100644 upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/READMEs/bc.predicted.README
create mode 100644 upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/READMEs/predicted.README
create mode 100644 upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/atac_prepare.py
create mode 100644 upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/atac_prepare_tar.py
create mode 100644 upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/atac_prepare_tar_w_bias.py
create mode 100644 upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/dnase_prepare.py
create mode 100644 upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/dnase_prepare_tar.py
create mode 100644 upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/dnase_prepare_tar_w_bias.py
create mode 100644 upload_jsons/upload_jsons_scripts/profile_bigwigs_uploads/dnase_prepare_tar.py
create mode 100644 upload_jsons/upload_jsons_scripts/profile_contrib_upload/READMES/profile.deepshap.README
create mode 100644 upload_jsons/upload_jsons_scripts/profile_contrib_upload/atac_tar.py
create mode 100644 upload_jsons/upload_jsons_scripts/profile_contrib_upload/atac_tar_k5_and_hep.py
create mode 100644 upload_jsons/upload_jsons_scripts/profile_contrib_upload/dnase_tar.py
create mode 100644 upload_jsons/upload_jsons_scripts/profile_contrib_upload/dnase_tar_k5_and_hep.py
diff --git a/upload_jsons/upload_jsons_scripts/counts_bigwig_uploads/chrombpnet/atac_prepare.py b/upload_jsons/upload_jsons_scripts/counts_bigwig_uploads/chrombpnet/atac_prepare.py
new file mode 100644
index 00000000..25898df8
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/counts_bigwig_uploads/chrombpnet/atac_prepare.py
@@ -0,0 +1,116 @@
+import os
+import json
+import pandas as pd
+import pybedtools
+
+encids = ["K562", "HEPG2", "IMR90", "H1ESC", "GM12878"]
+#encids = ["K562", "HEPG2"]
+#encids = [ "IMR90", "H1ESC", "GM12878"]
+
+def make_bb_file(in_bed, out_bb):
+ assert(os.path.isfile("atac_temp.bed")==False)
+ command = "zcat "+in_bed+" | LC_COLLATE=C sort -k1,1 -k2,2n > atac_temp.bed"
+ print(command)
+ os.system(command)
+
+ command = "bedToBigBed atac_temp.bed /oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/reference/chrom.sizes "+out_bb
+ print(command)
+ os.system(command)
+
+
+ command = "rm atac_temp.bed"
+ print(command)
+ os.system(command)
+
+chrs = list(map(str,list(range(1,23))))
+chrs = ['chr'+i for i in chrs]
+chrs = chrs + ['chrX', 'chrY']
+
+def make_sel_bedfile(in_bed, out_bed):
+    input_bed = pd.read_csv(in_bed, sep="\t", header=None)
+    print(input_bed.shape)
+    input_bed = input_bed[input_bed[0].isin(chrs)]
+    print(input_bed.shape)
+
+    input_bed[1] = (input_bed[1]+input_bed[9]-500).clip(lower=0)
+    input_bed[2] = input_bed[1] + 1000
+    print(input_bed.head())
+    x = pybedtools.BedTool.from_dataframe(input_bed[[0,1,2]])
+    x = x.sort().merge()
+    output_bed = x.to_dataframe()
+    print(output_bed.shape)
+    print(output_bed.head())
+    output_bed.to_csv(out_bed, sep='\t', header=False, index=False)
+
+encode_id = {"K562": "ENCSR868FGK",
+"GM12878": "ENCSR637XSC",
+"HEPG2": "ENCSR291GJU",
+"IMR90": "ENCSR200OML",
+"H1ESC": "GSE267154"}
+
+odir='atac/'
+for encid in encids:
+ print(encid)
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores_new_compressed.stats"
+ if os.path.isfile(ofile):
+ counts_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores_new_compressed.bw"
+ else:
+ print(ofile)
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores.stats"
+ if os.path.isfile(ofile):
+ counts_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores.bw"
+ else:
+ counts_bw = None
+ print(ofile)
+ continue
+
+
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.stats"
+ if os.path.isfile(ofile):
+ profile_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.bw"
+ else:
+ print(ofile)
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores.stats"
+ if os.path.isfile(ofile):
+ profile_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores.bw"
+ else:
+ profile_bw = None
+ print(ofile)
+
+ continue
+
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/per_folds.inputs.bed.gz"
+ if os.path.isfile(ofile):
+ sel_path = os.path.join("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/", encid+"/interpret_upload/average_preds/selected.regions.valid.merged.bed.gz" )
+ sel_path_bb = os.path.join("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/", encid+"/interpret_upload/average_preds/selected.regions.valid.merged.bigBed" )
+ if not os.path.isfile(sel_path):
+ make_sel_bedfile(ofile, sel_path)
+
+ if os.path.isfile(sel_path) and (not os.path.isfile(sel_path_bb)):
+ make_bb_file(sel_path, sel_path_bb)
+
+ else:
+ sel_path=None
+ sel_path_bb=None
+ print(ofile)
+ continue
+
+ assert(os.path.isfile(counts_bw)==True)
+ assert(os.path.isfile(profile_bw)==True)
+
+
+ output_json = {}
+ output_json["experiment"] = encode_id[encid]
+ output_json["counts sequence contribution scores bigWig"] = counts_bw
+ output_json["profile sequence contribution scores bigWig"] = profile_bw
+
+ if os.path.isfile(sel_path_bb):
+ output_json["selected regions for predicted signal and sequence contribution scores bigBed"] = sel_path_bb
+
+ if os.path.isfile(sel_path):
+ output_json["selected regions for predicted signal and sequence contribution scores bed"] = sel_path
+
+ if not os.path.isfile(odir+encode_id[encid]+".json"):
+ f = open(odir+encode_id[encid]+".json", "w")
+ json.dump(output_json, f, indent=4)
+ f.close()
diff --git a/upload_jsons/upload_jsons_scripts/counts_bigwig_uploads/chrombpnet/dnase_prepare.py b/upload_jsons/upload_jsons_scripts/counts_bigwig_uploads/chrombpnet/dnase_prepare.py
new file mode 100644
index 00000000..f49d31c2
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/counts_bigwig_uploads/chrombpnet/dnase_prepare.py
@@ -0,0 +1,115 @@
+import os
+import json
+import pandas as pd
+import pybedtools
+
+#encids = ["K562", "HEPG2", "IMR90", "H1ESC", "GM12878"]
+#encids = ["K562", "HEPG2"]
+encids = ["K562", "HEPG2", "IMR90_new", "H1ESC_new", "GM12878_new"]
+
+encode_id = {"HEPG2": "ENCSR149XIL",
+ "K562": "ENCSR000EOT",
+ "IMR90_new": "ENCSR477RTP",
+ "GM12878_new": "ENCSR000EMT",
+ "H1ESC_new": "ENCSR000EMU"}
+odir='dnase/'
+def make_bb_file(in_bed, out_bb):
+    assert(os.path.isfile("dnase_temp.bed")==False)
+    command = "zcat "+in_bed+" | LC_COLLATE=C sort -k1,1 -k2,2n > dnase_temp.bed"
+    print(command)
+    os.system(command)
+
+    command = "bedToBigBed dnase_temp.bed /oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/reference/chrom.sizes "+out_bb
+    print(command)
+    os.system(command)
+
+
+    command = "rm dnase_temp.bed"
+    print(command)
+    os.system(command)
+
+chrs = list(map(str,list(range(1,23))))
+chrs = ['chr'+i for i in chrs]
+chrs = chrs + ['chrX', 'chrY']
+
+def make_sel_bedfile(in_bed, out_bed):
+    input_bed = pd.read_csv(in_bed, sep="\t", header=None)
+    print(input_bed.shape)
+    input_bed = input_bed[input_bed[0].isin(chrs)]
+    print(input_bed.shape)
+
+    input_bed[1] = (input_bed[1]+input_bed[9]-500).clip(lower=0)
+    input_bed[2] = input_bed[1] + 1000
+    print(input_bed.head())
+    x = pybedtools.BedTool.from_dataframe(input_bed[[0,1,2]])
+    x = x.sort().merge()
+    output_bed = x.to_dataframe()
+    print(output_bed.shape)
+    print(output_bed.head())
+    output_bed.to_csv(out_bed, sep='\t', header=False, index=False)
+
+for encid in encids:
+ print(encid)
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores_new_compressed.stats"
+ if os.path.isfile(ofile):
+ counts_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores_new_compressed.bw"
+ else:
+ print(ofile)
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores.stats"
+ if os.path.isfile(ofile):
+ counts_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores.bw"
+ else:
+ counts_bw = None
+ print(ofile)
+ continue
+
+
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.stats"
+ if os.path.isfile(ofile):
+ profile_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.bw"
+ else:
+ print(ofile)
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores.stats"
+ if os.path.isfile(ofile):
+ profile_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores.bw"
+ else:
+ profile_bw = None
+ print(ofile)
+
+ continue
+
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/per_folds.inputs.bed.gz"
+ if os.path.isfile(ofile):
+ sel_path = os.path.join("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/", encid+"/interpret_upload/average_preds/selected.regions.valid.merged.bed.gz" )
+ sel_path_bb = os.path.join("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/", encid+"/interpret_upload/average_preds/selected.regions.valid.merged.bigBed" )
+ if not os.path.isfile(sel_path):
+ make_sel_bedfile(ofile, sel_path)
+
+ if os.path.isfile(sel_path) and (not os.path.isfile(sel_path_bb)):
+ make_bb_file(sel_path, sel_path_bb)
+
+ else:
+ sel_path=None
+ sel_path_bb=None
+ print(ofile)
+ continue
+
+ assert(os.path.isfile(counts_bw)==True)
+ assert(os.path.isfile(profile_bw)==True)
+
+
+ output_json = {}
+ output_json["experiment"] = encode_id[encid]
+ output_json["counts sequence contribution scores bigWig"] = counts_bw
+ output_json["profile sequence contribution scores bigWig"] = profile_bw
+
+ if os.path.isfile(sel_path_bb):
+ output_json["selected regions for predicted signal and sequence contribution scores bigBed"] = sel_path_bb
+
+ if os.path.isfile(sel_path):
+ output_json["selected regions for predicted signal and sequence contribution scores bed"] = sel_path
+
+ if not os.path.isfile(odir+encode_id[encid]+".json"):
+ f = open(odir+encode_id[encid]+".json", "w")
+ json.dump(output_json, f, indent=4)
+ f.close()
diff --git a/upload_jsons/upload_jsons_scripts/counts_contrib_upload/READMES/counts.deepshap.README b/upload_jsons/upload_jsons_scripts/counts_contrib_upload/READMES/counts.deepshap.README
new file mode 100644
index 00000000..e69de29b
diff --git a/upload_jsons/upload_jsons_scripts/counts_contrib_upload/atac_tar.py b/upload_jsons/upload_jsons_scripts/counts_contrib_upload/atac_tar.py
new file mode 100644
index 00000000..5196b748
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/counts_contrib_upload/atac_tar.py
@@ -0,0 +1,215 @@
+import os
+import json
+import pandas as pd
+
+#encids = ["K562", "HEPG2", "IMR90_new", "H1ESC_new", "GM12878_new"]
+encids = ["IMR90", "H1ESC", "GM12878"]
+
+encode_id = {"K562": "ENCSR868FGK",
+"GM12878": "ENCSR637XSC",
+"HEPG2": "ENCSR291GJU",
+"IMR90": "ENCSR200OML",
+"H1ESC": "GSE267154"}
+odir='atac/'
+
+model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/model_dir_atac.csv",sep=",", header=None)
+
+def fetch_per_fold_counts(odir,model_path, encid, i, name):
+
+ model_path_orig=model_path
+ model_path="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/"+model_path.split("/")[-1]
+ data_paths = []
+ log_paths = []
+ log_paths_opt = []
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/fold_"+str(i)+"/"
+ input_h5 = os.path.join(odir, name+"_counts_attribs_reformatted.h5")
+ data_paths.append((input_h5, "seq_contrib.counts.fold_"+str(i)+"."+encid+".h5"))
+
+ #model_path="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/ATAC_SE_04.27.2024//chrombpnet_model"
+
+ # ATAC regions logs
+
+ model_path=model_path+"/chrombpnet_model"
+ input_log=model_path+"/interpret_dnase/full_"+name+".interpret.args.json"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.dnase_regions.fold_"+str(i)+"."+encid+".args.json"))
+ else:
+
+ print(input_log)
+ input_log=model_path+"/interpret_dnase/full_"+name+".interpet.log"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.dnase_regions.fold_"+str(i)+"."+encid+".log"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret_dnase/ATAC_peaks_full.counts.interpret.log1.e"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.dnase_regions.fold_"+str(i)+"."+encid+".stderr.txt"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret_dnase/ATAC_peaks_full.counts.interpret.log1.o"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.dnase_regions.fold_"+str(i)+"."+encid+".stdout.txt"))
+ else:
+ print(input_log)
+
+ # atac regions logs
+
+ input_log=model_path_orig+"/interpret/merged."+name+".interpret.args.json"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.atac_regions.fold_"+str(i)+"."+encid+".args.json"))
+ else:
+ print(input_log)
+
+ input_log=model_path_orig+"/interpret/merged."+name+".interpet.log"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.atac_regions.fold_"+str(i)+"."+encid+".log"))
+ else:
+ print(input_log)
+
+ # atac regions logs
+
+
+ input_log=model_path+"/interpret/full_"+name+".interpret.args.json"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.atac_regions.fold_"+str(i)+"."+encid+".args.json"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret/full_"+name+".interpet.log"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.atac_regions.fold_"+str(i)+"."+encid+".log"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret/full.counts.interpret.log.e"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.atac_regions.fold_"+str(i)+"."+encid+".stderr.txt"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret/full.counts.interpret.log.o"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.atac_regions.fold_"+str(i)+"."+encid+".stdout.txt"))
+ else:
+ print(input_log)
+
+
+ return data_paths, log_paths, log_paths_opt
+
+def fetch_counts_tar(encid, args_json, model_paths, name):
+ success = False
+ args_json["counts sequence contribution scores tar"] = {}
+ readme_file = "READMES/counts.deepshap.README"
+ assert(os.path.isfile(readme_file))
+ args_json["counts sequence contribution scores tar"]["file.paths"] = [(readme_file, "README.md")]
+ args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid] = {"file.paths": []}
+
+ ## full h5 path
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/average_preds/"
+
+ input_h5 = os.path.join(odir, name+"_counts_attribs_reformatted.h5")
+ if os.path.isfile(input_h5):
+ args_json["counts sequence contribution scores tar"]["file.paths"].append((input_h5,"seq_contrib.counts.fold_mean."+encid+".h5"))
+ else:
+ success = False
+ return success, args_json
+
+ ## modisoc h5 path
+
+ modisco_input = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/merge_folds_new_may_05_24/in_peaks.counts_scores_new_compressed.h5"
+ if os.path.isfile(modisco_input):
+ args_json["counts sequence contribution scores tar"]["file.paths"].append((modisco_input,"seq_contrib.counts.fold_mean.modisco_input."+encid+".h5"))
+ else:
+ success = False
+ return success, args_json
+
+ # log files
+
+
+    input_file=model_paths[0]+"/chrombpnet_model/interpret_all/full_"+name+".interpreted_regions_counts.bed"
+    newf="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/average_preds/per_folds.inputs.bed.gz"
+    if os.path.isfile(input_file):
+        input_bed = pd.read_csv(input_file, sep='\t', header=None)
+        if not os.path.isfile(newf):
+            input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip')
+        args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((newf,"logs.seq_contrib.counts.input_regions.per_fold."+encid+".bed.gz"))
+
+
+    input_file="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/merge_folds_new_may_05_24/in_peaks.counts.interpreted_regions.bed"
+    newf="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/average_preds/modisco.inputs.bed.gz"
+    if os.path.isfile(input_file):
+        input_bed = pd.read_csv(input_file, sep='\t', header=None)
+        if not os.path.isfile(newf):
+            input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip')
+        args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((newf,"logs.seq_contrib.counts.input_regions."+encid+".bed.gz"))
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/average_preds/"
+
+    input_log = os.path.join(odir, "reformat.log.e")
+    if os.path.isfile(input_log):
+        args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((input_log, "logs.seq_contrib.counts.fold_mean.reformat."+encid+".stderr.txt"))
+
+    input_log = os.path.join(odir, "reformat.log.o")
+    if os.path.isfile(input_log):
+        args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((input_log, "logs.seq_contrib.counts.fold_mean.reformat."+encid+".stdout.txt"))
+
+ assert(len(args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"])==4)
+
+ for i in range(5):
+ data_paths, log_paths, log_paths_opt = fetch_per_fold_counts(odir,model_paths[i], encid, i, name)
+
+ if data_paths is None:
+ success = False
+ return success, args_json
+
+ args_json["counts sequence contribution scores tar"]["fold_"+str(i)] = {}
+ args_json["counts sequence contribution scores tar"]["fold_"+str(i)]["file.paths"] = data_paths
+ args_json["counts sequence contribution scores tar"]["fold_"+str(i)]["logs.seq_contrib.counts.fold_"+str(i)+"."+encid] = {"file.paths": log_paths+log_paths_opt}
+ assert(len(data_paths) == 1)
+ print(len(log_paths))
+ assert(len(log_paths) >= 4)
+
+ success=True
+ return success, args_json
+
+for encid in encids:
+ print(encid)
+
+
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores_new_compressed.stats"
+ if os.path.isfile(ofile):
+ counts_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores_new_compressed.bw"
+ else:
+ counts_bw = None
+ print(ofile)
+
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.stats"
+ if os.path.isfile(ofile):
+ profile_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.bw"
+ else:
+ profile_bw = None
+ print(ofile)
+ continue
+
+ assert(os.path.isfile(counts_bw)==True)
+ assert(os.path.isfile(profile_bw)==True)
+
+ model_paths = model_atac[model_atac[1]==encid.replace("_new","")][2].values
+ print(model_paths)
+ args_json = {}
+ args_json["experiment"] = encode_id[encid]
+
+
+ success, args_json = fetch_counts_tar(encode_id[encid], args_json, model_paths, encid)
+ if not success:
+ print("ERR counts tar")
+ continue
+
+ if not os.path.isfile(odir+encode_id[encid]+".json"):
+ f = open(odir+encode_id[encid]+".json", "w")
+ json.dump(args_json, f, indent=4)
+ f.close()
diff --git a/upload_jsons/upload_jsons_scripts/counts_contrib_upload/atac_tar_k5_and_hep.py b/upload_jsons/upload_jsons_scripts/counts_contrib_upload/atac_tar_k5_and_hep.py
new file mode 100644
index 00000000..30d5f8da
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/counts_contrib_upload/atac_tar_k5_and_hep.py
@@ -0,0 +1,193 @@
+import os
+import json
+import pandas as pd
+
+#encids = ["K562", "HEPG2", "IMR90_new", "H1ESC_new", "GM12878_new"]
+encids = ["K562", "HEPG2"]
+
+encode_id = {"K562": "ENCSR868FGK",
+"GM12878": "ENCSR637XSC",
+"HEPG2": "ENCSR291GJU",
+"IMR90": "ENCSR200OML",
+"H1ESC": "GSE267154"}
+odir='atac/'
+
+model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/model_dir_atac.csv",sep=",", header=None)
+
+def fetch_per_fold_counts(odir,model_path, encid, i, name):
+
+ model_path_orig=model_path
+ model_path="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/"+model_path.split("/")[-1]
+ data_paths = []
+ log_paths = []
+ log_paths_opt = []
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/fold_"+str(i)+"/"
+ input_h5 = os.path.join(odir, name+"_counts_attribs_reformatted.h5")
+ data_paths.append((input_h5, "seq_contrib.counts.fold_"+str(i)+"."+encid+".h5"))
+
+ #model_path="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/ATAC_SE_04.27.2024//chrombpnet_model"
+
+
+ model_path = model_path+"/chrombpnet_model"
+
+ # all regs logs
+
+ input_log=model_path_orig+"/interpret/merged."+name+".interpret.args.json"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.atacs_regs.fold_"+str(i)+"."+encid+".args.json"))
+ else:
+ print(input_log)
+
+ input_log=model_path_orig+"/interpret/merged."+name+".interpet.log"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.atacs_regs.fold_"+str(i)+"."+encid+".log"))
+ else:
+ print(input_log)
+
+ # atac regs logs
+
+
+ input_log=model_path+"/interpret/full_"+name+".interpret.args.json"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.atacs_regs.fold_"+str(i)+"."+encid+".args.json"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret/full_"+name+".interpet.log"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.atacs_regs.fold_"+str(i)+"."+encid+".log"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret/full.counts.interpret.log.e"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.atacs_regs.fold_"+str(i)+"."+encid+".stderr.txt"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret/full.counts.interpret.log.o"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.atacs_regs.fold_"+str(i)+"."+encid+".stdout.txt"))
+ else:
+ print(input_log)
+
+
+ return data_paths, log_paths, log_paths_opt
+
+def fetch_counts_tar(encid, args_json, model_paths, name):
+ success = False
+ args_json["counts sequence contribution scores tar"] = {}
+ readme_file = "READMES/counts.deepshap.README"
+ assert(os.path.isfile(readme_file))
+ args_json["counts sequence contribution scores tar"]["file.paths"] = [(readme_file, "README.md")]
+ args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid] = {"file.paths": []}
+
+ ## full h5 path
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/average_preds/"
+
+ input_h5 = os.path.join(odir, name+"_counts_attribs_reformatted.h5")
+ if os.path.isfile(input_h5):
+ args_json["counts sequence contribution scores tar"]["file.paths"].append((input_h5,"seq_contrib.counts.fold_mean."+encid+".h5"))
+ else:
+ print(input_h5)
+ success = False
+ return success, args_json
+
+ ## modisoc h5 path
+
+ modisco_input = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/merge_folds_new_may_05_24/in_peaks.counts_scores_new_compressed.h5"
+ if os.path.isfile(modisco_input):
+ args_json["counts sequence contribution scores tar"]["file.paths"].append((modisco_input,"seq_contrib.counts.fold_mean.modisco_input."+encid+".h5"))
+ else:
+ print(modisco_input)
+ success = False
+ return success, args_json
+
+ # log files
+
+
+    input_file=model_paths[1]+"/chrombpnet_model/interpret/full_"+name+".interpreted_regions_counts.bed"
+    newf="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/average_preds/per_folds.inputs.bed.gz"
+    if os.path.isfile(input_file):
+        input_bed = pd.read_csv(input_file, sep='\t', header=None)
+        if not os.path.isfile(newf):
+            input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip')
+        args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((newf,"logs.seq_contrib.counts.input_regions.per_fold."+encid+".bed.gz"))
+
+
+    input_file="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/merge_folds_new_may_05_24/in_peaks.counts.interpreted_regions.bed"
+    newf="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/average_preds/modisco.inputs.bed.gz"
+    if os.path.isfile(input_file):
+        input_bed = pd.read_csv(input_file, sep='\t', header=None)
+        if not os.path.isfile(newf):
+            input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip')
+        args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((newf,"logs.seq_contrib.counts.input_regions."+encid+".bed.gz"))
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/average_preds/"
+
+    input_log = os.path.join(odir, "reformat.log.e")
+    if os.path.isfile(input_log):
+        args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((input_log, "logs.seq_contrib.counts.fold_mean.reformat"+encid+".stderr.txt"))
+
+    input_log = os.path.join(odir, "reformat.log.o")
+    if os.path.isfile(input_log):
+        args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((input_log, "logs.seq_contrib.counts.fold_mean.reformat"+encid+".stdout.txt"))
+
+ assert(len(args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"])==4)
+
+ for i in range(5):
+ data_paths, log_paths, log_paths_opt = fetch_per_fold_counts(odir,model_paths[i], encid, i, name)
+
+ if data_paths is None:
+ success = False
+ return success, args_json
+
+ args_json["counts sequence contribution scores tar"]["fold_"+str(i)] = {}
+ args_json["counts sequence contribution scores tar"]["fold_"+str(i)]["file.paths"] = data_paths
+ args_json["counts sequence contribution scores tar"]["fold_"+str(i)]["logs.seq_contrib.counts.fold_"+str(i)+"."+encid] = {"file.paths": log_paths+log_paths_opt}
+ assert(len(data_paths) == 1)
+ print(len(log_paths))
+ assert(len(log_paths) >= 1)
+
+ success=True
+ return success, args_json
+
+for encid in encids:
+ print(encid)
+
+
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores.stats"
+ if os.path.isfile(ofile):
+ counts_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores.bw"
+ else:
+ counts_bw = None
+ print(ofile)
+
+    ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores.stats"
+    if os.path.isfile(ofile):
+        profile_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores.bw"
+    else:
+        profile_bw = None
+        print(ofile)
+        continue
+
+ assert(os.path.isfile(counts_bw)==True)
+ assert(os.path.isfile(profile_bw)==True)
+
+ model_paths = model_atac[model_atac[1]==encid.replace("_new","")][2].values
+ print(model_paths)
+ args_json = {}
+ args_json["experiment"] = encode_id[encid]
+
+
+ success, args_json = fetch_counts_tar(encode_id[encid], args_json, model_paths, encid)
+ if not success:
+ print("ERR counts tar")
+ continue
+
+ if not os.path.isfile(odir+encode_id[encid]+".json"):
+ f = open(odir+encode_id[encid]+".json", "w")
+ json.dump(args_json, f, indent=4)
+ f.close()
diff --git a/upload_jsons/upload_jsons_scripts/counts_contrib_upload/dnase_tar.py b/upload_jsons/upload_jsons_scripts/counts_contrib_upload/dnase_tar.py
new file mode 100644
index 00000000..c69a7440
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/counts_contrib_upload/dnase_tar.py
@@ -0,0 +1,222 @@
+import os
+import json
+import pandas as pd
+
+#encids = ["K562", "HEPG2", "IMR90_new", "H1ESC_new", "GM12878_new"]
+encids = ["IMR90_new", "H1ESC_new", "GM12878_new"]
+
+encode_id = {"HEPG2": "ENCSR149XIL",
+ "K562": "ENCSR000EOT",
+ "IMR90_new": "ENCSR477RTP",
+ "GM12878_new": "ENCSR000EMT",
+ "H1ESC_new": "ENCSR000EMU"}
+odir='dnase/'
+
+model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/v1/model_dir_dnase_v2_interpret.csv",sep=",", header=None)
+
+def fetch_per_fold_counts(odir,model_path, encid, i, name):
+
+ data_paths = []
+ log_paths = []
+ log_paths_opt = []
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/fold_"+str(i)+"/"
+ input_h5 = os.path.join(odir, name+"_counts_attribs_reformatted.h5")
+ data_paths.append((input_h5, "seq_contrib.counts.fold_"+str(i)+"."+encid+".h5"))
+
+ #model_path="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/DNASE_SE_04.27.2024//chrombpnet_model"
+
+ # dnase regions logs
+
+ model_path=model_path+"/chrombpnet_model"
+ input_log=model_path+"/interpret_orig/full_"+name+".interpret.args.json"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.dnase_regions.fold_"+str(i)+"."+encid+".args.json"))
+ else:
+ print(input_log)
+ input_log=model_path+"/interpret_orig/full_"+name+".interpet.log"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.dnase_regions.fold_"+str(i)+"."+encid+".log"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret_orig/ATAC_peaks_full.counts.interpret.log1.e"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.dnase_regions.fold_"+str(i)+"."+encid+".stderr.txt"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret_orig/ATAC_peaks_full.counts.interpret.log1.o"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.dnase_regions.fold_"+str(i)+"."+encid+".stdout.txt"))
+ else:
+ print(input_log)
+
+ # atac regions logs
+
+ input_log=model_path+"/interpret/full_"+name+".interpret.args.json"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.atac_regions.fold_"+str(i)+"."+encid+".args.json"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret/full_"+name+".interpet.log"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.atac_regions.fold_"+str(i)+"."+encid+".log"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret/ATAC_peaks_full.counts.interpret.log1.e"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.atac_regions.fold_"+str(i)+"."+encid+".stderr.txt"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret/ATAC_peaks_full.counts.interpret.log1.o"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.atac_regions.fold_"+str(i)+"."+encid+".stdout.txt"))
+ else:
+ print(input_log)
+
+ # ccre regions logs
+
+ input_log=model_path+"/interpret_ccre/full_"+name+".interpret.args.json"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.ccre_regions.fold_"+str(i)+"."+encid+".args.json"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret_ccre/full_"+name+".interpet.log"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.ccre_regions.fold_"+str(i)+"."+encid+".log"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret_ccre/full.counts.interpret.log1.e"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.ccre_regions.fold_"+str(i)+"."+encid+".stderr.txt"))
+ else:
+ print(input_log)
+ input_log=model_path+"/interpret_ccre/full.counts.interpret.log1.o"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.ccre_regions.fold_"+str(i)+"."+encid+".stdout.txt"))
+ else:
+ print(input_log)
+
+ return data_paths, log_paths, log_paths_opt
+
+def fetch_counts_tar(encid, args_json, model_paths, name):
+ success = False
+ args_json["counts sequence contribution scores tar"] = {}
+ readme_file = "READMES/counts.deepshap.README"
+ assert(os.path.isfile(readme_file))
+ args_json["counts sequence contribution scores tar"]["file.paths"] = [(readme_file, "README.md")]
+ args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid] = {"file.paths": []}
+
+ ## full h5 path
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/"
+
+ input_h5 = os.path.join(odir, name+"_counts_attribs_reformatted.h5")
+ if os.path.isfile(input_h5):
+ args_json["counts sequence contribution scores tar"]["file.paths"].append((input_h5,"seq_contrib.counts.fold_mean."+encid+".h5"))
+ else:
+ success = False
+ return success, args_json
+
+ ## modisoc h5 path
+
+ modisco_input = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/merge_folds_new_may_05_24/in_peaks.counts_scores_new_compressed.h5"
+ if os.path.isfile(modisco_input):
+ args_json["counts sequence contribution scores tar"]["file.paths"].append((modisco_input,"seq_contrib.counts.fold_mean.modisco_input."+encid+".h5"))
+ else:
+ success = False
+ return success, args_json
+
+ # log files
+
+
+    input_file=model_paths[0]+"/chrombpnet_model/interpret_all_with_ccre/full_"+name+".interpreted_regions_counts.bed"
+    newf="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/per_folds.inputs.bed.gz"
+    if os.path.isfile(input_file):
+        input_bed = pd.read_csv(input_file, compression='gzip', sep='\t', header=None)
+        if not os.path.isfile(newf):
+            input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip')
+        args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((newf,"logs.seq_contrib.counts.input_regions.per_fold."+encid+".bed.gz"))
+
+
+    input_file="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/merge_folds_new_may_05_24/in_peaks.counts_scores_new_compressed.bed"
+    newf="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/modisco.inputs.bed.gz"
+    if os.path.isfile(input_file):
+        input_bed = pd.read_csv(input_file, compression='gzip', sep='\t', header=None)
+        if not os.path.isfile(newf):
+            input_bed = input_bed[~(input_bed[0]=="chrM")]
+            input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip')
+        args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((newf,"logs.seq_contrib.counts.input_regions."+encid+".bed.gz"))
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/"
+
+    input_log = os.path.join(odir, "reformat.log.e")
+    if os.path.isfile(input_log):
+        args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((input_log, "logs.seq_contrib.counts.fold_mean.reformat"+encid+".stderr.txt"))
+
+    input_log = os.path.join(odir, "reformat.log.o")
+    if os.path.isfile(input_log):
+        args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((input_log, "logs.seq_contrib.counts.fold_mean.reformat"+encid+".stdout.txt"))
+
+ assert(len(args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"])==4)
+
+ for i in range(5):
+ data_paths, log_paths, log_paths_opt = fetch_per_fold_counts(odir,model_paths[i], encid, i, name)
+
+ if data_paths is None:
+ success = False
+ return success, args_json
+
+ args_json["counts sequence contribution scores tar"]["fold_"+str(i)] = {}
+ args_json["counts sequence contribution scores tar"]["fold_"+str(i)]["file.paths"] = data_paths
+ args_json["counts sequence contribution scores tar"]["fold_"+str(i)]["logs.seq_contrib.counts.fold_"+str(i)+"."+encid] = {"file.paths": log_paths+log_paths_opt}
+ assert(len(data_paths) == 1)
+ print(len(log_paths))
+ assert(len(log_paths) == 12)
+
+ success=True
+ return success, args_json
+
+for encid in encids:
+ print(encid)
+
+
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores_new_compressed.stats"
+ if os.path.isfile(ofile):
+ counts_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores_new_compressed.bw"
+ else:
+ counts_bw = None
+ print(ofile)
+
+    ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.stats"
+    if os.path.isfile(ofile):
+        profile_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.bw"
+    else:
+        profile_bw = None
+        print(ofile)
+        continue
+
+ assert(os.path.isfile(counts_bw)==True)
+ assert(os.path.isfile(profile_bw)==True)
+
+ model_paths = model_atac[model_atac[1]==encid.replace("_new","")][2].values
+ print(model_paths)
+ args_json = {}
+ args_json["experiment"] = encode_id[encid]
+
+
+ success, args_json = fetch_counts_tar(encode_id[encid], args_json, model_paths, encid)
+ if not success:
+ print("ERR counts tar")
+ continue
+
+ if not os.path.isfile(odir+encode_id[encid]+".json"):
+ f = open(odir+encode_id[encid]+".json", "w")
+ json.dump(args_json, f, indent=4)
+ f.close()
diff --git a/upload_jsons/upload_jsons_scripts/counts_contrib_upload/dnase_tar_k5_and_hep.py b/upload_jsons/upload_jsons_scripts/counts_contrib_upload/dnase_tar_k5_and_hep.py
new file mode 100644
index 00000000..174a8710
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/counts_contrib_upload/dnase_tar_k5_and_hep.py
@@ -0,0 +1,209 @@
+import os
+import json
+import pandas as pd
+
+#encids = ["K562", "HEPG2", "IMR90_new", "H1ESC_new", "GM12878_new"]
+encids = ["K562", "HEPG2"]
+
+encode_id = {"K562": "ENCSR000EOT",
+"HEPG2": "ENCSR149XIL"}
+odir='dnase/'
+
+model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/model_dir_dnase.csv",sep=",", header=None)
+
+def fetch_per_fold_counts(odir,model_path, encid, i, name):
+
+ model_path_orig=model_path
+ model_path="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/"+model_path.split("/")[-1]
+ data_paths = []
+ log_paths = []
+ log_paths_opt = []
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/fold_"+str(i)+"/"
+ input_h5 = os.path.join(odir, name+"_counts_attribs_reformatted.h5")
+ data_paths.append((input_h5, "seq_contrib.counts.fold_"+str(i)+"."+encid+".h5"))
+
+ #model_path="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/ATAC_SE_04.27.2024//chrombpnet_model"
+
+ # atac regions logs
+
+
+ model_path = model_path+"/chrombpnet_model"
+ input_log=model_path+"/interpret_ccre/full_"+name+".interpret.args.json"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.ccre_regions.fold_"+str(i)+"."+encid+".args.json"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret_ccre/full_"+name+".interpet.log"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.ccre_regions.fold_"+str(i)+"."+encid+".log"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret_ccre/full.counts.interpret.log1.e"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.ccre_regions.fold_"+str(i)+"."+encid+".stderr.txt"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret_ccre/full.counts.interpret.log1.o"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.all_regions.fold_"+str(i)+"."+encid+".stdout.txt"))
+ else:
+ print(input_log)
+
+
+ # all regions logs
+
+ input_log=model_path_orig+"/interpret/merged."+name+".interpret.args.json"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.all_regions.fold_"+str(i)+"."+encid+".args.json"))
+ else:
+ print(input_log)
+
+
+ input_log=model_path_orig+"/interpret/merged."+name+".interpet.log"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.all_regions.fold_"+str(i)+"."+encid+".log"))
+ else:
+ print(input_log)
+
+ # atac regions logs
+
+
+ input_log=model_path+"/interpret/full_"+name+".interpret.args.json"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.all_regions.fold_"+str(i)+"."+encid+".args.json"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret/full_"+name+".interpet.log"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.all_regions.fold_"+str(i)+"."+encid+".log"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret/full.counts.interpret.log.e"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.all_regions.fold_"+str(i)+"."+encid+".stderr.txt"))
+ else:
+ print(input_log)
+
+ input_log=model_path+"/interpret/full.counts.interpret.log.o"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.seq_contrib.counts.all_regions.fold_"+str(i)+"."+encid+".stdout.txt"))
+ else:
+ print(input_log)
+
+
+ return data_paths, log_paths, log_paths_opt
+
+def fetch_counts_tar(encid, args_json, model_paths, name):
+ success = False
+ args_json["counts sequence contribution scores tar"] = {}
+ readme_file = "READMES/counts.deepshap.README"
+ assert(os.path.isfile(readme_file))
+ args_json["counts sequence contribution scores tar"]["file.paths"] = [(readme_file, "README.md")]
+ args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid] = {"file.paths": []}
+
+ ## full h5 path
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/"
+
+ input_h5 = os.path.join(odir, name+"_counts_attribs_reformatted.h5")
+ if os.path.isfile(input_h5):
+ args_json["counts sequence contribution scores tar"]["file.paths"].append((input_h5,"seq_contrib.counts.fold_mean."+encid+".h5"))
+ else:
+ print(input_h5)
+ success = False
+ return success, args_json
+
+ ## modisoc h5 path
+
+ modisco_input = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/merge_folds_new_may_05_24/in_peaks.counts_scores_new_compressed.h5"
+ if os.path.isfile(modisco_input):
+ args_json["counts sequence contribution scores tar"]["file.paths"].append((modisco_input,"seq_contrib.counts.fold_mean.modisco_input."+encid+".h5"))
+ else:
+ print(modisco_input)
+ success = False
+ return success, args_json
+
+ # log files
+
+
+    input_file=model_paths[1]+"/chrombpnet_model/interpret_all_with_ccre/full_"+name+".interpreted_regions_counts.bed"
+    newf="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/per_folds.inputs.bed.gz"
+    if os.path.isfile(input_file):
+        input_bed = pd.read_csv(input_file, compression='gzip', sep='\t', header=None)
+        if not os.path.isfile(newf):
+            input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip')
+        args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((newf,"logs.seq_contrib.counts.input_regions.per_fold."+encid+".bed.gz"))
+
+
+    input_file="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/merge_folds_new_may_05_24/in_peaks.counts.interpreted_regions.bed"
+    newf="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/modisco.inputs.bed.gz"
+    if os.path.isfile(input_file):
+        input_bed = pd.read_csv(input_file, sep='\t', header=None)
+        if not os.path.isfile(newf):
+            input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip')
+        args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((newf,"logs.seq_contrib.counts.input_regions."+encid+".bed.gz"))
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/"
+
+    input_log = os.path.join(odir, "reformat.log.e")
+    if os.path.isfile(input_log):
+        args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((input_log, "logs.seq_contrib.counts.fold_mean.reformat"+encid+".stderr.txt"))
+
+    input_log = os.path.join(odir, "reformat.log.o")
+    if os.path.isfile(input_log):
+        args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((input_log, "logs.seq_contrib.counts.fold_mean.reformat"+encid+".stdout.txt"))
+
+ assert(len(args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"])==4)
+
+ for i in range(5):
+ data_paths, log_paths, log_paths_opt = fetch_per_fold_counts(odir,model_paths[i], encid, i, name)
+
+ if data_paths is None:
+ success = False
+ return success, args_json
+
+ args_json["counts sequence contribution scores tar"]["fold_"+str(i)] = {}
+ args_json["counts sequence contribution scores tar"]["fold_"+str(i)]["file.paths"] = data_paths
+ args_json["counts sequence contribution scores tar"]["fold_"+str(i)]["logs.seq_contrib.counts.fold_"+str(i)+"."+encid] = {"file.paths": log_paths+log_paths_opt}
+ assert(len(data_paths) == 1)
+ print(len(log_paths))
+ assert(len(log_paths) >= 5)
+
+ success=True
+ return success, args_json
+
+for encid in encids:
+ print(encid)
+
+
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores.stats"
+ if os.path.isfile(ofile):
+ counts_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores.bw"
+ else:
+ counts_bw = None
+ print(ofile)
+
+
+ assert(os.path.isfile(counts_bw)==True)
+
+ model_paths = model_atac[model_atac[1]==encid.replace("_new","")][2].values
+ print(model_paths)
+ args_json = {}
+ args_json["experiment"] = encode_id[encid]
+
+
+ success, args_json = fetch_counts_tar(encode_id[encid], args_json, model_paths, encid)
+ if not success:
+ print("ERR counts tar")
+ continue
+
+ if not os.path.isfile(odir+encode_id[encid]+".json"):
+ f = open(odir+encode_id[encid]+".json", "w")
+ json.dump(args_json, f, indent=4)
+ f.close()
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/READMEs/bias.training.README b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/READMEs/bias.training.README
new file mode 100644
index 00000000..8faa0ea2
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/READMEs/bias.training.README
@@ -0,0 +1,63 @@
+# Directory Structure Format
+.
+├── peaks.all_input_regions.encid.bed.gz # Peaks input to the bias training script
+├── logs.bias.training_test_regions.encid # folder containing log files for peak and nonpeak generation scripts
+│
+├── fold_0
+│ ├── cv_params.fold_0.json # training, validation and test chromosomes used in fold 0
+│ ├── nonpeaks.all_input_regions.fold_0.encid.bed.gz # Non peaks input to the bias training script
+│ ├── nonpeaks.trainingset.fold_0.encid.bed.gz # nonpeaks used in training set of fold 0 bias model
+│ ├── nonpeaks.validationset.fold_0.encid.bed.gz # nonpeaks used in validation set of fold 0 bias model
+│ ├── nonpeaks.testset.fold_0.encid.bed.gz # nonpeaks used in test set of fold 0 bias model
+│ └── logs.bias.training_test_regions.fold_0.encid # folder containing log files for training bias model on fold 0
+│
+├── fold_1
+│ └── ... # similar directory structure as fold_0 directory above
+│
+├── fold_2
+│ └── ... # similar directory structure as fold_0 directory above
+│
+├── fold_3
+│ └── ... # similar directory structure as fold_0 directory above
+│
+└── fold_4
+ └── ... # similar directory structure as fold_0 directory above
+
+# Bed File Format for Peaks
+
+* All the bed files are in narrowpeak format with 10 columns.
+
+1) chrom - Name of the chromosome (or contig, scaffold, etc.).
+2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
+3) chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
+4) name - Name given to a region (preferably unique). Use "." if no name is assigned.
+5) score - Indicates how dark the peak will be displayed in the browser (0-1000). If all scores were "0" when the data were submitted to the DCC, the DCC assigned scores 1-1000 based on signal value. Ideally the average signalValue per base spread is between 100-1000.
+6) strand - +/- to denote strand or orientation (whenever applicable). Use "." if no orientation is assigned.
+7) signalValue - Measurement of overall (usually, average) enrichment for the region.
+8) pValue - Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.
+9) qValue - Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
+10) peak - Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.
+
+# Bed File Format for Nonpeaks
+
+* All the bed files are in narrowpeak format with 10 columns.
+
+1) chrom - Name of the chromosome (or contig, scaffold, etc.).
+2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
+3) chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
+4) empty character - "."
+5) empty character - "."
+6) empty character - "."
+7) empty character - "."
+8) empty character - "."
+9) empty character - "."
+10) (chromEnd-chromStart)/2
+
+# Format of file `cv_params.fold_0.json`
+
+A dictionary with following (key,value) pairs,
+
+1) ("CV_type", "chr_holdout")
+2) ("train", list_of_chrs_trainingset)
+3) ("valid", list_of_chrs_validationset)
+4) ("test", list_of_chrs_testset)
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_model_chrombpnet.csv b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_model_chrombpnet.csv
new file mode 100644
index 00000000..da180a8d
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_model_chrombpnet.csv
@@ -0,0 +1,6 @@
+fold_0,K562,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/K562/nautilus_runs/K562_02.17.2022_bias_128_4_1234_0.5_fold_0
+fold_1,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_1_data_type_ATAC_PE
+fold_2,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_2_data_type_ATAC_PE
+fold_3,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_3_data_type_ATAC_PE
+fold_4,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_4_data_type_ATAC_PE
+
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_model_upload.py b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_model_upload.py
index 46c7f0e6..8b534c68 100755
--- a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_model_upload.py
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_model_upload.py
@@ -12,7 +12,7 @@
#encids = open("../chromatin_atlas_atac/test_encid.txt").readlines()
#encids = [line.strip() for line in encids]
-model_atac = pd.read_csv("atac_bias_model.csv",sep=",", header=None)
+model_atac = pd.read_csv("atac_bias_model_chrombpnet.csv",sep=",", header=None)
encode_id = {"K562": "ENCSR868FGK"}
data_to_bam = {"K562": ["ENCFF077FBI", "ENCFF128WZG", "ENCFF534DCE"]}
def main_fetch_preprocessing_files(encid, args_json, bam_ids, name):
@@ -40,7 +40,7 @@ def main_fetch_preprocessing_files(encid, args_json, bam_ids, name):
def main_fetch_bias_model_files(encid, args_json, models_path):
success = False
args_json["bias models tar"] = {}
- readme_file = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/READMES/bias.models.README"
+ readme_file = "READMEs/bias.training.README"
assert(os.path.isfile(readme_file))
args_json["bias models tar"]["file.paths"] = [(readme_file, "README.md")]
#args_json["bias models tar"]["logs.bias.models."+encid] = {"file.paths": None}
@@ -68,7 +68,7 @@ def main_fetch_bias_training_files(encid, args_json, models_path, name):
# find the training test regions
args_json["bias training and test regions tar"] = {}
- readme_file = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/READMES/bias.training_test_regions.README"
+ readme_file = "READMEs/bias.training.README"
assert(os.path.isfile(readme_file))
args_json["bias training and test regions tar"]["file.paths"] = [(readme_file, "README.md")]
@@ -82,7 +82,7 @@ def main_fetch_bias_training_files(encid, args_json, models_path, name):
log_paths = upload_utils.bias_fetch_preprocessing_log_files(odir, encid, main_dir, name)
args_json["bias training and test regions tar"]["logs.bias.training_test_regions."+encid] = {"file.paths": log_paths}
- assert(len(log_paths) == 4)
+ assert(len(log_paths) == 3)
for i in range(5):
data_paths, log_paths = upload_utils.fetch_per_fold_training_data_bias(odir, models_path[i], encid, i, main_dir, name)
@@ -90,6 +90,8 @@ def main_fetch_bias_training_files(encid, args_json, models_path, name):
args_json["bias training and test regions tar"]["fold_"+str(i)] = {}
args_json["bias training and test regions tar"]["fold_"+str(i)]["file.paths"] = data_paths
args_json["bias training and test regions tar"]["fold_"+str(i)]["logs.bias.training_test_regions.fold_"+str(i)+"."+encid] = {"file.paths": log_paths}
+ #print(len(data_paths))
+ #print(data_paths)
assert(len(data_paths) == 5)
assert(len(log_paths) == 2)
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_upload_utils.py b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_upload_utils.py
index 1b908615..0c79edb4 100755
--- a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_upload_utils.py
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_upload_utils.py
@@ -13,10 +13,10 @@ def bias_fetch_preprocessing_log_files(odir, encid, main_dir, name):
# preprocessing, peak-calling
# preprocessing log files
- temp_dir="/oak/stanford/groups/akundaje/projects/chrombpnet/model_inputs/ENCODE_ATAC_downloads/"
- preprocessing_log = os.path.join(temp_dir, name + "/script.sh")
- if os.stat(preprocessing_log).st_size != 0:
- log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".script_v1.sh"))
+# temp_dir="/oak/stanford/groups/akundaje/projects/chrombpnet/model_inputs/ENCODE_ATAC_downloads/"
+# preprocessing_log = os.path.join(temp_dir, name + "/script.sh")
+# if os.stat(preprocessing_log).st_size != 0:
+# log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".script_v1.sh"))
preprocessing_log = os.path.join(main_dir, name + "/data/"+name+"_preprocessing.log")
if os.stat(preprocessing_log).st_size != 0:
@@ -43,9 +43,10 @@ def fetch_per_fold_training_data_bias(odir, model_dir, encid, fold_num, main_dir
if os.path.isfile(filtered_regions_bed):
input_paths.append((filtered_regions_bed,"cv_params.fold_"+str(fold_num)+".json"))
- temp_dir="/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/atlas_model_k562_fold_0/"
+ #temp_dir="/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/atlas_model_k562_fold_0/"
if fold_num==0:
- filtered_regions_bed = os.path.join(temp_dir, "negatives_data/negatives_with_summit.bed.gz")
+ filtered_regions_bed = os.path.join(main_dir, name+"/negatives_data/negatives_with_summit.bed.gz")
+ #print(filtered_regions_bed)
if os.path.isfile(filtered_regions_bed):
input_paths.append((filtered_regions_bed,"nonpeaks.all_input_regions.fold_"+str(fold_num)+"."+encid+".bed.gz"))
else:
@@ -59,15 +60,15 @@ def fetch_per_fold_training_data_bias(odir, model_dir, encid, fold_num, main_dir
# if os.path.isfile(filtered_regions_bed):
# input_paths.append((filtered_regions_bed,"peaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
- filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias/nonpeaks.trainingset.bed.gz")
+ filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias_may_7_2024/nonpeaks.trainingset.bed.gz")
if os.path.isfile(filtered_regions_bed):
input_paths.append((filtered_regions_bed,"nonpeaks.trainingset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
- filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias/nonpeaks.validationset.bed.gz")
+ filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias_may_7_2024/nonpeaks.validationset.bed.gz")
if os.path.isfile(filtered_regions_bed):
input_paths.append((filtered_regions_bed,"nonpeaks.validationset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
- filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias/nonpeaks.testset.bed.gz")
+ filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias_may_7_2024/nonpeaks.testset.bed.gz")
if os.path.isfile(filtered_regions_bed):
input_paths.append((filtered_regions_bed,"nonpeaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
@@ -75,7 +76,9 @@ def fetch_per_fold_training_data_bias(odir, model_dir, encid, fold_num, main_dir
#print(filtered_regions_bed)
if fold_num==0:
+ #negatives_log = os.path.join(temp_dir, name+"/negatives_data/make_background_regions.log")
negatives_log = os.path.join(main_dir, name+"/negatives_data/make_background_regions.log")
+
if os.stat(negatives_log).st_size != 0:
log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.txt"))
else:
@@ -85,7 +88,8 @@ def fetch_per_fold_training_data_bias(odir, model_dir, encid, fold_num, main_dir
if fold_num==0:
- negatives_log = os.path.join(temp_dir, "negatives_data/negatives_compared_with_foreground.png")
+# negatives_log = os.path.join(temp_dir, "negatives_data/negatives_compared_with_foreground.png")
+ negatives_log = os.path.join(main_dir, name+"/negatives_data/negatives_compared_with_foreground.png")
if os.stat(negatives_log).st_size != 0:
log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.png"))
else:
@@ -125,9 +129,11 @@ def fetch_per_fold_bias_models(odir, model_dir, encid, fold_num):
#### fetch model training log files ########
modelling_log = os.path.join(model_dir, "bias_model/train_bias_model.log")
- if os.stat(modelling_log).st_size != 0:
- log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".stdout.txt"))
-
+    if os.path.exists(modelling_log):
+ if os.stat(modelling_log).st_size != 0:
+ log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".stdout.txt"))
+ else:
+ print(modelling_log)
modelling_log = os.path.join(model_dir, "bias_model/bias.args.json")
if os.stat(modelling_log).st_size != 0:
log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".args.json"))
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/bias.models.README b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/bias.models.README
new file mode 100644
index 00000000..315b971b
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/bias.models.README
@@ -0,0 +1,92 @@
+# Directory Structure Format
+.
+├── fold_0
+│ ├── model.bias.fold_0.encid.h5 # bias model in .h5 format
+│   ├── model.bias.fold_0.encid.tar   # bias model in SavedModel format
+│ │ after being untarred, it results in a directory named "bias"
+│ └── logs.bias.models.fold_0.encid # folder containing log files for training models
+│
+├── fold_1
+│ └── ... # similar directory structure as fold_0 directory above
+│
+├── fold_2
+│ └── ... # similar directory structure as fold_0 directory above
+│
+├── fold_3
+│ └── ... # similar directory structure as fold_0 directory above
+│
+└── fold_4
+ └── ... # similar directory structure as fold_0 directory above
+
+
+# Pseudocode for loading models in .h5 format
+
+(1) Use the code in python after appropriately defining `model_in_h5_format` and `inputs`.
+(2) `inputs` is a one hot encoded sequence of shape (N,2114,4). Here N corresponds to the
+number of tested sequences, 2114 is the input sequence length and 4 corresponds to [A,C,G,T].
+
+```
+import tensorflow as tf
+from tensorflow.keras.utils import get_custom_objects
+from tensorflow.keras.models import load_model
+
+custom_objects={"tf": tf}
+get_custom_objects().update(custom_objects)
+
+model=load_model(model_in_h5_format,compile=False)
+outputs = model(inputs)
+```
+
+The list `outputs` consists of two elements. The first element has a shape of (N, 1000) and
+contains logit predictions for a 1000-base-pair output. The second element, with a shape of
+(N, 1), contains logcount predictions. To transform these predictions into per-base signals,
+follow the provided pseudo code lines below.
+
+```
+import numpy as np
+
+def softmax(x, temp=1):
+ norm_x = x - np.mean(x,axis=1, keepdims=True)
+ return np.exp(temp*norm_x)/np.sum(np.exp(temp*norm_x), axis=1, keepdims=True)
+
+predictions = softmax(outputs[0]) * (np.exp(outputs[1])-1)
+```
+
+# Pseudocode for loading models in .tar format
+
+(1) First untar the directory as follows `tar -xvf model.tar`
+(2) Use the code below in python after appropriately defining `model_dir_untared` and `inputs`
+(3) `inputs` is a one hot encoded sequence of shape (N,2114,4). Here N corresponds to the number
+of tested sequences, 2114 is the input sequence length and 4 corresponds to ACGT.
+
+Reference: https://www.tensorflow.org/api_docs/python/tf/saved_model/load
+
+```
+import tensorflow as tf
+
+model = tf.saved_model.load('model_dir_untared')
+outputs = model.signatures['serving_default'](**{'sequence':inputs.astype('float32')})
+```
+
+The variable `outputs` represents a dictionary containing two key-value pairs. The first key
+is `logits_profile_predictions`, holding a value with a shape of (N, 1000). This value corresponds
+to logit predictions for a 1000-base-pair output. The second key, named `logcount_predictions`,
+is associated with a value of shape (N, 1), representing logcount predictions. To transform these
+predictions into per-base signals, utilize the provided pseudo code lines mentioned below.
+
+```
+import numpy as np
+def softmax(x, temp=1):
+ norm_x = x - np.mean(x,axis=1, keepdims=True)
+ return np.exp(temp*norm_x)/np.sum(np.exp(temp*norm_x), axis=1, keepdims=True)
+
+predictions = softmax(outputs["logits_profile_predictions"]) * (np.exp(outputs["logcount_predictions"])-1)
+```
+
+# Docker image to load and use the models
+
+https://hub.docker.com/r/kundajelab/chrombpnet-atlas/ (tag:v1)
+
+# Tool box to do downstream analysis with the models
+
+https://github.com/kundajelab/chrombpnet/wiki
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/bias.training.README b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/bias.training.README
new file mode 100644
index 00000000..8faa0ea2
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/bias.training.README
@@ -0,0 +1,63 @@
+# Directory Structure Format
+.
+├── peaks.all_input_regions.encid.bed.gz # Peaks input to the bias training script
+├── logs.bias.training_test_regions.encid # folder containing log files for peak and nonpeak generation scripts
+│
+├── fold_0
+│ ├── cv_params.fold_0.json # training, validation and test chromosomes used in fold 0
+│ ├── nonpeaks.all_input_regions.fold_0.encid.bed.gz # Non peaks input to the bias training script
+│ ├── nonpeaks.trainingset.fold_0.encid.bed.gz # nonpeaks used in training set of fold 0 bias model
+│ ├── nonpeaks.validationset.fold_0.encid.bed.gz # nonpeaks used in validation set of fold 0 bias model
+│ ├── nonpeaks.testset.fold_0.encid.bed.gz # nonpeaks used in test set of fold 0 bias model
+│ └── logs.bias.training_test_regions.fold_0.encid # folder containing log files for training bias model on fold 0
+│
+├── fold_1
+│ └── ... # similar directory structure as fold_0 directory above
+│
+├── fold_2
+│ └── ... # similar directory structure as fold_0 directory above
+│
+├── fold_3
+│ └── ... # similar directory structure as fold_0 directory above
+│
+└── fold_4
+ └── ... # similar directory structure as fold_0 directory above
+
+# Bed File Format for Peaks
+
+* All the bed files are in narrowpeak format with 10 columns.
+
+1) chrom - Name of the chromosome (or contig, scaffold, etc.).
+2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
+3) chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
+4) name - Name given to a region (preferably unique). Use "." if no name is assigned.
+5) score - Indicates how dark the peak will be displayed in the browser (0-1000). If all scores were "0" when the data were submitted to the DCC, the DCC assigned scores 1-1000 based on signal value. Ideally the average signalValue per base spread is between 100-1000.
+6) strand - +/- to denote strand or orientation (whenever applicable). Use "." if no orientation is assigned.
+7) signalValue - Measurement of overall (usually, average) enrichment for the region.
+8) pValue - Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.
+9) qValue - Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
+10) peak - Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.
+
+# Bed File Format for Nonpeaks
+
+* All the bed files are in narrowpeak format with 10 columns.
+
+1) chrom - Name of the chromosome (or contig, scaffold, etc.).
+2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
+3) chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
+4) empty character - "."
+5) empty character - "."
+6) empty character - "."
+7) empty character - "."
+8) empty character - "."
+9) empty character - "."
+10) midpoint - (chromEnd-chromStart)/2
+
+# Format of file `cv_params.fold_0.json`
+
+A dictionary with following (key,value) pairs,
+
+1) ("CV_type", "chr_holdout")
+2) ("train", list_of_chrs_trainingset)
+3) ("valid", list_of_chrs_validationset)
+4) ("test", list_of_chrs_testset)
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/models.README b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/models.README
new file mode 100644
index 00000000..90a59aa1
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/models.README
@@ -0,0 +1,98 @@
+# Directory Structure Format
+.
+├── fold_0
+│ ├── model.chrombpnet.fold_0.encid.h5 # full chrombpnet model that combines both bias and corrected model in .h5 format
+│ ├── model.chrombpnet_nobias.fold_0.encid.h5 # bias-corrected accessibility model in .h5 format (Use for all biological discovery)
+│ ├── model.bias_scaled.fold_0.encid.h5 # bias model in .h5 format
+│ ├── model.chrombpnet.fold_0.encid.tar # full chrombpnet model that combines both bias and corrected model in SavedModel format.
+│ │ after being untarred, it results in a directory named "chrombpnet".
+│ ├── model.chrombpnet_nobias.fold_0.encid.tar # bias-corrected accessibility model in SavedModel format (Use for all biological discovery).
+│ │ after being untarred, it results in a directory named "chrombpnet_wo_bias".
+│   ├── model.bias_scaled.fold_0.encid.tar   # bias model in SavedModel format
+│ │ after being untarred, it results in a directory named "bias_model_scaled".
+│ └── logs.models.fold_0.encid # folder containing log files for training models
+│
+├── fold_1
+│ └── ... # similar directory structure as fold_0 directory above
+│
+├── fold_2
+│ └── ... # similar directory structure as fold_0 directory above
+│
+├── fold_3
+│ └── ... # similar directory structure as fold_0 directory above
+│
+└── fold_4
+ └── ... # similar directory structure as fold_0 directory above
+
+
+# Pseudocode for loading models in .h5 format
+
+(1) Use the code in python after appropriately defining `model_in_h5_format` and `inputs`.
+(2) `inputs` is a one hot encoded sequence of shape (N,2114,4). Here N corresponds to the
+number of tested sequences, 2114 is the input sequence length and 4 corresponds to [A,C,G,T].
+
+```
+import tensorflow as tf
+from tensorflow.keras.utils import get_custom_objects
+from tensorflow.keras.models import load_model
+
+custom_objects={"tf": tf}
+get_custom_objects().update(custom_objects)
+
+model=load_model(model_in_h5_format,compile=False)
+outputs = model(inputs)
+```
+
+The list `outputs` consists of two elements. The first element has a shape of (N, 1000) and
+contains logit predictions for a 1000-base-pair output. The second element, with a shape of
+(N, 1), contains logcount predictions. To transform these predictions into per-base signals,
+follow the provided pseudo code lines below.
+
+```
+import numpy as np
+
+def softmax(x, temp=1):
+ norm_x = x - np.mean(x,axis=1, keepdims=True)
+ return np.exp(temp*norm_x)/np.sum(np.exp(temp*norm_x), axis=1, keepdims=True)
+
+predictions = softmax(outputs[0]) * (np.exp(outputs[1])-1)
+```
+
+# Pseudocode for loading models in .tar format
+
+(1) First untar the directory as follows `tar -xvf model.tar`
+(2) Use the code below in python after appropriately defining `model_dir_untared` and `inputs`
+(3) `inputs` is a one hot encoded sequence of shape (N,2114,4). Here N corresponds to the number
+of tested sequences, 2114 is the input sequence length and 4 corresponds to ACGT.
+
+Reference: https://www.tensorflow.org/api_docs/python/tf/saved_model/load
+
+```
+import tensorflow as tf
+
+model = tf.saved_model.load('model_dir_untared')
+outputs = model.signatures['serving_default'](**{'sequence':inputs.astype('float32')})
+```
+
+The variable `outputs` represents a dictionary containing two key-value pairs. The first key
+is `logits_profile_predictions`, holding a value with a shape of (N, 1000). This value corresponds
+to logit predictions for a 1000-base-pair output. The second key, named `logcount_predictions`,
+is associated with a value of shape (N, 1), representing logcount predictions. To transform these
+predictions into per-base signals, utilize the provided pseudo code lines mentioned below.
+
+```
+import numpy as np
+def softmax(x, temp=1):
+ norm_x = x - np.mean(x,axis=1, keepdims=True)
+ return np.exp(temp*norm_x)/np.sum(np.exp(temp*norm_x), axis=1, keepdims=True)
+
+predictions = softmax(outputs["logits_profile_predictions"]) * (np.exp(outputs["logcount_predictions"])-1)
+```
+
+# Docker image to load and use the models
+
+https://hub.docker.com/r/kundajelab/chrombpnet-atlas/ (tag:v1)
+
+# Tool box to do downstream analysis with the models
+
+https://github.com/kundajelab/chrombpnet/wiki
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/training.README b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/training.README
new file mode 100644
index 00000000..56f8d835
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/training.README
@@ -0,0 +1,66 @@
+# Directory Structure Format
+.
+├── peaks.all_input_regions.encid.bed.gz # Peaks input to the chrombpnet training script
+├── nonpeaks.all_input_regions.encid.bed.gz # Non peaks input to the chrombpnet training script
+├── logs.training_test_regions.encid # folder containing log files for peak and nonpeak generation scripts
+│
+├── fold_0
+│ ├── cv_params.fold_0.json # training, validation and test chromosomes used in fold 0
+│ ├── peaks.trainingset.fold_0.encid.bed.gz # peaks used in training set of fold 0 model
+│ ├── nonpeaks.trainingset.fold_0.encid.bed.gz # nonpeaks used in training set of fold 0 model
+│ ├── peaks.validationset.fold_0.encid.bed.gz # peaks used in validation set of fold 0 model
+│ ├── nonpeaks.validationset.fold_0.encid.bed.gz # nonpeaks used in validation set of fold 0 model
+│ ├── peaks.testset.fold_0.encid.bed.gz # peaks used in test set of fold 0 model
+│ ├── nonpeaks.testset.fold_0.encid.bed.gz # nonpeaks used in test set of fold 0 model
+│ └── logs.training_test_regions.fold_0.encid # folder containing log files for training chrombpnet model on fold 0
+│
+├── fold_1
+│ └── ... # similar directory structure as fold_0 directory above
+│
+├── fold_2
+│ └── ... # similar directory structure as fold_0 directory above
+│
+├── fold_3
+│ └── ... # similar directory structure as fold_0 directory above
+│
+└── fold_4
+ └── ... # similar directory structure as fold_0 directory above
+
+# Bed File Format for Peaks
+
+* All the bed files are in narrowpeak format with 10 columns.
+
+1) chrom - Name of the chromosome (or contig, scaffold, etc.).
+2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
+3) chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
+4) name - Name given to a region (preferably unique). Use "." if no name is assigned.
+5) score - Indicates how dark the peak will be displayed in the browser (0-1000). If all scores were "0" when the data were submitted to the DCC, the DCC assigned scores 1-1000 based on signal value. Ideally the average signalValue per base spread is between 100-1000.
+6) strand - +/- to denote strand or orientation (whenever applicable). Use "." if no orientation is assigned.
+7) signalValue - Measurement of overall (usually, average) enrichment for the region.
+8) pValue - Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.
+9) qValue - Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
+10) peak - Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.
+
+# Bed File Format for Nonpeaks
+
+* All the bed files are in narrowpeak format with 10 columns.
+
+1) chrom - Name of the chromosome (or contig, scaffold, etc.).
+2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
+3) chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
+4) empty character - "."
+5) empty character - "."
+6) empty character - "."
+7) empty character - "."
+8) empty character - "."
+9) empty character - "."
+10) midpoint - (chromEnd-chromStart)/2
+
+# Format of file `cv_params.fold_0.json`
+
+A dictionary with following (key,value) pairs,
+
+1) ("CV_type", "chr_holdout")
+2) ("train", list_of_chrs_trainingset)
+3) ("valid", list_of_chrs_validationset)
+4) ("test", list_of_chrs_testset)
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/atac_bias_model_chrombpnet.csv b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/atac_bias_model_chrombpnet.csv
new file mode 100644
index 00000000..15190cf2
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/atac_bias_model_chrombpnet.csv
@@ -0,0 +1,26 @@
+fold_0,GM12878,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/GM12878/nautilus_runs/GM12878_03.01.2022_bias_128_4_1234_0.4_fold_0
+fold_1,GM12878,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/GM12878/GM12878_07.08.2022_bias_128_4_1234_0.4_fold_1_data_type_ATAC_PE
+fold_2,GM12878,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/GM12878/GM12878_07.08.2022_bias_128_4_1234_0.4_fold_2_data_type_ATAC_PE
+fold_3,GM12878,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/GM12878/GM12878_07.14.2022_bias_128_4_1234_0.4_fold_3_data_type_ATAC_PE
+fold_4,GM12878,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/GM12878/GM12878_07.07.2022_bias_128_4_1234_0.4_fold_4_data_type_ATAC_PE
+fold_0,K562,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/K562/nautilus_runs/K562_02.17.2022_bias_128_4_1234_0.5_fold_0
+fold_1,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_1_data_type_ATAC_PE
+fold_2,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_2_data_type_ATAC_PE
+fold_3,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_3_data_type_ATAC_PE
+fold_4,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_4_data_type_ATAC_PE
+fold_0,HEPG2,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/HEPG2/nautilus_runs_jun16/HEPG2_05.09.2022_bias_128_4_1234_0.8_fold_0
+fold_1,HEPG2,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/HEPG2/HEPG2_06.07.2022_bias_128_4_1234_0.8_fold_1
+fold_2,HEPG2,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/HEPG2/HEPG2_05.24.2022_bias_128_4_1234_0.8_fold_2
+fold_3,HEPG2,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/HEPG2/HEPG2_05.22.2022_bias_128_4_1234_0.8_fold_3
+fold_4,HEPG2,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/HEPG2/HEPG2_05.22.2022_bias_128_4_1234_0.8_fold_4
+fold_0,IMR90,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/IMR90/nautilus_runs_apr12/IMR90_04.09.2022_bias_128_4_1234_0.4_fold_0
+fold_1,IMR90,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/IMR90/IMR90_07.17.2022_bias_128_4_1234_0.3_fold_1_data_type_ATAC_PE
+fold_2,IMR90,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/IMR90/IMR90_07.17.2022_bias_128_4_1234_0.3_fold_2_data_type_ATAC_PE
+fold_3,IMR90,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/IMR90/IMR90_07.08.2022_bias_128_4_1234_0.4_fold_3_data_type_ATAC_PE
+fold_4,IMR90,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/IMR90/IMR90_07.07.2022_bias_128_4_1234_0.4_fold_4_data_type_ATAC_PE
+fold_0,H1ESC,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/H1ESC/nautilus_runs_jun16/H1ESC_05.09.2022_bias_128_4_1234_0.8_fold_0
+fold_1,H1ESC,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/H1ESC/H1ESC_07.23.2022_bias_128_4_1234_0.7_fold_1_data_type_ATAC_PE
+fold_2,H1ESC,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/H1ESC/H1ESC_07.17.2022_bias_128_4_1234_0.8_fold_2_data_type_ATAC_PE
+fold_3,H1ESC,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/H1ESC/H1ESC_07.17.2022_bias_128_4_1234_0.8_fold_3_data_type_ATAC_PE
+fold_4,H1ESC,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/H1ESC/H1ESC_07.17.2022_bias_128_4_1234_0.8_fold_4_data_type_ATAC_PE
+
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/atac_bias_model_upload.py b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/atac_bias_model_upload.py
new file mode 100644
index 00000000..d2e9a145
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/atac_bias_model_upload.py
@@ -0,0 +1,260 @@
+import os
+import atac_bias_upload_utils as upload_utils
+import json
+import pandas as pd
+import model_upload_utils
+
+# Root of the ENCODE chromatin-atlas ATAC results tree.
+odir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/ATAC/"
+#output_dir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022-uploads/jsons/ATAC/stage1/jul_17_2023/"
+# Per-cell-line chrombpnet ATAC training outputs (peaks, bigWigs, logs).
+main_dir="/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/"
+# Destination for the upload-manifest JSON files produced by this script.
+output_dir = "atac_production_uploads/"
+
+# NOTE(review): `encids` is computed but never used below — leftover from a template script.
+encids = os.listdir(odir)
+#encids = open("../chromatin_atlas_atac/test_encid.txt").readlines()
+#encids = [line.strip() for line in encids]
+
+# Model table: column 0 = fold id, column 1 = cell line, column 2 = model directory.
+model_atac = pd.read_csv("atac_bias_model_chrombpnet.csv",sep=",", header=None)
+# Cell line -> experiment accession (ENCODE ENCSR*; H1ESC uses a GEO series id).
+encode_id = {"K562": "ENCSR868FGK",
+"GM12878": "ENCSR637XSC",
+"HEPG2": "ENCSR291GJU",
+"IMR90": "ENCSR200OML",
+"H1ESC": "GSE267154"}
+
+# Cell line -> source alignment accessions (ENCODE ENCFF*; H1ESC uses GEO GSM ids).
+data_to_bam = {"K562": ["ENCFF077FBI", "ENCFF128WZG", "ENCFF534DCE"],
+"GM12878": ["ENCFF440GRZ", "ENCFF981FXV", "ENCFF962FMH"],
+"HEPG2": ["ENCFF624SON", "ENCFF926KFU", "ENCFF990VCP"],
+"IMR90": ["ENCFF848XMR", "ENCFF715NAV"],
+"H1ESC": ["GSM8260976", "GSM8260977"]
+}
+
def main_fetch_training_files(encid, args_json, model_paths, name):
    """Attach the chrombpnet training/test region files for every fold.

    Populates args_json["training and test regions tar"] with the README,
    the blacklist-filtered input peaks, shared preprocessing logs, and the
    per-fold region files collected by `model_upload_utils`.
    Returns (success, args_json).
    """
    tar_key = "training and test regions tar"
    args_json[tar_key] = {}

    readme_file = "READMEs/training.README"
    assert os.path.isfile(readme_file)
    args_json[tar_key]["file.paths"] = [(readme_file, "README.md")]

    # The input peak set is mandatory — bail out when it is absent.
    input_peaks = os.path.join(main_dir, name + "/data/peaks_no_blacklist.bed.gz")
    if not os.path.isfile(input_peaks):
        return False, args_json
    args_json[tar_key]["file.paths"].append(
        (input_peaks, "peaks.all_input_regions." + encid + ".bed.gz"))

    # Fold-independent preprocessing logs; exactly three are expected.
    log_paths = model_upload_utils.fetch_preprocessing_log_files(odir, encid, main_dir, name)
    args_json[tar_key]["logs.training_test_regions." + encid] = {"file.paths": log_paths}
    assert len(log_paths) == 3

    for fold in range(5):
        data_paths, fold_logs = model_upload_utils.fetch_per_fold_training_data(
            odir, model_paths[fold], encid, fold, main_dir, name)

        fold_key = "fold_" + str(fold)
        args_json[tar_key][fold_key] = {
            "file.paths": data_paths,
            "logs.training_test_regions." + fold_key + "." + encid: {"file.paths": fold_logs},
        }
        print(len(data_paths))
        assert len(data_paths) == 8
        assert len(fold_logs) == 2

        # Guard kept for `python -O` runs where asserts are stripped.
        if len(data_paths) != 8:
            return False, args_json

    return True, args_json
+
def main_fetch_preprocessing_files_for_k562(encid, args_json, bam_ids, name):
    """Record experiment metadata for the K562 model-only manifest.

    Unlike the generic variant, this sets "upload bias" to False and does not
    attach the observed-signal bigWig path. Returns (success, args_json);
    success is False when the preprocessed bigWig is missing on disk.
    """
    args_json["upload bias"] = False
    #args_json["bias model encid"] = encid

    # Presence of the unstranded bigWig is used as the "preprocessing done" marker.
    bigwig_path = ("/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/"
                   + name + "/data/" + name + "_unstranded.bw")
    if not os.path.isfile(bigwig_path):
        return False, args_json

    args_json["experiment"] = encid
    args_json["bam files"] = bam_ids
    args_json["assay"] = "ATAC-seq"
    return True, args_json
+
def main_fetch_preprocessing_files(encid, args_json, bam_ids, name):
    """Record experiment metadata and the observed signal bigWig for `encid`.

    Sets "upload bias" to True (the bias artifacts are uploaded under this
    experiment). Returns (success, args_json); success is False when the
    preprocessed bigWig is missing on disk.
    """
    args_json["upload bias"] = True
    #args_json["bias model encid"] = encid

    # Presence of the unstranded bigWig is used as the "preprocessing done" marker.
    bigwig_path = ("/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/"
                   + name + "/data/" + name + "_unstranded.bw")
    if not os.path.isfile(bigwig_path):
        return False, args_json

    args_json["experiment"] = encid
    args_json["bam files"] = bam_ids
    args_json["assay"] = "ATAC-seq"
    args_json["observed signal profile bigWig"] = bigwig_path
    return True, args_json
+
def main_fetch_model_files(encid, args_json, model_paths, name):
    """Attach per-fold chrombpnet model files and their training logs.

    Fills args_json["models tar"]; returns (success, args_json), failing as
    soon as any fold is missing its model files.
    """
    tar = {}
    args_json["models tar"] = tar

    readme_file = "READMEs/models.README"
    assert os.path.isfile(readme_file)
    tar["file.paths"] = [(readme_file, "README.md")]
    tar["logs.models." + encid] = {"file.paths": None}

    for fold in range(5):
        data_paths, log_paths, optional_logs = model_upload_utils.fetch_per_fold_models(
            odir, model_paths[fold], encid, fold)
        if data_paths is None:
            return False, args_json

        fold_key = "fold_" + str(fold)
        tar[fold_key] = {
            "file.paths": data_paths,
            "logs.models." + fold_key + "." + encid: {"file.paths": log_paths + optional_logs},
        }
        assert len(data_paths) == 6
        print(len(log_paths))
        assert len(log_paths) >= 6

    return True, args_json
+
def main_fetch_bias_model_files(encid, args_json, models_path):
    """Attach per-fold bias model files (.h5 and .tar) plus their logs.

    Fills args_json["bias models tar"]; returns (success, args_json), failing
    as soon as any fold is missing a required model file.
    """
    tar = {}
    args_json["bias models tar"] = tar

    readme_file = "READMEs/bias.models.README"
    assert os.path.isfile(readme_file)
    tar["file.paths"] = [(readme_file, "README.md")]
    #tar["logs.bias.models."+encid] = {"file.paths": None}

    for fold in range(5):
        data_paths, log_paths = upload_utils.fetch_per_fold_bias_models(
            odir, models_path[fold], encid, fold)
        if data_paths is None:
            return False, args_json

        fold_key = "fold_" + str(fold)
        tar[fold_key] = {
            "file.paths": data_paths,
            "logs.bias.models." + fold_key + "." + encid: {"file.paths": log_paths},
        }
        # Each bias model ships exactly two files and at least two logs.
        print(len(log_paths))
        assert len(log_paths) >= 2
        assert len(data_paths) == 2

    return True, args_json
+
+def main_fetch_bias_training_files(encid, args_json, models_path, name):
+    """Attach the bias-model training/test region files for every fold.
+
+    Populates args_json["bias training and test regions tar"] with the README,
+    the input peak set, fold-independent preprocessing logs and per-fold
+    region files gathered via `upload_utils`, using the module-level
+    `odir`/`main_dir` roots. Returns (success, args_json); success is False
+    when the peak file is missing.
+    """
+    success = False
+
+    # find the training test regions
+    args_json["bias training and test regions tar"] = {}
+    readme_file = "READMEs/bias.training.README"
+    assert(os.path.isfile(readme_file))
+    args_json["bias training and test regions tar"]["file.paths"] = [(readme_file, "README.md")]
+
+    # The blacklist-filtered input peaks are mandatory — bail out if absent.
+    input_peaks = os.path.join(main_dir, name + "/data/peaks_no_blacklist.bed.gz")
+    print(input_peaks)
+    if os.path.isfile(input_peaks):
+        args_json["bias training and test regions tar"]["file.paths"].append((input_peaks,"peaks.all_input_regions."+encid+".bed.gz"))
+    else:
+        success = False
+        return success, args_json
+
+    # Fold-independent preprocessing logs; exactly three are expected.
+    log_paths = upload_utils.bias_fetch_preprocessing_log_files(odir, encid, main_dir, name)
+    args_json["bias training and test regions tar"]["logs.bias.training_test_regions."+encid] = {"file.paths": log_paths}
+    assert(len(log_paths) == 3)
+
+    # Per-fold region files (5 expected) and gc-matching logs (2 expected).
+    for i in range(5):
+        data_paths, log_paths = upload_utils.fetch_per_fold_training_data_bias(odir, models_path[i], encid, i, main_dir, name)
+
+        args_json["bias training and test regions tar"]["fold_"+str(i)] = {}
+        args_json["bias training and test regions tar"]["fold_"+str(i)]["file.paths"] = data_paths
+        args_json["bias training and test regions tar"]["fold_"+str(i)]["logs.bias.training_test_regions.fold_"+str(i)+"."+encid] = {"file.paths": log_paths}
+        #print(len(data_paths))
+        #print(data_paths)
+        assert(len(data_paths) == 5)
+        assert(len(log_paths) == 2)
+
+    #if len(data_paths) != 3:
+    #    success = False
+    #    return success, args_json
+
+    success = True
+    return success, args_json
+
+
+
if __name__ == "__main__":

    # Build one upload-manifest JSON per cell line. Any failed lookup prints
    # a marker and skips to the next cell line instead of aborting the run.
    for name in ["K562", "GM12878", "HEPG2", "IMR90", "H1ESC"]:

        encid = encode_id[name]
        # Column 1 = cell line, column 2 = per-fold model directory.
        model_paths = model_atac[model_atac[1] == name][2].values
        print(model_paths)

        # Skip cell lines whose manifest was already generated.
        if os.path.isfile(output_dir + "/" + encid + ".json"):
            continue

        print(encid)

        args_json = {}

        success, args_json = main_fetch_preprocessing_files(encid, args_json, data_to_bam[name], name)
        if not success:
            print("ERR prep")
            continue

        success, args_json = main_fetch_bias_training_files(encid, args_json, model_paths, name)
        if not success:
            print("ERR bias prep")
            continue

        success, args_json = main_fetch_bias_model_files(encid, args_json, model_paths)
        if not success:
            print("ERR bias models")
            continue

        if name == "K562":
            # K562's bias artifacts are written to a separate *_bias.json
            # manifest; args_json is then rebuilt for the model-only manifest.
            with open(output_dir + "/" + encid + "_bias.json", "w") as outfile:
                json.dump(args_json, outfile, indent=4)

            args_json = {}
            # BUG FIX: the return value was previously discarded, leaving
            # `success` and `args_json` holding stale values from the bias
            # steps above (so the rebuilt manifest stayed empty).
            success, args_json = main_fetch_preprocessing_files_for_k562(
                encid, args_json, data_to_bam[name], name)
            if not success:
                print("ERR prep")
                continue

        success, args_json = main_fetch_model_files(encid, args_json, model_paths, name)
        if not success:
            print("fail model")
            continue

        success, args_json = main_fetch_training_files(encid, args_json, model_paths, name)
        if not success:
            print("fail train prep")
            continue

        with open(output_dir + "/" + encid + ".json", "w") as outfile:
            json.dump(args_json, outfile, indent=4)

        #print(args_json)
+
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/atac_bias_upload_utils.py b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/atac_bias_upload_utils.py
new file mode 100644
index 00000000..5f7d236d
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/atac_bias_upload_utils.py
@@ -0,0 +1,174 @@
+import os
+import json
+import numpy as np
+
+### utils for preprocessing
+
+
+### utils for training and testing regions
+
+def bias_fetch_preprocessing_log_files(odir, encid, main_dir, name):
+    """Collect (source_path, archive_name) pairs for the fold-independent
+    preprocessing logs of one ATAC cell line.
+
+    Gathers the preprocessing stdout log, the fold-0 preprocessing shell
+    script and the bias-PWM diagnostic image from `main_dir`. `odir` is
+    unused here (kept for a uniform helper signature).
+
+    NOTE(review): os.stat() raises FileNotFoundError when a log is missing —
+    callers appear to rely on all three files existing (they assert len == 3).
+    """
+    # do bed file checks
+    log_paths = []
+    # preprocessing, peak-calling
+
+    # preprocessing log files
+# temp_dir="/oak/stanford/groups/akundaje/projects/chrombpnet/model_inputs/ENCODE_ATAC_downloads/"
+# preprocessing_log = os.path.join(temp_dir, name + "/script.sh")
+# if os.stat(preprocessing_log).st_size != 0:
+# log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".script_v1.sh"))
+
+    preprocessing_log = os.path.join(main_dir, name + "/data/"+name+"_preprocessing.log")
+    if os.stat(preprocessing_log).st_size != 0:
+        log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".stdout.txt"))
+
+    preprocessing_log = os.path.join(main_dir, name + "/data/"+name.lower()+"_atac_fold_0.sh")
+    if os.stat(preprocessing_log).st_size != 0:
+        log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".script.sh"))
+
+    preprocessing_log = os.path.join(main_dir, name + "/data/"+name+"_bias_pwm.png")
+    if os.stat(preprocessing_log).st_size != 0:
+        log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".bias_pwm.png"))
+
+    return log_paths
+
+
+def fetch_per_fold_training_data_bias(odir, model_dir, encid, fold_num, main_dir, name):
+    """Collect one fold's bias-model region files and gc-matching logs.
+
+    Returns (input_paths, log_paths), each a list of (source_path,
+    archive_name) tuples. Callers expect 5 region files (cv-params json,
+    all-input nonpeaks, training/validation/test nonpeak sets) and 2 logs
+    (gc-matching stdout + diagnostic png).
+
+    NOTE(review): fold 0 negatives live under negatives_data/, other folds
+    under negatives_data_<fold>/; os.stat() on the log files raises
+    FileNotFoundError when they are missing.
+    """
+    input_paths = []
+    log_paths = []
+
+    #print(model_dir)
+    # Cross-validation chromosome-split definition for this fold.
+    opath = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/splits_format/"
+    filtered_regions_bed = os.path.join(opath + "/fold_"+str(fold_num)+".json")
+    if os.path.isfile(filtered_regions_bed):
+        input_paths.append((filtered_regions_bed,"cv_params.fold_"+str(fold_num)+".json"))
+
+    #temp_dir="/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/atlas_model_k562_fold_0/"
+    # GC-matched background (nonpeak) regions; fold 0 uses the unsuffixed dir.
+    if fold_num==0:
+        filtered_regions_bed = os.path.join(main_dir, name+"/negatives_data/negatives_with_summit.bed.gz")
+        #print(filtered_regions_bed)
+        if os.path.isfile(filtered_regions_bed):
+            input_paths.append((filtered_regions_bed,"nonpeaks.all_input_regions.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+    else:
+        filtered_regions_bed = os.path.join(main_dir, name+"/negatives_data_"+str(fold_num)+"/negatives_with_summit.bed.gz")
+        if os.path.isfile(filtered_regions_bed):
+            input_paths.append((filtered_regions_bed,"nonpeaks.all_input_regions.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+
+# filtered_regions_bed = os.path.join(model_dir, "bias_model/train_test_regions/peaks.testset.bed.gz")
+# #print(filtered_regions_bed)
+# if os.path.isfile(filtered_regions_bed):
+# input_paths.append((filtered_regions_bed,"peaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+    # Train/validation/test nonpeak splits from the May-7-2024 regeneration.
+    filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias_may_7_2024/nonpeaks.trainingset.bed.gz")
+    if os.path.isfile(filtered_regions_bed):
+        input_paths.append((filtered_regions_bed,"nonpeaks.trainingset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+    filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias_may_7_2024/nonpeaks.validationset.bed.gz")
+    if os.path.isfile(filtered_regions_bed):
+        input_paths.append((filtered_regions_bed,"nonpeaks.validationset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+    filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias_may_7_2024/nonpeaks.testset.bed.gz")
+    if os.path.isfile(filtered_regions_bed):
+        input_paths.append((filtered_regions_bed,"nonpeaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+    #print(input_paths)
+    #print(filtered_regions_bed)
+
+    # GC-matching stdout log (same fold-0 directory convention as above).
+    if fold_num==0:
+        #negatives_log = os.path.join(temp_dir, name+"/negatives_data/make_background_regions.log")
+        negatives_log = os.path.join(main_dir, name+"/negatives_data/make_background_regions.log")
+
+        if os.stat(negatives_log).st_size != 0:
+            log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.txt"))
+    else:
+        negatives_log = os.path.join(main_dir, name+"/negatives_data_"+str(fold_num)+"/make_background_regions.log")
+        if os.stat(negatives_log).st_size != 0:
+            log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.txt"))
+
+    # GC-match diagnostic image.
+    # NOTE(review): archived as ".stdout.png" — suffix looks like a
+    # copy-paste from the txt branch; confirm the intended archive name.
+    if fold_num==0:
+# negatives_log = os.path.join(temp_dir, "negatives_data/negatives_compared_with_foreground.png")
+        negatives_log = os.path.join(main_dir, name+"/negatives_data/negatives_compared_with_foreground.png")
+        if os.stat(negatives_log).st_size != 0:
+            log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.png"))
+    else:
+        negatives_log = os.path.join(main_dir, name+"/negatives_data_"+str(fold_num)+"/negatives_compared_with_foreground.png")
+        if os.stat(negatives_log).st_size != 0:
+            log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.png"))
+
+# negatives_log = os.path.join(odir, encid + "/negatives_data/test/fold_"+str(fold_num)+"."+encid+"_test.log")
+# if os.stat(negatives_log).st_size != 0:
+# log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.txt"))
+#
+    # add preprocessing data main_dir
+
+    return input_paths, log_paths
+
+
+### utils for model uploads
+
+#just need to add log files
+
def fetch_per_fold_bias_models(odir, model_dir, encid, fold_num):
    """Collect one fold's bias model files and modelling logs.

    Returns (input_paths, log_paths) of (source_path, archive_name) tuples,
    or (None, None) when either required model file is absent. `odir` is
    unused here (kept for a uniform helper signature).
    """
    fold_tag = "fold_" + str(fold_num) + "." + encid

    # Both serialized model formats are mandatory; missing either aborts.
    input_paths = []
    for rel_path, suffix in (("bias_model/bias.h5", ".h5"),
                             ("bias_model/new_model_formats_vf/bias.tar", ".tar")):
        src = os.path.join(model_dir, rel_path)
        if not os.path.isfile(src):
            return None, None
        input_paths.append((src, "model.bias." + fold_tag + suffix))

    #### fetch model training log files ########
    log_paths = []
    log_prefix = "logfile.modelling." + fold_tag

    train_log = os.path.join(model_dir, "bias_model/train_bias_model.log")
    if os.path.exists(train_log) and os.stat(train_log).st_size != 0:
        log_paths.append((train_log, log_prefix + ".stdout.txt"))

    args_file = os.path.join(model_dir, "bias_model/bias.args.json")
    if os.path.isfile(args_file):
        log_paths.append((args_file, log_prefix + ".args.json"))

    # Param tables may live directly under bias_model/ or under
    # bias_model/newgen/ — prefer the top-level copy.
    for basename, suffix in (("bias_data_params.tsv", ".bias_data_params.tsv"),
                             ("bias_model_params.tsv", ".bias_train_params.tsv")):
        candidate = os.path.join(model_dir, "bias_model", basename)
        if not os.path.isfile(candidate):
            candidate = os.path.join(model_dir, "bias_model/newgen", basename)
        if os.path.isfile(candidate):
            log_paths.append((candidate, log_prefix + suffix))

    params_json = os.path.join(model_dir, "bias_model/bias.params.json")
    if os.path.isfile(params_json):
        log_paths.append((params_json, log_prefix + ".bias_train_params.json"))

    epoch_log = os.path.join(model_dir, "bias_model/bias.log")
    if os.path.isfile(epoch_log):
        log_paths.append((epoch_log, log_prefix + ".epoch_loss.csv"))

    batch_log = os.path.join(model_dir, "bias_model/bias.log.batch")
    if os.path.isfile(batch_log):
        log_paths.append((batch_log, log_prefix + ".batch_loss.tsv"))

    return input_paths, log_paths
+
+
+
+
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/dnase_bias_model_chrombpnet.csv b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/dnase_bias_model_chrombpnet.csv
new file mode 100644
index 00000000..e69de29b
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/dnase_bias_model_upload.py b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/dnase_bias_model_upload.py
new file mode 100644
index 00000000..74940191
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/dnase_bias_model_upload.py
@@ -0,0 +1,283 @@
+import os
+import dnase_bias_upload_utils as upload_utils
+import json
+import pandas as pd
+import model_upload_utils
+
+# Root of the ENCODE chromatin-atlas DNase results tree.
+odir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/DNASE/"
+#output_dir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022-uploads/jsons/DNASE/stage1/jul_17_2023/"
+# Destination for the upload-manifest JSON files produced by this script.
+output_dir = "dnase_production_uploads/"
+
+# NOTE(review): `encids` is computed but never used below — leftover from a template script.
+encids = os.listdir(odir)
+#encids = open("../chromatin_atlas_atac/test_encid.txt").readlines()
+#encids = [line.strip() for line in encids]
+
+# Model tables: column 0 = fold id, column 1 = cell line, column 2 = model dir.
+# `model_atac` holds the v2.1 bias-model runs; `model_atac_new` the newer
+# chrombpnet runs (substituted for H1ESC's model tar in __main__).
+model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/upload_jsons/upload_scripts/model_dir_dnase_v2.1_bias.csv",sep=",", header=None)
+model_atac_new = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/v1/model_dir_dnase_v2.1.csv",sep=",", header=None)
+
+
+# Cell line -> ENCODE DNase experiment accession.
+encode_id = {"HEPG2": "ENCSR149XIL",
+             "K562": "ENCSR000EOT",
+             "IMR90": "ENCSR477RTP",
+             "GM12878": "ENCSR000EMT",
+             "H1ESC": "ENCSR000EMU"}
+
+# Cell line -> source alignment (bam) file accessions.
+data_to_bam = {"HEPG2": ["ENCFF474LSZ", "ENCFF839SPF"],
+               "K562": ["ENCFF205FNC"],
+               "IMR90": ["ENCFF618FFB"],
+               "GM12878": ["ENCFF467CXY", "ENCFF940NSD"],
+               "H1ESC": ["ENCFF733TCL"]}
+
def main_fetch_training_files(encid, args_json, model_paths, name):
    """Attach the chrombpnet training/test region files for every fold of a
    DNase cell line.

    Populates args_json["training and test regions tar"] with the README,
    the input peak set, shared preprocessing logs and per-fold region files.
    Returns (success, args_json); success is False when the peak file or any
    fold's data is missing.
    """
    args_json["training and test regions tar"] = {}
    # BUG FIX: this tar previously bundled READMEs/bias.models.README, which
    # documents the *bias model* tar. Package the training-regions README
    # instead, matching the ATAC upload script's main_fetch_training_files.
    readme_file = "READMEs/training.README"
    assert os.path.isfile(readme_file)
    args_json["training and test regions tar"]["file.paths"] = [(readme_file, "README.md")]

    # HEPG2/K562 runs live in the local paired-end results tree; other cell
    # lines use the preprocessed downloads on oak.
    if name in ["HEPG2", "K562"]:
        main_dir = "/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/DNASE_PE/"
        input_peaks = os.path.join(main_dir, name + "/data/peaks_no_blacklist.bed.gz")
    else:
        main_dir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/DNASE/"
        input_peaks = os.path.join(odir, encid + "/preprocessing/downloads/peaks.bed.gz")

    # The input peak set is mandatory — bail out when it is absent.
    if os.path.isfile(input_peaks):
        args_json["training and test regions tar"]["file.paths"].append(
            (input_peaks, "peaks.all_input_regions." + encid + ".bed.gz"))
    else:
        return False, args_json

    # H1ESC is single-ended; its logs live in the SE results tree.
    if name in ["H1ESC"]:
        main_dir = "/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/DNASE_SE/"

    # Fold-independent preprocessing logs; exactly three are expected.
    log_paths = model_upload_utils.fetch_preprocessing_log_files(odir, encid, main_dir, name)
    args_json["training and test regions tar"]["logs.training_test_regions." + encid] = {"file.paths": log_paths}
    assert len(log_paths) == 3

    for i in range(5):
        data_paths, log_paths = model_upload_utils.fetch_per_fold_training_data(
            odir, model_paths[i], encid, i, main_dir, name)

        args_json["training and test regions tar"]["fold_" + str(i)] = {}
        args_json["training and test regions tar"]["fold_" + str(i)]["file.paths"] = data_paths
        args_json["training and test regions tar"]["fold_" + str(i)][
            "logs.training_test_regions.fold_" + str(i) + "." + encid] = {"file.paths": log_paths}
        print(len(data_paths))
        assert len(data_paths) == 8
        assert len(log_paths) == 2

        # Guard kept for `python -O` runs where asserts are stripped.
        if len(data_paths) != 8:
            return False, args_json

    return True, args_json
+
+
def main_fetch_preprocessing_files(encid, args_json, bam_ids, name):
    """Record experiment metadata and the observed signal bigWig for `encid`.

    "upload bias" is False only for HEPG2 (its bias artifacts are skipped in
    __main__). Checks the local results tree first, then the oak preprocessing
    tree, for the observed-signal bigWig. Returns (success, args_json);
    success is False when neither bigWig exists.
    """
    args_json["upload bias"] = name != "HEPG2"
    args_json["bias model encid"] = encid

    local_bw = ("/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/DNASE_PE/"
                + name + "/data/" + name + "_unstranded.bw")
    oak_bw = ("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/DNASE/"
              + encid + "/preprocessing/bigWigs/" + encid + ".bigWig")

    # First existing candidate wins (local tree takes precedence).
    for bigwig in (local_bw, oak_bw):
        if os.path.isfile(bigwig):
            args_json["experiment"] = encid
            args_json["bam files"] = bam_ids
            args_json["assay"] = "DNase-seq"
            args_json["observed signal profile bigWig"] = bigwig
            return True, args_json

    return False, args_json
+
def main_fetch_model_files(encid, args_json, model_paths, name):
    """Attach per-fold chrombpnet model files and their training logs.

    Fills args_json["models tar"]; returns (success, args_json), failing as
    soon as any fold is missing its model files.
    """
    models_tar = {}
    args_json["models tar"] = models_tar

    readme_file = "READMEs/models.README"
    assert os.path.isfile(readme_file)
    models_tar["file.paths"] = [(readme_file, "README.md")]
    models_tar["logs.models." + encid] = {"file.paths": None}

    for fold in range(5):
        data_paths, log_paths, extra_logs = model_upload_utils.fetch_per_fold_models(
            odir, model_paths[fold], encid, fold)
        if data_paths is None:
            return False, args_json

        fold_key = "fold_" + str(fold)
        models_tar[fold_key] = {
            "file.paths": data_paths,
            "logs.models." + fold_key + "." + encid: {"file.paths": log_paths + extra_logs},
        }
        assert len(data_paths) == 6
        print(len(log_paths))
        assert len(log_paths) >= 6

    return True, args_json
+
def main_fetch_bias_model_files(encid, args_json, models_path):
    """Attach per-fold bias model files (.h5 and .tar) plus their logs.

    Fills args_json["bias models tar"]; returns (success, args_json), failing
    as soon as any fold is missing a required model file.
    """
    bias_tar = {}
    args_json["bias models tar"] = bias_tar

    readme_file = "READMEs/bias.models.README"
    assert os.path.isfile(readme_file)
    bias_tar["file.paths"] = [(readme_file, "README.md")]
    #bias_tar["logs.bias.models."+encid] = {"file.paths": None}

    for fold in range(5):
        data_paths, log_paths = upload_utils.fetch_per_fold_bias_models(
            odir, models_path[fold], encid, fold)
        if data_paths is None:
            return False, args_json

        fold_key = "fold_" + str(fold)
        bias_tar[fold_key] = {
            "file.paths": data_paths,
            "logs.bias.models." + fold_key + "." + encid: {"file.paths": log_paths},
        }
        # Each bias model ships exactly two files and at least two logs.
        assert len(log_paths) >= 2
        assert len(data_paths) == 2

    return True, args_json
+
+
+def main_fetch_bias_training_files(encid, args_json, models_path, name):
+    """Attach the bias-model training/test region files for every fold of a
+    DNase cell line.
+
+    Picks peak-file and log locations per cell line (HEPG2/K562 from the
+    local PE tree, H1ESC from the SE tree, others from the oak downloads),
+    then adds per-fold region files via `upload_utils`. Returns
+    (success, args_json); success is False when the peak file is missing.
+    """
+    success = False
+
+    # find the training test regions
+    args_json["bias training and test regions tar"] = {}
+    readme_file = "READMEs/bias.training.README"
+    assert(os.path.isfile(readme_file))
+    args_json["bias training and test regions tar"]["file.paths"] = [(readme_file, "README.md")]
+
+    # HEPG2/K562 runs live in the local paired-end tree; other cell lines
+    # use the preprocessed downloads on oak.
+    if name in ["HEPG2", "K562"]:
+        main_dir="/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/DNASE_PE/"
+        input_peaks = os.path.join(main_dir, name + "/data/peaks_no_blacklist.bed.gz")
+    else:
+        main_dir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/DNASE/"
+        input_peaks = os.path.join(odir, encid + "/preprocessing/downloads/peaks.bed.gz")
+
+    #print(input_peaks)
+    if os.path.isfile(input_peaks):
+        args_json["bias training and test regions tar"]["file.paths"].append((input_peaks,"peaks.all_input_regions."+encid+".bed.gz"))
+    else:
+        success = False
+        return success, args_json
+
+    # log files preprocessing and peak-calling
+    # set_1 layout (3 logs) for locally-run cell lines; set_2 layout (8 logs)
+    # for the oak-download layout.
+    if name in ["HEPG2", "K562"]:
+        log_paths = upload_utils.bias_fetch_preprocessing_log_files_set_1(odir, encid, main_dir, name)
+        #print(len(log_paths))
+        assert(len(log_paths) == 3)
+    elif name in ["H1ESC"]:
+        # H1ESC is single-ended; its logs live in the SE results tree.
+        main_dir="/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/DNASE_SE/"
+        log_paths = upload_utils.bias_fetch_preprocessing_log_files_set_1(odir, encid, main_dir, name)
+        #print(len(log_paths))
+        assert(len(log_paths) == 3)
+
+    else:
+        log_paths = upload_utils.bias_fetch_preprocessing_log_files_set_2(odir, encid, main_dir, name)
+        assert(len(log_paths) == 8)
+
+
+    args_json["bias training and test regions tar"]["logs.bias.training_test_regions."+encid] = {"file.paths": log_paths}
+
+
+    # Per-fold region files (5 expected) and gc-matching logs (2 expected).
+    for i in range(5):
+        data_paths, log_paths = upload_utils.fetch_per_fold_training_data_bias(odir, models_path[i], encid, i, main_dir, name)
+        #print(data_paths)
+        args_json["bias training and test regions tar"]["fold_"+str(i)] = {}
+        args_json["bias training and test regions tar"]["fold_"+str(i)]["file.paths"] = data_paths
+        args_json["bias training and test regions tar"]["fold_"+str(i)]["logs.bias.training_test_regions.fold_"+str(i)+"."+encid] = {"file.paths": log_paths}
+        #print(log_paths)
+        #print(log_paths)
+        #print(data_paths)
+        assert(len(data_paths) == 5)
+        assert(len(log_paths) == 2)
+
+    #if len(data_paths) != 3:
+    #    success = False
+    #    return success, args_json
+
+    success = True
+    return success, args_json
+
+
+
+if __name__ == "__main__":
+
+    # define readmes specific to bias model
+    # Build one upload-manifest JSON per cell line; failed lookups print a
+    # marker and skip to the next cell line instead of aborting.
+    #for name in ["HEPG2", "GM12878", "K562", "IMR90", "H1ESC"]:
+    for name in ["HEPG2", "K562", "H1ESC"]:
+
+        encid=encode_id[name]
+        # Column 1 = cell line, column 2 = per-fold model directory.
+        model_paths = model_atac[model_atac[1]==name][2].values
+
+        model_paths_new = model_atac_new[model_atac_new[1]==name][2].values
+
+        print(model_paths)
+
+        # Skip cell lines whose manifest was already generated.
+        if os.path.isfile(output_dir+"/"+encid+".json"):
+            continue
+
+        print(encid)
+
+        args_json = {}
+
+        success, args_json = main_fetch_preprocessing_files(encid, args_json, data_to_bam[name], name)
+        if not success:
+            print("ERR prep")
+            continue
+
+        # HEPG2's bias artifacts are not uploaded here ("upload bias" False).
+        if name != "HEPG2":
+
+            success, args_json = main_fetch_bias_training_files(encid, args_json, model_paths, name)
+            if not success:
+                print("ERR bias prep")
+                continue
+
+            success, args_json = main_fetch_bias_model_files(encid, args_json, model_paths)
+            if not success:
+                print("ERR bias models")
+                continue
+
+        # H1ESC's chrombpnet models come from the newer run table.
+        if name == "H1ESC":
+            model_paths = model_paths_new
+
+        success, args_json = main_fetch_model_files(encid, args_json, model_paths, name)
+        if not success:
+            print("fail model")
+            continue
+
+        success, args_json = main_fetch_training_files(encid, args_json, model_paths, name)
+        if not success:
+            print("fail train prep")
+            continue
+
+
+        with open(output_dir+"/"+encid+".json", "w") as outfile:
+            json.dump(args_json, outfile, indent=4)
+
+        #print(args_json)
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/dnase_bias_upload_utils.py b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/dnase_bias_upload_utils.py
new file mode 100644
index 00000000..053b8008
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/dnase_bias_upload_utils.py
@@ -0,0 +1,301 @@
+import os
+import json
+import numpy as np
+
+### utils for preprocessing
+
+
+### utils for training and testing regions
+
+
+def bias_fetch_preprocessing_log_files_set_1(odir, encid, main_dir, name):
+ # do bed file checks
+ log_paths = []
+
+
+ # preprocessing log files
+ preprocessing_log = os.path.join(main_dir, name + "/data/"+name+"_preprocessing.log")
+ if os.stat(preprocessing_log).st_size != 0:
+ log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".stdout.txt"))
+
+ preprocessing_log = os.path.join(main_dir, name + "/data/"+name+"_DNASE_PE.sh")
+ if os.path.isfile(preprocessing_log):
+ log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".script.sh"))
+
+ preprocessing_log = os.path.join(main_dir, name + "/data/h1_dnase_fold_0.sh")
+ if os.path.isfile(preprocessing_log):
+ log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".script.sh"))
+
+ preprocessing_log = os.path.join(main_dir, name + "/data/"+name+"_bias_pwm.png")
+ if os.stat(preprocessing_log).st_size != 0:
+ log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".bias_pwm.png"))
+
+ # peak-calling-log-files
+# tmpdir = "/oak/stanford/groups/akundaje/projects/chrombpnet/model_inputs/DNASE/caper/"
+#
+# peaks_log = os.path.join(tmpdir, name + "/metadata.json")
+# if os.path.isfile(peaks_log):
+# log_paths.append((peaks_log,"logfile.peak_calling."+encid+".metadata.json"))
+#
+# peaks_log = os.path.join(tmpdir, name + "/call-reproducibility_overlap/stdout")
+# if os.path.isfile(peaks_log):
+# log_paths.append((peaks_log,"logfile.peak_calling."+encid+".stdout.txt"))
+#
+# peaks_log = os.path.join(tmpdir, name + "/call-reproducibility_overlap/stderr")
+# if os.path.isfile(peaks_log):
+# log_paths.append((peaks_log,"logfile.peak_calling."+encid+".stderr.txt"))
+
+ return log_paths
+
+def bias_fetch_preprocessing_log_files_set_2(odir, encid, main_dir, name):
+ # do bed file checks
+ log_paths = []
+
+ # preprocessing log files
+ preprocessing_log = os.path.join(odir, encid + "/preprocessing/preprocessing.log.e")
+ if os.stat(preprocessing_log).st_size != 0:
+ log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".stderr.txt"))
+
+ preprocessing_log = os.path.join(odir, encid + "/preprocessing/preprocessing.log.o")
+ if os.stat(preprocessing_log).st_size != 0:
+ log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".stdout.txt"))
+
+ preprocessing_log = os.path.join(odir, encid + "/preprocessing/"+encid+".log")
+ if os.stat(preprocessing_log).st_size != 0:
+ log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".stdout_v1.txt"))
+
+ preprocessing_log = os.path.join(odir, encid + "/preprocessing/preprocess_"+encid+".log")
+ if os.stat(preprocessing_log).st_size != 0:
+ log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".stdout_v2.txt"))
+
+ preprocessing_log = os.path.join(odir, encid + "/preprocessing/params_file.json")
+ if os.stat(preprocessing_log).st_size != 0:
+ log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".params_file.json"))
+
+ preprocessing_log = os.path.join(odir, encid + "/preprocessing/bigWigs/"+encid+".png")
+ if os.stat(preprocessing_log).st_size != 0:
+ log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".bias_pwm.png"))
+
+ # peak-calling-log-files
+ peaks_log = os.path.join(odir, encid + "/peak_calling/log.e")
+ if os.path.isfile(peaks_log):
+ log_paths.append((peaks_log,"logfile.peak_calling."+encid+".stdout_v1.txt"))
+
+ peaks_log = os.path.join(odir, encid + "/peak_calling/log.o")
+ if os.path.isfile(peaks_log):
+ log_paths.append((peaks_log,"logfile.peak_calling."+encid+".stdout_v2.txt"))
+
+ return log_paths
+
+def fetch_per_fold_training_data_bias(odir, model_dir, encid, fold_num, main_dir, name):
+ input_paths = []
+ log_paths = []
+
+ #print(model_dir)
+ opath = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/splits_format/"
+ filtered_regions_bed = os.path.join(opath + "/fold_"+str(fold_num)+".json")
+ if os.path.isfile(filtered_regions_bed):
+ input_paths.append((filtered_regions_bed,"cv_params.fold_"+str(fold_num)+".json"))
+
+ if fold_num==0:
+ print(name)
+ if name in ["HEPG2", "K562", "H1ESC"]:
+ filtered_regions_bed = os.path.join(main_dir, name+"/negatives_data/negatives_with_summit.bed.gz")
+ print(filtered_regions_bed)
+ if os.path.isfile(filtered_regions_bed):
+ input_paths.append((filtered_regions_bed,"nonpeaks.all_input_regions.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+ else:
+
+ filtered_regions_bed = os.path.join(main_dir, name+"/data/negatives_data/negatives_with_summit.bed.gz")
+ if os.path.isfile(filtered_regions_bed):
+ input_paths.append((filtered_regions_bed,"nonpeaks.all_input_regions.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+ filtered_regions_bed = os.path.join(odir, encid+"/negatives_data/negatives_with_summit.bed.gz")
+ if os.path.isfile(filtered_regions_bed):
+ input_paths.append((filtered_regions_bed,"nonpeaks.all_input_regions.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+ else:
+ if name in ["HEPG2", "K562", "H1ESC"]:
+ filtered_regions_bed = os.path.join(main_dir, name+"/negatives_data_"+str(fold_num)+"/negatives_with_summit.bed.gz")
+ print(filtered_regions_bed)
+ if os.path.isfile(filtered_regions_bed):
+ input_paths.append((filtered_regions_bed,"nonpeaks.all_input_regions.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+ else:
+ filtered_regions_bed = os.path.join(main_dir, name+"/data/negatives_data_"+str(fold_num)+"/negatives_with_summit.bed.gz")
+ print(filtered_regions_bed)
+ if os.path.isfile(filtered_regions_bed):
+ input_paths.append((filtered_regions_bed,"nonpeaks.all_input_regions.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+
+ filtered_regions_bed = os.path.join(odir, encid+"/negatives_data_"+str(fold_num)+"/negatives_with_summit.bed.gz")
+ if os.path.isfile(filtered_regions_bed):
+ input_paths.append((filtered_regions_bed,"nonpeaks.all_input_regions.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+ #filtered_regions_bed = os.path.join(model_dir, "bias_model/train_test_regions/peaks.testset.bed.gz")
+ #print(filtered_regions_bed)
+ #if os.path.isfile(filtered_regions_bed):
+ # input_paths.append((filtered_regions_bed,"peaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+ filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias_may_7_2024/nonpeaks.trainingset.bed.gz")
+ if os.path.isfile(filtered_regions_bed):
+ input_paths.append((filtered_regions_bed,"nonpeaks.trainingset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+ filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias_may_7_2024/nonpeaks.validationset.bed.gz")
+ if os.path.isfile(filtered_regions_bed):
+ input_paths.append((filtered_regions_bed,"nonpeaks.validationset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+ filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias_may_7_2024/nonpeaks.testset.bed.gz")
+ if os.path.isfile(filtered_regions_bed):
+ input_paths.append((filtered_regions_bed,"nonpeaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+ #print(input_paths)
+ #print(filtered_regions_bed)
+
+ if fold_num==0:
+ if name in ["HEPG2", "K562", "H1ESC"]:
+ negatives_log = os.path.join(main_dir, name+"/negatives_data/make_background_regions.log")
+ if os.path.isfile(negatives_log):
+ if os.stat(negatives_log).st_size != 0:
+ log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout_v1.txt"))
+ else:
+
+ negatives_log = os.path.join(main_dir, name+"/data/negatives_data/make_background_regions.log")
+ if os.path.isfile(negatives_log):
+ if os.stat(negatives_log).st_size != 0:
+ log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout_v1.txt"))
+
+ negatives_log = os.path.join(odir, encid+"/negatives_data/make_background_regions.log")
+ if os.path.isfile(negatives_log):
+ if os.stat(negatives_log).st_size != 0:
+ log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout_v1.txt"))
+
+ negatives_log = os.path.join(odir, encid+"/negatives_data/gc_matching.log.o")
+ if os.path.isfile(negatives_log):
+ if os.stat(negatives_log).st_size != 0:
+ log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout_v2.txt"))
+
+ else:
+ if name in ["HEPG2", "K562", "H1ESC"]:
+ negatives_log = os.path.join(main_dir, name+"/negatives_data_"+str(fold_num)+"/make_background_regions.log")
+ if os.path.isfile(negatives_log):
+ if os.stat(negatives_log).st_size != 0:
+ log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout_v1.txt"))
+ else:
+ negatives_log = os.path.join(main_dir, name+"/data/negatives_data_"+str(fold_num)+"/make_background_regions.log")
+ if os.path.isfile(negatives_log):
+ if os.stat(negatives_log).st_size != 0:
+ log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout_v1.txt"))
+
+ negatives_log = os.path.join(odir, encid+"/negatives_data_"+str(fold_num)+"/make_background_regions.log")
+ if os.path.isfile(negatives_log):
+ if os.stat(negatives_log).st_size != 0:
+ log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout_v1.txt"))
+
+
+ if fold_num==0:
+ if name in ["HEPG2", "K562", "H1ESC"]:
+ negatives_log = os.path.join(main_dir, name+"/negatives_data/negatives_compared_with_foreground.png")
+ #print(negatives_log)
+ if os.path.isfile(negatives_log):
+ if os.stat(negatives_log).st_size != 0:
+ log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.png"))
+ else:
+ negatives_log = os.path.join(main_dir, name+"/data/negatives_data/negatives_compared_with_foreground.png")
+ if os.path.isfile(negatives_log):
+ if os.stat(negatives_log).st_size != 0:
+ log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.png"))
+
+ negatives_log = os.path.join(odir, encid+"/negatives_data/negatives_compared_with_foreground.png")
+ if os.path.isfile(negatives_log):
+ if os.stat(negatives_log).st_size != 0:
+ log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.png"))
+
+ else:
+ if name in ["HEPG2", "K562", "H1ESC"]:
+ negatives_log = os.path.join(main_dir, name+"/negatives_data_"+str(fold_num)+"/negatives_compared_with_foreground.png")
+ if os.path.isfile(negatives_log):
+ if os.stat(negatives_log).st_size != 0:
+ log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.png"))
+ else:
+ negatives_log = os.path.join(main_dir, name+"/data/negatives_data_"+str(fold_num)+"/negatives_compared_with_foreground.png")
+ if os.path.isfile(negatives_log):
+ if os.stat(negatives_log).st_size != 0:
+ log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.png"))
+
+ negatives_log = os.path.join(odir, encid+"/negatives_data_"+str(fold_num)+"/negatives_compared_with_foreground.png")
+ if os.path.isfile(negatives_log):
+ if os.stat(negatives_log).st_size != 0:
+ log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.png"))
+
+# negatives_log = os.path.join(odir, encid + "/negatives_data/test/fold_"+str(fold_num)+"."+encid+"_test.log")
+# if os.stat(negatives_log).st_size != 0:
+# log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.txt"))
+
+ # add preprocessing data main_dir
+
+ #print(input_paths)
+ return input_paths, log_paths
+
+
+def fetch_per_fold_bias_models(odir, model_dir, encid, fold_num):
+ input_paths = []
+ log_paths = []
+
+ bm_model = os.path.join(model_dir, "bias_model/bias.h5")
+ if os.path.isfile(bm_model):
+ input_paths.append((bm_model,"model.bias.fold_"+str(fold_num)+"."+encid+".h5"))
+ else:
+ return None, None
+
+ bm_model = os.path.join(model_dir, "bias_model/new_model_formats_vf/bias.tar")
+ if os.path.isfile(bm_model):
+ input_paths.append((bm_model,"model.bias.fold_"+str(fold_num)+"."+encid+".tar"))
+ else:
+ return None, None
+
+ #### fetch model training log files ########
+
+ modelling_log = os.path.join(model_dir, "bias_model/train_bias_model.log")
+ if os.path.isfile(modelling_log):
+ log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".stdout.txt"))
+
+ modelling_log = os.path.join(model_dir, "bias_model/bias.args.json")
+ if os.path.isfile(modelling_log):
+ log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".args.json"))
+
+ modelling_log = os.path.join(model_dir, "bias_model/bias_data_params.tsv")
+ if os.path.isfile(modelling_log):
+ log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".bias_data_params.tsv"))
+ else:
+ modelling_log = os.path.join(model_dir, "bias_model/newgen/bias_data_params.tsv")
+ if os.path.isfile(modelling_log):
+ log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".bias_data_params.tsv"))
+
+
+ modelling_log = os.path.join(model_dir, "bias_model/bias_model_params.tsv")
+ if os.path.isfile(modelling_log):
+ log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".bias_train_params.tsv"))
+ else:
+ modelling_log = os.path.join(model_dir, "bias_model/newgen/bias_model_params.tsv")
+ if os.path.isfile(modelling_log):
+ log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".bias_train_params.tsv"))
+
+ modelling_log = os.path.join(model_dir, "bias_model/bias.params.json")
+ if os.path.isfile(modelling_log):
+ log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".bias_train_params.json"))
+
+ modelling_log = os.path.join(model_dir, "bias_model/bias.log")
+ if os.path.isfile(modelling_log):
+ log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".epoch_loss.csv"))
+
+ modelling_log = os.path.join(model_dir, "bias_model/bias.log.batch")
+ if os.path.isfile(modelling_log):
+ log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".batch_loss.tsv"))
+
+ return input_paths, log_paths
+
+
+
+
+
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/get_gc_matched_negatives_test.py b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/get_gc_matched_negatives_test.py
new file mode 100644
index 00000000..f0024950
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/get_gc_matched_negatives_test.py
@@ -0,0 +1,175 @@
+import argparse
+import pandas as pd
+import numpy as np
+from tqdm import tqdm
+import matplotlib.pyplot as plt
+import random
+import csv
+import json
+import sys
+
+def parse_args():
+ parser=argparse.ArgumentParser(description="generate a bed file of non-peak regions that are gc-matched with foreground")
+ parser.add_argument("-c","--candidate_negatives",help="candidate negatives bed file with gc content in 4th column rounded to 2 decimals")
+ parser.add_argument("-f","--foreground_gc_bed", help="regions with their corresponding gc fractions for matching, 4th column has gc content value rounded to 2 decimals")
+ parser.add_argument("-o","--output_prefix", help="gc-matched non-peaks output file name")
+ parser.add_argument("-fl", "--chr_fold_path", type=str, required=True, help="Fold information - dictionary with test,valid and train keys and values with corresponding chromosomes")
+ parser.add_argument("-npr", "--neg_to_pos_ratio_train", type=int, default=1, help="Ratio of negatives to positives to sample for training")
+ return parser.parse_args()
+
+def remap_chrom(chrom, splits_dict):
+ '''
+    Remapping chromosome names - we will not differentiate between the train/valid/test chromosomes
+ when sampling negatives.
+ '''
+ if chrom in splits_dict["train"]:
+ chrom_mod = "chrom_train"
+ elif chrom in splits_dict["valid"]:
+ chrom_mod = "chrom_valid"
+ elif chrom in splits_dict["test"]:
+ chrom_mod = "chrom_test"
+ else:
+ chrom_mod = "ignore"
+ return chrom_mod
+
+
+def make_gc_dict(candidate_negatives, splits_dict):
+ """
+ Imports the candidate negatives into a dictionary structure.
+ The `key` is the gc content fraction, and the `values` are a list
+ containing the (chrom,start,end) of a region with the corresponding
+ gc content fraction.
+ """
+ data=open(candidate_negatives,'r').readlines()
+ gc_dict={}
+ index=0
+ ignored_chroms = []
+ for line in tqdm(list(data)):
+ line=line.strip('\n')
+ index+=1
+ tokens=line.split('\t')
+ chrom=tokens[0]
+ gc=float(tokens[-1])
+ start=tokens[1]
+ end=tokens[2]
+ chrom_real=chrom
+ chrom = remap_chrom(chrom, splits_dict)
+ if chrom == "ignore":
+ ignored_chroms.append(chrom_real)
+ continue
+ if chrom not in gc_dict:
+ gc_dict[chrom]={}
+ if gc not in gc_dict[chrom]:
+ gc_dict[chrom][gc]=[(chrom,start,end,chrom_real)]
+ else:
+ gc_dict[chrom][gc].append((chrom,start,end,chrom_real))
+
+ print("Following background chromosomes {} were ignored since they are not present in the given fold".format(",".join(list(set(ignored_chroms)))))
+ return gc_dict
+
+def scale_gc(cur_gc):
+ """
+ Randomly increase/decrease the gc-fraction value by 0.01
+ """
+ if random.random()>0.5:
+ cur_gc+=0.01
+ else:
+ cur_gc-=0.01
+ cur_gc=round(cur_gc,2)
+ if cur_gc<=0:
+ cur_gc+=0.01
+ if cur_gc>=1:
+ cur_gc-=0.01
+ assert cur_gc >=0
+ assert cur_gc <=1
+ return cur_gc
+
+def adjust_gc(chrom,cur_gc,negatives,used_negatives):
+ """
+ Function that checks if (1) the given gc fraction value is available
+ in the negative candidates or (2) if the given gc fraction value has
+    candidates not already sampled. If either of the conditions fails we
+    sample a neighbouring gc_fraction value by randomly shifting it by 0.01.
+ """
+ if chrom not in used_negatives:
+ used_negatives[chrom]={}
+
+ if cur_gc not in used_negatives[chrom]:
+ used_negatives[chrom][cur_gc]=[]
+
+ while (cur_gc not in negatives[chrom]) or (len(used_negatives[chrom][cur_gc])>=len(negatives[chrom][cur_gc])):
+ cur_gc=scale_gc(cur_gc)
+ if cur_gc not in used_negatives[chrom]:
+ used_negatives[chrom][cur_gc]=[]
+ return cur_gc,used_negatives
+
+
+
+if __name__=="__main__":
+
+ args=parse_args()
+
+ splits_dict=json.load(open(args.chr_fold_path))
+
+ negatives=make_gc_dict(args.candidate_negatives, splits_dict)
+ used_negatives=dict()
+ cur_peaks=pd.read_csv(args.foreground_gc_bed,header=None,sep='\t')
+ negatives_bed = []
+ print(len(list(cur_peaks.iterrows())))
+
+ foreground_gc_vals = []
+ output_gc_vals = []
+ ignored_chroms = []
+ for index,row in tqdm(list(cur_peaks.iterrows())):
+
+ chrom=row[0]
+ start=row[1]
+ end=row[2]
+ gc_value=row[3]
+
+ chrom_real=chrom
+ chrom = remap_chrom(chrom, splits_dict)
+ if chrom == "ignore":
+ ignored_chroms.append(chrom_real)
+ continue
+
+ if chrom=="chrom_train" or chrom=="chrom_valid":
+ #neg_to_pos_ratio = args.neg_to_pos_ratio_train
+ continue
+ else:
+ neg_to_pos_ratio = 4
+
+ # for every gc value in positive how many negatives to find
+ # we will keep the ratio of positives to negatives in the test set same
+ for rep in range(neg_to_pos_ratio):
+ cur_gc,used_negatives=adjust_gc(chrom,gc_value,negatives,used_negatives)
+ num_candidates=len(negatives[chrom][cur_gc])
+ rand_neg_index=random.randint(0,num_candidates-1)
+ while rand_neg_index in used_negatives[chrom][cur_gc]:
+ cur_gc,used_negatives=adjust_gc(chrom,cur_gc,negatives,used_negatives)
+ num_candidates=len(negatives[chrom][cur_gc])
+ rand_neg_index=random.randint(0,num_candidates-1)
+
+ used_negatives[chrom][cur_gc].append(rand_neg_index)
+ neg_tuple=negatives[chrom][cur_gc][rand_neg_index]
+ neg_chrom=neg_tuple[0]
+ neg_start=neg_tuple[1]
+ neg_end=neg_tuple[2]
+ neg_chrom_real=neg_tuple[3]
+ negatives_bed.append([neg_chrom_real,int(neg_start),int(neg_end), cur_gc])
+ output_gc_vals.append(cur_gc)
+ foreground_gc_vals.append(gc_value)
+
+ print("Following foreground chromosomes {} were ignored since they are not present in the given fold".format(",".join(list(set(ignored_chroms)))))
+ negatives_bed = pd.DataFrame(negatives_bed)
+ negatives_bed.to_csv(args.output_prefix+".bed", sep='\t', index=False, header=False, quoting=csv.QUOTE_NONE)
+
+ # checking how far the true distribution of foreground is compared to the backgrounds generated
+ bins = np.linspace(0, 1, 100)
+ plt.hist([output_gc_vals,foreground_gc_vals], bins, density=True, label=['negatives gc distribution', "foreground gc distribution"])
+ plt.xlabel("GC content")
+ plt.ylabel("Density")
+ plt.legend(loc='upper right')
+ plt.savefig(args.output_prefix+"_compared_with_foreground.png")
+
+
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/run_script.py b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/run_script.py
new file mode 100644
index 00000000..763f2c12
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/run_script.py
@@ -0,0 +1,26 @@
+import pandas as pd
+import os
+
+model_atac = pd.read_csv("../atac_bias_model_chrombpnet.csv",sep=",", header=None)
+
+
+print(model_atac.head())
+
+for i,r in model_atac.iterrows():
+
+ print(r)
+ if os.path.isfile(os.path.join(r[2], "train_test_regions_may_7_2024/nonpeaks.testset.bed.gz")):
+ try:
+ tdata = pd.read_csv(os.path.join(r[2], "train_test_regions_may_7_2024/nonpeaks.testset.bed.gz"))
+ continue
+ except:
+ pass
+
+ print(os.path.join(r[2], "train_test_regions_may_7_2024/nonpeaks.testset.bed.gz"))
+ if r[0].split("_")[-1] == "0":
+ command = "bash script.sh "+r[2]+" "+r[1]+" "+r[0]
+ else:
+ command = "bash script.sh "+r[2]+" "+r[1]+" "+r[0]+" "+"_"+str(r[0].split("_")[-1])
+
+ print(command)
+ os.system(command)
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/run_script_dnase.py b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/run_script_dnase.py
new file mode 100644
index 00000000..8d03cd81
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/run_script_dnase.py
@@ -0,0 +1,36 @@
+import pandas as pd
+import os
+
+#model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/upload_jsons/upload_scripts/model_dir_dnase_v2.1_bias.csv",sep=",", header=None)
+model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/v1/model_dir_dnase_v2.1.csv",sep=",", header=None)
+
+
+print(model_atac.head())
+
+for i,r in model_atac.iterrows():
+
+ if r[1] != "H1ESC":
+ continue
+ if r[1] in ["HEPG2", "K562"]:
+ tag="DNASE_PE"
+ mdir=r[1]
+ else:
+ tag="DNASE_SE"
+ #print(r)
+ if os.path.isfile(os.path.join(r[2], "train_test_regions_may_7_2024/nonpeaks.testset.bed.gz")):
+ try:
+ tdata = pd.read_csv(os.path.join(r[2], "train_test_regions_may_7_2024/nonpeaks.testset.bed.gz"))
+ continue
+ except:
+ pass
+
+ print(os.path.join(r[2], "train_test_regions_may_7_2024/nonpeaks.testset.bed.gz"))
+ if r[0].split("_")[-1] == "0":
+ command = "bash script_dnase.sh "+r[2]+" "+r[1]+" "+r[0]+" "+tag
+ else:
+ command = "bash script_dnase.sh "+r[2]+" "+r[1]+" "+r[0]+" "+tag+" "+"_"+str(r[0].split("_")[-1])
+
+ print(command)
+ os.system(command)
+
+#/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/DNASE_SE/H1ESC/n
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/script.sh b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/script.sh
new file mode 100644
index 00000000..66b6e7d4
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/script.sh
@@ -0,0 +1,15 @@
+modeldir=$1
+celll=$2
+foldn=/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/splits_format/$3.json
+fold=$4
+
+python get_gc_matched_negatives_test.py \
+ -c /mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/$celll/negatives_data$fold/candidate.negatives.bed \
+ -f /mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/$celll/negatives_data$fold/foreground.gc.bed \
+ -o $modeldir/train_test_regions_may_7_2024/negatives \
+ -fl $foldn
+
+awk -v OFS="\t" '{print $1, $2, $3, ".", ".", ".", ".", ".", ".", "1057"}' $modeldir/train_test_regions_may_7_2024/negatives.bed > $modeldir/train_test_regions_may_7_2024/negatives_with_summit.bed
+
+gzip -c $modeldir/train_test_regions_may_7_2024/negatives_with_summit.bed > $modeldir/train_test_regions_may_7_2024/nonpeaks.testset.bed.gz
+
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/script_dnase.sh b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/script_dnase.sh
new file mode 100644
index 00000000..d8df8a0e
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/script_dnase.sh
@@ -0,0 +1,16 @@
+modeldir=$1
+celll=$2
+foldn=/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/splits_format/$3.json
+tag=$4
+fold=$5
+
+python get_gc_matched_negatives_test.py \
+ -c /mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/$tag/$celll/negatives_data$fold/candidate.negatives.bed \
+ -f /mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/$tag/$celll/negatives_data$fold/foreground.gc.bed \
+ -o $modeldir/train_test_regions_may_7_2024/negatives \
+ -fl $foldn
+
+awk -v OFS="\t" '{print $1, $2, $3, ".", ".", ".", ".", ".", ".", "1057"}' $modeldir/train_test_regions_may_7_2024/negatives.bed > $modeldir/train_test_regions_may_7_2024/negatives_with_summit.bed
+
+gzip -c $modeldir/train_test_regions_may_7_2024/negatives_with_summit.bed > $modeldir/train_test_regions_may_7_2024/nonpeaks.testset.bed.gz
+
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/model_upload_utils.py b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/model_upload_utils.py
new file mode 100644
index 00000000..6b5d3447
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/model_upload_utils.py
@@ -0,0 +1,235 @@
+import os
+import json
+import numpy as np
+
+
+### utils for model uploads
+
+def fetch_per_fold_models(odir, model_dir, encid, fold_num):
+ input_paths = []
+ log_paths = []
+ log_paths_opt = []
+
+ cmb = os.path.join(model_dir, "chrombpnet_model/chrombpnet_wo_bias.h5")
+ if os.path.isfile(cmb):
+ input_paths.append((cmb,"model.chrombpnet_nobias.fold_"+str(fold_num)+"."+encid+".h5"))
+ else:
+ print(cmb)
+ return None, None, None
+
+ cmb = os.path.join(model_dir, "chrombpnet_model/chrombpnet.h5")
+ if os.path.isfile(cmb):
+ input_paths.append((cmb,"model.chrombpnet.fold_"+str(fold_num)+"."+encid+".h5"))
+ else:
+ print(cmb)
+ return None, None, None
+
+# checks_file = os.path.join(model_dir, "new_chrombpnet_model/check_passed.txt")
+# if os.path.isfile(checks_file):
+# cm_model = os.path.join(odir, encid + "/" + model_dir + "/chrombpnet.h5")
+# if os.path.isfile(cm_model):
+# input_paths.append((cm_model,"model.chrombpnet.fold_"+str(fold_num)+"."+encid+".h5"))
+# else:
+# print(cm_model)
+# return None, None, None
+#
+# cm_model = os.path.join(odir, encid + "/" + model_dir + "/new_model_formats/chrombpnet.tar")
+# if os.path.isfile(cm_model):
+# input_paths.append((cm_model,"model.chrombpnet.fold_"+str(fold_num)+"."+encid+".tar"))
+# else:
+# print(cm_model)
+# return None, None, None
+#
+#
+# else:
+# cm_model = os.path.join(odir, encid + "/" + model_dir + "/new_chrombpnet_model/chrombpnet_new.h5")
+# if os.path.isfile(cm_model):
+# input_paths.append((cm_model,"model.chrombpnet.fold_"+str(fold_num)+"."+encid+".h5"))
+# else:
+# print(cm_model)
+# return None, None, None
+#
+# cm_model = os.path.join(odir, encid + "/" + model_dir + "/new_chrombpnet_model/chrombpnet.tar")
+# if os.path.isfile(cm_model):
+# input_paths.append((cm_model,"model.chrombpnet.fold_"+str(fold_num)+"."+encid+".tar"))
+# else:
+# print(cm_model)
+# return None, None, None
+
+
+ bm_model = os.path.join(model_dir, "chrombpnet_model/bias_model_scaled.h5")
+ if os.path.isfile(bm_model):
+ input_paths.append((bm_model,"model.bias_scaled.fold_"+str(fold_num)+"."+encid+".h5"))
+ else:
+ print(cmb)
+ return None, None, None
+
+ cmb = os.path.join(model_dir, "new_model_formats_may_7_24_vf/chrombpnet.tar")
+ if os.path.isfile(cmb):
+ input_paths.append((cmb,"model.chrombpnet.fold_"+str(fold_num)+"."+encid+".tar"))
+ else:
+ print(cmb)
+
+ return None, None, None
+
+ cmb = os.path.join(model_dir, "new_model_formats_may_7_24_vf/chrombpnet_wo_bias.tar")
+ if os.path.isfile(cmb):
+ input_paths.append((cmb,"model.chrombpnet_nobias.fold_"+str(fold_num)+"."+encid+".tar"))
+ else:
+ print(cmb)
+
+ return None, None, None
+
+
+ bm_model = os.path.join(model_dir, "new_model_formats_may_7_24_vf/bias_model_scaled.tar")
+ if os.path.isfile(bm_model):
+ input_paths.append((bm_model,"model.bias_scaled.fold_"+str(fold_num)+"."+encid+".tar"))
+ else:
+ return None, None, None
+
+ ### fetch main logs
+
+ modelling_log = os.path.join(model_dir, "chrombpnet_model/chrombpnet.args.json")
+ if os.path.isfile(modelling_log):
+ log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".args.json"))
+ else:
+ print(modelling_log)
+
+ modelling_log = os.path.join(model_dir, "chrombpnet_model/chrombpnet_data_params.tsv")
+ if os.stat(modelling_log).st_size != 0:
+ log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".chrombpnet_data_params.tsv"))
+ else:
+ print(modelling_log)
+
+ modelling_log = os.path.join(model_dir, "chrombpnet_model/chrombpnet_model_params.tsv")
+ if os.stat(modelling_log).st_size != 0:
+ log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".chrombpnet_model_params.tsv"))
+ else:
+ print(modelling_log)
+
+ modelling_log = os.path.join(model_dir, "chrombpnet_model/chrombpnet.params.json")
+ if os.stat(modelling_log).st_size != 0:
+ log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".chrombpnet.params.json"))
+ else:
+ print(modelling_log)
+
+ modelling_log = os.path.join(model_dir, "chrombpnet_model/chrombpnet.log")
+ if os.stat(modelling_log).st_size != 0:
+ log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".epoch_loss.csv"))
+ else:
+ print(modelling_log)
+
+ modelling_log = os.path.join(model_dir, "chrombpnet_model/chrombpnet.log.batch")
+ if os.stat(modelling_log).st_size != 0:
+ log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".batch_loss.tsv"))
+ else:
+ print(modelling_log)
+
+ modelling_log = os.path.join(model_dir, "chrombpnet_model/train_chrombpnet_model.log")
+ if os.stat(modelling_log).st_size != 0:
+ log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".stdout_v1.txt"))
+ else:
+ print(modelling_log)
+
+
+ return input_paths, log_paths, log_paths_opt
+
+
+### utils for training and testing regions
+
def fetch_preprocessing_log_files(odir, encid, main_dir, name):
    """Collect preprocessing log files for upload.

    Args:
        odir: unused here; kept for signature parity with the sibling fetchers.
        encid: accession used to build the upload-side file names.
        main_dir: root directory containing per-sample folders.
        name: sample / cell-line name (e.g. "K562").

    Returns:
        List of (local_path, upload_name) tuples, one per non-empty log.

    NOTE: os.stat() raises if the first or last file is missing, matching the
    original behaviour -- those files are expected to exist.
    """
    log_paths = []

    # Main preprocessing stdout log.
    preprocessing_log = os.path.join(main_dir, name + "/data/"+name+"_preprocessing.log")
    if os.stat(preprocessing_log).st_size != 0:
        log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".stdout.txt"))

    # The preprocessing script exists under one of three historical names;
    # probe them in order: <name>_atac_fold_0.sh, <NAME>_DNASE_PE.sh,
    # h1_dnase_fold_0.sh.  Catch OSError (file missing) instead of a bare
    # except so unrelated bugs are not silently swallowed.
    try:
        preprocessing_log = os.path.join(main_dir, name + "/data/"+name.lower()+"_atac_fold_0.sh")
        if os.stat(preprocessing_log).st_size != 0:
            log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".script_v2.sh"))
    except OSError:
        try:
            preprocessing_log = os.path.join(main_dir, name + "/data/"+name+"_DNASE_PE.sh")
            if os.stat(preprocessing_log).st_size != 0:
                log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".script_v2.sh"))
        except OSError:
            preprocessing_log = os.path.join(main_dir, name + "/data/"+"h1_dnase_fold_0.sh")
            if os.stat(preprocessing_log).st_size != 0:
                log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".script_v2.sh"))

    # Bias PWM diagnostic image produced during preprocessing.
    preprocessing_log = os.path.join(main_dir, name + "/data/"+name+"_bias_pwm.png")
    if os.stat(preprocessing_log).st_size != 0:
        log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".bias_pwm.png"))

    return log_paths
+
def fetch_per_fold_training_data(odir,model_dir,encid, fold_num, main_dir, name):
    """Collect per-fold training/test region files and GC-matching logs.

    Args:
        odir: unused here; kept for signature parity with the sibling fetchers.
        model_dir: per-fold model directory holding train_test_regions_may_7_2024/.
        encid: accession used to build the upload-side file names.
        fold_num: fold index (0-4); fold 0 negatives live in "negatives_data",
            other folds in "negatives_data_<fold>".
        main_dir: root directory containing per-sample folders.
        name: sample / cell-line name.

    Returns:
        (input_paths, log_paths) lists of (local_path, upload_name) tuples.

    NOTE: os.stat() on the GC-matching log/png raises if the file is missing,
    matching the original behaviour -- those files are expected to exist.
    """
    input_paths = []
    log_paths = []

    # Chromosome-split definition for this fold.  opath already ends with
    # "/"; the concatenation below (not a two-arg join) reproduces the
    # original double-slash path string exactly.
    opath = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/splits_format/"
    split_json = os.path.join(opath + "/fold_"+str(fold_num)+".json")
    if os.path.isfile(split_json):
        input_paths.append((split_json,"cv_params.fold_"+str(fold_num)+".json"))

    # Fold 0 negatives live in "negatives_data"; other folds in
    # "negatives_data_<fold>".
    neg_dir = "negatives_data" if fold_num == 0 else "negatives_data_"+str(fold_num)

    nonpeaks_all = os.path.join(main_dir, name+"/"+neg_dir+"/negatives_with_summit.bed.gz")
    if os.path.isfile(nonpeaks_all):
        input_paths.append((nonpeaks_all,"nonpeaks.all_input_regions.fold_"+str(fold_num)+"."+encid+".bed.gz"))

    # Train/validation/test splits for peaks and nonpeaks (6 files total),
    # in the original order: all peaks subsets, then all nonpeaks subsets.
    for prefix in ["peaks", "nonpeaks"]:
        for subset in ["trainingset", "validationset", "testset"]:
            regions_bed = os.path.join(model_dir, "train_test_regions_may_7_2024/"+prefix+"."+subset+".bed.gz")
            if os.path.isfile(regions_bed):
                input_paths.append((regions_bed,prefix+"."+subset+".fold_"+str(fold_num)+"."+encid+".bed.gz"))

    # GC-matching (background-region generation) logs for this fold.
    negatives_log = os.path.join(main_dir, name+"/"+neg_dir+"/make_background_regions.log")
    if os.stat(negatives_log).st_size != 0:
        log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.txt"))

    negatives_png = os.path.join(main_dir, name+"/"+neg_dir+"/negatives_compared_with_foreground.png")
    if os.stat(negatives_png).st_size != 0:
        log_paths.append((negatives_png,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.png"))

    return input_paths, log_paths
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/temp.sh b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/temp.sh
new file mode 100644
index 00000000..128fc01a
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/temp.sh
@@ -0,0 +1,2 @@
+gzip -c /mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/DNASE_SE/H1ESC/negatives_data_4/negatives_with_summit.bed > /mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/DNASE_SE/H1ESC/negatives_data_4/negatives_with_summit.bed.gz
+
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/READMEs/bias.models.README b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/READMEs/bias.models.README
new file mode 100644
index 00000000..315b971b
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/READMEs/bias.models.README
@@ -0,0 +1,92 @@
+# Directory Structure Format
+.
+├── fold_0
+│ ├── model.bias.fold_0.encid.h5 # bias model in .h5 format
+│   ├── model.bias.fold_0.encid.tar # bias model in SavedModel format
+│ │ after being untarred, it results in a directory named "bias"
+│ └── logs.bias.models.fold_0.encid # folder containing log files for training models
+│
+├── fold_1
+│ └── ... # similar directory structure as fold_0 directory above
+│
+├── fold_2
+│ └── ... # similar directory structure as fold_0 directory above
+│
+├── fold_3
+│ └── ... # similar directory structure as fold_0 directory above
+│
+└── fold_4
+ └── ... # similar directory structure as fold_0 directory above
+
+
+# Pseudocode for loading models in .h5 format
+
+(1) Use the code in python after appropriately defining `model_in_h5_format` and `inputs`.
+(2) `inputs` is a one hot encoded sequence of shape (N,2114,4). Here N corresponds to the
+number of tested sequences, 2114 is the input sequence length and 4 corresponds to [A,C,G,T].
+
+```
+import tensorflow as tf
+from tensorflow.keras.utils import get_custom_objects
+from tensorflow.keras.models import load_model
+
+custom_objects={"tf": tf}
+get_custom_objects().update(custom_objects)
+
+model=load_model(model_in_h5_format,compile=False)
+outputs = model(inputs)
+```
+
+The list `outputs` consists of two elements. The first element has a shape of (N, 1000) and
+contains logit predictions for a 1000-base-pair output. The second element, with a shape of
+(N, 1), contains logcount predictions. To transform these predictions into per-base signals,
+follow the provided pseudo code lines below.
+
+```
+import numpy as np
+
+def softmax(x, temp=1):
+ norm_x = x - np.mean(x,axis=1, keepdims=True)
+ return np.exp(temp*norm_x)/np.sum(np.exp(temp*norm_x), axis=1, keepdims=True)
+
+predictions = softmax(outputs[0]) * (np.exp(outputs[1])-1)
+```
+
+# Pseudocode for loading models in .tar format
+
+(1) First untar the directory as follows `tar -xvf model.tar`
+(2) Use the code below in python after appropriately defining `model_dir_untared` and `inputs`
+(3) `inputs` is a one hot encoded sequence of shape (N,2114,4). Here N corresponds to the number
+of tested sequences, 2114 is the input sequence length and 4 corresponds to ACGT.
+
+Reference: https://www.tensorflow.org/api_docs/python/tf/saved_model/load
+
+```
+import tensorflow as tf
+
+model = tf.saved_model.load('model_dir_untared')
+outputs = model.signatures['serving_default'](**{'sequence':inputs.astype('float32')})
+```
+
+The variable `outputs` represents a dictionary containing two key-value pairs. The first key
+is `logits_profile_predictions`, holding a value with a shape of (N, 1000). This value corresponds
+to logit predictions for a 1000-base-pair output. The second key, named `logcount_predictions`,
+is associated with a value of shape (N, 1), representing logcount predictions. To transform these
+predictions into per-base signals, utilize the provided pseudo code lines mentioned below.
+
+```
+import numpy as np
+def softmax(x, temp=1):
+ norm_x = x - np.mean(x,axis=1, keepdims=True)
+ return np.exp(temp*norm_x)/np.sum(np.exp(temp*norm_x), axis=1, keepdims=True)
+
+predictions = softmax(outputs["logits_profile_predictions"]) * (np.exp(outputs["logcount_predictions"])-1)
+```
+
+# Docker image to load and use the models
+
+https://hub.docker.com/r/kundajelab/chrombpnet-atlas/ (tag:v1)
+
+# Tool box to do downstream analysis with the models
+
+https://github.com/kundajelab/chrombpnet/wiki
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/READMEs/bias.training.README b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/READMEs/bias.training.README
new file mode 100644
index 00000000..8faa0ea2
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/READMEs/bias.training.README
@@ -0,0 +1,63 @@
+# Directory Structure Format
+.
+├── peaks.all_input_regions.encid.bed.gz # Peaks input to the bias training script
+├── logs.bias.training_test_regions.encid # folder containing log files for peak and nonpeak generation scripts
+│
+├── fold_0
+│ ├── cv_params.fold_0.json # training, validation and test chromosomes used in fold 0
+│ ├── nonpeaks.all_input_regions.fold_0.encid.bed.gz # Non peaks input to the bias training script
+│ ├── nonpeaks.trainingset.fold_0.encid.bed.gz # nonpeaks used in training set of fold 0 bias model
+│ ├── nonpeaks.validationset.fold_0.encid.bed.gz # nonpeaks used in validation set of fold 0 bias model
+│ ├── nonpeaks.testset.fold_0.encid.bed.gz # nonpeaks used in test set of fold 0 bias model
+│ └── logs.bias.training_test_regions.fold_0.encid # folder containing log files for training bias model on fold 0
+│
+├── fold_1
+│ └── ... # similar directory structure as fold_0 directory above
+│
+├── fold_2
+│ └── ... # similar directory structure as fold_0 directory above
+│
+├── fold_3
+│ └── ... # similar directory structure as fold_0 directory above
+│
+└── fold_4
+ └── ... # similar directory structure as fold_0 directory above
+
+# Bed File Format for Peaks
+
+* All the bed files are in narrowpeak format with 10 columns.
+
+1) chrom - Name of the chromosome (or contig, scaffold, etc.).
+2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
+3) chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
+4) name - Name given to a region (preferably unique). Use "." if no name is assigned.
+5) score - Indicates how dark the peak will be displayed in the browser (0-1000). If all scores were "0" when the data were submitted to the DCC, the DCC assigned scores 1-1000 based on signal value. Ideally the average signalValue per base spread is between 100-1000.
+6) strand - +/- to denote strand or orientation (whenever applicable). Use "." if no orientation is assigned.
+7) signalValue - Measurement of overall (usually, average) enrichment for the region.
+8) pValue - Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.
+9) qValue - Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
+10) peak - Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.
+
+# Bed File Format for Nonpeaks
+
+* All the bed files are in narrowpeak format with 10 columns.
+
+1) chrom - Name of the chromosome (or contig, scaffold, etc.).
+2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
+3) chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99.
+4) empty character - "."
+5) empty character - "."
+6) empty character - "."
+7) empty character - "."
+8) empty character - "."
+9) empty character - "."
+10) (chromEnd-chromStart)/2
+
+# Format of file `cv_params.fold_0.json`
+
+A dictionary with following (key,value) pairs,
+
+1) ("CV_type", "chr_holdout")
+2) ("train", list_of_chrs_trainingset)
+3) ("valid", list_of_chrs_validationset)
+4) ("test", list_of_chrs_testset)
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/atac_bias_model_chrombpnet.csv b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/atac_bias_model_chrombpnet.csv
new file mode 100644
index 00000000..15190cf2
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/atac_bias_model_chrombpnet.csv
@@ -0,0 +1,26 @@
+fold_0,GM12878,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/GM12878/nautilus_runs/GM12878_03.01.2022_bias_128_4_1234_0.4_fold_0
+fold_1,GM12878,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/GM12878/GM12878_07.08.2022_bias_128_4_1234_0.4_fold_1_data_type_ATAC_PE
+fold_2,GM12878,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/GM12878/GM12878_07.08.2022_bias_128_4_1234_0.4_fold_2_data_type_ATAC_PE
+fold_3,GM12878,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/GM12878/GM12878_07.14.2022_bias_128_4_1234_0.4_fold_3_data_type_ATAC_PE
+fold_4,GM12878,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/GM12878/GM12878_07.07.2022_bias_128_4_1234_0.4_fold_4_data_type_ATAC_PE
+fold_0,K562,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/K562/nautilus_runs/K562_02.17.2022_bias_128_4_1234_0.5_fold_0
+fold_1,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_1_data_type_ATAC_PE
+fold_2,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_2_data_type_ATAC_PE
+fold_3,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_3_data_type_ATAC_PE
+fold_4,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_4_data_type_ATAC_PE
+fold_0,HEPG2,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/HEPG2/nautilus_runs_jun16/HEPG2_05.09.2022_bias_128_4_1234_0.8_fold_0
+fold_1,HEPG2,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/HEPG2/HEPG2_06.07.2022_bias_128_4_1234_0.8_fold_1
+fold_2,HEPG2,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/HEPG2/HEPG2_05.24.2022_bias_128_4_1234_0.8_fold_2
+fold_3,HEPG2,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/HEPG2/HEPG2_05.22.2022_bias_128_4_1234_0.8_fold_3
+fold_4,HEPG2,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/HEPG2/HEPG2_05.22.2022_bias_128_4_1234_0.8_fold_4
+fold_0,IMR90,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/IMR90/nautilus_runs_apr12/IMR90_04.09.2022_bias_128_4_1234_0.4_fold_0
+fold_1,IMR90,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/IMR90/IMR90_07.17.2022_bias_128_4_1234_0.3_fold_1_data_type_ATAC_PE
+fold_2,IMR90,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/IMR90/IMR90_07.17.2022_bias_128_4_1234_0.3_fold_2_data_type_ATAC_PE
+fold_3,IMR90,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/IMR90/IMR90_07.08.2022_bias_128_4_1234_0.4_fold_3_data_type_ATAC_PE
+fold_4,IMR90,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/IMR90/IMR90_07.07.2022_bias_128_4_1234_0.4_fold_4_data_type_ATAC_PE
+fold_0,H1ESC,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/H1ESC/nautilus_runs_jun16/H1ESC_05.09.2022_bias_128_4_1234_0.8_fold_0
+fold_1,H1ESC,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/H1ESC/H1ESC_07.23.2022_bias_128_4_1234_0.7_fold_1_data_type_ATAC_PE
+fold_2,H1ESC,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/H1ESC/H1ESC_07.17.2022_bias_128_4_1234_0.8_fold_2_data_type_ATAC_PE
+fold_3,H1ESC,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/H1ESC/H1ESC_07.17.2022_bias_128_4_1234_0.8_fold_3_data_type_ATAC_PE
+fold_4,H1ESC,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/H1ESC/H1ESC_07.17.2022_bias_128_4_1234_0.8_fold_4_data_type_ATAC_PE
+
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/atac_prepare_file_for_upload_models.py b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/atac_prepare_file_for_upload_models.py
new file mode 100644
index 00000000..e23c945c
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/atac_prepare_file_for_upload_models.py
@@ -0,0 +1,159 @@
import os
import upload_utils
import json

# Root of the ATAC preprocessing outputs and the deep-SHAP bigwigs on oak.
odir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/ATAC/"
bw_odir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/full_deepshaps/bigwigs/ATAC/"
#output_dir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022-uploads/jsons/ATAC/stage1/jul_17_2023/"
# Per-fold model directory names (fold 0 through fold 4).
models_path = ["chrombpnet_model_feb15", "chrombpnet_model_feb15_fold_1", "chrombpnet_model_feb15_fold_2", "chrombpnet_model_feb15_fold_3", "chrombpnet_model_feb15_fold_4"]
output_dir = "atac_production_uploads/"
#encids = os.listdir(odir)
# QC-passed ATAC experiment accessions, one per line.
encids = open("data/atac_passed.txt").readlines()
encids = [line.strip() for line in encids]

# Cell line -> upload accession (H1ESC uses a GEO accession, not ENCODE).
encode_id = {"K562": "ENCSR868FGK",
"GM12878": "ENCSR637XSC",
"HEPG2": "ENCSR291GJU",
"IMR90": "ENCSR200OML",
"H1ESC": "GSE267154"}
+
def main_fetch_preprocessing_files(encid, args_json):
    """Populate args_json with preprocessing outputs (bigWig + source bams).

    Uses module-level `odir` and `upload_utils`.  Returns (success,
    args_json); on failure args_json may be partially filled and should be
    discarded by the caller.
    """
    # ATAC experiments serve as their own bias-model source.
    args_json["upload bias"] = False
    args_json["bias model encid"] = encid

    # The observed signal bigWig must exist before anything else is recorded.
    preprocessing_path = os.path.join(odir, encid + "/preprocessing/bigWigs/"+encid+".bigWig")
    if not os.path.isfile(preprocessing_path):
        return False, args_json

    # Recover the input bam accessions from the preprocessing log; None
    # signals an unparseable log.
    bam_ids = upload_utils.fetch_input_bam_ids(odir, encid)
    if bam_ids is None:
        return False, args_json

    args_json["experiment"] = encid
    args_json["bam files"] = bam_ids
    args_json["assay"] = "ATAC-seq"
    args_json["observed signal profile bigWig"] = preprocessing_path
    return True, args_json
+
def main_fetch_model_files(encid, args_json):
    """Collect per-fold ChromBPNet model files and their training logs into
    args_json["models tar"]; returns (success, args_json)."""
    models_tar = {}
    args_json["models tar"] = models_tar

    readme_file = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/READMES/models.README"
    assert(os.path.isfile(readme_file))
    models_tar["file.paths"] = [(readme_file, "README.md")]
    models_tar["logs.models."+encid] = {"file.paths": None}

    for fold in range(5):
        data_paths, log_paths, log_paths_opt = upload_utils.fetch_per_fold_models(odir, models_path[fold], encid, fold)

        # A missing model for any fold aborts the whole experiment.
        if data_paths is None:
            return False, args_json

        fold_key = "fold_" + str(fold)
        models_tar[fold_key] = {
            "file.paths": data_paths,
            "logs.models." + fold_key + "." + encid: {"file.paths": log_paths + log_paths_opt},
        }
        # 6 model files and 13 required log files expected per fold.
        assert(len(data_paths) == 6)
        assert(len(log_paths) == 13)

    return True, args_json
+
def main_fetch_training_files(encid, args_json):
    """Populate args_json["training and test regions tar"] with the peak and
    nonpeak region files plus per-fold split files for an ATAC experiment.

    Uses module-level `odir`, `models_path` and `upload_utils`.
    Returns (success, args_json).
    """
    success = False

    # find the training test regions
    args_json["training and test regions tar"] = {}
    readme_file = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/READMES/training_test_regions.README"
    assert(os.path.isfile(readme_file))
    args_json["training and test regions tar"]["file.paths"] = [(readme_file, "README.md")]

    # Input peaks are required; bail out if missing.
    input_peaks = os.path.join(odir, encid + "/preprocessing/downloads/peaks.bed.gz")
    if os.path.isfile(input_peaks):
        args_json["training and test regions tar"]["file.paths"].append((input_peaks,"peaks.all_input_regions."+encid+".bed.gz"))
    else:
        success = False
        return success, args_json

    # Gzip the negatives bed via pandas so a .gz copy exists for upload.
    # NOTE(review): unlike the DNase variant, this runs even when the .gz
    # already exists, re-writing it each time -- confirm intended.
    input_nonpeaks = os.path.join(odir, encid + "/negatives_data/negatives_with_summit.bed")
    if os.path.isfile(input_nonpeaks):
        import pandas as pd
        #os.system("gzip "+input_nonpeaks)
        nonpeaks_data = pd.read_csv(input_nonpeaks, sep="\t", header=None)
        nonpeaks_data.to_csv(input_nonpeaks+".gz", sep="\t", header=False, index=False, compression="gzip")
        #os.system("rm "+input_nonpeaks)

    input_nonpeaks = os.path.join(odir, encid + "/negatives_data/negatives_with_summit.bed.gz")
    if os.path.isfile(input_nonpeaks):
        args_json["training and test regions tar"]["file.paths"].append((input_nonpeaks,"nonpeaks.all_input_regions."+encid+".bed.gz"))
    else:
        success = False
        return success, args_json

    # 12 preprocessing log files are expected.
    log_paths = upload_utils.fetch_preprocessing_log_files(odir,encid)
    args_json["training and test regions tar"]["logs.training_test_regions."+encid] = {"file.paths": log_paths}
    assert(len(log_paths) == 12)

    for i in range(5):
        # 7 region files and 4 log files expected per fold.
        data_paths, log_paths = upload_utils.fetch_per_fold_training_data(odir,models_path[i], encid, i)

        args_json["training and test regions tar"]["fold_"+str(i)] = {}
        args_json["training and test regions tar"]["fold_"+str(i)]["file.paths"] = data_paths
        args_json["training and test regions tar"]["fold_"+str(i)]["logs.training_test_regions.fold_"+str(i)+"."+encid] = {"file.paths": log_paths}
        assert(len(data_paths) == 7)
        assert(len(log_paths) == 4)

        # Redundant with the assert above, but remains the guard when
        # asserts are stripped (python -O).
        if len(data_paths) != 7:
            success = False
            return success, args_json

    success = True
    return success, args_json
+
+
if __name__ == "__main__":


    for name in ["K562", "GM12878", "HEPG2", "IMR90", "H1ESC"]:


        encid=encode_id[name]
        # Skip experiments whose upload JSON was already generated.
        if os.path.isfile(output_dir+"/"+encid+".json"):
            continue

        print(encid)

        args_json = {}

        # Each stage bails out (continue) when its required files are missing.
        success, args_json = main_fetch_preprocessing_files(encid, args_json)
        if not success:
            print("fail prep")
            continue

        success, args_json = main_fetch_model_files(encid, args_json)
        if not success:
            print("fail model")
            continue

        success, args_json = main_fetch_training_files(encid, args_json)
        if not success:
            print("fail train prep")
            continue


        # Write the assembled upload manifest for this experiment.
        with open(output_dir+"/"+encid+".json", "w") as outfile:
            json.dump(args_json, outfile, indent=4)

        #print(args_json)
+
+
+
+
+
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/dnase_prepare_file_for_upload_models.py b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/dnase_prepare_file_for_upload_models.py
new file mode 100644
index 00000000..58521913
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/dnase_prepare_file_for_upload_models.py
@@ -0,0 +1,204 @@
import os
import upload_utils
import json

# Roots of the DNase preprocessing outputs and the deep-SHAP bigwigs on oak.
odir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/DNASE/"
bw_odir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/full_deepshaps/bigwigs/DNASE/"
#output_dir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022-uploads/jsons/DNASE/stage1/jul_26_2023/"
output_dir="dnase_production_uploads/"

# QC-passed experiment accessions, grouped by sample type.
tissue_encids = open("../data/tissue_passed.txt").readlines()
tissue_encids = [line.strip() for line in tissue_encids]

primary_encids = open("../data/primary_passed.txt").readlines()
primary_encids = [line.strip() for line in primary_encids]

celline_encids = open("../data/cellline_passed.txt").readlines()
celline_encids = [line.strip() for line in celline_encids]

invitro_encids = open("../data/invitro_passed.txt").readlines()
invitro_encids = [line.strip() for line in invitro_encids]


# Per-sample-type model directory names, one entry per fold 0-4.
# Fix: this first list was named "ary_models_path", but __main__ selects
# "primary_models_path" for primary samples, raising NameError.
primary_models_path = ["chrombppnet_model_encsr283tme_bias", "chrombppnet_model_encsr283tme_bias_fold_1", "chrombppnet_model_encsr283tme_bias_fold_2", "chrombppnet_model_encsr283tme_bias_fold_3", "chrombppnet_model_encsr283tme_bias_fold_4"]
celline_models_path = ["chrombpnet_model_feb15_fold_0", "chrombpnet_model_feb15_fold_1", "chrombpnet_model_feb15_fold_2", "chrombpnet_model_feb15_fold_3", "chrombpnet_model_feb15_fold_4"]
tissue_models_path = ["chrombpnet_model_encsr880cub_bias","chrombppnet_model_encsr880cub_bias_fold_1","chrombppnet_model_encsr880cub_bias_fold_2","chrombppnet_model_encsr880cub_bias_fold_3","chrombppnet_model_encsr880cub_bias_fold_4"]
invitro_models_path = ["chrombpnet_model_encsr146kfx_bias", "chrombpnet_model_encsr146kfx_bias_fold_1", "chrombpnet_model_encsr146kfx_bias_fold_2", "chrombpnet_model_encsr146kfx_bias_fold_3", "chrombpnet_model_encsr146kfx_bias_fold_4"]

encids = tissue_encids + primary_encids + celline_encids + invitro_encids
+
def main_fetch_preprocessing_files(encid, args_json, bias_encid):
    """Populate args_json with preprocessing outputs for a DNase experiment.

    bias_encid is the accession of the bias model chosen for this sample
    type.  Uses module-level `odir` and `upload_utils`.  Returns (success,
    args_json).
    """
    # DNase experiments borrow a bias model trained on another experiment.
    args_json["upload bias"] = False
    args_json["bias model encid"] = bias_encid

    # The observed signal bigWig must exist before anything else is recorded.
    preprocessing_path = os.path.join(odir, encid + "/preprocessing/bigWigs/"+encid+".bigWig")
    if not os.path.isfile(preprocessing_path):
        return False, args_json

    # Recover the input bam accessions from the preprocessing log; None
    # signals an unparseable log.
    bam_ids = upload_utils.fetch_input_bam_ids(odir, encid)
    if bam_ids is None:
        return False, args_json

    args_json["experiment"] = encid
    args_json["bam files"] = bam_ids
    args_json["assay"] = "DNase-seq"
    args_json["observed signal profile bigWig"] = preprocessing_path
    return True, args_json
+
def main_fetch_model_files(encid, args_json):
    """Populate args_json["models tar"] with the per-fold ChromBPNet model
    files plus their training logs.

    Uses the module-level `models_path` (chosen per sample type in __main__)
    and `odir`.  Returns (success, args_json).
    """
    args_json["models tar"] = {}
    readme_file = "READMEs/bias.models.README"
    assert(os.path.isfile(readme_file))
    args_json["models tar"]["file.paths"] = [(readme_file, "README.md")]

    for i in range(5):
        data_paths, log_paths, log_paths_opt = upload_utils.fetch_per_fold_models(odir,models_path[i], encid, i)

        # A missing model for any fold aborts the whole experiment.
        if data_paths is None:
            return False, args_json

        args_json["models tar"]["fold_"+str(i)] = {}
        args_json["models tar"]["fold_"+str(i)]["file.paths"] = data_paths
        args_json["models tar"]["fold_"+str(i)]["logs.models.fold_"+str(i)+"."+encid] = {"file.paths": log_paths+log_paths_opt}
        # 6 model files and 7 required log files expected per fold
        # (debug print of len(log_paths) removed).
        assert(len(data_paths) == 6)
        assert(len(log_paths) == 7)

    return True, args_json
+
def main_fetch_training_files(encid, args_json):
    """Populate args_json["training and test regions tar"] with the peak and
    nonpeak region files plus per-fold split files for a DNase experiment.

    Uses module-level `odir`, `models_path` and `upload_utils`.
    Returns (success, args_json).
    """
    success = False

    # find the training test regions
    args_json["training and test regions tar"] = {}
    #readme_file = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/READMES/training_test_regions.README"
    # NOTE(review): this reuses the bias-models README for the training
    # regions tar -- confirm that is intended.
    readme_file = "READMEs/bias.models.README"
    assert(os.path.isfile(readme_file))
    args_json["training and test regions tar"]["file.paths"] = [(readme_file, "README.md")]

    # Input peaks are required; bail out if missing.
    input_peaks = os.path.join(odir, encid + "/preprocessing/downloads/peaks.bed.gz")
    if os.path.isfile(input_peaks):
        args_json["training and test regions tar"]["file.paths"].append((input_peaks,"peaks.all_input_regions."+encid+".bed.gz"))
    else:
        success = False
        return success, args_json

    # If only the uncompressed negatives bed exists, gzip it via pandas so
    # a .gz copy exists for upload (the original file is left in place).
    input_nonpeaks_gz = os.path.join(odir, encid + "/negatives_data/negatives_with_summit.bed.gz")
    input_nonpeaks = os.path.join(odir, encid + "/negatives_data/negatives_with_summit.bed")
    if not os.path.isfile(input_nonpeaks_gz):
        if os.path.isfile(input_nonpeaks):
            import pandas as pd
            #os.system("gzip "+input_nonpeaks)
            nonpeaks_data = pd.read_csv(input_nonpeaks, sep="\t", header=None)
            nonpeaks_data.to_csv(input_nonpeaks+".gz", sep="\t", header=False, index=False, compression="gzip")
            #os.system("rm "+input_nonpeaks)

    input_nonpeaks = os.path.join(odir, encid + "/negatives_data/negatives_with_summit.bed.gz")

    if os.path.isfile(input_nonpeaks):
        args_json["training and test regions tar"]["file.paths"].append((input_nonpeaks,"nonpeaks.all_input_regions."+encid+".bed.gz"))
    else:
        success = False
        return success, args_json

    # 12 preprocessing log files are expected.
    log_paths = upload_utils.fetch_preprocessing_log_files(odir,encid)
    args_json["training and test regions tar"]["logs.training_test_regions."+encid] = {"file.paths": log_paths}
    #print(len(log_paths))
    #print(log_paths)
    assert(len(log_paths) == 12)

    for i in range(5):
        data_paths, log_paths = upload_utils.fetch_per_fold_training_data(odir,models_path[i], encid, i)

        args_json["training and test regions tar"]["fold_"+str(i)] = {}
        args_json["training and test regions tar"]["fold_"+str(i)]["file.paths"] = data_paths
        args_json["training and test regions tar"]["fold_"+str(i)]["logs.training_test_regions.fold_"+str(i)+"."+encid] = {"file.paths": log_paths}
        #print(data_paths)
        assert(len(data_paths) == 7)

        # NOTE(review): expects zero per-fold logs here, while the ATAC
        # variant expects 4 -- confirm DNase folds carry no per-fold logs.
        assert(len(log_paths) == 0)

        # Redundant with the assert above, but remains the guard when
        # asserts are stripped (python -O).
        if len(data_paths) != 7:
            success = False
            return success, args_json

    success = True
    return success, args_json
+
+
if __name__ == "__main__":

    ignore_list = []

    for encid in ["ENCSR000EMT", "ENCSR477RTP"]:
        #if encid in ignore_list:
        #    continue

        # Pick the per-fold model directories and the bias-model accession
        # for this experiment's sample type.
        if encid in primary_encids:
            models_path = primary_models_path
            bias_encid="ENCSR283TME"
            #print("primary")
        elif encid in tissue_encids:
            models_path = tissue_models_path
            bias_encid="ENCSR880CUB"
            #print("tissue")
        elif encid in invitro_encids:
            models_path = invitro_models_path
            bias_encid="ENCSR146KFX"
            #print("invitro")
        elif encid in celline_encids:
            models_path = celline_models_path
            bias_encid="ENCSR149XIL"
            #print("celline")
        else:
            print(encid)
            print("type not found")
            continue

        # Skip experiments whose upload JSON was already generated.
        if os.path.isfile(output_dir+"/"+encid+".json"):
            continue

        print(encid)
        args_json = {}


        # Each stage bails out (continue) when its required files are missing.
        success, args_json = main_fetch_preprocessing_files(encid, args_json, bias_encid)
        if not success:
            print(encid)
            print("exit preprocessing")
            continue

        success, args_json = main_fetch_model_files(encid, args_json)
        if not success:
            print(encid)
            print("exit models")
            continue

        success, args_json = main_fetch_training_files(encid, args_json)
        if not success:
            print(encid)
            print("exit train test regions")
            continue


        # Write the assembled upload manifest for this experiment.
        with open(output_dir+"/"+encid+".json", "w") as outfile:
            json.dump(args_json, outfile, indent=4)

        #print(args_json)
+
+
+
+
+
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/upload_utils.py b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/upload_utils.py
new file mode 100644
index 00000000..9f7f1415
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/upload_utils.py
@@ -0,0 +1,281 @@
+import os
+import json
+import numpy as np
+
+### utils for preprocessing
+
+def fetch_input_bam_ids(odir,encid):
+    """Parse the preprocessing log for `encid` and return the list of input
+    bam accession ids that were copied (single source) or merged (multiple
+    sources) into the merged bam.
+
+    Returns None (after printing an error) when a log line following the
+    copy/merge marker does not match the expected command format.
+    """
+    log_path = os.path.join(odir, encid + "/preprocessing/preprocess_"+encid+".log")
+    logd = open(log_path).readlines()
+    set_cflag=False
+    set_bflag=False
+
+    bams_ids = []
+
+    for line in logd:
+
+        if set_cflag:
+            # Expect a line of the form "<ts> cp <src>.bam <dst>.bam".
+            words = line.strip().split()
+            if words[1] == "cp":
+                if words[2].split("/")[-1].endswith("bam"):
+                    bam_enc = words[2].split("/")[-1].replace(".bam","")
+                    bams_ids.append(bam_enc)
+                    return bams_ids
+                else:
+                    print(encid,"error")
+                    return
+            else:
+                print(encid,"error")
+                return
+
+        if set_bflag:
+            # Expect "<ts> samtools merge ... <out>.bam <in1>.bam <in2>.bam ...".
+            words = line.strip().split()
+            if words[1] == "samtools" and words[2] == "merge":
+                # BUGFIX: the loop variable used to be named `encid`, shadowing
+                # the function argument and corrupting later error messages.
+                for bam_path in words[6:]:
+                    if bam_path.split("/")[-1].endswith(".bam"):
+                        bam_enc = bam_path.split("/")[-1].replace(".bam","")
+                        bams_ids.append(bam_enc)
+                    else:
+                        print(encid,"error")
+                        return
+                return bams_ids
+            else:
+                print(encid,"error")
+                return
+
+        # Markers emitted by the preprocessing pipeline; the very next parsed
+        # line carries the actual cp / samtools merge command.
+        if "Only one source bam file found. Copying over as merged file." in line:
+            set_cflag=True
+        if "Merging bam files" in line:
+            set_bflag=True
+
+### utils for training and testing regions
+
+def fetch_preprocessing_log_files(odir, encid):
+    """Collect (source_path, archive_name) pairs for the preprocessing,
+    peak-calling and gc-matching log files of experiment `encid`.
+
+    Non-empty files are required for the preprocessing and gc-matching logs
+    (os.stat raises if they are missing); peak-calling and the optional
+    negatives logs are included only when present.
+    """
+    log_paths = []
+
+    # preprocessing (6 files)
+    preprocessing_log = os.path.join(odir, encid + "/preprocessing/preprocessing.log.e")
+    if os.stat(preprocessing_log).st_size != 0:
+        log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".stderr.txt"))
+
+    preprocessing_log = os.path.join(odir, encid + "/preprocessing/preprocessing.log.o")
+    if os.stat(preprocessing_log).st_size != 0:
+        log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".stdout.txt"))
+
+    preprocessing_log = os.path.join(odir, encid + "/preprocessing/"+encid+".log")
+    if os.stat(preprocessing_log).st_size != 0:
+        log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".stdout_v1.txt"))
+
+    preprocessing_log = os.path.join(odir, encid + "/preprocessing/preprocess_"+encid+".log")
+    if os.stat(preprocessing_log).st_size != 0:
+        log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".stdout_v2.txt"))
+
+    preprocessing_log = os.path.join(odir, encid + "/preprocessing/params_file.json")
+    if os.stat(preprocessing_log).st_size != 0:
+        log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".params_file.json"))
+
+    preprocessing_log = os.path.join(odir, encid + "/preprocessing/bigWigs/"+encid+".png")
+    if os.stat(preprocessing_log).st_size != 0:
+        log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".bias_pwm.png"))
+
+    # peak_logs (2 files, optional)
+    negatives_log = os.path.join(odir, encid + "/peak_calling/log.e")
+    if os.path.isfile(negatives_log):
+        log_paths.append((negatives_log,"logfile.peak_calling."+encid+".stdout_v1.txt"))
+
+    negatives_log = os.path.join(odir, encid + "/peak_calling/log.o")
+    if os.path.isfile(negatives_log):
+        log_paths.append((negatives_log,"logfile.peak_calling."+encid+".stdout_v2.txt"))
+
+    # negative logs (4 files)
+    negatives_log = os.path.join(odir, encid + "/negatives_data/make_background_regions.log")
+    if os.path.isfile(negatives_log):
+        if os.stat(negatives_log).st_size != 0:
+            log_paths.append((negatives_log,"logfile.gc_matching."+encid+".stdout_v1.txt"))
+
+    negatives_log = os.path.join(odir, encid + "/negatives_data/"+encid+".log")
+    if os.path.isfile(negatives_log):
+        if os.stat(negatives_log).st_size != 0:
+            # BUGFIX: this archive name used to repeat "stdout_v1", clobbering
+            # the make_background_regions.log entry above inside the tar.
+            log_paths.append((negatives_log,"logfile.gc_matching."+encid+".stdout_v2.txt"))
+
+
+    negatives_log = os.path.join(odir, encid + "/negatives_data/gc_matching.log.o")
+    if os.stat(negatives_log).st_size != 0:
+        log_paths.append((negatives_log,"logfile.gc_matching."+encid+".stdout.txt"))
+
+    negatives_log = os.path.join(odir, encid + "/negatives_data/gc_matching.log.e")
+    if os.stat(negatives_log).st_size != 0:
+        log_paths.append((negatives_log,"logfile.gc_matching."+encid+".stderr.txt"))
+
+
+    negatives_log = os.path.join(odir, encid + "/negatives_data/negatives_compared_with_foreground.png")
+    if os.stat(negatives_log).st_size != 0:
+        log_paths.append((negatives_log,"logfile.gc_matching."+encid+".stdout.png"))
+
+    return log_paths
+
+def fetch_per_fold_training_data(odir,model_dir,encid, fold_num):
+    """Collect (source_path, archive_name) pairs for the train/val/test
+    region bed files of one cross-validation fold of experiment `encid`.
+
+    Returns (input_paths, log_paths); log_paths is currently always empty.
+    Missing files are silently skipped — the caller validates the count.
+    """
+    input_paths = []
+    log_paths = []
+
+    # Shared fold-split definition (chromosome assignment) for this fold.
+    opath = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/splits_format/"
+    filtered_regions_bed = os.path.join(opath + "/fold_"+str(fold_num)+".json")
+    if os.path.isfile(filtered_regions_bed):
+        input_paths.append((filtered_regions_bed,"cv_params.fold_"+str(fold_num)+".json"))
+
+    filtered_regions_bed = os.path.join(odir, encid + "/" + model_dir + "/train_test_regions_may_7_2024/peaks.trainingset.bed.gz")
+    if os.path.isfile(filtered_regions_bed):
+        input_paths.append((filtered_regions_bed,"peaks.trainingset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+    filtered_regions_bed = os.path.join(odir, encid + "/" + model_dir + "/train_test_regions_may_7_2024/peaks.validationset.bed.gz")
+    if os.path.isfile(filtered_regions_bed):
+        input_paths.append((filtered_regions_bed,"peaks.validationset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+    filtered_regions_bed = os.path.join(odir, encid + "/" + model_dir + "/train_test_regions_may_7_2024/peaks.testset.bed.gz")
+    if os.path.isfile(filtered_regions_bed):
+        input_paths.append((filtered_regions_bed,"peaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+    filtered_regions_bed = os.path.join(odir, encid + "/" + model_dir + "/train_test_regions_may_7_2024/nonpeaks.trainingset.bed.gz")
+    if os.path.isfile(filtered_regions_bed):
+        input_paths.append((filtered_regions_bed,"nonpeaks.trainingset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+    filtered_regions_bed = os.path.join(odir, encid + "/" + model_dir + "/train_test_regions_may_7_2024/nonpeaks.validationset.bed.gz")
+    if os.path.isfile(filtered_regions_bed):
+        input_paths.append((filtered_regions_bed,"nonpeaks.validationset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+    # The nonpeaks test set now comes from the re-filtered negatives (with
+    # summits) rather than the model's train_test_regions directory.
+    #filtered_regions_bed = os.path.join(odir, encid + "/" + model_dir + "/train_test_regions_may_7_2024/nonpeaks.testset.bed.gz")
+    #if os.path.isfile(filtered_regions_bed):
+    #    input_paths.append((filtered_regions_bed,"nonpeaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+    filtered_regions_bed = os.path.join(odir, encid + "/negatives_data/test/test.fold_"+str(fold_num)+".filtered.negatives_with_summit.bed.gz")
+    if os.path.isfile(filtered_regions_bed):
+        input_paths.append((filtered_regions_bed,"nonpeaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+
+    # preprocessing logs to include
+
+
+    return input_paths, log_paths
+
+### utils for model uploads
+
+def fetch_per_fold_models(odir, model_dir, encid, fold_num):
+    """Collect the model files and training logs of one fold of `encid`.
+
+    Returns (input_paths, log_paths, log_paths_opt), each a list of
+    (source_path, archive_name) tuples, or (None, None, None) when any of
+    the six required model files (.h5 and .tar forms of chrombpnet_nobias,
+    chrombpnet and bias_model_scaled) is missing.
+    """
+    input_paths = []
+    log_paths = []
+    log_paths_opt = []
+
+    # Required model files: all six must exist or the fold is rejected.
+    cmb = os.path.join(odir, encid + "/" + model_dir + "/chrombpnet_wo_bias.h5")
+    print(cmb)
+    if os.path.isfile(cmb):
+        input_paths.append((cmb,"model.chrombpnet_nobias.fold_"+str(fold_num)+"."+encid+".h5"))
+    else:
+        return None, None, None
+
+    cmb = os.path.join(odir, encid + "/" + model_dir + "/chrombpnet.h5")
+    if os.path.isfile(cmb):
+        input_paths.append((cmb,"model.chrombpnet.fold_"+str(fold_num)+"."+encid+".h5"))
+    else:
+        return None, None, None
+
+
+
+    bm_model = os.path.join(odir, encid + "/" + model_dir + "/bias_model_scaled.h5")
+    if os.path.isfile(bm_model):
+        input_paths.append((bm_model,"model.bias_scaled.fold_"+str(fold_num)+"."+encid+".h5"))
+    else:
+        return None, None, None
+
+    # Converted (.tar) model formats produced by the May-7-2024 conversion.
+    cmb = os.path.join(odir, encid + "/" + model_dir + "/new_model_formats_may_7_24_vf/chrombpnet_wo_bias.tar")
+    if os.path.isfile(cmb):
+        input_paths.append((cmb,"model.chrombpnet_nobias.fold_"+str(fold_num)+"."+encid+".tar"))
+    else:
+        return None, None, None
+
+    cmb = os.path.join(odir, encid + "/" + model_dir + "/new_model_formats_may_7_24_vf/chrombpnet.tar")
+    if os.path.isfile(cmb):
+        input_paths.append((cmb,"model.chrombpnet.fold_"+str(fold_num)+"."+encid+".tar"))
+    else:
+        return None, None, None
+
+
+    bm_model = os.path.join(odir, encid + "/" + model_dir + "/new_model_formats_may_7_24_vf/bias_model_scaled.tar")
+    if os.path.isfile(bm_model):
+        input_paths.append((bm_model,"model.bias_scaled.fold_"+str(fold_num)+"."+encid+".tar"))
+    else:
+        return None, None, None
+
+    ### fetch main logs
+    # Main logs are expected to exist and be non-empty; missing/empty ones
+    # are only reported (printed), not treated as fatal.
+
+    modelling_log = os.path.join(odir, encid + "/" + model_dir + "/chrombpnet.args.json")
+    if os.stat(modelling_log).st_size != 0:
+        log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".args.json"))
+    else:
+        print(modelling_log)
+
+    modelling_log = os.path.join(odir, encid + "/" + model_dir + "/chrombpnet_data_params.tsv")
+    if os.stat(modelling_log).st_size != 0:
+        log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".chrombpnet_data_params.tsv"))
+    else:
+        print(modelling_log)
+
+    modelling_log = os.path.join(odir, encid + "/" + model_dir + "/chrombpnet_model_params.tsv")
+    if os.stat(modelling_log).st_size != 0:
+        log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".chrombpnet_model_params.tsv"))
+    else:
+        print(modelling_log)
+
+    modelling_log = os.path.join(odir, encid + "/" + model_dir + "/chrombpnet.params.json")
+    if os.stat(modelling_log).st_size != 0:
+        log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".chrombpnet.params.json"))
+    else:
+        print(modelling_log)
+
+    modelling_log = os.path.join(odir, encid + "/" + model_dir + "/chrombpnet.log")
+    if os.stat(modelling_log).st_size != 0:
+        log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".epoch_loss.csv"))
+    else:
+        print(modelling_log)
+
+    modelling_log = os.path.join(odir, encid + "/" + model_dir + "/chrombpnet.log.batch")
+    if os.stat(modelling_log).st_size != 0:
+        log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".batch_loss.tsv"))
+    else:
+        print(modelling_log)
+
+    modelling_log = os.path.join(odir, encid + "/" + model_dir + "/train_chrombpnet_model.log")
+    if os.stat(modelling_log).st_size != 0:
+        log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".stdout_v1.txt"))
+    else:
+        print(modelling_log)
+
+    #### fetch model training log files ########
+    # Cluster stderr/stdout capture files; optional (may not exist at all).
+    modelling_log = os.path.join(odir, encid + "/" + model_dir + "/modelling.log.e")
+    if os.path.isfile(modelling_log):
+        if os.stat(modelling_log).st_size != 0:
+            log_paths_opt.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".stderr.txt"))
+        else:
+            print(modelling_log)
+    else:
+        print(modelling_log)
+
+    modelling_log = os.path.join(odir, encid + "/" + model_dir + "/modelling.log.o")
+    if os.path.isfile(modelling_log):
+        if os.stat(modelling_log).st_size != 0:
+            log_paths_opt.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".stdout.txt"))
+        else:
+            print(modelling_log)
+    else:
+        print(modelling_log)
+
+    #### fetch model conversion log files ########
+    #print(log_paths)
+    return input_paths, log_paths, log_paths_opt
+
+
+
+
+
+
+
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/dnase_prepare_file_for_upload_models.py b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/dnase_prepare_file_for_upload_models.py
index 56da09b4..d7050441 100755
--- a/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/dnase_prepare_file_for_upload_models.py
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/dnase_prepare_file_for_upload_models.py
@@ -127,7 +127,7 @@ def main_fetch_training_files(encid, args_json):
assert(len(log_paths) == 4)
- if len(data_paths) != 7:
+ if len(data_paths) != 8:
success = False
return success, args_json
diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/upload_utils.py b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/upload_utils.py
index e4b6d6b5..ddf149f6 100755
--- a/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/upload_utils.py
+++ b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/upload_utils.py
@@ -165,7 +165,11 @@ def fetch_per_fold_training_data(odir,model_dir,encid, fold_num):
if os.path.isfile(filtered_regions_bed):
input_paths.append((filtered_regions_bed,"nonpeaks.validationset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
- filtered_regions_bed = os.path.join(odir, encid + "/" + model_dir + "/train_test_regions/nonpeaks.testset.bed.gz")
+ #filtered_regions_bed = os.path.join(odir, encid + "/" + model_dir + "/train_test_regions/nonpeaks.testset.bed.gz")
+ #if os.path.isfile(filtered_regions_bed):
+ # input_paths.append((filtered_regions_bed,"nonpeaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
+
+ filtered_regions_bed = os.path.join(odir, encid + "/negatives_data/test/test.fold_"+str(fold_num)+".filtered.negatives_with_summit.bed.gz")
if os.path.isfile(filtered_regions_bed):
input_paths.append((filtered_regions_bed,"nonpeaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz"))
diff --git a/upload_jsons/upload_jsons_scripts/modisco_uploads/READMEs/modisco.report.README b/upload_jsons/upload_jsons_scripts/modisco_uploads/READMEs/modisco.report.README
new file mode 100644
index 00000000..e69de29b
diff --git a/upload_jsons/upload_jsons_scripts/modisco_uploads/atac_prepare.py b/upload_jsons/upload_jsons_scripts/modisco_uploads/atac_prepare.py
new file mode 100644
index 00000000..dc63d61b
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/modisco_uploads/atac_prepare.py
@@ -0,0 +1,75 @@
+import os
+import json
+
+encids = ["IMR90", "H1ESC", "GM12878", "HEPG2", "K562"]
+
+# Map cell line name -> ENCODE (or GEO) experiment accession.
+encode_id = {"K562": "ENCSR868FGK",
+"GM12878": "ENCSR637XSC",
+"HEPG2": "ENCSR291GJU",
+"IMR90": "ENCSR200OML",
+"H1ESC": "GSE267154"}
+
+ooutdir='atac/'
+
+# Build one upload json per ATAC cell line describing the tfmodisco outputs:
+# the raw counts/profile modisco h5s plus the meme-format motif files for
+# every export format. An experiment is skipped entirely when any expected
+# file is missing.
+for name in encids:
+
+    encid = encode_id[name]
+    args_json = {}
+    args_json["experiment"] = encode_id[name]
+    args_json["sequence motifs tar"] = {}
+
+    success=True
+    readme_file="READMEs/modisco.report.README"
+    if os.path.isfile(readme_file):
+        args_json["sequence motifs tar"]["file.paths"] = [(readme_file, "README.md")]
+
+    args_json["sequence motifs tar"]["counts"] = {"file.paths": []}
+    args_json["sequence motifs tar"]["profile"] = {"file.paths": []}
+
+    odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"
+    counts_modisco=odir+name+"/merge_folds_new_may_05_24/counts/modisco_counts.h5"
+    if os.path.isfile(counts_modisco):
+        args_json["sequence motifs tar"]["counts"]["file.paths"].append((counts_modisco, "tfmodisco.raw_output.counts."+encid+".hd5"))
+    else:
+        print(counts_modisco)
+        continue
+
+    profile_modisco=odir+name+"/merge_folds_new_may_05_24/profile/modisco_profile.h5"
+    if os.path.isfile(profile_modisco):
+        args_json["sequence motifs tar"]["profile"]["file.paths"].append((profile_modisco, "tfmodisco.raw_output.profile."+encid+".hd5"))
+    else:
+        print(profile_modisco)
+        continue
+
+    args_json["sequence motifs tar"]["counts"]["tfmodisco.seq_contrib.counts.meme."+encid] = {"file.paths": []}
+
+    for formats in ["CWM", "CWM-PFM", "hCWM", "hCWM-PFM", "PFM"]:
+        meme_file=odir+name+"/merge_folds_new_may_05_24/counts/"+formats
+        if os.path.isfile(meme_file):
+            args_json["sequence motifs tar"]["counts"]["tfmodisco.seq_contrib.counts.meme."+encid]["file.paths"].append((meme_file,"tfmodisco.seq_contrib.counts."+formats+".meme."+encid))
+        else:
+            print(meme_file)
+            # BUGFIX: previously this branch only `continue`d the inner loop,
+            # silently dropping the missing format while still writing the
+            # json; now abort the experiment like the profile loop below.
+            success=False
+            break
+    if not success:
+        continue
+
+    args_json["sequence motifs tar"]["profile"]["tfmodisco.seq_contrib.profile.meme."+encid] = {"file.paths": []}
+
+    for formats in ["CWM", "CWM-PFM", "hCWM", "hCWM-PFM", "PFM"]:
+        meme_file=odir+name+"/merge_folds_new_may_05_24/profile/"+formats
+        if os.path.isfile(meme_file):
+            args_json["sequence motifs tar"]["profile"]["tfmodisco.seq_contrib.profile.meme."+encid]["file.paths"].append((meme_file,"tfmodisco.seq_contrib.profile."+formats+".meme."+encid))
+        else:
+            print(meme_file)
+            success=False
+            break
+    if not success:
+        continue
+
+    # Write once; never overwrite an existing upload json.
+    if not os.path.isfile(ooutdir+encode_id[name]+".json"):
+        f = open(ooutdir+encode_id[name]+".json", "w")
+        json.dump(args_json, f, indent=4)
+        f.close()
+
+
diff --git a/upload_jsons/upload_jsons_scripts/modisco_uploads/dnase_prepare.py b/upload_jsons/upload_jsons_scripts/modisco_uploads/dnase_prepare.py
new file mode 100644
index 00000000..7a35cd98
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/modisco_uploads/dnase_prepare.py
@@ -0,0 +1,75 @@
+import os
+import json
+
+encids = ["IMR90_new", "H1ESC_new", "GM12878_new", "HEPG2", "K562"]
+
+# Map cell line name -> ENCODE experiment accession.
+encode_id = {"HEPG2": "ENCSR149XIL",
+    "K562": "ENCSR000EOT",
+    "IMR90_new": "ENCSR477RTP",
+    "GM12878_new": "ENCSR000EMT",
+    "H1ESC_new": "ENCSR000EMU"}
+
+ooutdir='dnase/'
+
+# Build one upload json per DNase cell line describing the tfmodisco outputs:
+# the raw counts/profile modisco h5s plus the meme-format motif files for
+# every export format. An experiment is skipped entirely when any expected
+# file is missing.
+for name in encids:
+
+    encid = encode_id[name]
+    args_json = {}
+    args_json["experiment"] = encode_id[name]
+    args_json["sequence motifs tar"] = {}
+
+    success=True
+    readme_file="READMEs/modisco.report.README"
+    if os.path.isfile(readme_file):
+        args_json["sequence motifs tar"]["file.paths"] = [(readme_file, "README.md")]
+
+    args_json["sequence motifs tar"]["counts"] = {"file.paths": []}
+    args_json["sequence motifs tar"]["profile"] = {"file.paths": []}
+
+    odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"
+    counts_modisco=odir+name+"/merge_folds_new_may_05_24/counts/modisco_counts.h5"
+    if os.path.isfile(counts_modisco):
+        args_json["sequence motifs tar"]["counts"]["file.paths"].append((counts_modisco, "tfmodisco.raw_output.counts."+encid+".hd5"))
+    else:
+        print(counts_modisco)
+        continue
+
+    profile_modisco=odir+name+"/merge_folds_new_may_05_24/profile/modisco_profile.h5"
+    if os.path.isfile(profile_modisco):
+        args_json["sequence motifs tar"]["profile"]["file.paths"].append((profile_modisco, "tfmodisco.raw_output.profile."+encid+".hd5"))
+    else:
+        print(profile_modisco)
+        continue
+
+    args_json["sequence motifs tar"]["counts"]["tfmodisco.seq_contrib.counts.meme."+encid] = {"file.paths": []}
+
+    for formats in ["CWM", "CWM-PFM", "hCWM", "hCWM-PFM", "PFM"]:
+        meme_file=odir+name+"/merge_folds_new_may_05_24/counts/"+formats
+        if os.path.isfile(meme_file):
+            args_json["sequence motifs tar"]["counts"]["tfmodisco.seq_contrib.counts.meme."+encid]["file.paths"].append((meme_file,"tfmodisco.seq_contrib.counts."+formats+".meme."+encid))
+        else:
+            print(meme_file)
+            # BUGFIX: previously this branch only `continue`d the inner loop,
+            # silently dropping the missing format while still writing the
+            # json; now abort the experiment like the profile loop below.
+            success=False
+            break
+    if not success:
+        continue
+
+    args_json["sequence motifs tar"]["profile"]["tfmodisco.seq_contrib.profile.meme."+encid] = {"file.paths": []}
+
+    for formats in ["CWM", "CWM-PFM", "hCWM", "hCWM-PFM", "PFM"]:
+        meme_file=odir+name+"/merge_folds_new_may_05_24/profile/"+formats
+        if os.path.isfile(meme_file):
+            args_json["sequence motifs tar"]["profile"]["tfmodisco.seq_contrib.profile.meme."+encid]["file.paths"].append((meme_file,"tfmodisco.seq_contrib.profile."+formats+".meme."+encid))
+        else:
+            print(meme_file)
+            success=False
+            break
+    if not success:
+        continue
+
+    # Write once; never overwrite an existing upload json.
+    if not os.path.isfile(ooutdir+encode_id[name]+".json"):
+        f = open(ooutdir+encode_id[name]+".json", "w")
+        json.dump(args_json, f, indent=4)
+        f.close()
+
+
diff --git a/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/READMEs/bc.predicted.README b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/READMEs/bc.predicted.README
new file mode 100644
index 00000000..68a4c99f
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/READMEs/bc.predicted.README
@@ -0,0 +1,71 @@
+# Directory Structure Format
+.
+├── input_regions.pred.chrombpnet_nobias.encid.bed.gz # Input bed regions to obtain prediction h5s from chrombpnet_nobias.h5 for each fold
+├── pred.chrombpnet_nobias.fold_mean.encid.h5 # Average of prediction h5s from chrombpnet_nobias.h5 across all folds (input format discussed below)
+├── logs.pred.chrombpnet_nobias.fold_mean.encid # Directory containing log files
+├── fold_0
+│ ├── pred.chrombpnet_nobias.fold_0.encid.h5 # prediction h5s for fold_0 from chrombpnet_nobias.h5 (input format discussed below)
+│ └── logs.pred.chrombpnet_nobias.fold_0.encid # Directory containing log files
+│
+├── fold_1
+│ └── ... # similar directory structure as fold_0 directory above
+│
+├── fold_2
+│ └── ... # similar directory structure as fold_0 directory above
+│
+├── fold_3
+│ └── ... # similar directory structure as fold_0 directory above
+│
+└── fold_4
+ └── ... # similar directory structure as fold_0 directory above
+
+# Format of bed file
+
+* All the bed files are in narrowpeak format with 10 columns and follow GRCh38 assembly coordinates.
+
+1) chrom - Name of the chromosome (or contig, scaffold, etc.).
+2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
+3) chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd is a 1-based coordinate. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases 1 to 100.
+4) name - Name given to a region (preferably unique). Use "." if no name is assigned.
+5) score - Indicates how dark the peak will be displayed in the browser (0-1000). If all scores were '0' when the data were submitted to the DCC, the DCC assigned scores 1-1000 based on signal value. Ideally the average signalValue per base spread is between 100-1000.
+6) strand - +/- to denote strand or orientation (whenever applicable). Use "." if no orientation is assigned.
+7) signalValue - Measurement of overall (usually, average) enrichment for the region.
+8) pValue - Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.
+9) qValue - Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
+10) peak summit - Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.
+
+# Format of h5s
+
+The h5py object consists of two keys: `coords`, `predictions`
+
+Each `coords` object has three keys - `coords_chrom`, `coords_start_dset`, `coords_end_dset`
+(a) The `coords_chrom` has an array of length N (number of regions) containing chromosome names
+(b) The `coords_start_dset` has an array of length N containing chromosome start coordinates. The first base in a chromosome is numbered 0. Follows GRCh38 assembly coordinates.
+(c) The `coords_end_dset` has an array of length N containing chromosome end coordinates. The `coords_end_dset` is a 1-based coordinate.
+
+Each `predictions` object has two keys - `logcounts`, `logits`
+(a) The `logcounts` is again an array of shape Nx1 with logcount (log base e) predictions
+(b) The `logits` is an array of shape Nx1000, which represents the logits of the base resolution predicted probability profiles over 1000 bp for each of the N profiles
+
+The `predictions` align with regions specified in the bed file, centered at the summit and expanded by 500 base pairs (bp)
+on each side. The 'coords' object should contain the corresponding coordinates for each prediction, and the difference
+between 'coords_end_dset' and 'coords_start_dset' should equal 1000.
+
+# Obtaining average h5s and then prediction bigwigs from individual folds
+
+To create the `fold_mean.encid.h5` file from individual h5 files, we start by averaging the logcounts and logits across various folds.
+Next, we utilize a softmax operation on the averaged logits to transition them into probability profiles. In parallel, we exponentiate
+the logcounts to convert them into counts. Multiplying the counts with the derived probability profiles, we generate base-resolution
+predictions, which are subsequently recorded into both h5 and bigWig files.
+
+# Pseudocode for loading h5s
+
+```
+import h5py
+data = h5py.File(predictions_h5, "r")
+logcounts_preds = data['predictions']['logcounts']
+logit_preds = data['predictions']['logits']
+chrom_coords = data['coords']['coords_chrom']
+start_coords = data['coords']['coords_start_dset']
+end_coords = data['coords']['coords_end_dset']
+```
diff --git a/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/READMEs/predicted.README b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/READMEs/predicted.README
new file mode 100644
index 00000000..847959b2
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/READMEs/predicted.README
@@ -0,0 +1,71 @@
+# Directory Structure Format
+.
+├── input_regions.pred.chrombpnet.encid.bed.gz # Input bed regions to obtain prediction h5s from chrombpnet.h5 model for each fold
+├── pred.chrombpnet.fold_mean.encid.h5 # Average of prediction h5s from chrombpnet.h5 model across all folds (input format discussed below)
+├── logs.pred.chrombpnet.fold_mean.encid # Directory containing log files
+├── fold_0
+│ ├── pred.chrombpnet.fold_0.encid.h5 # prediction h5s for fold_0 from chrombpnet.h5 model (input format discussed below)
+│ └── logs.pred.chrombpnet.fold_0.encid # Directory containing log files
+│
+├── fold_1
+│ └── ... # similar directory structure as fold_0 directory above
+│
+├── fold_2
+│ └── ... # similar directory structure as fold_0 directory above
+│
+├── fold_3
+│ └── ... # similar directory structure as fold_0 directory above
+│
+└── fold_4
+ └── ... # similar directory structure as fold_0 directory above
+
+# Format of bed file
+
+* All the bed files are in narrowpeak format with 10 columns and follow GRCh38 assembly coordinates.
+
+1) chrom - Name of the chromosome (or contig, scaffold, etc.).
+2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0.
+3) chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd is a 1-based coordinate. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases 1 to 100.
+4) name - Name given to a region (preferably unique). Use "." if no name is assigned.
+5) score - Indicates how dark the peak will be displayed in the browser (0-1000). If all scores were '0' when the data were submitted to the DCC, the DCC assigned scores 1-1000 based on signal value. Ideally the average signalValue per base spread is between 100-1000.
+6) strand - +/- to denote strand or orientation (whenever applicable). Use "." if no orientation is assigned.
+7) signalValue - Measurement of overall (usually, average) enrichment for the region.
+8) pValue - Measurement of statistical significance (-log10). Use -1 if no pValue is assigned.
+9) qValue - Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned.
+10) peak summit - Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called.
+
+# Format of h5s
+
+The h5py object consists of two keys: `coords`, `predictions`
+
+Each `coords` object has three keys - `coords_chrom`, `coords_start_dset`, `coords_end_dset`
+(a) The `coords_chrom` has an array of length N (N is the number of regions) containing chromosome names
+(b) The `coords_start_dset` has an array of length N containing chromosome start coordinates. The first base in a chromosome is numbered 0. Follows GRCh38 assembly coordinates.
+(c) The `coords_end_dset` has an array of length N containing chromosome end coordinates. The `coords_end_dset` is a 1-based coordinate.
+
+Each `predictions` object has two keys - `logcounts`, `logits`
+(a) The `logcounts` is again an array of shape Nx1 with logcount (log base e) predictions
+(b) The `logits` is an array of shape Nx1000, which represents the logits of the base resolution predicted probability profiles over 1000 bp for each of the N profiles
+
+The `predictions` align with regions specified in the bed file, centered at the summit and expanded by 500 base pairs (bp)
+on each side. The 'coords' object should contain the corresponding coordinates for each prediction, and the difference
+between 'coords_end_dset' and 'coords_start_dset' should equal 1000.
+
+# Obtaining average h5s and then prediction bigwigs from individual folds
+
+To create the `fold_mean.encid.h5` file from individual h5 files, we start by averaging the logcounts and logits across various folds.
+Next, we utilize a softmax operation on the averaged logits to transition them into probability profiles. In parallel, we exponentiate
+the logcounts to convert them into counts. Multiplying the counts with the derived probability profiles, we generate base-resolution
+predictions, which are subsequently recorded into both h5 and bigWig files.
+
+# Pseudocode for loading h5s
+
+```
+import h5py
+data = h5py.File(predictions_h5, "r")
+logcounts_preds = data['predictions']['logcounts']
+logit_preds = data['predictions']['logits']
+chrom_coords = data['coords']['coords_chrom']
+start_coords = data['coords']['coords_start_dset']
+end_coords = data['coords']['coords_end_dset']
+```
diff --git a/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/atac_prepare.py b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/atac_prepare.py
new file mode 100644
index 00000000..d8014055
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/atac_prepare.py
@@ -0,0 +1,32 @@
+import os
+import json
+
+
+# ATAC cell lines with averaged (across-fold) prediction bigWigs to upload.
+encids = ["K562", "HEPG2", "IMR90", "H1ESC", "GM12878"]
+
+# Map cell line name -> ENCODE (or GEO) experiment accession.
+encode_id = {"K562": "ENCSR868FGK",
+"GM12878": "ENCSR637XSC",
+"HEPG2": "ENCSR291GJU",
+"IMR90": "ENCSR200OML",
+"H1ESC": "GSE267154"}
+
+odir='atac/'
+for encid in encids:
+    # The .stat file marks that fold-averaging completed for this cell line.
+    ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/preds_upload/average_preds/"+encid+".mean_preds_wo_bias.stat"
+    if os.path.isfile(ofile):
+        print(encid)
+        # Mean predicted signal with bias (w_bias) and bias-corrected (wo_bias).
+        wbias = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/preds_upload/average_preds/"+encid+".mean_preds_w_bias.bw"
+        nobias = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/preds_upload/average_preds/"+encid+".mean_preds_wo_bias.bw"
+
+        assert(os.path.isfile(wbias)==True)
+        assert(os.path.isfile(nobias)==True)
+
+        output_json = {}
+        output_json["experiment"] = encode_id[encid]
+        output_json["predicted signal profile bigWig"] = wbias
+        output_json["bias-corrected predicted signal profile bigWig"] = nobias
+
+        # Write once; never overwrite an existing upload json.
+        if not os.path.isfile(odir+encode_id[encid]+".json"):
+            f = open(odir+encode_id[encid]+".json", "w")
+            json.dump(output_json, f, indent=4)
+            f.close()
diff --git a/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/atac_prepare_tar.py b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/atac_prepare_tar.py
new file mode 100644
index 00000000..4b336606
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/atac_prepare_tar.py
@@ -0,0 +1,139 @@
+import os
+import json
+import pandas as pd
+
+names = ["K562", "HEPG2", "IMR90", "H1ESC", "GM12878"]
+
+encode_id = {"K562": "ENCSR868FGK",
+"GM12878": "ENCSR637XSC",
+"HEPG2": "ENCSR291GJU",
+"IMR90": "ENCSR200OML",
+"H1ESC": "GSE267154"}
+
+encode_id_dnase = {
+"GM12878": "ENCSR000EMT",
+"IMR90": "ENCSR477RTP",
+"H1ESC": "ENCSR000EMU"}
+
+outdir='atac_tar/'
+
+def fetch_per_fold_preds(odir,model_path, encid, i, name):
+
+ data_paths = []
+ log_paths = []
+ log_paths_opt = []
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/preds_upload/fold_"+str(i)+"/"
+ input_h5 = os.path.join(odir, name+"_wo_bias_all_predictions.h5")
+ data_paths.append((input_h5, "pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".h5"))
+
+ input_log = os.path.join(odir, "pred.counts.log.e")
+ #print(input_log)
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stderr.txt"))
+
+ input_log = os.path.join(odir, "pred.counts.log.o")
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stdout.txt"))
+
+ input_log="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/"+model_path.split("/")[-1]+"/chrombpnet_model/preds_atac/pred.counts.log.o"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stdout_v1.txt"))
+
+ input_log="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/"+model_path.split("/")[-1]+"/chrombpnet_model/preds_atac/pred.counts.log.e"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stderr_v1.txt"))
+
+ input_log="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/"+model_path.split("/")[-1]+"/chrombpnet_model/preds_dnase/pred.counts.log.o"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stdout_v2.txt"))
+
+ input_log="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/"+model_path.split("/")[-1]+"/chrombpnet_model/preds_dnase/pred.counts.log.e"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stderr_v2.txt"))
+
+ return data_paths, log_paths, log_paths_opt
+
+def fetch_pred_tar(encid, args_json, model_paths, name):
+ success = False
+ args_json["bias-corrected predicted signal profile tar"] = {}
+ readme_file = "READMEs/bc.predicted.README"
+ assert(os.path.isfile(readme_file))
+ args_json["bias-corrected predicted signal profile tar"]["file.paths"] = [(readme_file, "README.md")]
+ args_json["bias-corrected predicted signal profile tar"]["logs.pred.chrombpnet_nobias.fold_mean."+encid] = {"file.paths": []}
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/preds_upload/average_preds/"
+
+ input_h5 = os.path.join(odir, name+".mean_preds_wo_bias_predictions.h5")
+ if os.path.isfile(input_h5):
+ args_json["bias-corrected predicted signal profile tar"]["file.paths"].append((input_h5,"pred.chrombpnet_nobias.fold_mean."+encid+".h5"))
+ else:
+ success = False
+ return success, args_json
+
+ if name in ["IMR90", "GM12878", "H1ESC"]:
+ bed1 = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/DNASE/"+encode_id_dnase[name]+"/preprocessing/downloads/peaks.bed.gz", sep='\t', header=None)
+ else:
+ bed1 = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/peaks_no_blacklist.bed", sep='\t', header=None)
+
+
+ bed2 = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/peaks_no_blacklist.bed", sep='\t', header=None)
+
+ print(bed1.shape)
+ print(bed2.shape)
+ bedf = pd.concat([bed1, bed2])
+ print(bedf.shape)
+
+ input_bed = os.path.join(odir, "input.regions.bed.gz")
+ if os.path.isfile(input_bed):
+ args_json["bias-corrected predicted signal profile tar"]["file.paths"].append((input_bed,"input_regions.pred.chrombpnet_nobias."+encid+".bed.gz"))
+ else:
+ bedf.to_csv(input_bed, sep='\t', header=False, index=False, compression='gzip')
+
+
+ input_log = os.path.join(odir, "merge.preds.log.e")
+ if os.path.isfile(input_log):
+ args_json["bias-corrected predicted signal profile tar"]["logs.pred.chrombpnet_nobias.fold_mean."+encid]["file.paths"].append((input_log, "logs.pred.chrombpnet_nobias.fold_mean."+encid+".stderr.txt"))
+
+ input_log = os.path.join(odir, "merge.preds.log.o")
+ if os.path.isfile(input_log):
+ args_json["bias-corrected predicted signal profile tar"]["logs.pred.chrombpnet_nobias.fold_mean."+encid]["file.paths"].append((input_log, "logs.pred.chrombpnet_nobias.fold_mean."+encid+".stdout.txt"))
+
+
+ for i in range(5):
+ data_paths, log_paths, log_paths_opt = fetch_per_fold_preds(odir,model_paths[i], encid, i, name)
+
+ if data_paths is None:
+ success = False
+ return success, args_json
+
+ args_json["bias-corrected predicted signal profile tar"]["fold_"+str(i)] = {}
+ args_json["bias-corrected predicted signal profile tar"]["fold_"+str(i)]["file.paths"] = data_paths
+ args_json["bias-corrected predicted signal profile tar"]["fold_"+str(i)]["logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid] = {"file.paths": log_paths+log_paths_opt}
+ assert(len(data_paths) == 1)
+ print(len(log_paths))
+ assert(len(log_paths) == 6)
+
+ success=True
+ return success, args_json
+
+model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/model_dir_atac.csv",sep=",", header=None)
+
+for name in names:
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/preds_upload/average_preds/"+name+".mean_preds_wo_bias.stat"
+ if os.path.isfile(ofile):
+ args_json = {}
+ encid=encode_id[name]
+ args_json['experiment'] = encid
+ model_paths = model_atac[model_atac[1]==name][2].values
+ print(model_paths)
+ success, args_json = fetch_pred_tar(encid, args_json, model_paths, name)
+ if not success:
+ print("ERR preds tar")
+ continue
+
+ if not os.path.isfile(outdir+encid+"_wo_bias.json"):
+ f = open(outdir+encode_id[name]+"_wo_bias.json", "w")
+ json.dump(args_json, f, indent=4)
+ f.close()
+
diff --git a/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/atac_prepare_tar_w_bias.py b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/atac_prepare_tar_w_bias.py
new file mode 100644
index 00000000..0f3bbbe7
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/atac_prepare_tar_w_bias.py
@@ -0,0 +1,139 @@
+import os
+import json
+import pandas as pd
+
+names = ["K562", "HEPG2", "IMR90", "H1ESC", "GM12878"]
+
+encode_id = {"K562": "ENCSR868FGK",
+"GM12878": "ENCSR637XSC",
+"HEPG2": "ENCSR291GJU",
+"IMR90": "ENCSR200OML",
+"H1ESC": "GSE267154"}
+
+encode_id_dnase = {
+"GM12878": "ENCSR000EMT",
+"IMR90": "ENCSR477RTP",
+"H1ESC": "ENCSR000EMU"}
+
+outdir='atac_tar/'
+
+def fetch_per_fold_preds(odir,model_path, encid, i, name):
+
+ data_paths = []
+ log_paths = []
+ log_paths_opt = []
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/preds_upload/fold_"+str(i)+"/"
+ input_h5 = os.path.join(odir, name+"_w_bias_all_predictions.h5")
+ data_paths.append((input_h5, "pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".h5"))
+
+ input_log = os.path.join(odir, "pred.counts.log.e")
+ #print(input_log)
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stderr.txt"))
+
+ input_log = os.path.join(odir, "pred.counts.log.o")
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stdout.txt"))
+
+ input_log="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/"+model_path.split("/")[-1]+"/chrombpnet_model/preds_atac/pred.counts.log.o"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stdout_v1.txt"))
+
+ input_log="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/"+model_path.split("/")[-1]+"/chrombpnet_model/preds_atac/pred.counts.log.e"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stderr_v1.txt"))
+
+ input_log="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/"+model_path.split("/")[-1]+"/chrombpnet_model/preds_dnase/pred.counts.log.o"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stdout_v2.txt"))
+
+ input_log="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/"+model_path.split("/")[-1]+"/chrombpnet_model/preds_dnase/pred.counts.log.e"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stderr_v2.txt"))
+
+ return data_paths, log_paths, log_paths_opt
+
+def fetch_pred_tar(encid, args_json, model_paths, name):
+ success = False
+ args_json["bias-corrected predicted signal profile tar"] = {}
+ readme_file = "READMEs/bc.predicted.README"
+ assert(os.path.isfile(readme_file))
+ args_json["bias-corrected predicted signal profile tar"]["file.paths"] = [(readme_file, "README.md")]
+ args_json["bias-corrected predicted signal profile tar"]["logs.pred.chrombpnet_nobias.fold_mean."+encid] = {"file.paths": []}
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/preds_upload/average_preds/"
+
+ input_h5 = os.path.join(odir, name+".mean_preds_w_bias_predictions.h5")
+ if os.path.isfile(input_h5):
+ args_json["bias-corrected predicted signal profile tar"]["file.paths"].append((input_h5,"pred.chrombpnet_nobias.fold_mean."+encid+".h5"))
+ else:
+ success = False
+ return success, args_json
+
+ if name in ["IMR90", "GM12878", "H1ESC"]:
+ bed1 = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/DNASE/"+encode_id_dnase[name]+"/preprocessing/downloads/peaks.bed.gz", sep='\t', header=None)
+ else:
+ bed1 = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/peaks_no_blacklist.bed", sep='\t', header=None)
+
+
+ bed2 = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/peaks_no_blacklist.bed", sep='\t', header=None)
+
+ print(bed1.shape)
+ print(bed2.shape)
+ bedf = pd.concat([bed1, bed2])
+ print(bedf.shape)
+
+ input_bed = os.path.join(odir, "input.regions.bed.gz")
+ if os.path.isfile(input_bed):
+ args_json["bias-corrected predicted signal profile tar"]["file.paths"].append((input_bed,"input_regions.pred.chrombpnet_nobias."+encid+".bed.gz"))
+ else:
+ bedf.to_csv(input_bed, sep='\t', header=False, index=False, compression='gzip')
+
+
+ input_log = os.path.join(odir, "merge.preds.log.e")
+ if os.path.isfile(input_log):
+ args_json["bias-corrected predicted signal profile tar"]["logs.pred.chrombpnet_nobias.fold_mean."+encid]["file.paths"].append((input_log, "logs.pred.chrombpnet_nobias.fold_mean."+encid+".stderr.txt"))
+
+ input_log = os.path.join(odir, "merge.preds.log.o")
+ if os.path.isfile(input_log):
+ args_json["bias-corrected predicted signal profile tar"]["logs.pred.chrombpnet_nobias.fold_mean."+encid]["file.paths"].append((input_log, "logs.pred.chrombpnet_nobias.fold_mean."+encid+".stdout.txt"))
+
+
+ for i in range(5):
+ data_paths, log_paths, log_paths_opt = fetch_per_fold_preds(odir,model_paths[i], encid, i, name)
+
+ if data_paths is None:
+ success = False
+ return success, args_json
+
+ args_json["bias-corrected predicted signal profile tar"]["fold_"+str(i)] = {}
+ args_json["bias-corrected predicted signal profile tar"]["fold_"+str(i)]["file.paths"] = data_paths
+ args_json["bias-corrected predicted signal profile tar"]["fold_"+str(i)]["logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid] = {"file.paths": log_paths+log_paths_opt}
+ assert(len(data_paths) == 1)
+ print(len(log_paths))
+ assert(len(log_paths) == 6)
+
+ success=True
+ return success, args_json
+
+model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/model_dir_atac.csv",sep=",", header=None)
+
+for name in names:
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/preds_upload/average_preds/"+name+".mean_preds_w_bias.stat"
+ if os.path.isfile(ofile):
+ args_json = {}
+ encid=encode_id[name]
+ args_json["experiment"] = encid
+ model_paths = model_atac[model_atac[1]==name][2].values
+ print(model_paths)
+ success, args_json = fetch_pred_tar(encid, args_json, model_paths, name)
+ if not success:
+ print("ERR preds tar")
+ continue
+
+ if not os.path.isfile(outdir+encid+"_w_bias.json"):
+            f = open(outdir+encode_id[name]+"_w_bias.json", "w")
+ json.dump(args_json, f, indent=4)
+ f.close()
+
diff --git a/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/dnase_prepare.py b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/dnase_prepare.py
new file mode 100644
index 00000000..6f31826f
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/dnase_prepare.py
@@ -0,0 +1,31 @@
+import os
+import json
+
+
+encids = ["K562", "HEPG2", "IMR90_new", "H1ESC_new", "GM12878_new"]
+
+encode_id = {"HEPG2": "ENCSR149XIL",
+ "K562": "ENCSR000EOT",
+ "IMR90_new": "ENCSR477RTP",
+ "GM12878_new": "ENCSR000EMT",
+ "H1ESC_new": "ENCSR000EMU"}
+odir='dnase/'
+for encid in encids:
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/preds_upload/average_preds_with_ccre_vf/"+encid+".mean_preds_wo_bias.stat"
+ if os.path.isfile(ofile):
+ print(encid)
+ wbias = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/preds_upload/average_preds_with_ccre_vf/"+encid+".mean_preds_w_bias.bw"
+ nobias = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/preds_upload/average_preds_with_ccre_vf/"+encid+".mean_preds_wo_bias.bw"
+
+ assert(os.path.isfile(wbias)==True)
+ assert(os.path.isfile(nobias)==True)
+
+ output_json = {}
+ output_json["experiment"] = encode_id[encid]
+ output_json["predicted signal profile bigWig"] = wbias
+ output_json["bias-corrected predicted signal profile bigWig"] = nobias
+
+ if not os.path.isfile(odir+encode_id[encid]+".json"):
+ f = open(odir+encode_id[encid]+".json", "w")
+ json.dump(output_json, f, indent=4)
+ f.close()
diff --git a/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/dnase_prepare_tar.py b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/dnase_prepare_tar.py
new file mode 100644
index 00000000..c1d0ce01
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/dnase_prepare_tar.py
@@ -0,0 +1,146 @@
+import os
+import json
+import pandas as pd
+
+names = ["IMR90_new", "H1ESC_new", "GM12878_new"]
+#names = ["K562", "HEPG2"]
+
+
+model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/v1/model_dir_dnase_v2_interpret.csv",sep=",", header=None)
+#model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/v1/model_dir_dnase_v2.csv",sep=",", header=None)
+
+encode_id = {"HEPG2": "ENCSR149XIL",
+ "K562": "ENCSR000EOT",
+ "IMR90_new": "ENCSR477RTP",
+ "GM12878_new": "ENCSR000EMT",
+ "H1ESC_new": "ENCSR000EMU"}
+
+
+encode_id_dnase = {
+"GM12878_new": "ENCSR000EMT",
+"IMR90_new": "ENCSR477RTP",
+"H1ESC_new": "ENCSR000EMU"}
+
+outdir='dnase_tar/'
+
+def fetch_per_fold_preds(odir,model_path, encid, i, name):
+
+ data_paths = []
+ log_paths = []
+ log_paths_opt = []
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/preds_upload/fold_"+str(i)+"/"
+ input_h5 = os.path.join(odir, name+"_wo_bias_all_predictions.h5")
+ data_paths.append((input_h5, "pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".h5"))
+
+ input_log = os.path.join(odir, "pred.counts.log.e")
+ print(input_log)
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stderr.txt"))
+
+ input_log = os.path.join(odir, "pred.counts.log.o")
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stdout.txt"))
+
+ input_log="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/"+model_path.split("/")[-2]+"/chrombpnet_model/preds_atac/pred.counts.log.o"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stdout_v1.txt"))
+
+ print(input_log)
+ input_log="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/"+model_path.split("/")[-2]+"/chrombpnet_model/preds_atac/pred.counts.log.e"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stderr_v1.txt"))
+
+ input_log="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/"+model_path.split("/")[-2]+"/chrombpnet_model/preds_dnase/pred.counts.log.o"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stdout_v2.txt"))
+
+ input_log="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/"+model_path.split("/")[-2]+"/chrombpnet_model/preds_dnase/pred.counts.log.e"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stderr_v2.txt"))
+
+ return data_paths, log_paths, log_paths_opt
+
+def fetch_pred_tar(encid, args_json, model_paths, name):
+ success = False
+ args_json["bias-corrected predicted signal profile tar"] = {}
+ readme_file = "READMEs/bc.predicted.README"
+ assert(os.path.isfile(readme_file))
+ args_json["bias-corrected predicted signal profile tar"]["file.paths"] = [(readme_file, "README.md")]
+ args_json["bias-corrected predicted signal profile tar"]["logs.pred.chrombpnet_nobias.fold_mean."+encid] = {"file.paths": []}
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/preds_upload/average_preds/"
+
+ input_h5 = os.path.join(odir, name+".mean_preds_wo_bias_predictions.h5")
+ if os.path.isfile(input_h5):
+ args_json["bias-corrected predicted signal profile tar"]["file.paths"].append((input_h5,"pred.chrombpnet_nobias.fold_mean."+encid+".h5"))
+ else:
+ success = False
+ return success, args_json
+
+ if name in ["IMR90_new", "GM12878_new", "H1ESC_new"]:
+ bed1 = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/DNASE/"+encode_id_dnase[name]+"/preprocessing/downloads/peaks.bed.gz", sep='\t', header=None)
+ else:
+ bed1 = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/peaks_no_blacklist.bed", sep='\t', header=None)
+
+
+ bed2 = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name.replace("_new","")+"/peaks_no_blacklist.bed", sep='\t', header=None)
+
+ print(bed1.shape)
+ print(bed2.shape)
+ bedf = pd.concat([bed1, bed2])
+ print(bedf.shape)
+
+ input_bed = os.path.join(odir, "input.regions.bed.gz")
+ if os.path.isfile(input_bed):
+ args_json["bias-corrected predicted signal profile tar"]["file.paths"].append((input_bed,"input_regions.pred.chrombpnet_nobias."+encid+".bed.gz"))
+ else:
+ bedf.to_csv(input_bed, sep='\t', header=False, index=False, compression='gzip')
+
+
+ input_log = os.path.join(odir, "merge.preds.log.e")
+ if os.path.isfile(input_log):
+ args_json["bias-corrected predicted signal profile tar"]["logs.pred.chrombpnet_nobias.fold_mean."+encid]["file.paths"].append((input_log, "logs.pred.chrombpnet_nobias.fold_mean."+encid+".stderr.txt"))
+
+ input_log = os.path.join(odir, "merge.preds.log.o")
+ if os.path.isfile(input_log):
+ args_json["bias-corrected predicted signal profile tar"]["logs.pred.chrombpnet_nobias.fold_mean."+encid]["file.paths"].append((input_log, "logs.pred.chrombpnet_nobias.fold_mean."+encid+".stdout.txt"))
+
+
+ for i in range(5):
+ data_paths, log_paths, log_paths_opt = fetch_per_fold_preds(odir,model_paths[i], encid, i, name)
+
+ if data_paths is None:
+ success = False
+ return success, args_json
+
+ args_json["bias-corrected predicted signal profile tar"]["fold_"+str(i)] = {}
+ args_json["bias-corrected predicted signal profile tar"]["fold_"+str(i)]["file.paths"] = data_paths
+ args_json["bias-corrected predicted signal profile tar"]["fold_"+str(i)]["logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid] = {"file.paths": log_paths+log_paths_opt}
+ assert(len(data_paths) == 1)
+ print(len(log_paths))
+ assert(len(log_paths) == 6)
+ #assert(len(log_paths) >= 2)
+
+ success=True
+ return success, args_json
+
+
+for name in names:
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/preds_upload/average_preds/"+name+".mean_preds_wo_bias.stat"
+ if os.path.isfile(ofile):
+ args_json = {}
+ encid=encode_id[name]
+ args_json["experiment"] = encid
+ model_paths = model_atac[model_atac[1]==name.replace("_new","")][2].values
+ print(model_paths)
+ success, args_json = fetch_pred_tar(encid, args_json, model_paths, name)
+ if not success:
+ print("ERR preds tar")
+ continue
+
+ if not os.path.isfile(outdir+encid+"_wo_bias.json"):
+ f = open(outdir+encode_id[name]+"_wo_bias.json", "w")
+ json.dump(args_json, f, indent=4)
+ f.close()
+
diff --git a/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/dnase_prepare_tar_w_bias.py b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/dnase_prepare_tar_w_bias.py
new file mode 100644
index 00000000..f70a1c30
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/dnase_prepare_tar_w_bias.py
@@ -0,0 +1,144 @@
+import os
+import json
+import pandas as pd
+
+names = ["IMR90_new", "H1ESC_new", "GM12878_new"]
+#names = ["K562", "HEPG2"]
+
+model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/v1/model_dir_dnase_v2_interpret.csv",sep=",", header=None)
+#model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/v1/model_dir_dnase_v2.csv",sep=",", header=None)
+
+encode_id = {"HEPG2": "ENCSR149XIL",
+ "K562": "ENCSR000EOT",
+ "IMR90_new": "ENCSR477RTP",
+ "GM12878_new": "ENCSR000EMT",
+ "H1ESC_new": "ENCSR000EMU"}
+
+
+encode_id_dnase = {
+"GM12878_new": "ENCSR000EMT",
+"IMR90_new": "ENCSR477RTP",
+"H1ESC_new": "ENCSR000EMU"}
+
+outdir='dnase_tar/'
+
+def fetch_per_fold_preds(odir,model_path, encid, i, name):
+
+ data_paths = []
+ log_paths = []
+ log_paths_opt = []
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/preds_upload/fold_"+str(i)+"/"
+ input_h5 = os.path.join(odir, name+"_w_bias_all_predictions.h5")
+ data_paths.append((input_h5, "pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".h5"))
+
+ input_log = os.path.join(odir, "pred.counts.log.e")
+ print(input_log)
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stderr.txt"))
+
+ input_log = os.path.join(odir, "pred.counts.log.o")
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stdout.txt"))
+
+ input_log="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/"+model_path.split("/")[-2]+"/chrombpnet_model/preds_atac/pred.counts.log.o"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stdout_v1.txt"))
+
+ print(input_log)
+ input_log="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/"+model_path.split("/")[-2]+"/chrombpnet_model/preds_atac/pred.counts.log.e"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stderr_v1.txt"))
+
+ input_log="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/"+model_path.split("/")[-2]+"/chrombpnet_model/preds_dnase/pred.counts.log.o"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stdout_v2.txt"))
+
+ input_log="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/"+model_path.split("/")[-2]+"/chrombpnet_model/preds_dnase/pred.counts.log.e"
+ if os.path.isfile(input_log):
+ log_paths.append((input_log, "logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid+".stderr_v2.txt"))
+
+ return data_paths, log_paths, log_paths_opt
+
+def fetch_pred_tar(encid, args_json, model_paths, name):
+ success = False
+ args_json["bias-corrected predicted signal profile tar"] = {}
+ readme_file = "READMEs/bc.predicted.README"
+ assert(os.path.isfile(readme_file))
+ args_json["bias-corrected predicted signal profile tar"]["file.paths"] = [(readme_file, "README.md")]
+ args_json["bias-corrected predicted signal profile tar"]["logs.pred.chrombpnet_nobias.fold_mean."+encid] = {"file.paths": []}
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/preds_upload/average_preds/"
+
+ input_h5 = os.path.join(odir, name+".mean_preds_w_bias_predictions.h5")
+ if os.path.isfile(input_h5):
+ args_json["bias-corrected predicted signal profile tar"]["file.paths"].append((input_h5,"pred.chrombpnet_nobias.fold_mean."+encid+".h5"))
+ else:
+ success = False
+ return success, args_json
+
+ if name in ["IMR90_new", "GM12878_new", "H1ESC_new"]:
+ bed1 = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/DNASE/"+encode_id_dnase[name]+"/preprocessing/downloads/peaks.bed.gz", sep='\t', header=None)
+ else:
+ bed1 = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/peaks_no_blacklist.bed", sep='\t', header=None)
+
+
+ bed2 = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name.replace("_new","")+"/peaks_no_blacklist.bed", sep='\t', header=None)
+
+ print(bed1.shape)
+ print(bed2.shape)
+ bedf = pd.concat([bed1, bed2])
+ print(bedf.shape)
+
+ input_bed = os.path.join(odir, "input.regions.bed.gz")
+ if os.path.isfile(input_bed):
+ args_json["bias-corrected predicted signal profile tar"]["file.paths"].append((input_bed,"input_regions.pred.chrombpnet_nobias."+encid+".bed.gz"))
+ else:
+ bedf.to_csv(input_bed, sep='\t', header=False, index=False, compression='gzip')
+
+
+ input_log = os.path.join(odir, "merge.preds.log.e")
+ if os.path.isfile(input_log):
+ args_json["bias-corrected predicted signal profile tar"]["logs.pred.chrombpnet_nobias.fold_mean."+encid]["file.paths"].append((input_log, "logs.pred.chrombpnet_nobias.fold_mean."+encid+".stderr.txt"))
+
+ input_log = os.path.join(odir, "merge.preds.log.o")
+ if os.path.isfile(input_log):
+ args_json["bias-corrected predicted signal profile tar"]["logs.pred.chrombpnet_nobias.fold_mean."+encid]["file.paths"].append((input_log, "logs.pred.chrombpnet_nobias.fold_mean."+encid+".stdout.txt"))
+
+
+ for i in range(5):
+ data_paths, log_paths, log_paths_opt = fetch_per_fold_preds(odir,model_paths[i], encid, i, name)
+
+ if data_paths is None:
+ success = False
+ return success, args_json
+
+ args_json["bias-corrected predicted signal profile tar"]["fold_"+str(i)] = {}
+ args_json["bias-corrected predicted signal profile tar"]["fold_"+str(i)]["file.paths"] = data_paths
+ args_json["bias-corrected predicted signal profile tar"]["fold_"+str(i)]["logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid] = {"file.paths": log_paths+log_paths_opt}
+ assert(len(data_paths) == 1)
+ print(len(log_paths))
+ assert(len(log_paths) == 6)
+
+ success=True
+ return success, args_json
+
+
+for name in names:
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/preds_upload/average_preds/"+name+".mean_preds_w_bias.stat"
+ if os.path.isfile(ofile):
+ args_json = {}
+ encid=encode_id[name]
+ args_json["experiment"] = encid
+ model_paths = model_atac[model_atac[1]==name.replace("_new","")][2].values
+ print(model_paths)
+ success, args_json = fetch_pred_tar(encid, args_json, model_paths, name)
+ if not success:
+ print("ERR preds tar")
+ continue
+
+        if not os.path.isfile(outdir+encid+"_w_bias.json"):
+            f = open(outdir+encode_id[name]+"_w_bias.json", "w")
+ json.dump(args_json, f, indent=4)
+ f.close()
+
diff --git a/upload_jsons/upload_jsons_scripts/profile_bigwigs_uploads/dnase_prepare_tar.py b/upload_jsons/upload_jsons_scripts/profile_bigwigs_uploads/dnase_prepare_tar.py
new file mode 100644
index 00000000..e69de29b
diff --git a/upload_jsons/upload_jsons_scripts/profile_contrib_upload/READMES/profile.deepshap.README b/upload_jsons/upload_jsons_scripts/profile_contrib_upload/READMES/profile.deepshap.README
new file mode 100644
index 00000000..e69de29b
diff --git a/upload_jsons/upload_jsons_scripts/profile_contrib_upload/atac_tar.py b/upload_jsons/upload_jsons_scripts/profile_contrib_upload/atac_tar.py
new file mode 100644
index 00000000..b44dfb3c
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/profile_contrib_upload/atac_tar.py
@@ -0,0 +1,215 @@
+import os
+import json
+import pandas as pd
+
+#encids = ["K562", "HEPG2", "IMR90_new", "H1ESC_new", "GM12878_new"]
+# Cell types handled here; K562/HEPG2 are handled by atac_tar_k5_and_hep.py.
+encids = ["IMR90", "H1ESC", "GM12878"]
+
+# Cell type -> experiment accession (ENCODE; GEO for H1ESC).
+encode_id = {"K562": "ENCSR868FGK",
+"GM12878": "ENCSR637XSC",
+"HEPG2": "ENCSR291GJU",
+"IMR90": "ENCSR200OML",
+"H1ESC": "GSE267154"}
+# Local output directory for the generated upload jsons.
+odir='atac/'
+
+# Per-fold model directory table: col 1 = cell type, col 2 = model dir path.
+model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/model_dir_atac.csv",sep=",", header=None)
+
+def fetch_per_fold_profile(odir,model_path, encid, i, name):
+	"""Collect per-fold profile deepSHAP files for fold ``i`` of ``name``.
+
+	Returns (data_paths, log_paths, log_paths_opt) — lists of
+	(local_path, name_inside_tar) tuples. The reformatted h5 is appended
+	unconditionally; each log is appended only when it exists on disk,
+	otherwise its path is printed for manual follow-up.
+	"""
+
+	model_path_orig=model_path
+	# Mirror of the model directory under the oak project tree.
+	model_path="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/"+model_path.split("/")[-1]
+	data_paths = []
+	log_paths = []
+	log_paths_opt = []
+
+	# NOTE: rebinds the `odir` argument; the caller-supplied value is unused.
+	odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/fold_"+str(i)+"/"
+	input_h5 = os.path.join(odir, name+"_profile_attribs_reformatted.h5")
+	data_paths.append((input_h5, "seq_contrib.profile.fold_"+str(i)+"."+encid+".h5"))
+
+	#model_path="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/ATAC_SE_04.27.2024//chrombpnet_model"
+
+	# ATAC regions logs
+
+	model_path=model_path+"/chrombpnet_model"
+	input_log=model_path+"/interpret_dnase/full_"+name+".interpret.args.json"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.dnase_regions.fold_"+str(i)+"."+encid+".args.json"))
+	else:
+
+		print(input_log)
+	# NOTE(review): "interpet" matches the on-disk log filenames — do not "fix" the spelling.
+	input_log=model_path+"/interpret_dnase/full_"+name+".interpet.log"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.dnase_regions.fold_"+str(i)+"."+encid+".log"))
+	else:
+		print(input_log)
+
+	input_log=model_path+"/interpret_dnase/ATAC_peaks_full.profile.interpret.log.e"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.dnase_regions.fold_"+str(i)+"."+encid+".stderr.txt"))
+	else:
+		print(input_log)
+
+	input_log=model_path+"/interpret_dnase/ATAC_peaks_full.profile.interpret.log.o"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.dnase_regions.fold_"+str(i)+"."+encid+".stdout.txt"))
+	else:
+		print(input_log)
+
+	# atac regions logs (from the original model directory)
+
+	input_log=model_path_orig+"/interpret/merged."+name+".interpret.args.json"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.atac_regions.fold_"+str(i)+"."+encid+".args.json"))
+	else:
+		print(input_log)
+
+	input_log=model_path_orig+"/interpret/merged."+name+".interpet.log"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.atac_regions.fold_"+str(i)+"."+encid+".log"))
+	else:
+		print(input_log)
+
+	# atac regions logs (from the oak mirror)
+
+
+	input_log=model_path+"/interpret/full_"+name+".interpret.args.json"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.atac_regions.fold_"+str(i)+"."+encid+".args.json"))
+	else:
+		print(input_log)
+
+	input_log=model_path+"/interpret/full_"+name+".interpet.log"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.atac_regions.fold_"+str(i)+"."+encid+".log"))
+	else:
+		print(input_log)
+
+	input_log=model_path+"/interpret/full.profile.interpret.log.e"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.atac_regions.fold_"+str(i)+"."+encid+".stderr.txt"))
+	else:
+		print(input_log)
+
+	input_log=model_path+"/interpret/full.profile.interpret.log.o"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.atac_regions.fold_"+str(i)+"."+encid+".stdout.txt"))
+	else:
+		print(input_log)
+
+
+	return data_paths, log_paths, log_paths_opt
+
+def fetch_profile_tar(encid, args_json, model_paths, name):
+ success = False
+ args_json["profile sequence contribution scores tar"] = {}
+ readme_file = "READMES/profile.deepshap.README"
+ assert(os.path.isfile(readme_file))
+ args_json["profile sequence contribution scores tar"]["file.paths"] = [(readme_file, "README.md")]
+ args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid] = {"file.paths": []}
+
+ ## full h5 path
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/average_preds/"
+
+ input_h5 = os.path.join(odir, name+"_profile_attribs_reformatted.h5")
+ if os.path.isfile(input_h5):
+ args_json["profile sequence contribution scores tar"]["file.paths"].append((input_h5,"seq_contrib.profile.fold_mean."+encid+".h5"))
+ else:
+ success = False
+ return success, args_json
+
+ ## modisoc h5 path
+
+ modisco_input = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/merge_folds_new_may_05_24/in_peaks.profile_scores_new_compressed.h5"
+ if os.path.isfile(modisco_input):
+ args_json["profile sequence contribution scores tar"]["file.paths"].append((modisco_input,"seq_contrib.profile.fold_mean.modisco_input."+encid+".h5"))
+ else:
+ success = False
+ return success, args_json
+
+ # log files
+
+
+ input_file=model_paths[0]+"/chrombpnet_model/interpret_all/full_"+name+".interpreted_regions_profile.bed"
+ newf="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/average_preds/per_folds.inputs.bed.gz"
+ input_bed = pd.read_csv(input_file, compression='gzip', sep='\t', header=None)
+ if os.path.isfile(input_file):
+ if not os.path.isfile(newf):
+ input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip')
+ args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid]["file.paths"].append((newf,"logs.seq_contrib.profile.input_regions.per_fold."+encid+".bed.gz"))
+
+
+ input_file="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/merge_folds_new_may_05_24/in_peaks.profile.interpreted_regions.bed"
+ newf="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/average_preds/modisco.inputs.bed.gz"
+ input_bed = pd.read_csv(input_file, sep='\t', header=None)
+ if os.path.isfile(input_file):
+ if not os.path.isfile(newf):
+ input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip')
+ args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid]["file.paths"].append((newf,"logs.seq_contrib.profile.input_regions."+encid+".bed.gz"))
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/average_preds/"
+
+ input_log = os.path.join(odir, "reformat.log.e")
+ if os.path.isfile(input_log):
+ args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid]["file.paths"].append((input_log, "logs.seq_contrib.profile.fold_mean.reformat"+encid+".stderr.txt"))
+
+ input_log = os.path.join(odir, "reformat.log.e")
+ if os.path.isfile(input_log):
+ args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid]["file.paths"].append((input_log, "logs.seq_contrib.profile.fold_mean.reformat"+encid+".stdout.txt"))
+
+ assert(len(args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid]["file.paths"])==4)
+
+ for i in range(5):
+ data_paths, log_paths, log_paths_opt = fetch_per_fold_profile(odir,model_paths[i], encid, i, name)
+
+ if data_paths is None:
+ success = False
+ return success, args_json
+
+ args_json["profile sequence contribution scores tar"]["fold_"+str(i)] = {}
+ args_json["profile sequence contribution scores tar"]["fold_"+str(i)]["file.paths"] = data_paths
+ args_json["profile sequence contribution scores tar"]["fold_"+str(i)]["logs.seq_contrib.profile.fold_"+str(i)+"."+encid] = {"file.paths": log_paths+log_paths_opt}
+ assert(len(data_paths) == 1)
+ print(len(log_paths))
+ assert(len(log_paths) >= 4)
+
+ success=True
+ return success, args_json
+
+for encid in encids:
+ print(encid)
+
+
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.stats"
+ if os.path.isfile(ofile):
+ profile_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.bw"
+ else:
+ profile_bw = None
+ print(ofile)
+
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.stats"
+ if os.path.isfile(ofile):
+ profile_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.bw"
+ else:
+ profile_bw = None
+ print(ofile)
+ continue
+
+ assert(os.path.isfile(profile_bw)==True)
+ assert(os.path.isfile(profile_bw)==True)
+
+ model_paths = model_atac[model_atac[1]==encid.replace("_new","")][2].values
+ print(model_paths)
+ args_json = {}
+ args_json["experiment"] = encode_id[encid]
+
+
+ success, args_json = fetch_profile_tar(encode_id[encid], args_json, model_paths, encid)
+ if not success:
+ print("ERR profile tar")
+ continue
+
+ if not os.path.isfile(odir+encode_id[encid]+".json"):
+ f = open(odir+encode_id[encid]+".json", "w")
+ json.dump(args_json, f, indent=4)
+ f.close()
diff --git a/upload_jsons/upload_jsons_scripts/profile_contrib_upload/atac_tar_k5_and_hep.py b/upload_jsons/upload_jsons_scripts/profile_contrib_upload/atac_tar_k5_and_hep.py
new file mode 100644
index 00000000..326ac362
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/profile_contrib_upload/atac_tar_k5_and_hep.py
@@ -0,0 +1,193 @@
+import os
+import json
+import pandas as pd
+
+#encids = ["K562", "HEPG2", "IMR90_new", "H1ESC_new", "GM12878_new"]
+# K562/HEPG2 variant of atac_tar.py (older model-dir layout for these two).
+encids = ["K562", "HEPG2"]
+
+# Cell type -> experiment accession (ENCODE; GEO for H1ESC).
+encode_id = {"K562": "ENCSR868FGK",
+"GM12878": "ENCSR637XSC",
+"HEPG2": "ENCSR291GJU",
+"IMR90": "ENCSR200OML",
+"H1ESC": "GSE267154"}
+# Local output directory for the generated upload jsons.
+odir='atac/'
+
+# Per-fold model directory table: col 1 = cell type, col 2 = model dir path.
+model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/model_dir_atac.csv",sep=",", header=None)
+
+def fetch_per_fold_profile(odir,model_path, encid, i, name):
+	"""Collect per-fold profile deepSHAP files for fold ``i`` of ``name``.
+
+	Returns (data_paths, log_paths, log_paths_opt) — lists of
+	(local_path, name_inside_tar) tuples. The reformatted h5 is appended
+	unconditionally; each log only when present (missing paths printed).
+	"""
+
+	model_path_orig=model_path
+	# Mirror of the model directory under the oak project tree.
+	model_path="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/"+model_path.split("/")[-1]
+	data_paths = []
+	log_paths = []
+	log_paths_opt = []
+
+	# NOTE: rebinds the `odir` argument; the caller-supplied value is unused.
+	odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/fold_"+str(i)+"/"
+	input_h5 = os.path.join(odir, name+"_profile_attribs_reformatted.h5")
+	data_paths.append((input_h5, "seq_contrib.profile.fold_"+str(i)+"."+encid+".h5"))
+
+	#model_path="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/ATAC_SE_04.27.2024//chrombpnet_model"
+
+
+	model_path = model_path+"/chrombpnet_model"
+
+	# all regs logs (from the original model directory)
+
+	input_log=model_path_orig+"/interpret/merged."+name+".interpret.args.json"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.atacs_regs.fold_"+str(i)+"."+encid+".args.json"))
+	else:
+		print(input_log)
+
+	# NOTE(review): "interpet" matches the on-disk log filenames — do not "fix" the spelling.
+	input_log=model_path_orig+"/interpret/merged."+name+".interpet.log"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.atacs_regs.fold_"+str(i)+"."+encid+".log"))
+	else:
+		print(input_log)
+
+	# atac regs logs (from the oak mirror)
+
+
+	input_log=model_path+"/interpret/full_"+name+".interpret.args.json"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.atacs_regs.fold_"+str(i)+"."+encid+".args.json"))
+	else:
+		print(input_log)
+
+	input_log=model_path+"/interpret/full_"+name+".interpet.log"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.atacs_regs.fold_"+str(i)+"."+encid+".log"))
+	else:
+		print(input_log)
+
+	input_log=model_path+"/interpret/full.profile.interpret.log.e"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.atacs_regs.fold_"+str(i)+"."+encid+".stderr.txt"))
+	else:
+		print(input_log)
+
+	input_log=model_path+"/interpret/full.profile.interpret.log.o"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.atacs_regs.fold_"+str(i)+"."+encid+".stdout.txt"))
+	else:
+		print(input_log)
+
+
+	return data_paths, log_paths, log_paths_opt
+
+def fetch_profile_tar(encid, args_json, model_paths, name):
+ success = False
+ args_json["profile sequence contribution scores tar"] = {}
+ readme_file = "READMES/profile.deepshap.README"
+ assert(os.path.isfile(readme_file))
+ args_json["profile sequence contribution scores tar"]["file.paths"] = [(readme_file, "README.md")]
+ args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid] = {"file.paths": []}
+
+ ## full h5 path
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/average_preds/"
+
+ input_h5 = os.path.join(odir, name+"_profile_attribs_reformatted.h5")
+ if os.path.isfile(input_h5):
+ args_json["profile sequence contribution scores tar"]["file.paths"].append((input_h5,"seq_contrib.profile.fold_mean."+encid+".h5"))
+ else:
+ print(input_h5)
+ success = False
+ return success, args_json
+
+ ## modisoc h5 path
+
+ modisco_input = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/merge_folds_new_may_05_24/in_peaks.profile_scores_new_compressed.h5"
+ if os.path.isfile(modisco_input):
+ args_json["profile sequence contribution scores tar"]["file.paths"].append((modisco_input,"seq_contrib.profile.fold_mean.modisco_input."+encid+".h5"))
+ else:
+ print(modisco_input)
+ success = False
+ return success, args_json
+
+ # log files
+
+
+ input_file=model_paths[1]+"/chrombpnet_model/interpret/full_"+name+".interpreted_regions_profile.bed"
+ newf="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/average_preds/per_folds.inputs.bed.gz"
+ input_bed = pd.read_csv(input_file, sep='\t', header=None)
+ if os.path.isfile(input_file):
+ if not os.path.isfile(newf):
+ input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip')
+ args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid]["file.paths"].append((newf,"logs.seq_contrib.profile.input_regions.per_fold."+encid+".bed.gz"))
+
+
+ input_file="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/merge_folds_new_may_05_24/in_peaks.profile.interpreted_regions.bed"
+ newf="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/average_preds/modisco.inputs.bed.gz"
+ input_bed = pd.read_csv(input_file, sep='\t', header=None)
+ if os.path.isfile(input_file):
+ if not os.path.isfile(newf):
+ input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip')
+ args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid]["file.paths"].append((newf,"logs.seq_contrib.profile.input_regions."+encid+".bed.gz"))
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/interpret_upload/average_preds/"
+
+ input_log = os.path.join(odir, "reformat.log.e")
+ if os.path.isfile(input_log):
+ args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid]["file.paths"].append((input_log, "logs.seq_contrib.profile.fold_mean.reformat"+encid+".stderr.txt"))
+
+ input_log = os.path.join(odir, "reformat.log.e")
+ if os.path.isfile(input_log):
+ args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid]["file.paths"].append((input_log, "logs.seq_contrib.profile.fold_mean.reformat"+encid+".stdout.txt"))
+
+ assert(len(args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid]["file.paths"])==4)
+
+ for i in range(5):
+ data_paths, log_paths, log_paths_opt = fetch_per_fold_profile(odir,model_paths[i], encid, i, name)
+
+ if data_paths is None:
+ success = False
+ return success, args_json
+
+ args_json["profile sequence contribution scores tar"]["fold_"+str(i)] = {}
+ args_json["profile sequence contribution scores tar"]["fold_"+str(i)]["file.paths"] = data_paths
+ args_json["profile sequence contribution scores tar"]["fold_"+str(i)]["logs.seq_contrib.profile.fold_"+str(i)+"."+encid] = {"file.paths": log_paths+log_paths_opt}
+ assert(len(data_paths) == 1)
+ print(len(log_paths))
+ assert(len(log_paths) >= 1)
+
+ success=True
+ return success, args_json
+
+for encid in encids:
+ print(encid)
+
+
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores.stats"
+ if os.path.isfile(ofile):
+ profile_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores.bw"
+ else:
+ profile_bw = None
+ print(ofile)
+
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores.stats"
+ if os.path.isfile(ofile):
+ profile_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores.bw"
+ else:
+ profile_bw = None
+ print(ofile)
+ continue
+
+ assert(os.path.isfile(profile_bw)==True)
+ assert(os.path.isfile(profile_bw)==True)
+
+ model_paths = model_atac[model_atac[1]==encid.replace("_new","")][2].values
+ print(model_paths)
+ args_json = {}
+ args_json["experiment"] = encode_id[encid]
+
+
+ success, args_json = fetch_profile_tar(encode_id[encid], args_json, model_paths, encid)
+ if not success:
+ print("ERR profile tar")
+ continue
+
+ if not os.path.isfile(odir+encode_id[encid]+".json"):
+ f = open(odir+encode_id[encid]+".json", "w")
+ json.dump(args_json, f, indent=4)
+ f.close()
diff --git a/upload_jsons/upload_jsons_scripts/profile_contrib_upload/dnase_tar.py b/upload_jsons/upload_jsons_scripts/profile_contrib_upload/dnase_tar.py
new file mode 100644
index 00000000..65c84a25
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/profile_contrib_upload/dnase_tar.py
@@ -0,0 +1,221 @@
+import os
+import json
+import pandas as pd
+
+#encids = ["K562", "HEPG2", "IMR90_new", "H1ESC_new", "GM12878_new"]
+# DNASE cell types handled here; K562/HEPG2 are in dnase_tar_k5_and_hep.py.
+encids = ["IMR90_new", "H1ESC_new", "GM12878_new"]
+
+# Cell type -> ENCODE experiment accession (DNASE experiments).
+encode_id = {"HEPG2": "ENCSR149XIL",
+             "K562": "ENCSR000EOT",
+             "IMR90_new": "ENCSR477RTP",
+             "GM12878_new": "ENCSR000EMT",
+             "H1ESC_new": "ENCSR000EMU"}
+# Local output directory for the generated upload jsons.
+odir='dnase/'
+
+# Per-fold model directory table (col 1 = cell type, col 2 = model dir).
+# NOTE(review): variable is named model_atac but holds the DNASE table.
+model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/v1/model_dir_dnase_v2_interpret.csv",sep=",", header=None)
+
+def fetch_per_fold_profile(odir,model_path, encid, i, name):
+	"""Collect per-fold profile deepSHAP files for fold ``i`` of ``name``.
+
+	Returns (data_paths, log_paths, log_paths_opt) — lists of
+	(local_path, name_inside_tar) tuples covering the dnase, atac and ccre
+	interpretation runs. The reformatted h5 is appended unconditionally;
+	each log only when present (missing paths printed).
+	"""
+
+	data_paths = []
+	log_paths = []
+	log_paths_opt = []
+
+	# NOTE: rebinds the `odir` argument; the caller-supplied value is unused.
+	odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/fold_"+str(i)+"/"
+	input_h5 = os.path.join(odir, name+"_profile_attribs_reformatted.h5")
+	data_paths.append((input_h5, "seq_contrib.profile.fold_"+str(i)+"."+encid+".h5"))
+
+	#model_path="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/DNASE_SE_04.27.2024//chrombpnet_model"
+
+	# dnase regions logs (interpret_orig)
+
+	model_path=model_path+"/chrombpnet_model"
+	input_log=model_path+"/interpret_orig/full_"+name+".interpret.args.json"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.dnase_regions.fold_"+str(i)+"."+encid+".args.json"))
+	else:
+		print(input_log)
+	# NOTE(review): "interpet" matches the on-disk log filenames — do not "fix" the spelling.
+	input_log=model_path+"/interpret_orig/full_"+name+".interpet.log"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.dnase_regions.fold_"+str(i)+"."+encid+".log"))
+	else:
+		print(input_log)
+
+	input_log=model_path+"/interpret_orig/ATAC_peaks_full.profile.interpret.log.e"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.dnase_regions.fold_"+str(i)+"."+encid+".stderr.txt"))
+	else:
+		print(input_log)
+
+	input_log=model_path+"/interpret_orig/ATAC_peaks_full.profile.interpret.log.o"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.dnase_regions.fold_"+str(i)+"."+encid+".stdout.txt"))
+	else:
+		print(input_log)
+
+	# atac regions logs (interpret)
+
+	input_log=model_path+"/interpret/full_"+name+".interpret.args.json"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.atac_regions.fold_"+str(i)+"."+encid+".args.json"))
+	else:
+		print(input_log)
+
+	input_log=model_path+"/interpret/full_"+name+".interpet.log"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.atac_regions.fold_"+str(i)+"."+encid+".log"))
+	else:
+		print(input_log)
+
+	input_log=model_path+"/interpret/ATAC_peaks_full.profile.interpret.log.e"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.atac_regions.fold_"+str(i)+"."+encid+".stderr.txt"))
+	else:
+		print(input_log)
+
+	input_log=model_path+"/interpret/ATAC_peaks_full.profile.interpret.log.o"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.atac_regions.fold_"+str(i)+"."+encid+".stdout.txt"))
+	else:
+		print(input_log)
+
+	# ccre regions logs (interpret_ccre)
+
+	input_log=model_path+"/interpret_ccre/full_"+name+".interpret.args.json"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.ccre_regions.fold_"+str(i)+"."+encid+".args.json"))
+	else:
+		print(input_log)
+
+	input_log=model_path+"/interpret_ccre/full_"+name+".interpet.log"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.ccre_regions.fold_"+str(i)+"."+encid+".log"))
+	else:
+		print(input_log)
+
+	input_log=model_path+"/interpret_ccre/ATAC_peaks_full.profile.interpret.log.e"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.ccre_regions.fold_"+str(i)+"."+encid+".stderr.txt"))
+	else:
+		print(input_log)
+	input_log=model_path+"/interpret_ccre/ATAC_peaks_full.profile.interpret.log.o"
+	if os.path.isfile(input_log):
+		log_paths.append((input_log, "logs.seq_contrib.profile.ccre_regions.fold_"+str(i)+"."+encid+".stdout.txt"))
+	else:
+		print(input_log)
+
+	return data_paths, log_paths, log_paths_opt
+
+def fetch_profile_tar(encid, args_json, model_paths, name):
+ success = False
+ args_json["profile sequence contribution scores tar"] = {}
+ readme_file = "READMES/profile.deepshap.README"
+ assert(os.path.isfile(readme_file))
+ args_json["profile sequence contribution scores tar"]["file.paths"] = [(readme_file, "README.md")]
+ args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid] = {"file.paths": []}
+
+ ## full h5 path
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/"
+
+ input_h5 = os.path.join(odir, name+"_profile_attribs_reformatted.h5")
+ if os.path.isfile(input_h5):
+ args_json["profile sequence contribution scores tar"]["file.paths"].append((input_h5,"seq_contrib.profile.fold_mean."+encid+".h5"))
+ else:
+ success = False
+ return success, args_json
+
+ ## modisoc h5 path
+
+ modisco_input = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/merge_folds_new_may_05_24/in_peaks.profile_scores_new_compressed.h5"
+ if os.path.isfile(modisco_input):
+ args_json["profile sequence contribution scores tar"]["file.paths"].append((modisco_input,"seq_contrib.profile.fold_mean.modisco_input."+encid+".h5"))
+ else:
+ success = False
+ return success, args_json
+
+ # log files
+
+
+ input_file=model_paths[0]+"/chrombpnet_model/interpret_all_with_ccre/full_"+name+".interpreted_regions_profile.bed"
+ newf="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/per_folds.inputs.bed.gz"
+ input_bed = pd.read_csv(input_file, compression='gzip', sep='\t', header=None)
+ if os.path.isfile(input_file):
+ if not os.path.isfile(newf):
+ input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip')
+ args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid]["file.paths"].append((newf,"logs.seq_contrib.profile.input_regions.per_fold."+encid+".bed.gz"))
+
+
+ input_file="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/merge_folds_new_may_05_24/in_peaks.profile_scores_new_compressed.bed"
+ newf="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/modisco.inputs.bed.gz"
+ input_bed = pd.read_csv(input_file, compression='gzip', sep='\t', header=None)
+ if os.path.isfile(input_file):
+ if not os.path.isfile(newf):
+ input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip')
+ args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid]["file.paths"].append((newf,"logs.seq_contrib.profile.input_regions."+encid+".bed.gz"))
+
+ odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/"
+
+ input_log = os.path.join(odir, "reformat.log.e")
+ if os.path.isfile(input_log):
+ args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid]["file.paths"].append((input_log, "logs.seq_contrib.profile.fold_mean.reformat"+encid+".stderr.txt"))
+
+ input_log = os.path.join(odir, "reformat.log.e")
+ if os.path.isfile(input_log):
+ args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid]["file.paths"].append((input_log, "logs.seq_contrib.profile.fold_mean.reformat"+encid+".stdout.txt"))
+
+ assert(len(args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid]["file.paths"])==4)
+
+ for i in range(5):
+ data_paths, log_paths, log_paths_opt = fetch_per_fold_profile(odir,model_paths[i], encid, i, name)
+
+ if data_paths is None:
+ success = False
+ return success, args_json
+
+ args_json["profile sequence contribution scores tar"]["fold_"+str(i)] = {}
+ args_json["profile sequence contribution scores tar"]["fold_"+str(i)]["file.paths"] = data_paths
+ args_json["profile sequence contribution scores tar"]["fold_"+str(i)]["logs.seq_contrib.profile.fold_"+str(i)+"."+encid] = {"file.paths": log_paths+log_paths_opt}
+ assert(len(data_paths) == 1)
+ print(len(log_paths))
+ assert(len(log_paths) == 12)
+
+ success=True
+ return success, args_json
+
+for encid in encids:
+ print(encid)
+
+
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.stats"
+ if os.path.isfile(ofile):
+ profile_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.bw"
+ else:
+ profile_bw = None
+ print(ofile)
+
+ ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.stats"
+ if os.path.isfile(ofile):
+ profile_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.bw"
+ else:
+ profile_bw = None
+ print(ofile)
+ continue
+
+ assert(os.path.isfile(profile_bw)==True)
+ assert(os.path.isfile(profile_bw)==True)
+
+ model_paths = model_atac[model_atac[1]==encid.replace("_new","")][2].values
+ print(model_paths)
+ args_json = {}
+ args_json["experiment"] = encode_id[encid]
+
+
+ success, args_json = fetch_profile_tar(encode_id[encid], args_json, model_paths, encid)
+ if not success:
+ print("ERR profile tar")
+ continue
+
+ if not os.path.isfile(odir+encode_id[encid]+".json"):
+ f = open(odir+encode_id[encid]+".json", "w")
+ json.dump(args_json, f, indent=4)
+ f.close()
diff --git a/upload_jsons/upload_jsons_scripts/profile_contrib_upload/dnase_tar_k5_and_hep.py b/upload_jsons/upload_jsons_scripts/profile_contrib_upload/dnase_tar_k5_and_hep.py
new file mode 100644
index 00000000..5583b77f
--- /dev/null
+++ b/upload_jsons/upload_jsons_scripts/profile_contrib_upload/dnase_tar_k5_and_hep.py
@@ -0,0 +1,212 @@
+import os
+import json
+import pandas as pd
+
#encids = ["K562", "HEPG2", "IMR90_new", "H1ESC_new", "GM12878_new"]
# Cell lines processed in this run (the commented list above is the full set).
encids = ["K562", "HEPG2"]

# Cell-line name -> ENCODE experiment accession used for upload naming.
encode_id = {"HEPG2": "ENCSR149XIL",
             "K562": "ENCSR000EOT",
             "IMR90_new": "ENCSR477RTP",
             "GM12878_new": "ENCSR000EMT",
             "H1ESC_new": "ENCSR000EMU"}
# Output directory for the per-experiment upload JSONs.
odir='dnase/'

# Checkpoint CSV; column 1 = cell-line name, column 2 = per-fold model directory.
model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/model_dir_dnase.csv",sep=",", header=None)
+
def fetch_per_fold_profile(odir, model_path, encid, i, name):
    """Collect per-fold profile deepSHAP data and log files for one model fold.

    Args:
        odir: unused on entry -- immediately overwritten with the fold's
            interpret_upload directory (parameter kept for call compatibility).
        model_path: original model directory for this fold.
        encid: ENCODE experiment accession used in the upload file names.
        i: fold index (0-4).
        name: cell-line name used in on-disk paths.

    Returns:
        (data_paths, log_paths, log_paths_opt): lists of
        (source_path, upload_name) tuples. log_paths_opt is always empty here
        but is kept so callers can concatenate the lists uniformly.
    """
    model_path_orig = model_path
    # Per-fold copies of the models live under the chromatin-atlas folds tree.
    model_path = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/" + name + "/" + model_path.split("/")[-1]
    data_paths = []
    log_paths = []
    log_paths_opt = []

    def _add_log(src, upload_name):
        # Append the log when it exists; otherwise print the missing path so the
        # operator can see what is absent (original best-effort behavior).
        if os.path.isfile(src):
            log_paths.append((src, upload_name))
        else:
            print(src)

    odir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/" + name + "/interpret_upload/fold_" + str(i) + "/"
    input_h5 = os.path.join(odir, name + "_profile_attribs_reformatted.h5")
    data_paths.append((input_h5, "seq_contrib.profile.fold_" + str(i) + "." + encid + ".h5"))

    model_path = model_path + "/chrombpnet_model"
    prefix = "logs.seq_contrib.profile."
    fold_tag = ".fold_" + str(i) + "." + encid

    # ccre regions logs.
    # NOTE: "interpet" below reproduces a typo present in the on-disk file names.
    _add_log(model_path + "/interpret_ccre/full_" + name + ".interpret.args.json",
             prefix + "ccre_regions" + fold_tag + ".args.json")
    _add_log(model_path + "/interpret_ccre/full_" + name + ".interpet.log",
             prefix + "ccre_regions" + fold_tag + ".log")
    _add_log(model_path + "/interpret_ccre/ATAC_peaks_full.profile.interpret.log.e",
             prefix + "ccre_regions" + fold_tag + ".stderr.txt")
    # BUGFIX: this stdout log comes from the ccre run, but the original labeled it
    # "all_regions" (colliding with the real all-regions stdout below); label it
    # consistently with its stderr counterpart.
    _add_log(model_path + "/interpret_ccre/ATAC_peaks_full.profile.interpret.log.o",
             prefix + "ccre_regions" + fold_tag + ".stdout.txt")

    # all regions logs -- merged run from the original model directory.
    # NOTE(review): these two entries map different source files onto the same
    # upload names as the "full" pair below; if both exist, the later entries
    # shadow these in the tar. Confirm the intended naming before changing it.
    _add_log(model_path_orig + "/interpret/merged." + name + ".interpret.args.json",
             prefix + "all_regions" + fold_tag + ".args.json")
    _add_log(model_path_orig + "/interpret/merged." + name + ".interpet.log",
             prefix + "all_regions" + fold_tag + ".log")

    # all regions logs -- full run from the per-fold model copy.
    _add_log(model_path + "/interpret/full_" + name + ".interpret.args.json",
             prefix + "all_regions" + fold_tag + ".args.json")
    _add_log(model_path + "/interpret/full_" + name + ".interpet.log",
             prefix + "all_regions" + fold_tag + ".log")
    _add_log(model_path + "/interpret/full.profile.interpret.log.e",
             prefix + "all_regions" + fold_tag + ".stderr.txt")
    _add_log(model_path + "/interpret/full.profile.interpret.log.o",
             prefix + "all_regions" + fold_tag + ".stdout.txt")

    return data_paths, log_paths, log_paths_opt
+
def fetch_profile_tar(encid, args_json, model_paths, name):
    """Populate args_json with the profile deepSHAP contribution-score tar contents.

    Args:
        encid: ENCODE experiment accession.
        args_json: dict filled in place (also returned).
        model_paths: indexable collection of at least 5 per-fold model directories.
        name: cell-line name used in on-disk paths.

    Returns:
        (success, args_json). success is False when a required file is missing.
    """
    key = "profile sequence contribution scores tar"
    args_json[key] = {}
    readme_file = "READMES/profile.deepshap.README"
    assert(os.path.isfile(readme_file))
    args_json[key]["file.paths"] = [(readme_file, "README.md")]
    log_key = "logs.seq_contrib.profile." + encid
    args_json[key][log_key] = {"file.paths": []}

    base = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/" + name
    odir = base + "/interpret_upload/average_preds/"

    # Fold-averaged contribution scores h5 (required).
    input_h5 = os.path.join(odir, name + "_profile_attribs_reformatted.h5")
    if os.path.isfile(input_h5):
        args_json[key]["file.paths"].append((input_h5, "seq_contrib.profile.fold_mean." + encid + ".h5"))
    else:
        print(input_h5)
        return False, args_json

    # Modisco input h5 (required).
    modisco_input = base + "/merge_folds_new_may_05_24/in_peaks.profile_scores_new_compressed.h5"
    if os.path.isfile(modisco_input):
        args_json[key]["file.paths"].append((modisco_input, "seq_contrib.profile.fold_mean.modisco_input." + encid + ".h5"))
    else:
        print(modisco_input)
        return False, args_json

    # Per-fold input regions bed: gzip a copy for upload if not already done.
    # BUGFIX: the original called pd.read_csv before checking os.path.isfile, so a
    # missing source file crashed the script instead of being skipped; the read is
    # now also deferred until a copy is actually needed.
    input_file = model_paths[1] + "/chrombpnet_model/interpret_all_with_ccre/full_" + name + ".interpreted_regions_profile.bed"
    newf = odir + "per_folds.inputs.bed.gz"
    if os.path.isfile(input_file):
        if not os.path.isfile(newf):
            # NOTE(review): reads a ".bed" path with compression='gzip' -- confirm
            # the file really is gzip-compressed despite its extension.
            input_bed = pd.read_csv(input_file, compression='gzip', sep='\t', header=None)
            input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip')
        args_json[key][log_key]["file.paths"].append((newf, "logs.seq_contrib.profile.input_regions.per_fold." + encid + ".bed.gz"))

    # Modisco input regions bed, same treatment.
    input_file = base + "/merge_folds_new_may_05_24/in_peaks.profile.interpreted_regions.bed"
    newf = odir + "modisco.inputs.bed.gz"
    if os.path.isfile(input_file):
        if not os.path.isfile(newf):
            input_bed = pd.read_csv(input_file, sep='\t', header=None)
            input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip')
        args_json[key][log_key]["file.paths"].append((newf, "logs.seq_contrib.profile.input_regions." + encid + ".bed.gz"))

    # Reformatting logs.
    # NOTE(review): upload names read "...reformat<encid>..."; a "." separator
    # before the accession was probably intended -- confirm before renaming.
    input_log = os.path.join(odir, "reformat.log.e")
    if os.path.isfile(input_log):
        args_json[key][log_key]["file.paths"].append((input_log, "logs.seq_contrib.profile.fold_mean.reformat" + encid + ".stderr.txt"))

    # BUGFIX: the original re-read reformat.log.e here, uploading stderr under the
    # stdout name; use the .o (stdout) log instead.
    input_log = os.path.join(odir, "reformat.log.o")
    if os.path.isfile(input_log):
        args_json[key][log_key]["file.paths"].append((input_log, "logs.seq_contrib.profile.fold_mean.reformat" + encid + ".stdout.txt"))

    # Both beds and both reformat logs must have been found.
    assert(len(args_json[key][log_key]["file.paths"]) == 4)

    for i in range(5):
        data_paths, log_paths, log_paths_opt = fetch_per_fold_profile(odir, model_paths[i], encid, i, name)

        if data_paths is None:
            return False, args_json

        fold_key = "fold_" + str(i)
        args_json[key][fold_key] = {}
        args_json[key][fold_key]["file.paths"] = data_paths
        args_json[key][fold_key]["logs.seq_contrib.profile.fold_" + str(i) + "." + encid] = {"file.paths": log_paths + log_paths_opt}
        assert(len(data_paths) == 1)
        print(len(log_paths))
        assert(len(log_paths) >= 5)

    return True, args_json
+
for encid in encids:
    print(encid)

    # The merged profile bigwig is considered present iff its .stats file exists.
    stats_dir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/" + encid + "/interpret_upload/average_preds/"
    ofile = stats_dir + encid + "_folds_merged.profile_scores.stats"
    if os.path.isfile(ofile):
        profile_bw = stats_dir + encid + "_folds_merged.profile_scores.bw"
    else:
        profile_bw = None
    print(ofile)

    # Fail loudly if the merged bigwig is missing. BUGFIX: the original passed
    # profile_bw (possibly None) straight to os.path.isfile, which raises a
    # confusing TypeError instead of a clear AssertionError.
    assert profile_bw is not None and os.path.isfile(profile_bw)

    # Per-fold model directories for this cell line; the local "_new" suffix is
    # stripped before matching the CSV's cell-line column.
    model_paths = model_atac[model_atac[1] == encid.replace("_new", "")][2].values
    print(model_paths)
    args_json = {}
    args_json["experiment"] = encode_id[encid]

    success, args_json = fetch_profile_tar(encode_id[encid], args_json, model_paths, encid)
    if not success:
        print("ERR profile tar")
        continue

    # Write the upload JSON once; never overwrite an existing one.
    if not os.path.isfile(odir + encode_id[encid] + ".json"):
        with open(odir + encode_id[encid] + ".json", "w") as f:
            json.dump(args_json, f, indent=4)