CHROM_SIZES = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/reference/chrom.sizes"


def make_bb_file(in_bed, out_bb, chrom_sizes=CHROM_SIZES, bed_to_bigbed="bedToBigBed"):
    """Convert a gzipped BED file into a bigBed file.

    The BED is decompressed, sorted by column 1 and then numerically by
    column 2 with the external ``sort`` program, and handed to
    ``bedToBigBed`` together with the chrom.sizes file.

    Args:
        in_bed: path of the gzip-compressed input BED file.
        out_bb: path of the bigBed file to create.
        chrom_sizes: chrom.sizes file passed to ``bedToBigBed``.
        bed_to_bigbed: name or path of the ``bedToBigBed`` executable.

    Returns:
        True when both steps succeeded, False otherwise.  A failure is
        printed rather than raised, so callers that only test for the
        output file afterwards keep working.
    """
    # Imported here so the function does not depend on the file's import block.
    import gzip
    import subprocess
    import tempfile

    sort_cmd = ["sort", "-k1,1", "-k2,2n"]
    # LC_ALL overrides LC_COLLATE when both are set, so pin both to get the
    # byte-wise ordering that the original LC_COLLATE=C prefix asked for.
    sort_env = dict(os.environ, LC_COLLATE="C", LC_ALL="C")
    out_existed = os.path.exists(out_bb)

    # A uniquely named scratch file replaces the fixed "atac_temp.bed" in the
    # working directory, so a stale or concurrent run can no longer collide.
    with tempfile.NamedTemporaryFile(prefix="atac_temp.", suffix=".bed", delete=False) as scratch:
        sorted_bed = scratch.name

    try:
        print("zcat " + in_bed + " | LC_COLLATE=C sort -k1,1 -k2,2n > " + sorted_bed)
        with gzip.open(in_bed, "rb") as src:
            bed_bytes = src.read()
        with open(sorted_bed, "wb") as dst:
            subprocess.run(sort_cmd, input=bed_bytes, stdout=dst, env=sort_env, check=True)

        # Argument lists (no shell) keep paths with spaces or metacharacters safe.
        convert_cmd = [bed_to_bigbed, sorted_bed, chrom_sizes, out_bb]
        print(" ".join(convert_cmd))
        subprocess.run(convert_cmd, check=True)
    except (OSError, EOFError, subprocess.CalledProcessError) as err:
        print("make_bb_file failed for " + in_bed + ": " + str(err))
        # Drop a partial output we created ourselves so a later run retries.
        if not out_existed and os.path.exists(out_bb):
            os.remove(out_bb)
        return False
    finally:
        if os.path.exists(sorted_bed):
            os.remove(sorted_bed)
    return True


# Chromosomes kept by make_sel_bedfile: chr1..chr22, chrX and chrY.
chrs = ['chr' + str(number) for number in range(1, 23)] + ['chrX', 'chrY']


def _summit_windows(peaks, chroms, flank=500):
    """Return the rows of *peaks* on *chroms*, rewritten as fixed-width windows.

    Column 1 becomes ``column 1 + column 9 - flank`` and column 2 becomes the
    new column 1 plus ``2 * flank``.  The input frame is left untouched.
    """
    # NOTE(review): column 9 is presumably the narrowPeak summit offset - confirm.
    # .copy() makes the filtered rows an independent frame; the original
    # assigned into the filtered slice directly (chained assignment).
    windows = peaks[peaks[0].isin(chroms)].copy()
    windows[1] = windows[1] + windows[9] - flank
    windows[2] = windows[1] + 2 * flank
    return windows


def make_sel_bedfile(in_bed, out_bed):
    """Write the sorted, merged 1000 bp windows derived from *in_bed*.

    Args:
        in_bed: tab-separated, header-less regions file with at least ten
            columns (columns 0, 1 and 9 are read).
        out_bed: destination of the merged three-column BED file.
    """
    input_bed = pd.read_csv(in_bed, sep="\t", header=None)
    print(input_bed.shape)
    input_bed = _summit_windows(input_bed, chrs)
    print(input_bed.shape)
    print(input_bed.head())

    merged = pybedtools.BedTool.from_dataframe(input_bed[[0, 1, 2]]).sort().merge()
    output_bed = merged.to_dataframe()
    print(output_bed.shape)
    print(output_bed.head())
    output_bed.to_csv(out_bed, sep='\t', header=False, index=False)
"/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores_new_compressed.stats" + if os.path.isfile(ofile): + counts_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores_new_compressed.bw" + else: + print(ofile) + ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores.stats" + if os.path.isfile(ofile): + counts_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores.bw" + else: + counts_bw = None + print(ofile) + continue + + + ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.stats" + if os.path.isfile(ofile): + profile_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.bw" + else: + print(ofile) + ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores.stats" + if os.path.isfile(ofile): + profile_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores.bw" + else: + profile_bw = None + print(ofile) + + continue + + ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/interpret_upload/average_preds/per_folds.inputs.bed.gz" + if os.path.isfile(ofile): + sel_path = 
os.path.join("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/", encid+"/interpret_upload/average_preds/selected.regions.valid.merged.bed.gz" ) + sel_path_bb = os.path.join("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/", encid+"/interpret_upload/average_preds/selected.regions.valid.merged.bigBed" ) + if not os.path.isfile(sel_path): + make_sel_bedfile(ofile, sel_path) + + if os.path.isfile(sel_path) and (not os.path.isfile(sel_path_bb)): + make_bb_file(sel_path, sel_path_bb) + + else: + sel_path=None + sel_path_bb=None + print(ofile) + continue + + assert(os.path.isfile(counts_bw)==True) + assert(os.path.isfile(profile_bw)==True) + + + output_json = {} + output_json["experiment"] = encode_id[encid] + output_json["counts sequence contribution scores bigWig"] = counts_bw + output_json["profile sequence contribution scores bigWig"] = profile_bw + + if os.path.isfile(sel_path_bb): + output_json["selected regions for predicted signal and sequence contribution scores bigBed"] = sel_path_bb + + if os.path.isfile(sel_path): + output_json["selected regions for predicted signal and sequence contribution scores bed"] = sel_path + + if not os.path.isfile(odir+encode_id[encid]+".json"): + f = open(odir+encode_id[encid]+".json", "w") + json.dump(output_json, f, indent=4) + f.close() diff --git a/upload_jsons/upload_jsons_scripts/counts_bigwig_uploads/chrombpnet/dnase_prepare.py b/upload_jsons/upload_jsons_scripts/counts_bigwig_uploads/chrombpnet/dnase_prepare.py new file mode 100644 index 00000000..f49d31c2 --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/counts_bigwig_uploads/chrombpnet/dnase_prepare.py @@ -0,0 +1,115 @@ +import os +import json +import pandas as pd +import pybedtools + +#encids = ["K562", "HEPG2", "IMR90", "H1ESC", "GM12878"] +#encids = ["K562", "HEPG2"] +encids = ["K562", "HEPG2", "IMR90_new", "H1ESC_new", "GM12878_new"] + +encode_id = {"HEPG2": "ENCSR149XIL", + "K562": "ENCSR000EOT", 
CHROM_SIZES = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/reference/chrom.sizes"


def make_bb_file(in_bed, out_bb, chrom_sizes=CHROM_SIZES, bed_to_bigbed="bedToBigBed"):
    """Convert a gzipped BED file into a bigBed file.

    The BED is decompressed, sorted by column 1 and then numerically by
    column 2 with the external ``sort`` program, and handed to
    ``bedToBigBed`` together with the chrom.sizes file.

    Args:
        in_bed: path of the gzip-compressed input BED file.
        out_bb: path of the bigBed file to create.
        chrom_sizes: chrom.sizes file passed to ``bedToBigBed``.
        bed_to_bigbed: name or path of the ``bedToBigBed`` executable.

    Returns:
        True when both steps succeeded, False otherwise.  A failure is
        printed rather than raised, so callers that only test for the
        output file afterwards keep working.
    """
    # Imported here so the function does not depend on the file's import block.
    import gzip
    import subprocess
    import tempfile

    sort_cmd = ["sort", "-k1,1", "-k2,2n"]
    # LC_ALL overrides LC_COLLATE when both are set, so pin both to get the
    # byte-wise ordering that the original LC_COLLATE=C prefix asked for.
    sort_env = dict(os.environ, LC_COLLATE="C", LC_ALL="C")
    out_existed = os.path.exists(out_bb)

    # A uniquely named scratch file replaces the fixed "atac_temp.bed" in the
    # working directory, so a stale or concurrent run can no longer collide.
    with tempfile.NamedTemporaryFile(prefix="atac_temp.", suffix=".bed", delete=False) as scratch:
        sorted_bed = scratch.name

    try:
        print("zcat " + in_bed + " | LC_COLLATE=C sort -k1,1 -k2,2n > " + sorted_bed)
        with gzip.open(in_bed, "rb") as src:
            bed_bytes = src.read()
        with open(sorted_bed, "wb") as dst:
            subprocess.run(sort_cmd, input=bed_bytes, stdout=dst, env=sort_env, check=True)

        # Argument lists (no shell) keep paths with spaces or metacharacters safe.
        convert_cmd = [bed_to_bigbed, sorted_bed, chrom_sizes, out_bb]
        print(" ".join(convert_cmd))
        subprocess.run(convert_cmd, check=True)
    except (OSError, EOFError, subprocess.CalledProcessError) as err:
        print("make_bb_file failed for " + in_bed + ": " + str(err))
        # Drop a partial output we created ourselves so a later run retries.
        if not out_existed and os.path.exists(out_bb):
            os.remove(out_bb)
        return False
    finally:
        if os.path.exists(sorted_bed):
            os.remove(sorted_bed)
    return True


# Chromosomes kept by make_sel_bedfile: chr1..chr22, chrX and chrY.
chrs = ['chr' + str(number) for number in range(1, 23)] + ['chrX', 'chrY']


def _summit_windows(peaks, chroms, flank=500):
    """Return the rows of *peaks* on *chroms*, rewritten as fixed-width windows.

    Column 1 becomes ``column 1 + column 9 - flank`` and column 2 becomes the
    new column 1 plus ``2 * flank``.  The input frame is left untouched.
    """
    # NOTE(review): column 9 is presumably the narrowPeak summit offset - confirm.
    # .copy() makes the filtered rows an independent frame; the original
    # assigned into the filtered slice directly (chained assignment).
    windows = peaks[peaks[0].isin(chroms)].copy()
    windows[1] = windows[1] + windows[9] - flank
    windows[2] = windows[1] + 2 * flank
    return windows


def make_sel_bedfile(in_bed, out_bed):
    """Write the sorted, merged 1000 bp windows derived from *in_bed*.

    Args:
        in_bed: tab-separated, header-less regions file with at least ten
            columns (columns 0, 1 and 9 are read).
        out_bed: destination of the merged three-column BED file.
    """
    input_bed = pd.read_csv(in_bed, sep="\t", header=None)
    print(input_bed.shape)
    input_bed = _summit_windows(input_bed, chrs)
    print(input_bed.shape)
    print(input_bed.head())

    merged = pybedtools.BedTool.from_dataframe(input_bed[[0, 1, 2]]).sort().merge()
    output_bed = merged.to_dataframe()
    print(output_bed.shape)
    print(output_bed.head())
    output_bed.to_csv(out_bed, sep='\t', header=False, index=False)
"/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores.bw" + else: + counts_bw = None + print(ofile) + continue + + + ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.stats" + if os.path.isfile(ofile): + profile_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.bw" + else: + print(ofile) + ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores.stats" + if os.path.isfile(ofile): + profile_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores.bw" + else: + profile_bw = None + print(ofile) + + continue + + ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/per_folds.inputs.bed.gz" + if os.path.isfile(ofile): + sel_path = os.path.join("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/", encid+"/interpret_upload/average_preds/selected.regions.valid.merged.bed.gz" ) + sel_path_bb = os.path.join("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/", encid+"/interpret_upload/average_preds/selected.regions.valid.merged.bigBed" ) + if not os.path.isfile(sel_path): + make_sel_bedfile(ofile, sel_path) + + if os.path.isfile(sel_path) and (not os.path.isfile(sel_path_bb)): + make_bb_file(sel_path, sel_path_bb) + + else: + sel_path=None + sel_path_bb=None + print(ofile) + continue + + assert(os.path.isfile(counts_bw)==True) + 
ATAC_FOLDS_DIR = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"
COUNTS_TAR_KEY = "counts sequence contribution scores tar"
N_FOLDS = 5
MIN_FOLD_LOGS = 4


def fetch_per_fold_counts(odir, model_path, encid, i, name, base_dir=ATAC_FOLDS_DIR):
    """Collect the counts-attribution h5 and the available logs of one fold.

    Args:
        odir: unused; kept for call compatibility (the original overwrote it).
        model_path: model directory of the fold.  Its last "/"-separated
            component selects the directory under ``base_dir/name``; the
            path itself is searched for the "merged" logs.
        encid: accession embedded in every archive member name.
        i: fold index.
        name: cell-line directory name under *base_dir*.
        base_dir: root of the per-cell-line fold directories.

    Returns:
        ``(data_paths, log_paths, log_paths_opt)``; each entry is a
        ``(path on disk, name inside the archive)`` tuple.  ``data_paths``
        always holds exactly one entry and ``log_paths_opt`` is always empty.
    """
    fold = str(i)
    cell_dir = os.path.join(base_dir, name)
    model_dir = cell_dir + "/" + model_path.split("/")[-1] + "/chrombpnet_model"

    # NOTE(review): the h5 is listed without an existence check, so the
    # caller's "data_paths is None" branch cannot trigger - confirm intent.
    input_h5 = os.path.join(cell_dir, "interpret_upload", "fold_" + fold,
                            name + "_counts_attribs_reformatted.h5")
    data_paths = [(input_h5, "seq_contrib.counts.fold_" + fold + "." + encid + ".h5")]

    full = "full_" + name
    merged = "merged." + name
    # (search root, path below it, region label, archive suffix), in lookup order.
    # NOTE(review): the "merged" and "full" atac_regions rows map to the same
    # archive names; if both exist the archive gets duplicate members.
    candidates = [
        (model_dir, "/interpret_dnase/" + full + ".interpret.args.json", "dnase_regions", "args.json"),
        (model_dir, "/interpret_dnase/" + full + ".interpet.log", "dnase_regions", "log"),
        (model_dir, "/interpret_dnase/ATAC_peaks_full.counts.interpret.log1.e", "dnase_regions", "stderr.txt"),
        (model_dir, "/interpret_dnase/ATAC_peaks_full.counts.interpret.log1.o", "dnase_regions", "stdout.txt"),
        (model_path, "/interpret/" + merged + ".interpret.args.json", "atac_regions", "args.json"),
        (model_path, "/interpret/" + merged + ".interpet.log", "atac_regions", "log"),
        (model_dir, "/interpret/" + full + ".interpret.args.json", "atac_regions", "args.json"),
        (model_dir, "/interpret/" + full + ".interpet.log", "atac_regions", "log"),
        (model_dir, "/interpret/full.counts.interpret.log.e", "atac_regions", "stderr.txt"),
        (model_dir, "/interpret/full.counts.interpret.log.o", "atac_regions", "stdout.txt"),
    ]

    log_paths = []
    for root, rel_path, region, suffix in candidates:
        input_log = root + rel_path
        if os.path.isfile(input_log):
            archive_name = "logs.seq_contrib.counts." + region + ".fold_" + fold + "." + encid + "." + suffix
            log_paths.append((input_log, archive_name))
        else:
            # Missing logs are reported, not fatal; the caller enforces a minimum.
            print(input_log)

    return data_paths, log_paths, []


def _stage_regions_bed(src_bed, dst_gz, **read_csv_kwargs):
    """Make sure a gzipped copy of *src_bed* exists at *dst_gz*.

    Returns False (after printing the path) when *src_bed* is missing.  The
    source is parsed only when the copy still has to be written.
    """
    if not os.path.isfile(src_bed):
        print(src_bed)
        return False
    if not os.path.isfile(dst_gz):
        regions = pd.read_csv(src_bed, sep='\t', header=None, **read_csv_kwargs)
        regions.to_csv(dst_gz, sep='\t', header=False, index=False, compression='gzip')
    return True


def fetch_counts_tar(encid, args_json, model_paths, name, base_dir=ATAC_FOLDS_DIR,
                     readme_file="READMES/counts.deepshap.README"):
    """Fill *args_json* with the file list of the counts-contribution tar.

    Args:
        encid: accession embedded in every archive member name.
        args_json: dict that receives the tar description (mutated in place).
        model_paths: indexable collection with one model directory per fold.
        name: cell-line directory name under *base_dir*.
        base_dir: root of the per-cell-line fold directories.
        readme_file: README shipped as ``README.md`` inside the tar.

    Returns:
        ``(success, args_json)``.  ``success`` is False when the fold-mean
        h5, the modisco input h5 or a fold model path is missing.

    Raises:
        FileNotFoundError: *readme_file* does not exist.
        RuntimeError: the shared log list does not hold exactly four entries,
            or a fold has fewer than ``MIN_FOLD_LOGS`` log files.
    """
    # Explicit raises instead of assert: assert statements vanish under -O.
    if not os.path.isfile(readme_file):
        raise FileNotFoundError(readme_file)

    shared_logs = []
    tar_spec = {
        "file.paths": [(readme_file, "README.md")],
        "logs.seq_contrib.counts." + encid: {"file.paths": shared_logs},
    }
    args_json[COUNTS_TAR_KEY] = tar_spec

    cell_dir = os.path.join(base_dir, name)
    odir = cell_dir + "/interpret_upload/average_preds/"
    merge_dir = cell_dir + "/merge_folds_new_may_05_24/"

    ## full h5 path
    input_h5 = os.path.join(odir, name + "_counts_attribs_reformatted.h5")
    if not os.path.isfile(input_h5):
        return False, args_json
    tar_spec["file.paths"].append((input_h5, "seq_contrib.counts.fold_mean." + encid + ".h5"))

    ## modisco h5 path
    modisco_input = merge_dir + "in_peaks.counts_scores_new_compressed.h5"
    if not os.path.isfile(modisco_input):
        return False, args_json
    tar_spec["file.paths"].append(
        (modisco_input, "seq_contrib.counts.fold_mean.modisco_input." + encid + ".h5"))

    # Every fold is indexed below; report a short list instead of an IndexError.
    if len(model_paths) < N_FOLDS:
        print("expected " + str(N_FOLDS) + " model paths for " + name + ", found " + str(len(model_paths)))
        return False, args_json

    # log files
    # The per-fold regions file is read as gzip although it is named *.bed.
    input_file = (model_paths[0] + "/chrombpnet_model/interpret_all/full_" + name
                  + ".interpreted_regions_counts.bed")
    newf = os.path.join(odir, "per_folds.inputs.bed.gz")
    if _stage_regions_bed(input_file, newf, compression='gzip'):
        shared_logs.append((newf, "logs.seq_contrib.counts.input_regions.per_fold." + encid + ".bed.gz"))

    input_file = merge_dir + "in_peaks.counts.interpreted_regions.bed"
    newf = os.path.join(odir, "modisco.inputs.bed.gz")
    if _stage_regions_bed(input_file, newf):
        shared_logs.append((newf, "logs.seq_contrib.counts.input_regions." + encid + ".bed.gz"))

    # The stdout entry used to point at reformat.log.e as well, which shipped
    # the stderr log twice; it now reads reformat.log.o.
    # NOTE(review): there is no "." between "reformat" and encid; kept as-is so
    # archive member names stay stable - confirm whether that is intended.
    for log_name, stream in (("reformat.log.e", "stderr"), ("reformat.log.o", "stdout")):
        input_log = os.path.join(odir, log_name)
        if os.path.isfile(input_log):
            shared_logs.append(
                (input_log, "logs.seq_contrib.counts.fold_mean.reformat" + encid + "." + stream + ".txt"))

    if len(shared_logs) != 4:
        raise RuntimeError("expected 4 shared log files for " + name + ", found " + str(len(shared_logs)))

    for i in range(N_FOLDS):
        data_paths, log_paths, log_paths_opt = fetch_per_fold_counts(
            odir, model_paths[i], encid, i, name, base_dir=base_dir)

        if data_paths is None:
            return False, args_json

        tar_spec["fold_" + str(i)] = {
            "file.paths": data_paths,
            "logs.seq_contrib.counts.fold_" + str(i) + "." + encid: {"file.paths": log_paths + log_paths_opt},
        }
        print(len(log_paths))
        if len(log_paths) < MIN_FOLD_LOGS:
            raise RuntimeError("fold " + str(i) + " of " + name + " has only "
                               + str(len(log_paths)) + " log files")

    return True, args_json
ATAC_FOLDS_DIR = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"
COUNTS_TAR_KEY = "counts sequence contribution scores tar"
N_FOLDS = 5
MIN_FOLD_LOGS = 1


def fetch_per_fold_counts(odir, model_path, encid, i, name, base_dir=ATAC_FOLDS_DIR):
    """Collect the counts-attribution h5 and the available logs of one fold.

    Args:
        odir: unused; kept for call compatibility (the original overwrote it).
        model_path: model directory of the fold.  Its last "/"-separated
            component selects the directory under ``base_dir/name``; the
            path itself is searched for the "merged" logs.
        encid: accession embedded in every archive member name.
        i: fold index.
        name: cell-line directory name under *base_dir*.
        base_dir: root of the per-cell-line fold directories.

    Returns:
        ``(data_paths, log_paths, log_paths_opt)``; each entry is a
        ``(path on disk, name inside the archive)`` tuple.  ``data_paths``
        always holds exactly one entry and ``log_paths_opt`` is always empty.
    """
    fold = str(i)
    cell_dir = os.path.join(base_dir, name)
    model_dir = cell_dir + "/" + model_path.split("/")[-1] + "/chrombpnet_model"

    # NOTE(review): the h5 is listed without an existence check, so the
    # caller's "data_paths is None" branch cannot trigger - confirm intent.
    input_h5 = os.path.join(cell_dir, "interpret_upload", "fold_" + fold,
                            name + "_counts_attribs_reformatted.h5")
    data_paths = [(input_h5, "seq_contrib.counts.fold_" + fold + "." + encid + ".h5")]

    full = "full_" + name
    merged = "merged." + name
    # (search root, path below it, archive suffix), in lookup order.
    # NOTE(review): the "merged" and "full" rows share archive names; if both
    # exist the archive gets duplicate members.
    candidates = [
        (model_path, "/interpret/" + merged + ".interpret.args.json", "args.json"),
        (model_path, "/interpret/" + merged + ".interpet.log", "log"),
        (model_dir, "/interpret/" + full + ".interpret.args.json", "args.json"),
        (model_dir, "/interpret/" + full + ".interpet.log", "log"),
        (model_dir, "/interpret/full.counts.interpret.log.e", "stderr.txt"),
        (model_dir, "/interpret/full.counts.interpret.log.o", "stdout.txt"),
    ]

    log_paths = []
    for root, rel_path, suffix in candidates:
        input_log = root + rel_path
        if os.path.isfile(input_log):
            archive_name = "logs.seq_contrib.counts.atacs_regs.fold_" + fold + "." + encid + "." + suffix
            log_paths.append((input_log, archive_name))
        else:
            # Missing logs are reported, not fatal; the caller enforces a minimum.
            print(input_log)

    return data_paths, log_paths, []


def _stage_regions_bed(src_bed, dst_gz):
    """Make sure a gzipped copy of *src_bed* exists at *dst_gz*.

    Returns False (after printing the path) when *src_bed* is missing.  The
    source is parsed only when the copy still has to be written.
    """
    if not os.path.isfile(src_bed):
        print(src_bed)
        return False
    if not os.path.isfile(dst_gz):
        regions = pd.read_csv(src_bed, sep='\t', header=None)
        regions.to_csv(dst_gz, sep='\t', header=False, index=False, compression='gzip')
    return True


def fetch_counts_tar(encid, args_json, model_paths, name, base_dir=ATAC_FOLDS_DIR,
                     readme_file="READMES/counts.deepshap.README"):
    """Fill *args_json* with the file list of the counts-contribution tar.

    Args:
        encid: accession embedded in every archive member name.
        args_json: dict that receives the tar description (mutated in place).
        model_paths: indexable collection with one model directory per fold.
        name: cell-line directory name under *base_dir*.
        base_dir: root of the per-cell-line fold directories.
        readme_file: README shipped as ``README.md`` inside the tar.

    Returns:
        ``(success, args_json)``.  ``success`` is False when the fold-mean
        h5, the modisco input h5 or a fold model path is missing.

    Raises:
        FileNotFoundError: *readme_file* does not exist.
        RuntimeError: the shared log list does not hold exactly four entries,
            or a fold has fewer than ``MIN_FOLD_LOGS`` log files.
    """
    # Explicit raises instead of assert: assert statements vanish under -O.
    if not os.path.isfile(readme_file):
        raise FileNotFoundError(readme_file)

    shared_logs = []
    tar_spec = {
        "file.paths": [(readme_file, "README.md")],
        "logs.seq_contrib.counts." + encid: {"file.paths": shared_logs},
    }
    args_json[COUNTS_TAR_KEY] = tar_spec

    cell_dir = os.path.join(base_dir, name)
    odir = cell_dir + "/interpret_upload/average_preds/"
    merge_dir = cell_dir + "/merge_folds_new_may_05_24/"

    ## full h5 path
    input_h5 = os.path.join(odir, name + "_counts_attribs_reformatted.h5")
    if not os.path.isfile(input_h5):
        print(input_h5)
        return False, args_json
    tar_spec["file.paths"].append((input_h5, "seq_contrib.counts.fold_mean." + encid + ".h5"))

    ## modisco h5 path
    modisco_input = merge_dir + "in_peaks.counts_scores_new_compressed.h5"
    if not os.path.isfile(modisco_input):
        print(modisco_input)
        return False, args_json
    tar_spec["file.paths"].append(
        (modisco_input, "seq_contrib.counts.fold_mean.modisco_input." + encid + ".h5"))

    # Every fold is indexed below; report a short list instead of an IndexError.
    if len(model_paths) < N_FOLDS:
        print("expected " + str(N_FOLDS) + " model paths for " + name + ", found " + str(len(model_paths)))
        return False, args_json

    # log files
    # NOTE(review): the regions file is taken from the second model path
    # (index 1), unlike the sibling scripts that use index 0 - confirm.
    input_file = (model_paths[1] + "/chrombpnet_model/interpret/full_" + name
                  + ".interpreted_regions_counts.bed")
    newf = os.path.join(odir, "per_folds.inputs.bed.gz")
    if _stage_regions_bed(input_file, newf):
        shared_logs.append((newf, "logs.seq_contrib.counts.input_regions.per_fold." + encid + ".bed.gz"))

    input_file = merge_dir + "in_peaks.counts.interpreted_regions.bed"
    newf = os.path.join(odir, "modisco.inputs.bed.gz")
    if _stage_regions_bed(input_file, newf):
        shared_logs.append((newf, "logs.seq_contrib.counts.input_regions." + encid + ".bed.gz"))

    # The stdout entry used to point at reformat.log.e as well, which shipped
    # the stderr log twice; it now reads reformat.log.o.
    # NOTE(review): there is no "." between "reformat" and encid; kept as-is so
    # archive member names stay stable - confirm whether that is intended.
    for log_name, stream in (("reformat.log.e", "stderr"), ("reformat.log.o", "stdout")):
        input_log = os.path.join(odir, log_name)
        if os.path.isfile(input_log):
            shared_logs.append(
                (input_log, "logs.seq_contrib.counts.fold_mean.reformat" + encid + "." + stream + ".txt"))

    if len(shared_logs) != 4:
        raise RuntimeError("expected 4 shared log files for " + name + ", found " + str(len(shared_logs)))

    for i in range(N_FOLDS):
        data_paths, log_paths, log_paths_opt = fetch_per_fold_counts(
            odir, model_paths[i], encid, i, name, base_dir=base_dir)

        if data_paths is None:
            return False, args_json

        tar_spec["fold_" + str(i)] = {
            "file.paths": data_paths,
            "logs.seq_contrib.counts.fold_" + str(i) + "." + encid: {"file.paths": log_paths + log_paths_opt},
        }
        print(len(log_paths))
        if len(log_paths) < MIN_FOLD_LOGS:
            raise RuntimeError("fold " + str(i) + " of " + name + " has only "
                               + str(len(log_paths)) + " log files")

    return True, args_json
def fetch_per_fold_counts(odir, model_path, encid, i, name):
    """Collect the counts-attribution h5 and the available logs of one fold.

    Args:
        odir: unused; kept for call compatibility (the original overwrote it).
        model_path: model directory of the fold; logs are looked up below
            ``model_path + "/chrombpnet_model"``.
        encid: accession embedded in every archive member name.
        i: fold index.
        name: cell-line directory name under the DNASE folds directory.

    Returns:
        ``(data_paths, log_paths, log_paths_opt)``; each entry is a
        ``(path on disk, name inside the archive)`` tuple.  ``data_paths``
        always holds exactly one entry and ``log_paths_opt`` is always empty.
    """
    fold = str(i)

    fold_dir = ("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"
                + name + "/interpret_upload/fold_" + fold + "/")
    # The h5 is listed without an existence check, as in the original.
    input_h5 = os.path.join(fold_dir, name + "_counts_attribs_reformatted.h5")
    data_paths = [(input_h5, "seq_contrib.counts.fold_" + fold + "." + encid + ".h5")]

    model_dir = model_path + "/chrombpnet_model"
    full = "full_" + name
    # (sub-directory, file name, region label, archive suffix), in lookup order.
    candidates = [
        ("interpret_orig", full + ".interpret.args.json", "dnase_regions", "args.json"),
        ("interpret_orig", full + ".interpet.log", "dnase_regions", "log"),
        ("interpret_orig", "ATAC_peaks_full.counts.interpret.log1.e", "dnase_regions", "stderr.txt"),
        ("interpret_orig", "ATAC_peaks_full.counts.interpret.log1.o", "dnase_regions", "stdout.txt"),
        ("interpret", full + ".interpret.args.json", "atac_regions", "args.json"),
        ("interpret", full + ".interpet.log", "atac_regions", "log"),
        ("interpret", "ATAC_peaks_full.counts.interpret.log1.e", "atac_regions", "stderr.txt"),
        ("interpret", "ATAC_peaks_full.counts.interpret.log1.o", "atac_regions", "stdout.txt"),
        ("interpret_ccre", full + ".interpret.args.json", "ccre_regions", "args.json"),
        ("interpret_ccre", full + ".interpet.log", "ccre_regions", "log"),
        ("interpret_ccre", "full.counts.interpret.log1.e", "ccre_regions", "stderr.txt"),
        ("interpret_ccre", "full.counts.interpret.log1.o", "ccre_regions", "stdout.txt"),
    ]

    log_paths = []
    for subdir, file_name, region, suffix in candidates:
        input_log = model_dir + "/" + subdir + "/" + file_name
        if os.path.isfile(input_log):
            archive_name = "logs.seq_contrib.counts." + region + ".fold_" + fold + "." + encid + "." + suffix
            log_paths.append((input_log, archive_name))
        else:
            # Missing logs are only reported; the caller decides how many are required.
            print(input_log)

    return data_paths, log_paths, []
+ args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((newf,"logs.seq_contrib.counts.input_regions.per_fold."+encid+".bed.gz")) + + + input_file="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/merge_folds_new_may_05_24/in_peaks.counts_scores_new_compressed.bed" + newf="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/modisco.inputs.bed.gz" + input_bed = pd.read_csv(input_file, compression='gzip', sep='\t', header=None) + if os.path.isfile(input_file): + if not os.path.isfile(newf): + input_bed = input_bed[~(input_bed[0]=="chrM")] + input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip') + args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((newf,"logs.seq_contrib.counts.input_regions."+encid+".bed.gz")) + + odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/" + + input_log = os.path.join(odir, "reformat.log.e") + if os.path.isfile(input_log): + args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((input_log, "logs.seq_contrib.counts.fold_mean.reformat"+encid+".stderr.txt")) + + input_log = os.path.join(odir, "reformat.log.e") + if os.path.isfile(input_log): + args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((input_log, "logs.seq_contrib.counts.fold_mean.reformat"+encid+".stdout.txt")) + + assert(len(args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"])==4) + + for i in range(5): + data_paths, log_paths, log_paths_opt = fetch_per_fold_counts(odir,model_paths[i], encid, i, name) + + if data_paths is None: + success = False + return success, args_json + + args_json["counts sequence 
contribution scores tar"]["fold_"+str(i)] = {} + args_json["counts sequence contribution scores tar"]["fold_"+str(i)]["file.paths"] = data_paths + args_json["counts sequence contribution scores tar"]["fold_"+str(i)]["logs.seq_contrib.counts.fold_"+str(i)+"."+encid] = {"file.paths": log_paths+log_paths_opt} + assert(len(data_paths) == 1) + print(len(log_paths)) + assert(len(log_paths) == 12) + + success=True + return success, args_json + +for encid in encids: + print(encid) + + + ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores_new_compressed.stats" + if os.path.isfile(ofile): + counts_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores_new_compressed.bw" + else: + counts_bw = None + print(ofile) + + ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.stats" + if os.path.isfile(ofile): + profile_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores_new_compressed.bw" + else: + profile_bw = None + print(ofile) + continue + + assert(os.path.isfile(counts_bw)==True) + assert(os.path.isfile(profile_bw)==True) + + model_paths = model_atac[model_atac[1]==encid.replace("_new","")][2].values + print(model_paths) + args_json = {} + args_json["experiment"] = encode_id[encid] + + + success, args_json = fetch_counts_tar(encode_id[encid], args_json, model_paths, encid) + if not success: + print("ERR counts tar") + continue + + if not os.path.isfile(odir+encode_id[encid]+".json"): + f = open(odir+encode_id[encid]+".json", "w") + json.dump(args_json, f, indent=4) + f.close() diff --git 
a/upload_jsons/upload_jsons_scripts/counts_contrib_upload/dnase_tar_k5_and_hep.py b/upload_jsons/upload_jsons_scripts/counts_contrib_upload/dnase_tar_k5_and_hep.py new file mode 100644 index 00000000..174a8710 --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/counts_contrib_upload/dnase_tar_k5_and_hep.py @@ -0,0 +1,209 @@ +import os +import json +import pandas as pd + +#encids = ["K562", "HEPG2", "IMR90_new", "H1ESC_new", "GM12878_new"] +encids = ["K562", "HEPG2"] + +encode_id = {"K562": "ENCSR000EOT", +"HEPG2": "ENCSR149XIL"} +odir='dnase/' + +model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/model_dir_dnase.csv",sep=",", header=None) + +def fetch_per_fold_counts(odir,model_path, encid, i, name): + + model_path_orig=model_path + model_path="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/"+model_path.split("/")[-1] + data_paths = [] + log_paths = [] + log_paths_opt = [] + + odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/fold_"+str(i)+"/" + input_h5 = os.path.join(odir, name+"_counts_attribs_reformatted.h5") + data_paths.append((input_h5, "seq_contrib.counts.fold_"+str(i)+"."+encid+".h5")) + + #model_path="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/ATAC_SE_04.27.2024//chrombpnet_model" + + # atac regions logs + + + model_path = model_path+"/chrombpnet_model" + input_log=model_path+"/interpret_ccre/full_"+name+".interpret.args.json" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.counts.ccre_regions.fold_"+str(i)+"."+encid+".args.json")) + else: + print(input_log) + + input_log=model_path+"/interpret_ccre/full_"+name+".interpet.log" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.counts.ccre_regions.fold_"+str(i)+"."+encid+".log")) + else: + print(input_log) + + 
input_log=model_path+"/interpret_ccre/full.counts.interpret.log1.e" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.counts.ccre_regions.fold_"+str(i)+"."+encid+".stderr.txt")) + else: + print(input_log) + + input_log=model_path+"/interpret_ccre/full.counts.interpret.log1.o" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.counts.all_regions.fold_"+str(i)+"."+encid+".stdout.txt")) + else: + print(input_log) + + + # all regions logs + + input_log=model_path_orig+"/interpret/merged."+name+".interpret.args.json" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.counts.all_regions.fold_"+str(i)+"."+encid+".args.json")) + else: + print(input_log) + + + input_log=model_path_orig+"/interpret/merged."+name+".interpet.log" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.counts.all_regions.fold_"+str(i)+"."+encid+".log")) + else: + print(input_log) + + # atac regions logs + + + input_log=model_path+"/interpret/full_"+name+".interpret.args.json" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.counts.all_regions.fold_"+str(i)+"."+encid+".args.json")) + else: + print(input_log) + + input_log=model_path+"/interpret/full_"+name+".interpet.log" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.counts.all_regions.fold_"+str(i)+"."+encid+".log")) + else: + print(input_log) + + input_log=model_path+"/interpret/full.counts.interpret.log.e" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.counts.all_regions.fold_"+str(i)+"."+encid+".stderr.txt")) + else: + print(input_log) + + input_log=model_path+"/interpret/full.counts.interpret.log.o" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.counts.all_regions.fold_"+str(i)+"."+encid+".stdout.txt")) + else: + print(input_log) + + + return data_paths, log_paths, log_paths_opt + +def 
fetch_counts_tar(encid, args_json, model_paths, name): + success = False + args_json["counts sequence contribution scores tar"] = {} + readme_file = "READMES/counts.deepshap.README" + assert(os.path.isfile(readme_file)) + args_json["counts sequence contribution scores tar"]["file.paths"] = [(readme_file, "README.md")] + args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid] = {"file.paths": []} + + ## full h5 path + + odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/" + + input_h5 = os.path.join(odir, name+"_counts_attribs_reformatted.h5") + if os.path.isfile(input_h5): + args_json["counts sequence contribution scores tar"]["file.paths"].append((input_h5,"seq_contrib.counts.fold_mean."+encid+".h5")) + else: + print(input_h5) + success = False + return success, args_json + + ## modisoc h5 path + + modisco_input = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/merge_folds_new_may_05_24/in_peaks.counts_scores_new_compressed.h5" + if os.path.isfile(modisco_input): + args_json["counts sequence contribution scores tar"]["file.paths"].append((modisco_input,"seq_contrib.counts.fold_mean.modisco_input."+encid+".h5")) + else: + print(modisco_input) + success = False + return success, args_json + + # log files + + + input_file=model_paths[1]+"/chrombpnet_model/interpret_all_with_ccre/full_"+name+".interpreted_regions_counts.bed" + newf="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/per_folds.inputs.bed.gz" + input_bed = pd.read_csv(input_file, compression='gzip', sep='\t', header=None) + if os.path.isfile(input_file): + if not os.path.isfile(newf): + input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip') + args_json["counts sequence contribution scores 
tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((newf,"logs.seq_contrib.counts.input_regions.per_fold."+encid+".bed.gz")) + + + input_file="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/merge_folds_new_may_05_24/in_peaks.counts.interpreted_regions.bed" + newf="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/modisco.inputs.bed.gz" + input_bed = pd.read_csv(input_file, sep='\t', header=None) + if os.path.isfile(input_file): + if not os.path.isfile(newf): + input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip') + args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((newf,"logs.seq_contrib.counts.input_regions."+encid+".bed.gz")) + + odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/" + + input_log = os.path.join(odir, "reformat.log.e") + if os.path.isfile(input_log): + args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((input_log, "logs.seq_contrib.counts.fold_mean.reformat"+encid+".stderr.txt")) + + input_log = os.path.join(odir, "reformat.log.e") + if os.path.isfile(input_log): + args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"].append((input_log, "logs.seq_contrib.counts.fold_mean.reformat"+encid+".stdout.txt")) + + assert(len(args_json["counts sequence contribution scores tar"]["logs.seq_contrib.counts."+encid]["file.paths"])==4) + + for i in range(5): + data_paths, log_paths, log_paths_opt = fetch_per_fold_counts(odir,model_paths[i], encid, i, name) + + if data_paths is None: + success = False + return success, args_json + + args_json["counts sequence contribution scores tar"]["fold_"+str(i)] = {} + args_json["counts sequence contribution scores 
tar"]["fold_"+str(i)]["file.paths"] = data_paths + args_json["counts sequence contribution scores tar"]["fold_"+str(i)]["logs.seq_contrib.counts.fold_"+str(i)+"."+encid] = {"file.paths": log_paths+log_paths_opt} + assert(len(data_paths) == 1) + print(len(log_paths)) + assert(len(log_paths) >= 5) + + success=True + return success, args_json + +for encid in encids: + print(encid) + + + ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores.stats" + if os.path.isfile(ofile): + counts_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.counts_scores.bw" + else: + counts_bw = None + print(ofile) + + + assert(os.path.isfile(counts_bw)==True) + + model_paths = model_atac[model_atac[1]==encid.replace("_new","")][2].values + print(model_paths) + args_json = {} + args_json["experiment"] = encode_id[encid] + + + success, args_json = fetch_counts_tar(encode_id[encid], args_json, model_paths, encid) + if not success: + print("ERR counts tar") + continue + + if not os.path.isfile(odir+encode_id[encid]+".json"): + f = open(odir+encode_id[encid]+".json", "w") + json.dump(args_json, f, indent=4) + f.close() diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/READMEs/bias.training.README b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/READMEs/bias.training.README new file mode 100644 index 00000000..8faa0ea2 --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/READMEs/bias.training.README @@ -0,0 +1,63 @@ +# Directory Structure Format +. 
+├── peaks.all_input_regions.encid.bed.gz # Peaks input to the bias training script +├── logs.bias.training_test_regions.encid # folder containing log files for peak and nonpeak generation scripts +│ +├── fold_0 +│ ├── cv_params.fold_0.json # training, validation and test chromosomes used in fold 0 +│ ├── nonpeaks.all_input_regions.fold_0.encid.bed.gz # Non peaks input to the bias training script +│ ├── nonpeaks.trainingset.fold_0.encid.bed.gz # nonpeaks used in training set of fold 0 bias model +│ ├── nonpeaks.validationset.fold_0.encid.bed.gz # nonpeaks used in validation set of fold 0 bias model +│ ├── nonpeaks.testset.fold_0.encid.bed.gz # nonpeaks used in test set of fold 0 bias model +│ └── logs.bias.training_test_regions.fold_0.encid # folder containing log files for training bias model on fold 0 +│ +├── fold_1 +│ └── ... # similar directory structure as fold_0 directory above +│ +├── fold_2 +│ └── ... # similar directory structure as fold_0 directory above +│ +├── fold_3 +│ └── ... # similar directory structure as fold_0 directory above +│ +└── fold_4 + └── ... # similar directory structure as fold_0 directory above + +# Bed File Format for Peaks + +* All the bed files are in narrowpeak format with 10 columns. + +1) chrom - Name of the chromosome (or contig, scaffold, etc.). +2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0. +3) chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99. +4) name - Name given to a region (preferably unique). Use "." if no name is assigned. +5) score - Indicates how dark the peak will be displayed in the browser (0-1000). If all scores were "0" when the data were submitted to the DCC, the DCC assigned scores 1-1000 based on signal value.
Ideally the average signalValue per base spread is between 100-1000. +6) strand - +/- to denote strand or orientation (whenever applicable). Use "." if no orientation is assigned. +7) signalValue - Measurement of overall (usually, average) enrichment for the region. +8) pValue - Measurement of statistical significance (-log10). Use -1 if no pValue is assigned. +9) qValue - Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned. +10) peak - Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called. + +# Bed File Format for Nonpeaks + +* All the bed files are in narrowpeak format with 10 columns. + +1) chrom - Name of the chromosome (or contig, scaffold, etc.). +2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0. +3) chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99. +4) empty character - "." +5) empty character - "." +6) empty character - "." +7) empty character - "." +8) empty character - "." +9) empty character - "." 
+10) (chromEnd-chromStart)/2 + +# Format of file `cv_params.fold_0.json` + +A dictionary with following (key,value) pairs, + +1) ("CV_type", "chr_holdout") +2) ("train", list_of_chrs_trainingset) +3) ("valid", list_of_chrs_validationset) +4) ("test", list_of_chrs_testset) diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_model_chrombpnet.csv b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_model_chrombpnet.csv new file mode 100644 index 00000000..da180a8d --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_model_chrombpnet.csv @@ -0,0 +1,6 @@ +fold_0,K562,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/K562/nautilus_runs/K562_02.17.2022_bias_128_4_1234_0.5_fold_0 +fold_1,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_1_data_type_ATAC_PE +fold_2,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_2_data_type_ATAC_PE +fold_3,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_3_data_type_ATAC_PE +fold_4,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_4_data_type_ATAC_PE + diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_model_upload.py b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_model_upload.py index 46c7f0e6..8b534c68 100755 --- a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_model_upload.py +++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_model_upload.py @@ -12,7 +12,7 @@ #encids = open("../chromatin_atlas_atac/test_encid.txt").readlines() #encids = [line.strip() for line in encids] -model_atac = 
pd.read_csv("atac_bias_model.csv",sep=",", header=None) +model_atac = pd.read_csv("atac_bias_model_chrombpnet.csv",sep=",", header=None) encode_id = {"K562": "ENCSR868FGK"} data_to_bam = {"K562": ["ENCFF077FBI", "ENCFF128WZG", "ENCFF534DCE"]} def main_fetch_preprocessing_files(encid, args_json, bam_ids, name): @@ -40,7 +40,7 @@ def main_fetch_preprocessing_files(encid, args_json, bam_ids, name): def main_fetch_bias_model_files(encid, args_json, models_path): success = False args_json["bias models tar"] = {} - readme_file = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/READMES/bias.models.README" + readme_file = "READMEs/bias.training.README" assert(os.path.isfile(readme_file)) args_json["bias models tar"]["file.paths"] = [(readme_file, "README.md")] #args_json["bias models tar"]["logs.bias.models."+encid] = {"file.paths": None} @@ -68,7 +68,7 @@ def main_fetch_bias_training_files(encid, args_json, models_path, name): # find the training test regions args_json["bias training and test regions tar"] = {} - readme_file = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/READMES/bias.training_test_regions.README" + readme_file = "READMEs/bias.training.README" assert(os.path.isfile(readme_file)) args_json["bias training and test regions tar"]["file.paths"] = [(readme_file, "README.md")] @@ -82,7 +82,7 @@ def main_fetch_bias_training_files(encid, args_json, models_path, name): log_paths = upload_utils.bias_fetch_preprocessing_log_files(odir, encid, main_dir, name) args_json["bias training and test regions tar"]["logs.bias.training_test_regions."+encid] = {"file.paths": log_paths} - assert(len(log_paths) == 4) + assert(len(log_paths) == 3) for i in range(5): data_paths, log_paths = upload_utils.fetch_per_fold_training_data_bias(odir, models_path[i], encid, i, main_dir, name) @@ -90,6 +90,8 @@ def main_fetch_bias_training_files(encid, args_json, models_path, name): args_json["bias training and test regions tar"]["fold_"+str(i)] = {} args_json["bias 
training and test regions tar"]["fold_"+str(i)]["file.paths"] = data_paths args_json["bias training and test regions tar"]["fold_"+str(i)]["logs.bias.training_test_regions.fold_"+str(i)+"."+encid] = {"file.paths": log_paths} + #print(len(data_paths)) + #print(data_paths) assert(len(data_paths) == 5) assert(len(log_paths) == 2) diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_upload_utils.py b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_upload_utils.py index 1b908615..0c79edb4 100755 --- a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_upload_utils.py +++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/atac_bias_upload_utils.py @@ -13,10 +13,10 @@ def bias_fetch_preprocessing_log_files(odir, encid, main_dir, name): # preprocessing, peak-calling # preprocessing log files - temp_dir="/oak/stanford/groups/akundaje/projects/chrombpnet/model_inputs/ENCODE_ATAC_downloads/" - preprocessing_log = os.path.join(temp_dir, name + "/script.sh") - if os.stat(preprocessing_log).st_size != 0: - log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".script_v1.sh")) +# temp_dir="/oak/stanford/groups/akundaje/projects/chrombpnet/model_inputs/ENCODE_ATAC_downloads/" +# preprocessing_log = os.path.join(temp_dir, name + "/script.sh") +# if os.stat(preprocessing_log).st_size != 0: +# log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".script_v1.sh")) preprocessing_log = os.path.join(main_dir, name + "/data/"+name+"_preprocessing.log") if os.stat(preprocessing_log).st_size != 0: @@ -43,9 +43,10 @@ def fetch_per_fold_training_data_bias(odir, model_dir, encid, fold_num, main_dir if os.path.isfile(filtered_regions_bed): input_paths.append((filtered_regions_bed,"cv_params.fold_"+str(fold_num)+".json")) - temp_dir="/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/atlas_model_k562_fold_0/" + 
#temp_dir="/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/atlas_model_k562_fold_0/" if fold_num==0: - filtered_regions_bed = os.path.join(temp_dir, "negatives_data/negatives_with_summit.bed.gz") + filtered_regions_bed = os.path.join(main_dir, name+"/negatives_data/negatives_with_summit.bed.gz") + #print(filtered_regions_bed) if os.path.isfile(filtered_regions_bed): input_paths.append((filtered_regions_bed,"nonpeaks.all_input_regions.fold_"+str(fold_num)+"."+encid+".bed.gz")) else: @@ -59,15 +60,15 @@ def fetch_per_fold_training_data_bias(odir, model_dir, encid, fold_num, main_dir # if os.path.isfile(filtered_regions_bed): # input_paths.append((filtered_regions_bed,"peaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz")) - filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias/nonpeaks.trainingset.bed.gz") + filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias_may_7_2024/nonpeaks.trainingset.bed.gz") if os.path.isfile(filtered_regions_bed): input_paths.append((filtered_regions_bed,"nonpeaks.trainingset.fold_"+str(fold_num)+"."+encid+".bed.gz")) - filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias/nonpeaks.validationset.bed.gz") + filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias_may_7_2024/nonpeaks.validationset.bed.gz") if os.path.isfile(filtered_regions_bed): input_paths.append((filtered_regions_bed,"nonpeaks.validationset.fold_"+str(fold_num)+"."+encid+".bed.gz")) - filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias/nonpeaks.testset.bed.gz") + filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias_may_7_2024/nonpeaks.testset.bed.gz") if os.path.isfile(filtered_regions_bed): input_paths.append((filtered_regions_bed,"nonpeaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz")) @@ -75,7 +76,9 @@ def fetch_per_fold_training_data_bias(odir, model_dir, encid, fold_num, main_dir #print(filtered_regions_bed) if fold_num==0: + #negatives_log = 
os.path.join(temp_dir, name+"/negatives_data/make_background_regions.log") negatives_log = os.path.join(main_dir, name+"/negatives_data/make_background_regions.log") + if os.stat(negatives_log).st_size != 0: log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.txt")) else: @@ -85,7 +88,8 @@ def fetch_per_fold_training_data_bias(odir, model_dir, encid, fold_num, main_dir if fold_num==0: - negatives_log = os.path.join(temp_dir, "negatives_data/negatives_compared_with_foreground.png") +# negatives_log = os.path.join(temp_dir, "negatives_data/negatives_compared_with_foreground.png") + negatives_log = os.path.join(main_dir, name+"/negatives_data/negatives_compared_with_foreground.png") if os.stat(negatives_log).st_size != 0: log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.png")) else: @@ -125,9 +129,11 @@ def fetch_per_fold_bias_models(odir, model_dir, encid, fold_num): #### fetch model training log files ######## modelling_log = os.path.join(model_dir, "bias_model/train_bias_model.log") - if os.stat(modelling_log).st_size != 0: - log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".stdout.txt")) - + if os.path.exists(modelling_log) + if os.stat(modelling_log).st_size != 0: + log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".stdout.txt")) + else: + print(modelling_log) modelling_log = os.path.join(model_dir, "bias_model/bias.args.json") if os.stat(modelling_log).st_size != 0: log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".args.json")) diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/bias.models.README b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/bias.models.README new file mode 100644 index 00000000..315b971b --- /dev/null +++ 
b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/bias.models.README @@ -0,0 +1,92 @@ +# Directory Structure Format +. +├── fold_0 +│ ├── model.bias.fold_0.encid.h5 # bias model in .h5 format +│ ├── model.bias.fold_0.encid.tar # bias model in SavedModel format +│ │ after being untarred, it results in a directory named "bias" +│ └── logs.bias.models.fold_0.encid # folder containing log files for training models +│ +├── fold_1 +│ └── ... # similar directory structure as fold_0 directory above +│ +├── fold_2 +│ └── ... # similar directory structure as fold_0 directory above +│ +├── fold_3 +│ └── ... # similar directory structure as fold_0 directory above +│ +└── fold_4 + └── ... # similar directory structure as fold_0 directory above + + +# Pseudocode for loading models in .h5 format + +(1) Use the code in python after appropriately defining `model_in_h5_format` and `inputs`. +(2) `inputs` is a one hot encoded sequence of shape (N,2114,4). Here N corresponds to the +number of tested sequences, 2114 is the input sequence length and 4 corresponds to [A,C,G,T]. + +``` +import tensorflow as tf +from tensorflow.keras.utils import get_custom_objects +from tensorflow.keras.models import load_model + +custom_objects={"tf": tf} +get_custom_objects().update(custom_objects) + +model=load_model(model_in_h5_format,compile=False) +outputs = model(inputs) +``` + +The list `outputs` consists of two elements. The first element has a shape of (N, 1000) and +contains logit predictions for a 1000-base-pair output. The second element, with a shape of +(N, 1), contains logcount predictions. To transform these predictions into per-base signals, +follow the provided pseudo code lines below.
+ +``` +import numpy as np + +def softmax(x, temp=1): + norm_x = x - np.mean(x,axis=1, keepdims=True) + return np.exp(temp*norm_x)/np.sum(np.exp(temp*norm_x), axis=1, keepdims=True) + +predictions = softmax(outputs[0]) * (np.exp(outputs[1])-1) +``` + +# Pseudocode for loading models in .tar format + +(1) First untar the directory as follows `tar -xvf model.tar` +(2) Use the code below in python after appropriately defining `model_dir_untared` and `inputs` +(3) `inputs` is a one hot encoded sequence of shape (N,2114,4). Here N corresponds to the number +of tested sequences, 2114 is the input sequence length and 4 corresponds to ACGT. + +Reference: https://www.tensorflow.org/api_docs/python/tf/saved_model/load + +``` +import tensorflow as tf + +model = tf.saved_model.load(model_dir_untared) +outputs = model.signatures['serving_default'](**{'sequence':inputs.astype('float32')}) +``` + +The variable `outputs` represents a dictionary containing two key-value pairs. The first key +is `logits_profile_predictions`, holding a value with a shape of (N, 1000). This value corresponds +to logit predictions for a 1000-base-pair output. The second key, named `logcount_predictions`, +is associated with a value of shape (N, 1), representing logcount predictions. To transform these +predictions into per-base signals, utilize the provided pseudo code lines mentioned below.
+ +``` +import numpy as np +def softmax(x, temp=1): + norm_x = x - np.mean(x,axis=1, keepdims=True) + return np.exp(temp*norm_x)/np.sum(np.exp(temp*norm_x), axis=1, keepdims=True) + +predictions = softmax(outputs["logits_profile_predictions"]) * (np.exp(outputs["logcount_predictions"])-1) +``` + +# Docker image to load and use the models + +https://hub.docker.com/r/kundajelab/chrombpnet-atlas/ (tag:v1) + +# Tool box to do downstream analysis with the models + +https://github.com/kundajelab/chrombpnet/wiki diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/bias.training.README b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/bias.training.README new file mode 100644 index 00000000..8faa0ea2 --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/bias.training.README @@ -0,0 +1,63 @@ +# Directory Structure Format +. +├── peaks.all_input_regions.encid.bed.gz # Peaks input to the bias training script +├── logs.bias.training_test_regions.encid # folder containing log files for peak and nonpeak generation scripts +│ +├── fold_0 +│ ├── cv_params.fold_0.json # training, validation and test chromosomes used in fold 0 +│ ├── nonpeaks.all_input_regions.fold_0.encid.bed.gz # Non peaks input to the bias training script +│ ├── nonpeaks.trainingset.fold_0.encid.bed.gz # nonpeaks used in training set of fold 0 bias model +│ ├── nonpeaks.validationset.fold_0.encid.bed.gz # nonpeaks used in validation set of fold 0 bias model +│ ├── nonpeaks.testset.fold_0.encid.bed.gz # nonpeaks used in test set of fold 0 bias model +│ └── logs.bias.training_test_regions.fold_0.encid # folder containing log files for training bias model on fold 0 +│ +├── fold_1 +│ └── ... # similar directory structure as fold_0 directory above +│ +├── fold_2 +│ └── ... # similar directory structure as fold_0 directory above +│ +├── fold_3 +│ └── ... 
# similar directory structure as fold_0 directory above +│ +└── fold_4 + └── ... # similar directory structure as fold_0 directory above + +# Bed File Format for Peaks + +* All the bed files are in narrowpeak format with 10 columns. + +1) chrom - Name of the chromosome (or contig, scaffold, etc.). +2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0. +3) chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99. +4) name - Name given to a region (preferably unique). Use "." if no name is assigned. +5) score - Indicates how dark the peak will be displayed in the browser (0-1000). If all scores were "'0"' when the data were submitted to the DCC, the DCC assigned scores 1-1000 based on signal value. Ideally the average signalValue per base spread is between 100-1000. +6) strand - +/- to denote strand or orientation (whenever applicable). Use "." if no orientation is assigned. +7) signalValue - Measurement of overall (usually, average) enrichment for the region. +8) pValue - Measurement of statistical significance (-log10). Use -1 if no pValue is assigned. +9) qValue - Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned. +10) peak - Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called. + +# Bed File Format for Nonpeaks + +* All the bed files are in narrowpeak format with 10 columns. + +1) chrom - Name of the chromosome (or contig, scaffold, etc.). +2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0. +3) chromEnd - The ending position of the feature in the chromosome or scaffold. 
The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99. +4) empty character - "." +5) empty character - "." +6) empty character - "." +7) empty character - "." +8) empty character - "." +9) empty character - "." +10) midpoint - (chromEnd-chromStart)/2 + +# Format of file `cv_params.fold_0.json` + +A dictionary with the following (key,value) pairs, + +1) ("CV_type", "chr_holdout") +2) ("train", list_of_chrs_trainingset) +3) ("valid", list_of_chrs_validationset) +4) ("test", list_of_chrs_testset) diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/models.README b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/models.README new file mode 100644 index 00000000..90a59aa1 --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/models.README @@ -0,0 +1,98 @@ +# Directory Structure Format +. +├── fold_0 +│ ├── model.chrombpnet.fold_0.encid.h5 # full chrombpnet model that combines both bias and corrected model in .h5 format +│ ├── model.chrombpnet_nobias.fold_0.encid.h5 # bias-corrected accessibility model in .h5 format (Use for all biological discovery) +│ ├── model.bias_scaled.fold_0.encid.h5 # bias model in .h5 format +│ ├── model.chrombpnet.fold_0.encid.tar # full chrombpnet model that combines both bias and corrected model in SavedModel format. +│ │ after being untarred, it results in a directory named "chrombpnet". +│ ├── model.chrombpnet_nobias.fold_0.encid.tar # bias-corrected accessibility model in SavedModel format (Use for all biological discovery). +│ │ after being untarred, it results in a directory named "chrombpnet_wo_bias". +│ ├── model.bias_scaled.fold_0.encid.tar # bias model in SavedModel format +│ │ after being untarred, it results in a directory named "bias_model_scaled". 
+│ └── logs.models.fold_0.encid # folder containing log files for training models +│ +├── fold_1 +│ └── ... # similar directory structure as fold_0 directory above +│ +├── fold_2 +│ └── ... # similar directory structure as fold_0 directory above +│ +├── fold_3 +│ └── ... # similar directory structure as fold_0 directory above +│ +└── fold_4 + └── ... # similar directory structure as fold_0 directory above + + +# Pseudocode for loading models in .h5 format + +(1) Use the code in python after appropriately defining `model_in_h5_format` and `inputs`. +(2) `inputs` is a one hot encoded sequence of shape (N,2114,4). Here N corresponds to the +number of tested sequences, 2114 is the input sequence length and 4 corresponds to [A,C,G,T]. + +``` +import tensorflow as tf +from tensorflow.keras.utils import get_custom_objects +from tensorflow.keras.models import load_model + +custom_objects={"tf": tf} +get_custom_objects().update(custom_objects) + +model=load_model(model_in_h5_format,compile=False) +outputs = model(inputs) +``` + +The list `outputs` consists of two elements. The first element has a shape of (N, 1000) and +contains logit predictions for a 1000-base-pair output. The second element, with a shape of +(N, 1), contains logcount predictions. To transform these predictions into per-base signals, +follow the provided pseudo code lines below. + +``` +import numpy as np + +def softmax(x, temp=1): + norm_x = x - np.mean(x,axis=1, keepdims=True) + return np.exp(temp*norm_x)/np.sum(np.exp(temp*norm_x), axis=1, keepdims=True) + +predictions = softmax(outputs[0]) * (np.exp(outputs[1])-1) +``` + +# Pseudocode for loading models in .tar format + +(1) First untar the directory as follows `tar -xvf model.tar` +(2) Use the code below in python after appropriately defining `model_dir_untared` and `inputs` +(3) `inputs` is a one hot encoded sequence of shape (N,2114,4). 
Here N corresponds to the number +of tested sequences, 2114 is the input sequence length and 4 corresponds to ACGT. + +Reference: https://www.tensorflow.org/api_docs/python/tf/saved_model/load + +``` +import tensorflow as tf + +model = tf.saved_model.load(model_dir_untared) +outputs = model.signatures['serving_default'](**{'sequence':inputs.astype('float32')}) +``` + +The variable `outputs` represents a dictionary containing two key-value pairs. The first key +is `logits_profile_predictions`, holding a value with a shape of (N, 1000). This value corresponds +to logit predictions for a 1000-base-pair output. The second key, named `logcount_predictions`, +is associated with a value of shape (N, 1), representing logcount predictions. To transform these +predictions into per-base signals, use the pseudocode below. + +``` +import numpy as np +def softmax(x, temp=1): + norm_x = x - np.mean(x,axis=1, keepdims=True) + return np.exp(temp*norm_x)/np.sum(np.exp(temp*norm_x), axis=1, keepdims=True) + +predictions = softmax(outputs["logits_profile_predictions"]) * (np.exp(outputs["logcount_predictions"])-1) +``` + +# Docker image to load and use the models + +https://hub.docker.com/r/kundajelab/chrombpnet-atlas/ (tag:v1) + +# Tool box to do downstream analysis with the models + +https://github.com/kundajelab/chrombpnet/wiki diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/training.README b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/training.README new file mode 100644 index 00000000..56f8d835 --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/READMEs/training.README @@ -0,0 +1,66 @@ +# Directory Structure Format +. 
+├── peaks.all_input_regions.encid.bed.gz # Peaks input to the chrombpnet training script +├── nonpeaks.all_input_regions.encid.bed.gz # Non peaks input to the chrombpnet training script +├── logs.training_test_regions.encid # folder containing log files for peak and nonpeak generation scripts +│ +├── fold_0 +│ ├── cv_params.fold_0.json # training, validation and test chromosomes used in fold 0 +│ ├── peaks.trainingset.fold_0.encid.bed.gz # peaks used in training set of fold 0 model +│ ├── nonpeaks.trainingset.fold_0.encid.bed.gz # nonpeaks used in training set of fold 0 model +│ ├── peaks.validationset.fold_0.encid.bed.gz # peaks used in validation set of fold 0 model +│ ├── nonpeaks.validationset.fold_0.encid.bed.gz # nonpeaks used in validation set of fold 0 model +│ ├── peaks.testset.fold_0.encid.bed.gz # peaks used in test set of fold 0 model +│ ├── nonpeaks.testset.fold_0.encid.bed.gz # nonpeaks used in test set of fold 0 model +│ └── logs.training_test_regions.fold_0.encid # folder containing log files for training chrombpnet model on fold 0 +│ +├── fold_1 +│ └── ... # similar directory structure as fold_0 directory above +│ +├── fold_2 +│ └── ... # similar directory structure as fold_0 directory above +│ +├── fold_3 +│ └── ... # similar directory structure as fold_0 directory above +│ +└── fold_4 + └── ... # similar directory structure as fold_0 directory above + +# Bed File Format for Peaks + +* All the bed files are in narrowpeak format with 10 columns. + +1) chrom - Name of the chromosome (or contig, scaffold, etc.). +2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0. +3) chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99. 
+4) name - Name given to a region (preferably unique). Use "." if no name is assigned. +5) score - Indicates how dark the peak will be displayed in the browser (0-1000). If all scores were "'0"' when the data were submitted to the DCC, the DCC assigned scores 1-1000 based on signal value. Ideally the average signalValue per base spread is between 100-1000. +6) strand - +/- to denote strand or orientation (whenever applicable). Use "." if no orientation is assigned. +7) signalValue - Measurement of overall (usually, average) enrichment for the region. +8) pValue - Measurement of statistical significance (-log10). Use -1 if no pValue is assigned. +9) qValue - Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned. +10) peak - Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called. + +# Bed File Format for Nonpeaks + +* All the bed files are in narrowpeak format with 10 columns. + +1) chrom - Name of the chromosome (or contig, scaffold, etc.). +2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0. +3) chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99. +4) empty character - "." +5) empty character - "." +6) empty character - "." +7) empty character - "." +8) empty character - "." +9) empty character - "." 
+10) midpoint - (chromEnd-chromStart)/2 + +# Format of file `cv_params.fold_0.json` + +A dictionary with following (key,value) pairs, + +1) ("CV_type", "chr_holdout") +2) ("train", list_of_chrs_trainingset) +3) ("valid", list_of_chrs_validationset) +4) ("test", list_of_chrs_testset) diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/atac_bias_model_chrombpnet.csv b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/atac_bias_model_chrombpnet.csv new file mode 100644 index 00000000..15190cf2 --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/atac_bias_model_chrombpnet.csv @@ -0,0 +1,26 @@ +fold_0,GM12878,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/GM12878/nautilus_runs/GM12878_03.01.2022_bias_128_4_1234_0.4_fold_0 +fold_1,GM12878,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/GM12878/GM12878_07.08.2022_bias_128_4_1234_0.4_fold_1_data_type_ATAC_PE +fold_2,GM12878,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/GM12878/GM12878_07.08.2022_bias_128_4_1234_0.4_fold_2_data_type_ATAC_PE +fold_3,GM12878,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/GM12878/GM12878_07.14.2022_bias_128_4_1234_0.4_fold_3_data_type_ATAC_PE +fold_4,GM12878,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/GM12878/GM12878_07.07.2022_bias_128_4_1234_0.4_fold_4_data_type_ATAC_PE +fold_0,K562,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/K562/nautilus_runs/K562_02.17.2022_bias_128_4_1234_0.5_fold_0 +fold_1,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_1_data_type_ATAC_PE +fold_2,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_2_data_type_ATAC_PE 
+fold_3,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_3_data_type_ATAC_PE +fold_4,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_4_data_type_ATAC_PE +fold_0,HEPG2,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/HEPG2/nautilus_runs_jun16/HEPG2_05.09.2022_bias_128_4_1234_0.8_fold_0 +fold_1,HEPG2,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/HEPG2/HEPG2_06.07.2022_bias_128_4_1234_0.8_fold_1 +fold_2,HEPG2,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/HEPG2/HEPG2_05.24.2022_bias_128_4_1234_0.8_fold_2 +fold_3,HEPG2,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/HEPG2/HEPG2_05.22.2022_bias_128_4_1234_0.8_fold_3 +fold_4,HEPG2,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/HEPG2/HEPG2_05.22.2022_bias_128_4_1234_0.8_fold_4 +fold_0,IMR90,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/IMR90/nautilus_runs_apr12/IMR90_04.09.2022_bias_128_4_1234_0.4_fold_0 +fold_1,IMR90,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/IMR90/IMR90_07.17.2022_bias_128_4_1234_0.3_fold_1_data_type_ATAC_PE +fold_2,IMR90,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/IMR90/IMR90_07.17.2022_bias_128_4_1234_0.3_fold_2_data_type_ATAC_PE +fold_3,IMR90,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/IMR90/IMR90_07.08.2022_bias_128_4_1234_0.4_fold_3_data_type_ATAC_PE +fold_4,IMR90,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/IMR90/IMR90_07.07.2022_bias_128_4_1234_0.4_fold_4_data_type_ATAC_PE +fold_0,H1ESC,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/H1ESC/nautilus_runs_jun16/H1ESC_05.09.2022_bias_128_4_1234_0.8_fold_0 
+fold_1,H1ESC,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/H1ESC/H1ESC_07.23.2022_bias_128_4_1234_0.7_fold_1_data_type_ATAC_PE +fold_2,H1ESC,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/H1ESC/H1ESC_07.17.2022_bias_128_4_1234_0.8_fold_2_data_type_ATAC_PE +fold_3,H1ESC,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/H1ESC/H1ESC_07.17.2022_bias_128_4_1234_0.8_fold_3_data_type_ATAC_PE +fold_4,H1ESC,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/H1ESC/H1ESC_07.17.2022_bias_128_4_1234_0.8_fold_4_data_type_ATAC_PE + diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/atac_bias_model_upload.py b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/atac_bias_model_upload.py new file mode 100644 index 00000000..d2e9a145 --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/atac_bias_model_upload.py @@ -0,0 +1,260 @@ +import os +import atac_bias_upload_utils as upload_utils +import json +import pandas as pd +import model_upload_utils + +odir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/ATAC/" +#output_dir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022-uploads/jsons/ATAC/stage1/jul_17_2023/" +main_dir="/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/" +output_dir = "atac_production_uploads/" + +encids = os.listdir(odir) +#encids = open("../chromatin_atlas_atac/test_encid.txt").readlines() +#encids = [line.strip() for line in encids] + +model_atac = pd.read_csv("atac_bias_model_chrombpnet.csv",sep=",", header=None) +encode_id = {"K562": "ENCSR868FGK", +"GM12878": "ENCSR637XSC", +"HEPG2": "ENCSR291GJU", +"IMR90": "ENCSR200OML", +"H1ESC": "GSE267154"} + +data_to_bam = {"K562": ["ENCFF077FBI", "ENCFF128WZG", "ENCFF534DCE"], +"GM12878": ["ENCFF440GRZ", "ENCFF981FXV", "ENCFF962FMH"], +"HEPG2": ["ENCFF624SON", 
def main_fetch_training_files(encid, args_json, model_paths, name):
    """Fill ``args_json["training and test regions tar"]`` for one cell line.

    Registers the training README, the input peaks found under ``main_dir``,
    the preprocessing logs and, for each of the 5 folds, the data and log
    files returned by ``model_upload_utils``.

    Returns ``(success, args_json)``; ``success`` is False when the input
    peaks file is missing. ``args_json`` is mutated in place.
    """
    section_key = "training and test regions tar"

    # find the training test regions
    readme_file = "READMEs/training.README"
    assert(os.path.isfile(readme_file))
    section = args_json[section_key] = {"file.paths": [(readme_file, "README.md")]}

    input_peaks = os.path.join(main_dir, name + "/data/peaks_no_blacklist.bed.gz")
    if not os.path.isfile(input_peaks):
        return False, args_json
    section["file.paths"].append((input_peaks, "peaks.all_input_regions."+encid+".bed.gz"))

    log_paths = model_upload_utils.fetch_preprocessing_log_files(odir, encid, main_dir, name)
    section["logs.training_test_regions."+encid] = {"file.paths": log_paths}
    assert(len(log_paths) == 3)

    for i in range(5):
        data_paths, log_paths = model_upload_utils.fetch_per_fold_training_data(odir, model_paths[i], encid, i, main_dir, name)

        fold_key = "fold_"+str(i)
        section[fold_key] = {
            "file.paths": data_paths,
            "logs.training_test_regions."+fold_key+"."+encid: {"file.paths": log_paths},
        }
        print(len(data_paths))
        assert(len(data_paths) == 8)
        assert(len(log_paths) == 2)

        # Only reachable when assertions are disabled (python -O): report the
        # failure to the caller instead of continuing with an incomplete fold.
        if len(data_paths) != 8:
            return False, args_json

    return True, args_json


def _fill_preprocessing_args(encid, args_json, bam_ids, name, upload_bias, include_bigwig):
    """Shared body of the two ``main_fetch_preprocessing_files*`` entry points.

    Sets ``"upload bias"`` unconditionally, then fills the experiment fields
    only when the unstranded bigWig for ``name`` exists on disk.
    """
    args_json["upload bias"] = upload_bias

    # find the bams input
    preprocessing_path = "/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/"+name+"/data/"+name+"_unstranded.bw"
    if not os.path.isfile(preprocessing_path):
        return False, args_json

    args_json["experiment"] = encid
    args_json["bam files"] = bam_ids
    args_json["assay"] = "ATAC-seq"
    if include_bigwig:
        args_json["observed signal profile bigWig"] = preprocessing_path
    return True, args_json


def main_fetch_preprocessing_files_for_k562(encid, args_json, bam_ids, name):
    """Fill the experiment fields with ``"upload bias"`` set to False.

    Unlike ``main_fetch_preprocessing_files`` this variant does not record the
    observed-signal bigWig. Returns ``(success, args_json)``; ``success`` is
    False when the unstranded bigWig for ``name`` is not on disk.
    """
    return _fill_preprocessing_args(encid, args_json, bam_ids, name,
                                    upload_bias=False, include_bigwig=False)


def main_fetch_preprocessing_files(encid, args_json, bam_ids, name):
    """Fill the experiment fields with ``"upload bias"`` set to True.

    Also records the unstranded bigWig path under
    ``"observed signal profile bigWig"``. Returns ``(success, args_json)``;
    ``success`` is False when that bigWig is not on disk.
    """
    return _fill_preprocessing_args(encid, args_json, bam_ids, name,
                                    upload_bias=True, include_bigwig=True)


def main_fetch_model_files(encid, args_json, model_paths, name):
    """Fill ``args_json["models tar"]`` with the per-fold model and log files.

    Returns ``(success, args_json)``; ``success`` is False as soon as
    ``model_upload_utils.fetch_per_fold_models`` returns no data paths for a
    fold. ``name`` is accepted for signature parity and is not used here.
    """
    readme_file = "READMEs/models.README"
    assert(os.path.isfile(readme_file))
    section = args_json["models tar"] = {
        "file.paths": [(readme_file, "README.md")],
        "logs.models."+encid: {"file.paths": None},
    }

    for i in range(5):
        data_paths, log_paths, log_paths_opt = model_upload_utils.fetch_per_fold_models(odir, model_paths[i], encid, i)

        if data_paths is None:
            return False, args_json

        fold_key = "fold_"+str(i)
        section[fold_key] = {
            "file.paths": data_paths,
            "logs.models."+fold_key+"."+encid: {"file.paths": log_paths+log_paths_opt},
        }
        assert(len(data_paths) == 6)
        print(len(log_paths))
        assert(len(log_paths) >= 6)

    return True, args_json


def main_fetch_bias_model_files(encid, args_json, models_path):
    """Fill ``args_json["bias models tar"]`` with per-fold bias models and logs.

    Returns ``(success, args_json)``; ``success`` is False as soon as
    ``upload_utils.fetch_per_fold_bias_models`` returns no data paths.
    """
    readme_file = "READMEs/bias.models.README"
    assert(os.path.isfile(readme_file))
    section = args_json["bias models tar"] = {"file.paths": [(readme_file, "README.md")]}

    for i in range(5):
        data_paths, log_paths = upload_utils.fetch_per_fold_bias_models(odir, models_path[i], encid, i)

        if data_paths is None:
            return False, args_json

        fold_key = "fold_"+str(i)
        section[fold_key] = {
            "file.paths": data_paths,
            "logs.bias.models."+fold_key+"."+encid: {"file.paths": log_paths},
        }
        # at least 2 log files and exactly 2 model files are expected per fold
        print(len(log_paths))
        assert(len(log_paths) >= 2)
        assert(len(data_paths) == 2)

    return True, args_json


def main_fetch_bias_training_files(encid, args_json, models_path, name):
    """Fill ``args_json["bias training and test regions tar"]`` for one cell line.

    Registers the bias-training README, the input peaks, the preprocessing
    logs and the per-fold nonpeak data/log files from ``upload_utils``.
    Returns ``(success, args_json)``; ``success`` is False when the input
    peaks file is missing.
    """
    section_key = "bias training and test regions tar"

    # find the training test regions
    readme_file = "READMEs/bias.training.README"
    assert(os.path.isfile(readme_file))
    section = args_json[section_key] = {"file.paths": [(readme_file, "README.md")]}

    input_peaks = os.path.join(main_dir, name + "/data/peaks_no_blacklist.bed.gz")
    print(input_peaks)
    if not os.path.isfile(input_peaks):
        return False, args_json
    section["file.paths"].append((input_peaks, "peaks.all_input_regions."+encid+".bed.gz"))

    log_paths = upload_utils.bias_fetch_preprocessing_log_files(odir, encid, main_dir, name)
    section["logs.bias.training_test_regions."+encid] = {"file.paths": log_paths}
    assert(len(log_paths) == 3)

    for i in range(5):
        data_paths, log_paths = upload_utils.fetch_per_fold_training_data_bias(odir, models_path[i], encid, i, main_dir, name)

        fold_key = "fold_"+str(i)
        section[fold_key] = {
            "file.paths": data_paths,
            "logs.bias.training_test_regions."+fold_key+"."+encid: {"file.paths": log_paths},
        }
        assert(len(data_paths) == 5)
        assert(len(log_paths) == 2)

    return True, args_json


if __name__ == "__main__":

    for name in ["K562", "GM12878", "HEPG2", "IMR90", "H1ESC"]:

        encid = encode_id[name]
        model_paths = model_atac[model_atac[1] == name][2].values
        print(model_paths)

        # Skip cell lines whose upload JSON was already written.
        if os.path.isfile(output_dir+"/"+encid+".json"):
            continue

        print(encid)

        args_json = {}

        success, args_json = main_fetch_preprocessing_files(encid, args_json, data_to_bam[name], name)
        if not success:
            print("ERR prep")
            continue

        success, args_json = main_fetch_bias_training_files(encid, args_json, model_paths, name)
        if not success:
            print("ERR bias prep")
            continue

        success, args_json = main_fetch_bias_model_files(encid, args_json, model_paths)
        if not success:
            print("ERR bias models")
            continue

        # NOTE(review): the extent of this branch is reconstructed from a
        # whitespace-collapsed source; it is assumed to end after the K562
        # preprocessing check because every cell line is expected to write
        # <encid>.json (see the skip test above) — confirm against the repo.
        if name == "K562":
            with open(output_dir+"/"+encid+"_bias.json", "w") as outfile:
                json.dump(args_json, outfile, indent=4)

            args_json = {}
            # Capture the result: previously the return value was dropped, so
            # the check below re-tested the flag from the bias-model step.
            success, args_json = main_fetch_preprocessing_files_for_k562(encid, args_json, data_to_bam[name], name)
            if not success:
                print("ERR prep")
                continue

        success, args_json = main_fetch_model_files(encid, args_json, model_paths, name)
        if not success:
            print("fail model")
            continue

        success, args_json = main_fetch_training_files(encid, args_json, model_paths, name)
        if not success:
            print("fail train prep")
            continue

        with open(output_dir+"/"+encid+".json", "w") as outfile:
            json.dump(args_json, outfile, indent=4)
import os
import json
import numpy as np

### utils for preprocessing


def _is_nonempty(path):
    """Return True when *path* exists and its size is non-zero."""
    return os.path.exists(path) and os.stat(path).st_size != 0


### utils for training and testing regions

def bias_fetch_preprocessing_log_files(odir, encid, main_dir, name):
    """Collect the preprocessing log files of one cell line.

    Looks under ``main_dir/<name>/data/`` for the preprocessing stdout log,
    the fold-0 shell script and the bias PWM image. Returns a list of
    ``(source_path, upload_name)`` tuples for the files that exist and are
    non-empty; missing files are skipped rather than raising, matching the
    guard used in ``fetch_per_fold_bias_models``. ``odir`` is unused.
    """
    data_prefix = name + "/data/"
    candidates = [
        (data_prefix + name + "_preprocessing.log", ".stdout.txt"),
        (data_prefix + name.lower() + "_atac_fold_0.sh", ".script.sh"),
        (data_prefix + name + "_bias_pwm.png", ".bias_pwm.png"),
    ]

    log_paths = []
    for rel_path, suffix in candidates:
        preprocessing_log = os.path.join(main_dir, rel_path)
        if _is_nonempty(preprocessing_log):
            log_paths.append((preprocessing_log, "logfile.preprocessing." + encid + suffix))
    return log_paths


def fetch_per_fold_training_data_bias(odir, model_dir, encid, fold_num, main_dir, name):
    """Collect the bias-model training regions and GC-matching logs of one fold.

    Returns ``(input_paths, log_paths)``, two lists of
    ``(source_path, upload_name)`` tuples. Only files that exist are listed,
    and log files must also be non-empty. Fold 0 reads the negatives from
    ``<name>/negatives_data``; every other fold reads from
    ``<name>/negatives_data_<fold_num>``. ``odir`` is unused.
    """
    fold = str(fold_num)
    tag = "fold_" + fold + "." + encid
    input_paths = []
    log_paths = []

    # Cross-validation split definition (path string kept exactly as before,
    # since it is written into the upload JSON).
    opath = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/splits_format/"
    cv_params = os.path.join(opath + "/fold_" + fold + ".json")
    if os.path.isfile(cv_params):
        input_paths.append((cv_params, "cv_params.fold_" + fold + ".json"))

    if fold_num == 0:
        negatives_dir = name + "/negatives_data/"
    else:
        negatives_dir = name + "/negatives_data_" + fold + "/"

    all_nonpeaks = os.path.join(main_dir, negatives_dir + "negatives_with_summit.bed.gz")
    if os.path.isfile(all_nonpeaks):
        input_paths.append((all_nonpeaks, "nonpeaks.all_input_regions." + tag + ".bed.gz"))

    for split in ("trainingset", "validationset", "testset"):
        split_bed = os.path.join(model_dir, "train_test_regions_bias_may_7_2024/nonpeaks." + split + ".bed.gz")
        if os.path.isfile(split_bed):
            input_paths.append((split_bed, "nonpeaks." + split + "." + tag + ".bed.gz"))

    gc_matching_logs = (
        ("make_background_regions.log", ".stdout.txt"),
        ("negatives_compared_with_foreground.png", ".stdout.png"),
    )
    for file_name, suffix in gc_matching_logs:
        negatives_log = os.path.join(main_dir, negatives_dir + file_name)
        if _is_nonempty(negatives_log):
            log_paths.append((negatives_log, "logfile.gc_matching." + tag + suffix))

    return input_paths, log_paths


### utils for model uploads

def fetch_per_fold_bias_models(odir, model_dir, encid, fold_num):
    """Collect the bias model files and training logs of one fold.

    Returns ``(input_paths, log_paths)`` as lists of
    ``(source_path, upload_name)`` tuples, or ``(None, None)`` when either
    ``bias_model/bias.h5`` or ``bias_model/new_model_formats_vf/bias.tar`` is
    missing under ``model_dir``. ``odir`` is unused.
    """
    tag = "fold_" + str(fold_num) + "." + encid

    input_paths = []
    required_models = (
        ("bias_model/bias.h5", ".h5"),
        ("bias_model/new_model_formats_vf/bias.tar", ".tar"),
    )
    for rel_path, extension in required_models:
        bm_model = os.path.join(model_dir, rel_path)
        if not os.path.isfile(bm_model):
            return None, None
        input_paths.append((bm_model, "model.bias." + tag + extension))

    #### fetch model training log files ########
    log_prefix = "logfile.modelling." + tag
    log_paths = []

    stdout_log = os.path.join(model_dir, "bias_model/train_bias_model.log")
    if _is_nonempty(stdout_log):
        log_paths.append((stdout_log, log_prefix + ".stdout.txt"))

    # Each entry lists candidate locations in priority order; the first one
    # that exists is uploaded under the given suffix.
    optional_logs = (
        (("bias_model/bias.args.json",), ".args.json"),
        (("bias_model/bias_data_params.tsv", "bias_model/newgen/bias_data_params.tsv"), ".bias_data_params.tsv"),
        (("bias_model/bias_model_params.tsv", "bias_model/newgen/bias_model_params.tsv"), ".bias_train_params.tsv"),
        (("bias_model/bias.params.json",), ".bias_train_params.json"),
        (("bias_model/bias.log",), ".epoch_loss.csv"),
        (("bias_model/bias.log.batch",), ".batch_loss.tsv"),
    )
    for candidates, suffix in optional_logs:
        for rel_path in candidates:
            modelling_log = os.path.join(model_dir, rel_path)
            if os.path.isfile(modelling_log):
                log_paths.append((modelling_log, log_prefix + suffix))
                break

    return input_paths, log_paths
"ENCFF839SPF"], + "K562": ["ENCFF205FNC"], + "IMR90": ["ENCFF618FFB"], + "GM12878": ["ENCFF467CXY", "ENCFF940NSD"], + "H1ESC": ["ENCFF733TCL"]} + +def main_fetch_training_files(encid, args_json, model_paths, name): + success = False + + # find the training test regions + args_json["training and test regions tar"] = {} + readme_file = "READMEs/bias.models.README" + assert(os.path.isfile(readme_file)) + args_json["training and test regions tar"]["file.paths"] = [(readme_file, "README.md")] + + if name in ["HEPG2", "K562"]: + main_dir="/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/DNASE_PE/" + input_peaks = os.path.join(main_dir, name + "/data/peaks_no_blacklist.bed.gz") + else: + main_dir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/DNASE/" + input_peaks = os.path.join(odir, encid + "/preprocessing/downloads/peaks.bed.gz") + + if os.path.isfile(input_peaks): + args_json["training and test regions tar"]["file.paths"].append((input_peaks,"peaks.all_input_regions."+encid+".bed.gz")) + else: + success = False + return success, args_json + + if name in ["H1ESC"]: + main_dir="/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/DNASE_SE/" + + log_paths = model_upload_utils.fetch_preprocessing_log_files(odir,encid,main_dir, name) + args_json["training and test regions tar"]["logs.training_test_regions."+encid] = {"file.paths": log_paths} + assert(len(log_paths) == 3) + + for i in range(5): + data_paths, log_paths = model_upload_utils.fetch_per_fold_training_data(odir,model_paths[i], encid, i, main_dir, name) + + args_json["training and test regions tar"]["fold_"+str(i)] = {} + args_json["training and test regions tar"]["fold_"+str(i)]["file.paths"] = data_paths + args_json["training and test regions tar"]["fold_"+str(i)]["logs.training_test_regions.fold_"+str(i)+"."+encid] = {"file.paths": log_paths} + print(len(data_paths)) + assert(len(data_paths) == 8) + assert(len(log_paths) == 2) + + if len(data_paths) != 8: + success = False + return success, 
def main_fetch_preprocessing_files(encid, args_json, bam_ids, name):
    """Fill in experiment metadata and locate the observed-signal bigWig.

    Two locations are probed in order: the per-cell-line ``DNASE_PE`` results
    directory (keyed by ``name``) and the chromatin-atlas directory (keyed by
    ``encid``). The first one that exists is recorded.

    Args:
        encid: Experiment identifier; stored under ``"bias model encid"`` and,
            on success, under ``"experiment"``.
        args_json: Dictionary that is updated in place and also returned.
        bam_ids: Value stored under ``"bam files"`` on success.
        name: Cell-line name; ``"upload bias"`` is set to False only for
            ``"HEPG2"``.

    Returns:
        ``(success, args_json)`` where ``success`` is True when one of the two
        bigWig files exists.
    """
    args_json["upload bias"] = name != "HEPG2"
    args_json["bias model encid"] = encid

    # Candidate bigWig locations, in priority order.
    candidate_bigwigs = (
        "/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/DNASE_PE/"
        + name + "/data/" + name + "_unstranded.bw",
        "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/DNASE/"
        + encid + "/preprocessing/bigWigs/" + encid + ".bigWig",
    )
    for bigwig in candidate_bigwigs:
        if os.path.isfile(bigwig):
            args_json["experiment"] = encid
            args_json["bam files"] = bam_ids
            args_json["assay"] = "DNase-seq"
            args_json["observed signal profile bigWig"] = bigwig
            return True, args_json

    return False, args_json
success=True + return success, args_json + +def main_fetch_bias_model_files(encid, args_json, models_path): + success = False + args_json["bias models tar"] = {} + readme_file = "READMEs/bias.models.README" + assert(os.path.isfile(readme_file)) + args_json["bias models tar"]["file.paths"] = [(readme_file, "README.md")] + #args_json["bias models tar"]["logs.bias.models."+encid] = {"file.paths": None} + + for i in range(5): + data_paths, log_paths = upload_utils.fetch_per_fold_bias_models(odir, models_path[i], encid, i) + + if data_paths is None: + success = False + return success, args_json + + args_json["bias models tar"]["fold_"+str(i)] = {} + args_json["bias models tar"]["fold_"+str(i)]["file.paths"] = data_paths + args_json["bias models tar"]["fold_"+str(i)]["logs.bias.models.fold_"+str(i)+"."+encid] = {"file.paths": log_paths} + #print(log_paths) + # 9 log file expected per model + #print(len(log_paths)) + assert(len(log_paths) >= 2) + assert(len(data_paths) == 2) + success=True + return success, args_json + + +def main_fetch_bias_training_files(encid, args_json, models_path, name): + success = False + + # find the training test regions + args_json["bias training and test regions tar"] = {} + readme_file = "READMEs/bias.training.README" + assert(os.path.isfile(readme_file)) + args_json["bias training and test regions tar"]["file.paths"] = [(readme_file, "README.md")] + + if name in ["HEPG2", "K562"]: + main_dir="/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/DNASE_PE/" + input_peaks = os.path.join(main_dir, name + "/data/peaks_no_blacklist.bed.gz") + else: + main_dir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/DNASE/" + input_peaks = os.path.join(odir, encid + "/preprocessing/downloads/peaks.bed.gz") + + #print(input_peaks) + if os.path.isfile(input_peaks): + args_json["bias training and test regions tar"]["file.paths"].append((input_peaks,"peaks.all_input_regions."+encid+".bed.gz")) + else: + success = False + return success, args_json + 
+ # log files preprocessing and peak-calling + if name in ["HEPG2", "K562"]: + log_paths = upload_utils.bias_fetch_preprocessing_log_files_set_1(odir, encid, main_dir, name) + #print(len(log_paths)) + assert(len(log_paths) == 3) + elif name in ["H1ESC"]: + main_dir="/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/DNASE_SE/" + log_paths = upload_utils.bias_fetch_preprocessing_log_files_set_1(odir, encid, main_dir, name) + #print(len(log_paths)) + assert(len(log_paths) == 3) + + else: + log_paths = upload_utils.bias_fetch_preprocessing_log_files_set_2(odir, encid, main_dir, name) + assert(len(log_paths) == 8) + + + args_json["bias training and test regions tar"]["logs.bias.training_test_regions."+encid] = {"file.paths": log_paths} + + + for i in range(5): + data_paths, log_paths = upload_utils.fetch_per_fold_training_data_bias(odir, models_path[i], encid, i, main_dir, name) + #print(data_paths) + args_json["bias training and test regions tar"]["fold_"+str(i)] = {} + args_json["bias training and test regions tar"]["fold_"+str(i)]["file.paths"] = data_paths + args_json["bias training and test regions tar"]["fold_"+str(i)]["logs.bias.training_test_regions.fold_"+str(i)+"."+encid] = {"file.paths": log_paths} + #print(log_paths) + #print(log_paths) + #print(data_paths) + assert(len(data_paths) == 5) + assert(len(log_paths) == 2) + + #if len(data_paths) != 3: + # success = False + # return success, args_json + + success = True + return success, args_json + + + +if __name__ == "__main__": + + # define readmes specfic to bias model + #for name in ["HEPG2", "GM12878", "K562", "IMR90", "H1ESC"]: + for name in ["HEPG2", "K562", "H1ESC"]: + + encid=encode_id[name] + model_paths = model_atac[model_atac[1]==name][2].values + + model_paths_new = model_atac_new[model_atac_new[1]==name][2].values + + print(model_paths) + + if os.path.isfile(output_dir+"/"+encid+".json"): + continue + + print(encid) + + args_json = {} + + success, args_json = main_fetch_preprocessing_files(encid, 
args_json, data_to_bam[name], name) + if not success: + print("ERR prep") + continue + + if name != "HEPG2": + + success, args_json = main_fetch_bias_training_files(encid, args_json, model_paths, name) + if not success: + print("ERR bias prep") + continue + + success, args_json = main_fetch_bias_model_files(encid, args_json, model_paths) + if not success: + print("ERR bias models") + continue + + if name == "H1ESC": + model_paths = model_paths_new + + success, args_json = main_fetch_model_files(encid, args_json, model_paths, name) + if not success: + print("fail model") + continue + + success, args_json = main_fetch_training_files(encid, args_json, model_paths, name) + if not success: + print("fail train prep") + continue + + + with open(output_dir+"/"+encid+".json", "w") as outfile: + json.dump(args_json, outfile, indent=4) + + #print(args_json) + + + + + diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/dnase_bias_upload_utils.py b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/dnase_bias_upload_utils.py new file mode 100644 index 00000000..053b8008 --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/dnase_bias_upload_utils.py @@ -0,0 +1,301 @@ +import os +import json +import numpy as np + +### utils for preprocessing + + +### utils for training and testing regions + + +def bias_fetch_preprocessing_log_files_set_1(odir, encid, main_dir, name): + # do bed file checks + log_paths = [] + + + # preprocessing log files + preprocessing_log = os.path.join(main_dir, name + "/data/"+name+"_preprocessing.log") + if os.stat(preprocessing_log).st_size != 0: + log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".stdout.txt")) + + preprocessing_log = os.path.join(main_dir, name + "/data/"+name+"_DNASE_PE.sh") + if os.path.isfile(preprocessing_log): + log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".script.sh")) + + preprocessing_log = 
def bias_fetch_preprocessing_log_files_set_2(odir, encid, main_dir, name):
    """Gather preprocessing and peak-calling logs stored under ``odir/encid``.

    Args:
        odir: Root directory that contains one sub-directory per experiment.
        encid: Experiment identifier; used both as the sub-directory name and
            inside the upload names.
        main_dir: Unused here; kept for signature parity with the set_1 variant.
        name: Unused here; kept for signature parity with the set_1 variant.

    Returns:
        List of ``(source_path, upload_name)`` tuples.

    Raises:
        FileNotFoundError: if any of the six preprocessing files is missing
            (they are inspected with ``os.stat``).
    """
    prep_prefix = "logfile.preprocessing." + encid
    peak_prefix = "logfile.peak_calling." + encid

    # Preprocessing outputs: must exist, and are skipped only when empty.
    preprocessing_files = (
        ("/preprocessing/preprocessing.log.e", ".stderr.txt"),
        ("/preprocessing/preprocessing.log.o", ".stdout.txt"),
        ("/preprocessing/" + encid + ".log", ".stdout_v1.txt"),
        ("/preprocessing/preprocess_" + encid + ".log", ".stdout_v2.txt"),
        ("/preprocessing/params_file.json", ".params_file.json"),
        ("/preprocessing/bigWigs/" + encid + ".png", ".bias_pwm.png"),
    )
    # Peak-calling logs: optional, included whenever the file exists.
    peak_calling_files = (
        ("/peak_calling/log.e", ".stdout_v1.txt"),
        ("/peak_calling/log.o", ".stdout_v2.txt"),
    )

    collected = []
    for rel_path, suffix in preprocessing_files:
        source = os.path.join(odir, encid + rel_path)
        if os.stat(source).st_size != 0:
            collected.append((source, prep_prefix + suffix))

    for rel_path, suffix in peak_calling_files:
        source = os.path.join(odir, encid + rel_path)
        if os.path.isfile(source):
            collected.append((source, peak_prefix + suffix))

    return collected
input_paths.append((filtered_regions_bed,"nonpeaks.all_input_regions.fold_"+str(fold_num)+"."+encid+".bed.gz")) + + filtered_regions_bed = os.path.join(odir, encid+"/negatives_data/negatives_with_summit.bed.gz") + if os.path.isfile(filtered_regions_bed): + input_paths.append((filtered_regions_bed,"nonpeaks.all_input_regions.fold_"+str(fold_num)+"."+encid+".bed.gz")) + + else: + if name in ["HEPG2", "K562", "H1ESC"]: + filtered_regions_bed = os.path.join(main_dir, name+"/negatives_data_"+str(fold_num)+"/negatives_with_summit.bed.gz") + print(filtered_regions_bed) + if os.path.isfile(filtered_regions_bed): + input_paths.append((filtered_regions_bed,"nonpeaks.all_input_regions.fold_"+str(fold_num)+"."+encid+".bed.gz")) + + else: + filtered_regions_bed = os.path.join(main_dir, name+"/data/negatives_data_"+str(fold_num)+"/negatives_with_summit.bed.gz") + print(filtered_regions_bed) + if os.path.isfile(filtered_regions_bed): + input_paths.append((filtered_regions_bed,"nonpeaks.all_input_regions.fold_"+str(fold_num)+"."+encid+".bed.gz")) + + + filtered_regions_bed = os.path.join(odir, encid+"/negatives_data_"+str(fold_num)+"/negatives_with_summit.bed.gz") + if os.path.isfile(filtered_regions_bed): + input_paths.append((filtered_regions_bed,"nonpeaks.all_input_regions.fold_"+str(fold_num)+"."+encid+".bed.gz")) + + #filtered_regions_bed = os.path.join(model_dir, "bias_model/train_test_regions/peaks.testset.bed.gz") + #print(filtered_regions_bed) + #if os.path.isfile(filtered_regions_bed): + # input_paths.append((filtered_regions_bed,"peaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz")) + + filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias_may_7_2024/nonpeaks.trainingset.bed.gz") + if os.path.isfile(filtered_regions_bed): + input_paths.append((filtered_regions_bed,"nonpeaks.trainingset.fold_"+str(fold_num)+"."+encid+".bed.gz")) + + filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias_may_7_2024/nonpeaks.validationset.bed.gz") + if 
os.path.isfile(filtered_regions_bed): + input_paths.append((filtered_regions_bed,"nonpeaks.validationset.fold_"+str(fold_num)+"."+encid+".bed.gz")) + + filtered_regions_bed = os.path.join(model_dir, "train_test_regions_bias_may_7_2024/nonpeaks.testset.bed.gz") + if os.path.isfile(filtered_regions_bed): + input_paths.append((filtered_regions_bed,"nonpeaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz")) + + #print(input_paths) + #print(filtered_regions_bed) + + if fold_num==0: + if name in ["HEPG2", "K562", "H1ESC"]: + negatives_log = os.path.join(main_dir, name+"/negatives_data/make_background_regions.log") + if os.path.isfile(negatives_log): + if os.stat(negatives_log).st_size != 0: + log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout_v1.txt")) + else: + + negatives_log = os.path.join(main_dir, name+"/data/negatives_data/make_background_regions.log") + if os.path.isfile(negatives_log): + if os.stat(negatives_log).st_size != 0: + log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout_v1.txt")) + + negatives_log = os.path.join(odir, encid+"/negatives_data/make_background_regions.log") + if os.path.isfile(negatives_log): + if os.stat(negatives_log).st_size != 0: + log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout_v1.txt")) + + negatives_log = os.path.join(odir, encid+"/negatives_data/gc_matching.log.o") + if os.path.isfile(negatives_log): + if os.stat(negatives_log).st_size != 0: + log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout_v2.txt")) + + else: + if name in ["HEPG2", "K562", "H1ESC"]: + negatives_log = os.path.join(main_dir, name+"/negatives_data_"+str(fold_num)+"/make_background_regions.log") + if os.path.isfile(negatives_log): + if os.stat(negatives_log).st_size != 0: + log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout_v1.txt")) + else: + negatives_log 
= os.path.join(main_dir, name+"/data/negatives_data_"+str(fold_num)+"/make_background_regions.log") + if os.path.isfile(negatives_log): + if os.stat(negatives_log).st_size != 0: + log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout_v1.txt")) + + negatives_log = os.path.join(odir, encid+"/negatives_data_"+str(fold_num)+"/make_background_regions.log") + if os.path.isfile(negatives_log): + if os.stat(negatives_log).st_size != 0: + log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout_v1.txt")) + + + if fold_num==0: + if name in ["HEPG2", "K562", "H1ESC"]: + negatives_log = os.path.join(main_dir, name+"/negatives_data/negatives_compared_with_foreground.png") + #print(negatives_log) + if os.path.isfile(negatives_log): + if os.stat(negatives_log).st_size != 0: + log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.png")) + else: + negatives_log = os.path.join(main_dir, name+"/data/negatives_data/negatives_compared_with_foreground.png") + if os.path.isfile(negatives_log): + if os.stat(negatives_log).st_size != 0: + log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.png")) + + negatives_log = os.path.join(odir, encid+"/negatives_data/negatives_compared_with_foreground.png") + if os.path.isfile(negatives_log): + if os.stat(negatives_log).st_size != 0: + log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.png")) + + else: + if name in ["HEPG2", "K562", "H1ESC"]: + negatives_log = os.path.join(main_dir, name+"/negatives_data_"+str(fold_num)+"/negatives_compared_with_foreground.png") + if os.path.isfile(negatives_log): + if os.stat(negatives_log).st_size != 0: + log_paths.append((negatives_log,"logfile.gc_matching.fold_"+str(fold_num)+"."+encid+".stdout.png")) + else: + negatives_log = os.path.join(main_dir, 
def fetch_per_fold_bias_models(odir, model_dir, encid, fold_num):
    """Collect the bias-model artifacts and training logs of one fold.

    Args:
        odir: Unused here; kept for signature compatibility with callers.
        model_dir: Directory of the fold; files are looked up under
            ``<model_dir>/bias_model/``.
        encid: Experiment identifier embedded in every upload name.
        fold_num: Fold index embedded in every upload name.

    Returns:
        ``(input_paths, log_paths)``, two lists of ``(source_path, upload_name)``
        tuples, or ``(None, None)`` when either required model file
        (``bias.h5`` or ``new_model_formats_vf/bias.tar``) is missing.
    """
    fold_tag = "fold_" + str(fold_num) + "." + encid
    model_prefix = "model.bias." + fold_tag
    log_prefix = "logfile.modelling." + fold_tag

    # Both model formats are mandatory: bail out as soon as one is absent.
    required_models = (
        ("bias_model/bias.h5", ".h5"),
        ("bias_model/new_model_formats_vf/bias.tar", ".tar"),
    )
    input_paths = []
    for rel_path, suffix in required_models:
        model_file = os.path.join(model_dir, rel_path)
        if not os.path.isfile(model_file):
            return None, None
        input_paths.append((model_file, model_prefix + suffix))

    # Optional logs: for each entry the first existing candidate location wins.
    optional_logs = (
        (("bias_model/train_bias_model.log",), ".stdout.txt"),
        (("bias_model/bias.args.json",), ".args.json"),
        (("bias_model/bias_data_params.tsv",
          "bias_model/newgen/bias_data_params.tsv"), ".bias_data_params.tsv"),
        (("bias_model/bias_model_params.tsv",
          "bias_model/newgen/bias_model_params.tsv"), ".bias_train_params.tsv"),
        (("bias_model/bias.params.json",), ".bias_train_params.json"),
        (("bias_model/bias.log",), ".epoch_loss.csv"),
        (("bias_model/bias.log.batch",), ".batch_loss.tsv"),
    )
    log_paths = []
    for candidates, suffix in optional_logs:
        for rel_path in candidates:
            log_file = os.path.join(model_dir, rel_path)
            if os.path.isfile(log_file):
                log_paths.append((log_file, log_prefix + suffix))
                break

    return input_paths, log_paths
def remap_chrom(chrom, splits_dict):
    '''
    Map a chromosome name to the label of the split that contains it.

    The splits are checked in the order train, valid, test; the first split
    whose chromosome collection contains `chrom` decides the label
    ("chrom_train", "chrom_valid" or "chrom_test"). A chromosome found in
    none of them is labelled "ignore".
    '''
    for split_name in ("train", "valid", "test"):
        if chrom in splits_dict[split_name]:
            return "chrom_" + split_name
    return "ignore"
def scale_gc(cur_gc):
    """
    Move a gc-fraction value one 0.01 step up or down, chosen at random.

    The shifted value is rounded to 2 decimals. A result at or below 0 is
    pushed back up by 0.01 and a result at or above 1 is pushed back down by
    0.01, after which the value is asserted to lie in [0, 1].
    """
    step = 0.01 if random.random() > 0.5 else -0.01
    shifted = round(cur_gc + step, 2)
    if shifted <= 0:
        shifted += 0.01
    if shifted >= 1:
        shifted -= 0.01
    assert shifted >= 0
    assert shifted <= 1
    return shifted
+ """ + if chrom not in used_negatives: + used_negatives[chrom]={} + + if cur_gc not in used_negatives[chrom]: + used_negatives[chrom][cur_gc]=[] + + while (cur_gc not in negatives[chrom]) or (len(used_negatives[chrom][cur_gc])>=len(negatives[chrom][cur_gc])): + cur_gc=scale_gc(cur_gc) + if cur_gc not in used_negatives[chrom]: + used_negatives[chrom][cur_gc]=[] + return cur_gc,used_negatives + + + +if __name__=="__main__": + + args=parse_args() + + splits_dict=json.load(open(args.chr_fold_path)) + + negatives=make_gc_dict(args.candidate_negatives, splits_dict) + used_negatives=dict() + cur_peaks=pd.read_csv(args.foreground_gc_bed,header=None,sep='\t') + negatives_bed = [] + print(len(list(cur_peaks.iterrows()))) + + foreground_gc_vals = [] + output_gc_vals = [] + ignored_chroms = [] + for index,row in tqdm(list(cur_peaks.iterrows())): + + chrom=row[0] + start=row[1] + end=row[2] + gc_value=row[3] + + chrom_real=chrom + chrom = remap_chrom(chrom, splits_dict) + if chrom == "ignore": + ignored_chroms.append(chrom_real) + continue + + if chrom=="chrom_train" or chrom=="chrom_valid": + #neg_to_pos_ratio = args.neg_to_pos_ratio_train + continue + else: + neg_to_pos_ratio = 4 + + # for every gc value in positive how many negatives to find + # we will keep the ratio of positives to negatives in the test set same + for rep in range(neg_to_pos_ratio): + cur_gc,used_negatives=adjust_gc(chrom,gc_value,negatives,used_negatives) + num_candidates=len(negatives[chrom][cur_gc]) + rand_neg_index=random.randint(0,num_candidates-1) + while rand_neg_index in used_negatives[chrom][cur_gc]: + cur_gc,used_negatives=adjust_gc(chrom,cur_gc,negatives,used_negatives) + num_candidates=len(negatives[chrom][cur_gc]) + rand_neg_index=random.randint(0,num_candidates-1) + + used_negatives[chrom][cur_gc].append(rand_neg_index) + neg_tuple=negatives[chrom][cur_gc][rand_neg_index] + neg_chrom=neg_tuple[0] + neg_start=neg_tuple[1] + neg_end=neg_tuple[2] + neg_chrom_real=neg_tuple[3] + 
negatives_bed.append([neg_chrom_real,int(neg_start),int(neg_end), cur_gc]) + output_gc_vals.append(cur_gc) + foreground_gc_vals.append(gc_value) + + print("Following foreground chromosomes {} were ignored since they are not present in the given fold".format(",".join(list(set(ignored_chroms))))) + negatives_bed = pd.DataFrame(negatives_bed) + negatives_bed.to_csv(args.output_prefix+".bed", sep='\t', index=False, header=False, quoting=csv.QUOTE_NONE) + + # checking how far the true distribution of foreground is compared to the backgrounds generated + bins = np.linspace(0, 1, 100) + plt.hist([output_gc_vals,foreground_gc_vals], bins, density=True, label=['negatives gc distribution', "foreground gc distribution"]) + plt.xlabel("GC content") + plt.ylabel("Density") + plt.legend(loc='upper right') + plt.savefig(args.output_prefix+"_compared_with_foreground.png") + + diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/run_script.py b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/run_script.py new file mode 100644 index 00000000..763f2c12 --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/model_uploads/bias_models/chrombpnet/make_test_negatives/run_script.py @@ -0,0 +1,26 @@ +import pandas as pd +import os + +model_atac = pd.read_csv("../atac_bias_model_chrombpnet.csv",sep=",", header=None) + + +print(model_atac.head()) + +for i,r in model_atac.iterrows(): + + print(r) + if os.path.isfile(os.path.join(r[2], "train_test_regions_may_7_2024/nonpeaks.testset.bed.gz")): + try: + tdata = pd.read_csv(os.path.join(r[2], "train_test_regions_may_7_2024/nonpeaks.testset.bed.gz")) + continue + except: + pass + + print(os.path.join(r[2], "train_test_regions_may_7_2024/nonpeaks.testset.bed.gz")) + if r[0].split("_")[-1] == "0": + command = "bash script.sh "+r[2]+" "+r[1]+" "+r[0] + else: + command = "bash script.sh "+r[2]+" "+r[1]+" "+r[0]+" "+"_"+str(r[0].split("_")[-1]) + + 
import os

import pandas as pd

# Driver script: (re)build the GC-matched test-set nonpeaks for DNase models.
# Each row of the model table is read as (fold name, cell type, model dir).
MODEL_TABLE = "/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/v1/model_dir_dnase_v2.1.csv"
TESTSET_RELPATH = "train_test_regions_may_7_2024/nonpeaks.testset.bed.gz"

model_atac = pd.read_csv(MODEL_TABLE, sep=",", header=None)


print(model_atac.head())

for i, r in model_atac.iterrows():

    # Only H1ESC rows are processed in the current configuration.
    if r[1] != "H1ESC":
        continue
    # NOTE(review): with the H1ESC-only filter above, the DNASE_PE branch
    # cannot fire; it is kept for when the filter is lifted.
    if r[1] in ["HEPG2", "K562"]:
        tag = "DNASE_PE"
    else:
        tag = "DNASE_SE"

    testset_path = os.path.join(r[2], TESTSET_RELPATH)
    if os.path.isfile(testset_path):
        # A test set that already exists and parses is left alone; an
        # unreadable one (e.g. truncated gzip) is regenerated below.
        try:
            pd.read_csv(testset_path)
        except Exception as err:
            print("unreadable test set, regenerating:", err)
        else:
            continue

    print(testset_path)
    # fold_0 data lives in "negatives_data"; other folds use a "_<n>" suffix,
    # which script_dnase.sh receives as its optional fifth argument.
    fold_suffix = r[0].split("_")[-1]
    command = "bash script_dnase.sh "+r[2]+" "+r[1]+" "+r[0]+" "+tag
    if fold_suffix != "0":
        command += " " + "_" + fold_suffix

    print(command)
    os.system(command)
#!/usr/bin/env bash
# Build the GC-matched test-set nonpeaks for one ATAC bias-model fold.
#
# Usage: bash script.sh <modeldir> <cell> <fold_name> [fold_suffix]
#   modeldir     model directory containing train_test_regions_may_7_2024/
#   cell         cell-type directory name under the ATAC_PE results
#   fold_name    name of the split file (without .json) in splits_format/
#   fold_suffix  optional suffix of the negatives_data directory (e.g. _1);
#                omitted for fold 0
#
# Stop at the first failing step so that a failed sampling run cannot leave
# behind an empty-but-valid nonpeaks.testset.bed.gz.
set -euo pipefail

modeldir=$1
celll=$2
foldn=/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/splits_format/$3.json
fold=${4:-}

negdir=/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/$celll/negatives_data$fold
outdir=$modeldir/train_test_regions_may_7_2024

python get_gc_matched_negatives_test.py \
        -c "$negdir/candidate.negatives.bed" \
        -f "$negdir/foreground.gc.bed" \
        -o "$outdir/negatives" \
        -fl "$foldn"

# Pad the 3-column BED to 10 narrowPeak-style columns with a fixed summit offset.
awk -v OFS="\t" '{print $1, $2, $3, ".", ".", ".", ".", ".", ".", "1057"}' "$outdir/negatives.bed" > "$outdir/negatives_with_summit.bed"

gzip -c "$outdir/negatives_with_summit.bed" > "$outdir/nonpeaks.testset.bed.gz"
import os
import json
import numpy as np


### utils for model uploads

# (path relative to the model directory, upload-name stem, extension), in the
# order the files appear in the returned list.
_MODEL_FILES = (
    ("chrombpnet_model/chrombpnet_wo_bias.h5", "model.chrombpnet_nobias", ".h5"),
    ("chrombpnet_model/chrombpnet.h5", "model.chrombpnet", ".h5"),
    ("chrombpnet_model/bias_model_scaled.h5", "model.bias_scaled", ".h5"),
    ("new_model_formats_may_7_24_vf/chrombpnet.tar", "model.chrombpnet", ".tar"),
    ("new_model_formats_may_7_24_vf/chrombpnet_wo_bias.tar", "model.chrombpnet_nobias", ".tar"),
    ("new_model_formats_may_7_24_vf/bias_model_scaled.tar", "model.bias_scaled", ".tar"),
)

# (path relative to the model directory, upload-name suffix) for the training
# logs that are checked for non-emptiness.
_MODEL_LOGS = (
    ("chrombpnet_model/chrombpnet_data_params.tsv", ".chrombpnet_data_params.tsv"),
    ("chrombpnet_model/chrombpnet_model_params.tsv", ".chrombpnet_model_params.tsv"),
    ("chrombpnet_model/chrombpnet.params.json", ".chrombpnet.params.json"),
    ("chrombpnet_model/chrombpnet.log", ".epoch_loss.csv"),
    ("chrombpnet_model/chrombpnet.log.batch", ".batch_loss.tsv"),
    ("chrombpnet_model/train_chrombpnet_model.log", ".stdout_v1.txt"),
)

# Region files looked up under <model_dir>/train_test_regions_may_7_2024/.
_REGION_SETS = (
    "peaks.trainingset",
    "peaks.validationset",
    "peaks.testset",
    "nonpeaks.trainingset",
    "nonpeaks.validationset",
    "nonpeaks.testset",
)


def fetch_per_fold_models(odir, model_dir, encid, fold_num):
    """Collect the model files and training logs of one fold.

    Args:
        odir: unused; kept so existing callers keep working.
        model_dir: directory of the trained fold.
        encid: experiment identifier embedded in every upload name.
        fold_num: fold index embedded in every upload name.

    Returns:
        ``(input_paths, log_paths, log_paths_opt)`` where each list holds
        ``(source_path, upload_name)`` tuples; ``log_paths_opt`` is always
        empty. If any of the six model files is missing, its path is printed
        and ``(None, None, None)`` is returned.

    Raises:
        OSError: if one of the logs checked with ``os.stat`` does not exist
            (only ``chrombpnet.args.json`` is allowed to be absent).
    """
    fold_tag = ".fold_" + str(fold_num) + "." + encid

    input_paths = []
    for rel_path, stem, ext in _MODEL_FILES:
        model_file = os.path.join(model_dir, rel_path)
        if not os.path.isfile(model_file):
            # Always report the file that is actually missing.
            print(model_file)
            return None, None, None
        input_paths.append((model_file, stem + fold_tag + ext))

    ### fetch main logs
    log_prefix = "logfile.modelling" + fold_tag
    log_paths = []
    log_paths_opt = []

    args_log = os.path.join(model_dir, "chrombpnet_model/chrombpnet.args.json")
    if os.path.isfile(args_log):
        log_paths.append((args_log, log_prefix + ".args.json"))
    else:
        print(args_log)

    for rel_path, suffix in _MODEL_LOGS:
        modelling_log = os.path.join(model_dir, rel_path)
        # Empty logs are reported and skipped; a missing log raises.
        if os.stat(modelling_log).st_size != 0:
            log_paths.append((modelling_log, log_prefix + suffix))
        else:
            print(modelling_log)

    return input_paths, log_paths, log_paths_opt


### utils for training and testing regions

def _stat_first_available(candidates):
    """Return ``(path, size)`` of the first candidate that can be stat'ed.

    The last candidate is not guarded, so its ``OSError`` propagates when none
    of the candidates exists.
    """
    for candidate in candidates[:-1]:
        try:
            return candidate, os.stat(candidate).st_size
        except OSError:
            continue
    return candidates[-1], os.stat(candidates[-1]).st_size


def fetch_preprocessing_log_files(odir, encid, main_dir, name):
    """Collect the non-empty preprocessing logs of one experiment.

    Args:
        odir: unused; kept so existing callers keep working.
        encid: experiment identifier embedded in every upload name.
        main_dir: directory holding one sub-directory per ``name``.
        name: sub-directory (and file-name prefix) of the experiment.

    Returns:
        List of ``(source_path, upload_name)`` tuples for the stdout log, the
        preprocessing script and the bias PWM image, skipping empty files.

    Raises:
        OSError: if the stdout log, every script candidate, or the bias PWM
            image is missing.
    """
    # do bed file checks
    data_dir = name + "/data/"
    upload_prefix = "logfile.preprocessing." + encid
    log_paths = []

    preprocessing_log = os.path.join(main_dir, data_dir + name + "_preprocessing.log")
    if os.stat(preprocessing_log).st_size != 0:
        log_paths.append((preprocessing_log, upload_prefix + ".stdout.txt"))

    # The script was saved under different names; use the first that exists.
    script_candidates = [
        os.path.join(main_dir, data_dir + name.lower() + "_atac_fold_0.sh"),
        os.path.join(main_dir, data_dir + name + "_DNASE_PE.sh"),
        os.path.join(main_dir, data_dir + "h1_dnase_fold_0.sh"),
    ]
    script_path, script_size = _stat_first_available(script_candidates)
    if script_size != 0:
        log_paths.append((script_path, upload_prefix + ".script_v2.sh"))

    bias_pwm = os.path.join(main_dir, data_dir + name + "_bias_pwm.png")
    if os.stat(bias_pwm).st_size != 0:
        log_paths.append((bias_pwm, upload_prefix + ".bias_pwm.png"))

    return log_paths


def fetch_per_fold_training_data(odir, model_dir, encid, fold_num, main_dir, name):
    """Collect the region files and GC-matching logs of one fold.

    Args:
        odir: unused; kept so existing callers keep working.
        model_dir: directory of the trained fold.
        encid: experiment identifier embedded in the upload names.
        fold_num: fold index; fold 0 reads ``negatives_data``, other folds
            read ``negatives_data_<fold_num>``.
        main_dir: directory holding one sub-directory per ``name``.
        name: sub-directory of the experiment under ``main_dir``.

    Returns:
        ``(input_paths, log_paths)``, both lists of
        ``(source_path, upload_name)`` tuples. Region files that do not exist
        are silently left out of ``input_paths``.

    Raises:
        OSError: if either GC-matching log file is missing.
    """
    input_paths = []
    log_paths = []
    fold_tag = ".fold_" + str(fold_num) + "." + encid

    opath = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/splits_format/"
    splits_json = os.path.join(opath + "/fold_" + str(fold_num) + ".json")
    if os.path.isfile(splits_json):
        input_paths.append((splits_json, "cv_params.fold_" + str(fold_num) + ".json"))

    if fold_num == 0:
        negatives_dir = name + "/negatives_data/"
    else:
        negatives_dir = name + "/negatives_data_" + str(fold_num) + "/"

    all_nonpeaks = os.path.join(main_dir, negatives_dir + "negatives_with_summit.bed.gz")
    if os.path.isfile(all_nonpeaks):
        input_paths.append((all_nonpeaks, "nonpeaks.all_input_regions" + fold_tag + ".bed.gz"))

    for region_set in _REGION_SETS:
        regions_bed = os.path.join(model_dir, "train_test_regions_may_7_2024/" + region_set + ".bed.gz")
        if os.path.isfile(regions_bed):
            input_paths.append((regions_bed, region_set + fold_tag + ".bed.gz"))

    # preprocessing logs to include
    gc_logs = (
        ("make_background_regions.log", ".stdout.txt"),
        ("negatives_compared_with_foreground.png", ".stdout.png"),
    )
    for file_name, suffix in gc_logs:
        negatives_log = os.path.join(main_dir, negatives_dir + file_name)
        if os.stat(negatives_log).st_size != 0:
            log_paths.append((negatives_log, "logfile.gc_matching" + fold_tag + suffix))

    return input_paths, log_paths
+├── fold_0 +│ ├── model.bias.fold_0.encid.h5 # bias model in .h5 format +│ ├── model.bias.fold_0.encid.tar # bias model in SavedModel format +│ │ after being untarred, it results in a directory named "bias" +│ └── logs.bias.models.fold_0.encid # folder containing log files for training models +│ +├── fold_1 +│ └── ... # similar directory structure as fold_0 directory above +│ +├── fold_2 +│ └── ... # similar directory structure as fold_0 directory above +│ +├── fold_3 +│ └── ... # similar directory structure as fold_0 directory above +│ +└── fold_4 + └── ... # similar directory structure as fold_0 directory above + + +# Pseudocode for loading models in .h5 format + +(1) Use the code below in python after appropriately defining `model_in_h5_format` and `inputs`. +(2) `inputs` is a one hot encoded sequence of shape (N,2114,4). Here N corresponds to the +number of tested sequences, 2114 is the input sequence length and 4 corresponds to [A,C,G,T]. + +``` +import tensorflow as tf +from tensorflow.keras.utils import get_custom_objects +from tensorflow.keras.models import load_model + +custom_objects={"tf": tf} +get_custom_objects().update(custom_objects) + +model=load_model(model_in_h5_format,compile=False) +outputs = model(inputs) +``` + +The list `outputs` consists of two elements. The first element has a shape of (N, 1000) and +contains logit predictions for a 1000-base-pair output. The second element, with a shape of +(N, 1), contains logcount predictions. To transform these predictions into per-base signals, +follow the provided pseudo code lines below.
+ +``` +import numpy as np + +def softmax(x, temp=1): + norm_x = x - np.mean(x,axis=1, keepdims=True) + return np.exp(temp*norm_x)/np.sum(np.exp(temp*norm_x), axis=1, keepdims=True) + +predictions = softmax(outputs[0]) * (np.exp(outputs[1])-1) +``` + +# Pseudocode for loading models in .tar format + +(1) First untar the directory as follows `tar -xvf model.tar` +(2) Use the code below in python after appropriately defining `model_dir_untared` and `inputs` +(3) `inputs` is a one hot encoded sequence of shape (N,2114,4). Here N corresponds to the number +of tested sequences, 2114 is the input sequence length and 4 corresponds to [A,C,G,T]. + +Reference: https://www.tensorflow.org/api_docs/python/tf/saved_model/load + +``` +import tensorflow as tf + +model = tf.saved_model.load(model_dir_untared) +outputs = model.signatures['serving_default'](**{'sequence':inputs.astype('float32')}) +``` + +The variable `outputs` represents a dictionary containing two key-value pairs. The first key +is `logits_profile_predictions`, holding a value with a shape of (N, 1000). This value corresponds +to logit predictions for a 1000-base-pair output. The second key, named `logcount_predictions`, +is associated with a value of shape (N, 1), representing logcount predictions. To transform these +predictions into per-base signals, utilize the provided pseudo code lines mentioned below.
+ +``` +import numpy as np +def softmax(x, temp=1): + norm_x = x - np.mean(x,axis=1, keepdims=True) + return np.exp(temp*norm_x)/np.sum(np.exp(temp*norm_x), axis=1, keepdims=True) + +predictions = softmax(outputs["logits_profile_predictions"]) * (np.exp(outputs["logcount_predictions"])-1) +``` + +# Docker image to load and use the models + +https://hub.docker.com/r/kundajelab/chrombpnet-atlas/ (tag:v1) + +# Tool box to do downstream analysis with the models + +https://github.com/kundajelab/chrombpnet/wiki diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/READMEs/bias.training.README b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/READMEs/bias.training.README new file mode 100644 index 00000000..8faa0ea2 --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/READMEs/bias.training.README @@ -0,0 +1,63 @@ +# Directory Structure Format +. +├── peaks.all_input_regions.encid.bed.gz # Peaks input to the bias training script +├── logs.bias.training_test_regions.encid # folder containing log files for peak and nonpeak generation scripts +│ +├── fold_0 +│ ├── cv_params.fold_0.json # training, validation and test chromosomes used in fold 0 +│ ├── nonpeaks.all_input_regions.fold_0.encid.bed.gz # Non peaks input to the bias training script +│ ├── nonpeaks.trainingset.fold_0.encid.bed.gz # nonpeaks used in training set of fold 0 bias model +│ ├── nonpeaks.validationset.fold_0.encid.bed.gz # nonpeaks used in validation set of fold 0 bias model +│ ├── nonpeaks.testset.fold_0.encid.bed.gz # nonpeaks used in test set of fold 0 bias model +│ └── logs.bias.training_test_regions.fold_0.encid # folder containing log files for training bias model on fold 0 +│ +├── fold_1 +│ └── ... # similar directory structure as fold_0 directory above +│ +├── fold_2 +│ └── ... # similar directory structure as fold_0 directory above +│ +├── fold_3 +│ └── ... 
# similar directory structure as fold_0 directory above +│ +└── fold_4 + └── ... # similar directory structure as fold_0 directory above + +# Bed File Format for Peaks + +* All the bed files are in narrowpeak format with 10 columns. + +1) chrom - Name of the chromosome (or contig, scaffold, etc.). +2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0. +3) chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99. +4) name - Name given to a region (preferably unique). Use "." if no name is assigned. +5) score - Indicates how dark the peak will be displayed in the browser (0-1000). If all scores were "0" when the data were submitted to the DCC, the DCC assigned scores 1-1000 based on signal value. Ideally the average signalValue per base spread is between 100-1000. +6) strand - +/- to denote strand or orientation (whenever applicable). Use "." if no orientation is assigned. +7) signalValue - Measurement of overall (usually, average) enrichment for the region. +8) pValue - Measurement of statistical significance (-log10). Use -1 if no pValue is assigned. +9) qValue - Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned. +10) peak - Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called. + +# Bed File Format for Nonpeaks + +* All the bed files are in narrowpeak format with 10 columns. + +1) chrom - Name of the chromosome (or contig, scaffold, etc.). +2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0. +3) chromEnd - The ending position of the feature in the chromosome or scaffold.
The chromEnd base is not included in the display of the feature. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases numbered 0-99. +4) empty character - "." +5) empty character - "." +6) empty character - "." +7) empty character - "." +8) empty character - "." +9) empty character - "." +10) (chromEnd-chromStart)/2 + +# Format of file `cv_params.fold_0.json` + +A dictionary with following (key,value) pairs, + +1) ("CV_type", "chr_holdout") +2) ("train", list_of_chrs_trainingset) +3) ("valid", list_of_chrs_validationset) +4) ("test", list_of_chrs_testset) diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/atac_bias_model_chrombpnet.csv b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/atac_bias_model_chrombpnet.csv new file mode 100644 index 00000000..15190cf2 --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/atac_bias_model_chrombpnet.csv @@ -0,0 +1,26 @@ +fold_0,GM12878,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/GM12878/nautilus_runs/GM12878_03.01.2022_bias_128_4_1234_0.4_fold_0 +fold_1,GM12878,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/GM12878/GM12878_07.08.2022_bias_128_4_1234_0.4_fold_1_data_type_ATAC_PE +fold_2,GM12878,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/GM12878/GM12878_07.08.2022_bias_128_4_1234_0.4_fold_2_data_type_ATAC_PE +fold_3,GM12878,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/GM12878/GM12878_07.14.2022_bias_128_4_1234_0.4_fold_3_data_type_ATAC_PE +fold_4,GM12878,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/GM12878/GM12878_07.07.2022_bias_128_4_1234_0.4_fold_4_data_type_ATAC_PE 
+fold_0,K562,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/K562/nautilus_runs/K562_02.17.2022_bias_128_4_1234_0.5_fold_0 +fold_1,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_1_data_type_ATAC_PE +fold_2,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_2_data_type_ATAC_PE +fold_3,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_3_data_type_ATAC_PE +fold_4,K562,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/K562/K562_07.07.2022_bias_128_4_2356_0.5_fold_4_data_type_ATAC_PE +fold_0,HEPG2,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/HEPG2/nautilus_runs_jun16/HEPG2_05.09.2022_bias_128_4_1234_0.8_fold_0 +fold_1,HEPG2,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/HEPG2/HEPG2_06.07.2022_bias_128_4_1234_0.8_fold_1 +fold_2,HEPG2,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/HEPG2/HEPG2_05.24.2022_bias_128_4_1234_0.8_fold_2 +fold_3,HEPG2,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/HEPG2/HEPG2_05.22.2022_bias_128_4_1234_0.8_fold_3 +fold_4,HEPG2,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/HEPG2/HEPG2_05.22.2022_bias_128_4_1234_0.8_fold_4 +fold_0,IMR90,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/IMR90/nautilus_runs_apr12/IMR90_04.09.2022_bias_128_4_1234_0.4_fold_0 +fold_1,IMR90,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/IMR90/IMR90_07.17.2022_bias_128_4_1234_0.3_fold_1_data_type_ATAC_PE +fold_2,IMR90,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/IMR90/IMR90_07.17.2022_bias_128_4_1234_0.3_fold_2_data_type_ATAC_PE 
+fold_3,IMR90,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/IMR90/IMR90_07.08.2022_bias_128_4_1234_0.4_fold_3_data_type_ATAC_PE +fold_4,IMR90,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/IMR90/IMR90_07.07.2022_bias_128_4_1234_0.4_fold_4_data_type_ATAC_PE +fold_0,H1ESC,/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/ATAC_PE/H1ESC/nautilus_runs_jun16/H1ESC_05.09.2022_bias_128_4_1234_0.8_fold_0 +fold_1,H1ESC,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/H1ESC/H1ESC_07.23.2022_bias_128_4_1234_0.7_fold_1_data_type_ATAC_PE +fold_2,H1ESC,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/H1ESC/H1ESC_07.17.2022_bias_128_4_1234_0.8_fold_2_data_type_ATAC_PE +fold_3,H1ESC,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/H1ESC/H1ESC_07.17.2022_bias_128_4_1234_0.8_fold_3_data_type_ATAC_PE +fold_4,H1ESC,/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/H1ESC/H1ESC_07.17.2022_bias_128_4_1234_0.8_fold_4_data_type_ATAC_PE + diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/atac_prepare_file_for_upload_models.py b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/atac_prepare_file_for_upload_models.py new file mode 100644 index 00000000..e23c945c --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/atac_prepare_file_for_upload_models.py @@ -0,0 +1,159 @@ +import os +import upload_utils +import json + +odir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/ATAC/" +bw_odir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/full_deepshaps/bigwigs/ATAC/" +#output_dir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022-uploads/jsons/ATAC/stage1/jul_17_2023/" +models_path = ["chrombpnet_model_feb15", "chrombpnet_model_feb15_fold_1", 
"chrombpnet_model_feb15_fold_2", "chrombpnet_model_feb15_fold_3", "chrombpnet_model_feb15_fold_4"] +output_dir = "atac_production_uploads/" +#encids = os.listdir(odir) +encids = open("data/atac_passed.txt").readlines() +encids = [line.strip() for line in encids] + +encode_id = {"K562": "ENCSR868FGK", +"GM12878": "ENCSR637XSC", +"HEPG2": "ENCSR291GJU", +"IMR90": "ENCSR200OML", +"H1ESC": "GSE267154"} + +def main_fetch_preprocessing_files(encid, args_json): + + success_flag = False + args_json["upload bias"] = False + args_json["bias model encid"] = encid + + # find the bams input + preprocessing_path = os.path.join(odir, encid + "/preprocessing/bigWigs/"+encid+".bigWig") + if os.path.isfile(preprocessing_path): + bam_ids = upload_utils.fetch_input_bam_ids(odir,encid) + + if bam_ids == None: + success = False + return success_flag, args_json + + args_json["experiment"] = encid + args_json["bam files"] = bam_ids + args_json["assay"] = "ATAC-seq" + args_json["observed signal profile bigWig"] = preprocessing_path + success = True + else: + success = False + + return success, args_json + +def main_fetch_model_files(encid, args_json): + success = False + args_json["models tar"] = {} + readme_file = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/READMES/models.README" + assert(os.path.isfile(readme_file)) + args_json["models tar"]["file.paths"] = [(readme_file, "README.md")] + args_json["models tar"]["logs.models."+encid] = {"file.paths": None} + + for i in range(5): + data_paths, log_paths, log_paths_opt = upload_utils.fetch_per_fold_models(odir,models_path[i], encid, i) + + if data_paths is None: + success = False + return success, args_json + + args_json["models tar"]["fold_"+str(i)] = {} + args_json["models tar"]["fold_"+str(i)]["file.paths"] = data_paths + args_json["models tar"]["fold_"+str(i)]["logs.models.fold_"+str(i)+"."+encid] = {"file.paths": log_paths+log_paths_opt} + assert(len(data_paths) == 6) + assert(len(log_paths) == 13) + + success=True + 
def main_fetch_training_files(encid, args_json):
    """Add the training/test region files and their logs to ``args_json``.

    Fills ``args_json["training and test regions tar"]`` with the README, the
    input peak and nonpeak files, the preprocessing logs and one ``fold_<i>``
    entry per fold.

    Args:
        encid: experiment identifier used to locate files under ``odir``.
        args_json: dictionary being assembled for the upload; mutated in place.

    Returns:
        ``(success, args_json)``; ``success`` is False when the input peaks or
        the gzipped input nonpeaks are missing.

    Raises:
        AssertionError: if the README is missing or a helper returns an
            unexpected number of files.
    """
    # find the training test regions
    regions = {}
    args_json["training and test regions tar"] = regions
    readme_file = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/READMES/training_test_regions.README"
    assert(os.path.isfile(readme_file))
    regions["file.paths"] = [(readme_file, "README.md")]

    input_peaks = os.path.join(odir, encid + "/preprocessing/downloads/peaks.bed.gz")
    if not os.path.isfile(input_peaks):
        return False, args_json
    regions["file.paths"].append((input_peaks, "peaks.all_input_regions."+encid+".bed.gz"))

    input_nonpeaks = os.path.join(odir, encid + "/negatives_data/negatives_with_summit.bed")
    if os.path.isfile(input_nonpeaks):
        # Compress the file byte-for-byte; parsing and re-serialising it only
        # to gzip it could silently rewrite values.
        import gzip
        import shutil
        with open(input_nonpeaks, "rb") as raw, gzip.open(input_nonpeaks+".gz", "wb") as packed:
            shutil.copyfileobj(raw, packed)

    input_nonpeaks = os.path.join(odir, encid + "/negatives_data/negatives_with_summit.bed.gz")
    if not os.path.isfile(input_nonpeaks):
        return False, args_json
    regions["file.paths"].append((input_nonpeaks, "nonpeaks.all_input_regions."+encid+".bed.gz"))

    log_paths = upload_utils.fetch_preprocessing_log_files(odir, encid)
    regions["logs.training_test_regions."+encid] = {"file.paths": log_paths}
    assert(len(log_paths) == 12)

    for i in range(5):
        data_paths, log_paths = upload_utils.fetch_per_fold_training_data(odir, models_path[i], encid, i)

        fold_entry = {"file.paths": data_paths}
        fold_entry["logs.training_test_regions.fold_"+str(i)+"."+encid] = {"file.paths": log_paths}
        regions["fold_"+str(i)] = fold_entry
        # The asserts already enforce the expected file counts, so no separate
        # length check is needed after them.
        assert(len(data_paths) == 7)
        assert(len(log_paths) == 4)

    return True, args_json


if __name__ == "__main__":

    # Build one upload JSON per cell type, skipping those already written and
    # those for which any stage reports failure.
    for name in ["K562", "GM12878", "HEPG2", "IMR90", "H1ESC"]:

        encid = encode_id[name]
        out_path = output_dir+"/"+encid+".json"
        if os.path.isfile(out_path):
            continue

        print(encid)

        args_json = {}

        success, args_json = main_fetch_preprocessing_files(encid, args_json)
        if not success:
            print("fail prep")
            continue

        success, args_json = main_fetch_model_files(encid, args_json)
        if not success:
            print("fail model")
            continue

        success, args_json = main_fetch_training_files(encid, args_json)
        if not success:
            print("fail train prep")
            continue

        with open(out_path, "w") as outfile:
            json.dump(args_json, outfile, indent=4)
open("../data/primary_passed.txt").readlines() +primary_encids = [line.strip() for line in primary_encids] + +celline_encids = open("../data/cellline_passed.txt").readlines() +celline_encids = [line.strip() for line in celline_encids] + +invitro_encids = open("../data/invitro_passed.txt").readlines() +invitro_encids = [line.strip() for line in invitro_encids] + + +ary_models_path = ["chrombppnet_model_encsr283tme_bias", "chrombppnet_model_encsr283tme_bias_fold_1", "chrombppnet_model_encsr283tme_bias_fold_2", "chrombppnet_model_encsr283tme_bias_fold_3", "chrombppnet_model_encsr283tme_bias_fold_4"] +celline_models_path = ["chrombpnet_model_feb15_fold_0", "chrombpnet_model_feb15_fold_1", "chrombpnet_model_feb15_fold_2", "chrombpnet_model_feb15_fold_3", "chrombpnet_model_feb15_fold_4"] +tissue_models_path = ["chrombpnet_model_encsr880cub_bias","chrombppnet_model_encsr880cub_bias_fold_1","chrombppnet_model_encsr880cub_bias_fold_2","chrombppnet_model_encsr880cub_bias_fold_3","chrombppnet_model_encsr880cub_bias_fold_4"] +invitro_models_path = ["chrombpnet_model_encsr146kfx_bias", "chrombpnet_model_encsr146kfx_bias_fold_1", "chrombpnet_model_encsr146kfx_bias_fold_2", "chrombpnet_model_encsr146kfx_bias_fold_3", "chrombpnet_model_encsr146kfx_bias_fold_4"] + +encids = tissue_encids + primary_encids + celline_encids + invitro_encids + +def main_fetch_preprocessing_files(encid, args_json, bias_encid): + + success_flag = False + args_json["upload bias"] = False + args_json["bias model encid"] = bias_encid + + # find the bams input + preprocessing_path = os.path.join(odir, encid + "/preprocessing/bigWigs/"+encid+".bigWig") + if os.path.isfile(preprocessing_path): + bam_ids = upload_utils.fetch_input_bam_ids(odir,encid) + + if bam_ids == None: + success = False + return success_flag, args_json + + args_json["experiment"] = encid + args_json["bam files"] = bam_ids + args_json["assay"] = "DNase-seq" + args_json["observed signal profile bigWig"] = preprocessing_path + success = True 
+ else: + success = False + + return success, args_json + +def main_fetch_model_files(encid, args_json): + success = False + args_json["models tar"] = {} + readme_file = "READMEs/bias.models.README" + assert(os.path.isfile(readme_file)) + args_json["models tar"]["file.paths"] = [(readme_file, "README.md")] + #args_json["models tar"]["logs.models."+encid] = {"file.paths": None} + + for i in range(5): + data_paths, log_paths, log_paths_opt = upload_utils.fetch_per_fold_models(odir,models_path[i], encid, i) + + if data_paths is None: + success = False + return success, args_json + + args_json["models tar"]["fold_"+str(i)] = {} + args_json["models tar"]["fold_"+str(i)]["file.paths"] = data_paths + args_json["models tar"]["fold_"+str(i)]["logs.models.fold_"+str(i)+"."+encid] = {"file.paths": log_paths+log_paths_opt} + # 9 log file expected per model + assert(len(data_paths) == 6) + print(len(log_paths)) + assert(len(log_paths) == 7) + + success=True + return success, args_json + +def main_fetch_training_files(encid, args_json): + success = False + + # find the training test regions + args_json["training and test regions tar"] = {} + #readme_file = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/READMES/training_test_regions.README" + readme_file = "READMEs/bias.models.README" + assert(os.path.isfile(readme_file)) + args_json["training and test regions tar"]["file.paths"] = [(readme_file, "README.md")] + + input_peaks = os.path.join(odir, encid + "/preprocessing/downloads/peaks.bed.gz") + if os.path.isfile(input_peaks): + args_json["training and test regions tar"]["file.paths"].append((input_peaks,"peaks.all_input_regions."+encid+".bed.gz")) + else: + success = False + return success, args_json + + input_nonpeaks_gz = os.path.join(odir, encid + "/negatives_data/negatives_with_summit.bed.gz") + input_nonpeaks = os.path.join(odir, encid + "/negatives_data/negatives_with_summit.bed") + if not os.path.isfile(input_nonpeaks_gz): + if 
os.path.isfile(input_nonpeaks): + import pandas as pd + #os.system("gzip "+input_nonpeaks) + nonpeaks_data = pd.read_csv(input_nonpeaks, sep="\t", header=None) + nonpeaks_data.to_csv(input_nonpeaks+".gz", sep="\t", header=False, index=False, compression="gzip") + #os.system("rm "+input_nonpeaks) + + input_nonpeaks = os.path.join(odir, encid + "/negatives_data/negatives_with_summit.bed.gz") + + if os.path.isfile(input_nonpeaks): + args_json["training and test regions tar"]["file.paths"].append((input_nonpeaks,"nonpeaks.all_input_regions."+encid+".bed.gz")) + else: + success = False + return success, args_json + + log_paths = upload_utils.fetch_preprocessing_log_files(odir,encid) + args_json["training and test regions tar"]["logs.training_test_regions."+encid] = {"file.paths": log_paths} + #print(len(log_paths)) + #print(log_paths) + assert(len(log_paths) == 12) + + for i in range(5): + data_paths, log_paths = upload_utils.fetch_per_fold_training_data(odir,models_path[i], encid, i) + + args_json["training and test regions tar"]["fold_"+str(i)] = {} + args_json["training and test regions tar"]["fold_"+str(i)]["file.paths"] = data_paths + args_json["training and test regions tar"]["fold_"+str(i)]["logs.training_test_regions.fold_"+str(i)+"."+encid] = {"file.paths": log_paths} + #print(data_paths) + assert(len(data_paths) == 7) + + assert(len(log_paths) == 0) + + if len(data_paths) != 7: + success = False + return success, args_json + + success = True + return success, args_json + + +if __name__ == "__main__": + + ignore_list = [] + + for encid in ["ENCSR000EMT", "ENCSR477RTP"]: + #if encid in ignore_list: + # continue + + if encid in primary_encids: + models_path = primary_models_path + bias_encid="ENCSR283TME" + #print("primary") + elif encid in tissue_encids: + models_path = tissue_models_path + bias_encid="ENCSR880CUB" + #print("tissue") + elif encid in invitro_encids: + models_path = invitro_models_path + bias_encid="ENCSR146KFX" + #print("invitro") + elif encid in 
celline_encids: + models_path = celline_models_path + bias_encid="ENCSR149XIL" + #print("celline") + else: + print(encid) + print("type not found") + continue + + if os.path.isfile(output_dir+"/"+encid+".json"): + continue + + print(encid) + args_json = {} + + + success, args_json = main_fetch_preprocessing_files(encid, args_json, bias_encid) + if not success: + print(encid) + print("exit preprocessing") + continue + + success, args_json = main_fetch_model_files(encid, args_json) + if not success: + print(encid) + print("exit models") + continue + + success, args_json = main_fetch_training_files(encid, args_json) + if not success: + print(encid) + print("exit train test regions") + continue + + + with open(output_dir+"/"+encid+".json", "w") as outfile: + json.dump(args_json, outfile, indent=4) + + #print(args_json) + + + + + diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/upload_utils.py b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/upload_utils.py new file mode 100644 index 00000000..9f7f1415 --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/chrombpnet/upload_utils.py @@ -0,0 +1,281 @@ +import os +import json +import numpy as np + +### utils for preprocessing + +def fetch_input_bam_ids(odir,encid): + log_path = os.path.join(odir, encid + "/preprocessing/preprocess_"+encid+".log") + logd = open(log_path).readlines() + set_cflag=False + set_bflag=False + + bams_ids = [] + + for line in logd: + + if set_cflag: + words = line.strip().split() + if words[1] == "cp": + if words[2].split("/")[-1].endswith("bam"): + bam_enc = words[2].split("/")[-1].replace(".bam","") + bams_ids.append(bam_enc) + return bams_ids + else: + print(encid,"error") + return + else: + print(encid,"error") + return + + if set_bflag: + words = line.strip().split() + if words[1] == "samtools" and words[2] == "merge": + encids = words[6:] + for encid in encids: + if 
encid.split("/")[-1].endswith(".bam"): + bam_enc = encid.split("/")[-1].replace(".bam","") + bams_ids.append(bam_enc) + else: + print(encid,"error") + return + return bams_ids + else: + print(encid,"error") + return + + if "Only one source bam file found. Copying over as merged file." in line: + set_cflag=True + if "Merging bam files" in line: + set_bflag=True + +### utils for training and testing regions + +def fetch_preprocessing_log_files(odir, encid): + # do bed file checks + log_paths = [] + + # preprocessing (6 files) + preprocessing_log = os.path.join(odir, encid + "/preprocessing/preprocessing.log.e") + if os.stat(preprocessing_log).st_size != 0: + log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".stderr.txt")) + + preprocessing_log = os.path.join(odir, encid + "/preprocessing/preprocessing.log.o") + if os.stat(preprocessing_log).st_size != 0: + log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".stdout.txt")) + + preprocessing_log = os.path.join(odir, encid + "/preprocessing/"+encid+".log") + if os.stat(preprocessing_log).st_size != 0: + log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".stdout_v1.txt")) + + preprocessing_log = os.path.join(odir, encid + "/preprocessing/preprocess_"+encid+".log") + if os.stat(preprocessing_log).st_size != 0: + log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".stdout_v2.txt")) + + preprocessing_log = os.path.join(odir, encid + "/preprocessing/params_file.json") + if os.stat(preprocessing_log).st_size != 0: + log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".params_file.json")) + + preprocessing_log = os.path.join(odir, encid + "/preprocessing/bigWigs/"+encid+".png") + if os.stat(preprocessing_log).st_size != 0: + log_paths.append((preprocessing_log,"logfile.preprocessing."+encid+".bias_pwm.png")) + + # peak_logs (2 files) + negatives_log = os.path.join(odir, encid + "/peak_calling/log.e") + if os.path.isfile(negatives_log): + 
log_paths.append((negatives_log,"logfile.peak_calling."+encid+".stdout_v1.txt")) + + negatives_log = os.path.join(odir, encid + "/peak_calling/log.o") + if os.path.isfile(negatives_log): + log_paths.append((negatives_log,"logfile.peak_calling."+encid+".stdout_v2.txt")) + + # negative logs (4 files) + negatives_log = os.path.join(odir, encid + "/negatives_data/make_background_regions.log") + if os.path.isfile(negatives_log): + if os.stat(negatives_log).st_size != 0: + log_paths.append((negatives_log,"logfile.gc_matching."+encid+".stdout_v1.txt")) + + negatives_log = os.path.join(odir, encid + "/negatives_data/"+encid+".log") + if os.path.isfile(negatives_log): + if os.stat(negatives_log).st_size != 0: + log_paths.append((negatives_log,"logfile.gc_matching."+encid+".stdout_v1.txt")) + + + negatives_log = os.path.join(odir, encid + "/negatives_data/gc_matching.log.o") + if os.stat(negatives_log).st_size != 0: + log_paths.append((negatives_log,"logfile.gc_matching."+encid+".stdout.txt")) + + negatives_log = os.path.join(odir, encid + "/negatives_data/gc_matching.log.e") + if os.stat(negatives_log).st_size != 0: + log_paths.append((negatives_log,"logfile.gc_matching."+encid+".stderr.txt")) + + + negatives_log = os.path.join(odir, encid + "/negatives_data/negatives_compared_with_foreground.png") + if os.stat(negatives_log).st_size != 0: + log_paths.append((negatives_log,"logfile.gc_matching."+encid+".stdout.png")) + + return log_paths + +def fetch_per_fold_training_data(odir,model_dir,encid, fold_num): + input_paths = [] + log_paths = [] + + opath = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/splits_format/" + filtered_regions_bed = os.path.join(opath + "/fold_"+str(fold_num)+".json") + if os.path.isfile(filtered_regions_bed): + input_paths.append((filtered_regions_bed,"cv_params.fold_"+str(fold_num)+".json")) + + filtered_regions_bed = os.path.join(odir, encid + "/" + model_dir + "/train_test_regions_may_7_2024/peaks.trainingset.bed.gz") + if 
os.path.isfile(filtered_regions_bed): + input_paths.append((filtered_regions_bed,"peaks.trainingset.fold_"+str(fold_num)+"."+encid+".bed.gz")) + + filtered_regions_bed = os.path.join(odir, encid + "/" + model_dir + "/train_test_regions_may_7_2024/peaks.validationset.bed.gz") + if os.path.isfile(filtered_regions_bed): + input_paths.append((filtered_regions_bed,"peaks.validationset.fold_"+str(fold_num)+"."+encid+".bed.gz")) + + filtered_regions_bed = os.path.join(odir, encid + "/" + model_dir + "/train_test_regions_may_7_2024/peaks.testset.bed.gz") + if os.path.isfile(filtered_regions_bed): + input_paths.append((filtered_regions_bed,"peaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz")) + + filtered_regions_bed = os.path.join(odir, encid + "/" + model_dir + "/train_test_regions_may_7_2024/nonpeaks.trainingset.bed.gz") + if os.path.isfile(filtered_regions_bed): + input_paths.append((filtered_regions_bed,"nonpeaks.trainingset.fold_"+str(fold_num)+"."+encid+".bed.gz")) + + filtered_regions_bed = os.path.join(odir, encid + "/" + model_dir + "/train_test_regions_may_7_2024/nonpeaks.validationset.bed.gz") + if os.path.isfile(filtered_regions_bed): + input_paths.append((filtered_regions_bed,"nonpeaks.validationset.fold_"+str(fold_num)+"."+encid+".bed.gz")) + + #filtered_regions_bed = os.path.join(odir, encid + "/" + model_dir + "/train_test_regions_may_7_2024/nonpeaks.testset.bed.gz") + #if os.path.isfile(filtered_regions_bed): + # input_paths.append((filtered_regions_bed,"nonpeaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz")) + + filtered_regions_bed = os.path.join(odir, encid + "/negatives_data/test/test.fold_"+str(fold_num)+".filtered.negatives_with_summit.bed.gz") + if os.path.isfile(filtered_regions_bed): + input_paths.append((filtered_regions_bed,"nonpeaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz")) + + + # preprocessing logs to include + + + return input_paths, log_paths + +### utils for model uploads + +def fetch_per_fold_models(odir, model_dir, 
encid, fold_num): + input_paths = [] + log_paths = [] + log_paths_opt = [] + + cmb = os.path.join(odir, encid + "/" + model_dir + "/chrombpnet_wo_bias.h5") + print(cmb) + if os.path.isfile(cmb): + input_paths.append((cmb,"model.chrombpnet_nobias.fold_"+str(fold_num)+"."+encid+".h5")) + else: + return None, None, None + + cmb = os.path.join(odir, encid + "/" + model_dir + "/chrombpnet.h5") + if os.path.isfile(cmb): + input_paths.append((cmb,"model.chrombpnet.fold_"+str(fold_num)+"."+encid+".h5")) + else: + return None, None, None + + + + bm_model = os.path.join(odir, encid + "/" + model_dir + "/bias_model_scaled.h5") + if os.path.isfile(bm_model): + input_paths.append((bm_model,"model.bias_scaled.fold_"+str(fold_num)+"."+encid+".h5")) + else: + return None, None, None + + cmb = os.path.join(odir, encid + "/" + model_dir + "/new_model_formats_may_7_24_vf/chrombpnet_wo_bias.tar") + if os.path.isfile(cmb): + input_paths.append((cmb,"model.chrombpnet_nobias.fold_"+str(fold_num)+"."+encid+".tar")) + else: + return None, None, None + + cmb = os.path.join(odir, encid + "/" + model_dir + "/new_model_formats_may_7_24_vf/chrombpnet.tar") + if os.path.isfile(cmb): + input_paths.append((cmb,"model.chrombpnet.fold_"+str(fold_num)+"."+encid+".tar")) + else: + return None, None, None + + + bm_model = os.path.join(odir, encid + "/" + model_dir + "/new_model_formats_may_7_24_vf/bias_model_scaled.tar") + if os.path.isfile(bm_model): + input_paths.append((bm_model,"model.bias_scaled.fold_"+str(fold_num)+"."+encid+".tar")) + else: + return None, None, None + + ### fetch main logs + + modelling_log = os.path.join(odir, encid + "/" + model_dir + "/chrombpnet.args.json") + if os.stat(modelling_log).st_size != 0: + log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".args.json")) + else: + print(modelling_log) + + modelling_log = os.path.join(odir, encid + "/" + model_dir + "/chrombpnet_data_params.tsv") + if os.stat(modelling_log).st_size != 0: + 
log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".chrombpnet_data_params.tsv")) + else: + print(modelling_log) + + modelling_log = os.path.join(odir, encid + "/" + model_dir + "/chrombpnet_model_params.tsv") + if os.stat(modelling_log).st_size != 0: + log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".chrombpnet_model_params.tsv")) + else: + print(modelling_log) + + modelling_log = os.path.join(odir, encid + "/" + model_dir + "/chrombpnet.params.json") + if os.stat(modelling_log).st_size != 0: + log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".chrombpnet.params.json")) + else: + print(modelling_log) + + modelling_log = os.path.join(odir, encid + "/" + model_dir + "/chrombpnet.log") + if os.stat(modelling_log).st_size != 0: + log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".epoch_loss.csv")) + else: + print(modelling_log) + + modelling_log = os.path.join(odir, encid + "/" + model_dir + "/chrombpnet.log.batch") + if os.stat(modelling_log).st_size != 0: + log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".batch_loss.tsv")) + else: + print(modelling_log) + + modelling_log = os.path.join(odir, encid + "/" + model_dir + "/train_chrombpnet_model.log") + if os.stat(modelling_log).st_size != 0: + log_paths.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".stdout_v1.txt")) + else: + print(modelling_log) + + #### fetch model training log files ######## + modelling_log = os.path.join(odir, encid + "/" + model_dir + "/modelling.log.e") + if os.path.isfile(modelling_log): + if os.stat(modelling_log).st_size != 0: + log_paths_opt.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".stderr.txt")) + else: + print(modelling_log) + else: + print(modelling_log) + + modelling_log = os.path.join(odir, encid + "/" + model_dir + "/modelling.log.o") + if 
os.path.isfile(modelling_log): + if os.stat(modelling_log).st_size != 0: + log_paths_opt.append((modelling_log,"logfile.modelling.fold_"+str(fold_num)+"."+encid+".stdout.txt")) + else: + print(modelling_log) + else: + print(modelling_log) + + #### fetch model conversion log files ######## + #print(log_paths) + return input_paths, log_paths, log_paths_opt + + + + + + + diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/dnase_prepare_file_for_upload_models.py b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/dnase_prepare_file_for_upload_models.py index 56da09b4..d7050441 100755 --- a/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/dnase_prepare_file_for_upload_models.py +++ b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/dnase_prepare_file_for_upload_models.py @@ -127,7 +127,7 @@ def main_fetch_training_files(encid, args_json): assert(len(log_paths) == 4) - if len(data_paths) != 7: + if len(data_paths) != 8: success = False return success, args_json diff --git a/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/upload_utils.py b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/upload_utils.py index e4b6d6b5..ddf149f6 100755 --- a/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/upload_utils.py +++ b/upload_jsons/upload_jsons_scripts/model_uploads/chrombpnet_models/upload_utils.py @@ -165,7 +165,11 @@ def fetch_per_fold_training_data(odir,model_dir,encid, fold_num): if os.path.isfile(filtered_regions_bed): input_paths.append((filtered_regions_bed,"nonpeaks.validationset.fold_"+str(fold_num)+"."+encid+".bed.gz")) - filtered_regions_bed = os.path.join(odir, encid + "/" + model_dir + "/train_test_regions/nonpeaks.testset.bed.gz") + #filtered_regions_bed = os.path.join(odir, encid + "/" + model_dir + "/train_test_regions/nonpeaks.testset.bed.gz") + #if os.path.isfile(filtered_regions_bed): + # 
input_paths.append((filtered_regions_bed,"nonpeaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz")) + + filtered_regions_bed = os.path.join(odir, encid + "/negatives_data/test/test.fold_"+str(fold_num)+".filtered.negatives_with_summit.bed.gz") if os.path.isfile(filtered_regions_bed): input_paths.append((filtered_regions_bed,"nonpeaks.testset.fold_"+str(fold_num)+"."+encid+".bed.gz")) diff --git a/upload_jsons/upload_jsons_scripts/modisco_uploads/READMEs/modisco.report.README b/upload_jsons/upload_jsons_scripts/modisco_uploads/READMEs/modisco.report.README new file mode 100644 index 00000000..e69de29b diff --git a/upload_jsons/upload_jsons_scripts/modisco_uploads/atac_prepare.py b/upload_jsons/upload_jsons_scripts/modisco_uploads/atac_prepare.py new file mode 100644 index 00000000..dc63d61b --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/modisco_uploads/atac_prepare.py @@ -0,0 +1,75 @@ +import os +import json + +encids = ["IMR90", "H1ESC", "GM12878", "HEPG2", "K562"] + +encode_id = {"K562": "ENCSR868FGK", +"GM12878": "ENCSR637XSC", +"HEPG2": "ENCSR291GJU", +"IMR90": "ENCSR200OML", +"H1ESC": "GSE267154"} + +ooutdir='atac/' + + + + + +for name in encids: + + encid = encode_id[name] + args_json = {} + args_json["experiment"] = encode_id[name] + args_json["sequence motifs tar"] = {} + + success=True + readme_file="READMEs/modisco.report.README" + if os.path.isfile(readme_file): + args_json["sequence motifs tar"]["file.paths"] = [(readme_file, "README.md")] + + args_json["sequence motifs tar"]["counts"] = {"file.paths": []} + args_json["sequence motifs tar"]["profile"] = {"file.paths": []} + + odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/" + counts_modisco=odir+name+"/merge_folds_new_may_05_24/counts/modisco_counts.h5" + if os.path.isfile(counts_modisco): + args_json["sequence motifs tar"]["counts"]["file.paths"].append((counts_modisco, "tfmodisco.raw_output.counts."+encid+".hd5")) + else: + print(counts_modisco) + 
continue + + profile_modisco=odir+name+"/merge_folds_new_may_05_24/profile/modisco_profile.h5" + if os.path.isfile(profile_modisco): + args_json["sequence motifs tar"]["profile"]["file.paths"].append((profile_modisco, "tfmodisco.raw_output.profile."+encid+".hd5")) + else: + print(profile_modisco) + continue + + args_json["sequence motifs tar"]["counts"]["tfmodisco.seq_contrib.counts.meme."+encid] = {"file.paths": []} + + for formats in ["CWM", "CWM-PFM", "hCWM", "hCWM-PFM", "PFM"]: + meme_file=odir+name+"/merge_folds_new_may_05_24/counts/"+formats + if os.path.isfile(meme_file): + args_json["sequence motifs tar"]["counts"]["tfmodisco.seq_contrib.counts.meme."+encid]["file.paths"].append((meme_file,"tfmodisco.seq_contrib.counts."+formats+".meme."+encid)) + else: + print(meme_file) + continue + args_json["sequence motifs tar"]["profile"]["tfmodisco.seq_contrib.profile.meme."+encid] = {"file.paths": []} + + for formats in ["CWM", "CWM-PFM", "hCWM", "hCWM-PFM", "PFM"]: + meme_file=odir+name+"/merge_folds_new_may_05_24/profile/"+formats + if os.path.isfile(meme_file): + args_json["sequence motifs tar"]["profile"]["tfmodisco.seq_contrib.profile.meme."+encid]["file.paths"].append((meme_file,"tfmodisco.seq_contrib.profile."+formats+".meme."+encid)) + else: + print(meme_file) + success=False + break + if not success: + continue + + if not os.path.isfile(ooutdir+encode_id[name]+".json"): + f = open(ooutdir+encode_id[name]+".json", "w") + json.dump(args_json, f, indent=4) + f.close() + + diff --git a/upload_jsons/upload_jsons_scripts/modisco_uploads/dnase_prepare.py b/upload_jsons/upload_jsons_scripts/modisco_uploads/dnase_prepare.py new file mode 100644 index 00000000..7a35cd98 --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/modisco_uploads/dnase_prepare.py @@ -0,0 +1,75 @@ +import os +import json + +encids = ["IMR90_new", "H1ESC_new", "GM12878_new", "HEPG2", "K562"] + +encode_id = {"HEPG2": "ENCSR149XIL", + "K562": "ENCSR000EOT", + "IMR90_new": "ENCSR477RTP", + 
"GM12878_new": "ENCSR000EMT", + "H1ESC_new": "ENCSR000EMU"} + +ooutdir='dnase/' + + + + + +for name in encids: + + encid = encode_id[name] + args_json = {} + args_json["experiment"] = encode_id[name] + args_json["sequence motifs tar"] = {} + + success=True + readme_file="READMEs/modisco.report.README" + if os.path.isfile(readme_file): + args_json["sequence motifs tar"]["file.paths"] = [(readme_file, "README.md")] + + args_json["sequence motifs tar"]["counts"] = {"file.paths": []} + args_json["sequence motifs tar"]["profile"] = {"file.paths": []} + + odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/" + counts_modisco=odir+name+"/merge_folds_new_may_05_24/counts/modisco_counts.h5" + if os.path.isfile(counts_modisco): + args_json["sequence motifs tar"]["counts"]["file.paths"].append((counts_modisco, "tfmodisco.raw_output.counts."+encid+".hd5")) + else: + print(counts_modisco) + continue + + profile_modisco=odir+name+"/merge_folds_new_may_05_24/profile/modisco_profile.h5" + if os.path.isfile(profile_modisco): + args_json["sequence motifs tar"]["profile"]["file.paths"].append((profile_modisco, "tfmodisco.raw_output.profile."+encid+".hd5")) + else: + print(profile_modisco) + continue + + args_json["sequence motifs tar"]["counts"]["tfmodisco.seq_contrib.counts.meme."+encid] = {"file.paths": []} + + for formats in ["CWM", "CWM-PFM", "hCWM", "hCWM-PFM", "PFM"]: + meme_file=odir+name+"/merge_folds_new_may_05_24/counts/"+formats + if os.path.isfile(meme_file): + args_json["sequence motifs tar"]["counts"]["tfmodisco.seq_contrib.counts.meme."+encid]["file.paths"].append((meme_file,"tfmodisco.seq_contrib.counts."+formats+".meme."+encid)) + else: + print(meme_file) + continue + args_json["sequence motifs tar"]["profile"]["tfmodisco.seq_contrib.profile.meme."+encid] = {"file.paths": []} + + for formats in ["CWM", "CWM-PFM", "hCWM", "hCWM-PFM", "PFM"]: + meme_file=odir+name+"/merge_folds_new_may_05_24/profile/"+formats + if 
os.path.isfile(meme_file): + args_json["sequence motifs tar"]["profile"]["tfmodisco.seq_contrib.profile.meme."+encid]["file.paths"].append((meme_file,"tfmodisco.seq_contrib.profile."+formats+".meme."+encid)) + else: + print(meme_file) + success=False + break + if not success: + continue + + if not os.path.isfile(ooutdir+encode_id[name]+".json"): + f = open(ooutdir+encode_id[name]+".json", "w") + json.dump(args_json, f, indent=4) + f.close() + + diff --git a/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/READMEs/bc.predicted.README b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/READMEs/bc.predicted.README new file mode 100644 index 00000000..68a4c99f --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/READMEs/bc.predicted.README @@ -0,0 +1,71 @@ +# Directory Structure Format +. +├── input_regions.pred.chrombpnet_nobias.encid.bed.gz # Input bed regions to obtain prediction h5s from chrombpnet_nobias.h5 for each fold +├── pred.chrombpnet_nobias.fold_mean.encid.h5 # Average of prediction h5s from chrombpnet_nobias.h5 across all folds (input format discussed below) +├── logs.pred.chrombpnet_nobias.fold_mean.encid # Directory containing log files +├── fold_0 +│ ├── pred.chrombpnet_nobias.fold_0.encid.h5 # prediction h5s for fold_0 from chrombpnet_nobias.h5 (input format discussed below) +│ └── logs.pred.chrombpnet_nobias.fold_0.encid # Directory containing log files +│ +├── fold_1 +│ └── ... # similar directory structure as fold_0 directory above +│ +├── fold_2 +│ └── ... # similar directory structure as fold_0 directory above +│ +├── fold_3 +│ └── ... # similar directory structure as fold_0 directory above +│ +└── fold_4 + └── ... # similar directory structure as fold_0 directory above + +# Format of bed file + +* All the bed files are in narrowpeak format with 10 columns and follow GRCh38 assembly coordinates. + +1) chrom - Name of the chromosome (or contig, scaffold, etc.). 
+2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0. +3) chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd is a 1-based coordinate. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases 1 to 100. +4) name - Name given to a region (preferably unique). Use "." if no name is assigned. +5) score - Indicates how dark the peak will be displayed in the browser (0-1000). If all scores were "0" when the data were submitted to the DCC, the DCC assigned scores 1-1000 based on signal value. Ideally the average signalValue per base spread is between 100-1000. +6) strand - +/- to denote strand or orientation (whenever applicable). Use "." if no orientation is assigned. +7) signalValue - Measurement of overall (usually, average) enrichment for the region. +8) pValue - Measurement of statistical significance (-log10). Use -1 if no pValue is assigned. +9) qValue - Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned. +10) peak summit - Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called. + +# Format of h5s + +The h5py object consists of two keys: `coords`, `predictions` + +Each `coords` object has three keys - `coords_chrom`, `coords_start_dset`, `coords_end_dset` +(a) The `coords_chrom` has an array of length N (N is the number of regions) containing chromosome names +(b) The `coords_start_dset` has an array of length N containing chromosome start coordinates. The first base in a chromosome is numbered 0. Follows GRCh38 assembly coordinates. +(c) The `coords_end_dset` has an array of length N containing chromosome end coordinates. The `coords_end_dset` is a 1-based coordinate. 
+ +Each `predictions` object has two keys - `logcounts`, `logits` +(a) The `logcounts` is an array of shape Nx1 with logcount (log base e) predictions +(b) The `logits` is an array of shape Nx1000, which represents the logits of the base resolution predicted probability profiles over 1000 bp for each of the N profiles + +The `predictions` align with regions specified in the bed file, centered at the summit and expanded by 500 base pairs (bp) +on each side. The `coords` object should contain the corresponding coordinates for each prediction, and the difference +between `coords_end_dset` and `coords_start_dset` should equal 1000. + +# Obtaining average h5s and then prediction bigwigs from individual folds + +To create the `fold_mean.encid.h5` file from individual h5 files, we start by averaging the logcounts and logits across various folds. +Next, we utilize a softmax operation on the averaged logits to transition them into probability profiles. In parallel, we exponentiate +the logcounts to convert them into counts. Multiplying the counts with the derived probability profiles, we generate base-resolution +predictions, which are subsequently recorded into both h5 and bigWig files. + +# Pseudocode for loading h5s + +``` +import h5py +data = h5py.File(predictions_h5, "r") +logcounts_preds = data['predictions']['logcounts'] +logit_preds = data['predictions']['logits'] +chrom_coords = data['coords']['coords_chrom'] +start_coords = data['coords']['coords_start_dset'] +end_coords = data['coords']['coords_end_dset'] +``` diff --git a/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/READMEs/predicted.README b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/READMEs/predicted.README new file mode 100644 index 00000000..847959b2 --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/READMEs/predicted.README @@ -0,0 +1,71 @@ +# Directory Structure Format +. 
+├── input_regions.pred.chrombpnet.encid.bed.gz # Input bed regions to obtain prediction h5s from chrombpnet.h5 model for each fold +├── pred.chrombpnet.fold_mean.encid.h5 # Average of prediction h5s from chrombpnet.h5 model across all folds (input format discussed below) +├── logs.pred.chrombpnet.fold_mean.encid # Directory containing log files +├── fold_0 +│ ├── pred.chrombpnet.fold_0.encid.h5 # prediction h5s for fold_0 from chrombpnet.h5 model (input format discussed below) +│ └── logs.pred.chrombpnet.fold_0.encid # Directory containing log files +│ +├── fold_1 +│ └── ... # similar directory structure as fold_0 directory above +│ +├── fold_2 +│ └── ... # similar directory structure as fold_0 directory above +│ +├── fold_3 +│ └── ... # similar directory structure as fold_0 directory above +│ +└── fold_4 + └── ... # similar directory structure as fold_0 directory above + +# Format of bed file + +* All the bed files are in narrowPeak format with 10 columns and follow GRCh38 assembly coordinates. + +1) chrom - Name of the chromosome (or contig, scaffold, etc.). +2) chromStart - The starting position of the feature in the chromosome or scaffold. The first base in a chromosome is numbered 0. +3) chromEnd - The ending position of the feature in the chromosome or scaffold. The chromEnd is a 1-based coordinate. For example, the first 100 bases of a chromosome are defined as chromStart=0, chromEnd=100, and span the bases 1 to 100. +4) name - Name given to a region (preferably unique). Use "." if no name is assigned. +5) score - Indicates how dark the peak will be displayed in the browser (0-1000). If all scores were "0" when the data were submitted to the DCC, the DCC assigned scores 1-1000 based on signal value. Ideally the average signalValue per base spread is between 100-1000. +6) strand - +/- to denote strand or orientation (whenever applicable). Use "." if no orientation is assigned. 
+7) signalValue - Measurement of overall (usually, average) enrichment for the region. +8) pValue - Measurement of statistical significance (-log10). Use -1 if no pValue is assigned. +9) qValue - Measurement of statistical significance using false discovery rate (-log10). Use -1 if no qValue is assigned. +10) peak summit - Point-source called for this peak; 0-based offset from chromStart. Use -1 if no point-source called. + +# Format of h5s + +The h5py object consists of two keys: `coords`, `predictions` + +Each `coords` object has three keys - `coords_chrom`, `coords_start_dset`, `coords_end_dset` +(a) The `coords_chrom` has an array of length N (N is the number of regions) containing chromosome names +(b) The `coords_start_dset` has an array of length N containing chromosome start coordinates. The first base in a chromosome is numbered 0. Follows GRCh38 assembly coordinates. +(c) The `coords_end_dset` has an array of length N containing chromosome end coordinates. The `coords_end_dset` is a 1-based coordinate. + +Each `predictions` object has two keys - `logcounts`, `logits` +(a) The `logcounts` is again an array of shape Nx1 with logcount (log base e) predictions +(b) The `logits` is an array of shape Nx1000, which represents the logits of the base resolution predicted probability profiles over 1000 bp for each of the N profiles + +The `predictions` align with regions specified in the bed file, centered at the summit and expanded by 500 base pairs (bp) +on each side. The 'coords' object should contain the corresponding coordinates for each prediction, and the difference +between 'coords_end_dset' and 'coords_start_dset' should equal 1000. + +# Obtaining average h5s and then prediction bigwigs from individual folds + +To create the `fold_mean.encid.h5` file from individual h5 files, we start by averaging the logcounts and logits across various folds. +Next, we utilize a softmax operation on the averaged logits to transition them into probability profiles. 
In parallel, we exponentiate +the logcounts to convert them into counts. Multiplying the counts with the derived probability profiles, we generate base-resolution +predictions, which are subsequently recorded into both h5 and bigWig files. + +# Pseudocode for loading h5s + +``` +import h5py +data = h5py.File(predictions_h5, "r") +logcounts_preds = data['predictions']['logcounts'] +logit_preds = data['predictions']['logits'] +chrom_coords = data['coords']['coords_chrom'] +start_coords = data['coords']['coords_start_dset'] +end_coords = data['coords']['coords_end_dset'] +``` diff --git a/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/atac_prepare.py b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/atac_prepare.py new file mode 100644 index 00000000..d8014055 --- /dev/null +++ b/upload_jsons/upload_jsons_scripts/predction_uploads/chrombpnet/atac_prepare.py @@ -0,0 +1,32 @@ +import os +import json + + +encids = ["K562", "HEPG2", "IMR90", "H1ESC", "GM12878"] + +encode_id = {"K562": "ENCSR868FGK", +"GM12878": "ENCSR637XSC", +"HEPG2": "ENCSR291GJU", +"IMR90": "ENCSR200OML", +"H1ESC": "GSE267154"} + +odir='atac/' +for encid in encids: + ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/preds_upload/average_preds/"+encid+".mean_preds_wo_bias.stat" + if os.path.isfile(ofile): + print(encid) + wbias = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/preds_upload/average_preds/"+encid+".mean_preds_w_bias.bw" + nobias = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+encid+"/preds_upload/average_preds/"+encid+".mean_preds_wo_bias.bw" + + assert(os.path.isfile(wbias)==True) + assert(os.path.isfile(nobias)==True) + + output_json = {} + output_json["experiment"] = encode_id[encid] + output_json["predicted signal profile bigWig"] = wbias + output_json["bias-corrected predicted signal profile bigWig"] = nobias + + if not 
def fetch_per_fold_preds(odir,model_path, encid, i, name):
    """Gather the without-bias prediction h5 and any log files for one ATAC fold.

    Args:
        odir: Ignored. The fold directory is always rebuilt from ``name`` and
            ``i``, so whatever the caller passes is overwritten.
        model_path: "/"-separated model path; only its last component is used
            to locate the model's prediction logs.
        encid: Identifier embedded in every archive member name.
        i: Fold index.
        name: Sample name used in the on-disk directory layout.

    Returns:
        A ``(data_paths, log_paths, log_paths_opt)`` tuple of lists holding
        ``(source path, archive member name)`` pairs.  ``data_paths`` always
        has exactly one entry (the h5 is not checked for existence),
        ``log_paths`` lists only the logs that exist on disk, and
        ``log_paths_opt`` is always empty.
    """
    sample_root = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/" + name + "/"
    odir = sample_root + "preds_upload/fold_" + str(i) + "/"
    model_root = sample_root + model_path.split("/")[-1] + "/chrombpnet_model/"
    fold_tag = "fold_" + str(i) + "." + encid

    data_paths = [
        (
            os.path.join(odir, name + "_wo_bias_all_predictions.h5"),
            "pred.chrombpnet_nobias." + fold_tag + ".h5",
        )
    ]

    # Candidate logs, in the order they must appear in the result.
    candidates = [
        (os.path.join(odir, "pred.counts.log.e"), ".stderr.txt"),
        (os.path.join(odir, "pred.counts.log.o"), ".stdout.txt"),
        (model_root + "preds_atac/pred.counts.log.o", ".stdout_v1.txt"),
        (model_root + "preds_atac/pred.counts.log.e", ".stderr_v1.txt"),
        (model_root + "preds_dnase/pred.counts.log.o", ".stdout_v2.txt"),
        (model_root + "preds_dnase/pred.counts.log.e", ".stderr_v2.txt"),
    ]
    log_paths = [
        (src, "logs.pred.chrombpnet_nobias." + fold_tag + suffix)
        for src, suffix in candidates
        if os.path.isfile(src)
    ]

    log_paths_opt = []
    return data_paths, log_paths, log_paths_opt
pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/DNASE/"+encode_id_dnase[name]+"/preprocessing/downloads/peaks.bed.gz", sep='\t', header=None) + else: + bed1 = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/peaks_no_blacklist.bed", sep='\t', header=None) + + + bed2 = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/peaks_no_blacklist.bed", sep='\t', header=None) + + print(bed1.shape) + print(bed2.shape) + bedf = pd.concat([bed1, bed2]) + print(bedf.shape) + + input_bed = os.path.join(odir, "input.regions.bed.gz") + if os.path.isfile(input_bed): + args_json["bias-corrected predicted signal profile tar"]["file.paths"].append((input_bed,"input_regions.pred.chrombpnet_nobias."+encid+".bed.gz")) + else: + bedf.to_csv(input_bed, sep='\t', header=False, index=False, compression='gzip') + + + input_log = os.path.join(odir, "merge.preds.log.e") + if os.path.isfile(input_log): + args_json["bias-corrected predicted signal profile tar"]["logs.pred.chrombpnet_nobias.fold_mean."+encid]["file.paths"].append((input_log, "logs.pred.chrombpnet_nobias.fold_mean."+encid+".stderr.txt")) + + input_log = os.path.join(odir, "merge.preds.log.o") + if os.path.isfile(input_log): + args_json["bias-corrected predicted signal profile tar"]["logs.pred.chrombpnet_nobias.fold_mean."+encid]["file.paths"].append((input_log, "logs.pred.chrombpnet_nobias.fold_mean."+encid+".stdout.txt")) + + + for i in range(5): + data_paths, log_paths, log_paths_opt = fetch_per_fold_preds(odir,model_paths[i], encid, i, name) + + if data_paths is None: + success = False + return success, args_json + + args_json["bias-corrected predicted signal profile tar"]["fold_"+str(i)] = {} + args_json["bias-corrected predicted signal profile tar"]["fold_"+str(i)]["file.paths"] = data_paths + args_json["bias-corrected predicted signal profile 
def fetch_per_fold_preds(odir,model_path, encid, i, name):
    """Gather the with-bias prediction h5 and any log files for one ATAC fold.

    Args:
        odir: Ignored. The fold directory is always rebuilt from ``name`` and
            ``i``, so whatever the caller passes is overwritten.
        model_path: "/"-separated model path; only its last component is used
            to locate the model's prediction logs.
        encid: Identifier embedded in every archive member name.
        i: Fold index.
        name: Sample name used in the on-disk directory layout.

    Returns:
        A ``(data_paths, log_paths, log_paths_opt)`` tuple of lists holding
        ``(source path, archive member name)`` pairs.  ``data_paths`` always
        has exactly one entry (the h5 is not checked for existence),
        ``log_paths`` lists only the logs that exist on disk, and
        ``log_paths_opt`` is always empty.
    """
    # NOTE(review): the source h5 is the *_w_bias_* file, yet every archive
    # name below says "chrombpnet_nobias". Looks copied from the without-bias
    # script; kept as-is here -- confirm against the uploader before renaming.
    sample_root = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/" + name + "/"
    odir = sample_root + "preds_upload/fold_" + str(i) + "/"
    model_root = sample_root + model_path.split("/")[-1] + "/chrombpnet_model/"
    member_prefix = "chrombpnet_nobias.fold_" + str(i) + "." + encid

    h5_source = os.path.join(odir, name + "_w_bias_all_predictions.h5")
    data_paths = [(h5_source, "pred." + member_prefix + ".h5")]

    log_paths = []
    # (source, archive suffix) pairs, probed in the order results are reported.
    for source, suffix in (
        (os.path.join(odir, "pred.counts.log.e"), ".stderr.txt"),
        (os.path.join(odir, "pred.counts.log.o"), ".stdout.txt"),
        (model_root + "preds_atac/pred.counts.log.o", ".stdout_v1.txt"),
        (model_root + "preds_atac/pred.counts.log.e", ".stderr_v1.txt"),
        (model_root + "preds_dnase/pred.counts.log.o", ".stdout_v2.txt"),
        (model_root + "preds_dnase/pred.counts.log.e", ".stderr_v2.txt"),
    ):
        if not os.path.isfile(source):
            continue
        log_paths.append((source, "logs.pred." + member_prefix + suffix))

    return data_paths, log_paths, []
def _write_input_regions(input_bed, name):
    """Build the input-regions bed for *name* and write it gzipped to *input_bed*.

    The DNASE peak table is written first, followed by the ATAC peak table,
    with no header and no index column.
    """
    atlas = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/"
    if name in ["IMR90", "GM12878", "H1ESC"]:
        # These samples take their DNASE peaks from the per-experiment download.
        dnase_peaks = atlas + "DNASE/" + encode_id_dnase[name] + "/preprocessing/downloads/peaks.bed.gz"
    else:
        dnase_peaks = atlas + "chrombpnet/folds/DNASE/" + name + "/peaks_no_blacklist.bed"
    atac_peaks = atlas + "chrombpnet/folds/ATAC/" + name + "/peaks_no_blacklist.bed"

    bed1 = pd.read_csv(dnase_peaks, sep='\t', header=None)
    bed2 = pd.read_csv(atac_peaks, sep='\t', header=None)
    print(bed1.shape)
    print(bed2.shape)
    bedf = pd.concat([bed1, bed2])
    print(bedf.shape)
    bedf.to_csv(input_bed, sep='\t', header=False, index=False, compression='gzip')


def fetch_pred_tar(encid, args_json, model_paths, name):
    """Fill *args_json* with the file manifest of the with-bias prediction tar.

    Args:
        encid: Identifier embedded in every archive member name.
        args_json: Dict to populate; it is mutated in place and also returned.
        model_paths: Indexable of at least five model paths, one per fold.
        name: Sample name used in the on-disk directory layout.

    Returns:
        ``(success, args_json)``.  ``success`` is False when the fold-averaged
        prediction h5 is missing; the manifest is then only partially filled.

    Raises:
        AssertionError: if the README is missing, or a fold does not yield
            exactly one data file and six log files.
    """
    # NOTE(review): this script packages the *_w_bias_* predictions but reuses
    # the "bias-corrected" key, the "chrombpnet_nobias" member names and the
    # bc README of the without-bias script. Left unchanged because the
    # consumer of these JSONs is not visible here -- confirm before renaming.
    tar_key = "bias-corrected predicted signal profile tar"
    mean_logs_key = "logs.pred.chrombpnet_nobias.fold_mean." + encid

    tar = args_json[tar_key] = {}
    readme_file = "READMEs/bc.predicted.README"
    assert os.path.isfile(readme_file), "missing README: " + readme_file
    tar["file.paths"] = [(readme_file, "README.md")]
    tar[mean_logs_key] = {"file.paths": []}

    odir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/" + name + "/preds_upload/average_preds/"

    input_h5 = os.path.join(odir, name + ".mean_preds_w_bias_predictions.h5")
    if not os.path.isfile(input_h5):
        return False, args_json
    tar["file.paths"].append((input_h5, "pred.chrombpnet_nobias.fold_mean." + encid + ".h5"))

    # Build the regions file only when it is missing (avoids re-reading two
    # peak tables on every run), then register it either way. Previously a
    # freshly written file was left out of the manifest.
    input_bed = os.path.join(odir, "input.regions.bed.gz")
    if not os.path.isfile(input_bed):
        _write_input_regions(input_bed, name)
    tar["file.paths"].append((input_bed, "input_regions.pred.chrombpnet_nobias." + encid + ".bed.gz"))

    for log_name, suffix in (("merge.preds.log.e", ".stderr.txt"), ("merge.preds.log.o", ".stdout.txt")):
        input_log = os.path.join(odir, log_name)
        if os.path.isfile(input_log):
            tar[mean_logs_key]["file.paths"].append((input_log, mean_logs_key + suffix))

    for i in range(5):
        data_paths, log_paths, log_paths_opt = fetch_per_fold_preds(odir, model_paths[i], encid, i, name)

        fold_logs_key = "logs.pred.chrombpnet_nobias.fold_" + str(i) + "." + encid
        tar["fold_" + str(i)] = {
            "file.paths": data_paths,
            fold_logs_key: {"file.paths": log_paths + log_paths_opt},
        }
        assert len(data_paths) == 1
        print(len(log_paths))
        # All six per-fold logs are required for the manifest to be complete.
        assert len(log_paths) == 6

    return True, args_json


if __name__ == "__main__":
    model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/model_dir_atac.csv",sep=",", header=None)

    for name in names:
        # The .stat file gates the sample: without it nothing is emitted.
        ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name+"/preds_upload/average_preds/"+name+".mean_preds_w_bias.stat"
        if not os.path.isfile(ofile):
            continue

        encid = encode_id[name]
        args_json = {"experiment": encid}
        model_paths = model_atac[model_atac[1]==name][2].values
        print(model_paths)
        success, args_json = fetch_pred_tar(encid, args_json, model_paths, name)
        if not success:
            print("ERR preds tar")
            continue

        # Check the path that is actually written. The old check looked for
        # "<encid>_w_bias.json", which this script never creates, so existing
        # manifests were silently overwritten on every run.
        out_path = outdir + encid + ".json"
        if not os.path.isfile(out_path):
            with open(out_path, "w") as f:
                json.dump(args_json, f, indent=4)
# Samples to process, in order.
encids = ["K562", "HEPG2", "IMR90_new", "H1ESC_new", "GM12878_new"]

# Sample name -> identifier used as the "experiment" value and the JSON file stem.
encode_id = {
    "HEPG2": "ENCSR149XIL",
    "K562": "ENCSR000EOT",
    "IMR90_new": "ENCSR477RTP",
    "GM12878_new": "ENCSR000EMT",
    "H1ESC_new": "ENCSR000EMU",
}
odir = 'dnase/'

for encid in encids:
    preds_prefix = (
        "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"
        + encid + "/preds_upload/average_preds_with_ccre_vf/" + encid
    )

    # The .stat file gates the sample: without it nothing is emitted.
    ofile = preds_prefix + ".mean_preds_wo_bias.stat"
    if not os.path.isfile(ofile):
        continue
    print(encid)

    wbias = preds_prefix + ".mean_preds_w_bias.bw"
    nobias = preds_prefix + ".mean_preds_wo_bias.bw"
    # Both bigWigs must be present once the .stat file exists.
    assert os.path.isfile(wbias)
    assert os.path.isfile(nobias)

    output_json = {
        "experiment": encode_id[encid],
        "predicted signal profile bigWig": wbias,
        "bias-corrected predicted signal profile bigWig": nobias,
    }

    # An existing JSON is left untouched.
    json_path = odir + encode_id[encid] + ".json"
    if os.path.isfile(json_path):
        continue
    with open(json_path, "w") as f:
        json.dump(output_json, f, indent=4)
def fetch_per_fold_preds(odir,model_path, encid, i, name):
    """Gather the without-bias prediction h5 and any log files for one DNASE fold.

    Args:
        odir: Ignored. The fold directory is always rebuilt from ``name`` and
            ``i``, so whatever the caller passes is overwritten.
        model_path: "/"-separated model path; its second-to-last component is
            used to locate the model's prediction logs.
        encid: Identifier embedded in every archive member name.
        i: Fold index.
        name: Sample name used in the on-disk directory layout.

    Returns:
        A ``(data_paths, log_paths, log_paths_opt)`` tuple of lists holding
        ``(source path, archive member name)`` pairs.  ``data_paths`` always
        has exactly one entry (the h5 is not checked for existence),
        ``log_paths`` lists only the logs that exist on disk, and
        ``log_paths_opt`` is always empty.
    """
    sample_root = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/" + name + "/"
    odir = sample_root + "preds_upload/fold_" + str(i) + "/"
    fold_tag = "fold_" + str(i) + "." + encid
    log_paths = []

    def keep_if_present(source, suffix):
        # Record a log only when it exists on disk.
        if os.path.isfile(source):
            log_paths.append((source, "logs.pred.chrombpnet_nobias." + fold_tag + suffix))

    data_paths = [
        (
            os.path.join(odir, name + "_wo_bias_all_predictions.h5"),
            "pred.chrombpnet_nobias." + fold_tag + ".h5",
        )
    ]

    upload_stderr = os.path.join(odir, "pred.counts.log.e")
    print(upload_stderr)
    keep_if_present(upload_stderr, ".stderr.txt")
    keep_if_present(os.path.join(odir, "pred.counts.log.o"), ".stdout.txt")

    model_root = sample_root + model_path.split("/")[-2] + "/chrombpnet_model/"
    atac_stdout = model_root + "preds_atac/pred.counts.log.o"
    keep_if_present(atac_stdout, ".stdout_v1.txt")
    print(atac_stdout)
    keep_if_present(model_root + "preds_atac/pred.counts.log.e", ".stderr_v1.txt")
    keep_if_present(model_root + "preds_dnase/pred.counts.log.o", ".stdout_v2.txt")
    keep_if_present(model_root + "preds_dnase/pred.counts.log.e", ".stderr_v2.txt")

    return data_paths, log_paths, []
pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/DNASE/"+encode_id_dnase[name]+"/preprocessing/downloads/peaks.bed.gz", sep='\t', header=None) + else: + bed1 = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/peaks_no_blacklist.bed", sep='\t', header=None) + + + bed2 = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+name.replace("_new","")+"/peaks_no_blacklist.bed", sep='\t', header=None) + + print(bed1.shape) + print(bed2.shape) + bedf = pd.concat([bed1, bed2]) + print(bedf.shape) + + input_bed = os.path.join(odir, "input.regions.bed.gz") + if os.path.isfile(input_bed): + args_json["bias-corrected predicted signal profile tar"]["file.paths"].append((input_bed,"input_regions.pred.chrombpnet_nobias."+encid+".bed.gz")) + else: + bedf.to_csv(input_bed, sep='\t', header=False, index=False, compression='gzip') + + + input_log = os.path.join(odir, "merge.preds.log.e") + if os.path.isfile(input_log): + args_json["bias-corrected predicted signal profile tar"]["logs.pred.chrombpnet_nobias.fold_mean."+encid]["file.paths"].append((input_log, "logs.pred.chrombpnet_nobias.fold_mean."+encid+".stderr.txt")) + + input_log = os.path.join(odir, "merge.preds.log.o") + if os.path.isfile(input_log): + args_json["bias-corrected predicted signal profile tar"]["logs.pred.chrombpnet_nobias.fold_mean."+encid]["file.paths"].append((input_log, "logs.pred.chrombpnet_nobias.fold_mean."+encid+".stdout.txt")) + + + for i in range(5): + data_paths, log_paths, log_paths_opt = fetch_per_fold_preds(odir,model_paths[i], encid, i, name) + + if data_paths is None: + success = False + return success, args_json + + args_json["bias-corrected predicted signal profile tar"]["fold_"+str(i)] = {} + args_json["bias-corrected predicted signal profile tar"]["fold_"+str(i)]["file.paths"] = data_paths + args_json["bias-corrected predicted signal profile 
# Emit one "<encid>_wo_bias.json" manifest per sample whose .stat file exists.
for name in names:
    ofile = (
        "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"
        + name + "/preds_upload/average_preds/" + name + ".mean_preds_wo_bias.stat"
    )
    if not os.path.isfile(ofile):
        continue

    encid = encode_id[name]
    args_json = {"experiment": encid}
    # The model table is keyed by the sample name without the "_new" suffix.
    model_paths = model_atac[model_atac[1] == name.replace("_new", "")][2].values
    print(model_paths)

    success, args_json = fetch_pred_tar(encid, args_json, model_paths, name)
    if not success:
        print("ERR preds tar")
        continue

    # An existing manifest is left untouched.
    manifest_path = outdir + encid + "_wo_bias.json"
    if os.path.isfile(manifest_path):
        continue
    with open(manifest_path, "w") as f:
        json.dump(args_json, f, indent=4)
def fetch_per_fold_preds(odir,model_path, encid, i, name):
    """Gather the with-bias prediction h5 and any log files for one DNASE fold.

    Args:
        odir: Ignored. The fold directory is always rebuilt from ``name`` and
            ``i``, so whatever the caller passes is overwritten.
        model_path: "/"-separated model path; its second-to-last component is
            used to locate the model's prediction logs.
        encid: Identifier embedded in every archive member name.
        i: Fold index.
        name: Sample name used in the on-disk directory layout.

    Returns:
        A ``(data_paths, log_paths, log_paths_opt)`` tuple of lists holding
        ``(source path, archive member name)`` pairs.  ``data_paths`` always
        has exactly one entry (the h5 is not checked for existence),
        ``log_paths`` lists only the logs that exist on disk, and
        ``log_paths_opt`` is always empty.
    """
    # NOTE(review): the source h5 is the *_w_bias_* file, yet every archive
    # name below says "chrombpnet_nobias". Looks copied from the without-bias
    # script; kept as-is here -- confirm against the uploader before renaming.
    sample_root = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/" + name + "/"
    odir = sample_root + "preds_upload/fold_" + str(i) + "/"
    member_prefix = "chrombpnet_nobias.fold_" + str(i) + "." + encid

    data_paths = [
        (os.path.join(odir, name + "_w_bias_all_predictions.h5"), "pred." + member_prefix + ".h5")
    ]
    log_paths = []

    def record(source, suffix):
        # Record a log only when it exists on disk.
        if os.path.isfile(source):
            log_paths.append((source, "logs.pred." + member_prefix + suffix))

    # Logs written next to the uploaded fold predictions.
    fold_stderr = os.path.join(odir, "pred.counts.log.e")
    print(fold_stderr)
    record(fold_stderr, ".stderr.txt")
    record(os.path.join(odir, "pred.counts.log.o"), ".stdout.txt")

    # Logs written under the model directory.
    model_root = sample_root + model_path.split("/")[-2] + "/chrombpnet_model/"
    model_atac_stdout = model_root + "preds_atac/pred.counts.log.o"
    record(model_atac_stdout, ".stdout_v1.txt")
    print(model_atac_stdout)
    record(model_root + "preds_atac/pred.counts.log.e", ".stderr_v1.txt")
    record(model_root + "preds_dnase/pred.counts.log.o", ".stdout_v2.txt")
    record(model_root + "preds_dnase/pred.counts.log.e", ".stderr_v2.txt")

    log_paths_opt = []
    return data_paths, log_paths, log_paths_opt
if os.path.isfile(input_bed): + args_json["bias-corrected predicted signal profile tar"]["file.paths"].append((input_bed,"input_regions.pred.chrombpnet_nobias."+encid+".bed.gz")) + else: + bedf.to_csv(input_bed, sep='\t', header=False, index=False, compression='gzip') + + + input_log = os.path.join(odir, "merge.preds.log.e") + if os.path.isfile(input_log): + args_json["bias-corrected predicted signal profile tar"]["logs.pred.chrombpnet_nobias.fold_mean."+encid]["file.paths"].append((input_log, "logs.pred.chrombpnet_nobias.fold_mean."+encid+".stderr.txt")) + + input_log = os.path.join(odir, "merge.preds.log.o") + if os.path.isfile(input_log): + args_json["bias-corrected predicted signal profile tar"]["logs.pred.chrombpnet_nobias.fold_mean."+encid]["file.paths"].append((input_log, "logs.pred.chrombpnet_nobias.fold_mean."+encid+".stdout.txt")) + + + for i in range(5): + data_paths, log_paths, log_paths_opt = fetch_per_fold_preds(odir,model_paths[i], encid, i, name) + + if data_paths is None: + success = False + return success, args_json + + args_json["bias-corrected predicted signal profile tar"]["fold_"+str(i)] = {} + args_json["bias-corrected predicted signal profile tar"]["fold_"+str(i)]["file.paths"] = data_paths + args_json["bias-corrected predicted signal profile tar"]["fold_"+str(i)]["logs.pred.chrombpnet_nobias.fold_"+str(i)+"."+encid] = {"file.paths": log_paths+log_paths_opt} + assert(len(data_paths) == 1) + print(len(log_paths)) + assert(len(log_paths) == 6) + + success=True + return success, args_json + + +for name in names: + ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/preds_upload/average_preds/"+name+".mean_preds_w_bias.stat" + if os.path.isfile(ofile): + args_json = {} + encid=encode_id[name] + args_json["experiment"] = encid + model_paths = model_atac[model_atac[1]==name.replace("_new","")][2].values + print(model_paths) + success, args_json = fetch_pred_tar(encid, args_json, model_paths, name) + 
def fetch_per_fold_profile(odir, model_path, encid, i, name):
    """Collect the profile-attribution h5 and interpretation logs for one ATAC fold.

    Args:
        odir: Ignored. Kept so existing callers keep working; the fold
            directory is rebuilt from ``name`` and ``i``.
        model_path: Original model directory. It is used as given for the
            ``merged.*`` logs, and its last "/"-separated component names the
            run directory under ``folds/ATAC/<name>/`` for the other logs.
        encid: Accession string embedded in every archive name.
        i: Fold index.
        name: Sample directory name under ``folds/ATAC``.

    Returns:
        ``(data_paths, log_paths, log_paths_opt)``, each a list of
        ``(source_path, archive_name)`` tuples. ``data_paths`` always has one
        entry, ``log_paths`` holds only logs found on disk, and
        ``log_paths_opt`` is always empty.
    """
    sample_dir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/" + name + "/"
    model_path_orig = model_path
    model_path = sample_dir + model_path.split("/")[-1] + "/chrombpnet_model"
    # The caller-supplied odir is deliberately overridden, as in the original.
    odir = sample_dir + "interpret_upload/fold_" + str(i) + "/"
    tag = "fold_" + str(i) + "." + encid

    data_paths = [
        (os.path.join(odir, name + "_profile_attribs_reformatted.h5"),
         "seq_contrib.profile." + tag + ".h5"),
    ]

    # (source path, region label, archive extension), in archive order.
    # NOTE(review): the "merged.*" and "full_*" entries share archive names,
    # so if both exist the list carries duplicate names — confirm that only
    # one of the two layouts is ever present for a given fold.
    candidates = [
        # logs from the interpret_dnase run
        (model_path + "/interpret_dnase/full_" + name + ".interpret.args.json", "dnase_regions", "args.json"),
        (model_path + "/interpret_dnase/full_" + name + ".interpet.log", "dnase_regions", "log"),
        (model_path + "/interpret_dnase/ATAC_peaks_full.profile.interpret.log.e", "dnase_regions", "stderr.txt"),
        (model_path + "/interpret_dnase/ATAC_peaks_full.profile.interpret.log.o", "dnase_regions", "stdout.txt"),
        # logs kept next to the original model directory
        (model_path_orig + "/interpret/merged." + name + ".interpret.args.json", "atac_regions", "args.json"),
        (model_path_orig + "/interpret/merged." + name + ".interpet.log", "atac_regions", "log"),
        # logs from the interpret run under the re-rooted model directory
        (model_path + "/interpret/full_" + name + ".interpret.args.json", "atac_regions", "args.json"),
        (model_path + "/interpret/full_" + name + ".interpet.log", "atac_regions", "log"),
        (model_path + "/interpret/full.profile.interpret.log.e", "atac_regions", "stderr.txt"),
        (model_path + "/interpret/full.profile.interpret.log.o", "atac_regions", "stdout.txt"),
    ]

    log_paths = []
    for input_log, regions, ext in candidates:
        if os.path.isfile(input_log):
            log_paths.append((input_log, "logs.seq_contrib.profile." + regions + "." + tag + "." + ext))
        else:
            # Print the missing path so a failed count assertion in the
            # caller can be traced to a concrete file.
            print(input_log)

    log_paths_opt = []
    return data_paths, log_paths, log_paths_opt
def fetch_profile_tar(encid, args_json, model_paths, name):
    """Fill ``args_json`` with the manifest of the ATAC profile-contribution tar.

    Args:
        encid: Accession string embedded in every archive name.
        args_json: Dict to extend in place; the section
            ``"profile sequence contribution scores tar"`` is (re)created.
        model_paths: Indexable of per-fold model paths; entry 0 locates the
            per-fold interpreted-regions bed and entries 0..4 are passed to
            ``fetch_per_fold_profile``.
        name: Sample directory name under ``folds/ATAC``.

    Returns:
        ``(success, args_json)``. ``success`` is False when the fold-mean h5
        or the modisco-input h5 is missing.

    Raises:
        AssertionError: if the README is missing, the fold-mean log list does
            not end up with exactly four entries, or a fold does not yield
            one data file and at least four logs.
    """
    section = "profile sequence contribution scores tar"
    mean_logs_key = "logs.seq_contrib.profile." + encid
    sample_dir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/" + name + "/"
    odir = sample_dir + "interpret_upload/average_preds/"
    merge_dir = sample_dir + "merge_folds_new_may_05_24/"

    readme_file = "READMES/profile.deepshap.README"
    assert(os.path.isfile(readme_file))
    args_json[section] = {
        "file.paths": [(readme_file, "README.md")],
        mean_logs_key: {"file.paths": []},
    }
    tar_files = args_json[section]["file.paths"]
    mean_logs = args_json[section][mean_logs_key]["file.paths"]

    # Both h5 files are mandatory; give up on the first one that is missing.
    required = [
        (os.path.join(odir, name + "_profile_attribs_reformatted.h5"),
         "seq_contrib.profile.fold_mean." + encid + ".h5"),
        (merge_dir + "in_peaks.profile_scores_new_compressed.h5",
         "seq_contrib.profile.fold_mean.modisco_input." + encid + ".h5"),
    ]
    for src, arcname in required:
        if not os.path.isfile(src):
            return False, args_json
        tar_files.append((src, arcname))

    # (source bed, gzipped copy to upload, archive name, extra read_csv args)
    region_specs = [
        (model_paths[0] + "/chrombpnet_model/interpret_all/full_" + name + ".interpreted_regions_profile.bed",
         odir + "per_folds.inputs.bed.gz",
         "logs.seq_contrib.profile.input_regions.per_fold." + encid + ".bed.gz",
         {"compression": "gzip"}),
        (merge_dir + "in_peaks.profile.interpreted_regions.bed",
         odir + "modisco.inputs.bed.gz",
         "logs.seq_contrib.profile.input_regions." + encid + ".bed.gz",
         {}),
    ]
    for input_file, newf, arcname, read_kwargs in region_specs:
        # Check before reading: the original read the file first, so a
        # missing source raised before its own existence guard could run.
        if not os.path.isfile(input_file):
            print(input_file)
            continue
        if not os.path.isfile(newf):
            input_bed = pd.read_csv(input_file, sep='\t', header=None, **read_kwargs)
            input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip')
        mean_logs.append((newf, arcname))

    # stderr comes from reformat.log.e and stdout from reformat.log.o; the
    # original registered the ".e" file under both names.
    # NOTE(review): the archive names have no "." between "reformat" and the
    # accession; left as-is in case uploads already use this spelling.
    for log_name, stream in (("reformat.log.e", "stderr"), ("reformat.log.o", "stdout")):
        input_log = os.path.join(odir, log_name)
        if os.path.isfile(input_log):
            mean_logs.append((input_log, "logs.seq_contrib.profile.fold_mean.reformat" + encid + "." + stream + ".txt"))

    assert(len(mean_logs) == 4)

    for i in range(5):
        data_paths, log_paths, log_paths_opt = fetch_per_fold_profile(odir, model_paths[i], encid, i, name)

        # Defensive: kept from the original in case the helper signals
        # failure with None.
        if data_paths is None:
            return False, args_json

        fold_tag = "fold_" + str(i)
        args_json[section][fold_tag] = {
            "file.paths": data_paths,
            "logs.seq_contrib.profile." + fold_tag + "." + encid: {"file.paths": log_paths + log_paths_opt},
        }
        assert(len(data_paths) == 1)
        print(len(log_paths))
        assert(len(log_paths) >= 4)

    return True, args_json
def fetch_per_fold_profile(odir, model_path, encid, i, name):
    """Collect the profile-attribution h5 and interpretation logs for one ATAC fold.

    Variant used for the K562 / HEPG2 script: only the ``interpret`` logs are
    looked up, all labelled ``atacs_regs`` in the archive.

    Args:
        odir: Ignored. Kept so existing callers keep working; the fold
            directory is rebuilt from ``name`` and ``i``.
        model_path: Original model directory. It is used as given for the
            ``merged.*`` logs, and its last "/"-separated component names the
            run directory under ``folds/ATAC/<name>/`` for the ``full*`` logs.
        encid: Accession string embedded in every archive name.
        i: Fold index.
        name: Sample directory name under ``folds/ATAC``.

    Returns:
        ``(data_paths, log_paths, log_paths_opt)``, each a list of
        ``(source_path, archive_name)`` tuples. ``data_paths`` always has one
        entry, ``log_paths`` holds only logs found on disk, and
        ``log_paths_opt`` is always empty.
    """
    sample_dir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/" + name + "/"
    model_path_orig = model_path
    model_path = sample_dir + model_path.split("/")[-1] + "/chrombpnet_model"
    # The caller-supplied odir is deliberately overridden, as in the original.
    odir = sample_dir + "interpret_upload/fold_" + str(i) + "/"
    tag = "fold_" + str(i) + "." + encid

    data_paths = [
        (os.path.join(odir, name + "_profile_attribs_reformatted.h5"),
         "seq_contrib.profile." + tag + ".h5"),
    ]

    # (source path, archive extension), in archive order.
    # NOTE(review): the "merged.*" and "full_*" args/log entries map to the
    # same archive names; if both layouts exist the list carries duplicates.
    candidates = [
        # logs kept next to the original model directory
        (model_path_orig + "/interpret/merged." + name + ".interpret.args.json", "args.json"),
        (model_path_orig + "/interpret/merged." + name + ".interpet.log", "log"),
        # logs under the re-rooted model directory
        (model_path + "/interpret/full_" + name + ".interpret.args.json", "args.json"),
        (model_path + "/interpret/full_" + name + ".interpet.log", "log"),
        (model_path + "/interpret/full.profile.interpret.log.e", "stderr.txt"),
        (model_path + "/interpret/full.profile.interpret.log.o", "stdout.txt"),
    ]

    log_paths = []
    for input_log, ext in candidates:
        if os.path.isfile(input_log):
            log_paths.append((input_log, "logs.seq_contrib.profile.atacs_regs." + tag + "." + ext))
        else:
            # Print the missing path so the operator can see what was skipped.
            print(input_log)

    log_paths_opt = []
    return data_paths, log_paths, log_paths_opt
def fetch_profile_tar(encid, args_json, model_paths, name):
    """Fill ``args_json`` with the manifest of the ATAC profile-contribution tar.

    Variant used for the K562 / HEPG2 script.

    Args:
        encid: Accession string embedded in every archive name.
        args_json: Dict to extend in place; the section
            ``"profile sequence contribution scores tar"`` is (re)created.
        model_paths: Indexable of per-fold model paths; entry 1 locates the
            per-fold interpreted-regions bed and entries 0..4 are passed to
            ``fetch_per_fold_profile``.
        name: Sample directory name under ``folds/ATAC``.

    Returns:
        ``(success, args_json)``. ``success`` is False (and the missing path
        is printed) when the fold-mean h5 or the modisco-input h5 is absent.

    Raises:
        AssertionError: if the README is missing, the fold-mean log list does
            not end up with exactly four entries, or a fold does not yield
            one data file and at least one log.
    """
    section = "profile sequence contribution scores tar"
    mean_logs_key = "logs.seq_contrib.profile." + encid
    sample_dir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/" + name + "/"
    odir = sample_dir + "interpret_upload/average_preds/"
    merge_dir = sample_dir + "merge_folds_new_may_05_24/"

    readme_file = "READMES/profile.deepshap.README"
    assert(os.path.isfile(readme_file))
    args_json[section] = {
        "file.paths": [(readme_file, "README.md")],
        mean_logs_key: {"file.paths": []},
    }
    tar_files = args_json[section]["file.paths"]
    mean_logs = args_json[section][mean_logs_key]["file.paths"]

    # Both h5 files are mandatory; report and give up on the first miss.
    required = [
        (os.path.join(odir, name + "_profile_attribs_reformatted.h5"),
         "seq_contrib.profile.fold_mean." + encid + ".h5"),
        (merge_dir + "in_peaks.profile_scores_new_compressed.h5",
         "seq_contrib.profile.fold_mean.modisco_input." + encid + ".h5"),
    ]
    for src, arcname in required:
        if not os.path.isfile(src):
            print(src)
            return False, args_json
        tar_files.append((src, arcname))

    # (source bed, gzipped copy to upload, archive name)
    region_specs = [
        (model_paths[1] + "/chrombpnet_model/interpret/full_" + name + ".interpreted_regions_profile.bed",
         odir + "per_folds.inputs.bed.gz",
         "logs.seq_contrib.profile.input_regions.per_fold." + encid + ".bed.gz"),
        (merge_dir + "in_peaks.profile.interpreted_regions.bed",
         odir + "modisco.inputs.bed.gz",
         "logs.seq_contrib.profile.input_regions." + encid + ".bed.gz"),
    ]
    for input_file, newf, arcname in region_specs:
        # Check before reading: the original read the file first, so a
        # missing source raised before its own existence guard could run.
        if not os.path.isfile(input_file):
            print(input_file)
            continue
        if not os.path.isfile(newf):
            input_bed = pd.read_csv(input_file, sep='\t', header=None)
            input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip')
        mean_logs.append((newf, arcname))

    # stderr comes from reformat.log.e and stdout from reformat.log.o; the
    # original registered the ".e" file under both names.
    # NOTE(review): the archive names have no "." between "reformat" and the
    # accession; left as-is in case uploads already use this spelling.
    for log_name, stream in (("reformat.log.e", "stderr"), ("reformat.log.o", "stdout")):
        input_log = os.path.join(odir, log_name)
        if os.path.isfile(input_log):
            mean_logs.append((input_log, "logs.seq_contrib.profile.fold_mean.reformat" + encid + "." + stream + ".txt"))

    assert(len(mean_logs) == 4)

    for i in range(5):
        data_paths, log_paths, log_paths_opt = fetch_per_fold_profile(odir, model_paths[i], encid, i, name)

        # Defensive: kept from the original in case the helper signals
        # failure with None.
        if data_paths is None:
            return False, args_json

        fold_tag = "fold_" + str(i)
        args_json[section][fold_tag] = {
            "file.paths": data_paths,
            "logs.seq_contrib.profile." + fold_tag + "." + encid: {"file.paths": log_paths + log_paths_opt},
        }
        assert(len(data_paths) == 1)
        print(len(log_paths))
        assert(len(log_paths) >= 1)

    return True, args_json
"ENCSR149XIL", + "K562": "ENCSR000EOT", + "IMR90_new": "ENCSR477RTP", + "GM12878_new": "ENCSR000EMT", + "H1ESC_new": "ENCSR000EMU"} +odir='dnase/' + +model_atac = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/logs/checkpoint/JAN_02_2023/v1/model_dir_dnase_v2_interpret.csv",sep=",", header=None) + +def fetch_per_fold_profile(odir,model_path, encid, i, name): + + data_paths = [] + log_paths = [] + log_paths_opt = [] + + odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/fold_"+str(i)+"/" + input_h5 = os.path.join(odir, name+"_profile_attribs_reformatted.h5") + data_paths.append((input_h5, "seq_contrib.profile.fold_"+str(i)+"."+encid+".h5")) + + #model_path="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/DNASE_SE_04.27.2024//chrombpnet_model" + + # dnase regions logs + + model_path=model_path+"/chrombpnet_model" + input_log=model_path+"/interpret_orig/full_"+name+".interpret.args.json" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.profile.dnase_regions.fold_"+str(i)+"."+encid+".args.json")) + else: + print(input_log) + input_log=model_path+"/interpret_orig/full_"+name+".interpet.log" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.profile.dnase_regions.fold_"+str(i)+"."+encid+".log")) + else: + print(input_log) + + input_log=model_path+"/interpret_orig/ATAC_peaks_full.profile.interpret.log.e" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.profile.dnase_regions.fold_"+str(i)+"."+encid+".stderr.txt")) + else: + print(input_log) + + input_log=model_path+"/interpret_orig/ATAC_peaks_full.profile.interpret.log.o" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.profile.dnase_regions.fold_"+str(i)+"."+encid+".stdout.txt")) + else: + print(input_log) + + # atac regions logs + + 
input_log=model_path+"/interpret/full_"+name+".interpret.args.json" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.profile.atac_regions.fold_"+str(i)+"."+encid+".args.json")) + else: + print(input_log) + + input_log=model_path+"/interpret/full_"+name+".interpet.log" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.profile.atac_regions.fold_"+str(i)+"."+encid+".log")) + else: + print(input_log) + + input_log=model_path+"/interpret/ATAC_peaks_full.profile.interpret.log.e" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.profile.atac_regions.fold_"+str(i)+"."+encid+".stderr.txt")) + else: + print(input_log) + + input_log=model_path+"/interpret/ATAC_peaks_full.profile.interpret.log.o" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.profile.atac_regions.fold_"+str(i)+"."+encid+".stdout.txt")) + else: + print(input_log) + + # ccre regions logs + + input_log=model_path+"/interpret_ccre/full_"+name+".interpret.args.json" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.profile.ccre_regions.fold_"+str(i)+"."+encid+".args.json")) + else: + print(input_log) + + input_log=model_path+"/interpret_ccre/full_"+name+".interpet.log" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.profile.ccre_regions.fold_"+str(i)+"."+encid+".log")) + else: + print(input_log) + + input_log=model_path+"/interpret_ccre/ATAC_peaks_full.profile.interpret.log.e" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.profile.ccre_regions.fold_"+str(i)+"."+encid+".stderr.txt")) + else: + print(input_log) + input_log=model_path+"/interpret_ccre/ATAC_peaks_full.profile.interpret.log.o" + if os.path.isfile(input_log): + log_paths.append((input_log, "logs.seq_contrib.profile.ccre_regions.fold_"+str(i)+"."+encid+".stdout.txt")) + else: + print(input_log) + + return data_paths, log_paths, 
def fetch_profile_tar(encid, args_json, model_paths, name):
    """Fill ``args_json`` with the manifest of the DNASE profile-contribution tar.

    Args:
        encid: Accession string embedded in every archive name.
        args_json: Dict to extend in place; the section
            ``"profile sequence contribution scores tar"`` is (re)created.
        model_paths: Indexable of per-fold model paths; entry 0 locates the
            per-fold interpreted-regions bed and entries 0..4 are passed to
            ``fetch_per_fold_profile``.
        name: Sample directory name under ``folds/DNASE``.

    Returns:
        ``(success, args_json)``. ``success`` is False when the fold-mean h5
        or the modisco-input h5 is missing.

    Raises:
        AssertionError: if the README is missing, the fold-mean log list does
            not end up with exactly four entries, or a fold does not yield
            one data file and exactly twelve logs.
    """
    section = "profile sequence contribution scores tar"
    mean_logs_key = "logs.seq_contrib.profile." + encid
    sample_dir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/" + name + "/"
    odir = sample_dir + "interpret_upload/average_preds/"
    merge_dir = sample_dir + "merge_folds_new_may_05_24/"

    readme_file = "READMES/profile.deepshap.README"
    assert(os.path.isfile(readme_file))
    args_json[section] = {
        "file.paths": [(readme_file, "README.md")],
        mean_logs_key: {"file.paths": []},
    }
    tar_files = args_json[section]["file.paths"]
    mean_logs = args_json[section][mean_logs_key]["file.paths"]

    # Both h5 files are mandatory; give up on the first one that is missing.
    required = [
        (os.path.join(odir, name + "_profile_attribs_reformatted.h5"),
         "seq_contrib.profile.fold_mean." + encid + ".h5"),
        (merge_dir + "in_peaks.profile_scores_new_compressed.h5",
         "seq_contrib.profile.fold_mean.modisco_input." + encid + ".h5"),
    ]
    for src, arcname in required:
        if not os.path.isfile(src):
            return False, args_json
        tar_files.append((src, arcname))

    # (source bed, gzipped copy to upload, archive name); both sources are
    # read with compression='gzip', as in the original.
    region_specs = [
        (model_paths[0] + "/chrombpnet_model/interpret_all_with_ccre/full_" + name + ".interpreted_regions_profile.bed",
         odir + "per_folds.inputs.bed.gz",
         "logs.seq_contrib.profile.input_regions.per_fold." + encid + ".bed.gz"),
        (merge_dir + "in_peaks.profile_scores_new_compressed.bed",
         odir + "modisco.inputs.bed.gz",
         "logs.seq_contrib.profile.input_regions." + encid + ".bed.gz"),
    ]
    for input_file, newf, arcname in region_specs:
        # Check before reading: the original read the file first, so a
        # missing source raised before its own existence guard could run.
        if not os.path.isfile(input_file):
            print(input_file)
            continue
        if not os.path.isfile(newf):
            input_bed = pd.read_csv(input_file, compression='gzip', sep='\t', header=None)
            input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip')
        mean_logs.append((newf, arcname))

    # stderr comes from reformat.log.e and stdout from reformat.log.o; the
    # original registered the ".e" file under both names.
    # NOTE(review): the archive names have no "." between "reformat" and the
    # accession; left as-is in case uploads already use this spelling.
    for log_name, stream in (("reformat.log.e", "stderr"), ("reformat.log.o", "stdout")):
        input_log = os.path.join(odir, log_name)
        if os.path.isfile(input_log):
            mean_logs.append((input_log, "logs.seq_contrib.profile.fold_mean.reformat" + encid + "." + stream + ".txt"))

    assert(len(mean_logs) == 4)

    for i in range(5):
        data_paths, log_paths, log_paths_opt = fetch_per_fold_profile(odir, model_paths[i], encid, i, name)

        # Defensive: kept from the original in case the helper signals
        # failure with None.
        if data_paths is None:
            return False, args_json

        fold_tag = "fold_" + str(i)
        args_json[section][fold_tag] = {
            "file.paths": data_paths,
            "logs.seq_contrib.profile." + fold_tag + "." + encid: {"file.paths": log_paths + log_paths_opt},
        }
        assert(len(data_paths) == 1)
        print(len(log_paths))
        assert(len(log_paths) == 12)

    return True, args_json
def fetch_per_fold_profile(odir, model_path, encid, i, name):
    """Collect the profile-attribution h5 and interpretation logs for one DNASE fold.

    Variant used for the K562 / HEPG2 script: logs from ``interpret_ccre``
    are labelled ``ccre_regions`` and logs from ``interpret`` are labelled
    ``all_regions``.

    Args:
        odir: Ignored. Kept so existing callers keep working; the fold
            directory is rebuilt from ``name`` and ``i``.
        model_path: Original model directory. It is used as given for the
            ``merged.*`` logs, and its last "/"-separated component names the
            run directory under ``folds/DNASE/<name>/`` for the other logs.
        encid: Accession string embedded in every archive name.
        i: Fold index.
        name: Sample directory name under ``folds/DNASE``.

    Returns:
        ``(data_paths, log_paths, log_paths_opt)``, each a list of
        ``(source_path, archive_name)`` tuples. ``data_paths`` always has one
        entry, ``log_paths`` holds only logs found on disk, and
        ``log_paths_opt`` is always empty.
    """
    sample_dir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/" + name + "/"
    model_path_orig = model_path
    model_path = sample_dir + model_path.split("/")[-1] + "/chrombpnet_model"
    # The caller-supplied odir is deliberately overridden, as in the original.
    odir = sample_dir + "interpret_upload/fold_" + str(i) + "/"
    tag = "fold_" + str(i) + "." + encid

    data_paths = [
        (os.path.join(odir, name + "_profile_attribs_reformatted.h5"),
         "seq_contrib.profile." + tag + ".h5"),
    ]

    # (source path, region label, archive extension), in archive order.
    # NOTE(review): the "merged.*" and "full_*" args/log entries map to the
    # same all_regions archive names; if both layouts exist the list carries
    # duplicates.
    candidates = [
        # logs from the interpret_ccre run
        (model_path + "/interpret_ccre/full_" + name + ".interpret.args.json", "ccre_regions", "args.json"),
        (model_path + "/interpret_ccre/full_" + name + ".interpet.log", "ccre_regions", "log"),
        (model_path + "/interpret_ccre/ATAC_peaks_full.profile.interpret.log.e", "ccre_regions", "stderr.txt"),
        # Was labelled "all_regions" in the original, which collided with the
        # stdout entry of the interpret run below.
        (model_path + "/interpret_ccre/ATAC_peaks_full.profile.interpret.log.o", "ccre_regions", "stdout.txt"),
        # logs kept next to the original model directory
        (model_path_orig + "/interpret/merged." + name + ".interpret.args.json", "all_regions", "args.json"),
        (model_path_orig + "/interpret/merged." + name + ".interpet.log", "all_regions", "log"),
        # logs from the interpret run under the re-rooted model directory
        (model_path + "/interpret/full_" + name + ".interpret.args.json", "all_regions", "args.json"),
        (model_path + "/interpret/full_" + name + ".interpet.log", "all_regions", "log"),
        (model_path + "/interpret/full.profile.interpret.log.e", "all_regions", "stderr.txt"),
        (model_path + "/interpret/full.profile.interpret.log.o", "all_regions", "stdout.txt"),
    ]

    log_paths = []
    for input_log, regions, ext in candidates:
        if os.path.isfile(input_log):
            log_paths.append((input_log, "logs.seq_contrib.profile." + regions + "." + tag + "." + ext))
        else:
            # Print the missing path so the operator can see what was skipped.
            print(input_log)

    log_paths_opt = []
    return data_paths, log_paths, log_paths_opt
log_paths, log_paths_opt + +def fetch_profile_tar(encid, args_json, model_paths, name): + success = False + args_json["profile sequence contribution scores tar"] = {} + readme_file = "READMES/profile.deepshap.README" + assert(os.path.isfile(readme_file)) + args_json["profile sequence contribution scores tar"]["file.paths"] = [(readme_file, "README.md")] + args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid] = {"file.paths": []} + + ## full h5 path + + odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/" + + input_h5 = os.path.join(odir, name+"_profile_attribs_reformatted.h5") + if os.path.isfile(input_h5): + args_json["profile sequence contribution scores tar"]["file.paths"].append((input_h5,"seq_contrib.profile.fold_mean."+encid+".h5")) + else: + print(input_h5) + success = False + return success, args_json + + ## modisoc h5 path + + modisco_input = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/merge_folds_new_may_05_24/in_peaks.profile_scores_new_compressed.h5" + if os.path.isfile(modisco_input): + args_json["profile sequence contribution scores tar"]["file.paths"].append((modisco_input,"seq_contrib.profile.fold_mean.modisco_input."+encid+".h5")) + else: + print(modisco_input) + success = False + return success, args_json + + # log files + + + input_file=model_paths[1]+"/chrombpnet_model/interpret_all_with_ccre/full_"+name+".interpreted_regions_profile.bed" + newf="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/per_folds.inputs.bed.gz" + input_bed = pd.read_csv(input_file, compression='gzip', sep='\t', header=None) + if os.path.isfile(input_file): + if not os.path.isfile(newf): + input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip') + args_json["profile sequence contribution scores 
tar"]["logs.seq_contrib.profile."+encid]["file.paths"].append((newf,"logs.seq_contrib.profile.input_regions.per_fold."+encid+".bed.gz")) + + + input_file="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/merge_folds_new_may_05_24/in_peaks.profile.interpreted_regions.bed" + newf="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/modisco.inputs.bed.gz" + input_bed = pd.read_csv(input_file, sep='\t', header=None) + if os.path.isfile(input_file): + if not os.path.isfile(newf): + input_bed.to_csv(newf, sep='\t', header=False, index=False, compression='gzip') + args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid]["file.paths"].append((newf,"logs.seq_contrib.profile.input_regions."+encid+".bed.gz")) + + odir="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+name+"/interpret_upload/average_preds/" + + input_log = os.path.join(odir, "reformat.log.e") + if os.path.isfile(input_log): + args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid]["file.paths"].append((input_log, "logs.seq_contrib.profile.fold_mean.reformat"+encid+".stderr.txt")) + + input_log = os.path.join(odir, "reformat.log.e") + if os.path.isfile(input_log): + args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid]["file.paths"].append((input_log, "logs.seq_contrib.profile.fold_mean.reformat"+encid+".stdout.txt")) + + assert(len(args_json["profile sequence contribution scores tar"]["logs.seq_contrib.profile."+encid]["file.paths"])==4) + + for i in range(5): + data_paths, log_paths, log_paths_opt = fetch_per_fold_profile(odir,model_paths[i], encid, i, name) + + if data_paths is None: + success = False + return success, args_json + + args_json["profile sequence contribution scores tar"]["fold_"+str(i)] = {} + args_json["profile sequence contribution scores 
tar"]["fold_"+str(i)]["file.paths"] = data_paths + args_json["profile sequence contribution scores tar"]["fold_"+str(i)]["logs.seq_contrib.profile.fold_"+str(i)+"."+encid] = {"file.paths": log_paths+log_paths_opt} + assert(len(data_paths) == 1) + print(len(log_paths)) + assert(len(log_paths) >= 5) + + success=True + return success, args_json + +for encid in encids: + print(encid) + + + ofile = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores.stats" + if os.path.isfile(ofile): + profile_bw = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/"+encid+"/interpret_upload/average_preds/"+encid+"_folds_merged.profile_scores.bw" + else: + profile_bw = None + print(ofile) + + + assert(os.path.isfile(profile_bw)==True) + + model_paths = model_atac[model_atac[1]==encid.replace("_new","")][2].values + print(model_paths) + args_json = {} + args_json["experiment"] = encode_id[encid] + + + success, args_json = fetch_profile_tar(encode_id[encid], args_json, model_paths, encid) + if not success: + print("ERR profile tar") + continue + + if not os.path.isfile(odir+encode_id[encid]+".json"): + f = open(odir+encode_id[encid]+".json", "w") + json.dump(args_json, f, indent=4) + f.close()