Skip to content

Commit

Permalink
checkpoint and merging scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
panushri25 committed Aug 27, 2024
1 parent d5448e4 commit e326b17
Show file tree
Hide file tree
Showing 20 changed files with 1,954 additions and 30 deletions.
143 changes: 143 additions & 0 deletions logs/checkpoint/JAN_02_2023/atac_combine_deepshap_for_bigwig.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@

import pandas as pd
import os
import deepdish as dd
import numpy as np
from tqdm import tqdm
import pyfaidx
import one_hot

# Model directory manifest: one row per fold, with (at least) column 1 = cell
# type and column 2 = model directory path — TODO confirm full schema against
# model_dir_atac.csv.
data = pd.read_csv("model_dir_atac.csv",header=None)
# Assay labels used to build result paths ("ATAC" here; "_PE" = paired-end).
ddtpe="ATAC"
ddtpen=ddtpe+"_PE"
#cell_types=["HEPG2", "K562", "GM12878", "H1ESC", "IMR90"]
#cell_types=[ "H1ESC", "IMR90"]
# Cell types to process on this run; the commented lists above/below are
# earlier run configurations kept as a record of what has been processed.
cell_types=["GM12878", "H1ESC", "IMR90"]
#cell_types=["IMR90"]
#itype="counts"
#ddtpen=ddtpe+"_SE"
#cell_types=["HEPG2", "K562", "GM12878", "H1ESC", "IMR90"]
#cell_types=["K562", "GM12878", "H1ESC", "IMR90", "HEPG2"]
#cell_types=["HEPG2", "K562"]
#cell_types=["K562"]
#cell_types=["GM12878", "IMR90"]
#cell_types=["GM12878_new", "IMR90_new", "H1ESC_new"]
#cell_types=["GM12878"]

# Which deepSHAP head to merge: "counts" or "profile".
itype="counts"



#data = pd.read_csv("model_dir_dnase.csv",header=None)

# Reference genome used when one-hot sequences need to be re-fetched.
genome_fa="/mnt/lab_data2/anusri/chrombpnet/reference/hg38.genome.fa"

# Column names for narrowPeak-format BED files (column 9 = summit offset).
NARROWPEAK_SCHEMA=["chr", "start", "end", "1", "2", "3", "4", "5", "6", "summit"]

def get_seq(peaks_df, genome, width):
    """
    Fetch a fixed-width sequence window centred on each peak summit and
    return the stacked one-hot encoding.

    peaks_df : narrowPeak-style frame — column 0 = chrom, 1 = start,
               9 = summit offset (assumed from usage; confirm against caller).
    genome   : mapping of chrom name -> sliceable sequence (e.g. pyfaidx.Fasta).
    width    : window size in bp.

    NOTE(review): windows that run off a chromosome end come back short and
    are silently dropped, so the returned array can have fewer rows than
    peaks_df — callers must not assume row-for-row alignment.
    """
    sequences = []
    kept_flags = []
    for _, row in peaks_df.iterrows():
        center = row[1] + row[9]
        window = str(genome[row[0]][center - width // 2:center + width // 2])
        kept = len(window) == width
        kept_flags.append(kept)
        if kept:
            sequences.append(window)
    # Progress/debug output: how many windows survived vs. total peaks.
    print(sum(kept_flags), len(kept_flags))
    return one_hot.dna_to_one_hot(sequences)




def _first_existing(candidates):
    """Return the first candidate path that exists on disk.

    The FINAL candidate is returned unconditionally without an existence
    check, matching the original fallback chains: if no location matched,
    the last one was read blindly and pd.read_csv / dd.io.load raised.
    """
    for path in candidates[:-1]:
        if os.path.exists(path):
            return path
    return candidates[-1]


# For each cell type, and each fold listed in the manifest, concatenate the
# peaks and dnase-region deepSHAP results (BED regions + h5 score arrays)
# into a single "interpret_all" output under the fold's model directory.
for cell_type in cell_types:
    ndata = data[data[1] == cell_type].reset_index()

    for i, r in ndata.iterrows():
        model_dir = r[2]
        print(i, model_dir)

        # Fallback location used when a fold's DNase interpretation lives
        # under the chromatin-atlas folds tree instead of the model dir.
        atlas_dir = ("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/"
                     "chrombpnet/folds/ATAC/" + cell_type + "/" + model_dir.split("/")[-1])

        # --- interpreted-region BEDs: peaks, then dnase ---
        beds_path = _first_existing([
            os.path.join(model_dir, "chrombpnet_model/interpret_peaks/full_" + cell_type + ".interpreted_regions_" + itype + ".bed"),
            os.path.join(model_dir, "chrombpnet_model/interpret_peaks/merged." + cell_type + ".interpreted_regions.bed"),
            os.path.join(model_dir, "interpret_peaks/" + cell_type + "_ATAC_in_peaks." + itype + ".interpreted_regions.bed"),
        ])
        beds = pd.read_csv(beds_path, sep="\t", header=None)
        print(beds.shape)

        beds2_path = _first_existing([
            os.path.join(model_dir, "chrombpnet_model/interpret_dnase/full_" + cell_type + ".interpreted_regions_" + itype + ".bed"),
            os.path.join(model_dir, "chrombpnet_model/interpret_dnase/merged." + cell_type + ".interpreted_regions.bed"),
            os.path.join(model_dir, "interpret_dnase/merged." + cell_type + ".interpreted_regions.bed"),
            os.path.join(atlas_dir, "chrombpnet_model/interpret_dnase/full_" + cell_type + ".interpreted_regions_" + itype + ".bed"),
        ])
        beds2 = pd.read_csv(beds2_path, sep="\t", header=None)
        print(beds2.shape)

        bedsf = pd.concat([beds, beds2])
        print(bedsf.shape)

        # --- deepSHAP score h5 files: peaks, then dnase ---
        scores_path = _first_existing([
            os.path.join(model_dir, "chrombpnet_model/interpret_peaks/full_" + cell_type + "." + itype + "_scores_new_compressed.h5"),
            os.path.join(model_dir, "interpret_peaks/merged." + cell_type + "." + itype + "_scores_new_compressed.h5"),
            os.path.join(model_dir, "interpret_peaks/" + cell_type + "_ATAC_in_peaks." + itype + "_scores_new_compressed.h5"),
        ])
        scores = dd.io.load(scores_path)

        # BUG FIX: the original chain re-checked the first dnase h5 path in a
        # later elif, making that branch unreachable dead code; the duplicate
        # candidate is dropped here (resolution order is otherwise unchanged).
        scores2_path = _first_existing([
            os.path.join(model_dir, "chrombpnet_model/interpret_dnase/full_" + cell_type + "." + itype + "_scores_new_compressed.h5"),
            os.path.join(model_dir, "interpret_dnase/merged." + cell_type + "." + itype + "_scores_new_compressed.h5"),
            os.path.join(atlas_dir, "chrombpnet_model/interpret_dnase/full_" + cell_type + "." + itype + "_scores_new_compressed.h5"),
        ])
        scores2 = dd.io.load(scores2_path)

        print(scores['shap']['seq'].shape)
        print(scores2['shap']['seq'].shape)

        # Stack peak-region scores on top of dnase-region scores, mirroring
        # the pd.concat order of the BED frames above.
        scoresf = np.concatenate((scores['shap']['seq'], scores2['shap']['seq']), axis=0)
        print(scoresf.shape)

        profile_scores_dict = {
            'shap': {'seq': scoresf},
        }

        out_dir = os.path.join(model_dir, "chrombpnet_model/interpret_all/")
        os.makedirs(out_dir, exist_ok=True)
        print(out_dir)

        output_prefix = os.path.join(out_dir, "full_" + cell_type + "." + itype + "_scores_new_compressed.h5")
        dd.io.save(output_prefix,
                   profile_scores_dict,
                   compression='blosc')

        output_prefix_bed = os.path.join(out_dir, "full_" + cell_type + ".interpreted_regions_" + itype + ".bed")
        # NOTE(review): this BED is written gzip-compressed but the filename
        # carries no .gz suffix — kept as-is so downstream readers that expect
        # this exact path/format keep working; consider renaming to .bed.gz.
        bedsf.to_csv(output_prefix_bed, sep="\t", header=False, index=False, compression='gzip')
        #break
35 changes: 35 additions & 0 deletions logs/checkpoint/JAN_02_2023/check_nums.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import pandas as pd
import pybedtools

# Sanity-check that the prediction regions and the interpretation regions for
# IMR90_new (DNASE folds) describe the same genomic intervals.

# Prediction regions: de-duplicate and drop chrM.
preds = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/IMR90_new/preds_upload/fold_0/IMR90_new_w_bias_all_regions.bed", sep='\t', header=None).drop_duplicates()
preds = preds[preds[0] != "chrM"].reset_index(drop=True)

# Interpretation regions: rebuild a 1 kb window around each summit
# (start + summit offset +/- 500 bp).
interp = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/IMR90_new/merge_folds_all_regions_may_05_24/IMR90_new_folds_merged.profile_scores_new_compressed.bed.gz", sep='\t', header=None)
summit_abs = interp[1] + interp[9]
interp[10] = summit_abs - 500
interp[11] = summit_abs + 500

# Reduce both frames to (chrom, start, end), sorted, for a direct comparison.
interp = interp[[0, 10, 11]].drop_duplicates().rename(columns={10: 1, 11: 2}).sort_values(by=[0, 1, 2]).reset_index(drop=True)
preds = preds.sort_values(by=[0, 1, 2]).reset_index(drop=True)


print(preds.head())
print(interp.head())

# True only if the two region sets are identical row-for-row.
print(preds.equals(interp))

# Merge overlapping prediction regions into a visualisation track.
merged_bt = pybedtools.BedTool.from_dataframe(preds).merge()
merged_df = merged_bt.to_dataframe()

print(merged_df.head())

out_dir = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/IMR90_new/preds_upload/average_preds/"
merged_df.to_csv(out_dir + "merged.viz.bed.gz", compression='gzip', sep='\t', header=False, index=False)

# Final diagnostics: row counts and number of distinct chromosomes per frame.
print(interp.shape)
print(preds.shape)
print(len(set(interp[0])))
print(len(set(preds[0])))
69 changes: 54 additions & 15 deletions logs/checkpoint/JAN_02_2023/combine_deepshap.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,82 @@

import pandas as pd
import os
import deepdish as dd
import numpy as np
from tqdm import tqdm
import pyfaidx
import one_hot

#data = pd.read_csv("model_dir_atac.csv",header=None)
#ddtpe="ATAC"
#ddtpen=ddtpe+"_PE"
#cell_types=["HEPG2", "K562", "GM12878", "H1ESC", "IMR90"]
#cell_types=[ "H1ESC", "IMR90"]
#cell_types=["K562", "GM12878", "H1ESC", "IMR90", "HEPG2"]
#cell_types=["HEPG2"]
#itype="profile"
#cell_types=["IMR90"]
#itype="counts"

data = pd.read_csv("model_dir_dnase.csv",header=None)
#data = pd.read_csv("model_dir_dnase.csv",header=None)
data = pd.read_csv("v1/model_dir_dnase_v2_interpret.csv",header=None)
ddtpe="DNASE"
ddtpen=ddtpe+"_PE"
ddtpen=ddtpe+"_SE"
#ddtpen=ddtpe+"_SE"
#cell_types=["HEPG2", "K562", "GM12878", "H1ESC", "IMR90"]
#cell_types=["K562", "GM12878", "H1ESC", "IMR90", "HEPG2"]
cell_types=["HEPG2", "K562"]
itype="counts"
#cell_types=["HEPG2", "K562"]
#cell_types=["K562"]
#cell_types=["GM12878", "IMR90"]
#cell_types=["GM12878_new", "IMR90_new", "H1ESC_new"]
cell_types=["GM12878", "IMR90", "H1ESC"]
#cell_types=[ "IMR90"]
itype="profile"



#data = pd.read_csv("model_dir_dnase.csv",header=None)

genome_fa="/mnt/lab_data2/anusri/chrombpnet/reference/hg38.genome.fa"



def get_seq(peaks_df, genome, width):
    """
    Same as get_cts, but fetches sequence from a given genome.

    Fetches a width-bp window centred on each peak summit (column 1 start +
    column 9 summit offset — assumed narrowPeak layout, confirm against
    caller) and returns the one-hot encoding of the windows that came back
    full length. Short windows (e.g. at chromosome ends) are silently
    dropped, so the result can have fewer rows than peaks_df.
    """
    vals = []
    peaks_used = []
    for i, r in peaks_df.iterrows():
        sequence = str(genome[r[0]][(r[1]+r[9] - width//2):(r[1] + r[9] + width//2)])
        if len(sequence) == width:
            vals.append(sequence)
            peaks_used.append(True)
        else:
            peaks_used.append(False)
    # NOTE(review): this assert is trivially true — peaks_used gets exactly
    # one entry per row; it does not guarantee vals aligns with peaks_df.
    assert(len(peaks_used)==peaks_df.shape[0])
    return one_hot.dna_to_one_hot(vals)

NARROWPEAK_SCHEMA = ["chr", "start", "end", "1", "2", "3", "4", "5", "6", "summit"]

def filter_regions_to_peaks(bed_of_interest, merged, scores):

output_prefix="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/"+ddtpe+"/"+cell_type+"/merge_folds_new/in_peaks"
output_prefix="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/"+ddtpe+"/"+cell_type+"/merge_folds_new_may_05_24_atac/in_peaks"

boi = bed_of_interest[["chr", "start", "end", "summit"]].to_numpy().tolist()
merged_val = merged[[0,1,2,9]].to_numpy().tolist()

indices=[]
dups = []
#for i, val in enumerate(tqdm(merged_val)):
for i, val in enumerate(merged_val):
if val in boi:
indices.append(i)

if val not in dups:
indices.append(i)
dups.append(val)

print(len(indices))
print(len(merged_val))
print(len(boi))
#assert(len(indices)==len(boi))
merged.iloc[indices].to_csv("{}.interpreted_regions.bed".format(output_prefix), header=False, index=False, sep="\t")
merged.iloc[indices].to_csv(output_prefix+"."+itype+".interpreted_regions.bed", header=False, index=False, sep="\t")

sub_scores = {
'raw': {'seq': scores['raw']['seq'][indices]},
Expand All @@ -60,6 +93,7 @@ def filter_regions_to_peaks(bed_of_interest, merged, scores):

for cell_type in cell_types:
ndata = data[data[1]==cell_type].reset_index()
cell_type = cell_type+"_new"
bed_of_interest = pd.read_csv("/mnt/lab_data2/anusri/chrombpnet/results/chrombpnet/"+ddtpen+"/"+cell_type+"/data/peaks_no_blacklist.bed", sep="\t", header=None, names=NARROWPEAK_SCHEMA).astype(str)
one_hots=None
for i,r in ndata.iterrows():
Expand Down Expand Up @@ -92,6 +126,7 @@ def filter_regions_to_peaks(bed_of_interest, merged, scores):
output = scores['shap']['seq']
shapez = output.shape
init_beds = beds
print(scores.keys())
#raw = scores['raw']['seq']
if 'raw' in scores:
if one_hots is None:
Expand All @@ -113,6 +148,10 @@ def filter_regions_to_peaks(bed_of_interest, merged, scores):
del scores


# Fallback: if no fold supplied a 'raw' one-hot track, rebuild the one-hot
# sequences directly from the reference genome for the merged regions.
if one_hots is None:
    genome = pyfaidx.Fasta(genome_fa)
    # BUG FIX: the committed line ended with a stray ':' after the call
    # ("get_seq(beds, genome, 2114):"), which is a SyntaxError.
    one_hots = get_seq(beds, genome, 2114)

assert(one_hots.shape==output.shape)

#for i,r in ndata.iterrows():
Expand All @@ -133,13 +172,13 @@ def filter_regions_to_peaks(bed_of_interest, merged, scores):
}


os.makedirs("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/"+ddtpe+"/"+cell_type+"/merge_folds_new/", exist_ok=True)
output_prefix="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/"+ddtpe+"/"+cell_type+"/merge_folds_new/"+cell_type+"_folds_merged"
os.makedirs("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/"+ddtpe+"/"+cell_type+"/merge_folds_new_may_05_24_atac/", exist_ok=True)
output_prefix="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/"+ddtpe+"/"+cell_type+"/merge_folds_new_may_05_24_atac/"+cell_type+"_folds_merged"
dd.io.save(output_prefix+"."+itype+"_scores_new_compressed.h5",
profile_scores_dict,
compression='blosc')
output_prefix_bed="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/"+ddtpe+"/"+cell_type+"/merge_folds_new/"+cell_type+"_folds_merged"
output_prefix_bed="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/"+ddtpe+"/"+cell_type+"/merge_folds_new_may_05_24_atac/"+cell_type+"_folds_merged"
beds.to_csv(output_prefix+"."+itype+"_scores_new_compressed.bed", sep="\t", header=False, index=False, compression='gzip')


filter_regions_to_peaks(bed_of_interest, beds.astype(str), profile_scores_dict)
#filter_regions_to_peaks(bed_of_interest, beds.astype(str), profile_scores_dict)
Loading

0 comments on commit e326b17

Please sign in to comment.