-
Notifications
You must be signed in to change notification settings - Fork 34
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
d5448e4
commit e326b17
Showing
20 changed files
with
1,954 additions
and
30 deletions.
There are no files selected for viewing
143 changes: 143 additions & 0 deletions
143
logs/checkpoint/JAN_02_2023/atac_combine_deepshap_for_bigwig.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,143 @@ | ||
|
||
import pandas as pd | ||
import os | ||
import deepdish as dd | ||
import numpy as np | ||
from tqdm import tqdm | ||
import pyfaidx | ||
import one_hot | ||
|
||
data = pd.read_csv("model_dir_atac.csv",header=None) | ||
ddtpe="ATAC" | ||
ddtpen=ddtpe+"_PE" | ||
#cell_types=["HEPG2", "K562", "GM12878", "H1ESC", "IMR90"] | ||
#cell_types=[ "H1ESC", "IMR90"] | ||
cell_types=["GM12878", "H1ESC", "IMR90"] | ||
#cell_types=["IMR90"] | ||
#itype="counts" | ||
#ddtpen=ddtpe+"_SE" | ||
#cell_types=["HEPG2", "K562", "GM12878", "H1ESC", "IMR90"] | ||
#cell_types=["K562", "GM12878", "H1ESC", "IMR90", "HEPG2"] | ||
#cell_types=["HEPG2", "K562"] | ||
#cell_types=["K562"] | ||
#cell_types=["GM12878", "IMR90"] | ||
#cell_types=["GM12878_new", "IMR90_new", "H1ESC_new"] | ||
#cell_types=["GM12878"] | ||
|
||
itype="counts" | ||
|
||
|
||
|
||
#data = pd.read_csv("model_dir_dnase.csv",header=None) | ||
|
||
genome_fa="/mnt/lab_data2/anusri/chrombpnet/reference/hg38.genome.fa" | ||
|
||
NARROWPEAK_SCHEMA=["chr", "start", "end", "1", "2", "3", "4", "5", "6", "summit"] | ||
|
||
def get_seq(peaks_df, genome, width): | ||
""" | ||
Same as get_cts, but fetches sequence from a given genome. | ||
""" | ||
vals = [] | ||
peaks_used = [] | ||
for i, r in peaks_df.iterrows(): | ||
sequence = str(genome[r[0]][(r[1]+r[9] - width//2):(r[1] + r[9] + width//2)]) | ||
if len(sequence) == width: | ||
vals.append(sequence) | ||
peaks_used.append(True) | ||
else: | ||
peaks_used.append(False) | ||
#assert(le(peaks_used)==peaks_df.shape[0]) | ||
print(sum(peaks_used), len(peaks_used)) | ||
return one_hot.dna_to_one_hot(vals) | ||
|
||
|
||
|
||
|
||
for cell_type in cell_types: | ||
ndata = data[data[1]==cell_type].reset_index() | ||
#cell_type = cell_type+"_new" | ||
|
||
one_hots=None | ||
for i,r in ndata.iterrows(): | ||
print(i,r[2]) | ||
|
||
beds_path = os.path.join(r[2],"chrombpnet_model/interpret_peaks/full_"+cell_type+".interpreted_regions_"+itype+".bed") | ||
if os.path.exists(beds_path): | ||
beds = pd.read_csv(beds_path, sep="\t", header=None) | ||
elif os.path.exists(os.path.join(r[2],"chrombpnet_model/interpret_peaks/merged."+cell_type+".interpreted_regions.bed")): | ||
beds_path = os.path.join(r[2],"chrombpnet_model/interpret_peaks/merged."+cell_type+".interpreted_regions.bed") | ||
beds = pd.read_csv(beds_path, sep="\t", header=None) | ||
else: | ||
beds_path = os.path.join(r[2],"interpret_peaks/"+cell_type+"_ATAC_in_peaks."+itype+".interpreted_regions.bed") | ||
beds = pd.read_csv(beds_path, sep="\t", header=None) | ||
|
||
print(beds.shape) | ||
|
||
beds_path = os.path.join(r[2],"chrombpnet_model/interpret_dnase/full_"+cell_type+".interpreted_regions_"+itype+".bed") | ||
if os.path.exists(beds_path): | ||
beds2 = pd.read_csv(beds_path, sep="\t", header=None) | ||
|
||
elif os.path.exists(os.path.join(r[2],"chrombpnet_model/interpret_dnase/merged."+cell_type+".interpreted_regions.bed")): | ||
beds_path = os.path.join(r[2],"chrombpnet_model/interpret_dnase/merged."+cell_type+".interpreted_regions.bed") | ||
beds2 = pd.read_csv(beds_path, sep="\t", header=None) | ||
elif os.path.exists(os.path.join(r[2],"interpret_dnase/merged."+cell_type+".interpreted_regions.bed")): | ||
beds_path = os.path.join(r[2],"interpret_dnase/merged."+cell_type+".interpreted_regions.bed") | ||
beds2 = pd.read_csv(beds_path, sep="\t", header=None) | ||
else: | ||
tpath = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+cell_type+"/"+r[2].split("/")[-1] | ||
beds_path = os.path.join(tpath,"chrombpnet_model/interpret_dnase/full_"+cell_type+".interpreted_regions_"+itype+".bed") | ||
beds2 = pd.read_csv(beds_path, sep="\t", header=None) | ||
|
||
|
||
print(beds2.shape) | ||
|
||
bedsf = pd.concat([beds, beds2]) | ||
print(bedsf.shape) | ||
|
||
|
||
ppath = os.path.join(r[2],"chrombpnet_model/interpret_peaks/full_"+cell_type+"."+itype+"_scores_new_compressed.h5") | ||
if os.path.exists(ppath): | ||
scores = dd.io.load(ppath) | ||
elif os.path.exists(os.path.join(r[2],"interpret_peaks/merged."+cell_type+"."+itype+"_scores_new_compressed.h5")): | ||
ppath = os.path.join(r[2],"interpret_peaks/merged."+cell_type+"."+itype+"_scores_new_compressed.h5") | ||
scores = dd.io.load(ppath) | ||
else: | ||
ppath = os.path.join(r[2],"interpret_peaks/"+cell_type+"_ATAC_in_peaks."+itype+"_scores_new_compressed.h5") | ||
scores = dd.io.load(ppath) | ||
|
||
ppath = os.path.join(r[2],"chrombpnet_model/interpret_dnase/full_"+cell_type+"."+itype+"_scores_new_compressed.h5") | ||
if os.path.exists(ppath): | ||
scores2 = dd.io.load(ppath) | ||
elif os.path.exists(os.path.join(r[2],"interpret_dnase/merged."+cell_type+"."+itype+"_scores_new_compressed.h5")): | ||
ppath = os.path.join(r[2],"interpret_dnase/merged."+cell_type+"."+itype+"_scores_new_compressed.h5") | ||
scores2 = dd.io.load(ppath) | ||
elif os.path.exists(os.path.join(r[2],"chrombpnet_model/interpret_dnase/full_"+cell_type+"."+itype+"_scores_new_compressed.h5")): | ||
ppath = os.path.join(r[2],"chrombpnet_model/interpret_dnase/full_"+cell_type+"."+itype+"_scores_new_compressed.h5") | ||
scores2 = dd.io.load(ppath) | ||
else: | ||
tpath = "/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/ATAC/"+cell_type+"/"+r[2].split("/")[-1] | ||
ppath = os.path.join(tpath,"chrombpnet_model/interpret_dnase/full_"+cell_type+"."+itype+"_scores_new_compressed.h5") | ||
scores2 = dd.io.load(ppath) | ||
|
||
print(scores['shap']['seq'].shape) | ||
print(scores2['shap']['seq'].shape) | ||
|
||
scoresf = np.concatenate((scores['shap']['seq'], scores2['shap']['seq']), axis=0) | ||
print(scoresf.shape) | ||
|
||
profile_scores_dict = { | ||
'shap': {'seq': scoresf}, | ||
} | ||
|
||
|
||
os.makedirs(os.path.join(r[2],"chrombpnet_model/interpret_all/"), exist_ok=True) | ||
print(os.path.join(r[2],"chrombpnet_model/interpret_all/")) | ||
output_prefix=os.path.join(r[2],"chrombpnet_model/interpret_all/full_"+cell_type+"."+itype+"_scores_new_compressed.h5") | ||
dd.io.save(output_prefix, | ||
profile_scores_dict, | ||
compression='blosc') | ||
|
||
output_prefix_bed=os.path.join(r[2],"chrombpnet_model/interpret_all/full_"+cell_type+".interpreted_regions_"+itype+".bed") | ||
bedsf.to_csv(output_prefix_bed, sep="\t", header=False, index=False, compression='gzip') | ||
#break |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import pandas as pd | ||
|
||
predictions_bed = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/IMR90_new/preds_upload/fold_0/IMR90_new_w_bias_all_regions.bed", sep='\t', header=None).drop_duplicates() | ||
predictions_bed = predictions_bed[predictions_bed[0] != "chrM"].reset_index(drop=True) | ||
|
||
interpret_bd = pd.read_csv("/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/IMR90_new/merge_folds_all_regions_may_05_24/IMR90_new_folds_merged.profile_scores_new_compressed.bed.gz", sep='\t', header=None) | ||
interpret_bd[10] = interpret_bd[1]+interpret_bd[9]-500 | ||
interpret_bd[11] = interpret_bd[1]+interpret_bd[9]+500 | ||
|
||
interpret_bd = interpret_bd[[0,10,11]].drop_duplicates().rename(columns={10:1, 11:2}).sort_values(by=[0,1,2]).reset_index(drop=True) | ||
predictions_bed = predictions_bed.sort_values(by=[0,1,2]).reset_index(drop=True) | ||
|
||
|
||
print(predictions_bed.head()) | ||
print(interpret_bd.head()) | ||
|
||
print(predictions_bed.equals(interpret_bd)) | ||
|
||
import pybedtools | ||
|
||
x = pybedtools.BedTool.from_dataframe(predictions_bed) | ||
x = x.merge() | ||
y = x.to_dataframe() | ||
|
||
print(y.head()) | ||
|
||
temp="/oak/stanford/groups/akundaje/projects/chromatin-atlas-2022/chrombpnet/folds/DNASE/IMR90_new/preds_upload/average_preds/" | ||
y.to_csv(temp+"merged.viz.bed.gz", compression='gzip', sep='\t', header=False, index=False) | ||
|
||
print(interpret_bd.shape) | ||
print(predictions_bed.shape) | ||
print(len(set(interpret_bd[0]))) | ||
print(len(set(predictions_bed[0]))) | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.