-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
- Loading branch information
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Notebooks for analysis from "Mapping the gene space at single-cell resolution with gene signal pattern analysis" | ||
|
||
Gene Signal Pattern Analysis is a Python package for mapping the gene space from single-cell data. For a detailed explanation of GSPA and potential downstream applications, see: | ||
|
||
Mapping the gene space at single-cell resolution with Gene Signal Pattern Analysis. Aarthi Venkat, Sam Leone, Scott E. Youlten, Eric Fagerberg, John Attanasio, Nikhil S. Joshi, Michael Perlmutter, Smita Krishnaswamy. | ||
|
||
By considering gene expression values as signals on the cell-cell graph, GSPA enables complex analyses of gene-gene relationships, including gene cluster analysis, cell-cell communication, and patient manifold learning from gene-gene graphs. | ||
|
||
See the following directories to generate results: | ||
|
||
`analysis_batch_correction` for Extended Data Figure 1 | ||
`analysis_wavelets` for Extended Data Figure 3, 4 | ||
`analysis_coexpression` for Figure 2, Extended Data Figure 5, 6, 7, 10, Extended Data Table 1 | ||
`analysis_localization` for Figure 3, Extended Data Figure 9, 10, Extended Data Table 1 | ||
`analysis_tcells` for Figure 4, Extended Data Figure 11, 12, Extended Data Table 2 | ||
`analysis_cellcomm` for Figure 5, Extended Data Figure 13, 14, Extended Data Table 3 | ||
`analysis_spatial` for Figure 6, Extended Data Figure 15, Extended Data Table 4 | ||
`analysis_patient` for Figure 7, Extended Data Table 5 |
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,113 @@ | ||
""" | ||
Evaluate gene embeddings for GSPA and comparisons based on anti-correlation between gene embedding distance and true gene-gene coexpression. Stratified for library size (due to differences in correlation based on library size) and true coexpression (due to large imbalance and majority of gene pairs not coexpressed). | ||
""" | ||
|
||
import numpy as np | ||
from collections import defaultdict | ||
import sys, os | ||
from scipy.stats import spearmanr | ||
|
||
# Dataset name -> (simulated data file under ../data, suffix used in result file names).
DATASETS = {
    '2_branches': ('splatter_simulated_data_2_branches.npz', '_2_branches'),
    '3_branches': ('splatter_simulated_data_3_branches.npz', '_3_branches'),
    'linear': ('splatter_simulated_data.npz', ''),
}

# Models whose results can be evaluated; each has its own results/<model>/ directory.
MODELS = ['Eigenscore', 'GFMMD', 'Signals', 'DiffusionEMD', 'GSPA', 'GSPA_QR', 'MAGIC',
          'Node2Vec_Gcell', 'GAE_noatt_Gcell', 'GAE_att_Gcell',
          'Node2Vec_Ggene', 'GAE_noatt_Ggene', 'GAE_att_Ggene', 'SIMBA', 'siVAE']


def pairs_in_corr_bin(correlation, lower, upper):
    """Return an (n_pairs, 2) array of index pairs whose correlation lies in a bin.

    A pair (a, b) is kept when ``lower < correlation[a, b] < upper`` (both
    bounds exclusive) and the value is not exactly zero. The caller zeroes the
    diagonal beforehand, so the zero test is what drops self pairs.
    """
    in_bin = (correlation > lower) & (correlation < upper) & (correlation != 0)
    return np.array(np.where(in_bin)).T


def lib_size_strata(pairs, lib_size):
    """Split ``pairs`` into two strata by the mean library size of each pair.

    The range of per-pair means is cut into two equal-width bins. Each returned
    array has shape (n, 1) and holds row indices into ``pairs``. Bin bounds are
    exclusive, so pairs sitting exactly on an edge (including the minimum and
    maximum) fall into no stratum.
    """
    mean_per_pair = np.vstack((lib_size[pairs[:, 0]], lib_size[pairs[:, 1]])).mean(axis=0)
    edges = np.linspace(mean_per_pair.min(), mean_per_pair.max(), 3)
    return [
        np.array(np.where((mean_per_pair > low) & (mean_per_pair < high))).T
        for low, high in zip(edges[:-1], edges[1:])
    ]


def iter_stratified_pairs(correlation, corr_bins, lib_size):
    """Yield ``(pairs, strata)`` for every consecutive pair of correlation bin edges."""
    for lower, upper in zip(corr_bins[:-1], corr_bins[1:]):
        pairs = pairs_in_corr_bin(correlation, lower, upper)
        yield pairs, lib_size_strata(pairs, lib_size)


def main():
    """Score the embeddings of one model on one dataset and write the result file.

    Command line: ``<model> <dataset>``. Writes one line per saved run to
    ``results/<model>/spearmanr<extension>_demap_789.txt``.
    """
    if len(sys.argv) < 3:
        sys.exit('Usage: python evaluate_coexpression.py <model> <dataset>')
    model = sys.argv[1]
    dataset = sys.argv[2]

    # Fail early on an unknown dataset instead of hitting a NameError further down.
    if dataset not in DATASETS:
        sys.exit(f"Dataset choice not in [{' '.join(DATASETS)}]")
    datafile, extension = DATASETS[dataset]

    # confirm model choice
    if model not in MODELS:
        sys.exit(f"Model choice not in [{' '.join(MODELS)}]")

    trajectory_data = np.load(f'../data/{datafile}')
    true_counts = trajectory_data['true_counts']
    # One total per column of true_counts, i.e. per variable that spearmanr correlates below.
    true_lib_size = true_counts.T.sum(axis=1)

    # get coexpression embeddings
    coexpression_results = {}
    for run_id in [7, 8, 9]:
        # NOTE: allow_pickle=True is required for the stored config object; only load trusted files.
        res = np.load(f'results/{model}/{run_id}_results{extension}.npz', allow_pickle=True)
        name = res['config'][()]['save_as']
        coexpression_results[name] = res['signal_embedding']

    print ('Stratify Spearman correlation...')
    correlation = spearmanr(true_counts).correlation
    # Zero the diagonal so self pairs are excluded by the "!= 0" test in pairs_in_corr_bin.
    np.fill_diagonal(correlation, 0)
    corr_bins = np.linspace(correlation.min(), correlation.max(), 4)

    print ('Stratify library size...')
    # Smallest stratum across all (correlation bin, library-size bin) cells;
    # every cell is down-sampled to this size so the evaluation set is balanced.
    min_lib_size = min(
        stratum.shape[0]
        for _, strata in iter_stratified_pairs(correlation, corr_bins, true_lib_size)
        for stratum in strata
    )

    print ('Sample by stratification...')
    samples = []
    for pairs, strata in iter_stratified_pairs(correlation, corr_bins, true_lib_size):
        picked = [
            stratum[np.random.choice(stratum.shape[0], size=min_lib_size, replace=False)]
            for stratum in strata
        ]
        samples.append(pairs[np.vstack(picked).flatten()])
    samples = np.vstack(samples)

    spearmans = defaultdict(list)
    with open(f"results/{model}/spearmanr{extension}_demap_789.txt", "w") as f:
        for name, embedding in coexpression_results.items():
            # Embedding distance per sampled pair vs. the true correlation of that pair.
            X = np.array([np.linalg.norm(embedding[a] - embedding[b]) for a, b in samples])
            y = correlation[samples[:, 0], samples[:, 1]]

            spearmans[name].append(spearmanr(X, y).correlation)
            f.write(f'{name} Spearman {np.median(spearmans[name])}\n')


if __name__ == '__main__':
    main()
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# Run the coexpression evaluation for every model on each simulated dataset.
for model in \
    Eigenscore GFMMD Signals DiffusionEMD GSPA GSPA_QR MAGIC \
    Node2Vec_Ggene GAE_noatt_Ggene GAE_att_Ggene SIMBA siVAE
do
    # Print the model name so the log shows which model is being evaluated.
    echo "$model"
    for dataset in linear 2_branches 3_branches
    do
        python evaluate_coexpression.py "$model" "$dataset"
    done
done
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
import numpy as np | ||
from collections import defaultdict | ||
import os, sys, glob | ||
from scipy.stats import spearmanr | ||
|
||
# Dataset name -> suffix used in the data and result file names.
DATASET_EXTENSIONS = {
    '2_branches': '_2_branches',
    'sparse_branches': '_sparse_branches',
    '3_branches': '_3_branches',
    'linear': '',
}

# Models whose results can be evaluated; each has its own results/<model>/ directory.
MODELS = ['SIMBA', 'siVAE', 'Eigenscore', 'GFMMD', 'Signals', 'DiffusionEMD', 'GSPA',
          'GSPA_QR', 'MAGIC', 'Node2Vec_Ggene', 'GAE_noatt_Ggene', 'GAE_att_Ggene']


def dataset_extension(dataset):
    """Return the file-name suffix for ``dataset``; exit with a message if it is unknown."""
    if dataset not in DATASET_EXTENSIONS:
        sys.exit(f"Dataset choice '{dataset}' not in [{' '.join(DATASET_EXTENSIONS)}]")
    return DATASET_EXTENSIONS[dataset]


def main():
    """Correlate each run's localization scores with the reference labels.

    Command line: ``<model> <dataset>``. Writes one line per saved run to
    ``results/<model>/spearmanr<extension>_789.txt``.
    """
    if len(sys.argv) < 3:
        sys.exit('Usage: python evaluate_localization.py <model> <dataset>')
    model = sys.argv[1]
    dataset = sys.argv[2]

    # Fail early on an unknown dataset instead of hitting a NameError further down.
    extension = dataset_extension(dataset)

    # confirm model choice
    if model not in MODELS:
        sys.exit(f"Model choice not in [{' '.join(MODELS)}]")

    # Reference values stored under the 'spread' key of the localization signals file.
    labels_y = np.load(f'../data/localization_signals{extension}.npz')['spread']

    # get embeddings
    localization_scores = {}
    for run_id in [7, 8, 9]:
        # NOTE: allow_pickle=True is required for the stored config object; only load trusted files.
        res = np.load(f'results/{model}/{run_id}_results{extension}.npz', allow_pickle=True)
        name = res['config'][()]['save_as']
        localization_scores[name] = res['localization_score']

    with open(f'results/{model}/spearmanr{extension}_789.txt', 'w') as f:
        for name, score in localization_scores.items():
            correlation = spearmanr(score, labels_y).correlation
            f.write(f'{name} Spearman {correlation}\n')


if __name__ == '__main__':
    main()
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
# Run the localization evaluation for every model on each simulated dataset.
for model in \
    GFMMD Eigenscore GAE_att_Ggene Signals GSPA GSPA_QR MAGIC \
    DiffusionEMD Node2Vec_Ggene GAE_noatt_Ggene SIMBA siVAE
do
    # Print the model name so the log shows which model is being evaluated.
    echo "$model"
    for dataset in linear 2_branches 3_branches
    do
        python evaluate_localization.py "$model" "$dataset"
    done
done
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Time the main GSPA stages on a random subsample of the large simulated dataset.
#
# Command line: <count> <qr_decompose>
#   count         number of rows to sample from the data matrix
#   qr_decompose  the literal string 'True' enables the option; anything else disables it

import sys
import numpy as np
import pandas as pd
import time
import gspa  # was `import gpsa`; every use below refers to `gspa`

count = int(sys.argv[1])
qr_decompose = sys.argv[2] == 'True'

data = np.load('../data/large_splatter_simulated_data.npz')
# Sample `count` rows without replacement (pandas default) and keep the raw array.
counts = pd.DataFrame(data['data']).sample(count).values

start = time.time()
gspa_op = gspa.GSPA(qr_decompose=qr_decompose)
gspa_op.construct_graph(counts)
end = time.time()
print ('Construct graph', end - start)

start = time.time()
gspa_op.build_diffusion_operator()
gspa_op.build_wavelet_dictionary()
end = time.time()
print ('Wavelet dictionary', end - start)

start = time.time()
# Use the sampled matrix: `data` is the archive returned by np.load and has no `.T`,
# and the signals must be defined on the same rows the graph was built from.
gene_signals = counts.T # embed all measured genes
gene_ae, gene_pc = gspa_op.get_gene_embeddings(gene_signals)
gene_localization = gspa_op.calculate_localization()
end = time.time()
print ('Gene embedding', end - start)
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
#!/bin/bash

#SBATCH --job-name=runtime_fast
#SBATCH --ntasks=1
#SBATCH --nodes=1
#SBATCH --cpus-per-task=8
#SBATCH --time=24:00:00
#SBATCH --partition=scavenge
#SBATCH --mem-per-cpu=100G
#SBATCH --mail-type=ALL

module load miniconda
conda activate gspa

# Benchmark 0.0_fast_GSPA.py over increasing sample counts, once with each
# value of the second positional argument (False, then True).
for count in 100 1000 5000 25000 50000 100000
do
    for reduced in False True
    do
        echo "$count" "$reduced"
        python 0.0_fast_GSPA.py "$count" "$reduced"
    done
done