-
Notifications
You must be signed in to change notification settings - Fork 70
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
update with loading and deseq2 function
- Loading branch information
1 parent
7ae6a5b
commit 94da22b
Showing
1 changed file
with
87 additions
and
23 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,33 +1,97 @@ | ||
import shutil | ||
from pathlib import Path | ||
from typing import Union | ||
from urllib.request import urlopen | ||
""" | ||
Pydeseq2 with real data | ||
====================================================== | ||
import pandas as pd | ||
This experiment aims to run pydeseq2 on real data. | ||
_URL = "http://genomedata.org/gen-viz-workshop/intro_to_deseq2/tutorial/E-GEOD-50760-raw-counts.tsv" # noqa | ||
_RAW_COUNTS_FNAME = _URL.split("/")[-1] | ||
""" | ||
|
||
from pydeseq2.dds import DeseqDataSet | ||
from pydeseq2.utils import load_example_data | ||
|
||
def _download_raw_counts(output_dir: Union[str, Path]) -> None: | ||
"""Downloads sample RNASeq data (raw counts). | ||
# %% | ||
# Data loading | ||
# ------------ | ||
# | ||
# Let's first download and load the real data. | ||
|
||
Parameters | ||
---------- | ||
output_dir : Union[str, Path] | ||
Local directory where the sample data will be downloaded. | ||
""" | ||
counts_ori = load_example_data( | ||
modality="raw_counts", | ||
dataset="real", | ||
debug=False, | ||
) | ||
|
||
fname = str(Path(output_dir).joinpath(_RAW_COUNTS_FNAME)) | ||
with urlopen(_URL) as response, open(fname, "wb") as out_file: | ||
shutil.copyfileobj(response, out_file) | ||
# Check that file was downloaded | ||
if not Path(fname).is_file(): | ||
raise FileNotFoundError(f"Failed to download sample data from {_URL}!") | ||
metadata = load_example_data( | ||
modality="metadata", | ||
dataset="real", | ||
debug=False, | ||
) | ||
|
||
print(counts_ori.tail()) | ||
|
||
_download_raw_counts(Path.cwd()) | ||
# %% | ||
# In this example, the counts data contains both EnsemblIDs | ||
# and gene symbols (gene names). | ||
# We also see that some EnsemblID do not map to any gene symbol. | ||
# We decide to stay with EnsemblID rather than gene symbol and | ||
# continue with some processing step. | ||
|
||
df = pd.read_csv(Path.cwd().joinpath(_RAW_COUNTS_FNAME), sep="\t") | ||
# %% | ||
# Data filtering read counts | ||
# ^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
|
||
print(df.head()) | ||
counts_df = counts_ori.drop(columns="Gene Name") | ||
# remove the gene name for now | ||
counts_df = counts_df.set_index("Gene ID").T | ||
# now we have a matrice of shape n_samples x n_genes | ||
|
||
# %% | ||
# Further processing might be needed because some genes | ||
# have very high counts on average. | ||
|
||
# %% | ||
print(metadata) | ||
|
||
# %% | ||
# Data filtering metadata | ||
# ^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
|
||
metadata = metadata.set_index("Run")[["Sample Characteristic[biopsy site]"]].rename( | ||
columns={"Sample Characteristic[biopsy site]": "condition"} | ||
) | ||
|
||
# %% | ||
# Now that we have loaded and filtered our data, we may proceed with the differential | ||
# analysis. | ||
|
||
|
||
# %% | ||
# Single factor analysis | ||
# -------------------------- | ||
# | ||
# In this analysis, we use the ``condition`` | ||
# column as our design factor. That is, we compare gene expressions of samples that have | ||
# ``primary tumor`` to those that have ``normal`` condition. | ||
# | ||
|
||
# %% | ||
# .. currentmodule:: pydeseq2.dds | ||
# | ||
# Read counts modeling with the :class:`DeseqDataSet` class | ||
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | ||
# | ||
# We start by creating a :class:`DeseqDataSet` | ||
# object from the count and metadata data. | ||
# A :class:`DeseqDataSet` fits dispersion and | ||
# log-fold change (LFC) parameters from the data, and stores them. | ||
# | ||
|
||
dds = DeseqDataSet( | ||
counts=counts_df, | ||
metadata=metadata, | ||
design_factors="condition", | ||
refit_cooks=True, | ||
n_cpus=8, | ||
) | ||
|
||
dds.deseq2() |