Skip to content

Commit

Permalink
update with loading and deseq2 function
Browse files Browse the repository at this point in the history
  • Loading branch information
Valerieducret committed Oct 17, 2023
1 parent 7ae6a5b commit 94da22b
Showing 1 changed file with 87 additions and 23 deletions.
110 changes: 87 additions & 23 deletions examples/plot_real_data_pydeseq2_example.py
Original file line number Diff line number Diff line change
@@ -1,33 +1,97 @@
import shutil
from pathlib import Path
from typing import Union
from urllib.request import urlopen
"""
Pydeseq2 with real data
======================================================
import pandas as pd
This experiment aims to run pydeseq2 on real data.
_URL = "http://genomedata.org/gen-viz-workshop/intro_to_deseq2/tutorial/E-GEOD-50760-raw-counts.tsv" # noqa
_RAW_COUNTS_FNAME = _URL.split("/")[-1]
"""

from pydeseq2.dds import DeseqDataSet
from pydeseq2.utils import load_example_data

def _download_raw_counts(output_dir: Union[str, Path]) -> None:
"""Downloads sample RNASeq data (raw counts).
# %%
# Data loading
# ------------
#
# Let's first download and load the real data.

Parameters
----------
output_dir : Union[str, Path]
Local directory where the sample data will be downloaded.
"""
counts_ori = load_example_data(
modality="raw_counts",
dataset="real",
debug=False,
)

fname = str(Path(output_dir).joinpath(_RAW_COUNTS_FNAME))
with urlopen(_URL) as response, open(fname, "wb") as out_file:
shutil.copyfileobj(response, out_file)
# Check that file was downloaded
if not Path(fname).is_file():
raise FileNotFoundError(f"Failed to download sample data from {_URL}!")
metadata = load_example_data(
modality="metadata",
dataset="real",
debug=False,
)

print(counts_ori.tail())

_download_raw_counts(Path.cwd())
# %%
# In this example, the counts data contains both EnsemblIDs
# and gene symbols (gene names).
# We also see that some EnsemblID do not map to any gene symbol.
# We decide to stay with EnsemblID rather than gene symbol and
# continue with some processing step.

df = pd.read_csv(Path.cwd().joinpath(_RAW_COUNTS_FNAME), sep="\t")
# %%
# Data filtering read counts
# ^^^^^^^^^^^^^^^^^^^^^^^^^^

print(df.head())
counts_df = counts_ori.drop(columns="Gene Name")
# remove the gene name for now
counts_df = counts_df.set_index("Gene ID").T
# now we have a matrice of shape n_samples x n_genes

# %%
# Further processing might be needed because some genes
# have very high counts on average.

# %%
print(metadata)

# %%
# Data filtering metadata
# ^^^^^^^^^^^^^^^^^^^^^^^^^^

metadata = metadata.set_index("Run")[["Sample Characteristic[biopsy site]"]].rename(
columns={"Sample Characteristic[biopsy site]": "condition"}
)

# %%
# Now that we have loaded and filtered our data, we may proceed with the differential
# analysis.


# %%
# Single factor analysis
# --------------------------
#
# In this analysis, we use the ``condition``
# column as our design factor. That is, we compare gene expressions of samples that have
# ``primary tumor`` to those that have ``normal`` condition.
#

# %%
# .. currentmodule:: pydeseq2.dds
#
# Read counts modeling with the :class:`DeseqDataSet` class
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# We start by creating a :class:`DeseqDataSet`
# object from the count and metadata data.
# A :class:`DeseqDataSet` fits dispersion and
# log-fold change (LFC) parameters from the data, and stores them.
#

dds = DeseqDataSet(
counts=counts_df,
metadata=metadata,
design_factors="condition",
refit_cooks=True,
n_cpus=8,
)

dds.deseq2()

0 comments on commit 94da22b

Please sign in to comment.