update with loading and deseq2 function

owkin · Oct 17, 2023 · 94da22b · 94da22b
1 parent 7ae6a5b
commit 94da22b
Showing 1 changed file with 87 additions and 23 deletions.
diff --git a/examples/plot_real_data_pydeseq2_example.py b/examples/plot_real_data_pydeseq2_example.py
@@ -1,33 +1,97 @@
-import shutil
-from pathlib import Path
-from typing import Union
-from urllib.request import urlopen
+"""
+Pydeseq2 with real data
+======================================================
 
-import pandas as pd
+This experiment aims to run pydeseq2 on real data.
 
-_URL = "http://genomedata.org/gen-viz-workshop/intro_to_deseq2/tutorial/E-GEOD-50760-raw-counts.tsv"  # noqa
-_RAW_COUNTS_FNAME = _URL.split("/")[-1]
+"""
 
+from pydeseq2.dds import DeseqDataSet
+from pydeseq2.utils import load_example_data
 
-def _download_raw_counts(output_dir: Union[str, Path]) -> None:
-    """Downloads sample RNASeq data (raw counts).
+# %%
+# Data loading
+# ------------
+#
+# Let's first download and load the real data.
 
-    Parameters
-    ----------
-    output_dir : Union[str, Path]
-        Local directory where the sample data will be downloaded.
-    """
+counts_ori = load_example_data(
+    modality="raw_counts",
+    dataset="real",
+    debug=False,
+)
 
-    fname = str(Path(output_dir).joinpath(_RAW_COUNTS_FNAME))
-    with urlopen(_URL) as response, open(fname, "wb") as out_file:
-        shutil.copyfileobj(response, out_file)
-    # Check that file was downloaded
-    if not Path(fname).is_file():
-        raise FileNotFoundError(f"Failed to download sample data from {_URL}!")
+metadata = load_example_data(
+    modality="metadata",
+    dataset="real",
+    debug=False,
+)
 
+print(counts_ori.tail())
 
-_download_raw_counts(Path.cwd())
+# %%
+# In this example, the counts data contains both EnsemblIDs
+# and gene symbols (gene names).
+# We also see that some EnsemblID do not map to any gene symbol.
+# We decide to stay with EnsemblID rather than gene symbol and
+# continue with some processing step.
 
-df = pd.read_csv(Path.cwd().joinpath(_RAW_COUNTS_FNAME), sep="\t")
+# %%
+# Data filtering read counts
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-print(df.head())
+counts_df = counts_ori.drop(columns="Gene Name")
+# remove the gene name for now
+counts_df = counts_df.set_index("Gene ID").T
+# now we have a matrice of shape n_samples x n_genes
+
+# %%
+# Further processing might be needed because some genes
+# have very high counts on average.
+
+# %%
+print(metadata)
+
+# %%
+# Data filtering metadata
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+metadata = metadata.set_index("Run")[["Sample Characteristic[biopsy site]"]].rename(
+    columns={"Sample Characteristic[biopsy site]": "condition"}
+)
+
+# %%
+# Now that we have loaded and filtered our data, we may proceed with the differential
+# analysis.
+
+
+# %%
+# Single factor analysis
+# --------------------------
+#
+# In this analysis, we use the ``condition``
+# column as our design factor. That is, we compare gene expressions of samples that have
+# ``primary tumor`` to those that have ``normal`` condition.
+#
+
+# %%
+# .. currentmodule:: pydeseq2.dds
+#
+# Read counts modeling with the :class:`DeseqDataSet` class
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+#
+# We start by creating a :class:`DeseqDataSet`
+# object from the count and metadata data.
+# A :class:`DeseqDataSet` fits dispersion and
+# log-fold change (LFC) parameters from the data, and stores them.
+#
+
+dds = DeseqDataSet(
+    counts=counts_df,
+    metadata=metadata,
+    design_factors="condition",
+    refit_cooks=True,
+    n_cpus=8,
+)
+
+dds.deseq2()