From 301ceb7269f56cd9d9fdef24d8795ef430f325c8 Mon Sep 17 00:00:00 2001 From: Louise Deconinck Date: Wed, 11 Sep 2024 21:56:52 +0200 Subject: [PATCH] Add html slides --- slides/slides.html | 3157 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 3157 insertions(+) create mode 100644 slides/slides.html diff --git a/slides/slides.html b/slides/slides.html new file mode 100644 index 0000000..d9f024d --- /dev/null +++ b/slides/slides.html @@ -0,0 +1,3157 @@ + + + + + + + + + + + + + + + + + Polyglot programming for single-cell analysis + + + + + + + + + + + + + + + + + + + +
+
+ +
+

Polyglot programming for single-cell analysis

+ +
+
+
+Louise Deconinck +
+
+
+
+Benjamin Rombaut +
+
+
+
+Robrecht Cannoodt +
+
+
+ +

2024-09-12

+
+
+
+

Introduction

+
    +
  1. How do you interact with a package in another language?
  2. +
  3. How do you make your package useable for developers in other languages?
  4. +
+

We will be focusing on R & Python

+
+
+

Summary

+

Interoperability between languages allows analysts to take advantage of the strengths of different ecosystems

+

On-disk interoperability uses standard file formats to transfer data and is typically more reliable

+

In-memory interoperability transfers data directly between parallel sessions and is convenient for interactive analysis

+

While interoperability is currently possible, developers continue to improve the experience

+

Single-cell best practices: Interoperability

+
+
+

How do you interact with a package in another language?

+
    +
  1. In-memory interoperability
  2. +
  3. Disk-based interoperability
  4. +
+
+ +
+

How do you make your package useable for developers in other languages?

+
    +
  1. Package-based interoperability
  2. +
  3. Best practices
  4. +
+
+ +
+

Package-based interoperability

+

or: the question of reimplementation.

+
    +
  • Consider the pros:

    +
      +
    1. Discoverability
    2. +
    3. Can your package be useful in other domains?
    4. +
    5. Very user friendly
    6. +
  • +
  • Consider the cons:

    +
      +
    1. Think twice: is it worth it?
    2. +
    3. It’s a lot of work
    4. +
    5. How will you keep it up to date?
    6. +
    7. How will you ensure parity?
    8. +
  • +
+
+ +
+

Package-based interoperability

+

Please learn both R & Python

+
+ +
+

Best practices

+
    +
  1. Work with the standards
  2. +
  3. Work with matrices, arrays and dataframes
  4. +
  5. Provide vignettes on interoperability
  6. +
+
+ +
+

In-memory interoperability

+ +
+ +
+

Overview

+
    +
  1. Advantages & disadvantages
  2. +
  3. Pitfalls when using Python & R
  4. +
  5. Rpy2
  6. +
  7. Reticulate
  8. +
+
+ +
+

in-memory interoperability advantages

+
    +
  • no need to write & read results
  • +
  • useful when you need a limited number of functions in another language
  • +
+
+ +
+

in-memory interoperability drawbacks

+
    +
  • you do not always have access to all classes
  • +
  • data duplication
  • +
  • you need to manage the environments
  • +
+
+ +
+

Pitfalls when using Python and R

+

Column-major vs row-major matrices. In R, every dense matrix is stored in column-major order

+ +
+ +
+

Pitfalls when using Python and R

+

Indexing

+ +
+ +
+

Pitfalls when using Python and R

+

dots and underscores

+
    +
  • mapping in rpy2
  • +
+
from rpy2.robjects.packages import importr
+
+d = {'package.dependencies': 'package_dot_dependencies',
+     'package_dependencies': 'package_uscore_dependencies'}
+tools = importr('tools', robject_translations = d)
+
+ +
+

Pitfalls when using Python and R

+

Integers

+
library(reticulate)
+bi <- reticulate::import_builtins()
+
+bi$list(bi$range(0, 5))
+# TypeError: 'float' object cannot be interpreted as an integer
+
library(reticulate)
+bi <- reticulate::import_builtins()
+
+bi$list(bi$range(0L, 5L))
+# [1] 0 1 2 3 4
+
+ +
+

Rpy2: basics

+
    +
  • Accessing R from Python +
      +
    • rpy2.rinterface, the low-level interface
    • +
    • rpy2.robjects, the high-level interface
    • +
  • +
+
+
import rpy2
+import rpy2.robjects as robjects
+
+vector = robjects.IntVector([1,2,3])
+rsum = robjects.r['sum']
+
+rsum(vector)
+
+ + IntVector with 1 elements. + + + + + + + +
6
+ + +
+
+
+ +
+

Rpy2: basics

+
+
str_vector = robjects.StrVector(['abc', 'def', 'ghi'])
+flt_vector = robjects.FloatVector([0.3, 0.8, 0.7])
+int_vector = robjects.IntVector([1, 2, 3])
+mtx = robjects.r.matrix(robjects.IntVector(range(10)), nrow=5)
+print(mtx)
+
+
     [,1] [,2]
+[1,]    0    5
+[2,]    1    6
+[3,]    2    7
+[4,]    3    8
+[5,]    4    9
+
+
+
+
+ +
+

Rpy2: numpy

+
+
import numpy as np
+
+from rpy2.robjects import numpy2ri
+from rpy2.robjects import default_converter
+
+rd_m = np.random.random((5, 4))
+
+with (default_converter + numpy2ri.converter).context():
+    mtx = robjects.r.matrix(rd_m, nrow = 5)
+    print(mtx)
+
+
[[0.69525594 0.29780005 0.41267065 0.25871805]
+ [0.88313251 0.79471121 0.5369112  0.24752835]
+ [0.68812232 0.24265455 0.51419239 0.80029227]
+ [0.43218943 0.37441082 0.05505875 0.23599726]
+ [0.58236939 0.34859652 0.14651556 0.24370712]]
+
+
+
+ +
+

Rpy2: pandas

+
+
import pandas as pd
+
+from rpy2.robjects import pandas2ri
+
+pd_df = pd.DataFrame({'int_values': [1,2,3],
+                      'str_values': ['abc', 'def', 'ghi']})
+
+with (default_converter + pandas2ri.converter).context():
+    pd_df_r = robjects.DataFrame(pd_df)
+    print(pd_df_r)
+
+
  int_values str_values
+0          1        abc
+1          2        def
+2          3        ghi
+
+
+
+
+ +
+

Rpy2: sparse matrices

+
+
import scipy as sp
+
+from anndata2ri import scipy2ri
+
+sparse_matrix = sp.sparse.csc_matrix(rd_m)
+
+with (default_converter + scipy2ri.converter).context():
+    sp_r = scipy2ri.py2rpy(sparse_matrix)
+    print(sp_r)
+
+
5 x 4 sparse Matrix of class "dgCMatrix"
+                                             
+[1,] 0.6952559 0.2978000 0.41267065 0.2587180
+[2,] 0.8831325 0.7947112 0.53691120 0.2475283
+[3,] 0.6881223 0.2426546 0.51419239 0.8002923
+[4,] 0.4321894 0.3744108 0.05505875 0.2359973
+[5,] 0.5823694 0.3485965 0.14651556 0.2437071
+
+
+
+
+ +
+

Rpy2: anndata

+
import anndata as ad
+import scanpy.datasets as scd
+
+import anndata2ri
+
+adata_paul = scd.paul15()
+
+with anndata2ri.converter.context():
+    sce = anndata2ri.py2rpy(adata_paul)
+    ad2 = anndata2ri.rpy2py(sce)
+
+ +
+

Rpy2: interactivity

+
%load_ext rpy2.ipython  # line magic that loads the rpy2 ipython extension.
+                        # this extension allows the use of the following cell magic
+
+%%R -i input -o output  # this line lets you specify inputs 
+                        # (which will be converted to R objects) and outputs 
+                        # (which will be converted back to Python objects) 
+                        # this line is put at the start of a cell
+                        # the rest of the cell will be run as R code
+
+ +
+

Reticulate

+ +
+ +
+

Reticulate

+
library(reticulate)
+
+bi <- reticulate::import_builtins()
+rd <- reticulate::import("random")
+
+example <- c(1,2,3)
+bi$max(example)
+# [1] 3
+rd$choice(example)
+# [1] 2
+cat(bi$list(bi$reversed(example)))
+# [1] 3 2 1
+
+ +
+

Reticulate numpy

+
np <- reticulate::import("numpy")
+
+a <- np$asarray(tuple(list(1,2), list(3, 4)))
+b <- np$asarray(list(5,6))
+b <- np$reshape(b, newshape = tuple(1L,2L))
+
+np$concatenate(tuple(a, b), axis=0L)
+#      [,1] [,2]
+# [1,]    1    2
+# [2,]    3    4
+# [3,]    5    6
+
+ +
+

Reticulate conversion

+
np <- reticulate::import("numpy", convert = FALSE)
+
+a <- np$asarray(tuple(list(1,2), list(3, 4)))
+b <- np$asarray(list(5,6))
+b <- np$reshape(b, newshape = tuple(1L,2L))
+
+np$concatenate(tuple(a, b), axis=0L)
+# array([[1., 2.],
+#        [3., 4.],
+#        [5., 6.]])
+

You can explicitly convert data types:

+
result <- np$concatenate(tuple(a, b), axis=0L)
+
+py_to_r(result)
+#      [,1] [,2]
+# [1,]    1    2
+# [2,]    3    4
+# [3,]    5    6
+
+result_r <- py_to_r(result)
+r_to_py(result_r)
+# array([[1., 2.],
+#        [3., 4.],
+#        [5., 6.]])
+
+ +
+

Reticulate scanpy

+
library(anndata)
+library(reticulate)
+sc <- import("scanpy")
+
+adata_path <- "../usecase/data/sc_counts_subset.h5ad"
+adata <- anndata::read_h5ad(adata_path)
+

We can preprocess & analyse the data:

+
sc$pp$filter_cells(adata, min_genes = 200)
+sc$pp$filter_genes(adata, min_cells = 3)
+sc$pp$pca(adata)
+sc$pp$neighbors(adata)
+sc$tl$umap(adata)
+
+adata
+# AnnData object with n_obs × n_vars = 32727 × 20542
+#     obs: 'dose_uM', 'timepoint_hr', 'well', 'row', 'col', 'plate_name', 'cell_id', 'cell_type', 'split', 'donor_id', 'sm_name', 'control', 'SMILES', 'sm_lincs_id', 'library_id', 'leiden_res1', 'group', 'cell_type_orig', 'plate_well_celltype_reannotated', 'cell_count_by_well_celltype', 'cell_count_by_plate_well', 'n_genes'
+#     var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'n_cells'
+#     uns: 'cell_type_colors', 'celltypist_celltype_colors', 'donor_id_colors', 'hvg', 'leiden_res1_colors', 'log1p', 'neighbors', 'over_clustering', 'rank_genes_groups', 'pca', 'umap'
+#     obsm: 'HTO_clr', 'X_pca', 'X_umap', 'protein_counts'
+#     varm: 'PCs'
+#     obsp: 'connectivities', 'distances'
+
+ +
+
+

Disk-based interoperability

+ +
+
+

General single cell file formats of interest for Python and R

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
File FormatPythonRSparse matrixLarge imagesLazy chunk loadingRemote storage
RDS
Pickle
CSV
JSON
TIFF
.npy
Parquet
Feather
Lance
HDF5
Zarr
TileDB
+
+
+

Specialized single cell file formats of interest for Python and R

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
File FormatPythonRSparse matrixLarge imagesLazy chunk loadingRemote storage
Seurat RDS
Indexed OME-TIFF
h5Seurat
Loom HDF5
AnnData h5ad
AnnData Zarr
TileDB-SOMA
TileDB-BioImaging
SpatialData Zarr
+
+
+

Workflows

+ +
+ +
+

Takeaways

+
+ +
+
+
+
+ + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file