import rpy2
import rpy2.robjects as robjects
@@ -1437,7 +1438,7 @@ Rpy2: basics
Rpy2: basics
-
+
str_vector = robjects.StrVector(['abc', 'def', 'ghi'])
flt_vector = robjects.FloatVector([0.3, 0.8, 0.7])
int_vector = robjects.IntVector([1, 2, 3])
@@ -1457,7 +1458,7 @@ Rpy2: basics
Rpy2: numpy
-
+
import numpy as np
from rpy2.robjects import numpy2ri
@@ -1469,18 +1470,18 @@ Rpy2: numpy
mtx = robjects.r.matrix(rd_m, nrow = 5)
print(mtx)
-[[0.69525594 0.29780005 0.41267065 0.25871805]
- [0.88313251 0.79471121 0.5369112 0.24752835]
- [0.68812232 0.24265455 0.51419239 0.80029227]
- [0.43218943 0.37441082 0.05505875 0.23599726]
- [0.58236939 0.34859652 0.14651556 0.24370712]]
+[[0.73294749 0.55953375 0.69944132 0.52744075]
+ [0.09756794 0.39535684 0.80669803 0.10540606]
+ [0.35662206 0.70148737 0.12002733 0.28026677]
+ [0.19947608 0.84421019 0.82702188 0.82531633]
+ [0.56938249 0.04640811 0.34178679 0.3285883 ]]
Rpy2: pandas
-
+
import pandas as pd
from rpy2.robjects import pandas2ri
@@ -1503,7 +1504,7 @@ Rpy2: pandas
Rpy2: sparse matrices
-
+
import scipy as sp
from anndata2ri import scipy2ri
@@ -1515,12 +1516,12 @@ Rpy2: sparse matrices
print(sp_r)
5 x 4 sparse Matrix of class "dgCMatrix"
-
-[1,] 0.6952559 0.2978000 0.41267065 0.2587180
-[2,] 0.8831325 0.7947112 0.53691120 0.2475283
-[3,] 0.6881223 0.2426546 0.51419239 0.8002923
-[4,] 0.4321894 0.3744108 0.05505875 0.2359973
-[5,] 0.5823694 0.3485965 0.14651556 0.2437071
+
+[1,] 0.73294749 0.55953375 0.6994413 0.5274408
+[2,] 0.09756794 0.39535684 0.8066980 0.1054061
+[3,] 0.35662206 0.70148737 0.1200273 0.2802668
+[4,] 0.19947608 0.84421019 0.8270219 0.8253163
+[5,] 0.56938249 0.04640811 0.3417868 0.3285883
@@ -1641,10 +1642,33 @@ Reticulate scanpy
# obsp: 'connectivities', 'distances'
-
Disk-based interoperability
+Disk-based interoperability is a strategy for achieving interoperability between tools written in different programming languages by storing intermediate results in standardized, language-agnostic file formats.
+
+- Upside:
+
+- Simple, just add reading and writing lines
+- Modular scripts
+
+- Downside:
+
+- increased disk usage
+- less direct interaction, debugging…
+
+
+
+
+
+Important features of interoperable file formats
+
+- Compression
+- Sparse matrix support
+- Large images
+- Lazy chunk loading
+- Remote storage
+
General single cell file formats of interest for Python and R
@@ -1871,9 +1895,69 @@ Specialized single cell file formats of interest for Python and R
+
+
+Disk-based pipelines
+Script pipeline:
+#!/bin/bash
+
+bash scripts/1_load_data.sh
+python scripts/2_compute_pseudobulk.py
+Rscript scripts/3_analysis_de.R
+Notebook pipeline:
+
+
+
+Just stay in your language and call scripts
+
+
+
+
+Pipelines with different environments
+
+- interleave with environment (de)activation functions
+- use rvenv
+- use Pixi
+
+
+
+Pixi to manage different environments
+
+
+
+Define tasks in Pixi
+...
+[feature.bash.tasks]
+load_data = "bash book/disk_based/scripts/1_load_data.sh"
+...
+[feature.scverse.tasks]
+compute_pseudobulk = "python book/disk_based/scripts/2_compute_pseudobulk.py"
+...
+[feature.rverse.tasks]
+analysis_de = "Rscript --no-init-file book/disk_based/scripts/3_analysis_de.R"
+...
+[tasks]
+pipeline = { depends-on = ["load_data", "compute_pseudobulk", "analysis_de"] }
+
+
+
+Also possible to use containers
+docker pull berombau/polygloty-docker:latest
+docker run -it -v $(pwd)/usecase:/app/usecase -v $(pwd)/book:/app/book berombau/polygloty-docker:latest pixi run pipeline
+Another approach is to use multi-package containers to create custom combinations of packages. - Multi-Package BioContainers - Seqera Containers
+
Workflows
-
+You can go a long way with a folder of notebooks or scripts and the right tools. But as your project grows more bespoke, it can be worth the effort to use a workflow framework like Viash, Nextflow or Snakemake to manage the pipeline for you.
+See https://saeyslab.github.io/polygloty/book/workflow_frameworks/
diff --git a/slides/slides.pdf b/slides/slides.pdf
new file mode 100644
index 0000000..2394fbb
Binary files /dev/null and b/slides/slides.pdf differ
diff --git a/slides/slides.qmd b/slides/slides.qmd
index 6b31b39..632f29d 100644
--- a/slides/slides.qmd
+++ b/slides/slides.qmd
@@ -29,7 +29,7 @@ execute:
We will be focusing on R & Python
-## Summary
+# Summary
**Interoperability** between languages allows analysts to take advantage of the strengths of different ecosystems
@@ -342,13 +342,13 @@ adata
Disk-based interoperability is a strategy for achieving interoperability between tools written in different programming languages by **storing intermediate results in standardized, language-agnostic file formats**.
-Upside:
-- Simple, just add reading and witing lines
-- Modular scripts
+- Upside:
+  - Simple, just add reading and writing lines
+ - Modular scripts
-Downside:
-- increased disk usage
-- less direct interaction, debugging...
+- Downside:
+ - increased disk usage
+ - less direct interaction, debugging...
# Important features of interoperable file formats