scverse · LouiseDck · Dec 4, 2024 · Dec 4, 2024 · Dec 4, 2024 · Dec 5, 2024
diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml
@@ -64,6 +64,14 @@ jobs:
           reticulate::py_install(c("anndata", "scanpy", "dummy-anndata"), pip = TRUE)
         shell: Rscript {0}
 
+      - name: Install h5diff
+        run: |
+          if [ "$RUNNER_OS" == "Linux" ]; then
+                sudo apt-get install hdf5-tools
+          fi
+        shell: bash
+
+
       - uses: r-lib/actions/check-r-package@v2
         with:
           upload-snapshots: true
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -83,7 +83,8 @@ Suggests:
     testthat (>= 3.0.0),
     vctrs,
     withr,
-    yaml
+    yaml,
+    processx
 VignetteBuilder: 
     knitr
 Config/Needs/website: pkgdown, tibble, knitr, rprojroot, stringr, readr,

diff --git a/R/generate_matrix.R b/R/generate_matrix.R
@@ -1,73 +1,82 @@
 # nolint start
+generate_numeric_matrix <- function(n_obs, n_vars, NAs = FALSE) {
+  # byrow = TRUE to mimic the way a matrix gets filled in Python
+  m <- matrix(seq(0.5, n_obs * n_vars), nrow = n_obs, ncol = n_vars, byrow = TRUE) 
+  if (NAs) {
+    m[1, 1] <- NA_real_
+  }
+  m
+}
+
+generate_integer_matrix <- function(n_obs, n_vars, NAs = FALSE) {
+  # byrow = TRUE to mimic the way a matrix gets filled in Python
+  m <- matrix(seq(0L, n_obs * n_vars - 1), nrow = n_obs, ncol = n_vars, byrow = TRUE)
+  if (NAs) {
+    m[1, 1] <- NA_integer_
+  }
+  m
+}
+
 matrix_generators <- list(
   numeric_matrix = function(n_obs, n_vars) {
-    matrix(runif(n_obs * n_vars), nrow = n_obs, ncol = n_vars)
+    generate_numeric_matrix(n_obs, n_vars)
   },
   numeric_dense = function(n_obs, n_vars) {
-    m <- matrix(runif(n_obs * n_vars), nrow = n_obs, ncol = n_vars)
+    m <- generate_numeric_matrix(n_obs, n_vars)
     as(m, "denseMatrix")
   },
   numeric_csparse = function(n_obs, n_vars) {
-    m <- Matrix::rsparsematrix(nrow = n_obs, ncol = n_vars, density = .1)
+    m <- generate_numeric_matrix(n_obs, n_vars)
     as(m, "CsparseMatrix")
   },
   numeric_rsparse = function(n_obs, n_vars) {
-    m <- Matrix::rsparsematrix(nrow = n_obs, ncol = n_vars, density = .1)
+    m <- generate_numeric_matrix(n_obs, n_vars)
     as(m, "RsparseMatrix")
   },
   numeric_matrix_with_nas = function(n_obs, n_vars) {
-    m <- matrix(runif(n_obs * n_vars), nrow = n_obs, ncol = n_vars)
-    m[seq(1, n_obs * n_vars, by = 2)] <- NA_real_
-    m
+    generate_numeric_matrix(n_obs, n_vars, NAs = TRUE)
   },
   numeric_dense_with_nas = function(n_obs, n_vars) {
-    m <- matrix(runif(n_obs * n_vars), nrow = n_obs, ncol = n_vars)
-    m[seq(1, n_obs * n_vars, by = 2)] <- NA_real_
+    m <- generate_numeric_matrix(n_obs, n_vars, NAs = TRUE)
     as(m, "denseMatrix")
   },
   numeric_csparse_with_nas = function(n_obs, n_vars) {
-    m <- Matrix::rsparsematrix(nrow = n_obs, ncol = n_vars, density = .1)
-    m[seq(1, n_obs * n_vars, by = 2)] <- NA_real_
+    m <- generate_numeric_matrix(n_obs, n_vars, NAs = TRUE)
     as(m, "CsparseMatrix")
   },
   numeric_rsparse_with_nas = function(n_obs, n_vars) {
-    m <- Matrix::rsparsematrix(nrow = n_obs, ncol = n_vars, density = .1)
-    m[seq(1, n_obs * n_vars, by = 2)] <- NA_real_
+    m <- generate_numeric_matrix(n_obs, n_vars, NAs = TRUE)
     as(m, "RsparseMatrix")
   },
   integer_matrix = function(n_obs, n_vars) {
-    matrix(sample.int(100L, n_obs * n_vars, replace = TRUE), nrow = n_obs, ncol = n_vars)
+    generate_integer_matrix(n_obs, n_vars)
   },
   integer_dense = function(n_obs, n_vars) {
-    m <- matrix(sample.int(100L, n_obs * n_vars, replace = TRUE), nrow = n_obs, ncol = n_vars)
+    m <- generate_integer_matrix(n_obs, n_vars)
     as(m, "denseMatrix")
   },
   integer_csparse = function(n_obs, n_vars) {
-    m <- Matrix::rsparsematrix(nrow = n_obs, ncol = n_vars, density = .1)
+    m <- generate_integer_matrix(n_obs, n_vars)
     as(m, "CsparseMatrix")
   },
   integer_rsparse = function(n_obs, n_vars) {
-    m <- Matrix::rsparsematrix(nrow = n_obs, ncol = n_vars, density = .1)
+    m <- generate_integer_matrix(n_obs, n_vars)
     as(m, "RsparseMatrix")
   },
   integer_matrix_with_nas = function(n_obs, n_vars) {
-    m <- matrix(sample.int(100L, n_obs * n_vars, replace = TRUE), nrow = n_obs, ncol = n_vars)
-    m[seq(1, n_obs * n_vars, by = 2)] <- NA_integer_
+    m <- generate_integer_matrix(n_obs, n_vars, NAs = TRUE)
     m
   },
   integer_dense_with_nas = function(n_obs, n_vars) {
-    m <- matrix(sample.int(100L, n_obs * n_vars, replace = TRUE), nrow = n_obs, ncol = n_vars)
-    m[seq(1, n_obs * n_vars, by = 2)] <- NA_integer_
+    m <- generate_integer_matrix(n_obs, n_vars, NAs = TRUE)
     as(m, "denseMatrix")
   },
   integer_csparse_with_nas = function(n_obs, n_vars) {
-    m <- Matrix::rsparsematrix(nrow = n_obs, ncol = n_vars, density = .1)
-    m[seq(1, n_obs * n_vars, by = 2)] <- NA_integer_
+    m <- generate_integer_matrix(n_obs, n_vars, NAs = TRUE)
     as(m, "CsparseMatrix")
   },
   integer_rsparse_with_nas = function(n_obs, n_vars) {
-    m <- Matrix::rsparsematrix(nrow = n_obs, ncol = n_vars, density = .1)
-    m[seq(1, n_obs * n_vars, by = 2)] <- NA_integer_
+    m <- generate_integer_matrix(n_obs, n_vars, NAs = TRUE)
     as(m, "RsparseMatrix")
   }
 )

diff --git a/R/generate_vector.R b/R/generate_vector.R
@@ -1,28 +1,28 @@
 vector_generators <- list(
-  character = function(n) paste0("value", seq_len(n)),
-  integer = function(n) seq_len(n),
-  factor = function(n) factor(paste0("value", seq_len(n))),
-  factor_ordered = function(n) factor(paste0("value", seq_len(n)), ordered = TRUE),
+  character = function(n) paste0("value_", seq(from = 0, to = n - 1)),
+  integer = function(n) seq(from = 0, to = n - 1),
+  factor = function(n) factor(rep(c("Value1", "Value2"), length.out = n)),
+  factor_ordered = function(n) factor(rep(c("Value1", "Value2"), length.out = n), ordered = TRUE),
   logical = function(n) sample(c(TRUE, FALSE), n, replace = TRUE),
-  numeric = function(n) runif(n),
+  numeric = function(n) seq(from = 0.5, to = n),
   character_with_nas = function(n) {
     x <- paste0("value", seq_len(n))
     x[seq(1, n, by = 2)] <- NA_character_
     x
   },
   integer_with_nas = function(n) {
-    x <- seq_len(n)
-    x[seq(1, n, by = 2)] <- NA_integer_
+    x <- seq(from = 0, to = n - 1)
+    x[1] <- NA_integer_
     x
   },
   factor_with_nas = function(n) {
-    x <- factor(paste0("value", seq_len(n)))
-    x[seq(1, n, by = 2)] <- NA_character_
+    x <- factor(rep(c("Value1", "Value2"), length.out = n))
+    x[1] <- NA_character_
     x
   },
   factor_ordered_with_nas = function(n) {
-    x <- factor(paste0("value", seq_len(n)), ordered = TRUE)
-    x[seq(1, n, by = 2)] <- NA_character_
+    x <- factor(rep(c("Value1", "Value2"), length.out = n), ordered = TRUE)
+    x[1] <- NA_character_
     x
   },
   logical_with_nas = function(n) {

diff --git a/inst/known_issues.yaml b/inst/known_issues.yaml
@@ -72,3 +72,107 @@ known_issues:
     proposed_solution: The input checking function for obsm and varm should allow the object to be a vector of the correct length instead of only a matrix or a data frame.
     to_investigate: True
     to_fix: True
+  - backend: HDF5AnnData
+    slot:
+      - X
+    dtype:
+      - float_csparse
+      - float_csparse_nas
+    process: [h5diff]
+    error_message: |
+      Warning: different storage datatype
+      <shape> has file datatype H5T_STD_I64LE
+      <shape> has file datatype H5T_STD_I32LE
+      attribute: <shape of </X>> and <shape of </X>>  
+    description: hdf5py writes the <shape> attribute as a H5T_STD_I64LE, hdf5r writes it as H5T_STD_I32LE.
+    proposed_solution: We should investigate if we can specify the type with which an attribute should be written.
+    to_investigate: True
+    to_fix: True
+  - backend: HDF5AnnData
+    slot:
+      - X
+      - obsm
+      - varm
+      - obsp
+      - varp
+    dtype:
+      - float_csparse
+      - float_csparse_nas
+      - float_rsparse
+      - float_rsparse_nas
+    process: [h5diff]
+    error_message: |
+      dataset: </X/data> and </X/data>
+      Not comparable: </X/data> has rank 1, dimensions [200], max dimensions [18446744073709551615]
+      and </X/data> has rank 1, dimensions [108], max dimensions [108]
+      0 differences found
+      dataset: </X/indices> and </X/indices>
+      Not comparable: </X/indices> has rank 1, dimensions [200], max dimensions [18446744073709551615]
+      and </X/indices> has rank 1, dimensions [108], max dimensions [108]
+      0 differences found
+      dataset: </X/indptr> and </X/indptr>
+      Warning: different maximum dimensions
+      </X/indptr> has max dimensions [18446744073709551615]
+      </X/indptr> has max dimensions [21]
+    description: hdf5py has max dimensions as 2^64 - 1, the max val for an unsigned int. hdf5r has it as the actual value
+    proposed_solution: We should investigate if something goes wrong with h5py, but I think hdf5 provides the expected behaviour.
+    to_investigate: True
+    to_fix: False
+  - backend: HDF5AnnData
+    slot:
+      - obs
+      - var
+    dtype:
+      - integer_with_nas
+    process: [h5diff]
+    error_message: |
+      dataset: </var/nullable_integer_array/mask> and </var/integer_with_nas/mask>
+      Warning: different storage datatype
+      Not comparable: </var/nullable_integer_array/mask> has sign H5T_SGN_2 and </var/integer_with_nas/mask> has sign H5T_SGN_NONE
+      0 differences found
+    description: hdf5py writes a nullable integer array with type H5T_SGN_2, hdf5r writes with type H5T_SGN_NONE
+    proposed_solution: We should investigate if we can specify the type with which an attribute should be written.
+    to_investigate: True
+    to_fix: True
+  - backend: HDF5AnnData
+    slot:
+      - obs
+      - var
+    dtype:
+      - nullable_integer_array
+    process: [h5diff]
+    error_message: |
+      dataset: </var/nullable_integer_array/values> and </var/integer_with_nas/values>
+      Warning: different storage datatype
+      </var/nullable_integer_array/values> has file datatype H5T_STD_I64LE
+      </var/integer_with_nas/values> has file datatype H5T_STD_I32LE
+      size:           [20]           [20]
+      position        values          values          difference          
+      ------------------------------------------------------------
+      [ 0 ]          0               1               1              
+      1 differences found
+    description: hdf5py writes a nullable integer array with type H5T_STD_I64LE, hdf5r writes with type H5T_STD_I32LE
+    proposed_solution: We should investigate if we can specify the type with which an attribute should be written.
+    to_investigate: True
+    to_fix: True
+  - backend: HDF5AnnData
+    slot:
+      - obs
+      - var
+    dtype:
+      - nullable_integer_array
+    process: [h5diff]
+    error_message: |
+      dataset: </var/nullable_integer_array/values> and </var/integer_with_nas/values>
+      Warning: different storage datatype
+      </var/nullable_integer_array/values> has file datatype H5T_STD_I64LE
+      </var/integer_with_nas/values> has file datatype H5T_STD_I32LE
+      size:           [20]           [20]
+      position        values          values          difference          
+      ------------------------------------------------------------
+      [ 0 ]          0               1               1              
+      1 differences found
+    description: On position 0, hdf5py writes a 0 in the values array, hdf5r writes a 1.
+    proposed_solution: We should investigate why this difference happens.
+    to_investigate: True
+    to_fix: True
diff --git a/tests/testthat/helper-py-R-equivalences.R b/tests/testthat/helper-py-R-equivalences.R
@@ -0,0 +1,64 @@
+# python, R
+# does not test numeric_dense, numeric_dense_with_nas or integer_dense
+matrix_equivalences <- list(
+  c("float_matrix", "numeric_matrix"),
+  c("float_matrix_nas", "numeric_matrix_with_nas"),
+  c("integer_matrix", "integer_matrix"),
+  c("float_csparse", "numeric_csparse"),
+  c("float_csparse_nas", "numeric_csparse_with_nas"),
+  c("float_rsparse", "numeric_rsparse"),
+  c("float_rsparse_nas", "numeric_rsparse_with_nas")
+)
+
+# python, R
+vector_equivalences <- list(
+  c("categorical", "factor"),
+  c("categorical_ordered", "factor_ordered"),
+  c("categorical_missing_values", "factor_with_nas"),
+  c("categorical_ordered_missing_values", "factor_ordered_with_nas"),
+  c("string_array", "character"),
+  c("dense_array", "numeric"),
+  c("integer_array", "integer"),
+  c("boolean_array", "logical"),
+  c("nullable_integer_array", "integer_with_nas"),
+  c("nullable_boolean_array", "logical_with_nas")
+)
+
+all_equivalences <- c(matrix_equivalences, vector_equivalences)
+
+check_arg <- function(args, name, falseval) {
+  if (name %in% names(args)) {
+    args[[name]][[1]]
+  } else {
+    falseval
+  }
+}
+
+r_generate_dataset <- function(n_obs, n_vars, write = FALSE, ...) {
+  args <- list(...)
+
+  data <- generate_dataset(n_obs, n_vars,
+                           x_type = check_arg(args, "x_type", "numeric_matrix"),
+                           layer_types = check_arg(args, "layer_types", character()),
+                           obs_types = ifelse("obs_types" %in% names(args), args$obs_types, "integer"),
+                           var_types = ifelse("var_types" %in% names(args), args$var_types, "integer"),
+                           obsm_types = check_arg(args, "obsm_types", character()),
+                           varm_types = check_arg(args, "varm_types", character()),
+                           obsp_types = check_arg(args, "obsp_types", character()),
+                           varp_types = check_arg(args, "varp_types", character()),
+                           uns_types = check_arg(args, "uns_types", character()),
+                           format = "AnnData")
+  if (write) {
+    r_write_dataset(data)
+  }
+
+  data
+}
+
+r_write_dataset <- function(dataset, file = NULL) {
+  if (is.null(file)) {
+    file <- tempfile(pattern = "hdf5_write_R_", fileext = ".h5ad")
+  }
+  write_h5ad(dataset, file)
+  file
+}
diff --git a/tests/testthat/helper-skip_if_no_h5diff.R b/tests/testthat/helper-skip_if_no_h5diff.R
@@ -0,0 +1,7 @@
+# helper function to skip tests if h5diff is not available
+skip_if_no_h5diff <- function() {
+  testthat::skip_if_not({
+    s <- system2(command = "which", args = "h5diff", stdout = TRUE, stderr = TRUE)
+    is.null(attr(s, "status"))
+  }, message = "h5diff not available for testing")
+}
diff --git a/tests/testthat/test-roundtrip-X.R b/tests/testthat/test-roundtrip-X.R
@@ -33,6 +33,7 @@ for (name in test_names) {
   # create a couple of paths
   file_py <- withr::local_file(tempfile(paste0("anndata_py_", name), fileext = ".h5ad"))
   file_r <- withr::local_file(tempfile(paste0("anndata_r_", name), fileext = ".h5ad"))
+  file_r2 <- withr::local_file(tempfile(paste0("anndata_r2_", name), fileext = ".h5ad"))
 
   # write to file
   adata_py$write_h5ad(file_py)
@@ -101,4 +102,35 @@ for (name in test_names) {
       adata_py$X
     )
   })
+
+  skip_if_no_h5diff()
+  # Get all R datatypes that are equivalent to the python datatype (name)
+  res <- Filter(function(x) x[[1]] == name, matrix_equivalences)
+  r_datatypes <- sapply(res, function(x) x[[2]])
+
+  # nolint start
+  for (r_name in r_datatypes) {
+    test_that(paste0("Comparing a python generated .h5ad with X '", name, 
+                      "' with an R generated .h5ad '", r_name, "' works"), {
+      msg <- message_if_known(
+        backend = "HDF5AnnData",
+        slot = c("X"),
+        dtype = name,
+        process = c("h5diff"),
+        known_issues = known_issues
+      )
+      skip_if(!is.null(msg), message = msg)
+
+      # generate an R h5ad
+      adata_r <- r_generate_dataset(10L, 20L, x_type = list(r_name))
+      write_h5ad(adata_r, file_r2)
+
+      # run h5diff
+      res <- processx::run("h5diff", c("-v", file_py, file_r2, "/X"), error_on_status = FALSE)
+
+      expect_equal(res$status, 0, info = res$stdout)
+
+    })
+  }
+  # nolint end
 }