Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Test writing h5ad in Python & in R #207

Open
wants to merge 21 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .github/workflows/R-CMD-check.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,14 @@ jobs:
reticulate::py_install(c("anndata", "scanpy", "dummy-anndata"), pip = TRUE)
shell: Rscript {0}

- name: Install h5diff
run: |
if [ "$RUNNER_OS" == "Linux" ]; then
sudo apt-get install hdf5-tools
fi
shell: bash


- uses: r-lib/actions/check-r-package@v2
with:
upload-snapshots: true
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,8 @@ Suggests:
testthat (>= 3.0.0),
vctrs,
withr,
yaml
yaml,
processx
VignetteBuilder:
knitr
Config/Needs/website: pkgdown, tibble, knitr, rprojroot, stringr, readr,
Expand Down
59 changes: 34 additions & 25 deletions R/generate_matrix.R
Original file line number Diff line number Diff line change
@@ -1,73 +1,82 @@
# nolint start
# Generate a deterministic numeric (double) test matrix.
#
# The cells hold 0.5, 1.5, 2.5, ... in row-major order.
#
# n_obs:  number of rows.
# n_vars: number of columns.
# NAs:    if TRUE, cell [1, 1] is replaced by NA_real_.
#
# Returns an n_obs x n_vars double matrix.
generate_numeric_matrix <- function(n_obs, n_vars, NAs = FALSE) {
  # seq_len() rather than seq(0.5, n): the latter still returns one element
  # when n == 0, whereas seq_len(0) is empty.
  values <- seq_len(n_obs * n_vars) - 0.5
  # byrow = TRUE to mimic the way a matrix gets filled in Python
  m <- matrix(values, nrow = n_obs, ncol = n_vars, byrow = TRUE)
  # A zero-extent matrix has no [1, 1] cell to overwrite
  if (NAs && length(m) > 0L) {
    m[1, 1] <- NA_real_
  }
  m
}

# Generate a deterministic integer test matrix.
#
# The cells hold 0, 1, 2, ... in row-major order.
#
# n_obs:  number of rows.
# n_vars: number of columns.
# NAs:    if TRUE, cell [1, 1] is replaced by NA_integer_.
#
# Returns an n_obs x n_vars integer matrix.
generate_integer_matrix <- function(n_obs, n_vars, NAs = FALSE) {
  # seq_len() rather than seq(0L, n - 1): for n == 0 the latter counts down
  # and yields c(0L, -1L), which makes matrix() warn about non-empty data
  # for a zero-extent matrix.
  values <- seq_len(n_obs * n_vars) - 1L
  # byrow = TRUE to mimic the way a matrix gets filled in Python
  m <- matrix(values, nrow = n_obs, ncol = n_vars, byrow = TRUE)
  # A zero-extent matrix has no [1, 1] cell to overwrite
  if (NAs && length(m) > 0L) {
    m[1, 1] <- NA_integer_
  }
  m
}

# Named list of matrix generators.
#
# Every entry is a function(n_obs, n_vars) that returns an n_obs x n_vars
# matrix. The values always come from generate_numeric_matrix() or
# generate_integer_matrix(), so the output is deterministic. The suffix of
# each name states the container the values are coerced to with as()
# ("denseMatrix", "CsparseMatrix", "RsparseMatrix") and whether the
# [1, 1] cell is set to NA ("_with_nas").
#
# The random matrices that used to be built first (runif(), sample.int(),
# Matrix::rsparsematrix()) were overwritten before use and are removed.
matrix_generators <- list(
  # --- numeric (double) values -------------------------------------------
  numeric_matrix = function(n_obs, n_vars) {
    generate_numeric_matrix(n_obs, n_vars)
  },
  numeric_dense = function(n_obs, n_vars) {
    m <- generate_numeric_matrix(n_obs, n_vars)
    as(m, "denseMatrix")
  },
  numeric_csparse = function(n_obs, n_vars) {
    m <- generate_numeric_matrix(n_obs, n_vars)
    as(m, "CsparseMatrix")
  },
  numeric_rsparse = function(n_obs, n_vars) {
    m <- generate_numeric_matrix(n_obs, n_vars)
    as(m, "RsparseMatrix")
  },
  numeric_matrix_with_nas = function(n_obs, n_vars) {
    generate_numeric_matrix(n_obs, n_vars, NAs = TRUE)
  },
  numeric_dense_with_nas = function(n_obs, n_vars) {
    m <- generate_numeric_matrix(n_obs, n_vars, NAs = TRUE)
    as(m, "denseMatrix")
  },
  numeric_csparse_with_nas = function(n_obs, n_vars) {
    m <- generate_numeric_matrix(n_obs, n_vars, NAs = TRUE)
    as(m, "CsparseMatrix")
  },
  numeric_rsparse_with_nas = function(n_obs, n_vars) {
    m <- generate_numeric_matrix(n_obs, n_vars, NAs = TRUE)
    as(m, "RsparseMatrix")
  },
  # --- integer values ----------------------------------------------------
  integer_matrix = function(n_obs, n_vars) {
    generate_integer_matrix(n_obs, n_vars)
  },
  integer_dense = function(n_obs, n_vars) {
    m <- generate_integer_matrix(n_obs, n_vars)
    as(m, "denseMatrix")
  },
  integer_csparse = function(n_obs, n_vars) {
    m <- generate_integer_matrix(n_obs, n_vars)
    as(m, "CsparseMatrix")
  },
  integer_rsparse = function(n_obs, n_vars) {
    m <- generate_integer_matrix(n_obs, n_vars)
    as(m, "RsparseMatrix")
  },
  integer_matrix_with_nas = function(n_obs, n_vars) {
    m <- generate_integer_matrix(n_obs, n_vars, NAs = TRUE)
    m
  },
  integer_dense_with_nas = function(n_obs, n_vars) {
    m <- generate_integer_matrix(n_obs, n_vars, NAs = TRUE)
    as(m, "denseMatrix")
  },
  integer_csparse_with_nas = function(n_obs, n_vars) {
    m <- generate_integer_matrix(n_obs, n_vars, NAs = TRUE)
    as(m, "CsparseMatrix")
  },
  integer_rsparse_with_nas = function(n_obs, n_vars) {
    m <- generate_integer_matrix(n_obs, n_vars, NAs = TRUE)
    as(m, "RsparseMatrix")
  }
)
Expand Down
22 changes: 11 additions & 11 deletions R/generate_vector.R
Original file line number Diff line number Diff line change
@@ -1,28 +1,28 @@
vector_generators <- list(
character = function(n) paste0("value", seq_len(n)),
integer = function(n) seq_len(n),
factor = function(n) factor(paste0("value", seq_len(n))),
factor_ordered = function(n) factor(paste0("value", seq_len(n)), ordered = TRUE),
character = function(n) paste0("value_", seq(from = 0, to = n - 1)),
integer = function(n) seq(from = 0, to = n - 1),
factor = function(n) factor(rep(c("Value1", "Value2"), length.out = n)),
factor_ordered = function(n) factor(rep(c("Value1", "Value2"), length.out = n), ordered = TRUE),
logical = function(n) sample(c(TRUE, FALSE), n, replace = TRUE),
numeric = function(n) runif(n),
numeric = function(n) seq(from = 0.5, to = n),
character_with_nas = function(n) {
x <- paste0("value", seq_len(n))
x[seq(1, n, by = 2)] <- NA_character_
x
},
integer_with_nas = function(n) {
x <- seq_len(n)
x[seq(1, n, by = 2)] <- NA_integer_
x <- seq(from = 0, to = n - 1)
x[1] <- NA_integer_
x
},
factor_with_nas = function(n) {
x <- factor(paste0("value", seq_len(n)))
x[seq(1, n, by = 2)] <- NA_character_
x <- factor(rep(c("Value1", "Value2"), length.out = n))
x[1] <- NA_character_
x
},
factor_ordered_with_nas = function(n) {
x <- factor(paste0("value", seq_len(n)), ordered = TRUE)
x[seq(1, n, by = 2)] <- NA_character_
x <- factor(rep(c("Value1", "Value2"), length.out = n), ordered = TRUE)
x[1] <- NA_character_
x
},
logical_with_nas = function(n) {
Expand Down
104 changes: 104 additions & 0 deletions inst/known_issues.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,107 @@ known_issues:
proposed_solution: The input checking function for obsm and varm should allow the object to be a vector of the correct length instead of only a matrix or a data frame.
to_investigate: True
to_fix: True
- backend: HDF5AnnData
slot:
- X
dtype:
- float_csparse
- float_csparse_nas
process: [h5diff]
error_message: |
Warning: different storage datatype
<shape> has file datatype H5T_STD_I64LE
<shape> has file datatype H5T_STD_I32LE
attribute: <shape of </X>> and <shape of </X>>
description: h5py writes the <shape> attribute as H5T_STD_I64LE, whereas hdf5r writes it as H5T_STD_I32LE.
proposed_solution: We should investigate if we can specify the type with which an attribute should be written.
to_investigate: True
to_fix: True
- backend: HDF5AnnData
slot:
- X
- obsm
- varm
- obsp
- varp
dtype:
- float_csparse
- float_csparse_nas
- float_rsparse
- float_rsparse_nas
process: [h5diff]
error_message: |
dataset: </X/data> and </X/data>
Not comparable: </X/data> has rank 1, dimensions [200], max dimensions [18446744073709551615]
and </X/data> has rank 1, dimensions [108], max dimensions [108]
0 differences found
dataset: </X/indices> and </X/indices>
Not comparable: </X/indices> has rank 1, dimensions [200], max dimensions [18446744073709551615]
and </X/indices> has rank 1, dimensions [108], max dimensions [108]
0 differences found
dataset: </X/indptr> and </X/indptr>
Warning: different maximum dimensions
</X/indptr> has max dimensions [18446744073709551615]
</X/indptr> has max dimensions [21]
description: h5py reports the max dimensions as 2^64 - 1 (the largest unsigned 64-bit integer, which HDF5 uses to mean unlimited), whereas hdf5r reports the actual dimension.
proposed_solution: We should investigate if something goes wrong with h5py, but I think hdf5r provides the expected behaviour.
to_investigate: True
to_fix: False
- backend: HDF5AnnData
slot:
- obs
- var
dtype:
- integer_with_nas
process: [h5diff]
error_message: |
dataset: </var/nullable_integer_array/mask> and </var/integer_with_nas/mask>
Warning: different storage datatype
Not comparable: </var/nullable_integer_array/mask> has sign H5T_SGN_2 and </var/integer_with_nas/mask> has sign H5T_SGN_NONE
0 differences found
description: h5py writes the mask of a nullable integer array with a signed type (H5T_SGN_2), whereas hdf5r writes it with an unsigned type (H5T_SGN_NONE).
proposed_solution: We should investigate if we can specify the type with which a dataset should be written.
to_investigate: True
to_fix: True
- backend: HDF5AnnData
slot:
- obs
- var
dtype:
- nullable_integer_array
process: [h5diff]
error_message: |
dataset: </var/nullable_integer_array/values> and </var/integer_with_nas/values>
Warning: different storage datatype
</var/nullable_integer_array/values> has file datatype H5T_STD_I64LE
</var/integer_with_nas/values> has file datatype H5T_STD_I32LE
size: [20] [20]
position values values difference
------------------------------------------------------------
[ 0 ] 0 1 1
1 differences found
description: h5py writes the values of a nullable integer array as H5T_STD_I64LE, whereas hdf5r writes them as H5T_STD_I32LE.
proposed_solution: We should investigate if we can specify the type with which a dataset should be written.
to_investigate: True
to_fix: True
- backend: HDF5AnnData
slot:
- obs
- var
dtype:
- nullable_integer_array
process: [h5diff]
error_message: |
dataset: </var/nullable_integer_array/values> and </var/integer_with_nas/values>
Warning: different storage datatype
</var/nullable_integer_array/values> has file datatype H5T_STD_I64LE
</var/integer_with_nas/values> has file datatype H5T_STD_I32LE
size: [20] [20]
position values values difference
------------------------------------------------------------
[ 0 ] 0 1 1
1 differences found
description: At position 0, h5py writes a 0 in the values array, whereas hdf5r writes a 1.
proposed_solution: We should investigate why this difference happens.
to_investigate: True
to_fix: True
64 changes: 64 additions & 0 deletions tests/testthat/helper-py-R-equivalences.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# python, R
# does not test numeric_dense, numeric_dense_with_nas or integer_dense
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you elaborate why not?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because of issue #198
It might be better to just document this in the known_issues.yaml though

# Pairs of equivalent matrix dtype names, one character vector per pair:
# element 1 is the Python-side dtype name, element 2 is the name of the
# matching R generator in `matrix_generators`.
# The dense R variants (numeric_dense, numeric_dense_with_nas, integer_dense)
# are intentionally not listed here.
matrix_equivalences <- list(
c("float_matrix", "numeric_matrix"),
c("float_matrix_nas", "numeric_matrix_with_nas"),
c("integer_matrix", "integer_matrix"),
c("float_csparse", "numeric_csparse"),
c("float_csparse_nas", "numeric_csparse_with_nas"),
c("float_rsparse", "numeric_rsparse"),
c("float_rsparse_nas", "numeric_rsparse_with_nas")
)

# python, R
# Pairs of equivalent vector dtype names, one character vector per pair:
# element 1 is the Python-side dtype name, element 2 is the R-side name
# (presumably a key of `vector_generators` -- TODO confirm for every entry).
vector_equivalences <- list(
c("categorical", "factor"),
c("categorical_ordered", "factor_ordered"),
c("categorical_missing_values", "factor_with_nas"),
c("categorical_ordered_missing_values", "factor_ordered_with_nas"),
c("string_array", "character"),
c("dense_array", "numeric"),
c("integer_array", "integer"),
c("boolean_array", "logical"),
c("nullable_integer_array", "integer_with_nas"),
c("nullable_boolean_array", "logical_with_nas")
)

# One flat list holding the matrix pairs followed by the vector pairs
# (c() on two lists concatenates their elements).
all_equivalences <- c(matrix_equivalences, vector_equivalences)

# Look up an optional argument in a named list.
#
# args:     named list of supplied arguments (e.g. from list(...)).
# name:     name of the argument to look up.
# falseval: value returned when `name` is not among names(args).
#
# Returns the first element of args[[name]] when the argument was supplied,
# so callers can wrap a value in list() to pass it through unchanged.
check_arg <- function(args, name, falseval) {
  supplied <- name %in% names(args)
  if (!supplied) {
    return(falseval)
  }
  value <- args[[name]]
  value[[1]]
}

# Generate an AnnData test dataset with sensible defaults for every slot.
#
# n_obs, n_vars: dimensions forwarded to generate_dataset().
# write:         if TRUE, the dataset is also written to a temporary .h5ad
#                file via r_write_dataset().
# ...:           optional overrides named x_type, layer_types, obs_types,
#                var_types, obsm_types, varm_types, obsp_types, varp_types
#                or uns_types. Each is resolved with check_arg(), which uses
#                the first element of the supplied value, so wrap a value in
#                list() to pass it through unchanged.
#
# Returns the object produced by generate_dataset(..., format = "AnnData").
r_generate_dataset <- function(n_obs, n_vars, write = FALSE, ...) {
  args <- list(...)

  # obs_types / var_types go through check_arg() like every other slot.
  # The previous scalar ifelse(..., args$obs_types, ...) relied on `$`
  # partial matching and returned a length-1 list for list-wrapped input.
  data <- generate_dataset(n_obs, n_vars,
    x_type = check_arg(args, "x_type", "numeric_matrix"),
    layer_types = check_arg(args, "layer_types", character()),
    obs_types = check_arg(args, "obs_types", "integer"),
    var_types = check_arg(args, "var_types", "integer"),
    obsm_types = check_arg(args, "obsm_types", character()),
    varm_types = check_arg(args, "varm_types", character()),
    obsp_types = check_arg(args, "obsp_types", character()),
    varp_types = check_arg(args, "varp_types", character()),
    uns_types = check_arg(args, "uns_types", character()),
    format = "AnnData"
  )
  if (write) {
    # NOTE(review): the file path returned by r_write_dataset() is discarded
    # here, so the caller cannot locate the written file -- confirm intent.
    r_write_dataset(data)
  }

  data
}

# Write a dataset to an .h5ad file with write_h5ad().
#
# dataset: object to write.
# file:    destination path; when NULL a fresh temporary file named
#          "hdf5_write_R_*.h5ad" is used.
#
# Returns the path that was written to.
r_write_dataset <- function(dataset, file = NULL) {
  target <- if (is.null(file)) {
    tempfile(pattern = "hdf5_write_R_", fileext = ".h5ad")
  } else {
    file
  }
  write_h5ad(dataset, target)
  target
}
7 changes: 7 additions & 0 deletions tests/testthat/helper-skip_if_no_h5diff.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# helper function to skip tests if h5diff is not available
# Skip the current test when the `h5diff` command-line tool is not on the
# PATH.
#
# Uses base R's Sys.which() instead of shelling out to `which`, so the check
# also works on platforms that have no `which` executable.
skip_if_no_h5diff <- function() {
  # Sys.which() returns "" for a command that cannot be found
  h5diff_path <- Sys.which("h5diff")
  testthat::skip_if_not(
    nzchar(h5diff_path),
    message = "h5diff not available for testing"
  )
}
32 changes: 32 additions & 0 deletions tests/testthat/test-roundtrip-X.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ for (name in test_names) {
# create a couple of paths
file_py <- withr::local_file(tempfile(paste0("anndata_py_", name), fileext = ".h5ad"))
file_r <- withr::local_file(tempfile(paste0("anndata_r_", name), fileext = ".h5ad"))
file_r2 <- withr::local_file(tempfile(paste0("anndata_r2_", name), fileext = ".h5ad"))

# write to file
adata_py$write_h5ad(file_py)
Expand Down Expand Up @@ -101,4 +102,35 @@ for (name in test_names) {
adata_py$X
)
})

# Get all R datatypes that are equivalent to the python datatype (name)
res <- Filter(function(x) x[[1]] == name, matrix_equivalences)
# vapply() always returns a character vector, also when nothing matches
r_datatypes <- vapply(res, function(x) x[[2]], character(1))

# nolint start
for (r_name in r_datatypes) {
  test_that(paste0("Comparing a python generated .h5ad with X '", name,
                   "' with an R generated .h5ad '", r_name, "' works"), {
    # The skips live inside test_that(): a skip raised at the top level of a
    # test file skips the remainder of the file, which would also drop the
    # roundtrip tests of every later `name` when h5diff is missing.
    skip_if_no_h5diff()
    # processx is only a suggested dependency
    skip_if_not_installed("processx")

    msg <- message_if_known(
      backend = "HDF5AnnData",
      slot = c("X"),
      dtype = name,
      process = c("h5diff"),
      known_issues = known_issues
    )
    skip_if(!is.null(msg), message = msg)

    # generate an R h5ad
    adata_r <- r_generate_dataset(10L, 20L, x_type = list(r_name))
    write_h5ad(adata_r, file_r2)

    # run h5diff on the /X group of both files; a non-zero exit status is
    # reported through the expectation instead of raising an R error
    h5diff_out <- processx::run(
      "h5diff",
      c("-v", file_py, file_r2, "/X"),
      error_on_status = FALSE
    )

    expect_equal(h5diff_out$status, 0, info = h5diff_out$stdout)
  })
}
# nolint end
}
Loading
Loading