Add PyArrow memory mapping improvements and allocator documentation (#91)
* poetry and pyarrow updates

* set jemalloc as pyarrow memory allocator

* add memory mapping for parquet reader

* remove allocator function and add notes to docs

* add links to allocators

* update pycytominer, related scipy deps for tests

* update setuptools to avoid gh actions setup issues

commit 836aa61
Author: d33bs <[email protected]>
Date:   Tue Aug 22 14:27:54 2023 -0600

    remove test branch

commit 4c68dae
Author: d33bs <[email protected]>
Date:   Tue Aug 22 14:25:24 2023 -0600

    update setuptools

commit c24f5d1
Author: d33bs <[email protected]>
Date:   Tue Aug 22 14:15:58 2023 -0600

    Update test.yml

* adding memory map override env variable + docs

Co-Authored-By: Faisal Alquaddoomi <[email protected]>

* isort linting

---------

Co-authored-by: Faisal Alquaddoomi <[email protected]>
d33bs and falquaddoomi authored Aug 22, 2023
1 parent c0d2077 commit 0563891
Showing 5 changed files with 235 additions and 273 deletions.
38 changes: 33 additions & 5 deletions cytotable/convert.py
@@ -391,9 +391,13 @@ def _prepend_column_name(

import pyarrow.parquet as parquet

from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING

targets = tuple(metadata) + tuple(compartments)

-table = parquet.read_table(source=table_path)
+table = parquet.read_table(
+    source=table_path, memory_map=CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+)

# stem of source group name
# for example:
@@ -543,6 +547,7 @@ def _concat_source_group(
import pyarrow.parquet as parquet

from cytotable.exceptions import SchemaException
from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING

# check whether we already have a file as dest_path
if pathlib.Path(dest_path).is_file():
@@ -599,7 +604,13 @@ def _concat_source_group(
# read the file from the list and write to the concatted parquet file
# note: we pass column order based on the first chunk file to help ensure schema
# compatibility for the writer
-writer.write_table(parquet.read_table(table, schema=writer_schema))
+writer.write_table(
+    parquet.read_table(
+        table,
+        schema=writer_schema,
+        memory_map=CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
+    )
+)
# remove the file which was written in the concatted parquet file (we no longer need it)
pathlib.Path(table).unlink()

@@ -646,6 +657,8 @@ def _get_join_chunks(

import pyarrow.parquet as parquet

from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING

# fetch the compartment concat result as the basis for join groups
for key, source in sources.items():
if any(name.lower() in pathlib.Path(key).stem.lower() for name in metadata):
@@ -657,7 +670,9 @@

# read only the table's chunk_columns
join_column_rows = parquet.read_table(
-    source=basis[0]["table"], columns=list(chunk_columns)
+    source=basis[0]["table"],
+    columns=list(chunk_columns),
+    memory_map=CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
).to_pylist()

# build and return the chunked join column rows
@@ -820,6 +835,8 @@ def _concat_join_sources(

import pyarrow.parquet as parquet

from cytotable.utils import CYTOTABLE_ARROW_USE_MEMORY_MAPPING

# remove the unjoined concatted compartments to prepare final dest_path usage
# (we now have joined results)
flattened_sources = list(itertools.chain(*list(sources.values())))
@@ -837,7 +854,12 @@
# write the concatted result as a parquet file
parquet.write_table(
table=pa.concat_tables(
-    tables=[parquet.read_table(table_path) for table_path in join_sources]
+    tables=[
+        parquet.read_table(
+            table_path, memory_map=CYTOTABLE_ARROW_USE_MEMORY_MAPPING
+        )
+        for table_path in join_sources
+    ]
),
where=dest_path,
)
@@ -848,7 +870,13 @@
writer_schema = parquet.read_schema(join_sources[0])
with parquet.ParquetWriter(str(dest_path), writer_schema) as writer:
for table_path in join_sources:
-writer.write_table(parquet.read_table(table_path, schema=writer_schema))
+writer.write_table(
+    parquet.read_table(
+        table_path,
+        schema=writer_schema,
+        memory_map=CYTOTABLE_ARROW_USE_MEMORY_MAPPING,
+    )
+)
# remove the file which was written in the concatted parquet file (we no longer need it)
pathlib.Path(table_path).unlink()

5 changes: 5 additions & 0 deletions cytotable/utils.py
@@ -26,6 +26,11 @@
else int(cast(int, os.environ.get("CYTOTABLE_MAX_THREADS")))
)

# enables overriding default memory mapping behavior with pyarrow memory mapping
CYTOTABLE_ARROW_USE_MEMORY_MAPPING = (
os.environ.get("CYTOTABLE_ARROW_USE_MEMORY_MAPPING", "1") == "1"
)

DDB_DATA_TYPE_SYNONYMS = {
"real": ["float32", "float4", "float"],
"double": ["float64", "float8", "numeric", "decimal"],
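
The `utils.py` change above parses the environment variable with a plain string comparison. A minimal standalone sketch of the same semantics (the helper name `arrow_use_memory_mapping` is hypothetical — CytoTable itself evaluates the expression once at import time). Note that only the literal string `"1"` keeps memory mapping enabled; values like `"true"` disable it:

```python
import os


def arrow_use_memory_mapping(environ=os.environ) -> bool:
    """Mirror the flag parsing added in cytotable/utils.py:
    memory mapping stays enabled unless the variable is set to
    something other than the literal string "1"."""
    return environ.get("CYTOTABLE_ARROW_USE_MEMORY_MAPPING", "1") == "1"


# unset -> default on
print(arrow_use_memory_mapping({}))  # True
# explicit disable
print(arrow_use_memory_mapping({"CYTOTABLE_ARROW_USE_MEMORY_MAPPING": "0"}))  # False
# "true" is not the literal "1", so it also disables mapping
print(arrow_use_memory_mapping({"CYTOTABLE_ARROW_USE_MEMORY_MAPPING": "true"}))  # False
```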
13 changes: 13 additions & 0 deletions docs/source/architecture.technical.md
@@ -56,6 +56,19 @@ cytotable.convert(
In addition to using Python native data types, we also accomplish internal data management for CytoTable using [PyArrow (Apache Arrow) Tables](https://arrow.apache.org/docs/python/generated/pyarrow.Table.html).
Using Arrow-compatible formats is intended to assist cross-platform utility, encourage high-performance, and enable advanced data integration with non-Python tools.

#### Arrow Memory Allocator Selection

PyArrow may use [`malloc`](https://en.wikipedia.org/wiki/C_dynamic_memory_allocation), [`jemalloc`](https://github.com/jemalloc/jemalloc), or [`mimalloc`](https://github.com/microsoft/mimalloc), depending on the operating system and which allocators are available.
Developers building on CytoTable may override this allocator selection to tune performance for their users' environments.
PyArrow inherits environment configuration from the Arrow C++ implementation ([see note on this page](https://arrow.apache.org/docs/python/env_vars.html)).
Set the [`ARROW_DEFAULT_MEMORY_POOL` environment variable](https://arrow.apache.org/docs/cpp/env_vars.html#envvar-ARROW_DEFAULT_MEMORY_POOL) to statically select which memory allocator is used when running CytoTable.
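
For example, an allocator override might look like the following (a sketch: valid values include `system`, `jemalloc`, and `mimalloc`, but availability depends on how your Arrow build was compiled, and the verification step assumes `pyarrow` is installed):

```shell
# force Arrow (and therefore CytoTable) to use a specific allocator
export ARROW_DEFAULT_MEMORY_POOL=jemalloc

# verify which allocator PyArrow picked up
python -c "import pyarrow; print(pyarrow.default_memory_pool().backend_name)"
```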

#### Arrow Memory Mapping Selection

PyArrow can perform [memory-mapped](https://en.wikipedia.org/wiki/Memory-mapped_file) Parquet file reads for performance benefits ([see the `memory_map` parameter](https://arrow.apache.org/docs/python/generated/pyarrow.parquet.read_table.html)).
CytoTable enables this behavior by default.
You may disable it by setting the environment variable `CYTOTABLE_ARROW_USE_MEMORY_MAPPING` to `0` (for example: `export CYTOTABLE_ARROW_USE_MEMORY_MAPPING=0`).
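
A sketch of toggling the flag before running CytoTable (the network-filesystem caveat is a general property of `mmap`, not a claim specific to CytoTable):

```shell
# disable Arrow memory mapping for CytoTable's parquet reads,
# e.g. on filesystems where mmap may perform poorly (some network mounts)
export CYTOTABLE_ARROW_USE_MEMORY_MAPPING=0

# restore the default (memory mapping enabled)
export CYTOTABLE_ARROW_USE_MEMORY_MAPPING=1
```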

### SQL-based Data Management

We use the [DuckDB Python API client](https://duckdb.org/docs/api/python/overview) in some areas to interface with [SQL](https://en.wikipedia.org/wiki/SQL) (for example, SQLite databases) and other data formats.