Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FEAT] Minimal indices dtype for FixedShapeSparseTensors #3149

Merged
merged 42 commits into from
Nov 13, 2024
Merged
Show file tree
Hide file tree
Changes from 39 commits
Commits
Show all changes
42 commits
Select commit Hold shift + click to select a range
bac0507
modify indices field in daft dtypes
sagiahrac Oct 30, 2024
6273b05
fix cast with dynamic indices dtype
sagiahrac Oct 30, 2024
3d80fe4
fix FixedShapeTensorArray to FixedShapeSparseTensor
sagiahrac Oct 31, 2024
42b7ae4
adjust tests indices dtype
sagiahrac Oct 31, 2024
881c438
fix cast_sparse_to_dense_for_inner_dtype
sagiahrac Oct 31, 2024
e80802f
pytest minimal index dtype
sagiahrac Oct 31, 2024
84bf80d
move test
sagiahrac Nov 4, 2024
d1713be
add dynamic dtype test
sagiahrac Nov 4, 2024
5ab9b98
CR changes
sagiahrac Nov 4, 2024
6c46dc0
ruff format
sagiahrac Nov 4, 2024
d22041c
rustfmt
sagiahrac Nov 4, 2024
8735f4f
pre-commit-hooks
sagiahrac Nov 4, 2024
e69121e
backward pyarrow compatibility
sagiahrac Nov 4, 2024
f1f9cbb
improve test naming
sagiahrac Nov 4, 2024
ae4c1ad
[FEAT] Support SQL `INTERVAL` (#3146)
austin362667 Oct 30, 2024
e5dfe41
[BUG]: between panic on unsupported types (#3150)
universalmind303 Oct 30, 2024
9cd12c7
[FEAT] dec128 math (#3143)
samster25 Oct 30, 2024
051793d
[FEAT] enable decimal between (#3154)
samster25 Oct 30, 2024
5d40e65
[FEAT]: Sql common table expressions (CTE's) (#3137)
universalmind303 Oct 30, 2024
988bdfb
[FEATURE] add min_hash alternate hashers (#3052)
andrewgazelka Oct 30, 2024
1dce97d
Bump slackapi/slack-github-action from 1.26.0 to 1.27.0 (#2776)
dependabot[bot] Oct 30, 2024
a887f10
Bump image from 0.24.9 to 0.25.4 (#3088)
dependabot[bot] Oct 30, 2024
249febf
Bump adlfs from 2023.10.0 to 2024.7.0 (#2547)
dependabot[bot] Oct 30, 2024
0ec0c24
[CHORE] Enable debug in test profile (#3135)
advancedxy Oct 31, 2024
c790011
[FEAT]: sql concat and stddev (#3153)
universalmind303 Oct 31, 2024
d753b44
[FEAT]: Throw error for invalid ** usage outside folder segments (e.g…
conradsoon Oct 31, 2024
f5bcd4d
[FEAT] Streaming physical writes for native executor (#2992)
colin-ho Oct 31, 2024
802b086
[CHORE] Cancel tasks spawned on compute runtime (#3128)
colin-ho Oct 31, 2024
965a315
[BUG] Separate PartitionTask done from results (#3155)
jaychia Nov 1, 2024
d91a856
Temporal docs added to expressions.rst (#2487)
sunaysanghani Nov 1, 2024
cc73b36
[CHORE]: tpc-ds datagen (#3103)
universalmind303 Nov 1, 2024
d91105b
[FEAT] Add better detection of Ray Job environment (#3148)
jaychia Nov 1, 2024
eb3de8b
Merge branch 'main' into minimal-uint-type-for-indices
sagiahrac Nov 4, 2024
6a39354
remove test until cast impl
sagiahrac Nov 4, 2024
821b2d3
preserve indices dtype when casting fixed shape sparse tensor to python
sagiahrac Nov 4, 2024
ef7d5aa
add python test for indices minimal dtype
sagiahrac Nov 4, 2024
7778393
Update src/daft-core/src/array/ops/cast.rs
samster25 Nov 13, 2024
6072023
Update src/daft-schema/src/dtype.rs
samster25 Nov 13, 2024
1790fcc
Update src/daft-core/src/array/ops/cast.rs
samster25 Nov 13, 2024
e524651
Update src/daft-core/src/array/ops/cast.rs
samster25 Nov 13, 2024
029ba1b
Update src/daft-core/src/array/ops/sparse_tensor.rs
samster25 Nov 13, 2024
4f6dacd
Update sparse_tensor.rs
samster25 Nov 13, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 61 additions & 13 deletions src/daft-core/src/array/ops/cast.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1596,7 +1596,7 @@ fn cast_sparse_to_dense_for_inner_dtype(
if !is_valid {
continue;
}
let index_series: Series = non_zero_indices_array.get(i).unwrap();
let index_series: Series = non_zero_indices_array.get(i).unwrap().cast(&DataType::UInt64)?;
let index_array = index_series.u64().unwrap().as_arrow();
let values_series: Series = non_zero_values_array.get(i).unwrap();
let values_array = values_series.downcast::<<$T as DaftDataType>::ArrayType>()
Expand All @@ -1612,6 +1612,18 @@ fn cast_sparse_to_dense_for_inner_dtype(
Ok(item)
}

/// Returns the narrowest unsigned-integer `DataType` capable of representing
/// `value`, used to pick a compact physical dtype for sparse-tensor indices.
fn minimal_uint_dtype(value: u64) -> DataType {
    match value {
        v if v <= u64::from(u8::MAX) => DataType::UInt8,
        v if v <= u64::from(u16::MAX) => DataType::UInt16,
        v if v <= u64::from(u32::MAX) => DataType::UInt32,
        _ => DataType::UInt64,
    }
}

impl SparseTensorArray {
pub fn cast(&self, dtype: &DataType) -> DaftResult<Series> {
match dtype {
Expand Down Expand Up @@ -1678,11 +1690,16 @@ impl SparseTensorArray {
shape,
)));
};

let largest_index = std::cmp::max(shape.iter().product::<u64>(), 1) - 1;
let indices_minimal_inner_dtype = minimal_uint_dtype(largest_index);
let values_array =
va.cast(&DataType::List(Box::new(inner_dtype.as_ref().clone())))?;
let indices_array =
ia.cast(&DataType::List(Box::new(indices_minimal_inner_dtype)))?;
let struct_array = StructArray::new(
Field::new(self.name(), dtype.to_physical()),
vec![values_array, ia.clone().into_series()],
vec![values_array, indices_array],
va.validity().cloned(),
);
let sparse_tensor_array = FixedShapeSparseTensorArray::new(
Expand Down Expand Up @@ -1760,6 +1777,7 @@ impl FixedShapeSparseTensorArray {

let values_arr =
va.cast(&DataType::List(Box::new(inner_dtype.as_ref().clone())))?;
let indices_arr = ia.cast(&DataType::List(Box::new(DataType::UInt64)))?;

// List -> Struct
let shape_offsets = arrow2::offset::OffsetsBuffer::try_from(shape_offsets)?;
Expand All @@ -1776,11 +1794,7 @@ impl FixedShapeSparseTensorArray {
let physical_type = dtype.to_physical();
let struct_array = StructArray::new(
Field::new(self.name(), physical_type),
vec![
values_arr,
ia.clone().into_series(),
shapes_array.into_series(),
],
vec![values_arr, indices_arr, shapes_array.into_series()],
validity.cloned(),
);
Ok(
Expand Down Expand Up @@ -1825,11 +1839,39 @@ impl FixedShapeSparseTensorArray {
Ok(fixed_shape_tensor_array.into_series())
}
#[cfg(feature = "python")]
(DataType::Python, DataType::FixedShapeSparseTensor(inner_dtype, _)) => {
let sparse_tensor_series =
self.cast(&DataType::SparseTensor(inner_dtype.clone()))?;
let sparse_pytensor_series = sparse_tensor_series.cast(&DataType::Python)?;
Ok(sparse_pytensor_series)
(DataType::Python, DataType::FixedShapeSparseTensor(_, tensor_shape)) => {
Python::with_gil(|py| {
let mut pydicts: Vec<Py<PyAny>> = Vec::with_capacity(self.len());
let va = self.values_array();
let ia = self.indices_array();
let pyarrow = py.import_bound(pyo3::intern!(py, "pyarrow"))?;
for (values_array, indices_array) in va.into_iter().zip(ia.into_iter()) {
if let (Some(values_array), Some(indices_array)) =
(values_array, indices_array)
{
let py_values_array =
ffi::to_py_array(py, values_array.to_arrow(), &pyarrow)?
.call_method1(pyo3::intern!(py, "to_numpy"), (false,))?;
let py_indices_array =
ffi::to_py_array(py, indices_array.to_arrow(), &pyarrow)?
.call_method1(pyo3::intern!(py, "to_numpy"), (false,))?;
let pydict = pyo3::types::PyDict::new_bound(py);
pydict.set_item("values", py_values_array)?;
pydict.set_item("indices", py_indices_array)?;
pydict.set_item("shape", tensor_shape)?;
pydicts.push(pydict.unbind().into());
} else {
pydicts.push(py.None());
}
}
let py_objects_array =
PseudoArrowArray::new(pydicts.into(), self.physical.validity().cloned());
Ok(PythonArray::new(
Field::new(self.name(), dtype.clone()).into(),
py_objects_array.to_boxed(),
)?
.into_series())
})
}
(_, _) => self.physical.cast(dtype),
}
Expand Down Expand Up @@ -1966,9 +2008,15 @@ impl FixedShapeTensorArray {
offsets_cloned.into(),
validity.cloned(),
);

let largest_index = tensor_shape.iter().product::<u64>() - 1;
samster25 marked this conversation as resolved.
Show resolved Hide resolved
let indices_minimal_inner_dtype = minimal_uint_dtype(largest_index);
let casted_indices = indices_list_arr
.cast(&DataType::List(Box::new(indices_minimal_inner_dtype)))?;

let sparse_struct_array = StructArray::new(
Field::new(self.name(), dtype.to_physical()),
vec![data_list_arr.into_series(), indices_list_arr.into_series()],
vec![data_list_arr.into_series(), casted_indices],
validity.cloned(),
);
Ok(FixedShapeSparseTensorArray::new(
Expand Down
32 changes: 32 additions & 0 deletions src/daft-core/src/array/ops/sparse_tensor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -113,4 +113,36 @@ mod tests {

Ok(())
}

#[test]
fn test_fixed_shape_sparse_datatype() -> DaftResult<()> {
    // Position of the "indices" field inside the sparse tensor's physical Struct.
    const INDICES_IDX: usize = 1;

    // Each case is (element count, expected minimal indices dtype). The largest
    // possible flat index is `count - 1`, so a count of 2^8 still fits in u8, etc.
    // NOTE: 2^64 is not representable as a u64 (`2u64.pow(64)` overflows and
    // panics in debug builds), so the UInt64 case uses `u64::MAX` elements,
    // whose largest index (u64::MAX - 1) still exceeds u32::MAX.
    let cases = [
        (1u64 << 8, DataType::UInt8),
        (1u64 << 16, DataType::UInt16),
        (1u64 << 32, DataType::UInt32),
        (u64::MAX, DataType::UInt64),
    ];

    for (n_elements, minimal_dtype) in cases {
        let dtype =
            DataType::FixedShapeSparseTensor(Box::new(DataType::Float32), vec![n_elements]);
        let physical_dtype = dtype.to_physical();
        if let DataType::Struct(fields) = &physical_dtype {
            assert_eq!(fields.len(), 2, "Expected exactly 2 fields in Struct");

            let indices_field = &fields[INDICES_IDX];
            assert_eq!(indices_field.name, "indices");
            assert_eq!(
                indices_field.dtype,
                DataType::List(Box::new(minimal_dtype))
            );
        } else {
            panic!("Expected Struct DataType, got {:?}", physical_dtype);
        }
    }

    Ok(())
}
}
18 changes: 16 additions & 2 deletions src/daft-schema/src/dtype.rs
Original file line number Diff line number Diff line change
Expand Up @@ -343,9 +343,23 @@ impl DataType {
Field::new("indices", List(Box::new(Self::UInt64))),
Field::new("shape", List(Box::new(Self::UInt64))),
]),
FixedShapeSparseTensor(dtype, _) => Struct(vec![
FixedShapeSparseTensor(dtype, shape) => Struct(vec![
Field::new("values", List(Box::new(*dtype.clone()))),
Field::new("indices", List(Box::new(Self::UInt64))),
{
let largest_index = std::cmp::max(shape.iter().product::<u64>(), 1) - 1;
let minimal_indices_dtype = {
if u8::try_from(largest_index).is_ok() {
Self::UInt8
} else if u16::try_from(largest_index).is_ok() {
Self::UInt16
} else if u32::try_from(largest_index).is_ok() {
Self::UInt32
} else {
Self::UInt64
}
};
Field::new("indices", List(Box::new(minimal_indices_dtype)))
},
]),
_ => {
assert!(self.is_physical());
Expand Down
11 changes: 10 additions & 1 deletion tests/series/test_cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -1166,11 +1166,20 @@ def test_series_cast_fixed_size_list_to_list() -> None:
### Sparse ###


def minimal_indices_dtype(shape: tuple[int, ...]) -> np.dtype:
    """Return the smallest unsigned-int numpy dtype able to hold any flat index into `shape`.

    The largest flat index of a tensor with the given shape is `prod(shape) - 1`;
    `np.min_scalar_type` maps that value to the narrowest dtype that can store it.
    """
    # NOTE: annotation was `tuple[int]` (a 1-tuple); callers pass shapes of any rank.
    largest_index_possible = np.prod(shape) - 1
    return np.min_scalar_type(largest_index_possible)


def to_coo_sparse_dict(ndarray: np.ndarray) -> dict[str, np.ndarray]:
    """Convert a dense ndarray into a COO-style dict of non-zero values, flat indices, and shape.

    Indices are downcast to the minimal unsigned dtype that can address the tensor.
    """
    shape = list(ndarray.shape)
    flattened = ndarray.ravel()
    nonzero_positions = np.flatnonzero(flattened)
    return {
        "values": flattened[nonzero_positions],
        "indices": nonzero_positions.astype(minimal_indices_dtype(shape)),
        "shape": shape,
    }


Expand Down
16 changes: 16 additions & 0 deletions tests/series/test_sparse_tensor.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,19 @@ def test_sparse_tensor_repr():
╰─────────────────────────────╯
"""
)


@pytest.mark.parametrize("indices_dtype", [np.uint8, np.uint16])
def test_minimal_indices_dtype_for_fixed_shape_sparse(indices_dtype: np.dtype):
    """Casting to a fixed-shape sparse tensor should store indices with the minimal uint dtype."""
    # Pick a shape whose largest flat index exactly saturates `indices_dtype`.
    max_index = np.iinfo(indices_dtype).max
    tensor_shape = (max_index + 1, 1)

    dense_series = Series.from_pylist([np.zeros(shape=tensor_shape)]).cast(
        DataType.tensor(DataType.float32(), shape=tensor_shape)
    )
    sparse_series = dense_series.cast(
        DataType.sparse_tensor(DataType.float32(), shape=tensor_shape)
    )

    received_tensor = sparse_series.to_pylist()[-1]
    assert received_tensor["values"].dtype == np.float32
    assert received_tensor["indices"].dtype == indices_dtype
    assert received_tensor["shape"] == list(tensor_shape)
Loading