[CHORE] Add tests for decimal casting (#3179)

There are no existing tests for decimal casting. This PR adds some randomized tests.
Eventual-Inc · Nov 6, 2024 · 2829a09 · 2829a09
1 parent 64e35f8
commit 2829a09
Showing 4 changed files with 177 additions and 1 deletion.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/src/daft-core/Cargo.toml b/src/daft-core/Cargo.toml
@@ -41,6 +41,7 @@ mur3 = "0.1.0"
 ndarray = "0.15.6"
 num-traits = {workspace = true}
 pyo3 = {workspace = true, optional = true}
+rand = "0.8.5"
 regex = {workspace = true}
 serde = {workspace = true}
 sketches-ddsketch = {workspace = true}

diff --git a/src/daft-core/src/array/ops/cast.rs b/src/daft-core/src/array/ops/cast.rs
@@ -2253,3 +2253,156 @@ where
         .into_series())
     })
 }
+
+#[cfg(test)]
+mod tests {
+    use arrow2::array::PrimitiveArray;
+    use rand::{thread_rng, Rng};
+
+    use super::*;
+    use crate::{
+        datatypes::DataArray,
+        prelude::{Decimal128Type, Float64Array},
+    };
+
+    /// Builds a `Decimal128(precision, scale)` array named `"test_decimal"` from raw,
+    /// already-scaled `i128` values.
+    ///
+    /// Panics if the array cannot be constructed from the arrow data.
+    fn create_test_decimal_array(
+        values: Vec<i128>,
+        precision: usize,
+        scale: usize,
+    ) -> DataArray<Decimal128Type> {
+        let daft_field = Field::new("test_decimal", DataType::Decimal128(precision, scale));
+        // Re-tag the plain i128 buffer with the matching arrow decimal type.
+        let arrow_dtype = arrow2::datatypes::DataType::Decimal(precision, scale);
+        let physical = PrimitiveArray::from_vec(values).to(arrow_dtype);
+        DataArray::<Decimal128Type>::from_arrow(Arc::new(daft_field), Box::new(physical))
+            .expect("Failed to create test decimal array")
+    }
+
+    /// Builds a `Float64` array named `"test_float"` from the given values.
+    ///
+    /// Panics if the array cannot be constructed from the arrow data.
+    fn create_test_f64_array(values: Vec<f64>) -> Float64Array {
+        let daft_field = Field::new("test_float", DataType::Float64);
+        let physical = PrimitiveArray::from_vec(values).to(arrow2::datatypes::DataType::Float64);
+        Float64Array::from_arrow(Arc::new(daft_field), Box::new(physical))
+            .expect("Failed to create test float array")
+    }
+
+    /// Builds an `Int64` array named `"test_int"` from the given values.
+    ///
+    /// Panics if the array cannot be constructed from the arrow data.
+    fn create_test_i64_array(values: Vec<i64>) -> Int64Array {
+        let daft_field = Field::new("test_int", DataType::Int64);
+        let physical = PrimitiveArray::from_vec(values).to(arrow2::datatypes::DataType::Int64);
+        Int64Array::from_arrow(Arc::new(daft_field), Box::new(physical))
+            .expect("Failed to create test int array")
+    }
+
+    // For a Decimal(p, s) to be valid, p, s, and max_val must satisfy:
+    //   p > ceil(log_9(max_val * 10^s)) - 1
+    // So with a max_val of 10^10, we get:
+    //   p > ceil(log_9(10^(10+s))) - 1
+    // Since p <= 32, for this inequality to hold, we need s <= 20.
+    //
+    // Bound on the random test values: they are drawn from the half-open range
+    // `-MAX_VAL..MAX_VAL`.
+    const MAX_VAL: f64 = 1e10;
+    // Largest scale the tests draw; scales are sampled from `0..=MAX_SCALE`.
+    const MAX_SCALE: usize = 20;
+    // Smallest `precision - scale` the tests draw; precisions are sampled starting at
+    // `scale + MIN_DIFF_FOR_PRECISION`.
+    const MIN_DIFF_FOR_PRECISION: usize = 12;
+    /// Round-trips random decimals through a randomly chosen wider decimal type and checks
+    /// that casting back to the initial type reproduces the original array.
+    #[test]
+    fn test_decimal_to_decimal_cast() {
+        let mut rng = thread_rng();
+
+        // 100 random samples plus both signed zeros as fixed edge cases.
+        let mut values: Vec<f64> = std::iter::repeat_with(|| rng.gen_range(-MAX_VAL..MAX_VAL))
+            .take(100)
+            .collect();
+        values.push(0.0);
+        values.push(-0.0);
+
+        let initial_scale: usize = rng.gen_range(0..=MAX_SCALE);
+        let initial_precision: usize = rng.gen_range(initial_scale + MIN_DIFF_FOR_PRECISION..=32);
+        // Number of digits reserved for the integral part of each value.
+        let min_integral_comp = initial_precision - initial_scale;
+
+        // Shift each float left by `initial_scale` decimal digits to get the raw decimal value.
+        let scale_factor = 10_f64.powi(initial_scale as i32);
+        let i128_values: Vec<i128> = values.iter().map(|&x| (x * scale_factor) as i128).collect();
+        let original = create_test_decimal_array(i128_values, initial_precision, initial_scale);
+
+        // The intermediate type is never narrower than the initial one: narrowing would lose
+        // information and make the comparison against the original values meaningless.
+        let intermediate_scale: usize = rng.gen_range(initial_scale..=32 - min_integral_comp);
+        let intermediate_precision: usize =
+            rng.gen_range(intermediate_scale + min_integral_comp..=32);
+
+        let widened_type = DataType::Decimal128(intermediate_precision, intermediate_scale);
+        let initial_type = DataType::Decimal128(initial_precision, initial_scale);
+        let widened = original
+            .cast(&widened_type)
+            .expect("Failed to cast to intermediate decimal");
+        let result = widened
+            .cast(&initial_type)
+            .expect("Failed to cast back to original decimal");
+
+        let expected = original.into_series();
+        assert!(
+            expected == result,
+            "Failed with intermediate decimal({}, {})",
+            intermediate_precision,
+            intermediate_scale,
+        );
+    }
+
+    // We do fuzzy equality when comparing floats converted to and from decimals. This test is
+    // primarily sanity checking that we don't repeat the mistake of shifting the scale and precision
+    // of floats during casting, while avoiding flakiness due to small differences in floats.
+    const EPSILON: f64 = 0.1;
+    /// Casts random decimals to `Float64` and checks that every result lies within `EPSILON`
+    /// of the float the decimal was built from.
+    #[test]
+    fn test_decimal_to_float() {
+        let mut rng = thread_rng();
+
+        // 100 random samples plus both signed zeros as fixed edge cases.
+        let mut values: Vec<f64> = std::iter::repeat_with(|| rng.gen_range(-MAX_VAL..MAX_VAL))
+            .take(100)
+            .collect();
+        values.push(0.0);
+        values.push(-0.0);
+
+        let scale: usize = rng.gen_range(0..=MAX_SCALE);
+        let precision: usize = rng.gen_range(scale + MIN_DIFF_FOR_PRECISION..=32);
+
+        // Shift each float left by `scale` decimal digits to get the raw decimal value.
+        let scale_factor = 10_f64.powi(scale as i32);
+        let i128_values: Vec<i128> = values.iter().map(|&x| (x * scale_factor) as i128).collect();
+        let decimals = create_test_decimal_array(i128_values, precision, scale);
+
+        let result = decimals
+            .cast(&DataType::Float64)
+            .expect("Failed to cast to float");
+
+        // One tolerance entry per value, so the comparison is element-wise.
+        let epsilon_series = create_test_f64_array(vec![EPSILON; values.len()]).into_series();
+        let expected = create_test_f64_array(values).into_series();
+
+        assert!(
+            result.fuzzy_eq(&expected, &epsilon_series),
+            "Failed with decimal({}, {})",
+            precision,
+            scale,
+        );
+    }
+
+    // 2^63 gives us 18 unrestricted digits: i64::MAX (2^63 - 1, roughly 9.22e18) has 19 digits,
+    // so every 18-digit integer fits in an i64 but not every 19-digit one.
+    // So precision - scale has to be <= 18.
+    const MAX_DIFF_FOR_PRECISION: usize = 18;
+    /// Casts random decimals to `Int64` and checks the result against the source floats
+    /// converted with `as i64` (which truncates toward zero).
+    #[test]
+    fn test_decimal_to_int() {
+        let mut rng = thread_rng();
+
+        // 100 random samples plus both signed zeros as fixed edge cases.
+        let mut values: Vec<f64> = std::iter::repeat_with(|| rng.gen_range(-MAX_VAL..MAX_VAL))
+            .take(100)
+            .collect();
+        values.push(0.0);
+        values.push(-0.0);
+
+        let scale: usize = rng.gen_range(0..=MAX_SCALE);
+        // Keep the integral part between MIN_DIFF_FOR_PRECISION and MAX_DIFF_FOR_PRECISION digits.
+        let precision: usize =
+            rng.gen_range(scale + MIN_DIFF_FOR_PRECISION..=scale + MAX_DIFF_FOR_PRECISION);
+
+        // Shift each float left by `scale` decimal digits to get the raw decimal value.
+        let scale_factor = 10_f64.powi(scale as i32);
+        let i128_values: Vec<i128> = values.iter().map(|&x| (x * scale_factor) as i128).collect();
+        let decimals = create_test_decimal_array(i128_values, precision, scale);
+
+        let result = decimals
+            .cast(&DataType::Int64)
+            .expect("Failed to cast to int64");
+
+        // Convert the original floats directly to integers.
+        let truncated: Vec<i64> = values.iter().map(|&f| f as i64).collect();
+        let expected = create_test_i64_array(truncated).into_series();
+
+        assert!(
+            expected == result,
+            "Failed with decimal({}, {})",
+            precision,
+            scale,
+        );
+    }
+}
diff --git a/src/daft-core/src/series/mod.rs b/src/daft-core/src/series/mod.rs
@@ -4,7 +4,7 @@ mod ops;
 mod serdes;
 mod series_like;
 mod utils;
-use std::sync::Arc;
+use std::{ops::Sub, sync::Arc};
 
 pub use array_impl::IntoSeries;
 use common_display::table_display::{make_comfy_table, StrValue};
@@ -148,4 +148,25 @@ impl Series {
         let data: &DataArray<N::DAFTTYPE> = self.downcast()?;
         Ok(data.as_slice())
     }
+
+    /// Helper function to check that two series of floats are within some `epsilon` of each other.
+    ///
+    /// Returns `false` immediately when the two series have different data types. For
+    /// `Float32`/`Float64` series the check is `|self - other| <= epsilon`, and it must hold
+    /// for every element; an element whose comparison result is `None` (presumably a null —
+    /// confirm against the boolean array iterator) counts as a mismatch. Any other data type
+    /// falls back to plain `==`.
+    ///
+    /// If the `<=` comparison against `epsilon` itself fails, the result is `false`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if subtracting `other` from `self`, or taking the absolute value of the
+    /// difference, returns an error.
+    pub fn fuzzy_eq(&self, other: &Self, epsilon: &Self) -> bool {
+        let dtype = self.data_type();
+        if dtype != other.data_type() {
+            return false;
+        }
+        // Only floating-point series get the tolerance-based comparison.
+        if !matches!(dtype, DataType::Float32 | DataType::Float64) {
+            return self == other;
+        }
+
+        let delta = self
+            .sub(other)
+            .expect("Failed to subtract one series from the other");
+        let abs_delta = delta
+            .abs()
+            .expect("Failed to get absolute difference between the two given series");
+
+        match abs_delta.lte(epsilon) {
+            Ok(within) => within.into_iter().all(|x| x.unwrap_or(false)),
+            Err(_) => false,
+        }
+    }
 }