From 80ed5f57ee45d9703d8c8707013ece7b8bbb766f Mon Sep 17 00:00:00 2001 From: FreezyLemon Date: Sun, 10 Nov 2024 23:46:57 +0100 Subject: [PATCH 1/8] test: new tests for Empirical --- src/distribution/empirical.rs | 72 +++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/src/distribution/empirical.rs b/src/distribution/empirical.rs index 21832b3c..fa002143 100644 --- a/src/distribution/empirical.rs +++ b/src/distribution/empirical.rs @@ -242,6 +242,78 @@ impl ContinuousCDF for Empirical { #[cfg(test)] mod tests { use super::*; + + #[test] + fn test_add_nan() { + let mut empirical = Empirical::new().unwrap(); + + // should not panic + empirical.add(f64::NAN); + } + + #[test] + fn test_remove_nan() { + let mut empirical = Empirical::new().unwrap(); + + empirical.add(5.2); + // should not panic + empirical.remove(f64::NAN); + } + + #[test] + fn test_remove_nonexisting() { + let mut empirical = Empirical::new().unwrap(); + + empirical.add(5.2); + // should not panic + empirical.remove(10.0); + } + + #[test] + fn test_remove_all() { + let mut empirical = Empirical::new().unwrap(); + + empirical.add(17.123); + empirical.add(-10.0); + empirical.add(0.0); + empirical.remove(-10.0); + empirical.remove(17.123); + empirical.remove(0.0); + + assert!(empirical.mean().is_none()); + assert!(empirical.variance().is_none()); + } + + #[test] + fn test_mean() { + fn test_mean_for_samples(expected_mean: f64, samples: Vec) { + let dist = Empirical::from_iter(samples); + assert_relative_eq!(dist.mean().unwrap(), expected_mean); + } + + let dist = Empirical::from_iter(vec![]); + assert!(dist.mean().is_none()); + + test_mean_for_samples(4.0, vec![4.0; 100]); + test_mean_for_samples(-0.2, vec![-0.2; 100]); + test_mean_for_samples(28.5, vec![21.3, 38.4, 12.7, 41.6]); + } + + #[test] + fn test_var() { + fn test_var_for_samples(expected_var: f64, samples: Vec) { + let dist = Empirical::from_iter(samples); + assert_relative_eq!(dist.variance().unwrap(), expected_var); + } + + let dist = Empirical::from_iter(vec![]); + assert!(dist.variance().is_none()); + + test_var_for_samples(0.0, vec![4.0; 100]); + test_var_for_samples(0.0, vec![-0.2; 100]); + test_var_for_samples(190.36666666666667, vec![21.3, 38.4, 12.7, 41.6]); + } + #[test] fn test_cdf() { let samples = vec![5.0, 10.0]; From d3d2dc288b1969e4fadd71e2c3fed9284698af9a Mon Sep 17 00:00:00 2001 From: FreezyLemon Date: Sun, 10 Nov 2024 19:10:00 +0100 Subject: [PATCH 2/8] refactor: Move NonNan to module, define some API --- src/distribution/empirical.rs | 120 ++++++++++++++++++++-------------- 1 file changed, 72 insertions(+), 48 deletions(-) diff --git a/src/distribution/empirical.rs b/src/distribution/empirical.rs index fa002143..8198e577 100644 --- a/src/distribution/empirical.rs +++ b/src/distribution/empirical.rs @@ -1,22 +1,43 @@ use crate::distribution::ContinuousCDF; use crate::statistics::*; -use core::cmp::Ordering; +use non_nan::NonNan; use std::collections::BTreeMap; -#[derive(Clone, PartialEq, Debug)] -struct NonNan(T); +mod non_nan { + use core::cmp::Ordering; -impl Eq for NonNan {} + #[derive(Clone, Copy, PartialEq, Debug)] + pub struct NonNan(T); -impl PartialOrd for NonNan { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) + impl NonNan { + pub fn get(self) -> T { + self.0 + } } -} -impl Ord for NonNan { - fn cmp(&self, other: &Self) -> Ordering { - self.0.partial_cmp(&other.0).unwrap() + impl NonNan { + #[inline] + pub fn new(x: f64) -> Option { + if x.is_nan() { + None + } else { + Some(Self(x)) + } + } + } + + impl Eq for NonNan {} + + impl PartialOrd for NonNan { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } + } + + impl Ord for NonNan { + fn cmp(&self, other: &Self) -> Ordering { + self.0.partial_cmp(&other.0).unwrap() + } } } @@ -66,43 +87,46 @@ impl Empirical { } pub fn add(&mut self, data_point: f64) { - if !data_point.is_nan() { - self.sum += 1.; - match self.mean_and_var { - Some((mean, var)) => { - let sum = self.sum; - let var = var + (sum - 1.) * (data_point - mean) * (data_point - mean) / sum; - let mean = mean + (data_point - mean) / sum; - self.mean_and_var = Some((mean, var)); - } - None => { - self.mean_and_var = Some((data_point, 0.)); - } + let map_key = match NonNan::new(data_point) { + Some(valid) => valid, + None => return, + }; + + self.sum += 1.; + match self.mean_and_var { + Some((mean, var)) => { + let sum = self.sum; + let var = var + (sum - 1.) * (data_point - mean) * (data_point - mean) / sum; + let mean = mean + (data_point - mean) / sum; + self.mean_and_var = Some((mean, var)); + } + None => { + self.mean_and_var = Some((data_point, 0.)); } - *self.data.entry(NonNan(data_point)).or_insert(0) += 1; } + *self.data.entry(map_key).or_insert(0) += 1; } pub fn remove(&mut self, data_point: f64) { - if !data_point.is_nan() { - if let (Some(val), Some((mean, var))) = - (self.data.remove(&NonNan(data_point)), self.mean_and_var) - { - if val == 1 && self.data.is_empty() { - self.mean_and_var = None; - self.sum = 0.; - return; - }; - // reset mean and var - let mean = (self.sum * mean - data_point) / (self.sum - 1.); - let var = - var - (self.sum - 1.) * (data_point - mean) * (data_point - mean) / self.sum; - self.sum -= 1.; - if val != 1 { - self.data.insert(NonNan(data_point), val - 1); - }; - self.mean_and_var = Some((mean, var)); - } + let map_key = match NonNan::new(data_point) { + Some(valid) => valid, + None => return, + }; + + if let (Some(val), Some((mean, var))) = (self.data.remove(&map_key), self.mean_and_var) { + if val == 1 && self.data.is_empty() { + self.mean_and_var = None; + self.sum = 0.; + return; + }; + // reset mean and var + let mean = (self.sum * mean - data_point) / (self.sum - 1.); + let var = var - (self.sum - 1.) * (data_point - mean) * (data_point - mean) / self.sum; + self.sum -= 1.; + if val != 1 { + self.data.insert(map_key, val - 1); + }; + self.mean_and_var = Some((mean, var)); } } @@ -148,7 +172,7 @@ impl std::fmt::Display for Empirical { let mut enumerated_values = self .data .iter() - .flat_map(|(&NonNan(x), &count)| std::iter::repeat(x).take(count as usize)); + .flat_map(|(x, &count)| std::iter::repeat(x.get()).take(count as usize)); if let Some(x) = enumerated_values.next() { write!(f, "Empirical([{x:.3e}")?; @@ -190,14 +214,14 @@ impl ::rand::distributions::Distribution for Empirical { /// Panics if number of samples is zero impl Max for Empirical { fn max(&self) -> f64 { - self.data.keys().rev().map(|key| key.0).next().unwrap() + self.data.keys().rev().map(|key| key.get()).next().unwrap() } } /// Panics if number of samples is zero impl Min for Empirical { fn min(&self) -> f64 { - self.data.keys().map(|key| key.0).next().unwrap() + self.data.keys().map(|key| key.get()).next().unwrap() } } @@ -215,7 +239,7 @@ impl ContinuousCDF for Empirical { fn cdf(&self, x: f64) -> f64 { let mut sum = 0; for (keys, values) in &self.data { - if keys.0 > x { + if keys.get() > x { return sum as f64 / self.sum; } sum += values; @@ -226,7 +250,7 @@ impl ContinuousCDF for Empirical { fn sf(&self, x: f64) -> f64 { let mut sum = 0; for (keys, values) in self.data.iter().rev() { - if keys.0 <= x { + if keys.get() <= x { return sum as f64 / self.sum; } sum += values; From 224c66afe08ac3514ee96579bf5898e876f0690d Mon Sep 17 00:00:00 2001 From: FreezyLemon Date: Sun, 10 Nov 2024 19:19:58 +0100 Subject: [PATCH 3/8] refactor: Use u64 to store Empirical::sum --- src/distribution/empirical.rs | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/src/distribution/empirical.rs b/src/distribution/empirical.rs index 8198e577..a326591e 100644 --- a/src/distribution/empirical.rs +++ b/src/distribution/empirical.rs @@ -57,10 +57,14 @@ mod non_nan { /// ``` #[derive(Clone, PartialEq, Debug)] pub struct Empirical { - sum: f64, mean_and_var: Option<(f64, f64)>, // keys are data points, values are number of data points with equal value data: BTreeMap, u64>, + + // The following fields are only logically valid if !data.is_empty(): + /// Total amount of data points (== sum of all _values_ inside self.data). + /// Must be 0 iff data.is_empty() + sum: u64, } impl Empirical { @@ -80,7 +84,7 @@ impl Empirical { #[allow(clippy::result_unit_err)] pub fn new() -> Result { Ok(Empirical { - sum: 0., + sum: 0, mean_and_var: None, data: BTreeMap::new(), }) @@ -92,10 +96,10 @@ impl Empirical { None => return, }; - self.sum += 1.; + self.sum += 1; match self.mean_and_var { Some((mean, var)) => { - let sum = self.sum; + let sum = self.sum as f64; let var = var + (sum - 1.) * (data_point - mean) * (data_point - mean) / sum; let mean = mean + (data_point - mean) / sum; self.mean_and_var = Some((mean, var)); @@ -116,13 +120,14 @@ impl Empirical { if let (Some(val), Some((mean, var))) = (self.data.remove(&map_key), self.mean_and_var) { if val == 1 && self.data.is_empty() { self.mean_and_var = None; - self.sum = 0.; + self.sum = 0; return; }; // reset mean and var - let mean = (self.sum * mean - data_point) / (self.sum - 1.); - let var = var - (self.sum - 1.) * (data_point - mean) * (data_point - mean) / self.sum; - self.sum -= 1.; + let sum = self.sum as f64; + let mean = (sum * mean - data_point) / (sum - 1.); + let var = var - (sum - 1.) * (data_point - mean) * (data_point - mean) / sum; + self.sum -= 1; if val != 1 { self.data.insert(map_key, val - 1); }; @@ -231,7 +236,8 @@ impl Distribution for Empirical { } fn variance(&self) -> Option { - self.mean_and_var.map(|(_, var)| var / (self.sum - 1.)) + self.mean_and_var + .map(|(_, var)| var / (self.sum as f64 - 1.)) } } @@ -240,22 +246,22 @@ impl ContinuousCDF for Empirical { let mut sum = 0; for (keys, values) in &self.data { if keys.get() > x { - return sum as f64 / self.sum; + return sum as f64 / self.sum as f64; } sum += values; } - sum as f64 / self.sum + sum as f64 / self.sum as f64 } fn sf(&self, x: f64) -> f64 { let mut sum = 0; for (keys, values) in self.data.iter().rev() { if keys.get() <= x { - return sum as f64 / self.sum; + return sum as f64 / self.sum as f64; } sum += values; } - sum as f64 / self.sum + sum as f64 / self.sum as f64 } fn inverse_cdf(&self, p: f64) -> f64 { From 02a13220af70d16222884b86771dd50be534bf0b Mon Sep 17 00:00:00 2001 From: FreezyLemon Date: Sun, 10 Nov 2024 20:39:37 +0100 Subject: [PATCH 4/8] refactor: Separate Empirical::mean_and_var --- src/distribution/empirical.rs | 72 +++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/src/distribution/empirical.rs b/src/distribution/empirical.rs index a326591e..1d02ab65 100644 --- a/src/distribution/empirical.rs +++ b/src/distribution/empirical.rs @@ -57,7 +57,6 @@ mod non_nan { /// ``` #[derive(Clone, PartialEq, Debug)] pub struct Empirical { - mean_and_var: Option<(f64, f64)>, // keys are data points, values are number of data points with equal value data: BTreeMap, u64>, @@ -65,6 +64,8 @@ pub struct Empirical { /// Total amount of data points (== sum of all _values_ inside self.data). /// Must be 0 iff data.is_empty() sum: u64, + mean: f64, + var: f64, } impl Empirical { @@ -84,9 +85,10 @@ impl Empirical { #[allow(clippy::result_unit_err)] pub fn new() -> Result { Ok(Empirical { - sum: 0, - mean_and_var: None, data: BTreeMap::new(), + sum: 0, + mean: 0.0, + var: 0.0, }) } @@ -97,17 +99,10 @@ impl Empirical { }; self.sum += 1; - match self.mean_and_var { - Some((mean, var)) => { - let sum = self.sum as f64; - let var = var + (sum - 1.) * (data_point - mean) * (data_point - mean) / sum; - let mean = mean + (data_point - mean) / sum; - self.mean_and_var = Some((mean, var)); - } - None => { - self.mean_and_var = Some((data_point, 0.)); - } - } + let sum = self.sum as f64; + self.var += (sum - 1.) * (data_point - self.mean) * (data_point - self.mean) / sum; + self.mean += (data_point - self.mean) / sum; + *self.data.entry(map_key).or_insert(0) += 1; } @@ -117,21 +112,25 @@ impl Empirical { None => return, }; - if let (Some(val), Some((mean, var))) = (self.data.remove(&map_key), self.mean_and_var) { - if val == 1 && self.data.is_empty() { - self.mean_and_var = None; - self.sum = 0; - return; - }; - // reset mean and var - let sum = self.sum as f64; - let mean = (sum * mean - data_point) / (sum - 1.); - let var = var - (sum - 1.) * (data_point - mean) * (data_point - mean) / sum; - self.sum -= 1; - if val != 1 { - self.data.insert(map_key, val - 1); - }; - self.mean_and_var = Some((mean, var)); + let val = match self.data.remove(&map_key) { + Some(v) => v, + None => return, + }; + + if val == 1 && self.data.is_empty() { + self.sum = 0; + self.mean = 0.0; + self.var = 0.0; + return; + }; + + // reset mean and var + let sum = self.sum as f64; + self.mean = (sum * self.mean - data_point) / (sum - 1.); + self.var -= (sum - 1.) * (data_point - self.mean) * (data_point - self.mean) / sum; + self.sum -= 1; + if val != 1 { + self.data.insert(map_key, val - 1); } } @@ -232,12 +231,19 @@ impl Min for Empirical { impl Distribution for Empirical { fn mean(&self) -> Option { - self.mean_and_var.map(|(mean, _)| mean) + if self.data.is_empty() { + None + } else { + Some(self.mean) + } } fn variance(&self) -> Option { - self.mean_and_var - .map(|(_, var)| var / (self.sum as f64 - 1.)) + if self.data.is_empty() { + None + } else { + Some(self.var / (self.sum as f64 - 1.)) + } } } @@ -293,7 +299,7 @@ mod tests { #[test] fn test_remove_nonexisting() { let mut empirical = Empirical::new().unwrap(); - + empirical.add(5.2); // should not panic empirical.remove(10.0); From 60661f31316e9bf07e80d6c57ab1f9fd0bc1558e Mon Sep 17 00:00:00 2001 From: FreezyLemon Date: Sun, 10 Nov 2024 21:16:54 +0100 Subject: [PATCH 5/8] refactor: Rewrite Empirical::cdf and ::sf --- src/distribution/empirical.rs | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/src/distribution/empirical.rs b/src/distribution/empirical.rs index 1d02ab65..e15c6154 100644 --- a/src/distribution/empirical.rs +++ b/src/distribution/empirical.rs @@ -2,6 +2,7 @@ use crate::distribution::ContinuousCDF; use crate::statistics::*; use non_nan::NonNan; use std::collections::BTreeMap; +use std::ops::Bound; mod non_nan { use core::cmp::Ordering; @@ -249,24 +250,18 @@ impl Distribution for Empirical { impl ContinuousCDF for Empirical { fn cdf(&self, x: f64) -> f64 { - let mut sum = 0; - for (keys, values) in &self.data { - if keys.get() > x { - return sum as f64 / self.sum as f64; - } - sum += values; - } + let start = Bound::Unbounded; + let end = Bound::Included(NonNan::new(x).expect("x must not be NaN")); + + let sum: u64 = self.data.range((start, end)).map(|(_, v)| v).sum(); sum as f64 / self.sum as f64 } fn sf(&self, x: f64) -> f64 { - let mut sum = 0; - for (keys, values) in self.data.iter().rev() { - if keys.get() <= x { - return sum as f64 / self.sum as f64; - } - sum += values; - } + let start = Bound::Excluded(NonNan::new(x).expect("x must not be NaN")); + let end = Bound::Unbounded; + + let sum: u64 = self.data.range((start, end)).map(|(_, v)| v).sum(); sum as f64 / self.sum as f64 } From 10d8c5e3059159f1f5f7ff21e91f0e993e10bd66 Mon Sep 17 00:00:00 2001 From: FreezyLemon Date: Sun, 10 Nov 2024 23:13:48 +0100 Subject: [PATCH 6/8] refactor: Empirical::remove: Try to modify inplace The old code always removed entries and re-inserted them if necessary. The new code will instead modify the value (= data_point count) in-place and only remove the key- value pair from the map if the count would've dropped to zero. --- src/distribution/empirical.rs | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/src/distribution/empirical.rs b/src/distribution/empirical.rs index e15c6154..0726fb00 100644 --- a/src/distribution/empirical.rs +++ b/src/distribution/empirical.rs @@ -1,7 +1,7 @@ use crate::distribution::ContinuousCDF; use crate::statistics::*; use non_nan::NonNan; -use std::collections::BTreeMap; +use std::collections::btree_map::{BTreeMap, Entry}; use std::ops::Bound; mod non_nan { @@ -113,26 +113,30 @@ impl Empirical { None => return, }; - let val = match self.data.remove(&map_key) { - Some(v) => v, - None => return, + let mut entry = match self.data.entry(map_key) { + Entry::Occupied(entry) => entry, + Entry::Vacant(_) => return, // no entry found }; - if val == 1 && self.data.is_empty() { - self.sum = 0; - self.mean = 0.0; - self.var = 0.0; - return; - }; + if *entry.get() == 1 { + entry.remove(); + if self.data.is_empty() { + // logically, this should not need special handling. + // FP math can result in mean or var being != 0.0 though. + self.sum = 0; + self.mean = 0.0; + self.var = 0.0; + return; + } + } else { + *entry.get_mut() -= 1; + } // reset mean and var let sum = self.sum as f64; self.mean = (sum * self.mean - data_point) / (sum - 1.); self.var -= (sum - 1.) * (data_point - self.mean) * (data_point - self.mean) / sum; self.sum -= 1; - if val != 1 { - self.data.insert(map_key, val - 1); - } } // Due to issues with rounding and floating-point accuracy the default From 35bce3e16b8314c1fc3d65039165ab0c82c17647 Mon Sep 17 00:00:00 2001 From: FreezyLemon Date: Sun, 10 Nov 2024 23:19:25 +0100 Subject: [PATCH 7/8] refactor: Empirical::add: Use `and_modify` --- src/distribution/empirical.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/distribution/empirical.rs b/src/distribution/empirical.rs index 0726fb00..a6522fd1 100644 --- a/src/distribution/empirical.rs +++ b/src/distribution/empirical.rs @@ -104,7 +104,10 @@ impl Empirical { self.var += (sum - 1.) * (data_point - self.mean) * (data_point - self.mean) / sum; self.mean += (data_point - self.mean) / sum; - *self.data.entry(map_key).or_insert(0) += 1; + self.data + .entry(map_key) + .and_modify(|c| *c += 1) + .or_insert(1); } pub fn remove(&mut self, data_point: f64) { From b604dbb9150e7f5d6b4292f38b14297ecc07dbb1 Mon Sep 17 00:00:00 2001 From: FreezyLemon Date: Mon, 11 Nov 2024 12:25:25 +0100 Subject: [PATCH 8/8] refactor!: Empirical::new -> Result<_, Infallible> No value of `Infallible` can ever exist, so it is statically proven that `Result` can never exist as a `Result::Err` variant. This allows layout optimizations and is arguably a clearer API. --- src/distribution/empirical.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/distribution/empirical.rs b/src/distribution/empirical.rs index a6522fd1..c1d8bc71 100644 --- a/src/distribution/empirical.rs +++ b/src/distribution/empirical.rs @@ -2,6 +2,7 @@ use crate::distribution::ContinuousCDF; use crate::statistics::*; use non_nan::NonNan; use std::collections::btree_map::{BTreeMap, Entry}; +use std::convert::Infallible; use std::ops::Bound; mod non_nan { @@ -83,8 +84,7 @@ impl Empirical { /// let mut result = Empirical::new(); /// assert!(result.is_ok()); /// ``` - #[allow(clippy::result_unit_err)] - pub fn new() -> Result { + pub fn new() -> Result { Ok(Empirical { data: BTreeMap::new(), sum: 0,