From cd0f72c05ca8e99df6c9a58582dd60b306280d9d Mon Sep 17 00:00:00 2001 From: RishabhSaini Date: Wed, 19 Apr 2023 17:19:32 -0400 Subject: [PATCH] Fixing algo --- lib/src/chunking.rs | 170 +++++++++++++++++++++++++++----------------- 1 file changed, 106 insertions(+), 64 deletions(-) diff --git a/lib/src/chunking.rs b/lib/src/chunking.rs index 8e7f5769..538c4c38 100644 --- a/lib/src/chunking.rs +++ b/lib/src/chunking.rs @@ -417,31 +417,56 @@ fn std_deviation(data: &[u64]) -> Option { } } +fn median_absolute_deviation(data: &mut Vec) -> (f64, f64) { + //Sort data + //data.sort_by(|a, b| a.partial_cmp(b).unwrap()); + + //Find median of data + let median_data : f64 = match data.len() % 2 { + 1 => data[data.len() / 2] as f64, + _ => 0.5 * (data[data.len() / 2 - 1] + data[data.len() / 2]) as f64 + }; + + //Absolute deviations + let mut absolute_deviations = Vec::new(); + for size in data{ + absolute_deviations.push(f64::abs(*size as f64 - median_data)) + } + + absolute_deviations.sort_by(|a, b| a.partial_cmp(b).unwrap()); + let mad : f64 = match absolute_deviations.len() % 2 { + 1 => absolute_deviations[absolute_deviations.len() / 2], + _ => 0.5 * (absolute_deviations[absolute_deviations.len() / 2 - 1] + absolute_deviations[absolute_deviations.len() / 2]) + }; + + (median_data, mad) +} + +//Assumes components is sorted by descending size +//Use MAD as threshold to partition packages [abs(low_limit), high_limit] fn get_partitions_with_threshold( components: Vec<&ObjectSourceMetaSized>, + limit_hs_bins: usize, threshold: f64, ) -> Option>> { + let mut bins: BTreeMap> = BTreeMap::new(); let mut med_size: Vec<&ObjectSourceMetaSized> = Vec::new(); + let mut high_size: Vec<&ObjectSourceMetaSized> = Vec::new(); - //Calculate Mean and Stddev for Size - let sizes: Vec = components.iter().map(|a| a.size).collect(); - let mean_size = mean(&sizes)?; - let stddev_size = std_deviation(&sizes)?; - let mut size_low_limit = mean_size - threshold * stddev_size; - if size_low_limit < 0 as 
f64 { - size_low_limit = 100000_f64; - } - let size_high_limit = mean_size + threshold * stddev_size; + let mut sizes: Vec = components.iter().map(|a| a.size).collect(); + let (median_size, mad_size) = median_absolute_deviation(&mut sizes); + + let size_low_limit = 0.5 * f64::abs(median_size - threshold * mad_size); + let size_high_limit = median_size + threshold * mad_size; + println!("stddev/mean Method Low: {:#?}, High: {:#?} ", &size_low_limit, &size_high_limit); for pkg in components { let size = pkg.size as f64; //hs if size >= size_high_limit { - bins.entry("1hs".to_string()) - .and_modify(|bin| bin.push(pkg)) - .or_insert_with(|| vec![pkg]); + high_size.push(pkg); } //ls else if size <= size_low_limit { @@ -455,22 +480,33 @@ fn get_partitions_with_threshold( } } - let med_frequencies: Vec = med_size + //Extra hs packages + let mut remaining_pkgs: Vec<_> = high_size.drain(limit_hs_bins..).collect(); + assert_eq!(high_size.len(), limit_hs_bins); + + //Concatenate Extra hs packages + med_sizes keeps it still descending sorted + remaining_pkgs.append(&mut med_size); + bins.insert("1hs".to_string(), high_size); + + let mut med_sizes: Vec = remaining_pkgs.iter().map(|a| a.size).collect(); + let mut med_frequencies: Vec = remaining_pkgs .iter() .map(|a| a.meta.change_frequency.into()) .collect(); - let med_sizes: Vec = med_size.iter().map(|a| a.size).collect(); - let med_mean_freq = mean(&med_frequencies)?; + + let med_mean_freq = mean(&med_frequencies)?; let med_stddev_freq = std_deviation(&med_frequencies)?; let med_mean_size = mean(&med_sizes)?; - let med_stddev_size = std_deviation(&med_sizes)?; - - let med_freq_low_limit = med_mean_freq - threshold * med_stddev_freq; + let med_stddev_size = std_deviation(&med_sizes)?; + + let med_freq_low_limit = 0.5f64 * f64::abs(med_mean_freq - threshold * med_stddev_freq); let med_freq_high_limit = med_mean_freq + threshold * med_stddev_freq; - let med_size_low_limit = med_mean_size - threshold * med_stddev_size; + let 
med_size_low_limit = 0.5f64 * f64::abs(med_mean_size - threshold * med_stddev_size); let med_size_high_limit = med_mean_size + threshold * med_stddev_size; - for pkg in med_size { + println!("fl: {}, fh: {}, sl: {}, sh: {}", med_freq_low_limit, med_freq_high_limit, med_size_low_limit, med_size_high_limit); + + for pkg in remaining_pkgs { let size = pkg.size as f64; let freq = pkg.meta.change_frequency as f64; @@ -545,7 +581,7 @@ fn get_partitions_with_threshold( .or_insert_with(|| vec![pkg]); } } - + for (name, pkgs) in &bins { println!("{:#?}: {:#?}", name, pkgs.len()); } @@ -557,6 +593,24 @@ fn get_partitions_with_threshold( /// and a number of bins (possible container layers) to use, determine which components /// go in which bin. This algorithm is pretty simple: +// Todo +// +// 2 stats to use: +// - Size +// - Probability[update] = no of changelogs * last buildtime epoch +// +// Total available bins = n +// +// 1 bin for all max_freq pkgs +// 1 bin for all new pkgs +// 1 bin for all low size pkgs +// +// 60% of n-3 bins for HS +// 40% of n-3 bins for MS +// +// If HS bins > limit, spillover to MS to package with LF(LS, MS) +// If MS bins > limit, fold by merging 2 bins from the end +// fn basic_packing<'a>( components: &'a [ObjectSourceMetaSized], bin_size: NonZeroU32, @@ -641,7 +695,6 @@ fn basic_packing<'a>( println!("Creating new packing structure"); - components.sort_by(|a, b| a.meta.change_frequency.cmp(&b.meta.change_frequency)); let mut max_freq_components: Vec<&ObjectSourceMetaSized> = Vec::new(); components.retain(|pkg| { let retain: bool = pkg.meta.change_frequency != u32::MAX; @@ -654,68 +707,54 @@ fn basic_packing<'a>( match components_len_after_max_freq { 0 => (), _ => { - let partitions = get_partitions_with_threshold(components, 0.5) - .expect("Partitioning components into sets"); - - // Max_bins -: - // 1 for max_freq - // 1 for new_pkgs - // 1 for ls - // n for hs - // Left for ms - - let qty_hs_bins = match partitions.get("1hs") { + 
//Defining Limits of each bins + let limit_ls_bins = 1usize; + let limit_new_bins = 1usize; + let limit_new_pkgs = 0usize; + let limit_max_frequency_bins = 1usize; + let limit_max_frequency_pkgs = max_freq_components.len(); + let limit_hs_bins = (0.6 * (bin_size.get() - (limit_ls_bins + limit_new_bins + limit_max_frequency_bins) as u32) as f32).floor(); + let limit_ms_bins = (0.4 * (bin_size.get() - (limit_ls_bins + limit_new_bins + limit_max_frequency_bins) as u32) as f32).floor(); + + let partitions = get_partitions_with_threshold(components, limit_hs_bins as usize, 2f64).expect("Partitioning components into sets"); + + let limit_ls_pkgs = match partitions.get("2ls") { Some(n) => n.len(), None => 0usize, }; - let qty_hs_pkgs = qty_hs_bins.clone(); - let qty_ls_bins = 1usize; - let qty_ls_pkgs = match partitions.get("2ls") { - Some(n) => n.len(), - None => 0usize, - }; - - let qty_new_bins = 1usize; - let _qty_new_pkgs = 0usize; - - let qty_max_frequency_bins = 1usize; - let _qty_max_frequency_pkgs = max_freq_components.len(); - - //Can be negative or very low if qty_hs_pkgs is very high - let qty_ms_bins = bin_size.get() as usize - - (qty_hs_bins + qty_ls_bins + qty_new_bins + qty_max_frequency_bins); - - let pkg_per_bin_ms: usize = - match (components_len_after_max_freq - qty_hs_pkgs - qty_ls_pkgs) - .checked_div(qty_ms_bins) - { - Some(n) => { - if n >= 1 { - n - } else { - 3usize - } + let pkg_per_bin_ms : usize = match (components_len_after_max_freq - limit_hs_bins as usize - limit_ls_pkgs).checked_div(limit_ms_bins as usize) { + Some(n) => { + if n < 1 { + panic!("Error: No of bins <= 3"); } - None => 6usize, - }; - + n + }, + None => { + panic!("Error: No of bins <= 3") + } + }; + + //Bins assignment for partition in partitions.keys() { let pkgs = partitions.get(partition).expect("hashset"); if partition == "1hs" { for pkg in pkgs { + //println!("hs Size: {:#?} , Frequency: {:#?}", pkg.size, pkg.meta.change_frequency); r.push(vec![*pkg]); } } else if 
partition == "2ls" { let mut bin: Vec<&ObjectSourceMetaSized> = Vec::new(); for pkg in pkgs { + //println!("ls Size: {:#?} , Frequency: {:#?}", pkg.size, pkg.meta.change_frequency); bin.push(*pkg); } r.push(bin); } else { let mut bin: Vec<&ObjectSourceMetaSized> = Vec::new(); for (i, pkg) in pkgs.iter().enumerate() { + //println!("ms_{:#?} Size: {:#?} , Frequency: {:#?}", partition, pkg.size, pkg.meta.change_frequency); if bin.len() < pkg_per_bin_ms { bin.push(*pkg); } else { @@ -730,8 +769,10 @@ fn basic_packing<'a>( } } } - println!("Bins before unoptimized build: {}", r.len()); + + //Addressing MS breach + //Leave second last bin for max_freq_components //Leave last bin for new packages added, so to not disturb //previous bins. @@ -754,6 +795,7 @@ fn basic_packing<'a>( } } r.push(max_freq_components); + let new_pkgs_bin: Vec<&ObjectSourceMetaSized> = Vec::new(); r.push(new_pkgs_bin); let mut after_processing_pkgs_len = 0;