Skip to content
This repository has been archived by the owner on Nov 7, 2024. It is now read-only.

Commit

Permalink
Fixing algo
Browse files Browse the repository at this point in the history
  • Loading branch information
RishabhSaini committed Apr 21, 2023
1 parent 22e27df commit cd0f72c
Showing 1 changed file with 106 additions and 64 deletions.
170 changes: 106 additions & 64 deletions lib/src/chunking.rs
Original file line number Diff line number Diff line change
Expand Up @@ -417,31 +417,56 @@ fn std_deviation(data: &[u64]) -> Option<f64> {
}
}

/// Computes the median and the median absolute deviation (MAD) of `data`.
///
/// The input is sorted in place (ascending) so that the index-based median
/// below is valid even when the caller passes unsorted data. For callers that
/// already hold sorted data (in either direction) the result is unchanged,
/// since the median of a sorted sequence is direction-independent.
///
/// Returns `(median, mad)` as a pair of `f64`.
///
/// # Panics
/// Panics if `data` is empty: indexing the middle element is then invalid
/// (and `len / 2 - 1` underflows for the even branch).
fn median_absolute_deviation(data: &mut Vec<u64>) -> (f64, f64) {
    // Sort so the positional median computed next is actually the median.
    // The original left this step commented out, silently relying on the
    // caller to pre-sort.
    data.sort_unstable();

    // Median of the sorted data. Each middle value is converted to f64
    // *before* the addition to avoid u64 overflow on very large sizes.
    let median_data: f64 = match data.len() % 2 {
        1 => data[data.len() / 2] as f64,
        _ => 0.5 * (data[data.len() / 2 - 1] as f64 + data[data.len() / 2] as f64),
    };

    // Absolute deviation of every element from the median.
    let mut absolute_deviations: Vec<f64> = data
        .iter()
        .map(|&size| f64::abs(size as f64 - median_data))
        .collect();

    // MAD is the median of the absolute deviations; f64 has no total order,
    // so sort via partial_cmp (no NaNs can occur here — inputs are finite).
    absolute_deviations.sort_by(|a, b| a.partial_cmp(b).unwrap());
    let mad: f64 = match absolute_deviations.len() % 2 {
        1 => absolute_deviations[absolute_deviations.len() / 2],
        _ => {
            0.5 * (absolute_deviations[absolute_deviations.len() / 2 - 1]
                + absolute_deviations[absolute_deviations.len() / 2])
        }
    };

    (median_data, mad)
}

//Assumes components is sorted by descending size
//Use MAD as threshold to partition packages [abs(low_limit), high_limit]
fn get_partitions_with_threshold(
components: Vec<&ObjectSourceMetaSized>,
limit_hs_bins: usize,
threshold: f64,
) -> Option<BTreeMap<String, Vec<&ObjectSourceMetaSized>>> {

let mut bins: BTreeMap<String, Vec<&ObjectSourceMetaSized>> = BTreeMap::new();
let mut med_size: Vec<&ObjectSourceMetaSized> = Vec::new();
let mut high_size: Vec<&ObjectSourceMetaSized> = Vec::new();

//Calculate Mean and Stddev for Size
let sizes: Vec<u64> = components.iter().map(|a| a.size).collect();
let mean_size = mean(&sizes)?;
let stddev_size = std_deviation(&sizes)?;
let mut size_low_limit = mean_size - threshold * stddev_size;
if size_low_limit < 0 as f64 {
size_low_limit = 100000_f64;
}
let size_high_limit = mean_size + threshold * stddev_size;
let mut sizes: Vec<u64> = components.iter().map(|a| a.size).collect();
let (median_size, mad_size) = median_absolute_deviation(&mut sizes);

let size_low_limit = 0.5 * f64::abs(median_size - threshold * mad_size);
let size_high_limit = median_size + threshold * mad_size;
println!("stddev/mean Method Low: {:#?}, High: {:#?} ", &size_low_limit, &size_high_limit);

for pkg in components {
let size = pkg.size as f64;

//hs
if size >= size_high_limit {
bins.entry("1hs".to_string())
.and_modify(|bin| bin.push(pkg))
.or_insert_with(|| vec![pkg]);
high_size.push(pkg);
}
//ls
else if size <= size_low_limit {
Expand All @@ -455,22 +480,33 @@ fn get_partitions_with_threshold(
}
}

let med_frequencies: Vec<u64> = med_size
//Extra hs packages
let mut remaining_pkgs: Vec<_> = high_size.drain(limit_hs_bins..).collect();
assert_eq!(high_size.len(), limit_hs_bins);

//Concatenate Extra hs packages + med_sizes keeps it still descending sorted
remaining_pkgs.append(&mut med_size);
bins.insert("1hs".to_string(), high_size);

let mut med_sizes: Vec<u64> = remaining_pkgs.iter().map(|a| a.size).collect();
let mut med_frequencies: Vec<u64> = remaining_pkgs
.iter()
.map(|a| a.meta.change_frequency.into())
.collect();
let med_sizes: Vec<u64> = med_size.iter().map(|a| a.size).collect();
let med_mean_freq = mean(&med_frequencies)?;
let med_mean_freq = mean(&med_frequencies)?;
let med_stddev_freq = std_deviation(&med_frequencies)?;
let med_mean_size = mean(&med_sizes)?;
let med_stddev_size = std_deviation(&med_sizes)?;

let med_freq_low_limit = med_mean_freq - threshold * med_stddev_freq;
let med_stddev_size = std_deviation(&med_sizes)?;
let med_freq_low_limit = 0.5f64 * f64::abs(med_mean_freq - threshold * med_stddev_freq);
let med_freq_high_limit = med_mean_freq + threshold * med_stddev_freq;
let med_size_low_limit = med_mean_size - threshold * med_stddev_size;
let med_size_low_limit = 0.5f64 * f64::abs(med_mean_size - threshold * med_stddev_size);
let med_size_high_limit = med_mean_size + threshold * med_stddev_size;

for pkg in med_size {
println!("fl: {}, fh: {}, sl: {}, sh: {}", med_freq_low_limit, med_freq_high_limit, med_size_low_limit, med_size_high_limit);

for pkg in remaining_pkgs {
let size = pkg.size as f64;
let freq = pkg.meta.change_frequency as f64;

Expand Down Expand Up @@ -545,7 +581,7 @@ fn get_partitions_with_threshold(
.or_insert_with(|| vec![pkg]);
}
}

for (name, pkgs) in &bins {
println!("{:#?}: {:#?}", name, pkgs.len());
}
Expand All @@ -557,6 +593,24 @@ fn get_partitions_with_threshold(
/// and a number of bins (possible container layers) to use, determine which components
/// go in which bin. This algorithm is pretty simple:
// Todo
//
// 2 stats to use:
// - Size
// - Probability[update] = no of changelogs * last buildtime epoch
//
// Total available bins = n
//
// 1 bin for all max_freq pkgs
// 1 bin for all new pkgs
// 1 bin for all low size pkgs
//
// 60% of n-3 bins for HS
// 40% of n-3 bins for MS
//
// If HS bins > limit, spillover to MS to package with LF(LS, MS)
// If MS bins > limit, fold by merging 2 bins from the end
//
fn basic_packing<'a>(
components: &'a [ObjectSourceMetaSized],
bin_size: NonZeroU32,
Expand Down Expand Up @@ -641,7 +695,6 @@ fn basic_packing<'a>(

println!("Creating new packing structure");

components.sort_by(|a, b| a.meta.change_frequency.cmp(&b.meta.change_frequency));
let mut max_freq_components: Vec<&ObjectSourceMetaSized> = Vec::new();
components.retain(|pkg| {
let retain: bool = pkg.meta.change_frequency != u32::MAX;
Expand All @@ -654,68 +707,54 @@ fn basic_packing<'a>(
match components_len_after_max_freq {
0 => (),
_ => {
let partitions = get_partitions_with_threshold(components, 0.5)
.expect("Partitioning components into sets");

// Max_bins -:
// 1 for max_freq
// 1 for new_pkgs
// 1 for ls
// n for hs
// Left for ms

let qty_hs_bins = match partitions.get("1hs") {
//Defining Limits of each bins
let limit_ls_bins = 1usize;
let limit_new_bins = 1usize;
let limit_new_pkgs = 0usize;
let limit_max_frequency_bins = 1usize;
let limit_max_frequency_pkgs = max_freq_components.len();
let limit_hs_bins = (0.6 * (bin_size.get() - (limit_ls_bins + limit_new_bins + limit_max_frequency_bins) as u32) as f32).floor();
let limit_ms_bins = (0.4 * (bin_size.get() - (limit_ls_bins + limit_new_bins + limit_max_frequency_bins) as u32) as f32).floor();

let partitions = get_partitions_with_threshold(components, limit_hs_bins as usize, 2f64).expect("Partitioning components into sets");

let limit_ls_pkgs = match partitions.get("2ls") {
Some(n) => n.len(),
None => 0usize,
};
let qty_hs_pkgs = qty_hs_bins.clone();

let qty_ls_bins = 1usize;
let qty_ls_pkgs = match partitions.get("2ls") {
Some(n) => n.len(),
None => 0usize,
};

let qty_new_bins = 1usize;
let _qty_new_pkgs = 0usize;

let qty_max_frequency_bins = 1usize;
let _qty_max_frequency_pkgs = max_freq_components.len();

//Can be negative or very low if qty_hs_pkgs is very high
let qty_ms_bins = bin_size.get() as usize
- (qty_hs_bins + qty_ls_bins + qty_new_bins + qty_max_frequency_bins);

let pkg_per_bin_ms: usize =
match (components_len_after_max_freq - qty_hs_pkgs - qty_ls_pkgs)
.checked_div(qty_ms_bins)
{
Some(n) => {
if n >= 1 {
n
} else {
3usize
}
let pkg_per_bin_ms : usize = match (components_len_after_max_freq - limit_hs_bins as usize - limit_ls_pkgs).checked_div(limit_ms_bins as usize) {
Some(n) => {
if n < 1 {
panic!("Error: No of bins <= 3");
}
None => 6usize,
};

n
},
None => {
panic!("Error: No of bins <= 3")
}
};

//Bins assignment
for partition in partitions.keys() {
let pkgs = partitions.get(partition).expect("hashset");

if partition == "1hs" {
for pkg in pkgs {
//println!("hs Size: {:#?} , Frequency: {:#?}", pkg.size, pkg.meta.change_frequency);
r.push(vec![*pkg]);
}
} else if partition == "2ls" {
let mut bin: Vec<&ObjectSourceMetaSized> = Vec::new();
for pkg in pkgs {
//println!("ls Size: {:#?} , Frequency: {:#?}", pkg.size, pkg.meta.change_frequency);
bin.push(*pkg);
}
r.push(bin);
} else {
let mut bin: Vec<&ObjectSourceMetaSized> = Vec::new();
for (i, pkg) in pkgs.iter().enumerate() {
//println!("ms_{:#?} Size: {:#?} , Frequency: {:#?}", partition, pkg.size, pkg.meta.change_frequency);
if bin.len() < pkg_per_bin_ms {
bin.push(*pkg);
} else {
Expand All @@ -730,8 +769,10 @@ fn basic_packing<'a>(
}
}
}

println!("Bins before unoptimized build: {}", r.len());

//Addressing MS breach

//Leave second last bin for max_freq_components
//Leave last bin for new packages added, so to not disturb
//previous bins.
Expand All @@ -754,6 +795,7 @@ fn basic_packing<'a>(
}
}
r.push(max_freq_components);

let new_pkgs_bin: Vec<&ObjectSourceMetaSized> = Vec::new();
r.push(new_pkgs_bin);
let mut after_processing_pkgs_len = 0;
Expand Down

0 comments on commit cd0f72c

Please sign in to comment.