Skip to content
This repository has been archived by the owner on Nov 7, 2024. It is now read-only.

Commit

Permalink
chunking: Bin packing algorithm which allows to minimize
Browse files Browse the repository at this point in the history
layer deltas using historical builds

Revamp basic_packing to follow the prior packing structure
if the --prior-build flag exists. This simply modifies existing
layers with upgrades/downgrades/removal of packages. The last layer
contains any new addition to packages.
In the case where --prior-build flag does not exist, the frequency
of updates of the packages (frequencyinfo) and size is utilized to
segment packages into different partitions (all combinations of
low, medium, high frequency and low, medium, high size). The partition
that each package falls into is decided by its deviation from mean.
Then the packages are allotted to different layers to ensure
1) low-frequency packages don't mix with high-frequency packages
2) high-sized packages are allotted separate bins
3) low-sized packages can be put together in the same bin
This problem is known as the multi-objective bin packing problem with
constraints, also known as the multiple knapsack problem. The objectives
conflict given our constraints, and hence a compromise is made to minimize layer deltas
while respecting the hard limit of overlayfs that the kernel can handle.
  • Loading branch information
RishabhSaini committed May 15, 2023
1 parent 95f2366 commit 5159164
Show file tree
Hide file tree
Showing 11 changed files with 745 additions and 88 deletions.
630 changes: 564 additions & 66 deletions lib/src/chunking.rs

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion lib/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -584,7 +584,7 @@ async fn container_export(
..Default::default()
};
let pushed =
crate::container::encapsulate(repo, rev, &config, Some(opts), None, imgref).await?;
crate::container::encapsulate(repo, rev, &config, None, Some(opts), None, imgref).await?;
println!("{}", pushed);
Ok(())
}
Expand Down
48 changes: 40 additions & 8 deletions lib/src/container/encapsulate.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
//! APIs for creating container images from OSTree commits
use super::ocidir::{Layer, OciDir};
use super::{ocidir, OstreeImageReference, Transport};
use super::{ocidir, OstreeImageReference, Transport, CONTENT_ANNOTATION};
use super::{ImageReference, SignatureSource, OSTREE_COMMIT_LABEL};
use crate::chunking::{Chunk, Chunking, ObjectMetaSized};
use crate::container::skopeo;
Expand Down Expand Up @@ -104,7 +104,7 @@ fn export_chunks(
ociw: &mut OciDir,
chunks: Vec<Chunk>,
opts: &ExportOpts,
) -> Result<Vec<(Layer, String)>> {
) -> Result<Vec<(Layer, String, Vec<String>)>> {
chunks
.into_iter()
.enumerate()
Expand All @@ -113,7 +113,7 @@ fn export_chunks(
ostree_tar::export_chunk(repo, commit, chunk.content, &mut w)
.with_context(|| format!("Exporting chunk {i}"))?;
let w = w.into_inner()?;
Ok((w.complete()?, chunk.name))
Ok((w.complete()?, chunk.name, chunk.packages))
})
.collect()
}
Expand Down Expand Up @@ -151,11 +151,20 @@ fn export_chunked(
.clone();

// Add the ostree layer
ociw.push_layer(manifest, imgcfg, ostree_layer, description);
ociw.push_layer(manifest, imgcfg, ostree_layer, description, None);
// Add the component/content layers
for (layer, name) in layers {
ociw.push_layer(manifest, imgcfg, layer, name.as_str());
for (layer, name, packages) in layers {
let mut annotation_component_layer = HashMap::new();
annotation_component_layer.insert(CONTENT_ANNOTATION.to_string(), packages.join(","));
ociw.push_layer(
manifest,
imgcfg,
layer,
name.as_str(),
Some(annotation_component_layer),
);
}

// This label (mentioned above) points to the last layer that is part of
// the ostree commit.
labels.insert(
Expand All @@ -167,13 +176,15 @@ fn export_chunked(

/// Generate an OCI image from a given ostree root
#[context("Building oci")]
#[allow(clippy::too_many_arguments)]
fn build_oci(
repo: &ostree::Repo,
rev: &str,
ocidir_path: &Path,
tag: Option<&str>,
config: &Config,
opts: ExportOpts,
prior_build: Option<&oci_image::ImageManifest>,
contentmeta: Option<crate::chunking::ObjectMetaSized>,
) -> Result<ImageReference> {
if !ocidir_path.exists() {
Expand Down Expand Up @@ -209,7 +220,15 @@ fn build_oci(
let mut manifest = ocidir::new_empty_manifest().build().unwrap();

let chunking = contentmeta
.map(|meta| crate::chunking::Chunking::from_mapping(repo, commit, meta, opts.max_layers))
.map(|meta| {
crate::chunking::Chunking::from_mapping(
repo,
commit,
meta,
&opts.max_layers,
prior_build,
)
})
.transpose()?;
// If no chunking was provided, create a logical single chunk.
let chunking = chunking
Expand Down Expand Up @@ -291,6 +310,7 @@ async fn build_impl(
repo: &ostree::Repo,
ostree_ref: &str,
config: &Config,
prior_build: Option<&oci_image::ImageManifest>,
opts: Option<ExportOpts>,
contentmeta: Option<ObjectMetaSized>,
dest: &ImageReference,
Expand All @@ -308,6 +328,7 @@ async fn build_impl(
tag,
config,
opts,
prior_build,
contentmeta,
)?;
None
Expand All @@ -323,6 +344,7 @@ async fn build_impl(
None,
config,
opts,
prior_build,
contentmeta,
)?;

Expand Down Expand Up @@ -377,9 +399,19 @@ pub async fn encapsulate<S: AsRef<str>>(
repo: &ostree::Repo,
ostree_ref: S,
config: &Config,
prior_build: Option<&oci_image::ImageManifest>,
opts: Option<ExportOpts>,
contentmeta: Option<ObjectMetaSized>,
dest: &ImageReference,
) -> Result<String> {
build_impl(repo, ostree_ref.as_ref(), config, opts, contentmeta, dest).await
build_impl(
repo,
ostree_ref.as_ref(),
config,
prior_build,
opts,
contentmeta,
dest,
)
.await
}
4 changes: 4 additions & 0 deletions lib/src/container/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ use std::str::FromStr;
/// The label injected into a container image that contains the ostree commit SHA-256.
pub const OSTREE_COMMIT_LABEL: &str = "ostree.commit";

/// The name of an annotation attached to a layer which names the packages/components
/// which are part of it.
pub(crate) const CONTENT_ANNOTATION: &str = "ostree.components";

/// Our generic catchall fatal error, expected to be converted
/// to a string to output to a terminal or logs.
type Result<T> = anyhow::Result<T>;
Expand Down
5 changes: 3 additions & 2 deletions lib/src/container/ocidir.rs
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,8 @@ impl OciDir {
config: &mut oci_image::ImageConfiguration,
layer: Layer,
description: &str,
annotations: Option<HashMap<String, String>>,
) {
let annotations: Option<HashMap<String, String>> = None;
self.push_layer_annotated(manifest, config, layer, annotations, description);
}

Expand Down Expand Up @@ -531,7 +531,8 @@ mod tests {
let mut config = oci_image::ImageConfigurationBuilder::default()
.build()
.unwrap();
w.push_layer(&mut manifest, &mut config, root_layer, "root");
let annotations: Option<HashMap<String, String>> = None;
w.push_layer(&mut manifest, &mut config, root_layer, "root", annotations);
let config = w.write_config(config)?;
manifest.set_config(config);
w.replace_with_single_manifest(manifest.clone(), oci_image::Platform::default())?;
Expand Down
11 changes: 9 additions & 2 deletions lib/src/fixture.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,9 @@ d tmp
"## };
pub const CONTENTS_CHECKSUM_V0: &str =
"5e41de82f9f861fa51e53ce6dd640a260e4fb29b7657f5a3f14157e93d2c0659";
pub static CONTENTS_V0_LEN: Lazy<usize> = Lazy::new(|| OWNERS.len().checked_sub(1).unwrap());
// 1 for ostree commit, 2 for max frequency packages, 3 as empty layer
pub const LAYERS_V0_LEN: usize = 3usize;
pub const PKGS_V0_LEN: usize = 7usize;

#[derive(Debug, PartialEq, Eq)]
enum SeLabel {
Expand Down Expand Up @@ -317,6 +319,7 @@ fn build_mapping_recurse(
name: Rc::clone(&owner),
srcid: Rc::clone(&owner),
change_time_offset: u32::MAX,
change_frequency: u32::MAX,
});
}

Expand Down Expand Up @@ -661,11 +664,15 @@ impl Fixture {
let contentmeta = self.get_object_meta().context("Computing object meta")?;
let contentmeta = ObjectMetaSized::compute_sizes(self.srcrepo(), contentmeta)
.context("Computing sizes")?;
let opts = ExportOpts::default();
let opts = ExportOpts {
max_layers: std::num::NonZeroU32::new(PKGS_V0_LEN as u32),
..Default::default()
};
let digest = crate::container::encapsulate(
self.srcrepo(),
self.testref(),
&config,
None,
Some(opts),
Some(contentmeta),
&imgref,
Expand Down
Binary file modified lib/src/fixtures/fedora-coreos-contentmeta.json.gz
Binary file not shown.
1 change: 1 addition & 0 deletions lib/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ pub mod objectsource;
pub(crate) mod objgv;
#[cfg(feature = "internal-testing-api")]
pub mod ostree_manual;
pub(crate) mod statistics;

mod utils;

Expand Down
6 changes: 3 additions & 3 deletions lib/src/objectsource.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,9 +41,7 @@ pub struct ObjectSourceMeta {
/// Unique identifier, does not need to be human readable, but can be.
#[serde(with = "rcstr_serialize")]
pub identifier: ContentID,
/// Identifier for this source (e.g. package name-version, git repo).
/// Unlike the [`ContentID`], this should be human readable. It likely comes from an external source,
/// and may be re-serialized.
/// Just the name of the package (no version), needs to be human readable.
#[serde(with = "rcstr_serialize")]
pub name: Rc<str>,
/// Identifier for the *source* of this content; for example, if multiple binary
Expand All @@ -54,6 +52,8 @@ pub struct ObjectSourceMeta {
/// One suggested way to generate this number is to have it be in units of hours or days
/// since the earliest changed item.
pub change_time_offset: u32,
/// Change frequency
pub change_frequency: u32,
}

impl PartialEq for ObjectSourceMeta {
Expand Down
109 changes: 109 additions & 0 deletions lib/src/statistics.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
//! This module holds implementations of some basic statistical properties, such as mean and standard deviation.
/// Arithmetic mean of `data`, or `None` for an empty slice.
///
/// Accumulates in `f64` rather than `u64` so that very large inputs cannot
/// overflow the intermediate sum (a `u64` sum panics in debug builds and
/// wraps in release builds once the total exceeds `u64::MAX`).
pub(crate) fn mean(data: &[u64]) -> Option<f64> {
    if data.is_empty() {
        None
    } else {
        Some(data.iter().map(|&v| v as f64).sum::<f64>() / data.len() as f64)
    }
}

/// Population standard deviation of `data`; `None` when the slice is empty.
pub(crate) fn std_deviation(data: &[u64]) -> Option<f64> {
    // An empty slice has no mean, so `?` short-circuits to `None` here.
    let data_mean = mean(data)?;
    let count = data.len() as f64;
    // Sum of squared deviations from the mean.
    let sum_sq: f64 = data
        .iter()
        .map(|&value| {
            let diff = data_mean - value as f64;
            diff * diff
        })
        .sum();
    Some((sum_sq / count).sqrt())
}

/// Median and median absolute deviation (MAD) of `data`.
///
/// `data` is assumed to already be sorted ascending; the median is read
/// directly from the middle element(s). Returns `None` for an empty slice.
///
/// The slice is taken as `&mut` for compatibility with existing callers,
/// but it is never modified.
pub(crate) fn median_absolute_deviation(data: &mut [u64]) -> Option<(f64, f64)> {
    if data.is_empty() {
        return None;
    }

    // Median of the (sorted) input. Cast each operand to f64 *before*
    // adding: summing two large u64 values first could overflow (panic in
    // debug builds, wrap in release builds).
    let len = data.len();
    let median_data: f64 = if len % 2 == 1 {
        data[len / 2] as f64
    } else {
        0.5 * (data[len / 2 - 1] as f64 + data[len / 2] as f64)
    };

    // Absolute deviations of each sample from the median.
    let mut absolute_deviations: Vec<f64> = data
        .iter()
        .map(|&size| f64::abs(size as f64 - median_data))
        .collect();
    absolute_deviations.sort_by(|a, b| a.partial_cmp(b).unwrap());

    // Median of the deviations; operands are already f64 here, so the
    // midpoint cannot overflow.
    let l = absolute_deviations.len();
    let mad: f64 = if l % 2 == 1 {
        absolute_deviations[l / 2]
    } else {
        0.5 * (absolute_deviations[l / 2 - 1] + absolute_deviations[l / 2])
    };

    Some((median_data, mad))
}

#[test]
fn test_mean() {
    // The empty slice has no mean.
    assert_eq!(mean(&[]), None);
    // A singleton's mean is the value itself.
    for v in [0u64, 1, 5, 100] {
        assert_eq!(mean(&[v]), Some(v as f64));
    }
    // Multi-element cases whose results are exactly representable in f64.
    let cases: [(&[u64], f64); 3] = [
        (&[0, 1], 0.5),
        (&[0, 5, 100], 35.0),
        (&[7, 4, 30, 14], 13.75),
    ];
    for (input, expected) in cases {
        assert_eq!(mean(input), Some(expected));
    }
}

#[test]
fn test_std_deviation() {
    // No deviation is defined for an empty slice.
    assert_eq!(std_deviation(&[]), None);
    // A single sample always has zero spread.
    for v in [0u64, 1, 5, 100] {
        assert_eq!(std_deviation(&[v]), Some(0.0));
    }
    let cases: [(&[u64], f64); 3] = [
        (&[1, 4], 1.5),
        (&[2, 2, 2, 2], 0.0),
        (
            &[1, 20, 300, 4000, 50000, 600000, 7000000, 80000000],
            26193874.56387471,
        ),
    ];
    for (input, expected) in cases {
        assert_eq!(std_deviation(input), Some(expected));
    }
}

#[test]
fn test_median_absolute_deviation() {
    // The function reads the median positionally, so inputs are expected
    // to be sorted ascending.
    assert_eq!(median_absolute_deviation(&mut []), None);
    // A singleton is its own median with zero deviation.
    for v in [0u64, 1, 5, 100] {
        assert_eq!(median_absolute_deviation(&mut [v]), Some((v as f64, 0.0)));
    }
    assert_eq!(median_absolute_deviation(&mut [1, 4]), Some((2.5, 1.5)));
    assert_eq!(
        median_absolute_deviation(&mut [2, 2, 2, 2]),
        Some((2.0, 0.0))
    );
    assert_eq!(
        median_absolute_deviation(&mut [
            1, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 6, 7, 7, 7, 8, 9, 12, 52, 90
        ]),
        Some((6.0, 2.0))
    );

    // When more than half of the samples share a single value, MAD
    // collapses to 0, so any differing value is classified as an outlier.
    // (The middle element of this input is still the intended median even
    // though the trailing 0 is out of order.)
    assert_eq!(
        median_absolute_deviation(&mut [0, 1, 1, 1, 1, 1, 1, 1, 0]),
        Some((1.0, 0.0))
    );
}
Loading

0 comments on commit 5159164

Please sign in to comment.