Skip to content

Commit

Permalink
ociruntime: handle images with high layer count (#7630)
Browse files Browse the repository at this point in the history
When an action requires an image with more than 20 layers, our mount
fails with:

```
create OCI bundle: create rootfs: mount overlayfs: no such file or directory
```

After some digging, it turns out that the mount options string cannot exceed 4095 characters.

Add special logic to break down images with many layers into smaller groups.
For each group, create an overlayfs mount called "merged<group-id>" in the same bundle dir.
The final overlayfs is then composed of these "merged" groups as lowerdirs.
  • Loading branch information
sluongng authored Oct 21, 2024
1 parent fd14e66 commit 3ffab9c
Show file tree
Hide file tree
Showing 5 changed files with 250 additions and 28 deletions.
8 changes: 8 additions & 0 deletions MODULE.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -528,12 +528,20 @@ oci.pull(
image = "gcr.io/distroless/java17-debian12",
platforms = ["linux/amd64"],
)
oci.pull(
name = "busybox",
digest = "sha256:c230832bd3b0be59a6c47ed64294f9ce71e91b327957920b6929a0caa8353140",
image = "mirror.gcr.io/library/busybox:1.36.1",
platforms = ["linux/amd64"],
)
use_repo(
oci,
"bazel_oci_image_base",
"bazel_oci_image_base_linux_amd64",
"buildbuddy_go_oci_image_base",
"buildbuddy_go_oci_image_base_linux_amd64",
"busybox",
"busybox_linux_amd64",
)

register_toolchains(
Expand Down
7 changes: 7 additions & 0 deletions WORKSPACE
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,13 @@ oci_pull(
platforms = ["linux/amd64"],
)

oci_pull(
name = "busybox",
digest = "sha256:c230832bd3b0be59a6c47ed64294f9ce71e91b327957920b6929a0caa8353140",
image = "mirror.gcr.io/library/busybox:1.36.1",
platforms = ["linux/amd64"],
)

# BuildBuddy Toolchain
# Keep up-to-date with docs/rbe-setup.md and docs/rbe-github-actions.md
http_archive(
Expand Down
10 changes: 10 additions & 0 deletions enterprise/server/remote_execution/containers/ociruntime/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ go_test(
":busybox",
":crun",
"//enterprise/server/remote_execution/runner/testworker",
"@busybox",
],
exec_properties = {
"test.workload-isolation-type": "firecracker",
Expand All @@ -65,6 +66,7 @@ go_test(
x_defs = {
"crunRlocationpath": "$(rlocationpath :crun)",
"busyboxRlocationpath": "$(rlocationpath :busybox)",
"ociBusyboxRlocationpath": "$(rlocationpath @busybox)",
"testworkerRlocationpath": "$(rlocationpath //enterprise/server/remote_execution/runner/testworker)",
},
deps = [
Expand All @@ -86,6 +88,14 @@ go_test(
"//server/util/status",
"//server/util/testing/flags",
"//server/util/uuid",
"@com_github_google_go_containerregistry//pkg/name",
"@com_github_google_go_containerregistry//pkg/registry",
"@com_github_google_go_containerregistry//pkg/v1:pkg",
"@com_github_google_go_containerregistry//pkg/v1/layout",
"@com_github_google_go_containerregistry//pkg/v1/mutate",
"@com_github_google_go_containerregistry//pkg/v1/partial",
"@com_github_google_go_containerregistry//pkg/v1/remote",
"@com_github_google_go_containerregistry//pkg/v1/types",
"@com_github_stretchr_testify//assert",
"@com_github_stretchr_testify//require",
"@io_bazel_rules_go//go/runfiles:go_default_library",
Expand Down
106 changes: 81 additions & 25 deletions enterprise/server/remote_execution/containers/ociruntime/ociruntime.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ const (
// backwards-compatible changes to image cache storage, and older version
// directories can be cleaned up.
imageCacheVersion = "v1" // TODO: add automatic cleanup if this is bumped.

// Maximum length of overlayfs mount options string.
maxMntOptsLength = 4095
)

//go:embed seccomp.json
Expand Down Expand Up @@ -210,6 +213,7 @@ type ociContainer struct {

cid string
workDir string
mergedMounts []string
overlayfsMounted bool
stats container.UsageStats
networkPool *networking.ContainerNetworkPool
Expand Down Expand Up @@ -441,8 +445,16 @@ func (c *ociContainer) Remove(ctx context.Context) error {
firstErr = status.UnavailableErrorf("delete container: %s", err)
}

if len(c.mergedMounts) > 0 {
for _, merged := range c.mergedMounts {
if err := unix.Unmount(merged, unix.MNT_FORCE); err != nil && firstErr == nil {
firstErr = status.UnavailableErrorf("unmount overlayfs: %s", err)
}
}
}

if c.overlayfsMounted {
if err := syscall.Unmount(c.rootfsPath(), syscall.MNT_FORCE); err != nil && firstErr == nil {
if err := unix.Unmount(c.rootfsPath(), unix.MNT_FORCE); err != nil && firstErr == nil {
firstErr = status.UnavailableErrorf("unmount overlayfs: %s", err)
}
}
Expand Down Expand Up @@ -546,16 +558,27 @@ func (c *ociContainer) createRootfs(ctx context.Context) error {
}

// Create an overlayfs with the pulled image layers.
var lowerDirs []string
image, ok := c.imageStore.CachedImage(c.imageRef)
if !ok {
return fmt.Errorf("bad state: attempted to create rootfs before pulling image")
}
// overlayfs "lowerdir" mount args are ordered from uppermost to lowermost,
// but manifest layers are ordered from lowermost to uppermost. So we
// iterate in reverse order when building the lowerdir args.
for i := len(image.Layers) - 1; i >= 0; i-- {
layer := image.Layers[i]

// Create workdir and upperdir.
workdir := filepath.Join(c.bundlePath(), "tmp", "rootfs.work")
if err := os.MkdirAll(workdir, 0755); err != nil {
return fmt.Errorf("create overlay workdir: %w", err)
}
upperdir := filepath.Join(c.bundlePath(), "tmp", "rootfs.upper")
if err := os.MkdirAll(upperdir, 0755); err != nil {
return fmt.Errorf("create overlay upperdir: %w", err)
}

// - userxattr is needed for compatibility with older kernels
// - volatile disables fsync, as a performance optimization
optionsTpl := "lowerdir=%s,upperdir=%s,workdir=%s,userxattr,volatile"
tplLen := len(optionsTpl) - 3*len("%s")
var lowerDirs []string
for _, layer := range image.Layers {
path := layerPath(c.imageCacheRoot, layer.DiffID)
// Skip empty dirs - these can cause conflicts since they will always
// have the same digest, and also just add more overhead.
Expand All @@ -567,28 +590,61 @@ func (c *ociContainer) createRootfs(ctx context.Context) error {
if len(children) == 0 {
continue
}
lowerDirs = append(lowerDirs, path)
}
// Create workdir and upperdir.
workdir := filepath.Join(c.bundlePath(), "tmp", "rootfs.work")
if err := os.MkdirAll(workdir, 0755); err != nil {
return fmt.Errorf("create overlay workdir: %w", err)
newLowerDirs := append(lowerDirs, path)
mergedWorkdir := filepath.Join(c.bundlePath(), "tmp", fmt.Sprintf("merged%d.work", len(c.mergedMounts)))
mergedUpperdir := filepath.Join(c.bundlePath(), "tmp", fmt.Sprintf("merged%d.upper", len(c.mergedMounts)))
mntOptsLen := tplLen + len(strings.Join(append(c.mergedMounts, newLowerDirs...), ":")) + max(
// mergedWorkdir and mergedUpperdir are always longer than workDir and upperdir.
// So this `max` is not strictly necessary, but it's here to fend off future changes.
len(mergedWorkdir)+len(mergedUpperdir),
len(workdir)+len(upperdir),
)
if len(newLowerDirs) == 1 || mntOptsLen <= maxMntOptsLength {
lowerDirs = newLowerDirs
continue
}

// If the total length of the lowerDirs exceeds the kernel page size,
// create a merged overlay mount to reduce the number of layers.
if err := os.MkdirAll(mergedWorkdir, 0755); err != nil {
return fmt.Errorf("create overlay workdir: %w", err)
}
if err := os.MkdirAll(mergedUpperdir, 0755); err != nil {
return fmt.Errorf("create overlay upperdir: %w", err)
}
merged := filepath.Join(c.bundlePath(), "tmp", fmt.Sprintf("merged%d", len(c.mergedMounts)))
if err := os.MkdirAll(merged, 0755); err != nil {
return fmt.Errorf("create overlay merged: %w", err)
}
slices.Reverse(lowerDirs)
mntOpts := fmt.Sprintf(optionsTpl, strings.Join(lowerDirs, ":"), mergedUpperdir, mergedWorkdir)
log.CtxDebugf(ctx, "Mounting merged overlayfs to %q, options=%q, len=%d", merged, mntOpts, len(mntOpts))
if len(mntOpts) > maxMntOptsLength {
return fmt.Errorf("mount options too long: %d / %d. Consider using container image with fewer layers.", len(mntOpts), maxMntOptsLength)
}
if err := unix.Mount("none", merged, "overlay", 0, mntOpts); err != nil {
return fmt.Errorf("mount overlayfs: %w", err)
}
c.mergedMounts = append(c.mergedMounts, merged)
lowerDirs = []string{path}
}
upperdir := filepath.Join(c.bundlePath(), "tmp", "rootfs.upper")
if err := os.MkdirAll(upperdir, 0755); err != nil {
return fmt.Errorf("create overlay upperdir: %w", err)
if len(c.mergedMounts) != 0 {
lowerDirs = append(c.mergedMounts, lowerDirs...)
}

// overlayfs "lowerdir" mount args are ordered from uppermost to lowermost,
// but manifest layers are ordered from lowermost to uppermost. So we need to
// reverse the order before constructing the mount option.
slices.Reverse(lowerDirs)

// TODO: do this mount inside a namespace so that it gets removed even if
// the executor crashes (also needed for rootless support)

// - userxattr is needed for compatibility with older kernels
// - volatile disables fsync, as a performance optimization
options := fmt.Sprintf(
"lowerdir=%s,upperdir=%s,workdir=%s,userxattr,volatile",
strings.Join(lowerDirs, ":"), upperdir, workdir)
log.CtxDebugf(ctx, "Mounting overlayfs to %q, options=%q", c.rootfsPath(), options)
if err := syscall.Mount("none", c.rootfsPath(), "overlay", 0, options); err != nil {
options := fmt.Sprintf(optionsTpl, strings.Join(lowerDirs, ":"), upperdir, workdir)
if len(options) > maxMntOptsLength {
return fmt.Errorf("mount options too long: %d / %d. Consider using container image with fewer layers.", len(options), maxMntOptsLength)
}
log.CtxDebugf(ctx, "Mounting overlayfs to %q, options=%q, length=%d", c.rootfsPath(), options, len(options))
if err := unix.Mount("none", c.rootfsPath(), "overlay", 0, options); err != nil {
return fmt.Errorf("mount overlayfs: %w", err)
}
c.overlayfsMounted = true
Expand Down Expand Up @@ -751,7 +807,7 @@ func (c *ociContainer) createSpec(ctx context.Context, cmd *repb.Command) (*spec
Annotations: map[string]string{
// Annotate with podman's default stop signal.
// TODO: is this strictly needed?
"org.opencontainers.image.stopSignal": syscall.SIGTERM.String(),
"org.opencontainers.image.stopSignal": unix.SIGTERM.String(),
},
Linux: &specs.Linux{
// TODO: set up cgroups
Expand Down
Loading

0 comments on commit 3ffab9c

Please sign in to comment.