From 4a2234de3c6325a3c7ead04e50a2f81954291b75 Mon Sep 17 00:00:00 2001 From: David Son Date: Thu, 6 Jun 2024 16:42:48 +0000 Subject: [PATCH] Add support for idmapped layers This commit adds ID mapping functionality in SOCI. ID mapping is enabled if the correct labels are passed through. To avoid having containerd handle the ID mapping, we must declare in the containerd config file that the snapshotter supports ID mapping. Note that usage of this feature requires proxy plugins to have capabilities, which is only supported in containerd v1.7.23 onwards. Signed-off-by: David Son --- fs/fs.go | 113 +++++++++++++++--- fs/fs_test.go | 11 +- fs/layer/layer.go | 7 +- fs/layer/node.go | 39 ++++--- fs/layer/node_test.go | 3 +- fs/layer/util_test.go | 3 +- fs/source/source.go | 3 + go.mod | 2 +- idtools/idmap.go | 66 +++++++++++ integration/run_test.go | 235 ++++++++++++++++++++++++++++++++++++++ integration/util_test.go | 1 + snapshot/snapshot.go | 122 +++++++++++++++++++- snapshot/snapshot_test.go | 17 +++ 13 files changed, 571 insertions(+), 51 deletions(-) diff --git a/fs/fs.go b/fs/fs.go index 3f3dbbb58..e5edd18a7 100644 --- a/fs/fs.go +++ b/fs/fs.go @@ -45,9 +45,13 @@ package fs import ( "context" "fmt" + "io" golog "log" "net/http" + "os" "os/exec" + "path/filepath" + "strings" "sync" "syscall" "time" @@ -59,6 +63,7 @@ import ( layermetrics "github.com/awslabs/soci-snapshotter/fs/metrics/layer" "github.com/awslabs/soci-snapshotter/fs/remote" "github.com/awslabs/soci-snapshotter/fs/source" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/awslabs/soci-snapshotter/metadata" "github.com/awslabs/soci-snapshotter/snapshot" "github.com/awslabs/soci-snapshotter/soci" @@ -67,6 +72,7 @@ import ( ctdsnapshotters "github.com/containerd/containerd/pkg/snapshotters" "github.com/containerd/containerd/reference" "github.com/containerd/containerd/remotes/docker" + "github.com/containerd/errdefs" "github.com/containerd/log" metrics "github.com/docker/go-metrics" fusefs "github.com/hanwen/go-fuse/v2/fs" @@ -455,6 +461,58 @@ func (fs *filesystem) getSociContext(ctx context.Context, imageRef, indexDigest, return c, err } +func getIDMappedMountpoint(mountpoint, activeLayerID string) string { + d := filepath.Dir(mountpoint) + return filepath.Join(fmt.Sprintf("%s_%s", d, activeLayerID), "fs") +} + +func (fs *filesystem) IDMapMount(ctx context.Context, mountpoint, activeLayerID string, idmapper idtools.IDMap) (string, error) { + newMountpoint := getIDMappedMountpoint(mountpoint, activeLayerID) + logger := log.G(ctx).WithField("mountpoint", newMountpoint) + + logger.Debug("creating remote id-mapped mount") + if err := os.Mkdir(filepath.Dir(newMountpoint), 0700); err != nil { + return "", err + } + if err := os.Mkdir(newMountpoint, 0755); err != nil { + return "", err + } + + fs.layerMu.Lock() + l := fs.layer[mountpoint] + if l == nil { + fs.layerMu.Unlock() + logger.Error("failed to create remote id-mapped mount") + return "", errdefs.ErrNotFound + } + fs.layer[newMountpoint] = l + fs.layerMu.Unlock() + node, err := l.RootNode(0, idmapper) + if err != nil { + return "", err + } + + fuseLogger := log.L. + WithField("mountpoint", mountpoint). + WriterLevel(logrus.TraceLevel) + + return newMountpoint, fs.setupFuseServer(ctx, newMountpoint, node, l, fuseLogger, nil) +} + +func (fs *filesystem) IDMapMountLocal(ctx context.Context, mountpoint, activeLayerID string, idmapper idtools.IDMap) (string, error) { + newMountpoint := getIDMappedMountpoint(mountpoint, activeLayerID) + logger := log.G(ctx).WithField("mountpoint", newMountpoint) + + logger.Debug("creating local id-mapped mount") + if err := idtools.RemapDir(ctx, mountpoint, newMountpoint, idmapper); err != nil { + logger.WithError(err).Error("failed to create local mount") + return "", err + } + + logger.Debug("successfully created local mountpoint") + return newMountpoint, nil +} + func (fs *filesystem) Mount(ctx context.Context, mountpoint string, labels map[string]string) (retErr error) { // Setting the start time to measure the Mount operation duration. start := time.Now() @@ -560,7 +618,7 @@ func (fs *filesystem) Mount(ctx context.Context, mountpoint string, labels map[s } }() - node, err := l.RootNode(0) + node, err := l.RootNode(0, idtools.IDMap{}) if err != nil { log.G(ctx).WithError(err).Warnf("Failed to get root node") retErr = fmt.Errorf("failed to get root node: %w", err) @@ -577,6 +635,17 @@ func (fs *filesystem) Mount(ctx context.Context, mountpoint string, labels map[s fs.layerMu.Unlock() fs.metricsController.Add(mountpoint, l) + // Pass in a logger to go-fuse with the layer digest + // The go-fuse logs are useful for tracing exactly what's happening at the fuse level. + fuseLogger := log.L. + WithField("layerDigest", labels[ctdsnapshotters.TargetLayerDigestLabel]). + WriterLevel(logrus.TraceLevel) + + retErr = fs.setupFuseServer(ctx, mountpoint, node, l, fuseLogger, c) + return +} + +func (fs *filesystem) setupFuseServer(ctx context.Context, mountpoint string, node fusefs.InodeEmbedder, l layer.Layer, logger *io.PipeWriter, c *sociContext) error { // mount the node to the specified mountpoint // TODO: bind mount the state directory as a read-only fs on snapshotter's side rawFS := fusefs.NewNodeFS(node, &fusefs.Options{ @@ -585,11 +654,6 @@ func (fs *filesystem) Mount(ctx context.Context, mountpoint string, labels map[s NegativeTimeout: &fs.negativeTimeout, NullPermissions: true, }) - // Pass in a logger to go-fuse with the layer digest - // The go-fuse logs are useful for tracing exactly what's happening at the fuse level. - logger := log.L. - WithField("layerDigest", labels[ctdsnapshotters.TargetLayerDigestLabel]). - WriterLevel(logrus.TraceLevel) mountOpts := &fuse.MountOptions{ AllowOther: true, // allow users other than root&mounter to access fs FsName: "soci", // name this filesystem as "soci" @@ -600,25 +664,26 @@ func (fs *filesystem) Mount(ctx context.Context, mountpoint string, labels map[s if _, err := exec.LookPath(fusermountBin); err == nil { mountOpts.Options = []string{"suid"} // option for fusermount; allow setuid inside container } else { - log.G(ctx).WithError(err).Infof("%s not installed; trying direct mount", fusermountBin) + log.G(ctx).WithField("binary", fusermountBin).WithError(err).Info("fusermount binary not installed; trying direct mount") mountOpts.DirectMount = true } server, err := fuse.NewServer(rawFS, mountpoint, mountOpts) if err != nil { - log.G(ctx).WithError(err).Debug("failed to make filesystem server") - retErr = err - return + log.G(ctx).WithError(err).Error("failed to make filesystem server") + return err } go server.Serve() - // Send a signal to the background fetcher that a new image is being mounted - // and to pause all background fetches. - c.bgFetchPauseOnce.Do(func() { - if fs.bgFetcher != nil { - fs.bgFetcher.Pause() - } - }) + if c != nil { + // Send a signal to the background fetcher that a new image is being mounted + // and to pause all background fetches. + c.bgFetchPauseOnce.Do(func() { + if fs.bgFetcher != nil { + fs.bgFetcher.Pause() + } + }) + } return server.WaitMount() } @@ -681,6 +746,11 @@ func (fs *filesystem) check(ctx context.Context, l layer.Layer, labels map[strin return rErr } +func isIDMappedDir(mountpoint string) bool { + dirName := filepath.Base(mountpoint) + return len(strings.Split(dirName, "_")) > 1 +} + func (fs *filesystem) Unmount(ctx context.Context, mountpoint string) error { fs.layerMu.Lock() l, ok := fs.layer[mountpoint] @@ -688,8 +758,13 @@ func (fs *filesystem) Unmount(ctx context.Context, mountpoint string) error { fs.layerMu.Unlock() return fmt.Errorf("specified path %q isn't a mountpoint", mountpoint) } - delete(fs.layer, mountpoint) // unregisters the corresponding layer - l.Done() + + delete(fs.layer, mountpoint) + // If the mountpoint is an id-mapped layer, it is pointing to the + // underlying layer, so we cannot call done on it. + if !isIDMappedDir(mountpoint) { + l.Done() + } fs.layerMu.Unlock() fs.metricsController.Remove(mountpoint) // The goroutine which serving the mountpoint possibly becomes not responding. diff --git a/fs/fs_test.go b/fs/fs_test.go index 2591a270a..119ec9c2c 100644 --- a/fs/fs_test.go +++ b/fs/fs_test.go @@ -46,6 +46,7 @@ import ( "github.com/awslabs/soci-snapshotter/fs/layer" "github.com/awslabs/soci-snapshotter/fs/remote" "github.com/awslabs/soci-snapshotter/fs/source" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/containerd/containerd/reference" "github.com/containerd/containerd/remotes/docker" fusefs "github.com/hanwen/go-fuse/v2/fs" @@ -83,10 +84,12 @@ func (l *breakableLayer) Info() layer.Info { Size: 1, } } -func (l *breakableLayer) DisableXAttrs() bool { return false } -func (l *breakableLayer) RootNode(uint32) (fusefs.InodeEmbedder, error) { return nil, nil } -func (l *breakableLayer) Verify(tocDigest digest.Digest) error { return nil } -func (l *breakableLayer) SkipVerify() {} +func (l *breakableLayer) DisableXAttrs() bool { return false } +func (l *breakableLayer) RootNode(uint32, idtools.IDMap) (fusefs.InodeEmbedder, error) { + return nil, nil +} +func (l *breakableLayer) Verify(tocDigest digest.Digest) error { return nil } +func (l *breakableLayer) SkipVerify() {} func (l *breakableLayer) ReadAt([]byte, int64, ...remote.Option) (int, error) { return 0, fmt.Errorf("fail") } diff --git a/fs/layer/layer.go b/fs/layer/layer.go index 5d802fee9..d5ef17fe9 100644 --- a/fs/layer/layer.go +++ b/fs/layer/layer.go @@ -58,6 +58,7 @@ import ( "github.com/awslabs/soci-snapshotter/fs/remote" spanmanager "github.com/awslabs/soci-snapshotter/fs/span-manager" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/awslabs/soci-snapshotter/metadata" "github.com/awslabs/soci-snapshotter/soci" "github.com/awslabs/soci-snapshotter/util/lrucache" @@ -86,7 +87,7 @@ type Layer interface { Info() Info // RootNode returns the root node of this layer. - RootNode(baseInode uint32) (fusefs.InodeEmbedder, error) + RootNode(baseInode uint32, idMapper idtools.IDMap) (fusefs.InodeEmbedder, error) // Check checks if the layer is still connectable. Check() error @@ -456,11 +457,11 @@ func (l *layerRef) Done() { l.done() } -func (l *layer) RootNode(baseInode uint32) (fusefs.InodeEmbedder, error) { +func (l *layer) RootNode(baseInode uint32, idMapper idtools.IDMap) (fusefs.InodeEmbedder, error) { if l.isClosed() { return nil, fmt.Errorf("layer is already closed") } - return newNode(l.desc.Digest, l.r, l.blob, baseInode, l.resolver.overlayOpaqueType, l.resolver.config.LogFuseOperations, l.fuseOperationCounter) + return newNode(l.desc.Digest, l.r, l.blob, baseInode, l.resolver.overlayOpaqueType, l.resolver.config.LogFuseOperations, l.fuseOperationCounter, idMapper) } func (l *layer) ReadAt(p []byte, offset int64, opts ...remote.Option) (int, error) { diff --git a/fs/layer/node.go b/fs/layer/node.go index 6459c1a48..553a073d7 100644 --- a/fs/layer/node.go +++ b/fs/layer/node.go @@ -56,6 +56,7 @@ import ( commonmetrics "github.com/awslabs/soci-snapshotter/fs/metrics/common" "github.com/awslabs/soci-snapshotter/fs/reader" "github.com/awslabs/soci-snapshotter/fs/remote" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/awslabs/soci-snapshotter/metadata" "github.com/containerd/log" fusefs "github.com/hanwen/go-fuse/v2/fs" @@ -189,7 +190,7 @@ func (f *FuseOperationCounter) Run(ctx context.Context) { // logFSOperations may cause sensitive information to be emitted to logs // e.g. filenames and paths within an image -func newNode(layerDgst digest.Digest, r reader.Reader, blob remote.Blob, baseInode uint32, opaque OverlayOpaqueType, logFSOperations bool, opCounter *FuseOperationCounter) (fusefs.InodeEmbedder, error) { +func newNode(layerDgst digest.Digest, r reader.Reader, blob remote.Blob, baseInode uint32, opaque OverlayOpaqueType, logFSOperations bool, opCounter *FuseOperationCounter, idMapper idtools.IDMap) (fusefs.InodeEmbedder, error) { rootID := r.Metadata().RootID() rootAttr, err := r.Metadata().GetAttr(rootID) if err != nil { @@ -210,9 +211,10 @@ func newNode(layerDgst digest.Digest, r reader.Reader, blob remote.Blob, baseIno } ffs.s = ffs.newState(layerDgst, blob) return &node{ - id: rootID, - attr: rootAttr, - fs: ffs, + id: rootID, + attr: rootAttr, + fs: ffs, + idMapper: idMapper, }, nil } @@ -272,9 +274,10 @@ func (fs *fs) inodeOfID(id uint32) (uint64, error) { // node is a filesystem inode abstraction. type node struct { fusefs.Inode - fs *fs - id uint32 - attr metadata.Attr + fs *fs + id uint32 + attr metadata.Attr + idMapper idtools.IDMap ents []fuse.DirEntry entsCached bool @@ -407,14 +410,14 @@ func (n *node) Lookup(ctx context.Context, name string, out *fuse.EntryOut) (*fu n.fs.reportFailure(fuseOpLookup, fmt.Errorf("%s: %v", fuseOpLookup, err)) return nil, syscall.EIO } - entryToAttr(ino, tn.attr, &out.Attr) + n.entryToAttr(ino, tn.attr, &out.Attr) case *whiteout: ino, err := n.fs.inodeOfID(tn.id) if err != nil { n.fs.reportFailure(fuseOpLookup, fmt.Errorf("%s: %v", fuseOpLookup, err)) return nil, syscall.EIO } - entryToAttr(ino, tn.attr, &out.Attr) + n.entryToAttr(ino, tn.attr, &out.Attr) default: n.fs.reportFailure(fuseOpLookup, fmt.Errorf("%s: unknown node type detected", fuseOpLookup)) return nil, syscall.EIO @@ -463,10 +466,11 @@ func (n *node) Lookup(ctx context.Context, name string, out *fuse.EntryOut) (*fu return nil, syscall.EIO } return n.NewInode(ctx, &node{ - id: id, - fs: n.fs, - attr: ce, - }, entryToAttr(ino, ce, &out.Attr)), 0 + id: id, + fs: n.fs, + attr: ce, + idMapper: n.idMapper, + }, n.entryToAttr(ino, ce, &out.Attr)), 0 } var _ = (fusefs.NodeOpener)((*node)(nil)) @@ -495,7 +499,7 @@ func (n *node) Getattr(ctx context.Context, f fusefs.FileHandle, out *fuse.AttrO n.fs.reportFailure(fuseOpGetattr, fmt.Errorf("%s: %v", fuseOpGetattr, err)) return syscall.EIO } - entryToAttr(ino, n.attr, &out.Attr) + n.entryToAttr(ino, n.attr, &out.Attr) return 0 } @@ -594,7 +598,7 @@ func (f *file) Getattr(ctx context.Context, out *fuse.AttrOut) syscall.Errno { f.n.fs.reportFailure(fuseOpFileGetattr, fmt.Errorf("%s: %v", fuseOpFileGetattr, err)) return syscall.EIO } - entryToAttr(ino, f.n.attr, &out.Attr) + f.n.entryToAttr(ino, f.n.attr, &out.Attr) return 0 } @@ -797,7 +801,7 @@ func (sf *statFile) updateStatUnlocked() ([]byte, error) { } // entryToAttr converts metadata.Attr to go-fuse's Attr. -func entryToAttr(ino uint64, e metadata.Attr, out *fuse.Attr) fusefs.StableAttr { +func (n *node) entryToAttr(ino uint64, e metadata.Attr, out *fuse.Attr) fusefs.StableAttr { out.Ino = ino out.Size = uint64(e.Size) if e.Mode&os.ModeSymlink != 0 { @@ -808,7 +812,8 @@ func entryToAttr(ino uint64, e metadata.Attr, out *fuse.Attr) fusefs.StableAttr mtime := e.ModTime out.SetTimes(nil, &mtime, nil) out.Mode = fileModeToSystemMode(e.Mode) - out.Owner = fuse.Owner{Uid: uint32(e.UID), Gid: uint32(e.GID)} + mappedID, _ := n.idMapper.ToHost(idtools.User{Uid: uint32(e.UID), Gid: uint32(e.GID)}) + out.Owner = fuse.Owner{Uid: mappedID.Uid, Gid: mappedID.Gid} out.Rdev = uint32(unix.Mkdev(uint32(e.DevMajor), uint32(e.DevMinor))) out.Nlink = uint32(e.NumLink) if out.Nlink == 0 { diff --git a/fs/layer/node_test.go b/fs/layer/node_test.go index 432b3b66e..9f12e6810 100644 --- a/fs/layer/node_test.go +++ b/fs/layer/node_test.go @@ -50,7 +50,8 @@ func TestEntryToAttr(t *testing.T) { tc := tc t.Run(tc.name, func(t *testing.T) { var actual fuse.Attr - entryToAttr(0, tc.attr, &actual) + var n node + n.entryToAttr(0, tc.attr, &actual) tc.expected.Mtime = actual.Mtime if actual != tc.expected { t.Fatalf("unexpected fuse attr. actual %v expected %v", actual, tc.expected) diff --git a/fs/layer/util_test.go b/fs/layer/util_test.go index 2fec52d89..dcb24420f 100644 --- a/fs/layer/util_test.go +++ b/fs/layer/util_test.go @@ -56,6 +56,7 @@ import ( "github.com/awslabs/soci-snapshotter/fs/reader" "github.com/awslabs/soci-snapshotter/fs/remote" spanmanager "github.com/awslabs/soci-snapshotter/fs/span-manager" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/awslabs/soci-snapshotter/metadata" "github.com/awslabs/soci-snapshotter/util/testutil" "github.com/awslabs/soci-snapshotter/ztoc" @@ -360,7 +361,7 @@ func hasSize(name string, size int) check { } func getRootNode(t *testing.T, r reader.Reader, opaque OverlayOpaqueType) *node { - rootNode, err := newNode(testStateLayerDigest, &testReader{r}, &testBlobState{10, 5}, 100, opaque, false, nil) + rootNode, err := newNode(testStateLayerDigest, &testReader{r}, &testBlobState{10, 5}, 100, opaque, false, nil, idtools.IDMap{}) if err != nil { t.Fatalf("failed to get root node: %v", err) } diff --git a/fs/source/source.go b/fs/source/source.go index ce394d840..9da0b36b2 100644 --- a/fs/source/source.go +++ b/fs/source/source.go @@ -84,6 +84,9 @@ const ( // TargetSociIndexDigestLabel is a label which contains the digest of the soci index. TargetSociIndexDigestLabel = "containerd.io/snapshot/remote/soci.index.digest" + + // HasSociIndexDigest is a label that tells if the layer was pulled with a SOCI index. + HasSociIndexDigest = "containerd.io/snapshot/remote/has.soci.index.digest" ) // RegistryHosts is copied from [github.com/awslabs/soci-snapshotter/service/resolver.RegistryHosts] diff --git a/go.mod b/go.mod index 520b1de5b..74d0df687 100644 --- a/go.mod +++ b/go.mod @@ -27,6 +27,7 @@ require ( github.com/prometheus/client_golang v1.20.5 github.com/rs/xid v1.6.0 github.com/sirupsen/logrus v1.9.3 + github.com/stretchr/testify v1.9.0 go.etcd.io/bbolt v1.3.11 golang.org/x/crypto v0.28.0 golang.org/x/sync v0.8.0 @@ -90,7 +91,6 @@ require ( github.com/prometheus/common v0.55.0 // indirect github.com/prometheus/procfs v0.15.1 // indirect github.com/spf13/pflag v1.0.5 // indirect - github.com/stretchr/testify v1.9.0 // indirect go.opencensus.io v0.24.0 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.45.0 // indirect go.opentelemetry.io/otel v1.21.0 // indirect diff --git a/idtools/idmap.go b/idtools/idmap.go index d5d677285..33d7c632b 100644 --- a/idtools/idmap.go +++ b/idtools/idmap.go @@ -37,10 +37,17 @@ package idtools import ( + "context" "errors" "fmt" + "os" + "os/exec" + "path/filepath" "strings" + "syscall" + "github.com/containerd/containerd/mount" + "github.com/containerd/containerd/snapshots" "github.com/opencontainers/runtime-spec/specs-go" ) @@ -64,6 +71,18 @@ type IDMap struct { GidMap []specs.LinuxIDMapping `json:"GidMap"` } +func LoadIDMap(id string, labels map[string]string) (IDMap, error) { + var idmap IDMap + uidmapJSON, okUID := labels[snapshots.LabelSnapshotUIDMapping] + gidmapJSON, okGID := labels[snapshots.LabelSnapshotGIDMapping] + if okUID && okGID { + if err := idmap.Unmarshal(uidmapJSON, gidmapJSON); err != nil { + return IDMap{}, err + } + } + return idmap, nil +} + // ToHost returns the host user ID pair for the container ID pair. func (i IDMap) ToHost(pair User) (User, error) { var ( @@ -167,3 +186,50 @@ func safeSum(x, y uint32) (uint32, error) { } return z, nil } + +func RemapDir(ctx context.Context, originalMountpoint, newMountpoint string, idMap IDMap) error { + idmappedSnapshotBase := filepath.Dir(newMountpoint) + if err := os.Mkdir(idmappedSnapshotBase, 0755); err != nil { + return err + } + + // Use --no-preserve=links to avoid copying hardlinks + // (Copying hardlinks results in issues with chown as + // it will attempt to chown twice on the same inode) + if err := exec.Command("cp", "-a", "--no-preserve=links", originalMountpoint, idmappedSnapshotBase).Run(); err != nil { + return err + } + return filepath.Walk(newMountpoint, chown(idMap)) +} + +func RemapRoot(ctx context.Context, root string, idMap IDMap) error { + return filepath.Walk(root, chown(idMap)) +} + +func RemapRootFS(ctx context.Context, mounts []mount.Mount, idmap IDMap) error { + return mount.WithTempMount(ctx, mounts, func(root string) error { + return filepath.Walk(root, chown(idmap)) + }) +} + +func chown(idMap IDMap) filepath.WalkFunc { + return func(path string, info os.FileInfo, err error) error { + if err != nil { + return err + } + stat := info.Sys().(*syscall.Stat_t) + h, cerr := idMap.ToHost(User{Uid: stat.Uid, Gid: stat.Gid}) + if cerr != nil { + return cerr + } + // be sure the lchown the path as to not de-reference the symlink to a host file + if cerr = os.Lchown(path, int(h.Uid), int(h.Gid)); cerr != nil { + return cerr + } + // we must retain special permissions such as setuid, setgid and sticky bits + if mode := info.Mode(); mode&os.ModeSymlink == 0 && mode&(os.ModeSetuid|os.ModeSetgid|os.ModeSticky) != 0 { + return os.Chmod(path, mode) + } + return nil + } +} diff --git a/integration/run_test.go b/integration/run_test.go index 260eb7a56..0992b0a3a 100644 --- a/integration/run_test.go +++ b/integration/run_test.go @@ -35,8 +35,10 @@ package integration import ( "bufio" "bytes" + "errors" "fmt" "os" + "path/filepath" "regexp" "strconv" "strings" @@ -520,3 +522,236 @@ func TestRunInNamespace(t *testing.T) { } } } + +func TestRunWithIdMap(t *testing.T) { + type checker struct { + path string + + // All expected UID/GIDs should be of length 5, + // so make sure to left pad any UIDs/GIDs to + // be strings of length 5 + expectedUIDOnHost string + expectedGIDOnHost string + expectedUIDInContainer string + expectedGIDInContainer string + } + + baseSnapshotDir := "/var/lib/soci-snapshotter-grpc/snapshotter/snapshots" + baseRuntimeDir := "/run/containerd/io.containerd.runtime.v2.task/default" + + uidPath := "/etc/subuid" + gidPath := "/etc/subgid" + + dummyuser := "dummy-user" + dummygroup := "dummy-group" + + minContainerdVersion := "v1.7.23" + shouldSkip := func(version string) bool { + semVer := strings.Replace(version, "v", "", 1) + onlySemVer := strings.Split(semVer, "-")[0] // remove rc tag if exists + onlySemVerArr := strings.Split(onlySemVer, ".") + + major, _ := strconv.Atoi(onlySemVerArr[0]) + minor, _ := strconv.Atoi(onlySemVerArr[1]) + patch, _ := strconv.Atoi(onlySemVerArr[2]) + + switch { + case major >= 2 || minor >= 8: + return false + case minor == 7 && patch >= 23: + return false + default: + return true + } + } + + checkUIDGID := func(stat, uid, gid string) error { + if len(uid) < 5 || len(gid) < 5 { + return errors.New("UID or GID not a string of length 5") + } + + matchUID := fmt.Sprintf("Uid: (%s", uid) + if !strings.Contains(stat, matchUID) { + return fmt.Errorf("expected UID: %s; actual UID: %s", matchUID, uid) + } + + matchGID := fmt.Sprintf("Uid: (%s", gid) + if !strings.Contains(stat, matchGID) { + return fmt.Errorf("expected GID: %s; actual GID: %s", matchGID, gid) + } + + return nil + } + + modes := []struct { + name string + indexBuilderFn func(sh *shell.Shell, src imageInfo, opts ...indexBuildOption) string + }{ + { + name: "with only FUSE layers", + indexBuilderFn: func(sh *shell.Shell, src imageInfo, opts ...indexBuildOption) string { + opts = append(opts, withMinLayerSize(0)) + return buildIndex(sh, src, opts...) + }, + }, + { + name: "with mixed layers", + indexBuilderFn: func(sh *shell.Shell, src imageInfo, opts ...indexBuildOption) string { + return buildIndex(sh, src, opts...) + }, + }, + { + name: "with no SOCI index", + indexBuilderFn: func(sh *shell.Shell, src imageInfo, opts ...indexBuildOption) string { + return "" + }, + }, + } + + tests := []struct { + name string + imageName string + subUIDContents string + subGIDContents string + checkFiles []checker + }{ + { + name: "with one set of substitutions", + imageName: rabbitmqImage, + subUIDContents: fmt.Sprintf("%s:12345:1001", dummyuser), + subGIDContents: fmt.Sprintf("%s:12345:1001", dummyuser), + checkFiles: []checker{ + { + path: "/usr/bin/sh", + expectedUIDOnHost: "12345", + expectedGIDOnHost: "12345", + expectedUIDInContainer: " 0", + expectedGIDInContainer: " 0", + }, + }, + }, + { + name: "with multiple substitutions", + // This version of ubuntu has a default "ubuntu" user, so we can use that + // to ensure that the user there has proper multi-uid/gid perms done + // Note that as the ubuntu image is one layer, the mixed layer use case will + // effectively act the same as the only FUSE layer usecase. + // Maybe we can find a more suitable image for this test. + imageName: "ubuntu:24.04", + subUIDContents: fmt.Sprintf("%s:12345:1000\n%s:22222:1", dummyuser, dummyuser), + subGIDContents: fmt.Sprintf("%s:12345:1000\n%s:22222:1", dummyuser, dummyuser), + checkFiles: []checker{ + { + path: "/usr/bin/sh", + expectedUIDOnHost: "12345", + expectedGIDOnHost: "12345", + expectedUIDInContainer: " 0", + expectedGIDInContainer: " 0", + }, + { + path: "/home/ubuntu", + expectedUIDOnHost: "22222", + expectedGIDOnHost: "22222", + expectedUIDInContainer: " 1000", + expectedGIDInContainer: " 1000", + }, + }, + }, + } + + for _, mode := range modes { + mode := mode + for _, tt := range tests { + tt := tt + t.Run(tt.name+" "+mode.name, func(t *testing.T) { + regConfig := newRegistryConfig() + sh, done := newShellWithRegistry(t, regConfig) + defer done() + + arr := strings.Split(string(sh.O("containerd", "--version")), " ") + if len(arr) < 3 { + t.Fatal("error parsing containerd version") + } + + // index 2 contains the semantic version when running 'containerd --version' + ver := arr[2] + if shouldSkip(ver) { + t.Skipf("version %s is less than %s, skipping", ver, minContainerdVersion) + } + + sh.X("groupadd", "-g", "12345", dummygroup) + sh.X("useradd", "-u", "12345", "-g", "12345", "-m", dummyuser) + + sh.Pipe(nil, shell.C("echo", tt.subUIDContents), shell.C("tee", uidPath)) + sh.Pipe(nil, shell.C("echo", tt.subGIDContents), shell.C("tee", gidPath)) + + rebootContainerd(t, sh, getContainerdConfigToml(t, false), getSnapshotterConfigToml(t, false)) + imageInfo := dockerhub(tt.imageName) + sh.X("nerdctl", "pull", "-q", imageInfo.ref) + + filenames, err := sh.OLog("ls", baseSnapshotDir) + if err != nil { + t.Fatalf("error listing files in %s", baseSnapshotDir) + } + + // Copy image, remove blobs, and re-pull with SOCI + mirrorInfo := regConfig.mirror(imageInfo.ref) + copyImage(sh, imageInfo, mirrorInfo) + indexDigest := mode.indexBuilderFn(sh, mirrorInfo) + if indexDigest != "" { + sh.X("soci", "push", "--user", regConfig.creds(), mirrorInfo.ref) + } + sh.X("rm", "-rf", filepath.Join(store.DefaultSociContentStorePath, "blobs", "sha256")) + + pullCmd := imagePullCmd + if indexDigest != "" { + pullCmd = append(pullCmd, "--soci-index-digest", indexDigest) + } + sh.X(append(pullCmd, mirrorInfo.ref)...) + containerID := strings.TrimSpace(string(sh.O("nerdctl-with-idmapping", "run", "-d", + "--net", "none", + "--pull", "never", + "--userns", dummyuser, + "--snapshotter", "soci", + imageInfo.ref, "sleep", "infinity", + ))) + + newFilenames, err := sh.OLog("ls", baseSnapshotDir) + if err != nil { + t.Fatalf("error listing files in %s", baseSnapshotDir) + } + + if len(filenames) == len(newFilenames) { + t.Fatalf("error: id-mapping failed") + } + + for _, check := range tt.checkFiles { + // Check UID/GID on host + fullCheckPath := filepath.Join(baseRuntimeDir, containerID, "rootfs", check.path) + statHost, err := sh.OLog("stat", fullCheckPath) + if err != nil { + t.Fatalf("error stat files in %s", fullCheckPath) + } + strstatHost := string(statHost) + + err = checkUIDGID(strstatHost, check.expectedUIDOnHost, check.expectedGIDOnHost) + if err != nil { + t.Fatalf("incorrect IDs on host at %s: %v", check.path, err) + } + + // Check UID/GID in container + statContainer, err := sh.OLog("nerdctl", "exec", containerID, "stat", check.path) + if err != nil { + t.Fatalf("error stat files in %s", fullCheckPath) + } + strStatContainer := string(statContainer) + + err = checkUIDGID(strStatContainer, check.expectedUIDInContainer, check.expectedGIDInContainer) + if err != nil { + t.Fatalf("incorrect IDs in container at %s: %v", check.path, err) + } + } + }) + } + } +} diff --git a/integration/util_test.go b/integration/util_test.go index 60c3f823f..7c3f38475 100644 --- a/integration/util_test.go +++ b/integration/util_test.go @@ -106,6 +106,7 @@ const proxySnapshotterConfig = ` [proxy_plugins.soci] type = "snapshot" address = "/run/soci-snapshotter-grpc/soci-snapshotter-grpc.sock" + capabilities = ["multi-remap-ids", "remap-ids"] ` const containerdConfigTemplate = ` diff --git a/snapshot/snapshot.go b/snapshot/snapshot.go index f770b5b5b..b1207bf75 100644 --- a/snapshot/snapshot.go +++ b/snapshot/snapshot.go @@ -40,10 +40,12 @@ import ( "path/filepath" "strconv" "strings" + "sync" "syscall" commonmetrics "github.com/awslabs/soci-snapshotter/fs/metrics/common" "github.com/awslabs/soci-snapshotter/fs/source" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/containerd/containerd/mount" ctdsnapshotters "github.com/containerd/containerd/pkg/snapshotters" "github.com/containerd/containerd/snapshots" @@ -105,6 +107,8 @@ type FileSystem interface { Check(ctx context.Context, mountpoint string, labels map[string]string) error Unmount(ctx context.Context, mountpoint string) error MountLocal(ctx context.Context, mountpoint string, labels map[string]string, mounts []mount.Mount) error + IDMapMount(ctx context.Context, mountpoint, activeLayerID string, idmap idtools.IDMap) (string, error) + IDMapMountLocal(ctx context.Context, mountpoint, activeLayerID string, idmap idtools.IDMap) (string, error) } // SnapshotterConfig is used to configure the remote snapshotter instance @@ -150,6 +154,7 @@ type snapshotter struct { userxattr bool // whether to enable "userxattr" mount option minLayerSize int64 // minimum layer size for remote mounting allowInvalidMountsOnRestart bool + idmapped *sync.Map } // NewSnapshotter returns a Snapshotter which can use unpacked remote layers @@ -192,6 +197,8 @@ func NewSnapshotter(ctx context.Context, root string, targetFs FileSystem, opts logrus.WithError(err).Warnf("cannot detect whether \"userxattr\" option needs to be used, assuming to be %v", userxattr) } + idMap := &sync.Map{} + o := &snapshotter{ root: root, ms: ms, @@ -200,6 +207,7 @@ func NewSnapshotter(ctx context.Context, root string, targetFs FileSystem, opts userxattr: userxattr, minLayerSize: config.minLayerSize, allowInvalidMountsOnRestart: config.allowInvalidMountsOnRestart, + idmapped: idMap, } if err := o.restoreRemoteSnapshot(ctx); err != nil { @@ -285,6 +293,48 @@ func (o *snapshotter) Usage(ctx context.Context, key string) (snapshots.Usage, e return usage, nil } +func (o *snapshotter) setupIDMap(ctx context.Context, s storage.Snapshot, parent string, labels map[string]string) error { + // load id-map if appropriate labels are present. + idmap, err := idtools.LoadIDMap(s.ID, labels) + if err != nil { + log.G(ctx).WithError(err).Error("failed to load id-map") + return err + } + + if !idmap.Empty() { + parentSnapshot, err := o.Stat(ctx, parent) + if err != nil { + log.G(ctx).WithError(err).Error("failed to stat parent snapshot") + return err + } + + // If there is no SOCI index, you can safely mount from the root without copying over every single layer + if _, ok := parentSnapshot.Labels[source.HasSociIndexDigest]; !ok { + // Fallback to overlay + log.G(ctx).Debug("no SOCI index found, remapping from root") + mounts, err := o.mounts(ctx, s, parent) + if err != nil { + return err + } + + err = idtools.RemapRootFS(ctx, mounts, idmap) + if err != nil { + return err + } + } else { + o.idmapped.Store(s.ID, struct{}{}) + err = o.createIDMapMounts(ctx, s, idmap) + if err != nil { + log.G(ctx).WithError(err).Error("failed to create id-mapped mounts") + return err + } + } + + log.G(ctx).Debug("id-mapping successful") + } + return nil +} + func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...snapshots.Opt) ([]mount.Mount, error) { log.G(ctx).WithField("key", key).WithField("parent", parent).Debug("prepare") s, err := o.createSnapshot(ctx, snapshots.KindActive, key, parent, opts) @@ -302,7 +352,13 @@ func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...s } target, ok := base.Labels[targetSnapshotLabel] + // !ok means we are in an active snapshot if !ok { + // Setup id-mapped mounts if config allows. + // Any error here needs to stop the container from starting. + if err := o.setupIDMap(ctx, s, parent, base.Labels); err != nil { + return nil, err + } return o.mounts(ctx, s, parent) } @@ -319,7 +375,8 @@ func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...s if !o.skipRemoteSnapshotPrepare(lCtx, base.Labels) { err := o.prepareRemoteSnapshot(lCtx, key, base.Labels) if err == nil { - base.Labels[remoteLabel] = remoteLabelVal // Mark this snapshot as remote + base.Labels[remoteLabel] = remoteLabelVal // Mark this snapshot as remote + base.Labels[source.HasSociIndexDigest] = "true" // Mark that this snapshot was loaded with a SOCI index err := o.commit(ctx, true, target, key, append(opts, snapshots.WithLabels(base.Labels))...) if err == nil || errdefs.IsAlreadyExists(err) { // count also AlreadyExists as "success" @@ -361,6 +418,7 @@ func (o *snapshotter) Prepare(ctx context.Context, key, parent string, opts ...s log.G(ctx).WithField("layerDigest", base.Labels[ctdsnapshotters.TargetLayerDigestLabel]).Info("preparing snapshot as local snapshot") err = o.prepareLocalSnapshot(lCtx, key, base.Labels, mounts) if err == nil { + base.Labels[source.HasSociIndexDigest] = "true" // Mark that this snapshot was loaded with a SOCI index err := o.commit(ctx, false, target, key, append(opts, snapshots.WithLabels(base.Labels))...) if err == nil || errdefs.IsAlreadyExists(err) { // count also AlreadyExists as "success" @@ -578,7 +636,18 @@ func (o *snapshotter) getCleanupDirectories(ctx context.Context, t storage.Trans cleanup := []string{} for _, d := range dirs { if !cleanupCommitted { - if _, ok := ids[d]; ok { + // If the directory name is just a number (e.g '2'), + // we want to check if the dir name (2) must be cleaned + // If the directory has an underscore (e.g. '1_2'), + // we want to check the suffix (2) to determine if + // the directory must be cleaned + cleanupID := d + temp := strings.Split(d, "_") + if len(temp) > 1 { + cleanupID = temp[1] + } + + if _, ok := ids[cleanupID]; ok { continue } } @@ -757,15 +826,16 @@ func (o *snapshotter) mounts(ctx context.Context, s storage.Snapshot, checkKey s }, nil } - parentPaths := make([]string, len(s.ParentIDs)) - for i := range s.ParentIDs { - parentPaths[i] = o.upperPath(s.ParentIDs[i]) + parentPaths, err := o.getParentPaths(s) + if err != nil { + return nil, err } options = append(options, fmt.Sprintf("lowerdir=%s", strings.Join(parentPaths, ":"))) if o.userxattr { options = append(options, "userxattr") } + return []mount.Mount{ { Type: "overlay", @@ -773,7 +843,49 @@ func (o *snapshotter) mounts(ctx context.Context, s storage.Snapshot, checkKey s Options: options, }, }, nil +} + +func (o *snapshotter) getParentPaths(s storage.Snapshot) ([]string, error) { + parentPaths := make([]string, len(s.ParentIDs)) + + for i, id := range s.ParentIDs { + if _, ok := o.idmapped.Load(s.ID); ok { + id = fmt.Sprintf("%s_%s", id, s.ID) + } + parentPaths[i] = o.upperPath(id) + } + + return parentPaths, nil +} +func (o *snapshotter) createIDMapMounts(ctx context.Context, s storage.Snapshot, idmap idtools.IDMap) error { + log.G(ctx).Debug("mapping ids") + + for _, id := range s.ParentIDs { + err := o.createIDMapMount(ctx, o.upperPath(id), s.ID, idmap) + if err != nil { + return err + } + } + + return idtools.RemapRoot(ctx, o.upperPath(s.ID), idmap) +} + +func (o *snapshotter) createIDMapMount(ctx context.Context, path, id string, idmap idtools.IDMap) error { + // s.ID is the shortest unique identifier for each new container, + // so append it to the end of the new mountpoint + _, err := o.fs.IDMapMount(ctx, path, id, idmap) + if errdefs.IsNotFound(err) { + // Remote mount failed, attempt to create a local id-mapped mount + + // Cleanup dirty snapshot folder — perhaps we can have a return cleanup func? + dirtyDir := fmt.Sprintf("%s_%s", filepath.Dir(path), id) + if err := os.RemoveAll(dirtyDir); err != nil { + return err + } + _, err = o.fs.IDMapMountLocal(ctx, path, id, idmap) + } + return err } // upperPath produces a file path like "{snapshotter.root}/snapshots/{id}/fs" diff --git a/snapshot/snapshot_test.go b/snapshot/snapshot_test.go index cc798a607..ff2542044 100644 --- a/snapshot/snapshot_test.go +++ b/snapshot/snapshot_test.go @@ -41,6 +41,7 @@ import ( "syscall" "testing" + "github.com/awslabs/soci-snapshotter/idtools" "github.com/containerd/containerd/mount" "github.com/containerd/containerd/pkg/testutil" "github.com/containerd/containerd/snapshots" @@ -417,6 +418,14 @@ func (fs *bindFs) MountLocal(ctx context.Context, mountpoint string, labels map[ return nil } +func (fs *bindFs) IDMapMount(ctx context.Context, mountpoint, activeLayerID string, idmap idtools.IDMap) (string, error) { + return mountpoint, nil +} + +func (fs *bindFs) IDMapMountLocal(ctx context.Context, mountpoint, activeLayerID string, idmap idtools.IDMap) (string, error) { + return mountpoint, nil +} + func dummyFileSystem() FileSystem { return &dummyFs{} } type dummyFs struct{} @@ -437,6 +446,14 @@ func (fs *dummyFs) MountLocal(ctx context.Context, mountpoint string, labels map return fmt.Errorf("dummy") } +func (fs *dummyFs) IDMapMount(ctx context.Context, mountpoint, activeLayerID string, idmap idtools.IDMap) (string, error) { + return "", fmt.Errorf("dummy") +} + +func (fs *dummyFs) IDMapMountLocal(ctx context.Context, mountpoint, activeLayerID string, idmap idtools.IDMap) (string, error) { + return "", fmt.Errorf("dummy") +} + // ============================================================================= // Tests backword-comaptibility of overlayfs snapshotter.